Plan 9 from Bell Labs’s /usr/web/sources/contrib/de0u/root/sys/src/cmd/squeak/Cross/plugins/BitBltPlugin/BitBltDispatch.c

Copyright © 2021 Plan 9 Foundation.
Distributed under the MIT License.
Download the Plan 9 distribution.


/*
 * Copyright © 2013 Raspberry Pi Foundation
 * Copyright © 2013 RISC OS Open Ltd
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  The copyright holders make no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 */

#if ENABLE_FAST_BLT
#include <stddef.h>
#include<stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>

#include "BitBltDispatch.h"
#include "BitBltArm.h"
#include "BitBltGeneric.h"
#include "BitBltInternal.h"

/** Define this to enable profiling */
//#define PROFILING

#ifdef PROFILING
#include <stdio.h>
#include <sys/time.h>

typedef struct profile_record {
	struct profile_record *next;
	struct profile_record *prev;
	combination_rule_t     combinationRule;
	uint32_t               flags;
	uint32_t               calls;
	uint32_t               time;
} profile_record_t;

static profile_record_t *profile_records;
static bool profile_atexit_set;

static uint32_t profile_unrecorded_cases[9];
#endif

/** Array of the test words for each fast path - avoid taking up 1/3
 *  of the cache with function pointers, most of which we don't want */
typedef struct {
	uint32_t           flags;
	combination_rule_t combinationRule;
} fast_path_test_t;

/** Pointer to heap block containing the fast path tests - not cacheline aligned */
static void             *fastPathTestBlock;

/** This array is word-aligned to minimise the number of cachelines
 *  that need to be loaded during the lookup process */
static fast_path_test_t *fastPathTestList;

/** The corresponding array of fast path function pointers */
static void           (**fastPathFuncList)(operation_t *, uint32_t);

/** Number of enabled fast paths */
static size_t            numFastPaths;

static const unsigned int  maskTable53[4] = { 0x7000, 0x0380, 0x001C, 0x0000 };
static const          int shiftTable53[4] = {     -6,     -4,     -2,      0 };
static const unsigned int  maskTable54[4] = { 0x7800, 0x03C0, 0x001E, 0x0000 };
static const          int shiftTable54[4] = {     -3,     -2,     -1,      0 };
static const unsigned int  maskTable58[4] = { 0x7C00, 0x03E0, 0x001F, 0x0000 };
static const          int shiftTable58[4] = {      9,      6,      3,      0 };
static const unsigned int  maskTable83[4] = { 0xE00000, 0x00E000, 0x0000E0, 0x000000 };
static const          int shiftTable83[4] = {      -15,      -10,       -5,        0 };
static const unsigned int  maskTable84[4] = { 0xF00000, 0x00F000, 0x0000F0, 0x000000 };
static const          int shiftTable84[4] = {      -12,       -8,       -4,        0 };
static const unsigned int  maskTable85[4] = { 0xF80000, 0x00F800, 0x0000F8, 0x000000 };
static const          int shiftTable85[4] = {       -9,       -6,       -3,        0 };


#ifdef PROFILING
static uint64_t gettime(void)
{
	struct timeval tv;

	gettimeofday (&tv, NULL);
	return tv.tv_sec * 1000000 + tv.tv_usec;
}

static uint32_t clz(uint32_t x)
{
	static const uint8_t table[32] = {
			0, 31, 14, 30, 22, 13, 29, 19,  2, 21, 12, 10, 25, 28, 18,  8,
	        1, 15, 23, 20,  3, 11, 26,  9, 16, 24,  4, 27, 17,  5,  6,  7
	};
	x |= x >> 16;
	if (x == 0)
		return 32;
	x |= x >> 8;
	x |= x >> 4;
	x |= x >> 2;
	x |= x >> 1;
	x = 0x06C9C57D * x + 0x06C9C57D;
	return table[x >> 27];
}

#define BPP_FLAG_TO_BPP(flags, type) (1u << (31 - clz(((flags) / FAST_PATH_##type##_1BPP) & 0x3F)))

static void profile_atexit(void)
{
	size_t i;
	fprintf(stderr, "BitBltDispatch profiling results:\n");
	fprintf(stderr, "Rule,SrcBpp,SrcBE,DestBpp,DestBE,CM,HT,GC,Ov,Calls,Time\n");
	profile_record_t *rec;
	for (rec = profile_records; rec != NULL; rec = rec->next) {
		fprintf(stderr,"%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n",
				rec->combinationRule,
				BPP_FLAG_TO_BPP(rec->flags, SRC),
				!(rec->flags & FAST_PATH_SRC_LITTLE_ENDIAN),
				BPP_FLAG_TO_BPP(rec->flags, DEST),
				!(rec->flags & FAST_PATH_DEST_LITTLE_ENDIAN),
				rec->flags & FAST_PATH_NO_COLOR_MAP ? 0 :
				rec->flags & FAST_PATH_9BIT_COLOR_MAP ? 9 :
				rec->flags & FAST_PATH_12BIT_COLOR_MAP ? 12 : 15,
				!(rec->flags & FAST_PATH_NO_HALFTONE),
				!!(rec->flags & FAST_PATH_CA_HAS_GAMMA),
				rec->flags & FAST_PATH_NO_OVERLAP ? 0 :
				rec->flags & FAST_PATH_H_OVERLAP ? 1 : 2,
				rec->calls,
				rec->time);
	}
	const char *sep = "Unrecorded cases (hopefully should all be 0):\n";
	for (i = 0; i < sizeof profile_unrecorded_cases / sizeof *profile_unrecorded_cases; i++) {
		fprintf(stderr, "%s%u", sep, profile_unrecorded_cases[i]);
		sep = ",";
	}
	fprintf(stderr, "\n");
	fprintf(stderr, "fast path list\n");
	for (i = 0; i < numFastPaths; i++) {
			fprintf(stderr, "entry %d %u %X\n",i,  fastPathTestList[i].combinationRule, fastPathTestList[i].flags);
	}

}

static void profile_record(combination_rule_t combinationRule, uint32_t flags, uint32_t time)
{
	if (profile_atexit_set == false) {
		atexit(profile_atexit);
		profile_atexit_set = true;
	}

	/* See if we have a matching record in the list */
	profile_record_t *rec, *prev = NULL;
	for (rec = profile_records; rec != NULL; prev = rec, rec = rec->next) {
		if (rec->combinationRule == combinationRule && rec->flags == flags) {
			/* Move to head of list for efficiency */
			if (prev != NULL) {
				prev->next = rec->next;
				if (rec->next != NULL)
					rec->next->prev = prev;
				rec->next = profile_records;
				rec->prev = NULL;
				profile_records = rec;
			}
			break;
		}
	}
	if (rec == NULL) {
		rec = calloc(1, sizeof *rec);
		if (rec == NULL)
			return;
		rec->next = profile_records;
		rec->combinationRule = combinationRule;
		rec->flags = flags;
		profile_records = rec;
	}
	rec->calls++;
	rec->time += time;
}
#endif


void initialiseCopyBits(void)
{
	size_t i;
	addGenericFastPaths();
#ifdef __arm__
	addArmFastPaths();
#endif

}

void addFastPaths(fast_path_t *paths, size_t n)
{
	void *newFastPathTestBlock = malloc((numFastPaths + n) * sizeof (fast_path_test_t) + CACHELINE_LEN);
	void (**newFastPathFuncList)(operation_t *, uint32_t) = malloc((numFastPaths + n) * sizeof *fastPathFuncList);
	if (newFastPathTestBlock == NULL || newFastPathFuncList == NULL) {
		free(newFastPathTestBlock);
		free(newFastPathFuncList);
		return;
	}
	fast_path_test_t *newFastPathTestList = (fast_path_test_t *)(((uintptr_t) newFastPathTestBlock + CACHELINE_LEN - 1) &~ (CACHELINE_LEN - 1));
	if (numFastPaths > 0) {
		/* Place the old fast paths lower down the array */
		memcpy(newFastPathTestList + n, fastPathTestList, numFastPaths * sizeof *fastPathTestList);
		memcpy(newFastPathFuncList + n, fastPathFuncList, numFastPaths * sizeof *fastPathFuncList);
		free(fastPathTestBlock);
		free(fastPathFuncList);
	}
	fastPathTestBlock = newFastPathTestBlock;
	fastPathTestList = newFastPathTestList;
	fastPathFuncList = newFastPathFuncList;
	size_t i;
	for (i = 0; i < n; i++) {
		fastPathTestList[i].combinationRule = paths[i].combinationRule;
		fastPathTestList[i].flags = paths[i].flags;
		fastPathFuncList[i] = paths[i].func;
	}
	numFastPaths += n;
}

void (*lookupFastPath(combination_rule_t combinationRule, uint32_t flags))(operation_t *, uint32_t)
{
	size_t i;
	for (i = 0; i < numFastPaths; i++) {
		if ((fastPathTestList[i].flags & flags) == 0 &&
		    (fastPathTestList[i].combinationRule == combinationRule ||
		     fastPathTestList[i].combinationRule == CR_any))
		{
			return fastPathFuncList[i];
		}
	}
	return NULL;
}

void copyBitsDispatch(operation_t *op)
{
	/* Quick check for zero-size operations here saves lookup overhead on no-ops,
	 * and saves us from having to cope with them in the fast paths */
	if (op->width == 0 || op->height == 0)
		return;

	static const uint32_t bppToFlag[33] = {
			0, FAST_PATH_SRC_1BPP, FAST_PATH_SRC_2BPP, 0, FAST_PATH_SRC_4BPP, 0, 0, 0,
			FAST_PATH_SRC_8BPP, 0, 0, 0, 0, 0, 0, 0,
			FAST_PATH_SRC_16BPP, 0, 0, 0, 0, 0, 0, 0,
			0, 0, 0, 0, 0, 0, 0, 0,
			FAST_PATH_SRC_32BPP
	};
	uint32_t flags;
	if (op->noSource) {
		flags = FAST_PATH_SRC_0BPP | FAST_PATH_NO_OVERLAP;
	} else {
		/* A shame the depth isn't a log2 value, or we could have used a shift here instead of a lookup */
		flags = bppToFlag[op->src.depth];
		if (op->src.depth < 32) {
			if (op->src.msb)
				flags |= FAST_PATH_SRC_BIG_ENDIAN;
			else
				flags |= FAST_PATH_SRC_LITTLE_ENDIAN;
		}
		if (op->dest.bits == op->src.bits) {
			if (op->dest.y == op->src.y) {
				if (op->dest.x > op->src.x && op->dest.x < op->src.x + (signed) op->width)
					flags |= FAST_PATH_H_OVERLAP;
				else
					flags |= FAST_PATH_NO_OVERLAP;
			} else if (op->dest.y > op->src.y && op->dest.y < op->src.y + (signed) op->height)
				flags |= FAST_PATH_V_OVERLAP;
			else
				flags |= FAST_PATH_NO_OVERLAP;
		} else
			flags |= FAST_PATH_NO_OVERLAP;
	}
	flags |= bppToFlag[op->dest.depth] * (FAST_PATH_DEST_1BPP / FAST_PATH_SRC_1BPP);
	if (op->dest.depth < 32) {
		if (op->dest.msb)
			flags |= FAST_PATH_DEST_BIG_ENDIAN;
		else
			flags |= FAST_PATH_DEST_LITTLE_ENDIAN;
	}

	if (!op->noSource) {
		/* Sanity check on colour map parameters - can it be removed? */
		if (((op->cmFlags & ColorMapIndexedPart) == 0) != (op->cmLookupTable == NULL) ||
		    ((op->cmFlags & ColorMapFixedPart) == 0) != (op->cmMaskTable == NULL) ||
		    ((op->cmFlags & ColorMapFixedPart) == 0) != (op->cmShiftTable == NULL)) {
			/* Unsupported case  0 */
			copyBitsFallback(op, 0);
#ifdef PROFILING
			profile_unrecorded_cases[0]++;
#endif
			return;
		}

		if (op->cmFlags & ColorMapIndexedPart) {
			if (op->cmFlags & ColorMapFixedPart) {
				if (op->src.depth == 32) {
					if (op->cmMask == 0x7FFF && memcmp(op->cmMaskTable, maskTable85, sizeof maskTable85) == 0 && memcmp(op->cmShiftTable, shiftTable85, sizeof shiftTable85) == 0)
						flags |= FAST_PATH_15BIT_COLOR_MAP;
					else if (op->cmMask == 0xFFF && memcmp(op->cmMaskTable, maskTable84, sizeof maskTable84) == 0 && memcmp(op->cmShiftTable, shiftTable84, sizeof shiftTable84) == 0)
						flags |= FAST_PATH_12BIT_COLOR_MAP;
					else if (op->cmMask == 0x1FF && memcmp(op->cmMaskTable, maskTable83, sizeof maskTable83) == 0 && memcmp(op->cmShiftTable, shiftTable83, sizeof shiftTable83) == 0)
						flags |= FAST_PATH_9BIT_COLOR_MAP;
					else {
						/* Unsupported case 1 */
						copyBitsFallback(op, 0);
#ifdef PROFILING
						profile_unrecorded_cases[1]++;
#endif
						return;
					}
				} else if (op->src.depth == 16) {
					if (op->cmMask == 0xFFF && memcmp(op->cmMaskTable, maskTable54, sizeof maskTable54) == 0 && memcmp(op->cmShiftTable, shiftTable54, sizeof shiftTable54) == 0)
						flags |= FAST_PATH_12BIT_COLOR_MAP;
					else if (op->cmMask == 0x1FF && memcmp(op->cmMaskTable, maskTable53, sizeof maskTable53) == 0 && memcmp(op->cmShiftTable, shiftTable53, sizeof shiftTable53) == 0)
						flags |= FAST_PATH_9BIT_COLOR_MAP;
					else {
						/* Unsupported case 2 */
						copyBitsFallback(op, 0);
#ifdef PROFILING
						profile_unrecorded_cases[2]++;
#endif
						return;
					}
				} else {
					/* Unsupported case 3 */
					copyBitsFallback(op, 0);
#ifdef PROFILING
					profile_unrecorded_cases[3]++;
#endif
					return;
				}
			} else {
				if ((op->src.depth < 16 && op->cmMask == (1u << op->src.depth) - 1) ||
						(op->src.depth == 16 && op->cmMask == 0x7FFF))
					flags |= FAST_PATH_DIRECT_COLOR_MAP;
				else {
					/* Unsupported case 4 */
					copyBitsFallback(op, 0);
#ifdef PROFILING
					profile_unrecorded_cases[4]++;
#endif
					return;
				}
			}
		} else {
			if (op->cmFlags & ColorMapFixedPart) {
				if ((op->dest.depth == 32 && op->src.depth == 16 && memcmp(op->cmMaskTable, maskTable58, sizeof maskTable58) == 0 && memcmp(op->cmShiftTable, shiftTable58, sizeof shiftTable58) == 0) ||
						(op->dest.depth == 16 && op->src.depth == 32 && memcmp(op->cmMaskTable, maskTable85, sizeof maskTable85) == 0 && memcmp(op->cmShiftTable, shiftTable85, sizeof shiftTable85) == 0))
					flags |= FAST_PATH_NO_COLOR_MAP;
				else {
					/* Unsupported case 5 */
					copyBitsFallback(op, 0);
#ifdef PROFILING
					profile_unrecorded_cases[5]++;
#endif
					return;
				}
			} else {
				if (op->dest.depth == op->src.depth)
					flags |= FAST_PATH_NO_COLOR_MAP;
				else {
					/* Unsupported case 6 */
					copyBitsFallback(op, 0);
#ifdef PROFILING
					profile_unrecorded_cases[6]++;
#endif
					return;
				}
			}
		}
	}

	if (op->noHalftone)
		flags |= FAST_PATH_NO_HALFTONE;
	else if (op->halftoneHeight == 1)
		flags |= FAST_PATH_SCALAR_HALFTONE;
	else
		flags |= FAST_PATH_VECTOR_HALFTONE;

	if (op->combinationRule == CR_rgbComponentAlpha) {
		if ((op->opt.componentAlpha.gammaLookupTable == NULL) != (op->opt.componentAlpha.ungammaLookupTable == NULL)) {
			/* Unsupported case 7 */
			copyBitsFallback(op, 0);
#ifdef PROFILING
			profile_unrecorded_cases[7]++;
#endif
			return;
		}
		if (op->opt.componentAlpha.gammaLookupTable)
			flags |= FAST_PATH_CA_HAS_GAMMA;
		else
			flags |= FAST_PATH_CA_NO_GAMMA;
	}

	/* Remember the fast path function to accelerate the common case
	 * where we're performing the same operation more than once in a row */
	static combination_rule_t storedCombinationRule = CR_any; // guaranteed not to match the first time
	static uint32_t           storedFlags;
	static void             (*storedFunc)(operation_t *, uint32_t);

	if (op->combinationRule != storedCombinationRule || flags != storedFlags) {
		storedCombinationRule = op->combinationRule;
		storedFlags = flags;
		storedFunc = lookupFastPath(op->combinationRule, flags);
		if (storedFunc == NULL) {
			/* Unsupported case 8 */
#ifdef PROFILING
			profile_unrecorded_cases[8]++;
#endif
			storedFunc = copyBitsFallback;
		}
	}
#ifdef PROFILING
	uint32_t before = gettime();
#endif

	storedFunc(op, storedFlags);
#ifdef PROFILING
	uint32_t after = gettime();
	profile_record(storedCombinationRule, storedFlags, after - before);
#endif
}
#endif /* ENABLE_FAST_BLT */

Bell Labs OSI certified Powered by Plan 9

(Return to Plan 9 Home Page)

Copyright © 2021 Plan 9 Foundation. All Rights Reserved.
Comments to webmaster@9p.io.