// Origin: WebSVN blame view, repository "Games.Prince of Persia", file /xbrz.c
// (the Rev / Author / Line No. columns shown by that view are not part of the source text)

// ****************************************************************************
// * This file is part of the HqMAME project. It is distributed under *
// * GNU General Public License: https://www.gnu.org/licenses/gpl-3.0 *
// * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved *
// * *
// * Additionally and as a special exception, the author gives permission *
// * to link the code of this program with the MAME library (or with modified *
// * versions of MAME that use the same license as MAME), and distribute *
// * linked combinations including the two. You must obey the GNU General *
// * Public License in all respects for all of the code used other than MAME. *
// * If you modify this file, you may extend this exception to your version *
// * of the file, but you are not obligated to do so. If you do not wish to *
// * do so, delete this exception statement from your version. *
// ****************************************************************************

// -------------------------------------------------------------------------
// | xBRZ: "Scale by rules" - high quality image upscaling filter by Zenju |
// -------------------------------------------------------------------------
// using a modified approach of xBR:
// http://board.byuu.org/viewtopic.php?f=10&t=2248
// - new rule set preserving small image features
// - highly optimized for performance
// - support alpha channel
// - support multithreading
// - support 64-bit architectures
// - support processing image slices
// - support scaling up to 6xBRZ

// -> map source (srcWidth * srcHeight) to target (scale * width x scale * height) image, optionally processing a half-open slice of rows [yFirst, yLast) only
// -> support for source/target pitch in bytes!
// -> if your emulator changes only a few image slices during each cycle (e.g. DOSBox) then there's no need to run xBRZ on the complete image:
// Just make sure you enlarge the source image slice by 2 rows on top and 2 on bottom (this is the additional range the xBRZ algorithm is using during analysis)
// CAVEAT: If there are multiple changed slices, make sure they do not overlap after adding these additional rows in order to avoid a memory race condition
// in the target image data if you are using multiple threads for processing each enlarged slice!
//
// THREAD-SAFETY: - parts of the same image may be scaled by multiple threads as long as the [yFirst, yLast) ranges do not overlap!
// - there is a minor inefficiency for the first row of a slice, so avoid processing single rows only; suggestion: process at least 8-16 rows


#include <limits.h>
#include <math.h>
#include <memory.h> // non-standard header, kept because the file already included it
#include <stddef.h> // for size_t
#include <stdint.h> // for uint32_t
#include <string.h> // for memset()


		47	#ifdef __cplusplus
		48	#define EXTERN_C extern "C"
		49	#else // !__cplusplus
		50	#define EXTERN_C
		51	#endif // __cplusplus
		52
		53
		54	// scaler configuration
		55	#define XBRZ_CFG_LUMINANCE_WEIGHT 1
		56	#define XBRZ_CFG_EQUAL_COLOR_TOLERANCE 30
		57	#define XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD 3.6
		58	#define XBRZ_CFG_STEEP_DIRECTION_THRESHOLD 2.2
		59
		60
		61	// slice types
		62	#define XBRZ_SLICETYPE_SOURCE 1
		63	#define XBRZ_SLICETYPE_TARGET 2
		64
		65
		66	// handy macros
		67	#define GET_BYTE(val,byteno) ((unsigned char) (((val) >> ((byteno) << 3)) & 0xff))
		68	#define GET_BLUE(val) GET_BYTE (val, 0)
		69	#define GET_GREEN(val) GET_BYTE (val, 1)
		70	#define GET_RED(val) GET_BYTE (val, 2)
		71	#define GET_ALPHA(val) GET_BYTE (val, 3)
3	pmbaty	72	#define CALC_COLOR24(colFront,colBack,M,N) (unsigned char) ((((unsigned char) (colFront)) * ((unsigned int) (M)) + ((unsigned char) (colBack)) * (((unsigned int) (N)) - ((unsigned int) (M)))) / ((unsigned int) (N)))
		73	#define CALC_COLOR32(colFront,colBack,weightFront,weightBack,weightSum) ((unsigned char) ((((unsigned char) (colFront)) * ((unsigned int) (weightFront)) + ((unsigned char) (colBack)) * ((unsigned int) (weightBack))) / ((unsigned int) (weightSum))))
		74	#define BYTE_ADVANCE(buffer,offset) (((char *) buffer) + (offset))
		75	#ifndef MIN
		76	#define MIN(a,b) ((a) < (b) ? (a) : (b))
		77	#endif // MIN
		78	#ifndef MAX
		79	#define MAX(a,b) ((a) > (b) ? (a) : (b))
		80	#endif // MAX
2	pmbaty	81
		82
3	pmbaty	83	typedef void (alphagrad_func) (uint32_t *pixBack, uint32_t pixFront, unsigned int M, unsigned int N);
		84	typedef double (dist_func) (uint32_t pix1, uint32_t pix2);
2	pmbaty	85
		86
		87
		88
		89	namespace
		90	{
		91	#ifdef _MSC_VER
		92	#define FORCE_INLINE __forceinline
		93	#elif defined __GNUC__
		94	#define FORCE_INLINE __attribute__((always_inline)) inline
		95	#else
		96	#define FORCE_INLINE inline
		97	#endif
		98
		99
		100	enum RotationDegree //clock-wise
		101	{
3	pmbaty	102	ROT_0 = 0,
2	pmbaty	103	ROT_90,
		104	ROT_180,
		105	ROT_270
		106	};
		107
3	pmbaty	108
2	pmbaty	109	//calculate input matrix coordinates after rotation at compile time
3	pmbaty	110	template <RotationDegree rotDeg, size_t I, size_t J, size_t N> struct MatrixRotation;
2	pmbaty	111
3	pmbaty	112
		113	template <size_t I, size_t J, size_t N> struct MatrixRotation<ROT_0, I, J, N>
2	pmbaty	114	{
		115	static const size_t I_old = I;
		116	static const size_t J_old = J;
		117	};
		118
3	pmbaty	119
2	pmbaty	120	template <RotationDegree rotDeg, size_t I, size_t J, size_t N> //(i, j) = (row, col) indices, N = size of (square) matrix
		121	struct MatrixRotation
		122	{
3	pmbaty	123	static const size_t I_old = N - 1 - MatrixRotation<(RotationDegree)(rotDeg - 1), I, J, N>::J_old; //old coordinates before rotation!
		124	static const size_t J_old = MatrixRotation<(RotationDegree)(rotDeg - 1), I, J, N>::I_old; //
2	pmbaty	125	};
		126
		127
3	pmbaty	128	template <size_t N, RotationDegree rotDeg> class OutputMatrix
2	pmbaty	129	{
		130	public:
3	pmbaty	131	OutputMatrix (uint32_t *out, int outWidth) //access matrix area, top-left at position "out" for image with given width
		132	{
		133	out_ = out;
		134	outWidth_ = outWidth;
		135	}
2	pmbaty	136
3	pmbaty	137	template <size_t I, size_t J> uint32_t &ref() const
2	pmbaty	138	{
		139	static const size_t I_old = MatrixRotation<rotDeg, I, J, N>::I_old;
		140	static const size_t J_old = MatrixRotation<rotDeg, I, J, N>::J_old;
3	pmbaty	141
2	pmbaty	142	return (out_ + J_old + I_old outWidth_);
		143	}
		144
		145	uint32_t* out_;
3	pmbaty	146	int outWidth_;
2	pmbaty	147	};
		148
		149
		150
		151
		152	enum BlendType
		153	{
		154	BLEND_NONE = 0,
		155	BLEND_NORMAL, //a normal indication to blend
		156	BLEND_DOMINANT, //a strong indication to blend
		157	//attention: BlendType must fit into the value range of 2 bit!!!
		158	};
		159
		160	struct BlendResult
		161	{
		162	BlendType
		163	/**/blend_f, blend_g,
		164	/**/blend_j, blend_k;
		165	};
		166
		167
		168	struct Kernel_4x4 //kernel for preprocessing step
		169	{
		170	uint32_t
		171	/**/a, b, c, d,
		172	/**/e, f, g, h,
		173	/**/i, j, k, l,
		174	/**/m, n, o, p;
		175	};
		176
		177	/*
		178	input kernel area naming convention:
		179	-----------------
		180	\| A \| B \| C \| D \|
		181	----\|---\|---\|---\|
		182	\| E \| F \| G \| H \| //evaluate the four corners between F, G, J, K
		183	----\|---\|---\|---\| //input pixel is at position F
		184	\| I \| J \| K \| L \|
		185	----\|---\|---\|---\|
		186	\| M \| N \| O \| P \|
		187	-----------------
		188	*/
		189	FORCE_INLINE //detect blend direction
3	pmbaty	190	BlendResult preProcessCorners(const Kernel_4x4& ker, dist_func dist) //result: F, G, J, K corners of "GradientType"
2	pmbaty	191	{
		192	BlendResult result = {};
		193
		194	if ((ker.f == ker.g &&
		195	ker.j == ker.k) \|\|
		196	(ker.f == ker.j &&
		197	ker.g == ker.k))
		198	return result;
		199
		200	const int weight = 4;
3	pmbaty	201	double jg = dist (ker.i, ker.f) + dist (ker.f, ker.c) + dist (ker.n, ker.k) + dist (ker.k, ker.h) + weight * dist (ker.j, ker.g);
		202	double fk = dist (ker.e, ker.j) + dist (ker.j, ker.o) + dist (ker.b, ker.g) + dist (ker.g, ker.l) + weight * dist (ker.f, ker.k);
2	pmbaty	203
		204	if (jg < fk) //test sample: 70% of values max(jg, fk) / min(jg, fk) are between 1.1 and 3.7 with median being 1.8
		205	{
		206	const bool dominantGradient = XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD * jg < fk;
		207	if (ker.f != ker.g && ker.f != ker.j)
		208	result.blend_f = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
		209
		210	if (ker.k != ker.j && ker.k != ker.g)
		211	result.blend_k = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
		212	}
		213	else if (fk < jg)
		214	{
		215	const bool dominantGradient = XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD * fk < jg;
		216	if (ker.j != ker.f && ker.j != ker.k)
		217	result.blend_j = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
		218
		219	if (ker.g != ker.f && ker.g != ker.k)
		220	result.blend_g = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
		221	}
		222	return result;
		223	}
		224
		225	struct Kernel_3x3
		226	{
		227	uint32_t
		228	/**/a, b, c,
		229	/**/d, e, f,
		230	/**/g, h, i;
		231	};
3	pmbaty	232	/*
2	pmbaty	233	#define DEF_GETTER(x) template <RotationDegree rotDeg> uint32_t inline get_##x(const Kernel_3x3& ker) { return ker.x; }
		234	//we cannot and NEED NOT write "ker.##x" since ## concatenates preprocessor tokens but "." is not a token
		235	DEF_GETTER(a) DEF_GETTER(b) DEF_GETTER(c)
		236	DEF_GETTER(d) DEF_GETTER(e) DEF_GETTER(f)
		237	DEF_GETTER(g) DEF_GETTER(h) DEF_GETTER(i)
		238	#undef DEF_GETTER
		239
		240	#define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_90>(const Kernel_3x3& ker) { return ker.y; }
		241	DEF_GETTER(a, g) DEF_GETTER(b, d) DEF_GETTER(c, a)
		242	DEF_GETTER(d, h) DEF_GETTER(e, e) DEF_GETTER(f, b)
		243	DEF_GETTER(g, i) DEF_GETTER(h, f) DEF_GETTER(i, c)
		244	#undef DEF_GETTER
		245
		246	#define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_180>(const Kernel_3x3& ker) { return ker.y; }
		247	DEF_GETTER(a, i) DEF_GETTER(b, h) DEF_GETTER(c, g)
		248	DEF_GETTER(d, f) DEF_GETTER(e, e) DEF_GETTER(f, d)
		249	DEF_GETTER(g, c) DEF_GETTER(h, b) DEF_GETTER(i, a)
		250	#undef DEF_GETTER
		251
		252	#define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_270>(const Kernel_3x3& ker) { return ker.y; }
		253	DEF_GETTER(a, c) DEF_GETTER(b, f) DEF_GETTER(c, i)
		254	DEF_GETTER(d, b) DEF_GETTER(e, e) DEF_GETTER(f, h)
		255	DEF_GETTER(g, a) DEF_GETTER(h, d) DEF_GETTER(i, g)
		256	#undef DEF_GETTER
3	pmbaty	257	*/
2	pmbaty	258
3	pmbaty	259	template <RotationDegree rotDeg> uint32_t inline get_a (const Kernel_3x3& ker) { return ker.a; }
		260	template <RotationDegree rotDeg> uint32_t inline get_b (const Kernel_3x3& ker) { return ker.b; }
		261	template <RotationDegree rotDeg> uint32_t inline get_c (const Kernel_3x3& ker) { return ker.c; }
		262	template <RotationDegree rotDeg> uint32_t inline get_d (const Kernel_3x3& ker) { return ker.d; }
		263	template <RotationDegree rotDeg> uint32_t inline get_e (const Kernel_3x3& ker) { return ker.e; }
		264	template <RotationDegree rotDeg> uint32_t inline get_f (const Kernel_3x3& ker) { return ker.f; }
		265	template <RotationDegree rotDeg> uint32_t inline get_g (const Kernel_3x3& ker) { return ker.g; }
		266	template <RotationDegree rotDeg> uint32_t inline get_h (const Kernel_3x3& ker) { return ker.h; }
		267	template <RotationDegree rotDeg> uint32_t inline get_i (const Kernel_3x3& ker) { return ker.i; }
2	pmbaty	268
3	pmbaty	269	template <> inline uint32_t get_a<ROT_90>(const Kernel_3x3& ker) { return ker.g; }
		270	template <> inline uint32_t get_b<ROT_90>(const Kernel_3x3& ker) { return ker.d; }
		271	template <> inline uint32_t get_c<ROT_90>(const Kernel_3x3& ker) { return ker.a; }
		272	template <> inline uint32_t get_d<ROT_90>(const Kernel_3x3& ker) { return ker.h; }
		273	template <> inline uint32_t get_e<ROT_90>(const Kernel_3x3& ker) { return ker.e; }
		274	template <> inline uint32_t get_f<ROT_90>(const Kernel_3x3& ker) { return ker.b; }
		275	template <> inline uint32_t get_g<ROT_90>(const Kernel_3x3& ker) { return ker.i; }
		276	template <> inline uint32_t get_h<ROT_90>(const Kernel_3x3& ker) { return ker.f; }
		277	template <> inline uint32_t get_i<ROT_90>(const Kernel_3x3& ker) { return ker.c; }
		278
		279	template <> inline uint32_t get_a<ROT_180>(const Kernel_3x3& ker) { return ker.i; }
		280	template <> inline uint32_t get_b<ROT_180>(const Kernel_3x3& ker) { return ker.h; }
		281	template <> inline uint32_t get_c<ROT_180>(const Kernel_3x3& ker) { return ker.g; }
		282	template <> inline uint32_t get_d<ROT_180>(const Kernel_3x3& ker) { return ker.f; }
		283	template <> inline uint32_t get_e<ROT_180>(const Kernel_3x3& ker) { return ker.e; }
		284	template <> inline uint32_t get_f<ROT_180>(const Kernel_3x3& ker) { return ker.d; }
		285	template <> inline uint32_t get_g<ROT_180>(const Kernel_3x3& ker) { return ker.c; }
		286	template <> inline uint32_t get_h<ROT_180>(const Kernel_3x3& ker) { return ker.b; }
		287	template <> inline uint32_t get_i<ROT_180>(const Kernel_3x3& ker) { return ker.a; }
		288
		289	template <> inline uint32_t get_a<ROT_270>(const Kernel_3x3& ker) { return ker.c; }
		290	template <> inline uint32_t get_b<ROT_270>(const Kernel_3x3& ker) { return ker.f; }
		291	template <> inline uint32_t get_c<ROT_270>(const Kernel_3x3& ker) { return ker.i; }
		292	template <> inline uint32_t get_d<ROT_270>(const Kernel_3x3& ker) { return ker.b; }
		293	template <> inline uint32_t get_e<ROT_270>(const Kernel_3x3& ker) { return ker.e; }
		294	template <> inline uint32_t get_f<ROT_270>(const Kernel_3x3& ker) { return ker.h; }
		295	template <> inline uint32_t get_g<ROT_270>(const Kernel_3x3& ker) { return ker.a; }
		296	template <> inline uint32_t get_h<ROT_270>(const Kernel_3x3& ker) { return ker.d; }
		297	template <> inline uint32_t get_i<ROT_270>(const Kernel_3x3& ker) { return ker.g; }
		298
2	pmbaty	299	//compress four blend types into a single byte
3	pmbaty	300	inline BlendType getTopL (unsigned char b) { return (BlendType)(0x3 & b); }
		301	inline BlendType getTopR (unsigned char b) { return (BlendType)(0x3 & (b >> 2)); }
		302	inline BlendType getBottomR(unsigned char b) { return (BlendType)(0x3 & (b >> 4)); }
		303	inline BlendType getBottomL(unsigned char b) { return (BlendType)(0x3 & (b >> 6)); }
2	pmbaty	304
		305	inline void setTopL (unsigned char& b, BlendType bt) { b \|= bt; } //buffer is assumed to be initialized before preprocessing!
		306	inline void setTopR (unsigned char& b, BlendType bt) { b \|= (bt << 2); }
		307	inline void setBottomR(unsigned char& b, BlendType bt) { b \|= (bt << 4); }
		308	inline void setBottomL(unsigned char& b, BlendType bt) { b \|= (bt << 6); }
		309
		310	template <RotationDegree rotDeg> inline
3	pmbaty	311	unsigned char rotateBlendInfo (unsigned char b) { return b; }
2	pmbaty	312	template <> inline unsigned char rotateBlendInfo<ROT_90 >(unsigned char b) { return ((b << 2) \| (b >> 6)) & 0xff; }
		313	template <> inline unsigned char rotateBlendInfo<ROT_180>(unsigned char b) { return ((b << 4) \| (b >> 4)) & 0xff; }
		314	template <> inline unsigned char rotateBlendInfo<ROT_270>(unsigned char b) { return ((b << 6) \| (b >> 2)) & 0xff; }
		315
		316
		317	/*
		318	input kernel area naming convention:
		319	-------------
		320	\| A \| B \| C \|
		321	----\|---\|---\|
		322	\| D \| E \| F \| //input pixel is at position E
		323	----\|---\|---\|
		324	\| G \| H \| I \|
		325	-------------
		326	*/
3	pmbaty	327	template <class Scaler, RotationDegree rotDeg>
		328	FORCE_INLINE void blendPixel(const Kernel_3x3& ker, uint32_t *target, int trgWidth, unsigned char blendInfo, alphagrad_func alphagrad, dist_func dist) //result of preprocessing all four corners of pixel "e"
2	pmbaty	329	{
		330	#define a get_a<rotDeg>(ker)
		331	#define b get_b<rotDeg>(ker)
		332	#define c get_c<rotDeg>(ker)
		333	#define d get_d<rotDeg>(ker)
		334	#define e get_e<rotDeg>(ker)
		335	#define f get_f<rotDeg>(ker)
		336	#define g get_g<rotDeg>(ker)
		337	#define h get_h<rotDeg>(ker)
		338	#define i get_i<rotDeg>(ker)
		339
		340	const unsigned char blend = rotateBlendInfo<rotDeg>(blendInfo);
		341
		342	if (getBottomR(blend) >= BLEND_NORMAL)
		343	{
3	pmbaty	344	bool doLineBlend;
2	pmbaty	345
3	pmbaty	346	if (getBottomR(blend) >= BLEND_DOMINANT)
		347	doLineBlend = true;
		348	else if (getTopR(blend) != BLEND_NONE && (dist (e, g) >= XBRZ_CFG_EQUAL_COLOR_TOLERANCE)) //but support double-blending for 90� corners
		349	doLineBlend = false; // make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
		350	else if (getBottomL(blend) != BLEND_NONE && (dist (e, c) >= XBRZ_CFG_EQUAL_COLOR_TOLERANCE))
		351	doLineBlend = false; // make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
		352	else if ((dist (e, i) >= XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
		353	&& (dist (g, h) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
		354	&& (dist (h, i) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
		355	&& (dist (i, f) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
		356	&& (dist (f, c) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE))
		357	doLineBlend = false; // no full blending for L-shapes; blend corner only (handles "mario mushroom eyes")
		358	else
		359	doLineBlend = true;
2	pmbaty	360
3	pmbaty	361	const uint32_t px = (dist (e, f) <= dist (e, h) ? f : h); //choose most similar color
2	pmbaty	362
		363	OutputMatrix<Scaler::scale, rotDeg> out(target, trgWidth);
		364
		365	if (doLineBlend)
		366	{
3	pmbaty	367	const double fg = dist (f, g); //test sample: 70% of values max(fg, hc) / min(fg, hc) are between 1.1 and 3.7 with median being 1.9
		368	const double hc = dist (h, c); //
2	pmbaty	369
		370	const bool haveShallowLine = XBRZ_CFG_STEEP_DIRECTION_THRESHOLD * fg <= hc && e != g && d != g;
		371	const bool haveSteepLine = XBRZ_CFG_STEEP_DIRECTION_THRESHOLD * hc <= fg && e != c && b != c;
		372
		373	if (haveShallowLine)
		374	{
		375	if (haveSteepLine)
3	pmbaty	376	Scaler::blendLineSteepAndShallow(px, out, alphagrad);
2	pmbaty	377	else
3	pmbaty	378	Scaler::blendLineShallow(px, out, alphagrad);
2	pmbaty	379	}
		380	else
		381	{
		382	if (haveSteepLine)
3	pmbaty	383	Scaler::blendLineSteep(px, out, alphagrad);
2	pmbaty	384	else
3	pmbaty	385	Scaler::blendLineDiagonal(px, out, alphagrad);
2	pmbaty	386	}
		387	}
		388	else
3	pmbaty	389	Scaler::blendCorner(px, out, alphagrad);
2	pmbaty	390	}
		391
		392	#undef a
		393	#undef b
		394	#undef c
		395	#undef d
		396	#undef e
		397	#undef f
		398	#undef g
		399	#undef h
		400	#undef i
		401	}
		402
		403
3	pmbaty	404	template <class Scaler> //scaler policy: see "Scaler2x" reference implementation
		405	void scaleImage(const uint32_t src, uint32_t trg, int srcWidth, int srcHeight, int yFirst, int yLast, alphagrad_func alphagrad, dist_func dist)
2	pmbaty	406	{
3	pmbaty	407	yFirst = MAX (yFirst, 0);
		408	yLast = MIN (yLast, srcHeight);
2	pmbaty	409	if (yFirst >= yLast \|\| srcWidth <= 0)
		410	return;
		411
		412	const int trgWidth = srcWidth * Scaler::scale;
		413
		414	//"use" space at the end of the image as temporary buffer for "on the fly preprocessing": we even could use larger area of
		415	//"sizeof(uint32_t) * srcWidth * (yLast - yFirst)" bytes without risk of accidental overwriting before accessing
		416	const int bufferSize = srcWidth;
		417	unsigned char* preProcBuffer = reinterpret_cast<unsigned char>(trg + yLast Scaler::scale * trgWidth) - bufferSize;
3	pmbaty	418	memset (preProcBuffer, 0, bufferSize);
2	pmbaty	419	static_assert(BLEND_NONE == 0, "");
		420
		421	//initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending
		422	//this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition!
		423	if (yFirst > 0)
		424	{
		425	const int y = yFirst - 1;
		426
3	pmbaty	427	const uint32_t* s_m1 = src + srcWidth * MAX (y - 1, 0);
2	pmbaty	428	const uint32_t* s_0 = src + srcWidth * y; //center line
3	pmbaty	429	const uint32_t* s_p1 = src + srcWidth * MIN (y + 1, srcHeight - 1);
		430	const uint32_t* s_p2 = src + srcWidth * MIN (y + 2, srcHeight - 1);
2	pmbaty	431
		432	for (int x = 0; x < srcWidth; ++x)
		433	{
3	pmbaty	434	const int x_m1 = MAX (x - 1, 0);
		435	const int x_p1 = MIN (x + 1, srcWidth - 1);
		436	const int x_p2 = MIN (x + 2, srcWidth - 1);
2	pmbaty	437
		438	Kernel_4x4 ker = {}; //perf: initialization is negligible
		439	ker.a = s_m1[x_m1]; //read sequentially from memory as far as possible
		440	ker.b = s_m1[x];
		441	ker.c = s_m1[x_p1];
		442	ker.d = s_m1[x_p2];
		443
		444	ker.e = s_0[x_m1];
		445	ker.f = s_0[x];
		446	ker.g = s_0[x_p1];
		447	ker.h = s_0[x_p2];
		448
		449	ker.i = s_p1[x_m1];
		450	ker.j = s_p1[x];
		451	ker.k = s_p1[x_p1];
		452	ker.l = s_p1[x_p2];
		453
		454	ker.m = s_p2[x_m1];
		455	ker.n = s_p2[x];
		456	ker.o = s_p2[x_p1];
		457	ker.p = s_p2[x_p2];
		458
3	pmbaty	459	const BlendResult res = preProcessCorners (ker, dist);
2	pmbaty	460	/*
		461	preprocessing blend result:
		462	---------
		463	\| F \| G \| //evalute corner between F, G, J, K
		464	----\|---\| //input pixel is at position F
		465	\| J \| K \|
		466	---------
		467	*/
		468	setTopR(preProcBuffer[x], res.blend_j);
		469
		470	if (x + 1 < bufferSize)
		471	setTopL(preProcBuffer[x + 1], res.blend_k);
		472	}
		473	}
		474	//------------------------------------------------------------------------------------
		475
		476	for (int y = yFirst; y < yLast; ++y)
		477	{
3	pmbaty	478	uint32_t out = trg + Scaler::scale y * trgWidth; //consider MT "striped" access
2	pmbaty	479
3	pmbaty	480	const uint32_t* s_m1 = src + srcWidth * MAX (y - 1, 0);
2	pmbaty	481	const uint32_t* s_0 = src + srcWidth * y; //center line
3	pmbaty	482	const uint32_t* s_p1 = src + srcWidth * MIN (y + 1, srcHeight - 1);
		483	const uint32_t* s_p2 = src + srcWidth * MIN (y + 2, srcHeight - 1);
2	pmbaty	484
		485	unsigned char blend_xy1 = 0; //corner blending for current (x, y + 1) position
		486
		487	for (int x = 0; x < srcWidth; ++x, out += Scaler::scale)
		488	{
		489	//all those bounds checks have only insignificant impact on performance!
3	pmbaty	490	const int x_m1 = MAX (x - 1, 0); //perf: prefer array indexing to additional pointers!
		491	const int x_p1 = MIN (x + 1, srcWidth - 1);
		492	const int x_p2 = MIN (x + 2, srcWidth - 1);
2	pmbaty	493
		494	Kernel_4x4 ker4 = {}; //perf: initialization is negligible
		495
		496	ker4.a = s_m1[x_m1]; //read sequentially from memory as far as possible
		497	ker4.b = s_m1[x];
		498	ker4.c = s_m1[x_p1];
		499	ker4.d = s_m1[x_p2];
		500
		501	ker4.e = s_0[x_m1];
		502	ker4.f = s_0[x];
		503	ker4.g = s_0[x_p1];
		504	ker4.h = s_0[x_p2];
		505
		506	ker4.i = s_p1[x_m1];
		507	ker4.j = s_p1[x];
		508	ker4.k = s_p1[x_p1];
		509	ker4.l = s_p1[x_p2];
		510
		511	ker4.m = s_p2[x_m1];
		512	ker4.n = s_p2[x];
		513	ker4.o = s_p2[x_p1];
		514	ker4.p = s_p2[x_p2];
		515
		516	//evaluate the four corners on bottom-right of current pixel
		517	unsigned char blend_xy = 0; //for current (x, y) position
		518	{
3	pmbaty	519	const BlendResult res = preProcessCorners (ker4, dist);
2	pmbaty	520	/*
		521	preprocessing blend result:
		522	---------
		523	\| F \| G \| //evalute corner between F, G, J, K
		524	----\|---\| //current input pixel is at position F
		525	\| J \| K \|
		526	---------
		527	*/
		528	blend_xy = preProcBuffer[x];
		529	setBottomR(blend_xy, res.blend_f); //all four corners of (x, y) have been determined at this point due to processing sequence!
		530
		531	setTopR(blend_xy1, res.blend_j); //set 2nd known corner for (x, y + 1)
		532	preProcBuffer[x] = blend_xy1; //store on current buffer position for use on next row
		533
		534	blend_xy1 = 0;
		535	setTopL(blend_xy1, res.blend_k); //set 1st known corner for (x + 1, y + 1) and buffer for use on next column
		536
		537	if (x + 1 < bufferSize) //set 3rd known corner for (x + 1, y)
		538	setBottomL(preProcBuffer[x + 1], res.blend_g);
		539	}
		540
		541	//fill block of size scale * scale with the given color
3	pmbaty	542	{
		543	uint32_t *blk = out;
		544	for (int _blk_y = 0; _blk_y < Scaler::scale; ++_blk_y, blk = (uint32_t ) BYTE_ADVANCE (blk, trgWidth sizeof (uint32_t)))
		545	for (int _blk_x = 0; _blk_x < Scaler::scale; ++_blk_x)
		546	blk[_blk_x] = ker4.f;
		547	}
2	pmbaty	548	//place after preprocessing step, to not overwrite the results while processing the the last pixel!
		549
		550	//blend four corners of current pixel
3	pmbaty	551	if (blend_xy != 0) //good 5% perf-improvement
2	pmbaty	552	{
		553	Kernel_3x3 ker3 = {}; //perf: initialization is negligible
		554
		555	ker3.a = ker4.a;
		556	ker3.b = ker4.b;
		557	ker3.c = ker4.c;
		558
		559	ker3.d = ker4.e;
		560	ker3.e = ker4.f;
		561	ker3.f = ker4.g;
		562
		563	ker3.g = ker4.i;
		564	ker3.h = ker4.j;
		565	ker3.i = ker4.k;
		566
3	pmbaty	567	blendPixel<Scaler, ROT_0 >(ker3, out, trgWidth, blend_xy, alphagrad, dist);
		568	blendPixel<Scaler, ROT_90 >(ker3, out, trgWidth, blend_xy, alphagrad, dist);
		569	blendPixel<Scaler, ROT_180>(ker3, out, trgWidth, blend_xy, alphagrad, dist);
		570	blendPixel<Scaler, ROT_270>(ker3, out, trgWidth, blend_xy, alphagrad, dist);
2	pmbaty	571	}
		572	}
		573	}
		574	}
		575
		576
		577	//------------------------------------------------------------------------------------
3	pmbaty	578	struct Scaler2x
2	pmbaty	579	{
		580	static const int scale = 2;
		581
		582
		583	template <class OutputMatrix>
3	pmbaty	584	static void blendLineShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	585	{
3	pmbaty	586	alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
		587	alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
2	pmbaty	588	}
		589
		590	template <class OutputMatrix>
3	pmbaty	591	static void blendLineSteep(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	592	{
3	pmbaty	593	alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
		594	alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
2	pmbaty	595	}
		596
		597	template <class OutputMatrix>
3	pmbaty	598	static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	599	{
3	pmbaty	600	alphagrad (&(out.template ref<1, 0>()), col, 1, 4);
		601	alphagrad (&(out.template ref<0, 1>()), col, 1, 4);
		602	alphagrad (&(out.template ref<1, 1>()), col, 5, 6); //[!] fixes 7/8 used in xBR
2	pmbaty	603	}
		604
		605	template <class OutputMatrix>
3	pmbaty	606	static void blendLineDiagonal(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	607	{
3	pmbaty	608	alphagrad (&(out.template ref<1, 1>()), col, 1, 2);
2	pmbaty	609	}
		610
		611	template <class OutputMatrix>
3	pmbaty	612	static void blendCorner(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	613	{
		614	//model a round corner
3	pmbaty	615	alphagrad (&(out.template ref<1, 1>()), col, 21, 100); //exact: 1 - pi/4 = 0.2146018366
2	pmbaty	616	}
		617	};
		618
		619
3	pmbaty	620	struct Scaler3x
2	pmbaty	621	{
		622	static const int scale = 3;
		623
		624
		625	template <class OutputMatrix>
3	pmbaty	626	static void blendLineShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	627	{
3	pmbaty	628	alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
		629	alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
		630	alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
2	pmbaty	631	out.template ref<scale - 1, 2>() = col;
		632	}
		633
		634	template <class OutputMatrix>
3	pmbaty	635	static void blendLineSteep(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	636	{
3	pmbaty	637	alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
		638	alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
		639	alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
2	pmbaty	640	out.template ref<2, scale - 1>() = col;
		641	}
		642
		643	template <class OutputMatrix>
3	pmbaty	644	static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	645	{
3	pmbaty	646	alphagrad (&(out.template ref<2, 0>()), col, 1, 4);
		647	alphagrad (&(out.template ref<0, 2>()), col, 1, 4);
		648	alphagrad (&(out.template ref<2, 1>()), col, 3, 4);
		649	alphagrad (&(out.template ref<1, 2>()), col, 3, 4);
2	pmbaty	650	out.template ref<2, 2>() = col;
		651	}
		652
		653	template <class OutputMatrix>
3	pmbaty	654	static void blendLineDiagonal(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	655	{
3	pmbaty	656	alphagrad (&(out.template ref<1, 2>()), col, 1, 8); //conflict with other rotations for this odd scale
		657	alphagrad (&(out.template ref<2, 1>()), col, 1, 8);
		658	alphagrad (&(out.template ref<2, 2>()), col, 7, 8); //
2	pmbaty	659	}
		660
		661	template <class OutputMatrix>
3	pmbaty	662	static void blendCorner(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	663	{
		664	//model a round corner
3	pmbaty	665	alphagrad (&(out.template ref<2, 2>()), col, 45, 100); //exact: 0.4545939598
		666	//alphagrad (&(out.template ref<2, 1>()), col, 7, 256); //0.02826017254 -> negligible + avoid conflicts with other rotations for this odd scale
		667	//alphagrad (&(out.template ref<1, 2>()), col, 7, 256); //0.02826017254
2	pmbaty	668	}
		669	};
		670
		671
3	pmbaty	672	struct Scaler4x
2	pmbaty	673	{
		674	static const int scale = 4;
		675
		676
		677	template <class OutputMatrix>
3	pmbaty	678	static void blendLineShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	679	{
3	pmbaty	680	alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
		681	alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
		682	alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
		683	alphagrad (&(out.template ref<scale - 2, 3>()), col, 3, 4);
2	pmbaty	684
		685	out.template ref<scale - 1, 2>() = col;
		686	out.template ref<scale - 1, 3>() = col;
		687	}
		688
		689	template <class OutputMatrix>
3	pmbaty	690	static void blendLineSteep(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	691	{
3	pmbaty	692	alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
		693	alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
		694	alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
		695	alphagrad (&(out.template ref<3, scale - 2>()), col, 3, 4);
2	pmbaty	696
		697	out.template ref<2, scale - 1>() = col;
		698	out.template ref<3, scale - 1>() = col;
		699	}
		700
		701	template <class OutputMatrix>
3	pmbaty	702	static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	703	{
3	pmbaty	704	alphagrad (&(out.template ref<3, 1>()), col, 3, 4);
		705	alphagrad (&(out.template ref<1, 3>()), col, 3, 4);
		706	alphagrad (&(out.template ref<3, 0>()), col, 1, 4);
		707	alphagrad (&(out.template ref<0, 3>()), col, 1, 4);
		708	alphagrad (&(out.template ref<2, 2>()), col, 1, 3); //[!] fixes 1/4 used in xBR
2	pmbaty	709
		710	out.template ref<3, 3>() = col;
		711	out.template ref<3, 2>() = col;
		712	out.template ref<2, 3>() = col;
		713	}
		714
		715	template <class OutputMatrix>
3	pmbaty	716	static void blendLineDiagonal(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	717	{
3	pmbaty	718	alphagrad (&(out.template ref<scale - 1, scale / 2 >()), col, 1, 2);
		719	alphagrad (&(out.template ref<scale - 2, scale / 2 + 1>()), col, 1, 2);
		720
2	pmbaty	721	out.template ref<scale - 1, scale - 1>() = col;
		722	}
		723
		724	template <class OutputMatrix>
3	pmbaty	725	static void blendCorner(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	726	{
		727	//model a round corner
3	pmbaty	728	alphagrad (&(out.template ref<3, 3>()), col, 68, 100); //exact: 0.6848532563
		729	alphagrad (&(out.template ref<3, 2>()), col, 9, 100); //0.08677704501
		730	alphagrad (&(out.template ref<2, 3>()), col, 9, 100); //0.08677704501
2	pmbaty	731	}
		732	};
		733
		734
3	pmbaty	735	struct Scaler5x
2	pmbaty	736	{
		737	static const int scale = 5;
		738
		739
		740	template <class OutputMatrix>
3	pmbaty	741	static void blendLineShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	742	{
3	pmbaty	743	alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
		744	alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
		745	alphagrad (&(out.template ref<scale - 3, 4>()), col, 1, 4);
		746	alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
		747	alphagrad (&(out.template ref<scale - 2, 3>()), col, 3, 4);
2	pmbaty	748
		749	out.template ref<scale - 1, 2>() = col;
		750	out.template ref<scale - 1, 3>() = col;
		751	out.template ref<scale - 1, 4>() = col;
		752	out.template ref<scale - 2, 4>() = col;
		753	}
		754
		755	template <class OutputMatrix>
3	pmbaty	756	static void blendLineSteep(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	757	{
3	pmbaty	758	alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
		759	alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
		760	alphagrad (&(out.template ref<4, scale - 3>()), col, 1, 4);
		761	alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
		762	alphagrad (&(out.template ref<3, scale - 2>()), col, 3, 4);
2	pmbaty	763
		764	out.template ref<2, scale - 1>() = col;
		765	out.template ref<3, scale - 1>() = col;
		766	out.template ref<4, scale - 1>() = col;
		767	out.template ref<4, scale - 2>() = col;
		768	}
		769
		770	template <class OutputMatrix>
3	pmbaty	771	static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	772	{
3	pmbaty	773	alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
		774	alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
		775	alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
		776	alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
		777	alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
		778	alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
		779	alphagrad (&(out.template ref<3, 3>()), col, 2, 3);
2	pmbaty	780
		781	out.template ref<2, scale - 1>() = col;
		782	out.template ref<3, scale - 1>() = col;
		783	out.template ref<4, scale - 1>() = col;
		784	out.template ref<scale - 1, 2>() = col;
		785	out.template ref<scale - 1, 3>() = col;
		786	}
		787
		788	template <class OutputMatrix>
3	pmbaty	789	static void blendLineDiagonal(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	790	{
3	pmbaty	791	alphagrad (&(out.template ref<scale - 1, scale / 2 >()), col, 1, 8); //conflict with other rotations for this odd scale
		792	alphagrad (&(out.template ref<scale - 2, scale / 2 + 1>()), col, 1, 8);
		793	alphagrad (&(out.template ref<scale - 3, scale / 2 + 2>()), col, 1, 8); //
		794	alphagrad (&(out.template ref<4, 3>()), col, 7, 8);
		795	alphagrad (&(out.template ref<3, 4>()), col, 7, 8);
2	pmbaty	796
		797	out.template ref<4, 4>() = col;
		798	}
		799
		800	template <class OutputMatrix>
3	pmbaty	801	static void blendCorner(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	802	{
		803	// model a round corner
3	pmbaty	804	alphagrad (&(out.template ref<4, 4>()), col, 86, 100); //exact: 0.8631434088
		805	alphagrad (&(out.template ref<4, 3>()), col, 23, 100); //0.2306749731
		806	alphagrad (&(out.template ref<3, 4>()), col, 23, 100); //0.2306749731
2	pmbaty	807	//alphaGrad<1, 64>(out.template ref<4, 2>(), col); //0.01676812367 -> negligible + avoid conflicts with other rotations for this odd scale
		808	//alphaGrad<1, 64>(out.template ref<2, 4>(), col); //0.01676812367
		809	}
		810	};
		811
		812
3	pmbaty	813	struct Scaler6x
2	pmbaty	814	{
		815	static const int scale = 6;
		816
		817
		818	template <class OutputMatrix>
3	pmbaty	819	static void blendLineShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	820	{
3	pmbaty	821	alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
		822	alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
		823	alphagrad (&(out.template ref<scale - 3, 4>()), col, 1, 4);
		824	alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
		825	alphagrad (&(out.template ref<scale - 2, 3>()), col, 3, 4);
		826	alphagrad (&(out.template ref<scale - 3, 5>()), col, 3, 4);
2	pmbaty	827
		828	out.template ref<scale - 1, 2>() = col;
		829	out.template ref<scale - 1, 3>() = col;
		830	out.template ref<scale - 1, 4>() = col;
		831	out.template ref<scale - 1, 5>() = col;
		832	out.template ref<scale - 2, 4>() = col;
		833	out.template ref<scale - 2, 5>() = col;
		834	}
		835
		836	template <class OutputMatrix>
3	pmbaty	837	static void blendLineSteep(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	838	{
3	pmbaty	839	alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
		840	alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
		841	alphagrad (&(out.template ref<4, scale - 3>()), col, 1, 4);
		842	alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
		843	alphagrad (&(out.template ref<3, scale - 2>()), col, 3, 4);
		844	alphagrad (&(out.template ref<5, scale - 3>()), col, 3, 4);
2	pmbaty	845
		846	out.template ref<2, scale - 1>() = col;
		847	out.template ref<3, scale - 1>() = col;
		848	out.template ref<4, scale - 1>() = col;
		849	out.template ref<5, scale - 1>() = col;
		850	out.template ref<4, scale - 2>() = col;
		851	out.template ref<5, scale - 2>() = col;
		852	}
		853
		854	template <class OutputMatrix>
3	pmbaty	855	static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	856	{
3	pmbaty	857	alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
		858	alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
		859	alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
		860	alphagrad (&(out.template ref<3, scale - 2>()), col, 3, 4);
		861	alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
		862	alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
		863	alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
		864	alphagrad (&(out.template ref<scale - 2, 3>()), col, 3, 4);
2	pmbaty	865
		866	out.template ref<2, scale - 1>() = col;
		867	out.template ref<3, scale - 1>() = col;
		868	out.template ref<4, scale - 1>() = col;
		869	out.template ref<5, scale - 1>() = col;
		870	out.template ref<4, scale - 2>() = col;
		871	out.template ref<5, scale - 2>() = col;
		872	out.template ref<scale - 1, 2>() = col;
		873	out.template ref<scale - 1, 3>() = col;
		874	}
		875
		876	template <class OutputMatrix>
3	pmbaty	877	static void blendLineDiagonal(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	878	{
3	pmbaty	879	alphagrad (&(out.template ref<scale - 1, scale / 2 >()), col, 1, 2);
		880	alphagrad (&(out.template ref<scale - 2, scale / 2 + 1>()), col, 1, 2);
		881	alphagrad (&(out.template ref<scale - 3, scale / 2 + 2>()), col, 1, 2);
2	pmbaty	882
		883	out.template ref<scale - 2, scale - 1>() = col;
		884	out.template ref<scale - 1, scale - 1>() = col;
		885	out.template ref<scale - 1, scale - 2>() = col;
		886	}
		887
		888	template <class OutputMatrix>
3	pmbaty	889	static void blendCorner(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2	pmbaty	890	{
		891	//model a round corner
3	pmbaty	892	alphagrad (&(out.template ref<5, 5>()), col, 97, 100); //exact: 0.9711013910
		893	alphagrad (&(out.template ref<4, 5>()), col, 42, 100); //0.4236372243
		894	alphagrad (&(out.template ref<5, 4>()), col, 42, 100); //0.4236372243
		895	alphagrad (&(out.template ref<5, 3>()), col, 6, 100); //0.05652034508
		896	alphagrad (&(out.template ref<3, 5>()), col, 6, 100); //0.05652034508
2	pmbaty	897	}
		898	};
		899
		900	//------------------------------------------------------------------------------------
3	pmbaty	901	}
2	pmbaty	902
		903
		904
3	pmbaty	905	static double dist24 (uint32_t pix1, uint32_t pix2)
		906	{
		907	//30% perf boost compared to plain distYCbCr()!
		908	//consumes 64 MB memory; using double is only 2% faster, but takes 128 MB
		909	static float diffToDist[256 * 256 * 256];
		910	static bool is_initialized = false;
		911	if (!is_initialized)
		912	{
		913	for (uint32_t i = 0; i < 256 * 256 * 256; ++i) //startup time: 114 ms on Intel Core i5 (four cores)
		914	{
		915	const int r_diff = GET_RED (i) * 2 - 0xFF;
		916	const int g_diff = GET_GREEN (i) * 2 - 0xFF;
		917	const int b_diff = GET_BLUE (i) * 2 - 0xFF;
2	pmbaty	918
3	pmbaty	919	const double k_b = 0.0593; //ITU-R BT.2020 conversion
		920	const double k_r = 0.2627; //
		921	const double k_g = 1 - k_b - k_r;
2	pmbaty	922
3	pmbaty	923	const double scale_b = 0.5 / (1 - k_b);
		924	const double scale_r = 0.5 / (1 - k_r);
2	pmbaty	925
3	pmbaty	926	const double y = k_r * r_diff + k_g * g_diff + k_b * b_diff; //[!], analog YCbCr!
		927	const double c_b = scale_b * (b_diff - y);
		928	const double c_r = scale_r * (r_diff - y);
		929
		930	diffToDist[i] = (float) (sqrt ((y * y) + (c_b * c_b) + (c_r * c_r)));
		931	}
		932	is_initialized = true;
		933	}
		934
		935	const int r_diff = (int) GET_RED (pix1) - (int) GET_RED (pix2);
		936	const int g_diff = (int) GET_GREEN (pix1) - (int) GET_GREEN (pix2);
		937	const int b_diff = (int) GET_BLUE (pix1) - (int) GET_BLUE (pix2);
		938
		939	return diffToDist[(((r_diff + 0xFF) / 2) << 16) \| //slightly reduce precision (division by 2) to squeeze value into single byte
		940	(((g_diff + 0xFF) / 2) << 8) \|
		941	(((b_diff + 0xFF) / 2) << 0)];
2	pmbaty	942	}
		943
		944
3	pmbaty	945	static double dist32 (uint32_t pix1, uint32_t pix2)
		946	{
		947	const double a1 = GET_ALPHA (pix1) / 255.0 ;
		948	const double a2 = GET_ALPHA (pix2) / 255.0 ;
		949	/*
		950	Requirements for a color distance handling alpha channel: with a1, a2 in [0, 1]
2	pmbaty	951
3	pmbaty	952	1. if a1 = a2, distance should be: a1 * distYCbCr()
		953	2. if a1 = 0, distance should be: a2 * distYCbCr(black, white) = a2 * 255
		954	3. if a1 = 1, ??? maybe: 255 * (1 - a2) + a2 * distYCbCr()
		955	*/
		956
		957	//return MIN (a1, a2) * distYCbCrBuffered(pix1, pix2) + 255 * abs(a1 - a2);
		958	//=> following code is 15% faster:
		959	const double d = dist24 (pix1, pix2);
		960	return (a1 < a2 ? a1 * d + 255 * (a2 - a1) : a2 * d + 255 * (a1 - a2));
		961	}
		962
		963
		964	static void alphagrad24 (uint32_t *pixBack, uint32_t pixFront, unsigned int M, unsigned int N)
2	pmbaty	965	{
3	pmbaty	966	// blend front color with opacity M / N over opaque background: http://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending
		967	pixBack = ( (CALC_COLOR24 (GET_RED (pixFront), GET_RED (pixBack), M, N) << 16)
		968	\| (CALC_COLOR24 (GET_GREEN (pixFront), GET_GREEN (*pixBack), M, N) << 8)
		969	\| (CALC_COLOR24 (GET_BLUE (pixFront), GET_BLUE (*pixBack), M, N) << 0));
2	pmbaty	970	}
		971
		972
3	pmbaty	973	static void alphagrad32 (uint32_t *pixBack, uint32_t pixFront, unsigned int M, unsigned int N)
		974	{
		975	// find intermediate color between two colors with alpha channels (=> NO alpha blending!!!)
		976	const unsigned int weightFront = GET_ALPHA (pixFront) * M;
		977	const unsigned int weightBack = GET_ALPHA (pixBack) (N - M);
		978	const unsigned int weightSum = weightFront + weightBack;
		979	*pixBack = (weightSum == 0 ? 0 :
		980	(((unsigned char) (weightSum / N)) << 24)
		981	\| (CALC_COLOR32 (GET_RED (pixFront), GET_RED (*pixBack), weightFront, weightBack, weightSum) << 16)
		982	\| (CALC_COLOR32 (GET_GREEN (pixFront), GET_GREEN (*pixBack), weightFront, weightBack, weightSum) << 8)
		983	\| (CALC_COLOR32 (GET_BLUE (pixFront), GET_BLUE (*pixBack), weightFront, weightBack, weightSum) << 0));
		984	}
		985
		986
		987	EXTERN_C void nearestNeighborScale(const uint32_t src, int srcWidth, int srcHeight, uint32_t trg, int trgWidth, int trgHeight)
		988	{
		989	// nearestNeighborScale (src, srcWidth, srcHeight, srcWidth * sizeof (uint32_t), trg, trgWidth, trgHeight, trgWidth * sizeof (uint32_t), XBRZ_SLICETYPE_TARGET, 0, trgHeight, [](uint32_t pix) { return pix; });
		990	//static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*");
		991	//static_assert(std::is_integral<PixTrg>::value, "PixTrg* is expected to be cast-able to char*");
		992	//static_assert(std::is_same<decltype(pixCvrt(PixSrc())), PixTrg>::value, "PixConverter returning wrong pixel format");
		993
		994	int srcPitch = srcWidth * sizeof (uint32_t);
		995	int trgPitch = trgWidth * sizeof (uint32_t);
		996	int yFirst;
		997	int yLast;
		998
		999	#if 0 // going over source image - fast for upscaling, since source is read only once
		1000	yFirst = 0;
		1001	yLast = MIN (trgHeight, srcHeight);
		1002
		1003	if (yFirst >= yLast \|\| trgWidth <= 0 \|\| trgHeight <= 0)
		1004	return; // consistency check
		1005
		1006	for (int y = yFirst; y < yLast; ++y)
		1007	{
		1008	//mathematically: ySrc = floor(srcHeight * yTrg / trgHeight)
		1009	// => search for integers in: [ySrc, ySrc + 1) * trgHeight / srcHeight
		1010
		1011	//keep within for loop to support MT input slices!
		1012	const int yTrg_first = ( y * trgHeight + srcHeight - 1) / srcHeight; //=ceil(y * trgHeight / srcHeight)
		1013	const int yTrg_last = ((y + 1) * trgHeight + srcHeight - 1) / srcHeight; //=ceil(((y + 1) * trgHeight) / srcHeight)
		1014	const int blockHeight = yTrg_last - yTrg_first;
		1015
		1016	if (blockHeight > 0)
		1017	{
		1018	const uint32_t srcLine = (const uint32_t ) BYTE_ADVANCE (src, y * srcPitch);
		1019	/*/ uint32_t trgLine = ( uint32_t ) BYTE_ADVANCE (trg, yTrg_first trgPitch);
		1020	int xTrg_first = 0;
		1021
		1022	for (int x = 0; x < srcWidth; ++x)
		1023	{
		1024	const int xTrg_last = ((x + 1) * trgWidth + srcWidth - 1) / srcWidth;
		1025	const int blockWidth = xTrg_last - xTrg_first;
		1026	if (blockWidth > 0)
		1027	{
		1028	const uint32_t trgColor = srcLine[x];
		1029	uint32_t *blkLine = trgLine;
		1030
		1031	xTrg_first = xTrg_last;
		1032
		1033	for (int blk_y = 0; blk_y < blockHeight; ++blk_y, blkLine = (uint32_t *) BYTE_ADVANCE (blkLine, trgPitch))
		1034	for (int blk_x = 0; blk_x < blockWidth; ++blk_x)
		1035	blkLine[blk_x] = trgColor;
		1036
		1037	trgLine += blockWidth;
		1038	}
		1039	}
		1040	}
		1041	}
		1042	#else // going over target image - slow for upscaling, since source is read multiple times missing out on cache! Fast for similar image sizes!
		1043	yFirst = 0;
		1044	yLast = trgHeight;
		1045
		1046	if (yFirst >= yLast \|\| srcHeight <= 0 \|\| srcWidth <= 0)
		1047	return; // consistency check
		1048
		1049	for (int y = yFirst; y < yLast; ++y)
		1050	{
		1051	/*/ uint32_t trgLine = ( uint32_t ) BYTE_ADVANCE (trg, y trgPitch);
		1052	const int ySrc = srcHeight * y / trgHeight;
		1053	const uint32_t srcLine = (const uint32_t ) BYTE_ADVANCE (src, ySrc * srcPitch);
		1054	for (int x = 0; x < trgWidth; ++x)
		1055	{
		1056	const int xSrc = srcWidth * x / trgWidth;
		1057	trgLine[x] = srcLine[xSrc];
		1058	}
		1059	}
		1060	#endif // going over source or target
		1061
		1062	return;
		1063	}
		1064
		1065
2	pmbaty	1066	EXTERN_C bool xbrz_equalcolortest24 (uint32_t col1, uint32_t col2, double luminanceWeight, double equalColorTolerance)
		1067	{
3	pmbaty	1068	return (dist24 (col1, col2) < equalColorTolerance);
2	pmbaty	1069	}
		1070
		1071
		1072	EXTERN_C bool xbrz_equalcolortest32 (uint32_t col1, uint32_t col2, double luminanceWeight, double equalColorTolerance)
		1073	{
3	pmbaty	1074	return (dist32 (col1, col2) < equalColorTolerance);
2	pmbaty	1075	}
		1076
		1077
		1078	EXTERN_C void xbrz_scale24 (size_t factor, const uint32_t src, uint32_t trg, int srcWidth, int srcHeight)
		1079	{
3	pmbaty	1080	if (factor == 2) return scaleImage<Scaler2x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24);
		1081	else if (factor == 3) return scaleImage<Scaler3x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24);
		1082	else if (factor == 4) return scaleImage<Scaler4x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24);
		1083	else if (factor == 5) return scaleImage<Scaler5x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24);
		1084	else if (factor == 6) return scaleImage<Scaler6x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24);
2	pmbaty	1085	}
		1086
		1087
		1088	EXTERN_C void xbrz_scale32 (size_t factor, const uint32_t src, uint32_t trg, int srcWidth, int srcHeight)
		1089	{
3	pmbaty	1090	if (factor == 2) return scaleImage<Scaler2x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32);
		1091	else if (factor == 3) return scaleImage<Scaler3x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32);
		1092	else if (factor == 4) return scaleImage<Scaler4x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32);
		1093	else if (factor == 5) return scaleImage<Scaler5x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32);
		1094	else if (factor == 6) return scaleImage<Scaler6x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32);
2	pmbaty	1095	}

Subversion Repositories Games.Prince of Persia

Games.Prince of Persia/xbrz.c – Rev 3