Subversion Repositories Games.Prince of Persia

Rev

Rev 2 | Rev 4 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | Download | RSS feed

  1. // ****************************************************************************
  2. // * This file is part of the HqMAME project. It is distributed under         *
  3. // * GNU General Public License: https://www.gnu.org/licenses/gpl-3.0         *
  4. // * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved          *
  5. // *                                                                          *
  6. // * Additionally and as a special exception, the author gives permission     *
  7. // * to link the code of this program with the MAME library (or with modified *
  8. // * versions of MAME that use the same license as MAME), and distribute      *
  9. // * linked combinations including the two. You must obey the GNU General     *
  10. // * Public License in all respects for all of the code used other than MAME. *
  11. // * If you modify this file, you may extend this exception to your version   *
  12. // * of the file, but you are not obligated to do so. If you do not wish to   *
  13. // * do so, delete this exception statement from your version.                *
  14. // ****************************************************************************
  15.  
  16. // -------------------------------------------------------------------------
  17. // | xBRZ: "Scale by rules" - high quality image upscaling filter by Zenju |
  18. // -------------------------------------------------------------------------
  19. // using a modified approach of xBR:
  20. // http://board.byuu.org/viewtopic.php?f=10&t=2248
  21. //  - new rule set preserving small image features
  22. //  - highly optimized for performance
  23. //  - support alpha channel
  24. //  - support multithreading
  25. //  - support 64-bit architectures
  26. //  - support processing image slices
  27. //  - support scaling up to 6xBRZ
  28.  
  29. // -> map source (srcWidth * srcHeight) to target (scale * width x scale * height) image, optionally processing a half-open slice of rows [yFirst, yLast) only
  30. // -> support for source/target pitch in bytes!
  31. // -> if your emulator changes only a few image slices during each cycle (e.g. DOSBox) then there's no need to run xBRZ on the complete image:
  32. //    Just make sure you enlarge the source image slice by 2 rows on top and 2 on bottom (this is the additional range the xBRZ algorithm is using during analysis)
  33. //    CAVEAT: If there are multiple changed slices, make sure they do not overlap after adding these additional rows in order to avoid a memory race condition
  34. //    in the target image data if you are using multiple threads for processing each enlarged slice!
  35. //
  36. // THREAD-SAFETY: - parts of the same image may be scaled by multiple threads as long as the [yFirst, yLast) ranges do not overlap!
  37. //                - there is a minor inefficiency for the first row of a slice, so avoid processing single rows only; suggestion: process at least 8-16 rows
  38.  
  39.  
  40. #include <stddef.h> // for size_t
  41. #include <stdint.h> // for uint32_t
  42. #include <memory.h> // for memset()
  43. #include <limits.h>
  44. #include <math.h>
  45.  
  46.  
  // EXTERN_C expands to `extern "C"` when this file is compiled as C++ and to nothing otherwise.
  #if defined(__cplusplus)
  #  define EXTERN_C extern "C"
  #else
  #  define EXTERN_C
  #endif
  52.  
  53.  
  // Scaler configuration.
  //   XBRZ_CFG_EQUAL_COLOR_TOLERANCE        - dist() results are compared against this value in blendPixel()
  //   XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD - gradient ratio used by preProcessCorners() to flag BLEND_DOMINANT
  //   XBRZ_CFG_STEEP_DIRECTION_THRESHOLD    - gradient ratio used by blendPixel() to detect steep/shallow lines
  //   XBRZ_CFG_LUMINANCE_WEIGHT             - not referenced in this part of the file
  #define XBRZ_CFG_LUMINANCE_WEIGHT             1
  #define XBRZ_CFG_EQUAL_COLOR_TOLERANCE        30
  #define XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD 3.6
  #define XBRZ_CFG_STEEP_DIRECTION_THRESHOLD    2.2


  // Slice type identifiers.
  #define XBRZ_SLICETYPE_SOURCE 1
  #define XBRZ_SLICETYPE_TARGET 2
  64.  
  65.  
  // handy macros
  //
  // GET_BYTE(val, byteno): byte number `byteno` of `val` (0 = least significant byte).
  // GET_BLUE / GET_GREEN / GET_RED / GET_ALPHA: bytes 0, 1, 2 and 3 of a 32-bit pixel value.
  #define GET_BYTE(val,byteno) ((unsigned char) (((val) >> ((byteno) << 3)) & 0xff))
  #define GET_BLUE(val)  GET_BYTE (val, 0)
  #define GET_GREEN(val) GET_BYTE (val, 1)
  #define GET_RED(val)   GET_BYTE (val, 2)
  #define GET_ALPHA(val) GET_BYTE (val, 3)
  // CALC_COLOR24: one 8-bit channel mixed as (front * M + back * (N - M)) / N.
  // The whole expansion is parenthesized, like CALC_COLOR32, so it is safe inside larger expressions.
  #define CALC_COLOR24(colFront,colBack,M,N) ((unsigned char) ((((unsigned char) (colFront)) * ((unsigned int) (M)) + ((unsigned char) (colBack)) * (((unsigned int) (N)) - ((unsigned int) (M)))) / ((unsigned int) (N))))
  // CALC_COLOR32: one 8-bit channel mixed as (front * weightFront + back * weightBack) / weightSum.
  #define CALC_COLOR32(colFront,colBack,weightFront,weightBack,weightSum) ((unsigned char) ((((unsigned char) (colFront)) * ((unsigned int) (weightFront)) + ((unsigned char) (colBack)) * ((unsigned int) (weightBack))) / ((unsigned int) (weightSum))))
  // BYTE_ADVANCE: `buffer` viewed as a char pointer and moved forward by `offset` bytes.
  // `buffer` is parenthesized so that an expression argument (e.g. "p + 1") is evaluated
  // with its own pointer type BEFORE the cast to char* is applied.
  #define BYTE_ADVANCE(buffer,offset) (((char *) (buffer)) + (offset))
  // NOTE: MIN/MAX evaluate their arguments more than once - do not pass expressions with side effects.
  #ifndef MIN
  #define MIN(a,b) ((a) < (b) ? (a) : (b))
  #endif // MIN
  #ifndef MAX
  #define MAX(a,b) ((a) > (b) ? (a) : (b))
  #endif // MAX
  81.  
  82.  
  // Callback types supplied by the caller of the scaler.
  //   alphagrad_func - receives the target pixel (by pointer), a front color and two integers M, N;
  //                    presumably blends pixFront over *pixBack with opacity M / N (implementation not shown here).
  //   dist_func      - returns a color distance for two pixels as a double.
  typedef void   alphagrad_func (uint32_t *pixBack, uint32_t pixFront, unsigned int M, unsigned int N);
  typedef double dist_func      (uint32_t pix1, uint32_t pix2);
  85.  
  86.  
  87.  
  88.  
  89. namespace
  90. {
  // FORCE_INLINE: the compiler-specific "always inline" spelling, falling back to plain `inline`.
  #if defined(_MSC_VER)
      #define FORCE_INLINE __forceinline
  #elif defined(__GNUC__)
      #define FORCE_INLINE __attribute__((always_inline)) inline
  #else
      #define FORCE_INLINE inline
  #endif
  98.  
  99.  
  // Clockwise rotation in quarter turns; the numeric value is the number of 90-degree steps.
  enum RotationDegree
  {
      ROT_0   = 0,
      ROT_90  = 1,
      ROT_180 = 2,
      ROT_270 = 3
  };
  107.  
  108.  
  109. //calculate input matrix coordinates after rotation at compile time
  110. template <RotationDegree rotDeg, size_t I, size_t J, size_t N> struct MatrixRotation;
  111.  
  112.  
  113. template <size_t I, size_t J, size_t N> struct MatrixRotation<ROT_0, I, J, N>
  114. {
  115.     static const size_t I_old = I;
  116.     static const size_t J_old = J;
  117. };
  118.  
  119.  
  120. template <RotationDegree rotDeg, size_t I, size_t J, size_t N> //(i, j) = (row, col) indices, N = size of (square) matrix
  121. struct MatrixRotation
  122. {
  123.     static const size_t I_old = N - 1 - MatrixRotation<(RotationDegree)(rotDeg - 1), I, J, N>::J_old; //old coordinates before rotation!
  124.     static const size_t J_old =         MatrixRotation<(RotationDegree)(rotDeg - 1), I, J, N>::I_old; //
  125. };
  126.  
  127.  
  128. template <size_t N, RotationDegree rotDeg> class OutputMatrix
  129. {
  130. public:
  131.     OutputMatrix (uint32_t *out, int outWidth) //access matrix area, top-left at position "out" for image with given width
  132.     {
  133.         out_ = out;
  134.         outWidth_ = outWidth;
  135.     }
  136.  
  137.     template <size_t I, size_t J> uint32_t &ref() const
  138.     {
  139.         static const size_t I_old = MatrixRotation<rotDeg, I, J, N>::I_old;
  140.         static const size_t J_old = MatrixRotation<rotDeg, I, J, N>::J_old;
  141.  
  142.         return *(out_ + J_old + I_old * outWidth_);
  143.     }
  144.  
  145.     uint32_t* out_;
  146.     int outWidth_;
  147. };
  148.  
  149.  
  150.  
  151.  
  // Strength of the blend indication for one pixel corner.
  // Attention: every value must fit into 2 bits, because four of them are packed into a single byte.
  enum BlendType
  {
      BLEND_NONE     = 0,
      BLEND_NORMAL   = 1, //a normal indication to blend
      BLEND_DOMINANT = 2  //a strong indication to blend
  };
  159.  
  160. struct BlendResult
  161. {
  162.     BlendType
  163.     /**/blend_f, blend_g,
  164.     /**/blend_j, blend_k;
  165. };
  166.  
  167.  
  // 4x4 pixel neighbourhood used by the preprocessing step; members are laid out row by row (a..d, e..h, i..l, m..p).
  struct Kernel_4x4
  {
      uint32_t a, b, c, d; //row 0
      uint32_t e, f, g, h; //row 1
      uint32_t i, j, k, l; //row 2
      uint32_t m, n, o, p; //row 3
  };
  176.  
  177. /*
  178. input kernel area naming convention:
  179. -----------------
  180. | A | B | C | D |
  181. ----|---|---|---|
  182. | E | F | G | H |   //evaluate the four corners between F, G, J, K
  183. ----|---|---|---|   //input pixel is at position F
  184. | I | J | K | L |
  185. ----|---|---|---|
  186. | M | N | O | P |
  187. -----------------
  188. */
  189. FORCE_INLINE //detect blend direction
  190. BlendResult preProcessCorners(const Kernel_4x4& ker, dist_func dist) //result: F, G, J, K corners of "GradientType"
  191. {
  192.     BlendResult result = {};
  193.  
  194.     if ((ker.f == ker.g &&
  195.          ker.j == ker.k) ||
  196.         (ker.f == ker.j &&
  197.          ker.g == ker.k))
  198.         return result;
  199.  
  200.     const int weight = 4;
  201.     double jg = dist (ker.i, ker.f) + dist (ker.f, ker.c) + dist (ker.n, ker.k) + dist (ker.k, ker.h) + weight * dist (ker.j, ker.g);
  202.     double fk = dist (ker.e, ker.j) + dist (ker.j, ker.o) + dist (ker.b, ker.g) + dist (ker.g, ker.l) + weight * dist (ker.f, ker.k);
  203.  
  204.     if (jg < fk) //test sample: 70% of values max(jg, fk) / min(jg, fk) are between 1.1 and 3.7 with median being 1.8
  205.     {
  206.         const bool dominantGradient = XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD * jg < fk;
  207.         if (ker.f != ker.g && ker.f != ker.j)
  208.             result.blend_f = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  209.  
  210.         if (ker.k != ker.j && ker.k != ker.g)
  211.             result.blend_k = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  212.     }
  213.     else if (fk < jg)
  214.     {
  215.         const bool dominantGradient = XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD * fk < jg;
  216.         if (ker.j != ker.f && ker.j != ker.k)
  217.             result.blend_j = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  218.  
  219.         if (ker.g != ker.f && ker.g != ker.k)
  220.             result.blend_g = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  221.     }
  222.     return result;
  223. }
  224.  
  // 3x3 pixel neighbourhood used by the blending step; members are laid out row by row (a..c, d..f, g..i).
  struct Kernel_3x3
  {
      uint32_t a, b, c; //row 0
      uint32_t d, e, f; //row 1
      uint32_t g, h, i; //row 2
  };
  232. /*
  233. #define DEF_GETTER(x) template <RotationDegree rotDeg> uint32_t inline get_##x(const Kernel_3x3& ker) { return ker.x; }
  234. //we cannot and NEED NOT write "ker.##x" since ## concatenates preprocessor tokens but "." is not a token
  235. DEF_GETTER(a) DEF_GETTER(b) DEF_GETTER(c)
  236. DEF_GETTER(d) DEF_GETTER(e) DEF_GETTER(f)
  237. DEF_GETTER(g) DEF_GETTER(h) DEF_GETTER(i)
  238. #undef DEF_GETTER
  239.  
  240. #define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_90>(const Kernel_3x3& ker) { return ker.y; }
  241. DEF_GETTER(a, g) DEF_GETTER(b, d) DEF_GETTER(c, a)
  242. DEF_GETTER(d, h) DEF_GETTER(e, e) DEF_GETTER(f, b)
  243. DEF_GETTER(g, i) DEF_GETTER(h, f) DEF_GETTER(i, c)
  244. #undef DEF_GETTER
  245.  
  246. #define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_180>(const Kernel_3x3& ker) { return ker.y; }
  247. DEF_GETTER(a, i) DEF_GETTER(b, h) DEF_GETTER(c, g)
  248. DEF_GETTER(d, f) DEF_GETTER(e, e) DEF_GETTER(f, d)
  249. DEF_GETTER(g, c) DEF_GETTER(h, b) DEF_GETTER(i, a)
  250. #undef DEF_GETTER
  251.  
  252. #define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_270>(const Kernel_3x3& ker) { return ker.y; }
  253. DEF_GETTER(a, c) DEF_GETTER(b, f) DEF_GETTER(c, i)
  254. DEF_GETTER(d, b) DEF_GETTER(e, e) DEF_GETTER(f, h)
  255. DEF_GETTER(g, a) DEF_GETTER(h, d) DEF_GETTER(i, g)
  256. #undef DEF_GETTER
  257. */
  258.  
  259. template <RotationDegree rotDeg> uint32_t inline get_a (const Kernel_3x3& ker) { return ker.a; }
  260. template <RotationDegree rotDeg> uint32_t inline get_b (const Kernel_3x3& ker) { return ker.b; }
  261. template <RotationDegree rotDeg> uint32_t inline get_c (const Kernel_3x3& ker) { return ker.c; }
  262. template <RotationDegree rotDeg> uint32_t inline get_d (const Kernel_3x3& ker) { return ker.d; }
  263. template <RotationDegree rotDeg> uint32_t inline get_e (const Kernel_3x3& ker) { return ker.e; }
  264. template <RotationDegree rotDeg> uint32_t inline get_f (const Kernel_3x3& ker) { return ker.f; }
  265. template <RotationDegree rotDeg> uint32_t inline get_g (const Kernel_3x3& ker) { return ker.g; }
  266. template <RotationDegree rotDeg> uint32_t inline get_h (const Kernel_3x3& ker) { return ker.h; }
  267. template <RotationDegree rotDeg> uint32_t inline get_i (const Kernel_3x3& ker) { return ker.i; }
  268.  
  269. template <> inline uint32_t get_a<ROT_90>(const Kernel_3x3& ker) { return ker.g; }
  270. template <> inline uint32_t get_b<ROT_90>(const Kernel_3x3& ker) { return ker.d; }
  271. template <> inline uint32_t get_c<ROT_90>(const Kernel_3x3& ker) { return ker.a; }
  272. template <> inline uint32_t get_d<ROT_90>(const Kernel_3x3& ker) { return ker.h; }
  273. template <> inline uint32_t get_e<ROT_90>(const Kernel_3x3& ker) { return ker.e; }
  274. template <> inline uint32_t get_f<ROT_90>(const Kernel_3x3& ker) { return ker.b; }
  275. template <> inline uint32_t get_g<ROT_90>(const Kernel_3x3& ker) { return ker.i; }
  276. template <> inline uint32_t get_h<ROT_90>(const Kernel_3x3& ker) { return ker.f; }
  277. template <> inline uint32_t get_i<ROT_90>(const Kernel_3x3& ker) { return ker.c; }
  278.  
  279. template <> inline uint32_t get_a<ROT_180>(const Kernel_3x3& ker) { return ker.i; }
  280. template <> inline uint32_t get_b<ROT_180>(const Kernel_3x3& ker) { return ker.h; }
  281. template <> inline uint32_t get_c<ROT_180>(const Kernel_3x3& ker) { return ker.g; }
  282. template <> inline uint32_t get_d<ROT_180>(const Kernel_3x3& ker) { return ker.f; }
  283. template <> inline uint32_t get_e<ROT_180>(const Kernel_3x3& ker) { return ker.e; }
  284. template <> inline uint32_t get_f<ROT_180>(const Kernel_3x3& ker) { return ker.d; }
  285. template <> inline uint32_t get_g<ROT_180>(const Kernel_3x3& ker) { return ker.c; }
  286. template <> inline uint32_t get_h<ROT_180>(const Kernel_3x3& ker) { return ker.b; }
  287. template <> inline uint32_t get_i<ROT_180>(const Kernel_3x3& ker) { return ker.a; }
  288.  
  289. template <> inline uint32_t get_a<ROT_270>(const Kernel_3x3& ker) { return ker.c; }
  290. template <> inline uint32_t get_b<ROT_270>(const Kernel_3x3& ker) { return ker.f; }
  291. template <> inline uint32_t get_c<ROT_270>(const Kernel_3x3& ker) { return ker.i; }
  292. template <> inline uint32_t get_d<ROT_270>(const Kernel_3x3& ker) { return ker.b; }
  293. template <> inline uint32_t get_e<ROT_270>(const Kernel_3x3& ker) { return ker.e; }
  294. template <> inline uint32_t get_f<ROT_270>(const Kernel_3x3& ker) { return ker.h; }
  295. template <> inline uint32_t get_g<ROT_270>(const Kernel_3x3& ker) { return ker.a; }
  296. template <> inline uint32_t get_h<ROT_270>(const Kernel_3x3& ker) { return ker.d; }
  297. template <> inline uint32_t get_i<ROT_270>(const Kernel_3x3& ker) { return ker.g; }
  298.  
  299. //compress four blend types into a single byte
  300. inline BlendType getTopL   (unsigned char b) { return (BlendType)(0x3 & b); }
  301. inline BlendType getTopR   (unsigned char b) { return (BlendType)(0x3 & (b >> 2)); }
  302. inline BlendType getBottomR(unsigned char b) { return (BlendType)(0x3 & (b >> 4)); }
  303. inline BlendType getBottomL(unsigned char b) { return (BlendType)(0x3 & (b >> 6)); }
  304.  
  305. inline void setTopL   (unsigned char& b, BlendType bt) { b |= bt; } //buffer is assumed to be initialized before preprocessing!
  306. inline void setTopR   (unsigned char& b, BlendType bt) { b |= (bt << 2); }
  307. inline void setBottomR(unsigned char& b, BlendType bt) { b |= (bt << 4); }
  308. inline void setBottomL(unsigned char& b, BlendType bt) { b |= (bt << 6); }
  309.  
  310. template <RotationDegree rotDeg> inline
  311. unsigned char rotateBlendInfo (unsigned char b) { return b; }
  312. template <> inline unsigned char rotateBlendInfo<ROT_90 >(unsigned char b) { return ((b << 2) | (b >> 6)) & 0xff; }
  313. template <> inline unsigned char rotateBlendInfo<ROT_180>(unsigned char b) { return ((b << 4) | (b >> 4)) & 0xff; }
  314. template <> inline unsigned char rotateBlendInfo<ROT_270>(unsigned char b) { return ((b << 6) | (b >> 2)) & 0xff; }
  315.  
  316.  
  317. /*
  318. input kernel area naming convention:
  319. -------------
  320. | A | B | C |
  321. ----|---|---|
  322. | D | E | F | //input pixel is at position E
  323. ----|---|---|
  324. | G | H | I |
  325. -------------
  326. */
  327. template <class Scaler, RotationDegree rotDeg>
  328. FORCE_INLINE void blendPixel(const Kernel_3x3& ker, uint32_t *target, int trgWidth, unsigned char blendInfo, alphagrad_func alphagrad, dist_func dist) //result of preprocessing all four corners of pixel "e"
  329. {
  330. #define a get_a<rotDeg>(ker)
  331. #define b get_b<rotDeg>(ker)
  332. #define c get_c<rotDeg>(ker)
  333. #define d get_d<rotDeg>(ker)
  334. #define e get_e<rotDeg>(ker)
  335. #define f get_f<rotDeg>(ker)
  336. #define g get_g<rotDeg>(ker)
  337. #define h get_h<rotDeg>(ker)
  338. #define i get_i<rotDeg>(ker)
  339.  
  340.     const unsigned char blend = rotateBlendInfo<rotDeg>(blendInfo);
  341.  
  342.     if (getBottomR(blend) >= BLEND_NORMAL)
  343.     {
  344.         bool doLineBlend;
  345.  
  346.         if (getBottomR(blend) >= BLEND_DOMINANT)
  347.             doLineBlend = true;
  348.         else if (getTopR(blend) != BLEND_NONE && (dist (e, g) >= XBRZ_CFG_EQUAL_COLOR_TOLERANCE)) //but support double-blending for 90° corners
  349.             doLineBlend = false; // make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
  350.         else if (getBottomL(blend) != BLEND_NONE && (dist (e, c) >= XBRZ_CFG_EQUAL_COLOR_TOLERANCE))
  351.             doLineBlend = false; // make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
  352.         else if ((dist (e, i) >= XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
  353.             && (dist (g, h) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
  354.             && (dist (h, i) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
  355.             && (dist (i, f) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
  356.             && (dist (f, c) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE))
  357.             doLineBlend = false; // no full blending for L-shapes; blend corner only (handles "mario mushroom eyes")
  358.                 else
  359.             doLineBlend = true;
  360.  
  361.         const uint32_t px = (dist (e, f) <= dist (e, h) ? f : h); //choose most similar color
  362.  
  363.         OutputMatrix<Scaler::scale, rotDeg> out(target, trgWidth);
  364.  
  365.         if (doLineBlend)
  366.         {
  367.             const double fg = dist (f, g); //test sample: 70% of values max(fg, hc) / min(fg, hc) are between 1.1 and 3.7 with median being 1.9
  368.             const double hc = dist (h, c); //
  369.  
  370.             const bool haveShallowLine = XBRZ_CFG_STEEP_DIRECTION_THRESHOLD * fg <= hc && e != g && d != g;
  371.             const bool haveSteepLine   = XBRZ_CFG_STEEP_DIRECTION_THRESHOLD * hc <= fg && e != c && b != c;
  372.  
  373.             if (haveShallowLine)
  374.             {
  375.                 if (haveSteepLine)
  376.                     Scaler::blendLineSteepAndShallow(px, out, alphagrad);
  377.                 else
  378.                     Scaler::blendLineShallow(px, out, alphagrad);
  379.             }
  380.             else
  381.             {
  382.                 if (haveSteepLine)
  383.                     Scaler::blendLineSteep(px, out, alphagrad);
  384.                 else
  385.                     Scaler::blendLineDiagonal(px, out, alphagrad);
  386.             }
  387.         }
  388.         else
  389.             Scaler::blendCorner(px, out, alphagrad);
  390.     }
  391.  
  392. #undef a
  393. #undef b
  394. #undef c
  395. #undef d
  396. #undef e
  397. #undef f
  398. #undef g
  399. #undef h
  400. #undef i
  401. }
  402.  
  403.  
  404. template <class Scaler> //scaler policy: see "Scaler2x" reference implementation
  405. void scaleImage(const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight, int yFirst, int yLast, alphagrad_func alphagrad, dist_func dist)
  406. {
  407.     yFirst = MAX (yFirst, 0);
  408.     yLast  = MIN (yLast, srcHeight);
  409.     if (yFirst >= yLast || srcWidth <= 0)
  410.         return;
  411.  
  412.     const int trgWidth = srcWidth * Scaler::scale;
  413.  
  414.     //"use" space at the end of the image as temporary buffer for "on the fly preprocessing": we even could use larger area of
  415.     //"sizeof(uint32_t) * srcWidth * (yLast - yFirst)" bytes without risk of accidental overwriting before accessing
  416.     const int bufferSize = srcWidth;
  417.     unsigned char* preProcBuffer = reinterpret_cast<unsigned char*>(trg + yLast * Scaler::scale * trgWidth) - bufferSize;
  418.     memset (preProcBuffer, 0, bufferSize);
  419.     static_assert(BLEND_NONE == 0, "");
  420.  
  421.     //initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending
  422.     //this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition!
  423.     if (yFirst > 0)
  424.     {
  425.         const int y = yFirst - 1;
  426.  
  427.         const uint32_t* s_m1 = src + srcWidth * MAX (y - 1, 0);
  428.         const uint32_t* s_0  = src + srcWidth * y; //center line
  429.         const uint32_t* s_p1 = src + srcWidth * MIN (y + 1, srcHeight - 1);
  430.         const uint32_t* s_p2 = src + srcWidth * MIN (y + 2, srcHeight - 1);
  431.  
  432.         for (int x = 0; x < srcWidth; ++x)
  433.         {
  434.             const int x_m1 = MAX (x - 1, 0);
  435.             const int x_p1 = MIN (x + 1, srcWidth - 1);
  436.             const int x_p2 = MIN (x + 2, srcWidth - 1);
  437.  
  438.             Kernel_4x4 ker = {}; //perf: initialization is negligible
  439.             ker.a = s_m1[x_m1]; //read sequentially from memory as far as possible
  440.             ker.b = s_m1[x];
  441.             ker.c = s_m1[x_p1];
  442.             ker.d = s_m1[x_p2];
  443.  
  444.             ker.e = s_0[x_m1];
  445.             ker.f = s_0[x];
  446.             ker.g = s_0[x_p1];
  447.             ker.h = s_0[x_p2];
  448.  
  449.             ker.i = s_p1[x_m1];
  450.             ker.j = s_p1[x];
  451.             ker.k = s_p1[x_p1];
  452.             ker.l = s_p1[x_p2];
  453.  
  454.             ker.m = s_p2[x_m1];
  455.             ker.n = s_p2[x];
  456.             ker.o = s_p2[x_p1];
  457.             ker.p = s_p2[x_p2];
  458.  
  459.             const BlendResult res = preProcessCorners (ker, dist);
  460.             /*
  461.             preprocessing blend result:
  462.             ---------
  463.             | F | G |   //evalute corner between F, G, J, K
  464.             ----|---|   //input pixel is at position F
  465.             | J | K |
  466.             ---------
  467.             */
  468.             setTopR(preProcBuffer[x], res.blend_j);
  469.  
  470.             if (x + 1 < bufferSize)
  471.                 setTopL(preProcBuffer[x + 1], res.blend_k);
  472.         }
  473.     }
  474.     //------------------------------------------------------------------------------------
  475.  
  476.     for (int y = yFirst; y < yLast; ++y)
  477.     {
  478.         uint32_t *out = trg + Scaler::scale * y * trgWidth; //consider MT "striped" access
  479.  
  480.         const uint32_t* s_m1 = src + srcWidth * MAX (y - 1, 0);
  481.         const uint32_t* s_0  = src + srcWidth * y; //center line
  482.         const uint32_t* s_p1 = src + srcWidth * MIN (y + 1, srcHeight - 1);
  483.         const uint32_t* s_p2 = src + srcWidth * MIN (y + 2, srcHeight - 1);
  484.  
  485.         unsigned char blend_xy1 = 0; //corner blending for current (x, y + 1) position
  486.  
  487.         for (int x = 0; x < srcWidth; ++x, out += Scaler::scale)
  488.         {
  489.             //all those bounds checks have only insignificant impact on performance!
  490.             const int x_m1 = MAX (x - 1, 0); //perf: prefer array indexing to additional pointers!
  491.             const int x_p1 = MIN (x + 1, srcWidth - 1);
  492.             const int x_p2 = MIN (x + 2, srcWidth - 1);
  493.  
  494.             Kernel_4x4 ker4 = {}; //perf: initialization is negligible
  495.  
  496.             ker4.a = s_m1[x_m1]; //read sequentially from memory as far as possible
  497.             ker4.b = s_m1[x];
  498.             ker4.c = s_m1[x_p1];
  499.             ker4.d = s_m1[x_p2];
  500.  
  501.             ker4.e = s_0[x_m1];
  502.             ker4.f = s_0[x];
  503.             ker4.g = s_0[x_p1];
  504.             ker4.h = s_0[x_p2];
  505.  
  506.             ker4.i = s_p1[x_m1];
  507.             ker4.j = s_p1[x];
  508.             ker4.k = s_p1[x_p1];
  509.             ker4.l = s_p1[x_p2];
  510.  
  511.             ker4.m = s_p2[x_m1];
  512.             ker4.n = s_p2[x];
  513.             ker4.o = s_p2[x_p1];
  514.             ker4.p = s_p2[x_p2];
  515.  
  516.             //evaluate the four corners on bottom-right of current pixel
  517.             unsigned char blend_xy = 0; //for current (x, y) position
  518.             {
  519.                 const BlendResult res = preProcessCorners (ker4, dist);
  520.                 /*
  521.                 preprocessing blend result:
  522.                 ---------
  523.                 | F | G |   //evalute corner between F, G, J, K
  524.                 ----|---|   //current input pixel is at position F
  525.                 | J | K |
  526.                 ---------
  527.                 */
  528.                 blend_xy = preProcBuffer[x];
  529.                 setBottomR(blend_xy, res.blend_f); //all four corners of (x, y) have been determined at this point due to processing sequence!
  530.  
  531.                 setTopR(blend_xy1, res.blend_j); //set 2nd known corner for (x, y + 1)
  532.                 preProcBuffer[x] = blend_xy1; //store on current buffer position for use on next row
  533.  
  534.                 blend_xy1 = 0;
  535.                 setTopL(blend_xy1, res.blend_k); //set 1st known corner for (x + 1, y + 1) and buffer for use on next column
  536.  
  537.                 if (x + 1 < bufferSize) //set 3rd known corner for (x + 1, y)
  538.                     setBottomL(preProcBuffer[x + 1], res.blend_g);
  539.             }
  540.  
  541.             //fill block of size scale * scale with the given color
  542.                         {
  543.                                 uint32_t *blk = out;
  544.                             for (int _blk_y = 0; _blk_y < Scaler::scale; ++_blk_y, blk = (uint32_t *) BYTE_ADVANCE (blk, trgWidth * sizeof (uint32_t)))
  545.                                 for (int _blk_x = 0; _blk_x < Scaler::scale; ++_blk_x)
  546.                                     blk[_blk_x] = ker4.f;
  547.                         }
  548.             //place *after* preprocessing step, to not overwrite the results while processing the the last pixel!
  549.  
  550.             //blend four corners of current pixel
  551.             if (blend_xy != 0) //good 5% perf-improvement
  552.             {
  553.                 Kernel_3x3 ker3 = {}; //perf: initialization is negligible
  554.  
  555.                 ker3.a = ker4.a;
  556.                 ker3.b = ker4.b;
  557.                 ker3.c = ker4.c;
  558.  
  559.                 ker3.d = ker4.e;
  560.                 ker3.e = ker4.f;
  561.                 ker3.f = ker4.g;
  562.  
  563.                 ker3.g = ker4.i;
  564.                 ker3.h = ker4.j;
  565.                 ker3.i = ker4.k;
  566.  
  567.                 blendPixel<Scaler, ROT_0  >(ker3, out, trgWidth, blend_xy, alphagrad, dist);
  568.                 blendPixel<Scaler, ROT_90 >(ker3, out, trgWidth, blend_xy, alphagrad, dist);
  569.                 blendPixel<Scaler, ROT_180>(ker3, out, trgWidth, blend_xy, alphagrad, dist);
  570.                 blendPixel<Scaler, ROT_270>(ker3, out, trgWidth, blend_xy, alphagrad, dist);
  571.             }
  572.         }
  573.     }
  574. }
  575.  
  576.  
  577. //------------------------------------------------------------------------------------
  578. struct Scaler2x
  579. {
  580.     static const int scale = 2;
  581.  
  582.  
  583.     template <class OutputMatrix>
  584.     static void blendLineShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  585.     {
  586.         alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
  587.         alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
  588.     }
  589.  
  590.     template <class OutputMatrix>
  591.     static void blendLineSteep(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  592.     {
  593.         alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
  594.         alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
  595.     }
  596.  
  597.     template <class OutputMatrix>
  598.     static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  599.     {
  600.         alphagrad (&(out.template ref<1, 0>()), col, 1, 4);
  601.         alphagrad (&(out.template ref<0, 1>()), col, 1, 4);
  602.         alphagrad (&(out.template ref<1, 1>()), col, 5, 6); //[!] fixes 7/8 used in xBR
  603.     }
  604.  
  605.     template <class OutputMatrix>
  606.     static void blendLineDiagonal(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  607.     {
  608.         alphagrad (&(out.template ref<1, 1>()), col, 1, 2);
  609.     }
  610.  
  611.     template <class OutputMatrix>
  612.     static void blendCorner(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  613.     {
  614.         //model a round corner
  615.         alphagrad (&(out.template ref<1, 1>()), col, 21, 100); //exact: 1 - pi/4 = 0.2146018366
  616.     }
  617. };
  618.  
  619.  
  620. struct Scaler3x
  621. {
  622.     static const int scale = 3;
  623.  
  624.  
  625.     template <class OutputMatrix>
  626.     static void blendLineShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  627.     {
  628.         alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
  629.         alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
  630.         alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
  631.         out.template ref<scale - 1, 2>() = col;
  632.     }
  633.  
  634.     template <class OutputMatrix>
  635.     static void blendLineSteep(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  636.     {
  637.         alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
  638.         alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
  639.         alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
  640.         out.template ref<2, scale - 1>() = col;
  641.     }
  642.  
  643.     template <class OutputMatrix>
  644.     static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  645.     {
  646.         alphagrad (&(out.template ref<2, 0>()), col, 1, 4);
  647.         alphagrad (&(out.template ref<0, 2>()), col, 1, 4);
  648.         alphagrad (&(out.template ref<2, 1>()), col, 3, 4);
  649.         alphagrad (&(out.template ref<1, 2>()), col, 3, 4);
  650.         out.template ref<2, 2>() = col;
  651.     }
  652.  
  653.     template <class OutputMatrix>
  654.     static void blendLineDiagonal(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  655.     {
  656.         alphagrad (&(out.template ref<1, 2>()), col, 1, 8); //conflict with other rotations for this odd scale
  657.         alphagrad (&(out.template ref<2, 1>()), col, 1, 8);
  658.         alphagrad (&(out.template ref<2, 2>()), col, 7, 8); //
  659.     }
  660.  
  661.     template <class OutputMatrix>
  662.     static void blendCorner(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  663.     {
  664.         //model a round corner
  665.         alphagrad (&(out.template ref<2, 2>()), col, 45, 100); //exact: 0.4545939598
  666.         //alphagrad (&(out.template ref<2, 1>()), col, 7, 256); //0.02826017254 -> negligible + avoid conflicts with other rotations for this odd scale
  667.         //alphagrad (&(out.template ref<1, 2>()), col, 7, 256); //0.02826017254
  668.     }
  669. };
  670.  
  671.  
  672. struct Scaler4x
  673. {
  674.     static const int scale = 4;
  675.  
  676.  
  677.     template <class OutputMatrix>
  678.     static void blendLineShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  679.     {
  680.         alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
  681.         alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
  682.         alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
  683.         alphagrad (&(out.template ref<scale - 2, 3>()), col, 3, 4);
  684.  
  685.         out.template ref<scale - 1, 2>() = col;
  686.         out.template ref<scale - 1, 3>() = col;
  687.     }
  688.  
  689.     template <class OutputMatrix>
  690.     static void blendLineSteep(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  691.     {
  692.         alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
  693.         alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
  694.         alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
  695.         alphagrad (&(out.template ref<3, scale - 2>()), col, 3, 4);
  696.  
  697.         out.template ref<2, scale - 1>() = col;
  698.         out.template ref<3, scale - 1>() = col;
  699.     }
  700.  
  701.     template <class OutputMatrix>
  702.     static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  703.     {
  704.         alphagrad (&(out.template ref<3, 1>()), col, 3, 4);
  705.         alphagrad (&(out.template ref<1, 3>()), col, 3, 4);
  706.         alphagrad (&(out.template ref<3, 0>()), col, 1, 4);
  707.         alphagrad (&(out.template ref<0, 3>()), col, 1, 4);
  708.         alphagrad (&(out.template ref<2, 2>()), col, 1, 3); //[!] fixes 1/4 used in xBR
  709.  
  710.         out.template ref<3, 3>() = col;
  711.         out.template ref<3, 2>() = col;
  712.         out.template ref<2, 3>() = col;
  713.     }
  714.  
  715.     template <class OutputMatrix>
  716.     static void blendLineDiagonal(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  717.     {
  718.         alphagrad (&(out.template ref<scale - 1, scale / 2    >()), col, 1, 2);
  719.         alphagrad (&(out.template ref<scale - 2, scale / 2 + 1>()), col, 1, 2);
  720.  
  721.         out.template ref<scale - 1, scale - 1>() = col;
  722.     }
  723.  
  724.     template <class OutputMatrix>
  725.     static void blendCorner(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  726.     {
  727.         //model a round corner
  728.         alphagrad (&(out.template ref<3, 3>()), col, 68, 100); //exact: 0.6848532563
  729.         alphagrad (&(out.template ref<3, 2>()), col,  9, 100); //0.08677704501
  730.         alphagrad (&(out.template ref<2, 3>()), col,  9, 100); //0.08677704501
  731.     }
  732. };
  733.  
  734.  
  735. struct Scaler5x
  736. {
  737.     static const int scale = 5;
  738.  
  739.  
  740.     template <class OutputMatrix>
  741.     static void blendLineShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  742.     {
  743.         alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
  744.         alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
  745.         alphagrad (&(out.template ref<scale - 3, 4>()), col, 1, 4);
  746.         alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
  747.         alphagrad (&(out.template ref<scale - 2, 3>()), col, 3, 4);
  748.  
  749.         out.template ref<scale - 1, 2>() = col;
  750.         out.template ref<scale - 1, 3>() = col;
  751.         out.template ref<scale - 1, 4>() = col;
  752.         out.template ref<scale - 2, 4>() = col;
  753.     }
  754.  
  755.     template <class OutputMatrix>
  756.     static void blendLineSteep(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  757.     {
  758.         alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
  759.         alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
  760.         alphagrad (&(out.template ref<4, scale - 3>()), col, 1, 4);
  761.         alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
  762.         alphagrad (&(out.template ref<3, scale - 2>()), col, 3, 4);
  763.  
  764.         out.template ref<2, scale - 1>() = col;
  765.         out.template ref<3, scale - 1>() = col;
  766.         out.template ref<4, scale - 1>() = col;
  767.         out.template ref<4, scale - 2>() = col;
  768.     }
  769.  
  770.     template <class OutputMatrix>
  771.     static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  772.     {
  773.         alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
  774.         alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
  775.         alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
  776.         alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
  777.         alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
  778.         alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
  779.         alphagrad (&(out.template ref<3, 3>()), col, 2, 3);
  780.  
  781.         out.template ref<2, scale - 1>() = col;
  782.         out.template ref<3, scale - 1>() = col;
  783.         out.template ref<4, scale - 1>() = col;
  784.         out.template ref<scale - 1, 2>() = col;
  785.         out.template ref<scale - 1, 3>() = col;
  786.     }
  787.  
  788.     template <class OutputMatrix>
  789.     static void blendLineDiagonal(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  790.     {
  791.         alphagrad (&(out.template ref<scale - 1, scale / 2    >()), col, 1, 8); //conflict with other rotations for this odd scale
  792.         alphagrad (&(out.template ref<scale - 2, scale / 2 + 1>()), col, 1, 8);
  793.         alphagrad (&(out.template ref<scale - 3, scale / 2 + 2>()), col, 1, 8); //
  794.         alphagrad (&(out.template ref<4, 3>()), col, 7, 8);
  795.         alphagrad (&(out.template ref<3, 4>()), col, 7, 8);
  796.  
  797.         out.template ref<4, 4>() = col;
  798.     }
  799.  
  800.     template <class OutputMatrix>
  801.     static void blendCorner(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  802.     {
  803.         // model a round corner
  804.         alphagrad (&(out.template ref<4, 4>()), col, 86, 100); //exact: 0.8631434088
  805.         alphagrad (&(out.template ref<4, 3>()), col, 23, 100); //0.2306749731
  806.         alphagrad (&(out.template ref<3, 4>()), col, 23, 100); //0.2306749731
  807.         //alphaGrad<1, 64>(out.template ref<4, 2>(), col); //0.01676812367 -> negligible + avoid conflicts with other rotations for this odd scale
  808.         //alphaGrad<1, 64>(out.template ref<2, 4>(), col); //0.01676812367
  809.     }
  810. };
  811.  
  812.  
  813. struct Scaler6x
  814. {
  815.     static const int scale = 6;
  816.  
  817.  
  818.     template <class OutputMatrix>
  819.     static void blendLineShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  820.     {
  821.         alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
  822.         alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
  823.         alphagrad (&(out.template ref<scale - 3, 4>()), col, 1, 4);
  824.         alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
  825.         alphagrad (&(out.template ref<scale - 2, 3>()), col, 3, 4);
  826.         alphagrad (&(out.template ref<scale - 3, 5>()), col, 3, 4);
  827.  
  828.         out.template ref<scale - 1, 2>() = col;
  829.         out.template ref<scale - 1, 3>() = col;
  830.         out.template ref<scale - 1, 4>() = col;
  831.         out.template ref<scale - 1, 5>() = col;
  832.         out.template ref<scale - 2, 4>() = col;
  833.         out.template ref<scale - 2, 5>() = col;
  834.     }
  835.  
  836.     template <class OutputMatrix>
  837.     static void blendLineSteep(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  838.     {
  839.         alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
  840.         alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
  841.         alphagrad (&(out.template ref<4, scale - 3>()), col, 1, 4);
  842.         alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
  843.         alphagrad (&(out.template ref<3, scale - 2>()), col, 3, 4);
  844.         alphagrad (&(out.template ref<5, scale - 3>()), col, 3, 4);
  845.  
  846.         out.template ref<2, scale - 1>() = col;
  847.         out.template ref<3, scale - 1>() = col;
  848.         out.template ref<4, scale - 1>() = col;
  849.         out.template ref<5, scale - 1>() = col;
  850.         out.template ref<4, scale - 2>() = col;
  851.         out.template ref<5, scale - 2>() = col;
  852.     }
  853.  
  854.     template <class OutputMatrix>
  855.     static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  856.     {
  857.         alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
  858.         alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
  859.         alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
  860.         alphagrad (&(out.template ref<3, scale - 2>()), col, 3, 4);
  861.         alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
  862.         alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
  863.         alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
  864.         alphagrad (&(out.template ref<scale - 2, 3>()), col, 3, 4);
  865.  
  866.         out.template ref<2, scale - 1>() = col;
  867.         out.template ref<3, scale - 1>() = col;
  868.         out.template ref<4, scale - 1>() = col;
  869.         out.template ref<5, scale - 1>() = col;
  870.         out.template ref<4, scale - 2>() = col;
  871.         out.template ref<5, scale - 2>() = col;
  872.         out.template ref<scale - 1, 2>() = col;
  873.         out.template ref<scale - 1, 3>() = col;
  874.     }
  875.  
  876.     template <class OutputMatrix>
  877.     static void blendLineDiagonal(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  878.     {
  879.         alphagrad (&(out.template ref<scale - 1, scale / 2    >()), col, 1, 2);
  880.         alphagrad (&(out.template ref<scale - 2, scale / 2 + 1>()), col, 1, 2);
  881.         alphagrad (&(out.template ref<scale - 3, scale / 2 + 2>()), col, 1, 2);
  882.  
  883.         out.template ref<scale - 2, scale - 1>() = col;
  884.         out.template ref<scale - 1, scale - 1>() = col;
  885.         out.template ref<scale - 1, scale - 2>() = col;
  886.     }
  887.  
  888.     template <class OutputMatrix>
  889.     static void blendCorner(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
  890.     {
  891.         //model a round corner
  892.         alphagrad (&(out.template ref<5, 5>()), col, 97, 100); //exact: 0.9711013910
  893.         alphagrad (&(out.template ref<4, 5>()), col, 42, 100); //0.4236372243
  894.         alphagrad (&(out.template ref<5, 4>()), col, 42, 100); //0.4236372243
  895.         alphagrad (&(out.template ref<5, 3>()), col,  6, 100); //0.05652034508
  896.         alphagrad (&(out.template ref<3, 5>()), col,  6, 100); //0.05652034508
  897.     }
  898. };
  899.  
  900.         //------------------------------------------------------------------------------------
  901. }
  902.  
  903.  
  904.  
  905. static double dist24 (uint32_t pix1, uint32_t pix2)
  906. {
  907.     //30% perf boost compared to plain distYCbCr()!
  908.     //consumes 64 MB memory; using double is only 2% faster, but takes 128 MB
  909.     static float diffToDist[256 * 256 * 256];
  910.     static bool is_initialized = false;
  911.     if (!is_initialized)
  912.     {
  913.         for (uint32_t i = 0; i < 256 * 256 * 256; ++i) //startup time: 114 ms on Intel Core i5 (four cores)
  914.         {
  915.             const int r_diff = GET_RED (i) * 2 - 0xFF;
  916.             const int g_diff = GET_GREEN (i) * 2 - 0xFF;
  917.             const int b_diff = GET_BLUE (i) * 2 - 0xFF;
  918.  
  919.             const double k_b = 0.0593; //ITU-R BT.2020 conversion
  920.             const double k_r = 0.2627; //
  921.             const double k_g = 1 - k_b - k_r;
  922.  
  923.             const double scale_b = 0.5 / (1 - k_b);
  924.             const double scale_r = 0.5 / (1 - k_r);
  925.  
  926.             const double y   = k_r * r_diff + k_g * g_diff + k_b * b_diff; //[!], analog YCbCr!
  927.             const double c_b = scale_b * (b_diff - y);
  928.             const double c_r = scale_r * (r_diff - y);
  929.  
  930.             diffToDist[i] = (float) (sqrt ((y * y) + (c_b * c_b) + (c_r * c_r)));
  931.         }
  932.         is_initialized = true;
  933.     }
  934.  
  935.     const int r_diff = (int) GET_RED   (pix1) - (int) GET_RED   (pix2);
  936.     const int g_diff = (int) GET_GREEN (pix1) - (int) GET_GREEN (pix2);
  937.     const int b_diff = (int) GET_BLUE  (pix1) - (int) GET_BLUE  (pix2);
  938.  
  939.     return diffToDist[(((r_diff + 0xFF) / 2) << 16) | //slightly reduce precision (division by 2) to squeeze value into single byte
  940.                       (((g_diff + 0xFF) / 2) <<  8) |
  941.                       (((b_diff + 0xFF) / 2) <<  0)];
  942. }
  943.  
  944.  
  945. static double dist32 (uint32_t pix1, uint32_t pix2)
  946. {
  947.     const double a1 = GET_ALPHA (pix1) / 255.0 ;
  948.     const double a2 = GET_ALPHA (pix2) / 255.0 ;
  949.     /*
  950.     Requirements for a color distance handling alpha channel: with a1, a2 in [0, 1]
  951.  
  952.         1. if a1 = a2, distance should be: a1 * distYCbCr()
  953.         2. if a1 = 0,  distance should be: a2 * distYCbCr(black, white) = a2 * 255
  954.         3. if a1 = 1,  ??? maybe: 255 * (1 - a2) + a2 * distYCbCr()
  955.     */
  956.  
  957.     //return MIN (a1, a2) * distYCbCrBuffered(pix1, pix2) + 255 * abs(a1 - a2);
  958.     //=> following code is 15% faster:
  959.     const double d = dist24 (pix1, pix2);
  960.     return (a1 < a2 ? a1 * d + 255 * (a2 - a1) : a2 * d + 255 * (a1 - a2));
  961. }
  962.  
  963.  
  964. static void alphagrad24 (uint32_t *pixBack, uint32_t pixFront, unsigned int M, unsigned int N)
  965. {
  966.         // blend front color with opacity M / N over opaque background: http://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending
  967.         *pixBack = (  (CALC_COLOR24 (GET_RED   (pixFront), GET_RED   (*pixBack), M, N) << 16)
  968.                                 | (CALC_COLOR24 (GET_GREEN (pixFront), GET_GREEN (*pixBack), M, N) <<  8)
  969.                                 | (CALC_COLOR24 (GET_BLUE  (pixFront), GET_BLUE  (*pixBack), M, N) <<  0));
  970. }
  971.  
  972.  
  973. static void alphagrad32 (uint32_t *pixBack, uint32_t pixFront, unsigned int M, unsigned int N)
  974. {
  975.         // find intermediate color between two colors with alpha channels (=> NO alpha blending!!!)
  976.     const unsigned int weightFront = GET_ALPHA (pixFront) * M;
  977.     const unsigned int weightBack  = GET_ALPHA (*pixBack) * (N - M);
  978.     const unsigned int weightSum   = weightFront + weightBack;
  979.     *pixBack = (weightSum == 0 ? 0 :
  980.                                 (((unsigned char) (weightSum / N))                                                               << 24)
  981.                                 | (CALC_COLOR32 (GET_RED   (pixFront), GET_RED   (*pixBack), weightFront, weightBack, weightSum) << 16)
  982.                                 | (CALC_COLOR32 (GET_GREEN (pixFront), GET_GREEN (*pixBack), weightFront, weightBack, weightSum) <<  8)
  983.                                 | (CALC_COLOR32 (GET_BLUE  (pixFront), GET_BLUE  (*pixBack), weightFront, weightBack, weightSum) <<  0));
  984. }
  985.  
  986.  
  987. EXTERN_C void nearestNeighborScale(const uint32_t *src, int srcWidth, int srcHeight, uint32_t *trg, int trgWidth, int trgHeight)
  988. {
  989. //    nearestNeighborScale (src, srcWidth, srcHeight, srcWidth * sizeof (uint32_t), trg, trgWidth, trgHeight, trgWidth * sizeof (uint32_t), XBRZ_SLICETYPE_TARGET, 0, trgHeight, [](uint32_t pix) { return pix; });
  990.     //static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*");
  991.     //static_assert(std::is_integral<PixTrg>::value, "PixTrg* is expected to be cast-able to char*");
  992.     //static_assert(std::is_same<decltype(pixCvrt(PixSrc())), PixTrg>::value, "PixConverter returning wrong pixel format");
  993.  
  994.     int srcPitch = srcWidth * sizeof (uint32_t);
  995.     int trgPitch = trgWidth * sizeof (uint32_t);
  996.     int yFirst;
  997.     int yLast;
  998.  
  999. #if 0 // going over source image - fast for upscaling, since source is read only once
  1000.     yFirst = 0;
  1001.     yLast  = MIN (trgHeight, srcHeight);
  1002.  
  1003.     if (yFirst >= yLast || trgWidth <= 0 || trgHeight <= 0)
  1004.         return; // consistency check
  1005.  
  1006.     for (int y = yFirst; y < yLast; ++y)
  1007.     {
  1008.         //mathematically: ySrc = floor(srcHeight * yTrg / trgHeight)
  1009.         // => search for integers in: [ySrc, ySrc + 1) * trgHeight / srcHeight
  1010.  
  1011.         //keep within for loop to support MT input slices!
  1012.         const int yTrg_first = ( y      * trgHeight + srcHeight - 1) / srcHeight; //=ceil(y * trgHeight / srcHeight)
  1013.         const int yTrg_last  = ((y + 1) * trgHeight + srcHeight - 1) / srcHeight; //=ceil(((y + 1) * trgHeight) / srcHeight)
  1014.         const int blockHeight = yTrg_last - yTrg_first;
  1015.  
  1016.         if (blockHeight > 0)
  1017.         {
  1018.             const uint32_t *srcLine = (const uint32_t *) BYTE_ADVANCE (src, y * srcPitch);
  1019.             /**/  uint32_t *trgLine = (      uint32_t *) BYTE_ADVANCE (trg, yTrg_first * trgPitch);
  1020.             int xTrg_first = 0;
  1021.  
  1022.             for (int x = 0; x < srcWidth; ++x)
  1023.             {
  1024.                 const int xTrg_last = ((x + 1) * trgWidth + srcWidth - 1) / srcWidth;
  1025.                 const int blockWidth = xTrg_last - xTrg_first;
  1026.                 if (blockWidth > 0)
  1027.                 {
  1028.                     const uint32_t trgColor = srcLine[x];
  1029.                                         uint32_t *blkLine = trgLine;
  1030.  
  1031.                     xTrg_first = xTrg_last;
  1032.  
  1033.                                     for (int blk_y = 0; blk_y < blockHeight; ++blk_y, blkLine = (uint32_t *) BYTE_ADVANCE (blkLine, trgPitch))
  1034.                                         for (int blk_x = 0; blk_x < blockWidth; ++blk_x)
  1035.                                             blkLine[blk_x] = trgColor;
  1036.  
  1037.                     trgLine += blockWidth;
  1038.                 }
  1039.             }
  1040.         }
  1041.     }
  1042. #else // going over target image - slow for upscaling, since source is read multiple times missing out on cache! Fast for similar image sizes!
  1043.     yFirst = 0;
  1044.     yLast  = trgHeight;
  1045.  
  1046.     if (yFirst >= yLast || srcHeight <= 0 || srcWidth <= 0)
  1047.         return; // consistency check
  1048.  
  1049.     for (int y = yFirst; y < yLast; ++y)
  1050.     {
  1051.         /**/  uint32_t *trgLine = (      uint32_t *) BYTE_ADVANCE (trg, y * trgPitch);
  1052.         const int ySrc = srcHeight * y / trgHeight;
  1053.         const uint32_t *srcLine = (const uint32_t *) BYTE_ADVANCE (src, ySrc * srcPitch);
  1054.         for (int x = 0; x < trgWidth; ++x)
  1055.         {
  1056.             const int xSrc = srcWidth * x / trgWidth;
  1057.             trgLine[x] = srcLine[xSrc];
  1058.         }
  1059.     }
  1060. #endif // going over source or target
  1061.  
  1062.         return;
  1063. }
  1064.  
  1065.  
  1066. EXTERN_C bool xbrz_equalcolortest24 (uint32_t col1, uint32_t col2, double luminanceWeight, double equalColorTolerance)
  1067. {
  1068.         return (dist24 (col1, col2) < equalColorTolerance);
  1069. }
  1070.  
  1071.  
  1072. EXTERN_C bool xbrz_equalcolortest32 (uint32_t col1, uint32_t col2, double luminanceWeight, double equalColorTolerance)
  1073. {
  1074.         return (dist32 (col1, col2) < equalColorTolerance);
  1075. }
  1076.  
  1077.  
  1078. EXTERN_C void xbrz_scale24 (size_t factor, const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight)
  1079. {
  1080.     if      (factor == 2) return scaleImage<Scaler2x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24);
  1081.     else if (factor == 3) return scaleImage<Scaler3x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24);
  1082.     else if (factor == 4) return scaleImage<Scaler4x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24);
  1083.     else if (factor == 5) return scaleImage<Scaler5x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24);
  1084.     else if (factor == 6) return scaleImage<Scaler6x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24);
  1085. }
  1086.  
  1087.  
  1088. EXTERN_C void xbrz_scale32 (size_t factor, const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight)
  1089. {
  1090.     if      (factor == 2) return scaleImage<Scaler2x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32);
  1091.     else if (factor == 3) return scaleImage<Scaler3x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32);
  1092.     else if (factor == 4) return scaleImage<Scaler4x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32);
  1093.     else if (factor == 5) return scaleImage<Scaler5x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32);
  1094.     else if (factor == 6) return scaleImage<Scaler6x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32);
  1095. }
  1096.