Subversion Repositories Games.Prince of Persia

Rev

Rev 3 | Blame | Last modification | View Log | Download | RSS feed

  1. // ****************************************************************************
  2. // * This file is part of the HqMAME project. It is distributed under         *
  3. // * GNU General Public License: https://www.gnu.org/licenses/gpl-3.0         *
  4. // * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved          *
  5. // *                                                                          *
  6. // * Additionally and as a special exception, the author gives permission     *
  7. // * to link the code of this program with the MAME library (or with modified *
  8. // * versions of MAME that use the same license as MAME), and distribute      *
  9. // * linked combinations including the two. You must obey the GNU General     *
  10. // * Public License in all respects for all of the code used other than MAME. *
  11. // * If you modify this file, you may extend this exception to your version   *
  12. // * of the file, but you are not obligated to do so. If you do not wish to   *
  13. // * do so, delete this exception statement from your version.                *
  14. // ****************************************************************************
  15.  
  16. // -------------------------------------------------------------------------
  17. // | xBRZ: "Scale by rules" - high quality image upscaling filter by Zenju |
  18. // -------------------------------------------------------------------------
  19. // using a modified approach of xBR:
  20. // http://board.byuu.org/viewtopic.php?f=10&t=2248
  21. //  - new rule set preserving small image features
  22. //  - highly optimized for performance
  23. //  - support alpha channel
  24. //  - support multithreading
  25. //  - support 64-bit architectures
  26. //  - support processing image slices
  27. //  - support scaling up to 6xBRZ
  28.  
  29. // -> map source (srcWidth * srcHeight) to target (scale * width x scale * height) image, optionally processing a half-open slice of rows [yFirst, yLast) only
  30. // -> support for source/target pitch in bytes!
  31. // -> if your emulator changes only a few image slices during each cycle (e.g. DOSBox) then there's no need to run xBRZ on the complete image:
  32. //    Just make sure you enlarge the source image slice by 2 rows on top and 2 on bottom (this is the additional range the xBRZ algorithm is using during analysis)
  33. //    CAVEAT: If there are multiple changed slices, make sure they do not overlap after adding these additional rows in order to avoid a memory race condition
  34. //    in the target image data if you are using multiple threads for processing each enlarged slice!
  35. //
  36. // THREAD-SAFETY: - parts of the same image may be scaled by multiple threads as long as the [yFirst, yLast) ranges do not overlap!
  37. //                - there is a minor inefficiency for the first row of a slice, so avoid processing single rows only; suggestion: process at least 8-16 rows
  38.  
  39.  
  40. #include <stddef.h> // for size_t
  41. #include <stdint.h> // for uint32_t
  42. #include <memory.h> // for memset()
  43. #include <limits.h>
  44. #include <math.h>
  45.  
  46.  
  // EXTERN_C: expands to extern "C" when compiled as C++, and to nothing when compiled as C.
  #if defined(__cplusplus)
  #define EXTERN_C extern "C"
  #else
  #define EXTERN_C
  #endif
  52.  
  53.  
  // FORCE_INLINE: strongest inlining request the compiler offers, falling back to plain "inline".
  #if defined(_MSC_VER)
  #define FORCE_INLINE __forceinline
  #elif defined(__GNUC__)
  #define FORCE_INLINE __attribute__((always_inline)) inline
  #else
  #define FORCE_INLINE inline
  #endif
  61.  
  62.  
  // Scaler tuning constants.
  //  - LUMINANCE_WEIGHT: not referenced in this part of the file; presumably consumed by the colour distance functions (TODO confirm).
  //  - EQUAL_COLOR_TOLERANCE: blendPixel() compares dist() results against this value when deciding between corner and line blending.
  //  - DOMINANT_DIRECTION_THRESHOLD: preProcessCorners() flags a gradient as dominant when threshold * smaller < larger.
  //  - STEEP_DIRECTION_THRESHOLD: blendPixel() uses it to detect shallow and steep lines.
  #define XBRZ_CFG_LUMINANCE_WEIGHT (1)
  #define XBRZ_CFG_EQUAL_COLOR_TOLERANCE (30)
  #define XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD (3.6)
  #define XBRZ_CFG_STEEP_DIRECTION_THRESHOLD (2.2)
  68.  
  69.  
  // Slice type selectors (not referenced in this part of the file; presumably tell the public
  // entry points whether a row range is given in source or in target coordinates - TODO confirm).
  #define XBRZ_SLICETYPE_SOURCE (1)
  #define XBRZ_SLICETYPE_TARGET (2)
  73.  
  74.  
  // Handy macros.
  //
  // GET_BYTE(val, byteno)  -> byte number "byteno" (0 = least significant) of an integer value.
  // GET_BLUE/GREEN/RED/ALPHA -> bytes 0, 1, 2 and 3 of a 32-bit pixel respectively.
  // CALC_COLOR24(front, back, M, N) -> (front * M + back * (N - M)) / N, on one 8-bit channel.
  // CALC_COLOR32(front, back, wFront, wBack, wSum) -> (front * wFront + back * wBack) / wSum, on one 8-bit channel.
  // BYTE_ADVANCE(buffer, offset) -> "buffer" viewed as char* and moved forward by "offset" bytes.
  // MIN / MAX -> classic two-operand selectors; arguments may be evaluated twice, so pass side-effect free expressions only.
  #define GET_BYTE(val,byteno) ((unsigned char) (((val) >> ((byteno) << 3)) & 0xff))
  #define GET_BLUE(val)  GET_BYTE (val, 0)
  #define GET_GREEN(val) GET_BYTE (val, 1)
  #define GET_RED(val)   GET_BYTE (val, 2)
  #define GET_ALPHA(val) GET_BYTE (val, 3)
  // the whole expansion is parenthesized so that the macro behaves like a single primary expression (same as CALC_COLOR32)
  #define CALC_COLOR24(colFront,colBack,M,N) ((unsigned char) ((((unsigned char) (colFront)) * ((unsigned int) (M)) + ((unsigned char) (colBack)) * (((unsigned int) (N)) - ((unsigned int) (M)))) / ((unsigned int) (N))))
  #define CALC_COLOR32(colFront,colBack,weightFront,weightBack,weightSum) ((unsigned char) ((((unsigned char) (colFront)) * ((unsigned int) (weightFront)) + ((unsigned char) (colBack)) * ((unsigned int) (weightBack))) / ((unsigned int) (weightSum))))
  // "buffer" must be parenthesized before the cast: otherwise BYTE_ADVANCE (p + 1, n) would cast p first and then add 1 *byte* instead of one element
  #define BYTE_ADVANCE(buffer,offset) (((char *) (buffer)) + (offset))
  #ifndef MIN
  #define MIN(a,b) ((a) < (b) ? (a) : (b))
  #endif // MIN
  #ifndef MAX
  #define MAX(a,b) ((a) > (b) ? (a) : (b))
  #endif // MAX
  90.  
  91.  
  // Function types handed through the scaler:
  //  - alphagrad_func receives a pointer to the background pixel, the foreground colour and a fraction given as M and N;
  //  - dist_func returns a distance between two pixels as a double.
  typedef void alphagrad_func (uint32_t *pixBack, uint32_t pixFront, unsigned int M, unsigned int N);
  typedef double dist_func (uint32_t pix1, uint32_t pix2);
  94.  
  95.  
  96.  
  // Clock-wise rotation applied to a kernel / output matrix, in steps of 90 degrees.
  enum RotationDegree
  {
     ROT_0   = 0,
     ROT_90  = 1,
     ROT_180 = 2,
     ROT_270 = 3
  };
  104.  
  105.  
  // Blend decision for one pixel corner.
  // Attention: the values must fit into 2 bits, because four of them are packed into a single byte.
  enum BlendType
  {
     BLEND_NONE     = 0, // no blending
     BLEND_NORMAL   = 1, // a normal indication to blend
     BLEND_DOMINANT = 2  // a strong indication to blend
  };
  113.  
  114.  
  115. typedef struct blendresult_s
  116. {
  117.    BlendType
  118.       /**/blend_f, blend_g,
  119.       /**/blend_j, blend_k;
  120. } blendresult_t;
  121.  
  122.  
  // 3x3 pixel neighbourhood, stored row by row (a b c / d e f / g h i); the input pixel sits at "e".
  typedef struct kernel_3x3_s
  {
     uint32_t a, b, c; // top row
     uint32_t d, e, f; // middle row
     uint32_t g, h, i; // bottom row
  } kernel_3x3_t;
  130.  
  131.  
  // 4x4 pixel neighbourhood used by the preprocessing step, stored row by row (a..d / e..h / i..l / m..p).
  typedef struct kernel_4x4_s
  {
     uint32_t a, b, c, d; // 1st row
     uint32_t e, f, g, h; // 2nd row
     uint32_t i, j, k, l; // 3rd row
     uint32_t m, n, o, p; // 4th row
  } kernel_4x4_t;
  140.  
  141.  
  142. typedef struct outmatrix_s
  143. {
  144.    size_t size;
  145.    uint32_t* ptr;
  146.    int stride;
  147.    RotationDegree rotDeg;
  148. } outmatrix_t;
  149.  
  150.  
  151. static void outmatrix_create (outmatrix_t *mat, size_t size, uint32_t *ptr, int stride, RotationDegree rotDeg) //access matrix area, top-left at position "out" for image with given width
  152. {
  153.    mat->size = size;
  154.    mat->ptr = ptr;
  155.    mat->stride = stride;
  156.    mat->rotDeg = rotDeg;
  157. }
  158.  
  159.  
  160. static uint32_t *outmatrix_ref (outmatrix_t *mat, size_t I, size_t J)
  161. {
  162.    size_t I_old;
  163.    size_t J_old;
  164.    // calculate input matrix coordinates after rotation: (i, j) = (row, col) indices, N = size of (square) matrix
  165.    if      (mat->rotDeg == ROT_270) { I_old = J;                 J_old = mat->size - 1 - I; }
  166.    else if (mat->rotDeg == ROT_180) { I_old = mat->size - 1 - I; J_old = mat->size - 1 - J; }
  167.    else if (mat->rotDeg == ROT_90)  { I_old = mat->size - 1 - J; J_old = I;                 }
  168.    else                             { I_old = I;                 J_old = J;                 }
  169.  
  170.    return (mat->ptr + I_old * mat->stride + J_old);
  171. }
  172.  
  173.  
  174. static FORCE_INLINE void preProcessCorners (blendresult_t *result, const kernel_4x4_t *ker, dist_func dist)
  175. {
  176.    // detect blend direction
  177.    // result: F, G, J, K corners of "GradientType"
  178.  
  179.    // input kernel area naming convention:
  180.    // -----------------
  181.    // | A | B | C | D |
  182.    // ----|---|---|---|
  183.    // | E | F | G | H |   //evaluate the four corners between F, G, J, K
  184.    // ----|---|---|---|   //input pixel is at position F
  185.    // | I | J | K | L |
  186.    // ----|---|---|---|
  187.    // | M | N | O | P |
  188.    // -----------------
  189.  
  190.    memset (result, 0, sizeof (blendresult_t));
  191.  
  192.    if (((ker->f == ker->g) && (ker->j == ker->k)) || ((ker->f == ker->j) && (ker->g == ker->k)))
  193.       return;
  194.  
  195.    const int weight = 4;
  196.    double jg = dist (ker->i, ker->f) + dist (ker->f, ker->c) + dist (ker->n, ker->k) + dist (ker->k, ker->h) + weight * dist (ker->j, ker->g);
  197.    double fk = dist (ker->e, ker->j) + dist (ker->j, ker->o) + dist (ker->b, ker->g) + dist (ker->g, ker->l) + weight * dist (ker->f, ker->k);
  198.  
  199.    if (jg < fk) //test sample: 70% of values max(jg, fk) / min(jg, fk) are between 1.1 and 3.7 with median being 1.8
  200.    {
  201.       const bool dominantGradient = XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD * jg < fk;
  202.       if (ker->f != ker->g && ker->f != ker->j)
  203.          result->blend_f = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  204.  
  205.       if (ker->k != ker->j && ker->k != ker->g)
  206.          result->blend_k = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  207.    }
  208.    else if (fk < jg)
  209.    {
  210.       const bool dominantGradient = XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD * fk < jg;
  211.       if (ker->j != ker->f && ker->j != ker->k)
  212.          result->blend_j = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  213.  
  214.       if (ker->g != ker->f && ker->g != ker->k)
  215.          result->blend_g = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  216.    }
  217.    return;
  218. }
  219.  
  220. // compress four blend types into a single byte
  221. static inline BlendType getTopL (unsigned char b) { return (BlendType) (0x3 & (b >> 0)); }
  222. static inline BlendType getTopR (unsigned char b) { return (BlendType) (0x3 & (b >> 2)); }
  223. static inline BlendType getBottomR (unsigned char b) { return (BlendType) (0x3 & (b >> 4)); }
  224. static inline BlendType getBottomL (unsigned char b) { return (BlendType) (0x3 & (b >> 6)); }
  225.  
  226. static inline void setTopL (unsigned char& b, BlendType bt) { b |= bt; } //buffer is assumed to be initialized before preprocessing!
  227. static inline void setTopR (unsigned char& b, BlendType bt) { b |= (bt << 2); }
  228. static inline void setBottomR (unsigned char& b, BlendType bt) { b |= (bt << 4); }
  229. static inline void setBottomL (unsigned char& b, BlendType bt) { b |= (bt << 6); }
  230.  
  231.  
  232. namespace
  233. {
  234.  
  235.  
  236.  
  237.  
  238.    template <class Scaler>
  239.    FORCE_INLINE void blendPixel (const int scale_factor, const kernel_3x3_t *ker, uint32_t *target, int trgWidth, unsigned char blendInfo, alphagrad_func alphagrad, dist_func dist, RotationDegree rotDeg) //result of preprocessing all four corners of pixel "e"
  240.    {
  241.       // input kernel area naming convention:
  242.       // -------------
  243.       // | A | B | C |
  244.       // ----|---|---|
  245.       // | D | E | F | //input pixel is at position E
  246.       // ----|---|---|
  247.       // | G | H | I |
  248.       // -------------
  249.  
  250.       uint32_t
  251.          a, b, c,
  252.          d, e, f,
  253.          g, h, i;
  254.       unsigned char blend;
  255.  
  256.       if      (rotDeg == ROT_270) { a = ker->c; b = ker->f; c = ker->i; d = ker->b; e = ker->e; f = ker->h; g = ker->a; h = ker->d; i = ker->g; blend = ((blendInfo << 6) | (blendInfo >> 2)) & 0xff; }
  257.       else if (rotDeg == ROT_180) { a = ker->i; b = ker->h; c = ker->g; d = ker->f; e = ker->e; f = ker->d; g = ker->c; h = ker->b; i = ker->a; blend = ((blendInfo << 4) | (blendInfo >> 4)) & 0xff; }
  258.       else if (rotDeg == ROT_90)  { a = ker->g; b = ker->d; c = ker->a; d = ker->h; e = ker->e; f = ker->b; g = ker->i; h = ker->f; i = ker->c; blend = ((blendInfo << 2) | (blendInfo >> 6)) & 0xff; }
  259.       else                        { a = ker->a; b = ker->b; c = ker->c; d = ker->d; e = ker->e; f = ker->f; g = ker->g; h = ker->h; i = ker->i; blend = ((blendInfo << 0) | (blendInfo >> 8)) & 0xff; }
  260.  
  261.       if (getBottomR (blend) >= BLEND_NORMAL)
  262.       {
  263.          outmatrix_t out;
  264.          uint32_t px;
  265.          bool doLineBlend;
  266.  
  267.          if (getBottomR (blend) >= BLEND_DOMINANT)
  268.             doLineBlend = true;
  269.          else if (getTopR (blend) != BLEND_NONE && (dist (e, g) >= XBRZ_CFG_EQUAL_COLOR_TOLERANCE)) //but support double-blending for 90° corners
  270.             doLineBlend = false; // make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
  271.          else if (getBottomL (blend) != BLEND_NONE && (dist (e, c) >= XBRZ_CFG_EQUAL_COLOR_TOLERANCE))
  272.             doLineBlend = false; // make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
  273.          else if ((dist (e, i) >= XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
  274.             && (dist (g, h) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
  275.             && (dist (h, i) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
  276.             && (dist (i, f) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
  277.             && (dist (f, c) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE))
  278.             doLineBlend = false; // no full blending for L-shapes; blend corner only (handles "mario mushroom eyes")
  279.          else
  280.             doLineBlend = true;
  281.  
  282.          outmatrix_create (&out, scale_factor, target, trgWidth, rotDeg);
  283.          px = (dist (e, f) <= dist (e, h) ? f : h); //choose most similar color
  284.  
  285.          if (doLineBlend)
  286.          {
  287.             const double fg = dist (f, g); //test sample: 70% of values max(fg, hc) / min(fg, hc) are between 1.1 and 3.7 with median being 1.9
  288.             const double hc = dist (h, c); //
  289.             const bool haveShallowLine = (XBRZ_CFG_STEEP_DIRECTION_THRESHOLD * fg <= hc) && (e != g) && (d != g);
  290.             const bool haveSteepLine   = (XBRZ_CFG_STEEP_DIRECTION_THRESHOLD * hc <= fg) && (e != c) && (b != c);
  291.  
  292.             if (haveShallowLine)
  293.             {
  294.                if (haveSteepLine)
  295.                   Scaler::blendLineSteepAndShallow (px, &out, alphagrad);
  296.                else
  297.                   Scaler::blendLineShallow (px, &out, alphagrad);
  298.             }
  299.             else
  300.             {
  301.                if (haveSteepLine)
  302.                   Scaler::blendLineSteep (px, &out, alphagrad);
  303.                else
  304.                   Scaler::blendLineDiagonal (px, &out, alphagrad);
  305.             }
  306.          }
  307.          else
  308.             Scaler::blendCorner (px, &out, alphagrad);
  309.       }
  310.    }
  311.  
  312.  
  313.    template <class Scaler> //scaler policy: see "Scaler2x" reference implementation
  314.    void scaleImage (const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight, int yFirst, int yLast, alphagrad_func alphagrad, dist_func dist)
  315.    {
  316.       yFirst = MAX (yFirst, 0);
  317.       yLast = MIN (yLast, srcHeight);
  318.       if (yFirst >= yLast || srcWidth <= 0)
  319.          return;
  320.  
  321.       const int trgWidth = srcWidth * Scaler::scale;
  322.  
  323.       //"use" space at the end of the image as temporary buffer for "on the fly preprocessing": we even could use larger area of
  324.       //"sizeof(uint32_t) * srcWidth * (yLast - yFirst)" bytes without risk of accidental overwriting before accessing
  325.       const int bufferSize = srcWidth;
  326.       unsigned char* preProcBuffer = reinterpret_cast<unsigned char*>(trg + yLast * Scaler::scale * trgWidth) - bufferSize;
  327.       memset (preProcBuffer, 0, bufferSize);
  328.       static_assert(BLEND_NONE == 0, "");
  329.  
  330.       //initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending
  331.       //this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition!
  332.       if (yFirst > 0)
  333.       {
  334.          const int y = yFirst - 1;
  335.  
  336.          const uint32_t* s_m1 = src + srcWidth * MAX (y - 1, 0);
  337.          const uint32_t* s_0 = src + srcWidth * y; //center line
  338.          const uint32_t* s_p1 = src + srcWidth * MIN (y + 1, srcHeight - 1);
  339.          const uint32_t* s_p2 = src + srcWidth * MIN (y + 2, srcHeight - 1);
  340.  
  341.          for (int x = 0; x < srcWidth; ++x)
  342.          {
  343.             blendresult_t res;
  344.             const int x_m1 = MAX (x - 1, 0);
  345.             const int x_p1 = MIN (x + 1, srcWidth - 1);
  346.             const int x_p2 = MIN (x + 2, srcWidth - 1);
  347.  
  348.             kernel_4x4_t ker; //perf: initialization is negligible
  349.             ker.a = s_m1[x_m1]; //read sequentially from memory as far as possible
  350.             ker.b = s_m1[x];
  351.             ker.c = s_m1[x_p1];
  352.             ker.d = s_m1[x_p2];
  353.  
  354.             ker.e = s_0[x_m1];
  355.             ker.f = s_0[x];
  356.             ker.g = s_0[x_p1];
  357.             ker.h = s_0[x_p2];
  358.  
  359.             ker.i = s_p1[x_m1];
  360.             ker.j = s_p1[x];
  361.             ker.k = s_p1[x_p1];
  362.             ker.l = s_p1[x_p2];
  363.  
  364.             ker.m = s_p2[x_m1];
  365.             ker.n = s_p2[x];
  366.             ker.o = s_p2[x_p1];
  367.             ker.p = s_p2[x_p2];
  368.  
  369.             preProcessCorners (&res, &ker, dist);
  370.             /*
  371.             preprocessing blend result:
  372.             ---------
  373.             | F | G |   //evalute corner between F, G, J, K
  374.             ----|---|   //input pixel is at position F
  375.             | J | K |
  376.             ---------
  377.             */
  378.             setTopR (preProcBuffer[x], res.blend_j);
  379.  
  380.             if (x + 1 < bufferSize)
  381.                setTopL (preProcBuffer[x + 1], res.blend_k);
  382.          }
  383.       }
  384.       //------------------------------------------------------------------------------------
  385.  
  386.       for (int y = yFirst; y < yLast; ++y)
  387.       {
  388.          uint32_t *out = trg + Scaler::scale * y * trgWidth; //consider MT "striped" access
  389.  
  390.          const uint32_t* s_m1 = src + srcWidth * MAX (y - 1, 0);
  391.          const uint32_t* s_0 = src + srcWidth * y; //center line
  392.          const uint32_t* s_p1 = src + srcWidth * MIN (y + 1, srcHeight - 1);
  393.          const uint32_t* s_p2 = src + srcWidth * MIN (y + 2, srcHeight - 1);
  394.  
  395.          unsigned char blend_xy1 = 0; //corner blending for current (x, y + 1) position
  396.  
  397.          for (int x = 0; x < srcWidth; ++x, out += Scaler::scale)
  398.          {
  399.             //all those bounds checks have only insignificant impact on performance!
  400.             const int x_m1 = MAX (x - 1, 0); //perf: prefer array indexing to additional pointers!
  401.             const int x_p1 = MIN (x + 1, srcWidth - 1);
  402.             const int x_p2 = MIN (x + 2, srcWidth - 1);
  403.             kernel_4x4_t ker4; //perf: initialization is negligible
  404.  
  405.             ker4.a = s_m1[x_m1]; //read sequentially from memory as far as possible
  406.             ker4.b = s_m1[x];
  407.             ker4.c = s_m1[x_p1];
  408.             ker4.d = s_m1[x_p2];
  409.  
  410.             ker4.e = s_0[x_m1];
  411.             ker4.f = s_0[x];
  412.             ker4.g = s_0[x_p1];
  413.             ker4.h = s_0[x_p2];
  414.  
  415.             ker4.i = s_p1[x_m1];
  416.             ker4.j = s_p1[x];
  417.             ker4.k = s_p1[x_p1];
  418.             ker4.l = s_p1[x_p2];
  419.  
  420.             ker4.m = s_p2[x_m1];
  421.             ker4.n = s_p2[x];
  422.             ker4.o = s_p2[x_p1];
  423.             ker4.p = s_p2[x_p2];
  424.  
  425.             //evaluate the four corners on bottom-right of current pixel
  426.             unsigned char blend_xy = 0; //for current (x, y) position
  427.             {
  428.                blendresult_t res;
  429.                preProcessCorners (&res, &ker4, dist);
  430.                /*
  431.                preprocessing blend result:
  432.                ---------
  433.                | F | G |   //evalute corner between F, G, J, K
  434.                ----|---|   //current input pixel is at position F
  435.                | J | K |
  436.                ---------
  437.                */
  438.                blend_xy = preProcBuffer[x];
  439.                setBottomR (blend_xy, res.blend_f); //all four corners of (x, y) have been determined at this point due to processing sequence!
  440.  
  441.                setTopR (blend_xy1, res.blend_j); //set 2nd known corner for (x, y + 1)
  442.                preProcBuffer[x] = blend_xy1; //store on current buffer position for use on next row
  443.  
  444.                blend_xy1 = 0;
  445.                setTopL (blend_xy1, res.blend_k); //set 1st known corner for (x + 1, y + 1) and buffer for use on next column
  446.  
  447.                if (x + 1 < bufferSize) //set 3rd known corner for (x + 1, y)
  448.                   setBottomL (preProcBuffer[x + 1], res.blend_g);
  449.             }
  450.  
  451.             //fill block of size scale * scale with the given color
  452.             {
  453.                uint32_t *blk = out;
  454.                for (int _blk_y = 0; _blk_y < Scaler::scale; ++_blk_y, blk = (uint32_t *) BYTE_ADVANCE (blk, trgWidth * sizeof (uint32_t)))
  455.                   for (int _blk_x = 0; _blk_x < Scaler::scale; ++_blk_x)
  456.                      blk[_blk_x] = ker4.f;
  457.             }
  458.             //place *after* preprocessing step, to not overwrite the results while processing the the last pixel!
  459.  
  460.             //blend four corners of current pixel
  461.             if (blend_xy != 0) //good 5% perf-improvement
  462.             {
  463.                kernel_3x3_t ker3; //perf: initialization is negligible
  464.  
  465.                ker3.a = ker4.a;
  466.                ker3.b = ker4.b;
  467.                ker3.c = ker4.c;
  468.  
  469.                ker3.d = ker4.e;
  470.                ker3.e = ker4.f;
  471.                ker3.f = ker4.g;
  472.  
  473.                ker3.g = ker4.i;
  474.                ker3.h = ker4.j;
  475.                ker3.i = ker4.k;
  476.  
  477.                blendPixel<Scaler> (Scaler::scale, &ker3, out, trgWidth, blend_xy, alphagrad, dist, ROT_0);
  478.                blendPixel<Scaler> (Scaler::scale, &ker3, out, trgWidth, blend_xy, alphagrad, dist, ROT_90);
  479.                blendPixel<Scaler> (Scaler::scale, &ker3, out, trgWidth, blend_xy, alphagrad, dist, ROT_180);
  480.                blendPixel<Scaler> (Scaler::scale, &ker3, out, trgWidth, blend_xy, alphagrad, dist, ROT_270);
  481.             }
  482.          }
  483.       }
  484.    }
  485.  
  486.  
  487.    //------------------------------------------------------------------------------------
  488.    struct Scaler2x
  489.    {
  490.       static const int scale = 2;
  491.  
  492.  
  493.       static void blendLineShallow (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  494.       {
  495.          alphagrad (outmatrix_ref (out, scale - 1, 0), col, 1, 4);
  496.          alphagrad (outmatrix_ref (out, scale - 1, 1), col, 3, 4);
  497.       }
  498.  
  499.       static void blendLineSteep (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  500.       {
  501.          alphagrad (outmatrix_ref (out, 0, scale - 1), col, 1, 4);
  502.          alphagrad (outmatrix_ref (out, 1, scale - 1), col, 3, 4);
  503.       }
  504.  
  505.       static void blendLineSteepAndShallow (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  506.       {
  507.          alphagrad (outmatrix_ref (out, 1, 0), col, 1, 4);
  508.          alphagrad (outmatrix_ref (out, 0, 1), col, 1, 4);
  509.          alphagrad (outmatrix_ref (out, 1, 1), col, 5, 6); //[!] fixes 7/8 used in xBR
  510.       }
  511.  
  512.       static void blendLineDiagonal (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  513.       {
  514.          alphagrad (outmatrix_ref (out, 1, 1), col, 1, 2);
  515.       }
  516.  
  517.       static void blendCorner (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  518.       {
  519.          //model a round corner
  520.          alphagrad (outmatrix_ref (out, 1, 1), col, 21, 100); //exact: 1 - pi/4 = 0.2146018366
  521.       }
  522.    };
  523.  
  524.  
  525.    struct Scaler3x
  526.    {
  527.       static const int scale = 3;
  528.  
  529.  
  530.       static void blendLineShallow (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  531.       {
  532.          alphagrad (outmatrix_ref (out, scale - 1, 0), col, 1, 4);
  533.          alphagrad (outmatrix_ref (out, scale - 2, 2), col, 1, 4);
  534.          alphagrad (outmatrix_ref (out, scale - 1, 1), col, 3, 4);
  535.          *outmatrix_ref (out, scale - 1, 2) = col;
  536.       }
  537.  
  538.       static void blendLineSteep (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  539.       {
  540.          alphagrad (outmatrix_ref (out, 0, scale - 1), col, 1, 4);
  541.          alphagrad (outmatrix_ref (out, 2, scale - 2), col, 1, 4);
  542.          alphagrad (outmatrix_ref (out, 1, scale - 1), col, 3, 4);
  543.          *outmatrix_ref (out, 2, scale - 1) = col;
  544.       }
  545.  
  546.       static void blendLineSteepAndShallow (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  547.       {
  548.          alphagrad (outmatrix_ref (out, 2, 0), col, 1, 4);
  549.          alphagrad (outmatrix_ref (out, 0, 2), col, 1, 4);
  550.          alphagrad (outmatrix_ref (out, 2, 1), col, 3, 4);
  551.          alphagrad (outmatrix_ref (out, 1, 2), col, 3, 4);
  552.          *outmatrix_ref (out, 2, 2) = col;
  553.       }
  554.  
  555.       static void blendLineDiagonal (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  556.       {
  557.          alphagrad (outmatrix_ref (out, 1, 2), col, 1, 8); //conflict with other rotations for this odd scale
  558.          alphagrad (outmatrix_ref (out, 2, 1), col, 1, 8);
  559.          alphagrad (outmatrix_ref (out, 2, 2), col, 7, 8); //
  560.       }
  561.  
  562.       static void blendCorner (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  563.       {
  564.          //model a round corner
  565.          alphagrad (outmatrix_ref (out, 2, 2), col, 45, 100); //exact: 0.4545939598
  566.          //alphagrad (outmatrix_ref (out, 2, 1), col, 7, 256); //0.02826017254 -> negligible + avoid conflicts with other rotations for this odd scale
  567.          //alphagrad (outmatrix_ref (out, 1, 2), col, 7, 256); //0.02826017254
  568.       }
  569.    };
  570.  
  571.  
  572.    struct Scaler4x
  573.    {
  574.       static const int scale = 4;
  575.  
  576.  
  577.       static void blendLineShallow (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  578.       {
  579.          alphagrad (outmatrix_ref (out, scale - 1, 0), col, 1, 4);
  580.          alphagrad (outmatrix_ref (out, scale - 2, 2), col, 1, 4);
  581.          alphagrad (outmatrix_ref (out, scale - 1, 1), col, 3, 4);
  582.          alphagrad (outmatrix_ref (out, scale - 2, 3), col, 3, 4);
  583.          *outmatrix_ref (out, scale - 1, 2) = col;
  584.          *outmatrix_ref (out, scale - 1, 3) = col;
  585.       }
  586.  
  587.       static void blendLineSteep (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  588.       {
  589.          alphagrad (outmatrix_ref (out, 0, scale - 1), col, 1, 4);
  590.          alphagrad (outmatrix_ref (out, 2, scale - 2), col, 1, 4);
  591.          alphagrad (outmatrix_ref (out, 1, scale - 1), col, 3, 4);
  592.          alphagrad (outmatrix_ref (out, 3, scale - 2), col, 3, 4);
  593.          *outmatrix_ref (out, 2, scale - 1) = col;
  594.          *outmatrix_ref (out, 3, scale - 1) = col;
  595.       }
  596.  
  597.       static void blendLineSteepAndShallow (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  598.       {
  599.          alphagrad (outmatrix_ref (out, 3, 1), col, 3, 4);
  600.          alphagrad (outmatrix_ref (out, 1, 3), col, 3, 4);
  601.          alphagrad (outmatrix_ref (out, 3, 0), col, 1, 4);
  602.          alphagrad (outmatrix_ref (out, 0, 3), col, 1, 4);
  603.          alphagrad (outmatrix_ref (out, 2, 2), col, 1, 3); //[!] fixes 1/4 used in xBR
  604.          *outmatrix_ref (out, 3, 3) = col;
  605.          *outmatrix_ref (out, 3, 2) = col;
  606.          *outmatrix_ref (out, 2, 3) = col;
  607.       }
  608.  
  609.       static void blendLineDiagonal (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  610.       {
  611.          alphagrad (outmatrix_ref (out, scale - 1, scale / 2), col, 1, 2);
  612.          alphagrad (outmatrix_ref (out, scale - 2, scale / 2 + 1), col, 1, 2);
  613.          *outmatrix_ref (out, scale - 1, scale - 1) = col;
  614.       }
  615.  
  616.       static void blendCorner (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  617.       {
  618.          //model a round corner
  619.          alphagrad (outmatrix_ref (out, 3, 3), col, 68, 100); //exact: 0.6848532563
  620.          alphagrad (outmatrix_ref (out, 3, 2), col, 9, 100); //0.08677704501
  621.          alphagrad (outmatrix_ref (out, 2, 3), col, 9, 100); //0.08677704501
  622.       }
  623.    };
  624.  
  625.  
  626.    struct Scaler5x
  627.    {
  628.       static const int scale = 5;
  629.  
  630.  
  631.       static void blendLineShallow (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  632.       {
  633.          alphagrad (outmatrix_ref (out, scale - 1, 0), col, 1, 4);
  634.          alphagrad (outmatrix_ref (out, scale - 2, 2), col, 1, 4);
  635.          alphagrad (outmatrix_ref (out, scale - 3, 4), col, 1, 4);
  636.          alphagrad (outmatrix_ref (out, scale - 1, 1), col, 3, 4);
  637.          alphagrad (outmatrix_ref (out, scale - 2, 3), col, 3, 4);
  638.          *outmatrix_ref (out, scale - 1, 2) = col;
  639.          *outmatrix_ref (out, scale - 1, 3) = col;
  640.          *outmatrix_ref (out, scale - 1, 4) = col;
  641.          *outmatrix_ref (out, scale - 2, 4) = col;
  642.       }
  643.  
  644.       static void blendLineSteep (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  645.       {
  646.          alphagrad (outmatrix_ref (out, 0, scale - 1), col, 1, 4);
  647.          alphagrad (outmatrix_ref (out, 2, scale - 2), col, 1, 4);
  648.          alphagrad (outmatrix_ref (out, 4, scale - 3), col, 1, 4);
  649.          alphagrad (outmatrix_ref (out, 1, scale - 1), col, 3, 4);
  650.          alphagrad (outmatrix_ref (out, 3, scale - 2), col, 3, 4);
  651.          *outmatrix_ref (out, 2, scale - 1) = col;
  652.          *outmatrix_ref (out, 3, scale - 1) = col;
  653.          *outmatrix_ref (out, 4, scale - 1) = col;
  654.          *outmatrix_ref (out, 4, scale - 2) = col;
  655.       }
  656.  
  657.       static void blendLineSteepAndShallow (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  658.       {
  659.          alphagrad (outmatrix_ref (out, 0, scale - 1), col, 1, 4);
  660.          alphagrad (outmatrix_ref (out, 2, scale - 2), col, 1, 4);
  661.          alphagrad (outmatrix_ref (out, 1, scale - 1), col, 3, 4);
  662.          alphagrad (outmatrix_ref (out, scale - 1, 0), col, 1, 4);
  663.          alphagrad (outmatrix_ref (out, scale - 2, 2), col, 1, 4);
  664.          alphagrad (outmatrix_ref (out, scale - 1, 1), col, 3, 4);
  665.          alphagrad (outmatrix_ref (out, 3, 3), col, 2, 3);
  666.          *outmatrix_ref (out, 2, scale - 1) = col;
  667.          *outmatrix_ref (out, 3, scale - 1) = col;
  668.          *outmatrix_ref (out, 4, scale - 1) = col;
  669.          *outmatrix_ref (out, scale - 1, 2) = col;
  670.          *outmatrix_ref (out, scale - 1, 3) = col;
  671.       }
  672.  
  673.       static void blendLineDiagonal (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  674.       {
  675.          alphagrad (outmatrix_ref (out, scale - 1, scale / 2 + 0), col, 1, 8); //conflict with other rotations for this odd scale
  676.          alphagrad (outmatrix_ref (out, scale - 2, scale / 2 + 1), col, 1, 8);
  677.          alphagrad (outmatrix_ref (out, scale - 3, scale / 2 + 2), col, 1, 8); //
  678.          alphagrad (outmatrix_ref (out, 4, 3), col, 7, 8);
  679.          alphagrad (outmatrix_ref (out, 3, 4), col, 7, 8);
  680.          *outmatrix_ref (out, 4, 4) = col;
  681.       }
  682.  
  683.       static void blendCorner (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  684.       {
  685.          // model a round corner
  686.          alphagrad (outmatrix_ref (out, 4, 4), col, 86, 100); //exact: 0.8631434088
  687.          alphagrad (outmatrix_ref (out, 4, 3), col, 23, 100); //0.2306749731
  688.          alphagrad (outmatrix_ref (out, 3, 4), col, 23, 100); //0.2306749731
  689.          //alphagrad (outmatrix_ref (out, 4, 2), col, 1, 64); //0.01676812367 -> negligible + avoid conflicts with other rotations for this odd scale
  690.          //alphagrad (outmatrix_ref (out, 2, 4), col, 1, 64); //0.01676812367
  691.       }
  692.    };
  693.  
  694.  
  695.    struct Scaler6x
  696.    {
  697.       static const int scale = 6;
  698.  
  699.  
  700.       static void blendLineShallow (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  701.       {
  702.          alphagrad (outmatrix_ref (out, scale - 1, 0), col, 1, 4);
  703.          alphagrad (outmatrix_ref (out, scale - 2, 2), col, 1, 4);
  704.          alphagrad (outmatrix_ref (out, scale - 3, 4), col, 1, 4);
  705.          alphagrad (outmatrix_ref (out, scale - 1, 1), col, 3, 4);
  706.          alphagrad (outmatrix_ref (out, scale - 2, 3), col, 3, 4);
  707.          alphagrad (outmatrix_ref (out, scale - 3, 5), col, 3, 4);
  708.  
  709.          *outmatrix_ref (out, scale - 1, 2) = col;
  710.          *outmatrix_ref (out, scale - 1, 3) = col;
  711.          *outmatrix_ref (out, scale - 1, 4) = col;
  712.          *outmatrix_ref (out, scale - 1, 5) = col;
  713.          *outmatrix_ref (out, scale - 2, 4) = col;
  714.          *outmatrix_ref (out, scale - 2, 5) = col;
  715.       }
  716.  
  717.       static void blendLineSteep (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  718.       {
  719.          alphagrad (outmatrix_ref (out, 0, scale - 1), col, 1, 4);
  720.          alphagrad (outmatrix_ref (out, 2, scale - 2), col, 1, 4);
  721.          alphagrad (outmatrix_ref (out, 4, scale - 3), col, 1, 4);
  722.          alphagrad (outmatrix_ref (out, 1, scale - 1), col, 3, 4);
  723.          alphagrad (outmatrix_ref (out, 3, scale - 2), col, 3, 4);
  724.          alphagrad (outmatrix_ref (out, 5, scale - 3), col, 3, 4);
  725.          *outmatrix_ref (out, 2, scale - 1) = col;
  726.          *outmatrix_ref (out, 3, scale - 1) = col;
  727.          *outmatrix_ref (out, 4, scale - 1) = col;
  728.          *outmatrix_ref (out, 5, scale - 1) = col;
  729.          *outmatrix_ref (out, 4, scale - 2) = col;
  730.          *outmatrix_ref (out, 5, scale - 2) = col;
  731.       }
  732.  
  733.       static void blendLineSteepAndShallow (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  734.       {
  735.          alphagrad (outmatrix_ref (out, 0, scale - 1), col, 1, 4);
  736.          alphagrad (outmatrix_ref (out, 2, scale - 2), col, 1, 4);
  737.          alphagrad (outmatrix_ref (out, 1, scale - 1), col, 3, 4);
  738.          alphagrad (outmatrix_ref (out, 3, scale - 2), col, 3, 4);
  739.          alphagrad (outmatrix_ref (out, scale - 1, 0), col, 1, 4);
  740.          alphagrad (outmatrix_ref (out, scale - 2, 2), col, 1, 4);
  741.          alphagrad (outmatrix_ref (out, scale - 1, 1), col, 3, 4);
  742.          alphagrad (outmatrix_ref (out, scale - 2, 3), col, 3, 4);
  743.          *outmatrix_ref (out, 2, scale - 1) = col;
  744.          *outmatrix_ref (out, 3, scale - 1) = col;
  745.          *outmatrix_ref (out, 4, scale - 1) = col;
  746.          *outmatrix_ref (out, 5, scale - 1) = col;
  747.          *outmatrix_ref (out, 4, scale - 2) = col;
  748.          *outmatrix_ref (out, 5, scale - 2) = col;
  749.          *outmatrix_ref (out, scale - 1, 2) = col;
  750.          *outmatrix_ref (out, scale - 1, 3) = col;
  751.       }
  752.  
  753.       static void blendLineDiagonal (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  754.       {
  755.          alphagrad (outmatrix_ref (out, scale - 1, scale / 2 + 0), col, 1, 2);
  756.          alphagrad (outmatrix_ref (out, scale - 2, scale / 2 + 1), col, 1, 2);
  757.          alphagrad (outmatrix_ref (out, scale - 3, scale / 2 + 2), col, 1, 2);
  758.          *outmatrix_ref (out, scale - 2, scale - 1) = col;
  759.          *outmatrix_ref (out, scale - 1, scale - 1) = col;
  760.          *outmatrix_ref (out, scale - 1, scale - 2) = col;
  761.       }
  762.  
  763.       static void blendCorner (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  764.       {
  765.          //model a round corner
  766.          alphagrad (outmatrix_ref (out, 5, 5), col, 97, 100); //exact: 0.9711013910
  767.          alphagrad (outmatrix_ref (out, 4, 5), col, 42, 100); //0.4236372243
  768.          alphagrad (outmatrix_ref (out, 5, 4), col, 42, 100); //0.4236372243
  769.          alphagrad (outmatrix_ref (out, 5, 3), col, 6, 100); //0.05652034508
  770.          alphagrad (outmatrix_ref (out, 3, 5), col, 6, 100); //0.05652034508
  771.       }
  772.    };
  773.  
  774.    //------------------------------------------------------------------------------------
  775. }
  776.  
  777.  
  778.  
  779. static double dist24 (uint32_t pix1, uint32_t pix2)
  780. {
  781.    //30% perf boost compared to plain distYCbCr()!
  782.    //consumes 64 MB memory; using double is only 2% faster, but takes 128 MB
  783.    static float diffToDist[256 * 256 * 256];
  784.    static bool is_initialized = false;
  785.    if (!is_initialized)
  786.    {
  787.       for (uint32_t i = 0; i < 256 * 256 * 256; ++i) //startup time: 114 ms on Intel Core i5 (four cores)
  788.       {
  789.          const int r_diff = GET_RED (i) * 2 - 0xFF;
  790.          const int g_diff = GET_GREEN (i) * 2 - 0xFF;
  791.          const int b_diff = GET_BLUE (i) * 2 - 0xFF;
  792.  
  793.          const double k_b = 0.0593; //ITU-R BT.2020 conversion
  794.          const double k_r = 0.2627; //
  795.          const double k_g = 1 - k_b - k_r;
  796.  
  797.          const double scale_b = 0.5 / (1 - k_b);
  798.          const double scale_r = 0.5 / (1 - k_r);
  799.  
  800.          const double y = k_r * r_diff + k_g * g_diff + k_b * b_diff; //[!], analog YCbCr!
  801.          const double c_b = scale_b * (b_diff - y);
  802.          const double c_r = scale_r * (r_diff - y);
  803.  
  804.          diffToDist[i] = (float) (sqrt ((y * y) + (c_b * c_b) + (c_r * c_r)));
  805.       }
  806.       is_initialized = true;
  807.    }
  808.  
  809.    const int r_diff = (int) GET_RED (pix1) - (int) GET_RED (pix2);
  810.    const int g_diff = (int) GET_GREEN (pix1) - (int) GET_GREEN (pix2);
  811.    const int b_diff = (int) GET_BLUE (pix1) - (int) GET_BLUE (pix2);
  812.  
  813.    return diffToDist[(((r_diff + 0xFF) / 2) << 16) | //slightly reduce precision (division by 2) to squeeze value into single byte
  814.       (((g_diff + 0xFF) / 2) << 8) |
  815.       (((b_diff + 0xFF) / 2) << 0)];
  816. }
  817.  
  818.  
  819. static double dist32 (uint32_t pix1, uint32_t pix2)
  820. {
  821.    const double a1 = GET_ALPHA (pix1) / 255.0;
  822.    const double a2 = GET_ALPHA (pix2) / 255.0;
  823.    /*
  824.    Requirements for a color distance handling alpha channel: with a1, a2 in [0, 1]
  825.  
  826.        1. if a1 = a2, distance should be: a1 * distYCbCr()
  827.        2. if a1 = 0,  distance should be: a2 * distYCbCr(black, white) = a2 * 255
  828.        3. if a1 = 1,  ??? maybe: 255 * (1 - a2) + a2 * distYCbCr()
  829.    */
  830.  
  831.    //return MIN (a1, a2) * distYCbCrBuffered(pix1, pix2) + 255 * abs(a1 - a2);
  832.    //=> following code is 15% faster:
  833.    const double d = dist24 (pix1, pix2);
  834.    return (a1 < a2 ? a1 * d + 255 * (a2 - a1) : a2 * d + 255 * (a1 - a2));
  835. }
  836.  
  837.  
  838. static void alphagrad24 (uint32_t *pixBack, uint32_t pixFront, unsigned int M, unsigned int N)
  839. {
  840.    // blend front color with opacity M / N over opaque background: http://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending
  841.    *pixBack = ((CALC_COLOR24 (GET_RED (pixFront), GET_RED (*pixBack), M, N) << 16)
  842.       | (CALC_COLOR24 (GET_GREEN (pixFront), GET_GREEN (*pixBack), M, N) << 8)
  843.       | (CALC_COLOR24 (GET_BLUE (pixFront), GET_BLUE (*pixBack), M, N) << 0));
  844. }
  845.  
  846.  
  847. static void alphagrad32 (uint32_t *pixBack, uint32_t pixFront, unsigned int M, unsigned int N)
  848. {
  849.    // find intermediate color between two colors with alpha channels (=> NO alpha blending!!!)
  850.    const unsigned int weightFront = GET_ALPHA (pixFront) * M;
  851.    const unsigned int weightBack = GET_ALPHA (*pixBack) * (N - M);
  852.    const unsigned int weightSum = weightFront + weightBack;
  853.    *pixBack = (weightSum == 0 ? 0 :
  854.       (((unsigned char) (weightSum / N)) << 24)
  855.       | (CALC_COLOR32 (GET_RED (pixFront), GET_RED (*pixBack), weightFront, weightBack, weightSum) << 16)
  856.       | (CALC_COLOR32 (GET_GREEN (pixFront), GET_GREEN (*pixBack), weightFront, weightBack, weightSum) << 8)
  857.       | (CALC_COLOR32 (GET_BLUE (pixFront), GET_BLUE (*pixBack), weightFront, weightBack, weightSum) << 0));
  858. }
  859.  
  860.  
  861. EXTERN_C void nearestNeighborScale (const uint32_t *src, int srcWidth, int srcHeight, uint32_t *trg, int trgWidth, int trgHeight)
  862. {
  863.    //    nearestNeighborScale (src, srcWidth, srcHeight, srcWidth * sizeof (uint32_t), trg, trgWidth, trgHeight, trgWidth * sizeof (uint32_t), XBRZ_SLICETYPE_TARGET, 0, trgHeight, [](uint32_t pix) { return pix; });
  864.        //static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*");
  865.        //static_assert(std::is_integral<PixTrg>::value, "PixTrg* is expected to be cast-able to char*");
  866.        //static_assert(std::is_same<decltype(pixCvrt(PixSrc())), PixTrg>::value, "PixConverter returning wrong pixel format");
  867.  
  868.    int srcPitch = srcWidth * sizeof (uint32_t);
  869.    int trgPitch = trgWidth * sizeof (uint32_t);
  870.    int yFirst;
  871.    int yLast;
  872.  
  873. #if 0 // going over source image - fast for upscaling, since source is read only once
  874.    yFirst = 0;
  875.    yLast = MIN (trgHeight, srcHeight);
  876.  
  877.    if (yFirst >= yLast || trgWidth <= 0 || trgHeight <= 0)
  878.       return; // consistency check
  879.  
  880.    for (int y = yFirst; y < yLast; ++y)
  881.    {
  882.       //mathematically: ySrc = floor(srcHeight * yTrg / trgHeight)
  883.       // => search for integers in: [ySrc, ySrc + 1) * trgHeight / srcHeight
  884.  
  885.       //keep within for loop to support MT input slices!
  886.       const int yTrg_first = (y      * trgHeight + srcHeight - 1) / srcHeight; //=ceil(y * trgHeight / srcHeight)
  887.       const int yTrg_last = ((y + 1) * trgHeight + srcHeight - 1) / srcHeight; //=ceil(((y + 1) * trgHeight) / srcHeight)
  888.       const int blockHeight = yTrg_last - yTrg_first;
  889.  
  890.       if (blockHeight > 0)
  891.       {
  892.          const uint32_t *srcLine = (const uint32_t *) BYTE_ADVANCE (src, y * srcPitch);
  893.          /**/  uint32_t *trgLine = (uint32_t *) BYTE_ADVANCE (trg, yTrg_first * trgPitch);
  894.          int xTrg_first = 0;
  895.  
  896.          for (int x = 0; x < srcWidth; ++x)
  897.          {
  898.             const int xTrg_last = ((x + 1) * trgWidth + srcWidth - 1) / srcWidth;
  899.             const int blockWidth = xTrg_last - xTrg_first;
  900.             if (blockWidth > 0)
  901.             {
  902.                const uint32_t trgColor = srcLine[x];
  903.                uint32_t *blkLine = trgLine;
  904.  
  905.                xTrg_first = xTrg_last;
  906.  
  907.                for (int blk_y = 0; blk_y < blockHeight; ++blk_y, blkLine = (uint32_t *) BYTE_ADVANCE (blkLine, trgPitch))
  908.                   for (int blk_x = 0; blk_x < blockWidth; ++blk_x)
  909.                      blkLine[blk_x] = trgColor;
  910.  
  911.                trgLine += blockWidth;
  912.             }
  913.          }
  914.       }
  915.    }
  916. #else // going over target image - slow for upscaling, since source is read multiple times missing out on cache! Fast for similar image sizes!
  917.    yFirst = 0;
  918.    yLast = trgHeight;
  919.  
  920.    if (yFirst >= yLast || srcHeight <= 0 || srcWidth <= 0)
  921.       return; // consistency check
  922.  
  923.    for (int y = yFirst; y < yLast; ++y)
  924.    {
  925.       /**/  uint32_t *trgLine = (uint32_t *) BYTE_ADVANCE (trg, y * trgPitch);
  926.       const int ySrc = srcHeight * y / trgHeight;
  927.       const uint32_t *srcLine = (const uint32_t *) BYTE_ADVANCE (src, ySrc * srcPitch);
  928.       for (int x = 0; x < trgWidth; ++x)
  929.       {
  930.          const int xSrc = srcWidth * x / trgWidth;
  931.          trgLine[x] = srcLine[xSrc];
  932.       }
  933.    }
  934. #endif // going over source or target
  935.  
  936.    return;
  937. }
  938.  
  939.  
  940. EXTERN_C bool xbrz_equalcolortest24 (uint32_t col1, uint32_t col2, double luminanceWeight, double equalColorTolerance)
  941. {
  942.    return (dist24 (col1, col2) < equalColorTolerance);
  943. }
  944.  
  945.  
  946. EXTERN_C bool xbrz_equalcolortest32 (uint32_t col1, uint32_t col2, double luminanceWeight, double equalColorTolerance)
  947. {
  948.    return (dist32 (col1, col2) < equalColorTolerance);
  949. }
  950.  
  951.  
  952. EXTERN_C void xbrz_scale24 (size_t factor, const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight)
  953. {
  954.    if (factor == 2) return scaleImage<Scaler2x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24);
  955.    else if (factor == 3) return scaleImage<Scaler3x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24);
  956.    else if (factor == 4) return scaleImage<Scaler4x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24);
  957.    else if (factor == 5) return scaleImage<Scaler5x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24);
  958.    else if (factor == 6) return scaleImage<Scaler6x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24);
  959. }
  960.  
  961.  
  962. EXTERN_C void xbrz_scale32 (size_t factor, const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight)
  963. {
  964.    if (factor == 2) return scaleImage<Scaler2x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32);
  965.    else if (factor == 3) return scaleImage<Scaler3x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32);
  966.    else if (factor == 4) return scaleImage<Scaler4x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32);
  967.    else if (factor == 5) return scaleImage<Scaler5x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32);
  968.    else if (factor == 6) return scaleImage<Scaler6x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32);
  969. }
  970.