  1. // ****************************************************************************
  2. // * This file is part of the HqMAME project. It is distributed under         *
  3. // * GNU General Public License: https://www.gnu.org/licenses/gpl-3.0         *
  4. // * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved          *
  5. // *                                                                          *
  6. // * Additionally and as a special exception, the author gives permission     *
  7. // * to link the code of this program with the MAME library (or with modified *
  8. // * versions of MAME that use the same license as MAME), and distribute      *
  9. // * linked combinations including the two. You must obey the GNU General     *
  10. // * Public License in all respects for all of the code used other than MAME. *
  11. // * If you modify this file, you may extend this exception to your version   *
  12. // * of the file, but you are not obligated to do so. If you do not wish to   *
  13. // * do so, delete this exception statement from your version.                *
  14. // ****************************************************************************
  15.  
  16. // -------------------------------------------------------------------------
  17. // | xBRZ: "Scale by rules" - high quality image upscaling filter by Zenju |
  18. // -------------------------------------------------------------------------
  19. // using a modified approach of xBR:
  20. // http://board.byuu.org/viewtopic.php?f=10&t=2248
  21. //  - new rule set preserving small image features
  22. //  - highly optimized for performance
  23. //  - support alpha channel
  24. //  - support multithreading
  25. //  - support 64-bit architectures
  26. //  - support processing image slices
  27. //  - support scaling up to 6xBRZ
  28.  
  29. // -> map the source image (srcWidth x srcHeight) to the target image (scale * srcWidth x scale * srcHeight), optionally processing only a half-open slice of rows [yFirst, yLast)
  30. // -> support for source/target pitch in bytes!
  31. // -> if your emulator changes only a few image slices during each cycle (e.g. DOSBox), there is no need to run xBRZ on the complete image:
  32. //    just enlarge each changed source slice by 2 rows on top and 2 rows on bottom (this is the additional range the xBRZ algorithm reads during analysis)
  33. //    CAVEAT: if there are multiple changed slices, make sure they do not overlap after adding these additional rows; otherwise multiple threads
  34. //    processing the enlarged slices would race on the same target image data!
  35. //
  36. // THREAD-SAFETY: - parts of the same image may be scaled by multiple threads as long as the [yFirst, yLast) ranges do not overlap!
  37. //                - there is a minor inefficiency for the first row of a slice, so avoid processing single rows only; suggestion: process at least 8-16 rows
  38.  
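// ---------------------------------------------------------------------------
// Illustrative sketch (added for this write-up, not part of the original file):
// one possible way to scale two non-overlapping [yFirst, yLast) slices on
// separate threads, as described above. It assumes POSIX threads are available
// and uses scale_image(), scalers[], alphagrad32() and dist32() defined further
// down in this file; the split at srcHeight / 2 is an arbitrary example choice.
#if 0
#include <pthread.h>

typedef struct slice_job_s
{
   const uint32_t *src;
   uint32_t *trg;
   int srcWidth, srcHeight;
   int yFirst, yLast; // half-open row range handled by this thread
} slice_job_t;

static void *scale_slice_thread (void *arg)
{
   const slice_job_t *job = (const slice_job_t *) arg;
   // scalers[1] is the 3x scaler (see the scalers[] table below)
   scale_image (&scalers[1], job->src, job->trg, job->srcWidth, job->srcHeight,
                job->yFirst, job->yLast, alphagrad32, dist32);
   return NULL;
}

static void scale_3x_with_two_threads (const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight)
{
   slice_job_t upper = { src, trg, srcWidth, srcHeight, 0,             srcHeight / 2 };
   slice_job_t lower = { src, trg, srcWidth, srcHeight, srcHeight / 2, srcHeight     };
   pthread_t t0, t1;

   pthread_create (&t0, NULL, scale_slice_thread, &upper);
   pthread_create (&t1, NULL, scale_slice_thread, &lower);
   pthread_join (t0, NULL);
   pthread_join (t1, NULL);
}
#endif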
  39.  
  40. #include <stddef.h> // for size_t
  41. #include <stdint.h> // for uint32_t
  42. #include <memory.h> // for memset()
  43. #include <limits.h>
  44. #include <math.h>
  45.  
  46.  
  47. #ifdef __cplusplus
  48. #define EXTERN_C extern "C"
  49. #else // !__cplusplus
  50. #define EXTERN_C
  51. #endif // __cplusplus
  52.  
  53.  
  54. #ifdef _MSC_VER
  55. #define FORCE_INLINE __forceinline
  56. #elif defined __GNUC__
  57. #define FORCE_INLINE __attribute__((always_inline)) inline
  58. #else
  59. #define FORCE_INLINE inline
  60. #endif
  61.  
  62.  
  63. // scaler configuration
  64. #define XBRZ_CFG_LUMINANCE_WEIGHT 1
  65. #define XBRZ_CFG_EQUAL_COLOR_TOLERANCE 30
  66. #define XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD 3.6
  67. #define XBRZ_CFG_STEEP_DIRECTION_THRESHOLD 2.2
  68.  
  69.  
  70. // slice types
  71. #define XBRZ_SLICETYPE_SOURCE 1
  72. #define XBRZ_SLICETYPE_TARGET 2
  73.  
  74.  
  75. // handy macros
  76. #define GET_BYTE(val,byteno) ((unsigned char) (((val) >> ((byteno) << 3)) & 0xff))
  77. #define GET_BLUE(val)  GET_BYTE (val, 0)
  78. #define GET_GREEN(val) GET_BYTE (val, 1)
  79. #define GET_RED(val)   GET_BYTE (val, 2)
  80. #define GET_ALPHA(val) GET_BYTE (val, 3)
  81. #define CALC_COLOR24(colFront,colBack,M,N) (unsigned char) ((((unsigned char) (colFront)) * ((unsigned int) (M)) + ((unsigned char) (colBack)) * (((unsigned int) (N)) - ((unsigned int) (M)))) / ((unsigned int) (N)))
  82. #define CALC_COLOR32(colFront,colBack,weightFront,weightBack,weightSum) ((unsigned char) ((((unsigned char) (colFront)) * ((unsigned int) (weightFront)) + ((unsigned char) (colBack)) * ((unsigned int) (weightBack))) / ((unsigned int) (weightSum))))
  83. #define BYTE_ADVANCE(buffer,offset) (((char *) buffer) + (offset))
  84. #ifndef MIN
  85. #define MIN(a,b) ((a) < (b) ? (a) : (b))
  86. #endif // MIN
  87. #ifndef MAX
  88. #define MAX(a,b) ((a) > (b) ? (a) : (b))
  89. #endif // MAX
  90.  
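// Worked example (added for clarity): CALC_COLOR24(colFront, colBack, M, N) computes the
// per-channel weighted average (colFront * M + colBack * (N - M)) / N, i.e. M/N parts front
// color over (N - M)/N parts back color. For instance colFront = 200, colBack = 100,
// M = 1, N = 4 gives (200 * 1 + 100 * 3) / 4 = 125.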
  91.  
  92. enum BlendType
  93. {
  94.    BLEND_NONE = 0,
  95.    BLEND_NORMAL,   //a normal indication to blend
  96.    BLEND_DOMINANT, //a strong indication to blend
  97.    //attention: BlendType must fit into the value range of 2 bits!!!
  98. };
  99.  
  100.  
  101. typedef struct blendresult_s
  102. {
  103.    BlendType
  104.       /**/blend_f, blend_g,
  105.       /**/blend_j, blend_k;
  106. } blendresult_t;
  107.  
  108.  
  109. typedef struct kernel_3x3_s
  110. {
  111.    uint32_t
  112.       /**/a, b, c,
  113.       /**/d, e, f,
  114.       /**/g, h, i;
  115. } kernel_3x3_t;
  116.  
  117.  
  118. typedef struct kernel_4x4_s //kernel for preprocessing step
  119. {
  120.    uint32_t
  121.       /**/a, b, c, d,
  122.       /**/e, f, g, h,
  123.       /**/i, j, k, l,
  124.       /**/m, n, o, p;
  125. } kernel_4x4_t;
  126.  
  127.  
  128. typedef struct outmatrix_s
  129. {
  130.    size_t size;
  131.    uint32_t* ptr;
  132.    int stride;
  133.    int rotDeg; // either 0, 90, 180 or 270
  134. } outmatrix_t;
  135.  
  136.  
  137. static void outmatrix_create (outmatrix_t *mat, size_t size, uint32_t *ptr, int stride, int rotDeg) //access matrix area, top-left at position "out" for image with given width
  138. {
  139.    mat->size = size;
  140.    mat->ptr = ptr;
  141.    mat->stride = stride;
  142.    mat->rotDeg = rotDeg;
  143. }
  144.  
  145.  
  146. static uint32_t *outmatrix_ref (outmatrix_t *mat, size_t I, size_t J)
  147. {
  148.    size_t I_old;
  149.    size_t J_old;
  150.    // calculate input matrix coordinates after rotation: (i, j) = (row, col) indices, N = size of (square) matrix
  151.    if (mat->rotDeg == 270) { I_old = J;                 J_old = mat->size - 1 - I; }
  152.    else if (mat->rotDeg == 180) { I_old = mat->size - 1 - I; J_old = mat->size - 1 - J; }
  153.    else if (mat->rotDeg == 90) { I_old = mat->size - 1 - J; J_old = I; }
  154.    else { I_old = I;                 J_old = J; }
  155.  
  156.    return (mat->ptr + I_old * mat->stride + J_old);
  157. }
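
// Worked example (added for clarity): for a 3x3 output block (size = 3) with rotDeg = 90,
// outmatrix_ref(mat, 0, 0) resolves to I_old = 3 - 1 - 0 = 2, J_old = 0, i.e. the bottom-left
// cell of the stored block: the block is addressed as if rotated by 90 degrees while the
// underlying target memory stays unrotated, so the 0-degree blend functions can be reused.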
  158.  
  159.  
  160. typedef void (alphagrad_func) (uint32_t *pixBack, uint32_t pixFront, unsigned int M, unsigned int N);
  161. typedef double (dist_func) (uint32_t pix1, uint32_t pix2);
  162.  
  163.  
  164. typedef struct scaler_s
  165. {
  166.    int factor;
  167.    void (*blend_line_shallow) (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad);
  168.    void (*blend_line_steep) (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad);
  169.    void (*blend_line_steep_and_shallow) (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad);
  170.    void (*blend_line_diagonal) (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad);
  171.    void (*blend_corner) (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad);
  172. } scaler_t;
  173.  
  174.  
  175. /////////////////////////////////
  176. // shallow line scaling functions
  177.  
  178. static void blend_line_shallow_2x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  179. {
  180.    alphagrad (outmatrix_ref (out, 2 - 1, 0), col, 1, 4);
  181.    alphagrad (outmatrix_ref (out, 2 - 1, 1), col, 3, 4);
  182. }
  183. static void blend_line_shallow_3x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  184. {
  185.    alphagrad (outmatrix_ref (out, 3 - 1, 0), col, 1, 4);
  186.    alphagrad (outmatrix_ref (out, 3 - 2, 2), col, 1, 4);
  187.    alphagrad (outmatrix_ref (out, 3 - 1, 1), col, 3, 4);
  188.    *outmatrix_ref (out, 3 - 1, 2) = col;
  189. }
  190. static void blend_line_shallow_4x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  191. {
  192.    alphagrad (outmatrix_ref (out, 4 - 1, 0), col, 1, 4);
  193.    alphagrad (outmatrix_ref (out, 4 - 2, 2), col, 1, 4);
  194.    alphagrad (outmatrix_ref (out, 4 - 1, 1), col, 3, 4);
  195.    alphagrad (outmatrix_ref (out, 4 - 2, 3), col, 3, 4);
  196.    *outmatrix_ref (out, 4 - 1, 2) = col;
  197.    *outmatrix_ref (out, 4 - 1, 3) = col;
  198. }
  199. static void blend_line_shallow_5x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  200. {
  201.    alphagrad (outmatrix_ref (out, 5 - 1, 0), col, 1, 4);
  202.    alphagrad (outmatrix_ref (out, 5 - 2, 2), col, 1, 4);
  203.    alphagrad (outmatrix_ref (out, 5 - 3, 4), col, 1, 4);
  204.    alphagrad (outmatrix_ref (out, 5 - 1, 1), col, 3, 4);
  205.    alphagrad (outmatrix_ref (out, 5 - 2, 3), col, 3, 4);
  206.    *outmatrix_ref (out, 5 - 1, 2) = col;
  207.    *outmatrix_ref (out, 5 - 1, 3) = col;
  208.    *outmatrix_ref (out, 5 - 1, 4) = col;
  209.    *outmatrix_ref (out, 5 - 2, 4) = col;
  210. }
  211. static void blend_line_shallow_6x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  212. {
  213.    alphagrad (outmatrix_ref (out, 6 - 1, 0), col, 1, 4);
  214.    alphagrad (outmatrix_ref (out, 6 - 2, 2), col, 1, 4);
  215.    alphagrad (outmatrix_ref (out, 6 - 3, 4), col, 1, 4);
  216.    alphagrad (outmatrix_ref (out, 6 - 1, 1), col, 3, 4);
  217.    alphagrad (outmatrix_ref (out, 6 - 2, 3), col, 3, 4);
  218.    alphagrad (outmatrix_ref (out, 6 - 3, 5), col, 3, 4);
  219.    *outmatrix_ref (out, 6 - 1, 2) = col;
  220.    *outmatrix_ref (out, 6 - 1, 3) = col;
  221.    *outmatrix_ref (out, 6 - 1, 4) = col;
  222.    *outmatrix_ref (out, 6 - 1, 5) = col;
  223.    *outmatrix_ref (out, 6 - 2, 4) = col;
  224.    *outmatrix_ref (out, 6 - 2, 5) = col;
  225. }
  226.  
  227. ///////////////////////////////
  228. // steep line scaling functions
  229.  
  230. static void blend_line_steep_2x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  231. {
  232.    alphagrad (outmatrix_ref (out, 0, 2 - 1), col, 1, 4);
  233.    alphagrad (outmatrix_ref (out, 1, 2 - 1), col, 3, 4);
  234. }
  235. static void blend_line_steep_3x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  236. {
  237.    alphagrad (outmatrix_ref (out, 0, 3 - 1), col, 1, 4);
  238.    alphagrad (outmatrix_ref (out, 2, 3 - 2), col, 1, 4);
  239.    alphagrad (outmatrix_ref (out, 1, 3 - 1), col, 3, 4);
  240.    *outmatrix_ref (out, 2, 3 - 1) = col;
  241. }
  242. static void blend_line_steep_4x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  243. {
  244.    alphagrad (outmatrix_ref (out, 0, 4 - 1), col, 1, 4);
  245.    alphagrad (outmatrix_ref (out, 2, 4 - 2), col, 1, 4);
  246.    alphagrad (outmatrix_ref (out, 1, 4 - 1), col, 3, 4);
  247.    alphagrad (outmatrix_ref (out, 3, 4 - 2), col, 3, 4);
  248.    *outmatrix_ref (out, 2, 4 - 1) = col;
  249.    *outmatrix_ref (out, 3, 4 - 1) = col;
  250. }
  251. static void blend_line_steep_5x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  252. {
  253.    alphagrad (outmatrix_ref (out, 0, 5 - 1), col, 1, 4);
  254.    alphagrad (outmatrix_ref (out, 2, 5 - 2), col, 1, 4);
  255.    alphagrad (outmatrix_ref (out, 4, 5 - 3), col, 1, 4);
  256.    alphagrad (outmatrix_ref (out, 1, 5 - 1), col, 3, 4);
  257.    alphagrad (outmatrix_ref (out, 3, 5 - 2), col, 3, 4);
  258.    *outmatrix_ref (out, 2, 5 - 1) = col;
  259.    *outmatrix_ref (out, 3, 5 - 1) = col;
  260.    *outmatrix_ref (out, 4, 5 - 1) = col;
  261.    *outmatrix_ref (out, 4, 5 - 2) = col;
  262. }
  263. static void blend_line_steep_6x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  264. {
  265.    alphagrad (outmatrix_ref (out, 0, 6 - 1), col, 1, 4);
  266.    alphagrad (outmatrix_ref (out, 2, 6 - 2), col, 1, 4);
  267.    alphagrad (outmatrix_ref (out, 4, 6 - 3), col, 1, 4);
  268.    alphagrad (outmatrix_ref (out, 1, 6 - 1), col, 3, 4);
  269.    alphagrad (outmatrix_ref (out, 3, 6 - 2), col, 3, 4);
  270.    alphagrad (outmatrix_ref (out, 5, 6 - 3), col, 3, 4);
  271.    *outmatrix_ref (out, 2, 6 - 1) = col;
  272.    *outmatrix_ref (out, 3, 6 - 1) = col;
  273.    *outmatrix_ref (out, 4, 6 - 1) = col;
  274.    *outmatrix_ref (out, 5, 6 - 1) = col;
  275.    *outmatrix_ref (out, 4, 6 - 2) = col;
  276.    *outmatrix_ref (out, 5, 6 - 2) = col;
  277. }
  278.  
  279. ///////////////////////////////////////////
  280. // steep and shallow line scaling functions
  281.  
  282. static void blend_line_steep_and_shallow_2x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  283. {
  284.    alphagrad (outmatrix_ref (out, 1, 0), col, 1, 4);
  285.    alphagrad (outmatrix_ref (out, 0, 1), col, 1, 4);
  286.    alphagrad (outmatrix_ref (out, 1, 1), col, 5, 6); //[!] fixes 7/8 used in xBR
  287. }
  288. static void blend_line_steep_and_shallow_3x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  289. {
  290.    alphagrad (outmatrix_ref (out, 2, 0), col, 1, 4);
  291.    alphagrad (outmatrix_ref (out, 0, 2), col, 1, 4);
  292.    alphagrad (outmatrix_ref (out, 2, 1), col, 3, 4);
  293.    alphagrad (outmatrix_ref (out, 1, 2), col, 3, 4);
  294.    *outmatrix_ref (out, 2, 2) = col;
  295. }
  296. static void blend_line_steep_and_shallow_4x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  297. {
  298.    alphagrad (outmatrix_ref (out, 3, 1), col, 3, 4);
  299.    alphagrad (outmatrix_ref (out, 1, 3), col, 3, 4);
  300.    alphagrad (outmatrix_ref (out, 3, 0), col, 1, 4);
  301.    alphagrad (outmatrix_ref (out, 0, 3), col, 1, 4);
  302.    alphagrad (outmatrix_ref (out, 2, 2), col, 1, 3); //[!] fixes 1/4 used in xBR
  303.    *outmatrix_ref (out, 3, 3) = col;
  304.    *outmatrix_ref (out, 3, 2) = col;
  305.    *outmatrix_ref (out, 2, 3) = col;
  306. }
  307. static void blend_line_steep_and_shallow_5x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  308. {
  309.    alphagrad (outmatrix_ref (out, 0, 5 - 1), col, 1, 4);
  310.    alphagrad (outmatrix_ref (out, 2, 5 - 2), col, 1, 4);
  311.    alphagrad (outmatrix_ref (out, 1, 5 - 1), col, 3, 4);
  312.    alphagrad (outmatrix_ref (out, 5 - 1, 0), col, 1, 4);
  313.    alphagrad (outmatrix_ref (out, 5 - 2, 2), col, 1, 4);
  314.    alphagrad (outmatrix_ref (out, 5 - 1, 1), col, 3, 4);
  315.    alphagrad (outmatrix_ref (out, 3, 3), col, 2, 3);
  316.    *outmatrix_ref (out, 2, 5 - 1) = col;
  317.    *outmatrix_ref (out, 3, 5 - 1) = col;
  318.    *outmatrix_ref (out, 4, 5 - 1) = col;
  319.    *outmatrix_ref (out, 5 - 1, 2) = col;
  320.    *outmatrix_ref (out, 5 - 1, 3) = col;
  321. }
  322. static void blend_line_steep_and_shallow_6x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  323. {
  324.    alphagrad (outmatrix_ref (out, 0, 6 - 1), col, 1, 4);
  325.    alphagrad (outmatrix_ref (out, 2, 6 - 2), col, 1, 4);
  326.    alphagrad (outmatrix_ref (out, 1, 6 - 1), col, 3, 4);
  327.    alphagrad (outmatrix_ref (out, 3, 6 - 2), col, 3, 4);
  328.    alphagrad (outmatrix_ref (out, 6 - 1, 0), col, 1, 4);
  329.    alphagrad (outmatrix_ref (out, 6 - 2, 2), col, 1, 4);
  330.    alphagrad (outmatrix_ref (out, 6 - 1, 1), col, 3, 4);
  331.    alphagrad (outmatrix_ref (out, 6 - 2, 3), col, 3, 4);
  332.    *outmatrix_ref (out, 2, 6 - 1) = col;
  333.    *outmatrix_ref (out, 3, 6 - 1) = col;
  334.    *outmatrix_ref (out, 4, 6 - 1) = col;
  335.    *outmatrix_ref (out, 5, 6 - 1) = col;
  336.    *outmatrix_ref (out, 4, 6 - 2) = col;
  337.    *outmatrix_ref (out, 5, 6 - 2) = col;
  338.    *outmatrix_ref (out, 6 - 1, 2) = col;
  339.    *outmatrix_ref (out, 6 - 1, 3) = col;
  340. }
  341.  
  342. //////////////////////////////////
  343. // diagonal line scaling functions
  344.  
  345. static void blend_line_diagonal_2x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  346. {
  347.    alphagrad (outmatrix_ref (out, 1, 1), col, 1, 2);
  348. }
  349. static void blend_line_diagonal_3x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  350. {
  351.    alphagrad (outmatrix_ref (out, 1, 2), col, 1, 8); //conflict with other rotations for this odd scale
  352.    alphagrad (outmatrix_ref (out, 2, 1), col, 1, 8);
  353.    alphagrad (outmatrix_ref (out, 2, 2), col, 7, 8); //
  354. }
  355. static void blend_line_diagonal_4x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  356. {
  357.    alphagrad (outmatrix_ref (out, 4 - 1, 4 / 2), col, 1, 2);
  358.    alphagrad (outmatrix_ref (out, 4 - 2, 4 / 2 + 1), col, 1, 2);
  359.    *outmatrix_ref (out, 4 - 1, 4 - 1) = col;
  360. }
  361. static void blend_line_diagonal_5x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  362. {
  363.    alphagrad (outmatrix_ref (out, 5 - 1, 5 / 2 + 0), col, 1, 8); //conflict with other rotations for this odd scale
  364.    alphagrad (outmatrix_ref (out, 5 - 2, 5 / 2 + 1), col, 1, 8);
  365.    alphagrad (outmatrix_ref (out, 5 - 3, 5 / 2 + 2), col, 1, 8); //
  366.    alphagrad (outmatrix_ref (out, 4, 3), col, 7, 8);
  367.    alphagrad (outmatrix_ref (out, 3, 4), col, 7, 8);
  368.    *outmatrix_ref (out, 4, 4) = col;
  369. }
  370. static void blend_line_diagonal_6x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  371. {
  372.    alphagrad (outmatrix_ref (out, 6 - 1, 6 / 2 + 0), col, 1, 2);
  373.    alphagrad (outmatrix_ref (out, 6 - 2, 6 / 2 + 1), col, 1, 2);
  374.    alphagrad (outmatrix_ref (out, 6 - 3, 6 / 2 + 2), col, 1, 2);
  375.    *outmatrix_ref (out, 6 - 2, 6 - 1) = col;
  376.    *outmatrix_ref (out, 6 - 1, 6 - 1) = col;
  377.    *outmatrix_ref (out, 6 - 1, 6 - 2) = col;
  378. }
  379.  
  380. ///////////////////////////
  381. // corner scaling functions
  382.  
  383. static void blend_corner_2x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  384. {
  385.    //model a round corner
  386.    alphagrad (outmatrix_ref (out, 1, 1), col, 21, 100); //exact: 1 - pi/4 = 0.2146018366
  387. }
  388. static void blend_corner_3x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  389. {
  390.    //model a round corner
  391.    alphagrad (outmatrix_ref (out, 2, 2), col, 45, 100); //exact: 0.4545939598
  392.    //alphagrad (outmatrix_ref (out, 2, 1), col, 7, 256); //0.02826017254 -> negligible + avoid conflicts with other rotations for this odd scale
  393.    //alphagrad (outmatrix_ref (out, 1, 2), col, 7, 256); //0.02826017254
  394. }
  395. static void blend_corner_4x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  396. {
  397.    //model a round corner
  398.    alphagrad (outmatrix_ref (out, 3, 3), col, 68, 100); //exact: 0.6848532563
  399.    alphagrad (outmatrix_ref (out, 3, 2), col, 9, 100); //0.08677704501
  400.    alphagrad (outmatrix_ref (out, 2, 3), col, 9, 100); //0.08677704501
  401. }
  402. static void blend_corner_5x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  403. {
  404.    // model a round corner
  405.    alphagrad (outmatrix_ref (out, 4, 4), col, 86, 100); //exact: 0.8631434088
  406.    alphagrad (outmatrix_ref (out, 4, 3), col, 23, 100); //0.2306749731
  407.    alphagrad (outmatrix_ref (out, 3, 4), col, 23, 100); //0.2306749731
  408.    //alphagrad (outmatrix_ref (out, 4, 2), col, 1, 64); //0.01676812367 -> negligible + avoid conflicts with other rotations for this odd scale
  409.    //alphagrad (outmatrix_ref (out, 2, 4), col, 1, 64); //0.01676812367
  410. }
  411. static void blend_corner_6x (uint32_t col, outmatrix_t *out, alphagrad_func alphagrad)
  412. {
  413.    //model a round corner
  414.    alphagrad (outmatrix_ref (out, 5, 5), col, 97, 100); //exact: 0.9711013910
  415.    alphagrad (outmatrix_ref (out, 4, 5), col, 42, 100); //0.4236372243
  416.    alphagrad (outmatrix_ref (out, 5, 4), col, 42, 100); //0.4236372243
  417.    alphagrad (outmatrix_ref (out, 5, 3), col, 6, 100); //0.05652034508
  418.    alphagrad (outmatrix_ref (out, 3, 5), col, 6, 100); //0.05652034508
  419. }
  420.  
  421. /////////////////////////////////////
  422. // scaler objects for various factors
  423.  
  424. static const scaler_t scalers[] =
  425. {
  426.    { 2, blend_line_shallow_2x, blend_line_steep_2x, blend_line_steep_and_shallow_2x, blend_line_diagonal_2x, blend_corner_2x },
  427.    { 3, blend_line_shallow_3x, blend_line_steep_3x, blend_line_steep_and_shallow_3x, blend_line_diagonal_3x, blend_corner_3x },
  428.    { 4, blend_line_shallow_4x, blend_line_steep_4x, blend_line_steep_and_shallow_4x, blend_line_diagonal_4x, blend_corner_4x },
  429.    { 5, blend_line_shallow_5x, blend_line_steep_5x, blend_line_steep_and_shallow_5x, blend_line_diagonal_5x, blend_corner_5x },
  430.    { 6, blend_line_shallow_6x, blend_line_steep_6x, blend_line_steep_and_shallow_6x, blend_line_diagonal_6x, blend_corner_6x },
  431. };
  432.  
  433.  
  434. static FORCE_INLINE void preProcessCorners (blendresult_t *result, const kernel_4x4_t *ker, dist_func dist)
  435. {
  436.    // detect blend direction
  437.    // result: F, G, J, K corners of "GradientType"
  438.  
  439.    // input kernel area naming convention:
  440.    // -----------------
  441.    // | A | B | C | D |
  442.    // ----|---|---|---|
  443.    // | E | F | G | H |   //evaluate the four corners between F, G, J, K
  444.    // ----|---|---|---|   //input pixel is at position F
  445.    // | I | J | K | L |
  446.    // ----|---|---|---|
  447.    // | M | N | O | P |
  448.    // -----------------
  449.  
  450.    memset (result, 0, sizeof (blendresult_t));
  451.  
  452.    if (((ker->f == ker->g) && (ker->j == ker->k)) || ((ker->f == ker->j) && (ker->g == ker->k)))
  453.       return;
  454.  
  455.    const int weight = 4;
  456.    double jg = dist (ker->i, ker->f) + dist (ker->f, ker->c) + dist (ker->n, ker->k) + dist (ker->k, ker->h) + weight * dist (ker->j, ker->g);
  457.    double fk = dist (ker->e, ker->j) + dist (ker->j, ker->o) + dist (ker->b, ker->g) + dist (ker->g, ker->l) + weight * dist (ker->f, ker->k);
  458.  
  459.    if (jg < fk) //test sample: 70% of values max(jg, fk) / min(jg, fk) are between 1.1 and 3.7 with median being 1.8
  460.    {
  461.       const bool dominantGradient = XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD * jg < fk;
  462.       if (ker->f != ker->g && ker->f != ker->j)
  463.          result->blend_f = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  464.  
  465.       if (ker->k != ker->j && ker->k != ker->g)
  466.          result->blend_k = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  467.    }
  468.    else if (fk < jg)
  469.    {
  470.       const bool dominantGradient = XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD * fk < jg;
  471.       if (ker->j != ker->f && ker->j != ker->k)
  472.          result->blend_j = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  473.  
  474.       if (ker->g != ker->f && ker->g != ker->k)
  475.          result->blend_g = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  476.    }
  477.    return;
  478. }
  479.  
  480. // compress four blend types into a single byte
  481. #define getTopL(b)    ((BlendType) (0x3 & ((unsigned char) (b) >> 0)))
  482. #define getTopR(b)    ((BlendType) (0x3 & ((unsigned char) (b) >> 2)))
  483. #define getBottomR(b) ((BlendType) (0x3 & ((unsigned char) (b) >> 4)))
  484. #define getBottomL(b) ((BlendType) (0x3 & ((unsigned char) (b) >> 6)))
  485.  
  486. static inline void setTopL (unsigned char& b, BlendType bt) { b |= (((BlendType) (bt)) << 0); } //buffer is assumed to be initialized before preprocessing!
  487. static inline void setTopR (unsigned char& b, BlendType bt) { b |= (((BlendType) (bt)) << 2); }
  488. static inline void setBottomR (unsigned char& b, BlendType bt) { b |= (((BlendType) (bt)) << 4); }
  489. static inline void setBottomL (unsigned char& b, BlendType bt) { b |= (((BlendType) (bt)) << 6); }
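
// Worked example (added for clarity): the four 2-bit BlendType values share one byte:
// topL in bits 0-1, topR in bits 2-3, bottomR in bits 4-5, bottomL in bits 6-7.
// Starting from b = 0, setTopR(b, BLEND_DOMINANT) yields b = 2 << 2 = 0x08, and
// getTopR(b) recovers 0x3 & (0x08 >> 2) = 2 = BLEND_DOMINANT.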
  490.  
  491.  
  492. FORCE_INLINE void blend_pixel (const scaler_t *scaler, const kernel_3x3_t *ker, uint32_t *target, int trgWidth, unsigned char blendInfo, alphagrad_func alphagrad, dist_func dist, int rotDeg) //result of preprocessing all four corners of pixel "e"
  493. {
  494.    // input kernel area naming convention:
  495.    // -------------
  496.    // | A | B | C |
  497.    // ----|---|---|
  498.    // | D | E | F | //input pixel is at position E
  499.    // ----|---|---|
  500.    // | G | H | I |
  501.    // -------------
  502.  
  503.    uint32_t
  504.       a, b, c,
  505.       d, e, f,
  506.       g, h, i;
  507.    unsigned char blend;
  508.  
  509.    if      (rotDeg == 270) { a = ker->c; b = ker->f; c = ker->i; d = ker->b; e = ker->e; f = ker->h; g = ker->a; h = ker->d; i = ker->g; blend = ((blendInfo << 6) | (blendInfo >> 2)) & 0xff; }
  510.    else if (rotDeg == 180) { a = ker->i; b = ker->h; c = ker->g; d = ker->f; e = ker->e; f = ker->d; g = ker->c; h = ker->b; i = ker->a; blend = ((blendInfo << 4) | (blendInfo >> 4)) & 0xff; }
  511.    else if (rotDeg == 90)  { a = ker->g; b = ker->d; c = ker->a; d = ker->h; e = ker->e; f = ker->b; g = ker->i; h = ker->f; i = ker->c; blend = ((blendInfo << 2) | (blendInfo >> 6)) & 0xff; }
  512.    else                    { a = ker->a; b = ker->b; c = ker->c; d = ker->d; e = ker->e; f = ker->f; g = ker->g; h = ker->h; i = ker->i; blend = ((blendInfo << 0) | (blendInfo >> 8)) & 0xff; }
  513.  
  514.    if (getBottomR (blend) >= BLEND_NORMAL)
  515.    {
  516.       outmatrix_t out;
  517.       uint32_t px;
  518.       bool doLineBlend;
  519.  
  520.       if (getBottomR (blend) >= BLEND_DOMINANT)
  521.          doLineBlend = true;
  522.       else if (getTopR (blend) != BLEND_NONE && (dist (e, g) >= XBRZ_CFG_EQUAL_COLOR_TOLERANCE)) //but support double-blending for 90° corners
  523.          doLineBlend = false; // make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
  524.       else if (getBottomL (blend) != BLEND_NONE && (dist (e, c) >= XBRZ_CFG_EQUAL_COLOR_TOLERANCE))
  525.          doLineBlend = false; // make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
  526.       else if ((dist (e, i) >= XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
  527.          && (dist (g, h) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
  528.          && (dist (h, i) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
  529.          && (dist (i, f) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
  530.          && (dist (f, c) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE))
  531.          doLineBlend = false; // no full blending for L-shapes; blend corner only (handles "mario mushroom eyes")
  532.       else
  533.          doLineBlend = true;
  534.  
  535.       outmatrix_create (&out, scaler->factor, target, trgWidth, rotDeg);
  536.       px = (dist (e, f) <= dist (e, h) ? f : h); //choose most similar color
  537.  
  538.       if (doLineBlend)
  539.       {
  540.          const double fg = dist (f, g); //test sample: 70% of values max(fg, hc) / min(fg, hc) are between 1.1 and 3.7 with median being 1.9
  541.          const double hc = dist (h, c); //
  542.          const bool haveShallowLine = (XBRZ_CFG_STEEP_DIRECTION_THRESHOLD * fg <= hc) && (e != g) && (d != g);
  543.          const bool haveSteepLine   = (XBRZ_CFG_STEEP_DIRECTION_THRESHOLD * hc <= fg) && (e != c) && (b != c);
  544.  
  545.          if (haveShallowLine)
  546.          {
  547.             if (haveSteepLine)
  548.                scaler->blend_line_steep_and_shallow (px, &out, alphagrad);
  549.             else
  550.                scaler->blend_line_shallow (px, &out, alphagrad);
  551.          }
  552.          else
  553.          {
  554.             if (haveSteepLine)
  555.                scaler->blend_line_steep (px, &out, alphagrad);
  556.             else
  557.                scaler->blend_line_diagonal (px, &out, alphagrad);
  558.          }
  559.       }
  560.       else
  561.          scaler->blend_corner (px, &out, alphagrad);
  562.    }
  563. }
  564.  
  565.  
  566. void scale_image (const scaler_t *scaler, const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight, int yFirst, int yLast, alphagrad_func alphagrad, dist_func dist)
  567. {
  568.    yFirst = MAX (yFirst, 0);
  569.    yLast = MIN (yLast, srcHeight);
  570.    if (yFirst >= yLast || srcWidth <= 0)
  571.       return;
  572.  
  573.    const int trgWidth = srcWidth * scaler->factor;
  574.  
  575.    //"use" space at the end of the target image as a temporary buffer for "on the fly" preprocessing: we could even use a larger area of
  576.    //"sizeof(uint32_t) * srcWidth * (yLast - yFirst)" bytes without risk of accidentally overwriting it before it is read
  577.    const int bufferSize = srcWidth;
  578.    unsigned char *preProcBuffer = (unsigned char *) (trg + yLast * scaler->factor * trgWidth) - bufferSize;
  579.    memset (preProcBuffer, 0, bufferSize);
  580.    static_assert(BLEND_NONE == 0, "");
  581.  
  582.    //initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending
  583.    //this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition!
  584.    if (yFirst > 0)
  585.    {
  586.       const int y = yFirst - 1;
  587.  
  588.       const uint32_t* s_m1 = src + srcWidth * MAX (y - 1, 0);
  589.       const uint32_t* s_0 = src + srcWidth * y; //center line
  590.       const uint32_t* s_p1 = src + srcWidth * MIN (y + 1, srcHeight - 1);
  591.       const uint32_t* s_p2 = src + srcWidth * MIN (y + 2, srcHeight - 1);
  592.  
  593.       for (int x = 0; x < srcWidth; ++x)
  594.       {
  595.          blendresult_t res;
  596.          const int x_m1 = MAX (x - 1, 0);
  597.          const int x_p1 = MIN (x + 1, srcWidth - 1);
  598.          const int x_p2 = MIN (x + 2, srcWidth - 1);
  599.  
  600.          kernel_4x4_t ker; //perf: initialization is negligible
  601.          ker.a = s_m1[x_m1]; //read sequentially from memory as far as possible
  602.          ker.b = s_m1[x];
  603.          ker.c = s_m1[x_p1];
  604.          ker.d = s_m1[x_p2];
  605.  
  606.          ker.e = s_0[x_m1];
  607.          ker.f = s_0[x];
  608.          ker.g = s_0[x_p1];
  609.          ker.h = s_0[x_p2];
  610.  
  611.          ker.i = s_p1[x_m1];
  612.          ker.j = s_p1[x];
  613.          ker.k = s_p1[x_p1];
  614.          ker.l = s_p1[x_p2];
  615.  
  616.          ker.m = s_p2[x_m1];
  617.          ker.n = s_p2[x];
  618.          ker.o = s_p2[x_p1];
  619.          ker.p = s_p2[x_p2];
  620.  
  621.          preProcessCorners (&res, &ker, dist);
  622.          /*
  623.          preprocessing blend result:
  624.          ---------
  625.          | F | G |   //evaluate the corner between F, G, J, K
  626.          ----|---|   //input pixel is at position F
  627.          | J | K |
  628.          ---------
  629.          */
  630.          setTopR (preProcBuffer[x], res.blend_j);
  631.  
  632.          if (x + 1 < bufferSize)
  633.             setTopL (preProcBuffer[x + 1], res.blend_k);
  634.       }
  635.    }
  636.    //------------------------------------------------------------------------------------
  637.  
  638.    for (int y = yFirst; y < yLast; ++y)
  639.    {
  640.       uint32_t *out = trg + scaler->factor * y * trgWidth; //consider MT "striped" access
  641.  
  642.       const uint32_t* s_m1 = src + srcWidth * MAX (y - 1, 0);
  643.       const uint32_t* s_0 = src + srcWidth * y; //center line
  644.       const uint32_t* s_p1 = src + srcWidth * MIN (y + 1, srcHeight - 1);
  645.       const uint32_t* s_p2 = src + srcWidth * MIN (y + 2, srcHeight - 1);
  646.  
  647.       unsigned char blend_xy1 = 0; //corner blending for current (x, y + 1) position
  648.  
  649.       for (int x = 0; x < srcWidth; ++x, out += scaler->factor)
  650.       {
  651.          //all these bounds checks have only an insignificant impact on performance!
  652.          const int x_m1 = MAX (x - 1, 0); //perf: prefer array indexing to additional pointers!
  653.          const int x_p1 = MIN (x + 1, srcWidth - 1);
  654.          const int x_p2 = MIN (x + 2, srcWidth - 1);
  655.          kernel_4x4_t ker4; //perf: initialization is negligible
  656.  
  657.          ker4.a = s_m1[x_m1]; //read sequentially from memory as far as possible
  658.          ker4.b = s_m1[x];
  659.          ker4.c = s_m1[x_p1];
  660.          ker4.d = s_m1[x_p2];
  661.  
  662.          ker4.e = s_0[x_m1];
  663.          ker4.f = s_0[x];
  664.          ker4.g = s_0[x_p1];
  665.          ker4.h = s_0[x_p2];
  666.  
  667.          ker4.i = s_p1[x_m1];
  668.          ker4.j = s_p1[x];
  669.          ker4.k = s_p1[x_p1];
  670.          ker4.l = s_p1[x_p2];
  671.  
  672.          ker4.m = s_p2[x_m1];
  673.          ker4.n = s_p2[x];
  674.          ker4.o = s_p2[x_p1];
  675.          ker4.p = s_p2[x_p2];
  676.  
  677.          //evaluate the four corners on bottom-right of current pixel
  678.          unsigned char blend_xy = 0; //for current (x, y) position
  679.          {
  680.             blendresult_t res;
  681.             preProcessCorners (&res, &ker4, dist);
  682.             /*
  683.             preprocessing blend result:
  684.             ---------
  685.             | F | G |   //evaluate the corner between F, G, J, K
  686.             ----|---|   //current input pixel is at position F
  687.             | J | K |
  688.             ---------
  689.             */
  690.             blend_xy = preProcBuffer[x];
  691.             setBottomR (blend_xy, res.blend_f); //all four corners of (x, y) have been determined at this point due to processing sequence!
  692.  
  693.             setTopR (blend_xy1, res.blend_j); //set 2nd known corner for (x, y + 1)
  694.             preProcBuffer[x] = blend_xy1; //store on current buffer position for use on next row
  695.  
  696.             blend_xy1 = 0;
  697.             setTopL (blend_xy1, res.blend_k); //set 1st known corner for (x + 1, y + 1) and buffer for use on next column
  698.  
  699.             if (x + 1 < bufferSize) //set 3rd known corner for (x + 1, y)
  700.                setBottomL (preProcBuffer[x + 1], res.blend_g);
  701.          }
  702.  
  703.          //fill block of size scale * scale with the given color
  704.          {
  705.             uint32_t *blk = out;
  706.             for (int _blk_y = 0; _blk_y < scaler->factor; ++_blk_y, blk = (uint32_t *) BYTE_ADVANCE (blk, trgWidth * sizeof (uint32_t)))
  707.                for (int _blk_x = 0; _blk_x < scaler->factor; ++_blk_x)
  708.                   blk[_blk_x] = ker4.f;
  709.          }
  710.          //placed *after* the preprocessing step, so as not to overwrite its results while processing the last pixel!
  711.  
  712.          //blend four corners of current pixel
  713.          if (blend_xy != 0) //good 5% perf-improvement
  714.          {
  715.             kernel_3x3_t ker3; //perf: initialization is negligible
  716.  
  717.             ker3.a = ker4.a;
  718.             ker3.b = ker4.b;
  719.             ker3.c = ker4.c;
  720.  
  721.             ker3.d = ker4.e;
  722.             ker3.e = ker4.f;
  723.             ker3.f = ker4.g;
  724.  
  725.             ker3.g = ker4.i;
  726.             ker3.h = ker4.j;
  727.             ker3.i = ker4.k;
  728.  
  729.             blend_pixel (scaler, &ker3, out, trgWidth, blend_xy, alphagrad, dist, 0);
  730.             blend_pixel (scaler, &ker3, out, trgWidth, blend_xy, alphagrad, dist, 90);
  731.             blend_pixel (scaler, &ker3, out, trgWidth, blend_xy, alphagrad, dist, 180);
  732.             blend_pixel (scaler, &ker3, out, trgWidth, blend_xy, alphagrad, dist, 270);
  733.          }
  734.       }
  735.    }
  736. }
  737.  
  738.  
  739. static double dist24 (uint32_t pix1, uint32_t pix2)
  740. {
  741.    //30% perf boost compared to plain distYCbCr()!
  742.    //consumes 64 MB memory; using double is only 2% faster, but takes 128 MB
  743.    static float diffToDist[256 * 256 * 256];
  744.    static bool is_initialized = false;
  745.    if (!is_initialized)
  746.    {
  747.       for (uint32_t i = 0; i < 256 * 256 * 256; ++i) //startup time: 114 ms on Intel Core i5 (four cores)
  748.       {
  749.          const int r_diff = GET_RED (i) * 2 - 0xFF;
  750.          const int g_diff = GET_GREEN (i) * 2 - 0xFF;
  751.          const int b_diff = GET_BLUE (i) * 2 - 0xFF;
  752.  
  753.          const double k_b = 0.0593; //ITU-R BT.2020 conversion
  754.          const double k_r = 0.2627; //
  755.          const double k_g = 1 - k_b - k_r;
  756.  
  757.          const double scale_b = 0.5 / (1 - k_b);
  758.          const double scale_r = 0.5 / (1 - k_r);
  759.  
  760.          const double y = k_r * r_diff + k_g * g_diff + k_b * b_diff; //[!] analogous to YCbCr!
  761.          const double c_b = scale_b * (b_diff - y);
  762.          const double c_r = scale_r * (r_diff - y);
  763.  
  764.          diffToDist[i] = (float) (sqrt ((y * y) + (c_b * c_b) + (c_r * c_r)));
  765.       }
  766.       is_initialized = true;
  767.    }
  768.  
  769.    const int r_diff = (int) GET_RED (pix1) - (int) GET_RED (pix2);
  770.    const int g_diff = (int) GET_GREEN (pix1) - (int) GET_GREEN (pix2);
  771.    const int b_diff = (int) GET_BLUE (pix1) - (int) GET_BLUE (pix2);
  772.  
  773.    return diffToDist[(((r_diff + 0xFF) / 2) << 16) | //slightly reduce precision (division by 2) to squeeze value into single byte
  774.       (((g_diff + 0xFF) / 2) << 8) |
  775.       (((b_diff + 0xFF) / 2) << 0)];
  776. }
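
// Worked example (added for clarity): the table index packs the three channel differences,
// shifted into [0, 510] and halved, into 24 bits. E.g. pix1 = 0x00FF0000 (pure red) and
// pix2 = 0x00000000 (black) give r_diff = 255, g_diff = b_diff = 0, hence the index
// (255 << 16) | (127 << 8) | 127; the halving drops the least significant bit of each
// difference, as noted in the comment above.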
  777.  
  778.  
  779. static double dist32 (uint32_t pix1, uint32_t pix2)
  780. {
  781.    const double a1 = GET_ALPHA (pix1) / 255.0;
  782.    const double a2 = GET_ALPHA (pix2) / 255.0;
  783.    /*
  784.    Requirements for a color distance handling alpha channel: with a1, a2 in [0, 1]
  785.  
  786.        1. if a1 = a2, distance should be: a1 * distYCbCr()
  787.        2. if a1 = 0,  distance should be: a2 * distYCbCr(black, white) = a2 * 255
  788.        3. if a1 = 1,  ??? maybe: 255 * (1 - a2) + a2 * distYCbCr()
  789.    */
  790.  
  791.    //return MIN (a1, a2) * distYCbCrBuffered(pix1, pix2) + 255 * abs(a1 - a2);
  792.    //=> following code is 15% faster:
  793.    const double d = dist24 (pix1, pix2);
  794.    return (a1 < a2 ? a1 * d + 255 * (a2 - a1) : a2 * d + 255 * (a1 - a2));
  795. }
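
// Sanity check (added for clarity): for a1 = 0 the expression reduces to 0 * d + 255 * a2 = 255 * a2,
// matching requirement 2 above; for a1 = a2 it reduces to a1 * d, matching requirement 1.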
  796.  
  797.  
  798. static void alphagrad24 (uint32_t *pixBack, uint32_t pixFront, unsigned int M, unsigned int N)
  799. {
  800.    // blend front color with opacity M / N over opaque background: http://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending
  801.    *pixBack = ((CALC_COLOR24 (GET_RED (pixFront), GET_RED (*pixBack), M, N) << 16)
  802.       | (CALC_COLOR24 (GET_GREEN (pixFront), GET_GREEN (*pixBack), M, N) << 8)
  803.       | (CALC_COLOR24 (GET_BLUE (pixFront), GET_BLUE (*pixBack), M, N) << 0));
  804. }
  805.  
  806.  
  807. static void alphagrad32 (uint32_t *pixBack, uint32_t pixFront, unsigned int M, unsigned int N)
  808. {
  809.    // find intermediate color between two colors with alpha channels (=> NO alpha blending!!!)
  810.    const unsigned int weightFront = GET_ALPHA (pixFront) * M;
  811.    const unsigned int weightBack = GET_ALPHA (*pixBack) * (N - M);
  812.    const unsigned int weightSum = weightFront + weightBack;
  813.    *pixBack = (weightSum == 0 ? 0 :
  814.       (((unsigned char) (weightSum / N)) << 24)
  815.       | (CALC_COLOR32 (GET_RED (pixFront), GET_RED (*pixBack), weightFront, weightBack, weightSum) << 16)
  816.       | (CALC_COLOR32 (GET_GREEN (pixFront), GET_GREEN (*pixBack), weightFront, weightBack, weightSum) << 8)
  817.       | (CALC_COLOR32 (GET_BLUE (pixFront), GET_BLUE (*pixBack), weightFront, weightBack, weightSum) << 0));
  818. }
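
// Worked example (added for clarity): with M = 1, N = 4 and both pixels fully opaque (alpha 255),
// weightFront = 255, weightBack = 765, weightSum = 1020, so the result alpha is 1020 / 4 = 255 and
// each channel becomes (front + 3 * back) / 4, i.e. the same 1:3 mix alphagrad24() would produce.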
  819.  
  820.  
  821. EXTERN_C void nearestNeighborScale (const uint32_t *src, int srcWidth, int srcHeight, uint32_t *trg, int trgWidth, int trgHeight)
  822. {
  823.    //    nearestNeighborScale (src, srcWidth, srcHeight, srcWidth * sizeof (uint32_t), trg, trgWidth, trgHeight, trgWidth * sizeof (uint32_t), XBRZ_SLICETYPE_TARGET, 0, trgHeight, [](uint32_t pix) { return pix; });
  824.        //static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*");
  825.        //static_assert(std::is_integral<PixTrg>::value, "PixTrg* is expected to be cast-able to char*");
  826.        //static_assert(std::is_same<decltype(pixCvrt(PixSrc())), PixTrg>::value, "PixConverter returning wrong pixel format");
  827.  
  828.    int srcPitch = srcWidth * sizeof (uint32_t);
  829.    int trgPitch = trgWidth * sizeof (uint32_t);
  830.    int yFirst;
  831.    int yLast;
  832.  
  833. #if 0 // going over source image - fast for upscaling, since source is read only once
  834.    yFirst = 0;
  835.    yLast = MIN (trgHeight, srcHeight);
  836.  
  837.    if (yFirst >= yLast || trgWidth <= 0 || trgHeight <= 0)
  838.       return; // consistency check
  839.  
  840.    for (int y = yFirst; y < yLast; ++y)
  841.    {
  842.       //mathematically: ySrc = floor(srcHeight * yTrg / trgHeight)
  843.       // => search for integers in: [ySrc, ySrc + 1) * trgHeight / srcHeight
  844.  
  845.       //keep within for loop to support MT input slices!
  846.       const int yTrg_first = (y      * trgHeight + srcHeight - 1) / srcHeight; //=ceil(y * trgHeight / srcHeight)
  847.       const int yTrg_last = ((y + 1) * trgHeight + srcHeight - 1) / srcHeight; //=ceil(((y + 1) * trgHeight) / srcHeight)
  848.       const int blockHeight = yTrg_last - yTrg_first;
  849.  
  850.       if (blockHeight > 0)
  851.       {
  852.          const uint32_t *srcLine = (const uint32_t *) BYTE_ADVANCE (src, y * srcPitch);
  853.          /**/  uint32_t *trgLine = (uint32_t *) BYTE_ADVANCE (trg, yTrg_first * trgPitch);
  854.          int xTrg_first = 0;
  855.  
  856.          for (int x = 0; x < srcWidth; ++x)
  857.          {
  858.             const int xTrg_last = ((x + 1) * trgWidth + srcWidth - 1) / srcWidth;
  859.             const int blockWidth = xTrg_last - xTrg_first;
  860.             if (blockWidth > 0)
  861.             {
  862.                const uint32_t trgColor = srcLine[x];
  863.                uint32_t *blkLine = trgLine;
  864.  
  865.                xTrg_first = xTrg_last;
  866.  
  867.                for (int blk_y = 0; blk_y < blockHeight; ++blk_y, blkLine = (uint32_t *) BYTE_ADVANCE (blkLine, trgPitch))
  868.                   for (int blk_x = 0; blk_x < blockWidth; ++blk_x)
  869.                      blkLine[blk_x] = trgColor;
  870.  
  871.                trgLine += blockWidth;
  872.             }
  873.          }
  874.       }
  875.    }
  876. #else // going over target image - slow for upscaling, since the source is read multiple times and misses the cache! Fast for similar image sizes!
  877.    yFirst = 0;
  878.    yLast = trgHeight;
  879.  
  880.    if (yFirst >= yLast || srcHeight <= 0 || srcWidth <= 0)
  881.       return; // consistency check
  882.  
  883.    for (int y = yFirst; y < yLast; ++y)
  884.    {
  885.       /**/  uint32_t *trgLine = (uint32_t *) BYTE_ADVANCE (trg, y * trgPitch);
  886.       const int ySrc = srcHeight * y / trgHeight;
  887.       const uint32_t *srcLine = (const uint32_t *) BYTE_ADVANCE (src, ySrc * srcPitch);
  888.       for (int x = 0; x < trgWidth; ++x)
  889.       {
  890.          const int xSrc = srcWidth * x / trgWidth;
  891.          trgLine[x] = srcLine[xSrc];
  892.       }
  893.    }
  894. #endif // going over source or target
  895.  
  896.    return;
  897. }
  898.  
  899.  
  900. EXTERN_C bool xbrz_equalcolortest24 (uint32_t col1, uint32_t col2, double luminanceWeight, double equalColorTolerance)
  901. {
  902.    return (dist24 (col1, col2) < equalColorTolerance);
  903. }
  904.  
  905.  
  906. EXTERN_C bool xbrz_equalcolortest32 (uint32_t col1, uint32_t col2, double luminanceWeight, double equalColorTolerance)
  907. {
  908.    return (dist32 (col1, col2) < equalColorTolerance);
  909. }
  910.  
  911.  
  912. EXTERN_C void xbrz_scale24 (size_t factor, const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight)
  913. {
  914.    if (factor >= 2 && factor <= 6) //factor - 2 indexes scalers[]: reject unsupported factors instead of reading out of bounds
  915.       scale_image (&scalers[factor - 2], src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24);
  916. }
  917.  
  918.  
  919. EXTERN_C void xbrz_scale32 (size_t factor, const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight)
  920. {
  921.    if (factor >= 2 && factor <= 6) //factor - 2 indexes scalers[]: reject unsupported factors instead of reading out of bounds
  922.       scale_image (&scalers[factor - 2], src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32);
  923. }
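
// ---------------------------------------------------------------------------
// Illustrative usage sketch (added for this write-up, not part of the original
// file): scaling a complete 32-bit ARGB image by a factor of 2 with the
// exported API above. The caller owns both buffers; the target must hold
// (factor * srcWidth) * (factor * srcHeight) pixels.
#if 0
#include <stdlib.h>

static uint32_t *scale2x_argb (const uint32_t *src, int srcWidth, int srcHeight)
{
   const size_t factor = 2;
   uint32_t *trg = (uint32_t *) malloc (factor * srcWidth * factor * srcHeight * sizeof (uint32_t));

   if (trg != NULL)
      xbrz_scale32 (factor, src, trg, srcWidth, srcHeight); // alpha-aware variant; use xbrz_scale24() for RGB-only data
   return trg; // caller is responsible for free()
}
#endif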
  924.