Subversion Repositories Games.Rick Dangerous

Rev

Rev 7 | Blame | Compare with Previous | Last modification | View Log | Download | RSS feed

  1. // -------------------------------------------------------------------------
  2. // | xBRZ: "Scale by rules" - high quality image upscaling filter by Zenju |
  3. // -------------------------------------------------------------------------
  4. // using a modified approach of xBR:
  5. // http://board.byuu.org/viewtopic.php?f=10&t=2248
  6. //  - new rule set preserving small image features
  7. //  - highly optimized for performance
  8. //  - support alpha channel
  9. //  - support multithreading
  10. //  - support 64-bit architectures
  11. //  - support processing image slices
  12. //  - support scaling up to 6xBRZ
  13.  
  14. // -> map source (srcWidth * srcHeight) to target (scale * width x scale * height) image, optionally processing a half-open slice of rows [yFirst, yLast) only
  15. // -> support for source/target pitch in bytes!
  16. // -> if your emulator changes only a few image slices during each cycle (e.g. DOSBox) then there's no need to run xBRZ on the complete image:
  17. //    Just make sure you enlarge the source image slice by 2 rows on top and 2 on bottom (this is the additional range the xBRZ algorithm is using during analysis)
  18. //    CAVEAT: If there are multiple changed slices, make sure they do not overlap after adding these additional rows in order to avoid a memory race condition
  19. //    in the target image data if you are using multiple threads for processing each enlarged slice!
  20. //
  21. // THREAD-SAFETY: - parts of the same image may be scaled by multiple threads as long as the [yFirst, yLast) ranges do not overlap!
  22. //                - there is a minor inefficiency for the first row of a slice, so avoid processing single rows only; suggestion: process at least 8-16 rows
  23.  
  24.  
  25. #include <stddef.h> // for size_t
  26. #include <stdint.h> // for uint32_t
  27. #include <stdbool.h> // for bool
  28. #include <memory.h> // for memset()
  29. #include <limits.h>
  30. #include <math.h>
  31.  
  32.  
  33. // prototypes of exported functions
  34. void xbrz_scale (size_t factor, const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight, bool has_alpha_channel);
  35. void nearest_neighbor_scale (const uint32_t *src, int srcWidth, int srcHeight, uint32_t *trg, int trgWidth, int trgHeight);
  36.  
  37.  
  38. // algorithm configuration
  39. #define XBRZ_CFG_LUMINANCE_WEIGHT 1
  40. #define XBRZ_CFG_EQUAL_COLOR_TOLERANCE 30
  41. #define XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD 3.6
  42. #define XBRZ_CFG_STEEP_DIRECTION_THRESHOLD 2.2
  43.  
  44.  
  45. // blend types
  46. #define BLEND_NONE     0
  47. #define BLEND_NORMAL   1 // a normal indication to blend
  48. #define BLEND_DOMINANT 2 // a strong indication to blend
  49.  
  50.  
  51. // handy macros
  52. #ifndef MIN
  53. #define MIN(a,b) ((a) < (b) ? (a) : (b))
  54. #endif // MIN
  55. #ifndef MAX
  56. #define MAX(a,b) ((a) > (b) ? (a) : (b))
  57. #endif // MAX
  58. #define GET_BYTE(val,byteno) ((uint8_t) (((val) >> ((byteno) << 3)) & 0xff))
  59. #define GET_BLUE(val)  GET_BYTE (val, 0)
  60. #define GET_GREEN(val) GET_BYTE (val, 1)
  61. #define GET_RED(val)   GET_BYTE (val, 2)
  62. #define GET_ALPHA(val) GET_BYTE (val, 3)
  63. #define CALC_COLOR24(colFront,colBack,M,N) (uint8_t) ((((uint8_t) (colFront)) * ((unsigned int) (M)) + ((uint8_t) (colBack)) * (((unsigned int) (N)) - ((unsigned int) (M)))) / ((unsigned int) (N)))
  64. #define CALC_COLOR32(colFront,colBack,weightFront,weightBack,weightSum) ((uint8_t) ((((uint8_t) (colFront)) * ((unsigned int) (weightFront)) + ((uint8_t) (colBack)) * ((unsigned int) (weightBack))) / ((unsigned int) (weightSum))))
  65. #define BYTE_ADVANCE(buffer,offset) (((char *) buffer) + (offset))
  66.  
  67.  
  68. // compress four blend types into a single byte
  69. #define getTopL(b)    ((uint8_t) (0x3 & ((uint8_t) (b) >> 0)))
  70. #define getTopR(b)    ((uint8_t) (0x3 & ((uint8_t) (b) >> 2)))
  71. #define getBottomR(b) ((uint8_t) (0x3 & ((uint8_t) (b) >> 4)))
  72. #define getBottomL(b) ((uint8_t) (0x3 & ((uint8_t) (b) >> 6)))
  73. #define setTopL(b,blend_type)    *(b) |= (((uint8_t) (blend_type)) << 0) // buffer is assumed to be initialized before preprocessing!
  74. #define setTopR(b,blend_type)    *(b) |= (((uint8_t) (blend_type)) << 2)
  75. #define setBottomR(b,blend_type) *(b) |= (((uint8_t) (blend_type)) << 4)
  76. #define setBottomL(b,blend_type) *(b) |= (((uint8_t) (blend_type)) << 6)
  77.  
  78.  
  79. typedef struct blendresult_s
  80. {
  81.    uint8_t
  82.       blend_f, blend_g,
  83.       blend_j, blend_k;
  84. } blendresult_t;
  85.  
  86.  
  87. typedef struct kernel_3x3_s
  88. {
  89.    uint32_t
  90.       a, b, c,
  91.       d, e, f,
  92.       g, h, i;
  93. } kernel_3x3_t;
  94.  
  95.  
  96. typedef struct kernel_4x4_s //kernel for preprocessing step
  97. {
  98.    uint32_t
  99.       a, b, c, d,
  100.       e, f, g, h,
  101.       i, j, k, l,
  102.       m, n, o, p;
  103. } kernel_4x4_t;
  104.  
  105.  
  106. typedef struct colorformat_s
  107. {
  108.    int bpp;
  109.    void (*alphagrad) (uint32_t *pixBack, uint32_t pixFront, unsigned int M, unsigned int N);
  110.    double (*dist) (uint32_t pix1, uint32_t pix2);
  111. } colorformat_t;
  112.  
  113.  
  114. typedef struct outmatrix_s
  115. {
  116.    size_t size;
  117.    uint32_t* ptr;
  118.    int stride;
  119. } outmatrix_t;
  120.  
  121.  
  122. typedef uint32_t *(outmatrixreffunc_t) (outmatrix_t *mat, size_t I, size_t J);
  123.  
  124.  
  125. typedef struct scaler_s
  126. {
  127.    int factor;
  128.    void (*blend_line_shallow)           (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref);
  129.    void (*blend_line_steep)             (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref);
  130.    void (*blend_line_steep_and_shallow) (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref);
  131.    void (*blend_line_diagonal)          (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref);
  132.    void (*blend_corner)                 (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref);
  133. } scaler_t;
  134.  
  135.  
  136. /////////////////////////////////
  137. // shallow line scaling functions
  138.  
  139. static void blend_line_shallow_2x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  140. {
  141.    color_format->alphagrad (outmatrix_ref (out, 2 - 1, 0), col, 1, 4);
  142.    color_format->alphagrad (outmatrix_ref (out, 2 - 1, 1), col, 3, 4);
  143. }
  144. static void blend_line_shallow_3x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  145. {
  146.    color_format->alphagrad (outmatrix_ref (out, 3 - 1, 0), col, 1, 4);
  147.    color_format->alphagrad (outmatrix_ref (out, 3 - 2, 2), col, 1, 4);
  148.    color_format->alphagrad (outmatrix_ref (out, 3 - 1, 1), col, 3, 4);
  149.    *outmatrix_ref (out, 3 - 1, 2) = col;
  150. }
  151. static void blend_line_shallow_4x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  152. {
  153.    color_format->alphagrad (outmatrix_ref (out, 4 - 1, 0), col, 1, 4);
  154.    color_format->alphagrad (outmatrix_ref (out, 4 - 2, 2), col, 1, 4);
  155.    color_format->alphagrad (outmatrix_ref (out, 4 - 1, 1), col, 3, 4);
  156.    color_format->alphagrad (outmatrix_ref (out, 4 - 2, 3), col, 3, 4);
  157.    *outmatrix_ref (out, 4 - 1, 2) = col;
  158.    *outmatrix_ref (out, 4 - 1, 3) = col;
  159. }
  160. static void blend_line_shallow_5x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  161. {
  162.    color_format->alphagrad (outmatrix_ref (out, 5 - 1, 0), col, 1, 4);
  163.    color_format->alphagrad (outmatrix_ref (out, 5 - 2, 2), col, 1, 4);
  164.    color_format->alphagrad (outmatrix_ref (out, 5 - 3, 4), col, 1, 4);
  165.    color_format->alphagrad (outmatrix_ref (out, 5 - 1, 1), col, 3, 4);
  166.    color_format->alphagrad (outmatrix_ref (out, 5 - 2, 3), col, 3, 4);
  167.    *outmatrix_ref (out, 5 - 1, 2) = col;
  168.    *outmatrix_ref (out, 5 - 1, 3) = col;
  169.    *outmatrix_ref (out, 5 - 1, 4) = col;
  170.    *outmatrix_ref (out, 5 - 2, 4) = col;
  171. }
  172. static void blend_line_shallow_6x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  173. {
  174.    color_format->alphagrad (outmatrix_ref (out, 6 - 1, 0), col, 1, 4);
  175.    color_format->alphagrad (outmatrix_ref (out, 6 - 2, 2), col, 1, 4);
  176.    color_format->alphagrad (outmatrix_ref (out, 6 - 3, 4), col, 1, 4);
  177.    color_format->alphagrad (outmatrix_ref (out, 6 - 1, 1), col, 3, 4);
  178.    color_format->alphagrad (outmatrix_ref (out, 6 - 2, 3), col, 3, 4);
  179.    color_format->alphagrad (outmatrix_ref (out, 6 - 3, 5), col, 3, 4);
  180.    *outmatrix_ref (out, 6 - 1, 2) = col;
  181.    *outmatrix_ref (out, 6 - 1, 3) = col;
  182.    *outmatrix_ref (out, 6 - 1, 4) = col;
  183.    *outmatrix_ref (out, 6 - 1, 5) = col;
  184.    *outmatrix_ref (out, 6 - 2, 4) = col;
  185.    *outmatrix_ref (out, 6 - 2, 5) = col;
  186. }
  187.  
  188. ///////////////////////////////
  189. // steep line scaling functions
  190.  
  191. static void blend_line_steep_2x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  192. {
  193.    color_format->alphagrad (outmatrix_ref (out, 0, 2 - 1), col, 1, 4);
  194.    color_format->alphagrad (outmatrix_ref (out, 1, 2 - 1), col, 3, 4);
  195. }
  196. static void blend_line_steep_3x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  197. {
  198.    color_format->alphagrad (outmatrix_ref (out, 0, 3 - 1), col, 1, 4);
  199.    color_format->alphagrad (outmatrix_ref (out, 2, 3 - 2), col, 1, 4);
  200.    color_format->alphagrad (outmatrix_ref (out, 1, 3 - 1), col, 3, 4);
  201.    *outmatrix_ref (out, 2, 3 - 1) = col;
  202. }
  203. static void blend_line_steep_4x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  204. {
  205.    color_format->alphagrad (outmatrix_ref (out, 0, 4 - 1), col, 1, 4);
  206.    color_format->alphagrad (outmatrix_ref (out, 2, 4 - 2), col, 1, 4);
  207.    color_format->alphagrad (outmatrix_ref (out, 1, 4 - 1), col, 3, 4);
  208.    color_format->alphagrad (outmatrix_ref (out, 3, 4 - 2), col, 3, 4);
  209.    *outmatrix_ref (out, 2, 4 - 1) = col;
  210.    *outmatrix_ref (out, 3, 4 - 1) = col;
  211. }
  212. static void blend_line_steep_5x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  213. {
  214.    color_format->alphagrad (outmatrix_ref (out, 0, 5 - 1), col, 1, 4);
  215.    color_format->alphagrad (outmatrix_ref (out, 2, 5 - 2), col, 1, 4);
  216.    color_format->alphagrad (outmatrix_ref (out, 4, 5 - 3), col, 1, 4);
  217.    color_format->alphagrad (outmatrix_ref (out, 1, 5 - 1), col, 3, 4);
  218.    color_format->alphagrad (outmatrix_ref (out, 3, 5 - 2), col, 3, 4);
  219.    *outmatrix_ref (out, 2, 5 - 1) = col;
  220.    *outmatrix_ref (out, 3, 5 - 1) = col;
  221.    *outmatrix_ref (out, 4, 5 - 1) = col;
  222.    *outmatrix_ref (out, 4, 5 - 2) = col;
  223. }
  224. static void blend_line_steep_6x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  225. {
  226.    color_format->alphagrad (outmatrix_ref (out, 0, 6 - 1), col, 1, 4);
  227.    color_format->alphagrad (outmatrix_ref (out, 2, 6 - 2), col, 1, 4);
  228.    color_format->alphagrad (outmatrix_ref (out, 4, 6 - 3), col, 1, 4);
  229.    color_format->alphagrad (outmatrix_ref (out, 1, 6 - 1), col, 3, 4);
  230.    color_format->alphagrad (outmatrix_ref (out, 3, 6 - 2), col, 3, 4);
  231.    color_format->alphagrad (outmatrix_ref (out, 5, 6 - 3), col, 3, 4);
  232.    *outmatrix_ref (out, 2, 6 - 1) = col;
  233.    *outmatrix_ref (out, 3, 6 - 1) = col;
  234.    *outmatrix_ref (out, 4, 6 - 1) = col;
  235.    *outmatrix_ref (out, 5, 6 - 1) = col;
  236.    *outmatrix_ref (out, 4, 6 - 2) = col;
  237.    *outmatrix_ref (out, 5, 6 - 2) = col;
  238. }
  239.  
  240. ///////////////////////////////////////////
  241. // steep and shallow line scaling functions
  242.  
  243. static void blend_line_steep_and_shallow_2x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  244. {
  245.    color_format->alphagrad (outmatrix_ref (out, 1, 0), col, 1, 4);
  246.    color_format->alphagrad (outmatrix_ref (out, 0, 1), col, 1, 4);
  247.    color_format->alphagrad (outmatrix_ref (out, 1, 1), col, 5, 6); // [!] fixes 7/8 used in xBR
  248. }
  249. static void blend_line_steep_and_shallow_3x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  250. {
  251.    color_format->alphagrad (outmatrix_ref (out, 2, 0), col, 1, 4);
  252.    color_format->alphagrad (outmatrix_ref (out, 0, 2), col, 1, 4);
  253.    color_format->alphagrad (outmatrix_ref (out, 2, 1), col, 3, 4);
  254.    color_format->alphagrad (outmatrix_ref (out, 1, 2), col, 3, 4);
  255.    *outmatrix_ref (out, 2, 2) = col;
  256. }
  257. static void blend_line_steep_and_shallow_4x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  258. {
  259.    color_format->alphagrad (outmatrix_ref (out, 3, 1), col, 3, 4);
  260.    color_format->alphagrad (outmatrix_ref (out, 1, 3), col, 3, 4);
  261.    color_format->alphagrad (outmatrix_ref (out, 3, 0), col, 1, 4);
  262.    color_format->alphagrad (outmatrix_ref (out, 0, 3), col, 1, 4);
  263.    color_format->alphagrad (outmatrix_ref (out, 2, 2), col, 1, 3); // [!] fixes 1/4 used in xBR
  264.    *outmatrix_ref (out, 3, 3) = col;
  265.    *outmatrix_ref (out, 3, 2) = col;
  266.    *outmatrix_ref (out, 2, 3) = col;
  267. }
  268. static void blend_line_steep_and_shallow_5x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  269. {
  270.    color_format->alphagrad (outmatrix_ref (out, 0, 5 - 1), col, 1, 4);
  271.    color_format->alphagrad (outmatrix_ref (out, 2, 5 - 2), col, 1, 4);
  272.    color_format->alphagrad (outmatrix_ref (out, 1, 5 - 1), col, 3, 4);
  273.    color_format->alphagrad (outmatrix_ref (out, 5 - 1, 0), col, 1, 4);
  274.    color_format->alphagrad (outmatrix_ref (out, 5 - 2, 2), col, 1, 4);
  275.    color_format->alphagrad (outmatrix_ref (out, 5 - 1, 1), col, 3, 4);
  276.    color_format->alphagrad (outmatrix_ref (out, 3, 3), col, 2, 3);
  277.    *outmatrix_ref (out, 2, 5 - 1) = col;
  278.    *outmatrix_ref (out, 3, 5 - 1) = col;
  279.    *outmatrix_ref (out, 4, 5 - 1) = col;
  280.    *outmatrix_ref (out, 5 - 1, 2) = col;
  281.    *outmatrix_ref (out, 5 - 1, 3) = col;
  282. }
  283. static void blend_line_steep_and_shallow_6x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  284. {
  285.    color_format->alphagrad (outmatrix_ref (out, 0, 6 - 1), col, 1, 4);
  286.    color_format->alphagrad (outmatrix_ref (out, 2, 6 - 2), col, 1, 4);
  287.    color_format->alphagrad (outmatrix_ref (out, 1, 6 - 1), col, 3, 4);
  288.    color_format->alphagrad (outmatrix_ref (out, 3, 6 - 2), col, 3, 4);
  289.    color_format->alphagrad (outmatrix_ref (out, 6 - 1, 0), col, 1, 4);
  290.    color_format->alphagrad (outmatrix_ref (out, 6 - 2, 2), col, 1, 4);
  291.    color_format->alphagrad (outmatrix_ref (out, 6 - 1, 1), col, 3, 4);
  292.    color_format->alphagrad (outmatrix_ref (out, 6 - 2, 3), col, 3, 4);
  293.    *outmatrix_ref (out, 2, 6 - 1) = col;
  294.    *outmatrix_ref (out, 3, 6 - 1) = col;
  295.    *outmatrix_ref (out, 4, 6 - 1) = col;
  296.    *outmatrix_ref (out, 5, 6 - 1) = col;
  297.    *outmatrix_ref (out, 4, 6 - 2) = col;
  298.    *outmatrix_ref (out, 5, 6 - 2) = col;
  299.    *outmatrix_ref (out, 6 - 1, 2) = col;
  300.    *outmatrix_ref (out, 6 - 1, 3) = col;
  301. }
  302.  
  303. //////////////////////////////////
  304. // diagonal line scaling functions
  305.  
  306. static void blend_line_diagonal_2x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  307. {
  308.    color_format->alphagrad (outmatrix_ref (out, 1, 1), col, 1, 2);
  309. }
  310. static void blend_line_diagonal_3x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  311. {
  312.    color_format->alphagrad (outmatrix_ref (out, 1, 2), col, 1, 8); // conflict with other rotations for this odd scale
  313.    color_format->alphagrad (outmatrix_ref (out, 2, 1), col, 1, 8);
  314.    color_format->alphagrad (outmatrix_ref (out, 2, 2), col, 7, 8);
  315. }
  316. static void blend_line_diagonal_4x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  317. {
  318.    color_format->alphagrad (outmatrix_ref (out, 4 - 1, 4 / 2), col, 1, 2);
  319.    color_format->alphagrad (outmatrix_ref (out, 4 - 2, 4 / 2 + 1), col, 1, 2);
  320.    *outmatrix_ref (out, 4 - 1, 4 - 1) = col;
  321. }
  322. static void blend_line_diagonal_5x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  323. {
  324.    color_format->alphagrad (outmatrix_ref (out, 5 - 1, 5 / 2 + 0), col, 1, 8); // conflict with other rotations for this odd scale
  325.    color_format->alphagrad (outmatrix_ref (out, 5 - 2, 5 / 2 + 1), col, 1, 8);
  326.    color_format->alphagrad (outmatrix_ref (out, 5 - 3, 5 / 2 + 2), col, 1, 8);
  327.    color_format->alphagrad (outmatrix_ref (out, 4, 3), col, 7, 8);
  328.    color_format->alphagrad (outmatrix_ref (out, 3, 4), col, 7, 8);
  329.    *outmatrix_ref (out, 4, 4) = col;
  330. }
  331. static void blend_line_diagonal_6x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  332. {
  333.    color_format->alphagrad (outmatrix_ref (out, 6 - 1, 6 / 2 + 0), col, 1, 2);
  334.    color_format->alphagrad (outmatrix_ref (out, 6 - 2, 6 / 2 + 1), col, 1, 2);
  335.    color_format->alphagrad (outmatrix_ref (out, 6 - 3, 6 / 2 + 2), col, 1, 2);
  336.    *outmatrix_ref (out, 6 - 2, 6 - 1) = col;
  337.    *outmatrix_ref (out, 6 - 1, 6 - 1) = col;
  338.    *outmatrix_ref (out, 6 - 1, 6 - 2) = col;
  339. }
  340.  
  341. ///////////////////////////
  342. // corner scaling functions
  343.  
  344. static void blend_corner_2x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  345. {
  346.    // model a round corner
  347.    color_format->alphagrad (outmatrix_ref (out, 1, 1), col, 21, 100); // exact: 1 - pi/4 = 0.2146018366
  348. }
  349. static void blend_corner_3x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  350. {
  351.    // model a round corner
  352.    color_format->alphagrad (outmatrix_ref (out, 2, 2), col, 45, 100); // exact: 0.4545939598
  353.    //color_format->alphagrad (outmatrix_ref (out, 2, 1), col,  7, 256); // 0.02826017254 -> negligible + avoid conflicts with other rotations for this odd scale
  354.    //color_format->alphagrad (outmatrix_ref (out, 1, 2), col,  7, 256); // 0.02826017254
  355. }
  356. static void blend_corner_4x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  357. {
  358.    // model a round corner
  359.    color_format->alphagrad (outmatrix_ref (out, 3, 3), col, 68, 100); // exact: 0.6848532563
  360.    color_format->alphagrad (outmatrix_ref (out, 3, 2), col,  9, 100); // 0.08677704501
  361.    color_format->alphagrad (outmatrix_ref (out, 2, 3), col,  9, 100); // 0.08677704501
  362. }
  363. static void blend_corner_5x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  364. {
  365.    // model a round corner
  366.    color_format->alphagrad (outmatrix_ref (out, 4, 4), col, 86, 100); // exact: 0.8631434088
  367.    color_format->alphagrad (outmatrix_ref (out, 4, 3), col, 23, 100); // 0.2306749731
  368.    color_format->alphagrad (outmatrix_ref (out, 3, 4), col, 23, 100); // 0.2306749731
  369.    //color_format->alphagrad (outmatrix_ref (out, 4, 2), col,  1,  64); // 0.01676812367 -> negligible + avoid conflicts with other rotations for this odd scale
  370.    //color_format->alphagrad (outmatrix_ref (out, 2, 4), col,  1,  64); // 0.01676812367
  371. }
  372. static void blend_corner_6x (uint32_t col, outmatrix_t *out, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref)
  373. {
  374.    // model a round corner
  375.    color_format->alphagrad (outmatrix_ref (out, 5, 5), col, 97, 100); // exact: 0.9711013910
  376.    color_format->alphagrad (outmatrix_ref (out, 4, 5), col, 42, 100); // 0.4236372243
  377.    color_format->alphagrad (outmatrix_ref (out, 5, 4), col, 42, 100); // 0.4236372243
  378.    color_format->alphagrad (outmatrix_ref (out, 5, 3), col,  6, 100); // 0.05652034508
  379.    color_format->alphagrad (outmatrix_ref (out, 3, 5), col,  6, 100); // 0.05652034508
  380. }
  381.  
  382. /////////////////////////////////////
  383. // scaler objects for various factors
  384.  
  385. static const scaler_t scalers[] =
  386. {
  387.    { 2, blend_line_shallow_2x, blend_line_steep_2x, blend_line_steep_and_shallow_2x, blend_line_diagonal_2x, blend_corner_2x },
  388.    { 3, blend_line_shallow_3x, blend_line_steep_3x, blend_line_steep_and_shallow_3x, blend_line_diagonal_3x, blend_corner_3x },
  389.    { 4, blend_line_shallow_4x, blend_line_steep_4x, blend_line_steep_and_shallow_4x, blend_line_diagonal_4x, blend_corner_4x },
  390.    { 5, blend_line_shallow_5x, blend_line_steep_5x, blend_line_steep_and_shallow_5x, blend_line_diagonal_5x, blend_corner_5x },
  391.    { 6, blend_line_shallow_6x, blend_line_steep_6x, blend_line_steep_and_shallow_6x, blend_line_diagonal_6x, blend_corner_6x },
  392. };
  393.  
  394. /////////////////////////////////////////////////////
  395. // alpha gradient functions for various color formats
  396.  
  397. static void alphagrad24 (uint32_t *pixBack, uint32_t pixFront, unsigned int M, unsigned int N)
  398. {
  399.    // blend front color with opacity M / N over opaque background: http://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending
  400.    *pixBack = ((CALC_COLOR24 (GET_RED   (pixFront), GET_RED   (*pixBack), M, N) << 16)
  401.              | (CALC_COLOR24 (GET_GREEN (pixFront), GET_GREEN (*pixBack), M, N) <<  8)
  402.              | (CALC_COLOR24 (GET_BLUE  (pixFront), GET_BLUE  (*pixBack), M, N) <<  0));
  403. }
  404. static void alphagrad32 (uint32_t *pixBack, uint32_t pixFront, unsigned int M, unsigned int N)
  405. {
  406.    // find intermediate color between two colors with alpha channels (=> NO alpha blending!!!)
  407.    const unsigned int weightFront = GET_ALPHA (pixFront) * M;
  408.    const unsigned int weightBack = GET_ALPHA (*pixBack) * (N - M);
  409.    const unsigned int weightSum = weightFront + weightBack;
  410.    *pixBack = (weightSum == 0 ? 0 :
  411.                (((uint8_t) (weightSum / N))                                                                     << 24)
  412.                | (CALC_COLOR32 (GET_RED   (pixFront), GET_RED   (*pixBack), weightFront, weightBack, weightSum) << 16)
  413.                | (CALC_COLOR32 (GET_GREEN (pixFront), GET_GREEN (*pixBack), weightFront, weightBack, weightSum) <<  8)
  414.                | (CALC_COLOR32 (GET_BLUE  (pixFront), GET_BLUE  (*pixBack), weightFront, weightBack, weightSum) <<  0));
  415. }
  416.  
  417. /////////////////////////////////////////////////////
  418. // color distance functions for various color formats
  419.  
  420. static double dist24 (uint32_t pix1, uint32_t pix2)
  421. {
  422.    // 30% perf boost compared to plain distYCbCr()!
  423.    // consumes 64 MB memory; using double is only 2% faster, but takes 128 MB
  424.    static float diffToDist[256 * 256 * 256] = { 0 };
  425.    static bool is_initialized = false;
  426.    if (!is_initialized)
  427.    {
  428.       for (uint32_t i = 0; i < 256 * 256 * 256; ++i) //startup time: 114 ms on Intel Core i5 (four cores)
  429.       {
  430.          const int r_diff = GET_RED (i) * 2 - 0xFF;
  431.          const int g_diff = GET_GREEN (i) * 2 - 0xFF;
  432.          const int b_diff = GET_BLUE (i) * 2 - 0xFF;
  433.  
  434.          const double k_b = 0.0593; //ITU-R BT.2020 conversion
  435.          const double k_r = 0.2627; //
  436.          const double k_g = 1 - k_b - k_r;
  437.  
  438.          const double scale_b = 0.5 / (1 - k_b);
  439.          const double scale_r = 0.5 / (1 - k_r);
  440.  
  441.          const double y = k_r * r_diff + k_g * g_diff + k_b * b_diff; //[!], analog YCbCr!
  442.          const double c_b = scale_b * (b_diff - y);
  443.          const double c_r = scale_r * (r_diff - y);
  444.  
  445.          diffToDist[i] = (float) (sqrt ((y * y) + (c_b * c_b) + (c_r * c_r)));
  446.       }
  447.       is_initialized = true;
  448.    }
  449.  
  450.    const int r_diff = (int) GET_RED (pix1) - (int) GET_RED (pix2);
  451.    const int g_diff = (int) GET_GREEN (pix1) - (int) GET_GREEN (pix2);
  452.    const int b_diff = (int) GET_BLUE (pix1) - (int) GET_BLUE (pix2);
  453.  
  454.    return (diffToDist[  (((r_diff + 0xFF) / 2) << 16) // slightly reduce precision (division by 2) to squeeze value into single byte
  455.                       | (((g_diff + 0xFF) / 2) <<  8)
  456.                       | (((b_diff + 0xFF) / 2) <<  0)]);
  457. }
  458. static double dist32 (uint32_t pix1, uint32_t pix2)
  459. {
  460.    // Requirements for a color distance handling alpha channel: with a1, a2 in [0, 1]
  461.    //    1. if a1 = a2, distance should be: a1 * distYCbCr()
  462.    //    2. if a1 = 0,  distance should be: a2 * distYCbCr(black, white) = a2 * 255
  463.    //    3. if a1 = 1,  ??? maybe: 255 * (1 - a2) + a2 * distYCbCr()
  464.    // return MIN (a1, a2) * distYCbCrBuffered(pix1, pix2) + 255 * abs(a1 - a2);
  465.    // => following code is 15% faster:
  466.    const double d = dist24 (pix1, pix2);
  467.    const double a1 = GET_ALPHA (pix1) / 255.0;
  468.    const double a2 = GET_ALPHA (pix2) / 255.0;
  469.    return (a1 < a2 ? a1 * d + 255 * (a2 - a1) : a2 * d + 255 * (a1 - a2));
  470. }
  471.  
  472. ///////////////////////////////////////
  473. // color format objects for various bpp
  474.  
  475. static colorformat_t color_format_24 = { 24, alphagrad24, dist24 };
  476. static colorformat_t color_format_32 = { 32, alphagrad32, dist32 };
  477.  
  478. //////////////////////////////////////////////////////////
  479. // output matrix reference functions for various rotations
  480.  
  481. static uint32_t *outmatrixref_0   (outmatrix_t *mat, size_t I, size_t J) { return (mat->ptr + I * mat->stride + J); }
  482. static uint32_t *outmatrixref_90  (outmatrix_t *mat, size_t I, size_t J) { return (mat->ptr + (mat->size - 1 - J) * mat->stride + I); }
  483. static uint32_t *outmatrixref_180 (outmatrix_t *mat, size_t I, size_t J) { return (mat->ptr + (mat->size - 1 - I) * mat->stride + (mat->size - 1 - J)); }
  484. static uint32_t *outmatrixref_270 (outmatrix_t *mat, size_t I, size_t J) { return (mat->ptr + J * mat->stride + (mat->size - 1 - I)); }
  485.  
  486.  
  487. ///////////////////////////
  488. // core algorithm functions
  489.  
  490.  
  491. #ifdef _MSC_VER
  492. #define FORCE_INLINE __forceinline
  493. #elif defined __GNUC__
  494. #define FORCE_INLINE __attribute__((always_inline)) inline
  495. #else
  496. #define FORCE_INLINE inline
  497. #endif
  498.  
  499.  
  500. static FORCE_INLINE void preprocess_corners (blendresult_t *result, const kernel_4x4_t *ker, colorformat_t *color_format)
  501. {
  502.    // detect blend direction
  503.    // result: F, G, J, K corners of "GradientType"
  504.  
  505.    // input kernel area naming convention:
  506.    // -----------------
  507.    // | A | B | C | D |
  508.    // ----|---|---|---|
  509.    // | E | F | G | H |   //evaluate the four corners between F, G, J, K
  510.    // ----|---|---|---|   //input pixel is at position F
  511.    // | I | J | K | L |
  512.    // ----|---|---|---|
  513.    // | M | N | O | P |
  514.    // -----------------
  515.  
  516.    memset (result, 0, sizeof (blendresult_t));
  517.  
  518.    if (((ker->f == ker->g) && (ker->j == ker->k)) || ((ker->f == ker->j) && (ker->g == ker->k)))
  519.       return;
  520.  
  521.    const int weight = 4;
  522.    double jg = color_format->dist (ker->i, ker->f) + color_format->dist (ker->f, ker->c) + color_format->dist (ker->n, ker->k) + color_format->dist (ker->k, ker->h) + weight * color_format->dist (ker->j, ker->g);
  523.    double fk = color_format->dist (ker->e, ker->j) + color_format->dist (ker->j, ker->o) + color_format->dist (ker->b, ker->g) + color_format->dist (ker->g, ker->l) + weight * color_format->dist (ker->f, ker->k);
  524.  
  525.    if (jg < fk) // test sample: 70% of values max(jg, fk) / min(jg, fk) are between 1.1 and 3.7 with median being 1.8
  526.    {
  527.       const bool dominantGradient = XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD * jg < fk;
  528.       if (ker->f != ker->g && ker->f != ker->j)
  529.          result->blend_f = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  530.  
  531.       if (ker->k != ker->j && ker->k != ker->g)
  532.          result->blend_k = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  533.    }
  534.    else if (fk < jg)
  535.    {
  536.       const bool dominantGradient = XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD * fk < jg;
  537.       if (ker->j != ker->f && ker->j != ker->k)
  538.          result->blend_j = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  539.  
  540.       if (ker->g != ker->f && ker->g != ker->k)
  541.          result->blend_g = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  542.    }
  543.  
  544.    return;
  545. }
  546.  
  547.  
  548. static FORCE_INLINE void blend_pixel (const scaler_t *scaler, const kernel_3x3_t *ker, uint32_t *target, int trgWidth, uint8_t blendInfo, colorformat_t *color_format, outmatrixreffunc_t outmatrix_ref) //result of preprocessing all four corners of pixel "e"
  549. {
  550.    // input kernel area naming convention:
  551.    // -------------
  552.    // | A | B | C |
  553.    // ----|---|---|
  554.    // | D | E | F | // input pixel is at position E
  555.    // ----|---|---|
  556.    // | G | H | I |
  557.    // -------------
  558.  
  559.    uint32_t
  560.       a, b, c,
  561.       d, e, f,
  562.       g, h, i;
  563.    uint8_t blend;
  564.  
  565.    if      (outmatrix_ref == outmatrixref_270) { a = ker->c; b = ker->f; c = ker->i; d = ker->b; e = ker->e; f = ker->h; g = ker->a; h = ker->d; i = ker->g; blend = ((blendInfo << 6) | (blendInfo >> 2)) & 0xff; }
  566.    else if (outmatrix_ref == outmatrixref_180) { a = ker->i; b = ker->h; c = ker->g; d = ker->f; e = ker->e; f = ker->d; g = ker->c; h = ker->b; i = ker->a; blend = ((blendInfo << 4) | (blendInfo >> 4)) & 0xff; }
  567.    else if (outmatrix_ref == outmatrixref_90)  { a = ker->g; b = ker->d; c = ker->a; d = ker->h; e = ker->e; f = ker->b; g = ker->i; h = ker->f; i = ker->c; blend = ((blendInfo << 2) | (blendInfo >> 6)) & 0xff; }
  568.    else                                        { a = ker->a; b = ker->b; c = ker->c; d = ker->d; e = ker->e; f = ker->f; g = ker->g; h = ker->h; i = ker->i; blend = blendInfo; } // blendInfo here is equivalent to ((blendInfo << 0) | (blendInfo >> 8)) & 0xff
  569.  
  570.    if (getBottomR (blend) >= BLEND_NORMAL)
  571.    {
  572.       uint32_t px;
  573.       bool doLineBlend;
  574.  
  575.       if (getBottomR (blend) >= BLEND_DOMINANT)
  576.          doLineBlend = true;
  577.       else if (getTopR (blend) != BLEND_NONE && (color_format->dist (e, g) >= XBRZ_CFG_EQUAL_COLOR_TOLERANCE)) //but support double-blending for 90° corners
  578.          doLineBlend = false; // make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
  579.       else if (getBottomL (blend) != BLEND_NONE && (color_format->dist (e, c) >= XBRZ_CFG_EQUAL_COLOR_TOLERANCE))
  580.          doLineBlend = false; // make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
  581.       else if ((color_format->dist (e, i) >= XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
  582.                && (color_format->dist (g, h) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
  583.                && (color_format->dist (h, i) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
  584.                && (color_format->dist (i, f) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
  585.                && (color_format->dist (f, c) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE))
  586.          doLineBlend = false; // no full blending for L-shapes; blend corner only (handles "mario mushroom eyes")
  587.       else
  588.          doLineBlend = true;
  589.  
  590.       outmatrix_t out;
  591.       out.size = scaler->factor;
  592.       out.ptr = target;
  593.       out.stride = trgWidth;
  594.  
  595.       px = (color_format->dist (e, f) <= color_format->dist (e, h) ? f : h); // choose most similar color
  596.  
  597.       if (doLineBlend)
  598.       {
  599.          const double fg = color_format->dist (f, g); // test sample: 70% of values max(fg, hc) / min(fg, hc) are between 1.1 and 3.7 with median being 1.9
  600.          const double hc = color_format->dist (h, c);
  601.          const bool haveShallowLine = (XBRZ_CFG_STEEP_DIRECTION_THRESHOLD * fg <= hc) && (e != g) && (d != g);
  602.          const bool haveSteepLine   = (XBRZ_CFG_STEEP_DIRECTION_THRESHOLD * hc <= fg) && (e != c) && (b != c);
  603.  
  604.          if (haveShallowLine)
  605.          {
  606.             if (haveSteepLine)
  607.                scaler->blend_line_steep_and_shallow (px, &out, color_format, outmatrix_ref);
  608.             else
  609.                scaler->blend_line_shallow (px, &out, color_format, outmatrix_ref);
  610.          }
  611.          else
  612.          {
  613.             if (haveSteepLine)
  614.                scaler->blend_line_steep (px, &out, color_format, outmatrix_ref);
  615.             else
  616.                scaler->blend_line_diagonal (px, &out, color_format, outmatrix_ref);
  617.          }
  618.       }
  619.       else
  620.          scaler->blend_corner (px, &out, color_format, outmatrix_ref);
  621.    }
  622. }
  623.  
  624.  
  625. static void scale_image (const scaler_t *scaler, const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight, int yFirst, int yLast, colorformat_t *color_format)
  626. {
  627.    yFirst = MAX (yFirst, 0);
  628.    yLast = MIN (yLast, srcHeight);
  629.    if (yFirst >= yLast || srcWidth <= 0)
  630.       return;
  631.  
  632.    const int trgWidth = srcWidth * scaler->factor;
  633.  
  634.    // "use" space at the end of the image as temporary buffer for "on the fly preprocessing": we even could use larger area of
  635.    // "sizeof(uint32_t) * srcWidth * (yLast - yFirst)" bytes without risk of accidental overwriting before accessing
  636.    const int bufferSize = srcWidth;
  637.    uint8_t *preProcBuffer = (uint8_t *) (trg + yLast * scaler->factor * trgWidth) - bufferSize;
  638.    memset (preProcBuffer, 0, bufferSize);
  639.  
  640.    // initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending
  641.    // this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition!
  642.    if (yFirst > 0)
  643.    {
  644.       const int y = yFirst - 1;
  645.  
  646.       const uint32_t *s_m1 = src + srcWidth * MAX (y - 1, 0);
  647.       const uint32_t *s_0 = src + srcWidth * y; //center line
  648.       const uint32_t *s_p1 = src + srcWidth * MIN (y + 1, srcHeight - 1);
  649.       const uint32_t *s_p2 = src + srcWidth * MIN (y + 2, srcHeight - 1);
  650.  
  651.       for (int x = 0; x < srcWidth; ++x)
  652.       {
  653.          blendresult_t res;
  654.          const int x_m1 = MAX (x - 1, 0);
  655.          const int x_p1 = MIN (x + 1, srcWidth - 1);
  656.          const int x_p2 = MIN (x + 2, srcWidth - 1);
  657.  
  658.          kernel_4x4_t ker; // perf: initialization is negligible
  659.          ker.a = s_m1[x_m1]; ker.b = s_m1[x]; ker.c = s_m1[x_p1]; ker.d = s_m1[x_p2]; // read sequentially from memory as far as possible
  660.          ker.e = s_0[x_m1];  ker.f = s_0[x];  ker.g = s_0[x_p1];  ker.h = s_0[x_p2];
  661.          ker.i = s_p1[x_m1]; ker.j = s_p1[x]; ker.k = s_p1[x_p1]; ker.l = s_p1[x_p2];
  662.          ker.m = s_p2[x_m1]; ker.n = s_p2[x]; ker.o = s_p2[x_p1]; ker.p = s_p2[x_p2];
  663.  
  664.          preprocess_corners (&res, &ker, color_format);
  665.  
  666.          // preprocessing blend result:
  667.          // ---------
  668.          // | F | G |   // evalute corner between F, G, J, K
  669.          // ----|---|   // input pixel is at position F
  670.          // | J | K |
  671.          // ---------
  672.  
  673.          setTopR (&preProcBuffer[x], res.blend_j);
  674.          if (x + 1 < bufferSize)
  675.             setTopL (&preProcBuffer[x + 1], res.blend_k);
  676.       }
  677.    }
  678.    //------------------------------------------------------------------------------------
  679.  
  680.    for (int y = yFirst; y < yLast; ++y)
  681.    {
  682.       uint32_t *out = trg + scaler->factor * y * trgWidth; // consider MT "striped" access
  683.  
  684.       const uint32_t* s_m1 = src + srcWidth * MAX (y - 1, 0);
  685.       const uint32_t* s_0 = src + srcWidth * y; // center line
  686.       const uint32_t* s_p1 = src + srcWidth * MIN (y + 1, srcHeight - 1);
  687.       const uint32_t* s_p2 = src + srcWidth * MIN (y + 2, srcHeight - 1);
  688.  
  689.       uint8_t blend_xy1 = 0; // corner blending for current (x, y + 1) position
  690.  
  691.       for (int x = 0; x < srcWidth; ++x, out += scaler->factor)
  692.       {
  693.          // all those bounds checks have only insignificant impact on performance!
  694.          const int x_m1 = MAX (x - 1, 0); // perf: prefer array indexing to additional pointers!
  695.          const int x_p1 = MIN (x + 1, srcWidth - 1);
  696.          const int x_p2 = MIN (x + 2, srcWidth - 1);
  697.  
  698.          kernel_4x4_t ker4; // perf: initialization is negligible
  699.          ker4.a = s_m1[x_m1]; ker4.b = s_m1[x]; ker4.c = s_m1[x_p1]; ker4.d = s_m1[x_p2]; // read sequentially from memory as far as possible
  700.          ker4.e = s_0[x_m1];  ker4.f = s_0[x];  ker4.g = s_0[x_p1];  ker4.h = s_0[x_p2];
  701.          ker4.i = s_p1[x_m1]; ker4.j = s_p1[x]; ker4.k = s_p1[x_p1]; ker4.l = s_p1[x_p2];
  702.          ker4.m = s_p2[x_m1]; ker4.n = s_p2[x]; ker4.o = s_p2[x_p1]; ker4.p = s_p2[x_p2];
  703.  
  704.          // evaluate the four corners on bottom-right of current pixel
  705.          uint8_t blend_xy = 0; // for current (x, y) position
  706.          {
  707.             blendresult_t res;
  708.             preprocess_corners (&res, &ker4, color_format);
  709.  
  710.             // preprocessing blend result:
  711.             // ---------
  712.             // | F | G |   // evalute corner between F, G, J, K
  713.             // ----|---|   // current input pixel is at position F
  714.             // | J | K |
  715.             // ---------
  716.  
  717.             blend_xy = preProcBuffer[x];
  718.             setBottomR (&blend_xy, res.blend_f); // all four corners of (x, y) have been determined at this point due to processing sequence!
  719.  
  720.             setTopR (&blend_xy1, res.blend_j); // set 2nd known corner for (x, y + 1)
  721.             preProcBuffer[x] = blend_xy1; // store on current buffer position for use on next row
  722.  
  723.             blend_xy1 = 0;
  724.             setTopL (&blend_xy1, res.blend_k); // set 1st known corner for (x + 1, y + 1) and buffer for use on next column
  725.  
  726.             if (x + 1 < bufferSize) // set 3rd known corner for (x + 1, y)
  727.                setBottomL (&preProcBuffer[x + 1], res.blend_g);
  728.          }
  729.  
  730.          // fill block of size scale * scale with the given color
  731.          uint32_t *blk = out;
  732.          for (int _blk_y = 0; _blk_y < scaler->factor; ++_blk_y, blk = (uint32_t *) BYTE_ADVANCE (blk, trgWidth * sizeof (uint32_t)))
  733.             for (int _blk_x = 0; _blk_x < scaler->factor; ++_blk_x)
  734.                blk[_blk_x] = ker4.f;
  735.  
  736.          // place *after* preprocessing step, to not overwrite the results while processing the the last pixel!
  737.  
  738.          // blend four corners of current pixel
  739.          if (blend_xy != 0) // good 5% perf-improvement
  740.          {
  741.             kernel_3x3_t ker3; // perf: initialization is negligible
  742.             ker3.a = ker4.a; ker3.b = ker4.b; ker3.c = ker4.c;
  743.             ker3.d = ker4.e; ker3.e = ker4.f; ker3.f = ker4.g;
  744.             ker3.g = ker4.i; ker3.h = ker4.j; ker3.i = ker4.k;
  745.  
  746.             blend_pixel (scaler, &ker3, out, trgWidth, blend_xy, color_format, outmatrixref_0);
  747.             blend_pixel (scaler, &ker3, out, trgWidth, blend_xy, color_format, outmatrixref_90);
  748.             blend_pixel (scaler, &ker3, out, trgWidth, blend_xy, color_format, outmatrixref_180);
  749.             blend_pixel (scaler, &ker3, out, trgWidth, blend_xy, color_format, outmatrixref_270);
  750.          }
  751.       }
  752.    }
  753. }
  754.  
  755.  
  756. /////////////////////
  757. // exported functions
  758.  
  759.  
  760. void nearest_neighbor_scale (const uint32_t *src, int srcWidth, int srcHeight, uint32_t *trg, int trgWidth, int trgHeight)
  761. {
  762.    int srcPitch = srcWidth * sizeof (uint32_t);
  763.    int trgPitch = trgWidth * sizeof (uint32_t);
  764.    int yFirst;
  765.    int yLast;
  766.  
  767. #if 0 // going over source image - fast for upscaling, since source is read only once
  768.    yFirst = 0;
  769.    yLast = MIN (trgHeight, srcHeight);
  770.  
  771.    if (yFirst >= yLast || trgWidth <= 0 || trgHeight <= 0)
  772.       return; // consistency check
  773.  
  774.    for (int y = yFirst; y < yLast; ++y)
  775.    {
  776.       //mathematically: ySrc = floor(srcHeight * yTrg / trgHeight)
  777.       // => search for integers in: [ySrc, ySrc + 1) * trgHeight / srcHeight
  778.  
  779.       //keep within for loop to support MT input slices!
  780.       const int yTrg_first = (y      * trgHeight + srcHeight - 1) / srcHeight; // = ceil(y * trgHeight / srcHeight)
  781.       const int yTrg_last = ((y + 1) * trgHeight + srcHeight - 1) / srcHeight; // = ceil(((y + 1) * trgHeight) / srcHeight)
  782.       const int blockHeight = yTrg_last - yTrg_first;
  783.  
  784.       if (blockHeight > 0)
  785.       {
  786.          const uint32_t *srcLine = (const uint32_t *) BYTE_ADVANCE (src, y * srcPitch);
  787.          /**/  uint32_t *trgLine = (uint32_t *) BYTE_ADVANCE (trg, yTrg_first * trgPitch);
  788.          int xTrg_first = 0;
  789.  
  790.          for (int x = 0; x < srcWidth; ++x)
  791.          {
  792.             const int xTrg_last = ((x + 1) * trgWidth + srcWidth - 1) / srcWidth;
  793.             const int blockWidth = xTrg_last - xTrg_first;
  794.             if (blockWidth > 0)
  795.             {
  796.                const uint32_t trgColor = srcLine[x];
  797.                uint32_t *blkLine = trgLine;
  798.  
  799.                xTrg_first = xTrg_last;
  800.  
  801.                for (int blk_y = 0; blk_y < blockHeight; ++blk_y, blkLine = (uint32_t *) BYTE_ADVANCE (blkLine, trgPitch))
  802.                   for (int blk_x = 0; blk_x < blockWidth; ++blk_x)
  803.                      blkLine[blk_x] = trgColor;
  804.  
  805.                trgLine += blockWidth;
  806.             }
  807.          }
  808.       }
  809.    }
  810. #else // going over target image - slow for upscaling, since source is read multiple times missing out on cache! Fast for similar image sizes!
  811.    yFirst = 0;
  812.    yLast = trgHeight;
  813.  
  814.    if (yFirst >= yLast || srcHeight <= 0 || srcWidth <= 0)
  815.       return; // consistency check
  816.  
  817.    for (int y = yFirst; y < yLast; ++y)
  818.    {
  819.       /**/  uint32_t *trgLine = (uint32_t *) BYTE_ADVANCE (trg, y * trgPitch);
  820.       const int ySrc = srcHeight * y / trgHeight;
  821.       const uint32_t *srcLine = (const uint32_t *) BYTE_ADVANCE (src, ySrc * srcPitch);
  822.       for (int x = 0; x < trgWidth; ++x)
  823.       {
  824.          const int xSrc = srcWidth * x / trgWidth;
  825.          trgLine[x] = srcLine[xSrc];
  826.       }
  827.    }
  828. #endif // going over source or target
  829.  
  830.    return;
  831. }
  832.  
  833.  
  834. void xbrz_scale (size_t factor, const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight, bool has_alpha_channel)
  835. {
  836.    if ((factor < 2) || (factor > 6))
  837.       return; // consistency check
  838.  
  839.    scale_image (&scalers[factor - 2], src, trg, srcWidth, srcHeight, 0, srcHeight, (has_alpha_channel ? &color_format_32 : &color_format_24));
  840.    return;
  841. }
  842.