Subversion Repositories Games.Prince of Persia

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. // ****************************************************************************
  2. // * This file is part of the HqMAME project. It is distributed under         *
  3. // * GNU General Public License: https://www.gnu.org/licenses/gpl-3.0         *
  4. // * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved          *
  5. // *                                                                          *
  6. // * Additionally and as a special exception, the author gives permission     *
  7. // * to link the code of this program with the MAME library (or with modified *
  8. // * versions of MAME that use the same license as MAME), and distribute      *
  9. // * linked combinations including the two. You must obey the GNU General     *
  10. // * Public License in all respects for all of the code used other than MAME. *
  11. // * If you modify this file, you may extend this exception to your version   *
  12. // * of the file, but you are not obligated to do so. If you do not wish to   *
  13. // * do so, delete this exception statement from your version.                *
  14. // ****************************************************************************
  15.  
  16.  
  17. #include <cstddef> //size_t
  18. #include <cstdint> //uint32_t
  19. #include <limits>
  20. #include <cassert>
  21. #include <algorithm>
  22. #include <type_traits>
  23. #include <vector>
  24. #include <math.h>
  25.  
  26.  
  // EXTERN_C: expands to extern "C" when this file is compiled as C++, and to nothing otherwise
  #if defined(__cplusplus)
  #  define EXTERN_C extern "C"
  #else // plain C
  #  define EXTERN_C
  #endif // defined(__cplusplus)
  32.  
  33.  
  // scaler configuration
  #define XBRZ_CFG_LUMINANCE_WEIGHT               1   // third argument handed to ColorDistance::dist()
  #define XBRZ_CFG_EQUAL_COLOR_TOLERANCE          30  // two colors count as equal while their distance stays below this
  #define XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD   3.6 // one diagonal must beat the other by this factor for BLEND_DOMINANT
  #define XBRZ_CFG_STEEP_DIRECTION_THRESHOLD      2.2 // factor used to detect shallow/steep lines while blending


  // slice types: tell nearestNeighborScale() whether [yFirst, yLast) counts source rows or target rows
  #define XBRZ_SLICETYPE_SOURCE   1
  #define XBRZ_SLICETYPE_TARGET   2
  44.  
  45.  
  // handy macros: pull a single 8-bit channel out of a packed pixel value
  // (byte 0 = blue, byte 1 = green, byte 2 = red, byte 3 = alpha)
  #define GET_BYTE(val,byteno) ((unsigned char) (((val) >> ((byteno) * 8)) & 0xff))
  #define GET_ALPHA(val) GET_BYTE (val, 3)
  #define GET_RED(val)   GET_BYTE (val, 2)
  #define GET_GREEN(val) GET_BYTE (val, 1)
  #define GET_BLUE(val)  GET_BYTE (val, 0)
  52. //inline uint32_t rgb555to888(uint16_t pix) { return ((pix & 0x7C00) << 9) | ((pix & 0x03E0) << 6) | ((pix & 0x001F) << 3); }
  53. //inline uint32_t rgb565to888(uint16_t pix) { return ((pix & 0xF800) << 8) | ((pix & 0x07E0) << 5) | ((pix & 0x001F) << 3); }
  54. //inline uint16_t rgb888to555(uint32_t pix) { return static_cast<uint16_t>(((pix & 0xF80000) >> 9) | ((pix & 0x00F800) >> 6) | ((pix & 0x0000F8) >> 3)); }
  55. //inline uint16_t rgb888to565(uint32_t pix) { return static_cast<uint16_t>(((pix & 0xF80000) >> 8) | ((pix & 0x00FC00) >> 5) | ((pix & 0x0000F8) >> 3)); }
  56.  
  57.  
  58. namespace xbrz
  59. {
  60.         // -------------------------------------------------------------------------
  61.         // | xBRZ: "Scale by rules" - high quality image upscaling filter by Zenju |
  62.         // -------------------------------------------------------------------------
  63.         // using a modified approach of xBR:
  64.         // http://board.byuu.org/viewtopic.php?f=10&t=2248
  65.         //  - new rule set preserving small image features
  66.         //  - highly optimized for performance
  67.         //  - support alpha channel
  68.         //  - support multithreading
  69.         //  - support 64-bit architectures
  70.         //  - support processing image slices
  71.         //  - support scaling up to 6xBRZ
  72.  
  73.         // -> map source (srcWidth * srcHeight) to target (scale * width x scale * height) image, optionally processing a half-open slice of rows [yFirst, yLast) only
  74.         // -> support for source/target pitch in bytes!
  75.         // -> if your emulator changes only a few image slices during each cycle (e.g. DOSBox) then there's no need to run xBRZ on the complete image:
  76.         //    Just make sure you enlarge the source image slice by 2 rows on top and 2 on bottom (this is the additional range the xBRZ algorithm is using during analysis)
  77.         //    CAVEAT: If there are multiple changed slices, make sure they do not overlap after adding these additional rows in order to avoid a memory race condition
  78.         //    in the target image data if you are using multiple threads for processing each enlarged slice!
  79.         //
  80.         // THREAD-SAFETY: - parts of the same image may be scaled by multiple threads as long as the [yFirst, yLast) ranges do not overlap!
  81.         //                - there is a minor inefficiency for the first row of a slice, so avoid processing single rows only; suggestion: process at least 8-16 rows
  82.  
  83.         void nearestNeighborScale(const uint32_t* src, int srcWidth, int srcHeight, uint32_t* trg, int trgWidth, int trgHeight);
  84.  
  85.  
  // Move a pixel pointer forward (or backward) by a number of BYTES rather than elements,
  // e.g. to step from one image row to the next using a pitch given in bytes.
  // A pointer to const pixels stays a pointer to const pixels.
  template <class Pix> inline Pix* byteAdvance(Pix* ptr, int bytes)
  {
      typedef typename std::remove_cv<Pix>::type PlainPix;
      static_assert(std::is_integral<PlainPix>::value, "Pix* is expected to be cast-able to char*");

      // use "char" for writable pixels and "const char" otherwise, so that no qualifier is cast away
      typedef typename std::conditional<std::is_same<Pix, PlainPix>::value, char, const char>::type RawByte;

      RawByte* const rawPos = reinterpret_cast<RawByte*>(ptr);
      return reinterpret_cast<Pix*>(rawPos + bytes);
  }
  95.  
  96.  
  //paint a rectangle of blockWidth x blockHeight pixels with the given color;
  //"trg" is the top-left pixel of the rectangle, consecutive rows are "pitch" bytes apart
  template <class Pix> inline void fillBlock(Pix* trg, int pitch, Pix col, int blockWidth, int blockHeight)
  {
      Pix* rowStart = trg;
      int rowsDone = 0;

      while (rowsDone < blockHeight)
      {
          int column = 0;
          while (column < blockWidth)
              rowStart[column++] = col;

          rowStart = byteAdvance(rowStart, pitch); //pitch is measured in bytes, not in pixels
          ++rowsDone;
      }
  }
  107.  
  108.  
  109. template <class PixSrc, class PixTrg, class PixConverter>
  110. void nearestNeighborScale(const PixSrc* src, int srcWidth, int srcHeight, int srcPitch,
  111.                           /**/  PixTrg* trg, int trgWidth, int trgHeight, int trgPitch,
  112.                           int slice_type, int yFirst, int yLast, PixConverter pixCvrt /*convert PixSrc to PixTrg*/)
  113. {
  114.     static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*");
  115.     static_assert(std::is_integral<PixTrg>::value, "PixTrg* is expected to be cast-able to char*");
  116.     static_assert(std::is_same<decltype(pixCvrt(PixSrc())), PixTrg>::value, "PixConverter returning wrong pixel format");
  117.  
  118.     if (srcPitch < srcWidth * static_cast<int>(sizeof(PixSrc))  ||
  119.         trgPitch < trgWidth * static_cast<int>(sizeof(PixTrg)))
  120.     {
  121.         assert(false);
  122.         return;
  123.     }
  124.  
  125.     if (slice_type == XBRZ_SLICETYPE_SOURCE)
  126.     {
  127.             //nearest-neighbor (going over source image - fast for upscaling, since source is read only once
  128.             yFirst = std::max(yFirst, 0);
  129.             yLast  = std::min(yLast, srcHeight);
  130.             if (yFirst >= yLast || trgWidth <= 0 || trgHeight <= 0) return;
  131.  
  132.             for (int y = yFirst; y < yLast; ++y)
  133.             {
  134.                 //mathematically: ySrc = floor(srcHeight * yTrg / trgHeight)
  135.                 // => search for integers in: [ySrc, ySrc + 1) * trgHeight / srcHeight
  136.  
  137.                 //keep within for loop to support MT input slices!
  138.                 const int yTrg_first = ( y      * trgHeight + srcHeight - 1) / srcHeight; //=ceil(y * trgHeight / srcHeight)
  139.                 const int yTrg_last  = ((y + 1) * trgHeight + srcHeight - 1) / srcHeight; //=ceil(((y + 1) * trgHeight) / srcHeight)
  140.                 const int blockHeight = yTrg_last - yTrg_first;
  141.  
  142.                 if (blockHeight > 0)
  143.                 {
  144.                     const PixSrc* srcLine = byteAdvance(src, y * srcPitch);
  145.                     /**/  PixTrg* trgLine = byteAdvance(trg, yTrg_first * trgPitch);
  146.                     int xTrg_first = 0;
  147.  
  148.                     for (int x = 0; x < srcWidth; ++x)
  149.                     {
  150.                         const int xTrg_last = ((x + 1) * trgWidth + srcWidth - 1) / srcWidth;
  151.                         const int blockWidth = xTrg_last - xTrg_first;
  152.                         if (blockWidth > 0)
  153.                         {
  154.                             xTrg_first = xTrg_last;
  155.  
  156.                             const auto trgPix = pixCvrt(srcLine[x]);
  157.                             fillBlock(trgLine, trgPitch, trgPix, blockWidth, blockHeight);
  158.                             trgLine += blockWidth;
  159.                         }
  160.                     }
  161.                 }
  162.             }
  163.     }
  164.     else if (slice_type == XBRZ_SLICETYPE_TARGET)
  165.     {
  166.             //nearest-neighbor (going over target image - slow for upscaling, since source is read multiple times missing out on cache! Fast for similar image sizes!)
  167.             yFirst = std::max(yFirst, 0);
  168.             yLast  = std::min(yLast, trgHeight);
  169.             if (yFirst >= yLast || srcHeight <= 0 || srcWidth <= 0) return;
  170.  
  171.             for (int y = yFirst; y < yLast; ++y)
  172.             {
  173.                 PixTrg* trgLine = byteAdvance(trg, y * trgPitch);
  174.                 const int ySrc = srcHeight * y / trgHeight;
  175.                 const PixSrc* srcLine = byteAdvance(src, ySrc * srcPitch);
  176.                 for (int x = 0; x < trgWidth; ++x)
  177.                 {
  178.                     const int xSrc = srcWidth * x / trgWidth;
  179.                     trgLine[x] = pixCvrt(srcLine[xSrc]);
  180.                 }
  181.             }
  182.     }
  183. }
  184. }
  185.  
  186.  
  187.  
  188.  
  189. namespace
  190. {
  191. template <unsigned int M, unsigned int N> inline
  192. uint32_t gradientRGB(uint32_t pixFront, uint32_t pixBack) //blend front color with opacity M / N over opaque background: http://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending
  193. {
  194.     static_assert(0 < M && M < N && N <= 1000, "");
  195.  
  196.     auto calcColor = [](unsigned char colFront, unsigned char colBack) -> unsigned char { return (colFront * M + colBack * (N - M)) / N; };
  197.  
  198.         return ((calcColor (GET_RED   (pixFront), GET_RED   (pixBack)) << 16)
  199.               | (calcColor (GET_GREEN (pixFront), GET_GREEN (pixBack)) <<  8)
  200.               | (calcColor (GET_BLUE  (pixFront), GET_BLUE  (pixBack)) <<  0));
  201. }
  202.  
  203.  
  204. template <unsigned int M, unsigned int N> inline
  205. uint32_t gradientARGB(uint32_t pixFront, uint32_t pixBack) //find intermediate color between two colors with alpha channels (=> NO alpha blending!!!)
  206. {
  207.     static_assert(0 < M && M < N && N <= 1000, "");
  208.  
  209.     const unsigned int weightFront = GET_ALPHA (pixFront) * M;
  210.     const unsigned int weightBack  = GET_ALPHA (pixBack) * (N - M);
  211.     const unsigned int weightSum   = weightFront + weightBack;
  212.     if (weightSum == 0)
  213.         return 0;
  214.  
  215.     auto calcColor = [=](unsigned char colFront, unsigned char colBack)
  216.     {
  217.         return static_cast<unsigned char>((colFront * weightFront + colBack * weightBack) / weightSum);
  218.     };
  219.  
  220.         return (((unsigned char) (weightSum / N))                      << 24)
  221.               | (calcColor (GET_RED   (pixFront), GET_RED   (pixBack)) << 16)
  222.               | (calcColor (GET_GREEN (pixFront), GET_GREEN (pixBack)) <<  8)
  223.               | (calcColor (GET_BLUE  (pixFront), GET_BLUE  (pixBack)) <<  0);
  224. }
  225.  
  226.  
  227. //inline
  228. //double fastSqrt(double n)
  229. //{
  230. //    __asm //speeds up xBRZ by about 9% compared to /*std::*/sqrt which internally uses the same assembler instructions but adds some "fluff"
  231. //    {
  232. //        fld n
  233. //        fsqrt
  234. //    }
  235. //}
  236. //
  237.  
  238.  
  // FORCE_INLINE: compiler-specific spelling of "inline this function"; falls back to plain "inline"
  #if defined(_MSC_VER)
  #  define FORCE_INLINE __forceinline
  #elif defined(__GNUC__)
  #  define FORCE_INLINE __attribute__((always_inline)) inline
  #else
  #  define FORCE_INLINE inline
  #endif
  246.  
  247.  
  //clock-wise rotation in steps of 90 degrees
  enum RotationDegree
  {
      ROT_0   = 0,
      ROT_90  = 1,
      ROT_180 = 2,
      ROT_270 = 3
  };

  //calculate input matrix coordinates after rotation at compile time:
  //(I, J) = (row, col) indices, N = size of the (square) matrix;
  //I_old / J_old are the coordinates of that cell before the rotation!
  //
  //general case: peel off one 90 degree step and ask the next smaller rotation for the rest
  template <RotationDegree rotDeg, size_t I, size_t J, size_t N>
  struct MatrixRotation
  {
  private:
      typedef MatrixRotation<static_cast<RotationDegree>(rotDeg - 1), I, J, N> OneStepLess;

  public:
      static const size_t I_old = N - 1 - OneStepLess::J_old;
      static const size_t J_old =         OneStepLess::I_old;
  };

  //end of the recursion: without rotation the coordinates are unchanged
  template <size_t I, size_t J, size_t N>
  struct MatrixRotation<ROT_0, I, J, N>
  {
      static const size_t I_old = I;
      static const size_t J_old = J;
  };
  273.  
  274.  
  275. template <size_t N, RotationDegree rotDeg>
  276. class OutputMatrix
  277. {
  278. public:
  279.     OutputMatrix(uint32_t* out, int outWidth) : //access matrix area, top-left at position "out" for image with given width
  280.         out_(out),
  281.         outWidth_(outWidth) {}
  282.  
  283.     template <size_t I, size_t J>
  284.     uint32_t& ref() const
  285.     {
  286.         static const size_t I_old = MatrixRotation<rotDeg, I, J, N>::I_old;
  287.         static const size_t J_old = MatrixRotation<rotDeg, I, J, N>::J_old;
  288.         return *(out_ + J_old + I_old * outWidth_);
  289.     }
  290.  
  291. private:
  292.     uint32_t* out_;
  293.     const int outWidth_;
  294. };
  295.  
  296.  
  //value multiplied with itself
  template <class T> inline
  T square(T value)
  {
      const T squared = value * value;
      return squared;
  }
  299.  
  300.  
  301.  
  302. inline
  303. double distRGB(uint32_t pix1, uint32_t pix2)
  304. {
  305.     const double r_diff = static_cast<int>(GET_RED   (pix1)) - GET_RED   (pix2);
  306.     const double g_diff = static_cast<int>(GET_GREEN (pix1)) - GET_GREEN (pix2);
  307.     const double b_diff = static_cast<int>(GET_BLUE  (pix1)) - GET_BLUE  (pix2);
  308.  
  309.     //euklidean RGB distance
  310.     return /*std::*/sqrt(square(r_diff) + square(g_diff) + square(b_diff));
  311. }
  312.  
  313.  
  314. inline
  315. double distYCbCr(uint32_t pix1, uint32_t pix2, double lumaWeight)
  316. {
  317.     //http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
  318.     //YCbCr conversion is a matrix multiplication => take advantage of linearity by subtracting first!
  319.     const int r_diff = static_cast<int>(GET_RED   (pix1)) - GET_RED   (pix2); //we may delay division by 255 to after matrix multiplication
  320.     const int g_diff = static_cast<int>(GET_GREEN (pix1)) - GET_GREEN (pix2); //
  321.     const int b_diff = static_cast<int>(GET_BLUE  (pix1)) - GET_BLUE  (pix2); //substraction for int is noticeable faster than for double!
  322.  
  323.     //const double k_b = 0.0722; //ITU-R BT.709 conversion
  324.     //const double k_r = 0.2126; //
  325.     const double k_b = 0.0593; //ITU-R BT.2020 conversion
  326.     const double k_r = 0.2627; //
  327.     const double k_g = 1 - k_b - k_r;
  328.  
  329.     const double scale_b = 0.5 / (1 - k_b);
  330.     const double scale_r = 0.5 / (1 - k_r);
  331.  
  332.     const double y   = k_r * r_diff + k_g * g_diff + k_b * b_diff; //[!], analog YCbCr!
  333.     const double c_b = scale_b * (b_diff - y);
  334.     const double c_r = scale_r * (r_diff - y);
  335.  
  336.     //we skip division by 255 to have similar range like other distance functions
  337.     return /*std::*/sqrt(square(lumaWeight * y) + square(c_b) + square(c_r));
  338. }
  339.  
  340.  
  341. inline double distYCbCrBuffered(uint32_t pix1, uint32_t pix2)
  342. {
  343.     //30% perf boost compared to plain distYCbCr()!
  344.     //consumes 64 MB memory; using double is only 2% faster, but takes 128 MB
  345.     static const std::vector<float> diffToDist = []
  346.     {
  347.         std::vector<float> tmp;
  348.  
  349.         for (uint32_t i = 0; i < 256 * 256 * 256; ++i) //startup time: 114 ms on Intel Core i5 (four cores)
  350.         {
  351.             const int r_diff = GET_RED (i) * 2 - 0xFF;
  352.             const int g_diff = GET_GREEN (i) * 2 - 0xFF;
  353.             const int b_diff = GET_BLUE (i) * 2 - 0xFF;
  354.  
  355.             const double k_b = 0.0593; //ITU-R BT.2020 conversion
  356.             const double k_r = 0.2627; //
  357.             const double k_g = 1 - k_b - k_r;
  358.  
  359.             const double scale_b = 0.5 / (1 - k_b);
  360.             const double scale_r = 0.5 / (1 - k_r);
  361.  
  362.             const double y   = k_r * r_diff + k_g * g_diff + k_b * b_diff; //[!], analog YCbCr!
  363.             const double c_b = scale_b * (b_diff - y);
  364.             const double c_r = scale_r * (r_diff - y);
  365.  
  366.             tmp.push_back(static_cast<float>(/*std::*/sqrt(square(y) + square(c_b) + square(c_r))));
  367.         }
  368.         return tmp;
  369.     }();
  370.  
  371.     //if (pix1 == pix2) -> 8% perf degradation!
  372.     //    return 0;
  373.     //if (pix1 < pix2)
  374.     //    std::swap(pix1, pix2); -> 30% perf degradation!!!
  375. #if 1
  376.     const int r_diff = static_cast<int>(GET_RED   (pix1)) - GET_RED   (pix2);
  377.     const int g_diff = static_cast<int>(GET_GREEN (pix1)) - GET_GREEN (pix2);
  378.     const int b_diff = static_cast<int>(GET_BLUE  (pix1)) - GET_BLUE  (pix2);
  379.  
  380.     return diffToDist[(((r_diff + 0xFF) / 2) << 16) | //slightly reduce precision (division by 2) to squeeze value into single byte
  381.                       (((g_diff + 0xFF) / 2) <<  8) |
  382.                       (( b_diff + 0xFF) / 2)];
  383. #else //not noticeably faster:
  384.     const int r_diff_tmp = ((pix1 & 0xFF0000) + 0xFF0000 - (pix2 & 0xFF0000)) / 2;
  385.     const int g_diff_tmp = ((pix1 & 0x00FF00) + 0x00FF00 - (pix2 & 0x00FF00)) / 2; //slightly reduce precision (division by 2) to squeeze value into single byte
  386.     const int b_diff_tmp = ((pix1 & 0x0000FF) + 0x0000FF - (pix2 & 0x0000FF)) / 2;
  387.  
  388.     return diffToDist[(r_diff_tmp & 0xFF0000) | (g_diff_tmp & 0x00FF00) | (b_diff_tmp & 0x0000FF)];
  389. #endif
  390. }
  391.  
  392.  
  //strength of the blend request for one corner of a pixel
  enum BlendType
  {
      BLEND_NONE     = 0, //nothing to blend
      BLEND_NORMAL   = 1, //a normal indication to blend
      BLEND_DOMINANT = 2  //a strong indication to blend
      //attention: BlendType must fit into the value range of 2 bit!!!
  };
  400.  
  401. struct BlendResult
  402. {
  403.     BlendType
  404.     /**/blend_f, blend_g,
  405.     /**/blend_j, blend_k;
  406. };
  407.  
  408.  
  //kernel for preprocessing step: 4 x 4 pixels, named row by row from "a" (top-left) to "p" (bottom-right)
  struct Kernel_4x4
  {
      uint32_t a, b, c, d; //1st row
      uint32_t e, f, g, h; //2nd row
      uint32_t i, j, k, l; //3rd row
      uint32_t m, n, o, p; //4th row
  };
  417.  
  418. /*
  419. input kernel area naming convention:
  420. -----------------
  421. | A | B | C | D |
  422. ----|---|---|---|
  423. | E | F | G | H |   //evaluate the four corners between F, G, J, K
  424. ----|---|---|---|   //input pixel is at position F
  425. | I | J | K | L |
  426. ----|---|---|---|
  427. | M | N | O | P |
  428. -----------------
  429. */
  430. template <class ColorDistance>
  431. FORCE_INLINE //detect blend direction
  432. BlendResult preProcessCorners(const Kernel_4x4& ker) //result: F, G, J, K corners of "GradientType"
  433. {
  434.     BlendResult result = {};
  435.  
  436.     if ((ker.f == ker.g &&
  437.          ker.j == ker.k) ||
  438.         (ker.f == ker.j &&
  439.          ker.g == ker.k))
  440.         return result;
  441.  
  442.     auto dist = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, XBRZ_CFG_LUMINANCE_WEIGHT); };
  443.  
  444.     const int weight = 4;
  445.     double jg = dist(ker.i, ker.f) + dist(ker.f, ker.c) + dist(ker.n, ker.k) + dist(ker.k, ker.h) + weight * dist(ker.j, ker.g);
  446.     double fk = dist(ker.e, ker.j) + dist(ker.j, ker.o) + dist(ker.b, ker.g) + dist(ker.g, ker.l) + weight * dist(ker.f, ker.k);
  447.  
  448.     if (jg < fk) //test sample: 70% of values max(jg, fk) / min(jg, fk) are between 1.1 and 3.7 with median being 1.8
  449.     {
  450.         const bool dominantGradient = XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD * jg < fk;
  451.         if (ker.f != ker.g && ker.f != ker.j)
  452.             result.blend_f = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  453.  
  454.         if (ker.k != ker.j && ker.k != ker.g)
  455.             result.blend_k = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  456.     }
  457.     else if (fk < jg)
  458.     {
  459.         const bool dominantGradient = XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD * fk < jg;
  460.         if (ker.j != ker.f && ker.j != ker.k)
  461.             result.blend_j = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  462.  
  463.         if (ker.g != ker.f && ker.g != ker.k)
  464.             result.blend_g = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  465.     }
  466.     return result;
  467. }
  468.  
  //3 x 3 pixel neighborhood, named row by row from "a" (top-left) to "i" (bottom-right)
  struct Kernel_3x3
  {
      uint32_t a, b, c; //1st row
      uint32_t d, e, f; //2nd row
      uint32_t g, h, i; //3rd row
  };
  476.  
  477. #define DEF_GETTER(x) template <RotationDegree rotDeg> uint32_t inline get_##x(const Kernel_3x3& ker) { return ker.x; }
  478. //we cannot and NEED NOT write "ker.##x" since ## concatenates preprocessor tokens but "." is not a token
  479. DEF_GETTER(a) DEF_GETTER(b) DEF_GETTER(c)
  480. DEF_GETTER(d) DEF_GETTER(e) DEF_GETTER(f)
  481. DEF_GETTER(g) DEF_GETTER(h) DEF_GETTER(i)
  482. #undef DEF_GETTER
  483.  
  484. #define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_90>(const Kernel_3x3& ker) { return ker.y; }
  485. DEF_GETTER(a, g) DEF_GETTER(b, d) DEF_GETTER(c, a)
  486. DEF_GETTER(d, h) DEF_GETTER(e, e) DEF_GETTER(f, b)
  487. DEF_GETTER(g, i) DEF_GETTER(h, f) DEF_GETTER(i, c)
  488. #undef DEF_GETTER
  489.  
  490. #define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_180>(const Kernel_3x3& ker) { return ker.y; }
  491. DEF_GETTER(a, i) DEF_GETTER(b, h) DEF_GETTER(c, g)
  492. DEF_GETTER(d, f) DEF_GETTER(e, e) DEF_GETTER(f, d)
  493. DEF_GETTER(g, c) DEF_GETTER(h, b) DEF_GETTER(i, a)
  494. #undef DEF_GETTER
  495.  
  496. #define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_270>(const Kernel_3x3& ker) { return ker.y; }
  497. DEF_GETTER(a, c) DEF_GETTER(b, f) DEF_GETTER(c, i)
  498. DEF_GETTER(d, b) DEF_GETTER(e, e) DEF_GETTER(f, h)
  499. DEF_GETTER(g, a) DEF_GETTER(h, d) DEF_GETTER(i, g)
  500. #undef DEF_GETTER
  501.  
  502.  
  503. //compress four blend types into a single byte
  504. inline BlendType getTopL   (unsigned char b) { return static_cast<BlendType>(0x3 & b); }
  505. inline BlendType getTopR   (unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 2)); }
  506. inline BlendType getBottomR(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 4)); }
  507. inline BlendType getBottomL(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 6)); }
  508.  
  509. inline void setTopL   (unsigned char& b, BlendType bt) { b |= bt; } //buffer is assumed to be initialized before preprocessing!
  510. inline void setTopR   (unsigned char& b, BlendType bt) { b |= (bt << 2); }
  511. inline void setBottomR(unsigned char& b, BlendType bt) { b |= (bt << 4); }
  512. inline void setBottomL(unsigned char& b, BlendType bt) { b |= (bt << 6); }
  513.  
  514. inline bool blendingNeeded(unsigned char b) { return b != 0; }
  515.  
  516. template <RotationDegree rotDeg> inline
  517. unsigned char rotateBlendInfo(unsigned char b) { return b; }
  518. template <> inline unsigned char rotateBlendInfo<ROT_90 >(unsigned char b) { return ((b << 2) | (b >> 6)) & 0xff; }
  519. template <> inline unsigned char rotateBlendInfo<ROT_180>(unsigned char b) { return ((b << 4) | (b >> 4)) & 0xff; }
  520. template <> inline unsigned char rotateBlendInfo<ROT_270>(unsigned char b) { return ((b << 6) | (b >> 2)) & 0xff; }
  521.  
  522.  
  523. /*
  524. input kernel area naming convention:
  525. -------------
  526. | A | B | C |
  527. ----|---|---|
  528. | D | E | F | //input pixel is at position E
  529. ----|---|---|
  530. | G | H | I |
  531. -------------
  532. */
  533. template <class Scaler, class ColorDistance, RotationDegree rotDeg>
  534. FORCE_INLINE //perf: quite worth it!
  535. void blendPixel(const Kernel_3x3& ker,
  536.                 uint32_t* target, int trgWidth,
  537.                 unsigned char blendInfo) //result of preprocessing all four corners of pixel "e"
  538. {
  539. #define a get_a<rotDeg>(ker)
  540. #define b get_b<rotDeg>(ker)
  541. #define c get_c<rotDeg>(ker)
  542. #define d get_d<rotDeg>(ker)
  543. #define e get_e<rotDeg>(ker)
  544. #define f get_f<rotDeg>(ker)
  545. #define g get_g<rotDeg>(ker)
  546. #define h get_h<rotDeg>(ker)
  547. #define i get_i<rotDeg>(ker)
  548.  
  549.     const unsigned char blend = rotateBlendInfo<rotDeg>(blendInfo);
  550.  
  551.     if (getBottomR(blend) >= BLEND_NORMAL)
  552.     {
  553.         auto eq   = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, XBRZ_CFG_LUMINANCE_WEIGHT) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE; };
  554.         auto dist = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, XBRZ_CFG_LUMINANCE_WEIGHT); };
  555.  
  556.         const bool doLineBlend = [&]() -> bool
  557.         {
  558.             if (getBottomR(blend) >= BLEND_DOMINANT)
  559.                 return true;
  560.  
  561.             //make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
  562.             if (getTopR(blend) != BLEND_NONE && !eq(e, g)) //but support double-blending for 90° corners
  563.                 return false;
  564.             if (getBottomL(blend) != BLEND_NONE && !eq(e, c))
  565.                 return false;
  566.  
  567.             //no full blending for L-shapes; blend corner only (handles "mario mushroom eyes")
  568.             if (!eq(e, i) && eq(g, h) && eq(h, i) && eq(i, f) && eq(f, c))
  569.                 return false;
  570.  
  571.             return true;
  572.         }();
  573.  
  574.         const uint32_t px = dist(e, f) <= dist(e, h) ? f : h; //choose most similar color
  575.  
  576.         OutputMatrix<Scaler::scale, rotDeg> out(target, trgWidth);
  577.  
  578.         if (doLineBlend)
  579.         {
  580.             const double fg = dist(f, g); //test sample: 70% of values max(fg, hc) / min(fg, hc) are between 1.1 and 3.7 with median being 1.9
  581.             const double hc = dist(h, c); //
  582.  
  583.             const bool haveShallowLine = XBRZ_CFG_STEEP_DIRECTION_THRESHOLD * fg <= hc && e != g && d != g;
  584.             const bool haveSteepLine   = XBRZ_CFG_STEEP_DIRECTION_THRESHOLD * hc <= fg && e != c && b != c;
  585.  
  586.             if (haveShallowLine)
  587.             {
  588.                 if (haveSteepLine)
  589.                     Scaler::blendLineSteepAndShallow(px, out);
  590.                 else
  591.                     Scaler::blendLineShallow(px, out);
  592.             }
  593.             else
  594.             {
  595.                 if (haveSteepLine)
  596.                     Scaler::blendLineSteep(px, out);
  597.                 else
  598.                     Scaler::blendLineDiagonal(px, out);
  599.             }
  600.         }
  601.         else
  602.             Scaler::blendCorner(px, out);
  603.     }
  604.  
  605. #undef a
  606. #undef b
  607. #undef c
  608. #undef d
  609. #undef e
  610. #undef f
  611. #undef g
  612. #undef h
  613. #undef i
  614. }
  615.  
  616.  
  617. template <class Scaler, class ColorDistance> //scaler policy: see "Scaler2x" reference implementation
  618. void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight, int yFirst, int yLast)
  619. {
  620.     yFirst = std::max(yFirst, 0);
  621.     yLast  = std::min(yLast, srcHeight);
  622.     if (yFirst >= yLast || srcWidth <= 0)
  623.         return;
  624.  
  625.     const int trgWidth = srcWidth * Scaler::scale;
  626.  
  627.     //"use" space at the end of the image as temporary buffer for "on the fly preprocessing": we even could use larger area of
  628.     //"sizeof(uint32_t) * srcWidth * (yLast - yFirst)" bytes without risk of accidental overwriting before accessing
  629.     const int bufferSize = srcWidth;
  630.     unsigned char* preProcBuffer = reinterpret_cast<unsigned char*>(trg + yLast * Scaler::scale * trgWidth) - bufferSize;
  631.     std::fill(preProcBuffer, preProcBuffer + bufferSize, '\0');
  632.     static_assert(BLEND_NONE == 0, "");
  633.  
  634.     //initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending
  635.     //this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition!
  636.     if (yFirst > 0)
  637.     {
  638.         const int y = yFirst - 1;
  639.  
  640.         const uint32_t* s_m1 = src + srcWidth * std::max(y - 1, 0);
  641.         const uint32_t* s_0  = src + srcWidth * y; //center line
  642.         const uint32_t* s_p1 = src + srcWidth * std::min(y + 1, srcHeight - 1);
  643.         const uint32_t* s_p2 = src + srcWidth * std::min(y + 2, srcHeight - 1);
  644.  
  645.         for (int x = 0; x < srcWidth; ++x)
  646.         {
  647.             const int x_m1 = std::max(x - 1, 0);
  648.             const int x_p1 = std::min(x + 1, srcWidth - 1);
  649.             const int x_p2 = std::min(x + 2, srcWidth - 1);
  650.  
  651.             Kernel_4x4 ker = {}; //perf: initialization is negligible
  652.             ker.a = s_m1[x_m1]; //read sequentially from memory as far as possible
  653.             ker.b = s_m1[x];
  654.             ker.c = s_m1[x_p1];
  655.             ker.d = s_m1[x_p2];
  656.  
  657.             ker.e = s_0[x_m1];
  658.             ker.f = s_0[x];
  659.             ker.g = s_0[x_p1];
  660.             ker.h = s_0[x_p2];
  661.  
  662.             ker.i = s_p1[x_m1];
  663.             ker.j = s_p1[x];
  664.             ker.k = s_p1[x_p1];
  665.             ker.l = s_p1[x_p2];
  666.  
  667.             ker.m = s_p2[x_m1];
  668.             ker.n = s_p2[x];
  669.             ker.o = s_p2[x_p1];
  670.             ker.p = s_p2[x_p2];
  671.  
  672.             const BlendResult res = preProcessCorners<ColorDistance>(ker);
  673.             /*
  674.             preprocessing blend result:
  675.             ---------
  676.             | F | G |   //evalute corner between F, G, J, K
  677.             ----|---|   //input pixel is at position F
  678.             | J | K |
  679.             ---------
  680.             */
  681.             setTopR(preProcBuffer[x], res.blend_j);
  682.  
  683.             if (x + 1 < bufferSize)
  684.                 setTopL(preProcBuffer[x + 1], res.blend_k);
  685.         }
  686.     }
  687.     //------------------------------------------------------------------------------------
  688.  
  689.     for (int y = yFirst; y < yLast; ++y)
  690.     {
  691.         uint32_t* out = trg + Scaler::scale * y * trgWidth; //consider MT "striped" access
  692.  
  693.         const uint32_t* s_m1 = src + srcWidth * std::max(y - 1, 0);
  694.         const uint32_t* s_0  = src + srcWidth * y; //center line
  695.         const uint32_t* s_p1 = src + srcWidth * std::min(y + 1, srcHeight - 1);
  696.         const uint32_t* s_p2 = src + srcWidth * std::min(y + 2, srcHeight - 1);
  697.  
  698.         unsigned char blend_xy1 = 0; //corner blending for current (x, y + 1) position
  699.  
  700.         for (int x = 0; x < srcWidth; ++x, out += Scaler::scale)
  701.         {
  702.             //all those bounds checks have only insignificant impact on performance!
  703.             const int x_m1 = std::max(x - 1, 0); //perf: prefer array indexing to additional pointers!
  704.             const int x_p1 = std::min(x + 1, srcWidth - 1);
  705.             const int x_p2 = std::min(x + 2, srcWidth - 1);
  706.  
  707.             Kernel_4x4 ker4 = {}; //perf: initialization is negligible
  708.  
  709.             ker4.a = s_m1[x_m1]; //read sequentially from memory as far as possible
  710.             ker4.b = s_m1[x];
  711.             ker4.c = s_m1[x_p1];
  712.             ker4.d = s_m1[x_p2];
  713.  
  714.             ker4.e = s_0[x_m1];
  715.             ker4.f = s_0[x];
  716.             ker4.g = s_0[x_p1];
  717.             ker4.h = s_0[x_p2];
  718.  
  719.             ker4.i = s_p1[x_m1];
  720.             ker4.j = s_p1[x];
  721.             ker4.k = s_p1[x_p1];
  722.             ker4.l = s_p1[x_p2];
  723.  
  724.             ker4.m = s_p2[x_m1];
  725.             ker4.n = s_p2[x];
  726.             ker4.o = s_p2[x_p1];
  727.             ker4.p = s_p2[x_p2];
  728.  
  729.             //evaluate the four corners on bottom-right of current pixel
  730.             unsigned char blend_xy = 0; //for current (x, y) position
  731.             {
  732.                 const BlendResult res = preProcessCorners<ColorDistance>(ker4);
  733.                 /*
  734.                 preprocessing blend result:
  735.                 ---------
  736.                 | F | G |   //evalute corner between F, G, J, K
  737.                 ----|---|   //current input pixel is at position F
  738.                 | J | K |
  739.                 ---------
  740.                 */
  741.                 blend_xy = preProcBuffer[x];
  742.                 setBottomR(blend_xy, res.blend_f); //all four corners of (x, y) have been determined at this point due to processing sequence!
  743.  
  744.                 setTopR(blend_xy1, res.blend_j); //set 2nd known corner for (x, y + 1)
  745.                 preProcBuffer[x] = blend_xy1; //store on current buffer position for use on next row
  746.  
  747.                 blend_xy1 = 0;
  748.                 setTopL(blend_xy1, res.blend_k); //set 1st known corner for (x + 1, y + 1) and buffer for use on next column
  749.  
  750.                 if (x + 1 < bufferSize) //set 3rd known corner for (x + 1, y)
  751.                     setBottomL(preProcBuffer[x + 1], res.blend_g);
  752.             }
  753.  
  754.             //fill block of size scale * scale with the given color
  755.             xbrz::fillBlock(out, trgWidth * sizeof(uint32_t), ker4.f, Scaler::scale, Scaler::scale);
  756.             //place *after* preprocessing step, to not overwrite the results while processing the the last pixel!
  757.  
  758.             //blend four corners of current pixel
  759.             if (blendingNeeded(blend_xy)) //good 5% perf-improvement
  760.             {
  761.                 Kernel_3x3 ker3 = {}; //perf: initialization is negligible
  762.  
  763.                 ker3.a = ker4.a;
  764.                 ker3.b = ker4.b;
  765.                 ker3.c = ker4.c;
  766.  
  767.                 ker3.d = ker4.e;
  768.                 ker3.e = ker4.f;
  769.                 ker3.f = ker4.g;
  770.  
  771.                 ker3.g = ker4.i;
  772.                 ker3.h = ker4.j;
  773.                 ker3.i = ker4.k;
  774.  
  775.                 blendPixel<Scaler, ColorDistance, ROT_0  >(ker3, out, trgWidth, blend_xy);
  776.                 blendPixel<Scaler, ColorDistance, ROT_90 >(ker3, out, trgWidth, blend_xy);
  777.                 blendPixel<Scaler, ColorDistance, ROT_180>(ker3, out, trgWidth, blend_xy);
  778.                 blendPixel<Scaler, ColorDistance, ROT_270>(ker3, out, trgWidth, blend_xy);
  779.             }
  780.         }
  781.     }
  782. }
  783.  
  784.  
  785. //------------------------------------------------------------------------------------
  786. template <class ColorGradient> struct Scaler2x : public ColorGradient
  787. {
  788.     static const int scale = 2;
  789.  
  790.     template <unsigned int M, unsigned int N> //bring template function into scope for GCC
  791.     static void alphaGrad(uint32_t& pixBack, uint32_t pixFront) { ColorGradient::template alphaGrad<M, N>(pixBack, pixFront); }
  792.  
  793.  
  794.     template <class OutputMatrix>
  795.     static void blendLineShallow(uint32_t col, OutputMatrix& out)
  796.     {
  797.         alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
  798.         alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
  799.     }
  800.  
  801.     template <class OutputMatrix>
  802.     static void blendLineSteep(uint32_t col, OutputMatrix& out)
  803.     {
  804.         alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col);
  805.         alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);
  806.     }
  807.  
  808.     template <class OutputMatrix>
  809.     static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
  810.     {
  811.         alphaGrad<1, 4>(out.template ref<1, 0>(), col);
  812.         alphaGrad<1, 4>(out.template ref<0, 1>(), col);
  813.         alphaGrad<5, 6>(out.template ref<1, 1>(), col); //[!] fixes 7/8 used in xBR
  814.     }
  815.  
  816.     template <class OutputMatrix>
  817.     static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
  818.     {
  819.         alphaGrad<1, 2>(out.template ref<1, 1>(), col);
  820.     }
  821.  
  822.     template <class OutputMatrix>
  823.     static void blendCorner(uint32_t col, OutputMatrix& out)
  824.     {
  825.         //model a round corner
  826.         alphaGrad<21, 100>(out.template ref<1, 1>(), col); //exact: 1 - pi/4 = 0.2146018366
  827.     }
  828. };
  829.  
  830.  
  831. template <class ColorGradient> struct Scaler3x : public ColorGradient
  832. {
  833.     static const int scale = 3;
  834.  
  835.     template <unsigned int M, unsigned int N> //bring template function into scope for GCC
  836.     static void alphaGrad(uint32_t& pixBack, uint32_t pixFront) { ColorGradient::template alphaGrad<M, N>(pixBack, pixFront); }
  837.  
  838.  
  839.     template <class OutputMatrix>
  840.     static void blendLineShallow(uint32_t col, OutputMatrix& out)
  841.     {
  842.         alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
  843.         alphaGrad<1, 4>(out.template ref<scale - 2, 2>(), col);
  844.  
  845.         alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
  846.         out.template ref<scale - 1, 2>() = col;
  847.     }
  848.  
  849.     template <class OutputMatrix>
  850.     static void blendLineSteep(uint32_t col, OutputMatrix& out)
  851.     {
  852.         alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col);
  853.         alphaGrad<1, 4>(out.template ref<2, scale - 2>(), col);
  854.  
  855.         alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);
  856.         out.template ref<2, scale - 1>() = col;
  857.     }
  858.  
  859.     template <class OutputMatrix>
  860.     static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
  861.     {
  862.         alphaGrad<1, 4>(out.template ref<2, 0>(), col);
  863.         alphaGrad<1, 4>(out.template ref<0, 2>(), col);
  864.         alphaGrad<3, 4>(out.template ref<2, 1>(), col);
  865.         alphaGrad<3, 4>(out.template ref<1, 2>(), col);
  866.         out.template ref<2, 2>() = col;
  867.     }
  868.  
  869.     template <class OutputMatrix>
  870.     static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
  871.     {
  872.         alphaGrad<1, 8>(out.template ref<1, 2>(), col); //conflict with other rotations for this odd scale
  873.         alphaGrad<1, 8>(out.template ref<2, 1>(), col);
  874.         alphaGrad<7, 8>(out.template ref<2, 2>(), col); //
  875.     }
  876.  
  877.     template <class OutputMatrix>
  878.     static void blendCorner(uint32_t col, OutputMatrix& out)
  879.     {
  880.         //model a round corner
  881.         alphaGrad<45, 100>(out.template ref<2, 2>(), col); //exact: 0.4545939598
  882.         //alphaGrad<7, 256>(out.template ref<2, 1>(), col); //0.02826017254 -> negligible + avoid conflicts with other rotations for this odd scale
  883.         //alphaGrad<7, 256>(out.template ref<1, 2>(), col); //0.02826017254
  884.     }
  885. };
  886.  
  887.  
  888. template <class ColorGradient> struct Scaler4x : public ColorGradient
  889. {
  890.     static const int scale = 4;
  891.  
  892.     template <unsigned int M, unsigned int N> //bring template function into scope for GCC
  893.     static void alphaGrad(uint32_t& pixBack, uint32_t pixFront) { ColorGradient::template alphaGrad<M, N>(pixBack, pixFront); }
  894.  
  895.  
  896.     template <class OutputMatrix>
  897.     static void blendLineShallow(uint32_t col, OutputMatrix& out)
  898.     {
  899.         alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
  900.         alphaGrad<1, 4>(out.template ref<scale - 2, 2>(), col);
  901.  
  902.         alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
  903.         alphaGrad<3, 4>(out.template ref<scale - 2, 3>(), col);
  904.  
  905.         out.template ref<scale - 1, 2>() = col;
  906.         out.template ref<scale - 1, 3>() = col;
  907.     }
  908.  
  909.     template <class OutputMatrix>
  910.     static void blendLineSteep(uint32_t col, OutputMatrix& out)
  911.     {
  912.         alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col);
  913.         alphaGrad<1, 4>(out.template ref<2, scale - 2>(), col);
  914.  
  915.         alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);
  916.         alphaGrad<3, 4>(out.template ref<3, scale - 2>(), col);
  917.  
  918.         out.template ref<2, scale - 1>() = col;
  919.         out.template ref<3, scale - 1>() = col;
  920.     }
  921.  
  922.     template <class OutputMatrix>
  923.     static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
  924.     {
  925.         alphaGrad<3, 4>(out.template ref<3, 1>(), col);
  926.         alphaGrad<3, 4>(out.template ref<1, 3>(), col);
  927.         alphaGrad<1, 4>(out.template ref<3, 0>(), col);
  928.         alphaGrad<1, 4>(out.template ref<0, 3>(), col);
  929.  
  930.         alphaGrad<1, 3>(out.template ref<2, 2>(), col); //[!] fixes 1/4 used in xBR
  931.  
  932.         out.template ref<3, 3>() = col;
  933.         out.template ref<3, 2>() = col;
  934.         out.template ref<2, 3>() = col;
  935.     }
  936.  
  937.     template <class OutputMatrix>
  938.     static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
  939.     {
  940.         alphaGrad<1, 2>(out.template ref<scale - 1, scale / 2    >(), col);
  941.         alphaGrad<1, 2>(out.template ref<scale - 2, scale / 2 + 1>(), col);
  942.         out.template ref<scale - 1, scale - 1>() = col;
  943.     }
  944.  
  945.     template <class OutputMatrix>
  946.     static void blendCorner(uint32_t col, OutputMatrix& out)
  947.     {
  948.         //model a round corner
  949.         alphaGrad<68, 100>(out.template ref<3, 3>(), col); //exact: 0.6848532563
  950.         alphaGrad< 9, 100>(out.template ref<3, 2>(), col); //0.08677704501
  951.         alphaGrad< 9, 100>(out.template ref<2, 3>(), col); //0.08677704501
  952.     }
  953. };
  954.  
  955.  
  956. template <class ColorGradient> struct Scaler5x : public ColorGradient
  957. {
  958.     static const int scale = 5;
  959.  
  960.     template <unsigned int M, unsigned int N> //bring template function into scope for GCC
  961.     static void alphaGrad(uint32_t& pixBack, uint32_t pixFront) { ColorGradient::template alphaGrad<M, N>(pixBack, pixFront); }
  962.  
  963.  
  964.     template <class OutputMatrix>
  965.     static void blendLineShallow(uint32_t col, OutputMatrix& out)
  966.     {
  967.         alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
  968.         alphaGrad<1, 4>(out.template ref<scale - 2, 2>(), col);
  969.         alphaGrad<1, 4>(out.template ref<scale - 3, 4>(), col);
  970.  
  971.         alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
  972.         alphaGrad<3, 4>(out.template ref<scale - 2, 3>(), col);
  973.  
  974.         out.template ref<scale - 1, 2>() = col;
  975.         out.template ref<scale - 1, 3>() = col;
  976.         out.template ref<scale - 1, 4>() = col;
  977.         out.template ref<scale - 2, 4>() = col;
  978.     }
  979.  
  980.     template <class OutputMatrix>
  981.     static void blendLineSteep(uint32_t col, OutputMatrix& out)
  982.     {
  983.         alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col);
  984.         alphaGrad<1, 4>(out.template ref<2, scale - 2>(), col);
  985.         alphaGrad<1, 4>(out.template ref<4, scale - 3>(), col);
  986.  
  987.         alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);
  988.         alphaGrad<3, 4>(out.template ref<3, scale - 2>(), col);
  989.  
  990.         out.template ref<2, scale - 1>() = col;
  991.         out.template ref<3, scale - 1>() = col;
  992.         out.template ref<4, scale - 1>() = col;
  993.         out.template ref<4, scale - 2>() = col;
  994.     }
  995.  
  996.     template <class OutputMatrix>
  997.     static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
  998.     {
  999.         alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col);
  1000.         alphaGrad<1, 4>(out.template ref<2, scale - 2>(), col);
  1001.         alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);
  1002.  
  1003.         alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
  1004.         alphaGrad<1, 4>(out.template ref<scale - 2, 2>(), col);
  1005.         alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
  1006.  
  1007.         alphaGrad<2, 3>(out.template ref<3, 3>(), col);
  1008.  
  1009.         out.template ref<2, scale - 1>() = col;
  1010.         out.template ref<3, scale - 1>() = col;
  1011.         out.template ref<4, scale - 1>() = col;
  1012.  
  1013.         out.template ref<scale - 1, 2>() = col;
  1014.         out.template ref<scale - 1, 3>() = col;
  1015.     }
  1016.  
  1017.     template <class OutputMatrix>
  1018.     static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
  1019.     {
  1020.         alphaGrad<1, 8>(out.template ref<scale - 1, scale / 2    >(), col); //conflict with other rotations for this odd scale
  1021.         alphaGrad<1, 8>(out.template ref<scale - 2, scale / 2 + 1>(), col);
  1022.         alphaGrad<1, 8>(out.template ref<scale - 3, scale / 2 + 2>(), col); //
  1023.  
  1024.         alphaGrad<7, 8>(out.template ref<4, 3>(), col);
  1025.         alphaGrad<7, 8>(out.template ref<3, 4>(), col);
  1026.  
  1027.         out.template ref<4, 4>() = col;
  1028.     }
  1029.  
  1030.     template <class OutputMatrix>
  1031.     static void blendCorner(uint32_t col, OutputMatrix& out)
  1032.     {
  1033.         // model a round corner
  1034.         alphaGrad<86, 100>(out.template ref<4, 4>(), col); //exact: 0.8631434088
  1035.         alphaGrad<23, 100>(out.template ref<4, 3>(), col); //0.2306749731
  1036.         alphaGrad<23, 100>(out.template ref<3, 4>(), col); //0.2306749731
  1037.         //alphaGrad<1, 64>(out.template ref<4, 2>(), col); //0.01676812367 -> negligible + avoid conflicts with other rotations for this odd scale
  1038.         //alphaGrad<1, 64>(out.template ref<2, 4>(), col); //0.01676812367
  1039.     }
  1040. };
  1041.  
  1042.  
  1043. template <class ColorGradient> struct Scaler6x : public ColorGradient
  1044. {
  1045.     static const int scale = 6;
  1046.  
  1047.     template <unsigned int M, unsigned int N> //bring template function into scope for GCC
  1048.     static void alphaGrad(uint32_t& pixBack, uint32_t pixFront) { ColorGradient::template alphaGrad<M, N>(pixBack, pixFront); }
  1049.  
  1050.  
  1051.     template <class OutputMatrix>
  1052.     static void blendLineShallow(uint32_t col, OutputMatrix& out)
  1053.     {
  1054.         alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
  1055.         alphaGrad<1, 4>(out.template ref<scale - 2, 2>(), col);
  1056.         alphaGrad<1, 4>(out.template ref<scale - 3, 4>(), col);
  1057.  
  1058.         alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
  1059.         alphaGrad<3, 4>(out.template ref<scale - 2, 3>(), col);
  1060.         alphaGrad<3, 4>(out.template ref<scale - 3, 5>(), col);
  1061.  
  1062.         out.template ref<scale - 1, 2>() = col;
  1063.         out.template ref<scale - 1, 3>() = col;
  1064.         out.template ref<scale - 1, 4>() = col;
  1065.         out.template ref<scale - 1, 5>() = col;
  1066.  
  1067.         out.template ref<scale - 2, 4>() = col;
  1068.         out.template ref<scale - 2, 5>() = col;
  1069.     }
  1070.  
  1071.     template <class OutputMatrix>
  1072.     static void blendLineSteep(uint32_t col, OutputMatrix& out)
  1073.     {
  1074.         alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col);
  1075.         alphaGrad<1, 4>(out.template ref<2, scale - 2>(), col);
  1076.         alphaGrad<1, 4>(out.template ref<4, scale - 3>(), col);
  1077.  
  1078.         alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);
  1079.         alphaGrad<3, 4>(out.template ref<3, scale - 2>(), col);
  1080.         alphaGrad<3, 4>(out.template ref<5, scale - 3>(), col);
  1081.  
  1082.         out.template ref<2, scale - 1>() = col;
  1083.         out.template ref<3, scale - 1>() = col;
  1084.         out.template ref<4, scale - 1>() = col;
  1085.         out.template ref<5, scale - 1>() = col;
  1086.  
  1087.         out.template ref<4, scale - 2>() = col;
  1088.         out.template ref<5, scale - 2>() = col;
  1089.     }
  1090.  
  1091.     template <class OutputMatrix>
  1092.     static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
  1093.     {
  1094.         alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col);
  1095.         alphaGrad<1, 4>(out.template ref<2, scale - 2>(), col);
  1096.         alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);
  1097.         alphaGrad<3, 4>(out.template ref<3, scale - 2>(), col);
  1098.  
  1099.         alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
  1100.         alphaGrad<1, 4>(out.template ref<scale - 2, 2>(), col);
  1101.         alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
  1102.         alphaGrad<3, 4>(out.template ref<scale - 2, 3>(), col);
  1103.  
  1104.         out.template ref<2, scale - 1>() = col;
  1105.         out.template ref<3, scale - 1>() = col;
  1106.         out.template ref<4, scale - 1>() = col;
  1107.         out.template ref<5, scale - 1>() = col;
  1108.  
  1109.         out.template ref<4, scale - 2>() = col;
  1110.         out.template ref<5, scale - 2>() = col;
  1111.  
  1112.         out.template ref<scale - 1, 2>() = col;
  1113.         out.template ref<scale - 1, 3>() = col;
  1114.     }
  1115.  
  1116.     template <class OutputMatrix>
  1117.     static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
  1118.     {
  1119.         alphaGrad<1, 2>(out.template ref<scale - 1, scale / 2    >(), col);
  1120.         alphaGrad<1, 2>(out.template ref<scale - 2, scale / 2 + 1>(), col);
  1121.         alphaGrad<1, 2>(out.template ref<scale - 3, scale / 2 + 2>(), col);
  1122.  
  1123.         out.template ref<scale - 2, scale - 1>() = col;
  1124.         out.template ref<scale - 1, scale - 1>() = col;
  1125.         out.template ref<scale - 1, scale - 2>() = col;
  1126.     }
  1127.  
  1128.     template <class OutputMatrix>
  1129.     static void blendCorner(uint32_t col, OutputMatrix& out)
  1130.     {
  1131.         //model a round corner
  1132.         alphaGrad<97, 100>(out.template ref<5, 5>(), col); //exact: 0.9711013910
  1133.         alphaGrad<42, 100>(out.template ref<4, 5>(), col); //0.4236372243
  1134.         alphaGrad<42, 100>(out.template ref<5, 4>(), col); //0.4236372243
  1135.         alphaGrad< 6, 100>(out.template ref<5, 3>(), col); //0.05652034508
  1136.         alphaGrad< 6, 100>(out.template ref<3, 5>(), col); //0.05652034508
  1137.     }
  1138. };
  1139.  
  1140.         //------------------------------------------------------------------------------------
  1141.         struct ColorDistanceRGB
  1142.         {
  1143.             static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
  1144.             {
  1145.                 return distYCbCrBuffered(pix1, pix2);
  1146.  
  1147.                 //if (pix1 == pix2) //about 4% perf boost
  1148.                 //    return 0;
  1149.                 //return distYCbCr(pix1, pix2, luminanceWeight);
  1150.             }
  1151.         };
  1152.  
  1153.         struct ColorDistanceARGB
  1154.         {
  1155.             static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
  1156.             {
  1157.                 const double a1 = GET_ALPHA (pix1) / 255.0 ;
  1158.                 const double a2 = GET_ALPHA (pix2) / 255.0 ;
  1159.                 /*
  1160.                 Requirements for a color distance handling alpha channel: with a1, a2 in [0, 1]
  1161.        
  1162.                     1. if a1 = a2, distance should be: a1 * distYCbCr()
  1163.                     2. if a1 = 0,  distance should be: a2 * distYCbCr(black, white) = a2 * 255
  1164.                     3. if a1 = 1,  ??? maybe: 255 * (1 - a2) + a2 * distYCbCr()
  1165.                 */
  1166.  
  1167.                 //return std::min(a1, a2) * distYCbCrBuffered(pix1, pix2) + 255 * abs(a1 - a2);
  1168.                 //=> following code is 15% faster:
  1169.                 const double d = distYCbCrBuffered(pix1, pix2);
  1170.                 if (a1 < a2)
  1171.                     return a1 * d + 255 * (a2 - a1);
  1172.                 else
  1173.                     return a2 * d + 255 * (a1 - a2);
  1174.  
  1175.                 //alternative? return /*std::*/sqrt(a1 * a2 * square(distYCbCrBuffered(pix1, pix2)) + square(255 * (a1 - a2)));
  1176.             }
  1177.         };
  1178.  
  1179.         struct ColorGradientRGB
  1180.         {
  1181.             template <unsigned int M, unsigned int N> static void alphaGrad (uint32_t &pixBack, uint32_t pixFront)
  1182.             {
  1183.                 pixBack = gradientRGB<M, N> (pixFront, pixBack);
  1184.             }
  1185.         };
  1186.  
  1187.         struct ColorGradientARGB
  1188.         {
  1189.             template <unsigned int M, unsigned int N> static void alphaGrad (uint32_t &pixBack, uint32_t pixFront)
  1190.             {
  1191.                 pixBack = gradientARGB<M, N> (pixFront, pixBack);
  1192.             }
  1193.         };
  1194. }
  1195.  
  1196.  
  1197.  
  1198. void xbrz::nearestNeighborScale(const uint32_t* src, int srcWidth, int srcHeight, uint32_t* trg, int trgWidth, int trgHeight)
  1199. {
  1200.     nearestNeighborScale (src, srcWidth, srcHeight, srcWidth * sizeof (uint32_t), trg, trgWidth, trgHeight, trgWidth * sizeof (uint32_t), XBRZ_SLICETYPE_TARGET, 0, trgHeight, [](uint32_t pix) { return pix; });
  1201. }
  1202.  
  1203.  
  1204. EXTERN_C bool xbrz_equalcolortest24 (uint32_t col1, uint32_t col2, double luminanceWeight, double equalColorTolerance)
  1205. {
  1206.         return (ColorDistanceRGB::dist(col1, col2, luminanceWeight) < equalColorTolerance);
  1207. }
  1208.  
  1209.  
  1210. EXTERN_C bool xbrz_equalcolortest32 (uint32_t col1, uint32_t col2, double luminanceWeight, double equalColorTolerance)
  1211. {
  1212.         return (ColorDistanceARGB::dist(col1, col2, luminanceWeight) < equalColorTolerance);
  1213. }
  1214.  
  1215.  
  1216. EXTERN_C void xbrz_scale24 (size_t factor, const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight)
  1217. {
  1218.     if      (factor == 2) return scaleImage<Scaler2x<ColorGradientRGB>, ColorDistanceRGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
  1219.     else if (factor == 3) return scaleImage<Scaler3x<ColorGradientRGB>, ColorDistanceRGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
  1220.     else if (factor == 4) return scaleImage<Scaler4x<ColorGradientRGB>, ColorDistanceRGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
  1221.     else if (factor == 5) return scaleImage<Scaler5x<ColorGradientRGB>, ColorDistanceRGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
  1222.     else if (factor == 6) return scaleImage<Scaler6x<ColorGradientRGB>, ColorDistanceRGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
  1223. }
  1224.  
  1225.  
  1226. EXTERN_C void xbrz_scale32 (size_t factor, const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight)
  1227. {
  1228.     if      (factor == 2) return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceARGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
  1229.     else if (factor == 3) return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceARGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
  1230.     else if (factor == 4) return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceARGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
  1231.     else if (factor == 5) return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceARGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
  1232.     else if (factor == 6) return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceARGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
  1233. }
  1234.