Subversion Repositories Games.Prince of Persia

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 pmbaty 1
// ****************************************************************************
2
// * This file is part of the HqMAME project. It is distributed under         *
3
// * GNU General Public License: https://www.gnu.org/licenses/gpl-3.0         *
4
// * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved          *
5
// *                                                                          *
6
// * Additionally and as a special exception, the author gives permission     *
7
// * to link the code of this program with the MAME library (or with modified *
8
// * versions of MAME that use the same license as MAME), and distribute      *
9
// * linked combinations including the two. You must obey the GNU General     *
10
// * Public License in all respects for all of the code used other than MAME. *
11
// * If you modify this file, you may extend this exception to your version   *
12
// * of the file, but you are not obligated to do so. If you do not wish to   *
13
// * do so, delete this exception statement from your version.                *
14
// ****************************************************************************
15
 
16
 
17
#include <cstddef> //size_t
18
#include <cstdint> //uint32_t
19
#include <limits>
20
#include <cassert>
21
#include <algorithm>
22
#include <type_traits>
23
#include <vector>
24
#include <math.h>
25
 
26
 
27
#ifdef __cplusplus
28
#define EXTERN_C extern "C"
29
#else // !__cplusplus
30
#define EXTERN_C
31
#endif // __cplusplus
32
 
33
 
34
// scaler configuration
// XBRZ_CFG_LUMINANCE_WEIGHT:             passed as the luma-weight argument to ColorDistance::dist()
// XBRZ_CFG_EQUAL_COLOR_TOLERANCE:        two colors whose distance is below this value are treated as equal during blending
// XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD: factor by which one diagonal gradient must exceed the other to be flagged as dominant
// XBRZ_CFG_STEEP_DIRECTION_THRESHOLD:    factor used to detect steep/shallow lines during blending
#define XBRZ_CFG_LUMINANCE_WEIGHT 1
#define XBRZ_CFG_EQUAL_COLOR_TOLERANCE 30
#define XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD 3.6
#define XBRZ_CFG_STEEP_DIRECTION_THRESHOLD 2.2


// slice types: select whether [yFirst, yLast) of nearestNeighborScale() refers to source rows or to target rows
#define XBRZ_SLICETYPE_SOURCE 1
#define XBRZ_SLICETYPE_TARGET 2
44
 
45
 
46
// handy macros
// GET_BYTE extracts byte number "byteno" (0 = least significant) of an integer value as unsigned char.
// The channel accessors below fix the pixel layout used by this file: blue = byte 0, green = byte 1, red = byte 2, alpha = byte 3.
#define GET_BYTE(val,byteno) ((unsigned char) (((val) >> ((byteno) << 3)) & 0xff))
#define GET_BLUE(val)  GET_BYTE (val, 0)
#define GET_GREEN(val) GET_BYTE (val, 1)
#define GET_RED(val)   GET_BYTE (val, 2)
#define GET_ALPHA(val) GET_BYTE (val, 3)
52
//inline uint32_t rgb555to888(uint16_t pix) { return ((pix & 0x7C00) << 9) | ((pix & 0x03E0) << 6) | ((pix & 0x001F) << 3); }
53
//inline uint32_t rgb565to888(uint16_t pix) { return ((pix & 0xF800) << 8) | ((pix & 0x07E0) << 5) | ((pix & 0x001F) << 3); }
54
//inline uint16_t rgb888to555(uint32_t pix) { return static_cast<uint16_t>(((pix & 0xF80000) >> 9) | ((pix & 0x00F800) >> 6) | ((pix & 0x0000F8) >> 3)); }
55
//inline uint16_t rgb888to565(uint32_t pix) { return static_cast<uint16_t>(((pix & 0xF80000) >> 8) | ((pix & 0x00FC00) >> 5) | ((pix & 0x0000F8) >> 3)); }
56
 
57
 
58
namespace xbrz
59
{
60
        // -------------------------------------------------------------------------
61
        // | xBRZ: "Scale by rules" - high quality image upscaling filter by Zenju |
62
        // -------------------------------------------------------------------------
63
        // using a modified approach of xBR:
64
        // http://board.byuu.org/viewtopic.php?f=10&t=2248
65
        //  - new rule set preserving small image features
66
        //  - highly optimized for performance
67
        //  - support alpha channel
68
        //  - support multithreading
69
        //  - support 64-bit architectures
70
        //  - support processing image slices
71
        //  - support scaling up to 6xBRZ
72
 
73
        // -> map source (srcWidth * srcHeight) to target (scale * width x scale * height) image, optionally processing a half-open slice of rows [yFirst, yLast) only
74
        // -> support for source/target pitch in bytes!
75
        // -> if your emulator changes only a few image slices during each cycle (e.g. DOSBox) then there's no need to run xBRZ on the complete image:
76
        //    Just make sure you enlarge the source image slice by 2 rows on top and 2 on bottom (this is the additional range the xBRZ algorithm is using during analysis)
77
        //    CAVEAT: If there are multiple changed slices, make sure they do not overlap after adding these additional rows in order to avoid a memory race condition
78
        //    in the target image data if you are using multiple threads for processing each enlarged slice!
79
        // 
80
        // THREAD-SAFETY: - parts of the same image may be scaled by multiple threads as long as the [yFirst, yLast) ranges do not overlap!
81
        //                - there is a minor inefficiency for the first row of a slice, so avoid processing single rows only; suggestion: process at least 8-16 rows
82
 
83
        //overload taking plain uint32_t pixel buffers without pitch or slice arguments;
        //only declared here - its definition is not visible in this part of the file (presumably provided elsewhere - verify)
        void nearestNeighborScale(const uint32_t* src, int srcWidth, int srcHeight, uint32_t* trg, int trgWidth, int trgHeight);
84
 
85
 
86
        template <class Pix> inline Pix* byteAdvance(Pix* ptr, int bytes)
87
        {
88
            using PixNonConst = typename std::remove_cv<Pix>::type;
89
            using PixByte     = typename std::conditional<std::is_same<Pix, PixNonConst>::value, char, const char>::type;
90
 
91
            static_assert(std::is_integral<PixNonConst>::value, "Pix* is expected to be cast-able to char*");
92
 
93
            return reinterpret_cast<Pix*>(reinterpret_cast<PixByte*>(ptr) + bytes);
94
        }
95
 
96
 
97
//fill a block of blockWidth x blockHeight pixels with the given color
//  trg:   top-left pixel of the block
//  pitch: distance between two consecutive rows in bytes
//a plain loop is used on purpose; the std::fill based variant was disabled in the original code
template <class Pix> inline void fillBlock(Pix* trg, int pitch, Pix col, int blockWidth, int blockHeight)
{
    for (int y = 0; y < blockHeight; ++y, trg = byteAdvance(trg, pitch))
        for (int x = 0; x < blockWidth; ++x)
            trg[x] = col;
}
107
 
108
 
109
template <class PixSrc, class PixTrg, class PixConverter>
110
void nearestNeighborScale(const PixSrc* src, int srcWidth, int srcHeight, int srcPitch,
111
                          /**/  PixTrg* trg, int trgWidth, int trgHeight, int trgPitch,
112
                          int slice_type, int yFirst, int yLast, PixConverter pixCvrt /*convert PixSrc to PixTrg*/)
113
{
114
    static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*");
115
    static_assert(std::is_integral<PixTrg>::value, "PixTrg* is expected to be cast-able to char*");
116
    static_assert(std::is_same<decltype(pixCvrt(PixSrc())), PixTrg>::value, "PixConverter returning wrong pixel format");
117
 
118
    if (srcPitch < srcWidth * static_cast<int>(sizeof(PixSrc))  ||
119
        trgPitch < trgWidth * static_cast<int>(sizeof(PixTrg)))
120
    {
121
        assert(false);
122
        return;
123
    }
124
 
125
    if (slice_type == XBRZ_SLICETYPE_SOURCE)
126
    {
127
            //nearest-neighbor (going over source image - fast for upscaling, since source is read only once
128
            yFirst = std::max(yFirst, 0);
129
            yLast  = std::min(yLast, srcHeight);
130
            if (yFirst >= yLast || trgWidth <= 0 || trgHeight <= 0) return;
131
 
132
            for (int y = yFirst; y < yLast; ++y)
133
            {
134
                //mathematically: ySrc = floor(srcHeight * yTrg / trgHeight)
135
                // => search for integers in: [ySrc, ySrc + 1) * trgHeight / srcHeight
136
 
137
                //keep within for loop to support MT input slices!
138
                const int yTrg_first = ( y      * trgHeight + srcHeight - 1) / srcHeight; //=ceil(y * trgHeight / srcHeight)
139
                const int yTrg_last  = ((y + 1) * trgHeight + srcHeight - 1) / srcHeight; //=ceil(((y + 1) * trgHeight) / srcHeight)
140
                const int blockHeight = yTrg_last - yTrg_first;
141
 
142
                if (blockHeight > 0)
143
                {
144
                    const PixSrc* srcLine = byteAdvance(src, y * srcPitch);
145
                    /**/  PixTrg* trgLine = byteAdvance(trg, yTrg_first * trgPitch);
146
                    int xTrg_first = 0;
147
 
148
                    for (int x = 0; x < srcWidth; ++x)
149
                    {
150
                        const int xTrg_last = ((x + 1) * trgWidth + srcWidth - 1) / srcWidth;
151
                        const int blockWidth = xTrg_last - xTrg_first;
152
                        if (blockWidth > 0)
153
                        {
154
                            xTrg_first = xTrg_last;
155
 
156
                            const auto trgPix = pixCvrt(srcLine[x]);
157
                            fillBlock(trgLine, trgPitch, trgPix, blockWidth, blockHeight);
158
                            trgLine += blockWidth;
159
                        }
160
                    }
161
                }
162
            }
163
    }
164
    else if (slice_type == XBRZ_SLICETYPE_TARGET)
165
    {
166
            //nearest-neighbor (going over target image - slow for upscaling, since source is read multiple times missing out on cache! Fast for similar image sizes!)
167
            yFirst = std::max(yFirst, 0);
168
            yLast  = std::min(yLast, trgHeight);
169
            if (yFirst >= yLast || srcHeight <= 0 || srcWidth <= 0) return;
170
 
171
            for (int y = yFirst; y < yLast; ++y)
172
            {
173
                PixTrg* trgLine = byteAdvance(trg, y * trgPitch);
174
                const int ySrc = srcHeight * y / trgHeight;
175
                const PixSrc* srcLine = byteAdvance(src, ySrc * srcPitch);
176
                for (int x = 0; x < trgWidth; ++x)
177
                {
178
                    const int xSrc = srcWidth * x / trgWidth;
179
                    trgLine[x] = pixCvrt(srcLine[xSrc]);
180
                }
181
            }
182
    }
183
}
184
}
185
 
186
 
187
 
188
 
189
namespace
190
{
191
template <unsigned int M, unsigned int N> inline
192
uint32_t gradientRGB(uint32_t pixFront, uint32_t pixBack) //blend front color with opacity M / N over opaque background: http://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending
193
{
194
    static_assert(0 < M && M < N && N <= 1000, "");
195
 
196
    auto calcColor = [](unsigned char colFront, unsigned char colBack) -> unsigned char { return (colFront * M + colBack * (N - M)) / N; };
197
 
198
        return ((calcColor (GET_RED   (pixFront), GET_RED   (pixBack)) << 16)
199
              | (calcColor (GET_GREEN (pixFront), GET_GREEN (pixBack)) <<  8)
200
              | (calcColor (GET_BLUE  (pixFront), GET_BLUE  (pixBack)) <<  0));
201
}
202
 
203
 
204
template <unsigned int M, unsigned int N> inline
205
uint32_t gradientARGB(uint32_t pixFront, uint32_t pixBack) //find intermediate color between two colors with alpha channels (=> NO alpha blending!!!)
206
{
207
    static_assert(0 < M && M < N && N <= 1000, "");
208
 
209
    const unsigned int weightFront = GET_ALPHA (pixFront) * M;
210
    const unsigned int weightBack  = GET_ALPHA (pixBack) * (N - M);
211
    const unsigned int weightSum   = weightFront + weightBack;
212
    if (weightSum == 0)
213
        return 0;
214
 
215
    auto calcColor = [=](unsigned char colFront, unsigned char colBack)
216
    {
217
        return static_cast<unsigned char>((colFront * weightFront + colBack * weightBack) / weightSum);
218
    };
219
 
220
        return (((unsigned char) (weightSum / N))                      << 24)
221
              | (calcColor (GET_RED   (pixFront), GET_RED   (pixBack)) << 16)
222
              | (calcColor (GET_GREEN (pixFront), GET_GREEN (pixBack)) <<  8)
223
              | (calcColor (GET_BLUE  (pixFront), GET_BLUE  (pixBack)) <<  0);
224
}
225
 
226
 
227
//inline
228
//double fastSqrt(double n)
229
//{
230
//    __asm //speeds up xBRZ by about 9% compared to /*std::*/sqrt which internally uses the same assembler instructions but adds some "fluff"
231
//    {
232
//        fld n
233
//        fsqrt
234
//    }
235
//}
236
//
237
 
238
 
239
//FORCE_INLINE: compiler-specific "always inline" request; falls back to plain "inline" on unknown compilers
#ifdef _MSC_VER
    #define FORCE_INLINE __forceinline
#elif defined __GNUC__
    #define FORCE_INLINE __attribute__((always_inline)) inline
#else
    #define FORCE_INLINE inline
#endif
246
 
247
 
248
//rotation applied to kernel and output matrix (clock-wise)
//the enumerators must stay consecutive starting at 0: MatrixRotation recurses via "rotDeg - 1" down to ROT_0
enum RotationDegree
{
    ROT_0,
    ROT_90,
    ROT_180,
    ROT_270
};
255
 
256
//calculate input matrix coordinates after rotation at compile time
257
template <RotationDegree rotDeg, size_t I, size_t J, size_t N>
258
struct MatrixRotation;
259
 
260
template <size_t I, size_t J, size_t N>
261
struct MatrixRotation<ROT_0, I, J, N>
262
{
263
    static const size_t I_old = I;
264
    static const size_t J_old = J;
265
};
266
 
267
template <RotationDegree rotDeg, size_t I, size_t J, size_t N> //(i, j) = (row, col) indices, N = size of (square) matrix
268
struct MatrixRotation
269
{
270
    static const size_t I_old = N - 1 - MatrixRotation<static_cast<RotationDegree>(rotDeg - 1), I, J, N>::J_old; //old coordinates before rotation!
271
    static const size_t J_old =         MatrixRotation<static_cast<RotationDegree>(rotDeg - 1), I, J, N>::I_old; //
272
};
273
 
274
 
275
//rotated view on an N x N block of target pixels
//  out:      top-left pixel of the block
//  outWidth: row distance of the target image, counted in uint32_t pixels
template <size_t N, RotationDegree rotDeg>
class OutputMatrix
{
public:
    OutputMatrix(uint32_t* out, int outWidth) : //access matrix area, top-left at position "out" for image with given width
        out_(out),
        outWidth_(outWidth) {}

    //reference to the pixel at (row I, col J) of the rotated block; the unrotated position is resolved at compile time
    template <size_t I, size_t J>
    uint32_t& ref() const
    {
        static const size_t I_old = MatrixRotation<rotDeg, I, J, N>::I_old;
        static const size_t J_old = MatrixRotation<rotDeg, I, J, N>::J_old;
        return *(out_ + J_old + I_old * outWidth_);
    }

private:
    uint32_t* out_;
    const int outWidth_;
};
295
 
296
 
297
//value multiplied by itself
template <class T> inline
T square(T value) { return value * value; }
299
 
300
 
301
 
302
inline
303
double distRGB(uint32_t pix1, uint32_t pix2)
304
{
305
    const double r_diff = static_cast<int>(GET_RED   (pix1)) - GET_RED   (pix2);
306
    const double g_diff = static_cast<int>(GET_GREEN (pix1)) - GET_GREEN (pix2);
307
    const double b_diff = static_cast<int>(GET_BLUE  (pix1)) - GET_BLUE  (pix2);
308
 
309
    //euklidean RGB distance
310
    return /*std::*/sqrt(square(r_diff) + square(g_diff) + square(b_diff));
311
}
312
 
313
 
314
inline
315
double distYCbCr(uint32_t pix1, uint32_t pix2, double lumaWeight)
316
{
317
    //http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
318
    //YCbCr conversion is a matrix multiplication => take advantage of linearity by subtracting first!
319
    const int r_diff = static_cast<int>(GET_RED   (pix1)) - GET_RED   (pix2); //we may delay division by 255 to after matrix multiplication
320
    const int g_diff = static_cast<int>(GET_GREEN (pix1)) - GET_GREEN (pix2); //
321
    const int b_diff = static_cast<int>(GET_BLUE  (pix1)) - GET_BLUE  (pix2); //substraction for int is noticeable faster than for double!
322
 
323
    //const double k_b = 0.0722; //ITU-R BT.709 conversion
324
    //const double k_r = 0.2126; //
325
    const double k_b = 0.0593; //ITU-R BT.2020 conversion
326
    const double k_r = 0.2627; //
327
    const double k_g = 1 - k_b - k_r;
328
 
329
    const double scale_b = 0.5 / (1 - k_b);
330
    const double scale_r = 0.5 / (1 - k_r);
331
 
332
    const double y   = k_r * r_diff + k_g * g_diff + k_b * b_diff; //[!], analog YCbCr!
333
    const double c_b = scale_b * (b_diff - y);
334
    const double c_r = scale_r * (r_diff - y);
335
 
336
    //we skip division by 255 to have similar range like other distance functions
337
    return /*std::*/sqrt(square(lumaWeight * y) + square(c_b) + square(c_r));
338
}
339
 
340
 
341
inline double distYCbCrBuffered(uint32_t pix1, uint32_t pix2)
342
{
343
    //30% perf boost compared to plain distYCbCr()!
344
    //consumes 64 MB memory; using double is only 2% faster, but takes 128 MB
345
    static const std::vector<float> diffToDist = []
346
    {
347
        std::vector<float> tmp;
348
 
349
        for (uint32_t i = 0; i < 256 * 256 * 256; ++i) //startup time: 114 ms on Intel Core i5 (four cores)
350
        {
351
            const int r_diff = GET_RED (i) * 2 - 0xFF;
352
            const int g_diff = GET_GREEN (i) * 2 - 0xFF;
353
            const int b_diff = GET_BLUE (i) * 2 - 0xFF;
354
 
355
            const double k_b = 0.0593; //ITU-R BT.2020 conversion
356
            const double k_r = 0.2627; //
357
            const double k_g = 1 - k_b - k_r;
358
 
359
            const double scale_b = 0.5 / (1 - k_b);
360
            const double scale_r = 0.5 / (1 - k_r);
361
 
362
            const double y   = k_r * r_diff + k_g * g_diff + k_b * b_diff; //[!], analog YCbCr!
363
            const double c_b = scale_b * (b_diff - y);
364
            const double c_r = scale_r * (r_diff - y);
365
 
366
            tmp.push_back(static_cast<float>(/*std::*/sqrt(square(y) + square(c_b) + square(c_r))));
367
        }
368
        return tmp;
369
    }();
370
 
371
    //if (pix1 == pix2) -> 8% perf degradation!
372
    //    return 0;
373
    //if (pix1 < pix2)
374
    //    std::swap(pix1, pix2); -> 30% perf degradation!!!
375
#if 1
376
    const int r_diff = static_cast<int>(GET_RED   (pix1)) - GET_RED   (pix2);
377
    const int g_diff = static_cast<int>(GET_GREEN (pix1)) - GET_GREEN (pix2);
378
    const int b_diff = static_cast<int>(GET_BLUE  (pix1)) - GET_BLUE  (pix2);
379
 
380
    return diffToDist[(((r_diff + 0xFF) / 2) << 16) | //slightly reduce precision (division by 2) to squeeze value into single byte
381
                      (((g_diff + 0xFF) / 2) <<  8) |
382
                      (( b_diff + 0xFF) / 2)];
383
#else //not noticeably faster:
384
    const int r_diff_tmp = ((pix1 & 0xFF0000) + 0xFF0000 - (pix2 & 0xFF0000)) / 2;
385
    const int g_diff_tmp = ((pix1 & 0x00FF00) + 0x00FF00 - (pix2 & 0x00FF00)) / 2; //slightly reduce precision (division by 2) to squeeze value into single byte
386
    const int b_diff_tmp = ((pix1 & 0x0000FF) + 0x0000FF - (pix2 & 0x0000FF)) / 2;
387
 
388
    return diffToDist[(r_diff_tmp & 0xFF0000) | (g_diff_tmp & 0x00FF00) | (b_diff_tmp & 0x0000FF)];
389
#endif
390
}
391
 
392
 
393
//blend decision for one corner of a pixel
enum BlendType
{
    BLEND_NONE = 0,
    BLEND_NORMAL,   //a normal indication to blend
    BLEND_DOMINANT, //a strong indication to blend
    //attention: BlendType must fit into the value range of 2 bit!!!
};
400
 
401
struct BlendResult
402
{
403
    BlendType
404
    /**/blend_f, blend_g,
405
    /**/blend_j, blend_k;
406
};
407
 
408
 
409
//kernel for preprocessing step: 4 x 4 pixel neighborhood, members listed row by row
struct Kernel_4x4
{
    uint32_t
    /**/a, b, c, d,
    /**/e, f, g, h,
    /**/i, j, k, l,
    /**/m, n, o, p;
};
417
 
418
/*
419
input kernel area naming convention:
420
-----------------
421
| A | B | C | D |
422
----|---|---|---|
423
| E | F | G | H |   //evaluate the four corners between F, G, J, K
424
----|---|---|---|   //input pixel is at position F
425
| I | J | K | L |
426
----|---|---|---|
427
| M | N | O | P |
428
-----------------
429
*/
430
template <class ColorDistance>
431
FORCE_INLINE //detect blend direction
432
BlendResult preProcessCorners(const Kernel_4x4& ker) //result: F, G, J, K corners of "GradientType"
433
{
434
    BlendResult result = {};
435
 
436
    if ((ker.f == ker.g &&
437
         ker.j == ker.k) ||
438
        (ker.f == ker.j &&
439
         ker.g == ker.k))
440
        return result;
441
 
442
    auto dist = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, XBRZ_CFG_LUMINANCE_WEIGHT); };
443
 
444
    const int weight = 4;
445
    double jg = dist(ker.i, ker.f) + dist(ker.f, ker.c) + dist(ker.n, ker.k) + dist(ker.k, ker.h) + weight * dist(ker.j, ker.g);
446
    double fk = dist(ker.e, ker.j) + dist(ker.j, ker.o) + dist(ker.b, ker.g) + dist(ker.g, ker.l) + weight * dist(ker.f, ker.k);
447
 
448
    if (jg < fk) //test sample: 70% of values max(jg, fk) / min(jg, fk) are between 1.1 and 3.7 with median being 1.8
449
    {
450
        const bool dominantGradient = XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD * jg < fk;
451
        if (ker.f != ker.g && ker.f != ker.j)
452
            result.blend_f = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
453
 
454
        if (ker.k != ker.j && ker.k != ker.g)
455
            result.blend_k = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
456
    }
457
    else if (fk < jg)
458
    {
459
        const bool dominantGradient = XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD * fk < jg;
460
        if (ker.j != ker.f && ker.j != ker.k)
461
            result.blend_j = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
462
 
463
        if (ker.g != ker.f && ker.g != ker.k)
464
            result.blend_g = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
465
    }
466
    return result;
467
}
468
 
469
//3 x 3 pixel neighborhood used for blending, members listed row by row; the input pixel is at position e
struct Kernel_3x3
{
    uint32_t
    /**/a,  b,  c,
    /**/d,  e,  f,
    /**/g,  h,  i;
};
476
 
477
#define DEF_GETTER(x) template <RotationDegree rotDeg> uint32_t inline get_##x(const Kernel_3x3& ker) { return ker.x; }
478
//we cannot and NEED NOT write "ker.##x" since ## concatenates preprocessor tokens but "." is not a token
479
DEF_GETTER(a) DEF_GETTER(b) DEF_GETTER(c)
480
DEF_GETTER(d) DEF_GETTER(e) DEF_GETTER(f)
481
DEF_GETTER(g) DEF_GETTER(h) DEF_GETTER(i)
482
#undef DEF_GETTER
483
 
484
#define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_90>(const Kernel_3x3& ker) { return ker.y; }
485
DEF_GETTER(a, g) DEF_GETTER(b, d) DEF_GETTER(c, a)
486
DEF_GETTER(d, h) DEF_GETTER(e, e) DEF_GETTER(f, b)
487
DEF_GETTER(g, i) DEF_GETTER(h, f) DEF_GETTER(i, c)
488
#undef DEF_GETTER
489
 
490
#define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_180>(const Kernel_3x3& ker) { return ker.y; }
491
DEF_GETTER(a, i) DEF_GETTER(b, h) DEF_GETTER(c, g)
492
DEF_GETTER(d, f) DEF_GETTER(e, e) DEF_GETTER(f, d)
493
DEF_GETTER(g, c) DEF_GETTER(h, b) DEF_GETTER(i, a)
494
#undef DEF_GETTER
495
 
496
#define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_270>(const Kernel_3x3& ker) { return ker.y; }
497
DEF_GETTER(a, c) DEF_GETTER(b, f) DEF_GETTER(c, i)
498
DEF_GETTER(d, b) DEF_GETTER(e, e) DEF_GETTER(f, h)
499
DEF_GETTER(g, a) DEF_GETTER(h, d) DEF_GETTER(i, g)
500
#undef DEF_GETTER
501
 
502
 
503
//compress four blend types into a single byte
504
inline BlendType getTopL   (unsigned char b) { return static_cast<BlendType>(0x3 & b); }
505
inline BlendType getTopR   (unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 2)); }
506
inline BlendType getBottomR(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 4)); }
507
inline BlendType getBottomL(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 6)); }
508
 
509
inline void setTopL   (unsigned char& b, BlendType bt) { b |= bt; } //buffer is assumed to be initialized before preprocessing!
510
inline void setTopR   (unsigned char& b, BlendType bt) { b |= (bt << 2); }
511
inline void setBottomR(unsigned char& b, BlendType bt) { b |= (bt << 4); }
512
inline void setBottomL(unsigned char& b, BlendType bt) { b |= (bt << 6); }
513
 
514
//true if any bit of the packed corner byte is set, i.e. at least one corner is not BLEND_NONE (== 0)
inline bool blendingNeeded(unsigned char b) { return b != 0; }
515
 
516
template <RotationDegree rotDeg> inline
517
unsigned char rotateBlendInfo(unsigned char b) { return b; }
518
template <> inline unsigned char rotateBlendInfo<ROT_90 >(unsigned char b) { return ((b << 2) | (b >> 6)) & 0xff; }
519
template <> inline unsigned char rotateBlendInfo<ROT_180>(unsigned char b) { return ((b << 4) | (b >> 4)) & 0xff; }
520
template <> inline unsigned char rotateBlendInfo<ROT_270>(unsigned char b) { return ((b << 6) | (b >> 2)) & 0xff; }
521
 
522
 
523
/*
524
input kernel area naming convention:
525
-------------
526
| A | B | C |
527
----|---|---|
528
| D | E | F | //input pixel is at position E
529
----|---|---|
530
| G | H | I |
531
-------------
532
*/
533
template <class Scaler, class ColorDistance, RotationDegree rotDeg>
534
FORCE_INLINE //perf: quite worth it!
535
void blendPixel(const Kernel_3x3& ker,
536
                uint32_t* target, int trgWidth,
537
                unsigned char blendInfo) //result of preprocessing all four corners of pixel "e"
538
{
539
#define a get_a<rotDeg>(ker)
540
#define b get_b<rotDeg>(ker)
541
#define c get_c<rotDeg>(ker)
542
#define d get_d<rotDeg>(ker)
543
#define e get_e<rotDeg>(ker)
544
#define f get_f<rotDeg>(ker)
545
#define g get_g<rotDeg>(ker)
546
#define h get_h<rotDeg>(ker)
547
#define i get_i<rotDeg>(ker)
548
 
549
    const unsigned char blend = rotateBlendInfo<rotDeg>(blendInfo);
550
 
551
    if (getBottomR(blend) >= BLEND_NORMAL)
552
    {
553
        auto eq   = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, XBRZ_CFG_LUMINANCE_WEIGHT) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE; };
554
        auto dist = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, XBRZ_CFG_LUMINANCE_WEIGHT); };
555
 
556
        const bool doLineBlend = [&]() -> bool
557
        {
558
            if (getBottomR(blend) >= BLEND_DOMINANT)
559
                return true;
560
 
561
            //make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
562
            if (getTopR(blend) != BLEND_NONE && !eq(e, g)) //but support double-blending for 90° corners
563
                return false;
564
            if (getBottomL(blend) != BLEND_NONE && !eq(e, c))
565
                return false;
566
 
567
            //no full blending for L-shapes; blend corner only (handles "mario mushroom eyes")
568
            if (!eq(e, i) && eq(g, h) && eq(h, i) && eq(i, f) && eq(f, c))
569
                return false;
570
 
571
            return true;
572
        }();
573
 
574
        const uint32_t px = dist(e, f) <= dist(e, h) ? f : h; //choose most similar color
575
 
576
        OutputMatrix<Scaler::scale, rotDeg> out(target, trgWidth);
577
 
578
        if (doLineBlend)
579
        {
580
            const double fg = dist(f, g); //test sample: 70% of values max(fg, hc) / min(fg, hc) are between 1.1 and 3.7 with median being 1.9
581
            const double hc = dist(h, c); //
582
 
583
            const bool haveShallowLine = XBRZ_CFG_STEEP_DIRECTION_THRESHOLD * fg <= hc && e != g && d != g;
584
            const bool haveSteepLine   = XBRZ_CFG_STEEP_DIRECTION_THRESHOLD * hc <= fg && e != c && b != c;
585
 
586
            if (haveShallowLine)
587
            {
588
                if (haveSteepLine)
589
                    Scaler::blendLineSteepAndShallow(px, out);
590
                else
591
                    Scaler::blendLineShallow(px, out);
592
            }
593
            else
594
            {
595
                if (haveSteepLine)
596
                    Scaler::blendLineSteep(px, out);
597
                else
598
                    Scaler::blendLineDiagonal(px, out);
599
            }
600
        }
601
        else
602
            Scaler::blendCorner(px, out);
603
    }
604
 
605
#undef a
606
#undef b
607
#undef c
608
#undef d
609
#undef e
610
#undef f
611
#undef g
612
#undef h
613
#undef i
614
}
615
 
616
 
617
template <class Scaler, class ColorDistance> //scaler policy: see "Scaler2x" reference implementation
618
void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight, int yFirst, int yLast)
619
{
620
    yFirst = std::max(yFirst, 0);
621
    yLast  = std::min(yLast, srcHeight);
622
    if (yFirst >= yLast || srcWidth <= 0)
623
        return;
624
 
625
    const int trgWidth = srcWidth * Scaler::scale;
626
 
627
    //"use" space at the end of the image as temporary buffer for "on the fly preprocessing": we even could use larger area of
628
    //"sizeof(uint32_t) * srcWidth * (yLast - yFirst)" bytes without risk of accidental overwriting before accessing
629
    const int bufferSize = srcWidth;
630
    unsigned char* preProcBuffer = reinterpret_cast<unsigned char*>(trg + yLast * Scaler::scale * trgWidth) - bufferSize;
631
    std::fill(preProcBuffer, preProcBuffer + bufferSize, '\0');
632
    static_assert(BLEND_NONE == 0, "");
633
 
634
    //initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending
635
    //this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition!
636
    if (yFirst > 0)
637
    {
638
        const int y = yFirst - 1;
639
 
640
        const uint32_t* s_m1 = src + srcWidth * std::max(y - 1, 0);
641
        const uint32_t* s_0  = src + srcWidth * y; //center line
642
        const uint32_t* s_p1 = src + srcWidth * std::min(y + 1, srcHeight - 1);
643
        const uint32_t* s_p2 = src + srcWidth * std::min(y + 2, srcHeight - 1);
644
 
645
        for (int x = 0; x < srcWidth; ++x)
646
        {
647
            const int x_m1 = std::max(x - 1, 0);
648
            const int x_p1 = std::min(x + 1, srcWidth - 1);
649
            const int x_p2 = std::min(x + 2, srcWidth - 1);
650
 
651
            Kernel_4x4 ker = {}; //perf: initialization is negligible
652
            ker.a = s_m1[x_m1]; //read sequentially from memory as far as possible
653
            ker.b = s_m1[x];
654
            ker.c = s_m1[x_p1];
655
            ker.d = s_m1[x_p2];
656
 
657
            ker.e = s_0[x_m1];
658
            ker.f = s_0[x];
659
            ker.g = s_0[x_p1];
660
            ker.h = s_0[x_p2];
661
 
662
            ker.i = s_p1[x_m1];
663
            ker.j = s_p1[x];
664
            ker.k = s_p1[x_p1];
665
            ker.l = s_p1[x_p2];
666
 
667
            ker.m = s_p2[x_m1];
668
            ker.n = s_p2[x];
669
            ker.o = s_p2[x_p1];
670
            ker.p = s_p2[x_p2];
671
 
672
            const BlendResult res = preProcessCorners<ColorDistance>(ker);
673
            /*
674
            preprocessing blend result:
675
            ---------
676
            | F | G |   //evalute corner between F, G, J, K
677
            ----|---|   //input pixel is at position F
678
            | J | K |
679
            ---------
680
            */
681
            setTopR(preProcBuffer[x], res.blend_j);
682
 
683
            if (x + 1 < bufferSize)
684
                setTopL(preProcBuffer[x + 1], res.blend_k);
685
        }
686
    }
687
    //------------------------------------------------------------------------------------
688
 
689
    for (int y = yFirst; y < yLast; ++y)
690
    {
691
        uint32_t* out = trg + Scaler::scale * y * trgWidth; //consider MT "striped" access
692
 
693
        const uint32_t* s_m1 = src + srcWidth * std::max(y - 1, 0);
694
        const uint32_t* s_0  = src + srcWidth * y; //center line
695
        const uint32_t* s_p1 = src + srcWidth * std::min(y + 1, srcHeight - 1);
696
        const uint32_t* s_p2 = src + srcWidth * std::min(y + 2, srcHeight - 1);
697
 
698
        unsigned char blend_xy1 = 0; //corner blending for current (x, y + 1) position
699
 
700
        for (int x = 0; x < srcWidth; ++x, out += Scaler::scale)
701
        {
702
            //all those bounds checks have only insignificant impact on performance!
703
            const int x_m1 = std::max(x - 1, 0); //perf: prefer array indexing to additional pointers!
704
            const int x_p1 = std::min(x + 1, srcWidth - 1);
705
            const int x_p2 = std::min(x + 2, srcWidth - 1);
706
 
707
            Kernel_4x4 ker4 = {}; //perf: initialization is negligible
708
 
709
            ker4.a = s_m1[x_m1]; //read sequentially from memory as far as possible
710
            ker4.b = s_m1[x];
711
            ker4.c = s_m1[x_p1];
712
            ker4.d = s_m1[x_p2];
713
 
714
            ker4.e = s_0[x_m1];
715
            ker4.f = s_0[x];
716
            ker4.g = s_0[x_p1];
717
            ker4.h = s_0[x_p2];
718
 
719
            ker4.i = s_p1[x_m1];
720
            ker4.j = s_p1[x];
721
            ker4.k = s_p1[x_p1];
722
            ker4.l = s_p1[x_p2];
723
 
724
            ker4.m = s_p2[x_m1];
725
            ker4.n = s_p2[x];
726
            ker4.o = s_p2[x_p1];
727
            ker4.p = s_p2[x_p2];
728
 
729
            //evaluate the four corners on bottom-right of current pixel
730
            unsigned char blend_xy = 0; //for current (x, y) position
731
            {
732
                const BlendResult res = preProcessCorners<ColorDistance>(ker4);
733
                /*
734
                preprocessing blend result:
735
                ---------
736
                | F | G |   //evalute corner between F, G, J, K
737
                ----|---|   //current input pixel is at position F
738
                | J | K |
739
                ---------
740
                */
741
                blend_xy = preProcBuffer[x];
742
                setBottomR(blend_xy, res.blend_f); //all four corners of (x, y) have been determined at this point due to processing sequence!
743
 
744
                setTopR(blend_xy1, res.blend_j); //set 2nd known corner for (x, y + 1)
745
                preProcBuffer[x] = blend_xy1; //store on current buffer position for use on next row
746
 
747
                blend_xy1 = 0;
748
                setTopL(blend_xy1, res.blend_k); //set 1st known corner for (x + 1, y + 1) and buffer for use on next column
749
 
750
                if (x + 1 < bufferSize) //set 3rd known corner for (x + 1, y)
751
                    setBottomL(preProcBuffer[x + 1], res.blend_g);
752
            }
753
 
754
            //fill block of size scale * scale with the given color
755
            xbrz::fillBlock(out, trgWidth * sizeof(uint32_t), ker4.f, Scaler::scale, Scaler::scale);
756
            //place *after* preprocessing step, to not overwrite the results while processing the the last pixel!
757
 
758
            //blend four corners of current pixel
759
            if (blendingNeeded(blend_xy)) //good 5% perf-improvement
760
            {
761
                Kernel_3x3 ker3 = {}; //perf: initialization is negligible
762
 
763
                ker3.a = ker4.a;
764
                ker3.b = ker4.b;
765
                ker3.c = ker4.c;
766
 
767
                ker3.d = ker4.e;
768
                ker3.e = ker4.f;
769
                ker3.f = ker4.g;
770
 
771
                ker3.g = ker4.i;
772
                ker3.h = ker4.j;
773
                ker3.i = ker4.k;
774
 
775
                blendPixel<Scaler, ColorDistance, ROT_0  >(ker3, out, trgWidth, blend_xy);
776
                blendPixel<Scaler, ColorDistance, ROT_90 >(ker3, out, trgWidth, blend_xy);
777
                blendPixel<Scaler, ColorDistance, ROT_180>(ker3, out, trgWidth, blend_xy);
778
                blendPixel<Scaler, ColorDistance, ROT_270>(ker3, out, trgWidth, blend_xy);
779
            }
780
        }
781
    }
782
}
783
 
784
 
785
//------------------------------------------------------------------------------------
786
//Blend rules for scale factor 2.
//Every rule receives the color "col" and an output accessor "out"; cells are addressed at
//compile time through out.template ref<I, J>() and mixed with "col" by alphaGrad<M, N>.
//Indices are written out literally here (scale - 1 == 1).
//NOTE(review): the <M, N> pairs look like blend fractions M/N (see the "exact" value quoted
//in blendCorner) - confirm against ColorGradient::alphaGrad.
template <class ColorGradient> struct Scaler2x : public ColorGradient
{
    static const int scale = 2;

    //forwarder: brings the base-class template function into scope for GCC
    template <unsigned int M, unsigned int N>
    static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
    {
        ColorGradient::template alphaGrad<M, N>(pixBack, pixFront);
    }


    //shallow line: gradients on cells <1, 0> and <1, 1>
    template <class OutputMatrix>
    static void blendLineShallow(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<1, 4>(out.template ref<1, 0>(), col);
        alphaGrad<3, 4>(out.template ref<1, 1>(), col);
    }

    //steep line: same pattern as the shallow rule with both indices swapped
    template <class OutputMatrix>
    static void blendLineSteep(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<1, 4>(out.template ref<0, 1>(), col);
        alphaGrad<3, 4>(out.template ref<1, 1>(), col);
    }

    //steep and shallow line at once
    template <class OutputMatrix>
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<1, 4>(out.template ref<1, 0>(), col);
        alphaGrad<1, 4>(out.template ref<0, 1>(), col);
        alphaGrad<5, 6>(out.template ref<1, 1>(), col); //[!] fixes 7/8 used in xBR
    }

    //diagonal line: only cell <1, 1> is touched
    template <class OutputMatrix>
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<1, 2>(out.template ref<1, 1>(), col);
    }

    //corner: model a round corner on cell <1, 1>
    template <class OutputMatrix>
    static void blendCorner(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<21, 100>(out.template ref<1, 1>(), col); //exact: 1 - pi/4 = 0.2146018366
    }
};
829
 
830
 
831
//Blend rules for scale factor 3.
//Every rule receives the color "col" and an output accessor "out"; cells are addressed at
//compile time through out.template ref<I, J>() and are either mixed with "col" by
//alphaGrad<M, N> or overwritten with it.
//Indices are written out literally here (scale - 1 == 2, scale - 2 == 1).
//NOTE(review): the <M, N> pairs look like blend fractions M/N (see the "exact" value quoted
//in blendCorner) - confirm against ColorGradient::alphaGrad.
template <class ColorGradient> struct Scaler3x : public ColorGradient
{
    static const int scale = 3;

    //forwarder: brings the base-class template function into scope for GCC
    template <unsigned int M, unsigned int N>
    static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
    {
        ColorGradient::template alphaGrad<M, N>(pixBack, pixFront);
    }


    //shallow line: three gradient cells, cell <2, 2> takes the full color
    template <class OutputMatrix>
    static void blendLineShallow(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<1, 4>(out.template ref<2, 0>(), col);
        alphaGrad<1, 4>(out.template ref<1, 2>(), col);

        alphaGrad<3, 4>(out.template ref<2, 1>(), col);
        out.template ref<2, 2>() = col;
    }

    //steep line: same pattern as the shallow rule with both indices swapped
    template <class OutputMatrix>
    static void blendLineSteep(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<1, 4>(out.template ref<0, 2>(), col);
        alphaGrad<1, 4>(out.template ref<2, 1>(), col);

        alphaGrad<3, 4>(out.template ref<1, 2>(), col);
        out.template ref<2, 2>() = col;
    }

    //steep and shallow line at once
    template <class OutputMatrix>
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<1, 4>(out.template ref<2, 0>(), col);
        alphaGrad<1, 4>(out.template ref<0, 2>(), col);
        alphaGrad<3, 4>(out.template ref<2, 1>(), col);
        alphaGrad<3, 4>(out.template ref<1, 2>(), col);
        out.template ref<2, 2>() = col;
    }

    //diagonal line; all three cells conflict with other rotations for this odd scale
    template <class OutputMatrix>
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<1, 8>(out.template ref<1, 2>(), col);
        alphaGrad<1, 8>(out.template ref<2, 1>(), col);
        alphaGrad<7, 8>(out.template ref<2, 2>(), col);
    }

    //corner: model a round corner
    template <class OutputMatrix>
    static void blendCorner(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<45, 100>(out.template ref<2, 2>(), col); //exact: 0.4545939598

        //deliberately disabled: cells <2, 1> and <1, 2> would receive 7/256 (exact: 0.02826017254)
        //-> negligible + avoid conflicts with other rotations for this odd scale
        //alphaGrad<7, 256>(out.template ref<2, 1>(), col);
        //alphaGrad<7, 256>(out.template ref<1, 2>(), col);
    }
};
886
 
887
 
888
//Blend rules for scale factor 4.
//Every rule receives the color "col" and an output accessor "out"; cells are addressed at
//compile time through out.template ref<I, J>() and are either mixed with "col" by
//alphaGrad<M, N> or overwritten with it.
//Indices are written out literally here (scale - 1 == 3, scale - 2 == 2, scale / 2 == 2).
//NOTE(review): the <M, N> pairs look like blend fractions M/N (see the "exact" values quoted
//in blendCorner) - confirm against ColorGradient::alphaGrad.
template <class ColorGradient> struct Scaler4x : public ColorGradient
{
    static const int scale = 4;

    //forwarder: brings the base-class template function into scope for GCC
    template <unsigned int M, unsigned int N>
    static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
    {
        ColorGradient::template alphaGrad<M, N>(pixBack, pixFront);
    }


    //shallow line: four gradient cells, two cells take the full color
    template <class OutputMatrix>
    static void blendLineShallow(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<1, 4>(out.template ref<3, 0>(), col);
        alphaGrad<1, 4>(out.template ref<2, 2>(), col);

        alphaGrad<3, 4>(out.template ref<3, 1>(), col);
        alphaGrad<3, 4>(out.template ref<2, 3>(), col);

        out.template ref<3, 2>() = col;
        out.template ref<3, 3>() = col;
    }

    //steep line: same pattern as the shallow rule with both indices swapped
    template <class OutputMatrix>
    static void blendLineSteep(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<1, 4>(out.template ref<0, 3>(), col);
        alphaGrad<1, 4>(out.template ref<2, 2>(), col);

        alphaGrad<3, 4>(out.template ref<1, 3>(), col);
        alphaGrad<3, 4>(out.template ref<3, 2>(), col);

        out.template ref<2, 3>() = col;
        out.template ref<3, 3>() = col;
    }

    //steep and shallow line at once
    template <class OutputMatrix>
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<3, 4>(out.template ref<3, 1>(), col);
        alphaGrad<3, 4>(out.template ref<1, 3>(), col);
        alphaGrad<1, 4>(out.template ref<3, 0>(), col);
        alphaGrad<1, 4>(out.template ref<0, 3>(), col);

        alphaGrad<1, 3>(out.template ref<2, 2>(), col); //[!] fixes 1/4 used in xBR

        out.template ref<3, 3>() = col;
        out.template ref<3, 2>() = col;
        out.template ref<2, 3>() = col;
    }

    //diagonal line: two half-blended cells, cell <3, 3> takes the full color
    template <class OutputMatrix>
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<1, 2>(out.template ref<3, 2>(), col);
        alphaGrad<1, 2>(out.template ref<2, 3>(), col);
        out.template ref<3, 3>() = col;
    }

    //corner: model a round corner
    template <class OutputMatrix>
    static void blendCorner(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<68, 100>(out.template ref<3, 3>(), col); //exact: 0.6848532563
        alphaGrad< 9, 100>(out.template ref<3, 2>(), col); //exact: 0.08677704501
        alphaGrad< 9, 100>(out.template ref<2, 3>(), col); //exact: 0.08677704501
    }
};
954
 
955
 
956
//Blend rules for scale factor 5.
//Every rule receives the color "col" and an output accessor "out"; cells are addressed at
//compile time through out.template ref<I, J>() and are either mixed with "col" by
//alphaGrad<M, N> or overwritten with it.
//Indices are written out literally here (scale - 1 == 4, scale - 2 == 3, scale - 3 == 2, scale / 2 == 2).
//NOTE(review): the <M, N> pairs look like blend fractions M/N (see the "exact" values quoted
//in blendCorner) - confirm against ColorGradient::alphaGrad.
template <class ColorGradient> struct Scaler5x : public ColorGradient
{
    static const int scale = 5;

    //forwarder: brings the base-class template function into scope for GCC
    template <unsigned int M, unsigned int N>
    static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
    {
        ColorGradient::template alphaGrad<M, N>(pixBack, pixFront);
    }


    //shallow line: five gradient cells, four cells take the full color
    template <class OutputMatrix>
    static void blendLineShallow(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<1, 4>(out.template ref<4, 0>(), col);
        alphaGrad<1, 4>(out.template ref<3, 2>(), col);
        alphaGrad<1, 4>(out.template ref<2, 4>(), col);

        alphaGrad<3, 4>(out.template ref<4, 1>(), col);
        alphaGrad<3, 4>(out.template ref<3, 3>(), col);

        out.template ref<4, 2>() = col;
        out.template ref<4, 3>() = col;
        out.template ref<4, 4>() = col;
        out.template ref<3, 4>() = col;
    }

    //steep line: same pattern as the shallow rule with both indices swapped
    template <class OutputMatrix>
    static void blendLineSteep(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<1, 4>(out.template ref<0, 4>(), col);
        alphaGrad<1, 4>(out.template ref<2, 3>(), col);
        alphaGrad<1, 4>(out.template ref<4, 2>(), col);

        alphaGrad<3, 4>(out.template ref<1, 4>(), col);
        alphaGrad<3, 4>(out.template ref<3, 3>(), col);

        out.template ref<2, 4>() = col;
        out.template ref<3, 4>() = col;
        out.template ref<4, 4>() = col;
        out.template ref<4, 3>() = col;
    }

    //steep and shallow line at once
    template <class OutputMatrix>
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
    {
        //gradient cells shared with the steep rule
        alphaGrad<1, 4>(out.template ref<0, 4>(), col);
        alphaGrad<1, 4>(out.template ref<2, 3>(), col);
        alphaGrad<3, 4>(out.template ref<1, 4>(), col);

        //gradient cells shared with the shallow rule
        alphaGrad<1, 4>(out.template ref<4, 0>(), col);
        alphaGrad<1, 4>(out.template ref<3, 2>(), col);
        alphaGrad<3, 4>(out.template ref<4, 1>(), col);

        alphaGrad<2, 3>(out.template ref<3, 3>(), col);

        //cells taking the full color
        out.template ref<2, 4>() = col;
        out.template ref<3, 4>() = col;
        out.template ref<4, 4>() = col;

        out.template ref<4, 2>() = col;
        out.template ref<4, 3>() = col;
    }

    //diagonal line
    template <class OutputMatrix>
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
    {
        //these three cells conflict with other rotations for this odd scale
        alphaGrad<1, 8>(out.template ref<4, 2>(), col);
        alphaGrad<1, 8>(out.template ref<3, 3>(), col);
        alphaGrad<1, 8>(out.template ref<2, 4>(), col);

        alphaGrad<7, 8>(out.template ref<4, 3>(), col);
        alphaGrad<7, 8>(out.template ref<3, 4>(), col);

        out.template ref<4, 4>() = col;
    }

    //corner: model a round corner
    template <class OutputMatrix>
    static void blendCorner(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<86, 100>(out.template ref<4, 4>(), col); //exact: 0.8631434088
        alphaGrad<23, 100>(out.template ref<4, 3>(), col); //exact: 0.2306749731
        alphaGrad<23, 100>(out.template ref<3, 4>(), col); //exact: 0.2306749731

        //deliberately disabled: cells <4, 2> and <2, 4> would receive 1/64 (exact: 0.01676812367)
        //-> negligible + avoid conflicts with other rotations for this odd scale
        //alphaGrad<1, 64>(out.template ref<4, 2>(), col);
        //alphaGrad<1, 64>(out.template ref<2, 4>(), col);
    }
};
1041
 
1042
 
1043
//Blend rules for scale factor 6.
//Every rule receives the color "col" and an output accessor "out"; cells are addressed at
//compile time through out.template ref<I, J>() and are either mixed with "col" by
//alphaGrad<M, N> or overwritten with it.
//Indices are written out literally here (scale - 1 == 5, scale - 2 == 4, scale - 3 == 3, scale / 2 == 3).
//NOTE(review): the <M, N> pairs look like blend fractions M/N (see the "exact" values quoted
//in blendCorner) - confirm against ColorGradient::alphaGrad.
template <class ColorGradient> struct Scaler6x : public ColorGradient
{
    static const int scale = 6;

    //forwarder: brings the base-class template function into scope for GCC
    template <unsigned int M, unsigned int N>
    static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
    {
        ColorGradient::template alphaGrad<M, N>(pixBack, pixFront);
    }


    //shallow line: six gradient cells, six cells take the full color
    template <class OutputMatrix>
    static void blendLineShallow(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<1, 4>(out.template ref<5, 0>(), col);
        alphaGrad<1, 4>(out.template ref<4, 2>(), col);
        alphaGrad<1, 4>(out.template ref<3, 4>(), col);

        alphaGrad<3, 4>(out.template ref<5, 1>(), col);
        alphaGrad<3, 4>(out.template ref<4, 3>(), col);
        alphaGrad<3, 4>(out.template ref<3, 5>(), col);

        out.template ref<5, 2>() = col;
        out.template ref<5, 3>() = col;
        out.template ref<5, 4>() = col;
        out.template ref<5, 5>() = col;

        out.template ref<4, 4>() = col;
        out.template ref<4, 5>() = col;
    }

    //steep line: same pattern as the shallow rule with both indices swapped
    template <class OutputMatrix>
    static void blendLineSteep(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<1, 4>(out.template ref<0, 5>(), col);
        alphaGrad<1, 4>(out.template ref<2, 4>(), col);
        alphaGrad<1, 4>(out.template ref<4, 3>(), col);

        alphaGrad<3, 4>(out.template ref<1, 5>(), col);
        alphaGrad<3, 4>(out.template ref<3, 4>(), col);
        alphaGrad<3, 4>(out.template ref<5, 3>(), col);

        out.template ref<2, 5>() = col;
        out.template ref<3, 5>() = col;
        out.template ref<4, 5>() = col;
        out.template ref<5, 5>() = col;

        out.template ref<4, 4>() = col;
        out.template ref<5, 4>() = col;
    }

    //steep and shallow line at once
    template <class OutputMatrix>
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
    {
        //gradient cells shared with the steep rule
        alphaGrad<1, 4>(out.template ref<0, 5>(), col);
        alphaGrad<1, 4>(out.template ref<2, 4>(), col);
        alphaGrad<3, 4>(out.template ref<1, 5>(), col);
        alphaGrad<3, 4>(out.template ref<3, 4>(), col);

        //gradient cells shared with the shallow rule
        alphaGrad<1, 4>(out.template ref<5, 0>(), col);
        alphaGrad<1, 4>(out.template ref<4, 2>(), col);
        alphaGrad<3, 4>(out.template ref<5, 1>(), col);
        alphaGrad<3, 4>(out.template ref<4, 3>(), col);

        //cells taking the full color
        out.template ref<2, 5>() = col;
        out.template ref<3, 5>() = col;
        out.template ref<4, 5>() = col;
        out.template ref<5, 5>() = col;

        out.template ref<4, 4>() = col;
        out.template ref<5, 4>() = col;

        out.template ref<5, 2>() = col;
        out.template ref<5, 3>() = col;
    }

    //diagonal line: three half-blended cells, three cells take the full color
    template <class OutputMatrix>
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<1, 2>(out.template ref<5, 3>(), col);
        alphaGrad<1, 2>(out.template ref<4, 4>(), col);
        alphaGrad<1, 2>(out.template ref<3, 5>(), col);

        out.template ref<4, 5>() = col;
        out.template ref<5, 5>() = col;
        out.template ref<5, 4>() = col;
    }

    //corner: model a round corner
    template <class OutputMatrix>
    static void blendCorner(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<97, 100>(out.template ref<5, 5>(), col); //exact: 0.9711013910
        alphaGrad<42, 100>(out.template ref<4, 5>(), col); //exact: 0.4236372243
        alphaGrad<42, 100>(out.template ref<5, 4>(), col); //exact: 0.4236372243
        alphaGrad< 6, 100>(out.template ref<5, 3>(), col); //exact: 0.05652034508
        alphaGrad< 6, 100>(out.template ref<3, 5>(), col); //exact: 0.05652034508
    }
};
1139
 
1140
        //------------------------------------------------------------------------------------
1141
        struct ColorDistanceRGB
1142
        {
1143
            static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
1144
            {
1145
                return distYCbCrBuffered(pix1, pix2);
1146
 
1147
                //if (pix1 == pix2) //about 4% perf boost
1148
                //    return 0;
1149
                //return distYCbCr(pix1, pix2, luminanceWeight);
1150
            }
1151
        };
1152
 
1153
        struct ColorDistanceARGB
1154
        {
1155
            static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
1156
            {
1157
                const double a1 = GET_ALPHA (pix1) / 255.0 ;
1158
                const double a2 = GET_ALPHA (pix2) / 255.0 ;
1159
                /*
1160
                Requirements for a color distance handling alpha channel: with a1, a2 in [0, 1]
1161
 
1162
                    1. if a1 = a2, distance should be: a1 * distYCbCr()
1163
                    2. if a1 = 0,  distance should be: a2 * distYCbCr(black, white) = a2 * 255
1164
                    3. if a1 = 1,  ??? maybe: 255 * (1 - a2) + a2 * distYCbCr()
1165
                */
1166
 
1167
                //return std::min(a1, a2) * distYCbCrBuffered(pix1, pix2) + 255 * abs(a1 - a2);
1168
                //=> following code is 15% faster:
1169
                const double d = distYCbCrBuffered(pix1, pix2);
1170
                if (a1 < a2)
1171
                    return a1 * d + 255 * (a2 - a1);
1172
                else
1173
                    return a2 * d + 255 * (a1 - a2);
1174
 
1175
                //alternative? return /*std::*/sqrt(a1 * a2 * square(distYCbCrBuffered(pix1, pix2)) + square(255 * (a1 - a2)));
1176
            }
1177
        };
1178
 
1179
        struct ColorGradientRGB
1180
        {
1181
            template <unsigned int M, unsigned int N> static void alphaGrad (uint32_t &pixBack, uint32_t pixFront)
1182
            {
1183
                pixBack = gradientRGB<M, N> (pixFront, pixBack);
1184
            }
1185
        };
1186
 
1187
        struct ColorGradientARGB
1188
        {
1189
            template <unsigned int M, unsigned int N> static void alphaGrad (uint32_t &pixBack, uint32_t pixFront)
1190
            {
1191
                pixBack = gradientARGB<M, N> (pixFront, pixBack);
1192
            }
1193
        };
1194
}
1195
 
1196
 
1197
 
1198
void xbrz::nearestNeighborScale(const uint32_t* src, int srcWidth, int srcHeight, uint32_t* trg, int trgWidth, int trgHeight)
1199
{
1200
    nearestNeighborScale (src, srcWidth, srcHeight, srcWidth * sizeof (uint32_t), trg, trgWidth, trgHeight, trgWidth * sizeof (uint32_t), XBRZ_SLICETYPE_TARGET, 0, trgHeight, [](uint32_t pix) { return pix; });
1201
}
1202
 
1203
 
1204
//C entry point: true when the ColorDistanceRGB distance between col1 and col2
//is strictly below equalColorTolerance.
EXTERN_C bool xbrz_equalcolortest24 (uint32_t col1, uint32_t col2, double luminanceWeight, double equalColorTolerance)
{
    const double distance = ColorDistanceRGB::dist(col1, col2, luminanceWeight);
    return distance < equalColorTolerance;
}
1208
 
1209
 
1210
//C entry point: true when the ColorDistanceARGB distance between col1 and col2
//is strictly below equalColorTolerance.
EXTERN_C bool xbrz_equalcolortest32 (uint32_t col1, uint32_t col2, double luminanceWeight, double equalColorTolerance)
{
    const double distance = ColorDistanceARGB::dist(col1, col2, luminanceWeight);
    return distance < equalColorTolerance;
}
1214
 
1215
 
1216
//C entry point: scales src into trg by "factor" using the RGB gradient and distance policies.
//Supported factors are 2..6; any other value returns without calling scaleImage().
//The trailing arguments (0, srcHeight) presumably select the full row range - confirm against scaleImage().
EXTERN_C void xbrz_scale24 (size_t factor, const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight)
{
    switch (factor)
    {
        case 2:
            scaleImage<Scaler2x<ColorGradientRGB>, ColorDistanceRGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
            break;
        case 3:
            scaleImage<Scaler3x<ColorGradientRGB>, ColorDistanceRGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
            break;
        case 4:
            scaleImage<Scaler4x<ColorGradientRGB>, ColorDistanceRGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
            break;
        case 5:
            scaleImage<Scaler5x<ColorGradientRGB>, ColorDistanceRGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
            break;
        case 6:
            scaleImage<Scaler6x<ColorGradientRGB>, ColorDistanceRGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
            break;
        default: //unsupported factor: nothing to do
            break;
    }
}
1224
 
1225
 
1226
//C entry point: scales src into trg by "factor" using the ARGB gradient and distance policies.
//Supported factors are 2..6; any other value returns without calling scaleImage().
//The trailing arguments (0, srcHeight) presumably select the full row range - confirm against scaleImage().
EXTERN_C void xbrz_scale32 (size_t factor, const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight)
{
    switch (factor)
    {
        case 2:
            scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceARGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
            break;
        case 3:
            scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceARGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
            break;
        case 4:
            scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceARGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
            break;
        case 5:
            scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceARGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
            break;
        case 6:
            scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceARGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
            break;
        default: //unsupported factor: nothing to do
            break;
    }
}