Subversion Repositories Games.Prince of Persia

Rev

Rev 2 | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 2 Rev 3
Line 11... Line 11...
11
// * If you modify this file, you may extend this exception to your version   *
11
// * If you modify this file, you may extend this exception to your version   *
12
// * of the file, but you are not obligated to do so. If you do not wish to   *
12
// * of the file, but you are not obligated to do so. If you do not wish to   *
13
// * do so, delete this exception statement from your version.                *
13
// * do so, delete this exception statement from your version.                *
14
// ****************************************************************************
14
// ****************************************************************************
15
 
15
 
-
 
16
// -------------------------------------------------------------------------
-
 
17
// | xBRZ: "Scale by rules" - high quality image upscaling filter by Zenju |
-
 
18
// -------------------------------------------------------------------------
-
 
19
// using a modified approach of xBR:
-
 
20
// http://board.byuu.org/viewtopic.php?f=10&t=2248
-
 
21
//  - new rule set preserving small image features
-
 
22
//  - highly optimized for performance
-
 
23
//  - support alpha channel
-
 
24
//  - support multithreading
-
 
25
//  - support 64-bit architectures
-
 
26
//  - support processing image slices
-
 
27
//  - support scaling up to 6xBRZ
16
 
28
 
-
 
29
// -> map source (srcWidth * srcHeight) to target (scale * width x scale * height) image, optionally processing a half-open slice of rows [yFirst, yLast) only
17
#include <cstddef> //size_t
30
// -> support for source/target pitch in bytes!
-
 
31
// -> if your emulator changes only a few image slices during each cycle (e.g. DOSBox) then there's no need to run xBRZ on the complete image:
-
 
32
//    Just make sure you enlarge the source image slice by 2 rows on top and 2 on bottom (this is the additional range the xBRZ algorithm is using during analysis)
-
 
33
//    CAVEAT: If there are multiple changed slices, make sure they do not overlap after adding these additional rows in order to avoid a memory race condition
18
#include <cstdint> //uint32_t
34
//    in the target image data if you are using multiple threads for processing each enlarged slice!
19
#include <limits>
35
// 
-
 
36
// THREAD-SAFETY: - parts of the same image may be scaled by multiple threads as long as the [yFirst, yLast) ranges do not overlap!
-
 
37
//                - there is a minor inefficiency for the first row of a slice, so avoid processing single rows only; suggestion: process at least 8-16 rows
-
 
38
 
-
 
39
 
20
#include <cassert>
40
#include <stddef.h> // for size_t
21
#include <algorithm>
41
#include <stdint.h> // for uint32_t
22
#include <type_traits>
42
#include <memory.h> // for memset()
23
#include <vector>
43
#include <limits.h>
24
#include <math.h>
44
#include <math.h>
25
 
45
 
26
 
46
 
27
#ifdef __cplusplus
47
#ifdef __cplusplus
28
#define EXTERN_C extern "C"
48
#define EXTERN_C extern "C"
Line 47... Line 67...
47
#define GET_BYTE(val,byteno) ((unsigned char) (((val) >> ((byteno) << 3)) & 0xff))
67
#define GET_BYTE(val,byteno) ((unsigned char) (((val) >> ((byteno) << 3)) & 0xff))
48
#define GET_BLUE(val)  GET_BYTE (val, 0)
68
#define GET_BLUE(val)  GET_BYTE (val, 0)
49
#define GET_GREEN(val) GET_BYTE (val, 1)
69
#define GET_GREEN(val) GET_BYTE (val, 1)
50
#define GET_RED(val)   GET_BYTE (val, 2)
70
#define GET_RED(val)   GET_BYTE (val, 2)
51
#define GET_ALPHA(val) GET_BYTE (val, 3)
71
#define GET_ALPHA(val) GET_BYTE (val, 3)
52
//inline uint32_t rgb555to888(uint16_t pix) { return ((pix & 0x7C00) << 9) | ((pix & 0x03E0) << 6) | ((pix & 0x001F) << 3); }
72
#define CALC_COLOR24(colFront,colBack,M,N) (unsigned char) ((((unsigned char) (colFront)) * ((unsigned int) (M)) + ((unsigned char) (colBack)) * (((unsigned int) (N)) - ((unsigned int) (M)))) / ((unsigned int) (N)))
53
//inline uint32_t rgb565to888(uint16_t pix) { return ((pix & 0xF800) << 8) | ((pix & 0x07E0) << 5) | ((pix & 0x001F) << 3); }
73
#define CALC_COLOR32(colFront,colBack,weightFront,weightBack,weightSum) ((unsigned char) ((((unsigned char) (colFront)) * ((unsigned int) (weightFront)) + ((unsigned char) (colBack)) * ((unsigned int) (weightBack))) / ((unsigned int) (weightSum))))
54
//inline uint16_t rgb888to555(uint32_t pix) { return static_cast<uint16_t>(((pix & 0xF80000) >> 9) | ((pix & 0x00F800) >> 6) | ((pix & 0x0000F8) >> 3)); }
74
// Yields `buffer` viewed as a char* and moved forward by `offset` bytes.
// Both macro arguments are wrapped in parentheses so that an expression argument
// (e.g. `ptr + 1`) is evaluated as a whole BEFORE the cast to char* is applied;
// otherwise the cast would bind to `ptr` alone and the `+ 1` would count bytes.
#define BYTE_ADVANCE(buffer,offset) (((char *) (buffer)) + (offset))
-
 
75
#ifndef MIN
-
 
76
#define MIN(a,b) ((a) < (b) ? (a) : (b))
-
 
77
#endif // MIN
-
 
78
#ifndef MAX
55
//inline uint16_t rgb888to565(uint32_t pix) { return static_cast<uint16_t>(((pix & 0xF80000) >> 8) | ((pix & 0x00FC00) >> 5) | ((pix & 0x0000F8) >> 3)); }
79
#define MAX(a,b) ((a) > (b) ? (a) : (b))
-
 
80
#endif // MAX
56
 
81
 
57
 
82
 
58
namespace xbrz
-
 
59
{
-
 
60
        // -------------------------------------------------------------------------
-
 
61
        // | xBRZ: "Scale by rules" - high quality image upscaling filter by Zenju |
-
 
62
        // -------------------------------------------------------------------------
-
 
63
        // using a modified approach of xBR:
-
 
64
        // http://board.byuu.org/viewtopic.php?f=10&t=2248
-
 
65
        //  - new rule set preserving small image features
-
 
66
        //  - highly optimized for performance
-
 
67
        //  - support alpha channel
-
 
68
        //  - support multithreading
-
 
69
        //  - support 64-bit architectures
-
 
70
        //  - support processing image slices
-
 
71
        //  - support scaling up to 6xBRZ
-
 
72
 
-
 
73
        // -> map source (srcWidth * srcHeight) to target (scale * width x scale * height) image, optionally processing a half-open slice of rows [yFirst, yLast) only
-
 
74
        // -> support for source/target pitch in bytes!
-
 
75
        // -> if your emulator changes only a few image slices during each cycle (e.g. DOSBox) then there's no need to run xBRZ on the complete image:
-
 
76
        //    Just make sure you enlarge the source image slice by 2 rows on top and 2 on bottom (this is the additional range the xBRZ algorithm is using during analysis)
-
 
77
        //    CAVEAT: If there are multiple changed slices, make sure they do not overlap after adding these additional rows in order to avoid a memory race condition
-
 
78
        //    in the target image data if you are using multiple threads for processing each enlarged slice!
-
 
79
        // 
-
 
80
        // THREAD-SAFETY: - parts of the same image may be scaled by multiple threads as long as the [yFirst, yLast) ranges do not overlap!
-
 
81
        //                - there is a minor inefficiency for the first row of a slice, so avoid processing single rows only; suggestion: process at least 8-16 rows
-
 
82
 
-
 
83
        void nearestNeighborScale(const uint32_t* src, int srcWidth, int srcHeight, uint32_t* trg, int trgWidth, int trgHeight);
83
typedef void (alphagrad_func) (uint32_t *pixBack, uint32_t pixFront, unsigned int M, unsigned int N);
84
 
-
 
85
 
-
 
86
        template <class Pix> inline Pix* byteAdvance(Pix* ptr, int bytes)
-
 
87
        {
-
 
88
            using PixNonConst = typename std::remove_cv<Pix>::type;
-
 
89
            using PixByte     = typename std::conditional<std::is_same<Pix, PixNonConst>::value, char, const char>::type;
-
 
90
 
-
 
91
            static_assert(std::is_integral<PixNonConst>::value, "Pix* is expected to be cast-able to char*");
-
 
92
 
-
 
93
            return reinterpret_cast<Pix*>(reinterpret_cast<PixByte*>(ptr) + bytes);
-
 
94
        }
-
 
95
 
-
 
96
 
-
 
97
//fill a block of blockWidth x blockHeight pixels with the given color
-
 
98
template <class Pix> inline void fillBlock(Pix* trg, int pitch, Pix col, int blockWidth, int blockHeight)
-
 
99
{
-
 
100
    //for (int y = 0; y < blockHeight; ++y, trg = byteAdvance(trg, pitch))
-
 
101
    //    std::fill(trg, trg + blockWidth, col);
-
 
102
 
-
 
103
    for (int y = 0; y < blockHeight; ++y, trg = byteAdvance(trg, pitch))
-
 
104
        for (int x = 0; x < blockWidth; ++x)
-
 
105
            trg[x] = col;
-
 
106
}
-
 
107
 
-
 
108
 
-
 
109
template <class PixSrc, class PixTrg, class PixConverter>
-
 
110
void nearestNeighborScale(const PixSrc* src, int srcWidth, int srcHeight, int srcPitch,
-
 
111
                          /**/  PixTrg* trg, int trgWidth, int trgHeight, int trgPitch,
-
 
112
                          int slice_type, int yFirst, int yLast, PixConverter pixCvrt /*convert PixSrc to PixTrg*/)
-
 
113
{
-
 
114
    static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*");
-
 
115
    static_assert(std::is_integral<PixTrg>::value, "PixTrg* is expected to be cast-able to char*");
-
 
116
    static_assert(std::is_same<decltype(pixCvrt(PixSrc())), PixTrg>::value, "PixConverter returning wrong pixel format");
-
 
117
 
-
 
118
    if (srcPitch < srcWidth * static_cast<int>(sizeof(PixSrc))  ||
-
 
119
        trgPitch < trgWidth * static_cast<int>(sizeof(PixTrg)))
-
 
120
    {
-
 
121
        assert(false);
-
 
122
        return;
-
 
123
    }
-
 
124
 
-
 
125
    if (slice_type == XBRZ_SLICETYPE_SOURCE)
-
 
126
    {
-
 
127
            //nearest-neighbor (going over source image - fast for upscaling, since source is read only once)
-
 
128
            yFirst = std::max(yFirst, 0);
-
 
129
            yLast  = std::min(yLast, srcHeight);
-
 
130
            if (yFirst >= yLast || trgWidth <= 0 || trgHeight <= 0) return;
-
 
131
 
-
 
132
            for (int y = yFirst; y < yLast; ++y)
-
 
133
            {
-
 
134
                //mathematically: ySrc = floor(srcHeight * yTrg / trgHeight)
-
 
135
                // => search for integers in: [ySrc, ySrc + 1) * trgHeight / srcHeight
-
 
136
 
-
 
137
                //keep within for loop to support MT input slices!
-
 
138
                const int yTrg_first = ( y      * trgHeight + srcHeight - 1) / srcHeight; //=ceil(y * trgHeight / srcHeight)
-
 
139
                const int yTrg_last  = ((y + 1) * trgHeight + srcHeight - 1) / srcHeight; //=ceil(((y + 1) * trgHeight) / srcHeight)
-
 
140
                const int blockHeight = yTrg_last - yTrg_first;
-
 
141
 
-
 
142
                if (blockHeight > 0)
-
 
143
                {
-
 
144
                    const PixSrc* srcLine = byteAdvance(src, y * srcPitch);
-
 
145
                    /**/  PixTrg* trgLine = byteAdvance(trg, yTrg_first * trgPitch);
-
 
146
                    int xTrg_first = 0;
-
 
147
 
-
 
148
                    for (int x = 0; x < srcWidth; ++x)
84
typedef double (dist_func) (uint32_t pix1, uint32_t pix2);
149
                    {
-
 
150
                        const int xTrg_last = ((x + 1) * trgWidth + srcWidth - 1) / srcWidth;
-
 
151
                        const int blockWidth = xTrg_last - xTrg_first;
-
 
152
                        if (blockWidth > 0)
-
 
153
                        {
-
 
154
                            xTrg_first = xTrg_last;
-
 
155
 
-
 
156
                            const auto trgPix = pixCvrt(srcLine[x]);
-
 
157
                            fillBlock(trgLine, trgPitch, trgPix, blockWidth, blockHeight);
-
 
158
                            trgLine += blockWidth;
-
 
159
                        }
-
 
160
                    }
-
 
161
                }
-
 
162
            }
-
 
163
    }
-
 
164
    else if (slice_type == XBRZ_SLICETYPE_TARGET)
-
 
165
    {
-
 
166
            //nearest-neighbor (going over target image - slow for upscaling, since source is read multiple times missing out on cache! Fast for similar image sizes!)
-
 
167
            yFirst = std::max(yFirst, 0);
-
 
168
            yLast  = std::min(yLast, trgHeight);
-
 
169
            if (yFirst >= yLast || srcHeight <= 0 || srcWidth <= 0) return;
-
 
170
 
-
 
171
            for (int y = yFirst; y < yLast; ++y)
-
 
172
            {
-
 
173
                PixTrg* trgLine = byteAdvance(trg, y * trgPitch);
-
 
174
                const int ySrc = srcHeight * y / trgHeight;
-
 
175
                const PixSrc* srcLine = byteAdvance(src, ySrc * srcPitch);
-
 
176
                for (int x = 0; x < trgWidth; ++x)
-
 
177
                {
-
 
178
                    const int xSrc = srcWidth * x / trgWidth;
-
 
179
                    trgLine[x] = pixCvrt(srcLine[xSrc]);
-
 
180
                }
-
 
181
            }
-
 
182
    }
-
 
183
}
-
 
184
}
-
 
185
 
85
 
186
 
86
 
187
 
87
 
188
 
88
 
189
namespace
89
namespace
190
{
90
{
191
template <unsigned int M, unsigned int N> inline
-
 
192
uint32_t gradientRGB(uint32_t pixFront, uint32_t pixBack) //blend front color with opacity M / N over opaque background: http://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending
-
 
193
{
-
 
194
    static_assert(0 < M && M < N && N <= 1000, "");
-
 
195
 
-
 
196
    auto calcColor = [](unsigned char colFront, unsigned char colBack) -> unsigned char { return (colFront * M + colBack * (N - M)) / N; };
-
 
197
 
-
 
198
        return ((calcColor (GET_RED   (pixFront), GET_RED   (pixBack)) << 16)
-
 
199
              | (calcColor (GET_GREEN (pixFront), GET_GREEN (pixBack)) <<  8)
-
 
200
              | (calcColor (GET_BLUE  (pixFront), GET_BLUE  (pixBack)) <<  0));
-
 
201
}
-
 
202
 
-
 
203
 
-
 
204
template <unsigned int M, unsigned int N> inline
-
 
205
uint32_t gradientARGB(uint32_t pixFront, uint32_t pixBack) //find intermediate color between two colors with alpha channels (=> NO alpha blending!!!)
-
 
206
{
-
 
207
    static_assert(0 < M && M < N && N <= 1000, "");
-
 
208
 
-
 
209
    const unsigned int weightFront = GET_ALPHA (pixFront) * M;
-
 
210
    const unsigned int weightBack  = GET_ALPHA (pixBack) * (N - M);
-
 
211
    const unsigned int weightSum   = weightFront + weightBack;
-
 
212
    if (weightSum == 0)
-
 
213
        return 0;
-
 
214
 
-
 
215
    auto calcColor = [=](unsigned char colFront, unsigned char colBack)
-
 
216
    {
-
 
217
        return static_cast<unsigned char>((colFront * weightFront + colBack * weightBack) / weightSum);
-
 
218
    };
-
 
219
 
-
 
220
        return (((unsigned char) (weightSum / N))                      << 24)
-
 
221
              | (calcColor (GET_RED   (pixFront), GET_RED   (pixBack)) << 16)
-
 
222
              | (calcColor (GET_GREEN (pixFront), GET_GREEN (pixBack)) <<  8)
-
 
223
              | (calcColor (GET_BLUE  (pixFront), GET_BLUE  (pixBack)) <<  0);
-
 
224
}
-
 
225
 
-
 
226
 
-
 
227
//inline
-
 
228
//double fastSqrt(double n)
-
 
229
//{
-
 
230
//    __asm //speeds up xBRZ by about 9% compared to /*std::*/sqrt which internally uses the same assembler instructions but adds some "fluff"
-
 
231
//    {
-
 
232
//        fld n
-
 
233
//        fsqrt
-
 
234
//    }
-
 
235
//}
-
 
236
//
-
 
237
 
-
 
238
 
-
 
239
#ifdef _MSC_VER
91
#ifdef _MSC_VER
240
    #define FORCE_INLINE __forceinline
92
    #define FORCE_INLINE __forceinline
241
#elif defined __GNUC__
93
#elif defined __GNUC__
242
    #define FORCE_INLINE __attribute__((always_inline)) inline
94
    #define FORCE_INLINE __attribute__((always_inline)) inline
243
#else
95
#else
Line 245... Line 97...
245
#endif
97
#endif
246
 
98
 
247
 
99
 
248
enum RotationDegree //clock-wise
100
enum RotationDegree //clock-wise
249
{
101
{
250
    ROT_0,
102
    ROT_0 = 0,
251
    ROT_90,
103
    ROT_90,
252
    ROT_180,
104
    ROT_180,
253
    ROT_270
105
    ROT_270
254
};
106
};
-
 
107
 
255
 
108
 
256
//calculate input matrix coordinates after rotation at compile time
109
//calculate input matrix coordinates after rotation at compile time
257
template <RotationDegree rotDeg, size_t I, size_t J, size_t N>
110
template <RotationDegree rotDeg, size_t I, size_t J, size_t N> struct MatrixRotation;
258
struct MatrixRotation;
-
 
-
 
111
 
259
 
112
 
260
template <size_t I, size_t J, size_t N>
113
template <size_t I, size_t J, size_t N> struct MatrixRotation<ROT_0, I, J, N>
261
struct MatrixRotation<ROT_0, I, J, N>
-
 
262
{
114
{
263
    static const size_t I_old = I;
115
    static const size_t I_old = I;
264
    static const size_t J_old = J;
116
    static const size_t J_old = J;
265
};
117
};
-
 
118
 
266
 
119
 
267
template <RotationDegree rotDeg, size_t I, size_t J, size_t N> //(i, j) = (row, col) indices, N = size of (square) matrix
120
template <RotationDegree rotDeg, size_t I, size_t J, size_t N> //(i, j) = (row, col) indices, N = size of (square) matrix
268
struct MatrixRotation
121
struct MatrixRotation
269
{
122
{
270
    static const size_t I_old = N - 1 - MatrixRotation<static_cast<RotationDegree>(rotDeg - 1), I, J, N>::J_old; //old coordinates before rotation!
123
    static const size_t I_old = N - 1 - MatrixRotation<(RotationDegree)(rotDeg - 1), I, J, N>::J_old; //old coordinates before rotation!
271
    static const size_t J_old =         MatrixRotation<static_cast<RotationDegree>(rotDeg - 1), I, J, N>::I_old; //
124
    static const size_t J_old =         MatrixRotation<(RotationDegree)(rotDeg - 1), I, J, N>::I_old; //
272
};
125
};
273
 
126
 
274
 
127
 
275
template <size_t N, RotationDegree rotDeg>
128
template <size_t N, RotationDegree rotDeg> class OutputMatrix
276
class OutputMatrix
-
 
277
{
129
{
278
public:
130
public:
279
    OutputMatrix(uint32_t* out, int outWidth) : //access matrix area, top-left at position "out" for image with given width
131
    OutputMatrix (uint32_t *out, int outWidth) //access matrix area, top-left at position "out" for image with given width
-
 
132
    {
280
        out_(out),
133
        out_ = out;
281
        outWidth_(outWidth) {}
134
        outWidth_ = outWidth;
-
 
135
    }
282
 
136
 
283
    template <size_t I, size_t J>
137
    template <size_t I, size_t J> uint32_t &ref() const
284
    uint32_t& ref() const
-
 
285
    {
138
    {
286
        static const size_t I_old = MatrixRotation<rotDeg, I, J, N>::I_old;
139
        static const size_t I_old = MatrixRotation<rotDeg, I, J, N>::I_old;
287
        static const size_t J_old = MatrixRotation<rotDeg, I, J, N>::J_old;
140
        static const size_t J_old = MatrixRotation<rotDeg, I, J, N>::J_old;
-
 
141
 
288
        return *(out_ + J_old + I_old * outWidth_);
142
        return *(out_ + J_old + I_old * outWidth_);
289
    }
143
    }
290
 
144
 
291
private:
-
 
292
    uint32_t* out_;
145
    uint32_t* out_;
293
    const int outWidth_;
146
    int outWidth_;
294
};
147
};
295
 
148
 
296
 
149
 
297
template <class T> inline
-
 
298
T square(T value) { return value * value; }
-
 
299
 
-
 
300
 
-
 
301
 
-
 
302
inline
-
 
303
double distRGB(uint32_t pix1, uint32_t pix2)
-
 
304
{
-
 
305
    const double r_diff = static_cast<int>(GET_RED   (pix1)) - GET_RED   (pix2);
-
 
306
    const double g_diff = static_cast<int>(GET_GREEN (pix1)) - GET_GREEN (pix2);
-
 
307
    const double b_diff = static_cast<int>(GET_BLUE  (pix1)) - GET_BLUE  (pix2);
-
 
308
 
-
 
309
    //Euclidean RGB distance
-
 
310
    return /*std::*/sqrt(square(r_diff) + square(g_diff) + square(b_diff));
-
 
311
}
-
 
312
 
-
 
313
 
-
 
314
inline
-
 
315
double distYCbCr(uint32_t pix1, uint32_t pix2, double lumaWeight)
-
 
316
{
-
 
317
    //http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
-
 
318
    //YCbCr conversion is a matrix multiplication => take advantage of linearity by subtracting first!
-
 
319
    const int r_diff = static_cast<int>(GET_RED   (pix1)) - GET_RED   (pix2); //we may delay division by 255 to after matrix multiplication
-
 
320
    const int g_diff = static_cast<int>(GET_GREEN (pix1)) - GET_GREEN (pix2); //
-
 
321
    const int b_diff = static_cast<int>(GET_BLUE  (pix1)) - GET_BLUE  (pix2); //substraction for int is noticeable faster than for double!
-
 
322
 
-
 
323
    //const double k_b = 0.0722; //ITU-R BT.709 conversion
-
 
324
    //const double k_r = 0.2126; //
-
 
325
    const double k_b = 0.0593; //ITU-R BT.2020 conversion
-
 
326
    const double k_r = 0.2627; //
-
 
327
    const double k_g = 1 - k_b - k_r;
-
 
328
 
-
 
329
    const double scale_b = 0.5 / (1 - k_b);
-
 
330
    const double scale_r = 0.5 / (1 - k_r);
-
 
331
 
-
 
332
    const double y   = k_r * r_diff + k_g * g_diff + k_b * b_diff; //[!], analog YCbCr!
-
 
333
    const double c_b = scale_b * (b_diff - y);
-
 
334
    const double c_r = scale_r * (r_diff - y);
-
 
335
 
-
 
336
    //we skip division by 255 to have similar range like other distance functions
-
 
337
    return /*std::*/sqrt(square(lumaWeight * y) + square(c_b) + square(c_r));
-
 
338
}
-
 
339
 
-
 
340
 
-
 
341
inline double distYCbCrBuffered(uint32_t pix1, uint32_t pix2)
-
 
342
{
-
 
343
    //30% perf boost compared to plain distYCbCr()!
-
 
344
    //consumes 64 MB memory; using double is only 2% faster, but takes 128 MB
-
 
345
    static const std::vector<float> diffToDist = []
-
 
346
    {
-
 
347
        std::vector<float> tmp;
-
 
348
 
-
 
349
        for (uint32_t i = 0; i < 256 * 256 * 256; ++i) //startup time: 114 ms on Intel Core i5 (four cores)
-
 
350
        {
-
 
351
            const int r_diff = GET_RED (i) * 2 - 0xFF;
-
 
352
            const int g_diff = GET_GREEN (i) * 2 - 0xFF;
-
 
353
            const int b_diff = GET_BLUE (i) * 2 - 0xFF;
-
 
354
 
-
 
355
            const double k_b = 0.0593; //ITU-R BT.2020 conversion
-
 
356
            const double k_r = 0.2627; //
-
 
357
            const double k_g = 1 - k_b - k_r;
-
 
358
 
-
 
359
            const double scale_b = 0.5 / (1 - k_b);
-
 
360
            const double scale_r = 0.5 / (1 - k_r);
-
 
361
 
-
 
362
            const double y   = k_r * r_diff + k_g * g_diff + k_b * b_diff; //[!], analog YCbCr!
-
 
363
            const double c_b = scale_b * (b_diff - y);
-
 
364
            const double c_r = scale_r * (r_diff - y);
-
 
365
 
-
 
366
            tmp.push_back(static_cast<float>(/*std::*/sqrt(square(y) + square(c_b) + square(c_r))));
-
 
367
        }
-
 
368
        return tmp;
-
 
369
    }();
-
 
370
 
-
 
371
    //if (pix1 == pix2) -> 8% perf degradation!
-
 
372
    //    return 0;
-
 
373
    //if (pix1 < pix2)
-
 
374
    //    std::swap(pix1, pix2); -> 30% perf degradation!!!
-
 
375
#if 1
-
 
376
    const int r_diff = static_cast<int>(GET_RED   (pix1)) - GET_RED   (pix2);
-
 
377
    const int g_diff = static_cast<int>(GET_GREEN (pix1)) - GET_GREEN (pix2);
-
 
378
    const int b_diff = static_cast<int>(GET_BLUE  (pix1)) - GET_BLUE  (pix2);
-
 
379
 
-
 
380
    return diffToDist[(((r_diff + 0xFF) / 2) << 16) | //slightly reduce precision (division by 2) to squeeze value into single byte
-
 
381
                      (((g_diff + 0xFF) / 2) <<  8) |
-
 
382
                      (( b_diff + 0xFF) / 2)];
-
 
383
#else //not noticeably faster:
-
 
384
    const int r_diff_tmp = ((pix1 & 0xFF0000) + 0xFF0000 - (pix2 & 0xFF0000)) / 2;
-
 
385
    const int g_diff_tmp = ((pix1 & 0x00FF00) + 0x00FF00 - (pix2 & 0x00FF00)) / 2; //slightly reduce precision (division by 2) to squeeze value into single byte
-
 
386
    const int b_diff_tmp = ((pix1 & 0x0000FF) + 0x0000FF - (pix2 & 0x0000FF)) / 2;
-
 
387
 
-
 
388
    return diffToDist[(r_diff_tmp & 0xFF0000) | (g_diff_tmp & 0x00FF00) | (b_diff_tmp & 0x0000FF)];
-
 
389
#endif
-
 
390
}
-
 
391
 
150
 
392
 
151
 
393
enum BlendType
152
enum BlendType
394
{
153
{
395
    BLEND_NONE = 0,
154
    BLEND_NONE = 0,
Line 425... Line 184...
425
| I | J | K | L |
184
| I | J | K | L |
426
----|---|---|---|
185
----|---|---|---|
427
| M | N | O | P |
186
| M | N | O | P |
428
-----------------
187
-----------------
429
*/
188
*/
430
template <class ColorDistance>
-
 
431
FORCE_INLINE //detect blend direction
189
FORCE_INLINE //detect blend direction
432
BlendResult preProcessCorners(const Kernel_4x4& ker) //result: F, G, J, K corners of "GradientType"
190
BlendResult preProcessCorners(const Kernel_4x4& ker, dist_func dist) //result: F, G, J, K corners of "GradientType"
433
{
191
{
434
    BlendResult result = {};
192
    BlendResult result = {};
435
 
193
 
436
    if ((ker.f == ker.g &&
194
    if ((ker.f == ker.g &&
437
         ker.j == ker.k) ||
195
         ker.j == ker.k) ||
438
        (ker.f == ker.j &&
196
        (ker.f == ker.j &&
439
         ker.g == ker.k))
197
         ker.g == ker.k))
440
        return result;
198
        return result;
441
 
-
 
442
    auto dist = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, XBRZ_CFG_LUMINANCE_WEIGHT); };
-
 
443
 
199
 
444
    const int weight = 4;
200
    const int weight = 4;
445
    double jg = dist(ker.i, ker.f) + dist(ker.f, ker.c) + dist(ker.n, ker.k) + dist(ker.k, ker.h) + weight * dist(ker.j, ker.g);
201
    double jg = dist (ker.i, ker.f) + dist (ker.f, ker.c) + dist (ker.n, ker.k) + dist (ker.k, ker.h) + weight * dist (ker.j, ker.g);
446
    double fk = dist(ker.e, ker.j) + dist(ker.j, ker.o) + dist(ker.b, ker.g) + dist(ker.g, ker.l) + weight * dist(ker.f, ker.k);
202
    double fk = dist (ker.e, ker.j) + dist (ker.j, ker.o) + dist (ker.b, ker.g) + dist (ker.g, ker.l) + weight * dist (ker.f, ker.k);
447
 
203
 
448
    if (jg < fk) //test sample: 70% of values max(jg, fk) / min(jg, fk) are between 1.1 and 3.7 with median being 1.8
204
    if (jg < fk) //test sample: 70% of values max(jg, fk) / min(jg, fk) are between 1.1 and 3.7 with median being 1.8
449
    {
205
    {
450
        const bool dominantGradient = XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD * jg < fk;
206
        const bool dominantGradient = XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD * jg < fk;
451
        if (ker.f != ker.g && ker.f != ker.j)
207
        if (ker.f != ker.g && ker.f != ker.j)
Line 471... Line 227...
471
    uint32_t
227
    uint32_t
472
    /**/a,  b,  c,
228
    /**/a,  b,  c,
473
    /**/d,  e,  f,
229
    /**/d,  e,  f,
474
    /**/g,  h,  i;
230
    /**/g,  h,  i;
475
};
231
};
476
 
232
/*
477
#define DEF_GETTER(x) template <RotationDegree rotDeg> uint32_t inline get_##x(const Kernel_3x3& ker) { return ker.x; }
233
#define DEF_GETTER(x) template <RotationDegree rotDeg> uint32_t inline get_##x(const Kernel_3x3& ker) { return ker.x; }
478
//we cannot and NEED NOT write "ker.##x" since ## concatenates preprocessor tokens but "." is not a token
234
//we cannot and NEED NOT write "ker.##x" since ## concatenates preprocessor tokens but "." is not a token
479
DEF_GETTER(a) DEF_GETTER(b) DEF_GETTER(c)
235
DEF_GETTER(a) DEF_GETTER(b) DEF_GETTER(c)
480
DEF_GETTER(d) DEF_GETTER(e) DEF_GETTER(f)
236
DEF_GETTER(d) DEF_GETTER(e) DEF_GETTER(f)
481
DEF_GETTER(g) DEF_GETTER(h) DEF_GETTER(i)
237
DEF_GETTER(g) DEF_GETTER(h) DEF_GETTER(i)
Line 496... Line 252...
496
#define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_270>(const Kernel_3x3& ker) { return ker.y; }
252
#define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_270>(const Kernel_3x3& ker) { return ker.y; }
497
DEF_GETTER(a, c) DEF_GETTER(b, f) DEF_GETTER(c, i)
253
DEF_GETTER(a, c) DEF_GETTER(b, f) DEF_GETTER(c, i)
498
DEF_GETTER(d, b) DEF_GETTER(e, e) DEF_GETTER(f, h)
254
DEF_GETTER(d, b) DEF_GETTER(e, e) DEF_GETTER(f, h)
499
DEF_GETTER(g, a) DEF_GETTER(h, d) DEF_GETTER(i, g)
255
DEF_GETTER(g, a) DEF_GETTER(h, d) DEF_GETTER(i, g)
500
#undef DEF_GETTER
256
#undef DEF_GETTER
-
 
257
*/
-
 
258
 
-
 
259
template <RotationDegree rotDeg> uint32_t inline get_a (const Kernel_3x3& ker) { return ker.a; }
-
 
260
template <RotationDegree rotDeg> uint32_t inline get_b (const Kernel_3x3& ker) { return ker.b; }
-
 
261
template <RotationDegree rotDeg> uint32_t inline get_c (const Kernel_3x3& ker) { return ker.c; }
-
 
262
template <RotationDegree rotDeg> uint32_t inline get_d (const Kernel_3x3& ker) { return ker.d; }
-
 
263
template <RotationDegree rotDeg> uint32_t inline get_e (const Kernel_3x3& ker) { return ker.e; }
-
 
264
template <RotationDegree rotDeg> uint32_t inline get_f (const Kernel_3x3& ker) { return ker.f; }
-
 
265
template <RotationDegree rotDeg> uint32_t inline get_g (const Kernel_3x3& ker) { return ker.g; }
-
 
266
template <RotationDegree rotDeg> uint32_t inline get_h (const Kernel_3x3& ker) { return ker.h; }
-
 
267
template <RotationDegree rotDeg> uint32_t inline get_i (const Kernel_3x3& ker) { return ker.i; }
-
 
268
 
-
 
269
template <> inline uint32_t get_a<ROT_90>(const Kernel_3x3& ker) { return ker.g; }
-
 
270
template <> inline uint32_t get_b<ROT_90>(const Kernel_3x3& ker) { return ker.d; }
-
 
271
template <> inline uint32_t get_c<ROT_90>(const Kernel_3x3& ker) { return ker.a; }
-
 
272
template <> inline uint32_t get_d<ROT_90>(const Kernel_3x3& ker) { return ker.h; }
-
 
273
template <> inline uint32_t get_e<ROT_90>(const Kernel_3x3& ker) { return ker.e; }
-
 
274
template <> inline uint32_t get_f<ROT_90>(const Kernel_3x3& ker) { return ker.b; }
-
 
275
template <> inline uint32_t get_g<ROT_90>(const Kernel_3x3& ker) { return ker.i; }
-
 
276
template <> inline uint32_t get_h<ROT_90>(const Kernel_3x3& ker) { return ker.f; }
-
 
277
template <> inline uint32_t get_i<ROT_90>(const Kernel_3x3& ker) { return ker.c; }
-
 
278
 
-
 
279
template <> inline uint32_t get_a<ROT_180>(const Kernel_3x3& ker) { return ker.i; }
-
 
280
template <> inline uint32_t get_b<ROT_180>(const Kernel_3x3& ker) { return ker.h; }
-
 
281
template <> inline uint32_t get_c<ROT_180>(const Kernel_3x3& ker) { return ker.g; }
-
 
282
template <> inline uint32_t get_d<ROT_180>(const Kernel_3x3& ker) { return ker.f; }
-
 
283
template <> inline uint32_t get_e<ROT_180>(const Kernel_3x3& ker) { return ker.e; }
-
 
284
template <> inline uint32_t get_f<ROT_180>(const Kernel_3x3& ker) { return ker.d; }
-
 
285
template <> inline uint32_t get_g<ROT_180>(const Kernel_3x3& ker) { return ker.c; }
-
 
286
template <> inline uint32_t get_h<ROT_180>(const Kernel_3x3& ker) { return ker.b; }
-
 
287
template <> inline uint32_t get_i<ROT_180>(const Kernel_3x3& ker) { return ker.a; }
501
 
288
 
-
 
289
template <> inline uint32_t get_a<ROT_270>(const Kernel_3x3& ker) { return ker.c; }
-
 
290
template <> inline uint32_t get_b<ROT_270>(const Kernel_3x3& ker) { return ker.f; }
-
 
291
template <> inline uint32_t get_c<ROT_270>(const Kernel_3x3& ker) { return ker.i; }
-
 
292
template <> inline uint32_t get_d<ROT_270>(const Kernel_3x3& ker) { return ker.b; }
-
 
293
template <> inline uint32_t get_e<ROT_270>(const Kernel_3x3& ker) { return ker.e; }
-
 
294
template <> inline uint32_t get_f<ROT_270>(const Kernel_3x3& ker) { return ker.h; }
-
 
295
template <> inline uint32_t get_g<ROT_270>(const Kernel_3x3& ker) { return ker.a; }
-
 
296
template <> inline uint32_t get_h<ROT_270>(const Kernel_3x3& ker) { return ker.d; }
-
 
297
template <> inline uint32_t get_i<ROT_270>(const Kernel_3x3& ker) { return ker.g; }
502
 
298
 
503
//compress four blend types into a single byte
299
//compress four blend types into a single byte
504
inline BlendType getTopL   (unsigned char b) { return static_cast<BlendType>(0x3 & b); }
300
inline BlendType getTopL   (unsigned char b) { return (BlendType)(0x3 & b); }
505
inline BlendType getTopR   (unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 2)); }
301
inline BlendType getTopR   (unsigned char b) { return (BlendType)(0x3 & (b >> 2)); }
506
inline BlendType getBottomR(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 4)); }
302
inline BlendType getBottomR(unsigned char b) { return (BlendType)(0x3 & (b >> 4)); }
507
inline BlendType getBottomL(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 6)); }
303
inline BlendType getBottomL(unsigned char b) { return (BlendType)(0x3 & (b >> 6)); }
508
 
304
 
509
inline void setTopL   (unsigned char& b, BlendType bt) { b |= bt; } //buffer is assumed to be initialized before preprocessing!
305
inline void setTopL   (unsigned char& b, BlendType bt) { b |= bt; } //buffer is assumed to be initialized before preprocessing!
510
inline void setTopR   (unsigned char& b, BlendType bt) { b |= (bt << 2); }
306
inline void setTopR   (unsigned char& b, BlendType bt) { b |= (bt << 2); }
511
inline void setBottomR(unsigned char& b, BlendType bt) { b |= (bt << 4); }
307
inline void setBottomR(unsigned char& b, BlendType bt) { b |= (bt << 4); }
512
inline void setBottomL(unsigned char& b, BlendType bt) { b |= (bt << 6); }
308
inline void setBottomL(unsigned char& b, BlendType bt) { b |= (bt << 6); }
513
 
-
 
514
inline bool blendingNeeded(unsigned char b) { return b != 0; }
-
 
515
 
309
 
516
template <RotationDegree rotDeg> inline
310
template <RotationDegree rotDeg> inline
517
unsigned char rotateBlendInfo(unsigned char b) { return b; }
311
unsigned char rotateBlendInfo (unsigned char b) { return b; }
518
template <> inline unsigned char rotateBlendInfo<ROT_90 >(unsigned char b) { return ((b << 2) | (b >> 6)) & 0xff; }
312
template <> inline unsigned char rotateBlendInfo<ROT_90 >(unsigned char b) { return ((b << 2) | (b >> 6)) & 0xff; }
519
template <> inline unsigned char rotateBlendInfo<ROT_180>(unsigned char b) { return ((b << 4) | (b >> 4)) & 0xff; }
313
template <> inline unsigned char rotateBlendInfo<ROT_180>(unsigned char b) { return ((b << 4) | (b >> 4)) & 0xff; }
520
template <> inline unsigned char rotateBlendInfo<ROT_270>(unsigned char b) { return ((b << 6) | (b >> 2)) & 0xff; }
314
template <> inline unsigned char rotateBlendInfo<ROT_270>(unsigned char b) { return ((b << 6) | (b >> 2)) & 0xff; }
521
 
315
 
522
 
316
 
Line 528... Line 322...
528
| D | E | F | //input pixel is at position E
322
| D | E | F | //input pixel is at position E
529
----|---|---|
323
----|---|---|
530
| G | H | I |
324
| G | H | I |
531
-------------
325
-------------
532
*/
326
*/
533
template <class Scaler, class ColorDistance, RotationDegree rotDeg>
327
template <class Scaler, RotationDegree rotDeg>
534
FORCE_INLINE //perf: quite worth it!
-
 
535
void blendPixel(const Kernel_3x3& ker,
-
 
536
                uint32_t* target, int trgWidth,
-
 
537
                unsigned char blendInfo) //result of preprocessing all four corners of pixel "e"
328
FORCE_INLINE void blendPixel(const Kernel_3x3& ker, uint32_t *target, int trgWidth, unsigned char blendInfo, alphagrad_func alphagrad, dist_func dist) //result of preprocessing all four corners of pixel "e"
538
{
329
{
539
#define a get_a<rotDeg>(ker)
330
#define a get_a<rotDeg>(ker)
540
#define b get_b<rotDeg>(ker)
331
#define b get_b<rotDeg>(ker)
541
#define c get_c<rotDeg>(ker)
332
#define c get_c<rotDeg>(ker)
542
#define d get_d<rotDeg>(ker)
333
#define d get_d<rotDeg>(ker)
Line 548... Line 339...
548
 
339
 
549
    const unsigned char blend = rotateBlendInfo<rotDeg>(blendInfo);
340
    const unsigned char blend = rotateBlendInfo<rotDeg>(blendInfo);
550
 
341
 
551
    if (getBottomR(blend) >= BLEND_NORMAL)
342
    if (getBottomR(blend) >= BLEND_NORMAL)
552
    {
343
    {
553
        auto eq   = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, XBRZ_CFG_LUMINANCE_WEIGHT) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE; };
-
 
554
        auto dist = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, XBRZ_CFG_LUMINANCE_WEIGHT); };
-
 
555
 
-
 
556
        const bool doLineBlend = [&]() -> bool
344
        bool doLineBlend;
557
        {
-
 
558
            if (getBottomR(blend) >= BLEND_DOMINANT)
-
 
559
                return true;
-
 
560
 
-
 
561
            //make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
-
 
562
            if (getTopR(blend) != BLEND_NONE && !eq(e, g)) //but support double-blending for 90° corners
-
 
563
                return false;
-
 
564
            if (getBottomL(blend) != BLEND_NONE && !eq(e, c))
-
 
565
                return false;
-
 
566
 
-
 
567
            //no full blending for L-shapes; blend corner only (handles "mario mushroom eyes")
-
 
568
            if (!eq(e, i) && eq(g, h) && eq(h, i) && eq(i, f) && eq(f, c))
-
 
569
                return false;
-
 
570
 
345
 
-
 
346
        if (getBottomR(blend) >= BLEND_DOMINANT)
571
            return true;
347
            doLineBlend = true;
-
 
348
        else if (getTopR(blend) != BLEND_NONE && (dist (e, g) >= XBRZ_CFG_EQUAL_COLOR_TOLERANCE)) //but support double-blending for 90° corners
-
 
349
            doLineBlend = false; // make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
-
 
350
        else if (getBottomL(blend) != BLEND_NONE && (dist (e, c) >= XBRZ_CFG_EQUAL_COLOR_TOLERANCE))
-
 
351
            doLineBlend = false; // make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
-
 
352
        else if ((dist (e, i) >= XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
-
 
353
            && (dist (g, h) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
-
 
354
            && (dist (h, i) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
-
 
355
            && (dist (i, f) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
-
 
356
            && (dist (f, c) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE))
-
 
357
            doLineBlend = false; // no full blending for L-shapes; blend corner only (handles "mario mushroom eyes")
-
 
358
                else
572
        }();
359
            doLineBlend = true;
573
 
360
 
574
        const uint32_t px = dist(e, f) <= dist(e, h) ? f : h; //choose most similar color
361
        const uint32_t px = (dist (e, f) <= dist (e, h) ? f : h); //choose most similar color
575
 
362
 
576
        OutputMatrix<Scaler::scale, rotDeg> out(target, trgWidth);
363
        OutputMatrix<Scaler::scale, rotDeg> out(target, trgWidth);
577
 
364
 
578
        if (doLineBlend)
365
        if (doLineBlend)
579
        {
366
        {
580
            const double fg = dist(f, g); //test sample: 70% of values max(fg, hc) / min(fg, hc) are between 1.1 and 3.7 with median being 1.9
367
            const double fg = dist (f, g); //test sample: 70% of values max(fg, hc) / min(fg, hc) are between 1.1 and 3.7 with median being 1.9
581
            const double hc = dist(h, c); //
368
            const double hc = dist (h, c); //
582
 
369
 
583
            const bool haveShallowLine = XBRZ_CFG_STEEP_DIRECTION_THRESHOLD * fg <= hc && e != g && d != g;
370
            const bool haveShallowLine = XBRZ_CFG_STEEP_DIRECTION_THRESHOLD * fg <= hc && e != g && d != g;
584
            const bool haveSteepLine   = XBRZ_CFG_STEEP_DIRECTION_THRESHOLD * hc <= fg && e != c && b != c;
371
            const bool haveSteepLine   = XBRZ_CFG_STEEP_DIRECTION_THRESHOLD * hc <= fg && e != c && b != c;
585
 
372
 
586
            if (haveShallowLine)
373
            if (haveShallowLine)
587
            {
374
            {
588
                if (haveSteepLine)
375
                if (haveSteepLine)
589
                    Scaler::blendLineSteepAndShallow(px, out);
376
                    Scaler::blendLineSteepAndShallow(px, out, alphagrad);
590
                else
377
                else
591
                    Scaler::blendLineShallow(px, out);
378
                    Scaler::blendLineShallow(px, out, alphagrad);
592
            }
379
            }
593
            else
380
            else
594
            {
381
            {
595
                if (haveSteepLine)
382
                if (haveSteepLine)
596
                    Scaler::blendLineSteep(px, out);
383
                    Scaler::blendLineSteep(px, out, alphagrad);
597
                else
384
                else
598
                    Scaler::blendLineDiagonal(px, out);
385
                    Scaler::blendLineDiagonal(px, out, alphagrad);
599
            }
386
            }
600
        }
387
        }
601
        else
388
        else
602
            Scaler::blendCorner(px, out);
389
            Scaler::blendCorner(px, out, alphagrad);
603
    }
390
    }
604
 
391
 
605
#undef a
392
#undef a
606
#undef b
393
#undef b
607
#undef c
394
#undef c
Line 612... Line 399...
612
#undef h
399
#undef h
613
#undef i
400
#undef i
614
}
401
}
615
 
402
 
616
 
403
 
617
template <class Scaler, class ColorDistance> //scaler policy: see "Scaler2x" reference implementation
404
template <class Scaler> //scaler policy: see "Scaler2x" reference implementation
618
void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight, int yFirst, int yLast)
405
void scaleImage(const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight, int yFirst, int yLast, alphagrad_func alphagrad, dist_func dist)
619
{
406
{
620
    yFirst = std::max(yFirst, 0);
407
    yFirst = MAX (yFirst, 0);
621
    yLast  = std::min(yLast, srcHeight);
408
    yLast  = MIN (yLast, srcHeight);
622
    if (yFirst >= yLast || srcWidth <= 0)
409
    if (yFirst >= yLast || srcWidth <= 0)
623
        return;
410
        return;
624
 
411
 
625
    const int trgWidth = srcWidth * Scaler::scale;
412
    const int trgWidth = srcWidth * Scaler::scale;
626
 
413
 
627
    //"use" space at the end of the image as temporary buffer for "on the fly preprocessing": we even could use larger area of
414
    //"use" space at the end of the image as temporary buffer for "on the fly preprocessing": we even could use larger area of
628
    //"sizeof(uint32_t) * srcWidth * (yLast - yFirst)" bytes without risk of accidental overwriting before accessing
415
    //"sizeof(uint32_t) * srcWidth * (yLast - yFirst)" bytes without risk of accidental overwriting before accessing
629
    const int bufferSize = srcWidth;
416
    const int bufferSize = srcWidth;
630
    unsigned char* preProcBuffer = reinterpret_cast<unsigned char*>(trg + yLast * Scaler::scale * trgWidth) - bufferSize;
417
    unsigned char* preProcBuffer = reinterpret_cast<unsigned char*>(trg + yLast * Scaler::scale * trgWidth) - bufferSize;
631
    std::fill(preProcBuffer, preProcBuffer + bufferSize, '\0');
418
    memset (preProcBuffer, 0, bufferSize);
632
    static_assert(BLEND_NONE == 0, "");
419
    static_assert(BLEND_NONE == 0, "");
633
 
420
 
634
    //initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending
421
    //initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending
635
    //this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition!
422
    //this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition!
636
    if (yFirst > 0)
423
    if (yFirst > 0)
637
    {
424
    {
638
        const int y = yFirst - 1;
425
        const int y = yFirst - 1;
639
 
426
 
640
        const uint32_t* s_m1 = src + srcWidth * std::max(y - 1, 0);
427
        const uint32_t* s_m1 = src + srcWidth * MAX (y - 1, 0);
641
        const uint32_t* s_0  = src + srcWidth * y; //center line
428
        const uint32_t* s_0  = src + srcWidth * y; //center line
642
        const uint32_t* s_p1 = src + srcWidth * std::min(y + 1, srcHeight - 1);
429
        const uint32_t* s_p1 = src + srcWidth * MIN (y + 1, srcHeight - 1);
643
        const uint32_t* s_p2 = src + srcWidth * std::min(y + 2, srcHeight - 1);
430
        const uint32_t* s_p2 = src + srcWidth * MIN (y + 2, srcHeight - 1);
644
 
431
 
645
        for (int x = 0; x < srcWidth; ++x)
432
        for (int x = 0; x < srcWidth; ++x)
646
        {
433
        {
647
            const int x_m1 = std::max(x - 1, 0);
434
            const int x_m1 = MAX (x - 1, 0);
648
            const int x_p1 = std::min(x + 1, srcWidth - 1);
435
            const int x_p1 = MIN (x + 1, srcWidth - 1);
649
            const int x_p2 = std::min(x + 2, srcWidth - 1);
436
            const int x_p2 = MIN (x + 2, srcWidth - 1);
650
 
437
 
651
            Kernel_4x4 ker = {}; //perf: initialization is negligible
438
            Kernel_4x4 ker = {}; //perf: initialization is negligible
652
            ker.a = s_m1[x_m1]; //read sequentially from memory as far as possible
439
            ker.a = s_m1[x_m1]; //read sequentially from memory as far as possible
653
            ker.b = s_m1[x];
440
            ker.b = s_m1[x];
654
            ker.c = s_m1[x_p1];
441
            ker.c = s_m1[x_p1];
Line 667... Line 454...
667
            ker.m = s_p2[x_m1];
454
            ker.m = s_p2[x_m1];
668
            ker.n = s_p2[x];
455
            ker.n = s_p2[x];
669
            ker.o = s_p2[x_p1];
456
            ker.o = s_p2[x_p1];
670
            ker.p = s_p2[x_p2];
457
            ker.p = s_p2[x_p2];
671
 
458
 
672
            const BlendResult res = preProcessCorners<ColorDistance>(ker);
459
            const BlendResult res = preProcessCorners (ker, dist);
673
            /*
460
            /*
674
            preprocessing blend result:
461
            preprocessing blend result:
675
            ---------
462
            ---------
676
            | F | G |   //evalute corner between F, G, J, K
463
            | F | G |   //evalute corner between F, G, J, K
677
            ----|---|   //input pixel is at position F
464
            ----|---|   //input pixel is at position F
Line 686... Line 473...
686
    }
473
    }
687
    //------------------------------------------------------------------------------------
474
    //------------------------------------------------------------------------------------
688
 
475
 
689
    for (int y = yFirst; y < yLast; ++y)
476
    for (int y = yFirst; y < yLast; ++y)
690
    {
477
    {
691
        uint32_t* out = trg + Scaler::scale * y * trgWidth; //consider MT "striped" access
478
        uint32_t *out = trg + Scaler::scale * y * trgWidth; //consider MT "striped" access
692
 
479
 
693
        const uint32_t* s_m1 = src + srcWidth * std::max(y - 1, 0);
480
        const uint32_t* s_m1 = src + srcWidth * MAX (y - 1, 0);
694
        const uint32_t* s_0  = src + srcWidth * y; //center line
481
        const uint32_t* s_0  = src + srcWidth * y; //center line
695
        const uint32_t* s_p1 = src + srcWidth * std::min(y + 1, srcHeight - 1);
482
        const uint32_t* s_p1 = src + srcWidth * MIN (y + 1, srcHeight - 1);
696
        const uint32_t* s_p2 = src + srcWidth * std::min(y + 2, srcHeight - 1);
483
        const uint32_t* s_p2 = src + srcWidth * MIN (y + 2, srcHeight - 1);
697
 
484
 
698
        unsigned char blend_xy1 = 0; //corner blending for current (x, y + 1) position
485
        unsigned char blend_xy1 = 0; //corner blending for current (x, y + 1) position
699
 
486
 
700
        for (int x = 0; x < srcWidth; ++x, out += Scaler::scale)
487
        for (int x = 0; x < srcWidth; ++x, out += Scaler::scale)
701
        {
488
        {
702
            //all those bounds checks have only insignificant impact on performance!
489
            //all those bounds checks have only insignificant impact on performance!
703
            const int x_m1 = std::max(x - 1, 0); //perf: prefer array indexing to additional pointers!
490
            const int x_m1 = MAX (x - 1, 0); //perf: prefer array indexing to additional pointers!
704
            const int x_p1 = std::min(x + 1, srcWidth - 1);
491
            const int x_p1 = MIN (x + 1, srcWidth - 1);
705
            const int x_p2 = std::min(x + 2, srcWidth - 1);
492
            const int x_p2 = MIN (x + 2, srcWidth - 1);
706
 
493
 
707
            Kernel_4x4 ker4 = {}; //perf: initialization is negligible
494
            Kernel_4x4 ker4 = {}; //perf: initialization is negligible
708
 
495
 
709
            ker4.a = s_m1[x_m1]; //read sequentially from memory as far as possible
496
            ker4.a = s_m1[x_m1]; //read sequentially from memory as far as possible
710
            ker4.b = s_m1[x];
497
            ker4.b = s_m1[x];
Line 727... Line 514...
727
            ker4.p = s_p2[x_p2];
514
            ker4.p = s_p2[x_p2];
728
 
515
 
729
            //evaluate the four corners on bottom-right of current pixel
516
            //evaluate the four corners on bottom-right of current pixel
730
            unsigned char blend_xy = 0; //for current (x, y) position
517
            unsigned char blend_xy = 0; //for current (x, y) position
731
            {
518
            {
732
                const BlendResult res = preProcessCorners<ColorDistance>(ker4);
519
                const BlendResult res = preProcessCorners (ker4, dist);
733
                /*
520
                /*
734
                preprocessing blend result:
521
                preprocessing blend result:
735
                ---------
522
                ---------
736
                | F | G |   //evalute corner between F, G, J, K
523
                | F | G |   //evalute corner between F, G, J, K
737
                ----|---|   //current input pixel is at position F
524
                ----|---|   //current input pixel is at position F
Line 750... Line 537...
750
                if (x + 1 < bufferSize) //set 3rd known corner for (x + 1, y)
537
                if (x + 1 < bufferSize) //set 3rd known corner for (x + 1, y)
751
                    setBottomL(preProcBuffer[x + 1], res.blend_g);
538
                    setBottomL(preProcBuffer[x + 1], res.blend_g);
752
            }
539
            }
753
 
540
 
754
            //fill block of size scale * scale with the given color
541
            //fill block of size scale * scale with the given color
-
 
542
                        {
-
 
543
                                uint32_t *blk = out;
-
 
544
                            for (int _blk_y = 0; _blk_y < Scaler::scale; ++_blk_y, blk = (uint32_t *) BYTE_ADVANCE (blk, trgWidth * sizeof (uint32_t)))
755
            xbrz::fillBlock(out, trgWidth * sizeof(uint32_t), ker4.f, Scaler::scale, Scaler::scale);
545
                                for (int _blk_x = 0; _blk_x < Scaler::scale; ++_blk_x)
-
 
546
                                    blk[_blk_x] = ker4.f;
-
 
547
                        }
756
            //place *after* preprocessing step, to not overwrite the results while processing the the last pixel!
548
            //place *after* preprocessing step, to not overwrite the results while processing the the last pixel!
757
 
549
 
758
            //blend four corners of current pixel
550
            //blend four corners of current pixel
759
            if (blendingNeeded(blend_xy)) //good 5% perf-improvement
551
            if (blend_xy != 0) //good 5% perf-improvement
760
            {
552
            {
761
                Kernel_3x3 ker3 = {}; //perf: initialization is negligible
553
                Kernel_3x3 ker3 = {}; //perf: initialization is negligible
762
 
554
 
763
                ker3.a = ker4.a;
555
                ker3.a = ker4.a;
764
                ker3.b = ker4.b;
556
                ker3.b = ker4.b;
Line 770... Line 562...
770
 
562
 
771
                ker3.g = ker4.i;
563
                ker3.g = ker4.i;
772
                ker3.h = ker4.j;
564
                ker3.h = ker4.j;
773
                ker3.i = ker4.k;
565
                ker3.i = ker4.k;
774
 
566
 
775
                blendPixel<Scaler, ColorDistance, ROT_0  >(ker3, out, trgWidth, blend_xy);
567
                blendPixel<Scaler, ROT_0  >(ker3, out, trgWidth, blend_xy, alphagrad, dist);
776
                blendPixel<Scaler, ColorDistance, ROT_90 >(ker3, out, trgWidth, blend_xy);
568
                blendPixel<Scaler, ROT_90 >(ker3, out, trgWidth, blend_xy, alphagrad, dist);
777
                blendPixel<Scaler, ColorDistance, ROT_180>(ker3, out, trgWidth, blend_xy);
569
                blendPixel<Scaler, ROT_180>(ker3, out, trgWidth, blend_xy, alphagrad, dist);
778
                blendPixel<Scaler, ColorDistance, ROT_270>(ker3, out, trgWidth, blend_xy);
570
                blendPixel<Scaler, ROT_270>(ker3, out, trgWidth, blend_xy, alphagrad, dist);
779
            }
571
            }
780
        }
572
        }
781
    }
573
    }
782
}
574
}
783
 
575
 
784
 
576
 
785
//------------------------------------------------------------------------------------
577
//------------------------------------------------------------------------------------
786
template <class ColorGradient> struct Scaler2x : public ColorGradient
578
struct Scaler2x
787
{
579
{
788
    static const int scale = 2;
580
    static const int scale = 2;
789
 
-
 
790
    template <unsigned int M, unsigned int N> //bring template function into scope for GCC
-
 
791
    static void alphaGrad(uint32_t& pixBack, uint32_t pixFront) { ColorGradient::template alphaGrad<M, N>(pixBack, pixFront); }
-
 
792
 
581
 
793
 
582
 
794
    template <class OutputMatrix>
583
    template <class OutputMatrix>
795
    static void blendLineShallow(uint32_t col, OutputMatrix& out)
584
    static void blendLineShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
796
    {
585
    {
797
        alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
586
        alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
798
        alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
587
        alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
799
    }
588
    }
800
 
589
 
801
    template <class OutputMatrix>
590
    template <class OutputMatrix>
802
    static void blendLineSteep(uint32_t col, OutputMatrix& out)
591
    static void blendLineSteep(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
803
    {
592
    {
804
        alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col);
593
        alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
805
        alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);
594
        alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
806
    }
595
    }
807
 
596
 
808
    template <class OutputMatrix>
597
    template <class OutputMatrix>
809
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
598
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
810
    {
599
    {
811
        alphaGrad<1, 4>(out.template ref<1, 0>(), col);
600
        alphagrad (&(out.template ref<1, 0>()), col, 1, 4);
812
        alphaGrad<1, 4>(out.template ref<0, 1>(), col);
601
        alphagrad (&(out.template ref<0, 1>()), col, 1, 4);
813
        alphaGrad<5, 6>(out.template ref<1, 1>(), col); //[!] fixes 7/8 used in xBR
602
        alphagrad (&(out.template ref<1, 1>()), col, 5, 6); //[!] fixes 7/8 used in xBR
814
    }
603
    }
815
 
604
 
816
    template <class OutputMatrix>
605
    template <class OutputMatrix>
817
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
606
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
818
    {
607
    {
819
        alphaGrad<1, 2>(out.template ref<1, 1>(), col);
608
        alphagrad (&(out.template ref<1, 1>()), col, 1, 2);
820
    }
609
    }
821
 
610
 
822
    template <class OutputMatrix>
611
    template <class OutputMatrix>
823
    static void blendCorner(uint32_t col, OutputMatrix& out)
612
    static void blendCorner(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
824
    {
613
    {
825
        //model a round corner
614
        //model a round corner
826
        alphaGrad<21, 100>(out.template ref<1, 1>(), col); //exact: 1 - pi/4 = 0.2146018366
615
        alphagrad (&(out.template ref<1, 1>()), col, 21, 100); //exact: 1 - pi/4 = 0.2146018366
827
    }
616
    }
828
};
617
};
829
 
618
 
830
 
619
 
831
template <class ColorGradient> struct Scaler3x : public ColorGradient
620
struct Scaler3x
832
{
621
{
833
    static const int scale = 3;
622
    static const int scale = 3;
834
 
-
 
835
    template <unsigned int M, unsigned int N> //bring template function into scope for GCC
-
 
836
    static void alphaGrad(uint32_t& pixBack, uint32_t pixFront) { ColorGradient::template alphaGrad<M, N>(pixBack, pixFront); }
-
 
837
 
623
 
838
 
624
 
839
    template <class OutputMatrix>
625
    template <class OutputMatrix>
840
    static void blendLineShallow(uint32_t col, OutputMatrix& out)
626
    static void blendLineShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
841
    {
627
    {
842
        alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
628
        alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
843
        alphaGrad<1, 4>(out.template ref<scale - 2, 2>(), col);
629
        alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
844
 
-
 
845
        alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
630
        alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
846
        out.template ref<scale - 1, 2>() = col;
631
        out.template ref<scale - 1, 2>() = col;
847
    }
632
    }
848
 
633
 
849
    template <class OutputMatrix>
634
    template <class OutputMatrix>
850
    static void blendLineSteep(uint32_t col, OutputMatrix& out)
635
    static void blendLineSteep(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
851
    {
636
    {
852
        alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col);
637
        alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
853
        alphaGrad<1, 4>(out.template ref<2, scale - 2>(), col);
638
        alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
854
 
-
 
855
        alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);
639
        alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
856
        out.template ref<2, scale - 1>() = col;
640
        out.template ref<2, scale - 1>() = col;
857
    }
641
    }
858
 
642
 
859
    template <class OutputMatrix>
643
    template <class OutputMatrix>
860
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
644
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
861
    {
645
    {
862
        alphaGrad<1, 4>(out.template ref<2, 0>(), col);
646
        alphagrad (&(out.template ref<2, 0>()), col, 1, 4);
863
        alphaGrad<1, 4>(out.template ref<0, 2>(), col);
647
        alphagrad (&(out.template ref<0, 2>()), col, 1, 4);
864
        alphaGrad<3, 4>(out.template ref<2, 1>(), col);
648
        alphagrad (&(out.template ref<2, 1>()), col, 3, 4);
865
        alphaGrad<3, 4>(out.template ref<1, 2>(), col);
649
        alphagrad (&(out.template ref<1, 2>()), col, 3, 4);
866
        out.template ref<2, 2>() = col;
650
        out.template ref<2, 2>() = col;
867
    }
651
    }
868
 
652
 
869
    template <class OutputMatrix>
653
    template <class OutputMatrix>
870
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
654
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
871
    {
655
    {
872
        alphaGrad<1, 8>(out.template ref<1, 2>(), col); //conflict with other rotations for this odd scale
656
        alphagrad (&(out.template ref<1, 2>()), col, 1, 8); //conflict with other rotations for this odd scale
873
        alphaGrad<1, 8>(out.template ref<2, 1>(), col);
657
        alphagrad (&(out.template ref<2, 1>()), col, 1, 8);
874
        alphaGrad<7, 8>(out.template ref<2, 2>(), col); //
658
        alphagrad (&(out.template ref<2, 2>()), col, 7, 8); //
875
    }
659
    }
876
 
660
 
877
    template <class OutputMatrix>
661
    template <class OutputMatrix>
878
    static void blendCorner(uint32_t col, OutputMatrix& out)
662
    static void blendCorner(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
879
    {
663
    {
880
        //model a round corner
664
        //model a round corner
881
        alphaGrad<45, 100>(out.template ref<2, 2>(), col); //exact: 0.4545939598
665
        alphagrad (&(out.template ref<2, 2>()), col, 45, 100); //exact: 0.4545939598
882
        //alphaGrad<7, 256>(out.template ref<2, 1>(), col); //0.02826017254 -> negligible + avoid conflicts with other rotations for this odd scale
666
        //alphagrad (&(out.template ref<2, 1>()), col, 7, 256); //0.02826017254 -> negligible + avoid conflicts with other rotations for this odd scale
883
        //alphaGrad<7, 256>(out.template ref<1, 2>(), col); //0.02826017254
667
        //alphagrad (&(out.template ref<1, 2>()), col, 7, 256); //0.02826017254
884
    }
668
    }
885
};
669
};
886
 
670
 
887
 
671
 
888
template <class ColorGradient> struct Scaler4x : public ColorGradient
672
struct Scaler4x
889
{
673
{
890
    static const int scale = 4;
674
    static const int scale = 4;
891
 
-
 
892
    template <unsigned int M, unsigned int N> //bring template function into scope for GCC
-
 
893
    static void alphaGrad(uint32_t& pixBack, uint32_t pixFront) { ColorGradient::template alphaGrad<M, N>(pixBack, pixFront); }
-
 
894
 
675
 
895
 
676
 
896
    template <class OutputMatrix>
677
    template <class OutputMatrix>
897
    static void blendLineShallow(uint32_t col, OutputMatrix& out)
678
    static void blendLineShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
898
    {
679
    {
899
        alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
680
        alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
900
        alphaGrad<1, 4>(out.template ref<scale - 2, 2>(), col);
681
        alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
901
 
-
 
902
        alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
682
        alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
903
        alphaGrad<3, 4>(out.template ref<scale - 2, 3>(), col);
683
        alphagrad (&(out.template ref<scale - 2, 3>()), col, 3, 4);
904
 
684
 
905
        out.template ref<scale - 1, 2>() = col;
685
        out.template ref<scale - 1, 2>() = col;
906
        out.template ref<scale - 1, 3>() = col;
686
        out.template ref<scale - 1, 3>() = col;
907
    }
687
    }
908
 
688
 
909
    template <class OutputMatrix>
689
    template <class OutputMatrix>
910
    static void blendLineSteep(uint32_t col, OutputMatrix& out)
690
    static void blendLineSteep(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
911
    {
691
    {
912
        alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col);
692
        alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
913
        alphaGrad<1, 4>(out.template ref<2, scale - 2>(), col);
693
        alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
914
 
-
 
915
        alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);
694
        alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
916
        alphaGrad<3, 4>(out.template ref<3, scale - 2>(), col);
695
        alphagrad (&(out.template ref<3, scale - 2>()), col, 3, 4);
917
 
696
 
918
        out.template ref<2, scale - 1>() = col;
697
        out.template ref<2, scale - 1>() = col;
919
        out.template ref<3, scale - 1>() = col;
698
        out.template ref<3, scale - 1>() = col;
920
    }
699
    }
921
 
700
 
922
    template <class OutputMatrix>
701
    template <class OutputMatrix>
923
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
702
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
924
    {
703
    {
925
        alphaGrad<3, 4>(out.template ref<3, 1>(), col);
704
        alphagrad (&(out.template ref<3, 1>()), col, 3, 4);
926
        alphaGrad<3, 4>(out.template ref<1, 3>(), col);
705
        alphagrad (&(out.template ref<1, 3>()), col, 3, 4);
927
        alphaGrad<1, 4>(out.template ref<3, 0>(), col);
706
        alphagrad (&(out.template ref<3, 0>()), col, 1, 4);
928
        alphaGrad<1, 4>(out.template ref<0, 3>(), col);
707
        alphagrad (&(out.template ref<0, 3>()), col, 1, 4);
929
 
-
 
930
        alphaGrad<1, 3>(out.template ref<2, 2>(), col); //[!] fixes 1/4 used in xBR
708
        alphagrad (&(out.template ref<2, 2>()), col, 1, 3); //[!] fixes 1/4 used in xBR
931
 
709
 
932
        out.template ref<3, 3>() = col;
710
        out.template ref<3, 3>() = col;
933
        out.template ref<3, 2>() = col;
711
        out.template ref<3, 2>() = col;
934
        out.template ref<2, 3>() = col;
712
        out.template ref<2, 3>() = col;
935
    }
713
    }
936
 
714
 
937
    template <class OutputMatrix>
715
    template <class OutputMatrix>
938
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
716
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
939
    {
717
    {
940
        alphaGrad<1, 2>(out.template ref<scale - 1, scale / 2    >(), col);
718
        alphagrad (&(out.template ref<scale - 1, scale / 2    >()), col, 1, 2);
941
        alphaGrad<1, 2>(out.template ref<scale - 2, scale / 2 + 1>(), col);
719
        alphagrad (&(out.template ref<scale - 2, scale / 2 + 1>()), col, 1, 2);
-
 
720
 
942
        out.template ref<scale - 1, scale - 1>() = col;
721
        out.template ref<scale - 1, scale - 1>() = col;
943
    }
722
    }
944
 
723
 
945
    template <class OutputMatrix>
724
    template <class OutputMatrix>
946
    static void blendCorner(uint32_t col, OutputMatrix& out)
725
    static void blendCorner(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
947
    {
726
    {
948
        //model a round corner
727
        //model a round corner
949
        alphaGrad<68, 100>(out.template ref<3, 3>(), col); //exact: 0.6848532563
728
        alphagrad (&(out.template ref<3, 3>()), col, 68, 100); //exact: 0.6848532563
950
        alphaGrad< 9, 100>(out.template ref<3, 2>(), col); //0.08677704501
729
        alphagrad (&(out.template ref<3, 2>()), col,  9, 100); //0.08677704501
951
        alphaGrad< 9, 100>(out.template ref<2, 3>(), col); //0.08677704501
730
        alphagrad (&(out.template ref<2, 3>()), col,  9, 100); //0.08677704501
952
    }
731
    }
953
};
732
};
954
 
733
 
955
 
734
 
956
template <class ColorGradient> struct Scaler5x : public ColorGradient
735
struct Scaler5x
957
{
736
{
958
    static const int scale = 5;
737
    static const int scale = 5;
959
 
-
 
960
    template <unsigned int M, unsigned int N> //bring template function into scope for GCC
-
 
961
    static void alphaGrad(uint32_t& pixBack, uint32_t pixFront) { ColorGradient::template alphaGrad<M, N>(pixBack, pixFront); }
-
 
962
 
738
 
963
 
739
 
964
    template <class OutputMatrix>
740
    template <class OutputMatrix>
965
    static void blendLineShallow(uint32_t col, OutputMatrix& out)
741
    static void blendLineShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
966
    {
742
    {
967
        alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
743
        alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
968
        alphaGrad<1, 4>(out.template ref<scale - 2, 2>(), col);
744
        alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
969
        alphaGrad<1, 4>(out.template ref<scale - 3, 4>(), col);
745
        alphagrad (&(out.template ref<scale - 3, 4>()), col, 1, 4);
970
 
-
 
971
        alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
746
        alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
972
        alphaGrad<3, 4>(out.template ref<scale - 2, 3>(), col);
747
        alphagrad (&(out.template ref<scale - 2, 3>()), col, 3, 4);
973
 
748
 
974
        out.template ref<scale - 1, 2>() = col;
749
        out.template ref<scale - 1, 2>() = col;
975
        out.template ref<scale - 1, 3>() = col;
750
        out.template ref<scale - 1, 3>() = col;
976
        out.template ref<scale - 1, 4>() = col;
751
        out.template ref<scale - 1, 4>() = col;
977
        out.template ref<scale - 2, 4>() = col;
752
        out.template ref<scale - 2, 4>() = col;
978
    }
753
    }
979
 
754
 
980
    template <class OutputMatrix>
755
    template <class OutputMatrix>
981
    static void blendLineSteep(uint32_t col, OutputMatrix& out)
756
    static void blendLineSteep(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
982
    {
757
    {
983
        alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col);
758
        alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
984
        alphaGrad<1, 4>(out.template ref<2, scale - 2>(), col);
759
        alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
985
        alphaGrad<1, 4>(out.template ref<4, scale - 3>(), col);
760
        alphagrad (&(out.template ref<4, scale - 3>()), col, 1, 4);
986
 
-
 
987
        alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);
761
        alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
988
        alphaGrad<3, 4>(out.template ref<3, scale - 2>(), col);
762
        alphagrad (&(out.template ref<3, scale - 2>()), col, 3, 4);
989
 
763
 
990
        out.template ref<2, scale - 1>() = col;
764
        out.template ref<2, scale - 1>() = col;
991
        out.template ref<3, scale - 1>() = col;
765
        out.template ref<3, scale - 1>() = col;
992
        out.template ref<4, scale - 1>() = col;
766
        out.template ref<4, scale - 1>() = col;
993
        out.template ref<4, scale - 2>() = col;
767
        out.template ref<4, scale - 2>() = col;
994
    }
768
    }
995
 
769
 
996
    template <class OutputMatrix>
770
    template <class OutputMatrix>
997
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
771
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
998
    {
772
    {
999
        alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col);
773
        alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
1000
        alphaGrad<1, 4>(out.template ref<2, scale - 2>(), col);
774
        alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
1001
        alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);
775
        alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
1002
 
-
 
1003
        alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
776
        alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
1004
        alphaGrad<1, 4>(out.template ref<scale - 2, 2>(), col);
777
        alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
1005
        alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
778
        alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
1006
 
-
 
1007
        alphaGrad<2, 3>(out.template ref<3, 3>(), col);
779
        alphagrad (&(out.template ref<3, 3>()), col, 2, 3);
1008
 
780
 
1009
        out.template ref<2, scale - 1>() = col;
781
        out.template ref<2, scale - 1>() = col;
1010
        out.template ref<3, scale - 1>() = col;
782
        out.template ref<3, scale - 1>() = col;
1011
        out.template ref<4, scale - 1>() = col;
783
        out.template ref<4, scale - 1>() = col;
1012
 
-
 
1013
        out.template ref<scale - 1, 2>() = col;
784
        out.template ref<scale - 1, 2>() = col;
1014
        out.template ref<scale - 1, 3>() = col;
785
        out.template ref<scale - 1, 3>() = col;
1015
    }
786
    }
1016
 
787
 
1017
    template <class OutputMatrix>
788
    template <class OutputMatrix>
1018
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
789
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
1019
    {
790
    {
1020
        alphaGrad<1, 8>(out.template ref<scale - 1, scale / 2    >(), col); //conflict with other rotations for this odd scale
791
        alphagrad (&(out.template ref<scale - 1, scale / 2    >()), col, 1, 8); //conflict with other rotations for this odd scale
1021
        alphaGrad<1, 8>(out.template ref<scale - 2, scale / 2 + 1>(), col);
792
        alphagrad (&(out.template ref<scale - 2, scale / 2 + 1>()), col, 1, 8);
1022
        alphaGrad<1, 8>(out.template ref<scale - 3, scale / 2 + 2>(), col); //
793
        alphagrad (&(out.template ref<scale - 3, scale / 2 + 2>()), col, 1, 8); //
1023
 
-
 
1024
        alphaGrad<7, 8>(out.template ref<4, 3>(), col);
794
        alphagrad (&(out.template ref<4, 3>()), col, 7, 8);
1025
        alphaGrad<7, 8>(out.template ref<3, 4>(), col);
795
        alphagrad (&(out.template ref<3, 4>()), col, 7, 8);
1026
 
796
 
1027
        out.template ref<4, 4>() = col;
797
        out.template ref<4, 4>() = col;
1028
    }
798
    }
1029
 
799
 
1030
    template <class OutputMatrix>
800
    template <class OutputMatrix>
1031
    static void blendCorner(uint32_t col, OutputMatrix& out)
801
    static void blendCorner(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
1032
    {
802
    {
1033
        // model a round corner
803
        // model a round corner
1034
        alphaGrad<86, 100>(out.template ref<4, 4>(), col); //exact: 0.8631434088
804
        alphagrad (&(out.template ref<4, 4>()), col, 86, 100); //exact: 0.8631434088
1035
        alphaGrad<23, 100>(out.template ref<4, 3>(), col); //0.2306749731
805
        alphagrad (&(out.template ref<4, 3>()), col, 23, 100); //0.2306749731
1036
        alphaGrad<23, 100>(out.template ref<3, 4>(), col); //0.2306749731
806
        alphagrad (&(out.template ref<3, 4>()), col, 23, 100); //0.2306749731
1037
        //alphaGrad<1, 64>(out.template ref<4, 2>(), col); //0.01676812367 -> negligible + avoid conflicts with other rotations for this odd scale
807
        //alphaGrad<1, 64>(out.template ref<4, 2>(), col); //0.01676812367 -> negligible + avoid conflicts with other rotations for this odd scale
1038
        //alphaGrad<1, 64>(out.template ref<2, 4>(), col); //0.01676812367
808
        //alphaGrad<1, 64>(out.template ref<2, 4>(), col); //0.01676812367
1039
    }
809
    }
1040
};
810
};
1041
 
811
 
1042
 
812
 
1043
template <class ColorGradient> struct Scaler6x : public ColorGradient
813
struct Scaler6x
1044
{
814
{
1045
    static const int scale = 6;
815
    static const int scale = 6;
1046
 
-
 
1047
    template <unsigned int M, unsigned int N> //bring template function into scope for GCC
-
 
1048
    static void alphaGrad(uint32_t& pixBack, uint32_t pixFront) { ColorGradient::template alphaGrad<M, N>(pixBack, pixFront); }
-
 
1049
 
816
 
1050
 
817
 
1051
    template <class OutputMatrix>
818
    template <class OutputMatrix>
1052
    static void blendLineShallow(uint32_t col, OutputMatrix& out)
819
    static void blendLineShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
1053
    {
820
    {
1054
        alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
821
        alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
1055
        alphaGrad<1, 4>(out.template ref<scale - 2, 2>(), col);
822
        alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
1056
        alphaGrad<1, 4>(out.template ref<scale - 3, 4>(), col);
823
        alphagrad (&(out.template ref<scale - 3, 4>()), col, 1, 4);
1057
 
-
 
1058
        alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
824
        alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
1059
        alphaGrad<3, 4>(out.template ref<scale - 2, 3>(), col);
825
        alphagrad (&(out.template ref<scale - 2, 3>()), col, 3, 4);
1060
        alphaGrad<3, 4>(out.template ref<scale - 3, 5>(), col);
826
        alphagrad (&(out.template ref<scale - 3, 5>()), col, 3, 4);
1061
 
827
 
1062
        out.template ref<scale - 1, 2>() = col;
828
        out.template ref<scale - 1, 2>() = col;
1063
        out.template ref<scale - 1, 3>() = col;
829
        out.template ref<scale - 1, 3>() = col;
1064
        out.template ref<scale - 1, 4>() = col;
830
        out.template ref<scale - 1, 4>() = col;
1065
        out.template ref<scale - 1, 5>() = col;
831
        out.template ref<scale - 1, 5>() = col;
1066
 
-
 
1067
        out.template ref<scale - 2, 4>() = col;
832
        out.template ref<scale - 2, 4>() = col;
1068
        out.template ref<scale - 2, 5>() = col;
833
        out.template ref<scale - 2, 5>() = col;
1069
    }
834
    }
1070
 
835
 
1071
    template <class OutputMatrix>
836
    template <class OutputMatrix>
1072
    static void blendLineSteep(uint32_t col, OutputMatrix& out)
837
    static void blendLineSteep(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
1073
    {
838
    {
1074
        alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col);
839
        alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
1075
        alphaGrad<1, 4>(out.template ref<2, scale - 2>(), col);
840
        alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
1076
        alphaGrad<1, 4>(out.template ref<4, scale - 3>(), col);
841
        alphagrad (&(out.template ref<4, scale - 3>()), col, 1, 4);
1077
 
-
 
1078
        alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);
842
        alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
1079
        alphaGrad<3, 4>(out.template ref<3, scale - 2>(), col);
843
        alphagrad (&(out.template ref<3, scale - 2>()), col, 3, 4);
1080
        alphaGrad<3, 4>(out.template ref<5, scale - 3>(), col);
844
        alphagrad (&(out.template ref<5, scale - 3>()), col, 3, 4);
1081
 
845
 
1082
        out.template ref<2, scale - 1>() = col;
846
        out.template ref<2, scale - 1>() = col;
1083
        out.template ref<3, scale - 1>() = col;
847
        out.template ref<3, scale - 1>() = col;
1084
        out.template ref<4, scale - 1>() = col;
848
        out.template ref<4, scale - 1>() = col;
1085
        out.template ref<5, scale - 1>() = col;
849
        out.template ref<5, scale - 1>() = col;
1086
 
-
 
1087
        out.template ref<4, scale - 2>() = col;
850
        out.template ref<4, scale - 2>() = col;
1088
        out.template ref<5, scale - 2>() = col;
851
        out.template ref<5, scale - 2>() = col;
1089
    }
852
    }
1090
 
853
 
1091
    template <class OutputMatrix>
854
    template <class OutputMatrix>
1092
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
855
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
1093
    {
856
    {
1094
        alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col);
857
        alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
1095
        alphaGrad<1, 4>(out.template ref<2, scale - 2>(), col);
858
        alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
1096
        alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);
859
        alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
1097
        alphaGrad<3, 4>(out.template ref<3, scale - 2>(), col);
860
        alphagrad (&(out.template ref<3, scale - 2>()), col, 3, 4);
1098
 
-
 
1099
        alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
861
        alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
1100
        alphaGrad<1, 4>(out.template ref<scale - 2, 2>(), col);
862
        alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
1101
        alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
863
        alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
1102
        alphaGrad<3, 4>(out.template ref<scale - 2, 3>(), col);
864
        alphagrad (&(out.template ref<scale - 2, 3>()), col, 3, 4);
1103
 
865
 
1104
        out.template ref<2, scale - 1>() = col;
866
        out.template ref<2, scale - 1>() = col;
1105
        out.template ref<3, scale - 1>() = col;
867
        out.template ref<3, scale - 1>() = col;
1106
        out.template ref<4, scale - 1>() = col;
868
        out.template ref<4, scale - 1>() = col;
1107
        out.template ref<5, scale - 1>() = col;
869
        out.template ref<5, scale - 1>() = col;
1108
 
-
 
1109
        out.template ref<4, scale - 2>() = col;
870
        out.template ref<4, scale - 2>() = col;
1110
        out.template ref<5, scale - 2>() = col;
871
        out.template ref<5, scale - 2>() = col;
1111
 
-
 
1112
        out.template ref<scale - 1, 2>() = col;
872
        out.template ref<scale - 1, 2>() = col;
1113
        out.template ref<scale - 1, 3>() = col;
873
        out.template ref<scale - 1, 3>() = col;
1114
    }
874
    }
1115
 
875
 
1116
    template <class OutputMatrix>
876
    template <class OutputMatrix>
1117
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
877
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
1118
    {
878
    {
1119
        alphaGrad<1, 2>(out.template ref<scale - 1, scale / 2    >(), col);
879
        alphagrad (&(out.template ref<scale - 1, scale / 2    >()), col, 1, 2);
1120
        alphaGrad<1, 2>(out.template ref<scale - 2, scale / 2 + 1>(), col);
880
        alphagrad (&(out.template ref<scale - 2, scale / 2 + 1>()), col, 1, 2);
1121
        alphaGrad<1, 2>(out.template ref<scale - 3, scale / 2 + 2>(), col);
881
        alphagrad (&(out.template ref<scale - 3, scale / 2 + 2>()), col, 1, 2);
1122
 
882
 
1123
        out.template ref<scale - 2, scale - 1>() = col;
883
        out.template ref<scale - 2, scale - 1>() = col;
1124
        out.template ref<scale - 1, scale - 1>() = col;
884
        out.template ref<scale - 1, scale - 1>() = col;
1125
        out.template ref<scale - 1, scale - 2>() = col;
885
        out.template ref<scale - 1, scale - 2>() = col;
1126
    }
886
    }
1127
 
887
 
1128
    template <class OutputMatrix>
888
    template <class OutputMatrix>
1129
    static void blendCorner(uint32_t col, OutputMatrix& out)
889
    static void blendCorner(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
1130
    {
890
    {
1131
        //model a round corner
891
        //model a round corner
1132
        alphaGrad<97, 100>(out.template ref<5, 5>(), col); //exact: 0.9711013910
892
        alphagrad (&(out.template ref<5, 5>()), col, 97, 100); //exact: 0.9711013910
1133
        alphaGrad<42, 100>(out.template ref<4, 5>(), col); //0.4236372243
893
        alphagrad (&(out.template ref<4, 5>()), col, 42, 100); //0.4236372243
1134
        alphaGrad<42, 100>(out.template ref<5, 4>(), col); //0.4236372243
894
        alphagrad (&(out.template ref<5, 4>()), col, 42, 100); //0.4236372243
1135
        alphaGrad< 6, 100>(out.template ref<5, 3>(), col); //0.05652034508
895
        alphagrad (&(out.template ref<5, 3>()), col,  6, 100); //0.05652034508
1136
        alphaGrad< 6, 100>(out.template ref<3, 5>(), col); //0.05652034508
896
        alphagrad (&(out.template ref<3, 5>()), col,  6, 100); //0.05652034508
1137
    }
897
    }
1138
};
898
};
1139
 
899
 
1140
        //------------------------------------------------------------------------------------
900
        //------------------------------------------------------------------------------------
1141
        struct ColorDistanceRGB
-
 
1142
        {
901
}
1143
            static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
-
 
1144
            {
-
 
1145
                return distYCbCrBuffered(pix1, pix2);
-
 
1146
 
902
 
1147
                //if (pix1 == pix2) //about 4% perf boost
-
 
1148
                //    return 0;
-
 
1149
                //return distYCbCr(pix1, pix2, luminanceWeight);
-
 
1150
            }
-
 
1151
        };
-
 
1152
 
903
 
1153
        struct ColorDistanceARGB
-
 
1154
        {
-
 
1155
            static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
-
 
1156
            {
-
 
1157
                const double a1 = GET_ALPHA (pix1) / 255.0 ;
-
 
1158
                const double a2 = GET_ALPHA (pix2) / 255.0 ;
-
 
1159
                /*
-
 
1160
                Requirements for a color distance handling alpha channel: with a1, a2 in [0, 1]
-
 
1161
       
-
 
1162
                    1. if a1 = a2, distance should be: a1 * distYCbCr()
-
 
1163
                    2. if a1 = 0,  distance should be: a2 * distYCbCr(black, white) = a2 * 255
-
 
1164
                    3. if a1 = 1,  ??? maybe: 255 * (1 - a2) + a2 * distYCbCr()
-
 
1165
                */
-
 
1166
 
904
 
-
 
905
static double dist24 (uint32_t pix1, uint32_t pix2)
-
 
906
{
-
 
907
    //30% perf boost compared to plain distYCbCr()!
1167
                //return std::min(a1, a2) * distYCbCrBuffered(pix1, pix2) + 255 * abs(a1 - a2);
908
    //consumes 64 MB memory; using double is only 2% faster, but takes 128 MB
1168
                //=> following code is 15% faster:
909
    static float diffToDist[256 * 256 * 256];
1169
                const double d = distYCbCrBuffered(pix1, pix2);
910
    static bool is_initialized = false;
1170
                if (a1 < a2)
911
    if (!is_initialized)
-
 
912
    {
1171
                    return a1 * d + 255 * (a2 - a1);
913
        for (uint32_t i = 0; i < 256 * 256 * 256; ++i) //startup time: 114 ms on Intel Core i5 (four cores)
1172
                else
914
        {
1173
                    return a2 * d + 255 * (a1 - a2);
915
            const int r_diff = GET_RED (i) * 2 - 0xFF;
-
 
916
            const int g_diff = GET_GREEN (i) * 2 - 0xFF;
-
 
917
            const int b_diff = GET_BLUE (i) * 2 - 0xFF;
1174
 
918
 
1175
                //alternative? return /*std::*/sqrt(a1 * a2 * square(distYCbCrBuffered(pix1, pix2)) + square(255 * (a1 - a2)));
919
            const double k_b = 0.0593; //ITU-R BT.2020 conversion
1176
            }
920
            const double k_r = 0.2627; //
1177
        };
-
 
-
 
921
            const double k_g = 1 - k_b - k_r;
1178
 
922
 
1179
        struct ColorGradientRGB
-
 
1180
        {
-
 
1181
            template <unsigned int M, unsigned int N> static void alphaGrad (uint32_t &pixBack, uint32_t pixFront)
923
            const double scale_b = 0.5 / (1 - k_b);
1182
            {
-
 
1183
                pixBack = gradientRGB<M, N> (pixFront, pixBack);
924
            const double scale_r = 0.5 / (1 - k_r);
1184
            }
-
 
1185
        };
-
 
1186
 
925
 
-
 
926
            const double y   = k_r * r_diff + k_g * g_diff + k_b * b_diff; //[!], analog YCbCr!
-
 
927
            const double c_b = scale_b * (b_diff - y);
1187
        struct ColorGradientARGB
928
            const double c_r = scale_r * (r_diff - y);
1188
        {
929
 
1189
            template <unsigned int M, unsigned int N> static void alphaGrad (uint32_t &pixBack, uint32_t pixFront)
930
            diffToDist[i] = (float) (sqrt ((y * y) + (c_b * c_b) + (c_r * c_r)));
1190
            {
931
        }
1191
                pixBack = gradientARGB<M, N> (pixFront, pixBack);
932
        is_initialized = true;
1192
            }
933
    }
-
 
934
 
-
 
935
    const int r_diff = (int) GET_RED   (pix1) - (int) GET_RED   (pix2);
-
 
936
    const int g_diff = (int) GET_GREEN (pix1) - (int) GET_GREEN (pix2);
-
 
937
    const int b_diff = (int) GET_BLUE  (pix1) - (int) GET_BLUE  (pix2);
1193
        };
938
 
-
 
939
    return diffToDist[(((r_diff + 0xFF) / 2) << 16) | //slightly reduce precision (division by 2) to squeeze value into single byte
-
 
940
                      (((g_diff + 0xFF) / 2) <<  8) |
-
 
941
                      (((b_diff + 0xFF) / 2) <<  0)];
1194
}
942
}
1195
 
943
 
1196
 
944
 
-
 
945
static double dist32 (uint32_t pix1, uint32_t pix2)
-
 
946
{
-
 
947
    const double a1 = GET_ALPHA (pix1) / 255.0 ;
-
 
948
    const double a2 = GET_ALPHA (pix2) / 255.0 ;
-
 
949
    /*
-
 
950
    Requirements for a color distance handling alpha channel: with a1, a2 in [0, 1]
1197
 
951
 
-
 
952
        1. if a1 = a2, distance should be: a1 * distYCbCr()
-
 
953
        2. if a1 = 0,  distance should be: a2 * distYCbCr(black, white) = a2 * 255
-
 
954
        3. if a1 = 1,  ??? maybe: 255 * (1 - a2) + a2 * distYCbCr()
-
 
955
    */
-
 
956
 
-
 
957
    //return MIN (a1, a2) * distYCbCrBuffered(pix1, pix2) + 255 * abs(a1 - a2);
-
 
958
    //=> following code is 15% faster:
-
 
959
    const double d = dist24 (pix1, pix2);
-
 
960
    return (a1 < a2 ? a1 * d + 255 * (a2 - a1) : a2 * d + 255 * (a1 - a2));
-
 
961
}
-
 
962
 
-
 
963
 
1198
void xbrz::nearestNeighborScale(const uint32_t* src, int srcWidth, int srcHeight, uint32_t* trg, int trgWidth, int trgHeight)
964
static void alphagrad24 (uint32_t *pixBack, uint32_t pixFront, unsigned int M, unsigned int N)
1199
{
965
{
-
 
966
        // blend front color with opacity M / N over opaque background: http://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending
-
 
967
        *pixBack = (  (CALC_COLOR24 (GET_RED   (pixFront), GET_RED   (*pixBack), M, N) << 16)
-
 
968
                                | (CALC_COLOR24 (GET_GREEN (pixFront), GET_GREEN (*pixBack), M, N) <<  8)
-
 
969
                                | (CALC_COLOR24 (GET_BLUE  (pixFront), GET_BLUE  (*pixBack), M, N) <<  0));
-
 
970
}
-
 
971
 
-
 
972
 
-
 
973
static void alphagrad32 (uint32_t *pixBack, uint32_t pixFront, unsigned int M, unsigned int N)
-
 
974
{
-
 
975
        // find intermediate color between two colors with alpha channels (=> NO alpha blending!!!)
-
 
976
    const unsigned int weightFront = GET_ALPHA (pixFront) * M;
-
 
977
    const unsigned int weightBack  = GET_ALPHA (*pixBack) * (N - M);
-
 
978
    const unsigned int weightSum   = weightFront + weightBack;
-
 
979
    *pixBack = (weightSum == 0 ? 0 :
-
 
980
                                (((unsigned char) (weightSum / N))                                                               << 24)
-
 
981
                                | (CALC_COLOR32 (GET_RED   (pixFront), GET_RED   (*pixBack), weightFront, weightBack, weightSum) << 16)
-
 
982
                                | (CALC_COLOR32 (GET_GREEN (pixFront), GET_GREEN (*pixBack), weightFront, weightBack, weightSum) <<  8)
-
 
983
                                | (CALC_COLOR32 (GET_BLUE  (pixFront), GET_BLUE  (*pixBack), weightFront, weightBack, weightSum) <<  0));
-
 
984
}
-
 
985
 
-
 
986
 
-
 
987
EXTERN_C void nearestNeighborScale(const uint32_t *src, int srcWidth, int srcHeight, uint32_t *trg, int trgWidth, int trgHeight)
-
 
988
{
1200
    nearestNeighborScale (src, srcWidth, srcHeight, srcWidth * sizeof (uint32_t), trg, trgWidth, trgHeight, trgWidth * sizeof (uint32_t), XBRZ_SLICETYPE_TARGET, 0, trgHeight, [](uint32_t pix) { return pix; });
989
//    nearestNeighborScale (src, srcWidth, srcHeight, srcWidth * sizeof (uint32_t), trg, trgWidth, trgHeight, trgWidth * sizeof (uint32_t), XBRZ_SLICETYPE_TARGET, 0, trgHeight, [](uint32_t pix) { return pix; });
-
 
990
    //static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*");
-
 
991
    //static_assert(std::is_integral<PixTrg>::value, "PixTrg* is expected to be cast-able to char*");
-
 
992
    //static_assert(std::is_same<decltype(pixCvrt(PixSrc())), PixTrg>::value, "PixConverter returning wrong pixel format");
-
 
993
 
-
 
994
    int srcPitch = srcWidth * sizeof (uint32_t);
-
 
995
    int trgPitch = trgWidth * sizeof (uint32_t);
-
 
996
    int yFirst;
-
 
997
    int yLast;
-
 
998
 
-
 
999
#if 0 // going over source image - fast for upscaling, since source is read only once
-
 
1000
    yFirst = 0;
-
 
1001
    yLast  = MIN (trgHeight, srcHeight);
-
 
1002
 
-
 
1003
    if (yFirst >= yLast || trgWidth <= 0 || trgHeight <= 0)
-
 
1004
        return; // consistency check
-
 
1005
 
-
 
1006
    for (int y = yFirst; y < yLast; ++y)
-
 
1007
    {
-
 
1008
        //mathematically: ySrc = floor(srcHeight * yTrg / trgHeight)
-
 
1009
        // => search for integers in: [ySrc, ySrc + 1) * trgHeight / srcHeight
-
 
1010
 
-
 
1011
        //keep within for loop to support MT input slices!
-
 
1012
        const int yTrg_first = ( y      * trgHeight + srcHeight - 1) / srcHeight; //=ceil(y * trgHeight / srcHeight)
-
 
1013
        const int yTrg_last  = ((y + 1) * trgHeight + srcHeight - 1) / srcHeight; //=ceil(((y + 1) * trgHeight) / srcHeight)
-
 
1014
        const int blockHeight = yTrg_last - yTrg_first;
-
 
1015
 
-
 
1016
        if (blockHeight > 0)
-
 
1017
        {
-
 
1018
            const uint32_t *srcLine = (const uint32_t *) BYTE_ADVANCE (src, y * srcPitch);
-
 
1019
            /**/  uint32_t *trgLine = (      uint32_t *) BYTE_ADVANCE (trg, yTrg_first * trgPitch);
-
 
1020
            int xTrg_first = 0;
-
 
1021
 
-
 
1022
            for (int x = 0; x < srcWidth; ++x)
-
 
1023
            {
-
 
1024
                const int xTrg_last = ((x + 1) * trgWidth + srcWidth - 1) / srcWidth;
-
 
1025
                const int blockWidth = xTrg_last - xTrg_first;
-
 
1026
                if (blockWidth > 0)
-
 
1027
                {
-
 
1028
                    const uint32_t trgColor = srcLine[x];
-
 
1029
                                        uint32_t *blkLine = trgLine;
-
 
1030
 
-
 
1031
                    xTrg_first = xTrg_last;
-
 
1032
 
-
 
1033
                                    for (int blk_y = 0; blk_y < blockHeight; ++blk_y, blkLine = (uint32_t *) BYTE_ADVANCE (blkLine, trgPitch))
-
 
1034
                                        for (int blk_x = 0; blk_x < blockWidth; ++blk_x)
-
 
1035
                                            blkLine[blk_x] = trgColor;
-
 
1036
 
-
 
1037
                    trgLine += blockWidth;
-
 
1038
                }
-
 
1039
            }
-
 
1040
        }
-
 
1041
    }
-
 
1042
#else // going over target image - slow for upscaling, since source is read multiple times missing out on cache! Fast for similar image sizes!
-
 
1043
    yFirst = 0;
-
 
1044
    yLast  = trgHeight;
-
 
1045
 
-
 
1046
    if (yFirst >= yLast || srcHeight <= 0 || srcWidth <= 0)
-
 
1047
        return; // consistency check
-
 
1048
 
-
 
1049
    for (int y = yFirst; y < yLast; ++y)
-
 
1050
    {
-
 
1051
        /**/  uint32_t *trgLine = (      uint32_t *) BYTE_ADVANCE (trg, y * trgPitch);
-
 
1052
        const int ySrc = srcHeight * y / trgHeight;
-
 
1053
        const uint32_t *srcLine = (const uint32_t *) BYTE_ADVANCE (src, ySrc * srcPitch);
-
 
1054
        for (int x = 0; x < trgWidth; ++x)
-
 
1055
        {
-
 
1056
            const int xSrc = srcWidth * x / trgWidth;
-
 
1057
            trgLine[x] = srcLine[xSrc];
-
 
1058
        }
-
 
1059
    }
-
 
1060
#endif // going over source or target
-
 
1061
 
-
 
1062
        return;
1201
}
1063
}
1202
 
1064
 
1203
 
1065
 
1204
// NOTE(review): this span is a two-revision diff listing, not one definition. Every source
// line appears twice (one copy per revision) and the bare integers between the copies are
// the diff viewer's line numbers.
//
// xbrz_equalcolortest24: returns true when the distance computed for col1 and col2 is
// strictly below equalColorTolerance, false otherwise.
//  - first copy:  the distance comes from ColorDistanceRGB::dist (col1, col2, luminanceWeight).
//  - second copy: the distance comes from dist24 (col1, col2); luminanceWeight is still a
//                 parameter but is not referenced in the body.
EXTERN_C bool xbrz_equalcolortest24 (uint32_t col1, uint32_t col2, double luminanceWeight, double equalColorTolerance)
1066
EXTERN_C bool xbrz_equalcolortest24 (uint32_t col1, uint32_t col2, double luminanceWeight, double equalColorTolerance)
1205
{
1067
{
1206
        return (ColorDistanceRGB::dist(col1, col2, luminanceWeight) < equalColorTolerance);
1068
        return (dist24 (col1, col2) < equalColorTolerance);
1207
}
1069
}
1208
 
1070
 
1209
 
1071
 
1210
// NOTE(review): this span is a two-revision diff listing, not one definition. Every source
// line appears twice (one copy per revision) and the bare integers between the copies are
// the diff viewer's line numbers.
//
// xbrz_equalcolortest32: returns true when the distance computed for col1 and col2 is
// strictly below equalColorTolerance, false otherwise.
//  - first copy:  the distance comes from ColorDistanceARGB::dist (col1, col2, luminanceWeight).
//  - second copy: the distance comes from dist32 (col1, col2); luminanceWeight is still a
//                 parameter but is not referenced in the body.
EXTERN_C bool xbrz_equalcolortest32 (uint32_t col1, uint32_t col2, double luminanceWeight, double equalColorTolerance)
1072
EXTERN_C bool xbrz_equalcolortest32 (uint32_t col1, uint32_t col2, double luminanceWeight, double equalColorTolerance)
1211
{
1073
{
1212
        return (ColorDistanceARGB::dist(col1, col2, luminanceWeight) < equalColorTolerance);
1074
        return (dist32 (col1, col2) < equalColorTolerance);
1213
}
1075
}
1214
 
1076
 
1215
 
1077
 
1216
// NOTE(review): this span is a two-revision diff listing, not one definition. Every source
// line appears twice (one copy per revision) and the bare integers between the copies are
// the diff viewer's line numbers.
//
// xbrz_scale24: selects a scaleImage instantiation by `factor` (2 through 6 map to
// Scaler2x .. Scaler6x) and calls it with src, trg, srcWidth, srcHeight and the full row
// range 0 .. srcHeight.
//  - first copy:  the colour handling is chosen through template arguments
//                 (ScalerNx<ColorGradientRGB>, ColorDistanceRGB).
//  - second copy: the scaler is the only template argument; alphagrad24 and dist24 are
//                 passed as trailing call arguments instead.
// Any factor outside 2..6 matches no branch, so the function returns without calling
// scaleImage at all.
EXTERN_C void xbrz_scale24 (size_t factor, const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight)
1078
EXTERN_C void xbrz_scale24 (size_t factor, const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight)
1217
{
1079
{
1218
    if      (factor == 2) return scaleImage<Scaler2x<ColorGradientRGB>, ColorDistanceRGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
1080
    if      (factor == 2) return scaleImage<Scaler2x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24);
1219
    else if (factor == 3) return scaleImage<Scaler3x<ColorGradientRGB>, ColorDistanceRGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
1081
    else if (factor == 3) return scaleImage<Scaler3x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24);
1220
    else if (factor == 4) return scaleImage<Scaler4x<ColorGradientRGB>, ColorDistanceRGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
1082
    else if (factor == 4) return scaleImage<Scaler4x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24);
1221
    else if (factor == 5) return scaleImage<Scaler5x<ColorGradientRGB>, ColorDistanceRGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
1083
    else if (factor == 5) return scaleImage<Scaler5x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24);
1222
    else if (factor == 6) return scaleImage<Scaler6x<ColorGradientRGB>, ColorDistanceRGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
1084
    else if (factor == 6) return scaleImage<Scaler6x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24);
1223
}
1085
}
1224
 
1086
 
1225
 
1087
 
1226
// NOTE(review): this span is a two-revision diff listing, not one definition. Every source
// line appears twice (one copy per revision) and the bare integers between the copies are
// the diff viewer's line numbers.
//
// xbrz_scale32: selects a scaleImage instantiation by `factor` (2 through 6 map to
// Scaler2x .. Scaler6x) and calls it with src, trg, srcWidth, srcHeight and the full row
// range 0 .. srcHeight.
//  - first copy:  the colour handling is chosen through template arguments
//                 (ScalerNx<ColorGradientARGB>, ColorDistanceARGB).
//  - second copy: the scaler is the only template argument; alphagrad32 and dist32 are
//                 passed as trailing call arguments instead.
// Any factor outside 2..6 matches no branch, so the function returns without calling
// scaleImage at all.
EXTERN_C void xbrz_scale32 (size_t factor, const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight)
1088
EXTERN_C void xbrz_scale32 (size_t factor, const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight)
1227
{
1089
{
1228
    if      (factor == 2) return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceARGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
1090
    if      (factor == 2) return scaleImage<Scaler2x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32);
1229
    else if (factor == 3) return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceARGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
1091
    else if (factor == 3) return scaleImage<Scaler3x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32);
1230
    else if (factor == 4) return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceARGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
1092
    else if (factor == 4) return scaleImage<Scaler4x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32);
1231
    else if (factor == 5) return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceARGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
1093
    else if (factor == 5) return scaleImage<Scaler5x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32);
1232
    else if (factor == 6) return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceARGB> (src, trg, srcWidth, srcHeight, 0, srcHeight);
1094
    else if (factor == 6) return scaleImage<Scaler6x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32);
1233
}
1095
}