Subversion Repositories Games.Prince of Persia

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 pmbaty 1
// ****************************************************************************
2
// * This file is part of the HqMAME project. It is distributed under         *
3
// * GNU General Public License: https://www.gnu.org/licenses/gpl-3.0         *
4
// * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved          *
5
// *                                                                          *
6
// * Additionally and as a special exception, the author gives permission     *
7
// * to link the code of this program with the MAME library (or with modified *
8
// * versions of MAME that use the same license as MAME), and distribute      *
9
// * linked combinations including the two. You must obey the GNU General     *
10
// * Public License in all respects for all of the code used other than MAME. *
11
// * If you modify this file, you may extend this exception to your version   *
12
// * of the file, but you are not obligated to do so. If you do not wish to   *
13
// * do so, delete this exception statement from your version.                *
14
// ****************************************************************************
15
 
3 pmbaty 16
// -------------------------------------------------------------------------
17
// | xBRZ: "Scale by rules" - high quality image upscaling filter by Zenju |
18
// -------------------------------------------------------------------------
19
// using a modified approach of xBR:
20
// http://board.byuu.org/viewtopic.php?f=10&t=2248
21
//  - new rule set preserving small image features
22
//  - highly optimized for performance
23
//  - support alpha channel
24
//  - support multithreading
25
//  - support 64-bit architectures
26
//  - support processing image slices
27
//  - support scaling up to 6xBRZ
2 pmbaty 28
 
3 pmbaty 29
// -> map source (srcWidth * srcHeight) to target (scale * width x scale * height) image, optionally processing a half-open slice of rows [yFirst, yLast) only
30
// -> support for source/target pitch in bytes!
31
// -> if your emulator changes only a few image slices during each cycle (e.g. DOSBox) then there's no need to run xBRZ on the complete image:
32
//    Just make sure you enlarge the source image slice by 2 rows on top and 2 on bottom (this is the additional range the xBRZ algorithm is using during analysis)
33
//    CAVEAT: If there are multiple changed slices, make sure they do not overlap after adding these additional rows in order to avoid a memory race condition
34
//    in the target image data if you are using multiple threads for processing each enlarged slice!
35
// 
36
// THREAD-SAFETY: - parts of the same image may be scaled by multiple threads as long as the [yFirst, yLast) ranges do not overlap!
37
//                - there is a minor inefficiency for the first row of a slice, so avoid processing single rows only; suggestion: process at least 8-16 rows
38
 
39
 
40
#include <stddef.h> // for size_t
41
#include <stdint.h> // for uint32_t
42
#include <memory.h> // for memset()
43
#include <limits.h>
2 pmbaty 44
#include <math.h>
45
 
46
 
47
// EXTERN_C expands to a C linkage specification when this file is compiled as
// C++ and to nothing when it is compiled as C.
#ifdef __cplusplus
#define EXTERN_C extern "C"
#else // !__cplusplus
#define EXTERN_C
#endif // __cplusplus
52
 
53
 
54
// scaler configuration

// Not referenced in this part of the file; presumably consumed by the color
// distance function supplied by the caller -- TODO confirm.
#define XBRZ_CFG_LUMINANCE_WEIGHT 1
// Color distances (as returned by the dist_func callback) at or above this
// value are treated as "different colors" by blendPixel().
#define XBRZ_CFG_EQUAL_COLOR_TOLERANCE 30
// preProcessCorners(): the weaker diagonal gradient multiplied by this factor
// must still be below the stronger one for a corner to be BLEND_DOMINANT.
#define XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD 3.6
// blendPixel(): ratio between the two line distances from which a blend line
// is classified as shallow or steep.
#define XBRZ_CFG_STEEP_DIRECTION_THRESHOLD 2.2
59
 
60
 
61
// slice types
// NOTE(review): neither constant is referenced in this part of the file; they
// presumably tell a later entry point whether a row range is given in source
// or in target coordinates -- confirm against the users of these macros.
#define XBRZ_SLICETYPE_SOURCE 1
#define XBRZ_SLICETYPE_TARGET 2
64
 
65
 
66
// handy macros

// Extract byte number "byteno" (0 = least significant) of an integer value.
#define GET_BYTE(val,byteno) ((unsigned char) (((val) >> ((byteno) << 3)) & 0xff))
// Channel accessors: blue is byte 0, green byte 1, red byte 2, alpha byte 3.
#define GET_BLUE(val)  GET_BYTE (val, 0)
#define GET_GREEN(val) GET_BYTE (val, 1)
#define GET_RED(val)   GET_BYTE (val, 2)
#define GET_ALPHA(val) GET_BYTE (val, 3)
// Blend one channel: (colFront * M + colBack * (N - M)) / N, i.e. colFront contributes the fraction M / N.
// Wrapped in outer parentheses like CALC_COLOR32 so the expansion is a single primary expression.
#define CALC_COLOR24(colFront,colBack,M,N) ((unsigned char) ((((unsigned char) (colFront)) * ((unsigned int) (M)) + ((unsigned char) (colBack)) * (((unsigned int) (N)) - ((unsigned int) (M)))) / ((unsigned int) (N))))
// Blend one channel with explicit weights: (colFront * weightFront + colBack * weightBack) / weightSum.
#define CALC_COLOR32(colFront,colBack,weightFront,weightBack,weightSum) ((unsigned char) ((((unsigned char) (colFront)) * ((unsigned int) (weightFront)) + ((unsigned char) (colBack)) * ((unsigned int) (weightBack))) / ((unsigned int) (weightSum))))
// Advance a pointer by "offset" bytes; the result is a char pointer.
// "buffer" is parenthesized so that expression arguments such as "p + 1" are
// evaluated before the cast (the cast binds tighter than "+").
#define BYTE_ADVANCE(buffer,offset) (((char *) (buffer)) + (offset))
#ifndef MIN
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif // MIN
#ifndef MAX
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif // MAX
2 pmbaty 81
 
82
 
3 pmbaty 83
// Callback that blends pixFront into *pixBack; M and N are passed by the
// scalers as numerator and denominator of the blend fraction (e.g. 1, 4).
typedef void (alphagrad_func) (uint32_t *pixBack, uint32_t pixFront, unsigned int M, unsigned int N);
// Callback returning the color distance between two pixels; the callers in
// this file compare the result against XBRZ_CFG_EQUAL_COLOR_TOLERANCE.
typedef double (dist_func) (uint32_t pix1, uint32_t pix2);
2 pmbaty 85
 
86
 
87
 
88
 
89
namespace
90
{
91
// FORCE_INLINE: strongest inlining request the compiler offers; falls back to
// plain "inline" on compilers other than MSVC and GCC-compatible ones.
#ifdef _MSC_VER
    #define FORCE_INLINE __forceinline
#elif defined __GNUC__
    #define FORCE_INLINE __attribute__((always_inline)) inline
#else
    #define FORCE_INLINE inline
#endif
98
 
99
 
100
// Clock-wise rotation in steps of 90 degrees. The values are consecutive
// starting at 0 because MatrixRotation recurses on "rotDeg - 1".
enum RotationDegree //clock-wise
{
    ROT_0 = 0,
    ROT_90,
    ROT_180,
    ROT_270
};
107
 
3 pmbaty 108
 
2 pmbaty 109
//calculate input matrix coordinates after rotation at compile time
3 pmbaty 110
template <RotationDegree rotDeg, size_t I, size_t J, size_t N> struct MatrixRotation;
2 pmbaty 111
 
3 pmbaty 112
 
113
template <size_t I, size_t J, size_t N> struct MatrixRotation<ROT_0, I, J, N>
2 pmbaty 114
{
115
    static const size_t I_old = I;
116
    static const size_t J_old = J;
117
};
118
 
3 pmbaty 119
 
2 pmbaty 120
template <RotationDegree rotDeg, size_t I, size_t J, size_t N> //(i, j) = (row, col) indices, N = size of (square) matrix
121
struct MatrixRotation
122
{
3 pmbaty 123
    static const size_t I_old = N - 1 - MatrixRotation<(RotationDegree)(rotDeg - 1), I, J, N>::J_old; //old coordinates before rotation!
124
    static const size_t J_old =         MatrixRotation<(RotationDegree)(rotDeg - 1), I, J, N>::I_old; //
2 pmbaty 125
};
126
 
127
 
3 pmbaty 128
// View onto an N x N block of target pixels whose element access is rotated
// by rotDeg at compile time (see MatrixRotation).
template <size_t N, RotationDegree rotDeg> class OutputMatrix
{
public:
    // out: top-left pixel of the block; outWidth: row stride of the image in pixels
    OutputMatrix (uint32_t *out, int outWidth) //access matrix area, top-left at position "out" for image with given width
        : out_(out),
          outWidth_(outWidth)
    {
    }

    // Reference to element (row I, column J) of the rotated view.
    template <size_t I, size_t J> uint32_t &ref() const
    {
        static const size_t I_old = MatrixRotation<rotDeg, I, J, N>::I_old;
        static const size_t J_old = MatrixRotation<rotDeg, I, J, N>::J_old;

        return *(out_ + J_old + I_old * outWidth_);
    }

    uint32_t* out_;
    int outWidth_;
};
148
 
149
 
150
 
151
 
152
// Blend decision for one corner of a pixel. Four of these are packed into a
// single byte (2 bits each) by setTopL()/setTopR()/setBottomR()/setBottomL().
enum BlendType
{
    BLEND_NONE = 0,
    BLEND_NORMAL,   //a normal indication to blend
    BLEND_DOMINANT, //a strong indication to blend
    //attention: BlendType must fit into the value range of 2 bit!!!
};
159
 
160
struct BlendResult
161
{
162
    BlendType
163
    /**/blend_f, blend_g,
164
    /**/blend_j, blend_k;
165
};
166
 
167
 
168
// 4x4 pixel neighborhood in row-major order (a..d is the top row, m..p the
// bottom row); the input pixel is "f".
struct Kernel_4x4 //kernel for preprocessing step
{
    uint32_t
    /**/a, b, c, d,
    /**/e, f, g, h,
    /**/i, j, k, l,
    /**/m, n, o, p;
};
176
 
177
/*
178
input kernel area naming convention:
179
-----------------
180
| A | B | C | D |
181
----|---|---|---|
182
| E | F | G | H |   //evaluate the four corners between F, G, J, K
183
----|---|---|---|   //input pixel is at position F
184
| I | J | K | L |
185
----|---|---|---|
186
| M | N | O | P |
187
-----------------
188
*/
189
FORCE_INLINE //detect blend direction
3 pmbaty 190
BlendResult preProcessCorners(const Kernel_4x4& ker, dist_func dist) //result: F, G, J, K corners of "GradientType"
2 pmbaty 191
{
192
    BlendResult result = {};
193
 
194
    if ((ker.f == ker.g &&
195
         ker.j == ker.k) ||
196
        (ker.f == ker.j &&
197
         ker.g == ker.k))
198
        return result;
199
 
200
    const int weight = 4;
3 pmbaty 201
    double jg = dist (ker.i, ker.f) + dist (ker.f, ker.c) + dist (ker.n, ker.k) + dist (ker.k, ker.h) + weight * dist (ker.j, ker.g);
202
    double fk = dist (ker.e, ker.j) + dist (ker.j, ker.o) + dist (ker.b, ker.g) + dist (ker.g, ker.l) + weight * dist (ker.f, ker.k);
2 pmbaty 203
 
204
    if (jg < fk) //test sample: 70% of values max(jg, fk) / min(jg, fk) are between 1.1 and 3.7 with median being 1.8
205
    {
206
        const bool dominantGradient = XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD * jg < fk;
207
        if (ker.f != ker.g && ker.f != ker.j)
208
            result.blend_f = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
209
 
210
        if (ker.k != ker.j && ker.k != ker.g)
211
            result.blend_k = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
212
    }
213
    else if (fk < jg)
214
    {
215
        const bool dominantGradient = XBRZ_CFG_DOMINANT_DIRECTION_THRESHOLD * fk < jg;
216
        if (ker.j != ker.f && ker.j != ker.k)
217
            result.blend_j = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
218
 
219
        if (ker.g != ker.f && ker.g != ker.k)
220
            result.blend_g = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
221
    }
222
    return result;
223
}
224
 
225
// 3x3 pixel neighborhood in row-major order (a..c is the top row, g..i the
// bottom row); the input pixel is "e".
struct Kernel_3x3
{
    uint32_t
    /**/a,  b,  c,
    /**/d,  e,  f,
    /**/g,  h,  i;
};
3 pmbaty 232
/*
2 pmbaty 233
#define DEF_GETTER(x) template <RotationDegree rotDeg> uint32_t inline get_##x(const Kernel_3x3& ker) { return ker.x; }
234
//we cannot and NEED NOT write "ker.##x" since ## concatenates preprocessor tokens but "." is not a token
235
DEF_GETTER(a) DEF_GETTER(b) DEF_GETTER(c)
236
DEF_GETTER(d) DEF_GETTER(e) DEF_GETTER(f)
237
DEF_GETTER(g) DEF_GETTER(h) DEF_GETTER(i)
238
#undef DEF_GETTER
239
 
240
#define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_90>(const Kernel_3x3& ker) { return ker.y; }
241
DEF_GETTER(a, g) DEF_GETTER(b, d) DEF_GETTER(c, a)
242
DEF_GETTER(d, h) DEF_GETTER(e, e) DEF_GETTER(f, b)
243
DEF_GETTER(g, i) DEF_GETTER(h, f) DEF_GETTER(i, c)
244
#undef DEF_GETTER
245
 
246
#define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_180>(const Kernel_3x3& ker) { return ker.y; }
247
DEF_GETTER(a, i) DEF_GETTER(b, h) DEF_GETTER(c, g)
248
DEF_GETTER(d, f) DEF_GETTER(e, e) DEF_GETTER(f, d)
249
DEF_GETTER(g, c) DEF_GETTER(h, b) DEF_GETTER(i, a)
250
#undef DEF_GETTER
251
 
252
#define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_270>(const Kernel_3x3& ker) { return ker.y; }
253
DEF_GETTER(a, c) DEF_GETTER(b, f) DEF_GETTER(c, i)
254
DEF_GETTER(d, b) DEF_GETTER(e, e) DEF_GETTER(f, h)
255
DEF_GETTER(g, a) DEF_GETTER(h, d) DEF_GETTER(i, g)
256
#undef DEF_GETTER
3 pmbaty 257
*/
2 pmbaty 258
 
3 pmbaty 259
template <RotationDegree rotDeg> uint32_t inline get_a (const Kernel_3x3& ker) { return ker.a; }
260
template <RotationDegree rotDeg> uint32_t inline get_b (const Kernel_3x3& ker) { return ker.b; }
261
template <RotationDegree rotDeg> uint32_t inline get_c (const Kernel_3x3& ker) { return ker.c; }
262
template <RotationDegree rotDeg> uint32_t inline get_d (const Kernel_3x3& ker) { return ker.d; }
263
template <RotationDegree rotDeg> uint32_t inline get_e (const Kernel_3x3& ker) { return ker.e; }
264
template <RotationDegree rotDeg> uint32_t inline get_f (const Kernel_3x3& ker) { return ker.f; }
265
template <RotationDegree rotDeg> uint32_t inline get_g (const Kernel_3x3& ker) { return ker.g; }
266
template <RotationDegree rotDeg> uint32_t inline get_h (const Kernel_3x3& ker) { return ker.h; }
267
template <RotationDegree rotDeg> uint32_t inline get_i (const Kernel_3x3& ker) { return ker.i; }
2 pmbaty 268
 
3 pmbaty 269
template <> inline uint32_t get_a<ROT_90>(const Kernel_3x3& ker) { return ker.g; }
270
template <> inline uint32_t get_b<ROT_90>(const Kernel_3x3& ker) { return ker.d; }
271
template <> inline uint32_t get_c<ROT_90>(const Kernel_3x3& ker) { return ker.a; }
272
template <> inline uint32_t get_d<ROT_90>(const Kernel_3x3& ker) { return ker.h; }
273
template <> inline uint32_t get_e<ROT_90>(const Kernel_3x3& ker) { return ker.e; }
274
template <> inline uint32_t get_f<ROT_90>(const Kernel_3x3& ker) { return ker.b; }
275
template <> inline uint32_t get_g<ROT_90>(const Kernel_3x3& ker) { return ker.i; }
276
template <> inline uint32_t get_h<ROT_90>(const Kernel_3x3& ker) { return ker.f; }
277
template <> inline uint32_t get_i<ROT_90>(const Kernel_3x3& ker) { return ker.c; }
278
 
279
template <> inline uint32_t get_a<ROT_180>(const Kernel_3x3& ker) { return ker.i; }
280
template <> inline uint32_t get_b<ROT_180>(const Kernel_3x3& ker) { return ker.h; }
281
template <> inline uint32_t get_c<ROT_180>(const Kernel_3x3& ker) { return ker.g; }
282
template <> inline uint32_t get_d<ROT_180>(const Kernel_3x3& ker) { return ker.f; }
283
template <> inline uint32_t get_e<ROT_180>(const Kernel_3x3& ker) { return ker.e; }
284
template <> inline uint32_t get_f<ROT_180>(const Kernel_3x3& ker) { return ker.d; }
285
template <> inline uint32_t get_g<ROT_180>(const Kernel_3x3& ker) { return ker.c; }
286
template <> inline uint32_t get_h<ROT_180>(const Kernel_3x3& ker) { return ker.b; }
287
template <> inline uint32_t get_i<ROT_180>(const Kernel_3x3& ker) { return ker.a; }
288
 
289
template <> inline uint32_t get_a<ROT_270>(const Kernel_3x3& ker) { return ker.c; }
290
template <> inline uint32_t get_b<ROT_270>(const Kernel_3x3& ker) { return ker.f; }
291
template <> inline uint32_t get_c<ROT_270>(const Kernel_3x3& ker) { return ker.i; }
292
template <> inline uint32_t get_d<ROT_270>(const Kernel_3x3& ker) { return ker.b; }
293
template <> inline uint32_t get_e<ROT_270>(const Kernel_3x3& ker) { return ker.e; }
294
template <> inline uint32_t get_f<ROT_270>(const Kernel_3x3& ker) { return ker.h; }
295
template <> inline uint32_t get_g<ROT_270>(const Kernel_3x3& ker) { return ker.a; }
296
template <> inline uint32_t get_h<ROT_270>(const Kernel_3x3& ker) { return ker.d; }
297
template <> inline uint32_t get_i<ROT_270>(const Kernel_3x3& ker) { return ker.g; }
298
 
2 pmbaty 299
//compress four blend types into a single byte
3 pmbaty 300
inline BlendType getTopL   (unsigned char b) { return (BlendType)(0x3 & b); }
301
inline BlendType getTopR   (unsigned char b) { return (BlendType)(0x3 & (b >> 2)); }
302
inline BlendType getBottomR(unsigned char b) { return (BlendType)(0x3 & (b >> 4)); }
303
inline BlendType getBottomL(unsigned char b) { return (BlendType)(0x3 & (b >> 6)); }
2 pmbaty 304
 
305
inline void setTopL   (unsigned char& b, BlendType bt) { b |= bt; } //buffer is assumed to be initialized before preprocessing!
306
inline void setTopR   (unsigned char& b, BlendType bt) { b |= (bt << 2); }
307
inline void setBottomR(unsigned char& b, BlendType bt) { b |= (bt << 4); }
308
inline void setBottomL(unsigned char& b, BlendType bt) { b |= (bt << 6); }
309
 
310
template <RotationDegree rotDeg> inline
3 pmbaty 311
unsigned char rotateBlendInfo (unsigned char b) { return b; }
2 pmbaty 312
template <> inline unsigned char rotateBlendInfo<ROT_90 >(unsigned char b) { return ((b << 2) | (b >> 6)) & 0xff; }
313
template <> inline unsigned char rotateBlendInfo<ROT_180>(unsigned char b) { return ((b << 4) | (b >> 4)) & 0xff; }
314
template <> inline unsigned char rotateBlendInfo<ROT_270>(unsigned char b) { return ((b << 6) | (b >> 2)) & 0xff; }
315
 
316
 
317
/*
318
input kernel area naming convention:
319
-------------
320
| A | B | C |
321
----|---|---|
322
| D | E | F | //input pixel is at position E
323
----|---|---|
324
| G | H | I |
325
-------------
326
*/
3 pmbaty 327
template <class Scaler, RotationDegree rotDeg>
328
FORCE_INLINE void blendPixel(const Kernel_3x3& ker, uint32_t *target, int trgWidth, unsigned char blendInfo, alphagrad_func alphagrad, dist_func dist) //result of preprocessing all four corners of pixel "e"
2 pmbaty 329
{
330
#define a get_a<rotDeg>(ker)
331
#define b get_b<rotDeg>(ker)
332
#define c get_c<rotDeg>(ker)
333
#define d get_d<rotDeg>(ker)
334
#define e get_e<rotDeg>(ker)
335
#define f get_f<rotDeg>(ker)
336
#define g get_g<rotDeg>(ker)
337
#define h get_h<rotDeg>(ker)
338
#define i get_i<rotDeg>(ker)
339
 
340
    const unsigned char blend = rotateBlendInfo<rotDeg>(blendInfo);
341
 
342
    if (getBottomR(blend) >= BLEND_NORMAL)
343
    {
3 pmbaty 344
        bool doLineBlend;
2 pmbaty 345
 
3 pmbaty 346
        if (getBottomR(blend) >= BLEND_DOMINANT)
347
            doLineBlend = true;
348
        else if (getTopR(blend) != BLEND_NONE && (dist (e, g) >= XBRZ_CFG_EQUAL_COLOR_TOLERANCE)) //but support double-blending for 90° corners
349
            doLineBlend = false; // make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
350
        else if (getBottomL(blend) != BLEND_NONE && (dist (e, c) >= XBRZ_CFG_EQUAL_COLOR_TOLERANCE))
351
            doLineBlend = false; // make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
352
        else if ((dist (e, i) >= XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
353
            && (dist (g, h) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
354
            && (dist (h, i) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
355
            && (dist (i, f) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE)
356
            && (dist (f, c) < XBRZ_CFG_EQUAL_COLOR_TOLERANCE))
357
            doLineBlend = false; // no full blending for L-shapes; blend corner only (handles "mario mushroom eyes")
358
                else
359
            doLineBlend = true;
2 pmbaty 360
 
3 pmbaty 361
        const uint32_t px = (dist (e, f) <= dist (e, h) ? f : h); //choose most similar color
2 pmbaty 362
 
363
        OutputMatrix<Scaler::scale, rotDeg> out(target, trgWidth);
364
 
365
        if (doLineBlend)
366
        {
3 pmbaty 367
            const double fg = dist (f, g); //test sample: 70% of values max(fg, hc) / min(fg, hc) are between 1.1 and 3.7 with median being 1.9
368
            const double hc = dist (h, c); //
2 pmbaty 369
 
370
            const bool haveShallowLine = XBRZ_CFG_STEEP_DIRECTION_THRESHOLD * fg <= hc && e != g && d != g;
371
            const bool haveSteepLine   = XBRZ_CFG_STEEP_DIRECTION_THRESHOLD * hc <= fg && e != c && b != c;
372
 
373
            if (haveShallowLine)
374
            {
375
                if (haveSteepLine)
3 pmbaty 376
                    Scaler::blendLineSteepAndShallow(px, out, alphagrad);
2 pmbaty 377
                else
3 pmbaty 378
                    Scaler::blendLineShallow(px, out, alphagrad);
2 pmbaty 379
            }
380
            else
381
            {
382
                if (haveSteepLine)
3 pmbaty 383
                    Scaler::blendLineSteep(px, out, alphagrad);
2 pmbaty 384
                else
3 pmbaty 385
                    Scaler::blendLineDiagonal(px, out, alphagrad);
2 pmbaty 386
            }
387
        }
388
        else
3 pmbaty 389
            Scaler::blendCorner(px, out, alphagrad);
2 pmbaty 390
    }
391
 
392
#undef a
393
#undef b
394
#undef c
395
#undef d
396
#undef e
397
#undef f
398
#undef g
399
#undef h
400
#undef i
401
}
402
 
403
 
3 pmbaty 404
template <class Scaler> //scaler policy: see "Scaler2x" reference implementation
405
void scaleImage(const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight, int yFirst, int yLast, alphagrad_func alphagrad, dist_func dist)
2 pmbaty 406
{
3 pmbaty 407
    yFirst = MAX (yFirst, 0);
408
    yLast  = MIN (yLast, srcHeight);
2 pmbaty 409
    if (yFirst >= yLast || srcWidth <= 0)
410
        return;
411
 
412
    const int trgWidth = srcWidth * Scaler::scale;
413
 
414
    //"use" space at the end of the image as temporary buffer for "on the fly preprocessing": we even could use larger area of
415
    //"sizeof(uint32_t) * srcWidth * (yLast - yFirst)" bytes without risk of accidental overwriting before accessing
416
    const int bufferSize = srcWidth;
417
    unsigned char* preProcBuffer = reinterpret_cast<unsigned char*>(trg + yLast * Scaler::scale * trgWidth) - bufferSize;
3 pmbaty 418
    memset (preProcBuffer, 0, bufferSize);
2 pmbaty 419
    static_assert(BLEND_NONE == 0, "");
420
 
421
    //initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending
422
    //this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition!
423
    if (yFirst > 0)
424
    {
425
        const int y = yFirst - 1;
426
 
3 pmbaty 427
        const uint32_t* s_m1 = src + srcWidth * MAX (y - 1, 0);
2 pmbaty 428
        const uint32_t* s_0  = src + srcWidth * y; //center line
3 pmbaty 429
        const uint32_t* s_p1 = src + srcWidth * MIN (y + 1, srcHeight - 1);
430
        const uint32_t* s_p2 = src + srcWidth * MIN (y + 2, srcHeight - 1);
2 pmbaty 431
 
432
        for (int x = 0; x < srcWidth; ++x)
433
        {
3 pmbaty 434
            const int x_m1 = MAX (x - 1, 0);
435
            const int x_p1 = MIN (x + 1, srcWidth - 1);
436
            const int x_p2 = MIN (x + 2, srcWidth - 1);
2 pmbaty 437
 
438
            Kernel_4x4 ker = {}; //perf: initialization is negligible
439
            ker.a = s_m1[x_m1]; //read sequentially from memory as far as possible
440
            ker.b = s_m1[x];
441
            ker.c = s_m1[x_p1];
442
            ker.d = s_m1[x_p2];
443
 
444
            ker.e = s_0[x_m1];
445
            ker.f = s_0[x];
446
            ker.g = s_0[x_p1];
447
            ker.h = s_0[x_p2];
448
 
449
            ker.i = s_p1[x_m1];
450
            ker.j = s_p1[x];
451
            ker.k = s_p1[x_p1];
452
            ker.l = s_p1[x_p2];
453
 
454
            ker.m = s_p2[x_m1];
455
            ker.n = s_p2[x];
456
            ker.o = s_p2[x_p1];
457
            ker.p = s_p2[x_p2];
458
 
3 pmbaty 459
            const BlendResult res = preProcessCorners (ker, dist);
2 pmbaty 460
            /*
461
            preprocessing blend result:
462
            ---------
463
            | F | G |   //evalute corner between F, G, J, K
464
            ----|---|   //input pixel is at position F
465
            | J | K |
466
            ---------
467
            */
468
            setTopR(preProcBuffer[x], res.blend_j);
469
 
470
            if (x + 1 < bufferSize)
471
                setTopL(preProcBuffer[x + 1], res.blend_k);
472
        }
473
    }
474
    //------------------------------------------------------------------------------------
475
 
476
    for (int y = yFirst; y < yLast; ++y)
477
    {
3 pmbaty 478
        uint32_t *out = trg + Scaler::scale * y * trgWidth; //consider MT "striped" access
2 pmbaty 479
 
3 pmbaty 480
        const uint32_t* s_m1 = src + srcWidth * MAX (y - 1, 0);
2 pmbaty 481
        const uint32_t* s_0  = src + srcWidth * y; //center line
3 pmbaty 482
        const uint32_t* s_p1 = src + srcWidth * MIN (y + 1, srcHeight - 1);
483
        const uint32_t* s_p2 = src + srcWidth * MIN (y + 2, srcHeight - 1);
2 pmbaty 484
 
485
        unsigned char blend_xy1 = 0; //corner blending for current (x, y + 1) position
486
 
487
        for (int x = 0; x < srcWidth; ++x, out += Scaler::scale)
488
        {
489
            //all those bounds checks have only insignificant impact on performance!
3 pmbaty 490
            const int x_m1 = MAX (x - 1, 0); //perf: prefer array indexing to additional pointers!
491
            const int x_p1 = MIN (x + 1, srcWidth - 1);
492
            const int x_p2 = MIN (x + 2, srcWidth - 1);
2 pmbaty 493
 
494
            Kernel_4x4 ker4 = {}; //perf: initialization is negligible
495
 
496
            ker4.a = s_m1[x_m1]; //read sequentially from memory as far as possible
497
            ker4.b = s_m1[x];
498
            ker4.c = s_m1[x_p1];
499
            ker4.d = s_m1[x_p2];
500
 
501
            ker4.e = s_0[x_m1];
502
            ker4.f = s_0[x];
503
            ker4.g = s_0[x_p1];
504
            ker4.h = s_0[x_p2];
505
 
506
            ker4.i = s_p1[x_m1];
507
            ker4.j = s_p1[x];
508
            ker4.k = s_p1[x_p1];
509
            ker4.l = s_p1[x_p2];
510
 
511
            ker4.m = s_p2[x_m1];
512
            ker4.n = s_p2[x];
513
            ker4.o = s_p2[x_p1];
514
            ker4.p = s_p2[x_p2];
515
 
516
            //evaluate the four corners on bottom-right of current pixel
517
            unsigned char blend_xy = 0; //for current (x, y) position
518
            {
3 pmbaty 519
                const BlendResult res = preProcessCorners (ker4, dist);
2 pmbaty 520
                /*
521
                preprocessing blend result:
522
                ---------
523
                | F | G |   //evalute corner between F, G, J, K
524
                ----|---|   //current input pixel is at position F
525
                | J | K |
526
                ---------
527
                */
528
                blend_xy = preProcBuffer[x];
529
                setBottomR(blend_xy, res.blend_f); //all four corners of (x, y) have been determined at this point due to processing sequence!
530
 
531
                setTopR(blend_xy1, res.blend_j); //set 2nd known corner for (x, y + 1)
532
                preProcBuffer[x] = blend_xy1; //store on current buffer position for use on next row
533
 
534
                blend_xy1 = 0;
535
                setTopL(blend_xy1, res.blend_k); //set 1st known corner for (x + 1, y + 1) and buffer for use on next column
536
 
537
                if (x + 1 < bufferSize) //set 3rd known corner for (x + 1, y)
538
                    setBottomL(preProcBuffer[x + 1], res.blend_g);
539
            }
540
 
541
            //fill block of size scale * scale with the given color
3 pmbaty 542
                        {
543
                                uint32_t *blk = out;
544
                            for (int _blk_y = 0; _blk_y < Scaler::scale; ++_blk_y, blk = (uint32_t *) BYTE_ADVANCE (blk, trgWidth * sizeof (uint32_t)))
545
                                for (int _blk_x = 0; _blk_x < Scaler::scale; ++_blk_x)
546
                                    blk[_blk_x] = ker4.f;
547
                        }
2 pmbaty 548
            //place *after* preprocessing step, to not overwrite the results while processing the the last pixel!
549
 
550
            //blend four corners of current pixel
3 pmbaty 551
            if (blend_xy != 0) //good 5% perf-improvement
2 pmbaty 552
            {
553
                Kernel_3x3 ker3 = {}; //perf: initialization is negligible
554
 
555
                ker3.a = ker4.a;
556
                ker3.b = ker4.b;
557
                ker3.c = ker4.c;
558
 
559
                ker3.d = ker4.e;
560
                ker3.e = ker4.f;
561
                ker3.f = ker4.g;
562
 
563
                ker3.g = ker4.i;
564
                ker3.h = ker4.j;
565
                ker3.i = ker4.k;
566
 
3 pmbaty 567
                blendPixel<Scaler, ROT_0  >(ker3, out, trgWidth, blend_xy, alphagrad, dist);
568
                blendPixel<Scaler, ROT_90 >(ker3, out, trgWidth, blend_xy, alphagrad, dist);
569
                blendPixel<Scaler, ROT_180>(ker3, out, trgWidth, blend_xy, alphagrad, dist);
570
                blendPixel<Scaler, ROT_270>(ker3, out, trgWidth, blend_xy, alphagrad, dist);
2 pmbaty 571
            }
572
        }
573
    }
574
}
575
 
576
 
577
//------------------------------------------------------------------------------------
3 pmbaty 578
struct Scaler2x
2 pmbaty 579
{
580
    static const int scale = 2;
581
 
582
 
583
    template <class OutputMatrix>
3 pmbaty 584
    static void blendLineShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 585
    {
3 pmbaty 586
        alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
587
        alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
2 pmbaty 588
    }
589
 
590
    template <class OutputMatrix>
3 pmbaty 591
    static void blendLineSteep(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 592
    {
3 pmbaty 593
        alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
594
        alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
2 pmbaty 595
    }
596
 
597
    template <class OutputMatrix>
3 pmbaty 598
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 599
    {
3 pmbaty 600
        alphagrad (&(out.template ref<1, 0>()), col, 1, 4);
601
        alphagrad (&(out.template ref<0, 1>()), col, 1, 4);
602
        alphagrad (&(out.template ref<1, 1>()), col, 5, 6); //[!] fixes 7/8 used in xBR
2 pmbaty 603
    }
604
 
605
    template <class OutputMatrix>
3 pmbaty 606
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 607
    {
3 pmbaty 608
        alphagrad (&(out.template ref<1, 1>()), col, 1, 2);
2 pmbaty 609
    }
610
 
611
    template <class OutputMatrix>
3 pmbaty 612
    static void blendCorner(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 613
    {
614
        //model a round corner
3 pmbaty 615
        alphagrad (&(out.template ref<1, 1>()), col, 21, 100); //exact: 1 - pi/4 = 0.2146018366
2 pmbaty 616
    }
617
};
618
 
619
 
3 pmbaty 620
struct Scaler3x
2 pmbaty 621
{
622
    static const int scale = 3;
623
 
624
 
625
    template <class OutputMatrix>
3 pmbaty 626
    static void blendLineShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 627
    {
3 pmbaty 628
        alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
629
        alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
630
        alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
2 pmbaty 631
        out.template ref<scale - 1, 2>() = col;
632
    }
633
 
634
    template <class OutputMatrix>
3 pmbaty 635
    static void blendLineSteep(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 636
    {
3 pmbaty 637
        alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
638
        alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
639
        alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
2 pmbaty 640
        out.template ref<2, scale - 1>() = col;
641
    }
642
 
643
    template <class OutputMatrix>
3 pmbaty 644
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 645
    {
3 pmbaty 646
        alphagrad (&(out.template ref<2, 0>()), col, 1, 4);
647
        alphagrad (&(out.template ref<0, 2>()), col, 1, 4);
648
        alphagrad (&(out.template ref<2, 1>()), col, 3, 4);
649
        alphagrad (&(out.template ref<1, 2>()), col, 3, 4);
2 pmbaty 650
        out.template ref<2, 2>() = col;
651
    }
652
 
653
    template <class OutputMatrix>
3 pmbaty 654
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 655
    {
3 pmbaty 656
        alphagrad (&(out.template ref<1, 2>()), col, 1, 8); //conflict with other rotations for this odd scale
657
        alphagrad (&(out.template ref<2, 1>()), col, 1, 8);
658
        alphagrad (&(out.template ref<2, 2>()), col, 7, 8); //
2 pmbaty 659
    }
660
 
661
    template <class OutputMatrix>
3 pmbaty 662
    static void blendCorner(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 663
    {
664
        //model a round corner
3 pmbaty 665
        alphagrad (&(out.template ref<2, 2>()), col, 45, 100); //exact: 0.4545939598
666
        //alphagrad (&(out.template ref<2, 1>()), col, 7, 256); //0.02826017254 -> negligible + avoid conflicts with other rotations for this odd scale
667
        //alphagrad (&(out.template ref<1, 2>()), col, 7, 256); //0.02826017254
2 pmbaty 668
    }
669
};
670
 
671
 
3 pmbaty 672
struct Scaler4x
2 pmbaty 673
{
674
    static const int scale = 4;
675
 
676
 
677
    template <class OutputMatrix>
3 pmbaty 678
    static void blendLineShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 679
    {
3 pmbaty 680
        alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
681
        alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
682
        alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
683
        alphagrad (&(out.template ref<scale - 2, 3>()), col, 3, 4);
2 pmbaty 684
 
685
        out.template ref<scale - 1, 2>() = col;
686
        out.template ref<scale - 1, 3>() = col;
687
    }
688
 
689
    template <class OutputMatrix>
3 pmbaty 690
    static void blendLineSteep(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 691
    {
3 pmbaty 692
        alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
693
        alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
694
        alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
695
        alphagrad (&(out.template ref<3, scale - 2>()), col, 3, 4);
2 pmbaty 696
 
697
        out.template ref<2, scale - 1>() = col;
698
        out.template ref<3, scale - 1>() = col;
699
    }
700
 
701
    template <class OutputMatrix>
3 pmbaty 702
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 703
    {
3 pmbaty 704
        alphagrad (&(out.template ref<3, 1>()), col, 3, 4);
705
        alphagrad (&(out.template ref<1, 3>()), col, 3, 4);
706
        alphagrad (&(out.template ref<3, 0>()), col, 1, 4);
707
        alphagrad (&(out.template ref<0, 3>()), col, 1, 4);
708
        alphagrad (&(out.template ref<2, 2>()), col, 1, 3); //[!] fixes 1/4 used in xBR
2 pmbaty 709
 
710
        out.template ref<3, 3>() = col;
711
        out.template ref<3, 2>() = col;
712
        out.template ref<2, 3>() = col;
713
    }
714
 
715
    template <class OutputMatrix>
3 pmbaty 716
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 717
    {
3 pmbaty 718
        alphagrad (&(out.template ref<scale - 1, scale / 2    >()), col, 1, 2);
719
        alphagrad (&(out.template ref<scale - 2, scale / 2 + 1>()), col, 1, 2);
720
 
2 pmbaty 721
        out.template ref<scale - 1, scale - 1>() = col;
722
    }
723
 
724
    template <class OutputMatrix>
3 pmbaty 725
    static void blendCorner(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 726
    {
727
        //model a round corner
3 pmbaty 728
        alphagrad (&(out.template ref<3, 3>()), col, 68, 100); //exact: 0.6848532563
729
        alphagrad (&(out.template ref<3, 2>()), col,  9, 100); //0.08677704501
730
        alphagrad (&(out.template ref<2, 3>()), col,  9, 100); //0.08677704501
2 pmbaty 731
    }
732
};
733
 
734
 
3 pmbaty 735
struct Scaler5x
2 pmbaty 736
{
737
    static const int scale = 5;
738
 
739
 
740
    template <class OutputMatrix>
3 pmbaty 741
    static void blendLineShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 742
    {
3 pmbaty 743
        alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
744
        alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
745
        alphagrad (&(out.template ref<scale - 3, 4>()), col, 1, 4);
746
        alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
747
        alphagrad (&(out.template ref<scale - 2, 3>()), col, 3, 4);
2 pmbaty 748
 
749
        out.template ref<scale - 1, 2>() = col;
750
        out.template ref<scale - 1, 3>() = col;
751
        out.template ref<scale - 1, 4>() = col;
752
        out.template ref<scale - 2, 4>() = col;
753
    }
754
 
755
    template <class OutputMatrix>
3 pmbaty 756
    static void blendLineSteep(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 757
    {
3 pmbaty 758
        alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
759
        alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
760
        alphagrad (&(out.template ref<4, scale - 3>()), col, 1, 4);
761
        alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
762
        alphagrad (&(out.template ref<3, scale - 2>()), col, 3, 4);
2 pmbaty 763
 
764
        out.template ref<2, scale - 1>() = col;
765
        out.template ref<3, scale - 1>() = col;
766
        out.template ref<4, scale - 1>() = col;
767
        out.template ref<4, scale - 2>() = col;
768
    }
769
 
770
    template <class OutputMatrix>
3 pmbaty 771
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 772
    {
3 pmbaty 773
        alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
774
        alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
775
        alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
776
        alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
777
        alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
778
        alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
779
        alphagrad (&(out.template ref<3, 3>()), col, 2, 3);
2 pmbaty 780
 
781
        out.template ref<2, scale - 1>() = col;
782
        out.template ref<3, scale - 1>() = col;
783
        out.template ref<4, scale - 1>() = col;
784
        out.template ref<scale - 1, 2>() = col;
785
        out.template ref<scale - 1, 3>() = col;
786
    }
787
 
788
    template <class OutputMatrix>
3 pmbaty 789
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 790
    {
3 pmbaty 791
        alphagrad (&(out.template ref<scale - 1, scale / 2    >()), col, 1, 8); //conflict with other rotations for this odd scale
792
        alphagrad (&(out.template ref<scale - 2, scale / 2 + 1>()), col, 1, 8);
793
        alphagrad (&(out.template ref<scale - 3, scale / 2 + 2>()), col, 1, 8); //
794
        alphagrad (&(out.template ref<4, 3>()), col, 7, 8);
795
        alphagrad (&(out.template ref<3, 4>()), col, 7, 8);
2 pmbaty 796
 
797
        out.template ref<4, 4>() = col;
798
    }
799
 
800
    template <class OutputMatrix>
3 pmbaty 801
    static void blendCorner(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 802
    {
803
        // model a round corner
3 pmbaty 804
        alphagrad (&(out.template ref<4, 4>()), col, 86, 100); //exact: 0.8631434088
805
        alphagrad (&(out.template ref<4, 3>()), col, 23, 100); //0.2306749731
806
        alphagrad (&(out.template ref<3, 4>()), col, 23, 100); //0.2306749731
2 pmbaty 807
        //alphaGrad<1, 64>(out.template ref<4, 2>(), col); //0.01676812367 -> negligible + avoid conflicts with other rotations for this odd scale
808
        //alphaGrad<1, 64>(out.template ref<2, 4>(), col); //0.01676812367
809
    }
810
};
811
 
812
 
3 pmbaty 813
struct Scaler6x
2 pmbaty 814
{
815
    static const int scale = 6;
816
 
817
 
818
    template <class OutputMatrix>
3 pmbaty 819
    static void blendLineShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 820
    {
3 pmbaty 821
        alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
822
        alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
823
        alphagrad (&(out.template ref<scale - 3, 4>()), col, 1, 4);
824
        alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
825
        alphagrad (&(out.template ref<scale - 2, 3>()), col, 3, 4);
826
        alphagrad (&(out.template ref<scale - 3, 5>()), col, 3, 4);
2 pmbaty 827
 
828
        out.template ref<scale - 1, 2>() = col;
829
        out.template ref<scale - 1, 3>() = col;
830
        out.template ref<scale - 1, 4>() = col;
831
        out.template ref<scale - 1, 5>() = col;
832
        out.template ref<scale - 2, 4>() = col;
833
        out.template ref<scale - 2, 5>() = col;
834
    }
835
 
836
    template <class OutputMatrix>
3 pmbaty 837
    static void blendLineSteep(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 838
    {
3 pmbaty 839
        alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
840
        alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
841
        alphagrad (&(out.template ref<4, scale - 3>()), col, 1, 4);
842
        alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
843
        alphagrad (&(out.template ref<3, scale - 2>()), col, 3, 4);
844
        alphagrad (&(out.template ref<5, scale - 3>()), col, 3, 4);
2 pmbaty 845
 
846
        out.template ref<2, scale - 1>() = col;
847
        out.template ref<3, scale - 1>() = col;
848
        out.template ref<4, scale - 1>() = col;
849
        out.template ref<5, scale - 1>() = col;
850
        out.template ref<4, scale - 2>() = col;
851
        out.template ref<5, scale - 2>() = col;
852
    }
853
 
854
    template <class OutputMatrix>
3 pmbaty 855
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 856
    {
3 pmbaty 857
        alphagrad (&(out.template ref<0, scale - 1>()), col, 1, 4);
858
        alphagrad (&(out.template ref<2, scale - 2>()), col, 1, 4);
859
        alphagrad (&(out.template ref<1, scale - 1>()), col, 3, 4);
860
        alphagrad (&(out.template ref<3, scale - 2>()), col, 3, 4);
861
        alphagrad (&(out.template ref<scale - 1, 0>()), col, 1, 4);
862
        alphagrad (&(out.template ref<scale - 2, 2>()), col, 1, 4);
863
        alphagrad (&(out.template ref<scale - 1, 1>()), col, 3, 4);
864
        alphagrad (&(out.template ref<scale - 2, 3>()), col, 3, 4);
2 pmbaty 865
 
866
        out.template ref<2, scale - 1>() = col;
867
        out.template ref<3, scale - 1>() = col;
868
        out.template ref<4, scale - 1>() = col;
869
        out.template ref<5, scale - 1>() = col;
870
        out.template ref<4, scale - 2>() = col;
871
        out.template ref<5, scale - 2>() = col;
872
        out.template ref<scale - 1, 2>() = col;
873
        out.template ref<scale - 1, 3>() = col;
874
    }
875
 
876
    template <class OutputMatrix>
3 pmbaty 877
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 878
    {
3 pmbaty 879
        alphagrad (&(out.template ref<scale - 1, scale / 2    >()), col, 1, 2);
880
        alphagrad (&(out.template ref<scale - 2, scale / 2 + 1>()), col, 1, 2);
881
        alphagrad (&(out.template ref<scale - 3, scale / 2 + 2>()), col, 1, 2);
2 pmbaty 882
 
883
        out.template ref<scale - 2, scale - 1>() = col;
884
        out.template ref<scale - 1, scale - 1>() = col;
885
        out.template ref<scale - 1, scale - 2>() = col;
886
    }
887
 
888
    template <class OutputMatrix>
3 pmbaty 889
    static void blendCorner(uint32_t col, OutputMatrix& out, alphagrad_func alphagrad)
2 pmbaty 890
    {
891
        //model a round corner
3 pmbaty 892
        alphagrad (&(out.template ref<5, 5>()), col, 97, 100); //exact: 0.9711013910
893
        alphagrad (&(out.template ref<4, 5>()), col, 42, 100); //0.4236372243
894
        alphagrad (&(out.template ref<5, 4>()), col, 42, 100); //0.4236372243
895
        alphagrad (&(out.template ref<5, 3>()), col,  6, 100); //0.05652034508
896
        alphagrad (&(out.template ref<3, 5>()), col,  6, 100); //0.05652034508
2 pmbaty 897
    }
898
};
899
 
900
        //------------------------------------------------------------------------------------
3 pmbaty 901
}
2 pmbaty 902
 
903
 
904
 
3 pmbaty 905
static double dist24 (uint32_t pix1, uint32_t pix2)
906
{
907
    //30% perf boost compared to plain distYCbCr()!
908
    //consumes 64 MB memory; using double is only 2% faster, but takes 128 MB
909
    static float diffToDist[256 * 256 * 256];
910
    static bool is_initialized = false;
911
    if (!is_initialized)
912
    {
913
        for (uint32_t i = 0; i < 256 * 256 * 256; ++i) //startup time: 114 ms on Intel Core i5 (four cores)
914
        {
915
            const int r_diff = GET_RED (i) * 2 - 0xFF;
916
            const int g_diff = GET_GREEN (i) * 2 - 0xFF;
917
            const int b_diff = GET_BLUE (i) * 2 - 0xFF;
2 pmbaty 918
 
3 pmbaty 919
            const double k_b = 0.0593; //ITU-R BT.2020 conversion
920
            const double k_r = 0.2627; //
921
            const double k_g = 1 - k_b - k_r;
2 pmbaty 922
 
3 pmbaty 923
            const double scale_b = 0.5 / (1 - k_b);
924
            const double scale_r = 0.5 / (1 - k_r);
2 pmbaty 925
 
3 pmbaty 926
            const double y   = k_r * r_diff + k_g * g_diff + k_b * b_diff; //[!], analog YCbCr!
927
            const double c_b = scale_b * (b_diff - y);
928
            const double c_r = scale_r * (r_diff - y);
929
 
930
            diffToDist[i] = (float) (sqrt ((y * y) + (c_b * c_b) + (c_r * c_r)));
931
        }
932
        is_initialized = true;
933
    }
934
 
935
    const int r_diff = (int) GET_RED   (pix1) - (int) GET_RED   (pix2);
936
    const int g_diff = (int) GET_GREEN (pix1) - (int) GET_GREEN (pix2);
937
    const int b_diff = (int) GET_BLUE  (pix1) - (int) GET_BLUE  (pix2);
938
 
939
    return diffToDist[(((r_diff + 0xFF) / 2) << 16) | //slightly reduce precision (division by 2) to squeeze value into single byte
940
                      (((g_diff + 0xFF) / 2) <<  8) |
941
                      (((b_diff + 0xFF) / 2) <<  0)];
2 pmbaty 942
}
943
 
944
 
3 pmbaty 945
static double dist32 (uint32_t pix1, uint32_t pix2)
946
{
947
    const double a1 = GET_ALPHA (pix1) / 255.0 ;
948
    const double a2 = GET_ALPHA (pix2) / 255.0 ;
949
    /*
950
    Requirements for a color distance handling alpha channel: with a1, a2 in [0, 1]
2 pmbaty 951
 
3 pmbaty 952
        1. if a1 = a2, distance should be: a1 * distYCbCr()
953
        2. if a1 = 0,  distance should be: a2 * distYCbCr(black, white) = a2 * 255
954
        3. if a1 = 1,  ??? maybe: 255 * (1 - a2) + a2 * distYCbCr()
955
    */
956
 
957
    //return MIN (a1, a2) * distYCbCrBuffered(pix1, pix2) + 255 * abs(a1 - a2);
958
    //=> following code is 15% faster:
959
    const double d = dist24 (pix1, pix2);
960
    return (a1 < a2 ? a1 * d + 255 * (a2 - a1) : a2 * d + 255 * (a1 - a2));
961
}
962
 
963
 
964
static void alphagrad24 (uint32_t *pixBack, uint32_t pixFront, unsigned int M, unsigned int N)
2 pmbaty 965
{
3 pmbaty 966
        // blend front color with opacity M / N over opaque background: http://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending
967
        *pixBack = (  (CALC_COLOR24 (GET_RED   (pixFront), GET_RED   (*pixBack), M, N) << 16)
968
                                | (CALC_COLOR24 (GET_GREEN (pixFront), GET_GREEN (*pixBack), M, N) <<  8)
969
                                | (CALC_COLOR24 (GET_BLUE  (pixFront), GET_BLUE  (*pixBack), M, N) <<  0));
2 pmbaty 970
}
971
 
972
 
3 pmbaty 973
static void alphagrad32 (uint32_t *pixBack, uint32_t pixFront, unsigned int M, unsigned int N)
974
{
975
        // find intermediate color between two colors with alpha channels (=> NO alpha blending!!!)
976
    const unsigned int weightFront = GET_ALPHA (pixFront) * M;
977
    const unsigned int weightBack  = GET_ALPHA (*pixBack) * (N - M);
978
    const unsigned int weightSum   = weightFront + weightBack;
979
    *pixBack = (weightSum == 0 ? 0 :
980
                                (((unsigned char) (weightSum / N))                                                               << 24)
981
                                | (CALC_COLOR32 (GET_RED   (pixFront), GET_RED   (*pixBack), weightFront, weightBack, weightSum) << 16)
982
                                | (CALC_COLOR32 (GET_GREEN (pixFront), GET_GREEN (*pixBack), weightFront, weightBack, weightSum) <<  8)
983
                                | (CALC_COLOR32 (GET_BLUE  (pixFront), GET_BLUE  (*pixBack), weightFront, weightBack, weightSum) <<  0));
984
}
985
 
986
 
987
EXTERN_C void nearestNeighborScale(const uint32_t *src, int srcWidth, int srcHeight, uint32_t *trg, int trgWidth, int trgHeight)
988
{
989
//    nearestNeighborScale (src, srcWidth, srcHeight, srcWidth * sizeof (uint32_t), trg, trgWidth, trgHeight, trgWidth * sizeof (uint32_t), XBRZ_SLICETYPE_TARGET, 0, trgHeight, [](uint32_t pix) { return pix; });
990
    //static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*");
991
    //static_assert(std::is_integral<PixTrg>::value, "PixTrg* is expected to be cast-able to char*");
992
    //static_assert(std::is_same<decltype(pixCvrt(PixSrc())), PixTrg>::value, "PixConverter returning wrong pixel format");
993
 
994
    int srcPitch = srcWidth * sizeof (uint32_t);
995
    int trgPitch = trgWidth * sizeof (uint32_t);
996
    int yFirst;
997
    int yLast;
998
 
999
#if 0 // going over source image - fast for upscaling, since source is read only once
1000
    yFirst = 0;
1001
    yLast  = MIN (trgHeight, srcHeight);
1002
 
1003
    if (yFirst >= yLast || trgWidth <= 0 || trgHeight <= 0)
1004
        return; // consistency check
1005
 
1006
    for (int y = yFirst; y < yLast; ++y)
1007
    {
1008
        //mathematically: ySrc = floor(srcHeight * yTrg / trgHeight)
1009
        // => search for integers in: [ySrc, ySrc + 1) * trgHeight / srcHeight
1010
 
1011
        //keep within for loop to support MT input slices!
1012
        const int yTrg_first = ( y      * trgHeight + srcHeight - 1) / srcHeight; //=ceil(y * trgHeight / srcHeight)
1013
        const int yTrg_last  = ((y + 1) * trgHeight + srcHeight - 1) / srcHeight; //=ceil(((y + 1) * trgHeight) / srcHeight)
1014
        const int blockHeight = yTrg_last - yTrg_first;
1015
 
1016
        if (blockHeight > 0)
1017
        {
1018
            const uint32_t *srcLine = (const uint32_t *) BYTE_ADVANCE (src, y * srcPitch);
1019
            /**/  uint32_t *trgLine = (      uint32_t *) BYTE_ADVANCE (trg, yTrg_first * trgPitch);
1020
            int xTrg_first = 0;
1021
 
1022
            for (int x = 0; x < srcWidth; ++x)
1023
            {
1024
                const int xTrg_last = ((x + 1) * trgWidth + srcWidth - 1) / srcWidth;
1025
                const int blockWidth = xTrg_last - xTrg_first;
1026
                if (blockWidth > 0)
1027
                {
1028
                    const uint32_t trgColor = srcLine[x];
1029
                                        uint32_t *blkLine = trgLine;
1030
 
1031
                    xTrg_first = xTrg_last;
1032
 
1033
                                    for (int blk_y = 0; blk_y < blockHeight; ++blk_y, blkLine = (uint32_t *) BYTE_ADVANCE (blkLine, trgPitch))
1034
                                        for (int blk_x = 0; blk_x < blockWidth; ++blk_x)
1035
                                            blkLine[blk_x] = trgColor;
1036
 
1037
                    trgLine += blockWidth;
1038
                }
1039
            }
1040
        }
1041
    }
1042
#else // going over target image - slow for upscaling, since source is read multiple times missing out on cache! Fast for similar image sizes!
1043
    yFirst = 0;
1044
    yLast  = trgHeight;
1045
 
1046
    if (yFirst >= yLast || srcHeight <= 0 || srcWidth <= 0)
1047
        return; // consistency check
1048
 
1049
    for (int y = yFirst; y < yLast; ++y)
1050
    {
1051
        /**/  uint32_t *trgLine = (      uint32_t *) BYTE_ADVANCE (trg, y * trgPitch);
1052
        const int ySrc = srcHeight * y / trgHeight;
1053
        const uint32_t *srcLine = (const uint32_t *) BYTE_ADVANCE (src, ySrc * srcPitch);
1054
        for (int x = 0; x < trgWidth; ++x)
1055
        {
1056
            const int xSrc = srcWidth * x / trgWidth;
1057
            trgLine[x] = srcLine[xSrc];
1058
        }
1059
    }
1060
#endif // going over source or target
1061
 
1062
        return;
1063
}
1064
 
1065
 
2 pmbaty 1066
// Tells whether two pixels count as the same color when their alpha channel is ignored:
// true when dist24 (col1, col2) is strictly below equalColorTolerance.
// luminanceWeight is accepted but not used by this implementation.
EXTERN_C bool xbrz_equalcolortest24 (uint32_t col1, uint32_t col2, double luminanceWeight, double equalColorTolerance)
{
    const double colorDistance = dist24 (col1, col2);
    return colorDistance < equalColorTolerance;
}
1070
 
1071
 
1072
// Tells whether two pixels count as the same color when their alpha channel is considered:
// true when dist32 (col1, col2) is strictly below equalColorTolerance.
// luminanceWeight is accepted but not used by this implementation.
EXTERN_C bool xbrz_equalcolortest32 (uint32_t col1, uint32_t col2, double luminanceWeight, double equalColorTolerance)
{
    const double colorDistance = dist32 (col1, col2);
    return colorDistance < equalColorTolerance;
}
1076
 
1077
 
1078
// Scales a whole image without alpha channel by an integer factor.
//   factor              - scaling factor; 2 to 6 are handled, any other value is a no-op
//   src                 - source pixels (srcWidth * srcHeight)
//   trg                 - target pixel buffer, filled by scaleImage
//   srcWidth, srcHeight - source dimensions; all rows [0, srcHeight) are processed
// Uses alphagrad24 for blending and dist24 as color distance.
EXTERN_C void xbrz_scale24 (size_t factor, const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight)
{
    switch (factor)
    {
        case 2: scaleImage<Scaler2x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24); break;
        case 3: scaleImage<Scaler3x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24); break;
        case 4: scaleImage<Scaler4x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24); break;
        case 5: scaleImage<Scaler5x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24); break;
        case 6: scaleImage<Scaler6x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad24, dist24); break;
        default: break; // unsupported factor: leave trg untouched
    }
}
1086
 
1087
 
1088
// Scales a whole image with alpha channel by an integer factor.
//   factor              - scaling factor; 2 to 6 are handled, any other value is a no-op
//   src                 - source pixels (srcWidth * srcHeight)
//   trg                 - target pixel buffer, filled by scaleImage
//   srcWidth, srcHeight - source dimensions; all rows [0, srcHeight) are processed
// Uses alphagrad32 for blending and dist32 as color distance.
EXTERN_C void xbrz_scale32 (size_t factor, const uint32_t *src, uint32_t *trg, int srcWidth, int srcHeight)
{
    switch (factor)
    {
        case 2: scaleImage<Scaler2x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32); break;
        case 3: scaleImage<Scaler3x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32); break;
        case 4: scaleImage<Scaler4x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32); break;
        case 5: scaleImage<Scaler5x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32); break;
        case 6: scaleImage<Scaler6x> (src, trg, srcWidth, srcHeight, 0, srcHeight, alphagrad32, dist32); break;
        default: break; // unsupported factor: leave trg untouched
    }
}