// yuvbench/yuvbench_claude.c

#include "yuvbench.h"

#ifdef YUVBENCH_CLAUDE

#include <arm_neon.h>
#include <dispatch/dispatch.h>

// BT.709 limited range, scale >>6 (factor=64):
//   R = clip(( 75*(Y-16) + 119*(Cr-128) + 32) >> 6)
//   G = clip(( 75*(Y-16) -  12*(Cb-128) -  30*(Cr-128) + 32) >> 6)
//   B = clip(( 75*(Y-16) + 129*(Cb-128) + 32) >> 6)
//
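// For nominal limited-range input (Y in [16, 235], Cb/Cr in [16, 240]) every
// int16 sum stays inside the int16 range (the largest is B = 16457 + 14448 =
// 30905). Samples outside the nominal range can push the R and B sums past
// INT16_MAX (Y=255, Cr=255: 17957 + 15113 = 33070), so R and B must be
// accumulated with saturating adds to avoid wrapping.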
// vqshrun_n_s16 gives saturating unsigned narrow + right shift in one instruction.

// Coefficients:
//   Y:  75  (1.164 * 64 = 74.5 → 75, err +0.67%)
//   Cr_R: 119 (1.856 * 64 = 118.8 → 119, err +0.17%)
//   Cb_B: 129 (2.016 * 64; libyuv uses 2.016 for BT.709 studio swing)
//   Cb_G:  12 (0.187 * 64 = 12.0)
//   Cr_G:  30 (0.468 * 64 = 30.0)

// Convert 8 horizontally adjacent luma samples to 8 packed RGB pixels.
//
//   y    points at 8 readable luma bytes.
//   vcb  Cb - 128 per lane, already widened to int16.
//   vcr  Cr - 128 per lane, already widened to int16.
//   out  receives 24 bytes in interleaved R,G,B order.
//
// Every per-channel product fits in int16 (|129 * 128| = 16512), but the sums
// base + 119*Cr and base + 129*Cb can exceed INT16_MAX once samples leave the
// nominal limited range (Y=255, Cr=255 gives 17957 + 15113 = 33070). R and B
// are therefore accumulated with a saturating add, so such pixels clamp to
// 255 instead of wrapping negative and being clamped to 0. The G sum is
// bounded by [-6502, 23333] for any 8-bit input and needs no saturation.
static inline void convert_8px(
    const uint8_t* __restrict__ y,
    int16x8_t vcb,
    int16x8_t vcr,
    uint8_t* __restrict__ out)
{
    // Y - 16, widened to signed 16-bit lanes.
    const int16x8_t vy = vsubq_s16(
        vreinterpretq_s16_u16(vmovl_u8(vld1_u8(y))),
        vdupq_n_s16(16));

    // 75*(Y-16) + 32; the +32 rounds the final >>6. Range: [-1168, 17957].
    const int16x8_t base = vaddq_s16(vmulq_n_s16(vy, 75), vdupq_n_s16(32));

    // R = base + 119*Cr, saturating at INT16_MAX.
    const int16x8_t r = vqaddq_s16(base, vmulq_n_s16(vcr, 119));
    // G = base - 12*Cb - 30*Cr (cannot overflow, see header comment).
    const int16x8_t g = vmlsq_n_s16(vmlsq_n_s16(base, vcb, 12), vcr, 30);
    // B = base + 129*Cb, saturating at INT16_MAX.
    const int16x8_t b = vqaddq_s16(base, vmulq_n_s16(vcb, 129));

    // Saturating narrow + shift: >>6, clamp to [0,255], pack to uint8.
    // A saturated 32767 still yields 511 after the shift, i.e. 255.
    uint8x8x3_t rgb;
    rgb.val[0] = vqshrun_n_s16(r, 6);
    rgb.val[1] = vqshrun_n_s16(g, 6);
    rgb.val[2] = vqshrun_n_s16(b, 6);
    vst3_u8(out, rgb);  // stores R0G0B0 R1G1B1 ... R7G7B7
}

// Convert the luma rows [row_start, row_end) of a planar 4:2:0 frame to
// packed RGB, two rows at a time.
//
//   Y          luma plane, w bytes per row.
//   Cb, Cr     chroma planes, w/2 bytes per row, one row per two luma rows.
//   RGB        output, 3*w bytes per row in R,G,B order.
//   w          frame width in pixels.
//   row_start,
//   row_end    luma row range; both must be even.
//
// Rows are handled in pairs because each chroma sample is shared by a 2x2
// block of luma samples. Plane offsets are computed in size_t so that
// row * w * 3 cannot wrap at 32 bits on very large frames.
static void convert_rows(
    const uint8_t* __restrict__ Y,
    const uint8_t* __restrict__ Cb,
    const uint8_t* __restrict__ Cr,
    uint8_t* __restrict__ RGB,
    uint32_t w,
    uint32_t row_start,
    uint32_t row_end)
{
    const size_t luma_stride   = w;
    const size_t chroma_stride = luma_stride / 2;
    const size_t rgb_stride    = luma_stride * 3;
    const int16x8_t chroma_bias = vdupq_n_s16(128);

    for (uint32_t row = row_start; row < row_end; row += 2) {
        const uint8_t* y0   = Y   + (size_t)row * luma_stride;
        const uint8_t* y1   = y0  + luma_stride;
        const uint8_t* cb   = Cb  + (size_t)(row / 2) * chroma_stride;
        const uint8_t* cr   = Cr  + (size_t)(row / 2) * chroma_stride;
        uint8_t*       rgb0 = RGB + (size_t)row * rgb_stride;
        uint8_t*       rgb1 = rgb0 + rgb_stride;

        size_t col = 0;

        // 16 pixels per iteration: 8 chroma samples shared across 2 rows.
        for (; col + 16 <= luma_stride; col += 16) {
            // Load 8 Cb/Cr bytes and upsample to 16 via interleave-with-self:
            //   [c0,c1,...,c7] -> [c0,c0,c1,c1,...,c7,c7]
            const uint8x8_t cb8 = vld1_u8(cb + col / 2);
            const uint8x8_t cr8 = vld1_u8(cr + col / 2);
            const uint8x8x2_t cb_up = vzip_u8(cb8, cb8);
            const uint8x8x2_t cr_up = vzip_u8(cr8, cr8);

            // Widen to int16 and remove the 128 bias, for the low and high
            // groups of 8 pixels.
            const int16x8_t vcb_lo = vsubq_s16(
                vreinterpretq_s16_u16(vmovl_u8(cb_up.val[0])), chroma_bias);
            const int16x8_t vcb_hi = vsubq_s16(
                vreinterpretq_s16_u16(vmovl_u8(cb_up.val[1])), chroma_bias);
            const int16x8_t vcr_lo = vsubq_s16(
                vreinterpretq_s16_u16(vmovl_u8(cr_up.val[0])), chroma_bias);
            const int16x8_t vcr_hi = vsubq_s16(
                vreinterpretq_s16_u16(vmovl_u8(cr_up.val[1])), chroma_bias);

            // Row 0
            convert_8px(y0 + col,     vcb_lo, vcr_lo, rgb0 + col * 3);
            convert_8px(y0 + col + 8, vcb_hi, vcr_hi, rgb0 + (col + 8) * 3);
            // Row 1: same chroma, different luma
            convert_8px(y1 + col,     vcb_lo, vcr_lo, rgb1 + col * 3);
            convert_8px(y1 + col + 8, vcb_hi, vcr_hi, rgb1 + (col + 8) * 3);
        }

        // Scalar tail for the remaining columns when w is not a multiple of
        // 16. Same coefficients as the vector path, in 32-bit arithmetic.
        for (; col < luma_stride; col += 2) {
            const int32_t cb2 = (int32_t)cb[col / 2] - 128;
            const int32_t cr2 = (int32_t)cr[col / 2] - 128;
            for (uint32_t dy = 0; dy < 2; ++dy) {
                const uint8_t* yr  = (dy == 0) ? y0   : y1;
                uint8_t*       dst = (dy == 0) ? rgb0 : rgb1;
                // One chroma sample covers two columns; the second bound
                // guards the last column.
                for (size_t px = col; px < col + 2 && px < luma_stride; ++px) {
                    const int32_t yv   = (int32_t)yr[px] - 16;
                    const int32_t base = 75 * yv + 32;
                    int32_t r  = (base + 119 * cr2) >> 6;
                    int32_t g  = (base -  12 * cb2 - 30 * cr2) >> 6;
                    int32_t bv = (base + 129 * cb2) >> 6;
                    r  = r  < 0 ? 0 : r  > 255 ? 255 : r;
                    g  = g  < 0 ? 0 : g  > 255 ? 255 : g;
                    bv = bv < 0 ? 0 : bv > 255 ? 255 : bv;
                    dst[px * 3 + 0] = (uint8_t)r;
                    dst[px * 3 + 1] = (uint8_t)g;
                    dst[px * 3 + 2] = (uint8_t)bv;
                }
            }
        }
    }
}

// Report whether this backend can handle the frame described by ctx.
// Returns true only when both ctx->inp_w and ctx->inp_h are even.
static bool yuvbench_claude_init(Ctx* ctx)
{
    if (ctx->inp_w % 2 != 0) {
        return false;
    }
    if (ctx->inp_h % 2 != 0) {
        return false;
    }
    return true;
}

// Convert the frame in ctx->inp_buf to packed RGB in ctx->out_buf.
//
// The input buffer is read as three consecutive planes: inp_w * inp_h luma
// bytes, then (inp_w/2) * (inp_h/2) Cb bytes, then the Cr plane. The row
// pairs are cut into 8 contiguous slices, each handed to convert_rows() from
// a dispatch_apply block. Always returns true.
static bool yuvbench_claude_convert(Ctx* ctx)
{
    enum { SLICE_COUNT = 8 };

    const uint32_t width  = ctx->inp_w;
    const uint32_t height = ctx->inp_h;

    const size_t luma_bytes   = (size_t)width * height;
    const size_t chroma_bytes = (size_t)(width / 2) * (height / 2);

    const uint8_t* luma_plane = (const uint8_t*)ctx->inp_buf;
    const uint8_t* cb_plane   = luma_plane + luma_bytes;
    const uint8_t* cr_plane   = cb_plane + chroma_bytes;
    uint8_t*       rgb_out    = (uint8_t*)ctx->out_buf;

    const uint32_t row_pairs = height / 2;

    // dispatch_apply is synchronous: it returns only after every slice block
    // has finished, so the output is complete when this function returns.
    dispatch_apply(
        SLICE_COUNT,
        dispatch_get_global_queue(QOS_CLASS_USER_INTERACTIVE, 0),
        ^(size_t slice) {
            // Slice boundaries in row pairs; doubled to get even luma rows.
            const size_t first_pair = slice * row_pairs / SLICE_COUNT;
            const size_t last_pair  = (slice + 1) * row_pairs / SLICE_COUNT;
            convert_rows(luma_plane, cb_plane, cr_plane, rgb_out, width,
                         (uint32_t)(first_pair * 2),
                         (uint32_t)(last_pair * 2));
        });

    return true;
}

Backend yuvbench_claude(void)
{
    Backend b = { 0 };
    b.init_fn    = yuvbench_claude_init;
    b.convert_fn = yuvbench_claude_convert;
    return b;
}

#endif // YUVBENCH_CLAUDE