4 files changed, 274 insertions, 15 deletions
diff --git a/yuvbench/CLAUDE.md b/yuvbench/CLAUDE.md
new file mode 100644
index 0000000..ee6ee4f
--- /dev/null
+++ b/yuvbench/CLAUDE.md
@@ -0,0 +1,78 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## What This Is
+
+yuvbench benchmarks YUV 4:2:0 → RGB24 color space conversion across multiple implementations. It loads a raw YUV file, runs each enabled backend through 100 warmup + 2500 timed iterations, and reports per-iteration timing statistics in milliseconds: min, p50, p95, p99, max, average, and standard deviation (σ).
+
+## Build Commands
+
+```bash
+# macOS (Apple Silicon)
+./build-macos-aarch64-clang.sh
+
+# Linux (x86_64)
+./build-linux-x86_64-gcc.sh
+```
+
+Both scripts create `build/` and produce `build/yuvbench`.
+
+## Running
+
+```bash
+./build/yuvbench images/jellybeans-256x256.yuv
+./build/yuvbench images/capitol-2950x1528.yuv
+./build/yuvbench images/capitol-2950x1528.yuv show   # pipe last frame to ffplay
+```
+
+Input filename must encode dimensions as `name-WIDTHxHEIGHT.yuv`.
+
+## Prepare Test Images
+
+```bash
+# Convert source images in images/src/ to raw YUV 4:2:0
+./images/convert.sh
+```
+
+## Architecture
+
+### Backend Plugin System
+
+Each backend is an optional compilation unit implementing the interface in `yuvbench.h`:
+
+```c
+typedef struct {
+    bool (*init_fn)(Ctx *ctx);
+    bool (*convert_fn)(Ctx *ctx);
+    void (*deinit_fn)(Ctx *ctx);
+} Backend;
+```
+
+`yuvbench.c:run_backend()` drives warmup + timing loops. Backends are compiled in via `-DYUVBENCH_<NAME>` preprocessor flags set in each build script.
+
+### Backends
+
+| Define | File | Platform | Notes |
+|--------|------|----------|-------|
+| `YUVBENCH_BAD` | `yuvbench_bad.c` | All | Naive BT.709 nested loop; reference baseline |
+| `YUVBENCH_ACCELERATE` | `yuvbench_accelerate.c` | macOS | vImage YUV→ARGB→RGB; caches conversion object |
+| `YUVBENCH_SWSCALE` | `yuvbench_swscale.c` | All | FFmpeg libswscale; SwsContext created in init |
+| `YUVBENCH_LIBYUV` | `yuvbench_libyuv.c` | Linux | Google libyuv `I420ToRAW()`; no init/deinit |
+
+### Timing (`kbench.h`)
+
+- macOS/ARM64: reads `CNTVCT_EL0` / `CNTFRQ_EL0` hardware registers directly
+- Linux: `clock_gettime(CLOCK_MONOTONIC)`
+
+### Adding a New Backend
+
+1. Create `yuvbench_<name>.c` implementing `init`, `convert`, `deinit` functions
+2. Guard the file body with `#ifdef YUVBENCH_<NAME>`
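+3. Register it in `yuvbench.c`: forward-declare `Backend yuvbench_<name>(void);` and add a `run_backend(yuvbench_<name>());` call in `main()`, each wrapped in `#ifdef YUVBENCH_<NAME>`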
+4. Add `-DYUVBENCH_<NAME>` and any link flags to the relevant build scripts
+
+## Platform Notes
+
+- The `vk-asylum` branch contains a Vulkan compute shader backend (`build-shaders.sh`, `shaders.h`, `main.c`)
+- Assembly output for the Accelerate backend is emitted to `build/yuvbench_accelerate.S` on macOS builds
diff --git a/yuvbench/build-macos-aarch64-clang.sh b/yuvbench/build-macos-aarch64-clang.sh
index 18706e8..7cb19e6 100755
--- a/yuvbench/build-macos-aarch64-clang.sh
+++ b/yuvbench/build-macos-aarch64-clang.sh
@@ -1,5 +1,5 @@
 #!/bin/sh
-CFLAGS="-Wall -Wextra -Wpedantic -O3 -g -DYUVBENCH_ACCELERATE -DYUVBENCH_BAD -DYUVBENCH_SWSCALE"
+CFLAGS="-Wall -Wextra -Wpedantic -O3 -g -DYUVBENCH_ACCELERATE -DYUVBENCH_BAD -DYUVBENCH_SWSCALE -DYUVBENCH_CLAUDE"
 LFLAGS="-framework Accelerate $(pkg-config --libs libswscale)"
 mkdir -p build
 set -x
@@ -8,4 +8,5 @@ clang -o build/yuvbench_accelerate.o $CFLAGS -c ./yuvbench_accelerate.c
 clang -o build/yuvbench_accelerate.S $CFLAGS -S ./yuvbench_accelerate.c
 clang -o build/yuvbench_bad.o $CFLAGS -c ./yuvbench_bad.c
 clang -o build/yuvbench_swscale.o $CFLAGS $(pkg-config --cflags libswscale) -c ./yuvbench_swscale.c
-clang -o build/yuvbench $LFLAGS build/yuvbench.o build/yuvbench_accelerate.o build/yuvbench_bad.o build/yuvbench_swscale.o
+clang -o build/yuvbench_claude.o $CFLAGS -c ./yuvbench_claude.c
+clang -o build/yuvbench $LFLAGS build/yuvbench.o build/yuvbench_accelerate.o build/yuvbench_bad.o build/yuvbench_swscale.o build/yuvbench_claude.o
diff --git a/yuvbench/yuvbench.c b/yuvbench/yuvbench.c
index 3a92371..669c339 100644
--- a/yuvbench/yuvbench.c
+++ b/yuvbench/yuvbench.c
@@ -3,6 +3,14 @@
 #define KBENCH_IMPLEMENTATION
 #include "kbench.h"
 
+#include <math.h>
+
+static int cmp_double(const void* a, const void* b)
+{
+    const double *lhs = a, *rhs = b;  /* qsort() comparator: ascending order, yields -1, 0 or 1 */
+    return (*lhs < *rhs) ? -1 : (*lhs > *rhs);
+}
+
 #ifdef YUVBENCH_ACCELERATE
 Backend yuvbench_accelerate(void);
 #endif
@@ -15,6 +23,9 @@ Backend yuvbench_libyuv(void);
 #ifdef YUVBENCH_SWSCALE
 Backend yuvbench_swscale(void);
 #endif
+#ifdef YUVBENCH_CLAUDE
+Backend yuvbench_claude(void);
+#endif
 
 static struct
 {
@@ -70,21 +81,27 @@ static void run_backend(Backend b)
         b.deinit_fn(&ctx);
     }
 
-    double ts_min = -1.0f;
-    double ts_max = -1.0f;
-    double ts_avg = 0.0f;
+    // Sort for percentiles
+    qsort(tests_table, tests, sizeof(double), cmp_double);
+
+    double ts_min = tests_table[0];
+    double ts_max = tests_table[tests - 1];
+    double ts_p50 = tests_table[tests / 2];
+    double ts_p95 = tests_table[(int)(tests * 0.95)];
+    double ts_p99 = tests_table[(int)(tests * 0.99)];
+    double ts_avg = 0.0;
+    for (int i = 0; i < tests; ++i) ts_avg += tests_table[i] / (double)tests;
+    double ts_var = 0.0;
     for (int i = 0; i < tests; ++i) {
-        if (ts_min < 0 || tests_table[i] < ts_min) {
-            ts_min = tests_table[i];
-        }
-        if (ts_max < 0 || tests_table[i] > ts_max) {
-            ts_max = tests_table[i];
-        }
-        ts_avg += (tests_table[i] / (double)tests);
+        double d = tests_table[i] - ts_avg;
+        ts_var += d * d / (double)tests;
     }
-    printf("    min result: %fms\n", ts_min * 1000.0f);
-    printf("    max result: %fms\n", ts_max * 1000.0f);
-    printf("    avg result: %fms\n", ts_avg * 1000.0f);
+    double ts_stddev = sqrt(ts_var);
+
+    #define MS(t) ((t) * 1000.0)
+    printf("    min %8.3fms  p50 %8.3fms  p95 %8.3fms  p99 %8.3fms  max %8.3fms  avg %8.3fms  σ %7.3fms\n",
+           MS(ts_min), MS(ts_p50), MS(ts_p95), MS(ts_p99), MS(ts_max), MS(ts_avg), MS(ts_stddev));
+    #undef MS
 
 
     if (G.show) {
@@ -204,4 +221,8 @@ int main(int argc, char** argv)
     printf("YUVBENCH_SWSCALE\n");
     run_backend(yuvbench_swscale());
 #endif
+#ifdef YUVBENCH_CLAUDE
+    printf("YUVBENCH_CLAUDE\n");
+    run_backend(yuvbench_claude());
+#endif
 }
diff --git a/yuvbench/yuvbench_claude.c b/yuvbench/yuvbench_claude.c
new file mode 100644
index 0000000..c8aae52
--- /dev/null
+++ b/yuvbench/yuvbench_claude.c
@@ -0,0 +1,159 @@
+#include "yuvbench.h"
+
+#ifdef YUVBENCH_CLAUDE
+
+#include <arm_neon.h>
+#include <dispatch/dispatch.h>
+
+// BT.709 limited range, scale >>6 (factor=64):
+//   R = clip(( 75*(Y-16) + 119*(Cr-128) + 32) >> 6)
+//   G = clip(( 75*(Y-16) -  12*(Cb-128) -  30*(Cr-128) + 32) >> 6)
+//   B = clip(( 75*(Y-16) + 129*(Cb-128) + 32) >> 6)
+//
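+// For limited-range input (Y 16..235, Cb/Cr 16..240) int16 intermediates stay within [-14416, 30905]; out-of-range bytes can exceed INT16_MAX in R and B (Y=255, Cb=255 gives B = 34340).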
+// vqshrun_n_s16 gives saturating unsigned narrow + right shift in one instruction.
+
+// Coefficients:
+//   Y:  75  (1.164 * 64 = 74.5 → 75, err +0.67%)
+//   Cr_R: 119 (1.856 * 64 = 118.8 → 119, err +0.17%)
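+//   Cb_B: 129 (2.016 * 64 = 129.0; NOTE(review): ~2.018 is the BT.601 limited-range value, BT.709 is normally 2.112 (and 1.793 for Cr_R) — confirm against the reference backend)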
+//   Cb_G:  12 (0.187 * 64 = 12.0)
+//   Cr_G:  30 (0.468 * 64 = 30.0)
+
+// Convert 8 consecutive luma samples at y into 8 packed RGB pixels (24 bytes) at out.
+// vcb = Cb - 128 and vcr = Cr - 128, already widened to int16 with one lane per pixel.
+static inline void convert_8px(
+    const uint8_t* __restrict__ y,
+    int16x8_t vcb,
+    int16x8_t vcr,
+    uint8_t* __restrict__ out)
+{
+    // Y - 16
+    int16x8_t vy = vsubq_s16(
+        vreinterpretq_s16_u16(vmovl_u8(vld1_u8(y))),
+        vdupq_n_s16(16));
+
+    // base = 75*(Y-16) + 32 (rounding term); within [-1168, 17957] for any 8-bit Y
+    int16x8_t base = vaddq_s16(vmulq_n_s16(vy, 75), vdupq_n_s16(32));
+
+    // R and B can exceed INT16_MAX for out-of-range bytes (up to 34340), so they use a
+    // saturating add: 32767 >> 6 still narrows to 255, where a wrapped sum would give 0.
+    int16x8_t r = vqaddq_s16(base, vmulq_n_s16(vcr, 119));
+    int16x8_t b = vqaddq_s16(base, vmulq_n_s16(vcb, 129));
+    // G = base - 12*Cb - 30*Cr stays within [-6502, 23333]; plain multiply-subtract is safe.
+    int16x8_t g = vmlsq_n_s16(vmlsq_n_s16(base, vcb, 12), vcr, 30);
+
+    // Saturating narrow + shift: >> 6, clamp to [0,255], pack to uint8
+    uint8x8x3_t rgb;
+    rgb.val[0] = vqshrun_n_s16(r, 6);
+    rgb.val[1] = vqshrun_n_s16(g, 6);
+    rgb.val[2] = vqshrun_n_s16(b, 6);
+    vst3_u8(out, rgb);  // interleaved store: R0G0B0 R1G1B1 ... R7G7B7
+}
+
+// Convert luma rows [row_start, row_end) of a planar 4:2:0 frame into packed RGB24.
+// Rows are handled in pairs that share one chroma row, so row_start and row_end must
+// be even. Y has w bytes per row; Cb and Cr have w/2 bytes per row. w is expected to be
+// even (yuvbench_claude_init rejects odd dimensions).
+// Plane offsets are computed in size_t so row * w * 3 cannot wrap uint32_t on huge frames.
+static void convert_rows(
+    const uint8_t* __restrict__ Y, const uint8_t* __restrict__ Cb,
+    const uint8_t* __restrict__ Cr, uint8_t* __restrict__ RGB,
+    uint32_t w, uint32_t row_start, uint32_t row_end)
+{
+    const size_t chroma_stride = w / 2;
+    for (uint32_t row = row_start; row < row_end; row += 2) {
+        const uint8_t* y0   = Y   + (size_t)row * w;
+        const uint8_t* y1   = y0  + w;
+        const uint8_t* cb   = Cb  + (row / 2) * chroma_stride;
+        const uint8_t* cr   = Cr  + (row / 2) * chroma_stride;
+        uint8_t*       rgb0 = RGB + (size_t)row * w * 3;
+        uint8_t*       rgb1 = rgb0 + (size_t)w * 3;
+
+        uint32_t col = 0;
+
+        // 16 pixels per inner iteration: 8 chroma samples shared across 2 rows.
+        for (; col + 16 <= w; col += 16) {
+            // Load 8 Cb/Cr bytes and upsample to 16 via interleave-with-self:
+            //   [c0,c1,...,c7] → [c0,c0,c1,c1,...,c7,c7]
+            uint8x8x2_t cb_up = vzip_u8(vld1_u8(cb + col/2), vld1_u8(cb + col/2));
+            uint8x8x2_t cr_up = vzip_u8(vld1_u8(cr + col/2), vld1_u8(cr + col/2));
+
+            // Widen and bias-subtract chroma for low 8 and high 8 pixels
+            int16x8_t vcb_lo = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(cb_up.val[0])), vdupq_n_s16(128));
+            int16x8_t vcb_hi = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(cb_up.val[1])), vdupq_n_s16(128));
+            int16x8_t vcr_lo = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(cr_up.val[0])), vdupq_n_s16(128));
+            int16x8_t vcr_hi = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(cr_up.val[1])), vdupq_n_s16(128));
+
+            // Row 0
+            convert_8px(y0 + col,      vcb_lo, vcr_lo, rgb0 + col * 3);
+            convert_8px(y0 + col + 8,  vcb_hi, vcr_hi, rgb0 + (col + 8) * 3);
+            // Row 1 — same chroma, different luma
+            convert_8px(y1 + col,      vcb_lo, vcr_lo, rgb1 + col * 3);
+            convert_8px(y1 + col + 8,  vcb_hi, vcr_hi, rgb1 + (col + 8) * 3);
+        }
+
+        // Scalar tail for widths not divisible by 16; same fixed-point formula, evaluated in int32
+        for (; col < w; col += 2) {
+            int32_t cb2 = (int32_t)cb[col / 2] - 128;
+            int32_t cr2 = (int32_t)cr[col / 2] - 128;
+            for (uint32_t dy = 0; dy < 2; ++dy) {
+                const uint8_t* yr  = (dy == 0) ? y0   : y1;
+                uint8_t*       dst = (dy == 0) ? rgb0 : rgb1;
+                for (uint32_t dx = 0; dx < 2 && (col + dx) < w; ++dx) {
+                    int32_t yv = (int32_t)yr[col + dx] - 16;
+                    int32_t base = 75 * yv + 32;
+                    int32_t r = base + 119 * cr2;
+                    int32_t g = base -  12 * cb2 - 30 * cr2;
+                    int32_t bv = base + 129 * cb2;
+                    r  = (r  >> 6); r  = r  < 0 ? 0 : r  > 255 ? 255 : r;
+                    g  = (g  >> 6); g  = g  < 0 ? 0 : g  > 255 ? 255 : g;
+                    bv = (bv >> 6); bv = bv < 0 ? 0 : bv > 255 ? 255 : bv;
+                    dst[(col + dx) * 3 + 0] = (uint8_t)r;
+                    dst[(col + dx) * 3 + 1] = (uint8_t)g;
+                    dst[(col + dx) * 3 + 2] = (uint8_t)bv;
+                }
+            }
+        }
+    }
+}
+
+static bool yuvbench_claude_init(Ctx* ctx)
+{   // Accept only even width and height; the converter works on 2x2 pixel groups.
+    return !((ctx->inp_w % 2 != 0) || (ctx->inp_h % 2 != 0));
+}
+
+static bool yuvbench_claude_convert(Ctx* ctx)
+{
+    // Planar 4:2:0 input: full-size Y plane, then quarter-size Cb and Cr planes.
+    const uint32_t width  = ctx->inp_w;
+    const uint32_t height = ctx->inp_h;
+    const uint8_t* luma   = (const uint8_t*)ctx->inp_buf;
+    const uint8_t* cb     = luma + (size_t)width * height;
+    const uint8_t* cr     = cb + (size_t)(width / 2) * (height / 2);
+    uint8_t*       dst    = (uint8_t*)ctx->out_buf;
+
+    // Split the row-pairs into NCHUNKS contiguous bands, one per dispatch_apply iteration.
+    // dispatch_apply is synchronous: it returns only after every block has finished.
+    static const uint32_t NCHUNKS = 8;
+    const uint32_t row_pairs = height / 2;
+
+    dispatch_apply(NCHUNKS, dispatch_get_global_queue(QOS_CLASS_USER_INTERACTIVE, 0), ^(size_t chunk) {
+        // Band boundaries in row-pairs, doubled to obtain (even) luma row indices.
+        const size_t first_pair = chunk * row_pairs / NCHUNKS;
+        const size_t end_pair   = (chunk + 1) * row_pairs / NCHUNKS;
+        convert_rows(luma, cb, cr, dst, width, (uint32_t)(first_pair * 2), (uint32_t)(end_pair * 2));
+    });
+
+    return true;
+}
+
+Backend yuvbench_claude(void)
+{
+    // Members not named here (e.g. deinit_fn) stay zero-initialised.
+    return (Backend){
+        .init_fn    = yuvbench_claude_init,
+        .convert_fn = yuvbench_claude_convert };
+}
+
+#endif // YUVBENCH_CLAUDE