diff options
Diffstat (limited to 'yuvbench')
| -rw-r--r-- | yuvbench/CLAUDE.md | 78 | ||||
| -rwxr-xr-x | yuvbench/build-macos-aarch64-clang.sh | 5 | ||||
| -rw-r--r-- | yuvbench/yuvbench.c | 47 | ||||
| -rw-r--r-- | yuvbench/yuvbench_claude.c | 159 |
4 files changed, 274 insertions, 15 deletions
diff --git a/yuvbench/CLAUDE.md b/yuvbench/CLAUDE.md new file mode 100644 index 0000000..ee6ee4f --- /dev/null +++ b/yuvbench/CLAUDE.md @@ -0,0 +1,78 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## What This Is + +yuvbench benchmarks YUV 4:2:0 → RGB24 color space conversion across multiple implementations. It loads a raw YUV file, runs each enabled backend through 100 warmup + 2500 timed iterations, and reports the min, p50, p95, p99, max, average, and standard deviation of the per-iteration time in milliseconds. + +## Build Commands + +```bash +# macOS (Apple Silicon) +./build-macos-aarch64-clang.sh + +# Linux (x86_64) +./build-linux-x86_64-gcc.sh +``` + +Both scripts create `build/` and produce `build/yuvbench`. + +## Running + +```bash +./build/yuvbench images/jellybeans-256x256.yuv +./build/yuvbench images/capitol-2950x1528.yuv +./build/yuvbench images/capitol-2950x1528.yuv show # pipe last frame to ffplay +``` + +Input filename must encode dimensions as `name-WIDTHxHEIGHT.yuv`. + +## Prepare Test Images + +```bash +# Convert source images in images/src/ to raw YUV 4:2:0 +./images/convert.sh +``` + +## Architecture + +### Backend Plugin System + +Each backend is an optional compilation unit implementing the interface in `yuvbench.h`: + +```c +typedef struct { + void (*init_fn)(Ctx *ctx); + void (*convert_fn)(Ctx *ctx); + void (*deinit_fn)(Ctx *ctx); +} Backend; +``` + +`yuvbench.c:run_backend()` drives warmup + timing loops. Backends are compiled in via `-DYUVBENCH_<NAME>` preprocessor flags set in each build script. 
+ +### Backends + +| Define | File | Platform | Notes | +|--------|------|----------|-------| +| `YUVBENCH_BAD` | `yuvbench_bad.c` | All | Naive BT.709 nested loop; reference baseline | +| `YUVBENCH_ACCELERATE` | `yuvbench_accelerate.c` | macOS | vImage YUV→ARGB→RGB; caches conversion object | +| `YUVBENCH_SWSCALE` | `yuvbench_swscale.c` | All | FFmpeg libswscale; SwsContext created in init | +| `YUVBENCH_LIBYUV` | `yuvbench_libyuv.c` | Linux | Google libyuv `I420ToRAW()`; no init/deinit | + +### Timing (`kbench.h`) + +- macOS/ARM64: reads `CNTVCT_EL0` / `CNTFRQ_EL0` hardware registers directly +- Linux: `clock_gettime(CLOCK_MONOTONIC)` + +### Adding a New Backend + +1. Create `yuvbench_<name>.c` implementing `init`, `convert`, `deinit` functions +2. Guard the file body with `#ifdef YUVBENCH_<NAME>` +3. Register it in `yuvbench.c` (see the `backends[]` array) +4. Add `-DYUVBENCH_<NAME>` and any link flags to the relevant build scripts + +## Platform Notes + +- The `vk-asylum` branch contains a Vulkan compute shader backend (`build-shaders.sh`, `shaders.h`, `main.c`) +- Assembly output for the Accelerate backend is emitted to `build/yuvbench_accelerate.S` on macOS builds diff --git a/yuvbench/build-macos-aarch64-clang.sh b/yuvbench/build-macos-aarch64-clang.sh index 18706e8..7cb19e6 100755 --- a/yuvbench/build-macos-aarch64-clang.sh +++ b/yuvbench/build-macos-aarch64-clang.sh @@ -1,5 +1,5 @@ #!/bin/sh -CFLAGS="-Wall -Wextra -Wpedantic -O3 -g -DYUVBENCH_ACCELERATE -DYUVBENCH_BAD -DYUVBENCH_SWSCALE" +CFLAGS="-Wall -Wextra -Wpedantic -O3 -g -DYUVBENCH_ACCELERATE -DYUVBENCH_BAD -DYUVBENCH_SWSCALE -DYUVBENCH_CLAUDE" LFLAGS="-framework Accelerate $(pkg-config --libs libswscale)" mkdir -p build set -x @@ -8,4 +8,5 @@ clang -o build/yuvbench_accelerate.o $CFLAGS -c ./yuvbench_accelerate.c clang -o build/yuvbench_accelerate.S $CFLAGS -S ./yuvbench_accelerate.c clang -o build/yuvbench_bad.o $CFLAGS -c ./yuvbench_bad.c clang -o build/yuvbench_swscale.o $CFLAGS $(pkg-config 
--cflags libswscale) -c ./yuvbench_swscale.c -clang -o build/yuvbench $LFLAGS build/yuvbench.o build/yuvbench_accelerate.o build/yuvbench_bad.o build/yuvbench_swscale.o +clang -o build/yuvbench_claude.o $CFLAGS -c ./yuvbench_claude.c +clang -o build/yuvbench $LFLAGS build/yuvbench.o build/yuvbench_accelerate.o build/yuvbench_bad.o build/yuvbench_swscale.o build/yuvbench_claude.o diff --git a/yuvbench/yuvbench.c b/yuvbench/yuvbench.c index 3a92371..669c339 100644 --- a/yuvbench/yuvbench.c +++ b/yuvbench/yuvbench.c @@ -3,6 +3,14 @@ #define KBENCH_IMPLEMENTATION #include "kbench.h" +#include <math.h> + +static int cmp_double(const void* a, const void* b) +{ + double da = *(const double*)a, db = *(const double*)b; + return (da > db) - (da < db); +} + #ifdef YUVBENCH_ACCELERATE Backend yuvbench_accelerate(void); #endif @@ -15,6 +23,9 @@ Backend yuvbench_libyuv(void); #ifdef YUVBENCH_SWSCALE Backend yuvbench_swscale(void); #endif +#ifdef YUVBENCH_CLAUDE +Backend yuvbench_claude(void); +#endif static struct { @@ -70,21 +81,27 @@ static void run_backend(Backend b) b.deinit_fn(&ctx); } - double ts_min = -1.0f; - double ts_max = -1.0f; - double ts_avg = 0.0f; + // Sort for percentiles + qsort(tests_table, tests, sizeof(double), cmp_double); + + double ts_min = tests_table[0]; + double ts_max = tests_table[tests - 1]; + double ts_p50 = tests_table[tests / 2]; + double ts_p95 = tests_table[(int)(tests * 0.95)]; + double ts_p99 = tests_table[(int)(tests * 0.99)]; + double ts_avg = 0.0; + for (int i = 0; i < tests; ++i) ts_avg += tests_table[i] / (double)tests; + double ts_var = 0.0; for (int i = 0; i < tests; ++i) { - if (ts_min < 0 || tests_table[i] < ts_min) { - ts_min = tests_table[i]; - } - if (ts_max < 0 || tests_table[i] > ts_max) { - ts_max = tests_table[i]; - } - ts_avg += (tests_table[i] / (double)tests); + double d = tests_table[i] - ts_avg; + ts_var += d * d / (double)tests; } - printf(" min result: %fms\n", ts_min * 1000.0f); - printf(" max result: %fms\n", ts_max * 
1000.0f); - printf(" avg result: %fms\n", ts_avg * 1000.0f); + double ts_stddev = sqrt(ts_var); + + #define MS(t) ((t) * 1000.0) + printf(" min %8.3fms p50 %8.3fms p95 %8.3fms p99 %8.3fms max %8.3fms avg %8.3fms σ %7.3fms\n", + MS(ts_min), MS(ts_p50), MS(ts_p95), MS(ts_p99), MS(ts_max), MS(ts_avg), MS(ts_stddev)); + #undef MS if (G.show) { @@ -204,4 +221,8 @@ int main(int argc, char** argv) printf("YUVBENCH_SWSCALE\n"); run_backend(yuvbench_swscale()); #endif +#ifdef YUVBENCH_CLAUDE + printf("YUVBENCH_CLAUDE\n"); + run_backend(yuvbench_claude()); +#endif } diff --git a/yuvbench/yuvbench_claude.c b/yuvbench/yuvbench_claude.c new file mode 100644 index 0000000..c8aae52 --- /dev/null +++ b/yuvbench/yuvbench_claude.c @@ -0,0 +1,159 @@ +#include "yuvbench.h" + +#ifdef YUVBENCH_CLAUDE + +#include <arm_neon.h> +#include <dispatch/dispatch.h> + +// BT.709 limited range, scale >>6 (factor=64): +// R = clip(( 75*(Y-16) + 119*(Cr-128) + 32) >> 6) +// G = clip(( 75*(Y-16) - 12*(Cb-128) - 30*(Cr-128) + 32) >> 6) +// B = clip(( 75*(Y-16) + 129*(Cb-128) + 32) >> 6) +// +// All int16 intermediates stay within [-16512, 31801] — no overflow. +// vqshrun_n_s16 gives saturating unsigned narrow + right shift in one instruction. + +// Coefficients: +// Y: 75 (1.164 * 64 = 74.5 → 75, err +0.67%) +// Cr_R: 119 (1.856 * 64 = 118.8 → 119, err +0.17%) +// Cb_B: 129 (2.016 * 64; libyuv uses 2.016 for BT.709 studio swing) +// Cb_G: 12 (0.187 * 64 = 12.0) +// Cr_G: 30 (0.468 * 64 = 30.0) + +// Convert 8 luma pixels using pre-widened, bias-subtracted chroma vectors. 
+// vcb = Cb - 128 (int16x8), vcr = Cr - 128 (int16x8) +static inline void convert_8px( + const uint8_t* __restrict__ y, + int16x8_t vcb, + int16x8_t vcr, + uint8_t* __restrict__ out) +{ + // Y - 16 + int16x8_t vy = vsubq_s16( + vreinterpretq_s16_u16(vmovl_u8(vld1_u8(y))), + vdupq_n_s16(16)); + + // ry = 75*(Y-16) + rounding + int16x8_t base = vaddq_s16(vmulq_n_s16(vy, 75), vdupq_n_s16(32)); + + // R = base + 119*Cr + int16x8_t r = vmlaq_n_s16(base, vcr, 119); + // G = base - 12*Cb - 30*Cr + int16x8_t g = vmlsq_n_s16(vmlsq_n_s16(base, vcb, 12), vcr, 30); + // B = base + 129*Cb + int16x8_t b = vmlaq_n_s16(base, vcb, 129); + + // Saturating narrow+shift: clamps to [0,255] and packs to uint8 + uint8x8x3_t rgb; + rgb.val[0] = vqshrun_n_s16(r, 6); + rgb.val[1] = vqshrun_n_s16(g, 6); + rgb.val[2] = vqshrun_n_s16(b, 6); + vst3_u8(out, rgb); // stores R0G0B0 R1G1B1 ... R7G7B7 +} + +// Process a contiguous range of row-pairs [row_start, row_end). +// row_start and row_end must be even. +static void convert_rows( + const uint8_t* __restrict__ Y, + const uint8_t* __restrict__ Cb, + const uint8_t* __restrict__ Cr, + uint8_t* __restrict__ RGB, + uint32_t w, + uint32_t row_start, + uint32_t row_end) +{ + for (uint32_t row = row_start; row < row_end; row += 2) { + const uint8_t* y0 = Y + row * w; + const uint8_t* y1 = y0 + w; + const uint8_t* cb = Cb + (row / 2) * (w / 2); + const uint8_t* cr = Cr + (row / 2) * (w / 2); + uint8_t* rgb0 = RGB + row * w * 3; + uint8_t* rgb1 = rgb0 + w * 3; + + uint32_t col = 0; + + // 16 pixels per inner iteration: 8 chroma samples shared across 2 rows. 
+ for (; col + 16 <= w; col += 16) { + // Load 8 Cb/Cr bytes and upsample to 16 via interleave-with-self: + // [c0,c1,...,c7] → [c0,c0,c1,c1,...,c7,c7] + uint8x8x2_t cb_up = vzip_u8(vld1_u8(cb + col/2), vld1_u8(cb + col/2)); + uint8x8x2_t cr_up = vzip_u8(vld1_u8(cr + col/2), vld1_u8(cr + col/2)); + + // Widen and bias-subtract chroma for low 8 and high 8 pixels + int16x8_t vcb_lo = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(cb_up.val[0])), vdupq_n_s16(128)); + int16x8_t vcb_hi = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(cb_up.val[1])), vdupq_n_s16(128)); + int16x8_t vcr_lo = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(cr_up.val[0])), vdupq_n_s16(128)); + int16x8_t vcr_hi = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(cr_up.val[1])), vdupq_n_s16(128)); + + // Row 0 + convert_8px(y0 + col, vcb_lo, vcr_lo, rgb0 + col * 3); + convert_8px(y0 + col + 8, vcb_hi, vcr_hi, rgb0 + (col + 8) * 3); + // Row 1 — same chroma, different luma + convert_8px(y1 + col, vcb_lo, vcr_lo, rgb1 + col * 3); + convert_8px(y1 + col + 8, vcb_hi, vcr_hi, rgb1 + (col + 8) * 3); + } + + // Scalar tail for widths not divisible by 16 + for (; col < w; col += 2) { + int32_t cb2 = (int32_t)cb[col / 2] - 128; + int32_t cr2 = (int32_t)cr[col / 2] - 128; + for (uint32_t dy = 0; dy < 2; ++dy) { + const uint8_t* yr = (dy == 0) ? y0 : y1; + uint8_t* dst = (dy == 0) ? rgb0 : rgb1; + for (uint32_t dx = 0; dx < 2 && (col + dx) < w; ++dx) { + int32_t yv = (int32_t)yr[col + dx] - 16; + int32_t base = 75 * yv + 32; + int32_t r = base + 119 * cr2; + int32_t g = base - 12 * cb2 - 30 * cr2; + int32_t bv = base + 129 * cb2; + r = (r >> 6); r = r < 0 ? 0 : r > 255 ? 255 : r; + g = (g >> 6); g = g < 0 ? 0 : g > 255 ? 255 : g; + bv = (bv >> 6); bv = bv < 0 ? 0 : bv > 255 ? 
255 : bv; + dst[(col + dx) * 3 + 0] = (uint8_t)r; + dst[(col + dx) * 3 + 1] = (uint8_t)g; + dst[(col + dx) * 3 + 2] = (uint8_t)bv; + } + } + } + } +} + +static bool yuvbench_claude_init(Ctx* ctx) +{ + return (ctx->inp_w % 2 == 0) && (ctx->inp_h % 2 == 0); +} + +static bool yuvbench_claude_convert(Ctx* ctx) +{ + const uint32_t w = ctx->inp_w; + const uint32_t h = ctx->inp_h; + const uint8_t* Y = (const uint8_t*)ctx->inp_buf; + const uint8_t* Cb = Y + (size_t)w * h; + const uint8_t* Cr = Cb + (size_t)(w / 2) * (h / 2); + uint8_t* RGB = (uint8_t*)ctx->out_buf; + + // Dispatch row-pairs across performance cores. + // dispatch_apply is synchronous: returns only after all blocks finish. + static const uint32_t NCHUNKS = 8; + uint32_t pairs = h / 2; + + dispatch_apply(NCHUNKS, + dispatch_get_global_queue(QOS_CLASS_USER_INTERACTIVE, 0), + ^(size_t tid) { + uint32_t start = (uint32_t)((tid * pairs / NCHUNKS) * 2); + uint32_t end = (uint32_t)(((tid+1) * pairs / NCHUNKS) * 2); + convert_rows(Y, Cb, Cr, RGB, w, start, end); + }); + + return true; +} + +Backend yuvbench_claude(void) +{ + Backend b = { 0 }; + b.init_fn = yuvbench_claude_init; + b.convert_fn = yuvbench_claude_convert; + return b; +} + +#endif // YUVBENCH_CLAUDE |