From f58bfac1cd6d9789ed0f878058179c8b4adb8b23 Mon Sep 17 00:00:00 2001 From: Hunter Kvalevog Date: Fri, 3 Apr 2026 20:49:17 -0500 Subject: --- .gitignore | 2 + microshooter/build.sh | 3 + microshooter/microshooter.c | 606 ++++++++++++++++++++++++++++++++++ vk-asylum/build-shaders.sh | 2 +- vk-asylum/main.c | 447 +++++++++++++++++++------ vk-asylum/shaders.h | 2 +- yuvbench/CLAUDE.md | 78 +++++ yuvbench/build-macos-aarch64-clang.sh | 5 +- yuvbench/yuvbench.c | 47 ++- yuvbench/yuvbench_claude.c | 159 +++++++++ 10 files changed, 1233 insertions(+), 118 deletions(-) create mode 100755 microshooter/build.sh create mode 100644 microshooter/microshooter.c create mode 100644 yuvbench/CLAUDE.md create mode 100644 yuvbench/yuvbench_claude.c diff --git a/.gitignore b/.gitignore index 60adada..58ea676 100644 --- a/.gitignore +++ b/.gitignore @@ -9,5 +9,7 @@ bin build d3d11-re.* +microshooter/microshooter_client* +microshooter/microshooter_server* rushmore-linux/out rushmore-linux/linux* diff --git a/microshooter/build.sh b/microshooter/build.sh new file mode 100755 index 0000000..172c395 --- /dev/null +++ b/microshooter/build.sh @@ -0,0 +1,3 @@ +#!/bin/sh +cc -o microshooter_client -O0 -g -fsanitize=address -Wall -Wextra -Wpedantic ./microshooter.c $(pkg-config --cflags --libs sdl3) -DCLIENT +cc -o microshooter_server -O0 -g -fsanitize=address -Wall -Wextra -Wpedantic ./microshooter.c $(pkg-config --cflags --libs sdl3) -DSERVER diff --git a/microshooter/microshooter.c b/microshooter/microshooter.c new file mode 100644 index 0000000..e209c3b --- /dev/null +++ b/microshooter/microshooter.c @@ -0,0 +1,606 @@ +#if !defined(CLIENT) && !defined(SERVER) +# error Either CLIENT or SERVER must be defined +#endif + +// ================================================================================ +// Core headers +// ================================================================================ + +#include +#include +#include +#include + +// ================================================================================ +// Core helpers +// ================================================================================ + +#define COUNTOF(_X) (sizeof(_X) / sizeof((_X)[0])) +#define UNUSED(_X) ((void)sizeof(_X)) + +#ifdef __GNUC__ +# define NORETURN __attribute__((noreturn)) +# define PRINTF_FORMAT(_X, _Y) __attribute__((format(printf, _X, _Y))) +#endif + +// ================================================================================ +// Game -> OS API +// ================================================================================ + +NORETURN +void OS_SpewError(const char *message); + +void OS_SpewInfo(const char *message); + +// ================================================================================ +// OS -> Client API +// ================================================================================ + +typedef struct CL_InitParams CL_InitParams; +struct CL_InitParams +{ + void *(*glproc)(const char *); +}; + +void CL_Init(const CL_InitParams *params); + +void CL_Render(int vp_width, int vp_height); + +// ================================================================================ +// OS -> Server API +// ================================================================================ + +typedef struct SV_InitParams SV_InitParams; +struct SV_InitParams +{ + const char *hostname; +}; + +void SV_Init(const SV_InitParams *params); + +// ================================================================================ +// Shared utility code +// ================================================================================ + +#define HEAP_FMT(_STR, _FMT) \ + do { \ + va_list va; va_start(va, _FMT); \ + int len = vsnprintf(0, 0, _FMT, va) + 1; \ + va_end(va); \ + _STR = calloc(len, 1); \ + va_start(va, _FMT); \ + vsnprintf(_STR, len, _FMT, va); \ + va_end(va); \ + } while (0); + +NORETURN PRINTF_FORMAT(1, 2) +void Error(const char *fmt, ...) +{ + char *str = 0; + HEAP_FMT(str, fmt); + OS_SpewError(str); +} + +PRINTF_FORMAT(1, 2) +void Info(const char *fmt, ...) +{ + char *str = 0; + HEAP_FMT(str, fmt); + OS_SpewInfo(str); + free(str); +} + +#undef HEAP_FMT + +// ================================================================================ +// Math +// ================================================================================ + +typedef struct Mat4 Mat4; +struct Mat4 +{ + float m[4 * 4]; +}; +#define MAT4(...) (Mat4){ .m = { __VA_ARGS__ }, } + +#if 0 +static inline Mat4 Mat4_Ident(void) +{ + return MAT4( + 1, 0, 0, 0, + 0, 1, 0, 0, + 0, 0, 1, 0, + 0, 0, 0, 1, + ); +} +#endif + +static inline Mat4 Mat4_Ortho(float l, float r, float t, float b, float n, float f) +{ + // ref: https://www.scratchapixel.com/lessons/3d-basic-rendering/perspective-and-orthographic-projection-matrix/orthographic-projection-matrix.html + return MAT4( + 2.0f / (r - l), 0.0f, 0.0f, 0.0f, + 0.0f, 2.0f / (t - b), 0.0f, 0.0f, + 0.0f, 0.0f, -2.0f / (f - n), 0.0f, + -((r + l) / (r - l)), -((t + b) / (t - b)), -((f + n) / (f - n)), 1.0f, + ); +} + +// ================================================================================ +// OpenGL +// ================================================================================ + +// ref: https://registry.khronos.org/OpenGL/api/GL/glcorearb.h + +#define GL_FALSE 0 +#define GL_TRUE 1 + +#define GL_TRIANGLES 0x0004 + +#define GL_FRAGMENT_SHADER 0x8B30 +#define GL_VERTEX_SHADER 0x8B31 + +#define GL_UNSIGNED_SHORT 0x1403 +#define GL_FLOAT 0x1406 + +#define GL_ARRAY_BUFFER 0x8892 +#define GL_ELEMENT_ARRAY_BUFFER 0x8893 + +#define GL_COLOR_BUFFER_BIT 0x00004000 +#define GL_DEPTH_BUFFER_BIT 0x00000100 + +#define GL_VENDOR 0x1F00 +#define GL_RENDERER 0x1F01 + +#define GL_NO_ERROR 0 +#define GL_INVALID_ENUM 0x0500 +#define GL_INVALID_VALUE 0x0501 +#define GL_INVALID_OPERATION 0x0502 +#define GL_OUT_OF_MEMORY 0x0505 + +#define GL_STATIC_DRAW 0x88E4 +#define GL_STREAM_DRAW 0x88E0 + +#define GL_COMPILE_STATUS 0x8B81 +#define GL_LINK_STATUS 0x8B82 + +typedef unsigned int GLenum; +typedef float GLfloat; +typedef int GLint; +typedef int GLsizei; +typedef unsigned int GLbitfield; +typedef unsigned int GLuint; +typedef unsigned char GLboolean; +typedef uint8_t GLubyte; +typedef size_t GLsizeiptr; +typedef intptr_t GLintptr; + +#define GLFUNCS \ + X(glAttachShader, void, GLuint, GLuint) \ + X(glBindBuffer, void, GLenum, GLuint) \ + X(glBindVertexArray, void, GLuint) \ + X(glBufferData, void, GLenum, GLsizeiptr, const void *, GLenum) \ + X(glBufferSubData, void, GLenum, GLintptr, GLsizeiptr, const void *) \ + X(glClear, void, GLbitfield) \ + X(glClearColor, void, GLfloat, GLfloat, GLfloat, GLfloat) \ + X(glCompileShader, void, GLuint) \ + X(glCreateProgram, GLuint, void) \ + X(glCreateShader, GLuint, GLenum) \ + X(glDrawElements, void, GLenum, GLsizei, GLenum, const void *) \ + X(glEnableVertexAttribArray, void, GLuint) \ + X(glGenBuffers, void, GLsizei, GLuint *) \ + X(glGenVertexArrays, void, GLsizei, GLuint *) \ + X(glGetError, GLenum, void) \ + X(glGetProgramInfoLog, void, GLuint, GLsizei, GLsizei *, char *) \ + X(glGetProgramiv, void, GLuint, GLenum, GLint *) \ + X(glGetShaderInfoLog, void, GLuint, GLsizei, GLsizei *, char *) \ + X(glGetShaderiv, void, GLuint, GLenum, GLint *) \ + X(glGetString, const GLubyte *, GLenum) \ + X(glGetUniformLocation, GLint, GLuint, const char *) \ + X(glLinkProgram, void, GLuint) \ + X(glShaderSource, void, GLuint, GLsizei, const char *const *, const GLint *) \ + X(glUniformMatrix4fv, void, GLint, GLsizei, GLboolean, const GLfloat *) \ + X(glUseProgram, void, GLuint) \ + X(glVertexAttribPointer, void, GLuint, GLint, GLenum, GLboolean, GLsizei, const void *) \ + X(glViewport, void, GLint, GLint, GLsizei, GLsizei) + +#define X(_NAME, _RET, ...) static _RET (*_NAME)(__VA_ARGS__) = 0; +GLFUNCS +#undef X + +void LoadOpenGLFunctions(void *(*glproc)(const char *)) +{ + #define X(_NAME, _RET, ...) \ + _NAME = (_RET(*)(__VA_ARGS__))glproc(#_NAME); \ + if (!_NAME) {Error("Failed to load OpenGL function %s", #_NAME); } + GLFUNCS + #undef X +} + +#undef GLFUNCS + +void CheckGLError(unsigned int line, const char *code) +{ + GLenum error = glGetError(); + if (error != GL_NO_ERROR) + { + const char *error_str = "unknown error"; + switch (error) + { +#define BIND_ERROR(_ERR) case _ERR: error_str = #_ERR; break; + BIND_ERROR(GL_INVALID_ENUM); + BIND_ERROR(GL_INVALID_VALUE); + BIND_ERROR(GL_INVALID_OPERATION); + BIND_ERROR(GL_OUT_OF_MEMORY); +#undef BIND_ERROR + }; + Error("%s:%d: %s generated %s", __FILE__, line, code, error_str); + } +} + +#define GL(_CODE) _CODE; CheckGLError(__LINE__, #_CODE) + +// ================================================================================ +// Batched quad renderer +// ================================================================================ + +#define MAX_QUADS 256 + +typedef struct R_Quad R_Quad; +struct R_Quad +{ + float p[2]; + float t[2]; +}; + +typedef struct R_QuadRenderer R_QuadRenderer; +struct R_QuadRenderer +{ + GLuint vao; + GLuint vbo; + GLuint ibo; + + GLuint shader; + + R_Quad vbuf[MAX_QUADS * 4]; + size_t head; +}; + +void R_InitQuads(R_QuadRenderer *qr, GLuint shader) +{ + qr->shader = shader; + + GL(glGenVertexArrays(1, &qr->vao)); + GL(glBindVertexArray(qr->vao)); + + GL(glGenBuffers(1, &qr->vbo)); + GL(glBindBuffer(GL_ARRAY_BUFFER, qr->vbo)); + GL(glBufferData(GL_ARRAY_BUFFER, sizeof(qr->vbuf), qr->vbuf, GL_STREAM_DRAW)); + + uint16_t ibuf[MAX_QUADS * 6]; + for (int i = 0; i < MAX_QUADS; ++i) + { + ibuf[6*i+0] = 4*i+0; + ibuf[6*i+1] = 4*i+1; + ibuf[6*i+2] = 4*i+2; + ibuf[6*i+3] = 4*i+0; + ibuf[6*i+4] = 4*i+2; + ibuf[6*i+5] = 4*i+3; + } + + GL(glGenBuffers(1, &qr->ibo)); + GL(glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, qr->ibo)); + GL(glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(ibuf), ibuf, GL_STATIC_DRAW)); + + GL(glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, sizeof(R_Quad), (const void *)offsetof(R_Quad, p))); + GL(glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, sizeof(R_Quad), (const void *)offsetof(R_Quad, t))); + GL(glEnableVertexAttribArray(0)); + GL(glEnableVertexAttribArray(1)); +} + +void R_FlushQuads(R_QuadRenderer *qr) +{ + if (qr->head > 0) + { + GL(glBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(R_Quad) * 4 * qr->head, qr->vbuf)); + GL(glDrawElements(GL_TRIANGLES, qr->head * 6, GL_UNSIGNED_SHORT, 0)); + qr->head = 0; + } +} + +void R_PushQuad(R_QuadRenderer *qr, const R_Quad *quad4) +{ + qr->vbuf[qr->head*4+0] = quad4[0]; + qr->vbuf[qr->head*4+1] = quad4[1]; + qr->vbuf[qr->head*4+2] = quad4[2]; + qr->vbuf[qr->head*4+3] = quad4[3]; + ++qr->head; + if (qr->head >= MAX_QUADS) + { + R_FlushQuads(qr); + } +} + +void R_BeginQuads(R_QuadRenderer *qr, int vp_w, int vp_h) +{ + GL(glBindVertexArray(qr->vao)); + GL(glBindBuffer(GL_ARRAY_BUFFER, qr->vbo)); + GL(glUseProgram(qr->shader)); + + Mat4 proj = Mat4_Ortho(0, vp_w, 0, vp_h, 0, 10); + GL(glUniformMatrix4fv(glGetUniformLocation(qr->shader, "u_proj"), 1, GL_FALSE, proj.m)); +} + +void R_EndQuads(R_QuadRenderer *qr) +{ + R_FlushQuads(qr); +} + +// ================================================================================ +// Client code +// ================================================================================ + +static struct +{ + R_QuadRenderer qr; + int width; + int height; +} cl = { 0 }; + +void CL_Init(const CL_InitParams *params) +{ + cl.width = -1; + cl.height = -1; + + LoadOpenGLFunctions(params->glproc); + Info("Loaded OpenGL: GL_VENDOR = %s, GL_RENDERER = %s", glGetString(GL_VENDOR), glGetString(GL_RENDERER)); + + GLuint prog_quad; + +#define GLSL(_X) "#version 330 core\n" #_X + const char* quad_vs = GLSL( + layout (location = 0) in vec2 p; + layout (location = 1) in vec2 t; + + uniform mat4 u_proj; + + void main() + { + gl_Position = u_proj * vec4(p.x, p.y, 0.0, 1.0); + } + ); + const char* quad_fs = GLSL( + out vec4 out_color; + + void main() + { + out_color = vec4(1.0, 0.0, 0.0, 1.0); + } + ); +#undef GLSL + + struct { GLuint* prog; const char *vsrc; const char *fsrc; } reg[] = + { + { .prog = &prog_quad, .vsrc = quad_vs, .fsrc = quad_fs, }, + }; + for (size_t i = 0; i < COUNTOF(reg); ++i) + { + GLuint prog = GL(glCreateProgram()); + + GLuint vs = GL(glCreateShader(GL_VERTEX_SHADER)); + GL(glShaderSource(vs, 1, ®[i].vsrc, 0)); + + GLuint fs = GL(glCreateShader(GL_FRAGMENT_SHADER)); + GL(glShaderSource(fs, 1, ®[i].fsrc, 0)); + + GLuint shaders[] = { vs, fs }; + for (size_t j = 0; j < COUNTOF(shaders); ++j) + { + GL(glCompileShader(shaders[j])); + + GLint status = GL_TRUE; + GL(glGetShaderiv(shaders[j], GL_COMPILE_STATUS, &status)); + if (status != GL_TRUE) + { + char shader_log[1024]; + GL(glGetShaderInfoLog(shaders[j], sizeof(shader_log), 0, shader_log)); + Error("Failed to compile shader: %s", shader_log); + } + + GL(glAttachShader(prog, shaders[j])); + } + + GL(glLinkProgram(prog)); + + GLint status = GL_TRUE; + GL(glGetProgramiv(prog, GL_LINK_STATUS, &status)); + if (status != GL_TRUE) + { + char shader_log[1024]; + GL(glGetProgramInfoLog(prog, sizeof(shader_log), 0, shader_log)); + Error("Failed to link shader: %s", shader_log); + } + + *reg[i].prog = prog; + } + + R_InitQuads(&cl.qr, prog_quad); +} + +void CL_Render(int vp_width, int vp_height) +{ + if (cl.width != vp_width || cl.height != vp_height) + { + cl.width = vp_width; + cl.height = vp_height; + Info("Resized to %dx%d", cl.width, cl.height); + } + + GL(glViewport(0, 0, vp_width, vp_height)); + GL(glClearColor(0.1f, 0.1f, 0.1f, 1.0f)); + GL(glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT)); + + R_BeginQuads(&cl.qr, cl.width, cl.height); + R_Quad rect[4] = + { + (R_Quad){ .p = { 0.0, 0.0 }, .t = { 0, 0 } }, + (R_Quad){ .p = { 100, 0.0 }, .t = { 0, 0 } }, + (R_Quad){ .p = { 100, 100 }, .t = { 0, 0 } }, + (R_Quad){ .p = { 0.0, 100 }, .t = { 0, 0 } }, + }; + R_PushQuad(&cl.qr, rect); + R_EndQuads(&cl.qr); +} + +// ================================================================================ +// Server code +// ================================================================================ + +void SV_Init(const SV_InitParams *params) +{ + Info("Starting server %s", params->hostname); +} + +// ================================================================================ +// SDL3 shared +// ================================================================================ + +#ifdef CLIENT +# define SDL_MAIN_USE_CALLBACKS +#endif +#include +#include + +void OS_SpewError(const char *message) +{ + printf("Error: %s\n", message); +#ifdef CLIENT + SDL_ShowSimpleMessageBox(SDL_MESSAGEBOX_ERROR, "MicroShooter: Error!", message, 0); +#endif + exit(1); +} + +void OS_SpewInfo(const char *message) +{ + printf("%s\n", message); +} + +// ================================================================================ +// SDL3 client +// ================================================================================ + +#ifdef CLIENT + +typedef struct AppState AppState; +struct AppState +{ + SDL_Window *wnd; + SDL_GLContext glctx; +}; + +void RenderFrame(AppState *app) +{ + int vp_width = 0; + int vp_height = 0; + SDL_GetWindowSizeInPixels(app->wnd, &vp_width, &vp_height); + CL_Render(vp_width, vp_height); + SDL_GL_SwapWindow(app->wnd); +} + +SDL_AppResult SDL_AppInit(void **appstate, int argc, char **argv) +{ + UNUSED(argc); + UNUSED(argv); + + *appstate = calloc(1, sizeof(AppState)); + AppState *app = *appstate; + + if (!SDL_Init(SDL_INIT_VIDEO | SDL_INIT_EVENTS)) + { + Error("Failed to initialize SDL: %s", SDL_GetError()); + } + + SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 3); + SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 3); + SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE); + + app->wnd = SDL_CreateWindow("MicroShooter", 800, 600, SDL_WINDOW_HIDDEN | SDL_WINDOW_RESIZABLE | SDL_WINDOW_OPENGL); + if (!app->wnd) + { + Error("Failed to create window: %s", SDL_GetError()); + } + + app->glctx = SDL_GL_CreateContext(app->wnd); + if (!app->glctx) + { + Error("Failed to create OpenGL context: %s", SDL_GetError()); + } + + CL_InitParams init = (CL_InitParams) + { + .glproc = (void *(*)(const char *))SDL_GL_GetProcAddress, + }; + CL_Init(&init); + + // Render one frame before showing the window + RenderFrame(app); + + SDL_ShowWindow(app->wnd); + + return SDL_APP_CONTINUE; +} + +SDL_AppResult SDL_AppEvent(void *appstate, SDL_Event *event) +{ + UNUSED(appstate); + + SDL_AppResult result = SDL_APP_CONTINUE; + + switch (event->type) + { + case SDL_EVENT_QUIT: + { + result = SDL_APP_SUCCESS; + break; + } + } + + return result; +} + +SDL_AppResult SDL_AppIterate(void *appstate) +{ + AppState *app = appstate; + + RenderFrame(app); + + return SDL_APP_CONTINUE; +} + +void SDL_AppQuit(void *appstate, SDL_AppResult result) +{ + UNUSED(result); + + AppState *app = appstate; + + SDL_DestroyWindow(app->wnd); + SDL_Quit(); +} + +#endif // CLIENT + +// ================================================================================ +// SDL3 server +// ================================================================================ + +#ifdef SERVER + +int main(int argc, char **argv) +{ + UNUSED(argc); + UNUSED(argv); + printf("\n"); +} + +#endif // SERVER + diff --git a/vk-asylum/build-shaders.sh b/vk-asylum/build-shaders.sh index 51d4475..780ab86 100755 --- a/vk-asylum/build-shaders.sh +++ b/vk-asylum/build-shaders.sh @@ -3,4 +3,4 @@ mkdir -p build glslc -fshader-stage=vert ./triangle_vs.glsl -o build/triangle_vs.spv glslc -fshader-stage=frag ./triangle_fs.glsl -o build/triangle_fs.spv echo "const uint32_t triangle_vert_spv[] = { "$(hexdump -v -e '1/4 "0x%08x, "' build/triangle_vs.spv)" };" > shaders.h -echo "const uint32_t triangle_frag_spv[] = { "$(hexdump -v -e '1/4 "0x%08x, "' build/triangle_vs.spv)" };" >> shaders.h +echo "const uint32_t triangle_frag_spv[] = { "$(hexdump -v -e '1/4 "0x%08x, "' build/triangle_fs.spv)" };" >> shaders.h diff --git a/vk-asylum/main.c b/vk-asylum/main.c index 74e86e2..ef8410b 100644 --- a/vk-asylum/main.c +++ b/vk-asylum/main.c @@ -17,12 +17,27 @@ #define WND_W 1024 #define WND_H 768 +typedef struct Vec3 Vec3; +struct Vec3 +{ + float x; + float y; + float z; +}; + +typedef struct Vertex Vertex; +struct Vertex +{ + Vec3 p; + Vec3 c; +}; + int main(int argc, char* argv[]) { (void)argc; (void)argv; SDL_Init(SDL_INIT_VIDEO); - SDL_Window* wnd = SDL_CreateWindow("vk-asylum", WND_W, WND_H, SDL_WINDOW_VULKAN); + SDL_Window* wnd = SDL_CreateWindow("vk-asylum", WND_W, WND_H, SDL_WINDOW_VULKAN | SDL_WINDOW_HIGH_PIXEL_DENSITY); ASSERT(wnd); VkResult r = VK_SUCCESS; @@ -33,22 +48,35 @@ int main(int argc, char* argv[]) .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO, .apiVersion = VK_API_VERSION_1_4, }; + const char* layers[] = { "VK_LAYER_KHRONOS_validation" }; VkInstanceCreateInfo vkici = (VkInstanceCreateInfo){ .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, .pApplicationInfo = &vkai, + .enabledLayerCount = 1, + .ppEnabledLayerNames = layers, }; - vkici.ppEnabledExtensionNames = SDL_Vulkan_GetInstanceExtensions(&vkici.enabledExtensionCount); - printf("Instance extensions requested by SDL:\n"); - for (uint32_t i = 0; i < vkici.enabledExtensionCount; ++i) { - printf(" %s\n", vkici.ppEnabledExtensionNames[i]); + // Extensions required for the SDL window backend + uint32_t num_sdl_extensions = 0; + const char* const* sdl_extensions = SDL_Vulkan_GetInstanceExtensions(&num_sdl_extensions); + + const char* extensions[64] = { 0 }; + for (; vkici.enabledExtensionCount < num_sdl_extensions; ++vkici.enabledExtensionCount) { + extensions[vkici.enabledExtensionCount] = sdl_extensions[vkici.enabledExtensionCount]; } #ifdef __APPLE__ // Required for instance extension VK_KHR_portability_enumeration (MoltenVK) vkici.flags |= VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR; + extensions[vkici.enabledExtensionCount++] = VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME; #endif + vkici.ppEnabledExtensionNames = extensions; + printf("Instance extensions requested by SDL:\n"); + for (uint32_t i = 0; i < vkici.enabledExtensionCount; ++i) { + printf(" %s\n", vkici.ppEnabledExtensionNames[i]); + } + if ((r = vkCreateInstance(&vkici, 0, &vk)) != VK_SUCCESS) { DIE("Failed to create Vulkan instance: %s", VkResultToString(r)); } @@ -59,7 +87,7 @@ int main(int argc, char* argv[]) DIE("Failed to create Vulkan surface: %s", SDL_GetError()); } - VkPhysicalDevice vk_pdev = 0; + VkPhysicalDevice vk_pdev = VK_NULL_HANDLE; { uint32_t vk_pdev_count = 0; vkEnumeratePhysicalDevices(vk, &vk_pdev_count, 0); @@ -69,19 +97,31 @@ int main(int argc, char* argv[]) VkPhysicalDevice* vk_pdevs = calloc(vk_pdev_count, sizeof(VkPhysicalDevice)); vkEnumeratePhysicalDevices(vk, &vk_pdev_count, vk_pdevs); + + // Select the first device with dynamic rendering support // Just pick the first available physical device vk_pdev = vk_pdevs[0]; printf("Vulkan devices:\n"); for (uint32_t i = 0; i < vk_pdev_count; ++i) { - VkPhysicalDeviceProperties vk_pdev_props = { 0 }; - vkGetPhysicalDeviceProperties(vk_pdev, &vk_pdev_props); + VkPhysicalDeviceProperties vk_pdev_props = { 0 }; + vkGetPhysicalDeviceProperties(vk_pdevs[i], &vk_pdev_props); + VkPhysicalDeviceDynamicRenderingFeatures vk_pdev_dr = (VkPhysicalDeviceDynamicRenderingFeatures){ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DYNAMIC_RENDERING_FEATURES, + }; + VkPhysicalDeviceFeatures2 vk_pdev_features = (VkPhysicalDeviceFeatures2){ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2, + .pNext = &vk_pdev_dr, + }; + vkGetPhysicalDeviceFeatures2(vk_pdevs[i], &vk_pdev_features); printf(" %s\n", vk_pdev_props.deviceName); + printf(" dynamic rendering? %s\n", vk_pdev_dr.dynamicRendering ? "YES" : "NO"); } free(vk_pdevs); } + ASSERT(vk_pdev != VK_NULL_HANDLE); uint32_t vk_qf_index = (uint32_t)-1; { @@ -113,14 +153,25 @@ int main(int argc, char* argv[]) .queueCount = 1, .pQueuePriorities = &priority, }; - - const char* device_extensions[] = { VK_KHR_SWAPCHAIN_EXTENSION_NAME, }; + + VkPhysicalDeviceDynamicRenderingFeatures dynamic_rendering = (VkPhysicalDeviceDynamicRenderingFeatures){ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DYNAMIC_RENDERING_FEATURES, + .dynamicRendering = VK_TRUE, + }; + + const char* device_extensions[] = { + VK_KHR_SWAPCHAIN_EXTENSION_NAME, +#ifdef __APPLE__ + "VK_KHR_portability_subset", +#endif + }; VkDeviceCreateInfo device_create = (VkDeviceCreateInfo){ .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, .queueCreateInfoCount = 1, .pQueueCreateInfos = &queue_create, .enabledExtensionCount = COUNTOF(device_extensions), .ppEnabledExtensionNames = device_extensions, + .pNext = &dynamic_rendering, }; if ((r = vkCreateDevice(vk_pdev, &device_create, 0, &vk_dev)) != VK_SUCCESS) { @@ -169,11 +220,13 @@ int main(int argc, char* argv[]) uint32_t vk_swap_image_count = 0; VkImage* vk_swap_images = 0; VkImageView* vk_swap_image_views = 0; + VkFence* vk_swap_fences = 0; { vkGetSwapchainImagesKHR(vk_dev, vk_swp, &vk_swap_image_count, 0); ASSERT(vk_swap_image_count > 0); vk_swap_images = calloc(vk_swap_image_count, sizeof(VkImage)); vkGetSwapchainImagesKHR(vk_dev, vk_swp, &vk_swap_image_count, vk_swap_images); + vk_swap_fences = calloc(vk_swap_image_count, sizeof(VkFence)); vk_swap_image_views = calloc(vk_swap_image_count, sizeof(VkImageView)); for (uint32_t i = 0; i < vk_swap_image_count; ++i) { @@ -195,69 +248,87 @@ int main(int argc, char* argv[]) if ((r = vkCreateImageView(vk_dev, &image_view_create, 0, &vk_swap_image_views[i])) != VK_SUCCESS) { DIE("Failed to create image view #%u: %s", i, VkResultToString(r)); } + + VkFenceCreateInfo fence_create = (VkFenceCreateInfo){ + .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, + .flags = VK_FENCE_CREATE_SIGNALED_BIT, // start in signaled state + }; + if ((r = vkCreateFence(vk_dev, &fence_create, 0, &vk_swap_fences[i])) != VK_SUCCESS) { + DIE("Failed to create swap image fence #%u: %s", i, VkResultToString(r)); + } } } - - VkRenderPass vk_pass = 0; - { - VkAttachmentDescription attachment = (VkAttachmentDescription){ - .format = VK_FORMAT_B8G8R8A8_SRGB, - .samples = VK_SAMPLE_COUNT_1_BIT, - .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR, - .storeOp = VK_ATTACHMENT_STORE_OP_STORE, - .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE, - .stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE, - .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, - .finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, - }; - VkAttachmentReference color_ref = (VkAttachmentReference){ - .attachment = 0, - .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, - }; - VkSubpassDescription subpass = (VkSubpassDescription){ - .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, - .colorAttachmentCount = 1, - .pColorAttachments = &color_ref, - }; - VkSubpassDependency dep = (VkSubpassDependency){ - .srcSubpass = VK_SUBPASS_EXTERNAL, - .dstSubpass = 0, - .srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, - .dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, - .dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, - }; - VkRenderPassCreateInfo render_pass_create = (VkRenderPassCreateInfo){ - .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, - .attachmentCount = 1, - .pAttachments = &attachment, - .subpassCount = 1, - .pSubpasses = &subpass, - .dependencyCount = 1, - .pDependencies = &dep, - }; - if ((r = vkCreateRenderPass(vk_dev, &render_pass_create, 0, &vk_pass)) != VK_SUCCESS) { - DIE("Failed to create render pass: %s", VkResultToString(r)); - } - } - - VkFramebuffer* vk_framebuffers = calloc(vk_swap_image_count, sizeof(VkFramebuffer)); - { - for (uint32_t i = 0; i < vk_swap_image_count; ++i) { - VkImageView attachments[] = { vk_swap_image_views[i] }; - VkFramebufferCreateInfo framebuffer_create = (VkFramebufferCreateInfo){ - .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, - .renderPass = vk_pass, - .attachmentCount = 1, - .pAttachments = attachments, - .width = vk_extent.width, - .height = vk_extent.height, - .layers = 1, - }; - if ((r = vkCreateFramebuffer(vk_dev, &framebuffer_create, 0, &vk_framebuffers[i])) != VK_SUCCESS) { - DIE("Failed to create render pass #%u: %s", i, VkResultToString(r)); - } - } - } + + VkShaderModule vk_vshader = VK_NULL_HANDLE; + VkShaderModule vk_fshader = VK_NULL_HANDLE; + { +#include "shaders.h" + VkShaderModuleCreateInfo shader_create = (VkShaderModuleCreateInfo){ + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + }; + shader_create.codeSize = sizeof(triangle_vert_spv); + shader_create.pCode = triangle_vert_spv; + if ((r = vkCreateShaderModule(vk_dev, &shader_create, 0, &vk_vshader)) != VK_SUCCESS) { + DIE("Failed to create vertex shader module: %s", VkResultToString(r)); + } + shader_create.codeSize = sizeof(triangle_frag_spv); + shader_create.pCode = triangle_frag_spv; + if ((r = vkCreateShaderModule(vk_dev, &shader_create, 0, &vk_fshader)) != VK_SUCCESS) { + DIE("Failed to create fragment shader module: %s", VkResultToString(r)); + } + } + + VkBuffer vk_vbuf = VK_NULL_HANDLE; + { + Vertex vdata[] = { + (Vertex){ .p = (Vec3){ 0.0f, -0.5f, 0.0f, }, .c = (Vec3){ 1.0f, 0.0f, 0.0f } }, + (Vertex){ .p = (Vec3){ 0.5f, 0.5f, 0.0f, }, .c = (Vec3){ 0.0f, 1.0f, 0.0f } }, + (Vertex){ .p = (Vec3){ -0.5f, 0.5f, 0.0f, }, .c = (Vec3){ 0.0f, 0.0f, 1.0f } }, + }; + VkBufferCreateInfo buffer_create = (VkBufferCreateInfo){ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .size = sizeof(vdata), + .usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + }; + if ((r = vkCreateBuffer(vk_dev, &buffer_create, 0, &vk_vbuf)) != VK_SUCCESS) { + DIE("Failed to create vertex buffer: %s", VkResultToString(r)); + } + + VkMemoryRequirements memreq; + vkGetBufferMemoryRequirements(vk_dev, vk_vbuf, &memreq); + + VkMemoryAllocateInfo alloc_info = (VkMemoryAllocateInfo){ + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .allocationSize = memreq.size, + .memoryTypeIndex = UINT32_MAX, + }; + + VkPhysicalDeviceMemoryProperties memprops; + vkGetPhysicalDeviceMemoryProperties(vk_pdev, &memprops); + for (uint32_t i = 0; i < memprops.memoryTypeCount; ++i) { + if (memreq.memoryTypeBits & (1 << i)) { + const uint32_t needed = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + if ((memprops.memoryTypes[i].propertyFlags & needed) == needed) { + alloc_info.memoryTypeIndex = i; + break; + } + } + } + ASSERT(alloc_info.memoryTypeIndex != UINT32_MAX); + + VkDeviceMemory vkmem; + if ((r = vkAllocateMemory(vk_dev, &alloc_info, 0, &vkmem)) != VK_SUCCESS) { + DIE("Failed to allocate GPU memory for vertex buffer: %s", VkResultToString(r)); + } + + vkBindBufferMemory(vk_dev, vk_vbuf, vkmem, 0); + + void* data = 0; + vkMapMemory(vk_dev, vkmem, 0, sizeof(vdata), 0, &data); + memcpy(data, vdata, sizeof(vdata)); + vkUnmapMemory(vk_dev, vkmem); + } VkPipeline vk_pipeline = 0; { @@ -268,6 +339,128 @@ int main(int argc, char* argv[]) if ((r = vkCreatePipelineLayout(vk_dev, &pipeline_layout_create, 0, &pipeline_layout)) != VK_SUCCESS) { DIE("Failed to create pipeline layout: %s", VkResultToString(r)); } + + VkPipelineShaderStageCreateInfo stages[] = { + (VkPipelineShaderStageCreateInfo){ + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = vk_vshader, + .pName = "main", + }, + (VkPipelineShaderStageCreateInfo){ + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = vk_fshader, + .pName = "main", + } + }; + + VkPipelineInputAssemblyStateCreateInfo input_assembly = (VkPipelineInputAssemblyStateCreateInfo){ + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST, + }; + + VkViewport viewport = (VkViewport){ + .width = vk_extent.width, + .height = vk_extent.height, + .maxDepth = 1.0f, + }; + + VkRect2D scissor = (VkRect2D){ + .extent = vk_extent, + }; + + VkPipelineViewportStateCreateInfo viewport_state_create = (VkPipelineViewportStateCreateInfo){ + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .viewportCount = 1, + .pViewports = &viewport, + .scissorCount = 1, + .pScissors = &scissor, + }; + + VkPipelineRasterizationStateCreateInfo raster_state_create = (VkPipelineRasterizationStateCreateInfo){ + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .frontFace = VK_FRONT_FACE_CLOCKWISE, + .lineWidth = 1.0, + }; + + VkPipelineMultisampleStateCreateInfo multisample_state = (VkPipelineMultisampleStateCreateInfo){ + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT, + }; + + VkPipelineColorBlendAttachmentState blending_attach = (VkPipelineColorBlendAttachmentState){ + .colorWriteMask = 0x0F, + }; + + VkPipelineColorBlendStateCreateInfo blending = (VkPipelineColorBlendStateCreateInfo){ + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &blending_attach, + }; + + VkFormat color_format = VK_FORMAT_B8G8R8A8_SRGB; + VkPipelineRenderingCreateInfo rendering = (VkPipelineRenderingCreateInfo){ + .sType = VK_STRUCTURE_TYPE_PIPELINE_RENDERING_CREATE_INFO, + .colorAttachmentCount = 1, + .pColorAttachmentFormats = &color_format, + }; + + VkPipelineDepthStencilStateCreateInfo depth_stencil = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + .depthTestEnable = VK_FALSE, + .depthWriteEnable = VK_FALSE, + .depthCompareOp = VK_COMPARE_OP_ALWAYS, + }; + + VkVertexInputBindingDescription input_binding = (VkVertexInputBindingDescription){ + .binding = 0, + .stride = sizeof(Vertex), + .inputRate = VK_VERTEX_INPUT_RATE_VERTEX, + }; + + VkVertexInputAttributeDescription input_attrs[] = { + (VkVertexInputAttributeDescription){ + .location = 0, + .binding = 0, + .format = VK_FORMAT_R32G32B32_SFLOAT, + .offset = offsetof(Vertex, p), + }, + (VkVertexInputAttributeDescription){ + .location = 1, + .binding = 0, + .format = VK_FORMAT_R32G32B32_SFLOAT, + .offset = offsetof(Vertex, c), + }, + }; + + VkPipelineVertexInputStateCreateInfo vertex_input = (VkPipelineVertexInputStateCreateInfo){ + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = &input_binding, + .vertexAttributeDescriptionCount = COUNTOF(input_attrs), + .pVertexAttributeDescriptions = input_attrs, + }; + + VkGraphicsPipelineCreateInfo pipeline_create = (VkGraphicsPipelineCreateInfo){ + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .pNext = &rendering, + .stageCount = COUNTOF(stages), + .pStages = stages, + .pInputAssemblyState = &input_assembly, + .pViewportState = &viewport_state_create, + .pRasterizationState = &raster_state_create, + .pMultisampleState = &multisample_state, + .pColorBlendState = &blending, + .layout = pipeline_layout, + .pVertexInputState = &vertex_input, + .pDepthStencilState = &depth_stencil, + }; + if ((r = vkCreateGraphicsPipelines(vk_dev, VK_NULL_HANDLE, 1, &pipeline_create, 0, &vk_pipeline)) != VK_SUCCESS) { + DIE("Failed to create pipeline: %s", VkResultToString(r)); + } } VkCommandPool vk_cmd_pool = 0; @@ -302,38 +495,82 @@ int main(int argc, char* argv[]) if ((r = vkBeginCommandBuffer(vk_cmd_buffers[i], &cmd_buffer_begin_info)) != VK_SUCCESS) { DIE("Failed to begin command buffer for swapchain image #%u: %s", i, VkResultToString(r)); } - - VkClearValue clear_val = (VkClearValue){ - .color.float32[0] = 1.0f, - .color.float32[1] = 0.0f, - .color.float32[2] = 1.0f, - .color.float32[3] = 1.0f, - }; - VkRenderPassBeginInfo render_pass_begin_info = (VkRenderPassBeginInfo){ - .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, - .renderPass = vk_pass, - .framebuffer = vk_framebuffers[i], - .renderArea.extent = vk_extent, - .clearValueCount = 1, - .pClearValues = &clear_val - }; - vkCmdBeginRenderPass(vk_cmd_buffers[i], &render_pass_begin_info, VK_SUBPASS_CONTENTS_INLINE); - - // - - vkCmdEndRenderPass(vk_cmd_buffers[i]); - vkEndCommandBuffer(vk_cmd_buffers[i]); + + VkImageMemoryBarrier vk_barrier = (VkImageMemoryBarrier){ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .oldLayout = VK_IMAGE_LAYOUT_UNDEFINED, + .newLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + .srcAccessMask = 0, + .dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, + .image = vk_swap_images[i], + .subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .subresourceRange.levelCount = 1, + .subresourceRange.layerCount = 1, + }; + vkCmdPipelineBarrier(vk_cmd_buffers[i], VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, 0, 0, 0, 0, 0, 1, &vk_barrier); + + VkRenderingAttachmentInfo vk_rendering_attachment = (VkRenderingAttachmentInfo){ + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + .imageView = vk_swap_image_views[i], + .imageLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .clearValue.color = {{1.0f, 0.0f, 1.0f, 1.0f}}, + }; + VkRenderingInfo vk_rendering_info = (VkRenderingInfo){ + .sType = VK_STRUCTURE_TYPE_RENDERING_INFO, + .renderArea.extent = vk_extent, + .layerCount = 1, + .colorAttachmentCount = 1, + .pColorAttachments = &vk_rendering_attachment, + }; + vkCmdBeginRendering(vk_cmd_buffers[i], &vk_rendering_info); + + vkCmdBindPipeline(vk_cmd_buffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, vk_pipeline); + VkDeviceSize offset = 0; + vkCmdBindVertexBuffers(vk_cmd_buffers[i], 0, 1, &vk_vbuf, &offset); + vkCmdDraw(vk_cmd_buffers[i], 3, 1, 0, 0); + + vkCmdEndRendering(vk_cmd_buffers[i]); + + vk_barrier.oldLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + vk_barrier.newLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; + vk_barrier.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + vk_barrier.dstAccessMask = 0; + vkCmdPipelineBarrier(vk_cmd_buffers[i], VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, 0, 0, 0, 0, 0, 1, &vk_barrier); + + if ((r = vkEndCommandBuffer(vk_cmd_buffers[i])) != VK_SUCCESS) { + DIE("Failed to end command buffer for swapchain image #%u: %s", i, VkResultToString(r)); + } } - VkSemaphore vk_image_available = 0; - VkSemaphore vk_render_finished = 0; + VkSemaphore* vk_image_available = calloc(vk_swap_image_count, sizeof(VkSemaphore)); + VkSemaphore* vk_render_finished = calloc(vk_swap_image_count, sizeof(VkSemaphore)); { VkSemaphoreCreateInfo semaphore_create = (VkSemaphoreCreateInfo){ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, }; - vkCreateSemaphore(vk_dev, &semaphore_create, 0, &vk_image_available); - vkCreateSemaphore(vk_dev, &semaphore_create, 0, &vk_render_finished); + for (uint32_t i = 0; i < vk_swap_image_count; ++i) { + vkCreateSemaphore(vk_dev, &semaphore_create, 0, &vk_image_available[i]); + vkCreateSemaphore(vk_dev, &semaphore_create, 0, &vk_render_finished[i]); + } } + +#define MAX_FRAMES_IN_FLIGHT 2 + VkFence vk_frame_fences[MAX_FRAMES_IN_FLIGHT]; + { + VkFenceCreateInfo create = (VkFenceCreateInfo){ + .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, + .flags = VK_FENCE_CREATE_SIGNALED_BIT, + }; + for (uint32_t i = 0; i < MAX_FRAMES_IN_FLIGHT; ++i) { + if ((r = vkCreateFence(vk_dev, &create, 0, &vk_frame_fences[i])) != VK_SUCCESS) { + DIE("Failed to create frame fence #%u: %s", i, VkResultToString(r)); + } + } + } + + uint32_t frame_idx = 0; bool running = true; while (running) { @@ -347,27 +584,31 @@ int main(int argc, char* argv[]) } uint32_t next_image_idx = 0; - vkAcquireNextImageKHR(vk_dev, vk_swp, UINT64_MAX, vk_image_available, VK_NULL_HANDLE, &next_image_idx); + vkAcquireNextImageKHR(vk_dev, vk_swp, UINT64_MAX, vk_image_available[frame_idx], VK_NULL_HANDLE, &next_image_idx); + // Wait for GPU to finish using this swapchain image + vkWaitForFences(vk_dev, 1, &vk_frame_fences[frame_idx], VK_TRUE, UINT64_MAX); + vkResetFences(vk_dev, 1, &vk_frame_fences[frame_idx]); + VkPipelineStageFlags wait_stages[] = { VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT }; VkSubmitInfo submit_info = (VkSubmitInfo){ .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, .waitSemaphoreCount = 1, - .pWaitSemaphores = &vk_image_available, + .pWaitSemaphores = &vk_image_available[frame_idx], .pWaitDstStageMask = wait_stages, .commandBufferCount = 1, .pCommandBuffers = &vk_cmd_buffers[next_image_idx], .signalSemaphoreCount = 1, - .pSignalSemaphores = &vk_render_finished, + .pSignalSemaphores = &vk_render_finished[next_image_idx], }; - if ((r = vkQueueSubmit(vk_graphics_queue, 1, &submit_info, VK_NULL_HANDLE)) != VK_SUCCESS) { + if ((r = vkQueueSubmit(vk_graphics_queue, 1, &submit_info, vk_frame_fences[frame_idx])) != VK_SUCCESS) { DIE("Failed to submit graphics queue: %s", VkResultToString(r)); } VkPresentInfoKHR present_info = (VkPresentInfoKHR){ .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR, .waitSemaphoreCount = 1, - .pWaitSemaphores = &vk_render_finished, + .pWaitSemaphores = &vk_render_finished[next_image_idx], .swapchainCount = 1, .pSwapchains = &vk_swp, .pImageIndices = &next_image_idx, @@ -375,6 +616,10 @@ int main(int argc, char* argv[]) if ((r = vkQueuePresentKHR(vk_graphics_queue, &present_info)) != VK_SUCCESS) { DIE("Failed to submit present queue: %s", VkResultToString(r)); } + + frame_idx = (frame_idx + 1) % MAX_FRAMES_IN_FLIGHT; } + + vkDeviceWaitIdle(vk_dev); } diff --git a/vk-asylum/shaders.h b/vk-asylum/shaders.h index 3ceda9c..435e2f0 100644 --- a/vk-asylum/shaders.h +++ b/vk-asylum/shaders.h @@ -1,2 +1,2 @@ const uint32_t triangle_vert_spv[] = { 0x07230203, 0x00010000, 0x000d000b, 0x0000001f, 0x00000000, 0x00020011, 0x00000001, 0x0006000b, 0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e, 0x00000000, 0x0003000e, 0x00000000, 0x00000001, 0x0009000f, 0x00000000, 0x00000004, 0x6e69616d, 0x00000000, 0x0000000d, 0x00000012, 0x0000001c, 0x0000001d, 0x00030003, 0x00000002, 0x000001c2, 0x000a0004, 0x475f4c47, 0x4c474f4f, 0x70635f45, 0x74735f70, 0x5f656c79, 0x656e696c, 0x7269645f, 0x69746365, 0x00006576, 0x00080004, 0x475f4c47, 0x4c474f4f, 0x6e695f45, 0x64756c63, 0x69645f65, 0x74636572, 0x00657669, 0x00040005, 0x00000004, 0x6e69616d, 0x00000000, 0x00060005, 0x0000000b, 0x505f6c67, 0x65567265, 0x78657472, 0x00000000, 0x00060006, 0x0000000b, 0x00000000, 0x505f6c67, 0x7469736f, 0x006e6f69, 0x00070006, 0x0000000b, 0x00000001, 0x505f6c67, 0x746e696f, 0x657a6953, 0x00000000, 0x00070006, 0x0000000b, 0x00000002, 0x435f6c67, 0x4470696c, 0x61747369, 0x0065636e, 0x00070006, 0x0000000b, 0x00000003, 0x435f6c67, 0x446c6c75, 0x61747369, 0x0065636e, 0x00030005, 0x0000000d, 0x00000000, 0x00030005, 0x00000012, 0x00705f76, 0x00030005, 0x0000001c, 0x00635f66, 0x00030005, 0x0000001d, 0x00635f76, 0x00030047, 0x0000000b, 0x00000002, 0x00050048, 0x0000000b, 0x00000000, 0x0000000b, 0x00000000, 0x00050048, 0x0000000b, 0x00000001, 0x0000000b, 0x00000001, 0x00050048, 0x0000000b, 0x00000002, 0x0000000b, 0x00000003, 0x00050048, 0x0000000b, 0x00000003, 0x0000000b, 0x00000004, 0x00040047, 0x00000012, 0x0000001e, 0x00000000, 0x00040047, 0x0000001c, 0x0000001e, 0x00000000, 0x00040047, 0x0000001d, 0x0000001e, 0x00000001, 0x00020013, 0x00000002, 0x00030021, 0x00000003, 0x00000002, 0x00030016, 0x00000006, 0x00000020, 0x00040017, 0x00000007, 0x00000006, 0x00000004, 0x00040015, 0x00000008, 0x00000020, 0x00000000, 0x0004002b, 0x00000008, 0x00000009, 0x00000001, 0x0004001c, 0x0000000a, 0x00000006, 0x00000009, 0x0006001e, 0x0000000b, 0x00000007, 0x00000006, 0x0000000a, 0x0000000a, 0x00040020, 0x0000000c, 0x00000003, 0x0000000b, 0x0004003b, 0x0000000c, 0x0000000d, 0x00000003, 0x00040015, 0x0000000e, 0x00000020, 0x00000001, 0x0004002b, 0x0000000e, 0x0000000f, 0x00000000, 0x00040017, 0x00000010, 0x00000006, 0x00000003, 0x00040020, 0x00000011, 0x00000001, 0x00000010, 0x0004003b, 0x00000011, 0x00000012, 0x00000001, 0x0004002b, 0x00000006, 0x00000014, 0x3f800000, 0x00040020, 0x00000019, 0x00000003, 0x00000007, 0x00040020, 0x0000001b, 0x00000003, 0x00000010, 0x0004003b, 0x0000001b, 0x0000001c, 0x00000003, 0x0004003b, 0x00000011, 0x0000001d, 0x00000001, 0x00050036, 0x00000002, 0x00000004, 0x00000000, 0x00000003, 0x000200f8, 0x00000005, 0x0004003d, 0x00000010, 0x00000013, 0x00000012, 0x00050051, 0x00000006, 0x00000015, 0x00000013, 0x00000000, 0x00050051, 0x00000006, 0x00000016, 0x00000013, 0x00000001, 0x00050051, 0x00000006, 0x00000017, 0x00000013, 0x00000002, 0x00070050, 0x00000007, 0x00000018, 0x00000015, 0x00000016, 0x00000017, 0x00000014, 0x00050041, 0x00000019, 0x0000001a, 0x0000000d, 0x0000000f, 0x0003003e, 0x0000001a, 0x00000018, 0x0004003d, 0x00000010, 0x0000001e, 0x0000001d, 0x0003003e, 0x0000001c, 0x0000001e, 0x000100fd, 0x00010038, }; -const uint32_t triangle_frag_spv[] = { 0x07230203, 0x00010000, 0x000d000b, 0x0000001f, 0x00000000, 0x00020011, 0x00000001, 0x0006000b, 0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e, 0x00000000, 0x0003000e, 0x00000000, 0x00000001, 0x0009000f, 0x00000000, 0x00000004, 0x6e69616d, 0x00000000, 0x0000000d, 0x00000012, 0x0000001c, 0x0000001d, 0x00030003, 0x00000002, 0x000001c2, 0x000a0004, 0x475f4c47, 0x4c474f4f, 0x70635f45, 0x74735f70, 0x5f656c79, 0x656e696c, 0x7269645f, 0x69746365, 0x00006576, 0x00080004, 0x475f4c47, 0x4c474f4f, 0x6e695f45, 0x64756c63, 0x69645f65, 0x74636572, 0x00657669, 0x00040005, 0x00000004, 0x6e69616d, 0x00000000, 0x00060005, 0x0000000b, 0x505f6c67, 0x65567265, 0x78657472, 0x00000000, 0x00060006, 0x0000000b, 0x00000000, 0x505f6c67, 0x7469736f, 0x006e6f69, 0x00070006, 0x0000000b, 0x00000001, 0x505f6c67, 0x746e696f, 0x657a6953, 0x00000000, 0x00070006, 0x0000000b, 0x00000002, 0x435f6c67, 0x4470696c, 0x61747369, 0x0065636e, 0x00070006, 0x0000000b, 0x00000003, 0x435f6c67, 0x446c6c75, 0x61747369, 0x0065636e, 0x00030005, 0x0000000d, 0x00000000, 0x00030005, 0x00000012, 0x00705f76, 0x00030005, 0x0000001c, 0x00635f66, 0x00030005, 0x0000001d, 0x00635f76, 0x00030047, 0x0000000b, 0x00000002, 0x00050048, 0x0000000b, 0x00000000, 0x0000000b, 0x00000000, 0x00050048, 0x0000000b, 0x00000001, 0x0000000b, 0x00000001, 0x00050048, 0x0000000b, 0x00000002, 0x0000000b, 0x00000003, 0x00050048, 0x0000000b, 0x00000003, 0x0000000b, 0x00000004, 0x00040047, 0x00000012, 0x0000001e, 0x00000000, 0x00040047, 0x0000001c, 0x0000001e, 0x00000000, 0x00040047, 0x0000001d, 0x0000001e, 0x00000001, 0x00020013, 0x00000002, 0x00030021, 0x00000003, 0x00000002, 0x00030016, 0x00000006, 0x00000020, 0x00040017, 0x00000007, 0x00000006, 0x00000004, 0x00040015, 0x00000008, 0x00000020, 0x00000000, 0x0004002b, 0x00000008, 0x00000009, 0x00000001, 0x0004001c, 0x0000000a, 0x00000006, 0x00000009, 0x0006001e, 0x0000000b, 0x00000007, 0x00000006, 0x0000000a, 0x0000000a, 0x00040020, 0x0000000c, 0x00000003, 0x0000000b, 0x0004003b, 0x0000000c, 0x0000000d, 0x00000003, 0x00040015, 0x0000000e, 0x00000020, 0x00000001, 0x0004002b, 0x0000000e, 0x0000000f, 0x00000000, 0x00040017, 0x00000010, 0x00000006, 0x00000003, 0x00040020, 0x00000011, 0x00000001, 0x00000010, 0x0004003b, 0x00000011, 0x00000012, 0x00000001, 0x0004002b, 0x00000006, 0x00000014, 0x3f800000, 0x00040020, 0x00000019, 0x00000003, 0x00000007, 0x00040020, 0x0000001b, 0x00000003, 0x00000010, 0x0004003b, 0x0000001b, 0x0000001c, 0x00000003, 0x0004003b, 0x00000011, 0x0000001d, 0x00000001, 0x00050036, 0x00000002, 0x00000004, 0x00000000, 0x00000003, 0x000200f8, 0x00000005, 0x0004003d, 0x00000010, 0x00000013, 0x00000012, 0x00050051, 0x00000006, 0x00000015, 0x00000013, 0x00000000, 0x00050051, 0x00000006, 0x00000016, 0x00000013, 0x00000001, 0x00050051, 0x00000006, 0x00000017, 0x00000013, 0x00000002, 0x00070050, 0x00000007, 0x00000018, 0x00000015, 0x00000016, 0x00000017, 0x00000014, 0x00050041, 0x00000019, 0x0000001a, 0x0000000d, 0x0000000f, 0x0003003e, 0x0000001a, 0x00000018, 0x0004003d, 0x00000010, 0x0000001e, 0x0000001d, 0x0003003e, 0x0000001c, 0x0000001e, 0x000100fd, 0x00010038, }; +const uint32_t triangle_frag_spv[] = { 0x07230203, 0x00010000, 0x000d000b, 0x00000013, 0x00000000, 0x00020011, 0x00000001, 0x0006000b, 0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e, 0x00000000, 0x0003000e, 0x00000000, 0x00000001, 0x0007000f, 0x00000004, 0x00000004, 0x6e69616d, 0x00000000, 0x00000009, 0x0000000c, 0x00030010, 0x00000004, 0x00000007, 0x00030003, 0x00000002, 0x000001c2, 0x000a0004, 0x475f4c47, 0x4c474f4f, 0x70635f45, 0x74735f70, 0x5f656c79, 0x656e696c, 0x7269645f, 0x69746365, 0x00006576, 0x00080004, 0x475f4c47, 0x4c474f4f, 0x6e695f45, 0x64756c63, 0x69645f65, 0x74636572, 0x00657669, 0x00040005, 0x00000004, 0x6e69616d, 0x00000000, 0x00050005, 0x00000009, 0x756f5f66, 0x6f635f74, 0x00726f6c, 0x00030005, 0x0000000c, 0x00635f66, 0x00040047, 0x00000009, 0x0000001e, 0x00000000, 0x00040047, 0x0000000c, 0x0000001e, 0x00000000, 0x00020013, 0x00000002, 0x00030021, 0x00000003, 0x00000002, 0x00030016, 0x00000006, 0x00000020, 0x00040017, 0x00000007, 0x00000006, 0x00000004, 0x00040020, 0x00000008, 0x00000003, 0x00000007, 0x0004003b, 0x00000008, 0x00000009, 0x00000003, 0x00040017, 0x0000000a, 0x00000006, 0x00000003, 0x00040020, 0x0000000b, 0x00000001, 0x0000000a, 0x0004003b, 0x0000000b, 0x0000000c, 0x00000001, 0x0004002b, 0x00000006, 0x0000000e, 0x3f800000, 0x00050036, 0x00000002, 0x00000004, 0x00000000, 0x00000003, 0x000200f8, 0x00000005, 0x0004003d, 0x0000000a, 0x0000000d, 0x0000000c, 0x00050051, 0x00000006, 0x0000000f, 0x0000000d, 0x00000000, 0x00050051, 0x00000006, 0x00000010, 0x0000000d, 0x00000001, 0x00050051, 0x00000006, 0x00000011, 0x0000000d, 0x00000002, 0x00070050, 0x00000007, 0x00000012, 0x0000000f, 0x00000010, 0x00000011, 0x0000000e, 0x0003003e, 0x00000009, 0x00000012, 0x000100fd, 0x00010038, }; diff --git a/yuvbench/CLAUDE.md b/yuvbench/CLAUDE.md new file mode 100644 index 0000000..ee6ee4f --- /dev/null +++ b/yuvbench/CLAUDE.md @@ -0,0 +1,78 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## What This Is + +yuvbench benchmarks YUV 4:2:0 → RGB24 color space conversion across multiple implementations. It loads a raw YUV file, runs each enabled backend through 100 warmup + 2500 timed iterations, and reports min/max/avg per-iteration timing in milliseconds. + +## Build Commands + +```bash +# macOS (Apple Silicon) +./build-macos-aarch64-clang.sh + +# Linux (x86_64) +./build-linux-x86_64-gcc.sh +``` + +Both scripts create `build/` and produce `build/yuvbench`. + +## Running + +```bash +./build/yuvbench images/jellybeans-256x256.yuv +./build/yuvbench images/capitol-2950x1528.yuv +./build/yuvbench images/capitol-2950x1528.yuv show # pipe last frame to ffplay +``` + +Input filename must encode dimensions as `name-WIDTHxHEIGHT.yuv`. + +## Prepare Test Images + +```bash +# Convert source images in images/src/ to raw YUV 4:2:0 +./images/convert.sh +``` + +## Architecture + +### Backend Plugin System + +Each backend is an optional compilation unit implementing the interface in `yuvbench.h`: + +```c +typedef struct { + void (*init_fn)(Ctx *ctx); + void (*convert_fn)(Ctx *ctx); + void (*deinit_fn)(Ctx *ctx); +} Backend; +``` + +`yuvbench.c:run_backend()` drives warmup + timing loops. Backends are compiled in via `-DYUVBENCH_` preprocessor flags set in each build script. + +### Backends + +| Define | File | Platform | Notes | +|--------|------|----------|-------| +| `YUVBENCH_BAD` | `yuvbench_bad.c` | All | Naive BT.709 nested loop; reference baseline | +| `YUVBENCH_ACCELERATE` | `yuvbench_accelerate.c` | macOS | vImage YUV→ARGB→RGB; caches conversion object | +| `YUVBENCH_SWSCALE` | `yuvbench_swscale.c` | All | FFmpeg libswscale; SwsContext created in init | +| `YUVBENCH_LIBYUV` | `yuvbench_libyuv.c` | Linux | Google libyuv `I420ToRAW()`; no init/deinit | + +### Timing (`kbench.h`) + +- macOS/ARM64: reads `CNTVCT_EL0` / `CNTFRQ_EL0` hardware registers directly +- Linux: `clock_gettime(CLOCK_MONOTONIC)` + +### Adding a New Backend + +1. Create `yuvbench_.c` implementing `init`, `convert`, `deinit` functions +2. Guard the file body with `#ifdef YUVBENCH_` +3. Register it in `yuvbench.c` (see the `backends[]` array) +4. Add `-DYUVBENCH_` and any link flags to the relevant build scripts + +## Platform Notes + +- The `vk-asylum` branch contains a Vulkan compute shader backend (`build-shaders.sh`, `shaders.h`, `main.c`) +- Assembly output for the Accelerate backend is emitted to `build/yuvbench_accelerate.S` on macOS builds diff --git a/yuvbench/build-macos-aarch64-clang.sh b/yuvbench/build-macos-aarch64-clang.sh index 18706e8..7cb19e6 100755 --- a/yuvbench/build-macos-aarch64-clang.sh +++ b/yuvbench/build-macos-aarch64-clang.sh @@ -1,5 +1,5 @@ #!/bin/sh -CFLAGS="-Wall -Wextra -Wpedantic -O3 -g -DYUVBENCH_ACCELERATE -DYUVBENCH_BAD -DYUVBENCH_SWSCALE" +CFLAGS="-Wall -Wextra -Wpedantic -O3 -g -DYUVBENCH_ACCELERATE -DYUVBENCH_BAD -DYUVBENCH_SWSCALE -DYUVBENCH_CLAUDE" LFLAGS="-framework Accelerate $(pkg-config --libs libswscale)" mkdir -p build set -x @@ -8,4 +8,5 @@ clang -o build/yuvbench_accelerate.o $CFLAGS -c ./yuvbench_accelerate.c clang -o build/yuvbench_accelerate.S $CFLAGS -S ./yuvbench_accelerate.c clang -o build/yuvbench_bad.o $CFLAGS -c ./yuvbench_bad.c clang -o build/yuvbench_swscale.o $CFLAGS $(pkg-config --cflags libswscale) -c ./yuvbench_swscale.c -clang -o build/yuvbench $LFLAGS build/yuvbench.o build/yuvbench_accelerate.o build/yuvbench_bad.o build/yuvbench_swscale.o +clang -o build/yuvbench_claude.o $CFLAGS -c ./yuvbench_claude.c +clang -o build/yuvbench $LFLAGS build/yuvbench.o build/yuvbench_accelerate.o build/yuvbench_bad.o build/yuvbench_swscale.o build/yuvbench_claude.o diff --git a/yuvbench/yuvbench.c b/yuvbench/yuvbench.c index 3a92371..669c339 100644 --- a/yuvbench/yuvbench.c +++ b/yuvbench/yuvbench.c @@ -3,6 +3,14 @@ #define KBENCH_IMPLEMENTATION #include "kbench.h" +#include + +static int cmp_double(const void* a, const void* b) +{ + double da = *(const double*)a, db = *(const double*)b; + return (da > db) - (da < db); +} + #ifdef YUVBENCH_ACCELERATE Backend yuvbench_accelerate(void); #endif @@ -15,6 +23,9 @@ Backend yuvbench_libyuv(void); #ifdef YUVBENCH_SWSCALE Backend yuvbench_swscale(void); #endif +#ifdef YUVBENCH_CLAUDE +Backend yuvbench_claude(void); +#endif static struct { @@ -70,21 +81,27 @@ static void run_backend(Backend b) b.deinit_fn(&ctx); } - double ts_min = -1.0f; - double ts_max = -1.0f; - double ts_avg = 0.0f; + // Sort for percentiles + qsort(tests_table, tests, sizeof(double), cmp_double); + + double ts_min = tests_table[0]; + double ts_max = tests_table[tests - 1]; + double ts_p50 = tests_table[tests / 2]; + double ts_p95 = tests_table[(int)(tests * 0.95)]; + double ts_p99 = tests_table[(int)(tests * 0.99)]; + double ts_avg = 0.0; + for (int i = 0; i < tests; ++i) ts_avg += tests_table[i] / (double)tests; + double ts_var = 0.0; for (int i = 0; i < tests; ++i) { - if (ts_min < 0 || tests_table[i] < ts_min) { - ts_min = tests_table[i]; - } - if (ts_max < 0 || tests_table[i] > ts_max) { - ts_max = tests_table[i]; - } - ts_avg += (tests_table[i] / (double)tests); + double d = tests_table[i] - ts_avg; + ts_var += d * d / (double)tests; } - printf(" min result: %fms\n", ts_min * 1000.0f); - printf(" max result: %fms\n", ts_max * 1000.0f); - printf(" avg result: %fms\n", ts_avg * 1000.0f); + double ts_stddev = sqrt(ts_var); + + #define MS(t) ((t) * 1000.0) + printf(" min %8.3fms p50 %8.3fms p95 %8.3fms p99 %8.3fms max %8.3fms avg %8.3fms σ %7.3fms\n", + MS(ts_min), MS(ts_p50), MS(ts_p95), MS(ts_p99), MS(ts_max), MS(ts_avg), MS(ts_stddev)); + #undef MS if (G.show) { @@ -204,4 +221,8 @@ int main(int argc, char** argv) printf("YUVBENCH_SWSCALE\n"); run_backend(yuvbench_swscale()); #endif +#ifdef YUVBENCH_CLAUDE + printf("YUVBENCH_CLAUDE\n"); + run_backend(yuvbench_claude()); +#endif } diff --git a/yuvbench/yuvbench_claude.c b/yuvbench/yuvbench_claude.c new file mode 100644 index 0000000..c8aae52 --- /dev/null +++ b/yuvbench/yuvbench_claude.c @@ -0,0 +1,159 @@ +#include "yuvbench.h" + +#ifdef YUVBENCH_CLAUDE + +#include +#include + +// BT.709 limited range, scale >>6 (factor=64): +// R = clip(( 75*(Y-16) + 119*(Cr-128) + 32) >> 6) +// G = clip(( 75*(Y-16) - 12*(Cb-128) - 30*(Cr-128) + 32) >> 6) +// B = clip(( 75*(Y-16) + 129*(Cb-128) + 32) >> 6) +// +// All int16 intermediates stay within [-16512, 31801] — no overflow. +// vqshrun_n_s16 gives saturating unsigned narrow + right shift in one instruction. + +// Coefficients: +// Y: 75 (1.164 * 64 = 74.5 → 75, err +0.67%) +// Cr_R: 119 (1.856 * 64 = 118.8 → 119, err +0.17%) +// Cb_B: 129 (2.016 * 64; libyuv uses 2.016 for BT.709 studio swing) +// Cb_G: 12 (0.187 * 64 = 12.0) +// Cr_G: 30 (0.468 * 64 = 30.0) + +// Convert 8 luma pixels using pre-widened, bias-subtracted chroma vectors. +// vcb = Cb - 128 (int16x8), vcr = Cr - 128 (int16x8) +static inline void convert_8px( + const uint8_t* __restrict__ y, + int16x8_t vcb, + int16x8_t vcr, + uint8_t* __restrict__ out) +{ + // Y - 16 + int16x8_t vy = vsubq_s16( + vreinterpretq_s16_u16(vmovl_u8(vld1_u8(y))), + vdupq_n_s16(16)); + + // ry = 75*(Y-16) + rounding + int16x8_t base = vaddq_s16(vmulq_n_s16(vy, 75), vdupq_n_s16(32)); + + // R = base + 119*Cr + int16x8_t r = vmlaq_n_s16(base, vcr, 119); + // G = base - 12*Cb - 30*Cr + int16x8_t g = vmlsq_n_s16(vmlsq_n_s16(base, vcb, 12), vcr, 30); + // B = base + 129*Cb + int16x8_t b = vmlaq_n_s16(base, vcb, 129); + + // Saturating narrow+shift: clamps to [0,255] and packs to uint8 + uint8x8x3_t rgb; + rgb.val[0] = vqshrun_n_s16(r, 6); + rgb.val[1] = vqshrun_n_s16(g, 6); + rgb.val[2] = vqshrun_n_s16(b, 6); + vst3_u8(out, rgb); // stores R0G0B0 R1G1B1 ... R7G7B7 +} + +// Process a contiguous range of row-pairs [row_start, row_end). +// row_start and row_end must be even. +static void convert_rows( + const uint8_t* __restrict__ Y, + const uint8_t* __restrict__ Cb, + const uint8_t* __restrict__ Cr, + uint8_t* __restrict__ RGB, + uint32_t w, + uint32_t row_start, + uint32_t row_end) +{ + for (uint32_t row = row_start; row < row_end; row += 2) { + const uint8_t* y0 = Y + row * w; + const uint8_t* y1 = y0 + w; + const uint8_t* cb = Cb + (row / 2) * (w / 2); + const uint8_t* cr = Cr + (row / 2) * (w / 2); + uint8_t* rgb0 = RGB + row * w * 3; + uint8_t* rgb1 = rgb0 + w * 3; + + uint32_t col = 0; + + // 16 pixels per inner iteration: 8 chroma samples shared across 2 rows. + for (; col + 16 <= w; col += 16) { + // Load 8 Cb/Cr bytes and upsample to 16 via interleave-with-self: + // [c0,c1,...,c7] → [c0,c0,c1,c1,...,c7,c7] + uint8x8x2_t cb_up = vzip_u8(vld1_u8(cb + col/2), vld1_u8(cb + col/2)); + uint8x8x2_t cr_up = vzip_u8(vld1_u8(cr + col/2), vld1_u8(cr + col/2)); + + // Widen and bias-subtract chroma for low 8 and high 8 pixels + int16x8_t vcb_lo = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(cb_up.val[0])), vdupq_n_s16(128)); + int16x8_t vcb_hi = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(cb_up.val[1])), vdupq_n_s16(128)); + int16x8_t vcr_lo = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(cr_up.val[0])), vdupq_n_s16(128)); + int16x8_t vcr_hi = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(cr_up.val[1])), vdupq_n_s16(128)); + + // Row 0 + convert_8px(y0 + col, vcb_lo, vcr_lo, rgb0 + col * 3); + convert_8px(y0 + col + 8, vcb_hi, vcr_hi, rgb0 + (col + 8) * 3); + // Row 1 — same chroma, different luma + convert_8px(y1 + col, vcb_lo, vcr_lo, rgb1 + col * 3); + convert_8px(y1 + col + 8, vcb_hi, vcr_hi, rgb1 + (col + 8) * 3); + } + + // Scalar tail for widths not divisible by 16 + for (; col < w; col += 2) { + int32_t cb2 = (int32_t)cb[col / 2] - 128; + int32_t cr2 = (int32_t)cr[col / 2] - 128; + for (uint32_t dy = 0; dy < 2; ++dy) { + const uint8_t* yr = (dy == 0) ? y0 : y1; + uint8_t* dst = (dy == 0) ? rgb0 : rgb1; + for (uint32_t dx = 0; dx < 2 && (col + dx) < w; ++dx) { + int32_t yv = (int32_t)yr[col + dx] - 16; + int32_t base = 75 * yv + 32; + int32_t r = base + 119 * cr2; + int32_t g = base - 12 * cb2 - 30 * cr2; + int32_t bv = base + 129 * cb2; + r = (r >> 6); r = r < 0 ? 0 : r > 255 ? 255 : r; + g = (g >> 6); g = g < 0 ? 0 : g > 255 ? 255 : g; + bv = (bv >> 6); bv = bv < 0 ? 0 : bv > 255 ? 255 : bv; + dst[(col + dx) * 3 + 0] = (uint8_t)r; + dst[(col + dx) * 3 + 1] = (uint8_t)g; + dst[(col + dx) * 3 + 2] = (uint8_t)bv; + } + } + } + } +} + +static bool yuvbench_claude_init(Ctx* ctx) +{ + return (ctx->inp_w % 2 == 0) && (ctx->inp_h % 2 == 0); +} + +static bool yuvbench_claude_convert(Ctx* ctx) +{ + const uint32_t w = ctx->inp_w; + const uint32_t h = ctx->inp_h; + const uint8_t* Y = (const uint8_t*)ctx->inp_buf; + const uint8_t* Cb = Y + (size_t)w * h; + const uint8_t* Cr = Cb + (size_t)(w / 2) * (h / 2); + uint8_t* RGB = (uint8_t*)ctx->out_buf; + + // Dispatch row-pairs across performance cores. + // dispatch_apply is synchronous: returns only after all blocks finish. + static const uint32_t NCHUNKS = 8; + uint32_t pairs = h / 2; + + dispatch_apply(NCHUNKS, + dispatch_get_global_queue(QOS_CLASS_USER_INTERACTIVE, 0), + ^(size_t tid) { + uint32_t start = (uint32_t)((tid * pairs / NCHUNKS) * 2); + uint32_t end = (uint32_t)(((tid+1) * pairs / NCHUNKS) * 2); + convert_rows(Y, Cb, Cr, RGB, w, start, end); + }); + + return true; +} + +Backend yuvbench_claude(void) +{ + Backend b = { 0 }; + b.init_fn = yuvbench_claude_init; + b.convert_fn = yuvbench_claude_convert; + return b; +} + +#endif // YUVBENCH_CLAUDE -- cgit v1.2.3