summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore2
-rwxr-xr-xmicroshooter/build.sh3
-rw-r--r--microshooter/microshooter.c606
-rwxr-xr-xvk-asylum/build-shaders.sh2
-rw-r--r--vk-asylum/main.c447
-rw-r--r--vk-asylum/shaders.h2
-rw-r--r--yuvbench/CLAUDE.md78
-rwxr-xr-xyuvbench/build-macos-aarch64-clang.sh5
-rw-r--r--yuvbench/yuvbench.c47
-rw-r--r--yuvbench/yuvbench_claude.c159
10 files changed, 1233 insertions, 118 deletions
diff --git a/.gitignore b/.gitignore
index 60adada..58ea676 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,5 +9,7 @@
bin
build
d3d11-re.*
+microshooter/microshooter_client*
+microshooter/microshooter_server*
rushmore-linux/out
rushmore-linux/linux*
diff --git a/microshooter/build.sh b/microshooter/build.sh
new file mode 100755
index 0000000..172c395
--- /dev/null
+++ b/microshooter/build.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+cc -o microshooter_client -O0 -g -fsanitize=address -Wall -Wextra -Wpedantic ./microshooter.c $(pkg-config --cflags --libs sdl3) -DCLIENT
+cc -o microshooter_server -O0 -g -fsanitize=address -Wall -Wextra -Wpedantic ./microshooter.c $(pkg-config --cflags --libs sdl3) -DSERVER
diff --git a/microshooter/microshooter.c b/microshooter/microshooter.c
new file mode 100644
index 0000000..e209c3b
--- /dev/null
+++ b/microshooter/microshooter.c
@@ -0,0 +1,606 @@
+#if !defined(CLIENT) && !defined(SERVER)
+# error Either CLIENT or SERVER must be defined
+#endif
+
+// ================================================================================
+// Core headers
+// ================================================================================
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// ================================================================================
+// Core helpers
+// ================================================================================
+
+#define COUNTOF(_X) (sizeof(_X) / sizeof((_X)[0]))
+#define UNUSED(_X) ((void)sizeof(_X))
+
+#ifdef __GNUC__
+# define NORETURN __attribute__((noreturn))
+# define PRINTF_FORMAT(_X, _Y) __attribute__((format(printf, _X, _Y)))
+#endif
+
+// ================================================================================
+// Game -> OS API
+// ================================================================================
+
+NORETURN
+void OS_SpewError(const char *message);
+
+void OS_SpewInfo(const char *message);
+
+// ================================================================================
+// OS -> Client API
+// ================================================================================
+
+typedef struct CL_InitParams CL_InitParams;
+struct CL_InitParams
+{
+ void *(*glproc)(const char *);
+};
+
+void CL_Init(const CL_InitParams *params);
+
+void CL_Render(int vp_width, int vp_height);
+
+// ================================================================================
+// OS -> Server API
+// ================================================================================
+
+typedef struct SV_InitParams SV_InitParams;
+struct SV_InitParams
+{
+ const char *hostname;
+};
+
+void SV_Init(const SV_InitParams *params);
+
+// ================================================================================
+// Shared utility code
+// ================================================================================
+
+#define HEAP_FMT(_STR, _FMT) \
+ do { \
+ va_list va; va_start(va, _FMT); \
+ int len = vsnprintf(0, 0, _FMT, va) + 1; \
+ va_end(va); \
+ _STR = calloc(len, 1); \
+ va_start(va, _FMT); \
+ vsnprintf(_STR, len, _FMT, va); \
+ va_end(va); \
+ } while (0);
+
+NORETURN PRINTF_FORMAT(1, 2)
+void Error(const char *fmt, ...)
+{
+ char *str = 0;
+ HEAP_FMT(str, fmt);
+ OS_SpewError(str);
+}
+
+PRINTF_FORMAT(1, 2)
+void Info(const char *fmt, ...)
+{
+ char *str = 0;
+ HEAP_FMT(str, fmt);
+ OS_SpewInfo(str);
+ free(str);
+}
+
+#undef HEAP_FMT
+
+// ================================================================================
+// Math
+// ================================================================================
+
+typedef struct Mat4 Mat4;
+struct Mat4
+{
+ float m[4 * 4];
+};
+#define MAT4(...) (Mat4){ .m = { __VA_ARGS__ }, }
+
+#if 0
+static inline Mat4 Mat4_Ident(void)
+{
+ return MAT4(
+ 1, 0, 0, 0,
+ 0, 1, 0, 0,
+ 0, 0, 1, 0,
+ 0, 0, 0, 1,
+ );
+}
+#endif
+
+static inline Mat4 Mat4_Ortho(float l, float r, float t, float b, float n, float f)
+{
+ // ref: https://www.scratchapixel.com/lessons/3d-basic-rendering/perspective-and-orthographic-projection-matrix/orthographic-projection-matrix.html
+ return MAT4(
+ 2.0f / (r - l), 0.0f, 0.0f, 0.0f,
+ 0.0f, 2.0f / (t - b), 0.0f, 0.0f,
+ 0.0f, 0.0f, -2.0f / (f - n), 0.0f,
+ -((r + l) / (r - l)), -((t + b) / (t - b)), -((f + n) / (f - n)), 1.0f,
+ );
+}
+
+// ================================================================================
+// OpenGL
+// ================================================================================
+
+// ref: https://registry.khronos.org/OpenGL/api/GL/glcorearb.h
+
+#define GL_FALSE 0
+#define GL_TRUE 1
+
+#define GL_TRIANGLES 0x0004
+
+#define GL_FRAGMENT_SHADER 0x8B30
+#define GL_VERTEX_SHADER 0x8B31
+
+#define GL_UNSIGNED_SHORT 0x1403
+#define GL_FLOAT 0x1406
+
+#define GL_ARRAY_BUFFER 0x8892
+#define GL_ELEMENT_ARRAY_BUFFER 0x8893
+
+#define GL_COLOR_BUFFER_BIT 0x00004000
+#define GL_DEPTH_BUFFER_BIT 0x00000100
+
+#define GL_VENDOR 0x1F00
+#define GL_RENDERER 0x1F01
+
+#define GL_NO_ERROR 0
+#define GL_INVALID_ENUM 0x0500
+#define GL_INVALID_VALUE 0x0501
+#define GL_INVALID_OPERATION 0x0502
+#define GL_OUT_OF_MEMORY 0x0505
+
+#define GL_STATIC_DRAW 0x88E4
+#define GL_STREAM_DRAW 0x88E0
+
+#define GL_COMPILE_STATUS 0x8B81
+#define GL_LINK_STATUS 0x8B82
+
+typedef unsigned int GLenum;
+typedef float GLfloat;
+typedef int GLint;
+typedef int GLsizei;
+typedef unsigned int GLbitfield;
+typedef unsigned int GLuint;
+typedef unsigned char GLboolean;
+typedef uint8_t GLubyte;
+typedef size_t GLsizeiptr;
+typedef intptr_t GLintptr;
+
+#define GLFUNCS \
+ X(glAttachShader, void, GLuint, GLuint) \
+ X(glBindBuffer, void, GLenum, GLuint) \
+ X(glBindVertexArray, void, GLuint) \
+ X(glBufferData, void, GLenum, GLsizeiptr, const void *, GLenum) \
+ X(glBufferSubData, void, GLenum, GLintptr, GLsizeiptr, const void *) \
+ X(glClear, void, GLbitfield) \
+ X(glClearColor, void, GLfloat, GLfloat, GLfloat, GLfloat) \
+ X(glCompileShader, void, GLuint) \
+ X(glCreateProgram, GLuint, void) \
+ X(glCreateShader, GLuint, GLenum) \
+ X(glDrawElements, void, GLenum, GLsizei, GLenum, const void *) \
+ X(glEnableVertexAttribArray, void, GLuint) \
+ X(glGenBuffers, void, GLsizei, GLuint *) \
+ X(glGenVertexArrays, void, GLsizei, GLuint *) \
+ X(glGetError, GLenum, void) \
+ X(glGetProgramInfoLog, void, GLuint, GLsizei, GLsizei *, char *) \
+ X(glGetProgramiv, void, GLuint, GLenum, GLint *) \
+ X(glGetShaderInfoLog, void, GLuint, GLsizei, GLsizei *, char *) \
+ X(glGetShaderiv, void, GLuint, GLenum, GLint *) \
+ X(glGetString, const GLubyte *, GLenum) \
+ X(glGetUniformLocation, GLint, GLuint, const char *) \
+ X(glLinkProgram, void, GLuint) \
+ X(glShaderSource, void, GLuint, GLsizei, const char *const *, const GLint *) \
+ X(glUniformMatrix4fv, void, GLint, GLsizei, GLboolean, const GLfloat *) \
+ X(glUseProgram, void, GLuint) \
+ X(glVertexAttribPointer, void, GLuint, GLint, GLenum, GLboolean, GLsizei, const void *) \
+ X(glViewport, void, GLint, GLint, GLsizei, GLsizei)
+
+#define X(_NAME, _RET, ...) static _RET (*_NAME)(__VA_ARGS__) = 0;
+GLFUNCS
+#undef X
+
+void LoadOpenGLFunctions(void *(*glproc)(const char *))
+{
+ #define X(_NAME, _RET, ...) \
+ _NAME = (_RET(*)(__VA_ARGS__))glproc(#_NAME); \
+ if (!_NAME) {Error("Failed to load OpenGL function %s", #_NAME); }
+ GLFUNCS
+ #undef X
+}
+
+#undef GLFUNCS
+
+void CheckGLError(unsigned int line, const char *code)
+{
+ GLenum error = glGetError();
+ if (error != GL_NO_ERROR)
+ {
+ const char *error_str = "unknown error";
+ switch (error)
+ {
+#define BIND_ERROR(_ERR) case _ERR: error_str = #_ERR; break;
+ BIND_ERROR(GL_INVALID_ENUM);
+ BIND_ERROR(GL_INVALID_VALUE);
+ BIND_ERROR(GL_INVALID_OPERATION);
+ BIND_ERROR(GL_OUT_OF_MEMORY);
+#undef BIND_ERROR
+ };
+ Error("%s:%d: %s generated %s", __FILE__, line, code, error_str);
+ }
+}
+
+#define GL(_CODE) _CODE; CheckGLError(__LINE__, #_CODE)
+
+// ================================================================================
+// Batched quad renderer
+// ================================================================================
+
+#define MAX_QUADS 256
+
+typedef struct R_Quad R_Quad;
+struct R_Quad
+{
+ float p[2];
+ float t[2];
+};
+
+typedef struct R_QuadRenderer R_QuadRenderer;
+struct R_QuadRenderer
+{
+ GLuint vao;
+ GLuint vbo;
+ GLuint ibo;
+
+ GLuint shader;
+
+ R_Quad vbuf[MAX_QUADS * 4];
+ size_t head;
+};
+
+void R_InitQuads(R_QuadRenderer *qr, GLuint shader)
+{
+ qr->shader = shader;
+
+ GL(glGenVertexArrays(1, &qr->vao));
+ GL(glBindVertexArray(qr->vao));
+
+ GL(glGenBuffers(1, &qr->vbo));
+ GL(glBindBuffer(GL_ARRAY_BUFFER, qr->vbo));
+ GL(glBufferData(GL_ARRAY_BUFFER, sizeof(qr->vbuf), qr->vbuf, GL_STREAM_DRAW));
+
+ uint16_t ibuf[MAX_QUADS * 6];
+ for (int i = 0; i < MAX_QUADS; ++i)
+ {
+ ibuf[6*i+0] = 4*i+0;
+ ibuf[6*i+1] = 4*i+1;
+ ibuf[6*i+2] = 4*i+2;
+ ibuf[6*i+3] = 4*i+0;
+ ibuf[6*i+4] = 4*i+2;
+ ibuf[6*i+5] = 4*i+3;
+ }
+
+ GL(glGenBuffers(1, &qr->ibo));
+ GL(glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, qr->ibo));
+ GL(glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(ibuf), ibuf, GL_STATIC_DRAW));
+
+ GL(glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, sizeof(R_Quad), (const void *)offsetof(R_Quad, p)));
+ GL(glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, sizeof(R_Quad), (const void *)offsetof(R_Quad, t)));
+ GL(glEnableVertexAttribArray(0));
+ GL(glEnableVertexAttribArray(1));
+}
+
+void R_FlushQuads(R_QuadRenderer *qr)
+{
+ if (qr->head > 0)
+ {
+ GL(glBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(R_Quad) * 4 * qr->head, qr->vbuf));
+ GL(glDrawElements(GL_TRIANGLES, qr->head * 6, GL_UNSIGNED_SHORT, 0));
+ qr->head = 0;
+ }
+}
+
+void R_PushQuad(R_QuadRenderer *qr, const R_Quad *quad4)
+{
+ qr->vbuf[qr->head*4+0] = quad4[0];
+ qr->vbuf[qr->head*4+1] = quad4[1];
+ qr->vbuf[qr->head*4+2] = quad4[2];
+ qr->vbuf[qr->head*4+3] = quad4[3];
+ ++qr->head;
+ if (qr->head >= MAX_QUADS)
+ {
+ R_FlushQuads(qr);
+ }
+}
+
+void R_BeginQuads(R_QuadRenderer *qr, int vp_w, int vp_h)
+{
+ GL(glBindVertexArray(qr->vao));
+ GL(glBindBuffer(GL_ARRAY_BUFFER, qr->vbo));
+ GL(glUseProgram(qr->shader));
+
+ Mat4 proj = Mat4_Ortho(0, vp_w, 0, vp_h, 0, 10);
+ GL(glUniformMatrix4fv(glGetUniformLocation(qr->shader, "u_proj"), 1, GL_FALSE, proj.m));
+}
+
+void R_EndQuads(R_QuadRenderer *qr)
+{
+ R_FlushQuads(qr);
+}
+
+// ================================================================================
+// Client code
+// ================================================================================
+
+static struct
+{
+ R_QuadRenderer qr;
+ int width;
+ int height;
+} cl = { 0 };
+
+void CL_Init(const CL_InitParams *params)
+{
+ cl.width = -1;
+ cl.height = -1;
+
+ LoadOpenGLFunctions(params->glproc);
+ Info("Loaded OpenGL: GL_VENDOR = %s, GL_RENDERER = %s", glGetString(GL_VENDOR), glGetString(GL_RENDERER));
+
+ GLuint prog_quad;
+
+#define GLSL(_X) "#version 330 core\n" #_X
+ const char* quad_vs = GLSL(
+ layout (location = 0) in vec2 p;
+ layout (location = 1) in vec2 t;
+
+ uniform mat4 u_proj;
+
+ void main()
+ {
+ gl_Position = u_proj * vec4(p.x, p.y, 0.0, 1.0);
+ }
+ );
+ const char* quad_fs = GLSL(
+ out vec4 out_color;
+
+ void main()
+ {
+ out_color = vec4(1.0, 0.0, 0.0, 1.0);
+ }
+ );
+#undef GLSL
+
+ struct { GLuint* prog; const char *vsrc; const char *fsrc; } reg[] =
+ {
+ { .prog = &prog_quad, .vsrc = quad_vs, .fsrc = quad_fs, },
+ };
+ for (size_t i = 0; i < COUNTOF(reg); ++i)
+ {
+ GLuint prog = GL(glCreateProgram());
+
+ GLuint vs = GL(glCreateShader(GL_VERTEX_SHADER));
+ GL(glShaderSource(vs, 1, &reg[i].vsrc, 0));
+
+ GLuint fs = GL(glCreateShader(GL_FRAGMENT_SHADER));
+ GL(glShaderSource(fs, 1, &reg[i].fsrc, 0));
+
+ GLuint shaders[] = { vs, fs };
+ for (size_t j = 0; j < COUNTOF(shaders); ++j)
+ {
+ GL(glCompileShader(shaders[j]));
+
+ GLint status = GL_TRUE;
+ GL(glGetShaderiv(shaders[j], GL_COMPILE_STATUS, &status));
+ if (status != GL_TRUE)
+ {
+ char shader_log[1024];
+ GL(glGetShaderInfoLog(shaders[j], sizeof(shader_log), 0, shader_log));
+ Error("Failed to compile shader: %s", shader_log);
+ }
+
+ GL(glAttachShader(prog, shaders[j]));
+ }
+
+ GL(glLinkProgram(prog));
+
+ GLint status = GL_TRUE;
+ GL(glGetProgramiv(prog, GL_LINK_STATUS, &status));
+ if (status != GL_TRUE)
+ {
+ char shader_log[1024];
+ GL(glGetProgramInfoLog(prog, sizeof(shader_log), 0, shader_log));
+ Error("Failed to link shader: %s", shader_log);
+ }
+
+ *reg[i].prog = prog;
+ }
+
+ R_InitQuads(&cl.qr, prog_quad);
+}
+
+void CL_Render(int vp_width, int vp_height)
+{
+ if (cl.width != vp_width || cl.height != vp_height)
+ {
+ cl.width = vp_width;
+ cl.height = vp_height;
+ Info("Resized to %dx%d", cl.width, cl.height);
+ }
+
+ GL(glViewport(0, 0, vp_width, vp_height));
+ GL(glClearColor(0.1f, 0.1f, 0.1f, 1.0f));
+ GL(glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT));
+
+ R_BeginQuads(&cl.qr, cl.width, cl.height);
+ R_Quad rect[4] =
+ {
+ (R_Quad){ .p = { 0.0, 0.0 }, .t = { 0, 0 } },
+ (R_Quad){ .p = { 100, 0.0 }, .t = { 0, 0 } },
+ (R_Quad){ .p = { 100, 100 }, .t = { 0, 0 } },
+ (R_Quad){ .p = { 0.0, 100 }, .t = { 0, 0 } },
+ };
+ R_PushQuad(&cl.qr, rect);
+ R_EndQuads(&cl.qr);
+}
+
+// ================================================================================
+// Server code
+// ================================================================================
+
+void SV_Init(const SV_InitParams *params)
+{
+ Info("Starting server %s", params->hostname);
+}
+
+// ================================================================================
+// SDL3 shared
+// ================================================================================
+
+#ifdef CLIENT
+# define SDL_MAIN_USE_CALLBACKS
+#endif
+#include <SDL3/SDL.h>
+#include <SDL3/SDL_main.h>
+
+void OS_SpewError(const char *message)
+{
+ printf("Error: %s\n", message);
+#ifdef CLIENT
+ SDL_ShowSimpleMessageBox(SDL_MESSAGEBOX_ERROR, "MicroShooter: Error!", message, 0);
+#endif
+ exit(1);
+}
+
+void OS_SpewInfo(const char *message)
+{
+ printf("%s\n", message);
+}
+
+// ================================================================================
+// SDL3 client
+// ================================================================================
+
+#ifdef CLIENT
+
+typedef struct AppState AppState;
+struct AppState
+{
+ SDL_Window *wnd;
+ SDL_GLContext glctx;
+};
+
+void RenderFrame(AppState *app)
+{
+ int vp_width = 0;
+ int vp_height = 0;
+ SDL_GetWindowSizeInPixels(app->wnd, &vp_width, &vp_height);
+ CL_Render(vp_width, vp_height);
+ SDL_GL_SwapWindow(app->wnd);
+}
+
+SDL_AppResult SDL_AppInit(void **appstate, int argc, char **argv)
+{
+ UNUSED(argc);
+ UNUSED(argv);
+
+ *appstate = calloc(1, sizeof(AppState));
+ AppState *app = *appstate;
+
+ if (!SDL_Init(SDL_INIT_VIDEO | SDL_INIT_EVENTS))
+ {
+ Error("Failed to initialize SDL: %s", SDL_GetError());
+ }
+
+ SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 3);
+ SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 3);
+ SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
+
+ app->wnd = SDL_CreateWindow("MicroShooter", 800, 600, SDL_WINDOW_HIDDEN | SDL_WINDOW_RESIZABLE | SDL_WINDOW_OPENGL);
+ if (!app->wnd)
+ {
+ Error("Failed to create window: %s", SDL_GetError());
+ }
+
+ app->glctx = SDL_GL_CreateContext(app->wnd);
+ if (!app->glctx)
+ {
+ Error("Failed to create OpenGL context: %s", SDL_GetError());
+ }
+
+ CL_InitParams init = (CL_InitParams)
+ {
+ .glproc = (void *(*)(const char *))SDL_GL_GetProcAddress,
+ };
+ CL_Init(&init);
+
+ // Render one frame before showing the window
+ RenderFrame(app);
+
+ SDL_ShowWindow(app->wnd);
+
+ return SDL_APP_CONTINUE;
+}
+
+SDL_AppResult SDL_AppEvent(void *appstate, SDL_Event *event)
+{
+ UNUSED(appstate);
+
+ SDL_AppResult result = SDL_APP_CONTINUE;
+
+ switch (event->type)
+ {
+ case SDL_EVENT_QUIT:
+ {
+ result = SDL_APP_SUCCESS;
+ break;
+ }
+ }
+
+ return result;
+}
+
+SDL_AppResult SDL_AppIterate(void *appstate)
+{
+ AppState *app = appstate;
+
+ RenderFrame(app);
+
+ return SDL_APP_CONTINUE;
+}
+
+void SDL_AppQuit(void *appstate, SDL_AppResult result)
+{
+ UNUSED(result);
+
+ AppState *app = appstate;
+
+ SDL_DestroyWindow(app->wnd);
+ SDL_Quit();
+}
+
+#endif // CLIENT
+
+// ================================================================================
+// SDL3 server
+// ================================================================================
+
+#ifdef SERVER
+
+int main(int argc, char **argv)
+{
+ UNUSED(argc);
+ UNUSED(argv);
+ printf("<server>\n");
+}
+
+#endif // SERVER
+
diff --git a/vk-asylum/build-shaders.sh b/vk-asylum/build-shaders.sh
index 51d4475..780ab86 100755
--- a/vk-asylum/build-shaders.sh
+++ b/vk-asylum/build-shaders.sh
@@ -3,4 +3,4 @@ mkdir -p build
glslc -fshader-stage=vert ./triangle_vs.glsl -o build/triangle_vs.spv
glslc -fshader-stage=frag ./triangle_fs.glsl -o build/triangle_fs.spv
echo "const uint32_t triangle_vert_spv[] = { "$(hexdump -v -e '1/4 "0x%08x, "' build/triangle_vs.spv)" };" > shaders.h
-echo "const uint32_t triangle_frag_spv[] = { "$(hexdump -v -e '1/4 "0x%08x, "' build/triangle_vs.spv)" };" >> shaders.h
+echo "const uint32_t triangle_frag_spv[] = { "$(hexdump -v -e '1/4 "0x%08x, "' build/triangle_fs.spv)" };" >> shaders.h
diff --git a/vk-asylum/main.c b/vk-asylum/main.c
index 74e86e2..ef8410b 100644
--- a/vk-asylum/main.c
+++ b/vk-asylum/main.c
@@ -17,12 +17,27 @@
#define WND_W 1024
#define WND_H 768
+typedef struct Vec3 Vec3;
+struct Vec3
+{
+ float x;
+ float y;
+ float z;
+};
+
+typedef struct Vertex Vertex;
+struct Vertex
+{
+ Vec3 p;
+ Vec3 c;
+};
+
int main(int argc, char* argv[])
{
(void)argc; (void)argv;
SDL_Init(SDL_INIT_VIDEO);
- SDL_Window* wnd = SDL_CreateWindow("vk-asylum", WND_W, WND_H, SDL_WINDOW_VULKAN);
+ SDL_Window* wnd = SDL_CreateWindow("vk-asylum", WND_W, WND_H, SDL_WINDOW_VULKAN | SDL_WINDOW_HIGH_PIXEL_DENSITY);
ASSERT(wnd);
VkResult r = VK_SUCCESS;
@@ -33,22 +48,35 @@ int main(int argc, char* argv[])
.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
.apiVersion = VK_API_VERSION_1_4,
};
+ const char* layers[] = { "VK_LAYER_KHRONOS_validation" };
VkInstanceCreateInfo vkici = (VkInstanceCreateInfo){
.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
.pApplicationInfo = &vkai,
+ .enabledLayerCount = 1,
+ .ppEnabledLayerNames = layers,
};
- vkici.ppEnabledExtensionNames = SDL_Vulkan_GetInstanceExtensions(&vkici.enabledExtensionCount);
- printf("Instance extensions requested by SDL:\n");
- for (uint32_t i = 0; i < vkici.enabledExtensionCount; ++i) {
- printf(" %s\n", vkici.ppEnabledExtensionNames[i]);
+ // Extensions required for the SDL window backend
+ uint32_t num_sdl_extensions = 0;
+ const char* const* sdl_extensions = SDL_Vulkan_GetInstanceExtensions(&num_sdl_extensions);
+
+ const char* extensions[64] = { 0 };
+ for (; vkici.enabledExtensionCount < num_sdl_extensions; ++vkici.enabledExtensionCount) {
+ extensions[vkici.enabledExtensionCount] = sdl_extensions[vkici.enabledExtensionCount];
}
#ifdef __APPLE__
// Required for instance extension VK_KHR_portability_enumeration (MoltenVK)
vkici.flags |= VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR;
+ extensions[vkici.enabledExtensionCount++] = VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME;
#endif
+ vkici.ppEnabledExtensionNames = extensions;
+ printf("Instance extensions requested by SDL:\n");
+ for (uint32_t i = 0; i < vkici.enabledExtensionCount; ++i) {
+ printf(" %s\n", vkici.ppEnabledExtensionNames[i]);
+ }
+
if ((r = vkCreateInstance(&vkici, 0, &vk)) != VK_SUCCESS) {
DIE("Failed to create Vulkan instance: %s", VkResultToString(r));
}
@@ -59,7 +87,7 @@ int main(int argc, char* argv[])
DIE("Failed to create Vulkan surface: %s", SDL_GetError());
}
- VkPhysicalDevice vk_pdev = 0;
+ VkPhysicalDevice vk_pdev = VK_NULL_HANDLE;
{
uint32_t vk_pdev_count = 0;
vkEnumeratePhysicalDevices(vk, &vk_pdev_count, 0);
@@ -69,19 +97,31 @@ int main(int argc, char* argv[])
VkPhysicalDevice* vk_pdevs = calloc(vk_pdev_count, sizeof(VkPhysicalDevice));
vkEnumeratePhysicalDevices(vk, &vk_pdev_count, vk_pdevs);
+
+ // Select the first device with dynamic rendering support
// Just pick the first available physical device
vk_pdev = vk_pdevs[0];
printf("Vulkan devices:\n");
for (uint32_t i = 0; i < vk_pdev_count; ++i) {
- VkPhysicalDeviceProperties vk_pdev_props = { 0 };
- vkGetPhysicalDeviceProperties(vk_pdev, &vk_pdev_props);
+ VkPhysicalDeviceProperties vk_pdev_props = { 0 };
+ vkGetPhysicalDeviceProperties(vk_pdevs[i], &vk_pdev_props);
+ VkPhysicalDeviceDynamicRenderingFeatures vk_pdev_dr = (VkPhysicalDeviceDynamicRenderingFeatures){
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DYNAMIC_RENDERING_FEATURES,
+ };
+ VkPhysicalDeviceFeatures2 vk_pdev_features = (VkPhysicalDeviceFeatures2){
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
+ .pNext = &vk_pdev_dr,
+ };
+ vkGetPhysicalDeviceFeatures2(vk_pdevs[i], &vk_pdev_features);
printf(" %s\n", vk_pdev_props.deviceName);
+ printf(" dynamic rendering? %s\n", vk_pdev_dr.dynamicRendering ? "YES" : "NO");
}
free(vk_pdevs);
}
+ ASSERT(vk_pdev != VK_NULL_HANDLE);
uint32_t vk_qf_index = (uint32_t)-1;
{
@@ -113,14 +153,25 @@ int main(int argc, char* argv[])
.queueCount = 1,
.pQueuePriorities = &priority,
};
-
- const char* device_extensions[] = { VK_KHR_SWAPCHAIN_EXTENSION_NAME, };
+
+ VkPhysicalDeviceDynamicRenderingFeatures dynamic_rendering = (VkPhysicalDeviceDynamicRenderingFeatures){
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DYNAMIC_RENDERING_FEATURES,
+ .dynamicRendering = VK_TRUE,
+ };
+
+ const char* device_extensions[] = {
+ VK_KHR_SWAPCHAIN_EXTENSION_NAME,
+#ifdef __APPLE__
+ "VK_KHR_portability_subset",
+#endif
+ };
VkDeviceCreateInfo device_create = (VkDeviceCreateInfo){
.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
.queueCreateInfoCount = 1,
.pQueueCreateInfos = &queue_create,
.enabledExtensionCount = COUNTOF(device_extensions),
.ppEnabledExtensionNames = device_extensions,
+ .pNext = &dynamic_rendering,
};
if ((r = vkCreateDevice(vk_pdev, &device_create, 0, &vk_dev)) != VK_SUCCESS) {
@@ -169,11 +220,13 @@ int main(int argc, char* argv[])
uint32_t vk_swap_image_count = 0;
VkImage* vk_swap_images = 0;
VkImageView* vk_swap_image_views = 0;
+ VkFence* vk_swap_fences = 0;
{
vkGetSwapchainImagesKHR(vk_dev, vk_swp, &vk_swap_image_count, 0);
ASSERT(vk_swap_image_count > 0);
vk_swap_images = calloc(vk_swap_image_count, sizeof(VkImage));
vkGetSwapchainImagesKHR(vk_dev, vk_swp, &vk_swap_image_count, vk_swap_images);
+ vk_swap_fences = calloc(vk_swap_image_count, sizeof(VkFence));
vk_swap_image_views = calloc(vk_swap_image_count, sizeof(VkImageView));
for (uint32_t i = 0; i < vk_swap_image_count; ++i) {
@@ -195,69 +248,87 @@ int main(int argc, char* argv[])
if ((r = vkCreateImageView(vk_dev, &image_view_create, 0, &vk_swap_image_views[i])) != VK_SUCCESS) {
DIE("Failed to create image view #%u: %s", i, VkResultToString(r));
}
+
+ VkFenceCreateInfo fence_create = (VkFenceCreateInfo){
+ .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
+ .flags = VK_FENCE_CREATE_SIGNALED_BIT, // start in signaled state
+ };
+ if ((r = vkCreateFence(vk_dev, &fence_create, 0, &vk_swap_fences[i])) != VK_SUCCESS) {
+ DIE("Failed to create swap image fence #%u: %s", i, VkResultToString(r));
+ }
}
}
-
- VkRenderPass vk_pass = 0;
- {
- VkAttachmentDescription attachment = (VkAttachmentDescription){
- .format = VK_FORMAT_B8G8R8A8_SRGB,
- .samples = VK_SAMPLE_COUNT_1_BIT,
- .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR,
- .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
- .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE,
- .stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE,
- .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
- .finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
- };
- VkAttachmentReference color_ref = (VkAttachmentReference){
- .attachment = 0,
- .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
- };
- VkSubpassDescription subpass = (VkSubpassDescription){
- .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
- .colorAttachmentCount = 1,
- .pColorAttachments = &color_ref,
- };
- VkSubpassDependency dep = (VkSubpassDependency){
- .srcSubpass = VK_SUBPASS_EXTERNAL,
- .dstSubpass = 0,
- .srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
- .dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
- .dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
- };
- VkRenderPassCreateInfo render_pass_create = (VkRenderPassCreateInfo){
- .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
- .attachmentCount = 1,
- .pAttachments = &attachment,
- .subpassCount = 1,
- .pSubpasses = &subpass,
- .dependencyCount = 1,
- .pDependencies = &dep,
- };
- if ((r = vkCreateRenderPass(vk_dev, &render_pass_create, 0, &vk_pass)) != VK_SUCCESS) {
- DIE("Failed to create render pass: %s", VkResultToString(r));
- }
- }
-
- VkFramebuffer* vk_framebuffers = calloc(vk_swap_image_count, sizeof(VkFramebuffer));
- {
- for (uint32_t i = 0; i < vk_swap_image_count; ++i) {
- VkImageView attachments[] = { vk_swap_image_views[i] };
- VkFramebufferCreateInfo framebuffer_create = (VkFramebufferCreateInfo){
- .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
- .renderPass = vk_pass,
- .attachmentCount = 1,
- .pAttachments = attachments,
- .width = vk_extent.width,
- .height = vk_extent.height,
- .layers = 1,
- };
- if ((r = vkCreateFramebuffer(vk_dev, &framebuffer_create, 0, &vk_framebuffers[i])) != VK_SUCCESS) {
- DIE("Failed to create render pass #%u: %s", i, VkResultToString(r));
- }
- }
- }
+
+ VkShaderModule vk_vshader = VK_NULL_HANDLE;
+ VkShaderModule vk_fshader = VK_NULL_HANDLE;
+ {
+#include "shaders.h"
+ VkShaderModuleCreateInfo shader_create = (VkShaderModuleCreateInfo){
+ .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
+ };
+ shader_create.codeSize = sizeof(triangle_vert_spv);
+ shader_create.pCode = triangle_vert_spv;
+ if ((r = vkCreateShaderModule(vk_dev, &shader_create, 0, &vk_vshader)) != VK_SUCCESS) {
+ DIE("Failed to create vertex shader module: %s", VkResultToString(r));
+ }
+ shader_create.codeSize = sizeof(triangle_frag_spv);
+ shader_create.pCode = triangle_frag_spv;
+ if ((r = vkCreateShaderModule(vk_dev, &shader_create, 0, &vk_fshader)) != VK_SUCCESS) {
+ DIE("Failed to create fragment shader module: %s", VkResultToString(r));
+ }
+ }
+
+ VkBuffer vk_vbuf = VK_NULL_HANDLE;
+ {
+ Vertex vdata[] = {
+ (Vertex){ .p = (Vec3){ 0.0f, -0.5f, 0.0f, }, .c = (Vec3){ 1.0f, 0.0f, 0.0f } },
+ (Vertex){ .p = (Vec3){ 0.5f, 0.5f, 0.0f, }, .c = (Vec3){ 0.0f, 1.0f, 0.0f } },
+ (Vertex){ .p = (Vec3){ -0.5f, 0.5f, 0.0f, }, .c = (Vec3){ 0.0f, 0.0f, 1.0f } },
+ };
+ VkBufferCreateInfo buffer_create = (VkBufferCreateInfo){
+ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+ .size = sizeof(vdata),
+ .usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
+ .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+ };
+ if ((r = vkCreateBuffer(vk_dev, &buffer_create, 0, &vk_vbuf)) != VK_SUCCESS) {
+ DIE("Failed to create vertex buffer: %s", VkResultToString(r));
+ }
+
+ VkMemoryRequirements memreq;
+ vkGetBufferMemoryRequirements(vk_dev, vk_vbuf, &memreq);
+
+ VkMemoryAllocateInfo alloc_info = (VkMemoryAllocateInfo){
+ .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+ .allocationSize = memreq.size,
+ .memoryTypeIndex = UINT32_MAX,
+ };
+
+ VkPhysicalDeviceMemoryProperties memprops;
+ vkGetPhysicalDeviceMemoryProperties(vk_pdev, &memprops);
+ for (uint32_t i = 0; i < memprops.memoryTypeCount; ++i) {
+ if (memreq.memoryTypeBits & (1 << i)) {
+ const uint32_t needed = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+ if ((memprops.memoryTypes[i].propertyFlags & needed) == needed) {
+ alloc_info.memoryTypeIndex = i;
+ break;
+ }
+ }
+ }
+ ASSERT(alloc_info.memoryTypeIndex != UINT32_MAX);
+
+ VkDeviceMemory vkmem;
+ if ((r = vkAllocateMemory(vk_dev, &alloc_info, 0, &vkmem)) != VK_SUCCESS) {
+ DIE("Failed to allocate GPU memory for vertex buffer: %s", VkResultToString(r));
+ }
+
+ vkBindBufferMemory(vk_dev, vk_vbuf, vkmem, 0);
+
+ void* data = 0;
+ vkMapMemory(vk_dev, vkmem, 0, sizeof(vdata), 0, &data);
+ memcpy(data, vdata, sizeof(vdata));
+ vkUnmapMemory(vk_dev, vkmem);
+ }
VkPipeline vk_pipeline = 0;
{
@@ -268,6 +339,128 @@ int main(int argc, char* argv[])
if ((r = vkCreatePipelineLayout(vk_dev, &pipeline_layout_create, 0, &pipeline_layout)) != VK_SUCCESS) {
DIE("Failed to create pipeline layout: %s", VkResultToString(r));
}
+
+ VkPipelineShaderStageCreateInfo stages[] = {
+ (VkPipelineShaderStageCreateInfo){
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+ .stage = VK_SHADER_STAGE_VERTEX_BIT,
+ .module = vk_vshader,
+ .pName = "main",
+ },
+ (VkPipelineShaderStageCreateInfo){
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+ .stage = VK_SHADER_STAGE_FRAGMENT_BIT,
+ .module = vk_fshader,
+ .pName = "main",
+ }
+ };
+
+ VkPipelineInputAssemblyStateCreateInfo input_assembly = (VkPipelineInputAssemblyStateCreateInfo){
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+ .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
+ };
+
+ VkViewport viewport = (VkViewport){
+ .width = vk_extent.width,
+ .height = vk_extent.height,
+ .maxDepth = 1.0f,
+ };
+
+ VkRect2D scissor = (VkRect2D){
+ .extent = vk_extent,
+ };
+
+ VkPipelineViewportStateCreateInfo viewport_state_create = (VkPipelineViewportStateCreateInfo){
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
+ .viewportCount = 1,
+ .pViewports = &viewport,
+ .scissorCount = 1,
+ .pScissors = &scissor,
+ };
+
+ VkPipelineRasterizationStateCreateInfo raster_state_create = (VkPipelineRasterizationStateCreateInfo){
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
+ .polygonMode = VK_POLYGON_MODE_FILL,
+ .cullMode = VK_CULL_MODE_NONE,
+ .frontFace = VK_FRONT_FACE_CLOCKWISE,
+ .lineWidth = 1.0,
+ };
+
+ VkPipelineMultisampleStateCreateInfo multisample_state = (VkPipelineMultisampleStateCreateInfo){
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
+ .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
+ };
+
+ VkPipelineColorBlendAttachmentState blending_attach = (VkPipelineColorBlendAttachmentState){
+ .colorWriteMask = 0x0F,
+ };
+
+ VkPipelineColorBlendStateCreateInfo blending = (VkPipelineColorBlendStateCreateInfo){
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
+ .attachmentCount = 1,
+ .pAttachments = &blending_attach,
+ };
+
+ VkFormat color_format = VK_FORMAT_B8G8R8A8_SRGB;
+ VkPipelineRenderingCreateInfo rendering = (VkPipelineRenderingCreateInfo){
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_RENDERING_CREATE_INFO,
+ .colorAttachmentCount = 1,
+ .pColorAttachmentFormats = &color_format,
+ };
+
+ VkPipelineDepthStencilStateCreateInfo depth_stencil = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
+ .depthTestEnable = VK_FALSE,
+ .depthWriteEnable = VK_FALSE,
+ .depthCompareOp = VK_COMPARE_OP_ALWAYS,
+ };
+
+ VkVertexInputBindingDescription input_binding = (VkVertexInputBindingDescription){
+ .binding = 0,
+ .stride = sizeof(Vertex),
+ .inputRate = VK_VERTEX_INPUT_RATE_VERTEX,
+ };
+
+ VkVertexInputAttributeDescription input_attrs[] = {
+ (VkVertexInputAttributeDescription){
+ .location = 0,
+ .binding = 0,
+ .format = VK_FORMAT_R32G32B32_SFLOAT,
+ .offset = offsetof(Vertex, p),
+ },
+ (VkVertexInputAttributeDescription){
+ .location = 1,
+ .binding = 0,
+ .format = VK_FORMAT_R32G32B32_SFLOAT,
+ .offset = offsetof(Vertex, c),
+ },
+ };
+
+ VkPipelineVertexInputStateCreateInfo vertex_input = (VkPipelineVertexInputStateCreateInfo){
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+ .vertexBindingDescriptionCount = 1,
+ .pVertexBindingDescriptions = &input_binding,
+ .vertexAttributeDescriptionCount = COUNTOF(input_attrs),
+ .pVertexAttributeDescriptions = input_attrs,
+ };
+
+ VkGraphicsPipelineCreateInfo pipeline_create = (VkGraphicsPipelineCreateInfo){
+ .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
+ .pNext = &rendering,
+ .stageCount = COUNTOF(stages),
+ .pStages = stages,
+ .pInputAssemblyState = &input_assembly,
+ .pViewportState = &viewport_state_create,
+ .pRasterizationState = &raster_state_create,
+ .pMultisampleState = &multisample_state,
+ .pColorBlendState = &blending,
+ .layout = pipeline_layout,
+ .pVertexInputState = &vertex_input,
+ .pDepthStencilState = &depth_stencil,
+ };
+ if ((r = vkCreateGraphicsPipelines(vk_dev, VK_NULL_HANDLE, 1, &pipeline_create, 0, &vk_pipeline)) != VK_SUCCESS) {
+ DIE("Failed to create pipeline: %s", VkResultToString(r));
+ }
}
VkCommandPool vk_cmd_pool = 0;
@@ -302,38 +495,82 @@ int main(int argc, char* argv[])
if ((r = vkBeginCommandBuffer(vk_cmd_buffers[i], &cmd_buffer_begin_info)) != VK_SUCCESS) {
DIE("Failed to begin command buffer for swapchain image #%u: %s", i, VkResultToString(r));
}
-
- VkClearValue clear_val = (VkClearValue){
- .color.float32[0] = 1.0f,
- .color.float32[1] = 0.0f,
- .color.float32[2] = 1.0f,
- .color.float32[3] = 1.0f,
- };
- VkRenderPassBeginInfo render_pass_begin_info = (VkRenderPassBeginInfo){
- .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
- .renderPass = vk_pass,
- .framebuffer = vk_framebuffers[i],
- .renderArea.extent = vk_extent,
- .clearValueCount = 1,
- .pClearValues = &clear_val
- };
- vkCmdBeginRenderPass(vk_cmd_buffers[i], &render_pass_begin_info, VK_SUBPASS_CONTENTS_INLINE);
-
- //
-
- vkCmdEndRenderPass(vk_cmd_buffers[i]);
- vkEndCommandBuffer(vk_cmd_buffers[i]);
+
+ VkImageMemoryBarrier vk_barrier = (VkImageMemoryBarrier){
+ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
+ .oldLayout = VK_IMAGE_LAYOUT_UNDEFINED,
+ .newLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+ .srcAccessMask = 0,
+ .dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
+ .image = vk_swap_images[i],
+ .subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+ .subresourceRange.levelCount = 1,
+ .subresourceRange.layerCount = 1,
+ };
+ vkCmdPipelineBarrier(vk_cmd_buffers[i], VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, 0, 0, 0, 0, 0, 1, &vk_barrier);
+
+ VkRenderingAttachmentInfo vk_rendering_attachment = (VkRenderingAttachmentInfo){
+ .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
+ .imageView = vk_swap_image_views[i],
+ .imageLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+ .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR,
+ .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
+ .clearValue.color = {{1.0f, 0.0f, 1.0f, 1.0f}},
+ };
+ VkRenderingInfo vk_rendering_info = (VkRenderingInfo){
+ .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
+ .renderArea.extent = vk_extent,
+ .layerCount = 1,
+ .colorAttachmentCount = 1,
+ .pColorAttachments = &vk_rendering_attachment,
+ };
+ vkCmdBeginRendering(vk_cmd_buffers[i], &vk_rendering_info);
+
+ vkCmdBindPipeline(vk_cmd_buffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, vk_pipeline);
+ VkDeviceSize offset = 0;
+ vkCmdBindVertexBuffers(vk_cmd_buffers[i], 0, 1, &vk_vbuf, &offset);
+ vkCmdDraw(vk_cmd_buffers[i], 3, 1, 0, 0);
+
+ vkCmdEndRendering(vk_cmd_buffers[i]);
+
+ vk_barrier.oldLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
+ vk_barrier.newLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR;
+ vk_barrier.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
+ vk_barrier.dstAccessMask = 0;
+ vkCmdPipelineBarrier(vk_cmd_buffers[i], VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, 0, 0, 0, 0, 0, 1, &vk_barrier);
+
+ if ((r = vkEndCommandBuffer(vk_cmd_buffers[i])) != VK_SUCCESS) {
+ DIE("Failed to end command buffer for swapchain image #%u: %s", i, VkResultToString(r));
+ }
}
- VkSemaphore vk_image_available = 0;
- VkSemaphore vk_render_finished = 0;
+ VkSemaphore* vk_image_available = calloc(vk_swap_image_count, sizeof(VkSemaphore));
+ VkSemaphore* vk_render_finished = calloc(vk_swap_image_count, sizeof(VkSemaphore));
{
VkSemaphoreCreateInfo semaphore_create = (VkSemaphoreCreateInfo){
.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
};
- vkCreateSemaphore(vk_dev, &semaphore_create, 0, &vk_image_available);
- vkCreateSemaphore(vk_dev, &semaphore_create, 0, &vk_render_finished);
+ for (uint32_t i = 0; i < vk_swap_image_count; ++i) {
+ vkCreateSemaphore(vk_dev, &semaphore_create, 0, &vk_image_available[i]);
+ vkCreateSemaphore(vk_dev, &semaphore_create, 0, &vk_render_finished[i]);
+ }
}
+
+#define MAX_FRAMES_IN_FLIGHT 2
+ VkFence vk_frame_fences[MAX_FRAMES_IN_FLIGHT];
+ {
+ VkFenceCreateInfo create = (VkFenceCreateInfo){
+ .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
+ .flags = VK_FENCE_CREATE_SIGNALED_BIT,
+ };
+ for (uint32_t i = 0; i < MAX_FRAMES_IN_FLIGHT; ++i) {
+ if ((r = vkCreateFence(vk_dev, &create, 0, &vk_frame_fences[i])) != VK_SUCCESS) {
+ DIE("Failed to create frame fence #%u: %s", i, VkResultToString(r));
+ }
+ }
+ }
+
+ uint32_t frame_idx = 0;
bool running = true;
while (running) {
@@ -347,27 +584,31 @@ int main(int argc, char* argv[])
}
uint32_t next_image_idx = 0;
- vkAcquireNextImageKHR(vk_dev, vk_swp, UINT64_MAX, vk_image_available, VK_NULL_HANDLE, &next_image_idx);
+ vkAcquireNextImageKHR(vk_dev, vk_swp, UINT64_MAX, vk_image_available[frame_idx], VK_NULL_HANDLE, &next_image_idx);
+ // Wait for GPU to finish using this swapchain image
+ vkWaitForFences(vk_dev, 1, &vk_frame_fences[frame_idx], VK_TRUE, UINT64_MAX);
+ vkResetFences(vk_dev, 1, &vk_frame_fences[frame_idx]);
+
VkPipelineStageFlags wait_stages[] = { VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT };
VkSubmitInfo submit_info = (VkSubmitInfo){
.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
.waitSemaphoreCount = 1,
- .pWaitSemaphores = &vk_image_available,
+ .pWaitSemaphores = &vk_image_available[frame_idx],
.pWaitDstStageMask = wait_stages,
.commandBufferCount = 1,
.pCommandBuffers = &vk_cmd_buffers[next_image_idx],
.signalSemaphoreCount = 1,
- .pSignalSemaphores = &vk_render_finished,
+ .pSignalSemaphores = &vk_render_finished[next_image_idx],
};
- if ((r = vkQueueSubmit(vk_graphics_queue, 1, &submit_info, VK_NULL_HANDLE)) != VK_SUCCESS) {
+ if ((r = vkQueueSubmit(vk_graphics_queue, 1, &submit_info, vk_frame_fences[frame_idx])) != VK_SUCCESS) {
DIE("Failed to submit graphics queue: %s", VkResultToString(r));
}
VkPresentInfoKHR present_info = (VkPresentInfoKHR){
.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR,
.waitSemaphoreCount = 1,
- .pWaitSemaphores = &vk_render_finished,
+ .pWaitSemaphores = &vk_render_finished[next_image_idx],
.swapchainCount = 1,
.pSwapchains = &vk_swp,
.pImageIndices = &next_image_idx,
@@ -375,6 +616,10 @@ int main(int argc, char* argv[])
if ((r = vkQueuePresentKHR(vk_graphics_queue, &present_info)) != VK_SUCCESS) {
DIE("Failed to submit present queue: %s", VkResultToString(r));
}
+
+ frame_idx = (frame_idx + 1) % MAX_FRAMES_IN_FLIGHT;
}
+
+ vkDeviceWaitIdle(vk_dev);
}
diff --git a/vk-asylum/shaders.h b/vk-asylum/shaders.h
index 3ceda9c..435e2f0 100644
--- a/vk-asylum/shaders.h
+++ b/vk-asylum/shaders.h
@@ -1,2 +1,2 @@
const uint32_t triangle_vert_spv[] = { 0x07230203, 0x00010000, 0x000d000b, 0x0000001f, 0x00000000, 0x00020011, 0x00000001, 0x0006000b, 0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e, 0x00000000, 0x0003000e, 0x00000000, 0x00000001, 0x0009000f, 0x00000000, 0x00000004, 0x6e69616d, 0x00000000, 0x0000000d, 0x00000012, 0x0000001c, 0x0000001d, 0x00030003, 0x00000002, 0x000001c2, 0x000a0004, 0x475f4c47, 0x4c474f4f, 0x70635f45, 0x74735f70, 0x5f656c79, 0x656e696c, 0x7269645f, 0x69746365, 0x00006576, 0x00080004, 0x475f4c47, 0x4c474f4f, 0x6e695f45, 0x64756c63, 0x69645f65, 0x74636572, 0x00657669, 0x00040005, 0x00000004, 0x6e69616d, 0x00000000, 0x00060005, 0x0000000b, 0x505f6c67, 0x65567265, 0x78657472, 0x00000000, 0x00060006, 0x0000000b, 0x00000000, 0x505f6c67, 0x7469736f, 0x006e6f69, 0x00070006, 0x0000000b, 0x00000001, 0x505f6c67, 0x746e696f, 0x657a6953, 0x00000000, 0x00070006, 0x0000000b, 0x00000002, 0x435f6c67, 0x4470696c, 0x61747369, 0x0065636e, 0x00070006, 0x0000000b, 0x00000003, 0x435f6c67, 0x446c6c75, 0x61747369, 0x0065636e, 0x00030005, 0x0000000d, 0x00000000, 0x00030005, 0x00000012, 0x00705f76, 0x00030005, 0x0000001c, 0x00635f66, 0x00030005, 0x0000001d, 0x00635f76, 0x00030047, 0x0000000b, 0x00000002, 0x00050048, 0x0000000b, 0x00000000, 0x0000000b, 0x00000000, 0x00050048, 0x0000000b, 0x00000001, 0x0000000b, 0x00000001, 0x00050048, 0x0000000b, 0x00000002, 0x0000000b, 0x00000003, 0x00050048, 0x0000000b, 0x00000003, 0x0000000b, 0x00000004, 0x00040047, 0x00000012, 0x0000001e, 0x00000000, 0x00040047, 0x0000001c, 0x0000001e, 0x00000000, 0x00040047, 0x0000001d, 0x0000001e, 0x00000001, 0x00020013, 0x00000002, 0x00030021, 0x00000003, 0x00000002, 0x00030016, 0x00000006, 0x00000020, 0x00040017, 0x00000007, 0x00000006, 0x00000004, 0x00040015, 0x00000008, 0x00000020, 0x00000000, 0x0004002b, 0x00000008, 0x00000009, 0x00000001, 0x0004001c, 0x0000000a, 0x00000006, 0x00000009, 0x0006001e, 0x0000000b, 0x00000007, 0x00000006, 0x0000000a, 0x0000000a, 0x00040020, 0x0000000c, 0x00000003, 0x0000000b, 0x0004003b, 0x0000000c, 0x0000000d, 0x00000003, 0x00040015, 0x0000000e, 0x00000020, 0x00000001, 0x0004002b, 0x0000000e, 0x0000000f, 0x00000000, 0x00040017, 0x00000010, 0x00000006, 0x00000003, 0x00040020, 0x00000011, 0x00000001, 0x00000010, 0x0004003b, 0x00000011, 0x00000012, 0x00000001, 0x0004002b, 0x00000006, 0x00000014, 0x3f800000, 0x00040020, 0x00000019, 0x00000003, 0x00000007, 0x00040020, 0x0000001b, 0x00000003, 0x00000010, 0x0004003b, 0x0000001b, 0x0000001c, 0x00000003, 0x0004003b, 0x00000011, 0x0000001d, 0x00000001, 0x00050036, 0x00000002, 0x00000004, 0x00000000, 0x00000003, 0x000200f8, 0x00000005, 0x0004003d, 0x00000010, 0x00000013, 0x00000012, 0x00050051, 0x00000006, 0x00000015, 0x00000013, 0x00000000, 0x00050051, 0x00000006, 0x00000016, 0x00000013, 0x00000001, 0x00050051, 0x00000006, 0x00000017, 0x00000013, 0x00000002, 0x00070050, 0x00000007, 0x00000018, 0x00000015, 0x00000016, 0x00000017, 0x00000014, 0x00050041, 0x00000019, 0x0000001a, 0x0000000d, 0x0000000f, 0x0003003e, 0x0000001a, 0x00000018, 0x0004003d, 0x00000010, 0x0000001e, 0x0000001d, 0x0003003e, 0x0000001c, 0x0000001e, 0x000100fd, 0x00010038, };
-const uint32_t triangle_frag_spv[] = { 0x07230203, 0x00010000, 0x000d000b, 0x0000001f, 0x00000000, 0x00020011, 0x00000001, 0x0006000b, 0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e, 0x00000000, 0x0003000e, 0x00000000, 0x00000001, 0x0009000f, 0x00000000, 0x00000004, 0x6e69616d, 0x00000000, 0x0000000d, 0x00000012, 0x0000001c, 0x0000001d, 0x00030003, 0x00000002, 0x000001c2, 0x000a0004, 0x475f4c47, 0x4c474f4f, 0x70635f45, 0x74735f70, 0x5f656c79, 0x656e696c, 0x7269645f, 0x69746365, 0x00006576, 0x00080004, 0x475f4c47, 0x4c474f4f, 0x6e695f45, 0x64756c63, 0x69645f65, 0x74636572, 0x00657669, 0x00040005, 0x00000004, 0x6e69616d, 0x00000000, 0x00060005, 0x0000000b, 0x505f6c67, 0x65567265, 0x78657472, 0x00000000, 0x00060006, 0x0000000b, 0x00000000, 0x505f6c67, 0x7469736f, 0x006e6f69, 0x00070006, 0x0000000b, 0x00000001, 0x505f6c67, 0x746e696f, 0x657a6953, 0x00000000, 0x00070006, 0x0000000b, 0x00000002, 0x435f6c67, 0x4470696c, 0x61747369, 0x0065636e, 0x00070006, 0x0000000b, 0x00000003, 0x435f6c67, 0x446c6c75, 0x61747369, 0x0065636e, 0x00030005, 0x0000000d, 0x00000000, 0x00030005, 0x00000012, 0x00705f76, 0x00030005, 0x0000001c, 0x00635f66, 0x00030005, 0x0000001d, 0x00635f76, 0x00030047, 0x0000000b, 0x00000002, 0x00050048, 0x0000000b, 0x00000000, 0x0000000b, 0x00000000, 0x00050048, 0x0000000b, 0x00000001, 0x0000000b, 0x00000001, 0x00050048, 0x0000000b, 0x00000002, 0x0000000b, 0x00000003, 0x00050048, 0x0000000b, 0x00000003, 0x0000000b, 0x00000004, 0x00040047, 0x00000012, 0x0000001e, 0x00000000, 0x00040047, 0x0000001c, 0x0000001e, 0x00000000, 0x00040047, 0x0000001d, 0x0000001e, 0x00000001, 0x00020013, 0x00000002, 0x00030021, 0x00000003, 0x00000002, 0x00030016, 0x00000006, 0x00000020, 0x00040017, 0x00000007, 0x00000006, 0x00000004, 0x00040015, 0x00000008, 0x00000020, 0x00000000, 0x0004002b, 0x00000008, 0x00000009, 0x00000001, 0x0004001c, 0x0000000a, 0x00000006, 0x00000009, 0x0006001e, 0x0000000b, 0x00000007, 0x00000006, 0x0000000a, 0x0000000a, 0x00040020, 0x0000000c, 0x00000003, 0x0000000b, 0x0004003b, 0x0000000c, 0x0000000d, 0x00000003, 0x00040015, 0x0000000e, 0x00000020, 0x00000001, 0x0004002b, 0x0000000e, 0x0000000f, 0x00000000, 0x00040017, 0x00000010, 0x00000006, 0x00000003, 0x00040020, 0x00000011, 0x00000001, 0x00000010, 0x0004003b, 0x00000011, 0x00000012, 0x00000001, 0x0004002b, 0x00000006, 0x00000014, 0x3f800000, 0x00040020, 0x00000019, 0x00000003, 0x00000007, 0x00040020, 0x0000001b, 0x00000003, 0x00000010, 0x0004003b, 0x0000001b, 0x0000001c, 0x00000003, 0x0004003b, 0x00000011, 0x0000001d, 0x00000001, 0x00050036, 0x00000002, 0x00000004, 0x00000000, 0x00000003, 0x000200f8, 0x00000005, 0x0004003d, 0x00000010, 0x00000013, 0x00000012, 0x00050051, 0x00000006, 0x00000015, 0x00000013, 0x00000000, 0x00050051, 0x00000006, 0x00000016, 0x00000013, 0x00000001, 0x00050051, 0x00000006, 0x00000017, 0x00000013, 0x00000002, 0x00070050, 0x00000007, 0x00000018, 0x00000015, 0x00000016, 0x00000017, 0x00000014, 0x00050041, 0x00000019, 0x0000001a, 0x0000000d, 0x0000000f, 0x0003003e, 0x0000001a, 0x00000018, 0x0004003d, 0x00000010, 0x0000001e, 0x0000001d, 0x0003003e, 0x0000001c, 0x0000001e, 0x000100fd, 0x00010038, };
+const uint32_t triangle_frag_spv[] = { 0x07230203, 0x00010000, 0x000d000b, 0x00000013, 0x00000000, 0x00020011, 0x00000001, 0x0006000b, 0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e, 0x00000000, 0x0003000e, 0x00000000, 0x00000001, 0x0007000f, 0x00000004, 0x00000004, 0x6e69616d, 0x00000000, 0x00000009, 0x0000000c, 0x00030010, 0x00000004, 0x00000007, 0x00030003, 0x00000002, 0x000001c2, 0x000a0004, 0x475f4c47, 0x4c474f4f, 0x70635f45, 0x74735f70, 0x5f656c79, 0x656e696c, 0x7269645f, 0x69746365, 0x00006576, 0x00080004, 0x475f4c47, 0x4c474f4f, 0x6e695f45, 0x64756c63, 0x69645f65, 0x74636572, 0x00657669, 0x00040005, 0x00000004, 0x6e69616d, 0x00000000, 0x00050005, 0x00000009, 0x756f5f66, 0x6f635f74, 0x00726f6c, 0x00030005, 0x0000000c, 0x00635f66, 0x00040047, 0x00000009, 0x0000001e, 0x00000000, 0x00040047, 0x0000000c, 0x0000001e, 0x00000000, 0x00020013, 0x00000002, 0x00030021, 0x00000003, 0x00000002, 0x00030016, 0x00000006, 0x00000020, 0x00040017, 0x00000007, 0x00000006, 0x00000004, 0x00040020, 0x00000008, 0x00000003, 0x00000007, 0x0004003b, 0x00000008, 0x00000009, 0x00000003, 0x00040017, 0x0000000a, 0x00000006, 0x00000003, 0x00040020, 0x0000000b, 0x00000001, 0x0000000a, 0x0004003b, 0x0000000b, 0x0000000c, 0x00000001, 0x0004002b, 0x00000006, 0x0000000e, 0x3f800000, 0x00050036, 0x00000002, 0x00000004, 0x00000000, 0x00000003, 0x000200f8, 0x00000005, 0x0004003d, 0x0000000a, 0x0000000d, 0x0000000c, 0x00050051, 0x00000006, 0x0000000f, 0x0000000d, 0x00000000, 0x00050051, 0x00000006, 0x00000010, 0x0000000d, 0x00000001, 0x00050051, 0x00000006, 0x00000011, 0x0000000d, 0x00000002, 0x00070050, 0x00000007, 0x00000012, 0x0000000f, 0x00000010, 0x00000011, 0x0000000e, 0x0003003e, 0x00000009, 0x00000012, 0x000100fd, 0x00010038, };
diff --git a/yuvbench/CLAUDE.md b/yuvbench/CLAUDE.md
new file mode 100644
index 0000000..ee6ee4f
--- /dev/null
+++ b/yuvbench/CLAUDE.md
@@ -0,0 +1,78 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## What This Is
+
+yuvbench benchmarks YUV 4:2:0 → RGB24 color space conversion across multiple implementations. It loads a raw YUV file, runs each enabled backend through 100 warmup + 2500 timed iterations, and reports min/max/avg per-iteration timing in milliseconds.
+
+## Build Commands
+
+```bash
+# macOS (Apple Silicon)
+./build-macos-aarch64-clang.sh
+
+# Linux (x86_64)
+./build-linux-x86_64-gcc.sh
+```
+
+Both scripts create `build/` and produce `build/yuvbench`.
+
+## Running
+
+```bash
+./build/yuvbench images/jellybeans-256x256.yuv
+./build/yuvbench images/capitol-2950x1528.yuv
+./build/yuvbench images/capitol-2950x1528.yuv show # pipe last frame to ffplay
+```
+
+Input filename must encode dimensions as `name-WIDTHxHEIGHT.yuv`.
+
+## Prepare Test Images
+
+```bash
+# Convert source images in images/src/ to raw YUV 4:2:0
+./images/convert.sh
+```
+
+## Architecture
+
+### Backend Plugin System
+
+Each backend is an optional compilation unit implementing the interface in `yuvbench.h`:
+
+```c
+typedef struct {
+ void (*init_fn)(Ctx *ctx);
+ void (*convert_fn)(Ctx *ctx);
+ void (*deinit_fn)(Ctx *ctx);
+} Backend;
+```
+
+`yuvbench.c:run_backend()` drives warmup + timing loops. Backends are compiled in via `-DYUVBENCH_<NAME>` preprocessor flags set in each build script.
+
+### Backends
+
+| Define | File | Platform | Notes |
+|--------|------|----------|-------|
+| `YUVBENCH_BAD` | `yuvbench_bad.c` | All | Naive BT.709 nested loop; reference baseline |
+| `YUVBENCH_ACCELERATE` | `yuvbench_accelerate.c` | macOS | vImage YUV→ARGB→RGB; caches conversion object |
+| `YUVBENCH_SWSCALE` | `yuvbench_swscale.c` | All | FFmpeg libswscale; SwsContext created in init |
+| `YUVBENCH_LIBYUV` | `yuvbench_libyuv.c` | Linux | Google libyuv `I420ToRAW()`; no init/deinit |
+
+### Timing (`kbench.h`)
+
+- macOS/ARM64: reads `CNTVCT_EL0` / `CNTFRQ_EL0` hardware registers directly
+- Linux: `clock_gettime(CLOCK_MONOTONIC)`
+
+### Adding a New Backend
+
+1. Create `yuvbench_<name>.c` implementing `init`, `convert`, `deinit` functions
+2. Guard the file body with `#ifdef YUVBENCH_<NAME>`
+3. Register it in `yuvbench.c` (see the `backends[]` array)
+4. Add `-DYUVBENCH_<NAME>` and any link flags to the relevant build scripts
+
+## Platform Notes
+
+- The `vk-asylum` branch contains a Vulkan compute shader backend (`build-shaders.sh`, `shaders.h`, `main.c`)
+- Assembly output for the Accelerate backend is emitted to `build/yuvbench_accelerate.S` on macOS builds
diff --git a/yuvbench/build-macos-aarch64-clang.sh b/yuvbench/build-macos-aarch64-clang.sh
index 18706e8..7cb19e6 100755
--- a/yuvbench/build-macos-aarch64-clang.sh
+++ b/yuvbench/build-macos-aarch64-clang.sh
@@ -1,5 +1,5 @@
#!/bin/sh
-CFLAGS="-Wall -Wextra -Wpedantic -O3 -g -DYUVBENCH_ACCELERATE -DYUVBENCH_BAD -DYUVBENCH_SWSCALE"
+CFLAGS="-Wall -Wextra -Wpedantic -O3 -g -DYUVBENCH_ACCELERATE -DYUVBENCH_BAD -DYUVBENCH_SWSCALE -DYUVBENCH_CLAUDE"
LFLAGS="-framework Accelerate $(pkg-config --libs libswscale)"
mkdir -p build
set -x
@@ -8,4 +8,5 @@ clang -o build/yuvbench_accelerate.o $CFLAGS -c ./yuvbench_accelerate.c
clang -o build/yuvbench_accelerate.S $CFLAGS -S ./yuvbench_accelerate.c
clang -o build/yuvbench_bad.o $CFLAGS -c ./yuvbench_bad.c
clang -o build/yuvbench_swscale.o $CFLAGS $(pkg-config --cflags libswscale) -c ./yuvbench_swscale.c
-clang -o build/yuvbench $LFLAGS build/yuvbench.o build/yuvbench_accelerate.o build/yuvbench_bad.o build/yuvbench_swscale.o
+clang -o build/yuvbench_claude.o $CFLAGS -c ./yuvbench_claude.c
+clang -o build/yuvbench $LFLAGS build/yuvbench.o build/yuvbench_accelerate.o build/yuvbench_bad.o build/yuvbench_swscale.o build/yuvbench_claude.o
diff --git a/yuvbench/yuvbench.c b/yuvbench/yuvbench.c
index 3a92371..669c339 100644
--- a/yuvbench/yuvbench.c
+++ b/yuvbench/yuvbench.c
@@ -3,6 +3,14 @@
#define KBENCH_IMPLEMENTATION
#include "kbench.h"
+#include <math.h>
+
+static int cmp_double(const void* a, const void* b)
+{
+ double da = *(const double*)a, db = *(const double*)b;
+ return (da > db) - (da < db);
+}
+
#ifdef YUVBENCH_ACCELERATE
Backend yuvbench_accelerate(void);
#endif
@@ -15,6 +23,9 @@ Backend yuvbench_libyuv(void);
#ifdef YUVBENCH_SWSCALE
Backend yuvbench_swscale(void);
#endif
+#ifdef YUVBENCH_CLAUDE
+Backend yuvbench_claude(void);
+#endif
static struct
{
@@ -70,21 +81,27 @@ static void run_backend(Backend b)
b.deinit_fn(&ctx);
}
- double ts_min = -1.0f;
- double ts_max = -1.0f;
- double ts_avg = 0.0f;
+ // Sort for percentiles
+ qsort(tests_table, tests, sizeof(double), cmp_double);
+
+ double ts_min = tests_table[0];
+ double ts_max = tests_table[tests - 1];
+ double ts_p50 = tests_table[tests / 2];
+ double ts_p95 = tests_table[(int)(tests * 0.95)];
+ double ts_p99 = tests_table[(int)(tests * 0.99)];
+ double ts_avg = 0.0;
+ for (int i = 0; i < tests; ++i) ts_avg += tests_table[i] / (double)tests;
+ double ts_var = 0.0;
for (int i = 0; i < tests; ++i) {
- if (ts_min < 0 || tests_table[i] < ts_min) {
- ts_min = tests_table[i];
- }
- if (ts_max < 0 || tests_table[i] > ts_max) {
- ts_max = tests_table[i];
- }
- ts_avg += (tests_table[i] / (double)tests);
+ double d = tests_table[i] - ts_avg;
+ ts_var += d * d / (double)tests;
}
- printf(" min result: %fms\n", ts_min * 1000.0f);
- printf(" max result: %fms\n", ts_max * 1000.0f);
- printf(" avg result: %fms\n", ts_avg * 1000.0f);
+ double ts_stddev = sqrt(ts_var);
+
+ #define MS(t) ((t) * 1000.0)
+ printf(" min %8.3fms p50 %8.3fms p95 %8.3fms p99 %8.3fms max %8.3fms avg %8.3fms σ %7.3fms\n",
+ MS(ts_min), MS(ts_p50), MS(ts_p95), MS(ts_p99), MS(ts_max), MS(ts_avg), MS(ts_stddev));
+ #undef MS
if (G.show) {
@@ -204,4 +221,8 @@ int main(int argc, char** argv)
printf("YUVBENCH_SWSCALE\n");
run_backend(yuvbench_swscale());
#endif
+#ifdef YUVBENCH_CLAUDE
+ printf("YUVBENCH_CLAUDE\n");
+ run_backend(yuvbench_claude());
+#endif
}
diff --git a/yuvbench/yuvbench_claude.c b/yuvbench/yuvbench_claude.c
new file mode 100644
index 0000000..c8aae52
--- /dev/null
+++ b/yuvbench/yuvbench_claude.c
@@ -0,0 +1,159 @@
+#include "yuvbench.h"
+
+#ifdef YUVBENCH_CLAUDE
+
+#include <arm_neon.h>
+#include <dispatch/dispatch.h>
+
+// BT.709 limited range, scale >>6 (factor=64):
+// R = clip(( 75*(Y-16) + 119*(Cr-128) + 32) >> 6)
+// G = clip(( 75*(Y-16) - 12*(Cb-128) - 30*(Cr-128) + 32) >> 6)
+// B = clip(( 75*(Y-16) + 129*(Cb-128) + 32) >> 6)
+//
+// All int16 intermediates stay within [-16512, 31801] — no overflow.
+// vqshrun_n_s16 gives saturating unsigned narrow + right shift in one instruction.
+
+// Coefficients:
+// Y: 75 (1.164 * 64 = 74.5 → 75, err +0.67%)
+// Cr_R: 119 (1.856 * 64 = 118.8 → 119, err +0.17%)
+// Cb_B: 129 (2.016 * 64; libyuv uses 2.016 for BT.709 studio swing)
+// Cb_G: 12 (0.187 * 64 = 12.0)
+// Cr_G: 30 (0.468 * 64 = 30.0)
+
+// Convert 8 luma pixels using pre-widened, bias-subtracted chroma vectors.
+// vcb = Cb - 128 (int16x8), vcr = Cr - 128 (int16x8)
+static inline void convert_8px(
+ const uint8_t* __restrict__ y,
+ int16x8_t vcb,
+ int16x8_t vcr,
+ uint8_t* __restrict__ out)
+{
+ // Y - 16
+ int16x8_t vy = vsubq_s16(
+ vreinterpretq_s16_u16(vmovl_u8(vld1_u8(y))),
+ vdupq_n_s16(16));
+
+ // ry = 75*(Y-16) + rounding
+ int16x8_t base = vaddq_s16(vmulq_n_s16(vy, 75), vdupq_n_s16(32));
+
+ // R = base + 119*Cr
+ int16x8_t r = vmlaq_n_s16(base, vcr, 119);
+ // G = base - 12*Cb - 30*Cr
+ int16x8_t g = vmlsq_n_s16(vmlsq_n_s16(base, vcb, 12), vcr, 30);
+ // B = base + 129*Cb
+ int16x8_t b = vmlaq_n_s16(base, vcb, 129);
+
+ // Saturating narrow+shift: clamps to [0,255] and packs to uint8
+ uint8x8x3_t rgb;
+ rgb.val[0] = vqshrun_n_s16(r, 6);
+ rgb.val[1] = vqshrun_n_s16(g, 6);
+ rgb.val[2] = vqshrun_n_s16(b, 6);
+ vst3_u8(out, rgb); // stores R0G0B0 R1G1B1 ... R7G7B7
+}
+
+// Process a contiguous range of row-pairs [row_start, row_end).
+// row_start and row_end must be even.
+static void convert_rows(
+ const uint8_t* __restrict__ Y,
+ const uint8_t* __restrict__ Cb,
+ const uint8_t* __restrict__ Cr,
+ uint8_t* __restrict__ RGB,
+ uint32_t w,
+ uint32_t row_start,
+ uint32_t row_end)
+{
+ for (uint32_t row = row_start; row < row_end; row += 2) {
+ const uint8_t* y0 = Y + row * w;
+ const uint8_t* y1 = y0 + w;
+ const uint8_t* cb = Cb + (row / 2) * (w / 2);
+ const uint8_t* cr = Cr + (row / 2) * (w / 2);
+ uint8_t* rgb0 = RGB + row * w * 3;
+ uint8_t* rgb1 = rgb0 + w * 3;
+
+ uint32_t col = 0;
+
+ // 16 pixels per inner iteration: 8 chroma samples shared across 2 rows.
+ for (; col + 16 <= w; col += 16) {
+ // Load 8 Cb/Cr bytes and upsample to 16 via interleave-with-self:
+ // [c0,c1,...,c7] → [c0,c0,c1,c1,...,c7,c7]
+ uint8x8x2_t cb_up = vzip_u8(vld1_u8(cb + col/2), vld1_u8(cb + col/2));
+ uint8x8x2_t cr_up = vzip_u8(vld1_u8(cr + col/2), vld1_u8(cr + col/2));
+
+ // Widen and bias-subtract chroma for low 8 and high 8 pixels
+ int16x8_t vcb_lo = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(cb_up.val[0])), vdupq_n_s16(128));
+ int16x8_t vcb_hi = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(cb_up.val[1])), vdupq_n_s16(128));
+ int16x8_t vcr_lo = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(cr_up.val[0])), vdupq_n_s16(128));
+ int16x8_t vcr_hi = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(cr_up.val[1])), vdupq_n_s16(128));
+
+ // Row 0
+ convert_8px(y0 + col, vcb_lo, vcr_lo, rgb0 + col * 3);
+ convert_8px(y0 + col + 8, vcb_hi, vcr_hi, rgb0 + (col + 8) * 3);
+ // Row 1 — same chroma, different luma
+ convert_8px(y1 + col, vcb_lo, vcr_lo, rgb1 + col * 3);
+ convert_8px(y1 + col + 8, vcb_hi, vcr_hi, rgb1 + (col + 8) * 3);
+ }
+
+ // Scalar tail for widths not divisible by 16
+ for (; col < w; col += 2) {
+ int32_t cb2 = (int32_t)cb[col / 2] - 128;
+ int32_t cr2 = (int32_t)cr[col / 2] - 128;
+ for (uint32_t dy = 0; dy < 2; ++dy) {
+ const uint8_t* yr = (dy == 0) ? y0 : y1;
+ uint8_t* dst = (dy == 0) ? rgb0 : rgb1;
+ for (uint32_t dx = 0; dx < 2 && (col + dx) < w; ++dx) {
+ int32_t yv = (int32_t)yr[col + dx] - 16;
+ int32_t base = 75 * yv + 32;
+ int32_t r = base + 119 * cr2;
+ int32_t g = base - 12 * cb2 - 30 * cr2;
+ int32_t bv = base + 129 * cb2;
+ r = (r >> 6); r = r < 0 ? 0 : r > 255 ? 255 : r;
+ g = (g >> 6); g = g < 0 ? 0 : g > 255 ? 255 : g;
+ bv = (bv >> 6); bv = bv < 0 ? 0 : bv > 255 ? 255 : bv;
+ dst[(col + dx) * 3 + 0] = (uint8_t)r;
+ dst[(col + dx) * 3 + 1] = (uint8_t)g;
+ dst[(col + dx) * 3 + 2] = (uint8_t)bv;
+ }
+ }
+ }
+ }
+}
+
+static bool yuvbench_claude_init(Ctx* ctx)
+{
+ return (ctx->inp_w % 2 == 0) && (ctx->inp_h % 2 == 0);
+}
+
+static bool yuvbench_claude_convert(Ctx* ctx)
+{
+ const uint32_t w = ctx->inp_w;
+ const uint32_t h = ctx->inp_h;
+ const uint8_t* Y = (const uint8_t*)ctx->inp_buf;
+ const uint8_t* Cb = Y + (size_t)w * h;
+ const uint8_t* Cr = Cb + (size_t)(w / 2) * (h / 2);
+ uint8_t* RGB = (uint8_t*)ctx->out_buf;
+
+ // Dispatch row-pairs across performance cores.
+ // dispatch_apply is synchronous: returns only after all blocks finish.
+ static const uint32_t NCHUNKS = 8;
+ uint32_t pairs = h / 2;
+
+ dispatch_apply(NCHUNKS,
+ dispatch_get_global_queue(QOS_CLASS_USER_INTERACTIVE, 0),
+ ^(size_t tid) {
+ uint32_t start = (uint32_t)((tid * pairs / NCHUNKS) * 2);
+ uint32_t end = (uint32_t)(((tid+1) * pairs / NCHUNKS) * 2);
+ convert_rows(Y, Cb, Cr, RGB, w, start, end);
+ });
+
+ return true;
+}
+
+Backend yuvbench_claude(void)
+{
+ Backend b = { 0 };
+ b.init_fn = yuvbench_claude_init;
+ b.convert_fn = yuvbench_claude_convert;
+ return b;
+}
+
+#endif // YUVBENCH_CLAUDE