diff --git a/.gitattributes b/.gitattributes index 59a59fd..20bc823 100644 --- a/.gitattributes +++ b/.gitattributes @@ -8,6 +8,7 @@ *.kra filter=lfs diff=lfs merge=lfs -text *.png filter=lfs diff=lfs merge=lfs -text *.wav filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text *.tga binary diff=hex *.kra binary diff=hex diff --git a/.gitignore b/.gitignore index 9d9ede8..b017cf8 100755 --- a/.gitignore +++ b/.gitignore @@ -13,11 +13,20 @@ tmp_* *.bak *-autosave.kra .directory +!**/CLAUDE.md +!CLAUDE.md -*/__pycache__ +# from OTF build +**/__pycache__ OTFbuild/*.ttf OTFbuild/*.otf OTFbuild/*.woff OTFbuild/*.woff2 +OTFbuild/*.md *.fea *.xdp-* + +# from Autokem build +Autokem/*.o +Autokem/autokem +Autokem/*.md diff --git a/Autokem/Makefile b/Autokem/Makefile new file mode 100644 index 0000000..acfcfe8 --- /dev/null +++ b/Autokem/Makefile @@ -0,0 +1,22 @@ +CC = gcc +CFLAGS = -Ofast -Wall -Wextra -std=c11 +LDFLAGS = -lm +SRC = main.c tga.c nn.c safetensor.c train.c apply.c +OBJ = $(SRC:.c=.o) + +all: autokem + +autokem: $(OBJ) + $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) + +%.o: %.c + $(CC) $(CFLAGS) -c $< -o $@ + +debug: CFLAGS = -g -Wall -Wextra -std=c11 -fsanitize=address,undefined +debug: LDFLAGS += -fsanitize=address,undefined +debug: clean autokem + +clean: + rm -f *.o autokem + +.PHONY: all debug clean diff --git a/Autokem/apply.c b/Autokem/apply.c new file mode 100644 index 0000000..314760d --- /dev/null +++ b/Autokem/apply.c @@ -0,0 +1,164 @@ +#include "apply.h" +#include "tga.h" +#include "nn.h" +#include "safetensor.h" + +#include +#include +#include + +/* Copy file for backup */ +static int copy_file(const char *src, const char *dst) { + FILE *in = fopen(src, "rb"); + if (!in) return -1; + + FILE *out = fopen(dst, "wb"); + if (!out) { fclose(in); return -1; } + + char buf[4096]; + size_t n; + while ((n = fread(buf, 1, sizeof(buf), in)) > 0) { + if (fwrite(buf, 1, n, out) != n) { + fclose(in); fclose(out); + return -1; + } + } + + fclose(in); + fclose(out); + return 0; +} + +int apply_model(const char *tga_path) { + /* Validate filename */ + const char *basename = strrchr(tga_path, '/'); + basename = basename ? basename + 1 : tga_path; + + if (strstr(basename, "variable") == NULL) { + fprintf(stderr, "Error: %s does not appear to be a variable sheet\n", tga_path); + return 1; + } + if (strstr(basename, "extrawide") != NULL) { + fprintf(stderr, "Error: extrawide sheets are not supported\n"); + return 1; + } + + int is_xyswap = (strstr(basename, "xyswap") != NULL); + + /* Create backup */ + char bakpath[512]; + snprintf(bakpath, sizeof(bakpath), "%s.bak", tga_path); + if (copy_file(tga_path, bakpath) != 0) { + fprintf(stderr, "Error: failed to create backup %s\n", bakpath); + return 1; + } + printf("Backup: %s\n", bakpath); + + /* Load model */ + Network *net = network_create(); + if (safetensor_load("autokem.safetensors", net) != 0) { + fprintf(stderr, "Error: failed to load model\n"); + network_free(net); + return 1; + } + + /* Load TGA */ + TgaImage *img = tga_read(tga_path); + if (!img) { + fprintf(stderr, "Error: cannot read %s\n", tga_path); + network_free(net); + return 1; + } + + int cell_w = 16, cell_h = 20; + int cols = img->width / cell_w; + int rows = img->height / cell_h; + int total_cells = cols * rows; + + int processed = 0, updated = 0, skipped = 0; + + for (int index = 0; index < total_cells; index++) { + int cell_x, cell_y; + if (is_xyswap) { + cell_x = (index / cols) * cell_w; + cell_y = (index % cols) * cell_h; + } else { + cell_x = (index % cols) * cell_w; + cell_y = (index / cols) * cell_h; + } + + int tag_x = cell_x + (cell_w - 1); + int tag_y = cell_y; + + /* Read width */ + int width = 0; + for (int y = 0; y < 5; y++) { + if (tga_get_pixel(img, tag_x, tag_y + y) & 0xFF) + width |= (1 << y); + } + if (width == 0) { skipped++; continue; } + + /* Check writeOnTop at Y+17 — skip if defined */ + uint32_t wot = tga_get_pixel(img, tag_x, tag_y + 17); + if ((wot & 0xFF) != 0) { skipped++; continue; } + + /* Check compiler directive at Y+9 — skip if opcode != 0 */ + uint32_t dir_pixel = tagify(tga_get_pixel(img, tag_x, tag_y + 9)); + int opcode = (int)((dir_pixel >> 24) & 0xFF); + if (opcode != 0) { skipped++; continue; } + + /* Extract 15x20 binary input */ + float input[300]; + for (int gy = 0; gy < 20; gy++) { + for (int gx = 0; gx < 15; gx++) { + uint32_t p = tga_get_pixel(img, cell_x + gx, cell_y + gy); + input[gy * 15 + gx] = ((p & 0x80) != 0) ? 1.0f : 0.0f; + } + } + + /* Inference */ + float output[12]; + network_infer(net, input, output); + + /* Threshold at 0.5 */ + int A = output[0] >= 0.5f; + int B = output[1] >= 0.5f; + int C = output[2] >= 0.5f; + int D = output[3] >= 0.5f; + int E = output[4] >= 0.5f; + int F = output[5] >= 0.5f; + int G = output[6] >= 0.5f; + int H = output[7] >= 0.5f; + int J = output[8] >= 0.5f; + int K = output[9] >= 0.5f; + int ytype = output[10] >= 0.5f; + int lowheight = output[11] >= 0.5f; + + /* Compose Y+5 pixel: lowheight (alpha=0xFF when set) */ + uint32_t lh_pixel = lowheight ? 0x000000FF : 0x00000000; + tga_write_pixel(tga_path, img, tag_x, tag_y + 5, lh_pixel); + + /* Compose Y+6 pixel: + * Red byte: Y0000000 -> bit 31 + * Green byte: JK000000 -> bits 23,22 + * Blue byte: ABCDEFGH -> bits 15-8 + * Alpha: 0xFF = hasKernData */ + uint32_t pixel = 0; + pixel |= (uint32_t)(ytype ? 0x80 : 0) << 24; + pixel |= (uint32_t)((J ? 0x80 : 0) | (K ? 0x40 : 0)) << 16; + pixel |= (uint32_t)(A<<7 | B<<6 | C<<5 | D<<4 | E<<3 | F<<2 | G<<1 | H) << 8; + pixel |= 0xFF; + + tga_write_pixel(tga_path, img, tag_x, tag_y + 6, pixel); + + processed++; + updated++; + } + + printf("Processed: %d cells, Updated: %d, Skipped: %d (of %d total)\n", + processed, updated, skipped, total_cells); + + tga_free(img); + network_free(net); + return 0; +} diff --git a/Autokem/apply.h b/Autokem/apply.h new file mode 100644 index 0000000..b83b3de --- /dev/null +++ b/Autokem/apply.h @@ -0,0 +1,8 @@ +#ifndef APPLY_H +#define APPLY_H + +/* Apply trained model to a spritesheet. + Creates .bak backup, then writes predicted kerning bits. */ +int apply_model(const char *tga_path); + +#endif diff --git a/Autokem/autokem.safetensors b/Autokem/autokem.safetensors new file mode 100644 index 0000000..026f581 --- /dev/null +++ b/Autokem/autokem.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7fa2ad5e9d44832590bb7ebc65ed2765e93c89e6ddff99845d5407f2857ae17 +size 470552 diff --git a/Autokem/main.c b/Autokem/main.c new file mode 100644 index 0000000..d4d2d27 --- /dev/null +++ b/Autokem/main.c @@ -0,0 +1,40 @@ +#include +#include +#include "train.h" +#include "apply.h" +#include "safetensor.h" + +static void print_usage(void) { + printf("Usage: autokem [args]\n"); + printf("Commands:\n"); + printf(" train Train model on existing spritesheets\n"); + printf(" apply Apply trained model to a spritesheet\n"); + printf(" stats Print model statistics\n"); + printf(" help Print this message\n"); +} + +int main(int argc, char **argv) { + if (argc < 2) { + print_usage(); + return 1; + } + + if (strcmp(argv[1], "train") == 0) { + return train_model(); + } else if (strcmp(argv[1], "apply") == 0) { + if (argc < 3) { + fprintf(stderr, "Error: apply requires a TGA file path\n"); + return 1; + } + return apply_model(argv[2]); + } else if (strcmp(argv[1], "stats") == 0) { + return safetensor_stats("autokem.safetensors"); + } else if (strcmp(argv[1], "help") == 0) { + print_usage(); + return 0; + } else { + fprintf(stderr, "Unknown command: %s\n", argv[1]); + print_usage(); + return 1; + } +} diff --git a/Autokem/nn.c b/Autokem/nn.c new file mode 100644 index 0000000..a7b9cbf --- /dev/null +++ b/Autokem/nn.c @@ -0,0 +1,556 @@ +#define _GNU_SOURCE +#include "nn.h" +#include +#include +#include +#include + +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + +/* ---- Tensor ---- */ + +Tensor *tensor_alloc(int ndim, const int *shape) { + Tensor *t = malloc(sizeof(Tensor)); + t->ndim = ndim; + t->size = 1; + for (int i = 0; i < ndim; i++) { + t->shape[i] = shape[i]; + t->size *= shape[i]; + } + for (int i = ndim; i < 4; i++) t->shape[i] = 0; + t->data = malloc((size_t)t->size * sizeof(float)); + return t; +} + +Tensor *tensor_zeros(int ndim, const int *shape) { + Tensor *t = tensor_alloc(ndim, shape); + memset(t->data, 0, (size_t)t->size * sizeof(float)); + return t; +} + +void tensor_free(Tensor *t) { + if (!t) return; + free(t->data); + free(t); +} + +/* ---- RNG (Box-Muller) ---- */ + +static uint64_t rng_state = 0; + +static void rng_seed(uint64_t s) { rng_state = s; } + +static uint64_t xorshift64(void) { + uint64_t x = rng_state; + x ^= x << 13; + x ^= x >> 7; + x ^= x << 17; + rng_state = x; + return x; +} + +static float rand_uniform(void) { + return (float)(xorshift64() & 0x7FFFFFFF) / (float)0x7FFFFFFF; +} + +static float rand_normal(void) { + float u1, u2; + do { u1 = rand_uniform(); } while (u1 < 1e-10f); + u2 = rand_uniform(); + return sqrtf(-2.0f * logf(u1)) * cosf(2.0f * (float)M_PI * u2); +} + +/* He init: std = sqrt(2/fan_in) */ +static void he_init(Tensor *w, int fan_in) { + float std = sqrtf(2.0f / (float)fan_in); + for (int i = 0; i < w->size; i++) + w->data[i] = rand_normal() * std; +} + +/* ---- Activations ---- */ + +static inline float leaky_relu(float x) { + return x >= 0.0f ? x : 0.01f * x; +} + +static inline float leaky_relu_grad(float x) { + return x >= 0.0f ? 1.0f : 0.01f; +} + +static inline float sigmoid_f(float x) { + if (x >= 0.0f) { + float ez = expf(-x); + return 1.0f / (1.0f + ez); + } else { + float ez = expf(x); + return ez / (1.0f + ez); + } +} + +/* ---- Conv2D forward/backward ---- */ + +static void conv2d_init(Conv2D *c, int in_ch, int out_ch, int kh, int kw) { + c->in_ch = in_ch; + c->out_ch = out_ch; + c->kh = kh; + c->kw = kw; + + int wshape[] = {out_ch, in_ch, kh, kw}; + int bshape[] = {out_ch}; + + c->weight = tensor_alloc(4, wshape); + c->bias = tensor_zeros(1, bshape); + c->grad_weight = tensor_zeros(4, wshape); + c->grad_bias = tensor_zeros(1, bshape); + c->m_weight = tensor_zeros(4, wshape); + c->v_weight = tensor_zeros(4, wshape); + c->m_bias = tensor_zeros(1, bshape); + c->v_bias = tensor_zeros(1, bshape); + c->input_cache = NULL; + + he_init(c->weight, in_ch * kh * kw); +} + +static void conv2d_free(Conv2D *c) { + tensor_free(c->weight); + tensor_free(c->bias); + tensor_free(c->grad_weight); + tensor_free(c->grad_bias); + tensor_free(c->m_weight); + tensor_free(c->v_weight); + tensor_free(c->m_bias); + tensor_free(c->v_bias); + tensor_free(c->input_cache); +} + +/* Forward: input [batch, in_ch, H, W] -> output [batch, out_ch, H, W] (same padding) */ +static Tensor *conv2d_forward(Conv2D *c, Tensor *input, int training) { + int batch = input->shape[0]; + int in_ch = c->in_ch, out_ch = c->out_ch; + int H = input->shape[2], W = input->shape[3]; + int kh = c->kh, kw = c->kw; + int ph = kh / 2, pw = kw / 2; + + if (training) { + tensor_free(c->input_cache); + c->input_cache = tensor_alloc(input->ndim, input->shape); + memcpy(c->input_cache->data, input->data, (size_t)input->size * sizeof(float)); + } + + int oshape[] = {batch, out_ch, H, W}; + Tensor *out = tensor_alloc(4, oshape); + + for (int b = 0; b < batch; b++) { + for (int oc = 0; oc < out_ch; oc++) { + for (int oh = 0; oh < H; oh++) { + for (int ow = 0; ow < W; ow++) { + float sum = c->bias->data[oc]; + for (int ic = 0; ic < in_ch; ic++) { + for (int fh = 0; fh < kh; fh++) { + for (int fw = 0; fw < kw; fw++) { + int ih = oh + fh - ph; + int iw = ow + fw - pw; + if (ih >= 0 && ih < H && iw >= 0 && iw < W) { + float inp = input->data[((b * in_ch + ic) * H + ih) * W + iw]; + float wt = c->weight->data[((oc * in_ch + ic) * kh + fh) * kw + fw]; + sum += inp * wt; + } + } + } + } + out->data[((b * out_ch + oc) * H + oh) * W + ow] = sum; + } + } + } + } + return out; +} + +/* Backward: grad_output [batch, out_ch, H, W] -> grad_input [batch, in_ch, H, W] */ +static Tensor *conv2d_backward(Conv2D *c, Tensor *grad_output) { + Tensor *input = c->input_cache; + int batch = input->shape[0]; + int in_ch = c->in_ch, out_ch = c->out_ch; + int H = input->shape[2], W = input->shape[3]; + int kh = c->kh, kw = c->kw; + int ph = kh / 2, pw = kw / 2; + + Tensor *grad_input = tensor_zeros(input->ndim, input->shape); + + for (int b = 0; b < batch; b++) { + for (int oc = 0; oc < out_ch; oc++) { + for (int oh = 0; oh < H; oh++) { + for (int ow = 0; ow < W; ow++) { + float go = grad_output->data[((b * out_ch + oc) * H + oh) * W + ow]; + c->grad_bias->data[oc] += go; + for (int ic = 0; ic < in_ch; ic++) { + for (int fh = 0; fh < kh; fh++) { + for (int fw = 0; fw < kw; fw++) { + int ih = oh + fh - ph; + int iw = ow + fw - pw; + if (ih >= 0 && ih < H && iw >= 0 && iw < W) { + float inp = input->data[((b * in_ch + ic) * H + ih) * W + iw]; + c->grad_weight->data[((oc * in_ch + ic) * kh + fh) * kw + fw] += go * inp; + grad_input->data[((b * in_ch + ic) * H + ih) * W + iw] += + go * c->weight->data[((oc * in_ch + ic) * kh + fh) * kw + fw]; + } + } + } + } + } + } + } + } + return grad_input; +} + +/* ---- Dense forward/backward ---- */ + +static void dense_init(Dense *d, int in_f, int out_f) { + d->in_features = in_f; + d->out_features = out_f; + + int wshape[] = {out_f, in_f}; + int bshape[] = {out_f}; + + d->weight = tensor_alloc(2, wshape); + d->bias = tensor_zeros(1, bshape); + d->grad_weight = tensor_zeros(2, wshape); + d->grad_bias = tensor_zeros(1, bshape); + d->m_weight = tensor_zeros(2, wshape); + d->v_weight = tensor_zeros(2, wshape); + d->m_bias = tensor_zeros(1, bshape); + d->v_bias = tensor_zeros(1, bshape); + d->input_cache = NULL; + + he_init(d->weight, in_f); +} + +static void dense_free(Dense *d) { + tensor_free(d->weight); + tensor_free(d->bias); + tensor_free(d->grad_weight); + tensor_free(d->grad_bias); + tensor_free(d->m_weight); + tensor_free(d->v_weight); + tensor_free(d->m_bias); + tensor_free(d->v_bias); + tensor_free(d->input_cache); +} + +/* Forward: input [batch, in_f] -> output [batch, out_f] */ +static Tensor *dense_forward(Dense *d, Tensor *input, int training) { + int batch = input->shape[0]; + int in_f = d->in_features, out_f = d->out_features; + + if (training) { + tensor_free(d->input_cache); + d->input_cache = tensor_alloc(input->ndim, input->shape); + memcpy(d->input_cache->data, input->data, (size_t)input->size * sizeof(float)); + } + + int oshape[] = {batch, out_f}; + Tensor *out = tensor_alloc(2, oshape); + + for (int b = 0; b < batch; b++) { + for (int o = 0; o < out_f; o++) { + float sum = d->bias->data[o]; + for (int i = 0; i < in_f; i++) { + sum += input->data[b * in_f + i] * d->weight->data[o * in_f + i]; + } + out->data[b * out_f + o] = sum; + } + } + return out; +} + +/* Backward: grad_output [batch, out_f] -> grad_input [batch, in_f] */ +static Tensor *dense_backward(Dense *d, Tensor *grad_output) { + Tensor *input = d->input_cache; + int batch = input->shape[0]; + int in_f = d->in_features, out_f = d->out_features; + + int gshape[] = {batch, in_f}; + Tensor *grad_input = tensor_zeros(2, gshape); + + for (int b = 0; b < batch; b++) { + for (int o = 0; o < out_f; o++) { + float go = grad_output->data[b * out_f + o]; + d->grad_bias->data[o] += go; + for (int i = 0; i < in_f; i++) { + d->grad_weight->data[o * in_f + i] += go * input->data[b * in_f + i]; + grad_input->data[b * in_f + i] += go * d->weight->data[o * in_f + i]; + } + } + } + return grad_input; +} + +/* ---- LeakyReLU helpers on tensors ---- */ + +static Tensor *apply_leaky_relu(Tensor *input) { + Tensor *out = tensor_alloc(input->ndim, input->shape); + for (int i = 0; i < input->size; i++) + out->data[i] = leaky_relu(input->data[i]); + return out; +} + +static Tensor *apply_leaky_relu_backward(Tensor *grad_output, Tensor *pre_activation) { + Tensor *grad = tensor_alloc(grad_output->ndim, grad_output->shape); + for (int i = 0; i < grad_output->size; i++) + grad->data[i] = grad_output->data[i] * leaky_relu_grad(pre_activation->data[i]); + return grad; +} + +/* ---- Sigmoid on tensor ---- */ + +static Tensor *apply_sigmoid(Tensor *input) { + Tensor *out = tensor_alloc(input->ndim, input->shape); + for (int i = 0; i < input->size; i++) + out->data[i] = sigmoid_f(input->data[i]); + return out; +} + +/* ---- Adam step for a single parameter tensor ---- */ + +static void adam_update(Tensor *param, Tensor *grad, Tensor *m, Tensor *v, + float lr, float beta1, float beta2, float eps, int t) { + float bc1 = 1.0f - powf(beta1, (float)t); + float bc2 = 1.0f - powf(beta2, (float)t); + + for (int i = 0; i < param->size; i++) { + m->data[i] = beta1 * m->data[i] + (1.0f - beta1) * grad->data[i]; + v->data[i] = beta2 * v->data[i] + (1.0f - beta2) * grad->data[i] * grad->data[i]; + float m_hat = m->data[i] / bc1; + float v_hat = v->data[i] / bc2; + param->data[i] -= lr * m_hat / (sqrtf(v_hat) + eps); + } +} + +/* ---- Network ---- */ + +Network *network_create(void) { + rng_seed((uint64_t)time(NULL) ^ 0xDEADBEEF); + + Network *net = calloc(1, sizeof(Network)); + conv2d_init(&net->conv1, 1, 12, 3, 3); + conv2d_init(&net->conv2, 12, 16, 3, 3); + dense_init(&net->fc1, 4800, 24); + dense_init(&net->head_shape, 24, 10); + dense_init(&net->head_ytype, 24, 1); + dense_init(&net->head_lowheight, 24, 1); + return net; +} + +void network_free(Network *net) { + if (!net) return; + conv2d_free(&net->conv1); + conv2d_free(&net->conv2); + dense_free(&net->fc1); + dense_free(&net->head_shape); + dense_free(&net->head_ytype); + dense_free(&net->head_lowheight); + tensor_free(net->act_conv1); + tensor_free(net->act_relu1); + tensor_free(net->act_conv2); + tensor_free(net->act_relu2); + tensor_free(net->act_flat); + tensor_free(net->act_fc1); + tensor_free(net->act_relu3); + tensor_free(net->out_shape); + tensor_free(net->out_ytype); + tensor_free(net->out_lowheight); + free(net); +} + +static void free_activations(Network *net) { + tensor_free(net->act_conv1); net->act_conv1 = NULL; + tensor_free(net->act_relu1); net->act_relu1 = NULL; + tensor_free(net->act_conv2); net->act_conv2 = NULL; + tensor_free(net->act_relu2); net->act_relu2 = NULL; + tensor_free(net->act_flat); net->act_flat = NULL; + tensor_free(net->act_fc1); net->act_fc1 = NULL; + tensor_free(net->act_relu3); net->act_relu3 = NULL; + tensor_free(net->out_shape); net->out_shape = NULL; + tensor_free(net->out_ytype); net->out_ytype = NULL; + tensor_free(net->out_lowheight); net->out_lowheight = NULL; +} + +void network_forward(Network *net, Tensor *input, int training) { + free_activations(net); + + /* Conv1 -> LeakyReLU */ + net->act_conv1 = conv2d_forward(&net->conv1, input, training); + net->act_relu1 = apply_leaky_relu(net->act_conv1); + + /* Conv2 -> LeakyReLU */ + net->act_conv2 = conv2d_forward(&net->conv2, net->act_relu1, training); + net->act_relu2 = apply_leaky_relu(net->act_conv2); + + /* Flatten: [batch, 16, 20, 15] -> [batch, 4800] */ + int batch = net->act_relu2->shape[0]; + int flat_size = net->act_relu2->size / batch; + int fshape[] = {batch, flat_size}; + net->act_flat = tensor_alloc(2, fshape); + memcpy(net->act_flat->data, net->act_relu2->data, (size_t)net->act_relu2->size * sizeof(float)); + + /* FC1 -> LeakyReLU */ + net->act_fc1 = dense_forward(&net->fc1, net->act_flat, training); + net->act_relu3 = apply_leaky_relu(net->act_fc1); + + /* Three heads with sigmoid */ + Tensor *logit_shape = dense_forward(&net->head_shape, net->act_relu3, training); + Tensor *logit_ytype = dense_forward(&net->head_ytype, net->act_relu3, training); + Tensor *logit_lowheight = dense_forward(&net->head_lowheight, net->act_relu3, training); + + net->out_shape = apply_sigmoid(logit_shape); + net->out_ytype = apply_sigmoid(logit_ytype); + net->out_lowheight = apply_sigmoid(logit_lowheight); + + tensor_free(logit_shape); + tensor_free(logit_ytype); + tensor_free(logit_lowheight); +} + +void network_backward(Network *net, Tensor *target_shape, Tensor *target_ytype, Tensor *target_lowheight) { + int batch = net->out_shape->shape[0]; + + /* BCE gradient at sigmoid: d_logit = pred - target */ + /* Head: shape (10 outputs) */ + int gs[] = {batch, 10}; + Tensor *grad_logit_shape = tensor_alloc(2, gs); + for (int i = 0; i < batch * 10; i++) + grad_logit_shape->data[i] = (net->out_shape->data[i] - target_shape->data[i]) / (float)batch; + + int gy[] = {batch, 1}; + Tensor *grad_logit_ytype = tensor_alloc(2, gy); + for (int i = 0; i < batch; i++) + grad_logit_ytype->data[i] = (net->out_ytype->data[i] - target_ytype->data[i]) / (float)batch; + + Tensor *grad_logit_lh = tensor_alloc(2, gy); + for (int i = 0; i < batch; i++) + grad_logit_lh->data[i] = (net->out_lowheight->data[i] - target_lowheight->data[i]) / (float)batch; + + /* Backward through heads */ + Tensor *grad_relu3_s = dense_backward(&net->head_shape, grad_logit_shape); + Tensor *grad_relu3_y = dense_backward(&net->head_ytype, grad_logit_ytype); + Tensor *grad_relu3_l = dense_backward(&net->head_lowheight, grad_logit_lh); + + /* Sum gradients from three heads */ + int r3shape[] = {batch, 24}; + Tensor *grad_relu3 = tensor_zeros(2, r3shape); + for (int i = 0; i < batch * 24; i++) + grad_relu3->data[i] = grad_relu3_s->data[i] + grad_relu3_y->data[i] + grad_relu3_l->data[i]; + + tensor_free(grad_logit_shape); + tensor_free(grad_logit_ytype); + tensor_free(grad_logit_lh); + tensor_free(grad_relu3_s); + tensor_free(grad_relu3_y); + tensor_free(grad_relu3_l); + + /* LeakyReLU backward (fc1 output) */ + Tensor *grad_fc1_out = apply_leaky_relu_backward(grad_relu3, net->act_fc1); + tensor_free(grad_relu3); + + /* Dense fc1 backward */ + Tensor *grad_flat = dense_backward(&net->fc1, grad_fc1_out); + tensor_free(grad_fc1_out); + + /* Unflatten: [batch, 4800] -> [batch, 16, 20, 15] */ + int ushape[] = {batch, 16, 20, 15}; + Tensor *grad_relu2 = tensor_alloc(4, ushape); + memcpy(grad_relu2->data, grad_flat->data, (size_t)grad_flat->size * sizeof(float)); + tensor_free(grad_flat); + + /* LeakyReLU backward (conv2 output) */ + Tensor *grad_conv2_out = apply_leaky_relu_backward(grad_relu2, net->act_conv2); + tensor_free(grad_relu2); + + /* Conv2 backward */ + Tensor *grad_relu1 = conv2d_backward(&net->conv2, grad_conv2_out); + tensor_free(grad_conv2_out); + + /* LeakyReLU backward (conv1 output) */ + Tensor *grad_conv1_out = apply_leaky_relu_backward(grad_relu1, net->act_conv1); + tensor_free(grad_relu1); + + /* Conv1 backward */ + Tensor *grad_input = conv2d_backward(&net->conv1, grad_conv1_out); + tensor_free(grad_conv1_out); + tensor_free(grad_input); +} + +void network_adam_step(Network *net, float lr, float beta1, float beta2, float eps, int t) { + adam_update(net->conv1.weight, net->conv1.grad_weight, net->conv1.m_weight, net->conv1.v_weight, lr, beta1, beta2, eps, t); + adam_update(net->conv1.bias, net->conv1.grad_bias, net->conv1.m_bias, net->conv1.v_bias, lr, beta1, beta2, eps, t); + adam_update(net->conv2.weight, net->conv2.grad_weight, net->conv2.m_weight, net->conv2.v_weight, lr, beta1, beta2, eps, t); + adam_update(net->conv2.bias, net->conv2.grad_bias, net->conv2.m_bias, net->conv2.v_bias, lr, beta1, beta2, eps, t); + adam_update(net->fc1.weight, net->fc1.grad_weight, net->fc1.m_weight, net->fc1.v_weight, lr, beta1, beta2, eps, t); + adam_update(net->fc1.bias, net->fc1.grad_bias, net->fc1.m_bias, net->fc1.v_bias, lr, beta1, beta2, eps, t); + adam_update(net->head_shape.weight, net->head_shape.grad_weight, net->head_shape.m_weight, net->head_shape.v_weight, lr, beta1, beta2, eps, t); + adam_update(net->head_shape.bias, net->head_shape.grad_bias, net->head_shape.m_bias, net->head_shape.v_bias, lr, beta1, beta2, eps, t); + adam_update(net->head_ytype.weight, net->head_ytype.grad_weight, net->head_ytype.m_weight, net->head_ytype.v_weight, lr, beta1, beta2, eps, t); + adam_update(net->head_ytype.bias, net->head_ytype.grad_bias, net->head_ytype.m_bias, net->head_ytype.v_bias, lr, beta1, beta2, eps, t); + adam_update(net->head_lowheight.weight, net->head_lowheight.grad_weight, net->head_lowheight.m_weight, net->head_lowheight.v_weight, lr, beta1, beta2, eps, t); + adam_update(net->head_lowheight.bias, net->head_lowheight.grad_bias, net->head_lowheight.m_bias, net->head_lowheight.v_bias, lr, beta1, beta2, eps, t); +} + +void network_zero_grad(Network *net) { + memset(net->conv1.grad_weight->data, 0, (size_t)net->conv1.grad_weight->size * sizeof(float)); + memset(net->conv1.grad_bias->data, 0, (size_t)net->conv1.grad_bias->size * sizeof(float)); + memset(net->conv2.grad_weight->data, 0, (size_t)net->conv2.grad_weight->size * sizeof(float)); + memset(net->conv2.grad_bias->data, 0, (size_t)net->conv2.grad_bias->size * sizeof(float)); + memset(net->fc1.grad_weight->data, 0, (size_t)net->fc1.grad_weight->size * sizeof(float)); + memset(net->fc1.grad_bias->data, 0, (size_t)net->fc1.grad_bias->size * sizeof(float)); + memset(net->head_shape.grad_weight->data, 0, (size_t)net->head_shape.grad_weight->size * sizeof(float)); + memset(net->head_shape.grad_bias->data, 0, (size_t)net->head_shape.grad_bias->size * sizeof(float)); + memset(net->head_ytype.grad_weight->data, 0, (size_t)net->head_ytype.grad_weight->size * sizeof(float)); + memset(net->head_ytype.grad_bias->data, 0, (size_t)net->head_ytype.grad_bias->size * sizeof(float)); + memset(net->head_lowheight.grad_weight->data, 0, (size_t)net->head_lowheight.grad_weight->size * sizeof(float)); + memset(net->head_lowheight.grad_bias->data, 0, (size_t)net->head_lowheight.grad_bias->size * sizeof(float)); +} + +float network_bce_loss(Network *net, Tensor *target_shape, Tensor *target_ytype, Tensor *target_lowheight) { + float loss = 0.0f; + int batch = net->out_shape->shape[0]; + + for (int i = 0; i < batch * 10; i++) { + float p = net->out_shape->data[i]; + float t = target_shape->data[i]; + p = fmaxf(1e-7f, fminf(1.0f - 1e-7f, p)); + loss -= t * logf(p) + (1.0f - t) * logf(1.0f - p); + } + for (int i = 0; i < batch; i++) { + float p = net->out_ytype->data[i]; + float t = target_ytype->data[i]; + p = fmaxf(1e-7f, fminf(1.0f - 1e-7f, p)); + loss -= t * logf(p) + (1.0f - t) * logf(1.0f - p); + } + for (int i = 0; i < batch; i++) { + float p = net->out_lowheight->data[i]; + float t = target_lowheight->data[i]; + p = fmaxf(1e-7f, fminf(1.0f - 1e-7f, p)); + loss -= t * logf(p) + (1.0f - t) * logf(1.0f - p); + } + + return loss / (float)batch; +} + +void network_infer(Network *net, const float *input300, float *output12) { + int ishape[] = {1, 1, 20, 15}; + Tensor *input = tensor_alloc(4, ishape); + memcpy(input->data, input300, 300 * sizeof(float)); + + network_forward(net, input, 0); + + /* output order: A,B,C,D,E,F,G,H,J,K, ytype, lowheight */ + for (int i = 0; i < 10; i++) + output12[i] = net->out_shape->data[i]; + output12[10] = net->out_ytype->data[0]; + output12[11] = net->out_lowheight->data[0]; + + tensor_free(input); +} diff --git a/Autokem/nn.h b/Autokem/nn.h new file mode 100644 index 0000000..9769be5 --- /dev/null +++ b/Autokem/nn.h @@ -0,0 +1,90 @@ +#ifndef NN_H +#define NN_H + +#include + +/* ---- Tensor ---- */ + +typedef struct { + float *data; + int shape[4]; /* up to 4 dims */ + int ndim; + int size; /* total number of elements */ +} Tensor; + +Tensor *tensor_alloc(int ndim, const int *shape); +Tensor *tensor_zeros(int ndim, const int *shape); +void tensor_free(Tensor *t); + +/* ---- Layers ---- */ + +typedef struct { + int in_ch, out_ch, kh, kw; + Tensor *weight; /* [out_ch, in_ch, kh, kw] */ + Tensor *bias; /* [out_ch] */ + Tensor *grad_weight; + Tensor *grad_bias; + /* Adam moments */ + Tensor *m_weight, *v_weight; + Tensor *m_bias, *v_bias; + /* cached input for backward */ + Tensor *input_cache; +} Conv2D; + +typedef struct { + int in_features, out_features; + Tensor *weight; /* [out_features, in_features] */ + Tensor *bias; /* [out_features] */ + Tensor *grad_weight; + Tensor *grad_bias; + Tensor *m_weight, *v_weight; + Tensor *m_bias, *v_bias; + Tensor *input_cache; +} Dense; + +/* ---- Network ---- */ + +typedef struct { + Conv2D conv1; /* 1->12, 3x3 */ + Conv2D conv2; /* 12->16, 3x3 */ + Dense fc1; /* 4800->24 */ + Dense head_shape; /* 24->10 (bits A-H, J, K) */ + Dense head_ytype; /* 24->1 */ + Dense head_lowheight;/* 24->1 */ + + /* activation caches (allocated per forward) */ + Tensor *act_conv1; + Tensor *act_relu1; + Tensor *act_conv2; + Tensor *act_relu2; + Tensor *act_flat; + Tensor *act_fc1; + Tensor *act_relu3; + Tensor *out_shape; + Tensor *out_ytype; + Tensor *out_lowheight; +} Network; + +/* Init / free */ +Network *network_create(void); +void network_free(Network *net); + +/* Forward pass. input: [batch, 1, 20, 15]. Outputs stored in net->out_* */ +void network_forward(Network *net, Tensor *input, int training); + +/* Backward pass. targets: shape[batch,10], ytype[batch,1], lowheight[batch,1] */ +void network_backward(Network *net, Tensor *target_shape, Tensor *target_ytype, Tensor *target_lowheight); + +/* Adam update step */ +void network_adam_step(Network *net, float lr, float beta1, float beta2, float eps, int t); + +/* Zero all gradients */ +void network_zero_grad(Network *net); + +/* Compute BCE loss (sum of all heads) */ +float network_bce_loss(Network *net, Tensor *target_shape, Tensor *target_ytype, Tensor *target_lowheight); + +/* Single-sample inference: input float[300], output float[12] (A-H,J,K,ytype,lowheight) */ +void network_infer(Network *net, const float *input300, float *output12); + +#endif diff --git a/Autokem/safetensor.c b/Autokem/safetensor.c new file mode 100644 index 0000000..6873901 --- /dev/null +++ b/Autokem/safetensor.c @@ -0,0 +1,269 @@ +#include "safetensor.h" +#include +#include +#include +#include + +/* Tensor registry entry */ +typedef struct { + const char *name; + float *data; + int size; + int ndim; + int shape[4]; +} TensorEntry; + +static void collect_tensors(Network *net, TensorEntry *entries, int *count) { + int n = 0; +#define ADD(nm, layer, field) do { \ + entries[n].name = nm; \ + entries[n].data = net->layer.field->data; \ + entries[n].size = net->layer.field->size; \ + entries[n].ndim = net->layer.field->ndim; \ + for (int i = 0; i < net->layer.field->ndim; i++) \ + entries[n].shape[i] = net->layer.field->shape[i]; \ + n++; \ +} while(0) + + ADD("conv1.weight", conv1, weight); + ADD("conv1.bias", conv1, bias); + ADD("conv2.weight", conv2, weight); + ADD("conv2.bias", conv2, bias); + ADD("fc1.weight", fc1, weight); + ADD("fc1.bias", fc1, bias); + ADD("head_shape.weight", head_shape, weight); + ADD("head_shape.bias", head_shape, bias); + ADD("head_ytype.weight", head_ytype, weight); + ADD("head_ytype.bias", head_ytype, bias); + ADD("head_lowheight.weight", head_lowheight, weight); + ADD("head_lowheight.bias", head_lowheight, bias); +#undef ADD + *count = n; +} + +int safetensor_save(const char *path, Network *net, int total_samples, int epochs, float val_loss) { + TensorEntry entries[12]; + int count; + collect_tensors(net, entries, &count); + + /* Build JSON header */ + char header[8192]; + int pos = 0; + pos += snprintf(header + pos, sizeof(header) - (size_t)pos, "{"); + + /* metadata */ + pos += snprintf(header + pos, sizeof(header) - (size_t)pos, + "\"__metadata__\":{\"samples\":\"%d\",\"epochs\":\"%d\",\"val_loss\":\"%.6f\"},", + total_samples, epochs, (double)val_loss); + + /* tensor entries */ + size_t data_offset = 0; + for (int i = 0; i < count; i++) { + size_t byte_size = (size_t)entries[i].size * sizeof(float); + + pos += snprintf(header + pos, sizeof(header) - (size_t)pos, + "\"%s\":{\"dtype\":\"F32\",\"shape\":[", entries[i].name); + + for (int d = 0; d < entries[i].ndim; d++) { + if (d > 0) pos += snprintf(header + pos, sizeof(header) - (size_t)pos, ","); + pos += snprintf(header + pos, sizeof(header) - (size_t)pos, "%d", entries[i].shape[d]); + } + + pos += snprintf(header + pos, sizeof(header) - (size_t)pos, + "],\"data_offsets\":[%zu,%zu]}", data_offset, data_offset + byte_size); + + if (i < count - 1) + pos += snprintf(header + pos, sizeof(header) - (size_t)pos, ","); + + data_offset += byte_size; + } + + pos += snprintf(header + pos, sizeof(header) - (size_t)pos, "}"); + + /* Pad header to 8-byte alignment */ + size_t header_len = (size_t)pos; + size_t padded = (header_len + 7) & ~(size_t)7; + while (header_len < padded) { + header[header_len++] = ' '; + } + + FILE *f = fopen(path, "wb"); + if (!f) { + fprintf(stderr, "Error: cannot open %s for writing\n", path); + return -1; + } + + /* 8-byte LE header length */ + uint64_t hlen = (uint64_t)header_len; + fwrite(&hlen, 8, 1, f); + + /* JSON header */ + fwrite(header, 1, header_len, f); + + /* Raw tensor data */ + for (int i = 0; i < count; i++) { + fwrite(entries[i].data, sizeof(float), (size_t)entries[i].size, f); + } + + fclose(f); + printf("Saved model to %s (%zu bytes)\n", path, 8 + header_len + data_offset); + return 0; +} + +/* Minimal JSON parser: find tensor by name, extract data_offsets */ +static int find_tensor_offsets(const char *json, size_t json_len, const char *name, + size_t *off_start, size_t *off_end) { + /* Search for "name": */ + size_t nlen = strlen(name); + for (size_t i = 0; i + nlen + 3 < json_len; i++) { + if (json[i] == '"' && strncmp(json + i + 1, name, nlen) == 0 && json[i + 1 + nlen] == '"') { + /* Found the key, now find data_offsets */ + const char *doff = strstr(json + i, "\"data_offsets\""); + if (!doff || (size_t)(doff - json) > json_len) return -1; + const char *bracket = strchr(doff, '['); + if (!bracket) return -1; + if (sscanf(bracket, "[%zu,%zu]", off_start, off_end) != 2) return -1; + return 0; + } + } + return -1; +} + +int safetensor_load(const char *path, Network *net) { + FILE *f = fopen(path, "rb"); + if (!f) { + fprintf(stderr, "Error: cannot open %s\n", path); + return -1; + } + + uint64_t header_len; + if (fread(&header_len, 8, 1, f) != 1) { fclose(f); return -1; } + + char *json = malloc((size_t)header_len + 1); + if (fread(json, 1, (size_t)header_len, f) != (size_t)header_len) { + free(json); + fclose(f); + return -1; + } + json[header_len] = '\0'; + + long data_start = 8 + (long)header_len; + + TensorEntry entries[12]; + int count; + collect_tensors(net, entries, &count); + + for (int i = 0; i < count; i++) { + size_t off_start, off_end; + if (find_tensor_offsets(json, (size_t)header_len, entries[i].name, &off_start, &off_end) != 0) { + fprintf(stderr, "Error: tensor '%s' not found in %s\n", entries[i].name, path); + free(json); + fclose(f); + return -1; + } + + size_t byte_size = off_end - off_start; + if (byte_size != (size_t)entries[i].size * sizeof(float)) { + fprintf(stderr, "Error: size mismatch for '%s': expected %zu, got %zu\n", + entries[i].name, (size_t)entries[i].size * sizeof(float), byte_size); + free(json); + fclose(f); + return -1; + } + + fseek(f, data_start + (long)off_start, SEEK_SET); + if (fread(entries[i].data, 1, byte_size, f) != byte_size) { + fprintf(stderr, "Error: failed to read tensor '%s'\n", entries[i].name); + free(json); + fclose(f); + return -1; + } + } + + free(json); + fclose(f); + return 0; +} + +int safetensor_stats(const char *path) { + FILE *f = fopen(path, "rb"); + if (!f) { + fprintf(stderr, "Error: cannot open %s\n", path); + return -1; + } + + uint64_t header_len; + if (fread(&header_len, 8, 1, f) != 1) { fclose(f); return -1; } + + char *json = malloc((size_t)header_len + 1); + if (fread(json, 1, (size_t)header_len, f) != (size_t)header_len) { + free(json); + fclose(f); + return -1; + } + json[header_len] = '\0'; + fclose(f); + + printf("Model: %s\n", path); + printf("Header length: %lu bytes\n", (unsigned long)header_len); + + /* Extract a JSON string value: find "key":"value" and return value */ + /* Helper: find value for key within metadata block */ + const char *meta = strstr(json, "\"__metadata__\""); + if (meta) { + const char *keys[] = {"samples", "epochs", "val_loss"}; + const char *labels[] = {"Training samples", "Epochs", "Validation loss"}; + for (int k = 0; k < 3; k++) { + char search[64]; + snprintf(search, sizeof(search), "\"%s\"", keys[k]); + const char *found = strstr(meta, search); + if (!found) continue; + /* skip past key and colon to opening quote of value */ + const char *colon = strchr(found + strlen(search), ':'); + if (!colon) continue; + const char *vstart = strchr(colon, '"'); + if (!vstart) continue; + vstart++; + const char *vend = strchr(vstart, '"'); + if (!vend) continue; + printf("%s: %.*s\n", labels[k], (int)(vend - vstart), vstart); + } + } + + /* List tensors */ + const char *tensor_names[] = { + "conv1.weight", "conv1.bias", "conv2.weight", "conv2.bias", + "fc1.weight", "fc1.bias", + "head_shape.weight", "head_shape.bias", + "head_ytype.weight", "head_ytype.bias", + "head_lowheight.weight", "head_lowheight.bias" + }; + + int total_params = 0; + printf("\nTensors:\n"); + for (int i = 0; i < 12; i++) { + size_t off_start, off_end; + if (find_tensor_offsets(json, (size_t)header_len, tensor_names[i], &off_start, &off_end) == 0) { + int params = (int)(off_end - off_start) / 4; + total_params += params; + + /* Extract shape */ + const char *key = strstr(json, tensor_names[i]); + if (key) { + const char *shp = strstr(key, "\"shape\""); + if (shp) { + const char *br = strchr(shp, '['); + const char *bre = strchr(shp, ']'); + if (br && bre) { + printf(" %-28s shape=[%.*s] params=%d\n", + tensor_names[i], (int)(bre - br - 1), br + 1, params); + } + } + } + } + } + printf("\nTotal parameters: %d (%.1f KB as float32)\n", total_params, (float)total_params * 4.0f / 1024.0f); + + free(json); + return 0; +} diff --git a/Autokem/safetensor.h b/Autokem/safetensor.h new file mode 100644 index 0000000..57e8273 --- /dev/null +++ b/Autokem/safetensor.h @@ -0,0 +1,16 @@ +#ifndef SAFETENSOR_H +#define SAFETENSOR_H + +#include "nn.h" + +/* Save network weights to .safetensors format. + metadata: optional string pairs (key,value,...,NULL) */ +int safetensor_save(const char *path, Network *net, int total_samples, int epochs, float val_loss); + +/* Load network weights from .safetensors file. */ +int safetensor_load(const char *path, Network *net); + +/* Print model stats from .safetensors file. */ +int safetensor_stats(const char *path); + +#endif diff --git a/Autokem/tga.c b/Autokem/tga.c new file mode 100644 index 0000000..4f1adb5 --- /dev/null +++ b/Autokem/tga.c @@ -0,0 +1,105 @@ +#include "tga.h" +#include +#include + +TgaImage *tga_read(const char *path) { + FILE *f = fopen(path, "rb"); + if (!f) return NULL; + + uint8_t header[18]; + if (fread(header, 1, 18, f) != 18) { fclose(f); return NULL; } + + uint8_t id_length = header[0]; + uint8_t colour_map_type = header[1]; + uint8_t image_type = header[2]; + /* skip colour map spec (bytes 3-7) */ + /* image spec starts at byte 8 */ + uint16_t width = header[12] | (header[13] << 8); + uint16_t height = header[14] | (header[15] << 8); + uint8_t bpp = header[16]; + uint8_t descriptor = header[17]; + + if (colour_map_type != 0 || image_type != 2 || bpp != 32) { + fclose(f); + return NULL; + } + + int top_to_bottom = (descriptor & 0x20) != 0; + + /* skip image ID */ + if (id_length > 0) fseek(f, id_length, SEEK_CUR); + + long pixel_data_offset = 18 + id_length; + + TgaImage *img = malloc(sizeof(TgaImage)); + if (!img) { fclose(f); return NULL; } + + img->width = width; + img->height = height; + img->pixel_data_offset = pixel_data_offset; + img->top_to_bottom = top_to_bottom; + img->pixels = malloc((size_t)width * height * sizeof(uint32_t)); + if (!img->pixels) { free(img); fclose(f); return NULL; } + + for (int row = 0; row < height; row++) { + int y = top_to_bottom ? row : (height - 1 - row); + for (int x = 0; x < width; x++) { + uint8_t bgra[4]; + if (fread(bgra, 1, 4, f) != 4) { + free(img->pixels); free(img); fclose(f); + return NULL; + } + /* TGA stores BGRA, convert to RGBA8888 */ + uint32_t r = bgra[2], g = bgra[1], b = bgra[0], a = bgra[3]; + img->pixels[y * width + x] = (r << 24) | (g << 16) | (b << 8) | a; + } + } + + fclose(f); + return img; +} + +uint32_t tga_get_pixel(const TgaImage *img, int x, int y) { + if (x < 0 || x >= img->width || y < 0 || y >= img->height) return 0; + return img->pixels[y * img->width + x]; +} + +int tga_write_pixel(const char *path, TgaImage *img, int x, int y, uint32_t rgba) { + if (x < 0 || x >= img->width || y < 0 || y >= img->height) return -1; + + /* compute file row: reverse the mapping used during read */ + int file_row; + if (img->top_to_bottom) { + file_row = y; + } else { + file_row = img->height - 1 - y; + } + + long offset = img->pixel_data_offset + ((long)file_row * img->width + x) * 4; + + FILE *f = fopen(path, "r+b"); + if (!f) return -1; + + fseek(f, offset, SEEK_SET); + + /* convert RGBA8888 to TGA BGRA */ + uint8_t bgra[4]; + bgra[2] = (rgba >> 24) & 0xFF; /* R */ + bgra[1] = (rgba >> 16) & 0xFF; /* G */ + bgra[0] = (rgba >> 8) & 0xFF; /* B */ + bgra[3] = rgba & 0xFF; /* A */ + + size_t written = fwrite(bgra, 1, 4, f); + fclose(f); + + /* also update in-memory pixel array */ + img->pixels[y * img->width + x] = rgba; + + return (written == 4) ? 0 : -1; +} + +void tga_free(TgaImage *img) { + if (!img) return; + free(img->pixels); + free(img); +} diff --git a/Autokem/tga.h b/Autokem/tga.h new file mode 100644 index 0000000..d5257fe --- /dev/null +++ b/Autokem/tga.h @@ -0,0 +1,33 @@ +#ifndef TGA_H +#define TGA_H + +#include +#include + +typedef struct { + int width; + int height; + uint32_t *pixels; /* RGBA8888: R<<24 | G<<16 | B<<8 | A */ + long pixel_data_offset; /* byte offset of pixel data in file */ + int top_to_bottom; +} TgaImage; + +/* Read an uncompressed 32-bit TGA file. Returns NULL on error. */ +TgaImage *tga_read(const char *path); + +/* Get pixel at (x,y) as RGBA8888. Returns 0 for out-of-bounds. */ +uint32_t tga_get_pixel(const TgaImage *img, int x, int y); + +/* Write a single pixel (RGBA8888) to TGA file on disk at (x,y). + Opens/closes the file internally. */ +int tga_write_pixel(const char *path, TgaImage *img, int x, int y, uint32_t rgba); + +/* Free a TgaImage. */ +void tga_free(TgaImage *img); + +/* tagify: returns 0 if alpha==0, else full pixel value */ +static inline uint32_t tagify(uint32_t pixel) { + return (pixel & 0xFF) == 0 ? 0 : pixel; +} + +#endif diff --git a/Autokem/train.c b/Autokem/train.c new file mode 100644 index 0000000..f22b981 --- /dev/null +++ b/Autokem/train.c @@ -0,0 +1,423 @@ +#include "train.h" +#include "tga.h" +#include "nn.h" +#include "safetensor.h" + +#include +#include +#include +#include +#include +#include + +/* ---- Data sample ---- */ + +typedef struct { + float input[300]; /* 15x20 binary */ + float shape[10]; /* A,B,C,D,E,F,G,H,J,K */ + float ytype; + float lowheight; +} Sample; + +/* ---- Bit extraction from kerning mask ---- */ +/* kerningMask = pixel >> 8 & 0xFFFFFF + * Layout: Red=Y0000000, Green=JK000000, Blue=ABCDEFGH + * After >> 8: bits 23-16 = Red[7:0], bits 15-8 = Green[7:0], bits 7-0 = Blue[7:0] + * Y = bit 23 (already extracted separately as isKernYtype) + * J = bit 15, K = bit 14 + * A = bit 7, B = bit 6, ..., H = bit 0 + */ +static void extract_shape_bits(int kerning_mask, float *shape) { + shape[0] = (float)((kerning_mask >> 7) & 1); /* A */ + shape[1] = (float)((kerning_mask >> 6) & 1); /* B */ + shape[2] = (float)((kerning_mask >> 5) & 1); /* C */ + shape[3] = (float)((kerning_mask >> 4) & 1); /* D */ + shape[4] = (float)((kerning_mask >> 3) & 1); /* E */ + shape[5] = (float)((kerning_mask >> 2) & 1); /* F */ + shape[6] = (float)((kerning_mask >> 1) & 1); /* G */ + shape[7] = (float)((kerning_mask >> 0) & 1); /* H */ + shape[8] = (float)((kerning_mask >> 15) & 1); /* J */ + shape[9] = (float)((kerning_mask >> 14) & 1); /* K */ +} + +/* ---- Collect samples from one TGA ---- */ + +static int collect_from_sheet(const char *path, int is_xyswap, Sample *samples, int max_samples) { + TgaImage *img = tga_read(path); + if (!img) { + fprintf(stderr, "Warning: cannot read %s\n", path); + return 0; + } + + int cell_w = 16, cell_h = 20; + int cols = img->width / cell_w; + int rows = img->height / cell_h; + int total_cells = cols * rows; + int count = 0; + + for (int index = 0; index < total_cells && count < max_samples; index++) { + int cell_x, cell_y; + if (is_xyswap) { + cell_x = (index / cols) * cell_w; + cell_y = (index % cols) * cell_h; + } else { + cell_x = (index % cols) * cell_w; + cell_y = (index / cols) * cell_h; + } + + int tag_x = cell_x + (cell_w - 1); /* rightmost column */ + int tag_y = cell_y; + + /* Read width (5-bit binary from Y+0..Y+4) */ + int width = 0; + for (int y = 0; y < 5; y++) { + if (tga_get_pixel(img, tag_x, tag_y + y) & 0xFF) + width |= (1 << y); + } + if (width == 0) continue; + + /* Read kerning data pixel at Y+6 */ + uint32_t kern_pixel = tagify(tga_get_pixel(img, tag_x, tag_y + 6)); + if ((kern_pixel & 0xFF) == 0) continue; /* no kern data */ + + /* Extract labels */ + int is_kern_ytype = (kern_pixel & 0x80000000u) != 0; + int kerning_mask = (int)((kern_pixel >> 8) & 0xFFFFFF); + + int is_low_height = (tga_get_pixel(img, tag_x, tag_y + 5) & 0xFF) != 0; + + Sample *s = &samples[count]; + extract_shape_bits(kerning_mask, s->shape); + s->ytype = (float)is_kern_ytype; + s->lowheight = (float)is_low_height; + + /* Extract 15x20 binary input from the glyph area */ + for (int gy = 0; gy < 20; gy++) { + for (int gx = 0; gx < 15; gx++) { + uint32_t p = tga_get_pixel(img, cell_x + gx, cell_y + gy); + s->input[gy * 15 + gx] = ((p & 0x80) != 0) ? 1.0f : 0.0f; + } + } + + count++; + } + + tga_free(img); + return count; +} + +/* ---- Fisher-Yates shuffle ---- */ + +static void shuffle_indices(int *arr, int n) { + for (int i = n - 1; i > 0; i--) { + int j = rand() % (i + 1); + int tmp = arr[i]; arr[i] = arr[j]; arr[j] = tmp; + } +} + +/* ---- Copy network weights ---- */ + +static void copy_tensor_data(Tensor *dst, Tensor *src) { + memcpy(dst->data, src->data, (size_t)src->size * sizeof(float)); +} + +static void save_weights(Network *net, Network *best) { + copy_tensor_data(best->conv1.weight, net->conv1.weight); + copy_tensor_data(best->conv1.bias, net->conv1.bias); + copy_tensor_data(best->conv2.weight, net->conv2.weight); + copy_tensor_data(best->conv2.bias, net->conv2.bias); + copy_tensor_data(best->fc1.weight, net->fc1.weight); + copy_tensor_data(best->fc1.bias, net->fc1.bias); + copy_tensor_data(best->head_shape.weight, net->head_shape.weight); + copy_tensor_data(best->head_shape.bias, net->head_shape.bias); + copy_tensor_data(best->head_ytype.weight, net->head_ytype.weight); + copy_tensor_data(best->head_ytype.bias, net->head_ytype.bias); + copy_tensor_data(best->head_lowheight.weight, net->head_lowheight.weight); + copy_tensor_data(best->head_lowheight.bias, net->head_lowheight.bias); +} + +/* ---- Training ---- */ + +int train_model(void) { + const char *assets_dir = "../src/assets"; + const int max_total = 16384; + + Sample *all_samples = calloc((size_t)max_total, sizeof(Sample)); + if (!all_samples) { fprintf(stderr, "Error: out of memory\n"); return 1; } + + int total = 0; + + /* Scan for *_variable.tga files */ + DIR *dir = opendir(assets_dir); + if (!dir) { + fprintf(stderr, "Error: cannot open %s\n", assets_dir); + free(all_samples); + return 1; + } + + struct dirent *ent; + int file_count = 0; + while ((ent = readdir(dir)) != NULL) { + const char *name = ent->d_name; + size_t len = strlen(name); + + /* Must end with _variable.tga */ + if (len < 14) continue; + if (strcmp(name + len - 13, "_variable.tga") != 0) continue; + + /* Skip extrawide */ + if (strstr(name, "extrawide") != NULL) continue; + + /* Check for xyswap */ + int is_xyswap = (strstr(name, "xyswap") != NULL); + + char fullpath[512]; + snprintf(fullpath, sizeof(fullpath), "%s/%s", assets_dir, name); + + int got = collect_from_sheet(fullpath, is_xyswap, all_samples + total, max_total - total); + if (got > 0) { + printf(" %s: %d samples\n", name, got); + total += got; + file_count++; + } + } + closedir(dir); + + printf("Collected %d samples from %d sheets\n", total, file_count); + if (total < 10) { + fprintf(stderr, "Error: too few samples to train\n"); + free(all_samples); + return 1; + } + + /* Print label distribution */ + { + const char *bit_names[] = {"A","B","C","D","E","F","G","H","J","K","Ytype","LowH"}; + int counts[12] = {0}; + int nonzero_input = 0; + for (int i = 0; i < total; i++) { + for (int b = 0; b < 10; b++) + counts[b] += (int)all_samples[i].shape[b]; + counts[10] += (int)all_samples[i].ytype; + counts[11] += (int)all_samples[i].lowheight; + for (int p = 0; p < 300; p++) + if (all_samples[i].input[p] > 0.5f) { nonzero_input++; break; } + } + printf("Label distribution:\n "); + for (int b = 0; b < 12; b++) + printf("%s:%d(%.0f%%) ", bit_names[b], counts[b], 100.0 * counts[b] / total); + printf("\n Non-empty inputs: %d/%d\n\n", nonzero_input, total); + } + + /* Shuffle and split 80/20 */ + srand((unsigned)time(NULL)); + int *indices = malloc((size_t)total * sizeof(int)); + for (int i = 0; i < total; i++) indices[i] = i; + shuffle_indices(indices, total); + + int n_train = (int)(total * 0.8); + int n_val = total - n_train; + printf("Train: %d, Validation: %d\n\n", n_train, n_val); + + /* Create network */ + Network *net = network_create(); + Network *best_net = network_create(); + + int batch_size = 32; + float lr = 0.001f, beta1 = 0.9f, beta2 = 0.999f, eps = 1e-8f; + int max_epochs = 200; + int patience = 10; + float best_val_loss = 1e30f; + int patience_counter = 0; + int best_epoch = 0; + int adam_t = 0; + + for (int epoch = 0; epoch < max_epochs; epoch++) { + /* Shuffle training indices */ + shuffle_indices(indices, n_train); + + float train_loss = 0.0f; + int n_batches = 0; + + /* Training loop */ + for (int start = 0; start < n_train; start += batch_size) { + int bs = (start + batch_size <= n_train) ? batch_size : (n_train - start); + + /* Build batch tensors */ + int ishape[] = {bs, 1, 20, 15}; + Tensor *input = tensor_alloc(4, ishape); + + int sshape[] = {bs, 10}; + Tensor *tgt_shape = tensor_alloc(2, sshape); + + int yshape[] = {bs, 1}; + Tensor *tgt_ytype = tensor_alloc(2, yshape); + Tensor *tgt_lh = tensor_alloc(2, yshape); + + for (int i = 0; i < bs; i++) { + Sample *s = &all_samples[indices[start + i]]; + memcpy(input->data + i * 300, s->input, 300 * sizeof(float)); + memcpy(tgt_shape->data + i * 10, s->shape, 10 * sizeof(float)); + tgt_ytype->data[i] = s->ytype; + tgt_lh->data[i] = s->lowheight; + } + + /* Forward */ + network_zero_grad(net); + network_forward(net, input, 1); + + /* Loss */ + float loss = network_bce_loss(net, tgt_shape, tgt_ytype, tgt_lh); + train_loss += loss; + n_batches++; + + /* Backward */ + network_backward(net, tgt_shape, tgt_ytype, tgt_lh); + + /* Adam step */ + adam_t++; + network_adam_step(net, lr, beta1, beta2, eps, adam_t); + + tensor_free(input); + tensor_free(tgt_shape); + tensor_free(tgt_ytype); + tensor_free(tgt_lh); + } + + train_loss /= (float)n_batches; + + /* Validation */ + float val_loss = 0.0f; + int val_batches = 0; + for (int start = 0; start < n_val; start += batch_size) { + int bs = (start + batch_size <= n_val) ? batch_size : (n_val - start); + + int ishape[] = {bs, 1, 20, 15}; + Tensor *input = tensor_alloc(4, ishape); + + int sshape[] = {bs, 10}; + Tensor *tgt_shape = tensor_alloc(2, sshape); + + int yshape[] = {bs, 1}; + Tensor *tgt_ytype = tensor_alloc(2, yshape); + Tensor *tgt_lh = tensor_alloc(2, yshape); + + for (int i = 0; i < bs; i++) { + Sample *s = &all_samples[indices[n_train + start + i]]; + memcpy(input->data + i * 300, s->input, 300 * sizeof(float)); + memcpy(tgt_shape->data + i * 10, s->shape, 10 * sizeof(float)); + tgt_ytype->data[i] = s->ytype; + tgt_lh->data[i] = s->lowheight; + } + + network_forward(net, input, 0); + val_loss += network_bce_loss(net, tgt_shape, tgt_ytype, tgt_lh); + val_batches++; + + tensor_free(input); + tensor_free(tgt_shape); + tensor_free(tgt_ytype); + tensor_free(tgt_lh); + } + + val_loss /= (float)val_batches; + + printf("Epoch %3d: train_loss=%.4f val_loss=%.4f", epoch + 1, (double)train_loss, (double)val_loss); + + if (val_loss < best_val_loss) { + best_val_loss = val_loss; + best_epoch = epoch + 1; + patience_counter = 0; + save_weights(net, best_net); + printf(" *best*"); + } else { + patience_counter++; + } + printf("\n"); + + if (patience_counter >= patience) { + printf("\nEarly stopping at epoch %d (best epoch: %d)\n", epoch + 1, best_epoch); + break; + } + } + + /* Restore best weights and save */ + save_weights(best_net, net); + safetensor_save("autokem.safetensors", net, total, best_epoch, best_val_loss); + + /* Compute final per-bit accuracy on validation set */ + { + const char *bit_names[] = {"A","B","C","D","E","F","G","H","J","K","Ytype","LowH"}; + int correct_per_bit[12] = {0}; + int total_per_bit = n_val; + int n_examples = 0; + const int max_examples = 8; + + printf("\nGlyph Tags — validation predictions:\n"); + + for (int i = 0; i < n_val; i++) { + Sample *s = &all_samples[indices[n_train + i]]; + float output[12]; + network_infer(net, s->input, output); + + int pred_bits[12], tgt_bits[12]; + int any_mismatch = 0; + for (int b = 0; b < 10; b++) { + pred_bits[b] = output[b] >= 0.5f ? 1 : 0; + tgt_bits[b] = (int)s->shape[b]; + if (pred_bits[b] == tgt_bits[b]) correct_per_bit[b]++; + else any_mismatch = 1; + } + pred_bits[10] = output[10] >= 0.5f ? 1 : 0; + tgt_bits[10] = (int)s->ytype; + if (pred_bits[10] == tgt_bits[10]) correct_per_bit[10]++; + else any_mismatch = 1; + pred_bits[11] = output[11] >= 0.5f ? 1 : 0; + tgt_bits[11] = (int)s->lowheight; + if (pred_bits[11] == tgt_bits[11]) correct_per_bit[11]++; + else any_mismatch = 1; + + /* Print a few examples (mix of correct and mismatched) */ + if (n_examples < max_examples && (any_mismatch || i < 4)) { + /* Build tag string: e.g. "ABCDEFGH(B)" or "AB(Y)" */ + char actual[32] = "", predicted[32] = ""; + int ap = 0, pp = 0; + const char shape_chars[] = "ABCDEFGHJK"; + for (int b = 0; b < 10; b++) { + if (tgt_bits[b]) actual[ap++] = shape_chars[b]; + if (pred_bits[b]) predicted[pp++] = shape_chars[b]; + } + actual[ap] = '\0'; predicted[pp] = '\0'; + + char actual_tag[48], pred_tag[48]; + snprintf(actual_tag, sizeof(actual_tag), "%s%s%s", + ap > 0 ? actual : "(empty)", + tgt_bits[10] ? "(Y)" : "(B)", + tgt_bits[11] ? " low" : ""); + snprintf(pred_tag, sizeof(pred_tag), "%s%s%s", + pp > 0 ? predicted : "(empty)", + pred_bits[10] ? "(Y)" : "(B)", + pred_bits[11] ? " low" : ""); + + printf(" actual=%-20s pred=%-20s %s\n", actual_tag, pred_tag, + any_mismatch ? "MISMATCH" : "ok"); + n_examples++; + } + } + + printf("\nPer-bit accuracy (%d val samples):\n ", n_val); + int total_correct = 0; + for (int b = 0; b < 12; b++) { + printf("%s:%.1f%% ", bit_names[b], 100.0 * correct_per_bit[b] / total_per_bit); + total_correct += correct_per_bit[b]; + } + printf("\n Overall: %d/%d (%.2f%%)\n", + total_correct, n_val * 12, 100.0 * total_correct / (n_val * 12)); + } + + network_free(net); + network_free(best_net); + free(all_samples); + free(indices); + return 0; +} diff --git a/Autokem/train.h b/Autokem/train.h new file mode 100644 index 0000000..c4baf8f --- /dev/null +++ b/Autokem/train.h @@ -0,0 +1,8 @@ +#ifndef TRAIN_H +#define TRAIN_H + +/* Train model on existing spritesheets in ../src/assets/ + Saves to autokem.safetensors */ +int train_model(void); + +#endif