revised autokem model

2026-06-12 08:54:04 +09:00 · 2026-03-08 20:34:45 +09:00
parent 39603d897b
commit 244371aa9d
8 changed files with 774 additions and 235 deletions
--- a/Autokem/nn.c
+++ b/Autokem/nn.c
@@ -71,14 +71,6 @@ static void he_init(Tensor *w, int fan_in) {

 /* ---- Activations ---- */

-static inline float leaky_relu(float x) {
-    return x >= 0.0f ? x : 0.01f * x;
-}
-
-static inline float leaky_relu_grad(float x) {
-    return x >= 0.0f ? 1.0f : 0.01f;
-}
-
 static inline float sigmoid_f(float x) {
    if (x >= 0.0f) {
        float ez = expf(-x);
@@ -89,13 +81,24 @@ static inline float sigmoid_f(float x) {
    }
 }

+static inline float silu_f(float x) {  /* SiLU activation: x * sigmoid(x) */
+    return x * sigmoid_f(x);
+}
+
+static inline float silu_grad(float x) {  /* derivative of x * sigmoid(x) with respect to x */
+    float s = sigmoid_f(x);
+    return s * (1.0f + x * (1.0f - s));  /* algebraically s + x * s * (1 - s) */
+}
+
 /* ---- Conv2D forward/backward ---- */

-static void conv2d_init(Conv2D *c, int in_ch, int out_ch, int kh, int kw) {
+static void conv2d_init(Conv2D *c, int in_ch, int out_ch, int kh, int kw, int pad) {
    c->in_ch = in_ch;
    c->out_ch = out_ch;
    c->kh = kh;
    c->kw = kw;
+    c->pad_h = pad;
+    c->pad_w = pad;

    int wshape[] = {out_ch, in_ch, kh, kw};
    int bshape[] = {out_ch};
@@ -125,13 +128,13 @@ static void conv2d_free(Conv2D *c) {
    tensor_free(c->input_cache);
 }

-/* Forward: input [batch, in_ch, H, W] -> output [batch, out_ch, H, W] (same padding) */
+/* Forward: input [batch, in_ch, H, W] -> output [batch, out_ch, oH, oW] */
 static Tensor *conv2d_forward(Conv2D *c, Tensor *input, int training) {
    int batch = input->shape[0];
    int in_ch = c->in_ch, out_ch = c->out_ch;
    int H = input->shape[2], W = input->shape[3];
    int kh = c->kh, kw = c->kw;
-    int ph = kh / 2, pw = kw / 2;
+    int ph = c->pad_h, pw = c->pad_w;

    if (training) {
        tensor_free(c->input_cache);
@@ -139,13 +142,15 @@ static Tensor *conv2d_forward(Conv2D *c, Tensor *input, int training) {
        memcpy(c->input_cache->data, input->data, (size_t)input->size * sizeof(float));
    }

-    int oshape[] = {batch, out_ch, H, W};
+    int oH = H + 2 * ph - kh + 1;
+    int oW = W + 2 * pw - kw + 1;
+    int oshape[] = {batch, out_ch, oH, oW};
    Tensor *out = tensor_alloc(4, oshape);

    for (int b = 0; b < batch; b++) {
        for (int oc = 0; oc < out_ch; oc++) {
-            for (int oh = 0; oh < H; oh++) {
-                for (int ow = 0; ow < W; ow++) {
+            for (int oh = 0; oh < oH; oh++) {
+                for (int ow = 0; ow < oW; ow++) {
                    float sum = c->bias->data[oc];
                    for (int ic = 0; ic < in_ch; ic++) {
                        for (int fh = 0; fh < kh; fh++) {
@@ -160,7 +165,7 @@ static Tensor *conv2d_forward(Conv2D *c, Tensor *input, int training) {
                            }
                        }
                    }
-                    out->data[((b * out_ch + oc) * H + oh) * W + ow] = sum;
+                    out->data[((b * out_ch + oc) * oH + oh) * oW + ow] = sum;
                }
            }
        }
@@ -168,22 +173,23 @@ static Tensor *conv2d_forward(Conv2D *c, Tensor *input, int training) {
    return out;
 }

-/* Backward: grad_output [batch, out_ch, H, W] -> grad_input [batch, in_ch, H, W] */
+/* Backward: grad_output [batch, out_ch, oH, oW] -> grad_input [batch, in_ch, H, W] */
 static Tensor *conv2d_backward(Conv2D *c, Tensor *grad_output) {
    Tensor *input = c->input_cache;
    int batch = input->shape[0];
    int in_ch = c->in_ch, out_ch = c->out_ch;
    int H = input->shape[2], W = input->shape[3];
    int kh = c->kh, kw = c->kw;
-    int ph = kh / 2, pw = kw / 2;
+    int ph = c->pad_h, pw = c->pad_w;
+    int oH = grad_output->shape[2], oW = grad_output->shape[3];

    Tensor *grad_input = tensor_zeros(input->ndim, input->shape);

    for (int b = 0; b < batch; b++) {
        for (int oc = 0; oc < out_ch; oc++) {
-            for (int oh = 0; oh < H; oh++) {
-                for (int ow = 0; ow < W; ow++) {
-                    float go = grad_output->data[((b * out_ch + oc) * H + oh) * W + ow];
+            for (int oh = 0; oh < oH; oh++) {
+                for (int ow = 0; ow < oW; ow++) {
+                    float go = grad_output->data[((b * out_ch + oc) * oH + oh) * oW + ow];
                    c->grad_bias->data[oc] += go;
                    for (int ic = 0; ic < in_ch; ic++) {
                        for (int fh = 0; fh < kh; fh++) {
@@ -288,22 +294,68 @@ static Tensor *dense_backward(Dense *d, Tensor *grad_output) {
    return grad_input;
 }

-/* ---- LeakyReLU helpers on tensors ---- */
+/* ---- SiLU helpers on tensors ---- */

-static Tensor *apply_leaky_relu(Tensor *input) {
+static Tensor *apply_silu(Tensor *input) {  /* element-wise silu_f into a tensor obtained from tensor_alloc with input's shape; input is only read */
    Tensor *out = tensor_alloc(input->ndim, input->shape);
    for (int i = 0; i < input->size; i++)
-        out->data[i] = leaky_relu(input->data[i]);
+        out->data[i] = silu_f(input->data[i]);
    return out;
 }

-static Tensor *apply_leaky_relu_backward(Tensor *grad_output, Tensor *pre_activation) {
+static Tensor *apply_silu_backward(Tensor *grad_output, Tensor *pre_activation) {  /* chain rule through SiLU; result has grad_output's shape */
    Tensor *grad = tensor_alloc(grad_output->ndim, grad_output->shape);
    for (int i = 0; i < grad_output->size; i++)
-        grad->data[i] = grad_output->data[i] * leaky_relu_grad(pre_activation->data[i]);
+        grad->data[i] = grad_output->data[i] * silu_grad(pre_activation->data[i]);  /* pre_activation is indexed like grad_output; assumes equal sizes - TODO confirm */
    return grad;
 }

+/* ---- Global Average Pooling ---- */
+
+/* Forward: input [batch, C, H, W] -> output [batch, C]; each output is the mean of one H*W plane */
+static Tensor *global_avg_pool_forward(Tensor *input) {
+    int batch = input->shape[0];
+    int C = input->shape[1];
+    int H = input->shape[2];
+    int W = input->shape[3];
+    int hw = H * W;  /* number of elements in one (batch, channel) plane */
+
+    int oshape[] = {batch, C};
+    Tensor *out = tensor_alloc(2, oshape);
+
+    for (int b = 0; b < batch; b++) {
+        for (int c = 0; c < C; c++) {
+            float sum = 0.0f;
+            int base = (b * C + c) * hw;  /* flat offset of plane (b, c) in input->data */
+            for (int i = 0; i < hw; i++)
+                sum += input->data[base + i];
+            out->data[b * C + c] = sum / (float)hw;  /* NOTE(review): divides by hw with no check for H * W == 0 */
+        }
+    }
+    return out;
+}
+
+/* Backward: grad_output [batch, C] -> grad_input [batch, C, H, W]; H and W are supplied by the caller */
+static Tensor *global_avg_pool_backward(Tensor *grad_output, int H, int W) {
+    int batch = grad_output->shape[0];
+    int C = grad_output->shape[1];
+    int hw = H * W;
+    float scale = 1.0f / (float)hw;  /* derivative of a mean over hw elements with respect to each element */
+
+    int ishape[] = {batch, C, H, W};
+    Tensor *grad_input = tensor_alloc(4, ishape);
+
+    for (int b = 0; b < batch; b++) {
+        for (int c = 0; c < C; c++) {
+            float go = grad_output->data[b * C + c] * scale;
+            int base = (b * C + c) * hw;  /* flat offset of plane (b, c) in grad_input->data */
+            for (int i = 0; i < hw; i++)
+                grad_input->data[base + i] = go;  /* every position in the plane receives the same gradient */
+        }
+    }
+    return grad_input;
+}
+
 /* ---- Sigmoid on tensor ---- */

 static Tensor *apply_sigmoid(Tensor *input) {
@@ -335,12 +387,10 @@ Network *network_create(void) {
    rng_seed((uint64_t)time(NULL) ^ 0xDEADBEEF);

    Network *net = calloc(1, sizeof(Network));
-    conv2d_init(&net->conv1, 1, 12, 3, 3);
-    conv2d_init(&net->conv2, 12, 16, 3, 3);
-    dense_init(&net->fc1, 4800, 24);
-    dense_init(&net->head_shape, 24, 10);
-    dense_init(&net->head_ytype, 24, 1);
-    dense_init(&net->head_lowheight, 24, 1);
+    conv2d_init(&net->conv1, 1, 32, 7, 7, 1);
+    conv2d_init(&net->conv2, 32, 64, 7, 7, 1);
+    dense_init(&net->fc1, 64, 256);
+    dense_init(&net->output, 256, 12);
    return net;
 }

@@ -349,133 +399,92 @@ void network_free(Network *net) {
    conv2d_free(&net->conv1);
    conv2d_free(&net->conv2);
    dense_free(&net->fc1);
-    dense_free(&net->head_shape);
-    dense_free(&net->head_ytype);
-    dense_free(&net->head_lowheight);
+    dense_free(&net->output);
    tensor_free(net->act_conv1);
-    tensor_free(net->act_relu1);
+    tensor_free(net->act_silu1);
    tensor_free(net->act_conv2);
-    tensor_free(net->act_relu2);
-    tensor_free(net->act_flat);
+    tensor_free(net->act_silu2);
+    tensor_free(net->act_pool);
    tensor_free(net->act_fc1);
-    tensor_free(net->act_relu3);
-    tensor_free(net->out_shape);
-    tensor_free(net->out_ytype);
-    tensor_free(net->out_lowheight);
+    tensor_free(net->act_silu3);
+    tensor_free(net->act_logits);
+    tensor_free(net->out_all);
    free(net);
 }

 static void free_activations(Network *net) {
    tensor_free(net->act_conv1);  net->act_conv1 = NULL;
-    tensor_free(net->act_relu1);  net->act_relu1 = NULL;
+    tensor_free(net->act_silu1);  net->act_silu1 = NULL;  /* SiLU of act_conv1 */
    tensor_free(net->act_conv2);  net->act_conv2 = NULL;
-    tensor_free(net->act_relu2);  net->act_relu2 = NULL;
-    tensor_free(net->act_flat);   net->act_flat = NULL;
+    tensor_free(net->act_silu2);  net->act_silu2 = NULL;  /* SiLU of act_conv2 */
+    tensor_free(net->act_pool);   net->act_pool = NULL;  /* global-average-pooled act_silu2 */
    tensor_free(net->act_fc1);    net->act_fc1 = NULL;
-    tensor_free(net->act_relu3);  net->act_relu3 = NULL;
-    tensor_free(net->out_shape);  net->out_shape = NULL;
-    tensor_free(net->out_ytype);  net->out_ytype = NULL;
-    tensor_free(net->out_lowheight); net->out_lowheight = NULL;
+    tensor_free(net->act_silu3);  net->act_silu3 = NULL;  /* SiLU of act_fc1 */
+    tensor_free(net->act_logits); net->act_logits = NULL;  /* output-layer result before the sigmoid */
+    tensor_free(net->out_all);    net->out_all = NULL;  /* sigmoid of act_logits */
 }

 void network_forward(Network *net, Tensor *input, int training) {
    free_activations(net);

-    /* Conv1 -> LeakyReLU */
+    /* Conv1 -> SiLU (the pre-activation stays in act_conv1; network_backward reads it) */
    net->act_conv1 = conv2d_forward(&net->conv1, input, training);
-    net->act_relu1 = apply_leaky_relu(net->act_conv1);
+    net->act_silu1 = apply_silu(net->act_conv1);

-    /* Conv2 -> LeakyReLU */
-    net->act_conv2 = conv2d_forward(&net->conv2, net->act_relu1, training);
-    net->act_relu2 = apply_leaky_relu(net->act_conv2);
+    /* Conv2 -> SiLU (the pre-activation stays in act_conv2; network_backward reads it) */
+    net->act_conv2 = conv2d_forward(&net->conv2, net->act_silu1, training);
+    net->act_silu2 = apply_silu(net->act_conv2);

-    /* Flatten: [batch, 16, 20, 15] -> [batch, 4800] */
-    int batch = net->act_relu2->shape[0];
-    int flat_size = net->act_relu2->size / batch;
-    int fshape[] = {batch, flat_size};
-    net->act_flat = tensor_alloc(2, fshape);
-    memcpy(net->act_flat->data, net->act_relu2->data, (size_t)net->act_relu2->size * sizeof(float));
+    /* Global Average Pool: [batch, C, H, W] -> [batch, C] */
+    net->act_pool = global_avg_pool_forward(net->act_silu2);

-    /* FC1 -> LeakyReLU */
-    net->act_fc1 = dense_forward(&net->fc1, net->act_flat, training);
-    net->act_relu3 = apply_leaky_relu(net->act_fc1);
+    /* FC1 -> SiLU (the pre-activation stays in act_fc1; network_backward reads it) */
+    net->act_fc1 = dense_forward(&net->fc1, net->act_pool, training);
+    net->act_silu3 = apply_silu(net->act_fc1);

-    /* Three heads with sigmoid */
-    Tensor *logit_shape = dense_forward(&net->head_shape, net->act_relu3, training);
-    Tensor *logit_ytype = dense_forward(&net->head_ytype, net->act_relu3, training);
-    Tensor *logit_lowheight = dense_forward(&net->head_lowheight, net->act_relu3, training);
-
-    net->out_shape = apply_sigmoid(logit_shape);
-    net->out_ytype = apply_sigmoid(logit_ytype);
-    net->out_lowheight = apply_sigmoid(logit_lowheight);
-
-    tensor_free(logit_shape);
-    tensor_free(logit_ytype);
-    tensor_free(logit_lowheight);
+    /* Output -> Sigmoid: act_logits holds the dense output, out_all its sigmoid */
+    net->act_logits = dense_forward(&net->output, net->act_silu3, training);
+    net->out_all = apply_sigmoid(net->act_logits);
 }

-void network_backward(Network *net, Tensor *target_shape, Tensor *target_ytype, Tensor *target_lowheight) {
-    int batch = net->out_shape->shape[0];
+void network_backward(Network *net, Tensor *target) {
+    int batch = net->out_all->shape[0];
+    int n_out = 12;

-    /* BCE gradient at sigmoid: d_logit = pred - target */
-    /* Head: shape (10 outputs) */
-    int gs[] = {batch, 10};
-    Tensor *grad_logit_shape = tensor_alloc(2, gs);
-    for (int i = 0; i < batch * 10; i++)
-        grad_logit_shape->data[i] = (net->out_shape->data[i] - target_shape->data[i]) / (float)batch;
+    /* BCE gradient at sigmoid: d_logit = (pred - target) / batch */
+    int gs[] = {batch, n_out};
+    Tensor *grad_logits = tensor_alloc(2, gs);
+    for (int i = 0; i < batch * n_out; i++)
+        grad_logits->data[i] = (net->out_all->data[i] - target->data[i]) / (float)batch;

-    int gy[] = {batch, 1};
-    Tensor *grad_logit_ytype = tensor_alloc(2, gy);
-    for (int i = 0; i < batch; i++)
-        grad_logit_ytype->data[i] = (net->out_ytype->data[i] - target_ytype->data[i]) / (float)batch;
+    /* Output layer backward */
+    Tensor *grad_silu3 = dense_backward(&net->output, grad_logits);
+    tensor_free(grad_logits);

-    Tensor *grad_logit_lh = tensor_alloc(2, gy);
-    for (int i = 0; i < batch; i++)
-        grad_logit_lh->data[i] = (net->out_lowheight->data[i] - target_lowheight->data[i]) / (float)batch;
+    /* SiLU backward (fc1) */
+    Tensor *grad_fc1_out = apply_silu_backward(grad_silu3, net->act_fc1);
+    tensor_free(grad_silu3);

-    /* Backward through heads */
-    Tensor *grad_relu3_s = dense_backward(&net->head_shape, grad_logit_shape);
-    Tensor *grad_relu3_y = dense_backward(&net->head_ytype, grad_logit_ytype);
-    Tensor *grad_relu3_l = dense_backward(&net->head_lowheight, grad_logit_lh);
-
-    /* Sum gradients from three heads */
-    int r3shape[] = {batch, 24};
-    Tensor *grad_relu3 = tensor_zeros(2, r3shape);
-    for (int i = 0; i < batch * 24; i++)
-        grad_relu3->data[i] = grad_relu3_s->data[i] + grad_relu3_y->data[i] + grad_relu3_l->data[i];
-
-    tensor_free(grad_logit_shape);
-    tensor_free(grad_logit_ytype);
-    tensor_free(grad_logit_lh);
-    tensor_free(grad_relu3_s);
-    tensor_free(grad_relu3_y);
-    tensor_free(grad_relu3_l);
-
-    /* LeakyReLU backward (fc1 output) */
-    Tensor *grad_fc1_out = apply_leaky_relu_backward(grad_relu3, net->act_fc1);
-    tensor_free(grad_relu3);
-
-    /* Dense fc1 backward */
-    Tensor *grad_flat = dense_backward(&net->fc1, grad_fc1_out);
+    /* FC1 backward */
+    Tensor *grad_pool = dense_backward(&net->fc1, grad_fc1_out);
    tensor_free(grad_fc1_out);

-    /* Unflatten: [batch, 4800] -> [batch, 16, 20, 15] */
-    int ushape[] = {batch, 16, 20, 15};
-    Tensor *grad_relu2 = tensor_alloc(4, ushape);
-    memcpy(grad_relu2->data, grad_flat->data, (size_t)grad_flat->size * sizeof(float));
-    tensor_free(grad_flat);
+    /* Global Average Pool backward */
+    int H = net->act_silu2->shape[2], W = net->act_silu2->shape[3];
+    Tensor *grad_silu2 = global_avg_pool_backward(grad_pool, H, W);
+    tensor_free(grad_pool);

-    /* LeakyReLU backward (conv2 output) */
-    Tensor *grad_conv2_out = apply_leaky_relu_backward(grad_relu2, net->act_conv2);
-    tensor_free(grad_relu2);
+    /* SiLU backward (conv2) */
+    Tensor *grad_conv2_out = apply_silu_backward(grad_silu2, net->act_conv2);
+    tensor_free(grad_silu2);

    /* Conv2 backward */
-    Tensor *grad_relu1 = conv2d_backward(&net->conv2, grad_conv2_out);
+    Tensor *grad_silu1 = conv2d_backward(&net->conv2, grad_conv2_out);
    tensor_free(grad_conv2_out);

-    /* LeakyReLU backward (conv1 output) */
-    Tensor *grad_conv1_out = apply_leaky_relu_backward(grad_relu1, net->act_conv1);
-    tensor_free(grad_relu1);
+    /* SiLU backward (conv1) */
+    Tensor *grad_conv1_out = apply_silu_backward(grad_silu1, net->act_conv1);
+    tensor_free(grad_silu1);

    /* Conv1 backward */
    Tensor *grad_input = conv2d_backward(&net->conv1, grad_conv1_out);
@@ -490,12 +499,8 @@ void network_adam_step(Network *net, float lr, float beta1, float beta2, float e
    adam_update(net->conv2.bias, net->conv2.grad_bias, net->conv2.m_bias, net->conv2.v_bias, lr, beta1, beta2, eps, t);
    adam_update(net->fc1.weight, net->fc1.grad_weight, net->fc1.m_weight, net->fc1.v_weight, lr, beta1, beta2, eps, t);
    adam_update(net->fc1.bias, net->fc1.grad_bias, net->fc1.m_bias, net->fc1.v_bias, lr, beta1, beta2, eps, t);
-    adam_update(net->head_shape.weight, net->head_shape.grad_weight, net->head_shape.m_weight, net->head_shape.v_weight, lr, beta1, beta2, eps, t);
-    adam_update(net->head_shape.bias, net->head_shape.grad_bias, net->head_shape.m_bias, net->head_shape.v_bias, lr, beta1, beta2, eps, t);
-    adam_update(net->head_ytype.weight, net->head_ytype.grad_weight, net->head_ytype.m_weight, net->head_ytype.v_weight, lr, beta1, beta2, eps, t);
-    adam_update(net->head_ytype.bias, net->head_ytype.grad_bias, net->head_ytype.m_bias, net->head_ytype.v_bias, lr, beta1, beta2, eps, t);
-    adam_update(net->head_lowheight.weight, net->head_lowheight.grad_weight, net->head_lowheight.m_weight, net->head_lowheight.v_weight, lr, beta1, beta2, eps, t);
-    adam_update(net->head_lowheight.bias, net->head_lowheight.grad_bias, net->head_lowheight.m_bias, net->head_lowheight.v_bias, lr, beta1, beta2, eps, t);
+    adam_update(net->output.weight, net->output.grad_weight, net->output.m_weight, net->output.v_weight, lr, beta1, beta2, eps, t);
+    adam_update(net->output.bias, net->output.grad_bias, net->output.m_bias, net->output.v_bias, lr, beta1, beta2, eps, t);
 }

 void network_zero_grad(Network *net) {
@@ -505,34 +510,18 @@ void network_zero_grad(Network *net) {
    memset(net->conv2.grad_bias->data, 0, (size_t)net->conv2.grad_bias->size * sizeof(float));
    memset(net->fc1.grad_weight->data, 0, (size_t)net->fc1.grad_weight->size * sizeof(float));
    memset(net->fc1.grad_bias->data, 0, (size_t)net->fc1.grad_bias->size * sizeof(float));
-    memset(net->head_shape.grad_weight->data, 0, (size_t)net->head_shape.grad_weight->size * sizeof(float));
-    memset(net->head_shape.grad_bias->data, 0, (size_t)net->head_shape.grad_bias->size * sizeof(float));
-    memset(net->head_ytype.grad_weight->data, 0, (size_t)net->head_ytype.grad_weight->size * sizeof(float));
-    memset(net->head_ytype.grad_bias->data, 0, (size_t)net->head_ytype.grad_bias->size * sizeof(float));
-    memset(net->head_lowheight.grad_weight->data, 0, (size_t)net->head_lowheight.grad_weight->size * sizeof(float));
-    memset(net->head_lowheight.grad_bias->data, 0, (size_t)net->head_lowheight.grad_bias->size * sizeof(float));
+    memset(net->output.grad_weight->data, 0, (size_t)net->output.grad_weight->size * sizeof(float));
+    memset(net->output.grad_bias->data, 0, (size_t)net->output.grad_bias->size * sizeof(float));
 }

-float network_bce_loss(Network *net, Tensor *target_shape, Tensor *target_ytype, Tensor *target_lowheight) {
+float network_bce_loss(Network *net, Tensor *target) {
    float loss = 0.0f;
-    int batch = net->out_shape->shape[0];
+    int batch = net->out_all->shape[0];
+    int n = batch * 12;

-    for (int i = 0; i < batch * 10; i++) {
-        float p = net->out_shape->data[i];
-        float t = target_shape->data[i];
-        p = fmaxf(1e-7f, fminf(1.0f - 1e-7f, p));
-        loss -= t * logf(p) + (1.0f - t) * logf(1.0f - p);
-    }
-    for (int i = 0; i < batch; i++) {
-        float p = net->out_ytype->data[i];
-        float t = target_ytype->data[i];
-        p = fmaxf(1e-7f, fminf(1.0f - 1e-7f, p));
-        loss -= t * logf(p) + (1.0f - t) * logf(1.0f - p);
-    }
-    for (int i = 0; i < batch; i++) {
-        float p = net->out_lowheight->data[i];
-        float t = target_lowheight->data[i];
-        p = fmaxf(1e-7f, fminf(1.0f - 1e-7f, p));
+    for (int i = 0; i < n; i++) {
+        float p = fmaxf(1e-7f, fminf(1.0f - 1e-7f, net->out_all->data[i]));
+        float t = target->data[i];
        loss -= t * logf(p) + (1.0f - t) * logf(1.0f - p);
    }

@@ -546,11 +535,8 @@ void network_infer(Network *net, const float *input300, float *output12) {

    network_forward(net, input, 0);

-    /* output order: A,B,C,D,E,F,G,H,J,K, ytype, lowheight */
-    for (int i = 0; i < 10; i++)
-        output12[i] = net->out_shape->data[i];
-    output12[10] = net->out_ytype->data[0];
-    output12[11] = net->out_lowheight->data[0];
+    for (int i = 0; i < 12; i++)
+        output12[i] = net->out_all->data[i];

    tensor_free(input);
 }