autokem: more filtering

2026-06-06 05:58:30 +09:00 · 2026-03-13 20:08:56 +09:00
parent d57707b210
commit 539a2c9f46
5 changed files with 78 additions and 27 deletions
--- a/Autokem/apply.c
+++ b/Autokem/apply.c
@@ -2,7 +2,7 @@
 #include "tga.h"
 #include "nn.h"
 #include "safetensor.h"
-#include "unicode_lm.h"
+#include "unicode_filter.h"

 #include <stdio.h>
 #include <stdlib.h>
--- a/Autokem/autokem.safetensors
+++ b/Autokem/autokem.safetensors
--- a/Autokem/train.c
+++ b/Autokem/train.c
@@ -2,7 +2,7 @@
 #include "tga.h"
 #include "nn.h"
 #include "safetensor.h"
-#include "unicode_lm.h"
+#include "unicode_filter.h"

 #include <stdio.h>
 #include <stdlib.h>
@@ -78,8 +78,8 @@ static int collect_from_sheet(const char *path, int is_xyswap, int start_code,
        }
        if (width == 0) continue;

-        /* Skip modifier letters (superscripts/subscripts) */
-        if (start_code >= 0 && is_modifier_letter(start_code + index))
+        /* Skip modifier letters, symbols, punctuation */
+        if (start_code >= 0 && is_excluded_from_training(start_code + index))
            continue;

        /* Read kerning data pixel at Y+6 */
--- a/Autokem/train_torch.py
+++ b/Autokem/train_torch.py
@@ -127,11 +127,12 @@ def collect_from_sheet(path, is_xyswap, code_range=None):
        if width == 0:
            continue

-        # Skip modifier letters (superscripts/subscripts)
+        # Skip modifier letters, symbols, punctuation
        if code_range is not None and index < len(code_range):
            cp = code_range[index]
            try:
-                if unicodedata.category(chr(cp)) == 'Lm':
+                cat = unicodedata.category(chr(cp))
+                if cat == 'Lm' or cat[0] in ('S', 'P'):
                    skipped_lm += 1
                    continue
            except (ValueError, OverflowError):
@@ -194,14 +195,14 @@ def collect_all_samples(assets_dir):
        inputs, labels, skipped_lm = collect_from_sheet(path, is_xyswap, code_range)
        total_skipped_lm += skipped_lm
        if inputs:
-            suffix = f" (skipped {skipped_lm} Lm)" if skipped_lm else ""
+            suffix = f" (skipped {skipped_lm})" if skipped_lm else ""
            print(f"  {name}: {len(inputs)} samples{suffix}")
            all_inputs.extend(inputs)
            all_labels.extend(labels)
            file_count += 1

    if total_skipped_lm:
-        print(f"  Total modifier letters filtered: {total_skipped_lm}")
+        print(f"  Filtered (Lm/S/P): {total_skipped_lm}")

    return np.array(all_inputs), np.array(all_labels, dtype=np.float32), file_count

--- a/Autokem/unicode_filter.h
+++ b/Autokem/unicode_filter.h
@@ -1,18 +1,21 @@
-#ifndef UNICODE_LM_H
-#define UNICODE_LM_H
+#ifndef UNICODE_FILTER_H
+#define UNICODE_FILTER_H

 #include <string.h>

 /*
- * Unicode category Lm (Letter, modifier) range checks.
+ * Unicode category filters for training/apply.
 * Generated from Python unicodedata (Unicode 16.0).
 *
- * is_modifier_letter(cp)    — true for all Lm codepoints
- * is_subscript_modifier(cp) — true for Lm codepoints with <sub> decomposition
+ * is_modifier_letter(cp)         — category Lm
+ * is_subscript_modifier(cp)      — Lm with <sub> decomposition
+ * is_symbol_or_punctuation(cp)   — S* or P* codepoints listed in sp_ranges (font sheets only)
+ * is_excluded_from_training(cp)  — Lm, or S*/P* as matched by is_symbol_or_punctuation
 */

+/* ---- Lm (modifier letter) ---- */
+
 static inline int is_modifier_letter(int cp) {
-    /* 71 contiguous ranges covering all 397 Lm codepoints */
    if (cp >= 0x02B0 && cp <= 0x02C1) return 1;
    if (cp >= 0x02C6 && cp <= 0x02D1) return 1;
    if (cp >= 0x02E0 && cp <= 0x02E4) return 1;
@@ -88,20 +91,67 @@ static inline int is_modifier_letter(int cp) {
 }

 static inline int is_subscript_modifier(int cp) {
-    /* 49 Lm codepoints with <sub> decomposition */
-    if (cp >= 0x1D62 && cp <= 0x1D6A) return 1;  /* 9 */
-    if (cp >= 0x2090 && cp <= 0x209C) return 1;   /* 13 */
-    if (cp == 0x2C7C) return 1;                    /* 1 */
-    if (cp >= 0x1E051 && cp <= 0x1E06A) return 1;  /* 26 */
+    if (cp >= 0x1D62 && cp <= 0x1D6A) return 1;  /* range bounds are inclusive */
+    if (cp >= 0x2090 && cp <= 0x209C) return 1;
+    if (cp == 0x2C7C) return 1;  /* single codepoint */
+    if (cp >= 0x1E051 && cp <= 0x1E06A) return 1;  /* above 0xFFFF */
     return 0;
 }

-/*
- * Map sheet filename to first codepoint of its (contiguous) code range.
- * Returns -1 if unknown. For non-contiguous sheets (e.g. Devanagari),
- * returns the start of the first sub-range; cells beyond it won't
- * collide with Lm codepoints in practice.
- */
+/* ---- S* (Symbol) and P* (Punctuation) ---- */
+
+/* Inclusive {start, end} ranges (ascending, non-overlapping) of the S/P codepoints in the font sheets */
+static const int sp_ranges[][2] = {
+    {0x00021, 0x0002F}, {0x0003A, 0x00040}, {0x0005B, 0x00060},
+    {0x0007B, 0x0007E}, {0x000A1, 0x000A9}, {0x000AB, 0x000AC},
+    {0x000AE, 0x000B1}, {0x000B4, 0x000B4}, {0x000B6, 0x000B8},
+    {0x000BB, 0x000BB}, {0x000BF, 0x000BF}, {0x000D7, 0x000D7},
+    {0x000F7, 0x000F7}, {0x002C2, 0x002C5}, {0x002D2, 0x002DF},
+    {0x002E5, 0x002EB}, {0x002ED, 0x002ED}, {0x002EF, 0x002FF},
+    {0x00375, 0x00375}, {0x0037E, 0x0037E}, {0x00384, 0x00385},
+    {0x00387, 0x00387}, {0x00482, 0x00482}, {0x0055A, 0x0055F},
+    {0x00589, 0x0058A}, {0x0058D, 0x0058F}, {0x00964, 0x00965},
+    {0x00970, 0x00970}, {0x009F2, 0x009F3}, {0x009FA, 0x009FB},
+    {0x009FD, 0x009FD}, {0x00BF3, 0x00BFA}, {0x00E3F, 0x00E3F},
+    {0x00E4F, 0x00E4F}, {0x00E5A, 0x00E5B}, {0x010FB, 0x010FB},
+    {0x016EB, 0x016ED}, {0x01CC0, 0x01CC7}, {0x01FBD, 0x01FBD},
+    {0x01FBF, 0x01FC1}, {0x01FCD, 0x01FCF}, {0x01FDD, 0x01FDF},
+    {0x01FED, 0x01FEF}, {0x01FFD, 0x01FFE}, {0x02010, 0x02027},
+    {0x02030, 0x0205E}, {0x0207A, 0x0207E}, {0x0208A, 0x0208E},
+    {0x020A0, 0x020C0}, {0x02100, 0x02101}, {0x02103, 0x02106},
+    {0x02108, 0x02109}, {0x02114, 0x02114}, {0x02116, 0x02118},
+    {0x0211E, 0x02123}, {0x02125, 0x02125}, {0x02127, 0x02127},
+    {0x02129, 0x02129}, {0x0212E, 0x0212E}, {0x0213A, 0x0213B},
+    {0x02140, 0x02144}, {0x0214A, 0x0214D}, {0x0214F, 0x0214F},
+    {0x0218A, 0x0218B}, {0x02190, 0x021FF}, {0x02400, 0x02426},
+    {0x02800, 0x028FF}, {0x03001, 0x03004}, {0x03008, 0x03020},
+    {0x03030, 0x03030}, {0x03036, 0x03037}, {0x0303D, 0x0303F},
+    {0x0309B, 0x0309C}, {0x030A0, 0x030A0}, {0x030FB, 0x030FB},
+    {0x04DC0, 0x04DFF}, {0x0A673, 0x0A673}, {0x0A67E, 0x0A67E},
+    {0x0A720, 0x0A721}, {0x0A789, 0x0A78A}, {0x0AB5B, 0x0AB5B},
+    {0x0AB6A, 0x0AB6B}, {0x0FF01, 0x0FF0F}, {0x0FF1A, 0x0FF20},
+    {0x0FF3B, 0x0FF40}, {0x0FF5B, 0x0FF65}, {0x0FFE0, 0x0FFE6},
+    {0x0FFE8, 0x0FFEE}, {0x0FFFC, 0x0FFFD}, {0x1F10D, 0x1F1AD},
+    {0x1F1E6, 0x1F1FF}, {0x1FB00, 0x1FB92}, {0x1FB94, 0x1FBCA},
+};
+
+static inline int is_symbol_or_punctuation(int cp) {  /* 1 if cp lies in any sp_ranges pair, else 0 */
+    int n = (int)(sizeof(sp_ranges) / sizeof(sp_ranges[0]));  /* number of {start, end} pairs */
+    for (int i = 0; i < n; i++) {  /* linear scan over every pair */
+        if (cp >= sp_ranges[i][0] && cp <= sp_ranges[i][1])  /* both bounds inclusive */
+            return 1;
+    }
+    return 0;  /* no pair contains cp */
+}
+
+/* ---- Combined filter for training exclusion ---- */
+
+static inline int is_excluded_from_training(int cp) {  /* nonzero if either filter matches cp */
+    return is_modifier_letter(cp) || is_symbol_or_punctuation(cp);  /* || yields 0 or 1; Lm is tested first */
+}
+
+/* ---- Sheet filename → start codepoint (-1 if unknown) ---- */
+
 static int sheet_start_code(const char *basename) {
    if (strstr(basename, "ascii_variable"))                return 0x00;
    if (strstr(basename, "latinExtA_variable"))            return 0x100;
@@ -138,4 +188,4 @@ static int sheet_start_code(const char *basename) {
    return -1;
 }

-#endif /* UNICODE_LM_H */
+#endif /* UNICODE_FILTER_H */