demo text update

fixed some mislabeling
Latin Ext F and G
2026-06-06 14:08:30 +09:00 · 2026-03-13 14:13:28 +09:00 · 2026-03-13 13:59:58 +09:00 · 2026-03-13 13:29:43 +09:00
27 changed files with 250 additions and 41 deletions
--- a/Autokem/apply.c
+++ b/Autokem/apply.c
@@ -2,6 +2,7 @@
 #include "tga.h"
 #include "nn.h"
 #include "safetensor.h"
+#include "unicode_lm.h"

 #include <stdio.h>
 #include <stdlib.h>
@@ -75,7 +76,8 @@ int apply_model(const char *tga_path) {
    int rows = img->height / cell_h;
    int total_cells = cols * rows;

-    int processed = 0, updated = 0, skipped = 0;
+    int start_code = sheet_start_code(basename);
+    int processed = 0, updated = 0, skipped = 0, fixed_lm = 0;

    for (int index = 0; index < total_cells; index++) {
        int cell_x, cell_y;
@@ -107,6 +109,21 @@ int apply_model(const char *tga_path) {
        int opcode = (int)((dir_pixel >> 24) & 0xFF);
        if (opcode != 0) { skipped++; continue; }

+        /* Modifier letters: fixed kern pixel, skip inference */
+        if (start_code >= 0 && is_modifier_letter(start_code + index)) {
+            if (is_subscript_modifier(start_code + index)) {
+                /* Subscript: CDEFGHJK(B), lowheight=1 */
+                tga_write_pixel(tga_path, img, tag_x, tag_y + 5, 0xFFFFFFFF);
+                tga_write_pixel(tga_path, img, tag_x, tag_y + 6, 0x00C03FFF);
+            } else {
+                /* Superscript: ABCDEF(B), lowheight=0 */
+                tga_write_pixel(tga_path, img, tag_x, tag_y + 5, 0x00000000);
+                tga_write_pixel(tga_path, img, tag_x, tag_y + 6, 0x0000FCFF);
+            }
+            processed++; updated++; fixed_lm++;
+            continue;
+        }
+
        /* Extract 15x20 binary input */
        float input[300];
        for (int gy = 0; gy < 20; gy++) {
@@ -155,8 +172,8 @@ int apply_model(const char *tga_path) {
        updated++;
    }

-    printf("Processed: %d cells, Updated: %d, Skipped: %d (of %d total)\n",
-           processed, updated, skipped, total_cells);
+    printf("Processed: %d cells, Updated: %d, Skipped: %d, Fixed Lm: %d (of %d total)\n",
+           processed, updated, skipped, fixed_lm, total_cells);

    tga_free(img);
    network_free(net);
--- a/Autokem/autokem.safetensors
+++ b/Autokem/autokem.safetensors
--- a/Autokem/train.c
+++ b/Autokem/train.c
@@ -2,6 +2,7 @@
 #include "tga.h"
 #include "nn.h"
 #include "safetensor.h"
+#include "unicode_lm.h"

 #include <stdio.h>
 #include <stdlib.h>
@@ -42,7 +43,8 @@ static void extract_shape_bits(int kerning_mask, float *shape) {

 /* ---- Collect samples from one TGA ---- */

-static int collect_from_sheet(const char *path, int is_xyswap, Sample *samples, int max_samples) {
+static int collect_from_sheet(const char *path, int is_xyswap, int start_code,
+                              Sample *samples, int max_samples) {
    TgaImage *img = tga_read(path);
    if (!img) {
        fprintf(stderr, "Warning: cannot read %s\n", path);
@@ -76,6 +78,10 @@ static int collect_from_sheet(const char *path, int is_xyswap, Sample *samples,
        }
        if (width == 0) continue;

+        /* Skip modifier letters (superscripts/subscripts) */
+        if (start_code >= 0 && is_modifier_letter(start_code + index))
+            continue;
+
        /* Read kerning data pixel at Y+6 */
        uint32_t kern_pixel = tagify(tga_get_pixel(img, tag_x, tag_y + 6));
        if ((kern_pixel & 0xFF) == 0) continue; /* no kern data */
@@ -170,7 +176,9 @@ int train_model(void) {
        char fullpath[512];
        snprintf(fullpath, sizeof(fullpath), "%s/%s", assets_dir, name);

-        int got = collect_from_sheet(fullpath, is_xyswap, all_samples + total, max_total - total);
+        int start_code = sheet_start_code(name);
+        int got = collect_from_sheet(fullpath, is_xyswap, start_code,
+                                     all_samples + total, max_total - total);
        if (got > 0) {
            printf("  %s: %d samples\n", name, got);
            total += got;
--- a/Autokem/train_torch.py
+++ b/Autokem/train_torch.py
@@ -20,10 +20,26 @@ import json
 import os
 import struct
 import sys
+import unicodedata
 from pathlib import Path

 import numpy as np

+# ---- Sheet code ranges (imported from OTFbuild/sheet_config.py) ----
+
+_otfbuild = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'OTFbuild')
+sys.path.insert(0, _otfbuild)
+try:
+    from sheet_config import FILE_LIST as _FILE_LIST, CODE_RANGE as _CODE_RANGE
+    _CODE_RANGE_MAP = dict(zip(_FILE_LIST, _CODE_RANGE))  # zip stops at the shorter list
+except ImportError:
+    _CODE_RANGE_MAP = {}  # no sheet config: modifier-letter filtering is disabled
+finally:
+    # Undo the temporary path entry even when the import fails.
+    if _otfbuild in sys.path:
+        sys.path.remove(_otfbuild)
+
+
 # ---- TGA reader (matches OTFbuild/tga_reader.py and Autokem/tga.c) ----

 class TgaImage:
@@ -80,7 +96,7 @@ def tagify(pixel):

 # ---- Data collection (matches Autokem/train.c) ----

-def collect_from_sheet(path, is_xyswap):
+def collect_from_sheet(path, is_xyswap, code_range=None):
    """Extract labelled samples from a single TGA sheet."""
    img = read_tga(path)
    cell_w, cell_h = 16, 20
@@ -90,6 +106,7 @@ def collect_from_sheet(path, is_xyswap):

    inputs = []
    labels = []
+    skipped_lm = 0

    for index in range(total_cells):
        if is_xyswap:
@@ -110,6 +127,16 @@ def collect_from_sheet(path, is_xyswap):
        if width == 0:
            continue

+        # Skip modifier letters (superscripts/subscripts)
+        if code_range is not None and index < len(code_range):
+            cp = code_range[index]
+            try:
+                if unicodedata.category(chr(cp)) == 'Lm':
+                    skipped_lm += 1
+                    continue
+            except (ValueError, OverflowError):
+                pass
+
        # Kern data pixel at Y+6
        kern_pixel = tagify(img.get_pixel(tag_x, tag_y + 6))
        if (kern_pixel & 0xFF) == 0:
@@ -145,7 +172,7 @@ def collect_from_sheet(path, is_xyswap):
        inputs.append(inp)
        labels.append(shape + [is_kern_ytype, is_low_height])

-    return inputs, labels
+    return inputs, labels, skipped_lm


 def collect_all_samples(assets_dir):
@@ -153,6 +180,7 @@ def collect_all_samples(assets_dir):
    all_inputs = []
    all_labels = []
    file_count = 0
+    total_skipped_lm = 0

    for name in sorted(os.listdir(assets_dir)):
        if not name.endswith('_variable.tga'):
@@ -161,14 +189,20 @@ def collect_all_samples(assets_dir):
            continue

        is_xyswap = 'xyswap' in name
+        code_range = _CODE_RANGE_MAP.get(name, None)
        path = os.path.join(assets_dir, name)
-        inputs, labels = collect_from_sheet(path, is_xyswap)
+        inputs, labels, skipped_lm = collect_from_sheet(path, is_xyswap, code_range)
+        total_skipped_lm += skipped_lm
        if inputs:
-            print(f"  {name}: {len(inputs)} samples")
+            suffix = f" (skipped {skipped_lm} Lm)" if skipped_lm else ""
+            print(f"  {name}: {len(inputs)} samples{suffix}")
            all_inputs.extend(inputs)
            all_labels.extend(labels)
            file_count += 1

+    if total_skipped_lm:
+        print(f"  Total modifier letters filtered: {total_skipped_lm}")
+
    return np.array(all_inputs), np.array(all_labels, dtype=np.float32), file_count


--- a/Autokem/unicode_lm.h
+++ b/Autokem/unicode_lm.h
@@ -0,0 +1,141 @@
+#ifndef UNICODE_LM_H
+#define UNICODE_LM_H
+
+#include <string.h>
+
+/*
+ * Unicode category Lm (Letter, modifier) range checks.
+ * Generated from Python unicodedata (labelled Unicode 16.0). NOTE(review): the table
+ * appears to match 15.x data (no Garay / Kirat Rai Lm entries) -- confirm version.
+ * is_modifier_letter(cp)    — true for all Lm codepoints
+ * is_subscript_modifier(cp) — true for Lm codepoints with <sub> decomposition
+ */
+
+static inline int is_modifier_letter(int cp) {
+    /* 71 inclusive ranges (397 Lm codepoints), ascending; returns 1 on a match, else 0 */
+    if (cp >= 0x02B0 && cp <= 0x02C1) return 1;
+    if (cp >= 0x02C6 && cp <= 0x02D1) return 1;
+    if (cp >= 0x02E0 && cp <= 0x02E4) return 1;
+    if (cp == 0x02EC) return 1;
+    if (cp == 0x02EE) return 1;
+    if (cp == 0x0374) return 1;
+    if (cp == 0x037A) return 1;
+    if (cp == 0x0559) return 1;
+    if (cp == 0x0640) return 1;
+    if (cp >= 0x06E5 && cp <= 0x06E6) return 1;
+    if (cp >= 0x07F4 && cp <= 0x07F5) return 1;
+    if (cp == 0x07FA) return 1;
+    if (cp == 0x081A) return 1;
+    if (cp == 0x0824) return 1;
+    if (cp == 0x0828) return 1;
+    if (cp == 0x08C9) return 1;
+    if (cp == 0x0971) return 1;
+    if (cp == 0x0E46) return 1;
+    if (cp == 0x0EC6) return 1;
+    if (cp == 0x10FC) return 1;
+    if (cp == 0x17D7) return 1;
+    if (cp == 0x1843) return 1;
+    if (cp == 0x1AA7) return 1;
+    if (cp >= 0x1C78 && cp <= 0x1C7D) return 1;
+    if (cp >= 0x1D2C && cp <= 0x1D6A) return 1;
+    if (cp == 0x1D78) return 1;
+    if (cp >= 0x1D9B && cp <= 0x1DBF) return 1;
+    if (cp == 0x2071) return 1;
+    if (cp == 0x207F) return 1;
+    if (cp >= 0x2090 && cp <= 0x209C) return 1;
+    if (cp >= 0x2C7C && cp <= 0x2C7D) return 1;
+    if (cp == 0x2D6F) return 1;
+    if (cp == 0x2E2F) return 1;
+    if (cp == 0x3005) return 1;
+    if (cp >= 0x3031 && cp <= 0x3035) return 1;
+    if (cp == 0x303B) return 1;
+    if (cp >= 0x309D && cp <= 0x309E) return 1;
+    if (cp >= 0x30FC && cp <= 0x30FE) return 1;
+    if (cp == 0xA015) return 1;
+    if (cp >= 0xA4F8 && cp <= 0xA4FD) return 1;
+    if (cp == 0xA60C) return 1;
+    if (cp == 0xA67F) return 1;
+    if (cp >= 0xA69C && cp <= 0xA69D) return 1;
+    if (cp >= 0xA717 && cp <= 0xA71F) return 1;
+    if (cp == 0xA770) return 1;
+    if (cp == 0xA788) return 1;
+    if (cp >= 0xA7F2 && cp <= 0xA7F4) return 1;
+    if (cp >= 0xA7F8 && cp <= 0xA7F9) return 1;
+    if (cp == 0xA9CF) return 1;
+    if (cp == 0xA9E6) return 1;
+    if (cp == 0xAA70) return 1;
+    if (cp == 0xAADD) return 1;
+    if (cp >= 0xAAF3 && cp <= 0xAAF4) return 1;
+    if (cp >= 0xAB5C && cp <= 0xAB5F) return 1;
+    if (cp == 0xAB69) return 1;
+    if (cp == 0xFF70) return 1;
+    if (cp >= 0xFF9E && cp <= 0xFF9F) return 1;
+    if (cp >= 0x10780 && cp <= 0x10785) return 1;
+    if (cp >= 0x10787 && cp <= 0x107B0) return 1;
+    if (cp >= 0x107B2 && cp <= 0x107BA) return 1;
+    if (cp >= 0x16B40 && cp <= 0x16B43) return 1;
+    if (cp >= 0x16F93 && cp <= 0x16F9F) return 1;
+    if (cp >= 0x16FE0 && cp <= 0x16FE1) return 1;
+    if (cp == 0x16FE3) return 1;
+    if (cp >= 0x1AFF0 && cp <= 0x1AFF3) return 1;
+    if (cp >= 0x1AFF5 && cp <= 0x1AFFB) return 1;
+    if (cp >= 0x1AFFD && cp <= 0x1AFFE) return 1;
+    if (cp >= 0x1E030 && cp <= 0x1E06D) return 1;
+    if (cp >= 0x1E137 && cp <= 0x1E13D) return 1;
+    if (cp == 0x1E4EB) return 1;
+    if (cp == 0x1E94B) return 1;
+    return 0;
+}
+
+static inline int is_subscript_modifier(int cp) {
+    /* Returns 1 for the 49 Lm codepoints with a <sub> decomposition,
+     * 0 otherwise; per-range counts are noted on each line. */
+    return (0x1D62 <= cp && cp <= 0x1D6A)       /* 9 */
+        || (0x2090 <= cp && cp <= 0x209C)       /* 13 */
+        || cp == 0x2C7C                         /* 1 */
+        || (0x1E051 <= cp && cp <= 0x1E06A);    /* 26 */
+}
+
+/*
+ * Map a sheet filename (non-NULL) to the first codepoint of its code range by
+ * substring match; the first matching pattern wins. Returns -1 if unknown.
+ * For non-contiguous sheets (e.g. Devanagari), returns the start of the first
+ * sub-range; cells beyond it won't collide with Lm codepoints in practice.
+ */
+static inline int sheet_start_code(const char *basename) {
+    if (strstr(basename, "ascii_variable"))                return 0x00;
+    if (strstr(basename, "latinExtA_variable"))            return 0x100;
+    if (strstr(basename, "latinExtB_variable"))            return 0x180;
+    if (strstr(basename, "cyrilic_extC_variable"))         return 0x1C80;
+    if (strstr(basename, "cyrilic_extB_variable"))         return 0xA640;
+    if (strstr(basename, "cyrilic_bulgarian_variable"))    return 0xF0000;
+    if (strstr(basename, "cyrilic_serbian_variable"))      return 0xF0060;
+    if (strstr(basename, "cyrilic_variable"))              return 0x400;
+    if (strstr(basename, "halfwidth_fullwidth_variable"))  return 0xFF00;
+    if (strstr(basename, "unipunct_variable"))             return 0x2000;
+    if (strstr(basename, "greek_polytonic"))               return 0x1F00;
+    if (strstr(basename, "greek_variable"))                return 0x370;
+    if (strstr(basename, "thai_variable"))                 return 0xE00;
+    if (strstr(basename, "hayeren_variable"))              return 0x530;
+    if (strstr(basename, "kartuli_allcaps_variable"))      return 0x1C90;
+    if (strstr(basename, "kartuli_variable"))              return 0x10D0;
+    if (strstr(basename, "ipa_ext_variable"))              return 0x250;
+    if (strstr(basename, "latinExt_additional_variable"))  return 0x1E00;
+    if (strstr(basename, "tsalagi_variable"))              return 0x13A0;
+    if (strstr(basename, "phonetic_extensions_variable"))  return 0x1D00;
+    if (strstr(basename, "latinExtC_variable"))            return 0x2C60;
+    if (strstr(basename, "latinExtD_variable"))            return 0xA720;
+    if (strstr(basename, "internal_variable"))             return 0xFFE00;
+    if (strstr(basename, "letterlike_symbols_variable"))   return 0x2100;
+    if (strstr(basename, "enclosed_alphanumeric"))         return 0x1F100;
+    if (strstr(basename, "sundanese_variable"))            return 0x1B80;
+    if (strstr(basename, "control_pictures_variable"))     return 0x2400;
+    if (strstr(basename, "latinExtE_variable"))            return 0xAB30;
+    if (strstr(basename, "latinExtF_variable"))            return 0x10780;
+    if (strstr(basename, "latinExtG_variable"))            return 0x1DF00;
+    if (strstr(basename, "devanagari") && !strstr(basename, "internal"))
+                                                           return 0x900;
+    return -1;
+}
+
+#endif /* UNICODE_LM_H */
--- a/demo.PNG
+++ b/demo.PNG
--- a/demotext_unaligned.txt
+++ b/demotext_unaligned.txt
@@ -114,12 +114,12 @@ How multilingual? Real multilingual!
 ⁃ Basic Latin
 ⁃ Latin-1 Supplement
 ⁃ Latin Extended Additional
-⁃ Latin Extended-A/B/C/D
+⁃ Latin Extended-A/B/C/D/E/F/G
 ⁃ Armenian
 ⁃ Arrows
 ⁃ Bengali􏿆ᶠⁱ􀀀
 ⁃ Braille Patterns
-⁃ Cherokee􏿆⁷􀀀
+⁃ Cherokee􏿆ᴬ􀀀
 ⁃ CJK Symbols and Punctuation
 ⁃ CJK Unified Ideographs􏿆⁶􀀀
 ⁃ CJK Unified Ideographs Extension A􏿆¹²·¹􀀀
@@ -161,8 +161,8 @@ How multilingual? Real multilingual!
 ⁃ Tamil
 ⁃ Thai

-  􏿆ᴱ􀀀  No support for Coptic
-  􏿆ᶠⁱ􀀀  No support for ligatures　　􏿆ჼ􀀀  Mkhedruli only
-  􏿆⁶􀀀  􏿆⁷􀀀  􏿆⁹􀀀  􏿆¹²·¹􀀀  Up to the specified Unicode version
+  􏿆ᴱ􀀀   No support for Coptic　  　 􏿆ᴬ􀀀  Uppercase only
+  􏿆ᶠⁱ􀀀  No support for ligatures     􏿆ჼ􀀀  Mkhedruli only
+  􏿆⁶􀀀   􏿆¹²·¹􀀀  Up to the specified Unicode version

 GitHub’s issue page is open! You can report any 􏽕errors􀀀, or leave 􏽕suggestions􀀀. You can help this font to be more versatile. (for more languages, more frameworks) 􏽕Clone􀀀 this repo, make changes, and make a 􏽕pull request􀀀! I appreciate any and all supports.
--- a/src/assets/cyrilic_variable.tga
+++ b/src/assets/cyrilic_variable.tga
--- a/src/assets/hayeren_variable.tga
+++ b/src/assets/hayeren_variable.tga
--- a/src/assets/ipa_ext_variable.tga
+++ b/src/assets/ipa_ext_variable.tga
--- a/src/assets/latinExtD_variable.tga
+++ b/src/assets/latinExtD_variable.tga
--- a/src/assets/latinExtF_variable.tga
+++ b/src/assets/latinExtF_variable.tga
--- a/src/assets/latinExtG_variable.tga
+++ b/src/assets/latinExtG_variable.tga
--- a/src/assets/latinExt_additional_variable.tga
+++ b/src/assets/latinExt_additional_variable.tga
--- a/src/assets/phonetic_extensions_variable.tga
+++ b/src/assets/phonetic_extensions_variable.tga
--- a/src/assets/thai_variable.tga
+++ b/src/assets/thai_variable.tga
--- a/src/assets/tsalagi_variable.tga
+++ b/src/assets/tsalagi_variable.tga
--- a/work_files/cyrilic_variable.psd
+++ b/work_files/cyrilic_variable.psd
--- a/work_files/hayeren_variable.psd
+++ b/work_files/hayeren_variable.psd
--- a/work_files/ipa_ext_variable.psd
+++ b/work_files/ipa_ext_variable.psd
--- a/work_files/latinExtD_variable.psd
+++ b/work_files/latinExtD_variable.psd
--- a/work_files/latinExtF_variable.kra
+++ b/work_files/latinExtF_variable.kra
--- a/work_files/latinExtG_variable.kra
+++ b/work_files/latinExtG_variable.kra
--- a/work_files/latinExt_additional_variable.psd
+++ b/work_files/latinExt_additional_variable.psd
--- a/work_files/phonetic_extensions_variable.psd
+++ b/work_files/phonetic_extensions_variable.psd
--- a/work_files/thai_variable.psd
+++ b/work_files/thai_variable.psd
--- a/work_files/tsalagi_variable.psd
+++ b/work_files/tsalagi_variable.psd
Author	SHA1	Message	Date
minjaesong	175fe4edfb	demo text update	2026-03-13 14:13:28 +09:00
minjaesong	4d7aa79740	fixed some mislabeling	2026-03-13 13:59:58 +09:00
minjaesong	9d9efce9d4	Latin Ext F and G	2026-03-13 13:29:43 +09:00