diff --git a/Autokem/apply.c b/Autokem/apply.c index 43455f1..fe8a97f 100644 --- a/Autokem/apply.c +++ b/Autokem/apply.c @@ -2,6 +2,7 @@ #include "tga.h" #include "nn.h" #include "safetensor.h" +#include "unicode_lm.h" #include #include @@ -75,7 +76,8 @@ int apply_model(const char *tga_path) { int rows = img->height / cell_h; int total_cells = cols * rows; - int processed = 0, updated = 0, skipped = 0; + int start_code = sheet_start_code(basename); + int processed = 0, updated = 0, skipped = 0, fixed_lm = 0; for (int index = 0; index < total_cells; index++) { int cell_x, cell_y; @@ -107,6 +109,21 @@ int apply_model(const char *tga_path) { int opcode = (int)((dir_pixel >> 24) & 0xFF); if (opcode != 0) { skipped++; continue; } + /* Modifier letters: fixed kern pixel, skip inference */ + if (start_code >= 0 && is_modifier_letter(start_code + index)) { + if (is_subscript_modifier(start_code + index)) { + /* Subscript: CDEFGHJK(B), lowheight=1 */ + tga_write_pixel(tga_path, img, tag_x, tag_y + 5, 0xFFFFFFFF); + tga_write_pixel(tga_path, img, tag_x, tag_y + 6, 0x00C03FFF); + } else { + /* Superscript: ABCDEF(B), lowheight=0 */ + tga_write_pixel(tga_path, img, tag_x, tag_y + 5, 0x00000000); + tga_write_pixel(tga_path, img, tag_x, tag_y + 6, 0x0000FCFF); + } + processed++; updated++; fixed_lm++; + continue; + } + /* Extract 15x20 binary input */ float input[300]; for (int gy = 0; gy < 20; gy++) { @@ -155,8 +172,8 @@ int apply_model(const char *tga_path) { updated++; } - printf("Processed: %d cells, Updated: %d, Skipped: %d (of %d total)\n", - processed, updated, skipped, total_cells); + printf("Processed: %d cells, Updated: %d, Skipped: %d, Fixed Lm: %d (of %d total)\n", + processed, updated, skipped, fixed_lm, total_cells); tga_free(img); network_free(net); diff --git a/Autokem/autokem.safetensors b/Autokem/autokem.safetensors index d0e8e83..8143cc1 100644 --- a/Autokem/autokem.safetensors +++ b/Autokem/autokem.safetensors @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:1c86449f1fdd0a57c22b4d2c80ab8b574429d325ba952b27f2a87837873b5118 +oid sha256:c20b8357be6f8464c62884fc8a477696324fce1d46c6ed86b816015d3101072c size 487640 diff --git a/Autokem/train.c b/Autokem/train.c index c647296..418a44e 100644 --- a/Autokem/train.c +++ b/Autokem/train.c @@ -2,6 +2,7 @@ #include "tga.h" #include "nn.h" #include "safetensor.h" +#include "unicode_lm.h" #include #include @@ -42,7 +43,8 @@ static void extract_shape_bits(int kerning_mask, float *shape) { /* ---- Collect samples from one TGA ---- */ -static int collect_from_sheet(const char *path, int is_xyswap, Sample *samples, int max_samples) { +static int collect_from_sheet(const char *path, int is_xyswap, int start_code, + Sample *samples, int max_samples) { TgaImage *img = tga_read(path); if (!img) { fprintf(stderr, "Warning: cannot read %s\n", path); @@ -76,6 +78,10 @@ static int collect_from_sheet(const char *path, int is_xyswap, Sample *samples, } if (width == 0) continue; + /* Skip modifier letters (superscripts/subscripts) */ + if (start_code >= 0 && is_modifier_letter(start_code + index)) + continue; + /* Read kerning data pixel at Y+6 */ uint32_t kern_pixel = tagify(tga_get_pixel(img, tag_x, tag_y + 6)); if ((kern_pixel & 0xFF) == 0) continue; /* no kern data */ @@ -170,7 +176,9 @@ int train_model(void) { char fullpath[512]; snprintf(fullpath, sizeof(fullpath), "%s/%s", assets_dir, name); - int got = collect_from_sheet(fullpath, is_xyswap, all_samples + total, max_total - total); + int start_code = sheet_start_code(name); + int got = collect_from_sheet(fullpath, is_xyswap, start_code, + all_samples + total, max_total - total); if (got > 0) { printf(" %s: %d samples\n", name, got); total += got; diff --git a/Autokem/train_torch.py b/Autokem/train_torch.py index a915440..425c980 100644 --- a/Autokem/train_torch.py +++ b/Autokem/train_torch.py @@ -20,10 +20,26 @@ import json import os import struct import sys +import unicodedata from pathlib 
import Path import numpy as np +# ---- Sheet code ranges (imported from OTFbuild/sheet_config.py) ---- + +_otfbuild = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'OTFbuild') +try: + sys.path.insert(0, _otfbuild) + from sheet_config import FILE_LIST as _FILE_LIST, CODE_RANGE as _CODE_RANGE + sys.path.pop(0) + _CODE_RANGE_MAP = {} + for _i, _fn in enumerate(_FILE_LIST): + if _i < len(_CODE_RANGE): + _CODE_RANGE_MAP[_fn] = _CODE_RANGE[_i] +except ImportError: + _CODE_RANGE_MAP = {} + + # ---- TGA reader (matches OTFbuild/tga_reader.py and Autokem/tga.c) ---- class TgaImage: @@ -80,7 +96,7 @@ def tagify(pixel): # ---- Data collection (matches Autokem/train.c) ---- -def collect_from_sheet(path, is_xyswap): +def collect_from_sheet(path, is_xyswap, code_range=None): """Extract labelled samples from a single TGA sheet.""" img = read_tga(path) cell_w, cell_h = 16, 20 @@ -90,6 +106,7 @@ def collect_from_sheet(path, is_xyswap): inputs = [] labels = [] + skipped_lm = 0 for index in range(total_cells): if is_xyswap: @@ -110,6 +127,16 @@ def collect_from_sheet(path, is_xyswap): if width == 0: continue + # Skip modifier letters (superscripts/subscripts) + if code_range is not None and index < len(code_range): + cp = code_range[index] + try: + if unicodedata.category(chr(cp)) == 'Lm': + skipped_lm += 1 + continue + except (ValueError, OverflowError): + pass + # Kern data pixel at Y+6 kern_pixel = tagify(img.get_pixel(tag_x, tag_y + 6)) if (kern_pixel & 0xFF) == 0: @@ -145,7 +172,7 @@ def collect_from_sheet(path, is_xyswap): inputs.append(inp) labels.append(shape + [is_kern_ytype, is_low_height]) - return inputs, labels + return inputs, labels, skipped_lm def collect_all_samples(assets_dir): @@ -153,6 +180,7 @@ def collect_all_samples(assets_dir): all_inputs = [] all_labels = [] file_count = 0 + total_skipped_lm = 0 for name in sorted(os.listdir(assets_dir)): if not name.endswith('_variable.tga'): @@ -161,14 +189,20 @@ def collect_all_samples(assets_dir): 
continue is_xyswap = 'xyswap' in name + code_range = _CODE_RANGE_MAP.get(name, None) path = os.path.join(assets_dir, name) - inputs, labels = collect_from_sheet(path, is_xyswap) + inputs, labels, skipped_lm = collect_from_sheet(path, is_xyswap, code_range) + total_skipped_lm += skipped_lm if inputs: - print(f" {name}: {len(inputs)} samples") + suffix = f" (skipped {skipped_lm} Lm)" if skipped_lm else "" + print(f" {name}: {len(inputs)} samples{suffix}") all_inputs.extend(inputs) all_labels.extend(labels) file_count += 1 + if total_skipped_lm: + print(f" Total modifier letters filtered: {total_skipped_lm}") + return np.array(all_inputs), np.array(all_labels, dtype=np.float32), file_count diff --git a/Autokem/unicode_lm.h b/Autokem/unicode_lm.h new file mode 100644 index 0000000..095144d --- /dev/null +++ b/Autokem/unicode_lm.h @@ -0,0 +1,141 @@ +#ifndef UNICODE_LM_H +#define UNICODE_LM_H + +#include + +/* + * Unicode category Lm (Letter, modifier) range checks. + * Generated from Python unicodedata (Unicode 16.0). 
+ * + * is_modifier_letter(cp) — true for all Lm codepoints + * is_subscript_modifier(cp) — true for Lm codepoints with <sub> decomposition + */ + +static inline int is_modifier_letter(int cp) { + /* 71 contiguous ranges covering all 397 Lm codepoints */ + if (cp >= 0x02B0 && cp <= 0x02C1) return 1; + if (cp >= 0x02C6 && cp <= 0x02D1) return 1; + if (cp >= 0x02E0 && cp <= 0x02E4) return 1; + if (cp == 0x02EC) return 1; + if (cp == 0x02EE) return 1; + if (cp == 0x0374) return 1; + if (cp == 0x037A) return 1; + if (cp == 0x0559) return 1; + if (cp == 0x0640) return 1; + if (cp >= 0x06E5 && cp <= 0x06E6) return 1; + if (cp >= 0x07F4 && cp <= 0x07F5) return 1; + if (cp == 0x07FA) return 1; + if (cp == 0x081A) return 1; + if (cp == 0x0824) return 1; + if (cp == 0x0828) return 1; + if (cp == 0x08C9) return 1; + if (cp == 0x0971) return 1; + if (cp == 0x0E46) return 1; + if (cp == 0x0EC6) return 1; + if (cp == 0x10FC) return 1; + if (cp == 0x17D7) return 1; + if (cp == 0x1843) return 1; + if (cp == 0x1AA7) return 1; + if (cp >= 0x1C78 && cp <= 0x1C7D) return 1; + if (cp >= 0x1D2C && cp <= 0x1D6A) return 1; + if (cp == 0x1D78) return 1; + if (cp >= 0x1D9B && cp <= 0x1DBF) return 1; + if (cp == 0x2071) return 1; + if (cp == 0x207F) return 1; + if (cp >= 0x2090 && cp <= 0x209C) return 1; + if (cp >= 0x2C7C && cp <= 0x2C7D) return 1; + if (cp == 0x2D6F) return 1; + if (cp == 0x2E2F) return 1; + if (cp == 0x3005) return 1; + if (cp >= 0x3031 && cp <= 0x3035) return 1; + if (cp == 0x303B) return 1; + if (cp >= 0x309D && cp <= 0x309E) return 1; + if (cp >= 0x30FC && cp <= 0x30FE) return 1; + if (cp == 0xA015) return 1; + if (cp >= 0xA4F8 && cp <= 0xA4FD) return 1; + if (cp == 0xA60C) return 1; + if (cp == 0xA67F) return 1; + if (cp >= 0xA69C && cp <= 0xA69D) return 1; + if (cp >= 0xA717 && cp <= 0xA71F) return 1; + if (cp == 0xA770) return 1; + if (cp == 0xA788) return 1; + if (cp >= 0xA7F2 && cp <= 0xA7F4) return 1; + if (cp >= 0xA7F8 && cp <= 0xA7F9) return 1; + if (cp == 0xA9CF) 
return 1; + if (cp == 0xA9E6) return 1; + if (cp == 0xAA70) return 1; + if (cp == 0xAADD) return 1; + if (cp >= 0xAAF3 && cp <= 0xAAF4) return 1; + if (cp >= 0xAB5C && cp <= 0xAB5F) return 1; + if (cp == 0xAB69) return 1; + if (cp == 0xFF70) return 1; + if (cp >= 0xFF9E && cp <= 0xFF9F) return 1; + if (cp >= 0x10780 && cp <= 0x10785) return 1; + if (cp >= 0x10787 && cp <= 0x107B0) return 1; + if (cp >= 0x107B2 && cp <= 0x107BA) return 1; + if (cp >= 0x16B40 && cp <= 0x16B43) return 1; + if (cp >= 0x16F93 && cp <= 0x16F9F) return 1; + if (cp >= 0x16FE0 && cp <= 0x16FE1) return 1; + if (cp == 0x16FE3) return 1; + if (cp >= 0x1AFF0 && cp <= 0x1AFF3) return 1; + if (cp >= 0x1AFF5 && cp <= 0x1AFFB) return 1; + if (cp >= 0x1AFFD && cp <= 0x1AFFE) return 1; + if (cp >= 0x1E030 && cp <= 0x1E06D) return 1; + if (cp >= 0x1E137 && cp <= 0x1E13D) return 1; + if (cp == 0x1E4EB) return 1; + if (cp == 0x1E94B) return 1; + return 0; +} + +static inline int is_subscript_modifier(int cp) { + /* 49 Lm codepoints with <sub> decomposition */ + if (cp >= 0x1D62 && cp <= 0x1D6A) return 1; /* 9 */ + if (cp >= 0x2090 && cp <= 0x209C) return 1; /* 13 */ + if (cp == 0x2C7C) return 1; /* 1 */ + if (cp >= 0x1E051 && cp <= 0x1E06A) return 1; /* 26 */ + return 0; +} + +/* + * Map sheet filename to first codepoint of its (contiguous) code range. + * Returns -1 if unknown. For non-contiguous sheets (e.g. Devanagari), + * returns the start of the first sub-range; cells beyond it won't + * collide with Lm codepoints in practice. 
+ */ +static int sheet_start_code(const char *basename) { + if (strstr(basename, "ascii_variable")) return 0x00; + if (strstr(basename, "latinExtA_variable")) return 0x100; + if (strstr(basename, "latinExtB_variable")) return 0x180; + if (strstr(basename, "cyrilic_extC_variable")) return 0x1C80; + if (strstr(basename, "cyrilic_extB_variable")) return 0xA640; + if (strstr(basename, "cyrilic_bulgarian_variable")) return 0xF0000; + if (strstr(basename, "cyrilic_serbian_variable")) return 0xF0060; + if (strstr(basename, "cyrilic_variable")) return 0x400; + if (strstr(basename, "halfwidth_fullwidth_variable")) return 0xFF00; + if (strstr(basename, "unipunct_variable")) return 0x2000; + if (strstr(basename, "greek_polytonic")) return 0x1F00; + if (strstr(basename, "greek_variable")) return 0x370; + if (strstr(basename, "thai_variable")) return 0xE00; + if (strstr(basename, "hayeren_variable")) return 0x530; + if (strstr(basename, "kartuli_allcaps_variable")) return 0x1C90; + if (strstr(basename, "kartuli_variable")) return 0x10D0; + if (strstr(basename, "ipa_ext_variable")) return 0x250; + if (strstr(basename, "latinExt_additional_variable")) return 0x1E00; + if (strstr(basename, "tsalagi_variable")) return 0x13A0; + if (strstr(basename, "phonetic_extensions_variable")) return 0x1D00; + if (strstr(basename, "latinExtC_variable")) return 0x2C60; + if (strstr(basename, "latinExtD_variable")) return 0xA720; + if (strstr(basename, "internal_variable")) return 0xFFE00; + if (strstr(basename, "letterlike_symbols_variable")) return 0x2100; + if (strstr(basename, "enclosed_alphanumeric")) return 0x1F100; + if (strstr(basename, "sundanese_variable")) return 0x1B80; + if (strstr(basename, "control_pictures_variable")) return 0x2400; + if (strstr(basename, "latinExtE_variable")) return 0xAB30; + if (strstr(basename, "latinExtF_variable")) return 0x10780; + if (strstr(basename, "latinExtG_variable")) return 0x1DF00; + if (strstr(basename, "devanagari") && !strstr(basename, 
"internal")) + return 0x900; + return -1; +} + +#endif /* UNICODE_LM_H */ diff --git a/demo.PNG b/demo.PNG index 3b14a54..803bef3 100644 Binary files a/demo.PNG and b/demo.PNG differ diff --git a/demotext_unaligned.txt b/demotext_unaligned.txt index 847ec9b..c1bb571 100755 --- a/demotext_unaligned.txt +++ b/demotext_unaligned.txt @@ -114,7 +114,7 @@ How multilingual? Real multilingual! ⁃ Basic Latin ⁃ Latin-1 Supplement ⁃ Latin Extended Additional -⁃ Latin Extended-A/B/C/D +⁃ Latin Extended-A/B/C/D/E/F/G ⁃ Armenian ⁃ Arrows ⁃ Bengali􏿆ᶠⁱ􀀀 diff --git a/src/assets/ipa_ext_variable.tga b/src/assets/ipa_ext_variable.tga index 46c0002..19c9f07 100755 --- a/src/assets/ipa_ext_variable.tga +++ b/src/assets/ipa_ext_variable.tga @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72a58fa1974770b16f1daed856319914514388c93c33ccbfd62822540d5a32d4 +oid sha256:bbc1d05ede81a0a1d98344edd3893bc6b092a5a3e7587466fcc42c3df48dc4cb size 225298 diff --git a/src/assets/latinExtD_variable.tga b/src/assets/latinExtD_variable.tga index 4c3c401..b541c1b 100644 --- a/src/assets/latinExtD_variable.tga +++ b/src/assets/latinExtD_variable.tga @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f9e2af1ae604cadc8459aec306cd5325f4dc1bfae7b63ffd6f9346b7299d76ff +oid sha256:bb60f2fc6af0b0b5d9d2757cabd601570405018a5248bc85c74c00747bcb7596 size 286738 diff --git a/src/assets/latinExtF_variable.tga b/src/assets/latinExtF_variable.tga new file mode 100644 index 0000000..acbf7c8 --- /dev/null +++ b/src/assets/latinExtF_variable.tga @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa128782b9b5b9641a879723e23227a9e6bc6f292792cf751bb4c5da3c81f4d6 +size 81938 diff --git a/src/assets/latinExtG_variable.tga b/src/assets/latinExtG_variable.tga new file mode 100644 index 0000000..1c9d475 --- /dev/null +++ b/src/assets/latinExtG_variable.tga @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:6fd993ad3a979988ada0e7253c01190ab27bbc87f1339014320fd3e5015d8b50 +size 327698 diff --git a/src/assets/phonetic_extensions_variable.tga b/src/assets/phonetic_extensions_variable.tga index 1d282cb..a734334 100644 --- a/src/assets/phonetic_extensions_variable.tga +++ b/src/assets/phonetic_extensions_variable.tga @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:823083e1b8f00dd3b39f6249845a2eaad5ddf00bb9a27fe98bf3d273d3c0c17f +oid sha256:2fca366fa083c07ebe17decc6537a6040f552c1f841f7639a4b7745a4837a56f size 245778 diff --git a/work_files/ipa_ext_variable.psd b/work_files/ipa_ext_variable.psd index ececc39..812c1e5 100644 --- a/work_files/ipa_ext_variable.psd +++ b/work_files/ipa_ext_variable.psd @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f9d30b45f78d4c1b45fb7597df61490073ff781df6941c34cc837c13d81d641f -size 227492 +oid sha256:83ae5847833cf2a62a56e76d7752567309473d99fbd49b640877ed97efa1d586 +size 217290 diff --git a/work_files/latinExtD_variable.psd b/work_files/latinExtD_variable.psd index 3581720..39c0bf8 100644 --- a/work_files/latinExtD_variable.psd +++ b/work_files/latinExtD_variable.psd @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c978e28c87b130f7f243f6b60269bb61930cb5457cb10ffea4ecca2982fc2f47 -size 328120 +oid sha256:11c589309af5d955e383a13bed4778c8f440ded0ac23ce8bdf73e746037b1430 +size 328092 diff --git a/work_files/latinExtF_variable.kra b/work_files/latinExtF_variable.kra index 30b4b6c..1d8aa0a 100644 --- a/work_files/latinExtF_variable.kra +++ b/work_files/latinExtF_variable.kra @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a3791ea9d40745fb6114f33b6918e0bc1f5251a80990c68b7815685e60b89cb -size 41672 +oid sha256:5c24aa0777412faf4aab7ae567b75e98b17dc17e2eec31561ea26cf9e2985e0f +size 48467 diff --git a/work_files/latinExtG_variable.kra b/work_files/latinExtG_variable.kra new file mode 100644 index 0000000..755ee82 --- /dev/null +++ 
b/work_files/latinExtG_variable.kra @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bf71d461ed69ca112cef36c9d5ba24f835f473f90518dcc85d863c648742e8a +size 43820 diff --git a/work_files/phonetic_extensions_variable.psd b/work_files/phonetic_extensions_variable.psd index 1536ac5..a2e20b2 100644 --- a/work_files/phonetic_extensions_variable.psd +++ b/work_files/phonetic_extensions_variable.psd @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37b022f69eb885f38b4edaf0345f8540d821e2bb9af68d510ce24f677f548739 -size 268717 +oid sha256:aabb7b8c2be78c5f08c87b6f2cc8bd22e837b2e84b130e05683932c995d06bc6 +size 238793