From 539a2c9f46fdad42c56371159da1304e3eb0761c Mon Sep 17 00:00:00 2001 From: minjaesong Date: Fri, 13 Mar 2026 20:08:56 +0900 Subject: [PATCH] autokem: more filtering --- Autokem/apply.c | 2 +- Autokem/autokem.safetensors | 2 +- Autokem/train.c | 6 +- Autokem/train_torch.py | 9 ++- Autokem/{unicode_lm.h => unicode_filter.h} | 86 +++++++++++++++++----- 5 files changed, 78 insertions(+), 27 deletions(-) rename Autokem/{unicode_lm.h => unicode_filter.h} (63%) diff --git a/Autokem/apply.c b/Autokem/apply.c index fe8a97f..ff8ed22 100644 --- a/Autokem/apply.c +++ b/Autokem/apply.c @@ -2,7 +2,7 @@ #include "tga.h" #include "nn.h" #include "safetensor.h" -#include "unicode_lm.h" +#include "unicode_filter.h" #include #include diff --git a/Autokem/autokem.safetensors b/Autokem/autokem.safetensors index b478c58..07a9804 100644 --- a/Autokem/autokem.safetensors +++ b/Autokem/autokem.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:850ca1933dba7aece95edcdc00748c1ac0e2e6fcc67ab429e11cd2617482c146 +oid sha256:9dd9af14adfdf3c69e631ae859ad7891610f4fb978442f8409ce8027e968f0bd size 487640 diff --git a/Autokem/train.c b/Autokem/train.c index 418a44e..33a4433 100644 --- a/Autokem/train.c +++ b/Autokem/train.c @@ -2,7 +2,7 @@ #include "tga.h" #include "nn.h" #include "safetensor.h" -#include "unicode_lm.h" +#include "unicode_filter.h" #include #include @@ -78,8 +78,8 @@ static int collect_from_sheet(const char *path, int is_xyswap, int start_code, } if (width == 0) continue; - /* Skip modifier letters (superscripts/subscripts) */ - if (start_code >= 0 && is_modifier_letter(start_code + index)) + /* Skip modifier letters, symbols, punctuation */ + if (start_code >= 0 && is_excluded_from_training(start_code + index)) continue; /* Read kerning data pixel at Y+6 */ diff --git a/Autokem/train_torch.py b/Autokem/train_torch.py index 425c980..7be7422 100644 --- a/Autokem/train_torch.py +++ b/Autokem/train_torch.py @@ -127,11 +127,12 @@ def collect_from_sheet(path, is_xyswap, code_range=None): if width == 0: continue - # Skip modifier letters (superscripts/subscripts) + # Skip modifier letters, symbols, punctuation if code_range is not None and index < len(code_range): cp = code_range[index] try: - if unicodedata.category(chr(cp)) == 'Lm': + cat = unicodedata.category(chr(cp)) + if cat == 'Lm' or cat[0] in ('S', 'P'): skipped_lm += 1 continue except (ValueError, OverflowError): @@ -194,14 +195,14 @@ def collect_all_samples(assets_dir): inputs, labels, skipped_lm = collect_from_sheet(path, is_xyswap, code_range) total_skipped_lm += skipped_lm if inputs: - suffix = f" (skipped {skipped_lm} Lm)" if skipped_lm else "" + suffix = f" (skipped {skipped_lm})" if skipped_lm else "" print(f" {name}: {len(inputs)} samples{suffix}") all_inputs.extend(inputs) all_labels.extend(labels) file_count += 1 if total_skipped_lm: - print(f" Total modifier letters filtered: {total_skipped_lm}") + print(f" Filtered (Lm/S/P): {total_skipped_lm}") return np.array(all_inputs), np.array(all_labels, dtype=np.float32), file_count diff --git a/Autokem/unicode_lm.h b/Autokem/unicode_filter.h similarity index 63% rename from Autokem/unicode_lm.h rename to Autokem/unicode_filter.h index 095144d..5ee25cd 100644 --- a/Autokem/unicode_lm.h +++ b/Autokem/unicode_filter.h @@ -1,18 +1,21 @@ -#ifndef UNICODE_LM_H -#define UNICODE_LM_H +#ifndef UNICODE_FILTER_H +#define UNICODE_FILTER_H #include /* - * Unicode category Lm (Letter, modifier) range checks. + * Unicode category filters for training/apply. * Generated from Python unicodedata (Unicode 16.0). * - * is_modifier_letter(cp) — true for all Lm codepoints - * is_subscript_modifier(cp) — true for Lm codepoints with decomposition + * is_modifier_letter(cp) — category Lm + * is_subscript_modifier(cp) — Lm with decomposition + * is_symbol_or_punctuation(cp) — categories S* or P* + * is_excluded_from_training(cp) — Lm or S* or P* */ +/* ---- Lm (modifier letter) ---- */ + static inline int is_modifier_letter(int cp) { - /* 71 contiguous ranges covering all 397 Lm codepoints */ if (cp >= 0x02B0 && cp <= 0x02C1) return 1; if (cp >= 0x02C6 && cp <= 0x02D1) return 1; if (cp >= 0x02E0 && cp <= 0x02E4) return 1; @@ -88,20 +91,67 @@ static inline int is_modifier_letter(int cp) { } static inline int is_subscript_modifier(int cp) { - /* 49 Lm codepoints with decomposition */ - if (cp >= 0x1D62 && cp <= 0x1D6A) return 1; /* 9 */ - if (cp >= 0x2090 && cp <= 0x209C) return 1; /* 13 */ - if (cp == 0x2C7C) return 1; /* 1 */ - if (cp >= 0x1E051 && cp <= 0x1E06A) return 1; /* 26 */ + if (cp >= 0x1D62 && cp <= 0x1D6A) return 1; + if (cp >= 0x2090 && cp <= 0x209C) return 1; + if (cp == 0x2C7C) return 1; + if (cp >= 0x1E051 && cp <= 0x1E06A) return 1; return 0; } -/* - * Map sheet filename to first codepoint of its (contiguous) code range. - * Returns -1 if unknown. For non-contiguous sheets (e.g. Devanagari), - * returns the start of the first sub-range; cells beyond it won't - * collide with Lm codepoints in practice. - */ +/* ---- S* (Symbol) and P* (Punctuation) ---- */ + +/* Table of {start, end} ranges for S/P codepoints in font sheets */ +static const int sp_ranges[][2] = { + {0x00021, 0x0002F}, {0x0003A, 0x00040}, {0x0005B, 0x00060}, + {0x0007B, 0x0007E}, {0x000A1, 0x000A9}, {0x000AB, 0x000AC}, + {0x000AE, 0x000B1}, {0x000B4, 0x000B4}, {0x000B6, 0x000B8}, + {0x000BB, 0x000BB}, {0x000BF, 0x000BF}, {0x000D7, 0x000D7}, + {0x000F7, 0x000F7}, {0x002C2, 0x002C5}, {0x002D2, 0x002DF}, + {0x002E5, 0x002EB}, {0x002ED, 0x002ED}, {0x002EF, 0x002FF}, + {0x00375, 0x00375}, {0x0037E, 0x0037E}, {0x00384, 0x00385}, + {0x00387, 0x00387}, {0x00482, 0x00482}, {0x0055A, 0x0055F}, + {0x00589, 0x0058A}, {0x0058D, 0x0058F}, {0x00964, 0x00965}, + {0x00970, 0x00970}, {0x009F2, 0x009F3}, {0x009FA, 0x009FB}, + {0x009FD, 0x009FD}, {0x00BF3, 0x00BFA}, {0x00E3F, 0x00E3F}, + {0x00E4F, 0x00E4F}, {0x00E5A, 0x00E5B}, {0x010FB, 0x010FB}, + {0x016EB, 0x016ED}, {0x01CC0, 0x01CC7}, {0x01FBD, 0x01FBD}, + {0x01FBF, 0x01FC1}, {0x01FCD, 0x01FCF}, {0x01FDD, 0x01FDF}, + {0x01FED, 0x01FEF}, {0x01FFD, 0x01FFE}, {0x02010, 0x02027}, + {0x02030, 0x0205E}, {0x0207A, 0x0207E}, {0x0208A, 0x0208E}, + {0x020A0, 0x020C0}, {0x02100, 0x02101}, {0x02103, 0x02106}, + {0x02108, 0x02109}, {0x02114, 0x02114}, {0x02116, 0x02118}, + {0x0211E, 0x02123}, {0x02125, 0x02125}, {0x02127, 0x02127}, + {0x02129, 0x02129}, {0x0212E, 0x0212E}, {0x0213A, 0x0213B}, + {0x02140, 0x02144}, {0x0214A, 0x0214D}, {0x0214F, 0x0214F}, + {0x0218A, 0x0218B}, {0x02190, 0x021FF}, {0x02400, 0x02426}, + {0x02800, 0x028FF}, {0x03001, 0x03004}, {0x03008, 0x03020}, + {0x03030, 0x03030}, {0x03036, 0x03037}, {0x0303D, 0x0303F}, + {0x0309B, 0x0309C}, {0x030A0, 0x030A0}, {0x030FB, 0x030FB}, + {0x04DC0, 0x04DFF}, {0x0A673, 0x0A673}, {0x0A67E, 0x0A67E}, + {0x0A720, 0x0A721}, {0x0A789, 0x0A78A}, {0x0AB5B, 0x0AB5B}, + {0x0AB6A, 0x0AB6B}, {0x0FF01, 0x0FF0F}, {0x0FF1A, 0x0FF20}, + {0x0FF3B, 0x0FF40}, {0x0FF5B, 0x0FF65}, {0x0FFE0, 0x0FFE6}, + {0x0FFE8, 0x0FFEE}, {0x0FFFC, 0x0FFFD}, {0x1F10D, 0x1F1AD}, + {0x1F1E6, 0x1F1FF}, {0x1FB00, 0x1FB92}, {0x1FB94, 0x1FBCA}, +}; + +static inline int is_symbol_or_punctuation(int cp) { + int n = (int)(sizeof(sp_ranges) / sizeof(sp_ranges[0])); + for (int i = 0; i < n; i++) { + if (cp >= sp_ranges[i][0] && cp <= sp_ranges[i][1]) + return 1; + } + return 0; +} + +/* ---- Combined filter for training exclusion ---- */ + +static inline int is_excluded_from_training(int cp) { + return is_modifier_letter(cp) || is_symbol_or_punctuation(cp); +} + +/* ---- Sheet filename → start codepoint ---- */ + static int sheet_start_code(const char *basename) { if (strstr(basename, "ascii_variable")) return 0x00; if (strstr(basename, "latinExtA_variable")) return 0x100; @@ -138,4 +188,4 @@ static int sheet_start_code(const char *basename) { return -1; } -#endif /* UNICODE_LM_H */ +#endif /* UNICODE_FILTER_H */