Latin Ext F and G

This commit is contained in:
minjaesong
2026-03-13 13:29:43 +09:00
parent 8daa968d80
commit 9d9efce9d4
17 changed files with 231 additions and 22 deletions

View File

@@ -2,6 +2,7 @@
#include "tga.h" #include "tga.h"
#include "nn.h" #include "nn.h"
#include "safetensor.h" #include "safetensor.h"
#include "unicode_lm.h"
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
@@ -75,7 +76,8 @@ int apply_model(const char *tga_path) {
int rows = img->height / cell_h; int rows = img->height / cell_h;
int total_cells = cols * rows; int total_cells = cols * rows;
int processed = 0, updated = 0, skipped = 0; int start_code = sheet_start_code(basename);
int processed = 0, updated = 0, skipped = 0, fixed_lm = 0;
for (int index = 0; index < total_cells; index++) { for (int index = 0; index < total_cells; index++) {
int cell_x, cell_y; int cell_x, cell_y;
@@ -107,6 +109,21 @@ int apply_model(const char *tga_path) {
int opcode = (int)((dir_pixel >> 24) & 0xFF); int opcode = (int)((dir_pixel >> 24) & 0xFF);
if (opcode != 0) { skipped++; continue; } if (opcode != 0) { skipped++; continue; }
/* Modifier letters: fixed kern pixel, skip inference */
if (start_code >= 0 && is_modifier_letter(start_code + index)) {
if (is_subscript_modifier(start_code + index)) {
/* Subscript: CDEFGHJK(B), lowheight=1 */
tga_write_pixel(tga_path, img, tag_x, tag_y + 5, 0xFFFFFFFF);
tga_write_pixel(tga_path, img, tag_x, tag_y + 6, 0x00C03FFF);
} else {
/* Superscript: ABCDEF(B), lowheight=0 */
tga_write_pixel(tga_path, img, tag_x, tag_y + 5, 0x00000000);
tga_write_pixel(tga_path, img, tag_x, tag_y + 6, 0x0000FCFF);
}
processed++; updated++; fixed_lm++;
continue;
}
/* Extract 15x20 binary input */ /* Extract 15x20 binary input */
float input[300]; float input[300];
for (int gy = 0; gy < 20; gy++) { for (int gy = 0; gy < 20; gy++) {
@@ -155,8 +172,8 @@ int apply_model(const char *tga_path) {
updated++; updated++;
} }
printf("Processed: %d cells, Updated: %d, Skipped: %d (of %d total)\n", printf("Processed: %d cells, Updated: %d, Skipped: %d, Fixed Lm: %d (of %d total)\n",
processed, updated, skipped, total_cells); processed, updated, skipped, fixed_lm, total_cells);
tga_free(img); tga_free(img);
network_free(net); network_free(net);

Binary file not shown.

View File

@@ -2,6 +2,7 @@
#include "tga.h" #include "tga.h"
#include "nn.h" #include "nn.h"
#include "safetensor.h" #include "safetensor.h"
#include "unicode_lm.h"
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
@@ -42,7 +43,8 @@ static void extract_shape_bits(int kerning_mask, float *shape) {
/* ---- Collect samples from one TGA ---- */ /* ---- Collect samples from one TGA ---- */
static int collect_from_sheet(const char *path, int is_xyswap, Sample *samples, int max_samples) { static int collect_from_sheet(const char *path, int is_xyswap, int start_code,
Sample *samples, int max_samples) {
TgaImage *img = tga_read(path); TgaImage *img = tga_read(path);
if (!img) { if (!img) {
fprintf(stderr, "Warning: cannot read %s\n", path); fprintf(stderr, "Warning: cannot read %s\n", path);
@@ -76,6 +78,10 @@ static int collect_from_sheet(const char *path, int is_xyswap, Sample *samples,
} }
if (width == 0) continue; if (width == 0) continue;
/* Skip modifier letters (superscripts/subscripts) */
if (start_code >= 0 && is_modifier_letter(start_code + index))
continue;
/* Read kerning data pixel at Y+6 */ /* Read kerning data pixel at Y+6 */
uint32_t kern_pixel = tagify(tga_get_pixel(img, tag_x, tag_y + 6)); uint32_t kern_pixel = tagify(tga_get_pixel(img, tag_x, tag_y + 6));
if ((kern_pixel & 0xFF) == 0) continue; /* no kern data */ if ((kern_pixel & 0xFF) == 0) continue; /* no kern data */
@@ -170,7 +176,9 @@ int train_model(void) {
char fullpath[512]; char fullpath[512];
snprintf(fullpath, sizeof(fullpath), "%s/%s", assets_dir, name); snprintf(fullpath, sizeof(fullpath), "%s/%s", assets_dir, name);
int got = collect_from_sheet(fullpath, is_xyswap, all_samples + total, max_total - total); int start_code = sheet_start_code(name);
int got = collect_from_sheet(fullpath, is_xyswap, start_code,
all_samples + total, max_total - total);
if (got > 0) { if (got > 0) {
printf(" %s: %d samples\n", name, got); printf(" %s: %d samples\n", name, got);
total += got; total += got;

View File

@@ -20,10 +20,26 @@ import json
import os import os
import struct import struct
import sys import sys
import unicodedata
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
# ---- Sheet code ranges (imported from OTFbuild/sheet_config.py) ----
_otfbuild = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'OTFbuild')
try:
sys.path.insert(0, _otfbuild)
from sheet_config import FILE_LIST as _FILE_LIST, CODE_RANGE as _CODE_RANGE
sys.path.pop(0)
_CODE_RANGE_MAP = {}
for _i, _fn in enumerate(_FILE_LIST):
if _i < len(_CODE_RANGE):
_CODE_RANGE_MAP[_fn] = _CODE_RANGE[_i]
except ImportError:
_CODE_RANGE_MAP = {}
# ---- TGA reader (matches OTFbuild/tga_reader.py and Autokem/tga.c) ---- # ---- TGA reader (matches OTFbuild/tga_reader.py and Autokem/tga.c) ----
class TgaImage: class TgaImage:
@@ -80,7 +96,7 @@ def tagify(pixel):
# ---- Data collection (matches Autokem/train.c) ---- # ---- Data collection (matches Autokem/train.c) ----
def collect_from_sheet(path, is_xyswap): def collect_from_sheet(path, is_xyswap, code_range=None):
"""Extract labelled samples from a single TGA sheet.""" """Extract labelled samples from a single TGA sheet."""
img = read_tga(path) img = read_tga(path)
cell_w, cell_h = 16, 20 cell_w, cell_h = 16, 20
@@ -90,6 +106,7 @@ def collect_from_sheet(path, is_xyswap):
inputs = [] inputs = []
labels = [] labels = []
skipped_lm = 0
for index in range(total_cells): for index in range(total_cells):
if is_xyswap: if is_xyswap:
@@ -110,6 +127,16 @@ def collect_from_sheet(path, is_xyswap):
if width == 0: if width == 0:
continue continue
# Skip modifier letters (superscripts/subscripts)
if code_range is not None and index < len(code_range):
cp = code_range[index]
try:
if unicodedata.category(chr(cp)) == 'Lm':
skipped_lm += 1
continue
except (ValueError, OverflowError):
pass
# Kern data pixel at Y+6 # Kern data pixel at Y+6
kern_pixel = tagify(img.get_pixel(tag_x, tag_y + 6)) kern_pixel = tagify(img.get_pixel(tag_x, tag_y + 6))
if (kern_pixel & 0xFF) == 0: if (kern_pixel & 0xFF) == 0:
@@ -145,7 +172,7 @@ def collect_from_sheet(path, is_xyswap):
inputs.append(inp) inputs.append(inp)
labels.append(shape + [is_kern_ytype, is_low_height]) labels.append(shape + [is_kern_ytype, is_low_height])
return inputs, labels return inputs, labels, skipped_lm
def collect_all_samples(assets_dir): def collect_all_samples(assets_dir):
@@ -153,6 +180,7 @@ def collect_all_samples(assets_dir):
all_inputs = [] all_inputs = []
all_labels = [] all_labels = []
file_count = 0 file_count = 0
total_skipped_lm = 0
for name in sorted(os.listdir(assets_dir)): for name in sorted(os.listdir(assets_dir)):
if not name.endswith('_variable.tga'): if not name.endswith('_variable.tga'):
@@ -161,14 +189,20 @@ def collect_all_samples(assets_dir):
continue continue
is_xyswap = 'xyswap' in name is_xyswap = 'xyswap' in name
code_range = _CODE_RANGE_MAP.get(name, None)
path = os.path.join(assets_dir, name) path = os.path.join(assets_dir, name)
inputs, labels = collect_from_sheet(path, is_xyswap) inputs, labels, skipped_lm = collect_from_sheet(path, is_xyswap, code_range)
total_skipped_lm += skipped_lm
if inputs: if inputs:
print(f" {name}: {len(inputs)} samples") suffix = f" (skipped {skipped_lm} Lm)" if skipped_lm else ""
print(f" {name}: {len(inputs)} samples{suffix}")
all_inputs.extend(inputs) all_inputs.extend(inputs)
all_labels.extend(labels) all_labels.extend(labels)
file_count += 1 file_count += 1
if total_skipped_lm:
print(f" Total modifier letters filtered: {total_skipped_lm}")
return np.array(all_inputs), np.array(all_labels, dtype=np.float32), file_count return np.array(all_inputs), np.array(all_labels, dtype=np.float32), file_count

141
Autokem/unicode_lm.h Normal file
View File

@@ -0,0 +1,141 @@
#ifndef UNICODE_LM_H
#define UNICODE_LM_H
#include <string.h>
/*
* Unicode category Lm (Letter, modifier) range checks.
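* Generated from Python unicodedata.
* NOTE(review): the header originally said "Unicode 16.0", but the 397
* codepoints listed below match the Lm set of Unicode 15.x; Unicode 16.0
* defines additional Lm codepoints (e.g. U+10D4E, U+10D6F) that are not
* in this table. Confirm which Unicode version the generator really used.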
*
* is_modifier_letter(cp) — true for all Lm codepoints
* is_subscript_modifier(cp) — true for Lm codepoints with <sub> decomposition
*/
/*
 * Report whether `cp` falls inside one of the hard-coded Unicode
 * general-category Lm (Letter, modifier) ranges.
 *
 * cp: codepoint to classify; any int is accepted, values outside every
 *     listed range (including negatives) simply yield 0.
 * Returns 1 when cp is covered by the table, 0 otherwise.
 *
 * The table holds 71 inclusive [first, last] ranges (397 codepoints in
 * total) and is kept in ascending order so the scan can stop early.
 */
static inline int is_modifier_letter(int cp) {
    static const struct {
        int first;
        int last;
    } lm_ranges[] = {
        { 0x02B0, 0x02C1 },
        { 0x02C6, 0x02D1 },
        { 0x02E0, 0x02E4 },
        { 0x02EC, 0x02EC },
        { 0x02EE, 0x02EE },
        { 0x0374, 0x0374 },
        { 0x037A, 0x037A },
        { 0x0559, 0x0559 },
        { 0x0640, 0x0640 },
        { 0x06E5, 0x06E6 },
        { 0x07F4, 0x07F5 },
        { 0x07FA, 0x07FA },
        { 0x081A, 0x081A },
        { 0x0824, 0x0824 },
        { 0x0828, 0x0828 },
        { 0x08C9, 0x08C9 },
        { 0x0971, 0x0971 },
        { 0x0E46, 0x0E46 },
        { 0x0EC6, 0x0EC6 },
        { 0x10FC, 0x10FC },
        { 0x17D7, 0x17D7 },
        { 0x1843, 0x1843 },
        { 0x1AA7, 0x1AA7 },
        { 0x1C78, 0x1C7D },
        { 0x1D2C, 0x1D6A },
        { 0x1D78, 0x1D78 },
        { 0x1D9B, 0x1DBF },
        { 0x2071, 0x2071 },
        { 0x207F, 0x207F },
        { 0x2090, 0x209C },
        { 0x2C7C, 0x2C7D },
        { 0x2D6F, 0x2D6F },
        { 0x2E2F, 0x2E2F },
        { 0x3005, 0x3005 },
        { 0x3031, 0x3035 },
        { 0x303B, 0x303B },
        { 0x309D, 0x309E },
        { 0x30FC, 0x30FE },
        { 0xA015, 0xA015 },
        { 0xA4F8, 0xA4FD },
        { 0xA60C, 0xA60C },
        { 0xA67F, 0xA67F },
        { 0xA69C, 0xA69D },
        { 0xA717, 0xA71F },
        { 0xA770, 0xA770 },
        { 0xA788, 0xA788 },
        { 0xA7F2, 0xA7F4 },
        { 0xA7F8, 0xA7F9 },
        { 0xA9CF, 0xA9CF },
        { 0xA9E6, 0xA9E6 },
        { 0xAA70, 0xAA70 },
        { 0xAADD, 0xAADD },
        { 0xAAF3, 0xAAF4 },
        { 0xAB5C, 0xAB5F },
        { 0xAB69, 0xAB69 },
        { 0xFF70, 0xFF70 },
        { 0xFF9E, 0xFF9F },
        { 0x10780, 0x10785 },
        { 0x10787, 0x107B0 },
        { 0x107B2, 0x107BA },
        { 0x16B40, 0x16B43 },
        { 0x16F93, 0x16F9F },
        { 0x16FE0, 0x16FE1 },
        { 0x16FE3, 0x16FE3 },
        { 0x1AFF0, 0x1AFF3 },
        { 0x1AFF5, 0x1AFFB },
        { 0x1AFFD, 0x1AFFE },
        { 0x1E030, 0x1E06D },
        { 0x1E137, 0x1E13D },
        { 0x1E4EB, 0x1E4EB },
        { 0x1E94B, 0x1E94B },
    };
    const unsigned range_count = (unsigned)(sizeof lm_ranges / sizeof lm_ranges[0]);

    for (unsigned slot = 0; slot < range_count; slot++) {
        /* Ranges are ascending: once cp is below a range start, no later range can match. */
        if (cp < lm_ranges[slot].first) {
            break;
        }
        if (cp <= lm_ranges[slot].last) {
            return 1;
        }
    }
    return 0;
}
/*
 * Report whether `cp` lies in one of the four hard-coded ranges of Lm
 * codepoints that carry a <sub> decomposition (49 codepoints in total).
 *
 * cp: codepoint to classify.
 * Returns 1 for a listed subscript codepoint, 0 otherwise.
 *
 * This is a pure range test; it does not call is_modifier_letter().
 */
static inline int is_subscript_modifier(int cp) {
    const int in_1d62_run  = (cp >= 0x1D62 && cp <= 0x1D6A);   /* 9 codepoints  */
    const int in_2090_run  = (cp >= 0x2090 && cp <= 0x209C);   /* 13 codepoints */
    const int is_2c7c      = (cp == 0x2C7C);                   /* 1 codepoint   */
    const int in_1e051_run = (cp >= 0x1E051 && cp <= 0x1E06A); /* 26 codepoints */

    return in_1d62_run || in_2090_run || is_2c7c || in_1e051_run;
}
/*
 * Map a sheet filename to the first codepoint of its (contiguous) code range.
 *
 * basename: sheet file name (or any string containing it); matching is by
 *           substring, so a directory prefix or file extension is harmless.
 *           NULL is accepted and treated as "unknown".
 * Returns the start codepoint, or -1 if the sheet is unknown.
 *
 * For non-contiguous sheets (e.g. Devanagari) this is the start of the
 * first sub-range; cells beyond it won't collide with Lm codepoints in
 * practice.
 *
 * Patterns are tried in table order and the first hit wins, so the order
 * of the entries is part of the behaviour and must be preserved when new
 * sheets are added.
 *
 * Declared `static inline` (like the other helpers in this header) so a
 * translation unit that includes the header without calling it does not
 * get an unused-function warning.
 */
static inline int sheet_start_code(const char *basename) {
    static const struct {
        const char *pattern;
        int start;
    } sheet_table[] = {
        { "ascii_variable",               0x00    },
        { "latinExtA_variable",           0x100   },
        { "latinExtB_variable",           0x180   },
        { "cyrilic_extC_variable",        0x1C80  },
        { "cyrilic_extB_variable",        0xA640  },
        { "cyrilic_bulgarian_variable",   0xF0000 },
        { "cyrilic_serbian_variable",     0xF0060 },
        { "cyrilic_variable",             0x400   },
        { "halfwidth_fullwidth_variable", 0xFF00  },
        { "unipunct_variable",            0x2000  },
        { "greek_polytonic",              0x1F00  },
        { "greek_variable",               0x370   },
        { "thai_variable",                0xE00   },
        { "hayeren_variable",             0x530   },
        { "kartuli_allcaps_variable",     0x1C90  },
        { "kartuli_variable",             0x10D0  },
        { "ipa_ext_variable",             0x250   },
        { "latinExt_additional_variable", 0x1E00  },
        { "tsalagi_variable",             0x13A0  },
        { "phonetic_extensions_variable", 0x1D00  },
        { "latinExtC_variable",           0x2C60  },
        { "latinExtD_variable",           0xA720  },
        { "internal_variable",            0xFFE00 },
        { "letterlike_symbols_variable",  0x2100  },
        { "enclosed_alphanumeric",        0x1F100 },
        { "sundanese_variable",           0x1B80  },
        { "control_pictures_variable",    0x2400  },
        { "latinExtE_variable",           0xAB30  },
        { "latinExtF_variable",           0x10780 },
        { "latinExtG_variable",           0x1DF00 },
    };
    const unsigned entry_count = (unsigned)(sizeof sheet_table / sizeof sheet_table[0]);

    /* strstr() on a null pointer is undefined behaviour; report "unknown" instead. */
    if (basename == NULL) {
        return -1;
    }

    for (unsigned i = 0; i < entry_count; i++) {
        if (strstr(basename, sheet_table[i].pattern) != NULL) {
            return sheet_table[i].start;
        }
    }

    /* Devanagari is matched last and only when the name has no "internal" part. */
    if (strstr(basename, "devanagari") != NULL && strstr(basename, "internal") == NULL) {
        return 0x900;
    }
    return -1;
}
#endif /* UNICODE_LM_H */

BIN
demo.PNG

Binary file not shown.

Before

Width:  |  Height:  |  Size: 177 KiB

After

Width:  |  Height:  |  Size: 177 KiB

View File

@@ -114,7 +114,7 @@ How multilingual? Real multilingual!
Basic Latin Basic Latin
Latin-1 Supplement Latin-1 Supplement
Latin Extended Additional Latin Extended Additional
Latin Extended-A/B/C/D Latin Extended-A/B/C/D/E/F/G
Armenian Armenian
Arrows Arrows
Bengali􏿆ᶠⁱ􀀀 Bengali􏿆ᶠⁱ􀀀

Binary file not shown.

Binary file not shown.

BIN
src/assets/latinExtF_variable.tga LFS Normal file

Binary file not shown.

BIN
src/assets/latinExtG_variable.tga LFS Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
work_files/latinExtG_variable.kra LFS Normal file

Binary file not shown.