3 Commits

Author SHA1 Message Date
minjaesong
175fe4edfb demo text update 2026-03-13 14:13:28 +09:00
minjaesong
4d7aa79740 fixed some mislabeling 2026-03-13 13:59:58 +09:00
minjaesong
9d9efce9d4 Latin Ext F and G 2026-03-13 13:29:43 +09:00
27 changed files with 250 additions and 41 deletions

View File

@@ -2,6 +2,7 @@
#include "tga.h"
#include "nn.h"
#include "safetensor.h"
#include "unicode_lm.h"
#include <stdio.h>
#include <stdlib.h>
@@ -75,7 +76,8 @@ int apply_model(const char *tga_path) {
int rows = img->height / cell_h;
int total_cells = cols * rows;
int processed = 0, updated = 0, skipped = 0;
int start_code = sheet_start_code(basename);
int processed = 0, updated = 0, skipped = 0, fixed_lm = 0;
for (int index = 0; index < total_cells; index++) {
int cell_x, cell_y;
@@ -107,6 +109,21 @@ int apply_model(const char *tga_path) {
int opcode = (int)((dir_pixel >> 24) & 0xFF);
if (opcode != 0) { skipped++; continue; }
/* Modifier letters: fixed kern pixel, skip inference */
if (start_code >= 0 && is_modifier_letter(start_code + index)) {
if (is_subscript_modifier(start_code + index)) {
/* Subscript: CDEFGHJK(B), lowheight=1 */
tga_write_pixel(tga_path, img, tag_x, tag_y + 5, 0xFFFFFFFF);
tga_write_pixel(tga_path, img, tag_x, tag_y + 6, 0x00C03FFF);
} else {
/* Superscript: ABCDEF(B), lowheight=0 */
tga_write_pixel(tga_path, img, tag_x, tag_y + 5, 0x00000000);
tga_write_pixel(tga_path, img, tag_x, tag_y + 6, 0x0000FCFF);
}
processed++; updated++; fixed_lm++;
continue;
}
/* Extract 15x20 binary input */
float input[300];
for (int gy = 0; gy < 20; gy++) {
@@ -155,8 +172,8 @@ int apply_model(const char *tga_path) {
updated++;
}
printf("Processed: %d cells, Updated: %d, Skipped: %d (of %d total)\n",
processed, updated, skipped, total_cells);
printf("Processed: %d cells, Updated: %d, Skipped: %d, Fixed Lm: %d (of %d total)\n",
processed, updated, skipped, fixed_lm, total_cells);
tga_free(img);
network_free(net);

Binary file not shown.

View File

@@ -2,6 +2,7 @@
#include "tga.h"
#include "nn.h"
#include "safetensor.h"
#include "unicode_lm.h"
#include <stdio.h>
#include <stdlib.h>
@@ -42,7 +43,8 @@ static void extract_shape_bits(int kerning_mask, float *shape) {
/* ---- Collect samples from one TGA ---- */
static int collect_from_sheet(const char *path, int is_xyswap, Sample *samples, int max_samples) {
static int collect_from_sheet(const char *path, int is_xyswap, int start_code,
Sample *samples, int max_samples) {
TgaImage *img = tga_read(path);
if (!img) {
fprintf(stderr, "Warning: cannot read %s\n", path);
@@ -76,6 +78,10 @@ static int collect_from_sheet(const char *path, int is_xyswap, Sample *samples,
}
if (width == 0) continue;
/* Skip modifier letters (superscripts/subscripts) */
if (start_code >= 0 && is_modifier_letter(start_code + index))
continue;
/* Read kerning data pixel at Y+6 */
uint32_t kern_pixel = tagify(tga_get_pixel(img, tag_x, tag_y + 6));
if ((kern_pixel & 0xFF) == 0) continue; /* no kern data */
@@ -170,7 +176,9 @@ int train_model(void) {
char fullpath[512];
snprintf(fullpath, sizeof(fullpath), "%s/%s", assets_dir, name);
int got = collect_from_sheet(fullpath, is_xyswap, all_samples + total, max_total - total);
int start_code = sheet_start_code(name);
int got = collect_from_sheet(fullpath, is_xyswap, start_code,
all_samples + total, max_total - total);
if (got > 0) {
printf(" %s: %d samples\n", name, got);
total += got;

View File

@@ -20,10 +20,26 @@ import json
import os
import struct
import sys
import unicodedata
from pathlib import Path
import numpy as np
# ---- Sheet code ranges (imported from OTFbuild/sheet_config.py) ----
# _CODE_RANGE_MAP maps a sheet filename (entry of FILE_LIST) to the matching
# entry of CODE_RANGE.  It is left empty when sheet_config cannot be imported,
# in which case no per-sheet code range is available to the collectors.
_otfbuild = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'OTFbuild')
sys.path.insert(0, _otfbuild)
try:
    from sheet_config import FILE_LIST as _FILE_LIST, CODE_RANGE as _CODE_RANGE
except ImportError:
    _CODE_RANGE_MAP = {}
else:
    # zip() stops at the shorter sequence, so filenames without a
    # corresponding CODE_RANGE entry are simply left out of the map.
    _CODE_RANGE_MAP = dict(zip(_FILE_LIST, _CODE_RANGE))
finally:
    # Always undo the temporary sys.path entry, including when the import
    # failed; remove by value so an unrelated entry is never popped.
    if _otfbuild in sys.path:
        sys.path.remove(_otfbuild)
# ---- TGA reader (matches OTFbuild/tga_reader.py and Autokem/tga.c) ----
class TgaImage:
@@ -80,7 +96,7 @@ def tagify(pixel):
# ---- Data collection (matches Autokem/train.c) ----
def collect_from_sheet(path, is_xyswap):
def collect_from_sheet(path, is_xyswap, code_range=None):
"""Extract labelled samples from a single TGA sheet."""
img = read_tga(path)
cell_w, cell_h = 16, 20
@@ -90,6 +106,7 @@ def collect_from_sheet(path, is_xyswap):
inputs = []
labels = []
skipped_lm = 0
for index in range(total_cells):
if is_xyswap:
@@ -110,6 +127,16 @@ def collect_from_sheet(path, is_xyswap):
if width == 0:
continue
# Skip modifier letters (superscripts/subscripts)
if code_range is not None and index < len(code_range):
cp = code_range[index]
try:
if unicodedata.category(chr(cp)) == 'Lm':
skipped_lm += 1
continue
except (ValueError, OverflowError):
pass
# Kern data pixel at Y+6
kern_pixel = tagify(img.get_pixel(tag_x, tag_y + 6))
if (kern_pixel & 0xFF) == 0:
@@ -145,7 +172,7 @@ def collect_from_sheet(path, is_xyswap):
inputs.append(inp)
labels.append(shape + [is_kern_ytype, is_low_height])
return inputs, labels
return inputs, labels, skipped_lm
def collect_all_samples(assets_dir):
@@ -153,6 +180,7 @@ def collect_all_samples(assets_dir):
all_inputs = []
all_labels = []
file_count = 0
total_skipped_lm = 0
for name in sorted(os.listdir(assets_dir)):
if not name.endswith('_variable.tga'):
@@ -161,14 +189,20 @@ def collect_all_samples(assets_dir):
continue
is_xyswap = 'xyswap' in name
code_range = _CODE_RANGE_MAP.get(name, None)
path = os.path.join(assets_dir, name)
inputs, labels = collect_from_sheet(path, is_xyswap)
inputs, labels, skipped_lm = collect_from_sheet(path, is_xyswap, code_range)
total_skipped_lm += skipped_lm
if inputs:
print(f" {name}: {len(inputs)} samples")
suffix = f" (skipped {skipped_lm} Lm)" if skipped_lm else ""
print(f" {name}: {len(inputs)} samples{suffix}")
all_inputs.extend(inputs)
all_labels.extend(labels)
file_count += 1
if total_skipped_lm:
print(f" Total modifier letters filtered: {total_skipped_lm}")
return np.array(all_inputs), np.array(all_labels, dtype=np.float32), file_count

141
Autokem/unicode_lm.h Normal file
View File

@@ -0,0 +1,141 @@
#ifndef UNICODE_LM_H
#define UNICODE_LM_H
#include <string.h>
/*
* Unicode category Lm (Letter, modifier) range checks.
* Generated from Python unicodedata (Unicode 16.0).
*
* is_modifier_letter(cp) — true for all Lm codepoints
* is_subscript_modifier(cp) — true for Lm codepoints with <sub> decomposition
*/
static inline int is_modifier_letter(int cp) {
    /*
     * Returns 1 when cp falls inside one of the listed Lm spans, 0 otherwise.
     *
     * 71 inclusive {first, last} spans covering all 397 Lm codepoints.
     * Single codepoints are stored as spans with first == last.  The table
     * is kept in ascending order, which the scan below relies on.
     */
    static const int lm_spans[][2] = {
        {0x02B0, 0x02C1}, {0x02C6, 0x02D1}, {0x02E0, 0x02E4},
        {0x02EC, 0x02EC}, {0x02EE, 0x02EE}, {0x0374, 0x0374},
        {0x037A, 0x037A}, {0x0559, 0x0559}, {0x0640, 0x0640},
        {0x06E5, 0x06E6}, {0x07F4, 0x07F5}, {0x07FA, 0x07FA},
        {0x081A, 0x081A}, {0x0824, 0x0824}, {0x0828, 0x0828},
        {0x08C9, 0x08C9}, {0x0971, 0x0971}, {0x0E46, 0x0E46},
        {0x0EC6, 0x0EC6}, {0x10FC, 0x10FC}, {0x17D7, 0x17D7},
        {0x1843, 0x1843}, {0x1AA7, 0x1AA7}, {0x1C78, 0x1C7D},
        {0x1D2C, 0x1D6A}, {0x1D78, 0x1D78}, {0x1D9B, 0x1DBF},
        {0x2071, 0x2071}, {0x207F, 0x207F}, {0x2090, 0x209C},
        {0x2C7C, 0x2C7D}, {0x2D6F, 0x2D6F}, {0x2E2F, 0x2E2F},
        {0x3005, 0x3005}, {0x3031, 0x3035}, {0x303B, 0x303B},
        {0x309D, 0x309E}, {0x30FC, 0x30FE}, {0xA015, 0xA015},
        {0xA4F8, 0xA4FD}, {0xA60C, 0xA60C}, {0xA67F, 0xA67F},
        {0xA69C, 0xA69D}, {0xA717, 0xA71F}, {0xA770, 0xA770},
        {0xA788, 0xA788}, {0xA7F2, 0xA7F4}, {0xA7F8, 0xA7F9},
        {0xA9CF, 0xA9CF}, {0xA9E6, 0xA9E6}, {0xAA70, 0xAA70},
        {0xAADD, 0xAADD}, {0xAAF3, 0xAAF4}, {0xAB5C, 0xAB5F},
        {0xAB69, 0xAB69}, {0xFF70, 0xFF70}, {0xFF9E, 0xFF9F},
        {0x10780, 0x10785}, {0x10787, 0x107B0}, {0x107B2, 0x107BA},
        {0x16B40, 0x16B43}, {0x16F93, 0x16F9F}, {0x16FE0, 0x16FE1},
        {0x16FE3, 0x16FE3}, {0x1AFF0, 0x1AFF3}, {0x1AFF5, 0x1AFFB},
        {0x1AFFD, 0x1AFFE}, {0x1E030, 0x1E06D}, {0x1E137, 0x1E13D},
        {0x1E4EB, 0x1E4EB}, {0x1E94B, 0x1E94B},
    };
    const unsigned span_count = sizeof(lm_spans) / sizeof(lm_spans[0]);

    for (unsigned k = 0; k < span_count; k++) {
        /* Spans ascend, so once cp is below a span's start nothing later matches. */
        if (cp < lm_spans[k][0])
            break;
        if (cp <= lm_spans[k][1])
            return 1;
    }
    return 0;
}
static inline int is_subscript_modifier(int cp) {
    /*
     * 49 Lm codepoints with <sub> decomposition, grouped into four
     * inclusive spans.  Evaluates to 1 inside any span, 0 otherwise.
     */
    return (cp >= 0x1D62 && cp <= 0x1D6A)      /* 9 */
        || (cp >= 0x2090 && cp <= 0x209C)      /* 13 */
        || (cp == 0x2C7C)                      /* 1 */
        || (cp >= 0x1E051 && cp <= 0x1E06A);   /* 26 */
}
/*
* Map sheet filename to first codepoint of its (contiguous) code range.
* Returns -1 if unknown. For non-contiguous sheets (e.g. Devanagari),
* returns the start of the first sub-range; cells beyond it won't
* collide with Lm codepoints in practice.
*/
static int sheet_start_code(const char *basename) {
    /*
     * Substring -> first codepoint.  Entries are tried top to bottom and the
     * first substring found in basename decides the result, so the order of
     * this table is part of the behaviour.
     */
    static const struct {
        const char *needle;
        int first_cp;
    } known_sheets[] = {
        {"ascii_variable",               0x00},
        {"latinExtA_variable",           0x100},
        {"latinExtB_variable",           0x180},
        {"cyrilic_extC_variable",        0x1C80},
        {"cyrilic_extB_variable",        0xA640},
        {"cyrilic_bulgarian_variable",   0xF0000},
        {"cyrilic_serbian_variable",     0xF0060},
        {"cyrilic_variable",             0x400},
        {"halfwidth_fullwidth_variable", 0xFF00},
        {"unipunct_variable",            0x2000},
        {"greek_polytonic",              0x1F00},
        {"greek_variable",               0x370},
        {"thai_variable",                0xE00},
        {"hayeren_variable",             0x530},
        {"kartuli_allcaps_variable",     0x1C90},
        {"kartuli_variable",             0x10D0},
        {"ipa_ext_variable",             0x250},
        {"latinExt_additional_variable", 0x1E00},
        {"tsalagi_variable",             0x13A0},
        {"phonetic_extensions_variable", 0x1D00},
        {"latinExtC_variable",           0x2C60},
        {"latinExtD_variable",           0xA720},
        {"internal_variable",            0xFFE00},
        {"letterlike_symbols_variable",  0x2100},
        {"enclosed_alphanumeric",        0x1F100},
        {"sundanese_variable",           0x1B80},
        {"control_pictures_variable",    0x2400},
        {"latinExtE_variable",           0xAB30},
        {"latinExtF_variable",           0x10780},
        {"latinExtG_variable",           0x1DF00},
    };
    const unsigned sheet_count = sizeof(known_sheets) / sizeof(known_sheets[0]);

    for (unsigned k = 0; k < sheet_count; k++) {
        if (strstr(basename, known_sheets[k].needle) != NULL)
            return known_sheets[k].first_cp;
    }

    /* Devanagari sheets qualify only when the name has no "internal" in it. */
    if (strstr(basename, "devanagari") != NULL && strstr(basename, "internal") == NULL)
        return 0x900;

    /* Unknown sheet. */
    return -1;
}
#endif /* UNICODE_LM_H */

BIN
demo.PNG

Binary file not shown.

Before

Width:  |  Height:  |  Size: 177 KiB

After

Width:  |  Height:  |  Size: 178 KiB

View File

@@ -114,12 +114,12 @@ How multilingual? Real multilingual!
Basic Latin
Latin-1 Supplement
Latin Extended Additional
Latin Extended-A/B/C/D
Latin Extended-A/B/C/D/E/F/G
Armenian
Arrows
Bengali􏿆ᶠⁱ􀀀
Braille Patterns
Cherokee􏿆􀀀
Cherokee􏿆􀀀
CJK Symbols and Punctuation
CJK Unified Ideographs􏿆⁶􀀀
CJK Unified Ideographs Extension A􏿆¹²·¹􀀀
@@ -161,8 +161,8 @@ How multilingual? Real multilingual!
Tamil
Thai
􏿆ᴱ􀀀 No support for Coptic
􏿆ᶠⁱ􀀀 No support for ligatures  􏿆ჼ􀀀 Mkhedruli only
􏿆⁶􀀀 􏿆⁷􀀀 􏿆⁹􀀀 􏿆¹²·¹􀀀 Up to the specified Unicode version
􏿆ᴱ􀀀 No support for Coptic    􏿆ᴬ􀀀 Uppercase only
􏿆ᶠⁱ􀀀 No support for ligatures 􏿆ჼ􀀀 Mkhedruli only
􏿆⁶􀀀 􏿆¹²·¹􀀀 Up to the specified Unicode version
GitHub's issue page is open! You can report any 􏽕errors􀀀, or leave 􏽕suggestions􀀀. You can help this font to be more versatile. (for more languages, more frameworks) 􏽕Clone􀀀 this repo, make changes, and make a 􏽕pull request􀀀! I appreciate any and all support.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
src/assets/latinExtF_variable.tga LFS Normal file

Binary file not shown.

BIN
src/assets/latinExtG_variable.tga LFS Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
work_files/latinExtG_variable.kra LFS Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.