mirror of
https://github.com/curioustorvald/Terrarum-sans-bitmap.git
synced 2026-03-14 23:16:08 +09:00
Latin Ext F and G
This commit is contained in:
@@ -2,6 +2,7 @@
|
||||
#include "tga.h"
|
||||
#include "nn.h"
|
||||
#include "safetensor.h"
|
||||
#include "unicode_lm.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
@@ -75,7 +76,8 @@ int apply_model(const char *tga_path) {
|
||||
int rows = img->height / cell_h;
|
||||
int total_cells = cols * rows;
|
||||
|
||||
int processed = 0, updated = 0, skipped = 0;
|
||||
int start_code = sheet_start_code(basename);
|
||||
int processed = 0, updated = 0, skipped = 0, fixed_lm = 0;
|
||||
|
||||
for (int index = 0; index < total_cells; index++) {
|
||||
int cell_x, cell_y;
|
||||
@@ -107,6 +109,21 @@ int apply_model(const char *tga_path) {
|
||||
int opcode = (int)((dir_pixel >> 24) & 0xFF);
|
||||
if (opcode != 0) { skipped++; continue; }
|
||||
|
||||
/* Modifier letters: fixed kern pixel, skip inference */
|
||||
if (start_code >= 0 && is_modifier_letter(start_code + index)) {
|
||||
if (is_subscript_modifier(start_code + index)) {
|
||||
/* Subscript: CDEFGHJK(B), lowheight=1 */
|
||||
tga_write_pixel(tga_path, img, tag_x, tag_y + 5, 0xFFFFFFFF);
|
||||
tga_write_pixel(tga_path, img, tag_x, tag_y + 6, 0x00C03FFF);
|
||||
} else {
|
||||
/* Superscript: ABCDEF(B), lowheight=0 */
|
||||
tga_write_pixel(tga_path, img, tag_x, tag_y + 5, 0x00000000);
|
||||
tga_write_pixel(tga_path, img, tag_x, tag_y + 6, 0x0000FCFF);
|
||||
}
|
||||
processed++; updated++; fixed_lm++;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Extract 15x20 binary input */
|
||||
float input[300];
|
||||
for (int gy = 0; gy < 20; gy++) {
|
||||
@@ -155,8 +172,8 @@ int apply_model(const char *tga_path) {
|
||||
updated++;
|
||||
}
|
||||
|
||||
printf("Processed: %d cells, Updated: %d, Skipped: %d (of %d total)\n",
|
||||
processed, updated, skipped, total_cells);
|
||||
printf("Processed: %d cells, Updated: %d, Skipped: %d, Fixed Lm: %d (of %d total)\n",
|
||||
processed, updated, skipped, fixed_lm, total_cells);
|
||||
|
||||
tga_free(img);
|
||||
network_free(net);
|
||||
|
||||
BIN
Autokem/autokem.safetensors
LFS
BIN
Autokem/autokem.safetensors
LFS
Binary file not shown.
@@ -2,6 +2,7 @@
|
||||
#include "tga.h"
|
||||
#include "nn.h"
|
||||
#include "safetensor.h"
|
||||
#include "unicode_lm.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
@@ -42,7 +43,8 @@ static void extract_shape_bits(int kerning_mask, float *shape) {
|
||||
|
||||
/* ---- Collect samples from one TGA ---- */
|
||||
|
||||
static int collect_from_sheet(const char *path, int is_xyswap, Sample *samples, int max_samples) {
|
||||
static int collect_from_sheet(const char *path, int is_xyswap, int start_code,
|
||||
Sample *samples, int max_samples) {
|
||||
TgaImage *img = tga_read(path);
|
||||
if (!img) {
|
||||
fprintf(stderr, "Warning: cannot read %s\n", path);
|
||||
@@ -76,6 +78,10 @@ static int collect_from_sheet(const char *path, int is_xyswap, Sample *samples,
|
||||
}
|
||||
if (width == 0) continue;
|
||||
|
||||
/* Skip modifier letters (superscripts/subscripts) */
|
||||
if (start_code >= 0 && is_modifier_letter(start_code + index))
|
||||
continue;
|
||||
|
||||
/* Read kerning data pixel at Y+6 */
|
||||
uint32_t kern_pixel = tagify(tga_get_pixel(img, tag_x, tag_y + 6));
|
||||
if ((kern_pixel & 0xFF) == 0) continue; /* no kern data */
|
||||
@@ -170,7 +176,9 @@ int train_model(void) {
|
||||
char fullpath[512];
|
||||
snprintf(fullpath, sizeof(fullpath), "%s/%s", assets_dir, name);
|
||||
|
||||
int got = collect_from_sheet(fullpath, is_xyswap, all_samples + total, max_total - total);
|
||||
int start_code = sheet_start_code(name);
|
||||
int got = collect_from_sheet(fullpath, is_xyswap, start_code,
|
||||
all_samples + total, max_total - total);
|
||||
if (got > 0) {
|
||||
printf(" %s: %d samples\n", name, got);
|
||||
total += got;
|
||||
|
||||
@@ -20,10 +20,26 @@ import json
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
# ---- Sheet code ranges (imported from OTFbuild/sheet_config.py) ----
|
||||
|
||||
# Directory holding sheet_config.py, resolved relative to this file.
_otfbuild = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'OTFbuild')

# Build {sheet filename: code range} from OTFbuild/sheet_config.py.
# The import is best-effort: when sheet_config cannot be imported the map
# stays empty, and callers that look a sheet up with .get() receive None.
sys.path.insert(0, _otfbuild)
try:
    from sheet_config import FILE_LIST as _FILE_LIST, CODE_RANGE as _CODE_RANGE
except ImportError:
    _CODE_RANGE_MAP = {}
else:
    # zip() stops at the shorter list, so a file without a matching
    # CODE_RANGE entry is simply left out of the map.
    _CODE_RANGE_MAP = dict(zip(_FILE_LIST, _CODE_RANGE))
finally:
    # Always undo the temporary sys.path change, including when the import
    # failed.  Remove the exact entry rather than popping index 0, in case
    # the imported module modified sys.path itself.
    try:
        sys.path.remove(_otfbuild)
    except ValueError:
        pass
|
||||
|
||||
|
||||
# ---- TGA reader (matches OTFbuild/tga_reader.py and Autokem/tga.c) ----
|
||||
|
||||
class TgaImage:
|
||||
@@ -80,7 +96,7 @@ def tagify(pixel):
|
||||
|
||||
# ---- Data collection (matches Autokem/train.c) ----
|
||||
|
||||
def collect_from_sheet(path, is_xyswap):
|
||||
def collect_from_sheet(path, is_xyswap, code_range=None):
|
||||
"""Extract labelled samples from a single TGA sheet."""
|
||||
img = read_tga(path)
|
||||
cell_w, cell_h = 16, 20
|
||||
@@ -90,6 +106,7 @@ def collect_from_sheet(path, is_xyswap):
|
||||
|
||||
inputs = []
|
||||
labels = []
|
||||
skipped_lm = 0
|
||||
|
||||
for index in range(total_cells):
|
||||
if is_xyswap:
|
||||
@@ -110,6 +127,16 @@ def collect_from_sheet(path, is_xyswap):
|
||||
if width == 0:
|
||||
continue
|
||||
|
||||
# Skip modifier letters (superscripts/subscripts)
|
||||
if code_range is not None and index < len(code_range):
|
||||
cp = code_range[index]
|
||||
try:
|
||||
if unicodedata.category(chr(cp)) == 'Lm':
|
||||
skipped_lm += 1
|
||||
continue
|
||||
except (ValueError, OverflowError):
|
||||
pass
|
||||
|
||||
# Kern data pixel at Y+6
|
||||
kern_pixel = tagify(img.get_pixel(tag_x, tag_y + 6))
|
||||
if (kern_pixel & 0xFF) == 0:
|
||||
@@ -145,7 +172,7 @@ def collect_from_sheet(path, is_xyswap):
|
||||
inputs.append(inp)
|
||||
labels.append(shape + [is_kern_ytype, is_low_height])
|
||||
|
||||
return inputs, labels
|
||||
return inputs, labels, skipped_lm
|
||||
|
||||
|
||||
def collect_all_samples(assets_dir):
|
||||
@@ -153,6 +180,7 @@ def collect_all_samples(assets_dir):
|
||||
all_inputs = []
|
||||
all_labels = []
|
||||
file_count = 0
|
||||
total_skipped_lm = 0
|
||||
|
||||
for name in sorted(os.listdir(assets_dir)):
|
||||
if not name.endswith('_variable.tga'):
|
||||
@@ -161,14 +189,20 @@ def collect_all_samples(assets_dir):
|
||||
continue
|
||||
|
||||
is_xyswap = 'xyswap' in name
|
||||
code_range = _CODE_RANGE_MAP.get(name, None)
|
||||
path = os.path.join(assets_dir, name)
|
||||
inputs, labels = collect_from_sheet(path, is_xyswap)
|
||||
inputs, labels, skipped_lm = collect_from_sheet(path, is_xyswap, code_range)
|
||||
total_skipped_lm += skipped_lm
|
||||
if inputs:
|
||||
print(f" {name}: {len(inputs)} samples")
|
||||
suffix = f" (skipped {skipped_lm} Lm)" if skipped_lm else ""
|
||||
print(f" {name}: {len(inputs)} samples{suffix}")
|
||||
all_inputs.extend(inputs)
|
||||
all_labels.extend(labels)
|
||||
file_count += 1
|
||||
|
||||
if total_skipped_lm:
|
||||
print(f" Total modifier letters filtered: {total_skipped_lm}")
|
||||
|
||||
return np.array(all_inputs), np.array(all_labels, dtype=np.float32), file_count
|
||||
|
||||
|
||||
|
||||
141
Autokem/unicode_lm.h
Normal file
141
Autokem/unicode_lm.h
Normal file
@@ -0,0 +1,141 @@
|
||||
#ifndef UNICODE_LM_H
|
||||
#define UNICODE_LM_H
|
||||
|
||||
#include <string.h>
|
||||
|
||||
/*
|
||||
* Unicode category Lm (Letter, modifier) range checks.
|
||||
* Generated from Python unicodedata (Unicode 16.0).
|
||||
*
|
||||
* is_modifier_letter(cp) — true for all Lm codepoints
|
||||
* is_subscript_modifier(cp) — true for Lm codepoints with <sub> decomposition
|
||||
*/
|
||||
|
||||
static inline int is_modifier_letter(int cp) {
    /*
     * Returns 1 when cp falls inside one of the spans below, 0 otherwise.
     *
     * Each row is an inclusive [first, last] span; a single codepoint is
     * stored with first == last.  There are 71 rows, which the generator's
     * note describes as covering all 397 Lm codepoints.
     */
    static const int lm_spans[][2] = {
        {0x02B0, 0x02C1}, {0x02C6, 0x02D1}, {0x02E0, 0x02E4},
        {0x02EC, 0x02EC}, {0x02EE, 0x02EE}, {0x0374, 0x0374},
        {0x037A, 0x037A}, {0x0559, 0x0559}, {0x0640, 0x0640},
        {0x06E5, 0x06E6}, {0x07F4, 0x07F5}, {0x07FA, 0x07FA},
        {0x081A, 0x081A}, {0x0824, 0x0824}, {0x0828, 0x0828},
        {0x08C9, 0x08C9}, {0x0971, 0x0971}, {0x0E46, 0x0E46},
        {0x0EC6, 0x0EC6}, {0x10FC, 0x10FC}, {0x17D7, 0x17D7},
        {0x1843, 0x1843}, {0x1AA7, 0x1AA7}, {0x1C78, 0x1C7D},
        {0x1D2C, 0x1D6A}, {0x1D78, 0x1D78}, {0x1D9B, 0x1DBF},
        {0x2071, 0x2071}, {0x207F, 0x207F}, {0x2090, 0x209C},
        {0x2C7C, 0x2C7D}, {0x2D6F, 0x2D6F}, {0x2E2F, 0x2E2F},
        {0x3005, 0x3005}, {0x3031, 0x3035}, {0x303B, 0x303B},
        {0x309D, 0x309E}, {0x30FC, 0x30FE}, {0xA015, 0xA015},
        {0xA4F8, 0xA4FD}, {0xA60C, 0xA60C}, {0xA67F, 0xA67F},
        {0xA69C, 0xA69D}, {0xA717, 0xA71F}, {0xA770, 0xA770},
        {0xA788, 0xA788}, {0xA7F2, 0xA7F4}, {0xA7F8, 0xA7F9},
        {0xA9CF, 0xA9CF}, {0xA9E6, 0xA9E6}, {0xAA70, 0xAA70},
        {0xAADD, 0xAADD}, {0xAAF3, 0xAAF4}, {0xAB5C, 0xAB5F},
        {0xAB69, 0xAB69}, {0xFF70, 0xFF70}, {0xFF9E, 0xFF9F},
        {0x10780, 0x10785}, {0x10787, 0x107B0}, {0x107B2, 0x107BA},
        {0x16B40, 0x16B43}, {0x16F93, 0x16F9F}, {0x16FE0, 0x16FE1},
        {0x16FE3, 0x16FE3}, {0x1AFF0, 0x1AFF3}, {0x1AFF5, 0x1AFFB},
        {0x1AFFD, 0x1AFFE}, {0x1E030, 0x1E06D}, {0x1E137, 0x1E13D},
        {0x1E4EB, 0x1E4EB}, {0x1E94B, 0x1E94B},
    };
    const int span_count = (int)(sizeof lm_spans / sizeof lm_spans[0]);

    for (int i = 0; i < span_count; i++) {
        if (cp >= lm_spans[i][0] && cp <= lm_spans[i][1])
            return 1;
    }
    return 0;
}
|
||||
|
||||
static inline int is_subscript_modifier(int cp) {
    /*
     * Returns 1 when cp lies in one of the four inclusive spans below,
     * 0 otherwise.  Span sizes: 9 + 13 + 1 + 26 = 49 codepoints, the Lm
     * codepoints this header treats as having a <sub> decomposition.
     */
    const int in_span_a = (cp >= 0x1D62 && cp <= 0x1D6A);   /*  9 */
    const int in_span_b = (cp >= 0x2090 && cp <= 0x209C);   /* 13 */
    const int in_span_c = (cp == 0x2C7C);                   /*  1 */
    const int in_span_d = (cp >= 0x1E051 && cp <= 0x1E06A); /* 26 */

    return in_span_a || in_span_b || in_span_c || in_span_d;
}
|
||||
|
||||
/*
 * Map a sheet filename to the first codepoint of its (contiguous) code range.
 *
 * basename: sheet filename (or any string containing it); matched by
 *           substring, so a directory prefix or extension does not matter.
 *           NULL is accepted and treated as unknown.
 *
 * Returns the first codepoint of the sheet, or -1 if the name is unknown.
 * For non-contiguous sheets (e.g. Devanagari) this is the start of the
 * first sub-range; cells beyond it are not expected to collide with Lm
 * codepoints in practice.
 *
 * Declared static inline, like the other helpers in this header, so that a
 * translation unit which includes the header without calling this function
 * does not get an unused-function warning.
 */
static inline int sheet_start_code(const char *basename) {
    /*
     * Ordered lookup: the first needle found in basename wins.  Order
     * matters because some needles are substrings of longer sheet names
     * (e.g. "internal_variable" is tested before the Devanagari rule).
     */
    static const struct {
        const char *needle;
        int first_cp;
    } sheets[] = {
        {"ascii_variable",               0x00},
        {"latinExtA_variable",           0x100},
        {"latinExtB_variable",           0x180},
        {"cyrilic_extC_variable",        0x1C80},
        {"cyrilic_extB_variable",        0xA640},
        {"cyrilic_bulgarian_variable",   0xF0000},
        {"cyrilic_serbian_variable",     0xF0060},
        {"cyrilic_variable",             0x400},
        {"halfwidth_fullwidth_variable", 0xFF00},
        {"unipunct_variable",            0x2000},
        {"greek_polytonic",              0x1F00},
        {"greek_variable",               0x370},
        {"thai_variable",                0xE00},
        {"hayeren_variable",             0x530},
        {"kartuli_allcaps_variable",     0x1C90},
        {"kartuli_variable",             0x10D0},
        {"ipa_ext_variable",             0x250},
        {"latinExt_additional_variable", 0x1E00},
        {"tsalagi_variable",             0x13A0},
        {"phonetic_extensions_variable", 0x1D00},
        {"latinExtC_variable",           0x2C60},
        {"latinExtD_variable",           0xA720},
        {"internal_variable",            0xFFE00},
        {"letterlike_symbols_variable",  0x2100},
        {"enclosed_alphanumeric",        0x1F100},
        {"sundanese_variable",           0x1B80},
        {"control_pictures_variable",    0x2400},
        {"latinExtE_variable",           0xAB30},
        {"latinExtF_variable",           0x10780},
        {"latinExtG_variable",           0x1DF00},
    };

    /* strstr() on a NULL haystack is undefined behaviour; report unknown. */
    if (basename == NULL)
        return -1;

    for (size_t i = 0; i < sizeof sheets / sizeof sheets[0]; i++) {
        if (strstr(basename, sheets[i].needle))
            return sheets[i].first_cp;
    }

    /* Devanagari sheets, excluding any whose name contains "internal". */
    if (strstr(basename, "devanagari") && !strstr(basename, "internal"))
        return 0x900;

    return -1;
}
|
||||
|
||||
#endif /* UNICODE_LM_H */
|
||||
BIN
demo.PNG
BIN
demo.PNG
Binary file not shown.
|
Before Width: | Height: | Size: 177 KiB After Width: | Height: | Size: 177 KiB |
@@ -114,7 +114,7 @@ How multilingual? Real multilingual!
|
||||
⁃ Basic Latin
|
||||
⁃ Latin-1 Supplement
|
||||
⁃ Latin Extended Additional
|
||||
⁃ Latin Extended-A/B/C/D
|
||||
⁃ Latin Extended-A/B/C/D/E/F/G
|
||||
⁃ Armenian
|
||||
⁃ Arrows
|
||||
⁃ Bengaliᶠⁱ
|
||||
|
||||
BIN
src/assets/ipa_ext_variable.tga
LFS
BIN
src/assets/ipa_ext_variable.tga
LFS
Binary file not shown.
Binary file not shown.
BIN
src/assets/latinExtF_variable.tga
LFS
Normal file
BIN
src/assets/latinExtF_variable.tga
LFS
Normal file
Binary file not shown.
BIN
src/assets/latinExtG_variable.tga
LFS
Normal file
BIN
src/assets/latinExtG_variable.tga
LFS
Normal file
Binary file not shown.
Binary file not shown.
BIN
work_files/ipa_ext_variable.psd
LFS
BIN
work_files/ipa_ext_variable.psd
LFS
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
work_files/latinExtG_variable.kra
LFS
Normal file
BIN
work_files/latinExtG_variable.kra
LFS
Normal file
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user