Files
tsvm/video_encoder/encoder_tav_text.c
2025-11-21 16:53:35 +09:00

663 lines
21 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
encoder_tav_text.c
Text-based video encoder for TSVM using custom font ROMs
Outputs Videotex files with custom header and packet type 0x3F (text mode)
File structure:
- Videotex header (32 bytes): magic "\x1FTSVMTAV" (as written by write_videotex_header), version, grid dims (uint16 each), fps, total_frames (uint32), then TAV-style codec/flag fields
- Extended header packet (0xEF): BGNT, ENDT, CDAT, VNDR, FMPG
- Font ROM packets (0x30): lowrom and highrom (1920 bytes each)
- Per-frame sequence: [audio 0x20], [timecode 0xFD], [videotex 0x3F], [sync 0xFF]
Videotex packet structure (0x3F): Zstd([rows][cols][fg-array][bg-array][char-array])
- rows: uint8 (32)
- cols: uint8 (80)
- fg-array: rows*cols bytes (foreground colors, 0xF0=black, 0xFE=white)
- bg-array: rows*cols bytes (background colors, 0xF0=black, 0xFE=white)
- char-array: rows*cols bytes (glyph indices 0-255)
Total uncompressed size: 2 + (80*32*3) = 7682 bytes
Separated arrays compress much better (fg/bg are just 0xF0/0xFE runs)
Video size: 80×32 characters (560×448 pixels with 7×14 font)
Audio: MP2 encoding at 96 kbps, 32 KHz stereo (packet 0x20)
Each text frame is treated as an I-frame with sync packet
Usage:
gcc -O3 -std=c11 -Wall encoder_tav_text.c -o encoder_tav_text -lm -lzstd
./encoder_tav_text -i video.mp4 -f font.chr -o output.vtx
*/
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#include <zstd.h>
#include <unistd.h>
#include <time.h>
#include <sys/time.h>
// Encoder identification string; stored as the VNDR entry of the extended header
#define ENCODER_VENDOR_STRING "Encoder-TAV-Text 20251121 (videotex)"
// Glyph cell size in pixels
#define CHAR_W 7
#define CHAR_H 14
// Text grid size in character cells
#define GRID_W 80
#define GRID_H 32
// Size in pixels of the grayscale frame requested from FFmpeg
#define PIXEL_W (GRID_W * CHAR_W) // 560
#define PIXEL_H (GRID_H * CHAR_H) // 448
// Number of pixels in one glyph cell (one byte per pixel once a glyph is unpacked)
#define PATCH_SZ (CHAR_W * CHAR_H)
// Audio sample rate in Hz, passed to FFmpeg via -ar
#define SAMPLE_RATE 32000
// TAV packet types (the first byte of every packet written to the output)
#define PACKET_TIMECODE 0xFD
#define PACKET_SYNC 0xFF
#define PACKET_AUDIO_MP2 0x20
#define PACKET_SSF 0x30
#define PACKET_TEXT 0x3F
#define PACKET_EXTENDED_HDR 0xEF
// SSF opcodes for font ROM (selects which ROM half a font packet carries)
#define SSF_OPCODE_LOWROM 0x80
#define SSF_OPCODE_HIGHROM 0x81
// Font ROM size constants
// FONTROM_PADDED_SIZE: bytes of font data carried by every font ROM packet (shorter data is zero-padded)
// GLYPHS_PER_ROM: glyphs in one ROM half; main() splits the font file at GLYPHS_PER_ROM * 14 bytes
#define FONTROM_PADDED_SIZE 1920
#define GLYPHS_PER_ROM 128
// Color byte values written to the fg/bg arrays of a text packet
// (per the file header comment: 0xF0 = black, 0xFE = white)
#define COLOR_BLACK 0xF0
#define COLOR_WHITE 0xFE
// In-memory font used for glyph matching.
// Filled by load_font_rom(), which expands each packed glyph row into one byte per pixel.
typedef struct {
uint8_t *data; // Unpacked glyph pixels: PATCH_SZ bytes per glyph, each byte 0 or 1, rows stored top to bottom
int count; // Number of glyphs held in data
} FontROM;
/*
 * Get the FFmpeg version banner.
 *
 * Runs `ffmpeg -version` through the shell and returns the first line of its
 * output with the line terminator removed.
 *
 * Returns a heap-allocated string that the caller must free(), or NULL when
 * the pipe cannot be opened, allocation fails, nothing is printed, or the
 * first line is not an FFmpeg version banner.
 */
char *get_ffmpeg_version(void) {
    static const char banner_prefix[] = "ffmpeg version";
    enum { VERSION_BUF_SIZE = 256 };

    FILE *pipe = popen("ffmpeg -version 2>&1 | head -1", "r");
    if (!pipe) {
        return NULL;
    }

    char *version = malloc(VERSION_BUF_SIZE);
    if (version && !fgets(version, VERSION_BUF_SIZE, pipe)) {
        // Nothing was printed: treat as "version unknown"
        free(version);
        version = NULL;
    }
    pclose(pipe);
    if (!version) {
        return NULL;
    }

    // Cut the string at the first line terminator (fgets keeps the newline)
    version[strcspn(version, "\r\n")] = '\0';

    // stderr is merged into the pipe, so when ffmpeg is not installed the
    // first line is the shell's error message rather than a version banner.
    // Reject anything that does not look like the banner so that such text
    // never ends up in the file's FMPG metadata.
    if (strncmp(version, banner_prefix, sizeof(banner_prefix) - 1) != 0) {
        free(version);
        return NULL;
    }
    return version;
}
/*
 * Detect the frame rate of the first video stream using ffprobe.
 *
 * video_path: path of the input video; it is placed between double quotes in
 *             a shell command line.
 *             NOTE(review): the path is not shell-escaped, so a path
 *             containing `"`, `$` or backticks will break (or alter) the
 *             command. Callers should only pass trusted paths.
 *
 * Returns the rate as numerator/denominator of ffprobe's r_frame_rate, or
 * 30.0 when ffprobe cannot be run, prints nothing parseable, or reports a
 * non-positive rate.
 */
float detect_fps(const char *video_path) {
    const float fallback_fps = 30.0f;

    char cmd[1024];
    int written = snprintf(cmd, sizeof(cmd),
        "ffprobe -v error -select_streams v:0 -show_entries stream=r_frame_rate "
        "-of default=noprint_wrappers=1:nokey=1 \"%s\" 2>/dev/null",
        video_path);
    if (written < 0 || (size_t)written >= sizeof(cmd)) {
        // A truncated command line would probe the wrong file (or fail oddly)
        return fallback_fps;
    }

    FILE *pipe = popen(cmd, "r");
    if (!pipe) {
        return fallback_fps;
    }

    float fps = fallback_fps;
    char fps_str[64] = {0};
    if (fgets(fps_str, sizeof(fps_str), pipe)) {
        // Parse a fraction such as "30/1" or "24000/1001".
        // Both parts must be positive: a rate of 0 (e.g. "0/1") would later
        // cause a division by zero when frame timecodes are computed.
        int num = 0, den = 1;
        if (sscanf(fps_str, "%d/%d", &num, &den) == 2 && num > 0 && den > 0) {
            fps = (float)num / (float)den;
        }
    }
    pclose(pipe);
    return fps;
}
// Load font ROM (14 bytes per glyph, no header)
FontROM *load_font_rom(const char *path) {
FILE *f = fopen(path, "rb");
if (!f) return NULL;
fseek(f, 0, SEEK_END);
long size = ftell(f);
fseek(f, 0, SEEK_SET);
if (size % 14 != 0) {
fprintf(stderr, "Warning: ROM size not divisible by 14 (got %ld bytes)\n", size);
}
int glyph_count = size / 14;
FontROM *rom = malloc(sizeof(FontROM));
rom->count = glyph_count;
rom->data = malloc(glyph_count * PATCH_SZ);
// Read and unpack glyphs
for (int g = 0; g < glyph_count; g++) {
uint8_t row_bytes[14];
if (fread(row_bytes, 14, 1, f) != 1) {
free(rom->data);
free(rom);
fclose(f);
return NULL;
}
// Unpack bits to binary pixels
for (int row = 0; row < CHAR_H; row++) {
for (int col = 0; col < CHAR_W; col++) {
// Bit 6 = leftmost, bit 0 = rightmost
int bit = (row_bytes[row] >> (6 - col)) & 1;
rom->data[g * PATCH_SZ + row * CHAR_W + col] = bit;
}
}
}
fclose(f);
fprintf(stderr, "Loaded font ROM: %d glyphs\n", glyph_count);
return rom;
}
/*
 * Pick the glyph that best reproduces a grayscale patch.
 *
 * patch:  PATCH_SZ grayscale pixels (0-255) of one character cell.
 * rom:    unpacked font; each glyph is PATCH_SZ bytes of 0/1.
 * out_bg: receives the background color of the winning rendition.
 * out_fg: receives the foreground color of the winning rendition.
 *
 * Every glyph is scored twice by sum of squared differences: once rendered
 * white-on-black ("normal") and once black-on-white ("inverted"). The lowest
 * score wins; on ties the earliest candidate is kept, with "normal" tried
 * before "inverted" for the same glyph.
 *
 * Returns the index of the winning glyph (0 with white-on-black colors when
 * the ROM holds no glyphs).
 */
int find_best_glyph(const uint8_t *patch, const FontROM *rom, uint8_t *out_bg, uint8_t *out_fg) {
    int winner = 0;
    int winner_is_inverted = 0;
    float lowest = INFINITY;

    for (int g = 0; g < rom->count; g++) {
        const uint8_t *bitmap = rom->data + g * PATCH_SZ;

        // Score both renditions in a single pass over the cell
        float sse_normal = 0;
        float sse_inverted = 0;
        for (int p = 0; p < PATCH_SZ; p++) {
            int from_white = patch[p] - 255;
            int from_black = patch[p];
            int sq_white = from_white * from_white;
            int sq_black = from_black * from_black;

            if (bitmap[p]) {
                // Set pixel: white when normal, black when inverted
                sse_normal += sq_white;
                sse_inverted += sq_black;
            } else {
                // Clear pixel: black when normal, white when inverted
                sse_normal += sq_black;
                sse_inverted += sq_white;
            }
        }

        if (sse_normal < lowest) {
            lowest = sse_normal;
            winner = g;
            winner_is_inverted = 0;
        }
        if (sse_inverted < lowest) {
            lowest = sse_inverted;
            winner = g;
            winner_is_inverted = 1;
        }
    }

    if (winner_is_inverted) {
        *out_bg = COLOR_WHITE;
        *out_fg = COLOR_BLACK;
    } else {
        *out_bg = COLOR_BLACK;
        *out_fg = COLOR_WHITE;
    }
    return winner;
}
/*
 * Convert one grayscale frame into a grid of text cells.
 *
 * pixels: PIXEL_W x PIXEL_H grayscale frame, row-major, one byte per pixel.
 * rom:    font used for matching.
 * bg_col, fg_col, chars: output arrays of GRID_W * GRID_H entries each,
 *         indexed row-major by cell, receiving the background color,
 *         foreground color and glyph index of every cell.
 */
void frame_to_text(const uint8_t *pixels, const FontROM *rom,
                   uint8_t *bg_col, uint8_t *fg_col, uint8_t *chars) {
    uint8_t cell[PATCH_SZ];

    for (int cell_idx = 0; cell_idx < GRID_W * GRID_H; cell_idx++) {
        const int grid_row = cell_idx / GRID_W;
        const int grid_col = cell_idx % GRID_W;

        // Top-left pixel of this cell inside the frame
        const uint8_t *corner = pixels + (grid_row * CHAR_H) * PIXEL_W + grid_col * CHAR_W;

        // Gather the cell one scanline at a time
        for (int line = 0; line < CHAR_H; line++) {
            memcpy(cell + line * CHAR_W, corner + line * PIXEL_W, CHAR_W);
        }

        // Match against the font; colors are written through the out-pointers
        chars[cell_idx] = find_best_glyph(cell, rom, bg_col + cell_idx, fg_col + cell_idx);
    }
}
/*
 * Current wall-clock time in nanoseconds since the UNIX epoch.
 * The clock is read with gettimeofday(), so the value only has microsecond
 * granularity (it is always a multiple of 1000).
 */
uint64_t get_current_time_ns(void) {
    static const uint64_t ns_per_second = 1000000000ULL;
    static const uint64_t ns_per_microsecond = 1000ULL;

    struct timeval wall_clock;
    gettimeofday(&wall_clock, NULL);

    uint64_t whole_seconds_ns = (uint64_t)wall_clock.tv_sec * ns_per_second;
    uint64_t fraction_ns = (uint64_t)wall_clock.tv_usec * ns_per_microsecond;
    return whole_seconds_ns + fraction_ns;
}
/*
 * Write the 32-byte file header.
 *
 * f:            output stream, positioned where the header should start.
 * fps:          frame rate stored in the header.
 * total_frames: frame count stored in the header (callers may write 0 and
 *               patch the field later).
 *
 * Byte layout (offsets relative to the start of the header):
 *    0-7   magic "\x1FTSVMTAV"
 *    8     version (1)
 *    9-10  grid width in columns, uint16 little-endian
 *   11-12  grid height in rows, uint16 little-endian
 *   13     fps, uint8
 *   14-17  total_frames, uint32 little-endian
 *   18-22  wavelet filter, decomposition levels, quantisers Y/Co/Cg (all 0)
 *   23     feature flags
 *   24     video flags
 *   25     encoder quality level
 *   26     channel layout
 *   27     entropy coder
 *   28-29  reserved
 *   30     device orientation
 *   31     file role
 */
void write_videotex_header(FILE *f, uint8_t fps, uint32_t total_frames) {
    fwrite("\x1FTSVMTAV", 8, 1, f);

    // Version: 1 (uint8)
    fputc(1, f);

    // Grid dimensions (uint16 little-endian each)
    fputc(GRID_W & 0xFF, f);        // cols = 80
    fputc((GRID_W >> 8) & 0xFF, f);
    fputc(GRID_H & 0xFF, f);        // rows = 32
    fputc((GRID_H >> 8) & 0xFF, f);

    // FPS (uint8)
    fputc(fps, f);

    // Total frames (uint32, little-endian). Written byte by byte so the file
    // layout does not depend on the byte order of the host.
    fputc((int)(total_frames & 0xFF), f);
    fputc((int)((total_frames >> 8) & 0xFF), f);
    fputc((int)((total_frames >> 16) & 0xFF), f);
    fputc((int)((total_frames >> 24) & 0xFF), f);

    fputc(0, f); // wavelet filter type
    fputc(0, f); // decomposition levels
    fputc(0, f); // quantiser Y
    fputc(0, f); // quantiser Co
    fputc(0, f); // quantiser Cg

    // Feature Flags
    fputc(0x03, f); // bit 0 = has audio; bit 1 = has subtitle (Videotex is classified as subtitles)
    // Video Flags
    fputc(0x80, f); // bit 7 = has no video (Videotex is classified as subtitles)

    fputc(0, f);    // encoder quality level
    fputc(0x02, f); // channel layout: Y only
    fputc(0, f);    // entropy coder
    fputc(0, f);    // reserved
    fputc(0, f);    // reserved
    fputc(0, f);    // device orientation: no rotation
    fputc(0, f);    // file role: generic
}
// Emit one extended-header entry holding a 64-bit unsigned value:
// [4-byte key][type 0x04][uint64 value]
static void exthdr_put_u64(FILE *f, const char *key, uint64_t value) {
    const uint8_t type_tag = 0x04; /* Uint64 */
    fwrite(key, 1, 4, f);
    fwrite(&type_tag, 1, 1, f);
    fwrite(&value, sizeof(uint64_t), 1, f);
}

// Emit one extended-header entry holding a byte string:
// [4-byte key][type 0x10][uint16 length][bytes]
static void exthdr_put_bytes(FILE *f, const char *key, const char *bytes, size_t len) {
    const uint8_t type_tag = 0x10; /* Bytes */
    const uint16_t len16 = len;
    fwrite(key, 1, 4, f);
    fwrite(&type_tag, 1, 1, f);
    fwrite(&len16, sizeof(uint16_t), 1, f);
    fwrite(bytes, 1, len, f);
}

/*
 * Write the extended header packet (metadata key-value pairs).
 *
 * f:                output stream.
 * creation_time_ns: value stored under CDAT.
 * ffmpeg_version:   string stored under FMPG; pass NULL to omit the entry.
 *
 * Entries are written in the order BGNT, ENDT, CDAT, VNDR and, optionally,
 * FMPG. ENDT is written as 0 and is meant to be overwritten later.
 *
 * Returns the file offset of the 8-byte ENDT value, for that later update.
 */
long write_extended_header(FILE *f, uint64_t creation_time_ns, const char *ffmpeg_version) {
    const int has_ffmpeg_entry = (ffmpeg_version != NULL);

    fputc(PACKET_EXTENDED_HDR, f);

    // Entry count: four mandatory entries plus the optional FMPG
    uint16_t entry_count = 4;
    if (has_ffmpeg_entry) {
        entry_count = 5;
    }
    fwrite(&entry_count, sizeof(uint16_t), 1, f);

    // BGNT: video begin time (0 for frame 0)
    exthdr_put_u64(f, "BGNT", 0ULL);

    // ENDT: video end time; remember where the entry starts so the caller
    // can patch the value once the real duration is known
    const long endt_entry_start = ftell(f);
    exthdr_put_u64(f, "ENDT", 0ULL);

    // CDAT: creation time in nanoseconds since the UNIX epoch
    exthdr_put_u64(f, "CDAT", creation_time_ns);

    // VNDR: encoder name and version
    const char *vendor = ENCODER_VENDOR_STRING;
    exthdr_put_bytes(f, "VNDR", vendor, strlen(vendor));

    // FMPG: FFmpeg version (if available)
    if (has_ffmpeg_entry) {
        exthdr_put_bytes(f, "FMPG", ffmpeg_version, strlen(ffmpeg_version));
    }

    // The value follows the 4-byte key and the 1-byte type tag
    return endt_entry_start + 4 + 1;
}
/*
 * Write one font ROM packet (SSF packet, type 0x30).
 *
 * f:         output stream.
 * rom_data:  packed font bytes for this ROM half.
 * data_size: number of bytes available at rom_data. At most
 *            FONTROM_PADDED_SIZE bytes are used; shorter data is padded with
 *            zeros up to FONTROM_PADDED_SIZE.
 * opcode:    SSF opcode (0x80 = lowrom, 0x81 = highrom).
 *
 * Packet structure:
 * [type:0x30][size:uint32][index:uint24][opcode:uint8][length:uint16][data][terminator:0x00]
 * Multi-byte fields are written little-endian.
 */
void write_fontrom_packet(FILE *f, const uint8_t *rom_data, size_t data_size, uint8_t opcode) {
    // Zero-filled payload buffer; no heap allocation needed for 1920 bytes
    uint8_t padded_data[FONTROM_PADDED_SIZE] = {0};

    // Never copy more than the payload can hold
    size_t copy_size = data_size;
    if (copy_size > FONTROM_PADDED_SIZE) {
        fprintf(stderr, "Warning: font ROM data truncated from %zu to %d bytes\n",
                data_size, FONTROM_PADDED_SIZE);
        copy_size = FONTROM_PADDED_SIZE;
    }
    if (rom_data && copy_size > 0) {
        memcpy(padded_data, rom_data, copy_size);
    }

    // Payload size: index (3) + opcode (1) + length (2) + data + terminator (1)
    const uint32_t packet_size = 3 + 1 + 2 + FONTROM_PADDED_SIZE + 1;

    // Write packet type and size (uint32, little-endian)
    fputc(PACKET_SSF, f);
    fputc((int)(packet_size & 0xFF), f);
    fputc((int)((packet_size >> 8) & 0xFF), f);
    fputc((int)((packet_size >> 16) & 0xFF), f);
    fputc((int)((packet_size >> 24) & 0xFF), f);

    // Write SSF payload
    // Index (3 bytes, always 0 for font ROM)
    fputc(0, f);
    fputc(0, f);
    fputc(0, f);

    // Opcode (0x80=lowrom, 0x81=highrom)
    fputc(opcode, f);

    // Payload length (uint16, little-endian)
    const uint16_t payload_len = FONTROM_PADDED_SIZE;
    fputc(payload_len & 0xFF, f);
    fputc((payload_len >> 8) & 0xFF, f);

    // Font data (padded to 1920 bytes)
    fwrite(padded_data, 1, FONTROM_PADDED_SIZE, f);

    // Terminator
    fputc(0x00, f);

    fprintf(stderr, "Font ROM uploaded: %zu bytes (padded to %d), opcode 0x%02X\n",
            copy_size, FONTROM_PADDED_SIZE, opcode);
}
/*
 * Write a timecode packet: the packet type byte followed by the 64-bit
 * timecode (nanoseconds) in the host's in-memory representation.
 */
void write_timecode(FILE *f, uint64_t timecode_ns) {
    const uint8_t packet_type = PACKET_TIMECODE;

    fwrite(&packet_type, sizeof packet_type, 1, f);
    fwrite(&timecode_ns, 1, sizeof timecode_ns, f);
}
/*
 * Write a sync packet, which consists of the packet type byte only.
 */
void write_sync(FILE *f) {
    const uint8_t packet_type = PACKET_SYNC;

    fwrite(&packet_type, sizeof packet_type, 1, f);
}
/*
 * Write an MP2 audio packet: [type:0x20][size:uint32][data].
 *
 * f:    output stream.
 * data: MP2 stream bytes.
 * size: number of bytes at data; also written as the packet's size field
 *       (in the host's in-memory representation).
 */
void write_audio_mp2(FILE *f, const uint8_t *data, uint32_t size) {
    const uint8_t packet_type = PACKET_AUDIO_MP2;

    fwrite(&packet_type, sizeof packet_type, 1, f);
    fwrite(&size, 1, sizeof size, f);
    fwrite(data, size, 1, f);
}
/*
 * Write a text packet: [type:0x3F][size:uint32][Zstd-compressed payload].
 *
 * Uncompressed payload: [rows][cols][fg-array][bg-array][char-array], where
 * rows and cols are single bytes and each array holds rows * cols bytes.
 * Keeping the three arrays separate compresses much better than interleaving
 * them (fg/bg are just 0xF0/0xFE runs).
 *
 * f:       output stream.
 * bg_col:  background color per cell, rows * cols bytes.
 * fg_col:  foreground color per cell, rows * cols bytes.
 * chars:   glyph index per cell, rows * cols bytes.
 * rows, cols: grid size; each must fit in one byte (0-255).
 *
 * Terminates the process with exit(1) on an invalid grid size, allocation
 * failure, or compression failure.
 */
void write_text_packet(FILE *f, const uint8_t *bg_col, const uint8_t *fg_col,
                       const uint8_t *chars, int rows, int cols) {
    // The payload stores each dimension in a single byte
    if (rows < 0 || rows > 255 || cols < 0 || cols > 255) {
        fprintf(stderr, "Invalid text grid size: %d rows x %d cols\n", rows, cols);
        exit(1);
    }

    const size_t grid_size = (size_t)rows * (size_t)cols;
    const size_t uncompressed_size = 2 + grid_size * 3;
    const size_t max_compressed = ZSTD_compressBound(uncompressed_size);

    uint8_t *uncompressed = malloc(uncompressed_size);
    uint8_t *compressed = malloc(max_compressed);
    if (!uncompressed || !compressed) {
        fprintf(stderr, "Out of memory while building text packet\n");
        exit(1);
    }

    uncompressed[0] = (uint8_t)rows;
    uncompressed[1] = (uint8_t)cols;

    // Copy arrays in order: foreground, background, characters
    memcpy(&uncompressed[2], fg_col, grid_size);
    memcpy(&uncompressed[2 + grid_size], bg_col, grid_size);
    memcpy(&uncompressed[2 + grid_size * 2], chars, grid_size);

    // Compress with Zstd (level 3)
    size_t compressed_size = ZSTD_compress(compressed, max_compressed,
                                           uncompressed, uncompressed_size, 3);
    if (ZSTD_isError(compressed_size)) {
        fprintf(stderr, "Zstd compression error\n");
        exit(1);
    }

    // Write packet: [type][size][data]
    fputc(PACKET_TEXT, f);
    uint32_t size32 = (uint32_t)compressed_size;
    fwrite(&size32, 4, 1, f);
    fwrite(compressed, compressed_size, 1, f);

    free(compressed);
    free(uncompressed);
}
/*
 * Encoder entry point.
 *
 * Usage: encoder -i <video> -f <font.chr> -o <output>
 *
 * Steps:
 *   1. Parse arguments, query FFmpeg version and input frame rate.
 *   2. Load the font ROM for glyph matching.
 *   3. Start two FFmpeg processes: one producing raw grayscale frames of
 *      PIXEL_W x PIXEL_H, one producing an MP2 audio stream.
 *   4. Write the file header, the extended header and the font ROM packets.
 *   5. For every video frame write: [audio], timecode, text packet, sync.
 *   6. Seek back and patch total_frames and ENDT with their final values.
 *
 * Returns 0 on success and 1 on any failure.
 */
int main(int argc, char **argv) {
    // Offset of the uint32 total_frames field inside the file header:
    // 8 (magic) + 1 (version) + 2 (width) + 2 (height) + 1 (fps)
    enum { HEADER_TOTAL_FRAMES_OFFSET = 14 };

    if (argc < 7) {
        fprintf(stderr, "Usage: %s -i <video> -f <font.chr> -o <output.tav>\n", argv[0]);
        return 1;
    }

    const char *input_video = NULL;
    const char *font_path = NULL;
    const char *output_path = NULL;
    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "-i") == 0 && i + 1 < argc) {
            input_video = argv[++i];
        } else if (strcmp(argv[i], "-f") == 0 && i + 1 < argc) {
            font_path = argv[++i];
        } else if (strcmp(argv[i], "-o") == 0 && i + 1 < argc) {
            output_path = argv[++i];
        }
    }
    if (!input_video || !font_path || !output_path) {
        fprintf(stderr, "Missing required arguments\n");
        return 1;
    }

    // Every resource released at `cleanup` is declared up front so that the
    // cleanup code is valid no matter where a failure occurs.
    int exit_code = 1;
    char *ffmpeg_version = NULL;
    FontROM *rom = NULL;
    FILE *video_pipe = NULL;
    FILE *audio_pipe = NULL;
    FILE *out = NULL;
    uint8_t *gray_pixels = NULL;
    uint8_t *bg_col = NULL;
    uint8_t *fg_col = NULL;
    uint8_t *chars = NULL;
    uint8_t *audio_buffer = NULL;
    char ffmpeg_cmd[1024];
    char audio_cmd[1024];
    int cmd_len;

    // Capture creation time and FFmpeg version for extended header
    uint64_t creation_time_ns = get_current_time_ns();
    ffmpeg_version = get_ffmpeg_version();

    // Detect video FPS
    float fps_float = detect_fps(input_video);
    if (!(fps_float > 0.0f)) {
        // Timecodes divide by the frame rate, so it must be positive
        fprintf(stderr, "Warning: invalid FPS detected, falling back to 30\n");
        fps_float = 30.0f;
    }
    // Round to nearest integer; clamp first because converting an
    // out-of-range float to uint8_t is undefined behavior
    uint8_t fps = (fps_float >= 255.0f) ? 255 : (uint8_t)(fps_float + 0.5f);
    fprintf(stderr, "Detected FPS: %.2f (using %d in TAV header)\n", fps_float, fps);

    // Load font ROM
    rom = load_font_rom(font_path);
    if (!rom) {
        fprintf(stderr, "Failed to load font ROM: %s\n", font_path);
        goto cleanup;
    }

    // Open FFmpeg pipe for grayscale frames at 560×448
    // NOTE(review): input_video is placed in the shell command unescaped
    cmd_len = snprintf(ffmpeg_cmd, sizeof(ffmpeg_cmd),
        "ffmpeg -i \"%s\" -vf \"scale=%d:%d:force_original_aspect_ratio=increase,crop=%d:%d\" "
        "-f rawvideo -pix_fmt gray - 2>/dev/null",
        input_video, PIXEL_W, PIXEL_H, PIXEL_W, PIXEL_H);
    if (cmd_len < 0 || (size_t)cmd_len >= sizeof(ffmpeg_cmd)) {
        fprintf(stderr, "Input path too long\n");
        goto cleanup;
    }
    fprintf(stderr, "Opening video stream...\n");
    video_pipe = popen(ffmpeg_cmd, "r");
    if (!video_pipe) {
        fprintf(stderr, "Failed to open FFmpeg pipe\n");
        goto cleanup;
    }

    // Open FFmpeg pipe for MP2 audio (32 KHz stereo)
    cmd_len = snprintf(audio_cmd, sizeof(audio_cmd),
        "ffmpeg -i \"%s\" -vn -ar %d -ac 2 -f mp2 -b:a 96k - 2>/dev/null",
        input_video, SAMPLE_RATE);
    if (cmd_len < 0 || (size_t)cmd_len >= sizeof(audio_cmd)) {
        fprintf(stderr, "Input path too long\n");
        goto cleanup;
    }
    fprintf(stderr, "Opening audio stream...\n");
    audio_pipe = popen(audio_cmd, "r");
    if (!audio_pipe) {
        fprintf(stderr, "Failed to open audio FFmpeg pipe\n");
        goto cleanup;
    }

    // Open output file
    out = fopen(output_path, "wb");
    if (!out) {
        fprintf(stderr, "Failed to open output file\n");
        goto cleanup;
    }

    // Write Videotex header with placeholder total_frames (will update at end)
    long header_offset = ftell(out);
    write_videotex_header(out, fps, 0);

    // Write extended header packet (before first timecode)
    long endt_offset = write_extended_header(out, creation_time_ns, ffmpeg_version);

    // Upload font ROM to TSVM (split into lowrom and highrom).
    // The packets carry the packed file bytes, so the file is read again raw.
    fprintf(stderr, "Uploading font ROM to TSVM...\n");
    FILE *rom_file = fopen(font_path, "rb");
    if (rom_file) {
        fseek(rom_file, 0, SEEK_END);
        long rom_file_size = ftell(rom_file);
        fseek(rom_file, 0, SEEK_SET);

        if (rom_file_size > 0) {
            size_t rom_size = (size_t)rom_file_size;
            uint8_t *raw_rom = malloc(rom_size);
            if (raw_rom && fread(raw_rom, 1, rom_size, rom_file) == rom_size) {
                // Split into lowrom and highrom
                size_t bytes_per_half = (GLYPHS_PER_ROM * 14); // 128 glyphs × 14 bytes = 1792

                // Write lowrom (first 128 glyphs)
                if (rom_size >= bytes_per_half) {
                    write_fontrom_packet(out, raw_rom, bytes_per_half, SSF_OPCODE_LOWROM);
                }

                // Write highrom (second 128 glyphs)
                if (rom_size >= bytes_per_half * 2) {
                    write_fontrom_packet(out, raw_rom + bytes_per_half, bytes_per_half,
                                         SSF_OPCODE_HIGHROM);
                } else if (rom_size > bytes_per_half) {
                    // Partial highrom
                    write_fontrom_packet(out, raw_rom + bytes_per_half,
                                         rom_size - bytes_per_half, SSF_OPCODE_HIGHROM);
                }
            }
            // Released on every path, including a failed or short read
            free(raw_rom);
        }
        fclose(rom_file);
    }

    // Allocate buffers
    const size_t frame_size = (size_t)PIXEL_W * PIXEL_H;
    const size_t grid_cells = (size_t)GRID_W * GRID_H;
    // Audio buffer: up to AUDIO_CHUNK_SIZE bytes of the MP2 stream are read
    // per video frame. NOTE(review): chunks are raw byte ranges of the stream
    // and are not aligned to MP2 frame boundaries or to the video frame rate.
    #define AUDIO_CHUNK_SIZE 8192 // Arbitrary MP2 frame buffer size
    gray_pixels = malloc(frame_size);
    bg_col = malloc(grid_cells);
    fg_col = malloc(grid_cells);
    chars = malloc(grid_cells);
    audio_buffer = malloc(AUDIO_CHUNK_SIZE);
    if (!gray_pixels || !bg_col || !fg_col || !chars || !audio_buffer) {
        fprintf(stderr, "Out of memory\n");
        goto cleanup;
    }

    size_t audio_available = 0;
    uint32_t frame_num = 0;
    uint64_t total_audio_bytes = 0;

    fprintf(stderr, "Encoding text-mode video (%dx%d chars, %dx%d pixels)...\n",
            GRID_W, GRID_H, PIXEL_W, PIXEL_H);

    // Track encoding start time
    struct timeval start_time, now;
    gettimeofday(&start_time, NULL);

    // Read and process frames; a short read marks the end of the video
    while (fread(gray_pixels, 1, frame_size, video_pipe) == frame_size) {
        // Calculate timecode in nanoseconds
        uint64_t timecode_ns = (uint64_t)(frame_num * 1000000000.0 / fps_float);

        // Write audio packet first (if available)
        audio_available = fread(audio_buffer, 1, AUDIO_CHUNK_SIZE, audio_pipe);
        if (audio_available > 0) {
            write_audio_mp2(out, audio_buffer, (uint32_t)audio_available);
            total_audio_bytes += audio_available;
        }

        // Write timecode
        write_timecode(out, timecode_ns);

        // Convert to text mode
        frame_to_text(gray_pixels, rom, bg_col, fg_col, chars);

        // Write text packet (treated as I-frame)
        write_text_packet(out, bg_col, fg_col, chars, GRID_H, GRID_W);

        // Write sync packet after each frame
        write_sync(out);

        frame_num++;
        if (frame_num % 30 == 0) {
            // Calculate encoding speed
            gettimeofday(&now, NULL);
            double elapsed = (now.tv_sec - start_time.tv_sec) +
                             (now.tv_usec - start_time.tv_usec) / 1000000.0;
            double encoding_fps = frame_num / elapsed;
            fprintf(stderr, "\rEncoded %u frames (%.1f fps)", frame_num, encoding_fps);
            fflush(stderr);
        }
    }

    // Read any remaining audio
    while ((audio_available = fread(audio_buffer, 1, AUDIO_CHUNK_SIZE, audio_pipe)) > 0) {
        write_audio_mp2(out, audio_buffer, (uint32_t)audio_available);
        total_audio_bytes += audio_available;
    }

    // Final timing
    gettimeofday(&now, NULL);
    double total_time = (now.tv_sec - start_time.tv_sec) +
                        (now.tv_usec - start_time.tv_usec) / 1000000.0;
    double final_fps = frame_num / total_time;
    fprintf(stderr, "\nDone! Encoded %u frames in %.2fs (%.1f fps)\n",
            frame_num, total_time, final_fps);
    fprintf(stderr, "Audio: %llu bytes (%.2f MB)\n",
            (unsigned long long)total_audio_bytes,
            total_audio_bytes / 1024.0 / 1024.0);

    // Update total_frames in header
    if (frame_num > 0) {
        if (fseek(out, header_offset + HEADER_TOTAL_FRAMES_OFFSET, SEEK_SET) == 0) {
            fwrite(&frame_num, sizeof(uint32_t), 1, out);
            fprintf(stderr, "Updated total_frames in header: %u\n", frame_num);
        } else {
            fprintf(stderr, "Warning: could not seek to update total_frames\n");
        }
    }

    // Update ENDT in extended header (calculate end time for last frame)
    if (frame_num > 0) {
        // Timecode of the last frame: (frame_num - 1) frame periods, in nanoseconds
        uint64_t endt_ns = (uint64_t)((frame_num - 1) * 1000000000.0 / fps_float);
        if (fseek(out, endt_offset, SEEK_SET) == 0) {
            fwrite(&endt_ns, sizeof(uint64_t), 1, out);
            fprintf(stderr, "Updated ENDT in extended header: %llu ns (%.3f seconds)\n",
                    (unsigned long long)endt_ns, endt_ns / 1000000000.0);
        } else {
            fprintf(stderr, "Warning: could not seek to update ENDT\n");
        }
    }

    // Report write failures instead of silently producing a broken file
    if (ferror(out)) {
        fprintf(stderr, "Error while writing output file\n");
        goto cleanup;
    }
    if (fclose(out) != 0) {
        out = NULL;
        fprintf(stderr, "Error while closing output file\n");
        goto cleanup;
    }
    out = NULL;
    exit_code = 0;

cleanup:
    if (video_pipe) {
        pclose(video_pipe);
    }
    if (audio_pipe) {
        pclose(audio_pipe);
    }
    if (out) {
        fclose(out);
    }
    free(gray_pixels);
    free(bg_col);
    free(fg_col);
    free(chars);
    free(audio_buffer);
    if (rom) {
        free(rom->data);
        free(rom);
    }
    free(ffmpeg_version);
    return exit_code;
}