Files
tsvm/video_encoder/encoder_tav.c
2025-10-16 00:03:58 +09:00

5779 lines
237 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Created by Claude on 2025-09-13.
// TAV (TSVM Advanced Video) Encoder - DWT-based compression with full resolution YCoCg-R
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <math.h>
#include <zstd.h>
#include <unistd.h>
#include <sys/wait.h>
#include <getopt.h>
#include <ctype.h>
#include <sys/time.h>
#include <time.h>
#include <limits.h>
#include <float.h>
#include <fftw3.h>
#define ENCODER_VENDOR_STRING "Encoder-TAV 20251015"
// TSVM Advanced Video (TAV) format constants
// Magic signature: byte 0x1F followed by the ASCII bytes "TSVMTAV" (8 bytes, no separator).
#define TAV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x41\x56" // "\x1FTSVMTAV"
// TAV version - dynamic based on colour space and perceptual tuning
// Version 8: ICtCp multi-tile with perceptual quantisation (--ictcp flag)
// Version 7: YCoCg-R multi-tile with perceptual quantisation (default if width > 640 or height > 540)
// Version 6: ICtCp monoblock with perceptual quantisation (--ictcp flag)
// Version 5: YCoCg-R monoblock with perceptual quantisation (default if width <= 640 and height <= 540)
// Version 4: ICtCp monoblock uniform (--ictcp --no-perceptual-tuning)
// Version 3: YCoCg-R monoblock uniform (--no-perceptual-tuning)
// Version 2: ICtCp multi-tile uniform (--ictcp --no-perceptual-tuning)
// Version 1: YCoCg-R multi-tile uniform (--no-perceptual-tuning)
// Tile encoding modes
#define TAV_MODE_SKIP 0x00 // Skip tile (copy from reference)
#define TAV_MODE_INTRA 0x01 // Intra DWT coding (I-frame tiles)
#define TAV_MODE_DELTA 0x02 // Coefficient delta encoding (efficient P-frames)
// Video packet types
#define TAV_PACKET_IFRAME 0x10 // Intra frame (keyframe)
#define TAV_PACKET_PFRAME 0x11 // Predicted frame
#define TAV_PACKET_GOP_UNIFIED 0x12 // Unified 3D DWT GOP (all frames in single block)
#define TAV_PACKET_AUDIO_MP2 0x20 // MP2 audio
#define TAV_PACKET_SUBTITLE 0x30 // Subtitle packet
#define TAV_PACKET_EXTENDED_HDR 0xEF // Extended header packet
#define TAV_PACKET_GOP_SYNC 0xFC // GOP sync packet (N frames decoded)
#define TAV_PACKET_TIMECODE 0xFD // Timecode packet
#define TAV_PACKET_SYNC_NTSC 0xFE // NTSC Sync packet
#define TAV_PACKET_SYNC 0xFF // Sync packet
// DWT settings
#define TILE_SIZE_X 640
#define TILE_SIZE_Y 540
// Simulated overlapping tiles settings for seamless DWT processing
#define DWT_FILTER_HALF_SUPPORT 4 // For 9/7 filter (filter lengths 9,7 → L=4)
#define TILE_MARGIN_LEVELS 3 // Use margin for 3 levels: 4 * (2^3) = 4 * 8 = 32px
#define TILE_MARGIN (DWT_FILTER_HALF_SUPPORT * (1 << TILE_MARGIN_LEVELS)) // 4 * 8 = 32px
#define PADDED_TILE_SIZE_X (TILE_SIZE_X + 2 * TILE_MARGIN)
#define PADDED_TILE_SIZE_Y (TILE_SIZE_Y + 2 * TILE_MARGIN)
// Wavelet filter types
#define WAVELET_5_3_REVERSIBLE 0 // Lossless capable
#define WAVELET_9_7_IRREVERSIBLE 1 // Higher compression
#define WAVELET_BIORTHOGONAL_13_7 2 // Biorthogonal 13/7 wavelet
#define WAVELET_DD4 16 // Four-point interpolating Deslauriers-Dubuc (DD-4)
#define WAVELET_HAAR 255 // Haar wavelet (simplest wavelet transform)
// Channel layout definitions (bit-field design)
// Bit 0: has alpha, Bit 1: has chroma (inverted), Bit 2: has luma (inverted)
#define CHANNEL_LAYOUT_YCOCG 0 // Y-Co-Cg/I-Ct-Cp (000: no alpha, has chroma, has luma)
#define CHANNEL_LAYOUT_YCOCG_A 1 // Y-Co-Cg-A/I-Ct-Cp-A (001: has alpha, has chroma, has luma)
#define CHANNEL_LAYOUT_Y_ONLY 2 // Y/I only (010: no alpha, no chroma, has luma)
#define CHANNEL_LAYOUT_Y_A 3 // Y-A/I-A (011: has alpha, no chroma, has luma)
#define CHANNEL_LAYOUT_COCG 4 // Co-Cg/Ct-Cp (100: no alpha, has chroma, no luma)
#define CHANNEL_LAYOUT_COCG_A 5 // Co-Cg-A/Ct-Cp-A (101: has alpha, has chroma, no luma)
// Channel layout configuration structure
// One record per supported channel layout: its id, channel count, display
// names per channel slot, and presence flags for each channel.
typedef struct {
int layout_id; // one of the CHANNEL_LAYOUT_* ids
int num_channels; // number of channels carried by this layout
const char* channels[4]; // channel names for display; NULL marks a slot the layout does not carry
int has_y, has_co, has_cg, has_alpha; // 1 when the layout carries that channel, 0 otherwise
} channel_layout_config_t;
// Table of the known channel layouts. Entries are stored in layout-id order,
// so channel_layouts[id].layout_id == id for every CHANNEL_LAYOUT_* value.
// Field order per row: id, channel count, display names, has_y, has_co, has_cg, has_alpha.
static const channel_layout_config_t channel_layouts[] = {
{CHANNEL_LAYOUT_YCOCG, 3, {"Y", "Co", "Cg", NULL}, 1, 1, 1, 0}, // 0: Y-Co-Cg
{CHANNEL_LAYOUT_YCOCG_A, 4, {"Y", "Co", "Cg", "A"}, 1, 1, 1, 1}, // 1: Y-Co-Cg-A
{CHANNEL_LAYOUT_Y_ONLY, 1, {"Y", NULL, NULL, NULL}, 1, 0, 0, 0}, // 2: Y only
{CHANNEL_LAYOUT_Y_A, 2, {"Y", NULL, NULL, "A"}, 1, 0, 0, 1}, // 3: Y-A
{CHANNEL_LAYOUT_COCG, 2, {NULL, "Co", "Cg", NULL}, 0, 1, 1, 0}, // 4: Co-Cg
{CHANNEL_LAYOUT_COCG_A, 3, {NULL, "Co", "Cg", "A"}, 0, 1, 1, 1} // 5: Co-Cg-A
};
// Helper function to check if alpha channel is needed for given channel layout.
// Returns the has_alpha flag of channel_layouts[channel_layout], or 0 when the
// id falls outside the table.
static int needs_alpha_channel(int channel_layout) {
    // Derive the bound from the table itself so adding a layout cannot leave
    // a stale hard-coded count behind.
    const int layout_count = (int)(sizeof(channel_layouts) / sizeof(channel_layouts[0]));
    if (channel_layout < 0 || channel_layout >= layout_count) return 0;
    return channel_layouts[channel_layout].has_alpha;
}
// Default settings
#define DEFAULT_WIDTH 560
#define DEFAULT_HEIGHT 448
#define DEFAULT_FPS 30
#define DEFAULT_QUALITY 3
#define DEFAULT_ZSTD_LEVEL 9
#define GOP_SIZE /*1*/4
// Audio/subtitle constants (reused from TEV)
#define MP2_DEFAULT_PACKET_SIZE 1152
#define MAX_SUBTITLE_LENGTH 2048
int debugDumpMade = 0;
int debugDumpFrameTarget = -1; // -1 means disabled
// Subtitle structure
// Node of a singly linked list of subtitle cues, expressed in frame numbers.
typedef struct subtitle_entry {
int start_frame; // frame at which the cue starts
int end_frame; // frame at which the cue ends (NOTE(review): inclusive vs. exclusive is not visible here — confirm in process_subtitles)
char *text; // cue text
struct subtitle_entry *next; // following list node
} subtitle_entry_t;
// Writes "/tmp/" + 32 pseudo-random alphanumeric characters + ".mp2" into
// `filename`. The destination must hold at least 42 bytes (41 characters plus
// the terminating NUL).
// Note: rand() is reseeded from the wall clock on every call.
static void generate_random_filename(char *filename) {
    static const char prefix[] = "/tmp/";
    static const char suffix[] = ".mp2";
    const char alphabet[] = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
    const int alphabet_len = sizeof(alphabet) - 1;
    const size_t prefix_len = sizeof(prefix) - 1;
    const size_t random_len = 32;

    srand(time(NULL));

    // Directory prefix first
    memcpy(filename, prefix, prefix_len);

    // Then the random stem, one character per rand() draw
    char *cursor = filename + prefix_len;
    for (size_t k = 0; k < random_len; k++) {
        *cursor++ = alphabet[rand() % alphabet_len];
    }

    // Finally the extension; sizeof(suffix) includes the terminating NUL
    memcpy(cursor, suffix, sizeof(suffix));
}
char TEMP_AUDIO_FILE[42];
// Utility helpers (implemented as inline functions rather than macros)
// Limits x to the range [min, max]. The lower bound is checked first, so a
// value below min yields min even when min > max.
static inline int CLAMP(int x, int min, int max) {
    if (x < min) {
        return min;
    }
    if (x > max) {
        return max;
    }
    return x;
}
// Floating-point counterpart of CLAMP: limits x to [min, max], testing the
// lower bound first.
static inline float FCLAMP(float x, float min, float max) {
    if (x < min) {
        return min;
    }
    if (x > max) {
        return max;
    }
    return x;
}
// Calculate maximum decomposition levels for a given frame size.
// Counts how many times the shorter frame side can be halved while it is
// still at least 8 pixels (so the halved side stays >= 4), capped at 10.
static int calculate_max_decomp_levels(int width, int height) {
    const int level_cap = 10;
    int count = 0;

    for (int shortest = (height > width) ? width : height; shortest >= 8; shortest /= 2) {
        count++;
    }

    return (count <= level_cap) ? count : level_cap;
}
// MP2 audio rate table (same as TEV)
static const int MP2_RATE_TABLE[] = {96, 128, 160, 224, 320, 384, 384};
// Valid MP2 bitrates (kbps) as per MPEG-1 Layer II specification
static const int MP2_VALID_BITRATES[] = {32, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320, 384};

// Validate a requested MP2 bitrate.
// Returns `bitrate` unchanged when it exactly matches an entry of
// MP2_VALID_BITRATES, and 0 otherwise. No rounding to a nearby valid rate is
// performed.
static int validate_mp2_bitrate(int bitrate) {
    // size_t index avoids comparing a signed counter against the sizeof-derived bound
    const size_t count = sizeof(MP2_VALID_BITRATES) / sizeof(MP2_VALID_BITRATES[0]);
    for (size_t i = 0; i < count; i++) {
        if (MP2_VALID_BITRATES[i] == bitrate) {
            return bitrate; // Exact match
        }
    }
    return 0; // Invalid bitrate
}
static const int QLUT[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120,122,124,126,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,264,272,280,288,296,304,312,320,328,336,344,352,360,368,376,384,392,400,408,416,424,432,440,448,456,464,472,480,488,496,504,512,528,544,560,576,592,608,624,640,656,672,688,704,720,736,752,768,784,800,816,832,848,864,880,896,912,928,944,960,976,992,1008,1024,1056,1088,1120,1152,1184,1216,1248,1280,1312,1344,1376,1408,1440,1472,1504,1536,1568,1600,1632,1664,1696,1728,1760,1792,1824,1856,1888,1920,1952,1984,2016,2048,2112,2176,2240,2304,2368,2432,2496,2560,2624,2688,2752,2816,2880,2944,3008,3072,3136,3200,3264,3328,3392,3456,3520,3584,3648,3712,3776,3840,3904,3968,4032,4096};
// Quality level to quantisation mapping for different channels
// the values are indices to the QLUT
static const int QUALITY_Y[] = {79, 47, 23, 11, 5, 2, 1}; // 96, 48, 24, 12, 6, 3, 2
static const int QUALITY_CO[] = {123, 108, 91, 76, 59, 29, 4}; // 240, 180, 120, 90, 60, 30, 5
static const int QUALITY_CG[] = {148, 133, 113, 99, 76, 39, 7}; // 424, 304, 200, 144, 90, 40, 8
static const int QUALITY_ALPHA[] = {79, 47, 23, 11, 5, 2, 1}; // 96, 48, 24, 12, 6, 3, 2
// Dead-zone quantisation thresholds per quality level
// Higher values = more aggressive (more coefficients set to zero)
static const float DEAD_ZONE_THRESHOLD[] = {1.5f, 1.5f, 1.2f, 1.1f, 0.8f, 0.6f, 0.0f};
// Dead-zone scaling factors for different subband levels
#define DEAD_ZONE_FINEST_SCALE 1.0f // Full dead-zone for finest level (level 6)
#define DEAD_ZONE_FINE_SCALE 0.5f // Reduced dead-zone for second-finest level (level 5)
// Coarser levels (0-4) use 0.0f (no dead-zone) to preserve structural information
// psychovisual tuning parameters
static const float ANISOTROPY_MULT[] = {5.1f, 3.8f, 2.7f, 2.0f, 1.5f, 1.2f, 1.0f};
static const float ANISOTROPY_BIAS[] = {0.4f, 0.3f, 0.2f, 0.1f, 0.0f, 0.0f, 0.0f};
static const float ANISOTROPY_MULT_CHROMA[] = {7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f};
static const float ANISOTROPY_BIAS_CHROMA[] = {1.0f, 0.8f, 0.6f, 0.4f, 0.2f, 0.0f, 0.0f};
// DWT coefficient structure for each subband
typedef struct {
int16_t *coeffs; // 16-bit coefficient storage for this subband
int width, height; // subband dimensions
int size; // coefficient count (presumably width * height — TODO confirm at the allocation site)
} dwt_subband_t;
// DWT tile structure
typedef struct {
dwt_subband_t *ll, *lh, *hl, *hh; // Subbands for each level
int decomp_levels; // number of decomposition levels used for this tile
int tile_x, tile_y; // tile position (NOTE(review): tile index vs. pixel offset is not visible here — confirm)
} dwt_tile_t;
// DWT subband information for perceptual quantisation
// Describes where one subband lives inside a linear coefficient array and the
// weight applied to it.
typedef struct {
int level; // Decomposition level (1 to enc->decomp_levels)
int subband_type; // 0=LL, 1=LH, 2=HL, 3=HH
int coeff_start; // Starting index in linear coefficient array
int coeff_count; // Number of coefficients in this subband
float perceptual_weight; // Quantisation multiplier for this subband
} dwt_subband_info_t;
// TAV encoder structure
// Holds all state for one encode: file handles, video and encoding parameters,
// rate-control state, frame/coefficient buffers, audio and subtitle state,
// compression context and statistics. Instances are created zero-filled by
// create_encoder(), which then applies the defaults noted below.
typedef struct tav_encoder_s {
// Input/output files
char *input_file;
char *output_file;
char *subtitle_file;
char *fontrom_lo_file;
char *fontrom_hi_file;
FILE *output_fp;
FILE *mp2_file;
FILE *ffmpeg_video_pipe;
// Video parameters
int width, height;
int fps;
int output_fps; // For frame rate conversion
int total_frames;
int frame_count;
double duration;
int has_audio;
int is_ntsc_framerate;
// Encoding parameters
int quality_level;
int quantiser_y, quantiser_co, quantiser_cg; // create_encoder() fills these from the QUALITY_* tables, whose values are QLUT indices
int wavelet_filter; // one of the WAVELET_* ids
int decomp_levels;
float dead_zone_threshold; // Dead-zone quantisation threshold (0 = disabled)
int bitrate_mode; // non-zero enables the bitrate controller
int target_bitrate; // compared against the kbps figure from get_video_rate_kbps()
// Bitrate control (PID controller)
size_t *video_rate_bin; // Rolling window of compressed sizes
int video_rate_bin_size; // Current number of entries in bin
int video_rate_bin_capacity; // Maximum capacity (fps)
float pid_integral; // PID integral term
float pid_prev_error; // PID previous error for derivative
float pid_filtered_derivative; // Low-pass filtered derivative for smoothing
float adjusted_quantiser_y_float; // Float precision qY for smooth control
size_t prev_frame_size; // Previous frame compressed size for scene change detection
int scene_change_cooldown; // Frames to wait after scene change before responding
float dither_accumulator; // Accumulated dithering error for error diffusion
// Flags
int lossless;
int enable_rcf;
int enable_progressive_transmission;
int enable_roi;
int verbose;
int test_mode;
int ictcp_mode; // 0 = YCoCg-R (default), 1 = ICtCp colour space
int intra_only; // Force all tiles to use INTRA mode (disable delta encoding)
int monoblock; // Single DWT tile mode (encode entire frame as one tile); create_encoder() default is 1
int perceptual_tuning; // 1 = perceptual quantisation (default), 0 = uniform quantisation
int channel_layout; // Channel layout id (CHANNEL_LAYOUT_*): 0=Y-Co-Cg, 1=Y-Co-Cg-A, 2=Y-only, 3=Y-A, 4=Co-Cg, 5=Co-Cg-A
int progressive_mode; // 0 = interlaced, 1 = progressive (create_encoder() default)
int grain_synthesis; // 1 = enable grain synthesis, 0 = disable (create_encoder() default)
int use_delta_encoding;
int delta_haar_levels; // Number of Haar DWT levels to apply to delta coefficients (0 = disabled)
// Frame buffers - ping-pong implementation
uint8_t *frame_rgb[2]; // [0] and [1] alternate between current and previous
int frame_buffer_index; // 0 or 1, indicates which set is "current"
float *current_frame_y, *current_frame_co, *current_frame_cg, *current_frame_alpha;
// Convenience pointers (updated each frame to point to current ping-pong buffers)
uint8_t *current_frame_rgb;
uint8_t *previous_frame_rgb;
// DWT coefficient buffers (pre-computed for SKIP detection and encoding)
float *current_dwt_y, *current_dwt_co, *current_dwt_cg;
// GOP (Group of Pictures) buffer for temporal 3D DWT
int enable_temporal_dwt; // Flag to enable temporal DWT (default: 0 for backward compatibility)
int gop_capacity; // Maximum GOP size (create_encoder() sets this to GOP_SIZE)
int gop_frame_count; // Current number of frames accumulated in GOP
uint8_t **gop_rgb_frames; // [frame][pixel*3] - RGB data for each GOP frame
float **gop_y_frames; // [frame][pixel] - Y channel for each GOP frame
float **gop_co_frames; // [frame][pixel] - Co channel for each GOP frame
float **gop_cg_frames; // [frame][pixel] - Cg channel for each GOP frame
int16_t *gop_translation_x; // [frame] - Translation X in quarter-pixel units
int16_t *gop_translation_y; // [frame] - Translation Y in quarter-pixel units
int temporal_decomp_levels; // Number of temporal DWT levels (default: 2)
// Tile processing
int tiles_x, tiles_y;
dwt_tile_t *tiles;
// Audio processing (expanded from TEV)
size_t audio_remaining;
uint8_t *mp2_buffer;
size_t mp2_buffer_size;
int mp2_packet_size;
int mp2_rate_index;
int audio_bitrate; // Custom audio bitrate (0 = use quality table)
int target_audio_buffer_size;
double audio_frames_in_buffer;
// Subtitle processing
subtitle_entry_t *subtitles;
subtitle_entry_t *current_subtitle;
int subtitle_visible;
// Compression
ZSTD_CCtx *zstd_ctx;
void *compressed_buffer;
size_t compressed_buffer_size;
int zstd_level; // Zstd compression level (create_encoder() default: DEFAULT_ZSTD_LEVEL)
// OPTIMISATION: Pre-allocated buffers to avoid malloc/free per tile
int16_t *reusable_quantised_y;
int16_t *reusable_quantised_co;
int16_t *reusable_quantised_cg;
int16_t *reusable_quantised_alpha;
// Coefficient delta storage for P-frames (previous frame's coefficients)
float *previous_coeffs_y; // Previous frame Y coefficients for all tiles
float *previous_coeffs_co; // Previous frame Co coefficients for all tiles
float *previous_coeffs_cg; // Previous frame Cg coefficients for all tiles
float *previous_coeffs_alpha; // Previous frame Alpha coefficients for all tiles
int previous_coeffs_allocated; // Flag to track allocation
// Frame type tracking for SKIP mode
uint8_t last_frame_packet_type; // Last emitted packet type (TAV_PACKET_IFRAME or TAV_PACKET_PFRAME)
int is_still_frame_cached; // Cached result from detect_still_frame() for current frame
int used_skip_mode_last_frame; // Set to 1 when SKIP mode was used (suppresses next keyframe timer)
// Statistics
size_t total_compressed_size;
size_t total_uncompressed_size;
// Progress tracking
struct timeval start_time;
int encode_limit; // Maximum number of frames to encode (0 = no limit)
// Extended header support
char *ffmpeg_version; // FFmpeg version string
uint64_t creation_time_ns; // Creation time in nanoseconds since UNIX epoch
long extended_header_offset; // File offset of extended header for ENDT update
} tav_encoder_t;
// Wavelet filter constants removed - using lifting scheme implementation instead
// Bitrate control functions
// Records one frame's compressed size in the rolling rate window.
// While the window is not yet full the value is appended; once full, the
// oldest entry is shifted out and the new value takes the last slot.
// Does nothing when bitrate mode is off or the window has no storage.
static void update_video_rate_bin(tav_encoder_t *enc, size_t compressed_size) {
    if (!enc->bitrate_mode) return;

    // Without storage the "window full" branch below would compute
    // (capacity - 1) * sizeof(size_t) with capacity <= 0, which wraps to a huge
    // memmove length and then writes index -1.
    if (!enc->video_rate_bin || enc->video_rate_bin_capacity <= 0) return;

    if (enc->video_rate_bin_size < enc->video_rate_bin_capacity) {
        enc->video_rate_bin[enc->video_rate_bin_size++] = compressed_size;
    } else {
        // Shift old entries out (regions overlap, hence memmove)
        memmove(enc->video_rate_bin, enc->video_rate_bin + 1,
                (enc->video_rate_bin_capacity - 1) * sizeof(size_t));
        enc->video_rate_bin[enc->video_rate_bin_capacity - 1] = compressed_size;
    }
}
// Estimates the current video bitrate from the rolling window of compressed
// frame sizes: the byte total is scaled by output_fps / entry count, divided
// by 1024 and multiplied by 8. Returns 0 when bitrate mode is off or the
// window is empty.
static float get_video_rate_kbps(tav_encoder_t *enc) {
    if (!enc->bitrate_mode) {
        return 0.0f;
    }

    const int samples = enc->video_rate_bin_size;
    if (samples == 0) {
        return 0.0f;
    }

    // Sum every recorded frame size in the window
    size_t window_bytes = 0;
    const size_t *entry = enc->video_rate_bin;
    for (int remaining = samples; remaining > 0; remaining--) {
        window_bytes += *entry++;
    }

    // Scale the windowed total up (or down) to one second's worth of frames
    const float per_second = (float)enc->output_fps / samples;
    return (window_bytes * per_second / 1024.0f) * 8.0f; // Convert to kbps
}
// PID controller parameters - heavily damped to prevent oscillation
#define PID_KP 0.08f // Proportional gain - extremely conservative
#define PID_KI 0.002f // Integral gain - very slow to prevent windup
#define PID_KD 0.4f // Derivative gain - moderate damping
#define MAX_QY_CHANGE 0.5f // Maximum quantiser change per frame - extremely conservative
#define DERIVATIVE_FILTER 0.85f // Very heavy low-pass filter for derivative
#define INTEGRAL_DEADBAND 0.05f // Don't accumulate integral within ±5% of target
#define INTEGRAL_CLAMP 500.0f // Clamp integral term to prevent windup
// Updates enc->adjusted_quantiser_y_float once per call using a damped PID
// controller driven by the difference between the measured rolling bitrate
// (get_video_rate_kbps) and enc->target_bitrate.
// - Outside bitrate mode, or until the rate window is at least half full, the
//   value is simply reset to the base quantiser_y.
// - Otherwise the PID output is scaled, limited per frame, scaled again by a
//   qY-dependent factor and added to the current float quantiser, which is
//   finally clamped to [min_qy, 254].
// Also updates pid_integral, pid_prev_error, pid_filtered_derivative and
// scene_change_cooldown. The statement order matters (see notes below).
// NOTE(review): target_bitrate is used as a divisor; this assumes it is > 0
// whenever bitrate_mode is set — confirm at the option-parsing site.
static void adjust_quantiser_for_bitrate(tav_encoder_t *enc) {
if (!enc->bitrate_mode) {
// Not in bitrate mode, use base quantiser
enc->adjusted_quantiser_y_float = (float)enc->quantiser_y;
return;
}
// Need at least a few frames to measure bitrate
if (enc->video_rate_bin_size < (enc->video_rate_bin_capacity / 2)) {
// Not enough data yet, use base quantiser
enc->adjusted_quantiser_y_float = (float)enc->quantiser_y;
return;
}
float current_bitrate = get_video_rate_kbps(enc);
float target_bitrate = (float)enc->target_bitrate;
// Calculate error (positive = over target, negative = under target)
float error = current_bitrate - target_bitrate;
// Calculate error percentage for adaptive scaling
float error_percent = fabsf(error) / target_bitrate;
// Detect scene changes by looking at sudden bitrate jumps
// Scene changes cause temporary spikes that shouldn't trigger aggressive corrections
float derivative_abs = fabsf(error - enc->pid_prev_error);
float derivative_threshold = target_bitrate * 0.4f; // 40% jump = scene change
if (derivative_abs > derivative_threshold && enc->scene_change_cooldown == 0) {
// Scene change detected - start cooldown
enc->scene_change_cooldown = 5; // Wait 5 frames before responding aggressively
}
// Reduce responsiveness during scene change cooldown
// (response_factor is sampled before the counter is decremented)
float response_factor = (enc->scene_change_cooldown > 0) ? 0.3f : 1.0f;
if (enc->scene_change_cooldown > 0) {
enc->scene_change_cooldown--;
}
// PID calculations with scene change damping
float proportional = error * response_factor;
// Conditional integration: only accumulate when error is outside deadband
// This prevents windup when close to target
// Also don't accumulate during scene change cooldown to prevent overreaction
// NOTE(review): the counter was already decremented above, so on the frame where
// it reaches 0 this test passes even though response_factor was still damped.
if (error_percent > INTEGRAL_DEADBAND && enc->scene_change_cooldown == 0) {
enc->pid_integral += error;
} else {
// Aggressively decay integral when within deadband or during scene changes
// This prevents integral windup that causes qY drift
enc->pid_integral *= 0.90f;
}
// Clamp integral immediately to prevent windup
enc->pid_integral = FCLAMP(enc->pid_integral, -INTEGRAL_CLAMP, INTEGRAL_CLAMP);
float derivative = error - enc->pid_prev_error;
enc->pid_prev_error = error;
// Apply low-pass filter to derivative to reduce noise from scene changes
// This smooths out sudden spikes and prevents oscillation
enc->pid_filtered_derivative = (DERIVATIVE_FILTER * enc->pid_filtered_derivative) +
((1.0f - DERIVATIVE_FILTER) * derivative);
// Calculate adjustment using filtered derivative for smoother response
float pid_output = (PID_KP * proportional) + (PID_KI * enc->pid_integral) +
(PID_KD * enc->pid_filtered_derivative);
// Adaptive scaling based on error magnitude and current quantiser position
// At low quantisers (0-10), QLUT is exponential and small changes cause huge bitrate swings
float scale_factor = 100.0f; // Base: ~100 kbps error = 1 quantiser step
float max_change = MAX_QY_CHANGE;
if (enc->adjusted_quantiser_y_float < 5.0f) {
// Extreme lossless (qY 0-4) - be very conservative but still responsive
// At qY=0, QLUT[0]=1, which is essentially lossless and bitrate is huge
// Use fixed scale factor to ensure controller can actually respond
scale_factor = 200.0f; // ~200 kbps error = 1 step
max_change = 0.3f;
} else if (enc->adjusted_quantiser_y_float < 15.0f) {
// Very near lossless (qY 5-14) - very conservative
scale_factor = 400.0f; // ~400 kbps error = 1 step
max_change = 0.4f;
} else if (enc->adjusted_quantiser_y_float < 30.0f) {
// Near lossless range (qY 15-29) - be conservative
scale_factor = 200.0f; // ~200 kbps error = 1 step
max_change = 0.5f;
} else if (error_percent > 0.5f) {
// Large error - be slightly more aggressive
scale_factor = 150.0f;
max_change = 0.6f;
}
// Calculate float adjustment (no integer quantisation yet)
float adjustment_float = pid_output / scale_factor;
// Limit maximum change per frame to prevent wild swings (adaptive limit)
adjustment_float = FCLAMP(adjustment_float, -max_change, max_change);
// Apply logarithmic scaling to adjustment based on current qY
// At low qY (0-10), QLUT is exponential so we need much smaller steps
// At high qY (40+), bitrate changes are small so we can take larger steps
// This makes it "hard to reach towards 1, easy to reach towards large value"
float log_scale = 1.0f;
float current_qy = enc->adjusted_quantiser_y_float;
// Only apply log scaling when moving deeper into low qY region
// If we're at low qY and want to move up (increase qY), use faster response
int wants_to_increase = (adjustment_float > 0);
if (current_qy < 10 && !wants_to_increase) {
// Moving down into very near lossless - be very careful
log_scale = 0.15f + (current_qy / 10.0f) * 0.35f; // 0.15 at qY=0, 0.5 at qY=10
} else if (current_qy < 10 && wants_to_increase) {
// Escaping from very low qY - allow faster movement
log_scale = 0.8f; // Much faster escape from qY < 10
} else if (current_qy < 20) {
// Near lossless - small adjustments
log_scale = 0.5f + ((current_qy - 10) / 10.0f) * 0.3f; // 0.5 at qY=10, 0.8 at qY=20
} else if (current_qy < 40) {
// Moderate quality - normal adjustments
log_scale = 0.8f + ((current_qy - 20) / 20.0f) * 0.2f; // 0.8 at qY=20, 1.0 at qY=40
}
// else: qY >= 40, use full scale (1.0)
adjustment_float *= log_scale;
// Update float quantiser value (no integer quantisation, keeps full precision)
float new_quantiser_y_float = enc->adjusted_quantiser_y_float + adjustment_float;
// Avoid extremely low qY values where QLUT is exponential and causes wild swings
// For 5000 kbps target, qY < 3 is usually too low and causes oscillation
float min_qy = (target_bitrate >= 8000) ? 0.0f : (target_bitrate >= 4000) ? 3.0f : 5.0f;
// Upper bound used here is 254.
// NOTE(review): QLUT as declared in this file has 256 entries (indices 0-255);
// the 254 ceiling may be deliberate — confirm.
new_quantiser_y_float = FCLAMP(new_quantiser_y_float, min_qy, 254.0f); // Max index is 254
enc->adjusted_quantiser_y_float = new_quantiser_y_float;
if (enc->verbose) {
// current_qy still holds the value from before this update
printf("Bitrate control: %.1f kbps (target: %.1f kbps) -> qY %.2f->%.2f (adj: %.3f, err: %.1f%%)\n",
current_bitrate, target_bitrate, current_qy, new_quantiser_y_float, adjustment_float, error_percent * 100);
}
}
// Converts the float quantiser (adjusted_quantiser_y_float) to an integer
// index using error-diffusion dithering, so that a fractional qY is realised
// on average across frames instead of sticking at an integer boundary.
// Half of this frame's rounding error is carried into enc->dither_accumulator
// for the next call. The returned index is limited to [0, 254]; the carried
// error is computed from the value before that limit is applied.
static int quantiser_float_to_int_dithered(tav_encoder_t *enc) {
    // Float quantiser plus the error carried over from the previous frame
    const float target = enc->adjusted_quantiser_y_float + enc->dither_accumulator;

    // Round to nearest by adding one half and truncating
    const int rounded = (int)(target + 0.5f);

    // Carry 50% of the rounding error forward
    enc->dither_accumulator = (target - (float)rounded) * 0.5f;

    return CLAMP(rounded, 0, 254);
}
// Swap ping-pong frame buffers (eliminates need for memcpy).
// Toggles frame_buffer_index between 0 and 1 and re-points the
// current/previous convenience pointers at the matching frame_rgb slots.
static void swap_frame_buffers(tav_encoder_t *enc) {
    const int now_current = 1 - enc->frame_buffer_index;
    const int now_previous = 1 - now_current;

    enc->frame_buffer_index = now_current;
    enc->current_frame_rgb = enc->frame_rgb[now_current];
    enc->previous_frame_rgb = enc->frame_rgb[now_previous];
}
// Parse resolution string like "1024x768" with keyword recognition.
// Recognised keywords (all-lowercase or all-uppercase spelling only):
//   cif -> 352x288, qcif -> 176x144, half -> half of the default size,
//   default -> DEFAULT_WIDTH x DEFAULT_HEIGHT.
// Anything else must be of the form "<width>x<height>" with both numbers
// greater than zero.
// Returns 1 on success and stores the result in *width / *height.
// Returns 0 on failure (NULL string, malformed text, non-positive dimension);
// in that case *width and *height are left untouched.
static int parse_resolution(const char *res_str, int *width, int *height) {
    static const struct {
        const char *lower;
        const char *upper;
        int w;
        int h;
    } presets[] = {
        {"cif", "CIF", 352, 288},
        {"qcif", "QCIF", 176, 144},
        {"half", "HALF", DEFAULT_WIDTH >> 1, DEFAULT_HEIGHT >> 1},
        {"default", "DEFAULT", DEFAULT_WIDTH, DEFAULT_HEIGHT},
    };

    if (!res_str) return 0;

    for (size_t i = 0; i < sizeof(presets) / sizeof(presets[0]); i++) {
        if (strcmp(res_str, presets[i].lower) == 0 || strcmp(res_str, presets[i].upper) == 0) {
            *width = presets[i].w;
            *height = presets[i].h;
            return 1;
        }
    }

    // Parse into temporaries so a partial match such as "640" cannot leave the
    // caller with a half-updated size.
    int parsed_w = 0;
    int parsed_h = 0;
    if (sscanf(res_str, "%dx%d", &parsed_w, &parsed_h) != 2) return 0;

    // Zero or negative dimensions can never describe a frame
    if (parsed_w <= 0 || parsed_h <= 0) return 0;

    *width = parsed_w;
    *height = parsed_h;
    return 1;
}
// encoder stats
static size_t count_intra = 0;
static size_t count_delta = 0;
static size_t count_skip = 0;
// Function prototypes
static void show_usage(const char *program_name);
static tav_encoder_t* create_encoder(void);
static void cleanup_encoder(tav_encoder_t *enc);
static int initialise_encoder(tav_encoder_t *enc);
static int get_subband_level_2d(int x, int y, int width, int height, int decomp_levels);
static int get_subband_type_2d(int x, int y, int width, int height, int decomp_levels);
static int get_subband_level(int linear_idx, int width, int height, int decomp_levels);
static int get_subband_type(int linear_idx, int width, int height, int decomp_levels);
static void rgb_to_ycocg(const uint8_t *rgb, float *y, float *co, float *cg, int width, int height);
static int calculate_max_decomp_levels(int width, int height);
// Audio and subtitle processing prototypes (from TEV)
static int start_audio_conversion(tav_encoder_t *enc);
static int get_mp2_packet_size(uint8_t *header);
static int mp2_packet_size_to_rate_index(int packet_size, int is_mono);
static long write_extended_header(tav_encoder_t *enc);
static void write_timecode_packet(FILE *output, int frame_num, int fps, int is_ntsc_framerate);
static int process_audio(tav_encoder_t *enc, int frame_num, FILE *output);
static subtitle_entry_t* parse_subtitle_file(const char *filename, int fps);
static subtitle_entry_t* parse_srt_file(const char *filename, int fps);
static subtitle_entry_t* parse_smi_file(const char *filename, int fps);
static int srt_time_to_frame(const char *time_str, int fps);
static int sami_ms_to_frame(int milliseconds, int fps);
static void free_subtitle_list(subtitle_entry_t *list);
static int write_subtitle_packet(FILE *output, uint32_t index, uint8_t opcode, const char *text);
static int process_subtitles(tav_encoder_t *enc, int frame_num, FILE *output);
// Temporal 3D DWT prototypes
static void dwt_3d_forward(float **gop_data, int width, int height, int num_frames,
int spatial_levels, int temporal_levels, int spatial_filter);
static void dwt_3d_inverse(float **gop_data, int width, int height, int num_frames,
int spatial_levels, int temporal_levels, int spatial_filter);
static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
int *frame_numbers, int actual_gop_size);
static size_t gop_process_and_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
int *frame_numbers, int force_flush);
static void dwt_2d_forward_flexible(float *tile_data, int width, int height, int levels, int filter_type);
static void dwt_2d_haar_inverse_flexible(float *tile_data, int width, int height, int levels);
static void quantise_dwt_coefficients_perceptual_per_coeff(tav_encoder_t *enc,
float *coeffs, int16_t *quantised, int size,
int base_quantiser, int width, int height,
int decomp_levels, int is_chroma, int frame_count);
static size_t preprocess_coefficients_variable_layout(int16_t *coeffs_y, int16_t *coeffs_co, int16_t *coeffs_cg, int16_t *coeffs_alpha,
int coeff_count, int channel_layout, uint8_t *output_buffer);
static size_t preprocess_gop_unified(int16_t **quant_y, int16_t **quant_co, int16_t **quant_cg,
int num_frames, int num_pixels, int channel_layout,
uint8_t *output_buffer);
// Film grain synthesis
// 32-bit integer mixing function: three xor-shift steps (16, 15, 16)
// interleaved with two multiplications. All arithmetic wraps modulo 2^32.
// Deterministic; the same input always gives the same output.
static uint32_t rng_hash(uint32_t x) {
    uint32_t v = x;
    v = (v ^ (v >> 16)) * 0x7feb352du;
    v = (v ^ (v >> 15)) * 0x846ca68bu;
    return v ^ (v >> 16);
}
// Derives a pseudo-random 32-bit value from (frame, band, x, y) by combining
// the four inputs into one key and passing it through rng_hash().
// Identical inputs always produce the identical value.
static uint32_t grain_synthesis_rng(uint32_t frame, uint32_t band, uint32_t x, uint32_t y) {
    const uint32_t frame_term = frame * 0x9e3779b9u;
    const uint32_t band_term = band * 0x7f4a7c15u;
    const uint32_t position_term = (y << 16) ^ x;

    return rng_hash(frame_term ^ band_term ^ position_term);
}
// Show usage information
// Prints the full command-line help to stdout: option list, the audio-rate and
// quantiser tables per quality level, size keywords, feature list and examples.
// program_name is substituted into the usage and example lines.
static void show_usage(const char *program_name) {
// Number of quality levels, taken from MP2_RATE_TABLE. The same count is used
// below to index QUALITY_Y/CO/CG, which have the same number of entries in this file.
int qtsize = sizeof(MP2_RATE_TABLE) / sizeof(int);
printf("TAV DWT-based Video Encoder\n");
printf("Usage: %s [options] -i input.mp4 -o output.mv3\n\n", program_name);
printf("Options:\n");
printf(" -i, --input FILE Input video file\n");
printf(" -o, --output FILE Output video file (use '-' for stdout)\n");
printf(" -s, --size WxH Video size (default: %dx%d)\n", DEFAULT_WIDTH, DEFAULT_HEIGHT);
printf(" -f, --fps N Output frames per second (enables frame rate conversion)\n");
// NOTE(review): the text says 0-5, but the quality tables printed below have
// seven entries (0-6) — confirm the accepted range against option parsing.
printf(" -q, --quality N Quality level 0-5 (default: 3)\n");
printf(" -Q, --quantiser Y,Co,Cg Quantiser levels 0-255 for each channel (0: lossless, 255: potato)\n");
printf(" -b, --bitrate N Target bitrate in kbps (enables bitrate control mode)\n");
printf(" -c, --channel-layout N Channel layout: 0=Y-Co-Cg, 1=Y-Co-Cg-A, 2=Y-only, 3=Y-A, 4=Co-Cg, 5=Co-Cg-A (default: 0)\n");
printf(" -a, --arate N MP2 audio bitrate in kbps (overrides quality-based audio rate)\n");
printf(" Valid values: 32, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320, 384\n");
printf(" -S, --subtitles FILE SubRip (.srt) or SAMI (.smi) subtitle file\n");
printf(" --fontrom-lo FILE Low font ROM file for internationalised subtitles\n");
printf(" --fontrom-hi FILE High font ROM file for internationalised subtitles\n");
printf(" -v, --verbose Verbose output\n");
printf(" -t, --test Test mode: generate solid colour frames\n");
// The %d here prints qtsize (the table length), i.e. one past the last table index.
printf(" --lossless Lossless mode (-q %d -Q1,1,1 -w 0 --intra-only --no-perceptual-tuning --no-dead-zone --arate 384)\n", qtsize);
printf(" --intra-only Disable delta and skip encoding\n");
printf(" --enable-delta Enable delta encoding\n");
printf(" --delta-haar N Apply N-level Haar DWT to delta coefficients (1-6, auto-enables delta)\n");
printf(" --temporal-dwt Enable temporal 3D DWT (GOP-based encoding with temporal transform)\n");
printf(" --ictcp Use ICtCp colour space instead of YCoCg-R (use when source is in BT.2100)\n");
printf(" --no-perceptual-tuning Disable perceptual quantisation\n");
printf(" --no-dead-zone Disable dead-zone quantisation (for comparison/testing)\n");
printf(" --encode-limit N Encode only first N frames (useful for testing/analysis)\n");
printf(" --dump-frame N Dump quantised coefficients for frame N (creates .bin files)\n");
printf(" --wavelet N Wavelet filter: 0=LGT 5/3, 1=CDF 9/7, 2=CDF 13/7, 16=DD-4, 255=Haar (default: 1)\n");
printf(" --zstd-level N Zstd compression level 1-22 (default: %d, higher = better compression but slower)\n", DEFAULT_ZSTD_LEVEL);
// NOTE(review): create_encoder() initialises grain_synthesis to 0, which does not
// match "enabled by default" — confirm which one is intended.
printf(" --no-grain-synthesis Disable grain synthesis (enabled by default)\n");
printf(" --help Show this help\n\n");
printf("Audio Rate by Quality:\n ");
for (int i = 0; i < qtsize; i++) {
printf("%d: %d kbps\t", i, MP2_RATE_TABLE[i]);
}
// For each channel: quality level, QLUT index (padded by digit count), and the
// actual quantiser value looked up in QLUT.
printf("\n\nQuantiser Value by Quality:\n");
printf(" Y - ");
for (int i = 0; i < qtsize; i++) {
printf("%d: Q %d%s(→%d) \t", i, QUALITY_Y[i], QUALITY_Y[i] < 10 ? " " : QUALITY_Y[i] < 100 ? " " : "", QLUT[QUALITY_Y[i]]);
}
printf("\n Co - ");
for (int i = 0; i < qtsize; i++) {
printf("%d: Q %d%s(→%d) \t", i, QUALITY_CO[i], QUALITY_CO[i] < 10 ? " " : QUALITY_CO[i] < 100 ? " " : "", QLUT[QUALITY_CO[i]]);
}
printf("\n Cg - ");
for (int i = 0; i < qtsize; i++) {
printf("%d: Q %d%s(→%d) \t", i, QUALITY_CG[i], QUALITY_CG[i] < 10 ? " " : QUALITY_CG[i] < 100 ? " " : "", QLUT[QUALITY_CG[i]]);
}
printf("\n\nVideo Size Keywords:");
printf("\n -s cif: equal to 352x288");
printf("\n -s qcif: equal to 176x144");
printf("\n -s half: equal to %dx%d", DEFAULT_WIDTH >> 1, DEFAULT_HEIGHT >> 1);
printf("\n -s default: equal to %dx%d", DEFAULT_WIDTH, DEFAULT_HEIGHT);
printf("\n\n");
printf("Features:\n");
printf(" - Single DWT tile (monoblock) encoding for optimal quality\n");
printf(" - Perceptual quantisation optimised for human visual system (default)\n");
printf(" - Full resolution YCoCg-R/ICtCp colour space\n");
printf(" - Lossless and lossy compression modes\n");
printf("\nExamples:\n");
printf(" %s -i input.mp4 -o output.mv3 # Default settings\n", program_name);
printf(" %s -i input.mkv -q 4 -o output.mv3 # At maximum quality\n", program_name);
printf(" %s -i input.avi --lossless -o output.mv3 # Lossless encoding\n", program_name);
printf(" %s -i input.mp4 -b 6000 -o output.mv3 # 6000 kbps bitrate target\n", program_name);
printf(" %s -i input.webm -S subs.srt -o output.mv3 # With subtitles\n", program_name);
}
// Create encoder instance
// Allocates a zero-filled tav_encoder_t with calloc and fills in the built-in
// default settings. Returns NULL if the allocation fails.
static tav_encoder_t* create_encoder(void) {
    tav_encoder_t *encoder = calloc(1, sizeof(tav_encoder_t));
    if (encoder == NULL) {
        return NULL;
    }

    // Picture geometry, timing and quality preset
    encoder->width = DEFAULT_WIDTH;
    encoder->height = DEFAULT_HEIGHT;
    encoder->fps = DEFAULT_FPS;
    encoder->quality_level = DEFAULT_QUALITY;

    // Spatial wavelet configuration
    encoder->wavelet_filter = WAVELET_9_7_IRREVERSIBLE;
    encoder->decomp_levels = 6;

    // Quantisers and dead zone taken from the default quality slot
    encoder->quantiser_y = QUALITY_Y[DEFAULT_QUALITY];
    encoder->quantiser_co = QUALITY_CO[DEFAULT_QUALITY];
    encoder->quantiser_cg = QUALITY_CG[DEFAULT_QUALITY];
    encoder->dead_zone_threshold = DEAD_ZONE_THRESHOLD[DEFAULT_QUALITY];

    // Coding mode switches
    encoder->intra_only = 0;
    encoder->monoblock = 1;                          // Single tile covering the frame
    encoder->perceptual_tuning = 1;                  // Perceptual quantisation (versions 5/6)
    encoder->channel_layout = CHANNEL_LAYOUT_YCOCG;  // Y-Co-Cg
    encoder->progressive_mode = 1;                   // Progressive mode
    encoder->grain_synthesis = 0;                    // Off in the encoder (left to the decoder)
    encoder->use_delta_encoding = 0;
    encoder->delta_haar_levels = 2;

    // Audio, frame limit and entropy coder
    encoder->audio_bitrate = 0;                      // 0 = use quality table
    encoder->encode_limit = 0;                       // No frame limit
    encoder->zstd_level = DEFAULT_ZSTD_LEVEL;

    // GOP / temporal DWT settings
    // Temporal DWT is off by default (backward compatibility) and is mutually
    // exclusive with use_delta_encoding.
    encoder->enable_temporal_dwt = 0;
    encoder->gop_capacity = GOP_SIZE;
    encoder->gop_frame_count = 0;
    encoder->temporal_decomp_levels = 2;

    // GOP buffers are only allocated later, when temporal DWT is enabled
    encoder->gop_rgb_frames = NULL;
    encoder->gop_y_frames = NULL;
    encoder->gop_co_frames = NULL;
    encoder->gop_cg_frames = NULL;
    encoder->gop_translation_x = NULL;
    encoder->gop_translation_y = NULL;

    return encoder;
}
// Initialise encoder resources
//
// Derives the decomposition depth and tile layout from the configured frame
// size, then allocates the frame, DWT, quantisation, compression and (when
// enabled) bitrate-control and GOP buffers.
//
// Returns 0 on success, -1 if enc is NULL or any allocation fails.
// On a GOP allocation failure every GOP buffer is released and the matching
// enc->gop_* pointers are reset to NULL, so they are never left dangling.
// Other buffers that were already allocated stay attached to enc.
static int initialise_encoder(tav_encoder_t *enc) {
    if (!enc) return -1;

    // Decomposition depth is derived from the frame dimensions in every mode
    enc->decomp_levels = calculate_max_decomp_levels(enc->width, enc->height);

    // Calculate tile dimensions
    if (enc->monoblock) {
        // Monoblock mode: single tile covering entire frame
        enc->tiles_x = 1;
        enc->tiles_y = 1;
    } else {
        // Standard mode: multiple tiles (rounded up to cover the frame)
        enc->tiles_x = (enc->width + TILE_SIZE_X - 1) / TILE_SIZE_X;
        enc->tiles_y = (enc->height + TILE_SIZE_Y - 1) / TILE_SIZE_Y;
    }
    int num_tiles = enc->tiles_x * enc->tiles_y;

    // Sizes are computed in size_t so large frames cannot overflow int
    const size_t frame_size = (size_t)enc->width * (size_t)enc->height;

    // Allocate ping-pong frame buffers
    enc->frame_rgb[0] = malloc(frame_size * 3);
    enc->frame_rgb[1] = malloc(frame_size * 3);

    // Initialise ping-pong buffer index and convenience pointers
    enc->frame_buffer_index = 0;
    enc->current_frame_rgb = enc->frame_rgb[0];
    enc->previous_frame_rgb = enc->frame_rgb[1];

    enc->current_frame_y = malloc(frame_size * sizeof(float));
    enc->current_frame_co = malloc(frame_size * sizeof(float));
    enc->current_frame_cg = malloc(frame_size * sizeof(float));
    enc->current_frame_alpha = malloc(frame_size * sizeof(float));

    // Allocate DWT coefficient buffers for SKIP detection
    enc->current_dwt_y = malloc(frame_size * sizeof(float));
    enc->current_dwt_co = malloc(frame_size * sizeof(float));
    enc->current_dwt_cg = malloc(frame_size * sizeof(float));

    // Allocate tile structures
    enc->tiles = malloc(num_tiles * sizeof(dwt_tile_t));

    // Initialise ZSTD compression
    enc->zstd_ctx = ZSTD_createCCtx();

    // Coefficients per tile: whole frame in monoblock mode, padded tile otherwise
    const size_t coeff_count_per_tile = enc->monoblock ?
        frame_size :
        (size_t)(PADDED_TILE_SIZE_X * PADDED_TILE_SIZE_Y);

    // Calculate maximum possible frame size for ZSTD buffer
    const size_t max_frame_size = num_tiles * (4 + coeff_count_per_tile * 3 * sizeof(int16_t));
    enc->compressed_buffer_size = ZSTD_compressBound(max_frame_size);
    enc->compressed_buffer = malloc(enc->compressed_buffer_size);

    // OPTIMISATION: Allocate reusable quantisation buffers
    enc->reusable_quantised_y = malloc(coeff_count_per_tile * sizeof(int16_t));
    enc->reusable_quantised_co = malloc(coeff_count_per_tile * sizeof(int16_t));
    enc->reusable_quantised_cg = malloc(coeff_count_per_tile * sizeof(int16_t));
    enc->reusable_quantised_alpha = malloc(coeff_count_per_tile * sizeof(int16_t));

    // Allocate coefficient delta storage for P-frames (per-tile coefficient storage)
    const size_t total_coeff_size = (size_t)num_tiles * coeff_count_per_tile * sizeof(float);
    enc->previous_coeffs_y = malloc(total_coeff_size);
    enc->previous_coeffs_co = malloc(total_coeff_size);
    enc->previous_coeffs_cg = malloc(total_coeff_size);
    enc->previous_coeffs_alpha = malloc(total_coeff_size);
    enc->previous_coeffs_allocated = 0; // Will be set to 1 after first I-frame

    // Fail before allocating anything else if a core buffer is missing.
    // The SKIP-detection DWT buffers are part of this check as well.
    if (!enc->frame_rgb[0] || !enc->frame_rgb[1] ||
        !enc->current_frame_y || !enc->current_frame_co || !enc->current_frame_cg || !enc->current_frame_alpha ||
        !enc->current_dwt_y || !enc->current_dwt_co || !enc->current_dwt_cg ||
        !enc->tiles || !enc->zstd_ctx || !enc->compressed_buffer ||
        !enc->reusable_quantised_y || !enc->reusable_quantised_co || !enc->reusable_quantised_cg || !enc->reusable_quantised_alpha ||
        !enc->previous_coeffs_y || !enc->previous_coeffs_co || !enc->previous_coeffs_cg || !enc->previous_coeffs_alpha) {
        return -1;
    }

    // Initialise bitrate control if in bitrate mode
    if (enc->bitrate_mode) {
        enc->video_rate_bin_capacity = enc->output_fps > 0 ? enc->output_fps : enc->fps;
        enc->video_rate_bin = calloc(enc->video_rate_bin_capacity, sizeof(size_t));
        enc->video_rate_bin_size = 0;
        enc->pid_integral = 0.0f;
        enc->pid_prev_error = 0.0f;
        enc->adjusted_quantiser_y_float = (float)enc->quantiser_y; // Start with base quantiser
        enc->dither_accumulator = 0.0f;

        if (!enc->video_rate_bin) {
            return -1;
        }

        printf("Bitrate control enabled: target = %d kbps, initial quality = %d\n",
               enc->target_bitrate, enc->quality_level);
    }

    // Allocate GOP buffers if temporal DWT is enabled
    if (enc->enable_temporal_dwt) {
        const size_t frame_rgb_size = frame_size * 3; // RGB
        const size_t frame_channel_size = frame_size * sizeof(float);
        const size_t gop_slots = (size_t)enc->gop_capacity;

        // Pointer tables are zero-filled so every slot is either NULL or a
        // live allocation; the failure path can then free all slots safely.
        enc->gop_rgb_frames = calloc(gop_slots, sizeof(uint8_t*));
        enc->gop_y_frames = calloc(gop_slots, sizeof(float*));
        enc->gop_co_frames = calloc(gop_slots, sizeof(float*));
        enc->gop_cg_frames = calloc(gop_slots, sizeof(float*));

        // Translation vectors start out as zero
        enc->gop_translation_x = calloc(gop_slots, sizeof(int16_t));
        enc->gop_translation_y = calloc(gop_slots, sizeof(int16_t));

        if (!enc->gop_rgb_frames || !enc->gop_y_frames ||
            !enc->gop_co_frames || !enc->gop_cg_frames ||
            !enc->gop_translation_x || !enc->gop_translation_y) {
            goto gop_alloc_failed;
        }

        // Allocate individual frame buffers
        for (int i = 0; i < enc->gop_capacity; i++) {
            enc->gop_rgb_frames[i] = malloc(frame_rgb_size);
            enc->gop_y_frames[i] = malloc(frame_channel_size);
            enc->gop_co_frames[i] = malloc(frame_channel_size);
            enc->gop_cg_frames[i] = malloc(frame_channel_size);

            if (!enc->gop_rgb_frames[i] || !enc->gop_y_frames[i] ||
                !enc->gop_co_frames[i] || !enc->gop_cg_frames[i]) {
                goto gop_alloc_failed;
            }
        }

        if (enc->verbose) {
            printf("Temporal DWT enabled: GOP size=%d, temporal levels=%d\n",
                   enc->gop_capacity, enc->temporal_decomp_levels);
        }
    }

    return 0;

gop_alloc_failed:
    // Release every GOP allocation made so far and clear the pointers so that
    // no later cleanup can free them a second time.
    for (int i = 0; i < enc->gop_capacity; i++) {
        if (enc->gop_rgb_frames) free(enc->gop_rgb_frames[i]);
        if (enc->gop_y_frames) free(enc->gop_y_frames[i]);
        if (enc->gop_co_frames) free(enc->gop_co_frames[i]);
        if (enc->gop_cg_frames) free(enc->gop_cg_frames[i]);
    }
    free(enc->gop_rgb_frames);
    free(enc->gop_y_frames);
    free(enc->gop_co_frames);
    free(enc->gop_cg_frames);
    free(enc->gop_translation_x);
    free(enc->gop_translation_y);
    enc->gop_rgb_frames = NULL;
    enc->gop_y_frames = NULL;
    enc->gop_co_frames = NULL;
    enc->gop_cg_frames = NULL;
    enc->gop_translation_x = NULL;
    enc->gop_translation_y = NULL;
    return -1;
}
// =============================================================================
// DWT Implementation - 5/3 Reversible and 9/7 Irreversible Filters
// =============================================================================
// Forward 1D 5/3 lifting transform, performed in place.
// On return the first (length + 1) / 2 entries of data hold the low-pass
// samples and the remaining length / 2 entries hold the high-pass samples.
// Inputs shorter than two samples are left untouched.
static void dwt_53_forward_1d(float *data, int length) {
    if (length < 2) {
        return;
    }

    const int low_count = (length + 1) / 2;  // Odd lengths give the extra sample to low-pass
    const int high_count = length / 2;

    float *scratch = malloc(length * sizeof(float));
    float *low = scratch;
    float *high = scratch + low_count;

    // Predict: each odd sample minus the mean of its even neighbours.
    // The right neighbour falls back to the left one at the end of the signal.
    for (int k = 0; k < high_count; k++) {
        const int even_pos = 2 * k;
        const float left = data[even_pos];
        const float right = (even_pos + 2 < length) ? data[even_pos + 2] : data[even_pos];
        const float pred = 0.5f * (left + right);
        high[k] = data[even_pos + 1] - pred;
    }

    // Update: each even sample plus a quarter of the neighbouring details.
    // The last low-pass sample only sees the detail before it.
    for (int k = 0; k < low_count; k++) {
        const float detail_before = (k > 0) ? high[k - 1] : 0;
        const float detail_after = (k < low_count - 1) ? high[k] : 0;
        const float update = 0.25f * (detail_before + detail_after);
        low[k] = data[2 * k] + update;
    }

    memcpy(data, scratch, length * sizeof(float));
    free(scratch);
}
// Forward 1D 9/7 lifting transform, performed in place.
// The signal is split into even (low) and odd (high) halves, run through two
// predict/update rounds (alpha/beta, then gamma/delta) and finally scaled by K.
// On return the first (length + 1) / 2 entries are low-pass, the rest high-pass.
// Inputs shorter than two samples are left untouched.
static void dwt_97_forward_1d(float *data, int length) {
    if (length < 2) {
        return;
    }

    // JPEG2000 9/7 lifting weights, one predict/update pair per round
    const float predict_weight[2] = { -1.586134342f, 0.882911076f };  // alpha, gamma
    const float update_weight[2] = { -0.052980118f, 0.443506852f };   // beta, delta
    const float K = 1.230174105f;

    const int low_count = (length + 1) / 2;  // Odd lengths give the extra sample to low-pass
    const int high_count = length / 2;

    float *scratch = malloc(length * sizeof(float));
    float *low = scratch;
    float *high = scratch + low_count;

    // De-interleave into even and odd samples
    for (int k = 0; k < low_count; k++) {
        low[k] = data[2 * k];
    }
    for (int k = 0; k < high_count; k++) {
        high[k] = data[2 * k + 1];
    }

    for (int round = 0; round < 2; round++) {
        // Predict: d[i] += w * (s[i] + s[i+1]), repeating s[i] past the end
        for (int k = 0; k < high_count; k++) {
            float s_curr = low[k];
            float s_next = (k + 1 < low_count) ? low[k + 1] : s_curr;
            high[k] += predict_weight[round] * (s_curr + s_next);
        }

        // Update: s[i] += w * (d[i-1] + d[i]); a missing d[i] counts as zero
        // and a missing d[i-1] is replaced by d[i]
        for (int k = 0; k < low_count; k++) {
            float d_curr = (k < high_count) ? high[k] : 0.0f;
            float d_prev = (k > 0) ? high[k - 1] : d_curr;
            low[k] += update_weight[round] * (d_prev + d_curr);
        }
    }

    // Scaling: low-pass multiplied by K, high-pass divided by K
    for (int k = 0; k < low_count; k++) {
        low[k] *= K;
    }
    for (int k = 0; k < high_count; k++) {
        high[k] /= K;
    }

    memcpy(data, scratch, length * sizeof(float));
    free(scratch);
}
// Forward 1D four-point Deslauriers-Dubuc (DD-4) transform, performed in place.
// Odd samples are predicted from four even samples with the kernel
// (-1/16, 9/16, 9/16, -1/16); even samples then receive a quarter of the two
// adjacent details. On return the first (length + 1) / 2 entries are low-pass
// and the rest are high-pass. Inputs shorter than two samples are untouched.
static void dwt_dd4_forward_1d(float *data, int length) {
    if (length < 2) {
        return;
    }

    const int low_count = (length + 1) / 2;
    const int high_count = length / 2;

    float *scratch = malloc(length * sizeof(float));
    float *low = scratch;
    float *high = scratch + low_count;

    // De-interleave into even and odd samples
    for (int k = 0; k < low_count; k++) {
        low[k] = data[2 * k];
    }
    for (int k = 0; k < high_count; k++) {
        high[k] = data[2 * k + 1];
    }

    // Prediction step
    for (int k = 0; k < high_count; k++) {
        // s[k-1]: reuse the first sample at the left edge
        const float before = (k > 0) ? low[k - 1] : low[0];
        // s[k]
        const float here = low[k];
        // s[k+1]: reuse the last sample at the right edge
        const float after = (k + 1 < low_count) ? low[k + 1] : low[low_count - 1];
        // s[k+2]: fall back to the second-to-last sample, or to the last one
        // when only a single even sample exists
        float far_after;
        if (k + 2 < low_count) {
            far_after = low[k + 2];
        } else if (low_count > 1) {
            far_after = low[low_count - 2];
        } else {
            far_after = low[low_count - 1];
        }

        float prediction = (-1.0f/16.0f) * before + (9.0f/16.0f) * here +
                           (9.0f/16.0f) * after + (-1.0f/16.0f) * far_after;
        high[k] -= prediction;
    }

    // Update step: s[k] += 0.25 * (d[k-1] + d[k]), missing details count as zero
    for (int k = 0; k < low_count; k++) {
        const float d_curr = (k < high_count) ? high[k] : 0.0f;
        const float d_prev = (k > 0) ? high[k - 1] : 0.0f;
        low[k] += 0.25f * (d_prev + d_curr);
    }

    memcpy(data, scratch, length * sizeof(float));
    free(scratch);
}
// Forward 1D transform used for the "biorthogonal 13/7" filter, in place.
// The current implementation uses the same two-tap predict and update steps
// as the 5/3 transform and then applies K scaling (low-pass * K, high-pass / K).
// On return the first (length + 1) / 2 entries are low-pass, the rest high-pass.
// Inputs shorter than two samples are left untouched.
static void dwt_bior137_forward_1d(float *data, int length) {
    if (length < 2) {
        return;
    }

    const float K = 1.230174105f;
    const int low_count = (length + 1) / 2;
    const int high_count = length / 2;

    float *scratch = malloc(length * sizeof(float));
    float *low = scratch;
    float *high = scratch + low_count;

    // Predict: odd sample minus the mean of its two even neighbours
    // (the right neighbour falls back to the left one at the end)
    for (int k = 0; k < high_count; k++) {
        const int even_pos = 2 * k;
        float left = data[even_pos];
        float right = (even_pos + 2 < length) ? data[even_pos + 2] : data[even_pos];
        float prediction = 0.5f * (left + right);
        high[k] = data[even_pos + 1] - prediction;
    }

    // Update: even sample plus a quarter of the neighbouring details;
    // the last low-pass sample only sees the detail before it
    for (int k = 0; k < low_count; k++) {
        const float detail_before = (k > 0) ? high[k - 1] : 0;
        const float detail_after = (k < low_count - 1) ? high[k] : 0;
        float update = 0.25f * (detail_before + detail_after);
        low[k] = data[2 * k] + update;
    }

    // Scaling
    for (int k = 0; k < low_count; k++) {
        low[k] *= K;
    }
    for (int k = 0; k < high_count; k++) {
        high[k] /= K;
    }

    memcpy(data, scratch, length * sizeof(float));
    free(scratch);
}
// Forward 1D Haar transform, performed in place.
// Each adjacent pair (a, b) becomes an average (a + b) / 2 in the low-pass
// half and a difference (a - b) / 2 in the high-pass half. With an odd length
// the unpaired last sample is copied to the end of the low-pass half.
// Inputs shorter than two samples are left untouched.
static void dwt_haar_forward_1d(float *data, int length) {
    if (length < 2) {
        return;
    }

    const int low_count = (length + 1) / 2;
    const int pair_count = length / 2;

    float *scratch = malloc(length * sizeof(float));
    float *averages = scratch;
    float *differences = scratch + low_count;

    for (int k = 0; k < pair_count; k++) {
        const float first = data[2 * k];
        const float second = data[2 * k + 1];
        averages[k] = (first + second) / 2.0f;
        differences[k] = (first - second) / 2.0f;
    }

    // Odd length: the trailing sample has no partner and passes through
    if (low_count > pair_count) {
        averages[low_count - 1] = data[length - 1];
    }

    memcpy(data, scratch, length * sizeof(float));
    free(scratch);
}
// Inverse 1D Haar transform, performed in place.
// Expects averages in the first (length + 1) / 2 entries and differences in
// the rest; each pair is rebuilt as (avg + diff, avg - diff). With an odd
// length the last low-pass entry is copied back as the final sample.
// Inputs shorter than two samples are left untouched.
static void dwt_haar_inverse_1d(float *data, int length) {
    if (length < 2) {
        return;
    }

    const int low_count = (length + 1) / 2;
    const int pair_count = length / 2;
    const float *averages = data;
    const float *differences = data + low_count;

    float *scratch = malloc(length * sizeof(float));

    for (int k = 0; k < pair_count; k++) {
        scratch[2 * k] = averages[k] + differences[k];
        scratch[2 * k + 1] = averages[k] - differences[k];
    }

    // Odd length: the unpaired sample was stored as a plain low-pass value
    if (low_count > pair_count) {
        scratch[length - 1] = averages[low_count - 1];
    }

    memcpy(data, scratch, length * sizeof(float));
    free(scratch);
}
// Inverse 1D 5/3 lifting transform, performed in place.
// Expects low-pass samples in the first (length + 1) / 2 entries and high-pass
// samples in the rest, undoes the update and predict steps and writes the
// re-interleaved signal back into data.
// Inputs shorter than two samples are left untouched.
static void dwt_53_inverse_1d(float *data, int length) {
    if (length < 2) {
        return;
    }

    const int low_count = (length + 1) / 2;

    // Work on a copy of both subbands so data can be overwritten while interleaving
    float *scratch = malloc(length * sizeof(float));
    memcpy(scratch, data, length * sizeof(float));
    float *low = scratch;
    const float *high = scratch + low_count;

    // Undo update step: remove a quarter of the neighbouring details
    for (int k = 0; k < low_count; k++) {
        const float detail_before = (k > 0) ? high[k - 1] : 0;
        const float detail_after = (k < low_count - 1) ? high[k] : 0;
        const float update = 0.25f * (detail_before + detail_after);
        low[k] -= update;
    }

    // Undo predict step and interleave even/odd samples
    for (int k = 0; k < low_count; k++) {
        const int even_pos = 2 * k;
        const int odd_pos = even_pos + 1;
        data[even_pos] = low[k];
        if (odd_pos < length) {
            const float neighbour = (k < low_count - 1) ? low[k + 1] : low[k];
            const float pred = 0.5f * (low[k] + neighbour);
            data[odd_pos] = high[k] + pred;
        }
    }

    free(scratch);
}
// FFT-based phase correlation for global motion estimation
// Uses FFTW3 to compute the normalised cross-power spectrum of two frames and
// locates the peak of its inverse transform.
//
// frame1_rgb / frame2_rgb: interleaved 8-bit RGB frames of width x height pixels
// dx_qpel / dy_qpel:       receive the estimated translation in quarter-pixel units
//
// If the frame size is not positive, or any FFTW buffer or plan cannot be
// created, the outputs are set to zero (no motion) instead of dereferencing a
// NULL pointer; everything that was allocated is released before returning.
static void phase_correlate_fft(const uint8_t *frame1_rgb, const uint8_t *frame2_rgb,
                                int width, int height, int16_t *dx_qpel, int16_t *dy_qpel) {
    // Fallback result reported whenever the correlation cannot be computed
    *dx_qpel = 0;
    *dy_qpel = 0;
    if (width <= 0 || height <= 0) {
        return;
    }

    const size_t pixel_count = (size_t)width * (size_t)height;
    const int fft_height = height;
    const int fft_width = width / 2 + 1; // R2C FFT only stores half + 1 complex values
    const size_t spectrum_count = (size_t)fft_height * (size_t)fft_width;

    fftwf_plan plan_fwd1 = NULL;
    fftwf_plan plan_fwd2 = NULL;
    fftwf_plan plan_inv = NULL;

    float *gray1 = fftwf_malloc(pixel_count * sizeof(float));
    float *gray2 = fftwf_malloc(pixel_count * sizeof(float));
    fftwf_complex *fft1 = fftwf_malloc(spectrum_count * sizeof(fftwf_complex));
    fftwf_complex *fft2 = fftwf_malloc(spectrum_count * sizeof(fftwf_complex));
    fftwf_complex *cross_power = fftwf_malloc(spectrum_count * sizeof(fftwf_complex));
    float *correlation = fftwf_malloc(pixel_count * sizeof(float));

    if (!gray1 || !gray2 || !fft1 || !fft2 || !cross_power || !correlation) {
        fprintf(stderr, "Warning: phase correlation buffers could not be allocated; assuming zero motion\n");
        goto cleanup;
    }

    // Step 1: Convert RGB to grayscale (ITU-R BT.601 weights)
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            int idx = y * width + x;
            int rgb_idx = idx * 3;

            gray1[idx] = 0.299f * frame1_rgb[rgb_idx] +
                         0.587f * frame1_rgb[rgb_idx + 1] +
                         0.114f * frame1_rgb[rgb_idx + 2];

            gray2[idx] = 0.299f * frame2_rgb[rgb_idx] +
                         0.587f * frame2_rgb[rgb_idx + 1] +
                         0.114f * frame2_rgb[rgb_idx + 2];
        }
    }

    // Step 2: Plan FFTs (r2c = real to complex); the planner returns NULL on failure
    plan_fwd1 = fftwf_plan_dft_r2c_2d(height, width, gray1, fft1, FFTW_ESTIMATE);
    plan_fwd2 = fftwf_plan_dft_r2c_2d(height, width, gray2, fft2, FFTW_ESTIMATE);
    plan_inv = fftwf_plan_dft_c2r_2d(height, width, cross_power, correlation, FFTW_ESTIMATE);

    if (!plan_fwd1 || !plan_fwd2 || !plan_inv) {
        fprintf(stderr, "Warning: phase correlation FFT plans could not be created; assuming zero motion\n");
        goto cleanup;
    }

    // Step 3: Execute forward FFTs
    fftwf_execute(plan_fwd1);
    fftwf_execute(plan_fwd2);

    // Step 4: Compute cross-power spectrum: F1 * conj(F2) / |F1 * conj(F2)|
    for (size_t i = 0; i < spectrum_count; i++) {
        float re1 = fft1[i][0];
        float im1 = fft1[i][1];
        float re2 = fft2[i][0];
        float im2 = fft2[i][1];

        // F1 * conj(F2)
        float cross_re = re1 * re2 + im1 * im2;
        float cross_im = im1 * re2 - re1 * im2;

        // Magnitude
        float mag = sqrtf(cross_re * cross_re + cross_im * cross_im);

        // Normalise (avoid division by zero)
        if (mag > 1e-10f) {
            cross_power[i][0] = cross_re / mag;
            cross_power[i][1] = cross_im / mag;
        } else {
            cross_power[i][0] = 0.0f;
            cross_power[i][1] = 0.0f;
        }
    }

    // Step 5: Inverse FFT to get correlation surface
    fftwf_execute(plan_inv);

    // Step 6: Find peak in correlation surface (integer pixel accuracy)
    float max_val = -1e30f;
    int peak_x = 0, peak_y = 0;
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            float val = correlation[y * width + x];
            if (val > max_val) {
                max_val = val;
                peak_x = x;
                peak_y = y;
            }
        }
    }

    // Convert to signed displacement (peak at (0,0) means no motion;
    // a peak in the second half of an axis means negative motion)
    int dx = peak_x;
    int dy = peak_y;
    if (dx > width / 2) dx -= width;
    if (dy > height / 2) dy -= height;

    // Step 7: Sub-pixel refinement using parabolic interpolation.
    // NOTE(review): a peak in the first or last column/row is never refined even
    // though the neighbour indices below wrap around - confirm this is intended.
    float subpixel_dx = 0.0f;
    float subpixel_dy = 0.0f;

    if (peak_x > 0 && peak_x < width - 1) {
        float left = correlation[peak_y * width + ((peak_x - 1 + width) % width)];
        float center = correlation[peak_y * width + peak_x];
        float right = correlation[peak_y * width + ((peak_x + 1) % width)];

        // Parabolic fit: offset = (left - right) / (2 * (left - 2*center + right))
        float denom = 2.0f * (left - 2.0f * center + right);
        if (fabsf(denom) > 1e-6f) {
            subpixel_dx = (left - right) / denom;
            subpixel_dx = CLAMP(subpixel_dx, -0.5f, 0.5f);
        }
    }

    if (peak_y > 0 && peak_y < height - 1) {
        float top = correlation[((peak_y - 1 + height) % height) * width + peak_x];
        float center = correlation[peak_y * width + peak_x];
        float bottom = correlation[((peak_y + 1) % height) * width + peak_x];

        float denom = 2.0f * (top - 2.0f * center + bottom);
        if (fabsf(denom) > 1e-6f) {
            subpixel_dy = (top - bottom) / denom;
            subpixel_dy = CLAMP(subpixel_dy, -0.5f, 0.5f);
        }
    }

    // Step 8: Convert to quarter-pixel units
    float final_dx = dx + subpixel_dx;
    float final_dy = dy + subpixel_dy;

    *dx_qpel = (int16_t)roundf(final_dx * 4.0f);
    *dy_qpel = (int16_t)roundf(final_dy * 4.0f);

cleanup:
    // Only release what was actually created
    if (plan_fwd1) fftwf_destroy_plan(plan_fwd1);
    if (plan_fwd2) fftwf_destroy_plan(plan_fwd2);
    if (plan_inv) fftwf_destroy_plan(plan_inv);
    if (gray1) fftwf_free(gray1);
    if (gray2) fftwf_free(gray2);
    if (fft1) fftwf_free(fft1);
    if (fft2) fftwf_free(fft2);
    if (cross_power) fftwf_free(cross_power);
    if (correlation) fftwf_free(correlation);
}
// Apply translation to frame (for frame alignment before temporal DWT)
// Writes into output a copy of frame_data shifted by the given vector. The
// quarter-pixel vector is reduced to whole pixels by integer division by 4,
// so any fractional part is discarded. Source coordinates that fall outside
// the frame are clamped to the nearest edge pixel.
static void apply_translation(float *frame_data, int width, int height,
                              int16_t dx_qpel, int16_t dy_qpel, float *output) {
    const int shift_x = dx_qpel / 4;
    const int shift_y = dy_qpel / 4;

    for (int row = 0; row < height; row++) {
        // The source row depends only on the destination row
        int from_row = row - shift_y;
        from_row = CLAMP(from_row, 0, height - 1);

        const float *src_line = frame_data + from_row * width;
        float *dst_line = output + row * width;

        for (int col = 0; col < width; col++) {
            int from_col = col - shift_x;
            from_col = CLAMP(from_col, 0, width - 1);
            dst_line[col] = src_line[from_col];
        }
    }
}
// =============================================================================
// Temporal Subband Quantization
// =============================================================================
// Determine temporal subband level for a frame index after the temporal DWT.
// The result has three tiers:
//   0 - frame_idx below num_frames >> temporal_levels (coarsest low-pass, tLL)
//   1 - otherwise, frame_idx below num_frames / 2
//   2 - everything else (second half of the GOP)
// Example with 16 frames and 2 temporal levels:
//   frames 0-3 -> 0, frames 4-7 -> 1, frames 8-15 -> 2
static int get_temporal_subband_level(int frame_idx, int num_frames, int temporal_levels) {
    const int coarsest_end = num_frames >> temporal_levels;  // e.g. 16 >> 2 = 4
    const int first_half_end = num_frames >> 1;

    int level = 2;  // Finest level high-pass unless an earlier band matches
    if (frame_idx < coarsest_end) {
        level = 0;  // Coarsest temporal level (tLL)
    } else if (frame_idx < first_half_end) {
        level = 1;  // High-pass of the coarser decomposition level
    }
    return level;
}
// Quantise 3D DWT coefficients with SEPARABLE temporal-spatial quantisation
//
// Each entry of gop_coeffs is treated as one temporal subband holding
// spatial_size spatial coefficients. For every subband:
//   1. Its temporal level (0, 1 or 2) comes from get_temporal_subband_level().
//   2. The base quantiser is scaled by TEMPORAL_BASE_SCALE * 2^(BETA * level),
//      rounded and clamped to 1..255.
//      With BETA = 0.8 and base_quantiser = 16 this gives roughly
//      level 0 -> 16, level 1 -> 28, level 2 -> 49.
//   3. The scaled quantiser is passed to
//      quantise_dwt_coefficients_perceptual_per_coeff(), which per the original
//      notes applies the spatial perceptual weighting and dead-zone logic.
//
// gop_coeffs: [frame][pixel] input coefficients (frame = temporal subband)
// quantised:  [frame][pixel] output quantised coefficients
// is_chroma:  forwarded unchanged to the spatial quantiser
static void quantise_3d_dwt_coefficients(tav_encoder_t *enc,
                                         float **gop_coeffs,
                                         int16_t **quantised,
                                         int num_frames,
                                         int spatial_size,
                                         int base_quantiser,
                                         int is_chroma) {
    const float BETA = 0.8f;                 // Temporal scaling exponent
    const float TEMPORAL_BASE_SCALE = 1.0f;  // Level 0 keeps the unscaled base quantiser
    const int last_subband = num_frames - 1;

    for (int subband = 0; subband < num_frames; subband++) {
        const int level = get_temporal_subband_level(subband, num_frames,
                                                     enc->temporal_decomp_levels);

        // Exponential growth of the quantiser with the temporal level
        const float level_scale = TEMPORAL_BASE_SCALE * powf(2.0f, BETA * level);
        const float scaled_quantiser = base_quantiser * level_scale;

        int subband_quantiser = (int)roundf(scaled_quantiser);
        subband_quantiser = CLAMP(subband_quantiser, 1, 255);

        // Spatial quantisation of this temporal subband
        quantise_dwt_coefficients_perceptual_per_coeff(
            enc,
            gop_coeffs[subband],
            quantised[subband],
            spatial_size,
            subband_quantiser,
            enc->width,
            enc->height,
            enc->decomp_levels,
            is_chroma,
            enc->frame_count + subband
        );

        // Only the first and last subband are reported in verbose mode
        const int is_edge_subband = (subband == 0 || subband == last_subband);
        if (enc->verbose && is_edge_subband) {
            printf(" Temporal subband %d: level=%d, tH_base=%d\n",
                   subband, level, subband_quantiser);
        }
    }
}
// =============================================================================
// GOP Management Functions
// =============================================================================
// Add frame to GOP buffer
// Copies the RGB frame and its Y/Co/Cg planes into the next free GOP slot.
// For every slot after the first, the translation relative to the previous
// slot's RGB frame is estimated with phase_correlate_fft(); slot 0 gets a
// zero vector.
// Returns 0 on success, -1 when temporal DWT is disabled or the GOP is full.
static int gop_add_frame(tav_encoder_t *enc, const uint8_t *frame_rgb,
                         const float *frame_y, const float *frame_co, const float *frame_cg) {
    if (!enc->enable_temporal_dwt) {
        return -1;
    }
    if (enc->gop_frame_count >= enc->gop_capacity) {
        return -1;
    }

    const int slot = enc->gop_frame_count;
    const size_t rgb_bytes = enc->width * enc->height * 3;
    const size_t plane_bytes = enc->width * enc->height * sizeof(float);

    // Copy frame data to GOP buffers
    memcpy(enc->gop_rgb_frames[slot], frame_rgb, rgb_bytes);
    memcpy(enc->gop_y_frames[slot], frame_y, plane_bytes);
    memcpy(enc->gop_co_frames[slot], frame_co, plane_bytes);
    memcpy(enc->gop_cg_frames[slot], frame_cg, plane_bytes);

    if (slot == 0) {
        // First frame has no translation
        enc->gop_translation_x[0] = 0;
        enc->gop_translation_y[0] = 0;
    } else {
        // Estimate motion between the previous slot and this one
        phase_correlate_fft(enc->gop_rgb_frames[slot - 1],
                            enc->gop_rgb_frames[slot],
                            enc->width, enc->height,
                            &enc->gop_translation_x[slot],
                            &enc->gop_translation_y[slot]);

        // Verbose output is limited to the first few slots and the last one
        const int worth_logging = (slot < 3 || slot == enc->gop_capacity - 1);
        if (enc->verbose && worth_logging) {
            printf(" GOP frame %d: translation = (%.2f, %.2f) pixels\n",
                   slot,
                   enc->gop_translation_x[slot] / 4.0f,
                   enc->gop_translation_y[slot] / 4.0f);
        }
    }

    enc->gop_frame_count++;
    return 0;
}
// Check if GOP is full
// Returns 1 when temporal DWT is enabled and the buffered frame count has
// reached the GOP capacity, 0 otherwise.
static int gop_is_full(const tav_encoder_t *enc) {
    if (!enc->enable_temporal_dwt) {
        return 0;
    }
    return enc->gop_frame_count >= enc->gop_capacity;
}
// Reset GOP buffer
// Sets the buffered frame count back to zero and, when both translation
// arrays exist, clears every stored translation vector.
static void gop_reset(tav_encoder_t *enc) {
    enc->gop_frame_count = 0;

    if (!enc->gop_translation_x || !enc->gop_translation_y) {
        return;
    }

    const size_t vector_bytes = enc->gop_capacity * sizeof(int16_t);
    memset(enc->gop_translation_x, 0, vector_bytes);
    memset(enc->gop_translation_y, 0, vector_bytes);
}
// Check if GOP should be flushed due to large motion (potential scene change)
// Looks only at the translation vector of the most recently added frame.
// Returns 1 when its magnitude exceeds 24 pixels on either axis, 0 otherwise
// (including when temporal DWT is disabled or fewer than two frames are buffered).
static int gop_should_flush_motion(tav_encoder_t *enc) {
    if (!enc->enable_temporal_dwt) {
        return 0;
    }
    if (enc->gop_frame_count < 2) {
        return 0;
    }

    // Flush threshold in whole pixels, applied per axis
    const float MOTION_THRESHOLD = 24.0f;

    // Vector of the last added frame, converted from quarter-pixel units
    const int newest = enc->gop_frame_count - 1;
    const int16_t dx = enc->gop_translation_x[newest];
    const int16_t dy = enc->gop_translation_y[newest];
    const float dx_pixels = fabsf(dx / 4.0f);
    const float dy_pixels = fabsf(dy / 4.0f);

    if (dx_pixels <= MOTION_THRESHOLD && dy_pixels <= MOTION_THRESHOLD) {
        return 0;
    }

    if (enc->verbose) {
        printf(" Large motion detected (%.1f, %.1f pixels) - flushing GOP\n",
               dx_pixels, dy_pixels);
    }
    return 1;
}
// Flush GOP: apply 3D DWT, quantize, serialize, and write to output
// Returns number of bytes written, or 0 on error
// This function processes the entire GOP and writes all frames with temporal 3D DWT
static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
int *frame_numbers, int actual_gop_size) {
if (actual_gop_size <= 0 || actual_gop_size > enc->gop_capacity) {
fprintf(stderr, "Error: Invalid GOP size: %d\n", actual_gop_size);
return 0;
}
// Allocate working buffers for each channel
const int num_pixels = enc->width * enc->height;
float **gop_y_coeffs = malloc(actual_gop_size * sizeof(float*));
float **gop_co_coeffs = malloc(actual_gop_size * sizeof(float*));
float **gop_cg_coeffs = malloc(actual_gop_size * sizeof(float*));
for (int i = 0; i < actual_gop_size; i++) {
gop_y_coeffs[i] = malloc(num_pixels * sizeof(float));
gop_co_coeffs[i] = malloc(num_pixels * sizeof(float));
gop_cg_coeffs[i] = malloc(num_pixels * sizeof(float));
// Copy GOP frame data to working buffers
memcpy(gop_y_coeffs[i], enc->gop_y_frames[i], num_pixels * sizeof(float));
memcpy(gop_co_coeffs[i], enc->gop_co_frames[i], num_pixels * sizeof(float));
memcpy(gop_cg_coeffs[i], enc->gop_cg_frames[i], num_pixels * sizeof(float));
}
// Step 0.5: Apply motion compensation to align frames before temporal DWT
// This uses the computed translation vectors to align each frame to the previous one
for (int i = 1; i < actual_gop_size; i++) { // Skip frame 0 (reference frame)
float *aligned_y = malloc(num_pixels * sizeof(float));
float *aligned_co = malloc(num_pixels * sizeof(float));
float *aligned_cg = malloc(num_pixels * sizeof(float));
if (!aligned_y || !aligned_co || !aligned_cg) {
fprintf(stderr, "Error: Failed to allocate motion compensation buffers\n");
// Cleanup and skip motion compensation for this GOP
free(aligned_y);
free(aligned_co);
free(aligned_cg);
break;
}
// Apply translation to align this frame
apply_translation(gop_y_coeffs[i], enc->width, enc->height,
enc->gop_translation_x[i], enc->gop_translation_y[i], aligned_y);
apply_translation(gop_co_coeffs[i], enc->width, enc->height,
enc->gop_translation_x[i], enc->gop_translation_y[i], aligned_co);
apply_translation(gop_cg_coeffs[i], enc->width, enc->height,
enc->gop_translation_x[i], enc->gop_translation_y[i], aligned_cg);
// Copy aligned frames back
memcpy(gop_y_coeffs[i], aligned_y, num_pixels * sizeof(float));
memcpy(gop_co_coeffs[i], aligned_co, num_pixels * sizeof(float));
memcpy(gop_cg_coeffs[i], aligned_cg, num_pixels * sizeof(float));
free(aligned_y);
free(aligned_co);
free(aligned_cg);
}
// Step 1: Apply 3D DWT (temporal + spatial) to each channel
// Note: This modifies gop_*_coeffs in-place
dwt_3d_forward(gop_y_coeffs, enc->width, enc->height, actual_gop_size,
enc->decomp_levels, enc->temporal_decomp_levels, enc->wavelet_filter);
dwt_3d_forward(gop_co_coeffs, enc->width, enc->height, actual_gop_size,
enc->decomp_levels, enc->temporal_decomp_levels, enc->wavelet_filter);
dwt_3d_forward(gop_cg_coeffs, enc->width, enc->height, actual_gop_size,
enc->decomp_levels, enc->temporal_decomp_levels, enc->wavelet_filter);
// Step 2: Allocate quantized coefficient buffers
int16_t **quant_y = malloc(actual_gop_size * sizeof(int16_t*));
int16_t **quant_co = malloc(actual_gop_size * sizeof(int16_t*));
int16_t **quant_cg = malloc(actual_gop_size * sizeof(int16_t*));
for (int i = 0; i < actual_gop_size; i++) {
quant_y[i] = malloc(num_pixels * sizeof(int16_t));
quant_co[i] = malloc(num_pixels * sizeof(int16_t));
quant_cg[i] = malloc(num_pixels * sizeof(int16_t));
}
// Step 3: Quantize 3D DWT coefficients with temporal-spatial quantization
quantise_3d_dwt_coefficients(enc, gop_y_coeffs, quant_y, actual_gop_size,
num_pixels, base_quantiser, 0); // Luma
quantise_3d_dwt_coefficients(enc, gop_co_coeffs, quant_co, actual_gop_size,
num_pixels, base_quantiser, 1); // Chroma Co
quantise_3d_dwt_coefficients(enc, gop_cg_coeffs, quant_cg, actual_gop_size,
num_pixels, base_quantiser, 1); // Chroma Cg
// Step 4: Preprocessing and compression
size_t total_bytes_written = 0;
// Write timecode packet for first frame in GOP
write_timecode_packet(output, frame_numbers[0], enc->output_fps, enc->is_ntsc_framerate);
// Single-frame GOP fallback: use traditional I-frame encoding
if (actual_gop_size == 1) {
// Write I-frame packet header (no motion vectors, no GOP overhead)
uint8_t packet_type = TAV_PACKET_IFRAME;
fwrite(&packet_type, 1, 1, output);
total_bytes_written += 1;
// Preprocess single frame using standard variable layout
size_t max_preprocessed_size = (num_pixels * 3 * 2 + 7) / 8 + (num_pixels * 3 * sizeof(int16_t));
uint8_t *preprocessed_buffer = malloc(max_preprocessed_size);
size_t preprocessed_size = preprocess_coefficients_variable_layout(
quant_y[0], quant_co[0], quant_cg[0], NULL,
num_pixels, enc->channel_layout, preprocessed_buffer);
// Compress with Zstd
size_t max_compressed_size = ZSTD_compressBound(preprocessed_size);
uint8_t *compressed_buffer = malloc(max_compressed_size);
size_t compressed_size = ZSTD_compress(compressed_buffer, max_compressed_size,
preprocessed_buffer, preprocessed_size,
enc->zstd_level);
if (ZSTD_isError(compressed_size)) {
fprintf(stderr, "Error: Zstd compression failed for single-frame GOP\n");
free(preprocessed_buffer);
free(compressed_buffer);
// Free all allocated buffers
for (int i = 0; i < actual_gop_size; i++) {
free(gop_y_coeffs[i]);
free(gop_co_coeffs[i]);
free(gop_cg_coeffs[i]);
free(quant_y[i]);
free(quant_co[i]);
free(quant_cg[i]);
}
free(gop_y_coeffs);
free(gop_co_coeffs);
free(gop_cg_coeffs);
free(quant_y);
free(quant_co);
free(quant_cg);
return 0;
}
// Write compressed size (4 bytes) and compressed data
uint32_t compressed_size_32 = (uint32_t)compressed_size;
fwrite(&compressed_size_32, sizeof(uint32_t), 1, output);
fwrite(compressed_buffer, 1, compressed_size, output);
total_bytes_written += sizeof(uint32_t) + compressed_size;
// Cleanup
free(preprocessed_buffer);
free(compressed_buffer);
if (enc->verbose) {
printf("Frame %d (single-frame GOP as I-frame): %zu bytes\n",
frame_numbers[0], compressed_size);
}
} else {
// Multi-frame GOP: use unified 3D DWT encoding
// Write unified GOP packet header
// Packet structure: [packet_type=0x12][gop_size][motion_vectors...][compressed_size][compressed_data]
uint8_t packet_type = TAV_PACKET_GOP_UNIFIED;
fwrite(&packet_type, 1, 1, output);
total_bytes_written += 1;
// Write GOP size (1 byte)
uint8_t gop_size_byte = (uint8_t)actual_gop_size;
fwrite(&gop_size_byte, 1, 1, output);
total_bytes_written += 1;
// Write all motion vectors (quarter-pixel precision) for the entire GOP
for (int t = 0; t < actual_gop_size; t++) {
int16_t dx = enc->gop_translation_x[t];
int16_t dy = enc->gop_translation_y[t];
fwrite(&dx, sizeof(int16_t), 1, output);
fwrite(&dy, sizeof(int16_t), 1, output);
total_bytes_written += 4;
}
// Preprocess ALL frames with unified significance map
// Allocate buffer: maps (2 bits per coeff per frame) + values (int16 per non-zero/±1 coeff)
size_t max_preprocessed_size = (num_pixels * actual_gop_size * 3 * 2 + 7) / 8 +
(num_pixels * actual_gop_size * 3 * sizeof(int16_t));
uint8_t *preprocessed_buffer = malloc(max_preprocessed_size);
size_t preprocessed_size = preprocess_gop_unified(
quant_y, quant_co, quant_cg,
actual_gop_size, num_pixels, enc->channel_layout,
preprocessed_buffer);
// Compress entire GOP with Zstd (single compression for all frames)
size_t max_compressed_size = ZSTD_compressBound(preprocessed_size);
uint8_t *compressed_buffer = malloc(max_compressed_size);
size_t compressed_size = ZSTD_compress(compressed_buffer, max_compressed_size,
preprocessed_buffer, preprocessed_size,
enc->zstd_level);
if (ZSTD_isError(compressed_size)) {
fprintf(stderr, "Error: Zstd compression failed for unified GOP\n");
free(preprocessed_buffer);
free(compressed_buffer);
// Free all allocated buffers and return 0
for (int i = 0; i < actual_gop_size; i++) {
free(gop_y_coeffs[i]);
free(gop_co_coeffs[i]);
free(gop_cg_coeffs[i]);
free(quant_y[i]);
free(quant_co[i]);
free(quant_cg[i]);
}
free(gop_y_coeffs);
free(gop_co_coeffs);
free(gop_cg_coeffs);
free(quant_y);
free(quant_co);
free(quant_cg);
return 0;
}
// Write compressed size (4 bytes) and compressed data
uint32_t compressed_size_32 = (uint32_t)compressed_size;
fwrite(&compressed_size_32, sizeof(uint32_t), 1, output);
fwrite(compressed_buffer, 1, compressed_size, output);
total_bytes_written += sizeof(uint32_t) + compressed_size;
// Cleanup buffers
free(preprocessed_buffer);
free(compressed_buffer);
// Write GOP_SYNC packet to indicate N frames were decoded from this GOP block
uint8_t sync_packet_type = TAV_PACKET_GOP_SYNC;
uint8_t sync_frame_count = (uint8_t)actual_gop_size;
fwrite(&sync_packet_type, 1, 1, output);
fwrite(&sync_frame_count, 1, 1, output);
total_bytes_written += 2;
// Verbose output
if (enc->verbose) {
printf("GOP (%d frames): %zu bytes (3D DWT unified, %.2f bytes/frame)\n",
actual_gop_size, compressed_size, (double)compressed_size / actual_gop_size);
for (int t = 0; t < actual_gop_size; t++) {
printf(" Frame %d: dx=%d/4, dy=%d/4\n",
frame_numbers[t], enc->gop_translation_x[t], enc->gop_translation_y[t]);
}
}
} // End of if/else for single-frame vs multi-frame GOP
// Cleanup GOP buffers
for (int i = 0; i < actual_gop_size; i++) {
free(gop_y_coeffs[i]);
free(gop_co_coeffs[i]);
free(gop_cg_coeffs[i]);
free(quant_y[i]);
free(quant_co[i]);
free(quant_cg[i]);
}
free(gop_y_coeffs);
free(gop_co_coeffs);
free(gop_cg_coeffs);
free(quant_y);
free(quant_co);
free(quant_cg);
return total_bytes_written;
}
// Flush the buffered GOP, cutting it at the first scene change found inside it.
//
// Unless force_flush is non-zero, each buffered RGB frame is compared with its
// predecessor (every 4th pixel is sampled). The GOP is cut in front of the first
// frame whose mean absolute RGB difference exceeds 15.0 or whose share of
// strongly changed samples (difference sum > 90) exceeds 0.4. Only the frames
// before the cut are handed to gop_flush(); the remaining frames are moved to the
// front of the GOP buffers and stay queued. Without a cut the whole GOP is
// flushed and the buffer is reset through gop_reset().
//
// Returns the value gop_flush() returned (bytes written, 0 on error), or 0 when
// no frames are buffered.
//
// NOTE(review): frame_numbers is not shifted when frames are kept after a cut;
// presumably the caller re-derives it for the next flush - confirm.
static size_t gop_process_and_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
                                    int *frame_numbers, int force_flush) {
    if (enc->gop_frame_count == 0) {
        return 0;  // Nothing to flush
    }

    int cut_at = -1;  // Index of the first frame of the new scene, -1 when none

    if (!force_flush) {
        const int pixel_total = enc->width * enc->height;
        const int sampled_total = (pixel_total + 3) / 4;

        for (int cur = 1; cur < enc->gop_frame_count && cut_at < 0; cur++) {
            const uint8_t *before = enc->gop_rgb_frames[cur - 1];
            const uint8_t *after = enc->gop_rgb_frames[cur];
            long long diff_sum = 0;
            int strongly_changed = 0;

            // Sample every 4th pixel for performance
            for (int p = 0; p < pixel_total; p += 4) {
                const uint8_t *a = before + p * 3;
                const uint8_t *b = after + p * 3;
                const int delta = abs(b[0] - a[0]) + abs(b[1] - a[1]) + abs(b[2] - a[2]);
                diff_sum += delta;
                if (delta > 90) {
                    strongly_changed++;
                }
            }

            const double avg_diff = (double)diff_sum / sampled_total;
            const double change_ratio = (double)strongly_changed / sampled_total;

            // Either threshold is sufficient to declare a scene change
            if (avg_diff > 15.0 || change_ratio > 0.4) {
                cut_at = cur;
                if (enc->verbose) {
                    printf("Scene change detected within GOP at frame %d (avg_diff=%.2f, change_ratio=%.2f)\n",
                           frame_numbers[cur], avg_diff, change_ratio);
                }
            }
        }
    }

    // Number of frames that actually go out in this flush
    const int flush_count = (cut_at > 0) ? cut_at : enc->gop_frame_count;
    if (cut_at > 0 && enc->verbose) {
        printf("Trimming GOP from %d to %d frames due to scene change\n",
               enc->gop_frame_count, flush_count);
    }

    const size_t bytes_written = gop_flush(enc, output, base_quantiser, frame_numbers, flush_count);

    if (cut_at <= 0 || cut_at >= enc->gop_frame_count) {
        // Entire GOP was flushed: start over with an empty buffer
        gop_reset(enc);
        return bytes_written;
    }

    // Move the unflushed frames to the front. Buffers are exchanged rather than
    // copied, so every slot keeps owning exactly one allocation.
    const int leftover = enc->gop_frame_count - cut_at;
    for (int dst = 0, src = cut_at; dst < leftover; dst++, src++) {
        uint8_t *spare_rgb = enc->gop_rgb_frames[dst];
        float *spare_y = enc->gop_y_frames[dst];
        float *spare_co = enc->gop_co_frames[dst];
        float *spare_cg = enc->gop_cg_frames[dst];

        enc->gop_rgb_frames[dst] = enc->gop_rgb_frames[src];
        enc->gop_y_frames[dst] = enc->gop_y_frames[src];
        enc->gop_co_frames[dst] = enc->gop_co_frames[src];
        enc->gop_cg_frames[dst] = enc->gop_cg_frames[src];

        enc->gop_rgb_frames[src] = spare_rgb;
        enc->gop_y_frames[src] = spare_y;
        enc->gop_co_frames[src] = spare_co;
        enc->gop_cg_frames[src] = spare_cg;

        enc->gop_translation_x[dst] = enc->gop_translation_x[src];
        enc->gop_translation_y[dst] = enc->gop_translation_y[src];
    }
    enc->gop_frame_count = leftover;

    return bytes_written;
}
// =============================================================================
// Temporal DWT Functions
// =============================================================================
// Forward temporal transform for one spatial location (encoder side).
// temporal_data[i] holds frame i's value at that location; the num_frames
// samples are handed to dwt_53_forward_1d() (LGT 5/3, chosen for reversibility).
// Sequences shorter than two frames are left untouched.
static void dwt_temporal_1d_forward_53(float *temporal_data, int num_frames) {
    if (num_frames >= 2) {
        dwt_53_forward_1d(temporal_data, num_frames);
    }
}
// Inverse temporal transform for one spatial location (decoder side).
// Hands the num_frames samples to dwt_53_inverse_1d(); sequences shorter than
// two frames are left untouched.
static void dwt_temporal_1d_inverse_53(float *temporal_data, int num_frames) {
    if (num_frames >= 2) {
        dwt_53_inverse_1d(temporal_data, num_frames);
    }
}
// Apply 3D DWT: temporal DWT across frames, then spatial DWT on each temporal subband.
//
// gop_data[frame][y * width + x] - GOP buffer organised frame-major; modified in place.
// num_frames      - number of frames in gop_data
// spatial_levels  - decomposition levels passed to dwt_2d_forward_flexible()
// temporal_levels - maximum number of temporal levels; level k works on the first
//                   (num_frames >> k) samples and levels with fewer than two
//                   samples are not run
// spatial_filter  - filter id passed to dwt_2d_forward_flexible()
//
// The temporal axis currently uses dwt_haar_forward_1d(), not the 5/3 wrapper.
// Nothing is done when num_frames < 2 or either dimension is < 2.
// NOTE(review): that early return also skips the spatial transform, so a caller
// passing a single-frame GOP gets its data back untransformed - confirm this is
// what the single-frame path expects.
//
// The function cannot report failure; if the scratch line cannot be allocated
// it prints an error and terminates the process instead of dereferencing NULL.
static void dwt_3d_forward(float **gop_data, int width, int height, int num_frames,
                           int spatial_levels, int temporal_levels, int spatial_filter) {
    if (num_frames < 2 || width < 2 || height < 2) return;

    float *temporal_line = malloc((size_t)num_frames * sizeof *temporal_line);
    if (!temporal_line) {
        fprintf(stderr, "Error: Failed to allocate temporal DWT buffer (%d frames)\n", num_frames);
        exit(EXIT_FAILURE);
    }

    // Step 1: temporal DWT at every spatial location (row-major order)
    const size_t num_pixels = (size_t)width * (size_t)height;
    for (size_t pixel_idx = 0; pixel_idx < num_pixels; pixel_idx++) {
        // Gather the temporal signal for this location
        for (int t = 0; t < num_frames; t++) {
            temporal_line[t] = gop_data[t][pixel_idx];
        }

        // Each level halves the number of samples it works on. The lengths only
        // shrink, so stopping at the first level below two samples is equivalent
        // to skipping it and also keeps the shift count in range.
        for (int level = 0; level < temporal_levels; level++) {
            const int level_frames = num_frames >> level;
            if (level_frames < 2) break;
            dwt_haar_forward_1d(temporal_line, level_frames);
        }

        // Scatter the temporal coefficients back
        for (int t = 0; t < num_frames; t++) {
            gop_data[t][pixel_idx] = temporal_line[t];
        }
    }
    free(temporal_line);

    // Step 2: 2D spatial DWT on each temporal subband (each frame after step 1)
    for (int t = 0; t < num_frames; t++) {
        dwt_2d_forward_flexible(gop_data[t], width, height, spatial_levels, spatial_filter);
    }
}
// Apply inverse 3D DWT: inverse spatial DWT on each temporal subband, then
// inverse temporal DWT at every spatial location. gop_data is modified in place.
//
// Only WAVELET_HAAR has a spatial inverse here; for any other spatial_filter a
// warning is printed per frame and the spatial step is skipped for that frame.
// Nothing is done when num_frames < 2 or either dimension is < 2.
//
// NOTE(review): the temporal inverse below is the 5/3 wrapper, whereas
// dwt_3d_forward() currently runs the temporal axis through
// dwt_haar_forward_1d(). As written the two functions are not a round trip;
// one side has to change - confirm which temporal wavelet is intended.
//
// The function cannot report failure; if the scratch line cannot be allocated
// it prints an error and terminates the process instead of dereferencing NULL.
static void dwt_3d_inverse(float **gop_data, int width, int height, int num_frames,
                           int spatial_levels, int temporal_levels, int spatial_filter) {
    if (num_frames < 2 || width < 2 || height < 2) return;

    // Allocate up front so a failure is detected before any data is modified
    float *temporal_line = malloc((size_t)num_frames * sizeof *temporal_line);
    if (!temporal_line) {
        fprintf(stderr, "Error: Failed to allocate temporal DWT buffer (%d frames)\n", num_frames);
        exit(EXIT_FAILURE);
    }

    // Step 1: inverse 2D spatial DWT on each temporal subband
    for (int t = 0; t < num_frames; t++) {
        if (spatial_filter == WAVELET_HAAR) {
            dwt_2d_haar_inverse_flexible(gop_data[t], width, height, spatial_levels);
        } else {
            // TODO: Implement proper inverse for other wavelets (5/3, 9/7, etc.)
            fprintf(stderr, "Warning: Inverse spatial DWT not fully implemented for filter %d\n", spatial_filter);
        }
    }

    // Levels whose sample count (num_frames >> level) drops below two are never
    // run. Counting the usable levels first keeps the shift count in range even
    // for a large temporal_levels.
    int active_levels = 0;
    while (active_levels < temporal_levels && (num_frames >> active_levels) >= 2) {
        active_levels++;
    }

    // Step 2: inverse temporal DWT at every spatial location (row-major order)
    const size_t num_pixels = (size_t)width * (size_t)height;
    for (size_t pixel_idx = 0; pixel_idx < num_pixels; pixel_idx++) {
        // Gather the temporal coefficients for this location
        for (int t = 0; t < num_frames; t++) {
            temporal_line[t] = gop_data[t][pixel_idx];
        }

        // Undo the levels in reverse order (coarsest first)
        for (int level = active_levels - 1; level >= 0; level--) {
            dwt_temporal_1d_inverse_53(temporal_line, num_frames >> level);
        }

        // Scatter the reconstructed values back
        for (int t = 0; t < num_frames; t++) {
            gop_data[t][pixel_idx] = temporal_line[t];
        }
    }
    free(temporal_line);
}
// Extract padded tile with margins for seamless DWT processing (correct implementation)
static void extract_padded_tile(tav_encoder_t *enc, int tile_x, int tile_y,
float *padded_y, float *padded_co, float *padded_cg) {
const int core_start_x = tile_x * TILE_SIZE_X;
const int core_start_y = tile_y * TILE_SIZE_Y;
// OPTIMISATION: Process row by row with bulk copying for core region
for (int py = 0; py < PADDED_TILE_SIZE_Y; py++) {
// Map padded row to source image row
int src_y = core_start_y + py - TILE_MARGIN;
// Handle vertical boundary conditions with mirroring
if (src_y < 0) src_y = -src_y;
else if (src_y >= enc->height) src_y = enc->height - 1 - (src_y - enc->height);
src_y = CLAMP(src_y, 0, enc->height - 1);
// Calculate source and destination row offsets
const int padded_row_offset = py * PADDED_TILE_SIZE_X;
const int src_row_offset = src_y * enc->width;
// Check if we can do bulk copying for the core region
int core_start_px = TILE_MARGIN;
int core_end_px = TILE_MARGIN + TILE_SIZE_X;
// Check if core region is entirely within frame bounds
int core_src_start_x = core_start_x;
int core_src_end_x = core_start_x + TILE_SIZE_X;
if (core_src_start_x >= 0 && core_src_end_x <= enc->width) {
// OPTIMISATION: Bulk copy core region in one operation
const int src_core_offset = src_row_offset + core_src_start_x;
memcpy(&padded_y[padded_row_offset + core_start_px],
&enc->current_frame_y[src_core_offset],
TILE_SIZE_X * sizeof(float));
memcpy(&padded_co[padded_row_offset + core_start_px],
&enc->current_frame_co[src_core_offset],
TILE_SIZE_X * sizeof(float));
memcpy(&padded_cg[padded_row_offset + core_start_px],
&enc->current_frame_cg[src_core_offset],
TILE_SIZE_X * sizeof(float));
// Handle margin pixels individually (left and right margins)
for (int px = 0; px < core_start_px; px++) {
int src_x = core_start_x + px - TILE_MARGIN;
if (src_x < 0) src_x = -src_x;
src_x = CLAMP(src_x, 0, enc->width - 1);
int src_idx = src_row_offset + src_x;
int padded_idx = padded_row_offset + px;
padded_y[padded_idx] = enc->current_frame_y[src_idx];
padded_co[padded_idx] = enc->current_frame_co[src_idx];
padded_cg[padded_idx] = enc->current_frame_cg[src_idx];
}
for (int px = core_end_px; px < PADDED_TILE_SIZE_X; px++) {
int src_x = core_start_x + px - TILE_MARGIN;
if (src_x >= enc->width) src_x = enc->width - 1 - (src_x - enc->width);
src_x = CLAMP(src_x, 0, enc->width - 1);
int src_idx = src_row_offset + src_x;
int padded_idx = padded_row_offset + px;
padded_y[padded_idx] = enc->current_frame_y[src_idx];
padded_co[padded_idx] = enc->current_frame_co[src_idx];
padded_cg[padded_idx] = enc->current_frame_cg[src_idx];
}
} else {
// Fallback: process entire row pixel by pixel (for edge tiles)
for (int px = 0; px < PADDED_TILE_SIZE_X; px++) {
int src_x = core_start_x + px - TILE_MARGIN;
// Handle horizontal boundary conditions with mirroring
if (src_x < 0) src_x = -src_x;
else if (src_x >= enc->width) src_x = enc->width - 1 - (src_x - enc->width);
src_x = CLAMP(src_x, 0, enc->width - 1);
int src_idx = src_row_offset + src_x;
int padded_idx = padded_row_offset + px;
padded_y[padded_idx] = enc->current_frame_y[src_idx];
padded_co[padded_idx] = enc->current_frame_co[src_idx];
padded_cg[padded_idx] = enc->current_frame_cg[src_idx];
}
}
}
}
// ==============================================================================
// Grain Synthesis Functions
// ==============================================================================
// Forward declaration for perceptual weight function
static float get_perceptual_weight(tav_encoder_t *enc, int level0, int subband_type, int is_chroma, int max_levels);
// Turn one 32-bit random word into a triangular-distributed sample.
// The low and high 16-bit halves are each scaled to [0, 1]; their sum minus one
// lies in [-1.0, 1.0].
static float grain_triangular_noise(uint32_t rng_val) {
    const uint32_t low_half = rng_val & 0xFFFF;
    const uint32_t high_half = (rng_val >> 16) & 0xFFFF;

    const float u1 = low_half / 65535.0f;
    const float u2 = high_half / 65535.0f;

    return (u1 + u2) - 1.0f;
}
// Add deterministic grain noise to a plane of DWT coefficients (encoder side).
//
// Every coefficient whose subband level (get_subband_level_2d) is non-zero gets
// triangular noise added; level 0 (the LL band) is left untouched.
// The amplitude is the same for all coefficients: quantiser clamped to [0, 32],
// times 0.5. The noise for a coefficient is derived from frame_num, its
// level/subband type and its (x, y) position via grain_synthesis_rng().
//
// enc and is_chroma are currently not consulted: an earlier variant scaled the
// amplitude by get_perceptual_weight() when perceptual tuning was on, but that
// path is disabled.
static void apply_grain_synthesis_encoder(tav_encoder_t *enc, float *coeffs, int width, int height,
                                          int decomp_levels, uint32_t frame_num,
                                          int quantiser, int is_chroma) {
    // Loop-invariant noise amplitude
    const float noise_amplitude = FCLAMP(quantiser, 0.0f, 32.0f) * 0.5f;

    for (int row = 0; row < height; row++) {
        float *line = coeffs + row * width;

        for (int col = 0; col < width; col++) {
            const int level = get_subband_level_2d(col, row, width, height, decomp_levels);
            const int subband_type = get_subband_type_2d(col, row, width, height, decomp_levels);

            if (level != 0) {
                // Same inputs always yield the same noise value
                const uint32_t rng_val = grain_synthesis_rng(frame_num, level + subband_type * 31 + 16777219, col, row);
                line[col] += grain_triangular_noise(rng_val) * noise_amplitude;
            }
        }
    }
}
// Run the forward 1D transform selected by filter_type over line[0..length) in
// place. An unrecognised filter_type leaves the line untouched.
static void dwt_forward_1d_by_filter(float *line, int length, int filter_type) {
    if (filter_type == WAVELET_5_3_REVERSIBLE) {
        dwt_53_forward_1d(line, length);
    } else if (filter_type == WAVELET_9_7_IRREVERSIBLE) {
        dwt_97_forward_1d(line, length);
    } else if (filter_type == WAVELET_BIORTHOGONAL_13_7) {
        dwt_bior137_forward_1d(line, length);
    } else if (filter_type == WAVELET_DD4) {
        dwt_dd4_forward_1d(line, length);
    } else if (filter_type == WAVELET_HAAR) {
        dwt_haar_forward_1d(line, length);
    }
}

// 2D DWT forward transform for arbitrary dimensions.
//
// tile_data   - width x height samples, row-major, transformed in place
// levels      - number of decomposition levels; level k works on the top-left
//               (width >> k) x (height >> k) region, rows first, then columns.
//               Processing stops at the first level whose region is empty.
// filter_type - one of the WAVELET_* ids handled by dwt_forward_1d_by_filter()
//
// Nothing is done for non-positive dimensions or levels. The function cannot
// report failure; if the scratch line cannot be allocated it prints an error and
// terminates the process instead of dereferencing NULL.
static void dwt_2d_forward_flexible(float *tile_data, int width, int height, int levels, int filter_type) {
    if (width < 1 || height < 1 || levels < 1) return;

    // Rows and columns are processed one after another, so a single scratch
    // line sized for the longer dimension is enough.
    const int max_size = (width > height) ? width : height;
    float *scratch = malloc((size_t)max_size * sizeof *scratch);
    if (!scratch) {
        fprintf(stderr, "Error: Failed to allocate 2D DWT scratch buffer (%d samples)\n", max_size);
        exit(EXIT_FAILURE);
    }

    for (int level = 0; level < levels; level++) {
        const int current_width = width >> level;
        const int current_height = height >> level;
        if (current_width < 1 || current_height < 1) break;

        // Row transform (horizontal)
        for (int y = 0; y < current_height; y++) {
            float *row = tile_data + (size_t)y * (size_t)width;
            memcpy(scratch, row, (size_t)current_width * sizeof *scratch);
            dwt_forward_1d_by_filter(scratch, current_width, filter_type);
            memcpy(row, scratch, (size_t)current_width * sizeof *scratch);
        }

        // Column transform (vertical)
        for (int x = 0; x < current_width; x++) {
            for (int y = 0; y < current_height; y++) {
                scratch[y] = tile_data[(size_t)y * (size_t)width + (size_t)x];
            }
            dwt_forward_1d_by_filter(scratch, current_height, filter_type);
            for (int y = 0; y < current_height; y++) {
                tile_data[(size_t)y * (size_t)width + (size_t)x] = scratch[y];
            }
        }
    }

    free(scratch);
}

// 2D DWT forward transform for a padded tile of
// PADDED_TILE_SIZE_X x PADDED_TILE_SIZE_Y samples (in place).
// This is the arbitrary-size transform with the padded tile dimensions fixed.
static void dwt_2d_forward_padded(float *tile_data, int levels, int filter_type) {
    dwt_2d_forward_flexible(tile_data, PADDED_TILE_SIZE_X, PADDED_TILE_SIZE_Y, levels, filter_type);
}
// 2D Haar wavelet inverse transform for arbitrary dimensions (in place).
// The original note says it is used for delta coefficient reconstruction.
//
// Levels are undone from the coarsest (levels - 1) down to 0. Level k covers
// the top-left (width >> k) x (height >> k) region; within a level the columns
// are inverted before the rows, mirroring the rows-then-columns order of the
// forward transform. Levels whose region is empty are skipped.
//
// Nothing is done for non-positive dimensions or levels. The function cannot
// report failure; if the scratch line cannot be allocated it prints an error and
// terminates the process instead of dereferencing NULL.
static void dwt_2d_haar_inverse_flexible(float *tile_data, int width, int height, int levels) {
    if (width < 1 || height < 1 || levels < 1) return;

    // Columns and rows are processed one after another, so a single scratch
    // line sized for the longer dimension is enough.
    const int max_size = (width > height) ? width : height;
    float *scratch = malloc((size_t)max_size * sizeof *scratch);
    if (!scratch) {
        fprintf(stderr, "Error: Failed to allocate inverse Haar scratch buffer (%d samples)\n", max_size);
        exit(EXIT_FAILURE);
    }

    for (int level = levels - 1; level >= 0; level--) {
        const int current_width = width >> level;
        const int current_height = height >> level;
        if (current_width < 1 || current_height < 1) continue;

        // Column inverse transform (vertical) - first, to reverse the forward order
        for (int x = 0; x < current_width; x++) {
            for (int y = 0; y < current_height; y++) {
                scratch[y] = tile_data[(size_t)y * (size_t)width + (size_t)x];
            }
            dwt_haar_inverse_1d(scratch, current_height);
            for (int y = 0; y < current_height; y++) {
                tile_data[(size_t)y * (size_t)width + (size_t)x] = scratch[y];
            }
        }

        // Row inverse transform (horizontal) - second
        for (int y = 0; y < current_height; y++) {
            float *row = tile_data + (size_t)y * (size_t)width;
            memcpy(scratch, row, (size_t)current_width * sizeof *scratch);
            dwt_haar_inverse_1d(scratch, current_width);
            memcpy(row, scratch, (size_t)current_width * sizeof *scratch);
        }
    }

    free(scratch);
}
// Encode one channel into its twobit-map and append its "other" values.
// map must be zeroed and hold at least (coeff_count * 2 + 7) / 8 bytes.
// Each "other" value is appended at value_cursor in native int16 byte order;
// the cursor may sit at an odd address, hence the byte-wise copy.
// Returns the advanced value cursor.
static uint8_t *twobit_encode_channel(const int16_t *coeffs, int coeff_count,
                                      uint8_t *map, uint8_t *value_cursor) {
    for (int i = 0; i < coeff_count; i++) {
        const int16_t val = coeffs[i];
        unsigned code;

        if (val == 0) {
            code = 0; // 00
        } else if (val == 1) {
            code = 1; // 01
        } else if (val == -1) {
            code = 2; // 10
        } else {
            code = 3; // 11
            memcpy(value_cursor, &val, sizeof val);
            value_cursor += sizeof val;
        }

        // Four 2-bit codes per byte, lowest bits first. A code never straddles
        // a byte because its bit offset is always 0, 2, 4 or 6.
        map[(size_t)i / 4] |= (uint8_t)(code << (((unsigned)i % 4u) * 2u));
    }
    return value_cursor;
}

// Variable channel layout preprocessing for concatenated maps.
// Significance Map v2.1 (twobit-map): 2 bits per coefficient
//   00=zero, 01=+1, 10=-1, 11=other (stored as int16)
//
// Output layout: one map of (coeff_count * 2 + 7) / 8 bytes for every channel
// the layout enables (order Y, Co, Cg, Alpha), followed by the "other" values of
// those channels in the same order. The value section starts after
// config->num_channels maps.
// NOTE(review): this assumes num_channels equals the number of enabled has_*
// flags - confirm against the channel_layouts table.
//
// A channel that is enabled but whose coefficient pointer is NULL keeps an
// all-zero map and contributes no values.
//
// Returns the number of bytes written to output_buffer.
static size_t preprocess_coefficients_variable_layout(int16_t *coeffs_y, int16_t *coeffs_co, int16_t *coeffs_cg, int16_t *coeffs_alpha,
                                                     int coeff_count, int channel_layout, uint8_t *output_buffer) {
    const channel_layout_config_t *config = &channel_layouts[channel_layout];
    const int map_bytes = (coeff_count * 2 + 7) / 8; // 2 bits per coefficient
    const int total_maps = config->num_channels;
    const size_t maps_size = (size_t)map_bytes * (size_t)total_maps;

    int16_t *const channel_coeffs[4] = {coeffs_y, coeffs_co, coeffs_cg, coeffs_alpha};
    const int channel_active[4] = {config->has_y, config->has_co, config->has_cg, config->has_alpha};

    // Clear significance maps
    memset(output_buffer, 0, maps_size);

    // Values of consecutive channels are stored back to back, so filling the
    // channels one after another with a running cursor yields the final layout
    // without a separate counting pass.
    uint8_t *next_map = output_buffer;
    uint8_t *value_cursor = output_buffer + maps_size;

    for (int ch = 0; ch < 4; ch++) {
        if (!channel_active[ch]) continue;

        uint8_t *map = next_map;
        next_map += map_bytes;

        if (channel_coeffs[ch]) {
            value_cursor = twobit_encode_channel(channel_coeffs[ch], coeff_count, map, value_cursor);
        }
    }

    // Total size: maps + all "other" values
    return (size_t)(value_cursor - output_buffer);
}
// Encode all frames of one channel: one twobit-map per frame (frame k's map
// starts at maps + k * map_bytes_per_frame) plus the channel's "other" values,
// appended at value_cursor in frame order, then coefficient order.
// Frames whose pointer is NULL keep an all-zero map and add no values.
// Values are copied byte-wise because the cursor may sit at an odd address.
// Returns the advanced value cursor.
static uint8_t *gop_twobit_encode_channel(int16_t **frames, int num_frames, int num_pixels,
                                          size_t map_bytes_per_frame,
                                          uint8_t *maps, uint8_t *value_cursor) {
    if (!frames) return value_cursor;

    for (int frame = 0; frame < num_frames; frame++) {
        const int16_t *coeffs = frames[frame];
        if (!coeffs) continue;

        uint8_t *map = maps + (size_t)frame * map_bytes_per_frame;

        for (int i = 0; i < num_pixels; i++) {
            const int16_t val = coeffs[i];
            unsigned code;

            if (val == 0) {
                code = 0; // 00
            } else if (val == 1) {
                code = 1; // 01
            } else if (val == -1) {
                code = 2; // 10
            } else {
                code = 3; // 11
                memcpy(value_cursor, &val, sizeof val);
                value_cursor += sizeof val;
            }

            // Four 2-bit codes per byte, lowest bits first. A code never
            // straddles a byte because its bit offset is always 0, 2, 4 or 6.
            map[(size_t)i / 4] |= (uint8_t)(code << (((unsigned)i % 4u) * 2u));
        }
    }
    return value_cursor;
}

// Unified GOP preprocessing: significance maps and values for all frames and channels.
// Layout: [All_Y_maps][All_Co_maps][All_Cg_maps][All_Y_values][All_Co_values][All_Cg_values]
//
// Each enabled channel (per channel_layouts[channel_layout]) occupies
// num_frames maps of (num_pixels * 2 + 7) / 8 bytes; codes are
// 00=zero, 01=+1, 10=-1, 11=other, and every "other" coefficient is stored as
// an int16 in the channel's value section. The original note gives better
// cross-frame compression as the reason for grouping by channel.
//
// Returns the number of bytes written to output_buffer.
static size_t preprocess_gop_unified(int16_t **quant_y, int16_t **quant_co, int16_t **quant_cg,
                                    int num_frames, int num_pixels, int channel_layout,
                                    uint8_t *output_buffer) {
    const channel_layout_config_t *config = &channel_layouts[channel_layout];
    const int map_bytes_per_frame = (num_pixels * 2 + 7) / 8; // 2 bits per coefficient
    const size_t channel_map_bytes = (size_t)map_bytes_per_frame * (size_t)num_frames;

    int16_t **const channel_frames[3] = {quant_y, quant_co, quant_cg};
    const int channel_active[3] = {config->has_y, config->has_co, config->has_cg};

    // Map section: one contiguous run of per-frame maps for each enabled channel
    uint8_t *channel_maps[3];
    uint8_t *cursor = output_buffer;
    for (int ch = 0; ch < 3; ch++) {
        channel_maps[ch] = cursor;
        if (channel_active[ch]) cursor += channel_map_bytes;
    }

    // Clear all map bytes; codes are OR-ed in below
    memset(output_buffer, 0, (size_t)(cursor - output_buffer));

    // Value section follows the maps. Channels are stored back to back, so
    // filling them in order with a running cursor gives the final layout
    // without a separate counting pass.
    for (int ch = 0; ch < 3; ch++) {
        if (!channel_active[ch]) continue;
        cursor = gop_twobit_encode_channel(channel_frames[ch], num_frames, num_pixels,
                                           (size_t)map_bytes_per_frame, channel_maps[ch], cursor);
    }

    // Total size: maps + all "other" values
    return (size_t)(cursor - output_buffer);
}
// Dead-zone threshold for one luma subband.
// Pattern (subband codes as named in the original comments: 3 = HH, 1/2 = LH/HL;
// level 1 is the finest level):
//   HH1            -> dead_zone_threshold * DEAD_ZONE_FINEST_SCALE
//   LH1, HL1, HH2  -> dead_zone_threshold * DEAD_ZONE_FINE_SCALE
//   everything else (LH2/HL2 and all coarser levels) -> 0, i.e. no dead zone,
//   to preserve structural information
static float dead_zone_for_subband(int level, int subband_type, float dead_zone_threshold) {
    if (level == 1) {
        if (subband_type == 3) {
            return dead_zone_threshold * DEAD_ZONE_FINEST_SCALE;
        }
        if (subband_type == 1 || subband_type == 2) {
            return dead_zone_threshold * DEAD_ZONE_FINE_SCALE;
        }
    } else if (level == 2 && subband_type == 3) {
        return dead_zone_threshold * DEAD_ZONE_FINE_SCALE;
    }
    return 0.0f;
}

// Quantisation for DWT subbands.
//
// coeffs / quantised  - size input coefficients and size int16 outputs
// quantiser           - step size, clamped to [1, 4096]
// dead_zone_threshold - when > 0 and the plane is luma (!is_chroma), quotients
//                       whose magnitude does not exceed the subband's dead zone
//                       (see dead_zone_for_subband) become 0. Chroma skips the
//                       dead zone (already heavily quantised; avoids colour banding).
// width, height, decomp_levels - passed to get_subband_level()/get_subband_type()
//
// Each quotient is rounded half away from zero and saturated to the int16
// range. Saturation happens in the float domain, before the conversion to an
// integer, because converting an out-of-range float to int is undefined; a NaN
// quotient is stored as 0.
static void quantise_dwt_coefficients(float *coeffs, int16_t *quantised, int size, int quantiser, float dead_zone_threshold, int width, int height, int decomp_levels, int is_chroma) {
    float effective_q = quantiser;
    effective_q = FCLAMP(effective_q, 1.0f, 4096.0f);

    const int use_dead_zone = (dead_zone_threshold > 0.0f && !is_chroma);

    for (int i = 0; i < size; i++) {
        float quantised_val = coeffs[i] / effective_q;

        if (use_dead_zone) {
            const int level = get_subband_level(i, width, height, decomp_levels);
            const int subband_type = get_subband_type(i, width, height, decomp_levels);
            const float level_threshold = dead_zone_for_subband(level, subband_type, dead_zone_threshold);

            if (fabsf(quantised_val) <= level_threshold) {
                quantised_val = 0.0f;
            }
        }

        // Round half away from zero, then saturate while still in float
        float rounded = quantised_val + (quantised_val >= 0 ? 0.5f : -0.5f);
        if (isnan(rounded)) {
            rounded = 0.0f;
        } else if (rounded > 32767.0f) {
            rounded = 32767.0f;
        } else if (rounded < -32768.0f) {
            rounded = -32768.0f;
        }

        // In range now, so the truncating conversion is well defined
        quantised[i] = (int16_t)(int)rounded;
    }
}
// Luma LH weight curve of perceptual model 3.
// Interactive plot: https://www.desmos.com/calculator/mjlpwqm8ge
//
// For level >= 4 the curve is a straight line through (4, 1.2); below that it
// is a cubic that meets the line at level 4. `quality` is accepted for
// signature symmetry with the other model functions but is not used: the curve
// is evaluated with a fixed Q because the quantiser scales the result anyway.
static float perceptual_model3_LH(int quality, float level) {
    const float H4 = 1.2f;      // value of the curve at level 4
    const float Q = 2.f;        // fixed curve parameter
    const float Q12 = Q * 12.f;
    const float x = level;

    if (level >= 4) {
        // linear segment
        return H4 - ((Q + 1.f) / 15.f) * (x - 4.f);
    }

    // cubic segment
    const float C3 = -1.f / 45.f * (Q12 + 92);
    return (-x / 180.f) * (Q12 + 5*x*x - 60*x + 252) - C3 + H4;
}
// Luma HL weight: the LH weight passed through a per-quality affine map,
// LH * ANISOTROPY_MULT[quality] + ANISOTROPY_BIAS[quality] (fused multiply-add).
static float perceptual_model3_HL(int quality, float LH) {
    const float gain = ANISOTROPY_MULT[quality];
    const float bias = ANISOTROPY_BIAS[quality];
    return fmaf(LH, gain, bias);
}
// Linear interpolation: returns x when a == 0 and y when a == 1.
static float lerp(float x, float y, float a) {
    const float share_of_x = x * (1.f - a);
    const float share_of_y = y * a;
    return share_of_x + share_of_y;
}
// Luma HH weight: a blend between the LH and HL weights.
// The blend factor is 0.5 * (sqrt(level) - 1) + 0.5, i.e. 0.5 at level 1 and
// 1.0 (pure HL) at level 4.
static float perceptual_model3_HH(float LH, float HL, float level) {
    const float root_minus_one = sqrtf(level) - 1.f;
    const float Kx = fmaf(root_minus_one, 0.5f, 0.5f);
    return lerp(LH, HL, Kx);
}
/*static float perceptual_model3_HH(float LH, float HL, float level) {
return (HL / LH) * 1.44f;
}*/
// Luma LL weight, extrapolated from the LH curve: with n = LH(level) and
// m = LH(level - 1) / n, the result is n / m.
static float perceptual_model3_LL(int quality, float level) {
    const float at_level = perceptual_model3_LH(quality, level);
    const float step_ratio = perceptual_model3_LH(quality, level - 1) / at_level;
    return at_level / step_ratio;
}
// Chroma base curve: a straight line through (4, 1) whose slope magnitude is
// 1 / (0.5 * quality^2 + 1), so the line gets flatter as quality rises.
static float perceptual_model3_chroma_basecurve(int quality, float level) {
    const float slope = 1.0f / (0.5f * quality * quality + 1.0f);
    return 1.0f - slope * (level - 4.0f);
}
#define FOUR_PIXEL_DETAILER 0.88f
#define TWO_PIXEL_DETAILER 0.92f

// Perceptual quantiser weight for one subband.
//
//   enc           encoder state; only enc->quality_level is read
//   level0        one-based decomposition level of the subband
//   subband_type  0 = LL, 1 = LH, 2 = HL, 3 = HH
//   is_chroma     non-zero selects the chroma model
//   max_levels    total number of decomposition levels
//
// level0 is first rescaled from [1, max_levels] onto the model's native
// [1, 6] axis. With a single decomposition level that rescale would be 0/0
// (NaN, which then poisons every weight and the quantised output), so the
// model level is pinned to 1 in that case.
//
// Returns the multiplier applied to the base quantiser for that subband.
static float get_perceptual_weight(tav_encoder_t *enc, int level0, int subband_type, int is_chroma, int max_levels) {
    float level = 1.0f;
    if (max_levels != 1) {
        level = 1.0f + ((level0 - 1.0f) / (max_levels - 1.0f)) * 5.0f;
    }

    if (!is_chroma) {
        // LUMA. Strategy: keep more horizontal detail.

        // LL: extrapolated from the LH curve
        if (subband_type == 0) {
            return perceptual_model3_LL(enc->quality_level, level);
        }

        // LH
        const float LH = perceptual_model3_LH(enc->quality_level, level);
        if (subband_type == 1) {
            return LH;
        }

        // HL and HH share a small reduction around model levels 2 and 3
        // (the "two pixel" / "four pixel" detailers).
        const float HL = perceptual_model3_HL(enc->quality_level, LH);
        float detailer = 1.0f;
        if (2.2f >= level && level >= 1.8f) {
            detailer = TWO_PIXEL_DETAILER;
        } else if (3.2f >= level && level >= 2.8f) {
            detailer = FOUR_PIXEL_DETAILER;
        }

        if (subband_type == 2) {
            return HL * detailer;
        }
        // HH
        return perceptual_model3_HH(LH, HL, level) * detailer;
    }

    // CHROMA: quantised more aggressively than luma. The base curve is
    // evaluated one model level lower, and every detail weight is clamped to
    // [1, 100] so chroma is never quantised more finely than the base step.
    const float base = perceptual_model3_chroma_basecurve(enc->quality_level, level - 1);
    switch (subband_type) {
    case 0:  // LL
        return 1.0f;
    case 1:  // LH
        return FCLAMP(base, 1.0f, 100.0f);
    case 2:  // HL
        return FCLAMP(base * ANISOTROPY_MULT_CHROMA[enc->quality_level], 1.0f, 100.0f);
    default: // HH
        return FCLAMP(base * ANISOTROPY_MULT_CHROMA[enc->quality_level] + ANISOTROPY_BIAS_CHROMA[enc->quality_level], 1.0f, 100.0f);
    }
}
// Decomposition level of the coefficient at (x, y) in the 2D spatial
// (quad-tree) DWT layout.
//
// The plane is halved repeatedly. The first halving at which (x, y) falls
// outside the top-left quadrant identifies the level, so level 1 is the finest
// (its detail bands are half the plane size). If the point is still inside the
// top-left quadrant after `decomp_levels` halvings it belongs to the LL band.
//
// Returns 1..decomp_levels for detail bands, 0 for LL.
static int get_subband_level_2d(int x, int y, int width, int height, int decomp_levels) {
    int level = 1;
    while (level <= decomp_levels) {
        // Shrink the window to the top-left (LL) quadrant of the current level
        width >>= 1;
        height >>= 1;

        // Outside that quadrant -> LH, HL or HH of this level
        if (x >= width || y >= height) {
            return level;
        }
        level++;
    }
    // Never left the LL quadrant
    return 0;
}
// Subband type of the coefficient at (x, y) in the 2D spatial (quad-tree) DWT
// layout.
//
// Walks the same quadrant descent as get_subband_level_2d() and reports which
// quadrant the point landed in when it left the top-left one:
//   1 = LH (top-right), 2 = HL (bottom-left), 3 = HH (bottom-right).
// Returns 0 (LL) if the point never leaves the top-left quadrant.
static int get_subband_type_2d(int x, int y, int width, int height, int decomp_levels) {
    for (int remaining = decomp_levels; remaining > 0; remaining--) {
        width >>= 1;
        height >>= 1;

        const int in_right_half = (x >= width);
        const int in_bottom_half = (y >= height);

        if (in_right_half || in_bottom_half) {
            // right only -> 1, bottom only -> 2, both -> 3
            return in_right_half + 2 * in_bottom_half;
        }
        // still inside the LL quadrant: keep descending
    }
    return 0;
}
// Legacy linear-index wrappers kept for compatibility.
// Treats linear_idx as a row-major index into a plane `width` samples wide and
// forwards the resulting (x, y) to get_subband_level_2d().
static int get_subband_level(int linear_idx, int width, int height, int decomp_levels) {
    const int row = linear_idx / width;
    const int col = linear_idx % width;
    return get_subband_level_2d(col, row, width, height, decomp_levels);
}
// Linear-index wrapper around get_subband_type_2d(): linear_idx is treated as
// a row-major index into a plane `width` samples wide.
static int get_subband_type(int linear_idx, int width, int height, int decomp_levels) {
    const int row = linear_idx / width;
    const int col = linear_idx % width;
    return get_subband_type_2d(col, row, width, height, decomp_levels);
}
// Perceptual weight for the coefficient at `linear_idx`, using the linear
// subband layout (the original comment states this is the layout the decoder
// uses):
//
//   [ LL ][ LH HL HH ] [ LH HL HH ] ...
//
// where LL has (width >> decomp_levels) x (height >> decomp_levels) entries
// and the triple visited for loop value `level` (decomp_levels down to 1) has
// (width >> (decomp_levels - level + 1)) x (height >> (decomp_levels - level + 1))
// entries per band.
//
// NOTE(review): under this numbering the half-size bands carry
// level == decomp_levels, whereas get_subband_level_2d() calls the half-size
// bands level 1. The two numberings run in opposite directions - confirm this
// is intended and matches the decoder before touching either.
//
// Returns 1.0f for indices past the last subband.
static float get_perceptual_weight_for_position(tav_encoder_t *enc, int linear_idx, int width, int height, int decomp_levels, int is_chroma) {
    // LL band comes first
    const int ll_size = (width >> decomp_levels) * (height >> decomp_levels);
    if (linear_idx < ll_size) {
        return get_perceptual_weight(enc, decomp_levels, 0, is_chroma, decomp_levels);
    }

    // Then one LH/HL/HH triple per level; band_end is the exclusive end index
    // of the band currently being tested.
    int band_end = ll_size;
    for (int level = decomp_levels; level >= 1; level--) {
        const int shift = decomp_levels - level + 1;
        const int subband_size = (width >> shift) * (height >> shift);

        for (int subband_type = 1; subband_type <= 3; subband_type++) {
            band_end += subband_size;
            if (linear_idx < band_end) {
                return get_perceptual_weight(enc, level, subband_type, is_chroma, decomp_levels);
            }
        }
    }

    // Index lies beyond every subband
    return 1.0f;
}
// Perceptual quantisation of one channel of DWT coefficients.
//
// Same pipeline as quantise_dwt_coefficients(), except that the quantiser used
// for coefficient i is the base quantiser (clamped to [1, 4096]) multiplied by
// get_perceptual_weight_for_position(i).
//
//   enc             encoder state; supplies the dead-zone threshold and is
//                   forwarded to the weight lookup
//   coeffs          input coefficients, `size` entries (not modified)
//   quantised       output buffer, `size` entries
//   base_quantiser  quantisation step before weighting
//   width, height   dimensions used to classify coefficient index i
//   decomp_levels   number of DWT decomposition levels
//   is_chroma       non-zero for chroma channels (chroma weights, no dead-zone)
//   frame_count     unused
//
// Dead-zone pattern (luma only; level 1 is the finest level):
//   level 1, HH        -> threshold * DEAD_ZONE_FINEST_SCALE
//   level 1, LH / HL   -> threshold * DEAD_ZONE_FINE_SCALE
//   level 2, HH        -> threshold * DEAD_ZONE_FINE_SCALE
//   everything else    -> no dead-zone (preserves structural information)
static void quantise_dwt_coefficients_perceptual_per_coeff(tav_encoder_t *enc,
                                                           float *coeffs, int16_t *quantised, int size,
                                                           int base_quantiser, int width, int height,
                                                           int decomp_levels, int is_chroma, int frame_count) {
    (void)frame_count;

    float effective_base_q = base_quantiser;
    effective_base_q = FCLAMP(effective_base_q, 1.0f, 4096.0f);

    const int use_dead_zone = (enc->dead_zone_threshold > 0.0f && !is_chroma);

    for (int i = 0; i < size; i++) {
        // Per-coefficient step: base step scaled by the subband weight
        const float weight = get_perceptual_weight_for_position(enc, i, width, height, decomp_levels, is_chroma);
        const float effective_q = effective_base_q * weight;
        float quantised_val = coeffs[i] / effective_q;

        if (use_dead_zone) {
            const int level = get_subband_level(i, width, height, decomp_levels);
            const int subband_type = get_subband_type(i, width, height, decomp_levels);
            float level_threshold = 0.0f;

            if (level == 1) {
                if (subband_type == 3) {
                    // HH at the finest level: full dead-zone
                    level_threshold = enc->dead_zone_threshold * DEAD_ZONE_FINEST_SCALE;
                } else if (subband_type == 1 || subband_type == 2) {
                    // LH / HL at the finest level: reduced dead-zone
                    level_threshold = enc->dead_zone_threshold * DEAD_ZONE_FINE_SCALE;
                }
            } else if (level == 2 && subband_type == 3) {
                // HH at the second-finest level: reduced dead-zone
                level_threshold = enc->dead_zone_threshold * DEAD_ZONE_FINE_SCALE;
            }

            if (fabsf(quantised_val) <= level_threshold) {
                quantised_val = 0.0f;
            }
        }

        // Round half away from zero, then saturate while still in floating
        // point: converting an out-of-range (or NaN) float to int is undefined
        // behaviour, so the range check has to come before the conversion.
        const float rounded = quantised_val + (quantised_val >= 0 ? 0.5f : -0.5f);
        int16_t out;
        if (isnan(rounded)) {
            out = 0;
        } else if (rounded >= 32767.0f) {
            out = INT16_MAX;
        } else if (rounded <= -32768.0f) {
            out = INT16_MIN;
        } else {
            out = (int16_t)(int)rounded;
        }
        quantised[i] = out;
    }
}
// Copies the rect_w x rect_h rectangle whose top-left corner is (x0, y0) out of
// a row-major plane `width` samples wide, appending it row by row to
// linear_subbands starting at out_pos. Samples outside the plane are skipped.
// Returns the output position after the last sample written.
static int tav_append_spatial_rect(const int16_t *spatial_2d, int16_t *linear_subbands, int out_pos,
                                   int x0, int y0, int rect_w, int rect_h,
                                   int width, int height) {
    for (int y = y0; y < y0 + rect_h; y++) {
        for (int x = x0; x < x0 + rect_w; x++) {
            if (y < height && x < width) {
                linear_subbands[out_pos++] = spatial_2d[y * width + x];
            }
        }
    }
    return out_pos;
}

// Convert a 2D spatial (quad-tree) DWT plane into the linear subband layout
// (for decoder compatibility).
//
// Output order: the LL band ((width >> decomp_levels) x (height >> decomp_levels),
// taken from the top-left corner), then for loop value `level` going from
// decomp_levels down to 1 the LH (top-right), HL (bottom-left) and HH
// (bottom-right) rectangles of size
// (width >> (decomp_levels - level + 1)) x (height >> (decomp_levels - level + 1)).
// The first triple written is therefore the half-size one.
static void convert_2d_to_linear_layout(const int16_t *spatial_2d, int16_t *linear_subbands,
                                        int width, int height, int decomp_levels) {
    int out_pos = 0;

    // LL band
    out_pos = tav_append_spatial_rect(spatial_2d, linear_subbands, out_pos,
                                      0, 0, width >> decomp_levels, height >> decomp_levels,
                                      width, height);

    // Detail bands
    for (int level = decomp_levels; level >= 1; level--) {
        const int shift = decomp_levels - level + 1;
        const int band_w = width >> shift;
        const int band_h = height >> shift;

        // LH: top-right
        out_pos = tav_append_spatial_rect(spatial_2d, linear_subbands, out_pos,
                                          band_w, 0, band_w, band_h, width, height);
        // HL: bottom-left
        out_pos = tav_append_spatial_rect(spatial_2d, linear_subbands, out_pos,
                                          0, band_h, band_w, band_h, width, height);
        // HH: bottom-right
        out_pos = tav_append_spatial_rect(spatial_2d, linear_subbands, out_pos,
                                          band_w, band_h, band_w, band_h, width, height);
    }
}
// Serialise one tile into `buffer` (the caller later compresses the result).
//
// Layout written:
//   byte 0     tile mode (TAV_MODE_*)
//   byte 1     qY override: 0 when not in bitrate mode, otherwise index + 1
//   byte 2..3  qCo / qCg overrides, always 0
//   then, unless mode is TAV_MODE_SKIP, the output of
//   preprocess_coefficients_variable_layout() for the quantised Y/Co/Cg data.
//
// Parameters:
//   enc                        encoder state (quantisers, flags, reusable buffers,
//                              per-tile reference coefficients)
//   tile_x, tile_y             tile position; selects the reference slot
//                              tile_y * enc->tiles_x + tile_x
//   tile_y/co/cg_data          DWT coefficients of the tile, tile_size floats each;
//                              not read when mode is TAV_MODE_SKIP
//   mode                       TAV_MODE_SKIP, TAV_MODE_INTRA or TAV_MODE_DELTA
//   buffer                     destination; must be large enough for the header
//                              plus the serialised coefficients
//
// Side effects: updates enc->previous_coeffs_{y,co,cg} for the tile (INTRA and
// DELTA), and may write debug dump files and set debugDumpMade.
//
// Returns the number of bytes written to `buffer`.
static size_t serialise_tile_data(tav_encoder_t *enc, int tile_x, int tile_y,
const float *tile_y_data, const float *tile_co_data, const float *tile_cg_data,
uint8_t mode, uint8_t *buffer) {
size_t offset = 0;
// Write tile header
buffer[offset++] = mode;
// Use adjusted quantiser from bitrate control, or base quantiser if not in bitrate mode
int qY_override = enc->bitrate_mode ? quantiser_float_to_int_dithered(enc) : enc->quantiser_y;
buffer[offset++] = (!enc->bitrate_mode) ? 0 : qY_override + 1; // qY override; must be stored with bias of 1
buffer[offset++] = 0; // qCo override, currently unused
buffer[offset++] = 0; // qCg override, currently unused
// The stored quantiser values are indices into QLUT; the looked-up entries are the actual steps
int this_frame_qY = QLUT[qY_override];
int this_frame_qCo = QLUT[enc->quantiser_co];
int this_frame_qCg = QLUT[enc->quantiser_cg];
if (mode == TAV_MODE_SKIP) {
// No coefficient data for SKIP/MOTION modes
return offset;
}
// Quantise and serialise DWT coefficients
const int tile_size = enc->monoblock ?
(enc->width * enc->height) : // Monoblock mode: full frame
(PADDED_TILE_SIZE_X * PADDED_TILE_SIZE_Y); // Standard mode: padded tiles
// OPTIMISATION: Use pre-allocated buffers instead of malloc/free per tile
int16_t *quantised_y = enc->reusable_quantised_y;
int16_t *quantised_co = enc->reusable_quantised_co;
int16_t *quantised_cg = enc->reusable_quantised_cg;
// NOTE(review): quantised_alpha is fetched but never used in this function (NULL is passed for alpha below)
int16_t *quantised_alpha = enc->reusable_quantised_alpha;
// Debug: check DWT coefficients before quantisation
/*if (tile_x == 0 && tile_y == 0) {
printf("Encoder Debug: Tile (0,0) - DWT Y coeffs before quantisation (first 16): ");
for (int i = 0; i < 16; i++) {
printf("%.2f ", tile_y_data[i]);
}
printf("\n");
printf("Encoder Debug: Quantisers - Y=%d, Co=%d, Cg=%d, rcf=%.2f\n",
this_frame_qY, this_frame_qCo, this_frame_qCg);
}*/
// NOTE(review): every quantiser call below passes enc->width / enc->height as the
// plane dimensions, even when tile_size is the padded-tile size (non-monoblock mode).
// Confirm that this matches what the decoder assumes for tiled streams.
if (mode == TAV_MODE_INTRA) {
// INTRA mode: quantise coefficients directly and store for future reference
if (enc->perceptual_tuning) {
// Perceptual quantisation: EXACTLY like uniform but with per-coefficient weights
quantise_dwt_coefficients_perceptual_per_coeff(enc, (float*)tile_y_data, quantised_y, tile_size, this_frame_qY, enc->width, enc->height, enc->decomp_levels, 0, enc->frame_count);
quantise_dwt_coefficients_perceptual_per_coeff(enc, (float*)tile_co_data, quantised_co, tile_size, this_frame_qCo, enc->width, enc->height, enc->decomp_levels, 1, enc->frame_count);
quantise_dwt_coefficients_perceptual_per_coeff(enc, (float*)tile_cg_data, quantised_cg, tile_size, this_frame_qCg, enc->width, enc->height, enc->decomp_levels, 1, enc->frame_count);
} else {
// Legacy uniform quantisation
quantise_dwt_coefficients((float*)tile_y_data, quantised_y, tile_size, this_frame_qY, enc->dead_zone_threshold, enc->width, enc->height, enc->decomp_levels, 0);
quantise_dwt_coefficients((float*)tile_co_data, quantised_co, tile_size, this_frame_qCo, enc->dead_zone_threshold, enc->width, enc->height, enc->decomp_levels, 1);
quantise_dwt_coefficients((float*)tile_cg_data, quantised_cg, tile_size, this_frame_qCg, enc->dead_zone_threshold, enc->width, enc->height, enc->decomp_levels, 1);
}
// Store current coefficients for future delta reference
// NOTE(review): the reference stored here is the unquantised input, whereas the DELTA
// branch stores a dequantised reconstruction - confirm this is what the decoder holds
// after an INTRA tile.
int tile_idx = tile_y * enc->tiles_x + tile_x;
float *prev_y = enc->previous_coeffs_y + (tile_idx * tile_size);
float *prev_co = enc->previous_coeffs_co + (tile_idx * tile_size);
float *prev_cg = enc->previous_coeffs_cg + (tile_idx * tile_size);
memcpy(prev_y, tile_y_data, tile_size * sizeof(float));
memcpy(prev_co, tile_co_data, tile_size * sizeof(float));
memcpy(prev_cg, tile_cg_data, tile_size * sizeof(float));
} else if (mode == TAV_MODE_DELTA) {
// DELTA mode: compute coefficient deltas and quantise them
int tile_idx = tile_y * enc->tiles_x + tile_x;
float *prev_y = enc->previous_coeffs_y + (tile_idx * tile_size);
float *prev_co = enc->previous_coeffs_co + (tile_idx * tile_size);
float *prev_cg = enc->previous_coeffs_cg + (tile_idx * tile_size);
// Compute deltas: delta = current - previous
// NOTE(review): the three allocations below are not checked for NULL before use
float *delta_y = malloc(tile_size * sizeof(float));
float *delta_co = malloc(tile_size * sizeof(float));
float *delta_cg = malloc(tile_size * sizeof(float));
for (int i = 0; i < tile_size; i++) {
delta_y[i] = tile_y_data[i] - prev_y[i];
delta_co[i] = tile_co_data[i] - prev_co[i];
delta_cg[i] = tile_cg_data[i] - prev_cg[i];
}
// Apply Haar DWT to deltas if enabled (improves compression of sparse deltas)
if (enc->delta_haar_levels > 0) {
int tile_width, tile_height;
if (enc->monoblock) {
tile_width = enc->width;
tile_height = enc->height;
} else {
tile_width = PADDED_TILE_SIZE_X;
tile_height = PADDED_TILE_SIZE_Y;
}
dwt_2d_forward_flexible(delta_y, tile_width, tile_height, enc->delta_haar_levels, WAVELET_HAAR);
dwt_2d_forward_flexible(delta_co, tile_width, tile_height, enc->delta_haar_levels, WAVELET_HAAR);
dwt_2d_forward_flexible(delta_cg, tile_width, tile_height, enc->delta_haar_levels, WAVELET_HAAR);
}
// Quantise the deltas with uniform quantisation (perceptual tuning is for original coefficients, not deltas)
quantise_dwt_coefficients(delta_y, quantised_y, tile_size, this_frame_qY, enc->dead_zone_threshold, enc->width, enc->height, enc->decomp_levels, 0);
quantise_dwt_coefficients(delta_co, quantised_co, tile_size, this_frame_qCo, enc->dead_zone_threshold, enc->width, enc->height, enc->decomp_levels, 1);
quantise_dwt_coefficients(delta_cg, quantised_cg, tile_size, this_frame_qCg, enc->dead_zone_threshold, enc->width, enc->height, enc->decomp_levels, 1);
// Reconstruct coefficients like decoder will (previous + uniform_dequantised_delta)
// The delta buffers are reused to hold the dequantised values
for (int i = 0; i < tile_size; i++) {
float dequant_delta_y = (float)quantised_y[i] * this_frame_qY;
float dequant_delta_co = (float)quantised_co[i] * this_frame_qCo;
float dequant_delta_cg = (float)quantised_cg[i] * this_frame_qCg;
delta_y[i] = dequant_delta_y;
delta_co[i] = dequant_delta_co;
delta_cg[i] = dequant_delta_cg;
}
// Apply inverse Haar DWT to reconstructed deltas if enabled
if (enc->delta_haar_levels > 0) {
int tile_width, tile_height;
if (enc->monoblock) {
tile_width = enc->width;
tile_height = enc->height;
} else {
tile_width = PADDED_TILE_SIZE_X;
tile_height = PADDED_TILE_SIZE_Y;
}
dwt_2d_haar_inverse_flexible(delta_y, tile_width, tile_height, enc->delta_haar_levels);
dwt_2d_haar_inverse_flexible(delta_co, tile_width, tile_height, enc->delta_haar_levels);
dwt_2d_haar_inverse_flexible(delta_cg, tile_width, tile_height, enc->delta_haar_levels);
}
// Add reconstructed deltas to previous coefficients
// (the stored reference thereby accumulates quantised deltas rather than raw input)
for (int i = 0; i < tile_size; i++) {
prev_y[i] = prev_y[i] + delta_y[i];
prev_co[i] = prev_co[i] + delta_co[i];
prev_cg[i] = prev_cg[i] + delta_cg[i];
}
free(delta_y);
free(delta_co);
free(delta_cg);
}
// Debug: check quantised coefficients after quantisation
/*if (tile_x == 0 && tile_y == 0) {
printf("Encoder Debug: Tile (0,0) - Quantised Y coeffs (first 16): ");
for (int i = 0; i < 16; i++) {
printf("%d ", quantised_y[i]);
}
printf("\n");
}*/
// Preprocess and write quantised coefficients using variable channel layout concatenated significance maps
size_t total_compressed_size = preprocess_coefficients_variable_layout(quantised_y, quantised_co, quantised_cg, NULL,
tile_size, enc->channel_layout, buffer + offset);
offset += total_compressed_size;
// DEBUG: Dump raw DWT coefficients for specified frame when it's an intra-frame
// Runs at most once per process (guarded by debugDumpMade), for the first INTRA tile whose
// frame number lies in [debugDumpFrameTarget - 1, debugDumpFrameTarget + 2].
if (!debugDumpMade && debugDumpFrameTarget >= 0 &&
enc->frame_count >= debugDumpFrameTarget - 1 && enc->frame_count <= debugDumpFrameTarget + 2 &&
(mode == TAV_MODE_INTRA)) {
char filename[256];
size_t data_size = tile_size * sizeof(int16_t);
// Dump Y channel coefficients
snprintf(filename, sizeof(filename), "frame_%03d.tavframe.y.bin", enc->frame_count);
FILE *debug_fp = fopen(filename, "wb");
if (debug_fp) {
fwrite(quantised_y, 1, data_size, debug_fp);
fclose(debug_fp);
printf("DEBUG: Dumped Y coefficients to %s (%zu bytes)\n", filename, data_size);
}
// Dump Co channel coefficients
snprintf(filename, sizeof(filename), "frame_%03d.tavframe.co.bin", enc->frame_count);
debug_fp = fopen(filename, "wb");
if (debug_fp) {
fwrite(quantised_co, 1, data_size, debug_fp);
fclose(debug_fp);
printf("DEBUG: Dumped Co coefficients to %s (%zu bytes)\n", filename, data_size);
}
// Dump Cg channel coefficients
snprintf(filename, sizeof(filename), "frame_%03d.tavframe.cg.bin", enc->frame_count);
debug_fp = fopen(filename, "wb");
if (debug_fp) {
fwrite(quantised_cg, 1, data_size, debug_fp);
fclose(debug_fp);
printf("DEBUG: Dumped Cg coefficients to %s (%zu bytes)\n", filename, data_size);
}
printf("DEBUG: Frame %d - Dumped all %zu coefficient bytes per channel (total: %zu bytes)\n",
enc->frame_count, data_size, data_size * 3);
debugDumpMade = 1;
}
// OPTIMISATION: No need to free - using pre-allocated reusable buffers
return offset;
}
// Serialise every tile of the current frame, compress the result with zstd and
// write it to enc->output_fp as one packet:
//
//   [1 byte packet_type][4 byte compressed size, native byte order][payload]
//
// Per tile, the mode is chosen as follows:
//   - INTRA for I-frame packets or when no reference coefficients exist yet
//   - SKIP  when the cached still-frame flag is set and a reference exists
//   - DELTA when delta encoding is enabled
//   - INTRA otherwise
// and the file-level counters count_intra / count_skip / count_delta are
// bumped accordingly. Non-SKIP tiles are extracted, forward-transformed,
// optionally given grain synthesis on luma, then handed to serialise_tile_data().
//
// On success updates the encoder's running size totals, records the packet
// type, and marks reference coefficients as available after an I-frame.
//
// Returns the number of bytes written (payload + 5 header bytes), or 0 if an
// allocation, the compression, or a write failed (an error is printed to stderr).
static size_t compress_and_write_frame(tav_encoder_t *enc, uint8_t packet_type) {
    // Number of coefficients per channel per tile
    const size_t coeff_count = enc->monoblock ?
        (enc->width * enc->height) :
        (PADDED_TILE_SIZE_X * PADDED_TILE_SIZE_Y);

    // Worst-case serialised tile: 4 header bytes plus, per channel, every
    // coefficient stored as a raw int16_t AND the side information of the
    // variable layout.
    // NOTE(review): the side information is assumed to be a significance map of
    // at most 2 bits per coefficient plus a few bytes of framing - confirm
    // against preprocess_coefficients_variable_layout(). The previous estimate
    // had no room for it at all, so a tile with mostly large coefficients could
    // run past the end of the buffer.
    const size_t channel_worst_case = coeff_count * sizeof(int16_t) + (coeff_count + 3) / 4 + 16;
    const size_t max_tile_size = 4 + 3 * channel_worst_case;
    const size_t total_uncompressed_size = enc->tiles_x * enc->tiles_y * max_tile_size;

    // Staging buffer for the serialised tiles, and one set of working planes
    // that is reused for every tile (the size is the same for all of them).
    uint8_t *uncompressed_buffer = malloc(total_uncompressed_size);
    float *tile_y_data = malloc(coeff_count * sizeof(float));
    float *tile_co_data = malloc(coeff_count * sizeof(float));
    float *tile_cg_data = malloc(coeff_count * sizeof(float));
    if (!uncompressed_buffer || !tile_y_data || !tile_co_data || !tile_cg_data) {
        fprintf(stderr, "Error: Failed to allocate frame serialisation buffers\n");
        free(tile_y_data);
        free(tile_co_data);
        free(tile_cg_data);
        free(uncompressed_buffer);
        return 0;
    }
    size_t uncompressed_offset = 0;

    // Use cached still frame detection result (set in main loop)
    const int is_still_frame = enc->is_still_frame_cached;
    const int is_keyframe = (packet_type == TAV_PACKET_IFRAME);

    // Serialise all tiles
    for (int tile_y = 0; tile_y < enc->tiles_y; tile_y++) {
        for (int tile_x = 0; tile_x < enc->tiles_x; tile_x++) {
            // Determine tile mode based on frame type, coefficient availability, and intra_only flag
            uint8_t mode;
            // SKIP mode condition matches main loop logic: still frame during SKIP run
            const int can_use_skip = is_still_frame && enc->previous_coeffs_allocated;

            if (is_keyframe || !enc->previous_coeffs_allocated) {
                mode = TAV_MODE_INTRA; // I-frames, first frames, or intra-only mode always use INTRA
                count_intra++;
            } else if (can_use_skip) {
                mode = TAV_MODE_SKIP; // Still frames in SKIP run use SKIP mode
                count_skip++;
                if (enc->verbose && tile_x == 0 && tile_y == 0) {
                    printf(" → Using SKIP mode (copying from reference I-frame)\n");
                }
            } else if (enc->use_delta_encoding) {
                mode = TAV_MODE_DELTA; // P-frames use coefficient delta encoding
                count_delta++;
            } else {
                // Delta encoding disabled: use INTRA mode (packet_type is already I-frame from main loop)
                mode = TAV_MODE_INTRA;
                count_intra++;
            }

            // SKIP tiles carry no coefficient data, so the working planes are
            // left untouched for them (serialise_tile_data does not read them).
            if (mode != TAV_MODE_SKIP) {
                if (enc->monoblock) {
                    // Extract entire frame (no padding) and transform it
                    memcpy(tile_y_data, enc->current_frame_y, coeff_count * sizeof(float));
                    memcpy(tile_co_data, enc->current_frame_co, coeff_count * sizeof(float));
                    memcpy(tile_cg_data, enc->current_frame_cg, coeff_count * sizeof(float));

                    dwt_2d_forward_flexible(tile_y_data, enc->width, enc->height, enc->decomp_levels, enc->wavelet_filter);
                    dwt_2d_forward_flexible(tile_co_data, enc->width, enc->height, enc->decomp_levels, enc->wavelet_filter);
                    dwt_2d_forward_flexible(tile_cg_data, enc->width, enc->height, enc->decomp_levels, enc->wavelet_filter);
                } else {
                    // Extract padded tiles using context from neighbours and transform them
                    extract_padded_tile(enc, tile_x, tile_y, tile_y_data, tile_co_data, tile_cg_data);

                    dwt_2d_forward_padded(tile_y_data, enc->decomp_levels, enc->wavelet_filter);
                    dwt_2d_forward_padded(tile_co_data, enc->decomp_levels, enc->wavelet_filter);
                    dwt_2d_forward_padded(tile_cg_data, enc->decomp_levels, enc->wavelet_filter);
                }

                // Apply grain synthesis to Y channel (after DWT, before quantisation)
                if (enc->grain_synthesis) {
                    // Get the quantiser value that will be used for this frame
                    int qY_value = enc->bitrate_mode ? quantiser_float_to_int_dithered(enc) : enc->quantiser_y;
                    int actual_qY = QLUT[qY_value];

                    // Determine dimensions based on mode
                    int gs_width = enc->monoblock ? enc->width : PADDED_TILE_SIZE_X;
                    int gs_height = enc->monoblock ? enc->height : PADDED_TILE_SIZE_Y;

                    // Y channel only (is_chroma = 0)
                    apply_grain_synthesis_encoder(enc, tile_y_data, gs_width, gs_height,
                                                  enc->decomp_levels, enc->frame_count, actual_qY, 0);
                }
            }

            // Serialise tile
            size_t tile_size = serialise_tile_data(enc, tile_x, tile_y,
                                                   tile_y_data, tile_co_data, tile_cg_data,
                                                   mode, uncompressed_buffer + uncompressed_offset);
            uncompressed_offset += tile_size;
        }
    }

    free(tile_y_data);
    free(tile_co_data);
    free(tile_cg_data);

    // Compress with zstd
    size_t compressed_size = ZSTD_compress(enc->compressed_buffer, enc->compressed_buffer_size,
                                           uncompressed_buffer, uncompressed_offset, enc->zstd_level);
    free(uncompressed_buffer);

    if (ZSTD_isError(compressed_size)) {
        fprintf(stderr, "Error: ZSTD compression failed: %s\n", ZSTD_getErrorName(compressed_size));
        return 0;
    }

    // Write packet header and compressed data
    const uint32_t compressed_size_32 = (uint32_t)compressed_size;
    const int write_ok =
        fwrite(&packet_type, 1, 1, enc->output_fp) == 1 &&
        fwrite(&compressed_size_32, sizeof(uint32_t), 1, enc->output_fp) == 1 &&
        fwrite(enc->compressed_buffer, 1, compressed_size, enc->output_fp) == compressed_size;
    if (!write_ok) {
        fprintf(stderr, "Error: Failed to write frame packet to output\n");
        return 0;
    }

    enc->total_compressed_size += compressed_size;
    enc->total_uncompressed_size += uncompressed_offset;

    // Track last frame type for SKIP mode eligibility
    enc->last_frame_packet_type = packet_type;

    // Mark coefficient storage as available after first I-frame
    if (packet_type == TAV_PACKET_IFRAME) {
        enc->previous_coeffs_allocated = 1;
    }

    return compressed_size + 5; // packet type + size field + compressed data
}
// RGB to YCoCg-R colour space conversion for a whole frame.
//
//   rgb            interleaved 8-bit R,G,B triplets, width * height pixels
//   y, co, cg      output planes, width * height floats each
//
// Per pixel (lifting form of YCoCg-R):
//   co  = r - b
//   tmp = b + co / 2
//   cg  = g - tmp
//   y   = tmp + cg / 2
static void rgb_to_ycocg(const uint8_t *rgb, float *y, float *co, float *cg, int width, int height) {
    const int total_pixels = width * height;
    const uint8_t *px = rgb;

    for (int i = 0; i < total_pixels; i++, px += 3) {
        const float r = px[0];
        const float g = px[1];
        const float b = px[2];

        const float orange = r - b;
        const float tmp = b + orange * 0.5f;
        const float green = g - tmp;

        co[i] = orange;
        cg[i] = green;
        y[i] = tmp + green * 0.5f;
    }
}
// ---------------------- ICtCp Implementation ----------------------
static inline int iround(double v) { return (int)floor(v + 0.5); }
// ---------------------- sRGB gamma helpers ----------------------
static inline double srgb_linearise(double val) {
if (val <= 0.04045) return val / 12.92;
return pow((val + 0.055) / 1.055, 2.4);
}
static inline double srgb_unlinearise(double val) {
if (val <= 0.0031308) return 12.92 * val;
return 1.055 * pow(val, 1.0/2.4) - 0.055;
}
// ---------------------- HLG OETF/EOTF ----------------------
static inline double HLG_OETF(double E) {
const double a = 0.17883277;
const double b = 0.28466892; // 1 - 4*a
const double c = 0.55991073; // 0.5 - a*ln(4*a)
if (E <= 1.0/12.0) return sqrt(3.0 * E);
return a * log(12.0 * E - b) + c;
}
static inline double HLG_EOTF(double Ep) {
const double a = 0.17883277;
const double b = 0.28466892;
const double c = 0.55991073;
if (Ep <= 0.5) {
double val = Ep * Ep / 3.0;
return val;
}
double val = (exp((Ep - c) / a) + b) / 12.0;
return val;
}
// sRGB -> LMS matrix (disabled; kept for reference)
/*static const double M_RGB_TO_LMS[3][3] = {
{0.2958564579364564, 0.6230869483219083, 0.08106989398623762},
{0.15627390752659093, 0.727308963512872, 0.11639736914944238},
{0.035141262332177715, 0.15657109121101628, 0.8080956851990795}
};*/
// BT.2100 -> LMS matrix (active). Rows produce L, M, S; columns multiply linear R, G, B.
// Coefficients are the integer /4096 constants.
static const double M_RGB_TO_LMS[3][3] = {
{1688.0/4096,2146.0/4096, 262.0/4096},
{ 683.0/4096,2951.0/4096, 462.0/4096},
{ 99.0/4096, 309.0/4096,3688.0/4096}
};
// LMS -> linear RGB matrix used by ictcp_hlg_to_srgb8().
// NOTE(review): this does not look like the inverse of the active M_RGB_TO_LMS above.
// Row 0 of this matrix times column 0 of the active matrix is about 1.66, whereas
// against column 0 of the disabled sRGB -> LMS matrix it is about 1.00. It therefore
// appears to invert the disabled sRGB matrix - confirm which forward/inverse pairing
// is intended (and which one the decoder uses).
static const double M_LMS_TO_RGB[3][3] = {
{6.1723815689243215, -5.319534979827695, 0.14699442094633924},
{-1.3243428148026244, 2.560286104841917, -0.2359203727576164},
{-0.011819739235953752, -0.26473549971186555, 1.2767952602537955}
};
// ICtCp matrix (L' M' S' -> I Ct Cp). Values are the BT.2100 integer-derived /4096 constants.
static const double M_LMSPRIME_TO_ICTCP[3][3] = {
{ 2048.0/4096.0, 2048.0/4096.0, 0.0 },
{ 3625.0/4096.0, -7465.0/4096.0, 3840.0/4096.0 },
{ 9500.0/4096.0, -9212.0/4096.0, -288.0/4096.0 }
};
// Inverse of M_LMSPRIME_TO_ICTCP (I Ct Cp -> L' M' S')
static const double M_ICTCP_TO_LMSPRIME[3][3] = {
{ 1.0, 0.015718580108730416, 0.2095810681164055 },
{ 1.0, -0.015718580108730416, -0.20958106811640548 },
{ 1.0, 1.0212710798422344, -0.6052744909924316 }
};
// ---------------------- Forward: sRGB8 -> ICtCp (doubles) ----------------------
// Converts one 8-bit sRGB pixel to ICtCp using the HLG transfer function.
//
// Pipeline: sRGB decode -> M_RGB_TO_LMS -> HLG OETF per component ->
// M_LMSPRIME_TO_ICTCP. The results are scaled to an 8-bit-like range and
// clamped to [0, 255]: I is multiplied by 255, Ct and Cp are multiplied by 255
// and offset by 127.5 so that zero chroma sits mid-range.
void srgb8_to_ictcp_hlg(uint8_t r8, uint8_t g8, uint8_t b8,
                        double *out_I, double *out_Ct, double *out_Cp)
{
    // 1) gamma-encoded 0..255 -> linear 0..1
    const double r = srgb_linearise((double)r8 / 255.0);
    const double g = srgb_linearise((double)g8 / 255.0);
    const double b = srgb_linearise((double)b8 / 255.0);

    // 2) linear RGB -> LMS, 3) HLG OETF on each component
    double lms_prime[3];
    for (int row = 0; row < 3; row++) {
        const double *m = M_RGB_TO_LMS[row];
        lms_prime[row] = HLG_OETF(m[0]*r + m[1]*g + m[2]*b);
    }
    const double Lp = lms_prime[0];
    const double Mp = lms_prime[1];
    const double Sp = lms_prime[2];

    // 4) L'M'S' -> ICtCp
    const double *to_I = M_LMSPRIME_TO_ICTCP[0];
    const double *to_Ct = M_LMSPRIME_TO_ICTCP[1];
    const double *to_Cp = M_LMSPRIME_TO_ICTCP[2];
    const double I = to_I[0]*Lp + to_I[1]*Mp + to_I[2]*Sp;
    const double Ct = to_Ct[0]*Lp + to_Ct[1]*Mp + to_Ct[2]*Sp;
    const double Cp = to_Cp[0]*Lp + to_Cp[1]*Mp + to_Cp[2]*Sp;

    // 5) scale, centre chroma, clamp
    *out_I = FCLAMP(I * 255.f, 0.f, 255.f);
    *out_Ct = FCLAMP(Ct * 255.f + 127.5f, 0.f, 255.f);
    *out_Cp = FCLAMP(Cp * 255.f + 127.5f, 0.f, 255.f);
}
// ---------------------- Reverse: ICtCp -> sRGB8 (doubles) ----------------------
// Converts one ICtCp sample (in the scaled form produced by
// srgb8_to_ictcp_hlg: I in 0..255, Ct/Cp centred on 127.5) to 8-bit sRGB.
//
// Pipeline: undo scaling -> M_ICTCP_TO_LMSPRIME -> HLG EOTF per component ->
// M_LMS_TO_RGB -> sRGB encode -> clamp to [0, 255] and round to nearest.
void ictcp_hlg_to_srgb8(double I8, double Ct8, double Cp8,
                        uint8_t *r8, uint8_t *g8, uint8_t *b8)
{
    // 0) undo the 8-bit style scaling and chroma offset
    const double I = I8 / 255.f;
    const double Ct = (Ct8 - 127.5f) / 255.f;
    const double Cp = (Cp8 - 127.5f) / 255.f;

    // 1) ICtCp -> L' M' S', 2) HLG decode to linear LMS
    double lms[3];
    for (int row = 0; row < 3; row++) {
        const double *m = M_ICTCP_TO_LMSPRIME[row];
        lms[row] = HLG_EOTF(m[0]*I + m[1]*Ct + m[2]*Cp);
    }
    const double L = lms[0];
    const double M = lms[1];
    const double S = lms[2];

    // 3) LMS -> linear RGB, 4) gamma encode
    double encoded[3];
    for (int row = 0; row < 3; row++) {
        const double *m = M_LMS_TO_RGB[row];
        encoded[row] = srgb_unlinearise(m[0]*L + m[1]*M + m[2]*S);
    }

    // 5) scale to 0..255, clamp, round to nearest
    *r8 = (uint8_t)iround(FCLAMP(encoded[0] * 255.0, 0.0, 255.0));
    *g8 = (uint8_t)iround(FCLAMP(encoded[1] * 255.0, 0.0, 255.0));
    *b8 = (uint8_t)iround(FCLAMP(encoded[2] * 255.0, 0.0, 255.0));
}
// ---------------------- Colour Space Switching Functions ----------------------
// Wrapper functions that choose between YCoCg-R and ICtCp based on encoder mode

// Converts a single RGB pixel into the encoder's working colour space.
// With enc->ictcp_mode set the result is ICtCp; otherwise it is YCoCg-R
// computed in single-precision float and widened to double.
static void rgb_to_colour_space(tav_encoder_t *enc, uint8_t r, uint8_t g, uint8_t b,
                                double *c1, double *c2, double *c3) {
    if (enc->ictcp_mode) {
        srgb8_to_ictcp_hlg(r, g, b, c1, c2, c3);
        return;
    }

    // YCoCg-R lifting steps
    const float red = r;
    const float green = g;
    const float blue = b;
    const float chroma_o = red - blue;
    const float half_sum = blue + chroma_o / 2;
    const float chroma_g = green - half_sum;
    const float luma = half_sum + chroma_g / 2;

    *c1 = (double)luma;
    *c2 = (double)chroma_o;
    *c3 = (double)chroma_g;
}
// Converts one pixel from the encoder's working colour space back to 8-bit RGB.
// With enc->ictcp_mode set the input is treated as ICtCp; otherwise as YCoCg-R,
// which is inverted in float, rounded by adding 0.5 and clamped to [0, 255].
static void colour_space_to_rgb(tav_encoder_t *enc, double c1, double c2, double c3,
                                uint8_t *r, uint8_t *g, uint8_t *b) {
    if (enc->ictcp_mode) {
        ictcp_hlg_to_srgb8(c1, c2, c3, r, g, b);
        return;
    }

    // Inverse YCoCg-R lifting
    const float luma = (float)c1;
    const float chroma_o = (float)c2;
    const float chroma_g = (float)c3;
    const float half_sum = luma - chroma_g / 2.0f;
    const float green = chroma_g + half_sum;
    const float blue = half_sum - chroma_o / 2.0f;
    const float red = chroma_o + blue;

    *r = (uint8_t)CLAMP((int)(red + 0.5f), 0, 255);
    *g = (uint8_t)CLAMP((int)(green + 0.5f), 0, 255);
    *b = (uint8_t)CLAMP((int)(blue + 0.5f), 0, 255);
}
// RGB to colour space conversion for full frames
// Reads width*height packed RGB triples from `rgb` and fills the three planar
// float outputs. YCoCg mode delegates to rgb_to_ycocg(); ICtCp mode converts
// pixel by pixel.
static void rgb_to_colour_space_frame(tav_encoder_t *enc, const uint8_t *rgb,
                                      float *c1, float *c2, float *c3, int width, int height) {
    if (!enc->ictcp_mode) {
        rgb_to_ycocg(rgb, c1, c2, c3, width, height);
        return;
    }

    const int pixel_count = width * height;
    for (int px = 0; px < pixel_count; px++) {
        const uint8_t *src = &rgb[px*3];
        double intensity, tritan, protan;
        srgb8_to_ictcp_hlg(src[0], src[1], src[2], &intensity, &tritan, &protan);
        c1[px] = (float)intensity;
        c2[px] = (float)tritan;
        c3[px] = (float)protan;
    }
}
// RGBA to colour space conversion for full frames with alpha channel
// Reads width*height packed RGBA quads from `rgba`, writes the three colour
// planes to c1/c2/c3 and the alpha plane (normalised to [0,1]) to `alpha`.
// In YCoCg mode the RGB bytes are repacked into a temporary buffer so the
// existing rgb_to_ycocg() can be reused. If that buffer cannot be allocated
// an error is printed and the outputs are left untouched.
static void rgba_to_colour_space_frame(tav_encoder_t *enc, const uint8_t *rgba,
                                       float *c1, float *c2, float *c3, float *alpha,
                                       int width, int height) {
    const int total_pixels = width * height;
    if (total_pixels <= 0) {
        return; // nothing to convert
    }

    if (enc->ictcp_mode) {
        // ICtCp mode with alpha
        for (int i = 0; i < total_pixels; i++) {
            double I, Ct, Cp;
            srgb8_to_ictcp_hlg(rgba[i*4], rgba[i*4+1], rgba[i*4+2], &I, &Ct, &Cp);
            c1[i] = (float)I;
            c2[i] = (float)Ct;
            c3[i] = (float)Cp;
            alpha[i] = (float)rgba[i*4+3] / 255.0f; // Normalise alpha to [0,1]
        }
        return;
    }

    // YCoCg mode with alpha - extract RGB first, then convert.
    // Size is computed in size_t so large frames cannot overflow int.
    uint8_t *temp_rgb = malloc((size_t)total_pixels * 3);
    if (!temp_rgb) {
        fprintf(stderr, "Error: Could not allocate memory for RGBA to YCoCg conversion\n");
        return;
    }
    for (int i = 0; i < total_pixels; i++) {
        temp_rgb[i*3] = rgba[i*4];       // R
        temp_rgb[i*3+1] = rgba[i*4+1];   // G
        temp_rgb[i*3+2] = rgba[i*4+2];   // B
        alpha[i] = (float)rgba[i*4+3] / 255.0f; // Normalise alpha to [0,1]
    }
    rgb_to_ycocg(temp_rgb, c1, c2, c3, width, height);
    free(temp_rgb);
}
// Write font ROM upload packet (SSF format)
//   fp       : output stream the packet is appended to
//   filename : font ROM file to embed (at most 1920 bytes)
//   opcode   : SSF opcode (0x80 = low font ROM, 0x81 = high font ROM)
// Returns 0 on success or when fp/filename is NULL (nothing to do), and -1
// when the ROM cannot be opened or read, is too large, or the packet cannot
// be written.
// Packet layout: 0x30, uint32 LE size, uint24 index (0), opcode,
// uint16 LE payload length, payload bytes, 0x00 terminator.
static int write_fontrom_packet(FILE *fp, const char *filename, uint8_t opcode) {
    enum { FONTROM_MAX_BYTES = 1920 };

    if (!filename || !fp) return 0;

    FILE *rom_file = fopen(filename, "rb");
    if (!rom_file) {
        fprintf(stderr, "Warning: Could not open font ROM file: %s\n", filename);
        return -1;
    }

    // Read one byte more than the limit: getting that extra byte means the
    // file is too large. This avoids fseek/ftell (which can fail) and any
    // heap allocation.
    uint8_t font_data[FONTROM_MAX_BYTES + 1];
    const size_t font_len = fread(font_data, 1, sizeof(font_data), rom_file);
    const int read_failed = ferror(rom_file);
    fclose(rom_file);

    if (read_failed) {
        fprintf(stderr, "Warning: Could not read entire font ROM file: %s\n", filename);
        return -1;
    }
    if (font_len > FONTROM_MAX_BYTES) {
        fprintf(stderr, "Warning: Font ROM file too large (max 1920 bytes): %s\n", filename);
        return -1;
    }

    // Packet size: 3 (index) + 1 (opcode) + 2 (length) + payload + 1 (terminator)
    const uint32_t packet_size = 3 + 1 + 2 + (uint32_t)font_len + 1;
    const uint16_t payload_len = (uint16_t)font_len;

    const uint8_t header[] = {
        0x30,                                   // packet type: subtitle/SSF
        (uint8_t)(packet_size & 0xFF),          // packet size, little-endian
        (uint8_t)((packet_size >> 8) & 0xFF),
        (uint8_t)((packet_size >> 16) & 0xFF),
        (uint8_t)((packet_size >> 24) & 0xFF),
        0, 0, 0,                                // uint24 index, 0 for font ROM uploads
        opcode,
        (uint8_t)(payload_len & 0xFF),          // payload length, little-endian
        (uint8_t)((payload_len >> 8) & 0xFF)
    };

    if (fwrite(header, 1, sizeof(header), fp) != sizeof(header) ||
        fwrite(font_data, 1, font_len, fp) != font_len ||
        fputc(0x00, fp) == EOF) {
        fprintf(stderr, "Error: Could not write font ROM packet for: %s\n", filename);
        return -1;
    }

    printf("Font ROM uploaded: %s (%ld bytes, opcode 0x%02X)\n", filename, (long)font_len, opcode);
    return 0;
}
// Write TAV file header
// Emits the fixed-size header to enc->output_fp: magic, version, video
// parameters, encoder parameters, flag bytes and trailing reserved fields.
// Returns 0 on success, -1 when no output stream is open.
static int write_tav_header(tav_encoder_t *enc) {
    FILE *out = enc->output_fp;
    if (!out) return -1;

    // Magic number
    fwrite(TAV_MAGIC, 1, 8, out);

    // Version: the odd number selects the layout/tuning combination
    // (1 = multi-tile uniform, 3 = monoblock uniform, 5 = monoblock perceptual,
    //  7 = multi-tile perceptual); ICtCp is always the following even number.
    uint8_t version;
    if (enc->monoblock) {
        version = enc->perceptual_tuning ? 5 : 3;
    } else {
        version = enc->perceptual_tuning ? 7 : 1;
    }
    if (enc->ictcp_mode) {
        version += 1;
    }
    fputc(version, out);

    // Video parameters
    // For interlaced: enc->height is already halved internally, so double it back for display height
    uint16_t height = enc->progressive_mode ? enc->height : enc->height * 2;
    fwrite(&enc->width, sizeof(uint16_t), 1, out);
    fwrite(&height, sizeof(uint16_t), 1, out);
    fputc(enc->output_fps, out);
    fwrite(&enc->total_frames, sizeof(uint32_t), 1, out);

    // Encoder parameters
    fputc(enc->wavelet_filter, out);
    fputc(enc->decomp_levels, out);
    fputc(enc->quantiser_y, out);
    fputc(enc->quantiser_co, out);
    fputc(enc->quantiser_cg, out);

    // Feature flags
    uint8_t extra_flags = 0;
    extra_flags |= enc->has_audio ? 0x01 : 0;                        // Has audio (placeholder)
    extra_flags |= enc->subtitle_file ? 0x02 : 0;                    // Has subtitles
    extra_flags |= enc->enable_progressive_transmission ? 0x04 : 0;
    extra_flags |= enc->enable_roi ? 0x08 : 0;
    fputc(extra_flags, out);

    uint8_t video_flags = 0;
    video_flags |= enc->progressive_mode ? 0 : 0x01;                 // Interlaced
    video_flags |= enc->is_ntsc_framerate ? 0x02 : 0;                // NTSC
    video_flags |= enc->lossless ? 0x04 : 0;                         // Lossless
    fputc(video_flags, out);

    fputc(enc->quality_level+1, out);
    fputc(enc->channel_layout, out);

    // Device Orientation (default: 0 = no rotation)
    fputc(0, out);

    // Reserved bytes (3 bytes)
    fputc(0, out);
    fputc(0, out);
    fputc(0, out);

    // File Role (0 = generic)
    fputc(0, out);

    return 0;
}
// =============================================================================
// Video Processing Pipeline (from TEV for compatibility)
// =============================================================================
// Execute command and capture output
// Runs `command` through popen() and returns everything it wrote to stdout
// as a NUL-terminated heap string. The caller must free() the result.
// Returns NULL when the process cannot be started or memory runs out.
static char* execute_command(const char* command) {
    FILE* pipe = popen(command, "r");
    if (!pipe) return NULL;

    size_t capacity = 4096;
    size_t used = 0;
    char* buffer = malloc(capacity);
    if (!buffer) {
        pclose(pipe);
        return NULL;
    }

    for (;;) {
        // Always keep room for at least one data byte plus the terminator
        if (used + 1 >= capacity) {
            if (capacity > SIZE_MAX / 2) {
                free(buffer);
                pclose(pipe);
                return NULL;
            }
            // Grow through a temporary so the old block is not lost on failure
            char* grown = realloc(buffer, capacity * 2);
            if (!grown) {
                free(buffer);
                pclose(pipe);
                return NULL;
            }
            buffer = grown;
            capacity *= 2;
        }

        size_t bytes_read = fread(buffer + used, 1, capacity - used - 1, pipe);
        if (bytes_read == 0) break; // EOF or read error
        used += bytes_read;
    }

    buffer[used] = '\0';
    pclose(pipe);
    return buffer;
}
// Get FFmpeg version string (first line before copyright)
// Returns a heap string with trailing CR/LF characters removed, or NULL when
// the command could not be run. Caller must free.
static char* get_ffmpeg_version(void) {
    char *version = execute_command("ffmpeg -version 2>&1 | head -1");
    if (!version) return NULL;

    // Walk back from the end, cutting off line terminators
    for (size_t end = strlen(version);
         end > 0 && (version[end-1] == '\n' || version[end-1] == '\r');
         end--) {
        version[end-1] = '\0';
    }
    return version;
}
// Get video metadata using ffprobe
// Fills config->fps, is_ntsc_framerate, duration, has_audio and (when still 0)
// output_fps from the input file, and resets total_frames to 0 (the frame
// count is determined during encoding). Prints a summary to stderr.
// Returns 1 on success, 0 when ffprobe fails or no usable framerate is found.
static int get_video_metadata(tav_encoder_t *config) {
    char command[1024];

    // Get all metadata without frame count (much faster)
    int command_len = snprintf(command, sizeof(command),
        "ffprobe -v quiet "
        "-show_entries stream=r_frame_rate:format=duration "
        "-select_streams v:0 -of csv=p=0 \"%s\" 2>/dev/null; "
        "ffprobe -v quiet -select_streams a:0 -show_entries stream=index -of csv=p=0 \"%s\" 2>/dev/null",
        config->input_file, config->input_file);
    if (command_len < 0 || (size_t)command_len >= sizeof(command)) {
        // A truncated shell command must never be executed
        fprintf(stderr, "Failed to get video metadata (input path too long)\n");
        return 0;
    }

    char *output = execute_command(command);
    if (!output) {
        fprintf(stderr, "Failed to get video metadata (ffprobe failed)\n");
        return 0;
    }

    // Parse the combined output: line 0 = framerate, line 1 = duration,
    // any further line comes from the audio-stream probe.
    int line_num = 0;
    double inputFramerate = 0;
    for (char *line = strtok(output, "\n"); line; line = strtok(NULL, "\n"), line_num++) {
        if (line_num == 0) { // framerate (e.g., "30000/1001", "30/1")
            if (strlen(line) > 0) {
                double num, den;
                if (sscanf(line, "%lf/%lf", &num, &den) == 2) {
                    // ffprobe may report "0/0"; never divide by a non-positive
                    // denominator (NaN/inf converted to int is undefined).
                    inputFramerate = (den > 0.0) ? num / den : 0.0;
                    config->fps = (int)round(inputFramerate);
                    config->is_ntsc_framerate = (fabs(den - 1001.0) < 0.1);
                } else {
                    inputFramerate = atof(line);
                    config->fps = (int)round(inputFramerate);
                    config->is_ntsc_framerate = 0;
                }
                // Frame count will be determined during encoding
                config->total_frames = 0;
            }
        } else if (line_num == 1) { // duration in seconds
            config->duration = atof(line);
        }
    }

    // Check for audio (line_num > 2 means audio stream was found)
    config->has_audio = (line_num > 2);
    free(output);

    if (config->fps <= 0) {
        fprintf(stderr, "Invalid or missing framerate in input file\n");
        return 0;
    }

    // Set output FPS to input FPS if not specified
    if (config->output_fps == 0) {
        config->output_fps = config->fps;
    }

    // Frame count will be determined during encoding
    config->total_frames = 0;

    fprintf(stderr, "Video metadata:\n");
    fprintf(stderr, " Frames: (will be determined during encoding)\n");
    fprintf(stderr, " FPS: %.2f input, %d output\n", inputFramerate, config->output_fps);
    fprintf(stderr, " Duration: %.2fs\n", config->duration);
    fprintf(stderr, " Audio: %s\n", config->has_audio ? "Yes" : "No");
    if (config->progressive_mode) {
        fprintf(stderr, " Resolution: %dx%d\n", config->width, config->height);
    } else {
        fprintf(stderr, " Resolution: %dx%d (interlaced)\n", config->width, config->height);
    }
    return 1;
}
// Start FFmpeg process for video conversion with frame rate support
// Builds the FFmpeg command line and opens it as a read pipe in
// enc->ffmpeg_video_pipe, which then delivers raw rgb24 frames on stdout.
// Returns 1 on success, 0 on failure.
//
// Filtergraph, in order:
//   1. fps=N                 (only when output_fps differs from the input fps)
//   2. scale + crop to the requested size
//   3. interlaced mode only: tinterlace weave-overwrites even and odd fields
//      together into a full-height frame at half framerate (half of the
//      information is dropped on purpose), then separatefields splits it into
//      two consecutive half-height frames, restoring the framerate.
//
// ffmpeg's stderr is deliberately NOT redirected into the pipe: the pipe
// carries raw pixel data, and diagnostics mixed into it would corrupt frames.
static int start_video_conversion(tav_encoder_t *enc) {
    char command[2048];
    const int convert_fps = (enc->output_fps > 0 && enc->output_fps != enc->fps);
    const int interlaced = !enc->progressive_mode;

    // Interlaced encoding keeps enc->height at field height; FFmpeg must
    // scale to the full frame height before the fields are separated.
    const int frame_height = interlaced ? enc->height * 2 : enc->height;
    const char *interlace_filter =
        interlaced ? ",tinterlace=interleave_top:cvlpf,separatefields" : "";

    char fps_filter[32] = "";
    if (convert_fps) {
        // Frame rate conversion requested: the output is no longer NTSC-timed
        enc->is_ntsc_framerate = 0;
        snprintf(fps_filter, sizeof(fps_filter), "fps=%d,", enc->output_fps);
    }

    int command_len = snprintf(command, sizeof(command),
        "ffmpeg -v error -i \"%s\" -f rawvideo -pix_fmt rgb24 "
        "-vf \"%sscale=%d:%d:force_original_aspect_ratio=increase,crop=%d:%d%s\" "
        "-y -",
        enc->input_file, fps_filter,
        enc->width, frame_height, enc->width, frame_height,
        interlace_filter);
    if (command_len < 0 || (size_t)command_len >= sizeof(command)) {
        // A truncated shell command must never be executed
        fprintf(stderr, "Failed to start FFmpeg video conversion (command too long)\n");
        return 0;
    }

    if (enc->verbose) {
        printf("FFmpeg command: %s\n", command);
    }

    enc->ffmpeg_video_pipe = popen(command, "r");
    if (!enc->ffmpeg_video_pipe) {
        fprintf(stderr, "Failed to start FFmpeg video conversion\n");
        return 0;
    }
    return 1;
}
// Start audio conversion
// Transcodes the input's audio to MP2 (32 kHz stereo) into TEMP_AUDIO_FILE by
// running FFmpeg synchronously, then opens the result and records its size in
// enc->audio_remaining. Returns 1 when there is no audio or FFmpeg succeeded,
// 0 when FFmpeg reported failure.
static int start_audio_conversion(tav_encoder_t *enc) {
    if (!enc->has_audio) return 1;

    // An explicit bitrate wins; otherwise derive it from lossless/quality
    int kbps = enc->audio_bitrate;
    if (kbps <= 0) {
        kbps = enc->lossless ? 384 : MP2_RATE_TABLE[enc->quality_level];
    }
    printf(" Audio format: MP2 %dkbps (via libtwolame)\n", kbps);

    char ffmpeg_cmd[2048];
    snprintf(ffmpeg_cmd, sizeof(ffmpeg_cmd),
             "ffmpeg -v quiet -i \"%s\" -acodec libtwolame -psymodel 4 -b:a %dk -ar 32000 -ac 2 -y \"%s\" 2>/dev/null",
             enc->input_file, kbps, TEMP_AUDIO_FILE);

    if (system(ffmpeg_cmd) != 0) {
        return 0;
    }

    enc->mp2_file = fopen(TEMP_AUDIO_FILE, "rb");
    if (enc->mp2_file) {
        // Measure the file so the muxer knows how much audio is left
        fseek(enc->mp2_file, 0, SEEK_END);
        enc->audio_remaining = ftell(enc->mp2_file);
        fseek(enc->mp2_file, 0, SEEK_SET);
    }
    return 1;
}
// Get MP2 packet size from header (copied from TEV)
// Decodes bitrate index, sampling-frequency index and padding bit from the
// third header byte and returns the frame length in bytes. Falls back to
// MP2_DEFAULT_PACKET_SIZE when the bitrate or sampling-frequency field holds
// a value with no table entry.
static int get_mp2_packet_size(uint8_t *header) {
    static const int kbps_table[15] = {0, 32, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320, 384};
    static const int hz_table[4] = {44100, 48000, 32000, 0};

    const uint8_t field = header[2];

    const int kbps_index = (field >> 4) & 0x0F;
    if (kbps_index >= 15) return MP2_DEFAULT_PACKET_SIZE;
    const int kbps = kbps_table[kbps_index];
    if (kbps == 0) return MP2_DEFAULT_PACKET_SIZE;

    const int hz = hz_table[(field >> 2) & 0x03];
    if (hz == 0) return MP2_DEFAULT_PACKET_SIZE;

    const int pad = (field >> 1) & 0x01;
    return (144 * kbps * 1000) / hz + pad;
}
// Convert MP2 packet size to rate index (copied from TEV)
// Maps a packet size in bytes to an index into MP2_RATE_TABLE:
//   <=576 -> 0 (128k), <=720 -> 1 (160k), <=1008 -> 2 (224k),
//   <=1440 -> 3 (320k), larger -> 4 (384k).
// `is_mono` is accepted for interface compatibility and does not affect the result.
static int mp2_packet_size_to_rate_index(int packet_size, int is_mono) {
    static const int size_limits[4] = {576, 720, 1008, 1440};
    (void)is_mono;

    int rate_index = 0;
    while (rate_index < 4 && packet_size > size_limits[rate_index]) {
        rate_index++;
    }
    return rate_index;
}
// Convert SRT time format to frame number (copied from TEV)
// Parses "HH:MM:SS,mmm" and returns the nearest frame index at `fps`,
// or -1 when the string does not match that layout.
static int srt_time_to_frame(const char *time_str, int fps) {
    int hh, mm, ss, ms;
    const int fields = sscanf(time_str, "%d:%d:%d,%d", &hh, &mm, &ss, &ms);
    if (fields != 4) {
        return -1;
    }

    const double timestamp = hh * 3600.0 + mm * 60.0 + ss + ms / 1000.0;
    const double frame_pos = timestamp * fps + 0.5; // +0.5 rounds to nearest frame
    return (int)frame_pos;
}
// Convert SAMI milliseconds to frame number
// Returns the frame index nearest to `milliseconds` at `fps` frames per second.
static int sami_ms_to_frame(int milliseconds, int fps) {
    const double frame_pos = (milliseconds / 1000.0) * fps + 0.5; // +0.5 rounds to nearest frame
    return (int)frame_pos;
}
// Append `entry` to the singly linked list described by *head / *tail.
static void srt_list_append(subtitle_entry_t **head, subtitle_entry_t **tail,
                            subtitle_entry_t *entry) {
    if (!*head) {
        *head = entry;
    } else {
        (*tail)->next = entry;
    }
    *tail = entry;
}

// Parse SubRip subtitle file
//   filename : path of the .srt file
//   fps      : frame rate used to convert timestamps into frame numbers
// Returns the head of a newly allocated subtitle list (release it with
// free_subtitle_list), or NULL when the file cannot be opened or holds no
// valid entries. Malformed entries are skipped up to the next blank line.
// The input file is closed on every path, and an entry that was started but
// never completed is released instead of leaked.
static subtitle_entry_t* parse_srt_file(const char *filename, int fps) {
    FILE *file = fopen(filename, "r");
    if (!file) {
        fprintf(stderr, "Failed to open subtitle file: %s\n", filename);
        return NULL;
    }

    subtitle_entry_t *head = NULL;
    subtitle_entry_t *tail = NULL;
    char line[1024];
    int state = 0; // 0=index, 1=time, 2=text, 3=skip until blank line
    subtitle_entry_t *current_entry = NULL;
    char *text_buffer = NULL;
    size_t text_buffer_size = 0;

    while (fgets(line, sizeof(line), file)) {
        // Remove trailing newline (LF or CRLF)
        size_t len = strlen(line);
        if (len > 0 && line[len-1] == '\n') {
            line[--len] = '\0';
        }
        if (len > 0 && line[len-1] == '\r') {
            line[--len] = '\0';
        }

        if (state == 0) { // Expecting subtitle index
            if (len == 0) continue; // Skip empty lines
            // The index value itself is not used; any non-empty line starts an entry
            current_entry = calloc(1, sizeof(subtitle_entry_t));
            if (!current_entry) break;
            state = 1;
        } else if (state == 1) { // Expecting time range
            char start_time[32], end_time[32];
            int valid = (sscanf(line, "%31s --> %31s", start_time, end_time) == 2);
            if (valid) {
                current_entry->start_frame = srt_time_to_frame(start_time, fps);
                current_entry->end_frame = srt_time_to_frame(end_time, fps);
                valid = (current_entry->start_frame >= 0 && current_entry->end_frame >= 0);
            }
            if (!valid) {
                free(current_entry);
                current_entry = NULL;
                state = 3; // Skip malformed entry
                continue;
            }
            // Initialise text buffer
            text_buffer_size = 256;
            text_buffer = malloc(text_buffer_size);
            if (!text_buffer) {
                fprintf(stderr, "Memory allocation failed while parsing subtitles\n");
                break; // current_entry is released after the loop
            }
            text_buffer[0] = '\0';
            state = 2;
        } else if (state == 2) { // Collecting subtitle text
            if (len == 0) {
                // Blank line ends the entry
                current_entry->text = strdup(text_buffer);
                free(text_buffer);
                text_buffer = NULL;
                srt_list_append(&head, &tail, current_entry);
                current_entry = NULL;
                state = 0;
            } else {
                // Append text line, separated from the previous one by '\n'
                size_t current_len = strlen(text_buffer);
                size_t needed = current_len + len + 2; // +2 for newline and null
                if (needed > text_buffer_size) {
                    char *new_buffer = realloc(text_buffer, needed + 256);
                    if (!new_buffer) {
                        fprintf(stderr, "Memory allocation failed while parsing subtitles\n");
                        // Drop the partial entry; buffers are released after the loop
                        free(current_entry);
                        current_entry = NULL;
                        break;
                    }
                    text_buffer = new_buffer;
                    text_buffer_size = needed + 256;
                }
                if (current_len > 0) {
                    text_buffer[current_len++] = '\n';
                }
                memcpy(text_buffer + current_len, line, len + 1);
            }
        } else { // state 3: skip to next blank line
            if (len == 0) {
                state = 0;
            }
        }
    }

    // Handle final subtitle if file doesn't end with blank line
    if (current_entry && text_buffer) {
        current_entry->text = strdup(text_buffer);
        srt_list_append(&head, &tail, current_entry);
        current_entry = NULL;
    }

    // Release whatever is left from an aborted or incomplete entry
    free(text_buffer);
    free(current_entry);

    // NOTE(review): an earlier revision left this fclose disabled because it
    // triggered "glibc detected an invalid stdio handle". Nothing in this
    // function writes outside its own buffers, so such a failure would point
    // at memory corruption elsewhere; investigate with AddressSanitizer if it
    // reappears rather than leaking the handle.
    fclose(file);
    return head;
}
// Returns the length of the formatting tag starting at `s` when it is exactly
// <b>, </b>, <i> or </i> (case-insensitive), otherwise 0.
static size_t html_formatting_tag_length(const char *s) {
    if (s[0] != '<') return 0;
    size_t k = 1;
    if (s[k] == '/') k++;
    const int name = tolower((unsigned char)s[k]);
    // Short-circuit keeps every read at or before the NUL terminator
    if ((name != 'b' && name != 'i') || s[k + 1] != '>') return 0;
    return k + 2;
}

// Strip HTML tags from text but preserve <b> and <i> formatting tags
// Returns a newly allocated string (caller frees), or NULL when `html` is NULL
// or allocation fails. Everything from '<' up to and including the next '>'
// is removed, except for <b>, </b>, <i>, </i>, which are copied verbatim.
// A '>' that is not closing a tag is ordinary text and is kept.
static char* strip_html_tags(const char *html) {
    if (!html) return NULL;

    const size_t len = strlen(html);
    char *result = malloc(len + 1); // output is never longer than the input
    if (!result) return NULL;

    size_t out_pos = 0;
    int in_tag = 0;
    size_t i = 0;
    while (i < len) {
        const char c = html[i];

        if (in_tag) {
            // Inside a removed tag: discard everything up to the closing '>'
            if (c == '>') in_tag = 0;
            i++;
            continue;
        }

        if (c == '<') {
            const size_t keep = html_formatting_tag_length(html + i);
            if (keep > 0) {
                memcpy(result + out_pos, html + i, keep);
                out_pos += keep;
                i += keep;
            } else {
                in_tag = 1;
                i++;
            }
            continue;
        }

        result[out_pos++] = c;
        i++;
    }

    result[out_pos] = '\0';
    return result;
}
// Extracts the start time (milliseconds) from the lower-cased SYNC tag that
// begins at `sync_tag`. Only text up to the tag's closing '>' is considered.
// Returns -1 when the tag is unterminated or has no start attribute; a
// negative value parsed from the file is returned as-is and is likewise
// treated as invalid by the caller.
static int sami_sync_start_ms(const char *sync_tag) {
    const char *tag_end = strchr(sync_tag, '>');
    if (!tag_end) return -1;

    const char *attr = strstr(sync_tag, "start");
    if (!attr || attr > tag_end) return -1;

    attr = strchr(attr, '=');
    if (!attr || attr > tag_end) return -1;
    attr++;

    // Skip whitespace and quotes
    while (*attr == ' ' || *attr == '"' || *attr == '\'') {
        attr++;
    }
    return atoi(attr);
}

// Parse SAMI subtitle file
//   filename : path of the .smi/.sami file
//   fps      : frame rate used to convert millisecond timestamps to frames
// Returns the head of a newly allocated subtitle list (release it with
// free_subtitle_list), or NULL when the file cannot be read, has no BODY
// section, or contains no non-empty subtitles.
// Each entry ends where the next SYNC tag starts; when that time is missing
// or not later than the entry's own start, a 3 second duration is used.
static subtitle_entry_t* parse_smi_file(const char *filename, int fps) {
    FILE *file = fopen(filename, "r");
    if (!file) {
        fprintf(stderr, "Failed to open subtitle file: %s\n", filename);
        return NULL;
    }

    subtitle_entry_t *head = NULL;
    subtitle_entry_t *tail = NULL;
    char line[2048];
    char *content = NULL;
    size_t content_size = 0;
    size_t content_pos = 0;

    // Read entire file into memory for easier parsing
    while (fgets(line, sizeof(line), file)) {
        size_t line_len = strlen(line);
        // Expand content buffer if needed (each doubling adds at least 8192
        // bytes, which always covers one line of at most 2047 characters)
        if (content_pos + line_len + 1 > content_size) {
            content_size = content_size ? content_size * 2 : 8192;
            char *new_content = realloc(content, content_size);
            if (!new_content) {
                free(content);
                fclose(file);
                fprintf(stderr, "Memory allocation failed while parsing SAMI file\n");
                return NULL;
            }
            content = new_content;
        }
        memcpy(content + content_pos, line, line_len + 1);
        content_pos += line_len;
    }
    fclose(file);
    if (!content) return NULL;

    // Lower-cased copy for case-insensitive tag searches. Offsets are shared
    // with `content`, from which the subtitle text itself is taken.
    const size_t content_len = strlen(content);
    char *content_lower = malloc(content_len + 1);
    if (!content_lower) {
        free(content);
        return NULL;
    }
    for (size_t i = 0; i < content_len; i++) {
        // Cast: tolower() is undefined for negative char values, which occur
        // for every non-ASCII byte when char is signed
        content_lower[i] = (char)tolower((unsigned char)content[i]);
    }
    content_lower[content_len] = '\0';

    // Find BODY section
    char *body_start = strstr(content_lower, "<body");
    if (!body_start) {
        fprintf(stderr, "No BODY section found in SAMI file\n");
        free(content);
        free(content_lower);
        return NULL;
    }
    // Skip to actual body content
    body_start = strchr(body_start, '>');
    if (!body_start) {
        free(content);
        free(content_lower);
        return NULL;
    }
    body_start++;

    // Parse SYNC tags
    char *pos = body_start;
    while ((pos = strstr(pos, "<sync")) != NULL) {
        const int start_ms = sami_sync_start_ms(pos);
        if (start_ms < 0) {
            pos++;
            continue;
        }

        // End of the sync tag; the helper has verified that '>' exists
        char *sync_end = strchr(pos, '>') + 1;

        // The text runs to the next sync tag, the end of the body, or the
        // end of the file, whichever comes first
        char *next_sync = strstr(sync_end, "<sync");
        char *body_end = strstr(sync_end, "</body>");
        char *text_end = next_sync;
        if (body_end && (!next_sync || body_end < next_sync)) {
            text_end = body_end;
        }
        if (!text_end) {
            text_end = content_lower + content_len;
        }

        const size_t text_len = (size_t)(text_end - sync_end);
        if (text_len > 0) {
            // Get text from original content (not lowercase version)
            const size_t sync_offset = (size_t)(sync_end - content_lower);
            char *subtitle_text = malloc(text_len + 1);
            if (!subtitle_text) break;
            memcpy(subtitle_text, content + sync_offset, text_len);
            subtitle_text[text_len] = '\0';

            // Strip HTML tags and clean up text
            char *clean_text = strip_html_tags(subtitle_text);
            free(subtitle_text);

            if (clean_text && strlen(clean_text) > 0) {
                // Remove leading/trailing whitespace
                char *start = clean_text;
                while (*start == ' ' || *start == '\t' || *start == '\n' || *start == '\r') {
                    start++;
                }
                char *end = start + strlen(start);
                while (end > start &&
                       (end[-1] == ' ' || end[-1] == '\t' || end[-1] == '\n' || end[-1] == '\r')) {
                    *--end = '\0';
                }

                if (*start != '\0') {
                    // Create subtitle entry
                    subtitle_entry_t *entry = calloc(1, sizeof(subtitle_entry_t));
                    if (entry) {
                        entry->start_frame = sami_ms_to_frame(start_ms, fps);
                        entry->text = strdup(start);

                        // Default duration; replaced when the next SYNC tag
                        // carries a later start time
                        entry->end_frame = entry->start_frame + fps * 3; // 3 second default
                        if (next_sync) {
                            const int next_ms = sami_sync_start_ms(next_sync);
                            if (next_ms > start_ms) {
                                entry->end_frame = sami_ms_to_frame(next_ms, fps);
                            }
                        }

                        // Add to list
                        if (!head) {
                            head = entry;
                        } else {
                            tail->next = entry;
                        }
                        tail = entry;
                    }
                }
            }
            free(clean_text);
        }
        pos = sync_end;
    }

    free(content);
    free(content_lower);
    return head;
}
// Detect subtitle file format based on extension and content
// Returns 1 for SAMI, 2 for SubRip, 0 when the format is unknown or the file
// cannot be opened. The extension decides when it is .smi/.sami/.srt;
// otherwise up to 20 lines are inspected for SAMI markup or an SRT "-->"
// time range.
static int detect_subtitle_format(const char *filename) {
    // Check file extension first
    const char *ext = strrchr(filename, '.');
    if (ext) {
        ext++; // Skip the dot
        if (strcasecmp(ext, "smi") == 0 || strcasecmp(ext, "sami") == 0) {
            return 1; // SAMI format
        }
        if (strcasecmp(ext, "srt") == 0) {
            return 2; // SubRip format
        }
    }

    // If extension is unclear, try to detect from content
    FILE *file = fopen(filename, "r");
    if (!file) return 0; // Unknown: nothing to inspect

    char line[1024];
    int has_sami_tags = 0;
    int has_srt_format = 0;
    int lines_checked = 0;

    while (fgets(line, sizeof(line), file) && lines_checked < 20) {
        // Lower-case in place for case-insensitive matching. Cast: tolower()
        // is undefined for negative char values (non-ASCII bytes).
        for (size_t i = 0; line[i]; i++) {
            line[i] = (char)tolower((unsigned char)line[i]);
        }

        // Check for SAMI indicators; one hit is conclusive
        if (strstr(line, "<sami>") || strstr(line, "<sync") ||
            strstr(line, "<body>") || strstr(line, "start=")) {
            has_sami_tags = 1;
            break;
        }
        // Check for SRT indicators (time format)
        if (strstr(line, "-->")) {
            has_srt_format = 1;
        }
        lines_checked++;
    }
    fclose(file);

    // Return format based on detection
    if (has_sami_tags) return 1; // SAMI
    if (has_srt_format) return 2; // SRT
    return 0; // Unknown
}
// Parse subtitle file (auto-detect format)
// Dispatches to the SAMI or SubRip parser according to
// detect_subtitle_format(); returns NULL for an unrecognised format.
static subtitle_entry_t* parse_subtitle_file(const char *filename, int fps) {
    switch (detect_subtitle_format(filename)) {
        case 1:
            return parse_smi_file(filename, fps);
        case 2:
            return parse_srt_file(filename, fps);
        default:
            return NULL;
    }
}
// Free subtitle list (copied from TEV)
// Releases every node of the list together with its text. NULL is accepted.
static void free_subtitle_list(subtitle_entry_t *list) {
    for (subtitle_entry_t *node = list, *following; node != NULL; node = following) {
        following = node->next; // fetch before the node is released
        free(node->text);
        free(node);
    }
}
// Write subtitle packet (copied from TEV)
//   output : destination stream
//   index  : subtitle index; only the low 24 bits are stored
//   opcode : SSF opcode
//   text   : UTF-8 payload, may be NULL for opcodes without text
// Layout: packet type, uint32 LE payload size, uint24 LE index, opcode,
// text bytes, NUL terminator. All multi-byte fields are emitted byte by byte
// so the output does not depend on host byte order.
// Returns the total number of bytes in the packet.
static int write_subtitle_packet(FILE *output, uint32_t index, uint8_t opcode, const char *text) {
    const size_t text_len = text ? strlen(text) : 0;
    // index (3 bytes) + opcode + text + null terminator
    const size_t packet_size = 3 + 1 + text_len + 1;
    const uint32_t size32 = (uint32_t)packet_size;

    const uint8_t header[] = {
        TAV_PACKET_SUBTITLE,
        (uint8_t)(size32 & 0xFF),
        (uint8_t)((size32 >> 8) & 0xFF),
        (uint8_t)((size32 >> 16) & 0xFF),
        (uint8_t)((size32 >> 24) & 0xFF),
        (uint8_t)(index & 0xFF),
        (uint8_t)((index >> 8) & 0xFF),
        (uint8_t)((index >> 16) & 0xFF),
        opcode
    };
    fwrite(header, 1, sizeof(header), output);

    if (text_len > 0) {
        fwrite(text, 1, text_len, output);
    }
    fputc(0x00, output); // terminator

    return (int)(1 + 4 + packet_size); // Total bytes written
}
// Write timecode packet for current frame
// Timecode is the time since stream start in nanoseconds, written as a
// little-endian uint64 after the packet type byte.
//   frame_num         : zero-based frame index
//   fps               : nominal (rounded) frame rate, e.g. 24, 30, 60
//   is_ntsc_framerate : non-zero when the real rate is fps*1000/1001
// A non-positive fps or frame_num yields a timecode of 0.
static void write_timecode_packet(FILE *output, int frame_num, int fps, int is_ntsc_framerate) {
    uint64_t timecode_ns = 0;

    if (fps > 0 && frame_num > 0) {
        const uint64_t frames = (uint64_t)frame_num;
        if (is_ntsc_framerate) {
            // Real rate is (fps * 1000) / 1001, so
            // time_ns = frames * 1001 * 1e9 / (fps * 1000).
            // For fps = 30 this is frames * 1001000000 / 30000 (29.97 fps);
            // 24000/1001 and 60000/1001 sources are handled the same way.
            timecode_ns = (frames * 1001000000ULL) / ((uint64_t)fps * 1000ULL);
        } else {
            // Standard framerate
            timecode_ns = (frames * 1000000000ULL) / (uint64_t)fps;
        }
    }

    // Serialise explicitly so the output is little-endian on every host
    uint8_t packet[1 + 8];
    packet[0] = TAV_PACKET_TIMECODE;
    for (int i = 0; i < 8; i++) {
        packet[1 + i] = (uint8_t)((timecode_ns >> (8 * i)) & 0xFF);
    }
    fwrite(packet, 1, sizeof(packet), output);
}
// Emits one extended-header pair holding a Uint64 value:
// 4-byte key, type byte 0x04, then the 8 value bytes in host order.
static void ext_hdr_put_uint64(FILE *fp, const char *key, uint64_t value) {
    const uint8_t type_tag = 0x04; /* Uint64 */
    fwrite(key, 1, 4, fp);
    fwrite(&type_tag, 1, 1, fp);
    fwrite(&value, sizeof(uint64_t), 1, fp);
}

// Emits one extended-header pair holding a byte string:
// 4-byte key, type byte 0x10, uint16 length, then the data bytes.
static void ext_hdr_put_bytes(FILE *fp, const char *key, const void *data, size_t len) {
    const uint8_t type_tag = 0x10; /* Bytes */
    const uint16_t length_field = (uint16_t)len;
    fwrite(key, 1, 4, fp);
    fwrite(&type_tag, 1, 1, fp);
    fwrite(&length_field, sizeof(uint16_t), 1, fp);
    fwrite(data, 1, len, fp);
}

// Write extended header packet with metadata
// Returns the file offset where ENDT value is written (for later update)
// Pairs written, in order: BGNT, ENDT, CDAT, VNDR and, when an FFmpeg
// version string is available, FMPG.
static long write_extended_header(tav_encoder_t *enc) {
    FILE *out = enc->output_fp;

    const uint8_t packet_type = TAV_PACKET_EXTENDED_HDR;
    fwrite(&packet_type, 1, 1, out);

    // Count key-value pairs (BGNT, ENDT, CDAT, VNDR, FMPG); FMPG is optional
    uint16_t num_pairs = 4;
    if (enc->ffmpeg_version) {
        num_pairs = 5;
    }
    fwrite(&num_pairs, sizeof(uint16_t), 1, out);

    // BGNT: Video begin time (0 for frame 0)
    ext_hdr_put_uint64(out, "BGNT", 0ULL);

    // ENDT: Video end time (placeholder, will be updated at end)
    const long endt_pair_offset = ftell(out);
    ext_hdr_put_uint64(out, "ENDT", 0ULL);

    // CDAT: Creation time in nanoseconds since UNIX epoch
    ext_hdr_put_uint64(out, "CDAT", enc->creation_time_ns);

    // VNDR: Encoder name and version
    const char *vendor_str = ENCODER_VENDOR_STRING;
    ext_hdr_put_bytes(out, "VNDR", vendor_str, strlen(vendor_str));

    // FMPG: FFmpeg version (if available)
    if (enc->ffmpeg_version) {
        ext_hdr_put_bytes(out, "FMPG", enc->ffmpeg_version, strlen(enc->ffmpeg_version));
    }

    // The ENDT value sits after the 4-byte key and the 1-byte type tag
    return endt_pair_offset + 4 + 1;
}
// Emit the MP2 audio packets that accompany one video frame
// (copied and adapted from TEV).
//
// The number of packets written per call is driven by a simulated buffer
// level, enc->audio_frames_in_buffer (measured in MP2 packets):
//   - frame 0 reads the MP2 header to learn the packet size / rate index and
//     primes the stream with enc->target_audio_buffer_size packets;
//   - every later frame drains the simulated level by the fraction of a
//     packet one video frame lasts, then tops it back up towards the target.
//
// Parameters:
//   enc       - encoder state (audio source, buffer and simulator fields)
//   frame_num - index of the frame being written; 0 triggers initialisation
//   output    - stream that receives the audio packets
//
// Returns 1 on every path, including "no audio" and allocation failure.
static int process_audio(tav_encoder_t *enc, int frame_num, FILE *output) {
    // Nothing to emit without an audio source, or once it has been used up
    if (!enc->has_audio || !enc->mp2_file || enc->audio_remaining <= 0) {
        return 1;
    }

    const int first_frame = (frame_num == 0);

    if (first_frame) {
        // Peek at the first four header bytes, then rewind so they are still
        // written out as part of the first packet
        uint8_t hdr[4];
        if (fread(hdr, 1, 4, enc->mp2_file) != 4) {
            return 1;
        }
        fseek(enc->mp2_file, 0, SEEK_SET);

        // Top two bits of the fourth header byte equal to 3 are treated as mono
        int is_mono = ((hdr[3] >> 6) == 3);
        enc->mp2_packet_size = get_mp2_packet_size(hdr);
        enc->mp2_rate_index = mp2_packet_size_to_rate_index(enc->mp2_packet_size, is_mono);
        enc->target_audio_buffer_size = 4; // 4 audio packets in buffer
        enc->audio_frames_in_buffer = 0.0;
    }

    // MP2 frame contains 1152 samples at 32kHz = 0.036 seconds
    #define MP2_SAMPLE_RATE 32000

    // Lazily create the staging buffer (room for two packets)
    if (!enc->mp2_buffer) {
        enc->mp2_buffer_size = enc->mp2_packet_size * 2;
        enc->mp2_buffer = malloc(enc->mp2_buffer_size);
        if (!enc->mp2_buffer) {
            fprintf(stderr, "Failed to allocate audio buffer\n");
            return 1;
        }
    }

    // Decide how many packets this frame carries
    int packets_due = 0;
    if (first_frame) {
        // Initial priming: fill straight up to the target level
        packets_due = enc->target_audio_buffer_size;
        enc->audio_frames_in_buffer = 0; // count starts from 0
        if (enc->verbose) {
            printf("Frame %d: Priming audio buffer with %d packets\n", frame_num, packets_due);
        }
    } else {
        // Fraction of an MP2 packet played back during one video frame
        const double seconds_per_frame = 1.0 / enc->output_fps;
        const double seconds_per_packet = 1152.0 / MP2_SAMPLE_RATE;
        const double drain_per_frame = seconds_per_frame / seconds_per_packet;

        const double level_before = enc->audio_frames_in_buffer;
        enc->audio_frames_in_buffer -= drain_per_frame;

        const double goal = (double)enc->target_audio_buffer_size;
        if (enc->audio_frames_in_buffer < goal) {
            // Top up by the (rounded-up) shortfall, never more than the target
            const double shortfall = goal - enc->audio_frames_in_buffer;
            packets_due = (int)ceil(shortfall);
            if (packets_due > enc->target_audio_buffer_size) {
                packets_due = enc->target_audio_buffer_size;
            }
            if (enc->verbose) {
                printf("Frame %d: Buffer low (%.2f->%.2f), deficit %.2f, inserting %d packets\n",
                       frame_num, level_before, enc->audio_frames_in_buffer, shortfall, packets_due);
            }
        } else if (enc->verbose && level_before != enc->audio_frames_in_buffer) {
            printf("Frame %d: Buffer sufficient (%.2f->%.2f), no packets\n",
                   frame_num, level_before, enc->audio_frames_in_buffer);
        }
    }

    // Copy the chosen number of packets from the MP2 file to the output
    int q = 0;
    while (q < packets_due) {
        // Clamp the final packet to whatever audio is left
        size_t want = enc->mp2_packet_size;
        if (want > enc->audio_remaining) {
            want = enc->audio_remaining;
        }

        size_t got = fread(enc->mp2_buffer, 1, want, enc->mp2_file);
        if (got == 0) {
            break;
        }

        // TAV MP2 audio packet: type byte, 32-bit payload length, payload
        uint8_t packet_type = TAV_PACKET_AUDIO_MP2;
        uint32_t payload_len = (uint32_t)got;
        fwrite(&packet_type, 1, 1, output);
        fwrite(&payload_len, 4, 1, output);
        fwrite(enc->mp2_buffer, 1, got, output);

        enc->audio_remaining -= got;

        if (first_frame) {
            // trick the buffer simulator so that it doesn't count the frame 0 priming
            enc->audio_frames_in_buffer = enc->target_audio_buffer_size / 2;
        } else {
            enc->audio_frames_in_buffer++;
        }

        if (enc->verbose) {
            printf("Audio packet %d: %zu bytes (buffer: %.2f packets)\n",
                   q, got, enc->audio_frames_in_buffer);
        }
        q++;
    }

    return 1;
}
// Emit subtitle show/hide packets for one video frame
// (copied and adapted from TEV).
//
// While no subtitle is on screen, the list is scanned (starting at the last
// shown entry, or the list head) for an entry whose [start_frame, end_frame)
// range contains frame_num; if one is found and it is not the entry already
// recorded as current, a "show" packet (opcode 0x01) is written. While a
// subtitle is on screen, a "hide" packet (opcode 0x02) is written once
// frame_num reaches its end_frame.
//
// Parameters:
//   enc       - encoder state holding the subtitle list and visibility flags
//   frame_num - index of the frame being written
//   output    - stream that receives the subtitle packets
//
// Returns 1 when no subtitle list is loaded; otherwise the sum of the values
// returned by write_subtitle_packet during this call (0 if nothing was written).
static int process_subtitles(tav_encoder_t *enc, int frame_num, FILE *output) {
    if (enc->subtitles == NULL) {
        return 1; // No subtitles to process
    }

    int written = 0;

    // Nothing on screen: look for an entry that should appear now
    if (!enc->subtitle_visible) {
        // Resume from the last shown entry, or from the head if there is none
        subtitle_entry_t *entry = enc->current_subtitle ? enc->current_subtitle
                                                         : enc->subtitles;

        for (; entry != NULL && entry->start_frame <= frame_num; entry = entry->next) {
            if (entry->end_frame <= frame_num) {
                continue; // Already finished; try the next entry
            }

            // First entry covering this frame; show it unless it is the one
            // already recorded as current
            if (entry != enc->current_subtitle) {
                enc->current_subtitle = entry;
                enc->subtitle_visible = 1;
                written += write_subtitle_packet(output, 0, 0x01, entry->text);
                if (enc->verbose) {
                    printf("Frame %d: Showing subtitle: %.50s%s\n",
                           frame_num, entry->text, strlen(entry->text) > 50 ? "..." : "");
                }
            }
            break;
        }
    }

    // Something on screen: hide it once its end frame has been reached
    if (enc->subtitle_visible && enc->current_subtitle &&
        frame_num >= enc->current_subtitle->end_frame) {
        enc->subtitle_visible = 0;
        written += write_subtitle_packet(output, 0, 0x02, NULL);
        if (enc->verbose) {
            printf("Frame %d: Hiding subtitle\n", frame_num);
        }
    }

    return written;
}
// Detect scene changes by analysing RGB differences between the current and
// the previous frame.
//
// Every second pixel of every second row is compared (one pixel in four).
// A sampled pixel counts as "changed" when the sum of its absolute R, G and B
// differences exceeds 90 (an average of 30 per channel). A scene change is
// reported when more than 30% of the sampled pixels changed; this single
// fixed threshold is used for every mode.
//
// Parameters:
//   enc - encoder state; reads current_frame_rgb, previous_frame_rgb
//         (indexed as packed 3-bytes-per-pixel rows of enc->width pixels),
//         width, height, intra_only and verbose
//
// Returns 1 if a scene change is detected, 0 otherwise. Also returns 0 when
// either frame buffer is missing, in intra-only mode, or when the frame has
// no pixels to sample.
static int detect_scene_change(tav_encoder_t *enc) {
    // Both frames are required for a comparison; intra-only mode never needs one
    if (!enc->current_frame_rgb || !enc->previous_frame_rgb || enc->intra_only) {
        return 0;
    }

    const uint8_t *current = enc->current_frame_rgb;
    const uint8_t *previous = enc->previous_frame_rgb;

    long long total_diff = 0;
    int changed_pixels = 0;
    // Counted inside the loop so that it matches the pixels actually visited,
    // including the extra row/column sampled when a dimension is odd
    int sampled_pixels = 0;

    // Sample every 4th pixel for performance (still gives good detection)
    for (int y = 0; y < enc->height; y += 2) {
        for (int x = 0; x < enc->width; x += 2) {
            size_t offset = ((size_t)y * (size_t)enc->width + (size_t)x) * 3;

            // Calculate colour difference
            int r_diff = abs(current[offset] - previous[offset]);
            int g_diff = abs(current[offset + 1] - previous[offset + 1]);
            int b_diff = abs(current[offset + 2] - previous[offset + 2]);
            int pixel_diff = r_diff + g_diff + b_diff;

            total_diff += pixel_diff;
            sampled_pixels++;

            // Count significantly changed pixels (threshold of 30 per channel average)
            if (pixel_diff > 90) {
                changed_pixels++;
            }
        }
    }

    // Degenerate frame: nothing sampled, so nothing can have changed
    if (sampled_pixels == 0) {
        return 0;
    }

    // Calculate metrics for scene change detection
    double avg_diff = (double)total_diff / sampled_pixels;
    double changed_ratio = (double)changed_pixels / sampled_pixels;

    if (enc->verbose) {
        printf("Scene change detection: avg_diff=%.2f\tchanged_ratio=%.4f\n", avg_diff, changed_ratio);
    }

    // Fraction of sampled pixels that must change to call it a scene change
    const double threshold = 0.30;
    return changed_ratio > threshold;
}
// Detect still frames by comparing RGB pixels of the current and the
// previous frame.
//
// Every second pixel of every second row is compared (one pixel in four,
// same sampling as scene change detection). A sampled pixel counts as
// "changed" when the sum of its absolute R, G and B differences exceeds 6
// (an average of 2 per channel). The frame is still only if no sampled pixel
// changed.
//
// Parameters:
//   enc - encoder state; reads current_frame_rgb, previous_frame_rgb
//         (indexed as packed 3-bytes-per-pixel rows of enc->width pixels),
//         width, height, intra_only and verbose
//
// Returns 1 if frame is still (suitable for SKIP mode), 0 otherwise. Also
// returns 0 when either frame buffer is missing or in intra-only mode.
static int detect_still_frame(tav_encoder_t *enc) {
    if (!enc->current_frame_rgb || !enc->previous_frame_rgb || enc->intra_only) {
        return 0; // No frame to compare or intra-only mode
    }

    const uint8_t *current = enc->current_frame_rgb;
    const uint8_t *previous = enc->previous_frame_rgb;

    int changed_pixels = 0;
    // Counted inside the loop so the verbose report matches the pixels
    // actually visited, including odd frame dimensions
    int sampled_pixels = 0;

    // Sample every 4th pixel for performance (same as scene change detection)
    for (int y = 0; y < enc->height; y += 2) {
        for (int x = 0; x < enc->width; x += 2) {
            size_t offset = ((size_t)y * (size_t)enc->width + (size_t)x) * 3;

            // Calculate colour difference
            int r_diff = abs(current[offset] - previous[offset]);
            int g_diff = abs(current[offset + 1] - previous[offset + 1]);
            int b_diff = abs(current[offset + 2] - previous[offset + 2]);
            int pixel_diff = r_diff + g_diff + b_diff;

            sampled_pixels++;

            // Count changed pixels with very low threshold (2 per channel average = 6 total)
            if (pixel_diff > 6) {
                changed_pixels++;
                // One changed pixel already decides the result; the full
                // count is only needed for the verbose report
                if (!enc->verbose) {
                    return 0;
                }
            }
        }
    }

    if (enc->verbose) {
        printf("Still frame detection: %d/%d pixels changed\n", changed_pixels, sampled_pixels);
    }

    return (changed_pixels == 0);
}
// Detect still frames by comparing quantised DWT coefficients
// Returns 1 if quantised coefficients are identical (frame is truly still), 0 otherwise
// Benefits: quality-aware (lower quality = more SKIP frames), pure integer math
// DISABLED - should work in theory, not actually
//
// The current DWT planes (enc->current_dwt_*) and the stored planes
// (enc->previous_coeffs_*) are both quantised with this frame's quantisers
// and compared element by element.
//
// Parameters:
//   enc - encoder state; the reusable_quantised_* buffers are overwritten
//         with the quantised current frame as a side effect
//
// Returns 0 without comparing when there are no previous coefficients, in
// intra-only mode, when the last frame packet was not an I-frame, or when the
// temporary comparison buffers cannot be allocated.
static int detect_still_frame_dwt(tav_encoder_t *enc) {
    if (!enc->previous_coeffs_allocated || enc->intra_only) {
        return 0; // No previous coefficients to compare or intra-only mode
    }

    // Only compare against I-frames to avoid DELTA quantisation drift
    // previous_coeffs are updated by DELTA frames with reconstructed values that accumulate error
    if (enc->last_frame_packet_type != TAV_PACKET_IFRAME) {
        return 0; // Must compare against clean I-frame, not DELTA reconstruction
    }

    // Get current quantisers (use adjusted quantiser from bitrate control if applicable)
    int qY = enc->bitrate_mode ? quantiser_float_to_int_dithered(enc) : enc->quantiser_y;
    int this_frame_qY = QLUT[qY];
    int this_frame_qCo = QLUT[enc->quantiser_co];
    int this_frame_qCg = QLUT[enc->quantiser_cg];

    // Coefficient count (monoblock mode)
    const int coeff_count = enc->width * enc->height;

    // Allocate the scratch planes for the previous frame up front so that an
    // allocation failure is caught before any pointer is handed to a quantiser
    int16_t *prev_quantised_y = malloc((size_t)coeff_count * sizeof *prev_quantised_y);
    int16_t *prev_quantised_co = malloc((size_t)coeff_count * sizeof *prev_quantised_co);
    int16_t *prev_quantised_cg = malloc((size_t)coeff_count * sizeof *prev_quantised_cg);
    if (!prev_quantised_y || !prev_quantised_co || !prev_quantised_cg) {
        fprintf(stderr, "Failed to allocate still frame comparison buffers\n");
        free(prev_quantised_y);
        free(prev_quantised_co);
        free(prev_quantised_cg);
        return 0; // Cannot prove the frame is still; report "not still"
    }

    // Quantise current DWT coefficients
    int16_t *quantised_y = enc->reusable_quantised_y;
    int16_t *quantised_co = enc->reusable_quantised_co;
    int16_t *quantised_cg = enc->reusable_quantised_cg;

    if (enc->perceptual_tuning) {
        quantise_dwt_coefficients_perceptual_per_coeff(enc, enc->current_dwt_y, quantised_y, coeff_count, this_frame_qY, enc->width, enc->height, enc->decomp_levels, 0, enc->frame_count);
        quantise_dwt_coefficients_perceptual_per_coeff(enc, enc->current_dwt_co, quantised_co, coeff_count, this_frame_qCo, enc->width, enc->height, enc->decomp_levels, 1, enc->frame_count);
        quantise_dwt_coefficients_perceptual_per_coeff(enc, enc->current_dwt_cg, quantised_cg, coeff_count, this_frame_qCg, enc->width, enc->height, enc->decomp_levels, 1, enc->frame_count);
    } else {
        quantise_dwt_coefficients(enc->current_dwt_y, quantised_y, coeff_count, this_frame_qY, enc->dead_zone_threshold, enc->width, enc->height, enc->decomp_levels, 0);
        quantise_dwt_coefficients(enc->current_dwt_co, quantised_co, coeff_count, this_frame_qCo, enc->dead_zone_threshold, enc->width, enc->height, enc->decomp_levels, 1);
        quantise_dwt_coefficients(enc->current_dwt_cg, quantised_cg, coeff_count, this_frame_qCg, enc->dead_zone_threshold, enc->width, enc->height, enc->decomp_levels, 1);
    }

    // Quantise previous DWT coefficients (stored from last I-frame)
    if (enc->perceptual_tuning) {
        quantise_dwt_coefficients_perceptual_per_coeff(enc, enc->previous_coeffs_y, prev_quantised_y, coeff_count, this_frame_qY, enc->width, enc->height, enc->decomp_levels, 0, enc->frame_count);
        quantise_dwt_coefficients_perceptual_per_coeff(enc, enc->previous_coeffs_co, prev_quantised_co, coeff_count, this_frame_qCo, enc->width, enc->height, enc->decomp_levels, 1, enc->frame_count);
        quantise_dwt_coefficients_perceptual_per_coeff(enc, enc->previous_coeffs_cg, prev_quantised_cg, coeff_count, this_frame_qCg, enc->width, enc->height, enc->decomp_levels, 1, enc->frame_count);
    } else {
        quantise_dwt_coefficients(enc->previous_coeffs_y, prev_quantised_y, coeff_count, this_frame_qY, enc->dead_zone_threshold, enc->width, enc->height, enc->decomp_levels, 0);
        quantise_dwt_coefficients(enc->previous_coeffs_co, prev_quantised_co, coeff_count, this_frame_qCo, enc->dead_zone_threshold, enc->width, enc->height, enc->decomp_levels, 1);
        quantise_dwt_coefficients(enc->previous_coeffs_cg, prev_quantised_cg, coeff_count, this_frame_qCg, enc->dead_zone_threshold, enc->width, enc->height, enc->decomp_levels, 1);
    }

    // Compare quantised coefficients - pure integer math
    int diff_count = 0;
    for (int i = 0; i < coeff_count; i++) {
        if (quantised_y[i] != prev_quantised_y[i] ||
            quantised_co[i] != prev_quantised_co[i] ||
            quantised_cg[i] != prev_quantised_cg[i]) {
            diff_count++;
            // A single mismatch decides the result; the full count is only
            // needed for the verbose report
            if (!enc->verbose) {
                break;
            }
        }
    }

    free(prev_quantised_y);
    free(prev_quantised_co);
    free(prev_quantised_cg);

    if (enc->verbose) {
        printf("Still frame detection (DWT): %d/%d coeffs differ\n", diff_count, coeff_count);
    }

    // If all quantised coefficients match, frames are identical after compression
    return (diff_count == 0);
}
// Main function
int main(int argc, char *argv[]) {
generate_random_filename(TEMP_AUDIO_FILE);
printf("Initialising encoder...\n");
tav_encoder_t *enc = create_encoder();
if (!enc) {
fprintf(stderr, "Error: Failed to create encoder\n");
return 1;
}
// Command line option parsing (similar to TEV encoder)
static struct option long_options[] = {
{"input", required_argument, 0, 'i'},
{"output", required_argument, 0, 'o'},
{"size", required_argument, 0, 's'},
{"dimension", required_argument, 0, 's'},
{"fps", required_argument, 0, 'f'},
{"quality", required_argument, 0, 'q'},
{"quantizer", required_argument, 0, 'Q'},
{"quantiser", required_argument, 0, 'Q'},
{"wavelet", required_argument, 0, 1010},
{"channel-layout", required_argument, 0, 'c'},
{"bitrate", required_argument, 0, 'b'},
{"arate", required_argument, 0, 'a'},
{"subtitle", required_argument, 0, 'S'},
{"subtitles", required_argument, 0, 'S'},
{"verbose", no_argument, 0, 'v'},
{"test", no_argument, 0, 't'},
{"lossless", no_argument, 0, 1000},
{"intra-only", no_argument, 0, 1006},
{"intraonly", no_argument, 0, 1006},
{"ictcp", no_argument, 0, 1005},
{"no-perceptual-tuning", no_argument, 0, 1007},
{"no-dead-zone", no_argument, 0, 1013},
{"no-deadzone", no_argument, 0, 1013},
{"encode-limit", required_argument, 0, 1008},
{"dump-frame", required_argument, 0, 1009},
{"fontrom-lo", required_argument, 0, 1011},
{"fontrom-low", required_argument, 0, 1011},
{"fontrom-hi", required_argument, 0, 1012},
{"fontrom-high", required_argument, 0, 1012},
{"zstd-level", required_argument, 0, 1014},
{"interlace", no_argument, 0, 1015},
{"interlaced", no_argument, 0, 1015},
// {"no-grain-synthesis", no_argument, 0, 1016},
{"enable-delta", no_argument, 0, 1017},
{"delta-haar", required_argument, 0, 1018},
{"temporal-dwt", no_argument, 0, 1019},
{"temporal-3d", no_argument, 0, 1019},
{"help", no_argument, 0, '?'},
{0, 0, 0, 0}
};
int c, option_index = 0;
while ((c = getopt_long(argc, argv, "i:o:s:f:q:Q:a:w:c:d:b:S:vt?", long_options, &option_index)) != -1) {
switch (c) {
case 'i':
enc->input_file = strdup(optarg);
break;
case 'o':
enc->output_file = strdup(optarg);
break;
case 's':
if (!parse_resolution(optarg, &enc->width, &enc->height)) {
fprintf(stderr, "Invalid resolution format: %s\n", optarg);
cleanup_encoder(enc);
return 1;
}
break;
case 'q':
enc->quality_level = CLAMP(atoi(optarg), 0, 6);
enc->quantiser_y = QUALITY_Y[enc->quality_level];
enc->quantiser_co = QUALITY_CO[enc->quality_level];
enc->quantiser_cg = QUALITY_CG[enc->quality_level];
enc->dead_zone_threshold = DEAD_ZONE_THRESHOLD[enc->quality_level];
break;
case 'Q':
// Parse quantiser values Y,Co,Cg
if (sscanf(optarg, "%d,%d,%d", &enc->quantiser_y, &enc->quantiser_co, &enc->quantiser_cg) != 3) {
fprintf(stderr, "Error: Invalid quantiser format. Use Y,Co,Cg (e.g., 5,3,2)\n");
cleanup_encoder(enc);
return 1;
}
enc->quantiser_y = CLAMP(enc->quantiser_y, 0, 255);
enc->quantiser_co = CLAMP(enc->quantiser_co, 0, 255);
enc->quantiser_cg = CLAMP(enc->quantiser_cg, 0, 255);
break;
case 1010: // --wavelet
enc->wavelet_filter = CLAMP(atoi(optarg), 0, 255);
break;
case 'b': {
int bitrate = atoi(optarg);
if (bitrate <= 0) {
fprintf(stderr, "Error: Invalid target bitrate: %d\n", bitrate);
cleanup_encoder(enc);
return 1;
}
enc->bitrate_mode = 1;
enc->target_bitrate = bitrate;
// Choose initial q-index based on target bitrate
if (bitrate >= 64000) {
enc->quality_level = 6;
} else if (bitrate >= 32000) {
enc->quality_level = 5;
} else if (bitrate >= 16000) {
enc->quality_level = 4;
} else if (bitrate >= 8000) {
enc->quality_level = 3;
} else if (bitrate >= 4000) {
enc->quality_level = 2;
} else if (bitrate >= 2000) {
enc->quality_level = 1;
} else {
enc->quality_level = 0;
}
enc->quantiser_y = QUALITY_Y[enc->quality_level];
enc->quantiser_co = QUALITY_CO[enc->quality_level];
enc->quantiser_cg = QUALITY_CG[enc->quality_level];
enc->dead_zone_threshold = DEAD_ZONE_THRESHOLD[enc->quality_level];
break;
}
case 'c': {
int layout = atoi(optarg);
if (layout < 0 || layout > 5) {
fprintf(stderr, "Error: Invalid channel layout %d. Valid range: 0-5\n", layout);
cleanup_encoder(enc);
return 1;
}
enc->channel_layout = layout;
if (enc->verbose) {
printf("Channel layout set to %d (%s)\n", enc->channel_layout,
channel_layouts[enc->channel_layout].channels[0] ?
channel_layouts[enc->channel_layout].channels[0] : "unknown");
}
break;
}
case 'f':
enc->output_fps = atoi(optarg);
if (enc->output_fps <= 0) {
fprintf(stderr, "Invalid FPS: %d\n", enc->output_fps);
cleanup_encoder(enc);
return 1;
}
break;
case 'v':
enc->verbose = 1;
break;
case 't':
enc->test_mode = 1;
break;
case 'S':
enc->subtitle_file = strdup(optarg);
break;
case 1000: // --lossless
enc->lossless = 1;
enc->wavelet_filter = WAVELET_5_3_REVERSIBLE;
break;
case 1005: // --ictcp
enc->ictcp_mode = 1;
break;
case 1006: // --intra-only
enc->intra_only = 1;
break;
case 1007: // --no-perceptual-tuning
enc->perceptual_tuning = 0;
break;
case 1013: // --no-dead-zone
enc->dead_zone_threshold = 0.0f;
break;
case 1008: // --encode-limit
enc->encode_limit = atoi(optarg);
if (enc->encode_limit < 0) {
fprintf(stderr, "Error: Invalid encode limit: %d\n", enc->encode_limit);
cleanup_encoder(enc);
return 1;
}
break;
case 1009: // --dump-frame
debugDumpFrameTarget = atoi(optarg);
break;
case 1011: // --fontrom-lo
enc->fontrom_lo_file = strdup(optarg);
break;
case 1012: // --fontrom-hi
enc->fontrom_hi_file = strdup(optarg);
break;
case 1014: // --zstd-level
enc->zstd_level = atoi(optarg);
if (enc->zstd_level < 1 || enc->zstd_level > 22) {
fprintf(stderr, "Error: Zstd compression level must be between 1 and 22 (got %d)\n", enc->zstd_level);
cleanup_encoder(enc);
return 1;
}
break;
case 1015: // --interlaced
enc->progressive_mode = 0;
break;
case 1016: // --no-grain-synthesis
enc->grain_synthesis = 0;
break;
case 1017: // --enable-delta
enc->use_delta_encoding = 1;
break;
case 1018: // --delta-haar
enc->delta_haar_levels = CLAMP(atoi(optarg), 0, 6);
if (enc->delta_haar_levels > 0) {
enc->use_delta_encoding = 1; // Auto-enable delta encoding
}
break;
case 1019: // --temporal-dwt / --temporal-3d
enc->use_delta_encoding = 0; // two modes are mutually exclusive
enc->enable_temporal_dwt = 1;
printf("Temporal 3D DWT encoding enabled (GOP size: %d frames)\n", GOP_SIZE);
break;
case 'a':
int bitrate = atoi(optarg);
int valid_bitrate = validate_mp2_bitrate(bitrate);
if (valid_bitrate == 0) {
fprintf(stderr, "Error: Invalid MP2 bitrate %d. Valid values are: ", bitrate);
for (int i = 0; i < sizeof(MP2_VALID_BITRATES) / sizeof(int); i++) {
fprintf(stderr, "%d%s", MP2_VALID_BITRATES[i],
(i < sizeof(MP2_VALID_BITRATES) / sizeof(int) - 1) ? ", " : "\n");
}
cleanup_encoder(enc);
return 1;
}
enc->audio_bitrate = valid_bitrate;
break;
case 1004: // --help
show_usage(argv[0]);
cleanup_encoder(enc);
return 0;
default:
show_usage(argv[0]);
cleanup_encoder(enc);
return 1;
}
}
// adjust encoding parameters for ICtCp
if (enc->ictcp_mode) {
enc->quantiser_cg = enc->quantiser_co;
}
// Halve internal height for interlaced mode (FFmpeg will output half-height fields)
if (!enc->progressive_mode) {
enc->height = enc->height / 2;
if (enc->verbose) {
printf("Interlaced mode: internal height adjusted to %d\n", enc->height);
}
enc->intra_only = 1;
}
// disable perceptual tuning if wavelet filter is not CDF 9/7
if (enc->wavelet_filter != WAVELET_9_7_IRREVERSIBLE) {
enc->perceptual_tuning = 0;
}
// disable monoblock mode if either width or height exceeds the tile size
if (enc->width > TILE_SIZE_X || enc->height > TILE_SIZE_Y) {
enc->monoblock = 0;
}
if (enc->lossless) {
enc->quality_level = sizeof(MP2_RATE_TABLE) / sizeof(int); // use maximum quality table to disable anisotropy
enc->perceptual_tuning = 0;
enc->quantiser_y = 0; // will be resolved to 1
enc->quantiser_co = 0; // ditto
enc->quantiser_cg = 0; // do.
enc->intra_only = 1;
enc->dead_zone_threshold = 0.0f;
enc->audio_bitrate = 384;
}
// if user made `-q 6 -Q0,0,0 -w 0 --intra-only --no-perceptual-tuning --arate 384` manually, mark the video as lossless
int qtsize = sizeof(MP2_RATE_TABLE) / sizeof(int);
if (enc->quality_level == qtsize && enc->quantiser_y == 0 && enc->quantiser_co == 0 && enc->quantiser_cg == 0 &&
enc->perceptual_tuning == 0 && enc->intra_only == 1 && enc->dead_zone_threshold == 0.0f && enc->audio_bitrate == 384
) {
enc->lossless = 1;
}
if ((!enc->input_file && !enc->test_mode) || !enc->output_file) {
fprintf(stderr, "Error: Input and output files must be specified\n");
show_usage(argv[0]);
cleanup_encoder(enc);
return 1;
}
if (initialise_encoder(enc) != 0) {
fprintf(stderr, "Error: Failed to initialise encoder\n");
cleanup_encoder(enc);
return 1;
}
printf("TAV Encoder - DWT-based video compression\n");
printf("Input: %s\n", enc->input_file);
printf("Output: %s\n", enc->output_file);
printf("Resolution: %dx%d @ %dfps\n", enc->width, enc->height, enc->output_fps);
printf("Wavelet: %s\n",
enc->wavelet_filter == WAVELET_5_3_REVERSIBLE ? "CDF 5/3" :
enc->wavelet_filter == WAVELET_9_7_IRREVERSIBLE ? "CDF 9/7" :
enc->wavelet_filter == WAVELET_BIORTHOGONAL_13_7 ? "CDF 13/7" :
enc->wavelet_filter == WAVELET_DD4 ? "DD 4-tap" :
enc->wavelet_filter == WAVELET_HAAR ? "Haar" : "unknown");
printf("Decomposition levels: %d\n", enc->decomp_levels);
printf("Colour space: %s\n", enc->ictcp_mode ? "ICtCp" : "YCoCg-R");
printf("Quantisation: %s\n", enc->perceptual_tuning ? "Perceptual (HVS-optimised)" : "Uniform");
if (enc->ictcp_mode) {
printf("Base quantiser: I=%d, Ct=%d, Cp=%d\n", QLUT[enc->quantiser_y], QLUT[enc->quantiser_co], QLUT[enc->quantiser_cg]);
} else {
printf("Base quantiser: Y=%d, Co=%d, Cg=%d\n", QLUT[enc->quantiser_y], QLUT[enc->quantiser_co], QLUT[enc->quantiser_cg]);
}
// Open output file
if (strcmp(enc->output_file, "-") == 0) {
enc->output_fp = stdout;
} else {
enc->output_fp = fopen(enc->output_file, "wb");
if (!enc->output_fp) {
fprintf(stderr, "Error: Cannot open output file %s\n", enc->output_file);
cleanup_encoder(enc);
return 1;
}
}
// Capture FFmpeg version and creation time for extended header
enc->ffmpeg_version = get_ffmpeg_version();
struct timeval tv;
gettimeofday(&tv, NULL);
enc->creation_time_ns = (uint64_t)tv.tv_sec * 1000000000ULL + (uint64_t)tv.tv_usec * 1000ULL;
// Start FFmpeg process for video input (using TEV-compatible filtergraphs)
if (enc->test_mode) {
// Test mode - generate solid colour frames
enc->total_frames = 15; // Fixed 15 test frames like TEV
printf("Test mode: Generating %d solid colour frames\n", enc->total_frames);
} else {
// Normal mode - get video metadata first
printf("Retrieving video metadata...\n");
if (!get_video_metadata(enc)) {
fprintf(stderr, "Error: Failed to get video metadata\n");
cleanup_encoder(enc);
return 1;
}
// Start video preprocessing pipeline
if (start_video_conversion(enc) != 1) {
fprintf(stderr, "Error: Failed to start video conversion\n");
cleanup_encoder(enc);
return 1;
}
// Start audio conversion if needed
if (enc->has_audio) {
printf("Starting audio conversion...\n");
if (!start_audio_conversion(enc)) {
fprintf(stderr, "Warning: Audio conversion failed\n");
enc->has_audio = 0;
}
}
}
// Parse subtitles if provided
if (enc->subtitle_file) {
printf("Parsing subtitles: %s\n", enc->subtitle_file);
enc->subtitles = parse_subtitle_file(enc->subtitle_file, enc->output_fps);
if (NULL == enc->subtitles) {
fprintf(stderr, "Warning: Failed to parse subtitle file\n");
} else {
printf("Loaded subtitles successfully\n");
}
}
// Write TAV header
if (write_tav_header(enc) != 0) {
fprintf(stderr, "Error: Failed to write TAV header\n");
cleanup_encoder(enc);
return 1;
}
// Write extended header packet (before first timecode)
gettimeofday(&enc->start_time, NULL);
enc->extended_header_offset = write_extended_header(enc);
// Write font ROM packets if provided
if (enc->fontrom_lo_file) {
if (write_fontrom_packet(enc->output_fp, enc->fontrom_lo_file, 0x80) != 0) {
fprintf(stderr, "Warning: Failed to write low font ROM, continuing without it\n");
}
}
if (enc->fontrom_hi_file) {
if (write_fontrom_packet(enc->output_fp, enc->fontrom_hi_file, 0x81) != 0) {
fprintf(stderr, "Warning: Failed to write high font ROM, continuing without it\n");
}
}
if (enc->output_fps != enc->fps) {
printf("Frame rate conversion enabled: %d fps output\n", enc->output_fps);
}
printf("Starting encoding...\n");
// Main encoding loop - process frames until EOF or frame limit
int frame_count = 0;
int true_frame_count = 0;
int continue_encoding = 1;
// Write timecode packet for frame 0 (before the first frame group)
write_timecode_packet(enc->output_fp, 0, enc->output_fps, enc->is_ntsc_framerate);
while (continue_encoding) {
// Check encode limit if specified
if (enc->encode_limit > 0 && frame_count >= enc->encode_limit) {
printf("Reached encode limit of %d frames, finalising...\n", enc->encode_limit);
continue_encoding = 0;
break;
}
// Write timecode packet for frames 1+ (right after sync packet from previous frame)
// Skip timecode emission in temporal DWT mode (GOP handles its own timecodes)
if (frame_count > 0 && !enc->enable_temporal_dwt) {
write_timecode_packet(enc->output_fp, frame_count, enc->output_fps, enc->is_ntsc_framerate);
}
if (enc->test_mode) {
// Test mode has a fixed frame count
if (frame_count >= enc->total_frames) {
continue_encoding = 0;
break;
}
// Generate test frame with solid colours (TEV-style)
size_t rgb_size = enc->width * enc->height * 3;
uint8_t test_r = 0, test_g = 0, test_b = 0;
const char* colour_name = "unknown";
switch (frame_count) {
case 0: test_r = 0; test_g = 0; test_b = 0; colour_name = "black"; break;
case 1: test_r = 127; test_g = 127; test_b = 127; colour_name = "grey"; break;
case 2: test_r = 255; test_g = 255; test_b = 255; colour_name = "white"; break;
case 3: test_r = 127; test_g = 0; test_b = 0; colour_name = "half red"; break;
case 4: test_r = 127; test_g = 127; test_b = 0; colour_name = "half yellow"; break;
case 5: test_r = 0; test_g = 127; test_b = 0; colour_name = "half green"; break;
case 6: test_r = 0; test_g = 127; test_b = 127; colour_name = "half cyan"; break;
case 7: test_r = 0; test_g = 0; test_b = 127; colour_name = "half blue"; break;
case 8: test_r = 127; test_g = 0; test_b = 127; colour_name = "half magenta"; break;
case 9: test_r = 255; test_g = 0; test_b = 0; colour_name = "red"; break;
case 10: test_r = 255; test_g = 255; test_b = 0; colour_name = "yellow"; break;
case 11: test_r = 0; test_g = 255; test_b = 0; colour_name = "green"; break;
case 12: test_r = 0; test_g = 255; test_b = 255; colour_name = "cyan"; break;
case 13: test_r = 0; test_g = 0; test_b = 255; colour_name = "blue"; break;
case 14: test_r = 255; test_g = 0; test_b = 255; colour_name = "magenta"; break;
}
// Fill frame with test colour
for (size_t i = 0; i < rgb_size; i += 3) {
enc->current_frame_rgb[i] = test_r;
enc->current_frame_rgb[i + 1] = test_g;
enc->current_frame_rgb[i + 2] = test_b;
}
printf("Frame %d: %s (%d,%d,%d)\n", frame_count, colour_name, test_r, test_g, test_b);
} else {
// Real video mode - read frame from FFmpeg
// height-halving is already done on the encoder initialisation
int frame_height = enc->height;
size_t rgb_size = enc->width * frame_height * 3;
size_t bytes_read = fread(enc->current_frame_rgb, 1, rgb_size, enc->ffmpeg_video_pipe);
if (bytes_read != rgb_size) {
if (enc->verbose) {
printf("Frame %d: Expected %zu bytes, got %zu bytes\n", frame_count, rgb_size, bytes_read);
if (feof(enc->ffmpeg_video_pipe)) {
printf("FFmpeg pipe reached end of file\n");
}
if (ferror(enc->ffmpeg_video_pipe)) {
printf("FFmpeg pipe error occurred\n");
}
}
continue_encoding = 0;
break;
}
// Each frame from FFmpeg is now a single field at half height (for interlaced)
// Frame parity: even frames (0,2,4...) = bottom fields, odd frames (1,3,5...) = top fields
}
// Determine frame type
int is_scene_change = detect_scene_change(enc);
int is_time_keyframe = (frame_count % GOP_SIZE) == 0;
// Check if we can use SKIP mode (RGB pixel-difference detection via detect_still_frame)
int is_still = detect_still_frame(enc);
enc->is_still_frame_cached = is_still; // Cache for use in compress_and_write_frame
// SKIP mode can be used if frame is still (detect_still_frame_dwt already checks against I-frame)
// SKIP runs can continue as long as frames remain identical to the reference I-frame
int in_skip_run = enc->used_skip_mode_last_frame;
int can_use_skip = is_still && enc->previous_coeffs_allocated;
// During a SKIP run, suppress keyframe timer unless content changes enough to un-skip
// Un-skip threshold is the negation of SKIP threshold: content must change to break the run
int suppress_keyframe_timer = in_skip_run && is_still;
// Keyframe decision: intra-only mode, time-based (unless suppressed by SKIP run), scene change,
// or when delta encoding is disabled and skip mode cannot be used (pure INTRA frames)
int is_keyframe = enc->intra_only ||
(is_time_keyframe && !suppress_keyframe_timer) ||
is_scene_change ||
(!enc->use_delta_encoding && !can_use_skip);
// Track if we'll use SKIP mode this frame (continues the SKIP run)
enc->used_skip_mode_last_frame = can_use_skip && !is_keyframe;
// Verbose output for keyframe decisions
/*if (enc->verbose && is_keyframe) {
if (is_scene_change && !is_time_keyframe) {
printf("Frame %d: Scene change detected, inserting keyframe\n", frame_count);
} else if (is_time_keyframe) {
printf("Frame %d: Time-based keyframe (interval: %d)\n", frame_count, GOP_SIZE);
}
}*/
// Debug: check RGB input data
/*if (frame_count < 3) {
printf("Encoder Debug: Frame %d - RGB data (first 16 bytes): ", frame_count);
for (int i = 0; i < 16; i++) {
printf("%d ", enc->current_frame_rgb[i]);
}
printf("\n");
}*/
// Convert RGB to colour space (YCoCg-R or ICtCp)
rgb_to_colour_space_frame(enc, enc->current_frame_rgb,
enc->current_frame_y, enc->current_frame_co, enc->current_frame_cg,
enc->width, enc->height);
// Debug: check YCoCg conversion result
/*if (frame_count < 3) {
printf("Encoder Debug: Frame %d - YCoCg result (first 16): ", frame_count);
for (int i = 0; i < 16; i++) {
printf("Y=%.1f Co=%.1f Cg=%.1f ", enc->current_frame_y[i], enc->current_frame_co[i], enc->current_frame_cg[i]);
if (i % 4 == 3) break; // Only show first 4 pixels for readability
}
printf("\n");
}*/
// GOP-based temporal 3D DWT encoding path (when enabled)
size_t packet_size = 0;
if (enc->enable_temporal_dwt) {
// Add frame to GOP buffer
int add_result = gop_add_frame(enc, enc->current_frame_rgb,
enc->current_frame_y, enc->current_frame_co, enc->current_frame_cg);
if (add_result != 0) {
fprintf(stderr, "Error: Failed to add frame %d to GOP buffer\n", frame_count);
break;
}
// Check if GOP should be flushed
int should_flush = 0;
int force_flush = 0;
// Flush if GOP is full
if (gop_is_full(enc)) {
should_flush = 1;
if (enc->verbose) {
printf("GOP buffer full (%d frames), flushing...\n", enc->gop_frame_count);
}
}
// Flush if large motion detected (breaks temporal coherence)
else if (gop_should_flush_motion(enc)) {
should_flush = 1;
if (enc->verbose) {
printf("Large motion detected (>24 pixels), flushing GOP early...\n");
}
}
// Flush if scene change detected
else if (is_scene_change && enc->gop_frame_count > 1) {
should_flush = 1;
force_flush = 1; // Skip internal scene change detection (already detected)
if (enc->verbose) {
printf("Scene change detected, flushing GOP early...\n");
}
}
// Flush GOP if needed
if (should_flush) {
// Build frame number array for this GOP
int *gop_frame_numbers = malloc(enc->gop_frame_count * sizeof(int));
for (int i = 0; i < enc->gop_frame_count; i++) {
gop_frame_numbers[i] = frame_count - enc->gop_frame_count + 1 + i;
}
// Get quantiser (use adjusted quantiser from bitrate control if applicable)
int qY = enc->bitrate_mode ? quantiser_float_to_int_dithered(enc) : enc->quantiser_y;
// Process and flush GOP with scene change detection
packet_size = gop_process_and_flush(enc, enc->output_fp, qY,
gop_frame_numbers, force_flush);
free(gop_frame_numbers);
if (packet_size == 0) {
fprintf(stderr, "Error: Failed to flush GOP at frame %d\n", frame_count);
break;
}
} else {
// Frame added to GOP buffer but not flushed yet
// Skip normal packet processing (no packet written yet)
packet_size = 0;
}
} else {
// Traditional 2D DWT encoding path (no temporal transform)
uint8_t packet_type = is_keyframe ? TAV_PACKET_IFRAME : TAV_PACKET_PFRAME;
packet_size = compress_and_write_frame(enc, packet_type);
}
if (packet_size == 0 && !enc->enable_temporal_dwt) {
// Traditional 2D path: packet_size == 0 means encoding failed
fprintf(stderr, "Error: Failed to compress frame %d\n", frame_count);
break;
}
// Process audio/subtitles and sync packets only when frames were actually written
if (packet_size > 0) {
// Update bitrate tracking with compressed video packet size
if (enc->bitrate_mode) {
// For GOP-based encoding, packet_size covers multiple frames
// For traditional encoding, packet_size includes packet header (5 bytes)
size_t video_data_size = packet_size;
update_video_rate_bin(enc, video_data_size);
adjust_quantiser_for_bitrate(enc);
}
// For GOP encoding, process audio/subtitles for all frames in the flushed GOP
// For traditional encoding, process audio/subtitles for this single frame
if (enc->enable_temporal_dwt) {
// Note: In GOP mode, audio/subtitle sync is approximate since we flush multiple frames at once
// This is acceptable since GOPs are short (16 frames max = ~0.5s at 30fps)
// TODO: Consider buffering audio/subtitles for precise sync if needed
}
// Process audio for this frame
process_audio(enc, true_frame_count, enc->output_fp);
// Process subtitles for this frame
process_subtitles(enc, true_frame_count, enc->output_fp);
// Write a sync packet only after a video packet has been coded
uint8_t sync_packet = TAV_PACKET_SYNC;
fwrite(&sync_packet, 1, 1, enc->output_fp);
// NTSC frame duplication: emit extra sync packet for every 1000n+500 frames
if (enc->is_ntsc_framerate && (frame_count % 1000 == 500)) {
true_frame_count++;
// Process audio and subtitles for the duplicated frame to maintain sync
process_audio(enc, true_frame_count, enc->output_fp);
process_subtitles(enc, true_frame_count, enc->output_fp);
uint8_t sync_packet_ntsc = TAV_PACKET_SYNC_NTSC;
fwrite(&sync_packet_ntsc, 1, 1, enc->output_fp);
printf("Frame %d: NTSC duplication - extra sync packet emitted with audio/subtitle sync\n", frame_count);
}
}
// Swap ping-pong buffers (eliminates memcpy operations)
swap_frame_buffers(enc);
frame_count++;
true_frame_count++;
enc->frame_count = frame_count;
if (enc->verbose || frame_count % 30 == 0) {
struct timeval now;
gettimeofday(&now, NULL);
double elapsed = (now.tv_sec - enc->start_time.tv_sec) +
(now.tv_usec - enc->start_time.tv_usec) / 1000000.0;
double fps = frame_count / elapsed;
int display_qY = enc->bitrate_mode ? quantiser_float_to_int_dithered(enc) : enc->quantiser_y;
printf("Encoded frame %d (%s, %.1f fps, qY=%d)\n", frame_count,
is_keyframe ? "I-frame" : "P-frame", fps, QLUT[display_qY]);
}
}
// Flush any remaining GOP frames (temporal 3D DWT mode only)
if (enc->enable_temporal_dwt && enc->gop_frame_count > 0) {
printf("Flushing remaining %d frames from GOP buffer...\n", enc->gop_frame_count);
// Build frame number array for remaining GOP
int *gop_frame_numbers = malloc(enc->gop_frame_count * sizeof(int));
for (int i = 0; i < enc->gop_frame_count; i++) {
gop_frame_numbers[i] = frame_count - enc->gop_frame_count + 1 + i;
}
// Get quantiser (use adjusted quantiser from bitrate control if applicable)
int qY = enc->bitrate_mode ? quantiser_float_to_int_dithered(enc) : enc->quantiser_y;
// Flush remaining GOP with force_flush=1 to process all frames
size_t final_packet_size = gop_process_and_flush(enc, enc->output_fp, qY,
gop_frame_numbers, 1);
free(gop_frame_numbers);
if (final_packet_size == 0) {
fprintf(stderr, "Warning: Failed to flush final GOP frames\n");
} else {
// Write sync packet after final GOP
uint8_t sync_packet = TAV_PACKET_SYNC;
fwrite(&sync_packet, 1, 1, enc->output_fp);
printf("Final GOP flushed successfully (%zu bytes)\n", final_packet_size);
}
}
// Update actual frame count in encoder struct
enc->total_frames = frame_count;
// Update header with actual frame count (seek back to header position)
if (enc->output_fp != stdout) {
long current_pos = ftell(enc->output_fp);
fseek(enc->output_fp, 14, SEEK_SET); // Offset of total_frames field in TAV header
uint32_t actual_frames = frame_count;
fwrite(&actual_frames, sizeof(uint32_t), 1, enc->output_fp);
fseek(enc->output_fp, current_pos, SEEK_SET); // Restore position
if (enc->verbose) {
printf("Updated header with actual frame count: %d\n", frame_count);
}
// Update ENDT in extended header (calculate end time for last frame)
uint64_t endt_ns;
if (enc->is_ntsc_framerate) {
endt_ns = ((uint64_t)(frame_count - 1) * 1001000000ULL) / 30000ULL;
} else {
endt_ns = ((uint64_t)(frame_count - 1) * 1000000000ULL) / (uint64_t)enc->output_fps;
}
fseek(enc->output_fp, enc->extended_header_offset, SEEK_SET);
fwrite(&endt_ns, sizeof(uint64_t), 1, enc->output_fp);
fseek(enc->output_fp, current_pos, SEEK_SET); // Restore position
if (enc->verbose) {
printf("Updated ENDT in extended header: %llu ns\n", (unsigned long long)endt_ns);
}
}
// Final statistics
struct timeval end_time;
gettimeofday(&end_time, NULL);
double total_time = (end_time.tv_sec - enc->start_time.tv_sec) +
(end_time.tv_usec - enc->start_time.tv_usec) / 1000000.0;
printf("\nEncoding complete!\n");
printf(" Frames encoded: %d\n", frame_count);
printf(" Framerate: %d\n", enc->output_fps);
printf(" Output size: %zu bytes\n", enc->total_compressed_size);
printf(" Encoding time: %.2fs (%.1f fps)\n", total_time, frame_count / total_time);
printf(" Frame statistics: INTRA=%lu, DELTA=%lu, SKIP=%lu\n", count_intra, count_delta, count_skip);
cleanup_encoder(enc);
return 0;
}
// Release every resource held by the encoder, then the encoder itself.
//
// Accepts NULL (no-op). After this returns, `enc` and every pointer stored
// in it have been freed and must not be used again.
//
// Side effects beyond freeing memory:
//   - closes the FFmpeg video pipe with pclose() if it is open
//   - closes the MP2 audio file and unlinks TEMP_AUDIO_FILE if it is open
//   - closes the output stream; a failed close is reported on stderr
static void cleanup_encoder(tav_encoder_t *enc) {
    if (!enc) return;

    // --- Streams and child processes ---
    if (enc->ffmpeg_video_pipe) {
        pclose(enc->ffmpeg_video_pipe);
    }
    if (enc->mp2_file) {
        fclose(enc->mp2_file);
        unlink(TEMP_AUDIO_FILE);
    }
    if (enc->output_fp) {
        // fclose() performs the final flush of buffered output, so a failure
        // here (e.g. disk full) means the written file is likely truncated.
        // This function returns void, so the best we can do is report it.
        if (fclose(enc->output_fp) != 0) {
            perror("Warning: Failed to close output file (output may be incomplete)");
        }
    }

    // --- Heap-allocated strings ---
    free(enc->input_file);
    free(enc->output_file);
    free(enc->subtitle_file);
    free(enc->fontrom_lo_file);
    free(enc->fontrom_hi_file);
    free(enc->ffmpeg_version);

    // --- Frame buffers ---
    free(enc->frame_rgb[0]);
    free(enc->frame_rgb[1]);
    free(enc->current_frame_y);
    free(enc->current_frame_co);
    free(enc->current_frame_cg);
    free(enc->current_frame_alpha);

    // --- Working buffers ---
    free(enc->tiles);
    free(enc->compressed_buffer);
    free(enc->mp2_buffer);

    // Reusable quantisation buffers
    free(enc->reusable_quantised_y);
    free(enc->reusable_quantised_co);
    free(enc->reusable_quantised_cg);
    free(enc->reusable_quantised_alpha);

    // Coefficient delta storage
    free(enc->previous_coeffs_y);
    free(enc->previous_coeffs_co);
    free(enc->previous_coeffs_cg);
    free(enc->previous_coeffs_alpha);

    // Bitrate control structures
    free(enc->video_rate_bin);

    // --- GOP buffers ---
    // Each non-NULL GOP array holds gop_capacity per-frame buffers. Free the
    // per-frame buffers first, then the arrays themselves. An array that was
    // never allocated is skipped inside the loop; free(NULL) below is a no-op.
    for (int i = 0; i < enc->gop_capacity; i++) {
        if (enc->gop_rgb_frames) {
            free(enc->gop_rgb_frames[i]);
        }
        if (enc->gop_y_frames) {
            free(enc->gop_y_frames[i]);
        }
        if (enc->gop_co_frames) {
            free(enc->gop_co_frames[i]);
        }
        if (enc->gop_cg_frames) {
            free(enc->gop_cg_frames[i]);
        }
    }
    free(enc->gop_rgb_frames);
    free(enc->gop_y_frames);
    free(enc->gop_co_frames);
    free(enc->gop_cg_frames);
    free(enc->gop_translation_x);
    free(enc->gop_translation_y);

    // --- Subtitle list ---
    if (enc->subtitles) {
        free_subtitle_list(enc->subtitles);
    }

    // --- Zstd compression context ---
    if (enc->zstd_ctx) {
        ZSTD_freeCCtx(enc->zstd_ctx);
    }

    free(enc);
}