diff --git a/CLAUDE.md b/CLAUDE.md index 01b8eb9..709aba5 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -168,6 +168,7 @@ Peripheral memories can be accessed using `vm.peek()` and `vm.poke()` functions, - **Wavelet Support**: Multiple wavelet types for different compression characteristics - **JS Decoder**: `assets/disk0/tvdos/bin/playtav.js` - Native decoder for TAV format playback - **Hardware accelerated decoding**: Extended GraphicsJSR223Delegate.kt with TAV functions +- **Packet analyser**: `video_encoder/tav_inspector.c` - Debugging tool that parses TAV packets into human-readable form - **Features**: - **Multiple Wavelet Types**: 5/3 reversible, 9/7 irreversible, CDF 13/7, DD-4, Haar - **Single-tile encoding**: One large DWT tile for optimal quality (no blocking artifacts) @@ -276,7 +277,7 @@ Implemented on 2025-10-15 for improved temporal compression through group-of-pic - **3D DWT**: Applies DWT in both spatial (2D) and temporal (1D) dimensions for optimal spacetime compression - **Unified GOP Preprocessing**: Single significance map for all frames and channels in a GOP (width×height×N_frames×3_channels) - **FFT-based Phase Correlation**: Uses FFTW3 library for accurate global motion estimation with quarter-pixel precision -- **GOP Size**: Typically 16 frames (configurable), with scene change detection for adaptive GOPs +- **GOP Size**: Typically 8 frames (configurable), with scene change detection for adaptive GOPs - **Single-frame Fallback**: GOP size of 1 automatically uses traditional I-frame encoding **Packet Format**: diff --git a/assets/disk0/tvdos/bin/playtav.js b/assets/disk0/tvdos/bin/playtav.js index 1c95e61..202081a 100644 --- a/assets/disk0/tvdos/bin/playtav.js +++ b/assets/disk0/tvdos/bin/playtav.js @@ -1005,8 +1005,12 @@ try { let motionY = new Array(gopSize) for (let i = 0; i < gopSize; i++) { - motionX[i] = seqread.readShort() // Signed int16 - motionY[i] = seqread.readShort() + // readShort() returns unsigned 16-bit, but motion vectors are 
signed int16 + let mx = seqread.readShort() + let my = seqread.readShort() + // Convert to signed: if > 32767, it's negative + motionX[i] = (mx > 32767) ? (mx - 65536) : mx + motionY[i] = (my > 32767) ? (my - 65536) : my } // Read compressed data size @@ -1019,7 +1023,7 @@ try { // Check if GOP fits in VM memory const gopMemoryNeeded = gopSize * FRAME_SIZE if (gopMemoryNeeded > MAXMEM) { - throw new Error(`GOP too large: ${gopSize} frames needs ${(gopMemoryNeeded / 1048576).toFixed(2)}MB, but VM has only ${(MAXMEM / 1048576).toFixed(1)}MB. Max GOP size: 11 frames for 8MB system.`) + throw new Error(`GOP too large: ${gopSize} frames needs ${(gopMemoryNeeded / 1048576).toFixed(2)}MB, but VM has only ${(MAXMEM / 1048576).toFixed(1)}MB. Max GOP size: 8 frames for 8MB system.`) } // Allocate GOP buffers outside try block so finally can free them diff --git a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt index cb96c43..fafdcfb 100644 --- a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt +++ b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt @@ -72,6 +72,7 @@ import kotlin.intArrayOf import kotlin.let import kotlin.longArrayOf import kotlin.math.* +import kotlin.math.pow import kotlin.repeat import kotlin.text.format import kotlin.text.lowercase @@ -4538,30 +4539,18 @@ class GraphicsJSR223Delegate(private val vm: VM) { * - Frames 8-15: Level 2 (tHH - highest frequency) */ private fun getTemporalSubbandLevel(frameIdx: Int, numFrames: Int, temporalLevels: Int): Int { - if (temporalLevels == 0) return 0 + // Match encoder logic exactly (encoder_tav.c:1487-1501) + // After temporal DWT with 2 levels: + // Frames 0...num_frames/(2^2) = tLL (temporal low-low, coarsest, level 0) + // Frames in first half but after tLL = tLH (level 1) + // Remaining frames = tH from first level (level 2, finest) - val framesPerSubband = numFrames shr temporalLevels // numFrames / 2^temporalLevels + val 
framesPerLevel0 = numFrames shr temporalLevels // e.g., 16 >> 2 = 4, or 8 >> 2 = 2 - // Safety check: ensure we have enough frames for the temporal levels - // Minimum frames needed = 2^temporalLevels - if (framesPerSubband == 0) { - // Not enough frames for this many temporal levels - treat all as base level - return 0 - } - - // Determine which temporal subband this frame belongs to - val subbandIdx = frameIdx / framesPerSubband - - // Map subband index to level (0 = tLL, 1+ = temporal high-pass levels) - return if (subbandIdx == 0) 0 else { - // Find highest bit position in subbandIdx to determine level - var level = 0 - var idx = subbandIdx - while (idx > 1) { - idx = idx shr 1 - level++ - } - level + 1 + return when { + frameIdx < framesPerLevel0 -> 0 // Coarsest temporal level (tLL) + frameIdx < (numFrames shr 1) -> 1 // First level high-pass (tLH) + else -> 2 // Finest level high-pass (tH from level 1) } } @@ -4575,9 +4564,10 @@ class GraphicsJSR223Delegate(private val vm: VM) { * - Level 2 (tHH): 1.0 × 2^1.6 = 3.03 */ private fun getTemporalQuantizerScale(temporalLevel: Int): Float { - val BETA = 0.8f - val TEMPORAL_BASE_SCALE = 1.0f - return TEMPORAL_BASE_SCALE * Math.pow(2.0, (BETA * temporalLevel).toDouble()).toFloat() + val BETA = 0.6f // Temporal scaling exponent (aggressive for temporal high-pass) + val KAPPA = 1.14f + val TEMPORAL_BASE_SCALE = 1.0f // Don't reduce tLL quantization (same as intra) + return TEMPORAL_BASE_SCALE * 2.0f.pow(BETA * temporalLevel.toFloat().pow(KAPPA)) } // level is one-based index @@ -6251,8 +6241,8 @@ class GraphicsJSR223Delegate(private val vm: VM) { * @param compressedDataPtr Pointer to compressed Zstd data * @param compressedSize Size of compressed data * @param gopSize Number of frames in GOP (1-16) - * @param motionVectorsX X motion vectors in quarter-pixel units - * @param motionVectorsY Y motion vectors in quarter-pixel units + * @param motionVectorsX X motion vectors in 1/16-pixel units + * @param motionVectorsY Y 
motion vectors in 1/16-pixel units * @param outputRGBAddrs Array of output RGB buffer addresses * @param width Frame width * @param height Frame height @@ -6363,10 +6353,10 @@ class GraphicsJSR223Delegate(private val vm: VM) { tavApplyInverse3DDWT(gopCg, width, height, gopSize, spatialLevels, temporalLevels, spatialFilter) // Step 7: Apply inverse motion compensation (shift frames back) - // Note: Motion vectors are in quarter-pixel units + // Note: Motion vectors are in 1/16-pixel units, cumulative relative to frame 0 for (t in 1 until gopSize) { // Skip frame 0 (reference) - val dx = motionVectorsX[t] / 4 // Convert to pixel units - val dy = motionVectorsY[t] / 4 + val dx = motionVectorsX[t] / 16 // Convert to pixel units + val dy = motionVectorsY[t] / 16 if (dx != 0 || dy != 0) { applyInverseTranslation(gopY[t], width, height, dx, dy) @@ -6486,36 +6476,30 @@ class GraphicsJSR223Delegate(private val vm: VM) { // Haar wavelet inverse 1D transform // The simplest wavelet: reverses averages and differences + // MUST match encoder's dwt_haar_inverse_1d exactly (encoder_tav.c:1265-1284) private fun tavApplyDWTHaarInverse1D(data: FloatArray, length: Int) { if (length < 2) return val temp = FloatArray(length) val half = (length + 1) / 2 - // Split into low and high frequency components - for (i in 0 until half) { - temp[i] = data[i] // Low-pass coefficients (averages) - } - for (i in 0 until length / 2) { - if (half + i < length) { - temp[half + i] = data[half + i] // High-pass coefficients (differences) - } - } - - // Haar inverse: reconstruct original samples from averages and differences + // Inverse Haar transform: reconstruct from averages and differences + // Read directly from data array (already has low-pass then high-pass layout) for (i in 0 until half) { if (2 * i + 1 < length) { - val avg = temp[i] // Average (low-pass) - val diff = if (half + i < length) temp[half + i] else 0.0f // Difference (high-pass) - - // Reconstruct original adjacent pair - data[2 * 
i] = avg + diff // First sample: average + difference - data[2 * i + 1] = avg - diff // Second sample: average - difference + // Reconstruct adjacent pairs from average and difference + temp[2 * i] = data[i] + data[half + i] // average + difference + temp[2 * i + 1] = data[i] - data[half + i] // average - difference } else { - // Handle odd length: last sample comes directly from low-pass - data[2 * i] = temp[i] + // Handle odd length: last sample comes from low-pass only + temp[2 * i] = data[i] } } + + // Copy reconstructed data back + for (i in 0 until length) { + data[i] = temp[i] + } } // ============================================================================= diff --git a/video_encoder/encoder_tav.c b/video_encoder/encoder_tav.c index cdc5e1c..f81b13b 100644 --- a/video_encoder/encoder_tav.c +++ b/video_encoder/encoder_tav.c @@ -17,7 +17,7 @@ #include #include -#define ENCODER_VENDOR_STRING "Encoder-TAV 20251016" +#define ENCODER_VENDOR_STRING "Encoder-TAV 20251017" // TSVM Advanced Video (TAV) format constants #define TAV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x41\x56" // "\x1FTSVM TAV" @@ -106,6 +106,7 @@ static int needs_alpha_channel(int channel_layout) { #define DEFAULT_ZSTD_LEVEL 9 #define GOP_SIZE 8 #define TEMPORAL_DECOMP_LEVEL 2 +#define MOTION_THRESHOLD 64.0f // Flush if motion exceeds 64 pixels in any direction // Audio/subtitle constants (reused from TEV) #define MP2_DEFAULT_PACKET_SIZE 1152 @@ -310,8 +311,8 @@ typedef struct tav_encoder_s { float **gop_y_frames; // [frame][pixel] - Y channel for each GOP frame float **gop_co_frames; // [frame][pixel] - Co channel for each GOP frame float **gop_cg_frames; // [frame][pixel] - Cg channel for each GOP frame - int16_t *gop_translation_x; // [frame] - Translation X in quarter-pixel units - int16_t *gop_translation_y; // [frame] - Translation Y in quarter-pixel units + int16_t *gop_translation_x; // [frame] - Translation X in 1/16-pixel units + int16_t *gop_translation_y; // [frame] - Translation Y in 
1/16-pixel units int temporal_decomp_levels; // Number of temporal DWT levels (default: 2) // Tile processing @@ -1316,7 +1317,7 @@ static void dwt_53_inverse_1d(float *data, int length) { // FFT-based phase correlation for global motion estimation // Uses FFTW3 to compute cross-power spectrum and find translation peak -// Returns quarter-pixel precision translation vectors +// Returns 1/16-pixel precision translation vectors static void phase_correlate_fft(const uint8_t *frame1_rgb, const uint8_t *frame2_rgb, int width, int height, int16_t *dx_qpel, int16_t *dy_qpel) { // Step 1: Convert RGB to grayscale @@ -1404,7 +1405,7 @@ static void phase_correlate_fft(const uint8_t *frame1_rgb, const uint8_t *frame2 if (dx > width / 2) dx -= width; if (dy > height / 2) dy -= height; - // Step 7: Quarter-pixel refinement using parabolic interpolation + // Step 7: Subpixel refinement using parabolic interpolation // Only refine if peak is not at boundary float subpixel_dx = 0.0f; float subpixel_dy = 0.0f; @@ -1434,12 +1435,12 @@ static void phase_correlate_fft(const uint8_t *frame1_rgb, const uint8_t *frame2 } } - // Step 8: Convert to quarter-pixel units + // Step 8: Convert to 1/16-pixel units (sixteenth-pixel precision) float final_dx = dx + subpixel_dx; float final_dy = dy + subpixel_dy; - *dx_qpel = (int16_t)roundf(final_dx * 4.0f); - *dy_qpel = (int16_t)roundf(final_dy * 4.0f); + *dx_qpel = (int16_t)roundf(final_dx * 16.0f); + *dy_qpel = (int16_t)roundf(final_dy * 16.0f); // Cleanup fftwf_destroy_plan(plan_fwd1); @@ -1456,9 +1457,9 @@ static void phase_correlate_fft(const uint8_t *frame1_rgb, const uint8_t *frame2 // Apply translation to frame (for frame alignment before temporal DWT) static void apply_translation(float *frame_data, int width, int height, int16_t dx_qpel, int16_t dy_qpel, float *output) { - // Convert quarter-pixel to pixel (for now, just use integer translation) - int dx = dx_qpel / 4; - int dy = dy_qpel / 4; + // Convert 1/16-pixel to pixel (for now, 
just use integer translation) + int dx = dx_qpel / 16; + int dy = dy_qpel / 16; // Apply translation with boundary handling for (int y = 0; y < height; y++) { @@ -1524,7 +1525,8 @@ static void quantise_3d_dwt_coefficients(tav_encoder_t *enc, int spatial_size, int base_quantiser, int is_chroma) { - const float BETA = 0.8f; // Temporal scaling exponent (aggressive for temporal high-pass) + const float BETA = 0.6f; // Temporal scaling exponent (aggressive for temporal high-pass) + const float KAPPA = 1.14f; const float TEMPORAL_BASE_SCALE = 1.0f; // Don't reduce tLL quantization (same as intra) // Process each temporal subband independently (separable approach) @@ -1541,7 +1543,7 @@ static void quantise_3d_dwt_coefficients(tav_encoder_t *enc, // - Level 0 (tLL): 16 * 1.0 * 2^0 = 16 (same as intra-only) // - Level 1 (tH): 16 * 1.0 * 2^2.0 = 64 (4× base, aggressive) // - Level 2 (tHH): 16 * 1.0 * 2^4.0 = 256 → clamped to 255 (very aggressive) - float temporal_scale = TEMPORAL_BASE_SCALE * powf(2.0f, BETA * temporal_level); + float temporal_scale = TEMPORAL_BASE_SCALE * powf(2.0f, BETA * powf(temporal_level, KAPPA)); float temporal_quantiser = base_quantiser * temporal_scale; // Convert to integer for quantization @@ -1604,10 +1606,10 @@ static int gop_add_frame(tav_encoder_t *enc, const uint8_t *frame_rgb, &enc->gop_translation_y[frame_idx]); if (enc->verbose && (frame_idx < 3 || frame_idx == enc->gop_capacity - 1)) { - printf(" GOP frame %d: translation = (%.2f, %.2f) pixels\n", + printf(" GOP frame %d: translation = (%.3f, %.3f) pixels\n", frame_idx, - enc->gop_translation_x[frame_idx] / 4.0f, - enc->gop_translation_y[frame_idx] / 4.0f); + enc->gop_translation_x[frame_idx] / 16.0f, + enc->gop_translation_y[frame_idx] / 16.0f); } } else { // First frame has no translation @@ -1644,13 +1646,12 @@ static int gop_should_flush_motion(tav_encoder_t *enc) { int16_t dx = enc->gop_translation_x[last_idx]; int16_t dy = enc->gop_translation_y[last_idx]; - // Convert 
quarter-pixel to pixels - float dx_pixels = fabsf(dx / 4.0f); - float dy_pixels = fabsf(dy / 4.0f); + // Convert 1/16-pixel to pixels + float dx_pixels = fabsf(dx / 16.0f); + float dy_pixels = fabsf(dy / 16.0f); // Flush if motion exceeds threshold (24 pixels in any direction) // This indicates likely scene change or very fast motion - const float MOTION_THRESHOLD = 24.0f; if (dx_pixels > MOTION_THRESHOLD || dy_pixels > MOTION_THRESHOLD) { if (enc->verbose) { @@ -1690,8 +1691,36 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser, memcpy(gop_cg_coeffs[i], enc->gop_cg_frames[i], num_pixels * sizeof(float)); } - // Step 0.5: Apply motion compensation to align frames before temporal DWT - // This uses the computed translation vectors to align each frame to the previous one + // Debug: Print original frame-to-frame motion vectors + if (enc->verbose && actual_gop_size >= 4) { + printf("Frame-to-frame motion vectors (before cumulative conversion):\n"); + for (int i = 0; i < actual_gop_size; i++) { + printf(" Frame %d: 1/16px=(%d, %d) pixels=(%.3f, %.3f)\n", + i, enc->gop_translation_x[i], enc->gop_translation_y[i], + enc->gop_translation_x[i] / 16.0f, enc->gop_translation_y[i] / 16.0f); + } + } + + // Step 0.5: Convert frame-to-frame motion vectors to cumulative (relative to frame 0) + // Phase correlation computes motion of frame[i] relative to frame[i-1] + // We need cumulative motion relative to frame 0 for proper alignment + for (int i = 2; i < actual_gop_size; i++) { + enc->gop_translation_x[i] += enc->gop_translation_x[i-1]; + enc->gop_translation_y[i] += enc->gop_translation_y[i-1]; + } + + // Debug: Print cumulative motion vectors + if (enc->verbose && actual_gop_size >= 4) { + printf("Cumulative motion vectors (after conversion):\n"); + for (int i = 0; i < actual_gop_size; i++) { + printf(" Frame %d: 1/16px=(%d, %d) pixels=(%.3f, %.3f)\n", + i, enc->gop_translation_x[i], enc->gop_translation_y[i], + enc->gop_translation_x[i] / 16.0f, 
enc->gop_translation_y[i] / 16.0f); + } + } + + // Step 0.6: Apply motion compensation to align frames before temporal DWT + // This uses the cumulative translation vectors to align each frame to frame 0 for (int i = 1; i < actual_gop_size; i++) { // Skip frame 0 (reference frame) float *aligned_y = malloc(num_pixels * sizeof(float)); float *aligned_co = malloc(num_pixels * sizeof(float)); @@ -1856,7 +1885,7 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser, fwrite(&gop_size_byte, 1, 1, output); total_bytes_written += 1; - // Write all motion vectors (quarter-pixel precision) for the entire GOP + // Write all motion vectors (1/16-pixel precision) for the entire GOP for (int t = 0; t < actual_gop_size; t++) { int16_t dx = enc->gop_translation_x[t]; int16_t dy = enc->gop_translation_y[t]; @@ -1973,34 +2002,39 @@ static size_t gop_process_and_flush(tav_encoder_t *enc, FILE *output, int base_q long long total_diff = 0; int changed_pixels = 0; - int num_pixels = enc->width * enc->height; - // Sample every 4th pixel for performance - for (int p = 0; p < num_pixels; p += 4) { - int offset = p * 3; - int r_diff = abs(frame2[offset] - frame1[offset]); - int g_diff = abs(frame2[offset + 1] - frame1[offset + 1]); - int b_diff = abs(frame2[offset + 2] - frame1[offset + 2]); + // Sample every 4th pixel for performance (still gives good detection) + for (int y = 0; y < enc->height; y += 2) { + for (int x = 0; x < enc->width; x += 2) { + int offset = (y * enc->width + x) * 3; - int pixel_diff = r_diff + g_diff + b_diff; - total_diff += pixel_diff; + // Calculate colour difference + int r_diff = abs(frame2[offset] - frame1[offset]); + int g_diff = abs(frame2[offset + 1] - frame1[offset + 1]); + int b_diff = abs(frame2[offset + 2] - frame1[offset + 2]); - if (pixel_diff > 90) { - changed_pixels++; + int pixel_diff = r_diff + g_diff + b_diff; + total_diff += pixel_diff; + + // Count significantly changed pixels (threshold of 30 per channel average) + if 
(pixel_diff > 90) { + changed_pixels++; + } } } // Scene change thresholds (same as detect_scene_change) - int sampled_pixels = (num_pixels + 3) / 4; + int sampled_pixels = (enc->height / 2) * (enc->width / 2); double avg_diff = (double)total_diff / sampled_pixels; - double change_ratio = (double)changed_pixels / sampled_pixels; + double changed_ratio = (double)changed_pixels / sampled_pixels; + double threshold = 0.30; // Scene change detected if either threshold exceeded - if (avg_diff > 15.0 || change_ratio > 0.4) { + if (changed_ratio > threshold) { scene_change_frame = i; if (enc->verbose) { printf("Scene change detected within GOP at frame %d (avg_diff=%.2f, change_ratio=%.2f)\n", - frame_numbers[i], avg_diff, change_ratio); + frame_numbers[i], avg_diff, changed_ratio); } break; } diff --git a/video_encoder/tav_inspector.c b/video_encoder/tav_inspector.c index 2b125d0..eca7bac 100644 --- a/video_encoder/tav_inspector.c +++ b/video_encoder/tav_inspector.c @@ -515,10 +515,10 @@ int main(int argc, char *argv[]) { // Always show motion vectors for GOP packets with absolute frame numbers if (gop_size > 0) { - printf("\n Motion vectors (quarter-pixel):"); + printf("\n Motion vectors (1/16-pixel):"); for (int i = 0; i < gop_size; i++) { - printf("\n Frame %d (#%d): (%.2f, %.2f) px", - current_frame + i, i, motion_x[i] / 4.0, motion_y[i] / 4.0); + printf("\n Frame %d (#%d): (%.3f, %.3f) px", + current_frame + i, i, motion_x[i] / 16.0, motion_y[i] / 16.0); } } }