TAV: 3D DWT makes coherent picture at least

2026-06-06 05:28:31 +09:00 · 2025-10-17 02:01:08 +09:00
parent 0cf1173dd6
commit 93622fc8ca
5 changed files with 117 additions and 94 deletions
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -168,6 +168,7 @@ Peripheral memories can be accessed using `vm.peek()` and `vm.poke()` functions,
  - **Wavelet Support**: Multiple wavelet types for different compression characteristics
 - **JS Decoder**: `assets/disk0/tvdos/bin/playtav.js` - Native decoder for TAV format playback
 - **Hardware accelerated decoding**: Extended GraphicsJSR223Delegate.kt with TAV functions
+- **Packet analyser**: `video_encoder/tav_inspector.c` - Debugging tool that parses TAV packets into human-readable form
 - **Features**:
  - **Multiple Wavelet Types**: 5/3 reversible, 9/7 irreversible, CDF 13/7, DD-4, Haar
  - **Single-tile encoding**: One large DWT tile for optimal quality (no blocking artifacts)
@@ -276,7 +277,7 @@ Implemented on 2025-10-15 for improved temporal compression through group-of-pic
 - **3D DWT**: Applies DWT in both spatial (2D) and temporal (1D) dimensions for optimal spacetime compression
 - **Unified GOP Preprocessing**: Single significance map for all frames and channels in a GOP (width×height×N_frames×3_channels)
 - **FFT-based Phase Correlation**: Uses FFTW3 library for accurate global motion estimation with quarter-pixel precision
- **GOP Size**: Typically 16 frames (configurable), with scene change detection for adaptive GOPs
+- **GOP Size**: Typically 8 frames (configurable), with scene change detection for adaptive GOPs
 - **Single-frame Fallback**: GOP size of 1 automatically uses traditional I-frame encoding

 **Packet Format**:
--- a/assets/disk0/tvdos/bin/playtav.js
+++ b/assets/disk0/tvdos/bin/playtav.js
@@ -1005,8 +1005,12 @@ try {
                let motionY = new Array(gopSize)

                for (let i = 0; i < gopSize; i++) {
-                    motionX[i] = seqread.readShort()  // Signed int16
-                    motionY[i] = seqread.readShort()
+                    // readShort() returns unsigned 16-bit, but motion vectors are signed int16
+                    let mx = seqread.readShort()
+                    let my = seqread.readShort()
+                    // Convert to signed: if > 32767, it's negative
+                    motionX[i] = (mx > 32767) ? (mx - 65536) : mx
+                    motionY[i] = (my > 32767) ? (my - 65536) : my
                }

                // Read compressed data size
@@ -1019,7 +1023,7 @@ try {
                // Check if GOP fits in VM memory
                const gopMemoryNeeded = gopSize * FRAME_SIZE
                if (gopMemoryNeeded > MAXMEM) {
-                    throw new Error(`GOP too large: ${gopSize} frames needs ${(gopMemoryNeeded / 1048576).toFixed(2)}MB, but VM has only ${(MAXMEM / 1048576).toFixed(1)}MB. Max GOP size: 11 frames for 8MB system.`)
+                    throw new Error(`GOP too large: ${gopSize} frames needs ${(gopMemoryNeeded / 1048576).toFixed(2)}MB, but VM has only ${(MAXMEM / 1048576).toFixed(1)}MB. Max GOP size: 8 frames for 8MB system.`)
                }

                // Allocate GOP buffers outside try block so finally can free them
--- a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
+++ b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
@@ -72,6 +72,7 @@ import kotlin.intArrayOf
 import kotlin.let
 import kotlin.longArrayOf
 import kotlin.math.*
+import kotlin.math.pow
 import kotlin.repeat
 import kotlin.text.format
 import kotlin.text.lowercase
@@ -4538,30 +4539,18 @@ class GraphicsJSR223Delegate(private val vm: VM) {
     *   - Frames 8-15: Level 2 (tHH - highest frequency)
     */
    private fun getTemporalSubbandLevel(frameIdx: Int, numFrames: Int, temporalLevels: Int): Int {
-        if (temporalLevels == 0) return 0
+        // Match encoder logic exactly (encoder_tav.c:1487-1501)
+        // After temporal DWT with 2 levels:
+        // Frames 0 until num_frames/(2^2) (upper bound exclusive) = tLL (temporal low-low, coarsest, level 0)
+        // Frames in first half but after tLL = tLH (level 1)
+        // Remaining frames = tH from first level (level 2, finest)

-        val framesPerSubband = numFrames shr temporalLevels  // numFrames / 2^temporalLevels
+        val framesPerLevel0 = numFrames shr temporalLevels  // e.g., 16 >> 2 = 4, or 8 >> 2 = 2

-        // Safety check: ensure we have enough frames for the temporal levels
-        // Minimum frames needed = 2^temporalLevels
-        if (framesPerSubband == 0) {
-            // Not enough frames for this many temporal levels - treat all as base level
-            return 0
-        }
-
-        // Determine which temporal subband this frame belongs to
-        val subbandIdx = frameIdx / framesPerSubband
-
-        // Map subband index to level (0 = tLL, 1+ = temporal high-pass levels)
-        return if (subbandIdx == 0) 0 else {
-            // Find highest bit position in subbandIdx to determine level
-            var level = 0
-            var idx = subbandIdx
-            while (idx > 1) {
-                idx = idx shr 1
-                level++
-            }
-            level + 1
+        return when {
+            frameIdx < framesPerLevel0 -> 0  // Coarsest temporal level (tLL)
+            frameIdx < (numFrames shr 1) -> 1  // First level high-pass (tLH)
+            else -> 2  // Finest level high-pass (tH from level 1)
        }
    }

@@ -4575,9 +4564,10 @@ class GraphicsJSR223Delegate(private val vm: VM) {
     *   - Level 2 (tHH):  1.0 × 2^1.6 = 3.03
     */
    private fun getTemporalQuantizerScale(temporalLevel: Int): Float {
-        val BETA = 0.8f
-        val TEMPORAL_BASE_SCALE = 1.0f
-        return TEMPORAL_BASE_SCALE * Math.pow(2.0, (BETA * temporalLevel).toDouble()).toFloat()
+        val BETA = 0.6f // Temporal scaling exponent (aggressive for temporal high-pass)
+        val KAPPA = 1.14f
+        val TEMPORAL_BASE_SCALE = 1.0f // Don't reduce tLL quantization (same as intra)
+        return TEMPORAL_BASE_SCALE * 2.0f.pow(BETA * temporalLevel.toFloat().pow(KAPPA))
    }

    // level is one-based index
@@ -6251,8 +6241,8 @@ class GraphicsJSR223Delegate(private val vm: VM) {
     * @param compressedDataPtr Pointer to compressed Zstd data
     * @param compressedSize Size of compressed data
     * @param gopSize Number of frames in GOP (1-16)
-     * @param motionVectorsX X motion vectors in quarter-pixel units
-     * @param motionVectorsY Y motion vectors in quarter-pixel units
+     * @param motionVectorsX X motion vectors in 1/16-pixel units
+     * @param motionVectorsY Y motion vectors in 1/16-pixel units
     * @param outputRGBAddrs Array of output RGB buffer addresses
     * @param width Frame width
     * @param height Frame height
@@ -6363,10 +6353,10 @@ class GraphicsJSR223Delegate(private val vm: VM) {
        tavApplyInverse3DDWT(gopCg, width, height, gopSize, spatialLevels, temporalLevels, spatialFilter)

        // Step 7: Apply inverse motion compensation (shift frames back)
-        // Note: Motion vectors are in quarter-pixel units
+        // Note: Motion vectors are in 1/16-pixel units, cumulative relative to frame 0
        for (t in 1 until gopSize) {  // Skip frame 0 (reference)
-            val dx = motionVectorsX[t] / 4  // Convert to pixel units
-            val dy = motionVectorsY[t] / 4
+            val dx = motionVectorsX[t] / 16  // Convert to pixel units
+            val dy = motionVectorsY[t] / 16

            if (dx != 0 || dy != 0) {
                applyInverseTranslation(gopY[t], width, height, dx, dy)
@@ -6486,36 +6476,30 @@ class GraphicsJSR223Delegate(private val vm: VM) {

    // Haar wavelet inverse 1D transform
    // The simplest wavelet: reverses averages and differences
+    // MUST match encoder's dwt_haar_inverse_1d exactly (encoder_tav.c:1265-1284)
    private fun tavApplyDWTHaarInverse1D(data: FloatArray, length: Int) {
        if (length < 2) return

        val temp = FloatArray(length)
        val half = (length + 1) / 2

-        // Split into low and high frequency components
-        for (i in 0 until half) {
-            temp[i] = data[i]              // Low-pass coefficients (averages)
-        }
-        for (i in 0 until length / 2) {
-            if (half + i < length) {
-                temp[half + i] = data[half + i] // High-pass coefficients (differences)
-            }
-        }
-
-        // Haar inverse: reconstruct original samples from averages and differences
+        // Inverse Haar transform: reconstruct from averages and differences
+        // Read directly from data array (already has low-pass then high-pass layout)
        for (i in 0 until half) {
            if (2 * i + 1 < length) {
-                val avg = temp[i]           // Average (low-pass)
-                val diff = if (half + i < length) temp[half + i] else 0.0f  // Difference (high-pass)
-
-                // Reconstruct original adjacent pair
-                data[2 * i] = avg + diff        // First sample: average + difference
-                data[2 * i + 1] = avg - diff    // Second sample: average - difference
+                // Reconstruct adjacent pairs from average and difference
+                temp[2 * i] = data[i] + data[half + i]      // average + difference
+                temp[2 * i + 1] = data[i] - data[half + i]  // average - difference
            } else {
-                // Handle odd length: last sample comes directly from low-pass
-                data[2 * i] = temp[i]
+                // Handle odd length: last sample comes from low-pass only
+                temp[2 * i] = data[i]
            }
        }
+
+        // Copy reconstructed data back
+        for (i in 0 until length) {
+            data[i] = temp[i]
+        }
    }

    // =============================================================================
--- a/video_encoder/encoder_tav.c
+++ b/video_encoder/encoder_tav.c
@@ -17,7 +17,7 @@
 #include <float.h>
 #include <fftw3.h>

-#define ENCODER_VENDOR_STRING "Encoder-TAV 20251016"
+#define ENCODER_VENDOR_STRING "Encoder-TAV 20251017"

 // TSVM Advanced Video (TAV) format constants
 #define TAV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x41\x56"  // "\x1FTSVM TAV"
@@ -106,6 +106,7 @@ static int needs_alpha_channel(int channel_layout) {
 #define DEFAULT_ZSTD_LEVEL 9
 #define GOP_SIZE 8
 #define TEMPORAL_DECOMP_LEVEL 2
+#define MOTION_THRESHOLD 64.0f // Flush if motion exceeds 64 pixels in any direction

 // Audio/subtitle constants (reused from TEV)
 #define MP2_DEFAULT_PACKET_SIZE 1152
@@ -310,8 +311,8 @@ typedef struct tav_encoder_s {
    float **gop_y_frames;        // [frame][pixel] - Y channel for each GOP frame
    float **gop_co_frames;       // [frame][pixel] - Co channel for each GOP frame
    float **gop_cg_frames;       // [frame][pixel] - Cg channel for each GOP frame
-    int16_t *gop_translation_x;  // [frame] - Translation X in quarter-pixel units
-    int16_t *gop_translation_y;  // [frame] - Translation Y in quarter-pixel units
+    int16_t *gop_translation_x;  // [frame] - Translation X in 1/16-pixel units
+    int16_t *gop_translation_y;  // [frame] - Translation Y in 1/16-pixel units
    int temporal_decomp_levels;  // Number of temporal DWT levels (default: 2)

    // Tile processing
@@ -1316,7 +1317,7 @@ static void dwt_53_inverse_1d(float *data, int length) {

 // FFT-based phase correlation for global motion estimation
 // Uses FFTW3 to compute cross-power spectrum and find translation peak
-// Returns quarter-pixel precision translation vectors
+// Returns 1/16-pixel precision translation vectors
 static void phase_correlate_fft(const uint8_t *frame1_rgb, const uint8_t *frame2_rgb,
                               int width, int height, int16_t *dx_qpel, int16_t *dy_qpel) {
    // Step 1: Convert RGB to grayscale
@@ -1404,7 +1405,7 @@ static void phase_correlate_fft(const uint8_t *frame1_rgb, const uint8_t *frame2
    if (dx > width / 2) dx -= width;
    if (dy > height / 2) dy -= height;

-    // Step 7: Quarter-pixel refinement using parabolic interpolation
+    // Step 7: Subpixel refinement using parabolic interpolation
    // Only refine if peak is not at boundary
    float subpixel_dx = 0.0f;
    float subpixel_dy = 0.0f;
@@ -1434,12 +1435,12 @@ static void phase_correlate_fft(const uint8_t *frame1_rgb, const uint8_t *frame2
        }
    }

-    // Step 8: Convert to quarter-pixel units
+    // Step 8: Convert to 1/16-pixel units (sixteenth-pixel precision)
    float final_dx = dx + subpixel_dx;
    float final_dy = dy + subpixel_dy;

-    *dx_qpel = (int16_t)roundf(final_dx * 4.0f);
-    *dy_qpel = (int16_t)roundf(final_dy * 4.0f);
+    *dx_qpel = (int16_t)roundf(final_dx * 16.0f);
+    *dy_qpel = (int16_t)roundf(final_dy * 16.0f);

    // Cleanup
    fftwf_destroy_plan(plan_fwd1);
@@ -1456,9 +1457,9 @@ static void phase_correlate_fft(const uint8_t *frame1_rgb, const uint8_t *frame2
 // Apply translation to frame (for frame alignment before temporal DWT)
 static void apply_translation(float *frame_data, int width, int height,
                             int16_t dx_qpel, int16_t dy_qpel, float *output) {
-    // Convert quarter-pixel to pixel (for now, just use integer translation)
-    int dx = dx_qpel / 4;
-    int dy = dy_qpel / 4;
+    // Convert 1/16-pixel to pixel (for now, just use integer translation)
+    int dx = dx_qpel / 16;
+    int dy = dy_qpel / 16;

    // Apply translation with boundary handling
    for (int y = 0; y < height; y++) {
@@ -1524,7 +1525,8 @@ static void quantise_3d_dwt_coefficients(tav_encoder_t *enc,
                                        int spatial_size,
                                        int base_quantiser,
                                        int is_chroma) {
-    const float BETA = 0.8f;  // Temporal scaling exponent (aggressive for temporal high-pass)
+    const float BETA = 0.6f;  // Temporal scaling exponent (aggressive for temporal high-pass)
+    const float KAPPA = 1.14f;
    const float TEMPORAL_BASE_SCALE = 1.0f;  // Don't reduce tLL quantization (same as intra)

    // Process each temporal subband independently (separable approach)
@@ -1541,7 +1543,7 @@ static void quantise_3d_dwt_coefficients(tav_encoder_t *enc,
        //   - Level 0 (tLL): 16 * 1.0 * 2^0 = 16 (same as intra-only)
        //   - Level 1 (tH):  16 * 1.0 * 2^2.0 = 64 (4× base, aggressive)
        //   - Level 2 (tHH): 16 * 1.0 * 2^4.0 = 256 → clamped to 255 (very aggressive)
-        float temporal_scale = TEMPORAL_BASE_SCALE * powf(2.0f, BETA * temporal_level);
+        float temporal_scale = TEMPORAL_BASE_SCALE * powf(2.0f, BETA * powf(temporal_level, KAPPA));
        float temporal_quantiser = base_quantiser * temporal_scale;

        // Convert to integer for quantization
@@ -1604,10 +1606,10 @@ static int gop_add_frame(tav_encoder_t *enc, const uint8_t *frame_rgb,
                           &enc->gop_translation_y[frame_idx]);

        if (enc->verbose && (frame_idx < 3 || frame_idx == enc->gop_capacity - 1)) {
-            printf("  GOP frame %d: translation = (%.2f, %.2f) pixels\n",
+            printf("  GOP frame %d: translation = (%.3f, %.3f) pixels\n",
                   frame_idx,
-                   enc->gop_translation_x[frame_idx] / 4.0f,
-                   enc->gop_translation_y[frame_idx] / 4.0f);
+                   enc->gop_translation_x[frame_idx] / 16.0f,
+                   enc->gop_translation_y[frame_idx] / 16.0f);
        }
    } else {
        // First frame has no translation
@@ -1644,13 +1646,12 @@ static int gop_should_flush_motion(tav_encoder_t *enc) {
    int16_t dx = enc->gop_translation_x[last_idx];
    int16_t dy = enc->gop_translation_y[last_idx];

-    // Convert quarter-pixel to pixels
-    float dx_pixels = fabsf(dx / 4.0f);
-    float dy_pixels = fabsf(dy / 4.0f);
+    // Convert 1/16-pixel to pixels
+    float dx_pixels = fabsf(dx / 16.0f);
+    float dy_pixels = fabsf(dy / 16.0f);

    // Flush if motion exceeds threshold (24 pixels in any direction)
    // This indicates likely scene change or very fast motion
-    const float MOTION_THRESHOLD = 24.0f;

    if (dx_pixels > MOTION_THRESHOLD || dy_pixels > MOTION_THRESHOLD) {
        if (enc->verbose) {
@@ -1690,8 +1691,36 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
        memcpy(gop_cg_coeffs[i], enc->gop_cg_frames[i], num_pixels * sizeof(float));
    }

-    // Step 0.5: Apply motion compensation to align frames before temporal DWT
-    // This uses the computed translation vectors to align each frame to the previous one
+    // Debug: Print original frame-to-frame motion vectors
+    if (enc->verbose && actual_gop_size >= 4) {
+        printf("Frame-to-frame motion vectors (before cumulative conversion):\n");
+        for (int i = 0; i < actual_gop_size; i++) {
+            printf("  Frame %d: 1/16px=(%d, %d) pixels=(%.3f, %.3f)\n",
+                   i, enc->gop_translation_x[i], enc->gop_translation_y[i],
+                   enc->gop_translation_x[i] / 16.0f, enc->gop_translation_y[i] / 16.0f);
+        }
+    }
+
+    // Step 0.5: Convert frame-to-frame motion vectors to cumulative (relative to frame 0)
+    // Phase correlation computes motion of frame[i] relative to frame[i-1]
+    // We need cumulative motion relative to frame 0 for proper alignment; frame 1 is already relative to frame 0, so accumulation starts at frame 2
+    for (int i = 2; i < actual_gop_size; i++) {
+        enc->gop_translation_x[i] += enc->gop_translation_x[i-1];
+        enc->gop_translation_y[i] += enc->gop_translation_y[i-1];
+    }
+
+    // Debug: Print cumulative motion vectors
+    if (enc->verbose && actual_gop_size >= 4) {
+        printf("Cumulative motion vectors (after conversion):\n");
+        for (int i = 0; i < actual_gop_size; i++) {
+            printf("  Frame %d: 1/16px=(%d, %d) pixels=(%.3f, %.3f)\n",
+                   i, enc->gop_translation_x[i], enc->gop_translation_y[i],
+                   enc->gop_translation_x[i] / 16.0f, enc->gop_translation_y[i] / 16.0f);
+        }
+    }
+
+    // Step 0.6: Apply motion compensation to align frames before temporal DWT
+    // This uses the cumulative translation vectors to align each frame to frame 0
    for (int i = 1; i < actual_gop_size; i++) {  // Skip frame 0 (reference frame)
        float *aligned_y = malloc(num_pixels * sizeof(float));
        float *aligned_co = malloc(num_pixels * sizeof(float));
@@ -1856,7 +1885,7 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
        fwrite(&gop_size_byte, 1, 1, output);
        total_bytes_written += 1;

-        // Write all motion vectors (quarter-pixel precision) for the entire GOP
+        // Write all motion vectors (1/16-pixel precision) for the entire GOP
        for (int t = 0; t < actual_gop_size; t++) {
            int16_t dx = enc->gop_translation_x[t];
            int16_t dy = enc->gop_translation_y[t];
@@ -1973,11 +2002,13 @@ static size_t gop_process_and_flush(tav_encoder_t *enc, FILE *output, int base_q

            long long total_diff = 0;
            int changed_pixels = 0;
-            int num_pixels = enc->width * enc->height;

-            // Sample every 4th pixel for performance
-            for (int p = 0; p < num_pixels; p += 4) {
-                int offset = p * 3;
+            // Sample every other pixel in both x and y (1 in 4 pixels) for performance (still gives good detection)
+            for (int y = 0; y < enc->height; y += 2) {
+                for (int x = 0; x < enc->width; x += 2) {
+                    int offset = (y * enc->width + x) * 3;
+
+                    // Calculate colour difference
                    int r_diff = abs(frame2[offset] - frame1[offset]);
                    int g_diff = abs(frame2[offset + 1] - frame1[offset + 1]);
                    int b_diff = abs(frame2[offset + 2] - frame1[offset + 2]);
@@ -1985,22 +2016,25 @@ static size_t gop_process_and_flush(tav_encoder_t *enc, FILE *output, int base_q
                    int pixel_diff = r_diff + g_diff + b_diff;
                    total_diff += pixel_diff;

+                    // Count significantly changed pixels (summed R+G+B difference above 90, i.e. 30 per channel on average)
                    if (pixel_diff > 90) {
                        changed_pixels++;
                    }
                }
+            }

            // Scene change thresholds (same as detect_scene_change)
-            int sampled_pixels = (num_pixels + 3) / 4;
+            int sampled_pixels = (enc->height / 2) * (enc->width / 2);
            double avg_diff = (double)total_diff / sampled_pixels;
-            double change_ratio = (double)changed_pixels / sampled_pixels;
+            double changed_ratio = (double)changed_pixels / sampled_pixels;
+            double threshold = 0.30;

            // Scene change detected if either threshold exceeded
-            if (avg_diff > 15.0 || change_ratio > 0.4) {
+            if (changed_ratio > threshold) {
                scene_change_frame = i;
                if (enc->verbose) {
                    printf("Scene change detected within GOP at frame %d (avg_diff=%.2f, change_ratio=%.2f)\n",
-                           frame_numbers[i], avg_diff, change_ratio);
+                           frame_numbers[i], avg_diff, changed_ratio);
                }
                break;
            }
--- a/video_encoder/tav_inspector.c
+++ b/video_encoder/tav_inspector.c
@@ -515,10 +515,10 @@ int main(int argc, char *argv[]) {

                    // Always show motion vectors for GOP packets with absolute frame numbers
                    if (gop_size > 0) {
-                        printf("\n    Motion vectors (quarter-pixel):");
+                        printf("\n    Motion vectors (1/16-pixel):");
                        for (int i = 0; i < gop_size; i++) {
-                            printf("\n      Frame %d (#%d): (%.2f, %.2f) px",
-                                   current_frame + i, i, motion_x[i] / 4.0, motion_y[i] / 4.0);
+                            printf("\n      Frame %d (#%d): (%.3f, %.3f) px",
+                                   current_frame + i, i, motion_x[i] / 16.0, motion_y[i] / 16.0);
                        }
                    }
                }