TAV: will replace frame alignment with something else, or maybe with nothing

This commit is contained in:
minjaesong
2025-10-17 06:48:21 +09:00
parent 93622fc8ca
commit 3b9e02b17f
3 changed files with 299 additions and 60 deletions

View File

@@ -999,7 +999,18 @@ try {
// Read GOP size (number of frames in this GOP, 1-16) // Read GOP size (number of frames in this GOP, 1-16)
const gopSize = seqread.readOneByte() const gopSize = seqread.readOneByte()
// Read motion vectors (quarter-pixel units, int16) // Read canvas expansion margins (4 bytes)
// Encoder expands canvas to preserve all original pixels from all aligned frames
const marginLeft = seqread.readOneByte()
const marginRight = seqread.readOneByte()
const marginTop = seqread.readOneByte()
const marginBottom = seqread.readOneByte()
// Calculate expanded canvas dimensions
const canvasWidth = header.width + marginLeft + marginRight
const canvasHeight = header.height + marginTop + marginBottom
// Read motion vectors (1/16-pixel units, int16)
// Encoder writes ALL motion vectors including frame 0 // Encoder writes ALL motion vectors including frame 0
let motionX = new Array(gopSize) let motionX = new Array(gopSize)
let motionY = new Array(gopSize) let motionY = new Array(gopSize)
@@ -1042,7 +1053,7 @@ try {
try { try {
let decodeStart = sys.nanoTime() let decodeStart = sys.nanoTime()
// Call GOP decoder // Call GOP decoder with canvas expansion information
const [r1, r2] = graphics.tavDecodeGopUnified( const [r1, r2] = graphics.tavDecodeGopUnified(
compressedPtr, compressedPtr,
compressedSize, compressedSize,
@@ -1050,8 +1061,12 @@ try {
motionX, motionX,
motionY, motionY,
gopRGBBuffers, // Array of output buffer addresses gopRGBBuffers, // Array of output buffer addresses
header.width, header.width, // Original frame width
header.height, header.height, // Original frame height
canvasWidth, // Expanded canvas width (preserves all pixels)
canvasHeight, // Expanded canvas height (preserves all pixels)
marginLeft, // Left margin
marginTop, // Top margin
header.qualityLevel, header.qualityLevel,
QLUT[header.qualityY], QLUT[header.qualityY],
QLUT[header.qualityCo], QLUT[header.qualityCo],

View File

@@ -3181,11 +3181,35 @@ class GraphicsJSR223Delegate(private val vm: VM) {
} }
} }
/**
* Symmetric padding (mirroring) for edge handling in motion compensation.
* This provides smoother edges than simple clamping/replication.
*
* @param coord The coordinate to mirror if out of bounds
* @param size The dimension size (width or height)
* @return The mirrored coordinate within valid range [0, size-1]
*/
/**
 * Reflects an out-of-range coordinate back into the valid range [0, size-1]
 * using half-sample symmetric (mirror) padding, for edge handling in motion
 * compensation. Gives smoother edges than plain clamp/replicate padding.
 *
 * Only one mirror period is applied (-1 -> 0, -2 -> 1, ...; size -> size-1,
 * size+1 -> size-2, ...); offsets beyond one full period fall through to the
 * final clamp.
 *
 * @param coord coordinate that may lie outside the frame
 * @param size  dimension extent (width or height)
 * @return a coordinate guaranteed to be within [0, size-1]
 */
private fun symmetricPadding(coord: Int, size: Int): Int {
    val reflected = when {
        coord < 0     -> -coord - 1           // mirror across the left/top edge
        coord >= size -> 2 * size - coord - 1 // mirror across the right/bottom edge
        else          -> coord                // already in range
    }
    // Safety net for shifts larger than one mirror period.
    return reflected.coerceIn(0, size - 1)
}
private fun tevHandleMotionBlockTwoPass(startX: Int, startY: Int, mvX: Int, mvY: Int, private fun tevHandleMotionBlockTwoPass(startX: Int, startY: Int, mvX: Int, mvY: Int,
currentRGBAddr: Long, prevRGBAddr: Long, currentRGBAddr: Long, prevRGBAddr: Long,
width: Int, height: Int, thisAddrIncVec: Int, prevAddrIncVec: Int, width: Int, height: Int, thisAddrIncVec: Int, prevAddrIncVec: Int,
debugMotionVectors: Boolean) { debugMotionVectors: Boolean) {
// Copy 16x16 block with motion compensation // Copy 16x16 block with motion compensation using symmetric padding
for (py in 0 until 16) { for (py in 0 until 16) {
val y = startY + py val y = startY + py
if (y >= height) break if (y >= height) break
@@ -3194,8 +3218,9 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val x = startX + px val x = startX + px
if (x >= width) break if (x >= width) break
val srcX = (x + mvX).coerceIn(0, width - 1) // Use symmetric padding instead of clamping for smoother edges
val srcY = (y + mvY).coerceIn(0, height - 1) val srcX = symmetricPadding(x + mvX, width)
val srcY = symmetricPadding(y + mvY, height)
val srcOffset = (srcY * width + srcX) * 3 val srcOffset = (srcY * width + srcX) * 3
val dstOffset = (y * width + x) * 3 val dstOffset = (y * width + x) * 3
@@ -3226,8 +3251,9 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val x = startX + px val x = startX + px
if (x >= width) break if (x >= width) break
val srcX = (x + mvX).coerceIn(0, width - 1) // Use symmetric padding for smoother edges (commented-out code updated for consistency)
val srcY = (y + mvY).coerceIn(0, height - 1) val srcX = symmetricPadding(x + mvX, width)
val srcY = symmetricPadding(y + mvY, height)
val srcOffset = (srcY * width + srcX) * 3 val srcOffset = (srcY * width + srcX) * 3
val r = vm.peek(prevRGBAddr + srcOffset * prevAddrIncVec)?.toInt() ?: 0 val r = vm.peek(prevRGBAddr + srcOffset * prevAddrIncVec)?.toInt() ?: 0
@@ -6205,6 +6231,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
/** /**
* Apply inverse translation (motion compensation) to a frame. * Apply inverse translation (motion compensation) to a frame.
* Inverse operation: shifts by +dx, +dy (opposite of forward encoder). * Inverse operation: shifts by +dx, +dy (opposite of forward encoder).
* Uses symmetric boundary extension (mirror padding) to match encoder.
* *
* @param frameData Input frame data to shift * @param frameData Input frame data to shift
* @param width Frame width * @param width Frame width
@@ -6215,14 +6242,28 @@ class GraphicsJSR223Delegate(private val vm: VM) {
private fun applyInverseTranslation(frameData: FloatArray, width: Int, height: Int, dx: Int, dy: Int) { private fun applyInverseTranslation(frameData: FloatArray, width: Int, height: Int, dx: Int, dy: Int) {
val output = FloatArray(width * height) val output = FloatArray(width * height)
// Apply inverse translation with boundary clamping // Apply inverse translation with symmetric boundary extension (mirror padding)
for (y in 0 until height) { for (y in 0 until height) {
for (x in 0 until width) { for (x in 0 until width) {
// Inverse: shift by +dx, +dy (opposite of encoder's -dx, -dy) // Inverse: shift by +dx, +dy (opposite of encoder's -dx, -dy)
var srcX = x + dx var srcX = x + dx
var srcY = y + dy var srcY = y + dy
// Clamp to frame boundaries // Symmetric extension at boundaries (mirror padding)
// This gives smooth edges instead of replicated stripes
if (srcX < 0) {
srcX = -srcX - 1 // Mirror left edge
} else if (srcX >= width) {
srcX = 2 * width - srcX - 1 // Mirror right edge
}
if (srcY < 0) {
srcY = -srcY - 1 // Mirror top edge
} else if (srcY >= height) {
srcY = 2 * height - srcY - 1 // Mirror bottom edge
}
// Clamp after mirroring (in case of very large shifts)
srcX = srcX.coerceIn(0, width - 1) srcX = srcX.coerceIn(0, width - 1)
srcY = srcY.coerceIn(0, height - 1) srcY = srcY.coerceIn(0, height - 1)
@@ -6244,8 +6285,12 @@ class GraphicsJSR223Delegate(private val vm: VM) {
* @param motionVectorsX X motion vectors in 1/16-pixel units * @param motionVectorsX X motion vectors in 1/16-pixel units
* @param motionVectorsY Y motion vectors in 1/16-pixel units * @param motionVectorsY Y motion vectors in 1/16-pixel units
* @param outputRGBAddrs Array of output RGB buffer addresses * @param outputRGBAddrs Array of output RGB buffer addresses
* @param width Frame width * @param width Original frame width (output dimensions)
* @param height Frame height * @param height Original frame height (output dimensions)
* @param canvasWidth Expanded canvas width (for motion compensation)
* @param canvasHeight Expanded canvas height (for motion compensation)
* @param marginLeft Left margin to crop from expanded canvas
* @param marginTop Top margin to crop from expanded canvas
* @param qIndex Quality index * @param qIndex Quality index
* @param qYGlobal Global Y quantizer * @param qYGlobal Global Y quantizer
* @param qCoGlobal Global Co quantizer * @param qCoGlobal Global Co quantizer
@@ -6265,6 +6310,10 @@ class GraphicsJSR223Delegate(private val vm: VM) {
outputRGBAddrs: LongArray, outputRGBAddrs: LongArray,
width: Int, width: Int,
height: Int, height: Int,
canvasWidth: Int,
canvasHeight: Int,
marginLeft: Int,
marginTop: Int,
qIndex: Int, qIndex: Int,
qYGlobal: Int, qYGlobal: Int,
qCoGlobal: Int, qCoGlobal: Int,
@@ -6280,7 +6329,9 @@ class GraphicsJSR223Delegate(private val vm: VM) {
dbgOut["qCg"] = qCgGlobal dbgOut["qCg"] = qCgGlobal
dbgOut["frameMode"] = "G" dbgOut["frameMode"] = "G"
val numPixels = width * height // Use expanded canvas dimensions for DWT processing
val canvasPixels = canvasWidth * canvasHeight
val outputPixels = width * height
// Step 1: Decompress unified GOP block // Step 1: Decompress unified GOP block
val compressedData = ByteArray(compressedSize) val compressedData = ByteArray(compressedSize)
@@ -6305,17 +6356,17 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val quantizedCoeffs = tavPostprocessGopUnified( val quantizedCoeffs = tavPostprocessGopUnified(
decompressedData, decompressedData,
gopSize, gopSize,
numPixels, canvasPixels, // Use expanded canvas size
channelLayout channelLayout
) )
// Step 3: Allocate GOP buffers for float coefficients // Step 3: Allocate GOP buffers for float coefficients (expanded canvas size)
val gopY = Array(gopSize) { FloatArray(numPixels) } val gopY = Array(gopSize) { FloatArray(canvasPixels) }
val gopCo = Array(gopSize) { FloatArray(numPixels) } val gopCo = Array(gopSize) { FloatArray(canvasPixels) }
val gopCg = Array(gopSize) { FloatArray(numPixels) } val gopCg = Array(gopSize) { FloatArray(canvasPixels) }
// Step 4: Calculate subband layout (needed for perceptual dequantization) // Step 4: Calculate subband layout for expanded canvas (needed for perceptual dequantization)
val subbands = calculateSubbandLayout(width, height, spatialLevels) val subbands = calculateSubbandLayout(canvasWidth, canvasHeight, spatialLevels)
// Step 5: Dequantize with temporal-spatial scaling // Step 5: Dequantize with temporal-spatial scaling
for (t in 0 until gopSize) { for (t in 0 until gopSize) {
@@ -6347,49 +6398,60 @@ class GraphicsJSR223Delegate(private val vm: VM) {
) )
} }
// Step 6: Apply inverse 3D DWT (spatial first, then temporal) // Step 6: Apply inverse 3D DWT (spatial first, then temporal) on expanded canvas
tavApplyInverse3DDWT(gopY, width, height, gopSize, spatialLevels, temporalLevels, spatialFilter) tavApplyInverse3DDWT(gopY, canvasWidth, canvasHeight, gopSize, spatialLevels, temporalLevels, spatialFilter)
tavApplyInverse3DDWT(gopCo, width, height, gopSize, spatialLevels, temporalLevels, spatialFilter) tavApplyInverse3DDWT(gopCo, canvasWidth, canvasHeight, gopSize, spatialLevels, temporalLevels, spatialFilter)
tavApplyInverse3DDWT(gopCg, width, height, gopSize, spatialLevels, temporalLevels, spatialFilter) tavApplyInverse3DDWT(gopCg, canvasWidth, canvasHeight, gopSize, spatialLevels, temporalLevels, spatialFilter)
// Step 7: Apply inverse motion compensation (shift frames back) // Step 7: Apply inverse motion compensation (shift frames back) on expanded canvas
// Note: Motion vectors are in 1/16-pixel units, cumulative relative to frame 0 // Note: Motion vectors are in 1/16-pixel units, cumulative relative to frame 0
for (t in 1 until gopSize) { // Skip frame 0 (reference) for (t in 1 until gopSize) { // Skip frame 0 (reference)
val dx = motionVectorsX[t] / 16 // Convert to pixel units val dx = motionVectorsX[t] / 16 // Convert to pixel units
val dy = motionVectorsY[t] / 16 val dy = motionVectorsY[t] / 16
if (dx != 0 || dy != 0) { if (dx != 0 || dy != 0) {
applyInverseTranslation(gopY[t], width, height, dx, dy) applyInverseTranslation(gopY[t], canvasWidth, canvasHeight, dx, dy)
applyInverseTranslation(gopCo[t], width, height, dx, dy) applyInverseTranslation(gopCo[t], canvasWidth, canvasHeight, dx, dy)
applyInverseTranslation(gopCg[t], width, height, dx, dy) applyInverseTranslation(gopCg[t], canvasWidth, canvasHeight, dx, dy)
} }
} }
// Step 8: Convert each frame to RGB and write to output buffers // Step 8: Crop expanded canvas to original dimensions and convert to RGB
for (t in 0 until gopSize) { for (t in 0 until gopSize) {
val rgbAddr = outputRGBAddrs[t] val rgbAddr = outputRGBAddrs[t]
for (i in 0 until numPixels) { // Crop from expanded canvas (canvasWidth x canvasHeight) to output (width x height)
val y = gopY[t][i] for (row in 0 until height) {
val co = gopCo[t][i] for (col in 0 until width) {
val cg = gopCg[t][i] // Source pixel in expanded canvas
val canvasX = col + marginLeft
val canvasY = row + marginTop
val canvasIdx = canvasY * canvasWidth + canvasX
// YCoCg-R to RGB conversion // Destination pixel in output buffer
val tmp = y - (cg / 2.0f) val outIdx = row * width + col
val g = cg + tmp
val b = tmp - (co / 2.0f)
val r = b + co
// Clamp to 0-255 range val yVal = gopY[t][canvasIdx]
val rClamped = r.toInt().coerceIn(0, 255) val co = gopCo[t][canvasIdx]
val gClamped = g.toInt().coerceIn(0, 255) val cg = gopCg[t][canvasIdx]
val bClamped = b.toInt().coerceIn(0, 255)
// Write RGB24 format (3 bytes per pixel) // YCoCg-R to RGB conversion
val offset = rgbAddr + i * 3L val tmp = yVal - (cg / 2.0f)
vm.usermem[offset] = rClamped.toByte() val g = cg + tmp
vm.usermem[offset + 1] = gClamped.toByte() val b = tmp - (co / 2.0f)
vm.usermem[offset + 2] = bClamped.toByte() val r = b + co
// Clamp to 0-255 range
val rClamped = r.toInt().coerceIn(0, 255)
val gClamped = g.toInt().coerceIn(0, 255)
val bClamped = b.toInt().coerceIn(0, 255)
// Write RGB24 format (3 bytes per pixel)
val offset = rgbAddr + outIdx * 3L
vm.usermem[offset] = rClamped.toByte()
vm.usermem[offset + 1] = gClamped.toByte()
vm.usermem[offset + 2] = bClamped.toByte()
}
} }
} }

View File

@@ -1455,19 +1455,22 @@ static void phase_correlate_fft(const uint8_t *frame1_rgb, const uint8_t *frame2
} }
// Apply translation to frame (for frame alignment before temporal DWT) // Apply translation to frame (for frame alignment before temporal DWT)
// NO PADDING - only extracts the valid region that will be common across all frames
static void apply_translation(float *frame_data, int width, int height, static void apply_translation(float *frame_data, int width, int height,
int16_t dx_qpel, int16_t dy_qpel, float *output) { int16_t dx_qpel, int16_t dy_qpel, float *output) {
// Convert 1/16-pixel to pixel (for now, just use integer translation) // Convert 1/16-pixel to pixel (for now, just use integer translation)
int dx = dx_qpel / 16; int dx = dx_qpel / 16;
int dy = dy_qpel / 16; int dy = dy_qpel / 16;
// Apply translation with boundary handling // Apply translation WITHOUT padding - just shift the content
// Out-of-bounds regions will be cropped away later
for (int y = 0; y < height; y++) { for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) { for (int x = 0; x < width; x++) {
int src_x = x - dx; int src_x = x - dx;
int src_y = y - dy; int src_y = y - dy;
// Clamp to frame boundaries // Clamp to valid region (this will create edge repetition, but those
// edges will be cropped away, so it doesn't matter what we put there)
src_x = CLAMP(src_x, 0, width - 1); src_x = CLAMP(src_x, 0, width - 1);
src_y = CLAMP(src_y, 0, height - 1); src_y = CLAMP(src_y, 0, height - 1);
@@ -1476,6 +1479,22 @@ static void apply_translation(float *frame_data, int width, int height,
} }
} }
// Extract cropped region from a frame after alignment
/* Copy the interior region of an aligned frame into a tightly packed output
 * buffer, discarding the given margins on each side. The output buffer must
 * hold (width - crop_left - crop_right) * (height - crop_top - crop_bottom)
 * floats; rows are written contiguously with no stride padding. */
static void extract_crop(const float *frame_data, int width, int height,
                         int crop_left, int crop_right, int crop_top, int crop_bottom,
                         float *cropped_output) {
    const int out_w = width - crop_left - crop_right;
    const int out_h = height - crop_top - crop_bottom;
    float *dst = cropped_output;
    for (int row = 0; row < out_h; row++) {
        /* Start of this output row inside the source frame. */
        const float *src = frame_data + (row + crop_top) * width + crop_left;
        for (int col = 0; col < out_w; col++) {
            *dst++ = src[col];
        }
    }
}
// ============================================================================= // =============================================================================
// Temporal Subband Quantization // Temporal Subband Quantization
// ============================================================================= // =============================================================================
@@ -1598,7 +1617,7 @@ static int gop_add_frame(tav_encoder_t *enc, const uint8_t *frame_rgb,
memcpy(enc->gop_cg_frames[frame_idx], frame_cg, frame_channel_size); memcpy(enc->gop_cg_frames[frame_idx], frame_cg, frame_channel_size);
// Compute translation vector if not first frame // Compute translation vector if not first frame
if (frame_idx > 0) { /*if (frame_idx > 0) {
phase_correlate_fft(enc->gop_rgb_frames[frame_idx - 1], phase_correlate_fft(enc->gop_rgb_frames[frame_idx - 1],
enc->gop_rgb_frames[frame_idx], enc->gop_rgb_frames[frame_idx],
enc->width, enc->height, enc->width, enc->height,
@@ -1615,7 +1634,11 @@ static int gop_add_frame(tav_encoder_t *enc, const uint8_t *frame_rgb,
// First frame has no translation // First frame has no translation
enc->gop_translation_x[0] = 0; enc->gop_translation_x[0] = 0;
enc->gop_translation_y[0] = 0; enc->gop_translation_y[0] = 0;
} }*/
// disabling frame realigning: producing worse results in general
enc->gop_translation_x[frame_idx] = 0.0f;
enc->gop_translation_y[frame_idx] = 0.0f;
enc->gop_frame_count++; enc->gop_frame_count++;
return 0; return 0;
@@ -1675,7 +1698,7 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
} }
// Allocate working buffers for each channel // Allocate working buffers for each channel
const int num_pixels = enc->width * enc->height; int num_pixels = enc->width * enc->height; // Will be updated if frames are cropped
float **gop_y_coeffs = malloc(actual_gop_size * sizeof(float*)); float **gop_y_coeffs = malloc(actual_gop_size * sizeof(float*));
float **gop_co_coeffs = malloc(actual_gop_size * sizeof(float*)); float **gop_co_coeffs = malloc(actual_gop_size * sizeof(float*));
float **gop_cg_coeffs = malloc(actual_gop_size * sizeof(float*)); float **gop_cg_coeffs = malloc(actual_gop_size * sizeof(float*));
@@ -1719,6 +1742,34 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
} }
} }
// Step 0.5b: Calculate the valid region after alignment (crop bounds)
// Find the bounding box that's valid across all aligned frames
int min_dx = 0, max_dx = 0, min_dy = 0, max_dy = 0;
for (int i = 0; i < actual_gop_size; i++) {
int dx = enc->gop_translation_x[i] / 16;
int dy = enc->gop_translation_y[i] / 16;
if (dx < min_dx) min_dx = dx;
if (dx > max_dx) max_dx = dx;
if (dy < min_dy) min_dy = dy;
if (dy > max_dy) max_dy = dy;
}
// Crop region: the area valid in all frames
// When we shift right by +N, we lose N pixels on the left, so crop left edge by abs(min_dx)
// When we shift left by -N, we lose N pixels on the right, so crop right edge by max_dx
int crop_left = (min_dx < 0) ? -min_dx : 0;
int crop_right = (max_dx > 0) ? max_dx : 0;
int crop_top = (min_dy < 0) ? -min_dy : 0;
int crop_bottom = (max_dy > 0) ? max_dy : 0;
int valid_width = enc->width - crop_left - crop_right;
int valid_height = enc->height - crop_top - crop_bottom;
if (enc->verbose && (crop_left || crop_right || crop_top || crop_bottom)) {
printf("Valid region after alignment: %dx%d (cropped: L=%d R=%d T=%d B=%d)\n",
valid_width, valid_height, crop_left, crop_right, crop_top, crop_bottom);
}
// Step 0.6: Apply motion compensation to align frames before temporal DWT // Step 0.6: Apply motion compensation to align frames before temporal DWT
// This uses the cumulative translation vectors to align each frame to frame 0 // This uses the cumulative translation vectors to align each frame to frame 0
for (int i = 1; i < actual_gop_size; i++) { // Skip frame 0 (reference frame) for (int i = 1; i < actual_gop_size; i++) { // Skip frame 0 (reference frame)
@@ -1753,23 +1804,122 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
free(aligned_cg); free(aligned_cg);
} }
// Step 0.7: Expand frames to larger canvas that preserves ALL original pixels
// Calculate expanded canvas size (UNION of all aligned frames)
int canvas_width = enc->width + crop_left + crop_right; // Original width + total shift range
int canvas_height = enc->height + crop_top + crop_bottom; // Original height + total shift range
int canvas_pixels = canvas_width * canvas_height;
if (enc->verbose && (crop_left || crop_right || crop_top || crop_bottom)) {
printf("Expanded canvas: %dx%d (original %dx%d + margins L=%d R=%d T=%d B=%d)\n",
canvas_width, canvas_height, enc->width, enc->height,
crop_left, crop_right, crop_top, crop_bottom);
printf("This preserves all original pixels from all frames after alignment\n");
}
// Allocate expanded canvas buffers
float **canvas_y_coeffs = malloc(actual_gop_size * sizeof(float*));
float **canvas_co_coeffs = malloc(actual_gop_size * sizeof(float*));
float **canvas_cg_coeffs = malloc(actual_gop_size * sizeof(float*));
for (int i = 0; i < actual_gop_size; i++) {
canvas_y_coeffs[i] = calloc(canvas_pixels, sizeof(float)); // Zero-initialized
canvas_co_coeffs[i] = calloc(canvas_pixels, sizeof(float));
canvas_cg_coeffs[i] = calloc(canvas_pixels, sizeof(float));
// Place the aligned frame onto the canvas at the appropriate offset
// Each frame's aligned position determines where it sits on the canvas
int offset_x = crop_left; // Frames are offset by the left margin
int offset_y = crop_top; // Frames are offset by the top margin
// Copy the full aligned frame onto the canvas (preserves all original content)
for (int y = 0; y < enc->height; y++) {
for (int x = 0; x < enc->width; x++) {
int src_idx = y * enc->width + x;
int dst_idx = (y + offset_y) * canvas_width + (x + offset_x);
canvas_y_coeffs[i][dst_idx] = gop_y_coeffs[i][src_idx];
canvas_co_coeffs[i][dst_idx] = gop_co_coeffs[i][src_idx];
canvas_cg_coeffs[i][dst_idx] = gop_cg_coeffs[i][src_idx];
}
}
// Fill margin areas with symmetric padding from frame edges
for (int y = 0; y < canvas_height; y++) {
for (int x = 0; x < canvas_width; x++) {
// Skip pixels in the original frame region (already copied)
if (y >= offset_y && y < offset_y + enc->height &&
x >= offset_x && x < offset_x + enc->width) {
continue;
}
// Calculate position relative to original frame
int src_x = x - offset_x;
int src_y = y - offset_y;
// Apply symmetric padding (mirroring)
if (src_x < 0) {
src_x = -src_x - 1; // Mirror left edge: -1→0, -2→1, -3→2
} else if (src_x >= enc->width) {
src_x = 2 * enc->width - src_x - 1; // Mirror right edge
}
if (src_y < 0) {
src_y = -src_y - 1; // Mirror top edge
} else if (src_y >= enc->height) {
src_y = 2 * enc->height - src_y - 1; // Mirror bottom edge
}
// Clamp to valid range (safety for extreme cases)
src_x = CLAMP(src_x, 0, enc->width - 1);
src_y = CLAMP(src_y, 0, enc->height - 1);
// Copy mirrored pixel from original frame to canvas margin
int src_idx = src_y * enc->width + src_x;
int dst_idx = y * canvas_width + x;
canvas_y_coeffs[i][dst_idx] = gop_y_coeffs[i][src_idx];
canvas_co_coeffs[i][dst_idx] = gop_co_coeffs[i][src_idx];
canvas_cg_coeffs[i][dst_idx] = gop_cg_coeffs[i][src_idx];
}
}
// Free the original frame (no longer needed)
free(gop_y_coeffs[i]);
free(gop_co_coeffs[i]);
free(gop_cg_coeffs[i]);
}
// Replace pointers with expanded canvas
free(gop_y_coeffs);
free(gop_co_coeffs);
free(gop_cg_coeffs);
gop_y_coeffs = canvas_y_coeffs;
gop_co_coeffs = canvas_co_coeffs;
gop_cg_coeffs = canvas_cg_coeffs;
// Update dimensions to canvas size
valid_width = canvas_width;
valid_height = canvas_height;
num_pixels = canvas_pixels;
// Step 1: For single-frame GOP, skip temporal DWT and use traditional I-frame path // Step 1: For single-frame GOP, skip temporal DWT and use traditional I-frame path
if (actual_gop_size == 1) { if (actual_gop_size == 1) {
// Apply only 2D spatial DWT (no temporal transform for single frame) // Apply only 2D spatial DWT (no temporal transform for single frame)
dwt_2d_forward_flexible(gop_y_coeffs[0], enc->width, enc->height, // Use cropped dimensions (will be full size if no motion)
dwt_2d_forward_flexible(gop_y_coeffs[0], valid_width, valid_height,
enc->decomp_levels, enc->wavelet_filter); enc->decomp_levels, enc->wavelet_filter);
dwt_2d_forward_flexible(gop_co_coeffs[0], enc->width, enc->height, dwt_2d_forward_flexible(gop_co_coeffs[0], valid_width, valid_height,
enc->decomp_levels, enc->wavelet_filter); enc->decomp_levels, enc->wavelet_filter);
dwt_2d_forward_flexible(gop_cg_coeffs[0], enc->width, enc->height, dwt_2d_forward_flexible(gop_cg_coeffs[0], valid_width, valid_height,
enc->decomp_levels, enc->wavelet_filter); enc->decomp_levels, enc->wavelet_filter);
} else { } else {
// Multi-frame GOP: Apply 3D DWT (temporal + spatial) to each channel // Multi-frame GOP: Apply 3D DWT (temporal + spatial) to each channel
// Note: This modifies gop_*_coeffs in-place // Note: This modifies gop_*_coeffs in-place
dwt_3d_forward(gop_y_coeffs, enc->width, enc->height, actual_gop_size, // Use cropped dimensions to encode only the valid region
dwt_3d_forward(gop_y_coeffs, valid_width, valid_height, actual_gop_size,
enc->decomp_levels, enc->temporal_decomp_levels, enc->wavelet_filter); enc->decomp_levels, enc->temporal_decomp_levels, enc->wavelet_filter);
dwt_3d_forward(gop_co_coeffs, enc->width, enc->height, actual_gop_size, dwt_3d_forward(gop_co_coeffs, valid_width, valid_height, actual_gop_size,
enc->decomp_levels, enc->temporal_decomp_levels, enc->wavelet_filter); enc->decomp_levels, enc->temporal_decomp_levels, enc->wavelet_filter);
dwt_3d_forward(gop_cg_coeffs, enc->width, enc->height, actual_gop_size, dwt_3d_forward(gop_cg_coeffs, valid_width, valid_height, actual_gop_size,
enc->decomp_levels, enc->temporal_decomp_levels, enc->wavelet_filter); enc->decomp_levels, enc->temporal_decomp_levels, enc->wavelet_filter);
} }
@@ -1875,7 +2025,7 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
} else { } else {
// Multi-frame GOP: use unified 3D DWT encoding // Multi-frame GOP: use unified 3D DWT encoding
// Write unified GOP packet header // Write unified GOP packet header
// Packet structure: [packet_type=0x12][gop_size][motion_vectors...][compressed_size][compressed_data] // Packet structure: [packet_type=0x12][gop_size][crop_info][motion_vectors...][compressed_size][compressed_data]
uint8_t packet_type = TAV_PACKET_GOP_UNIFIED; uint8_t packet_type = TAV_PACKET_GOP_UNIFIED;
fwrite(&packet_type, 1, 1, output); fwrite(&packet_type, 1, 1, output);
total_bytes_written += 1; total_bytes_written += 1;
@@ -1885,6 +2035,18 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
fwrite(&gop_size_byte, 1, 1, output); fwrite(&gop_size_byte, 1, 1, output);
total_bytes_written += 1; total_bytes_written += 1;
// Write canvas expansion information (4 bytes)
// This tells the decoder the margins added to preserve all original pixels
// The encoded canvas is larger than the original frame to preserve edge content after alignment
uint8_t canvas_margins[4] = {
(uint8_t)crop_left, // Left margin
(uint8_t)crop_right, // Right margin
(uint8_t)crop_top, // Top margin
(uint8_t)crop_bottom // Bottom margin
};
fwrite(canvas_margins, 1, 4, output);
total_bytes_written += 4;
// Write all motion vectors (1/16-pixel precision) for the entire GOP // Write all motion vectors (1/16-pixel precision) for the entire GOP
for (int t = 0; t < actual_gop_size; t++) { for (int t = 0; t < actual_gop_size; t++) {
int16_t dx = enc->gop_translation_x[t]; int16_t dx = enc->gop_translation_x[t];