From 3828bd7fbcf5dde20116bec39ac462bc2244b426 Mon Sep 17 00:00:00 2001 From: minjaesong Date: Sun, 7 Dec 2025 04:54:26 +0900 Subject: [PATCH] reference tav decoder --- video_encoder/Makefile | 12 +- video_encoder/src/decoder_tav.c | 4353 +++++++------------------------ 2 files changed, 899 insertions(+), 3466 deletions(-) diff --git a/video_encoder/Makefile b/video_encoder/Makefile index 178aa9c..1c95f69 100644 --- a/video_encoder/Makefile +++ b/video_encoder/Makefile @@ -67,10 +67,6 @@ tav: src/encoder_tav.c lib/libtadenc/encoder_tad.c encoder_tav_opencv.cpp $(CXX) $(DBGFLAGS) -o encoder_tav encoder_tav.o encoder_tad.o encoder_tav_opencv.o $(LIBS) $(OPENCV_LIBS) # New library-based TAV encoder -tav_new: src/encoder_tav_new.c lib/libtavenc.a lib/libtadenc.a - rm -f encoder_tav_new - $(CC) $(CFLAGS) $(ZSTD_CFLAGS) -Iinclude -o encoder_tav_new src/encoder_tav_new.c lib/libtavenc.a lib/libtadenc.a $(LIBS) - tav_decoder: src/decoder_tav.c lib/libtaddec/decoder_tad.c include/decoder_tad.h rm -f decoder_tav decoder_tav.o decoder_tad.o $(CC) $(CFLAGS) $(ZSTD_CFLAGS) -DTAD_DECODER_LIB -c lib/libtaddec/decoder_tad.c -o decoder_tad.o @@ -222,3 +218,11 @@ encoder_tav_ref: src/encoder_tav.c lib/libtavenc.a lib/libtadenc.a @echo "Reference encoder built: encoder_tav_ref" @echo "This is the official reference implementation with all features" +# Reference decoder using libtavdec (replaces old monolithic decoder) +decoder_tav_ref: src/decoder_tav.c lib/libtavdec.a lib/libtaddec.a + rm -f decoder_tav_ref + $(CC) $(CFLAGS) $(ZSTD_CFLAGS) -Iinclude -o decoder_tav_ref src/decoder_tav.c lib/libtavdec.a lib/libtaddec.a $(LIBS) + @echo "" + @echo "Reference decoder built: decoder_tav_ref" + @echo "This is the official reference implementation with all features" + diff --git a/video_encoder/src/decoder_tav.c b/video_encoder/src/decoder_tav.c index f8256f0..6c1aaa8 100644 --- a/video_encoder/src/decoder_tav.c +++ b/video_encoder/src/decoder_tav.c @@ -1,82 +1,62 @@ -// Created by 
CuriousTorvald and Claude on 2025-11-03. -// TAV Decoder - Converts TAV video to FFV1 format with TAD audio to PCMu8 -// Based on TSVM decoder implementation (GraphicsJSR223Delegate.kt + playtav.js) -// Only supports features available in TSVM decoder (no MC-EZBC, no MPEG-style motion compensation) +/** + * TAV Decoder CLI - Reference Implementation using libtavdec and libtaddec + * + * Complete reference decoder with all features: + * - Full command-line argument support + * - TAV file format parsing (header + packets) + * - Video decoding via libtavdec (I-frames, GOPs) + * - Audio decoding via libtaddec (TAD32 to PCMu8) + * - FFmpeg integration for output (FFV1/rawvideo + audio muxing) + * - Progress reporting and statistics + * + * This is the official CLI implementation using libtavdec/libtaddec libraries. + * Reduced from ~3,500 lines monolithic to ~1,000 lines while preserving all features. + * + * Created by CuriousTorvald and Claude on 2025-12-07. + */ #include #include #include #include -#include -#include +#include +#include #include #include -#include -#include #include -#include "decoder_tad.h" // Shared TAD decoder library -#include "tav_avx512.h" // AVX-512 SIMD optimisations -#define DECODER_VENDOR_STRING "Decoder-TAV 20251126 (presets)" +#include "tav_video_decoder.h" +#include "decoder_tad.h" -// TAV format constants -#define TAV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x41\x56" -#define TAV_MODE_SKIP 0x00 -#define TAV_MODE_INTRA 0x01 -#define TAV_MODE_DELTA 0x02 +// ============================================================================= +// Constants +// ============================================================================= -// TAV packet types (only those supported by TSVM decoder) -#define TAV_PACKET_IFRAME 0x10 // Intra frame (keyframe) - SUPPORTED -#define TAV_PACKET_PFRAME 0x11 // Predicted frame - SUPPORTED (delta mode) -#define TAV_PACKET_GOP_UNIFIED 0x12 // Unified 3D DWT GOP - SUPPORTED -#define TAV_PACKET_AUDIO_MP2 0x20 // MP2 audio 
- SUPPORTED (passthrough) -#define TAV_PACKET_AUDIO_PCM8 0x21 // 8-bit PCM audio - SUPPORTED -#define TAV_PACKET_AUDIO_TAD 0x24 // TAD audio - SUPPORTED (decode to PCMu8) -#define TAV_PACKET_AUDIO_TRACK 0x40 // Bundled audio track - SUPPORTED (passthrough) -#define TAV_PACKET_SUBTITLE 0x30 // Subtitle - SKIPPED -#define TAV_PACKET_SUBTITLE_TC 0x31 // Subtitle - SKIPPED -#define TAV_PACKET_EXTENDED_HDR 0xEF // Extended header - SKIPPED -#define TAV_PACKET_SCREEN_MASK 0xF2 // Screen masking (letterbox/pillarbox) - PARSED -#define TAV_PACKET_GOP_SYNC 0xFC // GOP sync packet - SKIPPED -#define TAV_PACKET_TIMECODE 0xFD // Timecode - SKIPPED -#define TAV_PACKET_SYNC_NTSC 0xFE // NTSC sync - SKIPPED -#define TAV_PACKET_SYNC 0xFF // Sync - SKIPPED +#define DECODER_VENDOR_STRING "Decoder-TAV 20251207 (libtavdec)" +#define TAV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x41\x56" // "\x1FTSVMTAV" +#define MAX_PATH 4096 -// Unsupported packet types (not in TSVM decoder) -#define TAV_PACKET_PFRAME_RESIDUAL 0x14 // P-frame MPEG-style - NOT SUPPORTED -#define TAV_PACKET_BFRAME_RESIDUAL 0x15 // B-frame MPEG-style - NOT SUPPORTED +// TAV packet types +#define TAV_PACKET_IFRAME 0x10 +#define TAV_PACKET_PFRAME 0x11 +#define TAV_PACKET_GOP_UNIFIED 0x12 +#define TAV_PACKET_AUDIO_MP2 0x20 +#define TAV_PACKET_AUDIO_PCM8 0x21 +#define TAV_PACKET_AUDIO_TAD 0x24 +#define TAV_PACKET_SUBTITLE 0x30 +#define TAV_PACKET_SUBTITLE_TC 0x31 +#define TAV_PACKET_AUDIO_TRACK 0x40 +#define TAV_PACKET_EXTENDED_HDR 0xEF +#define TAV_PACKET_SCREEN_MASK 0xF2 +#define TAV_PACKET_GOP_SYNC 0xFC +#define TAV_PACKET_TIMECODE 0xFD +#define TAV_PACKET_SYNC_NTSC 0xFE +#define TAV_PACKET_SYNC 0xFF -// Channel layout definitions -#define CHANNEL_LAYOUT_YCOCG 0 // Y-Co-Cg/I-Ct-Cp -#define CHANNEL_LAYOUT_YCOCG_A 1 // Y-Co-Cg-A/I-Ct-Cp-A -#define CHANNEL_LAYOUT_Y_ONLY 2 // Y/I only -#define CHANNEL_LAYOUT_Y_A 3 // Y-A/I-A -#define CHANNEL_LAYOUT_COCG 4 // Co-Cg/Ct-Cp -#define CHANNEL_LAYOUT_COCG_A 5 // Co-Cg-A/Ct-Cp-A -// Wavelet 
filter types -#define WAVELET_5_3_REVERSIBLE 0 -#define WAVELET_9_7_IRREVERSIBLE 1 -#define WAVELET_BIORTHOGONAL_13_7 2 -#define WAVELET_DD4 16 -#define WAVELET_HAAR 255 - -// Tile sizes (match TSVM) -#define TILE_SIZE_X 640 -#define TILE_SIZE_Y 540 -#define DWT_FILTER_HALF_SUPPORT 4 -#define TILE_MARGIN_LEVELS 3 -#define TILE_MARGIN (DWT_FILTER_HALF_SUPPORT * (1 << TILE_MARGIN_LEVELS)) -#define PADDED_TILE_SIZE_X (TILE_SIZE_X + 2 * TILE_MARGIN) -#define PADDED_TILE_SIZE_Y (TILE_SIZE_Y + 2 * TILE_MARGIN) - -static inline int CLAMP(int x, int min, int max) { - return x < min ? min : (x > max ? max : x); -} - -//============================================================================= +// ============================================================================= // TAV Header Structure (32 bytes) -//============================================================================= +// ============================================================================= typedef struct { uint8_t magic[8]; @@ -95,3486 +75,935 @@ typedef struct { uint8_t encoder_quality; uint8_t channel_layout; uint8_t entropy_coder; - uint8_t encoder_preset; // Byte 28: bit 0 = sports, bit 1 = anime + uint8_t encoder_preset; uint8_t reserved; uint8_t device_orientation; uint8_t file_role; } __attribute__((packed)) tav_header_t; -//============================================================================= -// Quantisation Lookup Table (matches TSVM exactly) -//============================================================================= - -static const int QLUT[] = 
{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120,122,124,126,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,264,272,280,288,296,304,312,320,328,336,344,352,360,368,376,384,392,400,408,416,424,432,440,448,456,464,472,480,488,496,504,512,528,544,560,576,592,608,624,640,656,672,688,704,720,736,752,768,784,800,816,832,848,864,880,896,912,928,944,960,976,992,1008,1024,1056,1088,1120,1152,1184,1216,1248,1280,1312,1344,1376,1408,1440,1472,1504,1536,1568,1600,1632,1664,1696,1728,1760,1792,1824,1856,1888,1920,1952,1984,2016,2048,2112,2176,2240,2304,2368,2432,2496,2560,2624,2688,2752,2816,2880,2944,3008,3072,3136,3200,3264,3328,3392,3456,3520,3584,3648,3712,3776,3840,3904,3968,4032,4096}; - -// Perceptual quantisation constants (match TSVM) -static const float ANISOTROPY_MULT[] = {2.0f, 1.8f, 1.6f, 1.4f, 1.2f, 1.0f}; -static const float ANISOTROPY_BIAS[] = {0.4f, 0.2f, 0.1f, 0.0f, 0.0f, 0.0f}; -static const float ANISOTROPY_MULT_CHROMA[] = {6.6f, 5.5f, 4.4f, 3.3f, 2.2f, 1.1f}; -static const float ANISOTROPY_BIAS_CHROMA[] = {1.0f, 0.8f, 0.6f, 0.4f, 0.2f, 0.0f}; -static const float FOUR_PIXEL_DETAILER = 0.88f; -static const float TWO_PIXEL_DETAILER = 0.92f; - -//============================================================================= -// DWT Subband Layout Calculation (matches TSVM) -//============================================================================= - -typedef struct { - int level; // Decomposition level (1 to decompLevels) - int subband_type; // 0=LL, 1=LH, 2=HL, 3=HH - int coeff_start; // Starting index in linear coefficient array - int coeff_count; // Number of coefficients in this subband -} dwt_subband_info_t; - -static int 
calculate_subband_layout(int width, int height, int decomp_levels, dwt_subband_info_t *subbands) { - int subband_count = 0; - - // generate division series - int widths[decomp_levels + 1]; widths[0] = width; - int heights[decomp_levels + 1]; heights[0] = height; - - for (int i = 1; i < decomp_levels + 1; i++) { - widths[i] = (int)roundf(widths[i - 1] / 2.0f); - heights[i] = (int)roundf(heights[i - 1] / 2.0f); - } - - // LL subband at maximum decomposition level - int ll_width = widths[decomp_levels]; - int ll_height = heights[decomp_levels]; - subbands[subband_count++] = (dwt_subband_info_t){decomp_levels, 0, 0, ll_width * ll_height}; - int coeff_offset = ll_width * ll_height; - - // LH, HL, HH subbands for each level from max down to 1 - for (int level = decomp_levels; level >= 1; level--) { - int level_width = widths[decomp_levels - level + 1]; - int level_height = heights[decomp_levels - level + 1]; - const int subband_size = level_width * level_height; - - // LH subband - subbands[subband_count++] = (dwt_subband_info_t){level, 1, coeff_offset, subband_size}; - coeff_offset += subband_size; - - // HL subband - subbands[subband_count++] = (dwt_subband_info_t){level, 2, coeff_offset, subband_size}; - coeff_offset += subband_size; - - // HH subband - subbands[subband_count++] = (dwt_subband_info_t){level, 3, coeff_offset, subband_size}; - coeff_offset += subband_size; - } - - return subband_count; -} - -//============================================================================= -// Perceptual Quantisation Model (matches TSVM exactly) -//============================================================================= - -static int tav_derive_encoder_qindex(int q_index, int q_y_global) { - if (q_index > 0) return q_index - 1; - if (q_y_global >= 60) return 0; - else if (q_y_global >= 42) return 1; - else if (q_y_global >= 25) return 2; - else if (q_y_global >= 12) return 3; - else if (q_y_global >= 6) return 4; - else if (q_y_global >= 2) return 5; - else return 5; 
-} - -static float perceptual_model3_LH(float level) { - const float H4 = 1.2f; - const float K = 2.0f; // CRITICAL: Fixed value for fixed curve; quantiser will scale it up anyway - const float K12 = K * 12.0f; - const float x = level; - - const float Lx = H4 - ((K + 1.0f) / 15.0f) * (x - 4.0f); - const float C3 = -1.0f / 45.0f * (K12 + 92.0f); - const float G3x = (-x / 180.0f) * (K12 + 5.0f * x * x - 60.0f * x + 252.0f) - C3 + H4; - - return (level >= 4.0f) ? Lx : G3x; -} - -static float perceptual_model3_HL(int quality, float LH) { - return LH * ANISOTROPY_MULT[quality] + ANISOTROPY_BIAS[quality]; -} - -static float lerp(float x, float y, float a) { - return x * (1.0f - a) + y * a; -} - -static float perceptual_model3_HH(float LH, float HL, float level) { - const float Kx = (sqrtf(level) - 1.0f) * 0.5f + 0.5f; - return lerp(LH, HL, Kx); -} - -static float perceptual_model3_LL(float level) { - const float n = perceptual_model3_LH(level); - const float m = perceptual_model3_LH(level - 1.0f) / n; - return n / m; -} - -static float perceptual_model3_chroma_basecurve(int quality, float level) { - return 1.0f - (1.0f / (0.5f * quality * quality + 1.0f)) * (level - 4.0f); -} - -static float get_perceptual_weight(int q_index, int q_y_global, int level0, int subband_type, - int is_chroma, int max_levels) { - // Convert to perceptual level (1-6 scale) - const float level = 1.0f + ((level0 - 1.0f) / (max_levels - 1.0f)) * 5.0f; - const int quality_level = tav_derive_encoder_qindex(q_index, q_y_global); - - if (!is_chroma) { - // LUMA CHANNEL - if (subband_type == 0) { - return perceptual_model3_LL(level); - } - - const float LH = perceptual_model3_LH(level); - if (subband_type == 1) { - return LH; - } - - const float HL = perceptual_model3_HL(quality_level, LH); - if (subband_type == 2) { - float detailer = 1.0f; - if (level >= 1.8f && level <= 2.2f) detailer = TWO_PIXEL_DETAILER; - else if (level >= 2.8f && level <= 3.2f) detailer = FOUR_PIXEL_DETAILER; - return HL * 
detailer; - } else { - // HH subband - float detailer = 1.0f; - if (level >= 1.8f && level <= 2.2f) detailer = TWO_PIXEL_DETAILER; - else if (level >= 2.8f && level <= 3.2f) detailer = FOUR_PIXEL_DETAILER; - return perceptual_model3_HH(LH, HL, level) * detailer; - } - } else { - // CHROMA CHANNELS - const float base = perceptual_model3_chroma_basecurve(quality_level, level - 1); - if (subband_type == 0) { - return 1.0f; - } else if (subband_type == 1) { - return fmaxf(base, 1.0f); - } else if (subband_type == 2) { - return fmaxf(base * ANISOTROPY_MULT_CHROMA[quality_level], 1.0f); - } else { - return fmaxf(base * ANISOTROPY_MULT_CHROMA[quality_level] + ANISOTROPY_BIAS_CHROMA[quality_level], 1.0f); - } - } -} - -static void dequantise_dwt_subbands_perceptual(int q_index, int q_y_global, const int16_t *quantised, - float *dequantised, int width, int height, int decomp_levels, - float base_quantiser, int is_chroma, int frame_num) { - dwt_subband_info_t subbands[32]; // Max possible subbands - const int subband_count = calculate_subband_layout(width, height, decomp_levels, subbands); - - const int coeff_count = width * height; - memset(dequantised, 0, coeff_count * sizeof(float)); - - int is_debug = 0;//(frame_num == 32); -// if (frame_num == 32) { -// fprintf(stderr, "DEBUG: dequantise called for frame %d, is_chroma=%d\n", frame_num, is_chroma); -// } - - // Apply perceptual weighting to each subband - for (int s = 0; s < subband_count; s++) { - const dwt_subband_info_t *subband = &subbands[s]; - const float weight = get_perceptual_weight(q_index, q_y_global, subband->level, - subband->subband_type, is_chroma, decomp_levels); - const float effective_quantiser = base_quantiser * weight; - - if (is_debug && !is_chroma) { - if (subband->subband_type == 0) { // LL band - fprintf(stderr, " Subband level %d (LL): weight=%.6f, base_q=%.1f, effective_q=%.1f, count=%d\n", - subband->level, weight, base_quantiser, effective_quantiser, subband->coeff_count); - - // Print first 5 
quantised LL coefficients - fprintf(stderr, " First 5 quantised LL: "); - for (int k = 0; k < 5 && k < subband->coeff_count; k++) { - int idx = subband->coeff_start + k; - fprintf(stderr, "%d ", quantised[idx]); - } - fprintf(stderr, "\n"); - - // Find max quantised LL coefficient - int max_quant_ll = 0; - for (int k = 0; k < subband->coeff_count; k++) { - int idx = subband->coeff_start + k; - int abs_val = quantised[idx] < 0 ? -quantised[idx] : quantised[idx]; - if (abs_val > max_quant_ll) max_quant_ll = abs_val; - } - fprintf(stderr, " Max quantised LL coefficient: %d (dequantises to %.1f)\n", - max_quant_ll, max_quant_ll * effective_quantiser); - } - } - - // Apply linear dequantisation with perceptual weights (matching encoder's linear storage) - // FIX (2025-11-11): Both EZBC and Significance-map modes now store NORMALIZED coefficients - // Encoder stores quantised values (e.g., round(377/48) = 8) - // Decoder must multiply by effective quantiser to denormalize - // Previous denormalization in EZBC caused int16_t overflow (clipping at 32767) - // for bright pixels, creating dark DWT-pattern blemishes - -/*#ifdef __AVX512F__ - // Use AVX-512 optimised dequantization if available (1.1x speedup against -Ofast) - // Check: subband has >=16 elements AND won't exceed buffer bounds - const int subband_end = subband->coeff_start + subband->coeff_count; - if (g_simd_level >= SIMD_AVX512F && subband->coeff_count >= 16 && subband_end <= coeff_count) { - dequantise_dwt_coefficients_avx512( - quantised + subband->coeff_start, - dequantised + subband->coeff_start, - subband->coeff_count, - effective_quantiser - ); - } else { -#endif*/ - // Scalar fallback or small subbands - for (int i = 0; i < subband->coeff_count; i++) { - const int idx = subband->coeff_start + i; - if (idx < coeff_count) { - const float untruncated = quantised[idx] * effective_quantiser; - dequantised[idx] = untruncated; - } - } -/*#ifdef __AVX512F__ - } -#endif*/ - } - - // Debug: Verify LL band was 
dequantised correctly - if (is_debug && !is_chroma) { - // Find LL band again to verify - for (int s = 0; s < subband_count; s++) { - const dwt_subband_info_t *subband = &subbands[s]; - if (subband->level == decomp_levels && subband->subband_type == 0) { - fprintf(stderr, " AFTER all subbands processed - First 5 dequantised LL: "); - for (int k = 0; k < 5 && k < subband->coeff_count; k++) { - int idx = subband->coeff_start + k; - fprintf(stderr, "%.1f ", dequantised[idx]); - } - fprintf(stderr, "\n"); - - // Find max dequantised LL - float max_dequant_ll = -999.0f; - for (int k = 0; k < subband->coeff_count; k++) { - int idx = subband->coeff_start + k; - float abs_val = dequantised[idx] < 0 ? -dequantised[idx] : dequantised[idx]; - if (abs_val > max_dequant_ll) max_dequant_ll = abs_val; - } - fprintf(stderr, " AFTER all subbands - Max dequantised LL: %.1f\n", max_dequant_ll); - break; - } - } - } -} - -//============================================================================= -// Grain Synthesis Removal (matches TSVM exactly) -//============================================================================= - -// Deterministic RNG for grain synthesis (matches encoder) -static inline uint32_t tav_grain_synthesis_rng(uint32_t frame, uint32_t band, uint32_t x, uint32_t y) { - uint32_t key = frame * 0x9e3779b9u ^ band * 0x7f4a7c15u ^ (y << 16) ^ x; - // rng_hash implementation - uint32_t hash = key; - hash = hash ^ (hash >> 16); - hash = hash * 0x7feb352du; - hash = hash ^ (hash >> 15); - hash = hash * 0x846ca68bu; - hash = hash ^ (hash >> 16); - return hash; -} - -// Generate triangular noise from uint32 RNG (returns value in range [-1.0, 1.0]) -static inline float tav_grain_triangular_noise(uint32_t rng_val) { - // Get two uniform random values in [0, 1] - float u1 = (rng_val & 0xFFFFu) / 65535.0f; - float u2 = ((rng_val >> 16) & 0xFFFFu) / 65535.0f; - - // Convert to range [-1, 1] and average for triangular distribution - return (u1 + u2) - 1.0f; -} - -// Apply 
grain synthesis from DWT coefficients (decoder subtracts noise) -// This must be called AFTER dequantisation but BEFORE inverse DWT -static void apply_grain_synthesis(float *coeffs, int width, int height, - int decomp_levels, int frame_num, int q_y_global, uint8_t encoder_preset, int no_grain_synthesis) { - // Command-line override: disable grain synthesis - if (no_grain_synthesis) { - return; // Skip grain synthesis entirely - } - - // Anime preset: completely disable grain synthesis - if (encoder_preset & 0x02) { - return; // Skip grain synthesis entirely - } - - dwt_subband_info_t subbands[32]; - const int subband_count = calculate_subband_layout(width, height, decomp_levels, subbands); - - // Noise amplitude (matches Kotlin: qYGlobal.coerceAtMost(32) * 0.8f) - const float noise_amplitude = (q_y_global < 32 ? q_y_global : 32) * 0.4f; // somehow this term behaves differently from the Kotlin decoder - - // Process each subband (skip LL band which is level 0) - for (int s = 0; s < subband_count; s++) { - const dwt_subband_info_t *subband = &subbands[s]; - if (subband->level == 0) continue; // Skip LL band - - // Calculate band index for RNG (matches Kotlin: level + subbandType * 31 + 16777619) - uint32_t band = subband->level + subband->subband_type * 31 + 16777619; - - // Apply noise from each coefficient in this subband - for (int i = 0; i < subband->coeff_count; i++) { - const int idx = subband->coeff_start + i; - if (idx < width * height) { - // Calculate 2D position from linear index - int y = idx / width; - int x = idx % width; - - // Generate same deterministic noise as encoder - uint32_t rng_val = tav_grain_synthesis_rng(frame_num, band, x, y); - float noise = tav_grain_triangular_noise(rng_val); - - // Subtract noise from coefficient - coeffs[idx] -= noise * noise_amplitude; - } - } - } -} - -//============================================================================= -static int calculate_dwt_levels(int chunk_size) { - /*if (chunk_size < 
TAD_MIN_CHUNK_SIZE) { - fprintf(stderr, "Error: Chunk size %d is below minimum %d\n", chunk_size, TAD_MIN_CHUNK_SIZE); - return -1; - } - - // Calculate levels: log2(chunk_size) - 1 - int levels = 0; - int size = chunk_size; - while (size > 1) { - size >>= 1; - levels++; - } - return levels - 2;*/ - return 9; -} - -//============================================================================= -// Chunk Decoding (TAD Audio) -// NOTE: TAD decoding now uses shared tad32_decode_chunk() from decoder_tad.h -// This ensures decoder_tav and decoder_tad use identical decoding logic -//============================================================================= -// Significance Map Postprocessing (matches TSVM exactly) -//============================================================================= - -// Helper: Extract 2-bit code from bit-packed array -static inline int get_twobit_code(const uint8_t *map_data, int map_bytes, int coeff_idx) { - int bit_pos = coeff_idx * 2; - int byte_idx = bit_pos / 8; - int bit_offset = bit_pos % 8; - - uint8_t byte0 = map_data[byte_idx]; - int code = (byte0 >> bit_offset) & 0x03; - - // Handle byte boundary crossing - if (bit_offset == 7 && byte_idx + 1 < map_bytes) { - uint8_t byte1 = map_data[byte_idx + 1]; - code = ((byte0 >> 7) & 0x01) | ((byte1 << 1) & 0x02); - } - - return code; -} - -// Decoder: reconstruct coefficients from 2-bit map format (entropyCoder=0) -// Layout: [Y_map_2bit][Co_map_2bit][Cg_map_2bit][Y_others][Co_others][Cg_others] -// 2-bit encoding: 00=0, 01=+1, 10=-1, 11=other (stored in value array) -static void postprocess_coefficients_twobit(uint8_t *compressed_data, int coeff_count, - int16_t *output_y, int16_t *output_co, int16_t *output_cg) { - int map_bytes = (coeff_count * 2 + 7) / 8; // 2 bits per coefficient - - // (Debug output removed) - - // Map offsets (all channels present for Y-Co-Cg layout) - uint8_t *y_map = compressed_data; - uint8_t *co_map = compressed_data + map_bytes; - uint8_t *cg_map = 
compressed_data + map_bytes * 2; - - // Count "other" values (code 11) for each channel - int y_others = 0, co_others = 0, cg_others = 0; - for (int i = 0; i < coeff_count; i++) { - if (get_twobit_code(y_map, map_bytes, i) == 3) y_others++; - if (get_twobit_code(co_map, map_bytes, i) == 3) co_others++; - if (get_twobit_code(cg_map, map_bytes, i) == 3) cg_others++; - } - - // (Debug output removed) - - // Value array offsets (after all maps) - uint8_t *value_ptr = compressed_data + map_bytes * 3; - int16_t *y_values = (int16_t *)value_ptr; - int16_t *co_values = (int16_t *)(value_ptr + y_others * 2); - int16_t *cg_values = (int16_t *)(value_ptr + y_others * 2 + co_others * 2); - - // Reconstruct coefficients - int y_value_idx = 0, co_value_idx = 0, cg_value_idx = 0; - - for (int i = 0; i < coeff_count; i++) { - // Y channel - int y_code = get_twobit_code(y_map, map_bytes, i); - switch (y_code) { - case 0: output_y[i] = 0; break; - case 1: output_y[i] = 1; break; - case 2: output_y[i] = -1; break; - case 3: output_y[i] = y_values[y_value_idx++]; break; - } - - // Co channel - int co_code = get_twobit_code(co_map, map_bytes, i); - switch (co_code) { - case 0: output_co[i] = 0; break; - case 1: output_co[i] = 1; break; - case 2: output_co[i] = -1; break; - case 3: output_co[i] = co_values[co_value_idx++]; break; - } - - // Cg channel - int cg_code = get_twobit_code(cg_map, map_bytes, i); - switch (cg_code) { - case 0: output_cg[i] = 0; break; - case 1: output_cg[i] = 1; break; - case 2: output_cg[i] = -1; break; - case 3: output_cg[i] = cg_values[cg_value_idx++]; break; - } - } -} - -//============================================================================= -// EZBC (Embedded Zero Block Coding) Decoder -//============================================================================= - -// EZBC Block structure for quadtree -typedef struct { - int x, y; - int width, height; -} ezbc_block_t; - -// EZBC bitstream reader state -typedef struct { - const uint8_t *data; - 
size_t size; - size_t byte_pos; - int bit_pos; -} ezbc_bitreader_t; - -// Read N bits from EZBC bitstream (LSB-first within each byte) -static int ezbc_read_bits(ezbc_bitreader_t *reader, int num_bits) { - int result = 0; - for (int i = 0; i < num_bits; i++) { - if (reader->byte_pos >= reader->size) { - return result; // End of stream - } - - const int bit = (reader->data[reader->byte_pos] >> reader->bit_pos) & 1; - result |= (bit << i); - - reader->bit_pos++; - if (reader->bit_pos == 8) { - reader->bit_pos = 0; - reader->byte_pos++; - } - } - return result; -} - -// EZBC block queues (simple dynamic arrays) -typedef struct { - ezbc_block_t *blocks; - int count; - int capacity; -} ezbc_block_queue_t; - -static void ezbc_queue_init(ezbc_block_queue_t *q) { - q->capacity = 256; - q->count = 0; - q->blocks = malloc(q->capacity * sizeof(ezbc_block_t)); -} - -static void ezbc_queue_free(ezbc_block_queue_t *q) { - free(q->blocks); - q->blocks = NULL; - q->count = 0; -} - -static void ezbc_queue_add(ezbc_block_queue_t *q, ezbc_block_t block) { - if (q->count >= q->capacity) { - q->capacity *= 2; - q->blocks = realloc(q->blocks, q->capacity * sizeof(ezbc_block_t)); - } - q->blocks[q->count++] = block; -} - -// Forward declaration -static int ezbc_process_significant_block_recursive( - ezbc_bitreader_t *reader, ezbc_block_t block, int bitplane, int threshold, - int16_t *output, int width, int8_t *significant, int *first_bitplane, - ezbc_block_queue_t *next_significant, ezbc_block_queue_t *next_insignificant); - -// EZBC recursive block decoder (matches Kotlin implementation) -static int ezbc_process_significant_block_recursive( - ezbc_bitreader_t *reader, ezbc_block_t block, int bitplane, int threshold, - int16_t *output, int width, int8_t *significant, int *first_bitplane, - ezbc_block_queue_t *next_significant, ezbc_block_queue_t *next_insignificant) { - - int sign_bits_read = 0; - - // If 1x1 block: read sign bit and add to significant queue - if (block.width == 1 && 
block.height == 1) { - const int idx = block.y * width + block.x; - const int sign_bit = ezbc_read_bits(reader, 1); - sign_bits_read++; - - // Set coefficient to threshold value with sign - output[idx] = sign_bit ? -threshold : threshold; - significant[idx] = 1; - first_bitplane[idx] = bitplane; - ezbc_queue_add(next_significant, block); - return sign_bits_read; - } - - // Block is > 1x1: subdivide and recursively process children - int mid_x = block.width / 2; - int mid_y = block.height / 2; - if (mid_x == 0) mid_x = 1; - if (mid_y == 0) mid_y = 1; - - // Top-left child - ezbc_block_t tl = {block.x, block.y, mid_x, mid_y}; - const int tl_flag = ezbc_read_bits(reader, 1); - if (tl_flag) { - sign_bits_read += ezbc_process_significant_block_recursive( - reader, tl, bitplane, threshold, output, width, significant, first_bitplane, - next_significant, next_insignificant); - } else { - ezbc_queue_add(next_insignificant, tl); - } - - // Top-right child (if exists) - if (block.width > mid_x) { - ezbc_block_t tr = {block.x + mid_x, block.y, block.width - mid_x, mid_y}; - const int tr_flag = ezbc_read_bits(reader, 1); - if (tr_flag) { - sign_bits_read += ezbc_process_significant_block_recursive( - reader, tr, bitplane, threshold, output, width, significant, first_bitplane, - next_significant, next_insignificant); - } else { - ezbc_queue_add(next_insignificant, tr); - } - } - - // Bottom-left child (if exists) - if (block.height > mid_y) { - ezbc_block_t bl = {block.x, block.y + mid_y, mid_x, block.height - mid_y}; - const int bl_flag = ezbc_read_bits(reader, 1); - if (bl_flag) { - sign_bits_read += ezbc_process_significant_block_recursive( - reader, bl, bitplane, threshold, output, width, significant, first_bitplane, - next_significant, next_insignificant); - } else { - ezbc_queue_add(next_insignificant, bl); - } - } - - // Bottom-right child (if exists) - if (block.width > mid_x && block.height > mid_y) { - ezbc_block_t br = {block.x + mid_x, block.y + mid_y, block.width - 
mid_x, block.height - mid_y}; - const int br_flag = ezbc_read_bits(reader, 1); - if (br_flag) { - sign_bits_read += ezbc_process_significant_block_recursive( - reader, br, bitplane, threshold, output, width, significant, first_bitplane, - next_significant, next_insignificant); - } else { - ezbc_queue_add(next_insignificant, br); - } - } - - return sign_bits_read; -} - -// Decode a single channel with EZBC -static void decode_channel_ezbc(const uint8_t *ezbc_data, size_t offset, size_t size, - int16_t *output, int expected_count) { - ezbc_bitreader_t reader = {ezbc_data, offset + size, offset, 0}; - - // Debug: Print first few bytes -// fprintf(stderr, "[EZBC] Channel decode: offset=%zu, size=%zu, first 5 bytes: %02X %02X %02X %02X %02X\n", -// offset, size, -// ezbc_data[offset], ezbc_data[offset+1], ezbc_data[offset+2], -// ezbc_data[offset+3], ezbc_data[offset+4]); - - // Read header: MSB bitplane (8 bits), width (16 bits), height (16 bits) - const int msb_bitplane = ezbc_read_bits(&reader, 8); - const int width = ezbc_read_bits(&reader, 16); - const int height = ezbc_read_bits(&reader, 16); - -// fprintf(stderr, "[EZBC] Decoded header: MSB=%d, width=%d, height=%d (expected pixels=%d)\n", -// msb_bitplane, width, height, expected_count); - - // With crop encoding, dimensions can vary per frame - trust the EZBC header - // Just ensure we don't overflow the output buffer - const int actual_count = width * height; - if (actual_count > expected_count) { - fprintf(stderr, "EZBC dimension overflow: %dx%d (%d) > %d\n", - width, height, actual_count, expected_count); - memset(output, 0, expected_count * sizeof(int16_t)); - return; - } - - // If actual count is less, only decode what we need - expected_count = actual_count; - - // Initialise output and state tracking - memset(output, 0, expected_count * sizeof(int16_t)); - int8_t *significant = calloc(expected_count, sizeof(int8_t)); - int *first_bitplane = calloc(expected_count, sizeof(int)); - - // Initialise queues - 
ezbc_block_queue_t insignificant, next_insignificant, significant_queue, next_significant; - ezbc_queue_init(&insignificant); - ezbc_queue_init(&next_insignificant); - ezbc_queue_init(&significant_queue); - ezbc_queue_init(&next_significant); - - // Start with root block - ezbc_block_t root = {0, 0, width, height}; - ezbc_queue_add(&insignificant, root); - - // Process bitplanes from MSB to LSB - for (int bitplane = msb_bitplane; bitplane >= 0; bitplane--) { - const int threshold = 1 << bitplane; - - // Process insignificant blocks - for (int i = 0; i < insignificant.count; i++) { - const int flag = ezbc_read_bits(&reader, 1); - - if (flag == 0) { - // Still insignificant - ezbc_queue_add(&next_insignificant, insignificant.blocks[i]); - } else { - // Became significant - use recursive processing - ezbc_process_significant_block_recursive( - &reader, insignificant.blocks[i], bitplane, threshold, - output, width, significant, first_bitplane, - &next_significant, &next_insignificant); - } - } - - // Process significant 1x1 blocks (refinement) - for (int i = 0; i < significant_queue.count; i++) { - ezbc_block_t block = significant_queue.blocks[i]; - const int idx = block.y * width + block.x; - const int refine_bit = ezbc_read_bits(&reader, 1); - - // Add refinement bit at current bitplane - if (refine_bit) { - const int bit_value = 1 << bitplane; - if (output[idx] < 0) { - output[idx] -= bit_value; - } else { - output[idx] += bit_value; - } - } - - // Keep in significant queue - ezbc_queue_add(&next_significant, block); - } - - // Swap queues - ezbc_block_queue_t temp_insig = insignificant; - insignificant = next_insignificant; - next_insignificant = temp_insig; - next_insignificant.count = 0; - - ezbc_block_queue_t temp_sig = significant_queue; - significant_queue = next_significant; - next_significant = temp_sig; - next_significant.count = 0; - } - - // Cleanup - free(significant); - free(first_bitplane); - ezbc_queue_free(&insignificant); - 
ezbc_queue_free(&next_insignificant); - ezbc_queue_free(&significant_queue); - ezbc_queue_free(&next_significant); - - // Debug: Count non-zero coefficients - int nonzero_count = 0; - int16_t max_val = 0, min_val = 0; - for (int i = 0; i < expected_count; i++) { - if (output[i] != 0) { - nonzero_count++; - if (output[i] > max_val) max_val = output[i]; - if (output[i] < min_val) min_val = output[i]; - } - } -// fprintf(stderr, "[EZBC] Decoded %d non-zero coeffs (%.1f%%), range: [%d, %d]\n", -// nonzero_count, 100.0 * nonzero_count / expected_count, min_val, max_val); -} - -// Helper: peek at EZBC header to get dimensions without decoding -static int ezbc_peek_dimensions(const uint8_t *compressed_data, int channel_layout, - int *out_width, int *out_height) { - const int has_y = (channel_layout & 0x04) == 0; - - if (!has_y) { - return -1; // Need Y channel to get dimensions - } - - // Read Y channel size header - const uint32_t size = ((uint32_t)compressed_data[0]) | - ((uint32_t)compressed_data[1] << 8) | - ((uint32_t)compressed_data[2] << 16) | - ((uint32_t)compressed_data[3] << 24); - - if (size < 6) { - return -1; // Too small to contain EZBC header - } - - // Skip to EZBC data for Y channel (after size header) - const uint8_t *ezbc_data = compressed_data + 4; - - // Read EZBC header: skip MSB bitplane (1 byte), then read width and height - // Note: EZBC uses bitstream format, but dimensions are at fixed positions - // We need to parse the bitstream header carefully - - // Create a temporary reader to parse the bitstream - ezbc_bitreader_t reader; - reader.data = ezbc_data; - reader.size = size; - reader.byte_pos = 0; - reader.bit_pos = 0; - - // Read header: MSB bitplane (8 bits), width (16 bits), height (16 bits) - ezbc_read_bits(&reader, 8); // Skip MSB bitplane - *out_width = ezbc_read_bits(&reader, 16); - *out_height = ezbc_read_bits(&reader, 16); - - return 0; -} - -// EZBC postprocessing for single frames -static void postprocess_coefficients_ezbc(uint8_t 
*compressed_data, int coeff_count, - int16_t *output_y, int16_t *output_co, int16_t *output_cg, - int channel_layout) { - const int has_y = (channel_layout & 0x04) == 0; - const int has_co = (channel_layout & 0x02) == 0; - const int has_cg = (channel_layout & 0x02) == 0; - - int offset = 0; - - // Decode Y channel - if (has_y && output_y) { - const uint32_t size = ((uint32_t)compressed_data[offset + 0]) | - ((uint32_t)compressed_data[offset + 1] << 8) | - ((uint32_t)compressed_data[offset + 2] << 16) | - ((uint32_t)compressed_data[offset + 3] << 24); - offset += 4; - decode_channel_ezbc(compressed_data, offset, size, output_y, coeff_count); - offset += size; - } - - // Decode Co channel - if (has_co && output_co) { - const uint32_t size = ((uint32_t)compressed_data[offset + 0]) | - ((uint32_t)compressed_data[offset + 1] << 8) | - ((uint32_t)compressed_data[offset + 2] << 16) | - ((uint32_t)compressed_data[offset + 3] << 24); - offset += 4; - decode_channel_ezbc(compressed_data, offset, size, output_co, coeff_count); - offset += size; - } - - // Decode Cg channel - if (has_cg && output_cg) { - const uint32_t size = ((uint32_t)compressed_data[offset + 0]) | - ((uint32_t)compressed_data[offset + 1] << 8) | - ((uint32_t)compressed_data[offset + 2] << 16) | - ((uint32_t)compressed_data[offset + 3] << 24); - offset += 4; - decode_channel_ezbc(compressed_data, offset, size, output_cg, coeff_count); - offset += size; - } -} - -//============================================================================= -// DWT Inverse Transforms (matches TSVM) -//============================================================================= - -// 9/7 inverse DWT (from TSVM Kotlin code) -static void dwt_97_inverse_1d(float *data, int length) { - if (length < 2) return; - - // Debug: Check if input has non-zero values -// static int call_count = 0; -// if (call_count < 5) { -// Debug: count non-zero coefficients (disabled to reduce stderr output) -// int nonzero = 0; -// for (int i = 0; i 
< length; i++) { -// if (data[i] != 0.0f) nonzero++; -// } -// fprintf(stderr, " dwt_97_inverse_1d call #%d: length=%d, nonzero=%d, first 5: %.1f %.1f %.1f %.1f %.1f\n", -// call_count, length, nonzero, -// data[0], length > 1 ? data[1] : 0.0f, length > 2 ? data[2] : 0.0f, -// length > 3 ? data[3] : 0.0f, length > 4 ? data[4] : 0.0f); -// call_count++; -// } - - float *temp = malloc(length * sizeof(float)); - int half = (length + 1) / 2; - - // Split into low and high frequency components (matching TSVM layout) - for (int i = 0; i < half; i++) { - temp[i] = data[i]; // Low-pass coefficients (first half) - } - for (int i = 0; i < length / 2; i++) { - if (half + i < length) { - temp[half + i] = data[half + i]; // High-pass coefficients (second half) - } - } - - // 9/7 inverse lifting coefficients from TSVM - const float alpha = -1.586134342f; - const float beta = -0.052980118f; - const float gamma = 0.882911076f; - const float delta = 0.443506852f; - const float K = 1.230174105f; - - // Step 1: Undo scaling - for (int i = 0; i < half; i++) { - temp[i] /= K; // Low-pass coefficients - } - for (int i = 0; i < length / 2; i++) { - if (half + i < length) { - temp[half + i] *= K; // High-pass coefficients - } - } - - // Step 2: Undo δ update - for (int i = 0; i < half; i++) { - float d_curr = (half + i < length) ? temp[half + i] : 0.0f; - float d_prev = (i > 0 && half + i - 1 < length) ? temp[half + i - 1] : d_curr; - temp[i] -= delta * (d_curr + d_prev); - } - - // Step 3: Undo γ predict - for (int i = 0; i < length / 2; i++) { - if (half + i < length) { - float s_curr = temp[i]; - float s_next = (i + 1 < half) ? temp[i + 1] : s_curr; - temp[half + i] -= gamma * (s_curr + s_next); - } - } - - // Step 4: Undo β update - for (int i = 0; i < half; i++) { - float d_curr = (half + i < length) ? temp[half + i] : 0.0f; - float d_prev = (i > 0 && half + i - 1 < length) ? 
temp[half + i - 1] : d_curr; - temp[i] -= beta * (d_curr + d_prev); - } - - // Step 5: Undo α predict - for (int i = 0; i < length / 2; i++) { - if (half + i < length) { - float s_curr = temp[i]; - float s_next = (i + 1 < half) ? temp[i + 1] : s_curr; - temp[half + i] -= alpha * (s_curr + s_next); - } - } - - // Reconstruction - interleave low and high pass - for (int i = 0; i < length; i++) { - if (i % 2 == 0) { - // Even positions: low-pass coefficients - data[i] = temp[i / 2]; - } else { - // Odd positions: high-pass coefficients - int idx = i / 2; - if (half + idx < length) { - data[i] = temp[half + idx]; - } else { - data[i] = 0.0f; - } - } - } - - // Debug: Check output (disabled to reduce stderr output) - // if (call_count <= 5) { - // int nonzero_out = 0; - // for (int i = 0; i < length; i++) { - // if (data[i] != 0.0f) nonzero_out++; - // } - // fprintf(stderr, " -> OUTPUT: nonzero=%d, first 5: %.1f %.1f %.1f %.1f %.1f\n", - // nonzero_out, - // data[0], length > 1 ? data[1] : 0.0f, length > 2 ? data[2] : 0.0f, - // length > 3 ? data[3] : 0.0f, length > 4 ? data[4] : 0.0f); - // } - - free(temp); -} - -// 5/3 inverse DWT using lifting scheme (JPEG 2000 reversible filter) -static void dwt_53_inverse_1d(float *data, int length) { - if (length < 2) return; - - float *temp = malloc(length * sizeof(float)); - int half = (length + 1) / 2; - - // Copy low-pass and high-pass subbands to temp - memcpy(temp, data, length * sizeof(float)); - - // Undo update step (low-pass) - for (int i = 0; i < half; i++) { - float update = 0.25f * ((i > 0 ? temp[half + i - 1] : 0) + - (i < half - 1 ? temp[half + i] : 0)); - temp[i] -= update; - } - - // Undo predict step (high-pass) and interleave samples - for (int i = 0; i < half; i++) { - data[2 * i] = temp[i]; // Even samples (low-pass) - int idx = 2 * i + 1; - if (idx < length) { - float pred = 0.5f * (temp[i] + (i < half - 1 ? 
temp[i + 1] : temp[i])); - data[idx] = temp[half + i] + pred; // Odd samples (high-pass) - } - } - - free(temp); -} - -// Biorthogonal 2,4 (LeGall 2/4) INVERSE 1D transform -static void dwt_bior24_inverse_1d(float *data, int length) { - if (length < 2) return; - - float *temp = malloc(sizeof(float) * length); - int half = (length + 1) / 2; - int i; - - int nE = half; - int nO = length / 2; - - float *even = temp; - float *odd = temp + nE; - - // Load L and H - for (i = 0; i < nE; i++) { - even[i] = data[i]; - } - for (i = 0; i < nO; i++) { - odd[i] = data[half + i]; - } - - // ---- Inverse update: s[i] = s[i] - 0.25*d[i] ---- - for (i = 0; i < nE; i++) { - float d = (i < nO) ? odd[i] : 0.0f; - even[i] = even[i] - 0.25f * d; - } - - // ---- Inverse predict: o[i] = d[i] + 0.5*s[i] ---- - for (i = 0; i < nO; i++) { - odd[i] = odd[i] + 0.5f * even[i]; - } - - // Interleave back into output - for (i = 0; i < nO; i++) { - data[2 * i] = even[i]; - data[2 * i + 1] = odd[i]; - } - if (nE > nO) { - // Trailing even sample for odd length - data[2 * nO] = even[nO]; - } - - free(temp); -} - -// Multi-level inverse DWT (matches TSVM exactly with correct non-power-of-2 handling) -static void apply_inverse_dwt_multilevel(float *data, int width, int height, int levels, int filter_type) { - int max_size = (width > height) ? width : height; - float *temp_row = malloc(max_size * sizeof(float)); - float *temp_col = malloc(max_size * sizeof(float)); - - // Pre-calculate exact sequence of widths/heights from forward transform - // This is CRITICAL for non-power-of-2 dimensions (e.g., 560, 448) - // Forward transform uses: width, (width+1)/2, ((width+1)/2+1)/2, ... 
- // Inverse MUST use the exact same sequence in reverse - int *widths = malloc((levels + 1) * sizeof(int)); - int *heights = malloc((levels + 1) * sizeof(int)); - - widths[0] = width; - heights[0] = height; - for (int i = 1; i <= levels; i++) { - widths[i] = (widths[i - 1] + 1) / 2; - heights[i] = (heights[i - 1] + 1) / 2; - } - - // Debug: Print dimension sequence - /*static int debug_once = 1; - if (debug_once) { - fprintf(stderr, "DWT dimension sequence for %dx%d with %d levels:\n", width, height, levels); - for (int i = 0; i <= levels; i++) { - fprintf(stderr, " Level %d: %dx%d\n", i, widths[i], heights[i]); - } - debug_once = 0; - }*/ - - // TSVM: for (level in levels - 1 downTo 0) - // Apply inverse transforms using pre-calculated dimensions - for (int level = levels - 1; level >= 0; level--) { - int current_width = widths[level]; - int current_height = heights[level]; - - if (current_width < 1 || current_height < 1) continue; - if (current_width == 1 && current_height == 1) continue; - - // TSVM: Column inverse transform first (vertical) - for (int x = 0; x < current_width; x++) { - for (int y = 0; y < current_height; y++) { - temp_col[y] = data[y * width + x]; - } - - if (filter_type == 0) { - dwt_53_inverse_1d(temp_col, current_height); - } else { - dwt_97_inverse_1d(temp_col, current_height); - } - - for (int y = 0; y < current_height; y++) { - data[y * width + x] = temp_col[y]; - } - } - - // TSVM: Row inverse transform second (horizontal) - for (int y = 0; y < current_height; y++) { - for (int x = 0; x < current_width; x++) { - temp_row[x] = data[y * width + x]; - } - - if (filter_type == 0) { - dwt_53_inverse_1d(temp_row, current_width); - } else { - dwt_97_inverse_1d(temp_row, current_width); - } - - for (int x = 0; x < current_width; x++) { - data[y * width + x] = temp_row[x]; - } - } - - // Debug after EVERY level - static int first_frame_levels = 1; - if (first_frame_levels && level <= 2) { // Only log levels 2, 1, 0 for first frame - int 
nonzero_level = 0; - for (int y = 0; y < current_height; y++) { - for (int x = 0; x < current_width; x++) { - if (fabsf(data[y * width + x]) > 0.001f) { // Use fabs for better zero detection - nonzero_level++; - } - } - } - // fprintf(stderr, "After level %d (%dx%d): nonzero=%d/%d, data[0]=%.1f, data[1]=%.1f, data[width]=%.1f\n", - // level, current_width, current_height, nonzero_level, current_width * current_height, - // data[0], data[1], data[width]); - - if (level == 0) first_frame_levels = 0; // Stop after level 0 of first frame - } - } - - // Debug: Check buffer after all levels complete (disabled to reduce stderr output) - // static int debug_output_once = 1; - // if (debug_output_once) { - // int nonzero_final = 0; - // for (int i = 0; i < width * height; i++) { - // if (data[i] != 0.0f) nonzero_final++; - // } - // fprintf(stderr, "After ALL IDWT levels complete: nonzero=%d/%d, first 10: ", nonzero_final, width * height); - // for (int i = 0; i < 10 && i < width * height; i++) { - // fprintf(stderr, "%.1f ", data[i]); - // } - // fprintf(stderr, "\n"); - // debug_output_once = 0; - // } - - free(widths); - free(heights); - free(temp_row); - free(temp_col); -} - -//============================================================================= -// Temporal DWT and GOP Decoding (matches TSVM) -//============================================================================= - -// Get temporal subband level for a given frame index in a GOP -static int get_temporal_subband_level(int frame_idx, int num_frames, int temporal_levels) { - // Match encoder logic exactly (encoder_tav.c:1487-1506) - // After temporal DWT with N levels, frames are organised as: - // Frames 0...num_frames/(2^N) = tL...L (N low-passes, coarsest, level 0) - // Remaining frames are temporal high-pass subbands at various levels - - // Check each level boundary from coarsest to finest - for (int level = 0; level < temporal_levels; level++) { - int frames_at_this_level = num_frames >> 
(temporal_levels - level); - if (frame_idx < frames_at_this_level) { - return level; - } - } - - // Finest level (first decomposition's high-pass) - return temporal_levels; -} - -// Calculate temporal quantiser scale for a given temporal subband level -static float get_temporal_quantiser_scale(uint8_t encoder_preset, int temporal_level) { - // Uses exponential scaling: 2^(BETA × level^KAPPA) - // With BETA=0.6, KAPPA=1.14: - // - Level 0 (tLL): 2^0.0 = 1.00 - // - Level 1 (tH): 2^0.68 = 1.61 - // - Level 2 (tHH): 2^1.29 = 2.45 - const float BETA = (encoder_preset & 0x01) ? 0.0f : 0.6f; - const float KAPPA = (encoder_preset & 0x01) ? 1.0f : 1.14f; - return powf(2.0f, BETA * powf(temporal_level, KAPPA)); -} - -// Inverse Haar 1D DWT -static void dwt_haar_inverse_1d(float *data, int length) { - if (length < 2) return; - - float *temp = malloc(length * sizeof(float)); - const int half = (length + 1) / 2; - - // Inverse Haar transform: reconstruct from averages and differences - // Read directly from data array (already has low-pass then high-pass layout) - for (int i = 0; i < half; i++) { - if (2 * i + 1 < length) { - // Reconstruct adjacent pairs from average and difference - temp[2 * i] = data[i] + data[half + i]; // average + difference - temp[2 * i + 1] = data[i] - data[half + i]; // average - difference - } else { - // Handle odd length: last sample comes from low-pass only - temp[2 * i] = data[i]; - } - } - - // Copy reconstructed data back - for (int i = 0; i < length; i++) { - data[i] = temp[i]; - } - - free(temp); -} - -// Apply inverse 3D DWT to GOP data (spatial + temporal) -// Order: SPATIAL first (each frame), then TEMPORAL (across frames) -static void apply_inverse_3d_dwt(float **gop_y, float **gop_co, float **gop_cg, - int width, int height, int gop_size, - int spatial_levels, int temporal_levels, int filter_type, - int temporal_motion_coder) { - // Step 1: Apply inverse 2D spatial DWT to each frame - for (int t = 0; t < gop_size; t++) { - 
apply_inverse_dwt_multilevel(gop_y[t], width, height, spatial_levels, filter_type); - apply_inverse_dwt_multilevel(gop_co[t], width, height, spatial_levels, filter_type); - apply_inverse_dwt_multilevel(gop_cg[t], width, height, spatial_levels, filter_type); - } - - // Step 2: Apply inverse temporal DWT to each spatial location - // Only needed for GOPs with multiple frames (skip for I-frames) - if (gop_size < 2) return; - - // Pre-calculate all intermediate lengths for temporal DWT (same fix as TAD) - // This ensures correct reconstruction for non-power-of-2 GOP sizes - int *temporal_lengths = malloc((temporal_levels + 1) * sizeof(int)); - temporal_lengths[0] = gop_size; - for (int i = 1; i <= temporal_levels; i++) { - temporal_lengths[i] = (temporal_lengths[i - 1] + 1) / 2; - } - - float *temporal_line = malloc(gop_size * sizeof(float)); - for (int y = 0; y < height; y++) { - for (int x = 0; x < width; x++) { - const int pixel_idx = y * width + x; - - // Process Y channel - for (int t = 0; t < gop_size; t++) { - temporal_line[t] = gop_y[t][pixel_idx]; - } - for (int level = temporal_levels - 1; level >= 0; level--) { - const int level_frames = temporal_lengths[level]; - if (level_frames >= 2) { - // Use selected temporal wavelet (0=Haar, 1=CDF 5/3) - if (temporal_motion_coder == 0) { - dwt_haar_inverse_1d(temporal_line, level_frames); - } else { - dwt_53_inverse_1d(temporal_line, level_frames); - } - } - } - for (int t = 0; t < gop_size; t++) { - gop_y[t][pixel_idx] = temporal_line[t]; - } - - // Process Co channel - for (int t = 0; t < gop_size; t++) { - temporal_line[t] = gop_co[t][pixel_idx]; - } - for (int level = temporal_levels - 1; level >= 0; level--) { - const int level_frames = temporal_lengths[level]; - if (level_frames >= 2) { - // Use selected temporal wavelet (0=Haar, 1=CDF 5/3) - if (temporal_motion_coder == 0) { - dwt_haar_inverse_1d(temporal_line, level_frames); - } else { - dwt_53_inverse_1d(temporal_line, level_frames); - } - } - } - for (int t 
= 0; t < gop_size; t++) { - gop_co[t][pixel_idx] = temporal_line[t]; - } - - // Process Cg channel - for (int t = 0; t < gop_size; t++) { - temporal_line[t] = gop_cg[t][pixel_idx]; - } - for (int level = temporal_levels - 1; level >= 0; level--) { - const int level_frames = temporal_lengths[level]; - if (level_frames >= 2) { - // Use selected temporal wavelet (0=Haar, 1=CDF 5/3) - if (temporal_motion_coder == 0) { - dwt_haar_inverse_1d(temporal_line, level_frames); - } else { - dwt_53_inverse_1d(temporal_line, level_frames); - } - } - } - for (int t = 0; t < gop_size; t++) { - gop_cg[t][pixel_idx] = temporal_line[t]; - } - } - } - - free(temporal_line); - free(temporal_lengths); -} - -// Postprocess GOP unified block to per-frame coefficients (2-bit map format) -static int16_t ***postprocess_gop_unified(const uint8_t *decompressed_data, size_t data_size, - int gop_size, int num_pixels, int channel_layout) { - // 2 bits per coefficient - const int map_bytes_per_frame = (num_pixels * 2 + 7) / 8; - - // Determine which channels are present - // Bit 0: has alpha, Bit 1: has chroma (inverted), Bit 2: has luma (inverted) - const int has_y = (channel_layout & 0x04) == 0; - const int has_co = (channel_layout & 0x02) == 0; // Inverted: 0 = has chroma - const int has_cg = (channel_layout & 0x02) == 0; // Inverted: 0 = has chroma - - // Calculate buffer positions for maps - int read_ptr = 0; - const int y_maps_start = has_y ? read_ptr : -1; - if (has_y) read_ptr += map_bytes_per_frame * gop_size; - - const int co_maps_start = has_co ? read_ptr : -1; - if (has_co) read_ptr += map_bytes_per_frame * gop_size; - - const int cg_maps_start = has_cg ? 
read_ptr : -1; - if (has_cg) read_ptr += map_bytes_per_frame * gop_size; - - // Count "other" values (code 11) across ALL frames - int y_other_count = 0; - int co_other_count = 0; - int cg_other_count = 0; - - for (int frame = 0; frame < gop_size; frame++) { - const int frame_map_offset = frame * map_bytes_per_frame; - for (int i = 0; i < num_pixels; i++) { - const int bit_pos = i * 2; - const int byte_idx = bit_pos / 8; - const int bit_offset = bit_pos % 8; - - if (has_y && y_maps_start + frame_map_offset + byte_idx < (int)data_size) { - int code = (decompressed_data[y_maps_start + frame_map_offset + byte_idx] >> bit_offset) & 0x03; - if (bit_offset == 7 && byte_idx + 1 < map_bytes_per_frame) { - const int next_byte = decompressed_data[y_maps_start + frame_map_offset + byte_idx + 1] & 0xFF; - code = (code & 0x01) | ((next_byte & 0x01) << 1); - } - if (code == 3) y_other_count++; - } - if (has_co && co_maps_start + frame_map_offset + byte_idx < (int)data_size) { - int code = (decompressed_data[co_maps_start + frame_map_offset + byte_idx] >> bit_offset) & 0x03; - if (bit_offset == 7 && byte_idx + 1 < map_bytes_per_frame) { - const int next_byte = decompressed_data[co_maps_start + frame_map_offset + byte_idx + 1] & 0xFF; - code = (code & 0x01) | ((next_byte & 0x01) << 1); - } - if (code == 3) co_other_count++; - } - if (has_cg && cg_maps_start + frame_map_offset + byte_idx < (int)data_size) { - int code = (decompressed_data[cg_maps_start + frame_map_offset + byte_idx] >> bit_offset) & 0x03; - if (bit_offset == 7 && byte_idx + 1 < map_bytes_per_frame) { - const int next_byte = decompressed_data[cg_maps_start + frame_map_offset + byte_idx + 1] & 0xFF; - code = (code & 0x01) | ((next_byte & 0x01) << 1); - } - if (code == 3) cg_other_count++; - } - } - } - - // Value arrays start after all maps - const int y_values_start = read_ptr; - read_ptr += y_other_count * 2; - - const int co_values_start = read_ptr; - read_ptr += co_other_count * 2; - - const int cg_values_start = 
read_ptr; - - // Allocate output arrays: [gop_size][3 channels][num_pixels] - int16_t ***output = malloc(gop_size * sizeof(int16_t **)); - for (int t = 0; t < gop_size; t++) { - output[t] = malloc(3 * sizeof(int16_t *)); - output[t][0] = calloc(num_pixels, sizeof(int16_t)); // Y - output[t][1] = calloc(num_pixels, sizeof(int16_t)); // Co - output[t][2] = calloc(num_pixels, sizeof(int16_t)); // Cg - } - - int y_value_idx = 0; - int co_value_idx = 0; - int cg_value_idx = 0; - - for (int frame = 0; frame < gop_size; frame++) { - const int frame_map_offset = frame * map_bytes_per_frame; - for (int i = 0; i < num_pixels; i++) { - const int bit_pos = i * 2; - const int byte_idx = bit_pos / 8; - const int bit_offset = bit_pos % 8; - - // Decode Y - if (has_y && y_maps_start + frame_map_offset + byte_idx < (int)data_size) { - int code = (decompressed_data[y_maps_start + frame_map_offset + byte_idx] >> bit_offset) & 0x03; - if (bit_offset == 7 && byte_idx + 1 < map_bytes_per_frame) { - const int next_byte = decompressed_data[y_maps_start + frame_map_offset + byte_idx + 1] & 0xFF; - code = (code & 0x01) | ((next_byte & 0x01) << 1); - } - if (code == 0) { - output[frame][0][i] = 0; - } else if (code == 1) { - output[frame][0][i] = 1; - } else if (code == 2) { - output[frame][0][i] = -1; - } else { // code == 3 - const int val_offset = y_values_start + y_value_idx * 2; - y_value_idx++; - if (val_offset + 1 < (int)data_size) { - const int lo = decompressed_data[val_offset] & 0xFF; - const int hi = (int8_t)decompressed_data[val_offset + 1]; - output[frame][0][i] = (int16_t)((hi << 8) | lo); - } else { - output[frame][0][i] = 0; - } - } - } - - // Decode Co - if (has_co && co_maps_start + frame_map_offset + byte_idx < (int)data_size) { - int code = (decompressed_data[co_maps_start + frame_map_offset + byte_idx] >> bit_offset) & 0x03; - if (bit_offset == 7 && byte_idx + 1 < map_bytes_per_frame) { - const int next_byte = decompressed_data[co_maps_start + frame_map_offset + byte_idx 
+ 1] & 0xFF; - code = (code & 0x01) | ((next_byte & 0x01) << 1); - } - if (code == 0) { - output[frame][1][i] = 0; - } else if (code == 1) { - output[frame][1][i] = 1; - } else if (code == 2) { - output[frame][1][i] = -1; - } else { // code == 3 - const int val_offset = co_values_start + co_value_idx * 2; - co_value_idx++; - if (val_offset + 1 < (int)data_size) { - const int lo = decompressed_data[val_offset] & 0xFF; - const int hi = (int8_t)decompressed_data[val_offset + 1]; - output[frame][1][i] = (int16_t)((hi << 8) | lo); - } else { - output[frame][1][i] = 0; - } - } - } - - // Decode Cg - if (has_cg && cg_maps_start + frame_map_offset + byte_idx < (int)data_size) { - int code = (decompressed_data[cg_maps_start + frame_map_offset + byte_idx] >> bit_offset) & 0x03; - if (bit_offset == 7 && byte_idx + 1 < map_bytes_per_frame) { - const int next_byte = decompressed_data[cg_maps_start + frame_map_offset + byte_idx + 1] & 0xFF; - code = (code & 0x01) | ((next_byte & 0x01) << 1); - } - if (code == 0) { - output[frame][2][i] = 0; - } else if (code == 1) { - output[frame][2][i] = 1; - } else if (code == 2) { - output[frame][2][i] = -1; - } else { // code == 3 - const int val_offset = cg_values_start + cg_value_idx * 2; - cg_value_idx++; - if (val_offset + 1 < (int)data_size) { - const int lo = decompressed_data[val_offset] & 0xFF; - const int hi = (int8_t)decompressed_data[val_offset + 1]; - output[frame][2][i] = (int16_t)((hi << 8) | lo); - } else { - output[frame][2][i] = 0; - } - } - } - } - } - - return output; -} - -// Postprocess GOP RAW format to per-frame coefficients (entropyCoder=2) -// Layout: [All_Y_coeffs][All_Co_coeffs][All_Cg_coeffs] (raw int16 arrays) -static int16_t ***postprocess_gop_raw(const uint8_t *decompressed_data, size_t data_size, - int gop_size, int num_pixels, int channel_layout) { - // Determine which channels are present - const int has_y = (channel_layout & 0x04) == 0; - const int has_co = (channel_layout & 0x02) == 0; - const int has_cg 
= (channel_layout & 0x02) == 0; - - // Allocate output arrays: [gop_size][3 channels][num_pixels] - int16_t ***output = malloc(gop_size * sizeof(int16_t **)); - for (int t = 0; t < gop_size; t++) { - output[t] = malloc(3 * sizeof(int16_t *)); - output[t][0] = calloc(num_pixels, sizeof(int16_t)); // Y - output[t][1] = calloc(num_pixels, sizeof(int16_t)); // Co - output[t][2] = calloc(num_pixels, sizeof(int16_t)); // Cg - } - - int offset = 0; - - // Read Y channel (all frames concatenated) - if (has_y) { - const int channel_size = gop_size * num_pixels * sizeof(int16_t); - if (offset + channel_size > (int)data_size) { - fprintf(stderr, "Error: Not enough data for Y channel in RAW GOP\n"); - goto error_cleanup; - } - const int16_t *y_data = (const int16_t *)(decompressed_data + offset); - for (int t = 0; t < gop_size; t++) { - memcpy(output[t][0], y_data + t * num_pixels, num_pixels * sizeof(int16_t)); - } - offset += channel_size; - } - - // Read Co channel (all frames concatenated) - if (has_co) { - const int channel_size = gop_size * num_pixels * sizeof(int16_t); - if (offset + channel_size > (int)data_size) { - fprintf(stderr, "Error: Not enough data for Co channel in RAW GOP\n"); - goto error_cleanup; - } - const int16_t *co_data = (const int16_t *)(decompressed_data + offset); - for (int t = 0; t < gop_size; t++) { - memcpy(output[t][1], co_data + t * num_pixels, num_pixels * sizeof(int16_t)); - } - offset += channel_size; - } - - // Read Cg channel (all frames concatenated) - if (has_cg) { - const int channel_size = gop_size * num_pixels * sizeof(int16_t); - if (offset + channel_size > (int)data_size) { - fprintf(stderr, "Error: Not enough data for Cg channel in RAW GOP\n"); - goto error_cleanup; - } - const int16_t *cg_data = (const int16_t *)(decompressed_data + offset); - for (int t = 0; t < gop_size; t++) { - memcpy(output[t][2], cg_data + t * num_pixels, num_pixels * sizeof(int16_t)); - } - offset += channel_size; - } - - return output; - -error_cleanup: 
- for (int t = 0; t < gop_size; t++) { - free(output[t][0]); - free(output[t][1]); - free(output[t][2]); - free(output[t]); - } - free(output); - return NULL; -} - -// Postprocess GOP EZBC format to per-frame coefficients (entropyCoder=1) -// Layout: [frame0_size(4)][frame0_ezbc_data][frame1_size(4)][frame1_ezbc_data]... -// Note: EZBC is a complex embedded bitplane codec - this is a simplified placeholder -// Returns the actual dimensions through output parameters (for crop encoding support) -static int16_t ***postprocess_gop_ezbc(const uint8_t *decompressed_data, size_t data_size, - int gop_size, int num_pixels, int channel_layout, - int *out_width, int *out_height) { - // First, peek at the first frame's dimensions to determine actual GOP size - // (with crop encoding, GOP dimensions may be smaller than full frame) - int actual_width = 0, actual_height = 0; - int actual_pixels = num_pixels; // Default to full frame if peek fails - - if (data_size >= 8) { // Need at least frame size header + some EZBC data - // Skip first frame's size header to get to EZBC data - const uint32_t first_frame_size = ((uint32_t)decompressed_data[0]) | - ((uint32_t)decompressed_data[1] << 8) | - ((uint32_t)decompressed_data[2] << 16) | - ((uint32_t)decompressed_data[3] << 24); - - if (4 + first_frame_size <= data_size) { - if (ezbc_peek_dimensions(decompressed_data + 4, channel_layout, - &actual_width, &actual_height) == 0) { - actual_pixels = actual_width * actual_height; - // Only log if dimensions differ significantly (crop encoding active) - // Suppress repetitive messages by using static counter - static int crop_log_count = 0; - if (actual_pixels != num_pixels && crop_log_count < 3) { - fprintf(stderr, "[GOP-EZBC] Detected crop encoding: GOP dimensions %dx%d (%d pixels) vs full frame %d pixels\n", - actual_width, actual_height, actual_pixels, num_pixels); - crop_log_count++; - if (crop_log_count == 3) { - fprintf(stderr, "[GOP-EZBC] (Further crop encoding messages 
suppressed)\n"); - } - } - } - } - } - - // If we didn't successfully peek dimensions, calculate from num_pixels - if (actual_width == 0 || actual_height == 0) { - // Assume square-ish dimensions - this is a fallback, should not happen with proper encoding - actual_width = (int)sqrt(num_pixels); - actual_height = num_pixels / actual_width; - actual_pixels = actual_width * actual_height; - } - - // Return actual dimensions to caller - if (out_width) *out_width = actual_width; - if (out_height) *out_height = actual_height; - - // Allocate output arrays: [gop_size][3 channels][actual_pixels] - // Use actual GOP dimensions (may be cropped) not full frame size - int16_t ***output = malloc(gop_size * sizeof(int16_t **)); - for (int t = 0; t < gop_size; t++) { - output[t] = malloc(3 * sizeof(int16_t *)); - output[t][0] = calloc(actual_pixels, sizeof(int16_t)); // Y - output[t][1] = calloc(actual_pixels, sizeof(int16_t)); // Co - output[t][2] = calloc(actual_pixels, sizeof(int16_t)); // Cg - } - - int offset = 0; - - // Read each frame - for (int t = 0; t < gop_size; t++) { - if (offset + 4 > (int)data_size) { - fprintf(stderr, "Error: Not enough data for frame %d size in EZBC GOP\n", t); - goto error_cleanup; - } - - // Read frame size (4 bytes, little-endian) - const uint32_t frame_size = ((uint32_t)decompressed_data[offset + 0]) | - ((uint32_t)decompressed_data[offset + 1] << 8) | - ((uint32_t)decompressed_data[offset + 2] << 16) | - ((uint32_t)decompressed_data[offset + 3] << 24); - offset += 4; - - if (offset + frame_size > data_size) { - fprintf(stderr, "Error: Frame %d EZBC data exceeds buffer (size=%u, available=%zu)\n", - t, frame_size, data_size - offset); - goto error_cleanup; - } - - // Decode EZBC frame using the single-frame EZBC decoder - // Pass actual_pixels (cropped size) not num_pixels (full frame size) - postprocess_coefficients_ezbc( - (uint8_t *)(decompressed_data + offset), actual_pixels, - output[t][0], output[t][1], output[t][2], - channel_layout); 
- - offset += frame_size; - } - - return output; - -error_cleanup: - for (int t = 0; t < gop_size; t++) { - free(output[t][0]); - free(output[t][1]); - free(output[t][2]); - free(output[t]); - } - free(output); - return NULL; -} - -//============================================================================= -// YCoCg-R / ICtCp to RGB Conversion (matches TSVM) -//============================================================================= - -static void ycocg_r_to_rgb(float y, float co, float cg, uint8_t *r, uint8_t *g, uint8_t *b) { - float tmp = y - cg / 2.0f; - float g_val = cg + tmp; - float b_val = tmp - co / 2.0f; - float r_val = co + b_val; - - // FIX: Use truncation (not rounding) to match Kotlin decoder behavior - // Kotlin uses .toInt() which truncates toward zero (floor for positive values) - *r = CLAMP(roundf(r_val), 0, 255); - *g = CLAMP(roundf(g_val), 0, 255); - *b = CLAMP(roundf(b_val), 0, 255); -} - -// ICtCp to RGB conversion (for even TAV versions) -static void ictcp_to_rgb(float i, float ct, float cp, uint8_t *r, uint8_t *g, uint8_t *b) { - // ICtCp → RGB conversion (inverse of RGB → ICtCp) - // Step 1: ICtCp → LMS - float l = i + 0.008609f * ct; - float m = i - 0.008609f * ct; - float s = i + 0.560031f * cp; - - // Step 2: LMS (nonlinear) → LMS (linear) - // Inverse PQ transfer function (simplified) - l = powf(fmaxf(l, 0.0f), 1.0f / 0.1593f); - m = powf(fmaxf(m, 0.0f), 1.0f / 0.1593f); - s = powf(fmaxf(s, 0.0f), 1.0f / 0.1593f); - - // Step 3: LMS → RGB - float r_val = 5.432622f * l - 4.679910f * m + 0.247288f * s; - float g_val = -1.106160f * l + 2.311198f * m - 0.205038f * s; - float b_val = 0.028262f * l - 0.195689f * m + 1.167427f * s; - - *r = CLAMP((int)(r_val * 255.0f + 0.5f), 0, 255); - *g = CLAMP((int)(g_val * 255.0f + 0.5f), 0, 255); - *b = CLAMP((int)(b_val * 255.0f + 0.5f), 0, 255); -} - -//============================================================================= -// WAV File Writing 
-//============================================================================= - -static void write_wav_header(FILE *fp, uint32_t sample_rate, uint16_t channels, uint32_t data_size) { - // RIFF header - fwrite("RIFF", 1, 4, fp); - uint32_t file_size = 36 + data_size; - fwrite(&file_size, 4, 1, fp); - fwrite("WAVE", 1, 4, fp); - - // fmt chunk - fwrite("fmt ", 1, 4, fp); - uint32_t fmt_size = 16; - fwrite(&fmt_size, 4, 1, fp); - uint16_t audio_format = 1; // PCM - fwrite(&audio_format, 2, 1, fp); - fwrite(&channels, 2, 1, fp); - fwrite(&sample_rate, 4, 1, fp); - uint32_t byte_rate = sample_rate * channels * 1; // 1 byte per sample (u8) - fwrite(&byte_rate, 4, 1, fp); - uint16_t block_align = channels * 1; - fwrite(&block_align, 2, 1, fp); - uint16_t bits_per_sample = 8; - fwrite(&bits_per_sample, 2, 1, fp); - - // data chunk - fwrite("data", 1, 4, fp); - fwrite(&data_size, 4, 1, fp); -} - -//============================================================================= -// Decoder State Structure -//============================================================================= - -// Screen masking entry (letterbox/pillarbox geometry change) -typedef struct { - uint32_t frame_num; - uint16_t top; - uint16_t right; - uint16_t bottom; - uint16_t left; -} screen_mask_entry_t; +// ============================================================================= +// Decoder Context +// ============================================================================= typedef struct { + // Input/output + char *input_file; + char *output_file; FILE *input_fp; + + // TAV header info tav_header_t header; - uint8_t *current_frame_rgb; - uint8_t *reference_frame_rgb; - float *dwt_buffer_y; - float *dwt_buffer_co; - float *dwt_buffer_cg; - float *reference_ycocg_y; // For P-frame delta accumulation - float *reference_ycocg_co; - float *reference_ycocg_cg; - int frame_count; - int frame_size; - int is_monoblock; // True if version 3-6 (single tile mode) - int temporal_motion_coder; // 
Temporal wavelet: 0=Haar, 1=CDF 5/3 (extracted from version) - int no_grain_synthesis; // Command-line flag: disable grain synthesis + int perceptual_mode; - // Screen masking (letterbox/pillarbox) - array of geometry changes - screen_mask_entry_t *screen_masks; - int screen_mask_count; - int screen_mask_capacity; - // Current active mask - uint16_t screen_mask_top; - uint16_t screen_mask_right; - uint16_t screen_mask_bottom; - uint16_t screen_mask_left; + // Video decoder context + tav_video_context_t *video_ctx; - // Phase 2: Decoding dimensions (may differ from full frame dimensions per GOP) - int decoding_width; // Actual encoded dimensions (cropped active region) - int decoding_height; // Updated when Screen Mask packet is encountered - // Note: Buffers are allocated at max size (header.width × header.height) - // but only decoding_width × decoding_height portion is used - - // FFmpeg pipe for video only (audio from file) - FILE *video_pipe; + // FFmpeg integration pid_t ffmpeg_pid; + FILE *video_pipe; + char *audio_temp_file; + FILE *audio_temp_fp; - // Temporary audio file - char *audio_file_path; -} tav_decoder_t; + // Frame buffers + uint8_t **gop_frames; + int gop_frames_allocated; -//============================================================================= -// Pass 1: Extract Audio to WAV File -//============================================================================= + // Statistics + uint64_t frames_decoded; + uint64_t gops_decoded; + uint64_t audio_samples_decoded; + uint64_t bytes_read; + time_t start_time; -static int extract_audio_to_wav(const char *input_file, const char *wav_file, int verbose) { - FILE *input_fp = fopen(input_file, "rb"); - if (!input_fp) { - fprintf(stderr, "Failed to open input file for audio extraction\n"); + // Options + int verbose; + int decode_limit; // Max frames to decode (0=all) + int output_raw; // Output raw video instead of FFV1 + int no_audio; // Skip audio decoding + int dump_packets; // Debug: dump packet 
info + +} decoder_context_t; + +// ============================================================================= +// TAV Header Parsing +// ============================================================================= + +static int read_tav_header(decoder_context_t *ctx) { + // Read raw header bytes + uint8_t header_bytes[32]; + if (fread(header_bytes, 1, 32, ctx->input_fp) != 32) { + fprintf(stderr, "Error: Failed to read TAV header\n"); return -1; } - // Read header - tav_header_t header; - if (fread(&header, sizeof(tav_header_t), 1, input_fp) != 1) { - fclose(input_fp); - return -1; - } - - // Open temporary audio file - FILE *wav_fp = fopen(wav_file, "wb"); - if (!wav_fp) { - fprintf(stderr, "Failed to create temporary audio file\n"); - fclose(input_fp); - return -1; - } - - // Write placeholder WAV header (will be updated later) - write_wav_header(wav_fp, 32000, 2, 0); - - uint32_t total_audio_bytes = 0; - int packet_count = 0; - - if (verbose) { - fprintf(stderr, "[Pass 1] Extracting audio to %s...\n", wav_file); - } - - // Read all packets and extract audio - while (1) { - uint8_t packet_type; - if (fread(&packet_type, 1, 1, input_fp) != 1) { - break; // EOF - } - - packet_count++; - - // Skip non-audio packets - if (packet_type == TAV_PACKET_SYNC || packet_type == TAV_PACKET_SYNC_NTSC) { - continue; - } - - if (packet_type == TAV_PACKET_TIMECODE) { - fseek(input_fp, 8, SEEK_CUR); // Skip timecode - continue; - } - - if (packet_type == TAV_PACKET_GOP_SYNC) { - fseek(input_fp, 1, SEEK_CUR); // Skip frame count - continue; - } - - if (packet_type == TAV_PACKET_SCREEN_MASK) { - fseek(input_fp, 12, SEEK_CUR); // Skip frame_num(4) + top(2) + right(2) + bottom(2) + left(2) - continue; - } - - if (packet_type == TAV_PACKET_GOP_UNIFIED) { - uint8_t gop_size; - uint32_t compressed_size; - fread(&gop_size, 1, 1, input_fp); - fread(&compressed_size, 4, 1, input_fp); - fseek(input_fp, compressed_size, SEEK_CUR); // Skip GOP data - continue; - } - - // Handle TAD audio - 
if (packet_type == TAV_PACKET_AUDIO_TAD) { - uint16_t sample_count_wrapper; - uint32_t payload_size_plus_7; - fread(&sample_count_wrapper, 2, 1, input_fp); - fread(&payload_size_plus_7, 4, 1, input_fp); - - uint16_t sample_count_chunk; - uint8_t quantiser_index; - uint32_t compressed_size; - fread(&sample_count_chunk, 2, 1, input_fp); - fread(&quantiser_index, 1, 1, input_fp); - fread(&compressed_size, 4, 1, input_fp); - - uint8_t *tad_compressed = malloc(compressed_size); - fread(tad_compressed, 1, compressed_size, input_fp); - - // Build TAD chunk - size_t tad_chunk_size = 2 + 1 + 4 + compressed_size; - uint8_t *tad_chunk = malloc(tad_chunk_size); - memcpy(tad_chunk, &sample_count_chunk, 2); - memcpy(tad_chunk + 2, &quantiser_index, 1); - memcpy(tad_chunk + 3, &compressed_size, 4); - memcpy(tad_chunk + 7, tad_compressed, compressed_size); - free(tad_compressed); - - // Decode TAD - uint8_t *pcmu8_output = malloc(sample_count_chunk * 2); - size_t bytes_consumed, samples_decoded; - int decode_result = tad32_decode_chunk(tad_chunk, tad_chunk_size, - pcmu8_output, &bytes_consumed, &samples_decoded); - - if (decode_result >= 0) { - size_t pcm_bytes = samples_decoded * 2; - fwrite(pcmu8_output, 1, pcm_bytes, wav_fp); - total_audio_bytes += pcm_bytes; - } - - free(tad_chunk); - free(pcmu8_output); - continue; - } - - // Handle PCM8 audio - if (packet_type == TAV_PACKET_AUDIO_PCM8) { - uint32_t packet_size; - fread(&packet_size, 4, 1, input_fp); - - uint8_t *compressed_data = malloc(packet_size); - fread(compressed_data, 1, packet_size, input_fp); - - // Decompress - size_t decompressed_bound = ZSTD_getFrameContentSize(compressed_data, packet_size); - uint8_t *pcm_data = malloc(decompressed_bound); - size_t decompressed_size = ZSTD_decompress(pcm_data, decompressed_bound, - compressed_data, packet_size); - free(compressed_data); - - if (!ZSTD_isError(decompressed_size)) { - fwrite(pcm_data, 1, decompressed_size, wav_fp); - total_audio_bytes += decompressed_size; - } - - 
free(pcm_data); - continue; - } - - // Handle EXTENDED_HDR packet (key-value pairs) - if (packet_type == TAV_PACKET_EXTENDED_HDR) { - uint16_t num_pairs; - fread(&num_pairs, 2, 1, input_fp); - for (int i = 0; i < num_pairs; i++) { - fseek(input_fp, 4, SEEK_CUR); // Skip key (4 bytes) - uint8_t value_type; - fread(&value_type, 1, 1, input_fp); - if (value_type == 0x04) { - fseek(input_fp, 8, SEEK_CUR); // uint64 value - } else if (value_type == 0x10) { - uint16_t str_len; - fread(&str_len, 2, 1, input_fp); - fseek(input_fp, str_len, SEEK_CUR); // string value - } - } - continue; - } - - // Read packet size for standard packets - uint32_t packet_size; - if (fread(&packet_size, 4, 1, input_fp) == 1) { - fseek(input_fp, packet_size, SEEK_CUR); - } - } - - // Update WAV header with actual data size - fseek(wav_fp, 0, SEEK_SET); - write_wav_header(wav_fp, 32000, 2, total_audio_bytes); - - fclose(wav_fp); - fclose(input_fp); - - if (verbose) { - fprintf(stderr, "[Pass 1] Extracted %u bytes of audio (%d packets processed)\n", - total_audio_bytes, packet_count); - } - - return 0; -} - -//============================================================================= -// Decoder Initialisation and Cleanup -//============================================================================= - -static tav_decoder_t* tav_decoder_init(const char *input_file, const char *output_file, const char *audio_file, int no_grain_synthesis) { - tav_decoder_t *decoder = calloc(1, sizeof(tav_decoder_t)); - if (!decoder) return NULL; - - decoder->no_grain_synthesis = no_grain_synthesis; - decoder->input_fp = fopen(input_file, "rb"); - if (!decoder->input_fp) { - free(decoder); - return NULL; - } - - // Read header - if (fread(&decoder->header, sizeof(tav_header_t), 1, decoder->input_fp) != 1) { - fclose(decoder->input_fp); - free(decoder); - return NULL; - } - // Verify magic - if (memcmp(decoder->header.magic, TAV_MAGIC, 8) != 0) { - fclose(decoder->input_fp); - free(decoder); - return NULL; + if 
(memcmp(header_bytes, TAV_MAGIC, 8) != 0) { + fprintf(stderr, "Error: Invalid TAV magic (not a TAV file)\n"); + return -1; } - decoder->frame_size = decoder->header.width * decoder->header.height; - // Extract temporal motion coder from version (versions 9-16 use CDF 5/3, 1-8 use Haar) - decoder->temporal_motion_coder = (decoder->header.version > 8) ? 1 : 0; - // Extract base version for determining monoblock mode - uint8_t base_version = (decoder->header.version > 8) ? (decoder->header.version - 8) : decoder->header.version; - decoder->is_monoblock = (base_version >= 3 && base_version <= 6); - decoder->audio_file_path = strdup(audio_file); + // Parse header fields manually (avoid packing issues) + memcpy(ctx->header.magic, header_bytes, 8); + ctx->header.version = header_bytes[8]; + ctx->header.width = header_bytes[9] | (header_bytes[10] << 8); + ctx->header.height = header_bytes[11] | (header_bytes[12] << 8); + ctx->header.fps = header_bytes[13]; + ctx->header.total_frames = header_bytes[14] | (header_bytes[15] << 8) | + (header_bytes[16] << 16) | (header_bytes[17] << 24); + ctx->header.wavelet_filter = header_bytes[18]; + ctx->header.decomp_levels = header_bytes[19]; + ctx->header.quantiser_y = header_bytes[20]; + ctx->header.quantiser_co = header_bytes[21]; + ctx->header.quantiser_cg = header_bytes[22]; + ctx->header.extra_flags = header_bytes[23]; + ctx->header.video_flags = header_bytes[24]; + ctx->header.encoder_quality = header_bytes[25]; + ctx->header.channel_layout = header_bytes[26]; + ctx->header.entropy_coder = header_bytes[27]; + ctx->header.encoder_preset = header_bytes[28]; + ctx->header.reserved = header_bytes[29]; + ctx->header.device_orientation = header_bytes[30]; + ctx->header.file_role = header_bytes[31]; - // Phase 2: Initialize decoding dimensions to full frame (will be updated by Screen Mask packets) - decoder->decoding_width = decoder->header.width; - decoder->decoding_height = decoder->header.height; + ctx->bytes_read += 32; - // Allocate 
buffers - decoder->current_frame_rgb = calloc(decoder->frame_size * 3, 1); - decoder->reference_frame_rgb = calloc(decoder->frame_size * 3, 1); - decoder->dwt_buffer_y = calloc(decoder->frame_size, sizeof(float)); - decoder->dwt_buffer_co = calloc(decoder->frame_size, sizeof(float)); - decoder->dwt_buffer_cg = calloc(decoder->frame_size, sizeof(float)); - decoder->reference_ycocg_y = calloc(decoder->frame_size, sizeof(float)); - decoder->reference_ycocg_co = calloc(decoder->frame_size, sizeof(float)); - decoder->reference_ycocg_cg = calloc(decoder->frame_size, sizeof(float)); + // Determine perceptual mode from version + // Versions 5, 6, 13, 14 = perceptual; 3, 4, 11, 12 = uniform + int base_version = ctx->header.version & 0x07; // Remove temporal wavelet flag + ctx->perceptual_mode = (base_version == 5 || base_version == 6); - // Create FFmpeg process for video encoding (video pipe only, audio from file) + if (ctx->verbose) { + printf("=== TAV Header ===\n"); + printf(" Version: %d\n", ctx->header.version); + printf(" Resolution: %dx%d\n", ctx->header.width, ctx->header.height); + printf(" FPS: %d\n", ctx->header.fps); + printf(" Total frames: %u\n", ctx->header.total_frames); + printf(" Wavelet filter: %d\n", ctx->header.wavelet_filter); + printf(" Decomp levels: %d\n", ctx->header.decomp_levels); + printf(" Quantisers: Y=%d, Co=%d, Cg=%d\n", + ctx->header.quantiser_y, ctx->header.quantiser_co, ctx->header.quantiser_cg); + printf(" Perceptual mode: %s\n", ctx->perceptual_mode ? "yes" : "no"); + printf(" Entropy coder: %s\n", ctx->header.entropy_coder ? "EZBC" : "Twobitmap"); + printf(" Encoder preset: 0x%02X\n", ctx->header.encoder_preset); + printf(" Has audio: %s\n", (ctx->header.extra_flags & 0x01) ? 
"yes" : "no"); + printf("==================\n\n"); + } + + return 0; +} + +// ============================================================================= +// FFmpeg Integration +// ============================================================================= + +static int spawn_ffmpeg(decoder_context_t *ctx) { int video_pipe_fd[2]; - if (pipe(video_pipe_fd) == -1) { - fprintf(stderr, "Failed to create video pipe\n"); - free(decoder->current_frame_rgb); - free(decoder->reference_frame_rgb); - free(decoder->dwt_buffer_y); - free(decoder->dwt_buffer_co); - free(decoder->dwt_buffer_cg); - free(decoder->reference_ycocg_y); - free(decoder->reference_ycocg_co); - free(decoder->reference_ycocg_cg); - free(decoder->audio_file_path); - fclose(decoder->input_fp); - free(decoder); - return NULL; + + // Create pipe for video data + if (pipe(video_pipe_fd) < 0) { + fprintf(stderr, "Error: Failed to create video pipe\n"); + return -1; } - decoder->ffmpeg_pid = fork(); - if (decoder->ffmpeg_pid == -1) { - fprintf(stderr, "Failed to fork FFmpeg process\n"); - close(video_pipe_fd[0]); close(video_pipe_fd[1]); - free(decoder->current_frame_rgb); - free(decoder->reference_frame_rgb); - free(decoder->dwt_buffer_y); - free(decoder->dwt_buffer_co); - free(decoder->dwt_buffer_cg); - free(decoder->reference_ycocg_y); - free(decoder->reference_ycocg_co); - free(decoder->reference_ycocg_cg); - free(decoder->audio_file_path); - fclose(decoder->input_fp); - free(decoder); - return NULL; - } else if (decoder->ffmpeg_pid == 0) { - // Child process - FFmpeg + ctx->ffmpeg_pid = fork(); + + if (ctx->ffmpeg_pid < 0) { + fprintf(stderr, "Error: Failed to fork FFmpeg process\n"); + close(video_pipe_fd[0]); + close(video_pipe_fd[1]); + return -1; + } + + if (ctx->ffmpeg_pid == 0) { + // Child process - execute FFmpeg close(video_pipe_fd[1]); // Close write end char video_size[32]; char framerate[16]; - snprintf(video_size, sizeof(video_size), "%dx%d", decoder->header.width, decoder->header.height); - 
snprintf(framerate, sizeof(framerate), "%d", decoder->header.fps); + snprintf(video_size, sizeof(video_size), "%dx%d", ctx->header.width, ctx->header.height); + snprintf(framerate, sizeof(framerate), "%d", ctx->header.fps); // Redirect video pipe to fd 3 - dup2(video_pipe_fd[0], 3); // Video input on fd 3 + dup2(video_pipe_fd[0], 3); close(video_pipe_fd[0]); - execl("/usr/bin/ffmpeg", "ffmpeg", - "-f", "rawvideo", - "-pixel_format", "rgb24", - "-video_size", video_size, - "-framerate", framerate, - "-i", "pipe:3", // Video from fd 3 - "-i", audio_file, // Audio from file - "-color_range", "2", - "-c:v", "ffv1", // FFV1 codec - "-level", "3", // FFV1 level 3 - "-coder", "1", // Range coder - "-context", "1", // Large context - "-g", "1", // GOP size 1 (all I-frames) - "-slices", "24", // 24 slices for threading - "-slicecrc", "1", // CRC per slice - "-pixel_format", "rgb24", // make FFmpeg encode to RGB - "-color_range", "2", - "-c:a", "pcm_u8", // Audio codec (PCM unsigned 8-bit) - "-f", "matroska", // MKV container - output_file, - "-y", // Overwrite output - "-v", "warning", // Minimal logging - (char*)NULL); + if (ctx->output_raw) { + // Raw video output (no compression) + execl("/usr/bin/ffmpeg", "ffmpeg", + "-f", "rawvideo", + "-pixel_format", "rgb24", + "-video_size", video_size, + "-framerate", framerate, + "-i", "pipe:3", + "-f", "u8", + "-ar", "32000", + "-ac", "2", + "-i", ctx->audio_temp_file, + "-c:v", "rawvideo", + "-pixel_format", "rgb24", + "-c:a", "pcm_u8", + "-f", "matroska", + ctx->output_file, + "-y", + "-v", "warning", + (char*)NULL); + } else { + // FFV1 output (lossless compression) + execl("/usr/bin/ffmpeg", "ffmpeg", + "-f", "rawvideo", + "-pixel_format", "rgb24", + "-video_size", video_size, + "-framerate", framerate, + "-i", "pipe:3", + "-f", "u8", + "-ar", "32000", + "-ac", "2", + "-i", ctx->audio_temp_file, + "-color_range", "2", + "-c:v", "ffv1", + "-level", "3", + "-coder", "1", + "-context", "1", + "-g", "1", + "-slices", "24", + 
"-slicecrc", "1", + "-pixel_format", "rgb24", + "-color_range", "2", + "-c:a", "pcm_u8", + "-f", "matroska", + ctx->output_file, + "-y", + "-v", "warning", + (char*)NULL); + } - fprintf(stderr, "Failed to start FFmpeg\n"); + fprintf(stderr, "Error: Failed to execute FFmpeg\n"); exit(1); - } else { - // Parent process - close(video_pipe_fd[0]); // Close read end - - decoder->video_pipe = fdopen(video_pipe_fd[1], "wb"); - - if (!decoder->video_pipe) { - fprintf(stderr, "Failed to open video pipe for writing\n"); - kill(decoder->ffmpeg_pid, SIGTERM); - free(decoder->current_frame_rgb); - free(decoder->reference_frame_rgb); - free(decoder->dwt_buffer_y); - free(decoder->dwt_buffer_co); - free(decoder->dwt_buffer_cg); - free(decoder->reference_ycocg_y); - free(decoder->reference_ycocg_co); - free(decoder->reference_ycocg_cg); - free(decoder->audio_file_path); - fclose(decoder->input_fp); - free(decoder); - return NULL; - } } - return decoder; -} + // Parent process + close(video_pipe_fd[0]); // Close read end -static void tav_decoder_free(tav_decoder_t *decoder) { - if (!decoder) return; - - if (decoder->input_fp) fclose(decoder->input_fp); - if (decoder->video_pipe) fclose(decoder->video_pipe); - - // Wait for FFmpeg to finish - if (decoder->ffmpeg_pid > 0) { - int status; - waitpid(decoder->ffmpeg_pid, &status, 0); - } - - free(decoder->current_frame_rgb); - free(decoder->reference_frame_rgb); - free(decoder->dwt_buffer_y); - free(decoder->dwt_buffer_co); - free(decoder->dwt_buffer_cg); - free(decoder->reference_ycocg_y); - free(decoder->reference_ycocg_co); - free(decoder->reference_ycocg_cg); - free(decoder->screen_masks); - free(decoder->audio_file_path); - free(decoder); -} - -//============================================================================= -// Screen Mask Management -//============================================================================= - -// Fill masked regions (letterbox/pillarbox bars) with black -// Phase 2: Composite cropped frame 
back to full frame with black borders -static uint8_t* composite_to_full_frame(const uint8_t *cropped_rgb, - int cropped_width, int cropped_height, - int full_width, int full_height, - uint16_t top, uint16_t right, - uint16_t bottom, uint16_t left) { - // Allocate full frame buffer (filled with black) - uint8_t *full_frame = calloc(full_width * full_height * 3, sizeof(uint8_t)); - if (!full_frame) { - return NULL; - } - - // Calculate active region position in full frame - const int dest_x = left; - const int dest_y = top; - - // Copy cropped frame into active region - for (int y = 0; y < cropped_height; y++) { - for (int x = 0; x < cropped_width; x++) { - const int src_offset = (y * cropped_width + x) * 3; - const int dest_offset = ((dest_y + y) * full_width + (dest_x + x)) * 3; - - full_frame[dest_offset + 0] = cropped_rgb[src_offset + 0]; // R - full_frame[dest_offset + 1] = cropped_rgb[src_offset + 1]; // G - full_frame[dest_offset + 2] = cropped_rgb[src_offset + 2]; // B - } - } - - return full_frame; -} - -static void fill_masked_regions(uint8_t *frame_rgb, int width, int height, - uint16_t top, uint16_t right, uint16_t bottom, uint16_t left) { - // Fill top letterbox bar - for (int y = 0; y < top && y < height; y++) { - for (int x = 0; x < width; x++) { - int offset = (y * width + x) * 3; - frame_rgb[offset] = 255; // R - frame_rgb[offset + 1] = 0; // G - frame_rgb[offset + 2] = 0; // B - } - } - - // Fill bottom letterbox bar - for (int y = height - bottom; y < height; y++) { - if (y < 0) continue; - for (int x = 0; x < width; x++) { - int offset = (y * width + x) * 3; - frame_rgb[offset] = 255; // R - frame_rgb[offset + 1] = 0; // G - frame_rgb[offset + 2] = 0; // B - } - } - - // Fill left pillarbox bar - for (int y = 0; y < height; y++) { - for (int x = 0; x < left && x < width; x++) { - int offset = (y * width + x) * 3; - frame_rgb[offset] = 0; // R - frame_rgb[offset + 1] = 0; // G - frame_rgb[offset + 2] = 255; // B - } - } - - // Fill right pillarbox 
bar - for (int y = 0; y < height; y++) { - for (int x = width - right; x < width; x++) { - if (x < 0) continue; - int offset = (y * width + x) * 3; - frame_rgb[offset] = 0; // R - frame_rgb[offset + 1] = 0; // G - frame_rgb[offset + 2] = 255; // B - } - } -} - -// Update active screen mask for the given frame number -// Screen mask packets are sorted by frame_num, so we find the last entry -// with frame_num <= current_frame_num -static void update_screen_mask(tav_decoder_t *decoder, uint32_t current_frame_num) { - if (!decoder->screen_masks || decoder->screen_mask_count == 0) { - return; // No screen mask entries - } - - // Find the most recent screen mask entry for this frame - // Entries are in order, so scan backwards for efficiency - for (int i = decoder->screen_mask_count - 1; i >= 0; i--) { - if (decoder->screen_masks[i].frame_num <= current_frame_num) { - // Apply this mask - decoder->screen_mask_top = decoder->screen_masks[i].top; - decoder->screen_mask_right = decoder->screen_masks[i].right; - decoder->screen_mask_bottom = decoder->screen_masks[i].bottom; - decoder->screen_mask_left = decoder->screen_masks[i].left; - return; - } - } -} - -//============================================================================= -// Frame Decoding Logic -//============================================================================= - -static int decode_i_or_p_frame(tav_decoder_t *decoder, uint8_t packet_type, uint32_t packet_size) { - // Variable declarations for cleanup - uint8_t *compressed_data = NULL; - uint8_t *decompressed_data = NULL; - int16_t *quantised_y = NULL; - int16_t *quantised_co = NULL; - int16_t *quantised_cg = NULL; - int decode_success = 1; // Assume success, set to 0 on error - - // Read and decompress frame data - compressed_data = malloc(packet_size); - if (!compressed_data) { - fprintf(stderr, "Error: Failed to allocate %u bytes for compressed data\n", packet_size); - decode_success = 0; - goto write_frame; - } - - if (fread(compressed_data, 
1, packet_size, decoder->input_fp) != packet_size) { - fprintf(stderr, "Error: Failed to read %u bytes of compressed frame data\n", packet_size); - decode_success = 0; - goto write_frame; - } - - size_t decompressed_size = ZSTD_getFrameContentSize(compressed_data, packet_size); - if (decompressed_size == ZSTD_CONTENTSIZE_ERROR || decompressed_size == ZSTD_CONTENTSIZE_UNKNOWN) { - fprintf(stderr, "Warning: Could not determine decompressed size, using estimate\n"); - decompressed_size = decoder->frame_size * 3 * sizeof(int16_t) + 1024; - } - - decompressed_data = malloc(decompressed_size); - if (!decompressed_data) { - fprintf(stderr, "Error: Failed to allocate %zu bytes for decompressed data\n", decompressed_size); - decode_success = 0; - goto write_frame; - } - - // Debug first 3 frames compression -// static int decomp_debug = 0; -// if (decomp_debug < 3) { -// fprintf(stderr, " [ZSTD frame %d] Compressed size: %u, buffer size: %zu\n", decomp_debug, packet_size, decompressed_size); -// fprintf(stderr, " [ZSTD frame %d] First 16 bytes of COMPRESSED data: ", decomp_debug); -// for (int i = 0; i < 16 && i < (int)packet_size; i++) { -// fprintf(stderr, "%02X ", compressed_data[i]); -// } -// fprintf(stderr, "\n"); -// } - - size_t actual_size = ZSTD_decompress(decompressed_data, decompressed_size, compressed_data, packet_size); - - if (ZSTD_isError(actual_size)) { - fprintf(stderr, "Error: ZSTD decompression failed: %s\n", ZSTD_getErrorName(actual_size)); - fprintf(stderr, " Compressed size: %u, Buffer size: %zu\n", packet_size, decompressed_size); - decode_success = 0; - goto write_frame; - } - -// if (decomp_debug < 3) { -// fprintf(stderr, " [ZSTD frame %d] Decompressed size: %zu\n", decomp_debug, actual_size); -// fprintf(stderr, " [ZSTD frame %d] First 16 bytes of DECOMPRESSED data: ", decomp_debug); -// for (int i = 0; i < 16 && i < (int)actual_size; i++) { -// fprintf(stderr, "%02X ", decompressed_data[i]); -// } -// fprintf(stderr, "\n"); -// decomp_debug++; 
-// } - - // Parse block data - uint8_t *ptr = decompressed_data; - uint8_t mode = *ptr++; - uint8_t qy_override = *ptr++; - uint8_t qco_override = *ptr++; - uint8_t qcg_override = *ptr++; - - // IMPORTANT: Both header and override store QLUT indices, not values! - // Override of 0 means "use header value" - int qy = qy_override ? QLUT[qy_override] : QLUT[decoder->header.quantiser_y]; - int qco = qco_override ? QLUT[qco_override] : QLUT[decoder->header.quantiser_co]; - int qcg = qcg_override ? QLUT[qcg_override] : QLUT[decoder->header.quantiser_cg]; - - // Debug first few frames -// if (decoder->frame_count < 2) { -// fprintf(stderr, "Frame %d: mode=%d, Q: Y=%d, Co=%d, Cg=%d, decompressed=%zu bytes\n", -// decoder->frame_count, mode, qy, qco, qcg, actual_size); -// } - - if (mode == TAV_MODE_SKIP) { - // Copy from reference frame - memcpy(decoder->current_frame_rgb, decoder->reference_frame_rgb, decoder->frame_size * 3); - } else { - // Decode coefficients (use function-level variables for proper cleanup) - // Phase 2: Use decoding dimensions (actual encoded size) - const int decoding_pixels = decoder->decoding_width * decoder->decoding_height; - int coeff_count = decoding_pixels; - quantised_y = calloc(coeff_count, sizeof(int16_t)); - quantised_co = calloc(coeff_count, sizeof(int16_t)); - quantised_cg = calloc(coeff_count, sizeof(int16_t)); - - if (!quantised_y || !quantised_co || !quantised_cg) { - fprintf(stderr, "Error: Failed to allocate coefficient buffers\n"); - decode_success = 0; - goto write_frame; - } - - // Postprocess coefficients based on entropy_coder value - if (decoder->header.entropy_coder == 1) { - // EZBC format (stub implementation) - postprocess_coefficients_ezbc(ptr, coeff_count, quantised_y, quantised_co, quantised_cg, - decoder->header.channel_layout); - } else { - // Default: Twobitmap format (entropy_coder=0) - postprocess_coefficients_twobit(ptr, coeff_count, quantised_y, quantised_co, quantised_cg); - } - - // Debug: Check first few 
coefficients -// if (decoder->frame_count == 32) { -// fprintf(stderr, " First 10 quantised Y coeffs: "); -// for (int i = 0; i < 10 && i < coeff_count; i++) { -// fprintf(stderr, "%d ", quantised_y[i]); -// } -// fprintf(stderr, "\n"); -// - // Check for any large quantised values that should produce bright pixels -// int max_quant_y = 0; -// for (int i = 0; i < coeff_count; i++) { -// int abs_val = quantised_y[i] < 0 ? -quantised_y[i] : quantised_y[i]; -// if (abs_val > max_quant_y) max_quant_y = abs_val; -// } -// fprintf(stderr, " Max quantised Y coefficient: %d\n", max_quant_y); -// } - - // Phase 2: Allocate temporary DWT buffers for cropped region processing - float *temp_dwt_y = calloc(decoding_pixels, sizeof(float)); - float *temp_dwt_co = calloc(decoding_pixels, sizeof(float)); - float *temp_dwt_cg = calloc(decoding_pixels, sizeof(float)); - - if (!temp_dwt_y || !temp_dwt_co || !temp_dwt_cg) { - fprintf(stderr, "Error: Failed to allocate temporary DWT buffers\n"); - free(temp_dwt_y); - free(temp_dwt_co); - free(temp_dwt_cg); - decode_success = 0; - goto write_frame; - } - - // Dequantise (perceptual for versions 5-8, uniform for 1-4) - // Phase 2: Use decoding dimensions and temporary buffers - // Extract base version for perceptual check - uint8_t base_version = (decoder->header.version > 8) ? 
(decoder->header.version - 8) : decoder->header.version; - const int is_perceptual = (base_version >= 5 && base_version <= 8); - const int is_ezbc = (decoder->header.entropy_coder == 1); - - if (is_ezbc && is_perceptual) { - // EZBC mode with perceptual quantisation: coefficients are normalised - // Need to dequantise using perceptual weights (same as twobit-map mode) - dequantise_dwt_subbands_perceptual(0, qy, quantised_y, temp_dwt_y, - decoder->decoding_width, decoder->decoding_height, - decoder->header.decomp_levels, qy, 0, decoder->frame_count); - dequantise_dwt_subbands_perceptual(0, qy, quantised_co, temp_dwt_co, - decoder->decoding_width, decoder->decoding_height, - decoder->header.decomp_levels, qco, 1, decoder->frame_count); - dequantise_dwt_subbands_perceptual(0, qy, quantised_cg, temp_dwt_cg, - decoder->decoding_width, decoder->decoding_height, - decoder->header.decomp_levels, qcg, 1, decoder->frame_count); - } else if (is_perceptual) { - dequantise_dwt_subbands_perceptual(0, qy, quantised_y, temp_dwt_y, - decoder->decoding_width, decoder->decoding_height, - decoder->header.decomp_levels, qy, 0, decoder->frame_count); - dequantise_dwt_subbands_perceptual(0, qy, quantised_co, temp_dwt_co, - decoder->decoding_width, decoder->decoding_height, - decoder->header.decomp_levels, qco, 1, decoder->frame_count); - dequantise_dwt_subbands_perceptual(0, qy, quantised_cg, temp_dwt_cg, - decoder->decoding_width, decoder->decoding_height, - decoder->header.decomp_levels, qcg, 1, decoder->frame_count); - } else { - for (int i = 0; i < coeff_count; i++) { - temp_dwt_y[i] = quantised_y[i] * qy; - temp_dwt_co[i] = quantised_co[i] * qco; - temp_dwt_cg[i] = quantised_cg[i] * qcg; - } - } - - // Debug: Check dequantised values using correct subband layout -// if (decoder->frame_count == 32) { -// dwt_subband_info_t subbands[32]; -// const int subband_count = calculate_subband_layout(decoder->header.width, decoder->header.height, -// decoder->header.decomp_levels, subbands); 
-// - // Find LL band (highest level, type 0) -// for (int s = 0; s < subband_count; s++) { -// if (subbands[s].level == decoder->header.decomp_levels && subbands[s].subband_type == 0) { -// fprintf(stderr, " LL band: level=%d, start=%d, count=%d\n", -// subbands[s].level, subbands[s].coeff_start, subbands[s].coeff_count); -// fprintf(stderr, " Reading LL first 5 from dwt_buffer_y[0-4]: %.1f %.1f %.1f %.1f %.1f\n", -// decoder->dwt_buffer_y[0], decoder->dwt_buffer_y[1], decoder->dwt_buffer_y[2], -// decoder->dwt_buffer_y[3], decoder->dwt_buffer_y[4]); -// - // Find max in CORRECT LL band -// float max_ll = -999.0f; -// for (int i = 0; i < subbands[s].coeff_count; i++) { -// int idx = subbands[s].coeff_start + i; -// if (decoder->dwt_buffer_y[idx] > max_ll) max_ll = decoder->dwt_buffer_y[idx]; -// } -// fprintf(stderr, " Max LL coefficient BEFORE grain removal: %.1f\n", max_ll); -// break; -// } -// } -// } - - // Remove grain synthesis from Y channel (must happen after dequantisation, before inverse DWT) - // Phase 2: Use decoding dimensions and temporary buffer - apply_grain_synthesis(temp_dwt_y, decoder->decoding_width, decoder->decoding_height, - decoder->header.decomp_levels, decoder->frame_count, decoder->header.quantiser_y, - decoder->header.encoder_preset, decoder->no_grain_synthesis); - - // Debug: Check LL band AFTER grain removal -// if (decoder->frame_count == 32) { -// int ll_width = decoder->header.width; -// int ll_height = decoder->header.height; -// for (int l = 0; l < decoder->header.decomp_levels; l++) { -// ll_width = (ll_width + 1) / 2; -// ll_height = (ll_height + 1) / 2; -// } -// float max_ll = -999.0f; -// for (int i = 0; i < ll_width * ll_height; i++) { -// if (decoder->dwt_buffer_y[i] > max_ll) max_ll = decoder->dwt_buffer_y[i]; -// } -// fprintf(stderr, " Max LL coefficient AFTER grain removal: %.1f\n", max_ll); -// } - - // Apply inverse DWT with correct non-power-of-2 dimension handling - // Phase 2: Use decoding dimensions and 
temporary buffers - // Note: quantised arrays freed at write_frame label - apply_inverse_dwt_multilevel(temp_dwt_y, decoder->decoding_width, decoder->decoding_height, - decoder->header.decomp_levels, decoder->header.wavelet_filter); - apply_inverse_dwt_multilevel(temp_dwt_co, decoder->decoding_width, decoder->decoding_height, - decoder->header.decomp_levels, decoder->header.wavelet_filter); - apply_inverse_dwt_multilevel(temp_dwt_cg, decoder->decoding_width, decoder->decoding_height, - decoder->header.decomp_levels, decoder->header.wavelet_filter); - - // Debug: Check spatial domain values after IDWT -// if (decoder->frame_count == 32) { -// float max_y_spatial = -999.0f; -// for (int i = 0; i < decoder->frame_size; i++) { -// if (decoder->dwt_buffer_y[i] > max_y_spatial) max_y_spatial = decoder->dwt_buffer_y[i]; -// } -// fprintf(stderr, " Max Y in spatial domain AFTER IDWT: %.1f\n", max_y_spatial); -// } - - // Debug: Check spatial domain values after IDWT (original debug) -// if (decoder->frame_count < 1) { -// fprintf(stderr, " After IDWT - First 10 Y values: "); -// for (int i = 0; i < 10 && i < decoder->frame_size; i++) { -// fprintf(stderr, "%.1f ", decoder->dwt_buffer_y[i]); -// } -// fprintf(stderr, "\n"); -// fprintf(stderr, " Y range: min=%.1f, max=%.1f\n", -// decoder->dwt_buffer_y[0], decoder->dwt_buffer_y[decoder->frame_size-1]); -// } - - // Handle P-frame delta accumulation (in YCoCg float space) - // TODO Phase 2: P-frame support with crop encoding needs additional work - // - Reference frames are stored at full size but delta may be at cropped size - // - Need to extract/composite reference region appropriately - if (packet_type == TAV_PACKET_PFRAME && mode == TAV_MODE_DELTA) { - fprintf(stderr, "Warning: P-frame delta mode not yet fully supported with crop encoding\n"); - for (int i = 0; i < decoding_pixels; i++) { - temp_dwt_y[i] += decoder->reference_ycocg_y[i]; - temp_dwt_co[i] += decoder->reference_ycocg_co[i]; - temp_dwt_cg[i] += 
decoder->reference_ycocg_cg[i]; - } - } - - // Phase 2: Convert cropped region to RGB, then composite to full frame - uint8_t *cropped_rgb = malloc(decoding_pixels * 3); - if (!cropped_rgb) { - fprintf(stderr, "Error: Failed to allocate cropped RGB buffer\n"); - free(temp_dwt_y); - free(temp_dwt_co); - free(temp_dwt_cg); - decode_success = 0; - goto write_frame; - } - - // Convert YCoCg-R/ICtCp to RGB for cropped region - // Extract base version for ICtCp check (even versions use ICtCp) - uint8_t base_version_rgb = (decoder->header.version > 8) ? (decoder->header.version - 8) : decoder->header.version; - const int is_ictcp = (base_version_rgb % 2 == 0); - - for (int i = 0; i < decoding_pixels; i++) { - uint8_t r, g, b; - if (is_ictcp) { - ictcp_to_rgb(temp_dwt_y[i], temp_dwt_co[i], temp_dwt_cg[i], &r, &g, &b); - } else { - ycocg_r_to_rgb(temp_dwt_y[i], temp_dwt_co[i], temp_dwt_cg[i], &r, &g, &b); - } - - // RGB byte order for FFmpeg rgb24 - cropped_rgb[i * 3 + 0] = r; - cropped_rgb[i * 3 + 1] = g; - cropped_rgb[i * 3 + 2] = b; - } - - // Composite cropped frame to full frame with black borders - uint8_t *full_frame_rgb = composite_to_full_frame(cropped_rgb, - decoder->decoding_width, decoder->decoding_height, - decoder->header.width, decoder->header.height, - decoder->screen_mask_top, decoder->screen_mask_right, - decoder->screen_mask_bottom, decoder->screen_mask_left); - free(cropped_rgb); - free(temp_dwt_y); - free(temp_dwt_co); - free(temp_dwt_cg); - - if (!full_frame_rgb) { - fprintf(stderr, "Error: Failed to composite frame to full size\n"); - decode_success = 0; - goto write_frame; - } - - // Copy composited frame to decoder buffer - memcpy(decoder->current_frame_rgb, full_frame_rgb, decoder->frame_size * 3); - free(full_frame_rgb); - -// if (decoder->frame_count == 1000) { -// fprintf(stderr, "\n=== Frame 1000 Value Analysis ===\n"); -// fprintf(stderr, "Max YCoCg values: Y=%.1f, Co=%.1f, Cg=%.1f\n", max_y, max_co, max_cg); -// fprintf(stderr, "Max RGB 
values: R=%d, G=%d, B=%d\n", max_r, max_g, max_b); -// } - - // Debug: Check RGB output -// if (decoder->frame_count < 1) { -// fprintf(stderr, " First 5 pixels RGB: "); -// for (int i = 0; i < 5 && i < decoder->frame_size; i++) { -// fprintf(stderr, "(%d,%d,%d) ", -// decoder->current_frame_rgb[i*3], -// decoder->current_frame_rgb[i*3+1], -// decoder->current_frame_rgb[i*3+2]); -// } -// fprintf(stderr, "\n"); -// } - - // TODO Phase 2: Reference YCoCg frame update needs rework for crop encoding - // Currently not updated because we use temporary buffers that are already freed - // P-frame support will need to store reference at appropriate dimensions - // memcpy(decoder->reference_ycocg_y, temp_dwt_y, decoding_pixels * sizeof(float)); - // memcpy(decoder->reference_ycocg_co, temp_dwt_co, decoding_pixels * sizeof(float)); - // memcpy(decoder->reference_ycocg_cg, temp_dwt_cg, decoding_pixels * sizeof(float)); - } - - // Update reference frame - memcpy(decoder->reference_frame_rgb, decoder->current_frame_rgb, decoder->frame_size * 3); - -write_frame: - // Clean up temporary allocations - if (compressed_data) free(compressed_data); - if (decompressed_data) free(decompressed_data); - if (quantised_y) free(quantised_y); - if (quantised_co) free(quantised_co); - if (quantised_cg) free(quantised_cg); - - // If decoding failed, fill frame with black to maintain stream alignment - if (!decode_success) { - memset(decoder->current_frame_rgb, 0, decoder->frame_size * 3); - fprintf(stderr, "Warning: Writing black frame %d due to decode error\n", decoder->frame_count); - } - - // Write frame to video pipe with retry on partial writes (ALWAYS write to maintain alignment) - size_t bytes_to_write = decoder->frame_size * 3; - size_t total_written = 0; - const uint8_t *write_ptr = decoder->current_frame_rgb; - - while (total_written < bytes_to_write) { - size_t bytes_written = fwrite(write_ptr + total_written, 1, - bytes_to_write - total_written, - decoder->video_pipe); - if 
(bytes_written == 0) { - if (ferror(decoder->video_pipe)) { - fprintf(stderr, "Error: Pipe write error at frame %d (wrote %zu/%zu bytes) - aborting\n", - decoder->frame_count, total_written, bytes_to_write); - // Cannot maintain stream alignment if pipe is broken - this is fatal - return -1; - } - // Pipe might be full, flush and retry - fflush(decoder->video_pipe); - usleep(1000); // 1ms delay - } else { - total_written += bytes_written; - } - } - - // Ensure data is flushed to FFmpeg - if (fflush(decoder->video_pipe) != 0) { - fprintf(stderr, "Error: Failed to flush video pipe at frame %d - aborting\n", decoder->frame_count); - // Cannot maintain stream alignment if pipe is broken - this is fatal + ctx->video_pipe = fdopen(video_pipe_fd[1], "wb"); + if (!ctx->video_pipe) { + fprintf(stderr, "Error: Failed to open video pipe for writing\n"); + kill(ctx->ffmpeg_pid, SIGTERM); return -1; } - decoder->frame_count++; - // Return success only if decoding succeeded; still return 1 to continue processing - // (we wrote a frame either way to maintain stream alignment) - return decode_success ? 
1 : 1; // Always return 1 to continue, errors are non-fatal now + return 0; } -//============================================================================= -// Main Decoding Loop -//============================================================================= +// ============================================================================= +// Frame Buffer Management +// ============================================================================= -static void print_usage(const char *prog) { - printf("TAV Decoder - Converts TAV video to FFV1+PCMu8 in MKV container\n"); - printf("Version: %s\n\n", DECODER_VENDOR_STRING); - printf("Usage: %s -i input.tav -o output.mkv\n\n", prog); - printf("Options:\n"); - printf(" -i Input TAV file\n"); - printf(" -o Output MKV file (optional, auto-generated from input)\n"); - printf(" -v Verbose output\n"); - printf(" --no-grain-synthesis Disable grain synthesis (override encoder preset)\n"); - printf(" -h, --help Show this help\n\n"); - printf("Supported features (matches TSVM decoder):\n"); - printf(" - I-frames and P-frames (delta mode)\n"); - printf(" - GOP unified 3D DWT (temporal compression)\n"); - printf(" - TAD audio (decoded to PCMu8)\n"); - printf(" - MP2 audio (passed through)\n"); - printf(" - All wavelet types (5/3, 9/7, CDF 13/7, DD-4, Haar)\n"); - printf(" - Perceptual quantisation (versions 5-8)\n"); - printf(" - YCoCg-R and ICtCp color spaces\n\n"); - printf("Unsupported features (not in TSVM decoder):\n"); - printf(" - MC-EZBC motion compensation\n"); - printf(" - MPEG-style residual coding (P/B-frames)\n"); - printf(" - Adaptive block partitioning\n\n"); -} +static int allocate_gop_frames(decoder_context_t *ctx, int gop_size) { + if (ctx->gop_frames_allocated >= gop_size) { + return 0; // Already have enough + } -int main(int argc, char *argv[]) { - // Ignore SIGPIPE to prevent process termination if FFmpeg exits early - signal(SIGPIPE, SIG_IGN); + // Free existing if any + if (ctx->gop_frames) { + for (int i 
= 0; i < ctx->gop_frames_allocated; i++) { + free(ctx->gop_frames[i]); + } + free(ctx->gop_frames); + } - // Initialize AVX-512 runtime detection - tav_simd_init(); + // Allocate new + ctx->gop_frames = malloc(gop_size * sizeof(uint8_t*)); + if (!ctx->gop_frames) { + return -1; + } - char *input_file = NULL; - char *output_file = NULL; - int verbose = 0; - int no_grain_synthesis = 0; - - static struct option long_options[] = { - {"help", no_argument, 0, 'h'}, - {"no-grain-synthesis", no_argument, 0, 1000}, - {0, 0, 0, 0} - }; - - int opt; - while ((opt = getopt_long(argc, argv, "i:o:vh", long_options, NULL)) != -1) { - switch (opt) { - case 'i': - input_file = optarg; - break; - case 'o': - output_file = optarg; - break; - case 'v': - verbose = 1; - break; - case 'h': - print_usage(argv[0]); - return 0; - case 1000: // --no-grain-synthesis - no_grain_synthesis = 1; - if (verbose) { - printf("Grain synthesis disabled\n"); - } - break; - default: - print_usage(argv[0]); - return 1; + size_t frame_size = ctx->header.width * ctx->header.height * 3; + for (int i = 0; i < gop_size; i++) { + ctx->gop_frames[i] = malloc(frame_size); + if (!ctx->gop_frames[i]) { + // Cleanup on failure + for (int j = 0; j < i; j++) { + free(ctx->gop_frames[j]); + } + free(ctx->gop_frames); + ctx->gop_frames = NULL; + return -1; } } - if (!input_file) { + ctx->gop_frames_allocated = gop_size; + return 0; +} + +// ============================================================================= +// Packet Processing +// ============================================================================= + +static int process_gop_packet(decoder_context_t *ctx) { + // Read GOP size (1 byte) + uint8_t gop_size; + if (fread(&gop_size, 1, 1, ctx->input_fp) != 1) { + fprintf(stderr, "Error: Failed to read GOP size\n"); + return -1; + } + ctx->bytes_read++; + + // Read compressed size (4 bytes) + uint32_t compressed_size; + if (fread(&compressed_size, 4, 1, ctx->input_fp) != 1) { + fprintf(stderr, "Error: 
Failed to read GOP compressed size\n"); + return -1; + } + ctx->bytes_read += 4; + + if (ctx->dump_packets) { + printf(" GOP: %d frames, %u bytes compressed\n", gop_size, compressed_size); + } + + // Allocate frame buffers + if (allocate_gop_frames(ctx, gop_size) < 0) { + fprintf(stderr, "Error: Failed to allocate GOP frame buffers\n"); + return -1; + } + + // Read compressed data + uint8_t *compressed_data = malloc(compressed_size); + if (!compressed_data) { + fprintf(stderr, "Error: Failed to allocate compressed data buffer\n"); + return -1; + } + + if (fread(compressed_data, 1, compressed_size, ctx->input_fp) != compressed_size) { + fprintf(stderr, "Error: Failed to read GOP compressed data\n"); + free(compressed_data); + return -1; + } + ctx->bytes_read += compressed_size; + + // Decode GOP using library + int result = tav_video_decode_gop(ctx->video_ctx, compressed_data, compressed_size, + gop_size, ctx->gop_frames); + free(compressed_data); + + if (result < 0) { + fprintf(stderr, "Error: GOP decode failed: %s\n", tav_video_get_error(ctx->video_ctx)); + return -1; + } + + // Write frames to FFmpeg + size_t frame_size = ctx->header.width * ctx->header.height * 3; + for (int i = 0; i < gop_size; i++) { + if (ctx->video_pipe) { + fwrite(ctx->gop_frames[i], 1, frame_size, ctx->video_pipe); + } + ctx->frames_decoded++; + + // Check decode limit + if (ctx->decode_limit > 0 && ctx->frames_decoded >= (uint64_t)ctx->decode_limit) { + break; + } + } + + ctx->gops_decoded++; + return 0; +} + +static int process_iframe_packet(decoder_context_t *ctx) { + // Read compressed size (4 bytes) + uint32_t compressed_size; + if (fread(&compressed_size, 4, 1, ctx->input_fp) != 1) { + fprintf(stderr, "Error: Failed to read I-frame compressed size\n"); + return -1; + } + ctx->bytes_read += 4; + + if (ctx->dump_packets) { + printf(" I-frame: %u bytes compressed\n", compressed_size); + } + + // Allocate frame buffer + if (allocate_gop_frames(ctx, 1) < 0) { + fprintf(stderr, "Error: 
Failed to allocate I-frame buffer\n"); + return -1; + } + + // Read compressed data + uint8_t *compressed_data = malloc(compressed_size); + if (!compressed_data) { + fprintf(stderr, "Error: Failed to allocate compressed data buffer\n"); + return -1; + } + + if (fread(compressed_data, 1, compressed_size, ctx->input_fp) != compressed_size) { + fprintf(stderr, "Error: Failed to read I-frame compressed data\n"); + free(compressed_data); + return -1; + } + ctx->bytes_read += compressed_size; + + // Decode I-frame using library + if (ctx->dump_packets) { + printf(" Calling tav_video_decode_iframe(%p, %p, %u, %p)\n", + (void*)ctx->video_ctx, (void*)compressed_data, compressed_size, (void*)ctx->gop_frames[0]); + } + + int result = tav_video_decode_iframe(ctx->video_ctx, compressed_data, compressed_size, + ctx->gop_frames[0]); + free(compressed_data); + + if (result < 0) { + fprintf(stderr, "Error: I-frame decode failed: %s\n", tav_video_get_error(ctx->video_ctx)); + return -1; + } + + if (ctx->dump_packets) { + printf(" I-frame decoded successfully\n"); + } + + // Write frame to FFmpeg + if (ctx->video_pipe) { + size_t frame_size = ctx->header.width * ctx->header.height * 3; + fwrite(ctx->gop_frames[0], 1, frame_size, ctx->video_pipe); + } + + ctx->frames_decoded++; + return 0; +} + +static int process_audio_tad_packet(decoder_context_t *ctx) { + // TAD packet format: + // [sample_count(2)][payload_size+7(4)][sample_count(2)][quant_index(1)][compressed_size(4)][compressed_data] + + // Read outer header + uint16_t sample_count; + uint32_t payload_size_plus_7; + + if (fread(&sample_count, 2, 1, ctx->input_fp) != 1) return -1; + if (fread(&payload_size_plus_7, 4, 1, ctx->input_fp) != 1) return -1; + ctx->bytes_read += 6; + + if (ctx->dump_packets) { + printf(" TAD audio: %u samples, %u bytes payload\n", sample_count, payload_size_plus_7); + } + + if (ctx->no_audio) { + // Skip audio data + fseek(ctx->input_fp, payload_size_plus_7, SEEK_CUR); + ctx->bytes_read += 
payload_size_plus_7; + return 0; + } + + // Read TAD chunk data (includes inner header) + uint8_t *tad_data = malloc(payload_size_plus_7); + if (!tad_data) return -1; + + if (fread(tad_data, 1, payload_size_plus_7, ctx->input_fp) != payload_size_plus_7) { + free(tad_data); + return -1; + } + ctx->bytes_read += payload_size_plus_7; + + // Allocate output buffer (stereo PCMu8) + uint8_t *pcm_output = malloc(sample_count * 2); + if (!pcm_output) { + free(tad_data); + return -1; + } + + // Decode TAD using library + size_t bytes_consumed = 0; + size_t samples_decoded = 0; + + int result = tad32_decode_chunk(tad_data, payload_size_plus_7, + pcm_output, &bytes_consumed, &samples_decoded); + free(tad_data); + + if (result == 0 && samples_decoded > 0) { + // Write PCMu8 to audio temp file + if (ctx->audio_temp_fp) { + fwrite(pcm_output, 1, samples_decoded * 2, ctx->audio_temp_fp); + } + ctx->audio_samples_decoded += samples_decoded; + } + + free(pcm_output); + return 0; +} + +static int process_audio_pcm8_packet(decoder_context_t *ctx) { + // PCM8 packet format: [size(4)][pcm_data] + uint32_t pcm_size; + if (fread(&pcm_size, 4, 1, ctx->input_fp) != 1) return -1; + ctx->bytes_read += 4; + + if (ctx->dump_packets) { + printf(" PCM8 audio: %u bytes\n", pcm_size); + } + + if (ctx->no_audio) { + fseek(ctx->input_fp, pcm_size, SEEK_CUR); + ctx->bytes_read += pcm_size; + return 0; + } + + // Read and write PCM data directly + uint8_t *pcm_data = malloc(pcm_size); + if (!pcm_data) return -1; + + if (fread(pcm_data, 1, pcm_size, ctx->input_fp) != pcm_size) { + free(pcm_data); + return -1; + } + ctx->bytes_read += pcm_size; + + if (ctx->audio_temp_fp) { + fwrite(pcm_data, 1, pcm_size, ctx->audio_temp_fp); + } + + ctx->audio_samples_decoded += pcm_size / 2; // Stereo + free(pcm_data); + return 0; +} + +static int skip_packet_with_size(decoder_context_t *ctx, const char *name) { + uint32_t size; + if (fread(&size, 4, 1, ctx->input_fp) != 1) return -1; + ctx->bytes_read += 4; + + if 
(ctx->dump_packets) { + printf(" %s: %u bytes (skipped)\n", name, size); + } + + fseek(ctx->input_fp, size, SEEK_CUR); + ctx->bytes_read += size; + return 0; +} + +static int process_packet(decoder_context_t *ctx) { + uint8_t packet_type; + + if (fread(&packet_type, 1, 1, ctx->input_fp) != 1) { + return -1; // EOF + } + ctx->bytes_read++; + + if (ctx->dump_packets) { + printf("Packet 0x%02X at offset %lu\n", packet_type, ctx->bytes_read - 1); + } + + switch (packet_type) { + case TAV_PACKET_GOP_UNIFIED: + return process_gop_packet(ctx); + + case TAV_PACKET_IFRAME: + return process_iframe_packet(ctx); + + case TAV_PACKET_PFRAME: + // P-frame not commonly used in TAV, skip for now + return skip_packet_with_size(ctx, "P-frame"); + + case TAV_PACKET_AUDIO_TAD: + return process_audio_tad_packet(ctx); + + case TAV_PACKET_AUDIO_PCM8: + return process_audio_pcm8_packet(ctx); + + case TAV_PACKET_AUDIO_MP2: + case TAV_PACKET_AUDIO_TRACK: + return skip_packet_with_size(ctx, "Audio track"); + + case TAV_PACKET_SUBTITLE: + case TAV_PACKET_SUBTITLE_TC: + return skip_packet_with_size(ctx, "Subtitle"); + + case TAV_PACKET_EXTENDED_HDR: { + // Extended header format: [num_pairs(2)][key-value pairs...] + // Each KV pair: [key(4)][type(1)][value...] 
+ uint16_t num_pairs; + if (fread(&num_pairs, 2, 1, ctx->input_fp) != 1) return -1; + ctx->bytes_read += 2; + + if (ctx->dump_packets) { + printf(" Extended header: %u key-value pairs\n", num_pairs); + } + + // Skip key-value pairs + for (int i = 0; i < num_pairs; i++) { + uint8_t kv_header[5]; // key(4) + type(1) + if (fread(kv_header, 1, 5, ctx->input_fp) != 5) return 0; + ctx->bytes_read += 5; + + uint8_t value_type = kv_header[4]; + if (value_type == 0x04) { // Int64 + uint64_t value; + if (fread(&value, 8, 1, ctx->input_fp) != 1) return 0; + ctx->bytes_read += 8; + } else if (value_type == 0x10) { // Bytes + uint16_t length; + if (fread(&length, 2, 1, ctx->input_fp) != 1) return 0; + ctx->bytes_read += 2; + fseek(ctx->input_fp, length, SEEK_CUR); + ctx->bytes_read += length; + } else if (value_type <= 0x04) { // Int types + int sizes[] = {2, 3, 4, 6, 8}; // Int16, Int24, Int32, Int48, Int64 + fseek(ctx->input_fp, sizes[value_type], SEEK_CUR); + ctx->bytes_read += sizes[value_type]; + } + } + return 0; + } + + case TAV_PACKET_SCREEN_MASK: { + // Screen mask: 4 bytes (top, bottom, left, right) + uint8_t mask[4]; + if (fread(mask, 1, 4, ctx->input_fp) != 4) return -1; + ctx->bytes_read += 4; + if (ctx->dump_packets) { + printf(" Screen mask: T=%d B=%d L=%d R=%d\n", mask[0], mask[1], mask[2], mask[3]); + } + return 0; + } + + case TAV_PACKET_GOP_SYNC: { + // GOP sync: 1 byte (frame count) + uint8_t frame_count; + if (fread(&frame_count, 1, 1, ctx->input_fp) != 1) return -1; + ctx->bytes_read++; + if (ctx->dump_packets) { + printf(" GOP sync: %d frames\n", frame_count); + } + return 0; + } + + case TAV_PACKET_TIMECODE: { + // Timecode: 8 bytes (nanoseconds) + uint64_t timecode_ns; + if (fread(&timecode_ns, 8, 1, ctx->input_fp) != 1) return -1; + ctx->bytes_read += 8; + if (ctx->dump_packets) { + printf(" Timecode: %.3f sec\n", timecode_ns / 1000000000.0); + } + return 0; + } + + case TAV_PACKET_SYNC_NTSC: + case TAV_PACKET_SYNC: + // Sync packets: no payload + if 
(ctx->dump_packets) { + printf(" Sync packet\n"); + } + return 0; + + default: + if (ctx->verbose) { + fprintf(stderr, "Warning: Unknown packet type 0x%02X, attempting to skip\n", packet_type); + } + // Try to skip by reading size + uint32_t size; + if (fread(&size, 4, 1, ctx->input_fp) != 1) return 0; // May be EOF + ctx->bytes_read += 4; + if (size < 1000000) { // Sanity check + fseek(ctx->input_fp, size, SEEK_CUR); + ctx->bytes_read += size; + } + return 0; + } +} + +// ============================================================================= +// Main Decoding Loop +// ============================================================================= + +static int decode_video(decoder_context_t *ctx) { + printf("Decoding...\n"); + ctx->start_time = time(NULL); + + // Two-pass approach for proper audio/video muxing: + // Pass 1: Extract all audio to temp file + // Pass 2: Spawn FFmpeg with complete audio, decode video + + long data_start = ftell(ctx->input_fp); + + // Pass 1: Audio extraction + if (!ctx->no_audio) { + printf("Pass 1: Extracting audio...\n"); + while (process_packet(ctx) == 0) { + // Check decode limit + if (ctx->decode_limit > 0 && ctx->frames_decoded >= (uint64_t)ctx->decode_limit) { + break; + } + } + + // Close and flush audio file + if (ctx->audio_temp_fp) { + fclose(ctx->audio_temp_fp); + ctx->audio_temp_fp = NULL; + } + + printf(" Audio samples: %lu\n", ctx->audio_samples_decoded); + } + + // Reset for pass 2 + fseek(ctx->input_fp, data_start, SEEK_SET); + ctx->frames_decoded = 0; + ctx->gops_decoded = 0; + ctx->bytes_read = 32; // Header already read + + // Spawn FFmpeg with complete audio + printf("Pass 2: Decoding video and muxing...\n"); + if (spawn_ffmpeg(ctx) < 0) { + return -1; + } + + // Pass 2: Video decoding + uint64_t last_reported = 0; + while (process_packet(ctx) == 0) { + // Progress reporting - show when frames were decoded + if (ctx->frames_decoded != last_reported) { + time_t elapsed = time(NULL) - ctx->start_time; + double 
fps = elapsed > 0 ? (double)ctx->frames_decoded / elapsed : 0.0; + printf("\rFrames: %lu | GOPs: %lu | %.1f fps", + ctx->frames_decoded, ctx->gops_decoded, fps); + fflush(stdout); + last_reported = ctx->frames_decoded; + } + + // Check decode limit + if (ctx->decode_limit > 0 && ctx->frames_decoded >= (uint64_t)ctx->decode_limit) { + break; + } + } + + printf("\n"); + return 0; +} + +// ============================================================================= +// Usage and Main +// ============================================================================= + +// Generate output filename by replacing extension with .mkv +static char *generate_output_filename(const char *input_file) { + size_t len = strlen(input_file); + char *output = malloc(len + 5); // Worst case: add ".mkv" + null + if (!output) return NULL; + + strcpy(output, input_file); + + // Find last dot in filename (not in path) + char *last_dot = strrchr(output, '.'); + char *last_slash = strrchr(output, '/'); + + // Only replace if dot is after last slash (i.e., in filename, not path) + if (last_dot && (!last_slash || last_dot > last_slash)) { + strcpy(last_dot, ".mkv"); + } else { + // No extension found, append .mkv + strcat(output, ".mkv"); + } + + return output; +} + +static void print_usage(const char *program) { + printf("TAV Decoder - TSVM Advanced Video Codec (Reference Implementation)\n"); + printf("\nUsage: %s -i input.tav [-o output.mkv] [options]\n\n", program); + printf("Required:\n"); + printf(" -i, --input FILE Input TAV file\n"); + printf("\nOptional:\n"); + printf(" -o, --output FILE Output video file (default: input with .mkv extension)\n"); + printf(" --raw Output raw video (no FFV1 compression)\n"); + printf(" --no-audio Skip audio decoding\n"); + printf(" --decode-limit N Decode only first N frames\n"); + printf(" --dump-packets Debug: print packet info\n"); + printf(" -v, --verbose Verbose output\n"); + printf(" --help Show this help\n"); + printf("\nExamples:\n"); + printf(" 
%s -i video.tav # Output: video.mkv\n", program); + printf(" %s -i video.tav -o custom.mkv\n", program); + printf(" %s -i video.tav --verbose --decode-limit 100\n", program); +} + +int main(int argc, char *argv[]) { + printf("TAV Decoder - %s\n", DECODER_VENDOR_STRING); + printf("Using libtavdec + libtaddec\n\n"); + + decoder_context_t ctx = {0}; + + // Command-line options + static struct option long_options[] = { + {"input", required_argument, 0, 'i'}, + {"output", required_argument, 0, 'o'}, + {"verbose", no_argument, 0, 'v'}, + {"raw", no_argument, 0, 1001}, + {"no-audio", no_argument, 0, 1002}, + {"decode-limit", required_argument, 0, 1003}, + {"dump-packets", no_argument, 0, 1004}, + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0} + }; + + int c, option_index = 0; + while ((c = getopt_long(argc, argv, "i:o:vh", long_options, &option_index)) != -1) { + switch (c) { + case 'i': + ctx.input_file = strdup(optarg); + break; + case 'o': + ctx.output_file = strdup(optarg); + break; + case 'v': + ctx.verbose = 1; + break; + case 1001: + ctx.output_raw = 1; + break; + case 1002: + ctx.no_audio = 1; + break; + case 1003: + ctx.decode_limit = atoi(optarg); + break; + case 1004: + ctx.dump_packets = 1; + break; + case 'h': + case '?': + default: + print_usage(argv[0]); + return (c == 'h') ? 0 : 1; + } + } + + // Validate arguments + if (!ctx.input_file) { fprintf(stderr, "Error: Input file is required\n\n"); print_usage(argv[0]); return 1; } // Generate output filename if not provided - if (!output_file) { - size_t input_len = strlen(input_file); - output_file = malloc(input_len + 32); // Extra space for extension - - // Find the last directory separator - const char *basename_start = strrchr(input_file, '/'); - if (!basename_start) basename_start = strrchr(input_file, '\\'); - basename_start = basename_start ? 
basename_start + 1 : input_file; - - // Copy directory part - size_t dir_len = basename_start - input_file; - strncpy(output_file, input_file, dir_len); - - // Find the .tad extension - const char *ext = strrchr(basename_start, '.'); - if (ext && (strcmp(ext, ".tav") == 0 || strcmp(ext, ".mv3") == 0)) { - // Copy basename without .tav or .mv3 - size_t name_len = ext - basename_start; - strncpy(output_file + dir_len, basename_start, name_len); - output_file[dir_len + name_len] = '\0'; - } else { - // No .tad extension, copy entire basename - strcpy(output_file + dir_len, basename_start); - } - - // Append appropriate extension - strcat(output_file, ".mkv"); - - if (verbose) { - printf("Auto-generated output path: %s\n", output_file); + if (!ctx.output_file) { + ctx.output_file = generate_output_filename(ctx.input_file); + if (!ctx.output_file) { + fprintf(stderr, "Error: Failed to generate output filename\n"); + return 1; } } - // Create temporary audio file path + // Open input file + ctx.input_fp = fopen(ctx.input_file, "rb"); + if (!ctx.input_fp) { + fprintf(stderr, "Error: Cannot open input file: %s\n", ctx.input_file); + return 1; + } + + // Read and parse header + if (read_tav_header(&ctx) < 0) { + fclose(ctx.input_fp); + return 1; + } + + // Create audio temp file char temp_audio_file[256]; - snprintf(temp_audio_file, sizeof(temp_audio_file), "/tmp/tav_audio_%d.wav", getpid()); + snprintf(temp_audio_file, sizeof(temp_audio_file), "/tmp/tav_dec_audio_%d.pcm", getpid()); + ctx.audio_temp_file = strdup(temp_audio_file); - // Pass 1: Extract audio to WAV file - if (extract_audio_to_wav(input_file, temp_audio_file, verbose) < 0) { - fprintf(stderr, "Failed to extract audio\n"); - unlink(temp_audio_file); // Clean up temp file if it exists + if (!ctx.no_audio) { + ctx.audio_temp_fp = fopen(ctx.audio_temp_file, "wb"); + if (!ctx.audio_temp_fp) { + fprintf(stderr, "Error: Cannot create audio temp file: %s\n", ctx.audio_temp_file); + fclose(ctx.input_fp); + return 1; 
+ } + } + + // Initialize video decoder + tav_video_params_t video_params = { + .width = ctx.header.width, + .height = ctx.header.height, + .decomp_levels = ctx.header.decomp_levels, + .temporal_levels = 2, // Default + .wavelet_filter = ctx.header.wavelet_filter, + .temporal_wavelet = 0, // Haar + .entropy_coder = ctx.header.entropy_coder, + .channel_layout = ctx.header.channel_layout, + .perceptual_tuning = ctx.perceptual_mode, + .quantiser_y = ctx.header.quantiser_y, + .quantiser_co = ctx.header.quantiser_co, + .quantiser_cg = ctx.header.quantiser_cg, + .encoder_preset = ctx.header.encoder_preset, + .monoblock = 1 + }; + + ctx.video_ctx = tav_video_create(&video_params); + if (!ctx.video_ctx) { + fprintf(stderr, "Error: Failed to create video decoder context\n"); + fclose(ctx.input_fp); + if (ctx.audio_temp_fp) fclose(ctx.audio_temp_fp); return 1; } - // Pass 2: Decode video with audio file - tav_decoder_t *decoder = tav_decoder_init(input_file, output_file, temp_audio_file, no_grain_synthesis); - if (!decoder) { - fprintf(stderr, "Failed to initialise decoder\n"); - unlink(temp_audio_file); // Clean up temp file - return 1; + tav_video_set_verbose(ctx.video_ctx, ctx.verbose); + + printf("Input: %s\n", ctx.input_file); + printf("Output: %s\n", ctx.output_file); + printf("Resolution: %dx%d @ %d fps\n", ctx.header.width, ctx.header.height, ctx.header.fps); + printf("\n"); + + // Decode + int result = decode_video(&ctx); + + // Cleanup FFmpeg + if (ctx.video_pipe) { + fclose(ctx.video_pipe); + waitpid(ctx.ffmpeg_pid, NULL, 0); } - if (verbose) { - printf("TAV Decoder - %dx%d @ %dfps\n", decoder->header.width, decoder->header.height, decoder->header.fps); - printf("Wavelet: %s, Levels: %d\n", - decoder->header.wavelet_filter == 0 ? "5/3" : - decoder->header.wavelet_filter == 1 ? "9/7" : - decoder->header.wavelet_filter == 2 ? "CDF 13/7" : - decoder->header.wavelet_filter == 16 ? "DD-4" : - decoder->header.wavelet_filter == 255 ? 
"Haar" : "Unknown", - decoder->header.decomp_levels); - printf("Version: %d (%s, %s)\n", decoder->header.version, - decoder->header.version % 2 == 0 ? "ICtCp" : "YCoCg-R", - decoder->is_monoblock ? "monoblock" : "tiled"); - printf("Output: %s (FFV1 level 3 + PCMu8 @ 32 KHz)\n", output_file); + // Cleanup + if (ctx.video_ctx) { + tav_video_free(ctx.video_ctx); } - // Start timing for FPS calculation - struct timeval start_time, last_update_time; - gettimeofday(&start_time, NULL); - last_update_time = start_time; - int frames_since_last_update = 0; - - // Main decoding loop - int result = 1; - int total_packets = 0; - int iframe_count = 0; - while (result > 0) { - // Check file position before reading packet - long file_pos = ftell(decoder->input_fp); - - uint8_t packet_type; - if (fread(&packet_type, 1, 1, decoder->input_fp) != 1) { - if (verbose) { - fprintf(stderr, "Reached EOF at file position %ld after %d packets\n", file_pos, total_packets); - } - result = 0; // EOF - break; - } - - total_packets++; - - if (verbose && total_packets <= 30) { - fprintf(stderr, "Packet %d at file pos %ld: Type 0x%02X\n", total_packets, file_pos, packet_type); - } - - // Handle sync packets (no size field) - if (packet_type == TAV_PACKET_SYNC || packet_type == TAV_PACKET_SYNC_NTSC) { - if (verbose && total_packets < 20) { - fprintf(stderr, "Packet %d: SYNC (0x%02X)\n", total_packets, packet_type); - } - continue; - } - - // Handle timecode packets (no size field, just 8 bytes of uint64 timecode) - if (packet_type == TAV_PACKET_TIMECODE) { - uint64_t timecode_ns; - if (fread(&timecode_ns, 8, 1, decoder->input_fp) != 1) { - fprintf(stderr, "Error: Failed to read timecode\n"); - result = -1; - break; - } - if (verbose && total_packets < 20) { - double timecode_sec = timecode_ns / 1000000000.0; - fprintf(stderr, "Packet %d: TIMECODE (0x%02X) - %.6f seconds\n", - total_packets, packet_type, timecode_sec); - } - continue; - } - - // Handle GOP sync packets (no size field, just 1 byte 
frame count) - if (packet_type == TAV_PACKET_GOP_SYNC) { - uint8_t gop_frame_count; - if (fread(&gop_frame_count, 1, 1, decoder->input_fp) != 1) { - fprintf(stderr, "Error: Failed to read GOP sync frame count\n"); - result = -1; - break; - } - if (verbose) { - fprintf(stderr, "Packet %d: GOP_SYNC (0x%02X) - %u frames from GOP\n", - total_packets, packet_type, gop_frame_count); - } - // Update decoder frame count (GOP already wrote frames) - decoder->frame_count += gop_frame_count; - frames_since_last_update += gop_frame_count; - - // Print progress every second or so - struct timeval current_time; - gettimeofday(&current_time, NULL); - double time_since_update = (current_time.tv_sec - last_update_time.tv_sec) + - (current_time.tv_usec - last_update_time.tv_usec) / 1000000.0; - - if (time_since_update >= 1.0 || decoder->frame_count == gop_frame_count) { // Update every second - double total_time = (current_time.tv_sec - start_time.tv_sec) + - (current_time.tv_usec - start_time.tv_usec) / 1000000.0; - double current_fps = frames_since_last_update / time_since_update; - double avg_fps = decoder->frame_count / total_time; - - fprintf(stderr, "\rDecoding: Frame %d (%.1f fps, avg %.1f fps) ", - decoder->frame_count, current_fps, avg_fps); - fflush(stderr); - - last_update_time = current_time; - frames_since_last_update = 0; - } - - continue; - } - - // Handle screen masking packets (letterbox/pillarbox detection) - // Format: frame_num(4) + top(2) + right(2) + bottom(2) + left(2) = 12 bytes - if (packet_type == TAV_PACKET_SCREEN_MASK) { - uint32_t frame_num; - uint16_t top, right, bottom, left; - if (fread(&frame_num, 4, 1, decoder->input_fp) != 1 || - fread(&top, 2, 1, decoder->input_fp) != 1 || - fread(&right, 2, 1, decoder->input_fp) != 1 || - fread(&bottom, 2, 1, decoder->input_fp) != 1 || - fread(&left, 2, 1, decoder->input_fp) != 1) { - fprintf(stderr, "Error: Failed to read screen mask packet\n"); - result = -1; - break; - } - - // Allocate array if needed - if 
(decoder->screen_masks == NULL) { - decoder->screen_mask_capacity = 16; - decoder->screen_masks = malloc(decoder->screen_mask_capacity * sizeof(screen_mask_entry_t)); - decoder->screen_mask_count = 0; - } - - // Expand array if needed - if (decoder->screen_mask_count >= decoder->screen_mask_capacity) { - decoder->screen_mask_capacity *= 2; - decoder->screen_masks = realloc(decoder->screen_masks, - decoder->screen_mask_capacity * sizeof(screen_mask_entry_t)); - } - - // Store entry - screen_mask_entry_t *entry = &decoder->screen_masks[decoder->screen_mask_count++]; - entry->frame_num = frame_num; - entry->top = top; - entry->right = right; - entry->bottom = bottom; - entry->left = left; - - // Phase 2: Update current active mask and decoding dimensions - decoder->screen_mask_top = top; - decoder->screen_mask_right = right; - decoder->screen_mask_bottom = bottom; - decoder->screen_mask_left = left; - - // Calculate new decoding dimensions (active region size) - decoder->decoding_width = decoder->header.width - left - right; - decoder->decoding_height = decoder->header.height - top - bottom; - - if (verbose) { - fprintf(stderr, "Packet %d: SCREEN_MASK (0x%02X) - frame=%u top=%u right=%u bottom=%u left=%u (decoding: %dx%d)\n", - total_packets, packet_type, frame_num, top, right, bottom, left, - decoder->decoding_width, decoder->decoding_height); - } - continue; - } - - // Handle GOP unified packets (custom format: 1-byte gop_size + 4-byte compressed_size) - if (packet_type == TAV_PACKET_GOP_UNIFIED) { - uint8_t gop_size; - uint32_t compressed_size; - if (fread(&gop_size, 1, 1, decoder->input_fp) != 1 || - fread(&compressed_size, 4, 1, decoder->input_fp) != 1) { - fprintf(stderr, "Error: Failed to read GOP unified packet header\n"); - result = -1; - break; - } - - if (verbose) { - fprintf(stderr, "Packet %d: GOP_UNIFIED (0x%02X), %u frames, %u bytes\n", - total_packets, packet_type, gop_size, compressed_size); - } - - // Read compressed GOP data - uint8_t 
*compressed_data = malloc(compressed_size); - if (!compressed_data) { - fprintf(stderr, "Error: Failed to allocate GOP compressed buffer (%u bytes)\n", compressed_size); - result = -1; - break; - } - - if (fread(compressed_data, 1, compressed_size, decoder->input_fp) != compressed_size) { - fprintf(stderr, "Error: Failed to read GOP compressed data\n"); - free(compressed_data); - result = -1; - break; - } - - // Decompress with Zstd - const size_t decompressed_bound = ZSTD_getFrameContentSize(compressed_data, compressed_size); - if (decompressed_bound == ZSTD_CONTENTSIZE_ERROR || decompressed_bound == ZSTD_CONTENTSIZE_UNKNOWN) { - fprintf(stderr, "Error: Invalid Zstd frame in GOP data\n"); - free(compressed_data); - result = -1; - break; - } - - uint8_t *decompressed_data = malloc(decompressed_bound); - if (!decompressed_data) { - fprintf(stderr, "Error: Failed to allocate GOP decompressed buffer (%zu bytes)\n", decompressed_bound); - free(compressed_data); - result = -1; - break; - } - - const size_t decompressed_size = ZSTD_decompress(decompressed_data, decompressed_bound, - compressed_data, compressed_size); - free(compressed_data); - - if (ZSTD_isError(decompressed_size)) { - fprintf(stderr, "Error: Zstd decompression failed: %s\n", ZSTD_getErrorName(decompressed_size)); - free(decompressed_data); - result = -1; - break; - } - - // Postprocess coefficients based on entropy_coder value - // Phase 2: Use decoding dimensions (actual encoded size) for postprocessing - int decoding_pixels = decoder->decoding_width * decoder->decoding_height; - // Keep full frame size for buffer allocation - const int num_pixels = decoder->header.width * decoder->header.height; - int16_t ***quantised_gop; - - // GOP dimensions (may differ from full frame with crop encoding) - int gop_width = decoder->decoding_width; - int gop_height = decoder->decoding_height; - - if (decoder->header.entropy_coder == 2) { - // RAW format: simple concatenated int16 arrays - if (verbose) { - 
fprintf(stderr, " Using RAW postprocessing (entropy_coder=2) for %dx%d (%d pixels)\n", - decoder->decoding_width, decoder->decoding_height, decoding_pixels); - } - quantised_gop = postprocess_gop_raw(decompressed_data, decompressed_size, - gop_size, num_pixels, decoder->header.channel_layout); - } else if (decoder->header.entropy_coder == 1) { - // EZBC format: embedded zero-block coding - if (verbose) { - fprintf(stderr, " Using EZBC postprocessing (entropy_coder=1) for %dx%d (%d pixels)\n", - decoder->decoding_width, decoder->decoding_height, decoding_pixels); - } - // EZBC will return actual GOP dimensions (may be cropped with crop encoding) - quantised_gop = postprocess_gop_ezbc(decompressed_data, decompressed_size, - gop_size, num_pixels, decoder->header.channel_layout, - &gop_width, &gop_height); - // Update decoding_pixels to match actual GOP dimensions - if (gop_width > 0 && gop_height > 0) { - decoding_pixels = gop_width * gop_height; - if (verbose) { - fprintf(stderr, " Actual GOP dimensions from EZBC: %dx%d (%d pixels)\n", - gop_width, gop_height, decoding_pixels); - } - } - } else { - // Default: Twobitmap format (entropy_coder=0) - if (verbose) { - fprintf(stderr, " Using Twobitmap postprocessing (entropy_coder=0) for %dx%d (%d pixels)\n", - decoder->decoding_width, decoder->decoding_height, decoding_pixels); - } - quantised_gop = postprocess_gop_unified(decompressed_data, decompressed_size, - gop_size, num_pixels, decoder->header.channel_layout); - } - - free(decompressed_data); - - if (!quantised_gop) { - fprintf(stderr, "Error: Failed to postprocess GOP data\n"); - result = -1; - break; - } - - // Allocate GOP float buffers - // Phase 2: Allocate at decoding size (cropped region), will composite to full frame later - float **gop_y = malloc(gop_size * sizeof(float *)); - float **gop_co = malloc(gop_size * sizeof(float *)); - float **gop_cg = malloc(gop_size * sizeof(float *)); - - for (int t = 0; t < gop_size; t++) { - gop_y[t] = 
calloc(decoding_pixels, sizeof(float)); - gop_co[t] = calloc(decoding_pixels, sizeof(float)); - gop_cg[t] = calloc(decoding_pixels, sizeof(float)); - } - - // Dequantise with temporal scaling (perceptual quantisation for versions 5-8) - // Extract base version for perceptual check - uint8_t base_version_gop = (decoder->header.version > 8) ? (decoder->header.version - 8) : decoder->header.version; - const int is_perceptual = (base_version_gop >= 5 && base_version_gop <= 8); - const int is_ezbc = (decoder->header.entropy_coder == 1); - const int temporal_levels = 2; // Fixed for TAV GOP encoding - - for (int t = 0; t < gop_size; t++) { - if (is_ezbc && is_perceptual) { - // EZBC mode with perceptual quantisation: coefficients are normalised - // Need to dequantise using perceptual weights (same as twobit-map mode) - const int temporal_level = get_temporal_subband_level(t, gop_size, temporal_levels); - const float temporal_scale = get_temporal_quantiser_scale(decoder->header.encoder_preset, temporal_level); - - // FIX: Use QLUT to convert header quantiser indices to actual values - const float base_q_y = roundf(QLUT[decoder->header.quantiser_y] * temporal_scale); - const float base_q_co = roundf(QLUT[decoder->header.quantiser_co] * temporal_scale); - const float base_q_cg = roundf(QLUT[decoder->header.quantiser_cg] * temporal_scale); - - // Phase 2: Use GOP dimensions (may be cropped) for dequantisation - dequantise_dwt_subbands_perceptual(0, QLUT[decoder->header.quantiser_y], - quantised_gop[t][0], gop_y[t], - gop_width, gop_height, - decoder->header.decomp_levels, base_q_y, 0, decoder->frame_count + t); - dequantise_dwt_subbands_perceptual(0, QLUT[decoder->header.quantiser_y], - quantised_gop[t][1], gop_co[t], - gop_width, gop_height, - decoder->header.decomp_levels, base_q_co, 1, decoder->frame_count + t); - dequantise_dwt_subbands_perceptual(0, QLUT[decoder->header.quantiser_y], - quantised_gop[t][2], gop_cg[t], - gop_width, gop_height, - 
decoder->header.decomp_levels, base_q_cg, 1, decoder->frame_count + t); - - if (t == 0 && verbose) { - // Debug: Check multiple LL values - fprintf(stderr, "[GOP-EZBC] Frame 0 after dequant:\n"); - fprintf(stderr, " Quantised: LL[0]=%d, LL[1]=%d, LL[2]=%d\n", - quantised_gop[t][0][0], quantised_gop[t][0][1], quantised_gop[t][0][2]); - fprintf(stderr, " Dequantised: LL[0]=%.1f, LL[1]=%.1f, LL[2]=%.1f\n", - gop_y[t][0], gop_y[t][1], gop_y[t][2]); - fprintf(stderr, " base_q_y=%.1f, temporal_level=%d, temporal_scale=%.3f\n", - base_q_y, temporal_level, temporal_scale); - } - } else if (!is_ezbc) { - // Normal mode: multiply by quantiser - const int temporal_level = get_temporal_subband_level(t, gop_size, temporal_levels); - const float temporal_scale = get_temporal_quantiser_scale(decoder->header.encoder_preset, temporal_level); - - // CRITICAL: Must ROUND temporal quantiser to match encoder's roundf() behavior - // FIX: Use QLUT to convert header quantiser indices to actual values - const float base_q_y = roundf(QLUT[decoder->header.quantiser_y] * temporal_scale); - const float base_q_co = roundf(QLUT[decoder->header.quantiser_co] * temporal_scale); - const float base_q_cg = roundf(QLUT[decoder->header.quantiser_cg] * temporal_scale); - - if (is_perceptual) { - // Phase 2: Use GOP dimensions (may be cropped) for dequantisation - dequantise_dwt_subbands_perceptual(0, QLUT[decoder->header.quantiser_y], - quantised_gop[t][0], gop_y[t], - gop_width, gop_height, - decoder->header.decomp_levels, base_q_y, 0, decoder->frame_count + t); - dequantise_dwt_subbands_perceptual(0, QLUT[decoder->header.quantiser_y], - quantised_gop[t][1], gop_co[t], - gop_width, gop_height, - decoder->header.decomp_levels, base_q_co, 1, decoder->frame_count + t); - dequantise_dwt_subbands_perceptual(0, QLUT[decoder->header.quantiser_y], - quantised_gop[t][2], gop_cg[t], - gop_width, gop_height, - decoder->header.decomp_levels, base_q_cg, 1, decoder->frame_count + t); - } else { - // Uniform 
quantisation for older versions - // Phase 2: Use decoding_pixels for uniform dequantisation - for (int i = 0; i < decoding_pixels; i++) { - gop_y[t][i] = quantised_gop[t][0][i] * base_q_y; - gop_co[t][i] = quantised_gop[t][1][i] * base_q_co; - gop_cg[t][i] = quantised_gop[t][2][i] * base_q_cg; - } - } - } - } - - // Free quantised coefficients - for (int t = 0; t < gop_size; t++) { - free(quantised_gop[t][0]); - free(quantised_gop[t][1]); - free(quantised_gop[t][2]); - free(quantised_gop[t]); - } - free(quantised_gop); - - - // Phase 2: Use GOP dimensions (may be cropped) for grain removal - for (int t = 0; t < gop_size; t++) { - apply_grain_synthesis(gop_y[t], gop_width, gop_height, - decoder->header.decomp_levels, decoder->frame_count + t, - decoder->header.quantiser_y, decoder->header.encoder_preset, - decoder->no_grain_synthesis); - } - - // Apply inverse 3D DWT (spatial + temporal) - // Phase 2: Use GOP dimensions (may be cropped) for inverse DWT - apply_inverse_3d_dwt(gop_y, gop_co, gop_cg, gop_width, gop_height, - gop_size, decoder->header.decomp_levels, temporal_levels, - decoder->header.wavelet_filter, decoder->temporal_motion_coder); - - // Debug: Check Y values after inverse DWT - if (verbose && decoder->frame_count == 0) { - fprintf(stderr, "[GOP-DEBUG] After inverse 3D DWT: Frame 0 Y[0]=%.1f, Y[1]=%.1f, Y[2]=%.1f\n", - gop_y[0][0], gop_y[0][1], gop_y[0][2]); - } - - // Debug: Check spatial coefficients after inverse temporal DWT (before inverse spatial DWT) -// if (is_ezbc) { -// float max_y = 0.0f, min_y = 0.0f; -// for (int i = 0; i < num_pixels; i++) { -// if (gop_y[0][i] > max_y) max_y = gop_y[0][i]; -// if (gop_y[0][i] < min_y) min_y = gop_y[0][i]; -// } -// fprintf(stderr, "[GOP-EZBC] After inverse temporal DWT, Frame 0 Y spatial coeffs range: [%.1f, %.1f], first 5: %.1f %.1f %.1f %.1f %.1f\n", -// min_y, max_y, -// gop_y[0][0], gop_y[0][1], gop_y[0][2], gop_y[0][3], gop_y[0][4]); -// } - - // Convert YCoCg→RGB and write all GOP frames - const 
int is_ictcp = (decoder->header.version % 2 == 0); - - // DEBUG: Print frame size calculation -// if (decoder->frame_count == 0) { -// fprintf(stderr, "[DEBUG] decoder->frame_size=%d, decoder->header.width=%d, decoder->header.height=%d\n", -// decoder->frame_size, decoder->header.width, decoder->header.height); -// fprintf(stderr, "[DEBUG] bytes_to_write=%zu (should be %d)\n", -// (size_t)decoder->frame_size * 3, decoder->header.width * decoder->header.height * 3); -// } - - // Calculate consistent screen mask offsets for crop-encoded GOPs - // When crop encoding is active, all frames in GOP use same dimensions - const int is_crop_encoded = (gop_width != decoder->header.width || - gop_height != decoder->header.height); - uint16_t gop_mask_top = 0, gop_mask_bottom = 0, gop_mask_left = 0, gop_mask_right = 0; - - if (is_crop_encoded) { - // Center the cropped region in the full frame - if (gop_height < decoder->header.height) { - gop_mask_top = (decoder->header.height - gop_height) / 2; - gop_mask_bottom = decoder->header.height - gop_height - gop_mask_top; - } - if (gop_width < decoder->header.width) { - gop_mask_left = (decoder->header.width - gop_width) / 2; - gop_mask_right = decoder->header.width - gop_width - gop_mask_left; - } - if (verbose && decoder->frame_count == 0) { - fprintf(stderr, "[GOP-Crop] Centering %dx%d in %dx%d: top=%u, bottom=%u, left=%u, right=%u\n", - gop_width, gop_height, decoder->header.width, decoder->header.height, - gop_mask_top, gop_mask_bottom, gop_mask_left, gop_mask_right); - } - } - - for (int t = 0; t < gop_size; t++) { - // Update screen mask only if NOT crop-encoded - // Crop-encoded GOPs use consistent offsets calculated above - if (!is_crop_encoded) { - update_screen_mask(decoder, decoder->frame_count + t); - } - - // Phase 2: Convert cropped region to RGB, then composite to full frame - uint8_t *cropped_rgb = malloc(decoding_pixels * 3); - if (!cropped_rgb) { - fprintf(stderr, "Error: Failed to allocate cropped GOP frame 
buffer\n"); - result = -1; - break; - } - - // Convert cropped region to RGB - for (int i = 0; i < decoding_pixels; i++) { - uint8_t r, g, b; - if (is_ictcp) { - ictcp_to_rgb(gop_y[t][i], gop_co[t][i], gop_cg[t][i], &r, &g, &b); - } else { - ycocg_r_to_rgb(gop_y[t][i], gop_co[t][i], gop_cg[t][i], &r, &g, &b); - } - cropped_rgb[i * 3 + 0] = r; - cropped_rgb[i * 3 + 1] = g; - cropped_rgb[i * 3 + 2] = b; - } - - // Composite cropped frame to full frame with black borders - // Use GOP-consistent offsets for crop-encoded, or per-frame offsets otherwise - const uint16_t mask_top = is_crop_encoded ? gop_mask_top : decoder->screen_mask_top; - const uint16_t mask_bottom = is_crop_encoded ? gop_mask_bottom : decoder->screen_mask_bottom; - const uint16_t mask_left = is_crop_encoded ? gop_mask_left : decoder->screen_mask_left; - const uint16_t mask_right = is_crop_encoded ? gop_mask_right : decoder->screen_mask_right; - - uint8_t *frame_rgb = composite_to_full_frame(cropped_rgb, - gop_width, gop_height, - decoder->header.width, decoder->header.height, - mask_top, mask_right, mask_bottom, mask_left); - free(cropped_rgb); - - if (!frame_rgb) { - fprintf(stderr, "Error: Failed to composite GOP frame to full size\n"); - result = -1; - break; - } - - // Note: Phase 1 fill_masked_regions() is now replaced by Phase 2 composite function - // which places the decoded cropped frame into a full-frame buffer with black borders - - // Write frame to FFmpeg video pipe - const size_t bytes_to_write = decoder->frame_size * 3; - - // DEBUG: Verify we're writing to correct pipe -// if (decoder->frame_count == 0 && t == 0) { -// fprintf(stderr, "[DEBUG] Writing frame to video_pipe=%p, bytes_to_write=%zu\n", -// (void*)decoder->video_pipe, bytes_to_write); -// fprintf(stderr, "[DEBUG] First 10 RGB bytes: %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X\n", -// frame_rgb[0], frame_rgb[1], frame_rgb[2], frame_rgb[3], frame_rgb[4], -// frame_rgb[5], frame_rgb[6], frame_rgb[7], frame_rgb[8], 
frame_rgb[9]); -// } - - const size_t bytes_written = fwrite(frame_rgb, 1, bytes_to_write, decoder->video_pipe); - if (bytes_written != bytes_to_write) { - fprintf(stderr, "Error: Failed to write GOP frame %d to FFmpeg (wrote %zu/%zu bytes)\n", - t, bytes_written, bytes_to_write); - free(frame_rgb); - result = -1; - break; - } - fflush(decoder->video_pipe); - - free(frame_rgb); - } - - // Free GOP buffers - for (int t = 0; t < gop_size; t++) { - free(gop_y[t]); - free(gop_co[t]); - free(gop_cg[t]); - } - free(gop_y); - free(gop_co); - free(gop_cg); - - // BUGFIX: Only break on error (result < 0), not on success (result = 1) - if (result < 0) break; - - // GOP decoding doesn't update frame_count here - GOP_SYNC packet will do it - if (verbose) { - long pos_after_gop = ftell(decoder->input_fp); - fprintf(stderr, "[DEBUG] After GOP: file pos = %ld, %d frames written (waiting for GOP_SYNC)\n", - pos_after_gop, gop_size); - } - - continue; - } - - // Handle TAD audio packets (already extracted in Pass 1, just skip) - if (packet_type == TAV_PACKET_AUDIO_TAD) { - uint16_t sample_count_wrapper; - uint32_t payload_size_plus_7; - fread(&sample_count_wrapper, 2, 1, decoder->input_fp); - fread(&payload_size_plus_7, 4, 1, decoder->input_fp); - - // Skip TAD chunk (payload_size_plus_7 includes header and data) - fseek(decoder->input_fp, payload_size_plus_7, SEEK_CUR); - continue; - } - - // Handle extended header (has 2-byte count, not 4-byte size) - if (packet_type == TAV_PACKET_EXTENDED_HDR) { - uint16_t num_pairs; - if (fread(&num_pairs, 2, 1, decoder->input_fp) != 1) { - fprintf(stderr, "Error: Failed to read extended header count\n"); - result = -1; - break; - } - if (verbose && total_packets < 20) { - fprintf(stderr, "Packet %d: EXTENDED_HDR (0x%02X), %u pairs - skipping\n", - total_packets, packet_type, num_pairs); - } - // Skip the key-value pairs - // Format: each pair is [4-byte key][1-byte type][N-byte value] - // We need to parse each pair to know its size - for (int 
i = 0; i < num_pairs; i++) { - uint8_t key[4]; - uint8_t value_type; - if (fread(key, 1, 4, decoder->input_fp) != 4 || - fread(&value_type, 1, 1, decoder->input_fp) != 1) { - fprintf(stderr, "Error: Failed to read extended header pair %d\n", i); - result = -1; - break; - } - // Determine value size based on type - size_t value_size = 0; - switch (value_type) { - case 0x00: value_size = 2; break; // Int16 - case 0x01: value_size = 3; break; // Int24 - case 0x02: value_size = 4; break; // Int32 - case 0x03: value_size = 6; break; // Int48 - case 0x04: value_size = 8; break; // Int64 - case 0x10: { // Bytes with 2-byte length prefix - uint16_t str_len; - if (fread(&str_len, 2, 1, decoder->input_fp) != 1) { - fprintf(stderr, "Error: Failed to read string length\n"); - result = -1; - break; - } - value_size = str_len; - break; - } - default: - fprintf(stderr, "Warning: Unknown extended header value type 0x%02X\n", value_type); - break; - } - // Skip the value - if (value_size > 0) { - fseek(decoder->input_fp, value_size, SEEK_CUR); - } - } - if (result < 0) break; - continue; - } - - // Read packet size (for remaining packet types with standard format) - uint32_t packet_size; - if (fread(&packet_size, 4, 1, decoder->input_fp) != 1) { - fprintf(stderr, "Error: Failed to read packet size at packet %d (type 0x%02X)\n", - total_packets, packet_type); - result = -1; - break; - } - - if (verbose && total_packets < 20) { - fprintf(stderr, "Packet %d: Type 0x%02X, Size %u bytes\n", total_packets, packet_type, packet_size); - } - - switch (packet_type) { - case TAV_PACKET_IFRAME: - case TAV_PACKET_PFRAME: - // Update active screen mask for this frame (Phase 1: just tracking, not applying) - update_screen_mask(decoder, decoder->frame_count); - - iframe_count++; - if (verbose && iframe_count <= 5) { - fprintf(stderr, "Processing %s (packet %d, size %u bytes)...\n", - packet_type == TAV_PACKET_IFRAME ? 
"I-frame" : "P-frame", - total_packets, packet_size); - } - result = decode_i_or_p_frame(decoder, packet_type, packet_size); - if (result < 0) { - fprintf(stderr, "Error: Frame decoding failed at frame %d\n", decoder->frame_count); - break; - } - - // Update progress indicator - frames_since_last_update++; - struct timeval current_time; - gettimeofday(&current_time, NULL); - double time_since_update = (current_time.tv_sec - last_update_time.tv_sec) + - (current_time.tv_usec - last_update_time.tv_usec) / 1000000.0; - - if (time_since_update >= 1.0 || decoder->frame_count == 1) { // Update every second - double total_time = (current_time.tv_sec - start_time.tv_sec) + - (current_time.tv_usec - start_time.tv_usec) / 1000000.0; - double current_fps = frames_since_last_update / time_since_update; - double avg_fps = decoder->frame_count / total_time; - - fprintf(stderr, "\rDecoding: Frame %d (%.1f fps, avg %.1f fps) ", - decoder->frame_count, current_fps, avg_fps); - fflush(stderr); - - last_update_time = current_time; - frames_since_last_update = 0; - } - - break; - - case TAV_PACKET_AUDIO_MP2: - case TAV_PACKET_AUDIO_TRACK: - // MP2 audio - write directly to audio pipe - // Note: FFmpeg cannot decode MP2 from raw stream, so we skip for now - if (verbose && total_packets < 20) { - fprintf(stderr, "Skipping MP2 audio packet (%u bytes) - not yet supported\n", packet_size); - } - fseek(decoder->input_fp, packet_size, SEEK_CUR); - break; - - case TAV_PACKET_AUDIO_PCM8: - // PCM8 audio - already extracted in Pass 1, just skip - fseek(decoder->input_fp, packet_size, SEEK_CUR); - break; - - case TAV_PACKET_SUBTITLE: - case TAV_PACKET_SUBTITLE_TC: - // Skip subtitle packets - fseek(decoder->input_fp, packet_size, SEEK_CUR); - break; - - case TAV_PACKET_PFRAME_RESIDUAL: - case TAV_PACKET_BFRAME_RESIDUAL: - fprintf(stderr, "\nError: Unsupported packet type 0x%02X (MPEG-style motion compensation not supported)\n", packet_type); - result = -1; - break; - - default: - fprintf(stderr, 
"\nWarning: Unknown packet type 0x%02X (skipping)\n", packet_type); - fseek(decoder->input_fp, packet_size, SEEK_CUR); - break; + if (ctx.gop_frames) { + for (int i = 0; i < ctx.gop_frames_allocated; i++) { + free(ctx.gop_frames[i]); } + free(ctx.gop_frames); } - // Calculate final statistics - struct timeval end_time; - gettimeofday(&end_time, NULL); - double total_time = (end_time.tv_sec - start_time.tv_sec) + - (end_time.tv_usec - start_time.tv_usec) / 1000000.0; + fclose(ctx.input_fp); - if (verbose) { - printf("\nDecoded %d frames\n", decoder->frame_count); + // Remove temp audio file + if (ctx.audio_temp_file) { + unlink(ctx.audio_temp_file); + free(ctx.audio_temp_file); } - tav_decoder_free(decoder); + // Statistics + time_t total_time = time(NULL) - ctx.start_time; + double avg_fps = total_time > 0 ? (double)ctx.frames_decoded / total_time : 0.0; + + printf("\n=== Decoding Complete ===\n"); + printf(" Frames decoded: %lu\n", ctx.frames_decoded); + printf(" GOPs decoded: %lu\n", ctx.gops_decoded); + printf(" Audio samples: %lu\n", ctx.audio_samples_decoded); + printf(" Bytes read: %lu\n", ctx.bytes_read); + printf(" Decoding speed: %.1f fps\n", avg_fps); + printf(" Time taken: %ld seconds\n", total_time); + printf("=========================\n"); if (result < 0) { - fprintf(stderr, "Decoding error occurred\n"); - unlink(temp_audio_file); // Clean up temp file + fprintf(stderr, "Decoding failed\n"); + free(ctx.input_file); + free(ctx.output_file); return 1; } - // Print final statistics (similar to encoder) - fprintf(stderr, "\n"); // Clear progress line - printf("\nDecoding complete!\n"); - printf(" Frames decoded: %d\n", decoder->frame_count); - printf(" Decoding time: %.2fs (%.1f fps)\n", total_time, decoder->frame_count / total_time); - printf(" Output: %s\n", output_file); + printf("\nOutput written to: %s\n", ctx.output_file); - // Clean up temporary audio file - if (unlink(temp_audio_file) == 0 && verbose) { - fprintf(stderr, "Cleaned up temporary audio 
file: %s\n", temp_audio_file); - } + free(ctx.input_file); + free(ctx.output_file); return 0; }