tsvm/video_encoder/decoder_tav.c

// Created by CuriousTorvald and Claude on 2025-11-03.
// TAV Decoder - Converts TAV video to FFV1 format with TAD audio to PCMu8
// Based on TSVM decoder implementation (GraphicsJSR223Delegate.kt + playtav.js)
// Only supports features available in TSVM decoder (no MC-EZBC, no MPEG-style motion compensation)

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#include <zstd.h>
#include <unistd.h>
#include <sys/wait.h>
#include <getopt.h>
#include <signal.h>

#define DECODER_VENDOR_STRING "Decoder-TAV 20251103 (ffv1+pcmu8)"

// TAV format constants
#define TAV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x41\x56"
#define TAV_MODE_SKIP      0x00
#define TAV_MODE_INTRA     0x01
#define TAV_MODE_DELTA     0x02

// TAV packet types (only those supported by TSVM decoder)
#define TAV_PACKET_IFRAME          0x10  // Intra frame (keyframe) - SUPPORTED
#define TAV_PACKET_PFRAME          0x11  // Predicted frame - SUPPORTED (delta mode)
#define TAV_PACKET_GOP_UNIFIED     0x12  // Unified 3D DWT GOP - SUPPORTED
#define TAV_PACKET_AUDIO_MP2       0x20  // MP2 audio - SUPPORTED (passthrough)
#define TAV_PACKET_AUDIO_PCM8      0x21  // 8-bit PCM audio - SUPPORTED
#define TAV_PACKET_AUDIO_TAD       0x24  // TAD audio - SUPPORTED (decode to PCMu8)
#define TAV_PACKET_AUDIO_TRACK     0x40  // Bundled audio track - SUPPORTED (passthrough)
#define TAV_PACKET_SUBTITLE        0x30  // Subtitle - SKIPPED
#define TAV_PACKET_EXTENDED_HDR    0xEF  // Extended header - SKIPPED
#define TAV_PACKET_GOP_SYNC        0xFC  // GOP sync packet - SKIPPED
#define TAV_PACKET_TIMECODE        0xFD  // Timecode - SKIPPED
#define TAV_PACKET_SYNC_NTSC       0xFE  // NTSC sync - SKIPPED
#define TAV_PACKET_SYNC            0xFF  // Sync - SKIPPED

// Unsupported packet types (not in TSVM decoder)
#define TAV_PACKET_PFRAME_RESIDUAL 0x14  // P-frame MPEG-style - NOT SUPPORTED
#define TAV_PACKET_BFRAME_RESIDUAL 0x15  // B-frame MPEG-style - NOT SUPPORTED

// Channel layout definitions
#define CHANNEL_LAYOUT_YCOCG     0  // Y-Co-Cg/I-Ct-Cp
#define CHANNEL_LAYOUT_YCOCG_A   1  // Y-Co-Cg-A/I-Ct-Cp-A
#define CHANNEL_LAYOUT_Y_ONLY    2  // Y/I only
#define CHANNEL_LAYOUT_Y_A       3  // Y-A/I-A
#define CHANNEL_LAYOUT_COCG      4  // Co-Cg/Ct-Cp
#define CHANNEL_LAYOUT_COCG_A    5  // Co-Cg-A/Ct-Cp-A

// Wavelet filter types
#define WAVELET_5_3_REVERSIBLE 0
#define WAVELET_9_7_IRREVERSIBLE 1
#define WAVELET_BIORTHOGONAL_13_7 2
#define WAVELET_DD4 16
#define WAVELET_HAAR 255

// Tile sizes (match TSVM)
// A tile is TILE_SIZE_X x TILE_SIZE_Y samples; the padded variants add
// TILE_MARGIN samples on every side.
#define TILE_SIZE_X 640
#define TILE_SIZE_Y 540
#define DWT_FILTER_HALF_SUPPORT 4
#define TILE_MARGIN_LEVELS 3
// 4 * (1 << 3) = 32 samples of margin per side
#define TILE_MARGIN (DWT_FILTER_HALF_SUPPORT * (1 << TILE_MARGIN_LEVELS))
// 640 + 2 * 32 = 704
#define PADDED_TILE_SIZE_X (TILE_SIZE_X + 2 * TILE_MARGIN)
// 540 + 2 * 32 = 604
#define PADDED_TILE_SIZE_Y (TILE_SIZE_Y + 2 * TILE_MARGIN)

// Restrict x to the closed interval [min, max]; the lower bound is tested first.
static inline int CLAMP(int x, int min, int max) {
    if (x < min) {
        return min;
    }
    if (x > max) {
        return max;
    }
    return x;
}

//=============================================================================
// TAV Header Structure (32 bytes)
//=============================================================================

// TAV file header. The struct is packed so its in-memory layout has no padding;
// the field sizes below add up to exactly 32 bytes.
// NOTE(review): multi-byte fields carry whatever byte order the file uses; no
// byte swapping is visible here — confirm the host/file endianness assumption.
typedef struct {
    uint8_t magic[8];           // file signature (TAV_MAGIC is an 8-byte string)
    uint8_t version;            // format version
    uint16_t width;             // frame width (presumably pixels)
    uint16_t height;            // frame height (presumably pixels)
    uint8_t fps;                // frame rate
    uint32_t total_frames;      // number of frames in the stream
    uint8_t wavelet_filter;     // presumably one of the WAVELET_* ids — confirm
    uint8_t decomp_levels;      // number of DWT decomposition levels
    uint8_t quantiser_y;        // luma quantiser (presumably an index into QLUT — confirm)
    uint8_t quantiser_co;       // first chroma quantiser
    uint8_t quantiser_cg;       // second chroma quantiser
    uint8_t extra_flags;        // flag bits; meaning not defined in this section
    uint8_t video_flags;        // flag bits; meaning not defined in this section
    uint8_t encoder_quality;    // encoder quality setting
    uint8_t channel_layout;     // presumably one of the CHANNEL_LAYOUT_* ids — confirm
    uint8_t entropy_coder;      // entropy coder selector
    uint8_t reserved[2];        // reserved bytes
    uint8_t device_orientation; // device orientation code
    uint8_t file_role;          // file role code
} __attribute__((packed)) tav_header_t;

//=============================================================================
// Quantization Lookup Table (matches TSVM exactly)
//=============================================================================

static const int QLUT[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120,122,124,126,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,264,272,280,288,296,304,312,320,328,336,344,352,360,368,376,384,392,400,408,416,424,432,440,448,456,464,472,480,488,496,504,512,528,544,560,576,592,608,624,640,656,672,688,704,720,736,752,768,784,800,816,832,848,864,880,896,912,928,944,960,976,992,1008,1024,1056,1088,1120,1152,1184,1216,1248,1280,1312,1344,1376,1408,1440,1472,1504,1536,1568,1600,1632,1664,1696,1728,1760,1792,1824,1856,1888,1920,1952,1984,2016,2048,2112,2176,2240,2304,2368,2432,2496,2560,2624,2688,2752,2816,2880,2944,3008,3072,3136,3200,3264,3328,3392,3456,3520,3584,3648,3712,3776,3840,3904,3968,4032,4096};

// Perceptual quantization constants (match TSVM)
// Each table has 6 entries and is indexed by the derived quality level (0..5).
// Luma: HL weight = LH * ANISOTROPY_MULT[q] + ANISOTROPY_BIAS[q].
static const float ANISOTROPY_MULT[] = {2.0f, 1.8f, 1.6f, 1.4f, 1.2f, 1.0f};
static const float ANISOTROPY_BIAS[] = {0.4f, 0.2f, 0.1f, 0.0f, 0.0f, 0.0f};
// Chroma: multiplier applied to the chroma base curve for HL and HH; the bias is added for HH only.
static const float ANISOTROPY_MULT_CHROMA[] = {6.6f, 5.5f, 4.4f, 3.3f, 2.2f, 1.1f};
static const float ANISOTROPY_BIAS_CHROMA[] = {1.0f, 0.8f, 0.6f, 0.4f, 0.2f, 0.0f};
// Extra luma HL/HH scale factors applied when the perceptual level is about 3 (2.8..3.2)
// and about 2 (1.8..2.2) respectively.
static const float FOUR_PIXEL_DETAILER = 0.88f;
static const float TWO_PIXEL_DETAILER = 0.92f;

//=============================================================================
// DWT Subband Layout Calculation (matches TSVM)
//=============================================================================

// Describes one DWT subband inside the linear coefficient array.
typedef struct {
    int level;              // Decomposition level (1 to decompLevels)
    int subband_type;       // 0=LL, 1=LH, 2=HL, 3=HH
    int coeff_start;        // Starting index in linear coefficient array
    int coeff_count;        // Number of coefficients in this subband
} dwt_subband_info_t;

// Fills `subbands` with the layout of a `decomp_levels`-deep decomposition of a
// width x height plane and returns the number of entries written.
//
// Entry 0 is the LL band (tagged with level == decomp_levels, type 0). It is
// followed by LH, HL, HH triples for level = decomp_levels down to 1, where the
// triple for a given level uses dimensions shifted right by
// (decomp_levels - level + 1). Starts are assigned consecutively.
//
// The caller must provide room for 1 + 3 * decomp_levels entries; no capacity
// check is done here.
static int calculate_subband_layout(int width, int height, int decomp_levels, dwt_subband_info_t *subbands) {
    dwt_subband_info_t *out = subbands;

    // LL band occupies the front of the array
    const int ll_size = (width >> decomp_levels) * (height >> decomp_levels);
    *out++ = (dwt_subband_info_t){decomp_levels, 0, 0, ll_size};

    int next_start = ll_size;

    // shift runs 1..decomp_levels, which corresponds to level = decomp_levels..1
    for (int shift = 1; shift <= decomp_levels; shift++) {
        const int level = decomp_levels - shift + 1;
        const int band_size = (width >> shift) * (height >> shift);

        // Detail bands in the fixed order LH (1), HL (2), HH (3)
        for (int type = 1; type <= 3; type++) {
            *out++ = (dwt_subband_info_t){level, type, next_start, band_size};
            next_start += band_size;
        }
    }

    return (int)(out - subbands);
}

//=============================================================================
// Perceptual Quantization Model (matches TSVM exactly)
//=============================================================================

// Returns the quality level used by the perceptual model.
// A positive q_index is taken as explicit and mapped to q_index - 1.
// Otherwise the level is derived from the global luma quantiser: the first
// threshold (60, 42, 25, 12, 6) that q_y_global reaches selects level 0..4,
// and anything below 6 maps to level 5.
static int tav_derive_encoder_qindex(int q_index, int q_y_global) {
    static const int thresholds[] = {60, 42, 25, 12, 6};
    const int threshold_count = (int)(sizeof thresholds / sizeof thresholds[0]);

    if (q_index > 0) {
        return q_index - 1;
    }

    for (int lvl = 0; lvl < threshold_count; lvl++) {
        if (q_y_global >= thresholds[lvl]) {
            return lvl;
        }
    }
    return 5;
}

// Luma LH weight as a function of the perceptual level.
// For level >= 4 the curve is linear; below 4 it is a cubic in `level`.
// Both pieces evaluate to H4 (1.2) at level 4.
static float perceptual_model3_LH(float level) {
    const float H4 = 1.2f;
    const float K = 2.0f;  // CRITICAL: Fixed value for fixed curve; quantiser will scale it up anyway
    const float K12 = K * 12.0f;

    if (level >= 4.0f) {
        // Linear segment
        return H4 - ((K + 1.0f) / 15.0f) * (level - 4.0f);
    }

    // Cubic segment (also taken when `level` is NaN, as every comparison is false)
    const float C3 = -1.0f / 45.0f * (K12 + 92.0f);
    return (-level / 180.0f) * (K12 + 5.0f * level * level - 60.0f * level + 252.0f) - C3 + H4;
}

// Luma HL weight: the LH weight scaled and offset by the anisotropy tables.
// `quality` indexes ANISOTROPY_MULT / ANISOTROPY_BIAS directly and is not
// range-checked here.
static float perceptual_model3_HL(int quality, float LH) {
    const float gain = ANISOTROPY_MULT[quality];
    const float offset = ANISOTROPY_BIAS[quality];
    return LH * gain + offset;
}

// Linear blend of x and y: returns x when a == 0 and y when a == 1.
// `a` is not clamped, so values outside [0, 1] extrapolate.
static float lerp(float x, float y, float a) {
    const float keep = 1.0f - a;
    return x * keep + y * a;
}

// Luma HH weight: a blend between the LH and HL weights whose mix factor grows
// with sqrt(level) (0.5 at level 1, 1.0 at level 4, beyond 1.0 above that).
static float perceptual_model3_HH(float LH, float HL, float level) {
    const float root = sqrtf(level);
    const float mix = (root - 1.0f) * 0.5f + 0.5f;
    return lerp(LH, HL, mix);
}

// Luma LL weight, derived from the LH curve at `level` and at `level - 1`:
// the LH value is divided by the ratio LH(level - 1) / LH(level).
static float perceptual_model3_LL(float level) {
    const float here = perceptual_model3_LH(level);
    const float below = perceptual_model3_LH(level - 1.0f);
    const float ratio = below / here;
    return here / ratio;
}

// Chroma base curve: a line through (level = 4, value = 1) whose slope is
// -1 / (0.5 * quality^2 + 1), so it flattens as `quality` grows.
static float perceptual_model3_chroma_basecurve(int quality, float level) {
    const float slope = 1.0f / (0.5f * quality * quality + 1.0f);
    const float distance = level - 4.0f;
    return 1.0f - slope * distance;
}

static float get_perceptual_weight(int q_index, int q_y_global, int level0, int subband_type,
                                  int is_chroma, int max_levels) {
    // Convert to perceptual level (1-6 scale)
    const float level = 1.0f + ((level0 - 1.0f) / (max_levels - 1.0f)) * 5.0f;
    const int quality_level = tav_derive_encoder_qindex(q_index, q_y_global);

    if (!is_chroma) {
        // LUMA CHANNEL
        if (subband_type == 0) {
            return perceptual_model3_LL(level);
        }

        const float LH = perceptual_model3_LH(level);
        if (subband_type == 1) {
            return LH;
        }

        const float HL = perceptual_model3_HL(quality_level, LH);
        if (subband_type == 2) {
            float detailer = 1.0f;
            if (level >= 1.8f && level <= 2.2f) detailer = TWO_PIXEL_DETAILER;
            else if (level >= 2.8f && level <= 3.2f) detailer = FOUR_PIXEL_DETAILER;
            return HL * detailer;
        } else {
            // HH subband
            float detailer = 1.0f;
            if (level >= 1.8f && level <= 2.2f) detailer = TWO_PIXEL_DETAILER;
            else if (level >= 2.8f && level <= 3.2f) detailer = FOUR_PIXEL_DETAILER;
            return perceptual_model3_HH(LH, HL, level) * detailer;
        }
    } else {
        // CHROMA CHANNELS
        const float base = perceptual_model3_chroma_basecurve(quality_level, level - 1);
        if (subband_type == 0) {
            return 1.0f;
        } else if (subband_type == 1) {
            return fmaxf(base, 1.0f);
        } else if (subband_type == 2) {
            return fmaxf(base * ANISOTROPY_MULT_CHROMA[quality_level], 1.0f);
        } else {
            return fmaxf(base * ANISOTROPY_MULT_CHROMA[quality_level] + ANISOTROPY_BIAS_CHROMA[quality_level], 1.0f);
        }
    }
}

// Dequantises one plane of DWT coefficients with per-subband perceptual weights.
//
//   q_index, q_y_global  forwarded to get_perceptual_weight()
//   quantized            width * height quantised coefficients (linear subband order)
//   dequantized          output, width * height floats; zero-filled first
//   decomp_levels        number of DWT levels used to build the subband layout
//   base_quantizer       quantiser step before perceptual weighting
//   is_chroma            non-zero selects the chroma weighting model
//   frame_num            only useful for the (disabled) debug trace
//
// Each coefficient becomes roundf(quantized * base_quantizer * weight).
//
// Invalid geometry is rejected instead of corrupting the stack: the local
// subband table holds 32 entries and the layout needs 1 + 3 * decomp_levels,
// so decomp_levels must be within 0..10. On rejection the output is left
// zero-filled (when the dimensions are usable) and an error is printed.
static void dequantize_dwt_subbands_perceptual(int q_index, int q_y_global, const int16_t *quantized,
                                              float *dequantized, int width, int height, int decomp_levels,
                                              float base_quantizer, int is_chroma, int frame_num) {
    enum { MAX_SUBBANDS = 32 };

    if (width <= 0 || height <= 0) {
        return;  // nothing to write
    }

    const int coeff_count = width * height;
    memset(dequantized, 0, coeff_count * sizeof(float));

    if (decomp_levels < 0 || 1 + 3 * decomp_levels > MAX_SUBBANDS) {
        fprintf(stderr, "Error: Unsupported DWT decomposition level count %d\n", decomp_levels);
        return;
    }

    dwt_subband_info_t subbands[MAX_SUBBANDS];
    const int subband_count = calculate_subband_layout(width, height, decomp_levels, subbands);

    // Debug trace is compiled in but switched off; set to (frame_num == N) to trace frame N.
    const int is_debug = 0;
    (void)frame_num;

    // Apply perceptual weighting to each subband
    for (int s = 0; s < subband_count; s++) {
        const dwt_subband_info_t *subband = &subbands[s];
        const float weight = get_perceptual_weight(q_index, q_y_global, subband->level,
                                                  subband->subband_type, is_chroma, decomp_levels);
        const float effective_quantizer = base_quantizer * weight;

        if (is_debug && !is_chroma && subband->subband_type == 0) { // LL band
            fprintf(stderr, "  Subband level %d (LL): weight=%.6f, base_q=%.1f, effective_q=%.1f, count=%d\n",
                   subband->level, weight, base_quantizer, effective_quantizer, subband->coeff_count);

            // Print first 5 quantized LL coefficients
            fprintf(stderr, "    First 5 quantized LL: ");
            for (int k = 0; k < 5 && k < subband->coeff_count; k++) {
                fprintf(stderr, "%d ", quantized[subband->coeff_start + k]);
            }
            fprintf(stderr, "\n");

            // Find max quantized LL coefficient
            int max_quant_ll = 0;
            for (int k = 0; k < subband->coeff_count; k++) {
                const int q = quantized[subband->coeff_start + k];
                const int abs_val = q < 0 ? -q : q;
                if (abs_val > max_quant_ll) max_quant_ll = abs_val;
            }
            fprintf(stderr, "    Max quantized LL coefficient: %d (dequantizes to %.1f)\n",
                   max_quant_ll, max_quant_ll * effective_quantizer);
        }

        for (int i = 0; i < subband->coeff_count; i++) {
            const int idx = subband->coeff_start + i;
            if (idx < coeff_count) {
                // CRITICAL: Must ROUND to match EZBC encoder's roundf() behavior
                // Without rounding, truncation limits brightness range (e.g., Y maxes at 227 instead of 255)
                dequantized[idx] = roundf(quantized[idx] * effective_quantizer);
            }
        }
    }

    // Debug: Verify LL band was dequantized correctly
    if (is_debug && !is_chroma) {
        for (int s = 0; s < subband_count; s++) {
            const dwt_subband_info_t *subband = &subbands[s];
            if (subband->level != decomp_levels || subband->subband_type != 0) {
                continue;
            }

            fprintf(stderr, "  AFTER all subbands processed - First 5 dequantized LL: ");
            for (int k = 0; k < 5 && k < subband->coeff_count; k++) {
                fprintf(stderr, "%.1f ", dequantized[subband->coeff_start + k]);
            }
            fprintf(stderr, "\n");

            // Find max dequantized LL
            float max_dequant_ll = -999.0f;
            for (int k = 0; k < subband->coeff_count; k++) {
                const float v = dequantized[subband->coeff_start + k];
                const float abs_val = v < 0 ? -v : v;
                if (abs_val > max_dequant_ll) max_dequant_ll = abs_val;
            }
            fprintf(stderr, "  AFTER all subbands - Max dequantized LL: %.1f\n", max_dequant_ll);
            break;
        }
    }
}

//=============================================================================
// Grain Synthesis Removal (matches TSVM exactly)
//=============================================================================

// Deterministic RNG for grain synthesis (matches encoder)
// Deterministic per-coefficient hash used as the grain RNG.
// The four inputs are combined into a 32-bit key (two multiplies, y shifted
// into the upper half, all XORed together) and the key is then passed through
// a multiply / xor-shift finaliser.
static inline uint32_t tav_grain_synthesis_rng(uint32_t frame, uint32_t band, uint32_t x, uint32_t y) {
    uint32_t mixed = (frame * 0x9e3779b9u) ^ (band * 0x7f4a7c15u) ^ (y << 16) ^ x;

    // Finaliser rounds
    mixed ^= mixed >> 16;
    mixed *= 0x7feb352du;
    mixed ^= mixed >> 15;
    mixed *= 0x846ca68bu;
    mixed ^= mixed >> 16;

    return mixed;
}

// Generate triangular noise from uint32 RNG (returns value in range [-1.0, 1.0])
// Maps a 32-bit random word to triangular-distributed noise in [-1.0, 1.0].
// The low and high 16-bit halves each give a uniform value in [0, 1]; their
// sum (range [0, 2]) is shifted down by 1.
static inline float tav_grain_triangular_noise(uint32_t rng_val) {
    const uint32_t low_half = rng_val & 0xFFFFu;
    const uint32_t high_half = (rng_val >> 16) & 0xFFFFu;

    const float uniform_lo = low_half / 65535.0f;
    const float uniform_hi = high_half / 65535.0f;

    return (uniform_lo + uniform_hi) - 1.0f;
}

// Remove grain synthesis from DWT coefficients (decoder subtracts noise)
// This must be called AFTER dequantization but BEFORE inverse DWT
// Subtracts the deterministic grain noise from dequantised DWT coefficients.
// Call AFTER dequantisation and BEFORE the inverse DWT.
//
//   coeffs         width * height coefficients, modified in place
//   decomp_levels  number of DWT levels used to build the subband layout
//   frame_num      frame index fed to the noise hash
//   q_y_global     global luma quantiser; sets the noise amplitude
//
// The subband table on the stack holds 32 entries and the layout needs
// 1 + 3 * decomp_levels, so decomp_levels outside 0..10 is rejected (the
// coefficients are left untouched) instead of overrunning the table.
//
// NOTE(review): the `level == 0` test is meant to skip the LL band, but
// calculate_subband_layout() tags LL with level == decomp_levels, so the test
// only matches when decomp_levels == 0 and LL coefficients are processed like
// every other band. Left as is because the decoder must mirror the encoder —
// confirm against the encoder before changing.
static void remove_grain_synthesis_decoder(float *coeffs, int width, int height,
                                          int decomp_levels, int frame_num, int q_y_global) {
    enum { MAX_SUBBANDS = 32 };

    if (width <= 0 || height <= 0) {
        return;
    }
    if (decomp_levels < 0 || 1 + 3 * decomp_levels > MAX_SUBBANDS) {
        fprintf(stderr, "Error: Unsupported DWT decomposition level count %d\n", decomp_levels);
        return;
    }

    dwt_subband_info_t subbands[MAX_SUBBANDS];
    const int subband_count = calculate_subband_layout(width, height, decomp_levels, subbands);
    const int total_coeffs = width * height;

    // Amplitude is min(q_y_global, 32) * 0.25. The Kotlin reference reportedly
    // uses 0.8 here; 0.25 is what this decoder has been tuned to.
    const float noise_amplitude = (q_y_global < 32 ? q_y_global : 32) * 0.25f;

    for (int s = 0; s < subband_count; s++) {
        const dwt_subband_info_t *subband = &subbands[s];
        if (subband->level == 0) continue;  // see NOTE(review) above

        // Calculate band index for RNG (matches Kotlin: level + subbandType * 31 + 16777619)
        const uint32_t band = subband->level + subband->subband_type * 31 + 16777619;

        for (int i = 0; i < subband->coeff_count; i++) {
            const int idx = subband->coeff_start + i;
            if (idx >= total_coeffs) {
                continue;
            }

            // 2D position from the linear index; feeds the hash
            const int y = idx / width;
            const int x = idx % width;

            // Generate same deterministic noise as encoder and subtract it
            const uint32_t rng_val = tav_grain_synthesis_rng(frame_num, band, x, y);
            coeffs[idx] -= tav_grain_triangular_noise(rng_val) * noise_amplitude;
        }
    }
}

//=============================================================================
// Number of DWT levels used for a TAD audio chunk.
// The level count is currently fixed at 9 regardless of `chunk_size`; an
// earlier size-derived computation (log2-based, with a minimum chunk size
// check that returned -1) is disabled.
static int calculate_dwt_levels(int chunk_size) {
    const int fixed_levels = 9;
    (void)chunk_size;
    return fixed_levels;
}

//=============================================================================
// Multilevel inverse DWT for TAD audio (decoder needs the inverse only; CDF 9/7 is the active kernel, Haar/DD4 calls are disabled)
//=============================================================================

// Forward declaration (defined later in TAV decoder section)
static void dwt_97_inverse_1d(float *data, int length);

// Undoes a `levels`-deep forward DWT in place.
//
// The forward transform worked on prefixes of `data` with lengths
// length, (length+1)/2, ((length+1)/2+1)/2, ... ; the inverse must replay
// exactly those lengths in reverse order, smallest prefix first.
//
// The length table lives on the stack instead of the heap (the previous
// version used an unchecked malloc). 32 entries are always enough: repeated
// (x + 1) / 2 on an int reaches its fixed point within 31 steps, so every
// level at index >= 31 has the same length as index 31.
static void dwt_inverse_multilevel(float *data, int length, int levels) {
    enum { MAX_TRACKED_LEVELS = 32 };
    int lengths[MAX_TRACKED_LEVELS];

    lengths[0] = length;
    for (int i = 1; i < MAX_TRACKED_LEVELS; i++) {
        lengths[i] = (lengths[i - 1] + 1) / 2;
    }

    // Apply the inverse transform from the deepest level back up to the full length
    for (int level = levels - 1; level >= 0; level--) {
        const int slot = level < MAX_TRACKED_LEVELS ? level : MAX_TRACKED_LEVELS - 1;
        dwt_97_inverse_1d(data, lengths[slot]);
    }
}

//=============================================================================
// Helper Functions for TAD Decoder
//=============================================================================

// Restrict x to [min, max]. A NaN input is returned unchanged because both
// comparisons are false.
static inline float FCLAMP(float x, float min, float max) {
    if (x < min) {
        return min;
    }
    if (x > max) {
        return max;
    }
    return x;
}

//=============================================================================
// M/S Stereo Correlation (inverse of decorrelation)
//=============================================================================

// Uniform random in [0, 1)
// Uniform random float in [0, 1), driven by rand().
//
// When RAND_MAX does not fit in a float's 24-bit mantissa (e.g. 2^31 - 1),
// both (float)rand() and (float)RAND_MAX + 1.0f round to the same power of
// two for the largest rand() values, so the plain quotient can be exactly
// 1.0. The result is capped at the largest float below 1.0 to keep the
// half-open range; every other value is unchanged.
static inline float frand01(void) {
    const float below_one = 0x1.fffffep-1f;  // 1 - 2^-24
    const float r = (float)rand() / ((float)RAND_MAX + 1.0f);
    return r < 1.0f ? r : below_one;
}

// TPDF noise in [-1, +1)
// TPDF (triangular) noise: the difference of two uniform draws from frand01().
// With frand01() in [0, 1) the result lies strictly inside (-1, +1).
//
// The two draws are taken in separate statements so that the order in which
// they consume the rand() stream is fixed; in a single expression
// `frand01() - frand01()` the operand evaluation order is unspecified.
static inline float tpdf1(void) {
    const float first = frand01();
    const float second = frand01();
    return first - second;
}

// Converts mid/side samples back to left/right:
//   left = mid + side, right = mid - side, each clamped to [-1, 1].
static void ms_correlate(const float *mid, const float *side, float *left, float *right, size_t count) {
    size_t k = 0;
    while (k < count) {
        const float mid_sample = mid[k];
        const float side_sample = side[k];

        const float sum = mid_sample + side_sample;
        left[k] = FCLAMP(sum, -1.0f, 1.0f);

        const float diff = mid_sample - side_sample;
        right[k] = FCLAMP(diff, -1.0f, 1.0f);

        k++;
    }
}

// Sign of x as a float: 1.0 for positive, -1.0 for negative, 0.0 otherwise
// (zero of either sign and NaN all give 0.0).
static float signum(float x) {
    const int is_positive = x > 0.0f;
    const int is_negative = x < 0.0f;
    return (float)(is_positive - is_negative);
}

// Expands the dynamic range of both channels in place:
//   out = sign(in) * |in|^1.4142
// NOTE(review): the previous comment described this as |y|^(1/γ) with γ = 0.5,
// which would be an exponent of 2; the code uses 1.4142 (γ ≈ 0.7071). Confirm
// which value the encoder uses.
static void expand_gamma(float *left, float *right, size_t count) {
    const float exponent = 1.4142f;

    for (size_t k = 0; k < count; k++) {
        const float l = left[k];
        const float l_mag = fabsf(l);
        left[k] = signum(l) * powf(l_mag, exponent);

        const float r = right[k];
        const float r_mag = fabsf(r);
        right[k] = signum(r) * powf(r_mag, exponent);
    }
}

// Mu-law expansion of both channels in place, with mu = 255:
//   out = sign(in) * ((1 + mu)^|in| - 1) / mu
static void expand_mu_law(float *left, float *right, size_t count) {
    const float mu = 255.0f;

    for (size_t k = 0; k < count; k++) {
        const float l = left[k];
        left[k] = signum(l) * (powf(1.0f + mu, fabsf(l)) - 1.0f) / mu;

        const float r = right[k];
        right[k] = signum(r) * (powf(1.0f + mu, fabsf(r)) - 1.0f) / mu;
    }
}

// Quantises one float sample to unsigned 8-bit with 2nd-order error feedback.
// `history` holds the last two quantisation errors of the channel
// ([0] = most recent) and is updated in place.
static inline uint8_t pcm8_shape_and_quantise(float sample, float history[2]) {
    const float b1 = 1.5f;   // 1st feedback coefficient
    const float b2 = -0.75f; // 2nd feedback coefficient
    const float scale = 127.5f;
    const float bias  = 128.0f;

    // Sample-domain dither is kept small (±0.2 LSB instead of ±0.5 LSB) because
    // TPDF dither is already added in the coefficient domain; using the full
    // amplitude here would raise the noise floor twice.
    const float dither_scale = 0.2f;

    const float feedback = b1 * history[0] + b2 * history[1];
    const float dither = dither_scale * tpdf1();
    float shaped = sample + feedback + dither / scale;
    shaped = FCLAMP(shaped, -1.0f, 1.0f);

    int q = (int)lrintf(shaped * scale);
    if (q < -128) q = -128;
    else if (q > 127) q = 127;

    const float qerr = shaped - (float)q / scale;
    history[1] = history[0]; // shift history
    history[0] = qerr;

    return (uint8_t)(q + bias);
}

// Converts float stereo samples (nominally in [-1, 1]) to unsigned 8-bit PCM
// with noise-shaped dithering.
//
//   fleft, fright  input samples, `count` each
//   left, right    output bytes, `count` each
//   dither_error   per-channel error history ([0] = left, [1] = right),
//                  carried across calls by the caller
//
// For every sample the left channel is processed before the right one, so the
// dither generator is consumed in L, R, L, R, ... order.
static void pcm32f_to_pcm8(const float *fleft, const float *fright, uint8_t *left, uint8_t *right, size_t count, float dither_error[2][2]) {
    for (size_t k = 0; k < count; k++) {
        left[k] = pcm8_shape_and_quantise(fleft[k], dither_error[0]);
        right[k] = pcm8_shape_and_quantise(fright[k], dither_error[1]);
    }
}

//=============================================================================
// TAD (Terrarum Advanced Audio) Decoder - Constants and Helpers
//=============================================================================

// Coefficient scalars for each subband (CDF 9/7 with 9 decomposition levels)
// 10 entries, indexed by sideband number (0 = first/LL band). Each entry is
// roughly the previous one divided by sqrt(2), starting at 64.
static const float TAD32_COEFF_SCALARS[] = {64.0f, 45.255f, 32.0f, 22.627f, 16.0f, 11.314f, 8.0f, 5.657f, 4.0f, 2.828f};

// Base quantiser weight table (10 subbands: LL + 9 H bands)
// Indexed by sideband number like TAD32_COEFF_SCALARS; the two tables must
// stay the same length because they are indexed together.
static const float BASE_QUANTISER_WEIGHTS[] = {
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f
};

//=============================================================================
// Spectral Interpolation for Coefficient Reconstruction (TAD)
//=============================================================================

// Fast PRNG for light dithering (xorshift32)
// One step of the 32-bit xorshift generator with shifts (13, 17, 5).
// Advances *s and returns the new state. A state of 0 stays 0.
static inline uint32_t xorshift32(uint32_t *s) {
    uint32_t state = *s;
    state = state ^ (state << 13);
    state = state ^ (state >> 17);
    state = state ^ (state << 5);
    *s = state;
    return state;
}

// Uniform float in [0, 1): the low 24 bits of the next xorshift32 output
// divided by 2^24. Advances *s by one step.
static inline float urand(uint32_t *s) {
    const uint32_t low24 = xorshift32(s) & 0xFFFFFF;
    return low24 / 16777216.0f;
}

// TPDF (triangular) noise in (-1, 1): first urand() draw minus the second.
// Advances *s by two steps.
//
// The draws are taken in separate statements on purpose. In the single
// expression `urand(s) - urand(s)` both calls mutate *s and C leaves their
// relative order unspecified, so the sign of the result could differ between
// compilers; sequencing them makes the dither stream reproducible.
static inline float tpdf_tad(uint32_t *s) {
    const float first = urand(s);
    const float second = urand(s);
    return first - second;
}

// Compute RMS energy of a coefficient band
static float compute_band_rms(const float *c, size_t len) {
    if (len == 0) return 0.0f;
    double sumsq = 0.0;
    for (size_t i = 0; i < len; i++) {
        sumsq += (double)c[i] * c[i];
    }
    return sqrtf((float)(sumsq / (double)len));
}

// Simplified spectral reconstruction for wavelet coefficients
// Adds light TPDF dither (amplitude 0.05 * Q) to every coefficient of a band.
// Bands shorter than 4 coefficients are left untouched.
// The dither generator is seeded from the band length and Q, so the same band
// parameters always produce the same dither sequence.
// `lower_band_rms` is accepted but not used.
static void spectral_interpolate_band(float *c, size_t len, float Q, float lower_band_rms) {
    (void)lower_band_rms;

    if (len < 4) {
        return;
    }

    uint32_t rng_state = 0x9E3779B9u ^ (uint32_t)len ^ (uint32_t)(Q * 65536.0f);
    const float amplitude = 0.05f * Q;

    float *const end = c + len;
    for (float *p = c; p != end; ++p) {
        *p += tpdf_tad(&rng_state) * amplitude;
    }
}

//=============================================================================
// Dequantization (inverse of quantization)
//=============================================================================


#define LAMBDA_FIXED 6.0f

// Lambda-based decompanding decoder (inverse of Laplacian CDF-based encoder)
// Converts quantized index back to normalized float in [-1, 1]
static float lambda_decompanding(int8_t quant_val, int max_index) {
    // Handle zero
    if (quant_val == 0) {
        return 0.0f;
    }

    int sign = (quant_val < 0) ? -1 : 1;
    int abs_index = abs(quant_val);

    // Clamp to valid range
    if (abs_index > max_index) abs_index = max_index;

    // Map index back to normalized CDF [0, 1]
    float normalized_cdf = (float)abs_index / max_index;

    // Map from [0, 1] back to [0.5, 1.0] (CDF range for positive half)
    float cdf = 0.5f + normalized_cdf * 0.5f;

    // Inverse Laplacian CDF for x >= 0: x = -(1/λ) * ln(2*(1-F))
    // For F in [0.5, 1.0]: x = -(1/λ) * ln(2*(1-F))
    float abs_val = -(1.0f / LAMBDA_FIXED) * logf(2.0f * (1.0f - cdf));

    // Clamp to [0, 1]
    if (abs_val > 1.0f) abs_val = 1.0f;
    if (abs_val < 0.0f) abs_val = 0.0f;

    return sign * abs_val;
}

// Dequantises one channel of TAD DWT coefficients and applies per-band dither.
//
//   quantized        `count` quantised indices
//   coeffs           output, `count` floats
//   chunk_size       number of samples in the chunk; defines the band boundaries
//   dwt_levels       number of DWT levels (bands are numbered 0..dwt_levels)
//   max_index        largest quantiser index magnitude used by the encoder
//   quantiser_scale  global multiplier on the base band weights
//
// Band b starts at sideband_starts[b]: band 0 has chunk_size >> dwt_levels
// coefficients, band 1 the same, and every later band doubles. Coefficients
// past the last boundary are treated as belonging to the last band.
//
// Parameters that would index outside the 10-entry band tables, or a
// non-positive max_index (which would make the quantisation step infinite),
// are rejected: the output is zero-filled and an error is printed.
// The boundary table lives on the stack, so there is no allocation to fail.
static void dequantize_dwt_coefficients(const int8_t *quantized, float *coeffs, size_t count, int chunk_size, int dwt_levels, int max_index, float quantiser_scale) {
    enum {
        WEIGHT_BANDS = sizeof BASE_QUANTISER_WEIGHTS / sizeof BASE_QUANTISER_WEIGHTS[0],
        SCALAR_BANDS = sizeof TAD32_COEFF_SCALARS / sizeof TAD32_COEFF_SCALARS[0],
        MAX_BANDS = WEIGHT_BANDS < SCALAR_BANDS ? WEIGHT_BANDS : SCALAR_BANDS
    };

    if (dwt_levels < 0 || dwt_levels >= MAX_BANDS || chunk_size < 0 || max_index <= 0) {
        fprintf(stderr, "Error: Invalid TAD dequantisation parameters (levels=%d, chunk=%d, max_index=%d)\n",
                dwt_levels, chunk_size, max_index);
        memset(coeffs, 0, count * sizeof *coeffs);
        return;
    }

    // Calculate sideband boundaries dynamically
    const size_t first_band_size = (size_t)(chunk_size >> dwt_levels);
    size_t sideband_starts[MAX_BANDS + 1];
    sideband_starts[0] = 0;
    sideband_starts[1] = first_band_size;
    for (int i = 2; i <= dwt_levels + 1; i++) {
        sideband_starts[i] = sideband_starts[i - 1] + (first_band_size << (i - 2));
    }

    // Step 1: Dequantize all coefficients (no dithering yet).
    // i only increases, so the band index can be advanced instead of searched.
    int sideband = 0;
    for (size_t i = 0; i < count; i++) {
        while (sideband < dwt_levels && i >= sideband_starts[sideband + 1]) {
            sideband++;
        }

        // Decode using lambda companding
        const float normalized_val = lambda_decompanding(quantized[i], max_index);

        // Denormalize using the subband scalar and apply base weight + quantiser scaling
        const float weight = BASE_QUANTISER_WEIGHTS[sideband] * quantiser_scale;
        coeffs[i] = normalized_val * TAD32_COEFF_SCALARS[sideband] * weight;
    }

    // Step 2: Apply spectral interpolation per band, from band dwt_levels down to band 0.
    // The RMS of the band just processed is handed to the next call.
    float prev_band_rms = 0.0f;

    for (int band = dwt_levels; band >= 0; band--) {
        size_t band_start = sideband_starts[band];
        size_t band_end = sideband_starts[band + 1];

        // Never touch memory beyond the `count` coefficients we were given
        if (band_end > count) band_end = count;
        if (band_start > band_end) band_start = band_end;
        const size_t band_len = band_end - band_start;

        // Calculate quantization step Q for this band
        const float weight = BASE_QUANTISER_WEIGHTS[band] * quantiser_scale;
        const float scalar = TAD32_COEFF_SCALARS[band] * weight;
        const float Q = scalar / max_index;

        spectral_interpolate_band(&coeffs[band_start], band_len, Q, prev_band_rms);
        prev_band_rms = compute_band_rms(&coeffs[band_start], band_len);
    }
}

//=============================================================================
// Chunk Decoding
//=============================================================================

// Decodes one TAD audio chunk to interleaved unsigned 8-bit stereo PCM.
//
// Chunk layout as read here (host byte order):
//   uint16  sample_count   samples per channel
//   uint8   max_index      largest quantiser index magnitude
//   uint32  payload_size   size of the Zstd payload that follows
//   bytes   payload        Zstd frame holding sample_count mid indices
//                          followed by sample_count side indices
//
//   input, input_size  chunk bytes and the number of bytes available
//                      (assumes callers pass the real buffer length — TODO confirm)
//   pcmu8_stereo       output; must have room for 2 * sample_count bytes
//   bytes_consumed     set to header + payload size once decompression succeeded
//   samples_decoded    set to sample_count once decompression succeeded
//
// Returns 0 on success, -1 on malformed input, decompression failure or
// allocation failure.
//
// Changes over the previous version: the header and payload are bounds-checked
// against input_size, header fields are read with memcpy instead of
// misaligned pointer casts, allocations are checked, and a payload that
// decompresses to fewer than 2 * sample_count bytes is rejected instead of
// decoding uninitialised memory.
static int decode_chunk(const uint8_t *input, size_t input_size, uint8_t *pcmu8_stereo,
                        size_t *bytes_consumed, size_t *samples_decoded) {
    enum { CHUNK_HEADER_SIZE = 2 + 1 + 4 };

    if (input_size < CHUNK_HEADER_SIZE) {
        fprintf(stderr, "Error: TAD chunk too short for header (%zu bytes)\n", input_size);
        return -1;
    }

    // Read chunk header
    uint16_t sample_count;
    uint32_t payload_size;
    memcpy(&sample_count, input, sizeof sample_count);
    const uint8_t max_index = input[2];
    memcpy(&payload_size, input + 3, sizeof payload_size);
    const uint8_t *payload = input + CHUNK_HEADER_SIZE;

    if (payload_size > input_size - CHUNK_HEADER_SIZE) {
        fprintf(stderr, "Error: TAD payload size %u exceeds available data (%zu bytes)\n",
                (unsigned)payload_size, input_size - CHUNK_HEADER_SIZE);
        return -1;
    }

    // Calculate DWT levels from sample count
    const int dwt_levels = calculate_dwt_levels(sample_count);
    if (dwt_levels < 0) {
        fprintf(stderr, "Error: Invalid sample count %u\n", sample_count);
        return -1;
    }

    const size_t n = sample_count;

    // Generous upper bound for the decompressed data (only 2 * n bytes are used)
    const size_t decompressed_size = n * 4 * sizeof(int8_t);

    int rc = -1;
    float *work = NULL;
    uint8_t *pcm8 = NULL;
    uint8_t *decompressed = malloc(decompressed_size ? decompressed_size : 1);
    if (!decompressed) {
        fprintf(stderr, "Error: Out of memory decoding TAD chunk\n");
        goto cleanup;
    }

    const size_t actual_size = ZSTD_decompress(decompressed, decompressed_size, payload, payload_size);
    if (ZSTD_isError(actual_size)) {
        fprintf(stderr, "Error: Zstd decompression failed: %s\n", ZSTD_getErrorName(actual_size));
        goto cleanup;
    }
    if (actual_size < 2 * n) {
        fprintf(stderr, "Error: TAD payload decompressed to %zu bytes, expected at least %zu\n",
                actual_size, 2 * n);
        goto cleanup;
    }

    *bytes_consumed = CHUNK_HEADER_SIZE + (size_t)payload_size;
    *samples_decoded = n;

    if (n == 0) {
        rc = 0;  // empty chunk: nothing to synthesise
        goto cleanup;
    }

    // One float block split into four planes, one byte block split into two
    work = malloc(4 * n * sizeof *work);
    pcm8 = malloc(2 * n * sizeof *pcm8);
    if (!work || !pcm8) {
        fprintf(stderr, "Error: Out of memory decoding TAD chunk\n");
        goto cleanup;
    }

    float *dwt_mid = work;
    float *dwt_side = work + n;
    float *pcm32_left = work + 2 * n;
    float *pcm32_right = work + 3 * n;
    uint8_t *pcm8_left = pcm8;
    uint8_t *pcm8_right = pcm8 + n;

    // Mid indices come first in the decompressed data, side indices follow
    const int8_t *quant_mid = (const int8_t *)decompressed;
    const int8_t *quant_side = quant_mid + n;

    // Dequantize with quantiser scaling and spectral interpolation
    // Use quantiser_scale = 1.0f for baseline (must match encoder)
    const float quantiser_scale = 1.0f;
    dequantize_dwt_coefficients(quant_mid, dwt_mid, n, sample_count, dwt_levels, max_index, quantiser_scale);
    dequantize_dwt_coefficients(quant_side, dwt_side, n, sample_count, dwt_levels, max_index, quantiser_scale);

    // Inverse DWT
    dwt_inverse_multilevel(dwt_mid, sample_count, dwt_levels);
    dwt_inverse_multilevel(dwt_side, sample_count, dwt_levels);

    // Noise-shaping history starts from zero for every chunk
    float err[2][2] = {{0,0},{0,0}};

    // M/S to L/R correlation
    ms_correlate(dwt_mid, dwt_side, pcm32_left, pcm32_right, n);

    // expand dynamic range
    expand_gamma(pcm32_left, pcm32_right, n);

    // dither to 8-bit
    pcm32f_to_pcm8(pcm32_left, pcm32_right, pcm8_left, pcm8_right, n, err);

    // Interleave stereo output (PCMu8)
    for (size_t i = 0; i < n; i++) {
        pcmu8_stereo[i * 2] = pcm8_left[i];
        pcmu8_stereo[i * 2 + 1] = pcm8_right[i];
    }

    rc = 0;

cleanup:
    free(pcm8);
    free(work);
    free(decompressed);
    return rc;
}

//=============================================================================
// Significance Map Postprocessing (matches TSVM exactly)
//=============================================================================

// Helper: extract the 2-bit code of coefficient `coeff_idx` from a bit-packed map.
//
// Codes are packed four per byte, LSB-first: coefficient i occupies bits
// 2*(i%4) and 2*(i%4)+1 of byte i/4. A code therefore always starts on an
// even bit offset and can never straddle two bytes, so one byte read is enough.
//
//   map_data   packed 2-bit map
//   map_bytes  number of valid bytes in map_data
//   coeff_idx  zero-based coefficient index
//
// Returns the code (0..3), or 0 when coeff_idx falls outside the map.
static inline int get_twobit_code(const uint8_t *map_data, int map_bytes, int coeff_idx) {
    const int byte_idx = coeff_idx / 4;
    if (coeff_idx < 0 || byte_idx >= map_bytes) {
        return 0;  // Out-of-range index: treat as a zero coefficient instead of reading past the map
    }

    const int bit_offset = (coeff_idx % 4) * 2;
    return (map_data[byte_idx] >> bit_offset) & 0x03;
}

// Decoder: reconstruct coefficients from 2-bit map format (entropyCoder=0)
// Layout: [Y_map_2bit][Co_map_2bit][Cg_map_2bit][Y_others][Co_others][Cg_others]
// 2-bit encoding: 00=0, 01=+1, 10=-1, 11=other (stored in value array)
//
//   compressed_data  start of the layout described above
//   coeff_count      coefficients per channel (also the length of each output)
//   output_y/co/cg   receive coeff_count reconstructed coefficients each
//
// The three "others" arrays are stored back to back in channel order, so
// decoding the channels one after another consumes the value stream strictly
// sequentially; no separate counting pass is needed to locate each array.
// Values are 16-bit and read in host byte order. They are fetched with memcpy
// because the value area starts right after the maps and is not guaranteed to
// be 2-byte aligned.
static void postprocess_coefficients_twobit(uint8_t *compressed_data, int coeff_count,
                                           int16_t *output_y, int16_t *output_co, int16_t *output_cg) {
    const int map_bytes = (coeff_count + 3) / 4;  // 2 bits per coefficient, rounded up to whole bytes

    const uint8_t *const maps[3] = {
        compressed_data,
        compressed_data + (size_t)map_bytes,
        compressed_data + (size_t)map_bytes * 2
    };
    int16_t *const outputs[3] = {output_y, output_co, output_cg};

    // Value stream begins after all three maps
    const uint8_t *value_cursor = compressed_data + (size_t)map_bytes * 3;

    for (int ch = 0; ch < 3; ch++) {
        const uint8_t *map = maps[ch];
        int16_t *out = outputs[ch];

        for (int i = 0; i < coeff_count; i++) {
            switch (get_twobit_code(map, map_bytes, i)) {
                case 0: out[i] = 0; break;
                case 1: out[i] = 1; break;
                case 2: out[i] = -1; break;
                default: {
                    // Code 3: explicit value taken from the stream
                    int16_t value;
                    memcpy(&value, value_cursor, sizeof value);
                    value_cursor += sizeof value;
                    out[i] = value;
                    break;
                }
            }
        }
    }
}

//=============================================================================
// EZBC (Embedded Zero Block Coding) Decoder
//=============================================================================

// Rectangular region of a coefficient plane; one node of the EZBC quadtree.
// Field order matters: blocks are built with positional initializers.
typedef struct {
    int x;       // column of the block's top-left corner
    int y;       // row of the block's top-left corner
    int width;   // extent in columns
    int height;  // extent in rows
} ezbc_block_t;

// Cursor over an EZBC bitstream.
// Field order matters: readers are built with positional initializers.
typedef struct {
    const uint8_t *data;  // underlying byte buffer
    size_t size;          // index one past the last readable byte
    size_t byte_pos;      // index of the byte currently being consumed
    int bit_pos;          // next bit inside that byte (0 = least significant)
} ezbc_bitreader_t;

// Pull up to `num_bits` bits from the stream and assemble them LSB-first:
// the first bit read lands in bit 0 of the result, the next in bit 1, etc.
// Within each source byte bits are also consumed from the least significant end.
// If the stream runs out early, the bits gathered so far are returned as-is.
static int ezbc_read_bits(ezbc_bitreader_t *reader, int num_bits) {
    int value = 0;
    int filled = 0;

    while (filled < num_bits && reader->byte_pos < reader->size) {
        const uint8_t current = reader->data[reader->byte_pos];
        if ((current >> reader->bit_pos) & 1) {
            value |= 1 << filled;
        }

        // Advance the cursor, rolling over to the next byte after bit 7
        if (++reader->bit_pos == 8) {
            reader->bit_pos = 0;
            reader->byte_pos++;
        }
        filled++;
    }

    return value;
}

// EZBC block queue: a growable array of blocks that preserves insertion order.
typedef struct {
    ezbc_block_t *blocks;  // heap storage; NULL when nothing is allocated
    int count;             // number of blocks currently stored
    int capacity;          // number of blocks `blocks` has room for
} ezbc_block_queue_t;

// Prepare an empty queue with room for 256 blocks.
// If the allocation fails the queue is left empty with zero capacity;
// ezbc_queue_add will then retry the allocation on first use.
static void ezbc_queue_init(ezbc_block_queue_t *q) {
    q->count = 0;
    q->capacity = 256;
    q->blocks = malloc((size_t)q->capacity * sizeof *q->blocks);
    if (!q->blocks) {
        q->capacity = 0;
    }
}

// Release the queue's storage and reset it to an empty, unallocated state.
static void ezbc_queue_free(ezbc_block_queue_t *q) {
    free(q->blocks);
    q->blocks = NULL;
    q->count = 0;
    q->capacity = 0;
}

// Append `block`, doubling the storage when it is full.
// On allocation failure the existing contents are kept intact (the old buffer
// is neither leaked nor lost), an error is reported on stderr and the block
// is dropped rather than written through a NULL pointer.
static void ezbc_queue_add(ezbc_block_queue_t *q, ezbc_block_t block) {
    if (q->count >= q->capacity) {
        const int max_capacity = 1 << 30;  // keeps the doubling below from overflowing int
        if (q->capacity >= max_capacity) {
            fprintf(stderr, "Error: EZBC block queue too large\n");
            return;
        }

        const int grown = (q->capacity > 0) ? q->capacity * 2 : 256;
        ezbc_block_t *resized = realloc(q->blocks, (size_t)grown * sizeof *resized);
        if (!resized) {
            fprintf(stderr, "Error: out of memory growing EZBC block queue\n");
            return;
        }
        q->blocks = resized;
        q->capacity = grown;
    }
    q->blocks[q->count++] = block;
}

// Forward declaration
static int ezbc_process_significant_block_recursive(
    ezbc_bitreader_t *reader, ezbc_block_t block, int bitplane, int threshold,
    int16_t *output, int width, int8_t *significant, int *first_bitplane,
    ezbc_block_queue_t *next_significant, ezbc_block_queue_t *next_insignificant);

// EZBC recursive block decoder (matches Kotlin implementation)
//
// Handles a block that has just been flagged significant at `bitplane`:
//   - a 1x1 block reads one sign bit, stores +/-threshold into `output`,
//     marks the coefficient in `significant` / `first_bitplane`, and is
//     appended to `next_significant`;
//   - a larger block is split into up to four children (top-left, top-right,
//     bottom-left, bottom-right, in that order). For each child one flag bit
//     is read: set means recurse into it, clear means park it in
//     `next_insignificant`.
//
// `width` is the row stride of `output`, `significant` and `first_bitplane`.
// Returns the number of sign bits consumed.
static int ezbc_process_significant_block_recursive(
    ezbc_bitreader_t *reader, ezbc_block_t block, int bitplane, int threshold,
    int16_t *output, int width, int8_t *significant, int *first_bitplane,
    ezbc_block_queue_t *next_significant, ezbc_block_queue_t *next_insignificant) {

    // Leaf: a single coefficient becomes significant
    if (block.width == 1 && block.height == 1) {
        const int coeff_idx = block.y * width + block.x;
        const int is_negative = ezbc_read_bits(reader, 1);

        output[coeff_idx] = is_negative ? -threshold : threshold;
        significant[coeff_idx] = 1;
        first_bitplane[coeff_idx] = bitplane;
        ezbc_queue_add(next_significant, block);
        return 1;
    }

    // Split point: halve each dimension, but never below one
    int split_w = block.width / 2;
    int split_h = block.height / 2;
    if (split_w == 0) split_w = 1;
    if (split_h == 0) split_h = 1;

    const int rest_w = block.width - split_w;
    const int rest_h = block.height - split_h;

    // Collect the children that exist, in bitstream order
    ezbc_block_t children[4];
    int child_count = 0;
    children[child_count++] = (ezbc_block_t){block.x, block.y, split_w, split_h};
    if (rest_w > 0) {
        children[child_count++] = (ezbc_block_t){block.x + split_w, block.y, rest_w, split_h};
    }
    if (rest_h > 0) {
        children[child_count++] = (ezbc_block_t){block.x, block.y + split_h, split_w, rest_h};
    }
    if (rest_w > 0 && rest_h > 0) {
        children[child_count++] = (ezbc_block_t){block.x + split_w, block.y + split_h, rest_w, rest_h};
    }

    // One significance flag per child, each followed immediately by that child's payload
    int sign_bits_total = 0;
    for (int c = 0; c < child_count; c++) {
        if (ezbc_read_bits(reader, 1)) {
            sign_bits_total += ezbc_process_significant_block_recursive(
                reader, children[c], bitplane, threshold, output, width, significant, first_bitplane,
                next_significant, next_insignificant);
        } else {
            ezbc_queue_add(next_insignificant, children[c]);
        }
    }

    return sign_bits_total;
}

// Decode a single channel with EZBC
//
//   ezbc_data       buffer holding the channel's bitstream
//   offset          byte index in ezbc_data where this channel starts
//   size            length of this channel's bitstream in bytes
//   output          receives expected_count coefficients
//   expected_count  number of coefficients the caller expects (width * height)
//
// Stream layout: header of MSB bitplane (8 bits), width (16 bits), height
// (16 bits), followed by one pass per bitplane from MSB down to 0. Each pass
// first visits the insignificant blocks (one flag bit each; a set flag
// triggers recursive subdivision), then reads one refinement bit for every
// coefficient that became significant in an earlier pass.
//
// On any header or allocation problem the output is left zero-filled.
static void decode_channel_ezbc(const uint8_t *ezbc_data, size_t offset, size_t size,
                               int16_t *output, int expected_count) {
    if (expected_count <= 0) {
        return;  // Nothing to decode
    }

    ezbc_bitreader_t reader = {ezbc_data, offset + size, offset, 0};

    // Read header: MSB bitplane (8 bits), width (16 bits), height (16 bits)
    const int msb_bitplane = ezbc_read_bits(&reader, 8);
    const int width = ezbc_read_bits(&reader, 16);
    const int height = ezbc_read_bits(&reader, 16);

    // Start from an all-zero channel; every early exit below leaves it that way
    memset(output, 0, (size_t)expected_count * sizeof(int16_t));

    // Compare in 64 bits: two 16-bit fields can multiply past INT_MAX
    if ((long long)width * (long long)height != (long long)expected_count) {
        fprintf(stderr, "EZBC dimension mismatch: %dx%d != %d\n", width, height, expected_count);
        return;
    }

    // The header field is a full byte, but 1 << bitplane is undefined from 31 upwards
    if (msb_bitplane > 30) {
        fprintf(stderr, "EZBC invalid MSB bitplane: %d\n", msb_bitplane);
        return;
    }

    // Per-coefficient state tracking
    int8_t *significant = calloc((size_t)expected_count, sizeof *significant);
    int *first_bitplane = calloc((size_t)expected_count, sizeof *first_bitplane);
    if (!significant || !first_bitplane) {
        fprintf(stderr, "Error: out of memory in EZBC channel decode\n");
        free(significant);
        free(first_bitplane);
        return;
    }

    // Initialize queues
    ezbc_block_queue_t insignificant, next_insignificant, significant_queue, next_significant;
    ezbc_queue_init(&insignificant);
    ezbc_queue_init(&next_insignificant);
    ezbc_queue_init(&significant_queue);
    ezbc_queue_init(&next_significant);

    // Start with root block
    ezbc_block_t root = {0, 0, width, height};
    ezbc_queue_add(&insignificant, root);

    // Process bitplanes from MSB to LSB
    for (int bitplane = msb_bitplane; bitplane >= 0; bitplane--) {
        const int threshold = 1 << bitplane;

        // Process insignificant blocks
        for (int i = 0; i < insignificant.count; i++) {
            const int flag = ezbc_read_bits(&reader, 1);

            if (flag == 0) {
                // Still insignificant
                ezbc_queue_add(&next_insignificant, insignificant.blocks[i]);
            } else {
                // Became significant - use recursive processing
                ezbc_process_significant_block_recursive(
                    &reader, insignificant.blocks[i], bitplane, threshold,
                    output, width, significant, first_bitplane,
                    &next_significant, &next_insignificant);
            }
        }

        // Process significant 1x1 blocks (refinement)
        for (int i = 0; i < significant_queue.count; i++) {
            const ezbc_block_t block = significant_queue.blocks[i];
            const int idx = block.y * width + block.x;

            // A set refinement bit grows the magnitude by this bitplane's weight
            if (ezbc_read_bits(&reader, 1)) {
                if (output[idx] < 0) {
                    output[idx] -= threshold;
                } else {
                    output[idx] += threshold;
                }
            }

            // Keep in significant queue
            ezbc_queue_add(&next_significant, block);
        }

        // Swap queues: the "next" lists become current, and the old current
        // lists are recycled as empty "next" lists
        ezbc_block_queue_t temp_insig = insignificant;
        insignificant = next_insignificant;
        next_insignificant = temp_insig;
        next_insignificant.count = 0;

        ezbc_block_queue_t temp_sig = significant_queue;
        significant_queue = next_significant;
        next_significant = temp_sig;
        next_significant.count = 0;
    }

    // Cleanup
    free(significant);
    free(first_bitplane);
    ezbc_queue_free(&insignificant);
    ezbc_queue_free(&next_insignificant);
    ezbc_queue_free(&significant_queue);
    ezbc_queue_free(&next_significant);
}

// EZBC postprocessing for single frames
//
// The buffer holds the channels back to back in Y, Co, Cg order. Each channel
// is a 32-bit little-endian byte count followed by that many bytes of EZBC
// bitstream. A channel is decoded only when channel_layout marks it present
// AND its output pointer is non-NULL; otherwise it is assumed to be absent
// from the buffer and nothing is consumed for it.
//
//   channel_layout  bit 2 clear = luma present, bit 1 clear = chroma present
//                   (Co and Cg share the chroma bit)
//   coeff_count     coefficients per channel
//
// NOTE(review): the buffer's total length is not passed in, so the length
// prefixes cannot be validated here.
static void postprocess_coefficients_ezbc(uint8_t *compressed_data, int coeff_count,
                                          int16_t *output_y, int16_t *output_co, int16_t *output_cg,
                                          int channel_layout) {
    const int has_luma = (channel_layout & 0x04) == 0;
    const int has_chroma = (channel_layout & 0x02) == 0;

    int16_t *const planes[3] = {output_y, output_co, output_cg};
    const int present[3] = {has_luma, has_chroma, has_chroma};

    // size_t keeps the running offset exact even for large 32-bit chunk sizes
    size_t offset = 0;

    for (int ch = 0; ch < 3; ch++) {
        if (!present[ch] || planes[ch] == NULL) {
            continue;
        }

        // 32-bit little-endian length prefix
        const uint8_t *prefix = compressed_data + offset;
        const uint32_t size = ((uint32_t)prefix[0]) |
                              ((uint32_t)prefix[1] << 8) |
                              ((uint32_t)prefix[2] << 16) |
                              ((uint32_t)prefix[3] << 24);
        offset += 4;

        decode_channel_ezbc(compressed_data, offset, size, planes[ch], coeff_count);
        offset += size;
    }
}

//=============================================================================
// DWT Inverse Transforms (matches TSVM)
//=============================================================================

// 9/7 inverse DWT (from TSVM Kotlin code)
//
// In-place inverse lifting transform of `length` samples.
// Input layout:  [low-pass: (length+1)/2 values][high-pass: length/2 values]
// Output layout: interleaved samples, low-pass results at even positions and
//                high-pass results at odd positions.
//
// Lengths below 2 are left untouched. If the scratch buffer cannot be
// allocated, an error is printed to stderr and `data` is left unchanged.
static void dwt_97_inverse_1d(float *data, int length) {
    if (length < 2) return;

    const int half = (length + 1) / 2;  // number of low-pass coefficients
    const int num_high = length / 2;    // number of high-pass coefficients (half + num_high == length)

    float *temp = malloc((size_t)length * sizeof *temp);
    if (!temp) {
        fprintf(stderr, "Error: out of memory in inverse 9/7 DWT (length %d)\n", length);
        return;
    }
    memcpy(temp, data, (size_t)length * sizeof *temp);

    float *low = temp;          // low-pass band, `half` entries
    float *high = temp + half;  // high-pass band, `num_high` entries

    // 9/7 inverse lifting coefficients from TSVM
    const float alpha = -1.586134342f;
    const float beta = -0.052980118f;
    const float gamma = 0.882911076f;
    const float delta = 0.443506852f;
    const float K = 1.230174105f;

    // Step 1: Undo scaling
    for (int i = 0; i < half; i++) {
        low[i] /= K;
    }
    for (int i = 0; i < num_high; i++) {
        high[i] *= K;
    }

    // Step 2: Undo δ update
    // Neighbours: the high-pass value at the same index and the one before it.
    // For odd lengths the last low-pass sample has no high-pass partner (treated
    // as 0); at i == 0 there is no previous one, so the current one is reused.
    for (int i = 0; i < half; i++) {
        const float d_curr = (i < num_high) ? high[i] : 0.0f;
        const float d_prev = (i > 0) ? high[i - 1] : d_curr;
        low[i] -= delta * (d_curr + d_prev);
    }

    // Step 3: Undo γ predict
    // Neighbours: the low-pass value at the same index and the one after it
    // (the current one is reused at the right edge).
    for (int i = 0; i < num_high; i++) {
        const float s_curr = low[i];
        const float s_next = (i + 1 < half) ? low[i + 1] : s_curr;
        high[i] -= gamma * (s_curr + s_next);
    }

    // Step 4: Undo β update
    for (int i = 0; i < half; i++) {
        const float d_curr = (i < num_high) ? high[i] : 0.0f;
        const float d_prev = (i > 0) ? high[i - 1] : d_curr;
        low[i] -= beta * (d_curr + d_prev);
    }

    // Step 5: Undo α predict
    for (int i = 0; i < num_high; i++) {
        const float s_curr = low[i];
        const float s_next = (i + 1 < half) ? low[i + 1] : s_curr;
        high[i] -= alpha * (s_curr + s_next);
    }

    // Reconstruction - interleave low (even positions) and high (odd positions)
    for (int i = 0; i < half; i++) {
        data[2 * i] = low[i];
    }
    for (int i = 0; i < num_high; i++) {
        data[2 * i + 1] = high[i];
    }

    free(temp);
}

// 5/3 inverse DWT placeholder: currently delegates to the 9/7 inverse.
// Inputs shorter than two samples are left untouched.
// TODO: Implement proper 5/3 from TSVM if needed
static void dwt_53_inverse_1d(float *data, int length) {
    if (length >= 2) {
        dwt_97_inverse_1d(data, length);
    }
}

// Multi-level inverse DWT (matches TSVM exactly with correct non-power-of-2 handling)
static void apply_inverse_dwt_multilevel(float *data, int width, int height, int levels, int filter_type) {
    int max_size = (width > height) ? width : height;
    float *temp_row = malloc(max_size * sizeof(float));
    float *temp_col = malloc(max_size * sizeof(float));

    // Pre-calculate exact sequence of widths/heights from forward transform
    // This is CRITICAL for non-power-of-2 dimensions (e.g., 560, 448)
    // Forward transform uses: width, (width+1)/2, ((width+1)/2+1)/2, ...
    // Inverse MUST use the exact same sequence in reverse
    int *widths = malloc((levels + 1) * sizeof(int));
    int *heights = malloc((levels + 1) * sizeof(int));

    widths[0] = width;
    heights[0] = height;
    for (int i = 1; i <= levels; i++) {
        widths[i] = (widths[i - 1] + 1) / 2;
        heights[i] = (heights[i - 1] + 1) / 2;
    }

    // Debug: Print dimension sequence
    static int debug_once = 1;
    if (debug_once) {
        fprintf(stderr, "DWT dimension sequence for %dx%d with %d levels:\n", width, height, levels);
        for (int i = 0; i <= levels; i++) {
            fprintf(stderr, "  Level %d: %dx%d\n", i, widths[i], heights[i]);
        }
        debug_once = 0;
    }

    // TSVM: for (level in levels - 1 downTo 0)
    // Apply inverse transforms using pre-calculated dimensions
    for (int level = levels - 1; level >= 0; level--) {
        int current_width = widths[level];
        int current_height = heights[level];

        if (current_width < 1 || current_height < 1) continue;
        if (current_width == 1 && current_height == 1) continue;

        // TSVM: Column inverse transform first (vertical)
        for (int x = 0; x < current_width; x++) {
            for (int y = 0; y < current_height; y++) {
                temp_col[y] = data[y * width + x];
            }

            if (filter_type == 0) {
                dwt_53_inverse_1d(temp_col, current_height);
            } else {
                dwt_97_inverse_1d(temp_col, current_height);
            }

            for (int y = 0; y < current_height; y++) {
                data[y * width + x] = temp_col[y];
            }
        }

        // TSVM: Row inverse transform second (horizontal)
        for (int y = 0; y < current_height; y++) {
            for (int x = 0; x < current_width; x++) {
                temp_row[x] = data[y * width + x];
            }

            if (filter_type == 0) {
                dwt_53_inverse_1d(temp_row, current_width);
            } else {
                dwt_97_inverse_1d(temp_row, current_width);
            }

            for (int x = 0; x < current_width; x++) {
                data[y * width + x] = temp_row[x];
            }
        }

        // Debug after EVERY level
        static int first_frame_levels = 1;
        if (first_frame_levels && level <= 2) {  // Only log levels 2, 1, 0 for first frame
            int nonzero_level = 0;
            for (int y = 0; y < current_height; y++) {
                for (int x = 0; x < current_width; x++) {
                    if (fabsf(data[y * width + x]) > 0.001f) {  // Use fabs for better zero detection
                        nonzero_level++;
                    }
                }
            }
            // fprintf(stderr, "After level %d (%dx%d): nonzero=%d/%d, data[0]=%.1f, data[1]=%.1f, data[width]=%.1f\n",
            //        level, current_width, current_height, nonzero_level, current_width * current_height,
            //        data[0], data[1], data[width]);

            if (level == 0) first_frame_levels = 0;  // Stop after level 0 of first frame
        }
    }

    // Debug: Check buffer after all levels complete (disabled to reduce stderr output)
    // static int debug_output_once = 1;
    // if (debug_output_once) {
    //     int nonzero_final = 0;
    //     for (int i = 0; i < width * height; i++) {
    //         if (data[i] != 0.0f) nonzero_final++;
    //     }
    //     fprintf(stderr, "After ALL IDWT levels complete: nonzero=%d/%d, first 10: ", nonzero_final, width * height);
    //     for (int i = 0; i < 10 && i < width * height; i++) {
    //         fprintf(stderr, "%.1f ", data[i]);
    //     }
    //     fprintf(stderr, "\n");
    //     debug_output_once = 0;
    // }

    free(widths);
    free(heights);
    free(temp_row);
    free(temp_col);
}

//=============================================================================
// Temporal DWT and GOP Decoding (matches TSVM)
//=============================================================================

// Classify a GOP frame index into its temporal subband level.
//
// The GOP is partitioned by two boundaries:
//   [0, num_frames >> temporal_levels)                -> 0 (coarsest, tLL)
//   [num_frames >> temporal_levels, num_frames >> 1)  -> 1 (tLH)
//   [num_frames >> 1, ...)                            -> 2 (finest, tH)
// e.g. 16 frames with 2 temporal levels: frames 0-3 -> 0, 4-7 -> 1, 8-15 -> 2.
// (Original note: intended to mirror the encoder's logic in encoder_tav.c.)
static int get_temporal_subband_level(int frame_idx, int num_frames, int temporal_levels) {
    const int coarsest_end = num_frames >> temporal_levels;
    const int first_half_end = num_frames >> 1;

    if (frame_idx >= coarsest_end) {
        return (frame_idx >= first_half_end) ? 2 : 1;
    }
    return 0;
}

// Calculate temporal quantizer scale for a given temporal subband level
static float get_temporal_quantizer_scale(int temporal_level) {
    // Uses exponential scaling: 2^(BETA × level^KAPPA)
    // With BETA=0.6, KAPPA=1.14:
    //   - Level 0 (tLL):  2^0.0 = 1.00
    //   - Level 1 (tH):   2^0.68 = 1.61
    //   - Level 2 (tHH):  2^1.29 = 2.45
    const float BETA = 0.6f;  // Temporal scaling exponent
    const float KAPPA = 1.14f;
    return powf(2.0f, BETA * powf(temporal_level, KAPPA));
}

// Inverse Haar 1D DWT
//
// In-place reconstruction of `length` samples from the layout
// [averages: (length+1)/2 values][differences: length/2 values].
// Each average/difference pair yields two adjacent samples:
//   out[2i] = avg + diff,  out[2i+1] = avg - diff
// For odd lengths the final sample has no difference and is the last average
// copied through unchanged.
//
// Lengths below 2 are left untouched. If the scratch buffer cannot be
// allocated, an error is printed to stderr and `data` is left unchanged.
static void dwt_haar_inverse_1d(float *data, int length) {
    if (length < 2) return;

    const int half = (length + 1) / 2;  // number of averages
    const int pairs = length / 2;       // number of complete average/difference pairs

    float *temp = malloc((size_t)length * sizeof *temp);
    if (!temp) {
        fprintf(stderr, "Error: out of memory in inverse Haar DWT (length %d)\n", length);
        return;
    }

    for (int i = 0; i < pairs; i++) {
        const float average = data[i];
        const float difference = data[half + i];
        temp[2 * i] = average + difference;
        temp[2 * i + 1] = average - difference;
    }
    if (length % 2 != 0) {
        // Odd length: last sample comes from low-pass only
        temp[length - 1] = data[half - 1];
    }

    memcpy(data, temp, (size_t)length * sizeof *temp);
    free(temp);
}

// Apply inverse 3D DWT to GOP data (spatial + temporal)
// Order: SPATIAL first (each frame), then TEMPORAL (across frames)
//
//   gop_y/gop_co/gop_cg  arrays of gop_size frame pointers, one width x height
//                        plane per frame, transformed in place
//   spatial_levels       2D decomposition levels to undo per frame
//   temporal_levels      Haar decomposition levels to undo along the time axis
//   filter_type          forwarded to apply_inverse_dwt_multilevel
//
// The temporal step is skipped for single-frame GOPs and when there are no
// temporal levels. On allocation failure an error is printed and the temporal
// step is skipped (the spatial step has already been applied by then).
static void apply_inverse_3d_dwt(float **gop_y, float **gop_co, float **gop_cg,
                                int width, int height, int gop_size,
                                int spatial_levels, int temporal_levels, int filter_type) {
    // Step 1: Apply inverse 2D spatial DWT to each frame
    for (int t = 0; t < gop_size; t++) {
        apply_inverse_dwt_multilevel(gop_y[t], width, height, spatial_levels, filter_type);
        apply_inverse_dwt_multilevel(gop_co[t], width, height, spatial_levels, filter_type);
        apply_inverse_dwt_multilevel(gop_cg[t], width, height, spatial_levels, filter_type);
    }

    // Step 2: Apply inverse temporal DWT to each spatial location
    // Only needed for GOPs with multiple frames (skip for I-frames)
    if (gop_size < 2) return;
    if (temporal_levels < 1 || width < 1 || height < 1) return;  // nothing to undo

    // Pre-calculate all intermediate lengths for temporal DWT (same fix as TAD)
    // This ensures correct reconstruction for non-power-of-2 GOP sizes
    int *temporal_lengths = malloc(((size_t)temporal_levels + 1) * sizeof *temporal_lengths);
    float *temporal_line = malloc((size_t)gop_size * sizeof *temporal_line);
    if (!temporal_lengths || !temporal_line) {
        fprintf(stderr, "Error: out of memory in inverse temporal DWT (GOP size %d)\n", gop_size);
        free(temporal_lengths);
        free(temporal_line);
        return;
    }

    temporal_lengths[0] = gop_size;
    for (int i = 1; i <= temporal_levels; i++) {
        temporal_lengths[i] = (temporal_lengths[i - 1] + 1) / 2;
    }

    float **const channels[3] = {gop_y, gop_co, gop_cg};
    const size_t num_pixels = (size_t)width * (size_t)height;

    for (size_t pixel_idx = 0; pixel_idx < num_pixels; pixel_idx++) {
        for (int ch = 0; ch < 3; ch++) {
            float **frames = channels[ch];

            // Gather this pixel's values across the GOP into one contiguous line
            for (int t = 0; t < gop_size; t++) {
                temporal_line[t] = frames[t][pixel_idx];
            }

            // Undo the temporal levels from coarsest to finest
            for (int level = temporal_levels - 1; level >= 0; level--) {
                const int level_frames = temporal_lengths[level];
                if (level_frames >= 2) {
                    dwt_haar_inverse_1d(temporal_line, level_frames);
                }
            }

            // Scatter the reconstructed values back into the frames
            for (int t = 0; t < gop_size; t++) {
                frames[t][pixel_idx] = temporal_line[t];
            }
        }
    }

    free(temporal_line);
    free(temporal_lengths);
}

// Count the escape codes (binary 11) in one channel's significance maps,
// summed over every frame of the GOP. Map bytes at or beyond data_size are
// treated as absent and contribute nothing.
static size_t gop_unified_count_escapes(const uint8_t *data, size_t data_size,
                                        size_t maps_start, size_t map_bytes_per_frame,
                                        int gop_size, int num_pixels) {
    size_t escapes = 0;
    for (int frame = 0; frame < gop_size; frame++) {
        const size_t frame_map = maps_start + (size_t)frame * map_bytes_per_frame;
        for (int i = 0; i < num_pixels; i++) {
            const size_t byte_pos = frame_map + (size_t)i / 4;
            if (byte_pos >= data_size) {
                break;  // byte_pos only grows with i, so the rest is truncated too
            }
            if (((data[byte_pos] >> ((i % 4) * 2)) & 0x03) == 3) {
                escapes++;
            }
        }
    }
    return escapes;
}

// Expand one channel's 2-bit maps into output[frame][channel][i].
// Codes: 0 -> 0, 1 -> +1, 2 -> -1, 3 -> next little-endian int16 taken from
// the channel's value array starting at values_start. Coefficients whose map
// byte lies beyond data_size are left untouched (the caller zero-fills), and
// an escape whose value bytes lie beyond data_size decodes as 0 while still
// consuming its slot in the value array.
static void gop_unified_decode_channel(const uint8_t *data, size_t data_size,
                                       size_t maps_start, size_t values_start,
                                       size_t map_bytes_per_frame,
                                       int gop_size, int num_pixels,
                                       int16_t ***output, int channel) {
    size_t value_pos = values_start;
    for (int frame = 0; frame < gop_size; frame++) {
        const size_t frame_map = maps_start + (size_t)frame * map_bytes_per_frame;
        int16_t *dst = output[frame][channel];
        for (int i = 0; i < num_pixels; i++) {
            const size_t byte_pos = frame_map + (size_t)i / 4;
            if (byte_pos >= data_size) {
                break;
            }
            switch ((data[byte_pos] >> ((i % 4) * 2)) & 0x03) {
            case 0:
                dst[i] = 0;
                break;
            case 1:
                dst[i] = 1;
                break;
            case 2:
                dst[i] = -1;
                break;
            default:  // code 3: explicit 16-bit value
                if (value_pos + 1 < data_size) {
                    // Assemble as unsigned first: shifting a negative int left is undefined.
                    const uint16_t raw = (uint16_t)((unsigned)data[value_pos] |
                                                    ((unsigned)data[value_pos + 1] << 8));
                    dst[i] = (int16_t)raw;
                } else {
                    dst[i] = 0;
                }
                value_pos += 2;
                break;
            }
        }
    }
}

// Postprocess GOP unified block to per-frame coefficients (2-bit map format).
//
// Input layout: [Y maps][Co maps][Cg maps][Y values][Co values][Cg values],
// where each present channel stores one packed map per frame (2 bits per
// coefficient, four coefficients per byte, lowest bits first) and the value
// arrays hold one little-endian int16 per escape code (3) found in that
// channel's maps.
//
// channel_layout: bit 2 clear means luma is present, bit 1 clear means both
// chroma channels are present. Absent channels are returned as all zeros.
//
// Returns output[gop_size][3][num_pixels] (channel order Y, Co, Cg). Every
// level is a separate heap allocation which the caller must free. Returns
// NULL if gop_size or num_pixels is negative or an allocation fails.
static int16_t ***postprocess_gop_unified(const uint8_t *decompressed_data, size_t data_size,
                                         int gop_size, int num_pixels, int channel_layout) {
    if (gop_size < 0 || num_pixels < 0) {
        return NULL;
    }

    // 2 bits per coefficient, rounded up to whole bytes
    const size_t map_bytes_per_frame = ((size_t)num_pixels * 2 + 7) / 8;
    const size_t map_bytes_per_channel = map_bytes_per_frame * (size_t)gop_size;

    // Channel order: Y, Co, Cg. Co and Cg share the "chroma" layout bit.
    const int present[3] = {
        (channel_layout & 0x04) == 0,
        (channel_layout & 0x02) == 0,
        (channel_layout & 0x02) == 0,
    };

    // Maps of the present channels are stored back to back
    size_t maps_start[3];
    size_t read_ptr = 0;
    for (int c = 0; c < 3; c++) {
        maps_start[c] = read_ptr;
        if (present[c]) {
            read_ptr += map_bytes_per_channel;
        }
    }

    // Value arrays follow all maps; each channel's length is its escape count
    size_t values_start[3];
    for (int c = 0; c < 3; c++) {
        values_start[c] = read_ptr;
        if (present[c]) {
            read_ptr += 2 * gop_unified_count_escapes(decompressed_data, data_size,
                                                      maps_start[c], map_bytes_per_frame,
                                                      gop_size, num_pixels);
        }
    }

    // Allocate output arrays: [gop_size][3 channels][num_pixels]
    // (at least one element each so a zero count cannot be mistaken for failure)
    int16_t ***output = calloc(gop_size > 0 ? (size_t)gop_size : 1, sizeof *output);
    if (!output) {
        return NULL;
    }
    for (int t = 0; t < gop_size; t++) {
        output[t] = calloc(3, sizeof *output[t]);
        if (!output[t]) {
            goto alloc_failed;
        }
        for (int c = 0; c < 3; c++) {
            output[t][c] = calloc(num_pixels > 0 ? (size_t)num_pixels : 1, sizeof(int16_t));
            if (!output[t][c]) {
                goto alloc_failed;
            }
        }
    }

    // The channels are independent, so each one is decoded in its own pass
    for (int c = 0; c < 3; c++) {
        if (present[c]) {
            gop_unified_decode_channel(decompressed_data, data_size,
                                       maps_start[c], values_start[c],
                                       map_bytes_per_frame, gop_size, num_pixels,
                                       output, c);
        }
    }

    return output;

alloc_failed:
    for (int t = 0; t < gop_size; t++) {
        if (output[t]) {
            free(output[t][0]);
            free(output[t][1]);
            free(output[t][2]);
            free(output[t]);
        }
    }
    free(output);
    return NULL;
}

// Postprocess GOP RAW format to per-frame coefficients (entropyCoder=2)
// Layout: [All_Y_coeffs][All_Co_coeffs][All_Cg_coeffs] (raw int16 arrays,
// copied verbatim, so they are interpreted in host byte order). Within one
// channel the frames are stored back to back, num_pixels values each.
//
// channel_layout: bit 2 clear means luma is present, bit 1 clear means both
// chroma channels are present. Absent channels are returned as all zeros.
//
// Returns output[gop_size][3][num_pixels] (channel order Y, Co, Cg), each
// level a separate heap allocation owned by the caller. Returns NULL, after
// releasing everything, when the input is too short for a present channel,
// when gop_size or num_pixels is negative, or when an allocation fails.
static int16_t ***postprocess_gop_raw(const uint8_t *decompressed_data, size_t data_size,
                                     int gop_size, int num_pixels, int channel_layout) {
    static const char *const channel_names[3] = { "Y", "Co", "Cg" };

    // Determine which channels are present (Co and Cg share one layout bit)
    const int present[3] = {
        (channel_layout & 0x04) == 0,
        (channel_layout & 0x02) == 0,
        (channel_layout & 0x02) == 0,
    };

    if (gop_size < 0 || num_pixels < 0) {
        return NULL;
    }

    // Sizes are kept in size_t: the product can exceed INT_MAX for large GOPs
    const size_t frame_bytes = (size_t)num_pixels * sizeof(int16_t);
    const size_t channel_bytes = frame_bytes * (size_t)gop_size;

    // Allocate output arrays: [gop_size][3 channels][num_pixels]
    int16_t ***output = calloc(gop_size > 0 ? (size_t)gop_size : 1, sizeof *output);
    if (!output) {
        return NULL;
    }
    for (int t = 0; t < gop_size; t++) {
        output[t] = calloc(3, sizeof *output[t]);
        if (!output[t]) {
            goto error_cleanup;
        }
        for (int c = 0; c < 3; c++) {
            output[t][c] = calloc(num_pixels > 0 ? (size_t)num_pixels : 1, sizeof(int16_t));
            if (!output[t][c]) {
                goto error_cleanup;
            }
        }
    }

    size_t offset = 0;  // invariant: offset <= data_size

    for (int c = 0; c < 3; c++) {
        if (!present[c]) {
            continue;
        }
        if (channel_bytes > data_size - offset) {
            fprintf(stderr, "Error: Not enough data for %s channel in RAW GOP\n", channel_names[c]);
            goto error_cleanup;
        }
        // Copy straight from the byte buffer; it need not be int16-aligned
        const uint8_t *channel_src = decompressed_data + offset;
        for (int t = 0; t < gop_size; t++) {
            memcpy(output[t][c], channel_src + (size_t)t * frame_bytes, frame_bytes);
        }
        offset += channel_bytes;
    }

    return output;

error_cleanup:
    for (int t = 0; t < gop_size; t++) {
        if (output[t]) {
            free(output[t][0]);
            free(output[t][1]);
            free(output[t][2]);
            free(output[t]);
        }
    }
    free(output);
    return NULL;
}

// Postprocess GOP EZBC format to per-frame coefficients (entropyCoder=1)
// Layout: [frame0_size(4)][frame0_ezbc_data][frame1_size(4)][frame1_ezbc_data]...
// Each size field is a little-endian uint32 giving the length of the EZBC
// payload that follows; the payload itself is handed to the single-frame
// decoder postprocess_coefficients_ezbc().
//
// Returns output[gop_size][3][num_pixels] (channel order Y, Co, Cg), each
// level a separate heap allocation owned by the caller. Returns NULL, after
// releasing everything, when a size field or payload does not fit inside
// data_size, when gop_size or num_pixels is negative, or when an allocation
// fails.
static int16_t ***postprocess_gop_ezbc(const uint8_t *decompressed_data, size_t data_size,
                                      int gop_size, int num_pixels, int channel_layout) {
    if (gop_size < 0 || num_pixels < 0) {
        return NULL;
    }

    // Allocate output arrays: [gop_size][3 channels][num_pixels]
    int16_t ***output = calloc(gop_size > 0 ? (size_t)gop_size : 1, sizeof *output);
    if (!output) {
        return NULL;
    }
    for (int t = 0; t < gop_size; t++) {
        output[t] = calloc(3, sizeof *output[t]);
        if (!output[t]) {
            goto error_cleanup;
        }
        for (int c = 0; c < 3; c++) {
            output[t][c] = calloc(num_pixels > 0 ? (size_t)num_pixels : 1, sizeof(int16_t));
            if (!output[t][c]) {
                goto error_cleanup;
            }
        }
    }

    size_t offset = 0;  // invariant: offset <= data_size

    // Read each frame
    for (int t = 0; t < gop_size; t++) {
        if (data_size - offset < 4) {
            fprintf(stderr, "Error: Not enough data for frame %d size in EZBC GOP\n", t);
            goto error_cleanup;
        }

        // Read frame size (4 bytes, little-endian)
        const uint32_t frame_size = ((uint32_t)decompressed_data[offset + 0]) |
                                   ((uint32_t)decompressed_data[offset + 1] << 8) |
                                   ((uint32_t)decompressed_data[offset + 2] << 16) |
                                   ((uint32_t)decompressed_data[offset + 3] << 24);
        offset += 4;

        // Compare against the remaining bytes instead of forming
        // offset + frame_size, which could wrap and slip past the check.
        if (frame_size > data_size - offset) {
            fprintf(stderr, "Error: Frame %d EZBC data exceeds buffer (size=%u, available=%zu)\n",
                   t, frame_size, data_size - offset);
            goto error_cleanup;
        }

        // Decode EZBC frame using the single-frame EZBC decoder
        postprocess_coefficients_ezbc(
            (uint8_t *)(decompressed_data + offset), num_pixels,
            output[t][0], output[t][1], output[t][2],
            channel_layout);

        offset += frame_size;
    }

    return output;

error_cleanup:
    for (int t = 0; t < gop_size; t++) {
        if (output[t]) {
            free(output[t][0]);
            free(output[t][1]);
            free(output[t][2]);
            free(output[t]);
        }
    }
    free(output);
    return NULL;
}

//=============================================================================
// YCoCg-R / ICtCp to RGB Conversion (matches TSVM)
//=============================================================================

// Inverse YCoCg-R transform for one pixel.
// Rebuilds green, blue and red (in that dependency order) from luma `y` and
// the chroma terms `co` / `cg`, rounds each by adding 0.5 and truncating to
// int, then CLAMPs to 0..255 before storing through r, g and b.
static void ycocg_r_to_rgb(float y, float co, float cg, uint8_t *r, uint8_t *g, uint8_t *b) {
    const float base = y - cg / 2.0f;
    const float green = cg + base;
    const float blue = base - co / 2.0f;
    const float red = co + blue;

    const int red_rounded = (int)(red + 0.5f);
    const int green_rounded = (int)(green + 0.5f);
    const int blue_rounded = (int)(blue + 0.5f);

    *r = CLAMP(red_rounded, 0, 255);
    *g = CLAMP(green_rounded, 0, 255);
    *b = CLAMP(blue_rounded, 0, 255);
}

// ICtCp to RGB conversion (for even TAV versions), one pixel at a time.
// Three stages: ICtCp -> nonlinear LMS, nonlinear -> linear LMS through a
// simplified inverse PQ curve (negative inputs are floored at 0 first), and
// linear LMS -> RGB. The RGB result is scaled by 255, rounded by adding 0.5
// and truncating, and CLAMPed to 0..255.
static void ictcp_to_rgb(float i, float ct, float cp, uint8_t *r, uint8_t *g, uint8_t *b) {
    const float pq_inverse_exponent = 1.0f / 0.1593f;

    // Stage 1: ICtCp -> LMS (nonlinear)
    const float l_nonlinear = i + 0.008609f * ct;
    const float m_nonlinear = i - 0.008609f * ct;
    const float s_nonlinear = i + 0.560031f * cp;

    // Stage 2: LMS (nonlinear) -> LMS (linear)
    const float l_linear = powf(fmaxf(l_nonlinear, 0.0f), pq_inverse_exponent);
    const float m_linear = powf(fmaxf(m_nonlinear, 0.0f), pq_inverse_exponent);
    const float s_linear = powf(fmaxf(s_nonlinear, 0.0f), pq_inverse_exponent);

    // Stage 3: LMS -> RGB
    const float red = 5.432622f * l_linear - 4.679910f * m_linear + 0.247288f * s_linear;
    const float green = -1.106160f * l_linear + 2.311198f * m_linear - 0.205038f * s_linear;
    const float blue = 0.028262f * l_linear - 0.195689f * m_linear + 1.167427f * s_linear;

    const int red_8bit = (int)(red * 255.0f + 0.5f);
    const int green_8bit = (int)(green * 255.0f + 0.5f);
    const int blue_8bit = (int)(blue * 255.0f + 0.5f);

    *r = CLAMP(red_8bit, 0, 255);
    *g = CLAMP(green_8bit, 0, 255);
    *b = CLAMP(blue_8bit, 0, 255);
}

//=============================================================================
// WAV File Writing
//=============================================================================

// Store a 16-bit value at dst in little-endian byte order.
static void wav_put_le16(uint8_t *dst, uint16_t value) {
    dst[0] = (uint8_t)(value & 0xFF);
    dst[1] = (uint8_t)(value >> 8);
}

// Store a 32-bit value at dst in little-endian byte order.
static void wav_put_le32(uint8_t *dst, uint32_t value) {
    dst[0] = (uint8_t)(value & 0xFF);
    dst[1] = (uint8_t)((value >> 8) & 0xFF);
    dst[2] = (uint8_t)((value >> 16) & 0xFF);
    dst[3] = (uint8_t)(value >> 24);
}

// Write a 44-byte canonical WAV header for 8-bit unsigned PCM at the current
// position of fp.
//
//   sample_rate  frames per second
//   channels     interleaved channel count
//   data_size    length in bytes of the sample data that follows the header
//
// All multi-byte fields are serialised explicitly as little-endian, as the
// RIFF/WAVE format requires, so the output does not depend on the host byte
// order. A short write is reported on stderr; there is no return value.
static void write_wav_header(FILE *fp, uint32_t sample_rate, uint16_t channels, uint32_t data_size) {
    uint8_t header[44];

    // RIFF header
    memcpy(header + 0, "RIFF", 4);
    wav_put_le32(header + 4, 36 + data_size);   // bytes remaining after this field
    memcpy(header + 8, "WAVE", 4);

    // fmt chunk
    memcpy(header + 12, "fmt ", 4);
    wav_put_le32(header + 16, 16);              // fmt chunk payload size
    wav_put_le16(header + 20, 1);               // audio format: PCM
    wav_put_le16(header + 22, channels);
    wav_put_le32(header + 24, sample_rate);
    wav_put_le32(header + 28, sample_rate * channels * 1);  // byte rate, 1 byte per sample (u8)
    wav_put_le16(header + 32, (uint16_t)(channels * 1));    // block align
    wav_put_le16(header + 34, 8);               // bits per sample

    // data chunk
    memcpy(header + 36, "data", 4);
    wav_put_le32(header + 40, data_size);

    if (fwrite(header, 1, sizeof header, fp) != sizeof header) {
        fprintf(stderr, "Warning: Failed to write WAV header\n");
    }
}

//=============================================================================
// Decoder State Structure
//=============================================================================

// State for one decoding session: the open input stream and its parsed
// header, the per-frame working buffers, and the FFmpeg child process that
// receives the decoded video. Created by tav_decoder_init() and released by
// tav_decoder_free().
typedef struct {
    // TAV input stream, opened in binary read mode by tav_decoder_init()
    FILE *input_fp;
    // File header, read verbatim from the start of input_fp
    tav_header_t header;
    // Frame buffers of frame_size * 3 bytes each; a SKIP-mode frame copies
    // reference_frame_rgb into current_frame_rgb
    uint8_t *current_frame_rgb;
    uint8_t *reference_frame_rgb;
    // Per-channel float work buffers, frame_size elements each
    // NOTE(review): presumably the inverse-DWT working planes - confirm against the decode path
    float *dwt_buffer_y;
    float *dwt_buffer_co;
    float *dwt_buffer_cg;
    // Per-channel float reference planes, frame_size elements each
    float *reference_ycocg_y;   // For P-frame delta accumulation
    float *reference_ycocg_co;
    float *reference_ycocg_cg;
    // NOTE(review): looks like the number of frames decoded so far - confirm where it is incremented
    int frame_count;
    // Pixels per frame: header.width * header.height
    int frame_size;
    int is_monoblock;           // True if version 3-6 (single tile mode)

    // FFmpeg pipe for video only (audio from file): stdio stream wrapping the
    // write end of the pipe that the FFmpeg child reads as "pipe:3"
    FILE *video_pipe;
    // Process id returned by fork() for the FFmpeg child; waited on at teardown
    pid_t ffmpeg_pid;

    // Temporary audio file: heap copy (strdup) of the audio path given to
    // tav_decoder_init()
    char *audio_file_path;
} tav_decoder_t;

//=============================================================================
// Pass 1: Extract Audio to WAV File
//=============================================================================

static int extract_audio_to_wav(const char *input_file, const char *wav_file, int verbose) {
    FILE *input_fp = fopen(input_file, "rb");
    if (!input_fp) {
        fprintf(stderr, "Failed to open input file for audio extraction\n");
        return -1;
    }

    // Read header
    tav_header_t header;
    if (fread(&header, sizeof(tav_header_t), 1, input_fp) != 1) {
        fclose(input_fp);
        return -1;
    }

    // Open temporary audio file
    FILE *wav_fp = fopen(wav_file, "wb");
    if (!wav_fp) {
        fprintf(stderr, "Failed to create temporary audio file\n");
        fclose(input_fp);
        return -1;
    }

    // Write placeholder WAV header (will be updated later)
    write_wav_header(wav_fp, 32000, 2, 0);

    uint32_t total_audio_bytes = 0;
    int packet_count = 0;

    if (verbose) {
        fprintf(stderr, "[Pass 1] Extracting audio to %s...\n", wav_file);
    }

    // Read all packets and extract audio
    while (1) {
        uint8_t packet_type;
        if (fread(&packet_type, 1, 1, input_fp) != 1) {
            break;  // EOF
        }

        packet_count++;

        // Skip non-audio packets
        if (packet_type == TAV_PACKET_SYNC || packet_type == TAV_PACKET_SYNC_NTSC) {
            continue;
        }

        if (packet_type == TAV_PACKET_TIMECODE) {
            fseek(input_fp, 8, SEEK_CUR);  // Skip timecode
            continue;
        }

        if (packet_type == TAV_PACKET_GOP_SYNC) {
            fseek(input_fp, 1, SEEK_CUR);  // Skip frame count
            continue;
        }

        if (packet_type == TAV_PACKET_GOP_UNIFIED) {
            uint8_t gop_size;
            uint32_t compressed_size;
            fread(&gop_size, 1, 1, input_fp);
            fread(&compressed_size, 4, 1, input_fp);
            fseek(input_fp, compressed_size, SEEK_CUR);  // Skip GOP data
            continue;
        }

        // Handle TAD audio
        if (packet_type == TAV_PACKET_AUDIO_TAD) {
            uint16_t sample_count_wrapper;
            uint32_t payload_size_plus_7;
            fread(&sample_count_wrapper, 2, 1, input_fp);
            fread(&payload_size_plus_7, 4, 1, input_fp);

            uint16_t sample_count_chunk;
            uint8_t quantiser_index;
            uint32_t compressed_size;
            fread(&sample_count_chunk, 2, 1, input_fp);
            fread(&quantiser_index, 1, 1, input_fp);
            fread(&compressed_size, 4, 1, input_fp);

            uint8_t *tad_compressed = malloc(compressed_size);
            fread(tad_compressed, 1, compressed_size, input_fp);

            // Build TAD chunk
            size_t tad_chunk_size = 2 + 1 + 4 + compressed_size;
            uint8_t *tad_chunk = malloc(tad_chunk_size);
            memcpy(tad_chunk, &sample_count_chunk, 2);
            memcpy(tad_chunk + 2, &quantiser_index, 1);
            memcpy(tad_chunk + 3, &compressed_size, 4);
            memcpy(tad_chunk + 7, tad_compressed, compressed_size);
            free(tad_compressed);

            // Decode TAD
            uint8_t *pcmu8_output = malloc(sample_count_chunk * 2);
            size_t bytes_consumed, samples_decoded;
            int decode_result = decode_chunk(tad_chunk, tad_chunk_size,
                                            pcmu8_output, &bytes_consumed, &samples_decoded);

            if (decode_result >= 0) {
                size_t pcm_bytes = samples_decoded * 2;
                fwrite(pcmu8_output, 1, pcm_bytes, wav_fp);
                total_audio_bytes += pcm_bytes;
            }

            free(tad_chunk);
            free(pcmu8_output);
            continue;
        }

        // Handle PCM8 audio
        if (packet_type == TAV_PACKET_AUDIO_PCM8) {
            uint32_t packet_size;
            fread(&packet_size, 4, 1, input_fp);

            uint8_t *compressed_data = malloc(packet_size);
            fread(compressed_data, 1, packet_size, input_fp);

            // Decompress
            size_t decompressed_bound = ZSTD_getFrameContentSize(compressed_data, packet_size);
            uint8_t *pcm_data = malloc(decompressed_bound);
            size_t decompressed_size = ZSTD_decompress(pcm_data, decompressed_bound,
                                                       compressed_data, packet_size);
            free(compressed_data);

            if (!ZSTD_isError(decompressed_size)) {
                fwrite(pcm_data, 1, decompressed_size, wav_fp);
                total_audio_bytes += decompressed_size;
            }

            free(pcm_data);
            continue;
        }

        // Handle EXTENDED_HDR packet (key-value pairs)
        if (packet_type == TAV_PACKET_EXTENDED_HDR) {
            uint16_t num_pairs;
            fread(&num_pairs, 2, 1, input_fp);
            for (int i = 0; i < num_pairs; i++) {
                fseek(input_fp, 4, SEEK_CUR);  // Skip key (4 bytes)
                uint8_t value_type;
                fread(&value_type, 1, 1, input_fp);
                if (value_type == 0x04) {
                    fseek(input_fp, 8, SEEK_CUR);  // uint64 value
                } else if (value_type == 0x10) {
                    uint16_t str_len;
                    fread(&str_len, 2, 1, input_fp);
                    fseek(input_fp, str_len, SEEK_CUR);  // string value
                }
            }
            continue;
        }

        // Read packet size for standard packets
        uint32_t packet_size;
        if (fread(&packet_size, 4, 1, input_fp) == 1) {
            fseek(input_fp, packet_size, SEEK_CUR);
        }
    }

    // Update WAV header with actual data size
    fseek(wav_fp, 0, SEEK_SET);
    write_wav_header(wav_fp, 32000, 2, total_audio_bytes);

    fclose(wav_fp);
    fclose(input_fp);

    if (verbose) {
        fprintf(stderr, "[Pass 1] Extracted %u bytes of audio (%d packets processed)\n",
               total_audio_bytes, packet_count);
    }

    return 0;
}

//=============================================================================
// Decoder Initialization and Cleanup
//=============================================================================

// Create a decoder for `input_file` and start the FFmpeg child process that
// will encode the decoded video.
//
// Steps: open the input, read the header and verify its magic, allocate the
// per-frame buffers, create a pipe, then fork. The child moves the pipe's
// read end to fd 3 and execs /usr/bin/ffmpeg, which reads raw rgb24 frames
// from "pipe:3" and audio from `audio_file`, and writes FFV1 video plus
// PCM u8 audio into a Matroska file at `output_file`. The parent keeps the
// write end as decoder->video_pipe.
//
// Returns a heap-allocated decoder to be released with tav_decoder_free(),
// or NULL on any failure (all partially acquired resources are released).
static tav_decoder_t* tav_decoder_init(const char *input_file, const char *output_file, const char *audio_file) {
    int video_pipe_fd[2] = { -1, -1 };
    long long pixel_count;

    if (!input_file || !output_file || !audio_file) return NULL;

    tav_decoder_t *decoder = calloc(1, sizeof(tav_decoder_t));
    if (!decoder) return NULL;

    decoder->input_fp = fopen(input_file, "rb");
    if (!decoder->input_fp) goto fail;

    // Read header
    if (fread(&decoder->header, sizeof(tav_header_t), 1, decoder->input_fp) != 1) goto fail;

    // Verify magic
    if (memcmp(decoder->header.magic, TAV_MAGIC, 8) != 0) goto fail;

    // Widen before multiplying, and require that frame_size * 3 fits in an int
    pixel_count = (long long)decoder->header.width * (long long)decoder->header.height;
    if (pixel_count <= 0 || pixel_count > INT32_MAX / 3) {
        fprintf(stderr, "Error: Invalid frame dimensions in TAV header\n");
        goto fail;
    }

    decoder->frame_size = (int)pixel_count;
    decoder->is_monoblock = (decoder->header.version >= 3 && decoder->header.version <= 6);
    decoder->audio_file_path = strdup(audio_file);

    // Allocate buffers
    decoder->current_frame_rgb = calloc((size_t)decoder->frame_size, 3);
    decoder->reference_frame_rgb = calloc((size_t)decoder->frame_size, 3);
    decoder->dwt_buffer_y = calloc((size_t)decoder->frame_size, sizeof(float));
    decoder->dwt_buffer_co = calloc((size_t)decoder->frame_size, sizeof(float));
    decoder->dwt_buffer_cg = calloc((size_t)decoder->frame_size, sizeof(float));
    decoder->reference_ycocg_y = calloc((size_t)decoder->frame_size, sizeof(float));
    decoder->reference_ycocg_co = calloc((size_t)decoder->frame_size, sizeof(float));
    decoder->reference_ycocg_cg = calloc((size_t)decoder->frame_size, sizeof(float));

    if (!decoder->audio_file_path ||
        !decoder->current_frame_rgb || !decoder->reference_frame_rgb ||
        !decoder->dwt_buffer_y || !decoder->dwt_buffer_co || !decoder->dwt_buffer_cg ||
        !decoder->reference_ycocg_y || !decoder->reference_ycocg_co || !decoder->reference_ycocg_cg) {
        fprintf(stderr, "Error: Failed to allocate decoder buffers\n");
        goto fail;
    }

    // Create FFmpeg process for video encoding (video pipe only, audio from file)
    if (pipe(video_pipe_fd) == -1) {
        fprintf(stderr, "Failed to create video pipe\n");
        video_pipe_fd[0] = -1;
        video_pipe_fd[1] = -1;
        goto fail;
    }

    decoder->ffmpeg_pid = fork();
    if (decoder->ffmpeg_pid == -1) {
        fprintf(stderr, "Failed to fork FFmpeg process\n");
        goto fail;
    }

    if (decoder->ffmpeg_pid == 0) {
        // Child process - FFmpeg
        close(video_pipe_fd[1]);  // Close write end

        char video_size[32];
        char framerate[16];
        snprintf(video_size, sizeof(video_size), "%dx%d", decoder->header.width, decoder->header.height);
        snprintf(framerate, sizeof(framerate), "%d", decoder->header.fps);

        // Redirect video pipe to fd 3. If the read end already is fd 3,
        // dup2 would be a no-op and closing it would destroy the pipe.
        if (video_pipe_fd[0] != 3) {
            if (dup2(video_pipe_fd[0], 3) == -1) {
                fprintf(stderr, "Failed to start FFmpeg\n");
                _exit(1);
            }
            close(video_pipe_fd[0]);
        }

        execl("/usr/bin/ffmpeg", "ffmpeg",
              "-f", "rawvideo",
              "-pixel_format", "rgb24",
              "-video_size", video_size,
              "-framerate", framerate,
              "-i", "pipe:3",              // Video from fd 3
              "-i", audio_file,            // Audio from file
              "-color_range", "2",
              "-c:v", "ffv1",              // FFV1 codec
              "-level", "3",               // FFV1 level 3
              "-coder", "1",               // Range coder
              "-context", "1",             // Large context
              "-g", "1",                   // GOP size 1 (all I-frames)
              "-slices", "24",             // 24 slices for threading
              "-slicecrc", "1",            // CRC per slice
              "-pixel_format", "rgb24",    // make FFmpeg encode to RGB
              "-color_range", "2",
              "-c:a", "pcm_u8",            // Audio codec (PCM unsigned 8-bit)
              "-f", "matroska",            // MKV container
              output_file,
              "-y",                        // Overwrite output
              "-v", "warning",             // Minimal logging
              (char*)NULL);

        // Only reached when exec failed. _exit() instead of exit(): the child
        // must not run the parent's atexit handlers or flush its copy of the
        // parent's stdio buffers.
        fprintf(stderr, "Failed to start FFmpeg\n");
        _exit(1);
    }

    // Parent process
    close(video_pipe_fd[0]);  // Close read end
    video_pipe_fd[0] = -1;

    decoder->video_pipe = fdopen(video_pipe_fd[1], "wb");
    if (!decoder->video_pipe) {
        fprintf(stderr, "Failed to open video pipe for writing\n");
        // Drop our write end, stop the child, and reap it so no zombie remains
        close(video_pipe_fd[1]);
        video_pipe_fd[1] = -1;
        kill(decoder->ffmpeg_pid, SIGTERM);
        waitpid(decoder->ffmpeg_pid, NULL, 0);
        goto fail;
    }

    return decoder;

fail:
    // Single cleanup path: every pointer is NULL (from calloc) until assigned
    if (video_pipe_fd[0] != -1) close(video_pipe_fd[0]);
    if (video_pipe_fd[1] != -1) close(video_pipe_fd[1]);
    free(decoder->current_frame_rgb);
    free(decoder->reference_frame_rgb);
    free(decoder->dwt_buffer_y);
    free(decoder->dwt_buffer_co);
    free(decoder->dwt_buffer_cg);
    free(decoder->reference_ycocg_y);
    free(decoder->reference_ycocg_co);
    free(decoder->reference_ycocg_cg);
    free(decoder->audio_file_path);
    if (decoder->input_fp) fclose(decoder->input_fp);
    free(decoder);
    return NULL;
}

// Release everything owned by a decoder created with tav_decoder_init().
// Closes the input stream and the video pipe, blocks until the FFmpeg child
// has exited, then frees all buffers and the decoder itself.
// Passing NULL is a no-op.
static void tav_decoder_free(tav_decoder_t *decoder) {
    if (decoder == NULL) {
        return;
    }

    if (decoder->input_fp != NULL) {
        fclose(decoder->input_fp);
    }
    // The pipe is closed before waiting on the child
    if (decoder->video_pipe != NULL) {
        fclose(decoder->video_pipe);
    }

    // Wait for FFmpeg to finish
    if (decoder->ffmpeg_pid > 0) {
        int child_status;
        waitpid(decoder->ffmpeg_pid, &child_status, 0);
    }

    // free(NULL) is harmless, so buffers that were never allocated need no guard
    void *const owned_buffers[] = {
        decoder->current_frame_rgb,
        decoder->reference_frame_rgb,
        decoder->dwt_buffer_y,
        decoder->dwt_buffer_co,
        decoder->dwt_buffer_cg,
        decoder->reference_ycocg_y,
        decoder->reference_ycocg_co,
        decoder->reference_ycocg_cg,
        decoder->audio_file_path,
    };
    for (size_t k = 0; k < sizeof owned_buffers / sizeof owned_buffers[0]; k++) {
        free(owned_buffers[k]);
    }

    free(decoder);
}

//=============================================================================
// Frame Decoding Logic
//=============================================================================

static int decode_i_or_p_frame(tav_decoder_t *decoder, uint8_t packet_type, uint32_t packet_size) {
    // Variable declarations for cleanup
    uint8_t *compressed_data = NULL;
    uint8_t *decompressed_data = NULL;
    int16_t *quantized_y = NULL;
    int16_t *quantized_co = NULL;
    int16_t *quantized_cg = NULL;
    int decode_success = 1;  // Assume success, set to 0 on error

    // Read and decompress frame data
    compressed_data = malloc(packet_size);
    if (!compressed_data) {
        fprintf(stderr, "Error: Failed to allocate %u bytes for compressed data\n", packet_size);
        decode_success = 0;
        goto write_frame;
    }

    if (fread(compressed_data, 1, packet_size, decoder->input_fp) != packet_size) {
        fprintf(stderr, "Error: Failed to read %u bytes of compressed frame data\n", packet_size);
        decode_success = 0;
        goto write_frame;
    }

    size_t decompressed_size = ZSTD_getFrameContentSize(compressed_data, packet_size);
    if (decompressed_size == ZSTD_CONTENTSIZE_ERROR || decompressed_size == ZSTD_CONTENTSIZE_UNKNOWN) {
        fprintf(stderr, "Warning: Could not determine decompressed size, using estimate\n");
        decompressed_size = decoder->frame_size * 3 * sizeof(int16_t) + 1024;
    }

    decompressed_data = malloc(decompressed_size);
    if (!decompressed_data) {
        fprintf(stderr, "Error: Failed to allocate %zu bytes for decompressed data\n", decompressed_size);
        decode_success = 0;
        goto write_frame;
    }

    // Debug first 3 frames compression
//    static int decomp_debug = 0;
//    if (decomp_debug < 3) {
//        fprintf(stderr, "  [ZSTD frame %d] Compressed size: %u, buffer size: %zu\n", decomp_debug, packet_size, decompressed_size);
//        fprintf(stderr, "  [ZSTD frame %d] First 16 bytes of COMPRESSED data: ", decomp_debug);
//        for (int i = 0; i < 16 && i < (int)packet_size; i++) {
//            fprintf(stderr, "%02X ", compressed_data[i]);
//        }
//        fprintf(stderr, "\n");
//    }

    size_t actual_size = ZSTD_decompress(decompressed_data, decompressed_size, compressed_data, packet_size);

    if (ZSTD_isError(actual_size)) {
        fprintf(stderr, "Error: ZSTD decompression failed: %s\n", ZSTD_getErrorName(actual_size));
        fprintf(stderr, "  Compressed size: %u, Buffer size: %zu\n", packet_size, decompressed_size);
        decode_success = 0;
        goto write_frame;
    }

//    if (decomp_debug < 3) {
//        fprintf(stderr, "  [ZSTD frame %d] Decompressed size: %zu\n", decomp_debug, actual_size);
//        fprintf(stderr, "  [ZSTD frame %d] First 16 bytes of DECOMPRESSED data: ", decomp_debug);
//        for (int i = 0; i < 16 && i < (int)actual_size; i++) {
//            fprintf(stderr, "%02X ", decompressed_data[i]);
//        }
//        fprintf(stderr, "\n");
//        decomp_debug++;
//    }

    // Parse block data
    uint8_t *ptr = decompressed_data;
    uint8_t mode = *ptr++;
    uint8_t qy_override = *ptr++;
    uint8_t qco_override = *ptr++;
    uint8_t qcg_override = *ptr++;

    // IMPORTANT: Both header and override store QLUT indices, not values!
    // Override of 0 means "use header value"
    int qy = qy_override ? QLUT[qy_override] : QLUT[decoder->header.quantiser_y];
    int qco = qco_override ? QLUT[qco_override] : QLUT[decoder->header.quantiser_co];
    int qcg = qcg_override ? QLUT[qcg_override] : QLUT[decoder->header.quantiser_cg];

    // Debug first few frames
//    if (decoder->frame_count < 2) {
//        fprintf(stderr, "Frame %d: mode=%d, Q: Y=%d, Co=%d, Cg=%d, decompressed=%zu bytes\n",
//               decoder->frame_count, mode, qy, qco, qcg, actual_size);
//    }

    if (mode == TAV_MODE_SKIP) {
        // Copy from reference frame
        memcpy(decoder->current_frame_rgb, decoder->reference_frame_rgb, decoder->frame_size * 3);
    } else {
        // Decode coefficients (use function-level variables for proper cleanup)
        int coeff_count = decoder->frame_size;
        quantized_y = calloc(coeff_count, sizeof(int16_t));
        quantized_co = calloc(coeff_count, sizeof(int16_t));
        quantized_cg = calloc(coeff_count, sizeof(int16_t));

        if (!quantized_y || !quantized_co || !quantized_cg) {
            fprintf(stderr, "Error: Failed to allocate coefficient buffers\n");
            decode_success = 0;
            goto write_frame;
        }

        // Postprocess coefficients based on entropy_coder value
        if (decoder->header.entropy_coder == 1) {
            // EZBC format (stub implementation)
            postprocess_coefficients_ezbc(ptr, coeff_count, quantized_y, quantized_co, quantized_cg,
                                         decoder->header.channel_layout);
        } else {
            // Default: Twobitmap format (entropy_coder=0)
            postprocess_coefficients_twobit(ptr, coeff_count, quantized_y, quantized_co, quantized_cg);
        }

        // Debug: Check first few coefficients
//        if (decoder->frame_count == 32) {
//            fprintf(stderr, "  First 10 quantized Y coeffs: ");
//            for (int i = 0; i < 10 && i < coeff_count; i++) {
//                fprintf(stderr, "%d ", quantized_y[i]);
//            }
//            fprintf(stderr, "\n");
//
             // Check for any large quantized values that should produce bright pixels
//            int max_quant_y = 0;
//            for (int i = 0; i < coeff_count; i++) {
//                int abs_val = quantized_y[i] < 0 ? -quantized_y[i] : quantized_y[i];
//                if (abs_val > max_quant_y) max_quant_y = abs_val;
//            }
//            fprintf(stderr, "  Max quantized Y coefficient: %d\n", max_quant_y);
//        }

        // Dequantize (perceptual for versions 5-8, uniform for 1-4)
        const int is_perceptual = (decoder->header.version >= 5 && decoder->header.version <= 8);
        const int is_ezbc = (decoder->header.entropy_coder == 1);

        if (is_ezbc) {
            // EZBC mode: coefficients are already denormalized by encoder
            // Just convert int16 to float without multiplying by quantizer
            for (int i = 0; i < coeff_count; i++) {
                decoder->dwt_buffer_y[i] = (float)quantized_y[i];
                decoder->dwt_buffer_co[i] = (float)quantized_co[i];
                decoder->dwt_buffer_cg[i] = (float)quantized_cg[i];
            }
        } else if (is_perceptual) {
            dequantize_dwt_subbands_perceptual(0, qy, quantized_y, decoder->dwt_buffer_y,
                                              decoder->header.width, decoder->header.height,
                                              decoder->header.decomp_levels, qy, 0, decoder->frame_count);

            // Debug: Check if values survived the function call
//            if (decoder->frame_count == 32) {
//                fprintf(stderr, "  RIGHT AFTER dequantize_Y returns: first 5 values: %.1f %.1f %.1f %.1f %.1f\n",
//                       decoder->dwt_buffer_y[0], decoder->dwt_buffer_y[1], decoder->dwt_buffer_y[2],
//                       decoder->dwt_buffer_y[3], decoder->dwt_buffer_y[4]);
//            }

            dequantize_dwt_subbands_perceptual(0, qy, quantized_co, decoder->dwt_buffer_co,
                                              decoder->header.width, decoder->header.height,
                                              decoder->header.decomp_levels, qco, 1, decoder->frame_count);
            dequantize_dwt_subbands_perceptual(0, qy, quantized_cg, decoder->dwt_buffer_cg,
                                              decoder->header.width, decoder->header.height,
                                              decoder->header.decomp_levels, qcg, 1, decoder->frame_count);
        } else {
            for (int i = 0; i < coeff_count; i++) {
                decoder->dwt_buffer_y[i] = quantized_y[i] * qy;
                decoder->dwt_buffer_co[i] = quantized_co[i] * qco;
                decoder->dwt_buffer_cg[i] = quantized_cg[i] * qcg;
            }
        }

        // Debug: Check dequantized values using correct subband layout
//        if (decoder->frame_count == 32) {
//            dwt_subband_info_t subbands[32];
//            const int subband_count = calculate_subband_layout(decoder->header.width, decoder->header.height,
//                                                              decoder->header.decomp_levels, subbands);
//
             // Find LL band (highest level, type 0)
//            for (int s = 0; s < subband_count; s++) {
//                if (subbands[s].level == decoder->header.decomp_levels && subbands[s].subband_type == 0) {
//                    fprintf(stderr, "  LL band: level=%d, start=%d, count=%d\n",
//                           subbands[s].level, subbands[s].coeff_start, subbands[s].coeff_count);
//                    fprintf(stderr, "    Reading LL first 5 from dwt_buffer_y[0-4]: %.1f %.1f %.1f %.1f %.1f\n",
//                           decoder->dwt_buffer_y[0], decoder->dwt_buffer_y[1], decoder->dwt_buffer_y[2],
//                           decoder->dwt_buffer_y[3], decoder->dwt_buffer_y[4]);
//
                     // Find max in CORRECT LL band
//                    float max_ll = -999.0f;
//                    for (int i = 0; i < subbands[s].coeff_count; i++) {
//                        int idx = subbands[s].coeff_start + i;
//                        if (decoder->dwt_buffer_y[idx] > max_ll) max_ll = decoder->dwt_buffer_y[idx];
//                    }
//                    fprintf(stderr, "  Max LL coefficient BEFORE grain removal: %.1f\n", max_ll);
//                    break;
//                }
//            }
//        }

        // Remove grain synthesis from Y channel (must happen after dequantization, before inverse DWT)
        remove_grain_synthesis_decoder(decoder->dwt_buffer_y, decoder->header.width, decoder->header.height,
                                      decoder->header.decomp_levels, decoder->frame_count, decoder->header.quantiser_y);

        // Debug: Check LL band AFTER grain removal
//        if (decoder->frame_count == 32) {
//            int ll_width = decoder->header.width;
//            int ll_height = decoder->header.height;
//            for (int l = 0; l < decoder->header.decomp_levels; l++) {
//                ll_width = (ll_width + 1) / 2;
//                ll_height = (ll_height + 1) / 2;
//            }
//            float max_ll = -999.0f;
//            for (int i = 0; i < ll_width * ll_height; i++) {
//                if (decoder->dwt_buffer_y[i] > max_ll) max_ll = decoder->dwt_buffer_y[i];
//            }
//            fprintf(stderr, "  Max LL coefficient AFTER grain removal: %.1f\n", max_ll);
//        }

        // Apply inverse DWT with correct non-power-of-2 dimension handling
        // Note: quantized arrays freed at write_frame label
        apply_inverse_dwt_multilevel(decoder->dwt_buffer_y, decoder->header.width, decoder->header.height,
                                   decoder->header.decomp_levels, decoder->header.wavelet_filter);
        apply_inverse_dwt_multilevel(decoder->dwt_buffer_co, decoder->header.width, decoder->header.height,
                                   decoder->header.decomp_levels, decoder->header.wavelet_filter);
        apply_inverse_dwt_multilevel(decoder->dwt_buffer_cg, decoder->header.width, decoder->header.height,
                                   decoder->header.decomp_levels, decoder->header.wavelet_filter);

        // Debug: Check spatial domain values after IDWT
//        if (decoder->frame_count == 32) {
//            float max_y_spatial = -999.0f;
//            for (int i = 0; i < decoder->frame_size; i++) {
//                if (decoder->dwt_buffer_y[i] > max_y_spatial) max_y_spatial = decoder->dwt_buffer_y[i];
//            }
//            fprintf(stderr, "  Max Y in spatial domain AFTER IDWT: %.1f\n", max_y_spatial);
//        }

        // Debug: Check spatial domain values after IDWT (original debug)
//        if (decoder->frame_count < 1) {
//            fprintf(stderr, "  After IDWT - First 10 Y values: ");
//            for (int i = 0; i < 10 && i < decoder->frame_size; i++) {
//                fprintf(stderr, "%.1f ", decoder->dwt_buffer_y[i]);
//            }
//            fprintf(stderr, "\n");
//            fprintf(stderr, "  Y range: min=%.1f, max=%.1f\n",
//                   decoder->dwt_buffer_y[0], decoder->dwt_buffer_y[decoder->frame_size-1]);
//        }

        // Handle P-frame delta accumulation (in YCoCg float space)
        if (packet_type == TAV_PACKET_PFRAME && mode == TAV_MODE_DELTA) {
            for (int i = 0; i < decoder->frame_size; i++) {
                decoder->dwt_buffer_y[i] += decoder->reference_ycocg_y[i];
                decoder->dwt_buffer_co[i] += decoder->reference_ycocg_co[i];
                decoder->dwt_buffer_cg[i] += decoder->reference_ycocg_cg[i];
            }
        }

        // Convert YCoCg-R/ICtCp to RGB
        const int is_ictcp = (decoder->header.version % 2 == 0);
        float max_y = -999, max_co = -999, max_cg = -999;
        int max_r = 0, max_g = 0, max_b = 0;

        for (int i = 0; i < decoder->frame_size; i++) {
            uint8_t r, g, b;
            if (is_ictcp) {
                ictcp_to_rgb(decoder->dwt_buffer_y[i],
                           decoder->dwt_buffer_co[i],
                           decoder->dwt_buffer_cg[i], &r, &g, &b);
            } else {
                ycocg_r_to_rgb(decoder->dwt_buffer_y[i],
                             decoder->dwt_buffer_co[i],
                             decoder->dwt_buffer_cg[i], &r, &g, &b);
            }

            // Track max values for debugging
//            if (decoder->frame_count == 1000) {
//                if (decoder->dwt_buffer_y[i] > max_y) max_y = decoder->dwt_buffer_y[i];
//                if (decoder->dwt_buffer_co[i] > max_co) max_co = decoder->dwt_buffer_co[i];
//                if (decoder->dwt_buffer_cg[i] > max_cg) max_cg = decoder->dwt_buffer_cg[i];
//                if (r > max_r) max_r = r;
//                if (g > max_g) max_g = g;
//                if (b > max_b) max_b = b;
//            }

            // RGB byte order for FFmpeg rgb24
            decoder->current_frame_rgb[i * 3 + 0] = r;
            decoder->current_frame_rgb[i * 3 + 1] = g;
            decoder->current_frame_rgb[i * 3 + 2] = b;
        }

//        if (decoder->frame_count == 1000) {
//            fprintf(stderr, "\n=== Frame 1000 Value Analysis ===\n");
//            fprintf(stderr, "Max YCoCg values: Y=%.1f, Co=%.1f, Cg=%.1f\n", max_y, max_co, max_cg);
//            fprintf(stderr, "Max RGB values: R=%d, G=%d, B=%d\n", max_r, max_g, max_b);
//        }

        // Debug: Check RGB output
//        if (decoder->frame_count < 1) {
//            fprintf(stderr, "  First 5 pixels RGB: ");
//            for (int i = 0; i < 5 && i < decoder->frame_size; i++) {
//                fprintf(stderr, "(%d,%d,%d) ",
//                       decoder->current_frame_rgb[i*3],
//                       decoder->current_frame_rgb[i*3+1],
//                       decoder->current_frame_rgb[i*3+2]);
//            }
//            fprintf(stderr, "\n");
//        }

        // Update reference YCoCg frame
        memcpy(decoder->reference_ycocg_y, decoder->dwt_buffer_y, decoder->frame_size * sizeof(float));
        memcpy(decoder->reference_ycocg_co, decoder->dwt_buffer_co, decoder->frame_size * sizeof(float));
        memcpy(decoder->reference_ycocg_cg, decoder->dwt_buffer_cg, decoder->frame_size * sizeof(float));
    }

    // Update reference frame
    memcpy(decoder->reference_frame_rgb, decoder->current_frame_rgb, decoder->frame_size * 3);

write_frame:
    // Clean up temporary allocations
    if (compressed_data) free(compressed_data);
    if (decompressed_data) free(decompressed_data);
    if (quantized_y) free(quantized_y);
    if (quantized_co) free(quantized_co);
    if (quantized_cg) free(quantized_cg);

    // If decoding failed, fill frame with black to maintain stream alignment
    if (!decode_success) {
        memset(decoder->current_frame_rgb, 0, decoder->frame_size * 3);
        fprintf(stderr, "Warning: Writing black frame %d due to decode error\n", decoder->frame_count);
    }

    // Write frame to video pipe with retry on partial writes (ALWAYS write to maintain alignment)
    size_t bytes_to_write = decoder->frame_size * 3;
    size_t total_written = 0;
    const uint8_t *write_ptr = decoder->current_frame_rgb;

    while (total_written < bytes_to_write) {
        size_t bytes_written = fwrite(write_ptr + total_written, 1,
                                     bytes_to_write - total_written,
                                     decoder->video_pipe);
        if (bytes_written == 0) {
            if (ferror(decoder->video_pipe)) {
                fprintf(stderr, "Error: Pipe write error at frame %d (wrote %zu/%zu bytes) - aborting\n",
                       decoder->frame_count, total_written, bytes_to_write);
                // Cannot maintain stream alignment if pipe is broken - this is fatal
                return -1;
            }
            // Pipe might be full, flush and retry
            fflush(decoder->video_pipe);
            usleep(1000); // 1ms delay
        } else {
            total_written += bytes_written;
        }
    }

    // Ensure data is flushed to FFmpeg
    if (fflush(decoder->video_pipe) != 0) {
        fprintf(stderr, "Error: Failed to flush video pipe at frame %d - aborting\n", decoder->frame_count);
        // Cannot maintain stream alignment if pipe is broken - this is fatal
        return -1;
    }

    decoder->frame_count++;
    // Return success only if decoding succeeded; still return 1 to continue processing
    // (we wrote a frame either way to maintain stream alignment)
    return decode_success ? 1 : 1;  // Always return 1 to continue, errors are non-fatal now
}

//=============================================================================
// Main Decoding Loop
//=============================================================================

// Print the command-line help text to stdout.
//
// prog: program name substituted into the "Usage:" line (main passes argv[0]).
//
// The banner, version and usage lines need formatting; everything after them
// is fixed text, kept in a table and emitted verbatim in order.
static void print_usage(const char *prog) {
    static const char *const help_lines[] = {
        // Command-line options
        "Options:\n",
        "  -i <file>    Input TAV file\n",
        "  -o <file>    Output MKV file (FFV1 video + PCMu8 audio)\n",
        "  -v           Verbose output\n",
        "  -h, --help   Show this help\n\n",
        // Feature set this decoder handles
        "Supported features (matches TSVM decoder):\n",
        "  - I-frames and P-frames (delta mode)\n",
        "  - GOP unified 3D DWT (temporal compression)\n",
        "  - TAD audio (decoded to PCMu8)\n",
        "  - MP2 audio (passed through)\n",
        "  - All wavelet types (5/3, 9/7, CDF 13/7, DD-4, Haar)\n",
        "  - Perceptual quantization (versions 5-8)\n",
        "  - YCoCg-R and ICtCp color spaces\n\n",
        // Feature set this decoder does not handle
        "Unsupported features (not in TSVM decoder):\n",
        "  - MC-EZBC motion compensation\n",
        "  - MPEG-style residual coding (P/B-frames)\n",
        "  - Adaptive block partitioning\n\n",
    };
    const size_t line_total = sizeof(help_lines) / sizeof(help_lines[0]);

    // Header: banner, decoder version string, then the usage synopsis
    printf("TAV Decoder - Converts TAV video to FFV1+PCMu8 in MKV container\n");
    printf("Version: %s\n\n", DECODER_VENDOR_STRING);
    printf("Usage: %s -i input.tav -o output.mkv\n\n", prog);

    // Fixed body text contains no format specifiers, so write it as-is
    for (size_t idx = 0; idx < line_total; idx++) {
        fputs(help_lines[idx], stdout);
    }
}

int main(int argc, char *argv[]) {
    // Ignore SIGPIPE to prevent process termination if FFmpeg exits early
    signal(SIGPIPE, SIG_IGN);

    char *input_file = NULL;
    char *output_file = NULL;
    int verbose = 0;

    static struct option long_options[] = {
        {"help", no_argument, 0, 'h'},
        {0, 0, 0, 0}
    };

    int opt;
    while ((opt = getopt_long(argc, argv, "i:o:vh", long_options, NULL)) != -1) {
        switch (opt) {
            case 'i':
                input_file = optarg;
                break;
            case 'o':
                output_file = optarg;
                break;
            case 'v':
                verbose = 1;
                break;
            case 'h':
                print_usage(argv[0]);
                return 0;
            default:
                print_usage(argv[0]);
                return 1;
        }
    }

    if (!input_file || !output_file) {
        fprintf(stderr, "Error: Both input and output files are required\n\n");
        print_usage(argv[0]);
        return 1;
    }

    // Create temporary audio file path
    char temp_audio_file[256];
    snprintf(temp_audio_file, sizeof(temp_audio_file), "/tmp/tav_audio_%d.wav", getpid());

    // Pass 1: Extract audio to WAV file
    if (extract_audio_to_wav(input_file, temp_audio_file, verbose) < 0) {
        fprintf(stderr, "Failed to extract audio\n");
        unlink(temp_audio_file);  // Clean up temp file if it exists
        return 1;
    }

    // Pass 2: Decode video with audio file
    tav_decoder_t *decoder = tav_decoder_init(input_file, output_file, temp_audio_file);
    if (!decoder) {
        fprintf(stderr, "Failed to initialize decoder\n");
        unlink(temp_audio_file);  // Clean up temp file
        return 1;
    }

    if (verbose) {
        printf("TAV Decoder - %dx%d @ %dfps\n", decoder->header.width, decoder->header.height, decoder->header.fps);
        printf("Wavelet: %s, Levels: %d\n",
               decoder->header.wavelet_filter == 0 ? "5/3" :
               decoder->header.wavelet_filter == 1 ? "9/7" :
               decoder->header.wavelet_filter == 2 ? "CDF 13/7" :
               decoder->header.wavelet_filter == 16 ? "DD-4" :
               decoder->header.wavelet_filter == 255 ? "Haar" : "Unknown",
               decoder->header.decomp_levels);
        printf("Version: %d (%s, %s)\n", decoder->header.version,
               decoder->header.version % 2 == 0 ? "ICtCp" : "YCoCg-R",
               decoder->is_monoblock ? "monoblock" : "tiled");
        printf("Output: %s (FFV1 level 3 + PCMu8 @ 32 KHz)\n", output_file);
    }

    // Main decoding loop
    int result = 1;
    int total_packets = 0;
    int iframe_count = 0;
    while (result > 0) {
        // Check file position before reading packet
        long file_pos = ftell(decoder->input_fp);

        uint8_t packet_type;
        if (fread(&packet_type, 1, 1, decoder->input_fp) != 1) {
            if (verbose) {
                fprintf(stderr, "Reached EOF at file position %ld after %d packets\n", file_pos, total_packets);
            }
            result = 0; // EOF
            break;
        }

        total_packets++;

        if (verbose && total_packets <= 30) {
            fprintf(stderr, "Packet %d at file pos %ld: Type 0x%02X\n", total_packets, file_pos, packet_type);
        }

        // Handle sync packets (no size field)
        if (packet_type == TAV_PACKET_SYNC || packet_type == TAV_PACKET_SYNC_NTSC) {
            if (verbose && total_packets < 20) {
                fprintf(stderr, "Packet %d: SYNC (0x%02X)\n", total_packets, packet_type);
            }
            continue;
        }

        // Handle timecode packets (no size field, just 8 bytes of uint64 timecode)
        if (packet_type == TAV_PACKET_TIMECODE) {
            uint64_t timecode_ns;
            if (fread(&timecode_ns, 8, 1, decoder->input_fp) != 1) {
                fprintf(stderr, "Error: Failed to read timecode\n");
                result = -1;
                break;
            }
            if (verbose && total_packets < 20) {
                double timecode_sec = timecode_ns / 1000000000.0;
                fprintf(stderr, "Packet %d: TIMECODE (0x%02X) - %.6f seconds\n",
                       total_packets, packet_type, timecode_sec);
            }
            continue;
        }

        // Handle GOP sync packets (no size field, just 1 byte frame count)
        if (packet_type == TAV_PACKET_GOP_SYNC) {
            uint8_t gop_frame_count;
            if (fread(&gop_frame_count, 1, 1, decoder->input_fp) != 1) {
                fprintf(stderr, "Error: Failed to read GOP sync frame count\n");
                result = -1;
                break;
            }
            if (verbose) {
                fprintf(stderr, "Packet %d: GOP_SYNC (0x%02X) - %u frames from GOP\n",
                       total_packets, packet_type, gop_frame_count);
            }
            // Update decoder frame count (GOP already wrote frames)
            decoder->frame_count += gop_frame_count;
            continue;
        }

        // Handle GOP unified packets (custom format: 1-byte gop_size + 4-byte compressed_size)
        if (packet_type == TAV_PACKET_GOP_UNIFIED) {
            uint8_t gop_size;
            uint32_t compressed_size;
            if (fread(&gop_size, 1, 1, decoder->input_fp) != 1 ||
                fread(&compressed_size, 4, 1, decoder->input_fp) != 1) {
                fprintf(stderr, "Error: Failed to read GOP unified packet header\n");
                result = -1;
                break;
            }

            if (verbose) {
                fprintf(stderr, "Packet %d: GOP_UNIFIED (0x%02X), %u frames, %u bytes\n",
                       total_packets, packet_type, gop_size, compressed_size);
            }

            // Read compressed GOP data
            uint8_t *compressed_data = malloc(compressed_size);
            if (!compressed_data) {
                fprintf(stderr, "Error: Failed to allocate GOP compressed buffer (%u bytes)\n", compressed_size);
                result = -1;
                break;
            }

            if (fread(compressed_data, 1, compressed_size, decoder->input_fp) != compressed_size) {
                fprintf(stderr, "Error: Failed to read GOP compressed data\n");
                free(compressed_data);
                result = -1;
                break;
            }

            // Decompress with Zstd
            const size_t decompressed_bound = ZSTD_getFrameContentSize(compressed_data, compressed_size);
            if (decompressed_bound == ZSTD_CONTENTSIZE_ERROR || decompressed_bound == ZSTD_CONTENTSIZE_UNKNOWN) {
                fprintf(stderr, "Error: Invalid Zstd frame in GOP data\n");
                free(compressed_data);
                result = -1;
                break;
            }

            uint8_t *decompressed_data = malloc(decompressed_bound);
            if (!decompressed_data) {
                fprintf(stderr, "Error: Failed to allocate GOP decompressed buffer (%zu bytes)\n", decompressed_bound);
                free(compressed_data);
                result = -1;
                break;
            }

            const size_t decompressed_size = ZSTD_decompress(decompressed_data, decompressed_bound,
                                                            compressed_data, compressed_size);
            free(compressed_data);

            if (ZSTD_isError(decompressed_size)) {
                fprintf(stderr, "Error: Zstd decompression failed: %s\n", ZSTD_getErrorName(decompressed_size));
                free(decompressed_data);
                result = -1;
                break;
            }

            // Postprocess coefficients based on entropy_coder value
            const int num_pixels = decoder->header.width * decoder->header.height;
            int16_t ***quantized_gop;

            if (decoder->header.entropy_coder == 2) {
                // RAW format: simple concatenated int16 arrays
                if (verbose) {
                    fprintf(stderr, "  Using RAW postprocessing (entropy_coder=2)\n");
                }
                quantized_gop = postprocess_gop_raw(decompressed_data, decompressed_size,
                                                   gop_size, num_pixels, decoder->header.channel_layout);
            } else if (decoder->header.entropy_coder == 1) {
                // EZBC format: embedded zero-block coding
                if (verbose) {
                    fprintf(stderr, "  Using EZBC postprocessing (entropy_coder=1)\n");
                }
                quantized_gop = postprocess_gop_ezbc(decompressed_data, decompressed_size,
                                                    gop_size, num_pixels, decoder->header.channel_layout);
            } else {
                // Default: Twobitmap format (entropy_coder=0)
                if (verbose) {
                    fprintf(stderr, "  Using Twobitmap postprocessing (entropy_coder=0)\n");
                }
                quantized_gop = postprocess_gop_unified(decompressed_data, decompressed_size,
                                                       gop_size, num_pixels, decoder->header.channel_layout);
            }

            free(decompressed_data);

            if (!quantized_gop) {
                fprintf(stderr, "Error: Failed to postprocess GOP data\n");
                result = -1;
                break;
            }

            // Allocate GOP float buffers
            float **gop_y = malloc(gop_size * sizeof(float *));
            float **gop_co = malloc(gop_size * sizeof(float *));
            float **gop_cg = malloc(gop_size * sizeof(float *));

            for (int t = 0; t < gop_size; t++) {
                gop_y[t] = calloc(num_pixels, sizeof(float));
                gop_co[t] = calloc(num_pixels, sizeof(float));
                gop_cg[t] = calloc(num_pixels, sizeof(float));
            }

            // Dequantize with temporal scaling (perceptual quantization for versions 5-8)
            const int is_perceptual = (decoder->header.version >= 5 && decoder->header.version <= 8);
            const int is_ezbc = (decoder->header.entropy_coder == 1);
            const int temporal_levels = 2;  // Fixed for TAV GOP encoding

            for (int t = 0; t < gop_size; t++) {
                if (is_ezbc) {
                    // EZBC mode: coefficients are already denormalized by encoder
                    // Just convert int16 to float without multiplying by quantizer
                    for (int i = 0; i < num_pixels; i++) {
                        gop_y[t][i] = (float)quantized_gop[t][0][i];
                        gop_co[t][i] = (float)quantized_gop[t][1][i];
                        gop_cg[t][i] = (float)quantized_gop[t][2][i];
                    }

                    if (t == 0) {
                        // Debug first frame
                        int16_t max_y = 0, min_y = 0;
                        for (int i = 0; i < num_pixels; i++) {
                            if (quantized_gop[t][0][i] > max_y) max_y = quantized_gop[t][0][i];
                            if (quantized_gop[t][0][i] < min_y) min_y = quantized_gop[t][0][i];
                        }
                        fprintf(stderr, "[GOP-EZBC] Frame 0 Y coeffs range: [%d, %d], first 5: %d %d %d %d %d\n",
                               min_y, max_y,
                               quantized_gop[t][0][0], quantized_gop[t][0][1], quantized_gop[t][0][2],
                               quantized_gop[t][0][3], quantized_gop[t][0][4]);
                    }
                } else {
                    // Normal mode: multiply by quantizer
                    const int temporal_level = get_temporal_subband_level(t, gop_size, temporal_levels);
                    const float temporal_scale = get_temporal_quantizer_scale(temporal_level);

                    // CRITICAL: Must ROUND temporal quantizer to match encoder's roundf() behavior
                    const float base_q_y = roundf(decoder->header.quantiser_y * temporal_scale);
                    const float base_q_co = roundf(decoder->header.quantiser_co * temporal_scale);
                    const float base_q_cg = roundf(decoder->header.quantiser_cg * temporal_scale);

                    if (is_perceptual) {
                        dequantize_dwt_subbands_perceptual(0, decoder->header.quantiser_y,
                                                          quantized_gop[t][0], gop_y[t],
                                                          decoder->header.width, decoder->header.height,
                                                          decoder->header.decomp_levels, base_q_y, 0, decoder->frame_count + t);
                        dequantize_dwt_subbands_perceptual(0, decoder->header.quantiser_y,
                                                          quantized_gop[t][1], gop_co[t],
                                                          decoder->header.width, decoder->header.height,
                                                          decoder->header.decomp_levels, base_q_co, 1, decoder->frame_count + t);
                        dequantize_dwt_subbands_perceptual(0, decoder->header.quantiser_y,
                                                          quantized_gop[t][2], gop_cg[t],
                                                          decoder->header.width, decoder->header.height,
                                                          decoder->header.decomp_levels, base_q_cg, 1, decoder->frame_count + t);
                    } else {
                        // Uniform quantization for older versions
                        for (int i = 0; i < num_pixels; i++) {
                            gop_y[t][i] = quantized_gop[t][0][i] * base_q_y;
                            gop_co[t][i] = quantized_gop[t][1][i] * base_q_co;
                            gop_cg[t][i] = quantized_gop[t][2][i] * base_q_cg;
                        }
                    }
                }
            }

            // Free quantized coefficients
            for (int t = 0; t < gop_size; t++) {
                free(quantized_gop[t][0]);
                free(quantized_gop[t][1]);
                free(quantized_gop[t][2]);
                free(quantized_gop[t]);
            }
            free(quantized_gop);

            // Remove grain synthesis from Y channel for each GOP frame
            // This must happen after dequantization but before inverse DWT
            for (int t = 0; t < gop_size; t++) {
                remove_grain_synthesis_decoder(gop_y[t], decoder->header.width, decoder->header.height,
                                              decoder->header.decomp_levels, decoder->frame_count + t,
                                              decoder->header.quantiser_y);
            }

            // Apply inverse 3D DWT (spatial + temporal)
            apply_inverse_3d_dwt(gop_y, gop_co, gop_cg, decoder->header.width, decoder->header.height,
                               gop_size, decoder->header.decomp_levels, temporal_levels,
                               decoder->header.wavelet_filter);

            // Debug: Check spatial coefficients after inverse temporal DWT (before inverse spatial DWT)
//            if (is_ezbc) {
//                float max_y = 0.0f, min_y = 0.0f;
//                for (int i = 0; i < num_pixels; i++) {
//                    if (gop_y[0][i] > max_y) max_y = gop_y[0][i];
//                    if (gop_y[0][i] < min_y) min_y = gop_y[0][i];
//                }
//                fprintf(stderr, "[GOP-EZBC] After inverse temporal DWT, Frame 0 Y spatial coeffs range: [%.1f, %.1f], first 5: %.1f %.1f %.1f %.1f %.1f\n",
//                       min_y, max_y,
//                       gop_y[0][0], gop_y[0][1], gop_y[0][2], gop_y[0][3], gop_y[0][4]);
//            }

            // Convert to RGB (ICtCp for even header versions, YCoCg-R otherwise) and write all GOP frames
            const int is_ictcp = (decoder->header.version % 2 == 0);

            // DEBUG: Print frame size calculation
//            if (decoder->frame_count == 0) {
//                fprintf(stderr, "[DEBUG] decoder->frame_size=%d, decoder->header.width=%d, decoder->header.height=%d\n",
//                       decoder->frame_size, decoder->header.width, decoder->header.height);
//                fprintf(stderr, "[DEBUG] bytes_to_write=%zu (should be %d)\n",
//                       (size_t)decoder->frame_size * 3, decoder->header.width * decoder->header.height * 3);
//            }

            for (int t = 0; t < gop_size; t++) {
                // Allocate frame buffer
                uint8_t *frame_rgb = malloc(decoder->frame_size * 3);
                if (!frame_rgb) {
                    fprintf(stderr, "Error: Failed to allocate GOP frame buffer\n");
                    result = -1;
                    break;
                }

                // Convert to RGB
                for (int i = 0; i < decoder->frame_size; i++) {
                    uint8_t r, g, b;
                    if (is_ictcp) {
                        ictcp_to_rgb(gop_y[t][i], gop_co[t][i], gop_cg[t][i], &r, &g, &b);
                    } else {
                        ycocg_r_to_rgb(gop_y[t][i], gop_co[t][i], gop_cg[t][i], &r, &g, &b);
                    }
                    frame_rgb[i * 3 + 0] = r;
                    frame_rgb[i * 3 + 1] = g;
                    frame_rgb[i * 3 + 2] = b;
                }

                // Write frame to FFmpeg video pipe
                const size_t bytes_to_write = decoder->frame_size * 3;

                // DEBUG: Verify we're writing to correct pipe
//                if (decoder->frame_count == 0 && t == 0) {
//                    fprintf(stderr, "[DEBUG] Writing frame to video_pipe=%p, bytes_to_write=%zu\n",
//                           (void*)decoder->video_pipe, bytes_to_write);
//                    fprintf(stderr, "[DEBUG] First 10 RGB bytes: %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X\n",
//                           frame_rgb[0], frame_rgb[1], frame_rgb[2], frame_rgb[3], frame_rgb[4],
//                           frame_rgb[5], frame_rgb[6], frame_rgb[7], frame_rgb[8], frame_rgb[9]);
//                }

                const size_t bytes_written = fwrite(frame_rgb, 1, bytes_to_write, decoder->video_pipe);
                if (bytes_written != bytes_to_write) {
                    fprintf(stderr, "Error: Failed to write GOP frame %d to FFmpeg (wrote %zu/%zu bytes)\n",
                           t, bytes_written, bytes_to_write);
                    free(frame_rgb);
                    result = -1;
                    break;
                }
                fflush(decoder->video_pipe);

                free(frame_rgb);
            }

            // Free GOP buffers
            for (int t = 0; t < gop_size; t++) {
                free(gop_y[t]);
                free(gop_co[t]);
                free(gop_cg[t]);
            }
            free(gop_y);
            free(gop_co);
            free(gop_cg);

            // BUGFIX: Only break on error (result < 0), not on success (result = 1)
            if (result < 0) break;

            // GOP decoding doesn't update frame_count here - GOP_SYNC packet will do it
            if (verbose) {
                long pos_after_gop = ftell(decoder->input_fp);
                fprintf(stderr, "[DEBUG] After GOP: file pos = %ld, %d frames written (waiting for GOP_SYNC)\n",
                       pos_after_gop, gop_size);
            }

            continue;
        }

        // Handle TAD audio packets (already extracted in Pass 1, just skip)
        if (packet_type == TAV_PACKET_AUDIO_TAD) {
            uint16_t sample_count_wrapper;
            uint32_t payload_size_plus_7;
            fread(&sample_count_wrapper, 2, 1, decoder->input_fp);
            fread(&payload_size_plus_7, 4, 1, decoder->input_fp);

            // Skip TAD chunk (payload_size_plus_7 includes header and data)
            fseek(decoder->input_fp, payload_size_plus_7, SEEK_CUR);
            continue;
        }

        // Handle extended header (has 2-byte count, not 4-byte size)
        if (packet_type == TAV_PACKET_EXTENDED_HDR) {
            uint16_t num_pairs;
            if (fread(&num_pairs, 2, 1, decoder->input_fp) != 1) {
                fprintf(stderr, "Error: Failed to read extended header count\n");
                result = -1;
                break;
            }
            if (verbose && total_packets < 20) {
                fprintf(stderr, "Packet %d: EXTENDED_HDR (0x%02X), %u pairs - skipping\n",
                       total_packets, packet_type, num_pairs);
            }
            // Skip the key-value pairs
            // Format: each pair is [4-byte key][1-byte type][N-byte value]
            // We need to parse each pair to know its size
            for (int i = 0; i < num_pairs; i++) {
                uint8_t key[4];
                uint8_t value_type;
                if (fread(key, 1, 4, decoder->input_fp) != 4 ||
                    fread(&value_type, 1, 1, decoder->input_fp) != 1) {
                    fprintf(stderr, "Error: Failed to read extended header pair %d\n", i);
                    result = -1;
                    break;
                }
                // Determine value size based on type
                size_t value_size = 0;
                switch (value_type) {
                    case 0x00: value_size = 2; break;  // Int16
                    case 0x01: value_size = 3; break;  // Int24
                    case 0x02: value_size = 4; break;  // Int32
                    case 0x03: value_size = 6; break;  // Int48
                    case 0x04: value_size = 8; break;  // Int64
                    case 0x10: {  // Bytes with 2-byte length prefix
                        uint16_t str_len;
                        if (fread(&str_len, 2, 1, decoder->input_fp) != 1) {
                            fprintf(stderr, "Error: Failed to read string length\n");
                            result = -1;
                            break;
                        }
                        value_size = str_len;
                        break;
                    }
                    default:
                        fprintf(stderr, "Warning: Unknown extended header value type 0x%02X\n", value_type);
                        break;
                }
                // Skip the value
                if (value_size > 0) {
                    fseek(decoder->input_fp, value_size, SEEK_CUR);
                }
            }
            if (result < 0) break;
            continue;
        }

        // Read packet size (for remaining packet types with standard format)
        uint32_t packet_size;
        if (fread(&packet_size, 4, 1, decoder->input_fp) != 1) {
            fprintf(stderr, "Error: Failed to read packet size at packet %d (type 0x%02X)\n",
                   total_packets, packet_type);
            result = -1;
            break;
        }

        if (verbose && total_packets < 20) {
            fprintf(stderr, "Packet %d: Type 0x%02X, Size %u bytes\n", total_packets, packet_type, packet_size);
        }

        switch (packet_type) {
            case TAV_PACKET_IFRAME:
            case TAV_PACKET_PFRAME:
                iframe_count++;
                if (verbose && iframe_count <= 5) {
                    fprintf(stderr, "Processing %s (packet %d, size %u bytes)...\n",
                           packet_type == TAV_PACKET_IFRAME ? "I-frame" : "P-frame",
                           total_packets, packet_size);
                }
                result = decode_i_or_p_frame(decoder, packet_type, packet_size);
                if (result < 0) {
                    fprintf(stderr, "Error: Frame decoding failed at frame %d\n", decoder->frame_count);
                    break;
                }
                if (verbose && decoder->frame_count % 100 == 0) {
                    printf("Decoded frame %d\r", decoder->frame_count);
                    fflush(stdout);
                }
                break;

            case TAV_PACKET_AUDIO_MP2:
            case TAV_PACKET_AUDIO_TRACK:
                // MP2 audio / bundled audio track - not written to the audio pipe yet
                // Note: FFmpeg cannot decode MP2 from raw stream, so the payload is skipped for now
                if (verbose && total_packets < 20) {
                    fprintf(stderr, "Skipping MP2 audio packet (%u bytes) - not yet supported\n", packet_size);
                }
                fseek(decoder->input_fp, packet_size, SEEK_CUR);
                break;

            case TAV_PACKET_AUDIO_PCM8:
                // PCM8 audio - already extracted in Pass 1, just skip
                fseek(decoder->input_fp, packet_size, SEEK_CUR);
                break;

            case TAV_PACKET_SUBTITLE:
                // Skip subtitle packets
                fseek(decoder->input_fp, packet_size, SEEK_CUR);
                break;

            case TAV_PACKET_PFRAME_RESIDUAL:
            case TAV_PACKET_BFRAME_RESIDUAL:
                fprintf(stderr, "\nError: Unsupported packet type 0x%02X (MPEG-style motion compensation not supported)\n", packet_type);
                result = -1;
                break;

            default:
                fprintf(stderr, "\nWarning: Unknown packet type 0x%02X (skipping)\n", packet_type);
                fseek(decoder->input_fp, packet_size, SEEK_CUR);
                break;
        }
    }

    if (verbose) {
        printf("\nDecoded %d frames\n", decoder->frame_count);
    }

    tav_decoder_free(decoder);

    if (result < 0) {
        fprintf(stderr, "Decoding error occurred\n");
        unlink(temp_audio_file);  // Clean up temp file
        return 1;
    }

    printf("Successfully decoded to: %s\n", output_file);

    // Clean up temporary audio file
    if (unlink(temp_audio_file) == 0 && verbose) {
        fprintf(stderr, "Cleaned up temporary audio file: %s\n", temp_audio_file);
    }

    return 0;
}