// Created by CuriousTorvald and Claude on 2025-10-24. // TAD32 (Terrarum Advanced Audio - PCM32f version) Encoder Library // Alternative version: PCM32f throughout encoding, PCM8 conversion only at decoder // This file contains only the encoding functions for comparison testing #include #include #include #include #include #include #include "encoder_tad.h" // Undefine the macro version from header and define as array #undef TAD32_COEFF_SCALARS // Coefficient scalars for each subband (CDF 9/7 with 9 decomposition levels) // Index 0 = LL band, Index 1-9 = H bands (L9 to L1) static const float TAD32_COEFF_SCALARS[] = {64.0f, 45.255f, 32.0f, 22.627f, 16.0f, 11.314f, 8.0f, 5.657f, 4.0f, 2.828f}; // Base quantiser weight table (10 subbands: LL + 9 H bands) // These weights are multiplied by quantiser_scale during quantisation static const float BASE_QUANTISER_WEIGHTS[2][10] = { { // mid channel 4.0f, // LL (L9) DC 2.0f, // H (L9) 31.25 hz 1.8f, // H (L8) 62.5 hz 1.6f, // H (L7) 125 hz 1.4f, // H (L6) 250 hz 1.2f, // H (L5) 500 hz 1.0f, // H (L4) 1 khz 1.0f, // H (L3) 2 khz 1.3f, // H (L2) 4 khz 2.0f // H (L1) 8 khz }, { // side channel 6.0f, // LL (L9) DC 5.0f, // H (L9) 31.25 hz 2.6f, // H (L8) 62.5 hz 2.4f, // H (L7) 125 hz 1.8f, // H (L6) 250 hz 1.3f, // H (L5) 500 hz 1.0f, // H (L4) 1 khz 1.0f, // H (L3) 2 khz 1.6f, // H (L2) 4 khz 3.2f // H (L1) 8 khz }}; // target: before quantisation static const float DEADBANDS[2][10] = { { // mid channel 0.20f, // LL (L9) DC 0.06f, // H (L9) 31.25 hz 0.06f, // H (L8) 62.5 hz 0.06f, // H (L7) 125 hz 0.06f, // H (L6) 250 hz 0.04f, // H (L5) 500 hz 0.04f, // H (L4) 1 khz 0.01f, // H (L3) 2 khz 0.01f, // H (L2) 4 khz 0.01f // H (L1) 8 khz }, { // side channel 0.20f, // LL (L9) DC 0.06f, // H (L9) 31.25 hz 0.06f, // H (L8) 62.5 hz 0.06f, // H (L7) 125 hz 0.06f, // H (L6) 250 hz 0.04f, // H (L5) 500 hz 0.04f, // H (L4) 1 khz 0.01f, // H (L3) 2 khz 0.01f, // H (L2) 4 khz 0.01f // H (L1) 8 khz }}; static inline float FCLAMP(float x, 
float min, float max) { return x < min ? min : (x > max ? max : x); } // Calculate DWT levels from chunk size static int calculate_dwt_levels(int chunk_size) { /*if (chunk_size < TAD32_MIN_CHUNK_SIZE) { fprintf(stderr, "Error: Chunk size %d is below minimum %d\n", chunk_size, TAD32_MIN_CHUNK_SIZE); return -1; } int levels = 0; int size = chunk_size; while (size > 1) { size >>= 1; levels++; } // For non-power-of-2, we need to add 1 to levels int pow2 = 1 << levels; if (pow2 < chunk_size) { levels++; } return levels - 2;*/ // Maximum decomposition return 9; } // Special marker for deadzoned coefficients (will be reconstructed with noise on decode) #define DEADZONE_MARKER_FLOAT (-999.0f) // Unmistakable marker in float domain #define DEADZONE_MARKER_QUANT (-128) // Maps to this in quantised domain (int8 minimum) // Perceptual epsilon - coefficients below this are truly zero (inaudible) #define EPSILON_PERCEPTUAL 0.001f static void apply_coeff_deadzone(int channel, float *coeffs, size_t num_samples) { // Apply deadzonning to each DWT subband using frequency-dependent thresholds // Instead of zeroing, mark small coefficients for stochastic reconstruction const int dwt_levels = 9; // Fixed to match encoder // Calculate subband boundaries (same logic as decoder) const int first_band_size = num_samples >> dwt_levels; int sideband_starts[11]; // dwt_levels + 2 sideband_starts[0] = 0; sideband_starts[1] = first_band_size; for (int i = 2; i <= dwt_levels + 1; i++) { sideband_starts[i] = sideband_starts[i - 1] + (first_band_size << (i - 2)); } // Apply deadzone threshold to each coefficient for (size_t i = 0; i < num_samples; i++) { // Determine which subband this coefficient belongs to int sideband = dwt_levels; // Default to highest frequency for (int s = 0; s <= dwt_levels; s++) { if (i < (size_t)sideband_starts[s + 1]) { sideband = s; break; } } // Get threshold for this subband and channel float threshold = DEADBANDS[channel][sideband]; float abs_coeff = fabsf(coeffs[i]); 
// If coefficient is within deadband AND perceptually non-zero, mark it if (abs_coeff > EPSILON_PERCEPTUAL && abs_coeff < threshold) { // Mark for stochastic reconstruction (decoder will add noise) coeffs[i] = 0.0f;//DEADZONE_MARKER_FLOAT; } // If below perceptual epsilon, truly zero it else if (abs_coeff <= EPSILON_PERCEPTUAL) { coeffs[i] = 0.0f; } // Otherwise keep coefficient unchanged } } //============================================================================= // DD-4 DWT Implementation //============================================================================= // Four-point interpolating Deslauriers-Dubuc (DD-4) wavelet forward 1D transform static void dwt_dd4_forward_1d(float *data, int length) { if (length < 2) return; float *temp = malloc(length * sizeof(float)); int half = (length + 1) / 2; // Split into even/odd samples for (int i = 0; i < half; i++) { temp[i] = data[2 * i]; // Even (low) } for (int i = 0; i < length / 2; i++) { temp[half + i] = data[2 * i + 1]; // Odd (high) } // DD-4 forward prediction step with four-point kernel for (int i = 0; i < length / 2; i++) { float s_m1, s_0, s_1, s_2; if (i > 0) s_m1 = temp[i - 1]; else s_m1 = temp[0]; // Mirror boundary s_0 = temp[i]; if (i + 1 < half) s_1 = temp[i + 1]; else s_1 = temp[half - 1]; if (i + 2 < half) s_2 = temp[i + 2]; else if (half > 1) s_2 = temp[half - 2]; else s_2 = temp[half - 1]; float prediction = (-1.0f/16.0f) * s_m1 + (9.0f/16.0f) * s_0 + (9.0f/16.0f) * s_1 + (-1.0f/16.0f) * s_2; temp[half + i] -= prediction; } // DD-4 update step for (int i = 0; i < half; i++) { float d_curr = (i < length / 2) ? temp[half + i] : 0.0f; float d_prev = (i > 0 && i - 1 < length / 2) ? 
temp[half + i - 1] : 0.0f; temp[i] += 0.25f * (d_prev + d_curr); } memcpy(data, temp, length * sizeof(float)); free(temp); } // 1D DWT using lifting scheme for 9/7 irreversible filter static void dwt_97_forward_1d(float *data, int length) { if (length < 2) return; float *temp = malloc(length * sizeof(float)); int half = (length + 1) / 2; // Handle odd lengths properly // Split into even/odd samples for (int i = 0; i < half; i++) { temp[i] = data[2 * i]; // Even (low) } for (int i = 0; i < length / 2; i++) { temp[half + i] = data[2 * i + 1]; // Odd (high) } // JPEG2000 9/7 forward lifting steps (corrected to match decoder) const float alpha = -1.586134342f; const float beta = -0.052980118f; const float gamma = 0.882911076f; const float delta = 0.443506852f; const float K = 1.230174105f; // Step 1: Predict α - d[i] += α * (s[i] + s[i+1]) for (int i = 0; i < length / 2; i++) { if (half + i < length) { float s_curr = temp[i]; float s_next = (i + 1 < half) ? temp[i + 1] : s_curr; temp[half + i] += alpha * (s_curr + s_next); } } // Step 2: Update β - s[i] += β * (d[i-1] + d[i]) for (int i = 0; i < half; i++) { float d_curr = (half + i < length) ? temp[half + i] : 0.0f; float d_prev = (i > 0 && half + i - 1 < length) ? temp[half + i - 1] : d_curr; temp[i] += beta * (d_prev + d_curr); } // Step 3: Predict γ - d[i] += γ * (s[i] + s[i+1]) for (int i = 0; i < length / 2; i++) { if (half + i < length) { float s_curr = temp[i]; float s_next = (i + 1 < half) ? temp[i + 1] : s_curr; temp[half + i] += gamma * (s_curr + s_next); } } // Step 4: Update δ - s[i] += δ * (d[i-1] + d[i]) for (int i = 0; i < half; i++) { float d_curr = (half + i < length) ? temp[half + i] : 0.0f; float d_prev = (i > 0 && half + i - 1 < length) ? 
temp[half + i - 1] : d_curr; temp[i] += delta * (d_prev + d_curr); } // Step 5: Scaling - s[i] *= K, d[i] /= K for (int i = 0; i < half; i++) { temp[i] *= K; // Low-pass coefficients } for (int i = 0; i < length / 2; i++) { if (half + i < length) { temp[half + i] /= K; // High-pass coefficients } } memcpy(data, temp, length * sizeof(float)); free(temp); } // Apply multi-level DWT (using DD-4 wavelet) static void dwt_forward_multilevel(float *data, int length, int levels) { int current_length = length; for (int level = 0; level < levels; level++) { // dwt_dd4_forward_1d(data, current_length); dwt_97_forward_1d(data, current_length); current_length = (current_length + 1) / 2; } } //============================================================================= // Pre-emphasis Filter //============================================================================= static void calculate_preemphasis_coeffs(float *b0, float *b1, float *a1) { // Simple first-order digital pre-emphasis // Corner frequency ≈ 1200 Hz (chosen for 32 kHz codec) // Provides ~6 dB/octave boost above corner // Pre-emphasis factor (0.95 = gentle, 0.90 = moderate, 0.85 = aggressive) const float alpha = 0.5f; // Gentle boost suitable for music *b0 = 1.0f; *b1 = -alpha; *a1 = 0.0f; // No feedback } // emphasis at alpha=0.5 shifts quantisation crackles to lower frequency which MIGHT be more preferable static void apply_preemphasis(float *left, float *right, size_t count) { // Static state variables - persistent across chunks to prevent discontinuities static float prev_x_l = 0.0f; static float prev_y_l = 0.0f; static float prev_x_r = 0.0f; static float prev_y_r = 0.0f; float b0, b1, a1; calculate_preemphasis_coeffs(&b0, &b1, &a1); // Left channel - use persistent state for (size_t i = 0; i < count; i++) { float x = left[i]; float y = b0 * x + b1 * prev_x_l - a1 * prev_y_l; left[i] = y; prev_x_l = x; prev_y_l = y; } // Right channel - use persistent state for (size_t i = 0; i < count; i++) { float x = 
right[i]; float y = b0 * x + b1 * prev_x_r - a1 * prev_y_r; right[i] = y; prev_x_r = x; prev_y_r = y; } } //============================================================================= // M/S Stereo Decorrelation (PCM32f version) //============================================================================= static void ms_decorrelate(const float *left, const float *right, float *mid, float *side, size_t count) { for (size_t i = 0; i < count; i++) { // Mid = (L + R) / 2, Side = (L - R) / 2 float l = left[i]; float r = right[i]; mid[i] = (l + r) / 2.0f; side[i] = (l - r) / 2.0f; } } static float signum(float x) { if (x > 0.0f) return 1.0f; if (x < 0.0f) return -1.0f; return 0.0f; } static void compress_gamma(float *left, float *right, size_t count) { for (size_t i = 0; i < count; i++) { // encode(x) = sign(x) * |x|^γ where γ=0.5 float x = left[i]; left[i] = signum(x) * powf(fabsf(x), 0.5f); float y = right[i]; right[i] = signum(y) * powf(fabsf(y), 0.5f); } } static void compress_mu_law(float *left, float *right, size_t count) { static float MU = 255.0f; for (size_t i = 0; i < count; i++) { // encode(x) = sign(x) * |x|^γ where γ=0.5 float x = left[i]; left[i] = signum(x) * logf(1.0f + MU * fabsf(x)) / logf(1.0f + MU); float y = right[i]; right[i] = signum(y) * logf(1.0f + MU * fabsf(y)) / logf(1.0f + MU); } } //============================================================================= // Quantisation with Frequency-Dependent Weighting //============================================================================= #define LAMBDA_FIXED 6.0f // Lambda-based companding encoder (based on Laplacian distribution CDF) // val must be normalised to [-1,1] // Returns quantised index in range [-127, +127] static int8_t lambda_companding(float val, int max_index) { // Handle zero if (fabsf(val) < 1e-9f) { return 0; } int sign = (val < 0) ? 
-1 : 1; float abs_val = fabsf(val); // Clamp to [0, 1] if (abs_val > 1.0f) abs_val = 1.0f; // Laplacian CDF for x >= 0: F(x) = 1 - 0.5 * exp(-λ*x) // Map to [0.5, 1.0] range (half of CDF for positive values) float cdf = 1.0f - 0.5f * expf(-LAMBDA_FIXED * abs_val); // Map CDF from [0.5, 1.0] to [0, 1] for positive half float normalised_cdf = (cdf - 0.5f) * 2.0f; // Quantise to index int index = (int)roundf(normalised_cdf * max_index); // Clamp index to valid range [0, max_index] if (index < 0) index = 0; if (index > max_index) index = max_index; return (int8_t)(sign * index); } static void quantise_dwt_coefficients(int channel, const float *coeffs, int8_t *quantised, size_t count, int apply_deadzone, int chunk_size, int dwt_levels, int max_index, int *current_subband_index, float quantiser_scale) { int first_band_size = chunk_size >> dwt_levels; int *sideband_starts = malloc((dwt_levels + 2) * sizeof(int)); sideband_starts[0] = 0; sideband_starts[1] = first_band_size; for (int i = 2; i <= dwt_levels + 1; i++) { sideband_starts[i] = sideband_starts[i-1] + (first_band_size << (i-2)); } for (size_t i = 0; i < count; i++) { int sideband = dwt_levels; for (int s = 0; s <= dwt_levels; s++) { if (i < (size_t)sideband_starts[s + 1]) { sideband = s; break; } } // Store subband index (LL=0, H1=1, H2=2, ..., H9=9 for dwt_levels=9) if (current_subband_index != NULL) { current_subband_index[i] = sideband; } // Check for deadzone marker (special handling) /*if (coeffs[i] == DEADZONE_MARKER_FLOAT) { // Map to special quantised marker for stochastic reconstruction quantised[i] = (int8_t)DEADZONE_MARKER_QUANT; } else {*/ // Normal quantisation float weight = BASE_QUANTISER_WEIGHTS[channel][sideband] * quantiser_scale; float val = (coeffs[i] / (TAD32_COEFF_SCALARS[sideband] * weight)); // val is normalised to [-1,1] int8_t quant_val = lambda_companding(val, max_index); quantised[i] = quant_val; // } } free(sideband_starts); } 
//=============================================================================
// Coefficient Statistics
//=============================================================================

// qsort comparator for floats, ascending.
static int compare_float(const void *a, const void *b) {
    float fa = *(const float*)a;
    float fb = *(const float*)b;
    if (fa < fb) return -1;
    if (fa > fb) return 1;
    return 0;
}

// Five-number summary plus fitted Laplacian rate for one subband.
typedef struct {
    float min;
    float q1;
    float median;
    float q3;
    float max;
    float lambda; // Laplacian distribution parameter (1/b, where b is scale)
} CoeffStats;

// Growable array of float DWT coefficients for one subband.
typedef struct {
    float *data;
    size_t count;
    size_t capacity;
} CoeffAccumulator;

// Growable array of quantised (int8) coefficients for one subband.
typedef struct {
    int8_t *data;
    size_t count;
    size_t capacity;
} QuantAccumulator;

// Global accumulators for statistics (one entry per subband; see init_statistics)
static CoeffAccumulator *mid_accumulators = NULL;
static CoeffAccumulator *side_accumulators = NULL;
static QuantAccumulator *mid_quant_accumulators = NULL;
static QuantAccumulator *side_quant_accumulators = NULL;
static int num_subbands = 0;
static int stats_initialised = 0;
static int stats_dwt_levels = 0;

// Allocate one accumulator per subband (LL + dwt_levels H bands) for mid and
// side channels, in both float and quantised variants. Idempotent: a second
// call is a no-op until tad32_free_statistics() resets the module.
// NOTE(review): calloc/malloc results are not checked — a failed allocation
// here would crash on first use.
static void init_statistics(int dwt_levels) {
    if (stats_initialised) return;
    num_subbands = dwt_levels + 1;
    stats_dwt_levels = dwt_levels;
    mid_accumulators = calloc(num_subbands, sizeof(CoeffAccumulator));
    side_accumulators = calloc(num_subbands, sizeof(CoeffAccumulator));
    mid_quant_accumulators = calloc(num_subbands, sizeof(QuantAccumulator));
    side_quant_accumulators = calloc(num_subbands, sizeof(QuantAccumulator));
    for (int i = 0; i < num_subbands; i++) {
        mid_accumulators[i].capacity = 1024;
        mid_accumulators[i].data = malloc(mid_accumulators[i].capacity * sizeof(float));
        mid_accumulators[i].count = 0;
        side_accumulators[i].capacity = 1024;
        side_accumulators[i].data = malloc(side_accumulators[i].capacity * sizeof(float));
        side_accumulators[i].count = 0;
        mid_quant_accumulators[i].capacity = 1024;
        mid_quant_accumulators[i].data = malloc(mid_quant_accumulators[i].capacity * sizeof(int8_t));
        mid_quant_accumulators[i].count = 0;
        side_quant_accumulators[i].capacity = 1024;
        side_quant_accumulators[i].data = malloc(side_quant_accumulators[i].capacity * sizeof(int8_t));
        side_quant_accumulators[i].count = 0;
    }
    stats_initialised = 1;
}

// Append each subband's slice of `coeffs` (one chunk's DWT output, laid out
// LL-first) to that subband's accumulator, growing capacity by doubling.
static void accumulate_coefficients(const float *coeffs, int dwt_levels, int chunk_size, CoeffAccumulator *accumulators) {
    int first_band_size = chunk_size >> dwt_levels;
    int *sideband_starts = malloc((dwt_levels + 2) * sizeof(int));
    sideband_starts[0] = 0;
    sideband_starts[1] = first_band_size;
    for (int i = 2; i <= dwt_levels + 1; i++) {
        sideband_starts[i] = sideband_starts[i-1] + (first_band_size << (i-2));
    }
    for (int s = 0; s <= dwt_levels; s++) {
        size_t start = sideband_starts[s];
        size_t end = sideband_starts[s + 1];
        size_t band_size = end - start;
        // Expand capacity if needed
        while (accumulators[s].count + band_size > accumulators[s].capacity) {
            accumulators[s].capacity *= 2;
            accumulators[s].data = realloc(accumulators[s].data, accumulators[s].capacity * sizeof(float));
        }
        // Copy coefficients
        memcpy(accumulators[s].data + accumulators[s].count, coeffs + start, band_size * sizeof(float));
        accumulators[s].count += band_size;
    }
    free(sideband_starts);
}

// Quantised-domain twin of accumulate_coefficients (int8_t payload).
static void accumulate_quantised(const int8_t *quant, int dwt_levels, int chunk_size, QuantAccumulator *accumulators) {
    int first_band_size = chunk_size >> dwt_levels;
    int *sideband_starts = malloc((dwt_levels + 2) * sizeof(int));
    sideband_starts[0] = 0;
    sideband_starts[1] = first_band_size;
    for (int i = 2; i <= dwt_levels + 1; i++) {
        sideband_starts[i] = sideband_starts[i-1] + (first_band_size << (i-2));
    }
    for (int s = 0; s <= dwt_levels; s++) {
        size_t start = sideband_starts[s];
        size_t end = sideband_starts[s + 1];
        size_t band_size = end - start;
        // Expand capacity if needed
        while (accumulators[s].count + band_size > accumulators[s].capacity) {
            accumulators[s].capacity *= 2;
            accumulators[s].data = realloc(accumulators[s].data, accumulators[s].capacity * sizeof(int8_t));
        }
        // Copy coefficients
        memcpy(accumulators[s].data + accumulators[s].count, quant + start, band_size * sizeof(int8_t));
        accumulators[s].count += band_size;
    }
    free(sideband_starts);
}

// Compute min/Q1/median/Q3/max of `coeffs` (via a sorted copy) and fit a
// Laplacian rate parameter. Writes zeros for an empty input.
static void calculate_coeff_stats(const float *coeffs, size_t count, CoeffStats *stats) {
    if (count == 0) {
        stats->min = stats->q1 = stats->median = stats->q3 = stats->max = 0.0f;
        stats->lambda = 0.0f;
        return;
    }
    // Copy coefficients for sorting
    float *sorted = malloc(count * sizeof(float));
    memcpy(sorted, coeffs, count * sizeof(float));
    qsort(sorted, count, sizeof(float), compare_float);
    stats->min = sorted[0];
    stats->max = sorted[count - 1];
    stats->median = sorted[count / 2];
    stats->q1 = sorted[count / 4];
    stats->q3 = sorted[(3 * count) / 4];
    free(sorted);
    // Estimate Laplacian distribution parameter λ = 1/b
    // For Laplacian centered at μ=0, MLE gives: b = mean(|x|)
    // Therefore: λ = 1/b = 1/mean(|x|)
    double sum_abs = 0.0;
    for (size_t i = 0; i < count; i++) {
        sum_abs += fabs(coeffs[i]);
    }
    double mean_abs = sum_abs / count;
    stats->lambda = (mean_abs > 1e-9) ?
(1.0f / mean_abs) : 0.0f; // guard against division by ~zero mean
}

#define HISTOGRAM_BINS 40
#define HISTOGRAM_WIDTH 60

// Render an ASCII histogram of `coeffs` to stderr (bins with more than 1% of
// the peak bin count only).
static void print_histogram(const float *coeffs, size_t count, const char *title) {
    if (count == 0) return;
    // Find min/max
    float min_val = coeffs[0];
    float max_val = coeffs[0];
    for (size_t i = 1; i < count; i++) {
        if (coeffs[i] < min_val) min_val = coeffs[i];
        if (coeffs[i] > max_val) max_val = coeffs[i];
    }
    // Handle case where all values are the same
    if (fabsf(max_val - min_val) < 1e-9f) {
        fprintf(stderr, "  %s: All values are %.3f\n", title, min_val);
        return;
    }
    // Create histogram bins
    size_t bins[HISTOGRAM_BINS] = {0};
    float bin_width = (max_val - min_val) / HISTOGRAM_BINS;
    for (size_t i = 0; i < count; i++) {
        int bin = (int)((coeffs[i] - min_val) / bin_width);
        if (bin >= HISTOGRAM_BINS) bin = HISTOGRAM_BINS - 1;
        if (bin < 0) bin = 0;
        bins[bin]++;
    }
    // Find max bin count for scaling
    size_t max_bin = 0;
    for (int i = 0; i < HISTOGRAM_BINS; i++) {
        if (bins[i] > max_bin) max_bin = bins[i];
    }
    // Print histogram
    fprintf(stderr, "  %s Histogram (range: %.3f to %.3f):\n", title, min_val, max_val);
    // Print only bins with significant content to keep output manageable
    for (int i = 0; i < HISTOGRAM_BINS; i++) {
        float bin_start = min_val + i * bin_width;
        float bin_end = bin_start + bin_width;
        int bar_width = (int)((bins[i] * HISTOGRAM_WIDTH) / max_bin);
        // Only print bins with significant content (> 1% of max)
        if (bins[i] > max_bin / 100) {
            fprintf(stderr, "    %8.3f-%8.3f [%7zu]: ", bin_start, bin_end, bins[i]);
            for (int j = 0; j < bar_width; j++) {
                fprintf(stderr, "█");
            }
            fprintf(stderr, "\n");
        }
    }
    fprintf(stderr, "\n");
}

// One quantised value and how often it occurred.
typedef struct {
    int8_t value;
    size_t count;
    float percentage;
} ValueFrequency;

// qsort comparator: descending by occurrence count.
static int compare_value_frequency(const void *a, const void *b) {
    const ValueFrequency *va = (const ValueFrequency*)a;
    const ValueFrequency *vb = (const ValueFrequency*)b;
    // Sort by count descending
    if (vb->count > va->count) return 1;
    if (vb->count < va->count) return -1;
    return 0;
}

// Print the most frequent quantised values (up to 100, despite the legacy
// "top5" name) with occurrence counts and percentages, to stderr.
static void print_top5_quantised_values(const int8_t *quant, size_t count, const char *title) {
    if (count == 0) {
        fprintf(stderr, "  %s: No data\n", title);
        return;
    }
    // For int8_t range is at most 256, so we can use direct indexing
    // Map from [-128, 127] to [0, 255]
    size_t freq[256] = {0};
    for (size_t i = 0; i < count; i++) {
        int idx = (int)quant[i] + 128;
        freq[idx]++;
    }
    // Find all unique values with their frequencies
    ValueFrequency values[256];
    int unique_count = 0;
    for (int i = 0; i < 256; i++) {
        if (freq[i] > 0) {
            values[unique_count].value = (int8_t)(i - 128);
            values[unique_count].count = freq[i];
            values[unique_count].percentage = (float)(freq[i] * 100.0) / count;
            unique_count++;
        }
    }
    // Sort by frequency
    qsort(values, unique_count, sizeof(ValueFrequency), compare_value_frequency);
    // Print up to 100 most frequent values
    fprintf(stderr, "  %s Top 100 Values:\n", title);
    int print_count = (unique_count < 100) ? unique_count : 100;
    for (int i = 0; i < print_count; i++) {
        fprintf(stderr, "    %6d: %8zu occurrences (%5.2f%%)\n",
                values[i].value, values[i].count, values[i].percentage);
    }
    fprintf(stderr, "\n");
}

// Dump all accumulated statistics (tables, histograms, value frequencies) for
// mid and side channels to stderr. No-op if statistics were never initialised.
void tad32_print_statistics(void) {
    if (!stats_initialised) return;
    fprintf(stderr, "\n=== TAD Coefficient Statistics (before quantisation) ===\n");
    // Print Mid channel statistics
    fprintf(stderr, "\nMid Channel:\n");
    fprintf(stderr, "%-12s %10s %10s %10s %10s %10s %10s %10s\n",
            "Subband", "Samples", "Min", "Q1", "Median", "Q3", "Max", "Lambda");
    fprintf(stderr, "----------------------------------------------------------------------------------------\n");
    for (int s = 0; s < num_subbands; s++) {
        CoeffStats stats;
        calculate_coeff_stats(mid_accumulators[s].data, mid_accumulators[s].count, &stats);
        char band_name[16];
        if (s == 0) {
            snprintf(band_name, sizeof(band_name), "LL (L%d)", stats_dwt_levels);
        } else {
            snprintf(band_name, sizeof(band_name), "H (L%d)", stats_dwt_levels - s + 1);
        }
        fprintf(stderr, "%-12s %10zu %10.3f %10.3f %10.3f %10.3f %10.3f %10.3f\n",
                band_name, mid_accumulators[s].count,
                stats.min, stats.q1, stats.median, stats.q3, stats.max, stats.lambda);
    }
    // Print Mid channel histograms
    fprintf(stderr, "\nMid Channel Histograms:\n");
    for (int s = 0; s < num_subbands; s++) {
        char band_name[32];
        if (s == 0) {
            snprintf(band_name, sizeof(band_name), "LL (L%d)", stats_dwt_levels);
        } else {
            snprintf(band_name, sizeof(band_name), "H (L%d)", stats_dwt_levels - s + 1);
        }
        print_histogram(mid_accumulators[s].data, mid_accumulators[s].count, band_name);
    }
    // Print Side channel statistics
    fprintf(stderr, "\nSide Channel:\n");
    fprintf(stderr, "%-12s %10s %10s %10s %10s %10s %10s %10s\n",
            "Subband", "Samples", "Min", "Q1", "Median", "Q3", "Max", "Lambda");
    fprintf(stderr, "----------------------------------------------------------------------------------------\n");
    for (int s = 0; s < num_subbands; s++) {
        CoeffStats stats;
        calculate_coeff_stats(side_accumulators[s].data, side_accumulators[s].count, &stats);
        char band_name[16];
        if (s == 0) {
            snprintf(band_name, sizeof(band_name), "LL (L%d)", stats_dwt_levels);
        } else {
            snprintf(band_name, sizeof(band_name), "H (L%d)", stats_dwt_levels - s + 1);
        }
        fprintf(stderr, "%-12s %10zu %10.3f %10.3f %10.3f %10.3f %10.3f %10.3f\n",
                band_name, side_accumulators[s].count,
                stats.min, stats.q1, stats.median, stats.q3, stats.max, stats.lambda);
    }
    // Print Side channel histograms
    fprintf(stderr, "\nSide Channel Histograms:\n");
    for (int s = 0; s < num_subbands; s++) {
        char band_name[32];
        if (s == 0) {
            snprintf(band_name, sizeof(band_name), "LL (L%d)", stats_dwt_levels);
        } else {
            snprintf(band_name, sizeof(band_name), "H (L%d)", stats_dwt_levels - s + 1);
        }
        print_histogram(side_accumulators[s].data, side_accumulators[s].count, band_name);
    }
    // Print quantised values statistics
    fprintf(stderr, "\n=== TAD Quantised Values Statistics (after quantisation) ===\n");
    // Print Mid channel quantised values
    fprintf(stderr, "\nMid Channel Quantised Values:\n");
    for (int s = 0; s < num_subbands; s++) {
        char band_name[32];
        if (s == 0) {
snprintf(band_name, sizeof(band_name), "LL (L%d)", stats_dwt_levels);
        } else {
            snprintf(band_name, sizeof(band_name), "H (L%d)", stats_dwt_levels - s + 1);
        }
        print_top5_quantised_values(mid_quant_accumulators[s].data, mid_quant_accumulators[s].count, band_name);
    }
    // Print Side channel quantised values
    fprintf(stderr, "\nSide Channel Quantised Values:\n");
    for (int s = 0; s < num_subbands; s++) {
        char band_name[32];
        if (s == 0) {
            snprintf(band_name, sizeof(band_name), "LL (L%d)", stats_dwt_levels);
        } else {
            snprintf(band_name, sizeof(band_name), "H (L%d)", stats_dwt_levels - s + 1);
        }
        print_top5_quantised_values(side_quant_accumulators[s].data, side_quant_accumulators[s].count, band_name);
    }
    fprintf(stderr, "\n");
}

// Release all statistics accumulators and reset module state so
// init_statistics() can run again.
void tad32_free_statistics(void) {
    if (!stats_initialised) return;
    for (int i = 0; i < num_subbands; i++) {
        free(mid_accumulators[i].data);
        free(side_accumulators[i].data);
        free(mid_quant_accumulators[i].data);
        free(side_quant_accumulators[i].data);
    }
    free(mid_accumulators);
    free(side_accumulators);
    free(mid_quant_accumulators);
    free(side_quant_accumulators);
    mid_accumulators = NULL;
    side_accumulators = NULL;
    mid_quant_accumulators = NULL;
    side_quant_accumulators = NULL;
    stats_initialised = 0;
}

//=============================================================================
// Binary Tree EZBC (1D Variant for TAD)
//=============================================================================

// NOTE(review): this #include directive lost its header name (likely
// <stdbool.h>, needed for the `bool` fields below) — restore it.
#include

// Bitstream writer for EZBC
typedef struct {
    uint8_t *data;
    size_t capacity;
    size_t byte_pos;
    uint8_t bit_pos; // 0-7, current bit position in current byte
} tad_bitstream_t;

// Block structure for 1D binary tree
typedef struct {
    int start;  // Start index in 1D array
    int length; // Block length
} tad_block_t;

// Queue for block processing
typedef struct {
    tad_block_t *blocks;
    size_t count;
    size_t capacity;
} tad_block_queue_t;

// Track coefficient state for refinement
typedef struct {
    bool significant;   // Has been marked significant
    int first_bitplane; // Bitplane where it became significant
} tad_coeff_state_t;

// Bitstream operations.
// NOTE(review): an initial_capacity of 0 can never grow (capacity *= 2 stays
// 0 in write_bit), so callers must pass a non-zero capacity — verify callers.
static void tad_bitstream_init(tad_bitstream_t *bs, size_t initial_capacity) {
    bs->capacity = initial_capacity;
    bs->data = calloc(1, initial_capacity);
    bs->byte_pos = 0;
    bs->bit_pos = 0;
}

// Append one bit (LSB-first within each byte), doubling the buffer as needed.
// NOTE(review): realloc result is unchecked — OOM would crash here.
static void tad_bitstream_write_bit(tad_bitstream_t *bs, int bit) {
    // Grow if needed
    if (bs->byte_pos >= bs->capacity) {
        bs->capacity *= 2;
        bs->data = realloc(bs->data, bs->capacity);
        // Clear new memory
        memset(bs->data + bs->byte_pos, 0, bs->capacity - bs->byte_pos);
    }
    if (bit) {
        bs->data[bs->byte_pos] |= (1 << bs->bit_pos);
    }
    bs->bit_pos++;
    if (bs->bit_pos == 8) {
        bs->bit_pos = 0;
        bs->byte_pos++;
    }
}

// Append the low num_bits of `value`, least significant bit first.
static void tad_bitstream_write_bits(tad_bitstream_t *bs, uint32_t value, int num_bits) {
    for (int i = 0; i < num_bits; i++) {
        tad_bitstream_write_bit(bs, (value >> i) & 1);
    }
}

// Number of bytes written so far (a partially-filled byte counts as one).
static size_t tad_bitstream_size(tad_bitstream_t *bs) {
    return bs->byte_pos + (bs->bit_pos > 0 ? 1 : 0);
}

static void tad_bitstream_free(tad_bitstream_t *bs) {
    free(bs->data);
}

// Block queue operations (simple append-only, doubling array)
static void tad_queue_init(tad_block_queue_t *q) {
    q->capacity = 1024;
    q->blocks = malloc(q->capacity * sizeof(tad_block_t));
    q->count = 0;
}

static void tad_queue_push(tad_block_queue_t *q, tad_block_t block) {
    if (q->count >= q->capacity) {
        q->capacity *= 2;
        q->blocks = realloc(q->blocks, q->capacity * sizeof(tad_block_t));
    }
    q->blocks[q->count++] = block;
}

static void tad_queue_free(tad_block_queue_t *q) {
    free(q->blocks);
}

// Check if all coefficients in block have |coeff| < threshold
static bool tad_is_zero_block(int8_t *coeffs, const tad_block_t *block, int threshold) {
    for (int i = block->start; i < block->start + block->length; i++) {
        if (abs(coeffs[i]) >= threshold) {
            return false;
        }
    }
    return true;
}

// Find maximum absolute coefficient value
static int tad_find_max_abs(int8_t *coeffs, size_t count) {
    int max_abs = 0;
    for (size_t i = 0; i < count; i++) {
        int abs_val = abs(coeffs[i]);
        if (abs_val > max_abs) {
            max_abs = abs_val;
        }
    }
    return max_abs;
}

// Get MSB position (bitplane number); returns 0 for value == 0.
static int tad_get_msb_bitplane(int value) {
    if (value == 0) return 0;
    int bitplane = 0;
    while (value > 1) {
        value >>= 1;
        bitplane++;
    }
    return bitplane;
}

// Context for recursive EZBC processing (shared state threaded through the
// significance-subdivision recursion).
typedef struct {
    tad_bitstream_t *bs;
    int8_t *coeffs;
    tad_coeff_state_t *states;
    int length;
    int bitplane;
    int threshold;
    tad_block_queue_t *next_insignificant;
    tad_block_queue_t *next_significant;
    int *sign_count;
} tad_ezbc_context_t;

// Recursively process a significant block - subdivide until size 1.
// Each child emits a significance bit; size-1 significant leaves emit a sign
// bit and join the significant queue for later refinement passes.
static void tad_process_significant_block_recursive(tad_ezbc_context_t *ctx, tad_block_t block) {
    // If size 1: emit sign bit and add to significant queue
    if (block.length == 1) {
        int idx = block.start;
        tad_bitstream_write_bit(ctx->bs, ctx->coeffs[idx] < 0 ? 1 : 0);
        (*ctx->sign_count)++;
        ctx->states[idx].significant = true;
        ctx->states[idx].first_bitplane = ctx->bitplane;
        tad_queue_push(ctx->next_significant, block);
        return;
    }
    // Block is > 1: subdivide into left and right halves
    int mid = block.length / 2;
    if (mid == 0) mid = 1;
    // Process left child
    tad_block_t left = {block.start, mid};
    if (!tad_is_zero_block(ctx->coeffs, &left, ctx->threshold)) {
        tad_bitstream_write_bit(ctx->bs, 1); // Significant
        tad_process_significant_block_recursive(ctx, left);
    } else {
        tad_bitstream_write_bit(ctx->bs, 0); // Insignificant
        tad_queue_push(ctx->next_insignificant, left);
    }
    // Process right child (if exists)
    if (block.length > mid) {
        tad_block_t right = {block.start + mid, block.length - mid};
        if (!tad_is_zero_block(ctx->coeffs, &right, ctx->threshold)) {
            tad_bitstream_write_bit(ctx->bs, 1);
            tad_process_significant_block_recursive(ctx, right);
        } else {
            tad_bitstream_write_bit(ctx->bs, 0);
            tad_queue_push(ctx->next_insignificant, right);
        }
    }
}

// Binary tree EZBC encoding for a single channel (1D variant)
// Made non-static for testing
size_t tad_encode_channel_ezbc(int8_t *coeffs, size_t count, uint8_t **output) {
    tad_bitstream_t bs;
    // NOTE(review): count < 4 yields an initial capacity of 0, which the
    // writer's doubling growth cannot recover from — confirm callers' counts.
    tad_bitstream_init(&bs, count / 4); // Initial guess
    // Track coefficient significance
    tad_coeff_state_t *states = calloc(count, sizeof(tad_coeff_state_t));
    // Find maximum value to determine MSB bitplane
    int max_abs = tad_find_max_abs(coeffs, count);
    int msb_bitplane = tad_get_msb_bitplane(max_abs);
    // Write header: MSB bitplane and length
    tad_bitstream_write_bits(&bs, msb_bitplane, 8);
    tad_bitstream_write_bits(&bs, (uint32_t)count, 16);
    // Initialise queues
    tad_block_queue_t insignificant_queue, next_insignificant;
    tad_block_queue_t significant_queue, next_significant;
    tad_queue_init(&insignificant_queue);
    tad_queue_init(&next_insignificant);
    tad_queue_init(&significant_queue);
    tad_queue_init(&next_significant);
    // Start with root block as insignificant
    tad_block_t root = {0, (int)count};
    tad_queue_push(&insignificant_queue, root);
    // Process bitplanes from MSB to LSB
    for (int bitplane = msb_bitplane; bitplane >= 0; bitplane--) {
        int threshold = 1 << bitplane;
        // Process insignificant blocks - check if they become significant
        for (size_t i = 0; i < insignificant_queue.count; i++) {
            tad_block_t block = insignificant_queue.blocks[i];
            if (tad_is_zero_block(coeffs, &block, threshold)) {
                // Still insignificant: emit 0
                tad_bitstream_write_bit(&bs, 0);
                // Keep in insignificant queue for next bitplane
                tad_queue_push(&next_insignificant, block);
            } else {
                // Became significant: emit 1
                tad_bitstream_write_bit(&bs, 1);
                // Use recursive subdivision
                int sign_count = 0;
                tad_ezbc_context_t ctx = {
                    .bs = &bs,
                    .coeffs = coeffs,
                    .states = states,
                    .length = (int)count,
                    .bitplane = bitplane,
                    .threshold = threshold,
                    .next_insignificant = &next_insignificant,
                    .next_significant = &next_significant,
                    .sign_count = &sign_count
                };
                tad_process_significant_block_recursive(&ctx, block);
            }
        }
        // Refinement pass: emit next bit for already-significant coefficients
        for (size_t i = 0; i < significant_queue.count; i++) {
            tad_block_t block = significant_queue.blocks[i];
int idx = block.start; // Emit refinement bit (bit at position 'bitplane') int bit = (abs(coeffs[idx]) >> bitplane) & 1; tad_bitstream_write_bit(&bs, bit); // Add to next_significant so it continues being refined tad_queue_push(&next_significant, block); } // Swap queues for next bitplane tad_block_queue_t temp_insig = insignificant_queue; insignificant_queue = next_insignificant; next_insignificant = temp_insig; next_insignificant.count = 0; // Clear for reuse tad_block_queue_t temp_sig = significant_queue; significant_queue = next_significant; next_significant = temp_sig; next_significant.count = 0; // Clear for reuse } // Cleanup queues tad_queue_free(&insignificant_queue); tad_queue_free(&next_insignificant); tad_queue_free(&significant_queue); tad_queue_free(&next_significant); free(states); // Copy bitstream to output size_t output_size = tad_bitstream_size(&bs); *output = malloc(output_size); memcpy(*output, bs.data, output_size); tad_bitstream_free(&bs); return output_size; } //============================================================================= // Public API: Chunk Encoding //============================================================================= size_t tad32_encode_chunk(const float *pcm32_stereo, size_t num_samples, int max_index, float quantiser_scale, int zstd_level, uint8_t *output) { // Calculate DWT levels from chunk size int dwt_levels = calculate_dwt_levels(num_samples); if (dwt_levels < 0) { fprintf(stderr, "Error: Invalid chunk size %zu\n", num_samples); return 0; } // Allocate working buffers (PCM32f throughout, int32 coefficients) float *pcm32_left = malloc(num_samples * sizeof(float)); float *pcm32_right = malloc(num_samples * sizeof(float)); float *pcm32_mid = malloc(num_samples * sizeof(float)); float *pcm32_side = malloc(num_samples * sizeof(float)); float *dwt_mid = malloc(num_samples * sizeof(float)); float *dwt_side = malloc(num_samples * sizeof(float)); int8_t *quant_mid = malloc(num_samples * sizeof(int8_t)); int8_t 
*quant_side = malloc(num_samples * sizeof(int8_t)); // Step 1: Deinterleave stereo for (size_t i = 0; i < num_samples; i++) { pcm32_left[i] = pcm32_stereo[i * 2]; pcm32_right[i] = pcm32_stereo[i * 2 + 1]; } // Step 1.1: Apply pre-emphasis filter (BEFORE gamma compression) apply_preemphasis(pcm32_left, pcm32_right, num_samples); // Step 1.2: Compress dynamic range compress_gamma(pcm32_left, pcm32_right, num_samples); // compress_mu_law(pcm32_left, pcm32_right, num_samples); // Step 2: M/S decorrelation ms_decorrelate(pcm32_left, pcm32_right, pcm32_mid, pcm32_side, num_samples); // Step 3: Convert to float and apply DWT for (size_t i = 0; i < num_samples; i++) { dwt_mid[i] = pcm32_mid[i]; dwt_side[i] = pcm32_side[i]; } dwt_forward_multilevel(dwt_mid, num_samples, dwt_levels); dwt_forward_multilevel(dwt_side, num_samples, dwt_levels); // Step 3.5: Accumulate coefficient statistics if enabled static int stats_enabled = -1; if (stats_enabled == -1) { stats_enabled = getenv("TAD_COEFF_STATS") != NULL; if (stats_enabled) { init_statistics(dwt_levels); } } if (stats_enabled) { accumulate_coefficients(dwt_mid, dwt_levels, num_samples, mid_accumulators); accumulate_coefficients(dwt_side, dwt_levels, num_samples, side_accumulators); } // apply_coeff_deadzone(0, dwt_mid, num_samples); // apply_coeff_deadzone(1, dwt_side, num_samples); // Step 4: Quantise with frequency-dependent weights and quantiser scaling quantise_dwt_coefficients(0, dwt_mid, quant_mid, num_samples, 1, num_samples, dwt_levels, max_index, NULL, quantiser_scale); quantise_dwt_coefficients(1, dwt_side, quant_side, num_samples, 1, num_samples, dwt_levels, max_index, NULL, quantiser_scale); // Step 4.5: Accumulate quantised coefficient statistics if enabled if (stats_enabled) { accumulate_quantised(quant_mid, dwt_levels, num_samples, mid_quant_accumulators); accumulate_quantised(quant_side, dwt_levels, num_samples, side_quant_accumulators); } // Step 5: Encode with binary tree EZBC (1D variant) - FIXED! 
uint8_t *mid_ezbc = NULL; uint8_t *side_ezbc = NULL; size_t mid_size = tad_encode_channel_ezbc(quant_mid, num_samples, &mid_ezbc); size_t side_size = tad_encode_channel_ezbc(quant_side, num_samples, &side_ezbc); // Concatenate EZBC outputs size_t uncompressed_size = mid_size + side_size; uint8_t *temp_buffer = malloc(uncompressed_size); memcpy(temp_buffer, mid_ezbc, mid_size); memcpy(temp_buffer + mid_size, side_ezbc, side_size); free(mid_ezbc); free(side_ezbc); // Step 6: Zstd compression (or bypass if zstd_level < 0) uint8_t *write_ptr = output; // Write chunk header *((uint16_t*)write_ptr) = (uint16_t)num_samples; write_ptr += sizeof(uint16_t); *write_ptr = (uint8_t)max_index; write_ptr += sizeof(uint8_t); uint32_t *payload_size_ptr = (uint32_t*)write_ptr; write_ptr += sizeof(uint32_t); size_t payload_size; int is_uncompressed = 0; if (zstd_level < 0) { // Bypass Zstd compression - use raw EZBC data payload_size = uncompressed_size; memcpy(write_ptr, temp_buffer, payload_size); is_uncompressed = 1; } else { // Normal Zstd compression int effective_level = (zstd_level > 0) ? 
zstd_level : TAD32_ZSTD_LEVEL; size_t zstd_bound = ZSTD_compressBound(uncompressed_size); uint8_t *zstd_buffer = malloc(zstd_bound); payload_size = ZSTD_compress(zstd_buffer, zstd_bound, temp_buffer, uncompressed_size, effective_level); if (ZSTD_isError(payload_size)) { fprintf(stderr, "Error: Zstd compression failed: %s\n", ZSTD_getErrorName(payload_size)); free(zstd_buffer); free(pcm32_left); free(pcm32_right); free(pcm32_mid); free(pcm32_side); free(dwt_mid); free(dwt_side); free(quant_mid); free(quant_side); free(temp_buffer); return 0; } memcpy(write_ptr, zstd_buffer, payload_size); free(zstd_buffer); } // Set payload size (MSB=1 indicates uncompressed data) uint32_t size_field = (uint32_t)payload_size; if (is_uncompressed) { size_field |= 0x80000000; } *payload_size_ptr = size_field; write_ptr += payload_size; // Cleanup free(pcm32_left); free(pcm32_right); free(pcm32_mid); free(pcm32_side); free(dwt_mid); free(dwt_side); free(quant_mid); free(quant_side); free(temp_buffer); return write_ptr - output; }