TAD: more wip

2026-06-06 21:44:04 +09:00 · 2025-10-26 02:30:51 +09:00
parent 52f25f7d04
commit 9fcb7fc95c
4 changed files with 563 additions and 37 deletions
--- a/video_encoder/decoder_tad.c
+++ b/video_encoder/decoder_tad.c
@@ -9,10 +9,15 @@
 #include <zstd.h>
 #include <getopt.h>

-#define DECODER_VENDOR_STRING "Decoder-TAD 20251023"
+#define DECODER_VENDOR_STRING "Decoder-TAD 20251026"

 // TAD format constants (must match encoder)
-#define TAD_COEFF_SCALAR 1024.0f
+#undef TAD32_COEFF_SCALARS
+
+// Coefficient scalars for each subband (CDF 9/7 with 9 decomposition levels)
+// Index 0 = LL band, Index 1-9 = H bands (L9 to L1)
+static const float TAD32_COEFF_SCALARS[] = {64.0f, 45.255f, 32.0f, 22.627f, 16.0f, 11.314f, 8.0f, 5.657f, 4.0f, 2.828f};
+
 #define TAD_DEFAULT_CHUNK_SIZE 32768
 #define TAD_MIN_CHUNK_SIZE 1024
 #define TAD_SAMPLE_RATE 32000
@@ -33,7 +38,7 @@ static inline float FCLAMP(float x, float min, float max) {

 // Calculate DWT levels from chunk size (must be power of 2, >= 1024)
 static int calculate_dwt_levels(int chunk_size) {
-    if (chunk_size < TAD_MIN_CHUNK_SIZE) {
+    /*if (chunk_size < TAD_MIN_CHUNK_SIZE) {
        fprintf(stderr, "Error: Chunk size %d is below minimum %d\n", chunk_size, TAD_MIN_CHUNK_SIZE);
        return -1;
    }
@@ -45,7 +50,8 @@ static int calculate_dwt_levels(int chunk_size) {
        size >>= 1;
        levels++;
    }
-    return levels - 2;
+    return levels - 2;*/
+    return 9;
 }

 //=============================================================================
@@ -71,6 +77,91 @@ static void dwt_haar_inverse_1d(float *data, int length) {
    free(temp);
 }

+// 9/7 inverse DWT (from TSVM Kotlin code)
+static void dwt_97_inverse_1d(float *data, int length) {
+    if (length < 2) return;
+
+    float *temp = malloc(length * sizeof(float));
+    int half = (length + 1) / 2;
+
+    // Split into low and high frequency components (matching TSVM layout)
+    for (int i = 0; i < half; i++) {
+        temp[i] = data[i];  // Low-pass coefficients (first half)
+    }
+    for (int i = 0; i < length / 2; i++) {
+        if (half + i < length) {
+            temp[half + i] = data[half + i];  // High-pass coefficients (second half)
+        }
+    }
+
+    // 9/7 inverse lifting coefficients from TSVM
+    const float alpha = -1.586134342f;
+    const float beta = -0.052980118f;
+    const float gamma = 0.882911076f;
+    const float delta = 0.443506852f;
+    const float K = 1.230174105f;
+
+    // Step 1: Undo scaling
+    for (int i = 0; i < half; i++) {
+        temp[i] /= K;  // Low-pass coefficients
+    }
+    for (int i = 0; i < length / 2; i++) {
+        if (half + i < length) {
+            temp[half + i] *= K;  // High-pass coefficients
+        }
+    }
+
+    // Step 2: Undo δ update
+    for (int i = 0; i < half; i++) {
+        float d_curr = (half + i < length) ? temp[half + i] : 0.0f;
+        float d_prev = (i > 0 && half + i - 1 < length) ? temp[half + i - 1] : d_curr;
+        temp[i] -= delta * (d_curr + d_prev);
+    }
+
+    // Step 3: Undo γ predict
+    for (int i = 0; i < length / 2; i++) {
+        if (half + i < length) {
+            float s_curr = temp[i];
+            float s_next = (i + 1 < half) ? temp[i + 1] : s_curr;
+            temp[half + i] -= gamma * (s_curr + s_next);
+        }
+    }
+
+    // Step 4: Undo β update
+    for (int i = 0; i < half; i++) {
+        float d_curr = (half + i < length) ? temp[half + i] : 0.0f;
+        float d_prev = (i > 0 && half + i - 1 < length) ? temp[half + i - 1] : d_curr;
+        temp[i] -= beta * (d_curr + d_prev);
+    }
+
+    // Step 5: Undo α predict
+    for (int i = 0; i < length / 2; i++) {
+        if (half + i < length) {
+            float s_curr = temp[i];
+            float s_next = (i + 1 < half) ? temp[i + 1] : s_curr;
+            temp[half + i] -= alpha * (s_curr + s_next);
+        }
+    }
+
+    // Reconstruction - interleave low and high pass
+    for (int i = 0; i < length; i++) {
+        if (i % 2 == 0) {
+            // Even positions: low-pass coefficients
+            data[i] = temp[i / 2];
+        } else {
+            // Odd positions: high-pass coefficients
+            int idx = i / 2;
+            if (half + idx < length) {
+                data[i] = temp[half + idx];
+            } else {
+                data[i] = 0.0f;
+            }
+        }
+    }
+
+    free(temp);
+}
+
 // Inverse 1D transform of Four-point interpolating Deslauriers-Dubuc (DD-4)
 static void dwt_dd4_inverse_1d(float *data, int length) {
    if (length < 2) return;
@@ -141,7 +232,8 @@ static void dwt_haar_inverse_multilevel(float *data, int length, int levels) {
        current_length *= 2;  // MULTIPLY FIRST: 128→256, 256→512, ..., 16384→32768
        if (current_length > length) current_length = length;
 //        dwt_haar_inverse_1d(data, current_length);  // THEN apply inverse
-        dwt_dd4_inverse_1d(data, current_length);  // THEN apply inverse
+//        dwt_dd4_inverse_1d(data, current_length);  // THEN apply inverse
+        dwt_97_inverse_1d(data, current_length);  // THEN apply inverse
    }
 }

@@ -159,23 +251,43 @@ static inline float tpdf1(void) {
    return (frand01() - frand01());
 }

-static void ms_correlate(const float *mid, const float *side, uint8_t *left, uint8_t *right, size_t count, float dither_error[2][2]) {
+static void ms_correlate(const float *mid, const float *side, float *left, float *right, size_t count) {
+    for (size_t i = 0; i < count; i++) {
+        // Decode M/S → L/R
+        float m = mid[i];
+        float s = side[i];
+        left[i] = FCLAMP((m + s) * 1.7321f, -1.0f, 1.0f);
+        right[i] = FCLAMP((m - s) * 1.7321f, -1.0f, 1.0f);
+    }
+}
+
+static float signum(float x) {
+    if (x > 0.0f) return 1.0f;
+    if (x < 0.0f) return -1.0f;
+    return 0.0f;
+}
+
+static void expand_gamma(float *left, float *right, size_t count) {
+    for (size_t i = 0; i < count; i++) {
+        // decode(y) = sign(y) * |y|^(1/γ) where γ=0.5
+        float x = left[i]; float a = fabsf(x);
+        left[i] = signum(x) * a * a;
+        float y = right[i]; float b = fabsf(y);
+        right[i] = signum(y) * b * b;
+    }
+}
+
+static void pcm32f_to_pcm8(const float *fleft, const float *fright, uint8_t *left, uint8_t *right, size_t count, float dither_error[2][2]) {
    const float b1 = 1.5f;   // 1st feedback coefficient
    const float b2 = -0.75f; // 2nd feedback coefficient
    const float scale = 127.5f;
    const float bias  = 128.0f;

    for (size_t i = 0; i < count; i++) {
-        // Decode M/S → L/R
-        float m = mid[i];
-        float s = side[i];
-        float l = FCLAMP(m + s, -1.0f, 1.0f);
-        float r = FCLAMP(m - s, -1.0f, 1.0f);
-
        // --- LEFT channel ---
        float feedbackL = b1 * dither_error[0][0] + b2 * dither_error[0][1];
        float ditherL = 0.5f * tpdf1(); // ±0.5 LSB TPDF
-        float shapedL = l + feedbackL + ditherL / scale;
+        float shapedL = fleft[i] + feedbackL + ditherL / scale;
        shapedL = FCLAMP(shapedL, -1.0f, 1.0f);

        int qL = (int)lrintf(shapedL * scale);
@@ -190,7 +302,7 @@ static void ms_correlate(const float *mid, const float *side, uint8_t *left, uin
        // --- RIGHT channel ---
        float feedbackR = b1 * dither_error[1][0] + b2 * dither_error[1][1];
        float ditherR = 0.5f * tpdf1();
-        float shapedR = r + feedbackR + ditherR / scale;
+        float shapedR = fright[i] + feedbackR + ditherR / scale;
        shapedR = FCLAMP(shapedR, -1.0f, 1.0f);

        int qR = (int)lrintf(shapedR * scale);
@@ -228,13 +340,15 @@ static void get_quantization_weights(int quality, int dwt_levels, float *weights
        /*15*/{0.2f, 0.2f, 0.8f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.25f, 1.5f, 1.5f}
    };

-    float quality_scale = 4.0f + FCLAMP((3 - quality) * 0.5f, 0.0f, 1000.0f);
+    float quality_scale = 1.0f * (1.0f + FCLAMP((5 - quality) * 0.5f, 0.0f, 1000.0f));

    for (int i = 0; i < dwt_levels; i++) {
-        weights[i] = FCLAMP(base_weights[dwt_levels][i] * quality_scale, 1.0f, 1000.0f);
+        weights[i] = 1.0f;//base_weights[dwt_levels][i] * quality_scale;
    }
 }

+#define QUANT_STEPS 8.0f // 64 -> [-64..64] -> 7 bits for LL
+
 static void dequantize_dwt_coefficients(const int16_t *quantized, float *coeffs, size_t count, int quality, int chunk_size, int dwt_levels) {
    float weights[16];
    get_quantization_weights(quality, dwt_levels, weights);
@@ -263,7 +377,7 @@ static void dequantize_dwt_coefficients(const int16_t *quantized, float *coeffs,
        if (weight_idx >= dwt_levels) weight_idx = dwt_levels - 1;

        float weight = weights[weight_idx];
-        coeffs[i] = (float)quantized[i] * weight / TAD_COEFF_SCALAR;
+        coeffs[i] = ((float)quantized[i] * TAD32_COEFF_SCALARS[sideband]) / (QUANT_STEPS * weight);
    }

    free(sideband_starts);
@@ -352,6 +466,8 @@ static int decode_chunk(const uint8_t *input, size_t input_size, uint8_t *pcmu8_
    int16_t *quant_side = malloc(sample_count * sizeof(int16_t));
    float *dwt_mid = malloc(sample_count * sizeof(float));
    float *dwt_side = malloc(sample_count * sizeof(float));
+    float *pcm32_left = malloc(sample_count * sizeof(float));
+    float *pcm32_right = malloc(sample_count * sizeof(float));
    uint8_t *pcm8_left = malloc(sample_count * sizeof(uint8_t));
    uint8_t *pcm8_right = malloc(sample_count * sizeof(uint8_t));

@@ -373,7 +489,13 @@ static int decode_chunk(const uint8_t *input, size_t input_size, uint8_t *pcmu8_
    float err[2][2] = {{0,0},{0,0}};

    // M/S to L/R correlation
-    ms_correlate(dwt_mid, dwt_side, pcm8_left, pcm8_right, sample_count, err);
+    ms_correlate(dwt_mid, dwt_side, pcm32_left, pcm32_right, sample_count);
+
+    // expand dynamic range
+//    expand_gamma(pcm32_left, pcm32_right, sample_count);
+
+    // dither to 8-bit
+    pcm32f_to_pcm8(pcm32_left, pcm32_right, pcm8_left, pcm8_right, sample_count, err);

    // Interleave stereo output (PCMu8)
    for (size_t i = 0; i < sample_count; i++) {
@@ -383,7 +505,7 @@ static int decode_chunk(const uint8_t *input, size_t input_size, uint8_t *pcmu8_

    // Cleanup
    free(quant_mid); free(quant_side); free(dwt_mid); free(dwt_side);
-    free(pcm8_left); free(pcm8_right);
+    free(pcm32_left); free(pcm32_right); free(pcm8_left); free(pcm8_right);
    if (decompressed) free(decompressed);

    return 0;
--- a/video_encoder/encoder_tad.c
+++ b/video_encoder/encoder_tad.c
@@ -11,13 +11,20 @@
 #include <zstd.h>
 #include "encoder_tad.h"

+// Undefine the macro version from header and define as array
+#undef TAD32_COEFF_SCALARS
+
+// Coefficient scalars for each subband (CDF 9/7 with 9 decomposition levels)
+// Index 0 = LL band, Index 1-9 = H bands (L9 to L1)
+static const float TAD32_COEFF_SCALARS[] = {64.0f, 45.255f, 32.0f, 22.627f, 16.0f, 11.314f, 8.0f, 5.657f, 4.0f, 2.828f};
+
 // Forward declarations for internal functions
 static void dwt_dd4_forward_1d(float *data, int length);
 static void dwt_dd4_forward_multilevel(float *data, int length, int levels);
 static void ms_decorrelate_16(const float *left, const float *right, float *mid, float *side, size_t count);
 static void get_quantization_weights(int quality, int dwt_levels, float *weights);
 static int get_deadzone_threshold(int quality);
-static void quantize_dwt_coefficients(const float *coeffs, int16_t *quantized, size_t count, int quality, int apply_deadzone, int chunk_size, int dwt_levels);
+static void quantize_dwt_coefficients(const float *coeffs, int16_t *quantized, size_t count, int quality, int apply_deadzone, int chunk_size, int dwt_levels, int *current_subband_index);
 static size_t encode_sigmap_2bit(const int16_t *values, size_t count, uint8_t *output);

 static inline float FCLAMP(float x, float min, float max) {
@@ -26,7 +33,7 @@ static inline float FCLAMP(float x, float min, float max) {

 // Calculate DWT levels from chunk size
 static int calculate_dwt_levels(int chunk_size) {
-    if (chunk_size < TAD32_MIN_CHUNK_SIZE) {
+    /*if (chunk_size < TAD32_MIN_CHUNK_SIZE) {
        fprintf(stderr, "Error: Chunk size %d is below minimum %d\n", chunk_size, TAD32_MIN_CHUNK_SIZE);
        return -1;
    }
@@ -44,7 +51,9 @@ static int calculate_dwt_levels(int chunk_size) {
        levels++;
    }

-    return levels - 2;  // Maximum decomposition
+    return levels - 2;*/  // Maximum decomposition
+
+    return 9;
 }

 //=============================================================================
@@ -99,11 +108,80 @@ static void dwt_dd4_forward_1d(float *data, int length) {
    free(temp);
 }

+// 1D DWT using lifting scheme for 9/7 irreversible filter
+static void dwt_97_forward_1d(float *data, int length) {
+    if (length < 2) return;
+
+    float *temp = malloc(length * sizeof(float));
+    int half = (length + 1) / 2;  // Handle odd lengths properly
+
+    // Split into even/odd samples
+    for (int i = 0; i < half; i++) {
+        temp[i] = data[2 * i];           // Even (low)
+    }
+    for (int i = 0; i < length / 2; i++) {
+        temp[half + i] = data[2 * i + 1]; // Odd (high)
+    }
+
+    // JPEG2000 9/7 forward lifting steps (corrected to match decoder)
+    const float alpha = -1.586134342f;
+    const float beta = -0.052980118f;
+    const float gamma = 0.882911076f;
+    const float delta = 0.443506852f;
+    const float K = 1.230174105f;
+
+    // Step 1: Predict α - d[i] += α * (s[i] + s[i+1])
+    for (int i = 0; i < length / 2; i++) {
+        if (half + i < length) {
+            float s_curr = temp[i];
+            float s_next = (i + 1 < half) ? temp[i + 1] : s_curr;
+            temp[half + i] += alpha * (s_curr + s_next);
+        }
+    }
+
+    // Step 2: Update β - s[i] += β * (d[i-1] + d[i])
+    for (int i = 0; i < half; i++) {
+        float d_curr = (half + i < length) ? temp[half + i] : 0.0f;
+        float d_prev = (i > 0 && half + i - 1 < length) ? temp[half + i - 1] : d_curr;
+        temp[i] += beta * (d_prev + d_curr);
+    }
+
+    // Step 3: Predict γ - d[i] += γ * (s[i] + s[i+1])
+    for (int i = 0; i < length / 2; i++) {
+        if (half + i < length) {
+            float s_curr = temp[i];
+            float s_next = (i + 1 < half) ? temp[i + 1] : s_curr;
+            temp[half + i] += gamma * (s_curr + s_next);
+        }
+    }
+
+    // Step 4: Update δ - s[i] += δ * (d[i-1] + d[i])
+    for (int i = 0; i < half; i++) {
+        float d_curr = (half + i < length) ? temp[half + i] : 0.0f;
+        float d_prev = (i > 0 && half + i - 1 < length) ? temp[half + i - 1] : d_curr;
+        temp[i] += delta * (d_prev + d_curr);
+    }
+
+    // Step 5: Scaling - s[i] *= K, d[i] /= K
+    for (int i = 0; i < half; i++) {
+        temp[i] *= K;  // Low-pass coefficients
+    }
+    for (int i = 0; i < length / 2; i++) {
+        if (half + i < length) {
+            temp[half + i] /= K;  // High-pass coefficients
+        }
+    }
+
+    memcpy(data, temp, length * sizeof(float));
+    free(temp);
+}
+
 // Apply multi-level DWT (using DD-4 wavelet)
 static void dwt_dd4_forward_multilevel(float *data, int length, int levels) {
    int current_length = length;
    for (int level = 0; level < levels; level++) {
-        dwt_dd4_forward_1d(data, current_length);
+//        dwt_dd4_forward_1d(data, current_length);
+        dwt_97_forward_1d(data, current_length);
        current_length = (current_length + 1) / 2;
    }
 }
@@ -112,7 +190,7 @@ static void dwt_dd4_forward_multilevel(float *data, int length, int levels) {
 // M/S Stereo Decorrelation (PCM32f version)
 //=============================================================================

-static void ms_decorrelate_16(const float *left, const float *right, float *mid, float *side, size_t count) {
+static void ms_decorrelate(const float *left, const float *right, float *mid, float *side, size_t count) {
    for (size_t i = 0; i < count; i++) {
        // Mid = (L + R) / 2, Side = (L - R) / 2
        float l = left[i];
@@ -122,6 +200,22 @@ static void ms_decorrelate_16(const float *left, const float *right, float *mid,
    }
 }

+static float signum(float x) {
+    if (x > 0.0f) return 1.0f;
+    if (x < 0.0f) return -1.0f;
+    return 0.0f;
+}
+
+static void compress_gamma(float *left, float *right, size_t count) {
+    for (size_t i = 0; i < count; i++) {
+        // encode(x) = sign(x) * |x|^γ where γ=0.5
+        float x = left[i];
+        left[i] = signum(x) * sqrtf(fabsf(x));
+        float y = right[i];
+        right[i] = signum(y) * sqrtf(fabsf(y));
+    }
+}
+
 //=============================================================================
 // Quantization with Frequency-Dependent Weighting
 //=============================================================================
@@ -146,19 +240,21 @@ static void get_quantization_weights(int quality, int dwt_levels, float *weights
        /*15*/{0.2f, 0.2f, 0.8f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.25f, 1.5f, 1.5f}
    };

-    float quality_scale = 4.0f * (1.0f + FCLAMP((5 - quality) * 0.5f, 0.0f, 1000.0f));
+    float quality_scale = 1.0f * (1.0f + FCLAMP((5 - quality) * 0.5f, 0.0f, 1000.0f));

    for (int i = 0; i < dwt_levels; i++) {
-        weights[i] = base_weights[dwt_levels][i] * quality_scale;
+        weights[i] = 1.0f;//base_weights[dwt_levels][i] * quality_scale;
    }
 }

+#define QUANT_STEPS 512.0f // 64 -> [-64..64] -> 7 bits for LL
+
 static int get_deadzone_threshold(int quality) {
    const int thresholds[] = {0,0,0,0,0,0};  // Q0 to Q5
    return thresholds[quality];
 }

-static void quantize_dwt_coefficients(const float *coeffs, int16_t *quantized, size_t count, int quality, int apply_deadzone, int chunk_size, int dwt_levels) {
+static void quantize_dwt_coefficients(const float *coeffs, int16_t *quantized, size_t count, int quality, int apply_deadzone, int chunk_size, int dwt_levels, int *current_subband_index) {
    float weights[16];
    get_quantization_weights(quality, dwt_levels, weights);
    int deadzone = apply_deadzone ? get_deadzone_threshold(quality) : 0;
@@ -181,11 +277,17 @@ static void quantize_dwt_coefficients(const float *coeffs, int16_t *quantized, s
            }
        }

+        // Store subband index (LL=0, H1=1, H2=2, ..., H9=9 for dwt_levels=9)
+        if (current_subband_index != NULL) {
+            current_subband_index[i] = sideband;
+        }
+
        int weight_idx = (sideband == 0) ? 0 : sideband - 1;
        if (weight_idx >= dwt_levels) weight_idx = dwt_levels - 1;

        float weight = weights[weight_idx];
-        float val = coeffs[i] / weight * TAD32_COEFF_SCALAR;
+        float val = (coeffs[i] / TAD32_COEFF_SCALARS[sideband]) * (QUANT_STEPS * weight);
+        // (coeffs[i] / TAD32_COEFF_SCALARS[sideband]) normalises coeffs to -1..1
        int16_t quant_val = (int16_t)roundf(val);

        if (apply_deadzone && sideband >= dwt_levels - 1) {
@@ -200,10 +302,280 @@ static void quantize_dwt_coefficients(const float *coeffs, int16_t *quantized, s
    free(sideband_starts);
 }

+// idea 1: power-of-two companding
+// for quant step 8:
+// Q -> Float
+// 0 -> 0
+// 1 -> 1/128
+// 2 -> 1/64
+// 3 -> 1/32
+// 4 -> 1/16
+// 5 -> 1/8
+// 6 -> 1/4
+// 7 -> 1/2
+// 8 -> 1/1
+// for -1 to -8, just invert the sign
+
+
 //=============================================================================
 // Significance Map Encoding
 //=============================================================================

+//=============================================================================
+// Coefficient Statistics
+//=============================================================================
+
+static int compare_float(const void *a, const void *b) {
+    float fa = *(const float*)a;
+    float fb = *(const float*)b;
+    if (fa < fb) return -1;
+    if (fa > fb) return 1;
+    return 0;
+}
+
+typedef struct {
+    float min;
+    float q1;
+    float median;
+    float q3;
+    float max;
+} CoeffStats;
+
+typedef struct {
+    float *data;
+    size_t count;
+    size_t capacity;
+} CoeffAccumulator;
+
+// Global accumulators for statistics
+static CoeffAccumulator *mid_accumulators = NULL;
+static CoeffAccumulator *side_accumulators = NULL;
+static int num_subbands = 0;
+static int stats_initialized = 0;
+static int stats_dwt_levels = 0;
+
+static void init_statistics(int dwt_levels) {
+    if (stats_initialized) return;
+
+    num_subbands = dwt_levels + 1;
+    stats_dwt_levels = dwt_levels;
+
+    mid_accumulators = calloc(num_subbands, sizeof(CoeffAccumulator));
+    side_accumulators = calloc(num_subbands, sizeof(CoeffAccumulator));
+
+    for (int i = 0; i < num_subbands; i++) {
+        mid_accumulators[i].capacity = 1024;
+        mid_accumulators[i].data = malloc(mid_accumulators[i].capacity * sizeof(float));
+        mid_accumulators[i].count = 0;
+
+        side_accumulators[i].capacity = 1024;
+        side_accumulators[i].data = malloc(side_accumulators[i].capacity * sizeof(float));
+        side_accumulators[i].count = 0;
+    }
+
+    stats_initialized = 1;
+}
+
+static void accumulate_coefficients(const float *coeffs, int dwt_levels, int chunk_size, CoeffAccumulator *accumulators) {
+    int first_band_size = chunk_size >> dwt_levels;
+
+    int *sideband_starts = malloc((dwt_levels + 2) * sizeof(int));
+    sideband_starts[0] = 0;
+    sideband_starts[1] = first_band_size;
+    for (int i = 2; i <= dwt_levels + 1; i++) {
+        sideband_starts[i] = sideband_starts[i-1] + (first_band_size << (i-2));
+    }
+
+    for (int s = 0; s <= dwt_levels; s++) {
+        size_t start = sideband_starts[s];
+        size_t end = sideband_starts[s + 1];
+        size_t band_size = end - start;
+
+        // Expand capacity if needed
+        while (accumulators[s].count + band_size > accumulators[s].capacity) {
+            accumulators[s].capacity *= 2;
+            accumulators[s].data = realloc(accumulators[s].data,
+                                          accumulators[s].capacity * sizeof(float));
+        }
+
+        // Copy coefficients
+        memcpy(accumulators[s].data + accumulators[s].count,
+               coeffs + start, band_size * sizeof(float));
+        accumulators[s].count += band_size;
+    }
+
+    free(sideband_starts);
+}
+
+static void calculate_coeff_stats(const float *coeffs, size_t count, CoeffStats *stats) {
+    if (count == 0) {
+        stats->min = stats->q1 = stats->median = stats->q3 = stats->max = 0.0f;
+        return;
+    }
+
+    // Copy coefficients for sorting
+    float *sorted = malloc(count * sizeof(float));
+    memcpy(sorted, coeffs, count * sizeof(float));
+    qsort(sorted, count, sizeof(float), compare_float);
+
+    stats->min = sorted[0];
+    stats->max = sorted[count - 1];
+    stats->median = sorted[count / 2];
+    stats->q1 = sorted[count / 4];
+    stats->q3 = sorted[(3 * count) / 4];
+
+    free(sorted);
+}
+
+#define HISTOGRAM_BINS 40
+#define HISTOGRAM_WIDTH 60
+
+static void print_histogram(const float *coeffs, size_t count, const char *title) {
+    if (count == 0) return;
+
+    // Find min/max
+    float min_val = coeffs[0];
+    float max_val = coeffs[0];
+    for (size_t i = 1; i < count; i++) {
+        if (coeffs[i] < min_val) min_val = coeffs[i];
+        if (coeffs[i] > max_val) max_val = coeffs[i];
+    }
+
+    // Handle case where all values are the same
+    if (fabsf(max_val - min_val) < 1e-9f) {
+        fprintf(stderr, "  %s: All values are %.3f\n", title, min_val);
+        return;
+    }
+
+    // Create histogram bins
+    size_t bins[HISTOGRAM_BINS] = {0};
+    float bin_width = (max_val - min_val) / HISTOGRAM_BINS;
+
+    for (size_t i = 0; i < count; i++) {
+        int bin = (int)((coeffs[i] - min_val) / bin_width);
+        if (bin >= HISTOGRAM_BINS) bin = HISTOGRAM_BINS - 1;
+        if (bin < 0) bin = 0;
+        bins[bin]++;
+    }
+
+    // Find max bin count for scaling
+    size_t max_bin = 0;
+    for (int i = 0; i < HISTOGRAM_BINS; i++) {
+        if (bins[i] > max_bin) max_bin = bins[i];
+    }
+
+    // Print histogram
+    fprintf(stderr, "  %s Histogram (range: %.3f to %.3f):\n", title, min_val, max_val);
+
+    // Print top 20 bins to keep output manageable
+    for (int i = 0; i < HISTOGRAM_BINS; i++) {
+        float bin_start = min_val + i * bin_width;
+        float bin_end = bin_start + bin_width;
+        int bar_width = (int)((bins[i] * HISTOGRAM_WIDTH) / max_bin);
+
+        // Only print bins with significant content (> 1% of max)
+        if (bins[i] > max_bin / 100) {
+            fprintf(stderr, "  %8.3f-%8.3f [%7zu]: ", bin_start, bin_end, bins[i]);
+            for (int j = 0; j < bar_width; j++) {
+                fprintf(stderr, "█");
+            }
+            fprintf(stderr, "\n");
+        }
+    }
+    fprintf(stderr, "\n");
+}
+
+void tad32_print_statistics(void) {
+    if (!stats_initialized) return;
+
+    fprintf(stderr, "\n=== TAD Coefficient Statistics (before quantization) ===\n");
+
+    // Print Mid channel statistics
+    fprintf(stderr, "\nMid Channel:\n");
+    fprintf(stderr, "%-12s %10s %10s %10s %10s %10s %10s\n",
+            "Subband", "Samples", "Min", "Q1", "Median", "Q3", "Max");
+    fprintf(stderr, "--------------------------------------------------------------------------------\n");
+
+    for (int s = 0; s < num_subbands; s++) {
+        CoeffStats stats;
+        calculate_coeff_stats(mid_accumulators[s].data, mid_accumulators[s].count, &stats);
+
+        char band_name[16];
+        if (s == 0) {
+            snprintf(band_name, sizeof(band_name), "LL (L%d)", stats_dwt_levels);
+        } else {
+            snprintf(band_name, sizeof(band_name), "H (L%d)", stats_dwt_levels - s + 1);
+        }
+
+        fprintf(stderr, "%-12s %10zu %10.3f %10.3f %10.3f %10.3f %10.3f\n",
+                band_name, mid_accumulators[s].count,
+                stats.min, stats.q1, stats.median, stats.q3, stats.max);
+    }
+
+    // Print Mid channel histograms
+    fprintf(stderr, "\nMid Channel Histograms:\n");
+    for (int s = 0; s < num_subbands; s++) {
+        char band_name[32];
+        if (s == 0) {
+            snprintf(band_name, sizeof(band_name), "LL (L%d)", stats_dwt_levels);
+        } else {
+            snprintf(band_name, sizeof(band_name), "H (L%d)", stats_dwt_levels - s + 1);
+        }
+        print_histogram(mid_accumulators[s].data, mid_accumulators[s].count, band_name);
+    }
+
+    // Print Side channel statistics
+    fprintf(stderr, "\nSide Channel:\n");
+    fprintf(stderr, "%-12s %10s %10s %10s %10s %10s %10s\n",
+            "Subband", "Samples", "Min", "Q1", "Median", "Q3", "Max");
+    fprintf(stderr, "--------------------------------------------------------------------------------\n");
+
+    for (int s = 0; s < num_subbands; s++) {
+        CoeffStats stats;
+        calculate_coeff_stats(side_accumulators[s].data, side_accumulators[s].count, &stats);
+
+        char band_name[16];
+        if (s == 0) {
+            snprintf(band_name, sizeof(band_name), "LL (L%d)", stats_dwt_levels);
+        } else {
+            snprintf(band_name, sizeof(band_name), "H (L%d)", stats_dwt_levels - s + 1);
+        }
+
+        fprintf(stderr, "%-12s %10zu %10.3f %10.3f %10.3f %10.3f %10.3f\n",
+                band_name, side_accumulators[s].count,
+                stats.min, stats.q1, stats.median, stats.q3, stats.max);
+    }
+
+    // Print Side channel histograms
+    fprintf(stderr, "\nSide Channel Histograms:\n");
+    for (int s = 0; s < num_subbands; s++) {
+        char band_name[32];
+        if (s == 0) {
+            snprintf(band_name, sizeof(band_name), "LL (L%d)", stats_dwt_levels);
+        } else {
+            snprintf(band_name, sizeof(band_name), "H (L%d)", stats_dwt_levels - s + 1);
+        }
+        print_histogram(side_accumulators[s].data, side_accumulators[s].count, band_name);
+    }
+
+    fprintf(stderr, "\n");
+}
+
+void tad32_free_statistics(void) {
+    if (!stats_initialized) return;
+
+    for (int i = 0; i < num_subbands; i++) {
+        free(mid_accumulators[i].data);
+        free(side_accumulators[i].data);
+    }
+    free(mid_accumulators);
+    free(side_accumulators);
+
+    mid_accumulators = NULL;
+    side_accumulators = NULL;
+    stats_initialized = 0;
+}
+
 static size_t encode_sigmap_2bit(const int16_t *values, size_t count, uint8_t *output) {
    size_t map_bytes = (count * 2 + 7) / 8;
    uint8_t *map = output;
@@ -269,8 +641,11 @@ size_t tad32_encode_chunk(const float *pcm32_stereo, size_t num_samples, int qua
        pcm32_right[i] = pcm32_stereo[i * 2 + 1];
    }

+    // Step 1.1: Compress dynamic range
+//    compress_gamma(pcm32_left, pcm32_right, num_samples);
+
    // Step 2: M/S decorrelation
-    ms_decorrelate_16(pcm32_left, pcm32_right, pcm32_mid, pcm32_side, num_samples);
+    ms_decorrelate(pcm32_left, pcm32_right, pcm32_mid, pcm32_side, num_samples);

    // Step 3: Convert to float and apply DWT
    for (size_t i = 0; i < num_samples; i++) {
@@ -281,9 +656,22 @@ size_t tad32_encode_chunk(const float *pcm32_stereo, size_t num_samples, int qua
    dwt_dd4_forward_multilevel(dwt_mid, num_samples, dwt_levels);
    dwt_dd4_forward_multilevel(dwt_side, num_samples, dwt_levels);

+    // Step 3.5: Accumulate coefficient statistics if enabled
+    static int stats_enabled = -1;
+    if (stats_enabled == -1) {
+        stats_enabled = 1;//getenv("TAD_COEFF_STATS") != NULL;
+        if (stats_enabled) {
+            init_statistics(dwt_levels);
+        }
+    }
+    if (stats_enabled) {
+        accumulate_coefficients(dwt_mid, dwt_levels, num_samples, mid_accumulators);
+        accumulate_coefficients(dwt_side, dwt_levels, num_samples, side_accumulators);
+    }
+
    // Step 4: Quantize with frequency-dependent weights and dead zone
-    quantize_dwt_coefficients(dwt_mid, quant_mid, num_samples, quality, 1, num_samples, dwt_levels);
-    quantize_dwt_coefficients(dwt_side, quant_side, num_samples, quality, 1, num_samples, dwt_levels);
+    quantize_dwt_coefficients(dwt_mid, quant_mid, num_samples, quality, 1, num_samples, dwt_levels, NULL);
+    quantize_dwt_coefficients(dwt_side, quant_side, num_samples, quality, 1, num_samples, dwt_levels, NULL);

    // Step 5: Encode with 2-bit significance map (32-bit version)
    uint8_t *temp_buffer = malloc(num_samples * 4 * sizeof(int32_t));
--- a/video_encoder/encoder_tad.h
+++ b/video_encoder/encoder_tad.h
@@ -9,7 +9,7 @@
 // Alternative version: PCM32f throughout encoding, PCM8 conversion only at decoder

 // Constants
-#define TAD32_COEFF_SCALAR 1024.0f
+#define TAD32_COEFF_SCALARS {64.0f, 45.255f, 32.0f, 22.627f, 16.0f, 11.314f, 8.0f, 5.657f, 4.0f, 2.828f} // value only valid for CDF 9/7 with decomposition level 9. Index 0 = LL band
 #define TAD32_MIN_CHUNK_SIZE 1024       // Minimum: 1024 samples
 #define TAD32_SAMPLE_RATE 32000
 #define TAD32_CHANNELS 2  // Stereo
@@ -37,4 +37,16 @@
 size_t tad32_encode_chunk(const float *pcm32_stereo, size_t num_samples, int quality,
                          int use_zstd, uint8_t *output);

+/**
+ * Print accumulated coefficient statistics
+ * Only effective if TAD_COEFF_STATS environment variable is set
+ */
+void tad32_print_statistics(void);
+
+/**
+ * Free accumulated statistics memory
+ * Should be called after tad32_print_statistics()
+ */
+void tad32_free_statistics(void);
+
 #endif // TAD32_ENCODER_H
--- a/video_encoder/encoder_tad_standalone.c
+++ b/video_encoder/encoder_tad_standalone.c
@@ -1,6 +1,6 @@
 // Created by CuriousTorvald and Claude on 2025-10-24.
-// TAD32 (Terrarum Advanced Audio - PCM16 version) Encoder - Standalone program
-// Alternative version: PCM16 throughout encoding, PCM8 conversion only at decoder
+// TAD32 (Terrarum Advanced Audio - PCM32 version) Encoder - Standalone program
+// Alternative version: PCM32 throughout encoding, PCM8 conversion only at decoder
 // Uses encoder_tad32.c library for encoding functions

 #include <stdio.h>
@@ -11,7 +11,7 @@
 #include <time.h>
 #include "encoder_tad.h"

-#define ENCODER_VENDOR_STRING "Encoder-TAD32 (PCM32f version) 20251024"
+#define ENCODER_VENDOR_STRING "Encoder-TAD32 (PCM32f version) 20251026"

 // TAD32 format constants
 #define TAD32_DEFAULT_CHUNK_SIZE 32768  // Default: power of 2 for optimal performance (2^15)
@@ -52,8 +52,8 @@ static void print_usage(const char *prog_name) {
    printf("  -v              Verbose output\n");
    printf("  -h, --help      Show this help\n");
    printf("\nVersion: %s\n", ENCODER_VENDOR_STRING);
-    printf("Note: This is the PCM16 alternative version for comparison testing.\n");
-    printf("      PCM16 is processed throughout encoding; PCM8 conversion happens at decoder.\n");
+    printf("Note: This is the PCM32 alternative version for comparison testing.\n");
+    printf("      PCM32 is processed throughout encoding; PCM8 conversion happens at decoder.\n");
 }

 int main(int argc, char *argv[]) {
@@ -269,6 +269,10 @@ int main(int argc, char *argv[]) {
        printf("\n");
    }

+    // Print coefficient statistics if enabled
+    tad32_print_statistics();
+    tad32_free_statistics();
+
    // Cleanup
    free(chunk_buffer);
    free(output_buffer);