fix: EZBC for TAV producing dark blotches on white background due to coeff clipping

2026-03-12 14:11:50 +09:00 · 2025-11-11 03:22:10 +09:00
parent 9425c58e53
commit bff5021a7a
3 changed files with 104 additions and 58 deletions
--- a/video_encoder/decoder_tav.c
+++ b/video_encoder/decoder_tav.c
@@ -2407,13 +2407,52 @@ static int decode_i_or_p_frame(tav_decoder_t *decoder, uint8_t packet_type, uint
        const int is_perceptual = (decoder->header.version >= 5 && decoder->header.version <= 8);
        const int is_ezbc = (decoder->header.entropy_coder == 1);

-        if (is_ezbc) {
-            // EZBC mode: coefficients are already denormalised by encoder
-            // Just convert int16 to float without multiplying by quantiser
-            for (int i = 0; i < coeff_count; i++) {
-                decoder->dwt_buffer_y[i] = (float)quantised_y[i];
-                decoder->dwt_buffer_co[i] = (float)quantised_co[i];
-                decoder->dwt_buffer_cg[i] = (float)quantised_cg[i];
+        // Debug: Print decoder state
+        static int state_debug_once = 1;
+        if (state_debug_once) {
+            fprintf(stderr, "[DECODER-STATE] version=%d, entropy_coder=%d, is_perceptual=%d, is_ezbc=%d\n",
+                    decoder->header.version, decoder->header.entropy_coder, is_perceptual, is_ezbc);
+            state_debug_once = 0;
+        }
+
+        if (is_ezbc && is_perceptual) {
+            // EZBC mode with perceptual quantisation: coefficients are normalised
+            // Need to dequantise using perceptual weights (same as twobit-map mode)
+
+            // Debug: Print quantised LL values before dequantisation
+            static int debug_count = 0;
+            if (debug_count < 1) {
+                fprintf(stderr, "[EZBC-DECODER-DEBUG] Quantised LL coefficients (9x7):\n");
+                for (int y = 0; y < 7 && y < decoder->header.height; y++) {
+                    for (int x = 0; x < 9 && x < decoder->header.width; x++) {
+                        int idx = y * decoder->header.width + x;
+                        fprintf(stderr, "%6d ", quantised_y[idx]);
+                    }
+                    fprintf(stderr, "\n");
+                }
+                debug_count++;
+            }
+
+            dequantise_dwt_subbands_perceptual(0, qy, quantised_y, decoder->dwt_buffer_y,
+                                              decoder->header.width, decoder->header.height,
+                                              decoder->header.decomp_levels, qy, 0, decoder->frame_count);
+            dequantise_dwt_subbands_perceptual(0, qy, quantised_co, decoder->dwt_buffer_co,
+                                              decoder->header.width, decoder->header.height,
+                                              decoder->header.decomp_levels, qco, 1, decoder->frame_count);
+            dequantise_dwt_subbands_perceptual(0, qy, quantised_cg, decoder->dwt_buffer_cg,
+                                              decoder->header.width, decoder->header.height,
+                                              decoder->header.decomp_levels, qcg, 1, decoder->frame_count);
+
+            // Debug: Print dequantised LL values
+            if (debug_count <= 1) {
+                fprintf(stderr, "[EZBC-DECODER-DEBUG] Dequantised LL coefficients (9x7):\n");
+                for (int y = 0; y < 7 && y < decoder->header.height; y++) {
+                    for (int x = 0; x < 9 && x < decoder->header.width; x++) {
+                        int idx = y * decoder->header.width + x;
+                        fprintf(stderr, "%7.0f ", decoder->dwt_buffer_y[idx]);
+                    }
+                    fprintf(stderr, "\n");
+                }
            }
        } else if (is_perceptual) {
            dequantise_dwt_subbands_perceptual(0, qy, quantised_y, decoder->dwt_buffer_y,
@@ -2912,28 +2951,34 @@ int main(int argc, char *argv[]) {
            const int temporal_levels = 2;  // Fixed for TAV GOP encoding

            for (int t = 0; t < gop_size; t++) {
-                if (is_ezbc) {
-                    // EZBC mode: coefficients are already denormalised by encoder
-                    // Just convert int16 to float without multiplying by quantiser
-                    for (int i = 0; i < num_pixels; i++) {
-                        gop_y[t][i] = (float)quantised_gop[t][0][i];
-                        gop_co[t][i] = (float)quantised_gop[t][1][i];
-                        gop_cg[t][i] = (float)quantised_gop[t][2][i];
-                    }
+                if (is_ezbc && is_perceptual) {
+                    // EZBC mode with perceptual quantisation: coefficients are normalised
+                    // Need to dequantise using perceptual weights (same as twobit-map mode)
+                    const int temporal_level = get_temporal_subband_level(t, gop_size, temporal_levels);
+                    const float temporal_scale = get_temporal_quantiser_scale(temporal_level);

-                    if (t == 0) {
-                        // Debug first frame
-                        int16_t max_y = 0, min_y = 0;
-                        for (int i = 0; i < num_pixels; i++) {
-                            if (quantised_gop[t][0][i] > max_y) max_y = quantised_gop[t][0][i];
-                            if (quantised_gop[t][0][i] < min_y) min_y = quantised_gop[t][0][i];
-                        }
-                        fprintf(stderr, "[GOP-EZBC] Frame 0 Y coeffs range: [%d, %d], first 5: %d %d %d %d %d\n",
-                               min_y, max_y,
-                               quantised_gop[t][0][0], quantised_gop[t][0][1], quantised_gop[t][0][2],
-                               quantised_gop[t][0][3], quantised_gop[t][0][4]);
+                    const float base_q_y = roundf(decoder->header.quantiser_y * temporal_scale);
+                    const float base_q_co = roundf(decoder->header.quantiser_co * temporal_scale);
+                    const float base_q_cg = roundf(decoder->header.quantiser_cg * temporal_scale);
+
+                    dequantise_dwt_subbands_perceptual(0, decoder->header.quantiser_y,
+                                                      quantised_gop[t][0], gop_y[t],
+                                                      decoder->header.width, decoder->header.height,
+                                                      decoder->header.decomp_levels, base_q_y, 0, decoder->frame_count + t);
+                    dequantise_dwt_subbands_perceptual(0, decoder->header.quantiser_y,
+                                                      quantised_gop[t][1], gop_co[t],
+                                                      decoder->header.width, decoder->header.height,
+                                                      decoder->header.decomp_levels, base_q_co, 1, decoder->frame_count + t);
+                    dequantise_dwt_subbands_perceptual(0, decoder->header.quantiser_y,
+                                                      quantised_gop[t][2], gop_cg[t],
+                                                      decoder->header.width, decoder->header.height,
+                                                      decoder->header.decomp_levels, base_q_cg, 1, decoder->frame_count + t);
+
+                    if (t == 0 && verbose) {
+                        fprintf(stderr, "[GOP-EZBC] Frame 0: Quantised LL[0]=%d, Dequantised LL[0]=%.1f, base_q_y=%.1f\n",
+                               quantised_gop[t][0][0], gop_y[t][0], base_q_y);
                    }
-                } else {
+                } else if (!is_ezbc) {
                    // Normal mode: multiply by quantiser
                    const int temporal_level = get_temporal_subband_level(t, gop_size, temporal_levels);
                    const float temporal_scale = get_temporal_quantiser_scale(temporal_level);