fix: EZBC for TAV producing dark blotches on white background due to coeff clipping

2026-06-06 13:38:30 +09:00 · 2025-11-11 03:22:10 +09:00
parent 9425c58e53
commit bff5021a7a
3 changed files with 104 additions and 58 deletions
--- a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
+++ b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
@@ -5022,28 +5022,21 @@ class GraphicsJSR223Delegate(private val vm: VM) {
        }

        // Apply linear dequantisation with perceptual weights (matching encoder's linear storage)
-        // EZBC mode: coefficients are ALREADY DENORMALIZED by encoder
-        //            e.g., encoder: coeff=377 → quantize: 377/48=7.85→8 → denormalize: 8*48=384 → store 384
-        //            decoder: read 384 → pass through as-is (already in correct range for IDWT)
-        // Significance-map mode: coefficients are normalized (quantized only)
-        //                       e.g., encoder stores 8 = round(377/48)
-        //                       decoder must multiply: 8 * 48 = 384 (denormalize for IDWT)
+        // FIX (2025-11-11): Both EZBC and Significance-map modes now store NORMALIZED coefficients
+        //                   Encoder stores quantised values (e.g., round(377/48) = 8)
+        //                   Decoder must multiply by effective quantiser to denormalize
+        //                   Previous denormalization in EZBC caused int16_t overflow (clipping at 32767)
+        //                   for bright pixels, creating dark DWT-pattern blemishes
        for (i in quantised.indices) {
            if (i < dequantised.size) {
                val effectiveQuantiser = baseQuantiser * weights[i]

-                dequantised[i] = if (isEZBC) {
-                    // EZBC mode: pass through as-is (coefficients already denormalized and rounded by encoder)
-                    quantised[i].toFloat()
-                } else {
-                    // Significance-map mode: multiply to denormalize, then round
-                    // CRITICAL: Must ROUND (not truncate) to match EZBC encoder's roundf() behavior
-                    // Truncation toward zero was wrong - it created mismatch with EZBC for odd baseQ values
-                    val untruncated = quantised[i] * effectiveQuantiser
-                    val rounded = kotlin.math.round(untruncated)
+                // Both modes now use the same dequantisation: multiply to denormalize, then round
+                // CRITICAL: Must ROUND (not truncate) to match encoder's roundf() behavior
+                val untruncated = quantised[i] * effectiveQuantiser
+                val rounded = kotlin.math.round(untruncated)

-                    rounded
-                }
+                dequantised[i] = rounded
            }
        }

--- a/video_encoder/decoder_tav.c
+++ b/video_encoder/decoder_tav.c
@@ -2407,13 +2407,52 @@ static int decode_i_or_p_frame(tav_decoder_t *decoder, uint8_t packet_type, uint
        const int is_perceptual = (decoder->header.version >= 5 && decoder->header.version <= 8);
        const int is_ezbc = (decoder->header.entropy_coder == 1);

-        if (is_ezbc) {
-            // EZBC mode: coefficients are already denormalised by encoder
-            // Just convert int16 to float without multiplying by quantiser
-            for (int i = 0; i < coeff_count; i++) {
-                decoder->dwt_buffer_y[i] = (float)quantised_y[i];
-                decoder->dwt_buffer_co[i] = (float)quantised_co[i];
-                decoder->dwt_buffer_cg[i] = (float)quantised_cg[i];
+        // Debug: Print decoder state
+        static int state_debug_once = 1;
+        if (state_debug_once) {
+            fprintf(stderr, "[DECODER-STATE] version=%d, entropy_coder=%d, is_perceptual=%d, is_ezbc=%d\n",
+                    decoder->header.version, decoder->header.entropy_coder, is_perceptual, is_ezbc);
+            state_debug_once = 0;
+        }
+
+        if (is_ezbc && is_perceptual) {
+            // EZBC mode with perceptual quantisation: coefficients are normalised
+            // Need to dequantise using perceptual weights (same as twobit-map mode)
+
+            // Debug: Print quantised LL values before dequantisation
+            static int debug_count = 0;
+            if (debug_count < 1) {
+                fprintf(stderr, "[EZBC-DECODER-DEBUG] Quantised LL coefficients (9x7):\n");
+                for (int y = 0; y < 7 && y < decoder->header.height; y++) {
+                    for (int x = 0; x < 9 && x < decoder->header.width; x++) {
+                        int idx = y * decoder->header.width + x;
+                        fprintf(stderr, "%6d ", quantised_y[idx]);
+                    }
+                    fprintf(stderr, "\n");
+                }
+                debug_count++;
+            }
+
+            dequantise_dwt_subbands_perceptual(0, qy, quantised_y, decoder->dwt_buffer_y,
+                                              decoder->header.width, decoder->header.height,
+                                              decoder->header.decomp_levels, qy, 0, decoder->frame_count);
+            dequantise_dwt_subbands_perceptual(0, qy, quantised_co, decoder->dwt_buffer_co,
+                                              decoder->header.width, decoder->header.height,
+                                              decoder->header.decomp_levels, qco, 1, decoder->frame_count);
+            dequantise_dwt_subbands_perceptual(0, qy, quantised_cg, decoder->dwt_buffer_cg,
+                                              decoder->header.width, decoder->header.height,
+                                              decoder->header.decomp_levels, qcg, 1, decoder->frame_count);
+
+            // Debug: Print dequantised LL values
+            if (debug_count <= 1) {
+                fprintf(stderr, "[EZBC-DECODER-DEBUG] Dequantised LL coefficients (9x7):\n");
+                for (int y = 0; y < 7 && y < decoder->header.height; y++) {
+                    for (int x = 0; x < 9 && x < decoder->header.width; x++) {
+                        int idx = y * decoder->header.width + x;
+                        fprintf(stderr, "%7.0f ", decoder->dwt_buffer_y[idx]);
+                    }
+                    fprintf(stderr, "\n");
+                }
            }
        } else if (is_perceptual) {
            dequantise_dwt_subbands_perceptual(0, qy, quantised_y, decoder->dwt_buffer_y,
@@ -2912,28 +2951,34 @@ int main(int argc, char *argv[]) {
            const int temporal_levels = 2;  // Fixed for TAV GOP encoding

            for (int t = 0; t < gop_size; t++) {
-                if (is_ezbc) {
-                    // EZBC mode: coefficients are already denormalised by encoder
-                    // Just convert int16 to float without multiplying by quantiser
-                    for (int i = 0; i < num_pixels; i++) {
-                        gop_y[t][i] = (float)quantised_gop[t][0][i];
-                        gop_co[t][i] = (float)quantised_gop[t][1][i];
-                        gop_cg[t][i] = (float)quantised_gop[t][2][i];
-                    }
+                if (is_ezbc && is_perceptual) {
+                    // EZBC mode with perceptual quantisation: coefficients are normalised
+                    // Need to dequantise using perceptual weights (same as twobit-map mode)
+                    const int temporal_level = get_temporal_subband_level(t, gop_size, temporal_levels);
+                    const float temporal_scale = get_temporal_quantiser_scale(temporal_level);

-                    if (t == 0) {
-                        // Debug first frame
-                        int16_t max_y = 0, min_y = 0;
-                        for (int i = 0; i < num_pixels; i++) {
-                            if (quantised_gop[t][0][i] > max_y) max_y = quantised_gop[t][0][i];
-                            if (quantised_gop[t][0][i] < min_y) min_y = quantised_gop[t][0][i];
-                        }
-                        fprintf(stderr, "[GOP-EZBC] Frame 0 Y coeffs range: [%d, %d], first 5: %d %d %d %d %d\n",
-                               min_y, max_y,
-                               quantised_gop[t][0][0], quantised_gop[t][0][1], quantised_gop[t][0][2],
-                               quantised_gop[t][0][3], quantised_gop[t][0][4]);
+                    const float base_q_y = roundf(decoder->header.quantiser_y * temporal_scale);
+                    const float base_q_co = roundf(decoder->header.quantiser_co * temporal_scale);
+                    const float base_q_cg = roundf(decoder->header.quantiser_cg * temporal_scale);
+
+                    dequantise_dwt_subbands_perceptual(0, decoder->header.quantiser_y,
+                                                      quantised_gop[t][0], gop_y[t],
+                                                      decoder->header.width, decoder->header.height,
+                                                      decoder->header.decomp_levels, base_q_y, 0, decoder->frame_count + t);
+                    dequantise_dwt_subbands_perceptual(0, decoder->header.quantiser_y,
+                                                      quantised_gop[t][1], gop_co[t],
+                                                      decoder->header.width, decoder->header.height,
+                                                      decoder->header.decomp_levels, base_q_co, 1, decoder->frame_count + t);
+                    dequantise_dwt_subbands_perceptual(0, decoder->header.quantiser_y,
+                                                      quantised_gop[t][2], gop_cg[t],
+                                                      decoder->header.width, decoder->header.height,
+                                                      decoder->header.decomp_levels, base_q_cg, 1, decoder->frame_count + t);
+
+                    if (t == 0 && verbose) {
+                        fprintf(stderr, "[GOP-EZBC] Frame 0: Quantised LL[0]=%d, Dequantised LL[0]=%.1f, base_q_y=%.1f\n",
+                               quantised_gop[t][0][0], gop_y[t][0], base_q_y);
                    }
-                } else {
+                } else if (!is_ezbc) {
                    // Normal mode: multiply by quantiser
                    const int temporal_level = get_temporal_subband_level(t, gop_size, temporal_levels);
                    const float temporal_scale = get_temporal_quantiser_scale(temporal_level);
--- a/video_encoder/encoder_tav.c
+++ b/video_encoder/encoder_tav.c
@@ -1976,7 +1976,7 @@ typedef struct tav_encoder_s {
    int two_pass_mode;                    // Enable two-pass encoding (0=disabled, 1=enabled)
    frame_analysis_t *frame_analyses;     // Array of frame analysis metrics (first pass)
    int frame_analyses_capacity;          // Allocated capacity
-    int frame_analyses_count;             // Current number of analyzed frames
+    int frame_analyses_count;             // Current number of analysed frames
    gop_boundary_t *gop_boundaries;       // Linked list of GOP boundaries (computed in first pass)
    gop_boundary_t *current_gop_boundary; // Current GOP being encoded (second pass)
    int two_pass_current_frame;           // Current frame number in second pass
@@ -6702,13 +6702,22 @@ static void quantise_dwt_coefficients_perceptual_per_coeff_no_normalisation(tav_
        // Step 3: Round to discrete quantisation levels
        quantised_val = roundf(quantised_val); // file size explodes without rounding

-        // Step 4: Denormalise - multiply back by quantiser to restore magnitude
-        // This gives us quantised values at original scale (not shrunken to 0-10 range)
-        float denormalised = quantised_val * effective_q;
+        // FIX: Store normalised values (not denormalised) to avoid int16_t overflow
+        // EZBC bitplane encoding works fine with normalised coefficients
+        // Denormalisation was causing bright pixels to clip at 32767
+        quantised[i] = (int16_t)CLAMP((int)quantised_val, -32768, 32767);

-        // CRITICAL FIX: Must round (not truncate) to match decoder behavior
-        // With odd baseQ values and fractional weights, truncation causes mismatch with Sigmap mode
-        quantised[i] = (int16_t)CLAMP((int)roundf(denormalised), -32768, 32767);
+        // Debug: Print LL subband coefficients (9×7 at top-left for 560×448)
+        static int debug_once = 1;
+        if (debug_once && i < 63 && width == 560 && !is_chroma) {
+            int x = i % width;
+            int y = i / width;
+            if (x < 9 && y < 7) {
+                fprintf(stderr, "[EZBC-QUANT-DEBUG] LL coeff[%d,%d] (idx=%d): coeff=%.1f, weight=%.3f, effective_q=%.1f, quantised_val=%.1f, stored=%d\n",
+                        x, y, i, coeffs[i], weight, effective_q, quantised_val, quantised[i]);
+                if (i == 62) debug_once = 0;
+            }
+        }
    }
 }

@@ -9631,7 +9640,7 @@ static void free_gop_boundaries(gop_boundary_t *head) {
    }
 }

-// First pass: Analyze all frames and build GOP boundaries
+// First pass: Analyse all frames and build GOP boundaries
 // Returns 0 on success, -1 on error
 static int two_pass_first_pass(tav_encoder_t *enc, const char *input_file) {
    printf("=== Two-Pass Encoding: First Pass (Scene Analysis) ===\n");
@@ -9737,12 +9746,12 @@ static int two_pass_first_pass(tav_encoder_t *enc, const char *input_file) {
        frame_num++;

        if (frame_num % 100 == 0) {
-            printf("  Analyzed %d frames...\r", frame_num);
+            printf("  Analysed %d frames...\r", frame_num);
            fflush(stdout);
        }
    }

-    printf("\n  Analyzed %d frames total\n", frame_num);
+    printf("\n  Analysed %d frames total\n", frame_num);

    free(frame_rgb);
    if (prev_dwt) free(prev_dwt);
@@ -9881,7 +9890,7 @@ int main(int argc, char *argv[]) {
        {"adaptive-blocks", no_argument, 0, 1022},
        {"bframes", required_argument, 0, 1023},
        {"gop-size", required_argument, 0, 1024},
-        {"ezbc", no_argument, 0, 1025},
+        {"sigmap", no_argument, 0, 1025},
        {"separate-audio-track", no_argument, 0, 1026},
        {"pcm8-audio", no_argument, 0, 1027},
        {"pcm-audio", no_argument, 0, 1027},
@@ -10095,9 +10104,8 @@ int main(int argc, char *argv[]) {
                }
                printf("GOP size set to %d frames\n", enc->residual_coding_gop_size);
                break;
-            case 1025: // --ezbc
-                enc->preprocess_mode = PREPROCESS_EZBC;
-                printf("EZBC (Embedded Zero Block Coding) enabled for significance maps\n");
+            case 1025: // --sigmap
+                enc->preprocess_mode = PREPROCESS_TWOBITMAP;
                break;
            case 1026: // --separate-audio-track
                enc->separate_audio_track = 1;