From bff5021a7af11f33e44b569602aabc66049d1a53 Mon Sep 17 00:00:00 2001 From: minjaesong Date: Tue, 11 Nov 2025 03:22:10 +0900 Subject: [PATCH] fix: EZBC for TAV producing dark blotches on white background due to coeff clipping --- .../torvald/tsvm/GraphicsJSR223Delegate.kt | 27 ++--- video_encoder/decoder_tav.c | 99 ++++++++++++++----- video_encoder/encoder_tav.c | 36 ++++--- 3 files changed, 104 insertions(+), 58 deletions(-) diff --git a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt index 604cc13..a50be97 100644 --- a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt +++ b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt @@ -5022,28 +5022,21 @@ class GraphicsJSR223Delegate(private val vm: VM) { } // Apply linear dequantisation with perceptual weights (matching encoder's linear storage) - // EZBC mode: coefficients are ALREADY DENORMALIZED by encoder - // e.g., encoder: coeff=377 → quantize: 377/48=7.85→8 → denormalize: 8*48=384 → store 384 - // decoder: read 384 → pass through as-is (already in correct range for IDWT) - // Significance-map mode: coefficients are normalized (quantized only) - // e.g., encoder stores 8 = round(377/48) - // decoder must multiply: 8 * 48 = 384 (denormalize for IDWT) + // FIX (2025-11-11): Both EZBC and Significance-map modes now store NORMALIZED coefficients + // Encoder stores quantised values (e.g., round(377/48) = 8) + // Decoder must multiply by effective quantiser to denormalize + // Previous denormalization in EZBC caused int16_t overflow (clipping at 32767) + // for bright pixels, creating dark DWT-pattern blemishes for (i in quantised.indices) { if (i < dequantised.size) { val effectiveQuantiser = baseQuantiser * weights[i] - dequantised[i] = if (isEZBC) { - // EZBC mode: pass through as-is (coefficients already denormalized and rounded by encoder) - quantised[i].toFloat() - } else { - // Significance-map mode: multiply to denormalize, then 
round - // CRITICAL: Must ROUND (not truncate) to match EZBC encoder's roundf() behavior - // Truncation toward zero was wrong - it created mismatch with EZBC for odd baseQ values - val untruncated = quantised[i] * effectiveQuantiser - val rounded = kotlin.math.round(untruncated) + // Both modes now use the same dequantisation: multiply to denormalize, then round + // CRITICAL: Must ROUND (not truncate) to match encoder's roundf() behavior + val untruncated = quantised[i] * effectiveQuantiser + val rounded = kotlin.math.round(untruncated) - rounded - } + dequantised[i] = rounded } } diff --git a/video_encoder/decoder_tav.c b/video_encoder/decoder_tav.c index 9002077..c0d710c 100644 --- a/video_encoder/decoder_tav.c +++ b/video_encoder/decoder_tav.c @@ -2407,13 +2407,52 @@ static int decode_i_or_p_frame(tav_decoder_t *decoder, uint8_t packet_type, uint const int is_perceptual = (decoder->header.version >= 5 && decoder->header.version <= 8); const int is_ezbc = (decoder->header.entropy_coder == 1); - if (is_ezbc) { - // EZBC mode: coefficients are already denormalised by encoder - // Just convert int16 to float without multiplying by quantiser - for (int i = 0; i < coeff_count; i++) { - decoder->dwt_buffer_y[i] = (float)quantised_y[i]; - decoder->dwt_buffer_co[i] = (float)quantised_co[i]; - decoder->dwt_buffer_cg[i] = (float)quantised_cg[i]; + // Debug: Print decoder state + static int state_debug_once = 1; + if (state_debug_once) { + fprintf(stderr, "[DECODER-STATE] version=%d, entropy_coder=%d, is_perceptual=%d, is_ezbc=%d\n", + decoder->header.version, decoder->header.entropy_coder, is_perceptual, is_ezbc); + state_debug_once = 0; + } + + if (is_ezbc && is_perceptual) { + // EZBC mode with perceptual quantisation: coefficients are normalised + // Need to dequantise using perceptual weights (same as twobit-map mode) + + // Debug: Print quantised LL values before dequantisation + static int debug_count = 0; + if (debug_count < 1) { + fprintf(stderr, 
"[EZBC-DECODER-DEBUG] Quantised LL coefficients (9x7):\n"); + for (int y = 0; y < 7 && y < decoder->header.height; y++) { + for (int x = 0; x < 9 && x < decoder->header.width; x++) { + int idx = y * decoder->header.width + x; + fprintf(stderr, "%6d ", quantised_y[idx]); + } + fprintf(stderr, "\n"); + } + debug_count++; + } + + dequantise_dwt_subbands_perceptual(0, qy, quantised_y, decoder->dwt_buffer_y, + decoder->header.width, decoder->header.height, + decoder->header.decomp_levels, qy, 0, decoder->frame_count); + dequantise_dwt_subbands_perceptual(0, qy, quantised_co, decoder->dwt_buffer_co, + decoder->header.width, decoder->header.height, + decoder->header.decomp_levels, qco, 1, decoder->frame_count); + dequantise_dwt_subbands_perceptual(0, qy, quantised_cg, decoder->dwt_buffer_cg, + decoder->header.width, decoder->header.height, + decoder->header.decomp_levels, qcg, 1, decoder->frame_count); + + // Debug: Print dequantised LL values + if (debug_count <= 1) { + fprintf(stderr, "[EZBC-DECODER-DEBUG] Dequantised LL coefficients (9x7):\n"); + for (int y = 0; y < 7 && y < decoder->header.height; y++) { + for (int x = 0; x < 9 && x < decoder->header.width; x++) { + int idx = y * decoder->header.width + x; + fprintf(stderr, "%7.0f ", decoder->dwt_buffer_y[idx]); + } + fprintf(stderr, "\n"); + } } } else if (is_perceptual) { dequantise_dwt_subbands_perceptual(0, qy, quantised_y, decoder->dwt_buffer_y, @@ -2912,28 +2951,34 @@ int main(int argc, char *argv[]) { const int temporal_levels = 2; // Fixed for TAV GOP encoding for (int t = 0; t < gop_size; t++) { - if (is_ezbc) { - // EZBC mode: coefficients are already denormalised by encoder - // Just convert int16 to float without multiplying by quantiser - for (int i = 0; i < num_pixels; i++) { - gop_y[t][i] = (float)quantised_gop[t][0][i]; - gop_co[t][i] = (float)quantised_gop[t][1][i]; - gop_cg[t][i] = (float)quantised_gop[t][2][i]; - } + if (is_ezbc && is_perceptual) { + // EZBC mode with perceptual quantisation: 
coefficients are normalised + // Need to dequantise using perceptual weights (same as twobit-map mode) + const int temporal_level = get_temporal_subband_level(t, gop_size, temporal_levels); + const float temporal_scale = get_temporal_quantiser_scale(temporal_level); - if (t == 0) { - // Debug first frame - int16_t max_y = 0, min_y = 0; - for (int i = 0; i < num_pixels; i++) { - if (quantised_gop[t][0][i] > max_y) max_y = quantised_gop[t][0][i]; - if (quantised_gop[t][0][i] < min_y) min_y = quantised_gop[t][0][i]; - } - fprintf(stderr, "[GOP-EZBC] Frame 0 Y coeffs range: [%d, %d], first 5: %d %d %d %d %d\n", - min_y, max_y, - quantised_gop[t][0][0], quantised_gop[t][0][1], quantised_gop[t][0][2], - quantised_gop[t][0][3], quantised_gop[t][0][4]); + const float base_q_y = roundf(decoder->header.quantiser_y * temporal_scale); + const float base_q_co = roundf(decoder->header.quantiser_co * temporal_scale); + const float base_q_cg = roundf(decoder->header.quantiser_cg * temporal_scale); + + dequantise_dwt_subbands_perceptual(0, decoder->header.quantiser_y, + quantised_gop[t][0], gop_y[t], + decoder->header.width, decoder->header.height, + decoder->header.decomp_levels, base_q_y, 0, decoder->frame_count + t); + dequantise_dwt_subbands_perceptual(0, decoder->header.quantiser_y, + quantised_gop[t][1], gop_co[t], + decoder->header.width, decoder->header.height, + decoder->header.decomp_levels, base_q_co, 1, decoder->frame_count + t); + dequantise_dwt_subbands_perceptual(0, decoder->header.quantiser_y, + quantised_gop[t][2], gop_cg[t], + decoder->header.width, decoder->header.height, + decoder->header.decomp_levels, base_q_cg, 1, decoder->frame_count + t); + + if (t == 0 && verbose) { + fprintf(stderr, "[GOP-EZBC] Frame 0: Quantised LL[0]=%d, Dequantised LL[0]=%.1f, base_q_y=%.1f\n", + quantised_gop[t][0][0], gop_y[t][0], base_q_y); } - } else { + } else if (!is_ezbc) { // Normal mode: multiply by quantiser const int temporal_level = get_temporal_subband_level(t, gop_size, 
temporal_levels); const float temporal_scale = get_temporal_quantiser_scale(temporal_level); diff --git a/video_encoder/encoder_tav.c b/video_encoder/encoder_tav.c index f7e7faf..808a344 100644 --- a/video_encoder/encoder_tav.c +++ b/video_encoder/encoder_tav.c @@ -1976,7 +1976,7 @@ typedef struct tav_encoder_s { int two_pass_mode; // Enable two-pass encoding (0=disabled, 1=enabled) frame_analysis_t *frame_analyses; // Array of frame analysis metrics (first pass) int frame_analyses_capacity; // Allocated capacity - int frame_analyses_count; // Current number of analyzed frames + int frame_analyses_count; // Current number of analysed frames gop_boundary_t *gop_boundaries; // Linked list of GOP boundaries (computed in first pass) gop_boundary_t *current_gop_boundary; // Current GOP being encoded (second pass) int two_pass_current_frame; // Current frame number in second pass @@ -6702,13 +6702,22 @@ static void quantise_dwt_coefficients_perceptual_per_coeff_no_normalisation(tav_ // Step 3: Round to discrete quantisation levels quantised_val = roundf(quantised_val); // file size explodes without rounding - // Step 4: Denormalise - multiply back by quantiser to restore magnitude - // This gives us quantised values at original scale (not shrunken to 0-10 range) - float denormalised = quantised_val * effective_q; + // FIX: Store normalised values (not denormalised) to avoid int16_t overflow + // EZBC bitplane encoding works fine with normalised coefficients + // Denormalisation was causing bright pixels to clip at 32767 + quantised[i] = (int16_t)CLAMP((int)quantised_val, -32768, 32767); - // CRITICAL FIX: Must round (not truncate) to match decoder behavior - // With odd baseQ values and fractional weights, truncation causes mismatch with Sigmap mode - quantised[i] = (int16_t)CLAMP((int)roundf(denormalised), -32768, 32767); + // Debug: Print LL subband coefficients (9×7 at top-left for 560×448) + static int debug_once = 1; + if (debug_once && i < 63 && width == 560 && 
!is_chroma) { + int x = i % width; + int y = i / width; + if (x < 9 && y < 7) { + fprintf(stderr, "[EZBC-QUANT-DEBUG] LL coeff[%d,%d] (idx=%d): coeff=%.1f, weight=%.3f, effective_q=%.1f, quantised_val=%.1f, stored=%d\n", + x, y, i, coeffs[i], weight, effective_q, quantised_val, quantised[i]); + if (i == 62) debug_once = 0; + } + } } } @@ -9631,7 +9640,7 @@ static void free_gop_boundaries(gop_boundary_t *head) { } } -// First pass: Analyze all frames and build GOP boundaries +// First pass: Analyse all frames and build GOP boundaries // Returns 0 on success, -1 on error static int two_pass_first_pass(tav_encoder_t *enc, const char *input_file) { printf("=== Two-Pass Encoding: First Pass (Scene Analysis) ===\n"); @@ -9737,12 +9746,12 @@ static int two_pass_first_pass(tav_encoder_t *enc, const char *input_file) { frame_num++; if (frame_num % 100 == 0) { - printf(" Analyzed %d frames...\r", frame_num); + printf(" Analysed %d frames...\r", frame_num); fflush(stdout); } } - printf("\n Analyzed %d frames total\n", frame_num); + printf("\n Analysed %d frames total\n", frame_num); free(frame_rgb); if (prev_dwt) free(prev_dwt); @@ -9881,7 +9890,7 @@ int main(int argc, char *argv[]) { {"adaptive-blocks", no_argument, 0, 1022}, {"bframes", required_argument, 0, 1023}, {"gop-size", required_argument, 0, 1024}, - {"ezbc", no_argument, 0, 1025}, + {"sigmap", no_argument, 0, 1025}, {"separate-audio-track", no_argument, 0, 1026}, {"pcm8-audio", no_argument, 0, 1027}, {"pcm-audio", no_argument, 0, 1027}, @@ -10095,9 +10104,8 @@ int main(int argc, char *argv[]) { } printf("GOP size set to %d frames\n", enc->residual_coding_gop_size); break; - case 1025: // --ezbc - enc->preprocess_mode = PREPROCESS_EZBC; - printf("EZBC (Embedded Zero Block Coding) enabled for significance maps\n"); + case 1025: // --sigmap + enc->preprocess_mode = PREPROCESS_TWOBITMAP; break; case 1026: // --separate-audio-track enc->separate_audio_track = 1;