tav: librarying

2026-03-07 11:51:49 +09:00 · 2025-12-05 09:21:47 +09:00
parent 94ae24e9e4
commit dad1da741f
5 changed files with 56 additions and 13 deletions
--- a/video_encoder/include/tav_encoder_lib.h
+++ b/video_encoder/include/tav_encoder_lib.h
@@ -61,7 +61,7 @@ typedef struct {
    int quality_y;                // Luma quality (0-5, default: 3)
    int quality_co;               // Orange chrominance quality (0-5, default: 3)
    int quality_cg;               // Green chrominance quality (0-5, default: 3)
-    int dead_zone_threshold;      // Dead-zone quantization threshold (0=disabled, 1-10 typical)
+    float dead_zone_threshold;    // Dead-zone quantization threshold (0.0=disabled, 0.6-1.5 typical)

    // === Entropy Coding ===
    int entropy_coder;            // 0=Twobitmap (default), 1=EZBC (better for high-quality)
--- a/video_encoder/lib/libtavenc/tav_encoder_lib.c
+++ b/video_encoder/lib/libtavenc/tav_encoder_lib.c
@@ -43,6 +43,7 @@ static const int QLUT[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
 static const int QUALITY_Y[] = {79, 47, 23, 11, 5, 2};   // Quality levels 0-5
 static const int QUALITY_CO[] = {123, 108, 91, 76, 59, 29};
 static const int QUALITY_CG[] = {148, 133, 113, 99, 76, 39};
+static const float DEAD_ZONE_THRESHOLD[] = {1.5f, 1.5f, 1.2f, 1.1f, 0.8f, 0.6f, 0.0f};

 // Channel layout definitions (from TAV specification)
 #define CHANNEL_LAYOUT_YCOCG     0
@@ -87,10 +88,17 @@ struct tav_encoder_s {
    int quality_level;           // For perceptual quantization
    int *widths;                 // Subband widths array (per decomposition level)
    int *heights;                // Subband heights array (per decomposition level)
-    int dead_zone_threshold;     // Dead-zone quantization threshold
+    int decomp_levels;           // Number of spatial DWT decomposition levels
+    float dead_zone_threshold;   // Dead-zone quantization threshold
    int encoder_preset;          // Preset flags (sports mode, etc.)
    int temporal_decomp_levels;  // Temporal DWT levels
    int verbose;                 // Verbose output flag
+    int frame_count;             // Current frame number for encoding
+    float adjusted_quantiser_y_float;  // For bitrate control (if needed)
+    float dither_accumulator;    // Dither accumulator for bitrate mode
+    int width;                   // Frame width
+    int height;                  // Frame height
+    int perceptual_tuning;       // 1 = perceptual quantization, 0 = uniform
 };

 // GOP slot for circular buffering
@@ -282,7 +290,7 @@ void tav_encoder_params_init(tav_encoder_params_t *params, int width, int height
    params->quality_y = QUALITY_Y[3];    // 11 - quantiser index
    params->quality_co = QUALITY_CO[3];  // 76 - quantiser index
    params->quality_cg = QUALITY_CG[3];  // 99 - quantiser index
-    params->dead_zone_threshold = 0;     // Disabled by default
+    params->dead_zone_threshold = DEAD_ZONE_THRESHOLD[3];  // 1.1 for Q3

    // Compression
    params->entropy_coder = 1;         // EZBC as default
@@ -963,6 +971,13 @@ static tav_encoder_t *create_compat_encoder(tav_encoder_context_t *ctx) {
    enc->encoder_preset = ctx->encoder_preset;
    enc->temporal_decomp_levels = ctx->temporal_levels;
    enc->verbose = ctx->verbose;
+    enc->perceptual_tuning = ctx->perceptual_tuning;
+
+    // Copy frame dimensions (needed by quantisation functions)
+    enc->width = ctx->width;
+    enc->height = ctx->height;
+    enc->decomp_levels = ctx->decomp_levels;
+    enc->frame_count = 0;  // Will be updated during encoding

    // Calculate subband widths and heights arrays
    // These are needed by the perceptual quantization module
@@ -1319,11 +1334,11 @@ static int encode_gop_intra_only(tav_encoder_context_t *ctx, gop_slot_t *slot) {

    if (ctx->perceptual_tuning) {
        tav_quantise_perceptual(ctx->compat_enc, work_y, quant_y, num_pixels,
-                               base_quantiser_y, width, height, ctx->decomp_levels, 0, 0);
+                               base_quantiser_y, (float)ctx->dead_zone_threshold, width, height, ctx->decomp_levels, 0, 0);
        tav_quantise_perceptual(ctx->compat_enc, work_co, quant_co, num_pixels,
-                               base_quantiser_co, width, height, ctx->decomp_levels, 1, 0);
+                               base_quantiser_co, (float)ctx->dead_zone_threshold, width, height, ctx->decomp_levels, 1, 0);
        tav_quantise_perceptual(ctx->compat_enc, work_cg, quant_cg, num_pixels,
-                               base_quantiser_cg, width, height, ctx->decomp_levels, 1, 0);
+                               base_quantiser_cg, (float)ctx->dead_zone_threshold, width, height, ctx->decomp_levels, 1, 0);
    } else {
        tav_quantise_uniform(work_y, quant_y, num_pixels, base_quantiser_y,
                            (float)ctx->dead_zone_threshold, width, height,
@@ -1448,6 +1463,18 @@ static int encode_gop_unified(tav_encoder_context_t *ctx, gop_slot_t *slot) {
    int base_quantiser_co = QLUT[ctx->quantiser_co];
    int base_quantiser_cg = QLUT[ctx->quantiser_cg];

+    // CRITICAL: Use UNIFORM quantization for 3D DWT GOPs to match old encoder behavior
+    // The old encoder had a bug where decomp_levels=0 caused perceptual weights to fall back to 1.0
+    // This accidentally produced better results than true perceptual quantization
+    // Preserve this behavior for compatibility with decoder expectations
+    int saved_perceptual = ctx->compat_enc->perceptual_tuning;
+    ctx->compat_enc->perceptual_tuning = 0;  // Temporarily disable for GOP encoding
+
+    if (ctx->verbose) {
+        fprintf(stderr, "[DEBUG] GOP quantization: decomp_levels=%d, base_q_y=%d, perceptual=%d (forced uniform), preset=0x%02x\n",
+                ctx->compat_enc->decomp_levels, base_quantiser_y, ctx->compat_enc->perceptual_tuning, ctx->compat_enc->encoder_preset);
+    }
+
    tav_quantise_3d_dwt(ctx->compat_enc, work_y, quant_y, num_frames, num_pixels,
                       base_quantiser_y, 0);
    tav_quantise_3d_dwt(ctx->compat_enc, work_co, quant_co, num_frames, num_pixels,
@@ -1455,6 +1482,8 @@ static int encode_gop_unified(tav_encoder_context_t *ctx, gop_slot_t *slot) {
    tav_quantise_3d_dwt(ctx->compat_enc, work_cg, quant_cg, num_frames, num_pixels,
                       base_quantiser_cg, 1);

+    ctx->compat_enc->perceptual_tuning = saved_perceptual;  // Restore for I-frames
+
    // Step 4: Unified GOP preprocessing (EZBC only)
    size_t preprocess_capacity = num_pixels * num_frames * 3 * sizeof(int16_t) + 65536;
    uint8_t *preprocess_buffer = tav_malloc(preprocess_capacity);
--- a/video_encoder/lib/libtavenc/tav_encoder_quantize.c
+++ b/video_encoder/lib/libtavenc/tav_encoder_quantize.c
@@ -310,6 +310,7 @@ void tav_quantise_uniform(float *coeffs, int16_t *quantised, int size, int quant
 * @param quantised       Output quantized coefficients (int16_t)
 * @param size            Number of coefficients
 * @param base_quantiser  Base quantizer value (before perceptual weighting)
+ * @param dead_zone_threshold  Dead-zone threshold (0.0 = disabled)
 * @param width           Frame width
 * @param height          Frame height
 * @param decomp_levels   Number of decomposition levels
@@ -318,7 +319,7 @@ void tav_quantise_uniform(float *coeffs, int16_t *quantised, int size, int quant
 */
 void tav_quantise_perceptual(tav_encoder_t *enc,
                              float *coeffs, int16_t *quantised, int size,
-                              int base_quantiser, int width, int height,
+                              int base_quantiser, float dead_zone_threshold, int width, int height,
                              int decomp_levels, int is_chroma, int frame_count);

 /**
@@ -391,6 +392,7 @@ struct tav_encoder_s {
    float dither_accumulator;
    int width;
    int height;
+    int perceptual_tuning;
 };
 #endif

@@ -434,6 +436,11 @@ static float get_perceptual_weight(tav_encoder_t *enc, int level0, int subband_t
 }

 static float get_perceptual_weight_for_position(tav_encoder_t *enc, int linear_idx, int width, int height, int decomp_levels, int is_chroma) {
+    // If perceptual tuning is disabled, use uniform quantization (weight = 1.0)
+    if (!enc->perceptual_tuning) {
+        return 1.0f;
+    }
+
    // Map linear coefficient index to DWT subband using same layout as decoder
    int offset = 0;

@@ -525,7 +532,7 @@ void tav_quantise_uniform(float *coeffs, int16_t *quantised, int size, int quant

 void tav_quantise_perceptual(tav_encoder_t *enc,
                              float *coeffs, int16_t *quantised, int size,
-                              int base_quantiser, int width, int height,
+                              int base_quantiser, float dead_zone_threshold, int width, int height,
                              int decomp_levels, int is_chroma, int frame_count) {
    float effective_base_q = base_quantiser;
    effective_base_q = FCLAMP(effective_base_q, 1.0f, 4096.0f);
@@ -537,20 +544,20 @@ void tav_quantise_perceptual(tav_encoder_t *enc,
        float quantised_val = coeffs[i] / effective_q;

        // Apply dead-zone quantisation ONLY to luma channel
-        if (enc->dead_zone_threshold > 0.0f && !is_chroma) {
+        if (dead_zone_threshold > 0.0f && !is_chroma) {
            int level = get_subband_level(i, width, height, decomp_levels);
            int subband_type = get_subband_type(i, width, height, decomp_levels);
            float level_threshold = 0.0f;

            if (level == 1) {
                if (subband_type == 3) {
-                    level_threshold = enc->dead_zone_threshold * DEAD_ZONE_FINEST_SCALE;
+                    level_threshold = dead_zone_threshold * DEAD_ZONE_FINEST_SCALE;
                } else if (subband_type == 1 || subband_type == 2) {
-                    level_threshold = enc->dead_zone_threshold * DEAD_ZONE_FINE_SCALE;
+                    level_threshold = dead_zone_threshold * DEAD_ZONE_FINE_SCALE;
                }
            } else if (level == 2) {
                if (subband_type == 3) {
-                    level_threshold = enc->dead_zone_threshold * DEAD_ZONE_FINE_SCALE;
+                    level_threshold = dead_zone_threshold * DEAD_ZONE_FINE_SCALE;
                }
            }

@@ -583,12 +590,16 @@ void tav_quantise_3d_dwt(tav_encoder_t *enc,
        temporal_base_quantiser = CLAMP(temporal_base_quantiser, 1, 255);

        // Step 3: Apply spatial quantisation within this temporal subband
+        // Always route through tav_quantise_perceptual(); whether weighting is applied
+        // is governed by enc->perceptual_tuning, which the caller may clear temporarily
+        // (get_perceptual_weight_for_position() returns 1.0 when that field is 0).
        tav_quantise_perceptual(
            enc,
            gop_coeffs[t],           // Input: spatial coefficients for this temporal subband
            quantised[t],            // Output: quantised spatial coefficients
            spatial_size,            // Number of spatial coefficients
            temporal_base_quantiser, // Temporally-scaled base quantiser
+            enc->dead_zone_threshold, // Dead zone threshold
            enc->width,              // Frame width
            enc->height,             // Frame height
            enc->decomp_levels,      // Spatial decomposition levels
--- a/video_encoder/lib/libtavenc/tav_encoder_quantize.h
+++ b/video_encoder/lib/libtavenc/tav_encoder_quantize.h
@@ -64,6 +64,7 @@ void tav_quantise_uniform(float *coeffs, int16_t *quantised, int size, int quant
 * @param quantised       Output quantized coefficients (int16_t)
 * @param size            Number of coefficients
 * @param base_quantiser  Base quantizer value (before perceptual weighting)
+ * @param dead_zone_threshold  Dead-zone threshold (0.0 = disabled)
 * @param width           Frame width
 * @param height          Frame height
 * @param decomp_levels   Number of decomposition levels
@@ -72,7 +73,7 @@ void tav_quantise_uniform(float *coeffs, int16_t *quantised, int size, int quant
 */
 void tav_quantise_perceptual(tav_encoder_t *enc,
                              float *coeffs, int16_t *quantised, int size,
-                              int base_quantiser, int width, int height,
+                              int base_quantiser, float dead_zone_threshold, int width, int height,
                              int decomp_levels, int is_chroma, int frame_count);

 // =============================================================================
--- a/video_encoder/src/encoder_tav.c
+++ b/video_encoder/src/encoder_tav.c
@@ -49,6 +49,7 @@
 static const int QUALITY_Y[] = {79, 47, 23, 11, 5, 2};   // Quality levels 0-5
 static const int QUALITY_CO[] = {123, 108, 91, 76, 59, 29};
 static const int QUALITY_CG[] = {148, 133, 113, 99, 76, 39};
+static const float DEAD_ZONE_THRESHOLD[] = {1.5f, 1.5f, 1.2f, 1.1f, 0.8f, 0.6f, 0.0f};

 static char TEMP_AUDIO_FILE[TEMP_AUDIO_FILE_SIZE];
 static char TEMP_PCM_FILE[TEMP_PCM_FILE_SIZE];
@@ -1255,6 +1256,7 @@ int main(int argc, char *argv[]) {
                cli.enc_params.quality_y = QUALITY_Y[q];
                cli.enc_params.quality_co = QUALITY_CO[q];
                cli.enc_params.quality_cg = QUALITY_CG[q];
+                cli.enc_params.dead_zone_threshold = DEAD_ZONE_THRESHOLD[q];
                break;
            }
            case 'Q': {