TAV: preset implementation

2026-06-06 13:38:30 +09:00 · 2025-11-24 17:40:45 +09:00
parent 6132012e74
commit 08bb33bf27
6 changed files with 152 additions and 66 deletions
--- a/video_encoder/decoder_tav.c
+++ b/video_encoder/decoder_tav.c
@@ -17,7 +17,7 @@
 #include "decoder_tad.h"  // Shared TAD decoder library
 #include "tav_avx512.h"  // AVX-512 SIMD optimisations

-#define DECODER_VENDOR_STRING "Decoder-TAV 20251124 (avx512)"
+#define DECODER_VENDOR_STRING "Decoder-TAV 20251124 (avx512,presets)"

 // TAV format constants
 #define TAV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x41\x56"
@@ -95,7 +95,8 @@ typedef struct {
    uint8_t encoder_quality;
    uint8_t channel_layout;
    uint8_t entropy_coder;
-    uint8_t reserved[2];
+    uint8_t encoder_preset;  // Byte 28: bit 0 = sports, bit 1 = anime
+    uint8_t reserved;
    uint8_t device_orientation;
    uint8_t file_role;
 } __attribute__((packed)) tav_header_t;
@@ -394,10 +395,20 @@ static inline float tav_grain_triangular_noise(uint32_t rng_val) {
    return (u1 + u2) - 1.0f;
 }

-// Remove grain synthesis from DWT coefficients (decoder subtracts noise)
+// Apply grain synthesis to DWT coefficients (decoder subtracts noise)
 // This must be called AFTER dequantisation but BEFORE inverse DWT
-static void remove_grain_synthesis_decoder(float *coeffs, int width, int height,
-                                          int decomp_levels, int frame_num, int q_y_global) {
+static void apply_grain_synthesis(float *coeffs, int width, int height,
+                                          int decomp_levels, int frame_num, int q_y_global, uint8_t encoder_preset, int no_grain_synthesis) {
+    // Command-line override: disable grain synthesis
+    if (no_grain_synthesis) {
+        return;  // Skip grain synthesis entirely
+    }
+
+    // Anime preset: completely disable grain synthesis
+    if (encoder_preset & 0x02) {
+        return;  // Skip grain synthesis entirely
+    }
+
    dwt_subband_info_t subbands[32];
    const int subband_count = calculate_subband_layout(width, height, decomp_levels, subbands);

@@ -412,7 +423,7 @@ static void remove_grain_synthesis_decoder(float *coeffs, int width, int height,
        // Calculate band index for RNG (matches Kotlin: level + subbandType * 31 + 16777619)
        uint32_t band = subband->level + subband->subband_type * 31 + 16777619;

-        // Remove noise from each coefficient in this subband
+        // Apply noise to each coefficient in this subband
        for (int i = 0; i < subband->coeff_count; i++) {
            const int idx = subband->coeff_start + i;
            if (idx < width * height) {
@@ -1226,14 +1237,14 @@ static int get_temporal_subband_level(int frame_idx, int num_frames, int tempora
 }

 // Calculate temporal quantiser scale for a given temporal subband level
-static float get_temporal_quantiser_scale(int temporal_level) {
+static float get_temporal_quantiser_scale(uint8_t encoder_preset, int temporal_level) {  // encoder_preset: preset flags from the header (bit 0 = sports)
    // Uses exponential scaling: 2^(BETA × level^KAPPA)
    // With BETA=0.6, KAPPA=1.14:
    //   - Level 0 (tLL):  2^0.0 = 1.00
    //   - Level 1 (tH):   2^0.68 = 1.61
    //   - Level 2 (tHH):  2^1.29 = 2.45
-    const float BETA = 0.6f;  // Temporal scaling exponent
-    const float KAPPA = 1.14f;
+    const float BETA = (encoder_preset & 0x01) ? 0.0f : 0.6f;  // Temporal scaling exponent; sports preset uses 0, so the scale is 1.0 at every level
+    const float KAPPA = (encoder_preset & 0x01) ? 1.0f : 1.14f;  // Has no effect on the result while BETA is 0
    return powf(2.0f, BETA * powf(temporal_level, KAPPA));
 }

@@ -1812,6 +1823,7 @@ typedef struct {
    int frame_size;
    int is_monoblock;           // True if version 3-6 (single tile mode)
    int temporal_motion_coder;  // Temporal wavelet: 0=Haar, 1=CDF 5/3 (extracted from version)
+    int no_grain_synthesis;     // Command-line flag: disable grain synthesis

    // Screen masking (letterbox/pillarbox) - array of geometry changes
    screen_mask_entry_t *screen_masks;
@@ -2023,10 +2035,11 @@ static int extract_audio_to_wav(const char *input_file, const char *wav_file, in
 // Decoder Initialisation and Cleanup
 //=============================================================================

-static tav_decoder_t* tav_decoder_init(const char *input_file, const char *output_file, const char *audio_file) {
+static tav_decoder_t* tav_decoder_init(const char *input_file, const char *output_file, const char *audio_file, int no_grain_synthesis) {
    tav_decoder_t *decoder = calloc(1, sizeof(tav_decoder_t));
    if (!decoder) return NULL;

+    decoder->no_grain_synthesis = no_grain_synthesis;
    decoder->input_fp = fopen(input_file, "rb");
    if (!decoder->input_fp) {
        free(decoder);
@@ -2511,8 +2524,9 @@ static int decode_i_or_p_frame(tav_decoder_t *decoder, uint8_t packet_type, uint

        // Remove grain synthesis from Y channel (must happen after dequantisation, before inverse DWT)
        // Phase 2: Use decoding dimensions and temporary buffer
-        remove_grain_synthesis_decoder(temp_dwt_y, decoder->decoding_width, decoder->decoding_height,
-                                      decoder->header.decomp_levels, decoder->frame_count, decoder->header.quantiser_y);
+        apply_grain_synthesis(temp_dwt_y, decoder->decoding_width, decoder->decoding_height,
+                                      decoder->header.decomp_levels, decoder->frame_count, decoder->header.quantiser_y,
+                                      decoder->header.encoder_preset, decoder->no_grain_synthesis);

        // Debug: Check LL band AFTER grain removal
 //        if (decoder->frame_count == 32) {
@@ -2712,10 +2726,11 @@ static void print_usage(const char *prog) {
    printf("Version: %s\n\n", DECODER_VENDOR_STRING);
    printf("Usage: %s -i input.tav -o output.mkv\n\n", prog);
    printf("Options:\n");
-    printf("  -i <file>    Input TAV file\n");
-    printf("  -o <file>    Output MKV file (optional, auto-generated from input)\n");
-    printf("  -v           Verbose output\n");
-    printf("  -h, --help   Show this help\n\n");
+    printf("  -i <file>              Input TAV file\n");
+    printf("  -o <file>              Output MKV file (optional, auto-generated from input)\n");
+    printf("  -v                     Verbose output\n");
+    printf("  --no-grain-synthesis   Disable grain synthesis (override encoder preset)\n");
+    printf("  -h, --help             Show this help\n\n");
    printf("Supported features (matches TSVM decoder):\n");
    printf("  - I-frames and P-frames (delta mode)\n");
    printf("  - GOP unified 3D DWT (temporal compression)\n");
@@ -2740,9 +2755,11 @@ int main(int argc, char *argv[]) {
    char *input_file = NULL;
    char *output_file = NULL;
    int verbose = 0;
+    int no_grain_synthesis = 0;

    static struct option long_options[] = {
        {"help", no_argument, 0, 'h'},
+        {"no-grain-synthesis", no_argument, 0, 1000},
        {0, 0, 0, 0}
    };

@@ -2761,6 +2778,12 @@ int main(int argc, char *argv[]) {
            case 'h':
                print_usage(argv[0]);
                return 0;
+            case 1000:  // --no-grain-synthesis
+                no_grain_synthesis = 1;
+                if (verbose) {
+                    printf("Grain synthesis disabled\n");
+                }
+                break;
            default:
                print_usage(argv[0]);
                return 1;
@@ -2819,7 +2842,7 @@ int main(int argc, char *argv[]) {
    }

    // Pass 2: Decode video with audio file
-    tav_decoder_t *decoder = tav_decoder_init(input_file, output_file, temp_audio_file);
+    tav_decoder_t *decoder = tav_decoder_init(input_file, output_file, temp_audio_file, no_grain_synthesis);
    if (!decoder) {
        fprintf(stderr, "Failed to initialise decoder\n");
        unlink(temp_audio_file);  // Clean up temp file
@@ -3126,7 +3149,7 @@ int main(int argc, char *argv[]) {
                    // EZBC mode with perceptual quantisation: coefficients are normalised
                    // Need to dequantise using perceptual weights (same as twobit-map mode)
                    const int temporal_level = get_temporal_subband_level(t, gop_size, temporal_levels);
-                    const float temporal_scale = get_temporal_quantiser_scale(temporal_level);
+                    const float temporal_scale = get_temporal_quantiser_scale(decoder->header.encoder_preset, temporal_level);

                    // FIX: Use QLUT to convert header quantiser indices to actual values
                    const float base_q_y = roundf(QLUT[decoder->header.quantiser_y] * temporal_scale);
@@ -3160,7 +3183,7 @@ int main(int argc, char *argv[]) {
                } else if (!is_ezbc) {
                    // Normal mode: multiply by quantiser
                    const int temporal_level = get_temporal_subband_level(t, gop_size, temporal_levels);
-                    const float temporal_scale = get_temporal_quantiser_scale(temporal_level);
+                    const float temporal_scale = get_temporal_quantiser_scale(decoder->header.encoder_preset, temporal_level);

                    // CRITICAL: Must ROUND temporal quantiser to match encoder's roundf() behavior
                    // FIX: Use QLUT to convert header quantiser indices to actual values
@@ -3206,9 +3229,10 @@ int main(int argc, char *argv[]) {

            // Phase 2: Use GOP dimensions (may be cropped) for grain removal
            for (int t = 0; t < gop_size; t++) {
-                remove_grain_synthesis_decoder(gop_y[t], gop_width, gop_height,
+                apply_grain_synthesis(gop_y[t], gop_width, gop_height,
                                              decoder->header.decomp_levels, decoder->frame_count + t,
-                                              decoder->header.quantiser_y);
+                                              decoder->header.quantiser_y, decoder->header.encoder_preset,
+                                              decoder->no_grain_synthesis);
            }

            // Apply inverse 3D DWT (spatial + temporal)
--- a/video_encoder/encoder_tav.c
+++ b/video_encoder/encoder_tav.c
@@ -19,7 +19,7 @@
 #include <float.h>
 #include "tav_avx512.h"  // AVX-512 SIMD optimisations

-#define ENCODER_VENDOR_STRING "Encoder-TAV 20251124 (3d-dwt,tad,ssf-tc,cdf53-motion,avx512)"
+#define ENCODER_VENDOR_STRING "Encoder-TAV 20251124 (3d-dwt,tad,ssf-tc,cdf53-motion,avx512,presets)"

 // TSVM Advanced Video (TAV) format constants
 #define TAV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x41\x56"  // "\x1FTSVM TAV"
@@ -1835,6 +1835,7 @@ typedef struct tav_encoder_s {
    int pcm8_audio; // 1 = use 8-bit PCM audio (packet 0x21), 0 = use MP2 (default)
    int tad_audio; // 1 = use TAD audio (packet 0x24), 0 = use MP2/PCM8 (default, quality follows quality_level)
    int enable_crop_encoding;    // 1 = encode cropped active region only (Phase 2), 0 = encode full frame (default)
+    uint8_t encoder_preset;      // Encoder preset flags: bit 0 = sports (finer temporal quantisation), bit 1 = anime (no grain)

    // Active region tracking (for Phase 2 crop encoding)
    uint16_t active_mask_top, active_mask_right, active_mask_bottom, active_mask_left;
@@ -2432,6 +2433,9 @@ static void show_usage(const char *program_name) {
    printf("  --dump-frame N          Dump quantised coefficients for frame N (creates .bin files)\n");
    printf("  --wavelet N             Wavelet filter: 0=LGT 5/3, 1=CDF 9/7, 2=CDF 13/7, 16=DD-4, 255=Haar (default: 1)\n");
    printf("  --zstd-level N          Zstd compression level 1-22 (default: %d, higher = better compression but slower)\n", DEFAULT_ZSTD_LEVEL);
+    printf("  --preset PRESET         Encoder presets (comma-separated, e.g., 'sports,anime'):\n");
+    printf("                            sports (or sport): Finer temporal quantisation for better motion detail\n");
+    printf("                            anime (or animation): Disable grain synthesis for cleaner animated content\n");
    printf("  --help                  Show this help\n\n");

    printf("Audio Rate by Quality:\n  ");
@@ -3355,8 +3359,9 @@ static void quantise_3d_dwt_coefficients(tav_encoder_t *enc,
                                        int spatial_size,
                                        int base_quantiser,
                                        int is_chroma) {
-    const float BETA = 0.6f;  // Temporal scaling exponent (aggressive for temporal high-pass)
-    const float KAPPA = 1.14f;
+    // Sports preset: use finer temporal quantisation (less aggressive)
+    const float BETA = (enc->encoder_preset & 0x01) ? 0.0f : 0.6f;
+    const float KAPPA = (enc->encoder_preset & 0x01) ? 1.0f : 1.14f;

    // Process each temporal subband independently (separable approach)
    for (int t = 0; t < num_frames; t++) {
@@ -7528,8 +7533,10 @@ static int write_tav_header(tav_encoder_t *enc) {
    // Entropy Coder (0 = Twobit-map, 1 = EZBC, 2 = Raw)
    fputc(enc->preprocess_mode, enc->output_fp);

-    // Reserved bytes (2 bytes)
-    fputc(0, enc->output_fp);
+    // Encoder Preset (byte 28): bit 0 = sports, bit 1 = anime
+    fputc(enc->encoder_preset, enc->output_fp);
+
+    // Reserved byte (1 byte)
    fputc(0, enc->output_fp);

    // Device Orientation (default: 0 = no rotation)
@@ -10775,6 +10782,7 @@ int main(int argc, char *argv[]) {
        {"tad-audio", no_argument, 0, 1028},
        {"raw-coeffs", no_argument, 0, 1029},
        {"single-pass", no_argument, 0, 1050},  // disable two-pass encoding with wavelet-based scene detection
+        {"preset", required_argument, 0, 1051},  // Encoder presets: sports, anime (comma-separated)
        {"enable-crop-encoding", no_argument, 0, 1052},  // Phase 2: encode cropped active region only (experimental)
        {"help", no_argument, 0, '?'},
        {0, 0, 0, 0}
@@ -11012,6 +11020,34 @@ int main(int argc, char *argv[]) {
                enc->two_pass_mode = 0;
                printf("Two-pass wavelet-based scene change detection disabled\n");
                break;
+            case 1051: { // --preset: comma-separated preset names, OR-ed into enc->encoder_preset
+                char *preset_str = strdup(optarg);  // strtok() writes into its input, so tokenise a private copy
+                if (!preset_str) fprintf(stderr, "Warning: out of memory, --preset ignored\n");
+                char *token = preset_str ? strtok(preset_str, ",") : NULL;  // never start strtok() on a NULL buffer
+                while (token != NULL) {
+                    // Trim leading/trailing whitespace
+                    while (*token == ' ' || *token == '\t') token++;
+                    char *end = token + strlen(token) - 1;
+                    while (end > token && (*end == ' ' || *end == '\t')) {
+                        *end = '\0';
+                        end--;
+                    }
+
+                    // Check for presets and aliases (bit 0 = sports, bit 1 = anime)
+                    if (strcmp(token, "sports") == 0 || strcmp(token, "sport") == 0) {
+                        enc->encoder_preset |= 0x01;
+                        printf("Preset 'sports' enabled: finer temporal quantisation (BETA=0.0, KAPPA=1.0)\n");  // values match quantise_3d_dwt_coefficients()
+                    } else if (strcmp(token, "anime") == 0 || strcmp(token, "animation") == 0) {
+                        enc->encoder_preset |= 0x02;
+                        printf("Preset 'anime' enabled: grain synthesis disabled\n");
+                    } else {
+                        fprintf(stderr, "Warning: Unknown preset '%s' (valid: sports, anime)\n", token);
+                    }
+                    token = strtok(NULL, ",");
+                }
+                free(preset_str);
+                break;
+            }
            case 1052: // --enable-crop-encoding
                enc->enable_crop_encoding = 1;
                printf("Phase 2 crop encoding enabled (experimental)\n");
--- a/video_encoder/tav_avx512.h
+++ b/video_encoder/tav_avx512.h
@@ -456,7 +456,7 @@ static inline void quantise_dwt_coefficients_avx512(
            quant = _mm512_mask_blend_ps(dead_mask, quant, zero_vec);
        }

-        // Manual rounding to match scalar behavior (round away from zero)
+        // Manual rounding to match scalar behaviour (round away from zero)
        // First add 0.5 or -0.5 based on sign
        __mmask16 pos_mask = _mm512_cmp_ps_mask(quant, zero_vec, _CMP_GE_OQ);
        __m512 round_val = _mm512_mask_blend_ps(pos_mask, nhalf_vec, half_vec);
@@ -510,7 +510,7 @@ static inline void quantise_dwt_coefficients_perceptual_avx512(
        __m512 effective_q = _mm512_mul_ps(base_q_vec, weight);
        __m512 quant = _mm512_div_ps(coeff, effective_q);

-        // Manual rounding to match scalar behavior
+        // Manual rounding to match scalar behaviour
        __mmask16 pos_mask = _mm512_cmp_ps_mask(quant, zero_vec, _CMP_GE_OQ);
        __m512 round_val = _mm512_mask_blend_ps(pos_mask, nhalf_vec, half_vec);
        quant = _mm512_add_ps(quant, round_val);
--- a/video_encoder/tav_inspector.c
+++ b/video_encoder/tav_inspector.c
@@ -514,6 +514,7 @@ int main(int argc, char *argv[]) {
        uint8_t quality = header[25];
        uint8_t channel_layout = header[26];
        uint8_t entropy_coder = header[27];
+        uint8_t encoder_preset = header[28];

 static const int QLUT[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120,122,124,126,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,264,272,280,288,296,304,312,320,328,336,344,352,360,368,376,384,392,400,408,416,424,432,440,448,456,464,472,480,488,496,504,512,528,544,560,576,592,608,624,640,656,672,688,704,720,736,752,768,784,800,816,832,848,864,880,896,912,928,944,960,976,992,1008,1024,1056,1088,1120,1152,1184,1216,1248,1280,1312,1344,1376,1408,1440,1472,1504,1536,1568,1600,1632,1664,1696,1728,1760,1792,1824,1856,1888,1920,1952,1984,2016,2048,2112,2176,2240,2304,2368,2432,2496,2560,2624,2688,2752,2816,2880,2944,3008,3072,3136,3200,3264,3328,3392,3456,3520,3584,3648,3712,3776,3840,3904,3968,4032,4096};
 static const char* CLAYOUT[] = {"Luma-Chroma", "Luma-Chroma-Alpha", "Luma", "Luma-Alpha", "Chroma", "Chroma-Alpha"};
@@ -548,6 +549,21 @@ static const char* TEMPORAL_WAVELET[] = {"Haar", "CDF 5/3"};
            printf("  Quality:          n/a\n");
        printf("  Channel layout:   %s\n", CLAYOUT[channel_layout]);
        printf("  Entropy coder:    %s\n", entropy_coder == 0 ? "Twobit-map" : "EZBC");
+        printf("  Encoder preset:   ");
+        if (encoder_preset == 0) {
+            printf("Default\n");
+        } else {
+            int first = 1;
+            if (encoder_preset & 0x01) {
+                printf("%sSports", first ? "" : ", ");
+                first = 0;
+            }
+            if (encoder_preset & 0x02) {
+                printf("%sAnime", first ? "" : ", ");
+                first = 0;
+            }
+            printf("\n");
+        }
        printf("  Flags:\n");
        printf("    Has audio:      %s\n", (extra_flags & 0x01) ? "Yes" : "No");
        printf("    Has subtitles:  %s\n", (extra_flags & 0x02) ? "Yes" : "No");