TAV: first working psychovisual tuning

2026-06-06 13:38:30 +09:00 · 2025-09-20 11:15:04 +09:00
parent d3a18c081a
commit 206e43a308
2 changed files with 186 additions and 47 deletions
--- a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
+++ b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
@@ -3889,10 +3889,97 @@ class GraphicsJSR223Delegate(private val vm: VM) {
    }

    private fun getPerceptualWeight(level: Int, subbandType: Int, isChroma: Boolean, maxLevels: Int): Float {
-        return 1f
+        // Psychovisual model based on DWT coefficient statistics and Human Visual System sensitivity

-        // Data-driven model based on coefficient variance analysis - MUST match encoder exactly
        if (!isChroma) {
+            // LUMA CHANNEL: Based on statistical analysis from real video content
+            when (subbandType) {
+                0 -> { // LL subband - contains most image energy, preserve carefully
+                    return when {
+                        level >= 6 -> 0.6f  // LL6: High energy but can tolerate moderate quantization (range up to 22K)
+                        level >= 5 -> 0.7f  // LL5: Good preservation
+                        else -> 0.8f        // Lower LL levels: Fine preservation
+                    }
+                }
+                1 -> { // LH subband - horizontal details (human eyes more sensitive)
+                    return when {
+                        level >= 6 -> 0.7f  // LH6: Significant coefficients (max ~500), preserve well
+                        level >= 5 -> 0.8f  // LH5: Moderate coefficients (max ~600)
+                        level >= 4 -> 1.0f  // LH4: Small coefficients (max ~50)
+                        level >= 3 -> 1.2f  // LH3: Very small coefficients, can quantize more
+                        level >= 2 -> 1.4f  // LH2: Minimal impact
+                        else -> 1.6f        // LH1: Least important
+                    }
+                }
+                2 -> { // HL subband - vertical details (less sensitive due to HVS characteristics)
+                    return when {
+                        level >= 6 -> 0.9f  // HL6: Can quantize more aggressively than LH6
+                        level >= 5 -> 1.0f  // HL5: Standard quantization
+                        level >= 4 -> 1.3f  // HL4: Notable range but less critical
+                        level >= 3 -> 1.5f  // HL3: Can tolerate more quantization
+                        level >= 2 -> 1.7f  // HL2: Less important
+                        else -> 2.0f        // HL1: Most aggressive for vertical details
+                    }
+                }
+                3 -> { // HH subband - diagonal details (least important for HVS)
+                    return when {
+                        level >= 6 -> 1.1f  // HH6: Preserve some diagonal detail
+                        level >= 5 -> 1.3f  // HH5: Can quantize aggressively
+                        level >= 4 -> 1.6f  // HH4: Very aggressive
+                        level >= 3 -> 2.0f  // HH3: Minimal preservation
+                        level >= 2 -> 2.2f  // HH2: Maximum compression
+                        else -> 2.5f        // HH1: Most aggressive quantization
+                    }
+                }
+                else -> 1.0f
+            }
+        } else {
+            // CHROMA CHANNELS: Less critical for human perception, more aggressive quantization
+            when (subbandType) {
+                0 -> { // LL chroma - still important but less than luma
+                    return when {
+                        level >= 6 -> 0.8f  // Chroma LL6: Less critical than luma LL
+                        level >= 5 -> 0.9f
+                        else -> 1.0f
+                    }
+                }
+                1 -> { // LH chroma - horizontal chroma details
+                    return when {
+                        level >= 6 -> 1.0f
+                        level >= 5 -> 1.2f
+                        level >= 4 -> 1.4f
+                        level >= 3 -> 1.6f
+                        level >= 2 -> 1.8f
+                        else -> 2.0f
+                    }
+                }
+                2 -> { // HL chroma - vertical chroma details (even less critical)
+                    return when {
+                        level >= 6 -> 1.2f
+                        level >= 5 -> 1.4f
+                        level >= 4 -> 1.6f
+                        level >= 3 -> 1.8f
+                        level >= 2 -> 2.0f
+                        else -> 2.2f
+                    }
+                }
+                3 -> { // HH chroma - diagonal chroma details (most aggressive)
+                    return when {
+                        level >= 6 -> 1.4f
+                        level >= 5 -> 1.6f
+                        level >= 4 -> 1.8f
+                        level >= 3 -> 2.1f
+                        level >= 2 -> 2.3f
+                        else -> 2.5f
+                    }
+                }
+                else -> 1.0f
+            }
+        }
+        return 1.0f
+
+        // Legacy data-driven model (kept for reference but not used)
+        /*if (!isChroma) {
            // Luma strategy based on statistical variance analysis from real video data
            return when (subbandType) {
                0 -> { // LL
@@ -3939,7 +4026,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
            // Chroma strategy - apply 0.85x reduction to luma weights for color preservation
            val lumaWeight = getPerceptualWeight(level, subbandType, false, maxLevels)
            return lumaWeight * 1.6f
-        }
+        }*/
    }

    // Helper function to calculate five-number summary for coefficient analysis
@@ -4027,7 +4114,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
        }
    }

-    private val tavDebugFrameTarget = 0 // use negative number to disable the debug print
+    private val tavDebugFrameTarget = -1 // use negative number to disable the debug print
    private var tavDebugCurrentFrameNumber = 0

    fun tavDecode(blockDataPtr: Long, currentRGBAddr: Long, prevRGBAddr: Long,
--- a/video_encoder/encoder_tav.c
+++ b/video_encoder/encoder_tav.c
@@ -799,51 +799,108 @@ static void quantise_dwt_coefficients(float *coeffs, int16_t *quantised, int siz

 // Get perceptual weight for specific subband - Data-driven model based on coefficient variance analysis
 static float get_perceptual_weight(int level, int subband_type, int is_chroma, int max_levels) {
-    // TEMPORARY: Test with uniform weights to verify linear layout works correctly
-    return 1.0f;
-
+    // Psychovisual model based on DWT coefficient statistics and Human Visual System sensitivity
    if (!is_chroma) {
-        // Luma strategy based on statistical variance analysis from real video data
-        if (subband_type == 0) { // LL
-            // LL6 has extremely high variance (Range=8026.7) but contains most image energy
-            // Moderate quantization appropriate due to high variance tolerance
-            return 1.1f;
-        } else if (subband_type == 1) { // LH (horizontal detail)
-            // Data-driven weights based on observed coefficient patterns
-            if (level >= 6) return 0.7f;      // LH6: significant coefficients (Range=243.1)
-            else if (level == 5) return 0.8f; // LH5: moderate coefficients (Range=264.3)
-            else if (level == 4) return 1.0f; // LH4: small coefficients (Range=50.8)
-            else if (level == 3) return 1.4f; // LH3: sparse but large outliers (Range=11909.1)
-            else if (level == 2) return 1.6f; // LH2: fewer coefficients (Range=6720.2)
-            else return 1.9f;                 // LH1: smallest detail (Range=1606.3)
-        } else if (subband_type == 2) { // HL (vertical detail)
-            // Similar pattern to LH but slightly different variance
-            if (level >= 6) return 0.8f;      // HL6: moderate coefficients (Range=181.6)
-            else if (level == 5) return 0.9f; // HL5: small coefficients (Range=80.4)
-            else if (level == 4) return 1.2f; // HL4: surprising large outliers (Range=9737.9)
-            else if (level == 3) return 1.3f; // HL3: very large outliers (Range=13698.2)
-            else if (level == 2) return 1.5f; // HL2: moderate range (Range=2099.4)
-            else return 1.8f;                 // HL1: small coefficients (Range=851.1)
-        } else { // HH (diagonal detail)
-            // HH bands generally have lower energy but important for texture
-            if (level >= 6) return 1.0f;      // HH6: some significant coefficients (Range=95.8)
-            else if (level == 5) return 1.1f; // HH5: small coefficients (Range=75.9)
-            else if (level == 4) return 1.3f; // HH4: moderate range (Range=89.8)
-            else if (level == 3) return 1.5f; // HH3: large outliers (Range=11611.2)
-            else if (level == 2) return 1.8f; // HH2: moderate range (Range=2499.2)
-            else return 2.1f;                 // HH1: smallest coefficients (Range=761.6)
+        // LUMA CHANNEL: Based on statistical analysis from real video content
+        if (subband_type == 0) { // LL subband - contains most image energy, preserve carefully
+            if (level >= 6) return 0.6f;  // LL6: High energy but can tolerate moderate quantization (range up to 22K)
+            if (level >= 5) return 0.7f;  // LL5: Good preservation
+            return 0.8f;                   // Lower LL levels: Fine preservation
+        } else if (subband_type == 1) { // LH subband - horizontal details (human eyes more sensitive)
+            if (level >= 6) return 0.7f;  // LH6: Significant coefficients (max ~500), preserve well
+            if (level >= 5) return 0.8f;  // LH5: Moderate coefficients (max ~600)
+            if (level >= 4) return 1.0f;  // LH4: Small coefficients (max ~50)
+            if (level >= 3) return 1.2f;  // LH3: Very small coefficients, can quantize more
+            if (level >= 2) return 1.4f;  // LH2: Minimal impact
+            return 1.6f;                   // LH1: Least important
+        } else if (subband_type == 2) { // HL subband - vertical details (less sensitive due to HVS characteristics)
+            if (level >= 6) return 0.9f;  // HL6: Can quantize more aggressively than LH6
+            if (level >= 5) return 1.0f;  // HL5: Standard quantization
+            if (level >= 4) return 1.3f;  // HL4: Notable range but less critical
+            if (level >= 3) return 1.5f;  // HL3: Can tolerate more quantization
+            if (level >= 2) return 1.7f;  // HL2: Less important
+            return 2.0f;                   // HL1: Most aggressive for vertical details
+        } else { // HH subband - diagonal details (least important for HVS)
+            if (level >= 6) return 1.1f;  // HH6: Preserve some diagonal detail
+            if (level >= 5) return 1.3f;  // HH5: Can quantize aggressively
+            if (level >= 4) return 1.6f;  // HH4: Very aggressive
+            if (level >= 3) return 2.0f;  // HH3: Minimal preservation
+            if (level >= 2) return 2.2f;  // HH2: Maximum compression
+            return 2.5f;                   // HH1: Most aggressive quantization
        }
    } else {
-        // Chroma strategy - apply 0.85x reduction to luma weights for color preservation
-        float luma_weight = get_perceptual_weight(level, subband_type, 0, max_levels);
-        return luma_weight * 0.85f;
+        // CHROMA CHANNELS: Less critical for human perception, more aggressive quantization
+        if (subband_type == 0) { // LL chroma - still important but less than luma
+            if (level >= 6) return 0.8f;  // Chroma LL6: Less critical than luma LL
+            if (level >= 5) return 0.9f;
+            return 1.0f;
+        } else if (subband_type == 1) { // LH chroma - horizontal chroma details
+            if (level >= 6) return 1.0f;
+            if (level >= 5) return 1.2f;
+            if (level >= 4) return 1.4f;
+            if (level >= 3) return 1.6f;
+            if (level >= 2) return 1.8f;
+            return 2.0f;
+        } else if (subband_type == 2) { // HL chroma - vertical chroma details (even less critical)
+            if (level >= 6) return 1.2f;
+            if (level >= 5) return 1.4f;
+            if (level >= 4) return 1.6f;
+            if (level >= 3) return 1.8f;
+            if (level >= 2) return 2.0f;
+            return 2.2f;
+        } else { // HH chroma - diagonal chroma details (most aggressive)
+            if (level >= 6) return 1.4f;
+            if (level >= 5) return 1.6f;
+            if (level >= 4) return 1.8f;
+            if (level >= 3) return 2.1f;
+            if (level >= 2) return 2.3f;
+            return 2.5f;
+        }
    }
 }

 // Determine perceptual weight for coefficient at linear position (matches actual DWT layout)
 static float get_perceptual_weight_for_position(int linear_idx, int width, int height, int decomp_levels, int is_chroma) {
-    // For now, return uniform weight while we figure out the actual DWT layout
-    // TODO: Map linear_idx to correct DWT subband and return appropriate weight
+    // Map linear coefficient index to DWT subband using same layout as decoder
+    int offset = 0;
+
+    // First: LL subband at maximum decomposition level
+    int ll_width = width >> decomp_levels;
+    int ll_height = height >> decomp_levels;
+    int ll_size = ll_width * ll_height;
+
+    if (linear_idx < offset + ll_size) {
+        // LL subband at maximum level - use get_perceptual_weight for consistency
+        return get_perceptual_weight(decomp_levels, 0, is_chroma, decomp_levels);
+    }
+    offset += ll_size;
+
+    // Then: LH, HL, HH subbands for each level from max down to 1
+    for (int level = decomp_levels; level >= 1; level--) {
+        int level_width = width >> (decomp_levels - level + 1);
+        int level_height = height >> (decomp_levels - level + 1);
+        int subband_size = level_width * level_height;
+
+        // LH subband (horizontal details)
+        if (linear_idx < offset + subband_size) {
+            return get_perceptual_weight(level, 1, is_chroma, decomp_levels);
+        }
+        offset += subband_size;
+
+        // HL subband (vertical details)
+        if (linear_idx < offset + subband_size) {
+            return get_perceptual_weight(level, 2, is_chroma, decomp_levels);
+        }
+        offset += subband_size;
+
+        // HH subband (diagonal details)
+        if (linear_idx < offset + subband_size) {
+            return get_perceptual_weight(level, 3, is_chroma, decomp_levels);
+        }
+        offset += subband_size;
+    }
+
+    // Fallback for out-of-bounds indices
    return 1.0f;
 }

@@ -2668,12 +2725,7 @@ int main(int argc, char *argv[]) {
        printf("Base quantiser: Y=%d, Co=%d, Cg=%d\n", enc->quantiser_y, enc->quantiser_co, enc->quantiser_cg);
    }
    if (enc->perceptual_tuning) {
-        printf("Perceptual weights: LL=%.1fx, LH/HL=%.1f-%.1fx, HH=%.1f-%.1fx (varies by level)\n",
-               get_perceptual_weight(enc->decomp_levels, 0, 0, enc->decomp_levels),
-               get_perceptual_weight(enc->decomp_levels, 1, 0, enc->decomp_levels),
-               get_perceptual_weight(1, 1, 0, enc->decomp_levels),
-               get_perceptual_weight(enc->decomp_levels, 3, 0, enc->decomp_levels),
-               get_perceptual_weight(1, 3, 0, enc->decomp_levels));
+        printf("Perceptual tuning enabled\n");
    }

    // Open output file