TAV: base code for adding psychovisual model

2026-06-09 22:54:03 +09:00 · 2025-09-20 02:02:59 +09:00
parent c14b692114
commit d3a18c081a
4 changed files with 994 additions and 74 deletions
--- a/assets/disk0/tvdos/bin/playtav.js
+++ b/assets/disk0/tvdos/bin/playtav.js
@@ -425,7 +425,7 @@ for (let i = 0; i < 7; i++) {
    seqread.readOneByte()
 }
-if (header.version < 1 || header.version > 4) {
+if (header.version < 1 || header.version > 6) {
    printerrln(`Error: Unsupported TAV version ${header.version}`)
    errorlevel = 1
    return
--- a/terranmon.txt
+++ b/terranmon.txt
@@ -816,7 +816,7 @@ transmission capability, and region-of-interest coding.
 ## Header (32 bytes)
    uint8  Magic[8]: "\x1FTSVM TAV"
-    uint8  Version: 3 (YCoCg-R) or 4 (ICtCp)
+    uint8  Version: 3 (YCoCg-R uniform), 4 (ICtCp uniform), 5 (YCoCg-R perceptual), 6 (ICtCp perceptual)
    uint16 Width: video width in pixels  
    uint16 Height: video height in pixels
    uint8  FPS: frames per second
@@ -879,17 +879,48 @@ transmission capability, and region-of-interest coding.
  * Provides better energy compaction than 5/3 but lossy reconstruction
 ### Quantization Strategy
-TAV uses different quantization steps for each subband based on human visual
+
-system sensitivity:
+#### Uniform Quantization (Versions 3-4)
- LL subbands: Fine quantization (preserve DC and low frequencies)
+Traditional approach using same quantization factor for all DWT subbands within each channel.
- LH/HL subbands: Medium quantization (diagonal details less critical)  
+
- HH subbands: Coarse quantization (high frequency noise can be discarded)
+#### Perceptual Quantization (Versions 5-6, Default)
 TAV versions 5 and 6 implement Human Visual System (HVS) optimized quantization with
 frequency-aware subband weighting for superior visual quality:
 **Luma (Y) Channel Strategy:**
 - LL (lowest frequency): Base quantizer × 0.4 (finest preservation)
 - LH/HL at max level: Base quantizer × 0.6
 - HH at max level: Base quantizer × 1.0
 - Progressive increase toward higher frequencies down to level 1:
  - LH1/HL1: Base quantizer × 2.5
  - HH1: Base quantizer × 3.0
 **Chroma (Co/Cg) Channel Strategy:**
 - LL (lowest frequency): Base quantizer × 0.7 (less critical than luma)
 - LH/HL at max level: Base quantizer × 1.0
 - HH at max level: Base quantizer × 1.3
 - Progressive increase toward higher frequencies down to level 1:
  - HH1: Base quantizer × 2.2
 This perceptual approach allocates more bits to visually important low-frequency
 details while aggressively quantizing high-frequency noise, resulting in superior
 visual quality at equivalent bitrates.
 ## Colour Space
-TAV operates in YCoCg-R colour space with full resolution channels:
+TAV supports two colour spaces:
- Y: Luma channel (full resolution, fine quantization)
+
- Co: Orange-Cyan chroma (full resolution, aggressive quantization by default)  
+**YCoCg-R (Versions 3, 5):**
- Cg: Green-Magenta chroma (full resolution, very aggressive quantization by default)
+- Y: Luma channel (full resolution)
 - Co: Orange-Cyan chroma (full resolution)
 - Cg: Green-Magenta chroma (full resolution)
 **ICtCp (Versions 4, 6):**
 - I: Intensity (similar to luma)
 - Ct: Chroma tritanopia
 - Cp: Chroma protanopia
 Perceptual versions (5-6) apply HVS-optimized quantization weights per channel,
 while uniform versions (3-4) use consistent quantization across all subbands.
 ## Compression Features
 - Single DWT tiles vs 16x16 DCT blocks in TEV
@@ -903,7 +934,8 @@ Expected improvements over TEV:
 - Reduced blocking artifacts
 - Scalable quality/resolution decoding
 - Better performance on natural images vs artificial content
- Full resolution chroma preserves color detail while aggressive quantization maintains compression
+- **Perceptual versions (5-6)**: Superior visual quality through HVS-optimized bit allocation
 - **Uniform versions (3-4)**: Backward compatibility with traditional quantization
 ## Hardware Acceleration Functions
 TAV decoder requires new GraphicsJSR223Delegate functions:
--- a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
+++ b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
@@ -1,7 +1,6 @@
 package net.torvald.tsvm
 import com.badlogic.gdx.graphics.Pixmap
 import com.badlogic.gdx.math.MathUtils.*
 import com.badlogic.gdx.math.MathUtils.PI
 import com.badlogic.gdx.math.MathUtils.ceil
 import com.badlogic.gdx.math.MathUtils.floor
@@ -33,6 +32,15 @@ class GraphicsJSR223Delegate(private val vm: VM) {
    private var tavPreviousCoeffsCo: MutableMap<Int, FloatArray>? = null
    private var tavPreviousCoeffsCg: MutableMap<Int, FloatArray>? = null
    // TAV Perceptual dequantization support (must match encoder weights)
    /**
     * Describes one DWT subband as a contiguous run of entries in a linear coefficient array.
     *
     * A subband occupies the indices `coeffStart until coeffStart + coeffCount`.
     *
     * NOTE(review): the layout builder visible in this file constructs every record with
     * `perceptualWeight = 0f` and the dequantiser recomputes the weight itself, so this field
     * looks like a placeholder for now — confirm before relying on it.
     */
    data class DWTSubbandInfo(
        val level: Int,          // Decomposition level (1 to decompLevels)
        val subbandType: Int,    // 0=LL, 1=LH, 2=HL, 3=HH
        val coeffStart: Int,     // Starting index in linear coefficient array
        val coeffCount: Int,     // Number of coefficients in this subband
        val perceptualWeight: Float // Quantization multiplier for this subband
    )
    /**
     * Looks up the peripheral registered under [VM.PERITYPE_GPU_AND_TERM] and returns it as a
     * [GraphicsAdapter], or `null` when the lookup yields nothing or the peripheral is of another type.
     */
    private fun getFirstGPU(): GraphicsAdapter? =
        vm.findPeribyType(VM.PERITYPE_GPU_AND_TERM)?.peripheral as? GraphicsAdapter
@@ -1325,10 +1333,10 @@ class GraphicsJSR223Delegate(private val vm: VM) {
     * @param rgbAddr Source RGB buffer (24-bit: R,G,B bytes)
     * @param width Frame width
     * @param height Frame height
-     * @param frameCounter Frame counter for dithering
+     * @param frameCount Frame counter for dithering
     */
-    fun uploadRGBToFramebuffer(rgbAddr: Long, width: Int, height: Int, frameCounter: Int) {
+    fun uploadRGBToFramebuffer(rgbAddr: Long, width: Int, height: Int, frameCount: Int) {
-        uploadRGBToFramebuffer(rgbAddr, width, height, frameCounter, false)
+        uploadRGBToFramebuffer(rgbAddr, width, height, frameCount, false)
    }
    /**
@@ -1398,10 +1406,10 @@ class GraphicsJSR223Delegate(private val vm: VM) {
     * @param rgbAddr Source RGB buffer (24-bit: R,G,B bytes)
     * @param width Frame width
     * @param height Frame height
-     * @param frameCounter Frame counter for dithering
+     * @param frameCount Frame counter for dithering
     * @param resizeToFull If true, resize video to fill entire screen; if false, center video
     */
-    fun uploadRGBToFramebuffer(rgbAddr: Long, width: Int, height: Int, frameCounter: Int, resizeToFull: Boolean) {
+    fun uploadRGBToFramebuffer(rgbAddr: Long, width: Int, height: Int, frameCount: Int, resizeToFull: Boolean) {
        val gpu = (vm.peripheralTable[1].peripheral as GraphicsAdapter)
        val rgbAddrIncVec = if (rgbAddr >= 0) 1 else -1
@@ -1444,9 +1452,9 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                    val b = rgb[2]
                    // Apply Bayer dithering and convert to 4-bit using native coordinates
-                    val r4 = ditherValue(r, nativeX, nativeY, frameCounter)
+                    val r4 = ditherValue(r, nativeX, nativeY, frameCount)
-                    val g4 = ditherValue(g, nativeX, nativeY, frameCounter)
+                    val g4 = ditherValue(g, nativeX, nativeY, frameCount)
-                    val b4 = ditherValue(b, nativeX, nativeY, frameCounter)
+                    val b4 = ditherValue(b, nativeX, nativeY, frameCount)
                    // Pack and store in chunk buffers
                    rgChunk[i] = ((r4 shl 4) or g4).toByte()
@@ -1507,9 +1515,9 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                    val b = rgbBulkBuffer[rgbIndex + 2].toUint()
                    // Apply Bayer dithering and convert to 4-bit
-                    val r4 = ditherValue(r, videoX, videoY, frameCounter)
+                    val r4 = ditherValue(r, videoX, videoY, frameCount)
-                    val g4 = ditherValue(g, videoX, videoY, frameCounter)
+                    val g4 = ditherValue(g, videoX, videoY, frameCount)
-                    val b4 = ditherValue(b, videoX, videoY, frameCounter)
+                    val b4 = ditherValue(b, videoX, videoY, frameCount)
                    // Pack RGB values and store in chunk arrays for batch processing
                    val validIndex = i
@@ -2505,10 +2513,10 @@ class GraphicsJSR223Delegate(private val vm: VM) {
     * @param width Frame width in pixels
     * @param height Frame height in pixels
     * @param quality Quantisation quality level (0-7)
-     * @param frameCounter Frame counter for temporal patterns
+     * @param frameCount Frame counter for temporal patterns
     */
    fun tevDecode(blockDataPtr: Long, currentRGBAddr: Long, prevRGBAddr: Long,
-                  width: Int, height: Int, qY: Int, qCo: Int, qCg: Int, frameCounter: Int,
+                  width: Int, height: Int, qY: Int, qCo: Int, qCg: Int, frameCount: Int,
                  debugMotionVectors: Boolean = false, tevVersion: Int = 2,
                  enableDeblocking: Boolean = true, enableBoundaryAwareDecoding: Boolean = false) {
@@ -3004,9 +3012,9 @@ class GraphicsJSR223Delegate(private val vm: VM) {
        }
    }
-    fun tevDeinterlace(frameCounter: Int, width: Int, height: Int, prevField: Long, currentField: Long, nextField: Long, outputRGB: Long, algorithm: String = "yadif") {
+    fun tevDeinterlace(frameCount: Int, width: Int, height: Int, prevField: Long, currentField: Long, nextField: Long, outputRGB: Long, algorithm: String = "yadif") {
        // Apply selected deinterlacing algorithm: field -> progressive frame
-        val fieldParity = (frameCounter + 1) % 2
+        val fieldParity = (frameCount + 1) % 2
        when (algorithm.lowercase()) {
            "bwdif" -> {
@@ -3815,15 +3823,224 @@ class GraphicsJSR223Delegate(private val vm: VM) {
    // ================= TAV (TSVM Advanced Video) Decoder =================
    // DWT-based video codec with ICtCp colour space support
    // TAV Perceptual dequantization helper functions (must match encoder implementation exactly)
    /**
     * Builds the list of subband records for a `width` x `height` tile decomposed `decompLevels` times.
     *
     * The first record is the LL band: it starts at index 0 and holds
     * `(width shr decompLevels) * (height shr decompLevels)` coefficients. It is followed, for each
     * `level` from `decompLevels` down to 1, by an LH, an HL and an HH record (types 1, 2, 3), each
     * holding `(width shr s) * (height shr s)` coefficients with `s = decompLevels - level + 1`,
     * packed back to back. Every record carries a placeholder weight of `0f`.
     *
     * NOTE(review): with that shift the records labelled `level == decompLevels` get the
     * half-resolution size and the records labelled `level == 1` get the smallest size. The shift
     * formula is said to mirror the encoder, which is not visible here — confirm before changing it.
     *
     * When the current debug frame equals the debug target frame, a coverage report (totals,
     * overlaps, gaps) is printed to stdout.
     *
     * @param width tile width in coefficients
     * @param height tile height in coefficients
     * @param decompLevels number of decomposition levels
     * @return the subband records in array order
     */
    private fun calculateSubbandLayout(width: Int, height: Int, decompLevels: Int): List<DWTSubbandInfo> {
        val layout = mutableListOf<DWTSubbandInfo>()

        // LL band sits at the very start of the array.
        val llCount = (width shr decompLevels) * (height shr decompLevels)
        layout += DWTSubbandInfo(decompLevels, 0, 0, llCount, 0f)

        // Detail bands follow, three per level, each starting where the previous one ended.
        var nextStart = llCount
        for (level in decompLevels downTo 1) {
            val shift = decompLevels - level + 1
            val bandCount = (width shr shift) * (height shr shift)
            for (bandType in 1..3) { // 1=LH, 2=HL, 3=HH
                layout += DWTSubbandInfo(level, bandType, nextStart, bandCount, 0f)
                nextStart += bandCount
            }
        }

        // Everything below is diagnostics for the targeted debug frame only.
        if (tavDebugCurrentFrameNumber != tavDebugFrameTarget) return layout

        val expectedTotal = width * height
        val actualTotal = layout.sumOf { it.coeffCount }
        val maxIndex = layout.maxOfOrNull { it.coeffStart + it.coeffCount - 1 } ?: -1

        println("SUBBAND LAYOUT VALIDATION:")
        println("  Expected coeffs: $expectedTotal (${width}x${height})")
        println("  Actual coeffs: $actualTotal")
        println("  Max index: $maxIndex")
        println("  Decomp levels: $decompLevels")

        // Mark every index claimed by a band; a second claim is an overlap, an unclaimed slot is a gap.
        val claimed = BooleanArray(expectedTotal)
        var overlaps = 0
        for (band in layout) {
            for (idx in band.coeffStart until band.coeffStart + band.coeffCount) {
                if (idx < claimed.size) {
                    if (claimed[idx]) overlaps++
                    claimed[idx] = true
                }
            }
        }
        val gaps = claimed.count { !it }
        println("  Overlaps: $overlaps, Gaps: $gaps")

        val layoutBroken = gaps > 0 || overlaps > 0 || actualTotal != expectedTotal
        if (layoutBroken) {
            println("  ERROR: Subband layout is incorrect!")
        }

        return layout
    }
    /**
     * Returns the multiplier applied to the base quantiser for one DWT subband.
     *
     * As written this function always returns `1f`: its first statement is an unconditional
     * `return 1f`, so the weight table below it is unreachable and perceptual dequantisation
     * currently behaves like uniform dequantisation.
     *
     * The unreachable table would return a per-level weight for luma (`isChroma == false`) and the
     * luma weight multiplied by `1.6f` for chroma.
     *
     * @param level decomposition level of the subband
     * @param subbandType 0=LL, 1=LH, 2=HL, 3=HH (any other value maps to `1.0f` in the table)
     * @param isChroma selects the chroma branch of the table
     * @param maxLevels upper end of the `6..maxLevels` range used by the per-level lookup
     * @return the quantiser multiplier; currently always `1f`
     */
    private fun getPerceptualWeight(level: Int, subbandType: Int, isChroma: Boolean, maxLevels: Int): Float {
        // NOTE(review): unconditional early return — everything after this line is dead code.
        // Presumably a deliberate switch-off while the model is being developed; the encoder must
        // use the same weights, so confirm against the encoder before removing this line.
        return 1f
        // Data-driven model based on coefficient variance analysis - MUST match encoder exactly
        if (!isChroma) {
            // Luma strategy based on statistical variance analysis from real video data
            return when (subbandType) {
                0 -> { // LL
                    // LL6 has extremely high variance (Range=8026.7) but contains most image energy
                    // Moderate quantization appropriate due to high variance tolerance
                    1.1f
                }
                1 -> { // LH (horizontal detail)
                    // Data-driven weights based on observed coefficient patterns
                    when (level) {
                        in 6..maxLevels -> 0.7f      // LH6: significant coefficients (Range=243.1)
                        5 -> 0.8f      // LH5: moderate coefficients (Range=264.3)
                        4 -> 1.0f      // LH4: small coefficients (Range=50.8)
                        3 -> 1.4f      // LH3: sparse but large outliers (Range=11909.1)
                        2 -> 1.6f      // LH2: fewer coefficients (Range=6720.2)
                        else -> 1.9f   // LH1: smallest detail (Range=1606.3)
                    }
                }
                2 -> { // HL (vertical detail)
                    // Similar pattern to LH but slightly different variance
                    when (level) {
                        in 6..maxLevels -> 0.8f      // HL6: moderate coefficients (Range=181.6)
                        5 -> 0.9f      // HL5: small coefficients (Range=80.4)
                        4 -> 1.2f      // HL4: surprising large outliers (Range=9737.9)
                        3 -> 1.3f      // HL3: very large outliers (Range=13698.2)
                        2 -> 1.5f      // HL2: moderate range (Range=2099.4)
                        else -> 1.8f   // HL1: small coefficients (Range=851.1)
                    }
                }
                3 -> { // HH (diagonal detail)
                    // HH bands generally have lower energy but important for texture
                    when (level) {
                        in 6..maxLevels -> 1.0f      // HH6: some significant coefficients (Range=95.8)
                        5 -> 1.1f      // HH5: small coefficients (Range=75.9)
                        4 -> 1.3f      // HH4: moderate range (Range=89.8)
                        3 -> 1.5f      // HH3: large outliers (Range=11611.2)
                        2 -> 1.8f      // HH2: moderate range (Range=2499.2)
                        else -> 2.1f   // HH1: smallest coefficients (Range=761.6)
                    }
                }
                else -> 1.0f
            }
        } else {
            // Chroma strategy - scale the luma weight by 1.6x, i.e. chroma is quantised more
            // coarsely than luma. (An earlier comment here spoke of a "0.85x reduction", which
            // does not match the factor in the code.)
            val lumaWeight = getPerceptualWeight(level, subbandType, false, maxLevels)
            return lumaWeight * 1.6f
        }
    }
    /**
     * Formats a five-number summary (min, Q1, median, Q3, max) plus the sample count for
     * coefficient analysis.
     *
     * Quartiles are picked by index from the sorted values (`n / 4` and `3 * n / 4`); with fewer
     * than four samples Q1 and Q3 fall back to the minimum and maximum. The median of an
     * even-sized sample is the mean of the two middle values.
     *
     * @param values samples to summarise; may be empty
     * @return `"empty"` for an empty list, otherwise
     *         `"min=…, Q1=…, med=…, Q3=…, max=…, n=…"` with the median printed to one decimal place
     */
    private fun calculateFiveNumberSummary(values: List<Int>): String {
        if (values.isEmpty()) return "empty"

        val sorted = values.sorted()
        val n = sorted.size
        val min = sorted.first()
        val max = sorted.last()

        // The median must always be a Double: "%.1f" rejects a boxed Int with
        // IllegalFormatConversionException, which is what happened for every odd-sized sample.
        val median: Double = if (n % 2 == 1) {
            sorted[n / 2].toDouble()
        } else {
            // Widen to Long so the sum of the two middle values cannot wrap around.
            (sorted[n / 2 - 1].toLong() + sorted[n / 2]) / 2.0
        }

        val q1 = if (n >= 4) sorted[n / 4] else min
        val q3 = if (n >= 4) sorted[3 * n / 4] else max

        return "min=$min, Q1=$q1, med=%.1f, Q3=$q3, max=$max, n=$n".format(median)
    }
    /**
     * Dequantises one channel subband by subband.
     *
     * The output array is cleared first. Then, for every subband, each coefficient inside both
     * arrays' bounds is written as `quantised[idx] * (baseQuantizer * weight)`, where `weight`
     * comes from [getPerceptualWeight]. Indices not covered by any subband stay at zero.
     *
     * When the current debug frame equals the debug target frame, per-subband statistics and a
     * coverage report are printed to stdout.
     *
     * @param quantised quantised coefficients (input)
     * @param dequantised receives the dequantised coefficients (fully overwritten)
     * @param subbands subband records describing index ranges in the coefficient arrays
     * @param baseQuantizer channel quantiser before per-subband weighting
     * @param isChroma `true` for chroma channels, `false` for luma
     * @param decompLevels number of decomposition levels, forwarded to the weight lookup
     */
    private fun dequantiseDWTSubbandsPerceptual(quantised: ShortArray, dequantised: FloatArray,
                                               subbands: List<DWTSubbandInfo>, baseQuantizer: Float, isChroma: Boolean, decompLevels: Int) {
        val debugThisFrame = tavDebugCurrentFrameNumber == tavDebugFrameTarget
        val bandNames = arrayOf("LL", "LH", "HL", "HH")

        // Start from all zeroes so anything no subband touches is easy to spot.
        dequantised.fill(0.0f)

        // An index is usable only when it exists in both arrays.
        val usableSize = minOf(quantised.size, dequantised.size)
        var totalProcessed = 0
        var maxIdx = -1

        for (band in subbands) {
            val weight = getPerceptualWeight(band.level, band.subbandType, isChroma, decompLevels)
            // Same effective step the encoder is expected to have used for this subband.
            val effectiveQuantizer = baseQuantizer * weight
            val bandRange = band.coeffStart until band.coeffStart + band.coeffCount

            if (debugThisFrame) {
                // Five-number summary of the raw quantised values in this subband.
                val coeffValues = bandRange
                    .filter { it < quantised.size }
                    .map { quantised[it].toInt() }
                val subbandTypeName = bandNames.getOrElse(band.subbandType) { "??" }
                val channelType = if (isChroma) "Chroma" else "Luma"
                val summary = calculateFiveNumberSummary(coeffValues)
                println("SUBBAND STATS: $channelType ${subbandTypeName}${band.level} weight=${weight} effectiveQ=${effectiveQuantizer} - $summary")
            }

            for (idx in bandRange) {
                if (idx >= usableSize) continue
                dequantised[idx] = quantised[idx] * effectiveQuantizer
                totalProcessed++
                maxIdx = maxOf(maxIdx, idx)
            }
        }

        if (!debugThisFrame) return

        val channelType = if (isChroma) "Chroma" else "Luma"
        println("COEFFICIENT COVERAGE: $channelType - processed=$totalProcessed, maxIdx=$maxIdx, arraySize=${dequantised.size}")

        // A zero output over a non-zero input means the coefficient was skipped.
        val scanEnd = minOf(maxIdx + 1, dequantised.size)
        val zeroCount = (0 until scanEnd).count { dequantised[it] == 0.0f && quantised[it] != 0.toShort() }
        if (zeroCount > 0) {
            println("WARNING: $zeroCount coefficients were not processed but should have been!")
        }
    }
    // Frame number for which the TAV debug prints are emitted.
    // NOTE(review): the value 0 leaves the prints enabled for frame 0 — confirm this is intended
    // outside of development.
    private val tavDebugFrameTarget = 0 // use negative number to disable the debug print
    // Frame number of the frame being decoded; tavDecode assigns its frameCount argument here on entry.
    private var tavDebugCurrentFrameNumber = 0
    fun tavDecode(blockDataPtr: Long, currentRGBAddr: Long, prevRGBAddr: Long,
-                  width: Int, height: Int, qYGlobal: Int, qCoGlobal: Int, qCgGlobal: Int, frameCounter: Int,
+                  width: Int, height: Int, qYGlobal: Int, qCoGlobal: Int, qCgGlobal: Int, frameCount: Int,
                  waveletFilter: Int = 1, decompLevels: Int = 6, isLossless: Boolean = false, tavVersion: Int = 1) {
        tavDebugCurrentFrameNumber = frameCount
        var readPtr = blockDataPtr
        try {
            // Determine if monoblock mode based on TAV version
-            val isMonoblock = (tavVersion == 3 || tavVersion == 4)
+            val isMonoblock = (tavVersion == 3 || tavVersion == 4 || tavVersion == 5 || tavVersion == 6)
            val tilesX: Int
            val tilesY: Int
@@ -3849,7 +4066,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                    val qCg = vm.peek(readPtr++).toUint().let { if (it == 0) qCgGlobal else it }
                    // debug print: raw decompressed bytes
-                    /*print("TAV Decode raw bytes (Frame $frameCounter, mode: ${arrayOf("SKIP", "INTRA", "DELTA")[mode]}): ")
+                    /*print("TAV Decode raw bytes (Frame $frameCount, mode: ${arrayOf("SKIP", "INTRA", "DELTA")[mode]}): ")
                    for (i in 0 until 32) {
                        print("${vm.peek(blockDataPtr + i).toUint().toString(16).uppercase().padStart(2, '0')} ")
                    }
@@ -3927,10 +4144,155 @@ class GraphicsJSR223Delegate(private val vm: VM) {
        val coTile = FloatArray(coeffCount)
        val cgTile = FloatArray(coeffCount)
-        for (i in 0 until coeffCount) {
+        // Check if perceptual quantization is used (versions 5 and 6)
-            yTile[i] = quantisedY[i] * qY.toFloat()
+        val isPerceptual = (tavVersion == 5 || tavVersion == 6)
-            coTile[i] = quantisedCo[i] * qCo.toFloat()
+
-            cgTile[i] = quantisedCg[i] * qCg.toFloat()
+        // Debug: Print version detection for frame 120
        if (tavDebugCurrentFrameNumber == tavDebugFrameTarget) {
            println("[VERSION-DEBUG-INTRA] Frame $tavDebugCurrentFrameNumber - TAV version: $tavVersion, isPerceptual: $isPerceptual")
        }
        if (isPerceptual) {
            // Perceptual dequantization with subband-specific weights
            val tileWidth = if (isMonoblock) width else PADDED_TILE_SIZE_X
            val tileHeight = if (isMonoblock) height else PADDED_TILE_SIZE_Y
            val subbands = calculateSubbandLayout(tileWidth, tileHeight, decompLevels)
            dequantiseDWTSubbandsPerceptual(quantisedY, yTile, subbands, qY.toFloat(), false, decompLevels)
            dequantiseDWTSubbandsPerceptual(quantisedCo, coTile, subbands, qCo.toFloat(), true, decompLevels)
            dequantiseDWTSubbandsPerceptual(quantisedCg, cgTile, subbands, qCg.toFloat(), true, decompLevels)
            // Debug: Check coefficient values before inverse DWT
            if (tavDebugCurrentFrameNumber == tavDebugFrameTarget) {
                var maxYDequant = 0.0f
                var nonzeroY = 0
                for (coeff in yTile) {
                    if (coeff != 0.0f) {
                        nonzeroY++
                        if (kotlin.math.abs(coeff) > maxYDequant) {
                            maxYDequant = kotlin.math.abs(coeff)
                        }
                    }
                }
                println("[DECODER-INTRA] Frame $tavDebugCurrentFrameNumber - Before IDWT: Y max=${maxYDequant.toInt()}, nonzero=$nonzeroY")
                // Debug: Check if subband layout is correct - print actual coefficient positions
                println("PERCEPTUAL SUBBAND LAYOUT DEBUG:")
                println("  Total coeffs: ${yTile.size}, Decomp levels: $decompLevels, Tile size: ${tileWidth}x${tileHeight}")
                for (subband in subbands) {
                    if (subband.level <= 6) { // LH, HL, HH for levels 1-2
                        var sampleCoeffs = 0
                        val coeffCount = minOf(1000, subband.coeffCount)
                        for (i in 0 until coeffCount) { // Sample first 100 coeffs
                            val idx = subband.coeffStart + i
                            if (idx < yTile.size && yTile[idx] != 0.0f) {
                                sampleCoeffs++
                            }
                        }
                        val subbandName = when(subband.subbandType) {
                            0 -> "LL${subband.level}"
                            1 -> "LH${subband.level}"
                            2 -> "HL${subband.level}"
                            3 -> "HH${subband.level}"
                            else -> "??${subband.level}"
                        }
                        println("  $subbandName: start=${subband.coeffStart}, count=${subband.coeffCount}, sample_nonzero=$sampleCoeffs/$coeffCount")
                        // Debug: Print first few RAW QUANTIZED values for comparison (before dequantization)
                        print("    $subbandName raw_quant: ")
                        for (i in 0 until minOf(32, subband.coeffCount)) {
                            val idx = subband.coeffStart + i
                            if (idx < quantisedY.size) {
                                print("${quantisedY[idx]} ")
                            }
                        }
                        println()
                    }
                }
            }
        } else {
            // Uniform dequantization for versions 3 and 4
            for (i in 0 until coeffCount) {
                yTile[i] = quantisedY[i] * qY.toFloat()
                coTile[i] = quantisedCo[i] * qCo.toFloat()
                cgTile[i] = quantisedCg[i] * qCg.toFloat()
            }
            // Debug: Uniform quantization subband analysis for comparison
            if (tavDebugCurrentFrameNumber == tavDebugFrameTarget) {
                val tileWidth = if (isMonoblock) width else PADDED_TILE_SIZE_X
                val tileHeight = if (isMonoblock) height else PADDED_TILE_SIZE_Y
                val subbands = calculateSubbandLayout(tileWidth, tileHeight, decompLevels)
                // Comprehensive five-number summary for uniform quantization baseline
                for (subband in subbands) {
                    // Collect all quantized coefficient values for this subband (luma only for baseline)
                    val coeffValues = mutableListOf<Int>()
                    for (i in 0 until subband.coeffCount) {
                        val idx = subband.coeffStart + i
                        if (idx < quantisedY.size) {
                            val quantVal = quantisedY[idx].toInt()
                            coeffValues.add(quantVal)
                        }
                    }
                    // Calculate and print five-number summary for uniform mode
                    val subbandTypeName = when (subband.subbandType) {
                        0 -> "LL"
                        1 -> "LH"
                        2 -> "HL"
                        3 -> "HH"
                        else -> "??"
                    }
                    val summary = calculateFiveNumberSummary(coeffValues)
                    println("UNIFORM SUBBAND STATS: Luma ${subbandTypeName}${subband.level} uniformQ=${qY.toFloat()} - $summary")
                }
                var maxYDequant = 0.0f
                var nonzeroY = 0
                for (coeff in yTile) {
                    if (coeff != 0.0f) {
                        nonzeroY++
                        if (kotlin.math.abs(coeff) > maxYDequant) {
                            maxYDequant = kotlin.math.abs(coeff)
                        }
                    }
                }
                println("[DECODER-INTRA] Frame $tavDebugCurrentFrameNumber - Before IDWT: Y max=${maxYDequant.toInt()}, nonzero=$nonzeroY")
                // Debug: Check if subband layout is correct for uniform too - print actual coefficient positions
                println("UNIFORM SUBBAND LAYOUT DEBUG:")
                println("  Total coeffs: ${yTile.size}, Decomp levels: $decompLevels, Tile size: ${tileWidth}x${tileHeight}")
                for (subband in subbands) {
                    if (subband.level <= 6) { // LH, HL, HH for levels 1-2
                        var sampleCoeffs = 0
                        val coeffCount = minOf(1000, subband.coeffCount)
                        for (i in 0 until coeffCount) { // Sample first 100 coeffs
                            val idx = subband.coeffStart + i
                            if (idx < yTile.size && yTile[idx] != 0.0f) {
                                sampleCoeffs++
                            }
                        }
                        val subbandName = when(subband.subbandType) {
                            0 -> "LL${subband.level}"
                            1 -> "LH${subband.level}"
                            2 -> "HL${subband.level}"
                            3 -> "HH${subband.level}"
                            else -> "??${subband.level}"
                        }
                        println("  $subbandName: start=${subband.coeffStart}, count=${subband.coeffCount}, sample_nonzero=$sampleCoeffs/$coeffCount")
                        // Debug: Print first few RAW QUANTIZED values for comparison with perceptual (before dequantization)
                        print("    $subbandName raw_quant: ")
                        for (i in 0 until minOf(32, subband.coeffCount)) {
                            val idx = subband.coeffStart + i
                            if (idx < quantisedY.size) {
                                print("${quantisedY[idx]} ")
                            }
                        }
                        println()
                    }
                }
            }
        }
        // Store coefficients for future delta reference (for P-frames)
@@ -3963,6 +4325,29 @@ class GraphicsJSR223Delegate(private val vm: VM) {
            tavApplyDWTInverseMultiLevel(cgTile, tileWidth, tileHeight, decompLevels, waveletFilter)
        }
        // Debug: Check coefficient values after inverse DWT
        if (tavDebugCurrentFrameNumber == tavDebugFrameTarget) {
            var maxYIdwt = 0.0f
            var minYIdwt = 0.0f
            var maxCoIdwt = 0.0f
            var minCoIdwt = 0.0f
            var maxCgIdwt = 0.0f
            var minCgIdwt = 0.0f
            for (coeff in yTile) {
                if (coeff > maxYIdwt) maxYIdwt = coeff
                if (coeff < minYIdwt) minYIdwt = coeff
            }
            for (coeff in coTile) {
                if (coeff > maxCoIdwt) maxCoIdwt = coeff
                if (coeff < minCoIdwt) minCoIdwt = coeff
            }
            for (coeff in cgTile) {
                if (coeff > maxCgIdwt) maxCgIdwt = coeff
                if (coeff < minCgIdwt) minCgIdwt = coeff
            }
            println("[DECODER-INTRA] Frame $tavDebugCurrentFrameNumber - After IDWT: Y=[${minYIdwt.toInt()}, ${maxYIdwt.toInt()}], Co=[${minCoIdwt.toInt()}, ${maxCoIdwt.toInt()}], Cg=[${minCgIdwt.toInt()}, ${maxCgIdwt.toInt()}]")
        }
        // Extract final tile data
        val finalYTile: FloatArray
        val finalCoTile: FloatArray
@@ -4123,6 +4508,16 @@ class GraphicsJSR223Delegate(private val vm: VM) {
    // Monoblock conversion functions (full frame processing)
    private fun tavConvertYCoCgMonoblockToRGB(yData: FloatArray, coData: FloatArray, cgData: FloatArray,
                                              rgbAddr: Long, width: Int, height: Int) {
        // Debug: Check if this is frame 120 for final RGB comparison
        val isFrame120Debug = tavDebugCurrentFrameNumber == tavDebugFrameTarget  // Enable for debugging
        var debugSampleCount = 0
        var debugRSum = 0
        var debugGSum = 0
        var debugBSum = 0
        var debugYSum = 0.0f
        var debugCoSum = 0.0f
        var debugCgSum = 0.0f
        // Process entire frame at once for monoblock mode
        for (y in 0 until height) {
            // Create row buffer for bulk RGB data
@@ -4143,9 +4538,24 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                val b = tmp - Co / 2.0f
                val r = Co + b
-                rowRgbBuffer[bufferIdx++] = r.toInt().coerceIn(0, 255).toByte()
+                val rInt = r.toInt().coerceIn(0, 255)
-                rowRgbBuffer[bufferIdx++] = g.toInt().coerceIn(0, 255).toByte()
+                val gInt = g.toInt().coerceIn(0, 255)
-                rowRgbBuffer[bufferIdx++] = b.toInt().coerceIn(0, 255).toByte()
+                val bInt = b.toInt().coerceIn(0, 255)
                rowRgbBuffer[bufferIdx++] = rInt.toByte()
                rowRgbBuffer[bufferIdx++] = gInt.toByte()
                rowRgbBuffer[bufferIdx++] = bInt.toByte()
                // Debug: Sample RGB values for frame 120 comparison
                if (isFrame120Debug && y in 100..199 && x in 100..199) { // Sample 100x100 region
                    debugSampleCount++
                    debugRSum += rInt
                    debugGSum += gInt
                    debugBSum += bInt
                    debugYSum += Y
                    debugCoSum += Co
                    debugCgSum += Cg
                }
            }
            // OPTIMIZATION: Bulk copy entire row at once
@@ -4153,6 +4563,17 @@ class GraphicsJSR223Delegate(private val vm: VM) {
            UnsafeHelper.memcpyRaw(rowRgbBuffer, UnsafeHelper.getArrayOffset(rowRgbBuffer),
                                 null, vm.usermem.ptr + rgbAddr + rowStartOffset, rowRgbBuffer.size.toLong())
        }
        // Debug: Print RGB sample statistics for frame 120 comparison
        if (isFrame120Debug && debugSampleCount > 0) {
            val avgR = debugRSum / debugSampleCount
            val avgG = debugGSum / debugSampleCount
            val avgB = debugBSum / debugSampleCount
            val avgY = debugYSum / debugSampleCount
            val avgCo = debugCoSum / debugSampleCount
            val avgCg = debugCgSum / debugSampleCount
            println("[RGB-FINAL] Sample region (100x100): avgYCoCg=[${avgY.toInt()},${avgCo.toInt()},${avgCg.toInt()}] → avgRGB=[$avgR,$avgG,$avgB], samples=$debugSampleCount")
        }
    }
    private fun tavConvertICtCpMonoblockToRGB(iData: FloatArray, ctData: FloatArray, cpData: FloatArray,
@@ -4316,10 +4737,104 @@ class GraphicsJSR223Delegate(private val vm: VM) {
        val currentCo = FloatArray(coeffCount)
        val currentCg = FloatArray(coeffCount)
-        for (i in 0 until coeffCount) {
+        // Check if perceptual quantization is used (versions 5 and 6)
-            currentY[i] = prevY[i] + (deltaY[i].toFloat() * qY)
+        val isPerceptual = (tavVersion == 5 || tavVersion == 6)
-            currentCo[i] = prevCo[i] + (deltaCo[i].toFloat() * qCo)
+
-            currentCg[i] = prevCg[i] + (deltaCg[i].toFloat() * qCg)
+        // Debug: Print version detection for frame 120
        if (tavDebugCurrentFrameNumber == tavDebugFrameTarget) {
            println("[VERSION-DEBUG-DELTA] Frame $tavDebugCurrentFrameNumber - TAV version: $tavVersion, isPerceptual: $isPerceptual")
        }
        if (isPerceptual) {
            // Perceptual delta reconstruction with subband-specific weights
            val tileWidth = if (isMonoblock) width else PADDED_TILE_SIZE_X
            val tileHeight = if (isMonoblock) height else PADDED_TILE_SIZE_Y
            val subbands = calculateSubbandLayout(tileWidth, tileHeight, decompLevels)
            // Apply same chroma quantizer reduction as encoder (60% reduction for perceptual mode)
            val adjustedQCo = qCo * 0.4f
            val adjustedQCg = qCg * 0.4f
            // Apply perceptual dequantization to delta coefficients
            val deltaYFloat = FloatArray(coeffCount)
            val deltaCoFloat = FloatArray(coeffCount)
            val deltaCgFloat = FloatArray(coeffCount)
            dequantiseDWTSubbandsPerceptual(deltaY, deltaYFloat, subbands, qY.toFloat(), false, decompLevels)
            dequantiseDWTSubbandsPerceptual(deltaCo, deltaCoFloat, subbands, adjustedQCo, true, decompLevels)
            dequantiseDWTSubbandsPerceptual(deltaCg, deltaCgFloat, subbands, adjustedQCg, true, decompLevels)
            // Reconstruct: current = previous + perceptually_dequantized_delta
            for (i in 0 until coeffCount) {
                currentY[i] = prevY[i] + deltaYFloat[i]
                currentCo[i] = prevCo[i] + deltaCoFloat[i]
                currentCg[i] = prevCg[i] + deltaCgFloat[i]
            }
            // Debug: Check coefficient values before inverse DWT
            if (tavDebugCurrentFrameNumber == tavDebugFrameTarget) {
                var maxYRecon = 0.0f
                var nonzeroY = 0
                for (coeff in currentY) {
                    if (coeff != 0.0f) {
                        nonzeroY++
                        if (kotlin.math.abs(coeff) > maxYRecon) {
                            maxYRecon = kotlin.math.abs(coeff)
                        }
                    }
                }
                println("[DECODER-DELTA] Frame $tavDebugCurrentFrameNumber - Before IDWT: Y max=${maxYRecon.toInt()}, nonzero=$nonzeroY")
            }
        } else {
            // Uniform delta reconstruction for versions 3 and 4
            for (i in 0 until coeffCount) {
                currentY[i] = prevY[i] + (deltaY[i].toFloat() * qY)
                currentCo[i] = prevCo[i] + (deltaCo[i].toFloat() * qCo)
                currentCg[i] = prevCg[i] + (deltaCg[i].toFloat() * qCg)
            }
            // Debug: Uniform delta quantization subband analysis for comparison
            if (tavDebugCurrentFrameNumber == tavDebugFrameTarget) {
                val tileWidth = if (isMonoblock) width else PADDED_TILE_SIZE_X
                val tileHeight = if (isMonoblock) height else PADDED_TILE_SIZE_Y
                val subbands = calculateSubbandLayout(tileWidth, tileHeight, decompLevels)
                // Comprehensive five-number summary for uniform delta quantization baseline
                for (subband in subbands) {
                    // Collect all quantized delta coefficient values for this subband (luma only for baseline)
                    val coeffValues = mutableListOf<Int>()
                    for (i in 0 until subband.coeffCount) {
                        val idx = subband.coeffStart + i
                        if (idx < deltaY.size) {
                            val quantVal = deltaY[idx].toInt()
                            coeffValues.add(quantVal)
                        }
                    }
                    // Calculate and print five-number summary for uniform delta mode
                    val subbandTypeName = when (subband.subbandType) {
                        0 -> "LL"
                        1 -> "LH"
                        2 -> "HL"
                        3 -> "HH"
                        else -> "??"
                    }
                    val summary = calculateFiveNumberSummary(coeffValues)
                    println("UNIFORM DELTA SUBBAND STATS: Luma ${subbandTypeName}${subband.level} uniformQ=${qY.toFloat()} - $summary")
                }
                var maxYRecon = 0.0f
                var nonzeroY = 0
                for (coeff in currentY) {
                    if (coeff != 0.0f) {
                        nonzeroY++
                        if (kotlin.math.abs(coeff) > maxYRecon) {
                            maxYRecon = kotlin.math.abs(coeff)
                        }
                    }
                }
                println("[DECODER-DELTA] Frame $tavDebugCurrentFrameNumber - Before IDWT: Y max=${maxYRecon.toInt()}, nonzero=$nonzeroY")
            }
        }
        // Store current coefficients as previous for next frame
@@ -4341,6 +4856,29 @@ class GraphicsJSR223Delegate(private val vm: VM) {
            tavApplyDWTInverseMultiLevel(currentCg, tileWidth, tileHeight, decompLevels, waveletFilter)
        }
        // Debug: Check coefficient values after inverse DWT
        if (tavDebugCurrentFrameNumber == tavDebugFrameTarget) {
            var maxYIdwt = 0.0f
            var minYIdwt = 0.0f
            var maxCoIdwt = 0.0f
            var minCoIdwt = 0.0f
            var maxCgIdwt = 0.0f
            var minCgIdwt = 0.0f
            for (coeff in currentY) {
                if (coeff > maxYIdwt) maxYIdwt = coeff
                if (coeff < minYIdwt) minYIdwt = coeff
            }
            for (coeff in currentCo) {
                if (coeff > maxCoIdwt) maxCoIdwt = coeff
                if (coeff < minCoIdwt) minCoIdwt = coeff
            }
            for (coeff in currentCg) {
                if (coeff > maxCgIdwt) maxCgIdwt = coeff
                if (coeff < minCgIdwt) minCgIdwt = coeff
            }
            println("[DECODER-DELTA] Frame $tavDebugCurrentFrameNumber - After IDWT: Y=[${minYIdwt.toInt()}, ${maxYIdwt.toInt()}], Co=[${minCoIdwt.toInt()}, ${maxCoIdwt.toInt()}], Cg=[${minCgIdwt.toInt()}, ${maxCgIdwt.toInt()}]")
        }
        // Extract final tile data
        val finalYTile: FloatArray
        val finalCoTile: FloatArray
@@ -4478,6 +5016,19 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                continue
            }
            // Debug: Sample coefficient values before this level's reconstruction
            if (tavDebugCurrentFrameNumber == tavDebugFrameTarget) {
                var maxCoeff = 0.0f
                var nonzeroCoeff = 0
                val sampleSize = minOf(100, currentWidth * currentHeight)
                for (i in 0 until sampleSize) {
                    val coeff = kotlin.math.abs(data[i])
                    if (coeff > maxCoeff) maxCoeff = coeff
                    if (coeff > 0.1f) nonzeroCoeff++
                }
                println("[IDWT-LEVEL-$level] BEFORE: ${currentWidth}x${currentHeight}, max=${maxCoeff.toInt()}, nonzero=$nonzeroCoeff/$sampleSize")
            }
            // Apply inverse DWT to current subband region - EXACT match to encoder
            // The encoder does ROW transform first, then COLUMN transform
            // So inverse must do COLUMN inverse first, then ROW inverse
@@ -4515,6 +5066,19 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                    data[y * width + x] = tempRow[x]
                }
            }
            // Debug: Sample coefficient values after this level's reconstruction
            if (tavDebugCurrentFrameNumber == tavDebugFrameTarget) {
                var maxCoeff = 0.0f
                var nonzeroCoeff = 0
                val sampleSize = minOf(100, currentWidth * currentHeight)
                for (i in 0 until sampleSize) {
                    val coeff = kotlin.math.abs(data[i])
                    if (coeff > maxCoeff) maxCoeff = coeff
                    if (coeff > 0.1f) nonzeroCoeff++
                }
                println("[IDWT-LEVEL-$level] AFTER:  ${currentWidth}x${currentHeight}, max=${maxCoeff.toInt()}, nonzero=$nonzeroCoeff/$sampleSize")
            }
        }
    }
--- a/video_encoder/encoder_tav.c
+++ b/video_encoder/encoder_tav.c
@@ -22,12 +22,14 @@
 // TSVM Advanced Video (TAV) format constants
 #define TAV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x41\x56"  // "\x1FTSVM TAV"
-// TAV version - dynamic based on colour space mode
+// TAV version - dynamic based on colour space and perceptual tuning
-// Version 3: YCoCg-R monoblock (default)
+// Version 5: YCoCg-R monoblock with perceptual quantization (default)
-// Version 4: ICtCp monoblock (--ictcp flag)
+// Version 6: ICtCp monoblock with perceptual quantization (--ictcp flag)
-// Legacy versions (4-tile mode, code preserved but not accessible):
+// Legacy versions (uniform quantization):
-// Version 1: YCoCg-R 4-tile
+// Version 3: YCoCg-R monoblock uniform (--no-perceptual-tuning)
-// Version 2: ICtCp 4-tile
+// Version 4: ICtCp monoblock uniform (--ictcp --no-perceptual-tuning)
 // Version 1: YCoCg-R 4-tile (legacy, code preserved but not accessible)
 // Version 2: ICtCp 4-tile (legacy, code preserved but not accessible)
 // Tile encoding modes (280x224 tiles)
 #define TAV_MODE_SKIP      0x00  // Skip tile (copy from reference)
@@ -142,6 +144,9 @@ static int validate_mp2_bitrate(int bitrate) {
 static const int QUALITY_Y[] = {60, 42, 25, 12, 6, 2};
 static const int QUALITY_CO[] = {120, 90, 60, 30, 15, 3};
 static const int QUALITY_CG[] = {240, 180, 120, 60, 30, 5};
 //static const int QUALITY_Y[] =  { 25, 12,  6,   3,  2, 1};
 //static const int QUALITY_CO[] =  {60, 30, 15,  7,  5, 2};
 //static const int QUALITY_CG[] = {120, 60, 30, 15, 10, 4};
 // DWT coefficient structure for each subband
 typedef struct {
@@ -157,6 +162,15 @@ typedef struct {
    int tile_x, tile_y;
 } dwt_tile_t;
 // DWT subband information for perceptual quantization.
 // One record describes a single subband (LL, LH, HL or HH at one decomposition
 // level): where its coefficients sit in a linear coefficient array and which
 // quantiser multiplier applies to them.
 typedef struct {
    int level;              // Decomposition level (1 to enc->decomp_levels)
    int subband_type;       // Subband kind: 0=LL, 1=LH, 2=HL, 3=HH
    int coeff_start;        // Index of the subband's first coefficient in the linear coefficient array
    int coeff_count;        // Number of coefficients belonging to this subband
    float perceptual_weight; // Quantization multiplier for this subband (applied to the base quantiser)
 } dwt_subband_info_t;
 // TAV encoder structure
 typedef struct {
    // Input/output files
@@ -196,6 +210,7 @@ typedef struct {
    int ictcp_mode;       // 0 = YCoCg-R (default), 1 = ICtCp colour space
    int intra_only;       // Force all tiles to use INTRA mode (disable delta encoding)
    int monoblock;        // Single DWT tile mode (encode entire frame as one tile)
    int perceptual_tuning; // 1 = perceptual quantization (default), 0 = uniform quantization
    // Frame buffers - ping-pong implementation
    uint8_t *frame_rgb[2];      // [0] and [1] alternate between current and previous
@@ -247,6 +262,7 @@ typedef struct {
    // Progress tracking
    struct timeval start_time;
    int encode_limit;  // Maximum number of frames to encode (0 = no limit)
 } tav_encoder_t;
@@ -331,6 +347,8 @@ static void show_usage(const char *program_name) {
    printf("  --lossless              Lossless mode: use 5/3 reversible wavelet\n");
    printf("  --delta                 Enable delta encoding (improved compression but noisy picture)\n");
    printf("  --ictcp                 Use ICtCp colour space instead of YCoCg-R (use when source is in BT.2100)\n");
    printf("  --no-perceptual-tuning  Disable perceptual quantization (uniform quantization like versions 3/4)\n");
    printf("  --encode-limit N        Encode only first N frames (useful for testing/analysis)\n");
    printf("  --help                  Show this help\n\n");
    printf("Audio Rate by Quality:\n  ");
@@ -358,8 +376,10 @@ static void show_usage(const char *program_name) {
    printf("\n\n");
    printf("Features:\n");
    printf("  - Single DWT tile (monoblock) encoding for optimal quality\n");
    printf("  - Perceptual quantization optimized for human visual system (default)\n");
    printf("  - Full resolution YCoCg-R/ICtCp colour space\n");
    printf("  - Lossless and lossy compression modes\n");
    printf("  - Versions 5/6: Perceptual quantization, Versions 3/4: Uniform quantization\n");
    printf("\nExamples:\n");
    printf("  %s -i input.mp4 -o output.mv3               # Default settings\n", program_name);
@@ -386,7 +406,9 @@ static tav_encoder_t* create_encoder(void) {
    enc->quantiser_cg = QUALITY_CG[DEFAULT_QUALITY];
    enc->intra_only = 1;
    enc->monoblock = 1;  // Default to monoblock mode
    enc->perceptual_tuning = 1;  // Default to perceptual quantization (versions 5/6)
    enc->audio_bitrate = 0;  // 0 = use quality table
    enc->encode_limit = 0;  // Default: no frame limit
    return enc;
 }
@@ -775,6 +797,143 @@ static void quantise_dwt_coefficients(float *coeffs, int16_t *quantised, int siz
    }
 }
 // Perceptual quantiser multiplier for a single DWT subband.
 //   level        - decomposition level of the subband
 //   subband_type - 0=LL, 1=LH, 2=HL; any other value is handled as HH
 //   is_chroma    - non-zero for chroma planes, zero for luma
 //   max_levels   - only forwarded to the recursive chroma call
 // NOTE: weighting is currently switched off. The early return below makes every
 // call yield 1.0f; the data-driven model after it is unreachable until that
 // return is removed.
 static float get_perceptual_weight(int level, int subband_type, int is_chroma, int max_levels) {
    // TEMPORARY: Test with uniform weights to verify linear layout works correctly
    return 1.0f;

    // ---- Disabled model (kept for when the override above is lifted) ----

    if (is_chroma) {
        // Chroma strategy: 0.85x of the matching luma weight for colour preservation
        return get_perceptual_weight(level, subband_type, 0, max_levels) * 0.85f;
    }

    if (subband_type == 0) {
        // LL: holds most of the image energy; moderate quantisation
        return 1.1f;
    }

    // Per-level luma weights from coefficient variance analysis.
    // Row 0 is used for level <= 1, row 5 for level >= 6.
    static const float lh_weights[6] = {1.9f, 1.6f, 1.4f, 1.0f, 0.8f, 0.7f}; // horizontal detail
    static const float hl_weights[6] = {1.8f, 1.5f, 1.3f, 1.2f, 0.9f, 0.8f}; // vertical detail
    static const float hh_weights[6] = {2.1f, 1.8f, 1.5f, 1.3f, 1.1f, 1.0f}; // diagonal detail

    int row;
    if (level >= 6) {
        row = 5;
    } else if (level <= 1) {
        row = 0;
    } else {
        row = level - 1;
    }

    switch (subband_type) {
        case 1:  return lh_weights[row];
        case 2:  return hl_weights[row];
        default: return hh_weights[row];
    }
 }
 // Perceptual weight for the coefficient stored at linear_idx.
 // Placeholder: no position-to-subband mapping is implemented yet, so every
 // coefficient gets the uniform weight 1.0f regardless of the arguments.
 static float get_perceptual_weight_for_position(int linear_idx, int width, int height, int decomp_levels, int is_chroma) {
    // Arguments are accepted for the future mapping but are not consulted yet
    (void)linear_idx;
    (void)width;
    (void)height;
    (void)decomp_levels;
    (void)is_chroma;

    // TODO: Map linear_idx to correct DWT subband and return appropriate weight
    const float uniform_weight = 1.0f;
    return uniform_weight;
 }
 // Quantise DWT coefficients with a per-coefficient perceptual weight.
 //   coeffs         - input coefficients (size entries)
 //   quantised      - output buffer (size entries)
 //   size           - number of coefficients to process
 //   base_quantizer - base step; passed through FCLAMP(..., 1.0f, 255.0f) before use
 //   width, height, decomp_levels, is_chroma
 //                  - forwarded to get_perceptual_weight_for_position() to pick the weight
 //   frame_count    - when 1 or 120, a non-zero-coefficient summary is printed
 // Each coefficient is divided by (base step * weight), rounded half away from
 // zero by adding +/-0.5 before truncation, then clamped to the int16_t range.
 static void quantise_dwt_coefficients_perceptual_per_coeff(float *coeffs, int16_t *quantised, int size,
                                                           int base_quantizer, int width, int height,
                                                           int decomp_levels, int is_chroma, int frame_count) {
    float effective_base_q = base_quantizer;
    effective_base_q = FCLAMP(effective_base_q, 1.0f, 255.0f);

    // Debug coefficient analysis is only reported for these frames
    const int report_stats = (frame_count == 1 || frame_count == 120);
    int nonzero = 0;

    // Single quantisation loop shared by the debug and normal paths
    for (int i = 0; i < size; i++) {
        // Apply perceptual weight based on coefficient's position in DWT layout
        float weight = get_perceptual_weight_for_position(i, width, height, decomp_levels, is_chroma);
        float effective_q = effective_base_q * weight;
        float quantised_val = coeffs[i] / effective_q;
        quantised[i] = (int16_t)CLAMP((int)(quantised_val + (quantised_val >= 0 ? 0.5f : -0.5f)), -32768, 32767);
        if (quantised[i] != 0) nonzero++;
    }

    if (report_stats) {
        // Print the frame actually being analysed (previously hard-coded as "Frame 120")
        printf("DEBUG: Frame %d - %s channel: %d/%d nonzero coeffs after perceptual per-coeff quantization\n",
               frame_count, is_chroma ? "Chroma" : "Luma", nonzero, size);
    }
 }
 // Append one rectangle of the 2D coefficient plane to the linear output.
 // Copies rows y0..y0+rect_h-1, columns x0..x0+rect_w-1 in row-major order,
 // skipping positions outside the width x height plane, and returns the
 // updated write offset.
 static int append_subband_rect(const int16_t *spatial_2d, int16_t *linear_subbands, int linear_offset,
                                int x0, int y0, int rect_w, int rect_h, int width, int height) {
    const int x_end = x0 + rect_w;
    const int y_end = y0 + rect_h;
    for (int row = y0; row < y_end; row++) {
        for (int col = x0; col < x_end; col++) {
            if (row >= height || col >= width) continue;
            linear_subbands[linear_offset++] = spatial_2d[row * width + col];
        }
    }
    return linear_offset;
 }

 // Convert 2D spatial DWT layout to linear subband layout (for decoder compatibility).
 // Output order: the LL rectangle (width >> decomp_levels by height >> decomp_levels)
 // first, then for level = decomp_levels down to 1 the LH (top-right), HL
 // (bottom-left) and HH (bottom-right) quadrants, each row-major.
 // NOTE(review): the quadrant size is width >> (decomp_levels - level + 1), so the
 // first detail subbands written (level == decomp_levels) are the half-frame sized
 // ones - confirm this level numbering matches the decoder's subband layout.
 static void convert_2d_to_linear_layout(const int16_t *spatial_2d, int16_t *linear_subbands,
                                       int width, int height, int decomp_levels) {
    // LL subband: top-left corner of the plane
    int write_pos = append_subband_rect(spatial_2d, linear_subbands, 0,
                                        0, 0, width >> decomp_levels, height >> decomp_levels,
                                        width, height);

    // Detail subbands, one LH/HL/HH triple per level
    int level = decomp_levels;
    while (level >= 1) {
        const int shift = decomp_levels - level + 1;
        const int quad_w = width >> shift;
        const int quad_h = height >> shift;

        // LH subband (top-right quadrant)
        write_pos = append_subband_rect(spatial_2d, linear_subbands, write_pos,
                                        quad_w, 0, quad_w, quad_h, width, height);
        // HL subband (bottom-left quadrant)
        write_pos = append_subband_rect(spatial_2d, linear_subbands, write_pos,
                                        0, quad_h, quad_w, quad_h, width, height);
        // HH subband (bottom-right quadrant)
        write_pos = append_subband_rect(spatial_2d, linear_subbands, write_pos,
                                        quad_w, quad_h, quad_w, quad_h, width, height);
        level--;
    }
 }
 // Serialise tile data for compression
 static size_t serialise_tile_data(tav_encoder_t *enc, int tile_x, int tile_y, 
                                  const float *tile_y_data, const float *tile_co_data, const float *tile_cg_data,
@@ -820,9 +979,17 @@ static size_t serialise_tile_data(tav_encoder_t *enc, int tile_x, int tile_y,
    if (mode == TAV_MODE_INTRA) {
        // INTRA mode: quantise coefficients directly and store for future reference
-        quantise_dwt_coefficients((float*)tile_y_data, quantised_y, tile_size, this_frame_qY);
+        if (enc->perceptual_tuning) {
-        quantise_dwt_coefficients((float*)tile_co_data, quantised_co, tile_size, this_frame_qCo);
+            // Perceptual quantization: EXACTLY like uniform but with per-coefficient weights
-        quantise_dwt_coefficients((float*)tile_cg_data, quantised_cg, tile_size, this_frame_qCg);
+            quantise_dwt_coefficients_perceptual_per_coeff((float*)tile_y_data, quantised_y, tile_size, this_frame_qY, enc->width, enc->height, enc->decomp_levels, 0, enc->frame_count);
            quantise_dwt_coefficients_perceptual_per_coeff((float*)tile_co_data, quantised_co, tile_size, this_frame_qCo, enc->width, enc->height, enc->decomp_levels, 1, enc->frame_count);
            quantise_dwt_coefficients_perceptual_per_coeff((float*)tile_cg_data, quantised_cg, tile_size, this_frame_qCg, enc->width, enc->height, enc->decomp_levels, 1, enc->frame_count);
        } else {
            // Legacy uniform quantization
            quantise_dwt_coefficients((float*)tile_y_data, quantised_y, tile_size, this_frame_qY);
            quantise_dwt_coefficients((float*)tile_co_data, quantised_co, tile_size, this_frame_qCo);
            quantise_dwt_coefficients((float*)tile_cg_data, quantised_cg, tile_size, this_frame_qCg);
        }
        // Store current coefficients for future delta reference
        int tile_idx = tile_y * enc->tiles_x + tile_x;
@@ -851,20 +1018,121 @@ static size_t serialise_tile_data(tav_encoder_t *enc, int tile_x, int tile_y,
            delta_cg[i] = tile_cg_data[i] - prev_cg[i];
        }
-        // Quantise the deltas
+        // Quantise the deltas with per-coefficient perceptual quantization
-        quantise_dwt_coefficients(delta_y, quantised_y, tile_size, this_frame_qY);
+        if (enc->perceptual_tuning) {
-        quantise_dwt_coefficients(delta_co, quantised_co, tile_size, this_frame_qCo);
+            quantise_dwt_coefficients_perceptual_per_coeff(delta_y, quantised_y, tile_size, this_frame_qY, enc->width, enc->height, enc->decomp_levels, 0, 0);
-        quantise_dwt_coefficients(delta_cg, quantised_cg, tile_size, this_frame_qCg);
+            quantise_dwt_coefficients_perceptual_per_coeff(delta_co, quantised_co, tile_size, this_frame_qCo, enc->width, enc->height, enc->decomp_levels, 1, 0);
            quantise_dwt_coefficients_perceptual_per_coeff(delta_cg, quantised_cg, tile_size, this_frame_qCg, enc->width, enc->height, enc->decomp_levels, 1, 0);
        } else {
            // Legacy uniform delta quantization
            quantise_dwt_coefficients(delta_y, quantised_y, tile_size, this_frame_qY);
            quantise_dwt_coefficients(delta_co, quantised_co, tile_size, this_frame_qCo);
            quantise_dwt_coefficients(delta_cg, quantised_cg, tile_size, this_frame_qCg);
        }
        // Reconstruct coefficients like decoder will (previous + dequantised_delta)
-        for (int i = 0; i < tile_size; i++) {
+        if (enc->perceptual_tuning) {
-            float dequant_delta_y = (float)quantised_y[i] * this_frame_qY;
+            // Apply 2D perceptual dequantization using same logic as quantization
            float dequant_delta_co = (float)quantised_co[i] * this_frame_qCo;
            float dequant_delta_cg = (float)quantised_cg[i] * this_frame_qCg;
-            prev_y[i] = prev_y[i] + dequant_delta_y;
+            // First, apply uniform dequantization baseline
-            prev_co[i] = prev_co[i] + dequant_delta_co;
+            for (int i = 0; i < tile_size; i++) {
-            prev_cg[i] = prev_cg[i] + dequant_delta_cg;
+                prev_y[i] = prev_y[i] + ((float)quantised_y[i] * (float)this_frame_qY);
                prev_co[i] = prev_co[i] + ((float)quantised_co[i] * (float)this_frame_qCo);
                prev_cg[i] = prev_cg[i] + ((float)quantised_cg[i] * (float)this_frame_qCg);
            }
            // Then apply perceptual correction by re-dequantizing specific subbands
            for (int level = 1; level <= enc->decomp_levels; level++) {
                int level_width = enc->width >> (enc->decomp_levels - level + 1);
                int level_height = enc->height >> (enc->decomp_levels - level + 1);
                // Skip if subband is too small
                if (level_width < 1 || level_height < 1) continue;
                // Get perceptual weights for this level
                float lh_weight_y = get_perceptual_weight(level, 1, 0, enc->decomp_levels);
                float hl_weight_y = get_perceptual_weight(level, 2, 0, enc->decomp_levels);
                float hh_weight_y = get_perceptual_weight(level, 3, 0, enc->decomp_levels);
                float lh_weight_co = get_perceptual_weight(level, 1, 1, enc->decomp_levels);
                float hl_weight_co = get_perceptual_weight(level, 2, 1, enc->decomp_levels);
                float hh_weight_co = get_perceptual_weight(level, 3, 1, enc->decomp_levels);
                // Correct LH subband (top-right quadrant)
                for (int y = 0; y < level_height; y++) {
                    for (int x = level_width; x < level_width * 2; x++) {
                        if (y < enc->height && x < enc->width) {
                            int idx = y * enc->width + x;
                            // Remove uniform dequantization and apply perceptual
                            prev_y[idx] -= ((float)quantised_y[idx] * (float)this_frame_qY);
                            prev_y[idx] += ((float)quantised_y[idx] * ((float)this_frame_qY * lh_weight_y));
                            prev_co[idx] -= ((float)quantised_co[idx] * (float)this_frame_qCo);
                            prev_co[idx] += ((float)quantised_co[idx] * ((float)this_frame_qCo * lh_weight_co));
                            prev_cg[idx] -= ((float)quantised_cg[idx] * (float)this_frame_qCg);
                            prev_cg[idx] += ((float)quantised_cg[idx] * ((float)this_frame_qCg * lh_weight_co));
                        }
                    }
                }
                // Correct HL subband (bottom-left quadrant)
                for (int y = level_height; y < level_height * 2; y++) {
                    for (int x = 0; x < level_width; x++) {
                        if (y < enc->height && x < enc->width) {
                            int idx = y * enc->width + x;
                            prev_y[idx] -= ((float)quantised_y[idx] * (float)this_frame_qY);
                            prev_y[idx] += ((float)quantised_y[idx] * ((float)this_frame_qY * hl_weight_y));
                            prev_co[idx] -= ((float)quantised_co[idx] * (float)this_frame_qCo);
                            prev_co[idx] += ((float)quantised_co[idx] * ((float)this_frame_qCo * hl_weight_co));
                            prev_cg[idx] -= ((float)quantised_cg[idx] * (float)this_frame_qCg);
                            prev_cg[idx] += ((float)quantised_cg[idx] * ((float)this_frame_qCg * hl_weight_co));
                        }
                    }
                }
                // Correct HH subband (bottom-right quadrant)
                for (int y = level_height; y < level_height * 2; y++) {
                    for (int x = level_width; x < level_width * 2; x++) {
                        if (y < enc->height && x < enc->width) {
                            int idx = y * enc->width + x;
                            prev_y[idx] -= ((float)quantised_y[idx] * (float)this_frame_qY);
                            prev_y[idx] += ((float)quantised_y[idx] * ((float)this_frame_qY * hh_weight_y));
                            prev_co[idx] -= ((float)quantised_co[idx] * (float)this_frame_qCo);
                            prev_co[idx] += ((float)quantised_co[idx] * ((float)this_frame_qCo * hh_weight_co));
                            prev_cg[idx] -= ((float)quantised_cg[idx] * (float)this_frame_qCg);
                            prev_cg[idx] += ((float)quantised_cg[idx] * ((float)this_frame_qCg * hh_weight_co));
                        }
                    }
                }
            }
            // Finally, correct LL subband (top-left corner at finest level)
            int ll_width = enc->width >> enc->decomp_levels;
            int ll_height = enc->height >> enc->decomp_levels;
            float ll_weight_y = get_perceptual_weight(enc->decomp_levels, 0, 0, enc->decomp_levels);
            float ll_weight_co = get_perceptual_weight(enc->decomp_levels, 0, 1, enc->decomp_levels);
            for (int y = 0; y < ll_height; y++) {
                for (int x = 0; x < ll_width; x++) {
                    if (y < enc->height && x < enc->width) {
                        int idx = y * enc->width + x;
                        prev_y[idx] -= ((float)quantised_y[idx] * (float)this_frame_qY);
                        prev_y[idx] += ((float)quantised_y[idx] * ((float)this_frame_qY * ll_weight_y));
                        prev_co[idx] -= ((float)quantised_co[idx] * (float)this_frame_qCo);
                        prev_co[idx] += ((float)quantised_co[idx] * ((float)this_frame_qCo * ll_weight_co));
                        prev_cg[idx] -= ((float)quantised_cg[idx] * (float)this_frame_qCg);
                        prev_cg[idx] += ((float)quantised_cg[idx] * ((float)this_frame_qCg * ll_weight_co));
                    }
                }
            }
        } else {
            // Legacy uniform dequantization
            for (int i = 0; i < tile_size; i++) {
                float dequant_delta_y = (float)quantised_y[i] * this_frame_qY;
                float dequant_delta_co = (float)quantised_co[i] * this_frame_qCo;
                float dequant_delta_cg = (float)quantised_cg[i] * this_frame_qCg;
                prev_y[i] = prev_y[i] + dequant_delta_y;
                prev_co[i] = prev_co[i] + dequant_delta_co;
                prev_cg[i] = prev_cg[i] + dequant_delta_cg;
            }
        }
        free(delta_y);
@@ -881,7 +1149,7 @@ static size_t serialise_tile_data(tav_encoder_t *enc, int tile_x, int tile_y,
        printf("\n");
    }*/
-    // Write quantised coefficients
+    // Write quantised coefficients (both uniform and perceptual use same linear layout)
    memcpy(buffer + offset, quantised_y, tile_size * sizeof(int16_t)); offset += tile_size * sizeof(int16_t);
    memcpy(buffer + offset, quantised_co, tile_size * sizeof(int16_t)); offset += tile_size * sizeof(int16_t);
    memcpy(buffer + offset, quantised_cg, tile_size * sizeof(int16_t)); offset += tile_size * sizeof(int16_t);
@@ -950,6 +1218,19 @@ static size_t compress_and_write_frame(tav_encoder_t *enc, uint8_t packet_type)
                printf("\n");
            }*/
            // Debug: Check Y data before DWT transform
            if (enc->frame_count == 120 && enc->verbose) {
                float max_y_before = 0.0f;
                int nonzero_before = 0;
                int total_pixels = enc->monoblock ? (enc->width * enc->height) : (PADDED_TILE_SIZE_X * PADDED_TILE_SIZE_Y);
                for (int i = 0; i < total_pixels; i++) {
                    float abs_val = fabsf(tile_y_data[i]);
                    if (abs_val > max_y_before) max_y_before = abs_val;
                    if (abs_val > 0.1f) nonzero_before++;
                }
                printf("DEBUG: Y data before DWT: max=%.2f, nonzero=%d/%d\n", max_y_before, nonzero_before, total_pixels);
            }
            // Apply DWT transform to each channel
            if (enc->monoblock) {
                // Monoblock mode: transform entire frame
@@ -963,6 +1244,16 @@ static size_t compress_and_write_frame(tav_encoder_t *enc, uint8_t packet_type)
                dwt_2d_forward_padded(tile_cg_data, enc->decomp_levels, enc->wavelet_filter);
            }
            // Debug: Check Y data after DWT transform for high-frequency content
            if (enc->frame_count == 120 && enc->verbose) {
                // Number of coefficients held in tile_y_data, computed the same way
                // as the pre-DWT debug check earlier in this function.
                const int total_coeffs = enc->monoblock ? (enc->width * enc->height) : (PADDED_TILE_SIZE_X * PADDED_TILE_SIZE_Y);
                // Hard-coded probe positions (HH1 start + some samples); these only
                // correspond to HH1 for one particular frame geometry.
                static const int sample_indices[] = {47034, 47035, 47036, 47037, 47038};
                const int sample_count = (int)(sizeof(sample_indices) / sizeof(sample_indices[0]));
                printf("DEBUG: Y data after DWT (some high-freq samples): ");
                for (int i = 0; i < sample_count; i++) {
                    // Skip probes that fall beyond the buffer so that small frames
                    // or tiles do not cause an out-of-bounds read.
                    if (sample_indices[i] < total_coeffs) {
                        printf("%.3f ", tile_y_data[sample_indices[i]]);
                    }
                }
                printf("\n");
            }
            // Serialise tile
            size_t tile_size = serialise_tile_data(enc, tile_x, tile_y,
                                                   tile_y_data, tile_co_data, tile_cg_data,
@@ -1245,12 +1536,16 @@ static int write_tav_header(tav_encoder_t *enc) {
    // Magic number
    fwrite(TAV_MAGIC, 1, 8, enc->output_fp);
-    // Version (dynamic based on colour space and monoblock mode)
+    // Version (dynamic based on colour space, monoblock mode, and perceptual tuning)
    uint8_t version;
    if (enc->monoblock) {
-        version = enc->ictcp_mode ? 4 : 3;  // Version 4 for ICtCp monoblock, 3 for YCoCg-R monoblock
+        if (enc->perceptual_tuning) {
            version = enc->ictcp_mode ? 6 : 5;  // Version 6 for ICtCp perceptual, 5 for YCoCg-R perceptual
        } else {
            version = enc->ictcp_mode ? 4 : 3;  // Version 4 for ICtCp uniform, 3 for YCoCg-R uniform
        }
    } else {
-        version = enc->ictcp_mode ? 2 : 1;  // Version 2 for ICtCp, 1 for YCoCg-R
+        version = enc->ictcp_mode ? 2 : 1;  // Legacy 4-tile versions
    }
    fputc(version, enc->output_fp);
@@ -2231,6 +2526,8 @@ int main(int argc, char *argv[]) {
        {"lossless", no_argument, 0, 1000},
        {"delta", no_argument, 0, 1006},
        {"ictcp", no_argument, 0, 1005},
        {"no-perceptual-tuning", no_argument, 0, 1007},
        {"encode-limit", required_argument, 0, 1008},
        {"help", no_argument, 0, '?'},
        {0, 0, 0, 0}
    };
@@ -2301,6 +2598,17 @@ int main(int argc, char *argv[]) {
            case 1006: // --delta (option table maps "delta" to 1006); clears intra-only mode
                enc->intra_only = 0;
                break;
            case 1007: // --no-perceptual-tuning
                enc->perceptual_tuning = 0;
                break;
            case 1008: // --encode-limit
                // atoi() yields 0 for non-numeric input, which passes the check below;
                // the encode loop only enforces the limit when it is > 0, so 0 means "no limit".
                enc->encode_limit = atoi(optarg);
                // Only negative values are rejected; release the encoder before exiting.
                if (enc->encode_limit < 0) {
                    fprintf(stderr, "Error: Invalid encode limit: %d\n", enc->encode_limit);
                    cleanup_encoder(enc);
                    return 1;
                }
                break;
            case 1400: // --arate
                {
                    int bitrate = atoi(optarg);
@@ -2353,10 +2661,19 @@ int main(int argc, char *argv[]) {
    printf("Wavelet: %s\n", enc->wavelet_filter ? "9/7 irreversible" : "5/3 reversible");
    printf("Decomposition levels: %d\n", enc->decomp_levels);
    printf("Colour space: %s\n", enc->ictcp_mode ? "ICtCp" : "YCoCg-R");
    printf("Quantization: %s\n", enc->perceptual_tuning ? "Perceptual (HVS-optimized)" : "Uniform (legacy)");
    if (enc->ictcp_mode) {
-        printf("Quantiser: I=%d, Ct=%d, Cp=%d\n", enc->quantiser_y, enc->quantiser_co, enc->quantiser_cg);
+        printf("Base quantiser: I=%d, Ct=%d, Cp=%d\n", enc->quantiser_y, enc->quantiser_co, enc->quantiser_cg);
    } else {
-        printf("Quantiser: Y=%d, Co=%d, Cg=%d\n", enc->quantiser_y, enc->quantiser_co, enc->quantiser_cg);
+        printf("Base quantiser: Y=%d, Co=%d, Cg=%d\n", enc->quantiser_y, enc->quantiser_co, enc->quantiser_cg);
    }
    if (enc->perceptual_tuning) {
        printf("Perceptual weights: LL=%.1fx, LH/HL=%.1f-%.1fx, HH=%.1f-%.1fx (varies by level)\n",
               get_perceptual_weight(enc->decomp_levels, 0, 0, enc->decomp_levels),
               get_perceptual_weight(enc->decomp_levels, 1, 0, enc->decomp_levels),
               get_perceptual_weight(1, 1, 0, enc->decomp_levels),
               get_perceptual_weight(enc->decomp_levels, 3, 0, enc->decomp_levels),
               get_perceptual_weight(1, 3, 0, enc->decomp_levels));
    }
    // Open output file
@@ -2436,6 +2753,13 @@ int main(int argc, char *argv[]) {
    int count_pframe = 0;
    while (continue_encoding) {
        // Check encode limit if specified
        if (enc->encode_limit > 0 && frame_count >= enc->encode_limit) {
            printf("Reached encode limit of %d frames, finalizing...\n", enc->encode_limit);
            continue_encoding = 0;
            break;
        }
        if (enc->test_mode) {
            // Test mode has a fixed frame count
            if (frame_count >= enc->total_frames) {