p-frame for tav

2026-03-07 19:51:51 +09:00 · 2025-09-16 18:57:11 +09:00
parent be193269d8
commit 47f93194a7
4 changed files with 464 additions and 267 deletions
--- a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
+++ b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
@@ -17,16 +17,21 @@ import kotlin.math.*
 class GraphicsJSR223Delegate(private val vm: VM) {
    
    // TAV Simulated overlapping tiles constants (must match encoder)
-    private val TAV_TILE_SIZE_X = 280
-    private val TAV_TILE_SIZE_Y = 224
+    private val TILE_SIZE_X = 280
+    private val TILE_SIZE_Y = 224
    private val TAV_TILE_MARGIN = 32  // 32-pixel margin for 3 DWT levels (4 * 2^3 = 32px)
-    private val TAV_PADDED_TILE_SIZE_X = TAV_TILE_SIZE_X + 2 * TAV_TILE_MARGIN  // 280 + 64 = 344px
-    private val TAV_PADDED_TILE_SIZE_Y = TAV_TILE_SIZE_Y + 2 * TAV_TILE_MARGIN  // 224 + 64 = 288px
+    private val PADDED_TILE_SIZE_X = TILE_SIZE_X + 2 * TAV_TILE_MARGIN  // 280 + 64 = 344px
+    private val PADDED_TILE_SIZE_Y = TILE_SIZE_Y + 2 * TAV_TILE_MARGIN  // 224 + 64 = 288px

    // Reusable working arrays to reduce allocation overhead
    private val tevIdct8TempBuffer = FloatArray(64)
    private val tevIdct16TempBuffer = FloatArray(256) // For 16x16 IDCT
    private val tevIdct16SeparableBuffer = FloatArray(256) // For separable 16x16 IDCT
+    
+    // TAV coefficient delta storage for previous frame (for efficient P-frames)
+    private var tavPreviousCoeffsY: MutableMap<Int, FloatArray>? = null
+    private var tavPreviousCoeffsCo: MutableMap<Int, FloatArray>? = null 
+    private var tavPreviousCoeffsCg: MutableMap<Int, FloatArray>? = null

    private fun getFirstGPU(): GraphicsAdapter? {
        return vm.findPeribyType(VM.PERITYPE_GPU_AND_TERM)?.peripheral as? GraphicsAdapter
@@ -1285,7 +1290,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
        return (if ((q < 50)) 5000f / q else 200f - 2 * q) / 100f
    }

-    // Quality settings for quantization (Y channel) - 16x16 tables
+    // Quality settings for quantisation (Y channel) - 16x16 tables
    val QUANT_TABLE_Y: IntArray = intArrayOf(
        16, 14, 12, 11, 11, 13, 16, 20, 24, 30, 39, 48, 54, 61, 67, 73,
        14, 13, 12, 12, 12, 15, 18, 21, 25, 33, 46, 57, 61, 65, 67, 70,
@@ -1304,7 +1309,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
        73, 82, 92, 98, 103, 107, 110, 117, 126, 132, 134, 136, 138, 138, 133, 127,
        86, 98, 109, 112, 114, 116, 118, 124, 133, 135, 129, 125, 128, 130, 128, 127)

-    // Quality settings for quantization (Co channel - orange-blue, 8x8)
+    // Quality settings for quantisation (Co channel - orange-blue, 8x8)
    val QUANT_TABLE_C: IntArray =  intArrayOf(
        17, 18, 24, 47, 99, 99, 99, 99,
        18, 21, 26, 66, 99, 99, 99, 99,
@@ -1527,7 +1532,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
    }

    /**
-     * Apply Bayer dithering to reduce banding when quantizing to 4-bit
+     * Apply Bayer dithering to reduce banding when quantising to 4-bit
     */
    private fun ditherValue(value: Int, x: Int, y: Int, f: Int): Int {
        // Preserve pure values (0 and 255) exactly to maintain colour primaries
@@ -1707,7 +1712,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
    private fun tevIdct16x16_fast(coeffs: ShortArray, quantTable: IntArray, qualityIndex: Int, rateControlFactor: Float): IntArray {
        val result = IntArray(256) // 16x16 = 256
        
-        // Process coefficients and dequantize using preallocated buffer
+        // Process coefficients and dequantise using preallocated buffer
        for (u in 0 until 16) {
            for (v in 0 until 16) {
                val idx = u * 16 + v
@@ -2499,7 +2504,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
     * @param prevRGBAddr Address of previous frame RGB buffer (for motion compensation)
     * @param width Frame width in pixels
     * @param height Frame height in pixels
-     * @param quality Quantization quality level (0-7)
+     * @param quality Quantisation quality level (0-7)
     * @param frameCounter Frame counter for temporal patterns
     */
    fun tevDecode(blockDataPtr: Long, currentRGBAddr: Long, prevRGBAddr: Long,
@@ -2617,7 +2622,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                                //     tevApplyMotionCompensationTwoPass(yBlock, coBlock, cgBlock, startX, startY, mv[0], mv[1], prevRGBAddr, width, height, prevAddrIncVec)
                                // }

-                                // Use IDCT on knusperli-optimised coefficients (coefficients are already optimally dequantized)
+                                // Use IDCT on knusperli-optimised coefficients (coefficients are already optimally dequantised)
                                val yPixels = tevIdct16x16_fromOptimisedCoeffs(yBlock)
                                val coPixels = tevIdct8x8_fromOptimisedCoeffs(coBlock)
                                val cgPixels = tevIdct8x8_fromOptimisedCoeffs(cgBlock)
@@ -2798,7 +2803,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                        }

                        0x01 -> { // TEV_MODE_INTRA - Full YCoCg-R DCT decode (no motion compensation)
-                            // Regular lossy mode: quantized int16 coefficients
+                            // Regular lossy mode: quantised int16 coefficients
                            // Optimised bulk reading of all DCT coefficients: Y(256×2) + Co(64×2) + Cg(64×2) = 768 bytes
                            val coeffShortArray = ShortArray(384) // Total coefficients: 256 + 64 + 64 = 384 shorts
                            vm.bulkPeekShort(readPtr.toInt(), coeffShortArray, 768)
@@ -3141,7 +3146,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
        val kAlphaSqrt2 = intArrayOf(1024, 1448, 1448, 1448, 1448, 1448, 1448, 1448)
        val kHalfSqrt2 = 724 // sqrt(2)/2 in 10-bit fixed-point

-        // Convert to dequantized FloatArrays and apply knusperli optimisation
+        // Convert to dequantised FloatArrays and apply knusperli optimisation
        val optimisedYBlocks = tevConvertAndOptimise16x16Blocks(yBlocks, quantTableY, qY, rateControlFactors, blocksX, blocksY, kLinearGradient, kAlphaSqrt2, kHalfSqrt2)
        val optimisedCoBlocks = tevConvertAndOptimise8x8Blocks(coBlocks, quantTableCo, qCo, rateControlFactors, blocksX, blocksY, kLinearGradient, kAlphaSqrt2, kHalfSqrt2)
        val optimisedCgBlocks = tevConvertAndOptimise8x8Blocks(cgBlocks, quantTableCg, qCg, rateControlFactors, blocksX, blocksY, kLinearGradient, kAlphaSqrt2, kHalfSqrt2)
@@ -3149,7 +3154,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
        return Triple(optimisedYBlocks, optimisedCoBlocks, optimisedCgBlocks)
    }

-    // IDCT functions for knusperli-optimised coefficients (coefficients are already dequantized)
+    // IDCT functions for knusperli-optimised coefficients (coefficients are already dequantised)
    private fun tevIdct16x16_fromOptimisedCoeffs(coeffs: FloatArray): IntArray {
        val result = IntArray(256) // 16x16

@@ -3214,7 +3219,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
        tevProcessBlocksWithKnusperli16x16(blocks, quantTable, qScale, rateControlFactors,
                                       blocksX, blocksY, kLinearGradient16, kAlphaSqrt2_16, kHalfSqrt2)

-        // Convert optimised ShortArray blocks to FloatArray (dequantized)
+        // Convert optimised ShortArray blocks to FloatArray (dequantised)
        for (blockIndex in 0 until blocks.size) {
            val block = blocks[blockIndex]
            if (block != null) {
@@ -3243,7 +3248,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
        val coeffsSize = 256 // 16x16 = 256
        val numBlocks = blocksX * blocksY

-        // OPTIMIZATION 1: Pre-compute quantization values to avoid repeated calculations
+        // OPTIMIZATION 1: Pre-compute quantisation values to avoid repeated calculations
        val quantValues = Array(numBlocks) { IntArray(coeffsSize) }
        val quantHalfValues = Array(numBlocks) { IntArray(coeffsSize) }

@@ -3254,7 +3259,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                val qualityMult = jpeg_quality_to_mult(qScale * rateControlFactor)

                quantValues[blockIndex][0] = 1 // DC is lossless
-                quantHalfValues[blockIndex][0] = 0 // DC has no quantization interval
+                quantHalfValues[blockIndex][0] = 0 // DC has no quantisation interval

                for (i in 1 until coeffsSize) {
                    val coeffIdx = i.coerceIn(0, quantTable.size - 1)
@@ -3269,7 +3274,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
        val blocksMid = Array(numBlocks) { IntArray(coeffsSize) }
        val blocksOff = Array(numBlocks) { LongArray(coeffsSize) } // Keep Long for accumulation

-        // Step 1: Setup dequantized values and initialize adjustments (BULK OPTIMIZED)
+        // Step 1: Setup dequantised values and initialize adjustments (BULK OPTIMIZED)
        for (blockIndex in 0 until numBlocks) {
            val block = blocks[blockIndex]
            if (block != null) {
@@ -3277,8 +3282,8 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                val off = blocksOff[blockIndex]
                val quantVals = quantValues[blockIndex]

-                // OPTIMIZATION 9: Bulk dequantization using vectorized operations
-                tevBulkDequantizeCoefficients(block, mid, quantVals, coeffsSize)
+                // OPTIMIZATION 9: Bulk dequantisation using vectorized operations
+                tevBulkDequantiseCoefficients(block, mid, quantVals, coeffsSize)

                // OPTIMIZATION 10: Bulk zero initialization of adjustments
                off.fill(0L)
@@ -3315,11 +3320,11 @@ class GraphicsJSR223Delegate(private val vm: VM) {
            }
        }

-        // Step 4: Apply corrections and clamp to quantization intervals (BULK OPTIMIZED)
+        // Step 4: Apply corrections and clamp to quantisation intervals (BULK OPTIMIZED)
        for (blockIndex in 0 until numBlocks) {
            val block = blocks[blockIndex]
            if (block != null) {
-                // OPTIMIZATION 11: Bulk apply corrections and quantization clamping
+                // OPTIMIZATION 11: Bulk apply corrections and quantisation clamping
                tevBulkApplyCorrectionsAndClamp(
                    block, blocksMid[blockIndex], blocksOff[blockIndex],
                    quantValues[blockIndex], quantHalfValues[blockIndex],
@@ -3332,10 +3337,10 @@ class GraphicsJSR223Delegate(private val vm: VM) {
    // BULK MEMORY ACCESS HELPER FUNCTIONS FOR KNUSPERLI

    /**
-     * OPTIMIZATION 9: Bulk dequantization using vectorized operations
-     * Performs coefficient * quantization in optimised chunks
+     * OPTIMIZATION 9: Bulk dequantisation using vectorized operations
+     * Performs coefficient * quantisation in optimised chunks
     */
-    private fun tevBulkDequantizeCoefficients(
+    private fun tevBulkDequantiseCoefficients(
        coeffs: ShortArray, result: IntArray, quantVals: IntArray, size: Int
    ) {
        // Process in chunks of 16 for better vectorization (CPU can process multiple values per instruction)
@@ -3372,7 +3377,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
    }

    /**
-     * OPTIMIZATION 11: Bulk apply corrections and quantization clamping
+     * OPTIMIZATION 11: Bulk apply corrections and quantisation clamping
     * Vectorized correction application with proper bounds checking
     */
    private fun tevBulkApplyCorrectionsAndClamp(
@@ -3404,7 +3409,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
            mid[i + 6] += corr6
            mid[i + 7] += corr7

-            // Apply quantization interval clamping - bulk operations
+            // Apply quantisation interval clamping - bulk operations
            val orig0 = block[i].toInt() * quantVals[i]
            val orig1 = block[i + 1].toInt() * quantVals[i + 1]
            val orig2 = block[i + 2].toInt() * quantVals[i + 2]
@@ -3423,7 +3428,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
            mid[i + 6] = mid[i + 6].coerceIn(orig6 - quantHalf[i + 6], orig6 + quantHalf[i + 6])
            mid[i + 7] = mid[i + 7].coerceIn(orig7 - quantHalf[i + 7], orig7 + quantHalf[i + 7])

-            // Convert back to quantized coefficients - bulk operations
+            // Convert back to quantised coefficients - bulk operations
            val quantMax = Short.MAX_VALUE.toInt()
            val quantMin = Short.MIN_VALUE.toInt()
            block[i] = (mid[i] / quantVals[i]).coerceIn(quantMin, quantMax).toShort()
@@ -3603,7 +3608,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
        val coeffsSize = 64
        val numBlocks = blocksX * blocksY

-        // Step 1: Setup quantization intervals for all blocks (using integers like Google's code)
+        // Step 1: Setup quantisation intervals for all blocks (using integers like Google's code)
        val blocksMid = Array(numBlocks) { IntArray(coeffsSize) }
        val blocksMin = Array(numBlocks) { IntArray(coeffsSize) }
        val blocksMax = Array(numBlocks) { IntArray(coeffsSize) }
@@ -3617,19 +3622,19 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                    val quantIdx = i.coerceIn(0, quantTable.size - 1)

                    if (i == 0) {
-                        // DC coefficient: lossless (no quantization)
+                        // DC coefficient: lossless (no quantisation)
                        val dcValue = block[i].toInt()
                        blocksMid[blockIndex][i] = dcValue
                        blocksMin[blockIndex][i] = dcValue  // No interval for DC
                        blocksMax[blockIndex][i] = dcValue
                    } else {
-                        // AC coefficients: use quantization intervals
+                        // AC coefficients: use quantisation intervals
                        val quant = (quantTable[quantIdx] * jpeg_quality_to_mult(qScale * rateControlFactor)).coerceIn(1f, 255f).toInt()

-                        // Standard dequantized value (midpoint)
+                        // Standard dequantised value (midpoint)
                        blocksMid[blockIndex][i] = block[i].toInt() * quant

-                        // Quantization interval bounds
+                        // Quantisation interval bounds
                        val halfQuant = quant / 2
                        blocksMin[blockIndex][i] = blocksMid[blockIndex][i] - halfQuant
                        blocksMax[blockIndex][i] = blocksMid[blockIndex][i] + halfQuant
@@ -3671,7 +3676,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
            }
        }

-        // Step 4: Apply corrections and return optimised dequantized coefficients
+        // Step 4: Apply corrections and return optimised dequantised coefficients
        val result = Array<FloatArray?>(blocks.size) { null }
        for (blockIndex in 0 until numBlocks) {
            val block = blocks[blockIndex]
@@ -3680,7 +3685,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                    // Apply corrections with sqrt(2)/2 weighting (Google's exact formula with right shift)
                    blocksMid[blockIndex][i] += ((blocksOff[blockIndex][i] * kHalfSqrt2) shr 31).toInt()

-                    // Clamp to quantization interval bounds
+                    // Clamp to quantisation interval bounds
                    val optimisedValue = blocksMid[blockIndex][i].coerceIn(
                        blocksMin[blockIndex][i],
                        blocksMax[blockIndex][i]
@@ -3819,8 +3824,8 @@ class GraphicsJSR223Delegate(private val vm: VM) {
        var readPtr = blockDataPtr

        try {
-            val tilesX = (width + TAV_TILE_SIZE_X - 1) / TAV_TILE_SIZE_X  // 280x224 tiles
-            val tilesY = (height + TAV_TILE_SIZE_Y - 1) / TAV_TILE_SIZE_Y
+            val tilesX = (width + TILE_SIZE_X - 1) / TILE_SIZE_X  // 280x224 tiles
+            val tilesY = (height + TILE_SIZE_Y - 1) / TILE_SIZE_Y
            
            // Process each tile
            for (tileY in 0 until tilesY) {
@@ -3836,6 +3841,13 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                    val rcf = vm.peekFloat(readPtr)
                    readPtr += 4

+                    // debug print: raw decompressed bytes
+                    /*print("TAV Decode raw bytes (Frame $frameCounter, mode: ${arrayOf("SKIP", "INTRA", "DELTA")[mode]}): ")
+                    for (i in 0 until 32) {
+                        print("${vm.peek(blockDataPtr + i).toUint().toString(16).uppercase().padStart(2, '0')} ")
+                    }
+                    println("...")*/
+
                    when (mode) {
                        0x00 -> { // TAV_MODE_SKIP
                            // Copy 280x224 tile from previous frame to current frame
@@ -3847,17 +3859,11 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                                                          width, height, qY, qCo, qCg, rcf,
                                                          waveletFilter, decompLevels, isLossless, tavVersion)
                        }
-                        0x02 -> { // TAV_MODE_INTER
-                            // Motion compensation + DWT residual to RGB buffer
-                            readPtr = tavDecodeDWTInterTileRGB(readPtr, tileX, tileY, mvX, mvY,
-                                                          currentRGBAddr, prevRGBAddr,
-                                                          width, height, qY, qCo, qCg, rcf,
-                                                          waveletFilter, decompLevels, isLossless, tavVersion)
-                        }
-                        0x03 -> { // TAV_MODE_MOTION
-                            // Motion compensation only (no residual)
-                            tavApplyMotionCompensationRGB(tileX, tileY, mvX, mvY,
-                                                            currentRGBAddr, prevRGBAddr, width, height)
+                        0x02 -> { // TAV_MODE_DELTA
+                            // Coefficient delta encoding for efficient P-frames
+                            readPtr = tavDecodeDeltaTileRGB(readPtr, tileX, tileY, currentRGBAddr,
+                                                      width, height, qY, qCo, qCg, rcf,
+                                                      waveletFilter, decompLevels, isLossless, tavVersion)
                        }
                    }
                }
@@ -3872,13 +3878,13 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                                         width: Int, height: Int, qY: Int, qCo: Int, qCg: Int, rcf: Float,
                                         waveletFilter: Int, decompLevels: Int, isLossless: Boolean, tavVersion: Int): Long {
        // Now reading padded coefficient tiles (344x288) instead of core tiles (280x224)
-        val paddedCoeffCount = TAV_PADDED_TILE_SIZE_X * TAV_PADDED_TILE_SIZE_Y
+        val paddedCoeffCount = PADDED_TILE_SIZE_X * PADDED_TILE_SIZE_Y
        var ptr = readPtr
        
-        // Read quantized DWT coefficients for padded tile Y, Co, Cg channels (344x288)
-        val quantizedY = ShortArray(paddedCoeffCount)
-        val quantizedCo = ShortArray(paddedCoeffCount)
-        val quantizedCg = ShortArray(paddedCoeffCount)
+        // Read quantised DWT coefficients for padded tile Y, Co, Cg channels (344x288)
+        val quantisedY = ShortArray(paddedCoeffCount)
+        val quantisedCo = ShortArray(paddedCoeffCount)
+        val quantisedCg = ShortArray(paddedCoeffCount)
        
        // OPTIMIZATION: Bulk read all coefficient data (344x288 * 3 channels * 2 bytes = 594,432 bytes)
        val totalCoeffBytes = paddedCoeffCount * 3 * 2L  // 3 channels, 2 bytes per short
@@ -3888,51 +3894,62 @@ class GraphicsJSR223Delegate(private val vm: VM) {
        // Convert bulk data to coefficient arrays
        var bufferOffset = 0
        for (i in 0 until paddedCoeffCount) {
-            quantizedY[i] = (((coeffBuffer[bufferOffset + 1].toInt() and 0xFF) shl 8) or (coeffBuffer[bufferOffset].toInt() and 0xFF)).toShort()
+            quantisedY[i] = (((coeffBuffer[bufferOffset + 1].toInt() and 0xFF) shl 8) or (coeffBuffer[bufferOffset].toInt() and 0xFF)).toShort()
            bufferOffset += 2
        }
        for (i in 0 until paddedCoeffCount) {
-            quantizedCo[i] = (((coeffBuffer[bufferOffset + 1].toInt() and 0xFF) shl 8) or (coeffBuffer[bufferOffset].toInt() and 0xFF)).toShort()
+            quantisedCo[i] = (((coeffBuffer[bufferOffset + 1].toInt() and 0xFF) shl 8) or (coeffBuffer[bufferOffset].toInt() and 0xFF)).toShort()
            bufferOffset += 2
        }
        for (i in 0 until paddedCoeffCount) {
-            quantizedCg[i] = (((coeffBuffer[bufferOffset + 1].toInt() and 0xFF) shl 8) or (coeffBuffer[bufferOffset].toInt() and 0xFF)).toShort()
+            quantisedCg[i] = (((coeffBuffer[bufferOffset + 1].toInt() and 0xFF) shl 8) or (coeffBuffer[bufferOffset].toInt() and 0xFF)).toShort()
            bufferOffset += 2
        }
        
        ptr += totalCoeffBytes.toInt()
        
-        // Dequantize padded coefficient tiles (344x288)
+        // Dequantise padded coefficient tiles (344x288)
        val yPaddedTile = FloatArray(paddedCoeffCount)
        val coPaddedTile = FloatArray(paddedCoeffCount)
        val cgPaddedTile = FloatArray(paddedCoeffCount)
        
        for (i in 0 until paddedCoeffCount) {
-            yPaddedTile[i] = quantizedY[i] * qY * rcf
-            coPaddedTile[i] = quantizedCo[i] * qCo * rcf
-            cgPaddedTile[i] = quantizedCg[i] * qCg * rcf
+            yPaddedTile[i] = quantisedY[i] * qY * rcf
+            coPaddedTile[i] = quantisedCo[i] * qCo * rcf
+            cgPaddedTile[i] = quantisedCg[i] * qCg * rcf
        }
        
+        // Store coefficients for future delta reference (for P-frames)
+        val tileIdx = tileY * ((width + TILE_SIZE_X - 1) / TILE_SIZE_X) + tileX
+        if (tavPreviousCoeffsY == null) {
+            tavPreviousCoeffsY = mutableMapOf()
+            tavPreviousCoeffsCo = mutableMapOf()
+            tavPreviousCoeffsCg = mutableMapOf()
+        }
+        tavPreviousCoeffsY!![tileIdx] = yPaddedTile.clone()
+        tavPreviousCoeffsCo!![tileIdx] = coPaddedTile.clone()
+        tavPreviousCoeffsCg!![tileIdx] = cgPaddedTile.clone()
+        
        // Apply inverse DWT on full padded tiles (344x288)
        if (isLossless) {
-            tavApplyDWTInverseMultiLevel(yPaddedTile, TAV_PADDED_TILE_SIZE_X, TAV_PADDED_TILE_SIZE_Y, decompLevels, 0)
-            tavApplyDWTInverseMultiLevel(coPaddedTile, TAV_PADDED_TILE_SIZE_X, TAV_PADDED_TILE_SIZE_Y, decompLevels, 0)
-            tavApplyDWTInverseMultiLevel(cgPaddedTile, TAV_PADDED_TILE_SIZE_X, TAV_PADDED_TILE_SIZE_Y, decompLevels, 0)
+            tavApplyDWTInverseMultiLevel(yPaddedTile, PADDED_TILE_SIZE_X, PADDED_TILE_SIZE_Y, decompLevels, 0)
+            tavApplyDWTInverseMultiLevel(coPaddedTile, PADDED_TILE_SIZE_X, PADDED_TILE_SIZE_Y, decompLevels, 0)
+            tavApplyDWTInverseMultiLevel(cgPaddedTile, PADDED_TILE_SIZE_X, PADDED_TILE_SIZE_Y, decompLevels, 0)
        } else {
-            tavApplyDWTInverseMultiLevel(yPaddedTile, TAV_PADDED_TILE_SIZE_X, TAV_PADDED_TILE_SIZE_Y, decompLevels, waveletFilter)
-            tavApplyDWTInverseMultiLevel(coPaddedTile, TAV_PADDED_TILE_SIZE_X, TAV_PADDED_TILE_SIZE_Y, decompLevels, waveletFilter)
-            tavApplyDWTInverseMultiLevel(cgPaddedTile, TAV_PADDED_TILE_SIZE_X, TAV_PADDED_TILE_SIZE_Y, decompLevels, waveletFilter)
+            tavApplyDWTInverseMultiLevel(yPaddedTile, PADDED_TILE_SIZE_X, PADDED_TILE_SIZE_Y, decompLevels, waveletFilter)
+            tavApplyDWTInverseMultiLevel(coPaddedTile, PADDED_TILE_SIZE_X, PADDED_TILE_SIZE_Y, decompLevels, waveletFilter)
+            tavApplyDWTInverseMultiLevel(cgPaddedTile, PADDED_TILE_SIZE_X, PADDED_TILE_SIZE_Y, decompLevels, waveletFilter)
        }
        
        // Extract core 280x224 pixels from reconstructed padded tiles (344x288)
-        val yTile = FloatArray(TAV_TILE_SIZE_X * TAV_TILE_SIZE_Y)
-        val coTile = FloatArray(TAV_TILE_SIZE_X * TAV_TILE_SIZE_Y)
-        val cgTile = FloatArray(TAV_TILE_SIZE_X * TAV_TILE_SIZE_Y)
+        val yTile = FloatArray(TILE_SIZE_X * TILE_SIZE_Y)
+        val coTile = FloatArray(TILE_SIZE_X * TILE_SIZE_Y)
+        val cgTile = FloatArray(TILE_SIZE_X * TILE_SIZE_Y)
        
-        for (y in 0 until TAV_TILE_SIZE_Y) {
-            for (x in 0 until TAV_TILE_SIZE_X) {
-                val coreIdx = y * TAV_TILE_SIZE_X + x
-                val paddedIdx = (y + TAV_TILE_MARGIN) * TAV_PADDED_TILE_SIZE_X + (x + TAV_TILE_MARGIN)
+        for (y in 0 until TILE_SIZE_Y) {
+            for (x in 0 until TILE_SIZE_X) {
+                val coreIdx = y * TILE_SIZE_X + x
+                val paddedIdx = (y + TAV_TILE_MARGIN) * PADDED_TILE_SIZE_X + (x + TAV_TILE_MARGIN)
                
                yTile[coreIdx] = yPaddedTile[paddedIdx]
                coTile[coreIdx] = coPaddedTile[paddedIdx]
@@ -3952,17 +3969,17 @@ class GraphicsJSR223Delegate(private val vm: VM) {

    private fun tavConvertYCoCgTileToRGB(tileX: Int, tileY: Int, yTile: FloatArray, coTile: FloatArray, cgTile: FloatArray,
                                         rgbAddr: Long, width: Int, height: Int) {
-        val startX = tileX * TAV_TILE_SIZE_X
-        val startY = tileY * TAV_TILE_SIZE_Y
+        val startX = tileX * TILE_SIZE_X
+        val startY = tileY * TILE_SIZE_Y
        
        // OPTIMIZATION: Process pixels row by row with bulk copying for better cache locality
-        for (y in 0 until TAV_TILE_SIZE_Y) {
+        for (y in 0 until TILE_SIZE_Y) {
            val frameY = startY + y
            if (frameY >= height) break
            
            // Calculate valid pixel range for this row
            val validStartX = maxOf(0, startX)
-            val validEndX = minOf(width, startX + TAV_TILE_SIZE_X)
+            val validEndX = minOf(width, startX + TILE_SIZE_X)
            val validPixelsInRow = validEndX - validStartX
            
            if (validPixelsInRow > 0) {
@@ -3971,7 +3988,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                var bufferIdx = 0
                
                for (x in validStartX until validEndX) {
-                    val tileIdx = y * TAV_TILE_SIZE_X + (x - startX)
+                    val tileIdx = y * TILE_SIZE_X + (x - startX)
                    
                    // YCoCg-R to RGB conversion (exact inverse of encoder)
                    val Y = yTile[tileIdx]
@@ -3999,17 +4016,17 @@ class GraphicsJSR223Delegate(private val vm: VM) {

    private fun tavConvertICtCpTileToRGB(tileX: Int, tileY: Int, iTile: FloatArray, ctTile: FloatArray, cpTile: FloatArray,
                                         rgbAddr: Long, width: Int, height: Int) {
-        val startX = tileX * TAV_TILE_SIZE_X
-        val startY = tileY * TAV_TILE_SIZE_Y
+        val startX = tileX * TILE_SIZE_X
+        val startY = tileY * TILE_SIZE_Y
        
        // OPTIMIZATION: Process pixels row by row with bulk copying for better cache locality
-        for (y in 0 until TAV_TILE_SIZE_Y) {
+        for (y in 0 until TILE_SIZE_Y) {
            val frameY = startY + y
            if (frameY >= height) break
            
            // Calculate valid pixel range for this row
            val validStartX = maxOf(0, startX)
-            val validEndX = minOf(width, startX + TAV_TILE_SIZE_X)
+            val validEndX = minOf(width, startX + TILE_SIZE_X)
            val validPixelsInRow = validEndX - validStartX
            
            if (validPixelsInRow > 0) {
@@ -4018,7 +4035,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                var bufferIdx = 0
                
                for (x in validStartX until validEndX) {
-                    val tileIdx = y * TAV_TILE_SIZE_X + (x - startX)
+                    val tileIdx = y * TILE_SIZE_X + (x - startX)
                    
                    // ICtCp to sRGB conversion (adapted from encoder ICtCp functions)
                    val I = iTile[tileIdx].toDouble() / 255.0
@@ -4060,16 +4077,16 @@ class GraphicsJSR223Delegate(private val vm: VM) {

    private fun tavAddYCoCgResidualToRGBTile(tileX: Int, tileY: Int, yRes: FloatArray, coRes: FloatArray, cgRes: FloatArray,
                                             rgbAddr: Long, width: Int, height: Int) {
-        val startX = tileX * TAV_TILE_SIZE_X
-        val startY = tileY * TAV_TILE_SIZE_Y
+        val startX = tileX * TILE_SIZE_X
+        val startY = tileY * TILE_SIZE_Y

-        for (y in 0 until TAV_TILE_SIZE_Y) {
-            for (x in 0 until TAV_TILE_SIZE_X) {
+        for (y in 0 until TILE_SIZE_Y) {
+            for (x in 0 until TILE_SIZE_X) {
                val frameX = startX + x
                val frameY = startY + y

                if (frameX < width && frameY < height) {
-                    val tileIdx = y * TAV_TILE_SIZE_X + x
+                    val tileIdx = y * TILE_SIZE_X + x
                    val pixelIdx = frameY * width + frameX
                    val rgbOffset = pixelIdx * 3L

@@ -4105,17 +4122,17 @@ class GraphicsJSR223Delegate(private val vm: VM) {

    // Helper functions (simplified versions of existing DWT functions)
    private fun tavCopyTileRGB(tileX: Int, tileY: Int, currentRGBAddr: Long, prevRGBAddr: Long, width: Int, height: Int) {
-        val startX = tileX * TAV_TILE_SIZE_X
-        val startY = tileY * TAV_TILE_SIZE_Y
+        val startX = tileX * TILE_SIZE_X
+        val startY = tileY * TILE_SIZE_Y
        
        // OPTIMIZATION: Copy entire rows at once for maximum performance
-        for (y in 0 until TAV_TILE_SIZE_Y) {
+        for (y in 0 until TILE_SIZE_Y) {
            val frameY = startY + y
            if (frameY >= height) break
            
            // Calculate valid pixel range for this row
            val validStartX = maxOf(0, startX)
-            val validEndX = minOf(width, startX + TAV_TILE_SIZE_X)
+            val validEndX = minOf(width, startX + TILE_SIZE_X)
            val validPixelsInRow = validEndX - validStartX
            
            if (validPixelsInRow > 0) {
@@ -4132,31 +4149,105 @@ class GraphicsJSR223Delegate(private val vm: VM) {
        }
    }

-    private fun tavDecodeDWTInterTileRGB(readPtr: Long, tileX: Int, tileY: Int, mvX: Int, mvY: Int,
-                                         currentRGBAddr: Long, prevRGBAddr: Long,
-                                         width: Int, height: Int, qY: Int, qCo: Int, qCg: Int, rcf: Float,
-                                         waveletFilter: Int, decompLevels: Int, isLossless: Boolean, tavVersion: Int): Long {
+    private fun tavDecodeDeltaTileRGB(readPtr: Long, tileX: Int, tileY: Int, currentRGBAddr: Long,
+                                      width: Int, height: Int, qY: Int, qCo: Int, qCg: Int, rcf: Float,
+                                      waveletFilter: Int, decompLevels: Int, isLossless: Boolean, tavVersion: Int): Long {
        
-        // Step 1: Apply motion compensation
-        tavApplyMotionCompensationRGB(tileX, tileY, mvX, mvY, currentRGBAddr, prevRGBAddr, width, height)
+        val tileIdx = tileY * ((width + TILE_SIZE_X - 1) / TILE_SIZE_X) + tileX
+        var ptr = readPtr
        
-        // Step 2: Add DWT residual (same as intra but add to existing pixels)
-        return tavDecodeDWTIntraTileRGB(readPtr, tileX, tileY, currentRGBAddr, width, height, qY, qCo, qCg, rcf, 
-                                   waveletFilter, decompLevels, isLossless, tavVersion)
+        // Initialize coefficient storage if needed
+        if (tavPreviousCoeffsY == null) {
+            tavPreviousCoeffsY = mutableMapOf()
+            tavPreviousCoeffsCo = mutableMapOf()
+            tavPreviousCoeffsCg = mutableMapOf()
+        }
+        
+        // Coefficient count for padded tiles: 344x288 = 99,072 coefficients per channel
+        val coeffCount = PADDED_TILE_SIZE_X * PADDED_TILE_SIZE_Y
+        
+        // Read delta coefficients (same format as intra: quantised int16 -> float)
+        val deltaY = ShortArray(coeffCount)
+        val deltaCo = ShortArray(coeffCount) 
+        val deltaCg = ShortArray(coeffCount)
+        
+        vm.bulkPeekShort(ptr.toInt(), deltaY, coeffCount * 2)
+        ptr += coeffCount * 2
+        vm.bulkPeekShort(ptr.toInt(), deltaCo, coeffCount * 2)
+        ptr += coeffCount * 2
+        vm.bulkPeekShort(ptr.toInt(), deltaCg, coeffCount * 2)
+        ptr += coeffCount * 2
+        
+        // Get or initialize previous coefficients for this tile
+        val prevY = tavPreviousCoeffsY!![tileIdx] ?: FloatArray(coeffCount)
+        val prevCo = tavPreviousCoeffsCo!![tileIdx] ?: FloatArray(coeffCount)
+        val prevCg = tavPreviousCoeffsCg!![tileIdx] ?: FloatArray(coeffCount)
+        
+        // Reconstruct current coefficients: current = previous + delta
+        val currentY = FloatArray(coeffCount)
+        val currentCo = FloatArray(coeffCount)
+        val currentCg = FloatArray(coeffCount)
+        
+        for (i in 0 until coeffCount) {
+            currentY[i] = prevY[i] + (deltaY[i].toFloat() * qY * rcf)
+            currentCo[i] = prevCo[i] + (deltaCo[i].toFloat() * qCo * rcf)
+            currentCg[i] = prevCg[i] + (deltaCg[i].toFloat() * qCg * rcf)
+        }
+        
+        // Store current coefficients as previous for next frame
+        tavPreviousCoeffsY!![tileIdx] = currentY.clone()
+        tavPreviousCoeffsCo!![tileIdx] = currentCo.clone()
+        tavPreviousCoeffsCg!![tileIdx] = currentCg.clone()
+        
+        // Apply inverse DWT
+        if (isLossless) {
+            tavApplyDWTInverseMultiLevel(currentY, PADDED_TILE_SIZE_X, PADDED_TILE_SIZE_Y, decompLevels, 0)
+            tavApplyDWTInverseMultiLevel(currentCo, PADDED_TILE_SIZE_X, PADDED_TILE_SIZE_Y, decompLevels, 0)
+            tavApplyDWTInverseMultiLevel(currentCg, PADDED_TILE_SIZE_X, PADDED_TILE_SIZE_Y, decompLevels, 0)
+        } else {
+            tavApplyDWTInverseMultiLevel(currentY, PADDED_TILE_SIZE_X, PADDED_TILE_SIZE_Y, decompLevels, waveletFilter)
+            tavApplyDWTInverseMultiLevel(currentCo, PADDED_TILE_SIZE_X, PADDED_TILE_SIZE_Y, decompLevels, waveletFilter)
+            tavApplyDWTInverseMultiLevel(currentCg, PADDED_TILE_SIZE_X, PADDED_TILE_SIZE_Y, decompLevels, waveletFilter)
+        }
+        
+        // Extract core 280x224 pixels and convert to RGB (same as intra)
+        val yTile = FloatArray(TILE_SIZE_X * TILE_SIZE_Y)
+        val coTile = FloatArray(TILE_SIZE_X * TILE_SIZE_Y)
+        val cgTile = FloatArray(TILE_SIZE_X * TILE_SIZE_Y)
+        
+        for (y in 0 until TILE_SIZE_Y) {
+            for (x in 0 until TILE_SIZE_X) {
+                val coreIdx = y * TILE_SIZE_X + x
+                val paddedIdx = (y + TAV_TILE_MARGIN) * PADDED_TILE_SIZE_X + (x + TAV_TILE_MARGIN)
+                
+                yTile[coreIdx] = currentY[paddedIdx]
+                coTile[coreIdx] = currentCo[paddedIdx]
+                cgTile[coreIdx] = currentCg[paddedIdx]
+            }
+        }
+        
+        // Convert to RGB based on TAV version
+        if (tavVersion == 2) {
+            tavConvertICtCpTileToRGB(tileX, tileY, yTile, coTile, cgTile, currentRGBAddr, width, height)
+        } else {
+            tavConvertYCoCgTileToRGB(tileX, tileY, yTile, coTile, cgTile, currentRGBAddr, width, height)
+        }
+        
+        return ptr
    }

    private fun tavApplyMotionCompensationRGB(tileX: Int, tileY: Int, mvX: Int, mvY: Int,
                                              currentRGBAddr: Long, prevRGBAddr: Long,
                                              width: Int, height: Int) {
-        val startX = tileX * TAV_TILE_SIZE_X
-        val startY = tileY * TAV_TILE_SIZE_Y
+        val startX = tileX * TILE_SIZE_X
+        val startY = tileY * TILE_SIZE_Y

        // Motion vectors in quarter-pixel precision
        val refX = startX + (mvX / 4.0f)
        val refY = startY + (mvY / 4.0f)

-        for (y in 0 until TAV_TILE_SIZE_Y) {
-            for (x in 0 until TAV_TILE_SIZE_X) {
+        for (y in 0 until TILE_SIZE_Y) {
+            for (x in 0 until TILE_SIZE_X) {
                val currentPixelIdx = (startY + y) * width + (startX + x)

                if (currentPixelIdx >= 0 && currentPixelIdx < width * height) {