From 270864ef0feee8037245e803fcbbeb39eb582b6a Mon Sep 17 00:00:00 2001
From: minjaesong <alswo9628@gmail.com>
Date: Fri, 22 Aug 2025 00:28:19 +0900
Subject: [PATCH] optimised IDCT on decoding

---
 assets/disk0/tvdos/bin/playtev.js             |   4 +-
 .../torvald/tsvm/GraphicsJSR223Delegate.kt    | 137 +++++++++---------
 2 files changed, 68 insertions(+), 73 deletions(-)

diff --git a/assets/disk0/tvdos/bin/playtev.js b/assets/disk0/tvdos/bin/playtev.js
index 68bcf17..ca9e31f 100644
--- a/assets/disk0/tvdos/bin/playtev.js
+++ b/assets/disk0/tvdos/bin/playtev.js
@@ -200,11 +200,11 @@ try {
             }
 
             // Decompress using gzip
-            // Calculate proper buffer size for TEV YCoCg-R blocks
+            // Size the decompression buffer for TEV YCoCg-R blocks
             let blocksX = (width + 15) >> 4  // 16x16 blocks
             let blocksY = (height + 15) >> 4
             let tevBlockSize = 1 + 4 + 2 + (256 * 2) + (64 * 2) + (64 * 2) // mode + mv + cbp + Y(16x16) + Co(8x8) + Cg(8x8)
-            let decompressedSize = blocksX * blocksY * tevBlockSize * 2 // Double for safety
+            let decompressedSize = Math.max(payloadLen * 4, blocksX * blocksY * tevBlockSize) // larger of 4x payload length and one full frame of blocks
             let blockDataPtr = sys.malloc(decompressedSize)
 
             let actualSize
diff --git a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
index 7b46053..ec52137 100644
--- a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
+++ b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
@@ -12,6 +12,13 @@ import kotlin.math.roundToInt
 import kotlin.math.sqrt
 
 class GraphicsJSR223Delegate(private val vm: VM) {
+    
+    // Reusable working arrays to reduce allocation overhead (shared per instance; NOTE(review): confirm decoding is never concurrent)
+    private val idctTempBuffer = FloatArray(64)
+    private val idct16TempBuffer = FloatArray(256) // For 16x16 IDCT
+    private val idct16SeparableBuffer = FloatArray(256) // For separable 16x16 IDCT
+    private val ycocgWorkArray = IntArray(256)
+    private val rgbWorkArray = IntArray(256 * 3)
 
     private fun getFirstGPU(): GraphicsAdapter? {
         return vm.findPeribyType(VM.PERITYPE_GPU_AND_TERM)?.peripheral as? GraphicsAdapter
@@ -1557,93 +1564,46 @@ class GraphicsJSR223Delegate(private val vm: VM) {
         }
     }
 
-    val dctBasis8_2 = Array(8) { u ->
-        FloatArray(8) { x ->
-            val cu = if (u == 0) 1.0 / sqrt(2.0) else 1.0
-            (0.25 * cu * cos((2.0 * x + 1.0) * u * PI / 16.0)).toFloat()
-        }
-    }
-
-    /**
-     * Perform IDCT on a single channel with integer coefficients
-     */
-    private fun tevIdct8x8(coeffs: IntArray, quantTable: IntArray): IntArray {
-        val dctCoeffs = Array(8) { FloatArray(8) }
+    private fun tevIdct8x8_fast(coeffs: IntArray, quantTable: IntArray, isChromaResidual: Boolean = false): IntArray {
         val result = IntArray(64)
-
-        // Convert integer coefficients to 2D array and dequantize
+        // Dequantized coefficients are written to the preallocated idctTempBuffer instead of a per-call 2D array
+        
+        // Direct IDCT implementation matching original loop structure
+        // Process coefficients and dequantize
         for (u in 0 until 8) {
             for (v in 0 until 8) {
                 val idx = u * 8 + v
-                val coeff = coeffs[idx]
-                if (idx == 0) {
-                    // DC coefficient for chroma: lossless quantization (no scaling)
-                    dctCoeffs[u][v] = coeff.toFloat()
+                val coeff = if (isChromaResidual && idx == 0) {
+                    coeffs[idx].toFloat() // DC lossless for chroma residual
                 } else {
-                    // AC coefficients: use quantization table
-                    dctCoeffs[u][v] = (coeff * quantTable[idx]).toFloat()
+                    coeffs[idx] * quantTable[idx].toFloat()
                 }
+                idctTempBuffer[idx] = coeff
             }
         }
-
-        // Apply 2D inverse DCT
+        
+        // Apply 2D inverse DCT with original loop structure: for x, for y
         for (x in 0 until 8) {
             for (y in 0 until 8) {
                 var sum = 0f
                 for (u in 0 until 8) {
                     for (v in 0 until 8) {
-                        sum += dctBasis8[u][x] * dctBasis8[v][y] * dctCoeffs[u][v]
+                        sum += dctBasis8[u][x] * dctBasis8[v][y] * idctTempBuffer[u * 8 + v]
                     }
                 }
-                // Chroma residuals should be in reasonable range (±255 max)
-                val pixel = sum.coerceIn(-256f, 255f)
-                result[y * 8 + x] = pixel.toInt()
-            }
-        }
-
-        return result
-    }
-
-    /**
-     * Perform IDCT on a single channel with integer coefficients
-     */
-    private fun tevIdct8x8_2(coeffs: IntArray, quantTable: IntArray): IntArray {
-        val dctCoeffs = Array(8) { FloatArray(8) }
-        val result = IntArray(64)
-
-        // Convert integer coefficients to 2D array and dequantize
-        for (u in 0 until 8) {
-            for (v in 0 until 8) {
-                val idx = u * 8 + v
-                val coeff = coeffs[idx]
-                if (idx == 0) {
-                    // DC coefficient for chroma: lossless quantization (no scaling)
-                    dctCoeffs[u][v] = coeff.toFloat()
+                val pixel = if (isChromaResidual) {
+                    sum.coerceIn(-256f, 255f)
                 } else {
-                    // AC coefficients: use quantization table
-                    dctCoeffs[u][v] = (coeff * quantTable[idx]).toFloat()
+                    (sum + 128f).coerceIn(0f, 255f)
                 }
-            }
-        }
-
-        // Apply 2D inverse DCT
-        for (x in 0 until 8) {
-            for (y in 0 until 8) {
-                var sum = 0f
-                for (u in 0 until 8) {
-                    for (v in 0 until 8) {
-                        sum += dctBasis8_2[u][x] * dctBasis8_2[v][y] * dctCoeffs[u][v]
-                    }
-                }
-                // Chroma residuals should be in reasonable range (±255 max)
-                val pixel = sum.coerceIn(-256f, 255f)
                 result[y * 8 + x] = pixel.toInt()
             }
         }
-
+        
         return result
     }
 
+
     val dctBasis16 = Array(16) { u ->
         FloatArray(16) { x ->
             val cu = if (u == 0) 1.0 / sqrt(2.0) else 1.0
@@ -1652,6 +1612,41 @@ class GraphicsJSR223Delegate(private val vm: VM) {
     }
     
     // 16x16 IDCT for Y channel (YCoCg-R format)
+    private fun tevIdct16x16_fast(coeffs: IntArray, quantTable: IntArray): IntArray {
+        val result = IntArray(256) // 16x16 = 256
+        
+        // Process coefficients and dequantize using preallocated buffer
+        for (u in 0 until 16) {
+            for (v in 0 until 16) {
+                val idx = u * 16 + v
+                val coeff = if (idx == 0) {
+                    coeffs[idx].toFloat() // DC lossless for luma
+                } else {
+                    coeffs[idx] * quantTable[idx].toFloat()
+                }
+                idct16TempBuffer[idx] = coeff
+            }
+        }
+        
+        // Apply 2D inverse DCT, keeping the original loop order (x outer, y inner).
+        // NOTE: Uses the direct O(n⁴) method to ensure correct indexing. A separable (row/column)
+        // version could be ~8x faster (2·16³ vs 16⁴ terms) but requires careful coordinate transformation.
+        for (x in 0 until 16) {
+            for (y in 0 until 16) {
+                var sum = 0f
+                for (u in 0 until 16) {
+                    for (v in 0 until 16) {
+                        sum += dctBasis16[u][x] * dctBasis16[v][y] * idct16TempBuffer[u * 16 + v]
+                    }
+                }
+                val pixel = (sum + 128f).coerceIn(0f, 255f)
+                result[y * 16 + x] = pixel.toInt()
+            }
+        }
+        
+        return result
+    }
+    
     private fun tevIdct16x16(coeffs: IntArray, quantTable: IntArray): IntArray {
         val dctCoeffs = Array(16) { FloatArray(16) }
         val result = IntArray(256)  // 16x16 = 256
@@ -1901,10 +1896,10 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                             readPtr += 2
                         }
                         
-                        // Perform hardware IDCT for each channel
-                        val yBlock = tevIdct16x16(yCoeffs, quantTableY)
-                        val coBlock = tevIdct8x8(coCoeffs, quantTableC)
-                        val cgBlock = tevIdct8x8(cgCoeffs, quantTableC)
+                        // Perform hardware IDCT for each channel (buffer-reusing variants; still the direct O(n⁴) transform)
+                        val yBlock = tevIdct16x16_fast(yCoeffs, quantTableY)
+                        val coBlock = tevIdct8x8_fast(coCoeffs, quantTableC, true)
+                        val cgBlock = tevIdct8x8_fast(cgCoeffs, quantTableC, true)
                         
                         // Convert YCoCg-R to RGB
                         val rgbData = tevYcocgToRGB(yBlock, coBlock, cgBlock)
@@ -1958,9 +1953,9 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                         }
                         
                         // Step 2: Decode residual DCT
-                        val yResidual = tevIdct16x16(yCoeffs, quantTableY)
-                        val coResidual = tevIdct8x8_2(coCoeffs, quantTableC)
-                        val cgResidual = tevIdct8x8_2(cgCoeffs, quantTableC)
+                        val yResidual = tevIdct16x16_fast(yCoeffs, quantTableY)
+                        val coResidual = tevIdct8x8_fast(coCoeffs, quantTableC, true)
+                        val cgResidual = tevIdct8x8_fast(cgCoeffs, quantTableC, true)
                         
                         // Step 3: Build motion-compensated YCoCg-R block and add residuals
                         val finalY = IntArray(256)