From db5249596b97ddf18a94c005c6e898763597e58b Mon Sep 17 00:00:00 2001 From: minjaesong Date: Mon, 1 Sep 2025 23:42:12 +0900 Subject: [PATCH] 33% Faster video decoding by cheating on VM --- .../torvald/tsvm/GraphicsJSR223Delegate.kt | 250 +++++++++--------- tsvm_core/src/net/torvald/tsvm/VM.kt | 15 ++ 2 files changed, 143 insertions(+), 122 deletions(-) diff --git a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt index ad77469..561cd2e 100644 --- a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt +++ b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt @@ -1417,7 +1417,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } - private fun tevIdct8x8_fast(coeffs: IntArray, quantTable: FloatArray, isChromaResidual: Boolean = false, mult: Float = 1f): IntArray { + private fun tevIdct8x8_fast(coeffs: ShortArray, quantTable: FloatArray, isChromaResidual: Boolean = false, mult: Float = 1f): IntArray { val result = IntArray(64) // Reuse preallocated temp buffer to reduce GC pressure @@ -1468,7 +1468,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { } // 16x16 IDCT for Y channel (YCoCg-R format) - private fun tevIdct16x16_fast(coeffs: IntArray, quantTable: FloatArray, mult: Float = 1.0f): IntArray { + private fun tevIdct16x16_fast(coeffs: ShortArray, quantTable: FloatArray, mult: Float = 1.0f): IntArray { val result = IntArray(256) // 16x16 = 256 // Process coefficients and dequantize using preallocated buffer @@ -1767,23 +1767,38 @@ class GraphicsJSR223Delegate(private val vm: VM) { when (mode) { - 0x00 -> { // TEV_MODE_SKIP - copy RGB from previous frame - for (dy in 0 until 16) { - for (dx in 0 until 16) { - val x = startX + dx - val y = startY + dy - if (x < width && y < height) { - val pixelOffset = y.toLong() * width + x - val rgbOffset = pixelOffset * 3 - - // Copy RGB values from previous frame - val prevR = vm.peek(prevRGBAddr + rgbOffset*prevAddrIncVec)!! 
- val prevG = vm.peek(prevRGBAddr + (rgbOffset + 1)*prevAddrIncVec)!! - val prevB = vm.peek(prevRGBAddr + (rgbOffset + 2)*prevAddrIncVec)!! - - vm.poke(currentRGBAddr + rgbOffset*thisAddrIncVec, prevR) - vm.poke(currentRGBAddr + (rgbOffset + 1)*thisAddrIncVec, prevG) - vm.poke(currentRGBAddr + (rgbOffset + 2)*thisAddrIncVec, prevB) + 0x00 -> { // TEV_MODE_SKIP - copy RGB from previous frame (optimized with memcpy) + // Check if we can copy the entire block at once (no clipping) + if (startX + 16 <= width && startY + 16 <= height) { + // Optimized case: copy entire 16x16 block with row-by-row memcpy + for (dy in 0 until 16) { + val srcRowOffset = ((startY + dy).toLong() * width + startX) * 3 + val dstRowOffset = srcRowOffset + vm.memcpy( + (prevRGBAddr + srcRowOffset*prevAddrIncVec).toInt(), + (currentRGBAddr + dstRowOffset*thisAddrIncVec).toInt(), + 48 // 16 pixels × 3 bytes = 48 bytes per row + ) + } + } else { + // Fallback to pixel-by-pixel for boundary blocks + for (dy in 0 until 16) { + for (dx in 0 until 16) { + val x = startX + dx + val y = startY + dy + if (x < width && y < height) { + val pixelOffset = y.toLong() * width + x + val rgbOffset = pixelOffset * 3 + + // Copy RGB values from previous frame + val prevR = vm.peek(prevRGBAddr + rgbOffset*prevAddrIncVec)!! + val prevG = vm.peek(prevRGBAddr + (rgbOffset + 1)*prevAddrIncVec)!! + val prevB = vm.peek(prevRGBAddr + (rgbOffset + 2)*prevAddrIncVec)!! 
+ + vm.poke(currentRGBAddr + rgbOffset*thisAddrIncVec, prevR) + vm.poke(currentRGBAddr + (rgbOffset + 1)*thisAddrIncVec, prevG) + vm.poke(currentRGBAddr + (rgbOffset + 2)*thisAddrIncVec, prevB) + } } } } @@ -1791,55 +1806,90 @@ class GraphicsJSR223Delegate(private val vm: VM) { readPtr += 768 } - 0x03 -> { // TEV_MODE_MOTION - motion compensation with RGB - for (dy in 0 until 16) { - for (dx in 0 until 16) { - val x = startX + dx - val y = startY + dy - val refX = x + mvX // Test: revert to original motion compensation - val refY = y + mvY - - if (x < width && y < height) { - val dstPixelOffset = y.toLong() * width + x - val dstRgbOffset = dstPixelOffset * 3 + 0x03 -> { // TEV_MODE_MOTION - motion compensation with RGB (optimized with memcpy) + if (debugMotionVectors) { + // Debug mode: use original pixel-by-pixel for motion vector visualization + for (dy in 0 until 16) { + for (dx in 0 until 16) { + val x = startX + dx + val y = startY + dy + val refX = x + mvX + val refY = y + mvY - if (refX >= 0 && refY >= 0 && refX < width && refY < height) { - val refPixelOffset = refY.toLong() * width + refX - val refRgbOffset = refPixelOffset * 3 + if (x < width && y < height) { + val dstPixelOffset = y.toLong() * width + x + val dstRgbOffset = dstPixelOffset * 3 - // Additional safety: ensure RGB offset is within valid range - val maxValidOffset = (width * height - 1) * 3L + 2 - if (refRgbOffset >= 0 && refRgbOffset <= maxValidOffset) { - // Copy RGB from reference position - val refR = vm.peek(prevRGBAddr + refRgbOffset*prevAddrIncVec)!! - val refG = vm.peek(prevRGBAddr + (refRgbOffset + 1)*prevAddrIncVec)!! - val refB = vm.peek(prevRGBAddr + (refRgbOffset + 2)*prevAddrIncVec)!! 
+ // Debug: Color INTER blocks by motion vector magnitude + val mvMagnitude = kotlin.math.sqrt((mvX * mvX + mvY * mvY).toDouble()).toInt() + val intensity = (mvMagnitude * 8).coerceIn(0, 255) // Scale for visibility + + vm.poke(currentRGBAddr + dstRgbOffset*thisAddrIncVec, intensity.toByte()) // R = MV magnitude + vm.poke(currentRGBAddr + (dstRgbOffset + 1)*thisAddrIncVec, 0.toByte()) // G = 0 + vm.poke(currentRGBAddr + (dstRgbOffset + 2)*thisAddrIncVec, (255-intensity).toByte()) // B = inverse + } + } + } + } else { + // Optimized motion compensation + val refStartX = startX + mvX + val refStartY = startY + mvY + + // Check if entire 16x16 block can be copied with memcpy (no bounds issues) + if (startX + 16 <= width && startY + 16 <= height && + refStartX >= 0 && refStartY >= 0 && refStartX + 16 <= width && refStartY + 16 <= height) { + + // Optimized case: copy entire 16x16 block with row-by-row memcpy + for (dy in 0 until 16) { + val srcRowOffset = ((refStartY + dy).toLong() * width + refStartX) * 3 + val dstRowOffset = ((startY + dy).toLong() * width + startX) * 3 + vm.memcpy( + (prevRGBAddr + srcRowOffset*prevAddrIncVec).toInt(), + (currentRGBAddr + dstRowOffset*thisAddrIncVec).toInt(), + 48 // 16 pixels × 3 bytes = 48 bytes per row + ) + } + } else { + // Fallback to pixel-by-pixel for boundary/out-of-bounds cases + for (dy in 0 until 16) { + for (dx in 0 until 16) { + val x = startX + dx + val y = startY + dy + val refX = x + mvX + val refY = y + mvY + + if (x < width && y < height) { + val dstPixelOffset = y.toLong() * width + x + val dstRgbOffset = dstPixelOffset * 3 - - if (debugMotionVectors) { - // Debug: Color INTER blocks by motion vector magnitude - val mvMagnitude = kotlin.math.sqrt((mvX * mvX + mvY * mvY).toDouble()).toInt() - val intensity = (mvMagnitude * 8).coerceIn(0, 255) // Scale for visibility - - vm.poke(currentRGBAddr + dstRgbOffset*thisAddrIncVec, intensity.toByte()) // R = MV magnitude - vm.poke(currentRGBAddr + (dstRgbOffset + 
1)*thisAddrIncVec, 0.toByte()) // G = 0 - vm.poke(currentRGBAddr + (dstRgbOffset + 2)*thisAddrIncVec, (255-intensity).toByte()) // B = inverse + if (refX >= 0 && refY >= 0 && refX < width && refY < height) { + val refPixelOffset = refY.toLong() * width + refX + val refRgbOffset = refPixelOffset * 3 + + // Additional safety: ensure RGB offset is within valid range + val maxValidOffset = (width * height - 1) * 3L + 2 + if (refRgbOffset >= 0 && refRgbOffset <= maxValidOffset) { + // Copy RGB from reference position + val refR = vm.peek(prevRGBAddr + refRgbOffset*prevAddrIncVec)!! + val refG = vm.peek(prevRGBAddr + (refRgbOffset + 1)*prevAddrIncVec)!! + val refB = vm.peek(prevRGBAddr + (refRgbOffset + 2)*prevAddrIncVec)!! + + vm.poke(currentRGBAddr + dstRgbOffset*thisAddrIncVec, refR) + vm.poke(currentRGBAddr + (dstRgbOffset + 1)*thisAddrIncVec, refG) + vm.poke(currentRGBAddr + (dstRgbOffset + 2)*thisAddrIncVec, refB) + } else { + // Invalid RGB offset - use black + vm.poke(currentRGBAddr + dstRgbOffset*thisAddrIncVec, 0.toByte()) // R=0 + vm.poke(currentRGBAddr + (dstRgbOffset + 1)*thisAddrIncVec, 0.toByte()) // G=0 + vm.poke(currentRGBAddr + (dstRgbOffset + 2)*thisAddrIncVec, 0.toByte()) // B=0 + } } else { - vm.poke(currentRGBAddr + dstRgbOffset*thisAddrIncVec, refR) - vm.poke(currentRGBAddr + (dstRgbOffset + 1)*thisAddrIncVec, refG) - vm.poke(currentRGBAddr + (dstRgbOffset + 2)*thisAddrIncVec, refB) + // Out of bounds - use black + vm.poke(currentRGBAddr + dstRgbOffset*thisAddrIncVec, 0.toByte()) // R=0 + vm.poke(currentRGBAddr + (dstRgbOffset + 1)*thisAddrIncVec, 0.toByte()) // G=0 + vm.poke(currentRGBAddr + (dstRgbOffset + 2)*thisAddrIncVec, 0.toByte()) // B=0 } - } else { - // Invalid RGB offset - use black - vm.poke(currentRGBAddr + dstRgbOffset*thisAddrIncVec, 0.toByte()) // R=0 - vm.poke(currentRGBAddr + (dstRgbOffset + 1)*thisAddrIncVec, 0.toByte()) // G=0 - vm.poke(currentRGBAddr + (dstRgbOffset + 2)*thisAddrIncVec, 0.toByte()) // B=0 } - } else { - // Out 
of bounds - use black - vm.poke(currentRGBAddr + dstRgbOffset*thisAddrIncVec, 0.toByte()) // R=0 - vm.poke(currentRGBAddr + (dstRgbOffset + 1)*thisAddrIncVec, 0.toByte()) // G=0 - vm.poke(currentRGBAddr + (dstRgbOffset + 2)*thisAddrIncVec, 0.toByte()) // B=0 } } } @@ -1850,38 +1900,16 @@ class GraphicsJSR223Delegate(private val vm: VM) { 0x01 -> { // TEV_MODE_INTRA - Full YCoCg-R DCT decode (no motion compensation) // Read DCT coefficients: Y (16x16=256), Co (8x8=64), Cg (8x8=64) - val yCoeffs = IntArray(256) - val coCoeffs = IntArray(64) - val cgCoeffs = IntArray(64) - - // Read Y coefficients (16x16 = 256 coefficients × 2 bytes) - for (i in 0 until 256) { - val coeff = ((vm.peek(readPtr)!!.toUint()) or - ((vm.peek(readPtr + 1)!!.toUint()) shl 8)).toShort().toInt() - yCoeffs[i] = coeff - readPtr += 2 - } - - // Read Co coefficients (8x8 = 64 coefficients × 2 bytes) - for (i in 0 until 64) { - val coeff = ((vm.peek(readPtr)!!.toUint()) or - ((vm.peek(readPtr + 1)!!.toUint()) shl 8)).toShort().toInt() - coCoeffs[i] = coeff - readPtr += 2 - } - - // Read Cg coefficients (8x8 = 64 coefficients × 2 bytes) - for (i in 0 until 64) { - val coeff = ((vm.peek(readPtr)!!.toUint()) or - ((vm.peek(readPtr + 1)!!.toUint()) shl 8)).toShort().toInt() - cgCoeffs[i] = coeff - readPtr += 2 - } + + // Optimized bulk reading of all DCT coefficients: Y(256×2) + Co(64×2) + Cg(64×2) = 768 bytes + val coeffShortArray = ShortArray(384) // Total coefficients: 256 + 64 + 64 = 384 shorts + vm.bulkPeekShort(readPtr.toInt(), coeffShortArray, 768) + readPtr += 768 // Perform hardware IDCT for each channel using fast algorithm - val yBlock = tevIdct16x16_fast(yCoeffs, quantTableY, rateControlFactor) - val coBlock = tevIdct8x8_fast(coCoeffs, quantTableCo, true, rateControlFactor) - val cgBlock = tevIdct8x8_fast(cgCoeffs, if (tevVersion == 3) quantTableB else quantTableCg, true, rateControlFactor) + val yBlock = tevIdct16x16_fast(coeffShortArray.sliceArray(0 until 256), quantTableY, 
rateControlFactor) + val coBlock = tevIdct8x8_fast(coeffShortArray.sliceArray(256 until 320), quantTableCo, true, rateControlFactor) + val cgBlock = tevIdct8x8_fast(coeffShortArray.sliceArray(320 until 384), if (tevVersion == 3) quantTableB else quantTableCg, true, rateControlFactor) // Convert to RGB (YCoCg-R for v2, XYB for v3) val rgbData = if (tevVersion == 3) { @@ -1910,39 +1938,17 @@ class GraphicsJSR223Delegate(private val vm: VM) { 0x02 -> { // TEV_MODE_INTER - Motion compensation + residual DCT // Step 1: Read residual DCT coefficients - val yCoeffs = IntArray(256) - val coCoeffs = IntArray(64) - val cgCoeffs = IntArray(64) - // Read Y coefficients (16x16 = 256 coefficients × 2 bytes) - for (i in 0 until 256) { - val coeff = ((vm.peek(readPtr)!!.toUint()) or - ((vm.peek(readPtr + 1)!!.toUint()) shl 8)).toShort().toInt() - yCoeffs[i] = coeff - readPtr += 2 - } - - // Read Co coefficients (8x8 = 64 coefficients × 2 bytes) - for (i in 0 until 64) { - val coeff = ((vm.peek(readPtr)!!.toUint()) or - ((vm.peek(readPtr + 1)!!.toUint()) shl 8)).toShort().toInt() - coCoeffs[i] = coeff - readPtr += 2 - } - - // Read Cg coefficients (8x8 = 64 coefficients × 2 bytes) - for (i in 0 until 64) { - val coeff = ((vm.peek(readPtr)!!.toUint()) or - ((vm.peek(readPtr + 1)!!.toUint()) shl 8)).toShort().toInt() - cgCoeffs[i] = coeff - readPtr += 2 - } + // Optimized bulk reading of all DCT coefficients: Y(256×2) + Co(64×2) + Cg(64×2) = 768 bytes + val coeffShortArray = ShortArray(384) // Total coefficients: 256 + 64 + 64 = 384 shorts + vm.bulkPeekShort(readPtr.toInt(), coeffShortArray, 768) + readPtr += 768 // Step 2: Decode residual DCT - val yResidual = tevIdct16x16_fast(yCoeffs, quantTableY, rateControlFactor) - val coResidual = tevIdct8x8_fast(coCoeffs, quantTableCo, true, rateControlFactor) - val cgResidual = tevIdct8x8_fast(cgCoeffs, if (tevVersion == 3) quantTableB else quantTableCg, true, rateControlFactor) - + val yResidual = 
tevIdct16x16_fast(coeffShortArray.sliceArray(0 until 256), quantTableY, rateControlFactor)
+                        val coResidual = tevIdct8x8_fast(coeffShortArray.sliceArray(256 until 320), quantTableCo, true, rateControlFactor)
+                        val cgResidual = tevIdct8x8_fast(coeffShortArray.sliceArray(320 until 384), if (tevVersion == 3) quantTableB else quantTableCg, true, rateControlFactor)
+
                         // Step 3: Build motion-compensated YCoCg-R block and add residuals
                         val finalY = IntArray(256)
                         val finalCo = IntArray(64)
diff --git a/tsvm_core/src/net/torvald/tsvm/VM.kt b/tsvm_core/src/net/torvald/tsvm/VM.kt
index 2138cba..37c967e 100644
--- a/tsvm_core/src/net/torvald/tsvm/VM.kt
+++ b/tsvm_core/src/net/torvald/tsvm/VM.kt
@@ -541,6 +541,55 @@ class VM(
         }
     }
 
+    /**
+     * Copies [sizeInBytes] raw bytes of user memory, starting at offset [from], into [to].
+     * NOTE(review): the copy is unconverted, so values presumably take host byte order -- confirm.
+     *
+     * @throws IllegalArgumentException if the range leaves the 8 MiB window or overflows [to].
+     */
+    fun bulkPeekShort(from: Int, to: ShortArray, sizeInBytes: Int) {
+        requireBulkPeekInBounds(from, sizeInBytes, to.size * 2L) // 2 bytes per Short
+        UnsafeHelper.memcpyRaw(null, usermem.ptr + from, to, UnsafeHelper.getArrayOffset(to), sizeInBytes.toLong())
+    }
+
+    /**
+     * Copies [sizeInBytes] raw bytes of user memory, starting at offset [from], into [to].
+     * NOTE(review): the copy is unconverted, so values presumably take host byte order -- confirm.
+     *
+     * @throws IllegalArgumentException if the range leaves the 8 MiB window or overflows [to].
+     */
+    fun bulkPeekInt(from: Int, to: IntArray, sizeInBytes: Int) {
+        requireBulkPeekInBounds(from, sizeInBytes, to.size * 4L) // 4 bytes per Int
+        UnsafeHelper.memcpyRaw(null, usermem.ptr + from, to, UnsafeHelper.getArrayOffset(to), sizeInBytes.toLong())
+    }
+
+    /**
+     * Copies [sizeInBytes] raw bytes of user memory, starting at offset [from], into [to].
+     * NOTE(review): the copy is unconverted, so values presumably take host byte order -- confirm.
+     *
+     * @throws IllegalArgumentException if the range leaves the 8 MiB window or overflows [to].
+     */
+    fun bulkPeekFloat(from: Int, to: FloatArray, sizeInBytes: Int) {
+        requireBulkPeekInBounds(from, sizeInBytes, to.size * 4L) // 4 bytes per Float
+        UnsafeHelper.memcpyRaw(null, usermem.ptr + from, to, UnsafeHelper.getArrayOffset(to), sizeInBytes.toLong())
+    }
+
+    /**
+     * Validates a bulk read before it reaches the raw copy: [sizeInBytes] must be non-negative
+     * and fit into the destination ([capacityInBytes]), and the whole source range
+     * `from until from + sizeInBytes` must stay inside the first 8 MiB.
+     * NOTE(review): 8 MiB is the limit the previous check used; if `usermem` can be allocated
+     * smaller than that, compare against its real size instead -- confirm.
+     */
+    private fun requireBulkPeekInBounds(from: Int, sizeInBytes: Int, capacityInBytes: Long) {
+        require(sizeInBytes >= 0 && sizeInBytes <= capacityInBytes) {
+            "sizeInBytes $sizeInBytes does not fit a destination of $capacityInBytes bytes"
+        }
+        require(from >= 0 && from.toLong() + sizeInBytes <= 8L * 1024 * 1024) {
+            "source range $from..${from.toLong() + sizeInBytes} is outside the 8 MiB window"
+        }
+    }
+
     private fun relPtrInDev(from: Long, len: Long, start: Int, end: Int) =
         (from in start..end && (from + len) in start..end)
 