decoder optimisation

This commit is contained in:
minjaesong
2025-09-16 14:46:56 +09:00
parent 3011c73168
commit dab56ee55d

View File

@@ -3879,23 +3879,27 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val quantizedCo = ShortArray(paddedCoeffCount) val quantizedCo = ShortArray(paddedCoeffCount)
val quantizedCg = ShortArray(paddedCoeffCount) val quantizedCg = ShortArray(paddedCoeffCount)
// Read Y coefficients (176x176) // OPTIMIZATION: Bulk read all coefficient data (176x176 * 3 channels * 2 bytes = 185,856 bytes)
val totalCoeffBytes = paddedCoeffCount * 3 * 2L // 3 channels, 2 bytes per short
val coeffBuffer = ByteArray(totalCoeffBytes.toInt())
UnsafeHelper.memcpyRaw(null, vm.usermem.ptr + ptr, coeffBuffer, UnsafeHelper.getArrayOffset(coeffBuffer), totalCoeffBytes)
// Convert bulk data to coefficient arrays
var bufferOffset = 0
for (i in 0 until paddedCoeffCount) { for (i in 0 until paddedCoeffCount) {
quantizedY[i] = vm.peekShort(ptr) quantizedY[i] = (((coeffBuffer[bufferOffset + 1].toInt() and 0xFF) shl 8) or (coeffBuffer[bufferOffset].toInt() and 0xFF)).toShort()
ptr += 2 bufferOffset += 2
}
for (i in 0 until paddedCoeffCount) {
quantizedCo[i] = (((coeffBuffer[bufferOffset + 1].toInt() and 0xFF) shl 8) or (coeffBuffer[bufferOffset].toInt() and 0xFF)).toShort()
bufferOffset += 2
}
for (i in 0 until paddedCoeffCount) {
quantizedCg[i] = (((coeffBuffer[bufferOffset + 1].toInt() and 0xFF) shl 8) or (coeffBuffer[bufferOffset].toInt() and 0xFF)).toShort()
bufferOffset += 2
} }
// Read Co coefficients (176x176) ptr += totalCoeffBytes.toInt()
for (i in 0 until paddedCoeffCount) {
quantizedCo[i] = vm.peekShort(ptr)
ptr += 2
}
// Read Cg coefficients (176x176)
for (i in 0 until paddedCoeffCount) {
quantizedCg[i] = vm.peekShort(ptr)
ptr += 2
}
// Dequantize padded coefficient tiles (176x176) // Dequantize padded coefficient tiles (176x176)
val yPaddedTile = FloatArray(paddedCoeffCount) val yPaddedTile = FloatArray(paddedCoeffCount)
@@ -3951,14 +3955,23 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val startX = tileX * tileSize val startX = tileX * tileSize
val startY = tileY * tileSize val startY = tileY * tileSize
// OPTIMIZATION: Process pixels row by row with bulk copying for better cache locality
for (y in 0 until tileSize) { for (y in 0 until tileSize) {
for (x in 0 until tileSize) { val frameY = startY + y
val frameX = startX + x if (frameY >= height) break
val frameY = startY + y
if (frameX < width && frameY < height) { // Calculate valid pixel range for this row
val tileIdx = y * tileSize + x val validStartX = maxOf(0, startX)
val pixelIdx = frameY * width + frameX val validEndX = minOf(width, startX + tileSize)
val validPixelsInRow = validEndX - validStartX
if (validPixelsInRow > 0) {
// Create row buffer for bulk RGB data
val rowRgbBuffer = ByteArray(validPixelsInRow * 3)
var bufferIdx = 0
for (x in validStartX until validEndX) {
val tileIdx = y * tileSize + (x - startX)
// YCoCg-R to RGB conversion (exact inverse of encoder) // YCoCg-R to RGB conversion (exact inverse of encoder)
val Y = yTile[tileIdx] val Y = yTile[tileIdx]
@@ -3971,11 +3984,15 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val b = tmp - Co / 2.0f val b = tmp - Co / 2.0f
val r = Co + b val r = Co + b
val rgbOffset = pixelIdx * 3L rowRgbBuffer[bufferIdx++] = r.toInt().coerceIn(0, 255).toByte()
vm.poke(rgbAddr + rgbOffset, r.toInt().coerceIn(0, 255).toByte()) rowRgbBuffer[bufferIdx++] = g.toInt().coerceIn(0, 255).toByte()
vm.poke(rgbAddr + rgbOffset + 1, g.toInt().coerceIn(0, 255).toByte()) rowRgbBuffer[bufferIdx++] = b.toInt().coerceIn(0, 255).toByte()
vm.poke(rgbAddr + rgbOffset + 2, b.toInt().coerceIn(0, 255).toByte())
} }
// OPTIMIZATION: Bulk copy entire row at once
val rowStartOffset = (frameY * width + validStartX) * 3L
UnsafeHelper.memcpyRaw(rowRgbBuffer, UnsafeHelper.getArrayOffset(rowRgbBuffer),
null, vm.usermem.ptr + rgbAddr + rowStartOffset, rowRgbBuffer.size.toLong())
} }
} }
} }
@@ -3986,14 +4003,23 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val startX = tileX * tileSize val startX = tileX * tileSize
val startY = tileY * tileSize val startY = tileY * tileSize
// OPTIMIZATION: Process pixels row by row with bulk copying for better cache locality
for (y in 0 until tileSize) { for (y in 0 until tileSize) {
for (x in 0 until tileSize) { val frameY = startY + y
val frameX = startX + x if (frameY >= height) break
val frameY = startY + y
if (frameX < width && frameY < height) { // Calculate valid pixel range for this row
val tileIdx = y * tileSize + x val validStartX = maxOf(0, startX)
val pixelIdx = frameY * width + frameX val validEndX = minOf(width, startX + tileSize)
val validPixelsInRow = validEndX - validStartX
if (validPixelsInRow > 0) {
// Create row buffer for bulk RGB data
val rowRgbBuffer = ByteArray(validPixelsInRow * 3)
var bufferIdx = 0
for (x in validStartX until validEndX) {
val tileIdx = y * tileSize + (x - startX)
// ICtCp to sRGB conversion (adapted from encoder ICtCp functions) // ICtCp to sRGB conversion (adapted from encoder ICtCp functions)
val I = iTile[tileIdx].toDouble() / 255.0 val I = iTile[tileIdx].toDouble() / 255.0
@@ -4020,11 +4046,15 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val gSrgb = srgbUnlinearize(gLin) val gSrgb = srgbUnlinearize(gLin)
val bSrgb = srgbUnlinearize(bLin) val bSrgb = srgbUnlinearize(bLin)
val rgbOffset = pixelIdx * 3L rowRgbBuffer[bufferIdx++] = (rSrgb * 255.0).toInt().coerceIn(0, 255).toByte()
vm.poke(rgbAddr + rgbOffset, (rSrgb * 255.0).toInt().coerceIn(0, 255).toByte()) rowRgbBuffer[bufferIdx++] = (gSrgb * 255.0).toInt().coerceIn(0, 255).toByte()
vm.poke(rgbAddr + rgbOffset + 1, (gSrgb * 255.0).toInt().coerceIn(0, 255).toByte()) rowRgbBuffer[bufferIdx++] = (bSrgb * 255.0).toInt().coerceIn(0, 255).toByte()
vm.poke(rgbAddr + rgbOffset + 2, (bSrgb * 255.0).toInt().coerceIn(0, 255).toByte())
} }
// OPTIMIZATION: Bulk copy entire row at once
val rowStartOffset = (frameY * width + validStartX) * 3L
UnsafeHelper.memcpyRaw(rowRgbBuffer, UnsafeHelper.getArrayOffset(rowRgbBuffer),
null, vm.usermem.ptr + rgbAddr + rowStartOffset, rowRgbBuffer.size.toLong())
} }
} }
} }
@@ -4081,24 +4111,26 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val startX = tileX * tileSize val startX = tileX * tileSize
val startY = tileY * tileSize val startY = tileY * tileSize
// OPTIMIZATION: Copy entire rows at once for maximum performance
for (y in 0 until tileSize) { for (y in 0 until tileSize) {
for (x in 0 until tileSize) { val frameY = startY + y
val frameX = startX + x if (frameY >= height) break
val frameY = startY + y
if (frameX < width && frameY < height) { // Calculate valid pixel range for this row
val pixelIdx = frameY * width + frameX val validStartX = maxOf(0, startX)
val rgbOffset = pixelIdx * 3L val validEndX = minOf(width, startX + tileSize)
val validPixelsInRow = validEndX - validStartX
// Copy RGB pixel from previous frame if (validPixelsInRow > 0) {
val r = vm.peek(prevRGBAddr + rgbOffset) val rowStartOffset = (frameY * width + validStartX) * 3L
val g = vm.peek(prevRGBAddr + rgbOffset + 1) val rowByteCount = validPixelsInRow * 3L
val b = vm.peek(prevRGBAddr + rgbOffset + 2)
vm.poke(currentRGBAddr + rgbOffset, r) // OPTIMIZATION: Bulk copy entire row of RGB data in one operation
vm.poke(currentRGBAddr + rgbOffset + 1, g) UnsafeHelper.memcpy(
vm.poke(currentRGBAddr + rgbOffset + 2, b) vm.usermem.ptr + prevRGBAddr + rowStartOffset,
} vm.usermem.ptr + currentRGBAddr + rowStartOffset,
rowByteCount
)
} }
} }
} }