interlacing optimisation with more memcpy

This commit is contained in:
minjaesong
2025-09-02 19:47:03 +09:00
parent b8311685d7
commit 4fb849d794
5 changed files with 134 additions and 99 deletions

View File

@@ -1498,18 +1498,15 @@ class GraphicsJSR223Delegate(private val vm: VM) {
*/
private fun extractFieldFromProgressive(progressiveAddr: Long, fieldAddr: Long, width: Int, height: Int,
fieldParity: Int, addrIncVec: Int) {
assert(addrIncVec == 1)
val fieldHeight = height / 2
for (y in 0 until fieldHeight) {
val progressiveY = y * 2 + fieldParity // Extract even (0) or odd (1) lines
val progressiveOffset = (progressiveY * width) * 3
val fieldOffset = (y * width) * 3
for (x in 0 until width) {
for (c in 0..2) {
val pixel = vm.peek(progressiveAddr + (progressiveOffset + x * 3 + c) * addrIncVec)!!
vm.poke(fieldAddr + (fieldOffset + x * 3 + c) * addrIncVec, pixel)
}
}
vm.memcpy(progressiveAddr.toInt() + progressiveOffset, fieldAddr.toInt() + fieldOffset, width * 3)
}
}
@@ -1528,79 +1525,106 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val fieldOffset = (y * width + x) * 3
val outputOffset = ((y * 2 + fieldParity) * width + x) * 3
// Copy current field lines directly (no interpolation needed)
for (c in 0..2) {
val pixelValue = vm.peek(fieldRGBAddr + (fieldOffset + c) * fieldIncVec)!!
vm.poke(outputRGBAddr + (outputOffset + c) * outputIncVec, pixelValue)
}
// Copy current field lines directly (no interpolation needed) with loop unrolling
vm.poke(outputRGBAddr + (outputOffset + 0) * outputIncVec, vm.peek(fieldRGBAddr + (fieldOffset + 0) * fieldIncVec)!!)
vm.poke(outputRGBAddr + (outputOffset + 1) * outputIncVec, vm.peek(fieldRGBAddr + (fieldOffset + 1) * fieldIncVec)!!)
vm.poke(outputRGBAddr + (outputOffset + 2) * outputIncVec, vm.peek(fieldRGBAddr + (fieldOffset + 2) * fieldIncVec)!!)
// Interpolate missing lines using Yadif algorithm
// Even field (0,2,4...) interpolates odd lines (1,3,5...)
// Odd field (1,3,5...) interpolates even lines (2,4,6...) - skip line 0!
if (y > 0 && y < fieldHeight - 1) {
val interpOutputOffset = ((y * 2 + 1 - fieldParity) * width + x) * 3
val interpLine = if (fieldParity == 0) {
y * 2 + 1 // Even field: interpolate odd progressive lines (1,3,5...)
} else {
y * 2 + 2 // Odd field: interpolate even progressive lines (2,4,6...)
}
// Skip interpolation if the line would be out of bounds
if (interpLine < height) {
val interpOutputOffset = (interpLine * width + x) * 3
for (c in 0..2) {
// Get spatial neighbors
val above = vm.peek(fieldRGBAddr + (fieldOffset - width * 3 + c) * fieldIncVec)!!.toInt() and 0xFF
val below = vm.peek(fieldRGBAddr + (fieldOffset + width * 3 + c) * fieldIncVec)!!.toInt() and 0xFF
val current = vm.peek(fieldRGBAddr + (fieldOffset + c) * fieldIncVec)!!.toInt() and 0xFF
// Spatial interpolation
val spatialInterp = (above + below) / 2
// Temporal prediction using previous and next fields
var temporalPred = spatialInterp
if (prevFieldAddr != 0L && nextFieldAddr != 0L) {
// Get temporal neighbors from same spatial position
val prevPixel = (vm.peek(prevFieldAddr + (fieldOffset + c) * fieldIncVec)?.toInt() ?: current) and 0xFF
val nextPixel = (vm.peek(nextFieldAddr + (fieldOffset + c) * fieldIncVec)?.toInt() ?: current) and 0xFF
// Simple temporal interpolation
val tempInterp = (prevPixel + nextPixel) / 2
// Yadif edge-directed temporal-spatial decision
val spatialDiff = kotlin.math.abs(above - below)
val temporalDiff = kotlin.math.abs(prevPixel - nextPixel)
// Choose between spatial and temporal prediction based on local characteristics
temporalPred = when {
spatialDiff < 32 && temporalDiff < 32 -> {
// Low spatial and temporal variation: blend all
(spatialInterp + tempInterp + current) / 3
}
spatialDiff < temporalDiff -> {
// Prefer spatial interpolation
(spatialInterp * 3 + tempInterp) / 4
}
else -> {
// Prefer temporal interpolation
(tempInterp * 3 + spatialInterp) / 4
for (c in 0..2) {
// Get spatial neighbors
val above = vm.peek(fieldRGBAddr + (fieldOffset - width * 3 + c) * fieldIncVec)!!.toInt() and 0xFF
val below = vm.peek(fieldRGBAddr + (fieldOffset + width * 3 + c) * fieldIncVec)!!.toInt() and 0xFF
val current = vm.peek(fieldRGBAddr + (fieldOffset + c) * fieldIncVec)!!.toInt() and 0xFF
// Spatial interpolation
val spatialInterp = (above + below) / 2
// Temporal prediction using previous and next fields
var temporalPred = spatialInterp
if (prevFieldAddr != 0L && nextFieldAddr != 0L) {
// Get temporal neighbors from same spatial position
val prevPixel = (vm.peek(prevFieldAddr + (fieldOffset + c) * fieldIncVec)?.toInt() ?: current) and 0xFF
val nextPixel = (vm.peek(nextFieldAddr + (fieldOffset + c) * fieldIncVec)?.toInt() ?: current) and 0xFF
// Simple temporal interpolation
val tempInterp = (prevPixel + nextPixel) / 2
// Yadif edge-directed temporal-spatial decision
val spatialDiff = kotlin.math.abs(above - below)
val temporalDiff = kotlin.math.abs(prevPixel - nextPixel)
// Choose between spatial and temporal prediction based on local characteristics
temporalPred = when {
spatialDiff < 32 && temporalDiff < 32 -> {
// Low spatial and temporal variation: blend all
(spatialInterp + tempInterp + current) / 3
}
spatialDiff < temporalDiff -> {
// Prefer spatial interpolation
(spatialInterp * 3 + tempInterp) / 4
}
else -> {
// Prefer temporal interpolation
(tempInterp * 3 + spatialInterp) / 4
}
}
}
// Final edge-directed filtering
val finalValue = if (kotlin.math.abs(above - below) < 16) {
(current + temporalPred) / 2 // Very low edge activity: blend with current
} else {
temporalPred // Higher edge activity: use prediction
}
vm.poke(outputRGBAddr + (interpOutputOffset + c) * outputIncVec,
finalValue.coerceIn(0, 255).toByte())
}
// Final edge-directed filtering
val finalValue = if (kotlin.math.abs(above - below) < 16) {
(current + temporalPred) / 2 // Very low edge activity: blend with current
} else {
temporalPred // Higher edge activity: use prediction
}
vm.poke(outputRGBAddr + (interpOutputOffset + c) * outputIncVec,
finalValue.coerceIn(0, 255).toByte())
}
}
}
}
// Handle edge cases: first and last interpolated lines use simple spatial interpolation
for (x in 0 until width) {
val interpY = if (fieldParity == 0) 1 else 0
val outputOffset = (interpY * width + x) * 3
val referenceOffset = ((interpY + 1) * width + x) * 3
for (c in 0..2) {
val refPixel = vm.peek(outputRGBAddr + (referenceOffset + c) * outputIncVec)!!
vm.poke(outputRGBAddr + (outputOffset + c) * outputIncVec, refPixel)
// Handle edge cases: interpolate first missing line for each field
// Even field: interpolate line 1 (first odd line)
// Odd field: interpolate line 0 using simple duplication (since no spatial neighbors exist)
if (fieldParity == 0) {
// Even field: interpolate line 1 using line 0 and 2
for (x in 0 until width) {
val outputOffset = (1 * width + x) * 3
val ref0Offset = (0 * width + x) * 3 // Line 0
val ref2Offset = (2 * width + x) * 3 // Line 2
for (c in 0..2) {
val pixel0 = vm.peek(outputRGBAddr + (ref0Offset + c) * outputIncVec)!!.toInt() and 0xFF
val pixel2 = vm.peek(outputRGBAddr + (ref2Offset + c) * outputIncVec)!!.toInt() and 0xFF
val interpValue = (pixel0 + pixel2) / 2
vm.poke(outputRGBAddr + (outputOffset + c) * outputIncVec, interpValue.toByte())
}
}
} else {
// Odd field: interpolate line 0 by duplicating line 1
for (x in 0 until width) {
val outputOffset = (0 * width + x) * 3
val ref1Offset = (1 * width + x) * 3 // Line 1 (first odd line)
for (c in 0..2) {
val refPixel = vm.peek(outputRGBAddr + (ref1Offset + c) * outputIncVec)!!
vm.poke(outputRGBAddr + (outputOffset + c) * outputIncVec, refPixel)
}
}
}
}
@@ -1829,9 +1853,8 @@ class GraphicsJSR223Delegate(private val vm: VM) {
// height doesn't change when interlaced, because that's the encoder's output
// For interlaced mode, decode to half-height field first
val decodingHeight = if (isInterlaced) height / 2 else height
val blocksX = (width + 15) / 16 // 16x16 blocks now
val blocksY = (decodingHeight + 15) / 16
val blocksY = (height + 15) / 16
val quantYmult = jpeg_quality_to_mult(qualityIndices[0])
val quantCOmult = jpeg_quality_to_mult(qualityIndices[1])
@@ -1872,7 +1895,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
when (mode) {
0x00 -> { // TEV_MODE_SKIP - copy RGB from previous frame (optimized with memcpy)
// Check if we can copy the entire block at once (no clipping)
if (startX + 16 <= width && startY + 16 <= decodingHeight) {
if (startX + 16 <= width && startY + 16 <= height) {
// Optimized case: copy entire 16x16 block with row-by-row memcpy
for (dy in 0 until 16) {
val srcRowOffset = ((startY + dy).toLong() * width + startX) * 3
@@ -1889,7 +1912,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
for (dx in 0 until 16) {
val x = startX + dx
val y = startY + dy
if (x < width && y < decodingHeight) {
if (x < width && y < height) {
val pixelOffset = y.toLong() * width + x
val rgbOffset = pixelOffset * 3
@@ -1919,7 +1942,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val refX = x + mvX
val refY = y + mvY
if (x < width && y < decodingHeight) {
if (x < width && y < height) {
val dstPixelOffset = y.toLong() * width + x
val dstRgbOffset = dstPixelOffset * 3
@@ -1939,7 +1962,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val refStartY = startY + mvY
// Check if entire 16x16 block can be copied with memcpy (no bounds issues)
if (startX + 16 <= width && startY + 16 <= decodingHeight &&
if (startX + 16 <= width && startY + 16 <= height &&
refStartX >= 0 && refStartY >= 0 && refStartX + 16 <= width && refStartY + 16 <= height) {
// Optimized case: copy entire 16x16 block with row-by-row memcpy
@@ -1961,16 +1984,16 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val refX = x + mvX
val refY = y + mvY
if (x < width && y < decodingHeight) {
if (x < width && y < height) {
val dstPixelOffset = y.toLong() * width + x
val dstRgbOffset = dstPixelOffset * 3
if (refX >= 0 && refY >= 0 && refX < width && refY < decodingHeight) {
if (refX >= 0 && refY >= 0 && refX < width && refY < height) {
val refPixelOffset = refY.toLong() * width + refX
val refRgbOffset = refPixelOffset * 3
// Additional safety: ensure RGB offset is within valid range
val maxValidOffset = (width * decodingHeight - 1) * 3L + 2
val maxValidOffset = (width * height - 1) * 3L + 2
if (refRgbOffset >= 0 && refRgbOffset <= maxValidOffset) {
// Copy RGB from reference position
val refR = vm.peek(prevRGBAddr + refRgbOffset*prevAddrIncVec)!!
@@ -2026,7 +2049,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
for (dx in 0 until 16) {
val x = startX + dx
val y = startY + dy
if (x < width && y < decodingHeight) {
if (x < width && y < height) {
val rgbIdx = (dy * 16 + dx) * 3
val imageOffset = y.toLong() * width + x
val bufferOffset = imageOffset * 3
@@ -2066,10 +2089,10 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val refY = y + mvY
val pixelIdx = dy * 16 + dx
if (x < width && y < decodingHeight) {
if (x < width && y < height) {
var mcY: Int
if (refX >= 0 && refY >= 0 && refX < width && refY < decodingHeight) {
if (refX >= 0 && refY >= 0 && refX < width && refY < height) {
// Get motion-compensated RGB from previous frame
val refPixelOffset = refY.toLong() * width + refX
val refRgbOffset = refPixelOffset * 3
@@ -2106,12 +2129,12 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val refY = y + mvY
val chromaIdx = cy * 8 + cx
if (x < width && y < decodingHeight) {
if (x < width && y < height) {
var mcCo: Int
var mcCg: Int
// Sample 2x2 block from motion-compensated position for chroma
if (refX >= 0 && refY >= 0 && refX < width - 1 && refY < decodingHeight - 1) {
if (refX >= 0 && refY >= 0 && refX < width - 1 && refY < height - 1) {
var coSum = 0
var cgSum = 0
var count = 0
@@ -2121,7 +2144,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
for (dx in 0 until 2) {
val sampleX = refX + dx
val sampleY = refY + dy
if (sampleX < width && sampleY < decodingHeight) {
if (sampleX < width && sampleY < height) {
val refPixelOffset = sampleY.toLong() * width + sampleX
val refRgbOffset = refPixelOffset * 3
@@ -2167,7 +2190,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
for (dx in 0 until 16) {
val x = startX + dx
val y = startY + dy
if (x < width && y < decodingHeight) {
if (x < width && y < height) {
val imageOffset = y.toLong() * width + x
val bufferOffset = imageOffset * 3
@@ -2202,7 +2225,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
for (dx in 0 until 16) {
val x = startX + dx
val y = startY + dy
if (x < width && y < decodingHeight) {
if (x < width && y < height) {
val imageOffset = y.toLong() * width + x
val bufferOffset = imageOffset * 3
@@ -2224,8 +2247,8 @@ class GraphicsJSR223Delegate(private val vm: VM) {
// require(prevFieldBuffer != 0L) { "prevFieldBuffer must be provided for interlaced decoding" }
// Copy the decoded field to temporary buffer
vm.memcpy(currentRGBAddr.toInt(), tempFieldBuffer.toInt(), width * decodingHeight * 3)
vm.memcpy(currentRGBAddr.toInt(), tempFieldBuffer.toInt(), width * height * 3)
// Apply Yadif deinterlacing: field -> progressive frame
// For temporal prediction, we need proper field management
val fieldParity = frameCounter % 2
@@ -2233,14 +2256,14 @@ class GraphicsJSR223Delegate(private val vm: VM) {
// Extract the corresponding field from the previous progressive frame
// Even field lines: y = 0, 2, 4, 6...
// Odd field lines: y = 1, 3, 5, 7...
extractFieldFromProgressive(prevRGBAddr, prevFieldBuffer, width, height, fieldParity, thisAddrIncVec)
extractFieldFromProgressive(prevRGBAddr, prevFieldBuffer, width, height * 2, fieldParity, thisAddrIncVec)
prevFieldBuffer
} else {
0L
}
yadifDeinterlace(
tempFieldBuffer, currentRGBAddr, width, height,
tempFieldBuffer, currentRGBAddr, width, height * 2,
prevFieldAddr, 0L, // Use previous field, no next field available
fieldParity,
thisAddrIncVec, thisAddrIncVec

View File

@@ -541,6 +541,14 @@ class VM(
}
}
fun memset(dest: Int, ch: Int, count: Int): Int {
val incVec = if (dest >= 0) 1L else -1L
for (i in 0 until count) {
poke(dest + count*incVec, ch.toByte())
}
return dest
}
fun bulkPeekShort(from: Int, to: ShortArray, sizeInBytes: Int) {
if (from !in 0..8*1024*1024) throw IllegalArgumentException()
UnsafeHelper.memcpyRaw(null, usermem.ptr + from, to, UnsafeHelper.getArrayOffset(to), sizeInBytes.toLong())

View File

@@ -98,13 +98,7 @@ class VMJSR223Delegate(private val vm: VM) {
fun nanoTime() = System.nanoTime()
fun malloc(size: Int) = vm.malloc(size)
fun memset(dest: Int, ch: Int, count: Int): Int {
val incVec = if (dest >= 0) 1 else -1
for (i in 0 until count) {
poke(dest + count*incVec, ch)
}
return dest
}
fun memset(dest: Int, ch: Int, count: Int) = vm.memset(dest, ch, count)
fun free(ptr: Int) = vm.free(ptr)
fun forceAlloc(ptr: Int, size: Int) = vm.forceAlloc(ptr, size)
fun memcpy(from: Int, to: Int, len: Int) {