interlacing optimisation with more memcpy

This commit is contained in:
minjaesong
2025-09-02 19:47:03 +09:00
parent b8311685d7
commit 4fb849d794
5 changed files with 134 additions and 99 deletions

View File

@@ -582,7 +582,8 @@ try {
// Hardware-accelerated TEV decoding to RGB buffers (YCoCg-R or XYB based on version)
try {
let decodeStart = sys.nanoTime()
graphics.tevDecode(blockDataPtr, CURRENT_RGB_ADDR, PREV_RGB_ADDR, width, height, [qualityY, qualityCo, qualityCg], frameCount, debugMotionVectors, version, isInterlaced, TEMP_FIELD_BUFFER, PREV_FIELD_BUFFER)
let decodingHeight = isInterlaced ? (height / 2)|0 : height
graphics.tevDecode(blockDataPtr, CURRENT_RGB_ADDR, PREV_RGB_ADDR, width, decodingHeight, [qualityY, qualityCo, qualityCg], frameCount, debugMotionVectors, version, isInterlaced, TEMP_FIELD_BUFFER, PREV_FIELD_BUFFER)
decodeTime = (sys.nanoTime() - decodeStart) / 1000000.0 // Convert to milliseconds
// Upload RGB buffer to display framebuffer with dithering

View File

@@ -1498,18 +1498,15 @@ class GraphicsJSR223Delegate(private val vm: VM) {
*/
private fun extractFieldFromProgressive(progressiveAddr: Long, fieldAddr: Long, width: Int, height: Int,
                                        fieldParity: Int, addrIncVec: Int) {
    // The row copy below adds offsets in the forward direction only, so it is
    // valid solely for an address increment of +1.
    assert(addrIncVec == 1)

    val rowBytes = width * 3 // three bytes are stored per pixel
    val fieldHeight = height / 2

    for (y in 0 until fieldHeight) {
        val progressiveY = y * 2 + fieldParity // Extract even (0) or odd (1) lines
        val progressiveOffset = progressiveY * rowBytes
        val fieldOffset = y * rowBytes

        // One bulk copy per row; a separate per-byte peek/poke pass over the same
        // bytes would only duplicate this work.
        vm.memcpy(progressiveAddr.toInt() + progressiveOffset, fieldAddr.toInt() + fieldOffset, rowBytes)
    }
}
@@ -1528,79 +1525,106 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val fieldOffset = (y * width + x) * 3
val outputOffset = ((y * 2 + fieldParity) * width + x) * 3
// Copy current field lines directly (no interpolation needed)
for (c in 0..2) {
val pixelValue = vm.peek(fieldRGBAddr + (fieldOffset + c) * fieldIncVec)!!
vm.poke(outputRGBAddr + (outputOffset + c) * outputIncVec, pixelValue)
}
// Copy current field lines directly (no interpolation needed) with loop unrolling
vm.poke(outputRGBAddr + (outputOffset + 0) * outputIncVec, vm.peek(fieldRGBAddr + (fieldOffset + 0) * fieldIncVec)!!)
vm.poke(outputRGBAddr + (outputOffset + 1) * outputIncVec, vm.peek(fieldRGBAddr + (fieldOffset + 1) * fieldIncVec)!!)
vm.poke(outputRGBAddr + (outputOffset + 2) * outputIncVec, vm.peek(fieldRGBAddr + (fieldOffset + 2) * fieldIncVec)!!)
// Interpolate missing lines using Yadif algorithm
// Even field (0,2,4...) interpolates odd lines (1,3,5...)
// Odd field (1,3,5...) interpolates even lines (2,4,6...) - skip line 0!
if (y > 0 && y < fieldHeight - 1) {
val interpOutputOffset = ((y * 2 + 1 - fieldParity) * width + x) * 3
val interpLine = if (fieldParity == 0) {
y * 2 + 1 // Even field: interpolate odd progressive lines (1,3,5...)
} else {
y * 2 + 2 // Odd field: interpolate even progressive lines (2,4,6...)
}
// Skip interpolation if the line would be out of bounds
if (interpLine < height) {
val interpOutputOffset = (interpLine * width + x) * 3
for (c in 0..2) {
// Get spatial neighbors
val above = vm.peek(fieldRGBAddr + (fieldOffset - width * 3 + c) * fieldIncVec)!!.toInt() and 0xFF
val below = vm.peek(fieldRGBAddr + (fieldOffset + width * 3 + c) * fieldIncVec)!!.toInt() and 0xFF
val current = vm.peek(fieldRGBAddr + (fieldOffset + c) * fieldIncVec)!!.toInt() and 0xFF
// Spatial interpolation
val spatialInterp = (above + below) / 2
// Temporal prediction using previous and next fields
var temporalPred = spatialInterp
if (prevFieldAddr != 0L && nextFieldAddr != 0L) {
// Get temporal neighbors from same spatial position
val prevPixel = (vm.peek(prevFieldAddr + (fieldOffset + c) * fieldIncVec)?.toInt() ?: current) and 0xFF
val nextPixel = (vm.peek(nextFieldAddr + (fieldOffset + c) * fieldIncVec)?.toInt() ?: current) and 0xFF
// Simple temporal interpolation
val tempInterp = (prevPixel + nextPixel) / 2
// Yadif edge-directed temporal-spatial decision
val spatialDiff = kotlin.math.abs(above - below)
val temporalDiff = kotlin.math.abs(prevPixel - nextPixel)
// Choose between spatial and temporal prediction based on local characteristics
temporalPred = when {
spatialDiff < 32 && temporalDiff < 32 -> {
// Low spatial and temporal variation: blend all
(spatialInterp + tempInterp + current) / 3
}
spatialDiff < temporalDiff -> {
// Prefer spatial interpolation
(spatialInterp * 3 + tempInterp) / 4
}
else -> {
// Prefer temporal interpolation
(tempInterp * 3 + spatialInterp) / 4
for (c in 0..2) {
// Get spatial neighbors
val above = vm.peek(fieldRGBAddr + (fieldOffset - width * 3 + c) * fieldIncVec)!!.toInt() and 0xFF
val below = vm.peek(fieldRGBAddr + (fieldOffset + width * 3 + c) * fieldIncVec)!!.toInt() and 0xFF
val current = vm.peek(fieldRGBAddr + (fieldOffset + c) * fieldIncVec)!!.toInt() and 0xFF
// Spatial interpolation
val spatialInterp = (above + below) / 2
// Temporal prediction using previous and next fields
var temporalPred = spatialInterp
if (prevFieldAddr != 0L && nextFieldAddr != 0L) {
// Get temporal neighbors from same spatial position
val prevPixel = (vm.peek(prevFieldAddr + (fieldOffset + c) * fieldIncVec)?.toInt() ?: current) and 0xFF
val nextPixel = (vm.peek(nextFieldAddr + (fieldOffset + c) * fieldIncVec)?.toInt() ?: current) and 0xFF
// Simple temporal interpolation
val tempInterp = (prevPixel + nextPixel) / 2
// Yadif edge-directed temporal-spatial decision
val spatialDiff = kotlin.math.abs(above - below)
val temporalDiff = kotlin.math.abs(prevPixel - nextPixel)
// Choose between spatial and temporal prediction based on local characteristics
temporalPred = when {
spatialDiff < 32 && temporalDiff < 32 -> {
// Low spatial and temporal variation: blend all
(spatialInterp + tempInterp + current) / 3
}
spatialDiff < temporalDiff -> {
// Prefer spatial interpolation
(spatialInterp * 3 + tempInterp) / 4
}
else -> {
// Prefer temporal interpolation
(tempInterp * 3 + spatialInterp) / 4
}
}
}
// Final edge-directed filtering
val finalValue = if (kotlin.math.abs(above - below) < 16) {
(current + temporalPred) / 2 // Very low edge activity: blend with current
} else {
temporalPred // Higher edge activity: use prediction
}
vm.poke(outputRGBAddr + (interpOutputOffset + c) * outputIncVec,
finalValue.coerceIn(0, 255).toByte())
}
// Final edge-directed filtering
val finalValue = if (kotlin.math.abs(above - below) < 16) {
(current + temporalPred) / 2 // Very low edge activity: blend with current
} else {
temporalPred // Higher edge activity: use prediction
}
vm.poke(outputRGBAddr + (interpOutputOffset + c) * outputIncVec,
finalValue.coerceIn(0, 255).toByte())
}
}
}
}
// Handle edge cases: first and last interpolated lines use simple spatial interpolation
for (x in 0 until width) {
val interpY = if (fieldParity == 0) 1 else 0
val outputOffset = (interpY * width + x) * 3
val referenceOffset = ((interpY + 1) * width + x) * 3
for (c in 0..2) {
val refPixel = vm.peek(outputRGBAddr + (referenceOffset + c) * outputIncVec)!!
vm.poke(outputRGBAddr + (outputOffset + c) * outputIncVec, refPixel)
// Handle edge cases: interpolate first missing line for each field
// Even field: interpolate line 1 (first odd line)
// Odd field: interpolate line 0 using simple duplication (since no spatial neighbors exist)
if (fieldParity == 0) {
// Even field: interpolate line 1 using line 0 and 2
for (x in 0 until width) {
val outputOffset = (1 * width + x) * 3
val ref0Offset = (0 * width + x) * 3 // Line 0
val ref2Offset = (2 * width + x) * 3 // Line 2
for (c in 0..2) {
val pixel0 = vm.peek(outputRGBAddr + (ref0Offset + c) * outputIncVec)!!.toInt() and 0xFF
val pixel2 = vm.peek(outputRGBAddr + (ref2Offset + c) * outputIncVec)!!.toInt() and 0xFF
val interpValue = (pixel0 + pixel2) / 2
vm.poke(outputRGBAddr + (outputOffset + c) * outputIncVec, interpValue.toByte())
}
}
} else {
// Odd field: interpolate line 0 by duplicating line 1
for (x in 0 until width) {
val outputOffset = (0 * width + x) * 3
val ref1Offset = (1 * width + x) * 3 // Line 1 (first odd line)
for (c in 0..2) {
val refPixel = vm.peek(outputRGBAddr + (ref1Offset + c) * outputIncVec)!!
vm.poke(outputRGBAddr + (outputOffset + c) * outputIncVec, refPixel)
}
}
}
}
@@ -1829,9 +1853,8 @@ class GraphicsJSR223Delegate(private val vm: VM) {
// height doesn't change when interlaced, because that's the encoder's output
// For interlaced mode, decode to half-height field first
val decodingHeight = if (isInterlaced) height / 2 else height
val blocksX = (width + 15) / 16 // 16x16 blocks now
val blocksY = (decodingHeight + 15) / 16
val blocksY = (height + 15) / 16
val quantYmult = jpeg_quality_to_mult(qualityIndices[0])
val quantCOmult = jpeg_quality_to_mult(qualityIndices[1])
@@ -1872,7 +1895,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
when (mode) {
0x00 -> { // TEV_MODE_SKIP - copy RGB from previous frame (optimized with memcpy)
// Check if we can copy the entire block at once (no clipping)
if (startX + 16 <= width && startY + 16 <= decodingHeight) {
if (startX + 16 <= width && startY + 16 <= height) {
// Optimized case: copy entire 16x16 block with row-by-row memcpy
for (dy in 0 until 16) {
val srcRowOffset = ((startY + dy).toLong() * width + startX) * 3
@@ -1889,7 +1912,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
for (dx in 0 until 16) {
val x = startX + dx
val y = startY + dy
if (x < width && y < decodingHeight) {
if (x < width && y < height) {
val pixelOffset = y.toLong() * width + x
val rgbOffset = pixelOffset * 3
@@ -1919,7 +1942,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val refX = x + mvX
val refY = y + mvY
if (x < width && y < decodingHeight) {
if (x < width && y < height) {
val dstPixelOffset = y.toLong() * width + x
val dstRgbOffset = dstPixelOffset * 3
@@ -1939,7 +1962,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val refStartY = startY + mvY
// Check if entire 16x16 block can be copied with memcpy (no bounds issues)
if (startX + 16 <= width && startY + 16 <= decodingHeight &&
if (startX + 16 <= width && startY + 16 <= height &&
refStartX >= 0 && refStartY >= 0 && refStartX + 16 <= width && refStartY + 16 <= height) {
// Optimized case: copy entire 16x16 block with row-by-row memcpy
@@ -1961,16 +1984,16 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val refX = x + mvX
val refY = y + mvY
if (x < width && y < decodingHeight) {
if (x < width && y < height) {
val dstPixelOffset = y.toLong() * width + x
val dstRgbOffset = dstPixelOffset * 3
if (refX >= 0 && refY >= 0 && refX < width && refY < decodingHeight) {
if (refX >= 0 && refY >= 0 && refX < width && refY < height) {
val refPixelOffset = refY.toLong() * width + refX
val refRgbOffset = refPixelOffset * 3
// Additional safety: ensure RGB offset is within valid range
val maxValidOffset = (width * decodingHeight - 1) * 3L + 2
val maxValidOffset = (width * height - 1) * 3L + 2
if (refRgbOffset >= 0 && refRgbOffset <= maxValidOffset) {
// Copy RGB from reference position
val refR = vm.peek(prevRGBAddr + refRgbOffset*prevAddrIncVec)!!
@@ -2026,7 +2049,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
for (dx in 0 until 16) {
val x = startX + dx
val y = startY + dy
if (x < width && y < decodingHeight) {
if (x < width && y < height) {
val rgbIdx = (dy * 16 + dx) * 3
val imageOffset = y.toLong() * width + x
val bufferOffset = imageOffset * 3
@@ -2066,10 +2089,10 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val refY = y + mvY
val pixelIdx = dy * 16 + dx
if (x < width && y < decodingHeight) {
if (x < width && y < height) {
var mcY: Int
if (refX >= 0 && refY >= 0 && refX < width && refY < decodingHeight) {
if (refX >= 0 && refY >= 0 && refX < width && refY < height) {
// Get motion-compensated RGB from previous frame
val refPixelOffset = refY.toLong() * width + refX
val refRgbOffset = refPixelOffset * 3
@@ -2106,12 +2129,12 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val refY = y + mvY
val chromaIdx = cy * 8 + cx
if (x < width && y < decodingHeight) {
if (x < width && y < height) {
var mcCo: Int
var mcCg: Int
// Sample 2x2 block from motion-compensated position for chroma
if (refX >= 0 && refY >= 0 && refX < width - 1 && refY < decodingHeight - 1) {
if (refX >= 0 && refY >= 0 && refX < width - 1 && refY < height - 1) {
var coSum = 0
var cgSum = 0
var count = 0
@@ -2121,7 +2144,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
for (dx in 0 until 2) {
val sampleX = refX + dx
val sampleY = refY + dy
if (sampleX < width && sampleY < decodingHeight) {
if (sampleX < width && sampleY < height) {
val refPixelOffset = sampleY.toLong() * width + sampleX
val refRgbOffset = refPixelOffset * 3
@@ -2167,7 +2190,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
for (dx in 0 until 16) {
val x = startX + dx
val y = startY + dy
if (x < width && y < decodingHeight) {
if (x < width && y < height) {
val imageOffset = y.toLong() * width + x
val bufferOffset = imageOffset * 3
@@ -2202,7 +2225,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
for (dx in 0 until 16) {
val x = startX + dx
val y = startY + dy
if (x < width && y < decodingHeight) {
if (x < width && y < height) {
val imageOffset = y.toLong() * width + x
val bufferOffset = imageOffset * 3
@@ -2224,8 +2247,8 @@ class GraphicsJSR223Delegate(private val vm: VM) {
// require(prevFieldBuffer != 0L) { "prevFieldBuffer must be provided for interlaced decoding" }
// Copy the decoded field to temporary buffer
vm.memcpy(currentRGBAddr.toInt(), tempFieldBuffer.toInt(), width * decodingHeight * 3)
vm.memcpy(currentRGBAddr.toInt(), tempFieldBuffer.toInt(), width * height * 3)
// Apply Yadif deinterlacing: field -> progressive frame
// For temporal prediction, we need proper field management
val fieldParity = frameCounter % 2
@@ -2233,14 +2256,14 @@ class GraphicsJSR223Delegate(private val vm: VM) {
// Extract the corresponding field from the previous progressive frame
// Even field lines: y = 0, 2, 4, 6...
// Odd field lines: y = 1, 3, 5, 7...
extractFieldFromProgressive(prevRGBAddr, prevFieldBuffer, width, height, fieldParity, thisAddrIncVec)
extractFieldFromProgressive(prevRGBAddr, prevFieldBuffer, width, height * 2, fieldParity, thisAddrIncVec)
prevFieldBuffer
} else {
0L
}
yadifDeinterlace(
tempFieldBuffer, currentRGBAddr, width, height,
tempFieldBuffer, currentRGBAddr, width, height * 2,
prevFieldAddr, 0L, // Use previous field, no next field available
fieldParity,
thisAddrIncVec, thisAddrIncVec

View File

@@ -541,6 +541,14 @@ class VM(
}
}
/**
 * Writes the low byte of [ch] to [count] consecutive addresses beginning at [dest].
 *
 * Addresses are visited upwards (`dest`, `dest + 1`, ...) when [dest] is
 * non-negative and downwards (`dest`, `dest - 1`, ...) when it is negative.
 * A [count] of zero or less writes nothing.
 *
 * @param dest address of the first byte to write
 * @param ch value to store; it is narrowed with `toByte()`
 * @param count number of bytes to write
 * @return [dest], unchanged
 */
fun memset(dest: Int, ch: Int, count: Int): Int {
    val incVec = if (dest >= 0) 1L else -1L
    val fill = ch.toByte()
    for (i in 0 until count) {
        // Step by the loop counter so every byte of the range is written exactly once.
        poke(dest + i * incVec, fill)
    }
    return dest
}
fun bulkPeekShort(from: Int, to: ShortArray, sizeInBytes: Int) {
if (from !in 0..8*1024*1024) throw IllegalArgumentException()
UnsafeHelper.memcpyRaw(null, usermem.ptr + from, to, UnsafeHelper.getArrayOffset(to), sizeInBytes.toLong())

View File

@@ -98,13 +98,7 @@ class VMJSR223Delegate(private val vm: VM) {
/** Returns the current reading of the JVM's `System.nanoTime()` counter. */
fun nanoTime(): Long {
    return System.nanoTime()
}
// Forwards the request to vm.malloc and returns its result unchanged.
fun malloc(size: Int) = vm.malloc(size)
/**
 * Fills memory by delegating to `vm.memset`, so the fill logic lives in one place.
 *
 * @param dest address of the first byte to write
 * @param ch value to store
 * @param count number of bytes to write
 * @return whatever `vm.memset` returns for these arguments
 */
fun memset(dest: Int, ch: Int, count: Int): Int = vm.memset(dest, ch, count)
// Forwards the pointer to vm.free and returns its result unchanged.
fun free(ptr: Int) = vm.free(ptr)
// Forwards both arguments to vm.forceAlloc and returns its result unchanged.
fun forceAlloc(ptr: Int, size: Int) = vm.forceAlloc(ptr, size)
fun memcpy(from: Int, to: Int, len: Int) {

View File

@@ -1812,6 +1812,11 @@ static int start_video_conversion(tev_encoder_t *enc) {
} else {
if (enc->output_fps > 0 && enc->output_fps != enc->fps) {
// Frame rate conversion requested
// filtergraph path:
// 1. FPS conversion
// 2. scale and crop to requested size
// 3. tinterlace weave-overwrites even and odd fields together to produce intermediate video at half framerate, full height (we're losing half the information here -- and that's on purpose)
// 4. separatefields separates weave-overwritten frame as two consecutive frames, at half height. Since the frame rate is halved in Step 3. and being doubled here, the final framerate is identical to given framerate
snprintf(command, sizeof(command),
"ffmpeg -v error -i \"%s\" -f rawvideo -pix_fmt rgb24 "
"-vf \"fps=%d,scale=%d:%d:force_original_aspect_ratio=increase,crop=%d:%d,tinterlace=interleave_top:cvlpf,separatefields\" "
@@ -1819,6 +1824,10 @@ static int start_video_conversion(tev_encoder_t *enc) {
enc->input_file, enc->output_fps, enc->width, enc->height * 2, enc->width, enc->height * 2);
} else {
// No frame rate conversion
// filtergraph path:
// 1. scale and crop to requested size
// 2. tinterlace weave-overwrites even and odd fields together to produce intermediate video at half framerate, full height (we're losing half the information here -- and that's on purpose)
// 3. separatefields separates weave-overwritten frame as two consecutive frames, at half height. Since the frame rate is halved in Step 2. and being doubled here, the final framerate is identical to the original framerate
snprintf(command, sizeof(command),
"ffmpeg -v error -i \"%s\" -f rawvideo -pix_fmt rgb24 "
"-vf \"scale=%d:%d:force_original_aspect_ratio=increase,crop=%d:%d,tinterlace=interleave_top:cvlpf,separatefields\" "