interlacing optimisation with more memcpy

2026-06-06 05:28:31 +09:00 · 2025-09-02 19:47:03 +09:00
parent b8311685d7
commit 4fb849d794
5 changed files with 134 additions and 99 deletions
--- a/assets/disk0/tvdos/bin/playtev.js
+++ b/assets/disk0/tvdos/bin/playtev.js
@@ -582,7 +582,8 @@ try {
                // Hardware-accelerated TEV decoding to RGB buffers (YCoCg-R or XYB based on version)
                try {
                    let decodeStart = sys.nanoTime()
-                    graphics.tevDecode(blockDataPtr, CURRENT_RGB_ADDR, PREV_RGB_ADDR, width, height, [qualityY, qualityCo, qualityCg], frameCount, debugMotionVectors, version, isInterlaced, TEMP_FIELD_BUFFER, PREV_FIELD_BUFFER)
+                    let decodingHeight = isInterlaced ? (height / 2)|0 : height
+                    graphics.tevDecode(blockDataPtr, CURRENT_RGB_ADDR, PREV_RGB_ADDR, width, decodingHeight, [qualityY, qualityCo, qualityCg], frameCount, debugMotionVectors, version, isInterlaced, TEMP_FIELD_BUFFER, PREV_FIELD_BUFFER)
                    decodeTime = (sys.nanoTime() - decodeStart) / 1000000.0  // Convert to milliseconds

                    // Upload RGB buffer to display framebuffer with dithering
--- a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
+++ b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
@@ -1498,18 +1498,15 @@ class GraphicsJSR223Delegate(private val vm: VM) {
     */
    private fun extractFieldFromProgressive(progressiveAddr: Long, fieldAddr: Long, width: Int, height: Int, 
                                           fieldParity: Int, addrIncVec: Int) {
+        assert(addrIncVec == 1)
+
        val fieldHeight = height / 2
        for (y in 0 until fieldHeight) {
            val progressiveY = y * 2 + fieldParity // Extract even (0) or odd (1) lines
            val progressiveOffset = (progressiveY * width) * 3
            val fieldOffset = (y * width) * 3
-            
-            for (x in 0 until width) {
-                for (c in 0..2) {
-                    val pixel = vm.peek(progressiveAddr + (progressiveOffset + x * 3 + c) * addrIncVec)!!
-                    vm.poke(fieldAddr + (fieldOffset + x * 3 + c) * addrIncVec, pixel)
-                }
-            }
+
+            vm.memcpy(progressiveAddr.toInt() + progressiveOffset, fieldAddr.toInt() + fieldOffset, width * 3)
        }
    }
    
@@ -1528,79 +1525,106 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                val fieldOffset = (y * width + x) * 3
                val outputOffset = ((y * 2 + fieldParity) * width + x) * 3
                
-                // Copy current field lines directly (no interpolation needed)
-                for (c in 0..2) {
-                    val pixelValue = vm.peek(fieldRGBAddr + (fieldOffset + c) * fieldIncVec)!!
-                    vm.poke(outputRGBAddr + (outputOffset + c) * outputIncVec, pixelValue)
-                }
-                
+                // Copy current field lines directly (no interpolation needed) with loop unrolling
+                vm.poke(outputRGBAddr + (outputOffset + 0) * outputIncVec, vm.peek(fieldRGBAddr + (fieldOffset + 0) * fieldIncVec)!!)
+                vm.poke(outputRGBAddr + (outputOffset + 1) * outputIncVec, vm.peek(fieldRGBAddr + (fieldOffset + 1) * fieldIncVec)!!)
+                vm.poke(outputRGBAddr + (outputOffset + 2) * outputIncVec, vm.peek(fieldRGBAddr + (fieldOffset + 2) * fieldIncVec)!!)
+
                // Interpolate missing lines using Yadif algorithm
+                // Even field (0,2,4...) interpolates odd lines (1,3,5...)
+                // Odd field (1,3,5...) interpolates even lines (2,4,6...) - skip line 0!
                if (y > 0 && y < fieldHeight - 1) {
-                    val interpOutputOffset = ((y * 2 + 1 - fieldParity) * width + x) * 3
+                    val interpLine = if (fieldParity == 0) {
+                        y * 2 + 1  // Even field: interpolate odd progressive lines (1,3,5...)
+                    } else {
+                        y * 2 + 2  // Odd field: interpolate even progressive lines (2,4,6...)
+                    }
+                    // Skip interpolation if the line would be out of bounds
+                    if (interpLine < height) {
+                        val interpOutputOffset = (interpLine * width + x) * 3
                    
-                    for (c in 0..2) {
-                        // Get spatial neighbors
-                        val above = vm.peek(fieldRGBAddr + (fieldOffset - width * 3 + c) * fieldIncVec)!!.toInt() and 0xFF
-                        val below = vm.peek(fieldRGBAddr + (fieldOffset + width * 3 + c) * fieldIncVec)!!.toInt() and 0xFF
-                        val current = vm.peek(fieldRGBAddr + (fieldOffset + c) * fieldIncVec)!!.toInt() and 0xFF
-                        
-                        // Spatial interpolation
-                        val spatialInterp = (above + below) / 2
-                        
-                        // Temporal prediction using previous and next fields
-                        var temporalPred = spatialInterp
-                        if (prevFieldAddr != 0L && nextFieldAddr != 0L) {
-                            // Get temporal neighbors from same spatial position
-                            val prevPixel = (vm.peek(prevFieldAddr + (fieldOffset + c) * fieldIncVec)?.toInt() ?: current) and 0xFF
-                            val nextPixel = (vm.peek(nextFieldAddr + (fieldOffset + c) * fieldIncVec)?.toInt() ?: current) and 0xFF
-                            
-                            // Simple temporal interpolation
-                            val tempInterp = (prevPixel + nextPixel) / 2
-                            
-                            // Yadif edge-directed temporal-spatial decision
-                            val spatialDiff = kotlin.math.abs(above - below)
-                            val temporalDiff = kotlin.math.abs(prevPixel - nextPixel)
-                            
-                            // Choose between spatial and temporal prediction based on local characteristics
-                            temporalPred = when {
-                                spatialDiff < 32 && temporalDiff < 32 -> {
-                                    // Low spatial and temporal variation: blend all
-                                    (spatialInterp + tempInterp + current) / 3
-                                }
-                                spatialDiff < temporalDiff -> {
-                                    // Prefer spatial interpolation
-                                    (spatialInterp * 3 + tempInterp) / 4
-                                }
-                                else -> {
-                                    // Prefer temporal interpolation  
-                                    (tempInterp * 3 + spatialInterp) / 4
+                        for (c in 0..2) {
+                            // Get spatial neighbors
+                            val above = vm.peek(fieldRGBAddr + (fieldOffset - width * 3 + c) * fieldIncVec)!!.toInt() and 0xFF
+                            val below = vm.peek(fieldRGBAddr + (fieldOffset + width * 3 + c) * fieldIncVec)!!.toInt() and 0xFF
+                            val current = vm.peek(fieldRGBAddr + (fieldOffset + c) * fieldIncVec)!!.toInt() and 0xFF
+
+                            // Spatial interpolation
+                            val spatialInterp = (above + below) / 2
+
+                            // Temporal prediction using previous and next fields
+                            var temporalPred = spatialInterp
+                            if (prevFieldAddr != 0L && nextFieldAddr != 0L) {
+                                // Get temporal neighbors from same spatial position
+                                val prevPixel = (vm.peek(prevFieldAddr + (fieldOffset + c) * fieldIncVec)?.toInt() ?: current) and 0xFF
+                                val nextPixel = (vm.peek(nextFieldAddr + (fieldOffset + c) * fieldIncVec)?.toInt() ?: current) and 0xFF
+
+                                // Simple temporal interpolation
+                                val tempInterp = (prevPixel + nextPixel) / 2
+
+                                // Yadif edge-directed temporal-spatial decision
+                                val spatialDiff = kotlin.math.abs(above - below)
+                                val temporalDiff = kotlin.math.abs(prevPixel - nextPixel)
+
+                                // Choose between spatial and temporal prediction based on local characteristics
+                                temporalPred = when {
+                                    spatialDiff < 32 && temporalDiff < 32 -> {
+                                        // Low spatial and temporal variation: blend all
+                                        (spatialInterp + tempInterp + current) / 3
+                                    }
+                                    spatialDiff < temporalDiff -> {
+                                        // Prefer spatial interpolation
+                                        (spatialInterp * 3 + tempInterp) / 4
+                                    }
+                                    else -> {
+                                        // Prefer temporal interpolation
+                                        (tempInterp * 3 + spatialInterp) / 4
+                                    }
                                }
                            }
+
+                            // Final edge-directed filtering
+                            val finalValue = if (kotlin.math.abs(above - below) < 16) {
+                                (current + temporalPred) / 2  // Very low edge activity: blend with current
+                            } else {
+                                temporalPred  // Higher edge activity: use prediction
+                            }
+
+                            vm.poke(outputRGBAddr + (interpOutputOffset + c) * outputIncVec,
+                                   finalValue.coerceIn(0, 255).toByte())
                        }
-                        
-                        // Final edge-directed filtering
-                        val finalValue = if (kotlin.math.abs(above - below) < 16) {
-                            (current + temporalPred) / 2  // Very low edge activity: blend with current
-                        } else {
-                            temporalPred  // Higher edge activity: use prediction
-                        }
-                        
-                        vm.poke(outputRGBAddr + (interpOutputOffset + c) * outputIncVec, 
-                               finalValue.coerceIn(0, 255).toByte())
                    }
                }
            }
        }
-        
-        // Handle edge cases: first and last interpolated lines use simple spatial interpolation
-        for (x in 0 until width) {
-            val interpY = if (fieldParity == 0) 1 else 0
-            val outputOffset = (interpY * width + x) * 3
-            val referenceOffset = ((interpY + 1) * width + x) * 3
-            
-            for (c in 0..2) {
-                val refPixel = vm.peek(outputRGBAddr + (referenceOffset + c) * outputIncVec)!!
-                vm.poke(outputRGBAddr + (outputOffset + c) * outputIncVec, refPixel)
+
+        // Handle edge cases: interpolate first missing line for each field
+        // Even field: interpolate line 1 (first odd line)
+        // Odd field: interpolate line 0 using simple duplication (since no spatial neighbors exist)
+        if (fieldParity == 0) {
+            // Even field: interpolate line 1 using line 0 and 2
+            for (x in 0 until width) {
+                val outputOffset = (1 * width + x) * 3
+                val ref0Offset = (0 * width + x) * 3  // Line 0 
+                val ref2Offset = (2 * width + x) * 3  // Line 2
+                
+                for (c in 0..2) {
+                    val pixel0 = vm.peek(outputRGBAddr + (ref0Offset + c) * outputIncVec)!!.toInt() and 0xFF
+                    val pixel2 = vm.peek(outputRGBAddr + (ref2Offset + c) * outputIncVec)!!.toInt() and 0xFF
+                    val interpValue = (pixel0 + pixel2) / 2
+                    vm.poke(outputRGBAddr + (outputOffset + c) * outputIncVec, interpValue.toByte())
+                }
+            }
+        } else {
+            // Odd field: interpolate line 0 by duplicating line 1
+            for (x in 0 until width) {
+                val outputOffset = (0 * width + x) * 3
+                val ref1Offset = (1 * width + x) * 3  // Line 1 (first odd line)
+                
+                for (c in 0..2) {
+                    val refPixel = vm.peek(outputRGBAddr + (ref1Offset + c) * outputIncVec)!!
+                    vm.poke(outputRGBAddr + (outputOffset + c) * outputIncVec, refPixel)
+                }
            }
        }
    }
@@ -1829,9 +1853,8 @@ class GraphicsJSR223Delegate(private val vm: VM) {
        // height doesn't change when interlaced, because that's the encoder's output

        // For interlaced mode, decode to half-height field first
-        val decodingHeight = if (isInterlaced) height / 2 else height
        val blocksX = (width + 15) / 16  // 16x16 blocks now
-        val blocksY = (decodingHeight + 15) / 16
+        val blocksY = (height + 15) / 16

        val quantYmult = jpeg_quality_to_mult(qualityIndices[0])
        val quantCOmult = jpeg_quality_to_mult(qualityIndices[1])
@@ -1872,7 +1895,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                when (mode) {
                    0x00 -> { // TEV_MODE_SKIP - copy RGB from previous frame (optimized with memcpy)
                        // Check if we can copy the entire block at once (no clipping)
-                        if (startX + 16 <= width && startY + 16 <= decodingHeight) {
+                        if (startX + 16 <= width && startY + 16 <= height) {
                            // Optimized case: copy entire 16x16 block with row-by-row memcpy
                            for (dy in 0 until 16) {
                                val srcRowOffset = ((startY + dy).toLong() * width + startX) * 3
@@ -1889,7 +1912,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                                for (dx in 0 until 16) {
                                    val x = startX + dx
                                    val y = startY + dy
-                                    if (x < width && y < decodingHeight) {
+                                    if (x < width && y < height) {
                                        val pixelOffset = y.toLong() * width + x
                                        val rgbOffset = pixelOffset * 3
                                        
@@ -1919,7 +1942,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                                    val refX = x + mvX
                                    val refY = y + mvY
                                    
-                                    if (x < width && y < decodingHeight) {
+                                    if (x < width && y < height) {
                                        val dstPixelOffset = y.toLong() * width + x
                                        val dstRgbOffset = dstPixelOffset * 3
                                        
@@ -1939,7 +1962,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                            val refStartY = startY + mvY
                            
                            // Check if entire 16x16 block can be copied with memcpy (no bounds issues)
-                            if (startX + 16 <= width && startY + 16 <= decodingHeight &&
+                            if (startX + 16 <= width && startY + 16 <= height &&
                                refStartX >= 0 && refStartY >= 0 && refStartX + 16 <= width && refStartY + 16 <= height) {
                                
                                // Optimized case: copy entire 16x16 block with row-by-row memcpy
@@ -1961,16 +1984,16 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                                        val refX = x + mvX
                                        val refY = y + mvY
                                        
-                                        if (x < width && y < decodingHeight) {
+                                        if (x < width && y < height) {
                                            val dstPixelOffset = y.toLong() * width + x
                                            val dstRgbOffset = dstPixelOffset * 3
                                            
-                                            if (refX >= 0 && refY >= 0 && refX < width && refY < decodingHeight) {
+                                            if (refX >= 0 && refY >= 0 && refX < width && refY < height) {
                                                val refPixelOffset = refY.toLong() * width + refX
                                                val refRgbOffset = refPixelOffset * 3
                                                
                                                // Additional safety: ensure RGB offset is within valid range
-                                                val maxValidOffset = (width * decodingHeight - 1) * 3L + 2
+                                                val maxValidOffset = (width * height - 1) * 3L + 2
                                                if (refRgbOffset >= 0 && refRgbOffset <= maxValidOffset) {
                                                    // Copy RGB from reference position
                                                    val refR = vm.peek(prevRGBAddr + refRgbOffset*prevAddrIncVec)!!
@@ -2026,7 +2049,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                            for (dx in 0 until 16) {
                                val x = startX + dx
                                val y = startY + dy
-                                if (x < width && y < decodingHeight) {
+                                if (x < width && y < height) {
                                    val rgbIdx = (dy * 16 + dx) * 3
                                    val imageOffset = y.toLong() * width + x
                                    val bufferOffset = imageOffset * 3
@@ -2066,10 +2089,10 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                                val refY = y + mvY
                                val pixelIdx = dy * 16 + dx
                                
-                                if (x < width && y < decodingHeight) {
+                                if (x < width && y < height) {
                                    var mcY: Int
                                    
-                                    if (refX >= 0 && refY >= 0 && refX < width && refY < decodingHeight) {
+                                    if (refX >= 0 && refY >= 0 && refX < width && refY < height) {
                                        // Get motion-compensated RGB from previous frame
                                        val refPixelOffset = refY.toLong() * width + refX
                                        val refRgbOffset = refPixelOffset * 3
@@ -2106,12 +2129,12 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                                val refY = y + mvY
                                val chromaIdx = cy * 8 + cx
                                
-                                if (x < width && y < decodingHeight) {
+                                if (x < width && y < height) {
                                    var mcCo: Int
                                    var mcCg: Int
                                    
                                    // Sample 2x2 block from motion-compensated position for chroma
-                                    if (refX >= 0 && refY >= 0 && refX < width - 1 && refY < decodingHeight - 1) {
+                                    if (refX >= 0 && refY >= 0 && refX < width - 1 && refY < height - 1) {
                                        var coSum = 0
                                        var cgSum = 0
                                        var count = 0
@@ -2121,7 +2144,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                                            for (dx in 0 until 2) {
                                                val sampleX = refX + dx
                                                val sampleY = refY + dy
-                                                if (sampleX < width && sampleY < decodingHeight) {
+                                                if (sampleX < width && sampleY < height) {
                                                    val refPixelOffset = sampleY.toLong() * width + sampleX
                                                    val refRgbOffset = refPixelOffset * 3
                                                    
@@ -2167,7 +2190,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                            for (dx in 0 until 16) {
                                val x = startX + dx
                                val y = startY + dy
-                                if (x < width && y < decodingHeight) {
+                                if (x < width && y < height) {
                                    val imageOffset = y.toLong() * width + x
                                    val bufferOffset = imageOffset * 3
                                    
@@ -2202,7 +2225,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                            for (dx in 0 until 16) {
                                val x = startX + dx
                                val y = startY + dy
-                                if (x < width && y < decodingHeight) {
+                                if (x < width && y < height) {
                                    val imageOffset = y.toLong() * width + x
                                    val bufferOffset = imageOffset * 3
                                    
@@ -2224,8 +2247,8 @@ class GraphicsJSR223Delegate(private val vm: VM) {
 //            require(prevFieldBuffer != 0L) { "prevFieldBuffer must be provided for interlaced decoding" }
            
            // Copy the decoded field to temporary buffer
-            vm.memcpy(currentRGBAddr.toInt(), tempFieldBuffer.toInt(), width * decodingHeight * 3)
-            
+            vm.memcpy(currentRGBAddr.toInt(), tempFieldBuffer.toInt(), width * height * 3)
+
            // Apply Yadif deinterlacing: field -> progressive frame
            // For temporal prediction, we need proper field management
            val fieldParity = frameCounter % 2
@@ -2233,14 +2256,14 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                // Extract the corresponding field from the previous progressive frame
                // Even field lines: y = 0, 2, 4, 6...  
                // Odd field lines:  y = 1, 3, 5, 7...
-                extractFieldFromProgressive(prevRGBAddr, prevFieldBuffer, width, height, fieldParity, thisAddrIncVec)
+                extractFieldFromProgressive(prevRGBAddr, prevFieldBuffer, width, height * 2, fieldParity, thisAddrIncVec)
                prevFieldBuffer
            } else {
                0L
            }
            
            yadifDeinterlace(
-                tempFieldBuffer, currentRGBAddr, width, height,
+                tempFieldBuffer, currentRGBAddr, width, height * 2,
                prevFieldAddr, 0L, // Use previous field, no next field available
                fieldParity, 
                thisAddrIncVec, thisAddrIncVec
--- a/tsvm_core/src/net/torvald/tsvm/VM.kt
+++ b/tsvm_core/src/net/torvald/tsvm/VM.kt
@@ -541,6 +541,14 @@ class VM(
        }
    }

+    fun memset(dest: Int, ch: Int, count: Int): Int {
+        val incVec = if (dest >= 0) 1L else -1L
+        for (i in 0 until count) {
+            poke(dest + count*incVec, ch.toByte())
+        }
+        return dest
+    }
+
    fun bulkPeekShort(from: Int, to: ShortArray, sizeInBytes: Int) {
        if (from !in 0..8*1024*1024) throw IllegalArgumentException()
        UnsafeHelper.memcpyRaw(null, usermem.ptr + from, to, UnsafeHelper.getArrayOffset(to), sizeInBytes.toLong())
--- a/tsvm_core/src/net/torvald/tsvm/VMJSR223Delegate.kt
+++ b/tsvm_core/src/net/torvald/tsvm/VMJSR223Delegate.kt
@@ -98,13 +98,7 @@ class VMJSR223Delegate(private val vm: VM) {
    
    fun nanoTime() = System.nanoTime()
    fun malloc(size: Int) = vm.malloc(size)
-    fun memset(dest: Int, ch: Int, count: Int): Int {
-        val incVec = if (dest >= 0) 1 else -1
-        for (i in 0 until count) {
-            poke(dest + count*incVec, ch)
-        }
-        return dest
-    }
+    fun memset(dest: Int, ch: Int, count: Int) = vm.memset(dest, ch, count)
    fun free(ptr: Int) = vm.free(ptr)
    fun forceAlloc(ptr: Int, size: Int) = vm.forceAlloc(ptr, size)
    fun memcpy(from: Int, to: Int, len: Int) {
--- a/video_encoder/encoder_tev.c
+++ b/video_encoder/encoder_tev.c
@@ -1812,6 +1812,11 @@ static int start_video_conversion(tev_encoder_t *enc) {
    } else {
        if (enc->output_fps > 0 && enc->output_fps != enc->fps) {
            // Frame rate conversion requested
+            // filtergraph path:
+            // 1. FPS conversion
+            // 2. scale and crop to requested size
+            // 3. tinterlace weave-overwrites even and odd fields together to produce intermediate video at half framerate, full height (we're losing half the information here -- and that's on purpose)
+            // 4. separatefields separates weave-overwritten frame as two consecutive frames, at half height. Since the frame rate is halved in Step 3. and being doubled here, the final framerate is identical to given framerate
            snprintf(command, sizeof(command),
                "ffmpeg -v error -i \"%s\" -f rawvideo -pix_fmt rgb24 "
                "-vf \"fps=%d,scale=%d:%d:force_original_aspect_ratio=increase,crop=%d:%d,tinterlace=interleave_top:cvlpf,separatefields\" "
@@ -1819,6 +1824,10 @@ static int start_video_conversion(tev_encoder_t *enc) {
                enc->input_file, enc->output_fps, enc->width, enc->height * 2, enc->width, enc->height * 2);
        } else {
            // No frame rate conversion
+            // filtergraph path:
+            // 1. scale and crop to requested size
+            // 2. tinterlace weave-overwrites even and odd fields together to produce intermediate video at half framerate, full height (we're losing half the information here -- and that's on purpose)
+            // 3. separatefields separates weave-overwritten frame as two consecutive frames, at half height. Since the frame rate is halved in Step 2. and being doubled here, the final framerate is identical to the original framerate
            snprintf(command, sizeof(command),
                "ffmpeg -v error -i \"%s\" -f rawvideo -pix_fmt rgb24 "
                "-vf \"scale=%d:%d:force_original_aspect_ratio=increase,crop=%d:%d,tinterlace=interleave_top:cvlpf,separatefields\" "