From 8bb111760b6497b9e050e407a9921446f84faa58 Mon Sep 17 00:00:00 2001
From: minjaesong <alswo9628@gmail.com>
Date: Tue, 19 Aug 2025 22:20:19 +0900
Subject: [PATCH] half-working INTER block

---
 assets/disk0/tvdos/bin/playtev.js             |  93 +-----
 .../torvald/tsvm/GraphicsJSR223Delegate.kt    | 288 +++++++++++++++---
 video_encoder/encoder_tev.c                   | 168 ++++++++--
 3 files changed, 397 insertions(+), 152 deletions(-)

diff --git a/assets/disk0/tvdos/bin/playtev.js b/assets/disk0/tvdos/bin/playtev.js
index 22a43a2..ddaa13c 100644
--- a/assets/disk0/tvdos/bin/playtev.js
+++ b/assets/disk0/tvdos/bin/playtev.js
@@ -24,80 +24,6 @@ const interactive = exec_args[2] && exec_args[2].toLowerCase() == "-i"
 const fullFilePath = _G.shell.resolvePathInput(exec_args[1])
 const FILE_LENGTH = files.open(fullFilePath.full).size
 
-// Quantization tables for Y channel (16x16 - just use first 8 quality levels)
-const QUANT_TABLES_Y = [
-    // Quality 0 (lowest) - 8x8 pattern repeated to 16x16
-    (() => {
-        const base = [80, 60, 50, 80, 120, 200, 255, 255,
-                     55, 60, 70, 95, 130, 255, 255, 255,
-                     70, 65, 80, 120, 200, 255, 255, 255,
-                     70, 85, 110, 145, 255, 255, 255, 255,
-                     90, 110, 185, 255, 255, 255, 255, 255,
-                     120, 175, 255, 255, 255, 255, 255, 255,
-                     245, 255, 255, 255, 255, 255, 255, 255,
-                     255, 255, 255, 255, 255, 255, 255, 255]
-        const extended = []
-        for (let y = 0; y < 16; y++) {
-            for (let x = 0; x < 16; x++) {
-                extended.push(base[(y % 8) * 8 + (x % 8)])
-            }
-        }
-        return extended
-    })(),
-    [40, 30, 25, 40, 60, 100, 128, 150, 28, 30, 35, 48, 65, 128, 150, 180], // Quality 1 (simplified)
-    [20, 15, 13, 20, 30, 50, 64, 75, 14, 15, 18, 24, 33, 64, 75, 90],       // Quality 2
-    [16, 12, 10, 16, 24, 40, 51, 60, 11, 12, 14, 19, 26, 51, 60, 72],       // Quality 3
-    [12, 9, 8, 12, 18, 30, 38, 45, 8, 9, 11, 14, 20, 38, 45, 54],           // Quality 4
-    [10, 7, 6, 10, 15, 25, 32, 38, 7, 7, 9, 12, 16, 32, 38, 45],            // Quality 5
-    [8, 6, 5, 8, 12, 20, 26, 30, 6, 6, 7, 10, 13, 26, 30, 36],             // Quality 6
-    // Quality 7 (highest)
-    (() => {
-        const base = [2, 1, 1, 2, 3, 5, 6, 7,
-                     1, 1, 1, 2, 3, 6, 7, 9,
-                     1, 1, 2, 3, 5, 6, 7, 9,
-                     1, 2, 3, 4, 6, 7, 9, 10,
-                     2, 3, 5, 6, 7, 9, 10, 11,
-                     3, 4, 6, 7, 9, 10, 11, 12,
-                     6, 6, 7, 9, 10, 11, 12, 13,
-                     6, 7, 9, 10, 11, 12, 13, 13]
-        const extended = []
-        for (let y = 0; y < 16; y++) {
-            for (let x = 0; x < 16; x++) {
-                extended.push(base[(y % 8) * 8 + (x % 8)])
-            }
-        }
-        return extended
-    })()
-]
-
-// Quantization tables for chroma channels (8x8)
-const QUANT_TABLES_C = [
-    // Quality 0 (lowest)
-    [120, 90, 75, 120, 180, 255, 255, 255,
-     83, 90, 105, 143, 195, 255, 255, 255,
-     105, 98, 120, 180, 255, 255, 255, 255,
-     105, 128, 165, 218, 255, 255, 255, 255,
-     135, 165, 278, 255, 255, 255, 255, 255,
-     180, 263, 255, 255, 255, 255, 255, 255,
-     255, 255, 255, 255, 255, 255, 255, 255,
-     255, 255, 255, 255, 255, 255, 255, 255],
-    [60, 45, 38, 60, 90, 150, 192, 225],       // Quality 1 (simplified)
-    [30, 23, 19, 30, 45, 75, 96, 113],         // Quality 2
-    [24, 18, 15, 24, 36, 60, 77, 90],          // Quality 3
-    [18, 14, 12, 18, 27, 45, 57, 68],          // Quality 4
-    [15, 11, 9, 15, 23, 38, 48, 57],           // Quality 5
-    [12, 9, 8, 12, 18, 30, 39, 45],            // Quality 6
-    // Quality 7 (highest)
-    [3, 2, 2, 3, 5, 8, 9, 11,
-     2, 2, 2, 3, 5, 9, 11, 14,
-     2, 2, 3, 5, 8, 9, 11, 14,
-     2, 3, 5, 6, 9, 11, 14, 15,
-     3, 5, 8, 9, 11, 14, 15, 17,
-     5, 6, 9, 11, 14, 15, 17, 18,
-     9, 9, 11, 14, 15, 17, 18, 20,
-     9, 11, 14, 15, 17, 18, 20, 20]
-]
-
 let videoRateBin = []
 let errorlevel = 0
 let notifHideTimer = 0
@@ -198,23 +124,12 @@ let ycocgWorkspace = sys.malloc(BLOCK_SIZE * BLOCK_SIZE * 3) // Y+Co+Cg workspac
 let dctWorkspace = sys.malloc(BLOCK_SIZE * BLOCK_SIZE * 4) // DCT coefficients (floats)
 
 // Initialize RGB frame buffers to black (0,0,0)
-for (let i = 0; i < FRAME_PIXELS; i++) {
-    // Current frame RGB: black
-    sys.poke(CURRENT_RGB_ADDR + i*3, 0)     // R
-    sys.poke(CURRENT_RGB_ADDR + i*3 + 1, 0) // G  
-    sys.poke(CURRENT_RGB_ADDR + i*3 + 2, 0) // B
-    
-    // Previous frame RGB: black
-    sys.poke(PREV_RGB_ADDR + i*3, 0)        // R
-    sys.poke(PREV_RGB_ADDR + i*3 + 1, 0)    // G
-    sys.poke(PREV_RGB_ADDR + i*3 + 2, 0)    // B
-}
+sys.memset(CURRENT_RGB_ADDR, 0, FRAME_PIXELS * 3)
+sys.memset(PREV_RGB_ADDR, 0, FRAME_PIXELS * 3)
 
 // Initialize display framebuffer to black
-for (let i = 0; i < FRAME_PIXELS; i++) {
-    sys.poke(DISPLAY_RG_ADDR - i, 0)  // Black in RG plane
-    sys.poke(DISPLAY_BA_ADDR - i, 15) // Black with alpha=15 (opaque) in BA plane
-}
+sys.memset(DISPLAY_RG_ADDR, 0, FRAME_PIXELS) // Black in RG plane. NOTE(review): the replaced loop poked DISPLAY_RG_ADDR - i (descending addresses); confirm sys.memset fills in that same direction here
+sys.memset(DISPLAY_BA_ADDR, 15, FRAME_PIXELS) // Black with alpha=15 (opaque) in BA plane; the replaced loop likewise poked DISPLAY_BA_ADDR - i
 
 let frameCount = 0
 let stopPlay = false
diff --git a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
index c993cfe..858e531 100644
--- a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
+++ b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
@@ -7,7 +7,9 @@ import net.torvald.terrarum.modulecomputers.virtualcomputer.tvd.toUint
 import net.torvald.tsvm.peripheral.GraphicsAdapter
 import net.torvald.tsvm.peripheral.fmod
 import kotlin.math.abs
+import kotlin.math.cos
 import kotlin.math.roundToInt
+import kotlin.math.sqrt
 
 class GraphicsJSR223Delegate(private val vm: VM) {
 
@@ -1548,19 +1550,18 @@ class GraphicsJSR223Delegate(private val vm: VM) {
         return round(15f * q)
     }
 
+    val dctBasis8 = Array(8) { u -> // 8-point DCT basis indexed [frequency u][sample x]; a class property, so tevIdct8x8 no longer rebuilds it per call
+        FloatArray(8) { x ->
+            val cu = if (u == 0) 1.0 / sqrt(2.0) else 1.0 // the u == 0 (DC) row is scaled by 1/sqrt(2)
+            (0.5 * cu * cos((2.0 * x + 1.0) * u * PI / 16.0)).toFloat() // computed in Double, stored as Float. NOTE(review): bare PI - this patch imports only cos and sqrt; confirm PI is already in scope
+        }
+    }
+
     /**
      * Perform IDCT on a single channel with integer coefficients
      */
     private fun tevIdct8x8(coeffs: IntArray, quantTable: IntArray): IntArray {
-        // Use the same DCT basis as tevIdct8x8
-        val dctBasis = Array(8) { u ->
-            Array(8) { x ->
-                val cu = if (u == 0) 1.0 / kotlin.math.sqrt(2.0) else 1.0
-                cu * kotlin.math.cos((2.0 * x + 1.0) * u * kotlin.math.PI / 16.0) / 2.0
-            }
-        }
-
-        val dctCoeffs = Array(8) { DoubleArray(8) }
+        val dctCoeffs = Array(8) { FloatArray(8) }
         val result = IntArray(64)
 
         // Convert integer coefficients to 2D array and dequantize
@@ -1570,10 +1571,10 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                 val coeff = coeffs[idx]
                 if (idx == 0) {
                     // DC coefficient for chroma: lossless quantization (no scaling)
-                    dctCoeffs[u][v] = coeff.toDouble()
+                    dctCoeffs[u][v] = coeff.toFloat()
                 } else {
                     // AC coefficients: use quantization table
-                    dctCoeffs[u][v] = (coeff * quantTable[idx]).toDouble()
+                    dctCoeffs[u][v] = (coeff * quantTable[idx]).toFloat()
                 }
             }
         }
@@ -1581,14 +1582,14 @@ class GraphicsJSR223Delegate(private val vm: VM) {
         // Apply 2D inverse DCT
         for (x in 0 until 8) {
             for (y in 0 until 8) {
-                var sum = 0.0
+                var sum = 0f
                 for (u in 0 until 8) {
                     for (v in 0 until 8) {
-                        sum += dctBasis[u][x] * dctBasis[v][y] * dctCoeffs[u][v]
+                        sum += dctBasis8[u][x] * dctBasis8[v][y] * dctCoeffs[u][v]
                     }
                 }
-                // Co/Cg values don't need +128 offset (they're already centered around 0)
-                val pixel = kotlin.math.max(-255.0, kotlin.math.min(255.0, sum))
+                // Chroma residuals should be in reasonable range (±128 max)
+                val pixel = sum.coerceIn(-127f, 128f)
                 result[y * 8 + x] = pixel.toInt()
             }
         }
@@ -1596,16 +1597,16 @@ class GraphicsJSR223Delegate(private val vm: VM) {
         return result
     }
 
+    val dctBasis16 = Array(16) { u -> // 16-point DCT basis indexed [frequency u][sample x]; a class property, so tevIdct16x16 no longer rebuilds it per call
+        FloatArray(16) { x ->
+            val cu = if (u == 0) 1.0 / sqrt(2.0) else 1.0 // the u == 0 (DC) row is scaled by 1/sqrt(2)
+            (0.25 * cu * cos((2.0 * x + 1.0) * u * PI / 32.0)).toFloat() // computed in Double, stored as Float. NOTE(review): bare PI - this patch imports only cos and sqrt; confirm PI is already in scope
+        }
+    }
+    
     // 16x16 IDCT for Y channel (YCoCg-R format)
     private fun tevIdct16x16(coeffs: IntArray, quantTable: IntArray): IntArray {
-        val dctBasis = Array(16) { u ->
-            Array(16) { x ->
-                val cu = if (u == 0) 1.0 / kotlin.math.sqrt(2.0) else 1.0
-                cu * kotlin.math.cos((2.0 * x + 1.0) * u * kotlin.math.PI / 32.0) / 4.0
-            }
-        }
-        
-        val dctCoeffs = Array(16) { DoubleArray(16) }
+        val dctCoeffs = Array(16) { FloatArray(16) }
         val result = IntArray(256)  // 16x16 = 256
         
         // Convert integer coefficients to 2D array and dequantize
@@ -1615,10 +1616,10 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                 val coeff = coeffs[idx]
                 if (idx == 0) {
                     // DC coefficient for luma: lossless quantization (no scaling)
-                    dctCoeffs[u][v] = coeff.toDouble()
+                    dctCoeffs[u][v] = coeff.toFloat()
                 } else {
                     // AC coefficients: use quantization table
-                    dctCoeffs[u][v] = (coeff * quantTable[idx]).toDouble()
+                    dctCoeffs[u][v] = (coeff * quantTable[idx]).toFloat()
                 }
             }
         }
@@ -1626,13 +1627,13 @@ class GraphicsJSR223Delegate(private val vm: VM) {
         // Apply 2D inverse DCT
         for (x in 0 until 16) {
             for (y in 0 until 16) {
-                var sum = 0.0
+                var sum = 0f
                 for (u in 0 until 16) {
                     for (v in 0 until 16) {
-                        sum += dctBasis[u][x] * dctBasis[v][y] * dctCoeffs[u][v]
+                        sum += dctBasis16[u][x] * dctBasis16[v][y] * dctCoeffs[u][v]
                     }
                 }
-                val pixel = kotlin.math.max(0.0, kotlin.math.min(255.0, sum + 128.0))
+                val pixel = (sum + 128).coerceIn(0f, 255f)
                 result[y * 16 + x] = pixel.toInt()
             }
         }
@@ -1654,22 +1655,50 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                 val co = coBlock[coIdx]
                 val cg = cgBlock[coIdx]
                 
-                // YCoCg-R inverse transform (using safe integer arithmetic)
-                val tmp = y - (cg / 2)  // Use division instead of shift to avoid overflow
+                // YCoCg-R inverse transform (per YCoCg-R spec with truncated division)
+                val tmp = y - (cg / 2)
                 val g = cg + tmp
-                val b = tmp - (co / 2)  // Use division instead of shift to avoid overflow  
+                val b = tmp - (co / 2)
                 val r = b + co
                 
                 // Clamp and store RGB
                 val baseIdx = (py * 16 + px) * 3
-                rgbData[baseIdx] = kotlin.math.max(0, kotlin.math.min(255, r))     // R
-                rgbData[baseIdx + 1] = kotlin.math.max(0, kotlin.math.min(255, g)) // G
-                rgbData[baseIdx + 2] = kotlin.math.max(0, kotlin.math.min(255, b)) // B
+                rgbData[baseIdx] = r.coerceIn(0, 255)     // R
+                rgbData[baseIdx + 1] = g.coerceIn(0, 255) // G
+                rgbData[baseIdx + 2] = b.coerceIn(0, 255) // B
             }
         }
         
         return rgbData
     }
+    
+    // RGB to YCoCg-R conversion for INTER mode residual calculation: takes 16x16 interleaved R,G,B ints and returns 16x16 interleaved Y,Co,Cg ints (no chroma subsampling)
+    fun tevRGBToYcocg(rgbBlock: IntArray): IntArray {
+        val ycocgData = IntArray(16 * 16 * 3)  // Y,Co,Cg for 16x16 pixels
+        
+        for (py in 0 until 16) {
+            for (px in 0 until 16) {
+                val baseIdx = (py * 16 + px) * 3
+                val r = rgbBlock[baseIdx]
+                val g = rgbBlock[baseIdx + 1] 
+                val b = rgbBlock[baseIdx + 2]
+                
+                // YCoCg-R forward lifting steps; '/ 2' truncates toward zero, the same rounding tevYcocgToRGB uses to undo them
+                val co = r - b
+                val tmp = b + (co / 2)
+                val cg = g - tmp
+                val y = tmp + (cg / 2)
+                
+                // Store YCoCg values at the same interleaved position the RGB triple was read from
+                val yIdx = py * 16 + px // yIdx * 3 equals baseIdx
+                ycocgData[yIdx * 3] = y.coerceIn(0, 255)        // Y
+                ycocgData[yIdx * 3 + 1] = co.coerceIn(-128, 127) // Co. NOTE(review): r - b reaches +-255 for 0..255 inputs, so this clamp is lossy - confirm intended
+                ycocgData[yIdx * 3 + 2] = cg.coerceIn(-128, 127) // Cg (same caveat as Co)
+            }
+        }
+        
+        return ycocgData
+    }
 
     /**
      * Hardware-accelerated TEV frame decoder for YCoCg-R 4:2:0 format
@@ -1775,7 +1804,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                         readPtr += 768
                     }
                     
-                    else -> { // TEV_MODE_INTRA (0x01) or TEV_MODE_INTER (0x02) - Full YCoCg-R DCT decode
+                    0x01 -> { // TEV_MODE_INTRA - Full YCoCg-R DCT decode (no motion compensation)
                         // Read DCT coefficients: Y (16x16=256), Co (8x8=64), Cg (8x8=64)
                         val yCoeffs = IntArray(256)
                         val coCoeffs = IntArray(64)
@@ -1813,7 +1842,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                         // Convert YCoCg-R to RGB
                         val rgbData = tevYcocgToRGB(yBlock, coBlock, cgBlock)
                         
-                        // Store RGB data to frame buffer
+                        // Store RGB data to frame buffer (complete replacement)
                         for (dy in 0 until 16) {
                             for (dx in 0 until 16) {
                                 val x = startX + dx
@@ -1830,6 +1859,187 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                             }
                         }
                     }
+                    
+                    0x02 -> { // TEV_MODE_INTER - Motion compensation + residual DCT
+                        // Step 1: Read residual DCT coefficients
+                        val yCoeffs = IntArray(256)
+                        val coCoeffs = IntArray(64)
+                        val cgCoeffs = IntArray(64)
+                        
+                        // Read Y coefficients (16x16 = 256 coefficients × 2 bytes)
+                        for (i in 0 until 256) {
+                            val coeff = ((vm.peek(readPtr)!!.toUint()) or 
+                                        ((vm.peek(readPtr + 1)!!.toUint()) shl 8)).toShort().toInt()
+                            yCoeffs[i] = coeff
+                            readPtr += 2
+                        }
+                        
+                        // Read Co coefficients (8x8 = 64 coefficients × 2 bytes)
+                        for (i in 0 until 64) {
+                            val coeff = ((vm.peek(readPtr)!!.toUint()) or 
+                                        ((vm.peek(readPtr + 1)!!.toUint()) shl 8)).toShort().toInt()
+                            coCoeffs[i] = coeff
+                            readPtr += 2
+                        }
+                        
+                        // Read Cg coefficients (8x8 = 64 coefficients × 2 bytes)  
+                        for (i in 0 until 64) {
+                            val coeff = ((vm.peek(readPtr)!!.toUint()) or 
+                                        ((vm.peek(readPtr + 1)!!.toUint()) shl 8)).toShort().toInt()
+                            cgCoeffs[i] = coeff
+                            readPtr += 2
+                        }
+                        
+                        // Step 2: Decode residual DCT
+                        val yResidual = tevIdct16x16(yCoeffs, quantTableY)
+                        val coResidual = tevIdct8x8(coCoeffs, quantTableC)
+                        val cgResidual = tevIdct8x8(cgCoeffs, quantTableC)
+                        
+                        // Step 3: Build motion-compensated YCoCg-R block and add residuals
+                        val finalY = IntArray(256)
+                        val finalCo = IntArray(64)
+                        val finalCg = IntArray(64)
+                        
+                        // Process Y residuals (16x16)  
+                        for (dy in 0 until 16) {
+                            for (dx in 0 until 16) {
+                                val x = startX + dx
+                                val y = startY + dy
+                                val refX = x + mvX
+                                val refY = y + mvY
+                                val pixelIdx = dy * 16 + dx
+                                
+                                if (x < width && y < height) {
+                                    var mcY: Int
+                                    
+                                    if (refX in 0 until width && refY in 0 until height) {
+                                        // Get motion-compensated RGB from previous frame
+                                        val refPixelOffset = refY.toLong() * width + refX
+                                        val refRgbOffset = refPixelOffset * 3
+                                        
+                                        val mcR = vm.peek(prevRGBAddr + refRgbOffset*prevAddrIncVec)!!.toUint().toInt()
+                                        val mcG = vm.peek(prevRGBAddr + (refRgbOffset + 1)*prevAddrIncVec)!!.toUint().toInt()
+                                        val mcB = vm.peek(prevRGBAddr + (refRgbOffset + 2)*prevAddrIncVec)!!.toUint().toInt()
+                                        
+                                        // Convert motion-compensated RGB to Y only
+                                        val co = mcR - mcB
+                                        val tmp = mcB + (co / 2)
+                                        val cg = mcG - tmp
+                                        val yVal = tmp + (cg / 2)
+                                        
+                                        mcY = yVal
+                                    } else {
+                                        // Out of bounds reference - use neutral values
+                                        mcY = 128
+                                    }
+                                    
+                                    // Add Y residual
+                                    finalY[pixelIdx] = (mcY + yResidual[pixelIdx]).coerceIn(0, 255)
+                                }
+                            }
+                        }
+                        
+                        // Process chroma residuals separately (8x8 subsampled)
+                        for (cy in 0 until 8) {
+                            for (cx in 0 until 8) {
+                                // Chroma coordinates are at 2x2 block centers in subsampled space
+                                val x = startX + cx * 2
+                                val y = startY + cy * 2
+                                
+                                // Apply motion vector to chroma block center
+                                val refX = x + mvX
+                                val refY = y + mvY
+                                val chromaIdx = cy * 8 + cx
+                                
+                                if (x < width && y < height) {
+                                    var mcCo: Int
+                                    var mcCg: Int
+                                    
+                                    // Sample 2x2 block from motion-compensated position for chroma
+                                    if (refX >= 0 && refY >= 0 && refX < width - 1 && refY < height - 1) {
+                                        var coSum = 0
+                                        var cgSum = 0
+                                        var count = 0
+                                        
+                                        // Sample 2x2 block for chroma subsampling (like encoder)
+                                        for (dy in 0 until 2) {
+                                            for (dx in 0 until 2) {
+                                                val sampleX = refX + dx
+                                                val sampleY = refY + dy
+                                                if (sampleX < width && sampleY < height) {
+                                                    val refPixelOffset = sampleY.toLong() * width + sampleX
+                                                    val refRgbOffset = refPixelOffset * 3
+                                                    
+                                                    val mcR = vm.peek(prevRGBAddr + refRgbOffset*prevAddrIncVec)!!.toUint().toInt()
+                                                    val mcG = vm.peek(prevRGBAddr + (refRgbOffset + 1)*prevAddrIncVec)!!.toUint().toInt()
+                                                    val mcB = vm.peek(prevRGBAddr + (refRgbOffset + 2)*prevAddrIncVec)!!.toUint().toInt()
+                                                    
+                                                    val co = mcR - mcB
+                                                    val tmp = mcB + (co / 2)
+                                                    val cg = mcG - tmp
+                                                    
+                                                    coSum += co
+                                                    cgSum += cg
+                                                    count++
+                                                }
+                                            }
+                                        }
+                                        
+                                        mcCo = if (count > 0) coSum / count else 0
+                                        mcCg = if (count > 0) cgSum / count else 0
+                                    } else {
+                                        // Out of bounds reference - use neutral chroma values
+                                        mcCo = 0
+                                        mcCg = 0
+                                    }
+                                    
+                                    // Add chroma residuals - no clamping to see if that's the issue
+                                    finalCo[chromaIdx] = mcCo + coResidual[chromaIdx]
+                                    finalCg[chromaIdx] = mcCg + cgResidual[chromaIdx]
+                                }
+                            }
+                        }
+                        
+                        // Step 4: Convert final YCoCg-R to RGB
+                        val finalRgb = tevYcocgToRGB(finalY, finalCo, finalCg)
+                        
+                        // Step 5: Store final RGB data to frame buffer
+                        for (dy in 0 until 16) {
+                            for (dx in 0 until 16) {
+                                val x = startX + dx
+                                val y = startY + dy
+                                if (x < width && y < height) {
+                                    val rgbIdx = (dy * 16 + dx) * 3
+                                    val imageOffset = y.toLong() * width + x
+                                    val bufferOffset = imageOffset * 3
+                                    
+                                    vm.poke(currentRGBAddr + bufferOffset*thisAddrIncVec, finalRgb[rgbIdx].toByte())
+                                    vm.poke(currentRGBAddr + (bufferOffset + 1)*thisAddrIncVec, finalRgb[rgbIdx + 1].toByte()) 
+                                    vm.poke(currentRGBAddr + (bufferOffset + 2)*thisAddrIncVec, finalRgb[rgbIdx + 2].toByte())
+                                }
+                            }
+                        }
+                    }
+                    
+                    else -> {
+                        // Unknown block mode - skip DCT coefficients and use black
+                        readPtr += 768 // Skip Y(256×2) + Co(64×2) + Cg(64×2) = 768 bytes
+                        
+                        for (dy in 0 until 16) {
+                            for (dx in 0 until 16) {
+                                val x = startX + dx
+                                val y = startY + dy
+                                if (x < width && y < height) {
+                                    val imageOffset = y.toLong() * width + x
+                                    val bufferOffset = imageOffset * 3
+                                    
+                                    vm.poke(currentRGBAddr + bufferOffset*thisAddrIncVec, 0.toByte())      // R=0
+                                    vm.poke(currentRGBAddr + (bufferOffset + 1)*thisAddrIncVec, 0.toByte()) // G=0
+                                    vm.poke(currentRGBAddr + (bufferOffset + 2)*thisAddrIncVec, 0.toByte()) // B=0
+                                }
+                            }
+                        }
+                    }
                 }
             }
         }
@@ -1855,9 +2065,9 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                     
                     // YCoCg-R transform
                     val co = r - b
-                    val tmp = b + (co shr 1)
+                    val tmp = b + (co / 2)
                     val cg = g - tmp
-                    val y = tmp + (cg shr 1)
+                    val y = tmp + (cg / 2)
                     
                     yBlock[py * 16 + px] = y
                 }
@@ -1883,7 +2093,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                             val b = vm.peek(srcPtr + (offset + 2) * incVec)!!.toUint()
                             
                             val co = r - b
-                            val tmp = b + (co shr 1)
+                            val tmp = b + (co / 2)
                             val cg = g - tmp
                             
                             coSum += co
diff --git a/video_encoder/encoder_tev.c b/video_encoder/encoder_tev.c
index 12fd056..621e9e3 100644
--- a/video_encoder/encoder_tev.c
+++ b/video_encoder/encoder_tev.c
@@ -28,6 +28,11 @@
 #define TEV_PACKET_AUDIO_MP2   0x20  // MP2 audio
 #define TEV_PACKET_SYNC        0xFF  // Sync packet
 
+// Utility helper: limit x to [min, max]. An inline function rather than a macro, so each argument is evaluated once.
+static inline int CLAMP(int x, int min, int max) {
+    return x < min ? min : (x > max ? max : x); // the lower bound is tested first
+}
+
 // Quality settings for quantization (Y channel) - 16x16 tables
 static const uint8_t QUANT_TABLES_Y[8][256] = {
     // Quality 0 (lowest) - 16x16 table
@@ -310,30 +315,30 @@ typedef struct {
     int blocks_skip, blocks_intra, blocks_inter, blocks_motion;
 } tev_encoder_t;
 
-// RGB to YCoCg-R transform
+// RGB to YCoCg-R lifting transform. Halving uses truncating '/ 2' (previously '>> 1'); ycocgr_to_rgb must round the same way to invert it.
 static void rgb_to_ycocgr(uint8_t r, uint8_t g, uint8_t b, int *y, int *co, int *cg) {
     *co = (int)r - (int)b;
-    int tmp = (int)b + ((*co) >> 1);
+    int tmp = (int)b + ((*co) / 2);
     *cg = (int)g - tmp;
-    *y = tmp + ((*cg) >> 1);
+    *y = tmp + ((*cg) / 2);
     
-    // Clamp to valid ranges (YCoCg-R should be roughly -255 to +255)
-    *y = (*y < 0) ? 0 : ((*y > 255) ? 255 : *y);
-    *co = (*co < -255) ? -255 : ((*co > 255) ? 255 : *co);
-    *cg = (*cg < -255) ? -255 : ((*cg > 255) ? 255 : *cg);
+    // Clamp outputs. NOTE(review): co = r - b spans -255..255 for 8-bit inputs, so [-128,127] saturates vivid colours (the old limit was +-255) - confirm intended
+    *y = CLAMP(*y, 0, 255);
+    *co = CLAMP(*co, -128, 127);
+    *cg = CLAMP(*cg, -128, 127);
 }
 
-// YCoCg-R to RGB transform (for verification)
+// YCoCg-R to RGB inverse transform (for verification): undoes rgb_to_ycocgr's truncating '/ 2' lifting and writes r/g/b saturated to 0..255
 static void ycocgr_to_rgb(int y, int co, int cg, uint8_t *r, uint8_t *g, uint8_t *b) {
-    int tmp = y - (cg >> 1);
-    *g = cg + tmp;
-    *b = tmp - (co >> 1);
-    *r = *b + co;
+    int tmp = y - (cg / 2);
+    int gi = cg + tmp;       // keep intermediates in int: they can fall outside 0..255
+    int bi = tmp - (co / 2);
+    int ri = bi + co;
     
-    // Clamp values
-    *r = (*r < 0) ? 0 : ((*r > 255) ? 255 : *r);
-    *g = (*g < 0) ? 0 : ((*g > 255) ? 255 : *g);
-    *b = (*b < 0) ? 0 : ((*b > 255) ? 255 : *b);
+    // Clamp in int BEFORE narrowing: storing into uint8_t first wraps modulo 256 and makes the clamp a no-op
+    *r = (uint8_t)CLAMP(ri, 0, 255);
+    *g = (uint8_t)CLAMP(gi, 0, 255);
+    *b = (uint8_t)CLAMP(bi, 0, 255);
 }
 
 // 16x16 2D DCT
@@ -507,6 +512,117 @@ static void estimate_motion(tev_encoder_t *enc, int block_x, int block_y,
     }
 }
 
+// Convert a 16x16 interleaved-RGB block to YCoCg-R with 4:2:0 chroma subsampling: y_block gets 256 values, co_block and cg_block get 64 each
+static void convert_rgb_to_ycocgr_block(const uint8_t *rgb_block, 
+                                       uint8_t *y_block, int8_t *co_block, int8_t *cg_block) {
+    // Convert 16x16 RGB to Y (full resolution)
+    for (int py = 0; py < 16; py++) {
+        for (int px = 0; px < 16; px++) {
+            int rgb_idx = (py * 16 + px) * 3;
+            int r = rgb_block[rgb_idx];
+            int g = rgb_block[rgb_idx + 1];
+            int b = rgb_block[rgb_idx + 2];
+            
+            // YCoCg-R lifting steps; '/ 2' truncates toward zero
+            int co = r - b;
+            int tmp = b + (co / 2);
+            int cg = g - tmp;
+            int y = tmp + (cg / 2);
+            
+            y_block[py * 16 + px] = CLAMP(y, 0, 255);
+        }
+    }
+    
+    // Convert to Co and Cg with 4:2:0 subsampling (8x8): one chroma sample per 2x2 pixel group
+    for (int cy = 0; cy < 8; cy++) {
+        for (int cx = 0; cx < 8; cx++) {
+            // Accumulate Co/Cg over the 2x2 group whose top-left pixel is (cx*2, cy*2)
+            int sum_co = 0, sum_cg = 0;
+            
+            for (int dy = 0; dy < 2; dy++) {
+                for (int dx = 0; dx < 2; dx++) {
+                    int py = cy * 2 + dy;
+                    int px = cx * 2 + dx;
+                    int rgb_idx = (py * 16 + px) * 3;
+                    
+                    int r = rgb_block[rgb_idx];
+                    int g = rgb_block[rgb_idx + 1];
+                    int b = rgb_block[rgb_idx + 2];
+                    
+                    int co = r - b;
+                    int tmp = b + (co / 2);
+                    int cg = g - tmp;
+                    
+                    sum_co += co;
+                    sum_cg += cg;
+                }
+            }
+            
+            // Average (integer division truncates toward zero) and clamp into the int8_t output. NOTE(review): averages can reach +-255, so this clamp is lossy - confirm intended
+            co_block[cy * 8 + cx] = CLAMP(sum_co / 4, -128, 127);
+            cg_block[cy * 8 + cx] = CLAMP(sum_cg / 4, -128, 127);
+        }
+    }
+}
+
+// Extract the motion-compensated YCoCg-R reference for one 16x16 block: gathers RGB from rgb_data at (block + mv), then converts to Y (256) and 4:2:0 Co/Cg (64 each)
+static void extract_motion_compensated_block(const uint8_t *rgb_data, int width, int height,
+                                           int block_x, int block_y, int mv_x, int mv_y,
+                                           uint8_t *y_block, int8_t *co_block, int8_t *cg_block) {
+    // Extract 16x16 RGB block with motion compensation (block_x/block_y are the pixel coordinates of the block's top-left corner)
+    uint8_t rgb_block[16 * 16 * 3];
+    
+    for (int dy = 0; dy < 16; dy++) {
+        for (int dx = 0; dx < 16; dx++) {
+            int cur_x = block_x + dx;
+            int cur_y = block_y + dy;
+            int ref_x = cur_x + mv_x;
+            int ref_y = cur_y + mv_y;
+            
+            int rgb_idx = (dy * 16 + dx) * 3;
+            
+            if (ref_x >= 0 && ref_y >= 0 && ref_x < width && ref_y < height) {
+                // Copy RGB from reference position
+                int ref_offset = (ref_y * width + ref_x) * 3;
+                rgb_block[rgb_idx] = rgb_data[ref_offset];         // R
+                rgb_block[rgb_idx + 1] = rgb_data[ref_offset + 1]; // G
+                rgb_block[rgb_idx + 2] = rgb_data[ref_offset + 2]; // B
+            } else {
+                // Out of bounds - use mid-grey (Y=128, Co=Cg=0): the decoder's INTER path predicts Y=128 / chroma 0 here, so black (Y=0) would desync the residual
+                rgb_block[rgb_idx] = 128;     // R
+                rgb_block[rgb_idx + 1] = 128; // G
+                rgb_block[rgb_idx + 2] = 128; // B
+            }
+        }
+    }
+    
+    // Convert RGB block to YCoCg-R. NOTE(review): for 2x2 chroma groups only partly in frame the decoder uses chroma 0 outright, which can still differ from this average - confirm
+    convert_rgb_to_ycocgr_block(rgb_block, y_block, co_block, cg_block);
+}
+
+// Compute motion-compensated residual for INTER mode: overwrites enc's Y/Co/Cg workspaces in place with (workspace - reference predicted from previous_rgb at mv)
+static void compute_motion_residual(tev_encoder_t *enc, int block_x, int block_y, int mv_x, int mv_y) {
+    int start_x = block_x * 16; // block_x/block_y are block indices; convert to pixel coordinates
+    int start_y = block_y * 16;
+    
+    // Extract motion-compensated reference block from previous frame
+    uint8_t ref_y[256];
+    int8_t ref_co[64], ref_cg[64];
+    extract_motion_compensated_block(enc->previous_rgb, enc->width, enc->height,
+                                   start_x, start_y, mv_x, mv_y, 
+                                   ref_y, ref_co, ref_cg);
+    
+    // Compute residuals: current - motion_compensated_reference (assumes the workspaces already hold the current block's Y/Co/Cg - TODO confirm against encode_block)
+    for (int i = 0; i < 256; i++) {
+        enc->y_workspace[i] = (int)enc->y_workspace[i] - (int)ref_y[i];
+    }
+    
+    for (int i = 0; i < 64; i++) {
+        enc->co_workspace[i] = (int)enc->co_workspace[i] - (int)ref_co[i];
+        enc->cg_workspace[i] = (int)enc->cg_workspace[i] - (int)ref_cg[i];
+    }
+}
+
 // Encode a 16x16 block
 static void encode_block(tev_encoder_t *enc, int block_x, int block_y, int is_keyframe) {
     tev_block_t *block = &enc->block_data[block_y * ((enc->width + 15) / 16) + block_x];
@@ -608,8 +724,15 @@ static void encode_block(tev_encoder_t *enc, int block_x, int block_y, int is_ke
             memset(block->cg_coeffs, 0, sizeof(block->cg_coeffs));
             enc->blocks_motion++;
             return; // Skip DCT encoding, just store motion vector
+        } else if (motion_sad < skip_sad && (abs(block->mv_x) > 0 || abs(block->mv_y) > 0)) {
+            // Use inter mode with residual DCT - motion compensation + residual
+            block->mode = TEV_MODE_INTER;
+            enc->blocks_inter++;
+            
+            // Compute motion-compensated residual for DCT encoding
+            compute_motion_residual(enc, block_x, block_y, block->mv_x, block->mv_y);
         } else {
-            // Use intra mode for now (inter mode with residual DCT not implemented)
+            // No good motion prediction - use intra mode
             block->mode = TEV_MODE_INTRA;
             block->mv_x = 0;
             block->mv_y = 0;
@@ -695,13 +818,13 @@ static int alloc_encoder_buffers(tev_encoder_t *enc) {
     
     if (gzip_init_result != Z_OK) {
         fprintf(stderr, "Failed to initialize gzip compression\n");
-        return -1;
+        return 0;
     }
     
     // Initialize previous frame to black
     memset(enc->previous_rgb, 0, pixels * 3);
     
-    return 0;
+    return 1;
 }
 
 // Free encoder resources
@@ -772,13 +895,13 @@ static int encode_frame(tev_encoder_t *enc, FILE *output, int frame_num) {
     
     if (deflateReset(&enc->gzip_stream) != Z_OK) {
         fprintf(stderr, "Gzip deflateReset failed\n");
-        return -1;
+        return 0;
     }
     
     int result = deflate(&enc->gzip_stream, Z_FINISH);
     if (result != Z_STREAM_END) {
         fprintf(stderr, "Gzip compression failed: %d\n", result);
-        return -1;
+        return 0;
     }
     
     size_t compressed_size = enc->gzip_stream.total_out;
@@ -792,16 +915,13 @@ static int encode_frame(tev_encoder_t *enc, FILE *output, int frame_num) {
     fwrite(enc->compressed_buffer, 1, compressed_size, output);
     
     enc->total_output_bytes += 5 + compressed_size;
-    
-    // Copy current frame to previous for next iteration
-    //memcpy(enc->previous_rgb, enc->current_rgb, enc->width * enc->height * 3);
 
     // Swap frame buffers for next frame
     uint8_t *temp_rgb = enc->previous_rgb;
     enc->previous_rgb = enc->current_rgb;
     enc->current_rgb = temp_rgb;
 
-    return 0;
+    return 1;
 }
 
 // Execute command and capture output
@@ -1099,7 +1219,7 @@ int main(int argc, char *argv[]) {
     }
     
     // Allocate buffers
-    if (alloc_encoder_buffers(enc) < 0) {
+    if (!alloc_encoder_buffers(enc)) {
         fprintf(stderr, "Failed to allocate encoder buffers\n");
         cleanup_encoder(enc);
         return 1;
@@ -1194,7 +1314,7 @@ int main(int argc, char *argv[]) {
         }
         
         // Encode frame
-        if (encode_frame(enc, output, frame_count) < 0) {
+        if (!encode_frame(enc, output, frame_count)) {
             fprintf(stderr, "Failed to encode frame %d\n", frame_count);
             break;
         }