diff --git a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
index ad6d078..19bd92c 100644
--- a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
+++ b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
@@ -2147,7 +2147,92 @@ class GraphicsJSR223Delegate(private val vm: VM) {
         
         return rgbData
     }
-    
+
+    // ICtCp to RGB conversion for TEV version 3
+    fun tevIctcpToRGB(iBlock: IntArray, ctBlock: IntArray, cpBlock: IntArray): IntArray {
+        val rgbData = IntArray(16 * 16 * 3)  // R,G,B for 16x16 pixels
+
+        // Process 16x16 I channel with 8x8 Ct/Cp channels (4:2:0 upsampling)
+        for (py in 0 until 16) {
+            for (px in 0 until 16) {
+                val iIdx = py * 16 + px
+                val i = iBlock[iIdx].toDouble()
+
+                // Get Ct/Cp from 8x8 chroma blocks (4:2:0 upsampling)
+                val ctIdx = (py / 2) * 8 + (px / 2)
+                val ct = ctBlock[ctIdx].toDouble()
+                val cp = cpBlock[ctIdx].toDouble()
+
+                // Convert scaled values back to ICtCp range
+                // I channel: IDCT already added 128, so i is in [0,255]. Reverse encoder: (c1*255-128)+128 = c1*255
+                val I = i / 255.0
+                // Ct/Cp were scaled: c2/c3 * 255.0, so reverse: ct/cp / 255.0
+                val Ct = (ct / 255.0)
+                val Cp = (cp / 255.0)
+
+                // ICtCp -> L'M'S' (inverse matrix)
+                val Lp = I + 0.015718580108730416 * Ct + 0.2095810681164055 * Cp
+                val Mp = I - 0.015718580108730416 * Ct - 0.20958106811640548 * Cp
+                val Sp = I + 1.0212710798422344 * Ct - 0.6052744909924316 * Cp
+
+                // HLG decode: L'M'S' -> linear LMS
+                val L = HLG_inverse_OETF(Lp)
+                val M = HLG_inverse_OETF(Mp)
+                val S = HLG_inverse_OETF(Sp)
+
+                // LMS -> linear sRGB (inverse matrix)
+                val rLin = 3.436606694333079 * L -2.5064521186562705 * M + 0.06984542432319149 * S
+                val gLin = -0.7913295555989289 * L + 1.983600451792291 * M -0.192270896193362 * S
+                val bLin = -0.025949899690592665 * L -0.09891371471172647 * M + 1.1248636144023192 * S
+
+                // Gamma encode to sRGB
+                val rSrgb = srgbUnlinearize(rLin)
+                val gSrgb = srgbUnlinearize(gLin)
+                val bSrgb = srgbUnlinearize(bLin)
+
+                // Convert to 8-bit and store
+                val baseIdx = (py * 16 + px) * 3
+                rgbData[baseIdx] = (rSrgb * 255.0).toInt().coerceIn(0, 255)     // R
+                rgbData[baseIdx + 1] = (gSrgb * 255.0).toInt().coerceIn(0, 255) // G
+                rgbData[baseIdx + 2] = (bSrgb * 255.0).toInt().coerceIn(0, 255) // B
+            }
+        }
+
+        return rgbData
+    }
+
+    // Helper functions for ICtCp decoding
+
+    // Inverse HLG OETF (HLG -> linear)
+    fun HLG_inverse_OETF(V: Double): Double {
+        val a = 0.17883277
+        val b = 1.0 - 4.0 * a
+        val c = 0.5 - a * ln(4.0 * a)
+
+        if (V <= 0.5)
+            return (V * V) / 3.0
+        else
+            return (exp((V - c)/a) + b) / 12.0
+    }
+
+    // sRGB gamma decode: nonlinear -> linear
+    private fun srgbLinearize(value: Double): Double {
+        return if (value <= 0.04045) {
+            value / 12.92
+        } else {
+            ((value + 0.055) / 1.055).pow(2.4)
+        }
+    }
+
+    // sRGB gamma encode: linear -> nonlinear
+    private fun srgbUnlinearize(value: Double): Double {
+        return if (value <= 0.0031308) {
+            value * 12.92
+        } else {
+            1.055 * value.pow(1.0 / 2.4) - 0.055
+        }
+    }
+
     // RGB to YCoCg-R conversion for INTER mode residual calculation
     fun tevRGBToYcocg(rgbBlock: IntArray): IntArray {
         val ycocgData = IntArray(16 * 16 * 3)  // Y,Co,Cg for 16x16 pixels
@@ -2175,147 +2260,6 @@ class GraphicsJSR223Delegate(private val vm: VM) {
         
         return ycocgData
     }
-    
-    // XYB conversion constants from JPEG XL specification
-    private val XYB_BIAS = 0.00379307325527544933
-    private val CBRT_BIAS = 0.155954200549248620 // cbrt(XYB_BIAS)
-    
-    // RGB to LMS mixing coefficients  
-    private val RGB_TO_LMS = arrayOf(
-        doubleArrayOf(0.3, 0.622, 0.078),                           // L coefficients
-        doubleArrayOf(0.23, 0.692, 0.078),                          // M coefficients  
-        doubleArrayOf(0.24342268924547819, 0.20476744424496821, 0.55180986650955360)  // S coefficients
-    )
-    
-    // LMS to RGB inverse matrix
-    private val LMS_TO_RGB = arrayOf(
-        doubleArrayOf(11.0315669046, -9.8669439081, -0.1646229965),
-        doubleArrayOf(-3.2541473811, 4.4187703776, -0.1646229965),
-        doubleArrayOf(-3.6588512867, 2.7129230459, 1.9459282408)
-    )
-    
-    // sRGB linearization functions
-    private fun srgbLinearise(value: Double): Double {
-        return if (value > 0.04045) {
-            Math.pow((value + 0.055) / 1.055, 2.4)
-        } else {
-            value / 12.92
-        }
-    }
-    
-    private fun srgbUnlinearise(value: Double): Double {
-        return if (value > 0.0031308) {
-            1.055 * Math.pow(value, 1.0 / 2.4) - 0.055
-        } else {
-            value * 12.92
-        }
-    }
-    
-    // XYB to RGB conversion for hardware decoding
-    fun tevXybToRGB(yBlock: IntArray, xBlock: IntArray, bBlock: IntArray): IntArray {
-        val rgbData = IntArray(16 * 16 * 3)  // R,G,B for 16x16 pixels
-        
-        for (py in 0 until 16) {
-            for (px in 0 until 16) {
-                val yIdx = py * 16 + px
-                val y = yBlock[yIdx]
-                
-                // Get chroma values from subsampled 8x8 blocks (nearest neighbor upsampling)
-                val xbIdx = (py / 2) * 8 + (px / 2)
-                val x = xBlock[xbIdx]
-                val b = bBlock[xbIdx]
-                
-                // Optimal range-based dequantization (exact inverse of improved quantization)
-                val X_MIN = -0.016; val X_MAX = 0.030
-                val xVal = (x / 255.0) * (X_MAX - X_MIN) + X_MIN  // X: inverse of range mapping
-                val Y_MAX = 0.85
-                val yVal = (y / 255.0) * Y_MAX                    // Y: inverse of improved scale
-                val B_MAX = 0.85
-                val bVal = ((b + 128.0) / 255.0) * B_MAX          // B: inverse of ((val/B_MAX*255)-128)
-                
-                // XYB to LMS gamma
-                val lgamma = xVal + yVal
-                val mgamma = yVal - xVal
-                val sgamma = bVal
-                
-                // Remove gamma correction
-                val lmix = (lgamma + CBRT_BIAS).pow(3.0) - XYB_BIAS
-                val mmix = (mgamma + CBRT_BIAS).pow(3.0) - XYB_BIAS
-                val smix = (sgamma + CBRT_BIAS).pow(3.0) - XYB_BIAS
-                
-                // LMS to linear RGB using inverse matrix
-                val rLinear = (LMS_TO_RGB[0][0] * lmix + LMS_TO_RGB[0][1] * mmix + LMS_TO_RGB[0][2] * smix).coerceIn(0.0, 1.0)
-                val gLinear = (LMS_TO_RGB[1][0] * lmix + LMS_TO_RGB[1][1] * mmix + LMS_TO_RGB[1][2] * smix).coerceIn(0.0, 1.0)
-                val bLinear = (LMS_TO_RGB[2][0] * lmix + LMS_TO_RGB[2][1] * mmix + LMS_TO_RGB[2][2] * smix).coerceIn(0.0, 1.0)
-                
-                // Convert back to sRGB gamma and 0-255 range
-                val r = (srgbUnlinearise(rLinear) * 255.0 + 0.5).toInt().coerceIn(0, 255)
-                val g = (srgbUnlinearise(gLinear) * 255.0 + 0.5).toInt().coerceIn(0, 255)
-                val bRgb = (srgbUnlinearise(bLinear) * 255.0 + 0.5).toInt().coerceIn(0, 255)
-                
-                // Store RGB
-                val baseIdx = (py * 16 + px) * 3
-                rgbData[baseIdx] = r     // R
-                rgbData[baseIdx + 1] = g // G
-                rgbData[baseIdx + 2] = bRgb // B
-            }
-        }
-        
-        return rgbData
-    }
-    
-    // RGB to XYB conversion for INTER mode residual calculation
-    fun tevRGBToXyb(rgbBlock: IntArray): IntArray {
-        val xybData = IntArray(16 * 16 * 3)  // Y,X,B for 16x16 pixels
-        
-        for (py in 0 until 16) {
-            for (px in 0 until 16) {
-                val baseIdx = (py * 16 + px) * 3
-                val r = rgbBlock[baseIdx]
-                val g = rgbBlock[baseIdx + 1]
-                val b = rgbBlock[baseIdx + 2]
-                
-                // Convert RGB to 0-1 range and linearise sRGB
-                val rNorm = srgbLinearise(r / 255.0)
-                val gNorm = srgbLinearise(g / 255.0)
-                val bNorm = srgbLinearise(b / 255.0)
-                
-                // RGB to LMS mixing with bias
-                val lmix = RGB_TO_LMS[0][0] * rNorm + RGB_TO_LMS[0][1] * gNorm + RGB_TO_LMS[0][2] * bNorm + XYB_BIAS
-                val mmix = RGB_TO_LMS[1][0] * rNorm + RGB_TO_LMS[1][1] * gNorm + RGB_TO_LMS[1][2] * bNorm + XYB_BIAS
-                val smix = RGB_TO_LMS[2][0] * rNorm + RGB_TO_LMS[2][1] * gNorm + RGB_TO_LMS[2][2] * bNorm + XYB_BIAS
-                
-                // Apply gamma correction (cube root)
-                val lgamma = lmix.pow(1.0 / 3.0) - CBRT_BIAS
-                val mgamma = mmix.pow(1.0 / 3.0) - CBRT_BIAS
-                val sgamma = smix.pow(1.0 / 3.0) - CBRT_BIAS
-                
-                // LMS to XYB transformation
-                val xVal = (lgamma - mgamma) / 2.0
-                val yVal = (lgamma + mgamma) / 2.0
-                val bVal = sgamma
-                
-                // Optimal range-based quantization for XYB values (improved precision)
-                // X: actual range -0.016 to +0.030, map to full 0-255 precision
-                val X_MIN = -0.016; val X_MAX = 0.030
-                val xQuant = (((xVal - X_MIN) / (X_MAX - X_MIN)) * 255.0).toInt().coerceIn(0, 255)
-                // Y: range 0 to 0.85, map to 0 to 255 (improved scale)
-                val Y_MAX = 0.85
-                val yQuant = ((yVal / Y_MAX) * 255.0).toInt().coerceIn(0, 255)
-                // B: range 0 to 0.85, map to -128 to +127 (optimized precision)
-                val B_MAX = 0.85
-                val bQuant = (((bVal / B_MAX) * 255.0) - 128.0).toInt().coerceIn(-128, 127)
-                
-                // Store XYB values
-                val yIdx = py * 16 + px
-                xybData[yIdx * 3] = yQuant     // Y
-                xybData[yIdx * 3 + 1] = xQuant // X
-                xybData[yIdx * 3 + 2] = bQuant // B
-            }
-        }
-        
-        return xybData
-    }
 
     /**
      * Enhanced TEV Deblocking Filter - Uses Knusperli-inspired techniques for superior boundary analysis
@@ -2627,8 +2571,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
     fun tevDecode(blockDataPtr: Long, currentRGBAddr: Long, prevRGBAddr: Long,
                   width: Int, height: Int, qY: Int, qCo: Int, qCg: Int, frameCounter: Int,
                   debugMotionVectors: Boolean = false, tevVersion: Int = 2,
-                  enableDeblocking: Boolean = true, enableBoundaryAwareDecoding: Boolean = false,
-                  isLossless: Boolean = false) {
+                  enableDeblocking: Boolean = true, enableBoundaryAwareDecoding: Boolean = false) {
 
         // height doesn't change when interlaced, because that's the encoder's output
 
@@ -2705,7 +2648,9 @@ class GraphicsJSR223Delegate(private val vm: VM) {
             // PASS 2: Apply proper knusperli boundary optimization (Google's algorithm)
             val (optimizedYBlocks, optimizedCoBlocks, optimizedCgBlocks) = applyKnusperliOptimization(
                 yBlocks, coBlocks, cgBlocks,
-                QUANT_TABLE_Y, QUANT_TABLE_C, QUANT_TABLE_C,
+                if (tevVersion == 3) QUANT_TABLE_Y else QUANT_TABLE_Y,
+                if (tevVersion == 3) QUANT_TABLE_C else QUANT_TABLE_C,
+                if (tevVersion == 3) QUANT_TABLE_C else QUANT_TABLE_C,
                 qY, qCo, qCg, rateControlFactors,
                 blocksX, blocksY
             )
@@ -2744,7 +2689,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                                 val cgPixels = tevIdct8x8_fromOptimizedCoeffs(cgBlock)
                                 
                                 val rgbData = if (tevVersion == 3) {
-                                    tevXybToRGB(yPixels, coPixels, cgPixels)
+                                    tevIctcpToRGB(yPixels, coPixels, cgPixels)
                                 } else {
                                     tevYcocgToRGB(yPixels, coPixels, cgPixels)
                                 }
@@ -2919,69 +2864,20 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                         }
 
                         0x01 -> { // TEV_MODE_INTRA - Full YCoCg-R DCT decode (no motion compensation)
-                            val yBlock: IntArray
-                            val coBlock: IntArray  
-                            val cgBlock: IntArray
-                            
-                            if (isLossless) {
-                                // Lossless mode: coefficients are stored as float16, no quantization
-                                // Read float16 coefficients: Y (16x16=256), Co (8x8=64), Cg (8x8=64)
-                                val coeffFloat16Array = ShortArray(384) // 384 float16 values stored as shorts
-                                vm.bulkPeekShort(readPtr.toInt(), coeffFloat16Array, 768) // 384 * 2 bytes
-                                readPtr += 768
-                                
-                                // Convert float16 to float32 and perform IDCT directly (no quantization)
-                                println("DEBUG: Reading lossless coefficients, first few float16 values: ${coeffFloat16Array.take(10).map { "0x${it.toString(16)}" }}")
-                                val yCoeffs = FloatArray(256) { i ->
-                                    // Convert signed short to unsigned short for float16 interpretation
-                                    val signedShort = coeffFloat16Array[i]
-                                    val float16bits = signedShort.toInt() and 0xFFFF  // Convert to unsigned
-                                    val floatVal = Float16.toFloat(float16bits.toShort())
-                                    if (floatVal.isNaN() || floatVal.isInfinite()) {
-                                        println("NaN/Inf detected at Y coefficient $i: signedShort=0x${signedShort.toString(16)}, unsigned=0x${float16bits.toString(16)}, floatVal=$floatVal")
-                                        0f // Replace NaN with 0
-                                    } else floatVal
-                                }
-                                val coCoeffs = FloatArray(64) { i ->
-                                    // Convert signed short to unsigned short for float16 interpretation
-                                    val signedShort = coeffFloat16Array[256 + i]
-                                    val float16bits = signedShort.toInt() and 0xFFFF  // Convert to unsigned
-                                    val floatVal = Float16.toFloat(float16bits.toShort())
-                                    if (floatVal.isNaN() || floatVal.isInfinite()) {
-                                        println("NaN/Inf detected at Co coefficient $i: signedShort=0x${signedShort.toString(16)}, unsigned=0x${float16bits.toString(16)}, floatVal=$floatVal")
-                                        0f // Replace NaN with 0
-                                    } else floatVal
-                                }
-                                val cgCoeffs = FloatArray(64) { i ->
-                                    // Convert signed short to unsigned short for float16 interpretation
-                                    val signedShort = coeffFloat16Array[320 + i]
-                                    val float16bits = signedShort.toInt() and 0xFFFF  // Convert to unsigned
-                                    val floatVal = Float16.toFloat(float16bits.toShort())
-                                    if (floatVal.isNaN() || floatVal.isInfinite()) {
-                                        println("NaN/Inf detected at Cg coefficient $i: signedShort=0x${signedShort.toString(16)}, unsigned=0x${float16bits.toString(16)}, floatVal=$floatVal")
-                                        0f // Replace NaN with 0
-                                    } else floatVal
-                                }
-                                
-                                yBlock = tevIdct16x16_lossless(yCoeffs)
-                                coBlock = tevIdct8x8_lossless(coCoeffs)
-                                cgBlock = tevIdct8x8_lossless(cgCoeffs)
-                            } else {
-                                // Regular lossy mode: quantized int16 coefficients
-                                // Optimized bulk reading of all DCT coefficients: Y(256×2) + Co(64×2) + Cg(64×2) = 768 bytes
-                                val coeffShortArray = ShortArray(384) // Total coefficients: 256 + 64 + 64 = 384 shorts
-                                vm.bulkPeekShort(readPtr.toInt(), coeffShortArray, 768)
-                                readPtr += 768
+                            // Regular lossy mode: quantized int16 coefficients
+                            // Optimized bulk reading of all DCT coefficients: Y(256×2) + Co(64×2) + Cg(64×2) = 768 bytes
+                            val coeffShortArray = ShortArray(384) // Total coefficients: 256 + 64 + 64 = 384 shorts
+                            vm.bulkPeekShort(readPtr.toInt(), coeffShortArray, 768)
+                            readPtr += 768
 
-                                // Perform hardware IDCT for each channel using fast algorithm
-                                yBlock = tevIdct16x16_fast(coeffShortArray.sliceArray(0 until 256), QUANT_TABLE_Y, qY, rateControlFactor)
-                                coBlock = tevIdct8x8_fast(coeffShortArray.sliceArray(256 until 320), QUANT_TABLE_C, true, qCo, rateControlFactor)
-                                cgBlock = tevIdct8x8_fast(coeffShortArray.sliceArray(320 until 384), QUANT_TABLE_C, true, qCg, rateControlFactor)
-                            }
+                            // Perform hardware IDCT for each channel using fast algorithm
+                            val yBlock = tevIdct16x16_fast(coeffShortArray.sliceArray(0 until 256), if (tevVersion == 3) QUANT_TABLE_Y else QUANT_TABLE_Y, qY, rateControlFactor)
+                            val coBlock = tevIdct8x8_fast(coeffShortArray.sliceArray(256 until 320), if (tevVersion == 3) QUANT_TABLE_C else QUANT_TABLE_C, true, qCo, rateControlFactor)
+                            val cgBlock = tevIdct8x8_fast(coeffShortArray.sliceArray(320 until 384), if (tevVersion == 3) QUANT_TABLE_C else QUANT_TABLE_C, true, qCg, rateControlFactor)
 
                             // Convert to RGB (YCoCg-R for v2, XYB for v3)
                             val rgbData = if (tevVersion == 3) {
-                                tevXybToRGB(yBlock, coBlock, cgBlock)  // XYB format (v3)
+                                tevIctcpToRGB(yBlock, coBlock, cgBlock)  // XYB format (v3)
                             } else {
                                 tevYcocgToRGB(yBlock, coBlock, cgBlock)  // YCoCg-R format (v2)
                             }
@@ -2999,9 +2895,9 @@ class GraphicsJSR223Delegate(private val vm: VM) {
                             readPtr += 768
 
                             // Step 2: Decode residual DCT
-                            val yResidual = tevIdct16x16_fast(coeffShortArray.sliceArray(0 until 256), QUANT_TABLE_Y, qY, rateControlFactor)
-                            val coResidual = tevIdct8x8_fast(coeffShortArray.sliceArray(256 until 320), QUANT_TABLE_C, true, qCo, rateControlFactor)
-                            val cgResidual = tevIdct8x8_fast(coeffShortArray.sliceArray(320 until 384), QUANT_TABLE_C, true, qCg, rateControlFactor)
+                            val yResidual = tevIdct16x16_fast(coeffShortArray.sliceArray(0 until 256), if (tevVersion == 3) QUANT_TABLE_Y else QUANT_TABLE_Y, qY, rateControlFactor)
+                            val coResidual = tevIdct8x8_fast(coeffShortArray.sliceArray(256 until 320), if (tevVersion == 3) QUANT_TABLE_C else QUANT_TABLE_C, true, qCo, rateControlFactor)
+                            val cgResidual = tevIdct8x8_fast(coeffShortArray.sliceArray(320 until 384), if (tevVersion == 3) QUANT_TABLE_C else QUANT_TABLE_C, true, qCg, rateControlFactor)
 
                             // Step 3: Build motion-compensated YCoCg-R block and add residuals
                             val finalY = IntArray(256)
@@ -3108,7 +3004,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
 
                             // Step 4: Convert final data to RGB (YCoCg-R for v2, XYB for v3)
                             val finalRgb = if (tevVersion == 3) {
-                                tevXybToRGB(finalY, finalCo, finalCg)  // XYB format (v3)
+                                tevIctcpToRGB(finalY, finalCo, finalCg)  // XYB format (v3)
                             } else {
                                 tevYcocgToRGB(finalY, finalCo, finalCg)  // YCoCg-R format (v2)
                             }
@@ -4023,1094 +3919,4 @@ class GraphicsJSR223Delegate(private val vm: VM) {
         }
     }
 
-    // =============================================================================
-    // TAV (TSVM Advanced Video) Hardware Acceleration Functions
-    // =============================================================================
-
-    // 5/3 Reversible wavelet filter coefficients
-    private val wavelet53LP = floatArrayOf(0.5f, 1.0f, 0.5f)
-    private val wavelet53HP = floatArrayOf(-0.125f, -0.25f, 0.75f, -0.25f, -0.125f)
-
-    // 9/7 Irreversible wavelet filter coefficients (Daubechies)
-    private val wavelet97LP = floatArrayOf(
-        0.037828455507f, -0.023849465020f, -0.110624404418f, 0.377402855613f,
-        0.852698679009f, 0.377402855613f, -0.110624404418f, -0.023849465020f, 0.037828455507f
-    )
-    private val wavelet97HP = floatArrayOf(
-        0.064538882629f, -0.040689417609f, -0.418092273222f, 0.788485616406f,
-        -0.418092273222f, -0.040689417609f, 0.064538882629f
-    )
-
-    // Working buffers for DWT processing
-    private val dwtTempBuffer = FloatArray(64 * 64)
-    private val dwtSubbandLL = FloatArray(32 * 32)
-    private val dwtSubbandLH = FloatArray(32 * 32) 
-    private val dwtSubbandHL = FloatArray(32 * 32)
-    private val dwtSubbandHH = FloatArray(32 * 32)
-
-    private var frameCounter = 0
-    /**
-     * Main TAV decoder function - processes compressed TAV tile data
-     * Called from JavaScript playtav.js decoder
-     */
-    fun tavDecode(blockDataPtr: Long, currentRGBAddr: Long, prevRGBAddr: Long,
-                  width: Int, height: Int, qY: Int, qCo: Int, qCg: Int, frameCounter: Int,
-                  debugMotionVectors: Boolean = false, waveletFilter: Int = 1,
-                  decompLevels: Int = 3, enableDeblocking: Boolean = true,
-                  isLossless: Boolean = false) {
-        this.frameCounter = frameCounter
-
-        var readPtr = blockDataPtr
-
-        try {
-            val tilesX = (width + 63) / 64  // 64x64 tiles (vs TEV's 16x16 blocks)
-            val tilesY = (height + 63) / 64
-            
-            // Process each tile
-            for (tileY in 0 until tilesY) {
-                for (tileX in 0 until tilesX) {
-                    
-                    // Read tile header (9 bytes: mode + mvX + mvY + rcf)
-                    val mode = vm.peek(readPtr).toInt() and 0xFF
-                    readPtr += 1
-                    val mvX = vm.peekShort(readPtr).toInt()
-                    readPtr += 2
-                    val mvY = vm.peekShort(readPtr).toInt()
-                    readPtr += 2
-                    val rcf = vm.peekFloat(readPtr)
-                    readPtr += 4
-
-                    // Debug tile header for first few tiles
-                    if ((tileX < 2 && tileY < 2) && frameCounter < 3) {
-                        println("TAV Debug: Tile ($tileX,$tileY) frame $frameCounter - mode=0x${mode.toString(16)}, mvX=$mvX, mvY=$mvY, rcf=$rcf")
-                    }
-
-                    when (mode) {
-                        0x00 -> { // TAV_MODE_SKIP
-                            // Copy 64x64 tile from previous frame to current frame
-                            copyTile64x64RGB(tileX, tileY, currentRGBAddr, prevRGBAddr, width, height)
-                        }
-                        0x01 -> { // TAV_MODE_INTRA  
-                            // Decode DWT coefficients directly to RGB buffer
-                            readPtr = decodeDWTIntraTileRGB(readPtr, tileX, tileY, currentRGBAddr, 
-                                                          width, height, qY, qCo, qCg, rcf,
-                                                          waveletFilter, decompLevels, isLossless)
-                        }
-                        0x02 -> { // TAV_MODE_INTER
-                            // Motion compensation + DWT residual to RGB buffer
-                            readPtr = decodeDWTInterTileRGB(readPtr, tileX, tileY, mvX, mvY,
-                                                          currentRGBAddr, prevRGBAddr,
-                                                          width, height, qY, qCo, qCg, rcf,
-                                                          waveletFilter, decompLevels, isLossless)
-                        }
-                        0x03 -> { // TAV_MODE_MOTION
-                            // Motion compensation only (no residual)
-                            applyMotionCompensation64x64RGB(tileX, tileY, mvX, mvY,
-                                                          currentRGBAddr, prevRGBAddr, width, height)
-                        }
-                    }
-                }
-            }
-
-        } catch (e: Exception) {
-            println("TAV decode error: ${e.message}")
-        }
-    }
-
-    // Helper functions for TAV RGB-based decoding
-    
-    private fun copyTile64x64RGB(tileX: Int, tileY: Int, currentRGBAddr: Long, prevRGBAddr: Long, width: Int, height: Int) {
-        val tileSize = 64
-        val startX = tileX * tileSize
-        val startY = tileY * tileSize
-        
-        for (y in 0 until tileSize) {
-            for (x in 0 until tileSize) {
-                val frameX = startX + x
-                val frameY = startY + y
-                
-                if (frameX < width && frameY < height) {
-                    val pixelIdx = frameY * width + frameX
-                    val rgbOffset = pixelIdx * 3L
-                    
-                    // Copy RGB pixel from previous frame
-                    val r = vm.peek(prevRGBAddr + rgbOffset)
-                    val g = vm.peek(prevRGBAddr + rgbOffset + 1)
-                    val b = vm.peek(prevRGBAddr + rgbOffset + 2)
-                    
-                    vm.poke(currentRGBAddr + rgbOffset, r)
-                    vm.poke(currentRGBAddr + rgbOffset + 1, g)
-                    vm.poke(currentRGBAddr + rgbOffset + 2, b)
-                }
-            }
-        }
-    }
-    
-    private fun decodeDWTIntraTileRGB(readPtr: Long, tileX: Int, tileY: Int, currentRGBAddr: Long,
-                                    width: Int, height: Int, qY: Int, qCo: Int, qCg: Int, rcf: Float,
-                                    waveletFilter: Int, decompLevels: Int, isLossless: Boolean): Long {
-        val tileSize = 64
-        val coeffCount = tileSize * tileSize
-        var ptr = readPtr
-        
-        // Read quantized DWT coefficients for Y, Co, Cg channels
-        val quantizedY = ShortArray(coeffCount)
-        val quantizedCo = ShortArray(coeffCount)
-        val quantizedCg = ShortArray(coeffCount)
-        
-        // Read Y coefficients
-        for (i in 0 until coeffCount) {
-            quantizedY[i] = vm.peekShort(ptr)
-            ptr += 2
-        }
-        
-        // Read Co coefficients
-        for (i in 0 until coeffCount) {
-            quantizedCo[i] = vm.peekShort(ptr)
-            ptr += 2
-        }
-        
-        // Read Cg coefficients
-        for (i in 0 until coeffCount) {
-            quantizedCg[i] = vm.peekShort(ptr)
-            ptr += 2
-        }
-        
-        // Dequantize and apply inverse DWT
-        val yTile = FloatArray(coeffCount)
-        val coTile = FloatArray(coeffCount)
-        val cgTile = FloatArray(coeffCount)
-        
-        // Debug: check quantized values before dequantization
-        if (tileX == 0 && tileY == 0 && frameCounter < 3) {
-            println("TAV Debug: Tile (0,0) frame $frameCounter - readPtr=0x${readPtr.toString(16)}")
-            println("TAV Debug: First 32 bytes at readPtr: ${(0 until 32).map { "0x%02x".format(vm.peek(readPtr + it).toInt() and 0xFF) }.joinToString(" ")}")
-            println("TAV Debug: Tile (0,0) frame $frameCounter - Quantized Y coeffs (first 64):")
-            for (i in 0 until 8) {
-                for (j in 0 until 8) {
-                    print("${quantizedY[i * 8 + j]} ")
-                }
-                println()
-            }
-            
-            // Check how many non-zero coefficients we have
-            var nonZeroCount = 0
-            for (i in 0 until coeffCount) {
-                if (quantizedY[i] != 0.toShort()) nonZeroCount++
-            }
-            println("TAV Debug: Non-zero Y coefficients: $nonZeroCount out of $coeffCount")
-            
-            // Show all non-zero coefficients with their positions
-            println("TAV Debug: All non-zero Y coefficients:")
-            for (i in 0 until coeffCount) {
-                if (quantizedY[i] != 0.toShort()) {
-                    val row = i / 64
-                    val col = i % 64
-                    println("  Y[$row,$col] = ${quantizedY[i]}")
-                }
-            }
-            
-            println("qY=$qY, qCo=$qCo, qCg=$qCg, rcf=$rcf")
-        }
-        
-        for (i in 0 until coeffCount) {
-            yTile[i] = quantizedY[i] * qY * rcf
-            coTile[i] = quantizedCo[i] * qCo * rcf
-            cgTile[i] = quantizedCg[i] * qCg * rcf
-        }
-        
-        // Debug: compare expected vs actual DC values
-        if (tileX == 0 && tileY == 0 && frameCounter < 3) {
-            val expectedDC = 195 * 5 * 1.0f  // quantized_dc * qY * rcf
-            val actualDC = yTile[0] 
-            println("TAV Debug: DC comparison - quantized=${quantizedY[0]}, expected_dc=$expectedDC, actual_dc=$actualDC")
-            println("TAV Debug: Dequantized Y[0-15]: ${yTile.sliceArray(0..15).joinToString { "%.1f".format(it) }}")
-        }
-        
-        // Apply inverse DWT using 9/7 irreversible filter with 3 decomposition levels
-        applyDWTInverseMultiLevel(yTile, tileSize, tileSize, 3, 1)
-        applyDWTInverseMultiLevel(coTile, tileSize, tileSize, 3, 1)
-        applyDWTInverseMultiLevel(cgTile, tileSize, tileSize, 3, 1)
-        
-        
-        // Debug: check if we get reasonable values after DWT
-        if (tileX == 0 && tileY == 0 && frameCounter < 3) {
-            println("TAV Debug: Tile (0,0) frame $frameCounter - Y sample values after DWT:")
-            for (i in 0 until 8) {
-                for (j in 0 until 8) {
-                    print("%.2f ".format(yTile[i * tileSize + j]))
-                }
-                println()
-            }
-        }
-        
-        // Convert YCoCg to RGB and store in buffer
-        convertYCoCgTileToRGB(tileX, tileY, yTile, coTile, cgTile, currentRGBAddr, width, height)
-        
-        return ptr
-    }
-    
-    private fun decodeDWTInterTileRGB(readPtr: Long, tileX: Int, tileY: Int, mvX: Int, mvY: Int,
-                                    currentRGBAddr: Long, prevRGBAddr: Long,
-                                    width: Int, height: Int, qY: Int, qCo: Int, qCg: Int, rcf: Float,
-                                    waveletFilter: Int, decompLevels: Int, isLossless: Boolean): Long {
-        
-        // Step 1: Apply motion compensation
-        applyMotionCompensation64x64RGB(tileX, tileY, mvX, mvY, currentRGBAddr, prevRGBAddr, width, height)
-        
-        // Step 2: Add DWT residual (same as intra but add to existing pixels)
-        var ptr = readPtr
-        val tileSize = 64
-        val coeffCount = tileSize * tileSize
-        
-        // Read and decode residual (same as intra)
-        val quantizedY = ShortArray(coeffCount)
-        val quantizedCo = ShortArray(coeffCount)
-        val quantizedCg = ShortArray(coeffCount)
-        
-        for (i in 0 until coeffCount) {
-            quantizedY[i] = vm.peekShort(ptr)
-            ptr += 2
-        }
-        for (i in 0 until coeffCount) {
-            quantizedCo[i] = vm.peekShort(ptr)
-            ptr += 2
-        }
-        for (i in 0 until coeffCount) {
-            quantizedCg[i] = vm.peekShort(ptr)
-            ptr += 2
-        }
-        
-        val yResidual = FloatArray(coeffCount)
-        val coResidual = FloatArray(coeffCount)
-        val cgResidual = FloatArray(coeffCount)
-        
-        for (i in 0 until coeffCount) {
-            yResidual[i] = quantizedY[i] * qY * rcf
-            coResidual[i] = quantizedCo[i] * qCo * rcf
-            cgResidual[i] = quantizedCg[i] * qCg * rcf
-        }
-        
-        applyDWTInverseMultiLevel(yResidual, tileSize, tileSize, 3, 1)
-        applyDWTInverseMultiLevel(coResidual, tileSize, tileSize, 3, 1)
-        applyDWTInverseMultiLevel(cgResidual, tileSize, tileSize, 3, 1)
-        
-        // Add residual to motion-compensated prediction
-        addYCoCgResidualToRGBTile(tileX, tileY, yResidual, coResidual, cgResidual, currentRGBAddr, width, height)
-        
-        return ptr
-    }
-    
-    private fun applyMotionCompensation64x64RGB(tileX: Int, tileY: Int, mvX: Int, mvY: Int,
-                                              currentRGBAddr: Long, prevRGBAddr: Long, 
-                                              width: Int, height: Int) {
-        val tileSize = 64
-        val startX = tileX * tileSize
-        val startY = tileY * tileSize
-        
-        // Motion vectors in quarter-pixel precision
-        val refX = startX + (mvX / 4.0f)
-        val refY = startY + (mvY / 4.0f)
-        
-        for (y in 0 until tileSize) {
-            for (x in 0 until tileSize) {
-                val currentPixelIdx = (startY + y) * width + (startX + x)
-                
-                if (currentPixelIdx >= 0 && currentPixelIdx < width * height) {
-                    // Bilinear interpolation for sub-pixel motion vectors
-                    val srcX = refX + x
-                    val srcY = refY + y
-                    
-                    val interpolatedRGB = bilinearInterpolateRGB(prevRGBAddr, width, height, srcX, srcY)
-                    
-                    val rgbOffset = currentPixelIdx * 3L
-                    vm.poke(currentRGBAddr + rgbOffset, interpolatedRGB[0])
-                    vm.poke(currentRGBAddr + rgbOffset + 1, interpolatedRGB[1])
-                    vm.poke(currentRGBAddr + rgbOffset + 2, interpolatedRGB[2])
-                }
-            }
-        }
-    }
-    
-    private fun bilinearInterpolateRGB(rgbPtr: Long, width: Int, height: Int, x: Float, y: Float): ByteArray {
-        val x0 = kotlin.math.floor(x).toInt()
-        val y0 = kotlin.math.floor(y).toInt()
-        val x1 = x0 + 1
-        val y1 = y0 + 1
-        
-        if (x0 < 0 || y0 < 0 || x1 >= width || y1 >= height) {
-            return byteArrayOf(0, 0, 0)  // Out of bounds - return black
-        }
-        
-        val fx = x - x0
-        val fy = y - y0
-        
-        // Get 4 corner pixels
-        val rgb00 = getRGBPixel(rgbPtr, y0 * width + x0)
-        val rgb10 = getRGBPixel(rgbPtr, y0 * width + x1) 
-        val rgb01 = getRGBPixel(rgbPtr, y1 * width + x0)
-        val rgb11 = getRGBPixel(rgbPtr, y1 * width + x1)
-        
-        // Bilinear interpolation
-        val result = ByteArray(3)
-        for (c in 0..2) {
-            val interp = (1 - fx) * (1 - fy) * (rgb00[c].toInt() and 0xFF) +
-                        fx * (1 - fy) * (rgb10[c].toInt() and 0xFF) +
-                        (1 - fx) * fy * (rgb01[c].toInt() and 0xFF) +
-                        fx * fy * (rgb11[c].toInt() and 0xFF)
-            result[c] = interp.toInt().coerceIn(0, 255).toByte()
-        }
-        
-        return result
-    }
-    
-    private fun getRGBPixel(rgbPtr: Long, pixelIdx: Int): ByteArray {
-        val offset = pixelIdx * 3L
-        return byteArrayOf(
-            vm.peek(rgbPtr + offset),
-            vm.peek(rgbPtr + offset + 1), 
-            vm.peek(rgbPtr + offset + 2)
-        )
-    }
-    
-    private fun convertYCoCgTileToRGB(tileX: Int, tileY: Int, yTile: FloatArray, coTile: FloatArray, cgTile: FloatArray,
-                                    rgbAddr: Long, width: Int, height: Int) {
-        val tileSize = 64
-        val startX = tileX * tileSize
-        val startY = tileY * tileSize
-        
-        for (y in 0 until tileSize) {
-            for (x in 0 until tileSize) {
-                val frameX = startX + x
-                val frameY = startY + y
-                
-                if (frameX < width && frameY < height) {
-                    val tileIdx = y * tileSize + x
-                    val pixelIdx = frameY * width + frameX
-                    
-                    // YCoCg-R to RGB conversion (exact inverse of encoder)
-                    val Y = yTile[tileIdx]
-                    val Co = coTile[tileIdx] 
-                    val Cg = cgTile[tileIdx]
-                    
-                    // Inverse of encoder's YCoCg-R transform:
-                    // Forward: Co = r - b; tmp = b + Co/2; Cg = g - tmp; Y = tmp + Cg/2
-                    val tmp = Y - Cg / 2.0f
-                    val g = Cg + tmp
-                    val b = tmp - Co / 2.0f
-                    val r = Co + b
-                    
-                    val rgbOffset = pixelIdx * 3L
-                    vm.poke(rgbAddr + rgbOffset, r.toInt().coerceIn(0, 255).toByte())
-                    vm.poke(rgbAddr + rgbOffset + 1, g.toInt().coerceIn(0, 255).toByte())
-                    vm.poke(rgbAddr + rgbOffset + 2, b.toInt().coerceIn(0, 255).toByte())
-                }
-            }
-        }
-    }
-    
-    private fun addYCoCgResidualToRGBTile(tileX: Int, tileY: Int, yRes: FloatArray, coRes: FloatArray, cgRes: FloatArray,
-                                        rgbAddr: Long, width: Int, height: Int) {
-        val tileSize = 64
-        val startX = tileX * tileSize
-        val startY = tileY * tileSize
-        
-        for (y in 0 until tileSize) {
-            for (x in 0 until tileSize) {
-                val frameX = startX + x
-                val frameY = startY + y
-                
-                if (frameX < width && frameY < height) {
-                    val tileIdx = y * tileSize + x
-                    val pixelIdx = frameY * width + frameX
-                    val rgbOffset = pixelIdx * 3L
-                    
-                    // Get current RGB (from motion compensation)
-                    val curR = (vm.peek(rgbAddr + rgbOffset).toInt() and 0xFF).toFloat()
-                    val curG = (vm.peek(rgbAddr + rgbOffset + 1).toInt() and 0xFF).toFloat()
-                    val curB = (vm.peek(rgbAddr + rgbOffset + 2).toInt() and 0xFF).toFloat()
-                    
-                    // Convert current RGB back to YCoCg
-                    val co = (curR - curB) / 2
-                    val tmp = curB + co
-                    val cg = (curG - tmp) / 2
-                    val yPred = tmp + cg
-                    
-                    // Add residual
-                    val yFinal = yPred + yRes[tileIdx]
-                    val coFinal = co + coRes[tileIdx]
-                    val cgFinal = cg + cgRes[tileIdx]
-                    
-                    // Convert back to RGB
-                    val tmpFinal = yFinal - cgFinal
-                    val gFinal = yFinal + cgFinal
-                    val bFinal = tmpFinal - coFinal
-                    val rFinal = tmpFinal + coFinal
-                    
-                    vm.poke(rgbAddr + rgbOffset, rFinal.toInt().coerceIn(0, 255).toByte())
-                    vm.poke(rgbAddr + rgbOffset + 1, gFinal.toInt().coerceIn(0, 255).toByte())
-                    vm.poke(rgbAddr + rgbOffset + 2, bFinal.toInt().coerceIn(0, 255).toByte())
-                }
-            }
-        }
-    }
-
-    /**
-     * 2D DWT forward/inverse transform
-     * Supports both 5/3 reversible and 9/7 irreversible filters
-     */
-    fun tavDWT2D(
-        inputPtr: Long, outputPtr: Long,
-        width: Int, height: Int,
-        levels: Int, filterType: Int,
-        isForward: Boolean
-    ) {
-        // Copy input data to working buffer
-        for (i in 0 until width * height) {
-            dwtTempBuffer[i] = vm.peekFloat(inputPtr + i * 4L)!!
-        }
-
-        if (isForward) {
-            // Forward DWT - decompose into subbands
-            for (level in 0 until levels) {
-                val levelWidth = width shr level
-                val levelHeight = height shr level
-
-                if (filterType == 0) {
-                    applyDWT53Forward(dwtTempBuffer, levelWidth, levelHeight)
-                } else {
-                    applyDWT97Forward(dwtTempBuffer, levelWidth, levelHeight)
-                }
-            }
-        } else {
-            // Inverse DWT - reconstruct from subbands
-            for (level in levels - 1 downTo 0) {
-                val levelWidth = width shr level
-                val levelHeight = height shr level
-
-                if (filterType == 0) {
-                    applyDWT53Inverse(dwtTempBuffer, levelWidth, levelHeight)
-                } else {
-                    applyDWT97Inverse(dwtTempBuffer, levelWidth, levelHeight)
-                }
-            }
-        }
-
-        // Copy result to output
-        for (i in 0 until width * height) {
-            vm.pokeFloat(outputPtr + i * 4L, dwtTempBuffer[i])
-        }
-    }
-
-    /**
-     * Multi-band quantization for DWT subbands
-     */
-    fun tavQuantize(
-        subbandPtr: Long, quantTable: IntArray,
-        width: Int, height: Int,
-        isInverse: Boolean
-    ) {
-        val size = width * height
-
-        if (isInverse) {
-            // Dequantization
-            for (i in 0 until size) {
-                val quantized = vm.peekShort(subbandPtr + i * 2L)!!.toInt()
-                val dequantized = quantized * quantTable[i % quantTable.size]
-                vm.pokeFloat(subbandPtr + i * 4L, dequantized.toFloat())
-            }
-        } else {
-            // Quantization
-            for (i in 0 until size) {
-                val value = vm.peekFloat(subbandPtr + i * 4L)!!
-                val quantized = (value / quantTable[i % quantTable.size]).toInt()
-                vm.pokeShort(subbandPtr + i * 2L, quantized.toShort())
-            }
-        }
-    }
-
-    /**
-     * 64x64 tile motion compensation with bilinear interpolation
-     */
-    fun tavMotionCompensate64x64(
-        currentTilePtr: Long, refFramePtr: Long,
-        tileX: Int, tileY: Int,
-        mvX: Int, mvY: Int,
-        width: Int, height: Int
-    ) {
-        val tileSize = 64
-        val startX = tileX * tileSize
-        val startY = tileY * tileSize
-
-        // Motion vector in 1/4 pixel precision
-        val refX = startX + (mvX / 4.0f)
-        val refY = startY + (mvY / 4.0f)
-
-        for (y in 0 until tileSize) {
-            for (x in 0 until tileSize) {
-                val currentPixelIdx = (startY + y) * width + (startX + x)
-
-                if (currentPixelIdx >= 0 && currentPixelIdx < width * height) {
-                    // Bilinear interpolation for sub-pixel motion vectors
-                    val interpolatedValue = bilinearInterpolate(
-                        refFramePtr, width, height,
-                        refX + x, refY + y
-                    )
-
-                    vm.pokeFloat(
-                        currentTilePtr + currentPixelIdx * 4L,
-                        interpolatedValue
-                    )
-                }
-            }
-        }
-    }
-
-    // Private helper functions for TAV implementation
-
-    private fun copyTileFromPrevious(
-        tileX: Int, tileY: Int,
-        currentYPtr: Long, currentCoPtr: Long, currentCgPtr: Long,
-        prevYPtr: Long, prevCoPtr: Long, prevCgPtr: Long,
-        width: Int, height: Int
-    ) {
-        val tileSize = 64
-        val startX = tileX * tileSize
-        val startY = tileY * tileSize
-
-        for (y in 0 until tileSize) {
-            for (x in 0 until tileSize) {
-                val pixelIdx = (startY + y) * width + (startX + x)
-                if (pixelIdx >= 0 && pixelIdx < width * height) {
-                    val prevY = vm.peekFloat(prevYPtr + pixelIdx * 4L)!!
-                    val prevCo = vm.peekFloat(prevCoPtr + pixelIdx * 4L)!!
-                    val prevCg = vm.peekFloat(prevCgPtr + pixelIdx * 4L)!!
-
-                    vm.pokeFloat(currentYPtr + pixelIdx * 4L, prevY)
-                    vm.pokeFloat(currentCoPtr + pixelIdx * 4L, prevCo)
-                    vm.pokeFloat(currentCgPtr + pixelIdx * 4L, prevCg)
-                }
-            }
-        }
-    }
-
-    // Global tile data reader state
-    private var currentTileDataPtr: Long = 0L
-    private var currentTileOffset: Int = 0
-
-    private fun decodeDWTTile(
-        tileX: Int, tileY: Int,
-        currentYPtr: Long, currentCoPtr: Long, currentCgPtr: Long,
-        width: Int, height: Int,
-        qY: Int, qCo: Int, qCg: Int, rcf: Float,
-        waveletFilter: Int, decompLevels: Int,
-        isLossless: Boolean
-    ) {
-        val tileSize = 64
-        val coeffCount = tileSize * tileSize
-
-        // Read quantized DWT coefficients for Y, Co, Cg channels
-        val quantizedY = ShortArray(coeffCount)
-        val quantizedCo = ShortArray(coeffCount)
-        val quantizedCg = ShortArray(coeffCount)
-
-        // Read from compressed data stream (currentTileDataPtr + currentTileOffset)
-        val dataPtr = currentTileDataPtr + currentTileOffset
-
-        // Read Y coefficients
-        for (i in 0 until coeffCount) {
-            quantizedY[i] = vm.peekShort(dataPtr + i * 2L)!!
-        }
-        currentTileOffset += coeffCount * 2
-
-        // Read Co coefficients
-        for (i in 0 until coeffCount) {
-            quantizedCo[i] = vm.peekShort(dataPtr + currentTileOffset + i * 2L)!!
-        }
-        currentTileOffset += coeffCount * 2
-
-        // Read Cg coefficients
-        for (i in 0 until coeffCount) {
-            quantizedCg[i] = vm.peekShort(dataPtr + currentTileOffset + i * 2L)!!
-        }
-        currentTileOffset += coeffCount * 2
-
-        // Dequantize coefficients
-        val dequantizedY = FloatArray(coeffCount)
-        val dequantizedCo = FloatArray(coeffCount)
-        val dequantizedCg = FloatArray(coeffCount)
-
-        for (i in 0 until coeffCount) {
-            dequantizedY[i] = quantizedY[i].toFloat() * qY * rcf
-            dequantizedCo[i] = quantizedCo[i].toFloat() * qCo * rcf
-            dequantizedCg[i] = quantizedCg[i].toFloat() * qCg * rcf
-        }
-
-        // Apply inverse DWT to reconstruct tile
-        if (waveletFilter == 0) { // 5/3 reversible
-            applyDWTInverseMultiLevel(dequantizedY, tileSize, tileSize, 3, 0)
-            applyDWTInverseMultiLevel(dequantizedCo, tileSize, tileSize, 3, 0)
-            applyDWTInverseMultiLevel(dequantizedCg, tileSize, tileSize, 3, 0)
-        } else { // 9/7 irreversible
-            applyDWTInverseMultiLevel(dequantizedY, tileSize, tileSize, 3, 1)
-            applyDWTInverseMultiLevel(dequantizedCo, tileSize, tileSize, 3, 1)
-            applyDWTInverseMultiLevel(dequantizedCg, tileSize, tileSize, 3, 1)
-        }
-
-        // Copy reconstructed data to frame buffers
-        val startX = tileX * tileSize
-        val startY = tileY * tileSize
-
-        for (y in 0 until tileSize) {
-            for (x in 0 until tileSize) {
-                val frameX = startX + x
-                val frameY = startY + y
-
-                if (frameX < width && frameY < height) {
-                    val pixelIdx = frameY * width + frameX
-                    val tileIdx = y * tileSize + x
-
-                    vm.pokeFloat(currentYPtr + pixelIdx * 4L, dequantizedY[tileIdx])
-                    vm.pokeFloat(currentCoPtr + pixelIdx * 4L, dequantizedCo[tileIdx])
-                    vm.pokeFloat(currentCgPtr + pixelIdx * 4L, dequantizedCg[tileIdx])
-                }
-            }
-        }
-
-
-    }
-
-    private fun decodeDWTTileWithMotion(
-        tileX: Int, tileY: Int, mvX: Int, mvY: Int,
-        currentYPtr: Long, currentCoPtr: Long, currentCgPtr: Long,
-        prevYPtr: Long, prevCoPtr: Long, prevCgPtr: Long,
-        width: Int, height: Int,
-        qY: Int, qCo: Int, qCg: Int, rcf: Float,
-        waveletFilter: Int, decompLevels: Int,
-        isLossless: Boolean
-    ) {
-        val tileSize = 64
-        val coeffCount = tileSize * tileSize
-
-        // Step 1: Apply motion compensation from previous frame
-        applyMotionCompensation64x64(
-            tileX, tileY, mvX, mvY,
-            currentYPtr, currentCoPtr, currentCgPtr,
-            prevYPtr, prevCoPtr, prevCgPtr,
-            width, height
-        )
-
-        // Step 2: Read and decode DWT residual coefficients
-        val quantizedY = ShortArray(coeffCount)
-        val quantizedCo = ShortArray(coeffCount)
-        val quantizedCg = ShortArray(coeffCount)
-
-        // Read from compressed data stream
-        val dataPtr = currentTileDataPtr + currentTileOffset
-
-        // Read Y residual coefficients
-        for (i in 0 until coeffCount) {
-            quantizedY[i] = vm.peekShort(dataPtr + i * 2L)!!
-        }
-        currentTileOffset += coeffCount * 2
-
-        // Read Co residual coefficients
-        for (i in 0 until coeffCount) {
-            quantizedCo[i] = vm.peekShort(dataPtr + currentTileOffset + i * 2L)!!
-        }
-        currentTileOffset += coeffCount * 2
-
-        // Read Cg residual coefficients
-        for (i in 0 until coeffCount) {
-            quantizedCg[i] = vm.peekShort(dataPtr + currentTileOffset + i * 2L)!!
-        }
-        currentTileOffset += coeffCount * 2
-
-        // Dequantize residual coefficients
-        val residualY = FloatArray(coeffCount)
-        val residualCo = FloatArray(coeffCount)
-        val residualCg = FloatArray(coeffCount)
-
-        for (i in 0 until coeffCount) {
-            residualY[i] = quantizedY[i].toFloat() * qY * rcf
-            residualCo[i] = quantizedCo[i].toFloat() * qCo * rcf
-            residualCg[i] = quantizedCg[i].toFloat() * qCg * rcf
-        }
-
-        // Apply inverse DWT to reconstruct residual
-        if (waveletFilter == 0) { // 5/3 reversible
-            applyDWTInverseMultiLevel(residualY, tileSize, tileSize, 3, 0)
-            applyDWTInverseMultiLevel(residualCo, tileSize, tileSize, 3, 0)
-            applyDWTInverseMultiLevel(residualCg, tileSize, tileSize, 3, 0)
-        } else { // 9/7 irreversible
-            applyDWTInverseMultiLevel(residualY, tileSize, tileSize, 3, 1)
-            applyDWTInverseMultiLevel(residualCo, tileSize, tileSize, 3, 1)
-            applyDWTInverseMultiLevel(residualCg, tileSize, tileSize, 3, 1)
-        }
-
-        // Step 3: Add residual to motion-compensated prediction
-        val startX = tileX * tileSize
-        val startY = tileY * tileSize
-
-        for (y in 0 until tileSize) {
-            for (x in 0 until tileSize) {
-                val frameX = startX + x
-                val frameY = startY + y
-
-                if (frameX < width && frameY < height) {
-                    val pixelIdx = frameY * width + frameX
-                    val tileIdx = y * tileSize + x
-
-                    // Add residual to motion-compensated prediction
-                    val predY = vm.peekFloat(currentYPtr + pixelIdx * 4L)!!
-                    val predCo = vm.peekFloat(currentCoPtr + pixelIdx * 4L)!!
-                    val predCg = vm.peekFloat(currentCgPtr + pixelIdx * 4L)!!
-
-                    vm.pokeFloat(currentYPtr + pixelIdx * 4L, predY + residualY[tileIdx])
-                    vm.pokeFloat(currentCoPtr + pixelIdx * 4L, predCo + residualCo[tileIdx])
-                    vm.pokeFloat(currentCgPtr + pixelIdx * 4L, predCg + residualCg[tileIdx])
-                }
-            }
-        }
-    }
-
-    private fun applyMotionCompensation64x64(
-        tileX: Int, tileY: Int, mvX: Int, mvY: Int,
-        currentYPtr: Long, currentCoPtr: Long, currentCgPtr: Long,
-        prevYPtr: Long, prevCoPtr: Long, prevCgPtr: Long,
-        width: Int, height: Int
-    ) {
-        tavMotionCompensate64x64(currentYPtr, prevYPtr, tileX, tileY, mvX, mvY, width, height)
-        tavMotionCompensate64x64(currentCoPtr, prevCoPtr, tileX, tileY, mvX, mvY, width, height)
-        tavMotionCompensate64x64(currentCgPtr, prevCgPtr, tileX, tileY, mvX, mvY, width, height)
-    }
-
-    private fun applyDWT53Forward(data: FloatArray, width: Int, height: Int) {
-        // TODO: Implement 5/3 forward DWT
-        // Lifting scheme implementation for 5/3 reversible filter
-    }
-
-    private fun applyDWT53Inverse(data: FloatArray, width: Int, height: Int) {
-        // 5/3 reversible DWT inverse using lifting scheme
-        // First apply horizontal inverse DWT on all rows
-        val tempRow = FloatArray(width)
-        for (y in 0 until height) {
-            for (x in 0 until width) {
-                tempRow[x] = data[y * width + x]
-            }
-            applyLift53InverseHorizontal(tempRow, width)
-            for (x in 0 until width) {
-                data[y * width + x] = tempRow[x]
-            }
-        }
-
-        // Then apply vertical inverse DWT on all columns
-        val tempCol = FloatArray(height)
-        for (x in 0 until width) {
-            for (y in 0 until height) {
-                tempCol[y] = data[y * width + x]
-            }
-            applyLift53InverseVertical(tempCol, height)
-            for (y in 0 until height) {
-                data[y * width + x] = tempCol[y]
-            }
-        }
-    }
-
-    private fun applyDWT97Forward(data: FloatArray, width: Int, height: Int) {
-        // TODO: Implement 9/7 forward DWT
-        // Lifting scheme implementation for 9/7 irreversible filter
-    }
-
-    private fun applyDWTInverseMultiLevel(data: FloatArray, width: Int, height: Int, levels: Int, filterType: Int) {
-        // Multi-level inverse DWT - reconstruct from smallest to largest (reverse of encoder)
-        val size = width // Full tile size (64)
-        val tempRow = FloatArray(size)
-        val tempCol = FloatArray(size)
-        
-        for (level in levels - 1 downTo 0) {
-            val currentSize = size shr level
-            if (currentSize < 2) break
-            
-            // Apply inverse DWT to current subband region - EXACT match to encoder
-            // The encoder does ROW transform first, then COLUMN transform
-            // So inverse must do COLUMN inverse first, then ROW inverse
-            
-            // Column inverse transform first
-            for (x in 0 until currentSize) {
-                for (y in 0 until currentSize) {
-                    tempCol[y] = data[y * size + x]
-                }
-                
-                if (filterType == 0) {
-                    applyDWT53Inverse1D(tempCol, currentSize)
-                } else {
-                    applyDWT97Inverse1D(tempCol, currentSize)
-                }
-                
-                for (y in 0 until currentSize) {
-                    data[y * size + x] = tempCol[y]
-                }
-            }
-            
-            // Row inverse transform second  
-            for (y in 0 until currentSize) {
-                for (x in 0 until currentSize) {
-                    tempRow[x] = data[y * size + x]
-                }
-                
-                if (filterType == 0) {
-                    applyDWT53Inverse1D(tempRow, currentSize)
-                } else {
-                    applyDWT97Inverse1D(tempRow, currentSize)
-                }
-                
-                for (x in 0 until currentSize) {
-                    data[y * size + x] = tempRow[x]
-                }
-            }
-        }
-    }
-
-    private fun applyDWT97Inverse(data: FloatArray, width: Int, height: Int) {
-        // 9/7 irreversible DWT inverse using lifting scheme
-        // First apply horizontal inverse DWT on all rows
-        val tempRow = FloatArray(width)
-        for (y in 0 until height) {
-            for (x in 0 until width) {
-                tempRow[x] = data[y * width + x]
-            }
-            applyLift97InverseHorizontal(tempRow, width)
-            for (x in 0 until width) {
-                data[y * width + x] = tempRow[x]
-            }
-        }
-
-        // Then apply vertical inverse DWT on all columns
-        val tempCol = FloatArray(height)
-        for (x in 0 until width) {
-            for (y in 0 until height) {
-                tempCol[y] = data[y * width + x]
-            }
-            applyLift97InverseVertical(tempCol, height)
-            for (y in 0 until height) {
-                data[y * width + x] = tempCol[y]
-            }
-        }
-    }
-
-    private fun applyLift97InverseHorizontal(row: FloatArray, width: Int) { TODO() }
-    private fun applyLift97InverseVertical(col: FloatArray, height: Int) { TODO() }
-
-    // 1D lifting scheme implementations for 5/3 filter
-    private fun applyLift53InverseHorizontal(data: FloatArray, length: Int) {
-        if (length < 2) return
-
-        val temp = FloatArray(length)
-        val half = (length + 1) / 2
-
-        // Separate even and odd samples (inverse interleaving)
-        for (i in 0 until half) {
-            temp[i] = data[2 * i] // Even samples (low-pass)
-        }
-        for (i in 0 until length / 2) {
-            temp[half + i] = data[2 * i + 1] // Odd samples (high-pass)
-        }
-
-        // Inverse lifting steps for 5/3 filter
-        // Step 2: Undo update step - even[i] -= (odd[i-1] + odd[i] + 2) >> 2
-        for (i in 1 until half) {
-            val oddPrev = if (i - 1 >= 0) temp[half + i - 1] else 0.0f
-            val oddCurr = if (i < length / 2) temp[half + i] else 0.0f
-            temp[i] += (oddPrev + oddCurr + 2.0f) / 4.0f
-        }
-        if (half > 0) {
-            val oddCurr = if (0 < length / 2) temp[half] else 0.0f
-            temp[0] += oddCurr / 2.0f
-        }
-
-        // Step 1: Undo predict step - odd[i] += (even[i] + even[i+1]) >> 1
-        for (i in 0 until length / 2) {
-            val evenCurr = temp[i]
-            val evenNext = if (i + 1 < half) temp[i + 1] else temp[half - 1]
-            temp[half + i] -= (evenCurr + evenNext) / 2.0f
-        }
-
-        // Interleave back
-        for (i in 0 until half) {
-            data[2 * i] = temp[i]
-        }
-        for (i in 0 until length / 2) {
-            data[2 * i + 1] = temp[half + i]
-        }
-    }
-
-    private fun applyLift53InverseVertical(data: FloatArray, length: Int) {
-        // Same as horizontal but for vertical direction
-        applyLift53InverseHorizontal(data, length)
-    }
-
-    // 1D lifting scheme implementations for 9/7 irreversible filter
-    private fun applyDWT97Inverse1D(data: FloatArray, length: Int) {
-        if (length < 2) return
-
-        val temp = FloatArray(length)
-        val half = length / 2
-
-        // Split into low and high frequency components (matching encoder layout)
-        // After forward DWT: first half = low-pass, second half = high-pass
-        for (i in 0 until half) {
-            temp[i] = data[i]              // Low-pass coefficients (first half)
-            temp[half + i] = data[half + i] // High-pass coefficients (second half)
-        }
-
-        // 9/7 inverse lifting coefficients (exactly matching encoder)
-        val alpha = -1.586134342f
-        val beta = -0.052980118f  
-        val gamma = 0.882911076f
-        val delta = 0.443506852f
-        val K = 1.230174105f
-
-        // Inverse lifting steps (undo forward steps in reverse order)
-        
-        // Step 5: Undo scaling (reverse of encoder's final step)
-        for (i in 0 until half) {
-            temp[i] /= K  // Undo temp[i] *= K
-            temp[half + i] *= K  // Undo temp[half + i] /= K
-        }
-
-        // Step 4: Undo update step (delta) 
-        for (i in 0 until half) {
-            val left = if (i > 0) temp[half + i - 1] else temp[half + i]
-            val right = if (i < half - 1) temp[half + i + 1] else temp[half + i]
-            temp[i] -= delta * (left + right)
-        }
-
-        // Step 3: Undo predict step (gamma)
-        for (i in 0 until half) {
-            val left = if (i > 0) temp[i - 1] else temp[i]
-            val right = if (i < half - 1) temp[i + 1] else temp[i]
-            temp[half + i] -= gamma * (left + right)
-        }
-
-        // Step 2: Undo update step (beta)
-        for (i in 0 until half) {
-            val left = if (i > 0) temp[half + i - 1] else temp[half + i]
-            val right = if (i < half - 1) temp[half + i + 1] else temp[half + i]
-            temp[i] -= beta * (left + right)
-        }
-
-        // Step 1: Undo predict step (alpha)
-        for (i in 0 until half) {
-            val left = if (i > 0) temp[i - 1] else temp[i]
-            val right = if (i < half - 1) temp[i + 1] else temp[i]
-            temp[half + i] -= alpha * (left + right)
-        }
-
-        // Merge back (inverse of encoder's split)
-        for (i in 0 until half) {
-            data[2 * i] = temp[i]           // Even positions get low-pass
-            if (2 * i + 1 < length) {
-                data[2 * i + 1] = temp[half + i] // Odd positions get high-pass
-            }
-        }
-    }
-
-    private fun applyDWT53Inverse1D(data: FloatArray, length: Int) {
-        if (length < 2) return
-
-        val temp = FloatArray(length)
-        val half = length / 2
-
-        // Split into low and high frequency components (matching encoder layout)
-        for (i in 0 until half) {
-            temp[i] = data[i]              // Low-pass coefficients (first half)
-            temp[half + i] = data[half + i] // High-pass coefficients (second half)
-        }
-
-        // 5/3 inverse lifting (undo forward steps in reverse order)
-        
-        // Step 2: Undo update step (1/4 coefficient)
-        for (i in 0 until half) {
-            val left = if (i > 0) temp[half + i - 1] else 0.0f
-            val right = if (i < half - 1) temp[half + i] else 0.0f
-            temp[i] -= 0.25f * (left + right)
-        }
-
-        // Step 1: Undo predict step (1/2 coefficient)
-        for (i in 0 until half) {
-            val left = temp[i]
-            val right = if (i < half - 1) temp[i + 1] else temp[i]
-            temp[half + i] -= 0.5f * (left + right)
-        }
-
-        // Merge back (inverse of encoder's split)
-        for (i in 0 until half) {
-            data[2 * i] = temp[i]           // Even positions get low-pass
-            if (2 * i + 1 < length) {
-                data[2 * i + 1] = temp[half + i] // Odd positions get high-pass
-            }
-        }
-    }
-
-
-    private fun bilinearInterpolate(
-        dataPtr: Long, width: Int, height: Int,
-        x: Float, y: Float
-    ): Float {
-        val x0 = floor(x).toInt()
-        val y0 = floor(y).toInt()
-        val x1 = x0 + 1
-        val y1 = y0 + 1
-
-        if (x0 < 0 || y0 < 0 || x1 >= width || y1 >= height) {
-            return 0.0f  // Out of bounds
-        }
-
-        val fx = x - x0
-        val fy = y - y0
-
-        val p00 = vm.peekFloat(dataPtr + (y0 * width + x0) * 4L)!!
-        val p10 = vm.peekFloat(dataPtr + (y0 * width + x1) * 4L)!!
-        val p01 = vm.peekFloat(dataPtr + (y1 * width + x0) * 4L)!!
-        val p11 = vm.peekFloat(dataPtr + (y1 * width + x1) * 4L)!!
-        
-        return p00 * (1 - fx) * (1 - fy) +
-               p10 * fx * (1 - fy) +
-               p01 * (1 - fx) * fy +
-               p11 * fx * fy
-    }
-
-
-    fun renderYCoCgToDisplay(
-        yPtr: Long, coPtr: Long, cgPtr: Long,
-        width: Int, height: Int
-    ) {
-        // Convert YCoCg to RGB and render to display
-        for (y in 0 until height) {
-            for (x in 0 until width) {
-                val idx = y * width + x
-                val Y = vm.peekFloat(yPtr + idx * 4L)!!
-                val Co = vm.peekFloat(coPtr + idx * 4L)!!
-                val Cg = vm.peekFloat(cgPtr + idx * 4L)!!
-
-                // YCoCg to RGB conversion
-                val tmp = Y - Cg
-                val G = Y + Cg
-                val B = tmp - Co
-                val R = tmp + Co
-
-                // Clamp to 0-255 and convert to 4-bit RGB for TSVM display
-                val r4 = (R.toInt().coerceIn(0, 255) / 16).coerceIn(0, 15)
-                val g4 = (G.toInt().coerceIn(0, 255) / 16).coerceIn(0, 15)
-                val b4 = (B.toInt().coerceIn(0, 255) / 16).coerceIn(0, 15)
-
-                val rg = r4.shl(4) or g4
-                val ba = b4.shl(4) or 15
-                plotPixel(x, y, rg)
-                plotPixel(x, y, ba)
-            }
-        }
-    }
-
 }
\ No newline at end of file
diff --git a/video_encoder/encoder_tev.c b/video_encoder/encoder_tev.c
index d39a957..600a98f 100644
--- a/video_encoder/encoder_tev.c
+++ b/video_encoder/encoder_tev.c
@@ -1,5 +1,5 @@
 // Created by Claude on 2025-08-18.
-// TEV (TSVM Enhanced Video) Encoder - YCoCg-R 4:2:0 16x16 Block Version
+// TEV (TSVM Enhanced Video) Encoder - YCoCg-R/ICtCp 4:2:0 16x16 Block Version
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
@@ -68,7 +68,9 @@ static inline float float16_to_float(uint16_t hbits) {
 
 // TSVM Enhanced Video (TEV) format constants
 #define TEV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x45\x56"  // "\x1FTSVM TEV"
-#define TEV_VERSION 2  // Updated for YCoCg-R 4:2:0
+// TEV version - dynamic based on color space mode
+// Version 2: YCoCg-R 4:2:0 (default)
+// Version 3: ICtCp 4:2:0 (--ictcp flag)
 // version 1: 8x8 RGB
 // version 2: 16x16 Y, 8x8 Co/Cg, asymetric quantisation, optional quantiser multiplier for rate control multiplier (1.0 when unused) {current winner}
 // version 3: version 2 + internal 6-bit processing (discarded due to higher noise floor)
@@ -152,7 +154,6 @@ static const uint32_t QUANT_TABLE_C[HALF_BLOCK_SIZE_SQR] =
      99, 99, 99, 99, 99, 99, 99, 99,
      99, 99, 99, 99, 99, 99, 99, 99};
 
-
 // Audio constants (reuse MP2 from existing system)
 #define MP2_SAMPLE_RATE 32000
 #define MP2_DEFAULT_PACKET_SIZE 1728
@@ -192,17 +193,6 @@ typedef struct __attribute__((packed)) {
     int16_t cg_coeffs[HALF_BLOCK_SIZE_SQR];  // quantised Cg DCT coefficients (8x8)
 } tev_block_t;
 
-// Lossless TEV block structure (uses float32 internally, converted to float16 during serialization)
-typedef struct __attribute__((packed)) {
-    uint8_t mode;           // Block encoding mode
-    int16_t mv_x, mv_y;     // Motion vector (1/4 pixel precision)
-    float rate_control_factor; // Always 1.0f in lossless mode
-    uint16_t cbp;           // Coded block pattern (which channels have non-zero coeffs)
-    float y_coeffs[BLOCK_SIZE_SQR];  // lossless Y DCT coefficients (16x16)
-    float co_coeffs[HALF_BLOCK_SIZE_SQR];  // lossless Co DCT coefficients (8x8)
-    float cg_coeffs[HALF_BLOCK_SIZE_SQR];  // lossless Cg DCT coefficients (8x8)
-} tev_lossless_block_t;
-
 // Subtitle entry structure
 typedef struct subtitle_entry {
     int start_frame;
@@ -232,7 +222,7 @@ typedef struct {
     int qualityCg;
     int verbose;
     int disable_rcf;          // 0 = rcf enabled, 1 = disabled
-    int lossless_mode;    // 0 = lossy (default), 1 = lossless mode
+    int ictcp_mode;       // 0 = YCoCg-R (default), 1 = ICtCp color space
 
     // Bitrate control
     int target_bitrate_kbps;  // Target bitrate in kbps (0 = quality mode)
@@ -289,6 +279,10 @@ typedef struct {
     int complexity_capacity;  // Capacity of complexity_values array
 } tev_encoder_t;
 
+//////////////////////////
+// COLOUR MATHS CODES //
+//////////////////////////
+
 // RGB to YCoCg-R transform (per YCoCg-R specification with truncated division)
 static void rgb_to_ycocgr(uint8_t r, uint8_t g, uint8_t b, int *y, int *co, int *cg) {
     *co = (int)r - (int)b;
@@ -315,6 +309,189 @@ static void ycocgr_to_rgb(int y, int co, int cg, uint8_t *r, uint8_t *g, uint8_t
     *b = CLAMP(*b, 0, 255);
 }
 
+// ---------------------- ICtCp Implementation ----------------------
+
+static inline int iround(double v) { return (int)floor(v + 0.5); }
+
+// ---------------------- sRGB gamma helpers ----------------------
+static inline double srgb_linearize(double val) {
+    // val in [0,1]
+    if (val <= 0.04045) return val / 12.92;
+    return pow((val + 0.055) / 1.055, 2.4);
+}
+static inline double srgb_unlinearize(double val) {
+    // val in [0,1]
+    if (val <= 0.0031308) return val * 12.92;
+    return 1.055 * pow(val, 1.0 / 2.4) - 0.055;
+}
+
+// -------------------------- HLG --------------------------
+// Forward HLG OETF (linear -> HLG)
+static inline double HLG_OETF(double L) {
+    // L in [0,1], relative scene-linear
+    const double a = 0.17883277;
+    const double b = 1.0 - 4.0 * a;
+    const double c = 0.5 - a * log(4.0 * a);
+
+    if (L <= 1.0/12.0)
+        return sqrt(3.0 * L);
+    else
+        return a * log(12.0 * L - b) + c;
+}
+
+// Inverse HLG OETF (HLG -> linear)
+static inline double HLG_inverse_OETF(double V) {
+    const double a = 0.17883277;
+    const double b = 1.0 - 4.0 * a;
+    const double c = 0.5 - a * log(4.0 * a);
+
+    if (V <= 0.5)
+        return (V * V) / 3.0;
+    else
+        return (exp((V - c)/a) + b) / 12.0;
+}
+
+// ---------------------- Matrices (doubles) ----------------------
+// linear RGB -> XYZ -> Rec.2100 -> LMS
+/*static const double M_RGB_TO_LMS[3][3] = {
+    {1688.0/4096.0,2146.0/4096.0, 262.0/4096.0},
+    { 683.0/4096.0,2951.0/4096.0, 462.0/4096.0},
+    {  99.0/4096.0, 309.0/4096.0,3688.0/4096.0}
+};*/
+static const double M_RGB_TO_LMS[3][3] = {
+    {0.2958564579364564, 0.6230869483219083, 0.08106989398623762},
+    {0.15627390752659093, 0.727308963512872, 0.11639736914944238},
+    {0.035141262332177715, 0.15657109121101628, 0.8080956851990795}
+};
+
+// Inverse: LMS -> linear sRGB (inverse of above)
+/*static const double M_LMS_TO_RGB[3][3] = {
+    {3.436606694333079, -2.5064521186562705, 0.06984542432319149},
+    {-0.7913295555989289, 1.983600451792291, -0.192270896193362},
+    {-0.025949899690592665, -0.09891371471172647, 1.1248636144023192}
+};*/
+static const double M_LMS_TO_RGB[3][3] = {
+    {6.1723815689243215, -5.319534979827695, 0.14699442094633924},
+    {-1.3243428148026244, 2.560286104841917, -0.2359203727576164},
+    {-0.011819739235953752, -0.26473549971186555, 1.2767952602537955}
+};
+
+// ICtCp matrix (L' M' S' -> I Ct Cp). Values are the BT.2100 integer-derived /4096 constants.
+static const double M_LMSPRIME_TO_ICTCP[3][3] = {
+    { 2048.0/4096.0,   2048.0/4096.0,     0.0          },
+    { 3625.0/4096.0, -7465.0/4096.0, 3840.0/4096.0    },
+    { 9500.0/4096.0, -9212.0/4096.0, -288.0/4096.0    }
+};
+
+// Inverse: I Ct Cp -> L' M' S'  (precomputed inverse)
+static const double M_ICTCP_TO_LMSPRIME[3][3] = {
+    { 1.0,         0.015718580108730416,  0.2095810681164055 },
+    { 1.0,        -0.015718580108730416, -0.20958106811640548 },
+    { 1.0,         1.0212710798422344, -0.6052744909924316 }
+};
+
+// ---------------------- Forward: sRGB8 -> ICtCp (doubles) ----------------------
+// Inputs: r,g,b in 0..255 sRGB (8-bit)
+// Outputs: I, Ct, Cp as doubles (nominally I in ~[0..1], Ct/Cp ranges depend on colors)
+void srgb8_to_ictcp_hlg(uint8_t r8, uint8_t g8, uint8_t b8,
+                       double *out_I, double *out_Ct, double *out_Cp)
+{
+    // 1) linearize sRGB to 0..1
+    double r = srgb_linearize((double)r8 / 255.0);
+    double g = srgb_linearize((double)g8 / 255.0);
+    double b = srgb_linearize((double)b8 / 255.0);
+
+    // 2) linear RGB -> LMS (single 3x3 multiply)
+    double L = M_RGB_TO_LMS[0][0]*r + M_RGB_TO_LMS[0][1]*g + M_RGB_TO_LMS[0][2]*b;
+    double M = M_RGB_TO_LMS[1][0]*r + M_RGB_TO_LMS[1][1]*g + M_RGB_TO_LMS[1][2]*b;
+    double S = M_RGB_TO_LMS[2][0]*r + M_RGB_TO_LMS[2][1]*g + M_RGB_TO_LMS[2][2]*b;
+
+    // 3) apply HLG encode (map linear LMS -> perceptual domain L',M',S')
+    double Lp = HLG_OETF(L);
+    double Mp = HLG_OETF(M);
+    double Sp = HLG_OETF(S);
+
+    // 4) L'M'S' -> ICtCp
+    double I  = M_LMSPRIME_TO_ICTCP[0][0]*Lp + M_LMSPRIME_TO_ICTCP[0][1]*Mp + M_LMSPRIME_TO_ICTCP[0][2]*Sp;
+    double Ct = M_LMSPRIME_TO_ICTCP[1][0]*Lp + M_LMSPRIME_TO_ICTCP[1][1]*Mp + M_LMSPRIME_TO_ICTCP[1][2]*Sp;
+    double Cp = M_LMSPRIME_TO_ICTCP[2][0]*Lp + M_LMSPRIME_TO_ICTCP[2][1]*Mp + M_LMSPRIME_TO_ICTCP[2][2]*Sp;
+
+    *out_I = FCLAMP(I * 255.f, 0.f, 255.f);
+    *out_Ct = FCLAMP(Ct * 255.f, -256.f, 255.f);
+    *out_Cp = FCLAMP(Cp * 255.f, -256.f, 255.f);
+}
+
+// ---------------------- Reverse: ICtCp -> sRGB8 (doubles) ----------------------
+// Inputs: I, Ct, Cp as doubles
+// Outputs: r8,g8,b8 in 0..255 (8-bit sRGB, clamped and rounded)
+void ictcp_hlg_to_srgb8(double I8, double Ct8, double Cp8,
+                       uint8_t *r8, uint8_t *g8, uint8_t *b8)
+{
+    double I = I8 / 255.f;
+    double Ct = Ct8 / 255.f;
+    double Cp = Cp8 / 255.f;
+
+    // 1) ICtCp -> L' M' S' (3x3 multiply)
+    double Lp = M_ICTCP_TO_LMSPRIME[0][0]*I + M_ICTCP_TO_LMSPRIME[0][1]*Ct + M_ICTCP_TO_LMSPRIME[0][2]*Cp;
+    double Mp = M_ICTCP_TO_LMSPRIME[1][0]*I + M_ICTCP_TO_LMSPRIME[1][1]*Ct + M_ICTCP_TO_LMSPRIME[1][2]*Cp;
+    double Sp = M_ICTCP_TO_LMSPRIME[2][0]*I + M_ICTCP_TO_LMSPRIME[2][1]*Ct + M_ICTCP_TO_LMSPRIME[2][2]*Cp;
+
+    // 2) HLG decode: L' -> linear LMS
+    double L = HLG_inverse_OETF(Lp);
+    double M = HLG_inverse_OETF(Mp);
+    double S = HLG_inverse_OETF(Sp);
+
+    // 3) LMS -> linear sRGB (3x3 inverse)
+    double r_lin = M_LMS_TO_RGB[0][0]*L + M_LMS_TO_RGB[0][1]*M + M_LMS_TO_RGB[0][2]*S;
+    double g_lin = M_LMS_TO_RGB[1][0]*L + M_LMS_TO_RGB[1][1]*M + M_LMS_TO_RGB[1][2]*S;
+    double b_lin = M_LMS_TO_RGB[2][0]*L + M_LMS_TO_RGB[2][1]*M + M_LMS_TO_RGB[2][2]*S;
+
+    // 4) gamma encode and convert to 0..255 with center-of-bin rounding
+    double r = srgb_unlinearize(r_lin);
+    double g = srgb_unlinearize(g_lin);
+    double b = srgb_unlinearize(b_lin);
+
+    *r8 = (uint8_t)CLAMP(iround(r * 255.0), 0, 255);
+    *g8 = (uint8_t)CLAMP(iround(g * 255.0), 0, 255);
+    *b8 = (uint8_t)CLAMP(iround(b * 255.0), 0, 255);
+}
+
+// ---------------------- Color Space Switching Functions ----------------------
+// Wrapper functions that choose between YCoCg-R and ICtCp based on encoder mode
+
+static void rgb_to_color_space(tev_encoder_t *enc, uint8_t r, uint8_t g, uint8_t b,
+                               double *c1, double *c2, double *c3) {
+    if (enc->ictcp_mode) {
+        // Use ICtCp color space
+        srgb8_to_ictcp_hlg(r, g, b, c1, c2, c3);
+    } else {
+        // Use YCoCg-R color space (convert to int first, then to double)
+        int y_val, co_val, cg_val;
+        rgb_to_ycocgr(r, g, b, &y_val, &co_val, &cg_val);
+        *c1 = (double)y_val;
+        *c2 = (double)co_val;
+        *c3 = (double)cg_val;
+    }
+}
+
+static void color_space_to_rgb(tev_encoder_t *enc, double c1, double c2, double c3,
+                               uint8_t *r, uint8_t *g, uint8_t *b) {
+    if (enc->ictcp_mode) {
+        // Use ICtCp color space
+        ictcp_hlg_to_srgb8(c1, c2, c3, r, g, b);
+    } else {
+        // Use YCoCg-R color space (convert from double to int first)
+        int y_val = (int)round(c1);
+        int co_val = (int)round(c2);
+        int cg_val = (int)round(c3);
+        ycocgr_to_rgb(y_val, co_val, cg_val, r, g, b);
+    }
+}
+
+////////////////////////////////////////
+// DISCRETE COSINE TRANSFORMATIONS //
+////////////////////////////////////////
+
 // Pre-calculated cosine tables
 static float dct_table_16[16][16]; // For 16x16 DCT
 static float dct_table_8[8][8];    // For 8x8 DCT
@@ -429,14 +606,14 @@ static int16_t quantise_coeff(float coeff, float quant, int is_dc, int is_chroma
     }
 }
 
-// Extract 16x16 block from RGB frame and convert to YCoCg-R
-static void extract_ycocgr_block(uint8_t *rgb_frame, int width, int height,
-                                int block_x, int block_y,
-                                float *y_block, float *co_block, float *cg_block) {
+// Extract 16x16 block from RGB frame and convert to color space
+static void extract_color_space_block(tev_encoder_t *enc, uint8_t *rgb_frame, int width, int height,
+                                      int block_x, int block_y,
+                                      float *c1_block, float *c2_block, float *c3_block) {
     int start_x = block_x * BLOCK_SIZE;
     int start_y = block_y * BLOCK_SIZE;
 
-    // Extract 16x16 Y block
+    // Extract 16x16 primary channel block (Y for YCoCg-R, I for ICtCp)
     for (int py = 0; py < BLOCK_SIZE; py++) {
         for (int px = 0; px < BLOCK_SIZE; px++) {
             int x = start_x + px;
@@ -448,10 +625,10 @@ static void extract_ycocgr_block(uint8_t *rgb_frame, int width, int height,
                 uint8_t g = rgb_frame[offset + 1];
                 uint8_t b = rgb_frame[offset + 2];
 
-                int y_val, co_val, cg_val;
-                rgb_to_ycocgr(r, g, b, &y_val, &co_val, &cg_val);
+                double c1, c2, c3;
+                rgb_to_color_space(enc, r, g, b, &c1, &c2, &c3);
 
-                y_block[py * BLOCK_SIZE + px] = (float)y_val - 128.0f;  // Center around 0
+                c1_block[py * BLOCK_SIZE + px] = (float)c1 - 128.0f;
             }
         }
     }
@@ -473,25 +650,30 @@ static void extract_ycocgr_block(uint8_t *rgb_frame, int width, int height,
                         uint8_t g = rgb_frame[offset + 1];
                         uint8_t b = rgb_frame[offset + 2];
 
-                        int y_val, co_val, cg_val;
-                        rgb_to_ycocgr(r, g, b, &y_val, &co_val, &cg_val);
+                        double c1, c2, c3;
+                        rgb_to_color_space(enc, r, g, b, &c1, &c2, &c3);
+
+                        co_sum += (int)c2;
+                        cg_sum += (int)c3;
 
-                        co_sum += co_val;
-                        cg_sum += cg_val;
                         count++;
                     }
                 }
             }
 
             if (count > 0) {
-                // Center chroma around 0 for DCT (Co/Cg range is -255 to +255, so don't add offset)
-                co_block[py * HALF_BLOCK_SIZE + px] = (float)(co_sum / count);
-                cg_block[py * HALF_BLOCK_SIZE + px] = (float)(cg_sum / count);
+                // Average the accumulated chroma values and store
+                c2_block[py * HALF_BLOCK_SIZE + px] = (float)(co_sum / count);
+                c3_block[py * HALF_BLOCK_SIZE + px] = (float)(cg_sum / count);
             }
         }
     }
 }
 
+
+
+
+
 // Calculate spatial activity for any channel (16x16 or 8x8)
 static float calculate_spatial_activity(const float *block, int block_size) {
     float activity = 0.0f;
@@ -791,8 +973,143 @@ static void estimate_motion(tev_encoder_t *enc, int block_x, int block_y,
 }
 
 // Convert RGB block to YCoCg-R with 4:2:0 chroma subsampling
-static void convert_rgb_to_ycocgr_block(const uint8_t *rgb_block,
-                                       uint8_t *y_block, int8_t *co_block, int8_t *cg_block) {
+static void convert_rgb_to_color_space_block(tev_encoder_t *enc, const uint8_t *rgb_block,
+                                            float *c1_workspace, float *c2_workspace, float *c3_workspace) {
+    if (enc->ictcp_mode) {
+        // ICtCp mode: Convert 16x16 RGB to ICtCp (full resolution for I, 4:2:0 subsampling for CtCp)
+
+        // Convert I channel at full resolution (16x16)
+        for (int py = 0; py < BLOCK_SIZE; py++) {
+            for (int px = 0; px < BLOCK_SIZE; px++) {
+                int rgb_idx = (py * BLOCK_SIZE + px) * 3;
+                uint8_t r = rgb_block[rgb_idx];
+                uint8_t g = rgb_block[rgb_idx + 1];
+                uint8_t b = rgb_block[rgb_idx + 2];
+
+                double I, Ct, Cp;
+                srgb8_to_ictcp_hlg(r, g, b, &I, &Ct, &Cp);
+
+                // Store I at full resolution, scale to appropriate range
+                c1_workspace[py * BLOCK_SIZE + px] = (float)(I * 255.0);
+            }
+        }
+
+        // Convert Ct and Cp with 4:2:0 subsampling (8x8)
+        for (int cy = 0; cy < HALF_BLOCK_SIZE; cy++) {
+            for (int cx = 0; cx < HALF_BLOCK_SIZE; cx++) {
+                double sum_ct = 0.0, sum_cp = 0.0;
+
+                // Sample 2x2 block from RGB and average for chroma
+                for (int dy = 0; dy < 2; dy++) {
+                    for (int dx = 0; dx < 2; dx++) {
+                        int py = cy * 2 + dy;
+                        int px = cx * 2 + dx;
+                        int rgb_idx = (py * 16 + px) * 3;
+
+                        int r = rgb_block[rgb_idx];
+                        int g = rgb_block[rgb_idx + 1];
+                        int b = rgb_block[rgb_idx + 2];
+
+                        double I, Ct, Cp;
+                        srgb8_to_ictcp_hlg(r, g, b, &I, &Ct, &Cp);
+
+                        sum_ct += Ct;
+                        sum_cp += Cp;
+                    }
+                }
+
+                // Average and store subsampled chroma, scale to signed 8-bit equivalent range
+                // Apply centering to ensure chroma is balanced around 0 (like YCoCg-R)
+                double avg_ct = sum_ct / 4.0;
+                double avg_cp = sum_cp / 4.0;
+
+                // Scale and clamp to [-256, 255] range like YCoCg-R
+                c2_workspace[cy * HALF_BLOCK_SIZE + cx] = (float)CLAMP(avg_ct * 255.0, -256, 255);
+                c3_workspace[cy * HALF_BLOCK_SIZE + cx] = (float)CLAMP(avg_cp * 255.0, -256, 255);
+            }
+        }
+    } else {
+        // YCoCg-R mode: Original implementation
+
+        // Convert 16x16 RGB to Y (full resolution)
+        for (int py = 0; py < BLOCK_SIZE; py++) {
+            for (int px = 0; px < BLOCK_SIZE; px++) {
+                int rgb_idx = (py * BLOCK_SIZE + px) * 3;
+                int r = rgb_block[rgb_idx];
+                int g = rgb_block[rgb_idx + 1];
+                int b = rgb_block[rgb_idx + 2];
+
+                // YCoCg-R transform (per specification with truncated division)
+                int y = (r + 2*g + b) / 4;
+                c1_workspace[py * BLOCK_SIZE + px] = (float)CLAMP(y, 0, 255);
+            }
+        }
+
+        // Convert to Co and Cg with 4:2:0 subsampling (8x8)
+        for (int cy = 0; cy < HALF_BLOCK_SIZE; cy++) {
+            for (int cx = 0; cx < HALF_BLOCK_SIZE; cx++) {
+                int sum_co = 0, sum_cg = 0;
+
+                // Sample 2x2 block from RGB and average for chroma
+                for (int dy = 0; dy < 2; dy++) {
+                    for (int dx = 0; dx < 2; dx++) {
+                        int py = cy * 2 + dy;
+                        int px = cx * 2 + dx;
+                        int rgb_idx = (py * 16 + px) * 3;
+
+                        int r = rgb_block[rgb_idx];
+                        int g = rgb_block[rgb_idx + 1];
+                        int b = rgb_block[rgb_idx + 2];
+
+                        int co = r - b;
+                        int tmp = b + (co / 2);
+                        int cg = g - tmp;
+
+                        sum_co += co;
+                        sum_cg += cg;
+                    }
+                }
+
+                // Average and store subsampled chroma
+                c2_workspace[cy * HALF_BLOCK_SIZE + cx] = (float)CLAMP(sum_co / 4, -256, 255);
+                c3_workspace[cy * HALF_BLOCK_SIZE + cx] = (float)CLAMP(sum_cg / 4, -256, 255);
+            }
+        }
+    }
+}
+
+// Extract motion-compensated YCoCg-R block from reference frame
+static void extract_motion_compensated_block(const uint8_t *rgb_data, int width, int height,
+                                           int block_x, int block_y, int mv_x, int mv_y,
+                                           uint8_t *y_block, int8_t *co_block, int8_t *cg_block) {
+    // Extract 16x16 RGB block with motion compensation
+    uint8_t rgb_block[BLOCK_SIZE * BLOCK_SIZE * 3];
+
+    for (int dy = 0; dy < BLOCK_SIZE; dy++) {
+        for (int dx = 0; dx < BLOCK_SIZE; dx++) {
+            int cur_x = block_x + dx;
+            int cur_y = block_y + dy;
+            int ref_x = cur_x + mv_x;  // Revert to original motion compensation
+            int ref_y = cur_y + mv_y;
+
+            int rgb_idx = (dy * BLOCK_SIZE + dx) * 3;
+
+            if (ref_x >= 0 && ref_y >= 0 && ref_x < width && ref_y < height) {
+                // Copy RGB from reference position
+                int ref_offset = (ref_y * width + ref_x) * 3;
+                rgb_block[rgb_idx] = rgb_data[ref_offset];         // R
+                rgb_block[rgb_idx + 1] = rgb_data[ref_offset + 1]; // G
+                rgb_block[rgb_idx + 2] = rgb_data[ref_offset + 2]; // B
+            } else {
+                // Out of bounds - use black
+                rgb_block[rgb_idx] = 0;     // R
+                rgb_block[rgb_idx + 1] = 0; // G
+                rgb_block[rgb_idx + 2] = 0; // B
+            }
+        }
+    }
+
+    // Convert RGB block to YCoCg-R (original implementation for motion compensation)
     // Convert 16x16 RGB to Y (full resolution)
     for (int py = 0; py < BLOCK_SIZE; py++) {
         for (int px = 0; px < BLOCK_SIZE; px++) {
@@ -840,41 +1157,6 @@ static void convert_rgb_to_ycocgr_block(const uint8_t *rgb_block,
     }
 }
 
-// Extract motion-compensated YCoCg-R block from reference frame
-static void extract_motion_compensated_block(const uint8_t *rgb_data, int width, int height,
-                                           int block_x, int block_y, int mv_x, int mv_y,
-                                           uint8_t *y_block, int8_t *co_block, int8_t *cg_block) {
-    // Extract 16x16 RGB block with motion compensation
-    uint8_t rgb_block[BLOCK_SIZE * BLOCK_SIZE * 3];
-
-    for (int dy = 0; dy < BLOCK_SIZE; dy++) {
-        for (int dx = 0; dx < BLOCK_SIZE; dx++) {
-            int cur_x = block_x + dx;
-            int cur_y = block_y + dy;
-            int ref_x = cur_x + mv_x;  // Revert to original motion compensation
-            int ref_y = cur_y + mv_y;
-
-            int rgb_idx = (dy * BLOCK_SIZE + dx) * 3;
-
-            if (ref_x >= 0 && ref_y >= 0 && ref_x < width && ref_y < height) {
-                // Copy RGB from reference position
-                int ref_offset = (ref_y * width + ref_x) * 3;
-                rgb_block[rgb_idx] = rgb_data[ref_offset];         // R
-                rgb_block[rgb_idx + 1] = rgb_data[ref_offset + 1]; // G
-                rgb_block[rgb_idx + 2] = rgb_data[ref_offset + 2]; // B
-            } else {
-                // Out of bounds - use black
-                rgb_block[rgb_idx] = 0;     // R
-                rgb_block[rgb_idx + 1] = 0; // G
-                rgb_block[rgb_idx + 2] = 0; // B
-            }
-        }
-    }
-
-    // Convert RGB block to YCoCg-R
-    convert_rgb_to_ycocgr_block(rgb_block, y_block, co_block, cg_block);
-}
-
 // Compute motion-compensated residual for INTER mode
 static void compute_motion_residual(tev_encoder_t *enc, int block_x, int block_y, int mv_x, int mv_y) {
     int start_x = block_x * BLOCK_SIZE;
@@ -909,7 +1191,7 @@ static void encode_block(tev_encoder_t *enc, int block_x, int block_y, int is_ke
     tev_block_t *block = &enc->block_data[block_y * ((enc->width + 15) / 16) + block_x];
 
     // Extract YCoCg-R block
-    extract_ycocgr_block(enc->current_rgb, enc->width, enc->height,
+    extract_color_space_block(enc, enc->current_rgb, enc->width, enc->height,
                         block_x, block_y,
                         enc->y_workspace, enc->co_workspace, enc->cg_workspace);
 
@@ -1069,7 +1351,7 @@ static void encode_block(tev_encoder_t *enc, int block_x, int block_y, int is_ke
     dct_16x16_fast(enc->y_workspace, enc->dct_workspace);
 
     // quantise Y coefficients (luma) using per-block rate control
-    const uint32_t *y_quant = QUANT_TABLE_Y;
+    const uint32_t *y_quant = enc->ictcp_mode ? QUANT_TABLE_Y : QUANT_TABLE_Y;
     const float qmult_y = jpeg_quality_to_mult(enc->qualityY * block->rate_control_factor);
     for (int i = 0; i < BLOCK_SIZE_SQR; i++) {
         // Apply rate control factor to quantization table (like decoder does)
@@ -1081,7 +1363,7 @@ static void encode_block(tev_encoder_t *enc, int block_x, int block_y, int is_ke
     dct_8x8_fast(enc->co_workspace, enc->dct_workspace);
 
     // quantise Co coefficients (chroma - orange-blue) using per-block rate control
-    const uint32_t *co_quant = QUANT_TABLE_C;
+    const uint32_t *co_quant = enc->ictcp_mode ? QUANT_TABLE_C : QUANT_TABLE_C;
     const float qmult_co = jpeg_quality_to_mult(enc->qualityCo * block->rate_control_factor);
     for (int i = 0; i < HALF_BLOCK_SIZE_SQR; i++) {
         // Apply rate control factor to quantization table (like decoder does)
@@ -1093,7 +1375,8 @@ static void encode_block(tev_encoder_t *enc, int block_x, int block_y, int is_ke
     dct_8x8_fast(enc->cg_workspace, enc->dct_workspace);
 
     // quantise Cg coefficients (chroma - green-magenta, qmult_cg is more aggressive like NTSC Q) using per-block rate control
-    const uint32_t *cg_quant = QUANT_TABLE_C;
+    // In ICtCp mode, Cg becomes Cp (chroma-red) which needs special quantization table
+    const uint32_t *cg_quant = enc->ictcp_mode ? QUANT_TABLE_C : QUANT_TABLE_C;
     const float qmult_cg = jpeg_quality_to_mult(enc->qualityCg * block->rate_control_factor);
     for (int i = 0; i < HALF_BLOCK_SIZE_SQR; i++) {
         // Apply rate control factor to quantization table (like decoder does)
@@ -1105,107 +1388,6 @@ static void encode_block(tev_encoder_t *enc, int block_x, int block_y, int is_ke
     block->cbp = 0x07;  // Y, Co, Cg all present
 }
 
-// Encode a 16x16 block in lossless mode
-static void encode_block_lossless(tev_encoder_t *enc, int block_x, int block_y, int is_keyframe) {
-    tev_lossless_block_t *block = (tev_lossless_block_t*)&enc->block_data[block_y * ((enc->width + 15) / 16) + block_x];
-
-    // Extract YCoCg-R block
-    extract_ycocgr_block(enc->current_rgb, enc->width, enc->height,
-                        block_x, block_y,
-                        enc->y_workspace, enc->co_workspace, enc->cg_workspace);
-
-    if (is_keyframe) {
-        // Intra coding for keyframes
-        block->mode = TEV_MODE_INTRA;
-        block->mv_x = block->mv_y = 0;
-        enc->blocks_intra++;
-    } else {
-        // Same mode decision logic as regular encode_block
-        // For simplicity, using INTRA for now in lossless mode
-        block->mode = TEV_MODE_INTRA;
-        block->mv_x = block->mv_y = 0;
-        enc->blocks_intra++;
-    }
-
-    // Lossless mode: rate control factor is always 1.0f
-    block->rate_control_factor = 1.0f;
-
-    // Apply DCT transforms using the same pattern as regular encoding
-    // Y channel (16x16)
-    dct_16x16_fast(enc->y_workspace, enc->dct_workspace);
-    for (int i = 0; i < BLOCK_SIZE_SQR; i++) {
-        block->y_coeffs[i] = enc->dct_workspace[i]; // Store directly without quantization
-    }
-
-    // Co channel (8x8)  
-    dct_8x8_fast(enc->co_workspace, enc->dct_workspace);
-    for (int i = 0; i < HALF_BLOCK_SIZE_SQR; i++) {
-        block->co_coeffs[i] = enc->dct_workspace[i]; // Store directly without quantization
-    }
-
-    // Cg channel (8x8)
-    dct_8x8_fast(enc->cg_workspace, enc->dct_workspace);
-    for (int i = 0; i < HALF_BLOCK_SIZE_SQR; i++) {
-        block->cg_coeffs[i] = enc->dct_workspace[i]; // Store directly without quantization
-    }
-
-    // Set CBP (simplified - always encode all channels)
-    block->cbp = 0x07;  // Y, Co, Cg all present
-}
-
-// Serialized lossless block structure (for writing to file with float16 coefficients)
-typedef struct __attribute__((packed)) {
-    uint8_t mode;
-    int16_t mv_x, mv_y;
-    float rate_control_factor; // Always 1.0f in lossless mode
-    uint16_t cbp;
-    uint16_t y_coeffs[BLOCK_SIZE_SQR];      // float16 Y coefficients
-    uint16_t co_coeffs[HALF_BLOCK_SIZE_SQR]; // float16 Co coefficients
-    uint16_t cg_coeffs[HALF_BLOCK_SIZE_SQR]; // float16 Cg coefficients
-} tev_serialized_lossless_block_t;
-
-// Convert lossless blocks to serialized format with float16 coefficients
-static void serialize_lossless_blocks(tev_encoder_t *enc, int blocks_x, int blocks_y, 
-                                     tev_serialized_lossless_block_t *serialized_blocks) {
-    for (int by = 0; by < blocks_y; by++) {
-        for (int bx = 0; bx < blocks_x; bx++) {
-            tev_lossless_block_t *src = (tev_lossless_block_t*)&enc->block_data[by * blocks_x + bx];
-            tev_serialized_lossless_block_t *dst = &serialized_blocks[by * blocks_x + bx];
-            
-            // Copy basic fields
-            dst->mode = src->mode;
-            dst->mv_x = src->mv_x;
-            dst->mv_y = src->mv_y;
-            dst->rate_control_factor = src->rate_control_factor;
-            dst->cbp = src->cbp;
-            
-            // Convert float32 coefficients to float16 with range clamping
-            // Float16 max finite value is approximately 65504
-            const float FLOAT16_MAX = 65504.0f;
-            
-            for (int i = 0; i < BLOCK_SIZE_SQR; i++) {
-                float coeff = FCLAMP(src->y_coeffs[i], -FLOAT16_MAX, FLOAT16_MAX);
-                dst->y_coeffs[i] = float_to_float16(coeff);
-                if (enc->verbose && fabsf(src->y_coeffs[i]) > FLOAT16_MAX) {
-                    printf("WARNING: Y coefficient %d clamped: %f -> %f\n", i, src->y_coeffs[i], coeff);
-                }
-            }
-            for (int i = 0; i < HALF_BLOCK_SIZE_SQR; i++) {
-                float co_coeff = FCLAMP(src->co_coeffs[i], -FLOAT16_MAX, FLOAT16_MAX);
-                float cg_coeff = FCLAMP(src->cg_coeffs[i], -FLOAT16_MAX, FLOAT16_MAX);
-                dst->co_coeffs[i] = float_to_float16(co_coeff);
-                dst->cg_coeffs[i] = float_to_float16(cg_coeff);
-                if (enc->verbose && fabsf(src->co_coeffs[i]) > FLOAT16_MAX) {
-                    printf("WARNING: Co coefficient %d clamped: %f -> %f\n", i, src->co_coeffs[i], co_coeff);
-                }
-                if (enc->verbose && fabsf(src->cg_coeffs[i]) > FLOAT16_MAX) {
-                    printf("WARNING: Cg coefficient %d clamped: %f -> %f\n", i, src->cg_coeffs[i], cg_coeff);
-                }
-            }
-        }
-    }
-}
-
 // Convert SubRip time format (HH:MM:SS,mmm) to frame number
 static int srt_time_to_frame(const char *time_str, int fps) {
     int hours, minutes, seconds, milliseconds;
@@ -1820,17 +2002,13 @@ static int alloc_encoder_buffers(tev_encoder_t *enc) {
     enc->cg_workspace = malloc(8 * 8 * sizeof(float));
     enc->dct_workspace = malloc(16 * 16 * sizeof(float));
 
+    // Allocate block data
     enc->block_data = malloc(total_blocks * sizeof(tev_block_t));
-    // Allocate compression buffer large enough for both regular and lossless modes
-    size_t max_block_size = sizeof(tev_block_t) > sizeof(tev_serialized_lossless_block_t) ? 
-                            sizeof(tev_block_t) : sizeof(tev_serialized_lossless_block_t);
-    size_t compressed_buffer_size = total_blocks * max_block_size * 2;
+
+    // Allocate compression buffer
+    size_t compressed_buffer_size = total_blocks * sizeof(tev_block_t) * 2;
     enc->compressed_buffer = malloc(compressed_buffer_size);
-    
-    if (enc->verbose) {
-        printf("Allocated compressed buffer: %zu bytes for %d blocks (max_block_size: %zu)\n", 
-               compressed_buffer_size, total_blocks, max_block_size);
-    }
+
     enc->mp2_buffer = malloc(MP2_DEFAULT_PACKET_SIZE);
 
     if (!enc->current_rgb || !enc->previous_rgb || !enc->reference_rgb ||
@@ -1889,7 +2067,7 @@ static void free_encoder(tev_encoder_t *enc) {
 static int write_tev_header(FILE *output, tev_encoder_t *enc) {
     // Magic + version
     fwrite(TEV_MAGIC, 1, 8, output);
-    uint8_t version = TEV_VERSION;
+    uint8_t version = enc->ictcp_mode ? 3 : 2;  // Version 3 for ICtCp, 2 for YCoCg-R
     fwrite(&version, 1, 1, output);
 
     // Video parameters
@@ -1901,7 +2079,7 @@ static int write_tev_header(FILE *output, tev_encoder_t *enc) {
     uint8_t qualityCo = enc->qualityCo;
     uint8_t qualityCg = enc->qualityCg;
     uint8_t flags = (enc->has_audio) | (enc->has_subtitles << 1);
-    uint8_t video_flags = (enc->progressive_mode ? 0 : 1) | (enc->is_ntsc_framerate ? 2 : 0) | (enc->lossless_mode ? 4 : 0); // bit 0 = is_interlaced, bit 1 = is_ntsc_framerate, bit 2 = is_lossless
+    uint8_t video_flags = (enc->progressive_mode ? 0 : 1) | (enc->is_ntsc_framerate ? 2 : 0); // bit 0 = is_interlaced, bit 1 = is_ntsc_framerate
     uint8_t reserved = 0;
 
     fwrite(&width, 2, 1, output);
@@ -2008,11 +2186,7 @@ static int encode_frame(tev_encoder_t *enc, FILE *output, int frame_num, int fie
     // Encode all blocks
     for (int by = 0; by < blocks_y; by++) {
         for (int bx = 0; bx < blocks_x; bx++) {
-            if (enc->lossless_mode) {
-                encode_block_lossless(enc, bx, by, is_keyframe);
-            } else {
-                encode_block(enc, bx, by, is_keyframe);
-            }
+            encode_block(enc, bx, by, is_keyframe);
 
             // Calculate complexity for rate control (if enabled)
             if (enc->bitrate_mode > 0) {
@@ -2029,34 +2203,14 @@ static int encode_frame(tev_encoder_t *enc, FILE *output, int frame_num, int fie
 
     // Compress block data using Zstd (compatible with TSVM decoder)
     size_t compressed_size;
-    
-    if (enc->lossless_mode) {
-        // Lossless mode: serialize blocks with float16 coefficients
-        size_t serialized_block_data_size = blocks_x * blocks_y * sizeof(tev_serialized_lossless_block_t);
-        tev_serialized_lossless_block_t *serialized_blocks = malloc(serialized_block_data_size);
-        if (!serialized_blocks) {
-            fprintf(stderr, "Failed to allocate memory for serialized lossless blocks\n");
-            return -1;
-        }
-        
-        serialize_lossless_blocks(enc, blocks_x, blocks_y, serialized_blocks);
-        
-        // Use the pre-allocated buffer size instead of calculating dynamically
-        size_t output_buffer_size = blocks_x * blocks_y * sizeof(tev_serialized_lossless_block_t) * 2;
-        compressed_size = ZSTD_compressCCtx(enc->zstd_context,
-                                           enc->compressed_buffer, output_buffer_size,
-                                           serialized_blocks, serialized_block_data_size,
-                                           ZSTD_COMPRESSON_LEVEL);
-        free(serialized_blocks);
-    } else {
-        // Regular mode: use regular block data
-        size_t block_data_size = blocks_x * blocks_y * sizeof(tev_block_t);
-        compressed_size = ZSTD_compressCCtx(enc->zstd_context,
-                                           enc->compressed_buffer, block_data_size * 2,
-                                           enc->block_data, block_data_size,
-                                           ZSTD_COMPRESSON_LEVEL);
-    }
-    
+
+    // Regular mode: use regular block data
+    size_t block_data_size = blocks_x * blocks_y * sizeof(tev_block_t);
+    compressed_size = ZSTD_compressCCtx(enc->zstd_context,
+                                       enc->compressed_buffer, block_data_size * 2,
+                                       enc->block_data, block_data_size,
+                                       ZSTD_COMPRESSON_LEVEL);
+
     if (ZSTD_isError(compressed_size)) {
         fprintf(stderr, "Zstd compression failed: %s\n", ZSTD_getErrorName(compressed_size));
         return 0;
@@ -2288,7 +2442,7 @@ static int start_audio_conversion(tev_encoder_t *enc) {
     char command[2048];
     snprintf(command, sizeof(command),
         "ffmpeg -v quiet -i \"%s\" -acodec libtwolame -psymodel 4 -b:a %dk -ar %d -ac 2 -y \"%s\" 2>/dev/null",
-        enc->input_file, enc->lossless_mode ? 384 : MP2_RATE_TABLE[enc->qualityIndex], MP2_SAMPLE_RATE, TEMP_AUDIO_FILE);
+        enc->input_file, MP2_RATE_TABLE[enc->qualityIndex], MP2_SAMPLE_RATE, TEMP_AUDIO_FILE);
 
     int result = system(command);
     if (result == 0) {
@@ -2429,7 +2583,7 @@ static int process_audio(tev_encoder_t *enc, int frame_num, FILE *output) {
 
 // Show usage information
 static void show_usage(const char *program_name) {
-    printf("TEV YCoCg-R 4:2:0 Video Encoder\n");
+    printf("TEV YCoCg-R/ICtCp 4:2:0 Video Encoder\n");
     printf("Usage: %s [options] -i input.mp4 -o output.mv2\n\n", program_name);
     printf("Options:\n");
     printf("  -i, --input FILE       Input video file\n");
@@ -2443,7 +2597,7 @@ static void show_usage(const char *program_name) {
     printf("  -S, --subtitles FILE   SubRip (.srt) or SAMI (.smi) subtitle file\n");
     printf("  -v, --verbose          Verbose output\n");
     printf("  -t, --test             Test mode: generate solid colour frames\n");
-    printf("  --lossless             Lossless mode: store coefficients as float16 (no quantisation, implies -p, 384k audio)\n");
+    printf("  --ictcp                Use ICtCp color space instead of YCoCg-R (generates TEV version 3)\n");
     printf("  --enable-rcf           Enable per-block rate control (experimental)\n");
     printf("  --enable-encode-stats  Collect and report block complexity statistics\n");
     printf("  --help                 Show this help\n\n");
@@ -2467,7 +2621,7 @@ static void show_usage(const char *program_name) {
     printf("\n  -s default: equal to %dx%d", DEFAULT_WIDTH, DEFAULT_HEIGHT);
     printf("\n\n");
     printf("Features:\n");
-    printf("  - YCoCg-R 4:2:0 chroma subsampling for 50%% compression improvement\n");
+    printf("  - YCoCg-R or ICtCp 4:2:0 chroma subsampling for 50%% compression improvement\n");
     printf("  - 16x16 Y blocks with 8x8 chroma for optimal DCT efficiency\n");
     printf("  - Frame rate conversion with FFmpeg temporal filtering\n");
     printf("  - Adaptive quality control with complexity-based adjustment\n");
@@ -2536,7 +2690,7 @@ int main(int argc, char *argv[]) {
         {"test", no_argument, 0, 't'},
         {"enable-encode-stats", no_argument, 0, 1000},
         {"enable-rcf", no_argument, 0, 1100},
-        {"lossless", no_argument, 0, 1200},
+        {"ictcp", no_argument, 0, 1300},
         {"help", no_argument, 0, '?'},
         {0, 0, 0, 0}
     };
@@ -2611,8 +2765,8 @@ int main(int argc, char *argv[]) {
              case 1100: // --enable-rcf
                 enc->disable_rcf = 0;
                 break;
-            case 1200: // --lossless
-                enc->lossless_mode = 1;
+            case 1300: // --ictcp
+                enc->ictcp_mode = 1;
                 break;
             case 0:
                 if (strcmp(long_options[option_index].name, "help") == 0) {
@@ -2633,24 +2787,19 @@ int main(int argc, char *argv[]) {
         }
     }
 
-    // Lossless mode validation and adjustments
-    if (enc->lossless_mode) {
-        // In lossless mode, disable rate control and set quality to maximum
-        enc->bitrate_mode = 0;
-        enc->disable_rcf = 1;
-        enc->progressive_mode = 1;
-        enc->qualityIndex = 5;
-        enc->qualityY = enc->qualityCo = enc->qualityCg = 255; // Use 255 as a redundant lossless marker
-        if (enc->verbose) {
-            printf("Lossless mode enabled: Rate control disabled, quality set to maximum, enabling progressive scan\n");
-        }
-    }
-
     // halve the internal representation of frame height
     if (!enc->progressive_mode) {
         enc->height /= 2;
     }
 
+    if (enc->ictcp_mode) {
+        // ICtCp: Ct and Cp have different characteristics than YCoCg Co/Cg
+        // Cp channel now uses specialized quantization table, so moderate quality is fine
+        int base_chroma_quality = enc->qualityCo;
+        enc->qualityCo = base_chroma_quality;           // Ct channel: keep original Co quantization
+        enc->qualityCg = base_chroma_quality;           // Cp channel: same quality since Q_Cp_8 handles detail preservation
+    }
+
     if (!test_mode && (!enc->input_file || !enc->output_file)) {
         fprintf(stderr, "Input and output files are required (unless using --test mode)\n");
         show_usage(argv[0]);
@@ -2737,7 +2886,7 @@ int main(int argc, char *argv[]) {
     write_tev_header(output, enc);
     gettimeofday(&enc->start_time, NULL);
 
-    printf("Encoding video with YCoCg-R 4:2:0 format...\n");
+    printf("Encoding video with %s 4:2:0 format...\n", enc->ictcp_mode ? "ICtCp" : "YCoCg-R");
     if (enc->output_fps != enc->fps) {
         printf("Frame rate conversion enabled: %d fps output\n", enc->output_fps);
     }
@@ -2791,13 +2940,13 @@ int main(int argc, char *argv[]) {
             printf("Frame %d: %s (%d,%d,%d)\n", frame_count, colour_name, test_r, test_g, test_b);
             
             // Test YCoCg-R conversion
-            int y_test, co_test, cg_test;
-            rgb_to_ycocgr(test_r, test_g, test_b, &y_test, &co_test, &cg_test);
-            printf("  YCoCg-R: Y=%d Co=%d Cg=%d\n", y_test, co_test, cg_test);
+            double y_test, co_test, cg_test;
+            rgb_to_color_space(enc, test_r, test_g, test_b, &y_test, &co_test, &cg_test);
+            printf("  %s: Y=%.3f Co=%.3f Cg=%.3f\n", enc->ictcp_mode ? "ICtCp" : "YCoCg", y_test, co_test, cg_test);
             
             // Test reverse conversion
             uint8_t r_rev, g_rev, b_rev;
-            ycocgr_to_rgb(y_test, co_test, cg_test, &r_rev, &g_rev, &b_rev);
+            color_space_to_rgb(enc, y_test, co_test, cg_test, &r_rev, &g_rev, &b_rev);
             printf("  Reverse: R=%d G=%d B=%d\n", r_rev, g_rev, b_rev);
             
         } else {