diff --git a/assets/disk0/tvdos/bin/playtav.js b/assets/disk0/tvdos/bin/playtav.js index 684a288..873b179 100644 --- a/assets/disk0/tvdos/bin/playtav.js +++ b/assets/disk0/tvdos/bin/playtav.js @@ -92,7 +92,7 @@ if (fullFilePathStr.startsWith('$:/TAPE') || fullFilePathStr.startsWith('$:\\\\T con.clear() con.curs_set(0) -graphics.setGraphicsMode(4) // 4096-color mode +graphics.setGraphicsMode(4) // 4096-colour mode graphics.clearPixels(0) graphics.clearPixels2(0) @@ -106,8 +106,8 @@ audio.setMasterVolume(0, 255) function clearSubtitleArea() { // Clear the subtitle area at the bottom of the screen // Text mode is 80x32, so clear the bottom few lines - let oldFgColor = con.get_color_fore() - let oldBgColor = con.get_color_back() + let oldFgColour = con.get_color_fore() + let oldBgColour = con.get_color_back() con.color_pair(255, 255) // transparent to clear @@ -119,7 +119,7 @@ function clearSubtitleArea() { } } - con.color_pair(oldFgColor, oldBgColor) + con.color_pair(oldFgColour, oldBgColour) } function getVisualLength(line) { @@ -153,8 +153,8 @@ function getVisualLength(line) { } function displayFormattedLine(line) { - // Parse line and handle and tags with color changes - // Default subtitle color: yellow (231), formatted text: white (254) + // Parse line and handle and tags with colour changes + // Default subtitle colour: yellow (231), formatted text: white (254) let i = 0 let inBoldOrItalic = false @@ -202,9 +202,9 @@ function displaySubtitle(text, position = 0) { return } - // Set subtitle colors: yellow (231) on black (0) - let oldFgColor = con.get_color_fore() - let oldBgColor = con.get_color_back() + // Set subtitle colours: yellow (231) on black (0) + let oldFgColour = con.get_color_fore() + let oldBgColour = con.get_color_back() con.color_pair(231, 0) // Split text into lines @@ -270,7 +270,7 @@ function displaySubtitle(text, position = 0) { displayFormattedLine(line) } - con.color_pair(oldFgColor, oldBgColor) + con.color_pair(oldFgColour, oldBgColour) } function processSubtitlePacket(packetSize) { @@ -454,7 +454,7 @@ console.log(`Wavelet filter: ${header.waveletFilter === WAVELET_5_3_REVERSIBLE ? console.log(`Decomposition levels: ${header.decompLevels}`) console.log(`Quality: Y=${header.qualityY}, Co=${header.qualityCo}, Cg=${header.qualityCg}`) console.log(`Tiles: ${tilesX}x${tilesY} (${numTiles} total)`) -console.log(`Color space: ${header.version === 2 ? "ICtCp" : "YCoCg-R"}`) +console.log(`Colour space: ${header.version === 2 ? "ICtCp" : "YCoCg-R"}`) console.log(`Features: ${hasAudio ? "Audio " : ""}${hasSubtitles ? "Subtitles " : ""}${progressiveTransmission ? "Progressive " : ""}${roiCoding ? "ROI " : ""}`) // Frame buffer addresses - same as TEV @@ -559,7 +559,7 @@ let stopPlay = false let akku = FRAME_TIME let akku2 = 0.0 -let blockDataPtr = sys.malloc(560*448*3) +let blockDataPtr = sys.malloc(2377764) // Playback loop - properly adapted from TEV try { @@ -613,6 +613,7 @@ try { } try { +// serial.println(actualSize) // Duplicate every 1000th frame if NTSC (same as TEV) if (!isNTSC || frameCount % 1000 != 501 || frameDuped) { frameDuped = false @@ -631,7 +632,7 @@ try { header.decompLevels, // TAV-specific parameter enableDeblocking, isLossless, - header.version // TAV version for color space detection + header.version // TAV version for colour space detection ) decodeTime = (sys.nanoTime() - decodeStart) / 1000000.0 @@ -663,9 +664,10 @@ try { } } catch (e) { console.log(`Frame ${frameCount}: decode failed: ${e}`) + } finally { + sys.free(compressedPtr) } - sys.free(compressedPtr) let biasStart = sys.nanoTime() setBiasLighting() diff --git a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt index 9d68782..8853adc 100644 --- a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt +++ b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt @@ -17,14 +17,16 @@ import kotlin.math.* class GraphicsJSR223Delegate(private val vm: VM) { // TAV Simulated overlapping tiles constants (must match encoder) - private val TILE_SIZE = 112 - private val TILE_MARGIN = 32 // 32-pixel margin for 3 DWT levels (4 * 2^3 = 32px) - private val PADDED_TILE_SIZE = TILE_SIZE + 2 * TILE_MARGIN // 112 + 64 = 176px + private val TAV_TILE_SIZE_X = 280 + private val TAV_TILE_SIZE_Y = 224 + private val TAV_TILE_MARGIN = 32 // 32-pixel margin for 3 DWT levels (4 * 2^3 = 32px) + private val TAV_PADDED_TILE_SIZE_X = TAV_TILE_SIZE_X + 2 * TAV_TILE_MARGIN // 280 + 64 = 344px + private val TAV_PADDED_TILE_SIZE_Y = TAV_TILE_SIZE_Y + 2 * TAV_TILE_MARGIN // 224 + 64 = 288px // Reusable working arrays to reduce allocation overhead - private val idct8TempBuffer = FloatArray(64) - private val idct16TempBuffer = FloatArray(256) // For 16x16 IDCT - private val idct16SeparableBuffer = FloatArray(256) // For separable 16x16 IDCT + private val tevIdct8TempBuffer = FloatArray(64) + private val tevIdct16TempBuffer = FloatArray(256) // For 16x16 IDCT + private val tevIdct16SeparableBuffer = FloatArray(256) // For separable 16x16 IDCT private fun getFirstGPU(): GraphicsAdapter? { return vm.findPeribyType(VM.PERITYPE_GPU_AND_TERM)?.peripheral as? GraphicsAdapter @@ -81,19 +83,19 @@ class GraphicsJSR223Delegate(private val vm: VM) { getFirstGPU()?._storebulk(fromAddr, toAddr, length) }*/ - fun plotPixel(x: Int, y: Int, color: Int) { + fun plotPixel(x: Int, y: Int, colour: Int) { getFirstGPU()?.let { if (x in 0 until it.config.width && y in 0 until it.config.height) { - it.poke(y.toLong() * it.config.width + x, color.toByte()) + it.poke(y.toLong() * it.config.width + x, colour.toByte()) it.applyDelay() } } } - fun plotPixel2(x: Int, y: Int, color: Int) { + fun plotPixel2(x: Int, y: Int, colour: Int) { getFirstGPU()?.let { if (x in 0 until it.config.width && y in 0 until it.config.height) { - it.poke(262144 + y.toLong() * it.config.width + x, color.toByte()) + it.poke(262144 + y.toLong() * it.config.width + x, colour.toByte()) it.applyDelay() } } @@ -918,7 +920,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { } private fun clampRGB(f: Float) = f.coerceIn(0f, 1f) - private fun ycocgToRGB(co: Int, cg: Int, ys: Int, As: Int): Array { // ys: 4 Y-values + private fun ipf1YcocgToRGB(co: Int, cg: Int, ys: Int, As: Int): Array { // ys: 4 Y-values // return [R1|G1, B1|A1, R2|G2, B2|A2, R3|G3, B3|A3, R4|G4, B4|A4] // cocg = 0x7777 @@ -967,7 +969,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { ) } - private fun ycocgToRGB(co1: Int, co2: Int, cg1: Int, cg2: Int, ys: Int, As: Int): Array { // ys: 4 Y-values + private fun ipf2YcocgToRGB(co1: Int, co2: Int, cg1: Int, cg2: Int, ys: Int, As: Int): Array { // ys: 4 Y-values // return [R1|G1, B1|A1, R2|G2, B2|A2, R3|G3, B3|A3, R4|G4, B4|A4] // cocg = 0x7777 @@ -1050,25 +1052,25 @@ class GraphicsJSR223Delegate(private val vm: VM) { a4 = readShort() } - var corner = ycocgToRGB(co and 15, cg and 15, y1, a1) + var corner = ipf1YcocgToRGB(co and 15, cg and 15, y1, a1) rg[0] = corner[0];ba[0] = corner[1] rg[1] = corner[2];ba[1] = corner[3] rg[4] = corner[4];ba[4] = corner[5] rg[5] = corner[6];ba[5] = corner[7] - corner = ycocgToRGB((co shr 4) and 15, (cg shr 4) and 15, y2, a2) + corner = ipf1YcocgToRGB((co shr 4) and 15, (cg shr 4) and 15, y2, a2) rg[2] = corner[0];ba[2] = corner[1] rg[3] = corner[2];ba[3] = corner[3] rg[6] = corner[4];ba[6] = corner[5] rg[7] = corner[6];ba[7] = corner[7] - corner = ycocgToRGB((co shr 8) and 15, (cg shr 8) and 15, y3, a3) + corner = ipf1YcocgToRGB((co shr 8) and 15, (cg shr 8) and 15, y3, a3) rg[8] = corner[0];ba[8] = corner[1] rg[9] = corner[2];ba[9] = corner[3] rg[12] = corner[4];ba[12] = corner[5] rg[13] = corner[6];ba[13] = corner[7] - corner = ycocgToRGB((co shr 12) and 15, (cg shr 12) and 15, y4, a4) + corner = ipf1YcocgToRGB((co shr 12) and 15, (cg shr 12) and 15, y4, a4) rg[10] = corner[0];ba[10] = corner[1] rg[11] = corner[2];ba[11] = corner[3] rg[14] = corner[4];ba[14] = corner[5] @@ -1141,25 +1143,25 @@ class GraphicsJSR223Delegate(private val vm: VM) { val rg = IntArray(16) val ba = IntArray(16) - var px = ycocgToRGB(co and 15, cg and 15, y1, 65535) + var px = ipf1YcocgToRGB(co and 15, cg and 15, y1, 65535) rg[0] = px[0]; ba[0] = px[1] rg[1] = px[2]; ba[1] = px[3] rg[4] = px[4]; ba[4] = px[5] rg[5] = px[6]; ba[5] = px[7] - px = ycocgToRGB((co shr 4) and 15, (cg shr 4) and 15, y2, 65535) + px = ipf1YcocgToRGB((co shr 4) and 15, (cg shr 4) and 15, y2, 65535) rg[2] = px[0]; ba[2] = px[1] rg[3] = px[2]; ba[3] = px[3] rg[6] = px[4]; ba[6] = px[5] rg[7] = px[6]; ba[7] = px[7] - px = ycocgToRGB((co shr 8) and 15, (cg shr 8) and 15, y3, 65535) + px = ipf1YcocgToRGB((co shr 8) and 15, (cg shr 8) and 15, y3, 65535) rg[8] = px[0]; ba[8] = px[1] rg[9] = px[2]; ba[9] = px[3] rg[12] = px[4]; ba[12] = px[5] rg[13] = px[6]; ba[13] = px[7] - px = ycocgToRGB((co shr 12) and 15, (cg shr 12) and 15, y4, 65535) + px = ipf1YcocgToRGB((co shr 12) and 15, (cg shr 12) and 15, y4, 65535) rg[10] = px[0]; ba[10] = px[1] rg[11] = px[2]; ba[11] = px[3] rg[14] = px[4]; ba[14] = px[5] @@ -1234,25 +1236,25 @@ class GraphicsJSR223Delegate(private val vm: VM) { a4 = readShort() } - var corner = ycocgToRGB(co and 15, (co shr 8) and 15, cg and 15, (cg shr 8) and 15, y1, a1) + var corner = ipf2YcocgToRGB(co and 15, (co shr 8) and 15, cg and 15, (cg shr 8) and 15, y1, a1) rg[0] = corner[0];ba[0] = corner[1] rg[1] = corner[2];ba[1] = corner[3] rg[4] = corner[4];ba[4] = corner[5] rg[5] = corner[6];ba[5] = corner[7] - corner = ycocgToRGB((co shr 4) and 15, (co shr 12) and 15, (cg shr 4) and 15, (cg shr 12) and 15, y2, a2) + corner = ipf2YcocgToRGB((co shr 4) and 15, (co shr 12) and 15, (cg shr 4) and 15, (cg shr 12) and 15, y2, a2) rg[2] = corner[0];ba[2] = corner[1] rg[3] = corner[2];ba[3] = corner[3] rg[6] = corner[4];ba[6] = corner[5] rg[7] = corner[6];ba[7] = corner[7] - corner = ycocgToRGB((co shr 16) and 15, (co shr 24) and 15, (cg shr 16) and 15, (cg shr 24) and 15, y3, a3) + corner = ipf2YcocgToRGB((co shr 16) and 15, (co shr 24) and 15, (cg shr 16) and 15, (cg shr 24) and 15, y3, a3) rg[8] = corner[0];ba[8] = corner[1] rg[9] = corner[2];ba[9] = corner[3] rg[12] = corner[4];ba[12] = corner[5] rg[13] = corner[6];ba[13] = corner[7] - corner = ycocgToRGB((co shr 20) and 15, (co shr 28) and 15, (cg shr 20) and 15, (cg shr 28) and 15, y4, a4) + corner = ipf2YcocgToRGB((co shr 20) and 15, (co shr 28) and 15, (cg shr 20) and 15, (cg shr 28) and 15, y4, a4) rg[10] = corner[0];ba[10] = corner[1] rg[11] = corner[2];ba[11] = corner[3] rg[14] = corner[4];ba[14] = corner[5] @@ -1430,8 +1432,8 @@ class GraphicsJSR223Delegate(private val vm: VM) { val videoX = nativeX * scaleX val videoY = nativeY * scaleY - // Sample RGB values using bilinear interpolation (optimized version) - val rgb = sampleBilinearOptimized(rgbAddr, width, height, videoX, videoY, rgbAddrIncVec) + // Sample RGB values using bilinear interpolation (optimised version) + val rgb = sampleBilinearOptimised(rgbAddr, width, height, videoX, videoY, rgbAddrIncVec) val r = rgb[0] val g = rgb[1] val b = rgb[2] @@ -1457,7 +1459,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { pixelsProcessed += pixelsInChunk } } else { - // Optimized centering logic with bulk memory operations + // Optimised centering logic with bulk memory operations val offsetX = (nativeWidth - width) / 2 val offsetY = (nativeHeight - height) / 2 @@ -1528,7 +1530,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { * Apply Bayer dithering to reduce banding when quantizing to 4-bit */ private fun ditherValue(value: Int, x: Int, y: Int, f: Int): Int { - // Preserve pure values (0 and 255) exactly to maintain color primaries + // Preserve pure values (0 and 255) exactly to maintain colour primaries if (value == 0) return 0 if (value == 255) return 15 @@ -1589,9 +1591,9 @@ class GraphicsJSR223Delegate(private val vm: VM) { } /** - * Optimized bilinear sampling with bulk memory access and caching + * Optimised bilinear sampling with bulk memory access and caching */ - private fun sampleBilinearOptimized(rgbAddr: Long, width: Int, height: Int, x: Float, y: Float, rgbAddrIncVec: Int): IntArray { + private fun sampleBilinearOptimised(rgbAddr: Long, width: Int, height: Int, x: Float, y: Float, rgbAddrIncVec: Int): IntArray { // Clamp coordinates to valid range val clampedX = x.coerceIn(0f, (width - 1).toFloat()) val clampedY = y.coerceIn(0f, (height - 1).toFloat()) @@ -1610,7 +1612,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { val (memspace, baseOffset) = vm.translateAddr(rgbAddr) if (memspace is UnsafePtr && rgbAddrIncVec == 1) { - // Optimized path for user memory with forward addressing + // Optimised path for user memory with forward addressing val y0RowAddr = baseOffset + (y0 * width + x0) * 3 val y1RowAddr = baseOffset + (y1 * width + x0) * 3 @@ -1653,7 +1655,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { val result = IntArray(64) // Reuse preallocated temp buffer to reduce GC pressure for (i in coeffs.indices) { - idct8TempBuffer[i] = coeffs[i] * (quantTable[i] * jpeg_quality_to_mult(qualityIndex * rateControlFactor)).coerceIn(1f, 255f) + tevIdct8TempBuffer[i] = coeffs[i] * (quantTable[i] * jpeg_quality_to_mult(qualityIndex * rateControlFactor)).coerceIn(1f, 255f) } // Fast separable IDCT (row-column decomposition) @@ -1670,7 +1672,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { } sum += dctBasis8[u][col] * coeff } - idct8TempBuffer[row * 8 + col] = sum + tevIdct8TempBuffer[row * 8 + col] = sum } } @@ -1679,7 +1681,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { for (row in 0 until 8) { var sum = 0f for (v in 0 until 8) { - sum += dctBasis8[v][row] * idct8TempBuffer[v * 8 + col] + sum += dctBasis8[v][row] * tevIdct8TempBuffer[v * 8 + col] } val pixel = if (isChromaResidual) { @@ -1714,7 +1716,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { } else { coeffs[idx] * (quantTable[idx] * jpeg_quality_to_mult(qualityIndex * rateControlFactor)).coerceIn(1f, 255f) } - idct16TempBuffer[idx] = coeff + tevIdct16TempBuffer[idx] = coeff } } @@ -1724,9 +1726,9 @@ class GraphicsJSR223Delegate(private val vm: VM) { for (col in 0 until 16) { var sum = 0f for (u in 0 until 16) { - sum += dctBasis16[u][col] * idct16TempBuffer[row * 16 + u] + sum += dctBasis16[u][col] * tevIdct16TempBuffer[row * 16 + u] } - idct16SeparableBuffer[row * 16 + col] = sum + tevIdct16SeparableBuffer[row * 16 + col] = sum } } @@ -1735,7 +1737,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { for (row in 0 until 16) { var sum = 0f for (v in 0 until 16) { - sum += dctBasis16[v][row] * idct16SeparableBuffer[v * 16 + col] + sum += dctBasis16[v][row] * tevIdct16SeparableBuffer[v * 16 + col] } val pixel = (sum + 128f).coerceIn(0f, 255f) result[row * 16 + col] = pixel.toInt() @@ -1754,7 +1756,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { private val interlacedFieldBuffer = IntArray(560 * 224 * 3) // Half-height RGB buffer /** - * YADIF (Yet Another Deinterlacing Filter) implementation - Optimized + * YADIF (Yet Another Deinterlacing Filter) implementation - Optimised * Converts interlaced field to progressive frame with temporal/spatial interpolation */ fun yadifDeinterlace(fieldRGBAddr: Long, outputRGBAddr: Long, width: Int, height: Int, @@ -1861,7 +1863,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } - // Cover up top and bottom lines with border color (optimized) + // Cover up top and bottom lines with border colour (optimised) val destT = 0 val destB = (height - 2) * width * 3 val col = (vm.peek(-1299457)!!.toUint() shl 16) or (vm.peek(-1299458)!!.toUint() shl 8) or vm.peek(-1299459)!!.toUint() @@ -1887,7 +1889,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { for (c in 0..2) { val idx = pixelIdx + c - // Get spatial neighbors + // Get spatial neighbours val above = fieldBuffer[aboveRowIdx + idx].toUint() val below = fieldBuffer[belowRowIdx + idx].toUint() val current = fieldBuffer[rowStartIdx + idx].toUint() @@ -1902,7 +1904,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { val nextPixel = nextBuffer[rowStartIdx + idx].toUint() val tempInterp = (prevPixel + nextPixel) / 2 - // YADIF edge-directed decision (optimized) + // YADIF edge-directed decision (optimised) val spatialDiff = kotlin.math.abs(above.toInt() - below.toInt()) val temporalDiff = kotlin.math.abs(prevPixel.toInt() - nextPixel.toInt()) @@ -1960,7 +1962,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { val interpOutputOffset = (interpLine * width + x) * 3 for (c in 0..2) { - // Get spatial neighbors from sequential field data + // Get spatial neighbours from sequential field data val fieldStride = width * 3 val aboveOffset = fieldOffset - fieldStride + c val belowOffset = fieldOffset + fieldStride + c @@ -1985,7 +1987,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { var interpolatedValue = (above + below) / 2 // Default spatial interpolation if (prevFieldAddr != 0L && nextFieldAddr != 0L) { - // Get temporal neighbors + // Get temporal neighbours val tempFieldOffset = (y * width + x) * 3 + c val prevPixel = (vm.peek(prevFieldAddr + tempFieldOffset * fieldIncVec)?.toInt() ?: current) and 0xFF val nextPixel = (vm.peek(nextFieldAddr + tempFieldOffset * fieldIncVec)?.toInt() ?: current) and 0xFF @@ -1993,7 +1995,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { // BWDIF-inspired temporal differences (adapted for 3-frame window) // Note: True BWDIF uses 5 frames, we adapt to 3-frame constraint - // Get spatial neighbors from previous and next fields for temporal comparison + // Get spatial neighbours from previous and next fields for temporal comparison // Use same addressing pattern as working YADIF implementation val prevAboveOffset = if (y > 0) ((y-1) * width + x) * 3 + c else tempFieldOffset val prevBelowOffset = if (y < fieldHeight - 1) ((y+1) * width + x) * 3 + c else tempFieldOffset @@ -2241,102 +2243,102 @@ class GraphicsJSR223Delegate(private val vm: VM) { } // ENHANCED: Knusperli-inspired boundary discontinuity analysis - fun analyzeBoundaryDiscontinuity(samples: IntArray): Pair { + fun analyseBoundaryDiscontinuity(samples: IntArray): Pair { // samples: 8-pixel samples across the boundary for frequency analysis var delta = 0L var hfPenalty = 0L - + for (u in 0 until 8) { val alpha = kAlphaSqrt2[u] val sign = if (u and 1 != 0) -1 else 1 val leftVal = samples[u] val rightVal = samples[7 - u] // Mirror for boundary analysis - + delta += alpha * (rightVal - sign * leftVal) hfPenalty += (u * u) * (leftVal * leftVal + rightVal * rightVal) } - + return Pair(delta, hfPenalty) } - + // ENHANCED: Adaptive strength based on local complexity fun calculateAdaptiveStrength(baseStrength: Float, hfPenalty: Long, delta: Long): Float { val complexity = kotlin.math.sqrt(hfPenalty.toDouble()).toFloat() val discontinuityMagnitude = kotlin.math.abs(delta).toFloat() - + // Reduce filtering strength in high-frequency areas (preserve detail) val complexityFactor = if (complexity > 800) 0.3f else 1.0f - + // Increase filtering strength for clear discontinuities val discontinuityFactor = kotlin.math.min(2.0f, discontinuityMagnitude / 1000.0f) - + return baseStrength * complexityFactor * discontinuityFactor } - + // ENHANCED: Apply Knusperli-style corrections using linear gradient patterns fun applyBoundaryCorrection( samples: IntArray, delta: Long, adaptiveStrength: Float ): IntArray { val result = samples.clone() val correction = (delta * 724 shr 31).toInt() // Apply sqrt(2)/2 weighting like Knusperli - + // Apply linear gradient corrections across boundary for (i in 0 until 8) { val gradientWeight = kLinearGradient[i] * correction / 1024 // Scale from 10-bit fixed-point val sign = if (i < 4) 1 else -1 // Left/right side weighting - + val adjustment = (gradientWeight * sign * adaptiveStrength).toInt() result[i] = (result[i] + adjustment).coerceIn(0, 255) } - + return result } - + // ENHANCED HORIZONTAL DEBLOCKING: Using Knusperli-inspired boundary analysis for (by in 0 until blocksY) { for (bx in 1 until blocksX) { val blockEdgeX = bx * blockSize if (blockEdgeX >= width) continue - + // Process boundary in chunks for better performance val yStart = by * blockSize val yEnd = minOf((by + 1) * blockSize, height) - + for (y in yStart until yEnd step 2) { // Process 2 lines at a time if (y + 1 >= height) continue - + // Sample 8x2 pixel region across boundary for both lines val samples1 = IntArray(24) // 8 pixels × 3 channels (RGB) val samples2 = IntArray(24) - + for (i in 0 until 8) { val x = blockEdgeX - 4 + i val rgb1 = getPixelBulk(x, y) val rgb2 = getPixelBulk(x, y + 1) - + samples1[i * 3] = rgb1[0] // R - samples1[i * 3 + 1] = rgb1[1] // G + samples1[i * 3 + 1] = rgb1[1] // G samples1[i * 3 + 2] = rgb1[2] // B samples2[i * 3] = rgb2[0] samples2[i * 3 + 1] = rgb2[1] samples2[i * 3 + 2] = rgb2[2] } - - // Analyze each color channel separately + + // Analyse each colour channel separately for (c in 0..2) { val channelSamples1 = IntArray(8) { samples1[it * 3 + c] } val channelSamples2 = IntArray(8) { samples2[it * 3 + c] } - - val (delta1, hfPenalty1) = analyzeBoundaryDiscontinuity(channelSamples1) - val (delta2, hfPenalty2) = analyzeBoundaryDiscontinuity(channelSamples2) - - // Skip if very small discontinuity (early exit optimization) + + val (delta1, hfPenalty1) = analyseBoundaryDiscontinuity(channelSamples1) + val (delta2, hfPenalty2) = analyseBoundaryDiscontinuity(channelSamples2) + + // Skip if very small discontinuity (early exit optimisation) if (kotlin.math.abs(delta1) < 50 && kotlin.math.abs(delta2) < 50) continue - + // Calculate adaptive filtering strength val adaptiveStrength1 = calculateAdaptiveStrength(strength, hfPenalty1, delta1) val adaptiveStrength2 = calculateAdaptiveStrength(strength, hfPenalty2, delta2) - + // Apply corrections if strength is significant if (adaptiveStrength1 > 0.05f) { val corrected1 = applyBoundaryCorrection(channelSamples1, delta1, adaptiveStrength1) @@ -2344,7 +2346,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { samples1[i * 3 + c] = corrected1[i] } } - + if (adaptiveStrength2 > 0.05f) { val corrected2 = applyBoundaryCorrection(channelSamples2, delta2, adaptiveStrength2) for (i in 0 until 8) { @@ -2352,7 +2354,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } } - + // Write back corrected pixels in bulk for (i in 2..5) { // Only write middle 4 pixels to avoid artifacts val x = blockEdgeX - 4 + i @@ -2364,28 +2366,28 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } } - + // ENHANCED VERTICAL DEBLOCKING: Same approach for horizontal block boundaries for (by in 1 until blocksY) { for (bx in 0 until blocksX) { val blockEdgeY = by * blockSize if (blockEdgeY >= height) continue - + val xStart = bx * blockSize val xEnd = minOf((bx + 1) * blockSize, width) - + for (x in xStart until xEnd step 2) { if (x + 1 >= width) continue - + // Sample 8x2 pixel region across vertical boundary val samples1 = IntArray(24) val samples2 = IntArray(24) - + for (i in 0 until 8) { val y = blockEdgeY - 4 + i val rgb1 = getPixelBulk(x, y) val rgb2 = getPixelBulk(x + 1, y) - + samples1[i * 3] = rgb1[0] samples1[i * 3 + 1] = rgb1[1] samples1[i * 3 + 2] = rgb1[2] @@ -2393,27 +2395,27 @@ class GraphicsJSR223Delegate(private val vm: VM) { samples2[i * 3 + 1] = rgb2[1] samples2[i * 3 + 2] = rgb2[2] } - + // Same boundary analysis and correction as horizontal for (c in 0..2) { val channelSamples1 = IntArray(8) { samples1[it * 3 + c] } val channelSamples2 = IntArray(8) { samples2[it * 3 + c] } - - val (delta1, hfPenalty1) = analyzeBoundaryDiscontinuity(channelSamples1) - val (delta2, hfPenalty2) = analyzeBoundaryDiscontinuity(channelSamples2) - + + val (delta1, hfPenalty1) = analyseBoundaryDiscontinuity(channelSamples1) + val (delta2, hfPenalty2) = analyseBoundaryDiscontinuity(channelSamples2) + if (kotlin.math.abs(delta1) < 50 && kotlin.math.abs(delta2) < 50) continue - + val adaptiveStrength1 = calculateAdaptiveStrength(strength, hfPenalty1, delta1) val adaptiveStrength2 = calculateAdaptiveStrength(strength, hfPenalty2, delta2) - + if (adaptiveStrength1 > 0.05f) { val corrected1 = applyBoundaryCorrection(channelSamples1, delta1, adaptiveStrength1) for (i in 0 until 8) { samples1[i * 3 + c] = corrected1[i] } } - + if (adaptiveStrength2 > 0.05f) { val corrected2 = applyBoundaryCorrection(channelSamples2, delta2, adaptiveStrength2) for (i in 0 until 8) { @@ -2421,7 +2423,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } } - + // Write back corrected pixels for (i in 2..5) { val y = blockEdgeY - 4 + i @@ -2436,33 +2438,33 @@ class GraphicsJSR223Delegate(private val vm: VM) { } /** - * Bulk write RGB block data to VM memory + * Bulk write RGB block data to VM memory */ private fun bulkWriteRGB(destAddr: Long, rgbData: IntArray, width: Int, height: Int, startX: Int, startY: Int, blockWidth: Int, blockHeight: Int, addrIncVec: Int) { val (memspace, baseOffset) = vm.translateAddr(destAddr) - + if (memspace is UnsafePtr && addrIncVec == 1) { - // Optimized path for user memory with forward addressing + // Optimised path for user memory with forward addressing for (dy in 0 until blockHeight) { val y = startY + dy if (y >= height) break - + val rowStartX = kotlin.math.max(0, startX) val rowEndX = kotlin.math.min(width, startX + blockWidth) val rowPixels = rowEndX - rowStartX - + if (rowPixels > 0) { val srcRowOffset = dy * blockWidth * 3 + (rowStartX - startX) * 3 val dstRowOffset = baseOffset + (y * width + rowStartX) * 3 val rowBytes = rowPixels * 3 - + // Convert IntArray to ByteArray for this row val rowBuffer = ByteArray(rowBytes) for (i in 0 until rowBytes) { rowBuffer[i] = rgbData[srcRowOffset + i].toByte() } - + // Bulk write the row UnsafeHelper.memcpyRaw( rowBuffer, UnsafeHelper.getArrayOffset(rowBuffer), @@ -2478,7 +2480,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { if (x < width && y < height) { val rgbIdx = (dy * blockWidth + dx) * 3 val bufferOffset = (y.toLong() * width + x) * 3 - + vm.poke(destAddr + bufferOffset * addrIncVec, rgbData[rgbIdx].toByte()) vm.poke(destAddr + (bufferOffset + 1) * addrIncVec, rgbData[rgbIdx + 1].toByte()) vm.poke(destAddr + (bufferOffset + 2) * addrIncVec, rgbData[rgbIdx + 2].toByte()) @@ -2491,7 +2493,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { /** * Hardware-accelerated TEV frame decoder for YCoCg-R 4:2:0 format * Decodes compressed TEV block data directly to framebuffer - * + * * @param blockDataPtr Pointer to decompressed TEV block data * @param currentRGBAddr Address of current frame RGB buffer (24-bit: R,G,B per pixel) * @param prevRGBAddr Address of previous frame RGB buffer (for motion compensation) @@ -2513,7 +2515,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { var readPtr = blockDataPtr - // decide increment "direction" by the sign of the pointer + // decide increment "direction" by the sign of the pointer val prevAddrIncVec = if (prevRGBAddr >= 0) 1 else -1 val thisAddrIncVec = if (currentRGBAddr >= 0) 1 else -1 @@ -2526,13 +2528,13 @@ class GraphicsJSR223Delegate(private val vm: VM) { val blockModes = IntArray(blocksX * blocksY) val motionVectors = Array(blocksX * blocksY) { intArrayOf(0, 0) } val rateControlFactors = FloatArray(blocksX * blocksY) - + // Collect all blocks first var tempReadPtr = readPtr for (by in 0 until blocksY) { for (bx in 0 until blocksX) { val blockIndex = by * blocksX + bx - + // Read TEV block header to get rate control factor val headerBuffer = ByteArray(11) val (memspace, offset) = vm.translateAddr(tempReadPtr) @@ -2545,7 +2547,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { headerBuffer[i] = vm.peek(tempReadPtr + i) ?: 0 } } - + val mode = headerBuffer[0].toUint() val mvX = ((headerBuffer[1].toUint()) or ((headerBuffer[2].toUint()) shl 8)).toShort().toInt() val mvY = ((headerBuffer[3].toUint()) or ((headerBuffer[4].toUint()) shl 8)).toShort().toInt() @@ -2554,20 +2556,20 @@ class GraphicsJSR223Delegate(private val vm: VM) { ((headerBuffer[7].toUint()) shl 16) or ((headerBuffer[8].toUint()) shl 24)) tempReadPtr += 11 // Skip header - + blockModes[blockIndex] = mode.toInt() motionVectors[blockIndex] = intArrayOf(mvX, mvY) rateControlFactors[blockIndex] = rateControlFactor - + // TEV format always has 768 bytes of DCT coefficients per block (fixed size) val coeffShortArray = ShortArray(384) // 256 Y + 64 Co + 64 Cg = 384 shorts - + // Use bulk read like the original implementation vm.bulkPeekShort(tempReadPtr.toInt(), coeffShortArray, 768) tempReadPtr += 768 - + when (mode.toInt()) { - 0x01, 0x02 -> { // INTRA or INTER - store raw coefficients for boundary optimization + 0x01, 0x02 -> { // INTRA or INTER - store raw coefficients for boundary optimisation yBlocks[blockIndex] = coeffShortArray.sliceArray(0 until 256) coBlocks[blockIndex] = coeffShortArray.sliceArray(256 until 320) cgBlocks[blockIndex] = coeffShortArray.sliceArray(320 until 384) @@ -2576,9 +2578,9 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } } - - // PASS 2: Apply proper knusperli boundary optimization (Google's algorithm) - val (optimizedYBlocks, optimizedCoBlocks, optimizedCgBlocks) = applyKnusperliOptimization( + + // PASS 2: Apply proper knusperli boundary optimisation (Google's algorithm) + val (optimisedYBlocks, optimisedCoBlocks, optimisedCgBlocks) = tevApplyKnusperliOptimisation( yBlocks, coBlocks, cgBlocks, if (tevVersion == 3) QUANT_TABLE_Y else QUANT_TABLE_Y, if (tevVersion == 3) QUANT_TABLE_C else QUANT_TABLE_C, @@ -2586,46 +2588,46 @@ class GraphicsJSR223Delegate(private val vm: VM) { qY, qCo, qCg, rateControlFactors, blocksX, blocksY ) - - // PASS 3: Convert optimized blocks to RGB and output + + // PASS 3: Convert optimised blocks to RGB and output for (by in 0 until blocksY) { for (bx in 0 until blocksX) { val blockIndex = by * blocksX + bx val startX = bx * 16 val startY = by * 16 - + when (blockModes[blockIndex]) { 0x00 -> { // SKIP - copy from previous frame - handleSkipBlockTwoPass(startX, startY, currentRGBAddr, prevRGBAddr, width, height, thisAddrIncVec, prevAddrIncVec) + tevHandleSkipBlockTwoPass(startX, startY, currentRGBAddr, prevRGBAddr, width, height, thisAddrIncVec, prevAddrIncVec) } 0x03 -> { // MOTION - copy with motion vector val mv = motionVectors[blockIndex] - handleMotionBlockTwoPass(startX, startY, mv[0], mv[1], currentRGBAddr, prevRGBAddr, width, height, thisAddrIncVec, prevAddrIncVec, debugMotionVectors) + tevHandleMotionBlockTwoPass(startX, startY, mv[0], mv[1], currentRGBAddr, prevRGBAddr, width, height, thisAddrIncVec, prevAddrIncVec, debugMotionVectors) } - 0x01, 0x02 -> { // INTRA/INTER - use optimized DCT blocks - val yBlock = optimizedYBlocks[blockIndex] - val coBlock = optimizedCoBlocks[blockIndex] - val cgBlock = optimizedCgBlocks[blockIndex] - + 0x01, 0x02 -> { // INTRA/INTER - use optimised DCT blocks + val yBlock = optimisedYBlocks[blockIndex] + val coBlock = optimisedCoBlocks[blockIndex] + val cgBlock = optimisedCgBlocks[blockIndex] + if (yBlock != null && coBlock != null && cgBlock != null) { // Skip INTER motion compensation for now (debugging) // TODO: Implement proper motion compensation for two-pass mode // if (blockModes[blockIndex] == 0x02) { // val mv = motionVectors[blockIndex] - // applyMotionCompensationTwoPass(yBlock, coBlock, cgBlock, startX, startY, mv[0], mv[1], prevRGBAddr, width, height, prevAddrIncVec) + // tevApplyMotionCompensationTwoPass(yBlock, coBlock, cgBlock, startX, startY, mv[0], mv[1], prevRGBAddr, width, height, prevAddrIncVec) // } - - // Use IDCT on knusperli-optimized coefficients (coefficients are already optimally dequantized) - val yPixels = tevIdct16x16_fromOptimizedCoeffs(yBlock) - val coPixels = tevIdct8x8_fromOptimizedCoeffs(coBlock) - val cgPixels = tevIdct8x8_fromOptimizedCoeffs(cgBlock) - + + // Use IDCT on knusperli-optimised coefficients (coefficients are already optimally dequantized) + val yPixels = tevIdct16x16_fromOptimisedCoeffs(yBlock) + val coPixels = tevIdct8x8_fromOptimisedCoeffs(coBlock) + val cgPixels = tevIdct8x8_fromOptimisedCoeffs(cgBlock) + val rgbData = if (tevVersion == 3) { tevIctcpToRGB(yPixels, coPixels, cgPixels) } else { tevYcocgToRGB(yPixels, coPixels, cgPixels) } - + bulkWriteRGB(currentRGBAddr, rgbData, width, height, startX, startY, 16, 16, thisAddrIncVec) } } @@ -2662,10 +2664,10 @@ class GraphicsJSR223Delegate(private val vm: VM) { when (mode) { - 0x00 -> { // TEV_MODE_SKIP - copy RGB from previous frame (optimized with memcpy) + 0x00 -> { // TEV_MODE_SKIP - copy RGB from previous frame (optimised with memcpy) // Check if we can copy the entire block at once (no clipping) if (startX + 16 <= width && startY + 16 <= height) { - // Optimized case: copy entire 16x16 block with row-by-row memcpy + // Optimised case: copy entire 16x16 block with row-by-row memcpy for (dy in 0 until 16) { val srcRowOffset = ((startY + dy).toLong() * width + startX) * 3 val dstRowOffset = srcRowOffset @@ -2676,7 +2678,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { ) } } else { - // Optimized fallback using row-by-row copying for boundary blocks + // Optimised fallback using row-by-row copying for boundary blocks for (dy in 0 until 16) { val y = startY + dy if (y < height) { @@ -2703,7 +2705,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { readPtr += 768 } - 0x03 -> { // TEV_MODE_MOTION - motion compensation with RGB (optimized with memcpy) + 0x03 -> { // TEV_MODE_MOTION - motion compensation with RGB (optimised with memcpy) if (debugMotionVectors) { // Debug mode: use original pixel-by-pixel for motion vector visualization for (dy in 0 until 16) { @@ -2717,7 +2719,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { val dstPixelOffset = y.toLong() * width + x val dstRgbOffset = dstPixelOffset * 3 - // Debug: Color INTER blocks by motion vector magnitude + // Debug: Colour INTER blocks by motion vector magnitude val mvMagnitude = kotlin.math.sqrt((mvX * mvX + mvY * mvY).toDouble()).toInt() val intensity = (mvMagnitude * 8).coerceIn(0, 255) // Scale for visibility @@ -2728,7 +2730,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } } else { - // Optimized motion compensation + // Optimised motion compensation val refStartX = startX + mvX val refStartY = startY + mvY @@ -2736,7 +2738,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { if (startX + 16 <= width && startY + 16 <= height && refStartX >= 0 && refStartY >= 0 && refStartX + 16 <= width && refStartY + 16 <= height) { - // Optimized case: copy entire 16x16 block with row-by-row memcpy + // Optimised case: copy entire 16x16 block with row-by-row memcpy for (dy in 0 until 16) { val srcRowOffset = ((refStartY + dy).toLong() * width + refStartX) * 3 val dstRowOffset = ((startY + dy).toLong() * width + startX) * 3 @@ -2797,7 +2799,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { 0x01 -> { // TEV_MODE_INTRA - Full YCoCg-R DCT decode (no motion compensation) // Regular lossy mode: quantized int16 coefficients - // Optimized bulk reading of all DCT coefficients: Y(256×2) + Co(64×2) + Cg(64×2) = 768 bytes + // Optimised bulk reading of all DCT coefficients: Y(256×2) + Co(64×2) + Cg(64×2) = 768 bytes val coeffShortArray = ShortArray(384) // Total coefficients: 256 + 64 + 64 = 384 shorts vm.bulkPeekShort(readPtr.toInt(), coeffShortArray, 768) readPtr += 768 @@ -2821,7 +2823,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { 0x02 -> { // TEV_MODE_INTER - Motion compensation + residual DCT // Step 1: Read residual DCT coefficients - // Optimized bulk reading of all DCT coefficients: Y(256×2) + Co(64×2) + Cg(64×2) = 768 bytes + // Optimised bulk reading of all DCT coefficients: Y(256×2) + Co(64×2) + Cg(64×2) = 768 bytes val coeffShortArray = ShortArray(384) // Total coefficients: 256 + 64 + 64 = 384 shorts vm.bulkPeekShort(readPtr.toInt(), coeffShortArray, 768) readPtr += 768 @@ -2962,7 +2964,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } } else { - // Optimized bulk write for normal operation + // Optimised bulk write for normal operation bulkWriteRGB(currentRGBAddr, finalRgb, width, height, startX, startY, 16, 16, thisAddrIncVec) } } @@ -2990,7 +2992,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } } - + // Apply enhanced deblocking filter if enabled to reduce blocking artifacts if (enableDeblocking) { tevDeblockingFilterEnhanced(currentRGBAddr, width, height) @@ -3031,104 +3033,104 @@ class GraphicsJSR223Delegate(private val vm: VM) { } // Helper functions for motion compensation and block handling in two-pass mode - private fun handleSkipBlockTwoPass(startX: Int, startY: Int, currentRGBAddr: Long, prevRGBAddr: Long, - width: Int, height: Int, thisAddrIncVec: Int, prevAddrIncVec: Int) { + private fun tevHandleSkipBlockTwoPass(startX: Int, startY: Int, currentRGBAddr: Long, prevRGBAddr: Long, + width: Int, height: Int, thisAddrIncVec: Int, prevAddrIncVec: Int) { // Copy 16x16 block from previous frame for (py in 0 until 16) { val y = startY + py if (y >= height) break - + for (px in 0 until 16) { val x = startX + px if (x >= width) break - + val offset = (y * width + x) * 3 val prevR = vm.peek(prevRGBAddr + offset * prevAddrIncVec) ?: 0 val prevG = vm.peek(prevRGBAddr + (offset + 1) * prevAddrIncVec) ?: 0 val prevB = vm.peek(prevRGBAddr + (offset + 2) * prevAddrIncVec) ?: 0 - + vm.poke(currentRGBAddr + offset * thisAddrIncVec, prevR) vm.poke(currentRGBAddr + (offset + 1) * thisAddrIncVec, prevG) vm.poke(currentRGBAddr + (offset + 2) * thisAddrIncVec, prevB) } } } - - private fun handleMotionBlockTwoPass(startX: Int, startY: Int, mvX: Int, mvY: Int, - currentRGBAddr: Long, prevRGBAddr: Long, - width: Int, height: Int, thisAddrIncVec: Int, prevAddrIncVec: Int, - debugMotionVectors: Boolean) { + + private fun tevHandleMotionBlockTwoPass(startX: Int, startY: Int, mvX: Int, mvY: Int, + currentRGBAddr: Long, prevRGBAddr: Long, + width: Int, height: Int, thisAddrIncVec: Int, prevAddrIncVec: Int, + debugMotionVectors: Boolean) { // Copy 16x16 block with motion compensation for (py in 0 until 16) { val y = startY + py if (y >= height) break - + for (px in 0 until 16) { val x = startX + px if (x >= width) break - + val srcX = (x + mvX).coerceIn(0, width - 1) val srcY = (y + mvY).coerceIn(0, height - 1) - + val srcOffset = (srcY * width + srcX) * 3 val dstOffset = (y * width + x) * 3 - + val r = vm.peek(prevRGBAddr + srcOffset * prevAddrIncVec) ?: 0 val g = vm.peek(prevRGBAddr + (srcOffset + 1) * prevAddrIncVec) ?: 0 val b = vm.peek(prevRGBAddr + (srcOffset + 2) * prevAddrIncVec) ?: 0 - + vm.poke(currentRGBAddr + dstOffset * thisAddrIncVec, r) vm.poke(currentRGBAddr + (dstOffset + 1) * thisAddrIncVec, g) vm.poke(currentRGBAddr + (dstOffset + 2) * thisAddrIncVec, b) } } } - - /*private fun applyMotionCompensationTwoPass(yBlock: ShortArray, coBlock: ShortArray, cgBlock: ShortArray, + + /*private fun tevApplyMotionCompensationTwoPass(yBlock: ShortArray, coBlock: ShortArray, cgBlock: ShortArray, startX: Int, startY: Int, mvX: Int, mvY: Int, prevRGBAddr: Long, width: Int, height: Int, prevAddrIncVec: Int) { // For INTER blocks, add residual to motion-compensated reference // This is a simplified version - full implementation would extract reference block and add residuals - + // Apply motion compensation by reading reference pixels and converting to YCoCg-R coefficients for (py in 0 until 16) { val y = startY + py if (y >= height) break - + for (px in 0 until 16) { val x = startX + px if (x >= width) break - + val srcX = (x + mvX).coerceIn(0, width - 1) val srcY = (y + mvY).coerceIn(0, height - 1) - + val srcOffset = (srcY * width + srcX) * 3 val r = vm.peek(prevRGBAddr + srcOffset * prevAddrIncVec)?.toInt() ?: 0 val g = vm.peek(prevRGBAddr + (srcOffset + 1) * prevAddrIncVec)?.toInt() ?: 0 val b = vm.peek(prevRGBAddr + (srcOffset + 2) * prevAddrIncVec)?.toInt() ?: 0 - + // Convert reference RGB to YCoCg-R and add residual val co = r - b - val tmp = b + (co / 2) + val tmp = b + (co / 2) val cg = g - tmp val refY = tmp + (cg / 2) - + val yIdx = py * 16 + px if (yIdx < yBlock.size) { yBlock[yIdx] += refY.toFloat() } - + val cIdx = (py / 2) * 8 + (px / 2) if (cIdx < coBlock.size) { - coBlock[cIdx] += co.toFloat() + coBlock[cIdx] += co.toFloat() cgBlock[cIdx] += cg.toFloat() } } } }*/ - // Proper knusperli boundary-aware DCT optimization based on Google's algorithm - private fun applyKnusperliOptimization( + // Proper knusperli boundary-aware DCT optimisation based on Google's algorithm + private fun tevApplyKnusperliOptimisation( yBlocks: Array, coBlocks: Array, cgBlocks: Array, quantTableY: IntArray, quantTableCo: IntArray, quantTableCg: IntArray, qY: Int, qCo: Int, qCg: Int, rateControlFactors: FloatArray, @@ -3139,19 +3141,19 @@ class GraphicsJSR223Delegate(private val vm: VM) { val kAlphaSqrt2 = intArrayOf(1024, 1448, 1448, 1448, 1448, 1448, 1448, 1448) val kHalfSqrt2 = 724 // sqrt(2)/2 in 10-bit fixed-point - // Convert to dequantized FloatArrays and apply knusperli optimization - val optimizedYBlocks = convertAndOptimize16x16Blocks(yBlocks, quantTableY, qY, rateControlFactors, blocksX, blocksY, kLinearGradient, kAlphaSqrt2, kHalfSqrt2) - val optimizedCoBlocks = convertAndOptimize8x8Blocks(coBlocks, quantTableCo, qCo, rateControlFactors, blocksX, blocksY, kLinearGradient, kAlphaSqrt2, kHalfSqrt2) - val optimizedCgBlocks = convertAndOptimize8x8Blocks(cgBlocks, quantTableCg, qCg, rateControlFactors, blocksX, blocksY, kLinearGradient, kAlphaSqrt2, kHalfSqrt2) + // Convert to dequantized FloatArrays and apply knusperli optimisation + val optimisedYBlocks = tevConvertAndOptimise16x16Blocks(yBlocks, quantTableY, qY, rateControlFactors, blocksX, blocksY, kLinearGradient, kAlphaSqrt2, kHalfSqrt2) + val optimisedCoBlocks = tevConvertAndOptimise8x8Blocks(coBlocks, quantTableCo, qCo, rateControlFactors, blocksX, blocksY, kLinearGradient, kAlphaSqrt2, kHalfSqrt2) + val optimisedCgBlocks = tevConvertAndOptimise8x8Blocks(cgBlocks, quantTableCg, qCg, rateControlFactors, blocksX, blocksY, kLinearGradient, kAlphaSqrt2, kHalfSqrt2) - return Triple(optimizedYBlocks, optimizedCoBlocks, optimizedCgBlocks) + return Triple(optimisedYBlocks, optimisedCoBlocks, optimisedCgBlocks) } - // IDCT functions for knusperli-optimized coefficients (coefficients are already dequantized) - private fun tevIdct16x16_fromOptimizedCoeffs(coeffs: FloatArray): IntArray { + // IDCT functions for knusperli-optimised coefficients (coefficients are already dequantized) + private fun tevIdct16x16_fromOptimisedCoeffs(coeffs: FloatArray): IntArray { val result = IntArray(256) // 16x16 - - // Apply 2D IDCT directly to optimized coefficients (fix u/v indexing) + + // Apply 2D IDCT directly to optimised coefficients (fix u/v indexing) for (y in 0 until 16) { for (x in 0 until 16) { var sum = 0.0 @@ -3170,11 +3172,11 @@ class GraphicsJSR223Delegate(private val vm: VM) { } return result } - - private fun tevIdct8x8_fromOptimizedCoeffs(coeffs: FloatArray): IntArray { + + private fun tevIdct8x8_fromOptimisedCoeffs(coeffs: FloatArray): IntArray { val result = IntArray(64) // 8x8 - - // Apply 2D IDCT directly to optimized coefficients (fix u/v indexing) + + // Apply 2D IDCT directly to optimised coefficients (fix u/v indexing) for (y in 0 until 8) { for (x in 0 until 8) { var sum = 0.0 @@ -3194,31 +3196,31 @@ class GraphicsJSR223Delegate(private val vm: VM) { } return result } - - // Convert and optimize functions for proper knusperli implementation + + // Convert and optimise functions for proper knusperli implementation // Direct 16x16 block processing for Y blocks (no subdivision needed) - private fun convertAndOptimize16x16Blocks( + private fun tevConvertAndOptimise16x16Blocks( blocks: Array, quantTable: IntArray, qScale: Int, rateControlFactors: FloatArray, blocksX: Int, blocksY: Int, kLinearGradient: IntArray, kAlphaSqrt2: IntArray, kHalfSqrt2: Int ): Array { val result = Array(blocks.size) { null } - + // Extended constants for 16x16 blocks (based on Google's 8x8 pattern) val kLinearGradient16 = intArrayOf(318, -285, 81, -32, 17, -9, 5, -2, 1, 0, 0, 0, 0, 0, 0, 0) val kAlphaSqrt2_16 = intArrayOf(1024, 1448, 1448, 1448, 1448, 1448, 1448, 1448, 1448, 1448, 1448, 1448, 1448, 1448, 1448, 1448) - - // Apply knusperli boundary optimization to 16x16 blocks - processBlocksWithKnusperli16x16(blocks, quantTable, qScale, rateControlFactors, + + // Apply knusperli boundary optimisation to 16x16 blocks + tevProcessBlocksWithKnusperli16x16(blocks, quantTable, qScale, rateControlFactors, blocksX, blocksY, kLinearGradient16, kAlphaSqrt2_16, kHalfSqrt2) - - // Convert optimized ShortArray blocks to FloatArray (dequantized) + + // Convert optimised ShortArray blocks to FloatArray (dequantized) for (blockIndex in 0 until blocks.size) { val block = blocks[blockIndex] if (block != null) { result[blockIndex] = FloatArray(256) // 16x16 = 256 coefficients val rateControlFactor = rateControlFactors[blockIndex] - + for (i in 0 until 256) { val coeffIdx = i.coerceIn(0, quantTable.size - 1) val quantValue = if (i == 0) 1.0f else { @@ -3228,32 +3230,32 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } } - + return result } - - // Optimized 16x16 version of Knusperli processing for Y blocks - private fun processBlocksWithKnusperli16x16( + + // Optimised 16x16 version of Knusperli processing for Y blocks + private fun tevProcessBlocksWithKnusperli16x16( blocks: Array, quantTable: IntArray, qScale: Int, rateControlFactors: FloatArray, blocksX: Int, blocksY: Int, kLinearGradient16: IntArray, kAlphaSqrt2_16: IntArray, kHalfSqrt2: Int ) { val coeffsSize = 256 // 16x16 = 256 val numBlocks = blocksX * blocksY - + // OPTIMIZATION 1: Pre-compute quantization values to avoid repeated calculations val quantValues = Array(numBlocks) { IntArray(coeffsSize) } val quantHalfValues = Array(numBlocks) { IntArray(coeffsSize) } - + for (blockIndex in 0 until numBlocks) { val block = blocks[blockIndex] if (block != null) { val rateControlFactor = rateControlFactors[blockIndex] val qualityMult = jpeg_quality_to_mult(qScale * rateControlFactor) - + quantValues[blockIndex][0] = 1 // DC is lossless quantHalfValues[blockIndex][0] = 0 // DC has no quantization interval - + for (i in 1 until coeffsSize) { val coeffIdx = i.coerceIn(0, quantTable.size - 1) val quant = (quantTable[coeffIdx] * qualityMult).coerceIn(1f, 255f).toInt() @@ -3262,11 +3264,11 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } } - + // OPTIMIZATION 2: Use single-allocation arrays with block-stride access val blocksMid = Array(numBlocks) { IntArray(coeffsSize) } val blocksOff = Array(numBlocks) { LongArray(coeffsSize) } // Keep Long for accumulation - + // Step 1: Setup dequantized values and initialize adjustments (BULK OPTIMIZED) for (blockIndex in 0 until numBlocks) { val block = blocks[blockIndex] @@ -3274,37 +3276,37 @@ class GraphicsJSR223Delegate(private val vm: VM) { val mid = blocksMid[blockIndex] val off = blocksOff[blockIndex] val quantVals = quantValues[blockIndex] - + // OPTIMIZATION 9: Bulk dequantization using vectorized operations - bulkDequantizeCoefficients(block, mid, quantVals, coeffsSize) - + tevBulkDequantizeCoefficients(block, mid, quantVals, coeffsSize) + // OPTIMIZATION 10: Bulk zero initialization of adjustments off.fill(0L) } } - + // OPTIMIZATION 7: Combined boundary analysis loops for better cache locality // Process horizontal and vertical boundaries in interleaved pattern for (by in 0 until blocksY) { for (bx in 0 until blocksX) { val currentIndex = by * blocksX + bx - + // Horizontal boundary (if not rightmost column) if (bx < blocksX - 1) { val rightIndex = currentIndex + 1 if (blocks[currentIndex] != null && blocks[rightIndex] != null) { - analyzeHorizontalBoundary16x16( - currentIndex, rightIndex, blocksMid, blocksOff, + tevAnalyseHorizontalBoundary16x16( + currentIndex, rightIndex, blocksMid, blocksOff, kLinearGradient16, kAlphaSqrt2_16 ) } } - + // Vertical boundary (if not bottom row) if (by < blocksY - 1) { val bottomIndex = currentIndex + blocksX if (blocks[currentIndex] != null && blocks[bottomIndex] != null) { - analyzeVerticalBoundary16x16( + tevAnalyseVerticalBoundary16x16( currentIndex, bottomIndex, blocksMid, blocksOff, kLinearGradient16, kAlphaSqrt2_16 ) @@ -3312,13 +3314,13 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } } - + // Step 4: Apply corrections and clamp to quantization intervals (BULK OPTIMIZED) for (blockIndex in 0 until numBlocks) { val block = blocks[blockIndex] if (block != null) { // OPTIMIZATION 11: Bulk apply corrections and quantization clamping - bulkApplyCorrectionsAndClamp( + tevBulkApplyCorrectionsAndClamp( block, blocksMid[blockIndex], blocksOff[blockIndex], quantValues[blockIndex], quantHalfValues[blockIndex], kHalfSqrt2, coeffsSize @@ -3326,20 +3328,20 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } } - + // BULK MEMORY ACCESS HELPER FUNCTIONS FOR KNUSPERLI - + /** * OPTIMIZATION 9: Bulk dequantization using vectorized operations - * Performs coefficient * quantization in optimized chunks + * Performs coefficient * quantization in optimised chunks */ - private fun bulkDequantizeCoefficients( + private fun tevBulkDequantizeCoefficients( coeffs: ShortArray, result: IntArray, quantVals: IntArray, size: Int ) { // Process in chunks of 16 for better vectorization (CPU can process multiple values per instruction) var i = 0 val chunks = size and 0xFFFFFFF0.toInt() // Round down to nearest 16 - + // Bulk process 16 coefficients at a time for SIMD-friendly operations while (i < chunks) { // Manual loop unrolling for better performance @@ -3361,26 +3363,26 @@ class GraphicsJSR223Delegate(private val vm: VM) { result[i + 15] = coeffs[i + 15].toInt() * quantVals[i + 15] i += 16 } - + // Handle remaining coefficients while (i < size) { result[i] = coeffs[i].toInt() * quantVals[i] i++ } } - + /** * OPTIMIZATION 11: Bulk apply corrections and quantization clamping * Vectorized correction application with proper bounds checking */ - private fun bulkApplyCorrectionsAndClamp( + private fun tevBulkApplyCorrectionsAndClamp( block: ShortArray, mid: IntArray, off: LongArray, quantVals: IntArray, quantHalf: IntArray, kHalfSqrt2: Int, size: Int ) { var i = 0 val chunks = size and 0xFFFFFFF0.toInt() // Process in chunks of 16 - + // Bulk process corrections in chunks for better CPU pipeline utilization while (i < chunks) { // Apply corrections with sqrt(2)/2 weighting - bulk operations @@ -3392,7 +3394,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { val corr5 = ((off[i + 5] * kHalfSqrt2) shr 31).toInt() val corr6 = ((off[i + 6] * kHalfSqrt2) shr 31).toInt() val corr7 = ((off[i + 7] * kHalfSqrt2) shr 31).toInt() - + mid[i] += corr0 mid[i + 1] += corr1 mid[i + 2] += corr2 @@ -3401,7 +3403,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { mid[i + 5] += corr5 mid[i + 6] += corr6 mid[i + 7] += corr7 - + // Apply quantization interval clamping - bulk operations val orig0 = block[i].toInt() * quantVals[i] val orig1 = block[i + 1].toInt() * quantVals[i + 1] @@ -3411,7 +3413,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { val orig5 = block[i + 5].toInt() * quantVals[i + 5] val orig6 = block[i + 6].toInt() * quantVals[i + 6] val orig7 = block[i + 7].toInt() * quantVals[i + 7] - + mid[i] = mid[i].coerceIn(orig0 - quantHalf[i], orig0 + quantHalf[i]) mid[i + 1] = mid[i + 1].coerceIn(orig1 - quantHalf[i + 1], orig1 + quantHalf[i + 1]) mid[i + 2] = mid[i + 2].coerceIn(orig2 - quantHalf[i + 2], orig2 + quantHalf[i + 2]) @@ -3420,7 +3422,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { mid[i + 5] = mid[i + 5].coerceIn(orig5 - quantHalf[i + 5], orig5 + quantHalf[i + 5]) mid[i + 6] = mid[i + 6].coerceIn(orig6 - quantHalf[i + 6], orig6 + quantHalf[i + 6]) mid[i + 7] = mid[i + 7].coerceIn(orig7 - quantHalf[i + 7], orig7 + quantHalf[i + 7]) - + // Convert back to quantized coefficients - bulk operations val quantMax = Short.MAX_VALUE.toInt() val quantMin = Short.MIN_VALUE.toInt() @@ -3432,24 +3434,24 @@ class GraphicsJSR223Delegate(private val vm: VM) { block[i + 5] = (mid[i + 5] / quantVals[i + 5]).coerceIn(quantMin, quantMax).toShort() block[i + 6] = (mid[i + 6] / quantVals[i + 6]).coerceIn(quantMin, quantMax).toShort() block[i + 7] = (mid[i + 7] / quantVals[i + 7]).coerceIn(quantMin, quantMax).toShort() - + i += 8 // Process 8 at a time for the remaining corrections } - + // Handle remaining coefficients (usually 0-15 remaining for 256-coefficient blocks) while (i < size) { mid[i] += ((off[i] * kHalfSqrt2) shr 31).toInt() - + val originalValue = block[i].toInt() * quantVals[i] mid[i] = mid[i].coerceIn(originalValue - quantHalf[i], originalValue + quantHalf[i]) - + block[i] = (mid[i] / quantVals[i]).coerceIn(Short.MIN_VALUE.toInt(), Short.MAX_VALUE.toInt()).toShort() i++ } } - - // OPTIMIZED 16x16 horizontal boundary analysis - private fun analyzeHorizontalBoundary16x16( + + // OPTIMIZED 16x16 horizontal boundary analysis + private fun tevAnalyseHorizontalBoundary16x16( leftBlockIndex: Int, rightBlockIndex: Int, blocksMid: Array, blocksOff: Array, kLinearGradient16: IntArray, kAlphaSqrt2_16: IntArray @@ -3458,13 +3460,13 @@ class GraphicsJSR223Delegate(private val vm: VM) { val rightMid = blocksMid[rightBlockIndex] val leftOff = blocksOff[leftBlockIndex] val rightOff = blocksOff[rightBlockIndex] - + // OPTIMIZATION 4: Process multiple frequencies in single loop for better cache locality for (v in 0 until 8) { // Only low-to-mid frequencies var deltaV = 0L var hfPenalty = 0L val vOffset = v * 16 - + // First pass: Calculate boundary discontinuity for (u in 0 until 16) { val idx = vOffset + u @@ -3472,17 +3474,17 @@ class GraphicsJSR223Delegate(private val vm: VM) { val sign = if (u and 1 != 0) -1 else 1 val gi = leftMid[idx] val gj = rightMid[idx] - + deltaV += alpha * (gj - sign * gi) hfPenalty += (u * u) * (gi * gi + gj * gj) } - - // OPTIMIZATION 8: Early exit for very small adjustments + + // OPTIMIZATION 8: Early exit for very small adjustments if (kotlin.math.abs(deltaV) < 100) continue - + // OPTIMIZATION 5: Apply high-frequency damping once per frequency band if (hfPenalty > 1600) deltaV /= 2 - + // Second pass: Apply corrections (BULK OPTIMIZED with unrolling) val correction = deltaV // Bulk apply corrections for 16 coefficients - manually unrolled for performance @@ -3520,9 +3522,9 @@ class GraphicsJSR223Delegate(private val vm: VM) { rightOff[vOffset + 15] -= correction * kLinearGradient16[15] } } - - // OPTIMIZED 16x16 vertical boundary analysis - private fun analyzeVerticalBoundary16x16( + + // OPTIMIZED 16x16 vertical boundary analysis + private fun tevAnalyseVerticalBoundary16x16( topBlockIndex: Int, bottomBlockIndex: Int, blocksMid: Array, blocksOff: Array, kLinearGradient16: IntArray, kAlphaSqrt2_16: IntArray @@ -3531,12 +3533,12 @@ class GraphicsJSR223Delegate(private val vm: VM) { val bottomMid = blocksMid[bottomBlockIndex] val topOff = blocksOff[topBlockIndex] val bottomOff = blocksOff[bottomBlockIndex] - - // OPTIMIZATION 6: Optimized vertical analysis with better cache access pattern + + // OPTIMIZATION 6: Optimised vertical analysis with better cache access pattern for (u in 0 until 16) { // Only low-to-mid frequencies var deltaU = 0L var hfPenalty = 0L - + // First pass: Calculate boundary discontinuity for (v in 0 until 16) { val idx = v * 16 + u @@ -3544,17 +3546,17 @@ class GraphicsJSR223Delegate(private val vm: VM) { val sign = if (v and 1 != 0) -1 else 1 val gi = topMid[idx] val gj = bottomMid[idx] - + deltaU += alpha * (gj - sign * gi) hfPenalty += (v * v) * (gi * gi + gj * gj) } - + // Early exit for very small adjustments if (kotlin.math.abs(deltaU) < 100) continue - + // Apply high-frequency damping once per frequency band if (hfPenalty > 1600) deltaU /= 2 - + // Second pass: Apply corrections (BULK OPTIMIZED vertical) val correction = deltaU // Bulk apply corrections for 16 vertical coefficients - manually unrolled @@ -3593,27 +3595,27 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } - private fun convertAndOptimize8x8Blocks( + private fun tevConvertAndOptimise8x8Blocks( blocks: Array, quantTable: IntArray, qScale: Int, rateControlFactors: FloatArray, blocksX: Int, blocksY: Int, kLinearGradient: IntArray, kAlphaSqrt2: IntArray, kHalfSqrt2: Int ): Array { val coeffsSize = 64 val numBlocks = blocksX * blocksY - + // Step 1: Setup quantization intervals for all blocks (using integers like Google's code) val blocksMid = Array(numBlocks) { IntArray(coeffsSize) } val blocksMin = Array(numBlocks) { IntArray(coeffsSize) } val blocksMax = Array(numBlocks) { IntArray(coeffsSize) } val blocksOff = Array(numBlocks) { LongArray(coeffsSize) } // Long for accumulation - + for (blockIndex in 0 until numBlocks) { val block = blocks[blockIndex] if (block != null) { val rateControlFactor = rateControlFactors[blockIndex] for (i in 0 until coeffsSize) { val quantIdx = i.coerceIn(0, quantTable.size - 1) - + if (i == 0) { // DC coefficient: lossless (no quantization) val dcValue = block[i].toInt() @@ -3623,53 +3625,53 @@ class GraphicsJSR223Delegate(private val vm: VM) { } else { // AC coefficients: use quantization intervals val quant = (quantTable[quantIdx] * jpeg_quality_to_mult(qScale * rateControlFactor)).coerceIn(1f, 255f).toInt() - + // Standard dequantized value (midpoint) blocksMid[blockIndex][i] = block[i].toInt() * quant - + // Quantization interval bounds val halfQuant = quant / 2 blocksMin[blockIndex][i] = blocksMid[blockIndex][i] - halfQuant blocksMax[blockIndex][i] = blocksMid[blockIndex][i] + halfQuant } - + // Initialize adjustment accumulator blocksOff[blockIndex][i] = 0L } } } - + // Step 2: Horizontal continuity analysis for (by in 0 until blocksY) { for (bx in 0 until blocksX - 1) { val leftBlockIndex = by * blocksX + bx val rightBlockIndex = by * blocksX + (bx + 1) - + if (blocks[leftBlockIndex] != null && blocks[rightBlockIndex] != null) { - analyzeHorizontalBoundary( - leftBlockIndex, rightBlockIndex, blocksMid, blocksOff, + tevAnalyseHorizontalBoundary8x8( + leftBlockIndex, rightBlockIndex, blocksMid, blocksOff, kLinearGradient, kAlphaSqrt2 ) } } } - - // Step 3: Vertical continuity analysis + + // Step 3: Vertical continuity analysis for (by in 0 until blocksY - 1) { for (bx in 0 until blocksX) { val topBlockIndex = by * blocksX + bx val bottomBlockIndex = (by + 1) * blocksX + bx - + if (blocks[topBlockIndex] != null && blocks[bottomBlockIndex] != null) { - analyzeVerticalBoundary( + tevAnalyseVerticalBoundary8x8( topBlockIndex, bottomBlockIndex, blocksMid, blocksOff, kLinearGradient, kAlphaSqrt2 ) } } } - - // Step 4: Apply corrections and return optimized dequantized coefficients + + // Step 4: Apply corrections and return optimised dequantized coefficients val result = Array(blocks.size) { null } for (blockIndex in 0 until numBlocks) { val block = blocks[blockIndex] @@ -3677,23 +3679,23 @@ class GraphicsJSR223Delegate(private val vm: VM) { result[blockIndex] = FloatArray(coeffsSize) { i -> // Apply corrections with sqrt(2)/2 weighting (Google's exact formula with right shift) blocksMid[blockIndex][i] += ((blocksOff[blockIndex][i] * kHalfSqrt2) shr 31).toInt() - + // Clamp to quantization interval bounds - val optimizedValue = blocksMid[blockIndex][i].coerceIn( - blocksMin[blockIndex][i], + val optimisedValue = blocksMid[blockIndex][i].coerceIn( + blocksMin[blockIndex][i], blocksMax[blockIndex][i] ) - - optimizedValue.toFloat() + + optimisedValue.toFloat() } } } - + return result } // BULK OPTIMIZED 8x8 horizontal boundary analysis for chroma channels - private fun analyzeHorizontalBoundary( + private fun tevAnalyseHorizontalBoundary8x8( leftBlockIndex: Int, rightBlockIndex: Int, blocksMid: Array, blocksOff: Array, kLinearGradient: IntArray, kAlphaSqrt2: IntArray @@ -3702,13 +3704,13 @@ class GraphicsJSR223Delegate(private val vm: VM) { val rightMid = blocksMid[rightBlockIndex] val leftOff = blocksOff[leftBlockIndex] val rightOff = blocksOff[rightBlockIndex] - + // OPTIMIZATION 12: Process 8x8 boundaries with bulk operations (v < 4 for low-to-mid frequencies) for (v in 0 until 4) { // Only low-to-mid frequencies for 8x8 var deltaV = 0L var hfPenalty = 0L val vOffset = v * 8 - + // First pass: Calculate boundary discontinuity for (u in 0 until 8) { val idx = vOffset + u @@ -3716,17 +3718,17 @@ class GraphicsJSR223Delegate(private val vm: VM) { val sign = if (u and 1 != 0) -1 else 1 val gi = leftMid[idx] val gj = rightMid[idx] - + deltaV += alpha * (gj - sign * gi) hfPenalty += (u * u) * (gi * gi + gj * gj) } - + // Early exit for very small adjustments if (kotlin.math.abs(deltaV) < 100) continue - + // Apply high-frequency damping once per frequency band if (hfPenalty > 400) deltaV /= 2 // 8x8 threshold - + // Second pass: Apply corrections (BULK OPTIMIZED with unrolling for 8x8) val correction = deltaV // Bulk apply corrections for 8 coefficients - manually unrolled for performance @@ -3748,9 +3750,9 @@ class GraphicsJSR223Delegate(private val vm: VM) { rightOff[vOffset + 7] -= correction * kLinearGradient[7] } } - + // BULK OPTIMIZED 8x8 vertical boundary analysis for chroma channels - private fun analyzeVerticalBoundary( + private fun tevAnalyseVerticalBoundary8x8( topBlockIndex: Int, bottomBlockIndex: Int, blocksMid: Array, blocksOff: Array, kLinearGradient: IntArray, kAlphaSqrt2: IntArray @@ -3760,7 +3762,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { val topOff = blocksOff[topBlockIndex] val bottomOff = blocksOff[bottomBlockIndex] - // OPTIMIZATION 13: Optimized vertical analysis for 8x8 with better cache access pattern + // OPTIMIZATION 13: Optimised vertical analysis for 8x8 with better cache access pattern for (u in 0 until 4) { // Only low-to-mid frequencies for 8x8 var deltaU = 0L var hfPenalty = 0L @@ -3806,7 +3808,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { } // ================= TAV (TSVM Advanced Video) Decoder ================= - // DWT-based video codec with ICtCp color space support + // DWT-based video codec with ICtCp colour space support fun tavDecode(blockDataPtr: Long, currentRGBAddr: Long, prevRGBAddr: Long, width: Int, height: Int, qY: Int, qCo: Int, qCg: Int, frameCounter: Int, @@ -3817,8 +3819,8 @@ class GraphicsJSR223Delegate(private val vm: VM) { var readPtr = blockDataPtr try { - val tilesX = (width + 111) / 112 // 112x112 tiles - val tilesY = (height + 111) / 112 + val tilesX = (width + TAV_TILE_SIZE_X - 1) / TAV_TILE_SIZE_X // 280x224 tiles + val tilesY = (height + TAV_TILE_SIZE_Y - 1) / TAV_TILE_SIZE_Y // Process each tile for (tileY in 0 until tilesY) { @@ -3836,25 +3838,25 @@ class GraphicsJSR223Delegate(private val vm: VM) { when (mode) { 0x00 -> { // TAV_MODE_SKIP - // Copy 112x112 tile from previous frame to current frame - copyTile112x112RGB(tileX, tileY, currentRGBAddr, prevRGBAddr, width, height) + // Copy 280x224 tile from previous frame to current frame + tavCopyTileRGB(tileX, tileY, currentRGBAddr, prevRGBAddr, width, height) } 0x01 -> { // TAV_MODE_INTRA // Decode DWT coefficients directly to RGB buffer - readPtr = decodeDWTIntraTileRGB(readPtr, tileX, tileY, currentRGBAddr, + readPtr = tavDecodeDWTIntraTileRGB(readPtr, tileX, tileY, currentRGBAddr, width, height, qY, qCo, qCg, rcf, waveletFilter, decompLevels, isLossless, tavVersion) } 0x02 -> { // TAV_MODE_INTER // Motion compensation + DWT residual to RGB buffer - readPtr = decodeDWTInterTileRGB(readPtr, tileX, tileY, mvX, mvY, + readPtr = tavDecodeDWTInterTileRGB(readPtr, tileX, tileY, mvX, mvY, currentRGBAddr, prevRGBAddr, width, height, qY, qCo, qCg, rcf, waveletFilter, decompLevels, isLossless, tavVersion) } 0x03 -> { // TAV_MODE_MOTION // Motion compensation only (no residual) - applyMotionCompensation112x112RGB(tileX, tileY, mvX, mvY, + tavApplyMotionCompensationRGB(tileX, tileY, mvX, mvY, currentRGBAddr, prevRGBAddr, width, height) } } @@ -3866,20 +3868,19 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } - private fun decodeDWTIntraTileRGB(readPtr: Long, tileX: Int, tileY: Int, currentRGBAddr: Long, - width: Int, height: Int, qY: Int, qCo: Int, qCg: Int, rcf: Float, - waveletFilter: Int, decompLevels: Int, isLossless: Boolean, tavVersion: Int): Long { - // Now reading padded coefficient tiles (176x176) instead of core tiles (112x112) - val paddedSize = PADDED_TILE_SIZE - val paddedCoeffCount = paddedSize * paddedSize + private fun tavDecodeDWTIntraTileRGB(readPtr: Long, tileX: Int, tileY: Int, currentRGBAddr: Long, + width: Int, height: Int, qY: Int, qCo: Int, qCg: Int, rcf: Float, + waveletFilter: Int, decompLevels: Int, isLossless: Boolean, tavVersion: Int): Long { + // Now reading padded coefficient tiles (344x288) instead of core tiles (280x224) + val paddedCoeffCount = TAV_PADDED_TILE_SIZE_X * TAV_PADDED_TILE_SIZE_Y var ptr = readPtr - // Read quantized DWT coefficients for padded tile Y, Co, Cg channels (176x176) + // Read quantized DWT coefficients for padded tile Y, Co, Cg channels (344x288) val quantizedY = ShortArray(paddedCoeffCount) val quantizedCo = ShortArray(paddedCoeffCount) val quantizedCg = ShortArray(paddedCoeffCount) - // OPTIMIZATION: Bulk read all coefficient data (176x176 * 3 channels * 2 bytes = 185,856 bytes) + // OPTIMIZATION: Bulk read all coefficient data (344x288 * 3 channels * 2 bytes = 594,432 bytes) val totalCoeffBytes = paddedCoeffCount * 3 * 2L // 3 channels, 2 bytes per short val coeffBuffer = ByteArray(totalCoeffBytes.toInt()) UnsafeHelper.memcpyRaw(null, vm.usermem.ptr + ptr, coeffBuffer, UnsafeHelper.getArrayOffset(coeffBuffer), totalCoeffBytes) @@ -3901,7 +3902,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { ptr += totalCoeffBytes.toInt() - // Dequantize padded coefficient tiles (176x176) + // Dequantize padded coefficient tiles (344x288) val yPaddedTile = FloatArray(paddedCoeffCount) val coPaddedTile = FloatArray(paddedCoeffCount) val cgPaddedTile = FloatArray(paddedCoeffCount) @@ -3912,26 +3913,26 @@ class GraphicsJSR223Delegate(private val vm: VM) { cgPaddedTile[i] = quantizedCg[i] * qCg * rcf } - // Apply inverse DWT on full padded tiles (176x176) + // Apply inverse DWT on full padded tiles (344x288) if (isLossless) { - applyDWTInverseMultiLevel(yPaddedTile, paddedSize, paddedSize, decompLevels, 0) - applyDWTInverseMultiLevel(coPaddedTile, paddedSize, paddedSize, decompLevels, 0) - applyDWTInverseMultiLevel(cgPaddedTile, paddedSize, paddedSize, decompLevels, 0) + tavApplyDWTInverseMultiLevel(yPaddedTile, TAV_PADDED_TILE_SIZE_X, TAV_PADDED_TILE_SIZE_Y, decompLevels, 0) + tavApplyDWTInverseMultiLevel(coPaddedTile, TAV_PADDED_TILE_SIZE_X, TAV_PADDED_TILE_SIZE_Y, decompLevels, 0) + tavApplyDWTInverseMultiLevel(cgPaddedTile, TAV_PADDED_TILE_SIZE_X, TAV_PADDED_TILE_SIZE_Y, decompLevels, 0) } else { - applyDWTInverseMultiLevel(yPaddedTile, paddedSize, paddedSize, decompLevels, waveletFilter) - applyDWTInverseMultiLevel(coPaddedTile, paddedSize, paddedSize, decompLevels, waveletFilter) - applyDWTInverseMultiLevel(cgPaddedTile, paddedSize, paddedSize, decompLevels, waveletFilter) + tavApplyDWTInverseMultiLevel(yPaddedTile, TAV_PADDED_TILE_SIZE_X, TAV_PADDED_TILE_SIZE_Y, decompLevels, waveletFilter) + tavApplyDWTInverseMultiLevel(coPaddedTile, TAV_PADDED_TILE_SIZE_X, TAV_PADDED_TILE_SIZE_Y, decompLevels, waveletFilter) + tavApplyDWTInverseMultiLevel(cgPaddedTile, TAV_PADDED_TILE_SIZE_X, TAV_PADDED_TILE_SIZE_Y, decompLevels, waveletFilter) } - // Extract core 112x112 pixels from reconstructed padded tiles (176x176) - val yTile = FloatArray(TILE_SIZE * TILE_SIZE) - val coTile = FloatArray(TILE_SIZE * TILE_SIZE) - val cgTile = FloatArray(TILE_SIZE * TILE_SIZE) + // Extract core 280x224 pixels from reconstructed padded tiles (344x288) + val yTile = FloatArray(TAV_TILE_SIZE_X * TAV_TILE_SIZE_Y) + val coTile = FloatArray(TAV_TILE_SIZE_X * TAV_TILE_SIZE_Y) + val cgTile = FloatArray(TAV_TILE_SIZE_X * TAV_TILE_SIZE_Y) - for (y in 0 until TILE_SIZE) { - for (x in 0 until TILE_SIZE) { - val coreIdx = y * TILE_SIZE + x - val paddedIdx = (y + TILE_MARGIN) * paddedSize + (x + TILE_MARGIN) + for (y in 0 until TAV_TILE_SIZE_Y) { + for (x in 0 until TAV_TILE_SIZE_X) { + val coreIdx = y * TAV_TILE_SIZE_X + x + val paddedIdx = (y + TAV_TILE_MARGIN) * TAV_PADDED_TILE_SIZE_X + (x + TAV_TILE_MARGIN) yTile[coreIdx] = yPaddedTile[paddedIdx] coTile[coreIdx] = coPaddedTile[paddedIdx] @@ -3941,28 +3942,27 @@ class GraphicsJSR223Delegate(private val vm: VM) { // Convert to RGB based on TAV version (YCoCg-R for v1, ICtCp for v2) if (tavVersion == 2) { - convertICtCpTileToRGB(tileX, tileY, yTile, coTile, cgTile, currentRGBAddr, width, height) + tavConvertICtCpTileToRGB(tileX, tileY, yTile, coTile, cgTile, currentRGBAddr, width, height) } else { - convertYCoCgTileToRGB(tileX, tileY, yTile, coTile, cgTile, currentRGBAddr, width, height) + tavConvertYCoCgTileToRGB(tileX, tileY, yTile, coTile, cgTile, currentRGBAddr, width, height) } return ptr } - private fun convertYCoCgTileToRGB(tileX: Int, tileY: Int, yTile: FloatArray, coTile: FloatArray, cgTile: FloatArray, - rgbAddr: Long, width: Int, height: Int) { - val tileSize = 112 - val startX = tileX * tileSize - val startY = tileY * tileSize + private fun tavConvertYCoCgTileToRGB(tileX: Int, tileY: Int, yTile: FloatArray, coTile: FloatArray, cgTile: FloatArray, + rgbAddr: Long, width: Int, height: Int) { + val startX = tileX * TAV_TILE_SIZE_X + val startY = tileY * TAV_TILE_SIZE_Y // OPTIMIZATION: Process pixels row by row with bulk copying for better cache locality - for (y in 0 until tileSize) { + for (y in 0 until TAV_TILE_SIZE_Y) { val frameY = startY + y if (frameY >= height) break // Calculate valid pixel range for this row val validStartX = maxOf(0, startX) - val validEndX = minOf(width, startX + tileSize) + val validEndX = minOf(width, startX + TAV_TILE_SIZE_X) val validPixelsInRow = validEndX - validStartX if (validPixelsInRow > 0) { @@ -3971,7 +3971,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { var bufferIdx = 0 for (x in validStartX until validEndX) { - val tileIdx = y * tileSize + (x - startX) + val tileIdx = y * TAV_TILE_SIZE_X + (x - startX) // YCoCg-R to RGB conversion (exact inverse of encoder) val Y = yTile[tileIdx] @@ -3997,20 +3997,19 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } - private fun convertICtCpTileToRGB(tileX: Int, tileY: Int, iTile: FloatArray, ctTile: FloatArray, cpTile: FloatArray, - rgbAddr: Long, width: Int, height: Int) { - val tileSize = 112 - val startX = tileX * tileSize - val startY = tileY * tileSize + private fun tavConvertICtCpTileToRGB(tileX: Int, tileY: Int, iTile: FloatArray, ctTile: FloatArray, cpTile: FloatArray, + rgbAddr: Long, width: Int, height: Int) { + val startX = tileX * TAV_TILE_SIZE_X + val startY = tileY * TAV_TILE_SIZE_Y // OPTIMIZATION: Process pixels row by row with bulk copying for better cache locality - for (y in 0 until tileSize) { + for (y in 0 until TAV_TILE_SIZE_Y) { val frameY = startY + y if (frameY >= height) break // Calculate valid pixel range for this row val validStartX = maxOf(0, startX) - val validEndX = minOf(width, startX + tileSize) + val validEndX = minOf(width, startX + TAV_TILE_SIZE_X) val validPixelsInRow = validEndX - validStartX if (validPixelsInRow > 0) { @@ -4019,7 +4018,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { var bufferIdx = 0 for (x in validStartX until validEndX) { - val tileIdx = y * tileSize + (x - startX) + val tileIdx = y * TAV_TILE_SIZE_X + (x - startX) // ICtCp to sRGB conversion (adapted from encoder ICtCp functions) val I = iTile[tileIdx].toDouble() / 255.0 @@ -4059,19 +4058,18 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } - private fun addYCoCgResidualToRGBTile(tileX: Int, tileY: Int, yRes: FloatArray, coRes: FloatArray, cgRes: FloatArray, - rgbAddr: Long, width: Int, height: Int) { - val tileSize = 112 - val startX = tileX * tileSize - val startY = tileY * tileSize + private fun tavAddYCoCgResidualToRGBTile(tileX: Int, tileY: Int, yRes: FloatArray, coRes: FloatArray, cgRes: FloatArray, + rgbAddr: Long, width: Int, height: Int) { + val startX = tileX * TAV_TILE_SIZE_X + val startY = tileY * TAV_TILE_SIZE_Y - for (y in 0 until tileSize) { - for (x in 0 until tileSize) { + for (y in 0 until TAV_TILE_SIZE_Y) { + for (x in 0 until TAV_TILE_SIZE_X) { val frameX = startX + x val frameY = startY + y if (frameX < width && frameY < height) { - val tileIdx = y * tileSize + x + val tileIdx = y * TAV_TILE_SIZE_X + x val pixelIdx = frameY * width + frameX val rgbOffset = pixelIdx * 3L @@ -4106,19 +4104,18 @@ class GraphicsJSR223Delegate(private val vm: VM) { } // Helper functions (simplified versions of existing DWT functions) - private fun copyTile112x112RGB(tileX: Int, tileY: Int, currentRGBAddr: Long, prevRGBAddr: Long, width: Int, height: Int) { - val tileSize = 112 - val startX = tileX * tileSize - val startY = tileY * tileSize + private fun tavCopyTileRGB(tileX: Int, tileY: Int, currentRGBAddr: Long, prevRGBAddr: Long, width: Int, height: Int) { + val startX = tileX * TAV_TILE_SIZE_X + val startY = tileY * TAV_TILE_SIZE_Y // OPTIMIZATION: Copy entire rows at once for maximum performance - for (y in 0 until tileSize) { + for (y in 0 until TAV_TILE_SIZE_Y) { val frameY = startY + y if (frameY >= height) break // Calculate valid pixel range for this row val validStartX = maxOf(0, startX) - val validEndX = minOf(width, startX + tileSize) + val validEndX = minOf(width, startX + TAV_TILE_SIZE_X) val validPixelsInRow = validEndX - validStartX if (validPixelsInRow > 0) { @@ -4135,32 +4132,31 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } - private fun decodeDWTInterTileRGB(readPtr: Long, tileX: Int, tileY: Int, mvX: Int, mvY: Int, - currentRGBAddr: Long, prevRGBAddr: Long, - width: Int, height: Int, qY: Int, qCo: Int, qCg: Int, rcf: Float, - waveletFilter: Int, decompLevels: Int, isLossless: Boolean, tavVersion: Int): Long { + private fun tavDecodeDWTInterTileRGB(readPtr: Long, tileX: Int, tileY: Int, mvX: Int, mvY: Int, + currentRGBAddr: Long, prevRGBAddr: Long, + width: Int, height: Int, qY: Int, qCo: Int, qCg: Int, rcf: Float, + waveletFilter: Int, decompLevels: Int, isLossless: Boolean, tavVersion: Int): Long { // Step 1: Apply motion compensation - applyMotionCompensation112x112RGB(tileX, tileY, mvX, mvY, currentRGBAddr, prevRGBAddr, width, height) + tavApplyMotionCompensationRGB(tileX, tileY, mvX, mvY, currentRGBAddr, prevRGBAddr, width, height) // Step 2: Add DWT residual (same as intra but add to existing pixels) - return decodeDWTIntraTileRGB(readPtr, tileX, tileY, currentRGBAddr, width, height, qY, qCo, qCg, rcf, + return tavDecodeDWTIntraTileRGB(readPtr, tileX, tileY, currentRGBAddr, width, height, qY, qCo, qCg, rcf, waveletFilter, decompLevels, isLossless, tavVersion) } - private fun applyMotionCompensation112x112RGB(tileX: Int, tileY: Int, mvX: Int, mvY: Int, - currentRGBAddr: Long, prevRGBAddr: Long, + private fun tavApplyMotionCompensationRGB(tileX: Int, tileY: Int, mvX: Int, mvY: Int, + currentRGBAddr: Long, prevRGBAddr: Long, width: Int, height: Int) { - val tileSize = 112 - val startX = tileX * tileSize - val startY = tileY * tileSize + val startX = tileX * TAV_TILE_SIZE_X + val startY = tileY * TAV_TILE_SIZE_Y // Motion vectors in quarter-pixel precision val refX = startX + (mvX / 4.0f) val refY = startY + (mvY / 4.0f) - for (y in 0 until tileSize) { - for (x in 0 until tileSize) { + for (y in 0 until TAV_TILE_SIZE_Y) { + for (x in 0 until TAV_TILE_SIZE_X) { val currentPixelIdx = (startY + y) * width + (startX + x) if (currentPixelIdx >= 0 && currentPixelIdx < width * height) { @@ -4168,7 +4164,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { val srcX = refX + x val srcY = refY + y - val interpolatedRGB = bilinearInterpolateRGB(prevRGBAddr, width, height, srcX, srcY) + val interpolatedRGB = tavBilinearInterpolateRGB(prevRGBAddr, width, height, srcX, srcY) val rgbOffset = currentPixelIdx * 3L vm.poke(currentRGBAddr + rgbOffset, interpolatedRGB[0]) @@ -4179,7 +4175,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } - private fun bilinearInterpolateRGB(rgbPtr: Long, width: Int, height: Int, x: Float, y: Float): ByteArray { + private fun tavBilinearInterpolateRGB(rgbPtr: Long, width: Int, height: Int, x: Float, y: Float): ByteArray { val x0 = kotlin.math.floor(x).toInt() val y0 = kotlin.math.floor(y).toInt() val x1 = x0 + 1 @@ -4220,19 +4216,20 @@ class GraphicsJSR223Delegate(private val vm: VM) { ) } - private fun applyDWTInverseMultiLevel(data: FloatArray, width: Int, height: Int, levels: Int, filterType: Int) { + private fun tavApplyDWTInverseMultiLevel(data: FloatArray, width: Int, height: Int, levels: Int, filterType: Int) { // Multi-level inverse DWT - reconstruct from smallest to largest (reverse of encoder) - val size = width // Full tile size (112 for TAV) - val tempRow = FloatArray(size) - val tempCol = FloatArray(size) + val maxSize = kotlin.math.max(width, height) + val tempRow = FloatArray(maxSize) + val tempCol = FloatArray(maxSize) for (level in levels - 1 downTo 0) { - val currentSize = size shr level + val currentWidth = width shr level + val currentHeight = height shr level // Handle edge cases for very small decomposition levels - if (currentSize < 1) continue // Skip invalid sizes - if (currentSize == 1) { - // Level 6: 1x1 - single DC coefficient, no DWT needed but preserve it + if (currentWidth < 1 || currentHeight < 1) continue // Skip invalid sizes + if (currentWidth == 1 && currentHeight == 1) { + // Single DC coefficient, no DWT needed but preserve it continue } @@ -4240,44 +4237,44 @@ class GraphicsJSR223Delegate(private val vm: VM) { // The encoder does ROW transform first, then COLUMN transform // So inverse must do COLUMN inverse first, then ROW inverse - // Column inverse transform first - for (x in 0 until currentSize) { - for (y in 0 until currentSize) { - tempCol[y] = data[y * size + x] + // Column inverse transform first (vertical) + for (x in 0 until currentWidth) { + for (y in 0 until currentHeight) { + tempCol[y] = data[y * width + x] } if (filterType == 0) { - applyDWT53Inverse1D(tempCol, currentSize) + tavApplyDWT53Inverse1D(tempCol, currentHeight) } else { - applyDWT97Inverse1D(tempCol, currentSize) + tavApplyDWT97Inverse1D(tempCol, currentHeight) } - for (y in 0 until currentSize) { - data[y * size + x] = tempCol[y] + for (y in 0 until currentHeight) { + data[y * width + x] = tempCol[y] } } - // Row inverse transform second - for (y in 0 until currentSize) { - for (x in 0 until currentSize) { - tempRow[x] = data[y * size + x] + // Row inverse transform second (horizontal) + for (y in 0 until currentHeight) { + for (x in 0 until currentWidth) { + tempRow[x] = data[y * width + x] } if (filterType == 0) { - applyDWT53Inverse1D(tempRow, currentSize) + tavApplyDWT53Inverse1D(tempRow, currentWidth) } else { - applyDWT97Inverse1D(tempRow, currentSize) + tavApplyDWT97Inverse1D(tempRow, currentWidth) } - for (x in 0 until currentSize) { - data[y * size + x] = tempRow[x] + for (x in 0 until currentWidth) { + data[y * width + x] = tempRow[x] } } } } // 1D lifting scheme implementations for 9/7 irreversible filter - private fun applyDWT97Inverse1D(data: FloatArray, length: Int) { + private fun tavApplyDWT97Inverse1D(data: FloatArray, length: Int) { if (length < 2) return val temp = FloatArray(length) @@ -4363,7 +4360,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } - private fun applyDWT53Inverse1D(data: FloatArray, length: Int) { + private fun tavApplyDWT53Inverse1D(data: FloatArray, length: Int) { if (length < 2) return val temp = FloatArray(length) diff --git a/video_encoder/encoder_tav.c b/video_encoder/encoder_tav.c index e4b993d..17c7799 100644 --- a/video_encoder/encoder_tav.c +++ b/video_encoder/encoder_tav.c @@ -22,7 +22,7 @@ // TSVM Advanced Video (TAV) format constants #define TAV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x41\x56" // "\x1FTSVM TAV" -// TAV version - dynamic based on color space mode +// TAV version - dynamic based on colour space mode // Version 1: YCoCg-R (default) // Version 2: ICtCp (--ictcp flag) @@ -40,15 +40,16 @@ #define TAV_PACKET_SYNC 0xFF // Sync packet // DWT settings -#define TILE_SIZE 112 // 112x112 tiles - perfect fit for TSVM 560x448 (GCD = 112) -#define MAX_DECOMP_LEVELS 6 // Can go deeper: 112→56→28→14→7→3→1 -#define DEFAULT_DECOMP_LEVELS 5 // Increased default for better compression +#define TILE_SIZE_X 280 // 280x224 tiles - better compression efficiency +#define TILE_SIZE_Y 224 // Optimized for TSVM 560x448 (2×2 tiles exactly) +#define MAX_DECOMP_LEVELS 6 // Can go deeper: 280→140→70→35→17→8→4, 224→112→56→28→14→7→3 // Simulated overlapping tiles settings for seamless DWT processing #define DWT_FILTER_HALF_SUPPORT 4 // For 9/7 filter (filter lengths 9,7 → L=4) #define TILE_MARGIN_LEVELS 3 // Use margin for 3 levels: 4 * (2^3) = 4 * 8 = 32px #define TILE_MARGIN (DWT_FILTER_HALF_SUPPORT * (1 << TILE_MARGIN_LEVELS)) // 4 * 8 = 32px -#define PADDED_TILE_SIZE (TILE_SIZE + 2 * TILE_MARGIN) // 112 + 64 = 176px +#define PADDED_TILE_SIZE_X (TILE_SIZE_X + 2 * TILE_MARGIN) // 280 + 64 = 344px +#define PADDED_TILE_SIZE_Y (TILE_SIZE_Y + 2 * TILE_MARGIN) // 224 + 64 = 288px // Wavelet filter types #define WAVELET_5_3_REVERSIBLE 0 // Lossless capable @@ -166,7 +167,7 @@ typedef struct { int enable_roi; int verbose; int test_mode; - int ictcp_mode; // 0 = YCoCg-R (default), 1 = ICtCp color space + int ictcp_mode; // 0 = YCoCg-R (default), 1 = ICtCp colour space // Frame buffers uint8_t *current_frame_rgb; @@ -216,7 +217,7 @@ static tav_encoder_t* create_encoder(void); static void cleanup_encoder(tav_encoder_t *enc); static int initialize_encoder(tav_encoder_t *enc); static void rgb_to_ycocg(const uint8_t *rgb, float *y, float *co, float *cg, int width, int height); -static int estimate_motion_112x112(const float *current, const float *reference, +static int estimate_motion_280x224(const float *current, const float *reference, int width, int height, int tile_x, int tile_y, motion_vector_t *mv); @@ -246,7 +247,6 @@ static void show_usage(const char *program_name) { printf(" -q, --quality N Quality level 0-5 (default: 2)\n"); printf(" -Q, --quantizer Y,Co,Cg Quantizer levels 0-100 for each channel\n"); // printf(" -w, --wavelet N Wavelet filter: 0=5/3 reversible, 1=9/7 irreversible (default: 1)\n"); -// printf(" -d, --decomp N Decomposition levels 1-6 (default: %d)\n", DEFAULT_DECOMP_LEVELS); printf(" -b, --bitrate N Target bitrate in kbps (enables bitrate control mode)\n"); printf(" -S, --subtitles FILE SubRip (.srt) or SAMI (.smi) subtitle file\n"); printf(" -v, --verbose Verbose output\n"); @@ -254,7 +254,7 @@ static void show_usage(const char *program_name) { printf(" --lossless Lossless mode: use 5/3 reversible wavelet\n"); // printf(" --enable-progressive Enable progressive transmission\n"); // printf(" --enable-roi Enable region-of-interest coding\n"); - printf(" --ictcp Use ICtCp color space instead of YCoCg-R (generates TAV version 2)\n"); + printf(" --ictcp Use ICtCp colour space instead of YCoCg-R (generates TAV version 2)\n"); printf(" --help Show this help\n\n"); printf("Audio Rate by Quality:\n "); @@ -277,7 +277,7 @@ static void show_usage(const char *program_name) { printf("\n\nFeatures:\n"); printf(" - 112x112 DWT tiles with multi-resolution encoding\n"); - printf(" - Full resolution YCoCg-R/ICtCp color space\n"); + printf(" - Full resolution YCoCg-R/ICtCp colour space\n"); // printf(" - Progressive transmission and ROI coding\n"); // printf(" - Motion compensation with ±16 pixel search range\n"); printf(" - Lossless and lossy compression modes\n"); @@ -301,7 +301,7 @@ static tav_encoder_t* create_encoder(void) { enc->fps = DEFAULT_FPS; enc->quality_level = DEFAULT_QUALITY; enc->wavelet_filter = WAVELET_9_7_IRREVERSIBLE; - enc->decomp_levels = DEFAULT_DECOMP_LEVELS; + enc->decomp_levels = MAX_DECOMP_LEVELS; enc->quantizer_y = QUALITY_Y[DEFAULT_QUALITY]; enc->quantizer_co = QUALITY_CO[DEFAULT_QUALITY]; enc->quantizer_cg = QUALITY_CG[DEFAULT_QUALITY]; @@ -314,8 +314,8 @@ static int initialize_encoder(tav_encoder_t *enc) { if (!enc) return -1; // Calculate tile dimensions - enc->tiles_x = (enc->width + TILE_SIZE - 1) / TILE_SIZE; - enc->tiles_y = (enc->height + TILE_SIZE - 1) / TILE_SIZE; + enc->tiles_x = (enc->width + TILE_SIZE_X - 1) / TILE_SIZE_X; + enc->tiles_y = (enc->height + TILE_SIZE_Y - 1) / TILE_SIZE_Y; int num_tiles = enc->tiles_x * enc->tiles_y; // Allocate frame buffers @@ -338,8 +338,8 @@ static int initialize_encoder(tav_encoder_t *enc) { enc->compressed_buffer_size = ZSTD_compressBound(1024 * 1024); // 1MB max enc->compressed_buffer = malloc(enc->compressed_buffer_size); - // OPTIMIZATION: Allocate reusable quantization buffers for padded tiles (176x176) - const int padded_coeff_count = PADDED_TILE_SIZE * PADDED_TILE_SIZE; + // OPTIMIZATION: Allocate reusable quantization buffers for padded tiles (344x288) + const int padded_coeff_count = PADDED_TILE_SIZE_X * PADDED_TILE_SIZE_Y; enc->reusable_quantized_y = malloc(padded_coeff_count * sizeof(int16_t)); enc->reusable_quantized_co = malloc(padded_coeff_count * sizeof(int16_t)); enc->reusable_quantized_cg = malloc(padded_coeff_count * sizeof(int16_t)); @@ -459,11 +459,11 @@ static void dwt_97_forward_1d(float *data, int length) { // Extract padded tile with margins for seamless DWT processing (correct implementation) static void extract_padded_tile(tav_encoder_t *enc, int tile_x, int tile_y, float *padded_y, float *padded_co, float *padded_cg) { - const int core_start_x = tile_x * TILE_SIZE; - const int core_start_y = tile_y * TILE_SIZE; + const int core_start_x = tile_x * TILE_SIZE_X; + const int core_start_y = tile_y * TILE_SIZE_Y; // OPTIMIZATION: Process row by row with bulk copying for core region - for (int py = 0; py < PADDED_TILE_SIZE; py++) { + for (int py = 0; py < PADDED_TILE_SIZE_Y; py++) { // Map padded row to source image row int src_y = core_start_y + py - TILE_MARGIN; @@ -473,30 +473,30 @@ static void extract_padded_tile(tav_encoder_t *enc, int tile_x, int tile_y, src_y = CLAMP(src_y, 0, enc->height - 1); // Calculate source and destination row offsets - const int padded_row_offset = py * PADDED_TILE_SIZE; + const int padded_row_offset = py * PADDED_TILE_SIZE_X; const int src_row_offset = src_y * enc->width; // Check if we can do bulk copying for the core region int core_start_px = TILE_MARGIN; - int core_end_px = TILE_MARGIN + TILE_SIZE; + int core_end_px = TILE_MARGIN + TILE_SIZE_X; // Check if core region is entirely within frame bounds int core_src_start_x = core_start_x; - int core_src_end_x = core_start_x + TILE_SIZE; + int core_src_end_x = core_start_x + TILE_SIZE_X; if (core_src_start_x >= 0 && core_src_end_x <= enc->width) { - // OPTIMIZATION: Bulk copy core region (112 pixels) in one operation + // OPTIMIZATION: Bulk copy core region (280 pixels) in one operation const int src_core_offset = src_row_offset + core_src_start_x; memcpy(&padded_y[padded_row_offset + core_start_px], &enc->current_frame_y[src_core_offset], - TILE_SIZE * sizeof(float)); + TILE_SIZE_X * sizeof(float)); memcpy(&padded_co[padded_row_offset + core_start_px], &enc->current_frame_co[src_core_offset], - TILE_SIZE * sizeof(float)); + TILE_SIZE_X * sizeof(float)); memcpy(&padded_cg[padded_row_offset + core_start_px], &enc->current_frame_cg[src_core_offset], - TILE_SIZE * sizeof(float)); + TILE_SIZE_X * sizeof(float)); // Handle margin pixels individually (left and right margins) for (int px = 0; px < core_start_px; px++) { @@ -512,7 +512,7 @@ static void extract_padded_tile(tav_encoder_t *enc, int tile_x, int tile_y, padded_cg[padded_idx] = enc->current_frame_cg[src_idx]; } - for (int px = core_end_px; px < PADDED_TILE_SIZE; px++) { + for (int px = core_end_px; px < PADDED_TILE_SIZE_X; px++) { int src_x = core_start_x + px - TILE_MARGIN; if (src_x >= enc->width) src_x = enc->width - 1 - (src_x - enc->width); src_x = CLAMP(src_x, 0, enc->width - 1); @@ -526,7 +526,7 @@ static void extract_padded_tile(tav_encoder_t *enc, int tile_x, int tile_y, } } else { // Fallback: process entire row pixel by pixel (for edge tiles) - for (int px = 0; px < PADDED_TILE_SIZE; px++) { + for (int px = 0; px < PADDED_TILE_SIZE_X; px++) { int src_x = core_start_x + px - TILE_MARGIN; // Handle horizontal boundary conditions with mirroring @@ -546,47 +546,50 @@ static void extract_padded_tile(tav_encoder_t *enc, int tile_x, int tile_y, } -// 2D DWT forward transform for padded tile +// 2D DWT forward transform for rectangular padded tile (344x288) static void dwt_2d_forward_padded(float *tile_data, int levels, int filter_type) { - const int size = PADDED_TILE_SIZE; - float *temp_row = malloc(size * sizeof(float)); - float *temp_col = malloc(size * sizeof(float)); + const int width = PADDED_TILE_SIZE_X; // 344 + const int height = PADDED_TILE_SIZE_Y; // 288 + const int max_size = (width > height) ? width : height; + float *temp_row = malloc(max_size * sizeof(float)); + float *temp_col = malloc(max_size * sizeof(float)); for (int level = 0; level < levels; level++) { - int current_size = size >> level; - if (current_size < 1) break; + int current_width = width >> level; + int current_height = height >> level; + if (current_width < 1 || current_height < 1) break; - // Row transform - for (int y = 0; y < current_size; y++) { - for (int x = 0; x < current_size; x++) { - temp_row[x] = tile_data[y * size + x]; + // Row transform (horizontal) + for (int y = 0; y < current_height; y++) { + for (int x = 0; x < current_width; x++) { + temp_row[x] = tile_data[y * width + x]; } if (filter_type == WAVELET_5_3_REVERSIBLE) { - dwt_53_forward_1d(temp_row, current_size); + dwt_53_forward_1d(temp_row, current_width); } else { - dwt_97_forward_1d(temp_row, current_size); + dwt_97_forward_1d(temp_row, current_width); } - for (int x = 0; x < current_size; x++) { - tile_data[y * size + x] = temp_row[x]; + for (int x = 0; x < current_width; x++) { + tile_data[y * width + x] = temp_row[x]; } } - // Column transform - for (int x = 0; x < current_size; x++) { - for (int y = 0; y < current_size; y++) { - temp_col[y] = tile_data[y * size + x]; + // Column transform (vertical) + for (int x = 0; x < current_width; x++) { + for (int y = 0; y < current_height; y++) { + temp_col[y] = tile_data[y * width + x]; } if (filter_type == WAVELET_5_3_REVERSIBLE) { - dwt_53_forward_1d(temp_col, current_size); + dwt_53_forward_1d(temp_col, current_height); } else { - dwt_97_forward_1d(temp_col, current_size); + dwt_97_forward_1d(temp_col, current_height); } - for (int y = 0; y < current_size; y++) { - tile_data[y * size + x] = temp_col[y]; + for (int y = 0; y < current_height; y++) { + tile_data[y * width + x] = temp_col[y]; } } } @@ -626,8 +629,8 @@ static size_t serialize_tile_data(tav_encoder_t *enc, int tile_x, int tile_y, return offset; } - // Quantize and serialize DWT coefficients (full padded tile: 176x176) - const int tile_size = PADDED_TILE_SIZE * PADDED_TILE_SIZE; + // Quantize and serialize DWT coefficients (full padded tile: 344x288) + const int tile_size = PADDED_TILE_SIZE_X * PADDED_TILE_SIZE_Y; // OPTIMIZATION: Use pre-allocated buffers instead of malloc/free per tile int16_t *quantized_y = enc->reusable_quantized_y; int16_t *quantized_co = enc->reusable_quantized_co; @@ -669,8 +672,8 @@ static size_t serialize_tile_data(tav_encoder_t *enc, int tile_x, int tile_y, // Compress and write frame data static size_t compress_and_write_frame(tav_encoder_t *enc, uint8_t packet_type) { - // Calculate total uncompressed size (for padded tile coefficients: 176x176) - const size_t max_tile_size = 9 + (PADDED_TILE_SIZE * PADDED_TILE_SIZE * 3 * sizeof(int16_t)); // header + 3 channels of coefficients + // Calculate total uncompressed size (for padded tile coefficients: 344x288) + const size_t max_tile_size = 9 + (PADDED_TILE_SIZE_X * PADDED_TILE_SIZE_Y * 3 * sizeof(int16_t)); // header + 3 channels of coefficients const size_t total_uncompressed_size = enc->tiles_x * enc->tiles_y * max_tile_size; // Allocate buffer for uncompressed tile data @@ -685,12 +688,12 @@ static size_t compress_and_write_frame(tav_encoder_t *enc, uint8_t packet_type) // Determine tile mode (simplified) uint8_t mode = TAV_MODE_INTRA; // For now, all tiles are INTRA - // Extract padded tile data (176x176) with neighbor context for overlapping tiles - float tile_y_data[PADDED_TILE_SIZE * PADDED_TILE_SIZE]; - float tile_co_data[PADDED_TILE_SIZE * PADDED_TILE_SIZE]; - float tile_cg_data[PADDED_TILE_SIZE * PADDED_TILE_SIZE]; + // Extract padded tile data (344x288) with neighbour context for overlapping tiles + float tile_y_data[PADDED_TILE_SIZE_X * PADDED_TILE_SIZE_Y]; + float tile_co_data[PADDED_TILE_SIZE_X * PADDED_TILE_SIZE_Y]; + float tile_cg_data[PADDED_TILE_SIZE_X * PADDED_TILE_SIZE_Y]; - // Extract padded tiles using context from neighbors + // Extract padded tiles using context from neighbours extract_padded_tile(enc, tile_x, tile_y, tile_y_data, tile_co_data, tile_cg_data); // Debug: check input data before DWT @@ -742,13 +745,14 @@ static size_t compress_and_write_frame(tav_encoder_t *enc, uint8_t packet_type) } // Motion estimation for 112x112 tiles using SAD -static int estimate_motion_112x112(const float *current, const float *reference, +static int estimate_motion_280x224(const float *current, const float *reference, int width, int height, int tile_x, int tile_y, motion_vector_t *mv) { - const int tile_size = TILE_SIZE; - const int search_range = 28; // ±28 pixels (increased proportionally: 16 * 112/64 = 28) - const int start_x = tile_x * tile_size; - const int start_y = tile_y * tile_size; + const int tile_size_x = TILE_SIZE_X; + const int tile_size_y = TILE_SIZE_Y; + const int search_range = 32; // ±32 pixels (scaled for larger tiles) + const int start_x = tile_x * tile_size_x; + const int start_y = tile_y * tile_size_y; int best_mv_x = 0, best_mv_y = 0; int min_sad = INT_MAX; @@ -761,14 +765,14 @@ static int estimate_motion_112x112(const float *current, const float *reference, // Check bounds if (ref_x < 0 || ref_y < 0 || - ref_x + tile_size > width || ref_y + tile_size > height) { + ref_x + tile_size_x > width || ref_y + tile_size_y > height) { continue; } // Calculate SAD int sad = 0; - for (int y = 0; y < tile_size; y++) { - for (int x = 0; x < tile_size; x++) { + for (int y = 0; y < tile_size_y; y++) { + for (int x = 0; x < tile_size_x; x++) { int curr_idx = (start_y + y) * width + (start_x + x); int ref_idx = (ref_y + y) * width + (ref_x + x); @@ -795,7 +799,7 @@ static int estimate_motion_112x112(const float *current, const float *reference, return min_sad; } -// RGB to YCoCg color space conversion +// RGB to YCoCg colour space conversion static void rgb_to_ycocg(const uint8_t *rgb, float *y, float *co, float *cg, int width, int height) { const int total_pixels = width * height; @@ -815,7 +819,7 @@ static void rgb_to_ycocg(const uint8_t *rgb, float *y, float *co, float *cg, int const float g = rgb_ptr[j * 3 + 1]; const float b = rgb_ptr[j * 3 + 2]; - // YCoCg-R transform (optimized with fewer temporary variables) + // YCoCg-R transform (optimised with fewer temporary variables) co[idx] = r - b; const float tmp = b + co[idx] * 0.5f; cg[idx] = g - tmp; @@ -963,16 +967,16 @@ void ictcp_hlg_to_srgb8(double I8, double Ct8, double Cp8, *b8 = (uint8_t)iround(FCLAMP(b * 255.0, 0.0, 255.0)); } -// ---------------------- Color Space Switching Functions ---------------------- +// ---------------------- Colour Space Switching Functions ---------------------- // Wrapper functions that choose between YCoCg-R and ICtCp based on encoder mode -static void rgb_to_color_space(tav_encoder_t *enc, uint8_t r, uint8_t g, uint8_t b, +static void rgb_to_colour_space(tav_encoder_t *enc, uint8_t r, uint8_t g, uint8_t b, double *c1, double *c2, double *c3) { if (enc->ictcp_mode) { - // Use ICtCp color space + // Use ICtCp colour space srgb8_to_ictcp_hlg(r, g, b, c1, c2, c3); } else { - // Use YCoCg-R color space (convert from existing function) + // Use YCoCg-R colour space (convert from existing function) float rf = r, gf = g, bf = b; float co = rf - bf; float tmp = bf + co / 2; @@ -984,13 +988,13 @@ static void rgb_to_color_space(tav_encoder_t *enc, uint8_t r, uint8_t g, uint8_t } } -static void color_space_to_rgb(tav_encoder_t *enc, double c1, double c2, double c3, +static void colour_space_to_rgb(tav_encoder_t *enc, double c1, double c2, double c3, uint8_t *r, uint8_t *g, uint8_t *b) { if (enc->ictcp_mode) { - // Use ICtCp color space + // Use ICtCp colour space ictcp_hlg_to_srgb8(c1, c2, c3, r, g, b); } else { - // Use YCoCg-R color space (inverse of rgb_to_ycocg) + // Use YCoCg-R colour space (inverse of rgb_to_ycocg) float y = (float)c1; float co = (float)c2; float cg = (float)c3; @@ -1004,8 +1008,8 @@ static void color_space_to_rgb(tav_encoder_t *enc, double c1, double c2, double } } -// RGB to color space conversion for full frames -static void rgb_to_color_space_frame(tav_encoder_t *enc, const uint8_t *rgb, +// RGB to colour space conversion for full frames +static void rgb_to_colour_space_frame(tav_encoder_t *enc, const uint8_t *rgb, float *c1, float *c2, float *c3, int width, int height) { if (enc->ictcp_mode) { // ICtCp mode @@ -1029,7 +1033,7 @@ static int write_tav_header(tav_encoder_t *enc) { // Magic number fwrite(TAV_MAGIC, 1, 8, enc->output_fp); - // Version (dynamic based on color space) + // Version (dynamic based on colour space) uint8_t version = enc->ictcp_mode ? 2 : 1; // Version 2 for ICtCp, 1 for YCoCg-R fputc(version, enc->output_fp); @@ -1731,7 +1735,7 @@ int main(int argc, char *argv[]) { printf("Wavelet: %s\n", enc->wavelet_filter ? "9/7 irreversible" : "5/3 reversible"); printf("Decomposition levels: %d\n", enc->decomp_levels); printf("Quality: Y=%d, Co=%d, Cg=%d\n", enc->quantizer_y, enc->quantizer_co, enc->quantizer_cg); - printf("Color space: %s\n", enc->ictcp_mode ? "ICtCp" : "YCoCg-R"); + printf("Colour space: %s\n", enc->ictcp_mode ? "ICtCp" : "YCoCg-R"); // Open output file if (strcmp(enc->output_file, "-") == 0) { @@ -1747,7 +1751,7 @@ int main(int argc, char *argv[]) { // Start FFmpeg process for video input (using TEV-compatible filtergraphs) if (enc->test_mode) { - // Test mode - generate solid color frames + // Test mode - generate solid colour frames enc->total_frames = 15; // Fixed 15 test frames like TEV printf("Test mode: Generating %d solid colour frames\n", enc->total_frames); } else { @@ -1877,8 +1881,8 @@ int main(int argc, char *argv[]) { printf("\n"); }*/ - // Convert RGB to color space (YCoCg-R or ICtCp) - rgb_to_color_space_frame(enc, enc->current_frame_rgb, + // Convert RGB to colour space (YCoCg-R or ICtCp) + rgb_to_colour_space_frame(enc, enc->current_frame_rgb, enc->current_frame_y, enc->current_frame_co, enc->current_frame_cg, enc->width, enc->height); @@ -1899,7 +1903,7 @@ int main(int argc, char *argv[]) { int tile_y = tile_idx / enc->tiles_x; if (!is_keyframe && frame_count > 0) { - estimate_motion_112x112(enc->current_frame_y, enc->previous_frame_y, + estimate_motion_280x224(enc->current_frame_y, enc->previous_frame_y, enc->width, enc->height, tile_x, tile_y, &enc->motion_vectors[tile_idx]); } else {