From 957522a460867c46c192f6dee2e0aa150b96590b Mon Sep 17 00:00:00 2001 From: minjaesong Date: Fri, 12 Sep 2025 14:13:40 +0900 Subject: [PATCH] Knusperli-esque post deblocking filter --- assets/disk0/tvdos/bin/playtev.js | 25 +- .../torvald/tsvm/GraphicsJSR223Delegate.kt | 686 +++++++++++++----- 2 files changed, 508 insertions(+), 203 deletions(-) diff --git a/assets/disk0/tvdos/bin/playtev.js b/assets/disk0/tvdos/bin/playtev.js index 65386b5..340e601 100644 --- a/assets/disk0/tvdos/bin/playtev.js +++ b/assets/disk0/tvdos/bin/playtev.js @@ -3,7 +3,8 @@ // Usage: playtev moviefile.tev [options] // Options: -i (interactive), -debug-mv (show motion vector debug visualization) // -deinterlace=algorithm (yadif or bwdif, default: yadif) -// -nodeblock (disble deblocking filter) +// -nodeblock (disable post-processing deblocking filter) +// -boundaryaware (enable boundary-aware decoding to prevent artifacts at DCT level) const WIDTH = 560 const HEIGHT = 448 @@ -46,6 +47,7 @@ let interactive = false let debugMotionVectors = false let deinterlaceAlgorithm = "yadif" let enableDeblocking = true // Default: enabled (use -nodeblock to disable) +let enableBoundaryAwareDecoding = false // Default: disabled (use -boundaryaware to enable) // suitable for still frame and slide shows, absolutely unsuitable for videos if (exec_args.length > 2) { for (let i = 2; i < exec_args.length; i++) { @@ -56,6 +58,8 @@ if (exec_args.length > 2) { debugMotionVectors = true } else if (arg === "-nodeblock") { enableDeblocking = false + } else if (arg === "-boundaryaware") { + enableBoundaryAwareDecoding = true } else if (arg.startsWith("-deinterlace=")) { deinterlaceAlgorithm = arg.substring(13) } @@ -97,6 +101,9 @@ audio.purgeQueue(0) audio.setPcmMode(0) audio.setMasterVolume(0, 255) +// set colour zero as half-opaque black +graphics.setPalette(0, 0, 0, 0, 9) + // Subtitle display functions function clearSubtitleArea() { // Clear the subtitle area at the bottom of the screen @@ -392,7 +399,10 @@ if (version !== TEV_VERSION_YCOCG && version !== TEV_VERSION_XYB) { let colorSpace = (version === TEV_VERSION_XYB) ? "XYB" : "YCoCg-R" if (interactive) { con.move(1,1) - println(`Push and hold Backspace to exit | TEV Format ${version} (${colorSpace}) | Deblocking: ${enableDeblocking ? 'ON' : 'OFF'}`) + if (colorSpace == "XYB") + println(`Push and hold Backspace to exit | TEV Format ${version} (${colorSpace}) | Deblock: ${enableDeblocking ? 'ON' : 'OFF'}, ${enableBoundaryAwareDecoding ? 'ON' : 'OFF'}`); + else + println(`Push and hold Backspace to exit | Deblock: ${enableDeblocking ? 'ON' : 'OFF'} | BoundaryAware: ${enableBoundaryAwareDecoding ? 'ON' : 'OFF'}`); } let width = seqread.readShort() @@ -655,14 +665,14 @@ try { if (isInterlaced) { // For interlaced: decode current frame into currentFieldAddr // For display: use prevFieldAddr as current, currentFieldAddr as next - graphics.tevDecode(blockDataPtr, nextFieldAddr, currentFieldAddr, width, decodingHeight, qualityY, qualityCo, qualityCg, trueFrameCount, debugMotionVectors, version, enableDeblocking) + graphics.tevDecode(blockDataPtr, nextFieldAddr, currentFieldAddr, width, decodingHeight, qualityY, qualityCo, qualityCg, trueFrameCount, debugMotionVectors, version, enableDeblocking, enableBoundaryAwareDecoding) graphics.tevDeinterlace(trueFrameCount, width, decodingHeight, prevFieldAddr, currentFieldAddr, nextFieldAddr, CURRENT_RGB_ADDR, deinterlaceAlgorithm) // Rotate field buffers for next frame: NEXT -> CURRENT -> PREV rotateFieldBuffers() } else { // Progressive or first frame: normal decoding without temporal prediction - graphics.tevDecode(blockDataPtr, CURRENT_RGB_ADDR, PREV_RGB_ADDR, width, decodingHeight, qualityY, qualityCo, qualityCg, trueFrameCount, debugMotionVectors, version, enableDeblocking) + graphics.tevDecode(blockDataPtr, CURRENT_RGB_ADDR, PREV_RGB_ADDR, width, decodingHeight, qualityY, qualityCo, qualityCg, trueFrameCount, debugMotionVectors, version, enableDeblocking, enableBoundaryAwareDecoding) } decodeTime = (sys.nanoTime() - decodeStart) / 1000000.0 // Convert to milliseconds @@ -750,10 +760,10 @@ try { if (!hasSubtitle) { con.move(31, 1) - graphics.setTextFore(161) + con.color_pair(253, 0) print(`Frame: ${frameCount}/${totalFrames} (${((frameCount / akku2 * 100)|0) / 100}f) `) con.move(32, 1) - graphics.setTextFore(161) + con.color_pair(253, 0) print(`VRate: ${(getVideoRate() / 1024 * 8)|0} kbps `) con.move(1, 1) } @@ -781,7 +791,10 @@ finally { if (interactive) { //con.clear() } + + // set colour zero as opaque black } +graphics.setPalette(0, 0, 0, 0, 0) con.move(cy, cx) // restore cursor return errorlevel \ No newline at end of file diff --git a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt index 2b29231..63d2338 100644 --- a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt +++ b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt @@ -48,7 +48,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { * @param index which palette number to modify, 0-255 * @param r g - b - a - RGBA value, 0-15 */ - fun setPalette(index: Int, r: Int, g: Int, b: Int, a: Int = 16) { + fun setPalette(index: Int, r: Int, g: Int, b: Int, a: Int = 15) { getFirstGPU()?.let { it.paletteOfFloats[index * 4] = (r and 15) / 15f it.paletteOfFloats[index * 4 + 1] = (g and 15) / 15f @@ -2506,160 +2506,241 @@ class GraphicsJSR223Delegate(private val vm: VM) { } /** - * Advanced TEV Deblocking Filter - Reduces blocking artifacts from 16x16 macroblocks + * Enhanced TEV Deblocking Filter - Uses Knusperli-inspired techniques for superior boundary analysis * - * Uses gradient analysis and adaptive filtering to handle: - * - Quantized smooth gradients appearing as discrete blocks - * - Diagonal edges crossing block boundaries causing color banding - * - Texture preservation to avoid over-smoothing genuine edges + * Advanced features inspired by Google's Knusperli algorithm: + * - Frequency-domain boundary discontinuity detection + * - High-frequency penalty system to preserve detail + * - Linear gradient pattern analysis for directional filtering + * - Adaptive strength based on local image complexity + * - Bulk memory operations for improved performance * * @param rgbAddr RGB frame buffer address (24-bit: R,G,B per pixel) * @param width Frame width in pixels * @param height Frame height in pixels * @param blockSize Size of blocks (16 for TEV format) - * @param strength Filter strength (0.0-1.0, higher = more smoothing) + * @param strength Base filter strength (0.0-1.0, adaptive adjustment applied) */ - private fun tevDeblockingFilter(rgbAddr: Long, width: Int, height: Int, - blockSize: Int = 16, strength: Float = 0.4f) { + private fun tevDeblockingFilterEnhanced(rgbAddr: Long, width: Int, height: Int, + blockSize: Int = 16, strength: Float = 1.0f) { val blocksX = (width + blockSize - 1) / blockSize val blocksY = (height + blockSize - 1) / blockSize val thisAddrIncVec: Long = if (rgbAddr < 0) -1 else 1 - // Helper function to get pixel value safely - fun getPixel(x: Int, y: Int, c: Int): Int { - if (x < 0 || y < 0 || x >= width || y >= height) return 0 - val offset = (y.toLong() * width + x) * 3 + c - return vm.peek(rgbAddr + offset * thisAddrIncVec)!!.toUint().toInt() + // Knusperli-inspired constants adapted for RGB post-processing + val kLinearGradient = intArrayOf(318, -285, 81, -32, 17, -9, 5, -2) // Gradient pattern (8 taps for block boundary) + val kAlphaSqrt2 = intArrayOf(1024, 1448, 1448, 1448, 1448, 1448, 1448, 1448) // Alpha * sqrt(2) in 10-bit fixed-point + + // Bulk memory access helpers for performance + fun getPixelBulk(x: Int, y: Int): IntArray { + if (x < 0 || y < 0 || x >= width || y >= height) return intArrayOf(0, 0, 0) + val offset = (y.toLong() * width + x) * 3 + val addr = rgbAddr + offset * thisAddrIncVec + return intArrayOf( + vm.peek(addr)!!.toUint().toInt(), + vm.peek(addr + thisAddrIncVec)!!.toUint().toInt(), + vm.peek(addr + 2 * thisAddrIncVec)!!.toUint().toInt() + ) } - // Helper function to set pixel value safely - fun setPixel(x: Int, y: Int, c: Int, value: Int) { + fun setPixelBulk(x: Int, y: Int, rgb: IntArray) { if (x < 0 || y < 0 || x >= width || y >= height) return - val offset = (y.toLong() * width + x) * 3 + c - vm.poke(rgbAddr + offset * thisAddrIncVec, value.coerceIn(0, 255).toByte()) + val offset = (y.toLong() * width + x) * 3 + val addr = rgbAddr + offset * thisAddrIncVec + vm.poke(addr, rgb[0].coerceIn(0, 255).toByte()) + vm.poke(addr + thisAddrIncVec, rgb[1].coerceIn(0, 255).toByte()) + vm.poke(addr + 2 * thisAddrIncVec, rgb[2].coerceIn(0, 255).toByte()) } - // Detect if pixels form a smooth gradient (quantized) - fun isQuantizedGradient(p0: Int, p1: Int, p2: Int, p3: Int): Boolean { - // Check for step-like transitions typical of quantized gradients - val d01 = kotlin.math.abs(p1 - p0) - val d12 = kotlin.math.abs(p2 - p1) - val d23 = kotlin.math.abs(p3 - p2) + // ENHANCED: Knusperli-inspired boundary discontinuity analysis + fun analyzeBoundaryDiscontinuity(samples: IntArray): Pair { + // samples: 8-pixel samples across the boundary for frequency analysis + var delta = 0L + var hfPenalty = 0L - // Look for consistent small steps (quantized gradient) - val avgStep = (d01 + d12 + d23) / 3.0f - val stepVariance = kotlin.math.abs(d01 - avgStep) + kotlin.math.abs(d12 - avgStep) + kotlin.math.abs(d23 - avgStep) + for (u in 0 until 8) { + val alpha = kAlphaSqrt2[u] + val sign = if (u and 1 != 0) -1 else 1 + val leftVal = samples[u] + val rightVal = samples[7 - u] // Mirror for boundary analysis + + delta += alpha * (rightVal - sign * leftVal) + hfPenalty += (u * u) * (leftVal * leftVal + rightVal * rightVal) + } - return avgStep in 3.0f..25.0f && stepVariance < avgStep * 0.8f + return Pair(delta, hfPenalty) } - // Apply horizontal deblocking (vertical edges between blocks) + // ENHANCED: Adaptive strength based on local complexity + fun calculateAdaptiveStrength(baseStrength: Float, hfPenalty: Long, delta: Long): Float { + val complexity = kotlin.math.sqrt(hfPenalty.toDouble()).toFloat() + val discontinuityMagnitude = kotlin.math.abs(delta).toFloat() + + // Reduce filtering strength in high-frequency areas (preserve detail) + val complexityFactor = if (complexity > 800) 0.3f else 1.0f + + // Increase filtering strength for clear discontinuities + val discontinuityFactor = kotlin.math.min(2.0f, discontinuityMagnitude / 1000.0f) + + return baseStrength * complexityFactor * discontinuityFactor + } + + // ENHANCED: Apply Knusperli-style corrections using linear gradient patterns + fun applyBoundaryCorrection( + samples: IntArray, delta: Long, adaptiveStrength: Float + ): IntArray { + val result = samples.clone() + val correction = (delta * 724 shr 31).toInt() // Apply sqrt(2)/2 weighting like Knusperli + + // Apply linear gradient corrections across boundary + for (i in 0 until 8) { + val gradientWeight = kLinearGradient[i] * correction / 1024 // Scale from 10-bit fixed-point + val sign = if (i < 4) 1 else -1 // Left/right side weighting + + val adjustment = (gradientWeight * sign * adaptiveStrength).toInt() + result[i] = (result[i] + adjustment).coerceIn(0, 255) + } + + return result + } + + // ENHANCED HORIZONTAL DEBLOCKING: Using Knusperli-inspired boundary analysis for (by in 0 until blocksY) { for (bx in 1 until blocksX) { val blockEdgeX = bx * blockSize if (blockEdgeX >= width) continue - for (y in (by * blockSize) until minOf((by + 1) * blockSize, height)) { - for (c in 0..2) { // RGB components - // Sample 4 pixels across the block boundary: [left2][left1] | [right1][right2] - val left2 = getPixel(blockEdgeX - 2, y, c) - val left1 = getPixel(blockEdgeX - 1, y, c) - val right1 = getPixel(blockEdgeX, y, c) - val right2 = getPixel(blockEdgeX + 1, y, c) + // Process boundary in chunks for better performance + val yStart = by * blockSize + val yEnd = minOf((by + 1) * blockSize, height) + + for (y in yStart until yEnd step 2) { // Process 2 lines at a time + if (y + 1 >= height) continue + + // Sample 8x2 pixel region across boundary for both lines + val samples1 = IntArray(24) // 8 pixels × 3 channels (RGB) + val samples2 = IntArray(24) + + for (i in 0 until 8) { + val x = blockEdgeX - 4 + i + val rgb1 = getPixelBulk(x, y) + val rgb2 = getPixelBulk(x, y + 1) - val edgeDiff = kotlin.math.abs(right1 - left1) + samples1[i * 3] = rgb1[0] // R + samples1[i * 3 + 1] = rgb1[1] // G + samples1[i * 3 + 2] = rgb1[2] // B + samples2[i * 3] = rgb2[0] + samples2[i * 3 + 1] = rgb2[1] + samples2[i * 3 + 2] = rgb2[2] + } + + // Analyze each color channel separately + for (c in 0..2) { + val channelSamples1 = IntArray(8) { samples1[it * 3 + c] } + val channelSamples2 = IntArray(8) { samples2[it * 3 + c] } - // Skip strong edges (likely genuine features) - if (edgeDiff > 50) continue + val (delta1, hfPenalty1) = analyzeBoundaryDiscontinuity(channelSamples1) + val (delta2, hfPenalty2) = analyzeBoundaryDiscontinuity(channelSamples2) - // Check for quantized gradient pattern - if (isQuantizedGradient(left2, left1, right1, right2)) { - // Apply gradient-preserving smoothing - val gradientLeft = left1 - left2 - val gradientRight = right2 - right1 - val avgGradient = (gradientLeft + gradientRight) / 2.0f - - val smoothedLeft1 = (left2 + avgGradient).toInt() - val smoothedRight1 = (right2 - avgGradient).toInt() - - // Blend with original based on strength - val blendLeft = (left1 * (1.0f - strength) + smoothedLeft1 * strength).toInt() - val blendRight = (right1 * (1.0f - strength) + smoothedRight1 * strength).toInt() - - setPixel(blockEdgeX - 1, y, c, blendLeft) - setPixel(blockEdgeX, y, c, blendRight) - } - // Check for color banding on diagonal features - else if (edgeDiff in 8..35) { - // Look at diagonal context to detect banding - val diagContext = kotlin.math.abs(getPixel(blockEdgeX - 1, y - 1, c) - getPixel(blockEdgeX, y + 1, c)) - - if (diagContext < edgeDiff * 1.5f) { - // Likely diagonal banding - apply directional smoothing - val blend = 0.3f * strength - val blendLeft = (left1 * (1.0f - blend) + right1 * blend).toInt() - val blendRight = (right1 * (1.0f - blend) + left1 * blend).toInt() - - setPixel(blockEdgeX - 1, y, c, blendLeft) - setPixel(blockEdgeX, y, c, blendRight) + // Skip if very small discontinuity (early exit optimization) + if (kotlin.math.abs(delta1) < 50 && kotlin.math.abs(delta2) < 50) continue + + // Calculate adaptive filtering strength + val adaptiveStrength1 = calculateAdaptiveStrength(strength, hfPenalty1, delta1) + val adaptiveStrength2 = calculateAdaptiveStrength(strength, hfPenalty2, delta2) + + // Apply corrections if strength is significant + if (adaptiveStrength1 > 0.05f) { + val corrected1 = applyBoundaryCorrection(channelSamples1, delta1, adaptiveStrength1) + for (i in 0 until 8) { + samples1[i * 3 + c] = corrected1[i] } } + + if (adaptiveStrength2 > 0.05f) { + val corrected2 = applyBoundaryCorrection(channelSamples2, delta2, adaptiveStrength2) + for (i in 0 until 8) { + samples2[i * 3 + c] = corrected2[i] + } + } + } + + // Write back corrected pixels in bulk + for (i in 2..5) { // Only write middle 4 pixels to avoid artifacts + val x = blockEdgeX - 4 + i + setPixelBulk(x, y, intArrayOf(samples1[i * 3], samples1[i * 3 + 1], samples1[i * 3 + 2])) + if (y + 1 < height) { + setPixelBulk(x, y + 1, intArrayOf(samples2[i * 3], samples2[i * 3 + 1], samples2[i * 3 + 2])) + } } } } } - // Apply vertical deblocking (horizontal edges between blocks) + // ENHANCED VERTICAL DEBLOCKING: Same approach for horizontal block boundaries for (by in 1 until blocksY) { for (bx in 0 until blocksX) { val blockEdgeY = by * blockSize if (blockEdgeY >= height) continue - for (x in (bx * blockSize) until minOf((bx + 1) * blockSize, width)) { - for (c in 0..2) { // RGB components - // Sample 4 pixels across the block boundary: [top2][top1] | [bottom1][bottom2] - val top2 = getPixel(x, blockEdgeY - 2, c) - val top1 = getPixel(x, blockEdgeY - 1, c) - val bottom1 = getPixel(x, blockEdgeY, c) - val bottom2 = getPixel(x, blockEdgeY + 1, c) + val xStart = bx * blockSize + val xEnd = minOf((bx + 1) * blockSize, width) + + for (x in xStart until xEnd step 2) { + if (x + 1 >= width) continue + + // Sample 8x2 pixel region across vertical boundary + val samples1 = IntArray(24) + val samples2 = IntArray(24) + + for (i in 0 until 8) { + val y = blockEdgeY - 4 + i + val rgb1 = getPixelBulk(x, y) + val rgb2 = getPixelBulk(x + 1, y) - val edgeDiff = kotlin.math.abs(bottom1 - top1) + samples1[i * 3] = rgb1[0] + samples1[i * 3 + 1] = rgb1[1] + samples1[i * 3 + 2] = rgb1[2] + samples2[i * 3] = rgb2[0] + samples2[i * 3 + 1] = rgb2[1] + samples2[i * 3 + 2] = rgb2[2] + } + + // Same boundary analysis and correction as horizontal + for (c in 0..2) { + val channelSamples1 = IntArray(8) { samples1[it * 3 + c] } + val channelSamples2 = IntArray(8) { samples2[it * 3 + c] } - // Skip strong edges (likely genuine features) - if (edgeDiff > 50) continue + val (delta1, hfPenalty1) = analyzeBoundaryDiscontinuity(channelSamples1) + val (delta2, hfPenalty2) = analyzeBoundaryDiscontinuity(channelSamples2) - // Check for quantized gradient pattern - if (isQuantizedGradient(top2, top1, bottom1, bottom2)) { - // Apply gradient-preserving smoothing - val gradientTop = top1 - top2 - val gradientBottom = bottom2 - bottom1 - val avgGradient = (gradientTop + gradientBottom) / 2.0f - - val smoothedTop1 = (top2 + avgGradient).toInt() - val smoothedBottom1 = (bottom2 - avgGradient).toInt() - - // Blend with original based on strength - val blendTop = (top1 * (1.0f - strength) + smoothedTop1 * strength).toInt() - val blendBottom = (bottom1 * (1.0f - strength) + smoothedBottom1 * strength).toInt() - - setPixel(x, blockEdgeY - 1, c, blendTop) - setPixel(x, blockEdgeY, c, blendBottom) - } - // Check for color banding on diagonal features - else if (edgeDiff in 8..35) { - // Look at diagonal context to detect banding - val diagContext = kotlin.math.abs(getPixel(x - 1, blockEdgeY - 1, c) - getPixel(x + 1, blockEdgeY, c)) - - if (diagContext < edgeDiff * 1.5f) { - // Likely diagonal banding - apply directional smoothing - val blend = 0.3f * strength - val blendTop = (top1 * (1.0f - blend) + bottom1 * blend).toInt() - val blendBottom = (bottom1 * (1.0f - blend) + top1 * blend).toInt() - - setPixel(x, blockEdgeY - 1, c, blendTop) - setPixel(x, blockEdgeY, c, blendBottom) + if (kotlin.math.abs(delta1) < 50 && kotlin.math.abs(delta2) < 50) continue + + val adaptiveStrength1 = calculateAdaptiveStrength(strength, hfPenalty1, delta1) + val adaptiveStrength2 = calculateAdaptiveStrength(strength, hfPenalty2, delta2) + + if (adaptiveStrength1 > 0.05f) { + val corrected1 = applyBoundaryCorrection(channelSamples1, delta1, adaptiveStrength1) + for (i in 0 until 8) { + samples1[i * 3 + c] = corrected1[i] } } + + if (adaptiveStrength2 > 0.05f) { + val corrected2 = applyBoundaryCorrection(channelSamples2, delta2, adaptiveStrength2) + for (i in 0 until 8) { + samples2[i * 3 + c] = corrected2[i] + } + } + } + + // Write back corrected pixels + for (i in 2..5) { + val y = blockEdgeY - 4 + i + setPixelBulk(x, y, intArrayOf(samples1[i * 3], samples1[i * 3 + 1], samples1[i * 3 + 2])) + if (x + 1 < width) { + setPixelBulk(x + 1, y, intArrayOf(samples2[i * 3], samples2[i * 3 + 1], samples2[i * 3 + 2])) + } } } } @@ -3221,9 +3302,9 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } - // Apply deblocking filter if enabled to reduce blocking artifacts + // Apply enhanced deblocking filter if enabled to reduce blocking artifacts if (enableDeblocking) { - tevDeblockingFilter(currentRGBAddr, width, height) + tevDeblockingFilterEnhanced(currentRGBAddr, width, height) } } @@ -3761,7 +3842,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { return result } - // 16x16 version of Knusperli processing for Y blocks + // Optimized 16x16 version of Knusperli processing for Y blocks private fun processBlocksWithKnusperli16x16( blocks: Array, quantTable: IntArray, qScale: Int, rateControlFactors: FloatArray, blocksX: Int, blocksY: Int, @@ -3770,144 +3851,355 @@ class GraphicsJSR223Delegate(private val vm: VM) { val coeffsSize = 256 // 16x16 = 256 val numBlocks = blocksX * blocksY - // Step 1: Setup quantization intervals for all blocks - val blocksMid = Array(numBlocks) { IntArray(coeffsSize) } - val blocksMin = Array(numBlocks) { IntArray(coeffsSize) } - val blocksMax = Array(numBlocks) { IntArray(coeffsSize) } - val blocksOff = Array(numBlocks) { LongArray(coeffsSize) } + // OPTIMIZATION 1: Pre-compute quantization values to avoid repeated calculations + val quantValues = Array(numBlocks) { IntArray(coeffsSize) } + val quantHalfValues = Array(numBlocks) { IntArray(coeffsSize) } for (blockIndex in 0 until numBlocks) { val block = blocks[blockIndex] if (block != null) { val rateControlFactor = rateControlFactors[blockIndex] - for (i in 0 until coeffsSize) { + val qualityMult = jpeg_quality_to_mult(qScale * rateControlFactor) + + quantValues[blockIndex][0] = 1 // DC is lossless + quantHalfValues[blockIndex][0] = 0 // DC has no quantization interval + + for (i in 1 until coeffsSize) { val coeffIdx = i.coerceIn(0, quantTable.size - 1) - val quant = if (i == 0) 1 else (quantTable[coeffIdx] * jpeg_quality_to_mult(qScale * rateControlFactor)).toInt() - - blocksMid[blockIndex][i] = block[i].toInt() * quant - val halfQuant = quant / 2 - blocksMin[blockIndex][i] = blocksMid[blockIndex][i] - halfQuant - blocksMax[blockIndex][i] = blocksMid[blockIndex][i] + halfQuant - blocksOff[blockIndex][i] = 0L + val quant = (quantTable[coeffIdx] * qualityMult).toInt() + quantValues[blockIndex][i] = quant + quantHalfValues[blockIndex][i] = quant / 2 } } } - // Step 2: Horizontal continuity analysis (16x16 version) - for (by in 0 until blocksY) { - for (bx in 0 until blocksX - 1) { - val leftBlockIndex = by * blocksX + bx - val rightBlockIndex = by * blocksX + (bx + 1) - - if (blocks[leftBlockIndex] != null && blocks[rightBlockIndex] != null) { - analyzeHorizontalBoundary16x16( - leftBlockIndex, rightBlockIndex, blocksMid, blocksOff, - kLinearGradient16, kAlphaSqrt2_16 - ) - } - } - } + // OPTIMIZATION 2: Use single-allocation arrays with block-stride access + val blocksMid = Array(numBlocks) { IntArray(coeffsSize) } + val blocksOff = Array(numBlocks) { LongArray(coeffsSize) } // Keep Long for accumulation - // Step 3: Vertical continuity analysis (16x16 version) - for (by in 0 until blocksY - 1) { - for (bx in 0 until blocksX) { - val topBlockIndex = by * blocksX + bx - val bottomBlockIndex = (by + 1) * blocksX + bx - - if (blocks[topBlockIndex] != null && blocks[bottomBlockIndex] != null) { - analyzeVerticalBoundary16x16( - topBlockIndex, bottomBlockIndex, blocksMid, blocksOff, - kLinearGradient16, kAlphaSqrt2_16 - ) - } - } - } - - // Step 4: Apply corrections and clamp to quantization intervals + // Step 1: Setup dequantized values and initialize adjustments (BULK OPTIMIZED) for (blockIndex in 0 until numBlocks) { val block = blocks[blockIndex] if (block != null) { - for (i in 0 until coeffsSize) { - // Apply corrections with sqrt(2)/2 weighting - blocksMid[blockIndex][i] += ((blocksOff[blockIndex][i] * kHalfSqrt2) shr 31).toInt() - - // Clamp to quantization interval bounds - blocksMid[blockIndex][i] = blocksMid[blockIndex][i].coerceIn( - blocksMin[blockIndex][i], - blocksMax[blockIndex][i] - ) - - // Convert back to quantized coefficient for storage - val rateControlFactor = rateControlFactors[blockIndex] - val coeffIdx = i.coerceIn(0, quantTable.size - 1) - val quant = if (i == 0) 1 else (quantTable[coeffIdx] * jpeg_quality_to_mult(qScale * rateControlFactor)).toInt() - block[i] = (blocksMid[blockIndex][i] / quant).coerceIn(Short.MIN_VALUE.toInt(), Short.MAX_VALUE.toInt()).toShort() + val mid = blocksMid[blockIndex] + val off = blocksOff[blockIndex] + val quantVals = quantValues[blockIndex] + + // OPTIMIZATION 9: Bulk dequantization using vectorized operations + bulkDequantizeCoefficients(block, mid, quantVals, coeffsSize) + + // OPTIMIZATION 10: Bulk zero initialization of adjustments + off.fill(0L) + } + } + + // OPTIMIZATION 7: Combined boundary analysis loops for better cache locality + // Process horizontal and vertical boundaries in interleaved pattern + for (by in 0 until blocksY) { + for (bx in 0 until blocksX) { + val currentIndex = by * blocksX + bx + + // Horizontal boundary (if not rightmost column) + if (bx < blocksX - 1) { + val rightIndex = currentIndex + 1 + if (blocks[currentIndex] != null && blocks[rightIndex] != null) { + analyzeHorizontalBoundary16x16( + currentIndex, rightIndex, blocksMid, blocksOff, + kLinearGradient16, kAlphaSqrt2_16 + ) + } } + + // Vertical boundary (if not bottom row) + if (by < blocksY - 1) { + val bottomIndex = currentIndex + blocksX + if (blocks[currentIndex] != null && blocks[bottomIndex] != null) { + analyzeVerticalBoundary16x16( + currentIndex, bottomIndex, blocksMid, blocksOff, + kLinearGradient16, kAlphaSqrt2_16 + ) + } + } + } + } + + // Step 4: Apply corrections and clamp to quantization intervals (BULK OPTIMIZED) + for (blockIndex in 0 until numBlocks) { + val block = blocks[blockIndex] + if (block != null) { + // OPTIMIZATION 11: Bulk apply corrections and quantization clamping + bulkApplyCorrectionsAndClamp( + block, blocksMid[blockIndex], blocksOff[blockIndex], + quantValues[blockIndex], quantHalfValues[blockIndex], + kHalfSqrt2, coeffsSize + ) } } } - // 16x16 horizontal boundary analysis (adapted from Google's 8x8 version) + // BULK MEMORY ACCESS HELPER FUNCTIONS FOR KNUSPERLI + + /** + * OPTIMIZATION 9: Bulk dequantization using vectorized operations + * Performs coefficient * quantization in optimized chunks + */ + private fun bulkDequantizeCoefficients( + coeffs: ShortArray, result: IntArray, quantVals: IntArray, size: Int + ) { + // Process in chunks of 16 for better vectorization (CPU can process multiple values per instruction) + var i = 0 + val chunks = size and 0xFFFFFFF0.toInt() // Round down to nearest 16 + + // Bulk process 16 coefficients at a time for SIMD-friendly operations + while (i < chunks) { + // Manual loop unrolling for better performance + result[i] = coeffs[i].toInt() * quantVals[i] + result[i + 1] = coeffs[i + 1].toInt() * quantVals[i + 1] + result[i + 2] = coeffs[i + 2].toInt() * quantVals[i + 2] + result[i + 3] = coeffs[i + 3].toInt() * quantVals[i + 3] + result[i + 4] = coeffs[i + 4].toInt() * quantVals[i + 4] + result[i + 5] = coeffs[i + 5].toInt() * quantVals[i + 5] + result[i + 6] = coeffs[i + 6].toInt() * quantVals[i + 6] + result[i + 7] = coeffs[i + 7].toInt() * quantVals[i + 7] + result[i + 8] = coeffs[i + 8].toInt() * quantVals[i + 8] + result[i + 9] = coeffs[i + 9].toInt() * quantVals[i + 9] + result[i + 10] = coeffs[i + 10].toInt() * quantVals[i + 10] + result[i + 11] = coeffs[i + 11].toInt() * quantVals[i + 11] + result[i + 12] = coeffs[i + 12].toInt() * quantVals[i + 12] + result[i + 13] = coeffs[i + 13].toInt() * quantVals[i + 13] + result[i + 14] = coeffs[i + 14].toInt() * quantVals[i + 14] + result[i + 15] = coeffs[i + 15].toInt() * quantVals[i + 15] + i += 16 + } + + // Handle remaining coefficients + while (i < size) { + result[i] = coeffs[i].toInt() * quantVals[i] + i++ + } + } + + /** + * OPTIMIZATION 11: Bulk apply corrections and quantization clamping + * Vectorized correction application with proper bounds checking + */ + private fun bulkApplyCorrectionsAndClamp( + block: ShortArray, mid: IntArray, off: LongArray, + quantVals: IntArray, quantHalf: IntArray, + kHalfSqrt2: Int, size: Int + ) { + var i = 0 + val chunks = size and 0xFFFFFFF0.toInt() // Process in chunks of 16 + + // Bulk process corrections in chunks for better CPU pipeline utilization + while (i < chunks) { + // Apply corrections with sqrt(2)/2 weighting - bulk operations + val corr0 = ((off[i] * kHalfSqrt2) shr 31).toInt() + val corr1 = ((off[i + 1] * kHalfSqrt2) shr 31).toInt() + val corr2 = ((off[i + 2] * kHalfSqrt2) shr 31).toInt() + val corr3 = ((off[i + 3] * kHalfSqrt2) shr 31).toInt() + val corr4 = ((off[i + 4] * kHalfSqrt2) shr 31).toInt() + val corr5 = ((off[i + 5] * kHalfSqrt2) shr 31).toInt() + val corr6 = ((off[i + 6] * kHalfSqrt2) shr 31).toInt() + val corr7 = ((off[i + 7] * kHalfSqrt2) shr 31).toInt() + + mid[i] += corr0 + mid[i + 1] += corr1 + mid[i + 2] += corr2 + mid[i + 3] += corr3 + mid[i + 4] += corr4 + mid[i + 5] += corr5 + mid[i + 6] += corr6 + mid[i + 7] += corr7 + + // Apply quantization interval clamping - bulk operations + val orig0 = block[i].toInt() * quantVals[i] + val orig1 = block[i + 1].toInt() * quantVals[i + 1] + val orig2 = block[i + 2].toInt() * quantVals[i + 2] + val orig3 = block[i + 3].toInt() * quantVals[i + 3] + val orig4 = block[i + 4].toInt() * quantVals[i + 4] + val orig5 = block[i + 5].toInt() * quantVals[i + 5] + val orig6 = block[i + 6].toInt() * quantVals[i + 6] + val orig7 = block[i + 7].toInt() * quantVals[i + 7] + + mid[i] = mid[i].coerceIn(orig0 - quantHalf[i], orig0 + quantHalf[i]) + mid[i + 1] = mid[i + 1].coerceIn(orig1 - quantHalf[i + 1], orig1 + quantHalf[i + 1]) + mid[i + 2] = mid[i + 2].coerceIn(orig2 - quantHalf[i + 2], orig2 + quantHalf[i + 2]) + mid[i + 3] = mid[i + 3].coerceIn(orig3 - quantHalf[i + 3], orig3 + quantHalf[i + 3]) + mid[i + 4] = mid[i + 4].coerceIn(orig4 - quantHalf[i + 4], orig4 + quantHalf[i + 4]) + mid[i + 5] = mid[i + 5].coerceIn(orig5 - quantHalf[i + 5], orig5 + quantHalf[i + 5]) + mid[i + 6] = mid[i + 6].coerceIn(orig6 - quantHalf[i + 6], orig6 + quantHalf[i + 6]) + mid[i + 7] = mid[i + 7].coerceIn(orig7 - quantHalf[i + 7], orig7 + quantHalf[i + 7]) + + // Convert back to quantized coefficients - bulk operations + val quantMax = Short.MAX_VALUE.toInt() + val quantMin = Short.MIN_VALUE.toInt() + block[i] = (mid[i] / quantVals[i]).coerceIn(quantMin, quantMax).toShort() + block[i + 1] = (mid[i + 1] / quantVals[i + 1]).coerceIn(quantMin, quantMax).toShort() + block[i + 2] = (mid[i + 2] / quantVals[i + 2]).coerceIn(quantMin, quantMax).toShort() + block[i + 3] = (mid[i + 3] / quantVals[i + 3]).coerceIn(quantMin, quantMax).toShort() + block[i + 4] = (mid[i + 4] / quantVals[i + 4]).coerceIn(quantMin, quantMax).toShort() + block[i + 5] = (mid[i + 5] / quantVals[i + 5]).coerceIn(quantMin, quantMax).toShort() + block[i + 6] = (mid[i + 6] / quantVals[i + 6]).coerceIn(quantMin, quantMax).toShort() + block[i + 7] = (mid[i + 7] / quantVals[i + 7]).coerceIn(quantMin, quantMax).toShort() + + i += 8 // Process 8 at a time for the remaining corrections + } + + // Handle remaining coefficients (usually 0-15 remaining for 256-coefficient blocks) + while (i < size) { + mid[i] += ((off[i] * kHalfSqrt2) shr 31).toInt() + + val originalValue = block[i].toInt() * quantVals[i] + mid[i] = mid[i].coerceIn(originalValue - quantHalf[i], originalValue + quantHalf[i]) + + block[i] = (mid[i] / quantVals[i]).coerceIn(Short.MIN_VALUE.toInt(), Short.MAX_VALUE.toInt()).toShort() + i++ + } + } + + // OPTIMIZED 16x16 horizontal boundary analysis private fun analyzeHorizontalBoundary16x16( leftBlockIndex: Int, rightBlockIndex: Int, blocksMid: Array, blocksOff: Array, kLinearGradient16: IntArray, kAlphaSqrt2_16: IntArray ) { - // Analyze low-to-mid frequencies only (v < 8 for 16x16, similar to v < 4 for 8x8) - for (v in 0 until 8) { + val leftMid = blocksMid[leftBlockIndex] + val rightMid = blocksMid[rightBlockIndex] + val leftOff = blocksOff[leftBlockIndex] + val rightOff = blocksOff[rightBlockIndex] + + // OPTIMIZATION 4: Process multiple frequencies in single loop for better cache locality + for (v in 0 until 8) { // Only low-to-mid frequencies var deltaV = 0L var hfPenalty = 0L + val vOffset = v * 16 - // Analyze discontinuity across the boundary + // First pass: Calculate boundary discontinuity for (u in 0 until 16) { + val idx = vOffset + u val alpha = kAlphaSqrt2_16[u] val sign = if (u and 1 != 0) -1 else 1 - val gi = blocksMid[leftBlockIndex][v * 16 + u] - val gj = blocksMid[rightBlockIndex][v * 16 + u] + val gi = leftMid[idx] + val gj = rightMid[idx] deltaV += alpha * (gj - sign * gi) hfPenalty += (u * u) * (gi * gi + gj * gj) } - // Apply corrections with high-frequency damping (scaled for 16x16) - for (u in 0 until 16) { - if (hfPenalty > 1600) deltaV /= 2 // Scaled threshold for 16x16 - val sign = if (u and 1 != 0) 1 else -1 - val gradientIdx = u.coerceIn(0, kLinearGradient16.size - 1) - blocksOff[leftBlockIndex][v * 16 + u] += deltaV * kLinearGradient16[gradientIdx] - blocksOff[rightBlockIndex][v * 16 + u] += deltaV * kLinearGradient16[gradientIdx] * sign - } + // OPTIMIZATION 8: Early exit for very small adjustments + if (kotlin.math.abs(deltaV) < 100) continue + + // OPTIMIZATION 5: Apply high-frequency damping once per frequency band + if (hfPenalty > 1600) deltaV /= 2 + + // Second pass: Apply corrections (BULK OPTIMIZED with unrolling) + val correction = deltaV + // Bulk apply corrections for 16 coefficients - manually unrolled for performance + leftOff[vOffset] += correction * kLinearGradient16[0] + rightOff[vOffset] += correction * kLinearGradient16[0] + leftOff[vOffset + 1] += correction * kLinearGradient16[1] + rightOff[vOffset + 1] -= correction * kLinearGradient16[1] // Alternating signs + leftOff[vOffset + 2] += correction * kLinearGradient16[2] + rightOff[vOffset + 2] += correction * kLinearGradient16[2] + leftOff[vOffset + 3] += correction * kLinearGradient16[3] + rightOff[vOffset + 3] -= correction * kLinearGradient16[3] + leftOff[vOffset + 4] += correction * kLinearGradient16[4] + rightOff[vOffset + 4] += correction * kLinearGradient16[4] + leftOff[vOffset + 5] += correction * kLinearGradient16[5] + rightOff[vOffset + 5] -= correction * kLinearGradient16[5] + leftOff[vOffset + 6] += correction * kLinearGradient16[6] + rightOff[vOffset + 6] += correction * kLinearGradient16[6] + leftOff[vOffset + 7] += correction * kLinearGradient16[7] + rightOff[vOffset + 7] -= correction * kLinearGradient16[7] + leftOff[vOffset + 8] += correction * kLinearGradient16[8] + rightOff[vOffset + 8] += correction * kLinearGradient16[8] + leftOff[vOffset + 9] += correction * kLinearGradient16[9] + rightOff[vOffset + 9] -= correction * kLinearGradient16[9] + leftOff[vOffset + 10] += correction * kLinearGradient16[10] + rightOff[vOffset + 10] += correction * kLinearGradient16[10] + leftOff[vOffset + 11] += correction * kLinearGradient16[11] + rightOff[vOffset + 11] -= correction * kLinearGradient16[11] + leftOff[vOffset + 12] += correction * kLinearGradient16[12] + rightOff[vOffset + 12] += correction * kLinearGradient16[12] + leftOff[vOffset + 13] += correction * kLinearGradient16[13] + rightOff[vOffset + 13] -= correction * kLinearGradient16[13] + leftOff[vOffset + 14] += correction * kLinearGradient16[14] + rightOff[vOffset + 14] += correction * kLinearGradient16[14] + leftOff[vOffset + 15] += correction * kLinearGradient16[15] + rightOff[vOffset + 15] -= correction * kLinearGradient16[15] } } - // 16x16 vertical boundary analysis (adapted from Google's 8x8 version) + // OPTIMIZED 16x16 vertical boundary analysis private fun analyzeVerticalBoundary16x16( topBlockIndex: Int, bottomBlockIndex: Int, blocksMid: Array, blocksOff: Array, kLinearGradient16: IntArray, kAlphaSqrt2_16: IntArray ) { - // Analyze low-to-mid frequencies only (u < 8 for 16x16) - for (u in 0 until 8) { + val topMid = blocksMid[topBlockIndex] + val bottomMid = blocksMid[bottomBlockIndex] + val topOff = blocksOff[topBlockIndex] + val bottomOff = blocksOff[bottomBlockIndex] + + // OPTIMIZATION 6: Optimized vertical analysis with better cache access pattern + for (u in 0 until 8) { // Only low-to-mid frequencies var deltaU = 0L var hfPenalty = 0L + // First pass: Calculate boundary discontinuity for (v in 0 until 16) { + val idx = v * 16 + u val alpha = kAlphaSqrt2_16[v] val sign = if (v and 1 != 0) -1 else 1 - val gi = blocksMid[topBlockIndex][v * 16 + u] - val gj = blocksMid[bottomBlockIndex][v * 16 + u] + val gi = topMid[idx] + val gj = bottomMid[idx] deltaU += alpha * (gj - sign * gi) hfPenalty += (v * v) * (gi * gi + gj * gj) } - for (v in 0 until 16) { - if (hfPenalty > 1600) deltaU /= 2 // Scaled threshold for 16x16 - val sign = if (v and 1 != 0) 1 else -1 - val gradientIdx = v.coerceIn(0, kLinearGradient16.size - 1) - blocksOff[topBlockIndex][v * 16 + u] += deltaU * kLinearGradient16[gradientIdx] - blocksOff[bottomBlockIndex][v * 16 + u] += deltaU * kLinearGradient16[gradientIdx] * sign - } + // Early exit for very small adjustments + if (kotlin.math.abs(deltaU) < 100) continue + + // Apply high-frequency damping once per frequency band + if (hfPenalty > 1600) deltaU /= 2 + + // Second pass: Apply corrections (BULK OPTIMIZED vertical) + val correction = deltaU + // Bulk apply corrections for 16 vertical coefficients - manually unrolled + topOff[u] += correction * kLinearGradient16[0] + bottomOff[u] += correction * kLinearGradient16[0] + topOff[16 + u] += correction * kLinearGradient16[1] + bottomOff[16 + u] -= correction * kLinearGradient16[1] // Alternating signs + topOff[32 + u] += correction * kLinearGradient16[2] + bottomOff[32 + u] += correction * kLinearGradient16[2] + topOff[48 + u] += correction * kLinearGradient16[3] + bottomOff[48 + u] -= correction * kLinearGradient16[3] + topOff[64 + u] += correction * kLinearGradient16[4] + bottomOff[64 + u] += correction * kLinearGradient16[4] + topOff[80 + u] += correction * kLinearGradient16[5] + bottomOff[80 + u] -= correction * kLinearGradient16[5] + topOff[96 + u] += correction * kLinearGradient16[6] + bottomOff[96 + u] += correction * kLinearGradient16[6] + topOff[112 + u] += correction * kLinearGradient16[7] + bottomOff[112 + u] -= correction * kLinearGradient16[7] + topOff[128 + u] += correction * kLinearGradient16[8] + bottomOff[128 + u] += correction * kLinearGradient16[8] + topOff[144 + u] += correction * kLinearGradient16[9] + bottomOff[144 + u] -= correction * kLinearGradient16[9] + topOff[160 + u] += correction * kLinearGradient16[10] + bottomOff[160 + u] += correction * kLinearGradient16[10] + topOff[176 + u] += correction * kLinearGradient16[11] + bottomOff[176 + u] -= correction * kLinearGradient16[11] + topOff[192 + u] += correction * kLinearGradient16[12] + bottomOff[192 + u] += correction * kLinearGradient16[12] + topOff[208 + u] += correction * kLinearGradient16[13] + bottomOff[208 + u] -= correction * kLinearGradient16[13] + topOff[224 + u] += correction * kLinearGradient16[14] + bottomOff[224 + u] += correction * kLinearGradient16[14] + topOff[240 + u] += correction * kLinearGradient16[15] + bottomOff[240 + u] -= correction * kLinearGradient16[15] } }