From 198e951102712b1bd4ad900ca975f97fbdd650e6 Mon Sep 17 00:00:00 2001 From: minjaesong Date: Sat, 13 Sep 2025 00:39:12 +0900 Subject: [PATCH] various encoder bug fixes --- assets/disk0/tvdos/bin/playtev.js | 6 +- .../torvald/tsvm/GraphicsJSR223Delegate.kt | 274 ++++++++++++++---- tsvm_core/src/net/torvald/util/Float16.kt | 90 ++++++ video_encoder/encoder_tev.c | 262 +++++++++++++++-- 4 files changed, 553 insertions(+), 79 deletions(-) create mode 100644 tsvm_core/src/net/torvald/util/Float16.kt diff --git a/assets/disk0/tvdos/bin/playtev.js b/assets/disk0/tvdos/bin/playtev.js index 47b342c..5bfa8d0 100644 --- a/assets/disk0/tvdos/bin/playtev.js +++ b/assets/disk0/tvdos/bin/playtev.js @@ -418,6 +418,7 @@ let hasSubtitle = !!(flags & 2) let videoFlags = seqread.readOneByte() let isInterlaced = !!(videoFlags & 1) let isNTSC = !!(videoFlags & 2) +let isLossless = !!(videoFlags & 4) let unused2 = seqread.readOneByte() @@ -427,6 +428,7 @@ serial.println(` FPS: ${(isNTSC) ? (fps * 1000 / 1001) : fps}`) serial.println(` Duration: ${totalFrames / fps}`) serial.println(` Audio: ${hasAudio ? "Yes" : "No"}`) serial.println(` Resolution: ${width}x${height}, ${isInterlaced ? "interlaced" : "progressive"}`) +serial.println(` Quality: Y=${qualityY}, Co=${qualityCo}, Cg=${qualityCg}, ${isLossless ? "lossless" : "lossy"}`) // DEBUG interlace raw output @@ -665,14 +667,14 @@ try { if (isInterlaced) { // For interlaced: decode current frame into currentFieldAddr // For display: use prevFieldAddr as current, currentFieldAddr as next - graphics.tevDecode(blockDataPtr, nextFieldAddr, currentFieldAddr, width, decodingHeight, qualityY, qualityCo, qualityCg, trueFrameCount, debugMotionVectors, version, enableDeblocking, enableBoundaryAwareDecoding) + graphics.tevDecode(blockDataPtr, nextFieldAddr, currentFieldAddr, width, decodingHeight, qualityY, qualityCo, qualityCg, trueFrameCount, debugMotionVectors, version, enableDeblocking, enableBoundaryAwareDecoding, isLossless) graphics.tevDeinterlace(trueFrameCount, width, decodingHeight, prevFieldAddr, currentFieldAddr, nextFieldAddr, CURRENT_RGB_ADDR, deinterlaceAlgorithm) // Rotate field buffers for next frame: NEXT -> CURRENT -> PREV rotateFieldBuffers() } else { // Progressive or first frame: normal decoding without temporal prediction - graphics.tevDecode(blockDataPtr, CURRENT_RGB_ADDR, PREV_RGB_ADDR, width, decodingHeight, qualityY, qualityCo, qualityCg, trueFrameCount, debugMotionVectors, version, enableDeblocking, enableBoundaryAwareDecoding) + graphics.tevDecode(blockDataPtr, CURRENT_RGB_ADDR, PREV_RGB_ADDR, width, decodingHeight, qualityY, qualityCo, qualityCg, trueFrameCount, debugMotionVectors, version, enableDeblocking, enableBoundaryAwareDecoding, isLossless) } decodeTime = (sys.nanoTime() - decodeStart) / 1000000.0 // Convert to milliseconds diff --git a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt index d7ce28d..6eb895b 100644 --- a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt +++ b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt @@ -12,6 +12,7 @@ import net.torvald.terrarum.modulecomputers.virtualcomputer.tvd.toUint import net.torvald.tsvm.peripheral.GraphicsAdapter import net.torvald.tsvm.peripheral.PeriBase import net.torvald.tsvm.peripheral.fmod +import net.torvald.util.Float16 import kotlin.math.* class GraphicsJSR223Delegate(private val vm: VM) { @@ -21,6 +22,77 @@ class GraphicsJSR223Delegate(private val vm: VM) { private val idct16TempBuffer = FloatArray(256) // For 16x16 IDCT private val idct16SeparableBuffer = FloatArray(256) // For separable 16x16 IDCT + // Lossless IDCT functions for float16 coefficients (no quantization) + private fun tevIdct8x8_lossless(coeffs: FloatArray): IntArray { + val result = IntArray(64) + + // Fast separable IDCT (row-column decomposition) for lossless coefficients + // First pass: Process rows (8 1D IDCTs) + for (row in 0 until 8) { + for (col in 0 until 8) { + var sum = 0f + for (u in 0 until 8) { + sum += dctBasis8[u][col] * coeffs[row * 8 + u] + } + idct8TempBuffer[row * 8 + col] = sum * 0.5f + } + } + + // Second pass: Process columns (8 1D IDCTs) + for (col in 0 until 8) { + for (row in 0 until 8) { + var sum = 0f + for (v in 0 until 8) { + sum += dctBasis8[v][row] * idct8TempBuffer[v * 8 + col] + } + val finalValue = sum * 0.5f + 128f + result[row * 8 + col] = if (finalValue.isNaN() || finalValue.isInfinite()) { + println("NaN/Inf detected in 8x8 IDCT at ($row,$col): sum=$sum, finalValue=$finalValue") + 128 // Default to middle gray + } else { + finalValue.roundToInt().coerceIn(0, 255) + } + } + } + + return result + } + + private fun tevIdct16x16_lossless(coeffs: FloatArray): IntArray { + val result = IntArray(256) + + // Fast separable IDCT (row-column decomposition) for 16x16 lossless coefficients + // First pass: Process rows (16 1D IDCTs) + for (row in 0 until 16) { + for (col in 0 until 16) { + var sum = 0f + for (u in 0 until 16) { + sum += dctBasis16[u][col] * coeffs[row * 16 + u] + } + idct16TempBuffer[row * 16 + col] = sum * 0.25f + } + } + + // Second pass: Process columns (16 1D IDCTs) + for (col in 0 until 16) { + for (row in 0 until 16) { + var sum = 0f + for (v in 0 until 16) { + sum += dctBasis16[v][row] * idct16TempBuffer[v * 16 + col] + } + val finalValue = sum * 0.25f + 128f + result[row * 16 + col] = if (finalValue.isNaN() || finalValue.isInfinite()) { + println("NaN/Inf detected in 16x16 IDCT at ($row,$col): sum=$sum, finalValue=$finalValue") + 128 // Default to middle gray + } else { + finalValue.roundToInt().coerceIn(0, 255) + } + } + } + + return result + } + private fun getFirstGPU(): GraphicsAdapter? { return vm.findPeribyType(VM.PERITYPE_GPU_AND_TERM)?.peripheral as? GraphicsAdapter @@ -1649,7 +1721,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { val result = IntArray(64) // Reuse preallocated temp buffer to reduce GC pressure for (i in coeffs.indices) { - idct8TempBuffer[i] = coeffs[i] * quantTable[i] * jpeg_quality_to_mult(qualityIndex * rateControlFactor) + idct8TempBuffer[i] = coeffs[i] * (quantTable[i] * jpeg_quality_to_mult(qualityIndex * rateControlFactor)).coerceIn(1f, 255f) } // Fast separable IDCT (row-column decomposition) @@ -1662,7 +1734,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { val coeff = if (isChromaResidual && coeffIdx == 0) { coeffs[coeffIdx].toFloat() // DC lossless for chroma residual } else { - coeffs[coeffIdx] * quantTable[coeffIdx] * jpeg_quality_to_mult(qualityIndex * rateControlFactor) + coeffs[coeffIdx] * (quantTable[coeffIdx] * jpeg_quality_to_mult(qualityIndex * rateControlFactor)).coerceIn(1f, 255f) } sum += dctBasis8[u][col] * coeff } @@ -1708,7 +1780,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { val coeff = if (idx == 0) { coeffs[idx].toFloat() // DC lossless for luma } else { - coeffs[idx] * quantTable[idx] * jpeg_quality_to_mult(qualityIndex * rateControlFactor) + coeffs[idx] * (quantTable[idx] * jpeg_quality_to_mult(qualityIndex * rateControlFactor)).coerceIn(1f, 255f) } idct16TempBuffer[idx] = coeff } @@ -2555,7 +2627,8 @@ class GraphicsJSR223Delegate(private val vm: VM) { fun tevDecode(blockDataPtr: Long, currentRGBAddr: Long, prevRGBAddr: Long, width: Int, height: Int, qY: Int, qCo: Int, qCg: Int, frameCounter: Int, debugMotionVectors: Boolean = false, tevVersion: Int = 2, - enableDeblocking: Boolean = true, enableBoundaryAwareDecoding: Boolean = false) { + enableDeblocking: Boolean = true, enableBoundaryAwareDecoding: Boolean = false, + isLossless: Boolean = false) { // height doesn't change when interlaced, because that's the encoder's output @@ -2846,17 +2919,65 @@ class GraphicsJSR223Delegate(private val vm: VM) { } 0x01 -> { // TEV_MODE_INTRA - Full YCoCg-R DCT decode (no motion compensation) - // Read DCT coefficients: Y (16x16=256), Co (8x8=64), Cg (8x8=64) + val yBlock: IntArray + val coBlock: IntArray + val cgBlock: IntArray + + if (isLossless) { + // Lossless mode: coefficients are stored as float16, no quantization + // Read float16 coefficients: Y (16x16=256), Co (8x8=64), Cg (8x8=64) + val coeffFloat16Array = ShortArray(384) // 384 float16 values stored as shorts + vm.bulkPeekShort(readPtr.toInt(), coeffFloat16Array, 768) // 384 * 2 bytes + readPtr += 768 + + // Convert float16 to float32 and perform IDCT directly (no quantization) + println("DEBUG: Reading lossless coefficients, first few float16 values: ${coeffFloat16Array.take(10).map { "0x${it.toString(16)}" }}") + val yCoeffs = FloatArray(256) { i -> + // Convert signed short to unsigned short for float16 interpretation + val signedShort = coeffFloat16Array[i] + val float16bits = signedShort.toInt() and 0xFFFF // Convert to unsigned + val floatVal = Float16.toFloat(float16bits.toShort()) + if (floatVal.isNaN() || floatVal.isInfinite()) { + println("NaN/Inf detected at Y coefficient $i: signedShort=0x${signedShort.toString(16)}, unsigned=0x${float16bits.toString(16)}, floatVal=$floatVal") + 0f // Replace NaN with 0 + } else floatVal + } + val coCoeffs = FloatArray(64) { i -> + // Convert signed short to unsigned short for float16 interpretation + val signedShort = coeffFloat16Array[256 + i] + val float16bits = signedShort.toInt() and 0xFFFF // Convert to unsigned + val floatVal = Float16.toFloat(float16bits.toShort()) + if (floatVal.isNaN() || floatVal.isInfinite()) { + println("NaN/Inf detected at Co coefficient $i: signedShort=0x${signedShort.toString(16)}, unsigned=0x${float16bits.toString(16)}, floatVal=$floatVal") + 0f // Replace NaN with 0 + } else floatVal + } + val cgCoeffs = FloatArray(64) { i -> + // Convert signed short to unsigned short for float16 interpretation + val signedShort = coeffFloat16Array[320 + i] + val float16bits = signedShort.toInt() and 0xFFFF // Convert to unsigned + val floatVal = Float16.toFloat(float16bits.toShort()) + if (floatVal.isNaN() || floatVal.isInfinite()) { + println("NaN/Inf detected at Cg coefficient $i: signedShort=0x${signedShort.toString(16)}, unsigned=0x${float16bits.toString(16)}, floatVal=$floatVal") + 0f // Replace NaN with 0 + } else floatVal + } + + yBlock = tevIdct16x16_lossless(yCoeffs) + coBlock = tevIdct8x8_lossless(coCoeffs) + cgBlock = tevIdct8x8_lossless(cgCoeffs) + } else { + // Regular lossy mode: quantized int16 coefficients + // Optimized bulk reading of all DCT coefficients: Y(256×2) + Co(64×2) + Cg(64×2) = 768 bytes + val coeffShortArray = ShortArray(384) // Total coefficients: 256 + 64 + 64 = 384 shorts + vm.bulkPeekShort(readPtr.toInt(), coeffShortArray, 768) + readPtr += 768 - // Optimized bulk reading of all DCT coefficients: Y(256×2) + Co(64×2) + Cg(64×2) = 768 bytes - val coeffShortArray = ShortArray(384) // Total coefficients: 256 + 64 + 64 = 384 shorts - vm.bulkPeekShort(readPtr.toInt(), coeffShortArray, 768) - readPtr += 768 - - // Perform hardware IDCT for each channel using fast algorithm - val yBlock = tevIdct16x16_fast(coeffShortArray.sliceArray(0 until 256), QUANT_TABLE_Y, qY, rateControlFactor) - val coBlock = tevIdct8x8_fast(coeffShortArray.sliceArray(256 until 320), QUANT_TABLE_C, true, qCo, rateControlFactor) - val cgBlock = tevIdct8x8_fast(coeffShortArray.sliceArray(320 until 384), QUANT_TABLE_C, true, qCg, rateControlFactor) + // Perform hardware IDCT for each channel using fast algorithm + yBlock = tevIdct16x16_fast(coeffShortArray.sliceArray(0 until 256), QUANT_TABLE_Y, qY, rateControlFactor) + coBlock = tevIdct8x8_fast(coeffShortArray.sliceArray(256 until 320), QUANT_TABLE_C, true, qCo, rateControlFactor) + cgBlock = tevIdct8x8_fast(coeffShortArray.sliceArray(320 until 384), QUANT_TABLE_C, true, qCg, rateControlFactor) + } // Convert to RGB (YCoCg-R for v2, XYB for v3) val rgbData = if (tevVersion == 3) { @@ -3275,7 +3396,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { val quantValue = if (i == 0) 1.0f else { quantTable[coeffIdx] * jpeg_quality_to_mult(qScale * rateControlFactor) } - result[blockIndex]!![i] = block[i] * quantValue + result[blockIndex]!![i] = block[i] * quantValue.coerceIn(1f, 255f) } } } @@ -3307,7 +3428,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { for (i in 1 until coeffsSize) { val coeffIdx = i.coerceIn(0, quantTable.size - 1) - val quant = (quantTable[coeffIdx] * qualityMult).toInt() + val quant = (quantTable[coeffIdx] * qualityMult).coerceIn(1f, 255f).toInt() quantValues[blockIndex][i] = quant quantHalfValues[blockIndex][i] = quant / 2 } @@ -3511,7 +3632,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { val rightOff = blocksOff[rightBlockIndex] // OPTIMIZATION 4: Process multiple frequencies in single loop for better cache locality - for (v in 0 until 16) { // Only low-to-mid frequencies + for (v in 0 until 8) { // Only low-to-mid frequencies var deltaV = 0L var hfPenalty = 0L val vOffset = v * 16 @@ -3667,7 +3788,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { blocksMid[blockIndex][i] = dcValue } else { // AC coefficients: use quantization intervals - val quant = (quantTable[quantIdx] * jpeg_quality_to_mult(qScale * rateControlFactor)).toInt() + val quant = (quantTable[quantIdx] * jpeg_quality_to_mult(qScale * rateControlFactor)).coerceIn(1f, 255f).toInt() // Standard dequantized value (midpoint) blocksMid[blockIndex][i] = block[i].toInt() * quant @@ -3719,7 +3840,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { blocksMax[blockIndex][i] = dcValue } else { // AC coefficients: use quantization intervals - val quant = (quantTable[quantIdx] * jpeg_quality_to_mult(qScale * rateControlFactor)).toInt() + val quant = (quantTable[quantIdx] * jpeg_quality_to_mult(qScale * rateControlFactor)).coerceIn(1f, 255f).toInt() // Standard dequantized value (midpoint) blocksMid[blockIndex][i] = block[i].toInt() * quant @@ -3789,73 +3910,116 @@ class GraphicsJSR223Delegate(private val vm: VM) { return result } + // BULK OPTIMIZED 8x8 horizontal boundary analysis for chroma channels private fun analyzeHorizontalBoundary( leftBlockIndex: Int, rightBlockIndex: Int, blocksMid: Array, blocksOff: Array, kLinearGradient: IntArray, kAlphaSqrt2: IntArray ) { - // Only process low-to-mid frequencies (v < 4 for 8x8, v < 8 for 16x16) - val maxV = 8 + val leftMid = blocksMid[leftBlockIndex] + val rightMid = blocksMid[rightBlockIndex] + val leftOff = blocksOff[leftBlockIndex] + val rightOff = blocksOff[rightBlockIndex] - for (v in 0 until maxV) { + // OPTIMIZATION 12: Process 8x8 boundaries with bulk operations (v < 4 for low-to-mid frequencies) + for (v in 0 until 4) { // Only low-to-mid frequencies for 8x8 var deltaV = 0L var hfPenalty = 0L + val vOffset = v * 8 - // Analyze boundary discontinuity + // First pass: Calculate boundary discontinuity for (u in 0 until 8) { - val alpha = kAlphaSqrt2[u.coerceIn(0, 7)] - val sign = if (u and 1 == 1) -1 else 1 - val gi = blocksMid[leftBlockIndex][v * 8 + u] - val gj = blocksMid[rightBlockIndex][v * 8 + u] + val idx = vOffset + u + val alpha = kAlphaSqrt2[u] // Direct access (u < 8) + val sign = if (u and 1 != 0) -1 else 1 + val gi = leftMid[idx] + val gj = rightMid[idx] - deltaV += (alpha * (gj - sign * gi)).toLong() - hfPenalty += (u * u * (gi * gi + gj * gj)).toLong() + deltaV += alpha * (gj - sign * gi) + hfPenalty += (u * u) * (gi * gi + gj * gj) } - // Apply corrections with high-frequency damping - if (hfPenalty > 400) deltaV /= 2 + // Early exit for very small adjustments + if (kotlin.math.abs(deltaV) < 100) continue - for (u in 0 until 8) { - val gradientIdx = u.coerceIn(0, kLinearGradient.size - 1) - val sign = if (u and 1 == 1) 1 else -1 - blocksOff[leftBlockIndex][v * 8 + u] = blocksOff[leftBlockIndex][v * 8 + u] + deltaV * kLinearGradient[gradientIdx] - blocksOff[rightBlockIndex][v * 8 + u] = blocksOff[rightBlockIndex][v * 8 + u] + deltaV * kLinearGradient[gradientIdx] * sign - } + // Apply high-frequency damping once per frequency band + if (hfPenalty > 400) deltaV /= 2 // 8x8 threshold + + // Second pass: Apply corrections (BULK OPTIMIZED with unrolling for 8x8) + val correction = deltaV + // Bulk apply corrections for 8 coefficients - manually unrolled for performance + leftOff[vOffset] += correction * kLinearGradient[0] + rightOff[vOffset] += correction * kLinearGradient[0] + leftOff[vOffset + 1] += correction * kLinearGradient[1] + rightOff[vOffset + 1] -= correction * kLinearGradient[1] // Alternating signs + leftOff[vOffset + 2] += correction * kLinearGradient[2] + rightOff[vOffset + 2] += correction * kLinearGradient[2] + leftOff[vOffset + 3] += correction * kLinearGradient[3] + rightOff[vOffset + 3] -= correction * kLinearGradient[3] + leftOff[vOffset + 4] += correction * kLinearGradient[4] + rightOff[vOffset + 4] += correction * kLinearGradient[4] + leftOff[vOffset + 5] += correction * kLinearGradient[5] + rightOff[vOffset + 5] -= correction * kLinearGradient[5] + leftOff[vOffset + 6] += correction * kLinearGradient[6] + rightOff[vOffset + 6] += correction * kLinearGradient[6] + leftOff[vOffset + 7] += correction * kLinearGradient[7] + rightOff[vOffset + 7] -= correction * kLinearGradient[7] } } + // BULK OPTIMIZED 8x8 vertical boundary analysis for chroma channels private fun analyzeVerticalBoundary( topBlockIndex: Int, bottomBlockIndex: Int, blocksMid: Array, blocksOff: Array, kLinearGradient: IntArray, kAlphaSqrt2: IntArray ) { - // Only process low-to-mid frequencies (u < 4 for 8x8, u < 8 for 16x16) - val maxU = 8 + val topMid = blocksMid[topBlockIndex] + val bottomMid = blocksMid[bottomBlockIndex] + val topOff = blocksOff[topBlockIndex] + val bottomOff = blocksOff[bottomBlockIndex] - for (u in 0 until maxU) { + // OPTIMIZATION 13: Optimized vertical analysis for 8x8 with better cache access pattern + for (u in 0 until 4) { // Only low-to-mid frequencies for 8x8 var deltaU = 0L var hfPenalty = 0L - // Analyze boundary discontinuity + // First pass: Calculate boundary discontinuity for (v in 0 until 8) { - val alpha = kAlphaSqrt2[v.coerceIn(0, 7)] - val sign = if (v and 1 == 1) -1 else 1 - val gi = blocksMid[topBlockIndex][v * 8 + u] - val gj = blocksMid[bottomBlockIndex][v * 8 + u] + val idx = v * 8 + u + val alpha = kAlphaSqrt2[v] // Direct access (v < 8) + val sign = if (v and 1 != 0) -1 else 1 + val gi = topMid[idx] + val gj = bottomMid[idx] - deltaU += (alpha * (gj - sign * gi)).toLong() - hfPenalty += (v * v * (gi * gi + gj * gj)).toLong() + deltaU += alpha * (gj - sign * gi) + hfPenalty += (v * v) * (gi * gi + gj * gj) } - // Apply corrections with high-frequency damping - if (hfPenalty > 400) deltaU /= 2 + // Early exit for very small adjustments + if (kotlin.math.abs(deltaU) < 100) continue - for (v in 0 until 8) { - val gradientIdx = v.coerceIn(0, kLinearGradient.size - 1) - val sign = if (v and 1 == 1) 1 else -1 - blocksOff[topBlockIndex][v * 8 + u] = blocksOff[topBlockIndex][v * 8 + u] + deltaU * kLinearGradient[gradientIdx] - blocksOff[bottomBlockIndex][v * 8 + u] = blocksOff[bottomBlockIndex][v * 8 + u] + deltaU * kLinearGradient[gradientIdx] * sign - } + // Apply high-frequency damping once per frequency band + if (hfPenalty > 400) deltaU /= 2 // 8x8 threshold + + // Second pass: Apply corrections (BULK OPTIMIZED vertical for 8x8) + val correction = deltaU + // Bulk apply corrections for 8 vertical coefficients - manually unrolled + topOff[u] += correction * kLinearGradient[0] + bottomOff[u] += correction * kLinearGradient[0] + topOff[8 + u] += correction * kLinearGradient[1] + bottomOff[8 + u] -= correction * kLinearGradient[1] // Alternating signs + topOff[16 + u] += correction * kLinearGradient[2] + bottomOff[16 + u] += correction * kLinearGradient[2] + topOff[24 + u] += correction * kLinearGradient[3] + bottomOff[24 + u] -= correction * kLinearGradient[3] + topOff[32 + u] += correction * kLinearGradient[4] + bottomOff[32 + u] += correction * kLinearGradient[4] + topOff[40 + u] += correction * kLinearGradient[5] + bottomOff[40 + u] -= correction * kLinearGradient[5] + topOff[48 + u] += correction * kLinearGradient[6] + bottomOff[48 + u] += correction * kLinearGradient[6] + topOff[56 + u] += correction * kLinearGradient[7] + bottomOff[56 + u] -= correction * kLinearGradient[7] } } diff --git a/tsvm_core/src/net/torvald/util/Float16.kt b/tsvm_core/src/net/torvald/util/Float16.kt new file mode 100644 index 0000000..cef406c --- /dev/null +++ b/tsvm_core/src/net/torvald/util/Float16.kt @@ -0,0 +1,90 @@ +package net.torvald.util + +import kotlin.experimental.or + +class Float16() { + + var bits = 0.toShort() + private set + + constructor(fval: Float) : this() { + fromFloat(fval) + } + + fun toFloat() = Float16.toFloat(bits) + fun fromFloat(fval: Float) { + bits = Float16.fromFloat(fval) + } + + + operator fun times(other: Float) = fromFloat(this.toFloat() * other) + operator fun times(other: Float16) = fromFloat(this.toFloat() * other.toFloat()) + + operator fun div(other: Float) = fromFloat(this.toFloat() / other) + operator fun div(other: Float16) = fromFloat(this.toFloat() / other.toFloat()) + + // operators are stripped: you don't calculate from FP16; this is only for storing values // + + companion object { + fun toFloat(hbits: Short): Float { + val hbits = hbits.toInt().and(0xFFFF) + + var mant = hbits and 0x03ff // 10 bits mantissa + var exp = hbits and 0x7c00 // 5 bits exponent + if (exp == 0x7c00) + // NaN/Inf + exp = 0x3fc00 // -> NaN/Inf + else if (exp != 0) + // normalized value + { + exp += 0x1c000 // exp - 15 + 127 + if (mant == 0 && exp > 0x1c400) + // smooth transition + return java.lang.Float.intBitsToFloat(hbits and 0x8000 shl 16 or (exp shl 13) or 0x3ff) + } + else if (mant != 0) + // && exp==0 -> subnormal + { + exp = 0x1c400 // make it normal + do { + mant = mant shl 1 // mantissa * 2 + exp -= 0x400 // decrease exp by 1 + } while (mant and 0x400 == 0) // while not normal + mant = mant and 0x3ff // discard subnormal bit + } // else +/-0 -> +/-0 + return java.lang.Float.intBitsToFloat(// combine all parts + hbits and 0x8000 shl 16 or (exp or mant shl 13)) // value << ( 23 - 10 ) + } + + fun fromFloat(fval: Float): Short { + val fbits = java.lang.Float.floatToIntBits(fval) + val sign = fbits.ushr(16).and(0x8000).toShort() // sign only + var `val` = (fbits and 0x7fffffff) + 0x1000 // rounded value + + if (`val` >= 0x47800000) + // might be or become NaN/Inf + { // avoid Inf due to rounding + if (fbits and 0x7fffffff >= 0x47800000) { // is or must become NaN/Inf + if (`val` < 0x7f800000) + // was value but too large + return sign or 0x7c00 // make it +/-Inf + return sign or 0x7c00 or // remains +/-Inf or NaN + (fbits and 0x007fffff).ushr(13).toShort() // keep NaN (and Inf) bits + } + return sign or 0x7bff.toShort() // unrounded not quite Inf + } + if (`val` >= 0x38800000) + // remains normalized value + return sign or (`val` - 0x38000000).ushr(13).toShort() // exp - 127 + 15 + if (`val` < 0x33000000) + // too small for subnormal + return sign // becomes +/-0 + `val` = (fbits and 0x7fffffff).ushr(23) // tmp exp for subnormal calc + + return sign or ((fbits and 0x7fffff or 0x800000) // add subnormal bit + + 0x800000.ushr(`val` - 102) // round depending on cut off + ).ushr(126 - `val`) // div by 2^(1-(exp-127+15)) and >> 13 | exp=0 + .toShort() + } + } +} \ No newline at end of file diff --git a/video_encoder/encoder_tev.c b/video_encoder/encoder_tev.c index 04ef2ec..d39a957 100644 --- a/video_encoder/encoder_tev.c +++ b/video_encoder/encoder_tev.c @@ -14,6 +14,58 @@ #include #include +// Float16 conversion functions (adapted from Float16.kt) +static inline uint16_t float_to_float16(float fval) { + uint32_t fbits = *(uint32_t*)&fval; + uint16_t sign = (fbits >> 16) & 0x8000; // sign only + uint32_t val = (fbits & 0x7fffffff) + 0x1000; // rounded value + + if (val >= 0x47800000) { // might be or become NaN/Inf + if ((fbits & 0x7fffffff) >= 0x47800000) { // is or must become NaN/Inf + if (val < 0x7f800000) // was value but too large + return sign | 0x7c00; // make it +/-Inf + return sign | 0x7c00 | // remains +/-Inf or NaN + ((fbits & 0x007fffff) >> 13); // keep NaN (and Inf) bits + } + return sign | 0x7bff; // unrounded not quite Inf + } + if (val >= 0x38800000) // remains normalized value + return sign | ((val - 0x38000000) >> 13); // exp - 127 + 15 + if (val < 0x33000000) // too small for subnormal + return sign; // becomes +/-0 + val = (fbits & 0x7fffffff) >> 23; // tmp exp for subnormal calc + + return sign | (((fbits & 0x7fffff) | 0x800000) + // add subnormal bit + (0x800000 >> (val - 102)) // round depending on cut off + ) >> (126 - val); // div by 2^(1-(exp-127+15)) and >> 13 | exp=0 +} + +static inline float float16_to_float(uint16_t hbits) { + uint32_t mant = hbits & 0x03ff; // 10 bits mantissa + uint32_t exp = hbits & 0x7c00; // 5 bits exponent + + if (exp == 0x7c00) // NaN/Inf + exp = 0x3fc00; // -> NaN/Inf + else if (exp != 0) { // normalized value + exp += 0x1c000; // exp - 15 + 127 + if (mant == 0 && exp > 0x1c400) { // smooth transition + uint32_t fbits = ((hbits & 0x8000) << 16) | (exp << 13) | 0x3ff; + return *(float*)&fbits; + } + } + else if (mant != 0) { // && exp==0 -> subnormal + exp = 0x1c400; // make it normal + do { + mant <<= 1; // mantissa * 2 + exp -= 0x400; // decrease exp by 1 + } while ((mant & 0x400) == 0); // while not normal + mant &= 0x3ff; // discard subnormal bit + } // else +/-0 -> +/-0 + + uint32_t fbits = ((hbits & 0x8000) << 16) | ((exp | mant) << 13); + return *(float*)&fbits; +} + // TSVM Enhanced Video (TEV) format constants #define TEV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x45\x56" // "\x1FTSVM TEV" #define TEV_VERSION 2 // Updated for YCoCg-R 4:2:0 @@ -103,7 +155,7 @@ static const uint32_t QUANT_TABLE_C[HALF_BLOCK_SIZE_SQR] = // Audio constants (reuse MP2 from existing system) #define MP2_SAMPLE_RATE 32000 -#define MP2_DEFAULT_PACKET_SIZE 0x240 +#define MP2_DEFAULT_PACKET_SIZE 1728 // Default values #define DEFAULT_WIDTH 560 @@ -140,6 +192,17 @@ typedef struct __attribute__((packed)) { int16_t cg_coeffs[HALF_BLOCK_SIZE_SQR]; // quantised Cg DCT coefficients (8x8) } tev_block_t; +// Lossless TEV block structure (uses float32 internally, converted to float16 during serialization) +typedef struct __attribute__((packed)) { + uint8_t mode; // Block encoding mode + int16_t mv_x, mv_y; // Motion vector (1/4 pixel precision) + float rate_control_factor; // Always 1.0f in lossless mode + uint16_t cbp; // Coded block pattern (which channels have non-zero coeffs) + float y_coeffs[BLOCK_SIZE_SQR]; // lossless Y DCT coefficients (16x16) + float co_coeffs[HALF_BLOCK_SIZE_SQR]; // lossless Co DCT coefficients (8x8) + float cg_coeffs[HALF_BLOCK_SIZE_SQR]; // lossless Cg DCT coefficients (8x8) +} tev_lossless_block_t; + // Subtitle entry structure typedef struct subtitle_entry { int start_frame; @@ -168,6 +231,8 @@ typedef struct { int qualityCo; int qualityCg; int verbose; + int disable_rcf; // 0 = rcf enabled, 1 = disabled + int lossless_mode; // 0 = lossy (default), 1 = lossless mode // Bitrate control int target_bitrate_kbps; // Target bitrate in kbps (0 = quality mode) @@ -216,10 +281,9 @@ typedef struct { // Subtitle handling subtitle_entry_t *subtitle_list; subtitle_entry_t *current_subtitle; - + // Complexity statistics collection int stats_mode; // 0 = disabled, 1 = enabled - int disable_rcf; // 0 = rcf enabled, 1 = disabled float *complexity_values; // Array to store all complexity values int complexity_count; // Current count of complexity values int complexity_capacity; // Capacity of complexity_values array @@ -1041,6 +1105,107 @@ static void encode_block(tev_encoder_t *enc, int block_x, int block_y, int is_ke block->cbp = 0x07; // Y, Co, Cg all present } +// Encode a 16x16 block in lossless mode +static void encode_block_lossless(tev_encoder_t *enc, int block_x, int block_y, int is_keyframe) { + tev_lossless_block_t *block = (tev_lossless_block_t*)&enc->block_data[block_y * ((enc->width + 15) / 16) + block_x]; + + // Extract YCoCg-R block + extract_ycocgr_block(enc->current_rgb, enc->width, enc->height, + block_x, block_y, + enc->y_workspace, enc->co_workspace, enc->cg_workspace); + + if (is_keyframe) { + // Intra coding for keyframes + block->mode = TEV_MODE_INTRA; + block->mv_x = block->mv_y = 0; + enc->blocks_intra++; + } else { + // Same mode decision logic as regular encode_block + // For simplicity, using INTRA for now in lossless mode + block->mode = TEV_MODE_INTRA; + block->mv_x = block->mv_y = 0; + enc->blocks_intra++; + } + + // Lossless mode: rate control factor is always 1.0f + block->rate_control_factor = 1.0f; + + // Apply DCT transforms using the same pattern as regular encoding + // Y channel (16x16) + dct_16x16_fast(enc->y_workspace, enc->dct_workspace); + for (int i = 0; i < BLOCK_SIZE_SQR; i++) { + block->y_coeffs[i] = enc->dct_workspace[i]; // Store directly without quantization + } + + // Co channel (8x8) + dct_8x8_fast(enc->co_workspace, enc->dct_workspace); + for (int i = 0; i < HALF_BLOCK_SIZE_SQR; i++) { + block->co_coeffs[i] = enc->dct_workspace[i]; // Store directly without quantization + } + + // Cg channel (8x8) + dct_8x8_fast(enc->cg_workspace, enc->dct_workspace); + for (int i = 0; i < HALF_BLOCK_SIZE_SQR; i++) { + block->cg_coeffs[i] = enc->dct_workspace[i]; // Store directly without quantization + } + + // Set CBP (simplified - always encode all channels) + block->cbp = 0x07; // Y, Co, Cg all present +} + +// Serialized lossless block structure (for writing to file with float16 coefficients) +typedef struct __attribute__((packed)) { + uint8_t mode; + int16_t mv_x, mv_y; + float rate_control_factor; // Always 1.0f in lossless mode + uint16_t cbp; + uint16_t y_coeffs[BLOCK_SIZE_SQR]; // float16 Y coefficients + uint16_t co_coeffs[HALF_BLOCK_SIZE_SQR]; // float16 Co coefficients + uint16_t cg_coeffs[HALF_BLOCK_SIZE_SQR]; // float16 Cg coefficients +} tev_serialized_lossless_block_t; + +// Convert lossless blocks to serialized format with float16 coefficients +static void serialize_lossless_blocks(tev_encoder_t *enc, int blocks_x, int blocks_y, + tev_serialized_lossless_block_t *serialized_blocks) { + for (int by = 0; by < blocks_y; by++) { + for (int bx = 0; bx < blocks_x; bx++) { + tev_lossless_block_t *src = (tev_lossless_block_t*)&enc->block_data[by * blocks_x + bx]; + tev_serialized_lossless_block_t *dst = &serialized_blocks[by * blocks_x + bx]; + + // Copy basic fields + dst->mode = src->mode; + dst->mv_x = src->mv_x; + dst->mv_y = src->mv_y; + dst->rate_control_factor = src->rate_control_factor; + dst->cbp = src->cbp; + + // Convert float32 coefficients to float16 with range clamping + // Float16 max finite value is approximately 65504 + const float FLOAT16_MAX = 65504.0f; + + for (int i = 0; i < BLOCK_SIZE_SQR; i++) { + float coeff = FCLAMP(src->y_coeffs[i], -FLOAT16_MAX, FLOAT16_MAX); + dst->y_coeffs[i] = float_to_float16(coeff); + if (enc->verbose && fabsf(src->y_coeffs[i]) > FLOAT16_MAX) { + printf("WARNING: Y coefficient %d clamped: %f -> %f\n", i, src->y_coeffs[i], coeff); + } + } + for (int i = 0; i < HALF_BLOCK_SIZE_SQR; i++) { + float co_coeff = FCLAMP(src->co_coeffs[i], -FLOAT16_MAX, FLOAT16_MAX); + float cg_coeff = FCLAMP(src->cg_coeffs[i], -FLOAT16_MAX, FLOAT16_MAX); + dst->co_coeffs[i] = float_to_float16(co_coeff); + dst->cg_coeffs[i] = float_to_float16(cg_coeff); + if (enc->verbose && fabsf(src->co_coeffs[i]) > FLOAT16_MAX) { + printf("WARNING: Co coefficient %d clamped: %f -> %f\n", i, src->co_coeffs[i], co_coeff); + } + if (enc->verbose && fabsf(src->cg_coeffs[i]) > FLOAT16_MAX) { + printf("WARNING: Cg coefficient %d clamped: %f -> %f\n", i, src->cg_coeffs[i], cg_coeff); + } + } + } + } +} + // Convert SubRip time format (HH:MM:SS,mmm) to frame number static int srt_time_to_frame(const char *time_str, int fps) { int hours, minutes, seconds, milliseconds; @@ -1182,7 +1347,7 @@ static subtitle_entry_t* parse_srt_file(const char *filename, int fps) { } } - fclose(file); + //fclose(file); // why uncommenting it errors out with "Fatal error: glibc detected an invalid stdio handle"? return head; } @@ -1613,6 +1778,7 @@ static tev_encoder_t* init_encoder(void) { enc->output_fps = 0; // No frame rate conversion by default enc->is_ntsc_framerate = 0; // Will be detected from input enc->verbose = 0; + enc->disable_rcf = 1; enc->subtitle_file = NULL; enc->has_subtitles = 0; enc->subtitle_list = NULL; @@ -1655,7 +1821,16 @@ static int alloc_encoder_buffers(tev_encoder_t *enc) { enc->dct_workspace = malloc(16 * 16 * sizeof(float)); enc->block_data = malloc(total_blocks * sizeof(tev_block_t)); - enc->compressed_buffer = malloc(total_blocks * sizeof(tev_block_t) * 2); + // Allocate compression buffer large enough for both regular and lossless modes + size_t max_block_size = sizeof(tev_block_t) > sizeof(tev_serialized_lossless_block_t) ? + sizeof(tev_block_t) : sizeof(tev_serialized_lossless_block_t); + size_t compressed_buffer_size = total_blocks * max_block_size * 2; + enc->compressed_buffer = malloc(compressed_buffer_size); + + if (enc->verbose) { + printf("Allocated compressed buffer: %zu bytes for %d blocks (max_block_size: %zu)\n", + compressed_buffer_size, total_blocks, max_block_size); + } enc->mp2_buffer = malloc(MP2_DEFAULT_PACKET_SIZE); if (!enc->current_rgb || !enc->previous_rgb || !enc->reference_rgb || @@ -1726,7 +1901,7 @@ static int write_tev_header(FILE *output, tev_encoder_t *enc) { uint8_t qualityCo = enc->qualityCo; uint8_t qualityCg = enc->qualityCg; uint8_t flags = (enc->has_audio) | (enc->has_subtitles << 1); - uint8_t video_flags = (enc->progressive_mode ? 0 : 1) | (enc->is_ntsc_framerate ? 2 : 0); // bit 0 = is_interlaced, bit 1 = is_ntsc_framerate + uint8_t video_flags = (enc->progressive_mode ? 0 : 1) | (enc->is_ntsc_framerate ? 2 : 0) | (enc->lossless_mode ? 4 : 0); // bit 0 = is_interlaced, bit 1 = is_ntsc_framerate, bit 2 = is_lossless uint8_t reserved = 0; fwrite(&width, 2, 1, output); @@ -1833,7 +2008,11 @@ static int encode_frame(tev_encoder_t *enc, FILE *output, int frame_num, int fie // Encode all blocks for (int by = 0; by < blocks_y; by++) { for (int bx = 0; bx < blocks_x; bx++) { - encode_block(enc, bx, by, is_keyframe); + if (enc->lossless_mode) { + encode_block_lossless(enc, bx, by, is_keyframe); + } else { + encode_block(enc, bx, by, is_keyframe); + } // Calculate complexity for rate control (if enabled) if (enc->bitrate_mode > 0) { @@ -1849,13 +2028,34 @@ static int encode_frame(tev_encoder_t *enc, FILE *output, int frame_num, int fie } // Compress block data using Zstd (compatible with TSVM decoder) - size_t block_data_size = blocks_x * blocks_y * sizeof(tev_block_t); - - // Compress using Zstd with controlled memory usage - size_t compressed_size = ZSTD_compressCCtx(enc->zstd_context, - enc->compressed_buffer, block_data_size * 2, - enc->block_data, block_data_size, - ZSTD_COMPRESSON_LEVEL); + size_t compressed_size; + + if (enc->lossless_mode) { + // Lossless mode: serialize blocks with float16 coefficients + size_t serialized_block_data_size = blocks_x * blocks_y * sizeof(tev_serialized_lossless_block_t); + tev_serialized_lossless_block_t *serialized_blocks = malloc(serialized_block_data_size); + if (!serialized_blocks) { + fprintf(stderr, "Failed to allocate memory for serialized lossless blocks\n"); + return -1; + } + + serialize_lossless_blocks(enc, blocks_x, blocks_y, serialized_blocks); + + // Use the pre-allocated buffer size instead of calculating dynamically + size_t output_buffer_size = blocks_x * blocks_y * sizeof(tev_serialized_lossless_block_t) * 2; + compressed_size = ZSTD_compressCCtx(enc->zstd_context, + enc->compressed_buffer, output_buffer_size, + serialized_blocks, serialized_block_data_size, + ZSTD_COMPRESSON_LEVEL); + free(serialized_blocks); + } else { + // Regular mode: use regular block data + size_t block_data_size = blocks_x * blocks_y * sizeof(tev_block_t); + compressed_size = ZSTD_compressCCtx(enc->zstd_context, + enc->compressed_buffer, block_data_size * 2, + enc->block_data, block_data_size, + ZSTD_COMPRESSON_LEVEL); + } if (ZSTD_isError(compressed_size)) { fprintf(stderr, "Zstd compression failed: %s\n", ZSTD_getErrorName(compressed_size)); @@ -2088,7 +2288,7 @@ static int start_audio_conversion(tev_encoder_t *enc) { char command[2048]; snprintf(command, sizeof(command), "ffmpeg -v quiet -i \"%s\" -acodec libtwolame -psymodel 4 -b:a %dk -ar %d -ac 2 -y \"%s\" 2>/dev/null", - enc->input_file, MP2_RATE_TABLE[enc->qualityIndex], MP2_SAMPLE_RATE, TEMP_AUDIO_FILE); + enc->input_file, enc->lossless_mode ? 384 : MP2_RATE_TABLE[enc->qualityIndex], MP2_SAMPLE_RATE, TEMP_AUDIO_FILE); int result = system(command); if (result == 0) { @@ -2236,15 +2436,16 @@ static void show_usage(const char *program_name) { printf(" -o, --output FILE Output video file (use '-' for stdout)\n"); printf(" -s, --size WxH Video size (default: %dx%d)\n", DEFAULT_WIDTH, DEFAULT_HEIGHT); printf(" -f, --fps N Output frames per second (enables frame rate conversion)\n"); - printf(" -q, --quality N Quality level 0-4 (default: 2, only decides audio rate in quantiser mode)\n"); + printf(" -q, --quality N Quality level 0-4 (default: 2, only decides audio rate in quantiser/lossless mode)\n"); printf(" -Q, --quantiser N Quantiser level 0-100 (100: lossless, 0: potato)\n"); // printf(" -b, --bitrate N Target bitrate in kbps (enables bitrate control mode; DON'T USE - NOT WORKING AS INTENDED)\n"); printf(" -p, --progressive Use progressive scan (default: interlaced)\n"); printf(" -S, --subtitles FILE SubRip (.srt) or SAMI (.smi) subtitle file\n"); printf(" -v, --verbose Verbose output\n"); printf(" -t, --test Test mode: generate solid colour frames\n"); + printf(" --lossless Lossless mode: store coefficients as float16 (no quantisation, implies -p, 384k audio)\n"); + printf(" --enable-rcf Enable per-block rate control (experimental)\n"); printf(" --enable-encode-stats Collect and report block complexity statistics\n"); - printf(" --disable-rcf Disable per-block rate control\n"); printf(" --help Show this help\n\n"); // printf("Rate Control Modes:\n"); // printf(" Quality mode (default): Fixed quantisation based on -q parameter\n"); @@ -2334,7 +2535,8 @@ int main(int argc, char *argv[]) { {"verbose", no_argument, 0, 'v'}, {"test", no_argument, 0, 't'}, {"enable-encode-stats", no_argument, 0, 1000}, - {"disable-rcf", no_argument, 0, 1100}, + {"enable-rcf", no_argument, 0, 1100}, + {"lossless", no_argument, 0, 1200}, {"help", no_argument, 0, '?'}, {0, 0, 0, 0} }; @@ -2403,11 +2605,14 @@ int main(int argc, char *argv[]) { case 't': test_mode = 1; break; - case 1000: // --enable-encode-stats + case 1000: // --enable-encode-stats enc->stats_mode = 1; break; - case 1100: // --disable-rcf - enc->disable_rcf = 1; + case 1100: // --enable-rcf + enc->disable_rcf = 0; + break; + case 1200: // --lossless + enc->lossless_mode = 1; break; case 0: if (strcmp(long_options[option_index].name, "help") == 0) { @@ -2419,7 +2624,7 @@ int main(int argc, char *argv[]) { case 'Q': enc->qualityY = CLAMP(atoi(optarg), 0, 100); enc->qualityCo = enc->qualityY; - enc->qualityCg = (enc->qualityY == 100) ? enc->qualityY : enc->qualityCo >> 2; + enc->qualityCg = (enc->qualityY == 100) ? enc->qualityY : enc->qualityCo >> 1; break; default: show_usage(argv[0]); @@ -2428,6 +2633,19 @@ int main(int argc, char *argv[]) { } } + // Lossless mode validation and adjustments + if (enc->lossless_mode) { + // In lossless mode, disable rate control and set quality to maximum + enc->bitrate_mode = 0; + enc->disable_rcf = 1; + enc->progressive_mode = 1; + enc->qualityIndex = 5; + enc->qualityY = enc->qualityCo = enc->qualityCg = 255; // Use 255 as a redundant lossless marker + if (enc->verbose) { + printf("Lossless mode enabled: Rate control disabled, quality set to maximum, enabling progressive scan\n"); + } + } + // halve the internal representation of frame height if (!enc->progressive_mode) { enc->height /= 2;