various encoder bug fixes

This commit is contained in:
minjaesong
2025-09-13 00:39:12 +09:00
parent 1f5f72733a
commit 198e951102
4 changed files with 553 additions and 79 deletions

View File

@@ -418,6 +418,7 @@ let hasSubtitle = !!(flags & 2)
let videoFlags = seqread.readOneByte()
let isInterlaced = !!(videoFlags & 1)
let isNTSC = !!(videoFlags & 2)
let isLossless = !!(videoFlags & 4)
let unused2 = seqread.readOneByte()
@@ -427,6 +428,7 @@ serial.println(` FPS: ${(isNTSC) ? (fps * 1000 / 1001) : fps}`)
serial.println(` Duration: ${totalFrames / fps}`)
serial.println(` Audio: ${hasAudio ? "Yes" : "No"}`)
serial.println(` Resolution: ${width}x${height}, ${isInterlaced ? "interlaced" : "progressive"}`)
serial.println(` Quality: Y=${qualityY}, Co=${qualityCo}, Cg=${qualityCg}, ${isLossless ? "lossless" : "lossy"}`)
// DEBUG interlace raw output
@@ -665,14 +667,14 @@ try {
if (isInterlaced) {
// For interlaced: decode current frame into currentFieldAddr
// For display: use prevFieldAddr as current, currentFieldAddr as next
graphics.tevDecode(blockDataPtr, nextFieldAddr, currentFieldAddr, width, decodingHeight, qualityY, qualityCo, qualityCg, trueFrameCount, debugMotionVectors, version, enableDeblocking, enableBoundaryAwareDecoding)
graphics.tevDecode(blockDataPtr, nextFieldAddr, currentFieldAddr, width, decodingHeight, qualityY, qualityCo, qualityCg, trueFrameCount, debugMotionVectors, version, enableDeblocking, enableBoundaryAwareDecoding, isLossless)
graphics.tevDeinterlace(trueFrameCount, width, decodingHeight, prevFieldAddr, currentFieldAddr, nextFieldAddr, CURRENT_RGB_ADDR, deinterlaceAlgorithm)
// Rotate field buffers for next frame: NEXT -> CURRENT -> PREV
rotateFieldBuffers()
} else {
// Progressive or first frame: normal decoding without temporal prediction
graphics.tevDecode(blockDataPtr, CURRENT_RGB_ADDR, PREV_RGB_ADDR, width, decodingHeight, qualityY, qualityCo, qualityCg, trueFrameCount, debugMotionVectors, version, enableDeblocking, enableBoundaryAwareDecoding)
graphics.tevDecode(blockDataPtr, CURRENT_RGB_ADDR, PREV_RGB_ADDR, width, decodingHeight, qualityY, qualityCo, qualityCg, trueFrameCount, debugMotionVectors, version, enableDeblocking, enableBoundaryAwareDecoding, isLossless)
}
decodeTime = (sys.nanoTime() - decodeStart) / 1000000.0 // Convert to milliseconds

View File

@@ -12,6 +12,7 @@ import net.torvald.terrarum.modulecomputers.virtualcomputer.tvd.toUint
import net.torvald.tsvm.peripheral.GraphicsAdapter
import net.torvald.tsvm.peripheral.PeriBase
import net.torvald.tsvm.peripheral.fmod
import net.torvald.util.Float16
import kotlin.math.*
class GraphicsJSR223Delegate(private val vm: VM) {
@@ -21,6 +22,77 @@ class GraphicsJSR223Delegate(private val vm: VM) {
private val idct16TempBuffer = FloatArray(256) // For 16x16 IDCT
private val idct16SeparableBuffer = FloatArray(256) // For separable 16x16 IDCT
// Lossless IDCT functions for float16 coefficients (no quantization)
/**
 * Inverse 8x8 DCT for lossless blocks: [coeffs] are consumed as-is, with no dequantisation.
 *
 * Works as two separable passes. The row pass writes its intermediate values into the
 * class-level `idct8TempBuffer`; the column pass reads them back, so the buffer's previous
 * contents are overwritten. Each pass is scaled by 0.5 and the final sample is offset by 128.
 *
 * @param coeffs 64 coefficients, indexed as `row * 8 + column`.
 * @return 64 samples indexed as `row * 8 + column`, rounded and clamped to 0..255. A sample
 * that comes out NaN or infinite is reported on stdout and replaced with 128.
 */
private fun tevIdct8x8_lossless(coeffs: FloatArray): IntArray {
    val result = IntArray(64)

    // Pass 1: one 1D inverse transform per row, stored in the scratch buffer.
    for (row in 0..7) {
        val rowBase = row * 8
        for (col in 0..7) {
            var sum = 0f
            for (u in 0..7) {
                sum += dctBasis8[u][col] * coeffs[rowBase + u]
            }
            idct8TempBuffer[rowBase + col] = sum * 0.5f
        }
    }

    // Pass 2: one 1D inverse transform per column, then level shift, round and clamp.
    for (col in 0..7) {
        for (row in 0..7) {
            var sum = 0f
            for (v in 0..7) {
                sum += dctBasis8[v][row] * idct8TempBuffer[v * 8 + col]
            }
            val finalValue = sum * 0.5f + 128f
            result[row * 8 + col] = when {
                finalValue.isNaN() || finalValue.isInfinite() -> {
                    println("NaN/Inf detected in 8x8 IDCT at ($row,$col): sum=$sum, finalValue=$finalValue")
                    128 // fall back to mid-grey rather than propagating garbage
                }
                else -> finalValue.roundToInt().coerceIn(0, 255)
            }
        }
    }

    return result
}
/**
 * Inverse 16x16 DCT for lossless blocks: [coeffs] are consumed as-is, with no dequantisation.
 *
 * Works as two separable passes. The row pass writes its intermediate values into the
 * class-level `idct16TempBuffer`; the column pass reads them back, so the buffer's previous
 * contents are overwritten. Each pass is scaled by 0.25 and the final sample is offset by 128.
 *
 * @param coeffs 256 coefficients, indexed as `row * 16 + column`.
 * @return 256 samples indexed as `row * 16 + column`, rounded and clamped to 0..255. A sample
 * that comes out NaN or infinite is reported on stdout and replaced with 128.
 */
private fun tevIdct16x16_lossless(coeffs: FloatArray): IntArray {
    val result = IntArray(256)

    // Pass 1: one 1D inverse transform per row, stored in the scratch buffer.
    for (row in 0..15) {
        val rowBase = row * 16
        for (col in 0..15) {
            var sum = 0f
            for (u in 0..15) {
                sum += dctBasis16[u][col] * coeffs[rowBase + u]
            }
            idct16TempBuffer[rowBase + col] = sum * 0.25f
        }
    }

    // Pass 2: one 1D inverse transform per column, then level shift, round and clamp.
    for (col in 0..15) {
        for (row in 0..15) {
            var sum = 0f
            for (v in 0..15) {
                sum += dctBasis16[v][row] * idct16TempBuffer[v * 16 + col]
            }
            val finalValue = sum * 0.25f + 128f
            result[row * 16 + col] = when {
                finalValue.isNaN() || finalValue.isInfinite() -> {
                    println("NaN/Inf detected in 16x16 IDCT at ($row,$col): sum=$sum, finalValue=$finalValue")
                    128 // fall back to mid-grey rather than propagating garbage
                }
                else -> finalValue.roundToInt().coerceIn(0, 255)
            }
        }
    }

    return result
}
private fun getFirstGPU(): GraphicsAdapter? {
return vm.findPeribyType(VM.PERITYPE_GPU_AND_TERM)?.peripheral as? GraphicsAdapter
@@ -1649,7 +1721,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val result = IntArray(64)
// Reuse preallocated temp buffer to reduce GC pressure
for (i in coeffs.indices) {
idct8TempBuffer[i] = coeffs[i] * quantTable[i] * jpeg_quality_to_mult(qualityIndex * rateControlFactor)
idct8TempBuffer[i] = coeffs[i] * (quantTable[i] * jpeg_quality_to_mult(qualityIndex * rateControlFactor)).coerceIn(1f, 255f)
}
// Fast separable IDCT (row-column decomposition)
@@ -1662,7 +1734,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val coeff = if (isChromaResidual && coeffIdx == 0) {
coeffs[coeffIdx].toFloat() // DC lossless for chroma residual
} else {
coeffs[coeffIdx] * quantTable[coeffIdx] * jpeg_quality_to_mult(qualityIndex * rateControlFactor)
coeffs[coeffIdx] * (quantTable[coeffIdx] * jpeg_quality_to_mult(qualityIndex * rateControlFactor)).coerceIn(1f, 255f)
}
sum += dctBasis8[u][col] * coeff
}
@@ -1708,7 +1780,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val coeff = if (idx == 0) {
coeffs[idx].toFloat() // DC lossless for luma
} else {
coeffs[idx] * quantTable[idx] * jpeg_quality_to_mult(qualityIndex * rateControlFactor)
coeffs[idx] * (quantTable[idx] * jpeg_quality_to_mult(qualityIndex * rateControlFactor)).coerceIn(1f, 255f)
}
idct16TempBuffer[idx] = coeff
}
@@ -2555,7 +2627,8 @@ class GraphicsJSR223Delegate(private val vm: VM) {
fun tevDecode(blockDataPtr: Long, currentRGBAddr: Long, prevRGBAddr: Long,
width: Int, height: Int, qY: Int, qCo: Int, qCg: Int, frameCounter: Int,
debugMotionVectors: Boolean = false, tevVersion: Int = 2,
enableDeblocking: Boolean = true, enableBoundaryAwareDecoding: Boolean = false) {
enableDeblocking: Boolean = true, enableBoundaryAwareDecoding: Boolean = false,
isLossless: Boolean = false) {
// height doesn't change when interlaced, because that's the encoder's output
@@ -2846,17 +2919,65 @@ class GraphicsJSR223Delegate(private val vm: VM) {
}
0x01 -> { // TEV_MODE_INTRA - Full YCoCg-R DCT decode (no motion compensation)
// Read DCT coefficients: Y (16x16=256), Co (8x8=64), Cg (8x8=64)
val yBlock: IntArray
val coBlock: IntArray
val cgBlock: IntArray
if (isLossless) {
// Lossless mode: coefficients are stored as float16, no quantization
// Read float16 coefficients: Y (16x16=256), Co (8x8=64), Cg (8x8=64)
val coeffFloat16Array = ShortArray(384) // 384 float16 values stored as shorts
vm.bulkPeekShort(readPtr.toInt(), coeffFloat16Array, 768) // 384 * 2 bytes
readPtr += 768
// Convert float16 to float32 and perform IDCT directly (no quantization)
println("DEBUG: Reading lossless coefficients, first few float16 values: ${coeffFloat16Array.take(10).map { "0x${it.toString(16)}" }}")
val yCoeffs = FloatArray(256) { i ->
// Convert signed short to unsigned short for float16 interpretation
val signedShort = coeffFloat16Array[i]
val float16bits = signedShort.toInt() and 0xFFFF // Convert to unsigned
val floatVal = Float16.toFloat(float16bits.toShort())
if (floatVal.isNaN() || floatVal.isInfinite()) {
println("NaN/Inf detected at Y coefficient $i: signedShort=0x${signedShort.toString(16)}, unsigned=0x${float16bits.toString(16)}, floatVal=$floatVal")
0f // Replace NaN with 0
} else floatVal
}
val coCoeffs = FloatArray(64) { i ->
// Convert signed short to unsigned short for float16 interpretation
val signedShort = coeffFloat16Array[256 + i]
val float16bits = signedShort.toInt() and 0xFFFF // Convert to unsigned
val floatVal = Float16.toFloat(float16bits.toShort())
if (floatVal.isNaN() || floatVal.isInfinite()) {
println("NaN/Inf detected at Co coefficient $i: signedShort=0x${signedShort.toString(16)}, unsigned=0x${float16bits.toString(16)}, floatVal=$floatVal")
0f // Replace NaN with 0
} else floatVal
}
val cgCoeffs = FloatArray(64) { i ->
// Convert signed short to unsigned short for float16 interpretation
val signedShort = coeffFloat16Array[320 + i]
val float16bits = signedShort.toInt() and 0xFFFF // Convert to unsigned
val floatVal = Float16.toFloat(float16bits.toShort())
if (floatVal.isNaN() || floatVal.isInfinite()) {
println("NaN/Inf detected at Cg coefficient $i: signedShort=0x${signedShort.toString(16)}, unsigned=0x${float16bits.toString(16)}, floatVal=$floatVal")
0f // Replace NaN with 0
} else floatVal
}
yBlock = tevIdct16x16_lossless(yCoeffs)
coBlock = tevIdct8x8_lossless(coCoeffs)
cgBlock = tevIdct8x8_lossless(cgCoeffs)
} else {
// Regular lossy mode: quantized int16 coefficients
// Optimized bulk reading of all DCT coefficients: Y(256×2) + Co(64×2) + Cg(64×2) = 768 bytes
val coeffShortArray = ShortArray(384) // Total coefficients: 256 + 64 + 64 = 384 shorts
vm.bulkPeekShort(readPtr.toInt(), coeffShortArray, 768)
readPtr += 768
// Optimized bulk reading of all DCT coefficients: Y(256×2) + Co(64×2) + Cg(64×2) = 768 bytes
val coeffShortArray = ShortArray(384) // Total coefficients: 256 + 64 + 64 = 384 shorts
vm.bulkPeekShort(readPtr.toInt(), coeffShortArray, 768)
readPtr += 768
// Perform hardware IDCT for each channel using fast algorithm
val yBlock = tevIdct16x16_fast(coeffShortArray.sliceArray(0 until 256), QUANT_TABLE_Y, qY, rateControlFactor)
val coBlock = tevIdct8x8_fast(coeffShortArray.sliceArray(256 until 320), QUANT_TABLE_C, true, qCo, rateControlFactor)
val cgBlock = tevIdct8x8_fast(coeffShortArray.sliceArray(320 until 384), QUANT_TABLE_C, true, qCg, rateControlFactor)
// Perform hardware IDCT for each channel using fast algorithm
yBlock = tevIdct16x16_fast(coeffShortArray.sliceArray(0 until 256), QUANT_TABLE_Y, qY, rateControlFactor)
coBlock = tevIdct8x8_fast(coeffShortArray.sliceArray(256 until 320), QUANT_TABLE_C, true, qCo, rateControlFactor)
cgBlock = tevIdct8x8_fast(coeffShortArray.sliceArray(320 until 384), QUANT_TABLE_C, true, qCg, rateControlFactor)
}
// Convert to RGB (YCoCg-R for v2, XYB for v3)
val rgbData = if (tevVersion == 3) {
@@ -3275,7 +3396,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val quantValue = if (i == 0) 1.0f else {
quantTable[coeffIdx] * jpeg_quality_to_mult(qScale * rateControlFactor)
}
result[blockIndex]!![i] = block[i] * quantValue
result[blockIndex]!![i] = block[i] * quantValue.coerceIn(1f, 255f)
}
}
}
@@ -3307,7 +3428,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
for (i in 1 until coeffsSize) {
val coeffIdx = i.coerceIn(0, quantTable.size - 1)
val quant = (quantTable[coeffIdx] * qualityMult).toInt()
val quant = (quantTable[coeffIdx] * qualityMult).coerceIn(1f, 255f).toInt()
quantValues[blockIndex][i] = quant
quantHalfValues[blockIndex][i] = quant / 2
}
@@ -3511,7 +3632,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val rightOff = blocksOff[rightBlockIndex]
// OPTIMIZATION 4: Process multiple frequencies in single loop for better cache locality
for (v in 0 until 16) { // Only low-to-mid frequencies
for (v in 0 until 8) { // Only low-to-mid frequencies
var deltaV = 0L
var hfPenalty = 0L
val vOffset = v * 16
@@ -3667,7 +3788,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
blocksMid[blockIndex][i] = dcValue
} else {
// AC coefficients: use quantization intervals
val quant = (quantTable[quantIdx] * jpeg_quality_to_mult(qScale * rateControlFactor)).toInt()
val quant = (quantTable[quantIdx] * jpeg_quality_to_mult(qScale * rateControlFactor)).coerceIn(1f, 255f).toInt()
// Standard dequantized value (midpoint)
blocksMid[blockIndex][i] = block[i].toInt() * quant
@@ -3719,7 +3840,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
blocksMax[blockIndex][i] = dcValue
} else {
// AC coefficients: use quantization intervals
val quant = (quantTable[quantIdx] * jpeg_quality_to_mult(qScale * rateControlFactor)).toInt()
val quant = (quantTable[quantIdx] * jpeg_quality_to_mult(qScale * rateControlFactor)).coerceIn(1f, 255f).toInt()
// Standard dequantized value (midpoint)
blocksMid[blockIndex][i] = block[i].toInt() * quant
@@ -3789,73 +3910,116 @@ class GraphicsJSR223Delegate(private val vm: VM) {
return result
}
// BULK OPTIMIZED 8x8 horizontal boundary analysis for chroma channels
private fun analyzeHorizontalBoundary(
leftBlockIndex: Int, rightBlockIndex: Int,
blocksMid: Array<IntArray>, blocksOff: Array<LongArray>,
kLinearGradient: IntArray, kAlphaSqrt2: IntArray
) {
// Only process low-to-mid frequencies (v < 4 for 8x8, v < 8 for 16x16)
val maxV = 8
val leftMid = blocksMid[leftBlockIndex]
val rightMid = blocksMid[rightBlockIndex]
val leftOff = blocksOff[leftBlockIndex]
val rightOff = blocksOff[rightBlockIndex]
for (v in 0 until maxV) {
// OPTIMIZATION 12: Process 8x8 boundaries with bulk operations (v < 4 for low-to-mid frequencies)
for (v in 0 until 4) { // Only low-to-mid frequencies for 8x8
var deltaV = 0L
var hfPenalty = 0L
val vOffset = v * 8
// Analyze boundary discontinuity
// First pass: Calculate boundary discontinuity
for (u in 0 until 8) {
val alpha = kAlphaSqrt2[u.coerceIn(0, 7)]
val sign = if (u and 1 == 1) -1 else 1
val gi = blocksMid[leftBlockIndex][v * 8 + u]
val gj = blocksMid[rightBlockIndex][v * 8 + u]
val idx = vOffset + u
val alpha = kAlphaSqrt2[u] // Direct access (u < 8)
val sign = if (u and 1 != 0) -1 else 1
val gi = leftMid[idx]
val gj = rightMid[idx]
deltaV += (alpha * (gj - sign * gi)).toLong()
hfPenalty += (u * u * (gi * gi + gj * gj)).toLong()
deltaV += alpha * (gj - sign * gi)
hfPenalty += (u * u) * (gi * gi + gj * gj)
}
// Apply corrections with high-frequency damping
if (hfPenalty > 400) deltaV /= 2
// Early exit for very small adjustments
if (kotlin.math.abs(deltaV) < 100) continue
for (u in 0 until 8) {
val gradientIdx = u.coerceIn(0, kLinearGradient.size - 1)
val sign = if (u and 1 == 1) 1 else -1
blocksOff[leftBlockIndex][v * 8 + u] = blocksOff[leftBlockIndex][v * 8 + u] + deltaV * kLinearGradient[gradientIdx]
blocksOff[rightBlockIndex][v * 8 + u] = blocksOff[rightBlockIndex][v * 8 + u] + deltaV * kLinearGradient[gradientIdx] * sign
}
// Apply high-frequency damping once per frequency band
if (hfPenalty > 400) deltaV /= 2 // 8x8 threshold
// Second pass: Apply corrections (BULK OPTIMIZED with unrolling for 8x8)
val correction = deltaV
// Bulk apply corrections for 8 coefficients - manually unrolled for performance
leftOff[vOffset] += correction * kLinearGradient[0]
rightOff[vOffset] += correction * kLinearGradient[0]
leftOff[vOffset + 1] += correction * kLinearGradient[1]
rightOff[vOffset + 1] -= correction * kLinearGradient[1] // Alternating signs
leftOff[vOffset + 2] += correction * kLinearGradient[2]
rightOff[vOffset + 2] += correction * kLinearGradient[2]
leftOff[vOffset + 3] += correction * kLinearGradient[3]
rightOff[vOffset + 3] -= correction * kLinearGradient[3]
leftOff[vOffset + 4] += correction * kLinearGradient[4]
rightOff[vOffset + 4] += correction * kLinearGradient[4]
leftOff[vOffset + 5] += correction * kLinearGradient[5]
rightOff[vOffset + 5] -= correction * kLinearGradient[5]
leftOff[vOffset + 6] += correction * kLinearGradient[6]
rightOff[vOffset + 6] += correction * kLinearGradient[6]
leftOff[vOffset + 7] += correction * kLinearGradient[7]
rightOff[vOffset + 7] -= correction * kLinearGradient[7]
}
}
// BULK OPTIMIZED 8x8 vertical boundary analysis for chroma channels
private fun analyzeVerticalBoundary(
topBlockIndex: Int, bottomBlockIndex: Int,
blocksMid: Array<IntArray>, blocksOff: Array<LongArray>,
kLinearGradient: IntArray, kAlphaSqrt2: IntArray
) {
// Only process low-to-mid frequencies (u < 4 for 8x8, u < 8 for 16x16)
val maxU = 8
val topMid = blocksMid[topBlockIndex]
val bottomMid = blocksMid[bottomBlockIndex]
val topOff = blocksOff[topBlockIndex]
val bottomOff = blocksOff[bottomBlockIndex]
for (u in 0 until maxU) {
// OPTIMIZATION 13: Optimized vertical analysis for 8x8 with better cache access pattern
for (u in 0 until 4) { // Only low-to-mid frequencies for 8x8
var deltaU = 0L
var hfPenalty = 0L
// Analyze boundary discontinuity
// First pass: Calculate boundary discontinuity
for (v in 0 until 8) {
val alpha = kAlphaSqrt2[v.coerceIn(0, 7)]
val sign = if (v and 1 == 1) -1 else 1
val gi = blocksMid[topBlockIndex][v * 8 + u]
val gj = blocksMid[bottomBlockIndex][v * 8 + u]
val idx = v * 8 + u
val alpha = kAlphaSqrt2[v] // Direct access (v < 8)
val sign = if (v and 1 != 0) -1 else 1
val gi = topMid[idx]
val gj = bottomMid[idx]
deltaU += (alpha * (gj - sign * gi)).toLong()
hfPenalty += (v * v * (gi * gi + gj * gj)).toLong()
deltaU += alpha * (gj - sign * gi)
hfPenalty += (v * v) * (gi * gi + gj * gj)
}
// Apply corrections with high-frequency damping
if (hfPenalty > 400) deltaU /= 2
// Early exit for very small adjustments
if (kotlin.math.abs(deltaU) < 100) continue
for (v in 0 until 8) {
val gradientIdx = v.coerceIn(0, kLinearGradient.size - 1)
val sign = if (v and 1 == 1) 1 else -1
blocksOff[topBlockIndex][v * 8 + u] = blocksOff[topBlockIndex][v * 8 + u] + deltaU * kLinearGradient[gradientIdx]
blocksOff[bottomBlockIndex][v * 8 + u] = blocksOff[bottomBlockIndex][v * 8 + u] + deltaU * kLinearGradient[gradientIdx] * sign
}
// Apply high-frequency damping once per frequency band
if (hfPenalty > 400) deltaU /= 2 // 8x8 threshold
// Second pass: Apply corrections (BULK OPTIMIZED vertical for 8x8)
val correction = deltaU
// Bulk apply corrections for 8 vertical coefficients - manually unrolled
topOff[u] += correction * kLinearGradient[0]
bottomOff[u] += correction * kLinearGradient[0]
topOff[8 + u] += correction * kLinearGradient[1]
bottomOff[8 + u] -= correction * kLinearGradient[1] // Alternating signs
topOff[16 + u] += correction * kLinearGradient[2]
bottomOff[16 + u] += correction * kLinearGradient[2]
topOff[24 + u] += correction * kLinearGradient[3]
bottomOff[24 + u] -= correction * kLinearGradient[3]
topOff[32 + u] += correction * kLinearGradient[4]
bottomOff[32 + u] += correction * kLinearGradient[4]
topOff[40 + u] += correction * kLinearGradient[5]
bottomOff[40 + u] -= correction * kLinearGradient[5]
topOff[48 + u] += correction * kLinearGradient[6]
bottomOff[48 + u] += correction * kLinearGradient[6]
topOff[56 + u] += correction * kLinearGradient[7]
bottomOff[56 + u] -= correction * kLinearGradient[7]
}
}

View File

@@ -0,0 +1,90 @@
package net.torvald.util
import kotlin.experimental.or
/**
 * A 16-bit half-precision float held as its raw bit pattern in [bits].
 *
 * The class is a storage format only: every conversion and every operator widens to
 * [Float], computes there, and narrows the result back to 16 bits.
 */
class Float16() {
    /** Raw half-precision bit pattern (1 sign bit, 5 exponent bits, 10 mantissa bits). */
    var bits = 0.toShort()
        private set

    /** Creates a value holding [fval] narrowed to half precision. */
    constructor(fval: Float) : this() {
        fromFloat(fval)
    }

    /** Widens the stored bits to a [Float]. */
    fun toFloat() = Float16.toFloat(bits)

    /** Replaces the stored bits with [fval] narrowed to half precision. */
    fun fromFloat(fval: Float) {
        bits = Float16.fromFloat(fval)
    }

    // The operators below compute in Float, store the result back into THIS instance and
    // evaluate to Unit (they delegate to the member fromFloat, which returns nothing).
    // NOTE(review): `a * b` therefore mutates `a` and yields no value — confirm whether any
    // caller depends on that before changing them to return a new Float16.
    operator fun times(other: Float) = fromFloat(this.toFloat() * other)
    operator fun times(other: Float16) = fromFloat(this.toFloat() * other.toFloat())

    operator fun div(other: Float) = fromFloat(this.toFloat() / other)
    operator fun div(other: Float16) = fromFloat(this.toFloat() / other.toFloat())

    companion object {
        /**
         * Widens a half-precision bit pattern to a [Float].
         *
         * Zero, subnormals, infinities and NaNs map to their single-precision counterparts.
         * A normal value whose mantissa is zero and whose biased exponent is above 1 is
         * returned with the low ten single-precision mantissa bits set (`0x3ff`) instead of
         * cleared, so e.g. `0x3c00` widens to a value marginally above 1.0. [fromFloat]
         * rounds such a value back to the same half-precision pattern.
         *
         * @param hbits raw half-precision bits; only the low 16 bits are used.
         */
        fun toFloat(hbits: Short): Float {
            val half = hbits.toInt() and 0xFFFF
            val signBit = (half and 0x8000) shl 16
            var mant = half and 0x03ff // 10 bits mantissa
            var exp = half and 0x7c00 // 5 bits exponent, still in half-float position

            if (exp == 0x7c00) {
                // NaN/Inf: widen the all-ones exponent to eight bits
                exp = 0x3fc00
            } else if (exp != 0) {
                // normalized value: rebias, exp - 15 + 127
                exp += 0x1c000
                if (mant == 0 && exp > 0x1c400) {
                    // "smooth transition" variant described in the KDoc above
                    return java.lang.Float.intBitsToFloat(signBit or (exp shl 13) or 0x3ff)
                }
            } else if (mant != 0) {
                // subnormal: shift the mantissa up until its implicit bit appears
                exp = 0x1c400
                do {
                    mant = mant shl 1 // mantissa * 2
                    exp -= 0x400 // decrease exp by 1
                } while (mant and 0x400 == 0)
                mant = mant and 0x3ff // discard the now-implicit leading bit
            } // else +/-0 -> +/-0

            // value << (23 - 10) moves exponent and mantissa into single-precision position
            return java.lang.Float.intBitsToFloat(signBit or ((exp or mant) shl 13))
        }

        /**
         * Narrows [fval] to a half-precision bit pattern, rounding half away from zero.
         *
         * - Magnitudes that round to the half-precision overflow threshold or beyond become
         *   infinity, unless the unrounded value was still below it, in which case the
         *   largest finite half (`0x7bff`) is returned.
         * - Infinity stays infinity and NaN stays NaN.
         * - Magnitudes below 2^-25 become a signed zero.
         *
         * @return the 16-bit pattern as a [Short].
         */
        fun fromFloat(fval: Float): Short {
            // floatToIntBits canonicalises NaN to 0x7fc00000, so magnitude + 0x1000 cannot overflow
            val fbits = java.lang.Float.floatToIntBits(fval)
            val sign = fbits.ushr(16).and(0x8000).toShort() // sign only
            val magnitude = fbits and 0x7fffffff
            val rounded = magnitude + 0x1000 // add half of the last kept mantissa bit

            if (rounded >= 0x47800000) {
                // might be or become NaN/Inf
                if (magnitude >= 0x47800000) {
                    // Test the UNROUNDED magnitude: testing the rounded one let finite values
                    // from 0x7f7ff000 upward (e.g. Float.MAX_VALUE) fall through and pick up
                    // mantissa bits, which encoded them as NaN instead of infinity.
                    if (magnitude < 0x7f800000) {
                        return sign or 0x7c00 // finite but too large -> +/-Inf
                    }
                    // remains +/-Inf or NaN; keep the top mantissa bits
                    return sign or 0x7c00 or (magnitude and 0x007fffff).ushr(13).toShort()
                }
                return sign or 0x7bff.toShort() // only the rounding reached Inf: clamp to max finite
            }
            if (rounded >= 0x38800000) {
                // remains a normalized value: rebias, exp - 127 + 15
                return sign or (rounded - 0x38000000).ushr(13).toShort()
            }
            if (magnitude < 0x33000000) {
                // Below half of the smallest subnormal. Checking the unrounded magnitude keeps
                // the exponent below at 102 or more, so the shift distance is never negative.
                return sign // becomes +/-0
            }
            val exp = magnitude.ushr(23) // biased single-precision exponent, 102..112 here
            return sign or (
                ((magnitude and 0x7fffff) or 0x800000) + // restore the implicit leading bit
                    0x800000.ushr(exp - 102) // round depending on how much is cut off
                ).ushr(126 - exp) // div by 2^(1-(exp-127+15)) and >> 13
                .toShort()
        }
    }
}

View File

@@ -14,6 +14,58 @@
#include <sys/time.h>
#include <time.h>
// Float16 conversion functions (adapted from Float16.kt)
// Narrow a single-precision float to a half-precision (binary16) bit pattern,
// rounding half away from zero.
//   - magnitudes that round to the half overflow threshold or beyond become +/-Inf,
//     unless the unrounded value was still below it (then 0x7bff, the largest finite half)
//   - Inf stays Inf; NaN stays NaN (a payload that would shift out entirely is replaced
//     by the quiet bit so the result cannot collapse to Inf)
//   - magnitudes below 2^-25 become a signed zero
static inline uint16_t float_to_float16(float fval) {
    // Read the bits through a union: dereferencing (uint32_t*)&fval breaks strict aliasing.
    union { float f; uint32_t u; } pun;
    pun.f = fval;

    const uint32_t fbits = pun.u;
    const uint32_t magnitude = fbits & 0x7fffffff;
    const uint16_t sign = (fbits >> 16) & 0x8000;     // sign only
    const uint32_t rounded = magnitude + 0x1000;      // add half of the last kept mantissa bit

    if (rounded >= 0x47800000) {                      // might be or become NaN/Inf
        if (magnitude >= 0x47800000) {                // is or must become NaN/Inf
            // Test the UNROUNDED magnitude: testing the rounded one let finite values from
            // 0x7f7ff000 upward (e.g. FLT_MAX) pick up mantissa bits and encode as NaN.
            if (magnitude <= 0x7f800000)
                return sign | 0x7c00;                 // too large, or already Inf -> +/-Inf
            const uint16_t payload = (magnitude & 0x007fffff) >> 13;
            return sign | 0x7c00 | (payload ? payload : 0x0200); // NaN must keep a mantissa bit
        }
        return sign | 0x7bff;                         // only the rounding reached Inf
    }
    if (rounded >= 0x38800000)                        // remains normalized value
        return sign | ((rounded - 0x38000000) >> 13); // exp - 127 + 15
    // Checking the unrounded magnitude keeps exp >= 102 below; with the rounded value an
    // exponent of 101 could slip through and wrap the unsigned shift count (undefined).
    if (magnitude < 0x33000000)                       // too small for subnormal
        return sign;                                  // becomes +/-0

    const uint32_t exp = magnitude >> 23;             // biased exponent, 102..112 here
    return sign | ((((magnitude & 0x7fffff) | 0x800000) +  // restore the implicit leading bit
                    (0x800000u >> (exp - 102))             // round depending on cut off
                   ) >> (126 - exp));                      // div by 2^(1-(exp-127+15)) and >> 13
}
// Widen a half-precision (binary16) bit pattern to a single-precision float.
// Zero, subnormals, Inf and NaN map to their single-precision counterparts.
// A normal value with a zero mantissa and a biased exponent above 1 is returned with the
// low ten single-precision mantissa bits set (0x3ff) instead of cleared, so e.g. 0x3c00
// widens to a value marginally above 1.0f.
static inline float float16_to_float(uint16_t hbits) {
    // Widen before shifting: (hbits & 0x8000) << 16 on a promoted int shifts into the
    // sign bit, which is undefined for signed types.
    const uint32_t sign = (uint32_t)(hbits & 0x8000) << 16;
    uint32_t mant = hbits & 0x03ff;                   // 10 bits mantissa
    uint32_t exp = hbits & 0x7c00;                    // 5 bits exponent, half-float position

    // Build the result through a union: dereferencing (float*)&fbits breaks strict aliasing.
    union { uint32_t u; float f; } pun;

    if (exp == 0x7c00) {                              // NaN/Inf
        exp = 0x3fc00;                                // -> NaN/Inf (all-ones 8-bit exponent)
    } else if (exp != 0) {                            // normalized value
        exp += 0x1c000;                               // exp - 15 + 127
        if (mant == 0 && exp > 0x1c400) {             // "smooth transition" variant
            pun.u = sign | (exp << 13) | 0x3ff;
            return pun.f;
        }
    } else if (mant != 0) {                           // && exp==0 -> subnormal
        exp = 0x1c400;                                // make it normal
        do {
            mant <<= 1;                               // mantissa * 2
            exp -= 0x400;                             // decrease exp by 1
        } while ((mant & 0x400) == 0);                // while not normal
        mant &= 0x3ff;                                // discard subnormal bit
    }                                                 // else +/-0 -> +/-0

    pun.u = sign | ((exp | mant) << 13);              // value << (23 - 10)
    return pun.f;
}
// TSVM Enhanced Video (TEV) format constants
#define TEV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x45\x56" // "\x1FTSVM TEV"
#define TEV_VERSION 2 // Updated for YCoCg-R 4:2:0
@@ -103,7 +155,7 @@ static const uint32_t QUANT_TABLE_C[HALF_BLOCK_SIZE_SQR] =
// Audio constants (reuse MP2 from existing system)
#define MP2_SAMPLE_RATE 32000
#define MP2_DEFAULT_PACKET_SIZE 0x240
#define MP2_DEFAULT_PACKET_SIZE 1728
// Default values
#define DEFAULT_WIDTH 560
@@ -140,6 +192,17 @@ typedef struct __attribute__((packed)) {
int16_t cg_coeffs[HALF_BLOCK_SIZE_SQR]; // quantised Cg DCT coefficients (8x8)
} tev_block_t;
// In-memory lossless TEV block. Coefficients are kept as float32 here and narrowed to
// float16 only when the block is serialized for output. Packed, so fields are laid out
// back to back with no padding.
typedef struct __attribute__((packed)) {
uint8_t mode; // Block encoding mode
int16_t mv_x, mv_y; // Motion vector (1/4 pixel precision)
float rate_control_factor; // Always 1.0f in lossless mode
uint16_t cbp; // Coded block pattern (which channels have non-zero coeffs)
float y_coeffs[BLOCK_SIZE_SQR]; // unquantised Y DCT coefficients (16x16)
float co_coeffs[HALF_BLOCK_SIZE_SQR]; // unquantised Co DCT coefficients (8x8)
float cg_coeffs[HALF_BLOCK_SIZE_SQR]; // unquantised Cg DCT coefficients (8x8)
} tev_lossless_block_t;
// Subtitle entry structure
typedef struct subtitle_entry {
int start_frame;
@@ -168,6 +231,8 @@ typedef struct {
int qualityCo;
int qualityCg;
int verbose;
int disable_rcf; // 0 = rcf enabled, 1 = disabled
int lossless_mode; // 0 = lossy (default), 1 = lossless mode
// Bitrate control
int target_bitrate_kbps; // Target bitrate in kbps (0 = quality mode)
@@ -216,10 +281,9 @@ typedef struct {
// Subtitle handling
subtitle_entry_t *subtitle_list;
subtitle_entry_t *current_subtitle;
// Complexity statistics collection
int stats_mode; // 0 = disabled, 1 = enabled
int disable_rcf; // 0 = rcf enabled, 1 = disabled
float *complexity_values; // Array to store all complexity values
int complexity_count; // Current count of complexity values
int complexity_capacity; // Capacity of complexity_values array
@@ -1041,6 +1105,107 @@ static void encode_block(tev_encoder_t *enc, int block_x, int block_y, int is_ke
block->cbp = 0x07; // Y, Co, Cg all present
}
// Encode one 16x16 block in lossless mode: extract its YCoCg-R planes, run the forward
// DCT on each plane and store the raw float coefficients without quantisation.
// Every block is coded as INTRA with a zero motion vector, whether or not the frame is a
// keyframe, so is_keyframe currently has no effect on the output.
static void encode_block_lossless(tev_encoder_t *enc, int block_x, int block_y, int is_keyframe) {
    // NOTE(review): the slot address is computed with block_data's own element stride and
    // only then reinterpreted as tev_lossless_block_t. If block_data's element type is the
    // smaller int16-coefficient block (its declaration is not visible here), consecutive
    // lossless blocks would overlap and the last one would run past the allocation —
    // confirm the buffer is sized and indexed for tev_lossless_block_t.
    const int blocks_per_row = (enc->width + 15) / 16;
    tev_lossless_block_t *block =
        (tev_lossless_block_t*)&enc->block_data[block_y * blocks_per_row + block_x];

    // Pull the source pixels into the three per-channel workspaces
    extract_ycocgr_block(enc->current_rgb, enc->width, enc->height,
                         block_x, block_y,
                         enc->y_workspace, enc->co_workspace, enc->cg_workspace);

    // Keyframes and non-keyframes take the same path for now: always intra, no motion
    (void)is_keyframe;
    block->mode = TEV_MODE_INTRA;
    block->mv_y = 0;
    block->mv_x = 0;
    enc->blocks_intra++;

    // No rate control in lossless mode
    block->rate_control_factor = 1.0f;

    // Y channel (16x16): transform, then copy the coefficients out of the shared workspace
    dct_16x16_fast(enc->y_workspace, enc->dct_workspace);
    for (int k = 0; k < BLOCK_SIZE_SQR; k++) {
        block->y_coeffs[k] = enc->dct_workspace[k];
    }

    // Co channel (8x8)
    dct_8x8_fast(enc->co_workspace, enc->dct_workspace);
    for (int k = 0; k < HALF_BLOCK_SIZE_SQR; k++) {
        block->co_coeffs[k] = enc->dct_workspace[k];
    }

    // Cg channel (8x8)
    dct_8x8_fast(enc->cg_workspace, enc->dct_workspace);
    for (int k = 0; k < HALF_BLOCK_SIZE_SQR; k++) {
        block->cg_coeffs[k] = enc->dct_workspace[k];
    }

    // All three channels are always marked present
    block->cbp = 0x07;
}
// Serialized form of a lossless block, as handed to the compressor: the same header
// fields as the in-memory lossless block, with every coefficient narrowed to a float16
// bit pattern. Packed, so fields are laid out back to back with no padding.
typedef struct __attribute__((packed)) {
uint8_t mode; // Block encoding mode, copied from the in-memory block
int16_t mv_x, mv_y; // Motion vector, copied from the in-memory block
float rate_control_factor; // Always 1.0f in lossless mode
uint16_t cbp; // Coded block pattern, copied from the in-memory block
uint16_t y_coeffs[BLOCK_SIZE_SQR]; // float16 Y coefficients
uint16_t co_coeffs[HALF_BLOCK_SIZE_SQR]; // float16 Co coefficients
uint16_t cg_coeffs[HALF_BLOCK_SIZE_SQR]; // float16 Cg coefficients
} tev_serialized_lossless_block_t;
// Convert every in-memory lossless block of a frame into its serialized float16 form.
// Header fields are copied unchanged. Each coefficient is clamped to the finite float16
// range before conversion, and in verbose mode a warning is printed for every coefficient
// whose magnitude exceeded that range.
//   enc               - encoder state; block_data is read, verbose selects the warnings
//   blocks_x/blocks_y - frame size in blocks; blocks are visited row by row
//   serialized_blocks - output array with room for blocks_x * blocks_y entries
static void serialize_lossless_blocks(tev_encoder_t *enc, int blocks_x, int blocks_y,
                                      tev_serialized_lossless_block_t *serialized_blocks) {
    // Largest finite float16 value
    const float FLOAT16_MAX = 65504.0f;

    for (int by = 0; by < blocks_y; by++) {
        for (int bx = 0; bx < blocks_x; bx++) {
            const int slot = by * blocks_x + bx;
            // NOTE(review): as in the lossless encoder, the source slot is addressed with
            // block_data's own element stride and then reinterpreted as the larger
            // tev_lossless_block_t — confirm both sides agree on the stride.
            tev_lossless_block_t *src = (tev_lossless_block_t*)&enc->block_data[slot];
            tev_serialized_lossless_block_t *dst = &serialized_blocks[slot];

            // Header fields carry over unchanged
            dst->mode = src->mode;
            dst->mv_x = src->mv_x;
            dst->mv_y = src->mv_y;
            dst->rate_control_factor = src->rate_control_factor;
            dst->cbp = src->cbp;

            // Luma: clamp, narrow, and optionally report what was clamped
            for (int k = 0; k < BLOCK_SIZE_SQR; k++) {
                const float clamped = FCLAMP(src->y_coeffs[k], -FLOAT16_MAX, FLOAT16_MAX);
                dst->y_coeffs[k] = float_to_float16(clamped);
                if (enc->verbose && fabsf(src->y_coeffs[k]) > FLOAT16_MAX) {
                    printf("WARNING: Y coefficient %d clamped: %f -> %f\n", k, src->y_coeffs[k], clamped);
                }
            }

            // Chroma: both planes share an index range, so handle them in one pass
            for (int k = 0; k < HALF_BLOCK_SIZE_SQR; k++) {
                const float co_clamped = FCLAMP(src->co_coeffs[k], -FLOAT16_MAX, FLOAT16_MAX);
                const float cg_clamped = FCLAMP(src->cg_coeffs[k], -FLOAT16_MAX, FLOAT16_MAX);
                dst->co_coeffs[k] = float_to_float16(co_clamped);
                dst->cg_coeffs[k] = float_to_float16(cg_clamped);

                if (enc->verbose && fabsf(src->co_coeffs[k]) > FLOAT16_MAX) {
                    printf("WARNING: Co coefficient %d clamped: %f -> %f\n", k, src->co_coeffs[k], co_clamped);
                }
                if (enc->verbose && fabsf(src->cg_coeffs[k]) > FLOAT16_MAX) {
                    printf("WARNING: Cg coefficient %d clamped: %f -> %f\n", k, src->cg_coeffs[k], cg_clamped);
                }
            }
        }
    }
}
// Convert SubRip time format (HH:MM:SS,mmm) to frame number
static int srt_time_to_frame(const char *time_str, int fps) {
int hours, minutes, seconds, milliseconds;
@@ -1182,7 +1347,7 @@ static subtitle_entry_t* parse_srt_file(const char *filename, int fps) {
}
}
fclose(file);
//fclose(file); // why uncommenting it errors out with "Fatal error: glibc detected an invalid stdio handle"?
return head;
}
@@ -1613,6 +1778,7 @@ static tev_encoder_t* init_encoder(void) {
enc->output_fps = 0; // No frame rate conversion by default
enc->is_ntsc_framerate = 0; // Will be detected from input
enc->verbose = 0;
enc->disable_rcf = 1;
enc->subtitle_file = NULL;
enc->has_subtitles = 0;
enc->subtitle_list = NULL;
@@ -1655,7 +1821,16 @@ static int alloc_encoder_buffers(tev_encoder_t *enc) {
enc->dct_workspace = malloc(16 * 16 * sizeof(float));
enc->block_data = malloc(total_blocks * sizeof(tev_block_t));
enc->compressed_buffer = malloc(total_blocks * sizeof(tev_block_t) * 2);
// Allocate compression buffer large enough for both regular and lossless modes
size_t max_block_size = sizeof(tev_block_t) > sizeof(tev_serialized_lossless_block_t) ?
sizeof(tev_block_t) : sizeof(tev_serialized_lossless_block_t);
size_t compressed_buffer_size = total_blocks * max_block_size * 2;
enc->compressed_buffer = malloc(compressed_buffer_size);
if (enc->verbose) {
printf("Allocated compressed buffer: %zu bytes for %d blocks (max_block_size: %zu)\n",
compressed_buffer_size, total_blocks, max_block_size);
}
enc->mp2_buffer = malloc(MP2_DEFAULT_PACKET_SIZE);
if (!enc->current_rgb || !enc->previous_rgb || !enc->reference_rgb ||
@@ -1726,7 +1901,7 @@ static int write_tev_header(FILE *output, tev_encoder_t *enc) {
uint8_t qualityCo = enc->qualityCo;
uint8_t qualityCg = enc->qualityCg;
uint8_t flags = (enc->has_audio) | (enc->has_subtitles << 1);
uint8_t video_flags = (enc->progressive_mode ? 0 : 1) | (enc->is_ntsc_framerate ? 2 : 0); // bit 0 = is_interlaced, bit 1 = is_ntsc_framerate
uint8_t video_flags = (enc->progressive_mode ? 0 : 1) | (enc->is_ntsc_framerate ? 2 : 0) | (enc->lossless_mode ? 4 : 0); // bit 0 = is_interlaced, bit 1 = is_ntsc_framerate, bit 2 = is_lossless
uint8_t reserved = 0;
fwrite(&width, 2, 1, output);
@@ -1833,7 +2008,11 @@ static int encode_frame(tev_encoder_t *enc, FILE *output, int frame_num, int fie
// Encode all blocks
for (int by = 0; by < blocks_y; by++) {
for (int bx = 0; bx < blocks_x; bx++) {
encode_block(enc, bx, by, is_keyframe);
if (enc->lossless_mode) {
encode_block_lossless(enc, bx, by, is_keyframe);
} else {
encode_block(enc, bx, by, is_keyframe);
}
// Calculate complexity for rate control (if enabled)
if (enc->bitrate_mode > 0) {
@@ -1849,13 +2028,34 @@ static int encode_frame(tev_encoder_t *enc, FILE *output, int frame_num, int fie
}
// Compress block data using Zstd (compatible with TSVM decoder)
size_t block_data_size = blocks_x * blocks_y * sizeof(tev_block_t);
// Compress using Zstd with controlled memory usage
size_t compressed_size = ZSTD_compressCCtx(enc->zstd_context,
enc->compressed_buffer, block_data_size * 2,
enc->block_data, block_data_size,
ZSTD_COMPRESSON_LEVEL);
size_t compressed_size;
if (enc->lossless_mode) {
// Lossless mode: serialize blocks with float16 coefficients
size_t serialized_block_data_size = blocks_x * blocks_y * sizeof(tev_serialized_lossless_block_t);
tev_serialized_lossless_block_t *serialized_blocks = malloc(serialized_block_data_size);
if (!serialized_blocks) {
fprintf(stderr, "Failed to allocate memory for serialized lossless blocks\n");
return -1;
}
serialize_lossless_blocks(enc, blocks_x, blocks_y, serialized_blocks);
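// Output capacity handed to Zstd: twice the serialized payload size, recomputed here from the block counts.
// NOTE(review): enc->compressed_buffer must be at least this large — confirm against the size chosen in alloc_encoder_buffers.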
size_t output_buffer_size = blocks_x * blocks_y * sizeof(tev_serialized_lossless_block_t) * 2;
compressed_size = ZSTD_compressCCtx(enc->zstd_context,
enc->compressed_buffer, output_buffer_size,
serialized_blocks, serialized_block_data_size,
ZSTD_COMPRESSON_LEVEL);
free(serialized_blocks);
} else {
// Regular mode: use regular block data
size_t block_data_size = blocks_x * blocks_y * sizeof(tev_block_t);
compressed_size = ZSTD_compressCCtx(enc->zstd_context,
enc->compressed_buffer, block_data_size * 2,
enc->block_data, block_data_size,
ZSTD_COMPRESSON_LEVEL);
}
if (ZSTD_isError(compressed_size)) {
fprintf(stderr, "Zstd compression failed: %s\n", ZSTD_getErrorName(compressed_size));
@@ -2088,7 +2288,7 @@ static int start_audio_conversion(tev_encoder_t *enc) {
char command[2048];
snprintf(command, sizeof(command),
"ffmpeg -v quiet -i \"%s\" -acodec libtwolame -psymodel 4 -b:a %dk -ar %d -ac 2 -y \"%s\" 2>/dev/null",
enc->input_file, MP2_RATE_TABLE[enc->qualityIndex], MP2_SAMPLE_RATE, TEMP_AUDIO_FILE);
enc->input_file, enc->lossless_mode ? 384 : MP2_RATE_TABLE[enc->qualityIndex], MP2_SAMPLE_RATE, TEMP_AUDIO_FILE);
int result = system(command);
if (result == 0) {
@@ -2236,15 +2436,16 @@ static void show_usage(const char *program_name) {
printf(" -o, --output FILE Output video file (use '-' for stdout)\n");
printf(" -s, --size WxH Video size (default: %dx%d)\n", DEFAULT_WIDTH, DEFAULT_HEIGHT);
printf(" -f, --fps N Output frames per second (enables frame rate conversion)\n");
printf(" -q, --quality N Quality level 0-4 (default: 2, only decides audio rate in quantiser mode)\n");
printf(" -q, --quality N Quality level 0-4 (default: 2, only decides audio rate in quantiser/lossless mode)\n");
printf(" -Q, --quantiser N Quantiser level 0-100 (100: lossless, 0: potato)\n");
// printf(" -b, --bitrate N Target bitrate in kbps (enables bitrate control mode; DON'T USE - NOT WORKING AS INTENDED)\n");
printf(" -p, --progressive Use progressive scan (default: interlaced)\n");
printf(" -S, --subtitles FILE SubRip (.srt) or SAMI (.smi) subtitle file\n");
printf(" -v, --verbose Verbose output\n");
printf(" -t, --test Test mode: generate solid colour frames\n");
printf(" --lossless Lossless mode: store coefficients as float16 (no quantisation, implies -p, 384k audio)\n");
printf(" --enable-rcf Enable per-block rate control (experimental)\n");
printf(" --enable-encode-stats Collect and report block complexity statistics\n");
printf(" --disable-rcf Disable per-block rate control\n");
printf(" --help Show this help\n\n");
// printf("Rate Control Modes:\n");
// printf(" Quality mode (default): Fixed quantisation based on -q parameter\n");
@@ -2334,7 +2535,8 @@ int main(int argc, char *argv[]) {
{"verbose", no_argument, 0, 'v'},
{"test", no_argument, 0, 't'},
{"enable-encode-stats", no_argument, 0, 1000},
{"disable-rcf", no_argument, 0, 1100},
{"enable-rcf", no_argument, 0, 1100},
{"lossless", no_argument, 0, 1200},
{"help", no_argument, 0, '?'},
{0, 0, 0, 0}
};
@@ -2403,11 +2605,14 @@ int main(int argc, char *argv[]) {
case 't':
test_mode = 1;
break;
case 1000: // --enable-encode-stats
case 1000: // --enable-encode-stats
enc->stats_mode = 1;
break;
case 1100: // --disable-rcf
enc->disable_rcf = 1;
case 1100: // --enable-rcf
enc->disable_rcf = 0;
break;
case 1200: // --lossless
enc->lossless_mode = 1;
break;
case 0:
if (strcmp(long_options[option_index].name, "help") == 0) {
@@ -2419,7 +2624,7 @@ int main(int argc, char *argv[]) {
case 'Q':
enc->qualityY = CLAMP(atoi(optarg), 0, 100);
enc->qualityCo = enc->qualityY;
enc->qualityCg = (enc->qualityY == 100) ? enc->qualityY : enc->qualityCo >> 2;
enc->qualityCg = (enc->qualityY == 100) ? enc->qualityY : enc->qualityCo >> 1;
break;
default:
show_usage(argv[0]);
@@ -2428,6 +2633,19 @@ int main(int argc, char *argv[]) {
}
}
// Lossless mode validation and adjustments
if (enc->lossless_mode) {
// In lossless mode, disable rate control and set quality to maximum
enc->bitrate_mode = 0;
enc->disable_rcf = 1;
enc->progressive_mode = 1;
enc->qualityIndex = 5;
enc->qualityY = enc->qualityCo = enc->qualityCg = 255; // Use 255 as a redundant lossless marker
if (enc->verbose) {
printf("Lossless mode enabled: Rate control disabled, quality set to maximum, enabling progressive scan\n");
}
}
// halve the internal representation of frame height
if (!enc->progressive_mode) {
enc->height /= 2;