mirror of
https://github.com/curioustorvald/tsvm.git
synced 2026-06-11 15:24:05 +09:00
various encoder bug fixes
This commit is contained in:
@@ -418,6 +418,7 @@ let hasSubtitle = !!(flags & 2)
|
|||||||
let videoFlags = seqread.readOneByte()
|
let videoFlags = seqread.readOneByte()
|
||||||
let isInterlaced = !!(videoFlags & 1)
|
let isInterlaced = !!(videoFlags & 1)
|
||||||
let isNTSC = !!(videoFlags & 2)
|
let isNTSC = !!(videoFlags & 2)
|
||||||
|
let isLossless = !!(videoFlags & 4)
|
||||||
let unused2 = seqread.readOneByte()
|
let unused2 = seqread.readOneByte()
|
||||||
|
|
||||||
|
|
||||||
@@ -427,6 +428,7 @@ serial.println(` FPS: ${(isNTSC) ? (fps * 1000 / 1001) : fps}`)
|
|||||||
serial.println(` Duration: ${totalFrames / fps}`)
|
serial.println(` Duration: ${totalFrames / fps}`)
|
||||||
serial.println(` Audio: ${hasAudio ? "Yes" : "No"}`)
|
serial.println(` Audio: ${hasAudio ? "Yes" : "No"}`)
|
||||||
serial.println(` Resolution: ${width}x${height}, ${isInterlaced ? "interlaced" : "progressive"}`)
|
serial.println(` Resolution: ${width}x${height}, ${isInterlaced ? "interlaced" : "progressive"}`)
|
||||||
|
serial.println(` Quality: Y=${qualityY}, Co=${qualityCo}, Cg=${qualityCg}, ${isLossless ? "lossless" : "lossy"}`)
|
||||||
|
|
||||||
|
|
||||||
// DEBUG interlace raw output
|
// DEBUG interlace raw output
|
||||||
@@ -665,14 +667,14 @@ try {
|
|||||||
if (isInterlaced) {
|
if (isInterlaced) {
|
||||||
// For interlaced: decode current frame into currentFieldAddr
|
// For interlaced: decode current frame into currentFieldAddr
|
||||||
// For display: use prevFieldAddr as current, currentFieldAddr as next
|
// For display: use prevFieldAddr as current, currentFieldAddr as next
|
||||||
graphics.tevDecode(blockDataPtr, nextFieldAddr, currentFieldAddr, width, decodingHeight, qualityY, qualityCo, qualityCg, trueFrameCount, debugMotionVectors, version, enableDeblocking, enableBoundaryAwareDecoding)
|
graphics.tevDecode(blockDataPtr, nextFieldAddr, currentFieldAddr, width, decodingHeight, qualityY, qualityCo, qualityCg, trueFrameCount, debugMotionVectors, version, enableDeblocking, enableBoundaryAwareDecoding, isLossless)
|
||||||
graphics.tevDeinterlace(trueFrameCount, width, decodingHeight, prevFieldAddr, currentFieldAddr, nextFieldAddr, CURRENT_RGB_ADDR, deinterlaceAlgorithm)
|
graphics.tevDeinterlace(trueFrameCount, width, decodingHeight, prevFieldAddr, currentFieldAddr, nextFieldAddr, CURRENT_RGB_ADDR, deinterlaceAlgorithm)
|
||||||
|
|
||||||
// Rotate field buffers for next frame: NEXT -> CURRENT -> PREV
|
// Rotate field buffers for next frame: NEXT -> CURRENT -> PREV
|
||||||
rotateFieldBuffers()
|
rotateFieldBuffers()
|
||||||
} else {
|
} else {
|
||||||
// Progressive or first frame: normal decoding without temporal prediction
|
// Progressive or first frame: normal decoding without temporal prediction
|
||||||
graphics.tevDecode(blockDataPtr, CURRENT_RGB_ADDR, PREV_RGB_ADDR, width, decodingHeight, qualityY, qualityCo, qualityCg, trueFrameCount, debugMotionVectors, version, enableDeblocking, enableBoundaryAwareDecoding)
|
graphics.tevDecode(blockDataPtr, CURRENT_RGB_ADDR, PREV_RGB_ADDR, width, decodingHeight, qualityY, qualityCo, qualityCg, trueFrameCount, debugMotionVectors, version, enableDeblocking, enableBoundaryAwareDecoding, isLossless)
|
||||||
}
|
}
|
||||||
|
|
||||||
decodeTime = (sys.nanoTime() - decodeStart) / 1000000.0 // Convert to milliseconds
|
decodeTime = (sys.nanoTime() - decodeStart) / 1000000.0 // Convert to milliseconds
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ import net.torvald.terrarum.modulecomputers.virtualcomputer.tvd.toUint
|
|||||||
import net.torvald.tsvm.peripheral.GraphicsAdapter
|
import net.torvald.tsvm.peripheral.GraphicsAdapter
|
||||||
import net.torvald.tsvm.peripheral.PeriBase
|
import net.torvald.tsvm.peripheral.PeriBase
|
||||||
import net.torvald.tsvm.peripheral.fmod
|
import net.torvald.tsvm.peripheral.fmod
|
||||||
|
import net.torvald.util.Float16
|
||||||
import kotlin.math.*
|
import kotlin.math.*
|
||||||
|
|
||||||
class GraphicsJSR223Delegate(private val vm: VM) {
|
class GraphicsJSR223Delegate(private val vm: VM) {
|
||||||
@@ -21,6 +22,77 @@ class GraphicsJSR223Delegate(private val vm: VM) {
|
|||||||
private val idct16TempBuffer = FloatArray(256) // For 16x16 IDCT
|
private val idct16TempBuffer = FloatArray(256) // For 16x16 IDCT
|
||||||
private val idct16SeparableBuffer = FloatArray(256) // For separable 16x16 IDCT
|
private val idct16SeparableBuffer = FloatArray(256) // For separable 16x16 IDCT
|
||||||
|
|
||||||
|
// Lossless IDCT functions for float16 coefficients (no quantization)
|
||||||
|
/**
 * Separable (row pass, then column pass) 8x8 inverse DCT for lossless-mode
 * coefficients, i.e. coefficients that are used as-is without dequantisation.
 *
 * Intermediate row results are written into the shared `idct8TempBuffer`.
 * Each pass scales its sums by 0.5; the final samples are offset by +128,
 * rounded and clamped to 0..255.
 *
 * @param coeffs 64 coefficients, indexed as `row * 8 + column`
 * @return 64 samples in 0..255, indexed as `row * 8 + column`
 */
private fun tevIdct8x8_lossless(coeffs: FloatArray): IntArray {
    val size = 8
    val passScale = 0.5f
    val pixels = IntArray(size * size)

    // Pass 1: one 1D IDCT per row, stored in the temp buffer
    for (r in 0 until size) {
        val rowBase = r * size
        for (c in 0 until size) {
            var acc = 0f
            for (u in 0 until size) {
                acc += dctBasis8[u][c] * coeffs[rowBase + u]
            }
            idct8TempBuffer[rowBase + c] = acc * passScale
        }
    }

    // Pass 2: one 1D IDCT per column, reading the temp buffer
    for (col in 0 until size) {
        for (row in 0 until size) {
            var sum = 0f
            for (v in 0 until size) {
                sum += dctBasis8[v][row] * idct8TempBuffer[v * size + col]
            }
            val finalValue = sum * passScale + 128f
            val outIdx = row * size + col
            if (finalValue.isFinite()) {
                pixels[outIdx] = finalValue.roundToInt().coerceIn(0, 255)
            } else {
                // Non-finite result: report it and fall back to middle gray
                println("NaN/Inf detected in 8x8 IDCT at ($row,$col): sum=$sum, finalValue=$finalValue")
                pixels[outIdx] = 128
            }
        }
    }

    return pixels
}
|
||||||
|
|
||||||
|
/**
 * Separable (row pass, then column pass) 16x16 inverse DCT for lossless-mode
 * coefficients, i.e. coefficients that are used as-is without dequantisation.
 *
 * Intermediate row results are written into the shared `idct16TempBuffer`.
 * Each pass scales its sums by 0.25; the final samples are offset by +128,
 * rounded and clamped to 0..255.
 *
 * @param coeffs 256 coefficients, indexed as `row * 16 + column`
 * @return 256 samples in 0..255, indexed as `row * 16 + column`
 */
private fun tevIdct16x16_lossless(coeffs: FloatArray): IntArray {
    val size = 16
    val passScale = 0.25f
    val pixels = IntArray(size * size)

    // Pass 1: one 1D IDCT per row, stored in the temp buffer
    for (r in 0 until size) {
        val rowBase = r * size
        for (c in 0 until size) {
            var acc = 0f
            for (u in 0 until size) {
                acc += dctBasis16[u][c] * coeffs[rowBase + u]
            }
            idct16TempBuffer[rowBase + c] = acc * passScale
        }
    }

    // Pass 2: one 1D IDCT per column, reading the temp buffer
    for (col in 0 until size) {
        for (row in 0 until size) {
            var sum = 0f
            for (v in 0 until size) {
                sum += dctBasis16[v][row] * idct16TempBuffer[v * size + col]
            }
            val finalValue = sum * passScale + 128f
            val outIdx = row * size + col
            if (finalValue.isFinite()) {
                pixels[outIdx] = finalValue.roundToInt().coerceIn(0, 255)
            } else {
                // Non-finite result: report it and fall back to middle gray
                println("NaN/Inf detected in 16x16 IDCT at ($row,$col): sum=$sum, finalValue=$finalValue")
                pixels[outIdx] = 128
            }
        }
    }

    return pixels
}
|
||||||
|
|
||||||
|
|
||||||
private fun getFirstGPU(): GraphicsAdapter? {
|
private fun getFirstGPU(): GraphicsAdapter? {
|
||||||
return vm.findPeribyType(VM.PERITYPE_GPU_AND_TERM)?.peripheral as? GraphicsAdapter
|
return vm.findPeribyType(VM.PERITYPE_GPU_AND_TERM)?.peripheral as? GraphicsAdapter
|
||||||
@@ -1649,7 +1721,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
|
|||||||
val result = IntArray(64)
|
val result = IntArray(64)
|
||||||
// Reuse preallocated temp buffer to reduce GC pressure
|
// Reuse preallocated temp buffer to reduce GC pressure
|
||||||
for (i in coeffs.indices) {
|
for (i in coeffs.indices) {
|
||||||
idct8TempBuffer[i] = coeffs[i] * quantTable[i] * jpeg_quality_to_mult(qualityIndex * rateControlFactor)
|
idct8TempBuffer[i] = coeffs[i] * (quantTable[i] * jpeg_quality_to_mult(qualityIndex * rateControlFactor)).coerceIn(1f, 255f)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fast separable IDCT (row-column decomposition)
|
// Fast separable IDCT (row-column decomposition)
|
||||||
@@ -1662,7 +1734,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
|
|||||||
val coeff = if (isChromaResidual && coeffIdx == 0) {
|
val coeff = if (isChromaResidual && coeffIdx == 0) {
|
||||||
coeffs[coeffIdx].toFloat() // DC lossless for chroma residual
|
coeffs[coeffIdx].toFloat() // DC lossless for chroma residual
|
||||||
} else {
|
} else {
|
||||||
coeffs[coeffIdx] * quantTable[coeffIdx] * jpeg_quality_to_mult(qualityIndex * rateControlFactor)
|
coeffs[coeffIdx] * (quantTable[coeffIdx] * jpeg_quality_to_mult(qualityIndex * rateControlFactor)).coerceIn(1f, 255f)
|
||||||
}
|
}
|
||||||
sum += dctBasis8[u][col] * coeff
|
sum += dctBasis8[u][col] * coeff
|
||||||
}
|
}
|
||||||
@@ -1708,7 +1780,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
|
|||||||
val coeff = if (idx == 0) {
|
val coeff = if (idx == 0) {
|
||||||
coeffs[idx].toFloat() // DC lossless for luma
|
coeffs[idx].toFloat() // DC lossless for luma
|
||||||
} else {
|
} else {
|
||||||
coeffs[idx] * quantTable[idx] * jpeg_quality_to_mult(qualityIndex * rateControlFactor)
|
coeffs[idx] * (quantTable[idx] * jpeg_quality_to_mult(qualityIndex * rateControlFactor)).coerceIn(1f, 255f)
|
||||||
}
|
}
|
||||||
idct16TempBuffer[idx] = coeff
|
idct16TempBuffer[idx] = coeff
|
||||||
}
|
}
|
||||||
@@ -2555,7 +2627,8 @@ class GraphicsJSR223Delegate(private val vm: VM) {
|
|||||||
fun tevDecode(blockDataPtr: Long, currentRGBAddr: Long, prevRGBAddr: Long,
|
fun tevDecode(blockDataPtr: Long, currentRGBAddr: Long, prevRGBAddr: Long,
|
||||||
width: Int, height: Int, qY: Int, qCo: Int, qCg: Int, frameCounter: Int,
|
width: Int, height: Int, qY: Int, qCo: Int, qCg: Int, frameCounter: Int,
|
||||||
debugMotionVectors: Boolean = false, tevVersion: Int = 2,
|
debugMotionVectors: Boolean = false, tevVersion: Int = 2,
|
||||||
enableDeblocking: Boolean = true, enableBoundaryAwareDecoding: Boolean = false) {
|
enableDeblocking: Boolean = true, enableBoundaryAwareDecoding: Boolean = false,
|
||||||
|
isLossless: Boolean = false) {
|
||||||
|
|
||||||
// height doesn't change when interlaced, because that's the encoder's output
|
// height doesn't change when interlaced, because that's the encoder's output
|
||||||
|
|
||||||
@@ -2846,17 +2919,65 @@ class GraphicsJSR223Delegate(private val vm: VM) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
0x01 -> { // TEV_MODE_INTRA - Full YCoCg-R DCT decode (no motion compensation)
|
0x01 -> { // TEV_MODE_INTRA - Full YCoCg-R DCT decode (no motion compensation)
|
||||||
// Read DCT coefficients: Y (16x16=256), Co (8x8=64), Cg (8x8=64)
|
val yBlock: IntArray
|
||||||
|
val coBlock: IntArray
|
||||||
|
val cgBlock: IntArray
|
||||||
|
|
||||||
// Optimized bulk reading of all DCT coefficients: Y(256×2) + Co(64×2) + Cg(64×2) = 768 bytes
|
if (isLossless) {
|
||||||
val coeffShortArray = ShortArray(384) // Total coefficients: 256 + 64 + 64 = 384 shorts
|
// Lossless mode: coefficients are stored as float16, no quantization
|
||||||
vm.bulkPeekShort(readPtr.toInt(), coeffShortArray, 768)
|
// Read float16 coefficients: Y (16x16=256), Co (8x8=64), Cg (8x8=64)
|
||||||
readPtr += 768
|
val coeffFloat16Array = ShortArray(384) // 384 float16 values stored as shorts
|
||||||
|
vm.bulkPeekShort(readPtr.toInt(), coeffFloat16Array, 768) // 384 * 2 bytes
|
||||||
|
readPtr += 768
|
||||||
|
|
||||||
// Perform hardware IDCT for each channel using fast algorithm
|
// Convert float16 to float32 and perform IDCT directly (no quantization)
|
||||||
val yBlock = tevIdct16x16_fast(coeffShortArray.sliceArray(0 until 256), QUANT_TABLE_Y, qY, rateControlFactor)
|
println("DEBUG: Reading lossless coefficients, first few float16 values: ${coeffFloat16Array.take(10).map { "0x${it.toString(16)}" }}")
|
||||||
val coBlock = tevIdct8x8_fast(coeffShortArray.sliceArray(256 until 320), QUANT_TABLE_C, true, qCo, rateControlFactor)
|
val yCoeffs = FloatArray(256) { i ->
|
||||||
val cgBlock = tevIdct8x8_fast(coeffShortArray.sliceArray(320 until 384), QUANT_TABLE_C, true, qCg, rateControlFactor)
|
// Convert signed short to unsigned short for float16 interpretation
|
||||||
|
val signedShort = coeffFloat16Array[i]
|
||||||
|
val float16bits = signedShort.toInt() and 0xFFFF // Convert to unsigned
|
||||||
|
val floatVal = Float16.toFloat(float16bits.toShort())
|
||||||
|
if (floatVal.isNaN() || floatVal.isInfinite()) {
|
||||||
|
println("NaN/Inf detected at Y coefficient $i: signedShort=0x${signedShort.toString(16)}, unsigned=0x${float16bits.toString(16)}, floatVal=$floatVal")
|
||||||
|
0f // Replace NaN with 0
|
||||||
|
} else floatVal
|
||||||
|
}
|
||||||
|
val coCoeffs = FloatArray(64) { i ->
|
||||||
|
// Convert signed short to unsigned short for float16 interpretation
|
||||||
|
val signedShort = coeffFloat16Array[256 + i]
|
||||||
|
val float16bits = signedShort.toInt() and 0xFFFF // Convert to unsigned
|
||||||
|
val floatVal = Float16.toFloat(float16bits.toShort())
|
||||||
|
if (floatVal.isNaN() || floatVal.isInfinite()) {
|
||||||
|
println("NaN/Inf detected at Co coefficient $i: signedShort=0x${signedShort.toString(16)}, unsigned=0x${float16bits.toString(16)}, floatVal=$floatVal")
|
||||||
|
0f // Replace NaN with 0
|
||||||
|
} else floatVal
|
||||||
|
}
|
||||||
|
val cgCoeffs = FloatArray(64) { i ->
|
||||||
|
// Convert signed short to unsigned short for float16 interpretation
|
||||||
|
val signedShort = coeffFloat16Array[320 + i]
|
||||||
|
val float16bits = signedShort.toInt() and 0xFFFF // Convert to unsigned
|
||||||
|
val floatVal = Float16.toFloat(float16bits.toShort())
|
||||||
|
if (floatVal.isNaN() || floatVal.isInfinite()) {
|
||||||
|
println("NaN/Inf detected at Cg coefficient $i: signedShort=0x${signedShort.toString(16)}, unsigned=0x${float16bits.toString(16)}, floatVal=$floatVal")
|
||||||
|
0f // Replace NaN with 0
|
||||||
|
} else floatVal
|
||||||
|
}
|
||||||
|
|
||||||
|
yBlock = tevIdct16x16_lossless(yCoeffs)
|
||||||
|
coBlock = tevIdct8x8_lossless(coCoeffs)
|
||||||
|
cgBlock = tevIdct8x8_lossless(cgCoeffs)
|
||||||
|
} else {
|
||||||
|
// Regular lossy mode: quantized int16 coefficients
|
||||||
|
// Optimized bulk reading of all DCT coefficients: Y(256×2) + Co(64×2) + Cg(64×2) = 768 bytes
|
||||||
|
val coeffShortArray = ShortArray(384) // Total coefficients: 256 + 64 + 64 = 384 shorts
|
||||||
|
vm.bulkPeekShort(readPtr.toInt(), coeffShortArray, 768)
|
||||||
|
readPtr += 768
|
||||||
|
|
||||||
|
// Perform hardware IDCT for each channel using fast algorithm
|
||||||
|
yBlock = tevIdct16x16_fast(coeffShortArray.sliceArray(0 until 256), QUANT_TABLE_Y, qY, rateControlFactor)
|
||||||
|
coBlock = tevIdct8x8_fast(coeffShortArray.sliceArray(256 until 320), QUANT_TABLE_C, true, qCo, rateControlFactor)
|
||||||
|
cgBlock = tevIdct8x8_fast(coeffShortArray.sliceArray(320 until 384), QUANT_TABLE_C, true, qCg, rateControlFactor)
|
||||||
|
}
|
||||||
|
|
||||||
// Convert to RGB (YCoCg-R for v2, XYB for v3)
|
// Convert to RGB (YCoCg-R for v2, XYB for v3)
|
||||||
val rgbData = if (tevVersion == 3) {
|
val rgbData = if (tevVersion == 3) {
|
||||||
@@ -3275,7 +3396,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
|
|||||||
val quantValue = if (i == 0) 1.0f else {
|
val quantValue = if (i == 0) 1.0f else {
|
||||||
quantTable[coeffIdx] * jpeg_quality_to_mult(qScale * rateControlFactor)
|
quantTable[coeffIdx] * jpeg_quality_to_mult(qScale * rateControlFactor)
|
||||||
}
|
}
|
||||||
result[blockIndex]!![i] = block[i] * quantValue
|
result[blockIndex]!![i] = block[i] * quantValue.coerceIn(1f, 255f)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -3307,7 +3428,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
|
|||||||
|
|
||||||
for (i in 1 until coeffsSize) {
|
for (i in 1 until coeffsSize) {
|
||||||
val coeffIdx = i.coerceIn(0, quantTable.size - 1)
|
val coeffIdx = i.coerceIn(0, quantTable.size - 1)
|
||||||
val quant = (quantTable[coeffIdx] * qualityMult).toInt()
|
val quant = (quantTable[coeffIdx] * qualityMult).coerceIn(1f, 255f).toInt()
|
||||||
quantValues[blockIndex][i] = quant
|
quantValues[blockIndex][i] = quant
|
||||||
quantHalfValues[blockIndex][i] = quant / 2
|
quantHalfValues[blockIndex][i] = quant / 2
|
||||||
}
|
}
|
||||||
@@ -3511,7 +3632,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
|
|||||||
val rightOff = blocksOff[rightBlockIndex]
|
val rightOff = blocksOff[rightBlockIndex]
|
||||||
|
|
||||||
// OPTIMIZATION 4: Process multiple frequencies in single loop for better cache locality
|
// OPTIMIZATION 4: Process multiple frequencies in single loop for better cache locality
|
||||||
for (v in 0 until 16) { // Only low-to-mid frequencies
|
for (v in 0 until 8) { // Only low-to-mid frequencies
|
||||||
var deltaV = 0L
|
var deltaV = 0L
|
||||||
var hfPenalty = 0L
|
var hfPenalty = 0L
|
||||||
val vOffset = v * 16
|
val vOffset = v * 16
|
||||||
@@ -3667,7 +3788,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
|
|||||||
blocksMid[blockIndex][i] = dcValue
|
blocksMid[blockIndex][i] = dcValue
|
||||||
} else {
|
} else {
|
||||||
// AC coefficients: use quantization intervals
|
// AC coefficients: use quantization intervals
|
||||||
val quant = (quantTable[quantIdx] * jpeg_quality_to_mult(qScale * rateControlFactor)).toInt()
|
val quant = (quantTable[quantIdx] * jpeg_quality_to_mult(qScale * rateControlFactor)).coerceIn(1f, 255f).toInt()
|
||||||
|
|
||||||
// Standard dequantized value (midpoint)
|
// Standard dequantized value (midpoint)
|
||||||
blocksMid[blockIndex][i] = block[i].toInt() * quant
|
blocksMid[blockIndex][i] = block[i].toInt() * quant
|
||||||
@@ -3719,7 +3840,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
|
|||||||
blocksMax[blockIndex][i] = dcValue
|
blocksMax[blockIndex][i] = dcValue
|
||||||
} else {
|
} else {
|
||||||
// AC coefficients: use quantization intervals
|
// AC coefficients: use quantization intervals
|
||||||
val quant = (quantTable[quantIdx] * jpeg_quality_to_mult(qScale * rateControlFactor)).toInt()
|
val quant = (quantTable[quantIdx] * jpeg_quality_to_mult(qScale * rateControlFactor)).coerceIn(1f, 255f).toInt()
|
||||||
|
|
||||||
// Standard dequantized value (midpoint)
|
// Standard dequantized value (midpoint)
|
||||||
blocksMid[blockIndex][i] = block[i].toInt() * quant
|
blocksMid[blockIndex][i] = block[i].toInt() * quant
|
||||||
@@ -3789,73 +3910,116 @@ class GraphicsJSR223Delegate(private val vm: VM) {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// BULK OPTIMIZED 8x8 horizontal boundary analysis for chroma channels
|
||||||
/**
 * Boundary analysis between two horizontally adjacent 8x8 blocks (left/right).
 *
 * For each of the first four coefficient rows (v < 4) this accumulates a
 * discontinuity measure from the mid-point coefficients of both blocks and,
 * unless it is negligible, adds a correction to the offset arrays of both
 * blocks, weighted by [kLinearGradient] with alternating sign on the right block.
 *
 * All accumulation is done in 64-bit arithmetic: the squared-coefficient
 * penalty and the alpha-weighted difference are widened to Long *before*
 * multiplying, so large coefficients cannot wrap around in Int.
 *
 * @param leftBlockIndex index of the left block in [blocksMid] / [blocksOff]
 * @param rightBlockIndex index of the right block in [blocksMid] / [blocksOff]
 * @param blocksMid per-block mid-point coefficients, 64 entries each (`v * 8 + u`)
 * @param blocksOff per-block correction accumulators, 64 entries each; modified in place
 * @param kLinearGradient per-frequency correction weights (indices 0..7 are read)
 * @param kAlphaSqrt2 per-frequency discontinuity weights (indices 0..7 are read)
 */
private fun analyzeHorizontalBoundary(
    leftBlockIndex: Int, rightBlockIndex: Int,
    blocksMid: Array<IntArray>, blocksOff: Array<LongArray>,
    kLinearGradient: IntArray, kAlphaSqrt2: IntArray
) {
    val leftMid = blocksMid[leftBlockIndex]
    val rightMid = blocksMid[rightBlockIndex]
    val leftOff = blocksOff[leftBlockIndex]
    val rightOff = blocksOff[rightBlockIndex]

    for (v in 0 until 4) { // Only low-to-mid frequencies for 8x8
        var deltaV = 0L
        var hfPenalty = 0L
        val vOffset = v * 8

        // First pass: calculate boundary discontinuity (64-bit to avoid Int wrap-around)
        for (u in 0 until 8) {
            val idx = vOffset + u
            val alpha = kAlphaSqrt2[u].toLong()
            val sign = if (u and 1 != 0) -1L else 1L
            val gi = leftMid[idx].toLong()
            val gj = rightMid[idx].toLong()

            deltaV += alpha * (gj - sign * gi)
            hfPenalty += (u * u) * (gi * gi + gj * gj)
        }

        // Early exit for very small adjustments
        if (kotlin.math.abs(deltaV) < 100) continue

        // Apply high-frequency damping once per frequency band
        if (hfPenalty > 400) deltaV /= 2 // 8x8 threshold

        // Second pass: apply corrections; the right block alternates sign (+ for even u, - for odd u)
        for (u in 0 until 8) {
            val term = deltaV * kLinearGradient[u]
            leftOff[vOffset + u] += term
            if (u and 1 == 0) {
                rightOff[vOffset + u] += term
            } else {
                rightOff[vOffset + u] -= term
            }
        }
    }
}
|
||||||
|
|
||||||
|
// BULK OPTIMIZED 8x8 vertical boundary analysis for chroma channels
|
||||||
/**
 * Boundary analysis between two vertically adjacent 8x8 blocks (top/bottom).
 *
 * For each of the first four coefficient columns (u < 4) this accumulates a
 * discontinuity measure from the mid-point coefficients of both blocks and,
 * unless it is negligible, adds a correction to the offset arrays of both
 * blocks, weighted by [kLinearGradient] with alternating sign on the bottom block.
 *
 * All accumulation is done in 64-bit arithmetic: the squared-coefficient
 * penalty and the alpha-weighted difference are widened to Long *before*
 * multiplying, so large coefficients cannot wrap around in Int.
 *
 * @param topBlockIndex index of the top block in [blocksMid] / [blocksOff]
 * @param bottomBlockIndex index of the bottom block in [blocksMid] / [blocksOff]
 * @param blocksMid per-block mid-point coefficients, 64 entries each (`v * 8 + u`)
 * @param blocksOff per-block correction accumulators, 64 entries each; modified in place
 * @param kLinearGradient per-frequency correction weights (indices 0..7 are read)
 * @param kAlphaSqrt2 per-frequency discontinuity weights (indices 0..7 are read)
 */
private fun analyzeVerticalBoundary(
    topBlockIndex: Int, bottomBlockIndex: Int,
    blocksMid: Array<IntArray>, blocksOff: Array<LongArray>,
    kLinearGradient: IntArray, kAlphaSqrt2: IntArray
) {
    val topMid = blocksMid[topBlockIndex]
    val bottomMid = blocksMid[bottomBlockIndex]
    val topOff = blocksOff[topBlockIndex]
    val bottomOff = blocksOff[bottomBlockIndex]

    for (u in 0 until 4) { // Only low-to-mid frequencies for 8x8
        var deltaU = 0L
        var hfPenalty = 0L

        // First pass: calculate boundary discontinuity (64-bit to avoid Int wrap-around)
        for (v in 0 until 8) {
            val idx = v * 8 + u
            val alpha = kAlphaSqrt2[v].toLong()
            val sign = if (v and 1 != 0) -1L else 1L
            val gi = topMid[idx].toLong()
            val gj = bottomMid[idx].toLong()

            deltaU += alpha * (gj - sign * gi)
            hfPenalty += (v * v) * (gi * gi + gj * gj)
        }

        // Early exit for very small adjustments
        if (kotlin.math.abs(deltaU) < 100) continue

        // Apply high-frequency damping once per frequency band
        if (hfPenalty > 400) deltaU /= 2 // 8x8 threshold

        // Second pass: apply corrections down the column; the bottom block
        // alternates sign (+ for even v, - for odd v)
        for (v in 0 until 8) {
            val idx = v * 8 + u
            val term = deltaU * kLinearGradient[v]
            topOff[idx] += term
            if (v and 1 == 0) {
                bottomOff[idx] += term
            } else {
                bottomOff[idx] -= term
            }
        }
    }
}
|
||||||
|
|
||||||
|
|||||||
90
tsvm_core/src/net/torvald/util/Float16.kt
Normal file
90
tsvm_core/src/net/torvald/util/Float16.kt
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
package net.torvald.util
|
||||||
|
|
||||||
|
import kotlin.experimental.or
|
||||||
|
|
||||||
|
/**
 * Mutable holder for an IEEE 754 binary16 (half-precision) value, kept as its
 * raw 16-bit pattern in [bits]. Intended for storing values, not for arithmetic.
 *
 * The companion object provides the stateless bit-level conversions.
 */
class Float16() {

    /** Raw binary16 bit pattern of the stored value. */
    var bits = 0.toShort()
        private set

    constructor(fval: Float) : this() {
        fromFloat(fval)
    }

    /** Returns the stored half-precision value widened to Float. */
    fun toFloat() = Float16.toFloat(bits)

    /** Replaces the stored value with [fval] converted to half precision. */
    fun fromFloat(fval: Float) {
        bits = Float16.fromFloat(fval)
    }


    // NOTE: these operators update this instance in place and evaluate to Unit
    // (they delegate to fromFloat); they do not produce a new Float16.
    operator fun times(other: Float) = fromFloat(this.toFloat() * other)
    operator fun times(other: Float16) = fromFloat(this.toFloat() * other.toFloat())

    operator fun div(other: Float) = fromFloat(this.toFloat() / other)
    operator fun div(other: Float16) = fromFloat(this.toFloat() / other.toFloat())

    // operators are stripped: you don't calculate from FP16; this is only for storing values //

    companion object {
        /**
         * Converts a binary16 bit pattern to the Float with exactly the same value.
         *
         * Every finite half value is exactly representable as a Float, so the
         * conversion is exact (e.g. 0x3C00 yields exactly 1.0f). Inf and NaN map
         * to Inf and NaN; subnormals are renormalised.
         *
         * @param hbits binary16 bit pattern (the sign of the Short is irrelevant)
         */
        fun toFloat(hbits: Short): Float {
            val half = hbits.toInt() and 0xFFFF
            val sign = (half and 0x8000) shl 16

            var mant = half and 0x03ff // 10 bits mantissa
            var exp = half and 0x7c00 // 5 bits exponent
            if (exp == 0x7c00) {
                // NaN/Inf
                exp = 0x3fc00 // -> NaN/Inf
            } else if (exp != 0) {
                // normalized value
                exp += 0x1c000 // exp - 15 + 127
            } else if (mant != 0) {
                // exp == 0 -> subnormal
                exp = 0x1c400 // make it normal
                do {
                    mant = mant shl 1 // mantissa * 2
                    exp -= 0x400 // decrease exp by 1
                } while (mant and 0x400 == 0) // while not normal
                mant = mant and 0x3ff // discard subnormal bit
            } // else +/-0 -> +/-0

            // combine all parts; value << (23 - 10)
            return java.lang.Float.intBitsToFloat(sign or ((exp or mant) shl 13))
        }

        /**
         * Converts a Float to a binary16 bit pattern.
         *
         * Values too large for half precision become +/-Inf, except that a value
         * which only reaches the Inf threshold through rounding is kept at the
         * largest finite half (0x7BFF). Values too small for a subnormal become +/-0.
         */
        fun fromFloat(fval: Float): Short {
            val fbits = java.lang.Float.floatToIntBits(fval)
            val sign = fbits.ushr(16).and(0x8000).toShort() // sign only
            val magnitude = fbits and 0x7fffffff
            val rounded = magnitude + 0x1000 // rounded value

            if (rounded >= 0x47800000) {
                // might be or become NaN/Inf
                if (magnitude >= 0x47800000) { // is or must become NaN/Inf
                    if (rounded < 0x7f800000) {
                        // was value but too large
                        return sign or 0x7c00 // make it +/-Inf
                    }
                    return sign or 0x7c00 or // remains +/-Inf or NaN
                            (fbits and 0x007fffff).ushr(13).toShort() // keep NaN (and Inf) bits
                }
                return sign or 0x7bff.toShort() // unrounded not quite Inf
            }
            if (rounded >= 0x38800000) {
                // remains normalized value
                return sign or (rounded - 0x38000000).ushr(13).toShort() // exp - 127 + 15
            }
            if (rounded < 0x33000000) {
                // too small for subnormal
                return sign // becomes +/-0
            }
            val exp = magnitude.ushr(23) // float exponent field, used for the subnormal shift

            return sign or ((fbits and 0x7fffff or 0x800000) // add subnormal bit
                    + 0x800000.ushr(exp - 102) // round depending on cut off
                    ).ushr(126 - exp) // div by 2^(1-(exp-127+15)) and >> 13 | exp=0
                .toShort()
        }
    }
}
|
||||||
@@ -14,6 +14,58 @@
|
|||||||
#include <sys/time.h>
|
#include <sys/time.h>
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
|
|
||||||
|
// Float16 conversion functions (adapted from Float16.kt)
|
||||||
|
// Convert a 32-bit float to an IEEE 754 binary16 bit pattern.
// Values too large for half precision become +/-Inf, except that a value which
// only reaches the Inf threshold through rounding is kept at the largest finite
// half (0x7bff). Values too small for a subnormal become +/-0.
static inline uint16_t float_to_float16(float fval) {
    // Read the float's bits through a union: casting &fval to uint32_t* and
    // dereferencing it violates the strict-aliasing rule.
    union { float f; uint32_t u; } pun;
    pun.f = fval;
    uint32_t fbits = pun.u;

    uint16_t sign = (uint16_t)((fbits >> 16) & 0x8000);   // sign only
    uint32_t val = (fbits & 0x7fffffff) + 0x1000;         // rounded value

    if (val >= 0x47800000) {                              // might be or become NaN/Inf
        if ((fbits & 0x7fffffff) >= 0x47800000) {         // is or must become NaN/Inf
            if (val < 0x7f800000)                         // was value but too large
                return (uint16_t)(sign | 0x7c00);         // make it +/-Inf
            return (uint16_t)(sign | 0x7c00 |             // remains +/-Inf or NaN
                   ((fbits & 0x007fffff) >> 13));         // keep NaN (and Inf) bits
        }
        return (uint16_t)(sign | 0x7bff);                 // unrounded not quite Inf
    }
    if (val >= 0x38800000)                                // remains normalized value
        return (uint16_t)(sign | ((val - 0x38000000) >> 13)); // exp - 127 + 15
    if (val < 0x33000000)                                 // too small for subnormal
        return sign;                                      // becomes +/-0
    val = (fbits & 0x7fffffff) >> 23;                     // tmp exp for subnormal calc

    return (uint16_t)(sign | ((((fbits & 0x7fffff) | 0x800000) + // add subnormal bit
           (0x800000 >> (val - 102))                      // round depending on cut off
           ) >> (126 - val)));                            // div by 2^(1-(exp-127+15)) and >> 13 | exp=0
}
|
||||||
|
|
||||||
|
/*
 * Convert IEEE-754 binary16 bits to a binary32 value.
 *
 * hbits:   16-bit half-precision encoding.
 * returns: the corresponding float.
 *
 * Inf/NaN map to Inf/NaN, subnormal halves are renormalised, and +/-0 stays
 * +/-0. Quirk kept from the original algorithm ("smooth transition"): a normal
 * half with an all-zero mantissa, other than the smallest normal exponent,
 * gets 0x3ff in the low float mantissa bits, so such inputs come back very
 * slightly larger in magnitude than the exact value (0x3c00 yields 0x3f8003ff
 * rather than exactly 1.0f).
 * NOTE(review): presumably this must stay bit-compatible with the Float16.kt
 * routine this was adapted from - confirm before changing it.
 */
static inline float float16_to_float(uint16_t hbits) {
    // Cast before shifting: `hbits & 0x8000` is promoted to signed int, and
    // shifting 0x8000 left by 16 in a signed int is undefined behaviour.
    const uint32_t sign = ((uint32_t)(hbits & 0x8000)) << 16;
    uint32_t mant = hbits & 0x03ff;                 // 10 bits mantissa
    uint32_t exp = hbits & 0x7c00;                  // 5 bits exponent
    // Reinterpret through a union instead of *(float*)&bits (strict aliasing).
    union { uint32_t u; float f; } pun;

    if (exp == 0x7c00) {                            // NaN/Inf
        exp = 0x3fc00;                              // -> NaN/Inf
    } else if (exp != 0) {                          // normalized value
        exp += 0x1c000;                             // exp - 15 + 127
        if (mant == 0 && exp > 0x1c400) {           // smooth transition
            pun.u = sign | (exp << 13) | 0x3ff;
            return pun.f;
        }
    } else if (mant != 0) {                         // && exp==0 -> subnormal
        exp = 0x1c400;                              // make it normal
        do {
            mant <<= 1;                             // mantissa * 2
            exp -= 0x400;                           // decrease exp by 1
        } while ((mant & 0x400) == 0);              // while not normal
        mant &= 0x3ff;                              // discard subnormal bit
    }                                               // else +/-0 -> +/-0

    pun.u = sign | ((exp | mant) << 13);
    return pun.f;
}
|
||||||
|
|
||||||
// TSVM Enhanced Video (TEV) format constants
|
// TSVM Enhanced Video (TEV) format constants
|
||||||
#define TEV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x45\x56" // "\x1FTSVM TEV"
|
#define TEV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x45\x56" // "\x1FTSVM TEV"
|
||||||
#define TEV_VERSION 2 // Updated for YCoCg-R 4:2:0
|
#define TEV_VERSION 2 // Updated for YCoCg-R 4:2:0
|
||||||
@@ -103,7 +155,7 @@ static const uint32_t QUANT_TABLE_C[HALF_BLOCK_SIZE_SQR] =
|
|||||||
|
|
||||||
// Audio constants (reuse MP2 from existing system)
|
// Audio constants (reuse MP2 from existing system)
|
||||||
#define MP2_SAMPLE_RATE 32000
|
#define MP2_SAMPLE_RATE 32000
|
||||||
#define MP2_DEFAULT_PACKET_SIZE 0x240
|
#define MP2_DEFAULT_PACKET_SIZE 1728
|
||||||
|
|
||||||
// Default values
|
// Default values
|
||||||
#define DEFAULT_WIDTH 560
|
#define DEFAULT_WIDTH 560
|
||||||
@@ -140,6 +192,17 @@ typedef struct __attribute__((packed)) {
|
|||||||
int16_t cg_coeffs[HALF_BLOCK_SIZE_SQR]; // quantised Cg DCT coefficients (8x8)
|
int16_t cg_coeffs[HALF_BLOCK_SIZE_SQR]; // quantised Cg DCT coefficients (8x8)
|
||||||
} tev_block_t;
|
} tev_block_t;
|
||||||
|
|
||||||
|
// Lossless TEV block structure (uses float32 internally, converted to float16 during serialization)
|
||||||
|
typedef struct __attribute__((packed)) {
|
||||||
|
uint8_t mode; // Block encoding mode
|
||||||
|
int16_t mv_x, mv_y; // Motion vector (1/4 pixel precision)
|
||||||
|
float rate_control_factor; // Always 1.0f in lossless mode
|
||||||
|
uint16_t cbp; // Coded block pattern (which channels have non-zero coeffs)
|
||||||
|
float y_coeffs[BLOCK_SIZE_SQR]; // lossless Y DCT coefficients (16x16)
|
||||||
|
float co_coeffs[HALF_BLOCK_SIZE_SQR]; // lossless Co DCT coefficients (8x8)
|
||||||
|
float cg_coeffs[HALF_BLOCK_SIZE_SQR]; // lossless Cg DCT coefficients (8x8)
|
||||||
|
} tev_lossless_block_t;
|
||||||
|
|
||||||
// Subtitle entry structure
|
// Subtitle entry structure
|
||||||
typedef struct subtitle_entry {
|
typedef struct subtitle_entry {
|
||||||
int start_frame;
|
int start_frame;
|
||||||
@@ -168,6 +231,8 @@ typedef struct {
|
|||||||
int qualityCo;
|
int qualityCo;
|
||||||
int qualityCg;
|
int qualityCg;
|
||||||
int verbose;
|
int verbose;
|
||||||
|
int disable_rcf; // 0 = rcf enabled, 1 = disabled
|
||||||
|
int lossless_mode; // 0 = lossy (default), 1 = lossless mode
|
||||||
|
|
||||||
// Bitrate control
|
// Bitrate control
|
||||||
int target_bitrate_kbps; // Target bitrate in kbps (0 = quality mode)
|
int target_bitrate_kbps; // Target bitrate in kbps (0 = quality mode)
|
||||||
@@ -219,7 +284,6 @@ typedef struct {
|
|||||||
|
|
||||||
// Complexity statistics collection
|
// Complexity statistics collection
|
||||||
int stats_mode; // 0 = disabled, 1 = enabled
|
int stats_mode; // 0 = disabled, 1 = enabled
|
||||||
int disable_rcf; // 0 = rcf enabled, 1 = disabled
|
|
||||||
float *complexity_values; // Array to store all complexity values
|
float *complexity_values; // Array to store all complexity values
|
||||||
int complexity_count; // Current count of complexity values
|
int complexity_count; // Current count of complexity values
|
||||||
int complexity_capacity; // Capacity of complexity_values array
|
int complexity_capacity; // Capacity of complexity_values array
|
||||||
@@ -1041,6 +1105,107 @@ static void encode_block(tev_encoder_t *enc, int block_x, int block_y, int is_ke
|
|||||||
block->cbp = 0x07; // Y, Co, Cg all present
|
block->cbp = 0x07; // Y, Co, Cg all present
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Encode a 16x16 block in lossless mode
|
||||||
|
static void encode_block_lossless(tev_encoder_t *enc, int block_x, int block_y, int is_keyframe) {
|
||||||
|
tev_lossless_block_t *block = (tev_lossless_block_t*)&enc->block_data[block_y * ((enc->width + 15) / 16) + block_x];
|
||||||
|
|
||||||
|
// Extract YCoCg-R block
|
||||||
|
extract_ycocgr_block(enc->current_rgb, enc->width, enc->height,
|
||||||
|
block_x, block_y,
|
||||||
|
enc->y_workspace, enc->co_workspace, enc->cg_workspace);
|
||||||
|
|
||||||
|
if (is_keyframe) {
|
||||||
|
// Intra coding for keyframes
|
||||||
|
block->mode = TEV_MODE_INTRA;
|
||||||
|
block->mv_x = block->mv_y = 0;
|
||||||
|
enc->blocks_intra++;
|
||||||
|
} else {
|
||||||
|
// Same mode decision logic as regular encode_block
|
||||||
|
// For simplicity, using INTRA for now in lossless mode
|
||||||
|
block->mode = TEV_MODE_INTRA;
|
||||||
|
block->mv_x = block->mv_y = 0;
|
||||||
|
enc->blocks_intra++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Lossless mode: rate control factor is always 1.0f
|
||||||
|
block->rate_control_factor = 1.0f;
|
||||||
|
|
||||||
|
// Apply DCT transforms using the same pattern as regular encoding
|
||||||
|
// Y channel (16x16)
|
||||||
|
dct_16x16_fast(enc->y_workspace, enc->dct_workspace);
|
||||||
|
for (int i = 0; i < BLOCK_SIZE_SQR; i++) {
|
||||||
|
block->y_coeffs[i] = enc->dct_workspace[i]; // Store directly without quantization
|
||||||
|
}
|
||||||
|
|
||||||
|
// Co channel (8x8)
|
||||||
|
dct_8x8_fast(enc->co_workspace, enc->dct_workspace);
|
||||||
|
for (int i = 0; i < HALF_BLOCK_SIZE_SQR; i++) {
|
||||||
|
block->co_coeffs[i] = enc->dct_workspace[i]; // Store directly without quantization
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cg channel (8x8)
|
||||||
|
dct_8x8_fast(enc->cg_workspace, enc->dct_workspace);
|
||||||
|
for (int i = 0; i < HALF_BLOCK_SIZE_SQR; i++) {
|
||||||
|
block->cg_coeffs[i] = enc->dct_workspace[i]; // Store directly without quantization
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set CBP (simplified - always encode all channels)
|
||||||
|
block->cbp = 0x07; // Y, Co, Cg all present
|
||||||
|
}
|
||||||
|
|
||||||
|
// Serialized lossless block structure (for writing to file with float16 coefficients)
|
||||||
|
typedef struct __attribute__((packed)) {
|
||||||
|
uint8_t mode;
|
||||||
|
int16_t mv_x, mv_y;
|
||||||
|
float rate_control_factor; // Always 1.0f in lossless mode
|
||||||
|
uint16_t cbp;
|
||||||
|
uint16_t y_coeffs[BLOCK_SIZE_SQR]; // float16 Y coefficients
|
||||||
|
uint16_t co_coeffs[HALF_BLOCK_SIZE_SQR]; // float16 Co coefficients
|
||||||
|
uint16_t cg_coeffs[HALF_BLOCK_SIZE_SQR]; // float16 Cg coefficients
|
||||||
|
} tev_serialized_lossless_block_t;
|
||||||
|
|
||||||
|
// Convert lossless blocks to serialized format with float16 coefficients
|
||||||
|
static void serialize_lossless_blocks(tev_encoder_t *enc, int blocks_x, int blocks_y,
|
||||||
|
tev_serialized_lossless_block_t *serialized_blocks) {
|
||||||
|
for (int by = 0; by < blocks_y; by++) {
|
||||||
|
for (int bx = 0; bx < blocks_x; bx++) {
|
||||||
|
tev_lossless_block_t *src = (tev_lossless_block_t*)&enc->block_data[by * blocks_x + bx];
|
||||||
|
tev_serialized_lossless_block_t *dst = &serialized_blocks[by * blocks_x + bx];
|
||||||
|
|
||||||
|
// Copy basic fields
|
||||||
|
dst->mode = src->mode;
|
||||||
|
dst->mv_x = src->mv_x;
|
||||||
|
dst->mv_y = src->mv_y;
|
||||||
|
dst->rate_control_factor = src->rate_control_factor;
|
||||||
|
dst->cbp = src->cbp;
|
||||||
|
|
||||||
|
// Convert float32 coefficients to float16 with range clamping
|
||||||
|
// Float16 max finite value is approximately 65504
|
||||||
|
const float FLOAT16_MAX = 65504.0f;
|
||||||
|
|
||||||
|
for (int i = 0; i < BLOCK_SIZE_SQR; i++) {
|
||||||
|
float coeff = FCLAMP(src->y_coeffs[i], -FLOAT16_MAX, FLOAT16_MAX);
|
||||||
|
dst->y_coeffs[i] = float_to_float16(coeff);
|
||||||
|
if (enc->verbose && fabsf(src->y_coeffs[i]) > FLOAT16_MAX) {
|
||||||
|
printf("WARNING: Y coefficient %d clamped: %f -> %f\n", i, src->y_coeffs[i], coeff);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int i = 0; i < HALF_BLOCK_SIZE_SQR; i++) {
|
||||||
|
float co_coeff = FCLAMP(src->co_coeffs[i], -FLOAT16_MAX, FLOAT16_MAX);
|
||||||
|
float cg_coeff = FCLAMP(src->cg_coeffs[i], -FLOAT16_MAX, FLOAT16_MAX);
|
||||||
|
dst->co_coeffs[i] = float_to_float16(co_coeff);
|
||||||
|
dst->cg_coeffs[i] = float_to_float16(cg_coeff);
|
||||||
|
if (enc->verbose && fabsf(src->co_coeffs[i]) > FLOAT16_MAX) {
|
||||||
|
printf("WARNING: Co coefficient %d clamped: %f -> %f\n", i, src->co_coeffs[i], co_coeff);
|
||||||
|
}
|
||||||
|
if (enc->verbose && fabsf(src->cg_coeffs[i]) > FLOAT16_MAX) {
|
||||||
|
printf("WARNING: Cg coefficient %d clamped: %f -> %f\n", i, src->cg_coeffs[i], cg_coeff);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Convert SubRip time format (HH:MM:SS,mmm) to frame number
|
// Convert SubRip time format (HH:MM:SS,mmm) to frame number
|
||||||
static int srt_time_to_frame(const char *time_str, int fps) {
|
static int srt_time_to_frame(const char *time_str, int fps) {
|
||||||
int hours, minutes, seconds, milliseconds;
|
int hours, minutes, seconds, milliseconds;
|
||||||
@@ -1182,7 +1347,7 @@ static subtitle_entry_t* parse_srt_file(const char *filename, int fps) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fclose(file);
|
//fclose(file); // why uncommenting it errors out with "Fatal error: glibc detected an invalid stdio handle"?
|
||||||
return head;
|
return head;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1613,6 +1778,7 @@ static tev_encoder_t* init_encoder(void) {
|
|||||||
enc->output_fps = 0; // No frame rate conversion by default
|
enc->output_fps = 0; // No frame rate conversion by default
|
||||||
enc->is_ntsc_framerate = 0; // Will be detected from input
|
enc->is_ntsc_framerate = 0; // Will be detected from input
|
||||||
enc->verbose = 0;
|
enc->verbose = 0;
|
||||||
|
enc->disable_rcf = 1;
|
||||||
enc->subtitle_file = NULL;
|
enc->subtitle_file = NULL;
|
||||||
enc->has_subtitles = 0;
|
enc->has_subtitles = 0;
|
||||||
enc->subtitle_list = NULL;
|
enc->subtitle_list = NULL;
|
||||||
@@ -1655,7 +1821,16 @@ static int alloc_encoder_buffers(tev_encoder_t *enc) {
|
|||||||
enc->dct_workspace = malloc(16 * 16 * sizeof(float));
|
enc->dct_workspace = malloc(16 * 16 * sizeof(float));
|
||||||
|
|
||||||
enc->block_data = malloc(total_blocks * sizeof(tev_block_t));
|
enc->block_data = malloc(total_blocks * sizeof(tev_block_t));
|
||||||
enc->compressed_buffer = malloc(total_blocks * sizeof(tev_block_t) * 2);
|
// Allocate compression buffer large enough for both regular and lossless modes
|
||||||
|
size_t max_block_size = sizeof(tev_block_t) > sizeof(tev_serialized_lossless_block_t) ?
|
||||||
|
sizeof(tev_block_t) : sizeof(tev_serialized_lossless_block_t);
|
||||||
|
size_t compressed_buffer_size = total_blocks * max_block_size * 2;
|
||||||
|
enc->compressed_buffer = malloc(compressed_buffer_size);
|
||||||
|
|
||||||
|
if (enc->verbose) {
|
||||||
|
printf("Allocated compressed buffer: %zu bytes for %d blocks (max_block_size: %zu)\n",
|
||||||
|
compressed_buffer_size, total_blocks, max_block_size);
|
||||||
|
}
|
||||||
enc->mp2_buffer = malloc(MP2_DEFAULT_PACKET_SIZE);
|
enc->mp2_buffer = malloc(MP2_DEFAULT_PACKET_SIZE);
|
||||||
|
|
||||||
if (!enc->current_rgb || !enc->previous_rgb || !enc->reference_rgb ||
|
if (!enc->current_rgb || !enc->previous_rgb || !enc->reference_rgb ||
|
||||||
@@ -1726,7 +1901,7 @@ static int write_tev_header(FILE *output, tev_encoder_t *enc) {
|
|||||||
uint8_t qualityCo = enc->qualityCo;
|
uint8_t qualityCo = enc->qualityCo;
|
||||||
uint8_t qualityCg = enc->qualityCg;
|
uint8_t qualityCg = enc->qualityCg;
|
||||||
uint8_t flags = (enc->has_audio) | (enc->has_subtitles << 1);
|
uint8_t flags = (enc->has_audio) | (enc->has_subtitles << 1);
|
||||||
uint8_t video_flags = (enc->progressive_mode ? 0 : 1) | (enc->is_ntsc_framerate ? 2 : 0); // bit 0 = is_interlaced, bit 1 = is_ntsc_framerate
|
uint8_t video_flags = (enc->progressive_mode ? 0 : 1) | (enc->is_ntsc_framerate ? 2 : 0) | (enc->lossless_mode ? 4 : 0); // bit 0 = is_interlaced, bit 1 = is_ntsc_framerate, bit 2 = is_lossless
|
||||||
uint8_t reserved = 0;
|
uint8_t reserved = 0;
|
||||||
|
|
||||||
fwrite(&width, 2, 1, output);
|
fwrite(&width, 2, 1, output);
|
||||||
@@ -1833,7 +2008,11 @@ static int encode_frame(tev_encoder_t *enc, FILE *output, int frame_num, int fie
|
|||||||
// Encode all blocks
|
// Encode all blocks
|
||||||
for (int by = 0; by < blocks_y; by++) {
|
for (int by = 0; by < blocks_y; by++) {
|
||||||
for (int bx = 0; bx < blocks_x; bx++) {
|
for (int bx = 0; bx < blocks_x; bx++) {
|
||||||
encode_block(enc, bx, by, is_keyframe);
|
if (enc->lossless_mode) {
|
||||||
|
encode_block_lossless(enc, bx, by, is_keyframe);
|
||||||
|
} else {
|
||||||
|
encode_block(enc, bx, by, is_keyframe);
|
||||||
|
}
|
||||||
|
|
||||||
// Calculate complexity for rate control (if enabled)
|
// Calculate complexity for rate control (if enabled)
|
||||||
if (enc->bitrate_mode > 0) {
|
if (enc->bitrate_mode > 0) {
|
||||||
@@ -1849,13 +2028,34 @@ static int encode_frame(tev_encoder_t *enc, FILE *output, int frame_num, int fie
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Compress block data using Zstd (compatible with TSVM decoder)
|
// Compress block data using Zstd (compatible with TSVM decoder)
|
||||||
size_t block_data_size = blocks_x * blocks_y * sizeof(tev_block_t);
|
size_t compressed_size;
|
||||||
|
|
||||||
// Compress using Zstd with controlled memory usage
|
if (enc->lossless_mode) {
|
||||||
size_t compressed_size = ZSTD_compressCCtx(enc->zstd_context,
|
// Lossless mode: serialize blocks with float16 coefficients
|
||||||
enc->compressed_buffer, block_data_size * 2,
|
size_t serialized_block_data_size = blocks_x * blocks_y * sizeof(tev_serialized_lossless_block_t);
|
||||||
enc->block_data, block_data_size,
|
tev_serialized_lossless_block_t *serialized_blocks = malloc(serialized_block_data_size);
|
||||||
ZSTD_COMPRESSON_LEVEL);
|
if (!serialized_blocks) {
|
||||||
|
fprintf(stderr, "Failed to allocate memory for serialized lossless blocks\n");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
serialize_lossless_blocks(enc, blocks_x, blocks_y, serialized_blocks);
|
||||||
|
|
||||||
|
// Use the pre-allocated buffer size instead of calculating dynamically
|
||||||
|
size_t output_buffer_size = blocks_x * blocks_y * sizeof(tev_serialized_lossless_block_t) * 2;
|
||||||
|
compressed_size = ZSTD_compressCCtx(enc->zstd_context,
|
||||||
|
enc->compressed_buffer, output_buffer_size,
|
||||||
|
serialized_blocks, serialized_block_data_size,
|
||||||
|
ZSTD_COMPRESSON_LEVEL);
|
||||||
|
free(serialized_blocks);
|
||||||
|
} else {
|
||||||
|
// Regular mode: use regular block data
|
||||||
|
size_t block_data_size = blocks_x * blocks_y * sizeof(tev_block_t);
|
||||||
|
compressed_size = ZSTD_compressCCtx(enc->zstd_context,
|
||||||
|
enc->compressed_buffer, block_data_size * 2,
|
||||||
|
enc->block_data, block_data_size,
|
||||||
|
ZSTD_COMPRESSON_LEVEL);
|
||||||
|
}
|
||||||
|
|
||||||
if (ZSTD_isError(compressed_size)) {
|
if (ZSTD_isError(compressed_size)) {
|
||||||
fprintf(stderr, "Zstd compression failed: %s\n", ZSTD_getErrorName(compressed_size));
|
fprintf(stderr, "Zstd compression failed: %s\n", ZSTD_getErrorName(compressed_size));
|
||||||
@@ -2088,7 +2288,7 @@ static int start_audio_conversion(tev_encoder_t *enc) {
|
|||||||
char command[2048];
|
char command[2048];
|
||||||
snprintf(command, sizeof(command),
|
snprintf(command, sizeof(command),
|
||||||
"ffmpeg -v quiet -i \"%s\" -acodec libtwolame -psymodel 4 -b:a %dk -ar %d -ac 2 -y \"%s\" 2>/dev/null",
|
"ffmpeg -v quiet -i \"%s\" -acodec libtwolame -psymodel 4 -b:a %dk -ar %d -ac 2 -y \"%s\" 2>/dev/null",
|
||||||
enc->input_file, MP2_RATE_TABLE[enc->qualityIndex], MP2_SAMPLE_RATE, TEMP_AUDIO_FILE);
|
enc->input_file, enc->lossless_mode ? 384 : MP2_RATE_TABLE[enc->qualityIndex], MP2_SAMPLE_RATE, TEMP_AUDIO_FILE);
|
||||||
|
|
||||||
int result = system(command);
|
int result = system(command);
|
||||||
if (result == 0) {
|
if (result == 0) {
|
||||||
@@ -2236,15 +2436,16 @@ static void show_usage(const char *program_name) {
|
|||||||
printf(" -o, --output FILE Output video file (use '-' for stdout)\n");
|
printf(" -o, --output FILE Output video file (use '-' for stdout)\n");
|
||||||
printf(" -s, --size WxH Video size (default: %dx%d)\n", DEFAULT_WIDTH, DEFAULT_HEIGHT);
|
printf(" -s, --size WxH Video size (default: %dx%d)\n", DEFAULT_WIDTH, DEFAULT_HEIGHT);
|
||||||
printf(" -f, --fps N Output frames per second (enables frame rate conversion)\n");
|
printf(" -f, --fps N Output frames per second (enables frame rate conversion)\n");
|
||||||
printf(" -q, --quality N Quality level 0-4 (default: 2, only decides audio rate in quantiser mode)\n");
|
printf(" -q, --quality N Quality level 0-4 (default: 2, only decides audio rate in quantiser/lossless mode)\n");
|
||||||
printf(" -Q, --quantiser N Quantiser level 0-100 (100: lossless, 0: potato)\n");
|
printf(" -Q, --quantiser N Quantiser level 0-100 (100: lossless, 0: potato)\n");
|
||||||
// printf(" -b, --bitrate N Target bitrate in kbps (enables bitrate control mode; DON'T USE - NOT WORKING AS INTENDED)\n");
|
// printf(" -b, --bitrate N Target bitrate in kbps (enables bitrate control mode; DON'T USE - NOT WORKING AS INTENDED)\n");
|
||||||
printf(" -p, --progressive Use progressive scan (default: interlaced)\n");
|
printf(" -p, --progressive Use progressive scan (default: interlaced)\n");
|
||||||
printf(" -S, --subtitles FILE SubRip (.srt) or SAMI (.smi) subtitle file\n");
|
printf(" -S, --subtitles FILE SubRip (.srt) or SAMI (.smi) subtitle file\n");
|
||||||
printf(" -v, --verbose Verbose output\n");
|
printf(" -v, --verbose Verbose output\n");
|
||||||
printf(" -t, --test Test mode: generate solid colour frames\n");
|
printf(" -t, --test Test mode: generate solid colour frames\n");
|
||||||
|
printf(" --lossless Lossless mode: store coefficients as float16 (no quantisation, implies -p, 384k audio)\n");
|
||||||
|
printf(" --enable-rcf Enable per-block rate control (experimental)\n");
|
||||||
printf(" --enable-encode-stats Collect and report block complexity statistics\n");
|
printf(" --enable-encode-stats Collect and report block complexity statistics\n");
|
||||||
printf(" --disable-rcf Disable per-block rate control\n");
|
|
||||||
printf(" --help Show this help\n\n");
|
printf(" --help Show this help\n\n");
|
||||||
// printf("Rate Control Modes:\n");
|
// printf("Rate Control Modes:\n");
|
||||||
// printf(" Quality mode (default): Fixed quantisation based on -q parameter\n");
|
// printf(" Quality mode (default): Fixed quantisation based on -q parameter\n");
|
||||||
@@ -2334,7 +2535,8 @@ int main(int argc, char *argv[]) {
|
|||||||
{"verbose", no_argument, 0, 'v'},
|
{"verbose", no_argument, 0, 'v'},
|
||||||
{"test", no_argument, 0, 't'},
|
{"test", no_argument, 0, 't'},
|
||||||
{"enable-encode-stats", no_argument, 0, 1000},
|
{"enable-encode-stats", no_argument, 0, 1000},
|
||||||
{"disable-rcf", no_argument, 0, 1100},
|
{"enable-rcf", no_argument, 0, 1100},
|
||||||
|
{"lossless", no_argument, 0, 1200},
|
||||||
{"help", no_argument, 0, '?'},
|
{"help", no_argument, 0, '?'},
|
||||||
{0, 0, 0, 0}
|
{0, 0, 0, 0}
|
||||||
};
|
};
|
||||||
@@ -2403,11 +2605,14 @@ int main(int argc, char *argv[]) {
|
|||||||
case 't':
|
case 't':
|
||||||
test_mode = 1;
|
test_mode = 1;
|
||||||
break;
|
break;
|
||||||
case 1000: // --enable-encode-stats
|
case 1000: // --enable-encode-stats
|
||||||
enc->stats_mode = 1;
|
enc->stats_mode = 1;
|
||||||
break;
|
break;
|
||||||
case 1100: // --disable-rcf
|
case 1100: // --enable-rcf
|
||||||
enc->disable_rcf = 1;
|
enc->disable_rcf = 0;
|
||||||
|
break;
|
||||||
|
case 1200: // --lossless
|
||||||
|
enc->lossless_mode = 1;
|
||||||
break;
|
break;
|
||||||
case 0:
|
case 0:
|
||||||
if (strcmp(long_options[option_index].name, "help") == 0) {
|
if (strcmp(long_options[option_index].name, "help") == 0) {
|
||||||
@@ -2419,7 +2624,7 @@ int main(int argc, char *argv[]) {
|
|||||||
case 'Q':
|
case 'Q':
|
||||||
enc->qualityY = CLAMP(atoi(optarg), 0, 100);
|
enc->qualityY = CLAMP(atoi(optarg), 0, 100);
|
||||||
enc->qualityCo = enc->qualityY;
|
enc->qualityCo = enc->qualityY;
|
||||||
enc->qualityCg = (enc->qualityY == 100) ? enc->qualityY : enc->qualityCo >> 2;
|
enc->qualityCg = (enc->qualityY == 100) ? enc->qualityY : enc->qualityCo >> 1;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
show_usage(argv[0]);
|
show_usage(argv[0]);
|
||||||
@@ -2428,6 +2633,19 @@ int main(int argc, char *argv[]) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Lossless mode validation and adjustments
|
||||||
|
if (enc->lossless_mode) {
|
||||||
|
// In lossless mode, disable rate control and set quality to maximum
|
||||||
|
enc->bitrate_mode = 0;
|
||||||
|
enc->disable_rcf = 1;
|
||||||
|
enc->progressive_mode = 1;
|
||||||
|
enc->qualityIndex = 5;
|
||||||
|
enc->qualityY = enc->qualityCo = enc->qualityCg = 255; // Use 255 as a redundant lossless marker
|
||||||
|
if (enc->verbose) {
|
||||||
|
printf("Lossless mode enabled: Rate control disabled, quality set to maximum, enabling progressive scan\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// halve the internal representation of frame height
|
// halve the internal representation of frame height
|
||||||
if (!enc->progressive_mode) {
|
if (!enc->progressive_mode) {
|
||||||
enc->height /= 2;
|
enc->height /= 2;
|
||||||
|
|||||||
Reference in New Issue
Block a user