mirror of
https://github.com/curioustorvald/tsvm.git
synced 2026-03-11 13:41:50 +09:00
optimising and rotating DCT blocks
This commit is contained in:
@@ -17,8 +17,6 @@ class GraphicsJSR223Delegate(private val vm: VM) {
|
||||
private val idctTempBuffer = FloatArray(64)
|
||||
private val idct16TempBuffer = FloatArray(256) // For 16x16 IDCT
|
||||
private val idct16SeparableBuffer = FloatArray(256) // For separable 16x16 IDCT
|
||||
private val ycocgWorkArray = IntArray(256)
|
||||
private val rgbWorkArray = IntArray(256 * 3)
|
||||
|
||||
private fun getFirstGPU(): GraphicsAdapter? {
|
||||
return vm.findPeribyType(VM.PERITYPE_GPU_AND_TERM)?.peripheral as? GraphicsAdapter
|
||||
@@ -1567,39 +1565,43 @@ class GraphicsJSR223Delegate(private val vm: VM) {
|
||||
private fun tevIdct8x8_fast(coeffs: IntArray, quantTable: IntArray, isChromaResidual: Boolean = false): IntArray {
|
||||
val result = IntArray(64)
|
||||
// Reuse preallocated temp buffer to reduce GC pressure
|
||||
|
||||
// Direct IDCT implementation matching original loop structure
|
||||
// Process coefficients and dequantize
|
||||
for (u in 0 until 8) {
|
||||
for (v in 0 until 8) {
|
||||
val idx = u * 8 + v
|
||||
val coeff = if (isChromaResidual && idx == 0) {
|
||||
coeffs[idx].toFloat() // DC lossless for chroma residual
|
||||
} else {
|
||||
coeffs[idx] * quantTable[idx].toFloat()
|
||||
}
|
||||
idctTempBuffer[idx] = coeff
|
||||
}
|
||||
}
|
||||
|
||||
// Apply 2D inverse DCT with original loop structure: for x, for y
|
||||
for (x in 0 until 8) {
|
||||
for (y in 0 until 8) {
|
||||
|
||||
// Fast separable IDCT (row-column decomposition)
|
||||
// First pass: Process rows (8 1D IDCTs)
|
||||
for (row in 0 until 8) {
|
||||
for (col in 0 until 8) {
|
||||
var sum = 0f
|
||||
for (u in 0 until 8) {
|
||||
for (v in 0 until 8) {
|
||||
sum += dctBasis8[u][x] * dctBasis8[v][y] * idctTempBuffer[u * 8 + v]
|
||||
val coeffIdx = row * 8 + u
|
||||
val coeff = if (isChromaResidual && coeffIdx == 0) {
|
||||
coeffs[coeffIdx].toFloat() // DC lossless for chroma residual
|
||||
} else {
|
||||
coeffs[coeffIdx] * quantTable[coeffIdx].toFloat()
|
||||
}
|
||||
sum += dctBasis8[u][col] * coeff
|
||||
}
|
||||
idctTempBuffer[row * 8 + col] = sum
|
||||
}
|
||||
}
|
||||
|
||||
// Second pass: Process columns (8 1D IDCTs)
|
||||
for (col in 0 until 8) {
|
||||
for (row in 0 until 8) {
|
||||
var sum = 0f
|
||||
for (v in 0 until 8) {
|
||||
sum += dctBasis8[v][row] * idctTempBuffer[v * 8 + col]
|
||||
}
|
||||
|
||||
val pixel = if (isChromaResidual) {
|
||||
sum.coerceIn(-256f, 255f)
|
||||
} else {
|
||||
(sum + 128f).coerceIn(0f, 255f)
|
||||
}
|
||||
result[y * 8 + x] = pixel.toInt()
|
||||
// Fix indexing: col=x, row=y, so result[y * 8 + x]
|
||||
result[row * 8 + col] = pixel.toInt()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
@@ -1628,19 +1630,28 @@ class GraphicsJSR223Delegate(private val vm: VM) {
|
||||
}
|
||||
}
|
||||
|
||||
// Apply 2D inverse DCT with original loop structure: for x, for y (like original)
|
||||
// NOTE: Uses direct O(n⁴) method to ensure correct indexing. Separable version
|
||||
// could be 8x faster but requires careful coordinate transformation.
|
||||
for (x in 0 until 16) {
|
||||
for (y in 0 until 16) {
|
||||
// Fast separable IDCT: 8x performance improvement - but causes 90° rotation!
|
||||
// First pass: Process rows (16 1D IDCTs)
|
||||
for (row in 0 until 16) {
|
||||
for (col in 0 until 16) {
|
||||
var sum = 0f
|
||||
for (u in 0 until 16) {
|
||||
for (v in 0 until 16) {
|
||||
sum += dctBasis16[u][x] * dctBasis16[v][y] * idct16TempBuffer[u * 16 + v]
|
||||
}
|
||||
sum += dctBasis16[u][col] * idct16TempBuffer[row * 16 + u]
|
||||
}
|
||||
idct16SeparableBuffer[row * 16 + col] = sum
|
||||
}
|
||||
}
|
||||
|
||||
// Second pass: Process columns (16 1D IDCTs)
|
||||
for (col in 0 until 16) {
|
||||
for (row in 0 until 16) {
|
||||
var sum = 0f
|
||||
for (v in 0 until 16) {
|
||||
sum += dctBasis16[v][row] * idct16SeparableBuffer[v * 16 + col]
|
||||
}
|
||||
val pixel = (sum + 128f).coerceIn(0f, 255f)
|
||||
result[y * 16 + x] = pixel.toInt()
|
||||
// This indexing causes 90° rotation: row/col vs y/x mismatch
|
||||
result[row * 16 + col] = pixel.toInt()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user