From c50d015515ef3b5f7b50cdd0c27a764bf9446f81 Mon Sep 17 00:00:00 2001 From: minjaesong Date: Fri, 26 Sep 2025 17:17:48 +0900 Subject: [PATCH] TAV decoder for ffmpeg/ffplay --- .../torvald/tsvm/GraphicsJSR223Delegate.kt | 387 +--------- video_encoder/Makefile | 8 +- video_encoder/decoder_tav.c | 699 ++++++++++++++++++ video_encoder/encoder_tav.c | 83 +-- 4 files changed, 752 insertions(+), 425 deletions(-) create mode 100644 video_encoder/decoder_tav.c diff --git a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt index 62a29ab..7d88db9 100644 --- a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt +++ b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt @@ -52,6 +52,7 @@ import kotlin.collections.isNotEmpty import kotlin.collections.listOf import kotlin.collections.map import kotlin.collections.maxOfOrNull +import kotlin.collections.minus import kotlin.collections.mutableListOf import kotlin.collections.mutableMapOf import kotlin.collections.set @@ -67,37 +68,13 @@ import kotlin.let import kotlin.longArrayOf import kotlin.math.* import kotlin.repeat +import kotlin.sequences.minus import kotlin.text.format import kotlin.text.lowercase import kotlin.text.toString +import kotlin.times class GraphicsJSR223Delegate(private val vm: VM) { - - // TAV Simulated overlapping tiles constants (must match encoder) - private val TILE_SIZE_X = 280 - private val TILE_SIZE_Y = 224 - private val TAV_TILE_MARGIN = 32 // 32-pixel margin for 3 DWT levels (4 * 2^3 = 32px) - private val PADDED_TILE_SIZE_X = TILE_SIZE_X + 2 * TAV_TILE_MARGIN // 280 + 64 = 344px - private val PADDED_TILE_SIZE_Y = TILE_SIZE_Y + 2 * TAV_TILE_MARGIN // 224 + 64 = 288px - - // Reusable working arrays to reduce allocation overhead - private val tevIdct8TempBuffer = FloatArray(64) - private val tevIdct16TempBuffer = FloatArray(256) // For 16x16 IDCT - private val tevIdct16SeparableBuffer = FloatArray(256) // For separable 16x16 IDCT - - // TAV coefficient delta storage for previous frame (for efficient P-frames) - private var tavPreviousCoeffsY: MutableMap? = null - private var tavPreviousCoeffsCo: MutableMap? = null - private var tavPreviousCoeffsCg: MutableMap? = null - - // TAV Perceptual dequantisation support (must match encoder weights) - data class DWTSubbandInfo( - val level: Int, // Decomposition level (1 to decompLevels) - val subbandType: Int, // 0=LL, 1=LH, 2=HL, 3=HH - val coeffStart: Int, // Starting index in linear coefficient array - val coeffCount: Int, // Number of coefficients in this subband - val perceptualWeight: Float // Quantisation multiplier for this subband - ) private fun getFirstGPU(): GraphicsAdapter? { return vm.findPeribyType(VM.PERITYPE_GPU_AND_TERM)?.peripheral as? GraphicsAdapter @@ -1352,6 +1329,11 @@ class GraphicsJSR223Delegate(private val vm: VM) { // TEV (TSVM Enhanced Video) format support // Created by Claude on 2025-08-17 + // Reusable working arrays to reduce allocation overhead + private val tevIdct8TempBuffer = FloatArray(64) + private val tevIdct16TempBuffer = FloatArray(256) // For 16x16 IDCT + private val tevIdct16SeparableBuffer = FloatArray(256) // For separable 16x16 IDCT + fun jpeg_quality_to_mult(q: Float): Float { return (if ((q < 50)) 5000f / q else 200f - 2 * q) / 100f } @@ -3881,6 +3863,28 @@ class GraphicsJSR223Delegate(private val vm: VM) { // ================= TAV (TSVM Advanced Video) Decoder ================= // DWT-based video codec with ICtCp colour space support + // TAV Simulated overlapping tiles constants (must match encoder) + private val TILE_SIZE_X = 280 + private val TILE_SIZE_Y = 224 + private val TAV_TILE_MARGIN = 32 // 32-pixel margin for 3 DWT levels (4 * 2^3 = 32px) + private val PADDED_TILE_SIZE_X = TILE_SIZE_X + 2 * TAV_TILE_MARGIN // 280 + 64 = 344px + private val PADDED_TILE_SIZE_Y = TILE_SIZE_Y + 2 * TAV_TILE_MARGIN // 224 + 64 = 288px + + // TAV coefficient delta storage for previous frame (for efficient P-frames) + private var tavPreviousCoeffsY: MutableMap? = null + private var tavPreviousCoeffsCo: MutableMap? = null + private var tavPreviousCoeffsCg: MutableMap? = null + + // TAV Perceptual dequantisation support (must match encoder weights) + data class DWTSubbandInfo( + val level: Int, // Decomposition level (1 to decompLevels) + val subbandType: Int, // 0=LL, 1=LH, 2=HL, 3=HH + val coeffStart: Int, // Starting index in linear coefficient array + val coeffCount: Int, // Number of coefficients in this subband + val perceptualWeight: Float // Quantisation multiplier for this subband + ) + + // TAV Perceptual dequantisation helper functions (must match encoder implementation exactly) private fun calculateSubbandLayout(width: Int, height: Int, decompLevels: Int): List { val subbands = mutableListOf() @@ -3946,149 +3950,6 @@ class GraphicsJSR223Delegate(private val vm: VM) { return subbands } - private fun getPerceptualWeightModel2(level: Int, subbandType: Int, isChroma: Boolean, maxLevels: Int): Float { - // Psychovisual model based on DWT coefficient statistics and Human Visual System sensitivity - - if (!isChroma) { - // LUMA CHANNEL: Based on statistical analysis from real video content - when (subbandType) { - 0 -> { // LL subband - contains most image energy, preserve carefully - return when { - level >= 6 -> 0.5f // LL6: High energy but can tolerate moderate quantisation (range up to 22K) - level >= 5 -> 0.7f // LL5: Good preservation - else -> 0.9f // Lower LL levels: Fine preservation - } - } - 1 -> { // LH subband - horizontal details (human eyes more sensitive) - return when { - level >= 6 -> 0.8f // LH6: Significant coefficients (max ~500), preserve well - level >= 5 -> 1.0f // LH5: Moderate coefficients (max ~600) - level >= 4 -> 1.2f // LH4: Small coefficients (max ~50) - level >= 3 -> 1.6f // LH3: Very small coefficients, can quantize more - level >= 2 -> 2.0f // LH2: Minimal impact - else -> 2.5f // LH1: Least important - } - } - 2 -> { // HL subband - vertical details (less sensitive due to HVS characteristics) - return when { - level >= 6 -> 1.0f // HL6: Can quantize more aggressively than LH6 - level >= 5 -> 1.2f // HL5: Standard quantisation - level >= 4 -> 1.5f // HL4: Notable range but less critical - level >= 3 -> 2.0f // HL3: Can tolerate more quantisation - level >= 2 -> 2.5f // HL2: Less important - else -> 3.5f // HL1: Most aggressive for vertical details - } - } - 3 -> { // HH subband - diagonal details (least important for HVS) - return when { - level >= 6 -> 1.2f // HH6: Preserve some diagonal detail - level >= 5 -> 1.6f // HH5: Can quantize aggressively - level >= 4 -> 2.0f // HH4: Very aggressive - level >= 3 -> 2.8f // HH3: Minimal preservation - level >= 2 -> 3.5f // HH2: Maximum compression - else -> 5.0f // HH1: Most aggressive quantisation - } - } - } - } else { - // CHROMA CHANNELS: Less critical for human perception, more aggressive quantisation - when (subbandType) { - 0 -> { // LL chroma - still important but less than luma - return 1f - return when { - level >= 6 -> 0.8f // Chroma LL6: Less critical than luma LL - level >= 5 -> 0.9f - else -> 1.0f - } - } - 1 -> { // LH chroma - horizontal chroma details - return 1.8f - return when { - level >= 6 -> 1.0f - level >= 5 -> 1.2f - level >= 4 -> 1.4f - level >= 3 -> 1.6f - level >= 2 -> 1.8f - else -> 2.0f - } - } - 2 -> { // HL chroma - vertical chroma details (even less critical) - return 1.3f; - return when { - level >= 6 -> 1.2f - level >= 5 -> 1.4f - level >= 4 -> 1.6f - level >= 3 -> 1.8f - level >= 2 -> 2.0f - else -> 2.2f - } - } - 3 -> { // HH chroma - diagonal chroma details (most aggressive) - return 2.5f - return when { - level >= 6 -> 1.4f - level >= 5 -> 1.6f - level >= 4 -> 1.8f - level >= 3 -> 2.1f - level >= 2 -> 2.3f - else -> 2.5f - } - } - } - } - return 1.0f - - // Legacy data-driven model (kept for reference but not used) - /*if (!isChroma) { - // Luma strategy based on statistical variance analysis from real video data - return when (subbandType) { - 0 -> { // LL - // LL6 has extremely high variance (Range=8026.7) but contains most image energy - // Moderate quantisation appropriate due to high variance tolerance - 1.1f - } - 1 -> { // LH (horizontal detail) - // Data-driven weights based on observed coefficient patterns - when (level) { - in 6..maxLevels -> 0.7f // LH6: significant coefficients (Range=243.1) - 5 -> 0.8f // LH5: moderate coefficients (Range=264.3) - 4 -> 1.0f // LH4: small coefficients (Range=50.8) - 3 -> 1.4f // LH3: sparse but large outliers (Range=11909.1) - 2 -> 1.6f // LH2: fewer coefficients (Range=6720.2) - else -> 1.9f // LH1: smallest detail (Range=1606.3) - } - } - 2 -> { // HL (vertical detail) - // Similar pattern to LH but slightly different variance - when (level) { - in 6..maxLevels -> 0.8f // HL6: moderate coefficients (Range=181.6) - 5 -> 0.9f // HL5: small coefficients (Range=80.4) - 4 -> 1.2f // HL4: surprising large outliers (Range=9737.9) - 3 -> 1.3f // HL3: very large outliers (Range=13698.2) - 2 -> 1.5f // HL2: moderate range (Range=2099.4) - else -> 1.8f // HL1: small coefficients (Range=851.1) - } - } - 3 -> { // HH (diagonal detail) - // HH bands generally have lower energy but important for texture - when (level) { - in 6..maxLevels -> 1.0f // HH6: some significant coefficients (Range=95.8) - 5 -> 1.1f // HH5: small coefficients (Range=75.9) - 4 -> 1.3f // HH4: moderate range (Range=89.8) - 3 -> 1.5f // HH3: large outliers (Range=11611.2) - 2 -> 1.8f // HH2: moderate range (Range=2499.2) - else -> 2.1f // HH1: smallest coefficients (Range=761.6) - } - } - else -> 1.0f - } - } else { - // Chroma strategy - apply 0.85x reduction to luma weights for color preservation - val lumaWeight = getPerceptualWeight(level, subbandType, false, maxLevels) - return lumaWeight * 1.6f - }*/ - } - var ANISOTROPY_MULT = floatArrayOf(1.8f, 1.6f, 1.4f, 1.2f, 1.0f, 1.0f) var ANISOTROPY_BIAS = floatArrayOf(0.2f, 0.1f, 0.0f, 0.0f, 0.0f, 0.0f) var ANISOTROPY_MULT_CHROMA = floatArrayOf(6.6f, 5.5f, 4.4f, 3.3f, 2.2f, 1.1f) @@ -4096,7 +3957,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { - private fun perceptual_model3_LH(quality: Int, level: Int): Float { + private fun perceptual_model3_LH(quality: Int, level: Float): Float { val H4 = 1.2f val Lx = H4 - ((quality + 1f) / 15f) * (level - 4f) val Ld = (quality + 1f) / -15f @@ -4114,14 +3975,14 @@ class GraphicsJSR223Delegate(private val vm: VM) { return (HL / LH) * 1.44f; } - fun perceptual_model3_LL(quality: Int, level: Int): Float { + fun perceptual_model3_LL(quality: Int, level: Float): Float { val n = perceptual_model3_LH(quality, level) val m = perceptual_model3_LH(quality, level - 1) / n return n / m } - fun perceptual_model3_chroma_basecurve(quality: Int, level: Int): Float { + fun perceptual_model3_chroma_basecurve(quality: Int, level: Float): Float { return 1.0f - (1.0f / (0.5f * quality * quality + 1.0f)) * (level - 4f) // just a line that passes (4,1) } @@ -4140,9 +4001,12 @@ class GraphicsJSR223Delegate(private val vm: VM) { } // level is one-based index - private fun getPerceptualWeight(qIndex: Int, qYGlobal: Int, level: Int, subbandType: Int, isChroma: Boolean, maxLevels: Int): Float { + private fun getPerceptualWeight(qIndex: Int, qYGlobal: Int, level0: Int, subbandType: Int, isChroma: Boolean, maxLevels: Int): Float { // Psychovisual model based on DWT coefficient statistics and Human Visual System sensitivity + val level = 1.0f + ((level0 - 1.0f) / (maxLevels - 1.0f)) * 5.0f + + val qualityLevel = tavDeriveEncoderQindex(qIndex, qYGlobal) if (!isChroma) { @@ -4157,10 +4021,10 @@ class GraphicsJSR223Delegate(private val vm: VM) { // HL subband - vertical details val HL: Float = perceptual_model3_HL(qualityLevel, LH) - if (subbandType == 2) return HL * (if (level == 2) TWO_PIXEL_DETAILER else if (level == 3) FOUR_PIXEL_DETAILER else 1f) + if (subbandType == 2) return HL * (if (level in 1.8f..2.2f) TWO_PIXEL_DETAILER else if (level in 2.8f..3.2f) FOUR_PIXEL_DETAILER else 1f) // HH subband - diagonal details - else return perceptual_model3_HH(LH, HL) * (if (level == 2) TWO_PIXEL_DETAILER else if (level == 3) FOUR_PIXEL_DETAILER else 1f) + else return perceptual_model3_HH(LH, HL) * (if (level in 1.8f..2.2f) TWO_PIXEL_DETAILER else if (level in 2.8f..3.2f) FOUR_PIXEL_DETAILER else 1f) } else { // CHROMA CHANNELS: Less critical for human perception, more aggressive quantisation @@ -4854,51 +4718,6 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } - private fun tavAddYCoCgResidualToRGBTile(tileX: Int, tileY: Int, yRes: FloatArray, coRes: FloatArray, cgRes: FloatArray, - rgbAddr: Long, width: Int, height: Int) { - val startX = tileX * TILE_SIZE_X - val startY = tileY * TILE_SIZE_Y - - for (y in 0 until TILE_SIZE_Y) { - for (x in 0 until TILE_SIZE_X) { - val frameX = startX + x - val frameY = startY + y - - if (frameX < width && frameY < height) { - val tileIdx = y * TILE_SIZE_X + x - val pixelIdx = frameY * width + frameX - val rgbOffset = pixelIdx * 3L - - // Get current RGB (from motion compensation) - val curR = (vm.peek(rgbAddr + rgbOffset).toInt() and 0xFF).toFloat() - val curG = (vm.peek(rgbAddr + rgbOffset + 1).toInt() and 0xFF).toFloat() - val curB = (vm.peek(rgbAddr + rgbOffset + 2).toInt() and 0xFF).toFloat() - - // Convert current RGB back to YCoCg - val co = (curR - curB) / 2 - val tmp = curB + co - val cg = (curG - tmp) / 2 - val yPred = tmp + cg - - // Add residual - val yFinal = yPred + yRes[tileIdx] - val coFinal = co + coRes[tileIdx] - val cgFinal = cg + cgRes[tileIdx] - - // Convert back to RGB - val tmpFinal = yFinal - cgFinal - val gFinal = yFinal + cgFinal - val bFinal = tmpFinal - coFinal - val rFinal = tmpFinal + coFinal - - vm.poke(rgbAddr + rgbOffset, rFinal.toInt().coerceIn(0, 255).toByte()) - vm.poke(rgbAddr + rgbOffset + 1, gFinal.toInt().coerceIn(0, 255).toByte()) - vm.poke(rgbAddr + rgbOffset + 2, bFinal.toInt().coerceIn(0, 255).toByte()) - } - } - } - } - // Helper functions (simplified versions of existing DWT functions) private fun tavCopyTileRGB(tileX: Int, tileY: Int, currentRGBAddr: Long, prevRGBAddr: Long, width: Int, height: Int) { val startX = tileX * TILE_SIZE_X @@ -4970,77 +4789,11 @@ class GraphicsJSR223Delegate(private val vm: VM) { } } - // Helper functions for perceptual models (simplified versions of encoder models) - private fun getPerceptualModelLL(qualityLevel: Int, level: Int): Float { - // Simplified LL model - preserve DC components - return 1.0f - (level.toFloat() / 8.0f) * (qualityLevel.toFloat() / 6.0f) - } - - private fun getPerceptualModelLH(qualityLevel: Int, level: Int): Float { - // Simplified LH model - horizontal details - return 1.2f + (level.toFloat() / 4.0f) * (qualityLevel.toFloat() / 3.0f) - } - - private fun getPerceptualModelHL(qualityLevel: Int, lhWeight: Float): Float { - // Simplified HL model - vertical details - return lhWeight * 1.1f - } - - private fun getPerceptualModelHH(lhWeight: Float, hlWeight: Float): Float { - // Simplified HH model - diagonal details - return (lhWeight + hlWeight) * 0.6f - } - private fun getPerceptualModelChromaBase(qualityLevel: Int, level: Int): Float { // Simplified chroma base curve return 1.0f - (1.0f / (0.5f * qualityLevel * qualityLevel + 1.0f)) * (level - 4.0f) } - // Determine delta-specific perceptual weight for coefficient at linear position - private fun getPerceptualWeightForPositionDelta(qualityLevel: Int, linearIdx: Int, width: Int, height: Int, decompLevels: Int, isChroma: Boolean): Float { - // Map linear coefficient index to DWT subband using same layout as encoder - var offset = 0 - - // First: LL subband at maximum decomposition level - val llWidth = width shr decompLevels - val llHeight = height shr decompLevels - val llSize = llWidth * llHeight - - if (linearIdx < offset + llSize) { - // LL subband at maximum level - use delta-specific perceptual weight - return getPerceptualWeightDelta(qualityLevel, decompLevels, 0, isChroma, decompLevels) - } - offset += llSize - - // Then: LH, HL, HH subbands for each level from max down to 1 - for (level in decompLevels downTo 1) { - val levelWidth = width shr (decompLevels - level + 1) - val levelHeight = height shr (decompLevels - level + 1) - val subbandSize = levelWidth * levelHeight - - // LH subband (horizontal details) - if (linearIdx < offset + subbandSize) { - return getPerceptualWeightDelta(qualityLevel, level, 1, isChroma, decompLevels) - } - offset += subbandSize - - // HL subband (vertical details) - if (linearIdx < offset + subbandSize) { - return getPerceptualWeightDelta(qualityLevel, level, 2, isChroma, decompLevels) - } - offset += subbandSize - - // HH subband (diagonal details) - if (linearIdx < offset + subbandSize) { - return getPerceptualWeightDelta(qualityLevel, level, 3, isChroma, decompLevels) - } - offset += subbandSize - } - - // Fallback for out-of-bounds indices - return 1.0f - } - private fun tavDecodeDeltaTileRGB(readPtr: Long, tileX: Int, tileY: Int, currentRGBAddr: Long, width: Int, height: Int, qY: Int, qCo: Int, qCg: Int, waveletFilter: Int, decompLevels: Int, isLossless: Boolean, tavVersion: Int, isMonoblock: Boolean = false): Long { @@ -5199,68 +4952,6 @@ class GraphicsJSR223Delegate(private val vm: VM) { return ptr } - private fun tavApplyMotionCompensationRGB(tileX: Int, tileY: Int, mvX: Int, mvY: Int, - currentRGBAddr: Long, prevRGBAddr: Long, - width: Int, height: Int) { - val startX = tileX * TILE_SIZE_X - val startY = tileY * TILE_SIZE_Y - - // Motion vectors in quarter-pixel precision - val refX = startX + (mvX / 4.0f) - val refY = startY + (mvY / 4.0f) - - for (y in 0 until TILE_SIZE_Y) { - for (x in 0 until TILE_SIZE_X) { - val currentPixelIdx = (startY + y) * width + (startX + x) - - if (currentPixelIdx >= 0 && currentPixelIdx < width * height) { - // Bilinear interpolation for sub-pixel motion vectors - val srcX = refX + x - val srcY = refY + y - - val interpolatedRGB = tavBilinearInterpolateRGB(prevRGBAddr, width, height, srcX, srcY) - - val rgbOffset = currentPixelIdx * 3L - vm.poke(currentRGBAddr + rgbOffset, interpolatedRGB[0]) - vm.poke(currentRGBAddr + rgbOffset + 1, interpolatedRGB[1]) - vm.poke(currentRGBAddr + rgbOffset + 2, interpolatedRGB[2]) - } - } - } - } - - private fun tavBilinearInterpolateRGB(rgbPtr: Long, width: Int, height: Int, x: Float, y: Float): ByteArray { - val x0 = kotlin.math.floor(x).toInt() - val y0 = kotlin.math.floor(y).toInt() - val x1 = x0 + 1 - val y1 = y0 + 1 - - if (x0 < 0 || y0 < 0 || x1 >= width || y1 >= height) { - return byteArrayOf(0, 0, 0) // Out of bounds - return black - } - - val fx = x - x0 - val fy = y - y0 - - // Get 4 corner pixels - val rgb00 = getRGBPixel(rgbPtr, y0 * width + x0) - val rgb10 = getRGBPixel(rgbPtr, y0 * width + x1) - val rgb01 = getRGBPixel(rgbPtr, y1 * width + x0) - val rgb11 = getRGBPixel(rgbPtr, y1 * width + x1) - - // Bilinear interpolation - val result = ByteArray(3) - for (c in 0..2) { - val interp = (1 - fx) * (1 - fy) * (rgb00[c].toInt() and 0xFF) + - fx * (1 - fy) * (rgb10[c].toInt() and 0xFF) + - (1 - fx) * fy * (rgb01[c].toInt() and 0xFF) + - fx * fy * (rgb11[c].toInt() and 0xFF) - result[c] = interp.toInt().coerceIn(0, 255).toByte() - } - - return result - } - private fun getRGBPixel(rgbPtr: Long, pixelIdx: Int): ByteArray { val offset = pixelIdx * 3L return byteArrayOf( diff --git a/video_encoder/Makefile b/video_encoder/Makefile index 5fc6ae0..a42c219 100644 --- a/video_encoder/Makefile +++ b/video_encoder/Makefile @@ -6,7 +6,7 @@ CFLAGS = -std=c99 -Wall -Wextra -O2 -D_GNU_SOURCE LIBS = -lm -lzstd # Source files and targets -TARGETS = tev tav +TARGETS = tev tav tav_decoder # Build all encoders all: $(TARGETS) @@ -20,8 +20,9 @@ tav: encoder_tav.c rm -f encoder_tav $(CC) $(CFLAGS) -o encoder_tav $< $(LIBS) -# Default target -$(TARGETS): all +tav_decoder: decoder_tav.c + rm -f decoder_tav + $(CC) $(CFLAGS) -o decoder_tav $< $(LIBS) # Build with debug symbols debug: CFLAGS += -g -DDEBUG @@ -35,6 +36,7 @@ clean: install: $(TARGETS) cp encoder_tev /usr/local/bin/ cp encoder_tav /usr/local/bin/ + cp decoder_tav /usr/local/bin/ # Check for required dependencies check-deps: diff --git a/video_encoder/decoder_tav.c b/video_encoder/decoder_tav.c new file mode 100644 index 0000000..d2781b0 --- /dev/null +++ b/video_encoder/decoder_tav.c @@ -0,0 +1,699 @@ +// TAV Decoder - Working version with TSVM inverse DWT +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// TAV format constants +#define TAV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x41\x56" +#define TAV_MODE_SKIP 0x00 +#define TAV_MODE_INTRA 0x01 +#define TAV_MODE_DELTA 0x02 +#define TAV_PACKET_IFRAME 0x10 +#define TAV_PACKET_PFRAME 0x11 +#define TAV_PACKET_AUDIO_MP2 0x20 +#define TAV_PACKET_SUBTITLE 0x30 +#define TAV_PACKET_SYNC 0xFF + +// Utility macros +static inline int CLAMP(int x, int min, int max) { + return x < min ? min : (x > max ? max : x); +} + +// TAV header structure (32 bytes) +typedef struct { + uint8_t magic[8]; + uint8_t version; + uint16_t width; + uint16_t height; + uint8_t fps; + uint32_t total_frames; + uint8_t wavelet_filter; + uint8_t decomp_levels; + uint8_t quantiser_y; + uint8_t quantiser_co; + uint8_t quantiser_cg; + uint8_t extra_flags; + uint8_t video_flags; + uint8_t encoder_quality; + uint8_t file_role; + uint8_t reserved[5]; +} __attribute__((packed)) tav_header_t; + +// Decoder state +typedef struct { + FILE *input_fp; + FILE *audio_output_fp; // For MP2 audio output when using -p flag + tav_header_t header; + uint8_t *current_frame_rgb; + uint8_t *reference_frame_rgb; + float *dwt_buffer_y; + float *dwt_buffer_co; + float *dwt_buffer_cg; + float *reference_ycocg_y; // Reference frame in YCoCg float space + float *reference_ycocg_co; + float *reference_ycocg_cg; + int frame_count; + int frame_size; +} tav_decoder_t; + +// 9/7 inverse DWT (from TSVM Kotlin code) +static void dwt_97_inverse_1d(float *data, int length) { + if (length < 2) return; + + float *temp = malloc(length * sizeof(float)); + int half = (length + 1) / 2; + + // Split into low and high frequency components (matching TSVM layout) + for (int i = 0; i < half; i++) { + temp[i] = data[i]; // Low-pass coefficients (first half) + } + for (int i = 0; i < length / 2; i++) { + if (half + i < length) { + temp[half + i] = data[half + i]; // High-pass coefficients (second half) + } + } + + // 9/7 inverse lifting coefficients from TSVM + const float alpha = -1.586134342f; + const float beta = -0.052980118f; + const float gamma = 0.882911076f; + const float delta = 0.443506852f; + const float K = 1.230174105f; + + // Step 1: Undo scaling + for (int i = 0; i < half; i++) { + temp[i] /= K; // Low-pass coefficients + } + for (int i = 0; i < length / 2; i++) { + if (half + i < length) { + temp[half + i] *= K; // High-pass coefficients + } + } + + // Step 2: Undo δ update + for (int i = 0; i < half; i++) { + float d_curr = (half + i < length) ? temp[half + i] : 0.0f; + float d_prev = (i > 0 && half + i - 1 < length) ? temp[half + i - 1] : d_curr; + temp[i] -= delta * (d_curr + d_prev); + } + + // Step 3: Undo γ predict + for (int i = 0; i < length / 2; i++) { + if (half + i < length) { + float s_curr = temp[i]; + float s_next = (i + 1 < half) ? temp[i + 1] : s_curr; + temp[half + i] -= gamma * (s_curr + s_next); + } + } + + // Step 4: Undo β update + for (int i = 0; i < half; i++) { + float d_curr = (half + i < length) ? temp[half + i] : 0.0f; + float d_prev = (i > 0 && half + i - 1 < length) ? temp[half + i - 1] : d_curr; + temp[i] -= beta * (d_curr + d_prev); + } + + // Step 5: Undo α predict + for (int i = 0; i < length / 2; i++) { + if (half + i < length) { + float s_curr = temp[i]; + float s_next = (i + 1 < half) ? temp[i + 1] : s_curr; + temp[half + i] -= alpha * (s_curr + s_next); + } + } + + // Reconstruction - interleave low and high pass + for (int i = 0; i < length; i++) { + if (i % 2 == 0) { + // Even positions: low-pass coefficients + data[i] = temp[i / 2]; + } else { + // Odd positions: high-pass coefficients + int idx = i / 2; + if (half + idx < length) { + data[i] = temp[half + idx]; + } else { + data[i] = 0.0f; + } + } + } + + free(temp); +} + +// 5/3 inverse DWT (simplified for testing) +static void dwt_53_inverse_1d(float *data, int length) { + if (length < 2) return; + + // For now, use a simplified version + // TODO: Implement proper 5/3 from TSVM if needed + dwt_97_inverse_1d(data, length); +} + +// Multi-level inverse DWT (fixed to match TSVM exactly) +static void apply_inverse_dwt_multilevel(float *data, int width, int height, int levels, int filter_type) { + int max_size = (width > height) ? width : height; + float *temp_row = malloc(max_size * sizeof(float)); + float *temp_col = malloc(max_size * sizeof(float)); + + // TSVM: for (level in levels - 1 downTo 0) + for (int level = levels - 1; level >= 0; level--) { + // TSVM: val currentWidth = width shr level + int current_width = width >> level; + int current_height = height >> level; + + // Handle edge cases + if (current_width < 1 || current_height < 1) continue; + if (current_width == 1 && current_height == 1) continue; + + // TSVM: Column inverse transform first (vertical) + for (int x = 0; x < current_width; x++) { + for (int y = 0; y < current_height; y++) { + // TSVM applies sharpenFilter multiplier, we'll skip for now + temp_col[y] = data[y * width + x]; + } + + if (filter_type == 0) { // 5/3 reversible + dwt_53_inverse_1d(temp_col, current_height); + } else { // 9/7 irreversible + dwt_97_inverse_1d(temp_col, current_height); + } + + for (int y = 0; y < current_height; y++) { + data[y * width + x] = temp_col[y]; + } + } + + // TSVM: Row inverse transform second (horizontal) + for (int y = 0; y < current_height; y++) { + for (int x = 0; x < current_width; x++) { + // TSVM applies sharpenFilter multiplier, we'll skip for now + temp_row[x] = data[y * width + x]; + } + + if (filter_type == 0) { // 5/3 reversible + dwt_53_inverse_1d(temp_row, current_width); + } else { // 9/7 irreversible + dwt_97_inverse_1d(temp_row, current_width); + } + + for (int x = 0; x < current_width; x++) { + data[y * width + x] = temp_row[x]; + } + } + } + + free(temp_row); + free(temp_col); +} + +// YCoCg-R to RGB conversion (from TSVM) +static void ycocg_r_to_rgb(float y, float co, float cg, uint8_t *r, uint8_t *g, uint8_t *b) { + float tmp = y - cg / 2.0f; + float g_val = cg + tmp; + float b_val = tmp - co / 2.0f; + float r_val = co + b_val; + + *r = CLAMP((int)(r_val + 0.5f), 0, 255); + *g = CLAMP((int)(g_val + 0.5f), 0, 255); + *b = CLAMP((int)(b_val + 0.5f), 0, 255); +} + +// Initialize decoder +static tav_decoder_t* tav_decoder_init(const char *input_file) { + tav_decoder_t *decoder = calloc(1, sizeof(tav_decoder_t)); + if (!decoder) return NULL; + + decoder->input_fp = fopen(input_file, "rb"); + if (!decoder->input_fp) { + free(decoder); + return NULL; + } + + // Read header + if (fread(&decoder->header, sizeof(tav_header_t), 1, decoder->input_fp) != 1) { + fclose(decoder->input_fp); + free(decoder); + return NULL; + } + + // Verify magic + if (memcmp(decoder->header.magic, TAV_MAGIC, 8) != 0) { + fclose(decoder->input_fp); + free(decoder); + return NULL; + } + + decoder->frame_size = decoder->header.width * decoder->header.height; + + // Allocate buffers + decoder->current_frame_rgb = calloc(decoder->frame_size * 3, 1); + decoder->reference_frame_rgb = calloc(decoder->frame_size * 3, 1); + decoder->dwt_buffer_y = calloc(decoder->frame_size, sizeof(float)); + decoder->dwt_buffer_co = calloc(decoder->frame_size, sizeof(float)); + decoder->dwt_buffer_cg = calloc(decoder->frame_size, sizeof(float)); + decoder->reference_ycocg_y = calloc(decoder->frame_size, sizeof(float)); + decoder->reference_ycocg_co = calloc(decoder->frame_size, sizeof(float)); + decoder->reference_ycocg_cg = calloc(decoder->frame_size, sizeof(float)); + + return decoder; +} + +// Cleanup decoder +static void tav_decoder_free(tav_decoder_t *decoder) { + if (!decoder) return; + + if (decoder->input_fp) fclose(decoder->input_fp); + free(decoder->current_frame_rgb); + free(decoder->reference_frame_rgb); + free(decoder->dwt_buffer_y); + free(decoder->dwt_buffer_co); + free(decoder->dwt_buffer_cg); + free(decoder->reference_ycocg_y); + free(decoder->reference_ycocg_co); + free(decoder->reference_ycocg_cg); + free(decoder); +} + +// Decode a single frame +static int decode_frame(tav_decoder_t *decoder) { + uint8_t packet_type; + uint32_t packet_size; + + // Check file position before reading + long file_pos = ftell(decoder->input_fp); + + // Read packet header + if (fread(&packet_type, 1, 1, decoder->input_fp) != 1) { + fprintf(stderr, "EOF at frame %d (file pos: %ld)\n", decoder->frame_count, file_pos); + return 0; // EOF + } + + // Sync packets have no size field - they're just a single 0xFF byte + if (packet_type == TAV_PACKET_SYNC) { + if (decoder->frame_count < 5) { + fprintf(stderr, "Found sync packet 0xFF at pos %ld\n", file_pos); + } + return decode_frame(decoder); // Immediately try next packet + } + + // All other packets have a 4-byte size field + if (fread(&packet_size, 4, 1, decoder->input_fp) != 1) { + fprintf(stderr, "Error reading packet size at frame %d (file pos: %ld)\n", decoder->frame_count, file_pos); + return -1; // Error + } + + // Debug: Show packet info for first few frames + if (decoder->frame_count < 5) { + fprintf(stderr, "Frame %d: packet_type=0x%02X, size=%u (file pos: %ld)\n", + decoder->frame_count, packet_type, packet_size, file_pos); + } + + // Handle audio packets when using FFplay mode + if (packet_type == TAV_PACKET_AUDIO_MP2) { + if (decoder->audio_output_fp) { + // Read and write MP2 audio data directly + uint8_t *audio_data = malloc(packet_size); + if (fread(audio_data, 1, packet_size, decoder->input_fp) == packet_size) { + fwrite(audio_data, 1, packet_size, decoder->audio_output_fp); + fflush(decoder->audio_output_fp); + } + free(audio_data); + } else { + // Skip audio packets in normal mode + if (decoder->frame_count < 5) { + long before_skip = ftell(decoder->input_fp); + fprintf(stderr, "Skipping non-video packet: type=0x%02X, size=%u (pos: %ld)\n", packet_type, packet_size, before_skip); + fseek(decoder->input_fp, packet_size, SEEK_CUR); + long after_skip = ftell(decoder->input_fp); + fprintf(stderr, "After skip: pos=%ld (moved %ld bytes)\n", after_skip, after_skip - before_skip); + } else { + fseek(decoder->input_fp, packet_size, SEEK_CUR); + } + } + return decode_frame(decoder); + } + + // Skip subtitle packets + if (packet_type == TAV_PACKET_SUBTITLE) { + if (decoder->frame_count < 5) { + long before_skip = ftell(decoder->input_fp); + fprintf(stderr, "Skipping subtitle packet: type=0x%02X, size=%u (pos: %ld)\n", packet_type, packet_size, before_skip); + fseek(decoder->input_fp, packet_size, SEEK_CUR); + long after_skip = ftell(decoder->input_fp); + fprintf(stderr, "After skip: pos=%ld (moved %ld bytes)\n", after_skip, after_skip - before_skip); + } else { + fseek(decoder->input_fp, packet_size, SEEK_CUR); + } + return decode_frame(decoder); + } + + if (packet_type != TAV_PACKET_IFRAME && packet_type != TAV_PACKET_PFRAME) { + fprintf(stderr, "Unknown packet type: 0x%02X (expected 0x%02X for audio)\n", packet_type, TAV_PACKET_AUDIO_MP2); + return -1; + } + + // Read and decompress frame data + uint8_t *compressed_data = malloc(packet_size); + if (fread(compressed_data, 1, packet_size, decoder->input_fp) != packet_size) { + free(compressed_data); + return -1; + } + + size_t decompressed_size = ZSTD_getFrameContentSize(compressed_data, packet_size); + if (decompressed_size == ZSTD_CONTENTSIZE_ERROR || decompressed_size == ZSTD_CONTENTSIZE_UNKNOWN) { + decompressed_size = decoder->frame_size * 3 * sizeof(int16_t) + 1024; + } + + uint8_t *decompressed_data = malloc(decompressed_size); + size_t actual_size = ZSTD_decompress(decompressed_data, decompressed_size, compressed_data, packet_size); + + if (ZSTD_isError(actual_size)) { + fprintf(stderr, "ZSTD decompression failed: %s\n", ZSTD_getErrorName(actual_size)); + free(compressed_data); + free(decompressed_data); + return -1; + } + + // Parse block data + uint8_t *ptr = decompressed_data; + uint8_t mode = *ptr++; + uint8_t qy_override = *ptr++; + uint8_t qco_override = *ptr++; + uint8_t qcg_override = *ptr++; + + int qy = qy_override ? qy_override : decoder->header.quantiser_y; + int qco = qco_override ? qco_override : decoder->header.quantiser_co; + int qcg = qcg_override ? qcg_override : decoder->header.quantiser_cg; + + if (mode == TAV_MODE_SKIP) { + // Copy from reference frame + memcpy(decoder->current_frame_rgb, decoder->reference_frame_rgb, decoder->frame_size * 3); + } else { + // Read coefficients in TSVM order: all Y, then all Co, then all Cg + int coeff_count = decoder->frame_size; + uint8_t *coeff_ptr = ptr; + + // Read and dequantize coefficients (simple version for now) + for (int i = 0; i < coeff_count; i++) { + int16_t y_coeff = (int16_t)((coeff_ptr[1] << 8) | coeff_ptr[0]); + decoder->dwt_buffer_y[i] = y_coeff * qy; + coeff_ptr += 2; + } + for (int i = 0; i < coeff_count; i++) { + int16_t co_coeff = (int16_t)((coeff_ptr[1] << 8) | coeff_ptr[0]); + decoder->dwt_buffer_co[i] = co_coeff * qco; + coeff_ptr += 2; + } + for (int i = 0; i < coeff_count; i++) { + int16_t cg_coeff = (int16_t)((coeff_ptr[1] << 8) | coeff_ptr[0]); + decoder->dwt_buffer_cg[i] = cg_coeff * qcg; + coeff_ptr += 2; + } + + // Apply inverse DWT + apply_inverse_dwt_multilevel(decoder->dwt_buffer_y, decoder->header.width, decoder->header.height, + decoder->header.decomp_levels, decoder->header.wavelet_filter); + apply_inverse_dwt_multilevel(decoder->dwt_buffer_co, decoder->header.width, decoder->header.height, + decoder->header.decomp_levels, decoder->header.wavelet_filter); + apply_inverse_dwt_multilevel(decoder->dwt_buffer_cg, decoder->header.width, decoder->header.height, + decoder->header.decomp_levels, decoder->header.wavelet_filter); + + // Handle P-frame delta accumulation (in YCoCg float space) + if (packet_type == TAV_PACKET_PFRAME && mode == TAV_MODE_DELTA) { + // Add delta to reference frame + for (int i = 0; i < decoder->frame_size; i++) { + decoder->dwt_buffer_y[i] += decoder->reference_ycocg_y[i]; + decoder->dwt_buffer_co[i] += decoder->reference_ycocg_co[i]; + decoder->dwt_buffer_cg[i] += decoder->reference_ycocg_cg[i]; + } + } + + // Convert YCoCg-R to RGB + for (int i = 0; i < decoder->frame_size; i++) { + uint8_t r, g, b; + ycocg_r_to_rgb(decoder->dwt_buffer_y[i], + decoder->dwt_buffer_co[i], + decoder->dwt_buffer_cg[i], &r, &g, &b); + + decoder->current_frame_rgb[i * 3] = r; + decoder->current_frame_rgb[i * 3 + 1] = g; + decoder->current_frame_rgb[i * 3 + 2] = b; + } + + // Update reference YCoCg frame (for future P-frames) + memcpy(decoder->reference_ycocg_y, decoder->dwt_buffer_y, decoder->frame_size * sizeof(float)); + memcpy(decoder->reference_ycocg_co, decoder->dwt_buffer_co, decoder->frame_size * sizeof(float)); + memcpy(decoder->reference_ycocg_cg, decoder->dwt_buffer_cg, decoder->frame_size * sizeof(float)); + } + + // Update reference frame + memcpy(decoder->reference_frame_rgb, decoder->current_frame_rgb, decoder->frame_size * 3); + + free(compressed_data); + free(decompressed_data); + decoder->frame_count++; + + // Debug: Check file position after processing frame + if (decoder->frame_count < 5) { + long end_pos = ftell(decoder->input_fp); + fprintf(stderr, "Frame %d completed, file pos now: %ld\n", decoder->frame_count - 1, end_pos); + } + + return 1; +} + +// Output current frame as RGB24 to stdout +static void output_frame_rgb24(tav_decoder_t *decoder) { + fwrite(decoder->current_frame_rgb, 1, decoder->frame_size * 3, stdout); +} + +int main(int argc, char *argv[]) { + char *input_file = NULL; + int use_ffplay = 0; + + // Parse command line arguments + if (argc < 2 || argc > 3) { + fprintf(stderr, "Usage: %s input.tav [-p]\n", argv[0]); + fprintf(stderr, "TAV Decoder decodes video packets into raw RGB24 picture that can be piped into FFmpeg or FFplay.\n"); + fprintf(stderr, " -p Start FFplay directly instead of outputting to stdout\n"); + fprintf(stderr, "\nExamples:\n"); + fprintf(stderr, " %s input.tav | mpv --demuxer=rawvideo --demuxer-rawvideo-w=WIDTH --demuxer-rawvideo-h=HEIGHT -\n", argv[0]); + fprintf(stderr, " %s input.tav -p\n", argv[0]); + return 1; + } + + // Check for -p flag + if (argc == 3) { + if (strcmp(argv[2], "-p") == 0) { + use_ffplay = 1; + input_file = argv[1]; + } else if (strcmp(argv[1], "-p") == 0) { + use_ffplay = 1; + input_file = argv[2]; + } else { + fprintf(stderr, "Error: Unknown flag '%s'\n", argv[2]); + return 1; + } + } else { + input_file = argv[1]; + } + + tav_decoder_t *decoder = tav_decoder_init(input_file); + if (!decoder) { + fprintf(stderr, "Failed to initialize decoder\n"); + return 1; + } + + fprintf(stderr, "TAV Decoder - %dx%d @ %dfps, %d levels, version %d\n", + decoder->header.width, decoder->header.height, decoder->header.fps, + decoder->header.decomp_levels, decoder->header.version); + + fprintf(stderr, "Header says: %u total frames\n", decoder->header.total_frames); + + FILE *output_fp = stdout; + pid_t ffplay_pid = 0, ffmpeg_pid = 0; + char *audio_fifo_path = NULL; + + // If -p flag is used, use FFmpeg to mux video+audio and pipe to FFplay + if (use_ffplay) { + int video_pipe[2], audio_pipe[2], ffmpeg_pipe[2]; + if (pipe(video_pipe) == -1 || pipe(audio_pipe) == -1 || pipe(ffmpeg_pipe) == -1) { + fprintf(stderr, "Failed to create pipes\n"); + tav_decoder_free(decoder); + return 1; + } + + ffmpeg_pid = fork(); + if (ffmpeg_pid == -1) { + fprintf(stderr, "Failed to fork FFmpeg process\n"); + tav_decoder_free(decoder); + return 1; + } else if (ffmpeg_pid == 0) { + // Child process 1 - FFmpeg muxer + close(video_pipe[1]); // Close write ends + close(audio_pipe[1]); + close(ffmpeg_pipe[0]); // Close read end of output pipe + + char video_size[32]; + char framerate[16]; + snprintf(video_size, sizeof(video_size), "%dx%d", decoder->header.width, decoder->header.height); + snprintf(framerate, sizeof(framerate), "%d", decoder->header.fps); + + // Redirect pipes to file descriptors + dup2(video_pipe[0], 3); // Video input on fd 3 + dup2(audio_pipe[0], 4); // Audio input on fd 4 + dup2(ffmpeg_pipe[1], STDOUT_FILENO); // Output to stdout + + close(video_pipe[0]); + close(audio_pipe[0]); + close(ffmpeg_pipe[1]); + + execl("/usr/bin/ffmpeg", "ffmpeg", + "-f", "rawvideo", + "-pixel_format", "rgb24", + "-video_size", video_size, + "-framerate", framerate, + "-i", "pipe:3", // Video from fd 3 + "-f", "mp3", // MP3 demuxer handles MP2/MP3 + "-i", "pipe:4", // Audio from fd 4 + "-c:v", "libx264", // Encode video to H.264 + "-preset", "ultrafast", // Fast encoding + "-crf", "23", // Good quality + "-c:a", "copy", // Copy audio as-is (no re-encoding) + "-f", "matroska", // Output as MKV (good for streaming) + "-", // Output to stdout + "-v", "error", // Minimal logging + (char*)NULL); + + // Try alternative path + execl("/usr/local/bin/ffmpeg", "ffmpeg", + "-f", "rawvideo", + "-pixel_format", "rgb24", + "-video_size", video_size, + "-framerate", framerate, + "-i", "pipe:3", + "-f", "mp3", + "-i", "pipe:4", + "-c:v", "libx264", + "-preset", "ultrafast", + "-crf", "23", + "-c:a", "copy", + "-f", "matroska", + "-", + "-v", "error", + (char*)NULL); + + fprintf(stderr, "Failed to start ffmpeg for muxing\n"); + exit(1); + } + + // Fork again for FFplay + ffplay_pid = fork(); + if (ffplay_pid == -1) { + fprintf(stderr, "Failed to fork FFplay process\n"); + kill(ffmpeg_pid, SIGTERM); + tav_decoder_free(decoder); + return 1; + } else if (ffplay_pid == 0) { + // Child process 2 - FFplay + close(video_pipe[0]); // Close unused ends + close(video_pipe[1]); + close(audio_pipe[0]); + close(audio_pipe[1]); + close(ffmpeg_pipe[1]); + + // Read from FFmpeg output + dup2(ffmpeg_pipe[0], STDIN_FILENO); + close(ffmpeg_pipe[0]); + + execl("/usr/bin/ffplay", "ffplay", + "-i", "-", // Input from stdin + "-v", "error", // Minimal logging + (char*)NULL); + + execl("/usr/local/bin/ffplay", "ffplay", + "-i", "-", + "-v", "error", + (char*)NULL); + + fprintf(stderr, "Failed to start ffplay\n"); + exit(1); + } else { + // Parent process - write to video and audio pipes + close(video_pipe[0]); // Close read ends + close(audio_pipe[0]); + close(ffmpeg_pipe[0]); + close(ffmpeg_pipe[1]); + + output_fp = fdopen(video_pipe[1], "wb"); + decoder->audio_output_fp = fdopen(audio_pipe[1], "wb"); + + if (!output_fp || !decoder->audio_output_fp) { + fprintf(stderr, "Failed to open pipes for writing\n"); + kill(ffmpeg_pid, SIGTERM); + kill(ffplay_pid, SIGTERM); + tav_decoder_free(decoder); + return 1; + } + + fprintf(stderr, "Starting FFmpeg muxer + FFplay for video+audio playback\n"); + } + } else { + fprintf(stderr, "To test: %s %s | ffplay -f rawvideo -pixel_format rgb24 -video_size %dx%d -framerate %d -\n", + argv[0], input_file, decoder->header.width, decoder->header.height, decoder->header.fps); + } + + int result; + while ((result = decode_frame(decoder)) == 1) { + // Write RGB24 data to output (stdout or ffplay pipe) + fwrite(decoder->current_frame_rgb, decoder->frame_size * 3, 1, output_fp); + fflush(output_fp); + + // Debug: Print frame progress (only to stderr) + if (decoder->frame_count % 100 == 0 || decoder->frame_count < 5) { + fprintf(stderr, "Decoded frame %d\n", decoder->frame_count); + } + } + + if (result < 0) { + fprintf(stderr, "Decoding error\n"); + if (use_ffplay) { + if (ffmpeg_pid > 0) kill(ffmpeg_pid, SIGTERM); + if (ffplay_pid > 0) kill(ffplay_pid, SIGTERM); + } + tav_decoder_free(decoder); + return 1; + } + + fprintf(stderr, "Decoded %d frames\n", decoder->frame_count); + + // Clean up + if (use_ffplay) { + if (output_fp != stdout) { + fclose(output_fp); + } + if (decoder->audio_output_fp) { + fclose(decoder->audio_output_fp); + decoder->audio_output_fp = NULL; + } + if (ffmpeg_pid > 0) { + int status; + waitpid(ffmpeg_pid, &status, 0); + } + if (ffplay_pid > 0) { + int status; + waitpid(ffplay_pid, &status, 0); + } + } + + tav_decoder_free(decoder); + return 0; +} diff --git a/video_encoder/encoder_tav.c b/video_encoder/encoder_tav.c index 6ecb219..6cc9e9a 100644 --- a/video_encoder/encoder_tav.c +++ b/video_encoder/encoder_tav.c @@ -806,7 +806,7 @@ static void quantise_dwt_coefficients(float *coeffs, int16_t *quantised, int siz // https://www.desmos.com/calculator/mjlpwqm8ge // where Q=quality, x=level -static float perceptual_model3_LH(int quality, int level) { +static float perceptual_model3_LH(int quality, float level) { float H4 = 1.2f; float Lx = H4 - ((quality + 1.f) / 15.f) * (level - 4.f); float Ld = (quality + 1.f) / -15.f; @@ -824,91 +824,26 @@ static float perceptual_model3_HH(float LH, float HL) { return (HL / LH) * 1.44f; } -static float perceptual_model3_LL(int quality, int level) { +static float perceptual_model3_LL(int quality, float level) { float n = perceptual_model3_LH(quality, level); float m = perceptual_model3_LH(quality, level - 1) / n; return n / m; } -static float perceptual_model3_chroma_basecurve(int quality, int level) { +static float perceptual_model3_chroma_basecurve(int quality, float level) { return 1.0f - (1.0f / (0.5f * quality * quality + 1.0f)) * (level - 4.0f); // just a line that passes (4,1) } -// Get perceptual weight for specific subband - Data-driven model based on coefficient variance analysis -static float get_perceptual_weight_model2(int level, int subband_type, int is_chroma, int max_levels) { - // Psychovisual model based on DWT coefficient statistics and Human Visual System sensitivity - // strategy: JPEG quantisation table + real-world statistics from the encoded videos - if (!is_chroma) { - // LUMA CHANNEL: Based on statistical analysis from real video content - if (subband_type == 0) { // LL subband - contains most image energy, preserve carefully - if (level >= 6) return 0.5f; // LL6: High energy but can tolerate moderate quantisation (range up to 22K) - if (level >= 5) return 0.7f; // LL5: Good preservation - return 0.9f; // Lower LL levels: Fine preservation - } else if (subband_type == 1) { // LH subband - horizontal details (human eyes more sensitive) - if (level >= 6) return 0.8f; // LH6: Significant coefficients (max ~500), preserve well - if (level >= 5) return 1.0f; // LH5: Moderate coefficients (max ~600) - if (level >= 4) return 1.2f; // LH4: Small coefficients (max ~50) - if (level >= 3) return 1.6f; // LH3: Very small coefficients, can quantise more - if (level >= 2) return 2.0f; // LH2: Minimal impact - return 2.5f; // LH1: Least important - } else if (subband_type == 2) { // HL subband - vertical details (less sensitive due to HVS characteristics) - if (level >= 6) return 1.0f; // HL6: Can quantise more aggressively than LH6 - if (level >= 5) return 1.2f; // HL5: Standard quantisation - if (level >= 4) return 1.5f; // HL4: Notable range but less critical - if (level >= 3) return 2.0f; // HL3: Can tolerate more quantisation - if (level >= 2) return 2.5f; // HL2: Less important - return 3.5f; // HL1: Most aggressive for vertical details - } else { // HH subband - diagonal details (least important for HVS) - if (level >= 6) return 1.2f; // HH6: Preserve some diagonal detail - if (level >= 5) return 1.6f; // HH5: Can quantise aggressively - if (level >= 4) return 2.0f; // HH4: Very aggressive - if (level >= 3) return 2.8f; // HH3: Minimal preservation - if (level >= 2) return 3.5f; // HH2: Maximum compression - return 5.0f; // HH1: Most aggressive quantisation - } - } else { - // CHROMA CHANNELS: Less critical for human perception, more aggressive quantisation - // strategy: mimic 4:2:2 chroma subsampling - if (subband_type == 0) { // LL chroma - still important but less than luma - return 1.0f; - if (level >= 6) return 0.8f; // Chroma LL6: Less critical than luma LL - if (level >= 5) return 0.9f; - return 1.0f; - } else if (subband_type == 1) { // LH chroma - horizontal chroma details - return 1.8f; - if (level >= 6) return 1.0f; - if (level >= 5) return 1.2f; - if (level >= 4) return 1.4f; - if (level >= 3) return 1.6f; - if (level >= 2) return 1.8f; - return 2.0f; - } else if (subband_type == 2) { // HL chroma - vertical chroma details (even less critical) - return 1.3f; - if (level >= 6) return 1.2f; - if (level >= 5) return 1.4f; - if (level >= 4) return 1.6f; - if (level >= 3) return 1.8f; - if (level >= 2) return 2.0f; - return 2.2f; - } else { // HH chroma - diagonal chroma details (most aggressive) - return 2.5f; - if (level >= 6) return 1.4f; - if (level >= 5) return 1.6f; - if (level >= 4) return 1.8f; - if (level >= 3) return 2.1f; - if (level >= 2) return 2.3f; - return 2.5f; - } - } -} - #define FOUR_PIXEL_DETAILER 0.88f #define TWO_PIXEL_DETAILER 0.92f // level is one-based index -static float get_perceptual_weight(tav_encoder_t *enc, int level, int subband_type, int is_chroma, int max_levels) { +static float get_perceptual_weight(tav_encoder_t *enc, int level0, int subband_type, int is_chroma, int max_levels) { // Psychovisual model based on DWT coefficient statistics and Human Visual System sensitivity + + float level = 1.0f + ((level0 - 1.0f) / (max_levels - 1.0f)) * 5.0f; + // strategy: more horizontal detail if (!is_chroma) { // LL subband - contains most image energy, preserve carefully @@ -923,10 +858,10 @@ static float get_perceptual_weight(tav_encoder_t *enc, int level, int subband_ty // HL subband - vertical details float HL = perceptual_model3_HL(enc->quality_level, LH); if (subband_type == 2) - return HL * (level == 2 ? TWO_PIXEL_DETAILER : level == 3 ? FOUR_PIXEL_DETAILER : 1.0f); + return HL * (2.2f >= level && level >= 1.8f ? TWO_PIXEL_DETAILER : 3.2f >= level && level >= 2.8f ? FOUR_PIXEL_DETAILER : 1.0f); // HH subband - diagonal details - else return perceptual_model3_HH(LH, HL) * (level == 2 ? TWO_PIXEL_DETAILER : level == 3 ? FOUR_PIXEL_DETAILER : 1.0f); + else return perceptual_model3_HH(LH, HL) * (2.2f >= level && level >= 1.8f ? TWO_PIXEL_DETAILER : 3.2f >= level && level >= 2.8f ? FOUR_PIXEL_DETAILER : 1.0f); } else { // CHROMA CHANNELS: Less critical for human perception, more aggressive quantisation // strategy: more horizontal detail