From 4eec98cdca18d54e99f0750f039fd421f26842d7 Mon Sep 17 00:00:00 2001 From: minjaesong Date: Wed, 22 Oct 2025 01:32:19 +0900 Subject: [PATCH] TAV: half-fixed 3d dwt playback --- assets/disk0/tvdos/bin/playtav.js | 209 +++++++-------- terranmon.txt | 14 +- .../torvald/tsvm/GraphicsJSR223Delegate.kt | 252 ++---------------- .../tsvm/peripheral/GraphicsAdapter.kt | 2 +- video_encoder/encoder_tav.c | 199 ++++++++------ video_encoder/tav_inspector.c | 69 ++--- 6 files changed, 278 insertions(+), 467 deletions(-) diff --git a/assets/disk0/tvdos/bin/playtav.js b/assets/disk0/tvdos/bin/playtav.js index 819ecda..0480bcc 100644 --- a/assets/disk0/tvdos/bin/playtav.js +++ b/assets/disk0/tvdos/bin/playtav.js @@ -355,11 +355,12 @@ let decodeHeight = isInterlaced ? (header.height >> 1) : header.height const FRAME_PIXELS = header.width * header.height const FRAME_SIZE = FRAME_PIXELS * 3 // RGB buffer size -// Double-buffering: Fixed slot sizes in videoBuffer (32 MB total) -const MAX_GOP_SIZE = 21 // Maximum frames per slot (21 * 752KB = ~15MB per slot) +// Triple-buffering: Fixed slot sizes in videoBuffer (48 MB total) +const BUFFER_SLOTS = 3 // Three slots: playing, ready, decoding +const MAX_GOP_SIZE = 21 // Maximum frames per slot (21 * 752KB = ~15.8MB per slot) const SLOT_SIZE = MAX_GOP_SIZE * FRAME_SIZE // Fixed slot size regardless of actual GOP size -console.log(`Double-buffering: Max ${MAX_GOP_SIZE} frames/slot, ${(SLOT_SIZE / 1048576).toFixed(1)}MB per slot`) +console.log(`Triple-buffering: ${BUFFER_SLOTS} slots, max ${MAX_GOP_SIZE} frames/slot, ${(SLOT_SIZE / 1048576).toFixed(1)}MB per slot`) const RGB_BUFFER_A = sys.malloc(FRAME_SIZE) const RGB_BUFFER_B = sys.malloc(FRAME_SIZE) @@ -484,17 +485,18 @@ let currentFileIndex = 1 // Track which file we're playing in concatenated stre let totalFilesProcessed = 0 let decoderDbgInfo = {} -// GOP double-buffering state -let currentGopBufferSlot = 0 // Which buffer slot is currently being displayed (0 or 1) +// GOP triple-buffering state (3 slots: playing, ready, decoding) +let currentGopBufferSlot = 0 // Which buffer slot is currently being displayed (0, 1, or 2) let currentGopSize = 0 // Number of frames in current GOP being displayed let currentGopFrameIndex = 0 // Which frame of current GOP we're displaying -let nextGopData = null // Buffered next GOP packet data for background decode +let readyGopData = null // GOP that's already decoded and ready to play (next in line) +let decodingGopData = null // GOP currently being decoded in background let asyncDecodeInProgress = false // Track if async decode is running let asyncDecodeSlot = 0 // Which slot the async decode is targeting let asyncDecodeGopSize = 0 // Size of GOP being decoded async let asyncDecodePtr = 0 // Compressed data pointer to free after decode let asyncDecodeStartTime = 0 // When async decode started (for diagnostics) -let shouldReadPackets = true // Gate packet reading: false when both buffers are full +let shouldReadPackets = true // Gate packet reading: false when all 3 buffers are full let cueElements = [] let currentCueIndex = -1 // Track current cue position @@ -510,12 +512,19 @@ function cleanupAsyncDecode() { asyncDecodeGopSize = 0 } - // Free background GOP decode memory if in progress - if (nextGopData !== null && nextGopData.compressedPtr && nextGopData.compressedPtr !== 0) { - sys.free(nextGopData.compressedPtr) - nextGopData.compressedPtr = 0 + // Free ready GOP memory if present + if (readyGopData !== null && readyGopData.compressedPtr && readyGopData.compressedPtr !== 0) { + sys.free(readyGopData.compressedPtr) + readyGopData.compressedPtr = 0 } - nextGopData = null + readyGopData = null + + // Free decoding GOP memory if present + if (decodingGopData !== null && decodingGopData.compressedPtr && decodingGopData.compressedPtr !== 0) { + sys.free(decodingGopData.compressedPtr) + decodingGopData.compressedPtr = 0 + } + decodingGopData = null // Reset GOP playback state currentGopSize = 0 @@ -751,7 +760,10 @@ let paused = false try { let t1 = sys.nanoTime() - while (!stopPlay && seqread.getReadCount() < FILE_LENGTH) { + // Continue loop while: + // 1. Reading packets (not EOF yet), OR + // 2. There are buffered GOPs to play (after EOF) + while (!stopPlay && (seqread.getReadCount() < FILE_LENGTH || currentGopSize > 0 || readyGopData !== null || decodingGopData !== null || asyncDecodeInProgress)) { // Handle interactive controls @@ -866,9 +878,10 @@ try { } // GATED PACKET READING - // Stop reading when both buffers are full (GOP playing + GOP decoding/ready) + // Stop reading when all 3 buffers are full (GOP playing + ready GOP + decoding GOP) // Resume reading when GOP finishes (one buffer becomes free) - if (shouldReadPackets && !paused) { + // Also stop reading at EOF + if (shouldReadPackets && !paused && seqread.getReadCount() < FILE_LENGTH) { // Read packet header (record position before reading for I-frame tracking) let packetOffset = seqread.getReadCount() var packetType = seqread.readOneByte() @@ -1051,32 +1064,15 @@ try { // Read GOP packet data const gopSize = seqread.readOneByte() - const marginLeft = seqread.readOneByte() - const marginRight = seqread.readOneByte() - const marginTop = seqread.readOneByte() - const marginBottom = seqread.readOneByte() - - const canvasWidth = header.width + marginLeft + marginRight - const canvasHeight = header.height + marginTop + marginBottom - - // Read motion vectors (1/16-pixel units, int16) - let motionX = new Array(gopSize) - let motionY = new Array(gopSize) - - for (let i = 0; i < gopSize; i++) { - let mx = seqread.readShort() - let my = seqread.readShort() - motionX[i] = (mx > 32767) ? (mx - 65536) : mx - motionY[i] = (my > 32767) ? (my - 65536) : my - } - const compressedSize = seqread.readInt() let compressedPtr = seqread.readBytes(compressedSize) updateDataRateBin(compressedSize) - // DOUBLE-BUFFERING LOGIC: - // - If no GOP is currently playing: decode immediately to current slot - // - Otherwise: buffer this GOP for decode during next GOP's playback + // TRIPLE-BUFFERING LOGIC (3 slots: playing, ready, decoding): + // - If no GOP playing: decode first GOP to slot 0 + // - If GOP playing but no ready GOP: decode to ready slot (next in rotation) + // - If GOP playing and ready GOP exists but no decoding: decode to decoding slot + // - Otherwise: all 3 buffers full, ignore packet // Check GOP size fits in slot if (gopSize > MAX_GOP_SIZE) { @@ -1086,11 +1082,11 @@ try { } if (currentGopSize === 0 && !asyncDecodeInProgress) { - // No active GOP and no decode in progress: decode asynchronously and start playback when ready + // Case 1: No active GOP and no decode in progress - decode first GOP const bufferSlot = currentGopBufferSlot const bufferOffset = bufferSlot * SLOT_SIZE - // Defensive: free any old async decode memory (shouldn't happen but be safe) + // Defensive: free any old async decode memory if (asyncDecodePtr !== 0) { sys.free(asyncDecodePtr) asyncDecodePtr = 0 @@ -1099,10 +1095,7 @@ try { // Start async decode graphics.tavDecodeGopToVideoBufferAsync( compressedPtr, compressedSize, gopSize, - motionX, motionY, header.width, header.height, - canvasWidth, canvasHeight, - marginLeft, marginTop, header.qualityLevel, QLUT[header.qualityY], QLUT[header.qualityCo], QLUT[header.qualityCg], header.channelLayout, @@ -1114,49 +1107,25 @@ try { asyncDecodeInProgress = true asyncDecodeSlot = bufferSlot asyncDecodeGopSize = gopSize - asyncDecodePtr = compressedPtr // Will free after decode completes + asyncDecodePtr = compressedPtr asyncDecodeStartTime = sys.nanoTime() - // Note: compressedPtr will be freed after decode completes - // We'll check for completion in main loop and start playback then - if (interactive) { - console.log(`[GOP] Started async decode of first GOP (slot ${bufferSlot}, ${gopSize} frames)`) - } } else if (currentGopSize === 0 && asyncDecodeInProgress) { - // First GOP still decoding but another arrived - ignore it to avoid cancelling first GOP - if (interactive) { - console.log(`[GOP] Warning: GOP arrived while first GOP still decoding - ignoring to avoid cancellation`) - } + // Case 2: First GOP still decoding - ignore to avoid cancellation sys.free(compressedPtr) - } else if (currentGopSize > 0 && !asyncDecodeInProgress) { - // GOP is playing and first GOP decode is done: decode this one to other slot in background (async) - const nextSlot = 1 - currentGopBufferSlot + + } else if (currentGopSize > 0 && readyGopData === null && !asyncDecodeInProgress && graphics.tavDecodeGopIsComplete()) { + // Case 3: GOP playing, no ready GOP, no decode in progress - decode to ready slot + const nextSlot = (currentGopBufferSlot + 1) % BUFFER_SLOTS const nextOffset = nextSlot * SLOT_SIZE - // DIAGNOSTIC: Measure background decode timing const framesRemaining = currentGopSize - currentGopFrameIndex - const timeRemaining = framesRemaining * FRAME_TIME * 1000.0 // milliseconds + const timeRemaining = framesRemaining * FRAME_TIME * 1000.0 - // If previous GOP still decoding, free its memory (will be overwritten) - if (nextGopData !== null && !nextGopData.decoded && nextGopData.compressedPtr && nextGopData.compressedPtr !== 0) { - if (interactive) { - console.log(`[GOP] Warning: New GOP arrived before previous decode completed - freeing old data`) - } - sys.free(nextGopData.compressedPtr) - nextGopData.compressedPtr = 0 - } - - if (interactive) { - console.log(`[GOP] Background decode started: frame ${currentGopFrameIndex}/${currentGopSize}, ${framesRemaining} frames (${timeRemaining.toFixed(0)}ms) remaining`) - } - - // Start async background decode + // Start async decode to ready slot graphics.tavDecodeGopToVideoBufferAsync( compressedPtr, compressedSize, gopSize, - motionX, motionY, header.width, header.height, - canvasWidth, canvasHeight, - marginLeft, marginTop, header.qualityLevel, QLUT[header.qualityY], QLUT[header.qualityCo], QLUT[header.qualityCg], header.channelLayout, @@ -1165,20 +1134,44 @@ try { nextOffset ) - // Mark as decoding (will check completion in main loop) - nextGopData = { + readyGopData = { gopSize: gopSize, - decoded: false, // Will be set to true when async decode completes slot: nextSlot, - compressedPtr: compressedPtr, // Will free after decode completes + compressedPtr: compressedPtr, startTime: sys.nanoTime(), timeRemaining: timeRemaining } - } else { - // Fallback: unexpected state, just free the memory - if (interactive) { - console.log(`[GOP] Warning: Unexpected state - currentGopSize=${currentGopSize}, asyncDecodeInProgress=${asyncDecodeInProgress} - freeing GOP data`) + + } else if (currentGopSize > 0 && readyGopData !== null && decodingGopData === null && !asyncDecodeInProgress && graphics.tavDecodeGopIsComplete()) { + // Case 4: GOP playing, ready GOP exists, no decoding GOP, no decode in progress - decode to decoding slot + const decodingSlot = (currentGopBufferSlot + 2) % BUFFER_SLOTS + const decodingOffset = decodingSlot * SLOT_SIZE + + const framesRemaining = currentGopSize - currentGopFrameIndex + const timeRemaining = framesRemaining * FRAME_TIME * 1000.0 + + // Start async decode to decoding slot + graphics.tavDecodeGopToVideoBufferAsync( + compressedPtr, compressedSize, gopSize, + header.width, header.height, + header.qualityLevel, + QLUT[header.qualityY], QLUT[header.qualityCo], QLUT[header.qualityCg], + header.channelLayout, + header.waveletFilter, header.decompLevels, 2, + header.entropyCoder, + decodingOffset + ) + + decodingGopData = { + gopSize: gopSize, + slot: decodingSlot, + compressedPtr: compressedPtr, + startTime: sys.nanoTime(), + timeRemaining: timeRemaining } + + } else { + // Case 5: All 3 buffers full (playing + ready + decoding) - ignore packet sys.free(compressedPtr) } } @@ -1187,13 +1180,10 @@ try { const framesInGOP = seqread.readOneByte() // Ignore - we display frames based on time accumulator, not this packet - // CRITICAL: Stop reading packets if both buffers are full - // (one GOP playing + one GOP ready/decoding) - if (currentGopSize > 0 && nextGopData !== null) { + // CRITICAL: Stop reading packets if all 3 buffers are full + // (one GOP playing + ready GOP + decoding GOP) + if (currentGopSize > 0 && readyGopData !== null && decodingGopData !== null) { shouldReadPackets = false - if (interactive) { - console.log(`[GOP] Both buffers full - stopping packet reading until current GOP finishes`) - } } } else if (packetType === TAV_PACKET_AUDIO_MP2) { @@ -1326,9 +1316,9 @@ try { // Resume packet reading to get next GOP (only one buffer occupied now) shouldReadPackets = true - if (interactive) { - console.log(`[GOP] First GOP ready (slot ${asyncDecodeSlot}, ${asyncDecodeGopSize} frames) in ${decodeTime.toFixed(1)}ms - starting playback`) - } +// if (interactive) { +// console.log(`[GOP] First GOP ready (slot ${asyncDecodeSlot}, ${asyncDecodeGopSize} frames) in ${decodeTime.toFixed(1)}ms - starting playback`) +// } // Free compressed data sys.free(asyncDecodePtr) @@ -1374,44 +1364,37 @@ try { } } - // Step 4 & 7: GOP finished? Wait for background decode, then transition + // Step 4-7: GOP finished? Transition to ready GOP (triple-buffering) if (!paused && currentGopSize > 0 && currentGopFrameIndex >= currentGopSize) { - if (nextGopData !== null) { - // Wait for background decode to complete + if (readyGopData !== null) { + // Ready GOP exists - wait for it to finish decoding if still in progress while (!graphics.tavDecodeGopIsComplete() && !paused) { sys.sleep(1) } if (!paused) { const [r1, r2] = graphics.tavDecodeGopGetResult() - decodeTime = (sys.nanoTime() - nextGopData.startTime) / 1000000.0 - - if (interactive) { - const margin = nextGopData.timeRemaining - decodeTime - const status = margin > 0 ? "✓ ON TIME" : "✗ TOO LATE" - console.log(`[GOP] Background decode finished in ${decodeTime.toFixed(1)}ms (margin: ${margin.toFixed(0)}ms) ${status}`) - } + decodeTime = (sys.nanoTime() - readyGopData.startTime) / 1000000.0 // Free compressed data - sys.free(nextGopData.compressedPtr) + sys.free(readyGopData.compressedPtr) - // Transition to next GOP - currentGopBufferSlot = 1 - currentGopBufferSlot - currentGopSize = nextGopData.gopSize + // Transition to ready GOP + currentGopBufferSlot = readyGopData.slot + currentGopSize = readyGopData.gopSize currentGopFrameIndex = 0 - nextGopData = null - // Resume packet reading now that one buffer is free + // Promote decoding GOP to ready GOP + readyGopData = decodingGopData + decodingGopData = null + + // Resume packet reading now that one buffer is free (decoding slot available) shouldReadPackets = true - - if (interactive) { - console.log(`[GOP] ✓ SEAMLESS TRANSITION to next GOP (slot ${currentGopBufferSlot}, ${currentGopSize} frames)`) - } } } else { - // No next GOP available, pause playback + // No ready GOP available - hiccup (shouldn't happen with triple-buffering) if (interactive) { - console.log(`[GOP] ✗ HICCUP - next GOP NOT READY! Playback paused.`) + console.log(`[GOP] ✗ HICCUP - ready GOP NOT READY! Playback paused.`) } currentGopSize = 0 currentGopFrameIndex = 0 diff --git a/terranmon.txt b/terranmon.txt index daab381..9ae4c13 100644 --- a/terranmon.txt +++ b/terranmon.txt @@ -1030,9 +1030,9 @@ transmission capability, and region-of-interest coding. ### List of Keys - Uint64 BGNT: Video begin time (must be equal to the value of the first Timecode packet) - Uint64 ENDT: Video end time (must be equal to the value of the last Timecode packet) - - Uint64 CDAT: Creation time in nanoseconds since UNIX Epoch - - Bytes VNDR: Name and version of the encoder (for Reference encoder: "Encoder-TAV 20251014") - - Bytes FMPG: FFmpeg version (typically "ffmpeg version 6.1.2"; the first line of text FFmpeg emits right before the copyright text) + - Uint64 CDAT: Creation time in nanoseconds since UNIX Epoch (must be in UTC timezone) + - Bytes VNDR: Name and version of the encoder (for Reference encoder: "Encoder-TAV 20251014 (list,of,features)") + - Bytes FMPG: FFmpeg version (typically "ffmpeg version 8.0 Copyright (c) 2000-2025 the FFmpeg developers"; the first line of text FFmpeg emits) ## Standard Metadata Payload Packet Structure @@ -1062,10 +1062,12 @@ Updated on 2025-10-17 to include canvas expansion margins. This packet contains multiple frames encoded as a single spacetime block for optimal temporal compression. - uint8 Packet Type (0x12) + uint8 Packet Type (0x12/0x13) uint8 GOP Size (number of frames in this GOP, typically 16) - int16 Motion Vectors X[GOP Size] (quarter-pixel precision for global motion compensation) - int16 Motion Vectors Y[GOP Size] (quarter-pixel precision for global motion compensation) + + uint32 Compressed Size + * Zstd-compressed Motion Data + uint32 Compressed Size * Zstd-compressed Unified Block Data diff --git a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt index 802b394..4ce01f6 100644 --- a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt +++ b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt @@ -6662,194 +6662,6 @@ class GraphicsJSR223Delegate(private val vm: VM) { System.arraycopy(output, 0, frameData, 0, frameData.size) } - /** - * Main GOP unified decoder function. - * Decodes a unified 3D DWT GOP block (temporal + spatial) and outputs RGB frames. - * - * @param compressedDataPtr Pointer to compressed Zstd data - * @param compressedSize Size of compressed data - * @param gopSize Number of frames in GOP (1-16) - * @param motionVectorsX X motion vectors in 1/16-pixel units - * @param motionVectorsY Y motion vectors in 1/16-pixel units - * @param outputRGBAddrs Array of output RGB buffer addresses - * @param width Original frame width (output dimensions) - * @param height Original frame height (output dimensions) - * @param canvasWidth Expanded canvas width (for motion compensation) - * @param canvasHeight Expanded canvas height (for motion compensation) - * @param marginLeft Left margin to crop from expanded canvas - * @param marginTop Top margin to crop from expanded canvas - * @param qIndex Quality index - * @param qYGlobal Global Y quantizer - * @param qCoGlobal Global Co quantizer - * @param qCgGlobal Global Cg quantizer - * @param channelLayout Channel layout flags - * @param spatialFilter Wavelet filter type - * @param spatialLevels Number of spatial DWT levels (default 6) - * @param temporalLevels Number of temporal DWT levels (default 2) - * @return Number of frames decoded - */ - fun tavDecodeGopUnified( - compressedDataPtr: Long, - compressedSize: Int, - gopSize: Int, - motionVectorsX: IntArray, - motionVectorsY: IntArray, - outputRGBAddrs: LongArray, - width: Int, - height: Int, - canvasWidth: Int, - canvasHeight: Int, - marginLeft: Int, - marginTop: Int, - qIndex: Int, - qYGlobal: Int, - qCoGlobal: Int, - qCgGlobal: Int, - channelLayout: Int, - spatialFilter: Int = 1, - spatialLevels: Int = 6, - temporalLevels: Int = 2, - entropyCoder: Int = 0 - ): Array { - val dbgOut = HashMap() - dbgOut["qY"] = qYGlobal - dbgOut["qCo"] = qCoGlobal - dbgOut["qCg"] = qCgGlobal - dbgOut["frameMode"] = "G" - - // Use expanded canvas dimensions for DWT processing - val canvasPixels = canvasWidth * canvasHeight - val outputPixels = width * height - - // Step 1: Decompress unified GOP block - val compressedData = ByteArray(compressedSize) - UnsafeHelper.memcpyRaw( - null, - vm.usermem.ptr + compressedDataPtr, - compressedData, - UnsafeHelper.getArrayOffset(compressedData), - compressedSize.toLong() - ) - - val decompressedData = try { - ZstdInputStream(java.io.ByteArrayInputStream(compressedData)).use { zstd -> - zstd.readBytes() - } - } catch (e: Exception) { - println("ERROR: Zstd decompression failed: ${e.message}") - return arrayOf(0, dbgOut) - } - - // Step 2: Postprocess unified block to per-frame coefficients (based on header's entropy coder field) - val (isEZBCMode, quantizedCoeffs) = tavPostprocessGopAuto( - decompressedData, - gopSize, - canvasPixels, // Use expanded canvas size - channelLayout, - entropyCoder - ) - - // Step 3: Allocate GOP buffers for float coefficients (expanded canvas size) - val gopY = Array(gopSize) { FloatArray(canvasPixels) } - val gopCo = Array(gopSize) { FloatArray(canvasPixels) } - val gopCg = Array(gopSize) { FloatArray(canvasPixels) } - - // Step 4: Calculate subband layout for expanded canvas (needed for perceptual dequantization) - val subbands = calculateSubbandLayout(canvasWidth, canvasHeight, spatialLevels) - - // Step 5: Dequantize with temporal-spatial scaling - for (t in 0 until gopSize) { - val temporalLevel = getTemporalSubbandLevel(t, gopSize, temporalLevels) - val temporalScale = getTemporalQuantizerScale(temporalLevel) - - // Apply temporal scaling to base quantizers for each channel - val baseQY = (qYGlobal * temporalScale).coerceIn(1.0f, 4096.0f) - val baseQCo = (qCoGlobal * temporalScale).coerceIn(1.0f, 4096.0f) - val baseQCg = (qCgGlobal * temporalScale).coerceIn(1.0f, 4096.0f) - - // Use existing perceptual dequantization for spatial weighting - dequantiseDWTSubbandsPerceptual( - qIndex, qYGlobal, - quantizedCoeffs[t][0], gopY[t], - subbands, baseQY, false, spatialLevels, // isChroma=false - isEZBCMode - ) - - dequantiseDWTSubbandsPerceptual( - qIndex, qYGlobal, - quantizedCoeffs[t][1], gopCo[t], - subbands, baseQCo, true, spatialLevels, // isChroma=true - isEZBCMode - ) - - dequantiseDWTSubbandsPerceptual( - qIndex, qYGlobal, - quantizedCoeffs[t][2], gopCg[t], - subbands, baseQCg, true, spatialLevels, // isChroma=true - isEZBCMode - ) - } - - // Step 6: Apply inverse 3D DWT (spatial first, then temporal) on expanded canvas - tavApplyInverse3DDWT(gopY, canvasWidth, canvasHeight, gopSize, spatialLevels, temporalLevels, spatialFilter) - tavApplyInverse3DDWT(gopCo, canvasWidth, canvasHeight, gopSize, spatialLevels, temporalLevels, spatialFilter) - tavApplyInverse3DDWT(gopCg, canvasWidth, canvasHeight, gopSize, spatialLevels, temporalLevels, spatialFilter) - - // Step 7: Apply inverse motion compensation (shift frames back) on expanded canvas - // Note: Motion vectors are in 1/16-pixel units, cumulative relative to frame 0 - for (t in 1 until gopSize) { // Skip frame 0 (reference) - val dx = motionVectorsX[t] / 16 // Convert to pixel units - val dy = motionVectorsY[t] / 16 - - if (dx != 0 || dy != 0) { - applyInverseTranslation(gopY[t], canvasWidth, canvasHeight, dx, dy) - applyInverseTranslation(gopCo[t], canvasWidth, canvasHeight, dx, dy) - applyInverseTranslation(gopCg[t], canvasWidth, canvasHeight, dx, dy) - } - } - - // Step 8: Crop expanded canvas to original dimensions and convert to RGB - for (t in 0 until gopSize) { - val rgbAddr = outputRGBAddrs[t] - - // Crop from expanded canvas (canvasWidth x canvasHeight) to output (width x height) - for (row in 0 until height) { - for (col in 0 until width) { - // Source pixel in expanded canvas - val canvasX = col + marginLeft - val canvasY = row + marginTop - val canvasIdx = canvasY * canvasWidth + canvasX - - // Destination pixel in output buffer - val outIdx = row * width + col - - val yVal = gopY[t][canvasIdx] - val co = gopCo[t][canvasIdx] - val cg = gopCg[t][canvasIdx] - - // YCoCg-R to RGB conversion - val tmp = yVal - (cg / 2.0f) - val g = cg + tmp - val b = tmp - (co / 2.0f) - val r = b + co - - // Clamp to 0-255 range - val rClamped = r.toInt().coerceIn(0, 255) - val gClamped = g.toInt().coerceIn(0, 255) - val bClamped = b.toInt().coerceIn(0, 255) - - // Write RGB24 format (3 bytes per pixel) - val offset = rgbAddr + outIdx * 3L - vm.usermem[offset] = rClamped.toByte() - vm.usermem[offset + 1] = gClamped.toByte() - vm.usermem[offset + 2] = bClamped.toByte() - } - } - } - - return arrayOf(gopSize, dbgOut) - } - /** * Decode GOP frames directly into GraphicsAdapter.videoBuffer (Java heap). * This avoids allocating GOP frames in VM user memory, saving ~6 MB for 8-frame GOPs. @@ -6864,14 +6676,8 @@ class GraphicsJSR223Delegate(private val vm: VM) { compressedDataPtr: Long, compressedSize: Int, gopSize: Int, - motionVectorsX: IntArray, - motionVectorsY: IntArray, width: Int, height: Int, - canvasWidth: Int, - canvasHeight: Int, - marginLeft: Int, - marginTop: Int, qIndex: Int, qYGlobal: Int, qCoGlobal: Int, @@ -6900,7 +6706,6 @@ class GraphicsJSR223Delegate(private val vm: VM) { } // Use expanded canvas dimensions for DWT processing - val canvasPixels = canvasWidth * canvasHeight val outputPixels = width * height // Step 1: Decompress unified GOP block @@ -6926,18 +6731,18 @@ class GraphicsJSR223Delegate(private val vm: VM) { val (isEZBCMode, quantizedCoeffs) = tavPostprocessGopAuto( decompressedData, gopSize, - canvasPixels, + outputPixels, channelLayout, entropyCoder ) // Step 3: Allocate GOP buffers for float coefficients (expanded canvas size) - val gopY = Array(gopSize) { FloatArray(canvasPixels) } - val gopCo = Array(gopSize) { FloatArray(canvasPixels) } - val gopCg = Array(gopSize) { FloatArray(canvasPixels) } + val gopY = Array(gopSize) { FloatArray(outputPixels) } + val gopCo = Array(gopSize) { FloatArray(outputPixels) } + val gopCg = Array(gopSize) { FloatArray(outputPixels) } // Step 4: Calculate subband layout for expanded canvas - val subbands = calculateSubbandLayout(canvasWidth, canvasHeight, spatialLevels) + val subbands = calculateSubbandLayout(width, height, spatialLevels) // Step 5: Dequantize with temporal-spatial scaling for (t in 0 until gopSize) { @@ -6971,40 +6776,23 @@ class GraphicsJSR223Delegate(private val vm: VM) { } // Step 6: Apply inverse 3D DWT - tavApplyInverse3DDWT(gopY, canvasWidth, canvasHeight, gopSize, spatialLevels, temporalLevels, spatialFilter) - tavApplyInverse3DDWT(gopCo, canvasWidth, canvasHeight, gopSize, spatialLevels, temporalLevels, spatialFilter) - tavApplyInverse3DDWT(gopCg, canvasWidth, canvasHeight, gopSize, spatialLevels, temporalLevels, spatialFilter) - - // Step 7: Apply inverse motion compensation - for (t in 1 until gopSize) { - val dx = motionVectorsX[t] / 16 - val dy = motionVectorsY[t] / 16 - - if (dx != 0 || dy != 0) { - applyInverseTranslation(gopY[t], canvasWidth, canvasHeight, dx, dy) - applyInverseTranslation(gopCo[t], canvasWidth, canvasHeight, dx, dy) - applyInverseTranslation(gopCg[t], canvasWidth, canvasHeight, dx, dy) - } - } + tavApplyInverse3DDWT(gopY, width, height, gopSize, spatialLevels, temporalLevels, spatialFilter) + tavApplyInverse3DDWT(gopCo, width, height, gopSize, spatialLevels, temporalLevels, spatialFilter) + tavApplyInverse3DDWT(gopCg, width, height, gopSize, spatialLevels, temporalLevels, spatialFilter) // Step 8: Crop and convert to RGB, write directly to videoBuffer for (t in 0 until gopSize) { val videoBufferOffset = bufferOffset + (t * frameSize) // Each frame sequentially, starting at bufferOffset - for (row in 0 until height) { - for (col in 0 until width) { - // Source pixel in expanded canvas - val canvasX = col + marginLeft - val canvasY = row + marginTop - val canvasIdx = canvasY * canvasWidth + canvasX - + for (py in 0 until height) { + for (px in 0 until width) { // Destination pixel in videoBuffer - val outIdx = row * width + col + val outIdx = py * width + px val offset = videoBufferOffset + outIdx * 3L - val yVal = gopY[t][canvasIdx] - val co = gopCo[t][canvasIdx] - val cg = gopCg[t][canvasIdx] + val yVal = gopY[t][outIdx] + val co = gopCo[t][outIdx] + val cg = gopCg[t][outIdx] // YCoCg-R to RGB conversion val tmp = yVal - (cg / 2.0f) @@ -7113,14 +6901,8 @@ class GraphicsJSR223Delegate(private val vm: VM) { compressedDataPtr: Long, compressedSize: Int, gopSize: Int, - motionVectorsX: IntArray, - motionVectorsY: IntArray, width: Int, height: Int, - canvasWidth: Int, - canvasHeight: Int, - marginLeft: Int, - marginTop: Int, qIndex: Int, qYGlobal: Int, qCoGlobal: Int, @@ -7128,7 +6910,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { channelLayout: Int, spatialFilter: Int = 1, spatialLevels: Int = 6, - temporalLevels: Int = 2, + temporalLevels: Int = 3, entropyCoder: Int = 0, bufferOffset: Long = 0 ) { @@ -7144,9 +6926,7 @@ class GraphicsJSR223Delegate(private val vm: VM) { try { val result = tavDecodeGopToVideoBuffer( compressedDataPtr, compressedSize, gopSize, - motionVectorsX, motionVectorsY, - width, height, canvasWidth, canvasHeight, - marginLeft, marginTop, + width, height, qIndex, qYGlobal, qCoGlobal, qCgGlobal, channelLayout, spatialFilter, spatialLevels, temporalLevels, entropyCoder, bufferOffset diff --git a/tsvm_core/src/net/torvald/tsvm/peripheral/GraphicsAdapter.kt b/tsvm_core/src/net/torvald/tsvm/peripheral/GraphicsAdapter.kt index f6a332a..79e7b3a 100644 --- a/tsvm_core/src/net/torvald/tsvm/peripheral/GraphicsAdapter.kt +++ b/tsvm_core/src/net/torvald/tsvm/peripheral/GraphicsAdapter.kt @@ -107,7 +107,7 @@ open class GraphicsAdapter(private val assetsRoot: String, val vm: VM, val confi internal val unusedArea = UnsafeHelper.allocate(1024, this) internal val scanlineOffsets = UnsafeHelper.allocate(1024, this) - internal val videoBuffer = UnsafeHelper.allocate(32 * 1024 * 1024, this) + internal val videoBuffer = UnsafeHelper.allocate(48 * 1024 * 1024, this) // 48 MB for triple-buffering (3 slots × 21 frames × 752 kB) protected val paletteShader = LoadShader(DRAW_SHADER_VERT, config.paletteShader) protected val textShader = LoadShader(DRAW_SHADER_VERT, config.fragShader) diff --git a/video_encoder/encoder_tav.c b/video_encoder/encoder_tav.c index c53a266..e0b415e 100644 --- a/video_encoder/encoder_tav.c +++ b/video_encoder/encoder_tav.c @@ -18,7 +18,7 @@ #include #include -#define ENCODER_VENDOR_STRING "Encoder-TAV 20251019" +#define ENCODER_VENDOR_STRING "Encoder-TAV 20251022 (3d-dwt,ezbc)" // TSVM Advanced Video (TAV) format constants #define TAV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x41\x56" // "\x1FTSVM TAV" @@ -48,7 +48,7 @@ #define TAV_PACKET_IFRAME 0x10 // Intra frame (keyframe) #define TAV_PACKET_PFRAME 0x11 // Predicted frame (legacy, unused) #define TAV_PACKET_GOP_UNIFIED 0x12 // Unified 3D DWT GOP (all frames in single block, translation-based) -#define TAV_PACKET_GOP_UNIFIED_MESH 0x13 // Unified 3D DWT GOP with distortion mesh warping +#define TAV_PACKET_GOP_UNIFIED_MOTION 0x13 // Unified 3D DWT GOP with motion-compensated lifting #define TAV_PACKET_PFRAME_RESIDUAL 0x14 // P-frame with MPEG-style residual coding (block motion compensation) #define TAV_PACKET_BFRAME_RESIDUAL 0x15 // B-frame with MPEG-style residual coding (bidirectional prediction) #define TAV_PACKET_PFRAME_ADAPTIVE 0x16 // P-frame with adaptive quad-tree block partitioning @@ -116,13 +116,15 @@ static int needs_alpha_channel(int channel_layout) { #define DEFAULT_HEIGHT 448 #define DEFAULT_FPS 30 #define DEFAULT_QUALITY 3 -#define DEFAULT_ZSTD_LEVEL 9 -#define TEMPORAL_GOP_SIZE 20//8 // ~42 frames fit into 32 MB video buffer +#define DEFAULT_ZSTD_LEVEL 3 +#define TEMPORAL_GOP_SIZE 20 #define TEMPORAL_DECOMP_LEVEL 2 #define MOTION_THRESHOLD 24.0f // Flush if motion exceeds 24 pixels in any direction // Audio/subtitle constants (reused from TEV) +#define MP2_SAMPLE_RATE 32000 #define MP2_DEFAULT_PACKET_SIZE 1152 +#define PACKET_AUDIO_TIME ((double)MP2_DEFAULT_PACKET_SIZE / MP2_SAMPLE_RATE) #define MAX_SUBTITLE_LENGTH 2048 int debugDumpMade = 0; @@ -2175,6 +2177,7 @@ static int mp2_packet_size_to_rate_index(int packet_size, int is_mono); static long write_extended_header(tav_encoder_t *enc); static void write_timecode_packet(FILE *output, int frame_num, int fps, int is_ntsc_framerate); static int process_audio(tav_encoder_t *enc, int frame_num, FILE *output); +static int process_audio_for_gop(tav_encoder_t *enc, int *frame_numbers, int num_frames, FILE *output); static subtitle_entry_t* parse_subtitle_file(const char *filename, int fps); static subtitle_entry_t* parse_srt_file(const char *filename, int fps); static subtitle_entry_t* parse_smi_file(const char *filename, int fps); @@ -2269,7 +2272,7 @@ static void show_usage(const char *program_name) { printf(" --dump-frame N Dump quantised coefficients for frame N (creates .bin files)\n"); printf(" --wavelet N Wavelet filter: 0=LGT 5/3, 1=CDF 9/7, 2=CDF 13/7, 16=DD-4, 255=Haar (default: 1)\n"); printf(" --zstd-level N Zstd compression level 1-22 (default: %d, higher = better compression but slower)\n", DEFAULT_ZSTD_LEVEL); - printf(" --no-grain-synthesis Disable grain synthesis (enabled by default)\n"); +// printf(" --no-grain-synthesis Disable grain synthesis (enabled by default)\n"); printf(" --help Show this help\n\n"); printf("Audio Rate by Quality:\n "); @@ -2328,7 +2331,7 @@ static tav_encoder_t* create_encoder(void) { enc->intra_only = 0; enc->monoblock = 1; // Default to monoblock mode enc->perceptual_tuning = 1; // Default to perceptual quantisation (versions 5/6) - enc->enable_ezbc = 0; // Default to twobit-map (EZBC adds overhead for small files) + enc->enable_ezbc = 1; // Default to EZBC over twobit-map enc->channel_layout = CHANNEL_LAYOUT_YCOCG; // Default to Y-Co-Cg enc->audio_bitrate = 0; // 0 = use quality table enc->encode_limit = 0; // Default: no frame limit @@ -2339,7 +2342,7 @@ static tav_encoder_t* create_encoder(void) { enc->delta_haar_levels = TEMPORAL_DECOMP_LEVEL; // GOP / temporal DWT settings - enc->enable_temporal_dwt = 0; // Default: disabled for backward compatibility. Mutually exclusive with use_delta_encoding + enc->enable_temporal_dwt = 1; // Mutually exclusive with use_delta_encoding enc->temporal_gop_capacity = TEMPORAL_GOP_SIZE; // 16 frames enc->temporal_gop_frame_count = 0; enc->temporal_decomp_levels = TEMPORAL_DECOMP_LEVEL; // 2 levels of temporal DWT (16 -> 4x4 subbands) @@ -4826,16 +4829,6 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser, memcpy(gop_cg_coeffs[i], enc->temporal_gop_cg_frames[i], num_pixels * sizeof(float)); } - // Debug: Print original frame-to-frame motion vectors - if (enc->verbose && actual_gop_size >= 4) { - printf("Frame-to-frame motion vectors (before cumulative conversion):\n"); - for (int i = 0; i < actual_gop_size; i++) { - printf(" Frame %d: 1/16px=(%d, %d) pixels=(%.3f, %.3f)\n", - i, enc->temporal_gop_translation_x[i], enc->temporal_gop_translation_y[i], - enc->temporal_gop_translation_x[i] / 16.0f, enc->temporal_gop_translation_y[i] / 16.0f); - } - } - // Step 0.5: Convert frame-to-frame motion vectors to cumulative (relative to frame 0) // Phase correlation computes motion of frame[i] relative to frame[i-1] // We need cumulative motion relative to frame 0 for proper alignment @@ -4844,16 +4837,6 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser, enc->temporal_gop_translation_y[i] += enc->temporal_gop_translation_y[i-1]; } - // Debug: Print cumulative motion vectors - if (enc->verbose && actual_gop_size >= 4) { - printf("Cumulative motion vectors (after conversion):\n"); - for (int i = 0; i < actual_gop_size; i++) { - printf(" Frame %d: 1/16px=(%d, %d) pixels=(%.3f, %.3f)\n", - i, enc->temporal_gop_translation_x[i], enc->temporal_gop_translation_y[i], - enc->temporal_gop_translation_x[i] / 16.0f, enc->temporal_gop_translation_y[i] / 16.0f); - } - } - // Step 0.5b: Calculate the valid region after alignment (crop bounds) // Find the bounding box that's valid across all aligned frames int min_dx = 0, max_dx = 0, min_dy = 0, max_dy = 0; @@ -5102,6 +5085,9 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser, // Write timecode packet for first frame in GOP write_timecode_packet(output, frame_numbers[0], enc->output_fps, enc->is_ntsc_framerate); + // Process audio for this GOP (all frames at once) + process_audio_for_gop(enc, frame_numbers, actual_gop_size, output); + // Single-frame GOP fallback: use traditional I-frame encoding with serialise_tile_data if (actual_gop_size == 1) { // Write I-frame packet header (no motion vectors, no GOP overhead) @@ -5171,10 +5157,11 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser, printf("Frame %d (single-frame GOP as I-frame): %zu bytes\n", frame_numbers[0], compressed_size); } - } else { + } + else { // Multi-frame GOP: use unified 3D DWT encoding // Choose packet type based on motion compensation method - uint8_t packet_type = enc->temporal_enable_mcezbc ? TAV_PACKET_GOP_UNIFIED_MESH : TAV_PACKET_GOP_UNIFIED; + uint8_t packet_type = enc->temporal_enable_mcezbc ? TAV_PACKET_GOP_UNIFIED_MOTION : TAV_PACKET_GOP_UNIFIED; fwrite(&packet_type, 1, 1, output); total_bytes_written += 1; @@ -5263,26 +5250,6 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser, free(mv_buffer); free(compressed_mv); - } else { - // Packet 0x12: Translation-based alignment - // Write canvas expansion information (4 bytes) - uint8_t canvas_margins[4] = { - (uint8_t)crop_left, // Left margin - (uint8_t)crop_right, // Right margin - (uint8_t)crop_top, // Top margin - (uint8_t)crop_bottom // Bottom margin - }; - fwrite(canvas_margins, 1, 4, output); - total_bytes_written += 4; - - // Write all motion vectors (1/16-pixel precision) for the entire GOP - for (int t = 0; t < actual_gop_size; t++) { - int16_t dx = enc->temporal_gop_translation_x[t]; - int16_t dy = enc->temporal_gop_translation_y[t]; - fwrite(&dx, sizeof(int16_t), 1, output); - fwrite(&dy, sizeof(int16_t), 1, output); - total_bytes_written += 4; - } } // Preprocess ALL frames with unified significance map @@ -8649,13 +8616,8 @@ static int process_audio(tav_encoder_t *enc, int frame_num, FILE *output) { // Calculate how much audio time each frame represents (in seconds) double frame_audio_time = 1.0 / enc->output_fps; - // Calculate how much audio time each MP2 packet represents - // MP2 frame contains 1152 samples at 32kHz = 0.036 seconds - #define MP2_SAMPLE_RATE 32000 - double packet_audio_time = 1152.0 / MP2_SAMPLE_RATE; - // Estimate how many packets we consume per video frame - double packets_per_frame = frame_audio_time / packet_audio_time; + double packets_per_frame = frame_audio_time / PACKET_AUDIO_TIME; // Allocate MP2 buffer if needed if (!enc->mp2_buffer) { @@ -8683,24 +8645,20 @@ static int process_audio(tav_encoder_t *enc, int frame_num, FILE *output) { // Calculate how many packets we need to maintain target buffer level // Only insert when buffer drops below target, and only insert enough to restore target - double target_level = (double)enc->target_audio_buffer_size; - if (enc->audio_frames_in_buffer < target_level) { + double target_level = fmax(packets_per_frame, (double)enc->target_audio_buffer_size); +// if (enc->audio_frames_in_buffer < target_level) { double deficit = target_level - enc->audio_frames_in_buffer; // Insert packets to cover the deficit, but at least maintain minimum flow packets_to_insert = (int)ceil(deficit); - // Cap at reasonable maximum to prevent excessive insertion - if (packets_to_insert > enc->target_audio_buffer_size) { - packets_to_insert = enc->target_audio_buffer_size; - } if (enc->verbose) { printf("Frame %d: Buffer low (%.2f->%.2f), deficit %.2f, inserting %d packets\n", frame_num, old_buffer, enc->audio_frames_in_buffer, deficit, packets_to_insert); } - } else if (enc->verbose && old_buffer != enc->audio_frames_in_buffer) { - printf("Frame %d: Buffer sufficient (%.2f->%.2f), no packets\n", - frame_num, old_buffer, enc->audio_frames_in_buffer); - } +// } else if (enc->verbose && old_buffer != enc->audio_frames_in_buffer) { +// printf("Frame %d: Buffer sufficient (%.2f->%.2f), no packets\n", +// frame_num, old_buffer, enc->audio_frames_in_buffer); +// } } // Insert the calculated number of audio packets @@ -8737,6 +8695,96 @@ static int process_audio(tav_encoder_t *enc, int frame_num, FILE *output) { return 1; } +// Process audio for a GOP (multiple frames at once) +// Accumulates deficit for N frames and emits all necessary audio packets +static int process_audio_for_gop(tav_encoder_t *enc, int *frame_numbers, int num_frames, FILE *output) { + if (!enc->has_audio || !enc->mp2_file || enc->audio_remaining <= 0 || num_frames == 0) { + return 1; + } + + // Handle first frame initialization (same as process_audio) + int first_frame_in_gop = frame_numbers[0]; + if (first_frame_in_gop == 0) { + uint8_t header[4]; + if (fread(header, 1, 4, enc->mp2_file) != 4) return 1; + fseek(enc->mp2_file, 0, SEEK_SET); + enc->mp2_packet_size = get_mp2_packet_size(header); + int is_mono = (header[3] >> 6) == 3; + enc->mp2_rate_index = mp2_packet_size_to_rate_index(enc->mp2_packet_size, is_mono); + enc->target_audio_buffer_size = 4; // 4 audio packets in buffer (does nothing for GOP) + enc->audio_frames_in_buffer = 0.0; + } + + // Calculate audio packet consumption per video frame + double frame_audio_time = 1.0 / enc->output_fps; + double packets_per_frame = frame_audio_time / PACKET_AUDIO_TIME; + + // Allocate MP2 buffer if needed + if (!enc->mp2_buffer) { + enc->mp2_buffer_size = enc->mp2_packet_size * 2; + enc->mp2_buffer = malloc(enc->mp2_buffer_size); + if (!enc->mp2_buffer) { + fprintf(stderr, "Failed to allocate audio buffer\n"); + return 1; + } + } + + // Calculate total deficit for all frames in the GOP + int total_packets_to_insert = 0; + + // Simulate buffer consumption for all N frames in the GOP + double old_buffer = enc->audio_frames_in_buffer; + enc->audio_frames_in_buffer -= (packets_per_frame * num_frames); + + // Calculate deficit to restore buffer to target level +// double target_level = fmax(packets_per_frame, (double)enc->target_audio_buffer_size); +// if (enc->audio_frames_in_buffer < target_level) { + double deficit = packets_per_frame * num_frames; + total_packets_to_insert = CLAMP((int)round(deficit), enc->target_audio_buffer_size, 9999); + + if (enc->verbose) { + printf("GOP (%d frames, starting at %d): Buffer low (%.2f->%.2f), deficit %.2f, inserting %d packets\n", + num_frames, first_frame_in_gop, old_buffer, enc->audio_frames_in_buffer, deficit, total_packets_to_insert); + } +// } else if (enc->verbose) { +// printf("GOP (%d frames, starting at %d): Buffer sufficient (%.2f->%.2f), no packets\n", +// num_frames, first_frame_in_gop, old_buffer, enc->audio_frames_in_buffer); +// } + + // Emit all audio packets for this GOP + for (int q = 0; q < total_packets_to_insert; q++) { + size_t bytes_to_read = enc->mp2_packet_size; + if (bytes_to_read > enc->audio_remaining) { + bytes_to_read = enc->audio_remaining; + } + + size_t bytes_read = fread(enc->mp2_buffer, 1, bytes_to_read, enc->mp2_file); + if (bytes_read == 0) break; + + // Write TAV MP2 audio packet + uint8_t audio_packet_type = TAV_PACKET_AUDIO_MP2; + uint32_t audio_len = (uint32_t)bytes_read; + fwrite(&audio_packet_type, 1, 1, output); + fwrite(&audio_len, 4, 1, output); + fwrite(enc->mp2_buffer, 1, bytes_read, output); + + // Track audio bytes written + enc->audio_remaining -= bytes_read; + enc->audio_frames_in_buffer++; + + if (first_frame_in_gop == 0) { + enc->audio_frames_in_buffer = enc->target_audio_buffer_size / 2; + } + + if (enc->verbose) { + printf("Audio packet %d: %zu bytes (buffer: %.2f packets)\n", + q, bytes_read, enc->audio_frames_in_buffer); + } + } + + return 1; +} + // Process subtitles for current frame (copied and adapted from TEV) static int process_subtitles(tav_encoder_t *enc, int frame_num, FILE *output) { if (!enc->subtitles) { @@ -9834,20 +9882,16 @@ int main(int argc, char *argv[]) { adjust_quantiser_for_bitrate(enc); } - // For GOP encoding, process audio/subtitles for all frames in the flushed GOP + // For GOP encoding, audio/subtitles are handled in gop_flush() for all GOP frames // For traditional encoding, process audio/subtitles for this single frame - if (enc->enable_temporal_dwt) { - // Note: In GOP mode, audio/subtitle sync is approximate since we flush multiple frames at once - // This is acceptable since GOPs are short (16 frames max = ~0.5s at 30fps) - // TODO: Consider buffering audio/subtitles for precise sync if needed + if (!enc->enable_temporal_dwt) { + // Process audio for this frame + process_audio(enc, true_frame_count, enc->output_fp); + + // Process subtitles for this frame + process_subtitles(enc, true_frame_count, enc->output_fp); } - // Process audio for this frame - process_audio(enc, true_frame_count, enc->output_fp); - - // Process subtitles for this frame - process_subtitles(enc, true_frame_count, enc->output_fp); - // Write a sync packet only after a video is been coded // For GOP encoding, GOP_SYNC packet already serves as sync - don't emit extra SYNC // For B-frame mode, sync packets are already written in the encoding loop @@ -9857,7 +9901,8 @@ int main(int argc, char *argv[]) { } // NTSC frame duplication: emit extra sync packet for every 1000n+500 frames - if (enc->is_ntsc_framerate && (frame_count % 1000 == 500)) { + // Skip when temporal DWT is enabled (audio handled in GOP flush) + if (!enc->enable_temporal_dwt && enc->is_ntsc_framerate && (frame_count % 1000 == 500)) { true_frame_count++; // Process audio and subtitles for the duplicated frame to maintain sync process_audio(enc, true_frame_count, enc->output_fp); diff --git a/video_encoder/tav_inspector.c b/video_encoder/tav_inspector.c index eca7bac..77c2d7d 100644 --- a/video_encoder/tav_inspector.c +++ b/video_encoder/tav_inspector.c @@ -18,6 +18,11 @@ #define TAV_PACKET_IFRAME 0x10 #define TAV_PACKET_PFRAME 0x11 #define TAV_PACKET_GOP_UNIFIED 0x12 // Unified 3D DWT GOP (all frames in single block) +#define TAV_PACKET_GOP_UNIFIED_MOTION 0x13 +#define TAV_PACKET_PFRAME_RESIDUAL 0x14 // P-frame with MPEG-style residual coding (block motion compensation) +#define TAV_PACKET_BFRAME_RESIDUAL 0x15 // B-frame with MPEG-style residual coding (bidirectional prediction) +#define TAV_PACKET_PFRAME_ADAPTIVE 0x16 // P-frame with adaptive quad-tree block partitioning +#define TAV_PACKET_BFRAME_ADAPTIVE 0x17 // B-frame with adaptive quad-tree block partitioning (bidirectional prediction) #define TAV_PACKET_AUDIO_MP2 0x20 #define TAV_PACKET_SUBTITLE 0x30 #define TAV_PACKET_SUBTITLE_KAR 0x31 @@ -59,6 +64,7 @@ typedef struct { int pframe_delta_count; int pframe_skip_count; int gop_unified_count; + int gop_unified_motion_count; int gop_sync_count; int total_gop_frames; int audio_count; @@ -94,6 +100,11 @@ const char* get_packet_type_name(uint8_t type) { case TAV_PACKET_IFRAME: return "I-FRAME"; case TAV_PACKET_PFRAME: return "P-FRAME"; case TAV_PACKET_GOP_UNIFIED: return "GOP (3D DWT Unified)"; + case TAV_PACKET_GOP_UNIFIED_MOTION: return "GOP (3D DWT Unified with Motion Data)"; + case TAV_PACKET_PFRAME_RESIDUAL: return "P-FRAME (residual)"; + case TAV_PACKET_BFRAME_RESIDUAL: return "B-FRAME (residual)"; + case TAV_PACKET_PFRAME_ADAPTIVE: return "P-FRAME (quadtree)"; + case TAV_PACKET_BFRAME_ADAPTIVE: return "B-FRAME (quadtree)"; case TAV_PACKET_AUDIO_MP2: return "AUDIO MP2"; case TAV_PACKET_SUBTITLE: return "SUBTITLE (Simple)"; case TAV_PACKET_SUBTITLE_KAR: return "SUBTITLE (Karaoke)"; @@ -246,9 +257,10 @@ void print_extended_header(FILE *fp, int verbose) { if (verbose) { if (strcmp(key, "CDAT") == 0) { time_t time_sec = value / 1000000000ULL; - char *time_str = ctime(&time_sec); - if (time_str) { - time_str[strlen(time_str)-1] = '\0'; // Remove newline + struct tm *time_info = gmtime(&time_sec); + if (time_info) { + char time_str[64]; + strftime(time_str, sizeof(time_str), "%a %b %d %H:%M:%S %Y UTC", time_info); printf("%s", time_str); } } else { @@ -484,48 +496,37 @@ int main(int argc, char *argv[]) { break; } - case TAV_PACKET_GOP_UNIFIED: { + case TAV_PACKET_GOP_UNIFIED: case TAV_PACKET_GOP_UNIFIED_MOTION: { // Unified GOP packet: [gop_size][motion_vectors...][compressed_size][data] uint8_t gop_size; if (fread(&gop_size, 1, 1, fp) != 1) break; - // Read all motion vectors - int16_t *motion_x = malloc(gop_size * sizeof(int16_t)); - int16_t *motion_y = malloc(gop_size * sizeof(int16_t)); - for (int i = 0; i < gop_size; i++) { - if (fread(&motion_x[i], sizeof(int16_t), 1, fp) != 1) break; - if (fread(&motion_y[i], sizeof(int16_t), 1, fp) != 1) break; + // Read motion vectors + uint32_t size0 = 0; + if (packet_type == TAV_PACKET_GOP_UNIFIED_MOTION) { + if (fread(&size0, sizeof(uint32_t), 1, fp) != 1) { break; } + stats.total_video_bytes += size0; + stats.gop_unified_motion_count++; + fseek(fp, size0, SEEK_CUR); } // Read compressed data size - uint32_t size; - if (fread(&size, sizeof(uint32_t), 1, fp) != 1) { - free(motion_x); - free(motion_y); - break; - } + uint32_t size1; + if (fread(&size1, sizeof(uint32_t), 1, fp) != 1) { break; } + stats.total_video_bytes += size1; + fseek(fp, size1, SEEK_CUR); + - stats.total_video_bytes += size; - stats.gop_unified_count++; stats.total_gop_frames += gop_size; + if (packet_type == TAV_PACKET_GOP_UNIFIED) { + stats.gop_unified_count++; + } if (!opts.summary_only && display) { printf(" - GOP size=%u, data size=%u bytes (%.2f bytes/frame)", - gop_size, size, (double)size / gop_size); - - // Always show motion vectors for GOP packets with absolute frame numbers - if (gop_size > 0) { - printf("\n Motion vectors (1/16-pixel):"); - for (int i = 0; i < gop_size; i++) { - printf("\n Frame %d (#%d): (%.3f, %.3f) px", - current_frame + i, i, motion_x[i] / 16.0, motion_y[i] / 16.0); - } - } + gop_size, (size0 + size1), (double)(size0 + size1) / gop_size); } - free(motion_x); - free(motion_y); - fseek(fp, size, SEEK_CUR); break; } @@ -714,10 +715,10 @@ int main(int argc, char *argv[]) { printf(")"); } printf("\n"); - if (stats.gop_unified_count > 0) { + if (stats.gop_unified_count + stats.gop_unified_motion_count > 0) { printf(" 3D GOP packets: %d (total frames: %d, avg %.1f frames/GOP)\n", - stats.gop_unified_count, stats.total_gop_frames, - (double)stats.total_gop_frames / stats.gop_unified_count); + (stats.gop_unified_count + stats.gop_unified_motion_count), stats.total_gop_frames, + (double)stats.total_gop_frames / (stats.gop_unified_count + stats.gop_unified_motion_count)); printf(" GOP sync packets: %d\n", stats.gop_sync_count); } printf(" Mux video: %d\n", stats.mux_video_count);