TAV: half-fixed 3d dwt playback

This commit is contained in:
minjaesong
2025-10-22 01:32:19 +09:00
parent 9ac0424be3
commit 4eec98cdca
6 changed files with 278 additions and 467 deletions

View File

@@ -355,11 +355,12 @@ let decodeHeight = isInterlaced ? (header.height >> 1) : header.height
const FRAME_PIXELS = header.width * header.height const FRAME_PIXELS = header.width * header.height
const FRAME_SIZE = FRAME_PIXELS * 3 // RGB buffer size const FRAME_SIZE = FRAME_PIXELS * 3 // RGB buffer size
// Double-buffering: Fixed slot sizes in videoBuffer (32 MB total) // Triple-buffering: Fixed slot sizes in videoBuffer (48 MB total)
const MAX_GOP_SIZE = 21 // Maximum frames per slot (21 * 752KB = ~15MB per slot) const BUFFER_SLOTS = 3 // Three slots: playing, ready, decoding
const MAX_GOP_SIZE = 21 // Maximum frames per slot (21 * 752KB = ~15.8MB per slot)
const SLOT_SIZE = MAX_GOP_SIZE * FRAME_SIZE // Fixed slot size regardless of actual GOP size const SLOT_SIZE = MAX_GOP_SIZE * FRAME_SIZE // Fixed slot size regardless of actual GOP size
console.log(`Double-buffering: Max ${MAX_GOP_SIZE} frames/slot, ${(SLOT_SIZE / 1048576).toFixed(1)}MB per slot`) console.log(`Triple-buffering: ${BUFFER_SLOTS} slots, max ${MAX_GOP_SIZE} frames/slot, ${(SLOT_SIZE / 1048576).toFixed(1)}MB per slot`)
const RGB_BUFFER_A = sys.malloc(FRAME_SIZE) const RGB_BUFFER_A = sys.malloc(FRAME_SIZE)
const RGB_BUFFER_B = sys.malloc(FRAME_SIZE) const RGB_BUFFER_B = sys.malloc(FRAME_SIZE)
@@ -484,17 +485,18 @@ let currentFileIndex = 1 // Track which file we're playing in concatenated stre
let totalFilesProcessed = 0 let totalFilesProcessed = 0
let decoderDbgInfo = {} let decoderDbgInfo = {}
// GOP double-buffering state // GOP triple-buffering state (3 slots: playing, ready, decoding)
let currentGopBufferSlot = 0 // Which buffer slot is currently being displayed (0 or 1) let currentGopBufferSlot = 0 // Which buffer slot is currently being displayed (0, 1, or 2)
let currentGopSize = 0 // Number of frames in current GOP being displayed let currentGopSize = 0 // Number of frames in current GOP being displayed
let currentGopFrameIndex = 0 // Which frame of current GOP we're displaying let currentGopFrameIndex = 0 // Which frame of current GOP we're displaying
let nextGopData = null // Buffered next GOP packet data for background decode let readyGopData = null // GOP that's already decoded and ready to play (next in line)
let decodingGopData = null // GOP currently being decoded in background
let asyncDecodeInProgress = false // Track if async decode is running let asyncDecodeInProgress = false // Track if async decode is running
let asyncDecodeSlot = 0 // Which slot the async decode is targeting let asyncDecodeSlot = 0 // Which slot the async decode is targeting
let asyncDecodeGopSize = 0 // Size of GOP being decoded async let asyncDecodeGopSize = 0 // Size of GOP being decoded async
let asyncDecodePtr = 0 // Compressed data pointer to free after decode let asyncDecodePtr = 0 // Compressed data pointer to free after decode
let asyncDecodeStartTime = 0 // When async decode started (for diagnostics) let asyncDecodeStartTime = 0 // When async decode started (for diagnostics)
let shouldReadPackets = true // Gate packet reading: false when both buffers are full let shouldReadPackets = true // Gate packet reading: false when all 3 buffers are full
let cueElements = [] let cueElements = []
let currentCueIndex = -1 // Track current cue position let currentCueIndex = -1 // Track current cue position
@@ -510,12 +512,19 @@ function cleanupAsyncDecode() {
asyncDecodeGopSize = 0 asyncDecodeGopSize = 0
} }
// Free background GOP decode memory if in progress // Free ready GOP memory if present
if (nextGopData !== null && nextGopData.compressedPtr && nextGopData.compressedPtr !== 0) { if (readyGopData !== null && readyGopData.compressedPtr && readyGopData.compressedPtr !== 0) {
sys.free(nextGopData.compressedPtr) sys.free(readyGopData.compressedPtr)
nextGopData.compressedPtr = 0 readyGopData.compressedPtr = 0
} }
nextGopData = null readyGopData = null
// Free decoding GOP memory if present
if (decodingGopData !== null && decodingGopData.compressedPtr && decodingGopData.compressedPtr !== 0) {
sys.free(decodingGopData.compressedPtr)
decodingGopData.compressedPtr = 0
}
decodingGopData = null
// Reset GOP playback state // Reset GOP playback state
currentGopSize = 0 currentGopSize = 0
@@ -751,7 +760,10 @@ let paused = false
try { try {
let t1 = sys.nanoTime() let t1 = sys.nanoTime()
while (!stopPlay && seqread.getReadCount() < FILE_LENGTH) { // Continue loop while:
// 1. Reading packets (not EOF yet), OR
// 2. There are buffered GOPs to play (after EOF)
while (!stopPlay && (seqread.getReadCount() < FILE_LENGTH || currentGopSize > 0 || readyGopData !== null || decodingGopData !== null || asyncDecodeInProgress)) {
// Handle interactive controls // Handle interactive controls
@@ -866,9 +878,10 @@ try {
} }
// GATED PACKET READING // GATED PACKET READING
// Stop reading when both buffers are full (GOP playing + GOP decoding/ready) // Stop reading when all 3 buffers are full (GOP playing + ready GOP + decoding GOP)
// Resume reading when GOP finishes (one buffer becomes free) // Resume reading when GOP finishes (one buffer becomes free)
if (shouldReadPackets && !paused) { // Also stop reading at EOF
if (shouldReadPackets && !paused && seqread.getReadCount() < FILE_LENGTH) {
// Read packet header (record position before reading for I-frame tracking) // Read packet header (record position before reading for I-frame tracking)
let packetOffset = seqread.getReadCount() let packetOffset = seqread.getReadCount()
var packetType = seqread.readOneByte() var packetType = seqread.readOneByte()
@@ -1051,32 +1064,15 @@ try {
// Read GOP packet data // Read GOP packet data
const gopSize = seqread.readOneByte() const gopSize = seqread.readOneByte()
const marginLeft = seqread.readOneByte()
const marginRight = seqread.readOneByte()
const marginTop = seqread.readOneByte()
const marginBottom = seqread.readOneByte()
const canvasWidth = header.width + marginLeft + marginRight
const canvasHeight = header.height + marginTop + marginBottom
// Read motion vectors (1/16-pixel units, int16)
let motionX = new Array(gopSize)
let motionY = new Array(gopSize)
for (let i = 0; i < gopSize; i++) {
let mx = seqread.readShort()
let my = seqread.readShort()
motionX[i] = (mx > 32767) ? (mx - 65536) : mx
motionY[i] = (my > 32767) ? (my - 65536) : my
}
const compressedSize = seqread.readInt() const compressedSize = seqread.readInt()
let compressedPtr = seqread.readBytes(compressedSize) let compressedPtr = seqread.readBytes(compressedSize)
updateDataRateBin(compressedSize) updateDataRateBin(compressedSize)
// DOUBLE-BUFFERING LOGIC: // TRIPLE-BUFFERING LOGIC (3 slots: playing, ready, decoding):
// - If no GOP is currently playing: decode immediately to current slot // - If no GOP playing: decode first GOP to slot 0
// - Otherwise: buffer this GOP for decode during next GOP's playback // - If GOP playing but no ready GOP: decode to ready slot (next in rotation)
// - If a GOP is playing and a ready GOP exists but nothing is decoding: decode to the decoding slot
// - Otherwise: all 3 buffers full, ignore packet
// Check GOP size fits in slot // Check GOP size fits in slot
if (gopSize > MAX_GOP_SIZE) { if (gopSize > MAX_GOP_SIZE) {
@@ -1086,11 +1082,11 @@ try {
} }
if (currentGopSize === 0 && !asyncDecodeInProgress) { if (currentGopSize === 0 && !asyncDecodeInProgress) {
// No active GOP and no decode in progress: decode asynchronously and start playback when ready // Case 1: No active GOP and no decode in progress - decode first GOP
const bufferSlot = currentGopBufferSlot const bufferSlot = currentGopBufferSlot
const bufferOffset = bufferSlot * SLOT_SIZE const bufferOffset = bufferSlot * SLOT_SIZE
// Defensive: free any old async decode memory (shouldn't happen but be safe) // Defensive: free any old async decode memory
if (asyncDecodePtr !== 0) { if (asyncDecodePtr !== 0) {
sys.free(asyncDecodePtr) sys.free(asyncDecodePtr)
asyncDecodePtr = 0 asyncDecodePtr = 0
@@ -1099,10 +1095,7 @@ try {
// Start async decode // Start async decode
graphics.tavDecodeGopToVideoBufferAsync( graphics.tavDecodeGopToVideoBufferAsync(
compressedPtr, compressedSize, gopSize, compressedPtr, compressedSize, gopSize,
motionX, motionY,
header.width, header.height, header.width, header.height,
canvasWidth, canvasHeight,
marginLeft, marginTop,
header.qualityLevel, header.qualityLevel,
QLUT[header.qualityY], QLUT[header.qualityCo], QLUT[header.qualityCg], QLUT[header.qualityY], QLUT[header.qualityCo], QLUT[header.qualityCg],
header.channelLayout, header.channelLayout,
@@ -1114,49 +1107,25 @@ try {
asyncDecodeInProgress = true asyncDecodeInProgress = true
asyncDecodeSlot = bufferSlot asyncDecodeSlot = bufferSlot
asyncDecodeGopSize = gopSize asyncDecodeGopSize = gopSize
asyncDecodePtr = compressedPtr // Will free after decode completes asyncDecodePtr = compressedPtr
asyncDecodeStartTime = sys.nanoTime() asyncDecodeStartTime = sys.nanoTime()
// Note: compressedPtr will be freed after decode completes
// We'll check for completion in main loop and start playback then
if (interactive) {
console.log(`[GOP] Started async decode of first GOP (slot ${bufferSlot}, ${gopSize} frames)`)
}
} else if (currentGopSize === 0 && asyncDecodeInProgress) { } else if (currentGopSize === 0 && asyncDecodeInProgress) {
// First GOP still decoding but another arrived - ignore it to avoid cancelling first GOP // Case 2: First GOP still decoding - ignore to avoid cancellation
if (interactive) {
console.log(`[GOP] Warning: GOP arrived while first GOP still decoding - ignoring to avoid cancellation`)
}
sys.free(compressedPtr) sys.free(compressedPtr)
} else if (currentGopSize > 0 && !asyncDecodeInProgress) {
// GOP is playing and first GOP decode is done: decode this one to other slot in background (async) } else if (currentGopSize > 0 && readyGopData === null && !asyncDecodeInProgress && graphics.tavDecodeGopIsComplete()) {
const nextSlot = 1 - currentGopBufferSlot // Case 3: GOP playing, no ready GOP, no decode in progress - decode to ready slot
const nextSlot = (currentGopBufferSlot + 1) % BUFFER_SLOTS
const nextOffset = nextSlot * SLOT_SIZE const nextOffset = nextSlot * SLOT_SIZE
// DIAGNOSTIC: Measure background decode timing
const framesRemaining = currentGopSize - currentGopFrameIndex const framesRemaining = currentGopSize - currentGopFrameIndex
const timeRemaining = framesRemaining * FRAME_TIME * 1000.0 // milliseconds const timeRemaining = framesRemaining * FRAME_TIME * 1000.0
// If previous GOP still decoding, free its memory (will be overwritten) // Start async decode to ready slot
if (nextGopData !== null && !nextGopData.decoded && nextGopData.compressedPtr && nextGopData.compressedPtr !== 0) {
if (interactive) {
console.log(`[GOP] Warning: New GOP arrived before previous decode completed - freeing old data`)
}
sys.free(nextGopData.compressedPtr)
nextGopData.compressedPtr = 0
}
if (interactive) {
console.log(`[GOP] Background decode started: frame ${currentGopFrameIndex}/${currentGopSize}, ${framesRemaining} frames (${timeRemaining.toFixed(0)}ms) remaining`)
}
// Start async background decode
graphics.tavDecodeGopToVideoBufferAsync( graphics.tavDecodeGopToVideoBufferAsync(
compressedPtr, compressedSize, gopSize, compressedPtr, compressedSize, gopSize,
motionX, motionY,
header.width, header.height, header.width, header.height,
canvasWidth, canvasHeight,
marginLeft, marginTop,
header.qualityLevel, header.qualityLevel,
QLUT[header.qualityY], QLUT[header.qualityCo], QLUT[header.qualityCg], QLUT[header.qualityY], QLUT[header.qualityCo], QLUT[header.qualityCg],
header.channelLayout, header.channelLayout,
@@ -1165,20 +1134,44 @@ try {
nextOffset nextOffset
) )
// Mark as decoding (will check completion in main loop) readyGopData = {
nextGopData = {
gopSize: gopSize, gopSize: gopSize,
decoded: false, // Will be set to true when async decode completes
slot: nextSlot, slot: nextSlot,
compressedPtr: compressedPtr, // Will free after decode completes compressedPtr: compressedPtr,
startTime: sys.nanoTime(), startTime: sys.nanoTime(),
timeRemaining: timeRemaining timeRemaining: timeRemaining
} }
} else {
// Fallback: unexpected state, just free the memory } else if (currentGopSize > 0 && readyGopData !== null && decodingGopData === null && !asyncDecodeInProgress && graphics.tavDecodeGopIsComplete()) {
if (interactive) { // Case 4: GOP playing, ready GOP exists, no decoding GOP, no decode in progress - decode to decoding slot
console.log(`[GOP] Warning: Unexpected state - currentGopSize=${currentGopSize}, asyncDecodeInProgress=${asyncDecodeInProgress} - freeing GOP data`) const decodingSlot = (currentGopBufferSlot + 2) % BUFFER_SLOTS
const decodingOffset = decodingSlot * SLOT_SIZE
const framesRemaining = currentGopSize - currentGopFrameIndex
const timeRemaining = framesRemaining * FRAME_TIME * 1000.0
// Start async decode to decoding slot
graphics.tavDecodeGopToVideoBufferAsync(
compressedPtr, compressedSize, gopSize,
header.width, header.height,
header.qualityLevel,
QLUT[header.qualityY], QLUT[header.qualityCo], QLUT[header.qualityCg],
header.channelLayout,
header.waveletFilter, header.decompLevels, 2,
header.entropyCoder,
decodingOffset
)
decodingGopData = {
gopSize: gopSize,
slot: decodingSlot,
compressedPtr: compressedPtr,
startTime: sys.nanoTime(),
timeRemaining: timeRemaining
} }
} else {
// Case 5: All 3 buffers full (playing + ready + decoding) - discard this GOP (its compressed data is freed below, so the frames are dropped)
sys.free(compressedPtr) sys.free(compressedPtr)
} }
} }
@@ -1187,13 +1180,10 @@ try {
const framesInGOP = seqread.readOneByte() const framesInGOP = seqread.readOneByte()
// Ignore - we display frames based on time accumulator, not this packet // Ignore - we display frames based on time accumulator, not this packet
// CRITICAL: Stop reading packets if both buffers are full // CRITICAL: Stop reading packets if all 3 buffers are full
// (one GOP playing + one GOP ready/decoding) // (one GOP playing + ready GOP + decoding GOP)
if (currentGopSize > 0 && nextGopData !== null) { if (currentGopSize > 0 && readyGopData !== null && decodingGopData !== null) {
shouldReadPackets = false shouldReadPackets = false
if (interactive) {
console.log(`[GOP] Both buffers full - stopping packet reading until current GOP finishes`)
}
} }
} }
else if (packetType === TAV_PACKET_AUDIO_MP2) { else if (packetType === TAV_PACKET_AUDIO_MP2) {
@@ -1326,9 +1316,9 @@ try {
// Resume packet reading to get next GOP (only one buffer occupied now) // Resume packet reading to get next GOP (only one buffer occupied now)
shouldReadPackets = true shouldReadPackets = true
if (interactive) { // if (interactive) {
console.log(`[GOP] First GOP ready (slot ${asyncDecodeSlot}, ${asyncDecodeGopSize} frames) in ${decodeTime.toFixed(1)}ms - starting playback`) // console.log(`[GOP] First GOP ready (slot ${asyncDecodeSlot}, ${asyncDecodeGopSize} frames) in ${decodeTime.toFixed(1)}ms - starting playback`)
} // }
// Free compressed data // Free compressed data
sys.free(asyncDecodePtr) sys.free(asyncDecodePtr)
@@ -1374,44 +1364,37 @@ try {
} }
} }
// Step 4 & 7: GOP finished? Wait for background decode, then transition // Step 4-7: GOP finished? Transition to ready GOP (triple-buffering)
if (!paused && currentGopSize > 0 && currentGopFrameIndex >= currentGopSize) { if (!paused && currentGopSize > 0 && currentGopFrameIndex >= currentGopSize) {
if (nextGopData !== null) { if (readyGopData !== null) {
// Wait for background decode to complete // Ready GOP exists - wait for it to finish decoding if still in progress
while (!graphics.tavDecodeGopIsComplete() && !paused) { while (!graphics.tavDecodeGopIsComplete() && !paused) {
sys.sleep(1) sys.sleep(1)
} }
if (!paused) { if (!paused) {
const [r1, r2] = graphics.tavDecodeGopGetResult() const [r1, r2] = graphics.tavDecodeGopGetResult()
decodeTime = (sys.nanoTime() - nextGopData.startTime) / 1000000.0 decodeTime = (sys.nanoTime() - readyGopData.startTime) / 1000000.0
if (interactive) {
const margin = nextGopData.timeRemaining - decodeTime
const status = margin > 0 ? "✓ ON TIME" : "✗ TOO LATE"
console.log(`[GOP] Background decode finished in ${decodeTime.toFixed(1)}ms (margin: ${margin.toFixed(0)}ms) ${status}`)
}
// Free compressed data // Free compressed data
sys.free(nextGopData.compressedPtr) sys.free(readyGopData.compressedPtr)
// Transition to next GOP // Transition to ready GOP
currentGopBufferSlot = 1 - currentGopBufferSlot currentGopBufferSlot = readyGopData.slot
currentGopSize = nextGopData.gopSize currentGopSize = readyGopData.gopSize
currentGopFrameIndex = 0 currentGopFrameIndex = 0
nextGopData = null
// Resume packet reading now that one buffer is free // Promote decoding GOP to ready GOP
readyGopData = decodingGopData
decodingGopData = null
// Resume packet reading now that one buffer is free (decoding slot available)
shouldReadPackets = true shouldReadPackets = true
if (interactive) {
console.log(`[GOP] ✓ SEAMLESS TRANSITION to next GOP (slot ${currentGopBufferSlot}, ${currentGopSize} frames)`)
}
} }
} else { } else {
// No next GOP available, pause playback // No ready GOP available - hiccup (shouldn't happen with triple-buffering)
if (interactive) { if (interactive) {
console.log(`[GOP] ✗ HICCUP - next GOP NOT READY! Playback paused.`) console.log(`[GOP] ✗ HICCUP - ready GOP NOT READY! Playback paused.`)
} }
currentGopSize = 0 currentGopSize = 0
currentGopFrameIndex = 0 currentGopFrameIndex = 0

View File

@@ -1030,9 +1030,9 @@ transmission capability, and region-of-interest coding.
### List of Keys ### List of Keys
- Uint64 BGNT: Video begin time (must be equal to the value of the first Timecode packet) - Uint64 BGNT: Video begin time (must be equal to the value of the first Timecode packet)
- Uint64 ENDT: Video end time (must be equal to the value of the last Timecode packet) - Uint64 ENDT: Video end time (must be equal to the value of the last Timecode packet)
- Uint64 CDAT: Creation time in nanoseconds since UNIX Epoch - Uint64 CDAT: Creation time in nanoseconds since UNIX Epoch (must be in UTC timezone)
- Bytes VNDR: Name and version of the encoder (for Reference encoder: "Encoder-TAV 20251014") - Bytes VNDR: Name and version of the encoder (for Reference encoder: "Encoder-TAV 20251014 (list,of,features)")
- Bytes FMPG: FFmpeg version (typically "ffmpeg version 6.1.2"; the first line of text FFmpeg emits right before the copyright text) - Bytes FMPG: FFmpeg version (typically "ffmpeg version 8.0 Copyright (c) 2000-2025 the FFmpeg developers"; the first line of text FFmpeg emits)
## Standard Metadata Payload Packet Structure ## Standard Metadata Payload Packet Structure
@@ -1062,10 +1062,12 @@ Updated on 2025-10-17 to include canvas expansion margins.
This packet contains multiple frames encoded as a single spacetime block for optimal This packet contains multiple frames encoded as a single spacetime block for optimal
temporal compression. temporal compression.
uint8 Packet Type (0x12) uint8 Packet Type (0x12/0x13)
uint8 GOP Size (number of frames in this GOP, typically 16) uint8 GOP Size (number of frames in this GOP, typically 16)
int16 Motion Vectors X[GOP Size] (quarter-pixel precision for global motion compensation) <if packet type is 0x13>
int16 Motion Vectors Y[GOP Size] (quarter-pixel precision for global motion compensation) uint32 Compressed Size
* Zstd-compressed Motion Data
<endif>
uint32 Compressed Size uint32 Compressed Size
* Zstd-compressed Unified Block Data * Zstd-compressed Unified Block Data

View File

@@ -6662,194 +6662,6 @@ class GraphicsJSR223Delegate(private val vm: VM) {
System.arraycopy(output, 0, frameData, 0, frameData.size) System.arraycopy(output, 0, frameData, 0, frameData.size)
} }
/**
* Main GOP unified decoder function.
* Decodes a unified 3D DWT GOP block (temporal + spatial) and outputs RGB frames.
*
* @param compressedDataPtr Pointer to compressed Zstd data
* @param compressedSize Size of compressed data
* @param gopSize Number of frames in GOP (1-16)
* @param motionVectorsX X motion vectors in 1/16-pixel units
* @param motionVectorsY Y motion vectors in 1/16-pixel units
* @param outputRGBAddrs Array of output RGB buffer addresses
* @param width Original frame width (output dimensions)
* @param height Original frame height (output dimensions)
* @param canvasWidth Expanded canvas width (for motion compensation)
* @param canvasHeight Expanded canvas height (for motion compensation)
* @param marginLeft Left margin to crop from expanded canvas
* @param marginTop Top margin to crop from expanded canvas
* @param qIndex Quality index
* @param qYGlobal Global Y quantizer
* @param qCoGlobal Global Co quantizer
* @param qCgGlobal Global Cg quantizer
* @param channelLayout Channel layout flags
* @param spatialFilter Wavelet filter type
* @param spatialLevels Number of spatial DWT levels (default 6)
* @param temporalLevels Number of temporal DWT levels (default 2)
* @return Number of frames decoded
*/
fun tavDecodeGopUnified(
compressedDataPtr: Long,
compressedSize: Int,
gopSize: Int,
motionVectorsX: IntArray,
motionVectorsY: IntArray,
outputRGBAddrs: LongArray,
width: Int,
height: Int,
canvasWidth: Int,
canvasHeight: Int,
marginLeft: Int,
marginTop: Int,
qIndex: Int,
qYGlobal: Int,
qCoGlobal: Int,
qCgGlobal: Int,
channelLayout: Int,
spatialFilter: Int = 1,
spatialLevels: Int = 6,
temporalLevels: Int = 2,
entropyCoder: Int = 0
): Array<Any> {
val dbgOut = HashMap<String, Any>()
dbgOut["qY"] = qYGlobal
dbgOut["qCo"] = qCoGlobal
dbgOut["qCg"] = qCgGlobal
dbgOut["frameMode"] = "G"
// Use expanded canvas dimensions for DWT processing
val canvasPixels = canvasWidth * canvasHeight
val outputPixels = width * height
// Step 1: Decompress unified GOP block
val compressedData = ByteArray(compressedSize)
UnsafeHelper.memcpyRaw(
null,
vm.usermem.ptr + compressedDataPtr,
compressedData,
UnsafeHelper.getArrayOffset(compressedData),
compressedSize.toLong()
)
val decompressedData = try {
ZstdInputStream(java.io.ByteArrayInputStream(compressedData)).use { zstd ->
zstd.readBytes()
}
} catch (e: Exception) {
println("ERROR: Zstd decompression failed: ${e.message}")
return arrayOf(0, dbgOut)
}
// Step 2: Postprocess unified block to per-frame coefficients (based on header's entropy coder field)
val (isEZBCMode, quantizedCoeffs) = tavPostprocessGopAuto(
decompressedData,
gopSize,
canvasPixels, // Use expanded canvas size
channelLayout,
entropyCoder
)
// Step 3: Allocate GOP buffers for float coefficients (expanded canvas size)
val gopY = Array(gopSize) { FloatArray(canvasPixels) }
val gopCo = Array(gopSize) { FloatArray(canvasPixels) }
val gopCg = Array(gopSize) { FloatArray(canvasPixels) }
// Step 4: Calculate subband layout for expanded canvas (needed for perceptual dequantization)
val subbands = calculateSubbandLayout(canvasWidth, canvasHeight, spatialLevels)
// Step 5: Dequantize with temporal-spatial scaling
for (t in 0 until gopSize) {
val temporalLevel = getTemporalSubbandLevel(t, gopSize, temporalLevels)
val temporalScale = getTemporalQuantizerScale(temporalLevel)
// Apply temporal scaling to base quantizers for each channel
val baseQY = (qYGlobal * temporalScale).coerceIn(1.0f, 4096.0f)
val baseQCo = (qCoGlobal * temporalScale).coerceIn(1.0f, 4096.0f)
val baseQCg = (qCgGlobal * temporalScale).coerceIn(1.0f, 4096.0f)
// Use existing perceptual dequantization for spatial weighting
dequantiseDWTSubbandsPerceptual(
qIndex, qYGlobal,
quantizedCoeffs[t][0], gopY[t],
subbands, baseQY, false, spatialLevels, // isChroma=false
isEZBCMode
)
dequantiseDWTSubbandsPerceptual(
qIndex, qYGlobal,
quantizedCoeffs[t][1], gopCo[t],
subbands, baseQCo, true, spatialLevels, // isChroma=true
isEZBCMode
)
dequantiseDWTSubbandsPerceptual(
qIndex, qYGlobal,
quantizedCoeffs[t][2], gopCg[t],
subbands, baseQCg, true, spatialLevels, // isChroma=true
isEZBCMode
)
}
// Step 6: Apply inverse 3D DWT (spatial first, then temporal) on expanded canvas
tavApplyInverse3DDWT(gopY, canvasWidth, canvasHeight, gopSize, spatialLevels, temporalLevels, spatialFilter)
tavApplyInverse3DDWT(gopCo, canvasWidth, canvasHeight, gopSize, spatialLevels, temporalLevels, spatialFilter)
tavApplyInverse3DDWT(gopCg, canvasWidth, canvasHeight, gopSize, spatialLevels, temporalLevels, spatialFilter)
// Step 7: Apply inverse motion compensation (shift frames back) on expanded canvas
// Note: Motion vectors are in 1/16-pixel units, cumulative relative to frame 0
for (t in 1 until gopSize) { // Skip frame 0 (reference)
val dx = motionVectorsX[t] / 16 // Convert to pixel units
val dy = motionVectorsY[t] / 16
if (dx != 0 || dy != 0) {
applyInverseTranslation(gopY[t], canvasWidth, canvasHeight, dx, dy)
applyInverseTranslation(gopCo[t], canvasWidth, canvasHeight, dx, dy)
applyInverseTranslation(gopCg[t], canvasWidth, canvasHeight, dx, dy)
}
}
// Step 8: Crop expanded canvas to original dimensions and convert to RGB
for (t in 0 until gopSize) {
val rgbAddr = outputRGBAddrs[t]
// Crop from expanded canvas (canvasWidth x canvasHeight) to output (width x height)
for (row in 0 until height) {
for (col in 0 until width) {
// Source pixel in expanded canvas
val canvasX = col + marginLeft
val canvasY = row + marginTop
val canvasIdx = canvasY * canvasWidth + canvasX
// Destination pixel in output buffer
val outIdx = row * width + col
val yVal = gopY[t][canvasIdx]
val co = gopCo[t][canvasIdx]
val cg = gopCg[t][canvasIdx]
// YCoCg-R to RGB conversion
val tmp = yVal - (cg / 2.0f)
val g = cg + tmp
val b = tmp - (co / 2.0f)
val r = b + co
// Clamp to 0-255 range
val rClamped = r.toInt().coerceIn(0, 255)
val gClamped = g.toInt().coerceIn(0, 255)
val bClamped = b.toInt().coerceIn(0, 255)
// Write RGB24 format (3 bytes per pixel)
val offset = rgbAddr + outIdx * 3L
vm.usermem[offset] = rClamped.toByte()
vm.usermem[offset + 1] = gClamped.toByte()
vm.usermem[offset + 2] = bClamped.toByte()
}
}
}
return arrayOf(gopSize, dbgOut)
}
/** /**
* Decode GOP frames directly into GraphicsAdapter.videoBuffer (Java heap). * Decode GOP frames directly into GraphicsAdapter.videoBuffer (Java heap).
* This avoids allocating GOP frames in VM user memory, saving ~6 MB for 8-frame GOPs. * This avoids allocating GOP frames in VM user memory, saving ~6 MB for 8-frame GOPs.
@@ -6864,14 +6676,8 @@ class GraphicsJSR223Delegate(private val vm: VM) {
compressedDataPtr: Long, compressedDataPtr: Long,
compressedSize: Int, compressedSize: Int,
gopSize: Int, gopSize: Int,
motionVectorsX: IntArray,
motionVectorsY: IntArray,
width: Int, width: Int,
height: Int, height: Int,
canvasWidth: Int,
canvasHeight: Int,
marginLeft: Int,
marginTop: Int,
qIndex: Int, qIndex: Int,
qYGlobal: Int, qYGlobal: Int,
qCoGlobal: Int, qCoGlobal: Int,
@@ -6900,7 +6706,6 @@ class GraphicsJSR223Delegate(private val vm: VM) {
} }
// Use expanded canvas dimensions for DWT processing // Use expanded canvas dimensions for DWT processing
val canvasPixels = canvasWidth * canvasHeight
val outputPixels = width * height val outputPixels = width * height
// Step 1: Decompress unified GOP block // Step 1: Decompress unified GOP block
@@ -6926,18 +6731,18 @@ class GraphicsJSR223Delegate(private val vm: VM) {
val (isEZBCMode, quantizedCoeffs) = tavPostprocessGopAuto( val (isEZBCMode, quantizedCoeffs) = tavPostprocessGopAuto(
decompressedData, decompressedData,
gopSize, gopSize,
canvasPixels, outputPixels,
channelLayout, channelLayout,
entropyCoder entropyCoder
) )
// Step 3: Allocate GOP buffers for float coefficients (expanded canvas size) // Step 3: Allocate GOP buffers for float coefficients (expanded canvas size)
val gopY = Array(gopSize) { FloatArray(canvasPixels) } val gopY = Array(gopSize) { FloatArray(outputPixels) }
val gopCo = Array(gopSize) { FloatArray(canvasPixels) } val gopCo = Array(gopSize) { FloatArray(outputPixels) }
val gopCg = Array(gopSize) { FloatArray(canvasPixels) } val gopCg = Array(gopSize) { FloatArray(outputPixels) }
// Step 4: Calculate subband layout for expanded canvas // Step 4: Calculate subband layout for expanded canvas
val subbands = calculateSubbandLayout(canvasWidth, canvasHeight, spatialLevels) val subbands = calculateSubbandLayout(width, height, spatialLevels)
// Step 5: Dequantize with temporal-spatial scaling // Step 5: Dequantize with temporal-spatial scaling
for (t in 0 until gopSize) { for (t in 0 until gopSize) {
@@ -6971,40 +6776,23 @@ class GraphicsJSR223Delegate(private val vm: VM) {
} }
// Step 6: Apply inverse 3D DWT // Step 6: Apply inverse 3D DWT
tavApplyInverse3DDWT(gopY, canvasWidth, canvasHeight, gopSize, spatialLevels, temporalLevels, spatialFilter) tavApplyInverse3DDWT(gopY, width, height, gopSize, spatialLevels, temporalLevels, spatialFilter)
tavApplyInverse3DDWT(gopCo, canvasWidth, canvasHeight, gopSize, spatialLevels, temporalLevels, spatialFilter) tavApplyInverse3DDWT(gopCo, width, height, gopSize, spatialLevels, temporalLevels, spatialFilter)
tavApplyInverse3DDWT(gopCg, canvasWidth, canvasHeight, gopSize, spatialLevels, temporalLevels, spatialFilter) tavApplyInverse3DDWT(gopCg, width, height, gopSize, spatialLevels, temporalLevels, spatialFilter)
// Step 7: Apply inverse motion compensation
for (t in 1 until gopSize) {
val dx = motionVectorsX[t] / 16
val dy = motionVectorsY[t] / 16
if (dx != 0 || dy != 0) {
applyInverseTranslation(gopY[t], canvasWidth, canvasHeight, dx, dy)
applyInverseTranslation(gopCo[t], canvasWidth, canvasHeight, dx, dy)
applyInverseTranslation(gopCg[t], canvasWidth, canvasHeight, dx, dy)
}
}
// Step 8: Crop and convert to RGB, write directly to videoBuffer // Step 8: Crop and convert to RGB, write directly to videoBuffer
for (t in 0 until gopSize) { for (t in 0 until gopSize) {
val videoBufferOffset = bufferOffset + (t * frameSize) // Each frame sequentially, starting at bufferOffset val videoBufferOffset = bufferOffset + (t * frameSize) // Each frame sequentially, starting at bufferOffset
for (row in 0 until height) { for (py in 0 until height) {
for (col in 0 until width) { for (px in 0 until width) {
// Source pixel in expanded canvas
val canvasX = col + marginLeft
val canvasY = row + marginTop
val canvasIdx = canvasY * canvasWidth + canvasX
// Destination pixel in videoBuffer // Destination pixel in videoBuffer
val outIdx = row * width + col val outIdx = py * width + px
val offset = videoBufferOffset + outIdx * 3L val offset = videoBufferOffset + outIdx * 3L
val yVal = gopY[t][canvasIdx] val yVal = gopY[t][outIdx]
val co = gopCo[t][canvasIdx] val co = gopCo[t][outIdx]
val cg = gopCg[t][canvasIdx] val cg = gopCg[t][outIdx]
// YCoCg-R to RGB conversion // YCoCg-R to RGB conversion
val tmp = yVal - (cg / 2.0f) val tmp = yVal - (cg / 2.0f)
@@ -7113,14 +6901,8 @@ class GraphicsJSR223Delegate(private val vm: VM) {
compressedDataPtr: Long, compressedDataPtr: Long,
compressedSize: Int, compressedSize: Int,
gopSize: Int, gopSize: Int,
motionVectorsX: IntArray,
motionVectorsY: IntArray,
width: Int, width: Int,
height: Int, height: Int,
canvasWidth: Int,
canvasHeight: Int,
marginLeft: Int,
marginTop: Int,
qIndex: Int, qIndex: Int,
qYGlobal: Int, qYGlobal: Int,
qCoGlobal: Int, qCoGlobal: Int,
@@ -7128,7 +6910,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
channelLayout: Int, channelLayout: Int,
spatialFilter: Int = 1, spatialFilter: Int = 1,
spatialLevels: Int = 6, spatialLevels: Int = 6,
temporalLevels: Int = 2, temporalLevels: Int = 3,
entropyCoder: Int = 0, entropyCoder: Int = 0,
bufferOffset: Long = 0 bufferOffset: Long = 0
) { ) {
@@ -7144,9 +6926,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
try { try {
val result = tavDecodeGopToVideoBuffer( val result = tavDecodeGopToVideoBuffer(
compressedDataPtr, compressedSize, gopSize, compressedDataPtr, compressedSize, gopSize,
motionVectorsX, motionVectorsY, width, height,
width, height, canvasWidth, canvasHeight,
marginLeft, marginTop,
qIndex, qYGlobal, qCoGlobal, qCgGlobal, qIndex, qYGlobal, qCoGlobal, qCgGlobal,
channelLayout, spatialFilter, spatialLevels, temporalLevels, channelLayout, spatialFilter, spatialLevels, temporalLevels,
entropyCoder, bufferOffset entropyCoder, bufferOffset

View File

@@ -107,7 +107,7 @@ open class GraphicsAdapter(private val assetsRoot: String, val vm: VM, val confi
internal val unusedArea = UnsafeHelper.allocate(1024, this) internal val unusedArea = UnsafeHelper.allocate(1024, this)
internal val scanlineOffsets = UnsafeHelper.allocate(1024, this) internal val scanlineOffsets = UnsafeHelper.allocate(1024, this)
internal val videoBuffer = UnsafeHelper.allocate(32 * 1024 * 1024, this) internal val videoBuffer = UnsafeHelper.allocate(48 * 1024 * 1024, this) // 48 MB for triple-buffering (3 slots × 21 frames × 752 kB)
protected val paletteShader = LoadShader(DRAW_SHADER_VERT, config.paletteShader) protected val paletteShader = LoadShader(DRAW_SHADER_VERT, config.paletteShader)
protected val textShader = LoadShader(DRAW_SHADER_VERT, config.fragShader) protected val textShader = LoadShader(DRAW_SHADER_VERT, config.fragShader)

View File

@@ -18,7 +18,7 @@
#include <float.h> #include <float.h>
#include <fftw3.h> #include <fftw3.h>
#define ENCODER_VENDOR_STRING "Encoder-TAV 20251019" #define ENCODER_VENDOR_STRING "Encoder-TAV 20251022 (3d-dwt,ezbc)"
// TSVM Advanced Video (TAV) format constants // TSVM Advanced Video (TAV) format constants
#define TAV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x41\x56" // "\x1FTSVM TAV" #define TAV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x41\x56" // "\x1FTSVM TAV"
@@ -48,7 +48,7 @@
#define TAV_PACKET_IFRAME 0x10 // Intra frame (keyframe) #define TAV_PACKET_IFRAME 0x10 // Intra frame (keyframe)
#define TAV_PACKET_PFRAME 0x11 // Predicted frame (legacy, unused) #define TAV_PACKET_PFRAME 0x11 // Predicted frame (legacy, unused)
#define TAV_PACKET_GOP_UNIFIED 0x12 // Unified 3D DWT GOP (all frames in single block, translation-based) #define TAV_PACKET_GOP_UNIFIED 0x12 // Unified 3D DWT GOP (all frames in single block, translation-based)
#define TAV_PACKET_GOP_UNIFIED_MESH 0x13 // Unified 3D DWT GOP with distortion mesh warping #define TAV_PACKET_GOP_UNIFIED_MOTION 0x13 // Unified 3D DWT GOP with motion-compensated lifting
#define TAV_PACKET_PFRAME_RESIDUAL 0x14 // P-frame with MPEG-style residual coding (block motion compensation) #define TAV_PACKET_PFRAME_RESIDUAL 0x14 // P-frame with MPEG-style residual coding (block motion compensation)
#define TAV_PACKET_BFRAME_RESIDUAL 0x15 // B-frame with MPEG-style residual coding (bidirectional prediction) #define TAV_PACKET_BFRAME_RESIDUAL 0x15 // B-frame with MPEG-style residual coding (bidirectional prediction)
#define TAV_PACKET_PFRAME_ADAPTIVE 0x16 // P-frame with adaptive quad-tree block partitioning #define TAV_PACKET_PFRAME_ADAPTIVE 0x16 // P-frame with adaptive quad-tree block partitioning
@@ -116,13 +116,15 @@ static int needs_alpha_channel(int channel_layout) {
#define DEFAULT_HEIGHT 448 #define DEFAULT_HEIGHT 448
#define DEFAULT_FPS 30 #define DEFAULT_FPS 30
#define DEFAULT_QUALITY 3 #define DEFAULT_QUALITY 3
#define DEFAULT_ZSTD_LEVEL 9 #define DEFAULT_ZSTD_LEVEL 3
#define TEMPORAL_GOP_SIZE 20//8 // ~42 frames fit into 32 MB video buffer #define TEMPORAL_GOP_SIZE 20
#define TEMPORAL_DECOMP_LEVEL 2 #define TEMPORAL_DECOMP_LEVEL 2
#define MOTION_THRESHOLD 24.0f // Flush if motion exceeds 24 pixels in any direction #define MOTION_THRESHOLD 24.0f // Flush if motion exceeds 24 pixels in any direction
// Audio/subtitle constants (reused from TEV) // Audio/subtitle constants (reused from TEV)
#define MP2_SAMPLE_RATE 32000
#define MP2_DEFAULT_PACKET_SIZE 1152 #define MP2_DEFAULT_PACKET_SIZE 1152
#define PACKET_AUDIO_TIME ((double)MP2_DEFAULT_PACKET_SIZE / MP2_SAMPLE_RATE)
#define MAX_SUBTITLE_LENGTH 2048 #define MAX_SUBTITLE_LENGTH 2048
int debugDumpMade = 0; int debugDumpMade = 0;
@@ -2175,6 +2177,7 @@ static int mp2_packet_size_to_rate_index(int packet_size, int is_mono);
static long write_extended_header(tav_encoder_t *enc); static long write_extended_header(tav_encoder_t *enc);
static void write_timecode_packet(FILE *output, int frame_num, int fps, int is_ntsc_framerate); static void write_timecode_packet(FILE *output, int frame_num, int fps, int is_ntsc_framerate);
static int process_audio(tav_encoder_t *enc, int frame_num, FILE *output); static int process_audio(tav_encoder_t *enc, int frame_num, FILE *output);
static int process_audio_for_gop(tav_encoder_t *enc, int *frame_numbers, int num_frames, FILE *output);
static subtitle_entry_t* parse_subtitle_file(const char *filename, int fps); static subtitle_entry_t* parse_subtitle_file(const char *filename, int fps);
static subtitle_entry_t* parse_srt_file(const char *filename, int fps); static subtitle_entry_t* parse_srt_file(const char *filename, int fps);
static subtitle_entry_t* parse_smi_file(const char *filename, int fps); static subtitle_entry_t* parse_smi_file(const char *filename, int fps);
@@ -2269,7 +2272,7 @@ static void show_usage(const char *program_name) {
printf(" --dump-frame N Dump quantised coefficients for frame N (creates .bin files)\n"); printf(" --dump-frame N Dump quantised coefficients for frame N (creates .bin files)\n");
printf(" --wavelet N Wavelet filter: 0=LGT 5/3, 1=CDF 9/7, 2=CDF 13/7, 16=DD-4, 255=Haar (default: 1)\n"); printf(" --wavelet N Wavelet filter: 0=LGT 5/3, 1=CDF 9/7, 2=CDF 13/7, 16=DD-4, 255=Haar (default: 1)\n");
printf(" --zstd-level N Zstd compression level 1-22 (default: %d, higher = better compression but slower)\n", DEFAULT_ZSTD_LEVEL); printf(" --zstd-level N Zstd compression level 1-22 (default: %d, higher = better compression but slower)\n", DEFAULT_ZSTD_LEVEL);
printf(" --no-grain-synthesis Disable grain synthesis (enabled by default)\n"); // printf(" --no-grain-synthesis Disable grain synthesis (enabled by default)\n");
printf(" --help Show this help\n\n"); printf(" --help Show this help\n\n");
printf("Audio Rate by Quality:\n "); printf("Audio Rate by Quality:\n ");
@@ -2328,7 +2331,7 @@ static tav_encoder_t* create_encoder(void) {
enc->intra_only = 0; enc->intra_only = 0;
enc->monoblock = 1; // Default to monoblock mode enc->monoblock = 1; // Default to monoblock mode
enc->perceptual_tuning = 1; // Default to perceptual quantisation (versions 5/6) enc->perceptual_tuning = 1; // Default to perceptual quantisation (versions 5/6)
enc->enable_ezbc = 0; // Default to twobit-map (EZBC adds overhead for small files) enc->enable_ezbc = 1; // Default to EZBC over twobit-map
enc->channel_layout = CHANNEL_LAYOUT_YCOCG; // Default to Y-Co-Cg enc->channel_layout = CHANNEL_LAYOUT_YCOCG; // Default to Y-Co-Cg
enc->audio_bitrate = 0; // 0 = use quality table enc->audio_bitrate = 0; // 0 = use quality table
enc->encode_limit = 0; // Default: no frame limit enc->encode_limit = 0; // Default: no frame limit
@@ -2339,7 +2342,7 @@ static tav_encoder_t* create_encoder(void) {
enc->delta_haar_levels = TEMPORAL_DECOMP_LEVEL; enc->delta_haar_levels = TEMPORAL_DECOMP_LEVEL;
// GOP / temporal DWT settings // GOP / temporal DWT settings
enc->enable_temporal_dwt = 0; // Default: disabled for backward compatibility. Mutually exclusive with use_delta_encoding enc->enable_temporal_dwt = 1; // Mutually exclusive with use_delta_encoding
enc->temporal_gop_capacity = TEMPORAL_GOP_SIZE; // 16 frames enc->temporal_gop_capacity = TEMPORAL_GOP_SIZE; // 16 frames
enc->temporal_gop_frame_count = 0; enc->temporal_gop_frame_count = 0;
enc->temporal_decomp_levels = TEMPORAL_DECOMP_LEVEL; // 2 levels of temporal DWT (16 -> 4x4 subbands) enc->temporal_decomp_levels = TEMPORAL_DECOMP_LEVEL; // 2 levels of temporal DWT (16 -> 4x4 subbands)
@@ -4826,16 +4829,6 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
memcpy(gop_cg_coeffs[i], enc->temporal_gop_cg_frames[i], num_pixels * sizeof(float)); memcpy(gop_cg_coeffs[i], enc->temporal_gop_cg_frames[i], num_pixels * sizeof(float));
} }
// Debug: Print original frame-to-frame motion vectors
if (enc->verbose && actual_gop_size >= 4) {
printf("Frame-to-frame motion vectors (before cumulative conversion):\n");
for (int i = 0; i < actual_gop_size; i++) {
printf(" Frame %d: 1/16px=(%d, %d) pixels=(%.3f, %.3f)\n",
i, enc->temporal_gop_translation_x[i], enc->temporal_gop_translation_y[i],
enc->temporal_gop_translation_x[i] / 16.0f, enc->temporal_gop_translation_y[i] / 16.0f);
}
}
// Step 0.5: Convert frame-to-frame motion vectors to cumulative (relative to frame 0) // Step 0.5: Convert frame-to-frame motion vectors to cumulative (relative to frame 0)
// Phase correlation computes motion of frame[i] relative to frame[i-1] // Phase correlation computes motion of frame[i] relative to frame[i-1]
// We need cumulative motion relative to frame 0 for proper alignment // We need cumulative motion relative to frame 0 for proper alignment
@@ -4844,16 +4837,6 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
enc->temporal_gop_translation_y[i] += enc->temporal_gop_translation_y[i-1]; enc->temporal_gop_translation_y[i] += enc->temporal_gop_translation_y[i-1];
} }
// Debug: Print cumulative motion vectors
if (enc->verbose && actual_gop_size >= 4) {
printf("Cumulative motion vectors (after conversion):\n");
for (int i = 0; i < actual_gop_size; i++) {
printf(" Frame %d: 1/16px=(%d, %d) pixels=(%.3f, %.3f)\n",
i, enc->temporal_gop_translation_x[i], enc->temporal_gop_translation_y[i],
enc->temporal_gop_translation_x[i] / 16.0f, enc->temporal_gop_translation_y[i] / 16.0f);
}
}
// Step 0.5b: Calculate the valid region after alignment (crop bounds) // Step 0.5b: Calculate the valid region after alignment (crop bounds)
// Find the bounding box that's valid across all aligned frames // Find the bounding box that's valid across all aligned frames
int min_dx = 0, max_dx = 0, min_dy = 0, max_dy = 0; int min_dx = 0, max_dx = 0, min_dy = 0, max_dy = 0;
@@ -5102,6 +5085,9 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
// Write timecode packet for first frame in GOP // Write timecode packet for first frame in GOP
write_timecode_packet(output, frame_numbers[0], enc->output_fps, enc->is_ntsc_framerate); write_timecode_packet(output, frame_numbers[0], enc->output_fps, enc->is_ntsc_framerate);
// Process audio for this GOP (all frames at once)
process_audio_for_gop(enc, frame_numbers, actual_gop_size, output);
// Single-frame GOP fallback: use traditional I-frame encoding with serialise_tile_data // Single-frame GOP fallback: use traditional I-frame encoding with serialise_tile_data
if (actual_gop_size == 1) { if (actual_gop_size == 1) {
// Write I-frame packet header (no motion vectors, no GOP overhead) // Write I-frame packet header (no motion vectors, no GOP overhead)
@@ -5171,10 +5157,11 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
printf("Frame %d (single-frame GOP as I-frame): %zu bytes\n", printf("Frame %d (single-frame GOP as I-frame): %zu bytes\n",
frame_numbers[0], compressed_size); frame_numbers[0], compressed_size);
} }
} else { }
else {
// Multi-frame GOP: use unified 3D DWT encoding // Multi-frame GOP: use unified 3D DWT encoding
// Choose packet type based on motion compensation method // Choose packet type based on motion compensation method
uint8_t packet_type = enc->temporal_enable_mcezbc ? TAV_PACKET_GOP_UNIFIED_MESH : TAV_PACKET_GOP_UNIFIED; uint8_t packet_type = enc->temporal_enable_mcezbc ? TAV_PACKET_GOP_UNIFIED_MOTION : TAV_PACKET_GOP_UNIFIED;
fwrite(&packet_type, 1, 1, output); fwrite(&packet_type, 1, 1, output);
total_bytes_written += 1; total_bytes_written += 1;
@@ -5263,26 +5250,6 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
free(mv_buffer); free(mv_buffer);
free(compressed_mv); free(compressed_mv);
} else {
// Packet 0x12: Translation-based alignment
// Write canvas expansion information (4 bytes)
uint8_t canvas_margins[4] = {
(uint8_t)crop_left, // Left margin
(uint8_t)crop_right, // Right margin
(uint8_t)crop_top, // Top margin
(uint8_t)crop_bottom // Bottom margin
};
fwrite(canvas_margins, 1, 4, output);
total_bytes_written += 4;
// Write all motion vectors (1/16-pixel precision) for the entire GOP
for (int t = 0; t < actual_gop_size; t++) {
int16_t dx = enc->temporal_gop_translation_x[t];
int16_t dy = enc->temporal_gop_translation_y[t];
fwrite(&dx, sizeof(int16_t), 1, output);
fwrite(&dy, sizeof(int16_t), 1, output);
total_bytes_written += 4;
}
} }
// Preprocess ALL frames with unified significance map // Preprocess ALL frames with unified significance map
@@ -8649,13 +8616,8 @@ static int process_audio(tav_encoder_t *enc, int frame_num, FILE *output) {
// Calculate how much audio time each frame represents (in seconds) // Calculate how much audio time each frame represents (in seconds)
double frame_audio_time = 1.0 / enc->output_fps; double frame_audio_time = 1.0 / enc->output_fps;
// Calculate how much audio time each MP2 packet represents
// MP2 frame contains 1152 samples at 32kHz = 0.036 seconds
#define MP2_SAMPLE_RATE 32000
double packet_audio_time = 1152.0 / MP2_SAMPLE_RATE;
// Estimate how many packets we consume per video frame // Estimate how many packets we consume per video frame
double packets_per_frame = frame_audio_time / packet_audio_time; double packets_per_frame = frame_audio_time / PACKET_AUDIO_TIME;
// Allocate MP2 buffer if needed // Allocate MP2 buffer if needed
if (!enc->mp2_buffer) { if (!enc->mp2_buffer) {
@@ -8683,24 +8645,20 @@ static int process_audio(tav_encoder_t *enc, int frame_num, FILE *output) {
// Calculate how many packets we need to maintain target buffer level // Calculate how many packets we need to maintain target buffer level
// Only insert when buffer drops below target, and only insert enough to restore target // Only insert when buffer drops below target, and only insert enough to restore target
double target_level = (double)enc->target_audio_buffer_size; double target_level = fmax(packets_per_frame, (double)enc->target_audio_buffer_size);
if (enc->audio_frames_in_buffer < target_level) { // if (enc->audio_frames_in_buffer < target_level) {
double deficit = target_level - enc->audio_frames_in_buffer; double deficit = target_level - enc->audio_frames_in_buffer;
// Insert packets to cover the deficit, but at least maintain minimum flow // Insert packets to cover the deficit, but at least maintain minimum flow
packets_to_insert = (int)ceil(deficit); packets_to_insert = (int)ceil(deficit);
// Cap at reasonable maximum to prevent excessive insertion
if (packets_to_insert > enc->target_audio_buffer_size) {
packets_to_insert = enc->target_audio_buffer_size;
}
if (enc->verbose) { if (enc->verbose) {
printf("Frame %d: Buffer low (%.2f->%.2f), deficit %.2f, inserting %d packets\n", printf("Frame %d: Buffer low (%.2f->%.2f), deficit %.2f, inserting %d packets\n",
frame_num, old_buffer, enc->audio_frames_in_buffer, deficit, packets_to_insert); frame_num, old_buffer, enc->audio_frames_in_buffer, deficit, packets_to_insert);
} }
} else if (enc->verbose && old_buffer != enc->audio_frames_in_buffer) { // } else if (enc->verbose && old_buffer != enc->audio_frames_in_buffer) {
printf("Frame %d: Buffer sufficient (%.2f->%.2f), no packets\n", // printf("Frame %d: Buffer sufficient (%.2f->%.2f), no packets\n",
frame_num, old_buffer, enc->audio_frames_in_buffer); // frame_num, old_buffer, enc->audio_frames_in_buffer);
} // }
} }
// Insert the calculated number of audio packets // Insert the calculated number of audio packets
@@ -8737,6 +8695,96 @@ static int process_audio(tav_encoder_t *enc, int frame_num, FILE *output) {
return 1; return 1;
} }
// Process audio for a GOP (multiple frames at once)
// Accumulates deficit for N frames and emits all necessary audio packets
static int process_audio_for_gop(tav_encoder_t *enc, int *frame_numbers, int num_frames, FILE *output) {
if (!enc->has_audio || !enc->mp2_file || enc->audio_remaining <= 0 || num_frames == 0) {
return 1;
}
// Handle first frame initialization (same as process_audio)
int first_frame_in_gop = frame_numbers[0];
if (first_frame_in_gop == 0) {
uint8_t header[4];
if (fread(header, 1, 4, enc->mp2_file) != 4) return 1;
fseek(enc->mp2_file, 0, SEEK_SET);
enc->mp2_packet_size = get_mp2_packet_size(header);
int is_mono = (header[3] >> 6) == 3;
enc->mp2_rate_index = mp2_packet_size_to_rate_index(enc->mp2_packet_size, is_mono);
enc->target_audio_buffer_size = 4; // 4 audio packets in buffer (does nothing for GOP)
enc->audio_frames_in_buffer = 0.0;
}
// Calculate audio packet consumption per video frame
double frame_audio_time = 1.0 / enc->output_fps;
double packets_per_frame = frame_audio_time / PACKET_AUDIO_TIME;
// Allocate MP2 buffer if needed
if (!enc->mp2_buffer) {
enc->mp2_buffer_size = enc->mp2_packet_size * 2;
enc->mp2_buffer = malloc(enc->mp2_buffer_size);
if (!enc->mp2_buffer) {
fprintf(stderr, "Failed to allocate audio buffer\n");
return 1;
}
}
// Calculate total deficit for all frames in the GOP
int total_packets_to_insert = 0;
// Simulate buffer consumption for all N frames in the GOP
double old_buffer = enc->audio_frames_in_buffer;
enc->audio_frames_in_buffer -= (packets_per_frame * num_frames);
// Calculate deficit to restore buffer to target level
// double target_level = fmax(packets_per_frame, (double)enc->target_audio_buffer_size);
// if (enc->audio_frames_in_buffer < target_level) {
double deficit = packets_per_frame * num_frames;
total_packets_to_insert = CLAMP((int)round(deficit), enc->target_audio_buffer_size, 9999);
if (enc->verbose) {
printf("GOP (%d frames, starting at %d): Buffer low (%.2f->%.2f), deficit %.2f, inserting %d packets\n",
num_frames, first_frame_in_gop, old_buffer, enc->audio_frames_in_buffer, deficit, total_packets_to_insert);
}
// } else if (enc->verbose) {
// printf("GOP (%d frames, starting at %d): Buffer sufficient (%.2f->%.2f), no packets\n",
// num_frames, first_frame_in_gop, old_buffer, enc->audio_frames_in_buffer);
// }
// Emit all audio packets for this GOP
for (int q = 0; q < total_packets_to_insert; q++) {
size_t bytes_to_read = enc->mp2_packet_size;
if (bytes_to_read > enc->audio_remaining) {
bytes_to_read = enc->audio_remaining;
}
size_t bytes_read = fread(enc->mp2_buffer, 1, bytes_to_read, enc->mp2_file);
if (bytes_read == 0) break;
// Write TAV MP2 audio packet
uint8_t audio_packet_type = TAV_PACKET_AUDIO_MP2;
uint32_t audio_len = (uint32_t)bytes_read;
fwrite(&audio_packet_type, 1, 1, output);
fwrite(&audio_len, 4, 1, output);
fwrite(enc->mp2_buffer, 1, bytes_read, output);
// Track audio bytes written
enc->audio_remaining -= bytes_read;
enc->audio_frames_in_buffer++;
if (first_frame_in_gop == 0) {
enc->audio_frames_in_buffer = enc->target_audio_buffer_size / 2;
}
if (enc->verbose) {
printf("Audio packet %d: %zu bytes (buffer: %.2f packets)\n",
q, bytes_read, enc->audio_frames_in_buffer);
}
}
return 1;
}
// Process subtitles for current frame (copied and adapted from TEV) // Process subtitles for current frame (copied and adapted from TEV)
static int process_subtitles(tav_encoder_t *enc, int frame_num, FILE *output) { static int process_subtitles(tav_encoder_t *enc, int frame_num, FILE *output) {
if (!enc->subtitles) { if (!enc->subtitles) {
@@ -9834,20 +9882,16 @@ int main(int argc, char *argv[]) {
adjust_quantiser_for_bitrate(enc); adjust_quantiser_for_bitrate(enc);
} }
// For GOP encoding, process audio/subtitles for all frames in the flushed GOP // For GOP encoding, audio/subtitles are handled in gop_flush() for all GOP frames
// For traditional encoding, process audio/subtitles for this single frame // For traditional encoding, process audio/subtitles for this single frame
if (enc->enable_temporal_dwt) { if (!enc->enable_temporal_dwt) {
// Note: In GOP mode, audio/subtitle sync is approximate since we flush multiple frames at once // Process audio for this frame
// This is acceptable since GOPs are short (16 frames max = ~0.5s at 30fps) process_audio(enc, true_frame_count, enc->output_fp);
// TODO: Consider buffering audio/subtitles for precise sync if needed
// Process subtitles for this frame
process_subtitles(enc, true_frame_count, enc->output_fp);
} }
// Process audio for this frame
process_audio(enc, true_frame_count, enc->output_fp);
// Process subtitles for this frame
process_subtitles(enc, true_frame_count, enc->output_fp);
// Write a sync packet only after a video is been coded // Write a sync packet only after a video is been coded
// For GOP encoding, GOP_SYNC packet already serves as sync - don't emit extra SYNC // For GOP encoding, GOP_SYNC packet already serves as sync - don't emit extra SYNC
// For B-frame mode, sync packets are already written in the encoding loop // For B-frame mode, sync packets are already written in the encoding loop
@@ -9857,7 +9901,8 @@ int main(int argc, char *argv[]) {
} }
// NTSC frame duplication: emit extra sync packet for every 1000n+500 frames // NTSC frame duplication: emit extra sync packet for every 1000n+500 frames
if (enc->is_ntsc_framerate && (frame_count % 1000 == 500)) { // Skip when temporal DWT is enabled (audio handled in GOP flush)
if (!enc->enable_temporal_dwt && enc->is_ntsc_framerate && (frame_count % 1000 == 500)) {
true_frame_count++; true_frame_count++;
// Process audio and subtitles for the duplicated frame to maintain sync // Process audio and subtitles for the duplicated frame to maintain sync
process_audio(enc, true_frame_count, enc->output_fp); process_audio(enc, true_frame_count, enc->output_fp);

View File

@@ -18,6 +18,11 @@
#define TAV_PACKET_IFRAME 0x10 #define TAV_PACKET_IFRAME 0x10
#define TAV_PACKET_PFRAME 0x11 #define TAV_PACKET_PFRAME 0x11
#define TAV_PACKET_GOP_UNIFIED 0x12 // Unified 3D DWT GOP (all frames in single block) #define TAV_PACKET_GOP_UNIFIED 0x12 // Unified 3D DWT GOP (all frames in single block)
#define TAV_PACKET_GOP_UNIFIED_MOTION 0x13
#define TAV_PACKET_PFRAME_RESIDUAL 0x14 // P-frame with MPEG-style residual coding (block motion compensation)
#define TAV_PACKET_BFRAME_RESIDUAL 0x15 // B-frame with MPEG-style residual coding (bidirectional prediction)
#define TAV_PACKET_PFRAME_ADAPTIVE 0x16 // P-frame with adaptive quad-tree block partitioning
#define TAV_PACKET_BFRAME_ADAPTIVE 0x17 // B-frame with adaptive quad-tree block partitioning (bidirectional prediction)
#define TAV_PACKET_AUDIO_MP2 0x20 #define TAV_PACKET_AUDIO_MP2 0x20
#define TAV_PACKET_SUBTITLE 0x30 #define TAV_PACKET_SUBTITLE 0x30
#define TAV_PACKET_SUBTITLE_KAR 0x31 #define TAV_PACKET_SUBTITLE_KAR 0x31
@@ -59,6 +64,7 @@ typedef struct {
int pframe_delta_count; int pframe_delta_count;
int pframe_skip_count; int pframe_skip_count;
int gop_unified_count; int gop_unified_count;
int gop_unified_motion_count;
int gop_sync_count; int gop_sync_count;
int total_gop_frames; int total_gop_frames;
int audio_count; int audio_count;
@@ -94,6 +100,11 @@ const char* get_packet_type_name(uint8_t type) {
case TAV_PACKET_IFRAME: return "I-FRAME"; case TAV_PACKET_IFRAME: return "I-FRAME";
case TAV_PACKET_PFRAME: return "P-FRAME"; case TAV_PACKET_PFRAME: return "P-FRAME";
case TAV_PACKET_GOP_UNIFIED: return "GOP (3D DWT Unified)"; case TAV_PACKET_GOP_UNIFIED: return "GOP (3D DWT Unified)";
case TAV_PACKET_GOP_UNIFIED_MOTION: return "GOP (3D DWT Unified with Motion Data)";
case TAV_PACKET_PFRAME_RESIDUAL: return "P-FRAME (residual)";
case TAV_PACKET_BFRAME_RESIDUAL: return "B-FRAME (residual)";
case TAV_PACKET_PFRAME_ADAPTIVE: return "P-FRAME (quadtree)";
case TAV_PACKET_BFRAME_ADAPTIVE: return "B-FRAME (quadtree)";
case TAV_PACKET_AUDIO_MP2: return "AUDIO MP2"; case TAV_PACKET_AUDIO_MP2: return "AUDIO MP2";
case TAV_PACKET_SUBTITLE: return "SUBTITLE (Simple)"; case TAV_PACKET_SUBTITLE: return "SUBTITLE (Simple)";
case TAV_PACKET_SUBTITLE_KAR: return "SUBTITLE (Karaoke)"; case TAV_PACKET_SUBTITLE_KAR: return "SUBTITLE (Karaoke)";
@@ -246,9 +257,10 @@ void print_extended_header(FILE *fp, int verbose) {
if (verbose) { if (verbose) {
if (strcmp(key, "CDAT") == 0) { if (strcmp(key, "CDAT") == 0) {
time_t time_sec = value / 1000000000ULL; time_t time_sec = value / 1000000000ULL;
char *time_str = ctime(&time_sec); struct tm *time_info = gmtime(&time_sec);
if (time_str) { if (time_info) {
time_str[strlen(time_str)-1] = '\0'; // Remove newline char time_str[64];
strftime(time_str, sizeof(time_str), "%a %b %d %H:%M:%S %Y UTC", time_info);
printf("%s", time_str); printf("%s", time_str);
} }
} else { } else {
@@ -484,48 +496,37 @@ int main(int argc, char *argv[]) {
break; break;
} }
case TAV_PACKET_GOP_UNIFIED: { case TAV_PACKET_GOP_UNIFIED: case TAV_PACKET_GOP_UNIFIED_MOTION: {
// Unified GOP packet: [gop_size][motion_vectors...][compressed_size][data] // Unified GOP packet: [gop_size][motion_vectors...][compressed_size][data]
uint8_t gop_size; uint8_t gop_size;
if (fread(&gop_size, 1, 1, fp) != 1) break; if (fread(&gop_size, 1, 1, fp) != 1) break;
// Read all motion vectors // Read motion vectors
int16_t *motion_x = malloc(gop_size * sizeof(int16_t)); uint32_t size0 = 0;
int16_t *motion_y = malloc(gop_size * sizeof(int16_t)); if (packet_type == TAV_PACKET_GOP_UNIFIED_MOTION) {
for (int i = 0; i < gop_size; i++) { if (fread(&size0, sizeof(uint32_t), 1, fp) != 1) { break; }
if (fread(&motion_x[i], sizeof(int16_t), 1, fp) != 1) break; stats.total_video_bytes += size0;
if (fread(&motion_y[i], sizeof(int16_t), 1, fp) != 1) break; stats.gop_unified_motion_count++;
fseek(fp, size0, SEEK_CUR);
} }
// Read compressed data size // Read compressed data size
uint32_t size; uint32_t size1;
if (fread(&size, sizeof(uint32_t), 1, fp) != 1) { if (fread(&size1, sizeof(uint32_t), 1, fp) != 1) { break; }
free(motion_x); stats.total_video_bytes += size1;
free(motion_y); fseek(fp, size1, SEEK_CUR);
break;
}
stats.total_video_bytes += size;
stats.gop_unified_count++;
stats.total_gop_frames += gop_size; stats.total_gop_frames += gop_size;
if (packet_type == TAV_PACKET_GOP_UNIFIED) {
stats.gop_unified_count++;
}
if (!opts.summary_only && display) { if (!opts.summary_only && display) {
printf(" - GOP size=%u, data size=%u bytes (%.2f bytes/frame)", printf(" - GOP size=%u, data size=%u bytes (%.2f bytes/frame)",
gop_size, size, (double)size / gop_size); gop_size, (size0 + size1), (double)(size0 + size1) / gop_size);
// Always show motion vectors for GOP packets with absolute frame numbers
if (gop_size > 0) {
printf("\n Motion vectors (1/16-pixel):");
for (int i = 0; i < gop_size; i++) {
printf("\n Frame %d (#%d): (%.3f, %.3f) px",
current_frame + i, i, motion_x[i] / 16.0, motion_y[i] / 16.0);
}
}
} }
free(motion_x);
free(motion_y);
fseek(fp, size, SEEK_CUR);
break; break;
} }
@@ -714,10 +715,10 @@ int main(int argc, char *argv[]) {
printf(")"); printf(")");
} }
printf("\n"); printf("\n");
if (stats.gop_unified_count > 0) { if (stats.gop_unified_count + stats.gop_unified_motion_count > 0) {
printf(" 3D GOP packets: %d (total frames: %d, avg %.1f frames/GOP)\n", printf(" 3D GOP packets: %d (total frames: %d, avg %.1f frames/GOP)\n",
stats.gop_unified_count, stats.total_gop_frames, (stats.gop_unified_count + stats.gop_unified_motion_count), stats.total_gop_frames,
(double)stats.total_gop_frames / stats.gop_unified_count); (double)stats.total_gop_frames / (stats.gop_unified_count + stats.gop_unified_motion_count));
printf(" GOP sync packets: %d\n", stats.gop_sync_count); printf(" GOP sync packets: %d\n", stats.gop_sync_count);
} }
printf(" Mux video: %d\n", stats.mux_video_count); printf(" Mux video: %d\n", stats.mux_video_count);