From 4eec98cdca18d54e99f0750f039fd421f26842d7 Mon Sep 17 00:00:00 2001
From: minjaesong <alswo9628@gmail.com>
Date: Wed, 22 Oct 2025 01:32:19 +0900
Subject: [PATCH] TAV: half-fixed 3d dwt playback

---
 assets/disk0/tvdos/bin/playtav.js             | 209 +++++++--------
 terranmon.txt                                 |  14 +-
 .../torvald/tsvm/GraphicsJSR223Delegate.kt    | 252 ++----------------
 .../tsvm/peripheral/GraphicsAdapter.kt        |   2 +-
 video_encoder/encoder_tav.c                   | 199 ++++++++------
 video_encoder/tav_inspector.c                 |  69 ++---
 6 files changed, 278 insertions(+), 467 deletions(-)

diff --git a/assets/disk0/tvdos/bin/playtav.js b/assets/disk0/tvdos/bin/playtav.js
index 819ecda..0480bcc 100644
--- a/assets/disk0/tvdos/bin/playtav.js
+++ b/assets/disk0/tvdos/bin/playtav.js
@@ -355,11 +355,12 @@ let decodeHeight = isInterlaced ? (header.height >> 1) : header.height
 const FRAME_PIXELS = header.width * header.height
 const FRAME_SIZE = FRAME_PIXELS * 3  // RGB buffer size
 
-// Double-buffering: Fixed slot sizes in videoBuffer (32 MB total)
-const MAX_GOP_SIZE = 21  // Maximum frames per slot (21 * 752KB = ~15MB per slot)
+// Triple-buffering: Fixed slot sizes in videoBuffer (48 MB total)
+const BUFFER_SLOTS = 3  // Three slots: playing, ready, decoding
+const MAX_GOP_SIZE = 21  // Maximum frames per slot (21 * 752KB = ~15.8MB per slot)
 const SLOT_SIZE = MAX_GOP_SIZE * FRAME_SIZE  // Fixed slot size regardless of actual GOP size
 
-console.log(`Double-buffering: Max ${MAX_GOP_SIZE} frames/slot, ${(SLOT_SIZE / 1048576).toFixed(1)}MB per slot`)
+console.log(`Triple-buffering: ${BUFFER_SLOTS} slots, max ${MAX_GOP_SIZE} frames/slot, ${(SLOT_SIZE / 1048576).toFixed(1)}MB per slot`)
 
 const RGB_BUFFER_A = sys.malloc(FRAME_SIZE)
 const RGB_BUFFER_B = sys.malloc(FRAME_SIZE)
@@ -484,17 +485,18 @@ let currentFileIndex = 1  // Track which file we're playing in concatenated stre
 let totalFilesProcessed = 0
 let decoderDbgInfo = {}
 
-// GOP double-buffering state
-let currentGopBufferSlot = 0  // Which buffer slot is currently being displayed (0 or 1)
+// GOP triple-buffering state (3 slots: playing, ready, decoding)
+let currentGopBufferSlot = 0  // Which buffer slot is currently being displayed (0, 1, or 2)
 let currentGopSize = 0         // Number of frames in current GOP being displayed
 let currentGopFrameIndex = 0   // Which frame of current GOP we're displaying
-let nextGopData = null         // Buffered next GOP packet data for background decode
+let readyGopData = null        // GOP that's already decoded and ready to play (next in line)
+let decodingGopData = null     // GOP currently being decoded in background
 let asyncDecodeInProgress = false  // Track if async decode is running
 let asyncDecodeSlot = 0        // Which slot the async decode is targeting
 let asyncDecodeGopSize = 0     // Size of GOP being decoded async
 let asyncDecodePtr = 0         // Compressed data pointer to free after decode
 let asyncDecodeStartTime = 0   // When async decode started (for diagnostics)
-let shouldReadPackets = true   // Gate packet reading: false when both buffers are full
+let shouldReadPackets = true   // Gate packet reading: false when all 3 buffers are full
 
 let cueElements = []
 let currentCueIndex = -1  // Track current cue position
@@ -510,12 +512,19 @@ function cleanupAsyncDecode() {
         asyncDecodeGopSize = 0
     }
 
-    // Free background GOP decode memory if in progress
-    if (nextGopData !== null && nextGopData.compressedPtr && nextGopData.compressedPtr !== 0) {
-        sys.free(nextGopData.compressedPtr)
-        nextGopData.compressedPtr = 0
+    // Free ready GOP memory if present
+    if (readyGopData !== null && readyGopData.compressedPtr && readyGopData.compressedPtr !== 0) {
+        sys.free(readyGopData.compressedPtr)
+        readyGopData.compressedPtr = 0
     }
-    nextGopData = null
+    readyGopData = null
+
+    // Free decoding GOP memory if present
+    if (decodingGopData !== null && decodingGopData.compressedPtr && decodingGopData.compressedPtr !== 0) {
+        sys.free(decodingGopData.compressedPtr)
+        decodingGopData.compressedPtr = 0
+    }
+    decodingGopData = null
 
     // Reset GOP playback state
     currentGopSize = 0
@@ -751,7 +760,10 @@ let paused = false
 try {
     let t1 = sys.nanoTime()
 
-    while (!stopPlay && seqread.getReadCount() < FILE_LENGTH) {
+    // Continue loop while:
+    // 1. Reading packets (not EOF yet), OR
+    // 2. There are buffered GOPs to play (after EOF)
+    while (!stopPlay && (seqread.getReadCount() < FILE_LENGTH || currentGopSize > 0 || readyGopData !== null || decodingGopData !== null || asyncDecodeInProgress)) {
 
 
         // Handle interactive controls
@@ -866,9 +878,10 @@ try {
         }
 
         // GATED PACKET READING
-        // Stop reading when both buffers are full (GOP playing + GOP decoding/ready)
+        // Stop reading when all 3 buffers are full (GOP playing + ready GOP + decoding GOP)
         // Resume reading when GOP finishes (one buffer becomes free)
-        if (shouldReadPackets && !paused) {
+        // Also stop reading at EOF
+        if (shouldReadPackets && !paused && seqread.getReadCount() < FILE_LENGTH) {
             // Read packet header (record position before reading for I-frame tracking)
             let packetOffset = seqread.getReadCount()
             var packetType = seqread.readOneByte()
@@ -1051,32 +1064,15 @@ try {
 
                 // Read GOP packet data
                 const gopSize = seqread.readOneByte()
-                const marginLeft = seqread.readOneByte()
-                const marginRight = seqread.readOneByte()
-                const marginTop = seqread.readOneByte()
-                const marginBottom = seqread.readOneByte()
-
-                const canvasWidth = header.width + marginLeft + marginRight
-                const canvasHeight = header.height + marginTop + marginBottom
-
-                // Read motion vectors (1/16-pixel units, int16)
-                let motionX = new Array(gopSize)
-                let motionY = new Array(gopSize)
-
-                for (let i = 0; i < gopSize; i++) {
-                    let mx = seqread.readShort()
-                    let my = seqread.readShort()
-                    motionX[i] = (mx > 32767) ? (mx - 65536) : mx
-                    motionY[i] = (my > 32767) ? (my - 65536) : my
-                }
-
                 const compressedSize = seqread.readInt()
                 let compressedPtr = seqread.readBytes(compressedSize)
                 updateDataRateBin(compressedSize)
 
-                // DOUBLE-BUFFERING LOGIC:
-                // - If no GOP is currently playing: decode immediately to current slot
-                // - Otherwise: buffer this GOP for decode during next GOP's playback
+                // TRIPLE-BUFFERING LOGIC (3 slots: playing, ready, decoding):
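+                // - If no GOP playing and no decode in progress: decode first GOP to the current slot (currentGopBufferSlot)
+                // - If GOP playing but no ready GOP: decode to ready slot (next in rotation)
+                // - If GOP playing and ready GOP exists but no decoding GOP: decode to decoding slot (two ahead in rotation)
+                // - Otherwise (all 3 buffers full, or a decode is still running): free the packet data without decoding it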
 
                 // Check GOP size fits in slot
                 if (gopSize > MAX_GOP_SIZE) {
@@ -1086,11 +1082,11 @@ try {
                 }
 
                 if (currentGopSize === 0 && !asyncDecodeInProgress) {
-                    // No active GOP and no decode in progress: decode asynchronously and start playback when ready
+                    // Case 1: No active GOP and no decode in progress - decode first GOP
                     const bufferSlot = currentGopBufferSlot
                     const bufferOffset = bufferSlot * SLOT_SIZE
 
-                    // Defensive: free any old async decode memory (shouldn't happen but be safe)
+                    // Defensive: free any old async decode memory
                     if (asyncDecodePtr !== 0) {
                         sys.free(asyncDecodePtr)
                         asyncDecodePtr = 0
@@ -1099,10 +1095,7 @@ try {
                     // Start async decode
                     graphics.tavDecodeGopToVideoBufferAsync(
                         compressedPtr, compressedSize, gopSize,
-                        motionX, motionY,
                         header.width, header.height,
-                        canvasWidth, canvasHeight,
-                        marginLeft, marginTop,
                         header.qualityLevel,
                         QLUT[header.qualityY], QLUT[header.qualityCo], QLUT[header.qualityCg],
                         header.channelLayout,
@@ -1114,49 +1107,25 @@ try {
                     asyncDecodeInProgress = true
                     asyncDecodeSlot = bufferSlot
                     asyncDecodeGopSize = gopSize
-                    asyncDecodePtr = compressedPtr  // Will free after decode completes
+                    asyncDecodePtr = compressedPtr
                     asyncDecodeStartTime = sys.nanoTime()
 
-                    // Note: compressedPtr will be freed after decode completes
-                    // We'll check for completion in main loop and start playback then
-                    if (interactive) {
-                        console.log(`[GOP] Started async decode of first GOP (slot ${bufferSlot}, ${gopSize} frames)`)
-                    }
                 } else if (currentGopSize === 0 && asyncDecodeInProgress) {
-                    // First GOP still decoding but another arrived - ignore it to avoid cancelling first GOP
-                    if (interactive) {
-                        console.log(`[GOP] Warning: GOP arrived while first GOP still decoding - ignoring to avoid cancellation`)
-                    }
+                    // Case 2: First GOP still decoding - ignore to avoid cancellation
                     sys.free(compressedPtr)
-                } else if (currentGopSize > 0 && !asyncDecodeInProgress) {
-                    // GOP is playing and first GOP decode is done: decode this one to other slot in background (async)
-                    const nextSlot = 1 - currentGopBufferSlot
+
+                } else if (currentGopSize > 0 && readyGopData === null && !asyncDecodeInProgress && graphics.tavDecodeGopIsComplete()) {
+                    // Case 3: GOP playing, no ready GOP, no decode in progress - decode to ready slot
+                    const nextSlot = (currentGopBufferSlot + 1) % BUFFER_SLOTS
                     const nextOffset = nextSlot * SLOT_SIZE
 
-                    // DIAGNOSTIC: Measure background decode timing
                     const framesRemaining = currentGopSize - currentGopFrameIndex
-                    const timeRemaining = framesRemaining * FRAME_TIME * 1000.0  // milliseconds
+                    const timeRemaining = framesRemaining * FRAME_TIME * 1000.0
 
-                    // If previous GOP still decoding, free its memory (will be overwritten)
-                    if (nextGopData !== null && !nextGopData.decoded && nextGopData.compressedPtr && nextGopData.compressedPtr !== 0) {
-                        if (interactive) {
-                            console.log(`[GOP] Warning: New GOP arrived before previous decode completed - freeing old data`)
-                        }
-                        sys.free(nextGopData.compressedPtr)
-                        nextGopData.compressedPtr = 0
-                    }
-
-                    if (interactive) {
-                        console.log(`[GOP] Background decode started: frame ${currentGopFrameIndex}/${currentGopSize}, ${framesRemaining} frames (${timeRemaining.toFixed(0)}ms) remaining`)
-                    }
-
-                    // Start async background decode
+                    // Start async decode to ready slot
                     graphics.tavDecodeGopToVideoBufferAsync(
                         compressedPtr, compressedSize, gopSize,
-                        motionX, motionY,
                         header.width, header.height,
-                        canvasWidth, canvasHeight,
-                        marginLeft, marginTop,
                         header.qualityLevel,
                         QLUT[header.qualityY], QLUT[header.qualityCo], QLUT[header.qualityCg],
                         header.channelLayout,
@@ -1165,20 +1134,44 @@ try {
                         nextOffset
                     )
 
-                    // Mark as decoding (will check completion in main loop)
-                    nextGopData = {
+                    readyGopData = {
                         gopSize: gopSize,
-                        decoded: false,  // Will be set to true when async decode completes
                         slot: nextSlot,
-                        compressedPtr: compressedPtr,  // Will free after decode completes
+                        compressedPtr: compressedPtr,
                         startTime: sys.nanoTime(),
                         timeRemaining: timeRemaining
                     }
-                } else {
-                    // Fallback: unexpected state, just free the memory
-                    if (interactive) {
-                        console.log(`[GOP] Warning: Unexpected state - currentGopSize=${currentGopSize}, asyncDecodeInProgress=${asyncDecodeInProgress} - freeing GOP data`)
+
+                } else if (currentGopSize > 0 && readyGopData !== null && decodingGopData === null && !asyncDecodeInProgress && graphics.tavDecodeGopIsComplete()) {
+                    // Case 4: GOP playing, ready GOP exists, no decoding GOP, no decode in progress - decode to decoding slot
+                    const decodingSlot = (currentGopBufferSlot + 2) % BUFFER_SLOTS
+                    const decodingOffset = decodingSlot * SLOT_SIZE
+
+                    const framesRemaining = currentGopSize - currentGopFrameIndex
+                    const timeRemaining = framesRemaining * FRAME_TIME * 1000.0
+
+                    // Start async decode to decoding slot
+                    graphics.tavDecodeGopToVideoBufferAsync(
+                        compressedPtr, compressedSize, gopSize,
+                        header.width, header.height,
+                        header.qualityLevel,
+                        QLUT[header.qualityY], QLUT[header.qualityCo], QLUT[header.qualityCg],
+                        header.channelLayout,
+                        header.waveletFilter, header.decompLevels, 2,
+                        header.entropyCoder,
+                        decodingOffset
+                    )
+
+                    decodingGopData = {
+                        gopSize: gopSize,
+                        slot: decodingSlot,
+                        compressedPtr: compressedPtr,
+                        startTime: sys.nanoTime(),
+                        timeRemaining: timeRemaining
                     }
+
+                } else {
+                    // Case 5: All 3 buffers full (playing + ready + decoding), or a background decode is still running - free packet data without decoding
                     sys.free(compressedPtr)
                 }
             }
@@ -1187,13 +1180,10 @@ try {
                 const framesInGOP = seqread.readOneByte()
                 // Ignore - we display frames based on time accumulator, not this packet
 
-                // CRITICAL: Stop reading packets if both buffers are full
-                // (one GOP playing + one GOP ready/decoding)
-                if (currentGopSize > 0 && nextGopData !== null) {
+                // CRITICAL: Stop reading packets if all 3 buffers are full
+                // (one GOP playing + ready GOP + decoding GOP)
+                if (currentGopSize > 0 && readyGopData !== null && decodingGopData !== null) {
                     shouldReadPackets = false
-                    if (interactive) {
-                        console.log(`[GOP] Both buffers full - stopping packet reading until current GOP finishes`)
-                    }
                 }
             }
             else if (packetType === TAV_PACKET_AUDIO_MP2) {
@@ -1326,9 +1316,9 @@ try {
                 // Resume packet reading to get next GOP (only one buffer occupied now)
                 shouldReadPackets = true
 
-                if (interactive) {
-                    console.log(`[GOP] First GOP ready (slot ${asyncDecodeSlot}, ${asyncDecodeGopSize} frames) in ${decodeTime.toFixed(1)}ms - starting playback`)
-                }
+//                if (interactive) {
+//                    console.log(`[GOP] First GOP ready (slot ${asyncDecodeSlot}, ${asyncDecodeGopSize} frames) in ${decodeTime.toFixed(1)}ms - starting playback`)
+//                }
 
                 // Free compressed data
                 sys.free(asyncDecodePtr)
@@ -1374,44 +1364,37 @@ try {
             }
         }
 
-        // Step 4 & 7: GOP finished? Wait for background decode, then transition
+        // Step 4-7: GOP finished? Transition to ready GOP (triple-buffering)
         if (!paused && currentGopSize > 0 && currentGopFrameIndex >= currentGopSize) {
-            if (nextGopData !== null) {
-                // Wait for background decode to complete
+            if (readyGopData !== null) {
+                // Ready GOP exists - wait for it to finish decoding if still in progress
                 while (!graphics.tavDecodeGopIsComplete() && !paused) {
                     sys.sleep(1)
                 }
 
                 if (!paused) {
                     const [r1, r2] = graphics.tavDecodeGopGetResult()
-                    decodeTime = (sys.nanoTime() - nextGopData.startTime) / 1000000.0
-
-                    if (interactive) {
-                        const margin = nextGopData.timeRemaining - decodeTime
-                        const status = margin > 0 ? "✓ ON TIME" : "✗ TOO LATE"
-                        console.log(`[GOP] Background decode finished in ${decodeTime.toFixed(1)}ms (margin: ${margin.toFixed(0)}ms) ${status}`)
-                    }
+                    decodeTime = (sys.nanoTime() - readyGopData.startTime) / 1000000.0
 
                     // Free compressed data
-                    sys.free(nextGopData.compressedPtr)
+                    sys.free(readyGopData.compressedPtr)
 
-                    // Transition to next GOP
-                    currentGopBufferSlot = 1 - currentGopBufferSlot
-                    currentGopSize = nextGopData.gopSize
+                    // Transition to ready GOP
+                    currentGopBufferSlot = readyGopData.slot
+                    currentGopSize = readyGopData.gopSize
                     currentGopFrameIndex = 0
-                    nextGopData = null
 
-                    // Resume packet reading now that one buffer is free
+                    // Promote decoding GOP to ready GOP
+                    readyGopData = decodingGopData
+                    decodingGopData = null
+
+                    // Resume packet reading now that one buffer is free (decoding slot available)
                     shouldReadPackets = true
-
-                    if (interactive) {
-                        console.log(`[GOP] ✓ SEAMLESS TRANSITION to next GOP (slot ${currentGopBufferSlot}, ${currentGopSize} frames)`)
-                    }
                 }
             } else {
-                // No next GOP available, pause playback
+                // No ready GOP available - hiccup (shouldn't happen with triple-buffering)
                 if (interactive) {
-                    console.log(`[GOP] ✗ HICCUP - next GOP NOT READY! Playback paused.`)
+                    console.log(`[GOP] ✗ HICCUP - ready GOP NOT READY! Playback paused.`)
                 }
                 currentGopSize = 0
                 currentGopFrameIndex = 0
diff --git a/terranmon.txt b/terranmon.txt
index daab381..9ae4c13 100644
--- a/terranmon.txt
+++ b/terranmon.txt
@@ -1030,9 +1030,9 @@ transmission capability, and region-of-interest coding.
     ### List of Keys
     - Uint64 BGNT: Video begin time (must be equal to the value of the first Timecode packet)
     - Uint64 ENDT: Video end time (must be equal to the value of the last Timecode packet)
-    - Uint64 CDAT: Creation time in nanoseconds since UNIX Epoch
-    - Bytes VNDR: Name and version of the encoder (for Reference encoder: "Encoder-TAV 20251014")
-    - Bytes FMPG: FFmpeg version (typically "ffmpeg version 6.1.2"; the first line of text FFmpeg emits right before the copyright text)
+    - Uint64 CDAT: Creation time in nanoseconds since UNIX Epoch (must be in UTC timezone)
+    - Bytes VNDR: Name and version of the encoder (for Reference encoder: "Encoder-TAV 20251014 (list,of,features)")
+    - Bytes FMPG: FFmpeg version (typically "ffmpeg version 8.0 Copyright (c) 2000-2025 the FFmpeg developers"; the first line of text FFmpeg emits)
 
 
 ## Standard Metadata Payload Packet Structure
@@ -1062,10 +1062,12 @@ Updated on 2025-10-17 to include canvas expansion margins.
 This packet contains multiple frames encoded as a single spacetime block for optimal
 temporal compression.
 
-    uint8  Packet Type (0x12)
+    uint8  Packet Type (0x12/0x13)
     uint8  GOP Size (number of frames in this GOP, typically 16)
-    int16  Motion Vectors X[GOP Size] (quarter-pixel precision for global motion compensation)
-    int16  Motion Vectors Y[GOP Size] (quarter-pixel precision for global motion compensation)
+    <if packet type is 0x13>
+    uint32 Compressed Size (of the Motion Data that follows)
+    *      Zstd-compressed Motion Data
+    <endif>
     uint32 Compressed Size
     *      Zstd-compressed Unified Block Data
 
diff --git a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
index 802b394..4ce01f6 100644
--- a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
+++ b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt
@@ -6662,194 +6662,6 @@ class GraphicsJSR223Delegate(private val vm: VM) {
         System.arraycopy(output, 0, frameData, 0, frameData.size)
     }
 
-    /**
-     * Main GOP unified decoder function.
-     * Decodes a unified 3D DWT GOP block (temporal + spatial) and outputs RGB frames.
-     *
-     * @param compressedDataPtr Pointer to compressed Zstd data
-     * @param compressedSize Size of compressed data
-     * @param gopSize Number of frames in GOP (1-16)
-     * @param motionVectorsX X motion vectors in 1/16-pixel units
-     * @param motionVectorsY Y motion vectors in 1/16-pixel units
-     * @param outputRGBAddrs Array of output RGB buffer addresses
-     * @param width Original frame width (output dimensions)
-     * @param height Original frame height (output dimensions)
-     * @param canvasWidth Expanded canvas width (for motion compensation)
-     * @param canvasHeight Expanded canvas height (for motion compensation)
-     * @param marginLeft Left margin to crop from expanded canvas
-     * @param marginTop Top margin to crop from expanded canvas
-     * @param qIndex Quality index
-     * @param qYGlobal Global Y quantizer
-     * @param qCoGlobal Global Co quantizer
-     * @param qCgGlobal Global Cg quantizer
-     * @param channelLayout Channel layout flags
-     * @param spatialFilter Wavelet filter type
-     * @param spatialLevels Number of spatial DWT levels (default 6)
-     * @param temporalLevels Number of temporal DWT levels (default 2)
-     * @return Number of frames decoded
-     */
-    fun tavDecodeGopUnified(
-        compressedDataPtr: Long,
-        compressedSize: Int,
-        gopSize: Int,
-        motionVectorsX: IntArray,
-        motionVectorsY: IntArray,
-        outputRGBAddrs: LongArray,
-        width: Int,
-        height: Int,
-        canvasWidth: Int,
-        canvasHeight: Int,
-        marginLeft: Int,
-        marginTop: Int,
-        qIndex: Int,
-        qYGlobal: Int,
-        qCoGlobal: Int,
-        qCgGlobal: Int,
-        channelLayout: Int,
-        spatialFilter: Int = 1,
-        spatialLevels: Int = 6,
-        temporalLevels: Int = 2,
-        entropyCoder: Int = 0
-    ): Array<Any> {
-        val dbgOut = HashMap<String, Any>()
-        dbgOut["qY"] = qYGlobal
-        dbgOut["qCo"] = qCoGlobal
-        dbgOut["qCg"] = qCgGlobal
-        dbgOut["frameMode"] = "G"
-
-        // Use expanded canvas dimensions for DWT processing
-        val canvasPixels = canvasWidth * canvasHeight
-        val outputPixels = width * height
-
-        // Step 1: Decompress unified GOP block
-        val compressedData = ByteArray(compressedSize)
-        UnsafeHelper.memcpyRaw(
-            null,
-            vm.usermem.ptr + compressedDataPtr,
-            compressedData,
-            UnsafeHelper.getArrayOffset(compressedData),
-            compressedSize.toLong()
-        )
-
-        val decompressedData = try {
-            ZstdInputStream(java.io.ByteArrayInputStream(compressedData)).use { zstd ->
-                zstd.readBytes()
-            }
-        } catch (e: Exception) {
-            println("ERROR: Zstd decompression failed: ${e.message}")
-            return arrayOf(0, dbgOut)
-        }
-
-        // Step 2: Postprocess unified block to per-frame coefficients (based on header's entropy coder field)
-        val (isEZBCMode, quantizedCoeffs) = tavPostprocessGopAuto(
-            decompressedData,
-            gopSize,
-            canvasPixels,  // Use expanded canvas size
-            channelLayout,
-            entropyCoder
-        )
-
-        // Step 3: Allocate GOP buffers for float coefficients (expanded canvas size)
-        val gopY = Array(gopSize) { FloatArray(canvasPixels) }
-        val gopCo = Array(gopSize) { FloatArray(canvasPixels) }
-        val gopCg = Array(gopSize) { FloatArray(canvasPixels) }
-
-        // Step 4: Calculate subband layout for expanded canvas (needed for perceptual dequantization)
-        val subbands = calculateSubbandLayout(canvasWidth, canvasHeight, spatialLevels)
-
-        // Step 5: Dequantize with temporal-spatial scaling
-        for (t in 0 until gopSize) {
-            val temporalLevel = getTemporalSubbandLevel(t, gopSize, temporalLevels)
-            val temporalScale = getTemporalQuantizerScale(temporalLevel)
-
-            // Apply temporal scaling to base quantizers for each channel
-            val baseQY = (qYGlobal * temporalScale).coerceIn(1.0f, 4096.0f)
-            val baseQCo = (qCoGlobal * temporalScale).coerceIn(1.0f, 4096.0f)
-            val baseQCg = (qCgGlobal * temporalScale).coerceIn(1.0f, 4096.0f)
-
-            // Use existing perceptual dequantization for spatial weighting
-            dequantiseDWTSubbandsPerceptual(
-                qIndex, qYGlobal,
-                quantizedCoeffs[t][0], gopY[t],
-                subbands, baseQY, false, spatialLevels,  // isChroma=false
-                isEZBCMode
-            )
-
-            dequantiseDWTSubbandsPerceptual(
-                qIndex, qYGlobal,
-                quantizedCoeffs[t][1], gopCo[t],
-                subbands, baseQCo, true, spatialLevels,  // isChroma=true
-                isEZBCMode
-            )
-
-            dequantiseDWTSubbandsPerceptual(
-                qIndex, qYGlobal,
-                quantizedCoeffs[t][2], gopCg[t],
-                subbands, baseQCg, true, spatialLevels,  // isChroma=true
-                isEZBCMode
-            )
-        }
-
-        // Step 6: Apply inverse 3D DWT (spatial first, then temporal) on expanded canvas
-        tavApplyInverse3DDWT(gopY, canvasWidth, canvasHeight, gopSize, spatialLevels, temporalLevels, spatialFilter)
-        tavApplyInverse3DDWT(gopCo, canvasWidth, canvasHeight, gopSize, spatialLevels, temporalLevels, spatialFilter)
-        tavApplyInverse3DDWT(gopCg, canvasWidth, canvasHeight, gopSize, spatialLevels, temporalLevels, spatialFilter)
-
-        // Step 7: Apply inverse motion compensation (shift frames back) on expanded canvas
-        // Note: Motion vectors are in 1/16-pixel units, cumulative relative to frame 0
-        for (t in 1 until gopSize) {  // Skip frame 0 (reference)
-            val dx = motionVectorsX[t] / 16  // Convert to pixel units
-            val dy = motionVectorsY[t] / 16
-
-            if (dx != 0 || dy != 0) {
-                applyInverseTranslation(gopY[t], canvasWidth, canvasHeight, dx, dy)
-                applyInverseTranslation(gopCo[t], canvasWidth, canvasHeight, dx, dy)
-                applyInverseTranslation(gopCg[t], canvasWidth, canvasHeight, dx, dy)
-            }
-        }
-
-        // Step 8: Crop expanded canvas to original dimensions and convert to RGB
-        for (t in 0 until gopSize) {
-            val rgbAddr = outputRGBAddrs[t]
-
-            // Crop from expanded canvas (canvasWidth x canvasHeight) to output (width x height)
-            for (row in 0 until height) {
-                for (col in 0 until width) {
-                    // Source pixel in expanded canvas
-                    val canvasX = col + marginLeft
-                    val canvasY = row + marginTop
-                    val canvasIdx = canvasY * canvasWidth + canvasX
-
-                    // Destination pixel in output buffer
-                    val outIdx = row * width + col
-
-                    val yVal = gopY[t][canvasIdx]
-                    val co = gopCo[t][canvasIdx]
-                    val cg = gopCg[t][canvasIdx]
-
-                    // YCoCg-R to RGB conversion
-                    val tmp = yVal - (cg / 2.0f)
-                    val g = cg + tmp
-                    val b = tmp - (co / 2.0f)
-                    val r = b + co
-
-                    // Clamp to 0-255 range
-                    val rClamped = r.toInt().coerceIn(0, 255)
-                    val gClamped = g.toInt().coerceIn(0, 255)
-                    val bClamped = b.toInt().coerceIn(0, 255)
-
-                    // Write RGB24 format (3 bytes per pixel)
-                    val offset = rgbAddr + outIdx * 3L
-                    vm.usermem[offset] = rClamped.toByte()
-                    vm.usermem[offset + 1] = gClamped.toByte()
-                    vm.usermem[offset + 2] = bClamped.toByte()
-                }
-            }
-        }
-
-        return arrayOf(gopSize, dbgOut)
-    }
-
     /**
      * Decode GOP frames directly into GraphicsAdapter.videoBuffer (Java heap).
      * This avoids allocating GOP frames in VM user memory, saving ~6 MB for 8-frame GOPs.
@@ -6864,14 +6676,8 @@ class GraphicsJSR223Delegate(private val vm: VM) {
         compressedDataPtr: Long,
         compressedSize: Int,
         gopSize: Int,
-        motionVectorsX: IntArray,
-        motionVectorsY: IntArray,
         width: Int,
         height: Int,
-        canvasWidth: Int,
-        canvasHeight: Int,
-        marginLeft: Int,
-        marginTop: Int,
         qIndex: Int,
         qYGlobal: Int,
         qCoGlobal: Int,
@@ -6900,7 +6706,6 @@ class GraphicsJSR223Delegate(private val vm: VM) {
         }
 
         // Use expanded canvas dimensions for DWT processing
-        val canvasPixels = canvasWidth * canvasHeight
         val outputPixels = width * height
 
         // Step 1: Decompress unified GOP block
@@ -6926,18 +6731,18 @@ class GraphicsJSR223Delegate(private val vm: VM) {
         val (isEZBCMode, quantizedCoeffs) = tavPostprocessGopAuto(
             decompressedData,
             gopSize,
-            canvasPixels,
+            outputPixels,
             channelLayout,
             entropyCoder
         )
 
         // Step 3: Allocate GOP buffers for float coefficients (expanded canvas size)
-        val gopY = Array(gopSize) { FloatArray(canvasPixels) }
-        val gopCo = Array(gopSize) { FloatArray(canvasPixels) }
-        val gopCg = Array(gopSize) { FloatArray(canvasPixels) }
+        val gopY = Array(gopSize) { FloatArray(outputPixels) }
+        val gopCo = Array(gopSize) { FloatArray(outputPixels) }
+        val gopCg = Array(gopSize) { FloatArray(outputPixels) }
 
         // Step 4: Calculate subband layout for expanded canvas
-        val subbands = calculateSubbandLayout(canvasWidth, canvasHeight, spatialLevels)
+        val subbands = calculateSubbandLayout(width, height, spatialLevels)
 
         // Step 5: Dequantize with temporal-spatial scaling
         for (t in 0 until gopSize) {
@@ -6971,40 +6776,23 @@ class GraphicsJSR223Delegate(private val vm: VM) {
         }
 
         // Step 6: Apply inverse 3D DWT
-        tavApplyInverse3DDWT(gopY, canvasWidth, canvasHeight, gopSize, spatialLevels, temporalLevels, spatialFilter)
-        tavApplyInverse3DDWT(gopCo, canvasWidth, canvasHeight, gopSize, spatialLevels, temporalLevels, spatialFilter)
-        tavApplyInverse3DDWT(gopCg, canvasWidth, canvasHeight, gopSize, spatialLevels, temporalLevels, spatialFilter)
-
-        // Step 7: Apply inverse motion compensation
-        for (t in 1 until gopSize) {
-            val dx = motionVectorsX[t] / 16
-            val dy = motionVectorsY[t] / 16
-
-            if (dx != 0 || dy != 0) {
-                applyInverseTranslation(gopY[t], canvasWidth, canvasHeight, dx, dy)
-                applyInverseTranslation(gopCo[t], canvasWidth, canvasHeight, dx, dy)
-                applyInverseTranslation(gopCg[t], canvasWidth, canvasHeight, dx, dy)
-            }
-        }
+        tavApplyInverse3DDWT(gopY, width, height, gopSize, spatialLevels, temporalLevels, spatialFilter)
+        tavApplyInverse3DDWT(gopCo, width, height, gopSize, spatialLevels, temporalLevels, spatialFilter)
+        tavApplyInverse3DDWT(gopCg, width, height, gopSize, spatialLevels, temporalLevels, spatialFilter)
 
         // Step 8: Crop and convert to RGB, write directly to videoBuffer
         for (t in 0 until gopSize) {
             val videoBufferOffset = bufferOffset + (t * frameSize)  // Each frame sequentially, starting at bufferOffset
 
-            for (row in 0 until height) {
-                for (col in 0 until width) {
-                    // Source pixel in expanded canvas
-                    val canvasX = col + marginLeft
-                    val canvasY = row + marginTop
-                    val canvasIdx = canvasY * canvasWidth + canvasX
-
+            for (py in 0 until height) {
+                for (px in 0 until width) {
                     // Destination pixel in videoBuffer
-                    val outIdx = row * width + col
+                    val outIdx = py * width + px
                     val offset = videoBufferOffset + outIdx * 3L
 
-                    val yVal = gopY[t][canvasIdx]
-                    val co = gopCo[t][canvasIdx]
-                    val cg = gopCg[t][canvasIdx]
+                    val yVal = gopY[t][outIdx]
+                    val co = gopCo[t][outIdx]
+                    val cg = gopCg[t][outIdx]
 
                     // YCoCg-R to RGB conversion
                     val tmp = yVal - (cg / 2.0f)
@@ -7113,14 +6901,8 @@ class GraphicsJSR223Delegate(private val vm: VM) {
         compressedDataPtr: Long,
         compressedSize: Int,
         gopSize: Int,
-        motionVectorsX: IntArray,
-        motionVectorsY: IntArray,
         width: Int,
         height: Int,
-        canvasWidth: Int,
-        canvasHeight: Int,
-        marginLeft: Int,
-        marginTop: Int,
         qIndex: Int,
         qYGlobal: Int,
         qCoGlobal: Int,
@@ -7128,7 +6910,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
         channelLayout: Int,
         spatialFilter: Int = 1,
         spatialLevels: Int = 6,
-        temporalLevels: Int = 2,
+        temporalLevels: Int = 3,
         entropyCoder: Int = 0,
         bufferOffset: Long = 0
     ) {
@@ -7144,9 +6926,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
             try {
                 val result = tavDecodeGopToVideoBuffer(
                     compressedDataPtr, compressedSize, gopSize,
-                    motionVectorsX, motionVectorsY,
-                    width, height, canvasWidth, canvasHeight,
-                    marginLeft, marginTop,
+                    width, height,
                     qIndex, qYGlobal, qCoGlobal, qCgGlobal,
                     channelLayout, spatialFilter, spatialLevels, temporalLevels,
                     entropyCoder, bufferOffset
diff --git a/tsvm_core/src/net/torvald/tsvm/peripheral/GraphicsAdapter.kt b/tsvm_core/src/net/torvald/tsvm/peripheral/GraphicsAdapter.kt
index f6a332a..79e7b3a 100644
--- a/tsvm_core/src/net/torvald/tsvm/peripheral/GraphicsAdapter.kt
+++ b/tsvm_core/src/net/torvald/tsvm/peripheral/GraphicsAdapter.kt
@@ -107,7 +107,7 @@ open class GraphicsAdapter(private val assetsRoot: String, val vm: VM, val confi
     internal val unusedArea = UnsafeHelper.allocate(1024, this)
     internal val scanlineOffsets = UnsafeHelper.allocate(1024, this)
 
-    internal val videoBuffer = UnsafeHelper.allocate(32 * 1024 * 1024, this)
+    internal val videoBuffer = UnsafeHelper.allocate(48 * 1024 * 1024, this)  // 48 MB for triple-buffering (3 slots × 21 frames × 752 kB)
 
     protected val paletteShader = LoadShader(DRAW_SHADER_VERT, config.paletteShader)
     protected val textShader = LoadShader(DRAW_SHADER_VERT, config.fragShader)
diff --git a/video_encoder/encoder_tav.c b/video_encoder/encoder_tav.c
index c53a266..e0b415e 100644
--- a/video_encoder/encoder_tav.c
+++ b/video_encoder/encoder_tav.c
@@ -18,7 +18,7 @@
 #include <float.h>
 #include <fftw3.h>
 
-#define ENCODER_VENDOR_STRING "Encoder-TAV 20251019"
+#define ENCODER_VENDOR_STRING "Encoder-TAV 20251022 (3d-dwt,ezbc)"
 
 // TSVM Advanced Video (TAV) format constants
 #define TAV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x41\x56"  // "\x1FTSVM TAV"
@@ -48,7 +48,7 @@
 #define TAV_PACKET_IFRAME          0x10  // Intra frame (keyframe)
 #define TAV_PACKET_PFRAME          0x11  // Predicted frame (legacy, unused)
 #define TAV_PACKET_GOP_UNIFIED     0x12  // Unified 3D DWT GOP (all frames in single block, translation-based)
-#define TAV_PACKET_GOP_UNIFIED_MESH 0x13  // Unified 3D DWT GOP with distortion mesh warping
+#define TAV_PACKET_GOP_UNIFIED_MOTION 0x13  // Unified 3D DWT GOP with motion-compensated lifting
 #define TAV_PACKET_PFRAME_RESIDUAL 0x14  // P-frame with MPEG-style residual coding (block motion compensation)
 #define TAV_PACKET_BFRAME_RESIDUAL 0x15  // B-frame with MPEG-style residual coding (bidirectional prediction)
 #define TAV_PACKET_PFRAME_ADAPTIVE 0x16  // P-frame with adaptive quad-tree block partitioning
@@ -116,13 +116,15 @@ static int needs_alpha_channel(int channel_layout) {
 #define DEFAULT_HEIGHT 448
 #define DEFAULT_FPS 30
 #define DEFAULT_QUALITY 3
-#define DEFAULT_ZSTD_LEVEL 9
-#define TEMPORAL_GOP_SIZE 20//8 // ~42 frames fit into 32 MB video buffer
+#define DEFAULT_ZSTD_LEVEL 3
+#define TEMPORAL_GOP_SIZE 20
 #define TEMPORAL_DECOMP_LEVEL 2
 #define MOTION_THRESHOLD 24.0f // Flush if motion exceeds 24 pixels in any direction
 
 // Audio/subtitle constants (reused from TEV)
+#define MP2_SAMPLE_RATE 32000
 #define MP2_DEFAULT_PACKET_SIZE 1152
+#define PACKET_AUDIO_TIME ((double)MP2_DEFAULT_PACKET_SIZE / MP2_SAMPLE_RATE)
 #define MAX_SUBTITLE_LENGTH 2048
 
 int debugDumpMade = 0;
@@ -2175,6 +2177,7 @@ static int mp2_packet_size_to_rate_index(int packet_size, int is_mono);
 static long write_extended_header(tav_encoder_t *enc);
 static void write_timecode_packet(FILE *output, int frame_num, int fps, int is_ntsc_framerate);
 static int process_audio(tav_encoder_t *enc, int frame_num, FILE *output);
+static int process_audio_for_gop(tav_encoder_t *enc, int *frame_numbers, int num_frames, FILE *output);
 static subtitle_entry_t* parse_subtitle_file(const char *filename, int fps);
 static subtitle_entry_t* parse_srt_file(const char *filename, int fps);
 static subtitle_entry_t* parse_smi_file(const char *filename, int fps);
@@ -2269,7 +2272,7 @@ static void show_usage(const char *program_name) {
     printf("  --dump-frame N          Dump quantised coefficients for frame N (creates .bin files)\n");
     printf("  --wavelet N             Wavelet filter: 0=LGT 5/3, 1=CDF 9/7, 2=CDF 13/7, 16=DD-4, 255=Haar (default: 1)\n");
     printf("  --zstd-level N          Zstd compression level 1-22 (default: %d, higher = better compression but slower)\n", DEFAULT_ZSTD_LEVEL);
-    printf("  --no-grain-synthesis    Disable grain synthesis (enabled by default)\n");
+//    printf("  --no-grain-synthesis    Disable grain synthesis (enabled by default)\n");
     printf("  --help                  Show this help\n\n");
 
     printf("Audio Rate by Quality:\n  ");
@@ -2328,7 +2331,7 @@ static tav_encoder_t* create_encoder(void) {
     enc->intra_only = 0;
     enc->monoblock = 1;  // Default to monoblock mode
     enc->perceptual_tuning = 1;  // Default to perceptual quantisation (versions 5/6)
-    enc->enable_ezbc = 0;  // Default to twobit-map (EZBC adds overhead for small files)
+    enc->enable_ezbc = 1;  // Default to EZBC over twobit-map
     enc->channel_layout = CHANNEL_LAYOUT_YCOCG;  // Default to Y-Co-Cg
     enc->audio_bitrate = 0;  // 0 = use quality table
     enc->encode_limit = 0;  // Default: no frame limit
@@ -2339,7 +2342,7 @@ static tav_encoder_t* create_encoder(void) {
     enc->delta_haar_levels = TEMPORAL_DECOMP_LEVEL;
 
     // GOP / temporal DWT settings
-    enc->enable_temporal_dwt = 0;  // Default: disabled for backward compatibility. Mutually exclusive with use_delta_encoding
+    enc->enable_temporal_dwt = 1;  // Mutually exclusive with use_delta_encoding
     enc->temporal_gop_capacity = TEMPORAL_GOP_SIZE;  // 16 frames
     enc->temporal_gop_frame_count = 0;
     enc->temporal_decomp_levels = TEMPORAL_DECOMP_LEVEL;  // 2 levels of temporal DWT (16 -> 4x4 subbands)
@@ -4826,16 +4829,6 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
         memcpy(gop_cg_coeffs[i], enc->temporal_gop_cg_frames[i], num_pixels * sizeof(float));
     }
 
-    // Debug: Print original frame-to-frame motion vectors
-    if (enc->verbose && actual_gop_size >= 4) {
-        printf("Frame-to-frame motion vectors (before cumulative conversion):\n");
-        for (int i = 0; i < actual_gop_size; i++) {
-            printf("  Frame %d: 1/16px=(%d, %d) pixels=(%.3f, %.3f)\n",
-                   i, enc->temporal_gop_translation_x[i], enc->temporal_gop_translation_y[i],
-                   enc->temporal_gop_translation_x[i] / 16.0f, enc->temporal_gop_translation_y[i] / 16.0f);
-        }
-    }
-
     // Step 0.5: Convert frame-to-frame motion vectors to cumulative (relative to frame 0)
     // Phase correlation computes motion of frame[i] relative to frame[i-1]
     // We need cumulative motion relative to frame 0 for proper alignment
@@ -4844,16 +4837,6 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
         enc->temporal_gop_translation_y[i] += enc->temporal_gop_translation_y[i-1];
     }
 
-    // Debug: Print cumulative motion vectors
-    if (enc->verbose && actual_gop_size >= 4) {
-        printf("Cumulative motion vectors (after conversion):\n");
-        for (int i = 0; i < actual_gop_size; i++) {
-            printf("  Frame %d: 1/16px=(%d, %d) pixels=(%.3f, %.3f)\n",
-                   i, enc->temporal_gop_translation_x[i], enc->temporal_gop_translation_y[i],
-                   enc->temporal_gop_translation_x[i] / 16.0f, enc->temporal_gop_translation_y[i] / 16.0f);
-        }
-    }
-
     // Step 0.5b: Calculate the valid region after alignment (crop bounds)
     // Find the bounding box that's valid across all aligned frames
     int min_dx = 0, max_dx = 0, min_dy = 0, max_dy = 0;
@@ -5102,6 +5085,9 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
     // Write timecode packet for first frame in GOP
     write_timecode_packet(output, frame_numbers[0], enc->output_fps, enc->is_ntsc_framerate);
 
+    // Process audio for this GOP (all frames at once)
+    process_audio_for_gop(enc, frame_numbers, actual_gop_size, output);
+
     // Single-frame GOP fallback: use traditional I-frame encoding with serialise_tile_data
     if (actual_gop_size == 1) {
         // Write I-frame packet header (no motion vectors, no GOP overhead)
@@ -5171,10 +5157,11 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
             printf("Frame %d (single-frame GOP as I-frame): %zu bytes\n",
                    frame_numbers[0], compressed_size);
         }
-    } else {
+    }
+    else {
         // Multi-frame GOP: use unified 3D DWT encoding
         // Choose packet type based on motion compensation method
-        uint8_t packet_type = enc->temporal_enable_mcezbc ? TAV_PACKET_GOP_UNIFIED_MESH : TAV_PACKET_GOP_UNIFIED;
+        uint8_t packet_type = enc->temporal_enable_mcezbc ? TAV_PACKET_GOP_UNIFIED_MOTION : TAV_PACKET_GOP_UNIFIED;
         fwrite(&packet_type, 1, 1, output);
         total_bytes_written += 1;
 
@@ -5263,26 +5250,6 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
 
             free(mv_buffer);
             free(compressed_mv);
-        } else {
-            // Packet 0x12: Translation-based alignment
-            // Write canvas expansion information (4 bytes)
-            uint8_t canvas_margins[4] = {
-                (uint8_t)crop_left,    // Left margin
-                (uint8_t)crop_right,   // Right margin
-                (uint8_t)crop_top,     // Top margin
-                (uint8_t)crop_bottom   // Bottom margin
-            };
-            fwrite(canvas_margins, 1, 4, output);
-            total_bytes_written += 4;
-
-            // Write all motion vectors (1/16-pixel precision) for the entire GOP
-            for (int t = 0; t < actual_gop_size; t++) {
-                int16_t dx = enc->temporal_gop_translation_x[t];
-                int16_t dy = enc->temporal_gop_translation_y[t];
-                fwrite(&dx, sizeof(int16_t), 1, output);
-                fwrite(&dy, sizeof(int16_t), 1, output);
-                total_bytes_written += 4;
-            }
         }
 
         // Preprocess ALL frames with unified significance map
@@ -8649,13 +8616,8 @@ static int process_audio(tav_encoder_t *enc, int frame_num, FILE *output) {
     // Calculate how much audio time each frame represents (in seconds)
     double frame_audio_time = 1.0 / enc->output_fps;
 
-    // Calculate how much audio time each MP2 packet represents
-    // MP2 frame contains 1152 samples at 32kHz = 0.036 seconds
-    #define MP2_SAMPLE_RATE 32000
-    double packet_audio_time = 1152.0 / MP2_SAMPLE_RATE;
-
     // Estimate how many packets we consume per video frame
-    double packets_per_frame = frame_audio_time / packet_audio_time;
+    double packets_per_frame = frame_audio_time / PACKET_AUDIO_TIME;
 
     // Allocate MP2 buffer if needed
     if (!enc->mp2_buffer) {
@@ -8683,24 +8645,20 @@ static int process_audio(tav_encoder_t *enc, int frame_num, FILE *output) {
 
         // Calculate how many packets we need to maintain target buffer level
         // Only insert when buffer drops below target, and only insert enough to restore target
-        double target_level = (double)enc->target_audio_buffer_size;
-        if (enc->audio_frames_in_buffer < target_level) {
+        double target_level = fmax(packets_per_frame, (double)enc->target_audio_buffer_size);
+//        if (enc->audio_frames_in_buffer < target_level) {
             double deficit = target_level - enc->audio_frames_in_buffer;
             // Insert packets to cover the deficit, but at least maintain minimum flow
             packets_to_insert = (int)ceil(deficit);
-            // Cap at reasonable maximum to prevent excessive insertion
-            if (packets_to_insert > enc->target_audio_buffer_size) {
-                packets_to_insert = enc->target_audio_buffer_size;
-            }
 
             if (enc->verbose) {
                 printf("Frame %d: Buffer low (%.2f->%.2f), deficit %.2f, inserting %d packets\n",
                        frame_num, old_buffer, enc->audio_frames_in_buffer, deficit, packets_to_insert);
             }
-        } else if (enc->verbose && old_buffer != enc->audio_frames_in_buffer) {
-            printf("Frame %d: Buffer sufficient (%.2f->%.2f), no packets\n",
-                   frame_num, old_buffer, enc->audio_frames_in_buffer);
-        }
+//        } else if (enc->verbose && old_buffer != enc->audio_frames_in_buffer) {
+//            printf("Frame %d: Buffer sufficient (%.2f->%.2f), no packets\n",
+//                   frame_num, old_buffer, enc->audio_frames_in_buffer);
+//        }
     }
 
     // Insert the calculated number of audio packets
@@ -8737,6 +8695,96 @@ static int process_audio(tav_encoder_t *enc, int frame_num, FILE *output) {
     return 1;
 }
 
+// Process audio for a GOP (multiple frames at once)
+// Accumulates deficit for N frames and emits all necessary audio packets
+static int process_audio_for_gop(tav_encoder_t *enc, int *frame_numbers, int num_frames, FILE *output) {
+    if (!enc->has_audio || !enc->mp2_file || enc->audio_remaining <= 0 || num_frames == 0) {
+        return 1;
+    }
+
+    // Handle first frame initialization (same as process_audio)
+    int first_frame_in_gop = frame_numbers[0];
+    if (first_frame_in_gop == 0) {
+        uint8_t header[4];
+        if (fread(header, 1, 4, enc->mp2_file) != 4) return 1;
+        fseek(enc->mp2_file, 0, SEEK_SET);
+        enc->mp2_packet_size = get_mp2_packet_size(header);
+        int is_mono = (header[3] >> 6) == 3;
+        enc->mp2_rate_index = mp2_packet_size_to_rate_index(enc->mp2_packet_size, is_mono);
+        enc->target_audio_buffer_size = 4; // 4 audio packets in buffer (does nothing for GOP)
+        enc->audio_frames_in_buffer = 0.0;
+    }
+
+    // Calculate audio packet consumption per video frame
+    double frame_audio_time = 1.0 / enc->output_fps;
+    double packets_per_frame = frame_audio_time / PACKET_AUDIO_TIME;
+
+    // Allocate MP2 buffer if needed
+    if (!enc->mp2_buffer) {
+        enc->mp2_buffer_size = enc->mp2_packet_size * 2;
+        enc->mp2_buffer = malloc(enc->mp2_buffer_size);
+        if (!enc->mp2_buffer) {
+            fprintf(stderr, "Failed to allocate audio buffer\n");
+            return 1;
+        }
+    }
+
+    // Calculate total deficit for all frames in the GOP
+    int total_packets_to_insert = 0;
+
+    // Simulate buffer consumption for all N frames in the GOP
+    double old_buffer = enc->audio_frames_in_buffer;
+    enc->audio_frames_in_buffer -= (packets_per_frame * num_frames);
+
+    // Emit one GOP's worth of packets (consumption of all N frames, independent of current buffer level), bounded by target_audio_buffer_size and 9999
+//    double target_level = fmax(packets_per_frame, (double)enc->target_audio_buffer_size);
+//    if (enc->audio_frames_in_buffer < target_level) {
+        double deficit = packets_per_frame * num_frames;
+        total_packets_to_insert = CLAMP((int)round(deficit), enc->target_audio_buffer_size, 9999);
+
+        if (enc->verbose) {
+            printf("GOP (%d frames, starting at %d): Buffer low (%.2f->%.2f), deficit %.2f, inserting %d packets\n",
+                   num_frames, first_frame_in_gop, old_buffer, enc->audio_frames_in_buffer, deficit, total_packets_to_insert);
+        }
+//    } else if (enc->verbose) {
+//        printf("GOP (%d frames, starting at %d): Buffer sufficient (%.2f->%.2f), no packets\n",
+//               num_frames, first_frame_in_gop, old_buffer, enc->audio_frames_in_buffer);
+//    }
+
+    // Emit all audio packets for this GOP
+    for (int q = 0; q < total_packets_to_insert; q++) {
+        size_t bytes_to_read = enc->mp2_packet_size;
+        if (bytes_to_read > enc->audio_remaining) {
+            bytes_to_read = enc->audio_remaining;
+        }
+
+        size_t bytes_read = fread(enc->mp2_buffer, 1, bytes_to_read, enc->mp2_file);
+        if (bytes_read == 0) break;
+
+        // Write TAV MP2 audio packet
+        uint8_t audio_packet_type = TAV_PACKET_AUDIO_MP2;
+        uint32_t audio_len = (uint32_t)bytes_read;
+        fwrite(&audio_packet_type, 1, 1, output);
+        fwrite(&audio_len, 4, 1, output);
+        fwrite(enc->mp2_buffer, 1, bytes_read, output);
+
+        // Track audio bytes written
+        enc->audio_remaining -= bytes_read;
+        enc->audio_frames_in_buffer++;
+
+        if (first_frame_in_gop == 0) {
+            enc->audio_frames_in_buffer = enc->target_audio_buffer_size / 2;
+        }
+
+        if (enc->verbose) {
+            printf("Audio packet %d: %zu bytes (buffer: %.2f packets)\n",
+                   q, bytes_read, enc->audio_frames_in_buffer);
+        }
+    }
+
+    return 1;
+}
+
 // Process subtitles for current frame (copied and adapted from TEV)
 static int process_subtitles(tav_encoder_t *enc, int frame_num, FILE *output) {
     if (!enc->subtitles) {
@@ -9834,20 +9882,16 @@ int main(int argc, char *argv[]) {
                 adjust_quantiser_for_bitrate(enc);
             }
 
-            // For GOP encoding, process audio/subtitles for all frames in the flushed GOP
+            // For GOP encoding, audio is emitted by gop_flush() for all GOP frames (NOTE(review): no subtitle call is visible in gop_flush() - verify subtitles are still muxed in GOP mode)
             // For traditional encoding, process audio/subtitles for this single frame
-            if (enc->enable_temporal_dwt) {
-                // Note: In GOP mode, audio/subtitle sync is approximate since we flush multiple frames at once
-                // This is acceptable since GOPs are short (16 frames max = ~0.5s at 30fps)
-                // TODO: Consider buffering audio/subtitles for precise sync if needed
+            if (!enc->enable_temporal_dwt) {
+                // Process audio for this frame
+                process_audio(enc, true_frame_count, enc->output_fp);
+
+                // Process subtitles for this frame
+                process_subtitles(enc, true_frame_count, enc->output_fp);
             }
 
-            // Process audio for this frame
-            process_audio(enc, true_frame_count, enc->output_fp);
-
-            // Process subtitles for this frame
-            process_subtitles(enc, true_frame_count, enc->output_fp);
-
             // Write a sync packet only after a video is been coded
             // For GOP encoding, GOP_SYNC packet already serves as sync - don't emit extra SYNC
             // For B-frame mode, sync packets are already written in the encoding loop
@@ -9857,7 +9901,8 @@ int main(int argc, char *argv[]) {
             }
 
             // NTSC frame duplication: emit extra sync packet for every 1000n+500 frames
-            if (enc->is_ntsc_framerate && (frame_count % 1000 == 500)) {
+            // Skip when temporal DWT is enabled (audio handled in GOP flush)
+            if (!enc->enable_temporal_dwt && enc->is_ntsc_framerate && (frame_count % 1000 == 500)) {
                 true_frame_count++;
                 // Process audio and subtitles for the duplicated frame to maintain sync
                 process_audio(enc, true_frame_count, enc->output_fp);
diff --git a/video_encoder/tav_inspector.c b/video_encoder/tav_inspector.c
index eca7bac..77c2d7d 100644
--- a/video_encoder/tav_inspector.c
+++ b/video_encoder/tav_inspector.c
@@ -18,6 +18,11 @@
 #define TAV_PACKET_IFRAME         0x10
 #define TAV_PACKET_PFRAME         0x11
 #define TAV_PACKET_GOP_UNIFIED    0x12  // Unified 3D DWT GOP (all frames in single block)
+#define TAV_PACKET_GOP_UNIFIED_MOTION    0x13
+#define TAV_PACKET_PFRAME_RESIDUAL 0x14  // P-frame with MPEG-style residual coding (block motion compensation)
+#define TAV_PACKET_BFRAME_RESIDUAL 0x15  // B-frame with MPEG-style residual coding (bidirectional prediction)
+#define TAV_PACKET_PFRAME_ADAPTIVE 0x16  // P-frame with adaptive quad-tree block partitioning
+#define TAV_PACKET_BFRAME_ADAPTIVE 0x17  // B-frame with adaptive quad-tree block partitioning (bidirectional prediction)
 #define TAV_PACKET_AUDIO_MP2      0x20
 #define TAV_PACKET_SUBTITLE       0x30
 #define TAV_PACKET_SUBTITLE_KAR   0x31
@@ -59,6 +64,7 @@ typedef struct {
     int pframe_delta_count;
     int pframe_skip_count;
     int gop_unified_count;
+    int gop_unified_motion_count;
     int gop_sync_count;
     int total_gop_frames;
     int audio_count;
@@ -94,6 +100,11 @@ const char* get_packet_type_name(uint8_t type) {
         case TAV_PACKET_IFRAME: return "I-FRAME";
         case TAV_PACKET_PFRAME: return "P-FRAME";
         case TAV_PACKET_GOP_UNIFIED: return "GOP (3D DWT Unified)";
+        case TAV_PACKET_GOP_UNIFIED_MOTION: return "GOP (3D DWT Unified with Motion Data)";
+        case TAV_PACKET_PFRAME_RESIDUAL: return "P-FRAME (residual)";
+        case TAV_PACKET_BFRAME_RESIDUAL: return "B-FRAME (residual)";
+        case TAV_PACKET_PFRAME_ADAPTIVE: return "P-FRAME (quadtree)";
+        case TAV_PACKET_BFRAME_ADAPTIVE: return "B-FRAME (quadtree)";
         case TAV_PACKET_AUDIO_MP2: return "AUDIO MP2";
         case TAV_PACKET_SUBTITLE: return "SUBTITLE (Simple)";
         case TAV_PACKET_SUBTITLE_KAR: return "SUBTITLE (Karaoke)";
@@ -246,9 +257,10 @@ void print_extended_header(FILE *fp, int verbose) {
             if (verbose) {
                 if (strcmp(key, "CDAT") == 0) {
                     time_t time_sec = value / 1000000000ULL;
-                    char *time_str = ctime(&time_sec);
-                    if (time_str) {
-                        time_str[strlen(time_str)-1] = '\0';  // Remove newline
+                    struct tm *time_info = gmtime(&time_sec);
+                    if (time_info) {
+                        char time_str[64];
+                        strftime(time_str, sizeof(time_str), "%a %b %d %H:%M:%S %Y UTC", time_info);
                         printf("%s", time_str);
                     }
                 } else {
@@ -484,48 +496,37 @@ int main(int argc, char *argv[]) {
                 break;
             }
 
-            case TAV_PACKET_GOP_UNIFIED: {
+            case TAV_PACKET_GOP_UNIFIED: case TAV_PACKET_GOP_UNIFIED_MOTION: {
                 // Unified GOP packet: [gop_size][motion_vectors...][compressed_size][data]
                 uint8_t gop_size;
                 if (fread(&gop_size, 1, 1, fp) != 1) break;
 
-                // Read all motion vectors
-                int16_t *motion_x = malloc(gop_size * sizeof(int16_t));
-                int16_t *motion_y = malloc(gop_size * sizeof(int16_t));
-                for (int i = 0; i < gop_size; i++) {
-                    if (fread(&motion_x[i], sizeof(int16_t), 1, fp) != 1) break;
-                    if (fread(&motion_y[i], sizeof(int16_t), 1, fp) != 1) break;
+                // Skip the motion-vector block (0x13 packets only): read its 4-byte size, then seek past it
+                uint32_t size0 = 0;
+                if (packet_type == TAV_PACKET_GOP_UNIFIED_MOTION) {
+                    if (fread(&size0, sizeof(uint32_t), 1, fp) != 1) { break; }
+                    stats.total_video_bytes += size0;
+                    stats.gop_unified_motion_count++;
+                    fseek(fp, size0, SEEK_CUR);
                 }
 
                 // Read compressed data size
-                uint32_t size;
-                if (fread(&size, sizeof(uint32_t), 1, fp) != 1) {
-                    free(motion_x);
-                    free(motion_y);
-                    break;
-                }
+                uint32_t size1;
+                if (fread(&size1, sizeof(uint32_t), 1, fp) != 1) { break; }
+                stats.total_video_bytes += size1;
+                fseek(fp, size1, SEEK_CUR);
+
 
-                stats.total_video_bytes += size;
-                stats.gop_unified_count++;
                 stats.total_gop_frames += gop_size;
+                if (packet_type == TAV_PACKET_GOP_UNIFIED) {
+                    stats.gop_unified_count++;
+                }
 
                 if (!opts.summary_only && display) {
                     printf(" - GOP size=%u, data size=%u bytes (%.2f bytes/frame)",
-                           gop_size, size, (double)size / gop_size);
-
-                    // Always show motion vectors for GOP packets with absolute frame numbers
-                    if (gop_size > 0) {
-                        printf("\n    Motion vectors (1/16-pixel):");
-                        for (int i = 0; i < gop_size; i++) {
-                            printf("\n      Frame %d (#%d): (%.3f, %.3f) px",
-                                   current_frame + i, i, motion_x[i] / 16.0, motion_y[i] / 16.0);
-                        }
-                    }
+                           gop_size, (size0 + size1), (double)(size0 + size1) / gop_size);
                 }
 
-                free(motion_x);
-                free(motion_y);
-                fseek(fp, size, SEEK_CUR);
                 break;
             }
 
@@ -714,10 +715,10 @@ int main(int argc, char *argv[]) {
         printf(")");
     }
     printf("\n");
-    if (stats.gop_unified_count > 0) {
+    if (stats.gop_unified_count + stats.gop_unified_motion_count > 0) {
         printf("  3D GOP packets:     %d (total frames: %d, avg %.1f frames/GOP)\n",
-               stats.gop_unified_count, stats.total_gop_frames,
-               (double)stats.total_gop_frames / stats.gop_unified_count);
+               (stats.gop_unified_count + stats.gop_unified_motion_count), stats.total_gop_frames,
+               (double)stats.total_gop_frames / (stats.gop_unified_count + stats.gop_unified_motion_count));
         printf("  GOP sync packets:   %d\n", stats.gop_sync_count);
     }
     printf("  Mux video:          %d\n", stats.mux_video_count);