tavenc: multithreaded decoding

2026-06-06 05:28:31 +09:00 · 2025-12-08 16:07:20 +09:00
parent 34a1f0e3db
commit c6c50c2ebe
5 changed files with 919 additions and 42 deletions
--- a/video_encoder/Makefile
+++ b/video_encoder/Makefile
@@ -44,7 +44,7 @@ LIBTADDEC_OBJ = lib/libtaddec/decoder_tad.o
 # =============================================================================

 # Source files and targets
-TARGETS = clean libs encoder_tav_ref#tev tav tav_decoder tav_inspector tav_dt_decoder
+TARGETS = clean libs encoder_tav_ref decoder_tav_ref tav_inspector
 TAD_TARGETS = encoder_tad decoder_tad
 LIBRARIES = lib/libtavenc.a lib/libtavdec.a lib/libtadenc.a lib/libtaddec.a
 TEST_TARGETS = test_mesh_warp test_mesh_roundtrip
@@ -67,13 +67,6 @@ tav: src/encoder_tav.c lib/libtadenc/encoder_tad.c encoder_tav_opencv.cpp
 	$(CXX) $(CXXFLAGS) $(OPENCV_CFLAGS) $(ZSTD_CFLAGS) -c encoder_tav_opencv.cpp -o encoder_tav_opencv.o
 	$(CXX) $(DBGFLAGS) -o encoder_tav encoder_tav.o encoder_tad.o encoder_tav_opencv.o $(LIBS) $(OPENCV_LIBS)

-# New library-based TAV encoder
-tav_decoder: src/decoder_tav.c lib/libtaddec/decoder_tad.c include/decoder_tad.h
-	rm -f decoder_tav decoder_tav.o decoder_tad.o
-	$(CC) $(CFLAGS) $(ZSTD_CFLAGS) -DTAD_DECODER_LIB -c lib/libtaddec/decoder_tad.c -o decoder_tad.o
-	$(CC) $(CFLAGS) $(ZSTD_CFLAGS) -c src/decoder_tav.c -o decoder_tav.o
-	$(CC) $(DBGFLAGS) -o decoder_tav decoder_tav.o decoder_tad.o $(LIBS)
-
 tav_inspector: tav_inspector.c
 	rm -f tav_inspector
 	$(CC) $(CFLAGS) $(ZSTD_CFLAGS) -o tav_inspector $< $(LIBS)
--- a/video_encoder/lib/libtavenc/tav_encoder_tile.c
+++ b/video_encoder/lib/libtavenc/tav_encoder_tile.c
@@ -0,0 +1,159 @@
+/**
+ * TAV Encoder Library - Tile Processing Implementation
+ */
+
+#include "tav_encoder_tile.h"
+#include "tav_encoder_dwt.h"
+#include <string.h>
+#include <stdlib.h>
+
+// Clamp x into [min, max]. Each argument may be evaluated more than once, so
+// only pass expressions without side effects.
+#define CLAMP(x, min, max) ((x) < (min) ? (min) : ((x) > (max) ? (max) : (x)))
+
+// Fold an out-of-range sample coordinate back into [0, limit - 1].
+// Negative coordinates are negated, coordinates at or beyond `limit` are
+// reflected around the last sample, and the final clamp catches reflections
+// that still land outside the frame.
+static int tav_tile_mirror_coord(int coord, int limit) {
+    if (coord < 0) {
+        coord = -coord;
+    } else if (coord >= limit) {
+        coord = limit - 1 - (coord - limit);
+    }
+    return CLAMP(coord, 0, limit - 1);
+}
+
+/**
+ * Fill the padded Y/Co/Cg buffers for tile (tile_x, tile_y).
+ *
+ * Each padded sample (px, py) is taken from frame position
+ * (tile origin + px - TAV_TILE_MARGIN, tile origin + py - TAV_TILE_MARGIN),
+ * with out-of-frame coordinates mirrored back into the frame. When the core
+ * columns of the tile lie entirely inside the frame they are block-copied and
+ * only the margin columns are gathered sample by sample.
+ */
+void tav_extract_padded_tile(const float *frame_y, const float *frame_co, const float *frame_cg,
+                             int frame_width, int frame_height,
+                             int tile_x, int tile_y,
+                             float *padded_y, float *padded_co, float *padded_cg) {
+    const int origin_x = tile_x * TAV_TILE_SIZE_X;
+    const int origin_y = tile_y * TAV_TILE_SIZE_Y;
+
+    // The core columns can be block-copied when they sit wholly inside the frame
+    const int core_in_frame = (origin_x >= 0) && (origin_x + TAV_TILE_SIZE_X <= frame_width);
+
+    // The padded-column -> frame-column mapping is the same for every row
+    int src_col[TAV_PADDED_TILE_SIZE_X];
+    for (int px = 0; px < TAV_PADDED_TILE_SIZE_X; px++) {
+        src_col[px] = tav_tile_mirror_coord(origin_x + px - TAV_TILE_MARGIN, frame_width);
+    }
+
+    // Padded column ranges [first, last) that are gathered one sample at a time:
+    // the two margins when the core is block-copied, otherwise the whole row
+    const int spans[2][2] = {
+        { 0, core_in_frame ? TAV_TILE_MARGIN : TAV_PADDED_TILE_SIZE_X },
+        { core_in_frame ? TAV_TILE_MARGIN + TAV_TILE_SIZE_X : TAV_PADDED_TILE_SIZE_X,
+          TAV_PADDED_TILE_SIZE_X },
+    };
+
+    for (int py = 0; py < TAV_PADDED_TILE_SIZE_Y; py++) {
+        const int src_row = tav_tile_mirror_coord(origin_y + py - TAV_TILE_MARGIN, frame_height);
+        const int dst_base = py * TAV_PADDED_TILE_SIZE_X;
+        const int src_base = src_row * frame_width;
+
+        if (core_in_frame) {
+            const int src_core = src_base + origin_x;
+            const int dst_core = dst_base + TAV_TILE_MARGIN;
+            const size_t core_bytes = TAV_TILE_SIZE_X * sizeof(float);
+
+            memcpy(&padded_y[dst_core], &frame_y[src_core], core_bytes);
+            memcpy(&padded_co[dst_core], &frame_co[src_core], core_bytes);
+            memcpy(&padded_cg[dst_core], &frame_cg[src_core], core_bytes);
+        }
+
+        for (int s = 0; s < 2; s++) {
+            for (int px = spans[s][0]; px < spans[s][1]; px++) {
+                const int src_idx = src_base + src_col[px];
+                const int dst_idx = dst_base + px;
+
+                padded_y[dst_idx] = frame_y[src_idx];
+                padded_co[dst_idx] = frame_co[src_idx];
+                padded_cg[dst_idx] = frame_cg[src_idx];
+            }
+        }
+    }
+}
+
+// Use existing 2D DWT from tav_encoder_dwt.c
+// For padded tiles, we simply call the existing function with tile dimensions
+
+// Forward 2D DWT for one padded tile: forwards tile_data, levels and
+// filter_type to tav_dwt_2d_forward() with the fixed padded-tile width and
+// height (TAV_PADDED_TILE_SIZE_X x TAV_PADDED_TILE_SIZE_Y).
+void tav_dwt_2d_forward_padded_tile(float *tile_data, int levels, int filter_type) {
+    // Use the existing 2D DWT with padded tile dimensions
+    tav_dwt_2d_forward(tile_data, TAV_PADDED_TILE_SIZE_X, TAV_PADDED_TILE_SIZE_Y,
+                       levels, filter_type);
+}
+
+// Placeholder for the inverse 2D DWT of a padded tile. This function currently
+// does nothing: tile_data is left untouched and all arguments are ignored.
+void tav_dwt_2d_inverse_padded_tile(float *tile_data, int levels, int filter_type) {
+    // Note: Inverse transform not yet implemented in library for arbitrary dimensions
+    // For now, this is a placeholder - decoder uses different code path
+    // The casts below only silence unused-parameter warnings.
+    (void)tile_data;
+    (void)levels;
+    (void)filter_type;
+}
+
+// Copy the central TAV_TILE_SIZE_X x TAV_TILE_SIZE_Y region of a padded tile
+// into core_data as tightly packed rows, dropping the TAV_TILE_MARGIN border.
+void tav_crop_tile_margins(const float *padded_data, float *core_data) {
+    // First core sample: skip the top margin rows and the left margin columns
+    const float *src_row = padded_data + TAV_TILE_MARGIN * TAV_PADDED_TILE_SIZE_X + TAV_TILE_MARGIN;
+    float *dst_row = core_data;
+
+    for (int row = 0; row < TAV_TILE_SIZE_Y; row++) {
+        memcpy(dst_row, src_row, TAV_TILE_SIZE_X * sizeof(float));
+        src_row += TAV_PADDED_TILE_SIZE_X;
+        dst_row += TAV_TILE_SIZE_X;
+    }
+}
+
+// Copy the top-left actual_width x actual_height part of a padded tile's core
+// region into core_data as tightly packed rows of actual_width samples.
+//
+// actual_width/actual_height are expected to be at most TAV_TILE_SIZE_X /
+// TAV_TILE_SIZE_Y. Non-positive dimensions (an empty or out-of-frame tile)
+// copy nothing.
+void tav_crop_tile_margins_edge(const float *padded_data, float *core_data,
+                                int actual_width, int actual_height) {
+    // A negative width would otherwise turn into an enormous size_t byte count
+    // in memcpy() and overrun both buffers.
+    if (actual_width <= 0 || actual_height <= 0) {
+        return;
+    }
+
+    const size_t row_bytes = (size_t)actual_width * sizeof(float);
+
+    for (int y = 0; y < actual_height; y++) {
+        // Offsets are computed in size_t so they cannot overflow int
+        const size_t padded_row = (size_t)(y + TAV_TILE_MARGIN) * TAV_PADDED_TILE_SIZE_X + TAV_TILE_MARGIN;
+        const size_t core_row = (size_t)y * (size_t)actual_width;
+        memcpy(&core_data[core_row], &padded_data[padded_row], row_bytes);
+    }
+}
+
+// Report how many samples of tile (tile_x, tile_y) lie inside the frame.
+//
+// Interior tiles are TAV_TILE_SIZE_X x TAV_TILE_SIZE_Y; tiles on the right or
+// bottom edge are cut to what is left of the frame. A tile that starts at or
+// beyond the frame edge reports 0 instead of a negative size, so callers can
+// safely use the result as a copy length.
+void tav_get_tile_dimensions(int frame_width, int frame_height,
+                             int tile_x, int tile_y,
+                             int *tile_width, int *tile_height) {
+    // Frame samples remaining to the right of / below the tile origin
+    int width = frame_width - tile_x * TAV_TILE_SIZE_X;
+    int height = frame_height - tile_y * TAV_TILE_SIZE_Y;
+
+    // Never larger than a full tile
+    if (width > TAV_TILE_SIZE_X) {
+        width = TAV_TILE_SIZE_X;
+    }
+    if (height > TAV_TILE_SIZE_Y) {
+        height = TAV_TILE_SIZE_Y;
+    }
+
+    // Never negative (tile origin outside the frame)
+    if (width < 0) {
+        width = 0;
+    }
+    if (height < 0) {
+        height = 0;
+    }
+
+    *tile_width = width;
+    *tile_height = height;
+}
--- a/video_encoder/lib/libtavenc/tav_encoder_tile.h
+++ b/video_encoder/lib/libtavenc/tav_encoder_tile.h
@@ -0,0 +1,103 @@
+/**
+ * TAV Encoder Library - Tile Processing
+ *
+ * Functions for padded tile extraction and DWT processing.
+ * Used when video dimensions exceed monoblock threshold (720x576).
+ */
+
+#ifndef TAV_ENCODER_TILE_H
+#define TAV_ENCODER_TILE_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include "../../include/tav_encoder_lib.h"
+
+// Tile dimensions (from header)
+// TAV_TILE_SIZE_X = 640, TAV_TILE_SIZE_Y = 540
+// TAV_PADDED_TILE_SIZE_X = 704, TAV_PADDED_TILE_SIZE_Y = 604
+// TAV_TILE_MARGIN = 32
+
+/**
+ * Extract a padded tile from full-frame YCoCg buffers.
+ *
+ * Extracts a tile at position (tile_x, tile_y) with TAV_TILE_MARGIN pixels
+ * of padding on all sides for seamless DWT processing. Uses symmetric
+ * extension (mirroring) at frame boundaries.
+ *
+ * @param frame_y       Full frame Y channel
+ * @param frame_co      Full frame Co channel
+ * @param frame_cg      Full frame Cg channel
+ * @param frame_width   Full frame width
+ * @param frame_height  Full frame height
+ * @param tile_x        Tile X index (0-based)
+ * @param tile_y        Tile Y index (0-based)
+ * @param padded_y      Output: Padded tile Y (PADDED_TILE_SIZE_X * PADDED_TILE_SIZE_Y floats)
+ * @param padded_co     Output: Padded tile Co
+ * @param padded_cg     Output: Padded tile Cg
+ */
+void tav_extract_padded_tile(const float *frame_y, const float *frame_co, const float *frame_cg,
+                             int frame_width, int frame_height,
+                             int tile_x, int tile_y,
+                             float *padded_y, float *padded_co, float *padded_cg);
+
+/**
+ * Apply 2D DWT forward transform to a padded tile.
+ *
+ * Uses fixed PADDED_TILE_SIZE dimensions (704x604) for optimal performance.
+ *
+ * @param tile_data     Tile data (modified in-place)
+ * @param levels        Number of decomposition levels
+ * @param filter_type   Wavelet filter type (0=CDF 5/3, 1=CDF 9/7, etc.)
+ */
+void tav_dwt_2d_forward_padded_tile(float *tile_data, int levels, int filter_type);
+
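+/**
+ * Inverse 2D DWT for a padded tile.
+ *
+ * NOTE: not implemented yet. The current library implementation is a
+ * placeholder that ignores its arguments and leaves tile_data unchanged.
+ *
+ * @param tile_data     Tile data (currently left untouched)
+ * @param levels        Number of decomposition levels (currently ignored)
+ * @param filter_type   Wavelet filter type (currently ignored)
+ */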
+void tav_dwt_2d_inverse_padded_tile(float *tile_data, int levels, int filter_type);
+
+/**
+ * Crop a padded tile to its core region (removing margins).
+ *
+ * Extracts the central TAV_TILE_SIZE_X × TAV_TILE_SIZE_Y region from a padded tile.
+ *
+ * @param padded_data   Padded tile (PADDED_TILE_SIZE_X * PADDED_TILE_SIZE_Y)
+ * @param core_data     Output: Core tile (TILE_SIZE_X * TILE_SIZE_Y)
+ */
+void tav_crop_tile_margins(const float *padded_data, float *core_data);
+
+/**
+ * Crop a padded tile to actual dimensions for edge tiles.
+ *
+ * For tiles at the right/bottom edges of a frame, the actual tile may be
+ * smaller than TILE_SIZE_X × TILE_SIZE_Y. This function handles that case.
+ *
+ * @param padded_data   Padded tile (PADDED_TILE_SIZE_X * PADDED_TILE_SIZE_Y)
+ * @param core_data     Output: Core tile data
+ * @param actual_width  Actual tile width (may be < TILE_SIZE_X for edge tiles)
+ * @param actual_height Actual tile height (may be < TILE_SIZE_Y for edge tiles)
+ */
+void tav_crop_tile_margins_edge(const float *padded_data, float *core_data,
+                                int actual_width, int actual_height);
+
+/**
+ * Calculate actual tile dimensions for a given tile position.
+ *
+ * Edge tiles may be smaller than the standard tile size.
+ *
+ * @param frame_width   Full frame width
+ * @param frame_height  Full frame height
+ * @param tile_x        Tile X index
+ * @param tile_y        Tile Y index
+ * @param tile_width    Output: Actual tile width
+ * @param tile_height   Output: Actual tile height
+ */
+void tav_get_tile_dimensions(int frame_width, int frame_height,
+                             int tile_x, int tile_y,
+                             int *tile_width, int *tile_height);
+
+#endif // TAV_ENCODER_TILE_H
--- a/video_encoder/src/decoder_tav.c
+++ b/video_encoder/src/decoder_tav.c
@@ -24,6 +24,8 @@
 #include <unistd.h>
 #include <sys/wait.h>
 #include <signal.h>
+#include <pthread.h>
+#include <limits.h>

 #include "tav_video_decoder.h"
 #include "decoder_tad.h"
@@ -53,6 +55,31 @@
 #define TAV_PACKET_SYNC_NTSC       0xFE
 #define TAV_PACKET_SYNC            0xFF

+// Threading constants
+#define MAX_DECODE_THREADS 16
+#define DECODE_SLOT_PENDING     0
+#define DECODE_SLOT_PROCESSING  1
+#define DECODE_SLOT_DONE        2
+
+// =============================================================================
+// GOP Decode Job Structure (for multithreading)
+// =============================================================================
+
+// One GOP decode job. Each pipeline slot holds one of these; the reader fills
+// the input fields, a worker thread decodes into frames[], and the main thread
+// writes the frames out in job_id order.
+typedef struct {
+    // Submission sequence number; -1 while the slot holds no job
+    int job_id;
+    volatile int status;  // DECODE_SLOT_*
+
+    // Input (compressed data read from file)
+    // malloc'd packet payload; the worker frees it and resets it to NULL after decoding
+    uint8_t *compressed_data;
+    // Payload length in bytes
+    uint32_t compressed_size;
+    // Frame count taken from the packet's 1-byte GOP size field
+    int gop_size;
+
+    // Output (decoded frames)
+    // frames_allocated buffers, each width * height * 3 bytes
+    uint8_t **frames;
+    int frames_allocated;
+    // Return value of tav_video_decode_gop(); frames are written out only when >= 0
+    int decode_result;
+
+} gop_decode_job_t;

 // =============================================================================
 // TAV Header Structure (32 bytes)
@@ -122,6 +149,21 @@ typedef struct {
    int no_audio;           // Skip audio decoding
    int dump_packets;       // Debug: dump packet info

+    // Threading support
+    int num_threads;
+    int num_slots;
+    gop_decode_job_t *slots;
+    tav_video_context_t **worker_video_ctx;  // Per-thread decoder contexts
+    pthread_t *worker_threads;
+    pthread_mutex_t mutex;
+    pthread_cond_t cond_job_available;
+    pthread_cond_t cond_slot_free;
+    volatile int threads_should_exit;
+    volatile int next_write_slot;      // Next slot to write to FFmpeg
+    volatile int next_read_slot;       // Next slot for reading from file
+    volatile int jobs_submitted;
+    volatile int jobs_completed;
+
 } decoder_context_t;

 // =============================================================================
@@ -294,6 +336,231 @@ static int spawn_ffmpeg(decoder_context_t *ctx) {
    return 0;
 }

+// =============================================================================
+// Multithreading Support
+// =============================================================================
+
+// Worker thread entry point - decodes GOPs in parallel.
+//
+// `arg` is the shared decoder_context_t. The thread repeatedly claims the
+// oldest pending slot, decodes it with its own private decoder context, marks
+// the slot done and signals cond_slot_free. It returns NULL once
+// threads_should_exit is set and it holds no job.
+static void *decoder_worker_thread(void *arg) {
+    decoder_context_t *ctx = (decoder_context_t *)arg;
+
+    // init_decoder_threads() holds ctx->mutex until every pthread_create() has
+    // returned. Taking the mutex here therefore guarantees that
+    // worker_threads[] is completely filled in (and visible to this thread)
+    // before we search it; pthread_create() alone does not promise that the
+    // thread ID is stored before the new thread starts running.
+    pthread_mutex_lock(&ctx->mutex);
+    const pthread_t self = pthread_self();
+    int thread_idx = -1;
+    for (int i = 0; i < ctx->num_threads; i++) {
+        if (pthread_equal(ctx->worker_threads[i], self)) {
+            thread_idx = i;
+            break;
+        }
+    }
+    pthread_mutex_unlock(&ctx->mutex);
+
+    if (thread_idx < 0) {
+        // Never share another worker's decoder context: that would be a data race
+        fprintf(stderr, "Error: Decoder worker could not determine its thread index\n");
+        return NULL;
+    }
+
+    tav_video_context_t *my_video_ctx = ctx->worker_video_ctx[thread_idx];
+
+    for (;;) {
+        gop_decode_job_t *job = NULL;
+
+        pthread_mutex_lock(&ctx->mutex);
+        while (!ctx->threads_should_exit) {
+            // Claim the pending job with the lowest job_id so the GOP the
+            // writer is waiting for is never starved by later ones
+            for (int i = 0; i < ctx->num_slots; i++) {
+                gop_decode_job_t *slot = &ctx->slots[i];
+                if (slot->status == DECODE_SLOT_PENDING &&
+                    slot->compressed_data != NULL &&
+                    (job == NULL || slot->job_id < job->job_id)) {
+                    job = slot;
+                }
+            }
+            if (job != NULL) {
+                break;
+            }
+            pthread_cond_wait(&ctx->cond_job_available, &ctx->mutex);
+        }
+
+        if (job == NULL) {
+            // Exit was requested
+            pthread_mutex_unlock(&ctx->mutex);
+            break;
+        }
+
+        job->status = DECODE_SLOT_PROCESSING;
+        pthread_mutex_unlock(&ctx->mutex);
+
+        // Decode GOP using our thread's decoder context. The slot is owned by
+        // this thread while it is in the PROCESSING state.
+        job->decode_result = tav_video_decode_gop(
+            my_video_ctx,
+            job->compressed_data,
+            job->compressed_size,
+            job->gop_size,
+            job->frames
+        );
+
+        // Free compressed data after decoding
+        free(job->compressed_data);
+        job->compressed_data = NULL;
+
+        // Mark as done and wake the thread that writes frames out
+        pthread_mutex_lock(&ctx->mutex);
+        job->status = DECODE_SLOT_DONE;
+        ctx->jobs_completed++;
+        pthread_cond_broadcast(&ctx->cond_slot_free);
+        pthread_mutex_unlock(&ctx->mutex);
+    }
+
+    return NULL;
+}
+
+// Set up the multithreaded decode pipeline: job slots with pre-allocated frame
+// buffers, one decoder context per worker, synchronisation primitives and the
+// worker threads themselves.
+//
+// Returns 0 on success (also when num_threads <= 0, i.e. single-threaded mode)
+// and -1 on failure. On failure everything allocated here is released again
+// and ctx->num_threads is reset to 0, so cleanup_decoder_threads() becomes a
+// no-op.
+static int init_decoder_threads(decoder_context_t *ctx) {
+    if (ctx->num_threads <= 0) {
+        return 0;  // Single-threaded mode
+    }
+
+    // Limit threads
+    if (ctx->num_threads > MAX_DECODE_THREADS) {
+        ctx->num_threads = MAX_DECODE_THREADS;
+    }
+
+    // Number of slots = threads + 2 for pipelining
+    ctx->num_slots = ctx->num_threads + 2;
+
+    // Every slot gets buffers for this many frames; larger GOPs do not fit
+    const int max_gop_size = 32;
+    // Widen before multiplying so large frame dimensions cannot overflow int
+    const size_t frame_size = (size_t)ctx->header.width * ctx->header.height * 3;
+    int sync_ready = 0;
+    int threads_started = 0;
+
+    ctx->worker_video_ctx = NULL;
+    ctx->worker_threads = NULL;
+
+    // Allocate slots
+    ctx->slots = calloc(ctx->num_slots, sizeof(gop_decode_job_t));
+    if (!ctx->slots) {
+        fprintf(stderr, "Error: Failed to allocate decode slots\n");
+        goto fail;
+    }
+
+    // Pre-allocate frame buffers for each slot
+    for (int i = 0; i < ctx->num_slots; i++) {
+        ctx->slots[i].job_id = -1;
+        ctx->slots[i].status = DECODE_SLOT_DONE;  // Available
+        // calloc so that a partially filled array can be freed safely on failure
+        ctx->slots[i].frames = calloc(max_gop_size, sizeof(uint8_t*));
+        if (!ctx->slots[i].frames) {
+            fprintf(stderr, "Error: Failed to allocate frame pointers for slot %d\n", i);
+            goto fail;
+        }
+        ctx->slots[i].frames_allocated = max_gop_size;
+        for (int j = 0; j < max_gop_size; j++) {
+            ctx->slots[i].frames[j] = malloc(frame_size);
+            if (!ctx->slots[i].frames[j]) {
+                fprintf(stderr, "Error: Failed to allocate frame buffer for slot %d frame %d\n", i, j);
+                goto fail;
+            }
+        }
+    }
+
+    // Create per-thread video decoder contexts
+    ctx->worker_video_ctx = calloc(ctx->num_threads, sizeof(tav_video_context_t*));
+    if (!ctx->worker_video_ctx) {
+        fprintf(stderr, "Error: Failed to allocate worker video contexts\n");
+        goto fail;
+    }
+
+    {
+        tav_video_params_t video_params = {
+            .width = ctx->header.width,
+            .height = ctx->header.height,
+            .decomp_levels = ctx->header.decomp_levels,
+            .temporal_levels = 2,
+            .wavelet_filter = ctx->header.wavelet_filter,
+            .temporal_wavelet = 0,
+            .entropy_coder = ctx->header.entropy_coder,
+            .channel_layout = ctx->header.channel_layout,
+            .perceptual_tuning = ctx->perceptual_mode,
+            .quantiser_y = ctx->header.quantiser_y,
+            .quantiser_co = ctx->header.quantiser_co,
+            .quantiser_cg = ctx->header.quantiser_cg,
+            .encoder_preset = ctx->header.encoder_preset,
+            .monoblock = 1
+        };
+
+        for (int i = 0; i < ctx->num_threads; i++) {
+            ctx->worker_video_ctx[i] = tav_video_create(&video_params);
+            if (!ctx->worker_video_ctx[i]) {
+                fprintf(stderr, "Error: Failed to create video context for thread %d\n", i);
+                goto fail;
+            }
+        }
+    }
+
+    // Initialize synchronization primitives
+    pthread_mutex_init(&ctx->mutex, NULL);
+    pthread_cond_init(&ctx->cond_job_available, NULL);
+    pthread_cond_init(&ctx->cond_slot_free, NULL);
+    sync_ready = 1;
+    ctx->threads_should_exit = 0;
+    ctx->next_write_slot = 0;
+    ctx->next_read_slot = 0;
+    ctx->jobs_submitted = 0;
+    ctx->jobs_completed = 0;
+
+    // Create worker threads
+    ctx->worker_threads = malloc(ctx->num_threads * sizeof(pthread_t));
+    if (!ctx->worker_threads) {
+        fprintf(stderr, "Error: Failed to allocate worker threads\n");
+        goto fail;
+    }
+
+    // Hold the mutex while the threads are created: each worker first takes
+    // the mutex, so none of them reads worker_threads[] before it is complete.
+    pthread_mutex_lock(&ctx->mutex);
+    while (threads_started < ctx->num_threads) {
+        if (pthread_create(&ctx->worker_threads[threads_started], NULL,
+                           decoder_worker_thread, ctx) != 0) {
+            fprintf(stderr, "Error: Failed to create worker thread %d\n", threads_started);
+            // Workers that did start will see this flag and return immediately
+            ctx->threads_should_exit = 1;
+            break;
+        }
+        threads_started++;
+    }
+    pthread_mutex_unlock(&ctx->mutex);
+
+    if (threads_started < ctx->num_threads) {
+        goto fail;
+    }
+
+    if (ctx->verbose) {
+        printf("Initialized %d decoder worker threads with %d slots\n",
+               ctx->num_threads, ctx->num_slots);
+    }
+
+    return 0;
+
+fail:
+    // Undo whatever was set up, in reverse order of construction
+    for (int i = 0; i < threads_started; i++) {
+        pthread_join(ctx->worker_threads[i], NULL);
+    }
+    free(ctx->worker_threads);
+    ctx->worker_threads = NULL;
+
+    if (ctx->worker_video_ctx) {
+        for (int i = 0; i < ctx->num_threads; i++) {
+            if (ctx->worker_video_ctx[i]) {
+                tav_video_free(ctx->worker_video_ctx[i]);
+            }
+        }
+        free(ctx->worker_video_ctx);
+        ctx->worker_video_ctx = NULL;
+    }
+
+    if (ctx->slots) {
+        for (int i = 0; i < ctx->num_slots; i++) {
+            if (ctx->slots[i].frames) {
+                for (int j = 0; j < ctx->slots[i].frames_allocated; j++) {
+                    free(ctx->slots[i].frames[j]);
+                }
+                free(ctx->slots[i].frames);
+            }
+        }
+        free(ctx->slots);
+        ctx->slots = NULL;
+    }
+
+    if (sync_ready) {
+        pthread_mutex_destroy(&ctx->mutex);
+        pthread_cond_destroy(&ctx->cond_job_available);
+        pthread_cond_destroy(&ctx->cond_slot_free);
+    }
+
+    ctx->num_slots = 0;
+    ctx->num_threads = 0;  // Nothing left for cleanup_decoder_threads() to release
+    return -1;
+}
+
+// Shut the decode pipeline down: stop and join the workers, then release the
+// per-thread decoder contexts, the job slots and the synchronisation objects.
+// Does nothing in single-threaded mode (num_threads <= 0).
+static void cleanup_decoder_threads(decoder_context_t *ctx) {
+    if (ctx->num_threads <= 0) {
+        return;
+    }
+
+    // Raise the exit flag under the mutex and wake every idle worker
+    pthread_mutex_lock(&ctx->mutex);
+    ctx->threads_should_exit = 1;
+    pthread_cond_broadcast(&ctx->cond_job_available);
+    pthread_mutex_unlock(&ctx->mutex);
+
+    // All workers must have returned before anything they use is released
+    for (int t = 0; t < ctx->num_threads; t++) {
+        pthread_join(ctx->worker_threads[t], NULL);
+    }
+    free(ctx->worker_threads);
+    ctx->worker_threads = NULL;
+
+    // Per-thread decoder contexts
+    for (int t = 0; t < ctx->num_threads; t++) {
+        tav_video_free(ctx->worker_video_ctx[t]);
+    }
+    free(ctx->worker_video_ctx);
+    ctx->worker_video_ctx = NULL;
+
+    // Job slots: frame buffers, the pointer array, and any undecoded payload
+    for (int s = 0; s < ctx->num_slots; s++) {
+        gop_decode_job_t *slot = &ctx->slots[s];
+
+        if (slot->frames != NULL) {
+            for (int f = 0; f < slot->frames_allocated; f++) {
+                free(slot->frames[f]);
+            }
+            free(slot->frames);
+        }
+        free(slot->compressed_data);  // free(NULL) is a no-op
+    }
+    free(ctx->slots);
+    ctx->slots = NULL;
+
+    // Synchronisation objects go last, once no thread can touch them
+    pthread_mutex_destroy(&ctx->mutex);
+    pthread_cond_destroy(&ctx->cond_job_available);
+    pthread_cond_destroy(&ctx->cond_slot_free);
+}
+
 // =============================================================================
 // Frame Buffer Management
 // =============================================================================
@@ -710,6 +977,301 @@ static int process_packet(decoder_context_t *ctx) {
    }
 }

+// =============================================================================
+// Multithreaded Video Decoding (Pass 2)
+// =============================================================================
+
+// Read a single GOP packet without decoding - for multithreaded submission.
+//
+// Reads the 1-byte GOP size, the 4-byte compressed size and the compressed
+// payload from ctx->input_fp and stores them in slot `slot_idx`. The payload
+// buffer is malloc'd here and handed over to the slot.
+//
+// Returns the GOP size (frame count) on success, or -1 on a short read, an
+// allocation failure, or a GOP that has more frames than the slot has frame
+// buffers for.
+static int read_gop_packet_mt(decoder_context_t *ctx, int slot_idx) {
+    gop_decode_job_t *job = &ctx->slots[slot_idx];
+
+    // Read GOP size (1 byte)
+    uint8_t gop_size;
+    if (fread(&gop_size, 1, 1, ctx->input_fp) != 1) {
+        return -1;
+    }
+    ctx->bytes_read++;
+
+    // The decoder writes gop_size frames into job->frames[]; refuse GOPs that
+    // would run past the buffers pre-allocated for this slot.
+    if (gop_size > job->frames_allocated) {
+        fprintf(stderr, "Error: GOP size %d exceeds slot capacity of %d frames\n",
+                (int)gop_size, job->frames_allocated);
+        return -1;
+    }
+
+    // Read compressed size (4 bytes)
+    uint32_t compressed_size;
+    if (fread(&compressed_size, 4, 1, ctx->input_fp) != 1) {
+        return -1;
+    }
+    ctx->bytes_read += 4;
+
+    // Read compressed data
+    uint8_t *compressed_data = malloc(compressed_size);
+    if (!compressed_data) {
+        fprintf(stderr, "Error: Failed to allocate compressed data buffer\n");
+        return -1;
+    }
+
+    if (fread(compressed_data, 1, compressed_size, ctx->input_fp) != compressed_size) {
+        free(compressed_data);
+        return -1;
+    }
+    ctx->bytes_read += compressed_size;
+
+    // Fill job (ownership of compressed_data passes to the slot)
+    job->compressed_data = compressed_data;
+    job->compressed_size = compressed_size;
+    job->gop_size = gop_size;
+    job->decode_result = 0;
+
+    return gop_size;
+}
+
+// Return the index of a slot that holds no job at all (not queued, not being
+// decoded, not waiting to be written out), or -1 if every slot is occupied.
+// Caller must hold ctx->mutex.
+static int mt_find_free_slot(const decoder_context_t *ctx) {
+    for (int i = 0; i < ctx->num_slots; i++) {
+        const gop_decode_job_t *slot = &ctx->slots[i];
+        if (slot->status == DECODE_SLOT_DONE &&
+            slot->job_id < 0 &&
+            slot->compressed_data == NULL) {
+            return i;
+        }
+    }
+    return -1;
+}
+
+// Return the index of the slot whose job `job_id` has finished decoding, or -1.
+// Caller must hold ctx->mutex.
+static int mt_find_done_job(const decoder_context_t *ctx, int job_id) {
+    for (int i = 0; i < ctx->num_slots; i++) {
+        if (ctx->slots[i].status == DECODE_SLOT_DONE &&
+            ctx->slots[i].job_id == job_id) {
+            return i;
+        }
+    }
+    return -1;
+}
+
+// Write finished GOPs to the video pipe in submission order.
+//
+// Jobs are written for as long as the next expected one is already decoded.
+// When it is not, the function blocks on cond_slot_free only if
+//   - drain_all is non-zero and submitted jobs are still unwritten, or
+//   - drain_all is zero and no slot is free for the next GOP (pipeline full);
+// otherwise it returns so the caller can keep feeding the workers.
+//
+// Returns 1 if the decode limit was reached while writing, 0 otherwise.
+static int mt_write_ready_jobs(decoder_context_t *ctx, size_t frame_size, int drain_all) {
+    int limit_reached = 0;
+
+    pthread_mutex_lock(&ctx->mutex);
+    while (!limit_reached) {
+        // next_write_slot holds the job_id of the next GOP to write
+        const int slot_idx = mt_find_done_job(ctx, ctx->next_write_slot);
+
+        if (slot_idx < 0) {
+            const int outstanding = ctx->jobs_submitted > ctx->next_write_slot;
+            if (!outstanding || (!drain_all && mt_find_free_slot(ctx) >= 0)) {
+                break;
+            }
+            // Wait for a worker to finish a job, then look again
+            pthread_cond_wait(&ctx->cond_slot_free, &ctx->mutex);
+            continue;
+        }
+
+        // A DONE slot with a valid job_id is touched by this thread only, so
+        // its frames can be written without holding the mutex.
+        gop_decode_job_t *job = &ctx->slots[slot_idx];
+        pthread_mutex_unlock(&ctx->mutex);
+
+        if (job->decode_result >= 0) {
+            for (int i = 0; i < job->gop_size; i++) {
+                if (ctx->video_pipe) {
+                    fwrite(job->frames[i], 1, frame_size, ctx->video_pipe);
+                }
+                ctx->frames_decoded++;
+
+                if (ctx->decode_limit > 0 && ctx->frames_decoded >= (uint64_t)ctx->decode_limit) {
+                    limit_reached = 1;
+                    break;
+                }
+            }
+            ctx->gops_decoded++;
+        } else {
+            fprintf(stderr, "\nWarning: GOP job %d failed to decode (result %d); frames skipped\n",
+                    job->job_id, job->decode_result);
+        }
+
+        // Progress
+        time_t elapsed = time(NULL) - ctx->start_time;
+        double fps = elapsed > 0 ? (double)ctx->frames_decoded / elapsed : 0.0;
+        printf("\rFrames: %lu | GOPs: %lu | %.1f fps",
+               ctx->frames_decoded, ctx->gops_decoded, fps);
+        fflush(stdout);
+
+        // Mark slot as free
+        pthread_mutex_lock(&ctx->mutex);
+        job->job_id = -1;
+        ctx->next_write_slot++;
+    }
+    pthread_mutex_unlock(&ctx->mutex);
+
+    return limit_reached;
+}
+
+// Skip `count` payload bytes of the input file and account for them.
+// Returns 0 on success, -1 if the seek fails or the count cannot be
+// represented as a seek offset.
+static int mt_skip_bytes(decoder_context_t *ctx, uint32_t count) {
+    if ((unsigned long)count > (unsigned long)LONG_MAX) {
+        return -1;
+    }
+    if (fseek(ctx->input_fp, (long)count, SEEK_CUR) != 0) {
+        return -1;
+    }
+    ctx->bytes_read += count;
+    return 0;
+}
+
+// Skip the payload of a packet that pass 2 does not decode (audio already
+// extracted in Pass 1). The packet type byte has already been consumed.
+// Returns 0 when the stream is positioned at the next packet, -1 when the
+// input ended or could not be skipped.
+static int mt_skip_packet(decoder_context_t *ctx, uint8_t packet_type) {
+    FILE *fp = ctx->input_fp;
+
+    switch (packet_type) {
+        case TAV_PACKET_AUDIO_TAD: {
+            // TAD format: [sample_count(2)][payload_size+7(4)][data...]
+            uint16_t sample_count;
+            uint32_t payload_size;
+            if (fread(&sample_count, 2, 1, fp) != 1) return -1;
+            if (fread(&payload_size, 4, 1, fp) != 1) return -1;
+            ctx->bytes_read += 6;
+            return mt_skip_bytes(ctx, payload_size);
+        }
+        case TAV_PACKET_AUDIO_PCM8:
+        case TAV_PACKET_AUDIO_MP2:
+        case TAV_PACKET_AUDIO_TRACK:
+        case TAV_PACKET_SUBTITLE:
+        case TAV_PACKET_SUBTITLE_TC:
+        case TAV_PACKET_PFRAME: {
+            // [size(4)][data...]
+            uint32_t size;
+            if (fread(&size, 4, 1, fp) != 1) return -1;
+            ctx->bytes_read += 4;
+            return mt_skip_bytes(ctx, size);
+        }
+        case TAV_PACKET_SCREEN_MASK:
+            return mt_skip_bytes(ctx, 4);
+        case TAV_PACKET_GOP_SYNC:
+            return mt_skip_bytes(ctx, 1);
+        case TAV_PACKET_TIMECODE:
+            return mt_skip_bytes(ctx, 8);
+        case TAV_PACKET_EXTENDED_HDR: {
+            // Payload sizes of the fixed-size value types 0x00..0x04
+            static const uint8_t fixed_sizes[] = {2, 3, 4, 6, 8};
+            uint16_t num_pairs;
+            if (fread(&num_pairs, 2, 1, fp) != 1) return -1;
+            ctx->bytes_read += 2;
+            for (int i = 0; i < num_pairs; i++) {
+                // 5-byte entry header; its last byte is the value type
+                uint8_t kv_header[5];
+                if (fread(kv_header, 1, 5, fp) != 5) return -1;
+                ctx->bytes_read += 5;
+                const uint8_t value_type = kv_header[4];
+                if (value_type <= 0x04) {
+                    if (mt_skip_bytes(ctx, fixed_sizes[value_type]) < 0) return -1;
+                } else if (value_type == 0x10) {
+                    // Variable-length value: [length(2)][data...]
+                    uint16_t length;
+                    if (fread(&length, 2, 1, fp) != 1) return -1;
+                    ctx->bytes_read += 2;
+                    if (mt_skip_bytes(ctx, length) < 0) return -1;
+                }
+                // Any other value type: no payload is skipped
+            }
+            return 0;
+        }
+        case TAV_PACKET_SYNC_NTSC:
+        case TAV_PACKET_SYNC:
+            // No payload
+            return 0;
+        default: {
+            // Unknown packet, try to skip it as [size(4)][data...]
+            uint32_t size;
+            if (fread(&size, 4, 1, fp) != 1) return -1;
+            ctx->bytes_read += 4;
+            if (size < 1000000) {
+                return mt_skip_bytes(ctx, size);
+            }
+            // Implausible size: keep going from here without skipping
+            return 0;
+        }
+    }
+}
+
+// Multithreaded pass 2 decoding loop.
+//
+// Reads packets from ctx->input_fp, hands GOP packets to the worker threads
+// through the job slots and writes decoded frames to ctx->video_pipe in
+// submission order. The reader keeps submitting GOPs while a slot is free and
+// blocks only when every slot is occupied, so up to num_slots GOPs are in
+// flight at once. Stops at end of input, on a read error, or when
+// ctx->decode_limit frames have been written. Always returns 0.
+static int decode_video_pass2_mt(decoder_context_t *ctx) {
+    // Widen before multiplying so large frame dimensions cannot overflow int
+    const size_t frame_size = (size_t)ctx->header.width * ctx->header.height * 3;
+    int input_done = 0;     // End of input or unrecoverable read error
+    int limit_reached = 0;  // decode_limit frames have been written
+    int job_counter = 0;
+
+    while (!input_done && !limit_reached) {
+        // Try to submit a new job to a free slot
+        pthread_mutex_lock(&ctx->mutex);
+        const int free_slot = mt_find_free_slot(ctx);
+        pthread_mutex_unlock(&ctx->mutex);
+
+        if (free_slot >= 0) {
+            // Read next packet
+            uint8_t packet_type;
+            if (fread(&packet_type, 1, 1, ctx->input_fp) != 1) {
+                // EOF
+                input_done = 1;
+            } else {
+                ctx->bytes_read++;
+
+                if (packet_type == TAV_PACKET_GOP_UNIFIED) {
+                    // Read GOP and submit to slot
+                    if (read_gop_packet_mt(ctx, free_slot) > 0) {
+                        pthread_mutex_lock(&ctx->mutex);
+                        ctx->slots[free_slot].job_id = job_counter++;
+                        ctx->slots[free_slot].status = DECODE_SLOT_PENDING;
+                        ctx->jobs_submitted++;
+                        pthread_cond_broadcast(&ctx->cond_job_available);
+                        pthread_mutex_unlock(&ctx->mutex);
+                    } else {
+                        input_done = 1;
+                    }
+                } else if (packet_type == TAV_PACKET_IFRAME) {
+                    // I-frames are decoded synchronously (they're rare). Write
+                    // every GOP submitted so far first so output order is kept
+                    // (assumes process_iframe_packet() emits its frame itself -
+                    // TODO confirm).
+                    limit_reached = mt_write_ready_jobs(ctx, frame_size, 1);
+                    if (!limit_reached) {
+                        process_iframe_packet(ctx);
+                    }
+                } else if (mt_skip_packet(ctx, packet_type) < 0) {
+                    input_done = 1;
+                }
+            }
+        }
+
+        // Write completed jobs in order; blocks only if the pipeline is full
+        if (!limit_reached && mt_write_ready_jobs(ctx, frame_size, 0)) {
+            limit_reached = 1;
+        }
+
+        // Check decode limit (also covers frames produced by an I-frame)
+        if (ctx->decode_limit > 0 && ctx->frames_decoded >= (uint64_t)ctx->decode_limit) {
+            limit_reached = 1;
+        }
+    }
+
+    // End of input: write everything that is still in flight. After the decode
+    // limit was hit, the remaining GOPs are intentionally not written.
+    if (!limit_reached) {
+        mt_write_ready_jobs(ctx, frame_size, 1);
+    }
+
+    // Wait for remaining jobs to complete so no worker is mid-decode on return
+    pthread_mutex_lock(&ctx->mutex);
+    while (ctx->jobs_completed < ctx->jobs_submitted) {
+        pthread_cond_wait(&ctx->cond_slot_free, &ctx->mutex);
+    }
+    pthread_mutex_unlock(&ctx->mutex);
+
+    printf("\n");
+    return 0;
+}
+
 // =============================================================================
 // Main Decoding Loop
 // =============================================================================
@@ -755,27 +1317,44 @@ static int decode_video(decoder_context_t *ctx) {
        return -1;
    }

-    // Pass 2: Video decoding
-    uint64_t last_reported = 0;
-    while (process_packet(ctx) == 0) {
-        // Progress reporting - show when frames were decoded
-        if (ctx->frames_decoded != last_reported) {
-            time_t elapsed = time(NULL) - ctx->start_time;
-            double fps = elapsed > 0 ? (double)ctx->frames_decoded / elapsed : 0.0;
-            printf("\rFrames: %lu | GOPs: %lu | %.1f fps",
-                   ctx->frames_decoded, ctx->gops_decoded, fps);
-            fflush(stdout);
-            last_reported = ctx->frames_decoded;
-        }
-
-        // Check decode limit
-        if (ctx->decode_limit > 0 && ctx->frames_decoded >= (uint64_t)ctx->decode_limit) {
-            break;
+    // Initialize decoder threads if multithreaded mode
+    if (ctx->num_threads > 0) {
+        if (init_decoder_threads(ctx) < 0) {
+            fprintf(stderr, "Error: Failed to initialize decoder threads\n");
+            return -1;
        }
+        printf("  Using %d decoder threads\n", ctx->num_threads);
    }

-    printf("\n");
-    return 0;
+    // Pass 2: Video decoding
+    if (ctx->num_threads > 0) {
+        // Multithreaded decode
+        int result = decode_video_pass2_mt(ctx);
+        cleanup_decoder_threads(ctx);
+        return result;
+    } else {
+        // Single-threaded decode
+        uint64_t last_reported = 0;
+        while (process_packet(ctx) == 0) {
+            // Progress reporting - show when frames were decoded
+            if (ctx->frames_decoded != last_reported) {
+                time_t elapsed = time(NULL) - ctx->start_time;
+                double fps = elapsed > 0 ? (double)ctx->frames_decoded / elapsed : 0.0;
+                printf("\rFrames: %lu | GOPs: %lu | %.1f fps",
+                       ctx->frames_decoded, ctx->gops_decoded, fps);
+                fflush(stdout);
+                last_reported = ctx->frames_decoded;
+            }
+
+            // Check decode limit
+            if (ctx->decode_limit > 0 && ctx->frames_decoded >= (uint64_t)ctx->decode_limit) {
+                break;
+            }
+        }
+
+        printf("\n");
+        return 0;
+    }
 }

 // =============================================================================
@@ -816,6 +1395,7 @@ static void print_usage(const char *program) {
    printf("  --no-audio               Skip audio decoding\n");
    printf("  --decode-limit N         Decode only first N frames\n");
    printf("  --dump-packets           Debug: print packet info\n");
+    printf("  -t, --threads N          Number of decoder threads (0=single-threaded, default)\n");
    printf("  -v, --verbose            Verbose output\n");
    printf("  --help                   Show this help\n");
    printf("\nExamples:\n");
@@ -835,6 +1415,7 @@ int main(int argc, char *argv[]) {
        {"input",        required_argument, 0, 'i'},
        {"output",       required_argument, 0, 'o'},
        {"verbose",      no_argument,       0, 'v'},
+        {"threads",      required_argument, 0, 't'},
        {"raw",          no_argument,       0, 1001},
        {"no-audio",     no_argument,       0, 1002},
        {"decode-limit", required_argument, 0, 1003},
@@ -844,7 +1425,7 @@ int main(int argc, char *argv[]) {
    };

    int c, option_index = 0;
-    while ((c = getopt_long(argc, argv, "i:o:vh", long_options, &option_index)) != -1) {
+    while ((c = getopt_long(argc, argv, "i:o:t:vh", long_options, &option_index)) != -1) {
        switch (c) {
            case 'i':
                ctx.input_file = strdup(optarg);
@@ -855,6 +1436,9 @@ int main(int argc, char *argv[]) {
            case 'v':
                ctx.verbose = 1;
                break;
+            case 't':
+                ctx.num_threads = atoi(optarg);
+                break;
            case 1001:
                ctx.output_raw = 1;
                break;