TAV: letterbox detection

2026-06-06 05:28:31 +09:00 · 2025-11-17 03:16:26 +09:00
parent aa7e20695d
commit 8199cbc955
3 changed files with 731 additions and 1 deletions
--- a/assets/disk0/tvdos/bin/playtav.js
+++ b/assets/disk0/tvdos/bin/playtav.js
@@ -39,6 +39,7 @@ const TAV_PACKET_SUBTITLE = 0x30       // Legacy SSF (frame-locked)
 const TAV_PACKET_SUBTITLE_TC = 0x31    // SSF-TC (timecode-based)
 const TAV_PACKET_AUDIO_BUNDLED = 0x40  // Entire MP2 audio file in single packet
 const TAV_PACKET_EXTENDED_HDR = 0xEF
+const TAV_PACKET_SCREEN_MASK = 0xF2  // Screen masking (letterbox/pillarbox)
 const TAV_PACKET_GOP_SYNC = 0xFC  // GOP sync (N frames decoded from GOP block)
 const TAV_PACKET_TIMECODE = 0xFD
 const TAV_PACKET_SYNC_NTSC = 0xFE
@@ -72,6 +73,13 @@ let currentTimecodeNs = 0  // Current playback timecode (updated every frame)
 let baseTimecodeNs = 0  // Base timecode from most recent TIMECODE packet
 let baseTimecodeFrameCount = 0  // Frame count when base timecode was set

+// Screen masking (letterbox/pillarbox) state
+let screenMaskEntries = []  // Array of {frameNum, top, right, bottom, left}
+let screenMaskTop = 0
+let screenMaskRight = 0
+let screenMaskBottom = 0
+let screenMaskLeft = 0
+
 // Parse command line options
 let interactive = false
 let filmGrainLevel = null
@@ -739,6 +747,77 @@ function scanForwardToIframe(targetFrame, currentPos) {
 }

 // Function to try reading next TAV file header at current position
+// Update the active screen mask (screenMaskTop/Right/Bottom/Left) for the given frame number.
+// Entries are assumed to be sorted by frameNum; the last entry with frameNum <= currentFrameNum wins.
+// When no entry applies (empty list, or a frame before the first entry) the mask is cleared to 0,
+// so seeking backwards cannot leave a later entry's mask active.
+function updateScreenMask(currentFrameNum) {
+    let active = null
+
+    // Scan backwards: the first match from the end is the most recent applicable entry
+    for (let i = screenMaskEntries.length - 1; i >= 0; i--) {
+        if (screenMaskEntries[i].frameNum <= currentFrameNum) {
+            active = screenMaskEntries[i]
+            break
+        }
+    }
+
+    screenMaskTop = active ? active.top : 0
+    screenMaskRight = active ? active.right : 0
+    screenMaskBottom = active ? active.bottom : 0
+    screenMaskLeft = active ? active.left : 0
+}
+
+// Fill masked regions (letterbox/pillarbox bars) with a solid colour. Currently DISABLED: the early return below makes the rest of the body unreachable.
+function fillMaskedRegions() {
+    return  // short-circuits the whole function; everything below is dead code until this line is removed
+//    console.log(`ScrMask: ${screenMaskTop}, ${screenMaskRight}, ${screenMaskBottom}, ${screenMaskLeft}`)
+
+    if (screenMaskTop === 0 && screenMaskRight === 0 &&
+        screenMaskBottom === 0 && screenMaskLeft === 0) {
+        return  // No masking
+    }
+
+    const width = header.width
+    const height = header.height
+    const blackRG = 0xF0  // NOTE(review): test value, not black - see the comment on blackBA
+    const blackBA = 0xFF // 0xF0FF (magenta) for test
+
+    // Fill top letterbox bar (clipped to the frame height)
+    for (let y = 0; y < screenMaskTop && y < height; y++) {
+        for (let x = 0; x < width; x++) {
+            graphics.plotPixel(x, y, blackRG)
+            graphics.plotPixel2(x, y, blackBA)
+        }
+    }
+
+    // Fill bottom letterbox bar
+    for (let y = height - screenMaskBottom; y < height; y++) {
+        if (y < 0) continue  // bar taller than the frame: skip rows above the top edge
+        for (let x = 0; x < width; x++) {
+            graphics.plotPixel(x, y, blackRG)
+            graphics.plotPixel2(x, y, blackBA)
+        }
+    }
+
+    // Fill left pillarbox bar (clipped to the frame width)
+    for (let y = 0; y < height; y++) {
+        for (let x = 0; x < screenMaskLeft && x < width; x++) {
+            graphics.plotPixel(x, y, blackRG)
+            graphics.plotPixel2(x, y, blackBA)
+        }
+    }
+
+    // Fill right pillarbox bar
+    for (let y = 0; y < height; y++) {
+        for (let x = width - screenMaskRight; x < width; x++) {
+            if (x < 0) continue  // bar wider than the frame: skip columns left of the left edge
+            graphics.plotPixel(x, y, blackRG)
+            graphics.plotPixel2(x, y, blackBA)
+        }
+    }
+}
+
 function tryReadNextTAVHeader() {
    // Save current position
    let currentPos = seqread.getReadCount()
@@ -1116,6 +1195,9 @@ try {
                // Do nothing - skip to next packet
            }
            else if (packetType === TAV_PACKET_IFRAME || packetType === TAV_PACKET_PFRAME) {
+                // Update active screen mask for this frame (Phase 1: just tracking, not applying)
+                updateScreenMask(frameCount)
+
                // Record I-frame position for seeking
                if (packetType === TAV_PACKET_IFRAME) {
                    iframePositions.push({offset: packetOffset, frameNum: frameCount})
@@ -1588,6 +1670,28 @@ try {
                    }
                }
            }
+            else if (packetType === TAV_PACKET_SCREEN_MASK) {
+                // Screen masking packet (letterbox/pillarbox detection)
+                // Format: frame_num(4) + top(2) + right(2) + bottom(2) + left(2) = 12 bytes
+                let frameNum = seqread.readInt()  // uint32 frame number
+                let top = seqread.readOneByte() | (seqread.readOneByte() << 8)
+                let right = seqread.readOneByte() | (seqread.readOneByte() << 8)
+                let bottom = seqread.readOneByte() | (seqread.readOneByte() << 8)
+                let left = seqread.readOneByte() | (seqread.readOneByte() << 8)
+
+                // Store in entries array
+                screenMaskEntries.push({
+                    frameNum: frameNum,
+                    top: top,
+                    right: right,
+                    bottom: bottom,
+                    left: left
+                })
+
+                if (interactive) {
+                    serial.println(`[SCREEN_MASK] frame=${frameNum} top=${top} right=${right} bottom=${bottom} left=${left}`)
+                }
+            }
            else if (packetType === TAV_PACKET_TIMECODE) {
                // Timecode packet - time since stream start in nanoseconds
                let timecodeLow = seqread.readInt()
@@ -1789,6 +1893,12 @@ try {
                graphics.uploadVideoBufferFrameToFramebuffer(currentGopFrameIndex, header.width, header.height, trueFrameCount, bufferOffset)
                uploadTime = (sys.nanoTime() - uploadStart) / 1000000.0

+                // Update active screen mask for this GOP frame
+                updateScreenMask(frameCount)
+
+                // Fill masked regions with black (letterbox/pillarbox bars)
+                fillMaskedRegions()
+
                if (interactive && currentGopFrameIndex === 0) {
 //                    console.log(`[GOP] Playing GOP: ${currentGopSize} frames from slot ${currentGopBufferSlot}`)
                }
--- a/video_encoder/decoder_tav.c
+++ b/video_encoder/decoder_tav.c
@@ -32,7 +32,9 @@
 #define TAV_PACKET_AUDIO_TAD       0x24  // TAD audio - SUPPORTED (decode to PCMu8)
 #define TAV_PACKET_AUDIO_TRACK     0x40  // Bundled audio track - SUPPORTED (passthrough)
 #define TAV_PACKET_SUBTITLE        0x30  // Subtitle - SKIPPED
+#define TAV_PACKET_SUBTITLE_TC     0x31  // Subtitle - SKIPPED
 #define TAV_PACKET_EXTENDED_HDR    0xEF  // Extended header - SKIPPED
+#define TAV_PACKET_SCREEN_MASK     0xF2  // Screen masking (letterbox/pillarbox) - PARSED
 #define TAV_PACKET_GOP_SYNC        0xFC  // GOP sync packet - SKIPPED
 #define TAV_PACKET_TIMECODE        0xFD  // Timecode - SKIPPED
 #define TAV_PACKET_SYNC_NTSC       0xFE  // NTSC sync - SKIPPED
@@ -1586,6 +1588,15 @@ static void write_wav_header(FILE *fp, uint32_t sample_rate, uint16_t channels,
 // Decoder State Structure
 //=============================================================================

+// Screen masking entry (letterbox/pillarbox geometry change), as read from a SCREEN_MASK packet
+typedef struct {
+    uint32_t frame_num;  // Entry is selected by update_screen_mask() once frame_num <= current frame
+    uint16_t top;        // Top bar size in rows
+    uint16_t right;      // Right bar size in columns
+    uint16_t bottom;     // Bottom bar size in rows
+    uint16_t left;       // Left bar size in columns
+} screen_mask_entry_t;
+
 typedef struct {
    FILE *input_fp;
    tav_header_t header;
@@ -1601,6 +1612,16 @@ typedef struct {
    int frame_size;
    int is_monoblock;           // True if version 3-6 (single tile mode)

+    // Screen masking (letterbox/pillarbox) - array of geometry changes
+    screen_mask_entry_t *screen_masks;
+    int screen_mask_count;
+    int screen_mask_capacity;
+    // Current active mask
+    uint16_t screen_mask_top;
+    uint16_t screen_mask_right;
+    uint16_t screen_mask_bottom;
+    uint16_t screen_mask_left;
+
    // FFmpeg pipe for video only (audio from file)
    FILE *video_pipe;
    pid_t ffmpeg_pid;
@@ -1669,6 +1690,11 @@ static int extract_audio_to_wav(const char *input_file, const char *wav_file, in
            continue;
        }

+        if (packet_type == TAV_PACKET_SCREEN_MASK) {
+            fseek(input_fp, 12, SEEK_CUR);  // Skip frame_num(4) + top(2) + right(2) + bottom(2) + left(2)
+            continue;
+        }
+
        if (packet_type == TAV_PACKET_GOP_UNIFIED) {
            uint8_t gop_size;
            uint32_t compressed_size;
@@ -1948,10 +1974,83 @@ static void tav_decoder_free(tav_decoder_t *decoder) {
    free(decoder->reference_ycocg_y);
    free(decoder->reference_ycocg_co);
    free(decoder->reference_ycocg_cg);
+    free(decoder->screen_masks);
    free(decoder->audio_file_path);
    free(decoder);
 }

+//=============================================================================
+// Screen Mask Management
+//=============================================================================
+
+// Fill masked regions (letterbox/pillarbox bars) with black.
+//
+//   frame_rgb    packed RGB buffer, 3 bytes per pixel, row-major, width*height pixels
+//   top, bottom  bar heights in rows
+//   left, right  bar widths in columns
+//
+// Bars larger than the frame are clipped to the frame bounds.
+// The previous body painted the bars red (top/bottom) and blue (left/right), which
+// contradicted this contract; callers write the frame to the FFmpeg video pipe right after.
+static void fill_masked_regions(uint8_t *frame_rgb, int width, int height,
+                                uint16_t top, uint16_t right, uint16_t bottom, uint16_t left) {
+    // Fill top letterbox bar
+    for (int y = 0; y < top && y < height; y++) {
+        for (int x = 0; x < width; x++) {
+            uint8_t *px = frame_rgb + (y * width + x) * 3;
+            px[0] = px[1] = px[2] = 0;  // R, G, B
+        }
+    }
+
+    // Fill bottom letterbox bar
+    for (int y = height - bottom; y < height; y++) {
+        if (y < 0) continue;  // bar taller than the frame
+        for (int x = 0; x < width; x++) {
+            uint8_t *px = frame_rgb + (y * width + x) * 3;
+            px[0] = px[1] = px[2] = 0;  // R, G, B
+        }
+    }
+
+    // Fill left pillarbox bar
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < left && x < width; x++) {
+            uint8_t *px = frame_rgb + (y * width + x) * 3;
+            px[0] = px[1] = px[2] = 0;  // R, G, B
+        }
+    }
+
+    // Fill right pillarbox bar
+    for (int y = 0; y < height; y++) {
+        for (int x = width - right; x < width; x++) {
+            if (x < 0) continue;  // bar wider than the frame
+            uint8_t *px = frame_rgb + (y * width + x) * 3;
+            px[0] = px[1] = px[2] = 0;  // R, G, B
+        }
+    }
+}
+
+// Update the decoder's active screen mask for the given frame number.
+// Entries are assumed sorted by frame_num, so the last entry with
+// frame_num <= current_frame_num wins; if none qualifies the active mask is left unchanged.
+static void update_screen_mask(tav_decoder_t *decoder, uint32_t current_frame_num) {
+    if (!decoder->screen_masks || decoder->screen_mask_count == 0) {
+        return;  // No screen mask entries
+    }
+
+    // Find the most recent screen mask entry for this frame
+    // Scan backwards so the first hit is the latest applicable entry
+    for (int i = decoder->screen_mask_count - 1; i >= 0; i--) {
+        if (decoder->screen_masks[i].frame_num <= current_frame_num) {
+            // Copy this entry into the decoder's active mask fields
+            decoder->screen_mask_top = decoder->screen_masks[i].top;
+            decoder->screen_mask_right = decoder->screen_masks[i].right;
+            decoder->screen_mask_bottom = decoder->screen_masks[i].bottom;
+            decoder->screen_mask_left = decoder->screen_masks[i].left;
+            return;
+        }
+    }
+}
+
 //=============================================================================
 // Frame Decoding Logic
 //=============================================================================
@@ -2486,6 +2585,50 @@ int main(int argc, char *argv[]) {
            continue;
        }

+        // Handle screen masking packets (letterbox/pillarbox detection)
+        // Format: frame_num(4) + top(2) + right(2) + bottom(2) + left(2) = 12 bytes
+        if (packet_type == TAV_PACKET_SCREEN_MASK) {
+            uint32_t frame_num;
+            uint16_t top, right, bottom, left;
+            if (fread(&frame_num, 4, 1, decoder->input_fp) != 1 ||
+                fread(&top, 2, 1, decoder->input_fp) != 1 ||
+                fread(&right, 2, 1, decoder->input_fp) != 1 ||
+                fread(&bottom, 2, 1, decoder->input_fp) != 1 ||
+                fread(&left, 2, 1, decoder->input_fp) != 1) {
+                fprintf(stderr, "Error: Failed to read screen mask packet\n");
+                result = -1;
+                break;
+            }
+
+            // Allocate array if needed
+            if (decoder->screen_masks == NULL) {
+                decoder->screen_mask_capacity = 16;
+                decoder->screen_masks = malloc(decoder->screen_mask_capacity * sizeof(screen_mask_entry_t));
+                decoder->screen_mask_count = 0;
+            }
+
+            // Expand array if needed
+            if (decoder->screen_mask_count >= decoder->screen_mask_capacity) {
+                decoder->screen_mask_capacity *= 2;
+                decoder->screen_masks = realloc(decoder->screen_masks,
+                                               decoder->screen_mask_capacity * sizeof(screen_mask_entry_t));
+            }
+
+            // Store entry
+            screen_mask_entry_t *entry = &decoder->screen_masks[decoder->screen_mask_count++];
+            entry->frame_num = frame_num;
+            entry->top = top;
+            entry->right = right;
+            entry->bottom = bottom;
+            entry->left = left;
+
+            if (verbose) {
+                fprintf(stderr, "Packet %d: SCREEN_MASK (0x%02X) - frame=%u top=%u right=%u bottom=%u left=%u\n",
+                       total_packets, packet_type, frame_num, top, right, bottom, left);
+            }
+            continue;
+        }
+
        // Handle GOP unified packets (custom format: 1-byte gop_size + 4-byte compressed_size)
        if (packet_type == TAV_PACKET_GOP_UNIFIED) {
            uint8_t gop_size;
@@ -2738,6 +2881,14 @@ int main(int argc, char *argv[]) {
                    frame_rgb[i * 3 + 2] = b;
                }

+                // Update active screen mask for this GOP frame
+                update_screen_mask(decoder, decoder->frame_count + t);
+
+                // Fill masked regions with black (letterbox/pillarbox bars)
+                fill_masked_regions(frame_rgb, decoder->header.width, decoder->header.height,
+                                   decoder->screen_mask_top, decoder->screen_mask_right,
+                                   decoder->screen_mask_bottom, decoder->screen_mask_left);
+
                // Write frame to FFmpeg video pipe
                const size_t bytes_to_write = decoder->frame_size * 3;

@@ -2869,6 +3020,9 @@ int main(int argc, char *argv[]) {
        switch (packet_type) {
            case TAV_PACKET_IFRAME:
            case TAV_PACKET_PFRAME:
+                // Update active screen mask for this frame (Phase 1: just tracking, not applying)
+                update_screen_mask(decoder, decoder->frame_count);
+
                iframe_count++;
                if (verbose && iframe_count <= 5) {
                    fprintf(stderr, "Processing %s (packet %d, size %u bytes)...\n",
@@ -2902,6 +3056,7 @@ int main(int argc, char *argv[]) {
                break;

            case TAV_PACKET_SUBTITLE:
+            case TAV_PACKET_SUBTITLE_TC:
                // Skip subtitle packets
                fseek(decoder->input_fp, packet_size, SEEK_CUR);
                break;
--- a/video_encoder/encoder_tav.c
+++ b/video_encoder/encoder_tav.c
@@ -59,6 +59,7 @@
 #define TAV_PACKET_SUBTITLE_TC     0x31  // Subtitle packet with timecode (SSF-TC format)
 #define TAV_PACKET_AUDIO_TRACK     0x40  // Separate audio track (full MP2 file)
 #define TAV_PACKET_EXTENDED_HDR    0xEF  // Extended header packet
+#define TAV_PACKET_SCREEN_MASK     0xF2  // Screen masking packet (letterbox/pillarbox)
 #define TAV_PACKET_GOP_SYNC        0xFC  // GOP sync packet (N frames decoded)
 #define TAV_PACKET_TIMECODE        0xFD  // Timecode packet
 #define TAV_PACKET_SYNC_NTSC       0xFE  // NTSC Sync packet
@@ -199,6 +200,13 @@ typedef struct frame_analysis {
    // Detection results
    int is_scene_change;         // Final scene change flag
    double scene_change_score;   // Composite score for debugging
+
+    // Letterbox/pillarbox detection
+    uint16_t letterbox_top;
+    uint16_t letterbox_right;
+    uint16_t letterbox_bottom;
+    uint16_t letterbox_left;
+    int has_letterbox;           // 1 if any masking detected
 } frame_analysis_t;

 // GOP boundary list for two-pass encoding
@@ -1804,6 +1812,7 @@ typedef struct tav_encoder_s {
    int separate_audio_track; // 1 = write entire MP2 file as packet 0x40 after header, 0 = interleave audio (default)
    int pcm8_audio; // 1 = use 8-bit PCM audio (packet 0x21), 0 = use MP2 (default)
    int tad_audio; // 1 = use TAD audio (packet 0x24), 0 = use MP2/PCM8 (default, quality follows quality_level)
+    int enable_letterbox_detect; // 1 = detect and emit letterbox/pillarbox packets (default), 0 = disable

    // Frame buffers - ping-pong implementation
    uint8_t *frame_rgb[2];      // [0] and [1] alternate between current and previous
@@ -2419,6 +2428,7 @@ static tav_encoder_t* create_encoder(void) {
    enc->separate_audio_track = 0;  // Default: interleave audio packets
    enc->pcm8_audio = 0;  // Default: use MP2 audio
    enc->tad_audio = 0;  // Default: use MP2 audio (TAD quality follows quality_level)
+    enc->enable_letterbox_detect = 1;  // Default: enable letterbox/pillarbox detection

    // GOP / temporal DWT settings
    enc->enable_temporal_dwt = 1;  // Mutually exclusive with use_delta_encoding
@@ -8125,6 +8135,415 @@ static void write_timecode_packet(FILE *output, int frame_num, int fps, int is_n
    fwrite(&timecode_ns, sizeof(uint64_t), 1, output);
 }

+// Write one screen masking packet (letterbox/pillarbox geometry starting at frame_num) to `output`.
+// Packet structure: type(1) + frame_num(4) + top(2) + right(2) + bottom(2) + left(2) = 13 bytes; fields are written from memory (host byte order) and fwrite results are not checked.
+static void write_screen_mask_packet(FILE *output, uint32_t frame_num,
+                                      uint16_t top, uint16_t right,
+                                      uint16_t bottom, uint16_t left) {
+    uint8_t packet_type = TAV_PACKET_SCREEN_MASK;
+    fwrite(&packet_type, 1, 1, output);  // packet type byte
+    fwrite(&frame_num, sizeof(uint32_t), 1, output);
+    fwrite(&top, sizeof(uint16_t), 1, output);  // field order must match the readers: top, right, bottom, left
+    fwrite(&right, sizeof(uint16_t), 1, output);
+    fwrite(&bottom, sizeof(uint16_t), 1, output);
+    fwrite(&left, sizeof(uint16_t), 1, output);
+}
+
+// Calculate the Sobel gradient magnitude (edge strength) of the luma at pixel (x, y) of a packed RGB frame (3 bytes per pixel, row-major); returns sqrt(gx^2 + gy^2)
+static float calculate_sobel_magnitude(const uint8_t *frame_rgb, int width, int height,
+                                         int x, int y) {
+    // Sobel kernels for X and Y gradients
+    // Gx = [[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]]
+    // Gy = [[-1, -2, -1], [0, 0, 0], [1, 2, 1]]
+
+    // Handle boundary conditions by clamping neighbour coordinates to the frame (edge pixels are repeated)
+    int x_prev = (x > 0) ? (x - 1) : 0;
+    int x_next = (x < width - 1) ? (x + 1) : (width - 1);
+    int y_prev = (y > 0) ? (y - 1) : 0;
+    int y_next = (y < height - 1) ? (y + 1) : (height - 1);
+
+    // Sample 3x3 neighborhood as luma; pixels[dy][dx] with index 0 = previous, 1 = centre, 2 = next
+    float pixels[3][3];
+    for (int dy = 0; dy < 3; dy++) {
+        for (int dx = 0; dx < 3; dx++) {
+            int sample_y = (dy == 0) ? y_prev : ((dy == 1) ? y : y_next);
+            int sample_x = (dx == 0) ? x_prev : ((dx == 1) ? x : x_next);
+            int offset = (sample_y * width + sample_x) * 3;
+
+            // Convert to luma (simple approximation: Y = 0.299R + 0.587G + 0.114B)
+            pixels[dy][dx] = (0.299f * frame_rgb[offset] +
+                              0.587f * frame_rgb[offset + 1] +
+                              0.114f * frame_rgb[offset + 2]);
+        }
+    }
+
+    // Apply Sobel operators (gx: right column minus left column; gy: bottom row minus top row)
+    float gx = -pixels[0][0] + pixels[0][2] +
+               -2*pixels[1][0] + 2*pixels[1][2] +
+               -pixels[2][0] + pixels[2][2];
+
+    float gy = -pixels[0][0] - 2*pixels[0][1] - pixels[0][2] +
+                pixels[2][0] + 2*pixels[2][1] + pixels[2][2];
+
+    // Calculate magnitude: sqrt(gx^2 + gy^2)
+    return sqrtf(gx * gx + gy * gy);
+}
+
+// Post-process detected bar sizes in place: drop tiny bars, force symmetry, filter lopsided windowboxing.
+// 1) Bars smaller than 4% of the frame height (top/bottom) or width (left/right) are zeroed as noise.
+// 2) Each remaining pair is made symmetric: both sides become min(pair) + 1 (top=bottom, left=right).
+// 3) When letterbox AND pillarbox are both significant (>= 8%), the one contributing < 25% of the combined masking is dropped.
+static void apply_symmetric_cropping(uint16_t *top, uint16_t *right,
+                                       uint16_t *bottom, uint16_t *left,
+                                       int width, int height,
+                                       uint16_t current_top, uint16_t current_bottom,
+                                       uint16_t current_left, uint16_t current_right) {
+    const int MIN_BAR_SIZE_LETTER = (int)(0.04f * height);  // Minimum letterbox bar size: 4% of frame height
+    const int MIN_BAR_SIZE_PILLAR = (int)(0.04f * width);  // Minimum pillarbox bar size: 4% of frame width
+    const int SIGNIFICANT_THRESHOLD_LETTER = (int)(0.08f * height);  // Letterbox is significant at 8%+ of frame height
+    const int SIGNIFICANT_THRESHOLD_PILLAR = (int)(0.08f * width);  // Pillarbox is significant at 8%+ of frame width
+
+    // Filter out small bars (noise/detection errors)
+    if (*top < MIN_BAR_SIZE_LETTER) *top = 0;
+    if (*bottom < MIN_BAR_SIZE_LETTER) *bottom = 0;
+    if (*left < MIN_BAR_SIZE_PILLAR) *left = 0;
+    if (*right < MIN_BAR_SIZE_PILLAR) *right = 0;
+
+    // ALWAYS make letterbox (top/bottom) perfectly symmetric
+    if (*top > 0 || *bottom > 0) {
+        // Use the smaller side (+1 px). NOTE(review): if only one side survived, this yields 1-pixel bars - confirm intended
+        uint16_t symmetric_value = (*top < *bottom) ? *top : *bottom;
+        *top = symmetric_value+1;
+        *bottom = symmetric_value+1;
+    }
+
+    // ALWAYS make pillarbox (left/right) perfectly symmetric
+    if (*left > 0 || *right > 0) {
+        // Use the smaller side (+1 px), same rule as for the letterbox pair
+        uint16_t symmetric_value = (*left < *right) ? *left : *right;
+        *left = symmetric_value+1;
+        *right = symmetric_value+1;
+    }
+
+    // Check if BOTH letterbox and pillarbox are detected simultaneously
+    int new_has_letterbox = (*top >= SIGNIFICANT_THRESHOLD_LETTER || *bottom >= SIGNIFICANT_THRESHOLD_LETTER);
+    int new_has_pillarbox = (*left >= SIGNIFICANT_THRESHOLD_PILLAR || *right >= SIGNIFICANT_THRESHOLD_PILLAR);
+    // The current_* geometry is not consulted by this function (the unused current_has_* locals were removed)
+    (void)current_top; (void)current_bottom; (void)current_left; (void)current_right;
+
+    // Only suppress when BOTH are detected AND one is much smaller (likely false positive);
+    // balanced windowboxing is kept
+    if (new_has_letterbox && new_has_pillarbox) {
+        int letterbox_size = *top + *bottom;
+        int pillarbox_size = *left + *right;
+
+        // Normalise each bar pair by its frame dimension, then compare shares of the total:
+        // only suppress if one is less than 25% of total masking.
+        // This allows legitimate windowboxing while filtering false positives
+        float letterbox_ratio_geom = (float)letterbox_size / height;
+        float pillarbox_ratio_geom = (float)pillarbox_size / width;
+        float ratio_sum = letterbox_ratio_geom + pillarbox_ratio_geom;
+        float letterbox_ratio = letterbox_ratio_geom / ratio_sum;
+        float pillarbox_ratio = pillarbox_ratio_geom / ratio_sum;
+
+        if (letterbox_ratio < 0.25f) {
+            *top = 0;
+            *bottom = 0;
+        } else if (pillarbox_ratio < 0.25f) {
+            *left = 0;
+            *right = 0;
+        }
+        // Otherwise keep both (legitimate windowboxing)
+    }
+}
+
+// Detect letterbox/pillarbox bars in the encoder's current frame (reads current_frame_rgb and the current_frame_y/co/cg planes)
+// Returns 1 if any bar remains after apply_symmetric_cropping(), 0 otherwise (also 0 when current_frame_rgb is NULL)
+// Sets top, right, bottom, left to the size of detected bars in pixels; each side is only searched within the outer quarter of the frame
+static int detect_letterbox_pillarbox(tav_encoder_t *enc,
+                                       uint16_t *top, uint16_t *right,
+                                       uint16_t *bottom, uint16_t *left) {
+    if (!enc->current_frame_rgb) return 0;
+
+    const int width = enc->width;
+    const int height = enc->height;
+    const int SAMPLE_RATE_HORZ = 4;  // Sample every 4th pixel for performance
+    const int SAMPLE_RATE_VERT = 4;  // Sample every 4th pixel for performance
+    const float Y_THRESHOLD = 2.0f;  // Y < 2 for dark pixels
+    const float CHROMA_THRESHOLD = 1.0f;  // Co/Cg close to 0 (in ±255 scale)
+    const float EDGE_ACTIVITY_THRESHOLD = 1.0f;  // Mean Sobel magnitude < 1.0
+    const float ROW_COL_BLACK_RATIO = 0.999f;  // 99.9% of sampled pixels must be black
+
+    *top = 0;
+    *bottom = 0;
+    *left = 0;
+    *right = 0;
+
+    // Detect top letterbox: walk rows downwards until the first non-bar row
+    for (int y = 0; y < height / 4; y++) {
+        int black_pixel_count = 0;
+        float total_edge_activity = 0.0f;
+        int sampled_pixels = 0;
+
+        for (int x = 0; x < width; x += SAMPLE_RATE_HORZ) {
+            int idx = y * width + x;
+
+            // Use pre-converted YCoCg values (optimization: avoid RGB→YCoCg conversion in loop)
+            float yval = enc->current_frame_y[idx];
+             float co = enc->current_frame_co[idx];
+             float cg = enc->current_frame_cg[idx];
+
+            // Check if pixel is dark and neutral (letterbox bar)
+            if (yval < Y_THRESHOLD &&
+                fabs(co) < CHROMA_THRESHOLD &&
+                fabs(cg) < CHROMA_THRESHOLD) {
+                black_pixel_count++;
+            }
+
+            // Calculate edge activity
+            total_edge_activity += calculate_sobel_magnitude(enc->current_frame_rgb,
+                                                             width, height, x, y);
+            sampled_pixels++;
+        }
+
+        float black_ratio = (float)black_pixel_count / sampled_pixels;
+        float mean_edge_activity = total_edge_activity / sampled_pixels;
+
+        // Row is part of letterbox if mostly black AND low edge activity
+        if (black_ratio > ROW_COL_BLACK_RATIO &&
+            mean_edge_activity < EDGE_ACTIVITY_THRESHOLD) {
+            *top = y + 1;  // rows 0..y are all bar rows
+        } else {
+            break;  // Found content
+        }
+    }
+
+    // Detect bottom letterbox: walk rows upwards from the last row
+    for (int y = height - 1; y >= height * 3 / 4; y--) {
+        int black_pixel_count = 0;
+        float total_edge_activity = 0.0f;
+        int sampled_pixels = 0;
+
+        for (int x = 0; x < width; x += SAMPLE_RATE_HORZ) {
+            int idx = y * width + x;
+
+            // Use pre-converted YCoCg values (optimization)
+            float yval = enc->current_frame_y[idx];
+             float co = enc->current_frame_co[idx];
+             float cg = enc->current_frame_cg[idx];
+
+            if (yval < Y_THRESHOLD &&
+                fabs(co) < CHROMA_THRESHOLD &&
+                fabs(cg) < CHROMA_THRESHOLD) {
+                black_pixel_count++;
+            }
+
+            total_edge_activity += calculate_sobel_magnitude(enc->current_frame_rgb,
+                                                             width, height, x, y);
+            sampled_pixels++;
+        }
+
+        float black_ratio = (float)black_pixel_count / sampled_pixels;
+        float mean_edge_activity = total_edge_activity / sampled_pixels;
+
+        if (black_ratio > ROW_COL_BLACK_RATIO &&
+            mean_edge_activity < EDGE_ACTIVITY_THRESHOLD) {
+            *bottom = height - y;  // rows y..height-1 are all bar rows
+        } else {
+            break;
+        }
+    }
+
+    // Detect left pillarbox: walk columns rightwards until the first non-bar column
+    for (int x = 0; x < width / 4; x++) {
+        int black_pixel_count = 0;
+        float total_edge_activity = 0.0f;
+        int sampled_pixels = 0;
+
+        for (int y = 0; y < height; y += SAMPLE_RATE_VERT) {
+            int idx = y * width + x;
+
+            // Use pre-converted YCoCg values (optimization)
+            float yval = enc->current_frame_y[idx];
+             float co = enc->current_frame_co[idx];
+             float cg = enc->current_frame_cg[idx];
+
+            if (yval < Y_THRESHOLD &&
+                fabs(co) < CHROMA_THRESHOLD &&
+                fabs(cg) < CHROMA_THRESHOLD) {
+                black_pixel_count++;
+            }
+
+            total_edge_activity += calculate_sobel_magnitude(enc->current_frame_rgb,
+                                                             width, height, x, y);
+            sampled_pixels++;
+        }
+
+        float black_ratio = (float)black_pixel_count / sampled_pixels;
+        float mean_edge_activity = total_edge_activity / sampled_pixels;
+
+        if (black_ratio > ROW_COL_BLACK_RATIO &&
+            mean_edge_activity < EDGE_ACTIVITY_THRESHOLD) {
+            *left = x + 1;  // columns 0..x are all bar columns
+        } else {
+            break;
+        }
+    }
+
+    // Detect right pillarbox: walk columns leftwards from the last column
+    for (int x = width - 1; x >= width * 3 / 4; x--) {
+        int black_pixel_count = 0;
+        float total_edge_activity = 0.0f;
+        int sampled_pixels = 0;
+
+        for (int y = 0; y < height; y += SAMPLE_RATE_VERT) {
+            int idx = y * width + x;
+
+            // Use pre-converted YCoCg values (optimization)
+            float yval = enc->current_frame_y[idx];
+             float co = enc->current_frame_co[idx];
+             float cg = enc->current_frame_cg[idx];
+
+            if (yval < Y_THRESHOLD &&
+                fabs(co) < CHROMA_THRESHOLD &&
+                fabs(cg) < CHROMA_THRESHOLD) {
+                black_pixel_count++;
+            }
+
+            total_edge_activity += calculate_sobel_magnitude(enc->current_frame_rgb,
+                                                             width, height, x, y);
+            sampled_pixels++;
+        }
+
+        float black_ratio = (float)black_pixel_count / sampled_pixels;
+        float mean_edge_activity = total_edge_activity / sampled_pixels;
+
+        if (black_ratio > ROW_COL_BLACK_RATIO &&
+            mean_edge_activity < EDGE_ACTIVITY_THRESHOLD) {
+            *right = width - x;  // columns x..width-1 are all bar columns
+        } else {
+            break;
+        }
+    }
+
+    // Apply symmetric cropping preference and minimum bar size filtering
+    // Note: During detection phase, no current state available (use 0,0,0,0)
+    apply_symmetric_cropping(top, right, bottom, left, width, height, 0, 0, 0, 0);
+
+    // Return 1 if any masking was detected
+    return (*top > 0 || *bottom > 0 || *left > 0 || *right > 0);
+}
+
+// Refine geometry change detection - find the first frame in [start_frame, end_frame] whose stored letterbox_* values differ from old_* on any side
+// Uses a linear scan over enc->frame_analyses (bounded by frame_analyses_count); returns that frame index, or end_frame if none differs
+static int refine_geometry_change(tav_encoder_t *enc, int start_frame, int end_frame,
+                                 uint16_t old_top, uint16_t old_right,
+                                 uint16_t old_bottom, uint16_t old_left) {
+    #define GEOMETRY_TOLERANCE 4  // ±4 pixels tolerance
+
+    // Linear scan from start to find first frame with new geometry
+    for (int i = start_frame; i <= end_frame && i < enc->frame_analyses_count; i++) {
+        frame_analysis_t *m = &enc->frame_analyses[i];
+
+        // Check if this frame has different geometry (beyond tolerance) on at least one side
+        if (abs((int)m->letterbox_top - (int)old_top) > GEOMETRY_TOLERANCE ||
+            abs((int)m->letterbox_right - (int)old_right) > GEOMETRY_TOLERANCE ||
+            abs((int)m->letterbox_bottom - (int)old_bottom) > GEOMETRY_TOLERANCE ||
+            abs((int)m->letterbox_left - (int)old_left) > GEOMETRY_TOLERANCE) {
+            return i;  // Found the change point
+        }
+    }
+
+    return end_frame;  // No change found, use end frame
+
+    #undef GEOMETRY_TOLERANCE
+}
+
+// Write all screen masking packets before first frame (similar to SSF-TC subtitles).
+//
+// Uses a two-stage approach over the first-pass frame analyses:
+//   1. Coarse scan: sample every COARSE_STRIDE frames, starting at SKIP_INITIAL_FRAMES,
+//      and compare the sampled geometry against the geometry most recently emitted.
+//   2. Refinement: when a sample differs by CHANGE_THRESHOLD pixels or more on any edge,
+//      scan forward from the previous sample to find the first frame that deviates.
+//
+// The first packet is always stamped with frame 0, but carries the geometry measured at
+// the first sampled frame. The skipped leading frames are often black/fade-in, so their
+// own measurements are not used.
+//
+// Does nothing unless letterbox detection and two-pass mode are both enabled and
+// analysis data exists. If fewer than SKIP_INITIAL_FRAMES + 1 frames were analysed,
+// no packet is written.
+//
+// enc:    encoder state; reads enable_letterbox_detect, two_pass_mode, frame_analyses,
+//         frame_analyses_count, width, height and verbose
+// output: stream handed to write_screen_mask_packet for every emitted packet
+static void write_all_screen_mask_packets(tav_encoder_t *enc, FILE *output) {
+    if (!enc->enable_letterbox_detect || !enc->two_pass_mode) {
+        return;  // Letterbox detection requires two-pass mode
+    }
+
+    if (!enc->frame_analyses || enc->frame_analyses_count == 0) {
+        return;  // No analysis data
+    }
+
+#define COARSE_STRIDE 16      // Sample every 16 frames for coarse detection
+#define CHANGE_THRESHOLD 16  // Require 16+ pixel change to consider geometry change
+#define SKIP_INITIAL_FRAMES 60  // Skip first N frames (often black/fade-in)
+
+    // Track current geometry (the values carried by the most recently emitted packet)
+    uint16_t current_top = 0, current_right = 0, current_bottom = 0, current_left = 0;
+    int packets_written = 0;
+    int last_checked_frame = SKIP_INITIAL_FRAMES;
+
+    // Stage 1: Coarse scan every COARSE_STRIDE frames to detect geometry changes
+    for (int i = SKIP_INITIAL_FRAMES; i < enc->frame_analyses_count; i += COARSE_STRIDE) {
+        frame_analysis_t *metrics = &enc->frame_analyses[i];
+
+        // Check if geometry changed significantly
+        int is_first = (packets_written == 0);
+        int is_significant_change =
+            abs((int)metrics->letterbox_top - (int)current_top) >= CHANGE_THRESHOLD ||
+            abs((int)metrics->letterbox_right - (int)current_right) >= CHANGE_THRESHOLD ||
+            abs((int)metrics->letterbox_bottom - (int)current_bottom) >= CHANGE_THRESHOLD ||
+            abs((int)metrics->letterbox_left - (int)current_left) >= CHANGE_THRESHOLD;
+
+        if (is_first || is_significant_change) {
+            // Stage 2: Refine - find exact frame where change occurred
+            int change_frame;
+            frame_analysis_t *change_metrics;
+            if (is_first) {
+                // First packet always at frame 0, but its geometry comes from this first
+                // sampled frame: the analyses of the skipped leading frames are exactly
+                // the ones SKIP_INITIAL_FRAMES exists to avoid trusting.
+                change_frame = 0;
+                change_metrics = metrics;
+            } else {
+                // Scan forward from last_checked_frame to i to find exact change point
+                change_frame = refine_geometry_change(enc, last_checked_frame, i,
+                                                     current_top, current_right,
+                                                     current_bottom, current_left);
+                change_metrics = &enc->frame_analyses[change_frame];
+            }
+
+            // Apply symmetric cropping to final geometry (with current state for context)
+            // NOTE(review): the geometry pointers are passed as top, right, bottom, left
+            // while the current-state values are passed as top, bottom, left, right.
+            // Confirm this matches apply_symmetric_cropping's parameter order.
+            uint16_t final_top = change_metrics->letterbox_top;
+            uint16_t final_right = change_metrics->letterbox_right;
+            uint16_t final_bottom = change_metrics->letterbox_bottom;
+            uint16_t final_left = change_metrics->letterbox_left;
+            apply_symmetric_cropping(&final_top, &final_right, &final_bottom, &final_left,
+                                    enc->width, enc->height,
+                                    current_top, current_bottom, current_left, current_right);
+
+            // Emit packet
+            write_screen_mask_packet(output, change_frame,
+                                    final_top, final_right, final_bottom, final_left);
+
+            // Update current geometry
+            current_top = final_top;
+            current_right = final_right;
+            current_bottom = final_bottom;
+            current_left = final_left;
+            packets_written++;
+
+            if (enc->verbose) {
+                printf("  Frame %d: Screen mask t=%u r=%u b=%u l=%u (frame-exact detection)\n",
+                       change_frame, final_top, final_right, final_bottom, final_left);
+            }
+        }
+
+        // The next refinement only needs to search from this sample onwards
+        last_checked_frame = i;
+    }
+
+    if (packets_written > 0) {
+        printf("Wrote %d screen masking packet(s) (frame-exact detection)\n", packets_written);
+    }
+
+#undef COARSE_STRIDE
+#undef CHANGE_THRESHOLD
+#undef SKIP_INITIAL_FRAMES
+}
+
 // Write extended header packet with metadata
 // Returns the file offset where ENDT value is written (for later update)
 static long write_extended_header(tav_encoder_t *enc) {
@@ -8297,6 +8716,15 @@ static int write_tad_packet_samples(tav_encoder_t *enc, FILE *output, int sample
    if (!enc->pcm_file || enc->audio_remaining <= 0 || samples_to_read <= 0) {
        return 0;
    }
+
+    // Check if we have enough audio for a minimum chunk
+    // Don't encode if less than minimum - avoids encoding mostly padding/zeros
+    size_t min_bytes_needed = TAD32_MIN_CHUNK_SIZE * 2 * sizeof(float);
+    if (enc->audio_remaining < min_bytes_needed) {
+        enc->audio_remaining = 0;  // Mark audio as exhausted
+        return 0;
+    }
+
    size_t bytes_to_read = samples_to_read * 2 * sizeof(float);  // Stereo Float32LE

    // Don't read more than what's available
@@ -9457,9 +9885,11 @@ static int two_pass_first_pass(tav_encoder_t *enc, const char *input_file) {
        // Compute metrics

        frame_analysis_t metrics;
-        metrics.frame_number = frame_num;
        compute_frame_metrics(enc, gray, prev_dwt, sub_width, sub_height, ANALYSIS_DWT_LEVELS, &metrics);

+        // Set frame number AFTER compute_frame_metrics (which does memset)
+        metrics.frame_number = frame_num;
+
        // Detect scene change using hybrid detector
        if (frame_num > 0) {
            metrics.is_scene_change = detect_scene_change_wavelet(
@@ -9473,6 +9903,29 @@ static int two_pass_first_pass(tav_encoder_t *enc, const char *input_file) {
            metrics.is_scene_change = 0;  // First frame is always start of first GOP
        }

+        // Detect letterbox/pillarbox if enabled
+        if (enc->enable_letterbox_detect) {
+            // Set current_frame_rgb temporarily for detection
+            uint8_t *saved_current = enc->current_frame_rgb;
+            enc->current_frame_rgb = frame_rgb;
+
+            metrics.has_letterbox = detect_letterbox_pillarbox(
+                enc,
+                &metrics.letterbox_top,
+                &metrics.letterbox_right,
+                &metrics.letterbox_bottom,
+                &metrics.letterbox_left
+            );
+
+            enc->current_frame_rgb = saved_current;
+        } else {
+            metrics.has_letterbox = 0;
+            metrics.letterbox_top = 0;
+            metrics.letterbox_right = 0;
+            metrics.letterbox_bottom = 0;
+            metrics.letterbox_left = 0;
+        }
+
        // Store analysis
        if (enc->frame_analyses_count >= enc->frame_analyses_capacity) {
            // Expand array
@@ -9650,6 +10103,7 @@ int main(int argc, char *argv[]) {
        {"tad-audio", no_argument, 0, 1028},
        {"raw-coeffs", no_argument, 0, 1029},
        {"single-pass", no_argument, 0, 1050},  // disable two-pass encoding with wavelet-based scene detection
+        {"no-letterbox-detect", no_argument, 0, 1051},  // disable letterbox/pillarbox detection
        {"help", no_argument, 0, '?'},
        {0, 0, 0, 0}
    };
@@ -9880,6 +10334,10 @@ int main(int argc, char *argv[]) {
                enc->two_pass_mode = 0;
                printf("Two-pass wavelet-based scene change detection disabled\n");
                break;
+            case 1051: // --no-letterbox-detect
+                enc->enable_letterbox_detect = 0;
+                printf("Letterbox/pillarbox detection disabled\n");
+                break;
            case 'a':
                int bitrate = atoi(optarg);
                int valid_bitrate = validate_mp2_bitrate(bitrate);
@@ -10088,6 +10546,10 @@ int main(int argc, char *argv[]) {
        write_all_subtitles_tc(enc, enc->output_fp);
    }

+    // Write all screen masking packets upfront (before first frame)
+    // This must be done AFTER first pass analysis completes, so we'll defer it
+    // to after the two-pass analysis block below
+
    if (enc->output_fps != enc->fps) {
        printf("Frame rate conversion enabled: %d fps output\n", enc->output_fps);
    }
@@ -10131,6 +10593,9 @@ int main(int argc, char *argv[]) {
                   TEMPORAL_GOP_SIZE, ANALYSIS_GOP_MAX_SIZE);
        }

+        // Write all screen masking packets NOW (after first pass analysis)
+        write_all_screen_mask_packets(enc, enc->output_fp);
+
        printf("\n=== Two-Pass Encoding: Second Pass (Encoding) ===\n");
    }