TAV: letterbox detection

2026-03-07 11:51:49 +09:00 · 2025-11-17 03:16:26 +09:00
parent aa7e20695d
commit 8199cbc955
3 changed files with 731 additions and 1 deletions
--- a/video_encoder/encoder_tav.c
+++ b/video_encoder/encoder_tav.c
@@ -59,6 +59,7 @@
 #define TAV_PACKET_SUBTITLE_TC     0x31  // Subtitle packet with timecode (SSF-TC format)
 #define TAV_PACKET_AUDIO_TRACK     0x40  // Separate audio track (full MP2 file)
 #define TAV_PACKET_EXTENDED_HDR    0xEF  // Extended header packet
+#define TAV_PACKET_SCREEN_MASK     0xF2  // Screen masking packet (letterbox/pillarbox)
 #define TAV_PACKET_GOP_SYNC        0xFC  // GOP sync packet (N frames decoded)
 #define TAV_PACKET_TIMECODE        0xFD  // Timecode packet
 #define TAV_PACKET_SYNC_NTSC       0xFE  // NTSC Sync packet
@@ -199,6 +200,13 @@ typedef struct frame_analysis {
    // Detection results
    int is_scene_change;         // Final scene change flag
    double scene_change_score;   // Composite score for debugging
+
+    // Letterbox/pillarbox detection
+    uint16_t letterbox_top;
+    uint16_t letterbox_right;
+    uint16_t letterbox_bottom;
+    uint16_t letterbox_left;
+    int has_letterbox;           // 1 if any masking detected
 } frame_analysis_t;

 // GOP boundary list for two-pass encoding
@@ -1804,6 +1812,7 @@ typedef struct tav_encoder_s {
    int separate_audio_track; // 1 = write entire MP2 file as packet 0x40 after header, 0 = interleave audio (default)
    int pcm8_audio; // 1 = use 8-bit PCM audio (packet 0x21), 0 = use MP2 (default)
    int tad_audio; // 1 = use TAD audio (packet 0x24), 0 = use MP2/PCM8 (default, quality follows quality_level)
+    int enable_letterbox_detect; // 1 = detect and emit letterbox/pillarbox packets (default), 0 = disable

    // Frame buffers - ping-pong implementation
    uint8_t *frame_rgb[2];      // [0] and [1] alternate between current and previous
@@ -2419,6 +2428,7 @@ static tav_encoder_t* create_encoder(void) {
    enc->separate_audio_track = 0;  // Default: interleave audio packets
    enc->pcm8_audio = 0;  // Default: use MP2 audio
    enc->tad_audio = 0;  // Default: use MP2 audio (TAD quality follows quality_level)
+    enc->enable_letterbox_detect = 1;  // Default: enable letterbox/pillarbox detection

    // GOP / temporal DWT settings
    enc->enable_temporal_dwt = 1;  // Mutually exclusive with use_delta_encoding
@@ -8125,6 +8135,415 @@ static void write_timecode_packet(FILE *output, int frame_num, int fps, int is_n
    fwrite(&timecode_ns, sizeof(uint64_t), 1, output);
 }

+// Write screen masking packet (letterbox/pillarbox detection)
+// Packet structure: type(1) + frame_num(4) + top(2) + right(2) + bottom(2) + left(2) = 13 bytes
+static void write_screen_mask_packet(FILE *output, uint32_t frame_num,
+                                      uint16_t top, uint16_t right,
+                                      uint16_t bottom, uint16_t left) {
+    uint8_t packet_type = TAV_PACKET_SCREEN_MASK;
+    fwrite(&packet_type, 1, 1, output);
+    fwrite(&frame_num, sizeof(uint32_t), 1, output);
+    fwrite(&top, sizeof(uint16_t), 1, output);
+    fwrite(&right, sizeof(uint16_t), 1, output);
+    fwrite(&bottom, sizeof(uint16_t), 1, output);
+    fwrite(&left, sizeof(uint16_t), 1, output);
+}
+
+// Calculate Sobel gradient magnitude for a pixel (edge detection)
+static float calculate_sobel_magnitude(const uint8_t *frame_rgb, int width, int height,
+                                         int x, int y) {
+    // Sobel kernels for X and Y gradients
+    // Gx = [[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]]
+    // Gy = [[-1, -2, -1], [0, 0, 0], [1, 2, 1]]
+
+    // Handle boundary conditions with symmetric extension
+    int x_prev = (x > 0) ? (x - 1) : 0;
+    int x_next = (x < width - 1) ? (x + 1) : (width - 1);
+    int y_prev = (y > 0) ? (y - 1) : 0;
+    int y_next = (y < height - 1) ? (y + 1) : (height - 1);
+
+    // Sample 3x3 neighborhood (using luma only for efficiency)
+    float pixels[3][3];
+    for (int dy = 0; dy < 3; dy++) {
+        for (int dx = 0; dx < 3; dx++) {
+            int sample_y = (dy == 0) ? y_prev : ((dy == 1) ? y : y_next);
+            int sample_x = (dx == 0) ? x_prev : ((dx == 1) ? x : x_next);
+            int offset = (sample_y * width + sample_x) * 3;
+
+            // Convert to luma (simple approximation: Y = 0.299R + 0.587G + 0.114B)
+            pixels[dy][dx] = (0.299f * frame_rgb[offset] +
+                              0.587f * frame_rgb[offset + 1] +
+                              0.114f * frame_rgb[offset + 2]);
+        }
+    }
+
+    // Apply Sobel operators
+    float gx = -pixels[0][0] + pixels[0][2] +
+               -2*pixels[1][0] + 2*pixels[1][2] +
+               -pixels[2][0] + pixels[2][2];
+
+    float gy = -pixels[0][0] - 2*pixels[0][1] - pixels[0][2] +
+                pixels[2][0] + 2*pixels[2][1] + pixels[2][2];
+
+    // Calculate magnitude: sqrt(gx^2 + gy^2)
+    return sqrtf(gx * gx + gy * gy);
+}
+
+// Apply symmetric cropping and suppress simultaneous letterbox+pillarbox
+// ALWAYS makes left=right and top=bottom (perfect symmetry)
+// When BOTH letterbox and pillarbox are detected simultaneously, suppress one based on current state
+// Allows letterbox→pillarbox or pillarbox→letterbox transitions
+static void apply_symmetric_cropping(uint16_t *top, uint16_t *right,
+                                       uint16_t *bottom, uint16_t *left,
+                                       int width, int height,
+                                       uint16_t current_top, uint16_t current_bottom,
+                                       uint16_t current_left, uint16_t current_right) {
+    const int MIN_BAR_SIZE_LETTER = (int)(0.04f * height);  // Minimum bar size to consider (ignore <16 pixel bars)
+    const int MIN_BAR_SIZE_PILLAR = (int)(0.04f * width);  // Minimum bar size to consider (ignore <16 pixel bars)
+    const int SIGNIFICANT_THRESHOLD_LETTER = (int)(0.08f * height);  // Bar must be 32+ pixels to be considered significant
+    const int SIGNIFICANT_THRESHOLD_PILLAR = (int)(0.08f * width);  // Bar must be 32+ pixels to be considered significant
+
+    // Filter out small bars (noise/detection errors)
+    if (*top < MIN_BAR_SIZE_LETTER) *top = 0;
+    if (*bottom < MIN_BAR_SIZE_LETTER) *bottom = 0;
+    if (*left < MIN_BAR_SIZE_PILLAR) *left = 0;
+    if (*right < MIN_BAR_SIZE_PILLAR) *right = 0;
+
+    // ALWAYS make letterbox (top/bottom) perfectly symmetric
+    if (*top > 0 || *bottom > 0) {
+        // Use minimum value to avoid over-cropping
+        uint16_t symmetric_value = (*top < *bottom) ? *top : *bottom;
+        *top = symmetric_value+1;
+        *bottom = symmetric_value+1;
+    }
+
+    // ALWAYS make pillarbox (left/right) perfectly symmetric
+    if (*left > 0 || *right > 0) {
+        // Use minimum value to avoid over-cropping
+        uint16_t symmetric_value = (*left < *right) ? *left : *right;
+        *left = symmetric_value+1;
+        *right = symmetric_value+1;
+    }
+
+    // Check if BOTH letterbox and pillarbox are detected simultaneously
+    int new_has_letterbox = (*top >= SIGNIFICANT_THRESHOLD_LETTER || *bottom >= SIGNIFICANT_THRESHOLD_LETTER);
+    int new_has_pillarbox = (*left >= SIGNIFICANT_THRESHOLD_PILLAR || *right >= SIGNIFICANT_THRESHOLD_PILLAR);
+    int current_has_letterbox = (current_top >= SIGNIFICANT_THRESHOLD_LETTER || current_bottom >= SIGNIFICANT_THRESHOLD_LETTER);
+    int current_has_pillarbox = (current_left >= SIGNIFICANT_THRESHOLD_PILLAR || current_right >= SIGNIFICANT_THRESHOLD_PILLAR);
+
+    // Only suppress when BOTH are detected AND one is much smaller (likely false positive)
+    // Completely suppress windowboxing
+    if (new_has_letterbox && new_has_pillarbox) {
+        int letterbox_size = *top + *bottom;
+        int pillarbox_size = *left + *right;
+
+        // to allow windowboxing:
+        // Only suppress if one is less than 25% of total masking
+        // This allows legitimate windowboxing while filtering false positives
+        float letterbox_ratio_geom = (float)letterbox_size / height;
+        float pillarbox_ratio_geom = (float)pillarbox_size / width;
+        float ratio_sum = letterbox_ratio_geom + pillarbox_ratio_geom;
+        float letterbox_ratio = letterbox_ratio_geom / ratio_sum;
+        float pillarbox_ratio = pillarbox_ratio_geom / ratio_sum;
+
+        if (letterbox_ratio < 0.25f) {
+            *top = 0;
+            *bottom = 0;
+        } else if (pillarbox_ratio < 0.25f)
+            *left = 0;
+            *right = 0;
+        }
+        // Otherwise keep both (legitimate windowboxing)
+    }
+}
+
+// Detect letterbox/pillarbox bars in the current frame
+// Returns 1 if masking detected, 0 otherwise
+// Sets top, right, bottom, left to the size of detected bars in pixels
+static int detect_letterbox_pillarbox(tav_encoder_t *enc,
+                                       uint16_t *top, uint16_t *right,
+                                       uint16_t *bottom, uint16_t *left) {
+    if (!enc->current_frame_rgb) return 0;
+
+    const int width = enc->width;
+    const int height = enc->height;
+    const int SAMPLE_RATE_HORZ = 4;  // Sample every 4th pixel for performance
+    const int SAMPLE_RATE_VERT = 4;  // Sample every 4th pixel for performance
+    const float Y_THRESHOLD = 2.0f;  // Y < 2 for dark pixels
+    const float CHROMA_THRESHOLD = 1.0f;  // Co/Cg close to 0 (in ±255 scale)
+    const float EDGE_ACTIVITY_THRESHOLD = 1.0f;  // Mean Sobel magnitude < 1.0
+    const float ROW_COL_BLACK_RATIO = 0.999f;  // 99.9% of sampled pixels must be black
+
+    *top = 0;
+    *bottom = 0;
+    *left = 0;
+    *right = 0;
+
+    // Detect top letterbox
+    for (int y = 0; y < height / 4; y++) {
+        int black_pixel_count = 0;
+        float total_edge_activity = 0.0f;
+        int sampled_pixels = 0;
+
+        for (int x = 0; x < width; x += SAMPLE_RATE_HORZ) {
+            int idx = y * width + x;
+
+            // Use pre-converted YCoCg values (optimization: avoid RGB→YCoCg conversion in loop)
+            float yval = enc->current_frame_y[idx];
+             float co = enc->current_frame_co[idx];
+             float cg = enc->current_frame_cg[idx];
+
+            // Check if pixel is dark and neutral (letterbox bar)
+            if (yval < Y_THRESHOLD &&
+                fabs(co) < CHROMA_THRESHOLD &&
+                fabs(cg) < CHROMA_THRESHOLD) {
+                black_pixel_count++;
+            }
+
+            // Calculate edge activity
+            total_edge_activity += calculate_sobel_magnitude(enc->current_frame_rgb,
+                                                             width, height, x, y);
+            sampled_pixels++;
+        }
+
+        float black_ratio = (float)black_pixel_count / sampled_pixels;
+        float mean_edge_activity = total_edge_activity / sampled_pixels;
+
+        // Row is part of letterbox if mostly black AND low edge activity
+        if (black_ratio > ROW_COL_BLACK_RATIO &&
+            mean_edge_activity < EDGE_ACTIVITY_THRESHOLD) {
+            *top = y + 1;
+        } else {
+            break;  // Found content
+        }
+    }
+
+    // Detect bottom letterbox
+    for (int y = height - 1; y >= height * 3 / 4; y--) {
+        int black_pixel_count = 0;
+        float total_edge_activity = 0.0f;
+        int sampled_pixels = 0;
+
+        for (int x = 0; x < width; x += SAMPLE_RATE_HORZ) {
+            int idx = y * width + x;
+
+            // Use pre-converted YCoCg values (optimization)
+            float yval = enc->current_frame_y[idx];
+             float co = enc->current_frame_co[idx];
+             float cg = enc->current_frame_cg[idx];
+
+            if (yval < Y_THRESHOLD &&
+                fabs(co) < CHROMA_THRESHOLD &&
+                fabs(cg) < CHROMA_THRESHOLD) {
+                black_pixel_count++;
+            }
+
+            total_edge_activity += calculate_sobel_magnitude(enc->current_frame_rgb,
+                                                             width, height, x, y);
+            sampled_pixels++;
+        }
+
+        float black_ratio = (float)black_pixel_count / sampled_pixels;
+        float mean_edge_activity = total_edge_activity / sampled_pixels;
+
+        if (black_ratio > ROW_COL_BLACK_RATIO &&
+            mean_edge_activity < EDGE_ACTIVITY_THRESHOLD) {
+            *bottom = height - y;
+        } else {
+            break;
+        }
+    }
+
+    // Detect left pillarbox
+    for (int x = 0; x < width / 4; x++) {
+        int black_pixel_count = 0;
+        float total_edge_activity = 0.0f;
+        int sampled_pixels = 0;
+
+        for (int y = 0; y < height; y += SAMPLE_RATE_VERT) {
+            int idx = y * width + x;
+
+            // Use pre-converted YCoCg values (optimization)
+            float yval = enc->current_frame_y[idx];
+             float co = enc->current_frame_co[idx];
+             float cg = enc->current_frame_cg[idx];
+
+            if (yval < Y_THRESHOLD &&
+                fabs(co) < CHROMA_THRESHOLD &&
+                fabs(cg) < CHROMA_THRESHOLD) {
+                black_pixel_count++;
+            }
+
+            total_edge_activity += calculate_sobel_magnitude(enc->current_frame_rgb,
+                                                             width, height, x, y);
+            sampled_pixels++;
+        }
+
+        float black_ratio = (float)black_pixel_count / sampled_pixels;
+        float mean_edge_activity = total_edge_activity / sampled_pixels;
+
+        if (black_ratio > ROW_COL_BLACK_RATIO &&
+            mean_edge_activity < EDGE_ACTIVITY_THRESHOLD) {
+            *left = x + 1;
+        } else {
+            break;
+        }
+    }
+
+    // Detect right pillarbox
+    for (int x = width - 1; x >= width * 3 / 4; x--) {
+        int black_pixel_count = 0;
+        float total_edge_activity = 0.0f;
+        int sampled_pixels = 0;
+
+        for (int y = 0; y < height; y += SAMPLE_RATE_VERT) {
+            int idx = y * width + x;
+
+            // Use pre-converted YCoCg values (optimization)
+            float yval = enc->current_frame_y[idx];
+             float co = enc->current_frame_co[idx];
+             float cg = enc->current_frame_cg[idx];
+
+            if (yval < Y_THRESHOLD &&
+                fabs(co) < CHROMA_THRESHOLD &&
+                fabs(cg) < CHROMA_THRESHOLD) {
+                black_pixel_count++;
+            }
+
+            total_edge_activity += calculate_sobel_magnitude(enc->current_frame_rgb,
+                                                             width, height, x, y);
+            sampled_pixels++;
+        }
+
+        float black_ratio = (float)black_pixel_count / sampled_pixels;
+        float mean_edge_activity = total_edge_activity / sampled_pixels;
+
+        if (black_ratio > ROW_COL_BLACK_RATIO &&
+            mean_edge_activity < EDGE_ACTIVITY_THRESHOLD) {
+            *right = width - x;
+        } else {
+            break;
+        }
+    }
+
+    // Apply symmetric cropping preference and minimum bar size filtering
+    // Note: During detection phase, no current state available (use 0,0,0,0)
+    apply_symmetric_cropping(top, right, bottom, left, width, height, 0, 0, 0, 0);
+
+    // Return 1 if any masking was detected
+    return (*top > 0 || *bottom > 0 || *left > 0 || *right > 0);
+}
+
+// Refine geometry change detection - find exact frame where change occurred
+// Uses linear scan to find first frame with new geometry
+static int refine_geometry_change(tav_encoder_t *enc, int start_frame, int end_frame,
+                                 uint16_t old_top, uint16_t old_right,
+                                 uint16_t old_bottom, uint16_t old_left) {
+    #define GEOMETRY_TOLERANCE 4  // ±4 pixels tolerance
+
+    // Linear scan from start to find first frame with new geometry
+    for (int i = start_frame; i <= end_frame && i < enc->frame_analyses_count; i++) {
+        frame_analysis_t *m = &enc->frame_analyses[i];
+
+        // Check if this frame has different geometry (beyond tolerance)
+        if (abs((int)m->letterbox_top - (int)old_top) > GEOMETRY_TOLERANCE ||
+            abs((int)m->letterbox_right - (int)old_right) > GEOMETRY_TOLERANCE ||
+            abs((int)m->letterbox_bottom - (int)old_bottom) > GEOMETRY_TOLERANCE ||
+            abs((int)m->letterbox_left - (int)old_left) > GEOMETRY_TOLERANCE) {
+            return i;  // Found the change point
+        }
+    }
+
+    return end_frame;  // No change found, use end frame
+
+    #undef GEOMETRY_TOLERANCE
+}
+
+// Write all screen masking packets before first frame (similar to SSF-TC subtitles)
+// Uses two-stage approach: coarse detection (8-frame stride) + frame-exact refinement
+static void write_all_screen_mask_packets(tav_encoder_t *enc, FILE *output) {
+    if (!enc->enable_letterbox_detect || !enc->two_pass_mode) {
+        return;  // Letterbox detection requires two-pass mode
+    }
+
+    if (!enc->frame_analyses || enc->frame_analyses_count == 0) {
+        return;  // No analysis data
+    }
+
+#define COARSE_STRIDE 16      // Sample every 8 frames for coarse detection
+#define CHANGE_THRESHOLD 16  // Require 16+ pixel change to consider geometry change
+#define SKIP_INITIAL_FRAMES 60  // Skip first N frames (often black/fade-in)
+
+    // Track current geometry
+    uint16_t current_top = 0, current_right = 0, current_bottom = 0, current_left = 0;
+    int packets_written = 0;
+    int last_checked_frame = SKIP_INITIAL_FRAMES;
+
+    // Stage 1: Coarse scan every COARSE_STRIDE frames to detect geometry changes
+    for (int i = SKIP_INITIAL_FRAMES; i < enc->frame_analyses_count; i += COARSE_STRIDE) {
+        frame_analysis_t *metrics = &enc->frame_analyses[i];
+
+        // Check if geometry changed significantly
+        int is_first = (packets_written == 0);
+        int is_significant_change =
+            abs((int)metrics->letterbox_top - (int)current_top) >= CHANGE_THRESHOLD ||
+            abs((int)metrics->letterbox_right - (int)current_right) >= CHANGE_THRESHOLD ||
+            abs((int)metrics->letterbox_bottom - (int)current_bottom) >= CHANGE_THRESHOLD ||
+            abs((int)metrics->letterbox_left - (int)current_left) >= CHANGE_THRESHOLD;
+
+        if (is_first || is_significant_change) {
+            // Stage 2: Refine - find exact frame where change occurred
+            int change_frame;
+            if (is_first) {
+                change_frame = 0;  // First packet always at frame 0
+            } else {
+                // Search backwards from i to last_checked_frame to find exact change point
+                change_frame = refine_geometry_change(enc, last_checked_frame, i,
+                                                     current_top, current_right,
+                                                     current_bottom, current_left);
+            }
+
+            // Get geometry from the change frame
+            frame_analysis_t *change_metrics = &enc->frame_analyses[change_frame];
+
+            // Apply symmetric cropping to final geometry (with current state for context)
+            uint16_t final_top = change_metrics->letterbox_top;
+            uint16_t final_right = change_metrics->letterbox_right;
+            uint16_t final_bottom = change_metrics->letterbox_bottom;
+            uint16_t final_left = change_metrics->letterbox_left;
+            apply_symmetric_cropping(&final_top, &final_right, &final_bottom, &final_left,
+                                    enc->width, enc->height,
+                                    current_top, current_bottom, current_left, current_right);
+
+            // Emit packet
+            write_screen_mask_packet(output, change_frame,
+                                    final_top, final_right, final_bottom, final_left);
+
+            // Update current geometry
+            current_top = final_top;
+            current_right = final_right;
+            current_bottom = final_bottom;
+            current_left = final_left;
+            packets_written++;
+
+            if (enc->verbose) {
+                printf("  Frame %d: Screen mask t=%u r=%u b=%u l=%u (frame-exact detection)\n",
+                       change_frame, final_top, final_right, final_bottom, final_left);
+            }
+        }
+
+        last_checked_frame = i;
+    }
+
+    if (packets_written > 0) {
+        printf("Wrote %d screen masking packet(s) (frame-exact detection)\n", packets_written);
+    }
+
+#undef COARSE_STRIDE
+#undef CHANGE_THRESHOLD
+#undef SKIP_INITIAL_FRAMES
+}
+
 // Write extended header packet with metadata
 // Returns the file offset where ENDT value is written (for later update)
 static long write_extended_header(tav_encoder_t *enc) {
@@ -8297,6 +8716,15 @@ static int write_tad_packet_samples(tav_encoder_t *enc, FILE *output, int sample
    if (!enc->pcm_file || enc->audio_remaining <= 0 || samples_to_read <= 0) {
        return 0;
    }
+
+    // Check if we have enough audio for a minimum chunk
+    // Don't encode if less than minimum - avoids encoding mostly padding/zeros
+    size_t min_bytes_needed = TAD32_MIN_CHUNK_SIZE * 2 * sizeof(float);
+    if (enc->audio_remaining < min_bytes_needed) {
+        enc->audio_remaining = 0;  // Mark audio as exhausted
+        return 0;
+    }
+
    size_t bytes_to_read = samples_to_read * 2 * sizeof(float);  // Stereo Float32LE

    // Don't read more than what's available
@@ -9457,9 +9885,11 @@ static int two_pass_first_pass(tav_encoder_t *enc, const char *input_file) {
        // Compute metrics

        frame_analysis_t metrics;
-        metrics.frame_number = frame_num;
        compute_frame_metrics(enc, gray, prev_dwt, sub_width, sub_height, ANALYSIS_DWT_LEVELS, &metrics);

+        // Set frame number AFTER compute_frame_metrics (which does memset)
+        metrics.frame_number = frame_num;
+
        // Detect scene change using hybrid detector
        if (frame_num > 0) {
            metrics.is_scene_change = detect_scene_change_wavelet(
@@ -9473,6 +9903,29 @@ static int two_pass_first_pass(tav_encoder_t *enc, const char *input_file) {
            metrics.is_scene_change = 0;  // First frame is always start of first GOP
        }

+        // Detect letterbox/pillarbox if enabled
+        if (enc->enable_letterbox_detect) {
+            // Set current_frame_rgb temporarily for detection
+            uint8_t *saved_current = enc->current_frame_rgb;
+            enc->current_frame_rgb = frame_rgb;
+
+            metrics.has_letterbox = detect_letterbox_pillarbox(
+                enc,
+                &metrics.letterbox_top,
+                &metrics.letterbox_right,
+                &metrics.letterbox_bottom,
+                &metrics.letterbox_left
+            );
+
+            enc->current_frame_rgb = saved_current;
+        } else {
+            metrics.has_letterbox = 0;
+            metrics.letterbox_top = 0;
+            metrics.letterbox_right = 0;
+            metrics.letterbox_bottom = 0;
+            metrics.letterbox_left = 0;
+        }
+
        // Store analysis
        if (enc->frame_analyses_count >= enc->frame_analyses_capacity) {
            // Expand array
@@ -9650,6 +10103,7 @@ int main(int argc, char *argv[]) {
        {"tad-audio", no_argument, 0, 1028},
        {"raw-coeffs", no_argument, 0, 1029},
        {"single-pass", no_argument, 0, 1050},  // disable two-pass encoding with wavelet-based scene detection
+        {"no-letterbox-detect", no_argument, 0, 1051},  // disable letterbox/pillarbox detection
        {"help", no_argument, 0, '?'},
        {0, 0, 0, 0}
    };
@@ -9880,6 +10334,10 @@ int main(int argc, char *argv[]) {
                enc->two_pass_mode = 0;
                printf("Two-pass wavelet-based scene change detection disabled\n");
                break;
+            case 1051: // --no-letterbox-detect
+                enc->enable_letterbox_detect = 0;
+                printf("Letterbox/pillarbox detection disabled\n");
+                break;
            case 'a':
                int bitrate = atoi(optarg);
                int valid_bitrate = validate_mp2_bitrate(bitrate);
@@ -10088,6 +10546,10 @@ int main(int argc, char *argv[]) {
        write_all_subtitles_tc(enc, enc->output_fp);
    }

+    // Write all screen masking packets upfront (before first frame)
+    // This must be done AFTER first pass analysis completes, so we'll defer it
+    // to after the two-pass analysis block below
+
    if (enc->output_fps != enc->fps) {
        printf("Frame rate conversion enabled: %d fps output\n", enc->output_fps);
    }
@@ -10131,6 +10593,9 @@ int main(int argc, char *argv[]) {
                   TEMPORAL_GOP_SIZE, ANALYSIS_GOP_MAX_SIZE);
        }

+        // Write all screen masking packets NOW (after first pass analysis)
+        write_all_screen_mask_packets(enc, enc->output_fp);
+
        printf("\n=== Two-Pass Encoding: Second Pass (Encoding) ===\n");
    }