TAV: letterbox detection

This commit is contained in:
minjaesong
2025-11-17 03:16:26 +09:00
parent aa7e20695d
commit 8199cbc955
3 changed files with 731 additions and 1 deletions

View File

@@ -39,6 +39,7 @@ const TAV_PACKET_SUBTITLE = 0x30 // Legacy SSF (frame-locked)
const TAV_PACKET_SUBTITLE_TC = 0x31 // SSF-TC (timecode-based)
const TAV_PACKET_AUDIO_BUNDLED = 0x40 // Entire MP2 audio file in single packet
const TAV_PACKET_EXTENDED_HDR = 0xEF
const TAV_PACKET_SCREEN_MASK = 0xF2 // Screen masking (letterbox/pillarbox)
const TAV_PACKET_GOP_SYNC = 0xFC // GOP sync (N frames decoded from GOP block)
const TAV_PACKET_TIMECODE = 0xFD
const TAV_PACKET_SYNC_NTSC = 0xFE
@@ -72,6 +73,13 @@ let currentTimecodeNs = 0 // Current playback timecode (updated every frame)
let baseTimecodeNs = 0 // Base timecode from most recent TIMECODE packet
let baseTimecodeFrameCount = 0 // Frame count when base timecode was set
// Screen masking (letterbox/pillarbox) state
// screenMaskEntries collects one record per SCREEN_MASK packet read from the stream.
// updateScreenMask() copies the record that applies to the current frame into the
// four screenMask* values below, which fillMaskedRegions() then reads.
let screenMaskEntries = [] // Array of {frameNum, top, right, bottom, left}
let screenMaskTop = 0 // Active mask: bar size measured from the top edge
let screenMaskRight = 0 // Active mask: bar size measured from the right edge
let screenMaskBottom = 0 // Active mask: bar size measured from the bottom edge
let screenMaskLeft = 0 // Active mask: bar size measured from the left edge
// Parse command line options
let interactive = false
let filmGrainLevel = null
@@ -739,6 +747,77 @@ function scanForwardToIframe(targetFrame, currentPos) {
}
// Function to try reading next TAV file header at current position
// Select the screen mask that is in effect at the given frame number.
// screenMaskEntries is expected to be ordered by ascending frameNum, so the
// newest entry whose frameNum does not exceed currentFrameNum wins.
// When the list is empty, or every entry starts after currentFrameNum, the
// previously active mask values are left untouched.
function updateScreenMask(currentFrameNum) {
    let idx = screenMaskEntries.length

    // Walk from the newest entry towards the oldest; the first hit is the answer
    while (idx-- > 0) {
        const entry = screenMaskEntries[idx]
        if (!(entry.frameNum <= currentFrameNum)) {
            continue // this entry only takes effect later
        }

        // Make this entry the active mask
        screenMaskTop = entry.top
        screenMaskRight = entry.right
        screenMaskBottom = entry.bottom
        screenMaskLeft = entry.left
        return
    }
}
// Paint the letterbox/pillarbox bars of the active screen mask over the frame.
// NOTE: the unconditional return at the top switches this function off, so it
// currently does nothing; the drawing code below it is kept but never runs.
function fillMaskedRegions() {
    return

    // console.log(`ScrMask: ${screenMaskTop}, ${screenMaskRight}, ${screenMaskBottom}, ${screenMaskLeft}`)
    const maskSizes = [screenMaskTop, screenMaskRight, screenMaskBottom, screenMaskLeft]
    if (maskSizes.every((size) => size === 0)) {
        return // No masking
    }

    const frameWidth = header.width
    const frameHeight = header.height
    const fillRG = 0xF0
    const fillBA = 0xFF // 0xF0FF (magenta) for test

    // Plot one pixel into both colour planes
    const paint = (x, y) => {
        graphics.plotPixel(x, y, fillRG)
        graphics.plotPixel2(x, y, fillBA)
    }

    // Top letterbox bar: rows [0, top), clipped to the frame
    const topEnd = Math.min(screenMaskTop, frameHeight)
    for (let y = 0; y < topEnd; y++) {
        for (let x = 0; x < frameWidth; x++) {
            paint(x, y)
        }
    }

    // Bottom letterbox bar: last `bottom` rows, clipped to the frame
    const bottomStart = Math.max(frameHeight - screenMaskBottom, 0)
    for (let y = bottomStart; y < frameHeight; y++) {
        for (let x = 0; x < frameWidth; x++) {
            paint(x, y)
        }
    }

    // Left pillarbox bar: columns [0, left), clipped to the frame
    const leftEnd = Math.min(screenMaskLeft, frameWidth)
    for (let y = 0; y < frameHeight; y++) {
        for (let x = 0; x < leftEnd; x++) {
            paint(x, y)
        }
    }

    // Right pillarbox bar: last `right` columns, clipped to the frame
    const rightStart = Math.max(frameWidth - screenMaskRight, 0)
    for (let y = 0; y < frameHeight; y++) {
        for (let x = rightStart; x < frameWidth; x++) {
            paint(x, y)
        }
    }
}
function tryReadNextTAVHeader() {
// Save current position
let currentPos = seqread.getReadCount()
@@ -1116,6 +1195,9 @@ try {
// Do nothing - skip to next packet
}
else if (packetType === TAV_PACKET_IFRAME || packetType === TAV_PACKET_PFRAME) {
// Update active screen mask for this frame (Phase 1: just tracking, not applying)
updateScreenMask(frameCount)
// Record I-frame position for seeking
if (packetType === TAV_PACKET_IFRAME) {
iframePositions.push({offset: packetOffset, frameNum: frameCount})
@@ -1588,6 +1670,28 @@ try {
}
}
}
else if (packetType === TAV_PACKET_SCREEN_MASK) {
// Screen masking packet (letterbox/pillarbox detection)
// Format: frame_num(4) + top(2) + right(2) + bottom(2) + left(2) = 12 bytes
let frameNum = seqread.readInt() // uint32 frame number
let top = seqread.readOneByte() | (seqread.readOneByte() << 8)
let right = seqread.readOneByte() | (seqread.readOneByte() << 8)
let bottom = seqread.readOneByte() | (seqread.readOneByte() << 8)
let left = seqread.readOneByte() | (seqread.readOneByte() << 8)
// Store in entries array
screenMaskEntries.push({
frameNum: frameNum,
top: top,
right: right,
bottom: bottom,
left: left
})
if (interactive) {
serial.println(`[SCREEN_MASK] frame=${frameNum} top=${top} right=${right} bottom=${bottom} left=${left}`)
}
}
else if (packetType === TAV_PACKET_TIMECODE) {
// Timecode packet - time since stream start in nanoseconds
let timecodeLow = seqread.readInt()
@@ -1789,6 +1893,12 @@ try {
graphics.uploadVideoBufferFrameToFramebuffer(currentGopFrameIndex, header.width, header.height, trueFrameCount, bufferOffset)
uploadTime = (sys.nanoTime() - uploadStart) / 1000000.0
// Update active screen mask for this GOP frame
updateScreenMask(frameCount)
// Fill masked regions with black (letterbox/pillarbox bars)
fillMaskedRegions()
if (interactive && currentGopFrameIndex === 0) {
// console.log(`[GOP] Playing GOP: ${currentGopSize} frames from slot ${currentGopBufferSlot}`)
}

View File

@@ -32,7 +32,9 @@
#define TAV_PACKET_AUDIO_TAD 0x24 // TAD audio - SUPPORTED (decode to PCMu8)
#define TAV_PACKET_AUDIO_TRACK 0x40 // Bundled audio track - SUPPORTED (passthrough)
#define TAV_PACKET_SUBTITLE 0x30 // Subtitle - SKIPPED
#define TAV_PACKET_SUBTITLE_TC 0x31 // Subtitle - SKIPPED
#define TAV_PACKET_EXTENDED_HDR 0xEF // Extended header - SKIPPED
#define TAV_PACKET_SCREEN_MASK 0xF2 // Screen masking (letterbox/pillarbox) - PARSED
#define TAV_PACKET_GOP_SYNC 0xFC // GOP sync packet - SKIPPED
#define TAV_PACKET_TIMECODE 0xFD // Timecode - SKIPPED
#define TAV_PACKET_SYNC_NTSC 0xFE // NTSC sync - SKIPPED
@@ -1586,6 +1588,15 @@ static void write_wav_header(FILE *fp, uint32_t sample_rate, uint16_t channels,
// Decoder State Structure
//=============================================================================
// Screen masking entry (letterbox/pillarbox geometry change)
// One entry is stored per SCREEN_MASK packet. update_screen_mask() activates the
// last entry whose frame_num is <= the frame being output.
typedef struct {
uint32_t frame_num; // First frame this geometry applies to
uint16_t top;       // Bar size measured from the top edge (rows)
uint16_t right;     // Bar size measured from the right edge (columns)
uint16_t bottom;    // Bar size measured from the bottom edge (rows)
uint16_t left;      // Bar size measured from the left edge (columns)
} screen_mask_entry_t;
typedef struct {
FILE *input_fp;
tav_header_t header;
@@ -1601,6 +1612,16 @@ typedef struct {
int frame_size;
int is_monoblock; // True if version 3-6 (single tile mode)
// Screen masking (letterbox/pillarbox) - array of geometry changes
screen_mask_entry_t *screen_masks;
int screen_mask_count;
int screen_mask_capacity;
// Current active mask
uint16_t screen_mask_top;
uint16_t screen_mask_right;
uint16_t screen_mask_bottom;
uint16_t screen_mask_left;
// FFmpeg pipe for video only (audio from file)
FILE *video_pipe;
pid_t ffmpeg_pid;
@@ -1669,6 +1690,11 @@ static int extract_audio_to_wav(const char *input_file, const char *wav_file, in
continue;
}
if (packet_type == TAV_PACKET_SCREEN_MASK) {
fseek(input_fp, 12, SEEK_CUR); // Skip frame_num(4) + top(2) + right(2) + bottom(2) + left(2)
continue;
}
if (packet_type == TAV_PACKET_GOP_UNIFIED) {
uint8_t gop_size;
uint32_t compressed_size;
@@ -1948,10 +1974,83 @@ static void tav_decoder_free(tav_decoder_t *decoder) {
free(decoder->reference_ycocg_y);
free(decoder->reference_ycocg_co);
free(decoder->reference_ycocg_cg);
free(decoder->screen_masks);
free(decoder->audio_file_path);
free(decoder);
}
//=============================================================================
// Screen Mask Management
//=============================================================================
// Overwrite columns [x0, x1) of rows [y0, y1) of an interleaved RGB24 frame
// with black. The caller guarantees the rectangle lies inside the frame.
static void fill_masked_rect_black(uint8_t *frame_rgb, int width,
                                   int x0, int y0, int x1, int y1) {
    for (int y = y0; y < y1; y++) {
        uint8_t *px = frame_rgb + ((size_t)y * (size_t)width + (size_t)x0) * 3;
        for (int n = (x1 - x0) * 3; n > 0; n--) {
            *px++ = 0;
        }
    }
}

// Fill masked regions (letterbox/pillarbox bars) with black
//
// frame_rgb : interleaved RGB frame, 3 bytes per pixel, width*height pixels
// top/bottom: letterbox bar heights in rows, measured from the respective edge
// left/right: pillarbox bar widths in columns, measured from the respective edge
//
// Bar sizes larger than the frame are clamped to the frame. A NULL buffer or a
// non-positive dimension makes the call a no-op.
static void fill_masked_regions(uint8_t *frame_rgb, int width, int height,
                                uint16_t top, uint16_t right, uint16_t bottom, uint16_t left) {
    if (!frame_rgb || width <= 0 || height <= 0) {
        return;
    }

    // Clamp every bar to the frame once, so the fill loops need no per-pixel checks
    const int top_rows     = (top < height) ? (int)top : height;
    const int bottom_start = (bottom < height) ? height - (int)bottom : 0;
    const int left_cols    = (left < width) ? (int)left : width;
    const int right_start  = (right < width) ? width - (int)right : 0;

    // Top letterbox bar
    fill_masked_rect_black(frame_rgb, width, 0, 0, width, top_rows);
    // Bottom letterbox bar
    fill_masked_rect_black(frame_rgb, width, 0, bottom_start, width, height);
    // Left pillarbox bar
    fill_masked_rect_black(frame_rgb, width, 0, 0, left_cols, height);
    // Right pillarbox bar
    fill_masked_rect_black(frame_rgb, width, right_start, 0, width, height);
}
// Activate the screen mask that applies to current_frame_num.
// The table is expected to be ordered by ascending frame_num, so the newest
// entry whose frame_num does not exceed current_frame_num is chosen. If the
// table is missing/empty, or every entry starts later, the active mask in the
// decoder is left as it was.
static void update_screen_mask(tav_decoder_t *decoder, uint32_t current_frame_num) {
    if (decoder->screen_masks == NULL || decoder->screen_mask_count == 0) {
        return; // Nothing recorded yet
    }

    // Newest-first walk: the first entry that has already started wins
    int idx = decoder->screen_mask_count;
    while (--idx >= 0) {
        const screen_mask_entry_t *candidate = &decoder->screen_masks[idx];
        if (candidate->frame_num > current_frame_num) {
            continue; // Takes effect on a later frame
        }

        decoder->screen_mask_top = candidate->top;
        decoder->screen_mask_right = candidate->right;
        decoder->screen_mask_bottom = candidate->bottom;
        decoder->screen_mask_left = candidate->left;
        return;
    }
}
//=============================================================================
// Frame Decoding Logic
//=============================================================================
@@ -2486,6 +2585,50 @@ int main(int argc, char *argv[]) {
continue;
}
// Handle screen masking packets (letterbox/pillarbox detection)
// Format: frame_num(4) + top(2) + right(2) + bottom(2) + left(2) = 12 bytes
// Each field is read with fread() directly into a host integer.
if (packet_type == TAV_PACKET_SCREEN_MASK) {
    uint32_t frame_num;
    uint16_t top, right, bottom, left;
    if (fread(&frame_num, 4, 1, decoder->input_fp) != 1 ||
        fread(&top, 2, 1, decoder->input_fp) != 1 ||
        fread(&right, 2, 1, decoder->input_fp) != 1 ||
        fread(&bottom, 2, 1, decoder->input_fp) != 1 ||
        fread(&left, 2, 1, decoder->input_fp) != 1) {
        fprintf(stderr, "Error: Failed to read screen mask packet\n");
        result = -1;
        break;
    }

    // First packet: start from an empty table
    if (decoder->screen_masks == NULL) {
        decoder->screen_mask_capacity = 0;
        decoder->screen_mask_count = 0;
    }

    // Allocate (16 entries) or double the table when it is full.
    // realloc() goes through a temporary so that a failure neither leaks the
    // existing table nor leaves a NULL pointer behind for the store below.
    if (decoder->screen_mask_count >= decoder->screen_mask_capacity) {
        int new_capacity = (decoder->screen_mask_capacity > 0)
                               ? decoder->screen_mask_capacity * 2
                               : 16;
        screen_mask_entry_t *grown = realloc(decoder->screen_masks,
                                             (size_t)new_capacity * sizeof(screen_mask_entry_t));
        if (grown == NULL) {
            fprintf(stderr, "Error: Failed to allocate memory for screen mask entries\n");
            result = -1;
            break;
        }
        decoder->screen_masks = grown;
        decoder->screen_mask_capacity = new_capacity;
    }

    // Store entry
    screen_mask_entry_t *entry = &decoder->screen_masks[decoder->screen_mask_count++];
    entry->frame_num = frame_num;
    entry->top = top;
    entry->right = right;
    entry->bottom = bottom;
    entry->left = left;

    if (verbose) {
        fprintf(stderr, "Packet %d: SCREEN_MASK (0x%02X) - frame=%u top=%u right=%u bottom=%u left=%u\n",
                total_packets, packet_type, frame_num, top, right, bottom, left);
    }
    continue;
}
// Handle GOP unified packets (custom format: 1-byte gop_size + 4-byte compressed_size)
if (packet_type == TAV_PACKET_GOP_UNIFIED) {
uint8_t gop_size;
@@ -2738,6 +2881,14 @@ int main(int argc, char *argv[]) {
frame_rgb[i * 3 + 2] = b;
}
// Update active screen mask for this GOP frame
update_screen_mask(decoder, decoder->frame_count + t);
// Fill masked regions with black (letterbox/pillarbox bars)
fill_masked_regions(frame_rgb, decoder->header.width, decoder->header.height,
decoder->screen_mask_top, decoder->screen_mask_right,
decoder->screen_mask_bottom, decoder->screen_mask_left);
// Write frame to FFmpeg video pipe
const size_t bytes_to_write = decoder->frame_size * 3;
@@ -2869,6 +3020,9 @@ int main(int argc, char *argv[]) {
switch (packet_type) {
case TAV_PACKET_IFRAME:
case TAV_PACKET_PFRAME:
// Update active screen mask for this frame (Phase 1: just tracking, not applying)
update_screen_mask(decoder, decoder->frame_count);
iframe_count++;
if (verbose && iframe_count <= 5) {
fprintf(stderr, "Processing %s (packet %d, size %u bytes)...\n",
@@ -2902,6 +3056,7 @@ int main(int argc, char *argv[]) {
break;
case TAV_PACKET_SUBTITLE:
case TAV_PACKET_SUBTITLE_TC:
// Skip subtitle packets
fseek(decoder->input_fp, packet_size, SEEK_CUR);
break;

View File

@@ -59,6 +59,7 @@
#define TAV_PACKET_SUBTITLE_TC 0x31 // Subtitle packet with timecode (SSF-TC format)
#define TAV_PACKET_AUDIO_TRACK 0x40 // Separate audio track (full MP2 file)
#define TAV_PACKET_EXTENDED_HDR 0xEF // Extended header packet
#define TAV_PACKET_SCREEN_MASK 0xF2 // Screen masking packet (letterbox/pillarbox)
#define TAV_PACKET_GOP_SYNC 0xFC // GOP sync packet (N frames decoded)
#define TAV_PACKET_TIMECODE 0xFD // Timecode packet
#define TAV_PACKET_SYNC_NTSC 0xFE // NTSC Sync packet
@@ -199,6 +200,13 @@ typedef struct frame_analysis {
// Detection results
int is_scene_change; // Final scene change flag
double scene_change_score; // Composite score for debugging
// Letterbox/pillarbox detection
uint16_t letterbox_top;
uint16_t letterbox_right;
uint16_t letterbox_bottom;
uint16_t letterbox_left;
int has_letterbox; // 1 if any masking detected
} frame_analysis_t;
// GOP boundary list for two-pass encoding
@@ -1804,6 +1812,7 @@ typedef struct tav_encoder_s {
int separate_audio_track; // 1 = write entire MP2 file as packet 0x40 after header, 0 = interleave audio (default)
int pcm8_audio; // 1 = use 8-bit PCM audio (packet 0x21), 0 = use MP2 (default)
int tad_audio; // 1 = use TAD audio (packet 0x24), 0 = use MP2/PCM8 (default, quality follows quality_level)
int enable_letterbox_detect; // 1 = detect and emit letterbox/pillarbox packets (default), 0 = disable
// Frame buffers - ping-pong implementation
uint8_t *frame_rgb[2]; // [0] and [1] alternate between current and previous
@@ -2419,6 +2428,7 @@ static tav_encoder_t* create_encoder(void) {
enc->separate_audio_track = 0; // Default: interleave audio packets
enc->pcm8_audio = 0; // Default: use MP2 audio
enc->tad_audio = 0; // Default: use MP2 audio (TAD quality follows quality_level)
enc->enable_letterbox_detect = 1; // Default: enable letterbox/pillarbox detection
// GOP / temporal DWT settings
enc->enable_temporal_dwt = 1; // Mutually exclusive with use_delta_encoding
@@ -8125,6 +8135,415 @@ static void write_timecode_packet(FILE *output, int frame_num, int fps, int is_n
fwrite(&timecode_ns, sizeof(uint64_t), 1, output);
}
// Emit one screen masking packet (letterbox/pillarbox geometry) to `output`.
// Bytes written, in order: packet type (1), frame_num (4), then top, right,
// bottom, left (2 each) - 13 bytes in total. Write errors are not reported.
static void write_screen_mask_packet(FILE *output, uint32_t frame_num,
                                     uint16_t top, uint16_t right,
                                     uint16_t bottom, uint16_t left) {
    const uint8_t tag = TAV_PACKET_SCREEN_MASK;
    // The four bar sizes are laid out exactly in packet order
    const uint16_t bars[4] = { top, right, bottom, left };

    fwrite(&tag, sizeof tag, 1, output);
    fwrite(&frame_num, sizeof frame_num, 1, output);
    fwrite(bars, sizeof bars[0], 4, output);
}
// Calculate Sobel gradient magnitude for a pixel (edge detection)
static float calculate_sobel_magnitude(const uint8_t *frame_rgb, int width, int height,
int x, int y) {
// Sobel kernels for X and Y gradients
// Gx = [[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]]
// Gy = [[-1, -2, -1], [0, 0, 0], [1, 2, 1]]
// Handle boundary conditions with symmetric extension
int x_prev = (x > 0) ? (x - 1) : 0;
int x_next = (x < width - 1) ? (x + 1) : (width - 1);
int y_prev = (y > 0) ? (y - 1) : 0;
int y_next = (y < height - 1) ? (y + 1) : (height - 1);
// Sample 3x3 neighborhood (using luma only for efficiency)
float pixels[3][3];
for (int dy = 0; dy < 3; dy++) {
for (int dx = 0; dx < 3; dx++) {
int sample_y = (dy == 0) ? y_prev : ((dy == 1) ? y : y_next);
int sample_x = (dx == 0) ? x_prev : ((dx == 1) ? x : x_next);
int offset = (sample_y * width + sample_x) * 3;
// Convert to luma (simple approximation: Y = 0.299R + 0.587G + 0.114B)
pixels[dy][dx] = (0.299f * frame_rgb[offset] +
0.587f * frame_rgb[offset + 1] +
0.114f * frame_rgb[offset + 2]);
}
}
// Apply Sobel operators
float gx = -pixels[0][0] + pixels[0][2] +
-2*pixels[1][0] + 2*pixels[1][2] +
-pixels[2][0] + pixels[2][2];
float gy = -pixels[0][0] - 2*pixels[0][1] - pixels[0][2] +
pixels[2][0] + 2*pixels[2][1] + pixels[2][2];
// Calculate magnitude: sqrt(gx^2 + gy^2)
return sqrtf(gx * gx + gy * gy);
}
// Apply symmetric cropping and suppress false-positive letterbox+pillarbox combinations
//
// In place, on *top/*right/*bottom/*left (bar sizes in pixels):
//   1. Bars smaller than 4% of the frame height (letterbox) or width (pillarbox)
//      are treated as noise and zeroed.
//   2. Each pair is made symmetric: top == bottom and left == right, using the
//      smaller of the two detected values plus one pixel.
//   3. If BOTH a letterbox and a pillarbox of at least 8% of the frame are present,
//      the one contributing less than 25% of the combined (size-normalised)
//      masking is dropped as a likely false positive; otherwise both are kept
//      (legitimate windowboxing).
//
// current_top/current_bottom/current_left/current_right describe the geometry
// currently in effect. They are accepted for interface compatibility but do not
// influence the result at present.
static void apply_symmetric_cropping(uint16_t *top, uint16_t *right,
                                     uint16_t *bottom, uint16_t *left,
                                     int width, int height,
                                     uint16_t current_top, uint16_t current_bottom,
                                     uint16_t current_left, uint16_t current_right) {
    (void)current_top;
    (void)current_bottom;
    (void)current_left;
    (void)current_right;

    const int MIN_BAR_SIZE_LETTER = (int)(0.04f * height);          // Ignore letterbox bars below 4% of frame height
    const int MIN_BAR_SIZE_PILLAR = (int)(0.04f * width);           // Ignore pillarbox bars below 4% of frame width
    const int SIGNIFICANT_THRESHOLD_LETTER = (int)(0.08f * height); // Letterbox counts as significant from 8% of height
    const int SIGNIFICANT_THRESHOLD_PILLAR = (int)(0.08f * width);  // Pillarbox counts as significant from 8% of width

    // Filter out small bars (noise/detection errors)
    if (*top < MIN_BAR_SIZE_LETTER) *top = 0;
    if (*bottom < MIN_BAR_SIZE_LETTER) *bottom = 0;
    if (*left < MIN_BAR_SIZE_PILLAR) *left = 0;
    if (*right < MIN_BAR_SIZE_PILLAR) *right = 0;

    // ALWAYS make letterbox (top/bottom) perfectly symmetric
    if (*top > 0 || *bottom > 0) {
        // Use minimum value to avoid over-cropping
        uint16_t symmetric_value = (*top < *bottom) ? *top : *bottom;
        *top = symmetric_value + 1;
        *bottom = symmetric_value + 1;
    }

    // ALWAYS make pillarbox (left/right) perfectly symmetric
    if (*left > 0 || *right > 0) {
        // Use minimum value to avoid over-cropping
        uint16_t symmetric_value = (*left < *right) ? *left : *right;
        *left = symmetric_value + 1;
        *right = symmetric_value + 1;
    }

    // Check if BOTH letterbox and pillarbox are detected simultaneously
    int new_has_letterbox = (*top >= SIGNIFICANT_THRESHOLD_LETTER || *bottom >= SIGNIFICANT_THRESHOLD_LETTER);
    int new_has_pillarbox = (*left >= SIGNIFICANT_THRESHOLD_PILLAR || *right >= SIGNIFICANT_THRESHOLD_PILLAR);

    // Only suppress when BOTH are detected AND one is much smaller (likely false positive)
    if (new_has_letterbox && new_has_pillarbox) {
        int letterbox_size = *top + *bottom;
        int pillarbox_size = *left + *right;

        // Normalise each bar pair by the frame dimension it eats into, then
        // express it as a share of the total masking
        float letterbox_ratio_geom = (float)letterbox_size / height;
        float pillarbox_ratio_geom = (float)pillarbox_size / width;
        float ratio_sum = letterbox_ratio_geom + pillarbox_ratio_geom;
        float letterbox_ratio = letterbox_ratio_geom / ratio_sum;
        float pillarbox_ratio = pillarbox_ratio_geom / ratio_sum;

        // Drop whichever side contributes less than 25% of the total masking
        if (letterbox_ratio < 0.25f) {
            *top = 0;
            *bottom = 0;
        } else if (pillarbox_ratio < 0.25f) {
            *left = 0;
            *right = 0;
        }
        // Otherwise keep both (legitimate windowboxing)
    }
}
// Detect letterbox/pillarbox bars in the current frame
// Returns 1 if masking detected, 0 otherwise
// Sets top, right, bottom, left to the size of detected bars in pixels
static int detect_letterbox_pillarbox(tav_encoder_t *enc,
uint16_t *top, uint16_t *right,
uint16_t *bottom, uint16_t *left) {
if (!enc->current_frame_rgb) return 0;
const int width = enc->width;
const int height = enc->height;
const int SAMPLE_RATE_HORZ = 4; // Sample every 4th pixel for performance
const int SAMPLE_RATE_VERT = 4; // Sample every 4th pixel for performance
const float Y_THRESHOLD = 2.0f; // Y < 2 for dark pixels
const float CHROMA_THRESHOLD = 1.0f; // Co/Cg close to 0 (in ±255 scale)
const float EDGE_ACTIVITY_THRESHOLD = 1.0f; // Mean Sobel magnitude < 1.0
const float ROW_COL_BLACK_RATIO = 0.999f; // 99.9% of sampled pixels must be black
*top = 0;
*bottom = 0;
*left = 0;
*right = 0;
// Detect top letterbox
for (int y = 0; y < height / 4; y++) {
int black_pixel_count = 0;
float total_edge_activity = 0.0f;
int sampled_pixels = 0;
for (int x = 0; x < width; x += SAMPLE_RATE_HORZ) {
int idx = y * width + x;
// Use pre-converted YCoCg values (optimization: avoid RGB→YCoCg conversion in loop)
float yval = enc->current_frame_y[idx];
float co = enc->current_frame_co[idx];
float cg = enc->current_frame_cg[idx];
// Check if pixel is dark and neutral (letterbox bar)
if (yval < Y_THRESHOLD &&
fabs(co) < CHROMA_THRESHOLD &&
fabs(cg) < CHROMA_THRESHOLD) {
black_pixel_count++;
}
// Calculate edge activity
total_edge_activity += calculate_sobel_magnitude(enc->current_frame_rgb,
width, height, x, y);
sampled_pixels++;
}
float black_ratio = (float)black_pixel_count / sampled_pixels;
float mean_edge_activity = total_edge_activity / sampled_pixels;
// Row is part of letterbox if mostly black AND low edge activity
if (black_ratio > ROW_COL_BLACK_RATIO &&
mean_edge_activity < EDGE_ACTIVITY_THRESHOLD) {
*top = y + 1;
} else {
break; // Found content
}
}
// Detect bottom letterbox
for (int y = height - 1; y >= height * 3 / 4; y--) {
int black_pixel_count = 0;
float total_edge_activity = 0.0f;
int sampled_pixels = 0;
for (int x = 0; x < width; x += SAMPLE_RATE_HORZ) {
int idx = y * width + x;
// Use pre-converted YCoCg values (optimization)
float yval = enc->current_frame_y[idx];
float co = enc->current_frame_co[idx];
float cg = enc->current_frame_cg[idx];
if (yval < Y_THRESHOLD &&
fabs(co) < CHROMA_THRESHOLD &&
fabs(cg) < CHROMA_THRESHOLD) {
black_pixel_count++;
}
total_edge_activity += calculate_sobel_magnitude(enc->current_frame_rgb,
width, height, x, y);
sampled_pixels++;
}
float black_ratio = (float)black_pixel_count / sampled_pixels;
float mean_edge_activity = total_edge_activity / sampled_pixels;
if (black_ratio > ROW_COL_BLACK_RATIO &&
mean_edge_activity < EDGE_ACTIVITY_THRESHOLD) {
*bottom = height - y;
} else {
break;
}
}
// Detect left pillarbox
for (int x = 0; x < width / 4; x++) {
int black_pixel_count = 0;
float total_edge_activity = 0.0f;
int sampled_pixels = 0;
for (int y = 0; y < height; y += SAMPLE_RATE_VERT) {
int idx = y * width + x;
// Use pre-converted YCoCg values (optimization)
float yval = enc->current_frame_y[idx];
float co = enc->current_frame_co[idx];
float cg = enc->current_frame_cg[idx];
if (yval < Y_THRESHOLD &&
fabs(co) < CHROMA_THRESHOLD &&
fabs(cg) < CHROMA_THRESHOLD) {
black_pixel_count++;
}
total_edge_activity += calculate_sobel_magnitude(enc->current_frame_rgb,
width, height, x, y);
sampled_pixels++;
}
float black_ratio = (float)black_pixel_count / sampled_pixels;
float mean_edge_activity = total_edge_activity / sampled_pixels;
if (black_ratio > ROW_COL_BLACK_RATIO &&
mean_edge_activity < EDGE_ACTIVITY_THRESHOLD) {
*left = x + 1;
} else {
break;
}
}
// Detect right pillarbox
for (int x = width - 1; x >= width * 3 / 4; x--) {
int black_pixel_count = 0;
float total_edge_activity = 0.0f;
int sampled_pixels = 0;
for (int y = 0; y < height; y += SAMPLE_RATE_VERT) {
int idx = y * width + x;
// Use pre-converted YCoCg values (optimization)
float yval = enc->current_frame_y[idx];
float co = enc->current_frame_co[idx];
float cg = enc->current_frame_cg[idx];
if (yval < Y_THRESHOLD &&
fabs(co) < CHROMA_THRESHOLD &&
fabs(cg) < CHROMA_THRESHOLD) {
black_pixel_count++;
}
total_edge_activity += calculate_sobel_magnitude(enc->current_frame_rgb,
width, height, x, y);
sampled_pixels++;
}
float black_ratio = (float)black_pixel_count / sampled_pixels;
float mean_edge_activity = total_edge_activity / sampled_pixels;
if (black_ratio > ROW_COL_BLACK_RATIO &&
mean_edge_activity < EDGE_ACTIVITY_THRESHOLD) {
*right = width - x;
} else {
break;
}
}
// Apply symmetric cropping preference and minimum bar size filtering
// Note: During detection phase, no current state available (use 0,0,0,0)
apply_symmetric_cropping(top, right, bottom, left, width, height, 0, 0, 0, 0);
// Return 1 if any masking was detected
return (*top > 0 || *bottom > 0 || *left > 0 || *right > 0);
}
// Locate the frame at which the mask geometry stopped matching the old one.
// Scans forward from start_frame up to end_frame (inclusive, and never past
// the end of the analysis array) and returns the index of the first frame
// whose top/right/bottom/left differs from the old values by more than
// 4 pixels on any side. Returns end_frame if no such frame is found.
static int refine_geometry_change(tav_encoder_t *enc, int start_frame, int end_frame,
                                  uint16_t old_top, uint16_t old_right,
                                  uint16_t old_bottom, uint16_t old_left) {
    const int tolerance_px = 4;  // ±4 pixels tolerance

    int frame = start_frame;
    while (frame <= end_frame && frame < enc->frame_analyses_count) {
        const frame_analysis_t *fa = &enc->frame_analyses[frame];

        // Per-side distance from the old geometry
        const int d_top = abs((int)fa->letterbox_top - (int)old_top);
        const int d_right = abs((int)fa->letterbox_right - (int)old_right);
        const int d_bottom = abs((int)fa->letterbox_bottom - (int)old_bottom);
        const int d_left = abs((int)fa->letterbox_left - (int)old_left);

        if (d_top > tolerance_px || d_right > tolerance_px ||
            d_bottom > tolerance_px || d_left > tolerance_px) {
            return frame;  // Found the change point
        }
        frame++;
    }

    return end_frame;  // No change found, use end frame
}
// Write all screen masking packets before first frame (similar to SSF-TC subtitles)
//
// Two-stage approach over the first-pass frame analyses:
//   1. Coarse scan: starting at SKIP_INITIAL_FRAMES, look at every
//      COARSE_STRIDE-th frame and compare its detected geometry with the
//      geometry of the last packet written.
//   2. Refinement: when a sample differs by CHANGE_THRESHOLD pixels or more on
//      any side, scan forward from the previous sample to find the exact frame
//      where the geometry changed, and emit a packet for that frame.
//
// The very first packet is always stamped with frame 0 so the mask is defined
// from the start of the stream, but its geometry is taken from the first
// trusted sample (frame SKIP_INITIAL_FRAMES) rather than from frame 0 itself,
// because the opening frames are often black/fade-in and would be detected as
// being masked on every side.
//
// Does nothing unless letterbox detection and two-pass mode are both enabled
// and analysis data exists.
static void write_all_screen_mask_packets(tav_encoder_t *enc, FILE *output) {
    if (!enc->enable_letterbox_detect || !enc->two_pass_mode) {
        return;  // Letterbox detection requires two-pass mode
    }

    if (!enc->frame_analyses || enc->frame_analyses_count == 0) {
        return;  // No analysis data
    }

    #define COARSE_STRIDE 16         // Sample every 16 frames for coarse detection
    #define CHANGE_THRESHOLD 16      // Require 16+ pixel change to consider geometry change
    #define SKIP_INITIAL_FRAMES 60   // Skip first N frames (often black/fade-in)

    // Geometry of the most recently written packet
    uint16_t current_top = 0, current_right = 0, current_bottom = 0, current_left = 0;
    int packets_written = 0;
    int last_checked_frame = SKIP_INITIAL_FRAMES;

    // Stage 1: Coarse scan every COARSE_STRIDE frames to detect geometry changes
    for (int i = SKIP_INITIAL_FRAMES; i < enc->frame_analyses_count; i += COARSE_STRIDE) {
        frame_analysis_t *metrics = &enc->frame_analyses[i];

        // Check if geometry changed significantly
        int is_first = (packets_written == 0);
        int is_significant_change =
            abs((int)metrics->letterbox_top - (int)current_top) >= CHANGE_THRESHOLD ||
            abs((int)metrics->letterbox_right - (int)current_right) >= CHANGE_THRESHOLD ||
            abs((int)metrics->letterbox_bottom - (int)current_bottom) >= CHANGE_THRESHOLD ||
            abs((int)metrics->letterbox_left - (int)current_left) >= CHANGE_THRESHOLD;

        if (is_first || is_significant_change) {
            // Stage 2: Refine - find exact frame where change occurred
            int change_frame;
            frame_analysis_t *change_metrics;
            if (is_first) {
                // First packet is always stamped at frame 0, but uses the geometry
                // of this first trusted sample instead of the skipped frame 0
                change_frame = 0;
                change_metrics = metrics;
            } else {
                // Scan forward from the previous sample up to i for the exact change point
                change_frame = refine_geometry_change(enc, last_checked_frame, i,
                                                      current_top, current_right,
                                                      current_bottom, current_left);
                change_metrics = &enc->frame_analyses[change_frame];
            }

            // Apply symmetric cropping to final geometry (with current state for context)
            uint16_t final_top = change_metrics->letterbox_top;
            uint16_t final_right = change_metrics->letterbox_right;
            uint16_t final_bottom = change_metrics->letterbox_bottom;
            uint16_t final_left = change_metrics->letterbox_left;

            apply_symmetric_cropping(&final_top, &final_right, &final_bottom, &final_left,
                                     enc->width, enc->height,
                                     current_top, current_bottom, current_left, current_right);

            // Emit packet
            write_screen_mask_packet(output, change_frame,
                                     final_top, final_right, final_bottom, final_left);

            // Update current geometry
            current_top = final_top;
            current_right = final_right;
            current_bottom = final_bottom;
            current_left = final_left;
            packets_written++;

            if (enc->verbose) {
                printf("  Frame %d: Screen mask t=%u r=%u b=%u l=%u (frame-exact detection)\n",
                       change_frame, (unsigned)final_top, (unsigned)final_right,
                       (unsigned)final_bottom, (unsigned)final_left);
            }
        }

        last_checked_frame = i;
    }

    if (packets_written > 0) {
        printf("Wrote %d screen masking packet(s) (frame-exact detection)\n", packets_written);
    }

    #undef COARSE_STRIDE
    #undef CHANGE_THRESHOLD
    #undef SKIP_INITIAL_FRAMES
}
// Write extended header packet with metadata
// Returns the file offset where ENDT value is written (for later update)
static long write_extended_header(tav_encoder_t *enc) {
@@ -8297,6 +8716,15 @@ static int write_tad_packet_samples(tav_encoder_t *enc, FILE *output, int sample
if (!enc->pcm_file || enc->audio_remaining <= 0 || samples_to_read <= 0) {
return 0;
}
// Check if we have enough audio for a minimum chunk
// Don't encode if less than minimum - avoids encoding mostly padding/zeros
size_t min_bytes_needed = TAD32_MIN_CHUNK_SIZE * 2 * sizeof(float);
if (enc->audio_remaining < min_bytes_needed) {
enc->audio_remaining = 0; // Mark audio as exhausted
return 0;
}
size_t bytes_to_read = samples_to_read * 2 * sizeof(float); // Stereo Float32LE
// Don't read more than what's available
@@ -9457,9 +9885,11 @@ static int two_pass_first_pass(tav_encoder_t *enc, const char *input_file) {
// Compute metrics
frame_analysis_t metrics;
metrics.frame_number = frame_num;
compute_frame_metrics(enc, gray, prev_dwt, sub_width, sub_height, ANALYSIS_DWT_LEVELS, &metrics);
// Set frame number AFTER compute_frame_metrics (which does memset)
metrics.frame_number = frame_num;
// Detect scene change using hybrid detector
if (frame_num > 0) {
metrics.is_scene_change = detect_scene_change_wavelet(
@@ -9473,6 +9903,29 @@ static int two_pass_first_pass(tav_encoder_t *enc, const char *input_file) {
metrics.is_scene_change = 0; // First frame is always start of first GOP
}
// Detect letterbox/pillarbox if enabled
if (enc->enable_letterbox_detect) {
// Set current_frame_rgb temporarily for detection
uint8_t *saved_current = enc->current_frame_rgb;
enc->current_frame_rgb = frame_rgb;
metrics.has_letterbox = detect_letterbox_pillarbox(
enc,
&metrics.letterbox_top,
&metrics.letterbox_right,
&metrics.letterbox_bottom,
&metrics.letterbox_left
);
enc->current_frame_rgb = saved_current;
} else {
metrics.has_letterbox = 0;
metrics.letterbox_top = 0;
metrics.letterbox_right = 0;
metrics.letterbox_bottom = 0;
metrics.letterbox_left = 0;
}
// Store analysis
if (enc->frame_analyses_count >= enc->frame_analyses_capacity) {
// Expand array
@@ -9650,6 +10103,7 @@ int main(int argc, char *argv[]) {
{"tad-audio", no_argument, 0, 1028},
{"raw-coeffs", no_argument, 0, 1029},
{"single-pass", no_argument, 0, 1050}, // disable two-pass encoding with wavelet-based scene detection
{"no-letterbox-detect", no_argument, 0, 1051}, // disable letterbox/pillarbox detection
{"help", no_argument, 0, '?'},
{0, 0, 0, 0}
};
@@ -9880,6 +10334,10 @@ int main(int argc, char *argv[]) {
enc->two_pass_mode = 0;
printf("Two-pass wavelet-based scene change detection disabled\n");
break;
case 1051: // --no-letterbox-detect
enc->enable_letterbox_detect = 0;
printf("Letterbox/pillarbox detection disabled\n");
break;
case 'a':
int bitrate = atoi(optarg);
int valid_bitrate = validate_mp2_bitrate(bitrate);
@@ -10088,6 +10546,10 @@ int main(int argc, char *argv[]) {
write_all_subtitles_tc(enc, enc->output_fp);
}
// Write all screen masking packets upfront (before first frame)
// This must be done AFTER first pass analysis completes, so we'll defer it
// to after the two-pass analysis block below
if (enc->output_fps != enc->fps) {
printf("Frame rate conversion enabled: %d fps output\n", enc->output_fps);
}
@@ -10131,6 +10593,9 @@ int main(int argc, char *argv[]) {
TEMPORAL_GOP_SIZE, ANALYSIS_GOP_MAX_SIZE);
}
// Write all screen masking packets NOW (after first pass analysis)
write_all_screen_mask_packets(enc, enc->output_fp);
printf("\n=== Two-Pass Encoding: Second Pass (Encoding) ===\n");
}