diff --git a/video_encoder/encoder_tav.c b/video_encoder/encoder_tav.c index 1ae66a7..70694c7 100644 --- a/video_encoder/encoder_tav.c +++ b/video_encoder/encoder_tav.c @@ -8221,8 +8221,8 @@ static void apply_symmetric_cropping(uint16_t *top, uint16_t *right, if (*left > 0 || *right > 0) { // Use minimum value to avoid over-cropping uint16_t symmetric_value = (*left < *right) ? *left : *right; - *left = symmetric_value+1; - *right = symmetric_value+1; + *left = symmetric_value; + *right = symmetric_value; } // Check if BOTH letterbox and pillarbox are detected simultaneously @@ -8249,7 +8249,7 @@ static void apply_symmetric_cropping(uint16_t *top, uint16_t *right, if (letterbox_ratio < 0.25f) { *top = 0; *bottom = 0; - } else if (pillarbox_ratio < 0.25f) + } else if (pillarbox_ratio < 0.25f) { *left = 0; *right = 0; } @@ -8271,7 +8271,7 @@ static int detect_letterbox_pillarbox(tav_encoder_t *enc, const int SAMPLE_RATE_VERT = 4; // Sample every 4th pixel for performance const float Y_THRESHOLD = 2.0f; // Y < 2 for dark pixels const float CHROMA_THRESHOLD = 1.0f; // Co/Cg close to 0 (in ±255 scale) - const float EDGE_ACTIVITY_THRESHOLD = 1.0f; // Mean Sobel magnitude < 1.0 + const float EDGE_ACTIVITY_THRESHOLD = 0.7f; // Mean Sobel magnitude const float ROW_COL_BLACK_RATIO = 0.999f; // 99.9% of sampled pixels must be black *top = 0; @@ -8434,33 +8434,67 @@ static int detect_letterbox_pillarbox(tav_encoder_t *enc, return (*top > 0 || *bottom > 0 || *left > 0 || *right > 0); } -// Refine geometry change detection - find exact frame where change occurred -// Uses linear scan to find first frame with new geometry -static int refine_geometry_change(tav_encoder_t *enc, int start_frame, int end_frame, - uint16_t old_top, uint16_t old_right, - uint16_t old_bottom, uint16_t old_left) { - #define GEOMETRY_TOLERANCE 4 // ±4 pixels tolerance +// Median filter helper - finds median of array (destructive sort) +static uint16_t median_uint16(uint16_t *values, int count) { + // Simple bubble sort for small arrays + for (int i = 0; i < count - 1; i++) { + for (int j = 0; j < count - i - 1; j++) { + if (values[j] > values[j + 1]) { + uint16_t tmp = values[j]; + values[j] = values[j + 1]; + values[j + 1] = tmp; + } + } + } + return values[count / 2]; +} - // Linear scan from start to find first frame with new geometry - for (int i = start_frame; i <= end_frame && i < enc->frame_analyses_count; i++) { - frame_analysis_t *m = &enc->frame_analyses[i]; +// Cluster and normalize a single dimension (top, right, bottom, or left) +// Groups values within ±1 and normalizes each to the most frequent value in its cluster +// E.g., [55, 56, 55, 57, 55, 200, 201, 200] -> [55, 55, 55, 55, 55, 200, 200, 200] +static void normalize_dimension_clusters(uint16_t *values, int count) { + if (count == 0) return; - // Check if this frame has different geometry (beyond tolerance) - if (abs((int)m->letterbox_top - (int)old_top) > GEOMETRY_TOLERANCE || - abs((int)m->letterbox_right - (int)old_right) > GEOMETRY_TOLERANCE || - abs((int)m->letterbox_bottom - (int)old_bottom) > GEOMETRY_TOLERANCE || - abs((int)m->letterbox_left - (int)old_left) > GEOMETRY_TOLERANCE) { - return i; // Found the change point +#define MAX_GEOMETRY 2048 // Maximum dimension size (width or height) + + // Build histogram of all values + int histogram[MAX_GEOMETRY]; + memset(histogram, 0, sizeof(histogram)); + + for (int i = 0; i < count; i++) { + if (values[i] < MAX_GEOMETRY) { + histogram[values[i]]++; } } - return end_frame; // No change found, use end frame + // For each value, find the most frequent value within ±1 range and normalize to it + for (int i = 0; i < count; i++) { + uint16_t val = values[i]; + if (val >= MAX_GEOMETRY) continue; - #undef GEOMETRY_TOLERANCE + uint16_t best_val = val; + int best_count = histogram[val]; + + // Check val-1 + if (val > 0 && histogram[val - 1] > best_count) { + best_val = val - 1; + best_count = histogram[val - 1]; + } + + // Check val+1 + if (val + 1 < MAX_GEOMETRY && histogram[val + 1] > best_count) { + best_val = val + 1; + best_count = histogram[val + 1]; + } + + values[i] = best_val; + } + +#undef MAX_GEOMETRY } // Write all screen masking packets before first frame (similar to SSF-TC subtitles) -// Uses two-stage approach: coarse detection (8-frame stride) + frame-exact refinement +// Uses median filtering + clustering to normalize geometry to predominant aspect ratios static void write_all_screen_mask_packets(tav_encoder_t *enc, FILE *output) { if (!enc->enable_letterbox_detect || !enc->two_pass_mode) { return; // Letterbox detection requires two-pass mode @@ -8470,76 +8504,170 @@ static void write_all_screen_mask_packets(tav_encoder_t *enc, FILE *output) { return; // No analysis data } -#define COARSE_STRIDE 16 // Sample every 8 frames for coarse detection -#define CHANGE_THRESHOLD 16 // Require 16+ pixel change to consider geometry change +#define MEDIAN_WINDOW_SIZE 5 // 5-frame window for median filter (smooths jitter, reacts quickly) +#define CHANGE_THRESHOLD 16 // Require 16+ pixel change to emit packet #define SKIP_INITIAL_FRAMES 60 // Skip first N frames (often black/fade-in) - // Track current geometry - uint16_t current_top = 0, current_right = 0, current_bottom = 0, current_left = 0; - int packets_written = 0; - int last_checked_frame = SKIP_INITIAL_FRAMES; + // Geometry storage for each frame + typedef struct { + uint16_t top, right, bottom, left; + } frame_geometry_t; - // Stage 1: Coarse scan every COARSE_STRIDE frames to detect geometry changes - for (int i = SKIP_INITIAL_FRAMES; i < enc->frame_analyses_count; i += COARSE_STRIDE) { - frame_analysis_t *metrics = &enc->frame_analyses[i]; + frame_geometry_t *geometries = calloc(enc->frame_analyses_count, sizeof(frame_geometry_t)); + if (!geometries) { + fprintf(stderr, "Failed to allocate geometry storage\n"); + return; + } - // Check if geometry changed significantly - int is_first = (packets_written == 0); - int is_significant_change = - abs((int)metrics->letterbox_top - (int)current_top) >= CHANGE_THRESHOLD || - abs((int)metrics->letterbox_right - (int)current_right) >= CHANGE_THRESHOLD || - abs((int)metrics->letterbox_bottom - (int)current_bottom) >= CHANGE_THRESHOLD || - abs((int)metrics->letterbox_left - (int)current_left) >= CHANGE_THRESHOLD; + // Step 1: Calculate median-filtered geometry for all frames + // Use centered median window to avoid early detection + uint16_t top_window[MEDIAN_WINDOW_SIZE]; + uint16_t right_window[MEDIAN_WINDOW_SIZE]; + uint16_t bottom_window[MEDIAN_WINDOW_SIZE]; + uint16_t left_window[MEDIAN_WINDOW_SIZE]; - if (is_first || is_significant_change) { - // Stage 2: Refine - find exact frame where change occurred - int change_frame; - if (is_first) { - change_frame = 0; // First packet always at frame 0 - } else { - // Search backwards from i to last_checked_frame to find exact change point - change_frame = refine_geometry_change(enc, last_checked_frame, i, - current_top, current_right, - current_bottom, current_left); + const int window_offset = MEDIAN_WINDOW_SIZE / 2; // Center offset (2 for size 5) + + for (int i = SKIP_INITIAL_FRAMES; i < enc->frame_analyses_count; i++) { + // Fill centered median window with values from [i-offset, i, i+offset] + // E.g., for window size 5: [i-2, i-1, i, i+1, i+2] + int window_count = 0; + for (int w = 0; w < MEDIAN_WINDOW_SIZE; w++) { + int frame_idx = i - window_offset + w; + + // Clamp to valid frame range + if (frame_idx < SKIP_INITIAL_FRAMES) { + frame_idx = SKIP_INITIAL_FRAMES; + } else if (frame_idx >= enc->frame_analyses_count) { + frame_idx = enc->frame_analyses_count - 1; } - // Get geometry from the change frame - frame_analysis_t *change_metrics = &enc->frame_analyses[change_frame]; - - // Apply symmetric cropping to final geometry (with current state for context) - uint16_t final_top = change_metrics->letterbox_top; - uint16_t final_right = change_metrics->letterbox_right; - uint16_t final_bottom = change_metrics->letterbox_bottom; - uint16_t final_left = change_metrics->letterbox_left; - apply_symmetric_cropping(&final_top, &final_right, &final_bottom, &final_left, - enc->width, enc->height, - current_top, current_bottom, current_left, current_right); - - // Emit packet - write_screen_mask_packet(output, change_frame, - final_top, final_right, final_bottom, final_left); - - // Update current geometry - current_top = final_top; - current_right = final_right; - current_bottom = final_bottom; - current_left = final_left; - packets_written++; - - if (enc->verbose) { - printf(" Frame %d: Screen mask t=%u r=%u b=%u l=%u (frame-exact detection)\n", - change_frame, final_top, final_right, final_bottom, final_left); - } + frame_analysis_t *metrics = &enc->frame_analyses[frame_idx]; + top_window[window_count] = metrics->letterbox_top; + right_window[window_count] = metrics->letterbox_right; + bottom_window[window_count] = metrics->letterbox_bottom; + left_window[window_count] = metrics->letterbox_left; + window_count++; } - last_checked_frame = i; + // Calculate median values (filters jitter like 52,53,53,52,53,52 -> 52) + geometries[i].top = median_uint16(top_window, window_count); + geometries[i].right = median_uint16(right_window, window_count); + geometries[i].bottom = median_uint16(bottom_window, window_count); + geometries[i].left = median_uint16(left_window, window_count); } - if (packets_written > 0) { - printf("Wrote %d screen masking packet(s) (frame-exact detection)\n", packets_written); + // Step 2: Identify change points and collect packet geometries (first pass) + typedef struct { + int frame_num; + uint16_t top, right, bottom, left; + } screen_mask_packet_t; + + // Allocate worst-case packet storage (one per frame) + screen_mask_packet_t *packets = malloc(enc->frame_analyses_count * sizeof(screen_mask_packet_t)); + if (!packets) { + fprintf(stderr, "Failed to allocate packet storage\n"); + free(geometries); + return; } -#undef COARSE_STRIDE + int packet_count = 0; + uint16_t current_top = 0, current_right = 0, current_bottom = 0, current_left = 0; + + for (int i = SKIP_INITIAL_FRAMES; i < enc->frame_analyses_count; i++) { + uint16_t top = geometries[i].top; + uint16_t right = geometries[i].right; + uint16_t bottom = geometries[i].bottom; + uint16_t left = geometries[i].left; + + // Apply symmetric cropping + apply_symmetric_cropping(&top, &right, &bottom, &left, + enc->width, enc->height, + current_top, current_bottom, current_left, current_right); + + // Check if geometry changed significantly + int is_first = (packet_count == 0); + int is_significant_change = + abs((int)top - (int)current_top) >= CHANGE_THRESHOLD || + abs((int)right - (int)current_right) >= CHANGE_THRESHOLD || + abs((int)bottom - (int)current_bottom) >= CHANGE_THRESHOLD || + abs((int)left - (int)current_left) >= CHANGE_THRESHOLD; + + if (is_first || is_significant_change) { + // Store packet (first packet points to frame 0) + packets[packet_count].frame_num = is_first ? 0 : i; + packets[packet_count].top = top; + packets[packet_count].right = right; + packets[packet_count].bottom = bottom; + packets[packet_count].left = left; + packet_count++; + + // Update current geometry + current_top = top; + current_right = right; + current_bottom = bottom; + current_left = left; + } + } + + // Step 3: Survey packet values and normalize clusters (second pass) + // Cluster values within ±1 across all packets and normalize to most frequent + if (packet_count > 0) { + // Extract dimension values from packets + uint16_t *tops = malloc(packet_count * sizeof(uint16_t)); + uint16_t *rights = malloc(packet_count * sizeof(uint16_t)); + uint16_t *bottoms = malloc(packet_count * sizeof(uint16_t)); + uint16_t *lefts = malloc(packet_count * sizeof(uint16_t)); + + for (int i = 0; i < packet_count; i++) { + tops[i] = packets[i].top; + rights[i] = packets[i].right; + bottoms[i] = packets[i].bottom; + lefts[i] = packets[i].left; + } + + // Normalize each dimension independently (54,55,56 -> 55) + normalize_dimension_clusters(tops, packet_count); + normalize_dimension_clusters(rights, packet_count); + normalize_dimension_clusters(bottoms, packet_count); + normalize_dimension_clusters(lefts, packet_count); + + // Write normalized values back to packets + for (int i = 0; i < packet_count; i++) { + packets[i].top = tops[i]; + packets[i].right = rights[i]; + packets[i].bottom = bottoms[i]; + packets[i].left = lefts[i]; + } + + free(tops); + free(rights); + free(bottoms); + free(lefts); + } + + // Step 4: Emit normalized packets to file + for (int i = 0; i < packet_count; i++) { + write_screen_mask_packet(output, packets[i].frame_num, + packets[i].top, packets[i].right, + packets[i].bottom, packets[i].left); + + if (enc->verbose) { + printf(" Frame %d: Screen mask t=%u r=%u b=%u l=%u (normalized%s)\n", + packets[i].frame_num, packets[i].top, packets[i].right, + packets[i].bottom, packets[i].left, + i == 0 ? ", initial geometry" : ""); + } + } + + if (packet_count > 0) { + printf("Wrote %d screen masking packet(s) (median + clustering)\n", packet_count); + } + + free(packets); + free(geometries); + +#undef MEDIAN_WINDOW_SIZE #undef CHANGE_THRESHOLD #undef SKIP_INITIAL_FRAMES }