From 92274a8e1978b1ecd2acb365eca8f063a3f2bbd4 Mon Sep 17 00:00:00 2001 From: minjaesong Date: Thu, 20 Nov 2025 02:13:45 +0900 Subject: [PATCH] TAV: letterbox detection encoding complete --- terranmon.txt | 16 + .../torvald/tsvm/GraphicsJSR223Delegate.kt | 215 ++++- video_encoder/Makefile | 11 +- video_encoder/decoder_tav.c | 451 +++++++--- video_encoder/encoder_tav.c | 769 ++++++++++++++++-- 5 files changed, 1242 insertions(+), 220 deletions(-) diff --git a/terranmon.txt b/terranmon.txt index 2de529c..be39a0e 100644 --- a/terranmon.txt +++ b/terranmon.txt @@ -1007,6 +1007,12 @@ transmission capability, and region-of-interest coding. 0xE2: ID3v2 packet 0xE3: Vorbis Comment packet 0xE4: CD-text packet + + 0x01: Vendor-specific video packets + 0x02: Vendor-specific audio frame + 0x03: Vendor-specific subtitle + 0x04: Vendor-specific audio file + 0x0E: Vendor-specific metadata 0x00: No-op (no payload) 0xEF: TAV Extended Header @@ -1068,6 +1074,16 @@ transmission capability, and region-of-interest coding. - Bytes VNDR: Name and version of the encoder (for Reference encoder: "Encoder-TAV 20251014 (list,of,features)") - Bytes FMPG: FFmpeg version (typically "ffmpeg version 8.0 Copyright (c) 2000-2025 the FFmpeg developers"; the first line of text FFmpeg emits) +## Extensible Packet Structure + uint8 Packet Type + uint8 Flags + - 0x01: 64-bit size + uint8 Identifier[4] + + uint64 Length of the payload + + uint32 Length of the payload + * Payload ## Standard Metadata Payload Packet Structure uint8 Packet Type (0xE0/0xE1/0xE2/.../0xEE; see Packet Types section) diff --git a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt index 7c2fa5c..892d94b 100644 --- a/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt +++ b/tsvm_core/src/net/torvald/tsvm/GraphicsJSR223Delegate.kt @@ -4713,24 +4713,120 @@ class GraphicsJSR223Delegate(private val vm: VM) { return output } + /** + * Peek at EZBC dimensions without decoding + * Reads width and height from the EZBC bitstream header + * + * @param compressedData EZBC frame data + * @param compressedOffset Offset to EZBC channel data + * @param channelLayout Channel layout to find Y channel + * @return Pair of (width, height) or null if unable to read + */ + private fun ezbc_peek_dimensions(compressedData: ByteArray, compressedOffset: Int, channelLayout: Int): Pair? { + val hasY = (channelLayout and 4) == 0 + if (!hasY) return null // Need Y channel to get dimensions + + try { + // Read Y channel size header (4 bytes) + val size = ((compressedData[compressedOffset].toInt() and 0xFF) or + ((compressedData[compressedOffset + 1].toInt() and 0xFF) shl 8) or + ((compressedData[compressedOffset + 2].toInt() and 0xFF) shl 16) or + ((compressedData[compressedOffset + 3].toInt() and 0xFF) shl 24)) + + if (size < 6) return null // Too small for EZBC header + + // Parse EZBC bitstream header (skip MSB bitplane, read width/height) + val ezbc_offset = compressedOffset + 4 + var bytePos = ezbc_offset + var bitPos = 0 + + fun readBits(numBits: Int): Int { + var result = 0 + for (i in 0 until numBits) { + if (bytePos >= ezbc_offset + size) return 0 + val bit = (compressedData[bytePos].toInt() shr bitPos) and 1 + result = result or (bit shl i) + bitPos++ + if (bitPos == 8) { + bitPos = 0 + bytePos++ + } + } + return result + } + + // Read header: MSB bitplane (8 bits), width (16 bits), height (16 bits) + readBits(8) // Skip MSB bitplane + val width = readBits(16) + val height = readBits(16) + + return if (width > 0 && height > 0) Pair(width, height) else null + } catch (e: Exception) { + return null + } + } + + /** + * Result of EZBC GOP decoding with actual dimensions (for crop encoding support) + */ + data class EZBCGopResult( + val coeffs: Array>, + val width: Int, + val height: Int + ) + /** * Reconstruct per-frame coefficients from unified GOP block (EZBC format) * Format: [frame0_size(4)][frame0_ezbc][frame1_size(4)][frame1_ezbc]... * + * With crop encoding, GOP dimensions may differ from full frame size. + * This function detects the actual dimensions from EZBC headers. + * * @param decompressedData Unified EZBC block data (after Zstd decompression) * @param numFrames Number of frames in GOP - * @param numPixels Pixels per frame (width × height) + * @param numPixels Pixels per frame for full frame (width × height) * @param channelLayout Channel layout (0=YCoCg, 2=Y-only, etc) - * @return Array of [frame][channel] where channel: 0=Y, 1=Co, 2=Cg + * @param fullWidth Full frame width (for crop detection) + * @param fullHeight Full frame height (for crop detection) + * @return EZBCGopResult with coeffs array and actual GOP dimensions */ private fun tavPostprocessGopEZBC( decompressedData: ByteArray, numFrames: Int, numPixels: Int, - channelLayout: Int - ): Array> { - // Allocate output arrays - val output = Array(numFrames) { Array(3) { ShortArray(numPixels) } } + channelLayout: Int, + fullWidth: Int, + fullHeight: Int + ): EZBCGopResult { + // Peek at first frame's EZBC dimensions to detect crop encoding + var actualWidth = fullWidth + var actualHeight = fullHeight + var actualPixels = numPixels + + if (decompressedData.size >= 8) { + // Read first frame size + val firstFrameSize = ((decompressedData[0].toInt() and 0xFF) or + ((decompressedData[1].toInt() and 0xFF) shl 8) or + ((decompressedData[2].toInt() and 0xFF) shl 16) or + ((decompressedData[3].toInt() and 0xFF) shl 24)) + + if (4 + firstFrameSize <= decompressedData.size) { + // Peek at EZBC dimensions from first frame's Y channel + val dims = ezbc_peek_dimensions(decompressedData, 4, channelLayout) + if (dims != null) { + actualWidth = dims.first + actualHeight = dims.second + actualPixels = actualWidth * actualHeight + + if (actualPixels != numPixels) { + println("[TAV-GOP-EZBC] Detected crop encoding: GOP ${actualWidth}x${actualHeight} (${actualPixels} px) vs full frame ${fullWidth}x${fullHeight} (${numPixels} px)") + } + } + } + } + + // Allocate output arrays with actual GOP dimensions + val output = Array(numFrames) { Array(3) { ShortArray(actualPixels) } } var offset = 0 for (frame in 0 until numFrames) { @@ -4745,39 +4841,52 @@ class GraphicsJSR223Delegate(private val vm: VM) { if (offset + frameSize > decompressedData.size) break - // Decode this frame with EZBC + // Decode this frame with EZBC using actual GOP dimensions postprocessCoefficientsEZBC( - decompressedData, offset, numPixels, channelLayout, + decompressedData, offset, actualPixels, channelLayout, output[frame][0], output[frame][1], output[frame][2], null ) offset += frameSize } - return output + return EZBCGopResult(output, actualWidth, actualHeight) } + /** + * Result of GOP decoding with actual dimensions (for crop encoding support) + */ + data class GopDecodeResult( + val isEZBC: Boolean, + val coeffs: Array>, + val width: Int, + val height: Int + ) + /** * Auto-detecting GOP postprocessor * Detects EZBC vs twobit-map format and calls appropriate decoder + * Returns actual GOP dimensions (may differ from full frame with crop encoding) */ private fun tavPostprocessGopAuto( decompressedData: ByteArray, numFrames: Int, numPixels: Int, channelLayout: Int, - entropyCoder: Int - ): Pair>> { + entropyCoder: Int, + fullWidth: Int, + fullHeight: Int + ): GopDecodeResult { // Read entropy coder from header: 0 = Twobit-map, 1 = EZBC val isEZBC = (entropyCoder == 1) - val data = if (isEZBC) { - tavPostprocessGopEZBC(decompressedData, numFrames, numPixels, channelLayout) + return if (isEZBC) { + val result = tavPostprocessGopEZBC(decompressedData, numFrames, numPixels, channelLayout, fullWidth, fullHeight) + GopDecodeResult(true, result.coeffs, result.width, result.height) } else { - tavPostprocessGopUnified(decompressedData, numFrames, numPixels, channelLayout) + val coeffs = tavPostprocessGopUnified(decompressedData, numFrames, numPixels, channelLayout) + GopDecodeResult(false, coeffs, fullWidth, fullHeight) } - - return Pair(isEZBC, data) } // TAV Simulated overlapping tiles constants (must match encoder) @@ -6446,21 +6555,33 @@ class GraphicsJSR223Delegate(private val vm: VM) { } // Step 2: Postprocess unified block to per-frame coefficients - val (isEZBCMode, quantizedCoeffs) = tavPostprocessGopAuto( + // With crop encoding, GOP dimensions may differ from full frame + val gopResult = tavPostprocessGopAuto( decompressedData, gopSize, outputPixels, channelLayout, - entropyCoder + entropyCoder, + width, + height ) - // Step 3: Allocate GOP buffers for float coefficients (expanded canvas size) - val gopY = Array(gopSize) { FloatArray(outputPixels) } - val gopCo = Array(gopSize) { FloatArray(outputPixels) } - val gopCg = Array(gopSize) { FloatArray(outputPixels) } + val isEZBCMode = gopResult.isEZBC + val quantizedCoeffs = gopResult.coeffs + val gopWidth = gopResult.width + val gopHeight = gopResult.height + val gopPixels = gopWidth * gopHeight - // Step 4: Calculate subband layout for expanded canvas - val subbands = calculateSubbandLayout(width, height, spatialLevels) + // Detect crop encoding + val isCropEncoded = (gopWidth != width || gopHeight != height) + + // Step 3: Allocate GOP buffers for float coefficients (GOP dimensions) + val gopY = Array(gopSize) { FloatArray(gopPixels) } + val gopCo = Array(gopSize) { FloatArray(gopPixels) } + val gopCg = Array(gopSize) { FloatArray(gopPixels) } + + // Step 4: Calculate subband layout for GOP dimensions (may be cropped) + val subbands = calculateSubbandLayout(gopWidth, gopHeight, spatialLevels) // Step 5: Dequantize with temporal-spatial scaling for (t in 0 until gopSize) { @@ -6503,32 +6624,44 @@ class GraphicsJSR223Delegate(private val vm: VM) { // Step 5.5: Remove grain synthesis from Y channel for each GOP frame // This must happen after dequantization but before inverse DWT + // Use GOP dimensions (may be cropped) for (t in 0 until gopSize) { removeGrainSynthesisDecoder( - gopY[t], width, height, + gopY[t], gopWidth, gopHeight, rngFrameTick.getAndAdd(1) + t, subbands, qIndex ) } - // Step 6: Apply inverse 3D DWT - tavApplyInverse3DDWT(gopY, width, height, gopSize, spatialLevels, temporalLevels, spatialFilter) - tavApplyInverse3DDWT(gopCo, width, height, gopSize, spatialLevels, temporalLevels, spatialFilter) - tavApplyInverse3DDWT(gopCg, width, height, gopSize, spatialLevels, temporalLevels, spatialFilter) + // Step 6: Apply inverse 3D DWT using GOP dimensions (may be cropped) + tavApplyInverse3DDWT(gopY, gopWidth, gopHeight, gopSize, spatialLevels, temporalLevels, spatialFilter) + tavApplyInverse3DDWT(gopCo, gopWidth, gopHeight, gopSize, spatialLevels, temporalLevels, spatialFilter) + tavApplyInverse3DDWT(gopCg, gopWidth, gopHeight, gopSize, spatialLevels, temporalLevels, spatialFilter) + + // Step 8: Convert to RGB and composite to full frame + // With crop encoding, center the cropped frame and fill letterbox areas with black + val maskTop = if (isCropEncoded && gopHeight < height) (height - gopHeight) / 2 else 0 + val maskLeft = if (isCropEncoded && gopWidth < width) (width - gopWidth) / 2 else 0 - // Step 8: Crop and convert to RGB, write directly to videoBuffer for (t in 0 until gopSize) { val videoBufferOffset = bufferOffset + (t * frameSize) // Each frame sequentially, starting at bufferOffset - for (py in 0 until height) { - for (px in 0 until width) { - // Destination pixel in videoBuffer - val outIdx = py * width + px - val offset = videoBufferOffset + outIdx * 3L + // Fill entire frame with black (for letterbox/pillarbox areas) + if (isCropEncoded) { + for (i in 0 until (width * height * 3L)) { + gpu.videoBuffer[videoBufferOffset + i] = 0 + } + } - val yVal = gopY[t][outIdx] - val co = gopCo[t][outIdx] - val cg = gopCg[t][outIdx] + // Write cropped content to centered position + for (py in 0 until gopHeight) { + for (px in 0 until gopWidth) { + // Source pixel from GOP + val srcIdx = py * gopWidth + px + + val yVal = gopY[t][srcIdx] + val co = gopCo[t][srcIdx] + val cg = gopCg[t][srcIdx] // YCoCg-R to RGB conversion val tmp = yVal - (cg / 2.0f) @@ -6536,6 +6669,12 @@ class GraphicsJSR223Delegate(private val vm: VM) { val b = tmp - (co / 2.0f) val r = b + co + // Destination pixel in full frame (with centering offset) + val dstY = py + maskTop + val dstX = px + maskLeft + val dstIdx = dstY * width + dstX + val offset = videoBufferOffset + dstIdx * 3L + // Clamp and write to videoBuffer gpu.videoBuffer[offset + 0] = r.roundToInt().coerceIn(0, 255).toByte() gpu.videoBuffer[offset + 1] = g.roundToInt().coerceIn(0, 255).toByte() diff --git a/video_encoder/Makefile b/video_encoder/Makefile index 6aaecf4..d0525d6 100644 --- a/video_encoder/Makefile +++ b/video_encoder/Makefile @@ -3,8 +3,9 @@ CC = gcc CXX = g++ -CFLAGS = -std=c99 -Wall -Wextra -Ofast -D_GNU_SOURCE -CXXFLAGS = -std=c++11 -Wall -Wextra -Ofast -D_GNU_SOURCE +CFLAGS = -std=c99 -Wall -Wextra -Ofast -D_GNU_SOURCE #-fsanitize=address +CXXFLAGS = -std=c++11 -Wall -Wextra -Ofast -D_GNU_SOURCE #-fsanitize=address +DBGFLAGS = #-fsanitize=address # Zstd flags (use pkg-config if available, fallback for cross-platform compatibility) ZSTD_CFLAGS = $(shell pkg-config --cflags libzstd 2>/dev/null || echo "") @@ -33,13 +34,13 @@ tav: encoder_tav.c encoder_tad.c encoder_tav_opencv.cpp $(CC) $(CFLAGS) $(ZSTD_CFLAGS) -c encoder_tav.c -o encoder_tav.o $(CC) $(CFLAGS) $(ZSTD_CFLAGS) -c encoder_tad.c -o encoder_tad.o $(CXX) $(CXXFLAGS) $(OPENCV_CFLAGS) $(ZSTD_CFLAGS) -c encoder_tav_opencv.cpp -o encoder_tav_opencv.o - $(CXX) -o encoder_tav encoder_tav.o encoder_tad.o encoder_tav_opencv.o $(LIBS) $(OPENCV_LIBS) + $(CXX) $(DBGFLAGS) -o encoder_tav encoder_tav.o encoder_tad.o encoder_tav_opencv.o $(LIBS) $(OPENCV_LIBS) tav_decoder: decoder_tav.c decoder_tad.c decoder_tad.h rm -f decoder_tav decoder_tav.o $(CC) $(CFLAGS) $(ZSTD_CFLAGS) -DTAD_DECODER_LIB -c decoder_tad.c -o decoder_tad.o $(CC) $(CFLAGS) $(ZSTD_CFLAGS) -c decoder_tav.c -o decoder_tav.o - $(CC) -o decoder_tav decoder_tav.o decoder_tad.o $(LIBS) + $(CC) $(DBGFLAGS) -o decoder_tav decoder_tav.o decoder_tad.o $(LIBS) tav_inspector: tav_inspector.c rm -f tav_inspector @@ -50,7 +51,7 @@ encoder_tad: encoder_tad_standalone.c encoder_tad.c encoder_tad.h rm -f encoder_tad encoder_tad_standalone.o encoder_tad.o $(CC) $(CFLAGS) $(ZSTD_CFLAGS) -c encoder_tad.c -o encoder_tad.o $(CC) $(CFLAGS) $(ZSTD_CFLAGS) -c encoder_tad_standalone.c -o encoder_tad_standalone.o - $(CC) -o encoder_tad encoder_tad_standalone.o encoder_tad.o $(LIBS) + $(CC) $(DBGFLAGS) -o encoder_tad encoder_tad_standalone.o encoder_tad.o $(LIBS) decoder_tad: decoder_tad.c rm -f decoder_tad diff --git a/video_encoder/decoder_tav.c b/video_encoder/decoder_tav.c index 722192f..fc4b557 100644 --- a/video_encoder/decoder_tav.c +++ b/video_encoder/decoder_tav.c @@ -689,12 +689,19 @@ static void decode_channel_ezbc(const uint8_t *ezbc_data, size_t offset, size_t // fprintf(stderr, "[EZBC] Decoded header: MSB=%d, width=%d, height=%d (expected pixels=%d)\n", // msb_bitplane, width, height, expected_count); - if (width * height != expected_count) { - fprintf(stderr, "EZBC dimension mismatch: %dx%d != %d\n", width, height, expected_count); + // With crop encoding, dimensions can vary per frame - trust the EZBC header + // Just ensure we don't overflow the output buffer + const int actual_count = width * height; + if (actual_count > expected_count) { + fprintf(stderr, "EZBC dimension overflow: %dx%d (%d) > %d\n", + width, height, actual_count, expected_count); memset(output, 0, expected_count * sizeof(int16_t)); return; } + // If actual count is less, only decode what we need + expected_count = actual_count; + // Initialise output and state tracking memset(output, 0, expected_count * sizeof(int16_t)); int8_t *significant = calloc(expected_count, sizeof(int8_t)); @@ -785,6 +792,47 @@ static void decode_channel_ezbc(const uint8_t *ezbc_data, size_t offset, size_t // nonzero_count, 100.0 * nonzero_count / expected_count, min_val, max_val); } +// Helper: peek at EZBC header to get dimensions without decoding +static int ezbc_peek_dimensions(const uint8_t *compressed_data, int channel_layout, + int *out_width, int *out_height) { + const int has_y = (channel_layout & 0x04) == 0; + + if (!has_y) { + return -1; // Need Y channel to get dimensions + } + + // Read Y channel size header + const uint32_t size = ((uint32_t)compressed_data[0]) | + ((uint32_t)compressed_data[1] << 8) | + ((uint32_t)compressed_data[2] << 16) | + ((uint32_t)compressed_data[3] << 24); + + if (size < 6) { + return -1; // Too small to contain EZBC header + } + + // Skip to EZBC data for Y channel (after size header) + const uint8_t *ezbc_data = compressed_data + 4; + + // Read EZBC header: skip MSB bitplane (1 byte), then read width and height + // Note: EZBC uses bitstream format, but dimensions are at fixed positions + // We need to parse the bitstream header carefully + + // Create a temporary reader to parse the bitstream + ezbc_bitreader_t reader; + reader.data = ezbc_data; + reader.size = size; + reader.byte_pos = 0; + reader.bit_pos = 0; + + // Read header: MSB bitplane (8 bits), width (16 bits), height (16 bits) + ezbc_read_bits(&reader, 8); // Skip MSB bitplane + *out_width = ezbc_read_bits(&reader, 16); + *out_height = ezbc_read_bits(&reader, 16); + + return 0; +} + // EZBC postprocessing for single frames static void postprocess_coefficients_ezbc(uint8_t *compressed_data, int coeff_count, int16_t *output_y, int16_t *output_co, int16_t *output_cg, @@ -1457,15 +1505,61 @@ error_cleanup: // Postprocess GOP EZBC format to per-frame coefficients (entropyCoder=1) // Layout: [frame0_size(4)][frame0_ezbc_data][frame1_size(4)][frame1_ezbc_data]... // Note: EZBC is a complex embedded bitplane codec - this is a simplified placeholder +// Returns the actual dimensions through output parameters (for crop encoding support) static int16_t ***postprocess_gop_ezbc(const uint8_t *decompressed_data, size_t data_size, - int gop_size, int num_pixels, int channel_layout) { - // Allocate output arrays: [gop_size][3 channels][num_pixels] + int gop_size, int num_pixels, int channel_layout, + int *out_width, int *out_height) { + // First, peek at the first frame's dimensions to determine actual GOP size + // (with crop encoding, GOP dimensions may be smaller than full frame) + int actual_width = 0, actual_height = 0; + int actual_pixels = num_pixels; // Default to full frame if peek fails + + if (data_size >= 8) { // Need at least frame size header + some EZBC data + // Skip first frame's size header to get to EZBC data + const uint32_t first_frame_size = ((uint32_t)decompressed_data[0]) | + ((uint32_t)decompressed_data[1] << 8) | + ((uint32_t)decompressed_data[2] << 16) | + ((uint32_t)decompressed_data[3] << 24); + + if (4 + first_frame_size <= data_size) { + if (ezbc_peek_dimensions(decompressed_data + 4, channel_layout, + &actual_width, &actual_height) == 0) { + actual_pixels = actual_width * actual_height; + // Only log if dimensions differ significantly (crop encoding active) + // Suppress repetitive messages by using static counter + static int crop_log_count = 0; + if (actual_pixels != num_pixels && crop_log_count < 3) { + fprintf(stderr, "[GOP-EZBC] Detected crop encoding: GOP dimensions %dx%d (%d pixels) vs full frame %d pixels\n", + actual_width, actual_height, actual_pixels, num_pixels); + crop_log_count++; + if (crop_log_count == 3) { + fprintf(stderr, "[GOP-EZBC] (Further crop encoding messages suppressed)\n"); + } + } + } + } + } + + // If we didn't successfully peek dimensions, calculate from num_pixels + if (actual_width == 0 || actual_height == 0) { + // Assume square-ish dimensions - this is a fallback, should not happen with proper encoding + actual_width = (int)sqrt(num_pixels); + actual_height = num_pixels / actual_width; + actual_pixels = actual_width * actual_height; + } + + // Return actual dimensions to caller + if (out_width) *out_width = actual_width; + if (out_height) *out_height = actual_height; + + // Allocate output arrays: [gop_size][3 channels][actual_pixels] + // Use actual GOP dimensions (may be cropped) not full frame size int16_t ***output = malloc(gop_size * sizeof(int16_t **)); for (int t = 0; t < gop_size; t++) { output[t] = malloc(3 * sizeof(int16_t *)); - output[t][0] = calloc(num_pixels, sizeof(int16_t)); // Y - output[t][1] = calloc(num_pixels, sizeof(int16_t)); // Co - output[t][2] = calloc(num_pixels, sizeof(int16_t)); // Cg + output[t][0] = calloc(actual_pixels, sizeof(int16_t)); // Y + output[t][1] = calloc(actual_pixels, sizeof(int16_t)); // Co + output[t][2] = calloc(actual_pixels, sizeof(int16_t)); // Cg } int offset = 0; @@ -1491,8 +1585,9 @@ static int16_t ***postprocess_gop_ezbc(const uint8_t *decompressed_data, size_t } // Decode EZBC frame using the single-frame EZBC decoder + // Pass actual_pixels (cropped size) not num_pixels (full frame size) postprocess_coefficients_ezbc( - (uint8_t *)(decompressed_data + offset), num_pixels, + (uint8_t *)(decompressed_data + offset), actual_pixels, output[t][0], output[t][1], output[t][2], channel_layout); @@ -1622,6 +1717,12 @@ typedef struct { uint16_t screen_mask_bottom; uint16_t screen_mask_left; + // Phase 2: Decoding dimensions (may differ from full frame dimensions per GOP) + int decoding_width; // Actual encoded dimensions (cropped active region) + int decoding_height; // Updated when Screen Mask packet is encountered + // Note: Buffers are allocated at max size (header.width × header.height) + // but only decoding_width × decoding_height portion is used + // FFmpeg pipe for video only (audio from file) FILE *video_pipe; pid_t ffmpeg_pid; @@ -1844,6 +1945,10 @@ static tav_decoder_t* tav_decoder_init(const char *input_file, const char *outpu decoder->is_monoblock = (decoder->header.version >= 3 && decoder->header.version <= 6); decoder->audio_file_path = strdup(audio_file); + // Phase 2: Initialize decoding dimensions to full frame (will be updated by Screen Mask packets) + decoder->decoding_width = decoder->header.width; + decoder->decoding_height = decoder->header.height; + // Allocate buffers decoder->current_frame_rgb = calloc(decoder->frame_size * 3, 1); decoder->reference_frame_rgb = calloc(decoder->frame_size * 3, 1); @@ -1984,6 +2089,37 @@ static void tav_decoder_free(tav_decoder_t *decoder) { //============================================================================= // Fill masked regions (letterbox/pillarbox bars) with black +// Phase 2: Composite cropped frame back to full frame with black borders +static uint8_t* composite_to_full_frame(const uint8_t *cropped_rgb, + int cropped_width, int cropped_height, + int full_width, int full_height, + uint16_t top, uint16_t right, + uint16_t bottom, uint16_t left) { + // Allocate full frame buffer (filled with black) + uint8_t *full_frame = calloc(full_width * full_height * 3, sizeof(uint8_t)); + if (!full_frame) { + return NULL; + } + + // Calculate active region position in full frame + const int dest_x = left; + const int dest_y = top; + + // Copy cropped frame into active region + for (int y = 0; y < cropped_height; y++) { + for (int x = 0; x < cropped_width; x++) { + const int src_offset = (y * cropped_width + x) * 3; + const int dest_offset = ((dest_y + y) * full_width + (dest_x + x)) * 3; + + full_frame[dest_offset + 0] = cropped_rgb[src_offset + 0]; // R + full_frame[dest_offset + 1] = cropped_rgb[src_offset + 1]; // G + full_frame[dest_offset + 2] = cropped_rgb[src_offset + 2]; // B + } + } + + return full_frame; +} + static void fill_masked_regions(uint8_t *frame_rgb, int width, int height, uint16_t top, uint16_t right, uint16_t bottom, uint16_t left) { // Fill top letterbox bar @@ -2145,7 +2281,9 @@ static int decode_i_or_p_frame(tav_decoder_t *decoder, uint8_t packet_type, uint memcpy(decoder->current_frame_rgb, decoder->reference_frame_rgb, decoder->frame_size * 3); } else { // Decode coefficients (use function-level variables for proper cleanup) - int coeff_count = decoder->frame_size; + // Phase 2: Use decoding dimensions (actual encoded size) + const int decoding_pixels = decoder->decoding_width * decoder->decoding_height; + int coeff_count = decoding_pixels; quantised_y = calloc(coeff_count, sizeof(int16_t)); quantised_co = calloc(coeff_count, sizeof(int16_t)); quantised_cg = calloc(coeff_count, sizeof(int16_t)); @@ -2183,45 +2321,52 @@ static int decode_i_or_p_frame(tav_decoder_t *decoder, uint8_t packet_type, uint // fprintf(stderr, " Max quantised Y coefficient: %d\n", max_quant_y); // } + // Phase 2: Allocate temporary DWT buffers for cropped region processing + float *temp_dwt_y = calloc(decoding_pixels, sizeof(float)); + float *temp_dwt_co = calloc(decoding_pixels, sizeof(float)); + float *temp_dwt_cg = calloc(decoding_pixels, sizeof(float)); + + if (!temp_dwt_y || !temp_dwt_co || !temp_dwt_cg) { + fprintf(stderr, "Error: Failed to allocate temporary DWT buffers\n"); + free(temp_dwt_y); + free(temp_dwt_co); + free(temp_dwt_cg); + decode_success = 0; + goto write_frame; + } + // Dequantise (perceptual for versions 5-8, uniform for 1-4) + // Phase 2: Use decoding dimensions and temporary buffers const int is_perceptual = (decoder->header.version >= 5 && decoder->header.version <= 8); const int is_ezbc = (decoder->header.entropy_coder == 1); if (is_ezbc && is_perceptual) { // EZBC mode with perceptual quantisation: coefficients are normalised // Need to dequantise using perceptual weights (same as twobit-map mode) - dequantise_dwt_subbands_perceptual(0, qy, quantised_y, decoder->dwt_buffer_y, - decoder->header.width, decoder->header.height, + dequantise_dwt_subbands_perceptual(0, qy, quantised_y, temp_dwt_y, + decoder->decoding_width, decoder->decoding_height, decoder->header.decomp_levels, qy, 0, decoder->frame_count); - dequantise_dwt_subbands_perceptual(0, qy, quantised_co, decoder->dwt_buffer_co, - decoder->header.width, decoder->header.height, + dequantise_dwt_subbands_perceptual(0, qy, quantised_co, temp_dwt_co, + decoder->decoding_width, decoder->decoding_height, decoder->header.decomp_levels, qco, 1, decoder->frame_count); - dequantise_dwt_subbands_perceptual(0, qy, quantised_cg, decoder->dwt_buffer_cg, - decoder->header.width, decoder->header.height, + dequantise_dwt_subbands_perceptual(0, qy, quantised_cg, temp_dwt_cg, + decoder->decoding_width, decoder->decoding_height, decoder->header.decomp_levels, qcg, 1, decoder->frame_count); } else if (is_perceptual) { - dequantise_dwt_subbands_perceptual(0, qy, quantised_y, decoder->dwt_buffer_y, - decoder->header.width, decoder->header.height, + dequantise_dwt_subbands_perceptual(0, qy, quantised_y, temp_dwt_y, + decoder->decoding_width, decoder->decoding_height, decoder->header.decomp_levels, qy, 0, decoder->frame_count); - - // Debug: Check if values survived the function call -// if (decoder->frame_count == 32) { -// fprintf(stderr, " RIGHT AFTER dequantise_Y returns: first 5 values: %.1f %.1f %.1f %.1f %.1f\n", -// decoder->dwt_buffer_y[0], decoder->dwt_buffer_y[1], decoder->dwt_buffer_y[2], -// decoder->dwt_buffer_y[3], decoder->dwt_buffer_y[4]); -// } - - dequantise_dwt_subbands_perceptual(0, qy, quantised_co, decoder->dwt_buffer_co, - decoder->header.width, decoder->header.height, + dequantise_dwt_subbands_perceptual(0, qy, quantised_co, temp_dwt_co, + decoder->decoding_width, decoder->decoding_height, decoder->header.decomp_levels, qco, 1, decoder->frame_count); - dequantise_dwt_subbands_perceptual(0, qy, quantised_cg, decoder->dwt_buffer_cg, - decoder->header.width, decoder->header.height, + dequantise_dwt_subbands_perceptual(0, qy, quantised_cg, temp_dwt_cg, + decoder->decoding_width, decoder->decoding_height, decoder->header.decomp_levels, qcg, 1, decoder->frame_count); } else { for (int i = 0; i < coeff_count; i++) { - decoder->dwt_buffer_y[i] = quantised_y[i] * qy; - decoder->dwt_buffer_co[i] = quantised_co[i] * qco; - decoder->dwt_buffer_cg[i] = quantised_cg[i] * qcg; + temp_dwt_y[i] = quantised_y[i] * qy; + temp_dwt_co[i] = quantised_co[i] * qco; + temp_dwt_cg[i] = quantised_cg[i] * qcg; } } @@ -2253,7 +2398,8 @@ static int decode_i_or_p_frame(tav_decoder_t *decoder, uint8_t packet_type, uint // } // Remove grain synthesis from Y channel (must happen after dequantisation, before inverse DWT) - remove_grain_synthesis_decoder(decoder->dwt_buffer_y, decoder->header.width, decoder->header.height, + // Phase 2: Use decoding dimensions and temporary buffer + remove_grain_synthesis_decoder(temp_dwt_y, decoder->decoding_width, decoder->decoding_height, decoder->header.decomp_levels, decoder->frame_count, decoder->header.quantiser_y); // Debug: Check LL band AFTER grain removal @@ -2272,12 +2418,13 @@ static int decode_i_or_p_frame(tav_decoder_t *decoder, uint8_t packet_type, uint // } // Apply inverse DWT with correct non-power-of-2 dimension handling + // Phase 2: Use decoding dimensions and temporary buffers // Note: quantised arrays freed at write_frame label - apply_inverse_dwt_multilevel(decoder->dwt_buffer_y, decoder->header.width, decoder->header.height, + apply_inverse_dwt_multilevel(temp_dwt_y, decoder->decoding_width, decoder->decoding_height, decoder->header.decomp_levels, decoder->header.wavelet_filter); - apply_inverse_dwt_multilevel(decoder->dwt_buffer_co, decoder->header.width, decoder->header.height, + apply_inverse_dwt_multilevel(temp_dwt_co, decoder->decoding_width, decoder->decoding_height, decoder->header.decomp_levels, decoder->header.wavelet_filter); - apply_inverse_dwt_multilevel(decoder->dwt_buffer_cg, decoder->header.width, decoder->header.height, + apply_inverse_dwt_multilevel(temp_dwt_cg, decoder->decoding_width, decoder->decoding_height, decoder->header.decomp_levels, decoder->header.wavelet_filter); // Debug: Check spatial domain values after IDWT @@ -2301,47 +2448,67 @@ static int decode_i_or_p_frame(tav_decoder_t *decoder, uint8_t packet_type, uint // } // Handle P-frame delta accumulation (in YCoCg float space) + // TODO Phase 2: P-frame support with crop encoding needs additional work + // - Reference frames are stored at full size but delta may be at cropped size + // - Need to extract/composite reference region appropriately if (packet_type == TAV_PACKET_PFRAME && mode == TAV_MODE_DELTA) { - for (int i = 0; i < decoder->frame_size; i++) { - decoder->dwt_buffer_y[i] += decoder->reference_ycocg_y[i]; - decoder->dwt_buffer_co[i] += decoder->reference_ycocg_co[i]; - decoder->dwt_buffer_cg[i] += decoder->reference_ycocg_cg[i]; + fprintf(stderr, "Warning: P-frame delta mode not yet fully supported with crop encoding\n"); + for (int i = 0; i < decoding_pixels; i++) { + temp_dwt_y[i] += decoder->reference_ycocg_y[i]; + temp_dwt_co[i] += decoder->reference_ycocg_co[i]; + temp_dwt_cg[i] += decoder->reference_ycocg_cg[i]; } } - // Convert YCoCg-R/ICtCp to RGB - const int is_ictcp = (decoder->header.version % 2 == 0); - float max_y = -999, max_co = -999, max_cg = -999; - int max_r = 0, max_g = 0, max_b = 0; + // Phase 2: Convert cropped region to RGB, then composite to full frame + uint8_t *cropped_rgb = malloc(decoding_pixels * 3); + if (!cropped_rgb) { + fprintf(stderr, "Error: Failed to allocate cropped RGB buffer\n"); + free(temp_dwt_y); + free(temp_dwt_co); + free(temp_dwt_cg); + decode_success = 0; + goto write_frame; + } - for (int i = 0; i < decoder->frame_size; i++) { + // Convert YCoCg-R/ICtCp to RGB for cropped region + const int is_ictcp = (decoder->header.version % 2 == 0); + + for (int i = 0; i < decoding_pixels; i++) { uint8_t r, g, b; if (is_ictcp) { - ictcp_to_rgb(decoder->dwt_buffer_y[i], - decoder->dwt_buffer_co[i], - decoder->dwt_buffer_cg[i], &r, &g, &b); + ictcp_to_rgb(temp_dwt_y[i], temp_dwt_co[i], temp_dwt_cg[i], &r, &g, &b); } else { - ycocg_r_to_rgb(decoder->dwt_buffer_y[i], - decoder->dwt_buffer_co[i], - decoder->dwt_buffer_cg[i], &r, &g, &b); + ycocg_r_to_rgb(temp_dwt_y[i], temp_dwt_co[i], temp_dwt_cg[i], &r, &g, &b); } - // Track max values for debugging -// if (decoder->frame_count == 1000) { -// if (decoder->dwt_buffer_y[i] > max_y) max_y = decoder->dwt_buffer_y[i]; -// if (decoder->dwt_buffer_co[i] > max_co) max_co = decoder->dwt_buffer_co[i]; -// if (decoder->dwt_buffer_cg[i] > max_cg) max_cg = decoder->dwt_buffer_cg[i]; -// if (r > max_r) max_r = r; -// if (g > max_g) max_g = g; -// if (b > max_b) max_b = b; -// } - // RGB byte order for FFmpeg rgb24 - decoder->current_frame_rgb[i * 3 + 0] = r; - decoder->current_frame_rgb[i * 3 + 1] = g; - decoder->current_frame_rgb[i * 3 + 2] = b; + cropped_rgb[i * 3 + 0] = r; + cropped_rgb[i * 3 + 1] = g; + cropped_rgb[i * 3 + 2] = b; } + // Composite cropped frame to full frame with black borders + uint8_t *full_frame_rgb = composite_to_full_frame(cropped_rgb, + decoder->decoding_width, decoder->decoding_height, + decoder->header.width, decoder->header.height, + decoder->screen_mask_top, decoder->screen_mask_right, + decoder->screen_mask_bottom, decoder->screen_mask_left); + free(cropped_rgb); + free(temp_dwt_y); + free(temp_dwt_co); + free(temp_dwt_cg); + + if (!full_frame_rgb) { + fprintf(stderr, "Error: Failed to composite frame to full size\n"); + decode_success = 0; + goto write_frame; + } + + // Copy composited frame to decoder buffer + memcpy(decoder->current_frame_rgb, full_frame_rgb, decoder->frame_size * 3); + free(full_frame_rgb); + // if (decoder->frame_count == 1000) { // fprintf(stderr, "\n=== Frame 1000 Value Analysis ===\n"); // fprintf(stderr, "Max YCoCg values: Y=%.1f, Co=%.1f, Cg=%.1f\n", max_y, max_co, max_cg); @@ -2360,10 +2527,12 @@ static int decode_i_or_p_frame(tav_decoder_t *decoder, uint8_t packet_type, uint // fprintf(stderr, "\n"); // } - // Update reference YCoCg frame - memcpy(decoder->reference_ycocg_y, decoder->dwt_buffer_y, decoder->frame_size * sizeof(float)); - memcpy(decoder->reference_ycocg_co, decoder->dwt_buffer_co, decoder->frame_size * sizeof(float)); - memcpy(decoder->reference_ycocg_cg, decoder->dwt_buffer_cg, decoder->frame_size * sizeof(float)); + // TODO Phase 2: Reference YCoCg frame update needs rework for crop encoding + // Currently not updated because we use temporary buffers that are already freed + // P-frame support will need to store reference at appropriate dimensions + // memcpy(decoder->reference_ycocg_y, temp_dwt_y, decoding_pixels * sizeof(float)); + // memcpy(decoder->reference_ycocg_co, temp_dwt_co, decoding_pixels * sizeof(float)); + // memcpy(decoder->reference_ycocg_cg, temp_dwt_cg, decoding_pixels * sizeof(float)); } // Update reference frame @@ -2622,9 +2791,20 @@ int main(int argc, char *argv[]) { entry->bottom = bottom; entry->left = left; + // Phase 2: Update current active mask and decoding dimensions + decoder->screen_mask_top = top; + decoder->screen_mask_right = right; + decoder->screen_mask_bottom = bottom; + decoder->screen_mask_left = left; + + // Calculate new decoding dimensions (active region size) + decoder->decoding_width = decoder->header.width - left - right; + decoder->decoding_height = decoder->header.height - top - bottom; + if (verbose) { - fprintf(stderr, "Packet %d: SCREEN_MASK (0x%02X) - frame=%u top=%u right=%u bottom=%u left=%u\n", - total_packets, packet_type, frame_num, top, right, bottom, left); + fprintf(stderr, "Packet %d: SCREEN_MASK (0x%02X) - frame=%u top=%u right=%u bottom=%u left=%u (decoding: %dx%d)\n", + total_packets, packet_type, frame_num, top, right, bottom, left, + decoder->decoding_width, decoder->decoding_height); } continue; } @@ -2689,27 +2869,47 @@ int main(int argc, char *argv[]) { } // Postprocess coefficients based on entropy_coder value + // Phase 2: Use decoding dimensions (actual encoded size) for postprocessing + int decoding_pixels = decoder->decoding_width * decoder->decoding_height; + // Keep full frame size for buffer allocation const int num_pixels = decoder->header.width * decoder->header.height; int16_t ***quantised_gop; + // GOP dimensions (may differ from full frame with crop encoding) + int gop_width = decoder->decoding_width; + int gop_height = decoder->decoding_height; + if (decoder->header.entropy_coder == 2) { // RAW format: simple concatenated int16 arrays if (verbose) { - fprintf(stderr, " Using RAW postprocessing (entropy_coder=2)\n"); + fprintf(stderr, " Using RAW postprocessing (entropy_coder=2) for %dx%d (%d pixels)\n", + decoder->decoding_width, decoder->decoding_height, decoding_pixels); } quantised_gop = postprocess_gop_raw(decompressed_data, decompressed_size, gop_size, num_pixels, decoder->header.channel_layout); } else if (decoder->header.entropy_coder == 1) { // EZBC format: embedded zero-block coding if (verbose) { - fprintf(stderr, " Using EZBC postprocessing (entropy_coder=1)\n"); + fprintf(stderr, " Using EZBC postprocessing (entropy_coder=1) for %dx%d (%d pixels)\n", + decoder->decoding_width, decoder->decoding_height, decoding_pixels); } + // EZBC will return actual GOP dimensions (may be cropped with crop encoding) quantised_gop = postprocess_gop_ezbc(decompressed_data, decompressed_size, - gop_size, num_pixels, decoder->header.channel_layout); + gop_size, num_pixels, decoder->header.channel_layout, + &gop_width, &gop_height); + // Update decoding_pixels to match actual GOP dimensions + if (gop_width > 0 && gop_height > 0) { + decoding_pixels = gop_width * gop_height; + if (verbose) { + fprintf(stderr, " Actual GOP dimensions from EZBC: %dx%d (%d pixels)\n", + gop_width, gop_height, decoding_pixels); + } + } } else { // Default: Twobitmap format (entropy_coder=0) if (verbose) { - fprintf(stderr, " Using Twobitmap postprocessing (entropy_coder=0)\n"); + fprintf(stderr, " Using Twobitmap postprocessing (entropy_coder=0) for %dx%d (%d pixels)\n", + decoder->decoding_width, decoder->decoding_height, decoding_pixels); } quantised_gop = postprocess_gop_unified(decompressed_data, decompressed_size, gop_size, num_pixels, decoder->header.channel_layout); @@ -2724,14 +2924,15 @@ int main(int argc, char *argv[]) { } // Allocate GOP float buffers + // Phase 2: Allocate at decoding size (cropped region), will composite to full frame later float **gop_y = malloc(gop_size * sizeof(float *)); float **gop_co = malloc(gop_size * sizeof(float *)); float **gop_cg = malloc(gop_size * sizeof(float *)); for (int t = 0; t < gop_size; t++) { - gop_y[t] = calloc(num_pixels, sizeof(float)); - gop_co[t] = calloc(num_pixels, sizeof(float)); - gop_cg[t] = calloc(num_pixels, sizeof(float)); + gop_y[t] = calloc(decoding_pixels, sizeof(float)); + gop_co[t] = calloc(decoding_pixels, sizeof(float)); + gop_cg[t] = calloc(decoding_pixels, sizeof(float)); } // Dequantise with temporal scaling (perceptual quantisation for versions 5-8) @@ -2751,17 +2952,18 @@ int main(int argc, char *argv[]) { const float base_q_co = roundf(QLUT[decoder->header.quantiser_co] * temporal_scale); const float base_q_cg = roundf(QLUT[decoder->header.quantiser_cg] * temporal_scale); + // Phase 2: Use GOP dimensions (may be cropped) for dequantisation dequantise_dwt_subbands_perceptual(0, QLUT[decoder->header.quantiser_y], quantised_gop[t][0], gop_y[t], - decoder->header.width, decoder->header.height, + gop_width, gop_height, decoder->header.decomp_levels, base_q_y, 0, decoder->frame_count + t); dequantise_dwt_subbands_perceptual(0, QLUT[decoder->header.quantiser_y], quantised_gop[t][1], gop_co[t], - decoder->header.width, decoder->header.height, + gop_width, gop_height, decoder->header.decomp_levels, base_q_co, 1, decoder->frame_count + t); dequantise_dwt_subbands_perceptual(0, QLUT[decoder->header.quantiser_y], quantised_gop[t][2], gop_cg[t], - decoder->header.width, decoder->header.height, + gop_width, gop_height, decoder->header.decomp_levels, base_q_cg, 1, decoder->frame_count + t); if (t == 0 && verbose) { @@ -2786,21 +2988,23 @@ int main(int argc, char *argv[]) { const float base_q_cg = roundf(QLUT[decoder->header.quantiser_cg] * temporal_scale); if (is_perceptual) { + // Phase 2: Use GOP dimensions (may be cropped) for dequantisation dequantise_dwt_subbands_perceptual(0, QLUT[decoder->header.quantiser_y], quantised_gop[t][0], gop_y[t], - decoder->header.width, decoder->header.height, + gop_width, gop_height, decoder->header.decomp_levels, base_q_y, 0, decoder->frame_count + t); dequantise_dwt_subbands_perceptual(0, QLUT[decoder->header.quantiser_y], quantised_gop[t][1], gop_co[t], - decoder->header.width, decoder->header.height, + gop_width, gop_height, decoder->header.decomp_levels, base_q_co, 1, decoder->frame_count + t); dequantise_dwt_subbands_perceptual(0, QLUT[decoder->header.quantiser_y], quantised_gop[t][2], gop_cg[t], - decoder->header.width, decoder->header.height, + gop_width, gop_height, decoder->header.decomp_levels, base_q_cg, 1, decoder->frame_count + t); } else { // Uniform quantisation for older versions - for (int i = 0; i < num_pixels; i++) { + // Phase 2: Use decoding_pixels for uniform dequantisation + for (int i = 0; i < decoding_pixels; i++) { gop_y[t][i] = quantised_gop[t][0][i] * base_q_y; gop_co[t][i] = quantised_gop[t][1][i] * base_q_co; gop_cg[t][i] = quantised_gop[t][2][i] * base_q_cg; @@ -2819,14 +3023,16 @@ int main(int argc, char *argv[]) { free(quantised_gop); + // Phase 2: Use GOP dimensions (may be cropped) for grain removal for (int t = 0; t < gop_size; t++) { - remove_grain_synthesis_decoder(gop_y[t], decoder->header.width, decoder->header.height, + remove_grain_synthesis_decoder(gop_y[t], gop_width, gop_height, decoder->header.decomp_levels, decoder->frame_count + t, decoder->header.quantiser_y); } // Apply inverse 3D DWT (spatial + temporal) - apply_inverse_3d_dwt(gop_y, gop_co, gop_cg, decoder->header.width, decoder->header.height, + // Phase 2: Use GOP dimensions (may be cropped) for inverse DWT + apply_inverse_3d_dwt(gop_y, gop_co, gop_cg, gop_width, gop_height, gop_size, decoder->header.decomp_levels, temporal_levels, decoder->header.wavelet_filter); @@ -2859,35 +3065,78 @@ int main(int argc, char *argv[]) { // (size_t)decoder->frame_size * 3, decoder->header.width * decoder->header.height * 3); // } + // Calculate consistent screen mask offsets for crop-encoded GOPs + // When crop encoding is active, all frames in GOP use same dimensions + const int is_crop_encoded = (gop_width != decoder->header.width || + gop_height != decoder->header.height); + uint16_t gop_mask_top = 0, gop_mask_bottom = 0, gop_mask_left = 0, gop_mask_right = 0; + + if (is_crop_encoded) { + // Center the cropped region in the full frame + if (gop_height < decoder->header.height) { + gop_mask_top = (decoder->header.height - gop_height) / 2; + gop_mask_bottom = decoder->header.height - gop_height - gop_mask_top; + } + if (gop_width < decoder->header.width) { + gop_mask_left = (decoder->header.width - gop_width) / 2; + gop_mask_right = decoder->header.width - gop_width - gop_mask_left; + } + if (verbose && decoder->frame_count == 0) { + fprintf(stderr, "[GOP-Crop] Centering %dx%d in %dx%d: top=%u, bottom=%u, left=%u, right=%u\n", + gop_width, gop_height, decoder->header.width, decoder->header.height, + gop_mask_top, gop_mask_bottom, gop_mask_left, gop_mask_right); + } + } + for (int t = 0; t < gop_size; t++) { - // Allocate frame buffer - uint8_t *frame_rgb = malloc(decoder->frame_size * 3); - if (!frame_rgb) { - fprintf(stderr, "Error: Failed to allocate GOP frame buffer\n"); + // Update screen mask only if NOT crop-encoded + // Crop-encoded GOPs use consistent offsets calculated above + if (!is_crop_encoded) { + update_screen_mask(decoder, decoder->frame_count + t); + } + + // Phase 2: Convert cropped region to RGB, then composite to full frame + uint8_t *cropped_rgb = malloc(decoding_pixels * 3); + if (!cropped_rgb) { + fprintf(stderr, "Error: Failed to allocate cropped GOP frame buffer\n"); result = -1; break; } - // Convert to RGB - for (int i = 0; i < decoder->frame_size; i++) { + // Convert cropped region to RGB + for (int i = 0; i < decoding_pixels; i++) { uint8_t r, g, b; if (is_ictcp) { ictcp_to_rgb(gop_y[t][i], gop_co[t][i], gop_cg[t][i], &r, &g, &b); } else { ycocg_r_to_rgb(gop_y[t][i], gop_co[t][i], gop_cg[t][i], &r, &g, &b); } - frame_rgb[i * 3 + 0] = r; - frame_rgb[i * 3 + 1] = g; - frame_rgb[i * 3 + 2] = b; + cropped_rgb[i * 3 + 0] = r; + cropped_rgb[i * 3 + 1] = g; + cropped_rgb[i * 3 + 2] = b; } - // Update active screen mask for this GOP frame - update_screen_mask(decoder, decoder->frame_count + t); + // Composite cropped frame to full frame with black borders + // Use GOP-consistent offsets for crop-encoded, or per-frame offsets otherwise + const uint16_t mask_top = is_crop_encoded ? gop_mask_top : decoder->screen_mask_top; + const uint16_t mask_bottom = is_crop_encoded ? gop_mask_bottom : decoder->screen_mask_bottom; + const uint16_t mask_left = is_crop_encoded ? gop_mask_left : decoder->screen_mask_left; + const uint16_t mask_right = is_crop_encoded ? gop_mask_right : decoder->screen_mask_right; - // Fill masked regions with black (letterbox/pillarbox bars) - fill_masked_regions(frame_rgb, decoder->header.width, decoder->header.height, - decoder->screen_mask_top, decoder->screen_mask_right, - decoder->screen_mask_bottom, decoder->screen_mask_left); + uint8_t *frame_rgb = composite_to_full_frame(cropped_rgb, + gop_width, gop_height, + decoder->header.width, decoder->header.height, + mask_top, mask_right, mask_bottom, mask_left); + free(cropped_rgb); + + if (!frame_rgb) { + fprintf(stderr, "Error: Failed to composite GOP frame to full size\n"); + result = -1; + break; + } + + // Note: Phase 1 fill_masked_regions() is now replaced by Phase 2 composite function + // which places the decoded cropped frame into a full-frame buffer with black borders // Write frame to FFmpeg video pipe const size_t bytes_to_write = decoder->frame_size * 3; diff --git a/video_encoder/encoder_tav.c b/video_encoder/encoder_tav.c index 70694c7..bc70d94 100644 --- a/video_encoder/encoder_tav.c +++ b/video_encoder/encoder_tav.c @@ -214,6 +214,17 @@ typedef struct gop_boundary { int start_frame; int end_frame; int num_frames; + + // Phase 2: GOP-level geometry tracking for crop encoding + int max_active_width; // Maximum active width across all frames in this GOP + int max_active_height; // Maximum active height across all frames in this GOP + uint16_t mask_top; // Representative mask geometry for this GOP + uint16_t mask_right; // (uses geometry from first frame, or common geometry) + uint16_t mask_bottom; + uint16_t mask_left; + int geometry_changes; // Count of geometry changes within this GOP + int enable_crop_encoding; // 1 if crop encoding active for this GOP + struct gop_boundary *next; } gop_boundary_t; @@ -616,8 +627,11 @@ static size_t encode_channel_ezbc(int16_t *coeffs, size_t count, int width, int total_sign_bits_written, total_refinement_bits_written); } + // Free all queues (including the last next_* queues created in final iteration) queue_free(&insignificant_queue); queue_free(&significant_queue); + queue_free(&next_insignificant); + queue_free(&next_significant); free(states); size_t final_size = bitstream_size(&bs); @@ -1813,6 +1827,16 @@ typedef struct tav_encoder_s { int pcm8_audio; // 1 = use 8-bit PCM audio (packet 0x21), 0 = use MP2 (default) int tad_audio; // 1 = use TAD audio (packet 0x24), 0 = use MP2/PCM8 (default, quality follows quality_level) int enable_letterbox_detect; // 1 = detect and emit letterbox/pillarbox packets (default), 0 = disable + int enable_crop_encoding; // 1 = encode cropped active region only (Phase 2), 0 = encode full frame (default) + + // Active region tracking (for Phase 2 crop encoding) + uint16_t active_mask_top, active_mask_right, active_mask_bottom, active_mask_left; + int active_width, active_height; // Dimensions of active region (width - left - right, height - top - bottom) + + // Encoding dimensions (active region when crop encoding, full frame otherwise) + // IMPORTANT: width and height are ALWAYS full frame dimensions (never change them!) + // encoding_width and encoding_height are the dimensions actually encoded + int encoding_width, encoding_height; // Frame buffers - ping-pong implementation uint8_t *frame_rgb[2]; // [0] and [1] alternate between current and previous @@ -1830,6 +1854,8 @@ typedef struct tav_encoder_s { int enable_temporal_dwt; // Flag to enable temporal DWT (default: 0 for backward compatibility) int temporal_gop_capacity; // Maximum GOP size (typically 16) int temporal_gop_frame_count; // Current number of frames accumulated in GOP + int temporal_gop_width; // Width of frames in current GOP (for crop encoding) + int temporal_gop_height; // Height of frames in current GOP (for crop encoding) uint8_t **temporal_gop_rgb_frames; // [frame][pixel*3] - RGB data for each GOP frame float **temporal_gop_y_frames; // [frame][pixel] - Y channel for each GOP frame float **temporal_gop_co_frames; // [frame][pixel] - Co channel for each GOP frame @@ -2429,11 +2455,26 @@ static tav_encoder_t* create_encoder(void) { enc->pcm8_audio = 0; // Default: use MP2 audio enc->tad_audio = 0; // Default: use MP2 audio (TAD quality follows quality_level) enc->enable_letterbox_detect = 1; // Default: enable letterbox/pillarbox detection + enc->enable_crop_encoding = 0; // Default: disabled (Phase 2 experimental) + + // Active region tracking (initialized to full frame, updated when crop encoding enabled) + enc->active_mask_top = 0; + enc->active_mask_right = 0; + enc->active_mask_bottom = 0; + enc->active_mask_left = 0; + enc->active_width = 0; // Will be set when first frame is processed + enc->active_height = 0; // Will be set when first frame is processed + + // Encoding dimensions (default to full frame, updated per-frame when crop encoding enabled) + enc->encoding_width = enc->width; + enc->encoding_height = enc->height; // GOP / temporal DWT settings enc->enable_temporal_dwt = 1; // Mutually exclusive with use_delta_encoding enc->temporal_gop_capacity = TEMPORAL_GOP_SIZE; // 24 frames enc->temporal_gop_frame_count = 0; + enc->temporal_gop_width = 0; // Will be set when first frame is added to GOP + enc->temporal_gop_height = 0; // Will be set when first frame is added to GOP enc->temporal_decomp_levels = TEMPORAL_DECOMP_LEVEL; // 3 levels of temporal DWT (24 -> 12 -> 6 -> 3 temporal subbands) enc->temporal_gop_rgb_frames = NULL; enc->temporal_gop_y_frames = NULL; @@ -3922,9 +3963,10 @@ static size_t encode_pframe_residual(tav_encoder_t *enc, int qY) { } // Step 6: Preprocess coefficients (significance map compression) + // Phase 2: Use encoding dimensions (actual encoded size) int total_coeffs = frame_size * 3; // Y + Co + Cg uint8_t *preprocessed = malloc(total_coeffs * sizeof(int16_t) + 1024); // Extra space for map - size_t preprocessed_size = preprocess_coefficients_variable_layout(enc->preprocess_mode, enc->width, enc->height, + size_t preprocessed_size = preprocess_coefficients_variable_layout(enc->preprocess_mode, enc->encoding_width, enc->encoding_height, quantised_y, quantised_co, quantised_cg, NULL, frame_size, enc->channel_layout, preprocessed); @@ -4216,9 +4258,10 @@ static size_t encode_pframe_adaptive(tav_encoder_t *enc, int qY) { enc->decomp_levels, 1, 0); // Step 8: Preprocess coefficients + // Phase 2: Use encoding dimensions (actual encoded size) int total_coeffs = frame_size * 3; uint8_t *preprocessed = malloc(total_coeffs * sizeof(int16_t) + 1024); - size_t preprocessed_size = preprocess_coefficients_variable_layout(enc->preprocess_mode, enc->width, enc->height, + size_t preprocessed_size = preprocess_coefficients_variable_layout(enc->preprocess_mode, enc->encoding_width, enc->encoding_height, quantised_y, quantised_co, quantised_cg, NULL, frame_size, enc->channel_layout, preprocessed); @@ -4449,9 +4492,10 @@ static size_t encode_bframe_adaptive(tav_encoder_t *enc, int qY) { enc->decomp_levels, 1, 0); // Step 8: Preprocess coefficients + // Phase 2: Use encoding dimensions (actual encoded size) int total_coeffs = frame_size * 3; uint8_t *preprocessed = malloc(total_coeffs * sizeof(int16_t) + 1024); - size_t preprocessed_size = preprocess_coefficients_variable_layout(enc->preprocess_mode, enc->width, enc->height, + size_t preprocessed_size = preprocess_coefficients_variable_layout(enc->preprocess_mode, enc->encoding_width, enc->encoding_height, quantised_y, quantised_co, quantised_cg, NULL, frame_size, enc->channel_layout, preprocessed); @@ -4532,21 +4576,48 @@ static size_t encode_bframe_adaptive(tav_encoder_t *enc, int qY) { // Add frame to GOP buffer // Returns 0 on success, -1 on error static int temporal_gop_add_frame(tav_encoder_t *enc, const uint8_t *frame_rgb, - const float *frame_y, const float *frame_co, const float *frame_cg) { + const float *frame_y, const float *frame_co, const float *frame_cg, + int width, int height) { if (!enc->enable_temporal_dwt || enc->temporal_gop_frame_count >= enc->temporal_gop_capacity) { return -1; } int frame_idx = enc->temporal_gop_frame_count; - size_t frame_rgb_size = enc->width * enc->height * 3; - size_t frame_channel_size = enc->width * enc->height * sizeof(float); - // Copy frame data to GOP buffers + // On first frame, store GOP dimensions (all frames in GOP must have same dimensions) + if (frame_idx == 0) { + enc->temporal_gop_width = width; + enc->temporal_gop_height = height; + } + + // Verify all frames in GOP have consistent dimensions + if (width != enc->temporal_gop_width || height != enc->temporal_gop_height) { + fprintf(stderr, "Error: GOP dimension mismatch - frame %d is %dx%d but GOP is %dx%d\n", + frame_idx, width, height, enc->temporal_gop_width, enc->temporal_gop_height); + return -1; + } + + size_t frame_rgb_size = width * height * 3; + size_t frame_channel_size = width * height * sizeof(float); + + // Debug logging to catch buffer overflows + if (enc->verbose) { + fprintf(stderr, "[temporal_gop_add_frame] Frame %d: copying %dx%d (%zu bytes RGB, %zu bytes per channel)\n", + frame_idx, width, height, frame_rgb_size, frame_channel_size); + fprintf(stderr, " GOP dimensions: %dx%d, buffer was allocated for full frame: %dx%d\n", + enc->temporal_gop_width, enc->temporal_gop_height, enc->width, enc->height); + } + + // Copy frame data to GOP buffers (only actual data, not full frame if cropped) memcpy(enc->temporal_gop_rgb_frames[frame_idx], frame_rgb, frame_rgb_size); memcpy(enc->temporal_gop_y_frames[frame_idx], frame_y, frame_channel_size); memcpy(enc->temporal_gop_co_frames[frame_idx], frame_co, frame_channel_size); memcpy(enc->temporal_gop_cg_frames[frame_idx], frame_cg, frame_channel_size); + if (enc->verbose) { + fprintf(stderr, "[temporal_gop_add_frame] Frame %d: memcpy completed successfully\n", frame_idx); + } + // Compute block-based motion estimation if MC-EZBC is enabled if (enc->temporal_enable_mcezbc && frame_idx > 0) { // Compute forward motion vectors (F[i-1] → F[i]) using optical flow @@ -4554,7 +4625,7 @@ static int temporal_gop_add_frame(tav_encoder_t *enc, const uint8_t *frame_rgb, estimate_optical_flow_motion( enc->temporal_gop_y_frames[frame_idx], // Current frame Y channel enc->temporal_gop_y_frames[frame_idx - 1], // Reference frame Y channel - enc->width, enc->height, + width, height, // Use actual GOP dimensions (may be cropped) enc->temporal_block_size, enc->temporal_gop_mvs_fwd_x[frame_idx], // Output: forward MVs X (1/4-pixel units) enc->temporal_gop_mvs_fwd_y[frame_idx] // Output: forward MVs Y (1/4-pixel units) @@ -4604,6 +4675,8 @@ static int gop_is_full(const tav_encoder_t *enc) { // Reset GOP buffer static void gop_reset(tav_encoder_t *enc) { enc->temporal_gop_frame_count = 0; + enc->temporal_gop_width = 0; + enc->temporal_gop_height = 0; } // Check if GOP should be flushed based on pre-computed boundaries (two-pass mode) @@ -4634,8 +4707,30 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser, return 0; } + // Validate and debug GOP dimensions + if (enc->verbose) { + fprintf(stderr, "[gop_flush] DEBUG: GOP dimensions %dx%d, actual_gop_size=%d, capacity=%d\n", + enc->temporal_gop_width, enc->temporal_gop_height, actual_gop_size, enc->temporal_gop_capacity); + } + + if (enc->temporal_gop_width <= 0 || enc->temporal_gop_height <= 0) { + fprintf(stderr, "Error: Invalid GOP dimensions: %dx%d (GOP has %d frames)\n", + enc->temporal_gop_width, enc->temporal_gop_height, actual_gop_size); + fprintf(stderr, "This suggests frames were not added to GOP properly. Falling back to frame dimensions.\n"); + enc->temporal_gop_width = enc->width; + enc->temporal_gop_height = enc->height; + } + // Allocate working buffers for each channel - int num_pixels = enc->width * enc->height; // Will be updated if frames are cropped + // Phase 2: Use stored GOP dimensions (set when first frame was added to GOP) + // These are the actual dimensions of data in GOP buffers (cropped if crop encoding was active) + int num_pixels = enc->temporal_gop_width * enc->temporal_gop_height; + + if (enc->verbose) { + fprintf(stderr, "[gop_flush] Allocating %d frames × %d pixels = %zu total floats per channel\n", + actual_gop_size, num_pixels, (size_t)actual_gop_size * num_pixels); + } + float **gop_y_coeffs = malloc(actual_gop_size * sizeof(float*)); float **gop_co_coeffs = malloc(actual_gop_size * sizeof(float*)); float **gop_cg_coeffs = malloc(actual_gop_size * sizeof(float*)); @@ -4645,12 +4740,20 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser, gop_co_coeffs[i] = malloc(num_pixels * sizeof(float)); gop_cg_coeffs[i] = malloc(num_pixels * sizeof(float)); + if (enc->verbose && i == 0) { + fprintf(stderr, "[gop_flush] Allocated coefficient buffers, now copying frame data...\n"); + } + // Copy GOP frame data to working buffers memcpy(gop_y_coeffs[i], enc->temporal_gop_y_frames[i], num_pixels * sizeof(float)); memcpy(gop_co_coeffs[i], enc->temporal_gop_co_frames[i], num_pixels * sizeof(float)); memcpy(gop_cg_coeffs[i], enc->temporal_gop_cg_frames[i], num_pixels * sizeof(float)); } + if (enc->verbose) { + fprintf(stderr, "[gop_flush] Frame data copied successfully, proceeding to DWT...\n"); + } + // Step 0.6: Motion compensation note // For MC-EZBC: MC-lifting integrates motion compensation directly into the lifting steps // For translation: still use pre-alignment (old method for backwards compatibility) @@ -4664,29 +4767,72 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser, // Step 1: For single-frame GOP, skip temporal DWT and use traditional I-frame path if (actual_gop_size == 1) { + if (enc->verbose) { + fprintf(stderr, "[gop_flush] Single-frame GOP, applying 2D spatial DWT only\n"); + } // Apply only 2D spatial DWT (no temporal transform for single frame) - // Use cropped dimensions (will be full size if no motion) - dwt_2d_forward_flexible(enc, gop_y_coeffs[0], enc->width, enc->height, enc->decomp_levels, enc->wavelet_filter); - dwt_2d_forward_flexible(enc, gop_co_coeffs[0], enc->width, enc->height, enc->decomp_levels, enc->wavelet_filter); - dwt_2d_forward_flexible(enc, gop_cg_coeffs[0], enc->width, enc->height, enc->decomp_levels, enc->wavelet_filter); + // Phase 2: Use stored GOP dimensions (actual data size in buffers) + dwt_2d_forward_flexible(enc, gop_y_coeffs[0], enc->temporal_gop_width, enc->temporal_gop_height, enc->decomp_levels, enc->wavelet_filter); + dwt_2d_forward_flexible(enc, gop_co_coeffs[0], enc->temporal_gop_width, enc->temporal_gop_height, enc->decomp_levels, enc->wavelet_filter); + dwt_2d_forward_flexible(enc, gop_cg_coeffs[0], enc->temporal_gop_width, enc->temporal_gop_height, enc->decomp_levels, enc->wavelet_filter); } else { // Multi-frame GOP: Apply 3D DWT (temporal + spatial) to each channel // Note: This modifies gop_*_coeffs in-place // Use cropped dimensions to encode only the valid region if (enc->temporal_enable_mcezbc) { + if (enc->verbose) { + fprintf(stderr, "[gop_flush] Multi-frame GOP (size=%d), applying 3D DWT with MC-EZBC\n", actual_gop_size); + } // Use MC-EZBC lifting: motion compensation integrated into lifting steps dwt_3d_forward_mc(enc, gop_y_coeffs, gop_co_coeffs, gop_cg_coeffs, actual_gop_size, enc->decomp_levels, enc->temporal_decomp_levels, enc->wavelet_filter); } else { + if (enc->verbose) { + fprintf(stderr, "[gop_flush] Multi-frame GOP (size=%d), applying traditional 3D DWT\n", actual_gop_size); + } // Use traditional 3D DWT with pre-aligned frames (translation-only) - dwt_3d_forward(enc, gop_y_coeffs, enc->width, enc->height, actual_gop_size, + // Phase 2: Use stored GOP dimensions (actual data size in buffers) + + // CRITICAL FIX: Temporarily override enc->widths/heights arrays for cropped dimensions + // dwt_2d_forward_flexible() uses these arrays, which were initialized with full frame dimensions + // Save original arrays + int array_size = enc->decomp_levels + 2; + int *saved_widths = malloc(array_size * sizeof(int)); + int *saved_heights = malloc(array_size * sizeof(int)); + memcpy(saved_widths, enc->widths, array_size * sizeof(int)); + memcpy(saved_heights, enc->heights, array_size * sizeof(int)); + + // Recalculate for cropped dimensions + enc->widths[0] = enc->temporal_gop_width; + enc->heights[0] = enc->temporal_gop_height; + for (int i = 1; i < array_size; i++) { + enc->widths[i] = (enc->widths[i - 1] + 1) / 2; + enc->heights[i] = (enc->heights[i - 1] + 1) / 2; + } + + if (enc->verbose) { + fprintf(stderr, "[gop_flush] Recalculated dimension arrays for cropped size: level 0 = %dx%d\n", + enc->widths[0], enc->heights[0]); + } + + dwt_3d_forward(enc, gop_y_coeffs, enc->temporal_gop_width, enc->temporal_gop_height, actual_gop_size, enc->decomp_levels, enc->temporal_decomp_levels, enc->wavelet_filter); - dwt_3d_forward(enc, gop_co_coeffs, enc->width, enc->height, actual_gop_size, + dwt_3d_forward(enc, gop_co_coeffs, enc->temporal_gop_width, enc->temporal_gop_height, actual_gop_size, enc->decomp_levels, enc->temporal_decomp_levels, enc->wavelet_filter); - dwt_3d_forward(enc, gop_cg_coeffs, enc->width, enc->height, actual_gop_size, + dwt_3d_forward(enc, gop_cg_coeffs, enc->temporal_gop_width, enc->temporal_gop_height, actual_gop_size, enc->decomp_levels, enc->temporal_decomp_levels, enc->wavelet_filter); + + // Restore original arrays + memcpy(enc->widths, saved_widths, array_size * sizeof(int)); + memcpy(enc->heights, saved_heights, array_size * sizeof(int)); + free(saved_widths); + free(saved_heights); + + if (enc->verbose) { + fprintf(stderr, "[gop_flush] 3D DWT completed, restored original dimension arrays\n"); + } } } @@ -4744,9 +4890,11 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser, fwrite(&packet_type, 1, 1, output); total_bytes_written += 1; - // Allocate buffer for uncompressed tile data - // Use same format as compress_and_write_frame: serialise_tile_data - const size_t max_tile_size = 4 + (num_pixels * 3 * sizeof(int16_t)); + // Allocate buffer for tile data (with EZBC worst-case overhead) + // EZBC can produce more output than raw data due to headers and encoding metadata + // Worst case: raw data + headers (5 bytes/channel) + significance flags (1 bit/coeff) + // Use 3x raw size as safe upper bound to account for all EZBC overhead + const size_t max_tile_size = 4 + (num_pixels * 3 * sizeof(int16_t) * 3); uint8_t *uncompressed_buffer = malloc(max_tile_size); // Use serialise_tile_data with DWT-transformed float coefficients (before quantisation) @@ -4907,9 +5055,11 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser, (num_pixels * actual_gop_size * 3 * sizeof(int16_t)); uint8_t *preprocessed_buffer = malloc(max_preprocessed_size); + // CRITICAL: Use GOP dimensions (cropped if crop encoding active), not full frame dimensions + // The coefficient buffers (quant_y/co/cg) are sized for temporal_gop_width×height size_t preprocessed_size = preprocess_gop_unified( enc->preprocess_mode, quant_y, quant_co, quant_cg, - actual_gop_size, num_pixels, enc->width, enc->height, enc->channel_layout, + actual_gop_size, num_pixels, enc->temporal_gop_width, enc->temporal_gop_height, enc->channel_layout, preprocessed_buffer); // Compress entire GOP with Zstd (single compression for all frames) @@ -5004,12 +5154,13 @@ static size_t gop_process_and_flush(tav_encoder_t *enc, FILE *output, int base_q if (!force_flush && !enc->two_pass_mode) { for (int i = 1; i < enc->temporal_gop_frame_count; i++) { // Compare consecutive frames using unified scene change detection + // Phase 2: Use stored GOP dimensions (actual data size in buffers) double avg_diff, changed_ratio; int is_scene_change = detect_scene_change_between_frames( enc->temporal_gop_rgb_frames[i - 1], enc->temporal_gop_rgb_frames[i], - enc->width, - enc->height, + enc->temporal_gop_width, + enc->temporal_gop_height, &avg_diff, &changed_ratio ); @@ -6505,9 +6656,16 @@ static size_t serialise_tile_data(tav_encoder_t *enc, int tile_x, int tile_y, } // Quantise and serialise DWT coefficients + // tile_size: actual number of coefficients to process (varies with crop encoding) const int tile_size = enc->monoblock ? - (enc->width * enc->height) : // Monoblock mode: full frame + (enc->temporal_gop_width * enc->temporal_gop_height) : // Monoblock mode: use stored GOP dimensions (PADDED_TILE_SIZE_X * PADDED_TILE_SIZE_Y); // Standard mode: padded tiles + + // tile_stride: buffer stride for previous_coeffs indexing (constant, matches allocation) + const int tile_stride = enc->monoblock ? + (enc->width * enc->height) : // Monoblock mode: always use full frame dimensions for stride + (PADDED_TILE_SIZE_X * PADDED_TILE_SIZE_Y); // Standard mode: padded tiles + // OPTIMISATION: Use pre-allocated buffers instead of malloc/free per tile int16_t *quantised_y = enc->reusable_quantised_y; int16_t *quantised_co = enc->reusable_quantised_co; @@ -6530,9 +6688,9 @@ static size_t serialise_tile_data(tav_encoder_t *enc, int tile_x, int tile_y, if (enc->preprocess_mode == PREPROCESS_EZBC) { // EZBC mode: Quantise with perceptual weighting but no normalisation (division by quantiser) // fprintf(stderr, "[EZBC-QUANT-INTRA] Using perceptual quantisation without normalisation\n"); - quantise_dwt_coefficients_perceptual_per_coeff_no_normalisation(enc, (float*)tile_y_data, quantised_y, tile_size, this_frame_qY, enc->width, enc->height, enc->decomp_levels, 0, enc->frame_count); - quantise_dwt_coefficients_perceptual_per_coeff_no_normalisation(enc, (float*)tile_co_data, quantised_co, tile_size, this_frame_qCo, enc->width, enc->height, enc->decomp_levels, 1, enc->frame_count); - quantise_dwt_coefficients_perceptual_per_coeff_no_normalisation(enc, (float*)tile_cg_data, quantised_cg, tile_size, this_frame_qCg, enc->width, enc->height, enc->decomp_levels, 1, enc->frame_count); + quantise_dwt_coefficients_perceptual_per_coeff_no_normalisation(enc, (float*)tile_y_data, quantised_y, tile_size, this_frame_qY, enc->temporal_gop_width, enc->temporal_gop_height, enc->decomp_levels, 0, enc->frame_count); + quantise_dwt_coefficients_perceptual_per_coeff_no_normalisation(enc, (float*)tile_co_data, quantised_co, tile_size, this_frame_qCo, enc->temporal_gop_width, enc->temporal_gop_height, enc->decomp_levels, 1, enc->frame_count); + quantise_dwt_coefficients_perceptual_per_coeff_no_normalisation(enc, (float*)tile_cg_data, quantised_cg, tile_size, this_frame_qCg, enc->temporal_gop_width, enc->temporal_gop_height, enc->decomp_levels, 1, enc->frame_count); // Print max abs for debug int max_y = 0, max_co = 0, max_cg = 0; @@ -6544,21 +6702,21 @@ static size_t serialise_tile_data(tav_encoder_t *enc, int tile_x, int tile_y, // fprintf(stderr, "[EZBC-QUANT-INTRA] Quantised coeff max: Y=%d, Co=%d, Cg=%d\n", max_y, max_co, max_cg); } else if (enc->perceptual_tuning) { // Perceptual quantisation: EXACTLY like uniform but with per-coefficient weights - quantise_dwt_coefficients_perceptual_per_coeff(enc, (float*)tile_y_data, quantised_y, tile_size, this_frame_qY, enc->width, enc->height, enc->decomp_levels, 0, enc->frame_count); - quantise_dwt_coefficients_perceptual_per_coeff(enc, (float*)tile_co_data, quantised_co, tile_size, this_frame_qCo, enc->width, enc->height, enc->decomp_levels, 1, enc->frame_count); - quantise_dwt_coefficients_perceptual_per_coeff(enc, (float*)tile_cg_data, quantised_cg, tile_size, this_frame_qCg, enc->width, enc->height, enc->decomp_levels, 1, enc->frame_count); + quantise_dwt_coefficients_perceptual_per_coeff(enc, (float*)tile_y_data, quantised_y, tile_size, this_frame_qY, enc->temporal_gop_width, enc->temporal_gop_height, enc->decomp_levels, 0, enc->frame_count); + quantise_dwt_coefficients_perceptual_per_coeff(enc, (float*)tile_co_data, quantised_co, tile_size, this_frame_qCo, enc->temporal_gop_width, enc->temporal_gop_height, enc->decomp_levels, 1, enc->frame_count); + quantise_dwt_coefficients_perceptual_per_coeff(enc, (float*)tile_cg_data, quantised_cg, tile_size, this_frame_qCg, enc->temporal_gop_width, enc->temporal_gop_height, enc->decomp_levels, 1, enc->frame_count); } else { // Legacy uniform quantisation - quantise_dwt_coefficients((float*)tile_y_data, quantised_y, tile_size, this_frame_qY, enc->dead_zone_threshold, enc->width, enc->height, enc->decomp_levels, 0); - quantise_dwt_coefficients((float*)tile_co_data, quantised_co, tile_size, this_frame_qCo, enc->dead_zone_threshold, enc->width, enc->height, enc->decomp_levels, 1); - quantise_dwt_coefficients((float*)tile_cg_data, quantised_cg, tile_size, this_frame_qCg, enc->dead_zone_threshold, enc->width, enc->height, enc->decomp_levels, 1); + quantise_dwt_coefficients((float*)tile_y_data, quantised_y, tile_size, this_frame_qY, enc->dead_zone_threshold, enc->temporal_gop_width, enc->temporal_gop_height, enc->decomp_levels, 0); + quantise_dwt_coefficients((float*)tile_co_data, quantised_co, tile_size, this_frame_qCo, enc->dead_zone_threshold, enc->temporal_gop_width, enc->temporal_gop_height, enc->decomp_levels, 1); + quantise_dwt_coefficients((float*)tile_cg_data, quantised_cg, tile_size, this_frame_qCg, enc->dead_zone_threshold, enc->temporal_gop_width, enc->temporal_gop_height, enc->decomp_levels, 1); } // Store current coefficients for future delta reference int tile_idx = tile_y * enc->tiles_x + tile_x; - float *prev_y = enc->previous_coeffs_y + (tile_idx * tile_size); - float *prev_co = enc->previous_coeffs_co + (tile_idx * tile_size); - float *prev_cg = enc->previous_coeffs_cg + (tile_idx * tile_size); + float *prev_y = enc->previous_coeffs_y + (tile_idx * tile_stride); + float *prev_co = enc->previous_coeffs_co + (tile_idx * tile_stride); + float *prev_cg = enc->previous_coeffs_cg + (tile_idx * tile_stride); memcpy(prev_y, tile_y_data, tile_size * sizeof(float)); memcpy(prev_co, tile_co_data, tile_size * sizeof(float)); memcpy(prev_cg, tile_cg_data, tile_size * sizeof(float)); @@ -6566,9 +6724,9 @@ static size_t serialise_tile_data(tav_encoder_t *enc, int tile_x, int tile_y, } else if (mode == TAV_MODE_DELTA) { // DELTA mode: compute coefficient deltas and quantise them int tile_idx = tile_y * enc->tiles_x + tile_x; - float *prev_y = enc->previous_coeffs_y + (tile_idx * tile_size); - float *prev_co = enc->previous_coeffs_co + (tile_idx * tile_size); - float *prev_cg = enc->previous_coeffs_cg + (tile_idx * tile_size); + float *prev_y = enc->previous_coeffs_y + (tile_idx * tile_stride); + float *prev_co = enc->previous_coeffs_co + (tile_idx * tile_stride); + float *prev_cg = enc->previous_coeffs_cg + (tile_idx * tile_stride); // Compute deltas: delta = current - previous float *delta_y = malloc(tile_size * sizeof(float)); @@ -6616,8 +6774,8 @@ static size_t serialise_tile_data(tav_encoder_t *enc, int tile_x, int tile_y, if (enc->delta_haar_levels > 0) { int tile_width, tile_height; if (enc->monoblock) { - tile_width = enc->width; - tile_height = enc->height; + tile_width = enc->temporal_gop_width; + tile_height = enc->temporal_gop_height; } else { tile_width = PADDED_TILE_SIZE_X; tile_height = PADDED_TILE_SIZE_Y; @@ -6649,7 +6807,8 @@ static size_t serialise_tile_data(tav_encoder_t *enc, int tile_x, int tile_y, }*/ // Preprocess and write quantised coefficients using variable channel layout concatenated significance maps - size_t total_compressed_size = preprocess_coefficients_variable_layout(enc->preprocess_mode, enc->width, enc->height, + // Phase 2: Use stored GOP dimensions (actual data size in buffers) + size_t total_compressed_size = preprocess_coefficients_variable_layout(enc->preprocess_mode, enc->temporal_gop_width, enc->temporal_gop_height, quantised_y, quantised_co, quantised_cg, NULL, tile_size, enc->channel_layout, buffer + offset); offset += total_compressed_size; @@ -6704,10 +6863,12 @@ static size_t serialise_tile_data(tav_encoder_t *enc, int tile_x, int tile_y, // Compress and write frame data static size_t compress_and_write_frame(tav_encoder_t *enc, uint8_t packet_type) { // Calculate total uncompressed size + // Use encoding dimensions (cropped when crop encoding is enabled, full frame otherwise) const size_t coeff_count = enc->monoblock ? - (enc->width * enc->height) : + (enc->encoding_width * enc->encoding_height) : (PADDED_TILE_SIZE_X * PADDED_TILE_SIZE_Y); - const size_t max_tile_size = 4 + (coeff_count * 3 * sizeof(int16_t)); // header + 3 channels of coefficients + // Account for EZBC worst-case overhead (use 3x raw size) + const size_t max_tile_size = 4 + (coeff_count * 3 * sizeof(int16_t) * 3); const size_t total_uncompressed_size = enc->tiles_x * enc->tiles_y * max_tile_size; // Allocate buffer for uncompressed tile data @@ -6749,8 +6910,8 @@ static size_t compress_and_write_frame(tav_encoder_t *enc, uint8_t packet_type) // Determine tile data size and allocate buffers int tile_data_size; if (enc->monoblock) { - // Monoblock mode: entire frame - tile_data_size = enc->width * enc->height; + // Monoblock mode: entire frame (use encoding dimensions) + tile_data_size = enc->encoding_width * enc->encoding_height; } else { // Standard mode: padded tiles (344x288) tile_data_size = PADDED_TILE_SIZE_X * PADDED_TILE_SIZE_Y; @@ -6798,10 +6959,10 @@ static size_t compress_and_write_frame(tav_encoder_t *enc, uint8_t packet_type) // Apply DWT transform to each channel (skip for SKIP mode) if (mode != TAV_MODE_SKIP) { if (enc->monoblock) { - // Monoblock mode: transform entire frame - dwt_2d_forward_flexible(enc, tile_y_data, enc->width, enc->height, enc->decomp_levels, enc->wavelet_filter); - dwt_2d_forward_flexible(enc, tile_co_data, enc->width, enc->height, enc->decomp_levels, enc->wavelet_filter); - dwt_2d_forward_flexible(enc, tile_cg_data, enc->width, enc->height, enc->decomp_levels, enc->wavelet_filter); + // Monoblock mode: transform entire frame (use encoding dimensions) + dwt_2d_forward_flexible(enc, tile_y_data, enc->encoding_width, enc->encoding_height, enc->decomp_levels, enc->wavelet_filter); + dwt_2d_forward_flexible(enc, tile_co_data, enc->encoding_width, enc->encoding_height, enc->decomp_levels, enc->wavelet_filter); + dwt_2d_forward_flexible(enc, tile_cg_data, enc->encoding_width, enc->encoding_height, enc->decomp_levels, enc->wavelet_filter); } else { // Standard mode: transform padded tiles (344x288) dwt_2d_forward_padded(tile_y_data, enc->decomp_levels, enc->wavelet_filter); @@ -8189,6 +8350,85 @@ static float calculate_sobel_magnitude(const uint8_t *frame_rgb, int width, int return sqrtf(gx * gx + gy * gy); } +// Extract active picture region from full frame based on screen mask geometry +// Returns newly allocated buffer containing only the active region (caller must free) +// Active dimensions: active_width = width - left - right, active_height = height - top - bottom +static uint8_t* extract_active_region(const uint8_t *full_rgb, int width, int height, + uint16_t top, uint16_t right, uint16_t bottom, uint16_t left, + int *out_active_width, int *out_active_height) { + // Calculate active region dimensions + int active_width = width - left - right; + int active_height = height - top - bottom; + + // Validate dimensions + if (active_width <= 0 || active_height <= 0) { + fprintf(stderr, "Error: Invalid active region dimensions (%dx%d)\n", + active_width, active_height); + return NULL; + } + + // Allocate buffer for active region (RGB, 3 bytes per pixel) + size_t active_size = active_width * active_height * 3; + uint8_t *active_rgb = malloc(active_size); + if (!active_rgb) { + fprintf(stderr, "Error: Failed to allocate active region buffer (%zu bytes)\n", active_size); + return NULL; + } + + // Extract active region pixels + // Source region: [left, width-right) x [top, height-bottom) + for (int y = 0; y < active_height; y++) { + int src_y = top + y; + for (int x = 0; x < active_width; x++) { + int src_x = left + x; + + // Copy RGB pixel from full frame to active region + int src_idx = (src_y * width + src_x) * 3; + int dst_idx = (y * active_width + x) * 3; + + active_rgb[dst_idx + 0] = full_rgb[src_idx + 0]; // R + active_rgb[dst_idx + 1] = full_rgb[src_idx + 1]; // G + active_rgb[dst_idx + 2] = full_rgb[src_idx + 2]; // B + } + } + + // Output active dimensions + if (out_active_width) *out_active_width = active_width; + if (out_active_height) *out_active_height = active_height; + + return active_rgb; +} + +// Composite active region back to full frame (inverse of extract_active_region) +// Fills masked regions with black (0,0,0) and copies active region to correct position +// Used for testing roundtrip and decoder reconstruction +static void composite_to_full_frame(const uint8_t *active_rgb, int active_width, int active_height, + uint8_t *full_rgb, int width, int height, + uint16_t top, uint16_t right, uint16_t bottom, uint16_t left) { + // Fill entire frame with black first + memset(full_rgb, 0, width * height * 3); + + // Copy active region to correct position + // Destination region: [left, width-right) x [top, height-bottom) + for (int y = 0; y < active_height; y++) { + int dst_y = top + y; + if (dst_y >= height) break; // Safety check + + for (int x = 0; x < active_width; x++) { + int dst_x = left + x; + if (dst_x >= width) break; // Safety check + + // Copy RGB pixel from active region to full frame + int src_idx = (y * active_width + x) * 3; + int dst_idx = (dst_y * width + dst_x) * 3; + + full_rgb[dst_idx + 0] = active_rgb[src_idx + 0]; // R + full_rgb[dst_idx + 1] = active_rgb[src_idx + 1]; // G + full_rgb[dst_idx + 2] = active_rgb[src_idx + 2]; // B + } + } +} + // Apply symmetric cropping and suppress simultaneous letterbox+pillarbox // ALWAYS makes left=right and top=bottom (perfect symmetry) // When BOTH letterbox and pillarbox are detected simultaneously, suppress one based on current state @@ -9085,8 +9325,9 @@ static int process_audio(tav_encoder_t *enc, int frame_num, FILE *output) { double packets_per_frame = frame_audio_time / PACKET_AUDIO_TIME; // Allocate MP2 buffer if needed + // Note: MP2 packets can vary by ±1 byte due to padding, so allocate extra space if (!enc->mp2_buffer) { - enc->mp2_buffer_size = enc->mp2_packet_size * 2; // Space for multiple packets + enc->mp2_buffer_size = (enc->mp2_packet_size + 1) * 2; // Extra space for padding variations enc->mp2_buffer = malloc(enc->mp2_buffer_size); if (!enc->mp2_buffer) { fprintf(stderr, "Failed to allocate audio buffer\n"); @@ -9128,11 +9369,27 @@ static int process_audio(tav_encoder_t *enc, int frame_num, FILE *output) { // Insert the calculated number of audio packets for (int q = 0; q < packets_to_insert; q++) { - size_t bytes_to_read = enc->mp2_packet_size; + // Peek at header to get actual packet size (MP2 packets can vary by ±1 byte) + long pos = ftell(enc->mp2_file); + uint8_t header[4]; + if (fread(header, 1, 4, enc->mp2_file) != 4) break; + fseek(enc->mp2_file, pos, SEEK_SET); // Rewind to re-read with full packet + + int actual_packet_size = get_mp2_packet_size(header); + size_t bytes_to_read = actual_packet_size; + + // Clamp to remaining audio if (bytes_to_read > enc->audio_remaining) { bytes_to_read = enc->audio_remaining; } + // Sanity check buffer size + if (bytes_to_read > enc->mp2_buffer_size) { + fprintf(stderr, "ERROR: Packet size %zu exceeds buffer size %zu\n", bytes_to_read, enc->mp2_buffer_size); + break; + } + + // Read full packet including header size_t bytes_read = fread(enc->mp2_buffer, 1, bytes_to_read, enc->mp2_file); if (bytes_read == 0) break; @@ -9248,8 +9505,9 @@ static int process_audio_for_gop(tav_encoder_t *enc, int *frame_numbers, int num double packets_per_frame = frame_audio_time / PACKET_AUDIO_TIME; // Allocate MP2 buffer if needed + // Note: MP2 packets can vary by ±1 byte due to padding, so allocate extra space if (!enc->mp2_buffer) { - enc->mp2_buffer_size = enc->mp2_packet_size * 2; + enc->mp2_buffer_size = (enc->mp2_packet_size + 1) * 2; // Extra space for padding variations enc->mp2_buffer = malloc(enc->mp2_buffer_size); if (!enc->mp2_buffer) { fprintf(stderr, "Failed to allocate audio buffer\n"); @@ -9281,11 +9539,27 @@ static int process_audio_for_gop(tav_encoder_t *enc, int *frame_numbers, int num // Emit all audio packets for this GOP for (int q = 0; q < total_packets_to_insert; q++) { - size_t bytes_to_read = enc->mp2_packet_size; + // Peek at header to get actual packet size (MP2 packets can vary by ±1 byte) + long pos = ftell(enc->mp2_file); + uint8_t header[4]; + if (fread(header, 1, 4, enc->mp2_file) != 4) break; + fseek(enc->mp2_file, pos, SEEK_SET); // Rewind to re-read with full packet + + int actual_packet_size = get_mp2_packet_size(header); + size_t bytes_to_read = actual_packet_size; + + // Clamp to remaining audio if (bytes_to_read > enc->audio_remaining) { bytes_to_read = enc->audio_remaining; } + // Sanity check buffer size + if (bytes_to_read > enc->mp2_buffer_size) { + fprintf(stderr, "ERROR: GOP packet size %zu exceeds buffer size %zu\n", bytes_to_read, enc->mp2_buffer_size); + break; + } + + // Read full packet including header size_t bytes_read = fread(enc->mp2_buffer, 1, bytes_to_read, enc->mp2_file); if (bytes_read == 0) break; @@ -9939,6 +10213,112 @@ static gop_boundary_t* build_gop_boundaries(const frame_analysis_t *analyses, in return head; } +// Calculate GOP-level geometry from frame analyses (Phase 2) +// For each GOP, finds the maximum active dimensions and tracks geometry changes +static void calculate_gop_geometry(tav_encoder_t *enc, gop_boundary_t *gop_list, + const frame_analysis_t *analyses) { + if (!enc->enable_crop_encoding || !gop_list || !analyses) { + return; + } + + gop_boundary_t *gop = gop_list; + while (gop) { + // Initialize with full frame dimensions + gop->max_active_width = 0; + gop->max_active_height = 0; + gop->geometry_changes = 0; + gop->enable_crop_encoding = 0; + + // Track minimum letterbox values to calculate unified mask + // (Minimum letterbox = maximum active region) + uint16_t min_top = UINT16_MAX, min_right = UINT16_MAX; + uint16_t min_bottom = UINT16_MAX, min_left = UINT16_MAX; + + // Track previous geometry for change detection + uint16_t prev_top = 0, prev_right = 0, prev_bottom = 0, prev_left = 0; + int prev_initialized = 0; + + // Scan all frames in this GOP + for (int f = gop->start_frame; f <= gop->end_frame; f++) { + const frame_analysis_t *frame = &analyses[f]; + + // Calculate active region dimensions for this frame + int active_width = enc->width - frame->letterbox_left - frame->letterbox_right; + int active_height = enc->height - frame->letterbox_top - frame->letterbox_bottom; + + // Update maximum dimensions + if (active_width > gop->max_active_width) { + gop->max_active_width = active_width; + } + if (active_height > gop->max_active_height) { + gop->max_active_height = active_height; + } + + // Track minimum letterbox values (for unified mask calculation) + if (frame->letterbox_top < min_top) min_top = frame->letterbox_top; + if (frame->letterbox_right < min_right) min_right = frame->letterbox_right; + if (frame->letterbox_bottom < min_bottom) min_bottom = frame->letterbox_bottom; + if (frame->letterbox_left < min_left) min_left = frame->letterbox_left; + + // Detect geometry changes + if (prev_initialized) { + if (frame->letterbox_top != prev_top || + frame->letterbox_right != prev_right || + frame->letterbox_bottom != prev_bottom || + frame->letterbox_left != prev_left) { + gop->geometry_changes++; + } + } + + // Update previous geometry + prev_top = frame->letterbox_top; + prev_right = frame->letterbox_right; + prev_bottom = frame->letterbox_bottom; + prev_left = frame->letterbox_left; + prev_initialized = 1; + } + + // Calculate unified mask from minimum letterbox values + // This mask, when used with extract_active_region(), will give exactly max_active_width × max_active_height + gop->mask_top = (min_top == UINT16_MAX) ? 0 : min_top; + gop->mask_right = (min_right == UINT16_MAX) ? 0 : min_right; + gop->mask_bottom = (min_bottom == UINT16_MAX) ? 0 : min_bottom; + gop->mask_left = (min_left == UINT16_MAX) ? 0 : min_left; + + // Verify that mask gives correct dimensions + int calculated_width = enc->width - gop->mask_left - gop->mask_right; + int calculated_height = enc->height - gop->mask_top - gop->mask_bottom; + if (calculated_width != gop->max_active_width || calculated_height != gop->max_active_height) { + fprintf(stderr, "WARNING: GOP %d-%d: Mask mismatch! Calculated %dx%d but max is %dx%d\n", + gop->start_frame, gop->end_frame, + calculated_width, calculated_height, + gop->max_active_width, gop->max_active_height); + fprintf(stderr, " Mask: top=%d right=%d bottom=%d left=%d\n", + gop->mask_top, gop->mask_right, gop->mask_bottom, gop->mask_left); + } + + // Decide if crop encoding should be enabled for this GOP + if (gop->max_active_width > 0 && gop->max_active_height > 0 && + (gop->max_active_width < enc->width || gop->max_active_height < enc->height)) { + // There is actual cropping benefit + gop->enable_crop_encoding = 1; + + if (enc->verbose && gop->geometry_changes > 0) { + printf(" GOP %d-%d: geometry changes detected (%d), using max dimensions %dx%d\n", + gop->start_frame, gop->end_frame, gop->geometry_changes, + gop->max_active_width, gop->max_active_height); + } + } else { + // No cropping benefit, use full frame dimensions + gop->max_active_width = enc->width; + gop->max_active_height = enc->height; + gop->enable_crop_encoding = 0; + } + + gop = gop->next; + } +} + // Free GOP boundary list static void free_gop_boundaries(gop_boundary_t *head) { while (head) { @@ -10046,6 +10426,56 @@ static int two_pass_first_pass(tav_encoder_t *enc, const char *input_file) { ); enc->current_frame_rgb = saved_current; + + // Phase 2 Step 1: Test crop extraction roundtrip + if (metrics.has_letterbox && (metrics.letterbox_top > 0 || metrics.letterbox_left > 0) && frame_num < 5) { + int active_width, active_height; + + // Extract active region + uint8_t *active_rgb = extract_active_region(frame_rgb, enc->width, enc->height, + metrics.letterbox_top, + metrics.letterbox_right, + metrics.letterbox_bottom, + metrics.letterbox_left, + &active_width, &active_height); + + if (active_rgb) { + // Composite back to full frame + uint8_t *reconstructed = malloc(frame_rgb_size); + composite_to_full_frame(active_rgb, active_width, active_height, + reconstructed, enc->width, enc->height, + metrics.letterbox_top, + metrics.letterbox_right, + metrics.letterbox_bottom, + metrics.letterbox_left); + + // Verify roundtrip (check a few pixels in active region match) + int errors = 0; + for (int test_y = metrics.letterbox_top; test_y < enc->height - metrics.letterbox_bottom && errors < 10; test_y += 50) { + for (int test_x = metrics.letterbox_left; test_x < enc->width - metrics.letterbox_right && errors < 10; test_x += 50) { + int idx = (test_y * enc->width + test_x) * 3; + if (frame_rgb[idx] != reconstructed[idx] || + frame_rgb[idx+1] != reconstructed[idx+1] || + frame_rgb[idx+2] != reconstructed[idx+2]) { + errors++; + } + } + } + + if (errors == 0 && enc->verbose) { + printf("Frame %d: Crop roundtrip test PASSED (active: %dx%d, mask: t=%d r=%d b=%d l=%d)\n", + frame_num, active_width, active_height, + metrics.letterbox_top, metrics.letterbox_right, + metrics.letterbox_bottom, metrics.letterbox_left); + } else if (errors > 0) { + fprintf(stderr, "Frame %d: Crop roundtrip test FAILED (%d pixel errors)\n", + frame_num, errors); + } + + free(active_rgb); + free(reconstructed); + } + } } else { metrics.has_letterbox = 0; metrics.letterbox_top = 0; @@ -10100,6 +10530,11 @@ static int two_pass_first_pass(tav_encoder_t *enc, const char *input_file) { enc->verbose ); + // Calculate GOP-level geometry for crop encoding (Phase 2) + if (enc->enable_crop_encoding && enc->gop_boundaries) { + calculate_gop_geometry(enc, enc->gop_boundaries, enc->frame_analyses); + } + // Count and print GOP statistics int num_gops = 0; int total_gop_frames = 0; @@ -10232,6 +10667,7 @@ int main(int argc, char *argv[]) { {"raw-coeffs", no_argument, 0, 1029}, {"single-pass", no_argument, 0, 1050}, // disable two-pass encoding with wavelet-based scene detection {"no-letterbox-detect", no_argument, 0, 1051}, // disable letterbox/pillarbox detection + {"enable-crop-encoding", no_argument, 0, 1052}, // Phase 2: encode cropped active region only (experimental) {"help", no_argument, 0, '?'}, {0, 0, 0, 0} }; @@ -10466,6 +10902,10 @@ int main(int argc, char *argv[]) { enc->enable_letterbox_detect = 0; printf("Letterbox/pillarbox detection disabled\n"); break; + case 1052: // --enable-crop-encoding + enc->enable_crop_encoding = 1; + printf("Phase 2 crop encoding enabled (experimental)\n"); + break; case 'a': int bitrate = atoi(optarg); int valid_bitrate = validate_mp2_bitrate(bitrate); @@ -10613,6 +11053,10 @@ int main(int argc, char *argv[]) { return 1; } + // Update encoding dimensions to match actual video dimensions (unless crop encoding changes them later) + enc->encoding_width = enc->width; + enc->encoding_height = enc->height; + // Start video preprocessing pipeline if (start_video_conversion(enc) != 1) { fprintf(stderr, "Error: Failed to start video conversion\n"); @@ -10695,30 +11139,90 @@ int main(int argc, char *argv[]) { enc->two_pass_current_frame = 0; // Adjust GOP capacity to match maximum computed GOP size + int old_capacity = enc->temporal_gop_capacity; enc->temporal_gop_capacity = ANALYSIS_GOP_MAX_SIZE; - // Re-allocate GOP buffers with new capacity - enc->temporal_gop_rgb_frames = realloc(enc->temporal_gop_rgb_frames, - enc->temporal_gop_capacity * sizeof(uint8_t*)); - enc->temporal_gop_y_frames = realloc(enc->temporal_gop_y_frames, - enc->temporal_gop_capacity * sizeof(float*)); - enc->temporal_gop_co_frames = realloc(enc->temporal_gop_co_frames, - enc->temporal_gop_capacity * sizeof(float*)); - enc->temporal_gop_cg_frames = realloc(enc->temporal_gop_cg_frames, - enc->temporal_gop_capacity * sizeof(float*)); - - // Allocate new frame buffers for expanded capacity - int frame_size = enc->width * enc->height; - for (int i = TEMPORAL_GOP_SIZE; i < ANALYSIS_GOP_MAX_SIZE; i++) { - enc->temporal_gop_rgb_frames[i] = malloc(frame_size * 3); - enc->temporal_gop_y_frames[i] = malloc(frame_size * sizeof(float)); - enc->temporal_gop_co_frames[i] = malloc(frame_size * sizeof(float)); - enc->temporal_gop_cg_frames[i] = malloc(frame_size * sizeof(float)); + // Find maximum active region dimensions across all GOPs (for crop encoding) + int max_gop_width = enc->width; + int max_gop_height = enc->height; + if (enc->enable_crop_encoding && enc->gop_boundaries) { + // Traverse linked list of GOP boundaries + gop_boundary_t *gop = enc->gop_boundaries; + while (gop != NULL) { + if (gop->max_active_width > max_gop_width) { + max_gop_width = gop->max_active_width; + } + if (gop->max_active_height > max_gop_height) { + max_gop_height = gop->max_active_height; + } + gop = gop->next; + } + if (enc->verbose) { + printf(" Maximum GOP dimensions across all GOPs: %dx%d\n", + max_gop_width, max_gop_height); + } } - if (enc->verbose) { - printf(" Adjusted GOP capacity from %d to %d frames\n", - TEMPORAL_GOP_SIZE, ANALYSIS_GOP_MAX_SIZE); + // Calculate required frame buffer size + int frame_size = max_gop_width * max_gop_height; + int old_frame_size = enc->width * enc->height; + + // Check if we need to reallocate (capacity changed OR frame size changed) + int need_realloc = (old_capacity != ANALYSIS_GOP_MAX_SIZE) || (frame_size != old_frame_size); + + if (need_realloc) { + // Re-allocate GOP buffers with new capacity + uint8_t **new_rgb = realloc(enc->temporal_gop_rgb_frames, + enc->temporal_gop_capacity * sizeof(uint8_t*)); + float **new_y = realloc(enc->temporal_gop_y_frames, + enc->temporal_gop_capacity * sizeof(float*)); + float **new_co = realloc(enc->temporal_gop_co_frames, + enc->temporal_gop_capacity * sizeof(float*)); + float **new_cg = realloc(enc->temporal_gop_cg_frames, + enc->temporal_gop_capacity * sizeof(float*)); + + if (!new_rgb || !new_y || !new_co || !new_cg) { + fprintf(stderr, "Error: Failed to reallocate GOP buffers\n"); + return 1; + } + + enc->temporal_gop_rgb_frames = new_rgb; + enc->temporal_gop_y_frames = new_y; + enc->temporal_gop_co_frames = new_co; + enc->temporal_gop_cg_frames = new_cg; + + // Free and reallocate ALL frame buffers with correct size + // (not just new ones, since frame size might have changed) + for (int i = 0; i < old_capacity; i++) { + free(enc->temporal_gop_rgb_frames[i]); + free(enc->temporal_gop_y_frames[i]); + free(enc->temporal_gop_co_frames[i]); + free(enc->temporal_gop_cg_frames[i]); + } + + // Allocate all frame buffers with new size + for (int i = 0; i < ANALYSIS_GOP_MAX_SIZE; i++) { + enc->temporal_gop_rgb_frames[i] = malloc(frame_size * 3); + enc->temporal_gop_y_frames[i] = malloc(frame_size * sizeof(float)); + enc->temporal_gop_co_frames[i] = malloc(frame_size * sizeof(float)); + enc->temporal_gop_cg_frames[i] = malloc(frame_size * sizeof(float)); + + if (!enc->temporal_gop_rgb_frames[i] || !enc->temporal_gop_y_frames[i] || + !enc->temporal_gop_co_frames[i] || !enc->temporal_gop_cg_frames[i]) { + fprintf(stderr, "Error: Failed to allocate GOP frame buffer %d\n", i); + return 1; + } + } + + if (enc->verbose) { + printf(" Reallocated GOP buffers: capacity %d->%d, frame size %dx%d\n", + old_capacity, ANALYSIS_GOP_MAX_SIZE, max_gop_width, max_gop_height); + } + } else { + if (enc->verbose) { + printf(" GOP buffers unchanged: capacity=%d, frame size=%dx%d\n", + ANALYSIS_GOP_MAX_SIZE, max_gop_width, max_gop_height); + } } // Write all screen masking packets NOW (after first pass analysis) @@ -10867,10 +11371,121 @@ int main(int argc, char *argv[]) { printf("\n"); }*/ + // Phase 2: Extract active region if crop encoding is enabled + uint8_t *rgb_for_encoding = enc->current_frame_rgb; + uint8_t *cropped_rgb = NULL; + int using_crop_encoding = 0; + + // Reset encoding dimensions to full frame by default + enc->encoding_width = enc->width; + enc->encoding_height = enc->height; + + if (enc->enable_crop_encoding && enc->enable_letterbox_detect && enc->two_pass_mode) { + // Phase 2: Use GOP-level dimensions for temporal DWT (3D-DWT mode) + // This ensures all frames in a GOP have the same encoding dimensions + // IMPORTANT: Always use GOP-level dimensions in temporal DWT mode, even if there's no cropping benefit, + // to ensure all frames in the GOP have consistent dimensions (critical for 3D DWT) + if (enc->enable_temporal_dwt && enc->current_gop_boundary) { + // GOP mode: Use maximum dimensions across entire GOP + // Store GOP's max dimensions (DO NOT pass by reference to extract_active_region) + int gop_max_w = enc->current_gop_boundary->max_active_width; + int gop_max_h = enc->current_gop_boundary->max_active_height; + + // Calculate mask geometry to extract GOP's maximum active region + uint16_t mask_top = enc->current_gop_boundary->mask_top; + uint16_t mask_right = enc->current_gop_boundary->mask_right; + uint16_t mask_bottom = enc->current_gop_boundary->mask_bottom; + uint16_t mask_left = enc->current_gop_boundary->mask_left; + + // For frames with smaller active regions, we'll extract the GOP's max region + // (which may include some black bars, but ensures consistent dimensions) + // Use temporary variables - extract_active_region will recalculate dimensions + int extracted_w, extracted_h; + cropped_rgb = extract_active_region(enc->current_frame_rgb, + enc->width, enc->height, + mask_top, mask_right, + mask_bottom, mask_left, + &extracted_w, &extracted_h); + + if (cropped_rgb) { + rgb_for_encoding = cropped_rgb; + // Use GOP's max dimensions, not the recalculated ones + enc->encoding_width = gop_max_w; + enc->encoding_height = gop_max_h; + using_crop_encoding = 1; + + // Store GOP-level mask geometry + enc->active_mask_top = mask_top; + enc->active_mask_right = mask_right; + enc->active_mask_bottom = mask_bottom; + enc->active_mask_left = mask_left; + enc->active_width = gop_max_w; + enc->active_height = gop_max_h; + + if (enc->verbose && frame_count == enc->current_gop_boundary->start_frame) { + printf("GOP %d-%d: Encoding with max dimensions %dx%d (geometry changes: %d)\n", + enc->current_gop_boundary->start_frame, + enc->current_gop_boundary->end_frame, + gop_max_w, gop_max_h, + enc->current_gop_boundary->geometry_changes); + } + } + } + // Intra-only mode or single-pass: Use per-frame dimensions + else if (frame_count < enc->frame_analyses_count) { + frame_analysis_t *analysis = &enc->frame_analyses[frame_count]; + + // Only crop if letterbox/pillarbox was detected + if (analysis->has_letterbox && + (analysis->letterbox_top > 0 || analysis->letterbox_left > 0)) { + + // Extract active region for this specific frame + int active_w, active_h; + cropped_rgb = extract_active_region(enc->current_frame_rgb, + enc->width, enc->height, + analysis->letterbox_top, + analysis->letterbox_right, + analysis->letterbox_bottom, + analysis->letterbox_left, + &active_w, &active_h); + + if (cropped_rgb) { + rgb_for_encoding = cropped_rgb; + + // Set encoding dimensions to cropped size (NOT modifying enc->width/height!) + enc->encoding_width = active_w; + enc->encoding_height = active_h; + using_crop_encoding = 1; + + // Store mask geometry for later use + enc->active_mask_top = analysis->letterbox_top; + enc->active_mask_right = analysis->letterbox_right; + enc->active_mask_bottom = analysis->letterbox_bottom; + enc->active_mask_left = analysis->letterbox_left; + enc->active_width = active_w; + enc->active_height = active_h; + + if (enc->verbose && frame_count < 5) { + printf("Frame %d: Encoding cropped region %dx%d (mask: t=%d r=%d b=%d l=%d)\n", + frame_count, active_w, active_h, + analysis->letterbox_top, analysis->letterbox_right, + analysis->letterbox_bottom, analysis->letterbox_left); + } + } + } + } + } + // Convert RGB to colour space (YCoCg-R or ICtCp) - rgb_to_colour_space_frame(enc, enc->current_frame_rgb, + // Uses either full frame or cropped region depending on crop encoding + rgb_to_colour_space_frame(enc, rgb_for_encoding, enc->current_frame_y, enc->current_frame_co, enc->current_frame_cg, - enc->width, enc->height); + enc->encoding_width, enc->encoding_height); + + // Clean up cropped buffer if allocated + if (cropped_rgb) { + free(cropped_rgb); + } // Debug: check YCoCg conversion result /*if (frame_count < 3) { @@ -11018,8 +11633,10 @@ int main(int argc, char *argv[]) { } // Now add current frame to GOP (will be first frame of new GOP if scene change) + // Pass actual encoding dimensions (cropped if crop encoding is active) int add_result = temporal_gop_add_frame(enc, enc->current_frame_rgb, - enc->current_frame_y, enc->current_frame_co, enc->current_frame_cg); + enc->current_frame_y, enc->current_frame_co, enc->current_frame_cg, + enc->encoding_width, enc->encoding_height); if (add_result != 0) { fprintf(stderr, "Error: Failed to add frame %d to GOP buffer\n", frame_count);