diff --git a/assets/disk0/tvdos/bin/playtav.js b/assets/disk0/tvdos/bin/playtav.js index 5b6eb8f..e301799 100644 --- a/assets/disk0/tvdos/bin/playtav.js +++ b/assets/disk0/tvdos/bin/playtav.js @@ -1143,6 +1143,9 @@ try { startTime: 0, timeRemaining: 0 } + if (interactive) { + console.log(`[GOP] Buffered GOP ${gopSize} frames to ready slot during first GOP decode`) + } } else if (decodingGopData === null) { // Buffer as decoding GOP (will decode after ready GOP) const decodingSlot = (currentGopBufferSlot + 2) % BUFFER_SLOTS @@ -1155,8 +1158,20 @@ try { startTime: 0, timeRemaining: 0 } + if (interactive) { + console.log(`[GOP] Buffered GOP ${gopSize} frames to decoding slot during first GOP decode`) + } + + // CRITICAL: Stop reading packets now that all 3 buffers are full + shouldReadPackets = false + if (interactive) { + console.log(`[GOP] All 3 buffers full during first GOP decode - stopping packet reading`) + } } else { - // All 3 buffers full - discard this GOP + // All 3 buffers full - discard this GOP (shouldn't happen now with gate) + if (interactive) { + console.log(`[GOP] WARNING: All 3 buffers full during first GOP decode - discarding GOP ${gopSize} frames`) + } sys.free(compressedPtr) } @@ -1180,14 +1195,28 @@ try { nextOffset ) + // Set async decode tracking variables + asyncDecodeInProgress = true + asyncDecodeSlot = nextSlot + asyncDecodeGopSize = gopSize + asyncDecodePtr = compressedPtr + asyncDecodeStartTime = sys.nanoTime() + readyGopData = { gopSize: gopSize, slot: nextSlot, compressedPtr: compressedPtr, - startTime: sys.nanoTime(), + startTime: asyncDecodeStartTime, timeRemaining: timeRemaining } + // CRITICAL: Stop reading packets immediately after starting decode + // to prevent next GOP from being discarded in Case 5 + shouldReadPackets = false + if (interactive) { + console.log(`[GOP] Case 3: Started decode to ready slot - stopping packet reading`) + } + } else if (currentGopSize > 0 && readyGopData !== null && decodingGopData === null && 
!asyncDecodeInProgress && graphics.tavDecodeGopIsComplete()) { // Case 4: GOP playing, ready GOP exists, no decoding GOP, no decode in progress - decode to decoding slot const decodingSlot = (currentGopBufferSlot + 2) % BUFFER_SLOTS @@ -1208,16 +1237,33 @@ try { decodingOffset ) + // Set async decode tracking variables + asyncDecodeInProgress = true + asyncDecodeSlot = decodingSlot + asyncDecodeGopSize = gopSize + asyncDecodePtr = compressedPtr + asyncDecodeStartTime = sys.nanoTime() + decodingGopData = { gopSize: gopSize, slot: decodingSlot, compressedPtr: compressedPtr, - startTime: sys.nanoTime(), + startTime: asyncDecodeStartTime, timeRemaining: timeRemaining } + // CRITICAL: Stop reading packets immediately after starting decode + // All 3 buffers are now full (playing + ready + decoding) + shouldReadPackets = false + if (interactive) { + console.log(`[GOP] Case 4: Started decode to decoding slot - all buffers full, stopping packet reading`) + } + } else { // Case 5: All 3 buffers full (playing + ready + decoding) - ignore packet + if (interactive) { + console.log(`[GOP] Case 5: Discarding GOP ${gopSize} frames (current=${currentGopSize}, ready=${readyGopData !== null}, decoding=${decodingGopData !== null}, asyncInProgress=${asyncDecodeInProgress})`) + } sys.free(compressedPtr) } } @@ -1230,6 +1276,9 @@ try { // (one GOP playing + ready GOP + decoding GOP) if (currentGopSize > 0 && readyGopData !== null && decodingGopData !== null) { shouldReadPackets = false + if (interactive) { + console.log(`[GOP] All 3 buffers full - stopping packet reading`) + } } } else if (packetType === TAV_PACKET_AUDIO_BUNDLED) { @@ -1433,8 +1482,18 @@ try { // Set first frame time to NOW nextFrameTime = sys.nanoTime() - // Resume packet reading to get next GOP (only one buffer occupied now) - shouldReadPackets = true + // Resume packet reading only if not all 3 buffers are full + // (might have buffered GOP 2 and 3 during GOP 1 decode) + if (!(currentGopSize > 0 && readyGopData !== 
null && decodingGopData !== null)) { + shouldReadPackets = true + if (interactive) { + console.log(`[GOP] First GOP ready - resuming packet reading (ready=${readyGopData !== null}, decoding=${decodingGopData !== null})`) + } + } else { + if (interactive) { + console.log(`[GOP] First GOP ready - all 3 buffers full, keeping packet reading paused`) + } + } // if (interactive) { // console.log(`[GOP] First GOP ready (slot ${asyncDecodeSlot}, ${asyncDecodeGopSize} frames) in ${decodeTime.toFixed(1)}ms - starting playback`) @@ -1461,13 +1520,30 @@ try { readyGopData.slot * SLOT_SIZE ) + // CRITICAL FIX: Set async decode tracking variables so decode is properly tracked + asyncDecodeInProgress = true + asyncDecodeSlot = readyGopData.slot + asyncDecodeGopSize = readyGopData.gopSize + asyncDecodePtr = readyGopData.compressedPtr + asyncDecodeStartTime = sys.nanoTime() + readyGopData.needsDecode = false - readyGopData.startTime = sys.nanoTime() + readyGopData.startTime = asyncDecodeStartTime readyGopData.timeRemaining = timeRemaining + + if (interactive) { + console.log(`[GOP] Started decode of buffered GOP ${readyGopData.gopSize} frames (slot ${readyGopData.slot})`) + } } } } + // Fire audio on first frame + if (!audioFired) { + audio.play(0) + audioFired = true + } + // Step 2 & 3: Display current GOP frame if it's time if (!paused && currentGopSize > 0 && currentGopFrameIndex < currentGopSize) { // Spin-wait for next frame time @@ -1483,6 +1559,10 @@ try { graphics.uploadVideoBufferFrameToFramebuffer(currentGopFrameIndex, header.width, header.height, trueFrameCount, bufferOffset) uploadTime = (sys.nanoTime() - uploadStart) / 1000000.0 + if (interactive && currentGopFrameIndex === 0) { + console.log(`[GOP] Playing GOP: ${currentGopSize} frames from slot ${currentGopBufferSlot}`) + } + // Apply bias lighting let biasStart = sys.nanoTime() if (currentGopFrameIndex === 0 || currentGopFrameIndex === currentGopSize - 1) { @@ -1531,9 +1611,20 @@ try { decodingGopData.slot * 
SLOT_SIZE ) + // CRITICAL FIX: Set async decode tracking variables so decode is properly tracked + asyncDecodeInProgress = true + asyncDecodeSlot = decodingGopData.slot + asyncDecodeGopSize = decodingGopData.gopSize + asyncDecodePtr = decodingGopData.compressedPtr + asyncDecodeStartTime = sys.nanoTime() + decodingGopData.needsDecode = false - decodingGopData.startTime = sys.nanoTime() + decodingGopData.startTime = asyncDecodeStartTime decodingGopData.timeRemaining = timeRemaining + + if (interactive) { + console.log(`[GOP] Started decode of buffered GOP ${decodingGopData.gopSize} frames from decoding slot (slot ${decodingGopData.slot})`) + } } // Schedule next frame @@ -1543,6 +1634,9 @@ try { // Step 4-7: GOP finished? Transition to ready GOP (triple-buffering) if (!paused && currentGopSize > 0 && currentGopFrameIndex >= currentGopSize) { + if (interactive) { + console.log(`[GOP] GOP finished: played ${currentGopFrameIndex}/${currentGopSize} frames from slot ${currentGopBufferSlot}`) + } if (readyGopData !== null) { // If ready GOP still needs decode, start it now (defensive - should already be started) if (readyGopData.needsDecode) { @@ -1581,8 +1675,19 @@ try { readyGopData = decodingGopData decodingGopData = null + // CRITICAL: Only clear async decode tracking if NO decode is in progress + // (the promoted readyGop might be decoding from Case 4) + if (graphics.tavDecodeGopIsComplete()) { + asyncDecodeInProgress = false + asyncDecodePtr = 0 + asyncDecodeGopSize = 0 + } + // Resume packet reading now that one buffer is free (decoding slot available) shouldReadPackets = true + if (interactive) { + console.log(`[GOP] Transition complete - resuming packet reading (asyncInProgress=${asyncDecodeInProgress})`) + } } } else { // No ready GOP available - hiccup (shouldn't happen with triple-buffering) diff --git a/terranmon.txt b/terranmon.txt index c4271ce..54ab528 100644 --- a/terranmon.txt +++ b/terranmon.txt @@ -1038,9 +1038,9 @@ transmission capability, and 
region-of-interest coding. type_t Value ### List of Keys - - Uint64 BGNT: Video begin time (must be equal to the value of the first Timecode packet) - - Uint64 ENDT: Video end time (must be equal to the value of the last Timecode packet) - - Uint64 CDAT: Creation time in nanoseconds since UNIX Epoch (must be in UTC timezone) + - Uint64 BGNT: Video begin time in nanoseconds (must be equal to the value of the first Timecode packet) + - Uint64 ENDT: Video end time in nanoseconds (must be equal to the value of the last Timecode packet) + - Uint64 CDAT: Creation time in microseconds since UNIX Epoch (must be in UTC timezone) - Bytes VNDR: Name and version of the encoder (for Reference encoder: "Encoder-TAV 20251014 (list,of,features)") - Bytes FMPG: FFmpeg version (typically "ffmpeg version 8.0 Copyright (c) 2000-2025 the FFmpeg developers"; the first line of text FFmpeg emits) @@ -1067,7 +1067,6 @@ transmission capability, and region-of-interest coding. ## GOP Unified Packet Structure (0x12) Implemented on 2025-10-15 for temporal 3D DWT with unified preprocessing. -Updated on 2025-10-17 to include canvas expansion margins. This packet contains multiple frames encoded as a single spacetime block for optimal temporal compression. @@ -1084,6 +1083,7 @@ temporal compression. 
### Unified Block Data Format The entire GOP (width×height×N_frames×3_channels) is preprocessed as a single block: + uint8 Y Significance Maps[(width*height + 7) / 8 * GOP Size] // All Y frames concatenated uint8 Co Significance Maps[(width*height + 7) / 8 * GOP Size] // All Co frames concatenated uint8 Cg Significance Maps[(width*height + 7) / 8 * GOP Size] // All Cg frames concatenated @@ -1091,28 +1091,17 @@ The entire GOP (width×height×N_frames×3_channels) is preprocessed as a single int16 Co Non-zero Values[variable length] // All Co non-zero coefficients int16 Cg Non-zero Values[variable length] // All Cg non-zero coefficients + + uint32 EZBC Size for Y + * EZBC Structure for Y + uint32 EZBC Size for Co + * EZBC Structure for Co + uint32 EZBC Size for Cg + * EZBC Structure for Cg + This layout enables Zstd to find patterns across both spatial and temporal dimensions, resulting in superior compression compared to per-frame encoding. -### Canvas Expansion for Motion Compensation -When frames in a GOP have camera motion, they must be aligned before temporal DWT. -However, alignment creates "gaps" at frame edges. To preserve ALL original pixels: - -1. **Calculate motion range**: Determine the total shift range across all GOP frames - - Example: If frames shift by ±3 pixels horizontally, total range = 6 pixels -2. **Expand canvas**: Create a larger canvas = original_size + margin - - Canvas width = header.width + margin_left + margin_right - - Canvas height = header.height + margin_top + margin_bottom -3. **Place aligned frames**: Each frame is positioned on the expanded canvas - - All original pixels from all frames are preserved - - No artificial padding or cropping occurs -4. **Encode expanded canvas**: Apply 3D DWT to the larger canvas dimensions -5. **Store margins**: 4 bytes (L/R/T/B) tell decoder the canvas expansion -6. 
**Decoder extraction**: Decoder extracts display region for each frame based on - motion vectors and margins - -This approach ensures lossless preservation of original video content during GOP encoding. - ### Motion Vectors - Stored in 1/16-pixel units (divide by 16.0 for pixel displacement) - Used for global motion compensation (camera movement, scene translation) diff --git a/video_encoder/encoder_tav.c b/video_encoder/encoder_tav.c index 8dcd022..92789fc 100644 --- a/video_encoder/encoder_tav.c +++ b/video_encoder/encoder_tav.c @@ -121,7 +121,11 @@ static int needs_alpha_channel(int channel_layout) { #define DEFAULT_ZSTD_LEVEL 3 #define DEFAULT_PCM_ZSTD_LEVEL 3 #define TEMPORAL_GOP_SIZE 20 +#define TEMPORAL_GOP_SIZE_MIN 8 // Minimum GOP size to avoid decoder hiccups #define TEMPORAL_DECOMP_LEVEL 2 + +#define SCENE_CHANGE_THRESHOLD_SOFT 0.6 +#define SCENE_CHANGE_THRESHOLD_HARD 0.8 #define MOTION_THRESHOLD 24.0f // Flush if motion exceeds 24 pixels in any direction // Audio/subtitle constants (reused from TEV) @@ -1897,7 +1901,7 @@ typedef struct tav_encoder_s { // Extended header support char *ffmpeg_version; // FFmpeg version string - uint64_t creation_time_ns; // Creation time in nanoseconds since UNIX epoch + uint64_t creation_time_us; // Creation time in microseconds since UNIX epoch long extended_header_offset; // File offset of extended header for ENDT update } tav_encoder_t; @@ -2267,7 +2271,7 @@ static void show_usage(const char *program_name) { printf(" -a, --arate N MP2 audio bitrate in kbps (overrides quality-based audio rate)\n"); printf(" Valid values: 32, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320, 384\n"); printf(" --separate-audio-track Write entire MP2 file as single packet 0x40 (instead of interleaved)\n"); - printf(" --pcm8-audio Use 8-bit PCM audio (packet 0x21, zstd compressed, per-frame packets)\n"); + printf(" --pcm8-audio Use 8-bit PCM audio instead of MP2 (TSVM native audio format)\n"); printf(" -S, --subtitles FILE SubRip 
(.srt) or SAMI (.smi) subtitle file\n"); printf(" --fontrom-lo FILE Low font ROM file for internationalised subtitles\n"); printf(" --fontrom-hi FILE High font ROM file for internationalised subtitles\n"); @@ -4063,7 +4067,7 @@ static size_t encode_pframe_residual(tav_encoder_t *enc, int qY) { if (enc->enable_ezbc) { // EZBC mode: Quantize with perceptual weighting but no normalization (division by quantizer) // EZBC will compress by encoding only significant bitplanes - fprintf(stderr, "[EZBC-QUANT-PFRAME] Using perceptual quantization without normalization\n"); +// fprintf(stderr, "[EZBC-QUANT-PFRAME] Using perceptual quantization without normalization\n"); quantise_dwt_coefficients_perceptual_per_coeff_no_normalisation(enc, residual_y_dwt, quantised_y, frame_size, qY, enc->width, enc->height, enc->decomp_levels, 0, 0); @@ -4081,7 +4085,7 @@ static size_t encode_pframe_residual(tav_encoder_t *enc, int qY) { if (abs(quantised_co[i]) > max_co) max_co = abs(quantised_co[i]); if (abs(quantised_cg[i]) > max_cg) max_cg = abs(quantised_cg[i]); } - fprintf(stderr, "[EZBC-QUANT-PFRAME] Quantized coeff max: Y=%d, Co=%d, Cg=%d\n", max_y, max_co, max_cg); +// fprintf(stderr, "[EZBC-QUANT-PFRAME] Quantized coeff max: Y=%d, Co=%d, Cg=%d\n", max_y, max_co, max_cg); } else { // Twobit-map mode: Use traditional quantization quantise_dwt_coefficients_perceptual_per_coeff(enc, residual_y_dwt, quantised_y, frame_size, @@ -5396,9 +5400,84 @@ static size_t gop_process_and_flush(tav_encoder_t *enc, FILE *output, int base_q // Trim GOP if scene change detected if (scene_change_frame > 0) { actual_gop_size = scene_change_frame; - if (enc->verbose) { - printf("Trimming GOP from %d to %d frames due to scene change\n", - enc->temporal_gop_frame_count, actual_gop_size); + + // If trimmed GOP would be too small, encode as separate I-frames instead + if (actual_gop_size < TEMPORAL_GOP_SIZE_MIN) { + if (enc->verbose) { + printf("Scene change at frame %d would create GOP of %d frames (< %d), 
encoding as I-frames instead\n", + frame_numbers[scene_change_frame], actual_gop_size, TEMPORAL_GOP_SIZE_MIN); + } + + // Encode each frame before scene change as separate I-frame + size_t total_bytes = 0; + int original_gop_frame_count = enc->temporal_gop_frame_count; + + for (int i = 0; i < actual_gop_size; i++) { + // Temporarily set up single-frame GOP + uint8_t *saved_rgb_frame0 = enc->temporal_gop_rgb_frames[0]; + float *saved_y_frame0 = enc->temporal_gop_y_frames[0]; + float *saved_co_frame0 = enc->temporal_gop_co_frames[0]; + float *saved_cg_frame0 = enc->temporal_gop_cg_frames[0]; + + // Set up single-frame GOP by moving frame i to position 0 + enc->temporal_gop_rgb_frames[0] = enc->temporal_gop_rgb_frames[i]; + enc->temporal_gop_y_frames[0] = enc->temporal_gop_y_frames[i]; + enc->temporal_gop_co_frames[0] = enc->temporal_gop_co_frames[i]; + enc->temporal_gop_cg_frames[0] = enc->temporal_gop_cg_frames[i]; + enc->temporal_gop_frame_count = 1; + + // Encode as I-frame + size_t bytes = gop_flush(enc, output, base_quantiser, &frame_numbers[i], 1); + if (bytes == 0) { + fprintf(stderr, "Error: Failed to encode I-frame during GOP trimming\n"); + enc->temporal_gop_frame_count = original_gop_frame_count; + return 0; + } + total_bytes += bytes; + + // Restore position 0 (but keep frame i in place for the shift operation below) + enc->temporal_gop_rgb_frames[0] = saved_rgb_frame0; + enc->temporal_gop_y_frames[0] = saved_y_frame0; + enc->temporal_gop_co_frames[0] = saved_co_frame0; + enc->temporal_gop_cg_frames[0] = saved_cg_frame0; + } + + // Restore original frame count + enc->temporal_gop_frame_count = original_gop_frame_count; + + // Shift remaining frames (after scene change) to start of buffer + int remaining_frames = original_gop_frame_count - scene_change_frame; + for (int i = 0; i < remaining_frames; i++) { + int src = scene_change_frame + i; + // Swap pointers + uint8_t *temp_rgb = enc->temporal_gop_rgb_frames[i]; + float *temp_y = 
enc->temporal_gop_y_frames[i]; + float *temp_co = enc->temporal_gop_co_frames[i]; + float *temp_cg = enc->temporal_gop_cg_frames[i]; + + enc->temporal_gop_rgb_frames[i] = enc->temporal_gop_rgb_frames[src]; + enc->temporal_gop_y_frames[i] = enc->temporal_gop_y_frames[src]; + enc->temporal_gop_co_frames[i] = enc->temporal_gop_co_frames[src]; + enc->temporal_gop_cg_frames[i] = enc->temporal_gop_cg_frames[src]; + + enc->temporal_gop_rgb_frames[src] = temp_rgb; + enc->temporal_gop_y_frames[src] = temp_y; + enc->temporal_gop_co_frames[src] = temp_co; + enc->temporal_gop_cg_frames[src] = temp_cg; + + enc->temporal_gop_translation_x[i] = enc->temporal_gop_translation_x[src]; + enc->temporal_gop_translation_y[i] = enc->temporal_gop_translation_y[src]; + } + enc->temporal_gop_frame_count = remaining_frames; + + return total_bytes; + + } else { + // GOP large enough after trimming - proceed normally + if (enc->verbose) { + printf("Trimming GOP from %d to %d frames due to scene change\n", + enc->temporal_gop_frame_count, actual_gop_size); + } } } @@ -7017,7 +7096,7 @@ static size_t serialise_tile_data(tav_encoder_t *enc, int tile_x, int tile_y, // INTRA mode: quantise coefficients directly and store for future reference if (enc->enable_ezbc) { // EZBC mode: Quantize with perceptual weighting but no normalization (division by quantizer) - fprintf(stderr, "[EZBC-QUANT-INTRA] Using perceptual quantization without normalization\n"); +// fprintf(stderr, "[EZBC-QUANT-INTRA] Using perceptual quantization without normalization\n"); quantise_dwt_coefficients_perceptual_per_coeff_no_normalisation(enc, (float*)tile_y_data, quantised_y, tile_size, this_frame_qY, enc->width, enc->height, enc->decomp_levels, 0, enc->frame_count); quantise_dwt_coefficients_perceptual_per_coeff_no_normalisation(enc, (float*)tile_co_data, quantised_co, tile_size, this_frame_qCo, enc->width, enc->height, enc->decomp_levels, 1, enc->frame_count); 
quantise_dwt_coefficients_perceptual_per_coeff_no_normalisation(enc, (float*)tile_cg_data, quantised_cg, tile_size, this_frame_qCg, enc->width, enc->height, enc->decomp_levels, 1, enc->frame_count); @@ -7029,7 +7108,7 @@ static size_t serialise_tile_data(tav_encoder_t *enc, int tile_x, int tile_y, if (abs(quantised_co[i]) > max_co) max_co = abs(quantised_co[i]); if (abs(quantised_cg[i]) > max_cg) max_cg = abs(quantised_cg[i]); } - fprintf(stderr, "[EZBC-QUANT-INTRA] Quantized coeff max: Y=%d, Co=%d, Cg=%d\n", max_y, max_co, max_cg); +// fprintf(stderr, "[EZBC-QUANT-INTRA] Quantized coeff max: Y=%d, Co=%d, Cg=%d\n", max_y, max_co, max_cg); } else if (enc->perceptual_tuning) { // Perceptual quantisation: EXACTLY like uniform but with per-coefficient weights quantise_dwt_coefficients_perceptual_per_coeff(enc, (float*)tile_y_data, quantised_y, tile_size, this_frame_qY, enc->width, enc->height, enc->decomp_levels, 0, enc->frame_count); @@ -8627,7 +8706,7 @@ static long write_extended_header(tav_encoder_t *enc) { WRITE_KV_UINT64("ENDT", 0ULL); // CDAT: Creation time in nanoseconds since UNIX epoch - WRITE_KV_UINT64("CDAT", enc->creation_time_ns); + WRITE_KV_UINT64("CDAT", enc->creation_time_us); // VNDR: Encoder name and version const char *vendor_str = ENCODER_VENDOR_STRING; @@ -9157,15 +9236,13 @@ static int detect_scene_change_between_frames( if (out_avg_diff) *out_avg_diff = avg_diff; if (out_changed_ratio) *out_changed_ratio = changed_ratio; - // Scene change threshold - double threshold = 0.50; - - return changed_ratio > threshold; + return changed_ratio > SCENE_CHANGE_THRESHOLD_SOFT; } // Wrapper for normal mode: compare current frame with previous frame -static int detect_scene_change(tav_encoder_t *enc) { +static int detect_scene_change(tav_encoder_t *enc, double *out_changed_ratio) { if (!enc->current_frame_rgb || enc->intra_only) { + if (out_changed_ratio) *out_changed_ratio = 0.0; return 0; // No current frame to compare } @@ -9179,6 +9256,8 @@ static int 
detect_scene_change(tav_encoder_t *enc) { &changed_ratio ); + if (out_changed_ratio) *out_changed_ratio = changed_ratio; + if (is_scene_change) { printf("Scene change detection: avg_diff=%.2f\tchanged_ratio=%.4f\n", avg_diff, changed_ratio); } @@ -9364,6 +9443,9 @@ int main(int argc, char *argv[]) { {"ezbc", no_argument, 0, 1025}, {"separate-audio-track", no_argument, 0, 1026}, {"pcm8-audio", no_argument, 0, 1027}, + {"pcm-audio", no_argument, 0, 1027}, + {"native-audio", no_argument, 0, 1027}, + {"native-audio-format", no_argument, 0, 1027}, {"help", no_argument, 0, '?'}, {0, 0, 0, 0} }; @@ -9478,6 +9560,7 @@ int main(int argc, char *argv[]) { break; case 1006: // --intra-only enc->intra_only = 1; + enc->enable_temporal_dwt = 0; break; case 1007: // --no-perceptual-tuning enc->perceptual_tuning = 0; @@ -9518,6 +9601,7 @@ int main(int argc, char *argv[]) { break; case 1017: // --enable-delta enc->use_delta_encoding = 1; + enc->enable_temporal_dwt = 0; break; case 1018: // --delta-haar enc->delta_haar_levels = CLAMP(atoi(optarg), 0, 6); @@ -9697,7 +9781,7 @@ int main(int argc, char *argv[]) { enc->ffmpeg_version = get_ffmpeg_version(); struct timeval tv; gettimeofday(&tv, NULL); - enc->creation_time_ns = (uint64_t)tv.tv_sec * 1000000000ULL + (uint64_t)tv.tv_usec * 1000ULL; + enc->creation_time_us = (uint64_t)tv.tv_sec * 1000000ULL + (uint64_t)tv.tv_usec * 1ULL; // Start FFmpeg process for video input (using TEV-compatible filtergraphs) if (enc->test_mode) { @@ -9862,7 +9946,8 @@ int main(int argc, char *argv[]) { } // Determine frame type - int is_scene_change = detect_scene_change(enc); + double scene_change_ratio = 0.0; + int is_scene_change = detect_scene_change(enc, &scene_change_ratio); int is_time_keyframe = (frame_count % TEMPORAL_GOP_SIZE) == 0; // Check if we can use SKIP mode (DWT coefficient-based detection) @@ -9926,6 +10011,109 @@ int main(int argc, char *argv[]) { if (enc->enable_temporal_dwt) { // GOP-based temporal 3D DWT encoding path + + // 
Two-tier scene change handling: + // - Hard scene change (ratio >= SCENE_CHANGE_THRESHOLD_HARD, i.e. 0.8): Force I-frames for current GOP, then flush + // - Soft scene change (SCENE_CHANGE_THRESHOLD_SOFT < ratio < SCENE_CHANGE_THRESHOLD_HARD, i.e. 0.6 < ratio < 0.8): Only flush if GOP >= TEMPORAL_GOP_SIZE_MIN (8) frames (enforce minimum GOP size) + // - No scene change (ratio <= SCENE_CHANGE_THRESHOLD_SOFT, i.e. 0.6): Don't flush + + int should_flush_scene_change = 0; + int force_iframes_for_scene_change = 0; + + if (is_scene_change && enc->temporal_gop_frame_count > 0) { + + if (scene_change_ratio >= SCENE_CHANGE_THRESHOLD_HARD) { + // Hard scene change: Force current GOP to be I-frames, then flush immediately + should_flush_scene_change = 1; + force_iframes_for_scene_change = 1; + if (enc->verbose) { + printf("Hard scene change (ratio=%.4f) at frame %d, forcing I-frames and flushing GOP...\n", + scene_change_ratio, frame_count); + } + } else if (enc->temporal_gop_frame_count >= TEMPORAL_GOP_SIZE_MIN) { + // Soft scene change with sufficient GOP size: Flush normally + should_flush_scene_change = 1; + if (enc->verbose) { + printf("Soft scene change (ratio=%.4f) at frame %d with GOP size %d >= %d, flushing GOP...\n", + scene_change_ratio, frame_count, enc->temporal_gop_frame_count, TEMPORAL_GOP_SIZE_MIN); + } + } else { + // Soft scene change with small GOP: Ignore to enforce minimum GOP size + if (enc->verbose) { + printf("Soft scene change (ratio=%.4f) at frame %d ignored (GOP size %d < %d)\n", + scene_change_ratio, frame_count, enc->temporal_gop_frame_count, TEMPORAL_GOP_SIZE_MIN); + } + } + } + + if (should_flush_scene_change) { + // Get quantiser + int qY = enc->bitrate_mode ? 
quantiser_float_to_int_dithered(enc) : enc->quantiser_y; + + if (force_iframes_for_scene_change) { + // Hard scene change: Encode each frame in GOP as separate I-frame (GOP size = 1) + // This ensures clean cut at major scene transitions + size_t total_bytes = 0; + int original_gop_frame_count = enc->temporal_gop_frame_count; + + for (int i = 0; i < original_gop_frame_count; i++) { + // Temporarily set up GOP to contain only this single frame + // Save position 0 pointers + uint8_t *saved_rgb_frame0 = enc->temporal_gop_rgb_frames[0]; + float *saved_y_frame0 = enc->temporal_gop_y_frames[0]; + float *saved_co_frame0 = enc->temporal_gop_co_frames[0]; + float *saved_cg_frame0 = enc->temporal_gop_cg_frames[0]; + + // Set up single-frame GOP by moving frame i to position 0 + enc->temporal_gop_rgb_frames[0] = enc->temporal_gop_rgb_frames[i]; + enc->temporal_gop_y_frames[0] = enc->temporal_gop_y_frames[i]; + enc->temporal_gop_co_frames[0] = enc->temporal_gop_co_frames[i]; + enc->temporal_gop_cg_frames[0] = enc->temporal_gop_cg_frames[i]; + enc->temporal_gop_frame_count = 1; + + // Encode single frame as I-frame (GOP size 1) + int frame_num = frame_count - original_gop_frame_count + i; + size_t bytes = gop_flush(enc, enc->output_fp, qY, &frame_num, 1); + + if (bytes == 0) { + fprintf(stderr, "Error: Failed to encode I-frame %d during hard scene change\n", frame_num); + enc->temporal_gop_frame_count = original_gop_frame_count; + break; + } + total_bytes += bytes; + + // Restore position 0 pointers + enc->temporal_gop_rgb_frames[0] = saved_rgb_frame0; + enc->temporal_gop_y_frames[0] = saved_y_frame0; + enc->temporal_gop_co_frames[0] = saved_co_frame0; + enc->temporal_gop_cg_frames[0] = saved_cg_frame0; + } + + // Restore original frame count + enc->temporal_gop_frame_count = original_gop_frame_count; + packet_size = total_bytes; + + } else { + // Soft scene change: Flush GOP normally as temporal GOP + int *gop_frame_numbers = malloc(enc->temporal_gop_frame_count * 
sizeof(int)); + for (int i = 0; i < enc->temporal_gop_frame_count; i++) { + gop_frame_numbers[i] = frame_count - enc->temporal_gop_frame_count + i; + } + + packet_size = gop_process_and_flush(enc, enc->output_fp, qY, + gop_frame_numbers, 1); + free(gop_frame_numbers); + } + + if (packet_size == 0) { + fprintf(stderr, "Error: Failed to flush GOP before scene change at frame %d\n", frame_count); + break; + } + + gop_reset(enc); + } + + // Now add current frame to GOP (will be first frame of new GOP if scene change) int add_result = temporal_gop_add_frame(enc, enc->current_frame_rgb, enc->current_frame_y, enc->current_frame_co, enc->current_frame_cg); @@ -9934,7 +10122,7 @@ int main(int argc, char *argv[]) { break; } - // Check if GOP should be flushed + // Check if GOP should be flushed (after adding frame) int should_flush = 0; int force_flush = 0; @@ -9945,23 +10133,24 @@ int main(int argc, char *argv[]) { printf("GOP buffer full (%d frames), flushing...\n", enc->temporal_gop_frame_count); } } - // Flush if large motion detected (breaks temporal coherence) - else if (gop_should_flush_motion(enc)) { + // Flush if large motion detected (breaks temporal coherence) AND GOP is large enough + else if (gop_should_flush_motion(enc) && enc->temporal_gop_frame_count >= TEMPORAL_GOP_SIZE_MIN) { should_flush = 1; if (enc->verbose) { - printf("Large motion detected (>24 pixels), flushing GOP early...\n"); + printf("Large motion detected (>24 pixels) with GOP size %d >= %d, flushing GOP early...\n", + enc->temporal_gop_frame_count, TEMPORAL_GOP_SIZE_MIN); } } - // Flush if scene change detected - else if (is_scene_change && enc->temporal_gop_frame_count > 1) { - should_flush = 1; - force_flush = 1; // Skip internal scene change detection (already detected) + else if (gop_should_flush_motion(enc) && enc->temporal_gop_frame_count < TEMPORAL_GOP_SIZE_MIN) { + // Large motion but GOP too small - keep accumulating if (enc->verbose) { - printf("Scene change detected, flushing GOP 
early...\n"); + printf("Large motion detected but GOP size %d < %d, continuing to accumulate...\n", + enc->temporal_gop_frame_count, TEMPORAL_GOP_SIZE_MIN); } } + // Note: Scene change flush is now handled BEFORE adding frame (above) - // Flush GOP if needed + // Flush GOP if needed (for reasons other than scene change) if (should_flush) { // Build frame number array for this GOP int *gop_frame_numbers = malloc(enc->temporal_gop_frame_count * sizeof(int)); @@ -9982,9 +10171,10 @@ int main(int argc, char *argv[]) { fprintf(stderr, "Error: Failed to flush GOP at frame %d\n", frame_count); break; } - } else { + } else if (packet_size == 0) { // Frame added to GOP buffer but not flushed yet // Skip normal packet processing (no packet written yet) + // Note: packet_size might already be > 0 from scene change flush above packet_size = 0; } } else if (enc->enable_residual_coding) { diff --git a/video_encoder/tav_inspector.c b/video_encoder/tav_inspector.c index 0b10a06..c2b1c0f 100644 --- a/video_encoder/tav_inspector.c +++ b/video_encoder/tav_inspector.c @@ -260,7 +260,7 @@ void print_extended_header(FILE *fp, int verbose) { if (verbose) { if (strcmp(key, "CDAT") == 0) { - time_t time_sec = value / 1000000000ULL; + time_t time_sec = value / 1000000ULL; // microseconds struct tm *time_info = gmtime(&time_sec); if (time_info) { char time_str[64]; @@ -268,7 +268,7 @@ void print_extended_header(FILE *fp, int verbose) { printf("%s", time_str); } } else { - printf("%.6f seconds", value / 1000000000.0); + printf("%.6f seconds", value / 1000000000.0); // nanoseconds } } } else if (value_type == 0x10) { // Bytes