TAV: minimal size for GOP

This commit is contained in:
minjaesong
2025-10-23 00:38:12 +09:00
parent 7f7222fe54
commit 34427d61d7
4 changed files with 344 additions and 60 deletions

View File

@@ -1143,6 +1143,9 @@ try {
startTime: 0, startTime: 0,
timeRemaining: 0 timeRemaining: 0
} }
if (interactive) {
console.log(`[GOP] Buffered GOP ${gopSize} frames to ready slot during first GOP decode`)
}
} else if (decodingGopData === null) { } else if (decodingGopData === null) {
// Buffer as decoding GOP (will decode after ready GOP) // Buffer as decoding GOP (will decode after ready GOP)
const decodingSlot = (currentGopBufferSlot + 2) % BUFFER_SLOTS const decodingSlot = (currentGopBufferSlot + 2) % BUFFER_SLOTS
@@ -1155,8 +1158,20 @@ try {
startTime: 0, startTime: 0,
timeRemaining: 0 timeRemaining: 0
} }
if (interactive) {
console.log(`[GOP] Buffered GOP ${gopSize} frames to decoding slot during first GOP decode`)
}
// CRITICAL: Stop reading packets now that all 3 buffers are full
shouldReadPackets = false
if (interactive) {
console.log(`[GOP] All 3 buffers full during first GOP decode - stopping packet reading`)
}
} else { } else {
// All 3 buffers full - discard this GOP // All 3 buffers full - discard this GOP (shouldn't happen now with gate)
if (interactive) {
console.log(`[GOP] WARNING: All 3 buffers full during first GOP decode - discarding GOP ${gopSize} frames`)
}
sys.free(compressedPtr) sys.free(compressedPtr)
} }
@@ -1180,14 +1195,28 @@ try {
nextOffset nextOffset
) )
// Set async decode tracking variables
asyncDecodeInProgress = true
asyncDecodeSlot = nextSlot
asyncDecodeGopSize = gopSize
asyncDecodePtr = compressedPtr
asyncDecodeStartTime = sys.nanoTime()
readyGopData = { readyGopData = {
gopSize: gopSize, gopSize: gopSize,
slot: nextSlot, slot: nextSlot,
compressedPtr: compressedPtr, compressedPtr: compressedPtr,
startTime: sys.nanoTime(), startTime: asyncDecodeStartTime,
timeRemaining: timeRemaining timeRemaining: timeRemaining
} }
// CRITICAL: Stop reading packets immediately after starting decode
// to prevent next GOP from being discarded in Case 5
shouldReadPackets = false
if (interactive) {
console.log(`[GOP] Case 3: Started decode to ready slot - stopping packet reading`)
}
} else if (currentGopSize > 0 && readyGopData !== null && decodingGopData === null && !asyncDecodeInProgress && graphics.tavDecodeGopIsComplete()) { } else if (currentGopSize > 0 && readyGopData !== null && decodingGopData === null && !asyncDecodeInProgress && graphics.tavDecodeGopIsComplete()) {
// Case 4: GOP playing, ready GOP exists, no decoding GOP, no decode in progress - decode to decoding slot // Case 4: GOP playing, ready GOP exists, no decoding GOP, no decode in progress - decode to decoding slot
const decodingSlot = (currentGopBufferSlot + 2) % BUFFER_SLOTS const decodingSlot = (currentGopBufferSlot + 2) % BUFFER_SLOTS
@@ -1208,16 +1237,33 @@ try {
decodingOffset decodingOffset
) )
// Set async decode tracking variables
asyncDecodeInProgress = true
asyncDecodeSlot = decodingSlot
asyncDecodeGopSize = gopSize
asyncDecodePtr = compressedPtr
asyncDecodeStartTime = sys.nanoTime()
decodingGopData = { decodingGopData = {
gopSize: gopSize, gopSize: gopSize,
slot: decodingSlot, slot: decodingSlot,
compressedPtr: compressedPtr, compressedPtr: compressedPtr,
startTime: sys.nanoTime(), startTime: asyncDecodeStartTime,
timeRemaining: timeRemaining timeRemaining: timeRemaining
} }
// CRITICAL: Stop reading packets immediately after starting decode
// All 3 buffers are now full (playing + ready + decoding)
shouldReadPackets = false
if (interactive) {
console.log(`[GOP] Case 4: Started decode to decoding slot - all buffers full, stopping packet reading`)
}
} else { } else {
// Case 5: All 3 buffers full (playing + ready + decoding) - ignore packet // Case 5: All 3 buffers full (playing + ready + decoding) - ignore packet
if (interactive) {
console.log(`[GOP] Case 5: Discarding GOP ${gopSize} frames (current=${currentGopSize}, ready=${readyGopData !== null}, decoding=${decodingGopData !== null}, asyncInProgress=${asyncDecodeInProgress})`)
}
sys.free(compressedPtr) sys.free(compressedPtr)
} }
} }
@@ -1230,6 +1276,9 @@ try {
// (one GOP playing + ready GOP + decoding GOP) // (one GOP playing + ready GOP + decoding GOP)
if (currentGopSize > 0 && readyGopData !== null && decodingGopData !== null) { if (currentGopSize > 0 && readyGopData !== null && decodingGopData !== null) {
shouldReadPackets = false shouldReadPackets = false
if (interactive) {
console.log(`[GOP] All 3 buffers full - stopping packet reading`)
}
} }
} }
else if (packetType === TAV_PACKET_AUDIO_BUNDLED) { else if (packetType === TAV_PACKET_AUDIO_BUNDLED) {
@@ -1433,8 +1482,18 @@ try {
// Set first frame time to NOW // Set first frame time to NOW
nextFrameTime = sys.nanoTime() nextFrameTime = sys.nanoTime()
// Resume packet reading to get next GOP (only one buffer occupied now) // Resume packet reading only if not all 3 buffers are full
shouldReadPackets = true // (might have buffered GOP 2 and 3 during GOP 1 decode)
if (!(currentGopSize > 0 && readyGopData !== null && decodingGopData !== null)) {
shouldReadPackets = true
if (interactive) {
console.log(`[GOP] First GOP ready - resuming packet reading (ready=${readyGopData !== null}, decoding=${decodingGopData !== null})`)
}
} else {
if (interactive) {
console.log(`[GOP] First GOP ready - all 3 buffers full, keeping packet reading paused`)
}
}
// if (interactive) { // if (interactive) {
// console.log(`[GOP] First GOP ready (slot ${asyncDecodeSlot}, ${asyncDecodeGopSize} frames) in ${decodeTime.toFixed(1)}ms - starting playback`) // console.log(`[GOP] First GOP ready (slot ${asyncDecodeSlot}, ${asyncDecodeGopSize} frames) in ${decodeTime.toFixed(1)}ms - starting playback`)
@@ -1461,13 +1520,30 @@ try {
readyGopData.slot * SLOT_SIZE readyGopData.slot * SLOT_SIZE
) )
// CRITICAL FIX: Set async decode tracking variables so decode is properly tracked
asyncDecodeInProgress = true
asyncDecodeSlot = readyGopData.slot
asyncDecodeGopSize = readyGopData.gopSize
asyncDecodePtr = readyGopData.compressedPtr
asyncDecodeStartTime = sys.nanoTime()
readyGopData.needsDecode = false readyGopData.needsDecode = false
readyGopData.startTime = sys.nanoTime() readyGopData.startTime = asyncDecodeStartTime
readyGopData.timeRemaining = timeRemaining readyGopData.timeRemaining = timeRemaining
if (interactive) {
console.log(`[GOP] Started decode of buffered GOP ${readyGopData.gopSize} frames (slot ${readyGopData.slot})`)
}
} }
} }
} }
// Fire audio on first frame
if (!audioFired) {
audio.play(0)
audioFired = true
}
// Step 2 & 3: Display current GOP frame if it's time // Step 2 & 3: Display current GOP frame if it's time
if (!paused && currentGopSize > 0 && currentGopFrameIndex < currentGopSize) { if (!paused && currentGopSize > 0 && currentGopFrameIndex < currentGopSize) {
// Spin-wait for next frame time // Spin-wait for next frame time
@@ -1483,6 +1559,10 @@ try {
graphics.uploadVideoBufferFrameToFramebuffer(currentGopFrameIndex, header.width, header.height, trueFrameCount, bufferOffset) graphics.uploadVideoBufferFrameToFramebuffer(currentGopFrameIndex, header.width, header.height, trueFrameCount, bufferOffset)
uploadTime = (sys.nanoTime() - uploadStart) / 1000000.0 uploadTime = (sys.nanoTime() - uploadStart) / 1000000.0
if (interactive && currentGopFrameIndex === 0) {
console.log(`[GOP] Playing GOP: ${currentGopSize} frames from slot ${currentGopBufferSlot}`)
}
// Apply bias lighting // Apply bias lighting
let biasStart = sys.nanoTime() let biasStart = sys.nanoTime()
if (currentGopFrameIndex === 0 || currentGopFrameIndex === currentGopSize - 1) { if (currentGopFrameIndex === 0 || currentGopFrameIndex === currentGopSize - 1) {
@@ -1531,9 +1611,20 @@ try {
decodingGopData.slot * SLOT_SIZE decodingGopData.slot * SLOT_SIZE
) )
// CRITICAL FIX: Set async decode tracking variables so decode is properly tracked
asyncDecodeInProgress = true
asyncDecodeSlot = decodingGopData.slot
asyncDecodeGopSize = decodingGopData.gopSize
asyncDecodePtr = decodingGopData.compressedPtr
asyncDecodeStartTime = sys.nanoTime()
decodingGopData.needsDecode = false decodingGopData.needsDecode = false
decodingGopData.startTime = sys.nanoTime() decodingGopData.startTime = asyncDecodeStartTime
decodingGopData.timeRemaining = timeRemaining decodingGopData.timeRemaining = timeRemaining
if (interactive) {
console.log(`[GOP] Started decode of buffered GOP ${decodingGopData.gopSize} frames from decoding slot (slot ${decodingGopData.slot})`)
}
} }
// Schedule next frame // Schedule next frame
@@ -1543,6 +1634,9 @@ try {
// Step 4-7: GOP finished? Transition to ready GOP (triple-buffering) // Step 4-7: GOP finished? Transition to ready GOP (triple-buffering)
if (!paused && currentGopSize > 0 && currentGopFrameIndex >= currentGopSize) { if (!paused && currentGopSize > 0 && currentGopFrameIndex >= currentGopSize) {
if (interactive) {
console.log(`[GOP] GOP finished: played ${currentGopFrameIndex}/${currentGopSize} frames from slot ${currentGopBufferSlot}`)
}
if (readyGopData !== null) { if (readyGopData !== null) {
// If ready GOP still needs decode, start it now (defensive - should already be started) // If ready GOP still needs decode, start it now (defensive - should already be started)
if (readyGopData.needsDecode) { if (readyGopData.needsDecode) {
@@ -1581,8 +1675,19 @@ try {
readyGopData = decodingGopData readyGopData = decodingGopData
decodingGopData = null decodingGopData = null
// CRITICAL: Only clear async decode tracking if NO decode is in progress
// (the promoted readyGop might be decoding from Case 4)
if (graphics.tavDecodeGopIsComplete()) {
asyncDecodeInProgress = false
asyncDecodePtr = 0
asyncDecodeGopSize = 0
}
// Resume packet reading now that one buffer is free (decoding slot available) // Resume packet reading now that one buffer is free (decoding slot available)
shouldReadPackets = true shouldReadPackets = true
if (interactive) {
console.log(`[GOP] Transition complete - resuming packet reading (asyncInProgress=${asyncDecodeInProgress})`)
}
} }
} else { } else {
// No ready GOP available - hiccup (shouldn't happen with triple-buffering) // No ready GOP available - hiccup (shouldn't happen with triple-buffering)

View File

@@ -1038,9 +1038,9 @@ transmission capability, and region-of-interest coding.
type_t Value type_t Value
### List of Keys ### List of Keys
- Uint64 BGNT: Video begin time (must be equal to the value of the first Timecode packet) - Uint64 BGNT: Video begin time in nanoseconds (must be equal to the value of the first Timecode packet)
- Uint64 ENDT: Video end time (must be equal to the value of the last Timecode packet) - Uint64 ENDT: Video end time in nanoseconds (must be equal to the value of the last Timecode packet)
- Uint64 CDAT: Creation time in nanoseconds since UNIX Epoch (must be in UTC timezone) - Uint64 CDAT: Creation time in microseconds since UNIX Epoch (must be in UTC timezone)
- Bytes VNDR: Name and version of the encoder (for Reference encoder: "Encoder-TAV 20251014 (list,of,features)") - Bytes VNDR: Name and version of the encoder (for Reference encoder: "Encoder-TAV 20251014 (list,of,features)")
- Bytes FMPG: FFmpeg version (typically "ffmpeg version 8.0 Copyright (c) 2000-2025 the FFmpeg developers"; the first line of text FFmpeg emits) - Bytes FMPG: FFmpeg version (typically "ffmpeg version 8.0 Copyright (c) 2000-2025 the FFmpeg developers"; the first line of text FFmpeg emits)
@@ -1067,7 +1067,6 @@ transmission capability, and region-of-interest coding.
## GOP Unified Packet Structure (0x12) ## GOP Unified Packet Structure (0x12)
Implemented on 2025-10-15 for temporal 3D DWT with unified preprocessing. Implemented on 2025-10-15 for temporal 3D DWT with unified preprocessing.
Updated on 2025-10-17 to include canvas expansion margins.
This packet contains multiple frames encoded as a single spacetime block for optimal This packet contains multiple frames encoded as a single spacetime block for optimal
temporal compression. temporal compression.
@@ -1084,6 +1083,7 @@ temporal compression.
### Unified Block Data Format ### Unified Block Data Format
The entire GOP (width×height×N_frames×3_channels) is preprocessed as a single block: The entire GOP (width×height×N_frames×3_channels) is preprocessed as a single block:
<if significance maps are used>
uint8 Y Significance Maps[(width*height + 7) / 8 * GOP Size] // All Y frames concatenated uint8 Y Significance Maps[(width*height + 7) / 8 * GOP Size] // All Y frames concatenated
uint8 Co Significance Maps[(width*height + 7) / 8 * GOP Size] // All Co frames concatenated uint8 Co Significance Maps[(width*height + 7) / 8 * GOP Size] // All Co frames concatenated
uint8 Cg Significance Maps[(width*height + 7) / 8 * GOP Size] // All Cg frames concatenated uint8 Cg Significance Maps[(width*height + 7) / 8 * GOP Size] // All Cg frames concatenated
@@ -1091,28 +1091,17 @@ The entire GOP (width×height×N_frames×3_channels) is preprocessed as a single
int16 Co Non-zero Values[variable length] // All Co non-zero coefficients int16 Co Non-zero Values[variable length] // All Co non-zero coefficients
int16 Cg Non-zero Values[variable length] // All Cg non-zero coefficients int16 Cg Non-zero Values[variable length] // All Cg non-zero coefficients
<if EZBC is used>
uint32 EZBC Size for Y
* EZBC Structure for Y
uint32 EZBC Size for Co
* EZBC Structure for Co
uint32 EZBC Size for Cg
* EZBC Structure for Cg
This layout enables Zstd to find patterns across both spatial and temporal dimensions, This layout enables Zstd to find patterns across both spatial and temporal dimensions,
resulting in superior compression compared to per-frame encoding. resulting in superior compression compared to per-frame encoding.
### Canvas Expansion for Motion Compensation
When frames in a GOP have camera motion, they must be aligned before temporal DWT.
However, alignment creates "gaps" at frame edges. To preserve ALL original pixels:
1. **Calculate motion range**: Determine the total shift range across all GOP frames
- Example: If frames shift by ±3 pixels horizontally, total range = 6 pixels
2. **Expand canvas**: Create a larger canvas = original_size + margin
- Canvas width = header.width + margin_left + margin_right
- Canvas height = header.height + margin_top + margin_bottom
3. **Place aligned frames**: Each frame is positioned on the expanded canvas
- All original pixels from all frames are preserved
- No artificial padding or cropping occurs
4. **Encode expanded canvas**: Apply 3D DWT to the larger canvas dimensions
5. **Store margins**: 4 bytes (L/R/T/B) tell decoder the canvas expansion
6. **Decoder extraction**: Decoder extracts display region for each frame based on
motion vectors and margins
This approach ensures lossless preservation of original video content during GOP encoding.
### Motion Vectors ### Motion Vectors
- Stored in 1/16-pixel units (divide by 16.0 for pixel displacement) - Stored in 1/16-pixel units (divide by 16.0 for pixel displacement)
- Used for global motion compensation (camera movement, scene translation) - Used for global motion compensation (camera movement, scene translation)

View File

@@ -121,7 +121,11 @@ static int needs_alpha_channel(int channel_layout) {
#define DEFAULT_ZSTD_LEVEL 3 #define DEFAULT_ZSTD_LEVEL 3
#define DEFAULT_PCM_ZSTD_LEVEL 3 #define DEFAULT_PCM_ZSTD_LEVEL 3
#define TEMPORAL_GOP_SIZE 20 #define TEMPORAL_GOP_SIZE 20
#define TEMPORAL_GOP_SIZE_MIN 8 // Minimum GOP size to avoid decoder hiccups
#define TEMPORAL_DECOMP_LEVEL 2 #define TEMPORAL_DECOMP_LEVEL 2
#define SCENE_CHANGE_THRESHOLD_SOFT 0.6
#define SCENE_CHANGE_THRESHOLD_HARD 0.8
#define MOTION_THRESHOLD 24.0f // Flush if motion exceeds 24 pixels in any direction #define MOTION_THRESHOLD 24.0f // Flush if motion exceeds 24 pixels in any direction
// Audio/subtitle constants (reused from TEV) // Audio/subtitle constants (reused from TEV)
@@ -1897,7 +1901,7 @@ typedef struct tav_encoder_s {
// Extended header support // Extended header support
char *ffmpeg_version; // FFmpeg version string char *ffmpeg_version; // FFmpeg version string
uint64_t creation_time_ns; // Creation time in nanoseconds since UNIX epoch uint64_t creation_time_us; // Creation time in microseconds since UNIX epoch
long extended_header_offset; // File offset of extended header for ENDT update long extended_header_offset; // File offset of extended header for ENDT update
} tav_encoder_t; } tav_encoder_t;
@@ -2267,7 +2271,7 @@ static void show_usage(const char *program_name) {
printf(" -a, --arate N MP2 audio bitrate in kbps (overrides quality-based audio rate)\n"); printf(" -a, --arate N MP2 audio bitrate in kbps (overrides quality-based audio rate)\n");
printf(" Valid values: 32, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320, 384\n"); printf(" Valid values: 32, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320, 384\n");
printf(" --separate-audio-track Write entire MP2 file as single packet 0x40 (instead of interleaved)\n"); printf(" --separate-audio-track Write entire MP2 file as single packet 0x40 (instead of interleaved)\n");
printf(" --pcm8-audio Use 8-bit PCM audio (packet 0x21, zstd compressed, per-frame packets)\n"); printf(" --pcm8-audio Use 8-bit PCM audio instead of MP2 (TSVM native audio format)\n");
printf(" -S, --subtitles FILE SubRip (.srt) or SAMI (.smi) subtitle file\n"); printf(" -S, --subtitles FILE SubRip (.srt) or SAMI (.smi) subtitle file\n");
printf(" --fontrom-lo FILE Low font ROM file for internationalised subtitles\n"); printf(" --fontrom-lo FILE Low font ROM file for internationalised subtitles\n");
printf(" --fontrom-hi FILE High font ROM file for internationalised subtitles\n"); printf(" --fontrom-hi FILE High font ROM file for internationalised subtitles\n");
@@ -4063,7 +4067,7 @@ static size_t encode_pframe_residual(tav_encoder_t *enc, int qY) {
if (enc->enable_ezbc) { if (enc->enable_ezbc) {
// EZBC mode: Quantize with perceptual weighting but no normalization (division by quantizer) // EZBC mode: Quantize with perceptual weighting but no normalization (division by quantizer)
// EZBC will compress by encoding only significant bitplanes // EZBC will compress by encoding only significant bitplanes
fprintf(stderr, "[EZBC-QUANT-PFRAME] Using perceptual quantization without normalization\n"); // fprintf(stderr, "[EZBC-QUANT-PFRAME] Using perceptual quantization without normalization\n");
quantise_dwt_coefficients_perceptual_per_coeff_no_normalisation(enc, residual_y_dwt, quantised_y, frame_size, quantise_dwt_coefficients_perceptual_per_coeff_no_normalisation(enc, residual_y_dwt, quantised_y, frame_size,
qY, enc->width, enc->height, qY, enc->width, enc->height,
enc->decomp_levels, 0, 0); enc->decomp_levels, 0, 0);
@@ -4081,7 +4085,7 @@ static size_t encode_pframe_residual(tav_encoder_t *enc, int qY) {
if (abs(quantised_co[i]) > max_co) max_co = abs(quantised_co[i]); if (abs(quantised_co[i]) > max_co) max_co = abs(quantised_co[i]);
if (abs(quantised_cg[i]) > max_cg) max_cg = abs(quantised_cg[i]); if (abs(quantised_cg[i]) > max_cg) max_cg = abs(quantised_cg[i]);
} }
fprintf(stderr, "[EZBC-QUANT-PFRAME] Quantized coeff max: Y=%d, Co=%d, Cg=%d\n", max_y, max_co, max_cg); // fprintf(stderr, "[EZBC-QUANT-PFRAME] Quantized coeff max: Y=%d, Co=%d, Cg=%d\n", max_y, max_co, max_cg);
} else { } else {
// Twobit-map mode: Use traditional quantization // Twobit-map mode: Use traditional quantization
quantise_dwt_coefficients_perceptual_per_coeff(enc, residual_y_dwt, quantised_y, frame_size, quantise_dwt_coefficients_perceptual_per_coeff(enc, residual_y_dwt, quantised_y, frame_size,
@@ -5396,9 +5400,84 @@ static size_t gop_process_and_flush(tav_encoder_t *enc, FILE *output, int base_q
// Trim GOP if scene change detected // Trim GOP if scene change detected
if (scene_change_frame > 0) { if (scene_change_frame > 0) {
actual_gop_size = scene_change_frame; actual_gop_size = scene_change_frame;
if (enc->verbose) {
printf("Trimming GOP from %d to %d frames due to scene change\n", // If trimmed GOP would be too small, encode as separate I-frames instead
enc->temporal_gop_frame_count, actual_gop_size); if (actual_gop_size < TEMPORAL_GOP_SIZE_MIN) {
if (enc->verbose) {
printf("Scene change at frame %d would create GOP of %d frames (< %d), encoding as I-frames instead\n",
frame_numbers[scene_change_frame], actual_gop_size, TEMPORAL_GOP_SIZE_MIN);
}
// Encode each frame before scene change as separate I-frame
size_t total_bytes = 0;
int original_gop_frame_count = enc->temporal_gop_frame_count;
for (int i = 0; i < actual_gop_size; i++) {
// Temporarily set up single-frame GOP
uint8_t *saved_rgb_frame0 = enc->temporal_gop_rgb_frames[0];
float *saved_y_frame0 = enc->temporal_gop_y_frames[0];
float *saved_co_frame0 = enc->temporal_gop_co_frames[0];
float *saved_cg_frame0 = enc->temporal_gop_cg_frames[0];
// Set up single-frame GOP by moving frame i to position 0
enc->temporal_gop_rgb_frames[0] = enc->temporal_gop_rgb_frames[i];
enc->temporal_gop_y_frames[0] = enc->temporal_gop_y_frames[i];
enc->temporal_gop_co_frames[0] = enc->temporal_gop_co_frames[i];
enc->temporal_gop_cg_frames[0] = enc->temporal_gop_cg_frames[i];
enc->temporal_gop_frame_count = 1;
// Encode as I-frame
size_t bytes = gop_flush(enc, output, base_quantiser, &frame_numbers[i], 1);
if (bytes == 0) {
fprintf(stderr, "Error: Failed to encode I-frame during GOP trimming\n");
enc->temporal_gop_frame_count = original_gop_frame_count;
return 0;
}
total_bytes += bytes;
// Restore position 0 (but keep frame i in place for the shift operation below)
enc->temporal_gop_rgb_frames[0] = saved_rgb_frame0;
enc->temporal_gop_y_frames[0] = saved_y_frame0;
enc->temporal_gop_co_frames[0] = saved_co_frame0;
enc->temporal_gop_cg_frames[0] = saved_cg_frame0;
}
// Restore original frame count
enc->temporal_gop_frame_count = original_gop_frame_count;
// Shift remaining frames (after scene change) to start of buffer
int remaining_frames = original_gop_frame_count - scene_change_frame;
for (int i = 0; i < remaining_frames; i++) {
int src = scene_change_frame + i;
// Swap pointers
uint8_t *temp_rgb = enc->temporal_gop_rgb_frames[i];
float *temp_y = enc->temporal_gop_y_frames[i];
float *temp_co = enc->temporal_gop_co_frames[i];
float *temp_cg = enc->temporal_gop_cg_frames[i];
enc->temporal_gop_rgb_frames[i] = enc->temporal_gop_rgb_frames[src];
enc->temporal_gop_y_frames[i] = enc->temporal_gop_y_frames[src];
enc->temporal_gop_co_frames[i] = enc->temporal_gop_co_frames[src];
enc->temporal_gop_cg_frames[i] = enc->temporal_gop_cg_frames[src];
enc->temporal_gop_rgb_frames[src] = temp_rgb;
enc->temporal_gop_y_frames[src] = temp_y;
enc->temporal_gop_co_frames[src] = temp_co;
enc->temporal_gop_cg_frames[src] = temp_cg;
enc->temporal_gop_translation_x[i] = enc->temporal_gop_translation_x[src];
enc->temporal_gop_translation_y[i] = enc->temporal_gop_translation_y[src];
}
enc->temporal_gop_frame_count = remaining_frames;
return total_bytes;
} else {
// GOP large enough after trimming - proceed normally
if (enc->verbose) {
printf("Trimming GOP from %d to %d frames due to scene change\n",
enc->temporal_gop_frame_count, actual_gop_size);
}
} }
} }
@@ -7017,7 +7096,7 @@ static size_t serialise_tile_data(tav_encoder_t *enc, int tile_x, int tile_y,
// INTRA mode: quantise coefficients directly and store for future reference // INTRA mode: quantise coefficients directly and store for future reference
if (enc->enable_ezbc) { if (enc->enable_ezbc) {
// EZBC mode: Quantize with perceptual weighting but no normalization (division by quantizer) // EZBC mode: Quantize with perceptual weighting but no normalization (division by quantizer)
fprintf(stderr, "[EZBC-QUANT-INTRA] Using perceptual quantization without normalization\n"); // fprintf(stderr, "[EZBC-QUANT-INTRA] Using perceptual quantization without normalization\n");
quantise_dwt_coefficients_perceptual_per_coeff_no_normalisation(enc, (float*)tile_y_data, quantised_y, tile_size, this_frame_qY, enc->width, enc->height, enc->decomp_levels, 0, enc->frame_count); quantise_dwt_coefficients_perceptual_per_coeff_no_normalisation(enc, (float*)tile_y_data, quantised_y, tile_size, this_frame_qY, enc->width, enc->height, enc->decomp_levels, 0, enc->frame_count);
quantise_dwt_coefficients_perceptual_per_coeff_no_normalisation(enc, (float*)tile_co_data, quantised_co, tile_size, this_frame_qCo, enc->width, enc->height, enc->decomp_levels, 1, enc->frame_count); quantise_dwt_coefficients_perceptual_per_coeff_no_normalisation(enc, (float*)tile_co_data, quantised_co, tile_size, this_frame_qCo, enc->width, enc->height, enc->decomp_levels, 1, enc->frame_count);
quantise_dwt_coefficients_perceptual_per_coeff_no_normalisation(enc, (float*)tile_cg_data, quantised_cg, tile_size, this_frame_qCg, enc->width, enc->height, enc->decomp_levels, 1, enc->frame_count); quantise_dwt_coefficients_perceptual_per_coeff_no_normalisation(enc, (float*)tile_cg_data, quantised_cg, tile_size, this_frame_qCg, enc->width, enc->height, enc->decomp_levels, 1, enc->frame_count);
@@ -7029,7 +7108,7 @@ static size_t serialise_tile_data(tav_encoder_t *enc, int tile_x, int tile_y,
if (abs(quantised_co[i]) > max_co) max_co = abs(quantised_co[i]); if (abs(quantised_co[i]) > max_co) max_co = abs(quantised_co[i]);
if (abs(quantised_cg[i]) > max_cg) max_cg = abs(quantised_cg[i]); if (abs(quantised_cg[i]) > max_cg) max_cg = abs(quantised_cg[i]);
} }
fprintf(stderr, "[EZBC-QUANT-INTRA] Quantized coeff max: Y=%d, Co=%d, Cg=%d\n", max_y, max_co, max_cg); // fprintf(stderr, "[EZBC-QUANT-INTRA] Quantized coeff max: Y=%d, Co=%d, Cg=%d\n", max_y, max_co, max_cg);
} else if (enc->perceptual_tuning) { } else if (enc->perceptual_tuning) {
// Perceptual quantisation: EXACTLY like uniform but with per-coefficient weights // Perceptual quantisation: EXACTLY like uniform but with per-coefficient weights
quantise_dwt_coefficients_perceptual_per_coeff(enc, (float*)tile_y_data, quantised_y, tile_size, this_frame_qY, enc->width, enc->height, enc->decomp_levels, 0, enc->frame_count); quantise_dwt_coefficients_perceptual_per_coeff(enc, (float*)tile_y_data, quantised_y, tile_size, this_frame_qY, enc->width, enc->height, enc->decomp_levels, 0, enc->frame_count);
@@ -8627,7 +8706,7 @@ static long write_extended_header(tav_encoder_t *enc) {
WRITE_KV_UINT64("ENDT", 0ULL); WRITE_KV_UINT64("ENDT", 0ULL);
// CDAT: Creation time in nanoseconds since UNIX epoch // CDAT: Creation time in microseconds since UNIX epoch
WRITE_KV_UINT64("CDAT", enc->creation_time_ns); WRITE_KV_UINT64("CDAT", enc->creation_time_us);
// VNDR: Encoder name and version // VNDR: Encoder name and version
const char *vendor_str = ENCODER_VENDOR_STRING; const char *vendor_str = ENCODER_VENDOR_STRING;
@@ -9157,15 +9236,13 @@ static int detect_scene_change_between_frames(
if (out_avg_diff) *out_avg_diff = avg_diff; if (out_avg_diff) *out_avg_diff = avg_diff;
if (out_changed_ratio) *out_changed_ratio = changed_ratio; if (out_changed_ratio) *out_changed_ratio = changed_ratio;
// Scene change threshold return changed_ratio > SCENE_CHANGE_THRESHOLD_SOFT;
double threshold = 0.50;
return changed_ratio > threshold;
} }
// Wrapper for normal mode: compare current frame with previous frame // Wrapper for normal mode: compare current frame with previous frame
static int detect_scene_change(tav_encoder_t *enc) { static int detect_scene_change(tav_encoder_t *enc, double *out_changed_ratio) {
if (!enc->current_frame_rgb || enc->intra_only) { if (!enc->current_frame_rgb || enc->intra_only) {
if (out_changed_ratio) *out_changed_ratio = 0.0;
return 0; // No current frame to compare return 0; // No current frame to compare
} }
@@ -9179,6 +9256,8 @@ static int detect_scene_change(tav_encoder_t *enc) {
&changed_ratio &changed_ratio
); );
if (out_changed_ratio) *out_changed_ratio = changed_ratio;
if (is_scene_change) { if (is_scene_change) {
printf("Scene change detection: avg_diff=%.2f\tchanged_ratio=%.4f\n", avg_diff, changed_ratio); printf("Scene change detection: avg_diff=%.2f\tchanged_ratio=%.4f\n", avg_diff, changed_ratio);
} }
@@ -9364,6 +9443,9 @@ int main(int argc, char *argv[]) {
{"ezbc", no_argument, 0, 1025}, {"ezbc", no_argument, 0, 1025},
{"separate-audio-track", no_argument, 0, 1026}, {"separate-audio-track", no_argument, 0, 1026},
{"pcm8-audio", no_argument, 0, 1027}, {"pcm8-audio", no_argument, 0, 1027},
{"pcm-audio", no_argument, 0, 1027},
{"native-audio", no_argument, 0, 1027},
{"native-audio-format", no_argument, 0, 1027},
{"help", no_argument, 0, '?'}, {"help", no_argument, 0, '?'},
{0, 0, 0, 0} {0, 0, 0, 0}
}; };
@@ -9478,6 +9560,7 @@ int main(int argc, char *argv[]) {
break; break;
case 1006: // --intra-only case 1006: // --intra-only
enc->intra_only = 1; enc->intra_only = 1;
enc->enable_temporal_dwt = 0;
break; break;
case 1007: // --no-perceptual-tuning case 1007: // --no-perceptual-tuning
enc->perceptual_tuning = 0; enc->perceptual_tuning = 0;
@@ -9518,6 +9601,7 @@ int main(int argc, char *argv[]) {
break; break;
case 1017: // --enable-delta case 1017: // --enable-delta
enc->use_delta_encoding = 1; enc->use_delta_encoding = 1;
enc->enable_temporal_dwt = 0;
break; break;
case 1018: // --delta-haar case 1018: // --delta-haar
enc->delta_haar_levels = CLAMP(atoi(optarg), 0, 6); enc->delta_haar_levels = CLAMP(atoi(optarg), 0, 6);
@@ -9697,7 +9781,7 @@ int main(int argc, char *argv[]) {
enc->ffmpeg_version = get_ffmpeg_version(); enc->ffmpeg_version = get_ffmpeg_version();
struct timeval tv; struct timeval tv;
gettimeofday(&tv, NULL); gettimeofday(&tv, NULL);
enc->creation_time_ns = (uint64_t)tv.tv_sec * 1000000000ULL + (uint64_t)tv.tv_usec * 1000ULL; enc->creation_time_us = (uint64_t)tv.tv_sec * 1000000ULL + (uint64_t)tv.tv_usec * 1ULL;
// Start FFmpeg process for video input (using TEV-compatible filtergraphs) // Start FFmpeg process for video input (using TEV-compatible filtergraphs)
if (enc->test_mode) { if (enc->test_mode) {
@@ -9862,7 +9946,8 @@ int main(int argc, char *argv[]) {
} }
// Determine frame type // Determine frame type
int is_scene_change = detect_scene_change(enc); double scene_change_ratio = 0.0;
int is_scene_change = detect_scene_change(enc, &scene_change_ratio);
int is_time_keyframe = (frame_count % TEMPORAL_GOP_SIZE) == 0; int is_time_keyframe = (frame_count % TEMPORAL_GOP_SIZE) == 0;
// Check if we can use SKIP mode (DWT coefficient-based detection) // Check if we can use SKIP mode (DWT coefficient-based detection)
@@ -9926,6 +10011,109 @@ int main(int argc, char *argv[]) {
if (enc->enable_temporal_dwt) { if (enc->enable_temporal_dwt) {
// GOP-based temporal 3D DWT encoding path // GOP-based temporal 3D DWT encoding path
// Two-tier scene change handling:
// - Hard scene change (ratio >= SCENE_CHANGE_THRESHOLD_HARD, 0.8): Force I-frames for current GOP, then flush
// - Soft scene change (SCENE_CHANGE_THRESHOLD_SOFT 0.6 < ratio < 0.8): Only flush if GOP >= TEMPORAL_GOP_SIZE_MIN (8) frames (enforce minimum GOP size)
// - No scene change (ratio <= SCENE_CHANGE_THRESHOLD_SOFT, 0.6): Don't flush
int should_flush_scene_change = 0;
int force_iframes_for_scene_change = 0;
if (is_scene_change && enc->temporal_gop_frame_count > 0) {
if (scene_change_ratio >= SCENE_CHANGE_THRESHOLD_HARD) {
// Hard scene change: Force current GOP to be I-frames, then flush immediately
should_flush_scene_change = 1;
force_iframes_for_scene_change = 1;
if (enc->verbose) {
printf("Hard scene change (ratio=%.4f) at frame %d, forcing I-frames and flushing GOP...\n",
scene_change_ratio, frame_count);
}
} else if (enc->temporal_gop_frame_count >= TEMPORAL_GOP_SIZE_MIN) {
// Soft scene change with sufficient GOP size: Flush normally
should_flush_scene_change = 1;
if (enc->verbose) {
printf("Soft scene change (ratio=%.4f) at frame %d with GOP size %d >= %d, flushing GOP...\n",
scene_change_ratio, frame_count, enc->temporal_gop_frame_count, TEMPORAL_GOP_SIZE_MIN);
}
} else {
// Soft scene change with small GOP: Ignore to enforce minimum GOP size
if (enc->verbose) {
printf("Soft scene change (ratio=%.4f) at frame %d ignored (GOP size %d < %d)\n",
scene_change_ratio, frame_count, enc->temporal_gop_frame_count, TEMPORAL_GOP_SIZE_MIN);
}
}
}
if (should_flush_scene_change) {
    // Flush the frames buffered so far BEFORE the current frame is added, so the
    // frame that triggered the scene change becomes the first frame of a new GOP.
    // On success packet_size holds the number of bytes written; on any failure
    // it is left at 0 so the error check at the bottom aborts the encode loop.

    // Get quantiser
    int qY = enc->bitrate_mode ? quantiser_float_to_int_dithered(enc) : enc->quantiser_y;

    if (force_iframes_for_scene_change) {
        // Hard scene change: Encode each frame in GOP as separate I-frame (GOP size = 1)
        // This ensures clean cut at major scene transitions
        size_t total_bytes = 0;
        int original_gop_frame_count = enc->temporal_gop_frame_count;

        for (int i = 0; i < original_gop_frame_count; i++) {
            // Temporarily present frame i as a single-frame GOP by aliasing it
            // into slot 0. Save the slot 0 pointers so they can be put back.
            uint8_t *saved_rgb_frame0 = enc->temporal_gop_rgb_frames[0];
            float *saved_y_frame0 = enc->temporal_gop_y_frames[0];
            float *saved_co_frame0 = enc->temporal_gop_co_frames[0];
            float *saved_cg_frame0 = enc->temporal_gop_cg_frames[0];

            enc->temporal_gop_rgb_frames[0] = enc->temporal_gop_rgb_frames[i];
            enc->temporal_gop_y_frames[0] = enc->temporal_gop_y_frames[i];
            enc->temporal_gop_co_frames[0] = enc->temporal_gop_co_frames[i];
            enc->temporal_gop_cg_frames[0] = enc->temporal_gop_cg_frames[i];
            enc->temporal_gop_frame_count = 1;

            // Encode single frame as I-frame (GOP size 1)
            int frame_num = frame_count - original_gop_frame_count + i;
            size_t bytes = gop_flush(enc, enc->output_fp, qY, &frame_num, 1);

            // Restore slot 0 unconditionally, i.e. also when the flush failed.
            // Otherwise slot 0 keeps aliasing frame i's buffers (double-free
            // hazard) and the original slot 0 buffers are lost.
            enc->temporal_gop_rgb_frames[0] = saved_rgb_frame0;
            enc->temporal_gop_y_frames[0] = saved_y_frame0;
            enc->temporal_gop_co_frames[0] = saved_co_frame0;
            enc->temporal_gop_cg_frames[0] = saved_cg_frame0;

            if (bytes == 0) {
                fprintf(stderr, "Error: Failed to encode I-frame %d during hard scene change\n", frame_num);
                // Report the whole flush as failed, even if earlier frames of
                // this GOP were written, so the check below stops encoding
                // instead of silently dropping the remaining frames.
                total_bytes = 0;
                break;
            }
            total_bytes += bytes;
        }

        // Restore original frame count
        enc->temporal_gop_frame_count = original_gop_frame_count;
        packet_size = total_bytes;
    } else {
        // Soft scene change: Flush GOP normally as temporal GOP
        int *gop_frame_numbers = malloc(enc->temporal_gop_frame_count * sizeof(int));
        if (gop_frame_numbers == NULL) {
            fprintf(stderr, "Error: Out of memory building GOP frame numbers at frame %d\n", frame_count);
            packet_size = 0;  // Handled as a failed flush below
        } else {
            // The buffered frames are the ones immediately preceding the current frame
            for (int i = 0; i < enc->temporal_gop_frame_count; i++) {
                gop_frame_numbers[i] = frame_count - enc->temporal_gop_frame_count + i;
            }

            packet_size = gop_process_and_flush(enc, enc->output_fp, qY,
                                                gop_frame_numbers, 1);
            free(gop_frame_numbers);
        }
    }

    if (packet_size == 0) {
        fprintf(stderr, "Error: Failed to flush GOP before scene change at frame %d\n", frame_count);
        break;
    }

    gop_reset(enc);
}
// Now add current frame to GOP (will be first frame of new GOP if scene change)
int add_result = temporal_gop_add_frame(enc, enc->current_frame_rgb, int add_result = temporal_gop_add_frame(enc, enc->current_frame_rgb,
enc->current_frame_y, enc->current_frame_co, enc->current_frame_cg); enc->current_frame_y, enc->current_frame_co, enc->current_frame_cg);
@@ -9934,7 +10122,7 @@ int main(int argc, char *argv[]) {
break; break;
} }
// Check if GOP should be flushed // Check if GOP should be flushed (after adding frame)
int should_flush = 0; int should_flush = 0;
int force_flush = 0; int force_flush = 0;
@@ -9945,23 +10133,24 @@ int main(int argc, char *argv[]) {
printf("GOP buffer full (%d frames), flushing...\n", enc->temporal_gop_frame_count); printf("GOP buffer full (%d frames), flushing...\n", enc->temporal_gop_frame_count);
} }
} }
// Flush if large motion detected (breaks temporal coherence) // Flush if large motion detected (breaks temporal coherence) AND GOP is large enough
else if (gop_should_flush_motion(enc)) { else if (gop_should_flush_motion(enc) && enc->temporal_gop_frame_count >= TEMPORAL_GOP_SIZE_MIN) {
should_flush = 1; should_flush = 1;
if (enc->verbose) { if (enc->verbose) {
printf("Large motion detected (>24 pixels), flushing GOP early...\n"); printf("Large motion detected (>24 pixels) with GOP size %d >= %d, flushing GOP early...\n",
enc->temporal_gop_frame_count, TEMPORAL_GOP_SIZE_MIN);
} }
} }
// Flush if scene change detected else if (gop_should_flush_motion(enc) && enc->temporal_gop_frame_count < TEMPORAL_GOP_SIZE_MIN) {
else if (is_scene_change && enc->temporal_gop_frame_count > 1) { // Large motion but GOP too small - keep accumulating
should_flush = 1;
force_flush = 1; // Skip internal scene change detection (already detected)
if (enc->verbose) { if (enc->verbose) {
printf("Scene change detected, flushing GOP early...\n"); printf("Large motion detected but GOP size %d < %d, continuing to accumulate...\n",
enc->temporal_gop_frame_count, TEMPORAL_GOP_SIZE_MIN);
} }
} }
// Note: Scene change flush is now handled BEFORE adding frame (above)
// Flush GOP if needed // Flush GOP if needed (for reasons other than scene change)
if (should_flush) { if (should_flush) {
// Build frame number array for this GOP // Build frame number array for this GOP
int *gop_frame_numbers = malloc(enc->temporal_gop_frame_count * sizeof(int)); int *gop_frame_numbers = malloc(enc->temporal_gop_frame_count * sizeof(int));
@@ -9982,9 +10171,10 @@ int main(int argc, char *argv[]) {
fprintf(stderr, "Error: Failed to flush GOP at frame %d\n", frame_count); fprintf(stderr, "Error: Failed to flush GOP at frame %d\n", frame_count);
break; break;
} }
} else { } else if (packet_size == 0) {
// Frame added to GOP buffer but not flushed yet // Frame added to GOP buffer but not flushed yet
// Skip normal packet processing (no packet written yet) // Skip normal packet processing (no packet written yet)
// Note: packet_size might already be > 0 from scene change flush above
packet_size = 0; packet_size = 0;
} }
} else if (enc->enable_residual_coding) { } else if (enc->enable_residual_coding) {

View File

@@ -260,7 +260,7 @@ void print_extended_header(FILE *fp, int verbose) {
if (verbose) { if (verbose) {
if (strcmp(key, "CDAT") == 0) { if (strcmp(key, "CDAT") == 0) {
time_t time_sec = value / 1000000000ULL; time_t time_sec = value / 1000000ULL; // microseconds
struct tm *time_info = gmtime(&time_sec); struct tm *time_info = gmtime(&time_sec);
if (time_info) { if (time_info) {
char time_str[64]; char time_str[64];
@@ -268,7 +268,7 @@ void print_extended_header(FILE *fp, int verbose) {
printf("%s", time_str); printf("%s", time_str);
} }
} else { } else {
printf("%.6f seconds", value / 1000000000.0); printf("%.6f seconds", value / 1000000000.0); // nanoseconds
} }
} }
} else if (value_type == 0x10) { // Bytes } else if (value_type == 0x10) { // Bytes