TAV update: CDF 5/3 for motion coder

This commit is contained in:
minjaesong
2025-11-23 18:16:12 +09:00
parent e928d2d3ec
commit 1c7ab17b1c
6 changed files with 174 additions and 95 deletions

View File

@@ -422,8 +422,12 @@ seqread.skip(3)
header.fileRole = seqread.readOneByte() header.fileRole = seqread.readOneByte()
if (header.version < 1 || header.version > 8) { // Extract temporal motion coder from version (versions 9-16 use CDF 5/3, 1-8 use Haar)
printerrln(`Error: Unsupported TAV version ${header.version}`) const baseVersion = (header.version > 8) ? (header.version - 8) : header.version
header.temporalMotionCoder = (header.version > 8) ? 1 : 0
if (baseVersion < 1 || baseVersion > 8) {
printerrln(`Error: Unsupported TAV base version ${baseVersion}`)
errorlevel = 1 errorlevel = 1
return return
} }
@@ -1339,7 +1343,8 @@ try {
header.channelLayout, header.channelLayout,
header.waveletFilter, header.decompLevels, TAV_TEMPORAL_LEVELS, header.waveletFilter, header.decompLevels, TAV_TEMPORAL_LEVELS,
header.entropyCoder, header.entropyCoder,
bufferOffset bufferOffset,
header.temporalMotionCoder
) )
asyncDecodeInProgress = true asyncDecodeInProgress = true
@@ -1412,7 +1417,8 @@ try {
header.channelLayout, header.channelLayout,
header.waveletFilter, header.decompLevels, TAV_TEMPORAL_LEVELS, header.waveletFilter, header.decompLevels, TAV_TEMPORAL_LEVELS,
header.entropyCoder, header.entropyCoder,
nextOffset nextOffset,
header.temporalMotionCoder
) )
// Set async decode tracking variables // Set async decode tracking variables
@@ -1454,7 +1460,8 @@ try {
header.channelLayout, header.channelLayout,
header.waveletFilter, header.decompLevels, TAV_TEMPORAL_LEVELS, header.waveletFilter, header.decompLevels, TAV_TEMPORAL_LEVELS,
header.entropyCoder, header.entropyCoder,
decodingOffset decodingOffset,
header.temporalMotionCoder
) )
// Set async decode tracking variables // Set async decode tracking variables
@@ -1821,7 +1828,8 @@ try {
header.channelLayout, header.channelLayout,
header.waveletFilter, header.decompLevels, TAV_TEMPORAL_LEVELS, header.waveletFilter, header.decompLevels, TAV_TEMPORAL_LEVELS,
header.entropyCoder, header.entropyCoder,
readyGopData.slot * SLOT_SIZE readyGopData.slot * SLOT_SIZE,
header.temporalMotionCoder
) )
// CRITICAL FIX: Set async decode tracking variables so decode is properly tracked // CRITICAL FIX: Set async decode tracking variables so decode is properly tracked
@@ -1998,7 +2006,8 @@ try {
header.channelLayout, header.channelLayout,
header.waveletFilter, header.decompLevels, TAV_TEMPORAL_LEVELS, header.waveletFilter, header.decompLevels, TAV_TEMPORAL_LEVELS,
header.entropyCoder, header.entropyCoder,
decodingGopData.slot * SLOT_SIZE decodingGopData.slot * SLOT_SIZE,
header.temporalMotionCoder
) )
// CRITICAL FIX: Set async decode tracking variables so decode is properly tracked // CRITICAL FIX: Set async decode tracking variables so decode is properly tracked
@@ -2038,7 +2047,8 @@ try {
header.channelLayout, header.channelLayout,
header.waveletFilter, header.decompLevels, TAV_TEMPORAL_LEVELS, header.waveletFilter, header.decompLevels, TAV_TEMPORAL_LEVELS,
header.entropyCoder, header.entropyCoder,
readyGopData.slot * SLOT_SIZE readyGopData.slot * SLOT_SIZE,
header.temporalMotionCoder
) )
readyGopData.needsDecode = false readyGopData.needsDecode = false
readyGopData.startTime = sys.nanoTime() readyGopData.startTime = sys.nanoTime()
@@ -2115,7 +2125,8 @@ try {
header.channelLayout, header.channelLayout,
header.waveletFilter, header.decompLevels, TAV_TEMPORAL_LEVELS, header.waveletFilter, header.decompLevels, TAV_TEMPORAL_LEVELS,
header.entropyCoder, header.entropyCoder,
targetOffset targetOffset,
header.temporalMotionCoder
) )
asyncDecodeInProgress = true asyncDecodeInProgress = true
@@ -2211,7 +2222,8 @@ try {
} }
catch (e) { catch (e) {
serial.printerr(`TAV decode error: ${e}`) serial.printerr(`TAV decode error: ${e}`)
e.printStackTrace() if (e.printStackTrace)
e.printStackTrace()
errorlevel = 1 errorlevel = 1
} }
finally { finally {

View File

@@ -905,6 +905,7 @@ transmission capability, and region-of-interest coding.
## Header (32 bytes) ## Header (32 bytes)
uint8 Magic[8]: "\x1F TSVM TAV" or "\x1F TSVM TAP" uint8 Magic[8]: "\x1F TSVM TAV" or "\x1F TSVM TAP"
uint8 Version: uint8 Version:
Base version number:
- 1 = YCoCg-R multi-tile uniform - 1 = YCoCg-R multi-tile uniform
- 2 = ICtCp multi-tile uniform - 2 = ICtCp multi-tile uniform
- 3 = YCoCg-R monoblock uniform - 3 = YCoCg-R monoblock uniform
@@ -913,6 +914,8 @@ transmission capability, and region-of-interest coding.
- 6 = ICtCp monoblock perceptual - 6 = ICtCp monoblock perceptual
- 7 = YCoCg-R multi-tile perceptual - 7 = YCoCg-R multi-tile perceptual
- 8 = ICtCp multi-tile perceptual - 8 = ICtCp multi-tile perceptual
When the temporal motion coder is Haar, the Version field equals the base version number (1-8).
When the temporal motion coder is CDF 5/3, the Version field is the base version number plus 8 (9-16).
uint16 Width: picture width in pixels. Columns count for Videotex-only file. uint16 Width: picture width in pixels. Columns count for Videotex-only file.
uint16 Height: picture height in pixels. Rows count for Videotex-only file. uint16 Height: picture height in pixels. Rows count for Videotex-only file.
uint8 FPS: frames per second. Use 0x00 for still pictures uint8 FPS: frames per second. Use 0x00 for still pictures

View File

@@ -6297,65 +6297,25 @@ class GraphicsJSR223Delegate(private val vm: VM) {
if (length < 2) return if (length < 2) return
val temp = FloatArray(length) val temp = FloatArray(length)
val half = (length + 1) / 2 // Handle odd lengths properly val half = (length + 1) / 2
// Split into low and high frequency components (matching encoder layout) // Copy low-pass and high-pass subbands to temp
System.arraycopy(data, 0, temp, 0, length)
// Undo update step (low-pass)
for (i in 0 until half) { for (i in 0 until half) {
temp[i] = data[i] // Low-pass coefficients (first half) val update = 0.25f * ((if (i > 0) temp[half + i - 1] else 0.0f) +
} (if (i < half - 1) temp[half + i] else 0.0f))
for (i in 0 until length / 2) { temp[i] -= update
if (half + i < length && half + i < data.size) {
temp[half + i] = data[half + i] // High-pass coefficients (second half)
}
} }
// 5/3 inverse lifting (undo forward steps in reverse order) // Undo predict step (high-pass) and interleave samples
// Step 2: Undo update step (1/4 coefficient) - JPEG2000 symmetric extension
for (i in 0 until half) { for (i in 0 until half) {
val leftIdx = half + i - 1 data[2 * i] = temp[i] // Even samples (low-pass)
val centerIdx = half + i val idx = 2 * i + 1
if (idx < length) {
// Symmetric extension for boundary handling val pred = 0.5f * (temp[i] + (if (i < half - 1) temp[i + 1] else temp[i]))
val left = when { data[idx] = temp[half + i] + pred // Odd samples (high-pass)
leftIdx >= 0 && leftIdx < length -> temp[leftIdx]
centerIdx < length && centerIdx + 1 < length -> temp[centerIdx + 1] // Mirror
centerIdx < length -> temp[centerIdx]
else -> 0.0f
}
val right = if (centerIdx < length) temp[centerIdx] else 0.0f
temp[i] -= 0.25f * (left + right)
}
// Step 1: Undo predict step (1/2 coefficient) - JPEG2000 symmetric extension
for (i in 0 until length / 2) {
if (half + i < length) {
val left = temp[i]
// Symmetric extension for right boundary
val right = if (i < half - 1) temp[i + 1] else if (half > 2) temp[half - 2] else temp[half - 1]
temp[half + i] += 0.5f * (left + right) // ADD to undo the subtraction in encoder
}
}
// Simple reconstruction (revert to working version)
for (i in 0 until length) {
if (i % 2 == 0) {
// Even positions: low-pass coefficients
data[i] = temp[i / 2]
} else {
// Odd positions: high-pass coefficients
val idx = i / 2
if (half + idx < length) {
data[i] = temp[half + idx]
} else {
// Symmetric extension: mirror the last available high-pass coefficient
val lastHighIdx = (length / 2) - 1
if (lastHighIdx >= 0 && half + lastHighIdx < length) {
data[i] = temp[half + lastHighIdx]
} else {
data[i] = 0.0f
}
}
} }
} }
} }
@@ -6514,7 +6474,8 @@ class GraphicsJSR223Delegate(private val vm: VM) {
spatialLevels: Int = 6, spatialLevels: Int = 6,
temporalLevels: Int = 2, temporalLevels: Int = 2,
entropyCoder: Int = 0, entropyCoder: Int = 0,
bufferOffset: Long = 0 bufferOffset: Long = 0,
temporalMotionCoder: Int = 0
): Array<Any> { ): Array<Any> {
val dbgOut = HashMap<String, Any>() val dbgOut = HashMap<String, Any>()
dbgOut["qY"] = qYGlobal dbgOut["qY"] = qYGlobal
@@ -6634,9 +6595,9 @@ class GraphicsJSR223Delegate(private val vm: VM) {
} }
// Step 6: Apply inverse 3D DWT using GOP dimensions (may be cropped) // Step 6: Apply inverse 3D DWT using GOP dimensions (may be cropped)
tavApplyInverse3DDWT(gopY, gopWidth, gopHeight, gopSize, spatialLevels, temporalLevels, spatialFilter) tavApplyInverse3DDWT(gopY, gopWidth, gopHeight, gopSize, spatialLevels, temporalLevels, spatialFilter, temporalMotionCoder)
tavApplyInverse3DDWT(gopCo, gopWidth, gopHeight, gopSize, spatialLevels, temporalLevels, spatialFilter) tavApplyInverse3DDWT(gopCo, gopWidth, gopHeight, gopSize, spatialLevels, temporalLevels, spatialFilter, temporalMotionCoder)
tavApplyInverse3DDWT(gopCg, gopWidth, gopHeight, gopSize, spatialLevels, temporalLevels, spatialFilter) tavApplyInverse3DDWT(gopCg, gopWidth, gopHeight, gopSize, spatialLevels, temporalLevels, spatialFilter, temporalMotionCoder)
// Step 8: Convert to RGB and composite to full frame // Step 8: Convert to RGB and composite to full frame
// With crop encoding, center the cropped frame and fill letterbox areas with black // With crop encoding, center the cropped frame and fill letterbox areas with black
@@ -6780,7 +6741,8 @@ class GraphicsJSR223Delegate(private val vm: VM) {
spatialLevels: Int = 6, spatialLevels: Int = 6,
temporalLevels: Int = 3, temporalLevels: Int = 3,
entropyCoder: Int = 0, entropyCoder: Int = 0,
bufferOffset: Long = 0 bufferOffset: Long = 0,
temporalMotionCoder: Int = 0
) { ) {
// Cancel any existing decode thread // Cancel any existing decode thread
asyncDecodeThread?.interrupt() asyncDecodeThread?.interrupt()
@@ -6798,7 +6760,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
width, height, width, height,
qIndex, qYGlobal, qCoGlobal, qCgGlobal, qIndex, qYGlobal, qCoGlobal, qCgGlobal,
channelLayout, spatialFilter, spatialLevels, temporalLevels, channelLayout, spatialFilter, spatialLevels, temporalLevels,
entropyCoder, bufferOffset entropyCoder, bufferOffset, temporalMotionCoder
) )
asyncDecodeResult = result asyncDecodeResult = result
asyncDecodeComplete.set(true) asyncDecodeComplete.set(true)
@@ -6943,12 +6905,17 @@ class GraphicsJSR223Delegate(private val vm: VM) {
// ============================================================================= // =============================================================================
/** /**
* Inverse 1D temporal DWT (Haar) along time axis * Inverse 1D temporal DWT along time axis
* Reuses existing Haar inverse implementation * Supports both Haar and CDF 5/3 wavelets
* @param temporalMotionCoder 0=Haar, 1=CDF 5/3
*/ */
private fun tavApplyTemporalDWTInverse1D(data: FloatArray, numFrames: Int) { private fun tavApplyTemporalDWTInverse1D(data: FloatArray, numFrames: Int, temporalMotionCoder: Int = 0) {
if (numFrames < 2) return if (numFrames < 2) return
tavApplyDWTHaarInverse1D(data, numFrames) if (temporalMotionCoder == 0) {
tavApplyDWTHaarInverse1D(data, numFrames)
} else {
tavApplyDWT53Inverse1D(data, numFrames)
}
} }
/** /**
@@ -6962,6 +6929,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
* @param spatialLevels Spatial decomposition levels (typically 6) * @param spatialLevels Spatial decomposition levels (typically 6)
* @param temporalLevels Temporal decomposition levels (typically 2) * @param temporalLevels Temporal decomposition levels (typically 2)
* @param spatialFilter Spatial wavelet filter type (0=5/3, 1=9/7, 255=Haar) * @param spatialFilter Spatial wavelet filter type (0=5/3, 1=9/7, 255=Haar)
* @param temporalMotionCoder Temporal wavelet type (0=Haar, 1=CDF 5/3)
*/ */
private fun tavApplyInverse3DDWT( private fun tavApplyInverse3DDWT(
gopData: Array<FloatArray>, gopData: Array<FloatArray>,
@@ -6970,7 +6938,8 @@ class GraphicsJSR223Delegate(private val vm: VM) {
numFrames: Int, numFrames: Int,
spatialLevels: Int, spatialLevels: Int,
temporalLevels: Int, temporalLevels: Int,
spatialFilter: Int spatialFilter: Int,
temporalMotionCoder: Int = 0
) { ) {
// Step 1: Apply inverse 2D spatial DWT to each temporal subband (each frame) // Step 1: Apply inverse 2D spatial DWT to each temporal subband (each frame)
// This is required even for single frames (I-frames) to convert from DWT coefficients to pixel space // This is required even for single frames (I-frames) to convert from DWT coefficients to pixel space
@@ -7008,7 +6977,7 @@ class GraphicsJSR223Delegate(private val vm: VM) {
for (level in temporalLevels - 1 downTo 0) { for (level in temporalLevels - 1 downTo 0) {
val levelFrames = temporalLengths[level] val levelFrames = temporalLengths[level]
if (levelFrames >= 2) { if (levelFrames >= 2) {
tavApplyTemporalDWTInverse1D(temporalLine, levelFrames) tavApplyTemporalDWTInverse1D(temporalLine, levelFrames, temporalMotionCoder)
} }
} }

View File

@@ -993,11 +993,34 @@ static void dwt_97_inverse_1d(float *data, int length) {
free(temp); free(temp);
} }
// 5/3 inverse DWT (simplified - uses 9/7 for now) // 5/3 inverse DWT using lifting scheme (JPEG 2000 reversible filter)
static void dwt_53_inverse_1d(float *data, int length) { static void dwt_53_inverse_1d(float *data, int length) {
if (length < 2) return; if (length < 2) return;
// TODO: Implement proper 5/3 from TSVM if needed
dwt_97_inverse_1d(data, length); float *temp = malloc(length * sizeof(float));
int half = (length + 1) / 2;
// Copy low-pass and high-pass subbands to temp
memcpy(temp, data, length * sizeof(float));
// Undo update step (low-pass)
for (int i = 0; i < half; i++) {
float update = 0.25f * ((i > 0 ? temp[half + i - 1] : 0) +
(i < half - 1 ? temp[half + i] : 0));
temp[i] -= update;
}
// Undo predict step (high-pass) and interleave samples
for (int i = 0; i < half; i++) {
data[2 * i] = temp[i]; // Even samples (low-pass)
int idx = 2 * i + 1;
if (idx < length) {
float pred = 0.5f * (temp[i] + (i < half - 1 ? temp[i + 1] : temp[i]));
data[idx] = temp[half + i] + pred; // Odd samples (high-pass)
}
}
free(temp);
} }
// Multi-level inverse DWT (matches TSVM exactly with correct non-power-of-2 handling) // Multi-level inverse DWT (matches TSVM exactly with correct non-power-of-2 handling)
@@ -1180,7 +1203,8 @@ static void dwt_haar_inverse_1d(float *data, int length) {
// Order: SPATIAL first (each frame), then TEMPORAL (across frames) // Order: SPATIAL first (each frame), then TEMPORAL (across frames)
static void apply_inverse_3d_dwt(float **gop_y, float **gop_co, float **gop_cg, static void apply_inverse_3d_dwt(float **gop_y, float **gop_co, float **gop_cg,
int width, int height, int gop_size, int width, int height, int gop_size,
int spatial_levels, int temporal_levels, int filter_type) { int spatial_levels, int temporal_levels, int filter_type,
int temporal_motion_coder) {
// Step 1: Apply inverse 2D spatial DWT to each frame // Step 1: Apply inverse 2D spatial DWT to each frame
for (int t = 0; t < gop_size; t++) { for (int t = 0; t < gop_size; t++) {
apply_inverse_dwt_multilevel(gop_y[t], width, height, spatial_levels, filter_type); apply_inverse_dwt_multilevel(gop_y[t], width, height, spatial_levels, filter_type);
@@ -1212,7 +1236,12 @@ static void apply_inverse_3d_dwt(float **gop_y, float **gop_co, float **gop_cg,
for (int level = temporal_levels - 1; level >= 0; level--) { for (int level = temporal_levels - 1; level >= 0; level--) {
const int level_frames = temporal_lengths[level]; const int level_frames = temporal_lengths[level];
if (level_frames >= 2) { if (level_frames >= 2) {
dwt_haar_inverse_1d(temporal_line, level_frames); // Use selected temporal wavelet (0=Haar, 1=CDF 5/3)
if (temporal_motion_coder == 0) {
dwt_haar_inverse_1d(temporal_line, level_frames);
} else {
dwt_53_inverse_1d(temporal_line, level_frames);
}
} }
} }
for (int t = 0; t < gop_size; t++) { for (int t = 0; t < gop_size; t++) {
@@ -1226,7 +1255,12 @@ static void apply_inverse_3d_dwt(float **gop_y, float **gop_co, float **gop_cg,
for (int level = temporal_levels - 1; level >= 0; level--) { for (int level = temporal_levels - 1; level >= 0; level--) {
const int level_frames = temporal_lengths[level]; const int level_frames = temporal_lengths[level];
if (level_frames >= 2) { if (level_frames >= 2) {
dwt_haar_inverse_1d(temporal_line, level_frames); // Use selected temporal wavelet (0=Haar, 1=CDF 5/3)
if (temporal_motion_coder == 0) {
dwt_haar_inverse_1d(temporal_line, level_frames);
} else {
dwt_53_inverse_1d(temporal_line, level_frames);
}
} }
} }
for (int t = 0; t < gop_size; t++) { for (int t = 0; t < gop_size; t++) {
@@ -1240,7 +1274,12 @@ static void apply_inverse_3d_dwt(float **gop_y, float **gop_co, float **gop_cg,
for (int level = temporal_levels - 1; level >= 0; level--) { for (int level = temporal_levels - 1; level >= 0; level--) {
const int level_frames = temporal_lengths[level]; const int level_frames = temporal_lengths[level];
if (level_frames >= 2) { if (level_frames >= 2) {
dwt_haar_inverse_1d(temporal_line, level_frames); // Use selected temporal wavelet (0=Haar, 1=CDF 5/3)
if (temporal_motion_coder == 0) {
dwt_haar_inverse_1d(temporal_line, level_frames);
} else {
dwt_53_inverse_1d(temporal_line, level_frames);
}
} }
} }
for (int t = 0; t < gop_size; t++) { for (int t = 0; t < gop_size; t++) {
@@ -1706,6 +1745,7 @@ typedef struct {
int frame_count; int frame_count;
int frame_size; int frame_size;
int is_monoblock; // True if version 3-6 (single tile mode) int is_monoblock; // True if version 3-6 (single tile mode)
int temporal_motion_coder; // Temporal wavelet: 0=Haar, 1=CDF 5/3 (extracted from version)
// Screen masking (letterbox/pillarbox) - array of geometry changes // Screen masking (letterbox/pillarbox) - array of geometry changes
screen_mask_entry_t *screen_masks; screen_mask_entry_t *screen_masks;
@@ -1942,7 +1982,11 @@ static tav_decoder_t* tav_decoder_init(const char *input_file, const char *outpu
} }
decoder->frame_size = decoder->header.width * decoder->header.height; decoder->frame_size = decoder->header.width * decoder->header.height;
decoder->is_monoblock = (decoder->header.version >= 3 && decoder->header.version <= 6); // Extract temporal motion coder from version (versions 9-16 use CDF 5/3, 1-8 use Haar)
decoder->temporal_motion_coder = (decoder->header.version > 8) ? 1 : 0;
// Extract base version for determining monoblock mode
uint8_t base_version = (decoder->header.version > 8) ? (decoder->header.version - 8) : decoder->header.version;
decoder->is_monoblock = (base_version >= 3 && base_version <= 6);
decoder->audio_file_path = strdup(audio_file); decoder->audio_file_path = strdup(audio_file);
// Phase 2: Initialize decoding dimensions to full frame (will be updated by Screen Mask packets) // Phase 2: Initialize decoding dimensions to full frame (will be updated by Screen Mask packets)
@@ -2337,7 +2381,9 @@ static int decode_i_or_p_frame(tav_decoder_t *decoder, uint8_t packet_type, uint
// Dequantise (perceptual for versions 5-8, uniform for 1-4) // Dequantise (perceptual for versions 5-8, uniform for 1-4)
// Phase 2: Use decoding dimensions and temporary buffers // Phase 2: Use decoding dimensions and temporary buffers
const int is_perceptual = (decoder->header.version >= 5 && decoder->header.version <= 8); // Extract base version for perceptual check
uint8_t base_version = (decoder->header.version > 8) ? (decoder->header.version - 8) : decoder->header.version;
const int is_perceptual = (base_version >= 5 && base_version <= 8);
const int is_ezbc = (decoder->header.entropy_coder == 1); const int is_ezbc = (decoder->header.entropy_coder == 1);
if (is_ezbc && is_perceptual) { if (is_ezbc && is_perceptual) {
@@ -2472,7 +2518,9 @@ static int decode_i_or_p_frame(tav_decoder_t *decoder, uint8_t packet_type, uint
} }
// Convert YCoCg-R/ICtCp to RGB for cropped region // Convert YCoCg-R/ICtCp to RGB for cropped region
const int is_ictcp = (decoder->header.version % 2 == 0); // Extract base version for ICtCp check (even versions use ICtCp)
uint8_t base_version_rgb = (decoder->header.version > 8) ? (decoder->header.version - 8) : decoder->header.version;
const int is_ictcp = (base_version_rgb % 2 == 0);
for (int i = 0; i < decoding_pixels; i++) { for (int i = 0; i < decoding_pixels; i++) {
uint8_t r, g, b; uint8_t r, g, b;
@@ -2936,7 +2984,9 @@ int main(int argc, char *argv[]) {
} }
// Dequantise with temporal scaling (perceptual quantisation for versions 5-8) // Dequantise with temporal scaling (perceptual quantisation for versions 5-8)
const int is_perceptual = (decoder->header.version >= 5 && decoder->header.version <= 8); // Extract base version for perceptual check
uint8_t base_version_gop = (decoder->header.version > 8) ? (decoder->header.version - 8) : decoder->header.version;
const int is_perceptual = (base_version_gop >= 5 && base_version_gop <= 8);
const int is_ezbc = (decoder->header.entropy_coder == 1); const int is_ezbc = (decoder->header.entropy_coder == 1);
const int temporal_levels = 2; // Fixed for TAV GOP encoding const int temporal_levels = 2; // Fixed for TAV GOP encoding
@@ -3034,7 +3084,7 @@ int main(int argc, char *argv[]) {
// Phase 2: Use GOP dimensions (may be cropped) for inverse DWT // Phase 2: Use GOP dimensions (may be cropped) for inverse DWT
apply_inverse_3d_dwt(gop_y, gop_co, gop_cg, gop_width, gop_height, apply_inverse_3d_dwt(gop_y, gop_co, gop_cg, gop_width, gop_height,
gop_size, decoder->header.decomp_levels, temporal_levels, gop_size, decoder->header.decomp_levels, temporal_levels,
decoder->header.wavelet_filter); decoder->header.wavelet_filter, decoder->temporal_motion_coder);
// Debug: Check Y values after inverse DWT // Debug: Check Y values after inverse DWT
if (verbose && decoder->frame_count == 0) { if (verbose && decoder->frame_count == 0) {

View File

@@ -18,7 +18,7 @@
#include <limits.h> #include <limits.h>
#include <float.h> #include <float.h>
#define ENCODER_VENDOR_STRING "Encoder-TAV 20251122 (3d-dwt,tad,ssf-tc)" #define ENCODER_VENDOR_STRING "Encoder-TAV 20251123 (3d-dwt,tad,ssf-tc,cdf53-motion)"
// TSVM Advanced Video (TAV) format constants // TSVM Advanced Video (TAV) format constants
#define TAV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x41\x56" // "\x1FTSVM TAV" #define TAV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x41\x56" // "\x1FTSVM TAV"
@@ -1867,6 +1867,7 @@ typedef struct tav_encoder_s {
float **temporal_gop_co_frames; // [frame][pixel] - Co channel for each GOP frame float **temporal_gop_co_frames; // [frame][pixel] - Co channel for each GOP frame
float **temporal_gop_cg_frames; // [frame][pixel] - Cg channel for each GOP frame float **temporal_gop_cg_frames; // [frame][pixel] - Cg channel for each GOP frame
int temporal_decomp_levels; // Number of temporal DWT levels (default: 2) int temporal_decomp_levels; // Number of temporal DWT levels (default: 2)
int temporal_motion_coder; // Temporal wavelet type: 0=Haar, 1=CDF 5/3 (default: -1 = auto-select by resolution)
// MC-EZBC block-based motion compensation for temporal 3D DWT (0x13 packets) // MC-EZBC block-based motion compensation for temporal 3D DWT (0x13 packets)
int temporal_enable_mcezbc; // Flag to enable MC-EZBC block compensation (default: 0, uses translation if temporal_dwt enabled) int temporal_enable_mcezbc; // Flag to enable MC-EZBC block compensation (default: 0, uses translation if temporal_dwt enabled)
@@ -2412,6 +2413,7 @@ static void show_usage(const char *program_name) {
printf(" --enable-delta Enable delta encoding\n"); printf(" --enable-delta Enable delta encoding\n");
printf(" --delta-haar N Apply N-level Haar DWT to delta coefficients (1-6, auto-enables delta)\n"); printf(" --delta-haar N Apply N-level Haar DWT to delta coefficients (1-6, auto-enables delta)\n");
printf(" --3d-dwt Enable temporal 3D DWT (GOP-based encoding with temporal transform; the default encoding mode)\n"); printf(" --3d-dwt Enable temporal 3D DWT (GOP-based encoding with temporal transform; the default encoding mode)\n");
printf(" --motion-coder N Temporal wavelet: 0=Haar, 1=CDF 5/3 (default: auto-select based on resolution; use 0 for older version compatibility)\n");
printf(" --single-pass Disable two-pass encoding with wavelet-based scene change detection (optimal GOP boundaries)\n"); printf(" --single-pass Disable two-pass encoding with wavelet-based scene change detection (optimal GOP boundaries)\n");
// printf(" --mc-ezbc Enable MC-EZBC block-based motion compensation (requires --temporal-dwt, implies --ezbc)\n"); // printf(" --mc-ezbc Enable MC-EZBC block-based motion compensation (requires --temporal-dwt, implies --ezbc)\n");
printf(" --ezbc Enable EZBC (Embedded Zero Block Coding) entropy coding. May help reducing file size on high-quality videos\n"); printf(" --ezbc Enable EZBC (Embedded Zero Block Coding) entropy coding. May help reducing file size on high-quality videos\n");
@@ -2514,6 +2516,7 @@ static tav_encoder_t* create_encoder(void) {
enc->temporal_gop_width = 0; // Will be set when first frame is added to GOP enc->temporal_gop_width = 0; // Will be set when first frame is added to GOP
enc->temporal_gop_height = 0; // Will be set when first frame is added to GOP enc->temporal_gop_height = 0; // Will be set when first frame is added to GOP
enc->temporal_decomp_levels = TEMPORAL_DECOMP_LEVEL; // 3 levels of temporal DWT (24 -> 12 -> 6 -> 3 temporal subbands) enc->temporal_decomp_levels = TEMPORAL_DECOMP_LEVEL; // 3 levels of temporal DWT (24 -> 12 -> 6 -> 3 temporal subbands)
enc->temporal_motion_coder = -1; // Will be set automatically based on resolution (unless overridden)
enc->temporal_gop_rgb_frames = NULL; enc->temporal_gop_rgb_frames = NULL;
enc->temporal_gop_y_frames = NULL; enc->temporal_gop_y_frames = NULL;
enc->temporal_gop_co_frames = NULL; enc->temporal_gop_co_frames = NULL;
@@ -2836,7 +2839,7 @@ static int initialise_encoder(tav_encoder_t *enc) {
static void dwt_53_forward_1d(float *data, int length) { static void dwt_53_forward_1d(float *data, int length) {
if (length < 2) return; if (length < 2) return;
float *temp = malloc(length * sizeof(float)); float *temp = calloc(length, sizeof(float)); // Use calloc to zero-initialize for odd-length arrays
int half = (length + 1) / 2; // Handle odd lengths properly int half = (length + 1) / 2; // Handle odd lengths properly
// Predict step (high-pass) // Predict step (high-pass)
@@ -2846,6 +2849,7 @@ static void dwt_53_forward_1d(float *data, int length) {
float pred = 0.5f * (data[2 * i] + (2 * i + 2 < length ? data[2 * i + 2] : data[2 * i])); float pred = 0.5f * (data[2 * i] + (2 * i + 2 < length ? data[2 * i + 2] : data[2 * i]));
temp[half + i] = data[idx] - pred; temp[half + i] = data[idx] - pred;
} }
// Note: For odd lengths, last high-pass position remains zero (from calloc)
} }
// Update step (low-pass) // Update step (low-pass)
@@ -5612,7 +5616,12 @@ static void dwt_3d_forward(tav_encoder_t *enc, float **gop_data, int width, int
for (int level = 0; level < temporal_levels; level++) { for (int level = 0; level < temporal_levels; level++) {
int level_frames = temporal_lengths[level]; int level_frames = temporal_lengths[level];
if (level_frames >= 2) { if (level_frames >= 2) {
dwt_haar_forward_1d(temporal_line, level_frames); // Haar better for imperfect alignment // Use selected temporal wavelet (0=Haar, 1=CDF 5/3)
if (enc->temporal_motion_coder == 0) {
dwt_haar_forward_1d(temporal_line, level_frames);
} else {
dwt_53_forward_1d(temporal_line, level_frames);
}
} }
} }
@@ -7425,7 +7434,8 @@ static int write_tav_header(tav_encoder_t *enc) {
// Magic number // Magic number
fwrite(TAV_MAGIC, 1, 8, enc->output_fp); fwrite(TAV_MAGIC, 1, 8, enc->output_fp);
// Version (dynamic based on colour space, monoblock mode, and perceptual tuning) // Version (dynamic based on colour space, monoblock mode, perceptual tuning, and motion coder)
// Base versions 1-8, add 8 if temporal_motion_coder == 1 (CDF 5/3)
uint8_t version; uint8_t version;
if (enc->monoblock) { if (enc->monoblock) {
if (enc->perceptual_tuning) { if (enc->perceptual_tuning) {
@@ -7440,6 +7450,10 @@ static int write_tav_header(tav_encoder_t *enc) {
version = enc->ictcp_mode ? 2 : 1; version = enc->ictcp_mode ? 2 : 1;
} }
} }
// Add 8 if using CDF 5/3 temporal wavelet (motion_coder == 1)
if (enc->temporal_motion_coder == 1) {
version += 8;
}
fputc(version, enc->output_fp); fputc(version, enc->output_fp);
// Video parameters // Video parameters
@@ -10705,6 +10719,7 @@ int main(int argc, char *argv[]) {
{"temporal-3d", no_argument, 0, 1019}, {"temporal-3d", no_argument, 0, 1019},
{"dwt-3d", no_argument, 0, 1019}, {"dwt-3d", no_argument, 0, 1019},
{"3d-dwt", no_argument, 0, 1019}, {"3d-dwt", no_argument, 0, 1019},
{"motion-coder", required_argument, 0, 1030},
{"mc-ezbc", no_argument, 0, 1020}, {"mc-ezbc", no_argument, 0, 1020},
{"residual-coding", no_argument, 0, 1021}, {"residual-coding", no_argument, 0, 1021},
{"adaptive-blocks", no_argument, 0, 1022}, {"adaptive-blocks", no_argument, 0, 1022},
@@ -10946,6 +10961,12 @@ int main(int argc, char *argv[]) {
enc->preprocess_mode = PREPROCESS_RAW; enc->preprocess_mode = PREPROCESS_RAW;
printf("Raw coefficient mode enabled (no significance map preprocessing)\n"); printf("Raw coefficient mode enabled (no significance map preprocessing)\n");
break; break;
case 1030: // --motion-coder
enc->temporal_motion_coder = CLAMP(atoi(optarg), 0, 1);
printf("Temporal motion coder set to: %d (%s)\n",
enc->temporal_motion_coder,
enc->temporal_motion_coder == 0 ? "Haar" : "CDF 5/3");
break;
case 1050: // --single-pass case 1050: // --single-pass
enc->two_pass_mode = 0; enc->two_pass_mode = 0;
printf("Two-pass wavelet-based scene change detection disabled\n"); printf("Two-pass wavelet-based scene change detection disabled\n");
@@ -10987,6 +11008,26 @@ int main(int argc, char *argv[]) {
} }
} }
// Smart preset for temporal motion coder based on resolution
// For small videos (<500k pixels), use CDF 5/3 (better for fine details)
// For larger videos, use Haar (better compression, smoother motion matters less)
if (enc->temporal_motion_coder == -1) {
int num_pixels = enc->width * enc->height;
if (num_pixels >= 500000) {
enc->temporal_motion_coder = 0; // Haar
if (enc->verbose) {
printf("Auto-selected Haar temporal wavelet (resolution: %dx%d = %d pixels)\n",
enc->width, enc->height, num_pixels);
}
} else {
enc->temporal_motion_coder = 1; // CDF 5/3
if (enc->verbose) {
printf("Auto-selected CDF 5/3 temporal wavelet (resolution: %dx%d = %d pixels)\n",
enc->width, enc->height, num_pixels);
}
}
}
// generate division series // generate division series
enc->widths = malloc((enc->decomp_levels + 2) * sizeof(int)); enc->widths = malloc((enc->decomp_levels + 2) * sizeof(int));
enc->heights = malloc((enc->decomp_levels + 2) * sizeof(int)); enc->heights = malloc((enc->decomp_levels + 2) * sizeof(int));

View File

@@ -498,6 +498,8 @@ int main(int argc, char *argv[]) {
if (!opts.summary_only) { if (!opts.summary_only) {
// Parse header fields // Parse header fields
uint8_t version = header[8]; uint8_t version = header[8];
uint8_t base_version = (version > 8) ? (version - 8) : version;
uint8_t temporal_motion_coder = (version > 8) ? 1 : 0;
uint16_t width = *((uint16_t*)&header[9]); uint16_t width = *((uint16_t*)&header[9]);
uint16_t height = *((uint16_t*)&header[11]); uint16_t height = *((uint16_t*)&header[11]);
uint8_t fps = header[13]; uint8_t fps = header[13];
@@ -516,13 +518,15 @@ int main(int argc, char *argv[]) {
static const int QLUT[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120,122,124,126,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,264,272,280,288,296,304,312,320,328,336,344,352,360,368,376,384,392,400,408,416,424,432,440,448,456,464,472,480,488,496,504,512,528,544,560,576,592,608,624,640,656,672,688,704,720,736,752,768,784,800,816,832,848,864,880,896,912,928,944,960,976,992,1008,1024,1056,1088,1120,1152,1184,1216,1248,1280,1312,1344,1376,1408,1440,1472,1504,1536,1568,1600,1632,1664,1696,1728,1760,1792,1824,1856,1888,1920,1952,1984,2016,2048,2112,2176,2240,2304,2368,2432,2496,2560,2624,2688,2752,2816,2880,2944,3008,3072,3136,3200,3264,3328,3392,3456,3520,3584,3648,3712,3776,3840,3904,3968,4032,4096}; static const int QLUT[] = 
{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120,122,124,126,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,264,272,280,288,296,304,312,320,328,336,344,352,360,368,376,384,392,400,408,416,424,432,440,448,456,464,472,480,488,496,504,512,528,544,560,576,592,608,624,640,656,672,688,704,720,736,752,768,784,800,816,832,848,864,880,896,912,928,944,960,976,992,1008,1024,1056,1088,1120,1152,1184,1216,1248,1280,1312,1344,1376,1408,1440,1472,1504,1536,1568,1600,1632,1664,1696,1728,1760,1792,1824,1856,1888,1920,1952,1984,2016,2048,2112,2176,2240,2304,2368,2432,2496,2560,2624,2688,2752,2816,2880,2944,3008,3072,3136,3200,3264,3328,3392,3456,3520,3584,3648,3712,3776,3840,3904,3968,4032,4096};
static const char* CLAYOUT[] = {"Luma-Chroma", "Luma-Chroma-Alpha", "Luma", "Luma-Alpha", "Chroma", "Chroma-Alpha"}; static const char* CLAYOUT[] = {"Luma-Chroma", "Luma-Chroma-Alpha", "Luma", "Luma-Alpha", "Chroma", "Chroma-Alpha"};
int is_monoblock = (3 <= version && version <= 6); int is_monoblock = (3 <= base_version && base_version <= 6);
int is_perceptual = (5 <= version && version <= 8); int is_perceptual = (5 <= base_version && base_version <= 8);
static const char* VERDESC[] = {"null", "YCoCg tiled, uniform", "ICtCp tiled, uniform", "YCoCg monoblock, uniform", "ICtCp monoblock, uniform", "YCoCg monoblock, perceptual", "ICtCp monoblock, perceptual", "YCoCg tiled, perceptual", "ICtCp tiled, perceptual"}; static const char* VERDESC[] = {"null", "YCoCg tiled, uniform", "ICtCp tiled, uniform", "YCoCg monoblock, uniform", "ICtCp monoblock, uniform", "YCoCg monoblock, perceptual", "ICtCp monoblock, perceptual", "YCoCg tiled, perceptual", "ICtCp tiled, perceptual"};
static const char* TEMPORAL_WAVELET[] = {"Haar", "CDF 5/3"};
printf("TAV Header:\n"); printf("TAV Header:\n");
printf(" Version: %d (%s)\n", version, VERDESC[version]); printf(" Version: %d (base: %d - %s, temporal: %s)\n",
version, base_version, VERDESC[base_version], TEMPORAL_WAVELET[temporal_motion_coder]);
printf(" Resolution: %dx%d\n", width, height); printf(" Resolution: %dx%d\n", width, height);
printf(" Frame rate: %d fps", fps); printf(" Frame rate: %d fps", fps);
if (video_flags & 0x02) printf(" (NTSC)"); if (video_flags & 0x02) printf(" (NTSC)");