mirror of
https://github.com/curioustorvald/tsvm.git
synced 2026-03-07 19:51:51 +09:00
TAV: experimental 3D DWT encoder
This commit is contained in:
49
CLAUDE.md
49
CLAUDE.md
@@ -189,6 +189,9 @@ Peripheral memories can be accessed using `vm.peek()` and `vm.poke()` functions,
|
|||||||
./encoder_tav -i input.mp4 -q 0 -o output.tav # Lowest quality, smallest file
|
./encoder_tav -i input.mp4 -q 0 -o output.tav # Lowest quality, smallest file
|
||||||
./encoder_tav -i input.mp4 -q 5 -o output.tav # Highest quality, largest file
|
./encoder_tav -i input.mp4 -q 5 -o output.tav # Highest quality, largest file
|
||||||
|
|
||||||
|
# Temporal 3D DWT (GOP-based encoding)
|
||||||
|
./encoder_tav -i input.mp4 --temporal-dwt -q 2 -o output.tav
|
||||||
|
|
||||||
# Playback
|
# Playback
|
||||||
playtav output.tav
|
playtav output.tav
|
||||||
```
|
```
|
||||||
@@ -264,3 +267,49 @@ Concatenated Maps Layout:
|
|||||||
- **Compression improvement**: 16.4% from significance maps + 1.6% from concatenated layout
|
- **Compression improvement**: 16.4% from significance maps + 1.6% from concatenated layout
|
||||||
- **Real-world impact**: 559 bytes saved per frame (5.59 MB per 10k frames)
|
- **Real-world impact**: 559 bytes saved per frame (5.59 MB per 10k frames)
|
||||||
- **Cross-channel benefit**: Concatenated maps allow Zstd to exploit similarity between significance patterns
|
- **Cross-channel benefit**: Concatenated maps allow Zstd to exploit similarity between significance patterns
|
||||||
|
|
||||||
|
#### TAV Temporal 3D DWT (GOP Unified Encoding)
|
||||||
|
|
||||||
|
Implemented on 2025-10-15 for improved temporal compression through group-of-pictures (GOP) encoding:
|
||||||
|
|
||||||
|
**Key Features**:
|
||||||
|
- **3D DWT**: Applies DWT in both spatial (2D) and temporal (1D) dimensions for optimal spacetime compression
|
||||||
|
- **Unified GOP Preprocessing**: Single significance map for all frames and channels in a GOP (width×height×N_frames×3_channels)
|
||||||
|
- **FFT-based Phase Correlation**: Uses FFTW3 library for accurate global motion estimation with quarter-pixel precision
|
||||||
|
- **GOP Size**: Typically 16 frames (configurable), with scene change detection for adaptive GOPs
|
||||||
|
- **Single-frame Fallback**: GOP size of 1 automatically uses traditional I-frame encoding
|
||||||
|
|
||||||
|
**Packet Format**:
|
||||||
|
- **0x12 (GOP_UNIFIED)**: `[gop_size][motion_vectors...][compressed_size][compressed_data]`
|
||||||
|
- Motion vectors stored as int16_t in quarter-pixel units for all frames in GOP
|
||||||
|
- Unified significance map for entire GOP block enables cross-frame compression
|
||||||
|
- **0xFC (GOP_SYNC)**: `[frame_count]` - Indicates N frames were decoded from GOP block
|
||||||
|
- **Timecode Emission**: One timecode packet per GOP (not per frame)
|
||||||
|
|
||||||
|
**Technical Implementation**:
|
||||||
|
```c
|
||||||
|
// Unified preprocessing structure (encoder_tav.c:2371-2509)
|
||||||
|
[All_Y_maps][All_Co_maps][All_Cg_maps][All_Y_values][All_Co_values][All_Cg_values]
|
||||||
|
// Where maps are grouped by channel across all GOP frames for optimal Zstd compression
|
||||||
|
|
||||||
|
// Phase correlation using FFT (encoder_tav.c:1246-1383)
|
||||||
|
// - FFTW3 forward FFT on grayscale frames
|
||||||
|
// - Cross-power spectrum computation
|
||||||
|
// - Inverse FFT gives correlation peak at (dx, dy)
|
||||||
|
// - Parabolic interpolation for quarter-pixel refinement
|
||||||
|
```
|
||||||
|
|
||||||
|
**Usage**:
|
||||||
|
```bash
|
||||||
|
# Enable temporal 3D DWT
|
||||||
|
./encoder_tav -i input.mp4 --temporal-dwt -q 2 -o output.tav
|
||||||
|
|
||||||
|
# Inspect GOP structure
|
||||||
|
./tav_inspector output.tav -v
|
||||||
|
```
|
||||||
|
|
||||||
|
**Compression Benefits**:
|
||||||
|
- **Temporal Coherence**: Exploits similarity across consecutive frames
|
||||||
|
- **Unified Compression**: Zstd compresses entire GOP as single block, finding patterns across time
|
||||||
|
- **Motion Compensation**: FFT-based phase correlation provides accurate global motion estimation
|
||||||
|
- **Adaptive GOPs**: Scene change detection ensures optimal GOP boundaries
|
||||||
|
|||||||
@@ -955,6 +955,7 @@ transmission capability, and region-of-interest coding.
|
|||||||
<video packets>
|
<video packets>
|
||||||
0x10: I-frame (intra-coded frame)
|
0x10: I-frame (intra-coded frame)
|
||||||
0x11: P-frame (delta/skip frame)
|
0x11: P-frame (delta/skip frame)
|
||||||
|
0x12: GOP Unified (temporal 3D DWT with unified preprocessing)
|
||||||
0x1F: (prohibited)
|
0x1F: (prohibited)
|
||||||
0x20: MP2 audio packet
|
0x20: MP2 audio packet
|
||||||
0x30: Subtitle in "Simple" format
|
0x30: Subtitle in "Simple" format
|
||||||
@@ -980,6 +981,7 @@ transmission capability, and region-of-interest coding.
|
|||||||
0xEF: TAV Extended Header
|
0xEF: TAV Extended Header
|
||||||
0xF0: Loop point start (insert right AFTER the TC packet; no payload)
|
0xF0: Loop point start (insert right AFTER the TC packet; no payload)
|
||||||
0xF1: Loop point end (insert right AFTER the TC packet; no payload)
|
0xF1: Loop point end (insert right AFTER the TC packet; no payload)
|
||||||
|
0xFC: GOP Sync packet (indicates N frames decoded from GOP block)
|
||||||
0xFD: Timecode (TC) Packet [for frame 0, insert at the beginning; otherwise, insert right AFTER the sync]
|
0xFD: Timecode (TC) Packet [for frame 0, insert at the beginning; otherwise, insert right AFTER the sync]
|
||||||
0xFE: NTSC sync packet (used by player to calculate exact framerate-wise performance; no payload)
|
0xFE: NTSC sync packet (used by player to calculate exact framerate-wise performance; no payload)
|
||||||
0xFF: Sync packet (no payload)
|
0xFF: Sync packet (no payload)
|
||||||
@@ -991,7 +993,7 @@ transmission capability, and region-of-interest coding.
|
|||||||
2. Standard metadata payloads (if any)
|
2. Standard metadata payloads (if any)
|
||||||
|
|
||||||
Frame group:
|
Frame group:
|
||||||
1. TC Packet (0xFD) or File packet (0x1F) [mutually exclusive!]
|
1. TC Packet (0xFD) or Next TAV File (0x1F) [mutually exclusive!]
|
||||||
2. Loop point packets
|
2. Loop point packets
|
||||||
3. Audio packets
|
3. Audio packets
|
||||||
4. Subtitle packets
|
4. Subtitle packets
|
||||||
@@ -1045,11 +1047,58 @@ transmission capability, and region-of-interest coding.
|
|||||||
uint8 Packet Type (0xFD)
|
uint8 Packet Type (0xFD)
|
||||||
uint64 Time since stream start in nanoseconds (this may NOT start from zero if the video is coming from a livestream)
|
uint64 Time since stream start in nanoseconds (this may NOT start from zero if the video is coming from a livestream)
|
||||||
|
|
||||||
## Video Packet Structure
|
## Video Packet Structure (0x10, 0x11)
|
||||||
uint8 Packet Type
|
uint8 Packet Type
|
||||||
uint32 Compressed Size
|
uint32 Compressed Size
|
||||||
* Zstd-compressed Block Data
|
* Zstd-compressed Block Data
|
||||||
|
|
||||||
|
## GOP Unified Packet Structure (0x12)
|
||||||
|
Implemented on 2025-10-15 for temporal 3D DWT with unified preprocessing.
|
||||||
|
This packet contains multiple frames encoded as a single spacetime block for optimal
|
||||||
|
temporal compression.
|
||||||
|
|
||||||
|
uint8 Packet Type (0x12)
|
||||||
|
uint8 GOP Size (number of frames in this GOP, typically 16)
|
||||||
|
int16 Motion Vectors X[GOP Size] (quarter-pixel precision for global motion compensation)
|
||||||
|
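int16 Motion Vectors Y[GOP Size] (quarter-pixel precision for global motion compensation)
NOTE: tav_inspector reads the motion vectors as interleaved (X, Y) pairs, one pair per frame, not as an X array followed by a Y array; confirm the on-disk order against the encoder and keep this layout and the reader in agreement.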
|
||||||
|
uint32 Compressed Size
|
||||||
|
* Zstd-compressed Unified Block Data
|
||||||
|
|
||||||
|
### Unified Block Data Format
|
||||||
|
The entire GOP (width×height×N_frames×3_channels) is preprocessed as a single block:
|
||||||
|
|
||||||
|
uint8 Y Significance Maps[(width*height + 7) / 8 * GOP Size] // All Y frames concatenated
|
||||||
|
uint8 Co Significance Maps[(width*height + 7) / 8 * GOP Size] // All Co frames concatenated
|
||||||
|
uint8 Cg Significance Maps[(width*height + 7) / 8 * GOP Size] // All Cg frames concatenated
|
||||||
|
int16 Y Non-zero Values[variable length] // All Y non-zero coefficients
|
||||||
|
int16 Co Non-zero Values[variable length] // All Co non-zero coefficients
|
||||||
|
int16 Cg Non-zero Values[variable length] // All Cg non-zero coefficients
|
||||||
|
|
||||||
|
This layout enables Zstd to find patterns across both spatial and temporal dimensions,
|
||||||
|
resulting in superior compression compared to per-frame encoding.
|
||||||
|
|
||||||
|
### Motion Vectors
|
||||||
|
- Stored in quarter-pixel units (divide by 4.0 for pixel displacement)
|
||||||
|
- Used for global motion compensation (camera movement, scene translation)
|
||||||
|
- Computed using FFT-based phase correlation for accurate frame alignment
|
||||||
|
- First frame (frame 0) typically has motion vector (0, 0)
|
||||||
|
|
||||||
|
### Temporal 3D DWT Process
|
||||||
|
1. Apply 1D DWT across temporal axis (GOP frames)
|
||||||
|
2. Apply 2D DWT on each spatial slice of temporal subbands
|
||||||
|
3. Perceptual quantization with temporal-spatial awareness
|
||||||
|
4. Unified significance map preprocessing across all frames/channels
|
||||||
|
5. Single Zstd compression of entire GOP block
|
||||||
|
|
||||||
|
## GOP Sync Packet Structure (0xFC)
|
||||||
|
Indicates that N frames were decoded from a GOP Unified block.
|
||||||
|
Decoders must track this to maintain proper frame count and synchronization.
|
||||||
|
|
||||||
|
uint8 Packet Type (0xFC)
|
||||||
|
uint8 Frame Count (number of frames that were decoded from the preceding GOP Unified block)
|
||||||
|
|
||||||
|
Note: GOP Sync packets have no payload size field (fixed 2-byte packet).
|
||||||
|
|
||||||
## Block Data (per frame)
|
## Block Data (per frame)
|
||||||
uint8 Mode: encoding mode
|
uint8 Mode: encoding mode
|
||||||
0x00 = SKIP (just use frame data from previous frame)
|
0x00 = SKIP (just use frame data from previous frame)
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ tev: encoder_tev.c
|
|||||||
|
|
||||||
tav: encoder_tav.c
|
tav: encoder_tav.c
|
||||||
rm -f encoder_tav
|
rm -f encoder_tav
|
||||||
$(CC) $(CFLAGS) -o encoder_tav $< $(LIBS)
|
$(CC) $(CFLAGS) -o encoder_tav $< $(LIBS) -lfftw3f
|
||||||
|
|
||||||
tav_decoder: decoder_tav.c
|
tav_decoder: decoder_tav.c
|
||||||
rm -f decoder_tav
|
rm -f decoder_tav
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -16,6 +16,7 @@
|
|||||||
// Packet type constants
|
// Packet type constants
|
||||||
#define TAV_PACKET_IFRAME 0x10
|
#define TAV_PACKET_IFRAME 0x10
|
||||||
#define TAV_PACKET_PFRAME 0x11
|
#define TAV_PACKET_PFRAME 0x11
|
||||||
|
#define TAV_PACKET_GOP_UNIFIED 0x12 // Unified 3D DWT GOP (all frames in single block)
|
||||||
#define TAV_PACKET_AUDIO_MP2 0x20
|
#define TAV_PACKET_AUDIO_MP2 0x20
|
||||||
#define TAV_PACKET_SUBTITLE 0x30
|
#define TAV_PACKET_SUBTITLE 0x30
|
||||||
#define TAV_PACKET_SUBTITLE_KAR 0x31
|
#define TAV_PACKET_SUBTITLE_KAR 0x31
|
||||||
@@ -43,6 +44,7 @@
|
|||||||
#define TAV_PACKET_EXTENDED_HDR 0xEF
|
#define TAV_PACKET_EXTENDED_HDR 0xEF
|
||||||
#define TAV_PACKET_LOOP_START 0xF0
|
#define TAV_PACKET_LOOP_START 0xF0
|
||||||
#define TAV_PACKET_LOOP_END 0xF1
|
#define TAV_PACKET_LOOP_END 0xF1
|
||||||
|
#define TAV_PACKET_GOP_SYNC 0xFC // GOP sync packet (N frames decoded)
|
||||||
#define TAV_PACKET_TIMECODE 0xFD
|
#define TAV_PACKET_TIMECODE 0xFD
|
||||||
#define TAV_PACKET_SYNC_NTSC 0xFE
|
#define TAV_PACKET_SYNC_NTSC 0xFE
|
||||||
#define TAV_PACKET_SYNC 0xFF
|
#define TAV_PACKET_SYNC 0xFF
|
||||||
@@ -55,6 +57,9 @@ typedef struct {
|
|||||||
int pframe_intra_count;
|
int pframe_intra_count;
|
||||||
int pframe_delta_count;
|
int pframe_delta_count;
|
||||||
int pframe_skip_count;
|
int pframe_skip_count;
|
||||||
|
int gop_unified_count;
|
||||||
|
int gop_sync_count;
|
||||||
|
int total_gop_frames;
|
||||||
int audio_count;
|
int audio_count;
|
||||||
int subtitle_count;
|
int subtitle_count;
|
||||||
int timecode_count;
|
int timecode_count;
|
||||||
@@ -87,6 +92,7 @@ const char* get_packet_type_name(uint8_t type) {
|
|||||||
switch (type) {
|
switch (type) {
|
||||||
case TAV_PACKET_IFRAME: return "I-FRAME";
|
case TAV_PACKET_IFRAME: return "I-FRAME";
|
||||||
case TAV_PACKET_PFRAME: return "P-FRAME";
|
case TAV_PACKET_PFRAME: return "P-FRAME";
|
||||||
|
case TAV_PACKET_GOP_UNIFIED: return "GOP (3D DWT Unified)";
|
||||||
case TAV_PACKET_AUDIO_MP2: return "AUDIO MP2";
|
case TAV_PACKET_AUDIO_MP2: return "AUDIO MP2";
|
||||||
case TAV_PACKET_SUBTITLE: return "SUBTITLE (Simple)";
|
case TAV_PACKET_SUBTITLE: return "SUBTITLE (Simple)";
|
||||||
case TAV_PACKET_SUBTITLE_KAR: return "SUBTITLE (Karaoke)";
|
case TAV_PACKET_SUBTITLE_KAR: return "SUBTITLE (Karaoke)";
|
||||||
@@ -98,6 +104,7 @@ const char* get_packet_type_name(uint8_t type) {
|
|||||||
case TAV_PACKET_EXTENDED_HDR: return "EXTENDED HEADER";
|
case TAV_PACKET_EXTENDED_HDR: return "EXTENDED HEADER";
|
||||||
case TAV_PACKET_LOOP_START: return "LOOP START";
|
case TAV_PACKET_LOOP_START: return "LOOP START";
|
||||||
case TAV_PACKET_LOOP_END: return "LOOP END";
|
case TAV_PACKET_LOOP_END: return "LOOP END";
|
||||||
|
case TAV_PACKET_GOP_SYNC: return "GOP SYNC";
|
||||||
case TAV_PACKET_TIMECODE: return "TIMECODE";
|
case TAV_PACKET_TIMECODE: return "TIMECODE";
|
||||||
case TAV_PACKET_SYNC_NTSC: return "SYNC (NTSC)";
|
case TAV_PACKET_SYNC_NTSC: return "SYNC (NTSC)";
|
||||||
case TAV_PACKET_SYNC: return "SYNC";
|
case TAV_PACKET_SYNC: return "SYNC";
|
||||||
@@ -114,6 +121,7 @@ int should_display_packet(uint8_t type, display_options_t *opts) {
|
|||||||
if (opts->show_all) return 1;
|
if (opts->show_all) return 1;
|
||||||
|
|
||||||
if (opts->show_video && (type == TAV_PACKET_IFRAME || type == TAV_PACKET_PFRAME ||
|
if (opts->show_video && (type == TAV_PACKET_IFRAME || type == TAV_PACKET_PFRAME ||
|
||||||
|
type == TAV_PACKET_GOP_UNIFIED || type == TAV_PACKET_GOP_SYNC ||
|
||||||
(type >= 0x70 && type <= 0x7F))) return 1;
|
(type >= 0x70 && type <= 0x7F))) return 1;
|
||||||
if (opts->show_audio && type == TAV_PACKET_AUDIO_MP2) return 1;
|
if (opts->show_audio && type == TAV_PACKET_AUDIO_MP2) return 1;
|
||||||
if (opts->show_subtitles && (type == TAV_PACKET_SUBTITLE || type == TAV_PACKET_SUBTITLE_KAR)) return 1;
|
if (opts->show_subtitles && (type == TAV_PACKET_SUBTITLE || type == TAV_PACKET_SUBTITLE_KAR)) return 1;
|
||||||
@@ -449,6 +457,64 @@ int main(int argc, char *argv[]) {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case TAV_PACKET_GOP_UNIFIED: {
|
||||||
|
// Unified GOP packet: [gop_size][motion_vectors...][compressed_size][data]
|
||||||
|
uint8_t gop_size;
|
||||||
|
if (fread(&gop_size, 1, 1, fp) != 1) break;
|
||||||
|
|
||||||
|
// Read all motion vectors
|
||||||
|
int16_t *motion_x = malloc(gop_size * sizeof(int16_t));
|
||||||
|
int16_t *motion_y = malloc(gop_size * sizeof(int16_t));
|
||||||
|
for (int i = 0; i < gop_size; i++) {
|
||||||
|
if (fread(&motion_x[i], sizeof(int16_t), 1, fp) != 1) break;
|
||||||
|
if (fread(&motion_y[i], sizeof(int16_t), 1, fp) != 1) break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read compressed data size
|
||||||
|
uint32_t size;
|
||||||
|
if (fread(&size, sizeof(uint32_t), 1, fp) != 1) {
|
||||||
|
free(motion_x);
|
||||||
|
free(motion_y);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
stats.total_video_bytes += size;
|
||||||
|
stats.gop_unified_count++;
|
||||||
|
stats.total_gop_frames += gop_size;
|
||||||
|
|
||||||
|
if (!opts.summary_only && display) {
|
||||||
|
printf(" - GOP size=%u, data size=%u bytes (%.2f bytes/frame)",
|
||||||
|
gop_size, size, (double)size / gop_size);
|
||||||
|
|
||||||
|
// Always show motion vectors for GOP packets
|
||||||
|
if (gop_size > 0) {
|
||||||
|
printf("\n Motion vectors (quarter-pixel):");
|
||||||
|
for (int i = 0; i < gop_size; i++) {
|
||||||
|
printf("\n Frame %d: (%.2f, %.2f) px",
|
||||||
|
i, motion_x[i] / 4.0, motion_y[i] / 4.0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
free(motion_x);
|
||||||
|
free(motion_y);
|
||||||
|
fseek(fp, size, SEEK_CUR);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case TAV_PACKET_GOP_SYNC: {
|
||||||
|
// GOP sync packet: [frame_count]
|
||||||
|
uint8_t frame_count;
|
||||||
|
if (fread(&frame_count, 1, 1, fp) != 1) break;
|
||||||
|
|
||||||
|
stats.gop_sync_count++;
|
||||||
|
|
||||||
|
if (!opts.summary_only && display) {
|
||||||
|
printf(" - %u frames decoded from GOP block", frame_count);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
case TAV_PACKET_IFRAME:
|
case TAV_PACKET_IFRAME:
|
||||||
case TAV_PACKET_PFRAME:
|
case TAV_PACKET_PFRAME:
|
||||||
case TAV_PACKET_VIDEO_CH2_I:
|
case TAV_PACKET_VIDEO_CH2_I:
|
||||||
@@ -610,6 +676,12 @@ int main(int argc, char *argv[]) {
|
|||||||
printf(")");
|
printf(")");
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
if (stats.gop_unified_count > 0) {
|
||||||
|
printf(" 3D GOP packets: %d (total frames: %d, avg %.1f frames/GOP)\n",
|
||||||
|
stats.gop_unified_count, stats.total_gop_frames,
|
||||||
|
(double)stats.total_gop_frames / stats.gop_unified_count);
|
||||||
|
printf(" GOP sync packets: %d\n", stats.gop_sync_count);
|
||||||
|
}
|
||||||
printf(" Mux video: %d\n", stats.mux_video_count);
|
printf(" Mux video: %d\n", stats.mux_video_count);
|
||||||
printf(" Total video bytes: %llu (%.2f MB)\n",
|
printf(" Total video bytes: %llu (%.2f MB)\n",
|
||||||
(unsigned long long)stats.total_video_bytes,
|
(unsigned long long)stats.total_video_bytes,
|
||||||
|
|||||||
Reference in New Issue
Block a user