TAV: pcm8 audio

2026-03-07 19:51:51 +09:00 · 2025-10-22 10:05:54 +09:00
parent 758b134abd
commit 4265891093
15 changed files with 298 additions and 47 deletions
--- a/video_encoder/encoder_tav.c
+++ b/video_encoder/encoder_tav.c
@@ -1,4 +1,4 @@
-// Created by Claude on 2025-09-13.
+// Created by CuriousTorvald and Claude on 2025-09-13.
 // TAV (TSVM Advanced Video) Encoder - DWT-based compression with full resolution YCoCg-R
 #include <stdio.h>
 #include <stdlib.h>
@@ -54,6 +54,7 @@
 #define TAV_PACKET_PFRAME_ADAPTIVE 0x16  // P-frame with adaptive quad-tree block partitioning
 #define TAV_PACKET_BFRAME_ADAPTIVE 0x17  // B-frame with adaptive quad-tree block partitioning (bidirectional prediction)
 #define TAV_PACKET_AUDIO_MP2       0x20  // MP2 audio
+#define TAV_PACKET_AUDIO_PCM8      0x21  // 8-bit PCM audio (zstd compressed)
 #define TAV_PACKET_SUBTITLE        0x30  // Subtitle packet
 #define TAV_PACKET_AUDIO_TRACK     0x40  // Separate audio track (full MP2 file)
 #define TAV_PACKET_EXTENDED_HDR    0xEF  // Extended header packet
@@ -118,6 +119,7 @@ static int needs_alpha_channel(int channel_layout) {
 #define DEFAULT_FPS 30
 #define DEFAULT_QUALITY 3
 #define DEFAULT_ZSTD_LEVEL 3
+#define DEFAULT_PCM_ZSTD_LEVEL 3
 #define TEMPORAL_GOP_SIZE 20
 #define TEMPORAL_DECOMP_LEVEL 2
 #define MOTION_THRESHOLD 24.0f // Flush if motion exceeds 24 pixels in any direction
@@ -159,6 +161,7 @@ static void generate_random_filename(char *filename) {
 }

 char TEMP_AUDIO_FILE[42];
+char TEMP_PCM_FILE[42];

 // Utility macros
 static inline int CLAMP(int x, int min, int max) {
@@ -1694,7 +1697,8 @@ typedef struct tav_encoder_s {
    FILE *output_fp;
    FILE *mp2_file;
    FILE *ffmpeg_video_pipe;
-    
+    FILE *pcm_file;  // PCM16LE audio file for PCM8 mode
+
    // Video parameters
    int width, height;
    int fps;
@@ -1744,6 +1748,7 @@ typedef struct tav_encoder_s {
    int use_delta_encoding;
    int delta_haar_levels; // Number of Haar DWT levels to apply to delta coefficients (0 = disabled)
    int separate_audio_track; // 1 = write entire MP2 file as packet 0x40 after header, 0 = interleave audio (default)
+    int pcm8_audio; // 1 = use 8-bit PCM audio (packet 0x21), 0 = use MP2 (default)

    // Frame buffers - ping-pong implementation
    uint8_t *frame_rgb[2];      // [0] and [1] alternate between current and previous
@@ -1846,6 +1851,12 @@ typedef struct tav_encoder_s {
    int audio_bitrate;  // Custom audio bitrate (0 = use quality table)
    int target_audio_buffer_size;
    double audio_frames_in_buffer;
+
+    // PCM8 audio processing
+    int samples_per_frame;  // Number of stereo samples per video frame
+    int16_t *pcm16_buffer;  // Buffer for reading PCM16LE data
+    uint8_t *pcm8_buffer;   // Buffer for converted PCM8 data
+    int16_t dither_error[2]; // Dithering error for stereo channels [L, R]
    
    // Subtitle processing  
    subtitle_entry_t *subtitles;
@@ -2256,6 +2267,7 @@ static void show_usage(const char *program_name) {
    printf("  -a, --arate N           MP2 audio bitrate in kbps (overrides quality-based audio rate)\n");
    printf("                          Valid values: 32, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320, 384\n");
    printf("  --separate-audio-track  Write entire MP2 file as single packet 0x40 (instead of interleaved)\n");
+    printf("  --pcm8-audio            Use 8-bit PCM audio (packet 0x21, zstd compressed, per-frame packets)\n");
    printf("  -S, --subtitles FILE    SubRip (.srt) or SAMI (.smi) subtitle file\n");
    printf("  --fontrom-lo FILE       Low font ROM file for internationalised subtitles\n");
    printf("  --fontrom-hi FILE       High font ROM file for internationalised subtitles\n");
@@ -2344,6 +2356,7 @@ static tav_encoder_t* create_encoder(void) {
    enc->use_delta_encoding = 0;
    enc->delta_haar_levels = TEMPORAL_DECOMP_LEVEL;
    enc->separate_audio_track = 0;  // Default: interleave audio packets
+    enc->pcm8_audio = 0;  // Default: use MP2 audio

    // GOP / temporal DWT settings
    enc->enable_temporal_dwt = 1;  // Mutually exclusive with use_delta_encoding
@@ -7957,28 +7970,61 @@ static int start_audio_conversion(tav_encoder_t *enc) {
    if (!enc->has_audio) return 1;

    char command[2048];
-    int bitrate;
-    if (enc->audio_bitrate > 0) {
-        bitrate = enc->audio_bitrate;
-    } else {
-        bitrate = enc->lossless ? 384 : MP2_RATE_TABLE[enc->quality_level];
-    }
-    printf("  Audio format: MP2 %dkbps (via libtwolame)\n", bitrate);
-    snprintf(command, sizeof(command),
-        "ffmpeg -v quiet -i \"%s\" -acodec libtwolame -psymodel 4 -b:a %dk -ar 32000 -ac 2 -y \"%s\" 2>/dev/null",
-        enc->input_file, bitrate, TEMP_AUDIO_FILE);

-    int result = system(command);
-    if (result == 0) {
-        enc->mp2_file = fopen(TEMP_AUDIO_FILE, "rb");
-        if (enc->mp2_file) {
-            fseek(enc->mp2_file, 0, SEEK_END);
-            enc->audio_remaining = ftell(enc->mp2_file);
-            fseek(enc->mp2_file, 0, SEEK_SET);
+    if (enc->pcm8_audio) {
+        // Extract PCM16LE for PCM8 mode
+        printf("  Audio format: PCM16LE 32kHz stereo (will be converted to 8-bit)\n");
+        snprintf(command, sizeof(command),
+            "ffmpeg -v quiet -i \"%s\" -f s16le -acodec pcm_s16le -ar %d -ac 2 -y \"%s\" 2>/dev/null",
+            enc->input_file, TSVM_AUDIO_SAMPLE_RATE, TEMP_PCM_FILE);
+
+        int result = system(command);
+        if (result == 0) {
+            enc->pcm_file = fopen(TEMP_PCM_FILE, "rb");
+            if (enc->pcm_file) {
+                fseek(enc->pcm_file, 0, SEEK_END);
+                enc->audio_remaining = ftell(enc->pcm_file);
+                fseek(enc->pcm_file, 0, SEEK_SET);
+
+                // Calculate samples per frame: ceil(sample_rate / fps)
+                enc->samples_per_frame = (TSVM_AUDIO_SAMPLE_RATE + enc->output_fps - 1) / enc->output_fps;
+
+                // Initialize dithering error
+                enc->dither_error[0] = 0;
+                enc->dither_error[1] = 0;
+
+                if (enc->verbose) {
+                    printf("  PCM8: %d samples per frame\n", enc->samples_per_frame);
+                }
+            }
+            return 1;
        }
-        return 1;
+        return 0;
+    } else {
+        // Extract MP2 for normal mode
+        int bitrate;
+        if (enc->audio_bitrate > 0) {
+            bitrate = enc->audio_bitrate;
+        } else {
+            bitrate = enc->lossless ? 384 : MP2_RATE_TABLE[enc->quality_level];
+        }
+        printf("  Audio format: MP2 %dkbps (via libtwolame)\n", bitrate);
+        snprintf(command, sizeof(command),
+            "ffmpeg -v quiet -i \"%s\" -acodec libtwolame -psymodel 4 -b:a %dk -ar %d -ac 2 -y \"%s\" 2>/dev/null",
+            enc->input_file, bitrate, TSVM_AUDIO_SAMPLE_RATE, TEMP_AUDIO_FILE);
+
+        int result = system(command);
+        if (result == 0) {
+            enc->mp2_file = fopen(TEMP_AUDIO_FILE, "rb");
+            if (enc->mp2_file) {
+                fseek(enc->mp2_file, 0, SEEK_END);
+                enc->audio_remaining = ftell(enc->mp2_file);
+                fseek(enc->mp2_file, 0, SEEK_SET);
+            }
+            return 1;
+        }
+        return 0;
    }
-    return 0;
 }

 // Get MP2 packet size from header (copied from TEV)
@@ -8599,6 +8645,36 @@ static long write_extended_header(tav_encoder_t *enc) {
    return endt_offset + 4 + 1;  // 4 bytes for "ENDT", 1 byte for type
 }

+// Convert interleaved stereo PCM16LE to unsigned 8-bit PCM with error-diffusion dithering.
+// pcm16 and pcm8 each hold num_samples stereo pairs; the per-channel residual is carried across
+// calls in enc->dither_error and is capped to one 8-bit step so clipping cannot overflow it.
+static void convert_pcm16_to_pcm8_dithered(tav_encoder_t *enc, const int16_t *pcm16, uint8_t *pcm8, int num_samples) {
+    for (int i = 0; i < num_samples; i++) {
+        for (int ch = 0; ch < 2; ch++) {  // Stereo: L and R
+            int idx = i * 2 + ch;
+
+            // Shift signed 16-bit [-32768, 32767] to unsigned [0, 65535], then add the carried residual
+            int32_t sample = (int32_t)pcm16[idx] + 32768;
+            sample += enc->dither_error[ch];
+
+            // Quantize to 8-bit (divide by 256) and clamp to [0, 255]
+            int32_t quantized = sample >> 8;
+            if (quantized < 0) quantized = 0;
+            if (quantized > 255) quantized = 255;
+
+            pcm8[idx] = (uint8_t)quantized;
+
+            // Residual fed into the next sample of this channel. Without clipping it is already in
+            // [0, 255]; when the output clipped it would otherwise grow by up to 255 per sample and
+            // eventually wrap the int16_t field, so bound it to a single quantization step.
+            int32_t residual = sample - quantized * 256;
+            if (residual < 0) residual = 0;
+            if (residual > 255) residual = 255;
+            enc->dither_error[ch] = (int16_t)residual;
+        }
+    }
+}
+
 // Write separate audio track packet (0x40) - entire MP2 file in one packet
 static int write_separate_audio_track(tav_encoder_t *enc, FILE *output) {
    if (!enc->has_audio || !enc->mp2_file) {
@@ -8651,6 +8727,97 @@ static int write_separate_audio_track(tav_encoder_t *enc, FILE *output) {
    return 1;
 }

+// Write one PCM8 audio packet (0x21). Reads up to samples_to_read stereo samples of PCM16LE from
+// enc->pcm_file, converts them to dithered unsigned 8-bit, zstd-compresses them and appends the
+// packet to output. Returns 1 if a packet was written, 0 if no audio was left or on error.
+static int write_pcm8_packet_samples(tav_encoder_t *enc, FILE *output, int samples_to_read) {
+    if (!enc->pcm_file || enc->audio_remaining <= 0 || samples_to_read <= 0) {
+        return 0;
+    }
+
+    // The conversion buffers hold max_samples stereo samples; never read more than fits in them
+    const int max_samples = 32768;  // Maximum samples per packet
+    if (samples_to_read > max_samples) samples_to_read = max_samples;
+    size_t bytes_to_read = (size_t)samples_to_read * 2 * sizeof(int16_t);  // Stereo PCM16LE
+
+    // Don't read more than what's available
+    if (bytes_to_read > enc->audio_remaining) {
+        bytes_to_read = enc->audio_remaining;
+        samples_to_read = bytes_to_read / (2 * sizeof(int16_t));
+    }
+    if (samples_to_read == 0) {
+        return 0;
+    }
+
+    // Lazily allocate the conversion buffers on first use; they are reused for every packet
+    if (!enc->pcm16_buffer) {
+        enc->pcm16_buffer = malloc((size_t)max_samples * 2 * sizeof(int16_t));
+    }
+    if (!enc->pcm8_buffer) {
+        enc->pcm8_buffer = malloc((size_t)max_samples * 2);
+    }
+    if (!enc->pcm16_buffer || !enc->pcm8_buffer) {
+        fprintf(stderr, "Error: Failed to allocate PCM8 audio buffers\n");
+        return 0;
+    }
+
+    // Read PCM16LE data; account for every consumed byte so audio_remaining tracks the file position
+    size_t bytes_read = fread(enc->pcm16_buffer, 1, bytes_to_read, enc->pcm_file);
+    enc->audio_remaining -= bytes_read;
+    int samples_read = (int)(bytes_read / (2 * sizeof(int16_t)));
+    if (samples_read == 0) {
+        return 0;  // EOF, read error, or less than one whole stereo sample: emit no empty packet
+    }
+
+    // Convert to PCM8 with dithering
+    convert_pcm16_to_pcm8_dithered(enc, enc->pcm16_buffer, enc->pcm8_buffer, samples_read);
+
+    // Compress with zstd, using the higher of the PCM default level and the encoder-wide level
+    size_t pcm8_size = (size_t)samples_read * 2;  // Stereo
+    size_t max_compressed_size = ZSTD_compressBound(pcm8_size);
+    uint8_t *compressed_buffer = malloc(max_compressed_size);
+    if (!compressed_buffer) {
+        fprintf(stderr, "Error: Failed to allocate PCM8 compression buffer\n");
+        return 0;
+    }
+    int level = (DEFAULT_PCM_ZSTD_LEVEL > enc->zstd_level) ? DEFAULT_PCM_ZSTD_LEVEL : enc->zstd_level;
+    size_t compressed_size = ZSTD_compress(compressed_buffer, max_compressed_size,
+                                           enc->pcm8_buffer, pcm8_size, level);
+    if (ZSTD_isError(compressed_size)) {
+        fprintf(stderr, "Error: Zstd compression failed for PCM8 audio\n");
+        free(compressed_buffer);
+        return 0;
+    }
+
+    // Write packet: [0x21][uint32 compressed_size][compressed_data]
+    uint8_t packet_type = TAV_PACKET_AUDIO_PCM8;
+    uint32_t compressed_size_32 = (uint32_t)compressed_size;
+    fwrite(&packet_type, 1, 1, output);
+    fwrite(&compressed_size_32, sizeof(uint32_t), 1, output);
+    fwrite(compressed_buffer, 1, compressed_size, output);
+    free(compressed_buffer);
+
+    if (enc->verbose) {
+        printf("PCM8 packet: %d samples, %zu bytes raw, %zu bytes compressed\n",
+               samples_read, pcm8_size, compressed_size);
+        // Debug: Show first few samples (samples_read is at least 1 here)
+        printf("  First samples (PCM16→PCM8): ");
+        for (int i = 0; i < 4 && i < samples_read; i++) {
+            printf("[%d,%d]→[%d,%d] ",
+                enc->pcm16_buffer[i*2], enc->pcm16_buffer[i*2+1],
+                enc->pcm8_buffer[i*2], enc->pcm8_buffer[i*2+1]);
+        }
+        printf("\n");
+    }
+
+    return 1;
+}
+
+// Write one PCM8 audio packet (0x21) for a single video frame (enc->samples_per_frame stereo samples); returns write_pcm8_packet_samples()'s result
+static int write_pcm8_packet(tav_encoder_t *enc, FILE *output) {
+    return write_pcm8_packet_samples(enc, output, enc->samples_per_frame);
+}
+
 // Process audio for current frame (copied and adapted from TEV)
 static int process_audio(tav_encoder_t *enc, int frame_num, FILE *output) {
    // Skip if separate audio track mode is enabled
@@ -8658,6 +8825,16 @@ static int process_audio(tav_encoder_t *enc, int frame_num, FILE *output) {
        return 1;
    }

+    // Handle PCM8 mode
+    if (enc->pcm8_audio) {
+        if (!enc->has_audio || !enc->pcm_file) {
+            return 1;
+        }
+        // Write one PCM8 packet per frame
+        return write_pcm8_packet(enc, output);
+    }
+
+    // Handle MP2 mode
    if (!enc->has_audio || !enc->mp2_file || enc->audio_remaining <= 0) {
        return 1;
    }
@@ -8764,6 +8941,41 @@ static int process_audio_for_gop(tav_encoder_t *enc, int *frame_numbers, int num
        return 1;
    }

+    // Handle PCM8 mode: emit mega packet(s) evenly divided if exceeding 32768 samples
+    if (enc->pcm8_audio) {
+        if (!enc->has_audio || !enc->pcm_file || num_frames == 0) {
+            return 1;
+        }
+
+        // Calculate total samples for this GOP
+        int total_samples = num_frames * enc->samples_per_frame;
+        int max_samples_per_packet = 32768;  // Architectural limit
+
+        // Calculate how many packets we need
+        int num_packets = (total_samples + max_samples_per_packet - 1) / max_samples_per_packet;
+
+        // Divide samples evenly across packets
+        int samples_per_packet = total_samples / num_packets;
+        int remainder = total_samples % num_packets;
+
+        if (enc->verbose) {
+            printf("PCM8 GOP: %d frames, %d total samples, %d packets (%d samples/packet)\n",
+                   num_frames, total_samples, num_packets, samples_per_packet);
+        }
+
+        // Emit evenly-divided packets
+        for (int i = 0; i < num_packets; i++) {
+            // Distribute remainder across first packets
+            int samples_this_packet = samples_per_packet + (i < remainder ? 1 : 0);
+            if (!write_pcm8_packet_samples(enc, output, samples_this_packet)) {
+                break;  // No more audio data
+            }
+        }
+
+        return 1;
+    }
+
+    // Handle MP2 mode
    if (!enc->has_audio || !enc->mp2_file || enc->audio_remaining <= 0 || num_frames == 0) {
        return 1;
    }
@@ -9094,6 +9306,9 @@ static int detect_still_frame_dwt(tav_encoder_t *enc) {
 // Main function
 int main(int argc, char *argv[]) {
    generate_random_filename(TEMP_AUDIO_FILE);
+    generate_random_filename(TEMP_PCM_FILE);
+    // Change extension to .pcm
+    strcpy(TEMP_PCM_FILE + 37, ".pcm");

    printf("Initialising encoder...\n");
    tav_encoder_t *enc = create_encoder();
@@ -9148,6 +9363,7 @@ int main(int argc, char *argv[]) {
        {"gop-size", required_argument, 0, 1024},
        {"ezbc", no_argument, 0, 1025},
        {"separate-audio-track", no_argument, 0, 1026},
+        {"pcm8-audio", no_argument, 0, 1027},
        {"help", no_argument, 0, '?'},
        {0, 0, 0, 0}
    };
@@ -9361,6 +9577,10 @@ int main(int argc, char *argv[]) {
                enc->separate_audio_track = 1;
                printf("Separate audio track mode enabled (packet 0x40)\n");
                break;
+            case 1027: // --pcm8-audio
+                enc->pcm8_audio = 1;
+                printf("8-bit PCM audio mode enabled (packet 0x21)\n");
+                break;
            case 'a':
                int bitrate = atoi(optarg);
                int valid_bitrate = validate_mp2_bitrate(bitrate);
@@ -10095,10 +10315,18 @@ static void cleanup_encoder(tav_encoder_t *enc) {
        fclose(enc->mp2_file);
        unlink(TEMP_AUDIO_FILE);
    }
+    if (enc->pcm_file) {
+        fclose(enc->pcm_file);
+        unlink(TEMP_PCM_FILE);
+    }
    if (enc->output_fp) {
        fclose(enc->output_fp);
    }

+    // Free PCM8 buffers
+    free(enc->pcm16_buffer);
+    free(enc->pcm8_buffer);
+
    free(enc->input_file);
    free(enc->output_file);
    free(enc->subtitle_file);