From 9e8aeeb1124a4e8c881d7119dafa6816b749f088 Mon Sep 17 00:00:00 2001 From: minjaesong Date: Tue, 16 Sep 2025 22:23:31 +0900 Subject: [PATCH] audio handling --- video_encoder/encoder_tav.c | 104 +++++++++++++++++++++++++----------- 1 file changed, 74 insertions(+), 30 deletions(-) diff --git a/video_encoder/encoder_tav.c b/video_encoder/encoder_tav.c index 928ed25..df5e0e7 100644 --- a/video_encoder/encoder_tav.c +++ b/video_encoder/encoder_tav.c @@ -188,6 +188,7 @@ typedef struct { int mp2_packet_size; int mp2_rate_index; int target_audio_buffer_size; + double audio_frames_in_buffer; // Subtitle processing subtitle_entry_t *subtitles; @@ -1244,7 +1245,6 @@ static int start_video_conversion(tav_encoder_t *enc) { // Start audio conversion static int start_audio_conversion(tav_encoder_t *enc) { - return 1; if (!enc->has_audio) return 1; char command[2048]; @@ -1563,16 +1563,23 @@ static int process_audio(tav_encoder_t *enc, int frame_num, FILE *output) { int is_mono = (header[3] >> 6) == 3; enc->mp2_rate_index = mp2_packet_size_to_rate_index(enc->mp2_packet_size, is_mono); enc->target_audio_buffer_size = 4; // 4 audio packets in buffer + enc->audio_frames_in_buffer = 0.0; } - // Calculate how much audio we need for this frame - double frame_duration = 1.0 / enc->fps; - double samples_per_frame = 32000.0 * frame_duration; // 32kHz sample rate - int target_buffer_samples = (int)(samples_per_frame * enc->target_audio_buffer_size); - int target_buffer_bytes = (target_buffer_samples * enc->mp2_packet_size) / 1152; // 1152 samples per MP2 frame + // Calculate how much audio time each frame represents (in seconds) + double frame_audio_time = 1.0 / enc->fps; + // Calculate how much audio time each MP2 packet represents + // MP2 frame contains 1152 samples at 32kHz = 0.036 seconds + #define MP2_SAMPLE_RATE 32000 + double packet_audio_time = 1152.0 / MP2_SAMPLE_RATE; + + // Estimate how many packets we consume per video frame + double packets_per_frame = frame_audio_time / packet_audio_time; + + // Allocate MP2 buffer if needed if (!enc->mp2_buffer) { - enc->mp2_buffer_size = target_buffer_bytes * 2; // Extra buffer space + enc->mp2_buffer_size = enc->mp2_packet_size * 2; // Space for multiple packets enc->mp2_buffer = malloc(enc->mp2_buffer_size); if (!enc->mp2_buffer) { fprintf(stderr, "Failed to allocate audio buffer\n"); @@ -1580,34 +1587,71 @@ static int process_audio(tav_encoder_t *enc, int frame_num, FILE *output) { } } - // Read audio data - size_t bytes_to_read = target_buffer_bytes; - if (bytes_to_read > enc->audio_remaining) { - bytes_to_read = enc->audio_remaining; - } - if (bytes_to_read > enc->mp2_buffer_size) { - bytes_to_read = enc->mp2_buffer_size; + // Audio buffering strategy: maintain target buffer level + int packets_to_insert = 0; + if (frame_num == 0) { + // Prime buffer to target level initially + packets_to_insert = enc->target_audio_buffer_size; + enc->audio_frames_in_buffer = 0; // count starts from 0 + if (enc->verbose) { + printf("Frame %d: Priming audio buffer with %d packets\n", frame_num, packets_to_insert); + } + } else { + // Simulate buffer consumption (fractional consumption per frame) + double old_buffer = enc->audio_frames_in_buffer; + enc->audio_frames_in_buffer -= packets_per_frame; + + // Calculate how many packets we need to maintain target buffer level + // Only insert when buffer drops below target, and only insert enough to restore target + double target_level = (double)enc->target_audio_buffer_size; + if (enc->audio_frames_in_buffer < target_level) { + double deficit = target_level - enc->audio_frames_in_buffer; + // Insert packets to cover the deficit, but at least maintain minimum flow + packets_to_insert = (int)ceil(deficit); + // Cap at reasonable maximum to prevent excessive insertion + if (packets_to_insert > enc->target_audio_buffer_size) { + packets_to_insert = enc->target_audio_buffer_size; + } + + if (enc->verbose) { + printf("Frame %d: Buffer low (%.2f->%.2f), deficit %.2f, inserting %d packets\n", + frame_num, old_buffer, enc->audio_frames_in_buffer, deficit, packets_to_insert); + } + } else if (enc->verbose && old_buffer != enc->audio_frames_in_buffer) { + printf("Frame %d: Buffer sufficient (%.2f->%.2f), no packets\n", + frame_num, old_buffer, enc->audio_frames_in_buffer); + } } - size_t bytes_read = fread(enc->mp2_buffer, 1, bytes_to_read, enc->mp2_file); - if (bytes_read == 0) { - return 1; // No more audio - } + // Insert the calculated number of audio packets + for (int q = 0; q < packets_to_insert; q++) { + size_t bytes_to_read = enc->mp2_packet_size; + if (bytes_to_read > enc->audio_remaining) { + bytes_to_read = enc->audio_remaining; + } - // Write audio packet - uint8_t audio_packet_type = TAV_PACKET_AUDIO_MP2; - uint32_t audio_len = (uint32_t)bytes_read; - - fwrite(&audio_packet_type, 1, 1, output); - fwrite(&audio_len, 4, 1, output); - fwrite(enc->mp2_buffer, 1, bytes_read, output); + size_t bytes_read = fread(enc->mp2_buffer, 1, bytes_to_read, enc->mp2_file); + if (bytes_read == 0) break; - // Track audio bytes written - enc->audio_remaining -= bytes_read; + // Write TAV MP2 audio packet + uint8_t audio_packet_type = TAV_PACKET_AUDIO_MP2; + uint32_t audio_len = (uint32_t)bytes_read; + fwrite(&audio_packet_type, 1, 1, output); + fwrite(&audio_len, 4, 1, output); + fwrite(enc->mp2_buffer, 1, bytes_read, output); - if (enc->verbose) { - printf("Frame %d: Audio packet %zu bytes (remaining: %zu)\n", - frame_num, bytes_read, enc->audio_remaining); + // Track audio bytes written + enc->audio_remaining -= bytes_read; + enc->audio_frames_in_buffer++; + + if (frame_num == 0) { + enc->audio_frames_in_buffer = enc->target_audio_buffer_size / 2; // trick the buffer simulator so that it doesn't count the frame 0 priming + } + + if (enc->verbose) { + printf("Audio packet %d: %zu bytes (buffer: %.2f packets)\n", + q, bytes_read, enc->audio_frames_in_buffer); + } } return 1;