From 391adffad48b00c3c200f97e1ecde2d38b9bccb3 Mon Sep 17 00:00:00 2001
From: minjaesong <alswo9628@gmail.com>
Date: Tue, 16 Sep 2025 15:20:28 +0900
Subject: [PATCH] encoder optimisation

---
 video_encoder/encoder_tav.c | 165 ++++++++++++++++++++++++++++--------
 1 file changed, 131 insertions(+), 34 deletions(-)

diff --git a/video_encoder/encoder_tav.c b/video_encoder/encoder_tav.c
index 4739360..e4b993d 100644
--- a/video_encoder/encoder_tav.c
+++ b/video_encoder/encoder_tav.c
@@ -197,6 +197,11 @@ typedef struct {
     void *compressed_buffer;
     size_t compressed_buffer_size;
     
+    // OPTIMIZATION: Pre-allocated buffers to avoid malloc/free per tile
+    int16_t *reusable_quantized_y;
+    int16_t *reusable_quantized_co;
+    int16_t *reusable_quantized_cg;
+    
     // Statistics
     size_t total_compressed_size;
     size_t total_uncompressed_size;
@@ -333,10 +338,17 @@ static int initialize_encoder(tav_encoder_t *enc) {
     enc->compressed_buffer_size = ZSTD_compressBound(1024 * 1024); // 1MB max
     enc->compressed_buffer = malloc(enc->compressed_buffer_size);
     
+    // OPTIMIZATION: Allocate reusable quantization buffers for padded tiles (176x176)
+    const int padded_coeff_count = PADDED_TILE_SIZE * PADDED_TILE_SIZE;
+    enc->reusable_quantized_y = malloc(padded_coeff_count * sizeof(int16_t));
+    enc->reusable_quantized_co = malloc(padded_coeff_count * sizeof(int16_t));
+    enc->reusable_quantized_cg = malloc(padded_coeff_count * sizeof(int16_t));
+    
     if (!enc->current_frame_rgb || !enc->previous_frame_rgb || 
         !enc->current_frame_y || !enc->current_frame_co || !enc->current_frame_cg ||
         !enc->previous_frame_y || !enc->previous_frame_co || !enc->previous_frame_cg ||
-        !enc->tiles || !enc->motion_vectors || !enc->zstd_ctx || !enc->compressed_buffer) {
+        !enc->tiles || !enc->motion_vectors || !enc->zstd_ctx || !enc->compressed_buffer ||
+        !enc->reusable_quantized_y || !enc->reusable_quantized_co || !enc->reusable_quantized_cg) {
         return -1;
     }
     
@@ -450,30 +462,85 @@ static void extract_padded_tile(tav_encoder_t *enc, int tile_x, int tile_y,
     const int core_start_x = tile_x * TILE_SIZE;
     const int core_start_y = tile_y * TILE_SIZE;
     
-    // Extract padded tile: margin + core + margin  
+    // OPTIMIZATION: Process row by row with bulk copying for core region
     for (int py = 0; py < PADDED_TILE_SIZE; py++) {
-        for (int px = 0; px < PADDED_TILE_SIZE; px++) {
-            // Map padded coordinates to source image coordinates
-            int src_x = core_start_x + px - TILE_MARGIN;
-            int src_y = core_start_y + py - TILE_MARGIN;
+        // Map padded row to source image row
+        int src_y = core_start_y + py - TILE_MARGIN;
+        
+        // Handle vertical boundary conditions with mirroring
+        if (src_y < 0) src_y = -src_y;
+        else if (src_y >= enc->height) src_y = enc->height - 1 - (src_y - enc->height);
+        src_y = CLAMP(src_y, 0, enc->height - 1);
+        
+        // Calculate source and destination row offsets
+        const int padded_row_offset = py * PADDED_TILE_SIZE;
+        const int src_row_offset = src_y * enc->width;
+        
+        // Column range of the core region inside the padded row
+        int core_start_px = TILE_MARGIN;
+        int core_end_px = TILE_MARGIN + TILE_SIZE;
+        
+        // Column range of the core region inside the source frame
+        int core_src_start_x = core_start_x;
+        int core_src_end_x = core_start_x + TILE_SIZE;
+        
+        if (core_src_start_x >= 0 && core_src_end_x <= enc->width) {
+            // OPTIMIZATION: Bulk copy the core region (TILE_SIZE floats) with one memcpy per channel
+            const int src_core_offset = src_row_offset + core_src_start_x;
             
-            // Handle boundary conditions with mirroring
-            if (src_x < 0) src_x = -src_x;
-            else if (src_x >= enc->width) src_x = enc->width - 1 - (src_x - enc->width);
+            memcpy(&padded_y[padded_row_offset + core_start_px], 
+                   &enc->current_frame_y[src_core_offset], 
+                   TILE_SIZE * sizeof(float));
+            memcpy(&padded_co[padded_row_offset + core_start_px], 
+                   &enc->current_frame_co[src_core_offset], 
+                   TILE_SIZE * sizeof(float));
+            memcpy(&padded_cg[padded_row_offset + core_start_px], 
+                   &enc->current_frame_cg[src_core_offset], 
+                   TILE_SIZE * sizeof(float));
             
-            if (src_y < 0) src_y = -src_y;
-            else if (src_y >= enc->height) src_y = enc->height - 1 - (src_y - enc->height);
+            // Handle margin pixels individually (left and right margins)
+            for (int px = 0; px < core_start_px; px++) {
+                int src_x = core_start_x + px - TILE_MARGIN;
+                if (src_x < 0) src_x = -src_x;
+                src_x = CLAMP(src_x, 0, enc->width - 1);
+                
+                int src_idx = src_row_offset + src_x;
+                int padded_idx = padded_row_offset + px;
+                
+                padded_y[padded_idx] = enc->current_frame_y[src_idx];
+                padded_co[padded_idx] = enc->current_frame_co[src_idx];
+                padded_cg[padded_idx] = enc->current_frame_cg[src_idx];
+            }
             
-            // Clamp to valid bounds
-            src_x = CLAMP(src_x, 0, enc->width - 1);
-            src_y = CLAMP(src_y, 0, enc->height - 1);
-            
-            int src_idx = src_y * enc->width + src_x;
-            int padded_idx = py * PADDED_TILE_SIZE + px;
-            
-            padded_y[padded_idx] = enc->current_frame_y[src_idx];
-            padded_co[padded_idx] = enc->current_frame_co[src_idx];
-            padded_cg[padded_idx] = enc->current_frame_cg[src_idx];
+            for (int px = core_end_px; px < PADDED_TILE_SIZE; px++) {
+                int src_x = core_start_x + px - TILE_MARGIN;
+                if (src_x >= enc->width) src_x = enc->width - 1 - (src_x - enc->width);
+                src_x = CLAMP(src_x, 0, enc->width - 1);
+                
+                int src_idx = src_row_offset + src_x;
+                int padded_idx = padded_row_offset + px;
+                
+                padded_y[padded_idx] = enc->current_frame_y[src_idx];
+                padded_co[padded_idx] = enc->current_frame_co[src_idx];
+                padded_cg[padded_idx] = enc->current_frame_cg[src_idx];
+            }
+        } else {
+            // Fallback: process entire row pixel by pixel (for edge tiles)
+            for (int px = 0; px < PADDED_TILE_SIZE; px++) {
+                int src_x = core_start_x + px - TILE_MARGIN;
+                
+                // Handle horizontal boundary conditions with mirroring
+                if (src_x < 0) src_x = -src_x;
+                else if (src_x >= enc->width) src_x = enc->width - 1 - (src_x - enc->width);
+                src_x = CLAMP(src_x, 0, enc->width - 1);
+                
+                int src_idx = src_row_offset + src_x;
+                int padded_idx = padded_row_offset + px;
+                
+                padded_y[padded_idx] = enc->current_frame_y[src_idx];
+                padded_co[padded_idx] = enc->current_frame_co[src_idx];
+                padded_cg[padded_idx] = enc->current_frame_cg[src_idx];
+            }
         }
     }
 }
@@ -561,9 +628,10 @@ static size_t serialize_tile_data(tav_encoder_t *enc, int tile_x, int tile_y,
     
     // Quantize and serialize DWT coefficients (full padded tile: 176x176)
     const int tile_size = PADDED_TILE_SIZE * PADDED_TILE_SIZE;
-    int16_t *quantized_y = malloc(tile_size * sizeof(int16_t));
-    int16_t *quantized_co = malloc(tile_size * sizeof(int16_t));
-    int16_t *quantized_cg = malloc(tile_size * sizeof(int16_t));
+    // OPTIMIZATION: Use pre-allocated buffers instead of malloc/free per tile
+    int16_t *quantized_y = enc->reusable_quantized_y;
+    int16_t *quantized_co = enc->reusable_quantized_co;
+    int16_t *quantized_cg = enc->reusable_quantized_cg;
     
     // Debug: check DWT coefficients before quantization
     /*if (tile_x == 0 && tile_y == 0) {
@@ -594,9 +662,7 @@ static size_t serialize_tile_data(tav_encoder_t *enc, int tile_x, int tile_y,
     memcpy(buffer + offset, quantized_co, tile_size * sizeof(int16_t)); offset += tile_size * sizeof(int16_t);
     memcpy(buffer + offset, quantized_cg, tile_size * sizeof(int16_t)); offset += tile_size * sizeof(int16_t);
     
-    free(quantized_y);
-    free(quantized_co);
-    free(quantized_cg);
+    // OPTIMIZATION: No free here - the reusable buffers are released in cleanup_encoder()
     
     return offset;
 }
@@ -731,16 +797,42 @@ static int estimate_motion_112x112(const float *current, const float *reference,
 
 // RGB to YCoCg color space conversion
 static void rgb_to_ycocg(const uint8_t *rgb, float *y, float *co, float *cg, int width, int height) {
-    for (int i = 0; i < width * height; i++) {
-        float r = rgb[i * 3 + 0];
-        float g = rgb[i * 3 + 1]; 
-        float b = rgb[i * 3 + 2];
+    const int total_pixels = width * height;
+    
+    // OPTIMIZATION: Walk pixels in groups of 4 (fixed-trip inner loop the compiler may unroll)
+    int i = 0;
+    const int simd_end = (total_pixels / 4) * 4;
+    
+    // Scalar processing of full groups of 4 pixels (no SIMD intrinsics are used)
+    for (i = 0; i < simd_end; i += 4) {
+        // Point at the 4 RGB triplets (12 bytes) belonging to this group
+        const uint8_t *rgb_ptr = &rgb[i * 3];
+        
+        // Process the group's 4 pixels one after another (fixed trip count of 4)
+        for (int j = 0; j < 4; j++) {
+            const int idx = i + j;
+            const float r = rgb_ptr[j * 3 + 0];
+            const float g = rgb_ptr[j * 3 + 1]; 
+            const float b = rgb_ptr[j * 3 + 2];
+            
+            // YCoCg-R transform (halving written as * 0.5f instead of / 2)
+            co[idx] = r - b;
+            const float tmp = b + co[idx] * 0.5f;
+            cg[idx] = g - tmp;
+            y[idx] = tmp + cg[idx] * 0.5f;
+        }
+    }
+    
+    // Handle remaining pixels (0-3 pixels, when total_pixels is not a multiple of 4)
+    for (; i < total_pixels; i++) {
+        const float r = rgb[i * 3 + 0];
+        const float g = rgb[i * 3 + 1]; 
+        const float b = rgb[i * 3 + 2];
         
-        // YCoCg-R transform
         co[i] = r - b;
-        float tmp = b + co[i] / 2;
+        const float tmp = b + co[i] * 0.5f;
         cg[i] = g - tmp;
-        y[i] = tmp + cg[i] / 2;
+        y[i] = tmp + cg[i] * 0.5f;
     }
 }
 
@@ -1911,6 +2003,11 @@ static void cleanup_encoder(tav_encoder_t *enc) {
     free(enc->compressed_buffer);
     free(enc->mp2_buffer);
     
+    // OPTIMIZATION: Free reusable quantization buffers
+    free(enc->reusable_quantized_y);
+    free(enc->reusable_quantized_co);
+    free(enc->reusable_quantized_cg);
+    
     // Free subtitle list
     if (enc->subtitles) {
         free_subtitle_list(enc->subtitles);