From bb3f715ad6ee803055f3c20e16204091789eeba1 Mon Sep 17 00:00:00 2001
From: minjaesong <alswo9628@gmail.com>
Date: Thu, 25 Sep 2025 00:03:06 +0900
Subject: [PATCH] spatial delta prediction

---
 video_encoder/encoder_tav.c | 143 ++++++++++++++++++++++++++++++++++--
 1 file changed, 137 insertions(+), 6 deletions(-)

diff --git a/video_encoder/encoder_tav.c b/video_encoder/encoder_tav.c
index dfbde84..58c82c1 100644
--- a/video_encoder/encoder_tav.c
+++ b/video_encoder/encoder_tav.c
@@ -1019,6 +1019,112 @@ static float get_perceptual_weight_delta(tav_encoder_t *enc, int level, int subb
     }
 }
 
+// LL-only spatial refinement of a prediction: each interior LL coefficient is
+// mixed (85% itself, 15% mean of its 4-connected neighbours); the rest is copied.
+static void apply_spatial_prediction_safe(float *coeffs, float *predicted_coeffs,
+                                        int width, int height, int decomp_levels) {
+    // coeffs           : input prediction, width * height floats (only read here)
+    // predicted_coeffs : output buffer of the same size; assumed not to alias coeffs
+    // The LL band is addressed as the first ll_w * ll_h entries with row stride ll_w.
+    const float temporal_weight = 0.85f;
+    const float spatial_weight = 0.15f;
+    const int ll_w = width >> decomp_levels;
+    const int ll_h = height >> decomp_levels;
+    const int count = width * height;
+
+    // Start from an exact copy so that every coefficient outside the LL interior
+    // (LL border rows/columns and all detail subbands) passes through unchanged.
+    int n = 0;
+    while (n < count) {
+        predicted_coeffs[n] = coeffs[n];
+        n++;
+    }
+
+    // Walk the LL interior row by row; borders are skipped so all four neighbours exist.
+    for (int row = 1; row + 1 < ll_h; row++) {
+        const float *above = coeffs + (row - 1) * ll_w;
+        const float *here = coeffs + row * ll_w;
+        const float *below = coeffs + (row + 1) * ll_w;
+        float *out = predicted_coeffs + row * ll_w;
+
+        for (int col = 1; col + 1 < ll_w; col++) {
+            // Neighbours are taken from the untouched input, never from the output.
+            float west = here[col - 1];
+            float east = here[col + 1];
+            float north = above[col];
+            float south = below[col];
+            float neighbour_mean = (west + east + north + south) * 0.25f;
+
+            out[col] = here[col] * temporal_weight + neighbour_mean * spatial_weight;
+        }
+    }
+}
+
+// Directional spatial refinement of the detail subbands (LH/HL/HH) of a prediction
+static void apply_spatial_prediction(float *coeffs, float *predicted_coeffs,
+                                   int width, int height, int decomp_levels) {
+    // coeffs: input prediction, width * height floats (only read here)
+    // predicted_coeffs: output buffer of the same size; assumed not to alias coeffs
+
+    int total_size = width * height;
+
+    // Start from a copy: LL and every subband border pass through unchanged
+    for (int i = 0; i < total_size; i++) {
+        predicted_coeffs[i] = coeffs[i];
+    }
+
+    // Running start index of the current subband within the linear coefficient array
+    int offset = 0;
+
+    // The LL subband is addressed as the first ll_width * ll_height entries
+    int ll_width = width >> decomp_levels;
+    int ll_height = height >> decomp_levels;
+    int ll_size = ll_width * ll_height;
+
+    // don't modify the LL subband - just skip past it
+    offset += ll_size;
+
+    // NOTE(review): level == decomp_levels gives width >> 1 here; confirm band order/size vs. the DWT layout
+    for (int level = decomp_levels; level >= 1; level--) {
+        int level_width = width >> (decomp_levels - level + 1);
+        int level_height = height >> (decomp_levels - level + 1);
+        int subband_size = level_width * level_height;
+
+        // Three consecutive subbands per level: 0 = LH, 1 = HL, 2 = HH
+        for (int subband = 0; subband < 3; subband++) {
+            for (int y = 1; y < level_height - 1; y++) {
+                for (int x = 1; x < level_width - 1; x++) {
+                    int idx = y * level_width + x;
+
+                    // Read neighbours from the INPUT so the result does not depend on scan order
+                    float left = coeffs[offset + y * level_width + (x-1)];
+                    float right = coeffs[offset + y * level_width + (x+1)];
+                    float top = coeffs[offset + (y-1) * level_width + x];
+                    float bottom = coeffs[offset + (y+1) * level_width + x];
+
+                    // Directional prediction based on subband type
+                    float spatial_pred;
+                    if (subband == 0) { // LH (horizontal edges)
+                        // Emphasize vertical neighbors for horizontal edge prediction
+                        spatial_pred = (top + bottom) * 0.4f + (left + right) * 0.1f;
+                    } else if (subband == 1) { // HL (vertical edges)
+                        // Emphasize horizontal neighbors for vertical edge prediction
+                        spatial_pred = (left + right) * 0.4f + (top + bottom) * 0.1f;
+                    } else { // HH (diagonal edges)
+                        // Equal weighting for diagonal prediction
+                        spatial_pred = (left + right + top + bottom) * 0.25f;
+                    }
+
+                    // Blend: 80% of the coefficient's own value, 20% of the neighbour-based estimate
+                    const float spatial_weight = 0.2f; // Less spatial influence in detail subbands
+                    predicted_coeffs[offset + idx] = coeffs[offset + idx] * (1.0f - spatial_weight) + spatial_pred * spatial_weight;
+                }
+            }
+            offset += subband_size;
+        }
+    }
+}
+
 
 // Determine perceptual weight for coefficient at linear position (matches actual DWT layout)
 static float get_perceptual_weight_for_position(tav_encoder_t *enc, int linear_idx, int width, int height, int decomp_levels, int is_chroma) {
@@ -1367,13 +1473,38 @@ static size_t serialise_tile_data(tav_encoder_t *enc, int tile_x, int tile_y,
             }
         }
 
-        // Calculate improved deltas using multi-frame prediction
-        for (int i = 0; i < tile_size; i++) {
-            compensated_delta_y[i] = tile_y_data[i] - predicted_y[i];
-            compensated_delta_co[i] = tile_co_data[i] - predicted_co[i];
-            compensated_delta_cg[i] = tile_cg_data[i] - predicted_cg[i];
+        // Apply spatial prediction on top of temporal prediction
+        float *spatially_enhanced_y = malloc(tile_size * sizeof(float));
+        float *spatially_enhanced_co = malloc(tile_size * sizeof(float));
+        float *spatially_enhanced_cg = malloc(tile_size * sizeof(float));
+
+        // Determine tile dimensions for spatial prediction
+        int tile_width, tile_height;
+        if (enc->monoblock) {
+            tile_width = enc->width;
+            tile_height = enc->height;
+        } else {
+            tile_width = PADDED_TILE_SIZE_X;
+            tile_height = PADDED_TILE_SIZE_Y;
         }
 
+        // Apply safe spatial prediction (LL subband only)
+        apply_spatial_prediction_safe(predicted_y, spatially_enhanced_y, tile_width, tile_height, enc->decomp_levels);
+        apply_spatial_prediction_safe(predicted_co, spatially_enhanced_co, tile_width, tile_height, enc->decomp_levels);
+        apply_spatial_prediction_safe(predicted_cg, spatially_enhanced_cg, tile_width, tile_height, enc->decomp_levels);
+
+        // Calculate improved deltas using temporal + spatial prediction
+        for (int i = 0; i < tile_size; i++) {
+            compensated_delta_y[i] = tile_y_data[i] - spatially_enhanced_y[i];
+            compensated_delta_co[i] = tile_co_data[i] - spatially_enhanced_co[i];
+            compensated_delta_cg[i] = tile_cg_data[i] - spatially_enhanced_cg[i];
+        }
+
+        // Free spatial prediction buffers
+        free(spatially_enhanced_y);
+        free(spatially_enhanced_co);
+        free(spatially_enhanced_cg);
+
         free(predicted_y);
         free(predicted_co);
         free(predicted_cg);
@@ -3043,7 +3174,7 @@ int main(int argc, char *argv[]) {
     int count_iframe = 0;
     int count_pframe = 0;
 
-    KEYFRAME_INTERVAL = enc->output_fps >> 2; // short interval makes ghosting less noticeable
+    KEYFRAME_INTERVAL = enc->output_fps;// >> 2; // short interval makes ghosting less noticeable
 
     while (continue_encoding) {
         // Check encode limit if specified