diff --git a/terranmon.txt b/terranmon.txt index e5c747a..72ec164 100644 --- a/terranmon.txt +++ b/terranmon.txt @@ -1054,6 +1054,8 @@ transmission capability, and region-of-interest coding. ## GOP Unified Packet Structure (0x12) Implemented on 2025-10-15 for temporal 3D DWT with unified preprocessing. +Updated on 2025-10-17 to include canvas expansion margins. + This packet contains multiple frames encoded as a single spacetime block for optimal temporal compression. @@ -1077,18 +1079,40 @@ The entire GOP (width×height×N_frames×3_channels) is preprocessed as a single This layout enables Zstd to find patterns across both spatial and temporal dimensions, resulting in superior compression compared to per-frame encoding. +### Canvas Expansion for Motion Compensation +When frames in a GOP have camera motion, they must be aligned before temporal DWT. +However, alignment creates "gaps" at frame edges. To preserve ALL original pixels: + +1. **Calculate motion range**: Determine the total shift range across all GOP frames + - Example: If frames shift by ±3 pixels horizontally, total range = 6 pixels +2. **Expand canvas**: Create a larger canvas = original_size + margin + - Canvas width = header.width + margin_left + margin_right + - Canvas height = header.height + margin_top + margin_bottom +3. **Place aligned frames**: Each frame is positioned on the expanded canvas + - All original pixels from all frames are preserved + - No artificial padding or cropping occurs +4. **Encode expanded canvas**: Apply 3D DWT to the larger canvas dimensions +5. **Store margins**: 4 bytes (L/R/T/B) tell decoder the canvas expansion +6. **Decoder extraction**: Decoder extracts display region for each frame based on + motion vectors and margins + +This approach ensures lossless preservation of original video content during GOP encoding. 
+ ### Motion Vectors -- Stored in quarter-pixel units (divide by 4.0 for pixel displacement) +- Stored in 1/16-pixel units (divide by 16.0 for pixel displacement) - Used for global motion compensation (camera movement, scene translation) - Computed using FFT-based phase correlation for accurate frame alignment -- First frame (frame 0) typically has motion vector (0, 0) +- Cumulative relative to frame 0 (not frame-to-frame deltas) +- First frame (frame 0) always has motion vector (0, 0) ### Temporal 3D DWT Process -1. Apply 1D DWT across temporal axis (GOP frames) -2. Apply 2D DWT on each spatial slice of temporal subbands -3. Perceptual quantization with temporal-spatial awareness -4. Unified significance map preprocessing across all frames/channels -5. Single Zstd compression of entire GOP block +1. Detect inter-frame motion using phase correlation +2. Align frames and expand canvas to preserve all original pixels +3. Apply 1D DWT across temporal axis (GOP frames) on expanded canvas +4. Apply 2D DWT on each spatial slice of temporal subbands +5. Perceptual quantization with temporal-spatial awareness +6. Unified significance map preprocessing across all frames/channels +7. Single Zstd compression of entire GOP block ## GOP Sync Packet Structure (0xFC) Indicates that N frames were decoded from a GOP Unified block. diff --git a/tsvm_core/src/net/torvald/tsvm/UnsafePtr.kt b/tsvm_core/src/net/torvald/tsvm/UnsafePtr.kt index 82ac83c..9b6696e 100644 --- a/tsvm_core/src/net/torvald/tsvm/UnsafePtr.kt +++ b/tsvm_core/src/net/torvald/tsvm/UnsafePtr.kt @@ -89,8 +89,8 @@ internal class UnsafePtr(pointer: Long, allocSize: Long, private val caller: Any //// You may break the glass and use this tool when some fucking incomprehensible bugs ("vittujen vitun bugit") //// appear (e.g. 
getting garbage values when it fucking shouldn't) -// if (destroyed) { throw DanglingPointerException("The pointer is already destroyed ($this)") } -// if (index !in 0 until size) throw AddressOverflowException("Index: $index; alloc size: $size; pointer: ${this}\n${Thread.currentThread().stackTrace.joinToString("\n", limit=10) { " $it" }}") + if (destroyed) { throw DanglingPointerException("The pointer is already destroyed ($this)") } + if (index !in 0 until size) throw AddressOverflowException("Index: $index; alloc size: $size; pointer: ${this}\n${Thread.currentThread().stackTrace.joinToString("\n", limit=10) { " $it" }}") } operator fun get(index: Long): Byte { diff --git a/video_encoder/Makefile b/video_encoder/Makefile index 4607e3c..995a8f4 100644 --- a/video_encoder/Makefile +++ b/video_encoder/Makefile @@ -2,11 +2,18 @@ # Makefile for TSVM Enhanced Video (TEV) encoder CC = gcc +CXX = g++ CFLAGS = -std=c99 -Wall -Wextra -O2 -D_GNU_SOURCE +CXXFLAGS = -std=c++11 -Wall -Wextra -O2 -D_GNU_SOURCE LIBS = -lm -lzstd +# OpenCV flags (for TAV encoder with mesh warping) +OPENCV_CFLAGS = $(shell pkg-config --cflags opencv4) +OPENCV_LIBS = $(shell pkg-config --libs opencv4) + # Source files and targets TARGETS = tev tav tav_decoder +TEST_TARGETS = test_mesh_warp test_mesh_roundtrip # Build all encoders all: $(TARGETS) @@ -16,21 +23,35 @@ tev: encoder_tev.c rm -f encoder_tev $(CC) $(CFLAGS) -o encoder_tev $< $(LIBS) -tav: encoder_tav.c - rm -f encoder_tav - $(CC) $(CFLAGS) -o encoder_tav $< $(LIBS) -lfftw3f +tav: encoder_tav.c encoder_tav_opencv.cpp estimate_affine_from_blocks.cpp + rm -f encoder_tav encoder_tav.o encoder_tav_opencv.o estimate_affine_from_blocks.o + $(CC) $(CFLAGS) -c encoder_tav.c -o encoder_tav.o + $(CXX) $(CXXFLAGS) $(OPENCV_CFLAGS) -c encoder_tav_opencv.cpp -o encoder_tav_opencv.o + $(CXX) $(CXXFLAGS) -c estimate_affine_from_blocks.cpp -o estimate_affine_from_blocks.o + $(CXX) -o encoder_tav encoder_tav.o encoder_tav_opencv.o 
estimate_affine_from_blocks.o $(LIBS) -lfftw3f $(OPENCV_LIBS) tav_decoder: decoder_tav.c rm -f decoder_tav $(CC) $(CFLAGS) -o decoder_tav $< $(LIBS) +# Build test programs +test_mesh_warp: test_mesh_warp.cpp encoder_tav_opencv.cpp estimate_affine_from_blocks.cpp + rm -f test_mesh_warp test_mesh_warp.o + $(CXX) $(CXXFLAGS) $(OPENCV_CFLAGS) -o test_mesh_warp test_mesh_warp.cpp encoder_tav_opencv.cpp estimate_affine_from_blocks.cpp $(OPENCV_LIBS) + +test_mesh_roundtrip: test_mesh_roundtrip.cpp encoder_tav_opencv.cpp + rm -f test_mesh_roundtrip test_mesh_roundtrip.o + $(CXX) $(CXXFLAGS) $(OPENCV_CFLAGS) -o test_mesh_roundtrip test_mesh_roundtrip.cpp encoder_tav_opencv.cpp $(OPENCV_LIBS) + +tests: $(TEST_TARGETS) + # Build with debug symbols debug: CFLAGS += -g -DDEBUG debug: $(TARGETS) # Clean build artifacts clean: - rm -f $(TARGETS) + rm -f $(TARGETS) *.o # Install (copy to PATH) install: $(TARGETS) @@ -43,6 +64,8 @@ check-deps: @echo "Checking dependencies..." @echo "Using Zstd compression for better efficiency" @pkg-config --exists libzstd || (echo "Error: libzstd-dev not found. Install with: sudo apt install libzstd-dev" && exit 1) + @pkg-config --exists fftw3f || (echo "Error: libfftw3-dev not found. Install with: sudo apt install libfftw3-dev" && exit 1) + @pkg-config --exists opencv4 || (echo "Error: OpenCV 4 not found. Install with: sudo apt install libopencv-dev" && exit 1) @echo "All dependencies found." 
# Help diff --git a/video_encoder/encoder_tav.c b/video_encoder/encoder_tav.c index 1bc94d9..35f80ae 100644 --- a/video_encoder/encoder_tav.c +++ b/video_encoder/encoder_tav.c @@ -17,7 +17,7 @@ #include #include -#define ENCODER_VENDOR_STRING "Encoder-TAV 20251017" +#define ENCODER_VENDOR_STRING "Encoder-TAV 20251018 with mcwarp" // TSVM Advanced Video (TAV) format constants #define TAV_MAGIC "\x1F\x54\x53\x56\x4D\x54\x41\x56" // "\x1FTSVM TAV" @@ -37,16 +37,17 @@ #define TAV_MODE_DELTA 0x02 // Coefficient delta encoding (efficient P-frames) // Video packet types -#define TAV_PACKET_IFRAME 0x10 // Intra frame (keyframe) -#define TAV_PACKET_PFRAME 0x11 // Predicted frame -#define TAV_PACKET_GOP_UNIFIED 0x12 // Unified 3D DWT GOP (all frames in single block) -#define TAV_PACKET_AUDIO_MP2 0x20 // MP2 audio -#define TAV_PACKET_SUBTITLE 0x30 // Subtitle packet -#define TAV_PACKET_EXTENDED_HDR 0xEF // Extended header packet -#define TAV_PACKET_GOP_SYNC 0xFC // GOP sync packet (N frames decoded) -#define TAV_PACKET_TIMECODE 0xFD // Timecode packet -#define TAV_PACKET_SYNC_NTSC 0xFE // NTSC Sync packet -#define TAV_PACKET_SYNC 0xFF // Sync packet +#define TAV_PACKET_IFRAME 0x10 // Intra frame (keyframe) +#define TAV_PACKET_PFRAME 0x11 // Predicted frame +#define TAV_PACKET_GOP_UNIFIED 0x12 // Unified 3D DWT GOP (all frames in single block, translation-based) +#define TAV_PACKET_GOP_UNIFIED_MESH 0x13 // Unified 3D DWT GOP with distortion mesh warping +#define TAV_PACKET_AUDIO_MP2 0x20 // MP2 audio +#define TAV_PACKET_SUBTITLE 0x30 // Subtitle packet +#define TAV_PACKET_EXTENDED_HDR 0xEF // Extended header packet +#define TAV_PACKET_GOP_SYNC 0xFC // GOP sync packet (N frames decoded) +#define TAV_PACKET_TIMECODE 0xFD // Timecode packet +#define TAV_PACKET_SYNC_NTSC 0xFE // NTSC Sync packet +#define TAV_PACKET_SYNC 0xFF // Sync packet // DWT settings #define TILE_SIZE_X 640 @@ -106,7 +107,7 @@ static int needs_alpha_channel(int channel_layout) { #define DEFAULT_ZSTD_LEVEL 9 
#define GOP_SIZE 8 #define TEMPORAL_DECOMP_LEVEL 2 -#define MOTION_THRESHOLD 64.0f // Flush if motion exceeds 24 pixels in any direction +#define MOTION_THRESHOLD 24.0f // Flush if motion exceeds 24 pixels in any direction // Audio/subtitle constants (reused from TEV) #define MP2_DEFAULT_PACKET_SIZE 1152 @@ -311,10 +312,34 @@ typedef struct tav_encoder_s { float **gop_y_frames; // [frame][pixel] - Y channel for each GOP frame float **gop_co_frames; // [frame][pixel] - Co channel for each GOP frame float **gop_cg_frames; // [frame][pixel] - Cg channel for each GOP frame - int16_t *gop_translation_x; // [frame] - Translation X in 1/16-pixel units - int16_t *gop_translation_y; // [frame] - Translation Y in 1/16-pixel units + int16_t *gop_translation_x; // [frame] - Translation X in 1/16-pixel units (for 0x12 packets) + int16_t *gop_translation_y; // [frame] - Translation Y in 1/16-pixel units (for 0x12 packets) int temporal_decomp_levels; // Number of temporal DWT levels (default: 2) + // Mesh warping for temporal 3D DWT (0x13 packets) + int enable_mesh_warp; // Flag to enable mesh warping (default: 0, uses translation if temporal_dwt enabled) + int mesh_width; // Number of control points horizontally (default: 60 for 1920×1080 with 32 px cells) + int mesh_height; // Number of control points vertically (default: 34 for 1920×1080 with 32 px cells) + int16_t **gop_mesh_dx; // [frame][mesh_w * mesh_h] - Mesh displacement X in 1/8-pixel units + int16_t **gop_mesh_dy; // [frame][mesh_w * mesh_h] - Mesh displacement Y in 1/8-pixel units + + // Dense optical flow for reliability masking (used with mesh warping) + float **gop_flow_fwd_x; // [frame][width * height] - Forward flow X (F[i-1] → F[i]) + float **gop_flow_fwd_y; // [frame][width * height] - Forward flow Y + float **gop_flow_bwd_x; // [frame][width * height] - Backward flow X (F[i] → F[i-1]) + float **gop_flow_bwd_y; // [frame][width * height] - Backward flow Y + + // Selective per-cell affine transforms (0x13 packets) + uint8_t **gop_affine_mask; // 
[frame][mesh_w * mesh_h] - 1 bit: 1=affine, 0=translation only + int16_t **gop_affine_a11; // [frame][mesh_w * mesh_h] - Affine matrix 1/256 fixed-point (a11, a12, a21, a22) + int16_t **gop_affine_a12; + int16_t **gop_affine_a21; + int16_t **gop_affine_a22; + + float mesh_smoothness; // Laplacian smoothing weight (0.0-1.0, default: 0.5) + int mesh_smooth_iterations; // Number of smoothing iterations (default: 8) + float affine_threshold; // Residual improvement threshold for using affine (default: 0.10 = 10%) + // Tile processing int tiles_x, tiles_y; dwt_tile_t *tiles; @@ -647,6 +672,8 @@ static int process_subtitles(tav_encoder_t *enc, int frame_num, FILE *output); // Temporal 3D DWT prototypes static void dwt_3d_forward(float **gop_data, int width, int height, int num_frames, int spatial_levels, int temporal_levels, int spatial_filter); +static void dwt_3d_forward_mc(tav_encoder_t *enc, float **gop_y, float **gop_co, float **gop_cg, + int num_frames, int spatial_levels, int temporal_levels, int spatial_filter); static void dwt_3d_inverse(float **gop_data, int width, int height, int num_frames, int spatial_levels, int temporal_levels, int spatial_filter); static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser, @@ -710,6 +737,7 @@ static void show_usage(const char *program_name) { printf(" --enable-delta Enable delta encoding\n"); printf(" --delta-haar N Apply N-level Haar DWT to delta coefficients (1-6, auto-enables delta)\n"); printf(" --temporal-dwt Enable temporal 3D DWT (GOP-based encoding with temporal transform)\n"); + printf(" --mesh-warp Enable mesh-based motion compensation (requires --temporal-dwt, uses optical flow)\n"); printf(" --ictcp Use ICtCp colour space instead of YCoCg-R (use when source is in BT.2100)\n"); printf(" --no-perceptual-tuning Disable perceptual quantisation\n"); printf(" --no-dead-zone Disable dead-zone quantisation (for comparison/testing)\n"); @@ -797,6 +825,25 @@ static tav_encoder_t* create_encoder(void) { 
enc->gop_translation_x = NULL; enc->gop_translation_y = NULL; + // Mesh warping settings (for 0x13 packets) + enc->enable_mesh_warp = 0; // Default: disabled (use translation-based 0x12) + enc->mesh_width = 0; // Will be calculated based on frame dimensions + enc->mesh_height = 0; + enc->gop_mesh_dx = NULL; + enc->gop_mesh_dy = NULL; + enc->gop_flow_fwd_x = NULL; // Dense flow for reliability masking + enc->gop_flow_fwd_y = NULL; + enc->gop_flow_bwd_x = NULL; + enc->gop_flow_bwd_y = NULL; + enc->gop_affine_mask = NULL; + enc->gop_affine_a11 = NULL; + enc->gop_affine_a12 = NULL; + enc->gop_affine_a21 = NULL; + enc->gop_affine_a22 = NULL; + enc->mesh_smoothness = 0.5f; // 50% smoothness weight + enc->mesh_smooth_iterations = 8; // 8 iterations + enc->affine_threshold = 0.40f; // 40% residual improvement required to use affine (was 0.10, too lenient) + return enc; } @@ -945,6 +992,93 @@ static int initialise_encoder(tav_encoder_t *enc) { memset(enc->gop_translation_x, 0, enc->gop_capacity * sizeof(int16_t)); memset(enc->gop_translation_y, 0, enc->gop_capacity * sizeof(int16_t)); + // Calculate mesh dimensions if mesh warping is enabled + if (enc->enable_mesh_warp) { + // Calculate mesh grid: approximately 16×16 pixel cells + // For 560×448: 18×14 mesh (252 points), for 1920×1080: 60×34 mesh (2040 points) + // Mesh density: 32×32 pixel cells for finer motion modeling + // Matches block matching resolution (16×16 blocks → 2 blocks per cell) + int mesh_cell_size = 32; // Pixel size of each mesh cell (finer than 64) + enc->mesh_width = (enc->width + mesh_cell_size - 1) / mesh_cell_size; // Round up + enc->mesh_height = (enc->height + mesh_cell_size - 1) / mesh_cell_size; + + // Ensure minimum 2×2 mesh + if (enc->mesh_width < 2) enc->mesh_width = 2; + if (enc->mesh_height < 2) enc->mesh_height = 2; + + int mesh_size = enc->mesh_width * enc->mesh_height; + + // Allocate mesh arrays for each GOP frame + enc->gop_mesh_dx = malloc(enc->gop_capacity * sizeof(int16_t*)); + 
enc->gop_mesh_dy = malloc(enc->gop_capacity * sizeof(int16_t*)); + enc->gop_affine_mask = malloc(enc->gop_capacity * sizeof(uint8_t*)); + enc->gop_affine_a11 = malloc(enc->gop_capacity * sizeof(int16_t*)); + enc->gop_affine_a12 = malloc(enc->gop_capacity * sizeof(int16_t*)); + enc->gop_affine_a21 = malloc(enc->gop_capacity * sizeof(int16_t*)); + enc->gop_affine_a22 = malloc(enc->gop_capacity * sizeof(int16_t*)); + + // Allocate dense flow arrays for reliability masking + enc->gop_flow_fwd_x = malloc(enc->gop_capacity * sizeof(float*)); + enc->gop_flow_fwd_y = malloc(enc->gop_capacity * sizeof(float*)); + enc->gop_flow_bwd_x = malloc(enc->gop_capacity * sizeof(float*)); + enc->gop_flow_bwd_y = malloc(enc->gop_capacity * sizeof(float*)); + + if (!enc->gop_mesh_dx || !enc->gop_mesh_dy || !enc->gop_affine_mask || + !enc->gop_affine_a11 || !enc->gop_affine_a12 || + !enc->gop_affine_a21 || !enc->gop_affine_a22 || + !enc->gop_flow_fwd_x || !enc->gop_flow_fwd_y || + !enc->gop_flow_bwd_x || !enc->gop_flow_bwd_y) { + fprintf(stderr, "Failed to allocate GOP mesh arrays\n"); + return -1; + } + + // Allocate individual mesh and affine buffers + int flow_size = enc->width * enc->height; + for (int i = 0; i < enc->gop_capacity; i++) { + enc->gop_mesh_dx[i] = malloc(mesh_size * sizeof(int16_t)); + enc->gop_mesh_dy[i] = malloc(mesh_size * sizeof(int16_t)); + enc->gop_affine_mask[i] = malloc(mesh_size * sizeof(uint8_t)); + enc->gop_affine_a11[i] = malloc(mesh_size * sizeof(int16_t)); + enc->gop_affine_a12[i] = malloc(mesh_size * sizeof(int16_t)); + enc->gop_affine_a21[i] = malloc(mesh_size * sizeof(int16_t)); + enc->gop_affine_a22[i] = malloc(mesh_size * sizeof(int16_t)); + + // Allocate dense flow buffers for reliability masking + enc->gop_flow_fwd_x[i] = malloc(flow_size * sizeof(float)); + enc->gop_flow_fwd_y[i] = malloc(flow_size * sizeof(float)); + enc->gop_flow_bwd_x[i] = malloc(flow_size * sizeof(float)); + enc->gop_flow_bwd_y[i] = malloc(flow_size * sizeof(float)); + + if 
(!enc->gop_mesh_dx[i] || !enc->gop_mesh_dy[i] || !enc->gop_affine_mask[i] || + !enc->gop_affine_a11[i] || !enc->gop_affine_a12[i] || + !enc->gop_affine_a21[i] || !enc->gop_affine_a22[i] || + !enc->gop_flow_fwd_x[i] || !enc->gop_flow_fwd_y[i] || + !enc->gop_flow_bwd_x[i] || !enc->gop_flow_bwd_y[i]) { + fprintf(stderr, "Failed to allocate GOP mesh buffers\n"); + return -1; + } + + // Initialize to zero (affine mask=0 means translation-only, identity affine) + memset(enc->gop_mesh_dx[i], 0, mesh_size * sizeof(int16_t)); + memset(enc->gop_mesh_dy[i], 0, mesh_size * sizeof(int16_t)); + memset(enc->gop_affine_mask[i], 0, mesh_size * sizeof(uint8_t)); + // Initialize affine to identity matrix (256 = 1.0 in 1/256 fixed-point) + for (int j = 0; j < mesh_size; j++) { + enc->gop_affine_a11[i][j] = 256; // 1.0 + enc->gop_affine_a12[i][j] = 0; + enc->gop_affine_a21[i][j] = 0; + enc->gop_affine_a22[i][j] = 256; // 1.0 + } + } + + if (enc->verbose) { + printf("Mesh warping enabled: mesh=%dx%d (approx %dx%d px cells), smoothness=%.2f, iterations=%d\n", + enc->mesh_width, enc->mesh_height, + enc->width / enc->mesh_width, enc->height / enc->mesh_height, + enc->mesh_smoothness, enc->mesh_smooth_iterations); + } + } + if (enc->verbose) { printf("Temporal DWT enabled: GOP size=%d, temporal levels=%d\n", enc->gop_capacity, enc->temporal_decomp_levels); @@ -1454,6 +1588,51 @@ static void phase_correlate_fft(const uint8_t *frame1_rgb, const uint8_t *frame2 fftwf_free(correlation); } +// ============================================================================== +// Optical Flow and Mesh Warping Functions +// ============================================================================== + +// Forward declaration for optical flow function (implemented in encoder_tav_opencv.cpp) +// Forward declarations for OpenCV functions (implemented in encoder_tav_opencv.cpp) +extern void estimate_motion_optical_flow( + const uint8_t *frame1_rgb, const uint8_t *frame2_rgb, + int width, int height, + 
float **out_flow_x, float **out_flow_y +); + +extern void build_mesh_from_flow( + const float *flow_x, const float *flow_y, + int width, int height, int mesh_w, int mesh_h, + int16_t *mesh_dx, int16_t *mesh_dy +); + +extern void smooth_mesh_laplacian( + int16_t *mesh_dx, int16_t *mesh_dy, + int mesh_width, int mesh_height, + float smoothness, int iterations +); + +extern void warp_frame_with_mesh( + const float *src_frame, int width, int height, + const int16_t *mesh_dx, const int16_t *mesh_dy, + int mesh_width, int mesh_height, + float *dst_frame +); + +extern int estimate_cell_affine( + const float *flow_x, const float *flow_y, + int width, int height, + int cell_x, int cell_y, + int cell_w, int cell_h, + float threshold, + int16_t *out_tx, int16_t *out_ty, + int16_t *out_a11, int16_t *out_a12, + int16_t *out_a21, int16_t *out_a22 +); + +// Note: build_mesh_from_flow, smooth_mesh_laplacian, and warp_frame_with_mesh +// are implemented in encoder_tav_opencv.cpp (extern declarations above) + // Apply translation to frame (for frame alignment before temporal DWT) // NO PADDING - only extracts the valid region that will be common across all frames static void apply_translation(float *frame_data, int width, int height, @@ -1594,6 +1773,195 @@ static void quantise_3d_dwt_coefficients(tav_encoder_t *enc, } } +// ============================================================================= +// Mesh Differential Encoding for Compression +// ============================================================================= + +// Encode mesh motion vectors with selective affine using temporal and spatial prediction +// Returns the number of bytes written to output buffer +// Format: +// 1. Mesh dimensions (2 bytes each: width, height) +// 2. Affine significance mask (1 bit per cell per frame, packed into bytes) +// 3. Translation dx/dy for ALL cells (temporal + spatial differential encoding) +// 4. 
Affine parameters a11, a12, a21, a22 for cells where mask=1 (temporal + spatial differential) +static size_t encode_mesh_differential( + int16_t **mesh_dx, int16_t **mesh_dy, + uint8_t **affine_mask, + int16_t **affine_a11, int16_t **affine_a12, + int16_t **affine_a21, int16_t **affine_a22, + int gop_size, int mesh_width, int mesh_height, + uint8_t *output_buffer, size_t buffer_capacity +) { + int mesh_points = mesh_width * mesh_height; + size_t bytes_written = 0; + + // Write mesh dimensions (2 bytes each) + if (bytes_written + 4 > buffer_capacity) return 0; + uint16_t mesh_w_16 = (uint16_t)mesh_width; + uint16_t mesh_h_16 = (uint16_t)mesh_height; + memcpy(output_buffer + bytes_written, &mesh_w_16, sizeof(uint16_t)); + bytes_written += sizeof(uint16_t); + memcpy(output_buffer + bytes_written, &mesh_h_16, sizeof(uint16_t)); + bytes_written += sizeof(uint16_t); + + // Write affine significance mask (packed bits: 1=affine, 0=translation-only) + int total_mask_bits = gop_size * mesh_points; + int mask_bytes = (total_mask_bits + 7) / 8; + if (bytes_written + mask_bytes > buffer_capacity) return 0; + + memset(output_buffer + bytes_written, 0, mask_bytes); // Clear mask buffer + int bit_idx = 0; + for (int t = 0; t < gop_size; t++) { + for (int i = 0; i < mesh_points; i++) { + if (affine_mask[t][i]) { + output_buffer[bytes_written + bit_idx / 8] |= (1 << (bit_idx % 8)); + } + bit_idx++; + } + } + bytes_written += mask_bytes; + + // Encode translation data for ALL cells (always present) + for (int t = 0; t < gop_size; t++) { + for (int i = 0; i < mesh_points; i++) { + int16_t dx = mesh_dx[t][i]; + int16_t dy = mesh_dy[t][i]; + + // Temporal prediction + if (t > 0) { + dx -= mesh_dx[t - 1][i]; + dy -= mesh_dy[t - 1][i]; + } + + // Spatial prediction + if (i > 0 && (i % mesh_width) != 0) { + int16_t left_dx = mesh_dx[t][i - 1]; + int16_t left_dy = mesh_dy[t][i - 1]; + if (t > 0) { + left_dx -= mesh_dx[t - 1][i - 1]; + left_dy -= mesh_dy[t - 1][i - 1]; + } + dx -= left_dx; + 
dy -= left_dy; + } + + if (bytes_written + 4 > buffer_capacity) return 0; + memcpy(output_buffer + bytes_written, &dx, sizeof(int16_t)); + bytes_written += sizeof(int16_t); + memcpy(output_buffer + bytes_written, &dy, sizeof(int16_t)); + bytes_written += sizeof(int16_t); + } + } + + // Encode affine parameters for cells where mask=1 + for (int t = 0; t < gop_size; t++) { + for (int i = 0; i < mesh_points; i++) { + if (!affine_mask[t][i]) continue; // Skip translation-only cells + + int16_t a11 = affine_a11[t][i]; + int16_t a12 = affine_a12[t][i]; + int16_t a21 = affine_a21[t][i]; + int16_t a22 = affine_a22[t][i]; + + // Temporal prediction (delta from previous frame's affine params) + if (t > 0 && affine_mask[t - 1][i]) { + a11 -= affine_a11[t - 1][i]; + a12 -= affine_a12[t - 1][i]; + a21 -= affine_a21[t - 1][i]; + a22 -= affine_a22[t - 1][i]; + } + + // Spatial prediction (delta from left neighbor if it also uses affine) + if (i > 0 && (i % mesh_width) != 0 && affine_mask[t][i - 1]) { + int16_t left_a11 = affine_a11[t][i - 1]; + int16_t left_a12 = affine_a12[t][i - 1]; + int16_t left_a21 = affine_a21[t][i - 1]; + int16_t left_a22 = affine_a22[t][i - 1]; + + if (t > 0 && affine_mask[t - 1][i - 1]) { + left_a11 -= affine_a11[t - 1][i - 1]; + left_a12 -= affine_a12[t - 1][i - 1]; + left_a21 -= affine_a21[t - 1][i - 1]; + left_a22 -= affine_a22[t - 1][i - 1]; + } + + a11 -= left_a11; + a12 -= left_a12; + a21 -= left_a21; + a22 -= left_a22; + } + + if (bytes_written + 8 > buffer_capacity) return 0; + memcpy(output_buffer + bytes_written, &a11, sizeof(int16_t)); + bytes_written += sizeof(int16_t); + memcpy(output_buffer + bytes_written, &a12, sizeof(int16_t)); + bytes_written += sizeof(int16_t); + memcpy(output_buffer + bytes_written, &a21, sizeof(int16_t)); + bytes_written += sizeof(int16_t); + memcpy(output_buffer + bytes_written, &a22, sizeof(int16_t)); + bytes_written += sizeof(int16_t); + } + } + + return bytes_written; +} + +// Decode mesh motion vectors from 
differential encoding +// Returns 0 on success, -1 on error +// This is the inverse of encode_mesh_differential() +static int decode_mesh_differential( + const uint8_t *input_buffer, size_t buffer_size, + int16_t **mesh_dx, int16_t **mesh_dy, + int gop_size, int *out_mesh_width, int *out_mesh_height +) { + size_t bytes_read = 0; + + // Read mesh dimensions + if (bytes_read + 4 > buffer_size) return -1; + uint16_t mesh_w_16, mesh_h_16; + memcpy(&mesh_w_16, input_buffer + bytes_read, sizeof(uint16_t)); + bytes_read += sizeof(uint16_t); + memcpy(&mesh_h_16, input_buffer + bytes_read, sizeof(uint16_t)); + bytes_read += sizeof(uint16_t); + + int mesh_width = (int)mesh_w_16; + int mesh_height = (int)mesh_h_16; + int mesh_points = mesh_width * mesh_height; + + *out_mesh_width = mesh_width; + *out_mesh_height = mesh_height; + + // Decode mesh data for all frames + for (int t = 0; t < gop_size; t++) { + for (int i = 0; i < mesh_points; i++) { + // Read differential values + if (bytes_read + 4 > buffer_size) return -1; + int16_t dx_delta, dy_delta; + memcpy(&dx_delta, input_buffer + bytes_read, sizeof(int16_t)); + bytes_read += sizeof(int16_t); + memcpy(&dy_delta, input_buffer + bytes_read, sizeof(int16_t)); + bytes_read += sizeof(int16_t); + + // Reconstruct: reverse spatial prediction first + if (i > 0 && (i % mesh_width) != 0) { + dx_delta += mesh_dx[t][i - 1]; + dy_delta += mesh_dy[t][i - 1]; + } + + // Then reverse temporal prediction + if (t > 0) { + dx_delta += mesh_dx[t - 1][i]; + dy_delta += mesh_dy[t - 1][i]; + } + + mesh_dx[t][i] = dx_delta; + mesh_dy[t][i] = dy_delta; + } + } + + return 0; +} + // ============================================================================= // GOP Management Functions // ============================================================================= @@ -1616,29 +1984,112 @@ static int gop_add_frame(tav_encoder_t *enc, const uint8_t *frame_rgb, memcpy(enc->gop_co_frames[frame_idx], frame_co, frame_channel_size); 
memcpy(enc->gop_cg_frames[frame_idx], frame_cg, frame_channel_size); - // Compute translation vector if not first frame - /*if (frame_idx > 0) { - phase_correlate_fft(enc->gop_rgb_frames[frame_idx - 1], - enc->gop_rgb_frames[frame_idx], - enc->width, enc->height, - &enc->gop_translation_x[frame_idx], - &enc->gop_translation_y[frame_idx]); + // Compute motion estimation if mesh warping is enabled + if (enc->enable_mesh_warp && frame_idx > 0) { + // Compute forward optical flow (F[i-1] → F[i]) + estimate_motion_optical_flow( + enc->gop_rgb_frames[frame_idx - 1], + enc->gop_rgb_frames[frame_idx], + enc->width, enc->height, + &enc->gop_flow_fwd_x[frame_idx], + &enc->gop_flow_fwd_y[frame_idx] + ); + + // Compute backward optical flow (F[i] → F[i-1]) for reliability masking + estimate_motion_optical_flow( + enc->gop_rgb_frames[frame_idx], + enc->gop_rgb_frames[frame_idx - 1], + enc->width, enc->height, + &enc->gop_flow_bwd_x[frame_idx], + &enc->gop_flow_bwd_y[frame_idx] + ); + + // Build distortion mesh from forward flow field + build_mesh_from_flow( + enc->gop_flow_fwd_x[frame_idx], enc->gop_flow_fwd_y[frame_idx], + enc->width, enc->height, + enc->mesh_width, enc->mesh_height, + enc->gop_mesh_dx[frame_idx], enc->gop_mesh_dy[frame_idx] + ); + + // Apply Laplacian smoothing to regularize mesh + smooth_mesh_laplacian( + enc->gop_mesh_dx[frame_idx], enc->gop_mesh_dy[frame_idx], + enc->mesh_width, enc->mesh_height, + enc->mesh_smoothness, enc->mesh_smooth_iterations + ); + + // Estimate selective per-cell affine transforms + int cell_w = enc->width / enc->mesh_width; + int cell_h = enc->height / enc->mesh_height; + int affine_count = 0; + + for (int cy = 0; cy < enc->mesh_height; cy++) { + for (int cx = 0; cx < enc->mesh_width; cx++) { + int cell_idx = cy * enc->mesh_width + cx; + + int16_t tx, ty, a11, a12, a21, a22; + int use_affine = estimate_cell_affine( + enc->gop_flow_fwd_x[frame_idx], enc->gop_flow_fwd_y[frame_idx], + enc->width, enc->height, + cx, cy, cell_w, cell_h, 
+ enc->affine_threshold, + &tx, &ty, &a11, &a12, &a21, &a22 + ); + + if (use_affine) { + // Use affine transform for this cell + enc->gop_affine_mask[frame_idx][cell_idx] = 1; + enc->gop_mesh_dx[frame_idx][cell_idx] = tx; + enc->gop_mesh_dy[frame_idx][cell_idx] = ty; + enc->gop_affine_a11[frame_idx][cell_idx] = a11; + enc->gop_affine_a12[frame_idx][cell_idx] = a12; + enc->gop_affine_a21[frame_idx][cell_idx] = a21; + enc->gop_affine_a22[frame_idx][cell_idx] = a22; + affine_count++; + } else { + // Use translation-only (keep existing mesh dx/dy from build_mesh_from_flow) + enc->gop_affine_mask[frame_idx][cell_idx] = 0; + // Keep dx/dy from build_mesh_from_flow, set affine to identity + enc->gop_affine_a11[frame_idx][cell_idx] = 256; + enc->gop_affine_a12[frame_idx][cell_idx] = 0; + enc->gop_affine_a21[frame_idx][cell_idx] = 0; + enc->gop_affine_a22[frame_idx][cell_idx] = 256; + } + } + } + + // Flow is now stored in enc->gop_flow_*[frame_idx], don't free if (enc->verbose && (frame_idx < 3 || frame_idx == enc->gop_capacity - 1)) { - printf(" GOP frame %d: translation = (%.3f, %.3f) pixels\n", - frame_idx, - enc->gop_translation_x[frame_idx] / 16.0f, - enc->gop_translation_y[frame_idx] / 16.0f); + // Compute average mesh displacement for verbose output + int mesh_points = enc->mesh_width * enc->mesh_height; + float avg_dx = 0.0f, avg_dy = 0.0f; + for (int i = 0; i < mesh_points; i++) { + avg_dx += fabsf(enc->gop_mesh_dx[frame_idx][i] / 8.0f); + avg_dy += fabsf(enc->gop_mesh_dy[frame_idx][i] / 8.0f); + } + avg_dx /= mesh_points; + avg_dy /= mesh_points; + printf(" GOP frame %d: mesh avg=(%.2f,%.2f)px, %d/%d affine (%.1f%%), mesh=%dx%d\n", + frame_idx, avg_dx, avg_dy, + affine_count, mesh_points, 100.0f * affine_count / mesh_points, + enc->mesh_width, enc->mesh_height); } - } else { - // First frame has no translation - enc->gop_translation_x[0] = 0; - enc->gop_translation_y[0] = 0; - }*/ + } else if (frame_idx == 0) { + // First frame has no motion (reference frame) + 
if (enc->enable_mesh_warp) { + int mesh_points = enc->mesh_width * enc->mesh_height; + memset(enc->gop_mesh_dx[0], 0, mesh_points * sizeof(int16_t)); + memset(enc->gop_mesh_dy[0], 0, mesh_points * sizeof(int16_t)); + } + } - // disabling frame realigning: producing worse results in general - enc->gop_translation_x[frame_idx] = 0.0f; - enc->gop_translation_y[frame_idx] = 0.0f; + // Legacy translation vectors (used when mesh warp is disabled) + if (!enc->enable_mesh_warp) { + enc->gop_translation_x[frame_idx] = 0.0f; + enc->gop_translation_y[frame_idx] = 0.0f; + } enc->gop_frame_count++; return 0; @@ -1770,54 +2221,75 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser, valid_width, valid_height, crop_left, crop_right, crop_top, crop_bottom); } - // Step 0.6: Apply motion compensation to align frames before temporal DWT - // This uses the cumulative translation vectors to align each frame to frame 0 - for (int i = 1; i < actual_gop_size; i++) { // Skip frame 0 (reference frame) - float *aligned_y = malloc(num_pixels * sizeof(float)); - float *aligned_co = malloc(num_pixels * sizeof(float)); - float *aligned_cg = malloc(num_pixels * sizeof(float)); + // Step 0.6: Motion compensation note + // For mesh warping: MC-lifting integrates motion compensation directly into the lifting steps + // For translation: still use pre-alignment (old method for backwards compatibility) + if (!enc->enable_mesh_warp) { + // Translation-based motion compensation: align using global translation vectors + for (int i = 1; i < actual_gop_size; i++) { // Skip frame 0 (reference frame) + float *aligned_y = malloc(num_pixels * sizeof(float)); + float *aligned_co = malloc(num_pixels * sizeof(float)); + float *aligned_cg = malloc(num_pixels * sizeof(float)); + + if (!aligned_y || !aligned_co || !aligned_cg) { + fprintf(stderr, "Error: Failed to allocate motion compensation buffers\n"); + // Cleanup and skip motion compensation for this GOP + free(aligned_y); + 
free(aligned_co); + free(aligned_cg); + break; + } + + // Apply translation to align this frame + apply_translation(gop_y_coeffs[i], enc->width, enc->height, + enc->gop_translation_x[i], enc->gop_translation_y[i], aligned_y); + apply_translation(gop_co_coeffs[i], enc->width, enc->height, + enc->gop_translation_x[i], enc->gop_translation_y[i], aligned_co); + apply_translation(gop_cg_coeffs[i], enc->width, enc->height, + enc->gop_translation_x[i], enc->gop_translation_y[i], aligned_cg); + + // Copy aligned frames back + memcpy(gop_y_coeffs[i], aligned_y, num_pixels * sizeof(float)); + memcpy(gop_co_coeffs[i], aligned_co, num_pixels * sizeof(float)); + memcpy(gop_cg_coeffs[i], aligned_cg, num_pixels * sizeof(float)); - if (!aligned_y || !aligned_co || !aligned_cg) { - fprintf(stderr, "Error: Failed to allocate motion compensation buffers\n"); - // Cleanup and skip motion compensation for this GOP free(aligned_y); free(aligned_co); free(aligned_cg); - break; } - - // Apply translation to align this frame - apply_translation(gop_y_coeffs[i], enc->width, enc->height, - enc->gop_translation_x[i], enc->gop_translation_y[i], aligned_y); - apply_translation(gop_co_coeffs[i], enc->width, enc->height, - enc->gop_translation_x[i], enc->gop_translation_y[i], aligned_co); - apply_translation(gop_cg_coeffs[i], enc->width, enc->height, - enc->gop_translation_x[i], enc->gop_translation_y[i], aligned_cg); - - // Copy aligned frames back - memcpy(gop_y_coeffs[i], aligned_y, num_pixels * sizeof(float)); - memcpy(gop_co_coeffs[i], aligned_co, num_pixels * sizeof(float)); - memcpy(gop_cg_coeffs[i], aligned_cg, num_pixels * sizeof(float)); - - free(aligned_y); - free(aligned_co); - free(aligned_cg); + } else { + // Mesh-based motion compensation uses MC-lifting (integrated into temporal DWT) + if (enc->verbose) { + printf("Using motion-compensated lifting (MC-lifting) (%dx%d mesh)\n", + enc->mesh_width, enc->mesh_height); + } } - // Step 0.7: Expand frames to larger canvas that preserves 
ALL original pixels - // Calculate expanded canvas size (UNION of all aligned frames) - int canvas_width = enc->width + crop_left + crop_right; // Original width + total shift range - int canvas_height = enc->height + crop_top + crop_bottom; // Original height + total shift range - int canvas_pixels = canvas_width * canvas_height; + // Step 0.7: Expand frames to larger canvas (only for translation-based alignment) + // Mesh-based warping doesn't need canvas expansion since the mesh handles local distortions + int canvas_width = enc->width; + int canvas_height = enc->height; + int canvas_pixels = num_pixels; - if (enc->verbose && (crop_left || crop_right || crop_top || crop_bottom)) { - printf("Expanded canvas: %dx%d (original %dx%d + margins L=%d R=%d T=%d B=%d)\n", - canvas_width, canvas_height, enc->width, enc->height, - crop_left, crop_right, crop_top, crop_bottom); - printf("This preserves all original pixels from all frames after alignment\n"); + if (!enc->enable_mesh_warp) { + // Calculate expanded canvas size (UNION of all aligned frames) + canvas_width = enc->width + crop_left + crop_right; // Original width + total shift range + canvas_height = enc->height + crop_top + crop_bottom; // Original height + total shift range + canvas_pixels = canvas_width * canvas_height; + + if (enc->verbose && (crop_left || crop_right || crop_top || crop_bottom)) { + printf("Expanded canvas: %dx%d (original %dx%d + margins L=%d R=%d T=%d B=%d)\n", + canvas_width, canvas_height, enc->width, enc->height, + crop_left, crop_right, crop_top, crop_bottom); + printf("This preserves all original pixels from all frames after alignment\n"); + } + } else { + if (enc->verbose) { + printf("Using original frame dimensions (mesh warping handles distortions)\n"); + } } - // Allocate expanded canvas buffers + // Allocate canvas buffers (expanded for translation, original size for mesh) float **canvas_y_coeffs = malloc(actual_gop_size * sizeof(float*)); float **canvas_co_coeffs = 
malloc(actual_gop_size * sizeof(float*)); float **canvas_cg_coeffs = malloc(actual_gop_size * sizeof(float*)); @@ -1827,58 +2299,65 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser, canvas_co_coeffs[i] = calloc(canvas_pixels, sizeof(float)); canvas_cg_coeffs[i] = calloc(canvas_pixels, sizeof(float)); - // Place the aligned frame onto the canvas at the appropriate offset - // Each frame's aligned position determines where it sits on the canvas - int offset_x = crop_left; // Frames are offset by the left margin - int offset_y = crop_top; // Frames are offset by the top margin + if (enc->enable_mesh_warp) { + // Mesh warping: simply copy frames (no expansion needed) + memcpy(canvas_y_coeffs[i], gop_y_coeffs[i], canvas_pixels * sizeof(float)); + memcpy(canvas_co_coeffs[i], gop_co_coeffs[i], canvas_pixels * sizeof(float)); + memcpy(canvas_cg_coeffs[i], gop_cg_coeffs[i], canvas_pixels * sizeof(float)); + } else { + // Translation-based: expand canvas and add symmetric padding + // Place the aligned frame onto the canvas at the appropriate offset + int offset_x = crop_left; // Frames are offset by the left margin + int offset_y = crop_top; // Frames are offset by the top margin - // Copy the full aligned frame onto the canvas (preserves all original content) - for (int y = 0; y < enc->height; y++) { - for (int x = 0; x < enc->width; x++) { - int src_idx = y * enc->width + x; - int dst_idx = (y + offset_y) * canvas_width + (x + offset_x); - canvas_y_coeffs[i][dst_idx] = gop_y_coeffs[i][src_idx]; - canvas_co_coeffs[i][dst_idx] = gop_co_coeffs[i][src_idx]; - canvas_cg_coeffs[i][dst_idx] = gop_cg_coeffs[i][src_idx]; + // Copy the full aligned frame onto the canvas (preserves all original content) + for (int y = 0; y < enc->height; y++) { + for (int x = 0; x < enc->width; x++) { + int src_idx = y * enc->width + x; + int dst_idx = (y + offset_y) * canvas_width + (x + offset_x); + canvas_y_coeffs[i][dst_idx] = gop_y_coeffs[i][src_idx]; + 
canvas_co_coeffs[i][dst_idx] = gop_co_coeffs[i][src_idx]; + canvas_cg_coeffs[i][dst_idx] = gop_cg_coeffs[i][src_idx]; + } } - } - // Fill margin areas with symmetric padding from frame edges - for (int y = 0; y < canvas_height; y++) { - for (int x = 0; x < canvas_width; x++) { - // Skip pixels in the original frame region (already copied) - if (y >= offset_y && y < offset_y + enc->height && - x >= offset_x && x < offset_x + enc->width) { - continue; + // Fill margin areas with symmetric padding from frame edges + for (int y = 0; y < canvas_height; y++) { + for (int x = 0; x < canvas_width; x++) { + // Skip pixels in the original frame region (already copied) + if (y >= offset_y && y < offset_y + enc->height && + x >= offset_x && x < offset_x + enc->width) { + continue; + } + + // Calculate position relative to original frame + int src_x = x - offset_x; + int src_y = y - offset_y; + + // Apply symmetric padding (mirroring) + if (src_x < 0) { + src_x = -src_x - 1; // Mirror left edge: -1→0, -2→1, -3→2 + } else if (src_x >= enc->width) { + src_x = 2 * enc->width - src_x - 1; // Mirror right edge + } + + if (src_y < 0) { + src_y = -src_y - 1; // Mirror top edge + } else if (src_y >= enc->height) { + src_y = 2 * enc->height - src_y - 1; // Mirror bottom edge + } + + // Clamp to valid range (safety for extreme cases) + src_x = CLAMP(src_x, 0, enc->width - 1); + src_y = CLAMP(src_y, 0, enc->height - 1); + + // Copy mirrored pixel from original frame to canvas margin + int src_idx = src_y * enc->width + src_x; + int dst_idx = y * canvas_width + x; + canvas_y_coeffs[i][dst_idx] = gop_y_coeffs[i][src_idx]; + canvas_co_coeffs[i][dst_idx] = gop_co_coeffs[i][src_idx]; + canvas_cg_coeffs[i][dst_idx] = gop_cg_coeffs[i][src_idx]; } - - // Calculate position relative to original frame - int src_x = x - offset_x; - int src_y = y - offset_y; - - // Apply symmetric padding (mirroring) - if (src_x < 0) { - src_x = -src_x - 1; // Mirror left edge: -1→0, -2→1, -3→2 - } else if (src_x >= 
enc->width) { - src_x = 2 * enc->width - src_x - 1; // Mirror right edge - } - - if (src_y < 0) { - src_y = -src_y - 1; // Mirror top edge - } else if (src_y >= enc->height) { - src_y = 2 * enc->height - src_y - 1; // Mirror bottom edge - } - - // Clamp to valid range (safety for extreme cases) - src_x = CLAMP(src_x, 0, enc->width - 1); - src_y = CLAMP(src_y, 0, enc->height - 1); - - // Copy mirrored pixel from original frame to canvas margin - int src_idx = src_y * enc->width + src_x; - int dst_idx = y * canvas_width + x; - canvas_y_coeffs[i][dst_idx] = gop_y_coeffs[i][src_idx]; - canvas_co_coeffs[i][dst_idx] = gop_co_coeffs[i][src_idx]; - canvas_cg_coeffs[i][dst_idx] = gop_cg_coeffs[i][src_idx]; } } @@ -1888,7 +2367,7 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser, free(gop_cg_coeffs[i]); } - // Replace pointers with expanded canvas + // Replace pointers with canvas free(gop_y_coeffs); free(gop_co_coeffs); free(gop_cg_coeffs); @@ -1915,12 +2394,21 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser, // Multi-frame GOP: Apply 3D DWT (temporal + spatial) to each channel // Note: This modifies gop_*_coeffs in-place // Use cropped dimensions to encode only the valid region - dwt_3d_forward(gop_y_coeffs, valid_width, valid_height, actual_gop_size, - enc->decomp_levels, enc->temporal_decomp_levels, enc->wavelet_filter); - dwt_3d_forward(gop_co_coeffs, valid_width, valid_height, actual_gop_size, - enc->decomp_levels, enc->temporal_decomp_levels, enc->wavelet_filter); - dwt_3d_forward(gop_cg_coeffs, valid_width, valid_height, actual_gop_size, - enc->decomp_levels, enc->temporal_decomp_levels, enc->wavelet_filter); + + if (enc->enable_mesh_warp) { + // Use MC-lifting: motion compensation integrated into lifting steps + dwt_3d_forward_mc(enc, gop_y_coeffs, gop_co_coeffs, gop_cg_coeffs, + actual_gop_size, enc->decomp_levels, + enc->temporal_decomp_levels, enc->wavelet_filter); + } else { + // Use traditional 3D 
DWT with pre-aligned frames (translation-only) + dwt_3d_forward(gop_y_coeffs, valid_width, valid_height, actual_gop_size, + enc->decomp_levels, enc->temporal_decomp_levels, enc->wavelet_filter); + dwt_3d_forward(gop_co_coeffs, valid_width, valid_height, actual_gop_size, + enc->decomp_levels, enc->temporal_decomp_levels, enc->wavelet_filter); + dwt_3d_forward(gop_cg_coeffs, valid_width, valid_height, actual_gop_size, + enc->decomp_levels, enc->temporal_decomp_levels, enc->wavelet_filter); + } } // Step 2: Allocate quantized coefficient buffers @@ -2024,9 +2512,8 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser, } } else { // Multi-frame GOP: use unified 3D DWT encoding - // Write unified GOP packet header - // Packet structure: [packet_type=0x12][gop_size][crop_info][motion_vectors...][compressed_size][compressed_data] - uint8_t packet_type = TAV_PACKET_GOP_UNIFIED; + // Choose packet type based on motion compensation method + uint8_t packet_type = enc->enable_mesh_warp ? 
TAV_PACKET_GOP_UNIFIED_MESH : TAV_PACKET_GOP_UNIFIED; fwrite(&packet_type, 1, 1, output); total_bytes_written += 1; @@ -2035,25 +2522,108 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser, fwrite(&gop_size_byte, 1, 1, output); total_bytes_written += 1; - // Write canvas expansion information (4 bytes) - // This tells the decoder the margins added to preserve all original pixels - // The encoded canvas is larger than the original frame to preserve edge content after alignment - uint8_t canvas_margins[4] = { - (uint8_t)crop_left, // Left margin - (uint8_t)crop_right, // Right margin - (uint8_t)crop_top, // Top margin - (uint8_t)crop_bottom // Bottom margin - }; - fwrite(canvas_margins, 1, 4, output); - total_bytes_written += 4; + if (enc->enable_mesh_warp) { + // Packet 0x13: Mesh-based warping with selective affine + // Encode mesh motion vectors + affine parameters and compress with Zstd + // Max size: mesh dimensions (4) + affine mask + translation (4 bytes/cell) + affine (8 bytes/cell worst case) + size_t max_mesh_size = 4 + (actual_gop_size * enc->mesh_width * enc->mesh_height * 13); + uint8_t *mesh_buffer = malloc(max_mesh_size); - // Write all motion vectors (1/16-pixel precision) for the entire GOP - for (int t = 0; t < actual_gop_size; t++) { - int16_t dx = enc->gop_translation_x[t]; - int16_t dy = enc->gop_translation_y[t]; - fwrite(&dx, sizeof(int16_t), 1, output); - fwrite(&dy, sizeof(int16_t), 1, output); + size_t mesh_size = encode_mesh_differential( + enc->gop_mesh_dx, enc->gop_mesh_dy, + enc->gop_affine_mask, + enc->gop_affine_a11, enc->gop_affine_a12, + enc->gop_affine_a21, enc->gop_affine_a22, + actual_gop_size, enc->mesh_width, enc->mesh_height, + mesh_buffer, max_mesh_size + ); + + if (mesh_size == 0) { + fprintf(stderr, "Error: Failed to encode mesh motion vectors\n"); + free(mesh_buffer); + // Free all allocated buffers + for (int i = 0; i < actual_gop_size; i++) { + free(gop_y_coeffs[i]); + free(gop_co_coeffs[i]); 
+ free(gop_cg_coeffs[i]); + free(quant_y[i]); + free(quant_co[i]); + free(quant_cg[i]); + } + free(gop_y_coeffs); + free(gop_co_coeffs); + free(gop_cg_coeffs); + free(quant_y); + free(quant_co); + free(quant_cg); + return 0; + } + + // Compress mesh data with Zstd + size_t max_compressed_mesh = ZSTD_compressBound(mesh_size); + uint8_t *compressed_mesh = malloc(max_compressed_mesh); + size_t compressed_mesh_size = ZSTD_compress( + compressed_mesh, max_compressed_mesh, + mesh_buffer, mesh_size, + enc->zstd_level + ); + + if (ZSTD_isError(compressed_mesh_size)) { + fprintf(stderr, "Error: Zstd compression failed for mesh data\n"); + free(mesh_buffer); + free(compressed_mesh); + // Free all allocated buffers + for (int i = 0; i < actual_gop_size; i++) { + free(gop_y_coeffs[i]); + free(gop_co_coeffs[i]); + free(gop_cg_coeffs[i]); + free(quant_y[i]); + free(quant_co[i]); + free(quant_cg[i]); + } + free(gop_y_coeffs); + free(gop_co_coeffs); + free(gop_cg_coeffs); + free(quant_y); + free(quant_co); + free(quant_cg); + return 0; + } + + // Write compressed mesh size and data + uint32_t compressed_mesh_size_32 = (uint32_t)compressed_mesh_size; + fwrite(&compressed_mesh_size_32, sizeof(uint32_t), 1, output); + fwrite(compressed_mesh, 1, compressed_mesh_size, output); + total_bytes_written += sizeof(uint32_t) + compressed_mesh_size; + + if (enc->verbose) { + printf("Mesh data: %zu bytes raw, %zu bytes compressed (%.1f%% compression)\n", + mesh_size, compressed_mesh_size, + 100.0 * compressed_mesh_size / mesh_size); + } + + free(mesh_buffer); + free(compressed_mesh); + } else { + // Packet 0x12: Translation-based alignment + // Write canvas expansion information (4 bytes) + uint8_t canvas_margins[4] = { + (uint8_t)crop_left, // Left margin + (uint8_t)crop_right, // Right margin + (uint8_t)crop_top, // Top margin + (uint8_t)crop_bottom // Bottom margin + }; + fwrite(canvas_margins, 1, 4, output); total_bytes_written += 4; + + // Write all motion vectors (1/16-pixel precision) 
for the entire GOP + for (int t = 0; t < actual_gop_size; t++) { + int16_t dx = enc->gop_translation_x[t]; + int16_t dy = enc->gop_translation_y[t]; + fwrite(&dx, sizeof(int16_t), 1, output); + fwrite(&dy, sizeof(int16_t), 1, output); + total_bytes_written += 4; + } } // Preprocess ALL frames with unified significance map @@ -2252,6 +2822,251 @@ static size_t gop_process_and_flush(tav_encoder_t *enc, FILE *output, int base_q // Temporal DWT Functions // ============================================================================= +// Invert mesh for backward warping (MC-lifting update step) +// Forward mesh: warps F0 to F1 +// Backward mesh: warps F1 to F0 (negated motion vectors) +static void invert_mesh( + const short *mesh_dx, const short *mesh_dy, + int mesh_width, int mesh_height, + short *inv_mesh_dx, short *inv_mesh_dy +) { + int num_points = mesh_width * mesh_height; + for (int i = 0; i < num_points; i++) { + inv_mesh_dx[i] = -mesh_dx[i]; + inv_mesh_dy[i] = -mesh_dy[i]; + } +} + +// Build block-based reliability mask for selective motion compensation +// Process 16×16 blocks for efficiency (matches block matching resolution) +// Returns mask where 1 = use MC, 0 = fall back to plain Haar +/*static void build_reliability_mask( + const uint8_t *frame0_rgb, const uint8_t *frame1_rgb, + const float *flow_fwd_x, const float *flow_fwd_y, + const float *flow_bwd_x, const float *flow_bwd_y, + int width, int height, + uint8_t *mask +) { + const int block_size = 16; // Match block matching resolution + int num_pixels = width * height; + + // Relaxed thresholds for better coverage + float motion_threshold = 1.0f; // pixels (relaxed from 2.0) + float fb_threshold = 2.0f; // pixels (relaxed from 1.0) + float texture_threshold = 10.0f; // gradient magnitude + + int reliable_blocks = 0; + int total_blocks = 0; + int reliable_pixels = 0; + + // Process in 16×16 blocks + for (int by = 0; by < height; by += block_size) { + for (int bx = 0; bx < width; bx += block_size) { + 
total_blocks++; + + // Compute block statistics + float sum_motion = 0.0f; + float sum_fb_error = 0.0f; + float sum_texture = 0.0f; + int block_pixels = 0; + + int bh = (by + block_size <= height) ? block_size : (height - by); + int bw = (bx + block_size <= width) ? block_size : (width - bx); + + for (int y = by; y < by + bh; y++) { + for (int x = bx; x < bx + bw; x++) { + int idx = y * width + x; + + // Motion magnitude + float fx = flow_fwd_x[idx]; + float fy = flow_fwd_y[idx]; + sum_motion += sqrtf(fx * fx + fy * fy); + + // Forward-backward consistency + int x_warped = (int)(x + fx + 0.5f); + int y_warped = (int)(y + fy + 0.5f); + if (x_warped >= 0 && x_warped < width && y_warped >= 0 && y_warped < height) { + int idx_w = y_warped * width + x_warped; + float bx_val = flow_bwd_x[idx_w]; + float by_val = flow_bwd_y[idx_w]; + float err_x = fx + bx_val; + float err_y = fy + by_val; + sum_fb_error += sqrtf(err_x * err_x + err_y * err_y); + } else { + sum_fb_error += 999.0f; + } + + // Texture (simple gradient) + if (x > 0 && x < width - 1 && y > 0 && y < height - 1) { + int rgb_idx = idx * 3; + int rgb_idx_r = (y * width + (x + 1)) * 3; + int rgb_idx_d = ((y + 1) * width + x) * 3; + + float gx = (frame0_rgb[rgb_idx_r] - frame0_rgb[rgb_idx]); + float gy = (frame0_rgb[rgb_idx_d] - frame0_rgb[rgb_idx]); + sum_texture += sqrtf(gx * gx + gy * gy); + } + + block_pixels++; + } + } + + // Average block statistics + float avg_motion = sum_motion / block_pixels; + float avg_fb_error = sum_fb_error / block_pixels; + float avg_texture = sum_texture / block_pixels; + + // Decide if block is reliable + int block_reliable = (avg_motion > motion_threshold) && + (avg_fb_error < fb_threshold) && + (avg_texture > texture_threshold); + + if (block_reliable) reliable_blocks++; + + // Apply decision to all pixels in block + for (int y = by; y < by + bh; y++) { + for (int x = bx; x < bx + bw; x++) { + int idx = y * width + x; + mask[idx] = block_reliable ? 
1 : 0; + if (mask[idx]) reliable_pixels++; + } + } + } + } + + // Debug output + printf(" Reliability mask: %d/%d blocks (%d/%d pixels, %.1f%%) - motion>%.1fpx, texture>%.1f, fb_err<%.1fpx\n", + reliable_blocks, total_blocks, reliable_pixels, num_pixels, + 100.0f * reliable_pixels / num_pixels, + motion_threshold, texture_threshold, fb_threshold); +}*/ + +// Motion-compensated lifting: Haar wavelet with SYMMETRIC bi-directional warping +// Implements predict-update lifting scheme with symmetric motion compensation +// +// Symmetric lifting (both frames meet in the middle): +// Predict: H = warp(F1, -½·mesh) - warp(F0, +½·mesh) +// Update: L = 0.5 * (warp(F0, +½·mesh) + warp(F1, -½·mesh)) +// +// With reliability masking: +// For reliable pixels: use MC-lifting (mesh warp) +// For unreliable pixels: fall back to plain Haar (H = F1 - F0, L = 0.5*(F0+F1)) +// +// This produces: +// L (lowband): symmetric motion-aligned temporal average (where reliable) +// H (highband): symmetric motion-compensated residual (where reliable) +// +// Benefits: +// - Symmetric treatment of both frames (no bias) +// - Better invertibility (both frames warp equally) +// - Lower residual energy (frames meet at temporal midpoint) +// - Eliminates errors from occlusions, low-texture regions, and bad matches +static void mc_lifting_forward_pair( + tav_encoder_t *enc, + float **f0_y, float **f0_co, float **f0_cg, // Frame 0 (reference) + float **f1_y, float **f1_co, float **f1_cg, // Frame 1 (current) + const short *mesh_dx, const short *mesh_dy, // Forward mesh (F0→F1) +// const uint8_t *reliability_mask, // Reliability mask (1=use MC, 0=plain Haar) + float **out_l_y, float **out_l_co, float **out_l_cg, // Lowband output + float **out_h_y, float **out_h_co, float **out_h_cg // Highband output +) { + int width = enc->width; + int height = enc->height; + int num_pixels = width * height; + int num_mesh_points = enc->mesh_width * enc->mesh_height; + + // Allocate temporary buffers + float 
*warped_f0_fwd_y = malloc(num_pixels * sizeof(float)); + float *warped_f0_fwd_co = malloc(num_pixels * sizeof(float)); + float *warped_f0_fwd_cg = malloc(num_pixels * sizeof(float)); + + float *warped_f1_back_y = malloc(num_pixels * sizeof(float)); + float *warped_f1_back_co = malloc(num_pixels * sizeof(float)); + float *warped_f1_back_cg = malloc(num_pixels * sizeof(float)); + + short *half_mesh_dx = malloc(num_mesh_points * sizeof(short)); + short *half_mesh_dy = malloc(num_mesh_points * sizeof(short)); + short *neg_half_mesh_dx = malloc(num_mesh_points * sizeof(short)); + short *neg_half_mesh_dy = malloc(num_mesh_points * sizeof(short)); + + if (!warped_f0_fwd_y || !warped_f0_fwd_co || !warped_f0_fwd_cg || + !warped_f1_back_y || !warped_f1_back_co || !warped_f1_back_cg || + !half_mesh_dx || !half_mesh_dy || + !neg_half_mesh_dx || !neg_half_mesh_dy) { + fprintf(stderr, "Error: Failed to allocate MC-lifting buffers\n"); + goto cleanup; + } + + // Create half-step meshes for symmetric warping + // +½·mesh for F0 forward, -½·mesh for F1 backward + for (int i = 0; i < num_mesh_points; i++) { + half_mesh_dx[i] = mesh_dx[i] / 2; + half_mesh_dy[i] = mesh_dy[i] / 2; + neg_half_mesh_dx[i] = -half_mesh_dx[i]; + neg_half_mesh_dy[i] = -half_mesh_dy[i]; + } + + // ===== SYMMETRIC PREDICT STEP: H = warp(F1, -½·mesh) - warp(F0, +½·mesh) ===== + // Warp F0 forward by half-step (toward temporal midpoint) + warp_frame_with_mesh(*f0_y, width, height, half_mesh_dx, half_mesh_dy, + enc->mesh_width, enc->mesh_height, warped_f0_fwd_y); + warp_frame_with_mesh(*f0_co, width, height, half_mesh_dx, half_mesh_dy, + enc->mesh_width, enc->mesh_height, warped_f0_fwd_co); + warp_frame_with_mesh(*f0_cg, width, height, half_mesh_dx, half_mesh_dy, + enc->mesh_width, enc->mesh_height, warped_f0_fwd_cg); + + // Warp F1 backward by half-step (toward temporal midpoint) + warp_frame_with_mesh(*f1_y, width, height, neg_half_mesh_dx, neg_half_mesh_dy, + enc->mesh_width, enc->mesh_height, 
warped_f1_back_y); + warp_frame_with_mesh(*f1_co, width, height, neg_half_mesh_dx, neg_half_mesh_dy, + enc->mesh_width, enc->mesh_height, warped_f1_back_co); + warp_frame_with_mesh(*f1_cg, width, height, neg_half_mesh_dx, neg_half_mesh_dy, + enc->mesh_width, enc->mesh_height, warped_f1_back_cg); + + float ALPHA = 0.25f; + + // Compute temporal highband and lowband with selective MC + // For reliable pixels: use MC-lifting (warped frames) + // For unreliable pixels: fall back to plain Haar (original frames) + for (int i = 0; i < num_pixels; i++) { +// if (reliability_mask[i]) { + // ===== RELIABLE: MC-LIFTING ===== + // Predict: H = warp(F1, -½·mesh) - warp(F0, +½·mesh) + (*out_h_y)[i] = warped_f1_back_y[i] - warped_f0_fwd_y[i]; + (*out_h_co)[i] = warped_f1_back_co[i] - warped_f0_fwd_co[i]; + (*out_h_cg)[i] = warped_f1_back_cg[i] - warped_f0_fwd_cg[i]; + + // Update: L = 0.5 * (warp(F0, +½·mesh) + warp(F1, -½·mesh)) + (*out_l_y)[i] = ALPHA * (warped_f0_fwd_y[i] + warped_f1_back_y[i]); + (*out_l_co)[i] = ALPHA * (warped_f0_fwd_co[i] + warped_f1_back_co[i]); + (*out_l_cg)[i] = ALPHA * (warped_f0_fwd_cg[i] + warped_f1_back_cg[i]); + /*} else { + // ===== UNRELIABLE: PLAIN HAAR (no warping) ===== + // Predict: H = F1 - F0 + (*out_h_y)[i] = (*f1_y)[i] - (*f0_y)[i]; + (*out_h_co)[i] = (*f1_co)[i] - (*f0_co)[i]; + (*out_h_cg)[i] = (*f1_cg)[i] - (*f0_cg)[i]; + + // Update: L = 0.5 * (F0 + F1) + (*out_l_y)[i] = ALPHA * ((*f0_y)[i] + (*f1_y)[i]); + (*out_l_co)[i] = ALPHA * ((*f0_co)[i] + (*f1_co)[i]); + (*out_l_cg)[i] = ALPHA * ((*f0_cg)[i] + (*f1_cg)[i]); + }*/ + } + +cleanup: + free(warped_f0_fwd_y); + free(warped_f0_fwd_co); + free(warped_f0_fwd_cg); + free(warped_f1_back_y); + free(warped_f1_back_co); + free(warped_f1_back_cg); + free(half_mesh_dx); + free(half_mesh_dy); + free(neg_half_mesh_dx); + free(neg_half_mesh_dy); +} + // Apply 1D temporal DWT along time axis for a spatial location (encoder side) // data[i] = frame i's coefficient value at this spatial location // 
Applies LGT 5/3 wavelet for reversibility @@ -2266,9 +3081,127 @@ static void dwt_temporal_1d_inverse_53(float *temporal_data, int num_frames) { dwt_53_inverse_1d(temporal_data, num_frames); } +// Apply 3D DWT with motion-compensated lifting (MC-lifting) +// Integrates motion compensation directly into wavelet lifting steps +// This replaces separate warping + DWT for better invertibility and compression +static void dwt_3d_forward_mc( + tav_encoder_t *enc, + float **gop_y, float **gop_co, float **gop_cg, + int num_frames, int spatial_levels, int temporal_levels, int spatial_filter +) { + if (num_frames < 2) return; + + int width = enc->width; + int height = enc->height; + int num_pixels = width * height; + + // Allocate temporary buffers for L and H bands + float **temp_l_y = malloc(num_frames * sizeof(float*)); + float **temp_l_co = malloc(num_frames * sizeof(float*)); + float **temp_l_cg = malloc(num_frames * sizeof(float*)); + float **temp_h_y = malloc(num_frames * sizeof(float*)); + float **temp_h_co = malloc(num_frames * sizeof(float*)); + float **temp_h_cg = malloc(num_frames * sizeof(float*)); + + for (int i = 0; i < num_frames; i++) { + temp_l_y[i] = malloc(num_pixels * sizeof(float)); + temp_l_co[i] = malloc(num_pixels * sizeof(float)); + temp_l_cg[i] = malloc(num_pixels * sizeof(float)); + temp_h_y[i] = malloc(num_pixels * sizeof(float)); + temp_h_co[i] = malloc(num_pixels * sizeof(float)); + temp_h_cg[i] = malloc(num_pixels * sizeof(float)); + } + + // Step 1: Apply MC-lifting temporal transform + // Process frame pairs at each decomposition level + for (int level = 0; level < temporal_levels; level++) { + int level_frames = num_frames >> level; + if (level_frames < 2) break; + + // Apply MC-lifting to each frame pair + for (int i = 0; i < level_frames; i += 2) { + int f0_idx = i; + int f1_idx = i + 1; + + if (f1_idx >= level_frames) break; + + // Get mesh for this frame pair (mesh[f1] describes motion from f0 to f1) + const short *mesh_dx = 
enc->gop_mesh_dx[f1_idx]; + const short *mesh_dy = enc->gop_mesh_dy[f1_idx]; + + // Build reliability mask for selective motion compensation + /*uint8_t *reliability_mask = malloc(num_pixels * sizeof(uint8_t)); + if (reliability_mask && enc->gop_flow_fwd_x[f1_idx] && enc->gop_flow_bwd_x[f1_idx]) { + build_reliability_mask( + enc->gop_rgb_frames[f0_idx], enc->gop_rgb_frames[f1_idx], + enc->gop_flow_fwd_x[f1_idx], enc->gop_flow_fwd_y[f1_idx], + enc->gop_flow_bwd_x[f1_idx], enc->gop_flow_bwd_y[f1_idx], + width, height, + reliability_mask + ); + } else { + // Fallback: use MC everywhere (all pixels reliable) + if (reliability_mask) { + memset(reliability_mask, 1, num_pixels * sizeof(uint8_t)); + } + }*/ + + // Apply MC-lifting: (L, H) = mc_lift(F0, F1, mesh, mask) + mc_lifting_forward_pair( + enc, + &gop_y[f0_idx], &gop_co[f0_idx], &gop_cg[f0_idx], // F0 + &gop_y[f1_idx], &gop_co[f1_idx], &gop_cg[f1_idx], // F1 + mesh_dx, mesh_dy, +// reliability_mask, // Reliability mask + &temp_l_y[i/2], &temp_l_co[i/2], &temp_l_cg[i/2], // L output + &temp_h_y[level_frames/2 + i/2], &temp_h_co[level_frames/2 + i/2], &temp_h_cg[level_frames/2 + i/2] // H output + ); + +// free(reliability_mask); + } + + // Copy L and H bands back to gop buffers for next level + int half = level_frames / 2; + for (int i = 0; i < half; i++) { + memcpy(gop_y[i], temp_l_y[i], num_pixels * sizeof(float)); + memcpy(gop_co[i], temp_l_co[i], num_pixels * sizeof(float)); + memcpy(gop_cg[i], temp_l_cg[i], num_pixels * sizeof(float)); + } + for (int i = 0; i < half; i++) { + memcpy(gop_y[half + i], temp_h_y[half + i], num_pixels * sizeof(float)); + memcpy(gop_co[half + i], temp_h_co[half + i], num_pixels * sizeof(float)); + memcpy(gop_cg[half + i], temp_h_cg[half + i], num_pixels * sizeof(float)); + } + } + + // Step 2: Apply 2D spatial DWT to each temporal subband + for (int t = 0; t < num_frames; t++) { + dwt_2d_forward_flexible(gop_y[t], width, height, spatial_levels, spatial_filter); + 
dwt_2d_forward_flexible(gop_co[t], width, height, spatial_levels, spatial_filter); + dwt_2d_forward_flexible(gop_cg[t], width, height, spatial_levels, spatial_filter); + } + + // Cleanup + for (int i = 0; i < num_frames; i++) { + free(temp_l_y[i]); + free(temp_l_co[i]); + free(temp_l_cg[i]); + free(temp_h_y[i]); + free(temp_h_co[i]); + free(temp_h_cg[i]); + } + free(temp_l_y); + free(temp_l_co); + free(temp_l_cg); + free(temp_h_y); + free(temp_h_co); + free(temp_h_cg); +} + // Apply 3D DWT: temporal DWT across frames, then spatial DWT on each temporal subband // gop_data[frame][y * width + x] - GOP buffer organized as frame-major // Modifies gop_data in-place +// NOTE: This is the OLD version without MC-lifting (kept for non-mesh mode) static void dwt_3d_forward(float **gop_data, int width, int height, int num_frames, int spatial_levels, int temporal_levels, int spatial_filter) { if (num_frames < 2 || width < 2 || height < 2) return; @@ -2290,8 +3223,8 @@ static void dwt_3d_forward(float **gop_data, int width, int height, int num_fram for (int level = 0; level < temporal_levels; level++) { int level_frames = num_frames >> level; if (level_frames >= 2) { -// dwt_temporal_1d_forward_53(temporal_line, level_frames); - dwt_haar_forward_1d(temporal_line, level_frames); +// dwt_temporal_1d_forward_53(temporal_line, level_frames); // CDF 5/3 worse for motion-compensated frames + dwt_haar_forward_1d(temporal_line, level_frames); // Haar better for imperfect alignment } } @@ -5272,6 +6205,7 @@ int main(int argc, char *argv[]) { {"delta-haar", required_argument, 0, 1018}, {"temporal-dwt", no_argument, 0, 1019}, {"temporal-3d", no_argument, 0, 1019}, + {"mesh-warp", no_argument, 0, 1020}, {"help", no_argument, 0, '?'}, {0, 0, 0, 0} }; @@ -5438,6 +6372,10 @@ int main(int argc, char *argv[]) { enc->enable_temporal_dwt = 1; printf("Temporal 3D DWT encoding enabled (GOP size: %d frames)\n", GOP_SIZE); break; + case 1020: // --mesh-warp + enc->enable_mesh_warp = 1; + 
// Block-matching motion estimation for the TAV encoder.
// Compiled as C++ and linked with the C encoder; entry points use C linkage.

#include <cstdlib>
#include <cstring>

// Extern "C" linkage for functions callable from C code
extern "C" {

// Sum of Absolute Differences between the block_size×block_size block of
// `ref` anchored at (ref_x, ref_y) and the block of `cur` anchored at
// (cur_x, cur_y). Pixels falling outside the frame contribute a fixed 255
// penalty (the maximum per-pixel difference) so candidate vectors that hang
// off the edge are strongly disfavoured instead of reading out of range.
static int compute_sad(
    const unsigned char *ref, const unsigned char *cur,
    int ref_x, int ref_y, int cur_x, int cur_y,
    int width, int height, int block_size
) {
    int sad = 0;
    for (int by = 0; by < block_size; by++) {
        for (int bx = 0; bx < block_size; bx++) {
            int ry = ref_y + by;
            int rx = ref_x + bx;
            int cy = cur_y + by;
            int cx = cur_x + bx;

            // Boundary check
            if (rx < 0 || rx >= width || ry < 0 || ry >= height ||
                cx < 0 || cx >= width || cy < 0 || cy >= height) {
                sad += 255; // Penalty for out-of-bounds
                continue;
            }

            int ref_val = ref[ry * width + rx];
            int cur_val = cur[cy * width + cx];
            sad += abs(ref_val - cur_val);
        }
    }
    return sad;
}

// Diamond-search motion estimation for one block anchored at (cx, cy).
// Walks a large (radius-2) diamond until no candidate improves the SAD,
// then refines with a small (radius-1) diamond. Every accepted step
// strictly decreases the SAD, so both loops terminate.
static void diamond_search(
    const unsigned char *ref, const unsigned char *cur,
    int cx, int cy, int width, int height, int block_size,
    int search_range, int *best_dx, int *best_dy
) {
    // Large diamond pattern (distance 2)
    const int large_diamond[8][2] = {
        {0, -2}, {-1, -1}, {1, -1}, {-2, 0},
        {2, 0}, {-1, 1}, {1, 1}, {0, 2}
    };

    // Small diamond pattern (distance 1)
    const int small_diamond[4][2] = {
        {0, -1}, {-1, 0}, {1, 0}, {0, 1}
    };

    int dx = 0, dy = 0;
    int best_sad = compute_sad(ref, cur, cx + dx, cy + dy, cx, cy, width, height, block_size);

    // Large diamond search: restart the scan after every improvement.
    bool improved = true;
    while (improved) {
        improved = false;
        for (int i = 0; i < 8; i++) {
            int test_dx = dx + large_diamond[i][0];
            int test_dy = dy + large_diamond[i][1];

            // Check search range bounds
            if (abs(test_dx) > search_range || abs(test_dy) > search_range) {
                continue;
            }

            int sad = compute_sad(ref, cur, cx + test_dx, cy + test_dy, cx, cy, width, height, block_size);
            if (sad < best_sad) {
                best_sad = sad;
                dx = test_dx;
                dy = test_dy;
                improved = true;
                break;
            }
        }
    }

    // Small diamond refinement around the large-diamond result.
    improved = true;
    while (improved) {
        improved = false;
        for (int i = 0; i < 4; i++) {
            int test_dx = dx + small_diamond[i][0];
            int test_dy = dy + small_diamond[i][1];

            if (abs(test_dx) > search_range || abs(test_dy) > search_range) {
                continue;
            }

            int sad = compute_sad(ref, cur, cx + test_dx, cy + test_dy, cx, cy, width, height, block_size);
            if (sad < best_sad) {
                best_sad = sad;
                dx = test_dx;
                dy = test_dy;
                improved = true;
                break;
            }
        }
    }

    *best_dx = dx;
    *best_dy = dy;
}

// Two-level hierarchical block-matching motion estimation.
// Level 0 searches 32×32 blocks over ±16 px with a diamond search; level 1
// refines 16×16 blocks over ±8 px on a step-2 grid around the level-0
// result. Writes one dense, block-constant flow field; the caller owns and
// must free() *out_flow_x / *out_flow_y.
//
// FIX: the previous header comment advertised a 3-level pyramid (64×64
// blocks, ±32 px search) that did not match the code — the third level was
// a commented-out dead block. The comments now describe what the code
// actually does and the dead level-2 code has been removed; the computed
// flow field is unchanged.
void estimate_motion_optical_flow(
    const unsigned char *frame1_rgb, const unsigned char *frame2_rgb,
    int width, int height,
    float **out_flow_x, float **out_flow_y
) {
    // Step 1: Convert RGB to grayscale
    unsigned char *gray1 = (unsigned char*)std::malloc(width * height);
    unsigned char *gray2 = (unsigned char*)std::malloc(width * height);

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            int idx = y * width + x;
            int rgb_idx = idx * 3;

            // ITU-R BT.601 grayscale conversion
            gray1[idx] = (unsigned char)(0.299f * frame1_rgb[rgb_idx] +
                                         0.587f * frame1_rgb[rgb_idx + 1] +
                                         0.114f * frame1_rgb[rgb_idx + 2]);
            gray2[idx] = (unsigned char)(0.299f * frame2_rgb[rgb_idx] +
                                         0.587f * frame2_rgb[rgb_idx + 1] +
                                         0.114f * frame2_rgb[rgb_idx + 2]);
        }
    }

    // Step 2: 2-level hierarchical block matching (coarse to fine)
    // Level 0: 32×32 blocks, ±16 pixel diamond search
    // Level 1: 16×16 blocks, ±8 pixel step-2 grid refinement

    *out_flow_x = (float*)std::malloc(width * height * sizeof(float));
    *out_flow_y = (float*)std::malloc(width * height * sizeof(float));

    // Initialize with zero motion
    std::memset(*out_flow_x, 0, width * height * sizeof(float));
    std::memset(*out_flow_y, 0, width * height * sizeof(float));

    // Level 0: Coarse search (32×32 blocks, ±16px)
    const int block_size_l0 = 32;
    const int search_range_l0 = 16;

    for (int by = 0; by < height; by += block_size_l0) {
        for (int bx = 0; bx < width; bx += block_size_l0) {
            int dx = 0, dy = 0;
            diamond_search(gray1, gray2, bx, by, width, height,
                           block_size_l0, search_range_l0, &dx, &dy);

            // Fill flow for this block
            for (int y = by; y < by + block_size_l0 && y < height; y++) {
                for (int x = bx; x < bx + block_size_l0 && x < width; x++) {
                    int idx = y * width + x;
                    (*out_flow_x)[idx] = (float)dx;
                    (*out_flow_y)[idx] = (float)dy;
                }
            }
        }
    }

    // Level 1: Fine refinement (16×16 blocks, ±8px around the level-0 guess)
    const int block_size_l1 = 16;
    const int search_range_l1 = 8;

    for (int by = 0; by < height; by += block_size_l1) {
        for (int bx = 0; bx < width; bx += block_size_l1) {
            // Get initial guess from level 0 (flow is block-constant, so the
            // top-left pixel of this block carries the level-0 vector).
            int init_dx = (int)(*out_flow_x)[by * width + bx];
            int init_dy = (int)(*out_flow_y)[by * width + bx];

            int best_dx = init_dx;
            int best_dy = init_dy;
            int best_sad = compute_sad(gray1, gray2, bx + init_dx, by + init_dy,
                                       bx, by, width, height, block_size_l1);

            // Local step-2 grid search around the initial guess
            for (int dy = -search_range_l1; dy <= search_range_l1; dy += 2) {
                for (int dx = -search_range_l1; dx <= search_range_l1; dx += 2) {
                    int test_dx = init_dx + dx;
                    int test_dy = init_dy + dy;

                    int sad = compute_sad(gray1, gray2, bx + test_dx, by + test_dy,
                                          bx, by, width, height, block_size_l1);
                    if (sad < best_sad) {
                        best_sad = sad;
                        best_dx = test_dx;
                        best_dy = test_dy;
                    }
                }
            }

            // Fill flow for this block
            for (int y = by; y < by + block_size_l1 && y < height; y++) {
                for (int x = bx; x < bx + block_size_l1 && x < width; x++) {
                    int idx = y * width + x;
                    (*out_flow_x)[idx] = (float)best_dx;
                    (*out_flow_y)[idx] = (float)best_dy;
                }
            }
        }
    }

    std::free(gray1);
    std::free(gray2);
}

} // extern "C"
extern "C" {

// Downsample a dense per-pixel flow field onto a coarse mesh_w×mesh_h grid
// of control points. Each control point takes the average flow over the
// 5×5 pixel window centred on its cell centre (clipped at frame borders),
// quantized to 1/8-pixel units in the output int16 arrays.
void build_mesh_from_flow(
    const float *flow_x, const float *flow_y,
    int width, int height,
    int mesh_w, int mesh_h,
    short *mesh_dx, short *mesh_dy // Output: 1/8 pixel precision
) {
    const int cell_w = width / mesh_w;
    const int cell_h = height / mesh_h;

    for (int gy = 0; gy < mesh_h; gy++) {
        for (int gx = 0; gx < mesh_w; gx++) {
            // Control point sits at the centre of its cell.
            const int center_x = gx * cell_w + cell_w / 2;
            const int center_y = gy * cell_h + cell_h / 2;

            // Accumulate flow over the 5×5 window, skipping off-frame taps.
            float acc_x = 0.0f, acc_y = 0.0f;
            int taps = 0;
            for (int wy = center_y - 2; wy <= center_y + 2; wy++) {
                if (wy < 0 || wy >= height) continue;
                for (int wx = center_x - 2; wx <= center_x + 2; wx++) {
                    if (wx < 0 || wx >= width) continue;
                    const int at = wy * width + wx;
                    acc_x += flow_x[at];
                    acc_y += flow_y[at];
                    taps++;
                }
            }

            // Average, then quantize to 1/8-pixel fixed point.
            const float mean_x = (taps > 0) ? (acc_x / taps) : 0.0f;
            const float mean_y = (taps > 0) ? (acc_y / taps) : 0.0f;
            const int out = gy * mesh_w + gx;
            mesh_dx[out] = (short)(mean_x * 8.0f);
            mesh_dy[out] = (short)(mean_y * 8.0f);
        }
    }
}

// Iterative Laplacian relaxation of the mesh: each control point is pulled
// toward the mean of its 4-connected neighbours by `smoothness` (0..1),
// repeated `iterations` times. Suppresses fold-overs and single-cell noise
// while `1 - smoothness` preserves the original (data-term) displacement.
void smooth_mesh_laplacian(
    short *mesh_dx, short *mesh_dy,
    int mesh_width, int mesh_height,
    float smoothness, int iterations
) {
    short *prev_dx = (short*)std::malloc(mesh_width * mesh_height * sizeof(short));
    short *prev_dy = (short*)std::malloc(mesh_width * mesh_height * sizeof(short));

    // 4-connected neighbour offsets: up, down, left, right.
    static const int off_x[4] = {0, 0, -1, 1};
    static const int off_y[4] = {-1, 1, 0, 0};

    for (int pass = 0; pass < iterations; pass++) {
        // Relax against a snapshot so all updates within a pass read the
        // same (pre-pass) mesh state.
        std::memcpy(prev_dx, mesh_dx, mesh_width * mesh_height * sizeof(short));
        std::memcpy(prev_dy, mesh_dy, mesh_width * mesh_height * sizeof(short));

        for (int gy = 0; gy < mesh_height; gy++) {
            for (int gx = 0; gx < mesh_width; gx++) {
                float acc_x = 0.0f, acc_y = 0.0f;
                int found = 0;

                for (int n = 0; n < 4; n++) {
                    const int qx = gx + off_x[n];
                    const int qy = gy + off_y[n];
                    if (qx < 0 || qx >= mesh_width || qy < 0 || qy >= mesh_height)
                        continue;
                    const int q = qy * mesh_width + qx;
                    acc_x += prev_dx[q];
                    acc_y += prev_dy[q];
                    found++;
                }

                if (found > 0) {
                    const int at = gy * mesh_width + gx;
                    const float keep = 1.0f - smoothness; // data-term weight
                    mesh_dx[at] = (short)(keep * prev_dx[at] + smoothness * (acc_x / found));
                    mesh_dy[at] = (short)(keep * prev_dy[at] + smoothness * (acc_y / found));
                }
            }
        }
    }

    std::free(prev_dx);
    std::free(prev_dy);
}

} // extern "C"
extern "C" {

// Warp one float-valued frame channel by the coarse motion mesh.
// Inverse mapping (destination → source, so no holes appear): for every
// destination pixel a motion vector is bilinearly interpolated from the
// four surrounding mesh control points (stored in 1/8-pixel units), then
// the source is bilinearly sampled at (x + dx, y + dy).
void warp_frame_with_mesh(
    const float *src_frame, int width, int height,
    const short *mesh_dx, const short *mesh_dy,
    int mesh_width, int mesh_height,
    float *dst_frame
) {
    const int cw = width / mesh_width;
    const int ch = height / mesh_height;

    for (int oy = 0; oy < height; oy++) {
        for (int ox = 0; ox < width; ox++) {
            // Containing mesh cell, clamped so a 2×2 corner neighbourhood
            // of control points always exists.
            int mx = ox / cw;
            int my = oy / ch;
            if (mx > mesh_width - 2)  mx = mesh_width - 2;
            if (my > mesh_height - 2) my = mesh_height - 2;
            if (mx < 0) mx = 0;
            if (my < 0) my = 0;

            // Indices of the four corner control points.
            const int c00 = my * mesh_width + mx;
            const int c10 = c00 + 1;
            const int c01 = c00 + mesh_width;
            const int c11 = c01 + 1;

            // Position inside the cell (control points sit at cell centres);
            // the cell span between adjacent centres is exactly cw / ch.
            const float px0 = mx * cw + cw / 2.0f;
            const float py0 = my * ch + ch / 2.0f;
            float u = (ox - px0) / (float)cw;
            float v = (oy - py0) / (float)ch;
            if (u < 0.0f) u = 0.0f;
            if (u > 1.0f) u = 1.0f;
            if (v < 0.0f) v = 0.0f;
            if (v > 1.0f) v = 1.0f;

            // Bilinearly blend the four corner motion vectors (1/8-px → px).
            const float dx = (1 - u) * (1 - v) * (mesh_dx[c00] / 8.0f)
                           + u * (1 - v) * (mesh_dx[c10] / 8.0f)
                           + (1 - u) * v * (mesh_dx[c01] / 8.0f)
                           + u * v * (mesh_dx[c11] / 8.0f);
            const float dy = (1 - u) * (1 - v) * (mesh_dy[c00] / 8.0f)
                           + u * (1 - v) * (mesh_dy[c10] / 8.0f)
                           + (1 - u) * v * (mesh_dy[c01] / 8.0f)
                           + u * v * (mesh_dy[c11] / 8.0f);

            // Source coordinates (inverse warp: dst → src).
            const float sx = ox + dx;
            const float sy = oy + dy;

            int x0 = (int)std::floor(sx);
            int y0 = (int)std::floor(sy);
            int x1 = x0 + 1;
            int y1 = y0 + 1;

            // Clamp all four taps to the frame.
            if (x0 < 0) x0 = 0;
            if (y0 < 0) y0 = 0;
            if (x1 >= width)  x1 = width - 1;
            if (y1 >= height) y1 = height - 1;
            if (x0 >= width)  x0 = width - 1;
            if (y0 >= height) y0 = height - 1;

            // NOTE: fractions are taken after clamping, so at frame edges
            // they may fall outside [0,1] (extrapolating weights) — this
            // matches the established behaviour of the encoder.
            const float fx = sx - x0;
            const float fy = sy - y0;

            const float s00 = src_frame[y0 * width + x0];
            const float s10 = src_frame[y0 * width + x1];
            const float s01 = src_frame[y1 * width + x0];
            const float s11 = src_frame[y1 * width + x1];

            dst_frame[oy * width + ox] =
                (1 - fx) * (1 - fy) * s00 +
                fx * (1 - fy) * s10 +
                (1 - fx) * fy * s01 +
                fx * fy * s11;
        }
    }
}

} // extern "C"
// Estimate a per-cell motion model from the dense flow field.
// Fits both a translation-only model and a full affine model
//   [vx]   [a11 a12][px]   [tx]
//   [vy] = [a21 a22][py] + [ty]
// to 16 flow samples taken on a 4×4 grid inside the cell (px/py relative to
// the cell centre), then keeps the affine model only if it reduces the
// squared residual by more than `threshold` (fractional, e.g. 0.10 = 10%).
//
// Outputs: translation in 1/8-pixel fixed point, affine matrix in 1/256
// fixed point (256 = identity). Returns 1 if affine was chosen, 0 for
// translation-only (in which case the matrix is set to identity).
int estimate_cell_affine(
    const float *flow_x, const float *flow_y,
    int width, int height,
    int cell_x, int cell_y,       // Cell position in mesh coordinates
    int cell_w, int cell_h,       // Cell size in pixels
    float threshold,              // Residual improvement threshold (e.g. 0.10 = 10%)
    short *out_tx, short *out_ty, // Translation (1/8 pixel)
    short *out_a11, short *out_a12, // Affine matrix (1/256 fixed-point)
    short *out_a21, short *out_a22
) {
    // Compute cell bounding box, clipped to the frame.
    int x_start = cell_x * cell_w;
    int y_start = cell_y * cell_h;
    int x_end = (cell_x + 1) * cell_w;
    int y_end = (cell_y + 1) * cell_h;
    if (x_end > width) x_end = width;
    if (y_end > height) y_end = height;

    // Sample motion vectors from a 4×4 grid within the cell.
    const int samples_x = 4;
    const int samples_y = 4;
    float sample_motion_x[16];
    float sample_motion_y[16];
    int sample_px[16];
    int sample_py[16];
    int n_samples = 0;

    for (int sy = 0; sy < samples_y; sy++) {
        for (int sx = 0; sx < samples_x; sx++) {
            int px = x_start + (x_end - x_start) * sx / (samples_x - 1);
            int py = y_start + (y_end - y_start) * sy / (samples_y - 1);

            if (px >= width) px = width - 1;
            if (py >= height) py = height - 1;

            int idx = py * width + px;
            sample_motion_x[n_samples] = flow_x[idx];
            sample_motion_y[n_samples] = flow_y[idx];
            sample_px[n_samples] = px - (x_start + x_end) / 2; // Relative to cell center
            sample_py[n_samples] = py - (y_start + y_end) / 2;
            n_samples++;
        }
    }

    // 1. Compute translation-only model (average motion).
    float avg_dx = 0, avg_dy = 0;
    for (int i = 0; i < n_samples; i++) {
        avg_dx += sample_motion_x[i];
        avg_dy += sample_motion_y[i];
    }
    avg_dx /= n_samples;
    avg_dy /= n_samples;

    // Translation residual (sum of squared errors against the mean).
    float trans_residual = 0;
    for (int i = 0; i < n_samples; i++) {
        float dx_err = sample_motion_x[i] - avg_dx;
        float dy_err = sample_motion_y[i] - avg_dy;
        trans_residual += dx_err * dx_err + dy_err * dy_err;
    }

    // 2. Estimate affine model using least-squares.
    // Solve:  [vx] = [a11 a12][px] + [tx]
    //         [vy]   [a21 a22][py]   [ty]
    // via the normal equations. Both rows share the same 3×3 moment matrix
    //   [sum_xx sum_xy sum_x]
    //   [sum_xy sum_yy sum_y]
    //   [sum_x  sum_y  n   ]
    // so only one determinant is needed.
    double sum_x = 0, sum_y = 0, sum_xx = 0, sum_yy = 0, sum_xy = 0;
    double sum_vx = 0, sum_vy = 0, sum_vx_x = 0, sum_vx_y = 0;
    double sum_vy_x = 0, sum_vy_y = 0;

    for (int i = 0; i < n_samples; i++) {
        double px = sample_px[i];
        double py = sample_py[i];
        double vx = sample_motion_x[i];
        double vy = sample_motion_y[i];

        sum_x += px;
        sum_y += py;
        sum_xx += px * px;
        sum_yy += py * py;
        sum_xy += px * py;
        sum_vx += vx;
        sum_vy += vy;
        sum_vx_x += vx * px;
        sum_vx_y += vx * py;
        sum_vy_x += vy * px;
        sum_vy_y += vy * py;
    }

    // Determinant of the shared 3×3 moment matrix.
    double n = n_samples;
    double det = n * sum_xx * sum_yy + 2 * sum_x * sum_y * sum_xy -
                 sum_xx * sum_y * sum_y - sum_yy * sum_x * sum_x - n * sum_xy * sum_xy;

    if (fabs(det) < 1e-6) {
        // Singular matrix (degenerate sample geometry): fall back to
        // translation with an identity affine matrix.
        *out_tx = (short)(avg_dx * 8.0f);
        *out_ty = (short)(avg_dy * 8.0f);
        *out_a11 = 256; // Identity
        *out_a12 = 0;
        *out_a21 = 0;
        *out_a22 = 256;
        return 0; // Translation only
    }

    // Solve for affine parameters by cofactor expansion (Cramer's rule);
    // tx/ty recovered from the mean once a11..a22 are known.
    double a11 = (sum_vx_x * sum_yy * n - sum_vx_y * sum_xy * n - sum_vx * sum_y * sum_y +
                  sum_vx * sum_xy * sum_y + sum_vx_y * sum_x * sum_y - sum_vx_x * sum_y * sum_y) / det;
    double a12 = (sum_vx_y * sum_xx * n - sum_vx_x * sum_xy * n - sum_vx * sum_x * sum_xy +
                  sum_vx * sum_xx * sum_y + sum_vx_x * sum_x * sum_y - sum_vx_y * sum_x * sum_x) / det;
    double tx = (sum_vx - a11 * sum_x - a12 * sum_y) / n;

    double a21 = (sum_vy_x * sum_yy * n - sum_vy_y * sum_xy * n - sum_vy * sum_y * sum_y +
                  sum_vy * sum_xy * sum_y + sum_vy_y * sum_x * sum_y - sum_vy_x * sum_y * sum_y) / det;
    double a22 = (sum_vy_y * sum_xx * n - sum_vy_x * sum_xy * n - sum_vy * sum_x * sum_xy +
                  sum_vy * sum_xx * sum_y + sum_vy_x * sum_x * sum_y - sum_vy_y * sum_x * sum_x) / det;
    double ty = (sum_vy - a21 * sum_x - a22 * sum_y) / n;

    // Affine residual (sum of squared prediction errors).
    float affine_residual = 0;
    for (int i = 0; i < n_samples; i++) {
        double px = sample_px[i];
        double py = sample_py[i];
        double pred_vx = a11 * px + a12 * py + tx;
        double pred_vy = a21 * px + a22 * py + ty;
        double dx_err = sample_motion_x[i] - pred_vx;
        double dy_err = sample_motion_y[i] - pred_vy;
        affine_residual += dx_err * dx_err + dy_err * dy_err;
    }

    // Decision: use affine only if the fractional residual improvement
    // exceeds the threshold (+1e-6 guards against divide-by-zero).
    float improvement = (trans_residual - affine_residual) / (trans_residual + 1e-6f);

    if (improvement > threshold) {
        // Use affine
        *out_tx = (short)(tx * 8.0f);
        *out_ty = (short)(ty * 8.0f);
        *out_a11 = (short)(a11 * 256.0);
        *out_a12 = (short)(a12 * 256.0);
        *out_a21 = (short)(a21 * 256.0);
        *out_a22 = (short)(a22 * 256.0);
        return 1; // Affine
    } else {
        // Use translation
        *out_tx = (short)(avg_dx * 8.0f);
        *out_ty = (short)(avg_dy * 8.0f);
        *out_a11 = 256; // Identity
        *out_a12 = 0;
        *out_a21 = 0;
        *out_a22 = 256;
        return 0; // Translation only
    }
}
    );

    void smooth_mesh_laplacian(
        int16_t *mesh_dx, int16_t *mesh_dy,
        int mesh_width, int mesh_height,
        float smoothness, int iterations
    );
}

// Mesh warp with bilinear interpolation (translation only).
// Test-harness duplicate of the encoder's warp, operating on 8-bit BGR
// cv::Mat data instead of float channels.
// NOTE(review): the <cv::Vec3b> template arguments on Mat::at were lost in
// a text-mangling pass and have been restored here — confirm against the
// original file.
static void apply_mesh_warp_rgb(
    const cv::Mat &src,
    cv::Mat &dst,
    const int16_t *mesh_dx,
    const int16_t *mesh_dy,
    int mesh_w, int mesh_h
) {
    int width = src.cols;
    int height = src.rows;
    int cell_w = width / mesh_w;
    int cell_h = height / mesh_h;

    dst = cv::Mat(height, width, CV_8UC3);

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            // Containing mesh cell, clamped so a 2×2 corner neighbourhood exists.
            int cell_x = x / cell_w;
            int cell_y = y / cell_h;

            cell_x = std::min(cell_x, mesh_w - 2);
            cell_y = std::min(cell_y, mesh_h - 2);

            int idx_00 = cell_y * mesh_w + cell_x;
            int idx_10 = idx_00 + 1;
            int idx_01 = (cell_y + 1) * mesh_w + cell_x;
            int idx_11 = idx_01 + 1;

            // Control points sit at cell centres.
            float cp_x0 = cell_x * cell_w + cell_w / 2.0f;
            float cp_y0 = cell_y * cell_h + cell_h / 2.0f;
            float cp_x1 = (cell_x + 1) * cell_w + cell_w / 2.0f;
            float cp_y1 = (cell_y + 1) * cell_h + cell_h / 2.0f;

            float alpha = (x - cp_x0) / (cp_x1 - cp_x0);
            float beta = (y - cp_y0) / (cp_y1 - cp_y0);
            alpha = std::max(0.0f, std::min(1.0f, alpha));
            beta = std::max(0.0f, std::min(1.0f, beta));

            // Bilinear blend of the four corner motion vectors (1/8-px units).
            float dx = (1 - alpha) * (1 - beta) * (mesh_dx[idx_00] / 8.0f) +
                       alpha * (1 - beta) * (mesh_dx[idx_10] / 8.0f) +
                       (1 - alpha) * beta * (mesh_dx[idx_01] / 8.0f) +
                       alpha * beta * (mesh_dx[idx_11] / 8.0f);

            float dy = (1 - alpha) * (1 - beta) * (mesh_dy[idx_00] / 8.0f) +
                       alpha * (1 - beta) * (mesh_dy[idx_10] / 8.0f) +
                       (1 - alpha) * beta * (mesh_dy[idx_01] / 8.0f) +
                       alpha * beta * (mesh_dy[idx_11] / 8.0f);

            // Inverse warp: sample the source at dst position + motion.
            float src_x = x + dx;
            float src_y = y + dy;

            int sx0 = (int)floorf(src_x);
            int sy0 = (int)floorf(src_y);
            int sx1 = sx0 + 1;
            int sy1 = sy0 + 1;

            sx0 = std::max(0, std::min(width - 1, sx0));
            sy0 = std::max(0, std::min(height - 1, sy0));
            sx1 = std::max(0, std::min(width - 1, sx1));
            sy1 = std::max(0, std::min(height - 1, sy1));

            // Fractions taken after clamping (may extrapolate at frame edges).
            float fx = src_x - sx0;
            float fy = src_y - sy0;

            for (int c = 0; c < 3; c++) {
                float val_00 = src.at<cv::Vec3b>(sy0, sx0)[c];
                float val_10 = src.at<cv::Vec3b>(sy0, sx1)[c];
                float val_01 = src.at<cv::Vec3b>(sy1, sx0)[c];
                float val_11 = src.at<cv::Vec3b>(sy1, sx1)[c];

                float val = (1 - fx) * (1 - fy) * val_00 +
                            fx * (1 - fy) * val_10 +
                            (1 - fx) * fy * val_01 +
                            fx * fy * val_11;

                dst.at<cv::Vec3b>(y, x)[c] = (unsigned char)std::max(0.0f, std::min(255.0f, val));
            }
        }
    }
}

// Round-trip consistency test driver.
// For `num_tests` random frame pairs: estimates a mesh F0→F1, then measures
// (1) forward-warp PSNR against F1, (2) full round-trip PSNR (warp forward
// then with the negated mesh, compared to F0), and (3) half-step symmetric
// round-trip PSNR (±½mesh, the MC-lifting pattern). Saves PNG visualizations
// for the first test and any round-trip below 30 dB.
int main(int argc, char** argv) {
    const char* video_file = (argc > 1) ? argv[1] : "test_video.mp4";
    int num_tests = (argc > 2) ? atoi(argv[2]) : 5;

    printf("Opening video: %s\n", video_file);
    cv::VideoCapture cap(video_file);

    if (!cap.isOpened()) {
        fprintf(stderr, "Error: Cannot open video file\n");
        return 1;
    }

    int total_frames = (int)cap.get(cv::CAP_PROP_FRAME_COUNT);
    int width = (int)cap.get(cv::CAP_PROP_FRAME_WIDTH);
    int height = (int)cap.get(cv::CAP_PROP_FRAME_HEIGHT);

    printf("Video: %dx%d, %d frames\n", width, height, total_frames);

    // Mesh dimensions (approximately 32×32-pixel cells, at least 2×2 points).
    int mesh_cell_size = 32;
    int mesh_w = (width + mesh_cell_size - 1) / mesh_cell_size;
    int mesh_h = (height + mesh_cell_size - 1) / mesh_cell_size;
    if (mesh_w < 2) mesh_w = 2;
    if (mesh_h < 2) mesh_h = 2;

    printf("Mesh: %dx%d (approx %dx%d px cells)\n\n",
           mesh_w, mesh_h, width / mesh_w, height / mesh_h);

    float smoothness = 0.5f;
    int smooth_iterations = 8;

    srand(time(NULL));

    double total_forward_psnr = 0.0;
    double total_roundtrip_psnr = 0.0;
    double total_half_roundtrip_psnr = 0.0;

    for (int test = 0; test < num_tests; test++) {
        // Random pair away from the clip edges.
        // NOTE(review): assumes total_frames > 10 — `% (total_frames - 10)`
        // is UB/zero-division for shorter clips; verify input requirements.
        int frame_num = 5 + rand() % (total_frames - 10);

        printf("[Test %d/%d] Frame pair %d → %d\n", test + 1, num_tests, frame_num - 1, frame_num);

        cap.set(cv::CAP_PROP_POS_FRAMES, frame_num - 1);
        cv::Mat frame0, frame1;
        cap >> frame0;
        cap >> frame1;

        if (frame0.empty() || frame1.empty()) {
            fprintf(stderr, "Error reading frames\n");
            continue;
        }

        // Encoder consumes RGB; OpenCV decodes BGR.
        cv::Mat frame0_rgb, frame1_rgb;
        cv::cvtColor(frame0, frame0_rgb, cv::COLOR_BGR2RGB);
        cv::cvtColor(frame1, frame1_rgb, cv::COLOR_BGR2RGB);

        // Compute mesh (F0 → F1): dense flow → coarse mesh → smoothing.
        float *flow_x = nullptr, *flow_y = nullptr;
        estimate_motion_optical_flow(frame0_rgb.data, frame1_rgb.data,
                                     width, height, &flow_x, &flow_y);

        int16_t *mesh_dx = (int16_t*)malloc(mesh_w * mesh_h * sizeof(int16_t));
        int16_t *mesh_dy = (int16_t*)malloc(mesh_w * mesh_h * sizeof(int16_t));
        build_mesh_from_flow(flow_x, flow_y, width, height, mesh_w, mesh_h, mesh_dx, mesh_dy);
        smooth_mesh_laplacian(mesh_dx, mesh_dy, mesh_w, mesh_h, smoothness, smooth_iterations);

        // Create inverted mesh (negated vectors approximate the inverse warp).
        int16_t *inv_mesh_dx = (int16_t*)malloc(mesh_w * mesh_h * sizeof(int16_t));
        int16_t *inv_mesh_dy = (int16_t*)malloc(mesh_w * mesh_h * sizeof(int16_t));
        for (int i = 0; i < mesh_w * mesh_h; i++) {
            inv_mesh_dx[i] = -mesh_dx[i];
            inv_mesh_dy[i] = -mesh_dy[i];
        }

        // Create half-mesh (±½ motion) for the symmetric lifting test.
        int16_t *half_mesh_dx = (int16_t*)malloc(mesh_w * mesh_h * sizeof(int16_t));
        int16_t *half_mesh_dy = (int16_t*)malloc(mesh_w * mesh_h * sizeof(int16_t));
        int16_t *neg_half_mesh_dx = (int16_t*)malloc(mesh_w * mesh_h * sizeof(int16_t));
        int16_t *neg_half_mesh_dy = (int16_t*)malloc(mesh_w * mesh_h * sizeof(int16_t));
        for (int i = 0; i < mesh_w * mesh_h; i++) {
            half_mesh_dx[i] = mesh_dx[i] / 2;
            half_mesh_dy[i] = mesh_dy[i] / 2;
            neg_half_mesh_dx[i] = -half_mesh_dx[i];
            neg_half_mesh_dy[i] = -half_mesh_dy[i];
        }

        // TEST 1: Full forward warp quality (F0 → F1).
        cv::Mat warped_forward;
        apply_mesh_warp_rgb(frame0, warped_forward, mesh_dx, mesh_dy, mesh_w, mesh_h);

        double forward_mse = 0.0;
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++) {
                for (int c = 0; c < 3; c++) {
                    double diff = (double)warped_forward.at<cv::Vec3b>(y, x)[c] -
                                  (double)frame1.at<cv::Vec3b>(y, x)[c];
                    forward_mse += diff * diff;
                }
            }
        }
        forward_mse /= (width * height * 3);
        // 999.0 stands in for "infinite" PSNR on an exact match.
        double forward_psnr = (forward_mse > 0) ? 10.0 * log10(255.0 * 255.0 / forward_mse) : 999.0;
        total_forward_psnr += forward_psnr;

        // TEST 2: Full round-trip (F0 → forward → backward → F0').
        cv::Mat roundtrip;
        apply_mesh_warp_rgb(warped_forward, roundtrip, inv_mesh_dx, inv_mesh_dy, mesh_w, mesh_h);

        double roundtrip_mse = 0.0;
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++) {
                for (int c = 0; c < 3; c++) {
                    double diff = (double)roundtrip.at<cv::Vec3b>(y, x)[c] -
                                  (double)frame0.at<cv::Vec3b>(y, x)[c];
                    roundtrip_mse += diff * diff;
                }
            }
        }
        roundtrip_mse /= (width * height * 3);
        double roundtrip_psnr = (roundtrip_mse > 0) ? 10.0 * log10(255.0 * 255.0 / roundtrip_mse) : 999.0;
        total_roundtrip_psnr += roundtrip_psnr;

        // TEST 3: Half-step symmetric round-trip (MC-lifting style).
        // F0 → +½mesh, then → -½mesh (should return to F0).
        cv::Mat half_forward, half_roundtrip;
        apply_mesh_warp_rgb(frame0, half_forward, half_mesh_dx, half_mesh_dy, mesh_w, mesh_h);
        apply_mesh_warp_rgb(half_forward, half_roundtrip, neg_half_mesh_dx, neg_half_mesh_dy, mesh_w, mesh_h);

        double half_roundtrip_mse = 0.0;
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++) {
                for (int c = 0; c < 3; c++) {
                    double diff = (double)half_roundtrip.at<cv::Vec3b>(y, x)[c] -
                                  (double)frame0.at<cv::Vec3b>(y, x)[c];
                    half_roundtrip_mse += diff * diff;
                }
            }
        }
        half_roundtrip_mse /= (width * height * 3);
        double half_roundtrip_psnr = (half_roundtrip_mse > 0) ?
            10.0 * log10(255.0 * 255.0 / half_roundtrip_mse) : 999.0;
        total_half_roundtrip_psnr += half_roundtrip_psnr;

        printf("  Forward warp (F0→F1):     PSNR = %.2f dB\n", forward_psnr);
        printf("  Full round-trip (F0→F0'): PSNR = %.2f dB\n", roundtrip_psnr);
        printf("  Half round-trip (±½mesh): PSNR = %.2f dB\n", half_roundtrip_psnr);

        // Compute motion stats (mesh values are 1/8-pixel fixed point).
        float avg_motion = 0.0f, max_motion = 0.0f;
        for (int i = 0; i < mesh_w * mesh_h; i++) {
            float dx = mesh_dx[i] / 8.0f;
            float dy = mesh_dy[i] / 8.0f;
            float motion = sqrtf(dx * dx + dy * dy);
            avg_motion += motion;
            if (motion > max_motion) max_motion = motion;
        }
        avg_motion /= (mesh_w * mesh_h);
        printf("  Motion: avg=%.2f px, max=%.2f px\n\n", avg_motion, max_motion);

        // Save visualization for the first test and any poor round-trip.
        if (test == 0 || roundtrip_psnr < 30.0) {
            char filename[256];
            sprintf(filename, "roundtrip_%04d_original.png", frame_num);
            cv::imwrite(filename, frame0);
            sprintf(filename, "roundtrip_%04d_forward.png", frame_num);
            cv::imwrite(filename, warped_forward);
            sprintf(filename, "roundtrip_%04d_roundtrip.png", frame_num);
            cv::imwrite(filename, roundtrip);

            // Difference image, amplified 5× for visibility.
            cv::Mat diff_roundtrip = cv::Mat::zeros(height, width, CV_8UC3);
            for (int y = 0; y < height; y++) {
                for (int x = 0; x < width; x++) {
                    for (int c = 0; c < 3; c++) {
                        int diff = abs((int)roundtrip.at<cv::Vec3b>(y, x)[c] -
                                       (int)frame0.at<cv::Vec3b>(y, x)[c]);
                        diff_roundtrip.at<cv::Vec3b>(y, x)[c] = std::min(diff * 5, 255);
                    }
                }
            }
            sprintf(filename, "roundtrip_%04d_diff.png", frame_num);
            cv::imwrite(filename, diff_roundtrip);
            printf("  Saved visualization: roundtrip_%04d_*.png\n\n", frame_num);
        }

        free(flow_x);
        free(flow_y);
        free(mesh_dx);
        free(mesh_dy);
        free(inv_mesh_dx);
        free(inv_mesh_dy);
        free(half_mesh_dx);
        free(half_mesh_dy);
        free(neg_half_mesh_dx);
        free(neg_half_mesh_dy);
    }

    printf("===========================================\n");
    printf("Average Results (%d tests):\n", num_tests);
    printf("  Forward warp quality:  %.2f dB\n", total_forward_psnr / num_tests);
    printf("  Full round-trip error: %.2f dB\n", total_roundtrip_psnr / num_tests);
    printf("  Half round-trip error: %.2f dB\n", total_half_roundtrip_psnr / num_tests);
    printf("===========================================\n\n");

    // 35 dB is the acceptance bar for MC-lifting invertibility.
    if (total_roundtrip_psnr / num_tests < 35.0) {
        printf("WARNING: Round-trip PSNR < 35 dB indicates poor invertibility!\n");
        printf("This will cause MC-lifting to accumulate errors and hurt compression.\n");
        printf("Bilinear interpolation artifacts are likely the culprit.\n");
    } else {
        printf("Round-trip consistency looks acceptable (>35 dB).\n");
    }

    cap.release();
    return 0;
}
        cell_w, int cell_h,
        float threshold,
        int16_t *out_tx, int16_t *out_ty,
        int16_t *out_a11, int16_t *out_a12,
        int16_t *out_a21, int16_t *out_a22
    );
}

// Mesh warp with bilinear interpolation and optional per-cell affine support.
// Translation vectors are bilinearly blended from the four surrounding
// control points; cells flagged in `affine_mask` additionally apply a
// 2×2 affine matrix (1/256 fixed point) around the cell centre.
// NOTE(review): the <cv::Vec3b> template arguments on Mat::at were lost in
// a text-mangling pass and have been restored here — confirm against the
// original file.
static void apply_mesh_warp_rgb(
    const cv::Mat &src,        // Input BGR image
    cv::Mat &dst,              // Output warped BGR image
    const int16_t *mesh_dx,    // Mesh motion vectors (1/8 pixel)
    const int16_t *mesh_dy,
    const uint8_t *affine_mask, // 1=affine, 0=translation
    const int16_t *affine_a11,
    const int16_t *affine_a12,
    const int16_t *affine_a21,
    const int16_t *affine_a22,
    int mesh_w, int mesh_h
) {
    int width = src.cols;
    int height = src.rows;
    int cell_w = width / mesh_w;
    int cell_h = height / mesh_h;

    dst = cv::Mat(height, width, CV_8UC3);

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            int cell_x = x / cell_w;
            int cell_y = y / cell_h;

            // Clamp to valid mesh range (so a 2×2 corner neighbourhood
            // exists; pixels in the last row/column of cells reuse the
            // second-to-last cell's corners).
            cell_x = std::min(cell_x, mesh_w - 2);
            cell_y = std::min(cell_y, mesh_h - 2);

            // Four corner control points
            int idx_00 = cell_y * mesh_w + cell_x;
            int idx_10 = idx_00 + 1;
            int idx_01 = (cell_y + 1) * mesh_w + cell_x;
            int idx_11 = idx_01 + 1;

            // Control point positions (cell centres)
            float cp_x0 = cell_x * cell_w + cell_w / 2.0f;
            float cp_y0 = cell_y * cell_h + cell_h / 2.0f;
            float cp_x1 = (cell_x + 1) * cell_w + cell_w / 2.0f;
            float cp_y1 = (cell_y + 1) * cell_h + cell_h / 2.0f;

            // Local coordinates within the cell, clamped to [0,1]
            float alpha = (x - cp_x0) / (cp_x1 - cp_x0);
            float beta = (y - cp_y0) / (cp_y1 - cp_y0);
            alpha = std::max(0.0f, std::min(1.0f, alpha));
            beta = std::max(0.0f, std::min(1.0f, beta));

            // Bilinear interpolation of motion vectors (1/8-px → px)
            float dx = (1 - alpha) * (1 - beta) * (mesh_dx[idx_00] / 8.0f) +
                       alpha * (1 - beta) * (mesh_dx[idx_10] / 8.0f) +
                       (1 - alpha) * beta * (mesh_dx[idx_01] / 8.0f) +
                       alpha * beta * (mesh_dx[idx_11] / 8.0f);

            float dy = (1 - alpha) * (1 - beta) * (mesh_dy[idx_00] / 8.0f) +
                       alpha * (1 - beta) * (mesh_dy[idx_10] / 8.0f) +
                       (1 - alpha) * beta * (mesh_dy[idx_01] / 8.0f) +
                       alpha * beta * (mesh_dy[idx_11] / 8.0f);

            // Check if we're using affine in this cell.
            // For simplicity, just use the top-left corner's affine
            // parameters — this can introduce discontinuities at cell
            // borders, unlike the bilinearly-blended translation term.
            int cell_idx = cell_y * mesh_w + cell_x;
            if (affine_mask && affine_mask[cell_idx]) {
                // Apply affine transform around the cell centre
                float rel_x = x - (cell_x * cell_w + cell_w / 2.0f);
                float rel_y = y - (cell_y * cell_h + cell_h / 2.0f);

                // 1/256 fixed point → float
                float a11 = affine_a11[cell_idx] / 256.0f;
                float a12 = affine_a12[cell_idx] / 256.0f;
                float a21 = affine_a21[cell_idx] / 256.0f;
                float a22 = affine_a22[cell_idx] / 256.0f;

                // Affine warp: [x'] = [a11 a12][x] + [dx]
                //              [y']   [a21 a22][y]   [dy]
                dx = a11 * rel_x + a12 * rel_y + dx;
                dy = a21 * rel_x + a22 * rel_y + dy;
            }

            // Source coordinates (inverse warp)
            float src_x = x + dx;
            float src_y = y + dy;

            // Bilinear interpolation of the source pixel
            int sx0 = (int)floorf(src_x);
            int sy0 = (int)floorf(src_y);
            int sx1 = sx0 + 1;
            int sy1 = sy0 + 1;

            sx0 = std::max(0, std::min(width - 1, sx0));
            sy0 = std::max(0, std::min(height - 1, sy0));
            sx1 = std::max(0, std::min(width - 1, sx1));
            sy1 = std::max(0, std::min(height - 1, sy1));

            // Fractions taken after clamping (may extrapolate at frame edges)
            float fx = src_x - sx0;
            float fy = src_y - sy0;

            // Interpolate each channel
            for (int c = 0; c < 3; c++) {
                float val_00 = src.at<cv::Vec3b>(sy0, sx0)[c];
                float val_10 = src.at<cv::Vec3b>(sy0, sx1)[c];
                float val_01 = src.at<cv::Vec3b>(sy1, sx0)[c];
                float val_11 = src.at<cv::Vec3b>(sy1, sx1)[c];

                float val = (1 - fx) * (1 - fy) * val_00 +
                            fx * (1 - fy) * val_10 +
                            (1 - fx) * fy * val_01 +
                            fx * fy * val_11;

                dst.at<cv::Vec3b>(y, x)[c] = (unsigned char)std::max(0.0f, std::min(255.0f, val));
            }
        }
    }
}

// Create visualization overlay showing affine cells
// (continues past the end of this chunk)
static void create_affine_overlay(
    cv::Mat &img,
    const uint8_t *affine_mask,
    int mesh_w, int mesh_h
) {
    int width = img.cols;
    int height = img.rows;
    int cell_w
= width / mesh_w; + int cell_h = height / mesh_h; + + for (int my = 0; my < mesh_h; my++) { + for (int mx = 0; mx < mesh_w; mx++) { + int idx = my * mesh_w + mx; + + if (affine_mask[idx]) { + // Draw green rectangle for affine cells + int x0 = mx * cell_w; + int y0 = my * cell_h; + int x1 = (mx + 1) * cell_w; + int y1 = (my + 1) * cell_h; + + cv::rectangle(img, + cv::Point(x0, y0), + cv::Point(x1, y1), + cv::Scalar(0, 255, 0), 1); + } + } + } +} + +int main(int argc, char** argv) { + const char* video_file = (argc > 1) ? argv[1] : "test_video.mp4"; + int num_test_frames = (argc > 2) ? atoi(argv[2]) : 5; + + printf("Opening video: %s\n", video_file); + cv::VideoCapture cap(video_file); + + if (!cap.isOpened()) { + fprintf(stderr, "Error: Cannot open video file %s\n", video_file); + return 1; + } + + int total_frames = (int)cap.get(cv::CAP_PROP_FRAME_COUNT); + int width = (int)cap.get(cv::CAP_PROP_FRAME_WIDTH); + int height = (int)cap.get(cv::CAP_PROP_FRAME_HEIGHT); + + printf("Video: %dx%d, %d frames\n", width, height, total_frames); + + if (total_frames < 10) { + fprintf(stderr, "Error: Video too short (need at least 10 frames)\n"); + return 1; + } + + // Calculate mesh dimensions (32×32 pixel cells, matches current encoder) + int mesh_cell_size = 32; + int mesh_w = (width + mesh_cell_size - 1) / mesh_cell_size; + int mesh_h = (height + mesh_cell_size - 1) / mesh_cell_size; + if (mesh_w < 2) mesh_w = 2; + if (mesh_h < 2) mesh_h = 2; + + printf("Mesh: %dx%d (approx %dx%d px cells)\n", + mesh_w, mesh_h, width / mesh_w, height / mesh_h); + + // Encoder parameters (match current encoder_tav.c settings) + float smoothness = 0.5f; // Mesh smoothness weight + int smooth_iterations = 8; // Smoothing iterations + float affine_threshold = 0.40f; // 40% improvement required for affine + + printf("Settings: smoothness=%.2f, iterations=%d, affine_threshold=%.0f%%\n", + smoothness, smooth_iterations, affine_threshold * 100.0f); + + // Seed random number generator + 
srand(time(NULL)); + + // Pick random frames (avoid first and last 5 frames) + printf("\nTesting %d random frame pairs:\n", num_test_frames); + for (int test = 0; test < num_test_frames; test++) { + // Pick random frame (ensure we have a previous frame) + int frame_num = 5 + rand() % (total_frames - 10); + + printf("\n[Test %d/%d] Warping frame %d → frame %d (inverse warp)\n", + test + 1, num_test_frames, frame_num - 1, frame_num); + + // Read previous frame (source for warping) + cap.set(cv::CAP_PROP_POS_FRAMES, frame_num - 1); + + cv::Mat prev_frame; + cap >> prev_frame; + if (prev_frame.empty()) { + fprintf(stderr, "Error reading frame %d\n", frame_num - 1); + continue; + } + + // Read current frame (target to match) + cv::Mat curr_frame; + cap >> curr_frame; + if (curr_frame.empty()) { + fprintf(stderr, "Error reading frame %d\n", frame_num); + continue; + } + + // Convert to RGB for block matching + cv::Mat prev_rgb, curr_rgb; + cv::cvtColor(prev_frame, prev_rgb, cv::COLOR_BGR2RGB); + cv::cvtColor(curr_frame, curr_rgb, cv::COLOR_BGR2RGB); + + // Compute hierarchical block matching (replaces optical flow) + printf(" Computing hierarchical block matching...\n"); + float *flow_x = nullptr, *flow_y = nullptr; + estimate_motion_optical_flow( + prev_rgb.data, curr_rgb.data, + width, height, + &flow_x, &flow_y + ); + + // Build mesh from flow + printf(" Building mesh from block matches...\n"); + int16_t *mesh_dx = (int16_t*)malloc(mesh_w * mesh_h * sizeof(int16_t)); + int16_t *mesh_dy = (int16_t*)malloc(mesh_w * mesh_h * sizeof(int16_t)); + build_mesh_from_flow(flow_x, flow_y, width, height, mesh_w, mesh_h, mesh_dx, mesh_dy); + + // Apply Laplacian smoothing + printf(" Applying Laplacian smoothing (%d iterations, %.2f weight)...\n", + smooth_iterations, smoothness); + smooth_mesh_laplacian(mesh_dx, mesh_dy, mesh_w, mesh_h, smoothness, smooth_iterations); + + // Estimate selective per-cell affine transforms + printf(" Estimating selective affine transforms 
(threshold=%.0f%%)...\n", + affine_threshold * 100.0f); + uint8_t *affine_mask = (uint8_t*)calloc(mesh_w * mesh_h, sizeof(uint8_t)); + int16_t *affine_a11 = (int16_t*)malloc(mesh_w * mesh_h * sizeof(int16_t)); + int16_t *affine_a12 = (int16_t*)malloc(mesh_w * mesh_h * sizeof(int16_t)); + int16_t *affine_a21 = (int16_t*)malloc(mesh_w * mesh_h * sizeof(int16_t)); + int16_t *affine_a22 = (int16_t*)malloc(mesh_w * mesh_h * sizeof(int16_t)); + + int cell_w = width / mesh_w; + int cell_h = height / mesh_h; + int affine_count = 0; + + for (int cy = 0; cy < mesh_h; cy++) { + for (int cx = 0; cx < mesh_w; cx++) { + int cell_idx = cy * mesh_w + cx; + + int16_t tx, ty, a11, a12, a21, a22; + int use_affine = estimate_cell_affine( + flow_x, flow_y, + width, height, + cx, cy, cell_w, cell_h, + affine_threshold, + &tx, &ty, &a11, &a12, &a21, &a22 + ); + + affine_mask[cell_idx] = use_affine ? 1 : 0; + mesh_dx[cell_idx] = tx; + mesh_dy[cell_idx] = ty; + affine_a11[cell_idx] = a11; + affine_a12[cell_idx] = a12; + affine_a21[cell_idx] = a21; + affine_a22[cell_idx] = a22; + + if (use_affine) affine_count++; + } + } + + printf(" Affine usage: %d/%d cells (%.1f%%)\n", + affine_count, mesh_w * mesh_h, + 100.0f * affine_count / (mesh_w * mesh_h)); + + // Warp previous frame to current frame + printf(" Warping frame with mesh + affine...\n"); + cv::Mat warped; + apply_mesh_warp_rgb(prev_frame, warped, mesh_dx, mesh_dy, + affine_mask, affine_a11, affine_a12, affine_a21, affine_a22, + mesh_w, mesh_h); + + // Create visualization with affine overlay + cv::Mat warped_viz = warped.clone(); + create_affine_overlay(warped_viz, affine_mask, mesh_w, mesh_h); + + // Compute MSE between warped and target + double mse = 0.0; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + for (int c = 0; c < 3; c++) { + double diff = (double)warped.at(y, x)[c] - + (double)curr_frame.at(y, x)[c]; + mse += diff * diff; + } + } + } + mse /= (width * height * 3); + double psnr = (mse > 0) ? 
10.0 * log10(255.0 * 255.0 / mse) : 999.0; + printf(" Warp quality: MSE=%.2f, PSNR=%.2f dB\n", mse, psnr); + + // Save images + char filename[256]; + sprintf(filename, "test_mesh_frame_%04d_source.png", frame_num - 1); + cv::imwrite(filename, prev_frame); + printf(" Saved source: %s\n", filename); + + sprintf(filename, "test_mesh_frame_%04d_warped.png", frame_num); + cv::imwrite(filename, warped); + printf(" Saved warped: %s\n", filename); + + sprintf(filename, "test_mesh_frame_%04d_warped_viz.png", frame_num); + cv::imwrite(filename, warped_viz); + printf(" Saved warped+viz (green=affine): %s\n", filename); + + sprintf(filename, "test_mesh_frame_%04d_target.png", frame_num); + cv::imwrite(filename, curr_frame); + printf(" Saved target: %s\n", filename); + + // Compute difference image + cv::Mat diff_img = cv::Mat::zeros(height, width, CV_8UC3); + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + for (int c = 0; c < 3; c++) { + int diff = abs((int)warped.at(y, x)[c] - + (int)curr_frame.at(y, x)[c]); + diff_img.at(y, x)[c] = std::min(diff * 3, 255); // Amplify for visibility + } + } + } + sprintf(filename, "test_mesh_frame_%04d_diff.png", frame_num); + cv::imwrite(filename, diff_img); + printf(" Saved difference (amplified 3x): %s\n", filename); + + // Compute motion statistics + float max_motion = 0.0f, avg_motion = 0.0f; + for (int i = 0; i < mesh_w * mesh_h; i++) { + float dx = mesh_dx[i] / 8.0f; + float dy = mesh_dy[i] / 8.0f; + float motion = sqrtf(dx * dx + dy * dy); + avg_motion += motion; + if (motion > max_motion) max_motion = motion; + } + avg_motion /= (mesh_w * mesh_h); + printf(" Motion: avg=%.2f px, max=%.2f px\n", avg_motion, max_motion); + + // Cleanup + free(flow_x); + free(flow_y); + free(mesh_dx); + free(mesh_dy); + free(affine_mask); + free(affine_a11); + free(affine_a12); + free(affine_a21); + free(affine_a22); + } + + printf("\nDone! 
Check output images:\n"); + printf(" *_source.png: Original frame before warping\n"); + printf(" *_warped.png: Warped frame (should match target)\n"); + printf(" *_warped_viz.png: Warped with green overlay showing affine cells\n"); + printf(" *_target.png: Target frame to match\n"); + printf(" *_diff.png: Difference image (should be mostly black if warp is good)\n"); + + cap.release(); + return 0; +}