diff --git a/video_encoder/encoder_tav.c b/video_encoder/encoder_tav.c
index bdf9217..b280042 100644
--- a/video_encoder/encoder_tav.c
+++ b/video_encoder/encoder_tav.c
@@ -1466,25 +1466,25 @@ static void apply_spatial_mv_prediction_to_tree(
         int block_y = node->y / residual_coding_min_block_size;
         int idx = block_y * blocks_x + block_x;
 
-        // Get neighbors: left, top, top-right
+        // Get neighbours: left, top, top-right
         int16_t left_x = 0, left_y = 0;
         int16_t top_x = 0, top_y = 0;
         int16_t top_right_x = 0, top_right_y = 0;
 
         if (block_x > 0) {
-            // Left neighbor
+            // Left neighbour
             int left_idx = idx - 1;
             left_x = mv_map_x[left_idx];
             left_y = mv_map_y[left_idx];
         }
 
         if (block_y > 0) {
-            // Top neighbor
+            // Top neighbour
             int top_idx = idx - blocks_x;
             top_x = mv_map_x[top_idx];
             top_y = mv_map_y[top_idx];
 
-            // Top-right neighbor
+            // Top-right neighbour
             if (block_x + 1 < blocks_x) {
                 int top_right_idx = top_idx + 1;
                 top_right_x = mv_map_x[top_right_idx];
@@ -1514,7 +1514,7 @@ static void apply_spatial_mv_prediction_to_tree(
 // Format: [split_flags_bitstream][leaf_mv_data]
 //   - split_flags: 1 bit per node (breadth-first), 1=split, 0=leaf
 //   - leaf_mv_data: For each leaf in order: [skip_flag:1bit][mvd_x:15bits][mvd_y:16bits]
-//   Note: MVs are now DIFFERENTIAL (predicted from spatial neighbors)
+//   Note: MVs are now DIFFERENTIAL (predicted from spatial neighbours)
 static size_t serialise_quad_tree(quad_tree_node_t *root, uint8_t *buffer, size_t buffer_size) {
     if (!root) return 0;
 
@@ -2069,9 +2069,9 @@ typedef struct thread_encoder_context {
     float **work_y_frames;        // [max_gop_size][max_pixels]
     float **work_co_frames;
     float **work_cg_frames;
-    int16_t **quantized_y;
-    int16_t **quantized_co;
-    int16_t **quantized_cg;
+    int16_t **quantised_y;
+    int16_t **quantised_co;
+    int16_t **quantised_cg;
     uint8_t *compression_buffer;
     size_t compression_buffer_size;
     ZSTD_CCtx *zstd_ctx;
@@ -2639,7 +2639,7 @@ static tav_encoder_t* create_encoder(void) {
     enc->tad_audio = 0;  // Default: use MP2 audio (TAD quality follows quality_level)
     enc->enable_crop_encoding = 0;  // Default: disabled (Phase 2 experimental)
 
-    // Active region tracking (initialized to full frame, updated when crop encoding enabled)
+    // Active region tracking (initialised to full frame, updated when crop encoding enabled)
     enc->active_mask_top = 0;
     enc->active_mask_right = 0;
     enc->active_mask_bottom = 0;
@@ -2731,6 +2731,8 @@ static tav_encoder_t* create_encoder(void) {
     enc->two_pass_current_frame = 0;
     enc->two_pass_analysis_file = NULL;
 
+    enc->num_threads = 0; // Default: undecided
+
     return enc;
 }
 
@@ -2978,7 +2980,7 @@ static int initialise_encoder(tav_encoder_t *enc) {
 // =============================================================================
 
 /**
- * Initialize GOP slots for circular buffer
+ * Initialise GOP slots for circular buffer
  * Allocates num_slots slots with frame buffers sized for width×height×capacity
  */
 static gop_slot_t* init_gop_slots(int num_slots, int width, int height, int capacity) {
@@ -2994,7 +2996,7 @@ static gop_slot_t* init_gop_slots(int num_slots, int width, int height, int capa
     for (int i = 0; i < num_slots; i++) {
         gop_slot_t *slot = &slots[i];
 
-        // Initialize status and synchronization
+        // Initialise status and synchronisation
         slot->status = GOP_STATUS_EMPTY;
         slot->gop_index = -1;
         mtx_init(&slot->mutex, mtx_plain);
@@ -3041,7 +3043,7 @@ static gop_slot_t* init_gop_slots(int num_slots, int width, int height, int capa
             return NULL;
         }
 
-        // Initialize output pointers as NULL
+        // Initialise output pointers as NULL
         slot->video_packet = NULL;
         slot->audio_packets = NULL;
         slot->audio_packet_sizes = NULL;
@@ -3206,27 +3208,27 @@ static thread_pool_t* create_thread_pool(tav_encoder_t *enc, int num_threads, in
     }
 
     pool->num_threads = num_threads;
-    pool->num_slots = total_gops;  // UPFRONT ALLOCATION: one slot per GOP, no circular reuse
+    pool->num_slots = total_gops;
     pool->slot_capacity = TEMPORAL_GOP_SIZE;
     pool->shared_enc = enc;
     pool->shutdown = 0;
-    // Producer state already initialized earlier (lines 3232-3236)
+    // Producer state already initialised earlier (lines 3232-3236)
     pool->next_gop_to_write = 0;
     pool->total_gops_written = 0;
 
-    // Initialize job queue
+    // Initialise job queue
     pool->job_queue_capacity = pool->num_slots * 2;
     pool->job_queue = calloc(pool->job_queue_capacity, sizeof(int));
     pool->job_queue_head = 0;
     pool->job_queue_tail = 0;
     pool->job_queue_size = 0;
 
-    // Initialize synchronization primitives
+    // Initialise synchronisation primitives
     mtx_init(&pool->job_queue_mutex, mtx_plain);
     cnd_init(&pool->job_available);
     cnd_init(&pool->slot_available);
 
-    // Initialize producer state for circular buffering
+    // Initialise producer state for circular buffering
     pool->next_slot_to_fill = 0;
     pool->total_gops_produced = 0;
     pool->total_frames_produced = 0;
@@ -3287,17 +3289,17 @@ static thread_pool_t* create_thread_pool(tav_encoder_t *enc, int num_threads, in
         ctx->work_y_frames = calloc(ctx->max_gop_frames, sizeof(float*));
         ctx->work_co_frames = calloc(ctx->max_gop_frames, sizeof(float*));
         ctx->work_cg_frames = calloc(ctx->max_gop_frames, sizeof(float*));
-        ctx->quantized_y = calloc(ctx->max_gop_frames, sizeof(int16_t*));
-        ctx->quantized_co = calloc(ctx->max_gop_frames, sizeof(int16_t*));
-        ctx->quantized_cg = calloc(ctx->max_gop_frames, sizeof(int16_t*));
+        ctx->quantised_y = calloc(ctx->max_gop_frames, sizeof(int16_t*));
+        ctx->quantised_co = calloc(ctx->max_gop_frames, sizeof(int16_t*));
+        ctx->quantised_cg = calloc(ctx->max_gop_frames, sizeof(int16_t*));
 
         for (int j = 0; j < ctx->max_gop_frames; j++) {
             ctx->work_y_frames[j] = malloc(ctx->max_frame_pixels * sizeof(float));
             ctx->work_co_frames[j] = malloc(ctx->max_frame_pixels * sizeof(float));
             ctx->work_cg_frames[j] = malloc(ctx->max_frame_pixels * sizeof(float));
-            ctx->quantized_y[j] = malloc(ctx->max_frame_pixels * sizeof(int16_t));
-            ctx->quantized_co[j] = malloc(ctx->max_frame_pixels * sizeof(int16_t));
-            ctx->quantized_cg[j] = malloc(ctx->max_frame_pixels * sizeof(int16_t));
+            ctx->quantised_y[j] = malloc(ctx->max_frame_pixels * sizeof(int16_t));
+            ctx->quantised_co[j] = malloc(ctx->max_frame_pixels * sizeof(int16_t));
+            ctx->quantised_cg[j] = malloc(ctx->max_frame_pixels * sizeof(int16_t));
         }
 
         ctx->compression_buffer_size = total_pixels * 3;
@@ -3312,16 +3314,16 @@ static thread_pool_t* create_thread_pool(tav_encoder_t *enc, int num_threads, in
                 free(ctx->work_y_frames[k]);
                 free(ctx->work_co_frames[k]);
                 free(ctx->work_cg_frames[k]);
-                free(ctx->quantized_y[k]);
-                free(ctx->quantized_co[k]);
-                free(ctx->quantized_cg[k]);
+                free(ctx->quantised_y[k]);
+                free(ctx->quantised_co[k]);
+                free(ctx->quantised_cg[k]);
             }
             free(ctx->work_y_frames);
             free(ctx->work_co_frames);
             free(ctx->work_cg_frames);
-            free(ctx->quantized_y);
-            free(ctx->quantized_co);
-            free(ctx->quantized_cg);
+            free(ctx->quantised_y);
+            free(ctx->quantised_co);
+            free(ctx->quantised_cg);
             free(ctx->compression_buffer);
             ZSTD_freeCCtx(ctx->zstd_ctx);
             free(ctx);
@@ -3486,7 +3488,7 @@ static int worker_thread_main(void *arg) {
                           enc->decomp_levels, enc->temporal_decomp_levels, enc->wavelet_filter);
         }
 
-        // Step 3: Quantize coefficients (using 3D DWT quantization for GOP)
+        // Step 3: Quantise coefficients (using 3D DWT quantisation for GOP)
         // Use channel-specific quantisers from encoder settings
         // Apply QLUT mapping to chroma quantisers (matches single-threaded path)
         int base_quantiser_y = enc->quantiser_y;
@@ -3495,11 +3497,11 @@ static int worker_thread_main(void *arg) {
 
         // Quantise 3D DWT coefficients with temporal-spatial quantisation
         // This applies temporal scaling based on subband level and spatial perceptual weighting
-        quantise_3d_dwt_coefficients(enc, ctx->work_y_frames, ctx->quantized_y, num_frames,
+        quantise_3d_dwt_coefficients(enc, ctx->work_y_frames, ctx->quantised_y, num_frames,
                                      num_pixels, base_quantiser_y, 0);  // Luma
-        quantise_3d_dwt_coefficients(enc, ctx->work_co_frames, ctx->quantized_co, num_frames,
+        quantise_3d_dwt_coefficients(enc, ctx->work_co_frames, ctx->quantised_co, num_frames,
                                      num_pixels, base_quantiser_co, 1);  // Chroma Co
-        quantise_3d_dwt_coefficients(enc, ctx->work_cg_frames, ctx->quantized_cg, num_frames,
+        quantise_3d_dwt_coefficients(enc, ctx->work_cg_frames, ctx->quantised_cg, num_frames,
                                      num_pixels, base_quantiser_cg, 1);  // Chroma Cg
 
         // Step 4: EZBC preprocessing
@@ -3509,7 +3511,7 @@ static int worker_thread_main(void *arg) {
 
         size_t preprocessed_size = preprocess_gop_unified(
             enc->preprocess_mode,  // Use encoder's preprocess mode (EZBC by default)
-            ctx->quantized_y, ctx->quantized_co, ctx->quantized_cg,
+            ctx->quantised_y, ctx->quantised_co, ctx->quantised_cg,
             num_frames, num_pixels, width, height,
             CHANNEL_LAYOUT_YCOCG,  // Standard YCoCg layout
             preprocessed_buffer
@@ -3653,9 +3655,9 @@ static int worker_thread_main(void *arg) {
         free(ctx->work_y_frames[i]);
         free(ctx->work_co_frames[i]);
         free(ctx->work_cg_frames[i]);
-        free(ctx->quantized_y[i]);
-        free(ctx->quantized_co[i]);
-        free(ctx->quantized_cg[i]);
+        free(ctx->quantised_y[i]);
+        free(ctx->quantised_co[i]);
+        free(ctx->quantised_cg[i]);
     }
     free(ctx->work_y_frames);
     free(ctx->work_co_frames);
@@ -3663,9 +3665,9 @@ static int worker_thread_main(void *arg) {
     // Save thread_id before freeing context
     int thread_id = ctx->thread_id;
 
-    free(ctx->quantized_y);
-    free(ctx->quantized_co);
-    free(ctx->quantized_cg);
+    free(ctx->quantised_y);
+    free(ctx->quantised_co);
+    free(ctx->quantised_cg);
     free(ctx->compression_buffer);
     ZSTD_freeCCtx(ctx->zstd_ctx);
     free(ctx);
@@ -3763,7 +3765,7 @@ static int producer_thread_main(void *arg) {
             slot->num_audio_samples = audio_read / (2 * sizeof(float));
         }
 
-        // 5. Initialize slot metadata
+        // 5. Initialise slot metadata
         mtx_lock(&slot->mutex);
         slot->gop_index = pool->total_gops_produced;
         slot->num_frames = frames_read;
@@ -3950,7 +3952,7 @@ static int writer_thread_main(void *arg) {
 static void dwt_53_forward_1d(float *data, int length) {
     if (length < 2) return;
 
-    float *temp = calloc(length, sizeof(float));  // Use calloc to zero-initialize for odd-length arrays
+    float *temp = calloc(length, sizeof(float));  // Use calloc to zero-initialise for odd-length arrays
     int half = (length + 1) / 2;  // Handle odd lengths properly
 
     // Predict step (high-pass)
@@ -4125,10 +4127,10 @@ static void dwt_dd4_forward_1d(float *data, int length) {
     }
 
     // DD-4 forward prediction step with four-point kernel
-    // Predict odd samples using four neighboring even samples
+    // Predict odd samples using four neighbouring even samples
     // Prediction: P(x) = (-1/16)*s[i-1] + (9/16)*s[i] + (9/16)*s[i+1] + (-1/16)*s[i+2]
     for (int i = 0; i < length / 2; i++) {
-        // Get four neighboring even samples with symmetric boundary extension
+        // Get four neighbouring even samples with symmetric boundary extension
         float s_m1, s_0, s_1, s_2;
 
         // s[i-1]
@@ -4773,7 +4775,7 @@ static void generate_bidirectional_prediction(
 }
 
 // Spatial motion vector prediction with differential coding
-// Predicts each block's MV from neighbors (left, top, top-right) using median
+// Predicts each block's MV from neighbours (left, top, top-right) using median
 // Converts absolute MVs to differential MVs for better compression
 // This enforces spatial coherence and is standard MPEG practice
 static void apply_mv_prediction(int16_t *mvs_x, int16_t *mvs_y,
@@ -4803,10 +4805,10 @@ static void apply_mv_prediction(int16_t *mvs_x, int16_t *mvs_y,
             int16_t mv_x = orig_mvs_x[block_idx];
             int16_t mv_y = orig_mvs_y[block_idx];
 
-            // Predict MV from spatial neighbors using median
+            // Predict MV from spatial neighbours using median
             int16_t pred_x = 0, pred_y = 0;
 
-            // Get neighbor indices (if they exist)
+            // Get neighbour indices (if they exist)
             int has_left = (bx > 0);
             int has_top = (by > 0);
             int has_top_right = (bx < residual_coding_num_blocks_x - 1 && by > 0);
@@ -4817,7 +4819,7 @@ static void apply_mv_prediction(int16_t *mvs_x, int16_t *mvs_y,
 
             // Standard MPEG median prediction
             if (has_left && has_top && has_top_right) {
-                // All three neighbors available: use median
+                // All three neighbours available: use median
                 pred_x = median3(orig_mvs_x[left_idx],
                                orig_mvs_x[top_idx],
                                orig_mvs_x[top_right_idx]);
@@ -4837,7 +4839,7 @@ static void apply_mv_prediction(int16_t *mvs_x, int16_t *mvs_y,
                 pred_x = orig_mvs_x[top_idx];
                 pred_y = orig_mvs_y[top_idx];
             }
-            // else: no neighbors, prediction remains (0, 0)
+            // else: no neighbours, prediction remains (0, 0)
 
             // Store differential MV = actual - predicted
             mvs_x[block_idx] = mv_x - pred_x;
@@ -5236,7 +5238,7 @@ static size_t encode_pframe_residual(tav_encoder_t *enc, int qY) {
 
     // Step 8: Write P-frame packet
     // Packet format: [type=0x14][num_blocks:uint16][mvs_x][mvs_y][compressed_size:uint32][compressed_data]
-    // Note: MVs are now differential (predicted from neighbors)
+    // Note: MVs are now differential (predicted from neighbours)
 
     uint8_t packet_type = TAV_PACKET_PFRAME_RESIDUAL;
     int total_blocks = enc->residual_coding_num_blocks_x * enc->residual_coding_num_blocks_y;
@@ -5413,7 +5415,7 @@ static size_t encode_pframe_adaptive(tav_encoder_t *enc, int qY) {
     // Differential MV coding doesn't help because:
     // 1. Too little MV data for Zstd to exploit patterns (only 63 trees/frame)
     // 2. Optical flow produces smooth absolute MVs that compress well already
-    // 3. Differential prediction can introduce noise if neighbors aren't perfect predictors
+    // 3. Differential prediction can introduce noise if neighbours aren't perfect predictors
     // Leaving code in place for future experimentation with entropy coding
     #if 0
     int mv_blocks_x = (enc->width + enc->residual_coding_min_block_size - 1) / enc->residual_coding_min_block_size;
@@ -6037,7 +6039,7 @@ static size_t gop_flush(tav_encoder_t *enc, FILE *output, int base_quantiser,
             // Phase 2: Use stored GOP dimensions (actual data size in buffers)
 
             // CRITICAL FIX: Temporarily override enc->widths/heights arrays for cropped dimensions
-            // dwt_2d_forward_flexible() uses these arrays, which were initialized with full frame dimensions
+            // dwt_2d_forward_flexible() uses these arrays, which were initialised with full frame dimensions
             // Save original arrays
             int array_size = enc->decomp_levels + 2;
             int *saved_widths = malloc(array_size * sizeof(int));
@@ -8058,7 +8060,7 @@ static size_t serialise_tile_data(tav_encoder_t *enc, int tile_x, int tile_y,
 
 // Compress and write frame data
 static size_t compress_and_write_frame(tav_encoder_t *enc, uint8_t packet_type) {
-    // Initialize GOP dimensions if not set (e.g., when not using temporal DWT)
+    // Initialise GOP dimensions if not set (e.g., when not using temporal DWT)
     if (enc->temporal_gop_width <= 0 || enc->temporal_gop_height <= 0) {
         enc->temporal_gop_width = enc->encoding_width;
         enc->temporal_gop_height = enc->encoding_height;
@@ -9532,7 +9534,7 @@ static float calculate_sobel_magnitude(const uint8_t *frame_rgb, int width, int
     int y_prev = (y > 0) ? (y - 1) : 0;
     int y_next = (y < height - 1) ? (y + 1) : (height - 1);
 
-    // Sample 3x3 neighborhood (using luma only for efficiency)
+    // Sample 3x3 neighbourhood (using luma only for efficiency)
     float pixels[3][3];
     for (int dy = 0; dy < 3; dy++) {
         for (int dx = 0; dx < 3; dx++) {
@@ -9898,10 +9900,10 @@ static uint16_t median_uint16(uint16_t *values, int count) {
     return values[count / 2];
 }
 
-// Cluster and normalize a single dimension (top, right, bottom, or left)
-// Groups values within ±1 and normalizes each to the most frequent value in its cluster
+// Cluster and normalise a single dimension (top, right, bottom, or left)
+// Groups values within ±1 and normalises each to the most frequent value in its cluster
 // E.g., [55, 56, 55, 57, 55, 200, 201, 200] -> [55, 55, 55, 55, 55, 200, 200, 200]
-static void normalize_dimension_clusters(uint16_t *values, int count) {
+static void normalise_dimension_clusters(uint16_t *values, int count) {
     if (count == 0) return;
 
 #define MAX_GEOMETRY 2048  // Maximum dimension size (width or height)
@@ -9916,7 +9918,7 @@ static void normalize_dimension_clusters(uint16_t *values, int count) {
         }
     }
 
-    // For each value, find the most frequent value within ±1 range and normalize to it
+    // For each value, find the most frequent value within ±1 range and normalise to it
     for (int i = 0; i < count; i++) {
         uint16_t val = values[i];
         if (val >= MAX_GEOMETRY) continue;
@@ -9943,7 +9945,7 @@ static void normalize_dimension_clusters(uint16_t *values, int count) {
 }
 
 // Write all screen masking packets before first frame (similar to SSF-TC subtitles)
-// Uses median filtering + clustering to normalize geometry to predominant aspect ratios
+// Uses median filtering + clustering to normalise geometry to predominant aspect ratios
 static void write_all_screen_mask_packets(tav_encoder_t *enc, FILE *output) {
     if (!enc->enable_crop_encoding || !enc->two_pass_mode) {
         return;  // Letterbox detection requires two-pass mode
@@ -10059,8 +10061,8 @@ static void write_all_screen_mask_packets(tav_encoder_t *enc, FILE *output) {
         }
     }
 
-    // Step 3: Survey packet values and normalize clusters (second pass)
-    // Cluster values within ±1 across all packets and normalize to most frequent
+    // Step 3: Survey packet values and normalise clusters (second pass)
+    // Cluster values within ±1 across all packets and normalise to most frequent
     if (packet_count > 0) {
         // Extract dimension values from packets
         uint16_t *tops = malloc(packet_count * sizeof(uint16_t));
@@ -10075,13 +10077,13 @@ static void write_all_screen_mask_packets(tav_encoder_t *enc, FILE *output) {
             lefts[i] = packets[i].left;
         }
 
-        // Normalize each dimension independently (54,55,56 -> 55)
-        normalize_dimension_clusters(tops, packet_count);
-        normalize_dimension_clusters(rights, packet_count);
-        normalize_dimension_clusters(bottoms, packet_count);
-        normalize_dimension_clusters(lefts, packet_count);
+        // Normalise each dimension independently (54,55,56 -> 55)
+        normalise_dimension_clusters(tops, packet_count);
+        normalise_dimension_clusters(rights, packet_count);
+        normalise_dimension_clusters(bottoms, packet_count);
+        normalise_dimension_clusters(lefts, packet_count);
 
-        // Write normalized values back to packets
+        // Write normalised values back to packets
         for (int i = 0; i < packet_count; i++) {
             packets[i].top = tops[i];
             packets[i].right = rights[i];
@@ -10095,14 +10097,14 @@ static void write_all_screen_mask_packets(tav_encoder_t *enc, FILE *output) {
         free(lefts);
     }
 
-    // Step 4: Emit normalized packets to file
+    // Step 4: Emit normalised packets to file
     for (int i = 0; i < packet_count; i++) {
         write_screen_mask_packet(output, packets[i].frame_num,
                                 packets[i].top, packets[i].right,
                                 packets[i].bottom, packets[i].left);
 
         if (enc->verbose) {
-            printf("  Frame %d: Screen mask t=%u r=%u b=%u l=%u (normalized%s)\n",
+            printf("  Frame %d: Screen mask t=%u r=%u b=%u l=%u (normalised%s)\n",
                    packets[i].frame_num, packets[i].top, packets[i].right,
                    packets[i].bottom, packets[i].left,
                    i == 0 ? ", initial geometry" : "");
@@ -11432,7 +11434,7 @@ static void calculate_gop_geometry(tav_encoder_t *enc, gop_boundary_t *gop_list,
 
     gop_boundary_t *gop = gop_list;
     while (gop) {
-        // Initialize with full frame dimensions
+        // Initialise with full frame dimensions
         gop->max_active_width = 0;
         gop->max_active_height = 0;
         gop->geometry_changes = 0;
@@ -11445,7 +11447,7 @@ static void calculate_gop_geometry(tav_encoder_t *enc, gop_boundary_t *gop_list,
 
         // Track previous geometry for change detection
         uint16_t prev_top = 0, prev_right = 0, prev_bottom = 0, prev_left = 0;
-        int prev_initialized = 0;
+        int prev_initialised = 0;
 
         // Scan all frames in this GOP
         for (int f = gop->start_frame; f <= gop->end_frame; f++) {
@@ -11470,7 +11472,7 @@ static void calculate_gop_geometry(tav_encoder_t *enc, gop_boundary_t *gop_list,
             if (frame->letterbox_left < min_left) min_left = frame->letterbox_left;
 
             // Detect geometry changes
-            if (prev_initialized) {
+            if (prev_initialised) {
                 if (frame->letterbox_top != prev_top ||
                     frame->letterbox_right != prev_right ||
                     frame->letterbox_bottom != prev_bottom ||
@@ -11484,7 +11486,7 @@ static void calculate_gop_geometry(tav_encoder_t *enc, gop_boundary_t *gop_list,
             prev_right = frame->letterbox_right;
             prev_bottom = frame->letterbox_bottom;
             prev_left = frame->letterbox_left;
-            prev_initialized = 1;
+            prev_initialised = 1;
         }
 
         // Calculate unified mask from minimum letterbox values
@@ -11817,7 +11819,7 @@ int main(int argc, char *argv[]) {
 
     printf("Initialising encoder...\n");
 
-    // Initialize AVX-512 runtime detection
+    // Initialise AVX-512 runtime detection
     tav_simd_init();
 
     tav_encoder_t *enc = create_encoder();
@@ -12154,20 +12156,7 @@ int main(int argc, char *argv[]) {
             case 1060: // --threads
                 enc->num_threads = atoi(optarg);
                 if (enc->num_threads < 1) enc->num_threads = 1;
-
-                // Future: auto-detect with limit (currently commented out)
-                // if (enc->num_threads == 0) {
-                //     #ifdef _WIN32
-                //     SYSTEM_INFO sysinfo;
-                //     GetSystemInfo(&sysinfo);
-                //     int cores = sysinfo.dwNumberOfProcessors;
-                //     #else
-                //     int cores = sysconf(_SC_NPROCESSORS_ONLN);
-                //     #endif
-                //     enc->num_threads = (cores > 8) ? 8 : cores;  // Limit to 8
-                // }
-
-                printf("Multi-threading: %d threads\n", enc->num_threads);
+                printf("Multi-threading: %d threads (user-defined)\n", enc->num_threads);
                 break;
             case 'a':
                 int bitrate = atoi(optarg);
@@ -12228,6 +12217,19 @@ int main(int argc, char *argv[]) {
         }
     }
 
+    // Auto-detect thread count when --threads was not given (capped at 8)
+    if (enc->num_threads == 0) {
+        #ifdef _WIN32
+        SYSTEM_INFO sysinfo;
+        GetSystemInfo(&sysinfo);
+        int cores = sysinfo.dwNumberOfProcessors;
+        #else
+        int cores = sysconf(_SC_NPROCESSORS_ONLN);
+        #endif
+        enc->num_threads = (cores > 8) ? 8 : cores;  // Limit to 8
+        printf("Multi-threading: %d threads (auto-selected)\n", enc->num_threads);
+    }
+
     // generate division series
     enc->widths = malloc((enc->decomp_levels + 2) * sizeof(int));
     enc->heights = malloc((enc->decomp_levels + 2) * sizeof(int));
diff --git a/video_encoder/tav_avx512.h b/video_encoder/tav_avx512.h
index 7694e59..be3f0cd 100644
--- a/video_encoder/tav_avx512.h
+++ b/video_encoder/tav_avx512.h
@@ -8,7 +8,7 @@
  * Optimised functions:
  * - 1D DWT transforms (5/3, 9/7, Haar, Bior13/7, DD4)
  * - Quantisation functions
- * - RGB to YCoCg color conversion
+ * - RGB to YCoCg colour conversion
  * - 2D DWT gather/scatter operations
  *
  * Compile with: -mavx512f -mavx512dq -mavx512bw -mavx512vl
@@ -79,7 +79,7 @@ static inline float _mm512_reduce_add_ps_compat(__m512 v) {
     return _mm_cvtss_f32(sum128);
 }
 
-// Clamp helper for vectorized operations
+// Clamp helper for vectorised operations
 static inline __m512 _mm512_clamp_ps(__m512 v, __m512 min_val, __m512 max_val) {
     return _mm512_min_ps(_mm512_max_ps(v, min_val), max_val);
 }
@@ -95,7 +95,7 @@ static inline void dwt_53_forward_1d_avx512(float *data, int length) {
     float *temp = (float*)calloc(length, sizeof(float));
     int half = (length + 1) / 2;
 
-    // Predict step (high-pass) - vectorized
+    // Predict step (high-pass) - vectorised
     // temp[half + i] = data[2*i+1] - 0.5 * (data[2*i] + data[2*i+2])
     int i;
     for (i = 0; i + 16 <= half; i += 16) {
@@ -145,7 +145,7 @@ static inline void dwt_53_forward_1d_avx512(float *data, int length) {
         }
     }
 
-    // Update step (low-pass) - vectorized
+    // Update step (low-pass) - vectorised
     // temp[i] = data[2*i] + 0.25 * (temp[half+i-1] + temp[half+i])
     for (i = 0; i + 16 <= half; i += 16) {
         __m512 even = _mm512_loadu_ps(&data[2 * i]);  // Load with stride 2 (simplified)
@@ -157,7 +157,7 @@ static inline void dwt_53_forward_1d_avx512(float *data, int length) {
         }
         even = _mm512_loadu_ps(even_vals);
 
-        // Load high-pass neighbors
+        // Load high-pass neighbours
         float high_prev[16], high_curr[16];
         for (int j = 0; j < 16 && (i + j) < half; j++) {
             high_prev[j] = ((i + j) > 0) ? temp[half + (i + j) - 1] : 0.0f;
@@ -241,7 +241,7 @@ static inline void dwt_97_forward_1d_avx512(float *data, int length) {
                 temp[half + 0] += -1.586134342f * (temp[0] + temp[0]);
             }
         } else {
-            // main vectorized body: ensure s_next loads (i+1) valid -> i <= half-2
+            // main vectorised body: ensure s_next loads (i+1) valid -> i <= half-2
             int limit = (half - 1);
             int n_full = (limit / 16) * 16; // process up to n_full (multiple of 16)
             i = 0;