mirror of
https://github.com/curioustorvald/tsvm.git
synced 2026-03-07 19:51:51 +09:00
tav: librarying
This commit is contained in:
65
video_encoder/include/coefficient_compress.h
Normal file
65
video_encoder/include/coefficient_compress.h
Normal file
@@ -0,0 +1,65 @@
|
||||
// Simple coefficient preprocessing for better compression
|
||||
// Insert right before Zstd compression
|
||||
|
||||
#ifndef COEFFICIENT_COMPRESS_H
|
||||
#define COEFFICIENT_COMPRESS_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
// Pack coefficients as a significance map followed by the non-zero values.
//
// Layout written to output_buffer (map_bytes = ceil(coeff_count / 8)):
//   [0, map_bytes)    one bit per coefficient, LSB-first within each byte;
//                     a set bit marks a non-zero coefficient
//   [map_bytes, ...)  the non-zero coefficients in input order, each stored as
//                     a native-endian int16_t
//
// coeffs is only read; nothing is modified in place.
// output_buffer must have room for the worst case of
// map_bytes + coeff_count * sizeof(int16_t) bytes.
// Returns the number of bytes written (0 when coeff_count <= 0).
static size_t preprocess_coefficients(int16_t *coeffs, int coeff_count, uint8_t *output_buffer) {
    if (coeff_count <= 0) {
        return 0;
    }

    const size_t map_bytes = ((size_t)coeff_count + 7) / 8;  // round up to whole bytes
    uint8_t *sig_map = output_buffer;
    uint8_t *values = output_buffer + map_bytes;
    size_t nonzero_count = 0;

    memset(sig_map, 0, map_bytes);

    for (int i = 0; i < coeff_count; i++) {
        if (coeffs[i] == 0) {
            continue;
        }

        sig_map[i / 8] |= (uint8_t)(1u << (i % 8));

        // The value area starts map_bytes into the buffer, which is not
        // guaranteed to be 2-byte aligned, so copy bytes instead of storing
        // through an int16_t pointer.
        memcpy(values + nonzero_count * sizeof(int16_t), &coeffs[i], sizeof(int16_t));
        nonzero_count++;
    }

    return map_bytes + nonzero_count * sizeof(int16_t);
}
|
||||
|
||||
// Decoder: rebuild the coefficient array from a significance map.
//
// compressed_data must use the layout produced by preprocess_coefficients():
// ceil(coeff_count / 8) bytes of LSB-first significance bits, followed by the
// non-zero values as native-endian int16_t in coefficient order.
// output_coeffs receives coeff_count values; positions whose bit is clear are
// set to 0. Nothing is written when coeff_count <= 0.
static void postprocess_coefficients(uint8_t *compressed_data, int coeff_count, int16_t *output_coeffs) {
    if (coeff_count <= 0) {
        return;
    }

    const size_t map_bytes = ((size_t)coeff_count + 7) / 8;
    const uint8_t *sig_map = compressed_data;
    const uint8_t *values = compressed_data + map_bytes;
    size_t value_idx = 0;

    for (int i = 0; i < coeff_count; i++) {
        int16_t value = 0;

        if (sig_map[i / 8] & (1u << (i % 8))) {
            // The value area may start at an odd address; read it bytewise
            // rather than through an int16_t pointer.
            memcpy(&value, values + value_idx * sizeof(int16_t), sizeof(int16_t));
            value_idx++;
        }

        output_coeffs[i] = value;
    }
}
|
||||
|
||||
#endif // COEFFICIENT_COMPRESS_H
|
||||
39
video_encoder/include/decoder_tad.h
Normal file
39
video_encoder/include/decoder_tad.h
Normal file
@@ -0,0 +1,39 @@
|
||||
#ifndef TAD32_DECODER_H
|
||||
#define TAD32_DECODER_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
// TAD32 (Terrarum Advanced Audio - PCM32f version) Decoder
|
||||
// DWT-based perceptual audio codec for TSVM
|
||||
// Shared decoder library used by both decoder_tad (standalone) and decoder_tav (video decoder)
|
||||
|
||||
// Constants (must match encoder)
|
||||
#define TAD32_SAMPLE_RATE 32000
|
||||
#define TAD32_CHANNELS 2 // Stereo
|
||||
#define TAD_DEFAULT_CHUNK_SIZE 32768 // Default chunk size for standalone TAD files
|
||||
|
||||
/**
|
||||
* Decode audio chunk with TAD32 codec
|
||||
*
|
||||
* @param input Input TAD32 chunk data
|
||||
* @param input_size Size of input buffer
|
||||
* @param pcmu8_stereo Output PCMu8 stereo samples (interleaved L,R)
|
||||
* @param bytes_consumed [out] Number of bytes consumed from input
|
||||
* @param samples_decoded [out] Number of samples decoded per channel
|
||||
* @return 0 on success, -1 on error
|
||||
*
|
||||
* Input format:
|
||||
* uint16 sample_count (samples per channel)
|
||||
* uint8 max_index (maximum quantisation index)
|
||||
* uint32 payload_size (bytes in payload)
|
||||
* * payload (encoded M/S data, Zstd-compressed with EZBC)
|
||||
*
|
||||
* Output format:
|
||||
* PCMu8 stereo interleaved (8-bit unsigned PCM, L,R pairs)
|
||||
* Range: [0, 255] where 128 = silence
|
||||
*/
|
||||
int tad32_decode_chunk(const uint8_t *input, size_t input_size, uint8_t *pcmu8_stereo,
|
||||
size_t *bytes_consumed, size_t *samples_decoded);
|
||||
|
||||
#endif // TAD32_DECODER_H
|
||||
61
video_encoder/include/encoder_tad.h
Normal file
61
video_encoder/include/encoder_tad.h
Normal file
@@ -0,0 +1,61 @@
|
||||
#ifndef TAD32_ENCODER_H
|
||||
#define TAD32_ENCODER_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
// TAD32 (Terrarum Advanced Audio - PCM32f version) Encoder
|
||||
// DWT-based perceptual audio codec for TSVM
|
||||
// Alternative version: PCM32f throughout encoding, PCM8 conversion only at decoder
|
||||
|
||||
// Constants
|
||||
#define TAD32_COEFF_SCALARS {64.0f, 45.255f, 32.0f, 22.627f, 16.0f, 11.314f, 8.0f, 5.657f, 4.0f, 2.828f} // value only valid for CDF 9/7 with decomposition level 9. Index 0 = LL band
|
||||
#define TAD32_MIN_CHUNK_SIZE 1024 // Minimum: 1024 samples
|
||||
#define TAD32_SAMPLE_RATE 32000
|
||||
#define TAD32_CHANNELS 2 // Stereo
|
||||
#define TAD32_QUALITY_MIN 0
|
||||
#define TAD32_QUALITY_MAX 6
|
||||
#define TAD32_QUALITY_DEFAULT 3
|
||||
#define TAD32_ZSTD_LEVEL 15
|
||||
|
||||
/**
 * Translate a quality level into the maximum quantisation index.
 *
 * Levels 0..5 map to 21, 31, 44, 63, 89 and 127; anything below 0 is treated
 * as 0 and anything above 5 as 5.
 *
 * NOTE(review): TAD32_QUALITY_MAX is 6 but the table only has entries for
 * 0..5, so level 6 yields the same index as level 5 - confirm this is intended.
 *
 * @param quality Requested quality level
 * @return Maximum quantisation index for that level
 */
static inline int tad32_quality_to_max_index(int quality) {
    static const int max_index_by_level[6] = {21, 31, 44, 63, 89, 127};
    const int level = (quality < 0) ? 0 : ((quality > 5) ? 5 : quality);
    return max_index_by_level[level];
}
|
||||
|
||||
/**
|
||||
* Encode audio chunk with TAD32 codec (PCM32f version)
|
||||
*
|
||||
* @param pcm32_stereo Input PCM32fLE stereo samples (interleaved L,R)
|
||||
* @param num_samples Number of samples per channel (min 1024)
|
||||
* @param max_index Maximum quantisation index (7=3bit, 15=4bit, 31=5bit, 63=6bit, 127=7bit)
|
||||
* @param quantiser_scale Quantiser scaling factor (1.0=baseline, 2.0=2x coarser quantisation)
|
||||
* Higher values = more aggressive quantisation = smaller files
|
||||
* @param output Output buffer (must be large enough)
|
||||
* @return Number of bytes written to output, or 0 on error
|
||||
*
|
||||
* Output format:
|
||||
* uint16 sample_count (samples per channel)
|
||||
* uint8 max_index (maximum quantisation index)
|
||||
* uint32 payload_size (bytes in payload)
|
||||
* * payload (encoded M/S data, Zstd-compressed with 2-bit twobitmap)
|
||||
*/
|
||||
size_t tad32_encode_chunk(const float *pcm32_stereo, size_t num_samples,
|
||||
int max_index,
|
||||
float quantiser_scale, uint8_t *output);
|
||||
|
||||
/**
|
||||
* Print accumulated coefficient statistics
|
||||
* Only effective if TAD_COEFF_STATS environment variable is set
|
||||
*/
|
||||
void tad32_print_statistics(void);
|
||||
|
||||
/**
|
||||
* Free accumulated statistics memory
|
||||
* Should be called after tad32_print_statistics()
|
||||
*/
|
||||
void tad32_free_statistics(void);
|
||||
|
||||
#endif // TAD32_ENCODER_H
|
||||
74
video_encoder/include/entropy_coder.h
Normal file
74
video_encoder/include/entropy_coder.h
Normal file
@@ -0,0 +1,74 @@
|
||||
// TEV Entropy Coder - Specialised for DCT coefficients
|
||||
// Replaces gzip with video-optimized compression
|
||||
#ifndef ENTROPY_CODER_H
|
||||
#define ENTROPY_CODER_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
// Write cursor for emitting variable-length codes into a byte buffer
typedef struct {
    uint8_t *buffer;       // destination storage
    size_t   buffer_size;  // capacity of buffer (presumably in bytes)
    size_t   byte_pos;     // byte index of the write cursor
    int      bit_pos;      // next bit to write, 0-7
} bit_writer_t;
|
||||
|
||||
// Read cursor for decoding variable-length codes from a byte buffer
typedef struct {
    const uint8_t *buffer;       // source storage (never written through)
    size_t         buffer_size;  // size of buffer (presumably in bytes)
    size_t         byte_pos;     // byte index of the read cursor
    int            bit_pos;      // next bit to read, 0-7
} bit_reader_t;
|
||||
|
||||
// One entry of a Huffman code table
typedef struct {
    uint16_t code;  // the Huffman code word
    uint8_t  bits;  // length of the code word in bits
} huffman_entry_t;
|
||||
|
||||
// Video entropy coder optimized for TEV coefficients
|
||||
typedef struct {
|
||||
// Huffman tables for different coefficient types
|
||||
huffman_entry_t y_dc_table[512]; // Y DC coefficients (-255 to +255)
|
||||
huffman_entry_t y_ac_table[512]; // Y AC coefficients
|
||||
huffman_entry_t c_dc_table[512]; // Chroma DC coefficients
|
||||
huffman_entry_t c_ac_table[512]; // Chroma AC coefficients
|
||||
huffman_entry_t run_table[256]; // Zero run lengths (0-255)
|
||||
|
||||
// Motion vector Huffman tables
|
||||
huffman_entry_t mv_table[65]; // Motion vectors (-32 to +32)
|
||||
|
||||
// Bit writer/reader
|
||||
bit_writer_t writer;
|
||||
bit_reader_t reader;
|
||||
} entropy_coder_t;
|
||||
|
||||
static const huffman_entry_t BLOCK_MODE_HUFFMAN[16];
|
||||
|
||||
void write_bits(bit_writer_t *writer, uint32_t value, int bits);
|
||||
uint32_t read_bits(bit_reader_t *reader, int bits);
|
||||
|
||||
// Initialise entropy coder
|
||||
entropy_coder_t* entropy_coder_create(uint8_t *buffer, size_t buffer_size);
|
||||
void entropy_coder_destroy(entropy_coder_t *coder);
|
||||
|
||||
// Encoding functions
|
||||
int encode_y_block(entropy_coder_t *coder, int16_t *y_coeffs);
|
||||
int encode_chroma_block(entropy_coder_t *coder, int16_t *chroma_coeffs, int is_cg);
|
||||
int encode_motion_vector(entropy_coder_t *coder, int16_t mv_x, int16_t mv_y);
|
||||
int encode_block_mode(entropy_coder_t *coder, uint8_t mode);
|
||||
|
||||
// Decoding functions
|
||||
void entropy_coder_init_reader(entropy_coder_t *coder, const uint8_t *buffer, size_t buffer_size);
|
||||
int decode_y_block(entropy_coder_t *coder, int16_t *y_coeffs);
|
||||
int decode_chroma_block(entropy_coder_t *coder, int16_t *chroma_coeffs, int is_cg);
|
||||
int decode_motion_vector(entropy_coder_t *coder, int16_t *mv_x, int16_t *mv_y);
|
||||
int decode_block_mode(entropy_coder_t *coder, uint8_t *mode);
|
||||
|
||||
// Get compressed size
|
||||
size_t entropy_coder_get_size(entropy_coder_t *coder);
|
||||
void entropy_coder_reset(entropy_coder_t *coder);
|
||||
|
||||
#endif // ENTROPY_CODER_H
|
||||
837
video_encoder/include/tav_avx512.h
Normal file
837
video_encoder/include/tav_avx512.h
Normal file
@@ -0,0 +1,837 @@
|
||||
/*
|
||||
* TAV AVX-512 Optimisations
|
||||
*
|
||||
* This file contains AVX-512 optimised versions of performance-critical functions
|
||||
* in the TAV encoder. Runtime CPU detection ensures fallback to scalar versions
|
||||
* on non-AVX-512 systems.
|
||||
*
|
||||
* Optimised functions:
|
||||
* - 1D DWT transforms (5/3, 9/7, Haar, Bior13/7, DD4)
|
||||
* - Quantisation functions
|
||||
* - RGB to YCoCg colour conversion
|
||||
* - 2D DWT gather/scatter operations
|
||||
*
|
||||
* Compile with: -mavx512f -mavx512dq -mavx512bw -mavx512vl
|
||||
*/
|
||||
|
||||
#ifndef TAV_AVX512_H
|
||||
#define TAV_AVX512_H
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
|
||||
// =============================================================================
|
||||
// SIMD Capability Detection
|
||||
// =============================================================================
|
||||
|
||||
// Which SIMD code paths may be used at runtime
typedef enum {
    SIMD_NONE    = 0,  // no AVX-512 paths selected
    SIMD_AVX512F = 1   // AVX-512 paths selected
} simd_level_t;

// Selected SIMD level; stays SIMD_NONE until tav_simd_init() assigns it.
// Declared static in this header, so each including translation unit has its own copy.
static simd_level_t g_simd_level = SIMD_NONE;
|
||||
|
||||
// Runtime CPU feature check.
// Returns 1 only when the build targets AVX-512F (__AVX512F__ defined) and the
// running CPU reports both "avx512f" and "avx512dq"; returns 0 otherwise.
static inline int cpu_has_avx512f(void) {
#ifndef __AVX512F__
    return 0;
#else
    if (!__builtin_cpu_supports("avx512f")) {
        return 0;
    }
    return __builtin_cpu_supports("avx512dq") ? 1 : 0;
#endif
}
|
||||
|
||||
// Initialize SIMD detection (call once at startup)
|
||||
static inline void tav_simd_init(void) {
|
||||
#ifdef __AVX512F__
|
||||
if (cpu_has_avx512f()) {
|
||||
g_simd_level = SIMD_AVX512F;
|
||||
fprintf(stderr, "[TAV] AVX-512 optimisations enabled\n");
|
||||
} else {
|
||||
g_simd_level = SIMD_NONE;
|
||||
fprintf(stderr, "[TAV] AVX-512 not available, using scalar fallback\n");
|
||||
}
|
||||
#else
|
||||
g_simd_level = SIMD_NONE;
|
||||
fprintf(stderr, "[TAV] Compiled without AVX-512 support\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef __AVX512F__
|
||||
|
||||
// =============================================================================
|
||||
// Helper Functions
|
||||
// =============================================================================
|
||||
|
||||
// Horizontal sum of the 16 floats in v.
// Folds 512 -> 256 -> 128 bits by adding the upper half onto the lower half,
// then finishes with two horizontal adds and returns lane 0.
static inline float _mm512_reduce_add_ps_compat(__m512 v) {
    // 16 lanes -> 8 lanes
    const __m256 lower8 = _mm512_castps512_ps256(v);
    const __m256 upper8 = _mm512_extractf32x8_ps(v, 1);
    const __m256 acc8 = _mm256_add_ps(lower8, upper8);

    // 8 lanes -> 4 lanes
    const __m128 lower4 = _mm256_castps256_ps128(acc8);
    const __m128 upper4 = _mm256_extractf128_ps(acc8, 1);
    __m128 acc4 = _mm_add_ps(lower4, upper4);

    // 4 lanes -> 1 lane
    acc4 = _mm_hadd_ps(acc4, acc4);
    acc4 = _mm_hadd_ps(acc4, acc4);

    return _mm_cvtss_f32(acc4);
}
|
||||
|
||||
// Per-lane clamp: raise each lane of v to at least min_val, then cap it at max_val.
static inline __m512 _mm512_clamp_ps(__m512 v, __m512 min_val, __m512 max_val) {
    const __m512 raised = _mm512_max_ps(v, min_val);
    return _mm512_min_ps(raised, max_val);
}
|
||||
|
||||
// =============================================================================
|
||||
// AVX-512 Optimised 1D DWT Forward Transforms
|
||||
// =============================================================================
|
||||
|
||||
// 5/3 Reversible Forward DWT with AVX-512
//
// Single-level, in-place lifting transform. With half = ceil(length / 2), on
// return data[0 .. half) holds the low-pass band and data[half .. length) the
// high-pass band:
//   predict: high[i] = data[2i+1] - 0.5  * (data[2i] + data[2i+2])
//            (data[2i+2] is replaced by data[2i] past the right edge)
//   update:  low[i]  = data[2i]   + 0.25 * (high[i-1] + high[i])
//            (high[i-1] is 0 for i == 0; high[i] is 0 for i == half-1)
//
// Inputs shorter than 2 samples are left untouched. If the scratch buffer
// cannot be allocated the function returns without modifying data.
static inline void dwt_53_forward_1d_avx512(float *data, int length) {
    if (length < 2) return;

    const int half = (length + 1) / 2;

    float *temp = (float*)calloc((size_t)length, sizeof(float));
    if (!temp) return;  // allocation failure: leave the input as it was

    const __m512 half_vec = _mm512_set1_ps(0.5f);
    const __m512 quarter_vec = _mm512_set1_ps(0.25f);
    int i;

    // Predict step (high-pass), 16 outputs per iteration.
    for (i = 0; i + 16 <= half; i += 16) {
        float even_curr_vals[16], even_next_vals[16], odd_vals[16];
        __mmask16 valid_mask = 0;

        // Strided gather into contiguous scratch arrays. A lane is valid only
        // if its odd sample exists; for odd lengths the very last lane does not.
        for (int j = 0; j < 16; j++) {
            const int e = 2 * (i + j);
            if (e + 1 < length) {
                even_curr_vals[j] = data[e];
                even_next_vals[j] = (e + 2 < length) ? data[e + 2] : data[e];
                odd_vals[j] = data[e + 1];
                valid_mask |= (__mmask16)(1u << j);
            } else {
                even_curr_vals[j] = 0.0f;
                even_next_vals[j] = 0.0f;
                odd_vals[j] = 0.0f;
            }
        }

        const __m512 even_curr = _mm512_loadu_ps(even_curr_vals);
        const __m512 even_next = _mm512_loadu_ps(even_next_vals);
        const __m512 odd = _mm512_loadu_ps(odd_vals);

        const __m512 pred = _mm512_mul_ps(_mm512_add_ps(even_curr, even_next), half_vec);
        const __m512 high = _mm512_sub_ps(odd, pred);

        // Masked store: an invalid lane would land on temp[length].
        _mm512_mask_storeu_ps(&temp[half + i], valid_mask, high);
    }

    // Predict step, scalar tail.
    for (; i < half; i++) {
        const int idx = 2 * i + 1;
        if (idx < length) {
            const float right = (idx + 1 < length) ? data[idx + 1] : data[idx - 1];
            const float pred = 0.5f * (data[idx - 1] + right);
            temp[half + i] = data[idx] - pred;
        }
    }

    // Update step (low-pass), 16 outputs per iteration. The loop condition
    // guarantees all 16 lanes are below half, so a full store is safe.
    for (i = 0; i + 16 <= half; i += 16) {
        float even_vals[16], high_prev[16], high_curr[16];

        for (int j = 0; j < 16; j++) {
            const int k = i + j;
            even_vals[j] = data[2 * k];
            high_prev[j] = (k > 0) ? temp[half + k - 1] : 0.0f;
            // The last low-pass sample uses 0 here, matching the scalar tail below.
            high_curr[j] = (k < half - 1) ? temp[half + k] : 0.0f;
        }

        const __m512 even = _mm512_loadu_ps(even_vals);
        const __m512 hp = _mm512_loadu_ps(high_prev);
        const __m512 hc = _mm512_loadu_ps(high_curr);

        const __m512 update = _mm512_mul_ps(_mm512_add_ps(hp, hc), quarter_vec);
        const __m512 low = _mm512_add_ps(even, update);

        _mm512_storeu_ps(&temp[i], low);
    }

    // Update step, scalar tail.
    for (; i < half; i++) {
        float update = 0.25f * ((i > 0 ? temp[half + i - 1] : 0) +
                                (i < half - 1 ? temp[half + i] : 0));
        temp[i] = data[2 * i] + update;
    }

    memcpy(data, temp, (size_t)length * sizeof(float));
    free(temp);
}
|
||||
|
||||
// 9/7 lifting "predict" pass: d[i] += coeff * (s[i] + s[i+1]) for every
// existing d[i]. s has `half` entries, d has `dlen` entries; at the right edge
// (i == half-1) the missing s[i+1] is mirrored to s[i].
static inline void dwt_97_predict_pass_avx512(const float *s, float *d, int half, int dlen, float coeff) {
    const __m512 coeff_vec = _mm512_set1_ps(coeff);
    const int limit = half - 1;            // indices below limit have a real s[i+1]
    const int n_full = (limit / 16) * 16;  // part of [0, limit) covered by full vectors
    int i = 0;

    for (; i < n_full; i += 16) {
        const __m512 sum = _mm512_add_ps(_mm512_loadu_ps(&s[i]), _mm512_loadu_ps(&s[i + 1]));
        const __m512 dv = _mm512_fmadd_ps(coeff_vec, sum, _mm512_loadu_ps(&d[i]));
        _mm512_storeu_ps(&d[i], dv);
    }
    for (; i < limit; ++i) {
        d[i] += coeff * (s[i] + s[i + 1]);
    }
    // i == half-1: d[half-1] exists only for even lengths.
    if (limit < dlen) {
        d[limit] += coeff * (s[limit] + s[limit]);
    }
}

// 9/7 lifting "update" pass: s[i] += coeff * (d[i-1] + d[i]) for i in [0, half).
// Boundaries: d[-1] is mirrored to d[0]; for odd lengths d[half-1] does not
// exist and contributes 0.
static inline void dwt_97_update_pass_avx512(float *s, const float *d, int half, int dlen, float coeff) {
    const __m512 coeff_vec = _mm512_set1_ps(coeff);

    if (dlen > 0) {
        s[0] += coeff * (d[0] + d[0]);
    }
    if (half <= 1) return;

    // Every vector lane reads d[i], so the vector range must end before dlen,
    // not before half. For odd lengths (dlen == half-1) the last s entry is
    // left to the scalar tail, which substitutes 0 for the missing d[half-1].
    const int vec_end = 1 + ((dlen - 1) / 16) * 16;
    int i = 1;

    for (; i < vec_end; i += 16) {
        const __m512 sum = _mm512_add_ps(_mm512_loadu_ps(&d[i - 1]), _mm512_loadu_ps(&d[i]));
        const __m512 sv = _mm512_fmadd_ps(coeff_vec, sum, _mm512_loadu_ps(&s[i]));
        _mm512_storeu_ps(&s[i], sv);
    }
    for (; i < half; ++i) {
        const float d_curr = (i < dlen) ? d[i] : 0.0f;
        s[i] += coeff * (d[i - 1] + d_curr);
    }
}

// 9/7 Irreversible Forward DWT with AVX-512
//
// Single-level, in-place transform using four lifting passes and a final
// scaling. With half = ceil(length / 2), on return data[0 .. half) holds the
// low-pass band (scaled by K = 1.230174105) and data[half .. length) the
// high-pass band (scaled by 1/K).
//
// Inputs shorter than 2 samples are left untouched. If the scratch buffer
// cannot be allocated the function returns without modifying data.
static inline void dwt_97_forward_1d_avx512(float *data, int length) {
    if (length < 2) return;

    const int half = (length + 1) / 2;  // number of low-pass (even) samples
    const int dlen = length - half;     // number of high-pass (odd) samples

    // Scratch buffer, 64-byte aligned when the platform allows it.
    float *temp = NULL;
#if defined(_POSIX_C_SOURCE) || defined(_XOPEN_SOURCE)
    if (posix_memalign((void**)&temp, 64, (size_t)length * sizeof(float)) != 0) {
        temp = (float*)malloc((size_t)length * sizeof(float));
    }
#else
    temp = (float*)aligned_alloc(64, ((size_t)length * sizeof(float) + 63) & ~63);
    if (!temp) temp = (float*)malloc((size_t)length * sizeof(float));
#endif
    if (!temp) return;

    float *s = temp;         // even samples -> low-pass band
    float *d = temp + half;  // odd samples  -> high-pass band

    // Split: de-interleave data into s (evens) and d (odds).
    for (int k = 0; k < dlen; ++k) {
        s[k] = data[2 * k];
        d[k] = data[2 * k + 1];
    }
    if (half > dlen) {  // odd length: one trailing even sample
        s[dlen] = data[length - 1];
    }

    // Steps 1-4: alternating predict/update lifting passes.
    dwt_97_predict_pass_avx512(s, d, half, dlen, -1.586134342f);  // alpha
    dwt_97_update_pass_avx512(s, d, half, dlen, -0.052980118f);   // beta
    dwt_97_predict_pass_avx512(s, d, half, dlen, 0.882911076f);   // gamma
    dwt_97_update_pass_avx512(s, d, half, dlen, 0.443506852f);    // delta

    // Step 5: scaling, s *= K and d *= 1/K.
    {
        const __m512 K_vec = _mm512_set1_ps(1.230174105f);
        const __m512 invK_vec = _mm512_set1_ps(1.0f / 1.230174105f);
        const int s_full = (half / 16) * 16;
        const int d_full = (dlen / 16) * 16;
        int i;

        for (i = 0; i < s_full; i += 16) {
            _mm512_storeu_ps(&s[i], _mm512_mul_ps(_mm512_loadu_ps(&s[i]), K_vec));
        }
        for (; i < half; ++i) {
            s[i] *= 1.230174105f;
        }

        for (i = 0; i < d_full; i += 16) {
            _mm512_storeu_ps(&d[i], _mm512_mul_ps(_mm512_loadu_ps(&d[i]), invK_vec));
        }
        // The tail divides rather than multiplying by the reciprocal; kept as
        // is so existing output does not change.
        for (; i < dlen; ++i) {
            d[i] /= 1.230174105f;
        }
    }

    // Copy back and free
    memcpy(data, temp, (size_t)length * sizeof(float));
    free(temp);
}
|
||||
|
||||
// Haar Forward DWT with AVX-512
//
// Single-level, in-place transform. With half = ceil(length / 2), on return
// data[0 .. half) holds the pair averages (even + odd) / 2 and
// data[half .. length) the pair half-differences (even - odd) / 2.
// For odd lengths the unpaired last sample is copied to the low band as is.
//
// Inputs shorter than 2 samples are left untouched. If the scratch buffer
// cannot be allocated the function returns without modifying data.
static inline void dwt_haar_forward_1d_avx512(float *data, int length) {
    if (length < 2) return;

    const int half = (length + 1) / 2;

    float *temp = (float*)malloc((size_t)length * sizeof(float));
    if (!temp) return;  // allocation failure: leave the input as it was

    const __m512 half_vec = _mm512_set1_ps(0.5f);

    // Process 16 pairs at a time
    int i;
    for (i = 0; i + 16 <= half; i += 16) {
        float even_vals[16], odd_vals[16];
        __mmask16 valid_mask = 0xFFFF;

        // Strided gather; a lane without an odd partner reuses its even
        // sample (so low == even) and is excluded from the high-pass store.
        for (int j = 0; j < 16; j++) {
            const int e = 2 * (i + j);
            even_vals[j] = data[e];
            if (e + 1 < length) {
                odd_vals[j] = data[e + 1];
            } else {
                odd_vals[j] = even_vals[j];
                valid_mask &= (__mmask16)~(1u << j);
            }
        }

        const __m512 even = _mm512_loadu_ps(even_vals);
        const __m512 odd = _mm512_loadu_ps(odd_vals);

        // Low-pass: (even + odd) / 2
        const __m512 low = _mm512_mul_ps(_mm512_add_ps(even, odd), half_vec);
        // High-pass: (even - odd) / 2
        const __m512 high = _mm512_mul_ps(_mm512_sub_ps(even, odd), half_vec);

        _mm512_storeu_ps(&temp[i], low);
        _mm512_mask_storeu_ps(&temp[half + i], valid_mask, high);
    }

    // Remaining pairs, scalar
    for (; i < half; i++) {
        const int e = 2 * i;
        if (e + 1 < length) {
            temp[i] = (data[e] + data[e + 1]) / 2.0f;
            temp[half + i] = (data[e] - data[e + 1]) / 2.0f;
        } else {
            temp[i] = data[e];
            if (half + i < length) {
                temp[half + i] = 0.0f;
            }
        }
    }

    memcpy(data, temp, (size_t)length * sizeof(float));
    free(temp);
}
|
||||
|
||||
// =============================================================================
|
||||
// AVX-512 Optimised Quantisation Functions
|
||||
// =============================================================================
|
||||
|
||||
// Uniform quantisation of DWT coefficients with an optional dead zone.
//
// For each of the `size` coefficients:
//   q = coeffs[i] / effective_q
//   if dead_zone_threshold > 0 and !is_chroma and |q| <= dead_zone_threshold: q = 0
//   quantised[i] = q rounded half away from zero, clamped to [-32768, 32767]
//
// The vector body divides by effective_q exactly as the scalar tail does, so a
// coefficient quantises the same way whether it falls in a vector lane or in
// the tail.
//
// width, height, decomp_levels, get_subband_level and get_subband_type are
// accepted for interface compatibility but not used by this simplified
// dead-zone logic.
static inline void quantise_dwt_coefficients_avx512(
    float *coeffs, int16_t *quantised, int size,
    float effective_q, float dead_zone_threshold,
    int width, int height, int decomp_levels, int is_chroma,
    int (*get_subband_level)(int, int, int, int),
    int (*get_subband_type)(int, int, int, int)
) {
    (void)width;
    (void)height;
    (void)decomp_levels;
    (void)get_subband_level;
    (void)get_subband_type;

    const int use_dead_zone = (dead_zone_threshold > 0.0f && !is_chroma);

    const __m512 q_vec = _mm512_set1_ps(effective_q);
    const __m512 threshold_vec = _mm512_set1_ps(dead_zone_threshold);
    const __m512 half_vec = _mm512_set1_ps(0.5f);
    const __m512 nhalf_vec = _mm512_set1_ps(-0.5f);
    const __m512 zero_vec = _mm512_setzero_ps();
    const __m512i min_i32 = _mm512_set1_epi32(-32768);
    const __m512i max_i32 = _mm512_set1_epi32(32767);

    int i;
    for (i = 0; i + 16 <= size; i += 16) {
        __m512 quant = _mm512_div_ps(_mm512_loadu_ps(&coeffs[i]), q_vec);

        // Dead-zone handling (simplified - full version needs per-coeff logic)
        if (use_dead_zone) {
            const __mmask16 dead_mask =
                _mm512_cmp_ps_mask(_mm512_abs_ps(quant), threshold_vec, _CMP_LE_OQ);
            quant = _mm512_mask_blend_ps(dead_mask, quant, zero_vec);  // dead lanes -> 0
        }

        // Round half away from zero: add +0.5 or -0.5 by sign, then truncate.
        const __mmask16 pos_mask = _mm512_cmp_ps_mask(quant, zero_vec, _CMP_GE_OQ);
        quant = _mm512_add_ps(quant, _mm512_mask_blend_ps(pos_mask, nhalf_vec, half_vec));

        __m512i quant_i32 = _mm512_cvttps_epi32(quant);  // cvtt = truncate toward zero
        quant_i32 = _mm512_max_epi32(quant_i32, min_i32);
        quant_i32 = _mm512_min_epi32(quant_i32, max_i32);

        // Narrow to int16 with signed saturation and store 16 results.
        const __m256i quant_i16 = _mm512_cvtsepi32_epi16(quant_i32);
        _mm256_storeu_si256((__m256i*)&quantised[i], quant_i16);
    }

    // Remaining scalar
    for (; i < size; i++) {
        float quantised_val = coeffs[i] / effective_q;

        if (use_dead_zone && fabsf(quantised_val) <= dead_zone_threshold) {
            quantised_val = 0.0f;
        }

        int32_t val = (int32_t)(quantised_val + (quantised_val >= 0 ? 0.5f : -0.5f));
        quantised[i] = (int16_t)((val < -32768) ? -32768 : (val > 32767 ? 32767 : val));
    }
}
|
||||
|
||||
// Perceptual quantisation with per-coefficient weighting
|
||||
static inline void quantise_dwt_coefficients_perceptual_avx512(
|
||||
float *coeffs, int16_t *quantised, int size,
|
||||
float *weights, // Pre-computed per-coefficient weights
|
||||
float base_quantiser
|
||||
) {
|
||||
const __m512 base_q_vec = _mm512_set1_ps(base_quantiser);
|
||||
const __m512 half_vec = _mm512_set1_ps(0.5f);
|
||||
const __m512 nhalf_vec = _mm512_set1_ps(-0.5f);
|
||||
const __m512 zero_vec = _mm512_setzero_ps();
|
||||
const __m512i min_i32 = _mm512_set1_epi32(-32768);
|
||||
const __m512i max_i32 = _mm512_set1_epi32(32767);
|
||||
|
||||
int i;
|
||||
for (i = 0; i + 16 <= size; i += 16) {
|
||||
__m512 coeff = _mm512_loadu_ps(&coeffs[i]);
|
||||
__m512 weight = _mm512_loadu_ps(&weights[i]);
|
||||
|
||||
// effective_q = base_q * weight
|
||||
__m512 effective_q = _mm512_mul_ps(base_q_vec, weight);
|
||||
__m512 quant = _mm512_div_ps(coeff, effective_q);
|
||||
|
||||
// Manual rounding to match scalar behaviour
|
||||
__mmask16 pos_mask = _mm512_cmp_ps_mask(quant, zero_vec, _CMP_GE_OQ);
|
||||
__m512 round_val = _mm512_mask_blend_ps(pos_mask, nhalf_vec, half_vec);
|
||||
quant = _mm512_add_ps(quant, round_val);
|
||||
|
||||
// Truncate to int32 (matches scalar cast after rounding)
|
||||
__m512i quant_i32 = _mm512_cvttps_epi32(quant);
|
||||
quant_i32 = _mm512_max_epi32(quant_i32, min_i32);
|
||||
quant_i32 = _mm512_min_epi32(quant_i32, max_i32);
|
||||
|
||||
__m256i quant_i16 = _mm512_cvtsepi32_epi16(quant_i32);
|
||||
_mm256_storeu_si256((__m256i*)&quantised[i], quant_i16);
|
||||
}
|
||||
|
||||
// Remaining scalar
|
||||
for (; i < size; i++) {
|
||||
float effective_q = base_quantiser * weights[i];
|
||||
float quantised_val = coeffs[i] / effective_q;
|
||||
int32_t val = (int32_t)(quantised_val + (quantised_val >= 0 ? 0.5f : -0.5f));
|
||||
quantised[i] = (int16_t)((val < -32768) ? -32768 : (val > 32767 ? 32767 : val));
|
||||
}
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// AVX-512 Optimised Dequantisation Functions
|
||||
// =============================================================================
|
||||
|
||||
// Basic dequantisation: quantised[i] * effective_q
|
||||
static inline void dequantise_dwt_coefficients_avx512(
|
||||
const int16_t *quantised, float *coeffs, int size,
|
||||
float effective_q
|
||||
) {
|
||||
const __m512 q_vec = _mm512_set1_ps(effective_q);
|
||||
|
||||
int i;
|
||||
for (i = 0; i + 16 <= size; i += 16) {
|
||||
// Load 16 int16 values
|
||||
__m256i quant_i16 = _mm256_loadu_si256((__m256i*)&quantised[i]);
|
||||
|
||||
// Convert int16 to int32
|
||||
__m512i quant_i32 = _mm512_cvtepi16_epi32(quant_i16);
|
||||
|
||||
// Convert int32 to float
|
||||
__m512 quant_f32 = _mm512_cvtepi32_ps(quant_i32);
|
||||
|
||||
// Multiply by quantiser
|
||||
__m512 dequant = _mm512_mul_ps(quant_f32, q_vec);
|
||||
|
||||
_mm512_storeu_ps(&coeffs[i], dequant);
|
||||
}
|
||||
|
||||
// Remaining scalar
|
||||
for (; i < size; i++) {
|
||||
coeffs[i] = (float)quantised[i] * effective_q;
|
||||
}
|
||||
}
|
||||
|
||||
// Perceptual dequantisation with per-coefficient weights
|
||||
static inline void dequantise_dwt_coefficients_perceptual_avx512(
|
||||
const int16_t *quantised, float *coeffs, int size,
|
||||
const float *weights, float base_quantiser
|
||||
) {
|
||||
const __m512 base_q_vec = _mm512_set1_ps(base_quantiser);
|
||||
|
||||
int i;
|
||||
for (i = 0; i + 16 <= size; i += 16) {
|
||||
// Load 16 int16 values
|
||||
__m256i quant_i16 = _mm256_loadu_si256((__m256i*)&quantised[i]);
|
||||
|
||||
// Convert int16 → int32 → float
|
||||
__m512i quant_i32 = _mm512_cvtepi16_epi32(quant_i16);
|
||||
__m512 quant_f32 = _mm512_cvtepi32_ps(quant_i32);
|
||||
|
||||
// Load weights
|
||||
__m512 weight = _mm512_loadu_ps(&weights[i]);
|
||||
|
||||
// effective_q = base_q * weight
|
||||
__m512 effective_q = _mm512_mul_ps(base_q_vec, weight);
|
||||
|
||||
// dequant = quantised * effective_q
|
||||
__m512 dequant = _mm512_mul_ps(quant_f32, effective_q);
|
||||
|
||||
_mm512_storeu_ps(&coeffs[i], dequant);
|
||||
}
|
||||
|
||||
// Remaining scalar
|
||||
for (; i < size; i++) {
|
||||
float effective_q = base_quantiser * weights[i];
|
||||
coeffs[i] = (float)quantised[i] * effective_q;
|
||||
}
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// AVX-512 Optimised RGB to YCoCg Conversion
|
||||
// =============================================================================
|
||||
|
||||
static inline void rgb_to_ycocg_avx512(const uint8_t *rgb, float *y, float *co, float *cg, int width, int height) {
|
||||
const int total_pixels = width * height;
|
||||
const __m512 half_vec = _mm512_set1_ps(0.5f);
|
||||
|
||||
int i;
|
||||
// Process 16 pixels at a time (48 bytes of RGB data)
|
||||
for (i = 0; i + 16 <= total_pixels; i += 16) {
|
||||
// Load 16 RGB triplets (48 bytes)
|
||||
// We need to deinterleave R, G, B channels
|
||||
|
||||
// Manual load and deinterleave (AVX-512 doesn't have direct RGB deinterleave)
|
||||
float r_vals[16], g_vals[16], b_vals[16];
|
||||
for (int j = 0; j < 16; j++) {
|
||||
r_vals[j] = (float)rgb[(i + j) * 3 + 0];
|
||||
g_vals[j] = (float)rgb[(i + j) * 3 + 1];
|
||||
b_vals[j] = (float)rgb[(i + j) * 3 + 2];
|
||||
}
|
||||
|
||||
__m512 r = _mm512_loadu_ps(r_vals);
|
||||
__m512 g = _mm512_loadu_ps(g_vals);
|
||||
__m512 b = _mm512_loadu_ps(b_vals);
|
||||
|
||||
// YCoCg-R transform:
|
||||
// co = r - b
|
||||
// tmp = b + co * 0.5
|
||||
// cg = g - tmp
|
||||
// y = tmp + cg * 0.5
|
||||
|
||||
__m512 co_vec = _mm512_sub_ps(r, b);
|
||||
__m512 tmp = _mm512_fmadd_ps(co_vec, half_vec, b); // tmp = b + co * 0.5
|
||||
__m512 cg_vec = _mm512_sub_ps(g, tmp);
|
||||
__m512 y_vec = _mm512_fmadd_ps(cg_vec, half_vec, tmp); // y = tmp + cg * 0.5
|
||||
|
||||
_mm512_storeu_ps(&y[i], y_vec);
|
||||
_mm512_storeu_ps(&co[i], co_vec);
|
||||
_mm512_storeu_ps(&cg[i], cg_vec);
|
||||
}
|
||||
|
||||
// Remaining pixels (scalar)
|
||||
for (; i < total_pixels; i++) {
|
||||
const float r = rgb[i * 3 + 0];
|
||||
const float g = rgb[i * 3 + 1];
|
||||
const float b = rgb[i * 3 + 2];
|
||||
|
||||
co[i] = r - b;
|
||||
const float tmp = b + co[i] * 0.5f;
|
||||
cg[i] = g - tmp;
|
||||
y[i] = tmp + cg[i] * 0.5f;
|
||||
}
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// AVX-512 Optimised 2D DWT with Gather/Scatter
|
||||
// =============================================================================
|
||||
|
||||
// Optimised column extraction using gather
|
||||
static inline void dwt_2d_extract_column_avx512(
|
||||
const float *tile_data, float *column,
|
||||
int x, int width, int height
|
||||
) {
|
||||
// Create gather indices for column extraction
|
||||
// indices[i] = (i * width + x)
|
||||
|
||||
int y;
|
||||
for (y = 0; y + 16 <= height; y += 16) {
|
||||
// Build gather indices
|
||||
int indices[16];
|
||||
for (int j = 0; j < 16; j++) {
|
||||
indices[j] = (y + j) * width + x;
|
||||
}
|
||||
|
||||
__m512i vindex = _mm512_loadu_si512((__m512i*)indices);
|
||||
__m512 col_data = _mm512_i32gather_ps(vindex, tile_data, 4);
|
||||
_mm512_storeu_ps(&column[y], col_data);
|
||||
}
|
||||
|
||||
// Remaining scalar
|
||||
for (; y < height; y++) {
|
||||
column[y] = tile_data[y * width + x];
|
||||
}
|
||||
}
|
||||
|
||||
// Optimised column insertion using scatter
|
||||
static inline void dwt_2d_insert_column_avx512(
|
||||
float *tile_data, const float *column,
|
||||
int x, int width, int height
|
||||
) {
|
||||
int y;
|
||||
for (y = 0; y + 16 <= height; y += 16) {
|
||||
// Build scatter indices
|
||||
int indices[16];
|
||||
for (int j = 0; j < 16; j++) {
|
||||
indices[j] = (y + j) * width + x;
|
||||
}
|
||||
|
||||
__m512i vindex = _mm512_loadu_si512((__m512i*)indices);
|
||||
__m512 col_data = _mm512_loadu_ps(&column[y]);
|
||||
_mm512_i32scatter_ps(tile_data, vindex, col_data, 4);
|
||||
}
|
||||
|
||||
// Remaining scalar
|
||||
for (; y < height; y++) {
|
||||
tile_data[y * width + x] = column[y];
|
||||
}
|
||||
}
|
||||
|
||||
#endif // __AVX512F__
|
||||
|
||||
#endif // TAV_AVX512_H
|
||||
303
video_encoder/include/tav_encoder_lib.h
Normal file
303
video_encoder/include/tav_encoder_lib.h
Normal file
@@ -0,0 +1,303 @@
|
||||
/**
|
||||
* TAV Encoder Library - Public API
|
||||
*
|
||||
* High-level interface for encoding video using the TSVM Advanced Video (TAV) codec.
|
||||
* Supports GOP-based encoding with internal multi-threading for optimal performance.
|
||||
*
|
||||
* Created by CuriousTorvald and Claude on 2025-12-03.
|
||||
*/
|
||||
|
||||
#ifndef TAV_ENCODER_LIB_H
|
||||
#define TAV_ENCODER_LIB_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// =============================================================================
|
||||
// Opaque Encoder Context
|
||||
// =============================================================================
|
||||
|
||||
/**
|
||||
* TAV encoder context - opaque to users.
|
||||
* Created with tav_encoder_create(), freed with tav_encoder_free().
|
||||
*/
|
||||
typedef struct tav_encoder_context tav_encoder_context_t;
|
||||
|
||||
// =============================================================================
|
||||
// Configuration Structures
|
||||
// =============================================================================
|
||||
|
||||
/**
|
||||
* Video encoding parameters.
|
||||
*/
|
||||
typedef struct {
    // --- Video dimensions and timing ---
    int width;                // Frame width in pixels (must be even)
    int height;               // Frame height in pixels (must be even)
    int fps_num;              // Framerate numerator (60 for 60 fps)
    int fps_den;              // Framerate denominator (1 for 60/1)

    // --- Wavelet configuration ---
    int wavelet_type;         // Spatial wavelet: 0=CDF 5/3, 1=CDF 9/7 (default), 2=CDF 13/7, 16=DD-4, 255=Haar
    int temporal_wavelet;     // Temporal wavelet: 0=Haar, 1=CDF 5/3 (default for smooth motion)
                              // NOTE(review): tav_video_decoder.h documents this field as
                              // 0=CDF 5/3, 1=CDF 9/7 -- confirm which mapping is correct
    int decomp_levels;        // Spatial DWT levels (0=auto, typically 6)
    int temporal_levels;      // Temporal DWT levels (0=auto, typically 2 for 8-frame GOPs)

    // --- Colour space ---
    int channel_layout;       // 0=YCoCg-R (default), 1=ICtCp (for HDR/BT.2100 sources)
    int perceptual_tuning;    // 1=HVS perceptual quantisation (default), 0=uniform

    // --- GOP configuration ---
    int enable_temporal_dwt;  // 1=3D DWT GOP encoding (default), 0=intra-only I-frames
    int gop_size;             // Frames per GOP (8, 16, or 24; 0=auto based on framerate)
    int enable_two_pass;      // 1=two-pass with scene change detection (default), 0=single-pass

    // --- Quality control ---
    int quality_level;        // NOTE(review): undocumented here; presumably an overall
                              // quality preset -- confirm range and meaning
    int quality_y;            // Luma quality (0-5, default: 3)
    int quality_co;           // Orange chrominance quality (0-5, default: 3)
    int quality_cg;           // Green chrominance quality (0-5, default: 3)
    int dead_zone_threshold;  // Dead-zone quantisation threshold (0=disabled, 1-10 typical)

    // --- Entropy coding ---
    int entropy_coder;        // 0=Twobitmap (default), 1=EZBC (better for high quality)
    int zstd_level;           // Zstd compression level (3-22, default: 7)

    // --- Multi-threading ---
    int num_threads;          // Worker threads (0=single-threaded, -1=auto, 1-16=explicit)

    // --- Encoder presets ---
    int encoder_preset;       // Bit flags: 0x01=sports (finer temporal quant), 0x02=anime (disable grain)

    // --- Advanced options ---
    int verbose;              // 1=debug output, 0=quiet (default)
    int monoblock;            // 1=single-tile encoding (always 1 for the current implementation)
} tav_encoder_params_t;
|
||||
|
||||
/**
|
||||
* Initialize encoder parameters with default values.
|
||||
*
|
||||
* @param params Parameter structure to initialize
|
||||
* @param width Frame width
|
||||
* @param height Frame height
|
||||
*/
|
||||
void tav_encoder_params_init(tav_encoder_params_t *params, int width, int height);
|
||||
|
||||
/**
|
||||
* Encoder output packet.
|
||||
* Contains encoded video or audio data.
|
||||
*/
|
||||
typedef struct {
    uint8_t *data;        // Packet payload (owned by encoder, valid until next encode/flush)
                          // NOTE(review): tav_encoder_free_packet() is also declared in
                          // this header -- confirm who owns and releases this buffer
    size_t size;          // Payload size in bytes
    uint8_t packet_type;  // TAV packet type (0x10=I-frame, 0x12=GOP, 0x24=audio, etc.)
    int frame_number;     // Frame number (video packets)
    int is_video;         // 1=video packet, 0=audio packet
} tav_encoder_packet_t;
|
||||
|
||||
// =============================================================================
|
||||
// Encoder Lifecycle
|
||||
// =============================================================================
|
||||
|
||||
/**
|
||||
* Create TAV encoder context.
|
||||
*
|
||||
* Allocates internal buffers, initializes thread pool (if multi-threading enabled),
|
||||
* and prepares encoder for frame submission.
|
||||
*
|
||||
* @param params Encoder parameters (copied internally)
|
||||
* @return Encoder context, or NULL on failure
|
||||
*/
|
||||
tav_encoder_context_t *tav_encoder_create(const tav_encoder_params_t *params);
|
||||
|
||||
/**
|
||||
* Free TAV encoder context.
|
||||
*
|
||||
* Shuts down thread pool, frees all buffers and resources.
|
||||
* Any unflushed frames in the GOP buffer will be lost.
|
||||
*
|
||||
* @param ctx Encoder context
|
||||
*/
|
||||
void tav_encoder_free(tav_encoder_context_t *ctx);
|
||||
|
||||
/**
|
||||
* Get last error message.
|
||||
*
|
||||
* @param ctx Encoder context
|
||||
* @return Error message string (valid until next encode operation)
|
||||
*/
|
||||
const char *tav_encoder_get_error(tav_encoder_context_t *ctx);
|
||||
|
||||
/**
|
||||
* Get encoder parameters (with calculated values).
|
||||
* After context creation, params will contain actual values used
|
||||
* (e.g., auto-calculated decomp_levels, gop_size).
|
||||
*
|
||||
* @param ctx Encoder context
|
||||
* @param params Output parameters structure
|
||||
*/
|
||||
void tav_encoder_get_params(tav_encoder_context_t *ctx, tav_encoder_params_t *params);
|
||||
|
||||
/**
|
||||
* DEBUG: Validate encoder context integrity
|
||||
* Returns 1 if context appears valid, 0 otherwise
|
||||
*/
|
||||
int tav_encoder_validate_context(tav_encoder_context_t *ctx);
|
||||
|
||||
// =============================================================================
|
||||
// Video Encoding
|
||||
// =============================================================================
|
||||
|
||||
/**
|
||||
* Encode a single RGB24 frame.
|
||||
*
|
||||
* Frames are buffered internally until a GOP is full, then encoded and returned.
|
||||
* For GOP encoding: returns NULL until GOP is complete.
|
||||
* For intra-only: returns packet immediately.
|
||||
*
|
||||
* Thread-safety: NOT thread-safe. Caller must serialize calls to encode_frame().
|
||||
*
|
||||
* @param ctx Encoder context
|
||||
* @param rgb_frame RGB24 frame data (planar: [R...][G...][B...]), width×height×3 bytes
|
||||
* @param frame_pts Presentation timestamp (frame number or time)
|
||||
* @param packet Output packet pointer (NULL if GOP not yet complete)
|
||||
* @return 1 if packet ready, 0 if buffering for GOP, -1 on error
|
||||
*/
|
||||
int tav_encoder_encode_frame(tav_encoder_context_t *ctx,
|
||||
const uint8_t *rgb_frame,
|
||||
int64_t frame_pts,
|
||||
tav_encoder_packet_t **packet);
|
||||
|
||||
/**
|
||||
* Flush encoder and encode any remaining buffered frames.
|
||||
*
|
||||
* Call at end of encoding to output final GOP (even if not full).
|
||||
* Returns packets one at a time through repeated calls.
|
||||
*
|
||||
* @param ctx Encoder context
|
||||
* @param packet Output packet pointer (NULL when no more packets)
|
||||
* @return 1 if packet ready, 0 if no more packets, -1 on error
|
||||
*/
|
||||
int tav_encoder_flush(tav_encoder_context_t *ctx,
|
||||
tav_encoder_packet_t **packet);
|
||||
|
||||
/**
|
||||
* Encode a complete GOP (Group of Pictures) directly.
|
||||
*
|
||||
* This function is STATELESS and THREAD-SAFE with separate contexts.
|
||||
* Perfect for multithreaded encoding from CLI:
|
||||
* - Each thread creates its own encoder context
|
||||
* - Each thread calls encode_gop() with a batch of frames
|
||||
* - No shared state, no locking needed
|
||||
*
|
||||
* Example multithreaded usage:
|
||||
* ```c
|
||||
* // Worker thread function
|
||||
* void* worker(void* arg) {
|
||||
* work_item_t* item = (work_item_t*)arg;
|
||||
*
|
||||
* // Create thread-local encoder context
|
||||
* tav_encoder_context_t* ctx = tav_encoder_create(&shared_params);
|
||||
*
|
||||
* // Encode this GOP
|
||||
* tav_encoder_packet_t* packet;
|
||||
* tav_encoder_encode_gop(ctx, item->frames, item->num_frames,
|
||||
* item->frame_numbers, &packet);
|
||||
*
|
||||
* // Store packet in output queue
|
||||
* queue_push(output_queue, packet);
|
||||
*
|
||||
* tav_encoder_free(ctx);
|
||||
* return NULL;
|
||||
* }
|
||||
* ```
|
||||
*
|
||||
* @param ctx Encoder context (one per thread)
|
||||
* @param rgb_frames Array of RGB24 frames [frame][width*height*3]
|
||||
* @param num_frames Number of frames in GOP (1-24)
|
||||
* @param frame_numbers Frame indices for timecodes (can be NULL)
|
||||
* @param packet Output packet pointer
|
||||
* @return 1 if packet ready, -1 on error
|
||||
*/
|
||||
int tav_encoder_encode_gop(tav_encoder_context_t *ctx,
|
||||
const uint8_t **rgb_frames,
|
||||
int num_frames,
|
||||
const int *frame_numbers,
|
||||
tav_encoder_packet_t **packet);
|
||||
|
||||
/**
|
||||
* Free a packet returned by encode_frame(), flush(), or encode_gop().
|
||||
*
|
||||
* @param packet Packet to free (can be NULL)
|
||||
*/
|
||||
void tav_encoder_free_packet(tav_encoder_packet_t *packet);
|
||||
|
||||
// =============================================================================
|
||||
// Audio Encoding (Optional)
|
||||
// =============================================================================
|
||||
|
||||
/**
|
||||
* Encode audio samples (TAD codec).
|
||||
*
|
||||
* Audio is encoded synchronously and returned immediately.
|
||||
* For TAV muxing: interleave audio packets with video packets by frame PTS.
|
||||
*
|
||||
* @param ctx Encoder context
|
||||
* @param pcm_samples PCM32f stereo samples (interleaved: L,R,L,R,...), num_samples×2 floats
|
||||
* @param num_samples Number of samples per channel
|
||||
* @param packet Output packet pointer
|
||||
* @return 1 if packet ready, -1 on error
|
||||
*/
|
||||
int tav_encoder_encode_audio(tav_encoder_context_t *ctx,
|
||||
const float *pcm_samples,
|
||||
size_t num_samples,
|
||||
tav_encoder_packet_t **packet);
|
||||
|
||||
// =============================================================================
|
||||
// Statistics and Info
|
||||
// =============================================================================
|
||||
|
||||
/**
|
||||
* Encoding statistics, as filled in by tav_encoder_get_stats().
|
||||
*/
|
||||
typedef struct {
    int64_t frames_encoded;   // Total number of frames encoded
    int64_t gops_encoded;     // Total number of GOPs encoded
    size_t total_bytes;       // Total output bytes (video + audio)
    size_t video_bytes;       // Output bytes that are video
    size_t audio_bytes;       // Output bytes that are audio
    double avg_bitrate_kbps;  // Average bitrate in kbps
    double encoding_fps;      // Encoding speed in frames per second
} tav_encoder_stats_t;
|
||||
|
||||
/**
|
||||
* Get encoding statistics.
|
||||
*
|
||||
* @param ctx Encoder context
|
||||
* @param stats Output statistics structure
|
||||
*/
|
||||
void tav_encoder_get_stats(tav_encoder_context_t *ctx, tav_encoder_stats_t *stats);
|
||||
|
||||
// =============================================================================
|
||||
// TAV Packet Types (for reference)
|
||||
// =============================================================================
|
||||
|
||||
#define TAV_PACKET_IFRAME 0x10 // I-frame (intra-only, single frame)
|
||||
#define TAV_PACKET_PFRAME 0x11 // P-frame (delta from previous)
|
||||
#define TAV_PACKET_GOP_UNIFIED 0x12 // GOP unified (3D DWT, multiple frames)
|
||||
#define TAV_PACKET_AUDIO_TAD 0x24 // TAD audio (DWT-based perceptual codec)
|
||||
#define TAV_PACKET_AUDIO_PCM8 0x20 // PCM8 audio (legacy)
|
||||
#define TAV_PACKET_LOOP_START 0xF0 // Loop point start (no payload)
|
||||
#define TAV_PACKET_GOP_SYNC 0xFC // GOP sync (frame count marker)
|
||||
#define TAV_PACKET_TIMECODE 0xFD // Timecode metadata
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // TAV_ENCODER_LIB_H
|
||||
275
video_encoder/include/tav_simd_dispatch.h
Normal file
275
video_encoder/include/tav_simd_dispatch.h
Normal file
@@ -0,0 +1,275 @@
|
||||
/*
|
||||
* TAV SIMD Function Dispatcher
|
||||
*
|
||||
* This file provides runtime CPU detection and function pointer dispatch
|
||||
* for SIMD-optimized versions of performance-critical TAV encoder functions.
|
||||
*
|
||||
* Usage:
|
||||
* 1. Include this header after defining all scalar functions
|
||||
* 2. Call tav_simd_init() once at encoder initialization
|
||||
* 3. Use function pointers (e.g., dwt_53_forward_1d_ptr) throughout code
|
||||
*
|
||||
* The dispatcher selects the AVX-512 or the scalar versions based on runtime
* CPU capabilities. AVX2 and SSE4.2 are detected and reported, but currently
* fall back to the scalar implementations.
|
||||
*/
|
||||
|
||||
#ifndef TAV_SIMD_DISPATCH_H
|
||||
#define TAV_SIMD_DISPATCH_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
// =============================================================================
|
||||
// Function Pointer Types
|
||||
// =============================================================================
|
||||
|
||||
// 1D DWT function pointer types
|
||||
typedef void (*dwt_1d_func_t)(float *data, int length);
|
||||
|
||||
// Quantization function pointer types
|
||||
typedef void (*quantise_basic_func_t)(
|
||||
float *coeffs, int16_t *quantised, int size,
|
||||
float effective_q, float dead_zone_threshold,
|
||||
int width, int height, int decomp_levels, int is_chroma,
|
||||
int (*get_subband_level)(int, int, int, int),
|
||||
int (*get_subband_type)(int, int, int, int)
|
||||
);
|
||||
|
||||
typedef void (*quantise_perceptual_func_t)(
|
||||
float *coeffs, int16_t *quantised, int size,
|
||||
float *weights, float base_quantiser
|
||||
);
|
||||
|
||||
// Color conversion function pointer type
|
||||
typedef void (*rgb_to_ycocg_func_t)(
|
||||
const uint8_t *rgb, float *y, float *co, float *cg,
|
||||
int width, int height
|
||||
);
|
||||
|
||||
// 2D DWT column operations
|
||||
typedef void (*dwt_2d_column_extract_func_t)(
|
||||
const float *tile_data, float *column,
|
||||
int x, int width, int height
|
||||
);
|
||||
|
||||
typedef void (*dwt_2d_column_insert_func_t)(
|
||||
float *tile_data, const float *column,
|
||||
int x, int width, int height
|
||||
);
|
||||
|
||||
// =============================================================================
|
||||
// Global Function Pointers (initialized by tav_simd_init)
|
||||
// =============================================================================
|
||||
|
||||
// DWT 1D transforms
|
||||
static dwt_1d_func_t dwt_53_forward_1d_ptr = NULL;
|
||||
static dwt_1d_func_t dwt_97_forward_1d_ptr = NULL;
|
||||
static dwt_1d_func_t dwt_haar_forward_1d_ptr = NULL;
|
||||
static dwt_1d_func_t dwt_53_inverse_1d_ptr = NULL;
|
||||
static dwt_1d_func_t dwt_haar_inverse_1d_ptr = NULL;
|
||||
|
||||
// Quantization
|
||||
static quantise_basic_func_t quantise_dwt_coefficients_ptr = NULL;
|
||||
static quantise_perceptual_func_t quantise_dwt_coefficients_perceptual_ptr = NULL;
|
||||
|
||||
// Color conversion
|
||||
static rgb_to_ycocg_func_t rgb_to_ycocg_ptr = NULL;
|
||||
|
||||
// 2D DWT column operations
|
||||
static dwt_2d_column_extract_func_t dwt_2d_extract_column_ptr = NULL;
|
||||
static dwt_2d_column_insert_func_t dwt_2d_insert_column_ptr = NULL;
|
||||
|
||||
// =============================================================================
|
||||
// SIMD Capability Detection
|
||||
// =============================================================================
|
||||
|
||||
// SIMD tiers reported by detect_simd_capabilities().
// The numeric values index the name table printed by tav_simd_init(), so they
// must stay stable (note they are not ordered by capability).
typedef enum {
    SIMD_NONE = 0,
    SIMD_AVX512F = 1,
    SIMD_AVX2 = 2,
    SIMD_SSE42 = 3
} simd_level_t;

// Level chosen by tav_simd_init(); SIMD_NONE until it has run
static simd_level_t detected_simd_level = SIMD_NONE;

// Report the best SIMD tier that is both compiled in (__AVX512F__ / __AVX2__)
// and supported by the running CPU.
//
// Returns SIMD_NONE when the CPU lacks SSE4.2, when the compiler is not
// GCC/Clang, or when the target is not x86: __builtin_cpu_supports() with x86
// feature names is only valid on x86 targets, so it must not be compiled
// elsewhere.
static inline simd_level_t detect_simd_capabilities(void) {
#if (defined(__GNUC__) || defined(__clang__)) && \
    (defined(__x86_64__) || defined(__i386__))
    // SSE4.2 is the baseline; without it no SIMD path is considered
    if (!__builtin_cpu_supports("sse4.2")) {
        return SIMD_NONE;
    }

#ifdef __AVX512F__
    // The AVX-512 tier requires F + DQ + BW + VL together
    if (__builtin_cpu_supports("avx512f") &&
        __builtin_cpu_supports("avx512dq") &&
        __builtin_cpu_supports("avx512bw") &&
        __builtin_cpu_supports("avx512vl")) {
        return SIMD_AVX512F;
    }
#endif

#ifdef __AVX2__
    if (__builtin_cpu_supports("avx2")) {
        return SIMD_AVX2;
    }
#endif

    // SSE4.2 support was already established above
    return SIMD_SSE42;
#else
    return SIMD_NONE;
#endif
}
|
||||
|
||||
// =============================================================================
|
||||
// Scalar Fallback Wrappers
|
||||
// =============================================================================
|
||||
|
||||
// These wrappers adapt the scalar functions to match function pointer signatures
|
||||
|
||||
static void quantise_dwt_coefficients_scalar_wrapper(
|
||||
float *coeffs, int16_t *quantised, int size,
|
||||
float effective_q, float dead_zone_threshold,
|
||||
int width, int height, int decomp_levels, int is_chroma,
|
||||
int (*get_subband_level)(int, int, int, int),
|
||||
int (*get_subband_type)(int, int, int, int)
|
||||
);
|
||||
// Implementation provided by including encoder - just declare prototype
|
||||
|
||||
static void quantise_dwt_coefficients_perceptual_scalar_wrapper(
|
||||
float *coeffs, int16_t *quantised, int size,
|
||||
float *weights, float base_quantiser
|
||||
);
|
||||
// Implementation provided by including encoder
|
||||
|
||||
// Scalar fallback: copy column x of a row-major width x height float tile
// into column[0..height).
static void dwt_2d_extract_column_scalar(
    const float *tile_data, float *column,
    int x, int width, int height
) {
    if (height <= 0) {
        return;
    }

    // Walk down the column one row (width elements) at a time
    const float *src = tile_data + x;
    column[0] = *src;
    for (int row = 1; row < height; row++) {
        src += width;
        column[row] = *src;
    }
}
|
||||
|
||||
// Scalar fallback: write column[0..height) into column x of a row-major
// width x height float tile.
static void dwt_2d_insert_column_scalar(
    float *tile_data, const float *column,
    int x, int width, int height
) {
    if (height <= 0) {
        return;
    }

    // Walk down the column one row (width elements) at a time
    float *dst = tile_data + x;
    *dst = column[0];
    for (int row = 1; row < height; row++) {
        dst += width;
        *dst = column[row];
    }
}
|
||||
|
||||
// =============================================================================
|
||||
// SIMD Initialization
|
||||
// =============================================================================
|
||||
|
||||
static void tav_simd_init(void) {
|
||||
// Detect CPU capabilities
|
||||
detected_simd_level = detect_simd_capabilities();
|
||||
|
||||
const char *simd_names[] = {"None", "AVX-512", "AVX2", "SSE4.2"};
|
||||
fprintf(stderr, "[TAV] SIMD level detected: %s\n",
|
||||
simd_names[detected_simd_level]);
|
||||
|
||||
#ifdef __AVX512F__
|
||||
if (detected_simd_level == SIMD_AVX512F) {
|
||||
fprintf(stderr, "[TAV] Using AVX-512 optimizations\n");
|
||||
|
||||
// DWT functions
|
||||
extern void dwt_53_forward_1d_avx512(float *data, int length);
|
||||
extern void dwt_97_forward_1d_avx512(float *data, int length);
|
||||
extern void dwt_haar_forward_1d_avx512(float *data, int length);
|
||||
|
||||
dwt_53_forward_1d_ptr = dwt_53_forward_1d_avx512;
|
||||
dwt_97_forward_1d_ptr = dwt_97_forward_1d_avx512;
|
||||
dwt_haar_forward_1d_ptr = dwt_haar_forward_1d_avx512;
|
||||
|
||||
// Quantization
|
||||
// Note: Need wrapper functions that match the complex signature
|
||||
// For now, using scalar versions
|
||||
extern void dwt_53_forward_1d(float *data, int length);
|
||||
extern void dwt_97_forward_1d(float *data, int length);
|
||||
extern void dwt_haar_forward_1d(float *data, int length);
|
||||
extern void dwt_53_inverse_1d(float *data, int length);
|
||||
extern void dwt_haar_inverse_1d(float *data, int length);
|
||||
|
||||
// Fallback to scalar for inverse (can optimize later)
|
||||
dwt_53_inverse_1d_ptr = dwt_53_inverse_1d;
|
||||
dwt_haar_inverse_1d_ptr = dwt_haar_inverse_1d;
|
||||
|
||||
// Color conversion
|
||||
extern void rgb_to_ycocg_avx512(const uint8_t *rgb, float *y, float *co, float *cg, int width, int height);
|
||||
rgb_to_ycocg_ptr = rgb_to_ycocg_avx512;
|
||||
|
||||
// 2D column operations
|
||||
extern void dwt_2d_extract_column_avx512(const float *tile_data, float *column, int x, int width, int height);
|
||||
extern void dwt_2d_insert_column_avx512(float *tile_data, const float *column, int x, int width, int height);
|
||||
|
||||
dwt_2d_extract_column_ptr = dwt_2d_extract_column_avx512;
|
||||
dwt_2d_insert_column_ptr = dwt_2d_insert_column_avx512;
|
||||
|
||||
// Quantization uses scalar for now (needs integration work)
|
||||
extern void dwt_53_forward_1d(float *data, int length);
|
||||
extern void dwt_97_forward_1d(float *data, int length);
|
||||
extern void dwt_haar_forward_1d(float *data, int length);
|
||||
extern void dwt_53_inverse_1d(float *data, int length);
|
||||
extern void dwt_haar_inverse_1d(float *data, int length);
|
||||
extern void rgb_to_ycocg(const uint8_t *rgb, float *y, float *co, float *cg, int width, int height);
|
||||
|
||||
quantise_dwt_coefficients_ptr = quantise_dwt_coefficients_scalar_wrapper;
|
||||
quantise_dwt_coefficients_perceptual_ptr = quantise_dwt_coefficients_perceptual_scalar_wrapper;
|
||||
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Fallback to scalar implementations
|
||||
fprintf(stderr, "[TAV] Using scalar (non-SIMD) implementations\n");
|
||||
|
||||
extern void dwt_53_forward_1d(float *data, int length);
|
||||
extern void dwt_97_forward_1d(float *data, int length);
|
||||
extern void dwt_haar_forward_1d(float *data, int length);
|
||||
extern void dwt_53_inverse_1d(float *data, int length);
|
||||
extern void dwt_haar_inverse_1d(float *data, int length);
|
||||
extern void rgb_to_ycocg(const uint8_t *rgb, float *y, float *co, float *cg, int width, int height);
|
||||
|
||||
dwt_53_forward_1d_ptr = dwt_53_forward_1d;
|
||||
dwt_97_forward_1d_ptr = dwt_97_forward_1d;
|
||||
dwt_haar_forward_1d_ptr = dwt_haar_forward_1d;
|
||||
dwt_53_inverse_1d_ptr = dwt_53_inverse_1d;
|
||||
dwt_haar_inverse_1d_ptr = dwt_haar_inverse_1d;
|
||||
|
||||
rgb_to_ycocg_ptr = rgb_to_ycocg;
|
||||
|
||||
dwt_2d_extract_column_ptr = dwt_2d_extract_column_scalar;
|
||||
dwt_2d_insert_column_ptr = dwt_2d_insert_column_scalar;
|
||||
|
||||
quantise_dwt_coefficients_ptr = quantise_dwt_coefficients_scalar_wrapper;
|
||||
quantise_dwt_coefficients_perceptual_ptr = quantise_dwt_coefficients_perceptual_scalar_wrapper;
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Convenience Macros for Code Readability
|
||||
// =============================================================================
|
||||
|
||||
// Use these macros in encoder code for cleaner dispatch
|
||||
// Calls the 5/3 forward 1D DWT through the dispatch pointer
// dwt_53_forward_1d_ptr (the scalar fallback binds it to
// dwt_53_forward_1d(float *data, int length)).
// Both arguments are parenthesised and evaluated exactly once.
// The pointer must have been assigned before this macro is used.
#define DWT_53_FORWARD_1D(data, length) \
    dwt_53_forward_1d_ptr((data), (length))
|
||||
|
||||
// Calls the 9/7 forward 1D DWT through the dispatch pointer
// dwt_97_forward_1d_ptr (the scalar fallback binds it to
// dwt_97_forward_1d(float *data, int length)).
// Both arguments are parenthesised and evaluated exactly once.
// The pointer must have been assigned before this macro is used.
#define DWT_97_FORWARD_1D(data, length) \
    dwt_97_forward_1d_ptr((data), (length))
|
||||
|
||||
// Calls the Haar forward 1D DWT through the dispatch pointer
// dwt_haar_forward_1d_ptr (the scalar fallback binds it to
// dwt_haar_forward_1d(float *data, int length)).
// Both arguments are parenthesised and evaluated exactly once.
// The pointer must have been assigned before this macro is used.
#define DWT_HAAR_FORWARD_1D(data, length) \
    dwt_haar_forward_1d_ptr((data), (length))
|
||||
|
||||
// Calls the RGB -> YCoCg conversion through the dispatch pointer
// rgb_to_ycocg_ptr (the scalar fallback binds it to
// rgb_to_ycocg(const uint8_t *rgb, float *y, float *co, float *cg,
//              int width, int height)).
// All six arguments are parenthesised, forwarded in order, and evaluated
// exactly once. The pointer must have been assigned before this macro is used.
#define RGB_TO_YCOCG(rgb, y, co, cg, width, height) \
    rgb_to_ycocg_ptr((rgb), (y), (co), (cg), (width), (height))
|
||||
|
||||
// Calls the 2D-DWT column extraction routine through the dispatch pointer
// dwt_2d_extract_column_ptr (the scalar fallback binds it to
// dwt_2d_extract_column_scalar; its prototype is not visible at this point).
// All five arguments are parenthesised, forwarded in order, and evaluated
// exactly once. The pointer must have been assigned before this macro is used.
#define DWT_2D_EXTRACT_COLUMN(tile_data, column, x, width, height) \
    dwt_2d_extract_column_ptr((tile_data), (column), (x), (width), (height))
|
||||
|
||||
// Calls the 2D-DWT column insertion routine through the dispatch pointer
// dwt_2d_insert_column_ptr (the scalar fallback binds it to
// dwt_2d_insert_column_scalar; its prototype is not visible at this point).
// All five arguments are parenthesised, forwarded in order, and evaluated
// exactly once. The pointer must have been assigned before this macro is used.
#define DWT_2D_INSERT_COLUMN(tile_data, column, x, width, height) \
    dwt_2d_insert_column_ptr((tile_data), (column), (x), (width), (height))
|
||||
|
||||
#endif // TAV_SIMD_DISPATCH_H
|
||||
77
video_encoder/include/tav_video_decoder.h
Normal file
77
video_encoder/include/tav_video_decoder.h
Normal file
@@ -0,0 +1,77 @@
|
||||
// Created by CuriousTorvald and Claude on 2025-12-02.
|
||||
// TAV Video Decoder Library - Shared decoding functions for TAV format
|
||||
// Can be used by both regular TAV decoder and TAV-DT decoder
|
||||
|
||||
#ifndef TAV_VIDEO_DECODER_H
|
||||
#define TAV_VIDEO_DECODER_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
// Video decoder context - opaque to users.
// Only the struct tag is declared in this header, so callers can hold and
// pass tav_video_context_t pointers but cannot access any fields.
// Instances are obtained from tav_video_create() and released with
// tav_video_free().
typedef struct tav_video_context tav_video_context_t;
|
||||
|
||||
// Video parameters structure.
// Describes the stream being decoded; a pointer to one of these is passed to
// tav_video_create(). Field order is part of the layout, so it must not change.
typedef struct {
    int width;                 // Frame width (RGB24 output is width*height*3 bytes)
    int height;                // Frame height
    int decomp_levels;         // Spatial DWT levels (typically 4)
    int temporal_levels;       // Temporal DWT levels (typically 2)
    int wavelet_filter;        // 0=CDF 5/3, 1=CDF 9/7, 2=CDF 13/7, 16=DD-4, 255=Haar
    int temporal_wavelet;      // Temporal wavelet (0=CDF 5/3, 1=CDF 9/7)
    int entropy_coder;         // 0=Twobitmap, 1=EZBC, 2=RAW
    int channel_layout;        // 0=YCoCg-R, 1=ICtCp
    int perceptual_tuning;     // 1=perceptual quantisation, 0=uniform
    uint8_t quantiser_y;       // Base quantiser index for Y/I
    uint8_t quantiser_co;      // Base quantiser index for Co/Ct
    uint8_t quantiser_cg;      // Base quantiser index for Cg/Cp
    uint8_t encoder_preset;    // Encoder preset flags (sports, anime, etc.)
    int monoblock;             // 1=single tile (monoblock), 0=multi-tile
} tav_video_params_t;
|
||||
|
||||
// Create video decoder context
|
||||
// Returns NULL on failure
|
||||
tav_video_context_t *tav_video_create(const tav_video_params_t *params);
|
||||
|
||||
// Free video decoder context
|
||||
void tav_video_free(tav_video_context_t *ctx);
|
||||
|
||||
// Decode GOP_UNIFIED packet (0x12) to RGB24 frames
|
||||
// Input: compressed_data - GOP packet data (after packet type byte)
|
||||
// compressed_size - size of compressed data
|
||||
// gop_size - number of frames in GOP (read from packet)
|
||||
// Output: rgb_frames - array of pointers to RGB24 frame buffers (width*height*3 each)
|
||||
// Must be pre-allocated by caller (gop_size pointers, each pointing to width*height*3 bytes)
|
||||
// Returns: 0 on success, -1 on error
|
||||
int tav_video_decode_gop(tav_video_context_t *ctx,
|
||||
const uint8_t *compressed_data, uint32_t compressed_size,
|
||||
uint8_t gop_size, uint8_t **rgb_frames);
|
||||
|
||||
// Decode IFRAME packet (0x10) to RGB24 frame
|
||||
// Input: compressed_data - I-frame packet data (after packet type byte)
|
||||
// packet_size - size of packet data
|
||||
// Output: rgb_frame - pointer to RGB24 frame buffer (width*height*3 bytes)
|
||||
// Must be pre-allocated by caller
|
||||
// Returns: 0 on success, -1 on error
|
||||
int tav_video_decode_iframe(tav_video_context_t *ctx,
|
||||
const uint8_t *compressed_data, uint32_t packet_size,
|
||||
uint8_t *rgb_frame);
|
||||
|
||||
// Decode PFRAME packet (0x11) to RGB24 frame (delta from reference)
|
||||
// Input: compressed_data - P-frame packet data (after packet type byte)
|
||||
// packet_size - size of packet data
|
||||
// Output: rgb_frame - pointer to RGB24 frame buffer (width*height*3 bytes)
|
||||
// Must be pre-allocated by caller
|
||||
// Returns: 0 on success, -1 on error
|
||||
// Note: Requires previous frame to be decoded first (stored internally as reference)
|
||||
int tav_video_decode_pframe(tav_video_context_t *ctx,
|
||||
const uint8_t *compressed_data, uint32_t packet_size,
|
||||
uint8_t *rgb_frame);
|
||||
|
||||
// Get last error message
|
||||
const char *tav_video_get_error(tav_video_context_t *ctx);
|
||||
|
||||
// Enable verbose debug output
|
||||
void tav_video_set_verbose(tav_video_context_t *ctx, int verbose);
|
||||
|
||||
#endif // TAV_VIDEO_DECODER_H
|
||||
Reference in New Issue
Block a user