fix: wrong timecode calculation on NTSC framerates

2026-06-06 13:38:30 +09:00 · 2025-11-16 02:49:03 +09:00
parent 5c87325366
commit aa7e20695d
4 changed files with 74 additions and 27 deletions
--- a/terranmon.txt
+++ b/terranmon.txt
@@ -902,13 +902,21 @@ transmission capability, and region-of-interest coding.

 ## Header (32 bytes)
    uint8  Magic[8]: "\x1F TSVM TAV" or "\x1F TSVM TAP"
-    uint8  Version: 3 (YCoCg-R uniform), 4 (ICtCp uniform), 5 (YCoCg-R perceptual), 6 (ICtCp perceptual)
-    uint16 Width: video width in pixels  
-    uint16 Height: video height in pixels
-    uint8  FPS: frames per second. Use 0x00 for still images
+    uint8  Version:
+            - 1 = YCoCg-R multi-tile uniform
+            - 2 = ICtCp multi-tile uniform
+            - 3 = YCoCg-R monoblock uniform
+            - 4 = ICtCp monoblock uniform
+            - 5 = YCoCg-R monoblock perceptual
+            - 6 = ICtCp monoblock perceptual
+            - 7 = YCoCg-R multi-tile perceptual
+            - 8 = ICtCp multi-tile perceptual
+    uint16 Width: picture width in pixels
+    uint16 Height: picture height in pixels
+    uint8  FPS: frames per second. Use 0x00 for still pictures
    uint32 Total Frames: number of video frames
            - use 0 to denote not-finalised video stream
-            - use 0xFFFFFFFF to denote still image (.im3 file)
+            - use 0xFFFFFFFF to denote still picture (.im3 file)
    uint8  Wavelet Filter Type:
            - 0 = 5/3 reversible (LGT 5/3, JPEG 2000 standard)
            - 1 = 9/7 irreversible (CDF 9/7, slight modification of JPEG 2000, default choice)
@@ -919,19 +927,22 @@ transmission capability, and region-of-interest coding.
    uint8  Quantiser Index for Y channel (uses exponential numeric system; 0: lossless, 255: potato)
    uint8  Quantiser Index for Co channel (uses exponential numeric system; 0: lossless, 255: potato)
    uint8  Quantiser Index for Cg channel (uses exponential numeric system; 0: lossless, 255: potato)
-    uint8  Extra Feature Flags (must be ignored for still images)
-            - bit 0 = has audio
-            - bit 1 = has subtitle
-            - bit 2 = infinite loop (must be ignored when File Role is 1)
+    uint8  Extra Feature Flags
+            - bit 0 = has audio (for still pictures: has background music)
+            - bit 1 = has subtitle (for still pictures: has timed captions)
+            - bit 2 = infinite loop (has no effect for still pictures)
            - bit 7 = has no actual packets, this file is header-only without an Intro Movie
    uint8  Video Flags
            - bit 0 = interlaced
            - bit 1 = is NTSC framerate
            - bit 2 = is lossless mode
                (shorthand for `-q 6 -Q0,0,0 -w 0 --intra-only --no-perceptual-tuning --arate 384`)
-            - bit 3 = has region-of-interest coding (for still images only)
+            - bit 3 = has region-of-interest coding (for still pictures only)
    uint8  Encoder quality level (stored with bias of 1 (q0=1); used to derive anisotropy value)
    uint8  Channel layout (bit-field: bit 0=has alpha, bit 1=has chroma inverted, bit 2=has luma inverted)
+            * Luma-only videos must be decoded with fixed Chroma=0
+            * Chroma-only videos must be decoded with fixed Luma=127
+            * No-alpha videos must be decoded with fixed Alpha=255
            - 0 = Y-Co-Cg/I-Ct-Cp (000: no alpha, has chroma, has luma)
            - 1 = Y-Co-Cg-A/I-Ct-Cp-A (001: has alpha, has chroma, has luma)
            - 2 = Y/I only (010: no alpha, no chroma, has luma)
@@ -940,9 +951,9 @@ transmission capability, and region-of-interest coding.
            - 5 = Co-Cg-A/Ct-Cp-A (101: has alpha, has chroma, no luma)
            - 6-7 = Reserved/invalid (would indicate no luma and no chroma)
    uint8  Entropy Coder
-            - 0 = Twobit-plane significance map
+            - 0 = Twobit-plane significance map (deprecated)
            - 1 = Embedded Zero Block Coding
-            - 2 = Raw coefficients
+            - 2 = Raw coefficients (for debugging purposes only)
    uint8  Reserved[2]: fill with zeros
    uint8  Device Orientation
            - 0 = No rotation
@@ -1001,6 +1012,7 @@ transmission capability, and region-of-interest coding.
    0xEF: TAV Extended Header
    0xF0: Loop point start (insert right AFTER the TC packet; no payload)
    0xF1: Loop point end (insert right AFTER the TC packet; no payload)
+    0xF2: Screen masking info
    0xFC: GOP Sync packet (indicates N frames decoded from GOP block)
    0xFD: Timecode (TC) Packet [for frame 0, insert at the beginning; otherwise, insert right AFTER the sync]
    0xFE: NTSC sync packet (used by player to calculate exact framerate-wise performance; no payload)
@@ -1012,11 +1024,12 @@ transmission capability, and region-of-interest coding.
        1. TAV Extended header (if any)
        2. Standard metadata payloads (if any)
        3. SSF-TC/KSF-TC packets (if any)
-            When time-coded subtitles are used, the entire subtitle bytes must precede the first video frame.
+            When time-coded subtitles are used, the entire subtitle data must precede the first video frame.
            Think of it as tacking the whole subtitle file before the actual video.
+        4. Screen Masking packets (if any)

        Frame group:
-        1. TC Packet (0xFD) or Next TAV File (0x1F) [mutually exclusive!]
+        1. Timecode Packet (0xFD) or Next TAV File (0x1F) [mutually exclusive!]
        2. Loop point packet (if any)
        3. Audio packets (if any)
        4. Subtitle packets (if any) [mutually exclusive with SSF-TC/KSF-TC packets]
@@ -1024,11 +1037,12 @@ transmission capability, and region-of-interest coding.
        6. Multiplexed video packets (0x70-7F; if any)

        After a frame group:
-        1. Sync packet
+        1. Sync packet (0xFC or 0xFF)
+        2. NTSC Sync packet (if required; it will instruct players to duplicate the current frame)


 ## TAV Extended Header Specification and Structure
-    uint8  0xEF
+    uint8  Packet Type (0xEF)
    uint16 Number of Key-Value pairs
    *      Key-Value pairs

@@ -1056,7 +1070,7 @@ transmission capability, and region-of-interest coding.


 ## Standard Metadata Payload Packet Structure
-    uint8  0xE0/0xE1/0xE2/.../0xEE (see Packet Types section)
+    uint8  Packet Type (0xE0/0xE1/0xE2/.../0xEE; see Packet Types section)
    uint32 Length of the payload
    *      Standard payload

@@ -1070,13 +1084,25 @@ transmission capability, and region-of-interest coding.
    uint8  Packet Type (0xFE)
    uint64 Time since stream start in nanoseconds (this may NOT start from zero if the video is coming from a livestream)

-## Video Packet Structure (0x10, 0x11)
-    uint8  Packet Type
+## Screen Masking Packet Structure
+    When letterbox/pillarbox detection is active, the encoder will only encode pictures in the active area.
+    Decoders must use these values to derive the size of the active area for decoding, and fill the blank area on playback.
+    Encoders only need to insert this packet at the start of the video (if necessary) and whenever a geometry change occurs.
+
+    uint8  Packet Type (0xF2)
+    uint32 Starting frame number
+    uint16 Mask size top in pixels
+    uint16 Mask size right in pixels
+    uint16 Mask size bottom in pixels
+    uint16 Mask size left in pixels
+
+## Video Packet Structure
+    uint8  Packet Type (0x10/0x11)
    uint32 Compressed Size
    *      Zstd-compressed Block Data

 ## TAD Packet Structure
-    uint8  Packet type (0x24)
+    uint8  Packet Type (0x24)
    <header for decoding packet>
    uint16 Sample Count
    uint32 Compressed Size + 7
--- a/video_encoder/Makefile
+++ b/video_encoder/Makefile
@@ -3,8 +3,8 @@

 CC = gcc
 CXX = g++
-CFLAGS = -std=c99 -Wall -Wextra -O2 -D_GNU_SOURCE
-CXXFLAGS = -std=c++11 -Wall -Wextra -O2 -D_GNU_SOURCE
+CFLAGS = -std=c99 -Wall -Wextra -Ofast -D_GNU_SOURCE
+CXXFLAGS = -std=c++11 -Wall -Wextra -Ofast -D_GNU_SOURCE

 # Zstd flags (use pkg-config if available, fallback for cross-platform compatibility)
 ZSTD_CFLAGS = $(shell pkg-config --cflags libzstd 2>/dev/null || echo "")
--- a/video_encoder/encoder_tav.c
+++ b/video_encoder/encoder_tav.c
@@ -8108,13 +8108,14 @@ static void write_timecode_packet(FILE *output, int frame_num, int fps, int is_n
    fwrite(&packet_type, 1, 1, output);

    // Calculate timecode in nanoseconds
-    // For NTSC (29.97 fps): time = frame_num * 1001000000 / 30000
+    // For NTSC framerates (X000/1001): time = frame_num * 1001 * 1000000000 / (fps * 1000)
    // For other framerates: time = frame_num * 1000000000 / fps
    uint64_t timecode_ns;
    if (is_ntsc_framerate) {
-        // NTSC uses 30000/1001 fps (29.97...)
-        // To avoid floating point: time_ns = frame_num * 1001000000 / 30000
-        timecode_ns = ((uint64_t)frame_num * 1001000000ULL) / 30000ULL;
+        // NTSC framerates use denominator 1001 (e.g., 24000/1001, 30000/1001, 60000/1001)
+        // To avoid floating point: time_ns = frame_num * 1001 * 1e9 / (fps * 1000)
+        // This works for 24fps NTSC (23.976), 30fps NTSC (29.97), 60fps NTSC (59.94), etc.
+        timecode_ns = ((uint64_t)frame_num * 1001ULL * 1000000000ULL) / ((uint64_t)fps * 1000ULL);
    } else {
        // Standard framerate
        timecode_ns = ((uint64_t)frame_num * 1000000000ULL) / (uint64_t)fps;
@@ -10779,7 +10780,8 @@ int main(int argc, char *argv[]) {
        // Update ENDT in extended header (calculate end time for last frame)
        uint64_t endt_ns;
        if (enc->is_ntsc_framerate) {
-            endt_ns = ((uint64_t)(frame_count - 1) * 1001000000ULL) / 30000ULL;
+            // NTSC framerates use denominator 1001 (e.g., 24000/1001, 30000/1001, 60000/1001)
+            endt_ns = ((uint64_t)(frame_count - 1) * 1001ULL * 1000000000ULL) / ((uint64_t)enc->output_fps * 1000ULL);
        } else {
            endt_ns = ((uint64_t)(frame_count - 1) * 1000000000ULL) / (uint64_t)enc->output_fps;
        }
--- a/video_encoder/tav_inspector.c
+++ b/video_encoder/tav_inspector.c
@@ -53,6 +53,7 @@
 #define TAV_PACKET_EXTENDED_HDR   0xEF
 #define TAV_PACKET_LOOP_START     0xF0
 #define TAV_PACKET_LOOP_END       0xF1
+#define TAV_PACKET_SCREEN_MASK    0xF2
 #define TAV_PACKET_GOP_SYNC       0xFC  // GOP sync packet (N frames decoded)
 #define TAV_PACKET_TIMECODE       0xFD
 #define TAV_PACKET_SYNC_NTSC      0xFE
@@ -130,6 +131,7 @@ const char* get_packet_type_name(uint8_t type) {
        case TAV_PACKET_EXTENDED_HDR: return "EXTENDED HEADER";
        case TAV_PACKET_LOOP_START: return "LOOP START";
        case TAV_PACKET_LOOP_END: return "LOOP END";
+        case TAV_PACKET_SCREEN_MASK: return "SCREEN MASK";
        case TAV_PACKET_GOP_SYNC: return "GOP SYNC";
        case TAV_PACKET_TIMECODE: return "TIMECODE";
        case TAV_PACKET_SYNC_NTSC: return "SYNC (NTSC)";
@@ -842,6 +844,23 @@ static const char* VERDESC[] = {"null", "YCoCg tiled, uniform", "ICtCp tiled, un
                }
                break;

+            case TAV_PACKET_SCREEN_MASK:
+                uint32_t frame_number;
+                if (fread(&frame_number, sizeof(uint32_t), 1, fp) != 1) break;
+                uint16_t top;
+                if (fread(&top, sizeof(uint16_t), 1, fp) != 1) break;
+                uint16_t right;
+                if (fread(&right, sizeof(uint16_t), 1, fp) != 1) break;
+                uint16_t bottom;
+                if (fread(&bottom, sizeof(uint16_t), 1, fp) != 1) break;
+                uint16_t left;
+                if (fread(&left, sizeof(uint16_t), 1, fp) != 1) break;
+
+                if (!opts.summary_only && display) {
+                    printf(" - Frame=%u [top=%u, right=%u, bottom=%u, left=%u]", frame_number, top, right, bottom, left);
+                }
+                break;
+
            case TAV_PACKET_SYNC:
                stats.sync_count++;
                break;