#!/usr/bin/env python3
"""midi2taud.py — Convert Standard MIDI (.mid) + SoundFont 2 (.sf2) to TSVM Taud (.taud)

Usage:
    python3 midi2taud.py song.mid soundfont.sf2 [output.taud]
                         [--perc-force-mapping BANK INST]
                         [--rpb N] [--speed N] [--fadeout N]
                         [--bend-epsilon CENTS] [--drum-keyoff]
                         [-v] [--no-project-data]

Behaviour (per midi2taud.md):
  * Pitch bends are preserved as much as possible. A note starting under a
    non-zero bend triggers directly at the bent 4096-TET pitch (Taud notes
    are 4096-TET, so the trigger encodes the exact shifted pitch). Bend
    movement during a note is approximated as linear segments: each segment
    is one row carrying the exact 4096-TET target note plus tone portamento
    (G $xxxx, units/tick) sized to land on the target by row end. Jittery
    curves are simplified via --bend-epsilon (cents). RPN 0,0 pitch-bend
    range messages are honoured; bend values are computed as floats from
    the full 14-bit word (MIDIs that only drive the MSB work transparently).
  * Both MIDI key-off idioms — real note-off messages and note-on with
    velocity 0 — are translated into Taud KEY_OFF. Percussion-channel
    key-offs are dropped by default (GM percussion ignores note-off, and
    emitting them would chop one-shot drum tails); --drum-keyoff re-enables.
  * The SF2 key/velocity sample-layering model is recreated faithfully. Each
    preset's zones are partitioned into the fewest mutually-DISJOINT layers
    (--max-layers cap, default 4); each layer becomes one normal Taud instrument
    with its zones as Ixmp patches (velocity axis round(vel × 63/127)). A preset
    needing >1 layer is emitted as a Metainstrument (terranmon.txt "Metainstrument
    definition"): the note references the meta slot and the engine fans out one
    voice per matching layer, so SF2's simultaneous layering (and detune-stacks)
    now sound — overlapping zones are no longer dropped. Single-layer presets stay
    plain instruments. Stereo SF2 samples are mixed to mono. Unused instruments,
    patches, and samples are trimmed.
  * The SF2 volume-envelope ADSR is preserved on the (instrument-scope) Taud
    volume envelope: delay/attack/hold/decay nodes and a sustain region held
    while the key is on. There is NO release leg — the SF2 *release segment*
    is the Volume Fadeout (with NNA Note Fade): on key-off the voice holds at
    the sustain node and fades to silence over the SF2 releaseVolEnv time
    (measured against the 100 dB envelope floor: releaseVolEnv·(1000−sus_cb)/
    1000 seconds). Per-layer Ixmp patches carry their own fadeout when their
    release differs. The canonical zone's ADSR represents the instrument.
  * Polyphony rides the engine's New Note Action (matching MIDI semantics):
    every instrument (drum kits included) gets NNA = Note Fade, so a voice
    column is reusable the moment its note releases — the release/fade tail
    moves to a background ghost on the next trigger and dies over its own
    release time. Voice budget defaults to 16 columns (--max-voices); overflow
    releases the oldest pedal-held or soonest-ending note early, not cut.
  * Sub-row timing is carried by S $Dx note delays (one row = `--speed`
    ticks, default 6; one beat = `--rpb` rows, default 4 → 1/24-beat grid).
    MIDI tempo changes map to T $xx00 set-tempo effects; channel volume /
    expression (CC7 × CC11) map to M $xx00 channel-volume effects so they
    never disturb the velocity-driven patch selection axis.
"""

import argparse
import array
import bisect
import math
import os
import struct
import sys

from taud_common import (
    set_verbose, vprint,
    TAUD_MAGIC, TAUD_VERSION, TAUD_HEADER_SIZE, TAUD_SONG_ENTRY,
    SAMPLEBIN_SIZE, INSTBIN_SIZE, SAMPLEINST_SIZE, SAMPLE_LEN_LIMIT,
    PATTERN_ROWS, PATTERN_BYTES, NUM_PATTERNS_MAX, NUM_CUES, CUE_SIZE, NUM_VOICES,
    NOTE_NOP, NOTE_KEYOFF, TAUD_C4,
    TOP_G, TOP_M, TOP_S, TOP_T,
    SEL_SET, SEL_FINE,
    CUE_INST_NOP, CUE_INST_HALT,
    resample_linear, encode_cue, deduplicate_patterns, encode_song_entry,
    compress_blob, build_project_data, cue_instruction_len, nearest_minifloat,
    IXMP_PAN_NO_OVERRIDE, atten_cb_to_octet,
)

SIGNATURE = b'midi2taud/TSVM'   # 14 bytes
UNITS_PER_SEMI = 4096.0 / 12.0  # 4096-TET units per 12-TET semitone

# Effect priorities for the shared per-cell effect slot. Higher wins when a
# later pass needs the slot: SD note delays carry trigger timing and are
# never overwritten; T tempo is global and may evict G/M; M only takes free
# slots.
PRIO_FREE  = 0
PRIO_M     = 1
PRIO_PORTA = 2
PRIO_DELAY = 3
PRIO_TEMPO = 4


def key_to_noteval(key: float) -> int:
    """MIDI key (float, 60 = middle C) → Taud 4096-TET noteVal (C4 = 0x5000)."""
    return max(0x20, min(0xFFFF, round(TAUD_C4 + (key - 60.0) * UNITS_PER_SEMI)))


# ── MIDI parser ───────────────────────────────────────────────────────────────

def _read_varlen(data: bytes, pos: int):
    val = 0
    while True:
        b = data[pos]; pos += 1
        val = (val << 7) | (b & 0x7F)
        if not (b & 0x80):
            return val, pos


def _parse_track(data: bytes, pos: int, end: int) -> list:
    """Parse one MTrk body → list of (abs_tick, event_tuple)."""
    evs = []
    tick = 0
    status = 0
    while pos < end:
        delta, pos = _read_varlen(data, pos)
        tick += delta
        if pos >= end:
            break
        b = data[pos]
        if b & 0x80:
            status = b
            pos += 1
        elif status < 0x80:
            vprint(f"  warning: corrupt track data at {pos:#x}, truncating track")
            break

        if status == 0xFF:                       # meta
            mtype = data[pos]; pos += 1
            ln, pos = _read_varlen(data, pos)
            payload = data[pos:pos+ln]; pos += ln
            if mtype == 0x51 and ln >= 3:
                uspq = int.from_bytes(payload[:3], 'big')
                if uspq > 0:
                    evs.append((tick, ('tempo', 60000000.0 / uspq)))
            elif mtype == 0x03:
                txt = payload.decode('latin-1', errors='replace').strip()
                if txt:
                    evs.append((tick, ('title', txt)))
            elif mtype == 0x2F:
                evs.append((tick, ('eot',)))
                break
            status = 0                           # meta cancels running status
        elif status in (0xF0, 0xF7):             # sysex
            ln, pos = _read_varlen(data, pos)
            pos += ln
            status = 0
        else:
            hi = status & 0xF0
            ch = status & 0x0F
            if hi in (0xC0, 0xD0):
                d1 = data[pos]; pos += 1
                if hi == 0xC0:
                    evs.append((tick, ('prog', ch, d1)))
            else:
                d1 = data[pos]; d2 = data[pos+1]; pos += 2
                if hi == 0x90:
                    if d2 > 0:
                        evs.append((tick, ('on', ch, d1, d2)))
                    else:
                        evs.append((tick, ('off', ch, d1)))   # vel-0 idiom
                elif hi == 0x80:
                    evs.append((tick, ('off', ch, d1)))
                elif hi == 0xB0:
                    evs.append((tick, ('cc', ch, d1, d2)))
                elif hi == 0xE0:
                    evs.append((tick, ('bend', ch, (d2 << 7) | d1)))
                # 0xA0 polyphonic aftertouch: ignored
    return evs


def parse_midi(path: str):
    """Returns (division, merged_events). division: ('ppq', tpq) or
    ('smpte', fps, tpf). merged_events: [(tick, seq, event_tuple)] sorted."""
    with open(path, 'rb') as f:
        data = f.read()

    if data[:4] == b'RIFF':                      # RMID wrapper
        pos = 12
        while pos + 8 <= len(data):
            cid = data[pos:pos+4]
            sz  = struct.unpack_from('<I', data, pos+4)[0]
            if cid == b'data':
                data = data[pos+8 : pos+8+sz]
                break
            pos += 8 + sz + (sz & 1)

    if data[:4] != b'MThd':
        sys.exit("error: not a MIDI file (bad MThd magic)")
    hlen = struct.unpack_from('>I', data, 4)[0]
    fmt, ntrk, div = struct.unpack_from('>HHH', data, 8)
    if fmt == 2:
        vprint("  warning: SMF format 2 — tracks merged on a shared timeline")

    if div & 0x8000:
        fps = -struct.unpack_from('b', data, 12)[0]
        tpf = div & 0xFF
        division = ('smpte', fps, tpf)
    else:
        division = ('ppq', max(1, div))

    pos = 8 + hlen
    merged = []
    seq = 0
    tracks_found = 0
    while pos + 8 <= len(data) and tracks_found < ntrk:
        cid = data[pos:pos+4]
        sz  = struct.unpack_from('>I', data, pos+4)[0]
        body_start = pos + 8
        pos = body_start + sz
        if cid != b'MTrk':
            continue
        tracks_found += 1
        for tick, ev in _parse_track(data, body_start, min(pos, len(data))):
            merged.append((tick, seq, ev))
            seq += 1
    merged.sort(key=lambda e: (e[0], e[1]))
    return division, merged


# ── Note / controller extraction ──────────────────────────────────────────────

class Note:
    __slots__ = ('ch', 'key', 'vel', 'start_ft', 'end_ft', 'inst_key',
                 'bend0', 'slot', 'voice', 'drum', 'pedal_ft')
    def __init__(self, ch, key, vel, start_ft, inst_key, bend0):
        self.ch       = ch
        self.key      = key
        self.vel      = vel
        self.start_ft = start_ft
        self.end_ft   = None
        self.inst_key = inst_key
        self.bend0    = bend0
        self.slot     = 0
        self.voice    = -1
        self.drum     = (inst_key[0] == 'd')
        self.pedal_ft = None     # physical key-up time when only the pedal holds it


class _ChState:
    __slots__ = ('bank', 'prog', 'rpn_msb', 'rpn_lsb', 'range_semi',
                 'range_cents', 'cur_bend', 'bend_ft', 'bend_val',
                 'cc7_ft', 'cc7_val', 'cc11_ft', 'cc11_val',
                 'cc10_ft', 'cc10_val', 'sus', 'pending', 'active')
    def __init__(self):
        self.bank = 0
        self.prog = 0
        self.rpn_msb = 0x7F
        self.rpn_lsb = 0x7F
        self.range_semi  = 2
        self.range_cents = 0
        self.cur_bend = 0.0
        self.bend_ft  = [0];   self.bend_val = [0.0]
        self.cc7_ft   = [0];   self.cc7_val  = [100]    # GM default
        self.cc11_ft  = [0];   self.cc11_val = [127]
        self.cc10_ft  = [];    self.cc10_val = []       # empty = never set
        self.sus = False
        self.pending = []        # notes held by the sustain pedal
        self.active  = {}        # key → Note


def _curve_at(fts: list, vals: list, ft: int, default):
    i = bisect.bisect_right(fts, ft) - 1
    return vals[i] if i >= 0 else default


def _curve_push(fts: list, vals: list, ft: int, val):
    if fts and fts[-1] == ft:
        vals[-1] = val
    else:
        fts.append(ft); vals.append(val)


class Song:
    __slots__ = ('notes', 'channels', 'tempo_ft', 'tempo_bpm', 'title', 'end_ft')


def extract_song(division, merged, rpb: int, speed: int) -> Song:
    """Walk merged MIDI events, producing note instances (with both key-off
    idioms resolved to a definite end time), per-channel bend/CC curves, and
    the tempo map — all on the Taud fine-tick (ft) grid where one row =
    `speed` fts and one beat = `rpb` rows."""
    if division[0] == 'ppq':
        tpq = division[1]
        def to_ft(tick):
            return round(tick * rpb * speed / tpq)
    else:
        _, fps, tpf = division
        tps = max(1.0, float(fps * tpf))         # ticks per second
        # SMPTE timing has no musical beats: pin a 120 BPM equivalent grid.
        def to_ft(tick):
            return round((tick / tps) * 2.0 * rpb * speed)
        vprint("  info: SMPTE division — pinned to a 120 BPM-equivalent grid")

    chs = [_ChState() for _ in range(16)]
    notes = []
    tempo_ft, tempo_bpm = [], []
    title = None
    max_ft = 0

    def end_note(n: Note, ft: int):
        if n.end_ft is None:
            n.end_ft = max(ft, n.start_ft)

    for tick, _seq, ev in merged:
        ft = to_ft(tick)
        if ft > max_ft:
            max_ft = ft
        kind = ev[0]

        if kind == 'on':
            _, ch, key, vel = ev
            st = chs[ch]
            prev = st.active.pop(key, None)
            if prev is not None:                 # re-strike: close the old one
                end_note(prev, ft)
            ik = ('d', st.prog) if ch == 9 else ('m', st.bank, st.prog)
            n = Note(ch, key, vel, ft, ik, st.cur_bend)
            st.active[key] = n
            notes.append(n)

        elif kind == 'off':
            _, ch, key = ev
            st = chs[ch]
            n = st.active.pop(key, None)
            if n is not None:
                if st.sus:
                    n.pedal_ft = ft
                    st.pending.append(n)
                else:
                    end_note(n, ft)

        elif kind == 'bend':
            _, ch, val14 = ev
            st = chs[ch]
            # MUST be float maths: 14-bit word (or MSB-only 7-bit source,
            # which simply leaves the low 7 bits zero) → ±range semitones.
            norm  = (float(val14) - 8192.0) / 8192.0
            semis = norm * (st.range_semi + st.range_cents / 100.0)
            st.cur_bend = semis
            _curve_push(st.bend_ft, st.bend_val, ft, semis)

        elif kind == 'cc':
            _, ch, num, val = ev
            st = chs[ch]
            if num == 0:
                st.bank = val
            elif num == 7:
                _curve_push(st.cc7_ft, st.cc7_val, ft, val)
            elif num == 10:
                _curve_push(st.cc10_ft, st.cc10_val, ft, val)
            elif num == 11:
                _curve_push(st.cc11_ft, st.cc11_val, ft, val)
            elif num == 64:
                if val >= 64:
                    st.sus = True
                else:
                    st.sus = False
                    for n in st.pending:
                        end_note(n, ft)
                    st.pending.clear()
            elif num == 100:
                st.rpn_lsb = val
            elif num == 101:
                st.rpn_msb = val
            elif num in (98, 99):                # NRPN deselects RPN
                st.rpn_msb = st.rpn_lsb = 0x7F
            elif num == 6:
                if st.rpn_msb == 0 and st.rpn_lsb == 0:
                    st.range_semi = val
            elif num == 38:
                if st.rpn_msb == 0 and st.rpn_lsb == 0:
                    st.range_cents = val
            elif num in (120, 123):              # all sound / notes off
                for n in list(st.active.values()):
                    end_note(n, ft)
                st.active.clear()
                for n in st.pending:
                    end_note(n, ft)
                st.pending.clear()
            elif num == 121:                     # reset all controllers
                st.cur_bend = 0.0
                _curve_push(st.bend_ft, st.bend_val, ft, 0.0)
                _curve_push(st.cc11_ft, st.cc11_val, ft, 127)
                st.sus = False
                for n in st.pending:
                    end_note(n, ft)
                st.pending.clear()
                st.rpn_msb = st.rpn_lsb = 0x7F

        elif kind == 'prog':
            _, ch, val = ev
            chs[ch].prog = val

        elif kind == 'tempo':
            tempo_ft.append(ft); tempo_bpm.append(ev[1])

        elif kind == 'title':
            if title is None:
                title = ev[1]

    # Close anything still ringing at end-of-file.
    for st in chs:
        for n in list(st.active.values()):
            end_note(n, max_ft)
        st.active.clear()
        for n in st.pending:
            end_note(n, max_ft)
        st.pending.clear()

    dropped = [n for n in notes if n.end_ft <= n.start_ft]
    if dropped:
        vprint(f"  info: dropped {len(dropped)} zero-length note(s)")
    notes = [n for n in notes if n.end_ft > n.start_ft]
    notes.sort(key=lambda n: (n.start_ft, n.ch, n.key))

    song = Song()
    song.notes     = notes
    song.channels  = chs
    song.tempo_ft  = tempo_ft
    song.tempo_bpm = tempo_bpm
    song.title     = title
    song.end_ft    = max_ft
    return song


# ── SF2 parser ────────────────────────────────────────────────────────────────

GEN_START_OFF        = 0
GEN_END_OFF          = 1
GEN_STARTLOOP_OFF    = 2
GEN_ENDLOOP_OFF      = 3
GEN_START_COARSE     = 4
GEN_MODENV2PITCH     = 7      # modEnvToPitch (signed cents at full mod-env)
GEN_FILTERFC         = 8      # initialFilterFc (absolute cents; default 13500 = open)
GEN_FILTERQ          = 9      # initialFilterQ (cB of resonance; default 0)
GEN_MODENV2FILT      = 11     # modEnvToFilterFc (signed cents at full mod-env)
GEN_END_COARSE       = 12
GEN_PAN              = 17
GEN_DELAY_MODENV     = 25
GEN_ATTACK_MODENV    = 26
GEN_HOLD_MODENV      = 27
GEN_DECAY_MODENV     = 28
GEN_SUSTAIN_MODENV   = 29     # 0.1% units of full-scale DECREASE (0..1000)
GEN_RELEASE_MODENV   = 30
GEN_DELAY_VOLENV     = 33
GEN_ATTACK_VOLENV    = 34
GEN_HOLD_VOLENV      = 35
GEN_DECAY_VOLENV     = 36
GEN_SUSTAIN_VOLENV   = 37     # centibels of attenuation, 0..1440
GEN_RELEASE_VOLENV   = 38
GEN_INSTRUMENT       = 41
GEN_KEYRANGE         = 43
GEN_VELRANGE         = 44
GEN_STARTLOOP_COARSE = 45
GEN_INITATTEN        = 48     # initialAttenuation (cB; per-zone static gain)
GEN_ENDLOOP_COARSE   = 50
GEN_COARSETUNE       = 51
GEN_FINETUNE         = 52
GEN_SAMPLEID         = 53
GEN_SAMPLEMODES      = 54
GEN_SCALETUNING      = 56
GEN_ROOTKEY          = 58

_SIGNED_GENS = frozenset({GEN_START_OFF, GEN_END_OFF, GEN_STARTLOOP_OFF,
                          GEN_ENDLOOP_OFF, GEN_START_COARSE, GEN_END_COARSE,
                          GEN_STARTLOOP_COARSE, GEN_ENDLOOP_COARSE,
                          GEN_PAN, GEN_COARSETUNE, GEN_FINETUNE,
                          GEN_DELAY_VOLENV, GEN_ATTACK_VOLENV, GEN_HOLD_VOLENV,
                          GEN_DECAY_VOLENV, GEN_RELEASE_VOLENV,
                          GEN_MODENV2PITCH, GEN_MODENV2FILT,
                          GEN_DELAY_MODENV, GEN_ATTACK_MODENV, GEN_HOLD_MODENV,
                          GEN_DECAY_MODENV, GEN_RELEASE_MODENV,
                          # cB/cents value-generators that are ADDITIVE (and so may be
                          # NEGATIVE) at the preset level. Their instrument-level absolutes
                          # all sit well under 0x8000 (atten≤1440, filterFc≤13500, Q≤960,
                          # sustain≤1440/1000), so reading them signed is lossless there and
                          # correct for relative preset deltas. Without this a preset zone
                          # carrying e.g. initialAttenuation 0xFFFE (a −2 cB boost) was read
                          # as 65534 cB → ~−6575 dB → the whole instrument went silent
                          # (SGM 'Synth Strings 1' vol-env nodes stuck at 0).
                          GEN_INITATTEN, GEN_FILTERFC, GEN_FILTERQ,
                          GEN_SUSTAIN_VOLENV, GEN_SUSTAIN_MODENV})


def _timecents_to_sec(tc: int) -> float:
    """SF2 timecents → seconds (2^(tc/1200)); default -12000 ≈ 1 ms."""
    return 2.0 ** (max(-12000, min(8000, tc)) / 1200.0)


class SFSampleHdr:
    __slots__ = ('name', 'start', 'end', 'loopstart', 'loopend', 'rate',
                 'origkey', 'correction', 'link', 'stype')


class SFZone:
    """One effective preset×instrument zone (post combination)."""
    __slots__ = ('keylo', 'keyhi', 'vello', 'velhi', 'sample', 'rootkey',
                 'tune_cents', 'modes', 'pan', 'scale', 'a_start', 'a_end',
                 'loop_abs_start', 'loop_abs_end', 'pair', 'rate', 'name',
                 'env_delay', 'env_attack', 'env_hold', 'env_decay',
                 'env_sustain_cb', 'env_release',
                 # initialAttenuation (cB static per-zone gain) + static filter.
                 'atten_cb', 'filter_fc', 'filter_q',
                 # modulation envelope (drives pitch and/or filter) + its targets.
                 'm_delay', 'm_attack', 'm_hold', 'm_decay', 'm_sustain_pc',
                 'm_release', 'me2pitch', 'me2filt')


class SF2:
    __slots__ = ('presets', 'shdrs', 'file', 'smpl_off', 'smpl_size')

    def read_frames(self, start_frame: int, n_frames: int) -> array.array:
        """Read n_frames of 16-bit PCM starting at absolute frame index."""
        n_avail = max(0, min(n_frames, self.smpl_size // 2 - start_frame))
        a = array.array('h')
        if n_avail <= 0:
            return a
        self.file.seek(self.smpl_off + start_frame * 2)
        a.frombytes(self.file.read(n_avail * 2))
        if sys.byteorder == 'big':
            a.byteswap()
        return a


def _gen_amount(oper: int, raw: int) -> int:
    if oper in _SIGNED_GENS:
        return raw - 0x10000 if raw >= 0x8000 else raw
    return raw


def _parse_bags(bag_data, gen_data, start_bag, end_bag, terminal_gen):
    """Resolve bags [start_bag, end_bag) into (global_gens, [zone_gens...]).
    Each zone_gens is {oper: amount}; zones lacking the terminal generator
    other than a leading global zone are discarded per the SF2 spec."""
    glob = {}
    zones = []
    n_bags = len(bag_data) // 4
    for bi in range(start_bag, end_bag):
        g0 = struct.unpack_from('<H', bag_data, bi*4)[0]
        g1 = (struct.unpack_from('<H', bag_data, (bi+1)*4)[0]
              if bi + 1 < n_bags else len(gen_data) // 4)
        gens = {}
        for gi in range(g0, min(g1, len(gen_data) // 4)):
            oper, raw = struct.unpack_from('<HH', gen_data, gi*4)
            gens[oper] = _gen_amount(oper, raw)
        if terminal_gen in gens:
            zones.append(gens)
        elif bi == start_bag and not zones:
            glob = gens
    return glob, zones


def parse_sf2(path: str) -> SF2:
    f = open(path, 'rb')
    hdr = f.read(12)
    if hdr[:4] != b'RIFF' or hdr[8:12] != b'sfbk':
        sys.exit("error: not an SF2 file (bad RIFF/sfbk magic)")
    riff_end = 8 + struct.unpack_from('<I', hdr, 4)[0]

    pdta = {}
    smpl_off = smpl_size = 0
    pos = 12
    while pos + 8 <= riff_end:
        f.seek(pos)
        chdr = f.read(8)
        if len(chdr) < 8:
            break
        cid = chdr[:4]
        sz  = struct.unpack_from('<I', chdr, 4)[0]
        if cid == b'LIST':
            ltype = f.read(4)
            inner = pos + 12
            inner_end = pos + 8 + sz
            while inner + 8 <= inner_end:
                f.seek(inner)
                shdr_ = f.read(8)
                scid = shdr_[:4]
                ssz  = struct.unpack_from('<I', shdr_, 4)[0]
                if ltype == b'pdta':
                    pdta[scid.decode('latin-1')] = f.read(ssz)
                elif ltype == b'sdta' and scid == b'smpl':
                    smpl_off, smpl_size = inner + 8, ssz
                inner += 8 + ssz + (ssz & 1)
        pos += 8 + sz + (sz & 1)

    for need in ('phdr', 'pbag', 'pgen', 'inst', 'ibag', 'igen', 'shdr'):
        if need not in pdta:
            sys.exit(f"error: SF2 missing required pdta sub-chunk '{need}'")
    if not smpl_size:
        sys.exit("error: SF2 has no smpl chunk (sample data)")

    sf = SF2()
    sf.file = f
    sf.smpl_off, sf.smpl_size = smpl_off, smpl_size

    shdr_data = pdta['shdr']
    sf.shdrs = []
    for i in range(len(shdr_data) // 46 - 1):    # last record is EOS sentinel
        s = SFSampleHdr()
        off = i * 46
        s.name = shdr_data[off:off+20].split(b'\x00')[0].decode('latin-1',
                                                                errors='replace')
        (s.start, s.end, s.loopstart, s.loopend, s.rate) = \
            struct.unpack_from('<IIIII', shdr_data, off+20)
        s.origkey    = shdr_data[off+40]
        s.correction = struct.unpack_from('b', shdr_data, off+41)[0]
        s.link, s.stype = struct.unpack_from('<HH', shdr_data, off+42)
        if s.rate == 0:
            s.rate = 8363
        sf.shdrs.append(s)

    # Instruments: index → (global_gens, [zone_gens])
    inst_data, ibag, igen = pdta['inst'], pdta['ibag'], pdta['igen']
    n_inst = len(inst_data) // 22 - 1
    inst_zones = []
    for i in range(n_inst):
        b0 = struct.unpack_from('<H', inst_data, i*22 + 20)[0]
        b1 = struct.unpack_from('<H', inst_data, (i+1)*22 + 20)[0]
        inst_zones.append(_parse_bags(ibag, igen, b0, b1, GEN_SAMPLEID))

    # Presets
    phdr, pbag, pgen = pdta['phdr'], pdta['pbag'], pdta['pgen']
    n_pre = len(phdr) // 38 - 1
    sf.presets = {}
    scale_warned = False
    for i in range(n_pre):
        off = i * 38
        pname = phdr[off:off+20].split(b'\x00')[0].decode('latin-1',
                                                          errors='replace')
        preset, bank, bag0 = struct.unpack_from('<HHH', phdr, off+20)
        bag1 = struct.unpack_from('<H', phdr, (i+1)*38 + 24)[0]
        pglob, pzones = _parse_bags(pbag, pgen, bag0, bag1, GEN_INSTRUMENT)

        zones = []
        for pz_raw in pzones:
            pz = dict(pglob); pz.update(pz_raw)
            ii = pz[GEN_INSTRUMENT]
            if not (0 <= ii < n_inst):
                continue
            iglob, izones = inst_zones[ii]
            pk = pz.get(GEN_KEYRANGE, 0x7F00)
            pv = pz.get(GEN_VELRANGE, 0x7F00)
            pklo, pkhi = pk & 0xFF, (pk >> 8) & 0xFF
            pvlo, pvhi = pv & 0xFF, (pv >> 8) & 0xFF
            for iz_raw in izones:
                iz = dict(iglob); iz.update(iz_raw)
                si = iz[GEN_SAMPLEID]
                if not (0 <= si < len(sf.shdrs)):
                    continue
                s = sf.shdrs[si]
                if s.stype & 0x8000:             # ROM sample
                    continue
                ik = iz.get(GEN_KEYRANGE, 0x7F00)
                iv = iz.get(GEN_VELRANGE, 0x7F00)
                klo = max(ik & 0xFF, pklo); khi = min((ik >> 8) & 0xFF, pkhi)
                vlo = max(iv & 0xFF, pvlo); vhi = min((iv >> 8) & 0xFF, pvhi)
                if klo > khi or vlo > vhi:
                    continue

                z = SFZone()
                z.keylo, z.keyhi = klo, khi
                z.vello, z.velhi = vlo, vhi
                z.sample = si
                rk = iz.get(GEN_ROOTKEY, -1)
                z.rootkey = rk if 0 <= rk <= 127 else \
                            (s.origkey if s.origkey <= 127 else 60)
                z.tune_cents = ((iz.get(GEN_COARSETUNE, 0)
                                 + pz.get(GEN_COARSETUNE, 0)) * 100
                                + iz.get(GEN_FINETUNE, 0)
                                + pz.get(GEN_FINETUNE, 0)
                                + s.correction)
                z.modes = iz.get(GEN_SAMPLEMODES, 0) & 3
                z.pan   = max(-500, min(500, iz.get(GEN_PAN, 0)
                                        + pz.get(GEN_PAN, 0)))
                z.scale = iz.get(GEN_SCALETUNING, 100)
                if z.scale != 100 and klo != khi and not scale_warned:
                    vprint("  warning: scaleTuning != 100 on a multi-key zone "
                           "— pitch is exact only at the zone's centre key")
                    scale_warned = True
                # Volume-envelope ADSR (timecents at inst level, preset adds).
                z.env_delay  = _timecents_to_sec(iz.get(GEN_DELAY_VOLENV,  -12000)
                                                 + pz.get(GEN_DELAY_VOLENV,  0))
                z.env_attack = _timecents_to_sec(iz.get(GEN_ATTACK_VOLENV, -12000)
                                                 + pz.get(GEN_ATTACK_VOLENV, 0))
                z.env_hold   = _timecents_to_sec(iz.get(GEN_HOLD_VOLENV,   -12000)
                                                 + pz.get(GEN_HOLD_VOLENV,   0))
                z.env_decay  = _timecents_to_sec(iz.get(GEN_DECAY_VOLENV,  -12000)
                                                 + pz.get(GEN_DECAY_VOLENV,  0))
                z.env_sustain_cb = max(0, min(1440, iz.get(GEN_SUSTAIN_VOLENV, 0)
                                              + pz.get(GEN_SUSTAIN_VOLENV, 0)))
                z.env_release = _timecents_to_sec(iz.get(GEN_RELEASE_VOLENV, -12000)
                                                  + pz.get(GEN_RELEASE_VOLENV, 0))
                # initialAttenuation: per-zone static gain in cB (preset adds to inst).
                # Clamped to the SF2 spec range [0, 1440] so any out-of-range value can
                # never collapse the folded vol-env to silence (see _SIGNED_GENS note).
                z.atten_cb = max(0, min(1440, iz.get(GEN_INITATTEN, 0)
                                        + pz.get(GEN_INITATTEN, 0)))
                # Static low-pass filter. initialFilterFc is absolute cents (default
                # 13500 ≈ open); initialFilterQ is cB of resonance (default 0).
                z.filter_fc = iz.get(GEN_FILTERFC, 13500) + pz.get(GEN_FILTERFC, 0)
                z.filter_q  = max(0, iz.get(GEN_FILTERQ, 0) + pz.get(GEN_FILTERQ, 0))
                # Modulation envelope (drives pitch via modEnvToPitch and/or filter via
                # modEnvToFilterFc). Times are timecents; sustain is 0.1%-of-full DECREASE.
                z.m_delay   = _timecents_to_sec(iz.get(GEN_DELAY_MODENV,  -12000)
                                                + pz.get(GEN_DELAY_MODENV,  0))
                z.m_attack  = _timecents_to_sec(iz.get(GEN_ATTACK_MODENV, -12000)
                                                + pz.get(GEN_ATTACK_MODENV, 0))
                z.m_hold    = _timecents_to_sec(iz.get(GEN_HOLD_MODENV,   -12000)
                                                + pz.get(GEN_HOLD_MODENV,   0))
                z.m_decay   = _timecents_to_sec(iz.get(GEN_DECAY_MODENV,  -12000)
                                                + pz.get(GEN_DECAY_MODENV,  0))
                z.m_sustain_pc = max(0, min(1000, iz.get(GEN_SUSTAIN_MODENV, 0)
                                            + pz.get(GEN_SUSTAIN_MODENV, 0)))
                z.m_release = _timecents_to_sec(iz.get(GEN_RELEASE_MODENV, -12000)
                                                + pz.get(GEN_RELEASE_MODENV, 0))
                z.me2pitch  = iz.get(GEN_MODENV2PITCH, 0) + pz.get(GEN_MODENV2PITCH, 0)
                z.me2filt   = iz.get(GEN_MODENV2FILT,  0) + pz.get(GEN_MODENV2FILT,  0)
                z.a_start = (s.start + iz.get(GEN_START_OFF, 0)
                             + 32768 * iz.get(GEN_START_COARSE, 0))
                z.a_end   = (s.end + iz.get(GEN_END_OFF, 0)
                             + 32768 * iz.get(GEN_END_COARSE, 0))
                z.a_start = max(0, z.a_start)
                z.a_end   = max(z.a_start, min(z.a_end, sf.smpl_size // 2))
                z.loop_abs_start = (s.loopstart + iz.get(GEN_STARTLOOP_OFF, 0)
                                    + 32768 * iz.get(GEN_STARTLOOP_COARSE, 0))
                z.loop_abs_end   = (s.loopend + iz.get(GEN_ENDLOOP_OFF, 0)
                                    + 32768 * iz.get(GEN_ENDLOOP_COARSE, 0))
                z.pair = None
                z.rate = s.rate
                z.name = s.name
                zones.append(z)
        if zones:
            sf.presets[(bank, preset)] = (pname, zones)
    return sf


# ── Preset resolution / Taud instrument building ──────────────────────────────

def resolve_preset(sf: SF2, inst_key, perc_force):
    """inst_key: ('m', bank, prog) or ('d', prog). Returns (name, zones) or None."""
    if inst_key[0] == 'd':
        prog = inst_key[1]
        cands = []
        if perc_force is not None:
            cands.append(tuple(perc_force))
        cands += [(128, prog), (128, 0)]
    else:
        _, bank, prog = inst_key
        cands = [(bank, prog), (0, prog)]
    for c in cands:
        if c in sf.presets:
            return sf.presets[c]
    # Last resort: same program number in any bank, then nothing.
    prog = inst_key[1] if inst_key[0] == 'd' else inst_key[2]
    for (b, p) in sorted(sf.presets):
        if p == prog:
            return sf.presets[(b, p)]
    return None


def merge_stereo_zones(zones: list, shdrs: list) -> list:
    """Collapse L/R zone pairs into single mono zones. Two flavours are merged:
      (1) LINKED stereo — samples are each other's sampleLink with L/R types;
      (2) PAN stereo — two MONO-typed zones with the same key/vel rect and
          opposite hard pan (±500). SGM/Timbres store most "stereo" samples this
          way (e.g. 'VA LGFF C3-L' / '…-R'), NOT as linked L/R.
    The merged zone mixes both channels to mono and drops the pan override.
    Merging is essential: an unmerged R zone fully overlaps its L zone, so the
    disjointify spills it into a SECOND layer that then plays CENTRED alongside
    the L zone — a spurious +6 dB doubling. Lone L/R zones keep their channel."""
    out = []
    used = set()
    for i, z in enumerate(zones):
        if i in used:
            continue
        s = shdrs[z.sample]
        partner = None
        if s.stype in (2, 4) and 0 <= s.link < len(shdrs):
            for j in range(i + 1, len(zones)):
                if j in used:
                    continue
                z2 = zones[j]
                if (z2.sample == s.link
                        and (z2.keylo, z2.keyhi, z2.vello, z2.velhi)
                            == (z.keylo, z.keyhi, z.vello, z.velhi)
                        and z2.modes == z.modes
                        and z2.rootkey == z.rootkey):
                    partner = j
                    break
        if partner is None and z.pan is not None and abs(z.pan) >= 400:
            for j in range(i + 1, len(zones)):
                if j in used:
                    continue
                z2 = zones[j]
                if (z2.sample != z.sample
                        and z2.pan is not None and abs(z2.pan) >= 400
                        and (z.pan < 0) != (z2.pan < 0)        # opposite sides
                        and (z2.keylo, z2.keyhi, z2.vello, z2.velhi)
                            == (z.keylo, z.keyhi, z.vello, z.velhi)
                        and z2.modes == z.modes
                        and z2.rootkey == z.rootkey):
                    partner = j
                    break
        if partner is not None:
            used.add(partner)
            z2 = zones[partner]
            z.pair = (z.sample, z2.sample, z2.a_start)
            z.pan = None                          # mixed to mono → centred
            z.a_end = z.a_start + min(z.a_end - z.a_start,
                                      z2.a_end - z2.a_start)
        out.append(z)
    return out


def _rect_of_zone(z: SFZone):
    """Zone key/vel ranges → Taud (pitch_lo, pitch_hi, vol_lo, vol_hi).
    Pitch bounds sit on half-semitone boundaries so triggers carrying an
    initial pitch bend (< 50 cents) still land inside the right rectangle;
    adjacent zones stay disjoint. Velocity per Ixmp note 5: round(v·63/127)."""
    if z.keylo <= 0:
        plo = 0x0000
    else:
        plo = max(0, min(0xFFFF, round(TAUD_C4 + (z.keylo - 0.5 - 60) * UNITS_PER_SEMI)))
    if z.keyhi >= 127:
        phi = 0xFFFF
    else:
        phi = max(0, min(0xFFFF, round(TAUD_C4 + (z.keyhi + 0.5 - 60) * UNITS_PER_SEMI) - 1))
    vlo = round(z.vello * 63 / 127)
    vhi = round(z.velhi * 63 / 127)
    return (plo, phi, vlo, vhi)


def _rect_subtract(r, k):
    """Pieces of rectangle r not covered by rectangle k (≤ 4 pieces)."""
    p0, p1, v0, v1 = r
    q0, q1, w0, w1 = k
    if p1 < q0 or p0 > q1 or v1 < w0 or v0 > w1:
        return [r]
    pieces = []
    if p0 < q0: pieces.append((p0, q0 - 1, v0, v1))
    if p1 > q1: pieces.append((q1 + 1, p1, v0, v1))
    m0, m1 = max(p0, q0), min(p1, q1)
    if v0 < w0: pieces.append((m0, m1, v0, w0 - 1))
    if v1 > w1: pieces.append((m0, m1, w1 + 1, v1))
    return pieces


class MonoSample:
    """One pooled (deduplicated) mono u8 sample slice."""
    __slots__ = ('pair', 'a_start', 'frames', 'rate', 'name',
                 'data', 'ratio', 'offset', 'loop_native', 'synth_loop', 'synth_decay')
    def __init__(self, z: SFZone):
        self.pair    = z.pair                    # None or (idxL, idxR, b_start)
        self.a_start = z.a_start
        self.frames  = max(0, z.a_end - z.a_start)
        self.rate    = z.rate
        self.name    = z.name
        self.data    = None
        self.ratio   = 1.0
        self.offset  = 0
        # SF2 loop in NATIVE frames (mirrors the Patch loop test), or None when this
        # slice has no loop. Used by build_sample_inst_bin to decide how to fit an
        # over-length sample: a no-loop sample gets a synthesized loop, a looped one
        # is preserved (kept at 32 kHz when its loop fits, else fit-to-cap). Dedup
        # keeps the first zone's loop (same slice ⇒ same loop in practice).
        ls_n = max(0, min(z.loop_abs_start - z.a_start, self.frames))
        le_n = max(0, min(z.loop_abs_end   - z.a_start, self.frames))
        self.loop_native = (ls_n, le_n) if (z.modes in (1, 3) and le_n - ls_n >= 2) else None
        # Set when a too-long, originally UN-looped sample is resampled to the 32 kHz
        # floor and given a synthesized sustain loop (see _synth_sustain_loop): a
        # (loop_start, loop_end) pair in the FINAL output-frame domain (already scaled
        # by every resample) and the seconds over which a peak->0 vol-envelope fades
        # the looped note to silence (_synth_decay_vol_env). When set, the loop points
        # and vol-envelope of EVERY record/patch using this sample are overridden.
        self.synth_loop  = None
        self.synth_decay = None

    def key(self):
        return (self.pair[0], self.pair[1], self.a_start, self.frames) \
            if self.pair else (-1, -1, self.a_start, self.frames)

    def render(self, sf: SF2):
        if self.data is not None:
            return
        n = min(self.frames, 1 << 24)            # hard sanity cap (16M frames)
        if self.pair:
            la = sf.read_frames(self.a_start, n)
            ra = sf.read_frames(self.pair[2], n)
            m  = min(len(la), len(ra))
            self.data = bytes((((la[i] + ra[i]) >> 1) >> 8) + 128 & 0xFF
                              for i in range(m))
        else:
            la = sf.read_frames(self.a_start, n)
            self.data = bytes(((s >> 8) + 128) & 0xFF for s in la)
        self.frames = len(self.data)


class Patch:
    """One Ixmp-patch-to-be: a disjoint rect plus the zone's sample fields."""
    __slots__ = ('rect', 'zone', 'ms', 'loop_start', 'loop_end', 'loop_mode',
                 'detune', 'pan8', 'hits')
    def __init__(self, rect, z: SFZone, ms: MonoSample):
        self.rect = rect
        self.zone = z
        self.ms   = ms
        ls = z.loop_abs_start - z.a_start
        le = z.loop_abs_end   - z.a_start
        nf = max(0, z.a_end - z.a_start)
        ls = max(0, min(ls, nf)); le = max(0, min(le, nf))
        if z.modes in (1, 3) and le - ls >= 2:
            self.loop_mode  = 1 | (0x4 if z.modes == 3 else 0)
            self.loop_start = ls
            self.loop_end   = le
        else:
            self.loop_mode  = 0
            self.loop_start = 0
            self.loop_end   = 0
        # samplingRate = SF2 rate; the rootkey/tuning shift goes into the
        # signed 4096-TET detune so MIDI key 60 always means noteVal 0x5000.
        # scaleTuning (cents per key, 0 = fixed-pitch drums) is folded in
        # around the zone's centre key: exact for single-key zones, exact
        # everywhere when scale = 100.
        k_ref = (z.keylo + z.keyhi) / 2.0
        det = round(((k_ref - z.rootkey) * (z.scale / 100.0)
                     - (k_ref - 60.0)) * UNITS_PER_SEMI
                    + z.tune_cents * 4096.0 / 1200.0)
        self.detune = max(-0x8000, min(0x7FFF, det))
        if z.pan is None:
            self.pan8 = IXMP_PAN_NO_OVERRIDE
        else:
            self.pan8 = max(0, min(255, round(127.5 + z.pan * 255.0 / 1000.0)))
        self.hits = 0

    def to_ixmp_dict(self, canonical, bpm0, fadeout_override):
        r = self.ms.ratio
        # Synthesized-loop samples carry their loop in the final output-frame domain
        # (already resampled) and force a plain forward loop; otherwise the zone's SF2
        # loop scaled by this sample's resample ratio.
        if self.ms.synth_loop is not None:
            ls_w, le_w, lm_w = self.ms.synth_loop[0], self.ms.synth_loop[1], 1
        else:
            ls_w = round(self.loop_start * r)
            le_w = round(self.loop_end   * r)
            lm_w = self.loop_mode
        d = {
            'pitch_start':         self.rect[0],
            'pitch_end':           self.rect[1],
            'volume_start':        self.rect[2],
            'volume_end':          self.rect[3],
            'sample_ptr':          self.ms.offset,
            'sample_length':       min(len(self.ms.data), 0xFFFF),
            'play_start':          0,
            'loop_start':          min(0xFFFF, ls_w),
            'loop_end':            min(0xFFFF, le_w),
            'sampling_rate':       max(1, min(0xFFFF, round(self.ms.rate * r))),
            'sample_detune':       self.detune,
            'loop_mode':           lm_w,
            'default_pan':         self.pan8,
            'default_note_volume': 0,            # no override → base DNV
            'vibrato_speed':       0,
            'vibrato_sweep':       0,
            'vibrato_depth':       0,
            'vibrato_rate':        0,
            'vibrato_waveform':    0xFF,         # no override
        }
        # Per-patch overrides — emitted ONLY when they differ from the canonical
        # zone (whose envelopes/filter live in the base instrument record, which the
        # patch falls through to when a block is absent). This is what gives SF2
        # velocity / key layers their own ADSR + filter while keeping patches lean.
        z, c = self.zone, canonical.zone
        # Effective vol-env: a synthesized-loop sample uses a peak->0 decay (no sustain),
        # else the zone's SF2 ADSR. Emitted only when it differs from the canonical's.
        vol_self  = _effective_vol_env(z, self.ms)
        vol_canon = _effective_vol_env(c, canonical.ms)
        if vol_self != vol_canon:
            d['vol_env'] = vol_self
        # SF-mode filter: mode flag + 16-bit cutoff cents / Q centibels + filter env.
        sf_s, cut_s, res_s, filt_s = _zone_filter_sf(z)
        sf_c, cut_c, res_c, filt_c = _zone_filter_sf(c)
        pit_s = _pitch_env_block(z) if z.me2pitch else None
        pit_c = _pitch_env_block(c) if c.me2pitch else None
        # Emit the 'x' block when filter (mode/cutoff/resonance/env) OR initialAttenuation
        # differs from the canonical (base) zone. initialAttenuation is a per-voice gain (NOT
        # folded into the env); when 'x' is present it carries this patch's atten, else the
        # voice inherits the base record's atten. A differing filter ENV must co-emit 'x'
        # because the env's node ratios scale the patch's OWN peak cutoff (the 'x' cutoff).
        att_s = atten_cb_to_octet(z.atten_cb)
        att_c = atten_cb_to_octet(c.atten_cb)
        # Volume Fadeout = this patch's own SF2 release segment; emit 'x' when it (or any
        # filter / atten field) differs from the canonical zone so the per-layer release
        # time is faithful (an absent 'x' falls through to the base record's fadeout). A
        # synthesized-loop sample disables its key-off fadeout (its decay is the vol-env,
        # which runs from note-on regardless of key state).
        fo_s = 0 if self.ms.synth_loop is not None else _zone_fadeout(z, bpm0, fadeout_override)
        fo_c = 0 if canonical.ms.synth_loop is not None else _zone_fadeout(c, bpm0, fadeout_override)
        filt_differs = (filt_s != filt_c)
        if (sf_s != sf_c or cut_s != cut_c or res_s != res_c or att_s != att_c
                or filt_differs or fo_s != fo_c):
            d['extra'] = {'fadeout':            fo_s,
                          'filter_sf_mode':     sf_s,
                          'default_cutoff':     cut_s,
                          'default_resonance':  res_s,
                          'initial_attenuation': att_s}
        if filt_s is not None and filt_differs:
            d['filter_env'] = filt_s
        if pit_s is not None and pit_s != pit_c:
            d['pitch_env'] = pit_s
        return d


class TaudInstrument:
    __slots__ = ('slot', 'inst_key', 'name', 'patches', 'canonical', 'usable')
    # patches: kept Patch list in zone order, canonical Patch INCLUDED
    # (the Ixmp emitter skips it; the base record carries its fields).


def _rect_overlap(a, b) -> bool:
    """True when two (pitch_lo, pitch_hi, vol_lo, vol_hi) rectangles intersect."""
    p0, p1, v0, v1 = a
    q0, q1, w0, w1 = b
    return not (p1 < q0 or p0 > q1 or v1 < w0 or v0 > w1)


def _partition_layers(zones: list, registry: dict, max_layers: int):
    """Split zones into disjoint layers by ITERATED first-wins disjointify.

    Layer 0 is the classic disjointify result: each zone is rectangle-SUBTRACTED
    against the rects already placed in the layer, so its non-overlapping pieces
    tile in. This is essential — the velocity axis quantises 0..127 → 0..63, so
    adjacent SF2 velocity splits round to ranges that touch/overlap by ~1 unit;
    subtraction absorbs that boundary sliver into the first zone instead of
    spawning a spurious extra layer (which would DOUBLE the level at boundary
    velocities). Only a zone that is *fully* covered by the layer below — SF2's
    real simultaneous layering, detune-stacks, duplicate zones — spills down to
    the next layer, where the same disjointify runs over the spilled set. Returns
    ([ [(rect, zone, ms), …] per layer ], dropped_zone_count)."""
    remaining = []
    for z in zones:
        ms = MonoSample(z)
        if ms.frames < 2:
            continue
        ms = registry.setdefault(ms.key(), ms)
        remaining.append((z, ms))

    layers = []
    while remaining and len(layers) < max_layers:
        kept_rects = []
        layer = []
        spill = []
        for z, ms in remaining:
            pieces = [_rect_of_zone(z)]
            for k in kept_rects:
                pieces = [p2 for p in pieces for p2 in _rect_subtract(p, k)]
                if not pieces:
                    break
            pieces = [p for p in pieces if p[0] <= p[1] and p[2] <= p[3]]
            if not pieces:
                spill.append((z, ms))          # fully overlapped → next layer
                continue
            for p in pieces:
                kept_rects.append(p)
                layer.append((p, z, ms))
        if layer:
            layers.append(layer)
        remaining = spill
    return layers, len(remaining)


def _build_layer_instrument(name: str, items: list, trig: dict):
    """One normal TaudInstrument from a layer's disjoint (rect, zone, ms) items,
    trimmed to patches actually hit by a trigger. None when no patch is hit
    (the layer is silent for the whole song → dropped)."""
    all_patches = [Patch(r, z, ms) for (r, z, ms) in items]
    for (nv, v6), cnt in trig.items():
        for p in all_patches:
            r = p.rect
            if r[0] <= nv <= r[1] and r[2] <= v6 <= r[3]:
                p.hits += cnt
                break
    kept = [p for p in all_patches if p.hits > 0]
    if not kept:
        return None
    ti = TaudInstrument()
    ti.name = name
    ti.patches = kept
    ti.canonical = max(kept, key=lambda p: p.hits)
    ti.usable = True
    ti.slot = 0
    ti.inst_key = None
    return ti


def build_presets(sf: SF2, slot_keys: list, triggers: dict, perc_force,
                  registry: dict, max_layers: int) -> dict:
    """For each preset (inst_key), partition its SF2 zones into disjoint layers
    and build one normal TaudInstrument per layer (trimmed to triggered patches).
    Returns dict[inst_key → (name, [layer TaudInstrument])]. Downstream, a preset
    with >1 layer becomes a Metainstrument; a single-layer preset stays a plain
    instrument. `registry` dedupes MonoSamples across all presets/layers."""
    presets = {}
    for ik in slot_keys:
        res = resolve_preset(sf, ik, perc_force)
        if res is None:
            vprint(f"  warning: no SF2 preset for {ik!r} — its notes are dropped")
            presets[ik] = ('(missing preset)', [])
            continue
        name, zones = res
        zones = merge_stereo_zones(zones, sf.shdrs)
        layer_items, dropped = _partition_layers(zones, registry, max_layers)
        if dropped:
            vprint(f"  warning: '{name}': {dropped} zone(s) exceed the "
                   f"{max_layers}-layer cap and were dropped (raise --max-layers)")
        trig = triggers.get(ik, {})
        layers = [ti for items in layer_items
                  if (ti := _build_layer_instrument(name, items, trig)) is not None]
        if not layers and layer_items:
            # Nothing triggered (out-of-range): keep the single patch nearest the
            # mean trigger pitch so the preset still sounds (matches the old path).
            mean_nv = (sum(nv * c for (nv, _), c in trig.items())
                       / max(1, sum(trig.values()))) if trig else TAUD_C4
            flat = [Patch(r, z, ms) for items in layer_items for (r, z, ms) in items]
            best = min(flat, key=lambda p: abs((p.rect[0] + p.rect[1]) / 2 - mean_nv))
            ti = TaudInstrument()
            ti.name = name; ti.patches = [best]; ti.canonical = best
            ti.usable = True; ti.slot = 0; ti.inst_key = ik
            layers = [ti]
        for ti in layers:
            ti.inst_key = ik
        presets[ik] = (name, layers)
        if layers:
            vprint(f"  preset '{name}': {len(zones)} zone(s) → {len(layers)} layer(s)"
                   + (" → Metainstrument" if len(layers) > 1 else ""))
        else:
            vprint(f"  warning: '{name}': no usable zones — notes dropped")
    return presets


# Metainstrument mix-volume octet for an unmixed layer (159 = 0 dB / unity); the
# converter folds per-zone level/tune into each layer instrument's patches, so the
# meta layers stay neutral. (terranmon.txt "Perceptually Significant Octet …".)
META_UNITY_OCTET = 159


def _layer_bbox(ti: 'TaudInstrument'):
    """Bounding (pitch_lo, pitch_hi, vol_lo, vol_hi) over a layer instrument's kept
    patch rects — the Metainstrument layer's gating rectangle."""
    rs = [p.rect for p in ti.patches]
    return (min(r[0] for r in rs), max(r[1] for r in rs),
            min(r[2] for r in rs), max(r[3] for r in rs))


# ── Sample pool + instrument bin ──────────────────────────────────────────────

def _env_seg_count(t_sec: float) -> int:
    """Number of linear segments to approximate an exponential (linear-dB) ramp of
    `t_sec` seconds. Short ramps keep the old 2-segment shape; long ramps (the 5–20 s
    SF2 decays/releases that a 2-point line collapses badly) get up to 8 segments so
    the curve stays smooth (issue 4)."""
    return max(3, min(8, 2 + round(t_sec / 2.0)))


def _adsr_to_env(z: SFZone):
    """SF2 volume-envelope ADSR → (env_points, sustain_idx, release_sec).

    env_points is up to 25 (value 0..63, minifloat_idx) pairs; each node's
    minifloat encodes the time to the NEXT node (engine interpolates values
    linearly across that span). The envelope carries the delay/attack/hold/decay
    legs and ENDS at the sustain node — there is NO release leg. The engine wraps
    on the sustain node while the key is held (SUSTAIN word); on key-off it holds
    at that terminal node and the Volume Fadeout (emitted with NNA Note Fade) is
    the SF2 *release segment* (see _zone_fadeout). SF2's decay is LINEAR in dB
    (exponential in amplitude); per the SF2 spec decayVolEnv is the full-100dB
    time, truncated by the sustain level. The decay leg is sampled at equal-time
    (= equal-dB) points and emitted as a piecewise-linear-amplitude approximation
    — segment count scales with its duration (issue 4) so multi-second decays
    don't collapse to a 2-point line. release_sec (= SF2 releaseVolEnv) is returned
    only to feed the fadeout calc.
    """
    EPS = 0.004                       # below the minifloat resolution (1/256 s)
    sus_cb = min(z.env_sustain_cb, 1000.0)     # clamp to 100 dB full-scale
    slevel = 10.0 ** (-z.env_sustain_cb / 200.0)
    s63 = max(0, min(63, round(63 * slevel)))
    pts = []                          # (value, delta_sec_to_next)
    if z.env_delay >= EPS:
        pts.append((0, z.env_delay))
    if z.env_attack >= EPS:
        pts.append((0, z.env_attack))
    hold = z.env_hold if z.env_hold >= EPS else 0.0
    # Decay leg: peak (63) → sustain (s63), exponential amplitude over `edec` seconds.
    # The peak node carries the hold time. The final decay node is the sustain node
    # (appended below), so the in-between nodes are f = 1/n .. (n-1)/n.
    if s63 < 63:
        edec = z.env_decay * sus_cb / 1000.0
        if edec >= EPS:
            n = _env_seg_count(edec)
            seg = edec / n
            pts.append((63, hold + seg))                       # peak, held then 1st seg
            for i in range(1, n):                              # f = 1/n .. (n-1)/n
                f = i / n
                v = round(63 * 10.0 ** (-(sus_cb * f) / 200.0))
                pts.append((max(s63, min(63, v)), seg))
        else:
            pts.append((63, hold))
    sustain_idx = len(pts)            # the node appended next is the sustain node
    rel = z.env_release
    # No release leg: the sustain node is the terminal node. While the key is held the
    # engine wraps on it (SUSTAIN word); after key-off it holds there and the Volume
    # Fadeout (NNA Note Fade) performs the SF2 release segment (see _zone_fadeout). A
    # zero sustain leaves a terminal 0 node, so the engine retires the voice naturally
    # at the end of decay.
    pts.append((s63, 0.0))            # sustain node = terminator
    env = [(v, nearest_minifloat(d)) for v, d in pts[:25]]
    while len(env) < 25:
        env.append((env[-1][0], 0))
    return env, min(sustain_idx, 24), rel


# Envelope LOOP-word bits (terranmon.txt base byte 15/17/19).
ENV_PRESENT_BIT = 0x2000          # P — envelope present in source (LOOP-word bit 13)
ENV_SUS_ENABLE  = 0x0020          # b — enable the SUSTAIN wrap (SUSTAIN-word bit 5)
ENV_PF_FILTER   = 0x0080          # m — pitch/filter LOOP-word bit 7 (1 = filter)


def _atten_gain(atten_cb: float) -> float:
    """SF2 initialAttenuation (cB) → linear amplitude multiplier (≤ 1.0)."""
    return 10.0 ** (-max(0.0, atten_cb) / 200.0)


def _vol_env_block(z: SFZone):
    """Taud volume-envelope block dict from a zone's SF2 ADSR — the PURE ADSR shape
    at full 0..63 resolution. initialAttenuation is NO LONGER folded into the node
    peak (it would crush a heavily-attenuated env to peak ~3 and zero its tail, e.g.
    SGM 'Fantasia'); it is now carried as a separate per-voice gain — base record
    bytes 251-252 / Ixmp 'x' block initialAttenuation — applied in the mixer. Returns
    (block_dict, sustain_idx, release_sec)."""
    env, sidx, rel = _adsr_to_env(z)
    nodes = [(max(0, min(63, v)), mf) for (v, mf) in env]
    sustain = ENV_SUS_ENABLE | ((sidx & 0x1F) << 8) | (sidx & 0x1F)
    return {'loop': ENV_PRESENT_BIT, 'sustain': sustain, 'nodes': nodes}, sidx, rel


# SF2 initialFilterFc default ≈ 13500 cents (~20 kHz) means "no filter / fully open".
SF2_FILTER_OPEN_CENTS = 13500
# Taud SF-mode "filter off" sentinel for the 16-bit cutoff/resonance fields.
SF_FILTER_OFF = 0xFFFF


def _zone_filter_sf(z: SFZone):
    """Resolve a zone's filter into Taud SF-mode parameters.

    Taud SF mode (base byte 173 bit 4 / patch 'x' flag) stores the cutoff as
    SoundFont **absolute cents** and resonance as **centibels above DC gain** —
    the engine computes freq = 8.176·2^(cents/1200) and dmpfac = 10^(−Qcb/200),
    so there is no ImpulseTracker ~5 kHz cutoff ceiling. When the zone has a
    modulation envelope driving the cutoff, the stored cutoff is the PEAK the
    envelope reaches and the filter-env nodes scale it back down (see
    [_filter_env_block_sf]); the engine's `currentCutoff = baseCut · envValue`
    then reproduces the SF2 sweep exactly (linear-in-cents = the right log-Hz
    sweep).

    Returns (sf_mode, cutoff16, resonance16, filter_env_block_or_None).
    sf_mode False → no filter (IT-mode 'off')."""
    base_fc = z.filter_fc
    amt     = z.me2filt
    has_static = base_fc < SF2_FILTER_OPEN_CENTS
    has_env    = bool(amt)
    if not has_static and not has_env:
        return False, SF_FILTER_OFF, SF_FILTER_OFF, None
    peak = max(1, min(0xFFFE, round(base_fc + max(0, amt))))   # engine baseCut
    qcb  = max(0, min(0xFFFE, round(z.filter_q)))              # cB above DC gain
    env  = _filter_env_block_sf(z, base_fc, amt, peak) if has_env else None
    return True, peak, qcb, env


def _filter_env_block_sf(z: SFZone, base_fc: float, amt: float, peak: int) -> dict:
    """Filter envelope in SF-cents domain. Each node value = cutoff_cents(u)/peak·255
    following the SF2 modulation-envelope DAHDSR (u walks 0→1→sustain), where
    cutoff_cents(u) = base_fc + amt·u. 0xFF (255) = fully open at `peak`; the
    release returns to the base cutoff. The engine multiplies `peak` (= baseCut)
    by node/255 each tick, so the node ratios reproduce the SF2 cutoff sweep."""
    EPS   = 0.004
    sus_u = 1.0 - z.m_sustain_pc / 1000.0          # mod-env sustain level (0..1)

    def nodeval(u: float) -> int:
        cents = base_fc + amt * u
        return max(0, min(255, round(255.0 * cents / peak)))

    pts = []                                        # (value_byte, secs_to_next)
    if z.m_delay >= EPS:
        pts.append((nodeval(0.0), z.m_delay))
    pts.append((nodeval(0.0), z.m_attack if z.m_attack >= EPS else 0.0))
    hold = z.m_hold if z.m_hold >= EPS else 0.0
    if sus_u < 1.0 and z.m_decay >= EPS:
        pts.append((nodeval(1.0), hold + z.m_decay))
        sustain_idx = len(pts)
        pts.append((nodeval(sus_u), z.m_release if z.m_release >= EPS else 0.0))
    else:
        pts.append((nodeval(1.0), hold))
        sustain_idx = len(pts) - 1
    pts.append((nodeval(0.0), 0.0))                 # release returns to base cutoff
    nodes = [(v, nearest_minifloat(d)) for v, d in pts[:25]]
    while len(nodes) < 25:
        nodes.append((nodes[-1][0], 0))
    sustain_idx = min(sustain_idx, 24)
    loop    = ENV_PRESENT_BIT | ENV_PF_FILTER       # m-bit set = filter role
    sustain = ENV_SUS_ENABLE | ((sustain_idx & 0x1F) << 8) | (sustain_idx & 0x1F)
    return {'loop': loop, 'sustain': sustain, 'nodes': nodes}


def _zone_fadeout(z: SFZone, bpm0: int, fadeout_override) -> int:
    """Volume Fadeout step encoding the zone's SF2 release segment (gen 38,
    releaseVolEnv). With NNA Note Fade the fadeout IS the release: on key-off the
    voice holds at the sustain level and fades linearly to silence. The SF2 release
    ramps a constant 100 dB per `releaseVolEnv` seconds (spec sfspec24.txt:1934-1941
    — "until 100dB attenuation were reached"), so the time from the sustain level
    (sus_cb cB of attenuation) down to the 100 dB floor is
    releaseVolEnv·(1000−sus_cb)/1000. fadeStep makes the fadeout complete in that
    wall-clock time at bpm0: the engine subtracts fadeStep/1024 of unit volume per
    song tick, and the tick rate is bpm0·2/5 Hz, giving fadeStep = 2560/(fade_sec·bpm0)."""
    if fadeout_override is not None:
        return min(0xFFF, max(0, fadeout_override))
    sus_cb   = min(max(0.0, z.env_sustain_cb), 1000.0)
    fade_sec = max(0.02, z.env_release * (1000.0 - sus_cb) / 1000.0)
    return max(1, min(0xFFF, round(2560.0 / (fade_sec * bpm0))))


def _extra_block(z: SFZone, bpm0: int, fadeout_override) -> dict:
    """The 'x' block: release-segment fadeout + SF-mode static cutoff/resonance + filter mode."""
    sf_mode, cut16, res16, _ = _zone_filter_sf(z)
    return {'fadeout':            _zone_fadeout(z, bpm0, fadeout_override),
            'filter_sf_mode':     sf_mode,
            'default_cutoff':     cut16,
            'default_resonance':  res16}


def _pitch_env_block(z: SFZone) -> dict:
    """Pitch ('P') envelope block from the SF2 modulation envelope (DAHDSR),
    scaled by modEnvToPitch. Engine value mapping (byte/255; 0.5 = 0x80 = unity):
    envValue 1.0 → +16 semitones, so value = 0.5 + semis/32. The mod-env is
    unipolar 0→1; release returns to unity (0x80). (Filter envelopes are built
    separately in cents domain by [_filter_env_block_sf].)"""
    EPS = 0.004
    amount_cents = z.me2pitch
    sus_lvl = 1.0 - z.m_sustain_pc / 1000.0          # mod-env sustain level (0..1)

    def mapval(u: float) -> int:
        val = 0.5 + (amount_cents * u / 100.0) / 32.0
        return max(0, min(255, round(255 * max(0.0, min(1.0, val)))))

    pts = []                                          # (value_byte, secs_to_next)
    if z.m_delay >= EPS:
        pts.append((mapval(0.0), z.m_delay))
    pts.append((mapval(0.0), z.m_attack if z.m_attack >= EPS else 0.0))
    hold = z.m_hold if z.m_hold >= EPS else 0.0
    if sus_lvl < 1.0 and z.m_decay >= EPS:
        pts.append((mapval(1.0), hold + z.m_decay))
        sustain_idx = len(pts)
        pts.append((mapval(sus_lvl), z.m_release if z.m_release >= EPS else 0.0))
    else:
        pts.append((mapval(1.0), hold))
        sustain_idx = len(pts) - 1
    pts.append((mapval(0.0), 0.0))                    # release returns to unity (0x80)
    nodes = [(v, nearest_minifloat(d)) for v, d in pts[:25]]
    while len(nodes) < 25:
        nodes.append((nodes[-1][0], 0))
    sustain_idx = min(sustain_idx, 24)
    loop = ENV_PRESENT_BIT                            # m-bit clear = pitch role
    sustain = ENV_SUS_ENABLE | ((sustain_idx & 0x1F) << 8) | (sustain_idx & 0x1F)
    return {'loop': loop, 'sustain': sustain, 'nodes': nodes}


def _zone_pf_envs(z: SFZone):
    """Return (filter_env_block_or_None, pitch_env_block_or_None) for a zone's
    modulation envelope. SF2's single mod-env can drive both targets at once;
    the filter leg is built in SF-cents domain (see [_zone_filter_sf])."""
    _, _, _, filt = _zone_filter_sf(z)
    pit = _pitch_env_block(z) if z.me2pitch else None
    return filt, pit


# ── SF2 long-sample resampling + synthesized sustain loop ─────────────────────
#
# Per-sample handling when a rendered MonoSample exceeds the 65535-frame u16 cap
# (terranmon.txt sample_length is u16). Two strategies, by the rate that fitting
# the WHOLE sample into 65535 frames would leave:
#   (1)/(2) rate >= SF2_RESAMPLE_FLOOR_HZ  → downsample the whole sample to 65535
#           frames (quality stays acceptable, full sample preserved).
#   (3)     rate <  SF2_RESAMPLE_FLOOR_HZ  → resample to the 32 kHz floor instead
#           (keeps full bandwidth), keep the first 65535 frames, and — when the
#           sample has NO loop of its own — synthesize a near-seamless forward
#           loop near the end so held notes keep sounding, plus a peak->0 decay
#           vol-envelope (see _synth_decay_vol_env) that retires the voice
#           ~SF2_SYNTH_DECAY_SEC after the note fires.
SF2_RESAMPLE_FLOOR_HZ = 32000        # TSVM native audio rate (= full-bandwidth floor)
SF2_SYNTH_DECAY_SEC   = 10.0         # looped-note fade-to-silence span (from note-on)
SF2_LOOP_HINT         = 8192         # spec's "last 8192 samples" → MAX loop period searched
SF2_LOOP_MIN_PERIOD   = 512          # min loop period (avoid buzzy ultra-short loops)
SF2_LOOP_MATCH_WIN    = 256          # forward-window length used to score a loop seam
SF2_LOOP_MATCH_STEP   = 2            # stride within the match window (speed/quality trade)
SF2_LOOP_COARSE_STEP  = 32           # period stride for the coarse search pass


def _synth_sustain_loop(data: bytes, cap: int, hint: int):
    """Pick a near-seamless forward loop near the end of a resampled, originally
    UN-looped sample, and truncate it to <= `cap` frames. Returns
    (body, loop_start, loop_end) with the loop region [loop_start, loop_end)
    (loop_end exclusive — matches the engine's mode-1 wrap, AudioAdapter.kt:2126).

    The loop is chosen by minimising the sum-of-squared-difference between the
    W-frame windows that FOLLOW loop_start and loop_end. Forward playback wraps
    loop_end -> loop_start, so matching data[loop_start+k] ~= data[loop_end+k]
    makes the post-wrap texture continue the pre-wrap texture seamlessly (the k=0
    term also matches the immediate seam value). `hint` (the spec's "last 8192
    samples") is the MAXIMUM loop period searched, NOT taken at face value: the
    analysis settles on the smoothest-looping period in [SF2_LOOP_MIN_PERIOD, hint]
    via a coarse sweep refined locally."""
    keep = min(len(data), cap)
    W    = SF2_LOOP_MATCH_WIN
    # loop_end sits W frames before the kept end so the forward match window
    # [loop_end, loop_end + W) stays within the data.
    loop_end = keep - W
    p_max = min(hint, loop_end)
    p_min = min(SF2_LOOP_MIN_PERIOD, p_max)
    if loop_end <= p_min:                      # too short to loop (not expected in case 3)
        return data[:keep], max(0, keep - 2), keep

    def seam_err(ls: int) -> int:
        s  = 0
        le = loop_end
        for k in range(0, W, SF2_LOOP_MATCH_STEP):
            d = data[ls + k] - data[le + k]
            s += d * d
        return s

    best_p = p_min
    best_e = seam_err(loop_end - best_p)
    p = p_min + SF2_LOOP_COARSE_STEP
    while p <= p_max:
        e = seam_err(loop_end - p)
        if e < best_e:
            best_e, best_p = e, p
        p += SF2_LOOP_COARSE_STEP
    lo = max(p_min, best_p - SF2_LOOP_COARSE_STEP)
    hi = min(p_max, best_p + SF2_LOOP_COARSE_STEP)
    for p in range(lo, hi + 1):
        e = seam_err(loop_end - p)
        if e < best_e:
            best_e, best_p = e, p

    loop_start = max(0, min(loop_end - 2, loop_end - best_p))
    return data[:keep], loop_start, loop_end


def _synth_decay_vol_env(decay_sec: float) -> dict:
    """Volume-envelope block for a synthesized-loop sample: an immediate peak that
    decays exponentially (linear-dB) to silence over `decay_sec`, with NO sustain
    or loop wrap. The looped sample would otherwise sound forever; this envelope
    fades it from the instant the note fires and — because there is no wrap
    (resolveEnvWrap returns range (-1,-1)) — the engine's fall-through
    'envelope ends at 0 => cut' rule (AudioAdapter.kt:1693/1701) retires the voice
    once it reaches the terminal 0 node, ~decay_sec after firing, regardless of
    key state. The drop spans the representable 63->1 range (~36 dB); the final
    node is a true 0 terminator."""
    DROP_CB = 360.0                            # 63 -> 1 fills the whole decay span
    n   = _env_seg_count(decay_sec)
    seg = decay_sec / n
    pts = [(63, seg)]                          # peak, held one segment then decays
    for i in range(1, n):
        v = round(63 * 10.0 ** (-(DROP_CB * (i / n)) / 200.0))
        pts.append((max(1, min(63, v)), seg))
    pts.append((0, 0.0))                       # terminal 0 node => fall-through cut
    nodes = [(v, nearest_minifloat(d)) for v, d in pts[:25]]
    while len(nodes) < 25:
        nodes.append((0, 0))
    return {'loop': ENV_PRESENT_BIT, 'sustain': 0, 'nodes': nodes}


def _effective_vol_env(z: SFZone, ms: 'MonoSample') -> dict:
    """Volume-envelope block for a (zone, sample): a synthesized-loop sample fades
    from note-on via a peak->0 decay (no sustain), overriding the SF2 ADSR;
    otherwise the zone's SF2 ADSR shape (_vol_env_block)."""
    if ms is not None and ms.synth_decay is not None:
        return _synth_decay_vol_env(ms.synth_decay)
    blk, _, _ = _vol_env_block(z)
    return blk


def build_sample_inst_bin(sf: SF2, pool: list, layer_insts: list, meta_records: list,
                          fadeout_override, bpm0: int):
    """Render & pool every used MonoSample (with the 65535-byte per-sample
    and 8 MB global caps), write the 256-byte normal-instrument records for every
    layer instrument, then the Metainstrument records. Returns the raw
    SAMPLEINST_SIZE image."""
    for ms in pool:
        ms.render(sf)

    # Per-sample u16 cap. A sample over the 65535-frame limit is shrunk one of two
    # ways (see the SF2 long-sample section above): downsample the whole thing when
    # that keeps the rate >= 32 kHz; otherwise resample to the 32 kHz floor, keep the
    # first 65535 frames and synthesize a sustain loop + decay (only when the sample
    # has no loop of its own — a sample with an SF2 loop is left to fall-through, as
    # its loop already lets it sustain within whatever frames fit).
    for ms in pool:
        native_len = len(ms.data)
        if native_len <= SAMPLE_LEN_LIMIT:
            continue
        r_fit    = SAMPLE_LEN_LIMIT / native_len
        rate_fit = ms.rate * r_fit
        r32      = SF2_RESAMPLE_FLOOR_HZ / ms.rate
        # loop_end in 32 kHz frames (0 when unlooped) decides whether a 32 kHz render
        # still contains the loop within the 65535-frame cap.
        le32 = round(ms.loop_native[1] * r32) if ms.loop_native else 0

        def _fit_whole():
            """(1)/(2) downsample the WHOLE sample to <= 65535 frames. Used when the
            fitted rate stays >= 32 kHz, or as the fall-back for a looped sample whose
            loop sits past the cap at 32 kHz (only fit-to-cap keeps that far loop)."""
            ms.data   = resample_linear(ms.data, r_fit)
            ms.ratio *= len(ms.data) / native_len

        if rate_fit >= SF2_RESAMPLE_FLOOR_HZ:
            _fit_whole()
            vprint(f"  info: '{ms.name}' {native_len} frames > 64K cap; "
                   f"resampling by {r_fit:.4f} (rate {rate_fit:.0f} Hz)")
        elif ms.loop_native is None:
            # (3) No loop: resample to the 32 kHz floor (full bandwidth), keep the first
            # 65535 frames and synthesize a near-seamless sustain loop near the end, plus
            # a peak->0 decay vol-envelope that fades the looped note to silence from
            # note-on (the SF2 sample stops on its own otherwise; a loop would ring).
            resampled = resample_linear(ms.data, r32)
            ms.ratio *= len(resampled) / native_len    # effective rate -> 32 kHz
            ms.data   = resampled
            body, ls, le = _synth_sustain_loop(ms.data, SAMPLE_LEN_LIMIT, SF2_LOOP_HINT)
            ms.data        = body
            ms.synth_loop  = (ls, le)
            ms.synth_decay = SF2_SYNTH_DECAY_SEC
            vprint(f"  info: '{ms.name}' {native_len} frames > 64K cap, long & unlooped; "
                   f"32 kHz, kept {len(body)} frames, synth loop [{ls}..{le}] "
                   f"+ {SF2_SYNTH_DECAY_SEC:.0f}s decay")
        elif le32 <= SAMPLE_LEN_LIMIT - 2:
            # (3) Looped, and the loop fits at the 32 kHz floor: resample to 32 kHz and
            # keep the first 65535 frames. The per-patch loop points (native * ratio)
            # land within the kept data, so the SF2 loop + ADSR are preserved at full
            # bandwidth (a sustain-loop release tail past loop_end is truncated to fit).
            resampled = resample_linear(ms.data, r32)
            ms.ratio *= len(resampled) / native_len
            ms.data   = resampled[:SAMPLE_LEN_LIMIT]
            vprint(f"  info: '{ms.name}' {native_len} frames > 64K cap, long & looped; "
                   f"32 kHz, kept first {len(ms.data)} frames (loop_end {le32})")
        else:
            # (3) Looped but the loop sits past the 65535-frame cap at 32 kHz (a far-end
            # sustain loop on a multi-second sample): the floor rate can't hold it, so
            # downsample the whole sample to fit — the ratio-scaled loop stays valid,
            # at a sub-32 kHz rate. (This is the pre-existing fit-to-cap behaviour.)
            _fit_whole()
            vprint(f"  info: '{ms.name}' {native_len} frames > 64K cap, long, looped, "
                   f"far loop; fit-to-cap by {r_fit:.4f} (rate {ms.rate * r_fit:.0f} Hz)")

    # Global 8 MB pool cap. Resamples every sample down equally; synthesized loop
    # points ride the same ratio so the loop stays valid in the shrunken data.
    total = sum(len(ms.data) for ms in pool)
    if total > SAMPLEBIN_SIZE:
        g = SAMPLEBIN_SIZE / total
        vprint(f"  info: sample pool overflow ({total} bytes); "
               f"resampling all by {g:.4f}")
        for ms in pool:
            old = len(ms.data)
            ms.data = resample_linear(ms.data, g)
            ms.ratio *= len(ms.data) / old
            if ms.synth_loop is not None:
                le = min(len(ms.data) - 1, round(ms.synth_loop[1] * g))
                ls = max(0, min(le - 2, round(ms.synth_loop[0] * g)))
                ms.synth_loop = (ls, le)

    sample_bin = bytearray(SAMPLEBIN_SIZE)
    pos = 0
    for ms in pool:
        n = min(len(ms.data), SAMPLEBIN_SIZE - pos)
        if n < len(ms.data):
            vprint(f"  warning: pool full, truncating '{ms.name}'")
            ms.data = ms.data[:n]
            if ms.synth_loop is not None:        # keep the synthesized loop inside the data
                le = min(n - 1, ms.synth_loop[1])
                ms.synth_loop = (max(0, min(le - 2, ms.synth_loop[0])), le)
        sample_bin[pos:pos+n] = ms.data
        ms.offset = pos
        pos += n
    vprint(f"  sample pool: {len(pool)} sample(s), {pos} bytes")

    inst_bin = bytearray(INSTBIN_SIZE)
    for ti in layer_insts:
        if not ti.usable:
            continue
        c  = ti.canonical
        ms = c.ms
        r  = ms.ratio
        base = ti.slot * 256
        struct.pack_into('<I', inst_bin, base + 0, ms.offset)
        struct.pack_into('<H', inst_bin, base + 4, min(len(ms.data), 0xFFFF))
        struct.pack_into('<H', inst_bin, base + 6,
                         max(1, min(0xFFFF, round(ms.rate * r))))
        struct.pack_into('<H', inst_bin, base + 8, 0)            # play start
        # Synthesized-loop samples carry their loop in the final output-frame domain
        # (already scaled by every resample) and force a plain forward loop (mode 1);
        # otherwise the canonical zone's SF2 loop, scaled by this sample's ratio.
        if ms.synth_loop is not None:
            ls_w, le_w, lm_w = ms.synth_loop[0], ms.synth_loop[1], 1
        else:
            ls_w = round(c.loop_start * r)
            le_w = round(c.loop_end   * r)
            lm_w = c.loop_mode
        struct.pack_into('<H', inst_bin, base + 10, min(0xFFFF, ls_w))
        struct.pack_into('<H', inst_bin, base + 12, min(0xFFFF, le_w))
        inst_bin[base + 14] = lm_w

        def wenv(loop_off, sus_off, nodes_off, blk):
            struct.pack_into('<H', inst_bin, base + loop_off, blk['loop'] & 0xFFFF)
            struct.pack_into('<H', inst_bin, base + sus_off,  blk['sustain'] & 0xFFFF)
            nodes = list(blk['nodes'])
            for k in range(25):
                v, mf = nodes[k] if k < len(nodes) else (nodes[-1][0] if nodes else 0, 0)
                inst_bin[base + nodes_off + k*2]     = v & 0xFF
                inst_bin[base + nodes_off + k*2 + 1] = mf & 0xFF

        # Volume envelope from the canonical zone's SF2 ADSR (delay/attack/hold/decay,
        # single-node sustain held while key is on). There is NO release leg: on key-off
        # the voice holds at the sustain node and the Volume Fadeout (NNA Note Fade) is
        # the SF2 release segment (see _zone_fadeout). initialAttenuation is carried
        # separately (byte 251 / 'x' octet), not folded into the node peak. Non-canonical
        # zones with a different ADSR carry their own per-patch vol_env (see
        # Patch.to_ixmp_dict); the base record is the canonical / fall-through. A
        # synthesized-loop sample instead uses a peak->0 decay envelope (no sustain) so
        # its otherwise-infinite loop fades to silence ~SF2_SYNTH_DECAY_SEC after firing.
        wenv(15, 189, 21, _effective_vol_env(c.zone, ms))
        # Pan envelope: none (default unity nodes; P bit clear in LOOP word).
        struct.pack_into('<H', inst_bin, base + 17, 0)
        for k in range(25):
            inst_bin[base + 71 + k*2] = 0x80
        # Pitch/filter envelopes — SEPARATE, fixed slots (issue 2): slot #1 (bytes
        # 19/121) is the FILTER envelope (m-bit set), defaulting flat to 0xFF
        # (fully OPEN — the engine's filter-env neutral, since currentCutoff =
        # baseCut·envValue and 1.0 = open); slot #2 (bytes 197/201) is the PITCH
        # envelope (m-bit clear), defaulting flat to 0x80 (unity, no transpose). A
        # flat slot keeps its LOOP word at 0 (P-bit clear) so the engine ignores it.
        sf_mode, cut16, res16, filt_env = _zone_filter_sf(c.zone)
        pit_env = _pitch_env_block(c.zone) if c.zone.me2pitch else None
        for k in range(25):
            inst_bin[base + 121 + k*2] = 0xFF                    # filter-env (slot 1) flat = open
            inst_bin[base + 201 + k*2] = 0x80                    # pitch-env (slot 2) flat = unity
        if filt_env is not None:
            wenv(19, 193, 121, filt_env)
        if pit_env is not None:
            wenv(197, 199, 201, pit_env)

        # Volume Fadeout = the SF2 release segment (NNA Note Fade below). Derived from
        # the canonical zone's releaseVolEnv against the 100 dB envelope floor; see
        # _zone_fadeout for the timecent→step derivation. A synthesized-loop sample
        # disables the key-off fadeout (its decay is the vol-envelope, which runs from
        # note-on regardless of key state) so key-off does not cut it short.
        fo = 0 if ms.synth_loop is not None else _zone_fadeout(c.zone, bpm0, fadeout_override)
        inst_bin[base + 171] = 0xFF                              # IGV (unit)
        inst_bin[base + 172] = fo & 0xFF
        # byte 173: bits 0-3 = fadeout high nibble, bit 4 = SF filter mode (cutoff/resonance
        # are 16-bit SoundFont cents/centibels in bytes 182<<8|252 / 183<<8|253).
        inst_bin[base + 173] = ((fo >> 8) & 0x0F) | (0x10 if sf_mode else 0)
        inst_bin[base + 177] = (0x80 if c.pan8 == IXMP_PAN_NO_OVERRIDE
                                else c.pan8)                     # default pan
        struct.pack_into('<H', inst_bin, base + 178, TAUD_C4)    # PPC
        inst_bin[base + 182] = (cut16 >> 8) & 0xFF               # cutoff high (SF cents / IT byte)
        inst_bin[base + 252] = cut16 & 0xFF                      # cutoff low  (SF mode)
        inst_bin[base + 183] = (res16 >> 8) & 0xFF               # resonance high
        inst_bin[base + 253] = res16 & 0xFF                      # resonance low (SF mode)
        struct.pack_into('<H', inst_bin, base + 184, c.detune & 0xFFFF)
        # NNA = Note Fade (0b11) for every instrument, drum kits included. On any
        # key-off the voice holds at the sustain node and the Volume Fadeout performs
        # the SF2 release segment; when a fresh note displaces this voice the engine
        # ghosts it and starts the same fadeout, so released/displaced notes always
        # die over their own release time. (Supersedes the old melodic Key-Lift /
        # drum Continue split — the release now lives in the fadeout, not env nodes.)
        inst_bin[base + 186] = 0b11
        inst_bin[base + 196] = 255                               # default note vol
        # initialAttenuation (byte 251, dB-table octet) — the canonical zone's static gain,
        # applied per-voice by the mixer (no longer folded into the vol-env). Per-patch zones
        # with a different attenuation carry their own octet in the Ixmp 'x' block.
        inst_bin[base + 251] = atten_cb_to_octet(c.zone.atten_cb) & 0xFF

    # Metainstrument records: a 0xFFFF-sentinel sample pointer (high 16 bits) plus a
    # layer table (terranmon.txt "Metainstrument definition"). Layers stay neutral
    # (unity mix, zero detune); per-zone level/tune already live in each layer
    # instrument's patches. The note references the meta slot; the engine fans out.
    for meta_slot, _name, layer_descs in meta_records:
        base = meta_slot * 256
        inst_bin[base + 0] = 0                                  # type 0 = layered
        inst_bin[base + 1] = len(layer_descs) & 0xFF            # layer count
        inst_bin[base + 2] = 0xFF; inst_bin[base + 3] = 0xFF    # identifier (hi 16 bits)
        o = base + 4
        for layer_slot, rect in layer_descs:
            plo, phi, vlo, vhi = rect
            inst_bin[o]     = layer_slot & 0xFF
            inst_bin[o + 1] = META_UNITY_OCTET
            struct.pack_into('<h', inst_bin, o + 2, 0)          # sample detune (neutral)
            struct.pack_into('<H', inst_bin, o + 4, plo & 0xFFFF)
            struct.pack_into('<H', inst_bin, o + 6, phi & 0xFFFF)
            inst_bin[o + 8] = vlo & 0x3F
            inst_bin[o + 9] = vhi & 0x3F
            o += 10

    return bytes(sample_bin) + bytes(inst_bin)


# ── Cell grid (voices × rows) ────────────────────────────────────────────────

def _cell(cells: dict, v: int, row: int) -> dict:
    c = cells.get((v, row))
    if c is None:
        c = {'note': NOTE_NOP, 'inst': 0, 'vol': (SEL_FINE, 0),
             'pan': (SEL_FINE, 0), 'eff': None, 'prio': PRIO_FREE}
        cells[(v, row)] = c
    return c


def allocate_voices(notes: list, speed: int, max_voices: int) -> int:
    """Greedy per-row interval scheduling onto as few columns as possible.

    The engine's New Note Action does the heavy lifting (matching MIDI
    polyphony semantics): a fresh trigger on an occupied voice migrates the
    old note into the mixer's background-ghost pool, so a voice is reusable
    the moment its note is *released* — the Note-Fade tail rides the ghost
    (fading over the instrument's SF2 release). Melodic voices free at their
    key-off row; drum voices (no key-off by default) free on the very next
    row. Stealing is therefore graceful: the victim is released early, not cut.

    Mutates note.voice (and truncates stolen notes' end_ft). Returns the
    number of voices used."""
    cap = max(1, min(max_voices, NUM_VOICES))
    v_end  = []     # voice → first row at which it is free again
    v_slot = []     # voice → last instrument slot (affinity only)
    v_note = []     # voice → currently scheduled note
    stolen = 0
    for n in notes:
        srow = n.start_ft // speed
        free = [v for v in range(len(v_end)) if v_end[v] <= srow]
        v = next((x for x in free if v_slot[x] == n.slot),
                 free[0] if free else -1)
        if v < 0:
            if len(v_end) < cap:
                v = len(v_end)
                v_end.append(0); v_slot.append(0); v_note.append(None)
            else:
                # Steal preference: notes held only by the sustain pedal lose
                # least (their key is already up); otherwise the note ending
                # soonest. Either way NNA turns the steal into an early release.
                pedal = [x for x in range(len(v_end))
                         if v_note[x] is not None
                         and v_note[x].pedal_ft is not None
                         and v_note[x].pedal_ft <= n.start_ft]
                cand = pedal if pedal else range(len(v_end))
                v = min(cand, key=lambda x: v_end[x])
                victim = v_note[v]
                if victim is not None and victim.end_ft > n.start_ft:
                    victim.end_ft = n.start_ft
                stolen += 1
        if n.drum:
            end_row = srow + 1                       # ghost carries the ring
        else:
            end_row = max(srow + 1, n.end_ft // speed)   # free at key-off row
        n.voice = v
        v_end[v], v_slot[v], v_note[v] = end_row, n.slot, n
    if stolen:
        vprint(f"  info: polyphony exceeded {cap} voices; {stolen} note(s) "
               f"released early (NNA ghost keeps the tail)")
    return len(v_end)


def emit_cells(song: Song, insts: dict, speed: int, rpb: int,
               eps_units: float, drum_keyoff: bool, shift_ft: int,
               max_voices: int) -> tuple:
    """Place triggers, key-offs, portamento bend segments, M channel-volume
    and T tempo effects into the (voice,row) cell grid.
    Returns (cells, n_voices, total_rows, taud_bpm0)."""
    notes = [n for n in song.notes if n.slot > 0]

    def midi_bpm_at(ft):
        i = bisect.bisect_right(song.tempo_ft, ft) - 1
        return song.tempo_bpm[i] if i >= 0 else 120.0

    scale = rpb * speed / 24.0

    def taud_bpm(b):
        t = round(b * scale)
        if not (25 <= t <= 280):
            vprint(f"  warning: tempo {b:.1f} BPM maps to Taud {t}, "
                   f"clamped to 25..280 (try a different --rpb/--speed)")
        return max(25, min(280, t))

    n_voices = allocate_voices(notes, speed, max_voices)
    if n_voices == 0:
        sys.exit("error: no playable notes")
    vprint(f"  voices: {n_voices} used (cap {max_voices}; NNA carries tails)")

    cells = {}

    # ── Pass 1: triggers ──
    for n in notes:
        row, tick = n.start_ft // speed, n.start_ft % speed
        c = _cell(cells, n.voice, row)
        nv = key_to_noteval(n.key + n.bend0)
        c['note'] = nv
        c['inst'] = n.slot
        c['vol']  = (SEL_SET, round(n.vel * 63 / 127))
        st = song.channels[n.ch]
        if st.cc10_ft:
            pan = _curve_at(st.cc10_ft, st.cc10_val, n.start_ft + shift_ft, 64)
            c['pan'] = (SEL_SET, round(pan * 63 / 127))
        if tick > 0:
            c['eff']  = (TOP_S, 0xD000 | (tick << 8))
            c['prio'] = PRIO_DELAY

    # ── Pass 2: key-offs (both MIDI idioms arrive here as note.end_ft) ──
    skipped_offs = 0
    for n in notes:
        if n.drum and not drum_keyoff:
            continue
        row, tick = n.end_ft // speed, n.end_ft % speed
        srow = n.start_ft // speed
        if row == srow:
            # Sub-row note (shorter than one tracker row): its key-off would land on
            # its OWN trigger row, where the trigger cell already sits — pass 2 would
            # then skip it ("row taken") and the note would ring forever until the next
            # trigger on this voice. Push the key-off to the next row (tick 0) so a
            # staccato note rounds up to ~1 row instead of hanging. If the next row is
            # itself a fresh trigger, that note cuts/NNAs this one anyway (skip is fine).
            row = srow + 1
            tick = 0
        c = cells.get((n.voice, row))
        if c is None:
            c = _cell(cells, n.voice, row)
            c['note'] = NOTE_KEYOFF
            if tick > 0:
                c['eff']  = (TOP_S, 0xD000 | (tick << 8))
                c['prio'] = PRIO_DELAY
        elif c['note'] == NOTE_NOP:
            c['note'] = NOTE_KEYOFF
            if tick > 0 and c['eff'] is None:
                c['eff']  = (TOP_S, 0xD000 | (tick << 8))
                c['prio'] = PRIO_DELAY
        else:
            skipped_offs += 1    # row taken by a retrigger — which cuts/NNAs anyway
    if skipped_offs:
        vprint(f"  info: {skipped_offs} key-off(s) absorbed by same-row retriggers")

    # ── Pass 3: pitch-bend portamento segments ──
    # One linear segment per row: the cell carries the exact 4096-TET target
    # plus G at units/tick sized to land on it by row end (G slides on the
    # speed-1 non-first ticks). Targets within eps_units are skipped (jitter
    # simplification).
    seg_count = 0
    if speed >= 2:
        for n in notes:
            st = song.channels[n.ch]
            if len(st.bend_ft) <= 1 and n.bend0 == 0.0:
                continue
            start_row = n.start_ft // speed
            end_row   = n.end_ft   // speed
            cur = key_to_noteval(n.key + n.bend0)
            for r in range(start_row + 1, end_row):
                ftr = min((r + 1) * speed, n.end_ft) + shift_ft
                target = key_to_noteval(
                    n.key + _curve_at(st.bend_ft, st.bend_val, ftr, 0.0))
                if abs(target - cur) < eps_units:
                    continue
                if (n.voice, r) in cells:
                    continue
                step = -(-abs(target - cur) // (speed - 1))
                c = _cell(cells, n.voice, r)
                c['note'] = target
                c['eff']  = (TOP_G, min(0xFFFF, step))
                c['prio'] = PRIO_PORTA
                cur = target
                seg_count += 1
    elif any(len(st.bend_ft) > 1 for st in song.channels):
        vprint("  warning: --speed 1 cannot express portamento; "
               "pitch-bend movement dropped")
    if seg_count:
        vprint(f"  bend: {seg_count} portamento segment(s) emitted")

    # ── Pass 4: M channel volume (CC7 × CC11), per voice chronologically ──
    by_voice = {}
    for n in notes:
        by_voice.setdefault(n.voice, []).append(n)
    m_emitted = 0
    for v, vnotes in by_voice.items():
        vnotes.sort(key=lambda n: n.start_ft)
        m_state = 0x3F                            # engine channel_vol default
        for n in vnotes:
            st = song.channels[n.ch]
            for r in range(n.start_ft // speed, n.end_ft // speed + 1):
                ftr = r * speed + shift_ft
                m = round(_curve_at(st.cc7_ft,  st.cc7_val,  ftr, 100) / 127
                          * _curve_at(st.cc11_ft, st.cc11_val, ftr, 127) / 127
                          * 63)
                if m == m_state:
                    continue
                c = _cell(cells, v, r)
                if c['eff'] is not None:
                    continue                      # slot busy — retry next row
                c['eff']  = (TOP_M, (m & 0x3F) << 8)
                c['prio'] = PRIO_M
                m_state = m
                m_emitted += 1
    if m_emitted:
        vprint(f"  cc: {m_emitted} M channel-volume effect(s) emitted")

    total_rows = max(r for (_v, r) in cells) + 1

    # ── Pass 5: T tempo changes ──
    bpm0 = midi_bpm_at(shift_ft)                  # tempo in effect at row 0
    last = taud_bpm(bpm0)
    t_emitted = t_evict = 0
    for ft, b in zip(song.tempo_ft, song.tempo_bpm):
        row = (ft - shift_ft) // speed
        if row < 0:
            continue
        if row >= total_rows:
            break
        tb = taud_bpm(b)
        if tb == last:
            continue
        placed = False
        victim = None
        for v in range(n_voices):
            c = cells.get((v, row))
            if c is None or c['eff'] is None:
                c = _cell(cells, v, row)
                c['eff']  = (TOP_T, ((tb - 25) & 0xFF) << 8)
                c['prio'] = PRIO_TEMPO
                placed = True
                break
            if c['prio'] < PRIO_DELAY and (victim is None
                                           or c['prio'] < victim['prio']):
                victim = c
        if not placed and victim is not None:
            if victim['prio'] == PRIO_PORTA:
                victim['note'] = NOTE_NOP         # orphan G note would retrigger
            victim['eff']  = (TOP_T, ((tb - 25) & 0xFF) << 8)
            victim['prio'] = PRIO_TEMPO
            placed = True
            t_evict += 1
        if placed:
            last = tb
            t_emitted += 1
    if t_emitted:
        vprint(f"  tempo: {t_emitted} T effect(s)"
               + (f" ({t_evict} evicted a lesser effect)" if t_evict else ""))

    return cells, n_voices, total_rows, taud_bpm(bpm0)


# ── Pattern / cue emission and final assembly ────────────────────────────────

def build_pattern_bin(cells: dict, n_voices: int, n_cues: int) -> bytes:
    out = bytearray(n_cues * n_voices * PATTERN_BYTES)
    pos = 0
    for cue in range(n_cues):
        for v in range(n_voices):
            for r in range(PATTERN_ROWS):
                base = pos + r * 8
                c = cells.get((v, cue * PATTERN_ROWS + r))
                if c is None:
                    out[base + 3] = 0xC0
                    out[base + 4] = 0xC0
                    continue
                struct.pack_into('<H', out, base, c['note'] & 0xFFFF)
                out[base + 2] = c['inst'] & 0xFF
                vs, vv = c['vol']
                ps, pv = c['pan']
                out[base + 3] = (vv & 0x3F) | ((vs & 3) << 6)
                out[base + 4] = (pv & 0x3F) | ((ps & 3) << 6)
                if c['eff'] is not None:
                    op, arg = c['eff']
                    out[base + 5] = op & 0xFF
                    struct.pack_into('<H', out, base + 6, arg & 0xFFFF)
            pos += PATTERN_BYTES
    return bytes(out)


def assemble_taud(sf: SF2, song: Song, layer_insts: list, meta_records: list,
                  slot_name: dict, pool: list, args) -> bytes:
    speed, rpb = args.speed, args.rpb

    # Leading-silence trim: shift the grid so the first trigger is row 0.
    first_row = min(n.start_ft // speed for n in song.notes if n.slot > 0)
    shift_ft = first_row * speed
    if shift_ft:
        vprint(f"  info: trimming {first_row} leading silent row(s)")
        for n in song.notes:
            n.start_ft -= shift_ft
            n.end_ft   -= shift_ft

    eps_units = args.bend_epsilon * 4096.0 / 1200.0
    cells, n_voices, total_rows, bpm0 = emit_cells(
        song, None, speed, rpb, eps_units, args.drum_keyoff, shift_ft,
        args.max_voices)

    n_cues = (total_rows + PATTERN_ROWS - 1) // PATTERN_ROWS
    if n_cues > NUM_CUES:
        sys.exit(f"error: song needs {n_cues} cues > {NUM_CUES} limit "
                 f"(try a smaller --rpb)")
    if n_cues * n_voices > NUM_PATTERNS_MAX:
        sys.exit(f"error: {n_cues} cues × {n_voices} voices "
                 f"> {NUM_PATTERNS_MAX} pattern limit")

    pat_bin = build_pattern_bin(cells, n_voices, n_cues)
    pat_bin, remap, n_unique = deduplicate_patterns(pat_bin, n_cues * n_voices)
    vprint(f"  patterns: {n_cues * n_voices} → {n_unique} unique; "
           f"{n_cues} cue(s), {n_voices} voice(s), {total_rows} rows")

    sheet = bytearray(NUM_CUES * CUE_SIZE)
    for ci in range(NUM_CUES):
        sheet[ci*CUE_SIZE:(ci+1)*CUE_SIZE] = encode_cue([], 0)
    for ci in range(n_cues):
        pats = [remap[ci * n_voices + v] for v in range(n_voices)]
        tail = total_rows - ci * PATTERN_ROWS
        if ci == n_cues - 1:
            instr = CUE_INST_HALT
        elif tail < PATTERN_ROWS:
            instr = cue_instruction_len(tail)
        else:
            instr = CUE_INST_NOP
        sheet[ci*CUE_SIZE:(ci+1)*CUE_SIZE] = encode_cue(pats, instr)

    # ── Sample + instrument bin ──
    sampleinst_raw = build_sample_inst_bin(sf, pool, layer_insts, meta_records,
                                           args.fadeout, bpm0)
    assert len(sampleinst_raw) == SAMPLEINST_SIZE
    compressed = compress_blob(sampleinst_raw, "sample+inst bin")
    comp_size  = len(compressed)

    pat_comp = compress_blob(pat_bin,      "pattern bin")
    cue_comp = compress_blob(bytes(sheet), "cue sheet")

    song_table_off = TAUD_HEADER_SIZE + comp_size
    song_off       = song_table_off + TAUD_SONG_ENTRY
    entry = encode_song_entry(
        song_offset=song_off,
        num_voices=n_voices,
        num_patterns=n_unique,
        bpm_stored=(bpm0 - 25) & 0xFF,
        tick_rate=speed,
        base_note=0xA000,
        base_freq=8363.0,
        flags_byte=0x00,                          # linear pitch mode
        pat_bin_comp_size=len(pat_comp),
        cue_sheet_comp_size=len(cue_comp),
        global_vol=0xFF,
        mixing_vol=0xFF,
    )

    # ── Project data: names + the Ixmp section recreating SF2 layering ──
    proj_data = b''
    proj_off  = 0
    if not args.no_project_data:
        # Names indexed by slot (0 = unused). Layer slots carry the (suffixed) layer
        # instrument name; meta slots carry the bare preset name.
        max_slot = max([0] + list(slot_name))
        inst_names = ['' for _ in range(max_slot + 1)]
        for s, nm in slot_name.items():
            inst_names[s] = nm
        smp_names  = [''] + [ms.name for ms in pool]
        ixmp = {}
        for ti in layer_insts:
            if not ti.usable:
                continue
            pl = [p.to_ixmp_dict(ti.canonical, bpm0, args.fadeout)
                  for p in ti.patches if p is not ti.canonical]
            if pl:
                ixmp[ti.slot] = pl
        if ixmp:
            vprint(f"  ixmp: {sum(len(p) for p in ixmp.values())} patch(es) "
                   f"across {len(ixmp)} instrument(s)")
        title = song.title or os.path.splitext(os.path.basename(args.input))[0]
        proj_data = build_project_data(
            project_name=title,
            instrument_names=inst_names,
            sample_names=smp_names,
            ixmp_patches=ixmp or None,
        )

    header = (TAUD_MAGIC
              + bytes([TAUD_VERSION, 1])
              + struct.pack('<I', comp_size)
              + struct.pack('<I', 0)              # patched below if proj data
              + (SIGNATURE + b' ' * 14)[:14])
    assert len(header) == TAUD_HEADER_SIZE

    out = bytearray()
    out += header
    out += compressed
    out += entry
    out += pat_comp
    out += cue_comp
    if proj_data:
        proj_off = len(out)
        struct.pack_into('<I', out, 14, proj_off)
        out += proj_data
        vprint(f"  project data: {len(proj_data)} bytes @ {proj_off}")
    return bytes(out)


# ── Main ──────────────────────────────────────────────────────────────────────

def main():
    ap = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('input',     help='Input .mid file')
    ap.add_argument('soundfont', help='SoundFont 2 (.sf2) sample library')
    ap.add_argument('output', nargs='?', default=None,
                    help='Output .taud (default: input stem + .taud)')
    ap.add_argument('--perc-force-mapping', nargs=2, type=int, default=None,
                    metavar=('BANK', 'INST'),
                    help='Force the percussion channel to this SF2 preset '
                         '(default: bank 128, channel program)')
    ap.add_argument('--rpb', type=int, default=4, choices=(2, 4, 8, 16),
                    help='Rows per beat (default 4 = 16th-note rows)')
    ap.add_argument('--speed', type=int, default=6,
                    help='Ticks per row, 1..15 (default 6)')
    ap.add_argument('--fadeout', type=int, default=None,
                    help='Override the computed fadeout step (0..4095). By '
                         'default each instrument/patch gets a Volume Fadeout '
                         'reproducing its SF2 release segment (releaseVolEnv vs '
                         'the 100 dB floor), played out via NNA Note Fade')
    ap.add_argument('--max-voices', type=int, default=20,
                    help='Voice-column budget, 1..20 (default 20). NNA '
                         'background ghosts carry release/ring tails, so '
                         'few foreground voices are needed; songs exceeding '
                         'the budget release the oldest pedal-held or '
                         'soonest-ending note early')
    ap.add_argument('--max-layers', type=int, default=4,
                    help='Max simultaneous layers per note (default 4). Each SF2 '
                         'preset is split into this many disjoint layers; presets '
                         'needing >1 layer become a Metainstrument. 1 disables '
                         'layering (first-zone-wins, like the old behaviour). '
                         'Covers ~93%% of big-bank presets at 4, ~98%% at 5')
    ap.add_argument('--bend-epsilon', type=float, default=4.0,
                    help='Pitch-bend simplification threshold in cents '
                         '(default 4.0); smaller = more faithful')
    ap.add_argument('--drum-keyoff', action='store_true',
                    help='Emit KEY_OFF for percussion-channel notes too '
                         '(GM drums normally ignore note-off)')
    ap.add_argument('--no-project-data', action='store_true',
                    help='Omit the Project Data section — NOTE: this also '
                         'omits Ixmp, collapsing every instrument to its '
                         'canonical sample')
    ap.add_argument('-v', '--verbose', action='store_true')
    args = ap.parse_args()
    set_verbose(args.verbose)

    if not (1 <= args.speed <= 15):
        sys.exit("error: --speed must be 1..15")
    if not (1 <= args.max_voices <= 20):
        sys.exit("error: --max-voices must be 1..20")
    if not (1 <= args.max_layers <= 25):
        sys.exit("error: --max-layers must be 1..25")
    if args.output is None:
        args.output = os.path.splitext(args.input)[0] + '.taud'

    vprint(f"parsing MIDI '{args.input}'…")
    division, merged = parse_midi(args.input)
    song = extract_song(division, merged, args.rpb, args.speed)
    vprint(f"  {len(song.notes)} note(s), {len(song.tempo_ft)} tempo event(s)")
    if not song.notes:
        sys.exit("error: MIDI contains no playable notes")

    vprint(f"parsing SF2 '{args.soundfont}'…")
    sf = parse_sf2(args.soundfont)
    vprint(f"  {len(sf.presets)} preset(s), {len(sf.shdrs)} sample header(s)")

    # Presets in first-use order; triggers keyed by the exact (noteVal-with-initial-
    # bend, vol6) pair the patterns will carry, so layer trimming sees precisely what
    # the engine matches at runtime.
    slot_keys = []
    seen_keys = set()
    triggers  = {}
    for n in song.notes:
        if n.inst_key not in seen_keys:
            seen_keys.add(n.inst_key)
            slot_keys.append(n.inst_key)
        t = triggers.setdefault(n.inst_key, {})
        k = (key_to_noteval(n.key + n.bend0), round(n.vel * 63 / 127))
        t[k] = t.get(k, 0) + 1
    vprint(f"  {len(slot_keys)} preset(s) in use")

    registry = {}
    presets = build_presets(sf, slot_keys, triggers, args.perc_force_mapping,
                            registry, args.max_layers)

    # Allocate instrument-bin slots: each layer is a normal instrument; a preset with
    # >1 layer also takes a Metainstrument slot the note references. Single-layer
    # presets stay plain instruments (no meta, no extra slot).
    next_slot   = 1
    layer_insts = []      # all normal instruments, .slot assigned
    meta_records = []     # (meta_slot, name, [(layer_slot, bbox_rect)])
    slot_name   = {}      # slot → display name
    note_slot   = {}      # inst_key → slot a note triggers (0 = unplayable)
    for ik in slot_keys:
        name, layers = presets[ik]
        if not layers:
            note_slot[ik] = 0
            continue
        need = len(layers) + (1 if len(layers) > 1 else 0)
        if next_slot + need - 1 > 255:
            vprint(f"  warning: 255-slot budget exhausted — preset '{name}' dropped")
            note_slot[ik] = 0
            continue
        for li, ti in enumerate(layers):
            ti.slot = next_slot; next_slot += 1
            layer_insts.append(ti)
            slot_name[ti.slot] = name if len(layers) == 1 else f"{name} L{li}"
        if len(layers) == 1:
            note_slot[ik] = layers[0].slot
        else:
            meta_slot = next_slot; next_slot += 1
            meta_records.append((meta_slot, name,
                                 [(ti.slot, _layer_bbox(ti)) for ti in layers]))
            slot_name[meta_slot] = name
            note_slot[ik] = meta_slot
    vprint(f"  slots: {next_slot - 1} used — {len(layer_insts)} instrument(s), "
           f"{len(meta_records)} Metainstrument(s)")

    # Tag notes with their trigger slot; notes whose preset failed to resolve drop.
    unplayable = 0
    for n in song.notes:
        n.slot = note_slot.get(n.inst_key, 0)
        if n.slot == 0:
            unplayable += 1
    if unplayable:
        vprint(f"  warning: {unplayable} note(s) dropped (unresolvable preset)")
    song.notes = [n for n in song.notes if n.slot > 0]
    if not song.notes:
        sys.exit("error: no notes survived preset resolution")

    # Pool = every sample referenced by a kept patch (canonical included), in
    # deterministic first-reference order. Everything else is trimmed.
    pool = []
    seen = set()
    for ti in layer_insts:
        for p in ti.patches:
            if id(p.ms) not in seen:
                seen.add(id(p.ms))
                pool.append(p.ms)

    taud = assemble_taud(sf, song, layer_insts, meta_records, slot_name, pool, args)
    sf.file.close()

    with open(args.output, 'wb') as f:
        f.write(taud)
    print(f"wrote {len(taud)} bytes to '{args.output}'")


if __name__ == '__main__':
    main()