#!/usr/bin/env python3
"""midi2taud.py — Convert Standard MIDI (.mid) + SoundFont 2 (.sf2) to TSVM Taud (.taud)

Usage:
    python3 midi2taud.py song.mid soundfont.sf2 [output.taud]
                         [--perc-force-mapping BANK INST]
                         [--rpb N] [--speed N] [--fadeout N]
                         [--bend-epsilon CENTS] [--drum-keyoff]
                         [-v] [--no-project-data]

Behaviour (per midi2taud.md):
  * Pitch bends are preserved as much as possible. A note starting under a
    non-zero bend triggers directly at the bent 4096-TET pitch (Taud notes
    are 4096-TET, so the trigger encodes the exact shifted pitch). Bend
    movement during a note is approximated as linear segments: each segment
    is one row carrying the exact 4096-TET target note plus tone portamento
    (G $xxxx, units/tick) sized to land on the target by row end. Jittery
    curves are simplified via --bend-epsilon (cents). RPN 0,0 pitch-bend
    range messages are honoured; bend values are computed as floats from
    the full 14-bit word (MIDIs that only drive the MSB work transparently).
  * Both MIDI key-off idioms — real note-off messages and note-on with
    velocity 0 — are translated into Taud KEY_OFF. Percussion-channel
    key-offs are dropped by default (GM percussion ignores note-off, and
    emitting them would chop one-shot drum tails); --drum-keyoff re-enables.
  * The SF2 key/velocity sample-layering model is recreated faithfully. Each
    preset's zones are partitioned into the fewest mutually-DISJOINT layers
    (--max-layers cap, default 4); each layer becomes one normal Taud instrument
    with its zones as Ixmp patches (velocity axis round(vel × 63/127)). A preset
    needing >1 layer is emitted as a Metainstrument (terranmon.txt "Metainstrument
    definition"): the note references the meta slot and the engine fans out one
    voice per matching layer, so SF2's simultaneous layering (and detune-stacks)
    now sound — overlapping zones are no longer dropped. Single-layer presets stay
    plain instruments. Stereo SF2 samples are mixed to mono. Unused instruments,
    patches, and samples are trimmed.
  * The SF2 volume-envelope ADSR is preserved on the (instrument-scope) Taud
    volume envelope: delay/attack/hold/decay nodes and a sustain region held
    while the key is on. There is NO release leg — the SF2 *release segment*
    is the Volume Fadeout (with NNA Note Fade): on key-off the voice holds at
    the sustain node and fades to silence over the SF2 releaseVolEnv time
    (measured against the 100 dB envelope floor: releaseVolEnv·(1000−sus_cb)/
    1000 seconds, then scaled to FluidSynth's PERCEIVED release length because
    the engine's fadeout is linear in amplitude, not dB — see _zone_fadeout).
    Per-layer Ixmp patches carry their own fadeout when their release differs.
    The canonical zone's ADSR represents the instrument.
  * Polyphony rides the engine's New Note Action (matching MIDI semantics):
    every instrument (drum kits included) gets NNA = Note Fade, so a voice
    column is reusable the moment its note releases — the release/fade tail
    moves to a background ghost on the next trigger and dies over its own
    release time. Voice budget defaults to 16 columns (--max-voices); overflow
    releases the oldest pedal-held or soonest-ending note early, not cut.
  * SF2 exclusiveClass (gen 57) is honoured on the percussion channel: a new note
    in a class chokes any ringing note of the same class (e.g. a closed hi-hat
    silences a ringing open hi-hat), matching FluidSynth's kill-by-exclusive-class.
    The choke is the new fast note-fade (note 0x0004, ~0.3 s) emitted at the next
    same-class onset; without it long percussion tails wash over the whole beat.
  * Sub-row timing is carried by S $Dx note delays (one row = `--speed`
    ticks, default 6; one beat = `--rpb` rows, default 4 → 1/24-beat grid).
    MIDI tempo changes map to T $xx00 set-tempo effects; channel volume /
    expression (CC7 × CC11) map to M $xx00 channel-volume effects so they
    never disturb the velocity-driven patch selection axis.
"""

import argparse
import array
import bisect
import math
import os
import struct
import sys

from taud_common import (
    set_verbose, vprint,
    TAUD_MAGIC, TAUD_VERSION, TAUD_HEADER_SIZE, TAUD_SONG_ENTRY,
    SAMPLEBIN_SIZE, INSTBIN_SIZE, SAMPLEINST_SIZE, SAMPLE_LEN_LIMIT,
    PATTERN_ROWS, PATTERN_BYTES, NUM_PATTERNS_MAX, NUM_CUES, CUE_SIZE, NUM_VOICES,
    NOTE_NOP, NOTE_KEYOFF, NOTE_FASTFADE, TAUD_C4,
    TOP_G, TOP_M, TOP_S, TOP_T,
    SEL_SET, SEL_FINE,
    CUE_INST_NOP, CUE_INST_HALT,
    resample_linear, encode_cue, deduplicate_patterns, encode_song_entry,
    compress_blob, build_project_data, cue_instruction_len, nearest_minifloat,
    IXMP_PAN_NO_OVERRIDE, atten_cb_to_octet,
)

# Converter identification string. NOTE(review): where it is written is outside
# this section — presumably embedded in the output file; confirm at the use site.
SIGNATURE = b'midi2taud/TSVM'   # 14 bytes
UNITS_PER_SEMI = 4096.0 / 12.0  # 4096-TET units per 12-TET semitone

# Effect priorities for the shared per-cell effect slot. Higher wins when a
# later pass needs the slot: SD note delays carry trigger timing and are
# never overwritten; T tempo is global and may evict G/M; M only takes free
# slots.
# NOTE(review): the exact eviction rules live where these values are compared,
# which is outside this section; the numeric order alone does not encode the
# "delays are never overwritten" rule stated above.
PRIO_FREE  = 0   # nothing occupies the slot
PRIO_M     = 1   # M channel-volume effect
PRIO_PORTA = 2   # G tone-portamento effect
PRIO_DELAY = 3   # S $Dx note delay
PRIO_TEMPO = 4   # T set-tempo effect


def key_to_noteval(key: float) -> int:
    """Map a (possibly fractional) MIDI key number to a Taud 4096-TET noteVal.

    Key 60 (middle C) lands on TAUD_C4 and every semitone away from it adds
    UNITS_PER_SEMI units. The rounded result is clamped to 0x20..0xFFFF.
    """
    offset_units = (key - 60.0) * UNITS_PER_SEMI
    noteval = round(TAUD_C4 + offset_units)
    capped = min(0xFFFF, noteval)
    return max(0x20, capped)


# ── MIDI parser ───────────────────────────────────────────────────────────────

def _read_varlen(data: bytes, pos: int):
    """Decode a MIDI variable-length quantity that starts at data[pos].

    Every byte contributes its low 7 bits, most-significant group first; the
    first byte with the top bit clear ends the quantity.

    Returns (value, index of the first byte after the quantity).
    Raises IndexError when data runs out before a terminating byte.
    """
    value = 0
    more = True
    while more:
        byte = data[pos]
        pos += 1
        value = (value << 7) | (byte & 0x7F)
        more = bool(byte & 0x80)
    return value, pos


def _parse_track(data: bytes, pos: int, end: int) -> list:
    """Parse one MTrk body → list of (abs_tick, event_tuple).

    data is the whole file buffer; the track body occupies [pos, end).

    Event tuples produced:
      ('tempo', bpm)          meta 0x51; bpm = 60000000 / µs-per-quarter
      ('title', text)         meta 0x03; latin-1 decoded, non-empty only
      ('eot',)                meta 0x2F; parsing stops here
      ('prog', ch, program)   0xCn
      ('on', ch, key, vel)    0x9n with vel > 0
      ('off', ch, key)        0x8n, or 0x9n with vel 0
      ('cc', ch, num, val)    0xBn
      ('bend', ch, val14)     0xEn; val14 = (second data byte << 7) | first
    Other meta types, sysex, channel pressure (0xDn) and polyphonic
    aftertouch (0xAn) are consumed but not reported.

    A track whose bytes run out in the middle of an event (truncated file)
    is cut at the last complete event with a warning instead of raising
    IndexError, mirroring the corrupt-running-status path below.
    """
    evs = []
    tick = 0
    status = 0
    try:
        while pos < end:
            delta, pos = _read_varlen(data, pos)
            tick += delta
            if pos >= end:
                break
            b = data[pos]
            if b & 0x80:
                status = b
                pos += 1
            elif status < 0x80:
                # Data byte with no running status to apply it to.
                vprint(f"  warning: corrupt track data at {pos:#x}, truncating track")
                break

            if status == 0xFF:                       # meta
                mtype = data[pos]; pos += 1
                ln, pos = _read_varlen(data, pos)
                payload = data[pos:pos+ln]; pos += ln
                if mtype == 0x51 and ln >= 3:
                    uspq = int.from_bytes(payload[:3], 'big')
                    if uspq > 0:
                        evs.append((tick, ('tempo', 60000000.0 / uspq)))
                elif mtype == 0x03:
                    txt = payload.decode('latin-1', errors='replace').strip()
                    if txt:
                        evs.append((tick, ('title', txt)))
                elif mtype == 0x2F:
                    evs.append((tick, ('eot',)))
                    break
                status = 0                           # meta cancels running status
            elif status in (0xF0, 0xF7):             # sysex
                ln, pos = _read_varlen(data, pos)
                pos += ln
                status = 0
            else:
                hi = status & 0xF0
                ch = status & 0x0F
                if hi in (0xC0, 0xD0):
                    # Program change / channel pressure carry one data byte.
                    d1 = data[pos]; pos += 1
                    if hi == 0xC0:
                        evs.append((tick, ('prog', ch, d1)))
                else:
                    d1 = data[pos]; d2 = data[pos+1]; pos += 2
                    if hi == 0x90:
                        if d2 > 0:
                            evs.append((tick, ('on', ch, d1, d2)))
                        else:
                            evs.append((tick, ('off', ch, d1)))   # vel-0 idiom
                    elif hi == 0x80:
                        evs.append((tick, ('off', ch, d1)))
                    elif hi == 0xB0:
                        evs.append((tick, ('cc', ch, d1, d2)))
                    elif hi == 0xE0:
                        evs.append((tick, ('bend', ch, (d2 << 7) | d1)))
                    # 0xA0 polyphonic aftertouch: ignored
    except IndexError:
        # The buffer ended inside a delta-time, status byte or data bytes.
        # Keep every event completed so far; the partial one is discarded.
        vprint(f"  warning: track data ends mid-event near {pos:#x}, truncating track")
    return evs


def parse_midi(path: str):
    """Load a Standard MIDI File (optionally RMID-wrapped) and merge its tracks.

    Returns (division, merged_events). division: ('ppq', tpq) or
    ('smpte', fps, tpf). merged_events: [(tick, seq, event_tuple)] sorted by
    (tick, seq), where seq is the order in which events were parsed, so
    events sharing a tick keep their track/file order.

    Exits via sys.exit when the data does not start with 'MThd' or is too
    short to hold the format / track-count / division fields.
    """
    with open(path, 'rb') as f:
        data = f.read()

    if data[:4] == b'RIFF':                      # RMID wrapper
        # Walk the RIFF chunks (padded to even sizes) and unwrap the 'data'
        # chunk, which holds the plain SMF image.
        pos = 12
        while pos + 8 <= len(data):
            cid = data[pos:pos+4]
            sz  = struct.unpack_from('<I', data, pos+4)[0]
            if cid == b'data':
                data = data[pos+8 : pos+8+sz]
                break
            pos += 8 + sz + (sz & 1)

    if data[:4] != b'MThd':
        sys.exit("error: not a MIDI file (bad MThd magic)")
    if len(data) < 14:
        # 'MThd' + 4-byte length + three 16-bit fields; without this the
        # unpack calls below would raise struct.error on a truncated file.
        sys.exit("error: truncated MIDI header")
    hlen = struct.unpack_from('>I', data, 4)[0]
    fmt, ntrk, div = struct.unpack_from('>HHH', data, 8)
    if fmt == 2:
        vprint("  warning: SMF format 2 — tracks merged on a shared timeline")

    if div & 0x8000:
        # SMPTE: high byte is the negated frame rate, low byte ticks/frame.
        fps = -struct.unpack_from('b', data, 12)[0]
        tpf = div & 0xFF
        division = ('smpte', fps, tpf)
    else:
        division = ('ppq', max(1, div))

    pos = 8 + hlen
    merged = []
    seq = 0
    tracks_found = 0
    while pos + 8 <= len(data) and tracks_found < ntrk:
        cid = data[pos:pos+4]
        sz  = struct.unpack_from('>I', data, pos+4)[0]
        body_start = pos + 8
        pos = body_start + sz
        if cid != b'MTrk':
            continue                             # skip unknown chunk types
        tracks_found += 1
        # Clamp the body to the buffer so an over-long size field cannot
        # push the parser's end bound past the data.
        for tick, ev in _parse_track(data, body_start, min(pos, len(data))):
            merged.append((tick, seq, ev))
            seq += 1
    merged.sort(key=lambda e: (e[0], e[1]))
    return division, merged


# ── Note / controller extraction ──────────────────────────────────────────────

class Note:
    """One note instance: its trigger data plus fields later passes fill in."""
    __slots__ = ('ch', 'key', 'vel', 'start_ft', 'end_ft', 'inst_key',
                 'bend0', 'slot', 'voice', 'drum', 'pedal_ft', 'excl_cut_ft')

    def __init__(self, ch, key, vel, start_ft, inst_key, bend0):
        # Trigger data handed in by the caller.
        self.ch, self.key, self.vel = ch, key, vel
        self.start_ft = start_ft
        self.inst_key = inst_key
        self.bend0 = bend0
        # inst_key's first element tags the note; 'd' marks a drum note.
        self.drum = inst_key[0] == 'd'
        # Timing that is not known yet at construction.
        self.end_ft = None
        self.pedal_ft = None     # physical key-up time when only the pedal holds it
        self.excl_cut_ft = None  # ft at which a same-exclusiveClass note chokes this one
        # Placeholders assigned outside this class.
        self.slot = 0
        self.voice = -1


class _ChState:
    """Running state of one MIDI channel while the event stream is walked."""
    __slots__ = ('bank', 'prog', 'rpn_msb', 'rpn_lsb', 'range_semi',
                 'range_cents', 'cur_bend', 'bend_ft', 'bend_val',
                 'cc7_ft', 'cc7_val', 'cc11_ft', 'cc11_val',
                 'cc10_ft', 'cc10_val', 'sus', 'pending', 'active')

    def __init__(self):
        # Bank / program selection.
        self.bank = self.prog = 0
        # RPN selector bytes; both start at 0x7F.
        self.rpn_msb = self.rpn_lsb = 0x7F
        # Pitch-bend range (semitones + cents) and the bend currently in force.
        self.range_semi, self.range_cents = 2, 0
        self.cur_bend = 0.0
        # Step curves stored as parallel (ft, value) lists.
        self.bend_ft, self.bend_val = [0], [0.0]
        self.cc7_ft, self.cc7_val = [0], [100]      # GM default
        self.cc11_ft, self.cc11_val = [0], [127]
        self.cc10_ft, self.cc10_val = [], []        # empty = never set
        # Sustain pedal and note bookkeeping.
        self.sus = False
        self.pending = []        # notes held by the sustain pedal
        self.active = {}         # key → Note


def _curve_at(fts: list, vals: list, ft: int, default):
    """Sample a step curve: the value of the last point at or before `ft`.

    fts holds the (sorted) point times and vals the matching values.
    Returns `default` when no point lies at or before `ft`.
    """
    n_at_or_before = bisect.bisect_right(fts, ft)
    if n_at_or_before == 0:
        return default
    return vals[n_at_or_before - 1]


def _curve_push(fts: list, vals: list, ft: int, val):
    """Record `val` at time `ft` on a step curve held as parallel lists.

    When the newest point already sits at `ft` its value is replaced, so
    only the last value written for a given ft is kept; otherwise a new
    point is appended to both lists.
    """
    if not fts or fts[-1] != ft:
        fts.append(ft)
        vals.append(val)
        return
    vals[-1] = val


class Song:
    """Plain result container; extract_song() assigns every field."""
    # notes      : list of Note, sorted by (start_ft, ch, key)
    # channels   : list of 16 _ChState records (one per MIDI channel)
    # tempo_ft   : fine-tick positions of tempo changes
    # tempo_bpm  : BPM values, parallel to tempo_ft
    # title      : first track-name text found, or None
    # end_ft     : largest fine-tick position of any event seen
    __slots__ = ('notes', 'channels', 'tempo_ft', 'tempo_bpm', 'title', 'end_ft')


def extract_song(division, merged, rpb: int, speed: int) -> Song:
    """Walk merged MIDI events, producing note instances (with both key-off
    idioms resolved to a definite end time), per-channel bend/CC curves, and
    the tempo map — all on the Taud fine-tick (ft) grid where one row =
    `speed` fts and one beat = `rpb` rows.

    division: ('ppq', tpq) or ('smpte', fps, tpf).
    merged:   [(tick, seq, event_tuple)] in playback order.

    Returns a Song whose notes are sorted by (start_ft, ch, key); notes that
    end on the very ft they start on are dropped.

    Under SMPTE division the grid is pinned to a 120 BPM equivalent, so
    tempo meta events are ignored there: SMPTE ticks are absolute time and
    a tempo entry would re-time the already-pinned grid.
    """
    is_ppq = division[0] == 'ppq'
    if is_ppq:
        tpq = division[1]
        def to_ft(tick):
            return round(tick * rpb * speed / tpq)
    else:
        _, fps, tpf = division
        tps = max(1.0, float(fps * tpf))         # ticks per second
        # SMPTE timing has no musical beats: pin a 120 BPM equivalent grid.
        def to_ft(tick):
            return round((tick / tps) * 2.0 * rpb * speed)
        vprint("  info: SMPTE division — pinned to a 120 BPM-equivalent grid")

    chs = [_ChState() for _ in range(16)]        # one record per MIDI channel
    notes = []
    tempo_ft, tempo_bpm = [], []
    tempo_ignored = 0
    title = None
    max_ft = 0

    def end_note(n: Note, ft: int):
        # The first close wins, and a note never ends before it starts.
        if n.end_ft is None:
            n.end_ft = max(ft, n.start_ft)

    for tick, _seq, ev in merged:
        ft = to_ft(tick)
        if ft > max_ft:
            max_ft = ft
        kind = ev[0]

        if kind == 'on':
            _, ch, key, vel = ev
            st = chs[ch]
            prev = st.active.pop(key, None)
            if prev is not None:                 # re-strike: close the old one
                end_note(prev, ft)
            # Channel index 9 is the percussion channel: keyed by program
            # only; melodic channels are keyed by (bank, program).
            ik = ('d', st.prog) if ch == 9 else ('m', st.bank, st.prog)
            n = Note(ch, key, vel, ft, ik, st.cur_bend)
            st.active[key] = n
            notes.append(n)

        elif kind == 'off':
            _, ch, key = ev
            st = chs[ch]
            n = st.active.pop(key, None)
            if n is not None:
                if st.sus:
                    # Pedal is down: remember the physical key-up time and
                    # park the note until the pedal lifts.
                    n.pedal_ft = ft
                    st.pending.append(n)
                else:
                    end_note(n, ft)

        elif kind == 'bend':
            _, ch, val14 = ev
            st = chs[ch]
            # MUST be float maths: 14-bit word (or MSB-only 7-bit source,
            # which simply leaves the low 7 bits zero) → ±range semitones.
            norm  = (float(val14) - 8192.0) / 8192.0
            semis = norm * (st.range_semi + st.range_cents / 100.0)
            st.cur_bend = semis
            _curve_push(st.bend_ft, st.bend_val, ft, semis)

        elif kind == 'cc':
            _, ch, num, val = ev
            st = chs[ch]
            if num == 0:                         # bank select
                st.bank = val
            elif num == 7:                       # channel volume
                _curve_push(st.cc7_ft, st.cc7_val, ft, val)
            elif num == 10:                      # pan
                _curve_push(st.cc10_ft, st.cc10_val, ft, val)
            elif num == 11:                      # expression
                _curve_push(st.cc11_ft, st.cc11_val, ft, val)
            elif num == 64:                      # sustain pedal
                if val >= 64:
                    st.sus = True
                else:
                    st.sus = False
                    for n in st.pending:
                        end_note(n, ft)
                    st.pending.clear()
            elif num == 100:
                st.rpn_lsb = val
            elif num == 101:
                st.rpn_msb = val
            elif num in (98, 99):                # NRPN deselects RPN
                st.rpn_msb = st.rpn_lsb = 0x7F
            elif num == 6:
                # Data entry only counts while RPN 0,0 (bend range) is selected.
                if st.rpn_msb == 0 and st.rpn_lsb == 0:
                    st.range_semi = val
            elif num == 38:
                if st.rpn_msb == 0 and st.rpn_lsb == 0:
                    st.range_cents = val
            elif num in (120, 123):              # all sound / notes off
                for n in list(st.active.values()):
                    end_note(n, ft)
                st.active.clear()
                for n in st.pending:
                    end_note(n, ft)
                st.pending.clear()
            elif num == 121:                     # reset all controllers
                st.cur_bend = 0.0
                _curve_push(st.bend_ft, st.bend_val, ft, 0.0)
                _curve_push(st.cc11_ft, st.cc11_val, ft, 127)
                st.sus = False
                for n in st.pending:
                    end_note(n, ft)
                st.pending.clear()
                st.rpn_msb = st.rpn_lsb = 0x7F

        elif kind == 'prog':
            _, ch, val = ev
            chs[ch].prog = val

        elif kind == 'tempo':
            if is_ppq:
                tempo_ft.append(ft); tempo_bpm.append(ev[1])
            else:
                # The SMPTE grid above is already absolute time at 120 BPM.
                tempo_ignored += 1

        elif kind == 'title':
            if title is None:
                title = ev[1]

    if tempo_ignored:
        vprint(f"  info: ignored {tempo_ignored} tempo event(s) under SMPTE division")

    # Close anything still ringing at end-of-file.
    for st in chs:
        for n in list(st.active.values()):
            end_note(n, max_ft)
        st.active.clear()
        for n in st.pending:
            end_note(n, max_ft)
        st.pending.clear()

    # end_note guarantees end_ft >= start_ft, so anything not strictly longer
    # than zero is a zero-length note.
    kept = [n for n in notes if n.end_ft > n.start_ft]
    n_dropped = len(notes) - len(kept)
    if n_dropped:
        vprint(f"  info: dropped {n_dropped} zero-length note(s)")
    notes = kept
    notes.sort(key=lambda n: (n.start_ft, n.ch, n.key))

    song = Song()
    song.notes     = notes
    song.channels  = chs
    song.tempo_ft  = tempo_ft
    song.tempo_bpm = tempo_bpm
    song.title     = title
    song.end_ft    = max_ft
    return song


# ── SF2 parser ────────────────────────────────────────────────────────────────

# SF2 generator operator numbers (the `oper` field of each pgen/igen record)
# that this converter reads. Listed mostly in numeric order; note that
# GEN_EXCLUSIVECLASS (57) sits out of sequence.
GEN_START_OFF        = 0
GEN_END_OFF          = 1
GEN_STARTLOOP_OFF    = 2
GEN_ENDLOOP_OFF      = 3
GEN_START_COARSE     = 4
GEN_MODENV2PITCH     = 7      # modEnvToPitch (signed cents at full mod-env)
GEN_FILTERFC         = 8      # initialFilterFc (absolute cents; default 13500 = open)
GEN_FILTERQ          = 9      # initialFilterQ (cB of resonance; default 0)
GEN_MODENV2FILT      = 11     # modEnvToFilterFc (signed cents at full mod-env)
GEN_END_COARSE       = 12
GEN_EXCLUSIVECLASS   = 57     # drum mutual-exclusion group (instrument-level; 0 = none)
GEN_PAN              = 17
GEN_DELAY_MODENV     = 25
GEN_ATTACK_MODENV    = 26
GEN_HOLD_MODENV      = 27
GEN_DECAY_MODENV     = 28
GEN_SUSTAIN_MODENV   = 29     # 0.1% units of full-scale DECREASE (0..1000)
GEN_RELEASE_MODENV   = 30
GEN_DELAY_VOLENV     = 33
GEN_ATTACK_VOLENV    = 34
GEN_HOLD_VOLENV      = 35
GEN_DECAY_VOLENV     = 36
GEN_SUSTAIN_VOLENV   = 37     # centibels of attenuation, 0..1440
GEN_RELEASE_VOLENV   = 38
GEN_INSTRUMENT       = 41
GEN_KEYRANGE         = 43
GEN_VELRANGE         = 44
GEN_STARTLOOP_COARSE = 45
GEN_INITATTEN        = 48     # initialAttenuation (cB; per-zone static gain)
GEN_ENDLOOP_COARSE   = 50
GEN_COARSETUNE       = 51
GEN_FINETUNE         = 52
GEN_SAMPLEID         = 53
GEN_SAMPLEMODES      = 54
GEN_SCALETUNING      = 56
GEN_ROOTKEY          = 58

# Generators whose raw 16-bit amount is reinterpreted as a signed value by
# _gen_amount(); every other generator keeps its unsigned reading.
_SIGNED_GENS = frozenset({GEN_START_OFF, GEN_END_OFF, GEN_STARTLOOP_OFF,
                          GEN_ENDLOOP_OFF, GEN_START_COARSE, GEN_END_COARSE,
                          GEN_STARTLOOP_COARSE, GEN_ENDLOOP_COARSE,
                          GEN_PAN, GEN_COARSETUNE, GEN_FINETUNE,
                          GEN_DELAY_VOLENV, GEN_ATTACK_VOLENV, GEN_HOLD_VOLENV,
                          GEN_DECAY_VOLENV, GEN_RELEASE_VOLENV,
                          GEN_MODENV2PITCH, GEN_MODENV2FILT,
                          GEN_DELAY_MODENV, GEN_ATTACK_MODENV, GEN_HOLD_MODENV,
                          GEN_DECAY_MODENV, GEN_RELEASE_MODENV,
                          # cB/cents value-generators that are ADDITIVE (and so may be
                          # NEGATIVE) at the preset level. Their instrument-level absolutes
                          # all sit well under 0x8000 (atten≤1440, filterFc≤13500, Q≤960,
                          # sustain≤1440/1000), so reading them signed is lossless there and
                          # correct for relative preset deltas. Without this a preset zone
                          # carrying e.g. initialAttenuation 0xFFFE (a −2 cB boost) was read
                          # as 65534 cB → ~−6575 dB → the whole instrument went silent
                          # (SGM 'Synth Strings 1' vol-env nodes stuck at 0).
                          GEN_INITATTEN, GEN_FILTERFC, GEN_FILTERQ,
                          GEN_SUSTAIN_VOLENV, GEN_SUSTAIN_MODENV})


def _timecents_to_sec(tc: int) -> float:
    """Convert SF2 timecents to seconds: 2 ** (tc / 1200).

    The input is clamped to -12000..8000 first, so the result ranges from
    2 ** -10 s (≈ 1 ms, the SF2 default of -12000) up to 2 ** (8000 / 1200) s.
    """
    capped = min(8000, tc)
    clamped = max(-12000, capped)
    return 2.0 ** (clamped / 1200.0)


class SFSampleHdr:
    """One record of the SF2 'shdr' chunk; parse_sf2() assigns every field."""
    # name               : sample name (bytes up to the first NUL, latin-1 decoded)
    # start, end         : sample bounds, used as frame indices into the smpl data
    # loopstart, loopend : loop points in the same units
    # rate               : sample rate; a stored 0 is replaced by 8363
    # origkey            : original-pitch byte (used as root key only when <= 127)
    # correction         : signed pitch-correction byte, added to the zone tuning (cents)
    # link, stype        : sample-link index and sample-type word (bit 0x8000 = ROM sample)
    __slots__ = ('name', 'start', 'end', 'loopstart', 'loopend', 'rate',
                 'origkey', 'correction', 'link', 'stype')


class SFZone:
    """One effective preset×instrument zone (post combination).

    parse_sf2() builds one of these for every instrument zone that survives
    intersection with its preset zone's key/velocity ranges; preset-level
    generator amounts have already been added to the instrument-level ones.
    """
    # keylo/keyhi, vello/velhi : inclusive key and velocity rectangle
    # sample, rate, name       : sample-header index plus that header's rate and name
    # rootkey, tune_cents      : root key and total tuning offset in cents
    # modes                    : sampleModes generator, low two bits
    # pan                      : clamped to -500..500
    # scale                    : scaleTuning generator (default 100)
    # a_start/a_end            : absolute sample bounds after offset generators
    # loop_abs_start/_end      : absolute loop points after offset generators
    # pair                     : initialised to None by parse_sf2
    # env_*                    : volume envelope; times in seconds, sustain in cB
    __slots__ = ('keylo', 'keyhi', 'vello', 'velhi', 'sample', 'rootkey',
                 'tune_cents', 'modes', 'pan', 'scale', 'a_start', 'a_end',
                 'loop_abs_start', 'loop_abs_end', 'pair', 'rate', 'name',
                 'env_delay', 'env_attack', 'env_hold', 'env_decay',
                 'env_sustain_cb', 'env_release',
                 # initialAttenuation (cB static per-zone gain) + static filter.
                 'atten_cb', 'filter_fc', 'filter_q',
                 # modulation envelope (drives pitch and/or filter) + its targets.
                 'm_delay', 'm_attack', 'm_hold', 'm_decay', 'm_sustain_pc',
                 'm_release', 'me2pitch', 'me2filt',
                 # exclusiveClass (gen 57): drum mutual-exclusion group (0 = none).
                 'excl_class')


class SF2:
    """Parsed SoundFont: preset table, sample headers and the open sample file."""
    __slots__ = ('presets', 'shdrs', 'file', 'smpl_off', 'smpl_size')

    def read_frames(self, start_frame: int, n_frames: int) -> array.array:
        """Return up to n_frames signed 16-bit frames from absolute frame index
        start_frame of the sample data.

        The request is cut short at the end of the sample data (smpl_size
        bytes starting at file offset smpl_off); an empty array comes back
        when nothing is available. Frames are stored little-endian in the
        file and are byte-swapped on big-endian hosts.
        """
        frames = array.array('h')
        total_frames = self.smpl_size // 2
        count = min(n_frames, total_frames - start_frame)
        if count <= 0:
            return frames
        self.file.seek(self.smpl_off + start_frame * 2)
        frames.frombytes(self.file.read(count * 2))
        if sys.byteorder == 'big':
            frames.byteswap()
        return frames


def _gen_amount(oper: int, raw: int) -> int:
    """Interpret a generator's raw 16-bit amount.

    Operators listed in _SIGNED_GENS are read as two's-complement (values
    from 0x8000 up become negative); all others are returned unchanged.
    """
    if raw >= 0x8000 and oper in _SIGNED_GENS:
        return raw - 0x10000
    return raw


def _parse_bags(bag_data, gen_data, start_bag, end_bag, terminal_gen):
    """Resolve bags [start_bag, end_bag) into (global_gens, [zone_gens...]).

    bag_data / gen_data are the raw bag and generator chunks, both made of
    4-byte records. Each bag's generators run from its own generator index
    up to the next bag's index (or to the end of gen_data for the last bag
    record). Every zone_gens is {oper: amount}.

    A zone without the terminal generator is kept as the global zone only
    when it is the first bag of the range; any other such zone is discarded
    per the SF2 spec.
    """
    bag_count = len(bag_data) // 4
    gen_count = len(gen_data) // 4
    global_gens = {}
    zone_list = []
    for bag_idx in range(start_bag, end_bag):
        first_gen = struct.unpack_from('<H', bag_data, bag_idx * 4)[0]
        if bag_idx + 1 < bag_count:
            stop_gen = struct.unpack_from('<H', bag_data, (bag_idx + 1) * 4)[0]
        else:
            stop_gen = gen_count
        # A later record for the same operator overrides an earlier one.
        zone = {
            oper: _gen_amount(oper, raw)
            for oper, raw in (
                struct.unpack_from('<HH', gen_data, gen_idx * 4)
                for gen_idx in range(first_gen, min(stop_gen, gen_count))
            )
        }
        if terminal_gen in zone:
            zone_list.append(zone)
        elif bag_idx == start_bag and not zone_list:
            global_gens = zone
    return global_gens, zone_list


def parse_sf2(path: str) -> SF2:
    """Parse a SoundFont 2 file into an SF2 object.

    The returned object carries:
      * shdrs    — list of SFSampleHdr (the terminal sentinel record excluded)
      * presets  — {(bank, preset): (name, [SFZone, ...])}; presets that end
                   up with no usable zone are omitted, and a later preset
                   with the same (bank, preset) replaces an earlier one
      * file / smpl_off / smpl_size — the still-open file plus the location
        of the sample data, so read_frames() can fetch PCM on demand.

    The file handle is deliberately left open (it is stored on the result).
    Exits via sys.exit on a bad RIFF/sfbk magic, a missing required pdta
    sub-chunk, or a missing smpl chunk.
    """
    f = open(path, 'rb')
    hdr = f.read(12)
    if hdr[:4] != b'RIFF' or hdr[8:12] != b'sfbk':
        sys.exit("error: not an SF2 file (bad RIFF/sfbk magic)")
    riff_end = 8 + struct.unpack_from('<I', hdr, 4)[0]

    # Pass 1: walk the top-level chunks, collect every pdta sub-chunk as raw
    # bytes and note where the sample data lives. Chunks are padded to even
    # sizes, hence the `+ (sz & 1)`.
    pdta = {}
    smpl_off = smpl_size = 0
    pos = 12
    while pos + 8 <= riff_end:
        f.seek(pos)
        chdr = f.read(8)
        if len(chdr) < 8:
            break
        cid = chdr[:4]
        sz  = struct.unpack_from('<I', chdr, 4)[0]
        if cid == b'LIST':
            ltype = f.read(4)
            inner = pos + 12
            inner_end = pos + 8 + sz
            while inner + 8 <= inner_end:
                f.seek(inner)
                shdr_ = f.read(8)
                if len(shdr_) < 8:
                    # LIST size runs past end of file: stop like the outer
                    # loop does instead of failing inside struct.unpack_from.
                    break
                scid = shdr_[:4]
                ssz  = struct.unpack_from('<I', shdr_, 4)[0]
                if ltype == b'pdta':
                    pdta[scid.decode('latin-1')] = f.read(ssz)
                elif ltype == b'sdta' and scid == b'smpl':
                    smpl_off, smpl_size = inner + 8, ssz
                inner += 8 + ssz + (ssz & 1)
        pos += 8 + sz + (sz & 1)

    for need in ('phdr', 'pbag', 'pgen', 'inst', 'ibag', 'igen', 'shdr'):
        if need not in pdta:
            sys.exit(f"error: SF2 missing required pdta sub-chunk '{need}'")
    if not smpl_size:
        sys.exit("error: SF2 has no smpl chunk (sample data)")

    sf = SF2()
    sf.file = f
    sf.smpl_off, sf.smpl_size = smpl_off, smpl_size

    # Sample headers: 46-byte records.
    shdr_data = pdta['shdr']
    sf.shdrs = []
    for i in range(len(shdr_data) // 46 - 1):    # last record is EOS sentinel
        s = SFSampleHdr()
        off = i * 46
        s.name = shdr_data[off:off+20].split(b'\x00')[0].decode('latin-1',
                                                                errors='replace')
        (s.start, s.end, s.loopstart, s.loopend, s.rate) = \
            struct.unpack_from('<IIIII', shdr_data, off+20)
        s.origkey    = shdr_data[off+40]
        s.correction = struct.unpack_from('b', shdr_data, off+41)[0]
        s.link, s.stype = struct.unpack_from('<HH', shdr_data, off+42)
        if s.rate == 0:
            s.rate = 8363
        sf.shdrs.append(s)

    # Instruments: index → (global_gens, [zone_gens])
    # 22-byte records; the bag range of record i ends where record i+1 starts
    # (the last record is a terminal sentinel).
    inst_data, ibag, igen = pdta['inst'], pdta['ibag'], pdta['igen']
    n_inst = len(inst_data) // 22 - 1
    inst_zones = []
    for i in range(n_inst):
        b0 = struct.unpack_from('<H', inst_data, i*22 + 20)[0]
        b1 = struct.unpack_from('<H', inst_data, (i+1)*22 + 20)[0]
        inst_zones.append(_parse_bags(ibag, igen, b0, b1, GEN_SAMPLEID))

    # Presets: 38-byte records, again closed by a terminal sentinel.
    phdr, pbag, pgen = pdta['phdr'], pdta['pbag'], pdta['pgen']
    n_pre = len(phdr) // 38 - 1
    sf.presets = {}
    scale_warned = False
    for i in range(n_pre):
        off = i * 38
        pname = phdr[off:off+20].split(b'\x00')[0].decode('latin-1',
                                                          errors='replace')
        preset, bank, bag0 = struct.unpack_from('<HHH', phdr, off+20)
        bag1 = struct.unpack_from('<H', phdr, (i+1)*38 + 24)[0]
        pglob, pzones = _parse_bags(pbag, pgen, bag0, bag1, GEN_INSTRUMENT)

        zones = []
        for pz_raw in pzones:
            # Zone generators override the preset's global zone.
            pz = dict(pglob); pz.update(pz_raw)
            ii = pz[GEN_INSTRUMENT]
            if not (0 <= ii < n_inst):
                continue
            iglob, izones = inst_zones[ii]
            # Range amounts pack lo in the low byte and hi in the high byte;
            # 0x7F00 therefore means the full 0..127 range.
            pk = pz.get(GEN_KEYRANGE, 0x7F00)
            pv = pz.get(GEN_VELRANGE, 0x7F00)
            pklo, pkhi = pk & 0xFF, (pk >> 8) & 0xFF
            pvlo, pvhi = pv & 0xFF, (pv >> 8) & 0xFF
            for iz_raw in izones:
                iz = dict(iglob); iz.update(iz_raw)
                si = iz[GEN_SAMPLEID]
                if not (0 <= si < len(sf.shdrs)):
                    continue
                s = sf.shdrs[si]
                if s.stype & 0x8000:             # ROM sample
                    continue
                # Effective rectangle = instrument zone ∩ preset zone.
                ik = iz.get(GEN_KEYRANGE, 0x7F00)
                iv = iz.get(GEN_VELRANGE, 0x7F00)
                klo = max(ik & 0xFF, pklo); khi = min((ik >> 8) & 0xFF, pkhi)
                vlo = max(iv & 0xFF, pvlo); vhi = min((iv >> 8) & 0xFF, pvhi)
                if klo > khi or vlo > vhi:
                    continue

                z = SFZone()
                z.keylo, z.keyhi = klo, khi
                z.vello, z.velhi = vlo, vhi
                z.sample = si
                # Root key: the override generator when valid, else the
                # sample's original key, else 60.
                rk = iz.get(GEN_ROOTKEY, -1)
                z.rootkey = rk if 0 <= rk <= 127 else \
                            (s.origkey if s.origkey <= 127 else 60)
                z.tune_cents = ((iz.get(GEN_COARSETUNE, 0)
                                 + pz.get(GEN_COARSETUNE, 0)) * 100
                                + iz.get(GEN_FINETUNE, 0)
                                + pz.get(GEN_FINETUNE, 0)
                                + s.correction)
                z.modes = iz.get(GEN_SAMPLEMODES, 0) & 3
                z.pan   = max(-500, min(500, iz.get(GEN_PAN, 0)
                                        + pz.get(GEN_PAN, 0)))
                z.scale = iz.get(GEN_SCALETUNING, 100)
                if z.scale != 100 and klo != khi and not scale_warned:
                    vprint("  warning: scaleTuning != 100 on a multi-key zone "
                           "— pitch is exact only at the zone's centre key")
                    scale_warned = True
                # Volume-envelope ADSR (timecents at inst level, preset adds).
                z.env_delay  = _timecents_to_sec(iz.get(GEN_DELAY_VOLENV,  -12000)
                                                 + pz.get(GEN_DELAY_VOLENV,  0))
                z.env_attack = _timecents_to_sec(iz.get(GEN_ATTACK_VOLENV, -12000)
                                                 + pz.get(GEN_ATTACK_VOLENV, 0))
                z.env_hold   = _timecents_to_sec(iz.get(GEN_HOLD_VOLENV,   -12000)
                                                 + pz.get(GEN_HOLD_VOLENV,   0))
                z.env_decay  = _timecents_to_sec(iz.get(GEN_DECAY_VOLENV,  -12000)
                                                 + pz.get(GEN_DECAY_VOLENV,  0))
                z.env_sustain_cb = max(0, min(1440, iz.get(GEN_SUSTAIN_VOLENV, 0)
                                              + pz.get(GEN_SUSTAIN_VOLENV, 0)))
                z.env_release = _timecents_to_sec(iz.get(GEN_RELEASE_VOLENV, -12000)
                                                  + pz.get(GEN_RELEASE_VOLENV, 0))
                # initialAttenuation: per-zone static gain in cB (preset adds to inst).
                # Clamped to the SF2 spec range [0, 1440] so any out-of-range value can
                # never collapse the folded vol-env to silence (see _SIGNED_GENS note).
                z.atten_cb = max(0, min(1440, iz.get(GEN_INITATTEN, 0)
                                        + pz.get(GEN_INITATTEN, 0)))
                # Static low-pass filter. initialFilterFc is absolute cents (default
                # 13500 ≈ open); initialFilterQ is cB of resonance (default 0).
                z.filter_fc = iz.get(GEN_FILTERFC, 13500) + pz.get(GEN_FILTERFC, 0)
                z.filter_q  = max(0, iz.get(GEN_FILTERQ, 0) + pz.get(GEN_FILTERQ, 0))
                # Modulation envelope (drives pitch via modEnvToPitch and/or filter via
                # modEnvToFilterFc). Times are timecents; sustain is 0.1%-of-full DECREASE.
                z.m_delay   = _timecents_to_sec(iz.get(GEN_DELAY_MODENV,  -12000)
                                                + pz.get(GEN_DELAY_MODENV,  0))
                z.m_attack  = _timecents_to_sec(iz.get(GEN_ATTACK_MODENV, -12000)
                                                + pz.get(GEN_ATTACK_MODENV, 0))
                z.m_hold    = _timecents_to_sec(iz.get(GEN_HOLD_MODENV,   -12000)
                                                + pz.get(GEN_HOLD_MODENV,   0))
                z.m_decay   = _timecents_to_sec(iz.get(GEN_DECAY_MODENV,  -12000)
                                                + pz.get(GEN_DECAY_MODENV,  0))
                z.m_sustain_pc = max(0, min(1000, iz.get(GEN_SUSTAIN_MODENV, 0)
                                            + pz.get(GEN_SUSTAIN_MODENV, 0)))
                z.m_release = _timecents_to_sec(iz.get(GEN_RELEASE_MODENV, -12000)
                                                + pz.get(GEN_RELEASE_MODENV, 0))
                z.me2pitch  = iz.get(GEN_MODENV2PITCH, 0) + pz.get(GEN_MODENV2PITCH, 0)
                z.me2filt   = iz.get(GEN_MODENV2FILT,  0) + pz.get(GEN_MODENV2FILT,  0)
                # exclusiveClass is instrument-level and NON-additive (SF2.04 §8.1.2 #57):
                # a new note in class C kills sounding notes of the same class on the same
                # channel (FluidSynth fluid_synth_kill_by_exclusive_class). Drum kits use it
                # so a closed hi-hat (42) chokes a ringing open hi-hat (46).
                z.excl_class = iz.get(GEN_EXCLUSIVECLASS, 0)
                # Absolute sample bounds: header value plus the fine and
                # coarse (×32768) offset generators, clamped to the sample data.
                z.a_start = (s.start + iz.get(GEN_START_OFF, 0)
                             + 32768 * iz.get(GEN_START_COARSE, 0))
                z.a_end   = (s.end + iz.get(GEN_END_OFF, 0)
                             + 32768 * iz.get(GEN_END_COARSE, 0))
                z.a_start = max(0, z.a_start)
                z.a_end   = max(z.a_start, min(z.a_end, sf.smpl_size // 2))
                z.loop_abs_start = (s.loopstart + iz.get(GEN_STARTLOOP_OFF, 0)
                                    + 32768 * iz.get(GEN_STARTLOOP_COARSE, 0))
                z.loop_abs_end   = (s.loopend + iz.get(GEN_ENDLOOP_OFF, 0)
                                    + 32768 * iz.get(GEN_ENDLOOP_COARSE, 0))
                z.pair = None
                z.rate = s.rate
                z.name = s.name
                zones.append(z)
        if zones:
            sf.presets[(bank, preset)] = (pname, zones)
    return sf


# ── Preset resolution / Taud instrument building ──────────────────────────────

def resolve_preset(sf: SF2, inst_key, perc_force):
    """Find the SF2 preset that should voice `inst_key`.

    inst_key is ('m', bank, prog) for a melodic slot or ('d', prog) for a
    percussion slot. Lookup order:
      * percussion: the forced (bank, inst) pair when `perc_force` is given,
        then (128, prog), then (128, 0);
      * melodic: (bank, prog), then (0, prog).
    When none of those exist, the first preset (in sorted (bank, prog) order)
    that shares the program number is used. Returns the stored (name, zones)
    tuple, or None when nothing matches."""
    if inst_key[0] == 'd':
        prog = inst_key[1]
        forced = [tuple(perc_force)] if perc_force is not None else []
        candidates = forced + [(128, prog), (128, 0)]
    else:
        _kind, bank, prog = inst_key
        candidates = [(bank, prog), (0, prog)]

    hit = next((cand for cand in candidates if cand in sf.presets), None)
    if hit is not None:
        return sf.presets[hit]

    # Last resort: same program number in any bank, lowest (bank, prog) first.
    for bank_no, prog_no in sorted(sf.presets):
        if prog_no == prog:
            return sf.presets[(bank_no, prog_no)]
    return None


def merge_stereo_zones(zones: list, shdrs: list) -> list:
    """Collapse L/R zone pairs into single mono zones. Two flavours are merged:
      (1) LINKED stereo — the first zone's sample header has type 2 or 4 and a
          later zone plays the sample its `link` field points at;
      (2) PAN stereo — two zones on DIFFERENT samples, both hard-panned
          (|pan| >= 400) to opposite sides. SGM/Timbres store most "stereo"
          samples this way (e.g. 'VA LGFF C3-L' / '…-R'), NOT as linked L/R.
    In both cases the partner must share the key/vel rectangle, `modes` and
    `rootkey`. The merged zone mixes both channels to mono (recorded in `pair` as
    (sample_a, sample_b, b_start)), drops the pan override and is trimmed to the
    shorter of the two slices.

    Merging is essential: an unmerged R zone fully overlaps its L zone, so the
    disjointify spills it into a SECOND layer that then plays CENTRED alongside
    the L zone — a spurious +6 dB doubling. Lone L/R zones keep their channel.

    The input zones are NEVER modified: a merged zone is a shallow copy of the
    first zone of its pair. This keeps the function idempotent. The zone objects
    are shared with `sf.presets` and this function runs more than once over the
    same preset; mutating them in place cleared `pan` on the first run, so a
    second run no longer recognised PAN-stereo pairs and let the partner zone
    through unmerged.

    Returns a new list holding the untouched lone zones (same objects) and the
    merged copies, in the order of each pair's first zone."""
    import copy

    def same_slot(a, b) -> bool:
        # Identical key/velocity rectangle, sample mode and root key.
        return ((a.keylo, a.keyhi, a.vello, a.velhi)
                == (b.keylo, b.keyhi, b.vello, b.velhi)
                and a.modes == b.modes
                and a.rootkey == b.rootkey)

    def hard_panned(zone) -> bool:
        return zone.pan is not None and abs(zone.pan) >= 400

    out = []
    used = set()                      # indices already consumed as a partner
    for i, z in enumerate(zones):
        if i in used:
            continue
        s = shdrs[z.sample]
        partner = None
        # (1) linked stereo: follow the sample header's link.
        if s.stype in (2, 4) and 0 <= s.link < len(shdrs):
            for j in range(i + 1, len(zones)):
                if j in used:
                    continue
                if zones[j].sample == s.link and same_slot(z, zones[j]):
                    partner = j
                    break
        # (2) pan stereo: a different sample hard-panned to the other side.
        if partner is None and hard_panned(z):
            for j in range(i + 1, len(zones)):
                if j in used:
                    continue
                z2 = zones[j]
                if (z2.sample != z.sample
                        and hard_panned(z2)
                        and (z.pan < 0) != (z2.pan < 0)        # opposite sides
                        and same_slot(z, z2)):
                    partner = j
                    break
        if partner is not None:
            used.add(partner)
            z2 = zones[partner]
            z = copy.copy(z)                      # leave the caller's zone intact
            z.pair = (z.sample, z2.sample, z2.a_start)
            z.pan = None                          # mixed to mono → centred
            z.a_end = z.a_start + min(z.a_end - z.a_start,
                                      z2.a_end - z2.a_start)
        out.append(z)
    return out


def apply_exclusive_class(song, sf, perc_force):
    """Serialise SF2 exclusiveClass (gen 57) chokes on the percussion notes.

    Starting a note in class C kills any ringing note of the same class on the
    same channel — FluidSynth's fluid_synth_kill_by_exclusive_class
    (fluid_synth.c:5453). GM drum kits use it so a closed hi-hat (key 42) chokes a
    ringing open hi-hat (key 46); without it the open hi-hat's multi-second tail
    washes over the whole beat and buries the other hits.

    Each drum note's class comes from the first zone (after stereo merging) whose
    key/velocity rectangle contains it; non-drum notes and notes with no matching
    zone count as class 0 and are ignored. Within every (channel, class) group a
    note gets `excl_cut_ft` set to the onset of the nearest note that starts
    STRICTLY later. `emit_cells` emits a fast note-fade (NOTE_FASTFADE) at that
    point and `allocate_voices` keeps the choked voice foreground until then.
    Drum channel only — GM melodic presets do not set gen 57, and a hard choke
    would fight the melodic key-off/release machinery."""
    zones_by_inst = {}

    def excl_of(note):
        if not note.drum:
            return 0
        zones = zones_by_inst.get(note.inst_key)
        if zones is None:
            found = resolve_preset(sf, note.inst_key, perc_force)
            zones = merge_stereo_zones(found[1], sf.shdrs) if found else []
            zones_by_inst[note.inst_key] = zones
        # SF2 zone selection: first zone whose key/velocity rect contains the note.
        return next((zone.excl_class for zone in zones
                     if zone.keylo <= note.key <= zone.keyhi
                     and zone.vello <= note.vel <= zone.velhi), 0)

    groups = {}
    for note in song.notes:
        klass = excl_of(note)
        if not klass:
            continue
        groups.setdefault((note.ch, klass), []).append(note)

    n_cut = 0
    for members in groups.values():
        members.sort(key=lambda m: m.start_ft)
        # Walk backwards, remembering the onset of the nearest strictly-later note.
        choke_at = None          # onset that chokes the note being visited
        newer_onset = None       # onset of the note visited just before (later in time)
        for note in reversed(members):
            if newer_onset is not None and note.start_ft < newer_onset:
                choke_at = newer_onset
            newer_onset = note.start_ft
            if choke_at is not None:
                note.excl_cut_ft = choke_at
                n_cut += 1
    if n_cut:
        vprint(f"  exclusiveClass: {n_cut} percussion choke(s) across "
               f"{len(groups)} group(s)")


def _rect_of_zone(z: SFZone):
    """Zone key/vel ranges → Taud (pitch_lo, pitch_hi, vol_lo, vol_hi).
    Pitch bounds sit on half-semitone boundaries so triggers carrying an
    initial pitch bend (< 50 cents) still land inside the right rectangle;
    adjacent zones stay disjoint. Velocity per Ixmp note 5: round(v·63/127)."""
    if z.keylo <= 0:
        plo = 0x0000
    else:
        plo = max(0, min(0xFFFF, round(TAUD_C4 + (z.keylo - 0.5 - 60) * UNITS_PER_SEMI)))
    if z.keyhi >= 127:
        phi = 0xFFFF
    else:
        phi = max(0, min(0xFFFF, round(TAUD_C4 + (z.keyhi + 0.5 - 60) * UNITS_PER_SEMI) - 1))
    vlo = round(z.vello * 63 / 127)
    vhi = round(z.velhi * 63 / 127)
    return (plo, phi, vlo, vhi)


def _rect_subtract(r, k):
    """Pieces of rectangle r not covered by rectangle k (≤ 4 pieces)."""
    p0, p1, v0, v1 = r
    q0, q1, w0, w1 = k
    if p1 < q0 or p0 > q1 or v1 < w0 or v0 > w1:
        return [r]
    pieces = []
    if p0 < q0: pieces.append((p0, q0 - 1, v0, v1))
    if p1 > q1: pieces.append((q1 + 1, p1, v0, v1))
    m0, m1 = max(p0, q0), min(p1, q1)
    if v0 < w0: pieces.append((m0, m1, v0, w0 - 1))
    if v1 > w1: pieces.append((m0, m1, w1 + 1, v1))
    return pieces


class MonoSample:
    """A pooled (deduplicated) mono u8 sample slice cut from the SF2 sample data."""
    __slots__ = ('pair', 'a_start', 'frames', 'rate', 'name',
                 'data', 'ratio', 'offset', 'loop_native', 'synth_loop', 'synth_decay')

    def __init__(self, z: SFZone):
        span = z.a_end - z.a_start
        self.pair    = z.pair                    # None or (idxL, idxR, b_start)
        self.a_start = z.a_start
        self.frames  = max(0, span)
        self.rate    = z.rate
        self.name    = z.name
        self.data    = None                      # filled by render()
        self.ratio   = 1.0
        self.offset  = 0

        # SF2 loop in NATIVE frames (mirrors the Patch loop test), or None when this
        # slice has no loop. Used by build_sample_inst_bin to decide how to fit an
        # over-length sample: a no-loop sample gets a synthesized loop, a looped one
        # is preserved (kept at 32 kHz when its loop fits, else fit-to-cap). Dedup
        # keeps the first zone's loop (same slice ⇒ same loop in practice).
        def into_slice(abs_pos):
            return max(0, min(abs_pos - z.a_start, self.frames))

        loop_lo = into_slice(z.loop_abs_start)
        loop_hi = into_slice(z.loop_abs_end)
        looped = z.modes in (1, 3) and loop_hi - loop_lo >= 2
        self.loop_native = (loop_lo, loop_hi) if looped else None

        # Set when a too-long, originally UN-looped sample is resampled to the 32 kHz
        # floor and given a synthesized sustain loop (see _synth_sustain_loop): a
        # (loop_start, loop_end) pair in the FINAL output-frame domain (already scaled
        # by every resample) and the seconds over which a peak->0 vol-envelope fades
        # the looped note to silence (_synth_decay_vol_env). When set, the loop points
        # and vol-envelope of EVERY record/patch using this sample are overridden.
        self.synth_loop  = None
        self.synth_decay = None

    def key(self):
        """Dedup key: (sample idx A, sample idx B, start frame, frame count), with
        (-1, -1) standing in for the sample indices of an unpaired slice."""
        idx_a, idx_b = (self.pair[0], self.pair[1]) if self.pair else (-1, -1)
        return (idx_a, idx_b, self.a_start, self.frames)

    def render(self, sf: SF2):
        """Read the slice via sf.read_frames and store it as unsigned 8-bit bytes in
        `data` (paired slices are averaged first). Does nothing once `data` is set;
        `frames` is updated to the number of bytes actually produced."""
        if self.data is not None:
            return
        count = min(self.frames, 1 << 24)        # hard sanity cap (16M frames)
        first = sf.read_frames(self.a_start, count)
        if self.pair:
            second = sf.read_frames(self.pair[2], count)
            # zip stops at the shorter channel.
            self.data = bytes(((((a + b) >> 1) >> 8) + 128) & 0xFF
                              for a, b in zip(first, second))
        else:
            self.data = bytes(((frame >> 8) + 128) & 0xFF for frame in first)
        self.frames = len(self.data)


class Patch:
    """A single Ixmp patch in the making: one disjoint trigger rectangle bound to
    the SF2 zone, and the pooled mono sample, that sound inside it."""
    __slots__ = ('rect', 'zone', 'ms', 'loop_start', 'loop_end', 'loop_mode',
                 'detune', 'pan8', 'hits')

    def __init__(self, rect, z: SFZone, ms: MonoSample):
        self.rect, self.zone, self.ms = rect, z, ms
        self.hits = 0

        # Loop points relative to the slice start, clamped into the slice. Only
        # sample modes 1 and 3 loop, and only when the loop spans >= 2 frames.
        span = max(0, z.a_end - z.a_start)
        loop_lo = max(0, min(z.loop_abs_start - z.a_start, span))
        loop_hi = max(0, min(z.loop_abs_end - z.a_start, span))
        looped = z.modes in (1, 3) and loop_hi - loop_lo >= 2
        if looped:
            self.loop_mode = 0x5 if z.modes == 3 else 0x1   # bit 2 flags mode 3
            self.loop_start, self.loop_end = loop_lo, loop_hi
        else:
            self.loop_mode = 0
            self.loop_start = self.loop_end = 0

        # samplingRate = SF2 rate; the rootkey/tuning shift goes into the
        # signed 4096-TET detune so MIDI key 60 always means noteVal 0x5000.
        # scaleTuning (cents per key, 0 = fixed-pitch drums) is folded in
        # around the zone's centre key: exact for single-key zones, exact
        # everywhere when scale = 100.
        centre_key = (z.keylo + z.keyhi) / 2.0
        semis = ((centre_key - z.rootkey) * (z.scale / 100.0)
                 - (centre_key - 60.0))
        shift = round(semis * UNITS_PER_SEMI + z.tune_cents * 4096.0 / 1200.0)
        self.detune = max(-0x8000, min(0x7FFF, shift))

        self.pan8 = (IXMP_PAN_NO_OVERRIDE if z.pan is None
                     else max(0, min(255, round(127.5 + z.pan * 255.0 / 1000.0))))

    def to_ixmp_dict(self, canonical, bpm0, fadeout_override):
        """Build this patch's Ixmp field dict. `canonical` is the Patch whose zone
        backs the base instrument record; the optional 'vol_env', 'extra',
        'filter_env' and 'pitch_env' entries are added only where this patch
        differs from it."""
        sample = self.ms
        ratio = sample.ratio
        # Synthesized-loop samples carry their loop in the final output-frame domain
        # (already resampled) and force a plain forward loop; otherwise the zone's SF2
        # loop scaled by this sample's resample ratio.
        if sample.synth_loop is None:
            loop_lo = round(self.loop_start * ratio)
            loop_hi = round(self.loop_end * ratio)
            loop_mode = self.loop_mode
        else:
            loop_lo, loop_hi = sample.synth_loop[0], sample.synth_loop[1]
            loop_mode = 1
        out = {
            'pitch_start':         self.rect[0],
            'pitch_end':           self.rect[1],
            'volume_start':        self.rect[2],
            'volume_end':          self.rect[3],
            'sample_ptr':          sample.offset,
            'sample_length':       min(len(sample.data), 0xFFFF),
            'play_start':          0,
            'loop_start':          min(0xFFFF, loop_lo),
            'loop_end':            min(0xFFFF, loop_hi),
            'sampling_rate':       max(1, min(0xFFFF, round(sample.rate * ratio))),
            'sample_detune':       self.detune,
            'loop_mode':           loop_mode,
            'default_pan':         self.pan8,
            'default_note_volume': 0,            # no override → base DNV
            'vibrato_speed':       0,
            'vibrato_sweep':       0,
            'vibrato_depth':       0,
            'vibrato_rate':        0,
            'vibrato_waveform':    0xFF,         # no override
        }
        # Per-patch overrides — emitted ONLY when they differ from the canonical
        # zone (whose envelopes/filter live in the base instrument record, which the
        # patch falls through to when a block is absent). This is what gives SF2
        # velocity / key layers their own ADSR + filter while keeping patches lean.
        own, base = self.zone, canonical.zone

        # Effective vol-env: a synthesized-loop sample uses a peak->0 decay (no
        # sustain), else the zone's SF2 ADSR.
        own_vol = _effective_vol_env(own, sample)
        base_vol = _effective_vol_env(base, canonical.ms)
        if own_vol != base_vol:
            out['vol_env'] = own_vol

        # SF-mode filter: mode flag + 16-bit cutoff cents / Q centibels + filter env.
        own_mode, own_cut, own_res, own_fenv = _zone_filter_sf(own)
        base_mode, base_cut, base_res, base_fenv = _zone_filter_sf(base)
        own_penv = _pitch_env_block(own) if own.me2pitch else None
        base_penv = _pitch_env_block(base) if base.me2pitch else None

        # initialAttenuation is a per-voice gain (NOT folded into the env); when 'x'
        # is present it carries this patch's atten, else the voice inherits the base
        # record's atten.
        own_att = atten_cb_to_octet(own.atten_cb)
        base_att = atten_cb_to_octet(base.atten_cb)

        # Volume Fadeout = this patch's own SF2 release segment (an absent 'x' falls
        # through to the base record's fadeout). A synthesized-loop sample disables
        # its key-off fadeout (its decay is the vol-env, which runs from note-on
        # regardless of key state).
        own_fade = (0 if sample.synth_loop is not None
                    else _zone_fadeout(own, bpm0, fadeout_override))
        base_fade = (0 if canonical.ms.synth_loop is not None
                     else _zone_fadeout(base, bpm0, fadeout_override))

        # A differing filter ENV must co-emit 'x' because the env's node ratios
        # scale the patch's OWN peak cutoff (the 'x' cutoff).
        fenv_differs = own_fenv != base_fenv
        statics_differ = ((own_mode, own_cut, own_res, own_att, own_fade)
                          != (base_mode, base_cut, base_res, base_att, base_fade))
        if statics_differ or fenv_differs:
            out['extra'] = {'fadeout':            own_fade,
                            'filter_sf_mode':     own_mode,
                            'default_cutoff':     own_cut,
                            'default_resonance':  own_res,
                            'initial_attenuation': own_att}
        if fenv_differs and own_fenv is not None:
            out['filter_env'] = own_fenv
        if own_penv is not None and own_penv != base_penv:
            out['pitch_env'] = own_penv
        return out


class TaudInstrument:
    """Plain attribute record for one Taud instrument built from a preset layer.

    Instances are created empty; the builders in this file assign every slot
    afterwards."""
    __slots__ = ('slot', 'inst_key', 'name', 'patches', 'canonical', 'usable')
    # slot:      initialised to 0 by the builders here.
    #            NOTE(review): presumably the final instrument slot is assigned
    #            later, outside this section — confirm.
    # inst_key:  the preset key (('m', bank, prog) or ('d', prog)) this layer serves.
    # name:      preset name.
    # patches:   kept Patch list in zone order, canonical Patch INCLUDED
    #            (the Ixmp emitter skips it; the base record carries its fields).
    # canonical: the Patch backing the base record (the most-hit kept patch).
    # usable:    set to True by the builders here.


def _rect_overlap(a, b) -> bool:
    """True when two (pitch_lo, pitch_hi, vol_lo, vol_hi) rectangles intersect."""
    p0, p1, v0, v1 = a
    q0, q1, w0, w1 = b
    return not (p1 < q0 or p0 > q1 or v1 < w0 or v0 > w1)


def _partition_layers(zones: list, registry: dict, max_layers: int):
    """Split zones into disjoint layers by ITERATED first-wins disjointify.

    Layer 0 is the classic disjointify result: each zone is rectangle-SUBTRACTED
    against the rects already placed in the layer, so its non-overlapping pieces
    tile in. This is essential — the velocity axis quantises 0..127 → 0..63, so
    adjacent SF2 velocity splits round to ranges that touch/overlap by ~1 unit;
    subtraction absorbs that boundary sliver into the first zone instead of
    spawning a spurious extra layer (which would DOUBLE the level at boundary
    velocities). Only a zone that is *fully* covered by the layer below — SF2's
    real simultaneous layering, detune-stacks, duplicate zones — spills down to
    the next layer, where the same disjointify runs over the spilled set. Returns
    ([ [(rect, zone, ms), …] per layer ], dropped_zone_count)."""
    remaining = []
    for z in zones:
        ms = MonoSample(z)
        if ms.frames < 2:
            continue
        ms = registry.setdefault(ms.key(), ms)
        remaining.append((z, ms))

    layers = []
    while remaining and len(layers) < max_layers:
        kept_rects = []
        layer = []
        spill = []
        for z, ms in remaining:
            pieces = [_rect_of_zone(z)]
            for k in kept_rects:
                pieces = [p2 for p in pieces for p2 in _rect_subtract(p, k)]
                if not pieces:
                    break
            pieces = [p for p in pieces if p[0] <= p[1] and p[2] <= p[3]]
            if not pieces:
                spill.append((z, ms))          # fully overlapped → next layer
                continue
            for p in pieces:
                kept_rects.append(p)
                layer.append((p, z, ms))
        if layer:
            layers.append(layer)
        remaining = spill
    return layers, len(remaining)


def _build_layer_instrument(name: str, items: list, trig: dict):
    """One normal TaudInstrument from a layer's disjoint (rect, zone, ms) items,
    trimmed to patches actually hit by a trigger. None when no patch is hit
    (the layer is silent for the whole song → dropped)."""
    all_patches = [Patch(r, z, ms) for (r, z, ms) in items]
    for (nv, v6), cnt in trig.items():
        for p in all_patches:
            r = p.rect
            if r[0] <= nv <= r[1] and r[2] <= v6 <= r[3]:
                p.hits += cnt
                break
    kept = [p for p in all_patches if p.hits > 0]
    if not kept:
        return None
    ti = TaudInstrument()
    ti.name = name
    ti.patches = kept
    ti.canonical = max(kept, key=lambda p: p.hits)
    ti.usable = True
    ti.slot = 0
    ti.inst_key = None
    return ti


def build_presets(sf: SF2, slot_keys: list, triggers: dict, perc_force,
                  registry: dict, max_layers: int) -> dict:
    """Build the layer instruments for every preset key in `slot_keys`.

    Each preset's SF2 zones are stereo-merged, partitioned into disjoint layers
    and turned into one normal TaudInstrument per layer (trimmed to triggered
    patches). Returns dict[inst_key → (name, [layer TaudInstrument])]; a key with
    no SF2 preset maps to ('(missing preset)', []). Downstream, a preset with >1
    layer becomes a Metainstrument; a single-layer preset stays a plain
    instrument. `registry` dedupes MonoSamples across all presets/layers."""
    presets = {}
    for ik in slot_keys:
        resolved = resolve_preset(sf, ik, perc_force)
        if resolved is None:
            vprint(f"  warning: no SF2 preset for {ik!r} — its notes are dropped")
            presets[ik] = ('(missing preset)', [])
            continue

        name, raw_zones = resolved
        zones = merge_stereo_zones(raw_zones, sf.shdrs)
        layer_items, dropped = _partition_layers(zones, registry, max_layers)
        if dropped:
            vprint(f"  warning: '{name}': {dropped} zone(s) exceed the "
                   f"{max_layers}-layer cap and were dropped (raise --max-layers)")

        trig = triggers.get(ik, {})
        layers = []
        for items in layer_items:
            built = _build_layer_instrument(name, items, trig)
            if built is not None:
                layers.append(built)

        if layer_items and not layers:
            # Nothing triggered (out-of-range): keep the single patch nearest the
            # mean trigger pitch so the preset still sounds (matches the old path).
            if trig:
                weighted = sum(nv * c for (nv, _), c in trig.items())
                mean_nv = weighted / max(1, sum(trig.values()))
            else:
                mean_nv = TAUD_C4
            every_patch = [Patch(r, z, ms) for items in layer_items for (r, z, ms) in items]
            nearest = min(every_patch,
                          key=lambda p: abs((p.rect[0] + p.rect[1]) / 2 - mean_nv))
            solo = TaudInstrument()
            solo.name = name
            solo.patches = [nearest]
            solo.canonical = nearest
            solo.usable = True
            solo.slot = 0
            solo.inst_key = ik
            layers = [solo]

        for inst in layers:
            inst.inst_key = ik
        presets[ik] = (name, layers)

        if not layers:
            vprint(f"  warning: '{name}': no usable zones — notes dropped")
        else:
            suffix = " → Metainstrument" if len(layers) > 1 else ""
            vprint(f"  preset '{name}': {len(zones)} zone(s) → {len(layers)} layer(s)"
                   + suffix)
    return presets


# Metainstrument mix-volume octet for an unmixed layer: 159 encodes 0 dB (unity
# gain). The converter folds per-zone level/tune into each layer instrument's
# patches, so the meta layers themselves stay neutral.
# (See terranmon.txt, "Perceptually Significant Octet …".)
META_UNITY_OCTET = 159


def _layer_bbox(ti: 'TaudInstrument'):
    """Bounding (pitch_lo, pitch_hi, vol_lo, vol_hi) over a layer instrument's kept
    patch rects — the Metainstrument layer's gating rectangle."""
    rs = [p.rect for p in ti.patches]
    return (min(r[0] for r in rs), max(r[1] for r in rs),
            min(r[2] for r in rs), max(r[3] for r in rs))


# ── Sample pool + instrument bin ──────────────────────────────────────────────

def _env_seg_count(t_sec: float) -> int:
    """Number of linear segments to approximate an exponential (linear-dB) ramp of
    `t_sec` seconds. Short ramps keep the old 2-segment shape; long ramps (the 5–20 s
    SF2 decays/releases that a 2-point line collapses badly) get up to 8 segments so
    the curve stays smooth (issue 4)."""
    return max(3, min(8, 2 + round(t_sec / 2.0)))


def _adsr_to_env(z: SFZone):
    """SF2 volume-envelope ADSR → (env_points, sustain_idx, release_sec).

    env_points is exactly 25 (value 0..63, minifloat_idx) pairs, padded by
    repeating the last value with a zero time; each node's minifloat encodes the
    time to the NEXT node (engine interpolates values linearly across that span).
    The envelope carries the delay/attack/hold/decay legs and ENDS at the sustain
    node — there is NO release leg. The engine wraps on the sustain node while the
    key is held (SUSTAIN word); on key-off it holds at that terminal node and the
    Volume Fadeout (emitted with NNA Note Fade) is the SF2 *release segment* (see
    _zone_fadeout). SF2's decay is LINEAR in dB (exponential in amplitude); per
    the SF2 spec decayVolEnv is the full-100dB time, truncated by the sustain
    level. The decay leg is sampled at equal-time (= equal-dB) points and emitted
    as a piecewise-linear-amplitude approximation — segment count scales with its
    duration (issue 4) so multi-second decays don't collapse to a 2-point line.
    release_sec (= SF2 releaseVolEnv) is returned only to feed the fadeout calc.
    """
    EPS = 0.004                       # below the minifloat resolution (1/256 s)
    sus_cb = min(z.env_sustain_cb, 1000.0)     # clamp to 100 dB full-scale
    sustain_amp = 10.0 ** (-z.env_sustain_cb / 200.0)
    sustain_val = max(0, min(63, round(63 * sustain_amp)))

    # (value, seconds-to-next-node); silent delay and attack legs only when audible.
    legs = [(0, lead_in) for lead_in in (z.env_delay, z.env_attack) if lead_in >= EPS]
    hold = z.env_hold if z.env_hold >= EPS else 0.0

    # Decay leg: peak (63) → sustain, exponential amplitude over `decay_sec` seconds.
    # The peak node carries the hold time. The final decay node is the sustain node
    # (appended below), so the in-between nodes sit at fractions 1/n .. (n-1)/n.
    if sustain_val < 63:
        decay_sec = z.env_decay * sus_cb / 1000.0
        if decay_sec >= EPS:
            n_seg = _env_seg_count(decay_sec)
            step = decay_sec / n_seg
            legs.append((63, hold + step))                     # peak, held then 1st seg
            for i in range(1, n_seg):
                frac = i / n_seg
                level = round(63 * 10.0 ** (-(sus_cb * frac) / 200.0))
                legs.append((max(sustain_val, min(63, level)), step))
        else:
            legs.append((63, hold))

    # No release leg: the sustain node is the terminal node. While the key is held
    # the engine wraps on it (SUSTAIN word); after key-off it holds there and the
    # Volume Fadeout (NNA Note Fade) performs the SF2 release segment (see
    # _zone_fadeout). A zero sustain leaves a terminal 0 node, so the engine retires
    # the voice naturally at the end of decay.
    sustain_idx = len(legs)
    legs.append((sustain_val, 0.0))            # sustain node = terminator

    env = [(value, nearest_minifloat(secs)) for value, secs in legs[:25]]
    env.extend([(env[-1][0], 0)] * (25 - len(env)))
    return env, min(sustain_idx, 24), z.env_release


# Envelope flag bits (terranmon.txt base byte 15/17/19). The word each bit belongs
# to is named in its trailing comment — note that ENV_SUS_ENABLE is a SUSTAIN-word
# bit, while the other two are LOOP-word bits.
ENV_PRESENT_BIT = 0x2000          # P — envelope present in source (LOOP-word bit 13)
ENV_SUS_ENABLE  = 0x0020          # b — enable the SUSTAIN wrap (SUSTAIN-word bit 5)
ENV_PF_FILTER   = 0x0080          # m — pitch/filter LOOP-word bit 7 (1 = filter)


def _atten_gain(atten_cb: float) -> float:
    """SF2 initialAttenuation (cB) → linear amplitude multiplier (≤ 1.0)."""
    return 10.0 ** (-max(0.0, atten_cb) / 200.0)


def _vol_env_block(z: SFZone):
    """Taud volume-envelope block dict from a zone's SF2 ADSR — the PURE ADSR shape
    at full 0..63 resolution.

    initialAttenuation is NOT folded into the node peak (it would crush a
    heavily-attenuated env to peak ~3 and zero its tail, e.g. SGM 'Fantasia'); it
    is carried as a separate per-voice gain — base record bytes 251-252 / Ixmp 'x'
    block initialAttenuation — applied in the mixer.

    Returns (block_dict, sustain_idx, release_sec); the sustain word has the
    sustain-enable bit set and the sustain node index in both of its index fields."""
    points, sustain_idx, release_sec = _adsr_to_env(z)
    idx5 = sustain_idx & 0x1F
    block = {
        'loop':    ENV_PRESENT_BIT,
        'sustain': ENV_SUS_ENABLE | (idx5 << 8) | idx5,
        'nodes':   [(max(0, min(63, value)), mf) for (value, mf) in points],
    }
    return block, sustain_idx, release_sec


# SF2 initialFilterFc default ≈ 13500 cents (~20 kHz) means "no filter / fully open".
# _zone_filter_sf treats a zone as statically filtered only when its cutoff is
# strictly below this value.
SF2_FILTER_OPEN_CENTS = 13500
# Taud SF-mode "filter off" sentinel for the 16-bit cutoff/resonance fields;
# _zone_filter_sf returns it for both fields when a zone has neither a static
# filter nor a filter envelope.
SF_FILTER_OFF = 0xFFFF


def _zone_filter_sf(z: SFZone):
    """Resolve a zone's filter into Taud SF-mode parameters.

    Taud SF mode (base byte 173 bit 4 / patch 'x' flag) stores the cutoff as
    SoundFont **absolute cents** and resonance as **centibels above DC gain** —
    the engine computes freq = 8.176·2^(cents/1200) and dmpfac = 10^(−Qcb/200),
    so there is no ImpulseTracker ~5 kHz cutoff ceiling. When the zone has a
    modulation envelope driving the cutoff, the stored cutoff is the PEAK the
    envelope reaches (base cutoff plus a positive envelope amount) and the
    filter-env nodes scale it back down (see [_filter_env_block_sf]); the engine's
    `currentCutoff = baseCut · envValue` then reproduces the SF2 sweep exactly
    (linear-in-cents = the right log-Hz sweep).

    Returns (sf_mode, cutoff16, resonance16, filter_env_block_or_None).
    sf_mode False → no filter (IT-mode 'off'); cutoff and resonance are then both
    SF_FILTER_OFF."""
    base_fc, amt = z.filter_fc, z.me2filt
    statically_filtered = base_fc < SF2_FILTER_OPEN_CENTS
    env_driven = bool(amt)
    if not (statically_filtered or env_driven):
        return False, SF_FILTER_OFF, SF_FILTER_OFF, None

    def clamp_field(value):
        return min(0xFFFE, value)     # 0xFFFF is reserved for "filter off"

    peak = max(1, clamp_field(round(base_fc + max(0, amt))))   # engine baseCut
    qcb = max(0, clamp_field(round(z.filter_q)))               # cB above DC gain
    env = None
    if env_driven:
        env = _filter_env_block_sf(z, base_fc, amt, peak)
    return True, peak, qcb, env


def _filter_env_block_sf(z: SFZone, base_fc: float, amt: float, peak: int) -> dict:
    """Filter envelope in SF-cents domain.

    Each node value = cutoff_cents(u)/peak·255 following the SF2
    modulation-envelope DAHDSR (u walks 0→1→sustain), where
    cutoff_cents(u) = base_fc + amt·u. 0xFF (255) = fully open at `peak`; the
    release returns to the base cutoff. The engine multiplies `peak` (= baseCut)
    by node/255 each tick, so the node ratios reproduce the SF2 cutoff sweep.
    Returns a block dict with 'loop', 'sustain' and exactly 25 'nodes'."""
    EPS = 0.004

    def audible(seconds):
        # Legs shorter than EPS are treated as instantaneous.
        return seconds if seconds >= EPS else 0.0

    def nodeval(u: float) -> int:
        cents = base_fc + amt * u
        return max(0, min(255, round(255.0 * cents / peak)))

    sus_u = 1.0 - z.m_sustain_pc / 1000.0          # mod-env sustain level (0..1)
    at_base, at_peak = nodeval(0.0), nodeval(1.0)

    pts = []                                        # (value_byte, secs_to_next)
    if z.m_delay >= EPS:
        pts.append((at_base, z.m_delay))
    pts.append((at_base, audible(z.m_attack)))
    hold = audible(z.m_hold)
    if sus_u < 1.0 and z.m_decay >= EPS:
        pts.append((at_peak, hold + z.m_decay))
        sustain_idx = len(pts)                      # the sustain node, added next
        pts.append((nodeval(sus_u), audible(z.m_release)))
    else:
        pts.append((at_peak, hold))
        sustain_idx = len(pts) - 1                  # sustain on the peak node
    pts.append((at_base, 0.0))                      # release returns to base cutoff

    nodes = [(value, nearest_minifloat(secs)) for value, secs in pts[:25]]
    nodes.extend([(nodes[-1][0], 0)] * (25 - len(nodes)))
    idx5 = min(sustain_idx, 24) & 0x1F
    return {
        'loop':    ENV_PRESENT_BIT | ENV_PF_FILTER,   # m-bit set = filter role
        'sustain': ENV_SUS_ENABLE | (idx5 << 8) | idx5,
        'nodes':   nodes,
    }


# The engine's Volume Fadeout is LINEAR IN AMPLITUDE (fadeoutVolume drops 1→0 by
# fadeStep/1024 per tick — AudioAdapter.kt ~L3679), whereas FluidSynth's release ramps
# attenuation LINEARLY IN dB (amplitude decays exponentially: −96 dB over releaseVolEnv).
# Matching the two on "time to the absolute floor" makes the linear fade sound MUCH longer:
# a linear-amplitude fade is still at −6 dB at 50 % of its length and −20 dB only at 90 %,
# whereas FluidSynth reaches −96 dB (silent) over that same full length. The perceived
# release tail ends when FluidSynth has dropped ≈22 dB; for the linear fade to land there
# at the same wall-clock time it must complete in ≈0.25·releaseVolEnv (see the −18..−24 dB
# crossing band). This scale brings the fadeout in line with FluidSynth's audible release
# length; _zone_fadeout multiplies the zone's release time by it.
_RELEASE_PERCEPTUAL_SCALE = 0.25


def _zone_fadeout(z: SFZone, bpm0: int, fadeout_override) -> int:
    """Volume Fadeout step encoding the zone's SF2 release segment (gen 38,
    releaseVolEnv). With NNA Note Fade the fadeout IS the release: on key-off the
    voice holds at the sustain level and fades to silence. The SF2 release ramps a
    constant 100 dB per `releaseVolEnv` seconds (spec sfspec24.txt:1934-1941 — "until
    100dB attenuation were reached"), so the time from the sustain level (sus_cb cB of
    attenuation) down to the 100 dB floor is releaseVolEnv·(1000−sus_cb)/1000.

    But the engine's fadeout is linear in AMPLITUDE while FluidSynth's release is linear
    in dB (see [_RELEASE_PERCEPTUAL_SCALE]); matching the floor-reaching time would make
    the audible tail ~4× too long, so fade_sec is scaled to FluidSynth's perceived release.
    fadeStep makes the fadeout complete in fade_sec at bpm0: the engine subtracts
    fadeStep/1024 of unit volume per song tick, and the tick rate is bpm0·2/5 Hz, giving
    fadeStep = 2560/(fade_sec·bpm0)."""
    if fadeout_override is not None:
        return min(0xFFF, max(0, fadeout_override))
    sus_cb   = min(max(0.0, z.env_sustain_cb), 1000.0)
    fade_sec = max(0.02, _RELEASE_PERCEPTUAL_SCALE * z.env_release * (1000.0 - sus_cb) / 1000.0)
    return max(1, min(0xFFF, round(2560.0 / (fade_sec * bpm0))))


def _extra_block(z: SFZone, bpm0: int, fadeout_override) -> dict:
    """Build the 'x' block for a zone: release-segment fadeout, SF-mode filter flag
    and the static SF-mode cutoff/resonance. The zone's filter envelope is not
    part of this block."""
    filter_mode, cutoff16, resonance16, _unused_env = _zone_filter_sf(z)
    block = {'fadeout': _zone_fadeout(z, bpm0, fadeout_override)}
    block['filter_sf_mode'] = filter_mode
    block['default_cutoff'] = cutoff16
    block['default_resonance'] = resonance16
    return block


def _pitch_env_block(z: SFZone) -> dict:
    """Build the pitch ('P') envelope block from the zone's SF2 modulation
    envelope (DAHDSR), scaled by modEnvToPitch (`z.me2pitch`, in cents).

    Node values are bytes where 0x80 (0.5) is unity and a full-scale 1.0 means
    +16 semitones, i.e. value = 0.5 + semitones/32. The mod-env itself is
    unipolar (0→1); the last node brings the pitch back to unity.

    Returns a dict with 'loop' (present bit only — pitch role), 'sustain'
    (sustain enabled, start == end == the sustain node) and 'nodes' (exactly
    25 `(value_byte, minifloat_duration)` pairs, padded with zero-length
    copies of the final value)."""
    EPS = 0.004                                   # stages shorter than this are treated as instant
    depth_cents = z.me2pitch
    sustain_level = 1.0 - z.m_sustain_pc / 1000.0  # mod-env sustain level (0..1)

    def to_byte(u: float) -> int:
        # envelope position u (0..1) → pitch byte around the 0x80 unity point
        unit = 0.5 + (depth_cents * u / 100.0) / 32.0
        unit = max(0.0, min(1.0, unit))
        return max(0, min(255, round(255 * unit)))

    def gated(secs: float) -> float:
        # durations under EPS collapse to zero
        return secs if secs >= EPS else 0.0

    unity = to_byte(0.0)
    peak = to_byte(1.0)

    # Each point is (value_byte, seconds until the next point).
    points = []
    if z.m_delay >= EPS:
        points.append((unity, z.m_delay))
    points.append((unity, gated(z.m_attack)))
    hold = gated(z.m_hold)
    if sustain_level < 1.0 and z.m_decay >= EPS:
        # peak, then decay into a distinct sustain node that releases over m_release
        points.append((peak, hold + z.m_decay))
        points.append((to_byte(sustain_level), gated(z.m_release)))
    else:
        # no decay stage: the peak itself is the sustain node
        # NOTE(review): this node's time-to-next is `hold`, not m_release — confirm intended.
        points.append((peak, hold))
    sustain_idx = len(points) - 1
    points.append((unity, 0.0))                   # release returns to unity (0x80)

    nodes = [(value, nearest_minifloat(secs)) for value, secs in points[:25]]
    nodes += [(nodes[-1][0], 0)] * (25 - len(nodes))

    sustain_idx = min(sustain_idx, 24)
    idx_bits = sustain_idx & 0x1F
    return {'loop': ENV_PRESENT_BIT,              # m-bit clear = pitch role
            'sustain': ENV_SUS_ENABLE | (idx_bits << 8) | idx_bits,
            'nodes': nodes}


def _zone_pf_envs(z: SFZone):
    """Return `(filter_env_block_or_None, pitch_env_block_or_None)` for a zone.

    Both come from the zone's single SF2 modulation envelope, which may drive
    either target or both. The filter block is whatever [_zone_filter_sf]
    reports as its fourth result; the pitch block is only built when the zone
    has a non-zero modEnvToPitch amount."""
    _sf_mode, _cutoff, _resonance, filter_env = _zone_filter_sf(z)
    pitch_env = None
    if z.me2pitch:
        pitch_env = _pitch_env_block(z)
    return filter_env, pitch_env


# ── SF2 long-sample resampling + synthesized sustain loop ─────────────────────
#
# Per-sample handling when a rendered MonoSample exceeds the 65535-frame u16 cap
# (terranmon.txt sample_length is u16). Two strategies, by the rate that fitting
# the WHOLE sample into 65535 frames would leave:
#   (1)/(2) rate >= SF2_RESAMPLE_FLOOR_HZ  → downsample the whole sample to 65535
#           frames (quality stays acceptable, full sample preserved).
#   (3)     rate <  SF2_RESAMPLE_FLOOR_HZ  → resample to the 32 kHz floor instead
#           (keeps full bandwidth), keep the first 65535 frames, and — when the
#           sample has NO loop of its own — synthesize a near-seamless forward
#           loop near the end so held notes keep sounding, plus a peak->0 decay
#           vol-envelope (see _synth_decay_vol_env) that retires the voice
#           ~SF2_SYNTH_DECAY_SEC after the note fires.
SF2_RESAMPLE_FLOOR_HZ = 32000        # TSVM native audio rate in Hz (= full-bandwidth floor)
SF2_SYNTH_DECAY_SEC   = 10.0         # looped-note fade-to-silence span in seconds (from note-on)
SF2_LOOP_HINT         = 8192         # spec's "last 8192 samples" → MAX loop period searched (frames)
SF2_LOOP_MIN_PERIOD   = 512          # min loop period in frames (avoid buzzy ultra-short loops)
SF2_LOOP_MATCH_WIN    = 256          # forward-window length in frames used to score a loop seam
SF2_LOOP_MATCH_STEP   = 2            # stride within the match window (speed/quality trade)
SF2_LOOP_COARSE_STEP  = 32           # period stride for the coarse search pass (also the ± radius of the fine pass)


def _synth_sustain_loop(data: bytes, cap: int, hint: int):
    """Truncate an originally UN-looped, already resampled sample to at most
    `cap` frames and choose a near-seamless forward loop close to its end.

    Returns `(body, loop_start, loop_end)`; the loop region is
    [loop_start, loop_end) with loop_end exclusive.

    loop_end is pinned SF2_LOOP_MATCH_WIN frames before the kept end. A
    candidate period p puts loop_start at loop_end − p and is scored by the
    sum of squared differences between the windows that FOLLOW loop_start and
    loop_end: forward playback wraps loop_end → loop_start, so when
    data[loop_start+k] ≈ data[loop_end+k] the texture after the wrap continues
    the texture before it (k = 0 is the seam sample itself).

    `hint` is only the MAXIMUM period considered. Periods in
    [SF2_LOOP_MIN_PERIOD, hint] are swept every SF2_LOOP_COARSE_STEP frames,
    then every period within one coarse step of the winner is tried. Ties keep
    the earlier-found candidate. If the kept data is too short for that search,
    a 2-frame loop at the very end is returned instead."""
    window = SF2_LOOP_MATCH_WIN
    coarse = SF2_LOOP_COARSE_STEP
    kept = min(len(data), cap)
    body = data[:kept]

    # Keep the forward match window [loop_end, loop_end + window) inside the data.
    loop_end = kept - window
    longest = min(hint, loop_end)
    shortest = min(SF2_LOOP_MIN_PERIOD, longest)
    if loop_end <= shortest:                   # too short to loop (not expected in case 3)
        return body, max(0, kept - 2), kept

    probe = range(0, window, SF2_LOOP_MATCH_STEP)

    def seam_err(start: int) -> int:
        # squared mismatch between the windows following `start` and loop_end
        return sum((data[start + k] - data[loop_end + k]) ** 2 for k in probe)

    def sweep(periods, best):
        # best is (error, period); only a strictly smaller error replaces it
        for period in periods:
            err = seam_err(loop_end - period)
            if err < best[0]:
                best = (err, period)
        return best

    best = (seam_err(loop_end - shortest), shortest)
    best = sweep(range(shortest + coarse, longest + 1, coarse), best)

    fine_lo = max(shortest, best[1] - coarse)
    fine_hi = min(longest, best[1] + coarse)
    best = sweep(range(fine_lo, fine_hi + 1), best)

    loop_start = max(0, min(loop_end - 2, loop_end - best[1]))
    return body, loop_start, loop_end


def _synth_decay_vol_env(decay_sec: float) -> dict:
    """Volume-envelope block for a synthesized-loop sample: an immediate peak that
    decays exponentially (linear-dB) to silence over `decay_sec`, with NO sustain
    or loop wrap. The looped sample would otherwise sound forever; this envelope
    fades it from the instant the note fires and — because there is no wrap
    (resolveEnvWrap returns range (-1,-1)) — the engine's fall-through
    'envelope ends at 0 => cut' rule (AudioAdapter.kt:1693/1701) retires the voice
    once it reaches the terminal 0 node, ~decay_sec after firing, regardless of
    key state. The drop spans the representable 63->1 range (~36 dB); the final
    node is a true 0 terminator.

    Args:
        decay_sec: total fade time in seconds, split evenly across the segments.

    Returns:
        dict with 'loop' (present bit only), 'sustain' (0) and 'nodes'
        (exactly 25 `(value, minifloat_duration)` pairs, zero-padded).

    The segment count from [_env_seg_count] is clamped to 1..24: the record holds
    25 nodes and the last used one must be the 0 terminator, so more than 24
    segments would push the terminator out of the table (and the voice would
    never be cut); fewer than 1 would divide by zero."""
    DROP_CB  = 360.0                           # 63 -> 1 fills the whole decay span
    MAX_SEGS = 24                              # 25 node slots minus the terminal 0 node
    n   = max(1, min(MAX_SEGS, _env_seg_count(decay_sec)))
    seg = decay_sec / n
    pts = [(63, seg)]                          # peak, held one segment then decays
    for i in range(1, n):
        v = round(63 * 10.0 ** (-(DROP_CB * (i / n)) / 200.0))
        pts.append((max(1, min(63, v)), seg))
    pts.append((0, 0.0))                       # terminal 0 node => fall-through cut
    nodes = [(v, nearest_minifloat(d)) for v, d in pts]
    nodes.extend([(0, 0)] * (25 - len(nodes)))
    return {'loop': ENV_PRESENT_BIT, 'sustain': 0, 'nodes': nodes}


def _effective_vol_env(z: SFZone, ms: 'MonoSample') -> dict:
    """Pick the volume-envelope block for a (zone, sample) pair.

    A sample carrying a synthesized loop (`ms.synth_decay` set) gets the
    peak->0 decay envelope, which replaces the SF2 ADSR entirely. Any other
    sample — or no sample at all — gets the zone's own SF2 ADSR shape from
    [_vol_env_block]."""
    synth_decay = None if ms is None else ms.synth_decay
    if synth_decay is None:
        adsr_block, _unused_a, _unused_b = _vol_env_block(z)
        return adsr_block
    return _synth_decay_vol_env(synth_decay)


def build_sample_inst_bin(sf: SF2, pool: list, layer_insts: list, meta_records: list,
                          fadeout_override, bpm0: int):
    """Render & pool every used MonoSample (with the 65535-byte per-sample
    and 8 MB global caps), write the 256-byte normal-instrument records for every
    layer instrument, then the Metainstrument records. Returns the raw
    SAMPLEINST_SIZE image.

    Args:
        sf: the SoundFont handed to each sample's `render`.
        pool: the MonoSamples to render and pack; each one's `data`, `ratio`,
            `offset` (and, for synthesized loops, `synth_loop` / `synth_decay`)
            are updated in place.
        layer_insts: layer instruments; only those with `usable` set get a
            record, written at `slot * 256` from their `canonical` zone.
        meta_records: `(meta_slot, name, layer_descs)` tuples, where each layer
            desc is `(layer_slot, (pitch_lo, pitch_hi, vel_lo, vel_hi))`.
        fadeout_override: forwarded to [_zone_fadeout].
        bpm0: forwarded to [_zone_fadeout].

    Returns:
        bytes: the SAMPLEBIN_SIZE sample pool followed by the INSTBIN_SIZE
        instrument table."""
    for ms in pool:
        ms.render(sf)

    # Per-sample u16 cap. A sample over the 65535-frame limit is shrunk one of two
    # ways (see the SF2 long-sample section above): downsample the whole thing when
    # that keeps the rate >= 32 kHz; otherwise resample to the 32 kHz floor, keep the
    # first 65535 frames and synthesize a sustain loop + decay (only when the sample
    # has no loop of its own — a sample with an SF2 loop is left to fall-through, as
    # its loop already lets it sustain within whatever frames fit).
    for ms in pool:
        native_len = len(ms.data)
        if native_len <= SAMPLE_LEN_LIMIT:
            continue
        r_fit    = SAMPLE_LEN_LIMIT / native_len
        rate_fit = ms.rate * r_fit
        r32      = SF2_RESAMPLE_FLOOR_HZ / ms.rate
        # loop_end in 32 kHz frames (0 when unlooped) decides whether a 32 kHz render
        # still contains the loop within the 65535-frame cap.
        le32 = round(ms.loop_native[1] * r32) if ms.loop_native else 0

        def _fit_whole():
            """(1)/(2) downsample the WHOLE sample to <= 65535 frames. Used when the
            fitted rate stays >= 32 kHz, or as the fall-back for a looped sample whose
            loop sits past the cap at 32 kHz (only fit-to-cap keeps that far loop)."""
            ms.data   = resample_linear(ms.data, r_fit)
            ms.ratio *= len(ms.data) / native_len

        if rate_fit >= SF2_RESAMPLE_FLOOR_HZ:
            _fit_whole()
            vprint(f"  info: '{ms.name}' {native_len} frames > 64K cap; "
                   f"resampling by {r_fit:.4f} (rate {rate_fit:.0f} Hz)")
        elif ms.loop_native is None:
            # (3) No loop: resample to the 32 kHz floor (full bandwidth), keep the first
            # 65535 frames and synthesize a near-seamless sustain loop near the end, plus
            # a peak->0 decay vol-envelope that fades the looped note to silence from
            # note-on (the SF2 sample stops on its own otherwise; a loop would ring).
            # NOTE(review): when ms.rate is below the floor, r32 > 1 and this UPsamples
            # before truncating — confirm that is intended.
            resampled = resample_linear(ms.data, r32)
            ms.ratio *= len(resampled) / native_len    # effective rate -> 32 kHz
            ms.data   = resampled
            body, ls, le = _synth_sustain_loop(ms.data, SAMPLE_LEN_LIMIT, SF2_LOOP_HINT)
            ms.data        = body
            ms.synth_loop  = (ls, le)
            ms.synth_decay = SF2_SYNTH_DECAY_SEC
            vprint(f"  info: '{ms.name}' {native_len} frames > 64K cap, long & unlooped; "
                   f"32 kHz, kept {len(body)} frames, synth loop [{ls}..{le}] "
                   f"+ {SF2_SYNTH_DECAY_SEC:.0f}s decay")
        elif le32 <= SAMPLE_LEN_LIMIT - 2:
            # (3) Looped, and the loop fits at the 32 kHz floor: resample to 32 kHz and
            # keep the first 65535 frames. The per-patch loop points (native * ratio)
            # land within the kept data, so the SF2 loop + ADSR are preserved at full
            # bandwidth (a sustain-loop release tail past loop_end is truncated to fit).
            resampled = resample_linear(ms.data, r32)
            ms.ratio *= len(resampled) / native_len
            ms.data   = resampled[:SAMPLE_LEN_LIMIT]
            vprint(f"  info: '{ms.name}' {native_len} frames > 64K cap, long & looped; "
                   f"32 kHz, kept first {len(ms.data)} frames (loop_end {le32})")
        else:
            # (3) Looped but the loop sits past the 65535-frame cap at 32 kHz (a far-end
            # sustain loop on a multi-second sample): the floor rate can't hold it, so
            # downsample the whole sample to fit — the ratio-scaled loop stays valid,
            # at a sub-32 kHz rate. (This is the pre-existing fit-to-cap behaviour.)
            _fit_whole()
            vprint(f"  info: '{ms.name}' {native_len} frames > 64K cap, long, looped, "
                   f"far loop; fit-to-cap by {r_fit:.4f} (rate {ms.rate * r_fit:.0f} Hz)")

    # Global 8 MB pool cap. Resamples every sample down equally; synthesized loop
    # points ride the same ratio so the loop stays valid in the shrunken data.
    total = sum(len(ms.data) for ms in pool)
    if total > SAMPLEBIN_SIZE:
        g = SAMPLEBIN_SIZE / total
        vprint(f"  info: sample pool overflow ({total} bytes); "
               f"resampling all by {g:.4f}")
        for ms in pool:
            old = len(ms.data)
            ms.data = resample_linear(ms.data, g)
            ms.ratio *= len(ms.data) / old
            if ms.synth_loop is not None:
                le = min(len(ms.data) - 1, round(ms.synth_loop[1] * g))
                ls = max(0, min(le - 2, round(ms.synth_loop[0] * g)))
                ms.synth_loop = (ls, le)

    # Pack the samples back to back; each sample's `offset` is its start in the pool.
    sample_bin = bytearray(SAMPLEBIN_SIZE)
    pos = 0
    for ms in pool:
        n = min(len(ms.data), SAMPLEBIN_SIZE - pos)
        if n < len(ms.data):
            vprint(f"  warning: pool full, truncating '{ms.name}'")
            ms.data = ms.data[:n]
            # NOTE(review): if n is 0 here, `le` becomes -1 and the unsigned '<H' pack of
            # the loop end further down would raise — confirm this cannot happen.
            if ms.synth_loop is not None:        # keep the synthesized loop inside the data
                le = min(n - 1, ms.synth_loop[1])
                ms.synth_loop = (max(0, min(le - 2, ms.synth_loop[0])), le)
        sample_bin[pos:pos+n] = ms.data
        ms.offset = pos
        pos += n
    vprint(f"  sample pool: {len(pool)} sample(s), {pos} bytes")

    inst_bin = bytearray(INSTBIN_SIZE)
    for ti in layer_insts:
        if not ti.usable:
            continue
        c  = ti.canonical
        ms = c.ms
        r  = ms.ratio
        base = ti.slot * 256
        # Record header: sample offset (u32), length (u16), playback rate (u16).
        struct.pack_into('<I', inst_bin, base + 0, ms.offset)
        struct.pack_into('<H', inst_bin, base + 4, min(len(ms.data), 0xFFFF))
        struct.pack_into('<H', inst_bin, base + 6,
                         max(1, min(0xFFFF, round(ms.rate * r))))
        struct.pack_into('<H', inst_bin, base + 8, 0)            # play start
        # Synthesized-loop samples carry their loop in the final output-frame domain
        # (already scaled by every resample) and force a plain forward loop (mode 1);
        # otherwise the canonical zone's SF2 loop, scaled by this sample's ratio.
        if ms.synth_loop is not None:
            ls_w, le_w, lm_w = ms.synth_loop[0], ms.synth_loop[1], 1
        else:
            ls_w = round(c.loop_start * r)
            le_w = round(c.loop_end   * r)
            lm_w = c.loop_mode
        struct.pack_into('<H', inst_bin, base + 10, min(0xFFFF, ls_w))
        struct.pack_into('<H', inst_bin, base + 12, min(0xFFFF, le_w))
        inst_bin[base + 14] = lm_w

        def wenv(loop_off, sus_off, nodes_off, blk):
            """Write one envelope block into the current record: the LOOP and SUSTAIN
            words at `loop_off` / `sus_off`, then 25 (value, minifloat) byte pairs from
            `nodes_off`; a short node list is padded with its last value and zero time."""
            struct.pack_into('<H', inst_bin, base + loop_off, blk['loop'] & 0xFFFF)
            struct.pack_into('<H', inst_bin, base + sus_off,  blk['sustain'] & 0xFFFF)
            nodes = list(blk['nodes'])
            for k in range(25):
                v, mf = nodes[k] if k < len(nodes) else (nodes[-1][0] if nodes else 0, 0)
                inst_bin[base + nodes_off + k*2]     = v & 0xFF
                inst_bin[base + nodes_off + k*2 + 1] = mf & 0xFF

        # Volume envelope from the canonical zone's SF2 ADSR (delay/attack/hold/decay,
        # single-node sustain held while key is on). There is NO release leg: on key-off
        # the voice holds at the sustain node and the Volume Fadeout (NNA Note Fade) is
        # the SF2 release segment (see _zone_fadeout). initialAttenuation is carried
        # separately (byte 251 / 'x' octet), not folded into the node peak. Non-canonical
        # zones with a different ADSR carry their own per-patch vol_env (see
        # Patch.to_ixmp_dict); the base record is the canonical / fall-through. A
        # synthesized-loop sample instead uses a peak->0 decay envelope (no sustain) so
        # its otherwise-infinite loop fades to silence ~SF2_SYNTH_DECAY_SEC after firing.
        wenv(15, 189, 21, _effective_vol_env(c.zone, ms))
        # Pan envelope: none (default unity nodes; P bit clear in LOOP word).
        struct.pack_into('<H', inst_bin, base + 17, 0)
        for k in range(25):
            inst_bin[base + 71 + k*2] = 0x80
        # Pitch/filter envelopes — SEPARATE, fixed slots (issue 2): slot #1 (bytes
        # 19/121) is the FILTER envelope (m-bit set), defaulting flat to 0xFF
        # (fully OPEN — the engine's filter-env neutral, since currentCutoff =
        # baseCut·envValue and 1.0 = open); slot #2 (bytes 197/201) is the PITCH
        # envelope (m-bit clear), defaulting flat to 0x80 (unity, no transpose). A
        # flat slot keeps its LOOP word at 0 (P-bit clear) so the engine ignores it.
        sf_mode, cut16, res16, filt_env = _zone_filter_sf(c.zone)
        pit_env = _pitch_env_block(c.zone) if c.zone.me2pitch else None
        for k in range(25):
            inst_bin[base + 121 + k*2] = 0xFF                    # filter-env (slot 1) flat = open
            inst_bin[base + 201 + k*2] = 0x80                    # pitch-env (slot 2) flat = unity
        if filt_env is not None:
            wenv(19, 193, 121, filt_env)
        if pit_env is not None:
            wenv(197, 199, 201, pit_env)

        # Volume Fadeout = the SF2 release segment (NNA Note Fade below). Derived from
        # the canonical zone's releaseVolEnv against the 100 dB envelope floor; see
        # _zone_fadeout for the timecent→step derivation. A synthesized-loop sample
        # disables the key-off fadeout (its decay is the vol-envelope, which runs from
        # note-on regardless of key state) so key-off does not cut it short.
        fo = 0 if ms.synth_loop is not None else _zone_fadeout(c.zone, bpm0, fadeout_override)
        inst_bin[base + 171] = 0xFF                              # IGV (unit)
        inst_bin[base + 172] = fo & 0xFF
        # byte 173: bits 0-3 = fadeout high nibble, bit 4 = SF filter mode (cutoff/resonance
        # are 16-bit SoundFont cents/centibels in bytes 182<<8|252 / 183<<8|253).
        inst_bin[base + 173] = ((fo >> 8) & 0x0F) | (0x10 if sf_mode else 0)
        inst_bin[base + 177] = (0x80 if c.pan8 == IXMP_PAN_NO_OVERRIDE
                                else c.pan8)                     # default pan
        struct.pack_into('<H', inst_bin, base + 178, TAUD_C4)    # PPC
        inst_bin[base + 182] = (cut16 >> 8) & 0xFF               # cutoff high (SF cents / IT byte)
        inst_bin[base + 252] = cut16 & 0xFF                      # cutoff low  (SF mode)
        inst_bin[base + 183] = (res16 >> 8) & 0xFF               # resonance high
        inst_bin[base + 253] = res16 & 0xFF                      # resonance low (SF mode)
        struct.pack_into('<H', inst_bin, base + 184, c.detune & 0xFFFF)
        # NNA = Note Fade (0b11) for every instrument, drum kits included. On any
        # key-off the voice holds at the sustain node and the Volume Fadeout performs
        # the SF2 release segment; when a fresh note displaces this voice the engine
        # ghosts it and starts the same fadeout, so released/displaced notes always
        # die over their own release time. (Supersedes the old melodic Key-Lift /
        # drum Continue split — the release now lives in the fadeout, not env nodes.)
        inst_bin[base + 186] = 0b11
        inst_bin[base + 196] = 255                               # default note vol
        # initialAttenuation (byte 251, dB-table octet) — the canonical zone's static gain,
        # applied per-voice by the mixer (no longer folded into the vol-env). Per-patch zones
        # with a different attenuation carry their own octet in the Ixmp 'x' block.
        inst_bin[base + 251] = atten_cb_to_octet(c.zone.atten_cb) & 0xFF

    # Metainstrument records: a 0xFFFF-sentinel sample pointer (high 16 bits) plus a
    # layer table (terranmon.txt "Metainstrument definition"). Layers stay neutral
    # (unity mix, zero detune); per-zone level/tune already live in each layer
    # instrument's patches. The note references the meta slot; the engine fans out.
    for meta_slot, _name, layer_descs in meta_records:
        base = meta_slot * 256
        inst_bin[base + 0] = 0                                  # type 0 = layered
        inst_bin[base + 1] = len(layer_descs) & 0xFF            # layer count
        inst_bin[base + 2] = 0xFF; inst_bin[base + 3] = 0xFF    # identifier (hi 16 bits)
        o = base + 4
        # Each layer entry is 10 bytes: slot, mix octet, detune (s16), pitch lo/hi (u16), vel lo/hi.
        for layer_slot, rect in layer_descs:
            plo, phi, vlo, vhi = rect
            inst_bin[o]     = layer_slot & 0xFF
            inst_bin[o + 1] = META_UNITY_OCTET
            struct.pack_into('<h', inst_bin, o + 2, 0)          # sample detune (neutral)
            struct.pack_into('<H', inst_bin, o + 4, plo & 0xFFFF)
            struct.pack_into('<H', inst_bin, o + 6, phi & 0xFFFF)
            inst_bin[o + 8] = vlo & 0x3F
            inst_bin[o + 9] = vhi & 0x3F
            o += 10

    return bytes(sample_bin) + bytes(inst_bin)


# ── Cell grid (voices × rows) ────────────────────────────────────────────────

def _cell(cells: dict, v: int, row: int) -> dict:
    """Return the cell stored at (voice `v`, `row`), creating it first if absent.

    A new cell is empty: no-op note, instrument 0, fine-select volume and pan
    of 0, no effect, and the lowest (free) effect priority."""
    key = (v, row)
    existing = cells.get(key)
    if existing is not None:
        return existing
    blank = {'note': NOTE_NOP, 'inst': 0, 'vol': (SEL_FINE, 0),
             'pan': (SEL_FINE, 0), 'eff': None, 'prio': PRIO_FREE}
    cells[key] = blank
    return blank


def allocate_voices(notes: list, speed: int, max_voices: int) -> int:
    """Assign every note a voice column, greedily and row by row, using as few
    columns as possible (at most min(max_voices, NUM_VOICES), never below 1).

    A column becomes reusable as soon as its note is *released*: the engine's
    New Note Action moves the old note into the background-ghost pool when a
    fresh trigger arrives, and the Note-Fade tail rides that ghost. So a
    melodic column frees at its key-off row, a drum column (no key-off by
    default) on the row right after its trigger, and an exclusiveClass-choked
    note keeps its column through the choke row.

    Among free columns, one whose last note used the same instrument slot is
    preferred; otherwise the lowest-numbered free column. With no free column
    and the cap reached, a column is stolen: preferably one whose note is only
    held by the sustain pedal, and among the candidates the one freeing
    soonest. The victim's end_ft is pulled back to the thief's start, turning
    the steal into an early release.

    Mutates note.voice (and truncates stolen notes' end_ft). Returns the
    number of voices used."""
    limit = max(1, min(max_voices, NUM_VOICES))
    # One entry per voice column: [first row it is free again, last slot, current note].
    columns = []
    steals = 0

    for note in notes:
        trig_row = note.start_ft // speed
        idle = [i for i, col in enumerate(columns) if col[0] <= trig_row]
        same_slot = [i for i in idle if columns[i][1] == note.slot]

        if same_slot:
            voice = same_slot[0]
        elif idle:
            voice = idle[0]
        elif len(columns) < limit:
            voice = len(columns)
            columns.append([0, 0, None])
        else:
            # Steal preference: notes held only by the sustain pedal lose
            # least (their key is already up); otherwise the note ending
            # soonest. Either way NNA turns the steal into an early release.
            pedal_held = [i for i, col in enumerate(columns)
                          if col[2] is not None
                          and col[2].pedal_ft is not None
                          and col[2].pedal_ft <= note.start_ft]
            candidates = pedal_held if pedal_held else range(len(columns))
            voice = min(candidates, key=lambda i: columns[i][0])
            victim = columns[voice][2]
            if victim is not None and victim.end_ft > note.start_ft:
                victim.end_ft = note.start_ft
            steals += 1

        if note.drum:
            free_row = trig_row + 1                      # ghost carries the ring
        else:
            free_row = max(trig_row + 1, note.end_ft // speed)   # free at key-off row

        if note.excl_cut_ft is not None:
            # exclusiveClass choke: keep the column through the choke row so the note is
            # still FOREGROUND when the fast-fade cell lands, and so the choking
            # same-class note cannot take this column on that row.
            choke_row = note.excl_cut_ft // speed
            if choke_row <= trig_row:
                choke_row = trig_row + 1
            free_row = max(free_row, choke_row + 1)

        note.voice = voice
        columns[voice] = [free_row, note.slot, note]

    if steals:
        vprint(f"  info: polyphony exceeded {limit} voices; {steals} note(s) "
               f"released early (NNA ghost keeps the tail)")
    return len(columns)


def emit_cells(song: Song, insts: dict, speed: int, rpb: int,
               eps_units: float, drum_keyoff: bool, shift_ft: int,
               max_voices: int) -> tuple:
    """Place triggers, key-offs, portamento bend segments, M channel-volume
    and T tempo effects into the (voice,row) cell grid.

    Args:
        song: parsed song; `notes`, `channels`, `tempo_ft` and `tempo_bpm` are read.
            Notes with slot <= 0 are ignored.
        insts: not used by this function (kept for call compatibility).
        speed: ticks per row (a note time `ft` lands on row ft // speed, tick ft % speed).
        rpb: with `speed`, scales MIDI BPM to Taud BPM (× rpb·speed/24).
        eps_units: bend targets closer than this (in note units) to the current
            pitch are skipped as jitter.
        drum_keyoff: when false, drum notes get no key-off cell.
        shift_ft: offset added to note/row times before sampling the song-level
            curves (pan, bend, CC7/CC11) and subtracted from tempo event times.
        max_voices: voice cap handed to [allocate_voices].

    Returns:
        (cells, n_voices, total_rows, taud_bpm0) — the cell dict keyed by
        (voice, row), the number of voices used, the row count, and the clamped
        Taud tempo in effect at row 0.

    Exits the process with an error when there is no playable note."""
    notes = [n for n in song.notes if n.slot > 0]

    def midi_bpm_at(ft):
        i = bisect.bisect_right(song.tempo_ft, ft) - 1
        return song.tempo_bpm[i] if i >= 0 else 120.0

    scale = rpb * speed / 24.0

    def taud_bpm(b):
        t = round(b * scale)
        if not (25 <= t <= 280):
            vprint(f"  warning: tempo {b:.1f} BPM maps to Taud {t}, "
                   f"clamped to 25..280 (try a different --rpb/--speed)")
        return max(25, min(280, t))

    def delay_eff(tick):
        """S $Dx00 delay effect for a mid-row event. The tick count occupies a
        single nibble, so it is clamped to 15: a larger value shifted left by 8
        would spill into the top nibble and change the S sub-command."""
        return (TOP_S, 0xD000 | (min(tick, 0xF) << 8))

    n_voices = allocate_voices(notes, speed, max_voices)
    if n_voices == 0:
        sys.exit("error: no playable notes")
    vprint(f"  voices: {n_voices} used (cap {max_voices}; NNA carries tails)")

    cells = {}

    # ── Pass 1: triggers ──
    for n in notes:
        row, tick = n.start_ft // speed, n.start_ft % speed
        c = _cell(cells, n.voice, row)
        nv = key_to_noteval(n.key + n.bend0)
        c['note'] = nv
        c['inst'] = n.slot
        c['vol']  = (SEL_SET, round(n.vel * 63 / 127))
        st = song.channels[n.ch]
        if st.cc10_ft:
            pan = _curve_at(st.cc10_ft, st.cc10_val, n.start_ft + shift_ft, 64)
            c['pan'] = (SEL_SET, round(pan * 63 / 127))
        if tick > 0:
            c['eff']  = delay_eff(tick)
            c['prio'] = PRIO_DELAY

    # ── Pass 2: key-offs (both MIDI idioms arrive here as note.end_ft) ──
    skipped_offs = 0
    for n in notes:
        if n.drum and not drum_keyoff:
            continue
        row, tick = n.end_ft // speed, n.end_ft % speed
        srow = n.start_ft // speed
        if row == srow:
            # Sub-row note (shorter than one tracker row): its key-off would land on
            # its OWN trigger row, where the trigger cell already sits — pass 2 would
            # then skip it ("row taken") and the note would ring forever until the next
            # trigger on this voice. Push the key-off to the next row (tick 0) so a
            # staccato note rounds up to ~1 row instead of hanging. If the next row is
            # itself a fresh trigger, that note cuts/NNAs this one anyway (skip is fine).
            row = srow + 1
            tick = 0
        c = cells.get((n.voice, row))
        if c is None:
            c = _cell(cells, n.voice, row)
            c['note'] = NOTE_KEYOFF
            if tick > 0:
                c['eff']  = delay_eff(tick)
                c['prio'] = PRIO_DELAY
        elif c['note'] == NOTE_NOP:
            c['note'] = NOTE_KEYOFF
            if tick > 0 and c['eff'] is None:
                c['eff']  = delay_eff(tick)
                c['prio'] = PRIO_DELAY
        else:
            skipped_offs += 1    # row taken by a retrigger — which cuts/NNAs anyway
    if skipped_offs:
        vprint(f"  info: {skipped_offs} key-off(s) absorbed by same-row retriggers")

    # ── Pass 2b: exclusiveClass chokes (fast note-fade) ──
    # The choked note holds its voice through the choke row (allocate_voices), so the
    # NOTE_FASTFADE lands on it while it is still foreground. The next same-class note
    # plays on a different column, so this never collides with a fresh trigger.
    for n in notes:
        if n.excl_cut_ft is None:
            continue
        srow = n.start_ft // speed
        row, tick = n.excl_cut_ft // speed, n.excl_cut_ft % speed
        if row <= srow:          # choke within the trigger row → round up one row
            row = srow + 1
            tick = 0
        c = cells.get((n.voice, row))
        if c is None:
            c = _cell(cells, n.voice, row)
            c['note'] = NOTE_FASTFADE
            if tick > 0:
                c['eff']  = delay_eff(tick)
                c['prio'] = PRIO_DELAY
        elif c['note'] in (NOTE_NOP, NOTE_KEYOFF):
            c['note'] = NOTE_FASTFADE          # choke supersedes a natural key-off
            if tick > 0 and c['eff'] is None:
                c['eff']  = delay_eff(tick)
                c['prio'] = PRIO_DELAY
        # else: row already holds a fresh trigger — that note cuts/NNAs this one anyway.

    # ── Pass 3: pitch-bend portamento segments ──
    # One linear segment per row: the cell carries the exact 4096-TET target
    # plus G at units/tick sized to land on it by row end (G slides on the
    # speed-1 non-first ticks). Targets within eps_units are skipped (jitter
    # simplification).
    seg_count = 0
    if speed >= 2:
        for n in notes:
            st = song.channels[n.ch]
            if len(st.bend_ft) <= 1 and n.bend0 == 0.0:
                continue
            start_row = n.start_ft // speed
            end_row   = n.end_ft   // speed
            cur = key_to_noteval(n.key + n.bend0)
            for r in range(start_row + 1, end_row):
                ftr = min((r + 1) * speed, n.end_ft) + shift_ft
                target = key_to_noteval(
                    n.key + _curve_at(st.bend_ft, st.bend_val, ftr, 0.0))
                if abs(target - cur) < eps_units:
                    continue
                if (n.voice, r) in cells:
                    continue
                # ceil(|distance| / sliding ticks) so the slide reaches the target in-row
                step = -(-abs(target - cur) // (speed - 1))
                c = _cell(cells, n.voice, r)
                c['note'] = target
                c['eff']  = (TOP_G, min(0xFFFF, step))
                c['prio'] = PRIO_PORTA
                cur = target
                seg_count += 1
    elif any(len(st.bend_ft) > 1 for st in song.channels):
        vprint("  warning: --speed 1 cannot express portamento; "
               "pitch-bend movement dropped")
    if seg_count:
        vprint(f"  bend: {seg_count} portamento segment(s) emitted")

    # ── Pass 4: M channel volume (CC7 × CC11), per voice chronologically ──
    by_voice = {}
    for n in notes:
        by_voice.setdefault(n.voice, []).append(n)
    m_emitted = 0
    for v, vnotes in by_voice.items():
        vnotes.sort(key=lambda n: n.start_ft)
        m_state = 0x3F                            # engine channel_vol default
        for n in vnotes:
            st = song.channels[n.ch]
            for r in range(n.start_ft // speed, n.end_ft // speed + 1):
                ftr = r * speed + shift_ft
                m = round(_curve_at(st.cc7_ft,  st.cc7_val,  ftr, 100) / 127
                          * _curve_at(st.cc11_ft, st.cc11_val, ftr, 127) / 127
                          * 63)
                if m == m_state:
                    continue
                c = _cell(cells, v, r)
                if c['eff'] is not None:
                    continue                      # slot busy — retry next row
                c['eff']  = (TOP_M, (m & 0x3F) << 8)
                c['prio'] = PRIO_M
                m_state = m
                m_emitted += 1
    if m_emitted:
        vprint(f"  cc: {m_emitted} M channel-volume effect(s) emitted")

    total_rows = max(r for (_v, r) in cells) + 1

    # ── Pass 5: T tempo changes ──
    bpm0 = midi_bpm_at(shift_ft)                  # tempo in effect at row 0
    taud_bpm0 = taud_bpm(bpm0)                    # computed once: a clamp warning prints once
    last = taud_bpm0
    t_emitted = t_evict = 0
    for ft, b in zip(song.tempo_ft, song.tempo_bpm):
        row = (ft - shift_ft) // speed
        if row < 0:
            continue
        if row >= total_rows:
            break
        tb = taud_bpm(b)
        if tb == last:
            continue
        placed = False
        victim = None
        # First choice: any voice whose effect slot on this row is empty. While
        # scanning, remember the lowest-priority occupied cell below PRIO_DELAY
        # as an eviction candidate.
        for v in range(n_voices):
            c = cells.get((v, row))
            if c is None or c['eff'] is None:
                c = _cell(cells, v, row)
                c['eff']  = (TOP_T, ((tb - 25) & 0xFF) << 8)
                c['prio'] = PRIO_TEMPO
                placed = True
                break
            if c['prio'] < PRIO_DELAY and (victim is None
                                           or c['prio'] < victim['prio']):
                victim = c
        if not placed and victim is not None:
            if victim['prio'] == PRIO_PORTA:
                victim['note'] = NOTE_NOP         # orphan G note would retrigger
            victim['eff']  = (TOP_T, ((tb - 25) & 0xFF) << 8)
            victim['prio'] = PRIO_TEMPO
            placed = True
            t_evict += 1
        if placed:
            last = tb
            t_emitted += 1
    if t_emitted:
        vprint(f"  tempo: {t_emitted} T effect(s)"
               + (f" ({t_evict} evicted a lesser effect)" if t_evict else ""))

    return cells, n_voices, total_rows, taud_bpm0


# ── Pattern / cue emission and final assembly ────────────────────────────────

def build_pattern_bin(cells: dict, n_voices: int, n_cues: int) -> bytes:
    """Serialise the sparse cell grid into the raw (uncompressed) pattern bin.

    Patterns are written cue-major, voice-minor: one ``PATTERN_BYTES`` pattern
    per (cue, voice) pair, each holding ``PATTERN_ROWS`` rows of 8 bytes:

        bytes 0-1  note, little-endian u16
        byte  2    instrument
        byte  3    volume column: low 6 bits = second element of ``c['vol']``,
                   high 2 bits = first element
        byte  4    pan column, packed the same way from ``c['pan']``
        byte  5    effect opcode (left 0 when the cell has no effect)
        bytes 6-7  effect argument, little-endian u16

    ``cells`` maps ``(voice, absolute_row)`` to a cell dict. A missing cell is
    emitted as 0xC0 in both the volume and pan bytes, everything else zero.
    """
    image = bytearray(n_cues * n_voices * PATTERN_BYTES)
    pattern_start = 0
    for cue in range(n_cues):
        first_row = cue * PATTERN_ROWS
        for voice in range(n_voices):
            for row in range(PATTERN_ROWS):
                at = pattern_start + row * 8
                cell = cells.get((voice, first_row + row))
                if cell is None:
                    # Empty row: only the volume/pan bytes are non-zero.
                    image[at + 3] = 0xC0
                    image[at + 4] = 0xC0
                    continue
                vol_sel, vol_val = cell['vol']
                pan_sel, pan_val = cell['pan']
                # '<' disables padding, so these five bytes land on at..at+4.
                struct.pack_into(
                    '<HBBB', image, at,
                    cell['note'] & 0xFFFF,
                    cell['inst'] & 0xFF,
                    (vol_val & 0x3F) | ((vol_sel & 3) << 6),
                    (pan_val & 0x3F) | ((pan_sel & 3) << 6))
                effect = cell['eff']
                if effect is not None:
                    opcode, argument = effect
                    # Opcode at at+5, argument at at+6..at+7.
                    struct.pack_into('<BH', image, at + 5,
                                     opcode & 0xFF, argument & 0xFFFF)
            pattern_start += PATTERN_BYTES
    return bytes(image)


def assemble_taud(sf: SF2, song: Song, layer_insts: list, meta_records: list,
                  slot_name: dict, pool: list, args) -> bytes:
    """Build the complete .taud file image for ``song`` and return it as bytes.

    Output layout, in order: header, compressed sample+instrument bin, the
    song entry, compressed pattern bin, compressed cue sheet and — unless
    ``args.no_project_data`` is set — the project-data section (names + Ixmp).

    Side effect: when the song opens with silent rows, every note in
    ``song.notes`` has its start / end / exclusive-cut times shifted in place
    so the first trigger lands on row 0.

    Exits via ``sys.exit`` when the song needs more cues than ``NUM_CUES`` or
    more patterns than ``NUM_PATTERNS_MAX``.
    """
    speed = args.speed
    rpb = args.rpb

    # Leading-silence trim: slide the grid so the earliest trigger is row 0.
    lead_rows = min(note.start_ft // speed
                    for note in song.notes if note.slot > 0)
    shift_ft = lead_rows * speed
    if shift_ft:
        vprint(f"  info: trimming {lead_rows} leading silent row(s)")
        for note in song.notes:
            note.start_ft -= shift_ft
            note.end_ft -= shift_ft
            if note.excl_cut_ft is not None:
                note.excl_cut_ft -= shift_ft

    # --bend-epsilon is given in cents; convert at 4096 units per 1200 cents.
    eps_units = args.bend_epsilon * 4096.0 / 1200.0
    cells, n_voices, total_rows, bpm0 = emit_cells(
        song, None, speed, rpb, eps_units, args.drum_keyoff, shift_ft,
        args.max_voices)

    n_cues = -(-total_rows // PATTERN_ROWS)       # ceiling division
    n_patterns = n_cues * n_voices
    if n_cues > NUM_CUES:
        sys.exit(f"error: song needs {n_cues} cues > {NUM_CUES} limit "
                 f"(try a smaller --rpb)")
    if n_patterns > NUM_PATTERNS_MAX:
        sys.exit(f"error: {n_cues} cues × {n_voices} voices "
                 f"> {NUM_PATTERNS_MAX} pattern limit")

    # ── Pattern bin (deduplicated) ──
    raw_patterns = build_pattern_bin(cells, n_voices, n_cues)
    pat_bin, remap, n_unique = deduplicate_patterns(raw_patterns, n_patterns)
    vprint(f"  patterns: {n_patterns} → {n_unique} unique; "
           f"{n_cues} cue(s), {n_voices} voice(s), {total_rows} rows")

    # ── Cue sheet: blank every entry first, then fill the cues in use ──
    sheet = bytearray(NUM_CUES * CUE_SIZE)
    for ci in range(NUM_CUES):
        lo = ci * CUE_SIZE
        sheet[lo:lo + CUE_SIZE] = encode_cue([], 0)
    final_cue = n_cues - 1
    for ci in range(n_cues):
        first_pat = ci * n_voices
        pats = [remap[first_pat + v] for v in range(n_voices)]
        rows_left = total_rows - ci * PATTERN_ROWS
        # NOTE(review): n_cues is a ceiling division, so rows_left can only be
        # < PATTERN_ROWS on the final cue, which already takes HALT — the
        # length branch below looks unreachable. Confirm whether a short final
        # cue was meant to carry a length instruction as well.
        if ci == final_cue:
            instr = CUE_INST_HALT
        elif rows_left < PATTERN_ROWS:
            instr = cue_instruction_len(rows_left)
        else:
            instr = CUE_INST_NOP
        lo = ci * CUE_SIZE
        sheet[lo:lo + CUE_SIZE] = encode_cue(pats, instr)

    # ── Sample + instrument bin ──
    sampleinst_raw = build_sample_inst_bin(sf, pool, layer_insts, meta_records,
                                           args.fadeout, bpm0)
    assert len(sampleinst_raw) == SAMPLEINST_SIZE
    compressed = compress_blob(sampleinst_raw, "sample+inst bin")
    comp_size = len(compressed)

    pat_comp = compress_blob(pat_bin, "pattern bin")
    cue_comp = compress_blob(bytes(sheet), "cue sheet")

    # The song entry sits right after the compressed sample+inst bin; the
    # song's own data (patterns, cues) follows the entry.
    song_table_off = TAUD_HEADER_SIZE + comp_size
    entry = encode_song_entry(
        song_offset=song_table_off + TAUD_SONG_ENTRY,
        num_voices=n_voices,
        num_patterns=n_unique,
        bpm_stored=(bpm0 - 25) & 0xFF,
        tick_rate=speed,
        base_note=0xA000,
        base_freq=8363.0,
        flags_byte=0x00,                          # linear pitch mode
        pat_bin_comp_size=len(pat_comp),
        cue_sheet_comp_size=len(cue_comp),
        global_vol=0xFF,
        mixing_vol=0xFF,
    )

    # ── Project data: names + the Ixmp section recreating SF2 layering ──
    proj_data = b''
    if not args.no_project_data:
        # Names are indexed by slot (0 = unused). Layer slots carry the
        # (suffixed) layer instrument name; meta slots the bare preset name.
        top_slot = max((0, *slot_name))
        inst_names = [''] * (top_slot + 1)
        for slot, label in slot_name.items():
            inst_names[slot] = label
        smp_names = ['', *(ms.name for ms in pool)]

        # Every usable instrument contributes its non-canonical patches.
        ixmp = {}
        for ti in layer_insts:
            if ti.usable:
                extra = [patch.to_ixmp_dict(ti.canonical, bpm0, args.fadeout)
                         for patch in ti.patches
                         if patch is not ti.canonical]
                if extra:
                    ixmp[ti.slot] = extra
        if ixmp:
            vprint(f"  ixmp: {sum(len(p) for p in ixmp.values())} patch(es) "
                   f"across {len(ixmp)} instrument(s)")

        # Fall back to the input file's stem when the MIDI has no title.
        title = song.title
        if not title:
            title = os.path.splitext(os.path.basename(args.input))[0]
        proj_data = build_project_data(
            project_name=title,
            instrument_names=inst_names,
            sample_names=smp_names,
            ixmp_patches=ixmp or None,
        )

    # ── Header ──
    # The second u32 is the project-data offset; it stays 0 unless patched
    # below. The signature is space-padded / truncated to exactly 14 bytes.
    header = (TAUD_MAGIC
              + bytes([TAUD_VERSION, 1])
              + struct.pack('<II', comp_size, 0)
              + SIGNATURE.ljust(14, b' ')[:14])
    assert len(header) == TAUD_HEADER_SIZE

    # ── Final concatenation ──
    out = bytearray()
    for part in (header, compressed, entry, pat_comp, cue_comp):
        out += part
    if proj_data:
        proj_off = len(out)
        struct.pack_into('<I', out, 14, proj_off)  # patch header offset field
        out += proj_data
        vprint(f"  project data: {len(proj_data)} bytes @ {proj_off}")
    return bytes(out)


# ── Main ──────────────────────────────────────────────────────────────────────

def main():
    """Command-line entry point: convert a MIDI file + SoundFont into a .taud.

    Pipeline: parse and range-check the options, parse the MIDI and the SF2,
    resolve every preset in use into layer instruments / Metainstruments,
    assign instrument-bin slots, drop notes whose preset did not resolve,
    assemble the file image and write it to ``args.output``.

    Exits via ``sys.exit`` with an error message when an option is out of
    range, the MIDI contains no playable notes, or no note survives preset
    resolution. The SoundFont file handle is closed on every exit path.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('input',     help='Input .mid file')
    ap.add_argument('soundfont', help='SoundFont 2 (.sf2) sample library')
    ap.add_argument('output', nargs='?', default=None,
                    help='Output .taud (default: input stem + .taud)')
    ap.add_argument('--perc-force-mapping', nargs=2, type=int, default=None,
                    metavar=('BANK', 'INST'),
                    help='Force the percussion channel to this SF2 preset '
                         '(default: bank 128, channel program)')
    ap.add_argument('--rpb', type=int, default=4, choices=(2, 4, 8, 16),
                    help='Rows per beat (default 4 = 16th-note rows)')
    ap.add_argument('--speed', type=int, default=6,
                    help='Ticks per row, 1..15 (default 6)')
    ap.add_argument('--fadeout', type=int, default=None,
                    help='Override the computed fadeout step (0..4095). By '
                         'default each instrument/patch gets a Volume Fadeout '
                         'reproducing its SF2 release segment (releaseVolEnv vs '
                         'the 100 dB floor), played out via NNA Note Fade')
    ap.add_argument('--max-voices', type=int, default=20,
                    help='Voice-column budget, 1..20 (default 20). NNA '
                         'background ghosts carry release/ring tails, so '
                         'few foreground voices are needed; songs exceeding '
                         'the budget release the oldest pedal-held or '
                         'soonest-ending note early')
    ap.add_argument('--max-layers', type=int, default=4,
                    help='Max simultaneous layers per note (default 4). Each SF2 '
                         'preset is split into this many disjoint layers; presets '
                         'needing >1 layer become a Metainstrument. 1 disables '
                         'layering (first-zone-wins, like the old behaviour). '
                         'Covers ~93%% of big-bank presets at 4, ~98%% at 5')
    ap.add_argument('--bend-epsilon', type=float, default=4.0,
                    help='Pitch-bend simplification threshold in cents '
                         '(default 4.0); smaller = more faithful')
    ap.add_argument('--drum-keyoff', action='store_true',
                    help='Emit KEY_OFF for percussion-channel notes too '
                         '(GM drums normally ignore note-off)')
    ap.add_argument('--no-project-data', action='store_true',
                    help='Omit the Project Data section — NOTE: this also '
                         'omits Ixmp, collapsing every instrument to its '
                         'canonical sample')
    ap.add_argument('-v', '--verbose', action='store_true')
    args = ap.parse_args()
    set_verbose(args.verbose)

    if not (1 <= args.speed <= 15):
        sys.exit("error: --speed must be 1..15")
    if not (1 <= args.max_voices <= 20):
        sys.exit("error: --max-voices must be 1..20")
    if not (1 <= args.max_layers <= 25):
        sys.exit("error: --max-layers must be 1..25")
    if args.output is None:
        args.output = os.path.splitext(args.input)[0] + '.taud'

    vprint(f"parsing MIDI '{args.input}'…")
    division, merged = parse_midi(args.input)
    song = extract_song(division, merged, args.rpb, args.speed)
    vprint(f"  {len(song.notes)} note(s), {len(song.tempo_ft)} tempo event(s)")
    if not song.notes:
        sys.exit("error: MIDI contains no playable notes")

    vprint(f"parsing SF2 '{args.soundfont}'…")
    sf = parse_sf2(args.soundfont)
    # The SF2 keeps an open file handle (sf.file) that later stages use;
    # release it on every path, including sys.exit() and unexpected
    # exceptions raised below.
    try:
        vprint(f"  {len(sf.presets)} preset(s), {len(sf.shdrs)} sample header(s)")

        # SF2 exclusiveClass percussion choking (closed hi-hat silences open
        # hi-hat, etc.).
        apply_exclusive_class(song, sf, args.perc_force_mapping)

        # Presets in first-use order; triggers keyed by the exact
        # (noteVal-with-initial-bend, vol6) pair the patterns will carry, so
        # layer trimming sees precisely what the engine matches at runtime.
        slot_keys = []
        seen_keys = set()
        triggers  = {}
        for n in song.notes:
            if n.inst_key not in seen_keys:
                seen_keys.add(n.inst_key)
                slot_keys.append(n.inst_key)
            t = triggers.setdefault(n.inst_key, {})
            k = (key_to_noteval(n.key + n.bend0), round(n.vel * 63 / 127))
            t[k] = t.get(k, 0) + 1
        vprint(f"  {len(slot_keys)} preset(s) in use")

        registry = {}
        presets = build_presets(sf, slot_keys, triggers, args.perc_force_mapping,
                                registry, args.max_layers)

        # Allocate instrument-bin slots: each layer is a normal instrument; a
        # preset with >1 layer also takes a Metainstrument slot the note
        # references. Single-layer presets stay plain instruments (no meta, no
        # extra slot).
        next_slot   = 1
        layer_insts = []      # all normal instruments, .slot assigned
        meta_records = []     # (meta_slot, name, [(layer_slot, bbox_rect)])
        slot_name   = {}      # slot → display name
        note_slot   = {}      # inst_key → slot a note triggers (0 = unplayable)
        for ik in slot_keys:
            name, layers = presets[ik]
            if not layers:
                note_slot[ik] = 0
                continue
            # One slot per layer, plus one for the meta when layered.
            need = len(layers) + (1 if len(layers) > 1 else 0)
            if next_slot + need - 1 > 255:
                vprint(f"  warning: 255-slot budget exhausted — preset '{name}' dropped")
                note_slot[ik] = 0
                continue
            for li, ti in enumerate(layers):
                ti.slot = next_slot
                next_slot += 1
                layer_insts.append(ti)
                slot_name[ti.slot] = name if len(layers) == 1 else f"{name} L{li}"
            if len(layers) == 1:
                note_slot[ik] = layers[0].slot
            else:
                meta_slot = next_slot
                next_slot += 1
                meta_records.append((meta_slot, name,
                                     [(ti.slot, _layer_bbox(ti)) for ti in layers]))
                slot_name[meta_slot] = name
                note_slot[ik] = meta_slot
        vprint(f"  slots: {next_slot - 1} used — {len(layer_insts)} instrument(s), "
               f"{len(meta_records)} Metainstrument(s)")

        # Tag notes with their trigger slot; notes whose preset failed to
        # resolve drop.
        unplayable = 0
        for n in song.notes:
            n.slot = note_slot.get(n.inst_key, 0)
            if n.slot == 0:
                unplayable += 1
        if unplayable:
            vprint(f"  warning: {unplayable} note(s) dropped (unresolvable preset)")
        song.notes = [n for n in song.notes if n.slot > 0]
        if not song.notes:
            sys.exit("error: no notes survived preset resolution")

        # Pool = every sample referenced by a kept patch (canonical included),
        # in deterministic first-reference order. Everything else is trimmed.
        pool = []
        seen = set()
        for ti in layer_insts:
            for p in ti.patches:
                if id(p.ms) not in seen:
                    seen.add(id(p.ms))
                    pool.append(p.ms)

        taud = assemble_taud(sf, song, layer_insts, meta_records, slot_name,
                             pool, args)
    finally:
        sf.file.close()

    with open(args.output, 'wb') as f:
        f.write(taud)
    print(f"wrote {len(taud)} bytes to '{args.output}'")


# Run the converter only when this file is executed as a script; importing it
# as a module leaves main() uncalled.
if __name__ == '__main__':
    main()