tetherto · GustavoA1604 · May 18, 2026 · May 15, 2026 · May 18, 2026 · May 18, 2026
@@ -468,8 +468,13 @@ if (PARAKEET_BUILD_TESTS)
     set(_qvp_sf_q8_gguf    "${PARAKEET_TEST_MODEL_DIR}/diar_sortformer_4spk-v1.q8_0.gguf")
     set(_qvp_sf_f16_gguf   "${PARAKEET_TEST_MODEL_DIR}/diar_sortformer_4spk-v1.f16.gguf")
     set(_qvp_sfs_q8_gguf   "${PARAKEET_TEST_MODEL_DIR}/diar_streaming_sortformer_4spk-v2.q8_0.gguf")
+    set(_qvp_sfsv21_q8_gguf "${PARAKEET_TEST_MODEL_DIR}/diar_streaming_sortformer_4spk-v2.1.q8_0.gguf")
     set(_qvp_jfk_wav       "${PARAKEET_TEST_AUDIO_DIR}/jfk.wav")
     set(_qvp_diar_wav      "${PARAKEET_TEST_AUDIO_DIR}/diarization-sample-16k.wav")
+    set(_qvp_abcba_wav     "${PARAKEET_TEST_AUDIO_DIR}/abcba.wav")
+    set(_qvp_abcba_rttm    "${PARAKEET_TEST_AUDIO_DIR}/abcba.rttm")
+    set(_qvp_abcdba_wav    "${PARAKEET_TEST_AUDIO_DIR}/abcdba.wav")
+    set(_qvp_abcdba_rttm   "${PARAKEET_TEST_AUDIO_DIR}/abcdba.rttm")
     set(_qvp_ctc_ref       "${PARAKEET_TEST_REF_DIR}/ctc-ref")
     set(_qvp_tdt_ref       "${PARAKEET_TEST_REF_DIR}/tdt-ref")
     set(_qvp_sf_ref        "${PARAKEET_TEST_REF_DIR}/sortformer-ref")
@@ -543,6 +548,29 @@ if (PARAKEET_BUILD_TESTS)
         ARGS     "--model" "${_qvp_sfs_q8_gguf}" "--wav" "${_qvp_diar_wav}"
         REQUIRES "${_qvp_sfs_q8_gguf}" "${_qvp_diar_wav}")
 
+    # v2.1 AOSC speaker-correctness regression. Asserts speaker coverage,
+    # re-entry slot continuity (the AOSC contract), and frame-level DER
+    # ceiling against the RTTM ground truth. One binary, two ctest
+    # registrations (one per LIFO re-entry fixture).
+    add_executable(test-sortformer-aosc-speakers test/test_sortformer_aosc_speakers.cpp)
+    target_link_libraries(test-sortformer-aosc-speakers PRIVATE parakeet)
+    target_include_directories(test-sortformer-aosc-speakers PRIVATE include src ggml/include)
+    parakeet_apply_ccache(test-sortformer-aosc-speakers)
+    parakeet_register_test(test-sortformer-aosc-speakers-abcba
+        LABEL    "fixture"
+        EXE      test-sortformer-aosc-speakers
+        ARGS     "--model" "${_qvp_sfsv21_q8_gguf}"
+                 "--wav"   "${_qvp_abcba_wav}"
+                 "--ref-rttm" "${_qvp_abcba_rttm}"
+        REQUIRES "${_qvp_sfsv21_q8_gguf}" "${_qvp_abcba_wav}" "${_qvp_abcba_rttm}")
+    parakeet_register_test(test-sortformer-aosc-speakers-abcdba
+        LABEL    "fixture"
+        EXE      test-sortformer-aosc-speakers
+        ARGS     "--model" "${_qvp_sfsv21_q8_gguf}"
+                 "--wav"   "${_qvp_abcdba_wav}"
+                 "--ref-rttm" "${_qvp_abcdba_rttm}"
+        REQUIRES "${_qvp_sfsv21_q8_gguf}" "${_qvp_abcdba_wav}" "${_qvp_abcdba_rttm}")
+
     add_executable(test-perf-regression test/test_perf_regression.cpp)
     target_link_libraries(test-perf-regression PRIVATE parakeet)
     target_include_directories(test-perf-regression PRIVATE include src ggml/include)

@@ -3338,3 +3338,133 @@ NaN/Inf values. Exit code 1 on any failure.
 - Vulkan performance optimisation (RTF benchmarking, pipeline cache).
 - Validate on AMD and Intel GPUs.
 - Upstream the `ggml_cont` fix as a ggml-vulkan unary stride patch.
+
+## Phase 17 — Sortformer v2.1 Arrival-Order Speaker Cache (AOSC)  _(done)_
+
+Phase 11 landed v1 (offline + sliding-history streaming) and §11.11.2
+reserved a slot for NeMo's streaming-Sortformer spkcache architecture
+shipped with `diar_streaming_sortformer_4spk-v2.x`. This phase fills
+that slot: a faithful C++ port of NeMo's AOSC algorithm so v2.1
+correctly tracks speakers across long re-entry gaps (which v1 and v2.1
+without a cache cannot do — they collapse returning speakers into
+whichever hyp slot is closest to the current talker).
+
+### 17.1 — algorithm and helpers
+
+Ported from `sortformer_modules.py` + `sortformer_diar_models.py` in
+NeMo. Each C++ helper carries an `// matches NeMo <fn> at
+sortformer_modules.py:<line>` comment pointing at its source.
+
+- `_compress_spkcache` — composite-score top-K retention per speaker,
+  silence anchoring via `mean_sil_emb`, dedupe by absolute frame index,
+  chronological output (the v2.1 model was trained with Sort Loss so
+  output order matters).
+- `_get_silence_profile` — runtime EMA of silence-frame embeddings.
+- `_disable_low_scores` / `_boost_topk_scores` — threshold gating +
+  newest-frame boost on the per-chunk score matrix.
+- `streaming_update` — FIFO + pop + compress orchestration.
+- `forward_streaming_step` (`sortformer_aosc_step` in C++) — per-chunk
+  cache + FIFO + chunk concat in the post-subsampling embedding space,
+  FastConformer over the concatenation, head, slice, threshold.
+
+### 17.2 — encoder context windowing
+
+`SortformerStreamSession::try_emit_chunks` waits for
+`chunk_right_context_ms` of lookahead audio before emitting; tail
+chunks fall back to a left-context-only finalize path. New public
+fields on `SortformerStreamingOptions`:
+`chunk_left_context_ms = 80`, `chunk_right_context_ms = 560`,
+`spkcache_update_period = 144`, `fifo_len = 188`. Defaults match
+NeMo's `e2e_diarize_speech.py` inference YAML.
+
+### 17.3 — bypass-pre-encode encoder forward
+
+`run_encoder_bypass_pre_encode` (in `parakeet_ctc.cpp`) skips the
+subsampling block and feeds pre-subsampled embeddings straight into
+the conformer stack. This is required to splice the speaker cache +
+FIFO + new chunk together in the post-subsampling space, which is
+how NeMo trained v2.1. Activated only when the cached `EncoderGraph`
+carries `bypass_pre_encode = true`; v1 continues through the regular
+encoder forward path.
+
+### 17.4 — v1 path unchanged
+
+`cache_active = false` for v1 GGUFs (detected via encoder shape:
+18 conformer layers / 80 mel bins, vs v2.x's 17 / 128). v1 streaming
+still uses the prior sliding-history + overlap-remap logic and stays
+bit-identical to its previous output.
+
+### 17.5 — validation
+
+Synthetic English-only fixtures generated via ElevenLabs TTS with
+LIFO re-entry patterns. Lengths chosen so the re-entry gap exceeds
+the FIFO span:
+
+- `test/samples/abcba.wav` (160.6 s, 3 distinct speakers, pattern
+  A→B→C→B→A) — A returns after a 97 s gap.
+- `test/samples/abcdba.wav` (191.2 s, 4 distinct speakers, pattern
+  A→B→C→D→B→A) — A returns after a 128 s gap, B returns after a 66 s
+  gap.
+
+Each fixture ships with a hand-built ground-truth `.rttm`.
+`test/test_sortformer_aosc_speakers.cpp` (new) checks three invariants
+against the RTTM: (a) every reference speaker has at least one
+emitted hyp frame, (b) every speaker that re-enters lands in the
+*same* `hyp_<id>` it was first assigned to (the AOSC contract), and
+(c) frame-level DER under the optimal hyp→ref permutation is below
+30 %. Both fixtures register as `ctest` entries
+`test-sortformer-aosc-speakers-{abcba,abcdba}`.
+
+Measured on q8_0 v2.1 GGUF, Apple M-series, CPU backend:
+
+| fixture  | mode           | speakers tracked | DER    | A re-binds       | B re-binds       |
+|----------|----------------|------------------|--------|------------------|------------------|
+| abcba    | v1 streaming   | 2 (A,B; no C)    | 24.31 %| yes (single hyp_0 across both) | yes (single hyp_1 across both) |
+| abcba    | v2.1 + AOSC    | 3 (A,B,C)        | 27.29 %| yes (gap 97 s)   | yes (gap 35 s)   |
+| abcba    | v2.1 no-cache  | 2 (A,B; no C)    | 23.74 %| n/a              | n/a              |
+| abcdba   | v1 streaming   | 2 (collapsed)    | 66.28 %| **no — rebinds to hyp_1** | **no — rebinds to hyp_0** |
+| abcdba   | v2.1 + AOSC    | 4 (A,B,C,D)      | 22.22 %| yes (gap 128 s)  | yes (gap 66 s)   |
+| abcdba   | v2.1 no-cache  | 2 (collapsed)    | 65.72 %| n/a              | n/a              |
+
+The 4-speaker case is the discriminating one: v2.1+AOSC drops DER
+from 66 % to 22 %, and is the only mode that holds slot continuity
+for the returning speakers. Residual confusion in the 3-speaker case
+(C/Alice gets bound to A/Sarah's slot once) stems from encoder-side
+acoustic similarity between two female voices and is independent of
+the cache. The regression test gates on the AOSC contract (slot
+continuity + DER ceiling), not on per-frame identity, so this
+real-world ambiguity doesn't make the test flaky.
+
+### 17.6 — files touched
+
+- `include/parakeet/diarization.h` — new `SortformerStreamingOptions`
+  fields; `spkcache_enable` default flipped to `true`.
+- `src/parakeet_sortformer.{h,cpp}` — AOSC helpers + state extension
+  (`mean_sil_emb`, `spkcache_preds`, `fifo_preds`, `n_sil_frames`).
+- `src/parakeet_ctc.{h,cpp}` — `run_encoder_bypass_pre_encode`;
+  `EncoderGraph` gains `bypass_pre_encode` / `T_enc` /
+  `pre_encode_in` fields.
+- `src/parakeet_engine.cpp` — streaming session uses the
+  subsampling+AOSC pipeline on v2.x; `try_emit_chunks` waits for
+  right-context; `diarize_start` populates new config fields.
+- `test/test_sortformer_streaming.cpp` — reads defaults from
+  `SortformerStreamingOptions` so the existing binary reflects the
+  new AOSC config out of the box.
+- `test/test_sortformer_aosc_speakers.cpp` (new) — regression test
+  described in §17.5.
+- `test/samples/abcba.{wav,rttm}`, `test/samples/abcdba.{wav,rttm}`
+  — new ElevenLabs fixtures.
+- `CMakeLists.txt` — path vars + `add_executable` +
+  `parakeet_register_test` entries for the two new ctest cases.
+
+### 17.7 — follow-ups
+
+- The existing `test-sortformer-streaming` assertion
+  `n_finals == 1` trips non-deterministically on long inputs under
+  AOSC (the session emits 0 `is_final` markers instead of 1). The hyp
+  RTTM is still valid; only the session-end signalling is wrong and
+  must be fixed to emit exactly one final marker. This is a
+  separate, narrowly-scoped fix.
+- AOSC streaming has been validated through the parakeet-cpp C++
+  test binary. Surfacing it through downstream addon wrappers
+  (e.g. `transcription-parakeet`'s `runStreaming()` JS API) requires
+  separate plumbing work on those wrappers — not in this phase.
@@ -12,6 +12,7 @@
 | `nvidia/parakeet-tdt-1.1b`    | TDT  | 80  | 1024 × 42 | 1024 | 1.1 B  | 1225 MiB q8_0               | 0.027-0.079 | English only, lowest WER (no PnC) |
 | `nvidia/diar_sortformer_4spk-v1` | Sortformer (diarization) | 80 | enc 512 × 18 + tf 192 × 18 | n/a (4 spk) | ~123 M | 263 MiB f16 / 141 MiB q8_0 / 75 MiB q4_0 | 0.017-0.097 | Up to 4 speakers, offline |
 | `nvidia/diar_streaming_sortformer_4spk-v2` | Sortformer (diarization) | 128 | enc 512 × 17 + tf 192 × 18 | n/a (4 spk) | ~117 M | 251 MiB f16 / 134 MiB q8_0 / 72 MiB q4_0 | similar to v1 offline | Offline + live streaming in-repo; has the v2.x encoder shape (17 layers / 128 mels), so live streaming takes the speaker-cache (AOSC) path described in the v2.1 row |
+| `nvidia/diar_streaming_sortformer_4spk-v2.1` | Sortformer (diarization) | 128 | enc 512 × 17 + tf 192 × 18 | n/a (4 spk) | ~117 M | 251 MiB f16 / 134 MiB q8_0 / 72 MiB q4_0 | similar to v1 offline | Offline + live streaming with NeMo Arrival-Order Speaker Cache (AOSC): speakers rebind to their original slot across long gaps. Activated automatically on detection of the v2.x encoder shape (17 layers / 128 mels). |
 | `nvidia/parakeet_realtime_eou_120m-v1` | RNN-T + `<EOU>` | 128 | 512 × 17 (chunked-limited att + causal subsampler + LN-in-conv) | 1027 | 120 M | 246 MiB f16 / 132 MiB q8_0 | enc cosine 0.999997 vs NeMo offline; enc on GPU, LSTM decoder CPU-only | English; `<EOU>` turn detection. NVIDIA Open Model License. Offline + Mode 2/3 on fixtures. NeMo `cache_aware_stream_step` path was prototyped and rejected vs offline quality — see `PROGRESS.md`. |
 
 Encoder topology is selected from GGUF metadata (`conv_norm_type`, causal subsampling, chunked-limited attention, etc.), so EOU shares the same C++ graph path as CTC/TDT where weights allow.
@@ -23,7 +24,7 @@ Encoder topology is selected from GGUF metadata (`conv_norm_type`, causal subsam
 | `Engine::transcribe` | One-shot wav → text (CTC / TDT / EOU) or segments (Sortformer) |
 | `Engine::transcribe_stream` | Mode 2: full encode once, stream segments |
 | `Engine::stream_start` → `StreamSession` | Mode 3: live duplex / cache-aware chunks |
-| `Engine::diarize` / `diarize_start` | Sortformer offline / sliding-history live |
+| `Engine::diarize` / `diarize_start` | Sortformer offline / live streaming (v1: sliding-history; v2.1: speaker-cache / AOSC) |
 | `transcribe_with_speakers` | Sortformer + ASR → attributed transcript |
 
 EOU streaming segments expose `is_eou_boundary`. **`StreamEvent`** (optional callbacks) covers end-of-turn (EOU) and VAD-style signals (Sortformer threshold, optional energy VAD on CTC/TDT). **`Engine::backend_device`** / **`backend_name`** reflect the backend actually used after the load-time cascade.
@@ -314,8 +315,8 @@ Typical f16 stage rel vs NeMo (order of magnitude): mel ~1e-4 inner, blocks ~1e-
 
 ## Current status
 
-- **Shipped:** Offline + Mode 2/3 streaming for CTC/TDT/EOU; Sortformer offline + sliding-history live diarization; optional **`StreamEvent`** callbacks; **`test-vk-vs-cpu`** for Vulkan encoder parity.  
-- **Not in-repo:** NeMo-style Sortformer spkcache streaming; KV-cache speedups for Mode 3 (API shape exists).  
+- **Shipped:** Offline + Mode 2/3 streaming for CTC/TDT/EOU; Sortformer offline + live streaming (v1 sliding-history, v2.1 NeMo Arrival-Order Speaker Cache / AOSC); optional **`StreamEvent`** callbacks; **`test-vk-vs-cpu`** for Vulkan encoder parity.  
+- **Not in-repo:** KV-cache speedups for Mode 3 (API shape exists).  
 - **EOU:** NeMo `cache_aware_stream_step` was evaluated and **rejected** for offline transcript parity — details in **`PROGRESS.md`**.
 
 ## Repository layout

@@ -65,7 +65,10 @@ void print_usage(const char * argv0) {
         "                                 diarization:   chunk stride in ms (default 2000)\n"
         "  --left-context-ms N            transcription: left context per chunk (default 5000)\n"
         "  --right-lookahead-ms N         transcription: right lookahead per chunk (default 1000)\n"
-        "  --history-ms N                 diarization: sliding history window (default 30000)\n"
+        "  --history-ms N                 diarization (v1 only): sliding history window in ms\n"
+        "                                 (default 30000). Ignored on v2.1 GGUFs, where the\n"
+        "                                 NeMo Audio-Online Speaker Cache (AOSC) replaces the\n"
+        "                                 sliding window and activates automatically.\n"
         "  --list-devices                 list available capture devices and exit\n"
         "  --device N                     use device with this index (default: system default)\n"
         "  --accumulate                   transcription only: accumulate on one line; emit a\n"
@@ -296,10 +299,25 @@ int main(int argc, char ** argv) {
     std::signal(SIGTERM, on_sigint);
 
     if (diarization_mode) {
-        std::fprintf(stderr,
-            "[live-mic] listening at 16 kHz mono (diarization).  "
-            "chunk=%d ms  history=%d ms. Speak, Ctrl-C to stop.\n\n",
-            args.chunk_ms, args.history_ms);
+        // diar_sess->aosc_active() is true on v2.1 GGUFs that took the
+        // NeMo Arrival-Order Speaker Cache code path inside diarize_start.
+        // v1 GGUFs (or v2.x with spkcache_enable=false) return false and
+        // keep the sliding-history banner unchanged from earlier releases.
+        if (diar_sess->aosc_active()) {
+            const auto & sopts = diar_sess->options();
+            std::fprintf(stderr,
+                "[live-mic] listening at 16 kHz mono (v2.1 diarization, AOSC).  "
+                "chunk=%d ms  spkcache_len=%d  fifo_len=%d  lc=%d ms  rc=%d ms.  "
+                "Speak, Ctrl-C to stop.\n\n",
+                args.chunk_ms,
+                sopts.spkcache_len, sopts.fifo_len,
+                sopts.chunk_left_context_ms, sopts.chunk_right_context_ms);
+        } else {
+            std::fprintf(stderr,
+                "[live-mic] listening at 16 kHz mono (v1 diarization).  "
+                "chunk=%d ms  history=%d ms. Speak, Ctrl-C to stop.\n\n",
+                args.chunk_ms, args.history_ms);
+        }
     } else {
         std::fprintf(stderr,
             "[live-mic] listening at 16 kHz mono.  "

@@ -72,6 +72,26 @@ struct SortformerStreamingOptions {
 
     // Optional StreamEvent delivery (VadStateChanged from speaker_probs); nullptr disables.
     StreamEventCallback on_event = nullptr;
+
+    // === AOSC (Arrival-Order Speaker Cache, Sortformer v2.1) ===
+    // Cache-aware streaming forward (port of NeMo's `forward_streaming_step` +
+    // `streaming_update` + `_compress_spkcache`). On v2.1 models (auto-detected
+    // from encoder shape) with spkcache_enable=true, the engine concatenates the
+    // speaker cache + FIFO + current chunk's pre-encode embeddings, runs the
+    // conformer layers over the concat, then the diariser head, before updating
+    // the runtime cache. This preserves speaker identity across silences far
+    // longer than `history_ms`. v1 models always take the legacy path.
+    //
+    // `mean_sil_emb` is RUNTIME state (zeros at session start, EMA of detected
+    // silence frames), NOT a learned tensor -- no converter changes required.
+    // Defaults below are NeMo's inference defaults (see
+    // examples/speaker_tasks/diarization/neural_diarizer/e2e_diarize_speech.py).
+    bool  spkcache_enable        = true;
+    int   spkcache_len           = 188;    // total cache rows (encoder frames)
+    int   fifo_len               = 188;    // FIFO warmup buffer (encoder frames)
+    int   chunk_left_context_ms  = 80;     // ~1 encoder frame at v2.1 (80ms)
+    int   chunk_right_context_ms = 560;    // ~7 encoder frames at v2.1 (560ms)
+    int   spkcache_update_period = 144;    // pop_out_len on FIFO overflow
 };
 
 using SortformerSegmentCallback =
@@ -98,6 +118,13 @@ class PARAKEET_API SortformerStreamSession {
 
     const SortformerStreamingOptions & options() const;
 
+    // True when the session is running v2.1 NeMo-style speaker-cache
+    // streaming (AOSC). False on v1 sortformer GGUFs, or on v2.x with
+    // `SortformerStreamingOptions::spkcache_enable=false`. Mirrors the
+    // internal `cache_active` flag; useful for CLI banners / logs that
+    // want to differentiate the two streaming modes for the user.
+    bool aosc_active() const;
+
 private:
     std::unique_ptr<Impl> pimpl_;
 };