Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions parakeet-cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -468,8 +468,13 @@ if (PARAKEET_BUILD_TESTS)
set(_qvp_sf_q8_gguf "${PARAKEET_TEST_MODEL_DIR}/diar_sortformer_4spk-v1.q8_0.gguf")
set(_qvp_sf_f16_gguf "${PARAKEET_TEST_MODEL_DIR}/diar_sortformer_4spk-v1.f16.gguf")
set(_qvp_sfs_q8_gguf "${PARAKEET_TEST_MODEL_DIR}/diar_streaming_sortformer_4spk-v2.q8_0.gguf")
set(_qvp_sfsv21_q8_gguf "${PARAKEET_TEST_MODEL_DIR}/diar_streaming_sortformer_4spk-v2.1.q8_0.gguf")
set(_qvp_jfk_wav "${PARAKEET_TEST_AUDIO_DIR}/jfk.wav")
set(_qvp_diar_wav "${PARAKEET_TEST_AUDIO_DIR}/diarization-sample-16k.wav")
set(_qvp_abcba_wav "${PARAKEET_TEST_AUDIO_DIR}/abcba.wav")
set(_qvp_abcba_rttm "${PARAKEET_TEST_AUDIO_DIR}/abcba.rttm")
set(_qvp_abcdba_wav "${PARAKEET_TEST_AUDIO_DIR}/abcdba.wav")
set(_qvp_abcdba_rttm "${PARAKEET_TEST_AUDIO_DIR}/abcdba.rttm")
set(_qvp_ctc_ref "${PARAKEET_TEST_REF_DIR}/ctc-ref")
set(_qvp_tdt_ref "${PARAKEET_TEST_REF_DIR}/tdt-ref")
set(_qvp_sf_ref "${PARAKEET_TEST_REF_DIR}/sortformer-ref")
Expand Down Expand Up @@ -543,6 +548,29 @@ if (PARAKEET_BUILD_TESTS)
ARGS "--model" "${_qvp_sfs_q8_gguf}" "--wav" "${_qvp_diar_wav}"
REQUIRES "${_qvp_sfs_q8_gguf}" "${_qvp_diar_wav}")

# v2.1 AOSC speaker-correctness regression. Asserts speaker coverage,
# re-entry slot continuity (the AOSC contract), and frame-level DER
# ceiling against the RTTM ground truth. One binary, two ctest
# registrations (one per LIFO re-entry fixture).
add_executable(test-sortformer-aosc-speakers test/test_sortformer_aosc_speakers.cpp)
target_link_libraries(test-sortformer-aosc-speakers PRIVATE parakeet)
target_include_directories(test-sortformer-aosc-speakers PRIVATE include src ggml/include)
parakeet_apply_ccache(test-sortformer-aosc-speakers)
parakeet_register_test(test-sortformer-aosc-speakers-abcba
LABEL "fixture"
EXE test-sortformer-aosc-speakers
ARGS "--model" "${_qvp_sfsv21_q8_gguf}"
"--wav" "${_qvp_abcba_wav}"
"--ref-rttm" "${_qvp_abcba_rttm}"
REQUIRES "${_qvp_sfsv21_q8_gguf}" "${_qvp_abcba_wav}" "${_qvp_abcba_rttm}")
parakeet_register_test(test-sortformer-aosc-speakers-abcdba
LABEL "fixture"
EXE test-sortformer-aosc-speakers
ARGS "--model" "${_qvp_sfsv21_q8_gguf}"
"--wav" "${_qvp_abcdba_wav}"
"--ref-rttm" "${_qvp_abcdba_rttm}"
REQUIRES "${_qvp_sfsv21_q8_gguf}" "${_qvp_abcdba_wav}" "${_qvp_abcdba_rttm}")

add_executable(test-perf-regression test/test_perf_regression.cpp)
target_link_libraries(test-perf-regression PRIVATE parakeet)
target_include_directories(test-perf-regression PRIVATE include src ggml/include)
Expand Down
130 changes: 130 additions & 0 deletions parakeet-cpp/PROGRESS.md
Original file line number Diff line number Diff line change
Expand Up @@ -3338,3 +3338,133 @@ NaN/Inf values. Exit code 1 on any failure.
- Vulkan performance optimisation (RTF benchmarking, pipeline cache).
- Validate on AMD and Intel GPUs.
- Upstream the `ggml_cont` fix as a ggml-vulkan unary stride patch.

## Phase 17 — Sortformer v2.1 Audio-Online Speaker Cache (AOSC) _(done)_

Phase 11 landed v1 (offline + sliding-history streaming) and §11.11.2
reserved a slot for NeMo's streaming-Sortformer spkcache architecture
shipped with `diar_streaming_sortformer_4spk-v2.x`. This phase fills
that slot: a faithful C++ port of NeMo's AOSC algorithm so v2.1
correctly tracks speakers across long re-entry gaps (which v1 and v2.1
without a cache cannot do — they collapse returning speakers into
whichever hyp slot is closest to the current talker).

### 17.1 — algorithm and helpers

Ported from `sortformer_modules.py` + `sortformer_diar_models.py` in
NeMo. Each C++ helper carries an `// matches NeMo <fn> at
sortformer_modules.py:<line>` comment pointing at its source.

- `_compress_spkcache` — composite-score top-K retention per speaker,
silence anchoring via `mean_sil_emb`, dedupe by absolute frame index,
chronological output (the v2.1 model was trained with Sort Loss so
output order matters).
- `_get_silence_profile` — runtime EMA of silence-frame embeddings.
- `_disable_low_scores` / `_boost_topk_scores` — threshold gating +
newest-frame boost on the per-chunk score matrix.
- `streaming_update` — FIFO + pop + compress orchestration.
- `forward_streaming_step` (`sortformer_aosc_step` in C++) — per-chunk
cache + FIFO + chunk concat in the post-subsampling embedding space,
FastConformer over the concatenation, head, slice, threshold.

### 17.2 — encoder context windowing

`SortformerStreamSession::try_emit_chunks` waits for
`chunk_right_context_ms` of lookahead audio before emitting; tail
chunks fall back to a left-context-only finalize path. New public
fields on `SortformerStreamingOptions`:
`chunk_left_context_ms = 80`, `chunk_right_context_ms = 560`,
`spkcache_update_period = 144`, `fifo_len = 188`. Defaults match
NeMo's `e2e_diarize_speech.py` inference YAML.

### 17.3 — bypass-pre-encode encoder forward

`run_encoder_bypass_pre_encode` (in `parakeet_ctc.cpp`) skips the
subsampling block and feeds pre-subsampled embeddings straight into
the conformer stack. Required for splicing the speaker cache + FIFO +
new chunk in the post-subsampling space the way NeMo trained v2.1
with. Activated only when the cached `EncoderGraph` carries
`bypass_pre_encode = true`; v1 continues through the regular encoder
forward path.

### 17.4 — v1 path unchanged

`cache_active = false` for v1 GGUFs (detected via encoder shape:
18 conformer layers / 80 mel bins, vs v2.x's 17 / 128). v1 streaming
still uses the prior sliding-history + overlap-remap logic and stays
bit-identical to its previous output.

### 17.5 — validation

Synthetic English-only fixtures generated via ElevenLabs TTS with
LIFO re-entry patterns. Lengths chosen so the re-entry gap exceeds
the FIFO span:

- `test/samples/abcba.wav` (160.6 s, 3 distinct speakers, pattern
A→B→C→B→A) — A returns after a 97 s gap.
- `test/samples/abcdba.wav` (191.2 s, 4 distinct speakers, pattern
A→B→C→D→B→A) — A returns after a 128 s gap, B returns after a 66 s
gap.

Each fixture ships with a hand-built ground-truth `.rttm`.
`test/test_sortformer_aosc_speakers.cpp` (new) checks three invariants
against the RTTM: (a) every reference speaker has at least one
emitted hyp frame, (b) every speaker that re-enters lands in the
*same* `hyp_<id>` it was first assigned to (the AOSC contract), and
(c) frame-level DER under the optimal hyp→ref permutation is below
30 %. Both fixtures register as `ctest` entries
`test-sortformer-aosc-speakers-{abcba,abcdba}`.

Measured on q8_0 v2.1 GGUF, Apple M-series, CPU backend:

| fixture | mode | speakers tracked | DER | A re-binds | B re-binds |
|----------|----------------|------------------|--------|------------------|------------------|
| abcba | v1 streaming | 2 (A,B; no C) | 24.31 %| yes (single hyp_0 across both) | yes (single hyp_1 across both) |
| abcba | v2.1 + AOSC | 3 (A,B,C) | 27.29 %| yes (gap 97 s) | yes (gap 35 s) |
| abcba | v2.1 no-cache | 2 (A,B; no C) | 23.74 %| n/a | n/a |
| abcdba | v1 streaming | 2 (collapsed) | 66.28 %| **no — rebinds to hyp_1** | **no — rebinds to hyp_0** |
| abcdba | v2.1 + AOSC | 4 (A,B,C,D) | 22.22 %| yes (gap 128 s) | yes (gap 66 s) |
| abcdba | v2.1 no-cache | 2 (collapsed) | 65.72 %| n/a | n/a |

The 4-speaker case is the discriminating one: v2.1+AOSC drops DER
from 66 % to 22 %, and is the only mode that holds slot continuity
for the returning speakers. Residual confusion in the 3-speaker case
(C/Alice gets bound to A/Sarah's slot once) is encoder-side acoustic
similarity between two female voices — independent of the cache. The
regression test gates on the AOSC contract (slot continuity + DER
ceiling), not on per-frame identity, so this real-world ambiguity
doesn't flake the test.

### 17.6 — files touched

- `include/parakeet/diarization.h` — new `SortformerStreamingOptions`
fields; `spkcache_enable` default flipped to `true`.
- `src/parakeet_sortformer.{h,cpp}` — AOSC helpers + state extension
(`mean_sil_emb`, `spkcache_preds`, `fifo_preds`, `n_sil_frames`).
- `src/parakeet_ctc.{h,cpp}` — `run_encoder_bypass_pre_encode`;
`EncoderGraph` gains `bypass_pre_encode` / `T_enc` /
`pre_encode_in` fields.
- `src/parakeet_engine.cpp` — streaming session uses the
subsampling+AOSC pipeline on v2.x; `try_emit_chunks` waits for
right-context; `diarize_start` populates new config fields.
- `test/test_sortformer_streaming.cpp` — reads defaults from
`SortformerStreamingOptions` so the existing binary reflects the
new AOSC config out of the box.
- `test/test_sortformer_aosc_speakers.cpp` (new) — regression test
described in §17.5.
- `test/samples/abcba.{wav,rttm}`, `test/samples/abcdba.{wav,rttm}`
— new ElevenLabs fixtures.
- `CMakeLists.txt` — path vars + `add_executable` +
`parakeet_register_test` entries for the two new ctest cases.

### 17.7 — follow-ups

- The existing `test-sortformer-streaming` assertion
`n_finals == 1` trips non-deterministically on long inputs under
AOSC (session emits 0 `is_final` markers instead of 1). The hyp
RTTM is still valid; only the session-end signalling needs to
emit exactly one final marker. Separate, narrowly-scoped fix.
- AOSC streaming is correct through the parakeet-cpp C++ test
binary. Surfacing it through downstream addon wrappers
(e.g. `transcription-parakeet`'s `runStreaming()` JS API) requires
separate plumbing work on those wrappers — not in this phase.
7 changes: 4 additions & 3 deletions parakeet-cpp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
| `nvidia/parakeet-tdt-1.1b` | TDT | 80 | 1024 × 42 | 1024 | 1.1 B | 1225 MiB q8_0 | 0.027-0.079 | English only, lowest WER (no PnC) |
| `nvidia/diar_sortformer_4spk-v1` | Sortformer (diarization) | 80 | enc 512 × 18 + tf 192 × 18 | n/a (4 spk) | ~123 M | 263 MiB f16 / 141 MiB q8_0 / 75 MiB q4_0 | 0.017-0.097 | Up to 4 speakers, offline |
| `nvidia/diar_streaming_sortformer_4spk-v2` | Sortformer (diarization) | 128 | enc 512 × 17 + tf 192 × 18 | n/a (4 spk) | ~117 M | 251 MiB f16 / 134 MiB q8_0 / 72 MiB q4_0 | similar to v1 offline | Offline + sliding-history live streaming in-repo; NeMo spkcache-style streaming not implemented |
| `nvidia/diar_streaming_sortformer_4spk-v2.1` | Sortformer (diarization) | 128 | enc 512 × 17 + tf 192 × 18 | n/a (4 spk) | ~117 M | 251 MiB f16 / 134 MiB q8_0 / 72 MiB q4_0 | similar to v1 offline | Offline + live streaming with NeMo Audio-Online Speaker Cache (AOSC): speakers rebind to their original slot across long gaps. Activated automatically on detection of the v2.x encoder shape (17 layers / 128 mels). |
| `nvidia/parakeet_realtime_eou_120m-v1` | RNN-T + `<EOU>` | 128 | 512 × 17 (chunked-limited att + causal subsampler + LN-in-conv) | 1027 | 120 M | 246 MiB f16 / 132 MiB q8_0 | enc cosine 0.999997 vs NeMo offline; enc on GPU, LSTM decoder CPU-only | English; `<EOU>` turn detection. NVIDIA Open Model License. Offline + Mode 2/3 on fixtures. NeMo `cache_aware_stream_step` path was prototyped and rejected vs offline quality — see `PROGRESS.md`. |

Encoder topology is selected from GGUF metadata (`conv_norm_type`, causal subsampling, chunked-limited attention, etc.), so EOU shares the same C++ graph path as CTC/TDT where weights allow.
Expand All @@ -23,7 +24,7 @@ Encoder topology is selected from GGUF metadata (`conv_norm_type`, causal subsam
| `Engine::transcribe` | One-shot wav → text (CTC / TDT / EOU) or segments (Sortformer) |
| `Engine::transcribe_stream` | Mode 2: full encode once, stream segments |
| `Engine::stream_start` → `StreamSession` | Mode 3: live duplex / cache-aware chunks |
| `Engine::diarize` / `diarize_start` | Sortformer offline / sliding-history live |
| `Engine::diarize` / `diarize_start` | Sortformer offline / live streaming (v1: sliding-history; v2.1: speaker-cache / AOSC) |
| `transcribe_with_speakers` | Sortformer + ASR → attributed transcript |

EOU streaming segments expose `is_eou_boundary`. **`StreamEvent`** (optional callbacks) covers end-of-turn (EOU) and VAD-style signals (Sortformer threshold, optional energy VAD on CTC/TDT). **`Engine::backend_device`** / **`backend_name`** reflect the backend actually used after the load-time cascade.
Expand Down Expand Up @@ -314,8 +315,8 @@ Typical f16 stage rel vs NeMo (order of magnitude): mel ~1e-4 inner, blocks ~1e-

## Current status

- **Shipped:** Offline + Mode 2/3 streaming for CTC/TDT/EOU; Sortformer offline + sliding-history live diarization; optional **`StreamEvent`** callbacks; **`test-vk-vs-cpu`** for Vulkan encoder parity.
- **Not in-repo:** NeMo-style Sortformer spkcache streaming; KV-cache speedups for Mode 3 (API shape exists).
- **Shipped:** Offline + Mode 2/3 streaming for CTC/TDT/EOU; Sortformer offline + live streaming (v1 sliding-history, v2.1 NeMo Audio-Online Speaker Cache / AOSC); optional **`StreamEvent`** callbacks; **`test-vk-vs-cpu`** for Vulkan encoder parity.
- **Not in-repo:** KV-cache speedups for Mode 3 (API shape exists).
- **EOU:** NeMo `cache_aware_stream_step` was evaluated and **rejected** for offline transcript parity — details in **`PROGRESS.md`**.

## Repository layout
Expand Down
28 changes: 23 additions & 5 deletions parakeet-cpp/examples/live-mic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,10 @@ void print_usage(const char * argv0) {
" diarization: chunk stride in ms (default 2000)\n"
" --left-context-ms N transcription: left context per chunk (default 5000)\n"
" --right-lookahead-ms N transcription: right lookahead per chunk (default 1000)\n"
" --history-ms N diarization: sliding history window (default 30000)\n"
" --history-ms N diarization (v1 only): sliding history window in ms\n"
" (default 30000). Ignored on v2.1 GGUFs, where the\n"
" NeMo Audio-Online Speaker Cache (AOSC) replaces the\n"
" sliding window and activates automatically.\n"
" --list-devices list available capture devices and exit\n"
" --device N use device with this index (default: system default)\n"
" --accumulate transcription only: accumulate on one line; emit a\n"
Expand Down Expand Up @@ -296,10 +299,25 @@ int main(int argc, char ** argv) {
std::signal(SIGTERM, on_sigint);

if (diarization_mode) {
std::fprintf(stderr,
"[live-mic] listening at 16 kHz mono (diarization). "
"chunk=%d ms history=%d ms. Speak, Ctrl-C to stop.\n\n",
args.chunk_ms, args.history_ms);
// diar_sess->aosc_active() is true on v2.1 GGUFs that took the
// NeMo Audio-Online Speaker Cache code path inside diarize_start.
// v1 GGUFs (or v2.x with spkcache_enable=false) return false and
// keep the sliding-history banner unchanged from earlier releases.
if (diar_sess->aosc_active()) {
const auto & sopts = diar_sess->options();
std::fprintf(stderr,
"[live-mic] listening at 16 kHz mono (v2.1 diarization, AOSC). "
"chunk=%d ms spkcache_len=%d fifo_len=%d lc=%d ms rc=%d ms. "
"Speak, Ctrl-C to stop.\n\n",
args.chunk_ms,
sopts.spkcache_len, sopts.fifo_len,
sopts.chunk_left_context_ms, sopts.chunk_right_context_ms);
} else {
std::fprintf(stderr,
"[live-mic] listening at 16 kHz mono (v1 diarization). "
"chunk=%d ms history=%d ms. Speak, Ctrl-C to stop.\n\n",
args.chunk_ms, args.history_ms);
}
} else {
std::fprintf(stderr,
"[live-mic] listening at 16 kHz mono. "
Expand Down
27 changes: 27 additions & 0 deletions parakeet-cpp/include/parakeet/diarization.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,26 @@ struct SortformerStreamingOptions {

// Optional StreamEvent delivery (VadStateChanged from speaker_probs); nullptr disables.
StreamEventCallback on_event = nullptr;

// === AOSC (Audio-Online Speaker Cache, Sortformer v2.1) ===
// Cache-aware streaming forward (port of NeMo's `forward_streaming_step` +
// `streaming_update` + `_compress_spkcache`). On v2.1 models (auto-detected
// from encoder shape) and spkcache_enable=true, the engine concatenates the
// speaker cache + FIFO + current chunk's pre-encode embeddings, runs the
// conformer layers over the concat, then the diariser head, before updating
// the runtime cache. This preserves speaker identity across silences far
// longer than `history_ms`. v1 models always take the legacy path.
//
// `mean_sil_emb` is RUNTIME state (zeros at session start, EMA of detected
// silence frames), NOT a learned tensor -- no converter changes required.
// Defaults below are NeMo's inference defaults (see
// examples/speaker_tasks/diarization/neural_diarizer/e2e_diarize_speech.py).
bool spkcache_enable = true;
int spkcache_len = 188; // total cache rows (encoder frames)
int fifo_len = 188; // FIFO warmup buffer (encoder frames)
int chunk_left_context_ms = 80; // ~1 encoder frame at v2.1 (80ms)
int chunk_right_context_ms = 560; // ~7 encoder frames at v2.1 (560ms)
int spkcache_update_period = 144; // pop_out_len on FIFO overflow
};

using SortformerSegmentCallback =
Expand All @@ -98,6 +118,13 @@ class PARAKEET_API SortformerStreamSession {

const SortformerStreamingOptions & options() const;

// True when the session is running v2.1 NeMo-style speaker-cache
// streaming (AOSC). False on v1 sortformer GGUFs, or on v2.x with
// `SortformerStreamingOptions::spkcache_enable=false`. Mirrors the
// internal `cache_active` flag; useful for CLI banners / logs that
// want to differentiate the two streaming modes for the user.
bool aosc_active() const;

private:
std::unique_ptr<Impl> pimpl_;
};
Expand Down
Loading