microsoft · kunal-vaishnavi · Mar 17, 2026 · Feb 10, 2026 · Feb 11, 2026 · Feb 12, 2026
diff --git a/cmake/deps.txt b/cmake/deps.txt
@@ -14,7 +14,7 @@ pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f78029
 googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034
 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
-onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;087953cde6149e423c6848c40c3791264272706c
+onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;45a76bce69f874ad980933504700fc110ebf1ecb
 
 # These two dependencies are for the optional constrained decoding feature (USE_GUIDANCE)
 llguidance;https://github.com/microsoft/llguidance.git;94fa39128ef184ffeda33845f6d333f332a34b4d

diff --git a/examples/python/nemotron_streaming.py b/examples/python/nemotron_streaming.py
@@ -0,0 +1,99 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import argparse
+import os
+import sys
+import time
+import re
+import numpy as np
+import onnxruntime_genai as og
+
+# Sample rate (samples per second) that load_audio converts every input to.
+SAMPLE_RATE = 16000
+# Number of samples in each chunk passed to transcribe_chunk:
+# 8960 / 16000 = 0.56 s of audio per chunk.
+# NOTE(review): presumably has to match the chunk size the model expects - confirm.
+CHUNK_SAMPLES = 8960
+# Length of one chunk in seconds, derived from the two constants above.
+CHUNK_DURATION = CHUNK_SAMPLES / SAMPLE_RATE
+
+
+def load_audio(audio_path):
+    import soundfile as sf
+    audio, sr = sf.read(audio_path, dtype="float32")
+    if len(audio.shape) > 1:
+        audio = audio.mean(axis=1)
+    if sr != SAMPLE_RATE:
+        import scipy.signal
+        num_samples = int(len(audio) * SAMPLE_RATE / sr)
+        audio = scipy.signal.resample(audio, num_samples).astype(np.float32)
+    return audio
+
+
+def load_tokenizer(model_path):
+    import sentencepiece as spm
+    path = os.path.join(model_path, "tokenizer.model")
+    if not os.path.exists(path):
+        return None
+    sp = spm.SentencePieceProcessor()
+    sp.Load(path)
+    return sp
+
+
+def parse_token_ids(raw_text):
+    return [int(m.group(1)) for m in re.finditer(r'<(\d+)>', raw_text)]
+
+
+def simulate_microphone(model_path, audio_path):
+    audio = load_audio(audio_path)
+    duration = len(audio) / SAMPLE_RATE
+    num_chunks = (len(audio) + CHUNK_SAMPLES - 1) // CHUNK_SAMPLES
+    print(f"Audio: {duration:.1f}s | {num_chunks} chunks × {CHUNK_DURATION*1000:.0f}ms")
+
+    config = og.Config(model_path)
+    model = og.Model(config)
+    sp = load_tokenizer(model_path)
+    asr = og.StreamingASR(model)
+
+    print("-" * 60)
+    stream_start = time.time()
+
+    for i in range(0, len(audio), CHUNK_SAMPLES):
+        chunk = audio[i:i + CHUNK_SAMPLES]
+        if len(chunk) < CHUNK_SAMPLES:
+            chunk = np.pad(chunk, (0, CHUNK_SAMPLES - len(chunk)))
+        chunk = chunk.astype(np.float32)
+        raw_text = asr.transcribe_chunk(chunk)
+        if raw_text:
+            print(raw_text, end="", flush=True)
+
+    for _ in range(4):
+        silence = np.zeros(CHUNK_SAMPLES, dtype=np.float32)
+        raw_text = asr.transcribe_chunk(silence)
+        if raw_text:
+            print(raw_text, end="", flush=True)
+
+    total_wall = time.time() - stream_start
+
+    full_raw = asr.get_transcript()
+    if sp:
+        all_ids = parse_token_ids(full_raw)
+        final_text = sp.Decode(all_ids) if all_ids else full_raw
+    else:
+        final_text = full_raw
+
+    print(f"\n{'=' * 60}")
+    print(f"  {final_text.strip()}")
+    print(f"{'=' * 60}")
+    print(f"  Audio: {duration:.2f}s | Wall: {total_wall:.2f}s | RTF: {duration/total_wall:.2f}x")
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_path", type=str, required=True)
+    parser.add_argument("--audio_file", type=str, required=True)
+    args = parser.parse_args()
+    if not os.path.exists(args.audio_file):
+        print(f"Error: {args.audio_file} not found")
+        sys.exit(1)
+    simulate_microphone(args.model_path, args.audio_file)
+
+
+# Run the demo only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
@@ -315,6 +315,14 @@ struct DecoderInputs_Element : JSON::Element {
       v_.past_sequence_lengths = JSON::Get<std::string_view>(value);
     } else if (name == "block_table") {
       v_.block_table = JSON::Get<std::string_view>(value);
+    } else if (name == "targets") {
+      v_.targets = JSON::Get<std::string_view>(value);
+    } else if (name == "target_length") {
+      v_.target_length = JSON::Get<std::string_view>(value);
+    } else if (name == "states_1") {
+      v_.states_1 = JSON::Get<std::string_view>(value);
+    } else if (name == "states_2") {
+      v_.states_2 = JSON::Get<std::string_view>(value);
     } else {
       throw JSON::unknown_value_error{};
     }
@@ -340,6 +348,14 @@ struct DecoderOutputs_Element : JSON::Element {
       v_.output_cross_qk_names = JSON::Get<std::string_view>(value);
     } else if (name == "rnn_states") {
       v_.rnn_states = JSON::Get<std::string_view>(value);
+    } else if (name == "outputs") {
+      v_.outputs = JSON::Get<std::string_view>(value);
+    } else if (name == "prednet_lengths") {
+      v_.prednet_lengths = JSON::Get<std::string_view>(value);
+    } else if (name == "states_1") {
+      v_.states_1 = JSON::Get<std::string_view>(value);
+    } else if (name == "states_2") {
+      v_.states_2 = JSON::Get<std::string_view>(value);
     } else {
       throw JSON::unknown_value_error{};
     }
@@ -557,10 +573,10 @@ struct Decoder_Element : JSON::Element {
       v_.hidden_size = static_cast<int>(JSON::Get<double>(value));
     } else if (name == "num_attention_heads") {
       v_.num_attention_heads = static_cast<int>(JSON::Get<double>(value));
-    } else if (name == "num_key_value_heads") {
-      v_.num_key_value_heads = static_cast<int>(JSON::Get<double>(value));
     } else if (name == "num_hidden_layers") {
       v_.num_hidden_layers = static_cast<int>(JSON::Get<double>(value));
+    } else if (name == "num_key_value_heads") {
+      v_.num_key_value_heads = static_cast<int>(JSON::Get<double>(value));
     } else if (name == "head_size") {
       v_.head_size = static_cast<int>(JSON::Get<double>(value));
     } else {
@@ -827,6 +843,50 @@ struct Speech_Element : JSON::Element {
       v_.config_filename = JSON::Get<std::string_view>(value);
     } else if (name == "adapter_filename") {
       v_.adapter_filename = JSON::Get<std::string_view>(value);
+    } else if (name == "num_mels") {
+      v_.num_mels = static_cast<int>(JSON::Get<double>(value));
+    } else if (name == "fft_size") {
+      v_.fft_size = static_cast<int>(JSON::Get<double>(value));
+    } else if (name == "hop_length") {
+      v_.hop_length = static_cast<int>(JSON::Get<double>(value));
+    } else if (name == "win_length") {
+      v_.win_length = static_cast<int>(JSON::Get<double>(value));
+    } else if (name == "preemph") {
+      v_.preemph = static_cast<float>(JSON::Get<double>(value));
+    } else if (name == "log_eps") {
+      v_.log_eps = static_cast<float>(JSON::Get<double>(value));
+    } else if (name == "subsampling_factor") {
+      v_.subsampling_factor = static_cast<int>(JSON::Get<double>(value));
+    } else if (name == "left_context") {
+      v_.left_context = static_cast<int>(JSON::Get<double>(value));
+    } else if (name == "conv_context") {
+      v_.conv_context = static_cast<int>(JSON::Get<double>(value));
+    } else if (name == "pre_encode_cache_size") {
+      v_.pre_encode_cache_size = static_cast<int>(JSON::Get<double>(value));
+    } else if (name == "sample_rate") {
+      v_.sample_rate = static_cast<int>(JSON::Get<double>(value));
+    } else if (name == "chunk_samples") {
+      v_.chunk_samples = static_cast<int>(JSON::Get<double>(value));
+    } else if (name == "blank_id") {
+      v_.blank_id = static_cast<int>(JSON::Get<double>(value));
+    } else if (name == "max_symbols_per_step") {
+      v_.max_symbols_per_step = static_cast<int>(JSON::Get<double>(value));
+    } else if (name == "enc_in_length") {
+      v_.enc_in_length = JSON::Get<std::string_view>(value);
+    } else if (name == "enc_in_cache_channel") {
+      v_.enc_in_cache_channel = JSON::Get<std::string_view>(value);
+    } else if (name == "enc_in_cache_time") {
+      v_.enc_in_cache_time = JSON::Get<std::string_view>(value);
+    } else if (name == "enc_in_cache_channel_len") {
+      v_.enc_in_cache_channel_len = JSON::Get<std::string_view>(value);
+    } else if (name == "enc_out_length") {
+      v_.enc_out_length = JSON::Get<std::string_view>(value);
+    } else if (name == "enc_out_cache_channel") {
+      v_.enc_out_cache_channel = JSON::Get<std::string_view>(value);
+    } else if (name == "enc_out_cache_time") {
+      v_.enc_out_cache_time = JSON::Get<std::string_view>(value);
+    } else if (name == "enc_out_cache_channel_len") {
+      v_.enc_out_cache_channel_len = JSON::Get<std::string_view>(value);
     } else {
       throw JSON::unknown_value_error{};
     }
@@ -860,6 +920,77 @@ struct Speech_Element : JSON::Element {
   SpeechOutputs_Element outputs_{v_.outputs};
 };
 
+// JSON handler that fills a Config::Model::Joiner::Inputs from string values.
+// Recognized keys are "encoder_outputs" and "decoder_outputs"; any other key
+// throws JSON::unknown_value_error.
+struct JoinerInputs_Element : JSON::Element {
+  explicit JoinerInputs_Element(Config::Model::Joiner::Inputs& inputs) : v_{inputs} {}
+
+  void OnValue(std::string_view name, JSON::Value value) override {
+    if (name == "encoder_outputs") {
+      v_.encoder_outputs = JSON::Get<std::string_view>(value);
+      return;
+    }
+    if (name == "decoder_outputs") {
+      v_.decoder_outputs = JSON::Get<std::string_view>(value);
+      return;
+    }
+    // Nothing matched: reject the key.
+    throw JSON::unknown_value_error{};
+  }
+
+ private:
+  Config::Model::Joiner::Inputs& v_;  // Destination struct, held by reference.
+};
+
+// JSON handler that fills a Config::Model::Joiner::Outputs from string values.
+// The only recognized key is "logits"; any other key throws
+// JSON::unknown_value_error.
+struct JoinerOutputs_Element : JSON::Element {
+  explicit JoinerOutputs_Element(Config::Model::Joiner::Outputs& outputs) : v_{outputs} {}
+
+  void OnValue(std::string_view name, JSON::Value value) override {
+    // Reject unknown keys up front so the single supported key is the main path.
+    if (name != "logits") {
+      throw JSON::unknown_value_error{};
+    }
+    v_.logits = JSON::Get<std::string_view>(value);
+  }
+
+ private:
+  Config::Model::Joiner::Outputs& v_;  // Destination struct, held by reference.
+};
+
+// JSON handler for a Config::Model::Joiner.
+//
+// Scalar values: only "filename" is accepted.
+// Nested objects: "session_options", "run_options", "inputs" and "outputs".
+// Anything else throws JSON::unknown_value_error.
+struct Joiner_Element : JSON::Element {
+  explicit Joiner_Element(Config::Model::Joiner& joiner) : v_{joiner} {}
+
+  void OnValue(std::string_view name, JSON::Value value) override {
+    if (name != "filename") {
+      throw JSON::unknown_value_error{};
+    }
+    v_.filename = JSON::Get<std::string_view>(value);
+  }
+
+  Element& OnObject(std::string_view name) override {
+    if (name == "inputs") {
+      return inputs_;
+    } else if (name == "outputs") {
+      return outputs_;
+    } else if (name == "session_options") {
+      // Reset the optional to a default-constructed value, then parse into it
+      // through a freshly created child handler.
+      v_.session_options = Config::SessionOptions{};
+      session_options_ = std::make_unique<SessionOptions_Element>(*v_.session_options);
+      return *session_options_;
+    } else if (name == "run_options") {
+      // Same pattern as session_options above.
+      v_.run_options = Config::RunOptions{};
+      run_options_ = std::make_unique<RunOptions_Element>(*v_.run_options);
+      return *run_options_;
+    }
+    throw JSON::unknown_value_error{};
+  }
+
+ private:
+  Config::Model::Joiner& v_;  // Destination struct, held by reference.
+  // Child handlers for the optional sections; created when the section is seen.
+  std::unique_ptr<SessionOptions_Element> session_options_;
+  std::unique_ptr<RunOptions_Element> run_options_;
+  // Child handlers bound to the always-present inputs/outputs sub-structs.
+  JoinerInputs_Element inputs_{v_.inputs};
+  JoinerOutputs_Element outputs_{v_.outputs};
+};
+
 struct EmbeddingInputs_Element : JSON::Element {
   explicit EmbeddingInputs_Element(Config::Model::Embedding::Inputs& v) : v_{v} {}
 
@@ -986,6 +1117,9 @@ struct Model_Element : JSON::Element {
     if (name == "speech") {
       return speech_;
     }
+    if (name == "joiner") {
+      return joiner_;
+    }
     throw JSON::unknown_value_error{};
   }
 
@@ -997,6 +1131,7 @@ struct Model_Element : JSON::Element {
   Vision_Element vision_{v_.vision};
   Embedding_Element embedding_{v_.embedding};
   Speech_Element speech_{v_.speech};
+  Joiner_Element joiner_{v_.joiner};
 };
 
 int SafeDoubleToInt(double x, std::string_view name) {

@@ -202,6 +202,32 @@ struct Config {
       std::string config_filename{"audio_processor_config.json"};
       std::optional<std::string> adapter_filename{};
 
+      // Mel spectrogram / streaming ASR parameters
+      int num_mels{};
+      int fft_size{};
+      int hop_length{};
+      int win_length{};
+      float preemph{};
+      float log_eps{};
+      int subsampling_factor{};
+      int left_context{};
+      int conv_context{};
+      int pre_encode_cache_size{};
+      int sample_rate{};
+      int chunk_samples{};
+      int blank_id{};
+      int max_symbols_per_step{};
+
+      // Cache-aware streaming encoder I/O names
+      std::string enc_in_length{"length"};
+      std::string enc_in_cache_channel{"cache_last_channel"};
+      std::string enc_in_cache_time{"cache_last_time"};
+      std::string enc_in_cache_channel_len{"cache_last_channel_len"};
+      std::string enc_out_length{"encoded_lengths"};
+      std::string enc_out_cache_channel{"cache_last_channel_next"};
+      std::string enc_out_cache_time{"cache_last_time_next"};
+      std::string enc_out_cache_channel_len{"cache_last_channel_next_len"};
+
       struct Inputs {
         std::string audio_embeds{Defaults::AudioEmbedsName};
         std::string attention_mask{Defaults::AudioAttentionMaskName};
@@ -214,6 +240,21 @@ struct Config {
       } outputs;
     } speech;
 
+    // Settings for the "joiner" section of the model config.
+    // NOTE(review): presumably the joint network of an RNN-T style model - confirm.
+    struct Joiner {
+      std::string filename;  // Empty unless a "filename" value is configured.
+      // Left disengaged unless the corresponding JSON object is present.
+      std::optional<SessionOptions> session_options;
+      std::optional<RunOptions> run_options;
+
+      // Configurable names with their defaults.
+      // NOTE(review): presumably the joiner model's graph input names - confirm.
+      struct Inputs {
+        std::string encoder_outputs{"encoder_outputs"};
+        std::string decoder_outputs{"decoder_outputs"};
+      } inputs;
+
+      // NOTE(review): presumably the joiner model's graph output names - confirm.
+      struct Outputs {
+        std::string logits{"outputs"};  // Default name is "outputs", not "logits".
+      } outputs;
+    } joiner;
+
     struct Decoder {
       std::string filename;
       SessionOptions session_options;
@@ -255,6 +296,12 @@ struct Config {
         std::string cumulative_sequence_lengths{Defaults::CumulativeSequenceLengthsName};
         std::string past_sequence_lengths{Defaults::PastSequenceLengthsName};
         std::string block_table{Defaults::BlockTableName};
+
+        // RNNT decoder inputs
+        std::string targets;
+        std::string target_length; 
+        std::string states_1;
+        std::string states_2;
       } inputs;
 
       struct Outputs {
@@ -264,6 +311,12 @@ struct Config {
         std::string present_names;  // When key/value pairs are combined
         std::string output_cross_qk_names{"output_cross_qk_%d"};
         std::string rnn_states{Defaults::RnnStatesName};
+
+        // RNNT decoder outputs
+        std::string outputs;
+        std::string prednet_lengths;
+        std::string states_1;
+        std::string states_2;
       } outputs;
 
       struct PipelineModel {

@@ -4,6 +4,7 @@
 // Modifications Copyright(C) 2026 Advanced Micro Devices, Inc. All rights reserved.
 
 #include "generators.h"
+#include "streaming_asr.h"
 #include "sequences.h"
 #include "models/env_utils.h"
 #include "models/model.h"