microsoft · kunal-vaishnavi · Mar 17, 2026 · Feb 10, 2026 · Feb 11, 2026 · Feb 12, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -134,6 +134,15 @@ if(ENABLE_TESTS)
     add_compile_definitions(TEST_PHI2=0)
   endif()
 
+  # Always define TEST_STREAMING_ASR: 1 when the TEST_STREAMING_ASR variable is
+  # truthy, 0 otherwise. STREAMING_ASR_PATH is forwarded as a quoted string
+  # definition only when the test is enabled and that variable is set.
+  if (TEST_STREAMING_ASR)
+    add_compile_definitions(TEST_STREAMING_ASR=1)
+    if (STREAMING_ASR_PATH)
+      add_compile_definitions(STREAMING_ASR_PATH="${STREAMING_ASR_PATH}")
+    endif()
+  else()
+    add_compile_definitions(TEST_STREAMING_ASR=0)
+  endif()
+
   if (USE_WEBGPU)
     add_compile_definitions(USE_WEBGPU=1)
   else()

@@ -0,0 +1,22 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <!-- Console executable targeting .NET 8 with nullable reference types and implicit usings enabled. -->
+  <PropertyGroup>
+    <TargetFramework>net8.0</TargetFramework>
+    <OutputType>Exe</OutputType>
+    <Nullable>enable</Nullable>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+  </PropertyGroup>
+
+  <!--
+    The GenAI package is selected by build configuration: Debug/Release reference the base
+    package, Debug_Cuda/Release_Cuda reference the CUDA package. Any other configuration name
+    matches neither condition, so no GenAI package is referenced for it.
+  -->
+  <ItemGroup>
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.12.0" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.12.0" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
+    <PackageReference Include="NAudio" Version="2.2.1" />
+    <PackageReference Include="System.CommandLine" Version="2.0.1" />
+  </ItemGroup>
+
+  <!-- Compile the Common.cs helper from the sibling Common directory into this project as a linked file. -->
+  <ItemGroup>
+    <Compile Include="../Common/Common.cs" Link="Common/Common.cs" />
+  </ItemGroup>
+
+</Project>
@@ -0,0 +1,101 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+using CommonUtils;
+using Microsoft.ML.OnnxRuntimeGenAI;
+using NAudio.Wave;
+using NAudio.Wave.SampleProviders;
+using System.Text.Json;
+
+// Entry point: transcribe an audio file by feeding it to the model in fixed-size chunks,
+// printing text as it is decoded and the full transcript at the end.
+if (args.Length < 2) {
+  Console.WriteLine("Usage: NemotronSpeech <model_path> <audio_file.wav> [execution_provider]");
+  return;
+}
+
+string modelPath = args[0];
+string audioFile = args[1];
+string executionProvider = args.Length > 2 ? args[2] : "follow_config";
+
+// Read sample_rate and chunk_samples from genai_config.json.
+// JsonDocument is IDisposable, so it is scoped to the block that extracts the two values.
+int sampleRate;
+int chunkSize;
+using (var configJson = JsonDocument.Parse(File.ReadAllText(Path.Combine(modelPath, "genai_config.json")))) {
+  var modelConfig = configJson.RootElement.GetProperty("model");
+  sampleRate = modelConfig.GetProperty("sample_rate").GetInt32();
+  chunkSize = modelConfig.GetProperty("chunk_samples").GetInt32();
+}
+
+// A chunk size of 0 would make the chunking loop below never advance, and a negative one
+// would request a negative array length; reject both (and a non-positive sample rate) up front.
+if (sampleRate <= 0 || chunkSize <= 0) {
+  Console.Error.WriteLine($"Invalid genai_config.json: sample_rate ({sampleRate}) and chunk_samples ({chunkSize}) must be positive.");
+  Environment.ExitCode = 1;
+  return;
+}
+
+// Load audio, convert to mono, and resample to match the model's expected sample rate
+float[] audio = LoadAudio(audioFile, sampleRate);
+Console.WriteLine($"Audio: {audio.Length / (double)sampleRate:F1}s ({audio.Length} samples)");
+
+using var config = Common.GetConfig(path: modelPath, ep: executionProvider, null, new GeneratorParamsArgs());
+using var model = new Model(config);
+using var processor = new StreamingProcessor(model);
+using var tokenizer = new Tokenizer(model);
+using var tokenizerStream = tokenizer.CreateStream();
+using var genParams = new GeneratorParams(model);
+using var generator = new Generator(model, genParams);
+Console.WriteLine(new string('-', 60));
+string fullTranscript = "";
+
+// Feed the audio chunk by chunk; the final chunk may be shorter than chunkSize.
+for (int i = 0; i < audio.Length; i += chunkSize) {
+  int remaining = Math.Min(chunkSize, audio.Length - i);
+  float[] chunk = new float[remaining];
+  Array.Copy(audio, i, chunk, 0, remaining);
+
+  // Process may return null, in which case there is nothing to decode for this chunk.
+  using var inputs = processor.Process(chunk);
+  if (inputs != null) {
+    generator.SetInputs(inputs);
+    fullTranscript += DecodeTokens(generator, tokenizerStream);
+  }
+}
+
+// Flush remaining buffered audio
+using var flushInputs = processor.Flush();
+if (flushInputs != null) {
+  generator.SetInputs(flushInputs);
+  fullTranscript += DecodeTokens(generator, tokenizerStream);
+}
+
+Console.WriteLine($"\n{new string('=', 60)}");
+Console.WriteLine($"  {fullTranscript.Trim()}");
+Console.WriteLine(new string('=', 60));
+
+// Drives the generator until it reports done, echoing each non-empty decoded piece to the
+// console and returning the concatenation of those pieces.
+static string DecodeTokens(Generator generator, TokenizerStream tokenizerStream) {
+  string decoded = "";
+  while (true) {
+    if (generator.IsDone()) {
+      break;
+    }
+    generator.GenerateNextToken();
+    var nextTokens = generator.GetNextTokens();
+    if (nextTokens.Length == 0) {
+      continue;
+    }
+    // Only the first token of the step is decoded.
+    string piece = tokenizerStream.Decode(nextTokens[0]);
+    if (string.IsNullOrEmpty(piece)) {
+      continue;
+    }
+    Console.Write(piece);
+    decoded += piece;
+  }
+  return decoded;
+}
+
+// Loads an audio file as mono float samples at targetSampleRate.
+// Stereo input goes through NAudio's StereoToMonoSampleProvider. Input with more than two
+// channels is averaged per frame here, because StereoToMonoSampleProvider rejects any source
+// that is not exactly two channels.
+static float[] LoadAudio(string path, int targetSampleRate) {
+  using var reader = new AudioFileReader(path);
+
+  ISampleProvider source = reader;
+  // Number of interleaved channels still present in what `source` yields.
+  int channels = reader.WaveFormat.Channels;
+  if (channels == 2) {
+    source = new StereoToMonoSampleProvider(source);
+    channels = 1;
+  }
+
+  // Resample if needed
+  if (reader.WaveFormat.SampleRate != targetSampleRate) {
+    source = new WdlResamplingSampleProvider(source, targetSampleRate);
+  }
+
+  var samples = new List<float>();
+  // The buffer size is arbitrary, but kept a multiple of the channel count so each read
+  // request covers a whole number of frames.
+  float[] buffer = new float[4096 * channels];
+  int read;
+  while ((read = source.Read(buffer, 0, buffer.Length)) > 0) {
+    if (channels == 1) {
+      samples.AddRange(new ArraySegment<float>(buffer, 0, read));
+      continue;
+    }
+    // Downmix interleaved multi-channel frames by averaging the channels.
+    // NOTE(review): assumes Read returns whole frames when asked for a whole number of frames.
+    for (int frame = 0; frame + channels <= read; frame += channels) {
+      float sum = 0f;
+      for (int c = 0; c < channels; c++) {
+        sum += buffer[frame + c];
+      }
+      samples.Add(sum / channels);
+    }
+  }
+  return samples.ToArray();
+}
@@ -0,0 +1,105 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import argparse
+import json
+import os
+import sys
+import time
+import numpy as np
+import onnxruntime_genai as og
+from common import get_config
+
+
+def load_config(model_path):
+    """Return ``(sample_rate, chunk_samples)`` read from the model's genai_config.json."""
+    with open(os.path.join(model_path, "genai_config.json"), "r") as handle:
+        parsed = json.load(handle)
+    # Both values live under the top-level "model" section.
+    model_section = parsed["model"]
+    return model_section["sample_rate"], model_section["chunk_samples"]
+
+
+def load_audio(audio_path, sample_rate):
+    """Read an audio file as float32, average it down to one dimension, and resample to sample_rate."""
+    import soundfile as sf
+    samples, native_rate = sf.read(audio_path, dtype="float32")
+    # A result with more than one dimension is averaged along axis 1
+    # (presumably the channel axis -- confirm against soundfile's docs).
+    if samples.ndim > 1:
+        samples = samples.mean(axis=1)
+    if native_rate == sample_rate:
+        return samples
+    import scipy.signal
+    target_len = int(len(samples) * sample_rate / native_rate)
+    return scipy.signal.resample(samples, target_len).astype(np.float32)
+
+
+def decode_tokens(generator, tokenizer_stream):
+    """Step the generator until it is done, echoing and returning the decoded text."""
+    decoded = ""
+    while True:
+        if generator.is_done():
+            break
+        generator.generate_next_token()
+        next_tokens = generator.get_next_tokens()
+        if len(next_tokens) == 0:
+            continue
+        # Only the first token of the step is decoded.
+        piece = tokenizer_stream.decode(next_tokens[0])
+        if not piece:
+            continue
+        print(piece, end="", flush=True)
+        decoded += piece
+    return decoded
+
+
+def simulate_microphone(model_path, audio_path, execution_provider):
+    """Stream an audio file through Generator + StreamingProcessor in fixed-size chunks.
+
+    Prints decoded text as it is produced, then the full transcript and a timing
+    summary (audio duration, wall time, and audio-seconds processed per wall-second).
+    """
+    sample_rate, chunk_samples = load_config(model_path)
+    audio = load_audio(audio_path, sample_rate)
+    duration = len(audio) / sample_rate
+
+    config = get_config(model_path, execution_provider)
+    model = og.Model(config)
+    processor = og.StreamingProcessor(model)
+    tokenizer = og.Tokenizer(model)
+    tokenizer_stream = tokenizer.create_stream()
+    params = og.GeneratorParams(model)
+    generator = og.Generator(model, params)
+
+    print("-" * 60)
+    # perf_counter is the clock intended for measuring elapsed intervals.
+    stream_start = time.perf_counter()
+    full_transcript = ""
+
+    for i in range(0, len(audio), chunk_samples):
+        chunk = audio[i:i + chunk_samples].astype(np.float32)
+        inputs = processor.process(chunk)
+        # process() may return None, in which case there is nothing to decode yet.
+        if inputs is not None:
+            generator.set_inputs(inputs)
+            full_transcript += decode_tokens(generator, tokenizer_stream)
+
+    # Flush remaining audio
+    inputs = processor.flush()
+    if inputs is not None:
+        generator.set_inputs(inputs)
+        full_transcript += decode_tokens(generator, tokenizer_stream)
+
+    total_wall = time.perf_counter() - stream_start
+    # Avoid ZeroDivisionError if the elapsed time measures as exactly zero.
+    speed = duration / total_wall if total_wall > 0 else float("inf")
+
+    print(f"\n{'=' * 60}")
+    print(f"  {full_transcript.strip()}")
+    print(f"{'=' * 60}")
+    print(f"  Audio: {duration:.2f}s | Wall: {total_wall:.2f}s | RTF: {speed:.2f}x")
+
+
+def main():
+    """Parse command-line arguments and run the streaming transcription on the given file."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--model_path", type=str, required=True)
+    cli.add_argument("--audio_file", type=str, required=True)
+    cli.add_argument(
+        "-e", "--execution_provider",
+        type=str, required=False, default="follow_config",
+        choices=["cpu", "cuda", "dml", "follow_config"],
+        help="Execution provider to run with. Defaults to follow_config.",
+    )
+    opts = cli.parse_args()
+    # Run only when the audio file exists; otherwise report and exit with status 1.
+    if os.path.exists(opts.audio_file):
+        simulate_microphone(opts.model_path, opts.audio_file, opts.execution_provider)
+        return
+    print(f"Error: {opts.audio_file} not found")
+    sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/nuget.config b/nuget.config
@@ -3,5 +3,6 @@
   <packageSources>
     <clear />
     <add key="ORT-Nightly" value="https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json" />
+    <add key="nuget.org" value="https://api.nuget.org/v3/index.json" />
   </packageSources>
 </configuration>