Skip to content
Merged
Show file tree
Hide file tree
Changes from 66 commits
Commits
Show all changes
70 commits
Select commit Hold shift + click to select a range
31d6779
nemotron support
nenad1002 Feb 10, 2026
9026781
ONNX 2 good version
nenad1002 Feb 11, 2026
8c6f4ed
Nemotron support
nenad1002 Feb 12, 2026
9dd6212
Support 4
nenad1002 Feb 12, 2026
8b0de45
First stream
nenad1002 Feb 12, 2026
0d83168
Overlap support
nenad1002 Feb 12, 2026
d2ff912
Nemotron support stream 3
nenad1002 Feb 12, 2026
c7ed0c9
Mi fix
nenad1002 Feb 13, 2026
b83b84f
Move mel stuff to separate file
nenad1002 Feb 13, 2026
ff275b4
Remove mel spectogram
nenad1002 Feb 13, 2026
32001f1
Revert non-needed changes
nenad1002 Feb 17, 2026
131db0c
Make sure genai_config.json defines model params
nenad1002 Feb 18, 2026
b262003
Point to latest extensions
nenad1002 Feb 20, 2026
5cc511d
Add tests
nenad1002 Feb 20, 2026
5cf0c59
Add a better test
nenad1002 Feb 20, 2026
f670d36
Remove text tokenizer and sr to genaiconfig
nenad1002 Feb 20, 2026
6e23d87
Remove dead code
nenad1002 Feb 20, 2026
fd47344
Abstract streaming ASR class
nenad1002 Feb 20, 2026
98d6e54
remove processor
nenad1002 Feb 21, 2026
06d05d0
Fix merge conflict
nenad1002 Mar 2, 2026
46a166d
Clean more code
nenad1002 Mar 2, 2026
8a5e912
Clean up examples
nenad1002 Mar 2, 2026
e10086f
Performance optimizations
nenad1002 Mar 2, 2026
5eb10ff
More cleaning
nenad1002 Mar 2, 2026
89b8bd5
Try removing warning
nenad1002 Mar 2, 2026
880143b
Add flag to tests
nenad1002 Mar 2, 2026
092d212
fix formatting
nenad1002 Mar 3, 2026
67e649c
Resolve Copilot comments
nenad1002 Mar 3, 2026
5be81c9
Fix formatting issue
nenad1002 Mar 3, 2026
b3c6411
Merge branch 'main' into nebanfic/nemotron-support-stream-3
nenad1002 Mar 3, 2026
165037b
Remove soundfile
nenad1002 Mar 3, 2026
5097afd
Remove dead tokenzier code
nenad1002 Mar 5, 2026
98e81b7
Adjust genai config to our exported models
nenad1002 Mar 5, 2026
0a6d87b
Resolve more comments
nenad1002 Mar 5, 2026
70f4e23
Avoid memset, memcpy and manual copy on GPU and whenever possible, ri…
nenad1002 Mar 5, 2026
4d8a0f5
Add consistency
nenad1002 Mar 5, 2026
96dafca
Big improvement - cache locality for frames
nenad1002 Mar 6, 2026
2499ab4
Csharp support
nenad1002 Mar 6, 2026
51a61c7
Add a check to the factory for StreamingASR
nenad1002 Mar 6, 2026
9e28df9
nemotron generator
nenad1002 Mar 6, 2026
8a1bef0
remove ProcessChunk from model.h
nenad1002 Mar 6, 2026
154b5aa
remove generate_next_tokens()
nenad1002 Mar 6, 2026
571f300
Rename processor
nenad1002 Mar 6, 2026
f61fc0a
C# sample and remove unnecessary files
nenad1002 Mar 9, 2026
8596fc1
Fix all
nenad1002 Mar 9, 2026
b762766
more fixes
nenad1002 Mar 9, 2026
e2ab1e7
samples change
nenad1002 Mar 9, 2026
4059341
Introduce NamedTensors on streaming processor
nenad1002 Mar 10, 2026
ca9a9f3
Remove speech section in genai_config
nenad1002 Mar 10, 2026
282a9f0
Reverse NativeMethods.cs formatting
nenad1002 Mar 10, 2026
dc46428
Some refactoring
nenad1002 Mar 10, 2026
7c636ba
Make streaming processor abstract class
nenad1002 Mar 10, 2026
33f809b
set_inputs
nenad1002 Mar 10, 2026
66ee360
Copilot suggestions
nenad1002 Mar 11, 2026
27ce2e5
Examples changes
nenad1002 Mar 12, 2026
8723d3b
More comments resolved
nenad1002 Mar 12, 2026
96ce812
SubStates
nenad1002 Mar 12, 2026
df8cb9b
More changes
nenad1002 Mar 12, 2026
c5ed7df
Resolvimg more comments
nenad1002 Mar 12, 2026
02c5fde
Mass copy
nenad1002 Mar 12, 2026
101113d
Copilot fixes
nenad1002 Mar 12, 2026
640e9af
Merge conflict fix
nenad1002 Mar 12, 2026
7d023ef
Potential fix for code scanning alert no. 798: Unused local variable
nenad1002 Mar 12, 2026
7283dd1
Fix clang
nenad1002 Mar 12, 2026
a3f77e4
Run clang
nenad1002 Mar 12, 2026
658e8de
fix tests
nenad1002 Mar 12, 2026
78b84a2
Resolve comments
nenad1002 Mar 13, 2026
ff890a9
Add C++ example
nenad1002 Mar 16, 2026
9473ab8
Semicolon on another line
nenad1002 Mar 16, 2026
8e683f3
Add C# sample readme
nenad1002 Mar 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,15 @@ if(ENABLE_TESTS)
add_compile_definitions(TEST_PHI2=0)
endif()

# Expose the TEST_STREAMING_ASR switch to the compiler as a 0/1 macro and,
# when STREAMING_ASR_PATH is set, forward it as a string-literal macro.
# NOTE(review): presumably consumed by the streaming ASR tests - confirm.
if(TEST_STREAMING_ASR)
  add_compile_definitions(TEST_STREAMING_ASR=1)
  if(STREAMING_ASR_PATH)
    # The value ends up inside a C/C++ string literal. Convert it to
    # forward slashes so a Windows path such as C:\models\new is not
    # misread as containing escape sequences (\m, \n, ...).
    file(TO_CMAKE_PATH "${STREAMING_ASR_PATH}" streaming_asr_path_normalized)
    add_compile_definitions(STREAMING_ASR_PATH="${streaming_asr_path_normalized}")
  endif()
else()
  add_compile_definitions(TEST_STREAMING_ASR=0)
endif()

if (USE_WEBGPU)
add_compile_definitions(USE_WEBGPU=1)
else()
Expand Down
22 changes: 22 additions & 0 deletions examples/csharp/NemotronSpeech/NemotronSpeech.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<Project Sdk="Microsoft.NET.Sdk">

Comment thread
kunal-vaishnavi marked this conversation as resolved.
<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
<OutputType>Exe</OutputType>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.12.0" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.12.0" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
<PackageReference Include="NAudio" Version="2.2.1" />
<PackageReference Include="System.CommandLine" Version="2.0.1" />
</ItemGroup>

<ItemGroup>
<Compile Include="../Common/Common.cs" Link="Common/Common.cs" />
</ItemGroup>

</Project>
101 changes: 101 additions & 0 deletions examples/csharp/NemotronSpeech/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

using CommonUtils;
using Microsoft.ML.OnnxRuntimeGenAI;
using NAudio.Wave;
using NAudio.Wave.SampleProviders;
using System.Text.Json;

// Entry point: streams a WAV file through StreamingProcessor + Generator in
// fixed-size chunks and prints the transcript as it is decoded.
if (args.Length < 2) {
    // Usage errors go to stderr with a non-zero exit code so scripts can detect them.
    Console.Error.WriteLine("Usage: NemotronSpeech <model_path> <audio_file.wav> [execution_provider]");
    Environment.ExitCode = 1;
    return;
}

string modelPath = args[0];
string audioFile = args[1];
string executionProvider = args.Length > 2 ? args[2] : "follow_config";

// Read sample_rate and chunk_samples from genai_config.json.
// JsonDocument is IDisposable, so it is released as soon as the two values
// have been extracted.
int sampleRate;
int chunkSize;
using (var configJson = JsonDocument.Parse(File.ReadAllText(Path.Combine(modelPath, "genai_config.json")))) {
    var modelConfig = configJson.RootElement.GetProperty("model");
    sampleRate = modelConfig.GetProperty("sample_rate").GetInt32();
    chunkSize = modelConfig.GetProperty("chunk_samples").GetInt32();
}

// A non-positive chunk size would make the chunking loop below never advance.
if (sampleRate <= 0 || chunkSize <= 0) {
    Console.Error.WriteLine($"Invalid genai_config.json: sample_rate ({sampleRate}) and chunk_samples ({chunkSize}) must be positive.");
    Environment.ExitCode = 1;
    return;
}

// Load audio, convert to mono, and resample to match the model's expected sample rate
float[] audio = LoadAudio(audioFile, sampleRate);
Console.WriteLine($"Audio: {audio.Length / (double)sampleRate:F1}s ({audio.Length} samples)");

using var config = Common.GetConfig(path: modelPath, ep: executionProvider, null, new GeneratorParamsArgs());
using var model = new Model(config);
using var processor = new StreamingProcessor(model);
using var tokenizer = new Tokenizer(model);
using var tokenizerStream = tokenizer.CreateStream();
using var genParams = new GeneratorParams(model);
using var generator = new Generator(model, genParams);
Console.WriteLine(new string('-', 60));
var fullTranscript = new System.Text.StringBuilder();

for (int i = 0; i < audio.Length; i += chunkSize) {
    // The final chunk may be shorter than chunkSize.
    int remaining = Math.Min(chunkSize, audio.Length - i);
    float[] chunk = new float[remaining];
    Array.Copy(audio, i, chunk, 0, remaining);

    // Process may return null; in that case there is nothing to decode for this chunk.
    using var inputs = processor.Process(chunk);
    if (inputs != null) {
        generator.SetInputs(inputs);
        fullTranscript.Append(DecodeTokens(generator, tokenizerStream));
    }
}

// Flush remaining buffered audio
using var flushInputs = processor.Flush();
if (flushInputs != null) {
    generator.SetInputs(flushInputs);
    fullTranscript.Append(DecodeTokens(generator, tokenizerStream));
}

Console.WriteLine($"\n{new string('=', 60)}");
Console.WriteLine($" {fullTranscript.ToString().Trim()}");
Console.WriteLine(new string('=', 60));

// Runs the generator until it reports done, echoing each decoded piece to the
// console as it arrives, and returns the concatenation of all pieces.
static string DecodeTokens(Generator generator, TokenizerStream tokenizerStream) {
    var decoded = new System.Text.StringBuilder();
    for (; !generator.IsDone();) {
        generator.GenerateNextToken();
        var nextTokens = generator.GetNextTokens();
        if (nextTokens.Length == 0) {
            continue;
        }

        // Only the first returned token is decoded.
        string piece = tokenizerStream.Decode(nextTokens[0]);
        if (string.IsNullOrEmpty(piece)) {
            continue;
        }

        Console.Write(piece);
        decoded.Append(piece);
    }
    return decoded.ToString();
}

// Loads an audio file as mono float samples at targetSampleRate.
//
// path:             audio file readable by NAudio's AudioFileReader.
// targetSampleRate: sample rate (Hz) of the returned samples.
// Returns one float per frame; multi-channel input is downmixed by averaging
// all channels of each frame.
static float[] LoadAudio(string path, int targetSampleRate) {
    using var reader = new AudioFileReader(path);

    // Resample if needed. This is done before the downmix so the code below
    // works for any channel count (StereoToMonoSampleProvider accepts
    // 2-channel input only and throws for e.g. 5.1 files).
    ISampleProvider source = reader;
    if (reader.WaveFormat.SampleRate != targetSampleRate) {
        source = new WdlResamplingSampleProvider(source, targetSampleRate);
    }

    int channels = source.WaveFormat.Channels;
    var samples = new List<float>();
    // Read buffer holds whole interleaved frames; the frame count itself is arbitrary.
    float[] buffer = new float[4096 * channels];
    int read;
    while ((read = source.Read(buffer, 0, buffer.Length)) > 0) {
        int frames = read / channels;
        for (int frame = 0; frame < frames; frame++) {
            int start = frame * channels;
            float sum = 0f;
            for (int ch = 0; ch < channels; ch++) {
                sum += buffer[start + ch];
            }
            // For mono input this is the sample itself; for stereo it is (L + R) / 2.
            samples.Add(sum / channels);
        }
    }
    return samples.ToArray();
}
105 changes: 105 additions & 0 deletions examples/python/nemotron_speech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

Comment thread
nenad1002 marked this conversation as resolved.
import argparse
import json
import os
import sys
import time
import numpy as np
import onnxruntime_genai as og
from common import get_config


def load_config(model_path):
    """Read sample_rate and chunk_samples from genai_config.json.

    Args:
        model_path: Directory containing ``genai_config.json``.

    Returns:
        A ``(sample_rate, chunk_samples)`` tuple taken from the ``"model"``
        section of the config.

    Raises:
        FileNotFoundError: If the config file does not exist.
        KeyError: If ``model``, ``sample_rate`` or ``chunk_samples`` is missing.
    """
    config_path = os.path.join(model_path, "genai_config.json")
    # JSON is UTF-8; be explicit so the locale's default encoding (e.g. cp1252
    # on Windows) is never used to decode the file.
    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)
    model_section = config["model"]
    return model_section["sample_rate"], model_section["chunk_samples"]


def load_audio(audio_path, sample_rate):
    """Load an audio file as a 1-D float32 array at ``sample_rate``.

    The file is read with ``soundfile``. Input with more than one dimension is
    averaged over axis 1 (presumably the channel axis - confirm against the
    soundfile docs). If the file's rate differs from ``sample_rate``, the
    signal is resampled with ``scipy.signal.resample``.
    """
    import soundfile as sf

    samples, native_rate = sf.read(audio_path, dtype="float32")
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # Already at the requested rate: nothing more to do.
    if native_rate == sample_rate:
        return samples

    import scipy.signal

    target_len = int(len(samples) * sample_rate / native_rate)
    resampled = scipy.signal.resample(samples, target_len)
    return resampled.astype(np.float32)


def decode_tokens(generator, tokenizer_stream):
    """Drain the generator, echoing and collecting the decoded text.

    Repeatedly generates the next token until ``generator.is_done()`` is true.
    For each step that yields at least one token, the first token is decoded
    through ``tokenizer_stream``; non-empty pieces are printed immediately
    (no newline, flushed) and accumulated.

    Returns:
        The concatenation of all non-empty decoded pieces.
    """
    pieces = []
    while True:
        if generator.is_done():
            break
        generator.generate_next_token()
        next_tokens = generator.get_next_tokens()
        # len() rather than truthiness: the token container may be an array.
        if len(next_tokens) == 0:
            continue
        piece = tokenizer_stream.decode(next_tokens[0])
        if not piece:
            continue
        print(piece, end="", flush=True)
        pieces.append(piece)
    return "".join(pieces)


def simulate_microphone(model_path, audio_path, execution_provider):
    """Stream audio through Generator + StreamingProcessor API.

    The audio file is loaded at the sample rate given in genai_config.json and
    fed to the streaming processor in slices of ``chunk_samples`` samples,
    imitating a live microphone. Decoded text is printed as it is produced,
    followed by the full transcript and timing statistics.

    Args:
        model_path: Directory containing the model and genai_config.json.
        audio_path: Path of the audio file to transcribe.
        execution_provider: Execution provider name passed to ``get_config``.
    """
    sample_rate, chunk_samples = load_config(model_path)
    audio = load_audio(audio_path, sample_rate)
    duration = len(audio) / sample_rate

    config = get_config(model_path, execution_provider)
    model = og.Model(config)
    processor = og.StreamingProcessor(model)
    tokenizer = og.Tokenizer(model)
    tokenizer_stream = tokenizer.create_stream()
    params = og.GeneratorParams(model)
    generator = og.Generator(model, params)

    print("-" * 60)
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # wall-clock adjustments.
    stream_start = time.perf_counter()
    full_transcript = ""

    for i in range(0, len(audio), chunk_samples):
        # The last slice may be shorter than chunk_samples.
        chunk = audio[i:i + chunk_samples].astype(np.float32)
        inputs = processor.process(chunk)
        # process() may return None; then there is nothing to decode yet.
        if inputs is not None:
            generator.set_inputs(inputs)
            full_transcript += decode_tokens(generator, tokenizer_stream)

    # Flush remaining audio
    inputs = processor.flush()
    if inputs is not None:
        generator.set_inputs(inputs)
        full_transcript += decode_tokens(generator, tokenizer_stream)

    total_wall = time.perf_counter() - stream_start
    # Value shown under the "RTF" label is audio seconds per wall-clock second
    # (higher is faster). Guard against a zero interval.
    speed = duration / total_wall if total_wall > 0 else float("inf")

    print(f"\n{'=' * 60}")
    print(f" {full_transcript.strip()}")
    print(f"{'=' * 60}")
    print(f" Audio: {duration:.2f}s | Wall: {total_wall:.2f}s | RTF: {speed:.2f}x")


def main():
    """Parse command-line arguments and run the streaming transcription demo.

    Exits with status 1 (message on stderr) when the audio file or the model
    directory does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, required=True)
    parser.add_argument("--audio_file", type=str, required=True)
    parser.add_argument("-e", "--execution_provider", type=str, required=False, default="follow_config",
                        choices=["cpu", "cuda", "dml", "follow_config"],
                        help="Execution provider to run with. Defaults to follow_config.")
    args = parser.parse_args()

    # Diagnostics go to stderr so they are not mixed into the transcript on stdout.
    if not os.path.exists(args.audio_file):
        print(f"Error: {args.audio_file} not found", file=sys.stderr)
        sys.exit(1)
    # Fail early with a readable message instead of a traceback from the config read.
    if not os.path.isdir(args.model_path):
        print(f"Error: {args.model_path} is not a directory", file=sys.stderr)
        sys.exit(1)

    simulate_microphone(args.model_path, args.audio_file, args.execution_provider)


if __name__ == "__main__":
main()
1 change: 1 addition & 0 deletions nuget.config
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,6 @@
<packageSources>
<clear />
<add key="ORT-Nightly" value="https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json" />
<add key="nuget.org" value="https://api.nuget.org/v3/index.json" />
Comment thread
baijumeswani marked this conversation as resolved.
Outdated
</packageSources>
</configuration>
Loading
Loading