Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
70 commits
Select commit Hold shift + click to select a range
31d6779
nemotron support
nenad1002 Feb 10, 2026
9026781
ONNX 2 good version
nenad1002 Feb 11, 2026
8c6f4ed
Nemotron support
nenad1002 Feb 12, 2026
9dd6212
Support 4
nenad1002 Feb 12, 2026
8b0de45
First stream
nenad1002 Feb 12, 2026
0d83168
Overlap support
nenad1002 Feb 12, 2026
d2ff912
Nemotron support stream 3
nenad1002 Feb 12, 2026
c7ed0c9
Mi fix
nenad1002 Feb 13, 2026
b83b84f
Move mel stuff to separate file
nenad1002 Feb 13, 2026
ff275b4
Remove mel spectogram
nenad1002 Feb 13, 2026
32001f1
Revert non-needed changes
nenad1002 Feb 17, 2026
131db0c
Make sure genai_config.json defines model params
nenad1002 Feb 18, 2026
b262003
Point to latest extensions
nenad1002 Feb 20, 2026
5cc511d
Add tests
nenad1002 Feb 20, 2026
5cf0c59
Add a better test
nenad1002 Feb 20, 2026
f670d36
Remove text tokenizer and sr to genaiconfig
nenad1002 Feb 20, 2026
6e23d87
Remove dead code
nenad1002 Feb 20, 2026
fd47344
Abstract streaming ASR class
nenad1002 Feb 20, 2026
98d6e54
remove processor
nenad1002 Feb 21, 2026
06d05d0
Fix merge conflict
nenad1002 Mar 2, 2026
46a166d
Clean more code
nenad1002 Mar 2, 2026
8a5e912
Clean up examples
nenad1002 Mar 2, 2026
e10086f
Performance optimizations
nenad1002 Mar 2, 2026
5eb10ff
More cleaning
nenad1002 Mar 2, 2026
89b8bd5
Try removing warning
nenad1002 Mar 2, 2026
880143b
Add flag to tests
nenad1002 Mar 2, 2026
092d212
fix formatting
nenad1002 Mar 3, 2026
67e649c
Resolve Copilot comments
nenad1002 Mar 3, 2026
5be81c9
Fix formatting issue
nenad1002 Mar 3, 2026
b3c6411
Merge branch 'main' into nebanfic/nemotron-support-stream-3
nenad1002 Mar 3, 2026
165037b
Remove soundfile
nenad1002 Mar 3, 2026
5097afd
Remove dead tokenzier code
nenad1002 Mar 5, 2026
98e81b7
Adjust genai config to our exported models
nenad1002 Mar 5, 2026
0a6d87b
Resolve more comments
nenad1002 Mar 5, 2026
70f4e23
Avoid memset, memcpy and manual copy on GPU and whenever possible, ri…
nenad1002 Mar 5, 2026
4d8a0f5
Add consistency
nenad1002 Mar 5, 2026
96dafca
Big improvement - cache locality for frames
nenad1002 Mar 6, 2026
2499ab4
Csharp support
nenad1002 Mar 6, 2026
51a61c7
Add a check to the factory for StreamingASR
nenad1002 Mar 6, 2026
9e28df9
nemotron generator
nenad1002 Mar 6, 2026
8a1bef0
remove ProcessChunk from model.h
nenad1002 Mar 6, 2026
154b5aa
remove generate_next_tokens()
nenad1002 Mar 6, 2026
571f300
Rename processor
nenad1002 Mar 6, 2026
f61fc0a
C# sample and remove unnecessary files
nenad1002 Mar 9, 2026
8596fc1
Fix all
nenad1002 Mar 9, 2026
b762766
more fixes
nenad1002 Mar 9, 2026
e2ab1e7
samples change
nenad1002 Mar 9, 2026
4059341
Introduce NamedTensors on streaming processor
nenad1002 Mar 10, 2026
ca9a9f3
Remove speech section in genai_config
nenad1002 Mar 10, 2026
282a9f0
Reverse NativeMethods.cs formatting
nenad1002 Mar 10, 2026
dc46428
Some refactoring
nenad1002 Mar 10, 2026
7c636ba
Make streaming processor abstract class
nenad1002 Mar 10, 2026
33f809b
set_inputs
nenad1002 Mar 10, 2026
66ee360
Copilot suggestions
nenad1002 Mar 11, 2026
27ce2e5
Examples changes
nenad1002 Mar 12, 2026
8723d3b
More comments resolved
nenad1002 Mar 12, 2026
96ce812
SubStates
nenad1002 Mar 12, 2026
df8cb9b
More changes
nenad1002 Mar 12, 2026
c5ed7df
Resolvimg more comments
nenad1002 Mar 12, 2026
02c5fde
Mass copy
nenad1002 Mar 12, 2026
101113d
Copilot fixes
nenad1002 Mar 12, 2026
640e9af
Merge conflict fix
nenad1002 Mar 12, 2026
7d023ef
Potential fix for code scanning alert no. 798: Unused local variable
nenad1002 Mar 12, 2026
7283dd1
Fix clang
nenad1002 Mar 12, 2026
a3f77e4
Run clang
nenad1002 Mar 12, 2026
658e8de
fix tests
nenad1002 Mar 12, 2026
78b84a2
Resolve comments
nenad1002 Mar 13, 2026
ff890a9
Add C++ example
nenad1002 Mar 16, 2026
9473ab8
Semicolon on another line
nenad1002 Mar 16, 2026
8e683f3
Add C# sample readme
nenad1002 Mar 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions examples/c/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ option(MODEL_CHAT "Build the Model Chat example" OFF)
option(MODEL_QA "Build the Model Q&A example" OFF)
option(MODEL_MM "Build the Model Multimodal example" OFF)
option(WHISPER "Build the Whisper example" OFF)
option(NEMOTRON_SPEECH "Build the Nemotron Speech Streaming example" OFF)

if(USE_CXX)
add_compile_definitions(USE_CXX)
Expand Down Expand Up @@ -126,3 +127,9 @@ if(WHISPER)
target_link_libraries(whisper PRIVATE nlohmann_json::nlohmann_json)
target_link_libraries(whisper PRIVATE CLI11::CLI11)
endif()

# Nemotron streaming speech example: only built when the NEMOTRON_SPEECH option is enabled.
if(NEMOTRON_SPEECH)
add_executable(nemotron_speech ${EXAMPLES_SOURCE_DIR}/nemotron_speech.cpp)
prepare_executable(nemotron_speech)
# nemotron_speech.cpp includes <nlohmann/json.hpp> to parse genai_config.json.
target_link_libraries(nemotron_speech PRIVATE nlohmann_json::nlohmann_json)
endif()
240 changes: 240 additions & 0 deletions examples/c/src/nemotron_speech.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
//
// nemotron_speech.cpp — Streaming ASR example using StreamingProcessor + Generator API.
//
// Usage:
// ./nemotron_speech --model_path /path/to/nemotron-model --audio_file /path/to/audio.wav

#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

#include <nlohmann/json.hpp>
#include "ort_genai.h"

// Audio settings this example reads from the "model" section of genai_config.json.
// Members are zero-initialized so a default-constructed AudioConfig never holds
// indeterminate values; aggregate initialization and structured bindings still work.
struct AudioConfig {
  int sample_rate = 0;    // "model.sample_rate": rate (Hz) the input audio is resampled to.
  int chunk_samples = 0;  // "model.chunk_samples": number of samples handed to the processor per call.
};

// Reads the audio settings from <model_path>/genai_config.json.
//
// Returns the values of "model.sample_rate" and "model.chunk_samples".
// Throws std::runtime_error if the file cannot be opened. Malformed JSON, a missing key or a
// non-numeric value surfaces as an nlohmann::json exception (derived from std::exception).
AudioConfig LoadConfig(const std::string& model_path) {
  const std::string config_path = model_path + "/genai_config.json";
  std::ifstream f(config_path);
  if (!f.is_open()) {
    throw std::runtime_error("Cannot open " + config_path);
  }

  const auto config = nlohmann::json::parse(f);
  // at() reports a missing key by name; operator[] on a mutable json would silently insert a
  // null there and only fail later with an unrelated-looking type error.
  const auto& model = config.at("model");
  return {
      model.at("sample_rate").get<int>(),
      model.at("chunk_samples").get<int>(),
  };
}

namespace {

// WAVE format tags (wFormatTag) that LoadWav can decode.
constexpr uint16_t kWavFormatPcm = 0x0001;
constexpr uint16_t kWavFormatIeeeFloat = 0x0003;
constexpr uint16_t kWavFormatExtensible = 0xFFFE;

// Reads `count` raw samples of type Sample from `file`. Samples that cannot be read stay zero.
template <typename Sample>
std::vector<Sample> ReadSamples(std::ifstream& file, std::size_t count) {
  std::vector<Sample> samples(count);
  file.read(reinterpret_cast<char*>(samples.data()),
            static_cast<std::streamsize>(count * sizeof(Sample)));
  return samples;
}

// Averages the interleaved channels of each frame into one mono sample and divides by `full_scale`.
template <typename Sample>
std::vector<float> DownmixToMono(const std::vector<Sample>& interleaved, std::size_t num_channels,
                                 float full_scale) {
  const std::size_t num_frames = interleaved.size() / num_channels;
  std::vector<float> mono(num_frames);
  for (std::size_t frame = 0; frame < num_frames; ++frame) {
    float sum = 0.0f;
    for (std::size_t c = 0; c < num_channels; ++c) {
      sum += static_cast<float>(interleaved[frame * num_channels + c]);
    }
    mono[frame] = (sum / static_cast<float>(num_channels)) / full_scale;
  }
  return mono;
}

// Linear-interpolation resampler that maps the first/last input sample onto the first/last output sample.
std::vector<float> ResampleLinear(const std::vector<float>& audio, double source_rate, int target_rate) {
  if (target_rate <= 0) {
    throw std::runtime_error("Invalid target sample rate: " + std::to_string(target_rate));
  }
  const std::size_t new_len =
      static_cast<std::size_t>(audio.size() * static_cast<double>(target_rate) / source_rate);
  std::vector<float> resampled(new_len);
  if (new_len == 0) {
    return resampled;
  }
  if (new_len == 1) {
    // The general formula divides by (new_len - 1); a single output sample is just the first input.
    resampled[0] = audio.front();
    return resampled;
  }

  const std::size_t last = audio.size() - 1;  // audio is non-empty whenever new_len >= 1
  for (std::size_t i = 0; i < new_len; ++i) {
    const double src_idx = i * static_cast<double>(last) / (new_len - 1);
    const std::size_t idx0 = std::min(static_cast<std::size_t>(src_idx), last);
    const std::size_t idx1 = std::min(idx0 + 1, last);
    const double frac = src_idx - static_cast<double>(idx0);
    resampled[i] = static_cast<float>(audio[idx0] * (1.0 - frac) + audio[idx1] * frac);
  }
  return resampled;
}

}  // namespace

// Minimal WAV (RIFF/WAVE) loader for this example.
//
// Supports 16-bit PCM, 32-bit PCM and 32-bit IEEE-float data with any number of channels;
// channels are averaged into mono. WAVE_FORMAT_EXTENSIBLE headers are resolved through their
// sub-format. Header fields and samples are read directly into host integers/floats, so a
// little-endian host is assumed.
//
// Returns mono float32 samples (integer PCM is divided by its full-scale value), linearly
// resampled to `target_sample_rate` when the file uses a different rate.
// Throws std::runtime_error if the file cannot be opened, is malformed, or uses an
// unsupported encoding.
std::vector<float> LoadWav(const std::string& path, int target_sample_rate) {
  std::ifstream file(path, std::ios::binary);
  if (!file.is_open()) {
    throw std::runtime_error("Cannot open audio file: " + path);
  }

  // RIFF header: "RIFF" <uint32 size> "WAVE".
  char riff[4];
  if (!file.read(riff, 4) || std::memcmp(riff, "RIFF", 4) != 0) {
    throw std::runtime_error("Not a valid WAV file (missing RIFF header)");
  }

  file.seekg(4, std::ios::cur);  // Skip file size

  char wave[4];
  if (!file.read(wave, 4) || std::memcmp(wave, "WAVE", 4) != 0) {
    throw std::runtime_error("Not a valid WAV file (missing WAVE marker)");
  }

  bool have_format = false;
  uint16_t format_tag = 0;
  uint16_t num_channels = 0;
  uint32_t sample_rate = 0;
  uint16_t bits_per_sample = 0;

  // Walk the chunk list; stop as soon as a chunk header can no longer be read completely.
  char chunk_id[4];
  uint32_t chunk_size = 0;
  while (file.read(chunk_id, 4) && file.read(reinterpret_cast<char*>(&chunk_size), 4)) {
    // RIFF chunks are word-aligned: an odd-sized chunk is followed by one pad byte.
    const std::streamoff padding = static_cast<std::streamoff>(chunk_size & 1u);

    if (std::memcmp(chunk_id, "fmt ", 4) == 0) {
      if (chunk_size < 16) {
        throw std::runtime_error("Invalid WAV file (fmt chunk too small)");
      }
      file.read(reinterpret_cast<char*>(&format_tag), 2);
      file.read(reinterpret_cast<char*>(&num_channels), 2);
      file.read(reinterpret_cast<char*>(&sample_rate), 4);
      file.seekg(6, std::ios::cur);  // Skip byte rate + block align
      file.read(reinterpret_cast<char*>(&bits_per_sample), 2);
      std::streamoff consumed = 16;

      if (format_tag == kWavFormatExtensible && chunk_size >= 26) {
        // Skip cbSize (2) + valid bits (2) + channel mask (4); the first two bytes of the
        // SubFormat GUID carry the actual format tag.
        file.seekg(8, std::ios::cur);
        file.read(reinterpret_cast<char*>(&format_tag), 2);
        consumed = 26;
      }

      file.seekg(static_cast<std::streamoff>(chunk_size) - consumed + padding, std::ios::cur);
      if (!file) {
        throw std::runtime_error("Invalid WAV file (truncated fmt chunk)");
      }
      have_format = true;
    } else if (std::memcmp(chunk_id, "data", 4) == 0) {
      if (!have_format) {
        throw std::runtime_error("Invalid WAV file (data chunk precedes fmt chunk)");
      }
      if (num_channels == 0 || sample_rate == 0) {
        throw std::runtime_error("Invalid WAV file (zero channels or sample rate)");
      }
      if (bits_per_sample != 16 && bits_per_sample != 32) {
        throw std::runtime_error("Unsupported bits per sample: " + std::to_string(bits_per_sample));
      }
      const bool is_pcm = format_tag == kWavFormatPcm;
      const bool is_float32 = format_tag == kWavFormatIeeeFloat && bits_per_sample == 32;
      if (!is_pcm && !is_float32) {
        throw std::runtime_error("Unsupported WAV format tag: " + std::to_string(format_tag));
      }

      // Never trust the declared size beyond what the file really contains (truncated files,
      // streamed files with a placeholder size).
      const std::streampos data_start = file.tellg();
      file.seekg(0, std::ios::end);
      const std::streampos file_end = file.tellg();
      file.seekg(data_start, std::ios::beg);
      if (!file || file_end < data_start) {
        throw std::runtime_error("Invalid WAV file (cannot determine data size)");
      }
      const uint64_t available = static_cast<uint64_t>(file_end - data_start);
      const uint64_t data_bytes = std::min<uint64_t>(chunk_size, available);

      const std::size_t frame_bytes = static_cast<std::size_t>(bits_per_sample / 8) * num_channels;
      const std::size_t num_frames = static_cast<std::size_t>(data_bytes / frame_bytes);
      const std::size_t num_values = num_frames * num_channels;

      std::vector<float> audio;
      if (bits_per_sample == 16) {
        audio = DownmixToMono(ReadSamples<int16_t>(file, num_values), num_channels, 32768.0f);
      } else if (is_pcm) {
        audio = DownmixToMono(ReadSamples<int32_t>(file, num_values), num_channels, 2147483648.0f);
      } else {
        audio = DownmixToMono(ReadSamples<float>(file, num_values), num_channels, 1.0f);
      }

      // Basic resampling if needed (linear interpolation)
      if (static_cast<int64_t>(sample_rate) != static_cast<int64_t>(target_sample_rate)) {
        return ResampleLinear(audio, static_cast<double>(sample_rate), target_sample_rate);
      }
      return audio;
    } else {
      file.seekg(static_cast<std::streamoff>(chunk_size) + padding, std::ios::cur);
    }
  }

  throw std::runtime_error("No data chunk found in WAV file");
}

// Runs the generator until it reports done, echoing each decoded text piece to stdout as it
// arrives. Only the first entry of every GetNextTokens() result is decoded.
// Returns the concatenation of all non-empty pieces that were printed.
std::string DecodeTokens(OgaGenerator& generator, OgaTokenizerStream& tokenizer_stream) {
  std::string decoded;
  while (!generator.IsDone()) {
    generator.GenerateNextToken();

    auto tokens = generator.GetNextTokens();
    if (tokens.empty()) {
      continue;
    }

    const char* piece = tokenizer_stream.Decode(tokens[0]);
    if (piece == nullptr || *piece == '\0') {
      continue;  // Nothing printable for this token.
    }

    std::cout << piece << std::flush;
    decoded += piece;
  }
  return decoded;
}

void StreamingTranscribe(const std::string& model_path, const std::string& audio_path) {
auto [sample_rate, chunk_samples] = LoadConfig(model_path);

std::cout << "Loading audio: " << audio_path << std::endl;
auto audio = LoadWav(audio_path, sample_rate);
double duration = static_cast<double>(audio.size()) / sample_rate;

std::cout << "Loading model: " << model_path << std::endl;
auto config = OgaConfig::Create(model_path.c_str());
auto model = OgaModel::Create(*config);
auto processor = OgaStreamingProcessor::Create(*model);
auto tokenizer = OgaTokenizer::Create(*model);
auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer);
auto params = OgaGeneratorParams::Create(*model);
auto generator = OgaGenerator::Create(*model, *params);

std::cout << " Sample rate: " << sample_rate << ", Chunk: " << chunk_samples << " samples" << std::endl;
std::cout << " Audio duration: " << duration << "s" << std::endl;
std::cout << std::string(60, '-') << std::endl;

auto start = std::chrono::high_resolution_clock::now();
std::string full_transcript;

// Stream audio in chunks
for (size_t i = 0; i < audio.size(); i += chunk_samples) {
size_t remaining = std::min(static_cast<size_t>(chunk_samples), audio.size() - i);
auto inputs = processor->Process(audio.data() + i, remaining);
if (inputs) {
generator->SetInputs(*inputs);
full_transcript += DecodeTokens(*generator, *tokenizer_stream);
}
}

// Flush remaining audio
{
auto inputs = processor->Flush();
if (inputs && inputs.get()) {
generator->SetInputs(*inputs);
full_transcript += DecodeTokens(*generator, *tokenizer_stream);
}
}

auto end = std::chrono::high_resolution_clock::now();
double wall_time = std::chrono::duration<double>(end - start).count();

std::cout << "\n"
<< std::string(60, '=') << std::endl;
std::cout << " " << full_transcript << std::endl;
std::cout << std::string(60, '=') << std::endl;
std::cout << " Audio: " << duration << "s | Wall: " << wall_time << "s | RTF: " << (duration / wall_time) << "x" << std::endl;
}

// Entry point: parses --model_path / --audio_file and runs the streaming transcription.
// Returns 0 on success and 1 on a usage error or when transcription throws.
int main(int argc, char* argv[]) {
  if (argc < 3) {
    std::cerr << "Usage: " << argv[0] << " --model_path <path> --audio_file <path>" << std::endl;
    return 1;
  }

  std::string model_path;
  std::string audio_file;

  // A flag is only honoured when a value follows it; anything else is skipped.
  int pos = 1;
  while (pos < argc) {
    const std::string flag(argv[pos]);
    const bool has_value = pos + 1 < argc;
    if (has_value && flag == "--model_path") {
      model_path = argv[pos + 1];
      pos += 2;
    } else if (has_value && flag == "--audio_file") {
      audio_file = argv[pos + 1];
      pos += 2;
    } else {
      ++pos;
    }
  }

  if (model_path.empty() || audio_file.empty()) {
    std::cerr << "Both --model_path and --audio_file are required." << std::endl;
    return 1;
  }

  int exit_code = 0;
  try {
    StreamingTranscribe(model_path, audio_file);
  } catch (const std::exception& e) {
    std::cerr << "Error: " << e.what() << std::endl;
    exit_code = 1;
  }
  return exit_code;
}
22 changes: 22 additions & 0 deletions examples/csharp/NemotronSpeech/NemotronSpeech.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<Project Sdk="Microsoft.NET.Sdk">

Comment thread
kunal-vaishnavi marked this conversation as resolved.
<!-- Console executable targeting .NET 8 with nullable reference types and implicit usings enabled. -->
<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
<OutputType>Exe</OutputType>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<!-- NOTE(review): Program.cs has no unsafe blocks; presumably required by the linked Common.cs (confirm). -->
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
</PropertyGroup>

<!-- The build configuration selects the GenAI package: Debug/Release reference
     Microsoft.ML.OnnxRuntimeGenAI, Debug_Cuda/Release_Cuda reference the .Cuda package. -->
<ItemGroup>
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.12.0" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.12.0" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
<!-- NAudio: Program.cs uses it to read the audio file, downmix to mono and resample. -->
<PackageReference Include="NAudio" Version="2.2.1" />
<!-- NOTE(review): not referenced by Program.cs; presumably needed by the linked Common.cs (confirm). -->
<PackageReference Include="System.CommandLine" Version="2.0.1" />
</ItemGroup>

<!-- Compiles ../Common/Common.cs into this project; Program.cs calls Common.GetConfig from it. -->
<ItemGroup>
<Compile Include="../Common/Common.cs" Link="Common/Common.cs" />
</ItemGroup>

</Project>
101 changes: 101 additions & 0 deletions examples/csharp/NemotronSpeech/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

using CommonUtils;
using Microsoft.ML.OnnxRuntimeGenAI;
using NAudio.Wave;
using NAudio.Wave.SampleProviders;
using System.Text.Json;

// Streaming speech-to-text sample: loads an audio file, feeds it to the StreamingProcessor in
// fixed-size chunks and prints the text decoded by the Generator as it becomes available.
if (args.Length < 2) {
    Console.WriteLine("Usage: NemotronSpeech <model_path> <audio_file.wav> [execution_provider]");
    Environment.ExitCode = 1;  // A usage error should not look like a successful run.
    return;
}

string modelPath = args[0];
string audioFile = args[1];
string executionProvider = args.Length > 2 ? args[2] : "follow_config";

// Read sample_rate and chunk_samples from genai_config.json.
// JsonDocument is IDisposable, so release it as soon as the two values are extracted.
int sampleRate;
int chunkSize;
using (var configJson = JsonDocument.Parse(File.ReadAllText(Path.Combine(modelPath, "genai_config.json")))) {
    var modelConfig = configJson.RootElement.GetProperty("model");
    sampleRate = modelConfig.GetProperty("sample_rate").GetInt32();
    chunkSize = modelConfig.GetProperty("chunk_samples").GetInt32();
}

// A chunk size of zero (or less) would keep the streaming loop below from ever advancing.
if (sampleRate <= 0 || chunkSize <= 0) {
    Console.Error.WriteLine("genai_config.json must define positive model.sample_rate and model.chunk_samples.");
    Environment.ExitCode = 1;
    return;
}

// Load audio, convert to mono, and resample to match the model's expected sample rate
float[] audio = LoadAudio(audioFile, sampleRate);
Console.WriteLine($"Audio: {audio.Length / (double)sampleRate:F1}s ({audio.Length} samples)");

using var config = Common.GetConfig(path: modelPath, ep: executionProvider, null, new GeneratorParamsArgs());
using var model = new Model(config);
using var processor = new StreamingProcessor(model);
using var tokenizer = new Tokenizer(model);
using var tokenizerStream = tokenizer.CreateStream();
using var genParams = new GeneratorParams(model);
using var generator = new Generator(model, genParams);
Console.WriteLine(new string('-', 60));
string fullTranscript = "";

// Stream the audio chunk by chunk; Process may return null, in which case there is nothing to decode yet.
for (int i = 0; i < audio.Length; i += chunkSize) {
    int remaining = Math.Min(chunkSize, audio.Length - i);
    float[] chunk = new float[remaining];
    Array.Copy(audio, i, chunk, 0, remaining);

    using var inputs = processor.Process(chunk);
    if (inputs != null) {
        generator.SetInputs(inputs);
        fullTranscript += DecodeTokens(generator, tokenizerStream);
    }
}

// Flush remaining buffered audio
using var flushInputs = processor.Flush();
if (flushInputs != null) {
    generator.SetInputs(flushInputs);
    fullTranscript += DecodeTokens(generator, tokenizerStream);
}

Console.WriteLine($"\n{new string('=', 60)}");
Console.WriteLine($"  {fullTranscript.Trim()}");
Console.WriteLine(new string('=', 60));

// Drives the generator until it reports done, writing each decoded text piece to the console
// as soon as it is available. Only the first entry of each GetNextTokens() result is decoded.
// Returns the concatenation of every non-empty piece that was written.
static string DecodeTokens(Generator generator, TokenizerStream tokenizerStream) {
    string decoded = "";
    while (!generator.IsDone()) {
        generator.GenerateNextToken();

        var nextTokens = generator.GetNextTokens();
        if (nextTokens.Length == 0) {
            continue;
        }

        string piece = tokenizerStream.Decode(nextTokens[0]);
        if (string.IsNullOrEmpty(piece)) {
            continue;  // Nothing printable for this token.
        }

        Console.Write(piece);
        decoded += piece;
    }
    return decoded;
}

// Loads an audio file as mono float samples at targetSampleRate.
//
// Stereo input is downmixed with NAudio's StereoToMonoSampleProvider before resampling.
// That provider only accepts exactly two channels, so input with more than two channels is
// resampled as-is and then downmixed here by averaging the channels of every frame.
static float[] LoadAudio(string path, int targetSampleRate) {
    using var reader = new AudioFileReader(path);

    // Convert stereo to mono up front
    ISampleProvider source = reader;
    if (source.WaveFormat.Channels == 2) {
        source = new StereoToMonoSampleProvider(source);
    }

    // Resample if needed
    if (source.WaveFormat.SampleRate != targetSampleRate) {
        source = new WdlResamplingSampleProvider(source, targetSampleRate);
    }

    // 1 for mono/stereo input; the original channel count for input with more than two channels.
    int channels = source.WaveFormat.Channels;

    var samples = new List<float>();
    // Read buffer; any size works, a multiple of the channel count keeps reads frame-aligned.
    float[] buffer = new float[4096 * channels];
    float frameSum = 0f;
    int channelsSeen = 0;
    int read;
    while ((read = source.Read(buffer, 0, buffer.Length)) > 0) {
        for (int i = 0; i < read; i++) {
            frameSum += buffer[i];
            channelsSeen++;
            if (channelsSeen == channels) {
                // One complete frame collected: emit its channel average.
                samples.Add(frameSum / channels);
                frameSum = 0f;
                channelsSeen = 0;
            }
        }
    }
    return samples.ToArray();
}
Loading
Loading