-
Notifications
You must be signed in to change notification settings - Fork 291
Nemotron ASR Support for Streaming #1997
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
70 commits
Select commit
Hold shift + click to select a range
31d6779
nemotron support
nenad1002 9026781
ONNX 2 good version
nenad1002 8c6f4ed
Nemotron support
nenad1002 9dd6212
Support 4
nenad1002 8b0de45
First stream
nenad1002 0d83168
Overlap support
nenad1002 d2ff912
Nemotron support stream 3
nenad1002 c7ed0c9
Mi fix
nenad1002 b83b84f
Move mel stuff to separate file
nenad1002 ff275b4
Remove mel spectrogram
nenad1002 32001f1
Revert non-needed changes
nenad1002 131db0c
Make sure genai_config.json defines model params
nenad1002 b262003
Point to latest extensions
nenad1002 5cc511d
Add tests
nenad1002 5cf0c59
Add a better test
nenad1002 f670d36
Remove text tokenizer and sr to genaiconfig
nenad1002 6e23d87
Remove dead code
nenad1002 fd47344
Abstract streaming ASR class
nenad1002 98d6e54
remove processor
nenad1002 06d05d0
Fix merge conflict
nenad1002 46a166d
Clean more code
nenad1002 8a5e912
Clean up examples
nenad1002 e10086f
Performance optimizations
nenad1002 5eb10ff
More cleaning
nenad1002 89b8bd5
Try removing warning
nenad1002 880143b
Add flag to tests
nenad1002 092d212
fix formatting
nenad1002 67e649c
Resolve Copilot comments
nenad1002 5be81c9
Fix formatting issue
nenad1002 b3c6411
Merge branch 'main' into nebanfic/nemotron-support-stream-3
nenad1002 165037b
Remove soundfile
nenad1002 5097afd
Remove dead tokenizer code
nenad1002 98e81b7
Adjust genai config to our exported models
nenad1002 0a6d87b
Resolve more comments
nenad1002 70f4e23
Avoid memset, memcpy and manual copy on GPU and whenever possible, ri…
nenad1002 4d8a0f5
Add consistency
nenad1002 96dafca
Big improvement - cache locality for frames
nenad1002 2499ab4
Csharp support
nenad1002 51a61c7
Add a check to the factory for StreamingASR
nenad1002 9e28df9
nemotron generator
nenad1002 8a1bef0
remove ProcessChunk from model.h
nenad1002 154b5aa
remove generate_next_tokens()
nenad1002 571f300
Rename processor
nenad1002 f61fc0a
C# sample and remove unnecessary files
nenad1002 8596fc1
Fix all
nenad1002 b762766
more fixes
nenad1002 e2ab1e7
samples change
nenad1002 4059341
Introduce NamedTensors on streaming processor
nenad1002 ca9a9f3
Remove speech section in genai_config
nenad1002 282a9f0
Reverse NativeMethods.cs formatting
nenad1002 dc46428
Some refactoring
nenad1002 7c636ba
Make streaming processor abstract class
nenad1002 33f809b
set_inputs
nenad1002 66ee360
Copilot suggestions
nenad1002 27ce2e5
Examples changes
nenad1002 8723d3b
More comments resolved
nenad1002 96ce812
SubStates
nenad1002 df8cb9b
More changes
nenad1002 c5ed7df
Resolving more comments
nenad1002 02c5fde
Mass copy
nenad1002 101113d
Copilot fixes
nenad1002 640e9af
Merge conflict fix
nenad1002 7d023ef
Potential fix for code scanning alert no. 798: Unused local variable
nenad1002 7283dd1
Fix clang
nenad1002 a3f77e4
Run clang
nenad1002 658e8de
fix tests
nenad1002 78b84a2
Resolve comments
nenad1002 ff890a9
Add C++ example
nenad1002 9473ab8
Semicolon on another line
nenad1002 8e683f3
Add C# sample readme
nenad1002 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,240 @@ | ||
| // Copyright (c) Microsoft Corporation. All rights reserved. | ||
| // Licensed under the MIT License. | ||
| // | ||
| // nemotron_speech.cpp — Streaming ASR example using StreamingProcessor + Generator API. | ||
| // | ||
| // Usage: | ||
| // ./nemotron_speech --model_path /path/to/nemotron-model --audio_file /path/to/audio.wav | ||
|
|
||
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <istream>
#include <stdexcept>
#include <string>
#include <vector>

#include <nlohmann/json.hpp>
#include "ort_genai.h"
|
|
||
// Audio parameters read from the "model" section of genai_config.json.
// Members default to 0 so a default-constructed AudioConfig is never left
// holding indeterminate values; the type remains an aggregate, so brace
// initialization and structured bindings keep working.
struct AudioConfig {
  int sample_rate = 0;    // Value of model.sample_rate.
  int chunk_samples = 0;  // Value of model.chunk_samples: samples fed to the processor per streaming step.
};
|
|
||
| AudioConfig LoadConfig(const std::string& model_path) { | ||
| std::string config_path = model_path + "/genai_config.json"; | ||
| std::ifstream f(config_path); | ||
| if (!f.is_open()) { | ||
| throw std::runtime_error("Cannot open " + config_path); | ||
| } | ||
| auto config = nlohmann::json::parse(f); | ||
| return { | ||
| config["model"]["sample_rate"].get<int>(), | ||
| config["model"]["chunk_samples"].get<int>(), | ||
| }; | ||
| } | ||
|
|
||
// Reads exactly `count` bytes from `in` into `dst`; returns false on a short read.
static bool ReadExact(std::istream& in, void* dst, std::streamsize count) {
  in.read(static_cast<char*>(dst), count);
  return in.gcount() == count;
}

// Averages interleaved multi-channel samples into mono and divides by `scale`
// to normalize to [-1, 1].
template <typename Sample>
static std::vector<float> DownmixToMono(const std::vector<Sample>& interleaved, std::size_t num_frames,
                                        int num_channels, float scale) {
  std::vector<float> mono(num_frames);
  for (std::size_t frame = 0; frame < num_frames; ++frame) {
    float sum = 0.0f;
    for (int c = 0; c < num_channels; ++c) {
      sum += static_cast<float>(interleaved[frame * num_channels + c]);
    }
    mono[frame] = (sum / num_channels) / scale;
  }
  return mono;
}

// Linear-interpolation resampler mapping the first/last input samples onto the
// first/last output samples. Both rates must be positive.
static std::vector<float> ResampleLinear(const std::vector<float>& audio, double source_rate, double target_rate) {
  if (audio.empty()) {
    return {};
  }
  const std::size_t new_len = static_cast<std::size_t>(audio.size() * target_rate / source_rate);
  std::vector<float> resampled(new_len);
  const std::size_t last = audio.size() - 1;
  for (std::size_t i = 0; i < new_len; ++i) {
    // With a single output sample there is no span to interpolate over;
    // avoid the 0/0 division and take the first input sample.
    const double src_idx = new_len > 1 ? static_cast<double>(i) * static_cast<double>(last) / static_cast<double>(new_len - 1) : 0.0;
    const std::size_t idx0 = std::min(static_cast<std::size_t>(src_idx), last);
    const std::size_t idx1 = std::min(idx0 + 1, last);
    const double frac = src_idx - static_cast<double>(idx0);
    resampled[i] = static_cast<float>(audio[idx0] * (1.0 - frac) + audio[idx1] * frac);
  }
  return resampled;
}

// Simple WAV loader for RIFF/WAVE files holding 16-bit PCM, 32-bit PCM or
// 32-bit IEEE-float samples with one or more channels.
//
// Returns mono float32 samples normalized to [-1, 1] (channels are averaged),
// linearly resampled to `target_sample_rate` when the file's rate differs.
// Throws std::runtime_error when the file cannot be opened, is not a valid
// WAV file, uses an unsupported sample width, or contains no data chunk.
// Header fields are read in host byte order, so a little-endian host is assumed.
std::vector<float> LoadWav(const std::string& path, int target_sample_rate) {
  if (target_sample_rate <= 0) {
    throw std::runtime_error("Target sample rate must be positive");
  }

  std::ifstream file(path, std::ios::binary);
  if (!file.is_open()) {
    throw std::runtime_error("Cannot open audio file: " + path);
  }

  // RIFF header: "RIFF" <file size> "WAVE"
  char riff[4] = {};
  if (!ReadExact(file, riff, 4) || std::memcmp(riff, "RIFF", 4) != 0) {
    throw std::runtime_error("Not a valid WAV file (missing RIFF header)");
  }
  file.seekg(4, std::ios::cur);  // Skip file size
  char wave[4] = {};
  if (!ReadExact(file, wave, 4) || std::memcmp(wave, "WAVE", 4) != 0) {
    throw std::runtime_error("Not a valid WAV file (missing WAVE marker)");
  }

  constexpr std::uint16_t kFormatPcm = 1;
  constexpr std::uint16_t kFormatExtensible = 0xFFFE;

  std::uint16_t audio_format = 0;
  std::uint16_t num_channels = 0;
  std::uint32_t sample_rate = 0;
  std::uint16_t bits_per_sample = 0;
  bool have_fmt = false;

  char chunk_id[4];
  std::uint32_t chunk_size = 0;
  // Stop as soon as a chunk header cannot be read in full (end of file).
  while (ReadExact(file, chunk_id, 4) && ReadExact(file, &chunk_size, 4)) {
    // RIFF chunks are word-aligned: an odd-sized chunk is followed by one pad byte.
    const std::streamoff padded_size = static_cast<std::streamoff>(chunk_size) + (chunk_size & 1u);

    if (std::memcmp(chunk_id, "fmt ", 4) == 0) {
      unsigned char fmt[16];
      if (chunk_size < sizeof(fmt) || !ReadExact(file, fmt, sizeof(fmt))) {
        throw std::runtime_error("Not a valid WAV file (truncated fmt chunk)");
      }
      std::memcpy(&audio_format, fmt + 0, 2);
      std::memcpy(&num_channels, fmt + 2, 2);
      std::memcpy(&sample_rate, fmt + 4, 4);
      // Bytes 8..13 are byte rate + block align, which are not needed.
      std::memcpy(&bits_per_sample, fmt + 14, 2);

      std::streamoff consumed = sizeof(fmt);
      if (audio_format == kFormatExtensible && chunk_size >= 40) {
        // WAVE_FORMAT_EXTENSIBLE: cbSize(2) + valid bits(2) + channel mask(4),
        // then the SubFormat GUID whose first two bytes are the real format code.
        unsigned char ext[10];
        if (!ReadExact(file, ext, sizeof(ext))) {
          throw std::runtime_error("Not a valid WAV file (truncated fmt chunk)");
        }
        std::memcpy(&audio_format, ext + 8, 2);
        consumed += sizeof(ext);
      }
      file.seekg(padded_size - consumed, std::ios::cur);
      have_fmt = true;
    } else if (std::memcmp(chunk_id, "data", 4) == 0) {
      if (!have_fmt) {
        throw std::runtime_error("Not a valid WAV file (data chunk before fmt chunk)");
      }
      if (num_channels == 0 || sample_rate == 0) {
        throw std::runtime_error("Not a valid WAV file (zero channels or sample rate)");
      }
      if (bits_per_sample != 16 && bits_per_sample != 32) {
        throw std::runtime_error("Unsupported bits per sample: " + std::to_string(bits_per_sample));
      }

      // Never trust the declared size beyond what the file actually holds
      // (truncated files, or streamed WAVs that declare a placeholder size).
      const std::streampos data_start = file.tellg();
      file.seekg(0, std::ios::end);
      const std::streamoff available = file.tellg() - data_start;
      file.seekg(data_start);
      const std::uint64_t data_bytes =
          std::min<std::uint64_t>(chunk_size, available > 0 ? static_cast<std::uint64_t>(available) : 0);

      const std::size_t frame_bytes = static_cast<std::size_t>(bits_per_sample / 8) * num_channels;
      const std::size_t num_frames = static_cast<std::size_t>(data_bytes / frame_bytes);
      const std::size_t num_values = num_frames * num_channels;
      const std::streamsize read_bytes = static_cast<std::streamsize>(num_frames * frame_bytes);

      std::vector<float> audio;
      if (bits_per_sample == 16) {
        std::vector<std::int16_t> raw(num_values);
        if (!ReadExact(file, raw.data(), read_bytes)) {
          throw std::runtime_error("Failed to read WAV sample data");
        }
        audio = DownmixToMono(raw, num_frames, num_channels, 32768.0f);
      } else if (audio_format == kFormatPcm) {
        // 32-bit integer PCM
        std::vector<std::int32_t> raw(num_values);
        if (!ReadExact(file, raw.data(), read_bytes)) {
          throw std::runtime_error("Failed to read WAV sample data");
        }
        audio = DownmixToMono(raw, num_frames, num_channels, 2147483648.0f);
      } else {
        // IEEE float32; any other 32-bit format code keeps the historical
        // "assume float32" behaviour.
        std::vector<float> raw(num_values);
        if (!ReadExact(file, raw.data(), read_bytes)) {
          throw std::runtime_error("Failed to read WAV sample data");
        }
        audio = DownmixToMono(raw, num_frames, num_channels, 1.0f);
      }

      // Basic resampling if needed (linear interpolation)
      if (static_cast<std::int64_t>(sample_rate) != static_cast<std::int64_t>(target_sample_rate)) {
        return ResampleLinear(audio, static_cast<double>(sample_rate), static_cast<double>(target_sample_rate));
      }
      return audio;
    } else {
      file.seekg(padded_size, std::ios::cur);
    }
  }

  throw std::runtime_error("No data chunk found in WAV file");
}
|
|
||
| std::string DecodeTokens(OgaGenerator& generator, OgaTokenizerStream& tokenizer_stream) { | ||
| std::string text; | ||
| while (!generator.IsDone()) { | ||
| generator.GenerateNextToken(); | ||
| auto next_tokens = generator.GetNextTokens(); | ||
| if (!next_tokens.empty()) { | ||
| const char* token_text = tokenizer_stream.Decode(next_tokens[0]); | ||
| if (token_text && token_text[0] != '\0') { | ||
| std::cout << token_text << std::flush; | ||
| text += token_text; | ||
| } | ||
| } | ||
| } | ||
| return text; | ||
| } | ||
|
|
||
| void StreamingTranscribe(const std::string& model_path, const std::string& audio_path) { | ||
| auto [sample_rate, chunk_samples] = LoadConfig(model_path); | ||
|
|
||
| std::cout << "Loading audio: " << audio_path << std::endl; | ||
| auto audio = LoadWav(audio_path, sample_rate); | ||
| double duration = static_cast<double>(audio.size()) / sample_rate; | ||
|
|
||
| std::cout << "Loading model: " << model_path << std::endl; | ||
| auto config = OgaConfig::Create(model_path.c_str()); | ||
| auto model = OgaModel::Create(*config); | ||
| auto processor = OgaStreamingProcessor::Create(*model); | ||
| auto tokenizer = OgaTokenizer::Create(*model); | ||
| auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer); | ||
| auto params = OgaGeneratorParams::Create(*model); | ||
| auto generator = OgaGenerator::Create(*model, *params); | ||
|
|
||
| std::cout << " Sample rate: " << sample_rate << ", Chunk: " << chunk_samples << " samples" << std::endl; | ||
| std::cout << " Audio duration: " << duration << "s" << std::endl; | ||
| std::cout << std::string(60, '-') << std::endl; | ||
|
|
||
| auto start = std::chrono::high_resolution_clock::now(); | ||
| std::string full_transcript; | ||
|
|
||
| // Stream audio in chunks | ||
| for (size_t i = 0; i < audio.size(); i += chunk_samples) { | ||
| size_t remaining = std::min(static_cast<size_t>(chunk_samples), audio.size() - i); | ||
| auto inputs = processor->Process(audio.data() + i, remaining); | ||
| if (inputs) { | ||
| generator->SetInputs(*inputs); | ||
| full_transcript += DecodeTokens(*generator, *tokenizer_stream); | ||
| } | ||
| } | ||
|
|
||
| // Flush remaining audio | ||
| { | ||
| auto inputs = processor->Flush(); | ||
| if (inputs && inputs.get()) { | ||
| generator->SetInputs(*inputs); | ||
| full_transcript += DecodeTokens(*generator, *tokenizer_stream); | ||
| } | ||
| } | ||
|
|
||
| auto end = std::chrono::high_resolution_clock::now(); | ||
| double wall_time = std::chrono::duration<double>(end - start).count(); | ||
|
|
||
| std::cout << "\n" | ||
| << std::string(60, '=') << std::endl; | ||
| std::cout << " " << full_transcript << std::endl; | ||
| std::cout << std::string(60, '=') << std::endl; | ||
| std::cout << " Audio: " << duration << "s | Wall: " << wall_time << "s | RTF: " << (duration / wall_time) << "x" << std::endl; | ||
| } | ||
|
|
||
| int main(int argc, char* argv[]) { | ||
| if (argc < 3) { | ||
| std::cerr << "Usage: " << argv[0] << " --model_path <path> --audio_file <path>" << std::endl; | ||
| return 1; | ||
| } | ||
|
|
||
| std::string model_path; | ||
| std::string audio_file; | ||
|
|
||
| for (int i = 1; i < argc; i++) { | ||
| if (std::string(argv[i]) == "--model_path" && i + 1 < argc) { | ||
| model_path = argv[++i]; | ||
| } else if (std::string(argv[i]) == "--audio_file" && i + 1 < argc) { | ||
| audio_file = argv[++i]; | ||
| } | ||
| } | ||
|
|
||
| if (model_path.empty() || audio_file.empty()) { | ||
| std::cerr << "Both --model_path and --audio_file are required." << std::endl; | ||
| return 1; | ||
| } | ||
|
|
||
| try { | ||
| StreamingTranscribe(model_path, audio_file); | ||
| } catch (const std::exception& e) { | ||
| std::cerr << "Error: " << e.what() << std::endl; | ||
| return 1; | ||
| } | ||
|
|
||
| return 0; | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| <Project Sdk="Microsoft.NET.Sdk"> | ||
|
|
||
| <PropertyGroup> | ||
| <TargetFramework>net8.0</TargetFramework> | ||
| <OutputType>Exe</OutputType> | ||
| <Nullable>enable</Nullable> | ||
| <ImplicitUsings>enable</ImplicitUsings> | ||
| <AllowUnsafeBlocks>true</AllowUnsafeBlocks> | ||
| </PropertyGroup> | ||
|
|
||
| <ItemGroup> | ||
| <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.12.0" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " /> | ||
| <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.12.0" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " /> | ||
| <PackageReference Include="NAudio" Version="2.2.1" /> | ||
| <PackageReference Include="System.CommandLine" Version="2.0.1" /> | ||
| </ItemGroup> | ||
|
|
||
| <ItemGroup> | ||
| <Compile Include="../Common/Common.cs" Link="Common/Common.cs" /> | ||
| </ItemGroup> | ||
|
|
||
| </Project> | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,101 @@ | ||
| // Copyright (c) Microsoft Corporation. All rights reserved. | ||
| // Licensed under the MIT License. | ||
|
|
||
| using CommonUtils; | ||
| using Microsoft.ML.OnnxRuntimeGenAI; | ||
| using NAudio.Wave; | ||
| using NAudio.Wave.SampleProviders; | ||
| using System.Text.Json; | ||
|
|
||
// Streaming ASR sample: loads an audio file, feeds it to the StreamingProcessor
// in fixed-size chunks and prints the decoded transcript as it is produced.
if (args.Length < 2) {
    Console.WriteLine("Usage: NemotronSpeech <model_path> <audio_file.wav> [execution_provider]");
    return;
}

string modelPath = args[0];
string audioFile = args[1];
string executionProvider = args.Length > 2 ? args[2] : "follow_config";

// Read sample_rate and chunk_samples from genai_config.json.
// JsonDocument rents pooled buffers, so dispose it as soon as the values are read.
int sampleRate;
int chunkSize;
using (JsonDocument configJson = JsonDocument.Parse(File.ReadAllText(Path.Combine(modelPath, "genai_config.json")))) {
    JsonElement modelConfig = configJson.RootElement.GetProperty("model");
    sampleRate = modelConfig.GetProperty("sample_rate").GetInt32();
    chunkSize = modelConfig.GetProperty("chunk_samples").GetInt32();
}

// A non-positive chunk size would make the chunk loop below never advance.
if (sampleRate <= 0 || chunkSize <= 0) {
    throw new InvalidDataException("genai_config.json must define positive model.sample_rate and model.chunk_samples");
}

// Load audio, convert to mono, and resample to match the model's expected sample rate
float[] audio = LoadAudio(audioFile, sampleRate);
Console.WriteLine($"Audio: {audio.Length / (double)sampleRate:F1}s ({audio.Length} samples)");

using var config = Common.GetConfig(path: modelPath, ep: executionProvider, null, new GeneratorParamsArgs());
using var model = new Model(config);
using var processor = new StreamingProcessor(model);
using var tokenizer = new Tokenizer(model);
using var tokenizerStream = tokenizer.CreateStream();
using var genParams = new GeneratorParams(model);
using var generator = new Generator(model, genParams);
Console.WriteLine(new string('-', 60));
string fullTranscript = "";

// Stream the audio chunk by chunk; the last chunk may be shorter than chunkSize.
for (int i = 0; i < audio.Length; i += chunkSize) {
    int remaining = Math.Min(chunkSize, audio.Length - i);
    float[] chunk = new float[remaining];
    Array.Copy(audio, i, chunk, 0, remaining);

    // Process() may return null; decode only when it yields inputs.
    using var inputs = processor.Process(chunk);
    if (inputs != null) {
        generator.SetInputs(inputs);
        fullTranscript += DecodeTokens(generator, tokenizerStream);
    }
}

// Flush remaining buffered audio
using var flushInputs = processor.Flush();
if (flushInputs != null) {
    generator.SetInputs(flushInputs);
    fullTranscript += DecodeTokens(generator, tokenizerStream);
}

Console.WriteLine($"\n{new string('=', 60)}");
Console.WriteLine($"  {fullTranscript.Trim()}");
Console.WriteLine(new string('=', 60));
|
|
||
// Runs the generator until it reports completion, decoding the first token of
// each step. Every non-empty decoded piece is written to the console right
// away and appended to the returned text.
static string DecodeTokens(Generator generator, TokenizerStream tokenizerStream) {
    string decoded = "";
    while (true) {
        if (generator.IsDone()) {
            break;
        }

        generator.GenerateNextToken();
        var stepTokens = generator.GetNextTokens();
        if (stepTokens.Length <= 0) {
            continue;
        }

        string piece = tokenizerStream.Decode(stepTokens[0]);
        if (string.IsNullOrEmpty(piece)) {
            continue;
        }

        Console.Write(piece);
        decoded += piece;
    }
    return decoded;
}
|
|
||
// Loads the audio file at `path` and returns mono float samples at
// `targetSampleRate`.
//
// Stereo input is mixed down with NAudio's StereoToMonoSampleProvider. Input
// with more than two channels is averaged manually after reading, because
// StereoToMonoSampleProvider only accepts exactly two channels.
static float[] LoadAudio(string path, int targetSampleRate) {
    using var reader = new AudioFileReader(path);

    // Convert to mono if needed
    ISampleProvider source = reader;
    int channels = reader.WaveFormat.Channels;
    if (channels == 2) {
        source = new StereoToMonoSampleProvider(source);
        channels = 1;
    }

    // Resample if needed
    if (reader.WaveFormat.SampleRate != targetSampleRate) {
        source = new WdlResamplingSampleProvider(source, targetSampleRate);
    }

    var samples = new List<float>();
    // Read buffer; sized as a whole number of frames so multi-channel reads stay frame-aligned.
    float[] buffer = new float[4096 * channels];
    int read;
    while ((read = source.Read(buffer, 0, buffer.Length)) > 0) {
        samples.AddRange(new ArraySegment<float>(buffer, 0, read));
    }

    if (channels == 1) {
        return samples.ToArray();
    }

    // More than two channels: average each interleaved frame down to one sample.
    int frames = samples.Count / channels;
    float[] mono = new float[frames];
    for (int frame = 0; frame < frames; frame++) {
        float sum = 0f;
        for (int c = 0; c < channels; c++) {
            sum += samples[frame * channels + c];
        }
        mono[frame] = sum / channels;
    }
    return mono;
}
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.