microsoft · baijumeswani · Feb 17, 2026 · Feb 16, 2026 · Feb 16, 2026 · Feb 16, 2026
diff --git a/VERSION_INFO b/VERSION_INFO
@@ -1 +1 @@
-0.12.0-dev
+0.13.0-dev
@@ -103,8 +103,8 @@ void CXX_API(
   // Encode system prompt and append tokens to model
   auto sequences = OgaSequences::Create();
   tokenizer->Encode(prompt.c_str(), *sequences);
-  const int prompt_tokens_length = sequences->SequenceCount(0);
   generator->AppendTokenSequences(*sequences);
+  const int prompt_tokens_length = generator->TokenCount();
 
   // Keep asking for input prompts in a loop
   while (true) {
@@ -157,7 +157,7 @@ void CXX_API(
     if (verbose) std::cout << "Running generation loop..." << std::endl;
     std::cout << std::endl;
     std::cout << "Output: ";
-    const auto current_token_count = generator->GetSequenceCount(0);
+    const int current_token_count = generator->TokenCount();
     try {
       while (!generator->IsDone()) {
         generator->GenerateNextToken();
@@ -177,7 +177,7 @@ void CXX_API(
     }
     timing.RecordEndTimestamp();
 
-    const int new_tokens_length = generator->GetSequenceCount(0) - prompt_tokens_length;
+    const int new_tokens_length = generator->TokenCount() - prompt_tokens_length;
     timing.Log(prompt_tokens_length, new_tokens_length);
 
     std::cout << "\n\n"

@@ -144,7 +144,7 @@ void CXX_API(
     // Encode combined system + user prompt and append inputs to model
     auto input_tensors = processor->ProcessImagesAndAudios(prompt.c_str(), images.get(), audios.get());
     generator->SetInputs(*input_tensors);
-    const int prompt_tokens_length = generator->GetSequenceCount(0);
+    const int prompt_tokens_length = generator->TokenCount();
 
     // Run generation loop
     if (verbose) std::cout << "Running generation loop..." << std::endl;
@@ -174,7 +174,7 @@ void CXX_API(
     // Remove user message from list of messages
     input_list.pop_back();
 
-    const int new_tokens_length = generator->GetSequenceCount(0) - prompt_tokens_length;
+    const int new_tokens_length = generator->TokenCount() - prompt_tokens_length;
     timing.Log(prompt_tokens_length, new_tokens_length);
 
     std::cout << "\n\n\n";

@@ -126,6 +126,7 @@ void CXX_API(
     auto sequences = OgaSequences::Create();
     tokenizer->Encode(prompt.c_str(), *sequences);
     generator->AppendTokenSequences(*sequences);
+    const int prompt_tokens_length = generator->TokenCount();
 
     // Run generation loop
     if (verbose) std::cout << "Running generation loop..." << std::endl;
@@ -155,8 +156,7 @@ void CXX_API(
     // Remove user message from list of messages
     input_list.pop_back();
 
-    const int prompt_tokens_length = sequences->SequenceCount(0);
-    const int new_tokens_length = generator->GetSequenceCount(0) - prompt_tokens_length;
+    const int new_tokens_length = generator->TokenCount() - prompt_tokens_length;
     timing.Log(prompt_tokens_length, new_tokens_length);
 
     std::cout << "\n\n\n";

@@ -106,24 +106,19 @@ public static Config GetConfig(string path, string ep, Dictionary<string, string
                 }
             }
 
-            /**
-             * TODO: Uncomment the below snippet to use config.Overlay once the C# binding to Config.Overlay
-             * is in a stable package release.
-             */
-
-            // // Create serializer context to skip null attributes
-            // var options = new JsonSerializerOptions()
-            // {
-            //     WriteIndented = true,
-            //     PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
-            //     DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
-            // };
-            // var ctx = new ArgsSerializerContext(options);
-            // var json = JsonSerializer.Serialize(search_options, ctx.GeneratorParamsArgs);
-
-            // // Set any search-specific options that need to be known before constructing a Model object
-            // // Otherwise they can be set with params.SetSearchOptions(search_options)
-            // config.Overlay(json);
+            // Create serializer context to skip null attributes
+            var options = new JsonSerializerOptions()
+            {
+                WriteIndented = true,
+                PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
+                DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
+            };
+            var ctx = new ArgsSerializerContext(options);
+            var json = JsonSerializer.Serialize(search_options, ctx.GeneratorParamsArgs);
+
+            // Set any search-specific options that need to be known before constructing a Model object
+            // Otherwise they can be set with generatorParams.SetSearchOptions(search_options)
+            config.Overlay(json);
             return config;
         }
 

@@ -8,9 +8,9 @@
   </PropertyGroup>
 
   <ItemGroup>
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.11.4" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.11.4" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.11.4" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.12.0" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.12.0" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.12.0" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
     <PackageReference Include="System.CommandLine" Version="2.0.1" />
   </ItemGroup>
 

@@ -79,7 +79,7 @@ bool verbose
         // Display output and timings
         Console.WriteLine("Output:");
         Console.WriteLine(outputString);
-        var totalTokens = outputSequence.Length;
+        var totalTokens = (int)generator.TokenCount();
         Console.WriteLine($"Tokens: {totalTokens}, Time: {runTimeInSeconds:0.00}, Tokens per second: {totalTokens / runTimeInSeconds:0.00}");
         Console.WriteLine();
 
@@ -218,8 +218,7 @@ bool verbose
         input_list.RemoveAt(input_list.Count - 1);
 
         // Display output and timings
-        var outputSequence = generator.GetSequence(0);
-        var totalTokens = outputSequence.Length;
+        var totalTokens = (int)generator.TokenCount();
         Console.WriteLine();
         Console.WriteLine($"Streaming Tokens: {totalTokens}, Time: {runTimeInSeconds:0.00}, Tokens per second: {totalTokens / runTimeInSeconds:0.00}");
         Console.WriteLine();
@@ -315,8 +314,8 @@ bool verbose
 
     // Encode system prompt and append tokens to model
     var sequences = tokenizer.Encode(prompt);
-    var system_prompt_length = sequences[0].Length;
     generator.AppendTokenSequences(sequences);
+    var system_prompt_length = (int)generator.TokenCount();
 
     // Streaming Chat
     var prevTotalTokens = 0;
@@ -370,8 +369,7 @@ bool verbose
         var runTimeInSeconds = watch.Elapsed.TotalSeconds;
 
         // Display output and timings
-        var outputSequence = generator.GetSequence(0);
-        var totalNewTokens = outputSequence.Length - prevTotalTokens;
+        var totalNewTokens = (int)generator.TokenCount() - prevTotalTokens;
         prevTotalTokens = totalNewTokens;
         Console.WriteLine();
         Console.WriteLine($"Streaming Tokens: {totalNewTokens}, Time: {runTimeInSeconds:0.00}, Tokens per second: {totalNewTokens / runTimeInSeconds:0.00}");
@@ -583,11 +581,7 @@ void main(string[] args) {
 
     // Enable debugging if requested
     if (debug) Common.SetLogger();
-    /**
-     * TODO: Uncomment the below snippet to use Utils.RegisterEPLibrary once
-     * the C# binding to Utils.RegisterEPLibrary is in a stable package release.
-     */
-    // RegisterEP(executionProvider, epPath);
+    Common.RegisterEP(executionProvider, epPath);
 
     // Create model
     if (verbose) Console.WriteLine("Loading model...");

@@ -8,9 +8,9 @@
   </PropertyGroup>
 
   <ItemGroup>
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.11.4" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.11.4" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.11.4" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.12.0" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.12.0" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.12.0" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
     <PackageReference Include="System.CommandLine" Version="2.0.1" />
   </ItemGroup>
 

@@ -88,12 +88,7 @@ bool verbose
         }
 
         // Construct user content based on inputs
-        /**
-         * TODO: Uncomment the below snippet to use model.GetModelType() once
-         * the C# binding to Model.GetModelType() is in a stable package release.
-         */
-        //var user_content = Common.GetUserContent(model.GetModelType(), num_images, num_audios, text);
-        var user_content = Common.GetUserContent("phi4mm", num_images, num_audios, text);
+        var user_content = Common.GetUserContent(model.GetModelType(), num_images, num_audios, text);
 
         // Add user message to list of messages
         var user_message = new Dictionary<string, string>
@@ -162,8 +157,7 @@ bool verbose
         input_list.RemoveAt(input_list.Count - 1);
 
         // Display output and timings
-        var outputSequence = generator.GetSequence(0);
-        var totalTokens = outputSequence.Length;
+        var totalTokens = (int)generator.TokenCount();
         Console.WriteLine();
         Console.WriteLine($"Streaming Tokens: {totalTokens}, Time: {runTimeInSeconds:0.00}, Tokens per second: {totalTokens / runTimeInSeconds:0.00}");
         Console.WriteLine();
@@ -395,11 +389,7 @@ void main(string[] args) {
 
     // Enable debugging if requested
     if (debug) Common.SetLogger();
-    /**
-     * TODO: Uncomment the below snippet to use Utils.RegisterEPLibrary once
-     * the C# binding to Utils.RegisterEPLibrary is in a stable package release.
-     */
-    // RegisterEP(executionProvider, epPath);
+    Common.RegisterEP(executionProvider, epPath);
 
     // Create model
     if (verbose) Console.WriteLine("Loading model...");

@@ -144,7 +144,7 @@ def main(args):
             prompt_time = first_token_timestamp - started_timestamp
             run_time = time.time() - first_token_timestamp
             print(
-                f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps"
+                f"Prompt length: {len(user_tokens)}, New tokens: {len(new_tokens)}, Total tokens: {generator.token_count()}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(user_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps"
             )
 
         # Rewind the generator to the system prompt. This will erase all the chat history with the model.

@@ -82,7 +82,7 @@ def main(args):
 
     print()
     total_tokens = sum(len(generator.get_sequence(i)) for i in range(len(prompts)))
-    print(f"Tokens: {total_tokens} Time: {run_time:.2f} Tokens per second: {total_tokens / run_time:.2f}")
+    print(f"Tokens: {total_tokens}, Time: {run_time:.2f}, Tokens per second: {total_tokens / run_time:.2f}")
     print()
 
 

@@ -130,6 +130,7 @@ def main(args):
         # Encode combined system + user prompt and append inputs to model
         inputs = processor(prompt, images=images, audios=audios)
         generator.set_inputs(inputs)
+        input_tokens = generator.token_count()
 
         if args.verbose:
             print("Running generation loop...")
@@ -158,6 +159,9 @@ def main(args):
         print()
         print()
 
+        # Get total tokens consumed
+        total_tokens = generator.token_count()
+
         # Delete the generator to free the captured graph for the next generator (if graph capture is enabled)
         del generator
 
@@ -168,7 +172,7 @@ def main(args):
             prompt_time = first_token_timestamp - started_timestamp
             run_time = time.time() - first_token_timestamp
             print(
-                f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps"
+                f"Prompt length: {input_tokens}, New tokens: {len(new_tokens)}, Total tokens: {total_tokens}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {input_tokens / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps"
             )
 
         # If non-interactive is requested, it will just run the model for the user prompt and exit

@@ -141,6 +141,9 @@ def main(args):
         print()
         print()
 
+        # Get total tokens consumed
+        total_tokens = generator.token_count()
+
         # Delete the generator to free the captured graph for the next generator (if graph capture is enabled)
         del generator
 
@@ -151,7 +154,7 @@ def main(args):
             prompt_time = first_token_timestamp - started_timestamp
             run_time = time.time() - first_token_timestamp
             print(
-                f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps"
+                f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Total tokens: {total_tokens}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps"
             )
 
         # If non-interactive is requested, it will just run the model for the user prompt and exit
@@ -82,7 +82,7 @@ def main(args):
         print()
         total_tokens = sum(len(generator.get_sequence(i)) for i in range(len(prompts)))
-        print(f"Tokens: {total_tokens} Time: {run_time:.2f} Tokens per second: {total_tokens / run_time:.2f}")
+        print(f"Tokens: {total_tokens}, Time: {run_time:.2f}, Tokens per second: {total_tokens / run_time:.2f}")
         print()
@@ Expand Down @@