diff --git a/VERSION_INFO b/VERSION_INFO index eef637823a..eca364485e 100644 --- a/VERSION_INFO +++ b/VERSION_INFO @@ -1 +1 @@ -0.12.0-dev \ No newline at end of file +0.13.0-dev \ No newline at end of file diff --git a/examples/c/src/model_chat.cpp b/examples/c/src/model_chat.cpp index 37a537fb40..9e8fd311a6 100644 --- a/examples/c/src/model_chat.cpp +++ b/examples/c/src/model_chat.cpp @@ -103,8 +103,8 @@ void CXX_API( // Encode system prompt and append tokens to model auto sequences = OgaSequences::Create(); tokenizer->Encode(prompt.c_str(), *sequences); - const int prompt_tokens_length = sequences->SequenceCount(0); generator->AppendTokenSequences(*sequences); + const int prompt_tokens_length = generator->TokenCount(); // Keep asking for input prompts in a loop while (true) { @@ -157,7 +157,7 @@ void CXX_API( if (verbose) std::cout << "Running generation loop..." << std::endl; std::cout << std::endl; std::cout << "Output: "; - const auto current_token_count = generator->GetSequenceCount(0); + const int current_token_count = generator->TokenCount(); try { while (!generator->IsDone()) { generator->GenerateNextToken(); @@ -177,7 +177,7 @@ void CXX_API( } timing.RecordEndTimestamp(); - const int new_tokens_length = generator->GetSequenceCount(0) - prompt_tokens_length; + const int new_tokens_length = generator->TokenCount() - prompt_tokens_length; timing.Log(prompt_tokens_length, new_tokens_length); std::cout << "\n\n" diff --git a/examples/c/src/model_mm.cpp b/examples/c/src/model_mm.cpp index e01393c41a..494653568f 100644 --- a/examples/c/src/model_mm.cpp +++ b/examples/c/src/model_mm.cpp @@ -144,7 +144,7 @@ void CXX_API( // Encode combined system + user prompt and append inputs to model auto input_tensors = processor->ProcessImagesAndAudios(prompt.c_str(), images.get(), audios.get()); generator->SetInputs(*input_tensors); - const int prompt_tokens_length = generator->GetSequenceCount(0); + const int prompt_tokens_length = generator->TokenCount(); // Run generation loop if (verbose) std::cout << "Running generation loop..." << std::endl; @@ -174,7 +174,7 @@ void CXX_API( // Remove user message from list of messages input_list.pop_back(); - const int new_tokens_length = generator->GetSequenceCount(0) - prompt_tokens_length; + const int new_tokens_length = generator->TokenCount() - prompt_tokens_length; timing.Log(prompt_tokens_length, new_tokens_length); std::cout << "\n\n\n"; diff --git a/examples/c/src/model_qa.cpp b/examples/c/src/model_qa.cpp index 25b39dfce7..345f8aa995 100644 --- a/examples/c/src/model_qa.cpp +++ b/examples/c/src/model_qa.cpp @@ -126,6 +126,7 @@ void CXX_API( auto sequences = OgaSequences::Create(); tokenizer->Encode(prompt.c_str(), *sequences); generator->AppendTokenSequences(*sequences); + const int prompt_tokens_length = generator->TokenCount(); // Run generation loop if (verbose) std::cout << "Running generation loop..." << std::endl; @@ -155,8 +156,7 @@ void CXX_API( // Remove user message from list of messages input_list.pop_back(); - const int prompt_tokens_length = sequences->SequenceCount(0); - const int new_tokens_length = generator->GetSequenceCount(0) - prompt_tokens_length; + const int new_tokens_length = generator->TokenCount() - prompt_tokens_length; timing.Log(prompt_tokens_length, new_tokens_length); std::cout << "\n\n\n"; diff --git a/examples/csharp/Common/Common.cs b/examples/csharp/Common/Common.cs index d15476c374..dcbf2e6cd2 100644 --- a/examples/csharp/Common/Common.cs +++ b/examples/csharp/Common/Common.cs @@ -106,24 +106,19 @@ public static Config GetConfig(string path, string ep, Dictionary - - - + + + diff --git a/examples/csharp/ModelChat/Program.cs b/examples/csharp/ModelChat/Program.cs index 47a9e6aaec..dbbc9e4a9d 100644 --- a/examples/csharp/ModelChat/Program.cs +++ b/examples/csharp/ModelChat/Program.cs @@ -79,7 +79,7 @@ bool verbose // Display output and timings Console.WriteLine("Output:"); Console.WriteLine(outputString); - var totalTokens = outputSequence.Length; + var totalTokens = (int)generator.TokenCount(); Console.WriteLine($"Tokens: {totalTokens}, Time: {runTimeInSeconds:0.00}, Tokens per second: {totalTokens / runTimeInSeconds:0.00}"); Console.WriteLine(); @@ -218,8 +218,7 @@ bool verbose input_list.RemoveAt(input_list.Count - 1); // Display output and timings - var outputSequence = generator.GetSequence(0); - var totalTokens = outputSequence.Length; + var totalTokens = (int)generator.TokenCount(); Console.WriteLine(); Console.WriteLine($"Streaming Tokens: {totalTokens}, Time: {runTimeInSeconds:0.00}, Tokens per second: {totalTokens / runTimeInSeconds:0.00}"); Console.WriteLine(); @@ -315,8 +314,8 @@ bool verbose // Encode system prompt and append tokens to model var sequences = tokenizer.Encode(prompt); - var system_prompt_length = sequences[0].Length; generator.AppendTokenSequences(sequences); + var system_prompt_length = (int)generator.TokenCount(); // Streaming Chat var prevTotalTokens = 0; @@ -370,8 +369,7 @@ bool verbose var runTimeInSeconds = watch.Elapsed.TotalSeconds; // Display output and timings - var outputSequence = generator.GetSequence(0); - var totalNewTokens = outputSequence.Length - prevTotalTokens; + var totalNewTokens = (int)generator.TokenCount() - prevTotalTokens; prevTotalTokens = totalNewTokens; Console.WriteLine(); Console.WriteLine($"Streaming Tokens: {totalNewTokens}, Time: {runTimeInSeconds:0.00}, Tokens per second: {totalNewTokens / runTimeInSeconds:0.00}"); @@ -583,11 +581,7 @@ void main(string[] args) { // Enable debugging if requested if (debug) Common.SetLogger(); - /** - * TODO: Uncomment the below snippet to use Utils.RegisterEPLibrary once - * the C# binding to Utils.RegisterEPLibrary is in a stable package release. - */ - // RegisterEP(executionProvider, epPath); + Common.RegisterEP(executionProvider, epPath); // Create model if (verbose) Console.WriteLine("Loading model..."); diff --git a/examples/csharp/ModelMM/ModelMM.csproj b/examples/csharp/ModelMM/ModelMM.csproj index 272ed0d776..767b2f7511 100644 --- a/examples/csharp/ModelMM/ModelMM.csproj +++ b/examples/csharp/ModelMM/ModelMM.csproj @@ -8,9 +8,9 @@ - - - + + + diff --git a/examples/csharp/ModelMM/Program.cs b/examples/csharp/ModelMM/Program.cs index e4bf1baf39..451a5ed3c8 100644 --- a/examples/csharp/ModelMM/Program.cs +++ b/examples/csharp/ModelMM/Program.cs @@ -88,12 +88,7 @@ bool verbose } // Construct user content based on inputs - /** - * TODO: Uncomment the below snippet to use model.GetModelType() once - * the C# binding to Model.GetModelType() is in a stable package release. - */ - //var user_content = Common.GetUserContent(model.GetModelType(), num_images, num_audios, text); - var user_content = Common.GetUserContent("phi4mm", num_images, num_audios, text); + var user_content = Common.GetUserContent(model.GetModelType(), num_images, num_audios, text); // Add user message to list of messages var user_message = new Dictionary @@ -162,8 +157,7 @@ bool verbose input_list.RemoveAt(input_list.Count - 1); // Display output and timings - var outputSequence = generator.GetSequence(0); - var totalTokens = outputSequence.Length; + var totalTokens = (int)generator.TokenCount(); Console.WriteLine(); Console.WriteLine($"Streaming Tokens: {totalTokens}, Time: {runTimeInSeconds:0.00}, Tokens per second: {totalTokens / runTimeInSeconds:0.00}"); Console.WriteLine(); @@ -395,11 +389,7 @@ void main(string[] args) { // Enable debugging if requested if (debug) Common.SetLogger(); - /** - * TODO: Uncomment the below snippet to use Utils.RegisterEPLibrary once - * the C# binding to Utils.RegisterEPLibrary is in a stable package release. - */ - // RegisterEP(executionProvider, epPath); + Common.RegisterEP(executionProvider, epPath); // Create model if (verbose) Console.WriteLine("Loading model..."); diff --git a/examples/python/model-chat.py b/examples/python/model-chat.py index a36e7e0b11..7a50c8e0d7 100644 --- a/examples/python/model-chat.py +++ b/examples/python/model-chat.py @@ -144,7 +144,7 @@ def main(args): prompt_time = first_token_timestamp - started_timestamp run_time = time.time() - first_token_timestamp print( - f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps" + f"Prompt length: {len(user_tokens)}, New tokens: {len(new_tokens)}, Total tokens: {generator.token_count()}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(user_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps" ) # Rewind the generator to the system prompt. This will erase all the chat history with the model. diff --git a/examples/python/model-generate.py b/examples/python/model-generate.py index 39d813b064..3c5559b808 100644 --- a/examples/python/model-generate.py +++ b/examples/python/model-generate.py @@ -82,7 +82,7 @@ def main(args): print() total_tokens = sum(len(generator.get_sequence(i)) for i in range(len(prompts))) - print(f"Tokens: {total_tokens} Time: {run_time:.2f} Tokens per second: {total_tokens / run_time:.2f}") + print(f"Tokens: {total_tokens}, Time: {run_time:.2f}, Tokens per second: {total_tokens / run_time:.2f}") print() diff --git a/examples/python/model-mm.py b/examples/python/model-mm.py index 56a671a2c3..baaa9237c8 100644 --- a/examples/python/model-mm.py +++ b/examples/python/model-mm.py @@ -130,6 +130,7 @@ def main(args): # Encode combined system + user prompt and append inputs to model inputs = processor(prompt, images=images, audios=audios) generator.set_inputs(inputs) + input_tokens = generator.token_count() if args.verbose: print("Running generation loop...") @@ -158,6 +159,9 @@ def main(args): print() print() + # Get total tokens consumed + total_tokens = generator.token_count() + # Delete the generator to free the captured graph for the next generator (if graph capture is enabled) del generator @@ -168,7 +172,7 @@ def main(args): prompt_time = first_token_timestamp - started_timestamp run_time = time.time() - first_token_timestamp print( - f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps" + f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Total tokens: {total_tokens}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps" ) # If non-interactive is requested, it will just run the model for the user prompt and exit diff --git a/examples/python/model-qa.py b/examples/python/model-qa.py index 3d3538cf62..3d711efc79 100644 --- a/examples/python/model-qa.py +++ b/examples/python/model-qa.py @@ -141,6 +141,9 @@ def main(args): print() print() + # Get total tokens consumed + total_tokens = generator.token_count() + # Delete the generator to free the captured graph for the next generator (if graph capture is enabled) del generator @@ -151,7 +154,7 @@ def main(args): prompt_time = first_token_timestamp - started_timestamp run_time = time.time() - first_token_timestamp print( - f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps" + f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Total tokens: {total_tokens}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps" ) # If non-interactive is requested, it will just run the model for the user prompt and exit