Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion VERSION_INFO
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.12.0-dev
0.13.0-dev
6 changes: 3 additions & 3 deletions examples/c/src/model_chat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,8 @@ void CXX_API(
// Encode system prompt and append tokens to model
auto sequences = OgaSequences::Create();
tokenizer->Encode(prompt.c_str(), *sequences);
const int prompt_tokens_length = sequences->SequenceCount(0);
generator->AppendTokenSequences(*sequences);
const int prompt_tokens_length = generator->TokenCount();

// Keep asking for input prompts in a loop
while (true) {
Expand Down Expand Up @@ -157,7 +157,7 @@ void CXX_API(
if (verbose) std::cout << "Running generation loop..." << std::endl;
std::cout << std::endl;
std::cout << "Output: ";
const auto current_token_count = generator->GetSequenceCount(0);
const int current_token_count = generator->TokenCount();
try {
while (!generator->IsDone()) {
generator->GenerateNextToken();
Expand All @@ -177,7 +177,7 @@ void CXX_API(
}
timing.RecordEndTimestamp();

const int new_tokens_length = generator->GetSequenceCount(0) - prompt_tokens_length;
const int new_tokens_length = generator->TokenCount() - prompt_tokens_length;
timing.Log(prompt_tokens_length, new_tokens_length);

std::cout << "\n\n"
Expand Down
4 changes: 2 additions & 2 deletions examples/c/src/model_mm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ void CXX_API(
// Encode combined system + user prompt and append inputs to model
auto input_tensors = processor->ProcessImagesAndAudios(prompt.c_str(), images.get(), audios.get());
generator->SetInputs(*input_tensors);
const int prompt_tokens_length = generator->GetSequenceCount(0);
const int prompt_tokens_length = generator->TokenCount();

// Run generation loop
if (verbose) std::cout << "Running generation loop..." << std::endl;
Expand Down Expand Up @@ -174,7 +174,7 @@ void CXX_API(
// Remove user message from list of messages
input_list.pop_back();

const int new_tokens_length = generator->GetSequenceCount(0) - prompt_tokens_length;
const int new_tokens_length = generator->TokenCount() - prompt_tokens_length;
timing.Log(prompt_tokens_length, new_tokens_length);

std::cout << "\n\n\n";
Expand Down
4 changes: 2 additions & 2 deletions examples/c/src/model_qa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ void CXX_API(
auto sequences = OgaSequences::Create();
tokenizer->Encode(prompt.c_str(), *sequences);
generator->AppendTokenSequences(*sequences);
const int prompt_tokens_length = generator->TokenCount();

// Run generation loop
if (verbose) std::cout << "Running generation loop..." << std::endl;
Expand Down Expand Up @@ -155,8 +156,7 @@ void CXX_API(
// Remove user message from list of messages
input_list.pop_back();

const int prompt_tokens_length = sequences->SequenceCount(0);
const int new_tokens_length = generator->GetSequenceCount(0) - prompt_tokens_length;
const int new_tokens_length = generator->TokenCount() - prompt_tokens_length;
timing.Log(prompt_tokens_length, new_tokens_length);

std::cout << "\n\n\n";
Expand Down
31 changes: 13 additions & 18 deletions examples/csharp/Common/Common.cs
Original file line number Diff line number Diff line change
Expand Up @@ -106,24 +106,19 @@ public static Config GetConfig(string path, string ep, Dictionary<string, string
}
}

/**
* TODO: Uncomment the below snippet to use config.Overlay once the C# binding to Config.Overlay
* is in a stable package release.
*/

// // Create serializer context to skip null attributes
// var options = new JsonSerializerOptions()
// {
// WriteIndented = true,
// PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
// DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
// };
// var ctx = new ArgsSerializerContext(options);
// var json = JsonSerializer.Serialize(search_options, ctx.GeneratorParamsArgs);

// // Set any search-specific options that need to be known before constructing a Model object
// // Otherwise they can be set with params.SetSearchOptions(search_options)
// config.Overlay(json);
// Create serializer context to skip null attributes
var options = new JsonSerializerOptions()
{
WriteIndented = true,
PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
};
var ctx = new ArgsSerializerContext(options);
var json = JsonSerializer.Serialize(search_options, ctx.GeneratorParamsArgs);

// Set any search-specific options that need to be known before constructing a Model object
// Otherwise they can be set with generatorParams.SetSearchOptions(search_options)
config.Overlay(json);
return config;
}

Expand Down
6 changes: 3 additions & 3 deletions examples/csharp/ModelChat/ModelChat.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.11.4" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.11.4" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.11.4" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.12.0" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.12.0" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.12.0" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
<PackageReference Include="System.CommandLine" Version="2.0.1" />
</ItemGroup>

Expand Down
16 changes: 5 additions & 11 deletions examples/csharp/ModelChat/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ bool verbose
// Display output and timings
Console.WriteLine("Output:");
Console.WriteLine(outputString);
var totalTokens = outputSequence.Length;
var totalTokens = (int)generator.TokenCount();
Console.WriteLine($"Tokens: {totalTokens}, Time: {runTimeInSeconds:0.00}, Tokens per second: {totalTokens / runTimeInSeconds:0.00}");
Console.WriteLine();

Expand Down Expand Up @@ -218,8 +218,7 @@ bool verbose
input_list.RemoveAt(input_list.Count - 1);

// Display output and timings
var outputSequence = generator.GetSequence(0);
var totalTokens = outputSequence.Length;
var totalTokens = (int)generator.TokenCount();
Console.WriteLine();
Console.WriteLine($"Streaming Tokens: {totalTokens}, Time: {runTimeInSeconds:0.00}, Tokens per second: {totalTokens / runTimeInSeconds:0.00}");
Console.WriteLine();
Expand Down Expand Up @@ -315,8 +314,8 @@ bool verbose

// Encode system prompt and append tokens to model
var sequences = tokenizer.Encode(prompt);
var system_prompt_length = sequences[0].Length;
generator.AppendTokenSequences(sequences);
var system_prompt_length = (int)generator.TokenCount();

// Streaming Chat
var prevTotalTokens = 0;
Expand Down Expand Up @@ -370,8 +369,7 @@ bool verbose
var runTimeInSeconds = watch.Elapsed.TotalSeconds;

// Display output and timings
var outputSequence = generator.GetSequence(0);
var totalNewTokens = outputSequence.Length - prevTotalTokens;
var totalNewTokens = (int)generator.TokenCount() - prevTotalTokens;
prevTotalTokens = totalNewTokens;
Console.WriteLine();
Console.WriteLine($"Streaming Tokens: {totalNewTokens}, Time: {runTimeInSeconds:0.00}, Tokens per second: {totalNewTokens / runTimeInSeconds:0.00}");
Expand Down Expand Up @@ -583,11 +581,7 @@ void main(string[] args) {

// Enable debugging if requested
if (debug) Common.SetLogger();
/**
* TODO: Uncomment the below snippet to use Utils.RegisterEPLibrary once
* the C# binding to Utils.RegisterEPLibrary is in a stable package release.
*/
// RegisterEP(executionProvider, epPath);
Common.RegisterEP(executionProvider, epPath);

// Create model
if (verbose) Console.WriteLine("Loading model...");
Expand Down
6 changes: 3 additions & 3 deletions examples/csharp/ModelMM/ModelMM.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.11.4" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.11.4" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.11.4" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.12.0" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.12.0" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.12.0" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
<PackageReference Include="System.CommandLine" Version="2.0.1" />
</ItemGroup>

Expand Down
16 changes: 3 additions & 13 deletions examples/csharp/ModelMM/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,7 @@ bool verbose
}

// Construct user content based on inputs
/**
* TODO: Uncomment the below snippet to use model.GetModelType() once
* the C# binding to Model.GetModelType() is in a stable package release.
*/
//var user_content = Common.GetUserContent(model.GetModelType(), num_images, num_audios, text);
var user_content = Common.GetUserContent("phi4mm", num_images, num_audios, text);
var user_content = Common.GetUserContent(model.GetModelType(), num_images, num_audios, text);

// Add user message to list of messages
var user_message = new Dictionary<string, string>
Expand Down Expand Up @@ -162,8 +157,7 @@ bool verbose
input_list.RemoveAt(input_list.Count - 1);

// Display output and timings
var outputSequence = generator.GetSequence(0);
var totalTokens = outputSequence.Length;
var totalTokens = (int)generator.TokenCount();
Console.WriteLine();
Console.WriteLine($"Streaming Tokens: {totalTokens}, Time: {runTimeInSeconds:0.00}, Tokens per second: {totalTokens / runTimeInSeconds:0.00}");
Console.WriteLine();
Expand Down Expand Up @@ -395,11 +389,7 @@ void main(string[] args) {

// Enable debugging if requested
if (debug) Common.SetLogger();
/**
* TODO: Uncomment the below snippet to use Utils.RegisterEPLibrary once
* the C# binding to Utils.RegisterEPLibrary is in a stable package release.
*/
// RegisterEP(executionProvider, epPath);
Common.RegisterEP(executionProvider, epPath);

// Create model
if (verbose) Console.WriteLine("Loading model...");
Expand Down
2 changes: 1 addition & 1 deletion examples/python/model-chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def main(args):
prompt_time = first_token_timestamp - started_timestamp
run_time = time.time() - first_token_timestamp
print(
f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps"
f"Prompt length: {len(user_tokens)}, New tokens: {len(new_tokens)}, Total tokens: {generator.token_count()}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(user_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps"
)

# Rewind the generator to the system prompt. This will erase all the chat history with the model.
Expand Down
2 changes: 1 addition & 1 deletion examples/python/model-generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def main(args):

print()
total_tokens = sum(len(generator.get_sequence(i)) for i in range(len(prompts)))
print(f"Tokens: {total_tokens} Time: {run_time:.2f} Tokens per second: {total_tokens / run_time:.2f}")
print(f"Tokens: {total_tokens}, Time: {run_time:.2f}, Tokens per second: {total_tokens / run_time:.2f}")
print()


Expand Down
6 changes: 5 additions & 1 deletion examples/python/model-mm.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ def main(args):
# Encode combined system + user prompt and append inputs to model
inputs = processor(prompt, images=images, audios=audios)
generator.set_inputs(inputs)
input_tokens = generator.token_count()

if args.verbose:
print("Running generation loop...")
Expand Down Expand Up @@ -158,6 +159,9 @@ def main(args):
print()
print()

# Get total tokens consumed
total_tokens = generator.token_count()

# Delete the generator to free the captured graph for the next generator (if graph capture is enabled)
del generator

Expand All @@ -168,7 +172,7 @@ def main(args):
prompt_time = first_token_timestamp - started_timestamp
run_time = time.time() - first_token_timestamp
print(
f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps"
f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Total tokens: {total_tokens}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps"
)

# If non-interactive is requested, it will just run the model for the user prompt and exit
Expand Down
5 changes: 4 additions & 1 deletion examples/python/model-qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,9 @@ def main(args):
print()
print()

# Get total tokens consumed
total_tokens = generator.token_count()

# Delete the generator to free the captured graph for the next generator (if graph capture is enabled)
del generator

Expand All @@ -151,7 +154,7 @@ def main(args):
prompt_time = first_token_timestamp - started_timestamp
run_time = time.time() - first_token_timestamp
print(
f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps"
f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Total tokens: {total_tokens}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps"
)

# If non-interactive is requested, it will just run the model for the user prompt and exit
Expand Down
Loading