33 changes: 33 additions & 0 deletions .pipelines/stages/jobs/steps/capi-macos-step.yml

@@ -50,6 +50,39 @@ steps:
   displayName: 'Package C/C++ API'
   workingDirectory: '$(Build.Repository.LocalPath)'
 
+- bash: |
+    set -e -x
+    ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package
+    tar -xvzf onnxruntime-genai-*.tar.gz -C .
+    ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package
+    find . -maxdepth 1 -type d -name 'onnxruntime-genai-*' -exec zip -FSr --symlinks {}.zip {} \;
+    ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package
+    rm -rf $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package/onnxruntime-genai-*.tar.gz
+    ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package
+    find $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package -maxdepth 1 -type d -name 'onnxruntime-genai-*' -exec rm -rf {} +
+    ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package
+  displayName: 'Convert from .tar.gz to .zip'
+  workingDirectory: '$(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package'
+
+- template: compliant/mac-esrp-archive-step.yml
+  parameters:
+    FolderPath: '$(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package'
+    Pattern: '*.zip'
+
+- bash: |
+    set -e -x
+    ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package
+    find . -name 'onnxruntime-genai-*.zip' -exec unzip {} \;
+    ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package
+    find . -maxdepth 1 -type d -name 'onnxruntime-genai-*' -exec tar -czvf {}.tar.gz {} \;
+    ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package
+    rm -rf $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package/onnxruntime-genai-*.zip
+    ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package
+    find $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package -maxdepth 1 -type d -name 'onnxruntime-genai-*' -exec rm -rf {} +
+    ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package
+  displayName: 'Convert from .zip to .tar.gz'
+  workingDirectory: '$(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package'
+
 - task: 1ES.PublishPipelineArtifact@1
   displayName: 'Publish Artifact: ONNXRuntime Genai capi'
   inputs:
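Note on the round-trip above: the new signing step's Pattern is '*.zip' while the C API package ships as .tar.gz, so these steps unpack the tarball, rezip it with zip --symlinks (macOS dylib layouts depend on symlinks, and zipping without that flag would replace each link with a copy), hand the zip to ESRP, then repack the signed tree as .tar.gz. A minimal Python sketch of the symlink-preserving repack idea, for illustration only; the pipeline itself shells out to tar and zip, and the function name and paths here are made up:

import stat
import tarfile
import zipfile

def targz_to_zip(tgz_path: str, zip_path: str) -> None:
    # Repack a .tar.gz as a .zip in one pass, keeping symlink entries as
    # symlinks rather than copies (the equivalent of `zip --symlinks`).
    with tarfile.open(tgz_path, "r:gz") as tf, \
         zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for member in tf:
            info = zipfile.ZipInfo(member.name)
            info.create_system = 3  # Unix, so the mode bits below are honored on extraction
            if member.issym():
                # Flag the entry as a symlink and store the link target as its payload.
                info.external_attr = (stat.S_IFLNK | 0o755) << 16
                zf.writestr(info, member.linkname)
            elif member.isfile():
                info.external_attr = (member.mode & 0xFFFF) << 16
                zf.writestr(info, tf.extractfile(member).read())
            elif member.isdir():
                info.filename += "/"
                zf.writestr(info, b"")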
11 changes: 6 additions & 5 deletions .pipelines/stages/jobs/steps/compliant/esrp_nuget.yml

@@ -9,11 +9,12 @@ steps:
   displayName: ${{ parameters.DisplayName }}
   inputs:
     ConnectedServiceName: 'esrp_release'
-    AppRegistrationClientId: '53d54d02-978d-4305-8572-583cf6711c4f'
-    AppRegistrationTenantId: '72f988bf-86f1-41af-91ab-2d7cd011db47'
-    AuthAKVName: 'buildkeyvault'
-    AuthCertName: '53d54d02-SSL-AutoRotate'
-    AuthSignCertName: '53d54d02-978d-4305-8572-583cf6711c4f'
+    UseMSIAuthentication: true
+    AppRegistrationClientId: '62b7cfed-4d25-454f-880e-010dc21455ac'
+    AppRegistrationTenantId: '975f013f-7f24-47e8-a7d3-abc4752bf346'
+    EsrpClientId: "53d54d02-978d-4305-8572-583cf6711c4f"
+    AuthAKVName: 'ortbuildkeyvault'
+    AuthSignCertName: 'esrpcodesign'
     FolderPath: ${{ parameters.FolderPath }}
     Pattern: '*.nupkg'
     SessionTimeout: 90
35 changes: 35 additions & 0 deletions .pipelines/stages/jobs/steps/compliant/mac-esrp-archive-step.yml

@@ -0,0 +1,35 @@
+parameters:
+- name: FolderPath
+  type: string
+  default: ''
+
+- name: Pattern
+  type: string
+  default: '*.zip'
+
+steps:
+- task: SFP.build-tasks.custom-build-task-1.EsrpCodeSigning@5
+  displayName: 'ESRP CodeSigning'
+  inputs:
+    ConnectedServiceName: 'esrp_release'
+    UseMSIAuthentication: true
+    AppRegistrationClientId: '62b7cfed-4d25-454f-880e-010dc21455ac'
+    AppRegistrationTenantId: '975f013f-7f24-47e8-a7d3-abc4752bf346'
+    EsrpClientId: "53d54d02-978d-4305-8572-583cf6711c4f"
+    AuthAKVName: 'ortbuildkeyvault'
+    AuthSignCertName: 'esrpcodesign'
+    FolderPath: ${{ parameters.FolderPath }}
+    Pattern: ${{ parameters.Pattern }}
+    SessionTimeout: 90
+    ServiceEndpointUrl: 'https://api.esrp.microsoft.com/api/v2'
+    MaxConcurrency: 25
+    signConfigType: inlineSignParams
+    inlineOperation: |
+      [
+        {
+          "keyCode": "CP-401337-Apple",
+          "operationSetCode": "MacAppDeveloperSign",
+          "toolName": "sign",
+          "toolVersion": "6.2.9304.0"
+        }
+      ]
11 changes: 6 additions & 5 deletions .pipelines/stages/jobs/steps/compliant/win-esrp-dll-step.yml

@@ -22,11 +22,12 @@ steps:
   continueOnError: true
   inputs:
     ConnectedServiceName: 'esrp_release'
-    AppRegistrationClientId: '53d54d02-978d-4305-8572-583cf6711c4f'
-    AppRegistrationTenantId: '72f988bf-86f1-41af-91ab-2d7cd011db47'
-    AuthAKVName: 'buildkeyvault'
-    AuthCertName: '53d54d02-SSL-AutoRotate'
-    AuthSignCertName: '53d54d02-978d-4305-8572-583cf6711c4f'
+    UseMSIAuthentication: true
+    AppRegistrationClientId: '62b7cfed-4d25-454f-880e-010dc21455ac'
+    AppRegistrationTenantId: '975f013f-7f24-47e8-a7d3-abc4752bf346'
+    EsrpClientId: "53d54d02-978d-4305-8572-583cf6711c4f"
+    AuthAKVName: 'ortbuildkeyvault'
+    AuthSignCertName: 'esrpcodesign'
     FolderPath: ${{ parameters.FolderPath }}
     Pattern: ${{ parameters.Pattern }}
    SessionTimeout: 90
2 changes: 1 addition & 1 deletion VERSION_INFO

@@ -1 +1 @@
-0.8.0-rc1
+0.8.0-rc2
2 changes: 1 addition & 1 deletion cmake/deps.txt

@@ -14,4 +14,4 @@ pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f78029
 googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034
 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
-onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;d1daadcb53a80645b3d96218e4713f24c12dfaf0
+onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;cb00b43f05409d6f70cc558f52fcff0c7e386a97
40 changes: 3 additions & 37 deletions src/config.cpp

@@ -65,6 +65,8 @@ struct ProviderOptionsArray_Element : JSON::Element {
         v.name = "QNN";
       } else if (v.name == "webgpu") {
         v.name = "WebGPU";
+      } else if (v.name == "dml") {
+        v.name = "DML";
       }
     }
   }

@@ -587,38 +589,6 @@ struct Embedding_Element : JSON::Element {
   EmbeddingOutputs_Element outputs_{v_.outputs};
 };
 
-struct PromptTemplates_Element : JSON::Element {
-  explicit PromptTemplates_Element(std::optional<Config::Model::PromptTemplates>& v) : v_{v} {}
-
-  void OnValue(std::string_view name, JSON::Value value) override {
-    // if one of templates is given in json, then any non-specified template will be default "{Content}"
-    if (name == "assistant") {
-      EnsureAvailable();
-      v_->assistant = JSON::Get<std::string_view>(value);
-    } else if (name == "prompt") {
-      EnsureAvailable();
-      v_->prompt = JSON::Get<std::string_view>(value);
-    } else if (name == "system") {
-      EnsureAvailable();
-      v_->system = JSON::Get<std::string_view>(value);
-    } else if (name == "user") {
-      EnsureAvailable();
-      v_->user = JSON::Get<std::string_view>(value);
-    } else {
-      throw JSON::unknown_value_error{};
-    }
-  }
-
- private:
-  std::optional<Config::Model::PromptTemplates>& v_;
-
-  void EnsureAvailable() {
-    if (!v_.has_value()) {
-      v_.emplace();
-    }
-  }
-};
-
 struct Model_Element : JSON::Element {
   explicit Model_Element(Config::Model& v) : v_{v} {}

@@ -662,9 +632,6 @@
     if (name == "embedding") {
       return embedding_;
     }
-    if (name == "prompt_templates") {
-      return prompt_templates_;
-    }
     if (name == "speech") {
       return speech_;
     }

@@ -678,7 +645,6 @@
   Eos_Array_Element eos_token_ids_{v_};
   Vision_Element vision_{v_.vision};
   Embedding_Element embedding_{v_.embedding};
-  PromptTemplates_Element prompt_templates_{v_.prompt_templates};
   Speech_Element speech_{v_.speech};
 };

@@ -768,7 +734,7 @@ bool IsGraphCaptureEnabled(Config::SessionOptions& session_options) {
       throw std::runtime_error("Graph Capture is currently unsupported for CUDA");
     }
   }
-  } else if (provider_options.name == "dml") {
+  } else if (provider_options.name == "DML") {
     return true;
   } else if (provider_options.name == "NvTensorRtRtx") {
     return true;
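A note on the first hunk: provider names arrive lowercase from genai_config.json, and the parser canonicalizes them ("qnn" to "QNN", "webgpu" to "WebGPU", and now "dml" to "DML") so that later comparisons, such as the IsGraphCaptureEnabled hunk, can test a single spelling. A toy Python sketch of that normalization, illustrative only; the real logic lives in ProviderOptionsArray_Element above:

CANONICAL_EP_NAMES = {"qnn": "QNN", "webgpu": "WebGPU", "dml": "DML"}

def canonicalize(name: str) -> str:
    """Return the canonical spelling of a provider name from genai_config.json."""
    return CANONICAL_EP_NAMES.get(name, name)

assert canonicalize("dml") == "DML"
assert canonicalize("cuda") == "cuda"  # names without a canonical form pass through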
8 changes: 0 additions & 8 deletions src/config.h

@@ -25,7 +25,6 @@ struct Config {
   static constexpr std::string_view InputsEmbedsName = "inputs_embeds";
   static constexpr std::string_view CurrentSequenceLengthName = "current_sequence_length";
   static constexpr std::string_view PastSequenceLengthName = "past_sequence_length";
-  static constexpr std::string_view promptTemplate = "{Content}";
   static constexpr std::string_view TotalSequenceLengthName = "total_sequence_length";
   static constexpr std::string_view TokenTypeIdsName = "token_type_ids";

@@ -206,13 +205,6 @@ struct Config {
 
   } decoder;
 
-  struct PromptTemplates {
-    std::string assistant{Defaults::promptTemplate};
-    std::string prompt{Defaults::promptTemplate};
-    std::string system{Defaults::promptTemplate};
-    std::string user{Defaults::promptTemplate};
-  };
-  std::optional<PromptTemplates> prompt_templates;
 } model;
 
 struct Search {
1 change: 0 additions & 1 deletion src/csharp/NativeMethods.cs

@@ -186,7 +186,6 @@ public static extern UIntPtr OgaSequencesGetSequenceCount(IntPtr /* const OgaSeq
     byte[] /* const char* */ strings,
     IntPtr /* OgaSequences* */ sequences);
 
-
 // This function is used to decode the given token into a string. The caller is responsible for freeing the
 // returned string using the OgaDestroyString function when it is no longer needed.
 [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)]
2 changes: 1 addition & 1 deletion src/generators.cpp

@@ -207,7 +207,7 @@ std::string to_string(DeviceType device_type) {
     case DeviceType::DML:
      return "DirectML";
     case DeviceType::WEBGPU:
-      return "WebGpu";
+      return "WebGPU";
     case DeviceType::QNN:
       return "QnnWithSharedMemory";
     case DeviceType::OpenVINO:
1 change: 1 addition & 0 deletions src/json.cpp

@@ -50,6 +50,7 @@ void TranslateException(std::string_view name) {
 JSON::JSON(Element& element, std::string_view document) : begin_{document.data()}, end_{document.data() + document.size()} {
   try {
     Parse_Value(element, {});
+    element.OnComplete(false);
   } catch (const std::exception& message) {
     // Figure out line number of error by counting carriage returns seen from start to error location
     int line = 1;
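The added call means an element's OnComplete now fires once the root value has been fully parsed, giving it a well-defined hook to finalize state (for example, applying defaults for keys the document never mentioned). A Python sketch of the callback shape, with hypothetical names standing in for the C++ JSON::Element interface:

import json

class Element:
    """Hypothetical stand-in for the SAX-style JSON::Element interface."""
    def on_value(self, name, value): ...
    def on_complete(self, partial: bool): ...

class ProviderOptions(Element):
    def __init__(self):
        self.values = {}
    def on_value(self, name, value):
        self.values[name] = value
    def on_complete(self, partial):
        # Runs once parsing is finished, mirroring the added OnComplete(false).
        self.values.setdefault("name", "cpu")  # assumed default, for the sketch only

def parse(document: str, element: Element):
    for name, value in json.loads(document).items():
        element.on_value(name, value)
    element.on_complete(False)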
21 changes: 14 additions & 7 deletions src/models/model.cpp

@@ -198,7 +198,7 @@ std::unique_ptr<TokenizerStream> Tokenizer::CreateStream() const {
 
 std::vector<int32_t> Tokenizer::Encode(const char* text) const {
   OrtxPtr<OrtxTokenId2DArray> ids;
-  CheckResult(OrtxTokenize(tokenizer_, &text, 1, ids.Address()));
+  CheckResult(OrtxTokenizeWithOptions(tokenizer_, &text, 1, ids.Address(), false /* add_special_tokens */));
 
   const extTokenId_t* tokens;
   size_t count;
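For comparison, the add_special_tokens flag that OrtxTokenizeWithOptions now pins to false is the same knob Hugging Face tokenizers expose. A quick sketch (not GenAI code; the model id is just a placeholder that happens to add visible special tokens):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder model

text = "hello world"
print(tok.encode(text, add_special_tokens=True))   # wrapped in [CLS] ... [SEP]
print(tok.encode(text, add_special_tokens=False))  # bare subword ids, matching the new Encode() behavior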
@@ -311,8 +311,8 @@ DeviceInterface* SetProviderSessionOptions(OrtSessionOptions& session_options,
 
     Ort::ThrowOnError(Ort::api->UpdateROCMProviderOptions(&ort_provider_options, keys.data(), values.data(), keys.size()));
     session_options.AppendExecutionProvider_ROCM(ort_provider_options);
+  } else if (provider_options.name == "DML") {
 #if USE_DML
-  } else if (provider_options.name == "dml") {
     if (!GetDmlInterface()) {
       LUID device_luid{};
       LUID* p_device_luid{};
@@ ... @@
 
     if (is_primary_session_options)
       p_device = GetDeviceInterface(DeviceType::DML);  // We use a DML allocator for input/output caches, but other tensors will use CPU tensors
+#else
+    throw std::runtime_error("DML provider requested, but the installed GenAI has not been built with DML support");
 #endif
   } else {
     // For providers that go through the extensible AppendExecutionProvider API:
 
     if (provider_options.name == "QNN") {
       session_options.AddConfigEntry("ep.share_ep_contexts", "1");
       // TODO set device_type_ in a less hacky way.

@@ -408,7 +409,7 @@ void EnsureDeviceOrtInit(DeviceInterface& device) {
   // This ensures memory allocated on-device for model inputs/outputs is valid for the lifetime of GenAI.
 
   // Names for the device types used by 'SetProviderSessionOptions'
-  static const char* device_type_names[] = {"CPU (Not used, see above)", "cuda", "dml", "webgpu", "qnn", "OpenVINO (Not used, see above)"};
+  static const char* device_type_names[] = {"CPU (Not used, see above)", "cuda", "DML", "WebGPU", "QNN", "OpenVINO (Not used, see above)"};
   static_assert(std::size(device_type_names) == static_cast<size_t>(DeviceType::MAX));
 
   // Create an OrtSessionOptions and set the options to use the DeviceType we're using here

@@ -738,9 +739,15 @@ std::unique_ptr<OrtValue> Model::ExpandInputs(std::unique_ptr<OrtValue>& input,
   auto expanded = OrtValue::CreateTensor(p_device_inputs_->GetAllocator(), input_shape, element_type);
   auto expanded_span = ByteWrapTensor(*p_device_inputs_, *expanded);
 
-  for (int i = 0; i < batch_size; i++) {
-    for (int j = 0; j < num_beams; j++) {
-      expanded_span.subspan((i * num_beams + j) * data_size_bytes, data_size_bytes).CopyFrom(input_span.subspan(i * data_size_bytes, data_size_bytes));
+  // Detect fast & simple copy case
+  if (num_beams == 1) {
+    expanded_span.CopyFrom(input_span);
+  } else {
+    // TODO (RyanHill): To avoid cuda uninitialized memory warnings, we should copy input_span to device memory first
+    for (int i = 0; i < batch_size; i++) {
+      for (int j = 0; j < num_beams; j++) {
+        expanded_span.subspan((i * num_beams + j) * data_size_bytes, data_size_bytes).CopyFrom(input_span.subspan(i * data_size_bytes, data_size_bytes));
+      }
     }
   }
   return expanded;
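The ExpandInputs hunk adds a fast path: with num_beams == 1 the expansion is an identity copy, so one bulk CopyFrom replaces batch_size redundant sub-span copies. The semantics in NumPy terms (a sketch of what the device-span code computes, not of how it copies):

import numpy as np

def expand_inputs(x: np.ndarray, num_beams: int) -> np.ndarray:
    """Replicate each batch row num_beams times along axis 0."""
    if num_beams == 1:
        return x.copy()  # fast path: a single bulk copy
    # Row i of the input lands at rows i*num_beams .. i*num_beams + num_beams - 1.
    return np.repeat(x, num_beams, axis=0)

x = np.array([[1, 2], [3, 4]])
assert expand_inputs(x, 2).tolist() == [[1, 2], [1, 2], [3, 4], [3, 4]]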
33 changes: 1 addition & 32 deletions src/python/py/models/builder.py

@@ -399,11 +399,6 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir):
             ep_options = { self.ep : self.ep_attrs[self.ep] }
             genai_config["model"]["decoder"]["session_options"]["provider_options"].append(ep_options)
 
-        if self.extra_options.get("include_prompt_templates", False):
-            prompt_templates = self._get_prompt_templates(model_name_or_path, extra_kwargs)
-            if prompt_templates is not None:
-                genai_config["model"]["prompt_templates"] = prompt_templates
-
         print(f"Saving GenAI config in {out_dir}")
         with open(os.path.join(out_dir,"genai_config.json"), "w") as f:
             json.dump(genai_config, f, indent=4)

@@ -412,30 +407,6 @@ def save_processing(self, model_name_or_path, extra_kwargs, out_dir):
         tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token=self.hf_token, trust_remote_code=True, **extra_kwargs)
         print(f"Saving processing files in {out_dir} for GenAI")
         tokenizer.save_pretrained(out_dir)
-
-    def _get_prompt_templates(self, hf_name, extra_kwargs):
-        try:
-            # disable end of sentence padding with eos_token=None
-            tokenizer = AutoTokenizer.from_pretrained(hf_name, token=self.hf_token, trust_remote_code=True, eos_token=None, **extra_kwargs)
-            system_template = tokenizer.apply_chat_template([{'role': 'system', 'content': '{Content}'}], tokenize=False)
-            system_user_template = tokenizer.apply_chat_template([{'role': 'system', 'content': '{Content}'}, {'role': 'user', 'content': '{Content}'}], tokenize=False)
-            system_user_assistant_template = tokenizer.apply_chat_template([{'role': 'system', 'content': '{Content}'}, {'role': 'user', 'content': '{Content}'}, {'role': 'assistant', 'content': '{Content}'}], tokenize=False)
-            assert system_user_template.startswith(system_template), "Chat templates may contain padding tokens, leading to incorrect prompt templates"
-            assert system_user_assistant_template.startswith(system_user_template), "Chat templates may contain padding tokens, leading to incorrect prompt templates"
-            user_template = system_user_template[len(system_template):]
-            assistant_template = system_user_assistant_template[len(system_user_template):]
-            prompt_template = system_user_assistant_template[len(system_template):]
-            prompt_template = prompt_template[:prompt_template.rfind('{Content}')]
-            templates = {
-                "system": system_template,
-                "user": user_template,
-                "assistant": assistant_template,
-                "prompt": prompt_template
-            }
-            return templates
-        except Exception as e:
-            print(f"Failed to get prompt templates. Error: {e}")
-            return None
 
     def save_model(self, out_dir):
         print(f"Saving ONNX model in {out_dir}")

@@ -3273,7 +3244,7 @@ def check_extra_options(kv_pairs):
     """
     Check key-value pairs and set values correctly
     """
-    bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq", "include_prompt_templates"]
+    bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq"]
     for key in bools:
         if key in kv_pairs:
             if kv_pairs[key] in {"false", "False", "0"}:

@@ -3528,8 +3499,6 @@ def get_args():
         Use this option to enable GPUs that do not support FP16 on WebGPU (e.g. GTX 10xx).
         adapter_path = Path to folder on disk containing the adapter files (adapter_config.json and adapter model weights).
         Use this option for LoRA models.
-        include_prompt_templates = Include prompt templates in the GenAI config file. Default is false.
-        Use this option to include per-role prompt templates in the `genai_config.json` file.
         """),
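With prompt templates gone from the builder and from genai_config.json (see the config.h and config.cpp hunks above), per-role prompt construction is left to the tokenizer's chat template at run time. A sketch of how a caller can get the same effect with Hugging Face's apply_chat_template; this is not part of the PR, and the model id is a placeholder:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")  # placeholder

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is ONNX Runtime GenAI?"},
]

# Render the full prompt text, ending with the assistant turn's generation prompt.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)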
2 changes: 1 addition & 1 deletion src/runtime_settings.cpp

@@ -13,7 +13,7 @@ std::string RuntimeSettings::GenerateConfigOverlay() const {
   "session_options": {
     "provider_options": [
       {
-        "webgpu": {
+        "WebGPU": {
           "dawnProcTable": ")";
   constexpr std::string_view webgpu_overlay_post = R"("
         }
1 change: 1 addition & 0 deletions test/c_api_tests.cpp

@@ -37,6 +37,7 @@ TEST(CAPITests, Config) {
   config->SetProviderOption("brainium", "custom_field2", "hello2");
   config->ClearProviders();
   config->AppendProvider("cuda");
+  config->AppendProvider("dml");
 #endif
 }
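The added test line exercises the lowercase "dml" spelling that the config.cpp hunk canonicalizes to "DML". For reference, the same provider-override flow through the Python bindings looks roughly like this (a sketch; the model path is a placeholder, and the API names are as published in the onnxruntime_genai package):

import onnxruntime_genai as og

config = og.Config("path/to/exported/model")  # placeholder model folder
config.clear_providers()
config.append_provider("dml")  # mirrors config->AppendProvider("dml") above
model = og.Model(config)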