From 6199a5f6b4700daddf737e56c0c5555be0371aa6 Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Wed, 30 Apr 2025 16:43:56 -0700 Subject: [PATCH 01/10] update ESRP settings (#1435) Exact same change as https://github.com/microsoft/onnxruntime/pull/24608 in Onnxruntime --- .pipelines/stages/jobs/steps/compliant/esrp_nuget.yml | 11 ++++++----- .../stages/jobs/steps/compliant/win-esrp-dll-step.yml | 11 ++++++----- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/.pipelines/stages/jobs/steps/compliant/esrp_nuget.yml b/.pipelines/stages/jobs/steps/compliant/esrp_nuget.yml index ae1cf2620..c89be986d 100644 --- a/.pipelines/stages/jobs/steps/compliant/esrp_nuget.yml +++ b/.pipelines/stages/jobs/steps/compliant/esrp_nuget.yml @@ -9,11 +9,12 @@ steps: displayName: ${{ parameters.DisplayName }} inputs: ConnectedServiceName: 'esrp_release' - AppRegistrationClientId: '53d54d02-978d-4305-8572-583cf6711c4f' - AppRegistrationTenantId: '72f988bf-86f1-41af-91ab-2d7cd011db47' - AuthAKVName: 'buildkeyvault' - AuthCertName: '53d54d02-SSL-AutoRotate' - AuthSignCertName: '53d54d02-978d-4305-8572-583cf6711c4f' + UseMSIAuthentication: true + AppRegistrationClientId: '62b7cfed-4d25-454f-880e-010dc21455ac' + AppRegistrationTenantId: '975f013f-7f24-47e8-a7d3-abc4752bf346' + EsrpClientId: "53d54d02-978d-4305-8572-583cf6711c4f" + AuthAKVName: 'ortbuildkeyvault' + AuthSignCertName: 'esrpcodesign' FolderPath: ${{ parameters.FolderPath }} Pattern: '*.nupkg' SessionTimeout: 90 diff --git a/.pipelines/stages/jobs/steps/compliant/win-esrp-dll-step.yml b/.pipelines/stages/jobs/steps/compliant/win-esrp-dll-step.yml index a41d8f928..cc8573507 100644 --- a/.pipelines/stages/jobs/steps/compliant/win-esrp-dll-step.yml +++ b/.pipelines/stages/jobs/steps/compliant/win-esrp-dll-step.yml @@ -22,11 +22,12 @@ steps: continueOnError: true inputs: ConnectedServiceName: 'esrp_release' - AppRegistrationClientId: '53d54d02-978d-4305-8572-583cf6711c4f' - AppRegistrationTenantId: '72f988bf-86f1-41af-91ab-2d7cd011db47' - AuthAKVName: 'buildkeyvault' - AuthCertName: '53d54d02-SSL-AutoRotate' - AuthSignCertName: '53d54d02-978d-4305-8572-583cf6711c4f' + UseMSIAuthentication: true + AppRegistrationClientId: '62b7cfed-4d25-454f-880e-010dc21455ac' + AppRegistrationTenantId: '975f013f-7f24-47e8-a7d3-abc4752bf346' + EsrpClientId: "53d54d02-978d-4305-8572-583cf6711c4f" + AuthAKVName: 'ortbuildkeyvault' + AuthSignCertName: 'esrpcodesign' FolderPath: ${{ parameters.FolderPath }} Pattern: ${{ parameters.Pattern }} SessionTimeout: 90 From 35d7fd2512c5a8edb33b9c780c4294f14669b879 Mon Sep 17 00:00:00 2001 From: Guenther Schmuelling Date: Wed, 30 Apr 2025 12:37:49 -0700 Subject: [PATCH 02/10] make WebGPU name consistent (#1434) --- src/generators.cpp | 2 +- src/models/model.cpp | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/generators.cpp b/src/generators.cpp index ed8d82298..80fb7a403 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -207,7 +207,7 @@ std::string to_string(DeviceType device_type) { case DeviceType::DML: return "DirectML"; case DeviceType::WEBGPU: - return "WebGpu"; + return "WebGPU"; case DeviceType::QNN: return "QnnWithSharedMemory"; case DeviceType::OpenVINO: diff --git a/src/models/model.cpp b/src/models/model.cpp index 888a4cdde..82d78d82c 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -341,7 +341,6 @@ DeviceInterface* SetProviderSessionOptions(OrtSessionOptions& session_options, #endif } else { // For providers that go through the extensible AppendExecutionProvider API: - if (provider_options.name == "QNN") { session_options.AddConfigEntry("ep.share_ep_contexts", "1"); // TODO set device_type_ in a less hacky way. @@ -408,7 +407,7 @@ void EnsureDeviceOrtInit(DeviceInterface& device) { // This ensures memory allocated on-device for model inputs/outputs is valid for the lifetime of GenAI. // Names for the device types used by 'SetProviderSessionOptions' - static const char* device_type_names[] = {"CPU (Not used, see above)", "cuda", "dml", "webgpu", "qnn", "OpenVINO (Not used, see above)"}; + static const char* device_type_names[] = {"CPU (Not used, see above)", "cuda", "dml", "WebGPU", "QNN", "OpenVINO (Not used, see above)"}; static_assert(std::size(device_type_names) == static_cast(DeviceType::MAX)); // Create an OrtSessionOptions and set the options to use the DeviceType we're using here From 7cb742ff9034e5d72974771f8e170366af83c652 Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Tue, 29 Apr 2025 23:52:00 -0700 Subject: [PATCH 03/10] Missed an all lowercase "webgpu" string (#1432) This is in code we should deprecate going forward but it's breaking an existing case and this is the quickest fix. --- src/runtime_settings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime_settings.cpp b/src/runtime_settings.cpp index 0fec24e29..dd88b0393 100644 --- a/src/runtime_settings.cpp +++ b/src/runtime_settings.cpp @@ -13,7 +13,7 @@ std::string RuntimeSettings::GenerateConfigOverlay() const { "session_options": { "provider_options": [ { - "webgpu": { + "WebGPU": { "dawnProcTable": ")"; constexpr std::string_view webgpu_overlay_post = R"(" } From 100b3973c2f26e27d2b56cbea0f82967a03e7ebf Mon Sep 17 00:00:00 2001 From: RyanUnderhill <38674843+RyanUnderhill@users.noreply.github.com> Date: Wed, 30 Apr 2025 17:20:17 -0700 Subject: [PATCH 04/10] Cherrypick and update VERSION_INFO --- VERSION_INFO | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION_INFO b/VERSION_INFO index edb83cfbf..165f1af50 100644 --- a/VERSION_INFO +++ b/VERSION_INFO @@ -1 +1 @@ -0.8.0-rc1 \ No newline at end of file +0.8.0-rc2 \ No newline at end of file From cce95c2474f20cd2fe111c2e3517e6dcb8073980 Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Fri, 2 May 2025 14:15:12 -0700 Subject: [PATCH 05/10] Apply provider name backwards compatibility at runtime (#1440) Add backwards compatibility for dml->DML Give a runtime error if DML chosen but if we're not built with DML (otherwise it goes into Ort and it will use DML even if GenAI isn't built for it) --- src/config.cpp | 4 +++- src/json.cpp | 1 + src/models/model.cpp | 18 +++++++++++++----- test/c_api_tests.cpp | 1 + 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index c0afd1ad3..225a080ba 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -65,6 +65,8 @@ struct ProviderOptionsArray_Element : JSON::Element { v.name = "QNN"; } else if (v.name == "webgpu") { v.name = "WebGPU"; + } else if (v.name == "dml") { + v.name = "DML"; } } } @@ -768,7 +770,7 @@ bool IsGraphCaptureEnabled(Config::SessionOptions& session_options) { throw std::runtime_error("Graph Capture is currently unsupported for CUDA"); } } - } else if (provider_options.name == "dml") { + } else if (provider_options.name == "DML") { return true; } else if (provider_options.name == "NvTensorRtRtx") { return true; diff --git a/src/json.cpp b/src/json.cpp index 73449eda1..6907b7f26 100644 --- a/src/json.cpp +++ b/src/json.cpp @@ -50,6 +50,7 @@ void TranslateException(std::string_view name) { JSON::JSON(Element& element, std::string_view document) : begin_{document.data()}, end_{document.data() + document.size()} { try { Parse_Value(element, {}); + element.OnComplete(false); } catch (const std::exception& message) { // Figure out line number of error by counting carriage returns seen from start to error location int line = 1; diff --git a/src/models/model.cpp b/src/models/model.cpp index 82d78d82c..4532ce7c7 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -311,8 +311,8 @@ DeviceInterface* SetProviderSessionOptions(OrtSessionOptions& session_options, Ort::ThrowOnError(Ort::api->UpdateROCMProviderOptions(&ort_provider_options, keys.data(), values.data(), keys.size())); session_options.AppendExecutionProvider_ROCM(ort_provider_options); + } else if (provider_options.name == "DML") { #if USE_DML - } else if (provider_options.name == "dml") { if (!GetDmlInterface()) { LUID device_luid{}; LUID* p_device_luid{}; @@ -338,6 +338,8 @@ DeviceInterface* SetProviderSessionOptions(OrtSessionOptions& session_options, if (is_primary_session_options) p_device = GetDeviceInterface(DeviceType::DML); // We use a DML allocator for input/output caches, but other tensors will use CPU tensors +#else + throw std::runtime_error("DML provider requested, but the installed GenAI has not been built with DML support"); #endif } else { // For providers that go through the extensible AppendExecutionProvider API: @@ -407,7 +409,7 @@ void EnsureDeviceOrtInit(DeviceInterface& device) { // This ensures memory allocated on-device for model inputs/outputs is valid for the lifetime of GenAI. // Names for the device types used by 'SetProviderSessionOptions' - static const char* device_type_names[] = {"CPU (Not used, see above)", "cuda", "dml", "WebGPU", "QNN", "OpenVINO (Not used, see above)"}; + static const char* device_type_names[] = {"CPU (Not used, see above)", "cuda", "DML", "WebGPU", "QNN", "OpenVINO (Not used, see above)"}; static_assert(std::size(device_type_names) == static_cast(DeviceType::MAX)); // Create an OrtSessionOptions and set the options to use the DeviceType we're using here @@ -737,9 +739,15 @@ std::unique_ptr Model::ExpandInputs(std::unique_ptr& input, auto expanded = OrtValue::CreateTensor(p_device_inputs_->GetAllocator(), input_shape, element_type); auto expanded_span = ByteWrapTensor(*p_device_inputs_, *expanded); - for (int i = 0; i < batch_size; i++) { - for (int j = 0; j < num_beams; j++) { - expanded_span.subspan((i * num_beams + j) * data_size_bytes, data_size_bytes).CopyFrom(input_span.subspan(i * data_size_bytes, data_size_bytes)); + // Detect fast & simple copy case + if (num_beams == 1) { + expanded_span.CopyFrom(input_span); + } else { + // TODO (RyanHill): To avoid cuda uninitialized memory warnings, we should copy input_span to device memory first + for (int i = 0; i < batch_size; i++) { + for (int j = 0; j < num_beams; j++) { + expanded_span.subspan((i * num_beams + j) * data_size_bytes, data_size_bytes).CopyFrom(input_span.subspan(i * data_size_bytes, data_size_bytes)); + } } } return expanded; diff --git a/test/c_api_tests.cpp b/test/c_api_tests.cpp index f4dbc8061..bfe622c62 100644 --- a/test/c_api_tests.cpp +++ b/test/c_api_tests.cpp @@ -37,6 +37,7 @@ TEST(CAPITests, Config) { config->SetProviderOption("brainium", "custom_field2", "hello2"); config->ClearProviders(); config->AppendProvider("cuda"); + config->AppendProvider("dml"); #endif } From 27448b1e08a6c74e5ac1e55cbea2560a3ee82457 Mon Sep 17 00:00:00 2001 From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com> Date: Tue, 6 May 2025 08:54:52 -0700 Subject: [PATCH 06/10] Update Extensions Commit to Support Chat Template Override for Unsupported Models (#1452) --- cmake/deps.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/deps.txt b/cmake/deps.txt index 9d706bef4..21931948c 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -14,4 +14,4 @@ pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f78029 googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e -onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;d1daadcb53a80645b3d96218e4713f24c12dfaf0 +onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;cb00b43f05409d6f70cc558f52fcff0c7e386a97 From 43a50cecac7e35aa9b34381a0629f26cf1af9a82 Mon Sep 17 00:00:00 2001 From: Baiju Meswani Date: Thu, 1 May 2025 14:20:42 -0700 Subject: [PATCH 07/10] Sign macos binaries (#1439) --- .../stages/jobs/steps/capi-macos-step.yml | 33 +++++++++++++++++ .../steps/compliant/mac-esrp-archive-step.yml | 35 +++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 .pipelines/stages/jobs/steps/compliant/mac-esrp-archive-step.yml diff --git a/.pipelines/stages/jobs/steps/capi-macos-step.yml b/.pipelines/stages/jobs/steps/capi-macos-step.yml index abf936233..78d5c9ce9 100644 --- a/.pipelines/stages/jobs/steps/capi-macos-step.yml +++ b/.pipelines/stages/jobs/steps/capi-macos-step.yml @@ -50,6 +50,39 @@ steps: displayName: 'Package C/C++ API' workingDirectory: '$(Build.Repository.LocalPath)' + - bash: | + set -e -x + ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package + tar -xvzf onnxruntime-genai-*.tar.gz -C . + ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package + find . -maxdepth 1 -type d -name 'onnxruntime-genai-*' -exec zip -FSr --symlinks {}.zip {} \; + ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package + rm -rf $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package/onnxruntime-genai-*.tar.gz + ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package + find $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package -maxdepth 1 -type d -name 'onnxruntime-genai-*' -exec rm -rf {} + + ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package + displayName: 'Convert from .tar.gz to .zip' + workingDirectory: '$(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package' + + - template: compliant/mac-esrp-archive-step.yml + parameters: + FolderPath: '$(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package' + Pattern: '*.zip' + + - bash: | + set -e -x + ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package + find . -name 'onnxruntime-genai-*.zip' -exec unzip {} \; + ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package + find . -maxdepth 1 -type d -name 'onnxruntime-genai-*' -exec tar -czvf {}.tar.gz {} \; + ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package + rm -rf $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package/onnxruntime-genai-*.zip + ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package + find $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package -maxdepth 1 -type d -name 'onnxruntime-genai-*' -exec rm -rf {} + + ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package + displayName: 'Convert from .zip to .tar.gz' + workingDirectory: '$(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package' + - task: 1ES.PublishPipelineArtifact@1 displayName: 'Publish Artifact: ONNXRuntime Genai capi' inputs: diff --git a/.pipelines/stages/jobs/steps/compliant/mac-esrp-archive-step.yml b/.pipelines/stages/jobs/steps/compliant/mac-esrp-archive-step.yml new file mode 100644 index 000000000..a31cfca1d --- /dev/null +++ b/.pipelines/stages/jobs/steps/compliant/mac-esrp-archive-step.yml @@ -0,0 +1,35 @@ +parameters: +- name: FolderPath + type: string + default: '' + +- name: Pattern + type: string + default: '*.zip' + +steps: +- task: SFP.build-tasks.custom-build-task-1.EsrpCodeSigning@5 + displayName: 'ESRP CodeSigning' + inputs: + ConnectedServiceName: 'esrp_release' + UseMSIAuthentication: true + AppRegistrationClientId: '62b7cfed-4d25-454f-880e-010dc21455ac' + AppRegistrationTenantId: '975f013f-7f24-47e8-a7d3-abc4752bf346' + EsrpClientId: "53d54d02-978d-4305-8572-583cf6711c4f" + AuthAKVName: 'ortbuildkeyvault' + AuthSignCertName: 'esrpcodesign' + FolderPath: ${{ parameters.FolderPath }} + Pattern: ${{ parameters.Pattern }} + SessionTimeout: 90 + ServiceEndpointUrl: 'https://api.esrp.microsoft.com/api/v2' + MaxConcurrency: 25 + signConfigType: inlineSignParams + inlineOperation: | + [ + { + "keyCode": "CP-401337-Apple", + "operationSetCode": "MacAppDeveloperSign", + "toolName": "sign", + "toolVersion": "6.2.9304.0" + } + ] From 017262129e86bff464deaf7e3eb7df3f9d94320b Mon Sep 17 00:00:00 2001 From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com> Date: Fri, 2 May 2025 16:57:22 -0700 Subject: [PATCH 08/10] Set `add_special_tokens` to false by default in Encode (#1442) Sets `add_special_tokens` from `OrtxTokenizeWithOptions` added in https://github.com/microsoft/onnxruntime-extensions/pull/940 to false to solve chat template issue in GenAI with extra BOS tokens. See https://github.com/huggingface/transformers/issues/37686 for more context. --------- Co-authored-by: Sayan Shaw Co-authored-by: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> --- cmake/deps.txt | 2 +- src/csharp/NativeMethods.cs | 1 - src/models/model.cpp | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cmake/deps.txt b/cmake/deps.txt index 21931948c..faf47d240 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -14,4 +14,4 @@ pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f78029 googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e -onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;cb00b43f05409d6f70cc558f52fcff0c7e386a97 +onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;fc004859e82241e99d458a90d2a39d400050cc59 diff --git a/src/csharp/NativeMethods.cs b/src/csharp/NativeMethods.cs index 58f07939f..b0e9008f7 100644 --- a/src/csharp/NativeMethods.cs +++ b/src/csharp/NativeMethods.cs @@ -186,7 +186,6 @@ public static extern UIntPtr OgaSequencesGetSequenceCount(IntPtr /* const OgaSeq byte[] /* const char* */ strings, IntPtr /* OgaSequences* */ sequences); - // This function is used to decode the given token into a string. The caller is responsible for freeing the // returned string using the OgaDestroyString function when it is no longer needed. [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] diff --git a/src/models/model.cpp b/src/models/model.cpp index 4532ce7c7..4ccf866c8 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -198,7 +198,7 @@ std::unique_ptr Tokenizer::CreateStream() const { std::vector Tokenizer::Encode(const char* text) const { OrtxPtr ids; - CheckResult(OrtxTokenize(tokenizer_, &text, 1, ids.Address())); + CheckResult(OrtxTokenizeWithOptions(tokenizer_, &text, 1, ids.Address(), false /* add_special_tokens */)); const extTokenId_t* tokens; size_t count; From 18d52c3fb016a5656b3cd36c4b3090d6ae603524 Mon Sep 17 00:00:00 2001 From: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> Date: Mon, 5 May 2025 14:00:07 -0700 Subject: [PATCH 09/10] Remove prompt templates from GenAI config (#1445) ### Description This PR removes generating prompt templates and storing them in the GenAI config. ### Motivation and Context The prompt templates are not used internally anymore. --- src/config.cpp | 36 --------------------------------- src/config.h | 8 -------- src/python/py/models/builder.py | 33 +----------------------------- 3 files changed, 1 insertion(+), 76 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index 225a080ba..4e346d1ac 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -589,38 +589,6 @@ struct Embedding_Element : JSON::Element { EmbeddingOutputs_Element outputs_{v_.outputs}; }; -struct PromptTemplates_Element : JSON::Element { - explicit PromptTemplates_Element(std::optional& v) : v_{v} {} - - void OnValue(std::string_view name, JSON::Value value) override { - // if one of templates is given in json, then any non-specified template will be default "{Content}" - if (name == "assistant") { - EnsureAvailable(); - v_->assistant = JSON::Get(value); - } else if (name == "prompt") { - EnsureAvailable(); - v_->prompt = JSON::Get(value); - } else if (name == "system") { - EnsureAvailable(); - v_->system = JSON::Get(value); - } else if (name == "user") { - EnsureAvailable(); - v_->user = JSON::Get(value); - } else { - throw JSON::unknown_value_error{}; - } - } - - private: - std::optional& v_; - - void EnsureAvailable() { - if (!v_.has_value()) { - v_.emplace(); - } - } -}; - struct Model_Element : JSON::Element { explicit Model_Element(Config::Model& v) : v_{v} {} @@ -664,9 +632,6 @@ struct Model_Element : JSON::Element { if (name == "embedding") { return embedding_; } - if (name == "prompt_templates") { - return prompt_templates_; - } if (name == "speech") { return speech_; } @@ -680,7 +645,6 @@ struct Model_Element : JSON::Element { Eos_Array_Element eos_token_ids_{v_}; Vision_Element vision_{v_.vision}; Embedding_Element embedding_{v_.embedding}; - PromptTemplates_Element prompt_templates_{v_.prompt_templates}; Speech_Element speech_{v_.speech}; }; diff --git a/src/config.h b/src/config.h index baf4d7013..ea18bd11c 100644 --- a/src/config.h +++ b/src/config.h @@ -25,7 +25,6 @@ struct Config { static constexpr std::string_view InputsEmbedsName = "inputs_embeds"; static constexpr std::string_view CurrentSequenceLengthName = "current_sequence_length"; static constexpr std::string_view PastSequenceLengthName = "past_sequence_length"; - static constexpr std::string_view promptTemplate = "{Content}"; static constexpr std::string_view TotalSequenceLengthName = "total_sequence_length"; static constexpr std::string_view TokenTypeIdsName = "token_type_ids"; @@ -206,13 +205,6 @@ struct Config { } decoder; - struct PromptTemplates { - std::string assistant{Defaults::promptTemplate}; - std::string prompt{Defaults::promptTemplate}; - std::string system{Defaults::promptTemplate}; - std::string user{Defaults::promptTemplate}; - }; - std::optional prompt_templates; } model; struct Search { diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 92fd815cd..5e274f394 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -399,11 +399,6 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir): ep_options = { self.ep : self.ep_attrs[self.ep] } genai_config["model"]["decoder"]["session_options"]["provider_options"].append(ep_options) - if self.extra_options.get("include_prompt_templates", False): - prompt_templates = self._get_prompt_templates(model_name_or_path, extra_kwargs) - if prompt_templates is not None: - genai_config["model"]["prompt_templates"] = prompt_templates - print(f"Saving GenAI config in {out_dir}") with open(os.path.join(out_dir,"genai_config.json"), "w") as f: json.dump(genai_config, f, indent=4) @@ -412,30 +407,6 @@ def save_processing(self, model_name_or_path, extra_kwargs, out_dir): tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token=self.hf_token, trust_remote_code=True, **extra_kwargs) print(f"Saving processing files in {out_dir} for GenAI") tokenizer.save_pretrained(out_dir) - - def _get_prompt_templates(self, hf_name, extra_kwargs): - try: - # disable end of sentence padding with eos_token=None - tokenizer = AutoTokenizer.from_pretrained(hf_name, token=self.hf_token, trust_remote_code=True, eos_token=None, **extra_kwargs) - system_template = tokenizer.apply_chat_template([{'role': 'system', 'content': '{Content}'}], tokenize=False) - system_user_template = tokenizer.apply_chat_template([{'role': 'system', 'content': '{Content}'}, {'role': 'user', 'content': '{Content}'}], tokenize=False) - system_user_assistant_template = tokenizer.apply_chat_template([{'role': 'system', 'content': '{Content}'}, {'role': 'user', 'content': '{Content}'}, {'role': 'assistant', 'content': '{Content}'}], tokenize=False) - assert system_user_template.startswith(system_template), "Chat templates may contain padding tokens, leading to incorrect prompt templates" - assert system_user_assistant_template.startswith(system_user_template), "Chat templates may contain padding tokens, leading to incorrect prompt templates" - user_template = system_user_template[len(system_template):] - assistant_template = system_user_assistant_template[len(system_user_template):] - prompt_template = system_user_assistant_template[len(system_template):] - prompt_template = prompt_template[:prompt_template.rfind('{Content}')] - templates = { - "system": system_template, - "user": user_template, - "assistant": assistant_template, - "prompt": prompt_template - } - return templates - except Exception as e: - print(f"Failed to get prompt templates. Error: {e}") - return None def save_model(self, out_dir): print(f"Saving ONNX model in {out_dir}") @@ -3273,7 +3244,7 @@ def check_extra_options(kv_pairs): """ Check key-value pairs and set values correctly """ - bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq", "include_prompt_templates"] + bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq"] for key in bools: if key in kv_pairs: if kv_pairs[key] in {"false", "False", "0"}: @@ -3528,8 +3499,6 @@ def get_args(): Use this option to enable GPUs that do not support FP16 on WebGPU (e.g. GTX 10xx). adapter_path = Path to folder on disk containing the adapter files (adapter_config.json and adapter model weights). Use this option for LoRA models. - include_prompt_templates = Include prompt templates in the GenAI config file. Default is false. - Use this option to include per-role prompt templates in the `genai_config.json` file. """), ) From 6c04a61981b2ab63364057ebb001cc09bf4d4093 Mon Sep 17 00:00:00 2001 From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com> Date: Tue, 6 May 2025 08:54:52 -0700 Subject: [PATCH 10/10] Update Extensions Commit to Support Chat Template Override for Unsupported Models (#1452) --- cmake/deps.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/deps.txt b/cmake/deps.txt index faf47d240..21931948c 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -14,4 +14,4 @@ pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f78029 googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e -onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;fc004859e82241e99d458a90d2a39d400050cc59 +onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;cb00b43f05409d6f70cc558f52fcff0c7e386a97