From 6199a5f6b4700daddf737e56c0c5555be0371aa6 Mon Sep 17 00:00:00 2001
From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com>
Date: Wed, 30 Apr 2025 16:43:56 -0700
Subject: [PATCH 01/10] update ESRP settings (#1435)

Exact same change as https://github.com/microsoft/onnxruntime/pull/24608
in Onnxruntime
---
 .pipelines/stages/jobs/steps/compliant/esrp_nuget.yml | 11 ++++++-----
 .../stages/jobs/steps/compliant/win-esrp-dll-step.yml | 11 ++++++-----
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/.pipelines/stages/jobs/steps/compliant/esrp_nuget.yml b/.pipelines/stages/jobs/steps/compliant/esrp_nuget.yml
index ae1cf2620..c89be986d 100644
--- a/.pipelines/stages/jobs/steps/compliant/esrp_nuget.yml
+++ b/.pipelines/stages/jobs/steps/compliant/esrp_nuget.yml
@@ -9,11 +9,12 @@ steps:
     displayName: ${{ parameters.DisplayName }}
     inputs:
       ConnectedServiceName: 'esrp_release'
-      AppRegistrationClientId: '53d54d02-978d-4305-8572-583cf6711c4f'
-      AppRegistrationTenantId: '72f988bf-86f1-41af-91ab-2d7cd011db47'
-      AuthAKVName: 'buildkeyvault'
-      AuthCertName: '53d54d02-SSL-AutoRotate'
-      AuthSignCertName: '53d54d02-978d-4305-8572-583cf6711c4f'
+      UseMSIAuthentication: true
+      AppRegistrationClientId: '62b7cfed-4d25-454f-880e-010dc21455ac'
+      AppRegistrationTenantId: '975f013f-7f24-47e8-a7d3-abc4752bf346'
+      EsrpClientId: "53d54d02-978d-4305-8572-583cf6711c4f"
+      AuthAKVName: 'ortbuildkeyvault'
+      AuthSignCertName: 'esrpcodesign'
       FolderPath: ${{ parameters.FolderPath }}
       Pattern: '*.nupkg'
       SessionTimeout: 90
diff --git a/.pipelines/stages/jobs/steps/compliant/win-esrp-dll-step.yml b/.pipelines/stages/jobs/steps/compliant/win-esrp-dll-step.yml
index a41d8f928..cc8573507 100644
--- a/.pipelines/stages/jobs/steps/compliant/win-esrp-dll-step.yml
+++ b/.pipelines/stages/jobs/steps/compliant/win-esrp-dll-step.yml
@@ -22,11 +22,12 @@ steps:
   continueOnError: true
   inputs:
     ConnectedServiceName: 'esrp_release'
-    AppRegistrationClientId: '53d54d02-978d-4305-8572-583cf6711c4f'
-    AppRegistrationTenantId: '72f988bf-86f1-41af-91ab-2d7cd011db47'
-    AuthAKVName: 'buildkeyvault'
-    AuthCertName: '53d54d02-SSL-AutoRotate'
-    AuthSignCertName: '53d54d02-978d-4305-8572-583cf6711c4f'
+    UseMSIAuthentication: true
+    AppRegistrationClientId: '62b7cfed-4d25-454f-880e-010dc21455ac'
+    AppRegistrationTenantId: '975f013f-7f24-47e8-a7d3-abc4752bf346'
+    EsrpClientId: "53d54d02-978d-4305-8572-583cf6711c4f"
+    AuthAKVName: 'ortbuildkeyvault'
+    AuthSignCertName: 'esrpcodesign'
     FolderPath: ${{ parameters.FolderPath }}
     Pattern: ${{ parameters.Pattern }}
     SessionTimeout: 90

From 35d7fd2512c5a8edb33b9c780c4294f14669b879 Mon Sep 17 00:00:00 2001
From: Guenther Schmuelling <guschmue@microsoft.com>
Date: Wed, 30 Apr 2025 12:37:49 -0700
Subject: [PATCH 02/10] make WebGPU name consistent (#1434)

---
 src/generators.cpp   | 2 +-
 src/models/model.cpp | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/generators.cpp b/src/generators.cpp
index ed8d82298..80fb7a403 100644
--- a/src/generators.cpp
+++ b/src/generators.cpp
@@ -207,7 +207,7 @@ std::string to_string(DeviceType device_type) {
     case DeviceType::DML:
       return "DirectML";
     case DeviceType::WEBGPU:
-      return "WebGpu";
+      return "WebGPU";
     case DeviceType::QNN:
       return "QnnWithSharedMemory";
     case DeviceType::OpenVINO:
diff --git a/src/models/model.cpp b/src/models/model.cpp
index 888a4cdde..82d78d82c 100644
--- a/src/models/model.cpp
+++ b/src/models/model.cpp
@@ -341,7 +341,6 @@ DeviceInterface* SetProviderSessionOptions(OrtSessionOptions& session_options,
 #endif
     } else {
       // For providers that go through the extensible AppendExecutionProvider API:
-
       if (provider_options.name == "QNN") {
         session_options.AddConfigEntry("ep.share_ep_contexts", "1");
         // TODO set device_type_ in a less hacky way.
@@ -408,7 +407,7 @@ void EnsureDeviceOrtInit(DeviceInterface& device) {
   // This ensures memory allocated on-device for model inputs/outputs is valid for the lifetime of GenAI.
 
   // Names for the device types used by 'SetProviderSessionOptions'
-  static const char* device_type_names[] = {"CPU (Not used, see above)", "cuda", "dml", "webgpu", "qnn", "OpenVINO (Not used, see above)"};
+  static const char* device_type_names[] = {"CPU (Not used, see above)", "cuda", "dml", "WebGPU", "QNN", "OpenVINO (Not used, see above)"};
   static_assert(std::size(device_type_names) == static_cast<size_t>(DeviceType::MAX));
 
   // Create an OrtSessionOptions and set the options to use the DeviceType we're using here

From 7cb742ff9034e5d72974771f8e170366af83c652 Mon Sep 17 00:00:00 2001
From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com>
Date: Tue, 29 Apr 2025 23:52:00 -0700
Subject: [PATCH 03/10] Missed an all lowercase "webgpu" string (#1432)

This is in code we should deprecate going forward, but it's breaking an
existing case and this is the quickest fix.
---
 src/runtime_settings.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/runtime_settings.cpp b/src/runtime_settings.cpp
index 0fec24e29..dd88b0393 100644
--- a/src/runtime_settings.cpp
+++ b/src/runtime_settings.cpp
@@ -13,7 +13,7 @@ std::string RuntimeSettings::GenerateConfigOverlay() const {
       "session_options": {
         "provider_options": [
           {
-            "webgpu": {
+            "WebGPU": {
               "dawnProcTable": ")";
   constexpr std::string_view webgpu_overlay_post = R"("
             }

From 100b3973c2f26e27d2b56cbea0f82967a03e7ebf Mon Sep 17 00:00:00 2001
From: RyanUnderhill <38674843+RyanUnderhill@users.noreply.github.com>
Date: Wed, 30 Apr 2025 17:20:17 -0700
Subject: [PATCH 04/10] Cherrypick and update VERSION_INFO

---
 VERSION_INFO | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/VERSION_INFO b/VERSION_INFO
index edb83cfbf..165f1af50 100644
--- a/VERSION_INFO
+++ b/VERSION_INFO
@@ -1 +1 @@
-0.8.0-rc1
\ No newline at end of file
+0.8.0-rc2
\ No newline at end of file

From cce95c2474f20cd2fe111c2e3517e6dcb8073980 Mon Sep 17 00:00:00 2001
From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com>
Date: Fri, 2 May 2025 14:15:12 -0700
Subject: [PATCH 05/10] Apply provider name backwards compatibility at runtime
 (#1440)

Add backwards compatibility for dml->DML.
Give a runtime error if DML is chosen but GenAI was not built with DML
(otherwise the request goes into Ort, which will use DML even if GenAI
isn't built for it).
---
 src/config.cpp       |  4 +++-
 src/json.cpp         |  1 +
 src/models/model.cpp | 18 +++++++++++++-----
 test/c_api_tests.cpp |  1 +
 4 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/config.cpp b/src/config.cpp
index c0afd1ad3..225a080ba 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -65,6 +65,8 @@ struct ProviderOptionsArray_Element : JSON::Element {
         v.name = "QNN";
       } else if (v.name == "webgpu") {
         v.name = "WebGPU";
+      } else if (v.name == "dml") {
+        v.name = "DML";
       }
     }
   }
@@ -768,7 +770,7 @@ bool IsGraphCaptureEnabled(Config::SessionOptions& session_options) {
           throw std::runtime_error("Graph Capture is currently unsupported for CUDA");
         }
       }
-    } else if (provider_options.name == "dml") {
+    } else if (provider_options.name == "DML") {
       return true;
     } else if (provider_options.name == "NvTensorRtRtx") {
       return true;
diff --git a/src/json.cpp b/src/json.cpp
index 73449eda1..6907b7f26 100644
--- a/src/json.cpp
+++ b/src/json.cpp
@@ -50,6 +50,7 @@ void TranslateException(std::string_view name) {
 JSON::JSON(Element& element, std::string_view document) : begin_{document.data()}, end_{document.data() + document.size()} {
   try {
     Parse_Value(element, {});
+    element.OnComplete(false);
   } catch (const std::exception& message) {
     // Figure out line number of error by counting carriage returns seen from start to error location
     int line = 1;
diff --git a/src/models/model.cpp b/src/models/model.cpp
index 82d78d82c..4532ce7c7 100644
--- a/src/models/model.cpp
+++ b/src/models/model.cpp
@@ -311,8 +311,8 @@ DeviceInterface* SetProviderSessionOptions(OrtSessionOptions& session_options,
 
       Ort::ThrowOnError(Ort::api->UpdateROCMProviderOptions(&ort_provider_options, keys.data(), values.data(), keys.size()));
       session_options.AppendExecutionProvider_ROCM(ort_provider_options);
+    } else if (provider_options.name == "DML") {
 #if USE_DML
-    } else if (provider_options.name == "dml") {
       if (!GetDmlInterface()) {
         LUID device_luid{};
         LUID* p_device_luid{};
@@ -338,6 +338,8 @@ DeviceInterface* SetProviderSessionOptions(OrtSessionOptions& session_options,
 
       if (is_primary_session_options)
         p_device = GetDeviceInterface(DeviceType::DML);  // We use a DML allocator for input/output caches, but other tensors will use CPU tensors
+#else
+      throw std::runtime_error("DML provider requested, but the installed GenAI has not been built with DML support");
 #endif
     } else {
       // For providers that go through the extensible AppendExecutionProvider API:
@@ -407,7 +409,7 @@ void EnsureDeviceOrtInit(DeviceInterface& device) {
   // This ensures memory allocated on-device for model inputs/outputs is valid for the lifetime of GenAI.
 
   // Names for the device types used by 'SetProviderSessionOptions'
-  static const char* device_type_names[] = {"CPU (Not used, see above)", "cuda", "dml", "WebGPU", "QNN", "OpenVINO (Not used, see above)"};
+  static const char* device_type_names[] = {"CPU (Not used, see above)", "cuda", "DML", "WebGPU", "QNN", "OpenVINO (Not used, see above)"};
   static_assert(std::size(device_type_names) == static_cast<size_t>(DeviceType::MAX));
 
   // Create an OrtSessionOptions and set the options to use the DeviceType we're using here
@@ -737,9 +739,15 @@ std::unique_ptr<OrtValue> Model::ExpandInputs(std::unique_ptr<OrtValue>& input,
   auto expanded = OrtValue::CreateTensor(p_device_inputs_->GetAllocator(), input_shape, element_type);
   auto expanded_span = ByteWrapTensor(*p_device_inputs_, *expanded);
 
-  for (int i = 0; i < batch_size; i++) {
-    for (int j = 0; j < num_beams; j++) {
-      expanded_span.subspan((i * num_beams + j) * data_size_bytes, data_size_bytes).CopyFrom(input_span.subspan(i * data_size_bytes, data_size_bytes));
+  // Detect fast & simple copy case
+  if (num_beams == 1) {
+    expanded_span.CopyFrom(input_span);
+  } else {
+    // TODO (RyanHill): To avoid cuda uninitialized memory warnings, we should copy input_span to device memory first
+    for (int i = 0; i < batch_size; i++) {
+      for (int j = 0; j < num_beams; j++) {
+        expanded_span.subspan((i * num_beams + j) * data_size_bytes, data_size_bytes).CopyFrom(input_span.subspan(i * data_size_bytes, data_size_bytes));
+      }
     }
   }
   return expanded;
diff --git a/test/c_api_tests.cpp b/test/c_api_tests.cpp
index f4dbc8061..bfe622c62 100644
--- a/test/c_api_tests.cpp
+++ b/test/c_api_tests.cpp
@@ -37,6 +37,7 @@ TEST(CAPITests, Config) {
   config->SetProviderOption("brainium", "custom_field2", "hello2");
   config->ClearProviders();
   config->AppendProvider("cuda");
+  config->AppendProvider("dml");
 #endif
 }
 

From 27448b1e08a6c74e5ac1e55cbea2560a3ee82457 Mon Sep 17 00:00:00 2001
From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com>
Date: Tue, 6 May 2025 08:54:52 -0700
Subject: [PATCH 06/10] Update Extensions Commit to Support Chat Template
 Override for Unsupported Models (#1452)

---
 cmake/deps.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/deps.txt b/cmake/deps.txt
index 9d706bef4..21931948c 100644
--- a/cmake/deps.txt
+++ b/cmake/deps.txt
@@ -14,4 +14,4 @@ pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f78029
 googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034
 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
-onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;d1daadcb53a80645b3d96218e4713f24c12dfaf0
+onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;cb00b43f05409d6f70cc558f52fcff0c7e386a97

From 43a50cecac7e35aa9b34381a0629f26cf1af9a82 Mon Sep 17 00:00:00 2001
From: Baiju Meswani <bmeswani@microsoft.com>
Date: Thu, 1 May 2025 14:20:42 -0700
Subject: [PATCH 07/10] Sign macos binaries (#1439)

---
 .../stages/jobs/steps/capi-macos-step.yml     | 33 +++++++++++++++++
 .../steps/compliant/mac-esrp-archive-step.yml | 35 +++++++++++++++++++
 2 files changed, 68 insertions(+)
 create mode 100644 .pipelines/stages/jobs/steps/compliant/mac-esrp-archive-step.yml

diff --git a/.pipelines/stages/jobs/steps/capi-macos-step.yml b/.pipelines/stages/jobs/steps/capi-macos-step.yml
index abf936233..78d5c9ce9 100644
--- a/.pipelines/stages/jobs/steps/capi-macos-step.yml
+++ b/.pipelines/stages/jobs/steps/capi-macos-step.yml
@@ -50,6 +50,39 @@ steps:
     displayName: 'Package C/C++ API'
     workingDirectory: '$(Build.Repository.LocalPath)'
 
+  - bash: |
+      set -e -x
+      ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package
+      tar -xvzf onnxruntime-genai-*.tar.gz -C .
+      ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package
+      find . -maxdepth 1 -type d -name 'onnxruntime-genai-*' -exec zip -FSr --symlinks {}.zip {} \;
+      ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package
+      rm -rf $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package/onnxruntime-genai-*.tar.gz
+      ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package
+      find $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package -maxdepth 1 -type d -name 'onnxruntime-genai-*' -exec rm -rf {} +
+      ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package
+    displayName: 'Convert from .tar.gz to .zip'
+    workingDirectory: '$(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package'
+
+  - template: compliant/mac-esrp-archive-step.yml
+    parameters:
+      FolderPath: '$(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package'
+      Pattern: '*.zip'
+
+  - bash: |
+      set -e -x
+      ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package
+      find . -name 'onnxruntime-genai-*.zip' -exec unzip {} \;
+      ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package
+      find . -maxdepth 1 -type d -name 'onnxruntime-genai-*' -exec tar -czvf {}.tar.gz {} \;
+      ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package
+      rm -rf $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package/onnxruntime-genai-*.zip
+      ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package
+      find $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package -maxdepth 1 -type d -name 'onnxruntime-genai-*' -exec rm -rf {} +
+      ls -al $(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package
+    displayName: 'Convert from .zip to .tar.gz'
+    workingDirectory: '$(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package'
+
   - task: 1ES.PublishPipelineArtifact@1
     displayName: 'Publish Artifact: ONNXRuntime Genai capi'
     inputs:
diff --git a/.pipelines/stages/jobs/steps/compliant/mac-esrp-archive-step.yml b/.pipelines/stages/jobs/steps/compliant/mac-esrp-archive-step.yml
new file mode 100644
index 000000000..a31cfca1d
--- /dev/null
+++ b/.pipelines/stages/jobs/steps/compliant/mac-esrp-archive-step.yml
@@ -0,0 +1,35 @@
+parameters:
+- name: FolderPath
+  type: string
+  default: ''
+
+- name: Pattern
+  type: string
+  default: '*.zip'
+
+steps:
+- task: SFP.build-tasks.custom-build-task-1.EsrpCodeSigning@5
+  displayName: 'ESRP CodeSigning'
+  inputs:
+    ConnectedServiceName: 'esrp_release'
+    UseMSIAuthentication: true
+    AppRegistrationClientId: '62b7cfed-4d25-454f-880e-010dc21455ac'
+    AppRegistrationTenantId: '975f013f-7f24-47e8-a7d3-abc4752bf346'
+    EsrpClientId: "53d54d02-978d-4305-8572-583cf6711c4f"
+    AuthAKVName: 'ortbuildkeyvault'
+    AuthSignCertName: 'esrpcodesign'
+    FolderPath: ${{ parameters.FolderPath }}
+    Pattern: ${{ parameters.Pattern }}
+    SessionTimeout: 90
+    ServiceEndpointUrl: 'https://api.esrp.microsoft.com/api/v2'
+    MaxConcurrency: 25
+    signConfigType: inlineSignParams
+    inlineOperation: |
+      [
+        {
+          "keyCode": "CP-401337-Apple",
+          "operationSetCode": "MacAppDeveloperSign",
+          "toolName": "sign",
+          "toolVersion": "6.2.9304.0"
+        }
+      ]

From 017262129e86bff464deaf7e3eb7df3f9d94320b Mon Sep 17 00:00:00 2001
From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com>
Date: Fri, 2 May 2025 16:57:22 -0700
Subject: [PATCH 08/10] Set `add_special_tokens` to false by default in Encode
 (#1442)

Sets the `add_special_tokens` argument of `OrtxTokenizeWithOptions`
(added in https://github.com/microsoft/onnxruntime-extensions/pull/940)
to false to solve the chat template issue in GenAI with extra BOS tokens.

See https://github.com/huggingface/transformers/issues/37686 for more
context.

---------

Co-authored-by: Sayan Shaw <sayanshaw@microsoft.com>
Co-authored-by: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com>
---
 cmake/deps.txt              | 2 +-
 src/csharp/NativeMethods.cs | 1 -
 src/models/model.cpp        | 2 +-
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/cmake/deps.txt b/cmake/deps.txt
index 21931948c..faf47d240 100644
--- a/cmake/deps.txt
+++ b/cmake/deps.txt
@@ -14,4 +14,4 @@ pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f78029
 googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034
 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
-onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;cb00b43f05409d6f70cc558f52fcff0c7e386a97
+onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;fc004859e82241e99d458a90d2a39d400050cc59
diff --git a/src/csharp/NativeMethods.cs b/src/csharp/NativeMethods.cs
index 58f07939f..b0e9008f7 100644
--- a/src/csharp/NativeMethods.cs
+++ b/src/csharp/NativeMethods.cs
@@ -186,7 +186,6 @@ public static extern UIntPtr OgaSequencesGetSequenceCount(IntPtr /* const OgaSeq
                                                                         byte[] /* const char* */ strings,
                                                                         IntPtr /* OgaSequences* */ sequences);
 
-
         // This function is used to decode the given token into a string. The caller is responsible for freeing the
         // returned string using the OgaDestroyString function when it is no longer needed.
         [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)]
diff --git a/src/models/model.cpp b/src/models/model.cpp
index 4532ce7c7..4ccf866c8 100644
--- a/src/models/model.cpp
+++ b/src/models/model.cpp
@@ -198,7 +198,7 @@ std::unique_ptr<TokenizerStream> Tokenizer::CreateStream() const {
 
 std::vector<int32_t> Tokenizer::Encode(const char* text) const {
   OrtxPtr<OrtxTokenId2DArray> ids;
-  CheckResult(OrtxTokenize(tokenizer_, &text, 1, ids.Address()));
+  CheckResult(OrtxTokenizeWithOptions(tokenizer_, &text, 1, ids.Address(), false /* add_special_tokens */));
 
   const extTokenId_t* tokens;
   size_t count;

From 18d52c3fb016a5656b3cd36c4b3090d6ae603524 Mon Sep 17 00:00:00 2001
From: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
Date: Mon, 5 May 2025 14:00:07 -0700
Subject: [PATCH 09/10] Remove prompt templates from GenAI config (#1445)

### Description

This PR removes generating prompt templates and storing them in the
GenAI config.

### Motivation and Context

The prompt templates are not used internally anymore.
---
 src/config.cpp                  | 36 ---------------------------------
 src/config.h                    |  8 --------
 src/python/py/models/builder.py | 33 +-----------------------------
 3 files changed, 1 insertion(+), 76 deletions(-)

diff --git a/src/config.cpp b/src/config.cpp
index 225a080ba..4e346d1ac 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -589,38 +589,6 @@ struct Embedding_Element : JSON::Element {
   EmbeddingOutputs_Element outputs_{v_.outputs};
 };
 
-struct PromptTemplates_Element : JSON::Element {
-  explicit PromptTemplates_Element(std::optional<Config::Model::PromptTemplates>& v) : v_{v} {}
-
-  void OnValue(std::string_view name, JSON::Value value) override {
-    // if one of templates is given in json, then any non-specified template will be default "{Content}"
-    if (name == "assistant") {
-      EnsureAvailable();
-      v_->assistant = JSON::Get<std::string_view>(value);
-    } else if (name == "prompt") {
-      EnsureAvailable();
-      v_->prompt = JSON::Get<std::string_view>(value);
-    } else if (name == "system") {
-      EnsureAvailable();
-      v_->system = JSON::Get<std::string_view>(value);
-    } else if (name == "user") {
-      EnsureAvailable();
-      v_->user = JSON::Get<std::string_view>(value);
-    } else {
-      throw JSON::unknown_value_error{};
-    }
-  }
-
- private:
-  std::optional<Config::Model::PromptTemplates>& v_;
-
-  void EnsureAvailable() {
-    if (!v_.has_value()) {
-      v_.emplace();
-    }
-  }
-};
-
 struct Model_Element : JSON::Element {
   explicit Model_Element(Config::Model& v) : v_{v} {}
 
@@ -664,9 +632,6 @@ struct Model_Element : JSON::Element {
     if (name == "embedding") {
       return embedding_;
     }
-    if (name == "prompt_templates") {
-      return prompt_templates_;
-    }
     if (name == "speech") {
       return speech_;
     }
@@ -680,7 +645,6 @@ struct Model_Element : JSON::Element {
   Eos_Array_Element eos_token_ids_{v_};
   Vision_Element vision_{v_.vision};
   Embedding_Element embedding_{v_.embedding};
-  PromptTemplates_Element prompt_templates_{v_.prompt_templates};
   Speech_Element speech_{v_.speech};
 };
 
diff --git a/src/config.h b/src/config.h
index baf4d7013..ea18bd11c 100644
--- a/src/config.h
+++ b/src/config.h
@@ -25,7 +25,6 @@ struct Config {
     static constexpr std::string_view InputsEmbedsName = "inputs_embeds";
     static constexpr std::string_view CurrentSequenceLengthName = "current_sequence_length";
     static constexpr std::string_view PastSequenceLengthName = "past_sequence_length";
-    static constexpr std::string_view promptTemplate = "{Content}";
     static constexpr std::string_view TotalSequenceLengthName = "total_sequence_length";
     static constexpr std::string_view TokenTypeIdsName = "token_type_ids";
 
@@ -206,13 +205,6 @@ struct Config {
 
     } decoder;
 
-    struct PromptTemplates {
-      std::string assistant{Defaults::promptTemplate};
-      std::string prompt{Defaults::promptTemplate};
-      std::string system{Defaults::promptTemplate};
-      std::string user{Defaults::promptTemplate};
-    };
-    std::optional<PromptTemplates> prompt_templates;
   } model;
 
   struct Search {
diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index 92fd815cd..5e274f394 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -399,11 +399,6 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir):
             ep_options = { self.ep : self.ep_attrs[self.ep] }
             genai_config["model"]["decoder"]["session_options"]["provider_options"].append(ep_options)
 
-        if self.extra_options.get("include_prompt_templates", False):
-            prompt_templates = self._get_prompt_templates(model_name_or_path, extra_kwargs)
-            if prompt_templates is not None:
-                genai_config["model"]["prompt_templates"] = prompt_templates
-
         print(f"Saving GenAI config in {out_dir}")
         with open(os.path.join(out_dir,"genai_config.json"), "w") as f:
             json.dump(genai_config, f, indent=4)
@@ -412,30 +407,6 @@ def save_processing(self, model_name_or_path, extra_kwargs, out_dir):
         tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token=self.hf_token, trust_remote_code=True, **extra_kwargs)
         print(f"Saving processing files in {out_dir} for GenAI")
         tokenizer.save_pretrained(out_dir)
-
-    def _get_prompt_templates(self, hf_name, extra_kwargs):
-        try:
-            # disable end of sentence padding with eos_token=None
-            tokenizer = AutoTokenizer.from_pretrained(hf_name, token=self.hf_token, trust_remote_code=True, eos_token=None, **extra_kwargs)
-            system_template = tokenizer.apply_chat_template([{'role': 'system', 'content': '{Content}'}], tokenize=False)
-            system_user_template = tokenizer.apply_chat_template([{'role': 'system', 'content': '{Content}'}, {'role': 'user', 'content': '{Content}'}], tokenize=False)
-            system_user_assistant_template = tokenizer.apply_chat_template([{'role': 'system', 'content': '{Content}'}, {'role': 'user', 'content': '{Content}'}, {'role': 'assistant', 'content': '{Content}'}], tokenize=False)
-            assert system_user_template.startswith(system_template), "Chat templates may contain padding tokens, leading to incorrect prompt templates"
-            assert system_user_assistant_template.startswith(system_user_template), "Chat templates may contain padding tokens, leading to incorrect prompt templates"
-            user_template = system_user_template[len(system_template):]
-            assistant_template = system_user_assistant_template[len(system_user_template):]
-            prompt_template = system_user_assistant_template[len(system_template):]
-            prompt_template = prompt_template[:prompt_template.rfind('{Content}')]
-            templates = {
-                "system": system_template,
-                "user": user_template,
-                "assistant": assistant_template,
-                "prompt": prompt_template
-            }
-            return templates 
-        except Exception as e:
-            print(f"Failed to get prompt templates. Error: {e}")
-            return None
         
     def save_model(self, out_dir):
         print(f"Saving ONNX model in {out_dir}")
@@ -3273,7 +3244,7 @@ def check_extra_options(kv_pairs):
     """
     Check key-value pairs and set values correctly
     """
-    bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq", "include_prompt_templates"]
+    bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq"]
     for key in bools:
         if key in kv_pairs:
             if kv_pairs[key] in {"false", "False", "0"}:
@@ -3528,8 +3499,6 @@ def get_args():
                     Use this option to enable GPUs that do not support FP16 on WebGPU (e.g. GTX 10xx).
                 adapter_path = Path to folder on disk containing the adapter files (adapter_config.json and adapter model weights).
                     Use this option for LoRA models.
-                include_prompt_templates = Include prompt templates in the GenAI config file. Default is false.
-                    Use this option to include per-role prompt templates in the `genai_config.json` file.
             """),
     )
 

From 6c04a61981b2ab63364057ebb001cc09bf4d4093 Mon Sep 17 00:00:00 2001
From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com>
Date: Tue, 6 May 2025 08:54:52 -0700
Subject: [PATCH 10/10] Update Extensions Commit to Support Chat Template
 Override for Unsupported Models (#1452)

---
 cmake/deps.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/deps.txt b/cmake/deps.txt
index faf47d240..21931948c 100644
--- a/cmake/deps.txt
+++ b/cmake/deps.txt
@@ -14,4 +14,4 @@ pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f78029
 googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034
 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
-onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;fc004859e82241e99d458a90d2a39d400050cc59
+onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;cb00b43f05409d6f70cc558f52fcff0c7e386a97