diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml
index 8df0064e06a1d..87f93cd0f6ea7 100644
--- a/.github/workflows/android.yml
+++ b/.github/workflows/android.yml
@@ -78,8 +78,8 @@ jobs:
       run: |
         set -e -x
         BINARY_SIZE_THRESHOLD_ARGS=""
-        echo "Binary size threshold in bytes: 1306224"
-        BINARY_SIZE_THRESHOLD_ARGS="--threshold_size_in_bytes 1306224"
+        echo "Binary size threshold in bytes: 1436672"
+        BINARY_SIZE_THRESHOLD_ARGS="--threshold_size_in_bytes 1436672"

         # Ensure ANDROID_NDK_HOME is available and get its real path
         if [ -z "$ANDROID_NDK_HOME" ]; then
diff --git a/include/onnxruntime/core/framework/ortmemoryinfo.h b/include/onnxruntime/core/framework/ortmemoryinfo.h
index d930b2289170d..1be81e77064d2 100644
--- a/include/onnxruntime/core/framework/ortmemoryinfo.h
+++ b/include/onnxruntime/core/framework/ortmemoryinfo.h
@@ -13,18 +13,14 @@ struct OrtMemoryInfo {
   OrtMemoryInfo() = default;  // to allow default construction of Tensor

   // use string for name, so we could have customized allocator in execution provider.
-  const char* name = nullptr;
+  std::string name;
   OrtMemType mem_type = OrtMemTypeDefault;
   OrtAllocatorType alloc_type = OrtInvalidAllocator;
   OrtDevice device;

-  constexpr OrtMemoryInfo(const char* name_, OrtAllocatorType type_, OrtDevice device_ = OrtDevice(),
-                          OrtMemType mem_type_ = OrtMemTypeDefault)
-#if ((defined(__GNUC__) && __GNUC__ > 4) || defined(__clang__))
-      // this causes a spurious error in CentOS gcc 4.8 build so disable if GCC version < 5
-      __attribute__((nonnull))
-#endif
-      : name(name_),
+  OrtMemoryInfo(std::string name_, OrtAllocatorType type_, OrtDevice device_ = OrtDevice(),
+                OrtMemType mem_type_ = OrtMemTypeDefault)
+      : name(std::move(name_)),
         mem_type(mem_type_),
         alloc_type(type_),
         device(device_) {
@@ -39,7 +35,7 @@ struct OrtMemoryInfo {
     if (device != other.device) return device < other.device;

-    return strcmp(name, other.name) < 0;
+    return name < other.name;
   }

   // This is to make OrtMemoryInfo a valid key in hash tables
@@ -68,7 +64,7 @@ inline bool operator==(const OrtMemoryInfo& left, const OrtMemoryInfo& other) {
   return left.mem_type == other.mem_type &&
          left.alloc_type == other.alloc_type &&
          left.device == other.device &&
-         strcmp(left.name, other.name) == 0;
+         left.name == other.name;
 }

 inline bool operator!=(const OrtMemoryInfo& lhs, const OrtMemoryInfo& rhs) { return !(lhs == rhs); }
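Note on the ortmemoryinfo.h change: switching `name` from `const char*` to an owned `std::string` removes the lifetime coupling to the caller's buffer and the `strcmp` null-pointer hazard, and it drives most of the call-site churn below (adding `.c_str()` at `strcmp` sites, or dropping `strcmp` for plain comparisons). A minimal sketch of the new comparison semantics, using a trimmed stand-in struct rather than the real header:

```cpp
#include <cassert>
#include <string>
#include <utility>

// Trimmed, hypothetical stand-in for the new OrtMemoryInfo shape; only the
// name member and the comparison logic this diff changes are reproduced.
struct MemoryInfoLike {
  std::string name;
  explicit MemoryInfoLike(std::string name_) : name(std::move(name_)) {}

  // Mirrors the hunks above: strcmp(name, other.name) < 0 / == 0 become
  // plain value comparisons on std::string.
  bool operator<(const MemoryInfoLike& other) const { return name < other.name; }
  bool operator==(const MemoryInfoLike& other) const { return name == other.name; }
};

int main() {
  // Equal contents held in distinct buffers compare equal, with no dependence
  // on the caller's buffer outliving the object and no strcmp(nullptr, ...) UB.
  char buf[] = "Cpu";
  MemoryInfoLike a{buf};
  MemoryInfoLike b{std::string("Cpu")};
  assert(a == b);

  MemoryInfoLike c{"Gpu"};
  assert(a < c && !(a == c));  // ordering is lexicographic, as with strcmp
  return 0;
}
```

The trade-off, visible in the hunk, is that the constructor is no longer `constexpr` and construction may allocate; in exchange, the gcc 4.8 `__attribute__((nonnull))` workaround becomes unnecessary.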
diff --git a/onnxruntime/core/framework/allocator.cc b/onnxruntime/core/framework/allocator.cc
index e1b9d1294fb9e..91b5b811a3529 100644
--- a/onnxruntime/core/framework/allocator.cc
+++ b/onnxruntime/core/framework/allocator.cc
@@ -6,6 +6,7 @@
 #include "core/common/safeint.h"
 #include "core/common/status.h"
 #include "core/framework/allocator.h"
+#include "core/framework/error_code_helper.h"
 #include "core/mlas/inc/mlas.h"
 #include "core/framework/utils.h"
 #include "core/session/ort_apis.h"
@@ -185,22 +186,32 @@ std::ostream& operator<<(std::ostream& out, const OrtMemoryInfo& info) { return
 #endif
 ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtAllocatorType type, int id1,
                     enum OrtMemType mem_type1, _Outptr_ OrtMemoryInfo** out) {
+  API_IMPL_BEGIN
+
+  if (name1 == nullptr) {
+    return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "MemoryInfo name cannot be null.");
+  }
+
+  if (out == nullptr) {
+    return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Output memory info cannot be null.");
+  }
+
   auto device_id = static_cast<OrtDevice::DeviceId>(id1);
   if (strcmp(name1, onnxruntime::CPU) == 0) {
     *out = new OrtMemoryInfo(onnxruntime::CPU, type, OrtDevice(), mem_type1);
   } else if (strcmp(name1, onnxruntime::CUDA) == 0) {
     *out = new OrtMemoryInfo(
-        name1, type,
+        onnxruntime::CUDA, type,
         OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::NVIDIA, device_id),
         mem_type1);
   } else if (strcmp(name1, onnxruntime::OpenVINO_GPU) == 0) {
     *out = new OrtMemoryInfo(
-        name1, type,
+        onnxruntime::OpenVINO_GPU, type,
         OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::INTEL, device_id),
         mem_type1);
   } else if (strcmp(name1, onnxruntime::HIP) == 0) {
     *out = new OrtMemoryInfo(
-        name1, type,
+        onnxruntime::HIP, type,
         OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::AMD, device_id),
         mem_type1);
   } else if (strcmp(name1, onnxruntime::WEBGPU_BUFFER) == 0 ||
@@ -212,38 +223,39 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA
   } else if (strcmp(name1, onnxruntime::DML) == 0) {
     *out = new OrtMemoryInfo(
-        name1, type,
+        onnxruntime::DML, type,
         OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::MICROSOFT, device_id),
         mem_type1);
   } else if (strcmp(name1, onnxruntime::OpenVINO_RT_NPU) == 0) {
     *out = new OrtMemoryInfo(
-        name1, type,
+        onnxruntime::OpenVINO_RT_NPU, type,
         OrtDevice(OrtDevice::NPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::INTEL, device_id),
         mem_type1);
   } else if (strcmp(name1, onnxruntime::CUDA_PINNED) == 0) {
     *out = new OrtMemoryInfo(
-        name1, type,
+        onnxruntime::CUDA_PINNED, type,
         OrtDevice(OrtDevice::GPU, OrtDevice::MemType::HOST_ACCESSIBLE, OrtDevice::VendorIds::NVIDIA, device_id),
         mem_type1);
   } else if (strcmp(name1, onnxruntime::HIP_PINNED) == 0) {
     *out = new OrtMemoryInfo(
-        name1, type,
+        onnxruntime::HIP_PINNED, type,
         OrtDevice(OrtDevice::GPU, OrtDevice::MemType::HOST_ACCESSIBLE, OrtDevice::VendorIds::AMD, device_id),
         mem_type1);
   } else if (strcmp(name1, onnxruntime::QNN_HTP_SHARED) == 0) {
     *out = new OrtMemoryInfo(
-        name1, type,
+        onnxruntime::QNN_HTP_SHARED, type,
         OrtDevice(OrtDevice::CPU, OrtDevice::MemType::HOST_ACCESSIBLE, OrtDevice::VendorIds::QUALCOMM, device_id),
         mem_type1);
   } else if (strcmp(name1, onnxruntime::CPU_ALIGNED_4K) == 0) {
     *out = new OrtMemoryInfo(
-        name1, type,
+        onnxruntime::CPU_ALIGNED_4K, type,
         OrtDevice(OrtDevice::CPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::NONE, device_id,
                   onnxruntime::kAlloc4KAlignment),
         mem_type1);
   } else {
     return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Specified device is not supported. Try CreateMemoryInfo_V2.");
   }
+  API_IMPL_END
   return nullptr;
 }
@@ -251,6 +263,16 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo_V2, _In_ const char* name, _In_ en
                     _In_ uint32_t vendor_id, _In_ int32_t device_id,
                     _In_ enum OrtDeviceMemoryType mem_type, _In_ size_t alignment,
                     enum OrtAllocatorType type, _Outptr_ OrtMemoryInfo** out) {
+  API_IMPL_BEGIN
+
+  if (name == nullptr) {
+    return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "MemoryInfo name cannot be null.");
+  }
+
+  if (out == nullptr) {
+    return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Output memory info cannot be null.");
+  }
+
   // map the public enum values to internal OrtDevice values
   OrtDevice::MemoryType mt = mem_type == OrtDeviceMemoryType_DEFAULT ? OrtDevice::MemType::DEFAULT
                                                                      : OrtDevice::MemType::HOST_ACCESSIBLE;
@@ -275,6 +297,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo_V2, _In_ const char* name, _In_ en
   *out = new OrtMemoryInfo(name, type, OrtDevice{dt, mt, vendor_id, narrow<OrtDevice::DeviceId>(device_id), alignment},
                            mem_type == OrtDeviceMemoryType_DEFAULT ? OrtMemTypeDefault : OrtMemTypeCPU);
+  API_IMPL_END
   return nullptr;
 }
@@ -283,7 +306,7 @@ ORT_API(void, OrtApis::ReleaseMemoryInfo, _Frees_ptr_opt_ OrtMemoryInfo* p) { de
 #pragma warning(pop)
 #endif
 ORT_API_STATUS_IMPL(OrtApis::MemoryInfoGetName, _In_ const OrtMemoryInfo* ptr, _Out_ const char** out) {
-  *out = ptr->name;
+  *out = ptr->name.c_str();
   return nullptr;
 }
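Note on the allocator.cc change: `CreateMemoryInfo` and `CreateMemoryInfo_V2` now validate their arguments and run inside `API_IMPL_BEGIN`/`API_IMPL_END` (hence the new `error_code_helper.h` include), so bad input surfaces as an error status instead of undefined behavior in `strcmp`. A minimal caller-side sketch against the public C API, assuming the error text matches the strings added above:

```cpp
#include <cstdio>
#include "onnxruntime_c_api.h"

int main() {
  const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);

  // Happy path: a known name ("Cpu" is onnxruntime::CPU) succeeds as before.
  OrtMemoryInfo* mi = nullptr;
  OrtStatus* st = api->CreateMemoryInfo("Cpu", OrtArenaAllocator, 0, OrtMemTypeDefault, &mi);
  if (st == nullptr) {
    api->ReleaseMemoryInfo(mi);
  }

  // Previously a null name reached strcmp(nullptr, ...); now it is a clean error.
  st = api->CreateMemoryInfo(nullptr, OrtArenaAllocator, 0, OrtMemTypeDefault, &mi);
  if (st != nullptr) {
    std::printf("%s\n", api->GetErrorMessage(st));  // "MemoryInfo name cannot be null."
    api->ReleaseStatus(st);
  }
  return 0;
}
```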
diff --git a/onnxruntime/core/framework/bfc_arena.cc b/onnxruntime/core/framework/bfc_arena.cc
index e0b50cd04173e..3a5af42d03cdd 100644
--- a/onnxruntime/core/framework/bfc_arena.cc
+++ b/onnxruntime/core/framework/bfc_arena.cc
@@ -13,7 +13,7 @@ BFCArena::BFCArena(std::unique_ptr<IAllocator> resource_allocator,
                    int max_dead_bytes_per_chunk,
                    int initial_growth_chunk_size_bytes,
                    int64_t max_power_of_two_extend_bytes)
-    : IAllocator(OrtMemoryInfo(resource_allocator->Info().name,
+    : IAllocator(OrtMemoryInfo(resource_allocator->Info().name.c_str(),
                                OrtAllocatorType::OrtArenaAllocator,
                                resource_allocator->Info().device,
                                resource_allocator->Info().mem_type)),
diff --git a/onnxruntime/core/mlas/lib/qnbitgemm.h b/onnxruntime/core/mlas/lib/qnbitgemm.h
index 06e8e49b59e2e..7ec80c6d67f15 100644
--- a/onnxruntime/core/mlas/lib/qnbitgemm.h
+++ b/onnxruntime/core/mlas/lib/qnbitgemm.h
@@ -53,16 +53,25 @@ struct PackedQuantBDataStruct {
     {
         const size_t PackedQuantBDataSize = N * BlockCountK * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen);
         size_t BlkSumSize = MlasDivRoundup(N, 16) * BlockCountK * 16 * sizeof(T);
-        if constexpr (BlkBitWidth == 8) {
-            PackedQuantBData = (std::byte*)MlasAlignAddress(PackedQuantBWorkspace, 32);
-        } else {
 #if defined(MLAS_TARGET_AMD64_IX86)
         // avx512 requires alignment on a 64-byte boundary
         PackedQuantBData = (std::byte*)MlasAlignAddress(PackedQuantBWorkspace, 64);
+#elif defined(MLAS_TARGET_ARM64)
+        // Only for 8-bit Gemms does `PackedQuantBData` need to be 32-byte aligned, and
+        // there is enough memory allocated to support this alignment.
+        // See QNBitGemmPackQuantBDataSize().
+        // When the bit width is 4, there is no alignment guarantee.
+        // TODO(hasesh): Can we unify the alignment for 4-bit and 8-bit ARM64 Gemms so as to
+        // simplify this logic and make the code here cleaner?
+        if constexpr (BlkBitWidth == 8) {
+            PackedQuantBData = (std::byte*)MlasAlignAddress(PackedQuantBWorkspace, 32);
+        } else {
+            PackedQuantBData = (std::byte*)PackedQuantBWorkspace;
+        }
 #else
         PackedQuantBData = (std::byte*)PackedQuantBWorkspace;
 #endif
-        }
         QuantBBlkSum = (T*)(PackedQuantBData + PackedQuantBDataSize);
         QuantBBlkSum = (T*)MlasAlignAddress(QuantBBlkSum, MlasQNBitQuantBBlkSumAlignment());
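Note on the qnbitgemm.h change: the alignment of `PackedQuantBData` is now keyed on both the target and the bit width, because only the 8-bit path reserves workspace for the 32-byte round-up (see `QNBitGemmPackQuantBDataSize()`, per the comment). The underlying operation is the usual round-up-to-boundary computation; the sketch below illustrates what an align-address helper of this kind does (an assumption about behavior, not MLAS's actual `MlasAlignAddress` implementation):

```cpp
#include <cassert>
#include <cstdint>

// Round ptr up to the next multiple of alignment (a power of two). The code
// above relies on this when aligning PackedQuantBData to 64 bytes for AVX-512
// or 32 bytes for the 8-bit ARM64 kernels.
inline void* AlignAddress(void* ptr, std::uintptr_t alignment) {
  auto p = reinterpret_cast<std::uintptr_t>(ptr);
  return reinterpret_cast<void*>((p + alignment - 1) & ~(alignment - 1));
}

int main() {
  alignas(64) unsigned char buffer[128];
  void* unaligned = buffer + 3;
  void* aligned = AlignAddress(unaligned, 64);
  assert(reinterpret_cast<std::uintptr_t>(aligned) % 64 == 0);
  // Rounding is upward, so the result stays within an adequately padded buffer.
  assert(reinterpret_cast<std::uintptr_t>(aligned) >= reinterpret_cast<std::uintptr_t>(unaligned));
  return 0;
}
```

This is also why the allocation and alignment sides must agree: rounding up consumes up to `alignment - 1` extra bytes, which the packing-size computation has to reserve, the point the TODO in the hunk is about.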
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 9611cb82d5a62..6d8d5453b9fc0 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -927,7 +927,7 @@ namespace Dml
     bool IsGpuTensor(const onnxruntime::Tensor& tensor)
     {
-        return strcmp(tensor.Location().name, onnxruntime::CPU) &&
+        return strcmp(tensor.Location().name.c_str(), onnxruntime::CPU) &&
             !(tensor.Location().mem_type == ::OrtMemType::OrtMemTypeCPUOutput ||
               tensor.Location().mem_type == ::OrtMemType::OrtMemTypeCPUInput);
     }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
index c601ee3c1d5e6..fe52f27b35bb8 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
@@ -98,7 +98,7 @@ namespace Windows::AI::MachineLearning::Adapter
     bool IsAllocationInterface(const ::OrtMemoryInfo& info)
     {
-        return strcmp(info.name, onnxruntime::CPU) && !(info.mem_type == ::OrtMemType::OrtMemTypeCPUOutput || info.mem_type == ::OrtMemType::OrtMemTypeCPUInput);
+        return strcmp(info.name.c_str(), onnxruntime::CPU) && !(info.mem_type == ::OrtMemType::OrtMemTypeCPUOutput || info.mem_type == ::OrtMemType::OrtMemTypeCPUInput);
     }

     // Translate the data object stored in a tensor to the type which will be returned through
@@ -1774,7 +1774,9 @@ namespace Windows::AI::MachineLearning::Adapter
         }

         // tells caller whether this tensor is in CPU memory
-        return !strcmp(m_impl->Location().name, onnxruntime::CPU) || m_impl->Location().mem_type == ::OrtMemType::OrtMemTypeCPUOutput || m_impl->Location().mem_type == ::OrtMemType::OrtMemTypeCPUInput;
+        return !strcmp(m_impl->Location().name.c_str(), onnxruntime::CPU)
+            || m_impl->Location().mem_type == ::OrtMemType::OrtMemTypeCPUOutput
+            || m_impl->Location().mem_type == ::OrtMemType::OrtMemTypeCPUInput;
     }

     bool STDMETHODCALLTYPE TensorWrapper::IsDataInterface() const noexcept
diff --git a/onnxruntime/core/providers/qnn/qnn_provider_factory.cc b/onnxruntime/core/providers/qnn/qnn_provider_factory.cc
index 8641952f27ee5..a7e553848fb4d 100644
--- a/onnxruntime/core/providers/qnn/qnn_provider_factory.cc
+++ b/onnxruntime/core/providers/qnn/qnn_provider_factory.cc
@@ -219,9 +219,11 @@ struct QnnEpFactory : OrtEpFactory {
       OrtKeyValuePairs* ep_options = nullptr;
       factory->ort_api.CreateKeyValuePairs(&ep_options);
       factory->ort_api.AddKeyValuePair(ep_options, "backend_path", factory->qnn_backend_path.c_str());
-      ORT_API_RETURN_IF_ERROR(
-          factory->ort_api.GetEpApi()->CreateEpDevice(factory, &device, nullptr, ep_options,
-                                                      &ep_devices[num_ep_devices++]));
+      OrtStatus* status = factory->ort_api.GetEpApi()->CreateEpDevice(factory, &device, nullptr, ep_options,
+                                                                      &ep_devices[num_ep_devices++]);
+
+      factory->ort_api.ReleaseKeyValuePairs(ep_options);
+
+      ORT_API_RETURN_IF_ERROR(status);
     }
   }
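Note on the qnn_provider_factory.cc change: the previous `ORT_API_RETURN_IF_ERROR` wrapper returned before `ReleaseKeyValuePairs` could run, leaking `ep_options` whenever `CreateEpDevice` failed. The fix captures the status, releases unconditionally, and only then propagates. A generic sketch of the pattern (the `Handle`/`Create`/`Use`/`Release` names are hypothetical stand-ins, not ORT APIs):

```cpp
#include <memory>

// Hypothetical C-style resource API standing in for OrtKeyValuePairs.
struct Handle { int unused; };
struct Status { int code; };
Handle* Create() { return new Handle{0}; }
void Release(Handle* h) { delete h; }
Status* Use(Handle*) { return nullptr; }  // may return a failure status

// After the fix: capture the status, release unconditionally, then propagate.
// An early-return macro between Create() and Release() leaks on every failure.
Status* CreateAndUse() {
  Handle* h = Create();
  Status* status = Use(h);
  Release(h);  // runs on success and failure alike
  return status;
}

// Equivalent with RAII, which makes the guarantee structural rather than a
// matter of statement ordering.
Status* CreateAndUseRaii() {
  std::unique_ptr<Handle, void (*)(Handle*)> h(Create(), &Release);
  return Use(h.get());
}

int main() {
  return (CreateAndUse() == nullptr && CreateAndUseRaii() == nullptr) ? 0 : 1;
}
```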
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc
index a9557f7b9aa87..562d54d1bf977 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_context.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc
@@ -210,7 +210,7 @@ Status WebGpuContext::Run(ComputeContext& context, const ProgramBase& program) {
                       return tensor != nullptr &&
                              tensor->Location().mem_type == OrtMemType::OrtMemTypeDefault &&
                              tensor->Location().device.Type() == OrtDevice::GPU &&
-                             !strcmp(tensor->Location().name, WEBGPU_BUFFER);
+                             !strcmp(tensor->Location().name.c_str(), WEBGPU_BUFFER);
                     }),
                     "All inputs must be tensors on WebGPU buffers.");

@@ -219,7 +219,7 @@ Status WebGpuContext::Run(ComputeContext& context, const ProgramBase& program) {
                       return tensor != nullptr &&
                              tensor->Location().mem_type == OrtMemType::OrtMemTypeDefault &&
                              tensor->Location().device.Type() == OrtDevice::GPU &&
-                             !strcmp(tensor->Location().name, WEBGPU_BUFFER);
+                             !strcmp(tensor->Location().name.c_str(), WEBGPU_BUFFER);
                     }),
                     "All outputs must be tensors on WebGPU buffers.");
 }
diff --git a/onnxruntime/core/session/environment.cc b/onnxruntime/core/session/environment.cc
index 39b785c327d56..9c40eb75780ee 100644
--- a/onnxruntime/core/session/environment.cc
+++ b/onnxruntime/core/session/environment.cc
@@ -79,7 +79,7 @@ static bool AreOrtMemoryInfosEquivalent(
     bool ignore_alignment = false) {
   return left.mem_type == right.mem_type &&
          (ignore_alignment ? left.device.EqualIgnoringAlignment(right.device) : left.device == right.device) &&
-         (!match_name || strcmp(left.name, right.name) == 0);
+         (!match_name || left.name == right.name);
 }

 std::vector<AllocatorPtr>::const_iterator FindExistingAllocator(const std::vector<AllocatorPtr>& allocators,
diff --git a/onnxruntime/core/session/lora_adapters.cc b/onnxruntime/core/session/lora_adapters.cc
index 85ea958981e2c..124d748029fd4 100644
--- a/onnxruntime/core/session/lora_adapters.cc
+++ b/onnxruntime/core/session/lora_adapters.cc
@@ -53,11 +53,11 @@ void LoraAdapter::MemoryMap(const std::filesystem::path& file_path) {
 static std::unique_ptr<IDataTransfer> GetDataTransfer(const OrtMemoryInfo& mem_info) {
   std::unique_ptr<IDataTransfer> data_transfer;

-  if (strcmp(mem_info.name, onnxruntime::CPU) == 0) {
+  if (mem_info.name == onnxruntime::CPU) {
     return data_transfer;
   }

-  if (strcmp(mem_info.name, onnxruntime::CUDA) == 0) {
+  if (mem_info.name == onnxruntime::CUDA) {
 #if defined(USE_CUDA) || defined(USE_CUDA_PROVIDER_INTERFACE)
     auto* cuda_provider_info = TryGetProviderInfo_CUDA();
     if (cuda_provider_info != nullptr) {
diff --git a/onnxruntime/test/framework/TestAllocatorManager.cc b/onnxruntime/test/framework/TestAllocatorManager.cc
index 30f2686cd62f5..6440a805cdc59 100644
--- a/onnxruntime/test/framework/TestAllocatorManager.cc
+++ b/onnxruntime/test/framework/TestAllocatorManager.cc
@@ -10,7 +10,7 @@ namespace test {
 class DummyArena : public IAllocator {
  public:
   explicit DummyArena(std::unique_ptr<IAllocator> resource_allocator)
-      : IAllocator(OrtMemoryInfo(resource_allocator->Info().name,
+      : IAllocator(OrtMemoryInfo(resource_allocator->Info().name.c_str(),
                                  OrtAllocatorType::OrtDeviceAllocator,
                                  resource_allocator->Info().device,
                                  resource_allocator->Info().mem_type)),
diff --git a/onnxruntime/test/framework/allocator_test.cc b/onnxruntime/test/framework/allocator_test.cc
index 3efba6f1b6e52..445e023746aaa 100644
--- a/onnxruntime/test/framework/allocator_test.cc
+++ b/onnxruntime/test/framework/allocator_test.cc
@@ -13,7 +13,7 @@ namespace test {
 TEST(AllocatorTest, CPUAllocatorTest) {
   auto cpu_arena = TestCPUExecutionProvider()->CreatePreferredAllocators()[0];

-  ASSERT_STREQ(cpu_arena->Info().name, CPU);
+  ASSERT_STREQ(cpu_arena->Info().name.c_str(), CPU);
   EXPECT_EQ(cpu_arena->Info().device.Id(), 0);

   const auto expected_allocator_type = DoesCpuAllocatorSupportArenaUsage()
diff --git a/onnxruntime/test/framework/tensor_test.cc b/onnxruntime/test/framework/tensor_test.cc
index 2ac1a93013932..f08675271de21 100644
--- a/onnxruntime/test/framework/tensor_test.cc
+++ b/onnxruntime/test/framework/tensor_test.cc
@@ -29,7 +29,7 @@ void CPUTensorTest(std::vector<int64_t> dims, const int offset_elements = 0) {
   EXPECT_EQ(shape.GetDims(), tensor_shape.GetDims());
   EXPECT_EQ(t.DataType(), DataTypeImpl::GetType<T>());
   auto& location = t.Location();
-  EXPECT_STREQ(location.name, CPU);
+  EXPECT_STREQ(location.name.c_str(), CPU);
   EXPECT_EQ(location.device.Id(), 0);

   const T* t_data = t.Data<T>();
@@ -47,7 +47,7 @@ void CPUTensorTest(std::vector<int64_t> dims, const int offset_elements = 0) {
   EXPECT_EQ(shape.GetDims(), tensor_shape.GetDims());
   EXPECT_EQ(new_t.DataType(), DataTypeImpl::GetType<T>());
   auto& new_location = new_t.Location();
-  ASSERT_STREQ(new_location.name, CPU);
+  ASSERT_STREQ(new_location.name.c_str(), CPU);
   EXPECT_EQ(new_location.device.Id(), 0);
   }
 }
@@ -135,7 +135,7 @@ TEST(TensorTest, EmptyTensorTest) {
   EXPECT_TRUE(!data);

   auto& location = t.Location();
-  ASSERT_STREQ(location.name, CPU);
+  ASSERT_STREQ(location.name.c_str(), CPU);
   EXPECT_EQ(location.device.Id(), 0);

   const auto expected_allocator_type = DoesCpuAllocatorSupportArenaUsage()
@@ -160,7 +160,7 @@ TEST(TensorTest, StringTensorTest) {
   EXPECT_EQ(shape, tensor_shape);
   EXPECT_EQ(t.DataType(), DataTypeImpl::GetType<std::string>());
   auto& location = t.Location();
-  ASSERT_STREQ(location.name, CPU);
+  ASSERT_EQ(location.name, CPU);
   EXPECT_EQ(location.device.Id(), 0);

   std::string* new_data = t.MutableData<std::string>();
diff --git a/onnxruntime/test/lora/lora_test.cc b/onnxruntime/test/lora/lora_test.cc
index e8291a36447ca..ecfaf34c8a076 100644
--- a/onnxruntime/test/lora/lora_test.cc
+++ b/onnxruntime/test/lora/lora_test.cc
@@ -216,7 +216,7 @@ TEST(LoraAdapterTest, VerifyDeviceCopy) {
   for (; begin != end; ++begin) {
     const auto& [_, param] = *begin;
     const auto& tensor_device = param.GetDeviceOrMapped().Get<Tensor>();
-    ASSERT_EQ(0, strcmp(tensor_device.Location().name, onnxruntime::CUDA));
+    ASSERT_EQ(0, strcmp(tensor_device.Location().name.c_str(), onnxruntime::CUDA));

     const auto& tensor_cpu = param.GetMapped().Get<Tensor>();
     ASSERT_EQ(tensor_cpu.Shape().Size(), tensor_device.Shape().Size());
diff --git a/onnxruntime/test/mlas/unittest/test_sq8bitgemm.cpp b/onnxruntime/test/mlas/unittest/test_sq8bitgemm.cpp
index 3ed283d54f41d..1be05d88849cd 100644
--- a/onnxruntime/test/mlas/unittest/test_sq8bitgemm.cpp
+++ b/onnxruntime/test/mlas/unittest/test_sq8bitgemm.cpp
@@ -773,7 +773,8 @@ class MlasSQ8BitGemmKernelTest : public MlasTestBase {
         N, K, 8, BlkLen, MLAS_QNBIT_GEMM_COMPUTE_TYPE::SQNBIT_CompInt8,
         nullptr, packedBuffer, nullptr, HasZp, inputZp, nullptr);

-    PackedQuantBDataStruct<float, 8> packedQuantB(packedBuffer, N, BlkCount, BlkLen, true);
+    const bool isQuantAUnsigned = GetMlasPlatform().ArmNeonIsQuantActivationsUnsigned;
+    PackedQuantBDataStruct<float, 8> packedQuantB(packedBuffer, N, BlkCount, BlkLen, isQuantAUnsigned);

     auto* C = C_.GetBuffer(M * ldc, true);
     auto* ref = ref_.GetBuffer(M * ldc, true);
@@ -825,7 +826,9 @@ class MlasSQ8BitGemmKernelTest : public MlasTestBase {
   void ExecuteShort(void) override {
     Execute<1, 16, 1, 16>();
+    Execute<1, 1, 1, 16>();
     Execute<7, 2, 4, 16>();
+    Execute<7, 128, 4, 16>();
     Execute<8, 497, 5, 16>();
     Execute<1, 3072, 128, 16>();
     Execute<2, 3072, 128, 16>();
diff --git a/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc b/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc
index 91a4fe9a54251..af49bd0e3d58d 100644
--- a/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc
@@ -24,7 +24,7 @@ TEST(AllocatorTest, CUDAAllocatorTest) {
   size_t size = 1024;

-  EXPECT_STREQ(cuda_arena->Info().name, CUDA);
+  EXPECT_STREQ(cuda_arena->Info().name.c_str(), CUDA);
   EXPECT_EQ(cuda_arena->Info().device.Id(), cuda_device_id);
   EXPECT_EQ(cuda_arena->Info().mem_type, OrtMemTypeDefault);
   EXPECT_EQ(cuda_arena->Info().alloc_type, OrtArenaAllocator);
@@ -38,7 +38,7 @@ TEST(AllocatorTest, CUDAAllocatorTest) {
   auto pinned_allocator = CreateAllocator(pinned_memory_info);

-  EXPECT_STREQ(pinned_allocator->Info().name, CUDA_PINNED);
+  EXPECT_STREQ(pinned_allocator->Info().name.c_str(), CUDA_PINNED);
   EXPECT_EQ(pinned_allocator->Info().device.Id(), 0);
   EXPECT_EQ(pinned_allocator->Info().mem_type, OrtMemTypeCPUOutput);
   EXPECT_EQ(pinned_allocator->Info().alloc_type, OrtArenaAllocator);
@@ -50,7 +50,7 @@ TEST(AllocatorTest, CUDAAllocatorTest) {
   AllocatorCreationInfo cpu_memory_info(
       [](int) { return std::make_unique<CPUAllocator>(); },
       true);
   const auto& cpu_arena = CreateAllocator(cpu_memory_info);
-  EXPECT_STREQ(cpu_arena->Info().name, CPU);
+  EXPECT_STREQ(cpu_arena->Info().name.c_str(), CPU);
   EXPECT_EQ(cpu_arena->Info().device.Id(), 0);
   EXPECT_EQ(cpu_arena->Info().mem_type, OrtMemTypeDefault);
   EXPECT_EQ(cpu_arena->Info().alloc_type, OrtArenaAllocator);
diff --git a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml
index f22a26cec6d88..3acf0788ab5c3 100644
--- a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml
@@ -13,6 +13,21 @@ parameters:
   type: boolean
   default: false

+- name: PreReleaseVersionSuffixString
+  displayName: Suffix added to pre-release package version. Only used if IsReleaseBuild is true. Denotes the type of pre-release package.
+  type: string
+  values:
+  - alpha
+  - beta
+  - rc
+  - none
+  default: none
+
+- name: PreReleaseVersionSuffixNumber
+  displayName: Number added to pre-release package version. Only used if IsReleaseBuild is true. Denotes the sequence of a pre-release package.
+  type: number
+  default: 0
+
 - name: PackageName
   displayName: What is the package name? Override using an environment variable CustomPackageName.
   type: string
@@ -69,6 +84,12 @@ extends:
       exclusionsFile: '$(Build.SourcesDirectory)\tools\ci_build\policheck_exclusions.xml'
     stages:
+    - template: stages/set_packaging_variables_stage.yml
+      parameters:
+        IsReleaseBuild: ${{ parameters.IsReleaseBuild }}
+        PreReleaseVersionSuffixString: ${{ parameters.PreReleaseVersionSuffixString }}
+        PreReleaseVersionSuffixNumber: ${{ parameters.PreReleaseVersionSuffixNumber }}
+
     - template: templates/win-ci.yml
       parameters:
        ort_build_pool_name: 'onnxruntime-Win2022-GPU-A10'
diff --git a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml
index a94ceea6354e5..02fae0b10ac39 100644
--- a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml
@@ -14,6 +14,22 @@ parameters:
   type: boolean
   default: false

+
+- name: PreReleaseVersionSuffixString
+  displayName: Suffix added to pre-release package version. Only used if IsReleaseBuild is true. Denotes the type of pre-release package.
+  type: string
+  values:
+  - alpha
+  - beta
+  - rc
+  - none
+  default: none
+
+- name: PreReleaseVersionSuffixNumber
+  displayName: Number added to pre-release package version. Only used if IsReleaseBuild is true. Denotes the sequence of a pre-release package.
+  type: number
+  default: 0
+
 - name: DoEsrp
   displayName: Run code sign tasks? Must be true if you are doing an Onnx Runtime release.
   type: boolean
@@ -68,6 +84,11 @@ extends:
         enabled: true
         exclusionsFile: '$(Build.SourcesDirectory)\tools\ci_build\policheck_exclusions.xml'
     stages:
+    - template: stages/set_packaging_variables_stage.yml
+      parameters:
+        IsReleaseBuild: ${{ parameters.IsReleaseBuild }}
+        PreReleaseVersionSuffixString: ${{ parameters.PreReleaseVersionSuffixString }}
+        PreReleaseVersionSuffixNumber: ${{ parameters.PreReleaseVersionSuffixNumber }}

     - template: templates/qnn-ep-win.yml
       parameters:
diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml
index eeb8709e0dea2..b6388c22fae98 100644
--- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml
@@ -11,7 +11,8 @@ parameters:
 stages:
 - stage: ${{ parameters.StageName }}
-  dependsOn: []
+  dependsOn:
+  - Setup
   jobs:
   - job: ${{ parameters.StageName }}
     timeoutInMinutes: 300
@@ -45,6 +46,7 @@ stages:
         artifactName: "drop-signed-nuget-qnn"
     variables:
       OrtPackageId: ${{ parameters.OrtNugetPackageId }}
+      ReleaseVersionSuffix: $[stageDependencies.Setup.Set_Variables.outputs['Set_Release_Version_Suffix.ReleaseVersionSuffix']]
      commonBuildArgs: '--skip_submodule_sync --build_shared_lib --client_package_build --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags '

     steps:
@@ -107,7 +109,12 @@ stages:
         solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln'
         platform: 'Any CPU'
         configuration: ${{ parameters.build_config }}
-        msbuildArguments: '-p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }}'
+        msbuildArguments: >
+          -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)"
+          -p:OrtPackageId=$(OrtPackageId)
+          -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }}
+          -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix)
+          -p:PackageVersion=$(OnnxRuntimeVersion)
         workingDirectory: '$(Build.SourcesDirectory)\csharp'

     - ${{ if eq(parameters.DoEsrp, true) }}:
@@ -123,7 +130,7 @@ stages:
         solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj'
         platform: 'Any CPU'
         configuration: ${{ parameters.build_config }}
-        msbuildArguments: '-t:CreatePackage -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:TargetArchitecture=arm64'
+        msbuildArguments: '-t:CreatePackage -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix) -p:TargetArchitecture=arm64'
         workingDirectory: '$(Build.SourcesDirectory)\csharp'

     - task: CopyFiles@2
diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt
index b52d0cbcf3fea..a52e57138117a 100644
--- a/tools/ci_build/github/linux/docker/scripts/requirements.txt
+++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt
@@ -3,7 +3,7 @@ numpy==2.2.6
 mypy
 pytest
 setuptools==78.1.1
-wheel==0.42.0
+wheel==0.45.1
 onnx==1.18.0
 argparse
 sympy==1.14
diff --git a/tools/python/run_packaging_pipelines.py b/tools/python/run_packaging_pipelines.py
index 2ae66332781b9..4948f35c642e8 100644
--- a/tools/python/run_packaging_pipelines.py
+++ b/tools/python/run_packaging_pipelines.py
@@ -277,7 +277,9 @@ def filter_pipelines(pipelines: list[dict], token: str, branch: str, is_pr_build
         if result:
             filtered_results.append(result)

-    print(f"\nFound {len(filtered_results)} pipelines to trigger.")
+    print(f"\nFound {len(filtered_results)} pipelines to trigger:")
+    for result in filtered_results:
+        print(f"  - {result['pipeline']['name']}")

     return filtered_results

@@ -453,6 +455,14 @@ def main():
         nightly_override = "0"
         release_override = "true"

+    # If pre-release flags are used, it implies a release build.
+    if args.pre_release_suffix_string:
+        print("Pre-release suffix provided. Forcing 'release' build mode.")
+        if args.build_mode and args.build_mode != "release":
+            print(f"Warning: --build-mode={args.build_mode} is overridden by pre-release flags.")
+        nightly_override = "0"
+        release_override = "true"
+
     for result in pipelines_to_trigger:
         pipeline = result["pipeline"]
         packaging_type = result["packaging_type"]