microsoft · TedThemistokleous · Feb 13, 2025 · Feb 10, 2025 · Feb 10, 2025 · Feb 5, 2025
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -29,6 +29,7 @@ include(CheckLanguage)
 include(CMakeDependentOption)
 include(FetchContent)
 include(CheckFunctionExists)
+include(CheckSymbolExists)
 include(GNUInstallDirs) # onnxruntime_providers_* require CMAKE_INSTALL_* variables
 
 # TODO: update this once all system adapt c++20

diff --git a/cmake/external/composable_kernel.cmake b/cmake/external/composable_kernel.cmake
@@ -1,13 +1,14 @@
 set(PATCH_CLANG ${PROJECT_SOURCE_DIR}/patches/composable_kernel/Fix_Clang_Build.patch)
 set(PATCH_GFX12X ${PROJECT_SOURCE_DIR}/patches/composable_kernel/Add_gfx12x_support.patch)
+set(PATCH_GFX950 ${PROJECT_SOURCE_DIR}/patches/composable_kernel/Add_gfx950.patch)
 
 include(FetchContent)
 onnxruntime_fetchcontent_declare(composable_kernel
   URL ${DEP_URL_composable_kernel}
   URL_HASH SHA1=${DEP_SHA1_composable_kernel}
   PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PATCH_CLANG} &&
-                ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PATCH_GFX12X}
-  EXCLUDE_FROM_ALL
+                ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PATCH_GFX12X} &&
+                ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PATCH_GFX950}
 )
 
 FetchContent_GetProperties(composable_kernel)

diff --git a/cmake/onnxruntime_providers_migraphx.cmake b/cmake/onnxruntime_providers_migraphx.cmake
@@ -2,21 +2,11 @@
 # Licensed under the MIT License.
 
   add_definitions(-DUSE_MIGRAPHX=1)
-  set(BUILD_LIBRARY_ONLY 1)
-  add_definitions("-DONNX_ML=1")
-  add_definitions("-DONNX_NAMESPACE=onnx")
-  include_directories(${protobuf_SOURCE_DIR} ${eigen_SOURCE_DIR})
-  set(MIGRAPHX_ROOT ${onnxruntime_MIGRAPHX_HOME})
-  include_directories(${onnx_SOURCE_DIR})
+  include_directories(${protobuf_SOURCE_DIR} ${eigen_SOURCE_DIR} ${onnx_SOURCE_DIR})
   set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
-  if ( CMAKE_COMPILER_IS_GNUCC )
+  if (CMAKE_COMPILER_IS_GNUCC)
     set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -Wno-unused-parameter -Wno-missing-field-initializers")
   endif()
-  set(CXX_VERSION_DEFINED TRUE)
-  set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS})
-  if ( CMAKE_COMPILER_IS_GNUCC )
-    set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -Wno-unused-parameter")
-  endif()
 
   # Add search paths for default rocm installation
   list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hcc /opt/rocm/hip /opt/rocm $ENV{HIP_PATH})
@@ -33,26 +23,24 @@
   find_package(hip REQUIRED)
   find_package(migraphx REQUIRED PATHS ${AMD_MIGRAPHX_HOME})
 
-  set(migraphx_libs migraphx::c hip::host)
-
   file(GLOB_RECURSE onnxruntime_providers_migraphx_cc_srcs CONFIGURE_DEPENDS
     "${ONNXRUNTIME_ROOT}/core/providers/migraphx/*.h"
     "${ONNXRUNTIME_ROOT}/core/providers/migraphx/*.cc"
     "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h"
     "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc"
   )
   source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_migraphx_cc_srcs})
-  onnxruntime_add_shared_library_module(onnxruntime_providers_migraphx ${onnxruntime_providers_migraphx_cc_srcs})
+  onnxruntime_add_shared_library(onnxruntime_providers_migraphx ${onnxruntime_providers_migraphx_cc_srcs})
   onnxruntime_add_include_to_target(onnxruntime_providers_migraphx onnxruntime_common onnx flatbuffers::flatbuffers Boost::mp11 safeint_interface)
-  add_dependencies(onnxruntime_providers_migraphx onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES})
-  target_link_libraries(onnxruntime_providers_migraphx PRIVATE ${migraphx_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} onnx flatbuffers::flatbuffers Boost::mp11 safeint_interface)
-  target_include_directories(onnxruntime_providers_migraphx PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime)
+  add_dependencies(onnxruntime_providers_migraphx ${onnxruntime_EXTERNAL_DEPENDENCIES})
+  target_link_libraries(onnxruntime_providers_migraphx PRIVATE migraphx::c hip::host ${ONNXRUNTIME_PROVIDERS_SHARED} onnx flatbuffers::flatbuffers Boost::mp11 safeint_interface)
+  target_include_directories(onnxruntime_providers_migraphx PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/migraphx/onnxruntime)
   set_target_properties(onnxruntime_providers_migraphx PROPERTIES LINKER_LANGUAGE CXX)
   set_target_properties(onnxruntime_providers_migraphx PROPERTIES FOLDER "ONNXRuntime")
-  target_compile_definitions(onnxruntime_providers_migraphx PRIVATE ONNXIFI_BUILD_LIBRARY=1)
-  if(MSVC)
+  target_compile_definitions(onnxruntime_providers_migraphx PRIVATE ONNXIFI_BUILD_LIBRARY=1 ONNX_ML=1 ONNX_NAMESPACE=onnx)
+  if(CMAKE_SYSTEM_NAME STREQUAL "Windows")
     set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY LINK_FLAGS /DEF:${ONNXRUNTIME_ROOT}/core/providers/migraphx/symbols.def)
-    target_link_libraries(onnxruntime_providers_migraphx PRIVATE ws2_32)
+    target_link_libraries(onnxruntime_providers_migraphx PRIVATE ws2_32 shlwapi)
   else()
     target_compile_options(onnxruntime_providers_migraphx PRIVATE -Wno-error=sign-compare)
     set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations")
@@ -62,6 +50,15 @@
     target_link_libraries(onnxruntime_providers_migraphx PRIVATE  stdc++fs)
   endif()
 
+  # Probe the MIGraphX C API; CMAKE_REQUIRED_LIBRARIES is cleared afterwards so later check_* calls are not linked against it.
+  set(CMAKE_REQUIRED_LIBRARIES migraphx::c)
+  check_symbol_exists(migraphx_onnx_options_set_external_data_path
+    "migraphx/migraphx.h" HAVE_MIGRAPHX_API_ONNX_OPTIONS_SET_EXTERNAL_DATA_PATH)
+  unset(CMAKE_REQUIRED_LIBRARIES)
+  if(HAVE_MIGRAPHX_API_ONNX_OPTIONS_SET_EXTERNAL_DATA_PATH)
+    target_compile_definitions(onnxruntime_providers_migraphx PRIVATE HAVE_MIGRAPHX_API_ONNX_OPTIONS_SET_EXTERNAL_DATA_PATH=1)
+  endif()
+
   if (onnxruntime_ENABLE_TRAINING_OPS)
     onnxruntime_add_include_to_target(onnxruntime_providers_migraphx onnxruntime_training)
     target_link_libraries(onnxruntime_providers_migraphx PRIVATE onnxruntime_training)
@@ -70,16 +67,10 @@
     endif()
   endif()
 
-  if(CMAKE_SYSTEM_NAME STREQUAL "Windows")
-    install(TARGETS onnxruntime_providers_migraphx
-            ARCHIVE  DESTINATION ${CMAKE_INSTALL_LIBDIR}
-            LIBRARY  DESTINATION ${CMAKE_INSTALL_BINDIR}
-            RUNTIME  DESTINATION ${CMAKE_INSTALL_BINDIR}
-    )
-  else()
-    install(TARGETS onnxruntime_providers_migraphx
-            ARCHIVE  DESTINATION ${CMAKE_INSTALL_LIBDIR}
-            LIBRARY  DESTINATION ${CMAKE_INSTALL_LIBDIR}
-            RUNTIME  DESTINATION ${CMAKE_INSTALL_BINDIR}
-    )
-  endif()
+  install(TARGETS onnxruntime_providers_migraphx
+          EXPORT    onnxruntime_providers_migraphxTargets
+          ARCHIVE   DESTINATION ${CMAKE_INSTALL_LIBDIR}
+          LIBRARY   DESTINATION ${CMAKE_INSTALL_LIBDIR}
+          RUNTIME   DESTINATION ${CMAKE_INSTALL_BINDIR}
+          FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR}
+  )
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
@@ -610,7 +610,6 @@ endif()
 
 if(onnxruntime_USE_MIGRAPHX)
   list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_migraphx)
-  list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_migraphx onnxruntime_providers_shared)
 endif()
 
 if(onnxruntime_USE_COREML)
@@ -691,9 +690,6 @@ endif()
 
 if(onnxruntime_USE_MIGRAPHX)
   list(APPEND onnxruntime_test_framework_src_patterns  ${TEST_SRC_DIR}/providers/migraphx/*)
-  list(APPEND onnxruntime_test_framework_src_patterns  "${ONNXRUNTIME_ROOT}/core/providers/migraphx/migraphx_execution_provider_utils.h")
-  list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_migraphx)
-  list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_migraphx onnxruntime_providers_shared)
 endif()
 
 if(onnxruntime_USE_NNAPI_BUILTIN)

diff --git a/cmake/patches/composable_kernel/Add_gfx12x_support.patch b/cmake/patches/composable_kernel/Add_gfx12x_support.patch
@@ -14,7 +14,8 @@ index bc326c8b5..db5ad5052 100644
 @@ -127,8 +127,10 @@ else()
          rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1030")
      elseif(GPU_ARCH MATCHES "gfx11")
-         rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1100;gfx1101;gfx1102")
+-         rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1100;gfx1101;gfx1102")
++         rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1100;gfx1101;gfx1102;gfx1151")
 +    elseif(GPU_ARCH MATCHES "gfx12")
 +        rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1200;gfx1201")
      else()
@@ -259,7 +260,8 @@ index 55f562061..69a7abf62 100644
 --- a/include/ck/ck.hpp
 +++ b/include/ck/ck.hpp
 @@ -69,6 +69,9 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
- #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__)
+- #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__)
++ #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || defined(__gfx1151__)
  #define __gfx11__
  #endif
 +#if defined(__gfx1200__) || defined(__gfx1201__)

diff --git a/cmake/patches/composable_kernel/Add_gfx950.patch b/cmake/patches/composable_kernel/Add_gfx950.patch
@@ -0,0 +1,14 @@
+diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
+index 55f562061..ee340eba1 100644
+--- a/include/ck/ck.hpp
++++ b/include/ck/ck.hpp
+@@ -53,7 +53,7 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
+
+ // define general macros for various architectures
+ #if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
+-    defined(__gfx942__)
++    defined(__gfx942__) || defined(__gfx950__)
+ #define __gfx9__
+ #endif
+ #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+
diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -711,15 +711,13 @@ typedef struct OrtTensorRTProviderOptions {
 typedef struct OrtMIGraphXProviderOptions {
   int device_id;                                     // hip device id.
   int migraphx_fp16_enable;                          // MIGraphX FP16 precision. Default 0 = false, nonzero = true
+  int migraphx_bf16_enable;                          // MIGraphX BF16 precision. Default 0 = false, nonzero = true
   int migraphx_fp8_enable;                           // MIGraphX FP8 precision. Default 0 = false, nonzero = true
   int migraphx_int8_enable;                          // MIGraphX INT8 precision. Default 0 = false, nonzero = true
-  int migraphx_use_native_calibration_table;         // MIGraphx INT8 cal table. Default 0 = false, noznero = true
+  int migraphx_use_native_calibration_table;         // Use the native MIGraphX INT8 calibration table. Default 0 = false, nonzero = true
   const char* migraphx_int8_calibration_table_name;  // MIGraphx INT8 calibration table name
-  int migraphx_save_compiled_model;                  // migraphx save compiled model. Default 0 = false, noznero = true
-  const char* migraphx_save_model_path;              // migraphx model path name
-  int migraphx_load_compiled_model;                  // migraphx int8 cal table. Default 0 = false, noznero = true
-  const char* migraphx_load_model_path;              // migraphx model path name
-  bool migraphx_exhaustive_tune;                     // migraphx tuned compile  Default = false
+  const char* migraphx_cache_dir;                    // MIGraphX model cache directory
+  int migraphx_exhaustive_tune;                      // MIGraphX exhaustive tuning at compile. Default 0 = false, nonzero = true
 
   /** \brief MIGraphX memory limit (To use all possible memory pass in maximum size_t)
    *   Defaults to SIZE_MAX.

diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc
@@ -562,7 +562,10 @@ static D3D12_COMMAND_LIST_TYPE CalculateCommandListType(ID3D12Device* d3d12_devi
       sizeof(feature_levels)
       ));
 
-  auto use_compute_command_list = (feature_levels.MaxSupportedFeatureLevel <= D3D_FEATURE_LEVEL_1_0_CORE);
+  // Use compute queue whenever possible on supported hardware to avoid TDR and maintain UI QoS
+  // Core and generic devices only have compute queues, DX11 has "immediate" submission, DX12 has both
+  auto use_compute_command_list = (feature_levels.MaxSupportedFeatureLevel <= D3D_FEATURE_LEVEL_1_0_CORE) ||
+                                  (feature_levels.MaxSupportedFeatureLevel >= D3D_FEATURE_LEVEL_12_0);
 
   if (use_compute_command_list)
   {

diff --git a/onnxruntime/core/providers/migraphx/migraphx_allocator.cc b/onnxruntime/core/providers/migraphx/migraphx_allocator.cc
@@ -23,11 +23,11 @@ void MIGraphXAllocator::CheckDevice() const {
 #endif
 }
 
-void* MIGraphXAllocator::Alloc(size_t size) {
+void* MIGraphXAllocator::Alloc(const size_t size) {
   CheckDevice();
   void* p = nullptr;
   if (size > 0) {
-    HIP_CALL_THROW(hipMalloc((void**)&p, size));
+    HIP_CALL_THROW(hipMalloc(&p, size));
   }
   return p;
 }
@@ -37,7 +37,7 @@ void MIGraphXAllocator::Free(void* p) {
   (void)hipFree(p);  // do not throw error since it's OK for hipFree to fail during shutdown
 }
 
-void* MIGraphXExternalAllocator::Alloc(size_t size) {
+void* MIGraphXExternalAllocator::Alloc(const size_t size) {
   void* p = nullptr;
   if (size > 0) {
     p = alloc_(size);
@@ -51,27 +51,27 @@ void* MIGraphXExternalAllocator::Alloc(size_t size) {
 
 void MIGraphXExternalAllocator::Free(void* p) {
   free_(p);
-  std::lock_guard<std::mutex> lock(lock_);
-  auto it = reserved_.find(p);
-  if (it != reserved_.end()) {
+  std::lock_guard lock(lock_);
+  if (const auto it = reserved_.find(p); it != reserved_.end()) {
     reserved_.erase(it);
     if (empty_cache_) empty_cache_();
   }
 }
 
-void* MIGraphXExternalAllocator::Reserve(size_t size) {
+void* MIGraphXExternalAllocator::Reserve(const size_t size) {
   void* p = Alloc(size);
-  if (!p) return nullptr;
-  std::lock_guard<std::mutex> lock(lock_);
-  ORT_ENFORCE(reserved_.find(p) == reserved_.end());
-  reserved_.insert(p);
+  if (p != nullptr) {
+    std::lock_guard lock(lock_);
+    ORT_ENFORCE(reserved_.find(p) == reserved_.end());
+    reserved_.insert(p);
+  }
   return p;
 }
 
-void* MIGraphXPinnedAllocator::Alloc(size_t size) {
+void* MIGraphXPinnedAllocator::Alloc(const size_t size) {
   void* p = nullptr;
   if (size > 0) {
-    HIP_CALL_THROW(hipHostMalloc((void**)&p, size));
+    HIP_CALL_THROW(hipHostMalloc(&p, size));
   }
   return p;
 }

diff --git a/onnxruntime/core/providers/migraphx/migraphx_allocator.h b/onnxruntime/core/providers/migraphx/migraphx_allocator.h
@@ -11,27 +11,27 @@ namespace onnxruntime {
 
 class MIGraphXAllocator : public IAllocator {
  public:
-  MIGraphXAllocator(int device_id, const char* name)
+  MIGraphXAllocator(const OrtDevice::DeviceId device_id, const char* name)
       : IAllocator(
             OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator,
                           OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::AMD,
-                                    static_cast<OrtDevice::DeviceId>(device_id)),
+                                    device_id),
                           OrtMemTypeDefault)) {}
 
-  virtual void* Alloc(size_t size) override;
-  virtual void Free(void* p) override;
+  void* Alloc(size_t size) override;
+  void Free(void* p) override;
 
  private:
   void CheckDevice() const;
 };
 
-class MIGraphXExternalAllocator : public MIGraphXAllocator {
+class MIGraphXExternalAllocator final : public MIGraphXAllocator {
   typedef void* (*ExternalAlloc)(size_t size);
   typedef void (*ExternalFree)(void* p);
   typedef void (*ExternalEmptyCache)();
 
  public:
-  MIGraphXExternalAllocator(OrtDevice::DeviceId device_id, const char* name, void* alloc, void* free, void* empty_cache)
+  MIGraphXExternalAllocator(const OrtDevice::DeviceId device_id, const char* name, void* alloc, void* free, void* empty_cache)
       : MIGraphXAllocator(device_id, name) {
     alloc_ = reinterpret_cast<ExternalAlloc>(alloc);
     free_ = reinterpret_cast<ExternalFree>(free);
@@ -52,11 +52,11 @@ class MIGraphXExternalAllocator : public MIGraphXAllocator {
 
 class MIGraphXPinnedAllocator final : public IAllocator {
  public:
-  MIGraphXPinnedAllocator(const int device_id, const char* name)
+  MIGraphXPinnedAllocator(const OrtDevice::DeviceId device_id, const char* name)
       : IAllocator(
             OrtMemoryInfo(name, OrtDeviceAllocator,
                           OrtDevice(OrtDevice::GPU, OrtDevice::MemType::HOST_ACCESSIBLE, OrtDevice::VendorIds::AMD,
-                                    static_cast<OrtDevice::DeviceId>(device_id)),
+                                    device_id),
                           OrtMemTypeCPUOutput)) {}
 
   void* Alloc(size_t size) override;