diff --git a/onnxruntime/core/platform/windows/device_discovery.cc b/onnxruntime/core/platform/windows/device_discovery.cc index cf761f587ad0b..d8ea34491f526 100644 --- a/onnxruntime/core/platform/windows/device_discovery.cc +++ b/onnxruntime/core/platform/windows/device_discovery.cc @@ -280,6 +280,24 @@ std::unordered_map GetDeviceInfoSetupApi(const std::unorde } } + // Extract PCI bus location for GPU/NPU devices. + // SPDRP_LOCATION_INFORMATION returns strings like "PCI bus 1, device 0, function 0". + // Convert to "domain:bus:device.function" format (e.g., "0000:01:00.0") that + // CUDA and other runtimes expect (e.g., cudaDeviceGetByPCIBusId). + if (entry->type == OrtHardwareDeviceType_GPU || entry->type == OrtHardwareDeviceType_NPU) { + if (SetupDiGetDeviceRegistryPropertyW(devInfo, &devData, SPDRP_LOCATION_INFORMATION, nullptr, + (PBYTE)buffer, sizeof(buffer), &size)) { + std::wstring location_info(buffer, wcslen(buffer)); + unsigned int bus = 0, device = 0, function = 0; + if (swscanf_s(location_info.c_str(), L"PCI bus %u, device %u, function %u", + &bus, &device, &function) == 3) { + wchar_t pci_bus_id[32]; + swprintf_s(pci_bus_id, L"0000:%02x:%02x.%x", bus, device, function); + entry->metadata[L"pci_bus_id"] = pci_bus_id; + } + } + } + // Generate telemetry event to log the GPU and NPU driver name and version. if (entry->type == OrtHardwareDeviceType_CPU) { // Skip processor entries for telemetry. diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc index 0350a259cfb8d..4d79902a43961 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc @@ -148,7 +148,6 @@ CudaEpFactory::HardwareDeviceKey CudaEpFactory::MakeDeviceKey(const OrtApi& ort_ }; } -/*static*/ OrtStatus* ORT_API_CALL CudaEpFactory::GetSupportedDevicesImpl( OrtEpFactory* this_ptr, const OrtHardwareDevice* const* hw_devices, @@ -196,10 +195,27 @@ OrtStatus* ORT_API_CALL CudaEpFactory::GetSupportedDevicesImpl( continue; // Skip non-NVIDIA GPUs } - // CUDA uses contiguous ordinals for CUDA-visible NVIDIA devices. Build that - // mapping from the filtered hardware-device list instead of relying on the - // ORT hardware device id, which is not guaranteed to be a CUDA ordinal. - int current_device_id = cuda_device_index++; + // Try to resolve the CUDA ordinal from pci_bus_id metadata if available. + // This is more reliable than counter-based ordinal assignment because it is + // not affected by enumeration order, CUDA_VISIBLE_DEVICES remapping, or + // mixed-vendor GPU configurations. + int current_device_id = -1; + const OrtKeyValuePairs* metadata = factory->ort_api_.HardwareDevice_Metadata(&device); + if (metadata != nullptr) { + const char* pci_bus_id = factory->ort_api_.GetKeyValue(metadata, "pci_bus_id"); + if (pci_bus_id != nullptr && pci_bus_id[0] != '\0') { + int resolved_ordinal = -1; + cudaError_t err = cudaDeviceGetByPCIBusId(&resolved_ordinal, pci_bus_id); + if (err == cudaSuccess && resolved_ordinal >= 0 && resolved_ordinal < cuda_device_count) { + current_device_id = resolved_ordinal; + } + } + } + + // Fallback: if pci_bus_id was not available, use counter-based ordinal assignment. + if (current_device_id < 0) { + current_device_id = cuda_device_index++; + } // Validate the assigned ordinal is within the range of CUDA-visible devices. // If hardware enumeration reports GPUs not visible to CUDA (e.g. due to @@ -207,6 +223,7 @@ OrtStatus* ORT_API_CALL CudaEpFactory::GetSupportedDevicesImpl( if (current_device_id >= cuda_device_count) { continue; } + const auto device_key = CudaEpFactory::MakeDeviceKey(factory->ort_api_, device, current_device_id); DeviceCacheEntry* cache_entry = nullptr; { diff --git a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_pci_bus_id_test.cc b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_pci_bus_id_test.cc new file mode 100644 index 0000000000000..5f91ff066f8c5 --- /dev/null +++ b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_pci_bus_id_test.cc @@ -0,0 +1,211 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// Tests that the CUDA plugin EP correctly resolves CUDA ordinals via pci_bus_id +// metadata from OrtHardwareDevice, using cudaDeviceGetByPCIBusId. + +#if defined(ORT_UNIT_TEST_HAS_CUDA_PLUGIN_EP) + +#include +#include +#include +#include + +#include +#include + +#include "core/session/onnxruntime_cxx_api.h" +#include "test/util/include/file_util.h" + +extern std::unique_ptr ort_env; + +namespace onnxruntime { +namespace test { +namespace { + +constexpr const char* kCudaPluginEpRegistrationName = "CudaPluginPciBusIdTest"; + +// Resolve the CUDA plugin EP shared library path. +std::filesystem::path GetCudaPluginLibraryPath() { + return GetSharedLibraryFileName(ORT_TSTR("onnxruntime_providers_cuda_plugin")); +} + +// RAII handle that registers/unregisters the CUDA plugin EP library. +class ScopedCudaPluginRegistration { + public: + ScopedCudaPluginRegistration(Ort::Env& env, const char* registration_name) + : env_(env), name_(registration_name) { + auto lib_path = GetCudaPluginLibraryPath(); + if (!std::filesystem::exists(lib_path)) { + available_ = false; + return; + } + env_.RegisterExecutionProviderLibrary(name_.c_str(), lib_path.c_str()); + available_ = true; + } + + ~ScopedCudaPluginRegistration() { + if (available_) { + try { + env_.UnregisterExecutionProviderLibrary(name_.c_str()); + } catch (...) { + } + } + } + + bool IsAvailable() const { return available_; } + + ScopedCudaPluginRegistration(const ScopedCudaPluginRegistration&) = delete; + ScopedCudaPluginRegistration& operator=(const ScopedCudaPluginRegistration&) = delete; + + private: + Ort::Env& env_; + std::string name_; + bool available_ = false; +}; + +} // namespace + +class CudaPluginPciBusIdTest : public ::testing::Test { + protected: + void SetUp() override { + cudaError_t err = cudaGetDeviceCount(&cuda_device_count_); + if (err != cudaSuccess || cuda_device_count_ == 0) { + GTEST_SKIP() << "No CUDA device available."; + } + + registration_ = std::make_unique( + *ort_env, kCudaPluginEpRegistrationName); + if (!registration_->IsAvailable()) { + GTEST_SKIP() << "CUDA plugin EP library not found."; + } + } + + void TearDown() override { + registration_.reset(); + cudaDeviceSynchronize(); + } + + std::unique_ptr registration_; + int cuda_device_count_ = 0; +}; + +// Verify that the CUDA plugin EP device's cuda_device_id in ep_metadata matches +// the ordinal obtained by calling cudaDeviceGetByPCIBusId with the pci_bus_id +// from the hardware device metadata. +TEST_F(CudaPluginPciBusIdTest, CudaOrdinalMatchesPciBusIdResolution) { + auto ep_devices = ort_env->GetEpDevices(); + + int cuda_ep_device_count = 0; + for (const auto& ep_device : ep_devices) { + if (strcmp(ep_device.EpName(), "CudaPluginExecutionProvider") != 0) { + continue; + } + + ++cuda_ep_device_count; + + // Get the pci_bus_id from the hardware device metadata. + auto hw_device = ep_device.Device(); + ASSERT_TRUE(hw_device) << "EpDevice has no associated hardware device."; + + auto hw_metadata = hw_device.Metadata(); + const char* pci_bus_id = hw_metadata.GetValue("pci_bus_id"); + + // pci_bus_id may not be available on all platforms (e.g., some VMs). + // If it's missing, we can't verify the mapping but the EP should still work + // via the fallback counter-based ordinal assignment. + if (pci_bus_id == nullptr || pci_bus_id[0] == '\0') { + GTEST_LOG_(INFO) << "pci_bus_id not available in hardware device metadata; " + << "skipping PCI bus ID resolution check for this device."; + continue; + } + + // Resolve the CUDA ordinal from pci_bus_id using the CUDA runtime API. + int expected_ordinal = -1; + cudaError_t cuda_err = cudaDeviceGetByPCIBusId(&expected_ordinal, pci_bus_id); + ASSERT_EQ(cuda_err, cudaSuccess) + << "cudaDeviceGetByPCIBusId failed for pci_bus_id=\"" << pci_bus_id + << "\": " << cudaGetErrorString(cuda_err); + ASSERT_GE(expected_ordinal, 0); + ASSERT_LT(expected_ordinal, cuda_device_count_); + + // Get the cuda_device_id that the plugin EP assigned (stored in ep_metadata). + auto ep_metadata = ep_device.EpMetadata(); + const char* cuda_device_id_str = ep_metadata.GetValue("cuda_device_id"); + ASSERT_NE(cuda_device_id_str, nullptr) + << "cuda_device_id missing from CUDA plugin EP metadata."; + + int actual_ordinal = std::stoi(cuda_device_id_str); + + EXPECT_EQ(actual_ordinal, expected_ordinal) + << "CUDA plugin EP assigned ordinal " << actual_ordinal + << " but cudaDeviceGetByPCIBusId(\"" << pci_bus_id << "\") resolved to " + << expected_ordinal << "."; + } + + EXPECT_GT(cuda_ep_device_count, 0) + << "No CudaPluginExecutionProvider devices found after registration."; +} + +// Verify that every CUDA device on the host has a corresponding CUDA plugin EP device +// with the correct pci_bus_id → ordinal mapping. +TEST_F(CudaPluginPciBusIdTest, AllCudaDevicesHaveMatchingEpDevices) { + auto ep_devices = ort_env->GetEpDevices(); + + // Collect the set of CUDA ordinals assigned by the plugin EP. + std::vector assigned_ordinals; + for (const auto& ep_device : ep_devices) { + if (strcmp(ep_device.EpName(), "CudaPluginExecutionProvider") != 0) { + continue; + } + auto ep_metadata = ep_device.EpMetadata(); + const char* cuda_device_id_str = ep_metadata.GetValue("cuda_device_id"); + if (cuda_device_id_str != nullptr) { + assigned_ordinals.push_back(std::stoi(cuda_device_id_str)); + } + } + + // Every ordinal in [0, cuda_device_count_) should appear exactly once. + std::sort(assigned_ordinals.begin(), assigned_ordinals.end()); + // Remove duplicates to check uniqueness after. + auto unique_end = std::unique(assigned_ordinals.begin(), assigned_ordinals.end()); + + EXPECT_EQ(std::distance(assigned_ordinals.begin(), unique_end), + static_cast(assigned_ordinals.size())) + << "Duplicate CUDA ordinals detected among plugin EP devices."; + + EXPECT_EQ(static_cast(assigned_ordinals.size()), cuda_device_count_) + << "Number of CUDA plugin EP devices (" << assigned_ordinals.size() + << ") does not match CUDA device count (" << cuda_device_count_ << ")."; +} + +// Verify that cudaDeviceGetByPCIBusId round-trips with cudaDeviceGetPCIBusId +// for all CUDA devices. This validates the format consistency that the +// ORT device discovery code must produce. +TEST_F(CudaPluginPciBusIdTest, PciBusIdRoundTrip) { + for (int ordinal = 0; ordinal < cuda_device_count_; ++ordinal) { + // Get the PCI bus ID string from CUDA runtime for this ordinal. + char pci_bus_id[64] = {}; + cudaError_t err = cudaDeviceGetPCIBusId(pci_bus_id, sizeof(pci_bus_id), ordinal); + ASSERT_EQ(err, cudaSuccess) + << "cudaDeviceGetPCIBusId failed for ordinal " << ordinal + << ": " << cudaGetErrorString(err); + + // Round-trip: resolve back to ordinal. + int resolved_ordinal = -1; + err = cudaDeviceGetByPCIBusId(&resolved_ordinal, pci_bus_id); + ASSERT_EQ(err, cudaSuccess) + << "cudaDeviceGetByPCIBusId failed for pci_bus_id=\"" << pci_bus_id + << "\": " << cudaGetErrorString(err); + + EXPECT_EQ(resolved_ordinal, ordinal) + << "PCI bus ID round-trip failed: ordinal " << ordinal + << " → pci_bus_id=\"" << pci_bus_id + << "\" → resolved ordinal " << resolved_ordinal; + } +} + +} // namespace test +} // namespace onnxruntime + +#endif // defined(ORT_UNIT_TEST_HAS_CUDA_PLUGIN_EP) \ No newline at end of file