Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cmake/onnxruntime_common.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ if (WIN32)
"${ONNXRUNTIME_ROOT}/core/platform/windows/device_discovery.cc")
elseif (LINUX)
list(APPEND onnxruntime_common_src_patterns
"${ONNXRUNTIME_ROOT}/core/platform/linux/device_discovery.cc")
"${ONNXRUNTIME_ROOT}/core/platform/linux/device_discovery.cc"
"${ONNXRUNTIME_ROOT}/core/platform/linux/device_discovery_linux.h")
elseif (APPLE)
list(APPEND onnxruntime_common_src_patterns
"${ONNXRUNTIME_ROOT}/core/platform/apple/device_discovery.cc")
Expand Down
5 changes: 5 additions & 0 deletions cmake/onnxruntime_unittests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,11 @@ if(WIN32)
"${TEST_SRC_DIR}/platform/windows/logging/*.cc" )
endif()

if(LINUX)
list(APPEND onnxruntime_test_framework_src_patterns
"${TEST_SRC_DIR}/platform/linux/*.cc" )
endif()

if(NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_REDUCED_OPS_BUILD)

if(onnxruntime_USE_CUDA)
Expand Down
114 changes: 114 additions & 0 deletions onnxruntime/core/platform/linux/device_discovery.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// Licensed under the MIT License.

#include "core/platform/device_discovery.h"
#include "core/platform/linux/device_discovery_linux.h"

#include <filesystem>
#include <fstream>
Expand Down Expand Up @@ -175,6 +176,99 @@
return Status::OK();
}

} // namespace

// PCI bus-based GPU detection as a fallback for environments where DRM sysfs entries
// are not available (e.g., AKS/Kubernetes containers where the nvidia-drm kernel module
// is not loaded but GPU PCI devices are still exposed via sysfs).

namespace pci_device_discovery {

// Scans `sysfs_pci_devices_path` and collects every PCI device entry whose class code
// identifies it as a GPU.
//
// A device qualifies when its "class" file holds base class 0x03 (display controller) with
// sub-class 0x00 (VGA compatible controller) or 0x02 (3D controller). Entries whose "class"
// file cannot be read are skipped rather than reported as errors.
//
// On success, `gpu_pci_paths_out` is replaced with the detected devices; it is empty when the
// directory does not exist or holds no matching device. On failure a non-OK Status is returned
// and `gpu_pci_paths_out` is left untouched.
//
// Note: entries are reported in directory iteration order, which std::filesystem leaves unspecified.
Status DetectGpuPciPaths(const fs::path& sysfs_pci_devices_path,
                         std::vector<GpuPciPathInfo>& gpu_pci_paths_out) {
  std::error_code error_code{};
  const bool path_exists = fs::exists(sysfs_pci_devices_path, error_code);
  ORT_RETURN_IF_ERROR(ErrorCodeToStatus(error_code));

  if (!path_exists) {
    gpu_pci_paths_out = {};
    return Status::OK();
  }

  // Returns true when the PCI device directory describes a GPU/display controller.
  const auto is_gpu_pci_device = [](const fs::path& device_path) {
    // The class file contains a 24-bit value: 0xCCSSpp (class/subclass/prog-if).
    uint32_t pci_class{};
    if (!ReadValueFromFile(device_path / "class", pci_class).IsOK()) {
      return false;
    }

    // Base class 0x03 = Display controller
    //   Sub-class 0x00 = VGA compatible controller
    //   Sub-class 0x02 = 3D controller (common for NVIDIA data center/compute GPUs)
    const uint8_t base_class = static_cast<uint8_t>((pci_class >> 16) & 0xFF);
    const uint8_t sub_class = static_cast<uint8_t>((pci_class >> 8) & 0xFF);
    return base_class == 0x03 && (sub_class == 0x00 || sub_class == 0x02);
  };

  std::vector<GpuPciPathInfo> gpu_pci_paths{};

  auto dir_iterator = fs::directory_iterator{sysfs_pci_devices_path, error_code};
  ORT_RETURN_IF_ERROR(ErrorCodeToStatus(error_code));

  // Advance with increment(error_code) rather than a range-for: the range-for uses operator++,
  // which throws on an OS error instead of reporting it through the returned Status like the
  // rest of this function does.
  for (const fs::directory_iterator dir_end{}; dir_iterator != dir_end;) {
    const auto& device_path = dir_iterator->path();

    if (is_gpu_pci_device(device_path)) {
      GpuPciPathInfo path_info{};
      path_info.path = device_path;
      // The directory name of a PCI device entry is used as its bus ID.
      path_info.pci_bus_id = device_path.filename().string();
      gpu_pci_paths.emplace_back(std::move(path_info));
    }

    dir_iterator.increment(error_code);
    ORT_RETURN_IF_ERROR(ErrorCodeToStatus(error_code));
  }

  gpu_pci_paths_out = std::move(gpu_pci_paths);
  return Status::OK();
}

// Builds an OrtHardwareDevice for the GPU whose PCI sysfs directory is `path_info.path`.
//
// Reads the "vendor" and "device" files from that directory and records the following metadata:
//   "card_idx"   - `device_idx`, i.e. the caller's enumeration index
//   "Discrete"   - "1" or "0", only when IsGpuDiscrete() returns a value
//   "pci_bus_id" - `path_info.pci_bus_id`, only when it is non-empty
//
// Returns a non-OK Status if either ID file cannot be read; `gpu_device_out` is assigned only on success.
Status GetGpuDeviceFromPci(const GpuPciPathInfo& path_info, size_t device_idx, OrtHardwareDevice& gpu_device_out) {
  const auto& pci_device_dir = path_info.path;

  // Both ID files live directly under the PCI device directory.
  uint16_t pci_vendor_id{};
  ORT_RETURN_IF_ERROR(ReadValueFromFile(pci_device_dir / "vendor", pci_vendor_id));

  uint16_t pci_device_id{};
  ORT_RETURN_IF_ERROR(ReadValueFromFile(pci_device_dir / "device", pci_device_id));

  OrtHardwareDevice device{};
  device.type = OrtHardwareDeviceType_GPU;
  device.vendor_id = pci_vendor_id;
  device.device_id = pci_device_id;

  // Use the "card_idx" key for consistency with DRM-based detection; the value is the
  // device's position in enumeration order.
  device.metadata.Add("card_idx", MakeString(device_idx));

  const auto discrete = IsGpuDiscrete(pci_vendor_id, pci_device_id);
  if (discrete.has_value()) {
    device.metadata.Add("Discrete", (*discrete ? "1" : "0"));
  }

  const bool has_bus_id = !path_info.pci_bus_id.empty();
  if (has_bus_id) {
    device.metadata.Add("pci_bus_id", path_info.pci_bus_id);
  }

  gpu_device_out = std::move(device);
  return Status::OK();
}

} // namespace pci_device_discovery

namespace {

constexpr const char* kSysfsPciDevicesPath = "/sys/bus/pci/devices";

Status GetGpuDevices(std::vector<OrtHardwareDevice>& gpu_devices_out) {
std::vector<GpuSysfsPathInfo> gpu_sysfs_path_infos{};
ORT_RETURN_IF_ERROR(DetectGpuSysfsPaths(gpu_sysfs_path_infos));
Expand All @@ -188,6 +282,26 @@
gpu_devices.emplace_back(std::move(gpu_device));
}

// If DRM-based detection found no GPUs, fall back to PCI bus scanning.
// This handles containerized environments (e.g., AKS/Kubernetes) where the DRM
// subsystem (nvidia-drm) may not be available but GPU PCI devices are still
// exposed via /sys/bus/pci/devices/.
if (gpu_devices.empty()) {
LOGS_DEFAULT(VERBOSE) << "No GPUs found via /sys/class/drm. "
<< "Falling back to PCI bus scanning via " << kSysfsPciDevicesPath << ".";

Comment thread
baijumeswani marked this conversation as resolved.
std::vector<pci_device_discovery::GpuPciPathInfo> gpu_pci_path_infos{};
ORT_RETURN_IF_ERROR(pci_device_discovery::DetectGpuPciPaths(kSysfsPciDevicesPath, gpu_pci_path_infos));

gpu_devices.reserve(gpu_pci_path_infos.size());

for (size_t i = 0; i < gpu_pci_path_infos.size(); ++i) {
OrtHardwareDevice gpu_device{};
ORT_RETURN_IF_ERROR(pci_device_discovery::GetGpuDeviceFromPci(gpu_pci_path_infos[i], i, gpu_device));
gpu_devices.emplace_back(std::move(gpu_device));
}
}

gpu_devices_out = std::move(gpu_devices);
return Status::OK();
}
Expand Down
33 changes: 33 additions & 0 deletions onnxruntime/core/platform/linux/device_discovery_linux.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

// This header exposes Linux PCI device discovery internals for testing.

#pragma once

#include <filesystem>

Check warning on line 8 in onnxruntime/core/platform/linux/device_discovery_linux.h

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 <filesystem> is an unapproved C++17 header. [build/c++17] [5] Raw Output: onnxruntime/core/platform/linux/device_discovery_linux.h:8: <filesystem> is an unapproved C++17 header. [build/c++17] [5]
#include <string>
#include <vector>

#include "core/common/status.h"
#include "core/session/abi_devices.h"

namespace onnxruntime {
namespace pci_device_discovery {

// Identifies one GPU found by scanning a sysfs PCI devices directory.
struct GpuPciPathInfo {
// Directory of the PCI device entry; its "vendor" and "device" files are read from here.
std::filesystem::path path;
// Name of that directory (e.g. "0000:01:00.0"), reported as the "pci_bus_id" metadata entry.
std::string pci_bus_id;
};

// Scans the given sysfs PCI devices directory for GPU devices.
// Filters by PCI class codes: 0x0300 (VGA) and 0x0302 (3D controller).
// Entries whose "class" file cannot be read are skipped.
// Returns OK with an empty `gpu_pci_paths_out` when the directory does not exist or contains no
// matching device. On success the previous contents of `gpu_pci_paths_out` are replaced.
Status DetectGpuPciPaths(const std::filesystem::path& sysfs_pci_devices_path,
std::vector<GpuPciPathInfo>& gpu_pci_paths_out);

// Reads vendor/device IDs and populates an OrtHardwareDevice from a PCI device sysfs path.
// `device_idx` is stored as the "card_idx" metadata entry. "pci_bus_id" is added when
// `path_info.pci_bus_id` is non-empty, and "Discrete" is added when discreteness can be determined.
// Returns a non-OK Status if the vendor or device ID cannot be read; `gpu_device_out` is only
// assigned on success.
Status GetGpuDeviceFromPci(const GpuPciPathInfo& path_info, size_t device_idx,
OrtHardwareDevice& gpu_device_out);

} // namespace pci_device_discovery
} // namespace onnxruntime
11 changes: 11 additions & 0 deletions onnxruntime/test/platform/device_discovery_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,16 @@ TEST(DeviceDiscoveryTest, HasCpuDevice) {
#endif // defined(CPUINFO_SUPPORTED)
}

// Smoke test for GPU enumeration: it must complete without crashing, and every device it
// reports must be typed as a GPU with a non-zero vendor ID. Passes trivially when no GPU is found.
TEST(DeviceDiscoveryTest, GpuDevicesHaveValidProperties) {
  const auto discovered_gpus = GetDevicesByType(OrtHardwareDeviceType_GPU);

  for (const auto& device : discovered_gpus) {
    EXPECT_EQ(device.type, OrtHardwareDeviceType_GPU);
    // device_id is deliberately not checked: it may be 0 on some platforms
    // (e.g., Apple Silicon) where it is not populated.
    EXPECT_NE(device.vendor_id, 0u);
  }
}

} // namespace onnxruntime::test
#endif // !defined(ORT_MINIMAL_BUILD) && !defined(_GAMING_XBOX)
176 changes: 176 additions & 0 deletions onnxruntime/test/platform/linux/pci_device_discovery_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/platform/linux/device_discovery_linux.h"

#include <filesystem>
#include <fstream>

#include "gtest/gtest.h"

namespace fs = std::filesystem;

namespace onnxruntime::test {

namespace {

// Creates `device_dir` (including missing parents) and populates it like a sysfs PCI device
// entry: the files "class", "vendor" and "device" receive the given strings verbatim.
void CreateFakePciDevice(const fs::path& device_dir, const std::string& pci_class,
                         const std::string& vendor, const std::string& device) {
  fs::create_directories(device_dir);

  // Writes one attribute file; the stream is flushed and closed when it leaves scope.
  const auto write_attribute = [&device_dir](const char* attribute_name, const std::string& contents) {
    std::ofstream attribute_file(device_dir / attribute_name);
    attribute_file << contents;
  };

  write_attribute("class", pci_class);
  write_attribute("vendor", vendor);
  write_attribute("device", device);
}

// Test fixture that provides a fresh, empty scratch directory (`temp_dir_`) for each test to
// populate with fake PCI device entries. The directory is removed again after the test.
class PciDeviceDiscoveryTest : public ::testing::Test {
 protected:
  void SetUp() override {
    // Put the running test's name in the directory name. With a single fixed name, tests
    // executing in separate processes (e.g. GoogleTest sharding) would wipe each other's
    // fixtures through the remove_all() calls below.
    const auto* test_info = ::testing::UnitTest::GetInstance()->current_test_info();
    temp_dir_ = fs::temp_directory_path() /
                (std::string{"ort_pci_discovery_test_"} + test_info->name());

    // Clear leftovers from an earlier run that did not reach TearDown().
    fs::remove_all(temp_dir_);
    fs::create_directories(temp_dir_);
  }

  void TearDown() override {
    fs::remove_all(temp_dir_);
  }

  // Root of the fake PCI devices directory for the current test.
  fs::path temp_dir_;
};

} // namespace

// A device with PCI class 0x030000 (VGA compatible controller) must be reported as a GPU.
TEST_F(PciDeviceDiscoveryTest, DetectsNvidiaVgaController) {
  const std::string bus_id = "0000:01:00.0";
  CreateFakePciDevice(temp_dir_ / bus_id, "0x030000", "0x10de", "0x2204");

  std::vector<pci_device_discovery::GpuPciPathInfo> detected;
  const auto result = pci_device_discovery::DetectGpuPciPaths(temp_dir_, detected);
  ASSERT_TRUE(result.IsOK()) << result.ErrorMessage();

  ASSERT_EQ(detected.size(), 1u);
  EXPECT_EQ(detected.front().pci_bus_id, bus_id);
}

// A device with PCI class 0x030200 (3D controller, common for NVIDIA datacenter GPUs such as
// A100/H100) must be reported as a GPU.
TEST_F(PciDeviceDiscoveryTest, DetectsNvidia3DController) {
  const std::string bus_id = "0000:65:00.0";
  CreateFakePciDevice(temp_dir_ / bus_id, "0x030200", "0x10de", "0x20b5");

  std::vector<pci_device_discovery::GpuPciPathInfo> detected;
  const auto result = pci_device_discovery::DetectGpuPciPaths(temp_dir_, detected);
  ASSERT_TRUE(result.IsOK()) << result.ErrorMessage();

  ASSERT_EQ(detected.size(), 1u);
  EXPECT_EQ(detected.front().pci_bus_id, bus_id);
}

// Among a mix of PCI devices, only the one with a display-controller class code is reported.
TEST_F(PciDeviceDiscoveryTest, FiltersOutNonGpuDevices) {
  const std::string gpu_bus_id = "0000:01:00.0";

  // Network controller (class 0x020000) - must be skipped.
  CreateFakePciDevice(temp_dir_ / "0000:02:00.0", "0x020000", "0x8086", "0x1533");
  // SATA controller (class 0x010600) - must be skipped.
  CreateFakePciDevice(temp_dir_ / "0000:00:1f.2", "0x010600", "0x8086", "0xa102");
  // VGA controller (class 0x030000) - must be detected.
  CreateFakePciDevice(temp_dir_ / gpu_bus_id, "0x030000", "0x10de", "0x2204");

  std::vector<pci_device_discovery::GpuPciPathInfo> detected;
  const auto result = pci_device_discovery::DetectGpuPciPaths(temp_dir_, detected);
  ASSERT_TRUE(result.IsOK()) << result.ErrorMessage();

  ASSERT_EQ(detected.size(), 1u);
  EXPECT_EQ(detected.front().pci_bus_id, gpu_bus_id);
}

// A missing devices directory is not an error: detection succeeds and reports no GPUs.
TEST_F(PciDeviceDiscoveryTest, ReturnsEmptyForNonexistentPath) {
  const fs::path missing_dir = temp_dir_ / "nonexistent";

  std::vector<pci_device_discovery::GpuPciPathInfo> detected;
  const auto result = pci_device_discovery::DetectGpuPciPaths(missing_dir, detected);

  ASSERT_TRUE(result.IsOK()) << result.ErrorMessage();
  EXPECT_TRUE(detected.empty());
}

// An existing devices directory with no entries yields success and no GPUs.
TEST_F(PciDeviceDiscoveryTest, ReturnsEmptyForEmptyDirectory) {
  std::vector<pci_device_discovery::GpuPciPathInfo> detected;
  const auto result = pci_device_discovery::DetectGpuPciPaths(temp_dir_, detected);

  ASSERT_TRUE(result.IsOK()) << result.ErrorMessage();
  EXPECT_TRUE(detected.empty());
}

// Every GPU entry in the directory is reported, not just the first one found.
TEST_F(PciDeviceDiscoveryTest, DetectsMultipleGpus) {
  // Two NVIDIA GPUs with identical class/vendor/device values on different bus addresses.
  for (const char* bus_id : {"0000:01:00.0", "0000:41:00.0"}) {
    CreateFakePciDevice(temp_dir_ / bus_id, "0x030200", "0x10de", "0x20b5");
  }

  std::vector<pci_device_discovery::GpuPciPathInfo> detected;
  const auto result = pci_device_discovery::DetectGpuPciPaths(temp_dir_, detected);
  ASSERT_TRUE(result.IsOK()) << result.ErrorMessage();

  EXPECT_EQ(detected.size(), 2u);
}

// A device directory without a "class" file is silently skipped instead of failing detection.
TEST_F(PciDeviceDiscoveryTest, SkipsDevicesWithMissingClassFile) {
  // Build the entry by hand so that only the "vendor" file exists.
  const fs::path classless_device_dir = temp_dir_ / "0000:03:00.0";
  fs::create_directories(classless_device_dir);
  {
    std::ofstream vendor_file(classless_device_dir / "vendor");
    vendor_file << "0x10de";
  }

  std::vector<pci_device_discovery::GpuPciPathInfo> detected;
  const auto result = pci_device_discovery::DetectGpuPciPaths(temp_dir_, detected);

  ASSERT_TRUE(result.IsOK()) << result.ErrorMessage();
  EXPECT_TRUE(detected.empty());
}

// For a fake NVIDIA GPU, the vendor/device IDs are read from the device directory and the
// "card_idx", "pci_bus_id" and "Discrete" metadata entries are all populated.
TEST_F(PciDeviceDiscoveryTest, GetGpuDeviceFromPciReadsVendorAndDevice) {
  const std::string bus_id = "0000:65:00.0";
  const fs::path device_dir = temp_dir_ / bus_id;
  CreateFakePciDevice(device_dir, "0x030200", "0x10de", "0x20b5");

  pci_device_discovery::GpuPciPathInfo path_info;
  path_info.path = device_dir;
  path_info.pci_bus_id = bus_id;

  OrtHardwareDevice device{};
  const auto result = pci_device_discovery::GetGpuDeviceFromPci(path_info, 0, device);
  ASSERT_TRUE(result.IsOK()) << result.ErrorMessage();

  EXPECT_EQ(device.type, OrtHardwareDeviceType_GPU);
  EXPECT_EQ(device.vendor_id, 0x10deu);
  EXPECT_EQ(device.device_id, 0x20b5u);

  const auto& metadata = device.metadata.Entries();

  // Enumeration index passed to GetGpuDeviceFromPci.
  EXPECT_NE(metadata.find("card_idx"), metadata.end());
  EXPECT_EQ(metadata.at("card_idx"), "0");

  // Bus ID carried over from path_info.
  EXPECT_NE(metadata.find("pci_bus_id"), metadata.end());
  EXPECT_EQ(metadata.at("pci_bus_id"), "0000:65:00.0");

  // This NVIDIA device is expected to be classified as discrete.
  EXPECT_NE(metadata.find("Discrete"), metadata.end());
  EXPECT_EQ(metadata.at("Discrete"), "1");
}

// For a fake AMD GPU, the IDs and "card_idx" are populated but no "Discrete" entry is added.
TEST_F(PciDeviceDiscoveryTest, GetGpuDeviceFromPciNonNvidiaVendor) {
  const std::string bus_id = "0000:03:00.0";
  const fs::path device_dir = temp_dir_ / bus_id;
  CreateFakePciDevice(device_dir, "0x030000", "0x1002", "0x731f");

  pci_device_discovery::GpuPciPathInfo path_info;
  path_info.path = device_dir;
  path_info.pci_bus_id = bus_id;

  OrtHardwareDevice device{};
  const auto result = pci_device_discovery::GetGpuDeviceFromPci(path_info, 2, device);
  ASSERT_TRUE(result.IsOK()) << result.ErrorMessage();

  EXPECT_EQ(device.type, OrtHardwareDeviceType_GPU);
  EXPECT_EQ(device.vendor_id, 0x1002u);
  EXPECT_EQ(device.device_id, 0x731fu);

  const auto& metadata = device.metadata.Entries();
  EXPECT_EQ(metadata.at("card_idx"), "2");
  // Non-NVIDIA vendor should not have the Discrete metadata entry.
  EXPECT_EQ(metadata.find("Discrete"), metadata.end());
}

} // namespace onnxruntime::test
Loading