Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cmake/onnxruntime_common.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ if (WIN32)
"${ONNXRUNTIME_ROOT}/core/platform/windows/device_discovery.cc")
elseif (LINUX)
list(APPEND onnxruntime_common_src_patterns
"${ONNXRUNTIME_ROOT}/core/platform/linux/device_discovery.cc")
"${ONNXRUNTIME_ROOT}/core/platform/linux/device_discovery.cc"
"${ONNXRUNTIME_ROOT}/core/platform/linux/pci_device_discovery.h")
elseif (APPLE)
list(APPEND onnxruntime_common_src_patterns
"${ONNXRUNTIME_ROOT}/core/platform/apple/device_discovery.cc")
Expand Down
5 changes: 5 additions & 0 deletions cmake/onnxruntime_unittests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,11 @@ if(WIN32)
"${TEST_SRC_DIR}/platform/windows/logging/*.cc" )
endif()

# On Linux, also build every .cc file in the Linux platform test directory
# (e.g. the PCI device discovery tests).
if(LINUX)
list(APPEND onnxruntime_test_framework_src_patterns
"${TEST_SRC_DIR}/platform/linux/*.cc" )
endif()

if(NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_REDUCED_OPS_BUILD)

if(onnxruntime_USE_CUDA)
Expand Down
114 changes: 114 additions & 0 deletions onnxruntime/core/platform/linux/device_discovery.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// Licensed under the MIT License.

#include "core/platform/device_discovery.h"
#include "core/platform/linux/pci_device_discovery.h"

#include <filesystem>
#include <fstream>
Expand Down Expand Up @@ -175,6 +176,99 @@
return Status::OK();
}

} // namespace

// PCI bus-based GPU detection as a fallback for environments where DRM sysfs entries
// are not available (e.g., AKS/Kubernetes containers where the nvidia-drm kernel module
// is not loaded but GPU PCI devices are still exposed via sysfs).

namespace pci_device_discovery {

// Scans `sysfs_pci_devices_path` for PCI devices whose class code marks them as a GPU.
//
// On success, `gpu_pci_paths_out` is replaced with one entry per directory entry whose `class`
// file identifies a VGA compatible controller or a 3D controller. A nonexistent directory is not
// an error and yields an empty result. Entries whose `class` file cannot be read are skipped.
//
// Filesystem errors while checking, opening, or advancing through the directory are returned as a
// non-OK Status; in that case `gpu_pci_paths_out` is left unmodified.
Status DetectGpuPciPaths(const fs::path& sysfs_pci_devices_path,
                         std::vector<GpuPciPathInfo>& gpu_pci_paths_out) {
  std::error_code error_code{};
  const bool path_exists = fs::exists(sysfs_pci_devices_path, error_code);
  ORT_RETURN_IF_ERROR(ErrorCodeToStatus(error_code));

  if (!path_exists) {
    gpu_pci_paths_out = {};
    return Status::OK();
  }

  std::vector<GpuPciPathInfo> gpu_pci_paths{};

  auto dir_iterator = fs::directory_iterator{sysfs_pci_devices_path, error_code};
  ORT_RETURN_IF_ERROR(ErrorCodeToStatus(error_code));

  // Advance with increment(error_code) instead of a range-based for loop. The latter uses the
  // throwing operator++, which would let a filesystem error in the middle of the scan escape this
  // Status-returning function as an exception. The `!error_code` check stops the loop as soon as
  // an increment fails, regardless of the state the iterator is left in.
  for (const fs::directory_iterator end_iterator{};
       !error_code && dir_iterator != end_iterator;
       dir_iterator.increment(error_code)) {
    const auto& device_path = dir_iterator->path();

    // Read PCI class code to identify GPU devices.
    // The class file contains a 24-bit value: 0xCCSSpp (class/subclass/prog-if).
    uint32_t pci_class{};
    if (auto status = ReadValueFromFile(device_path / "class", pci_class); !status.IsOK()) {
      continue;
    }

    // Check for GPU/display controller PCI class codes:
    //   Base class 0x03 = Display controller
    //     Sub-class 0x00 = VGA compatible controller
    //     Sub-class 0x02 = 3D controller (common for NVIDIA data center/compute GPUs)
    // Reference: PCI Code and ID Assignment Specification
    // https://pcisig.com/pci-code-and-id-assignment-specification-agreement
    // See section on base class 03h.
    const uint8_t base_class = static_cast<uint8_t>((pci_class >> 16) & 0xFF);
    const uint8_t sub_class = static_cast<uint8_t>((pci_class >> 8) & 0xFF);
    if (base_class != 0x03 || (sub_class != 0x00 && sub_class != 0x02)) {
      continue;
    }

    GpuPciPathInfo path_info{};
    path_info.path = device_path;
    // The directory name is used as the PCI bus id.
    path_info.pci_bus_id = device_path.filename().string();

    gpu_pci_paths.emplace_back(std::move(path_info));
  }

  // Surface a failure reported by the last increment(), if any.
  ORT_RETURN_IF_ERROR(ErrorCodeToStatus(error_code));

  gpu_pci_paths_out = std::move(gpu_pci_paths);
  return Status::OK();
}

// Builds a GPU OrtHardwareDevice from the `vendor` and `device` files found directly under
// `path_info.path`.
//
// Returns the failing Status if either file cannot be read; `gpu_device_out` is only assigned
// on success. Metadata: "Discrete" is added when IsGpuDiscrete() yields a value, and
// "pci_bus_id" is added when `path_info.pci_bus_id` is non-empty.
Status GetGpuDeviceFromPci(const GpuPciPathInfo& path_info, OrtHardwareDevice& gpu_device_out) {
  const auto& device_dir = path_info.path;

  // Read both IDs up front (vendor first, then device) so that a failure returns before any
  // device state is built.
  uint16_t vendor{};
  ORT_RETURN_IF_ERROR(ReadValueFromFile(device_dir / "vendor", vendor));

  uint16_t device{};
  ORT_RETURN_IF_ERROR(ReadValueFromFile(device_dir / "device", device));

  OrtHardwareDevice result{};
  result.type = OrtHardwareDeviceType_GPU;
  result.vendor_id = vendor;
  result.device_id = device;

  // "Discrete" is only recorded when the vendor/device pair can be classified.
  const auto discrete = IsGpuDiscrete(vendor, device);
  if (discrete.has_value()) {
    result.metadata.Add("Discrete", (*discrete ? "1" : "0"));
  }

  const auto& bus_id = path_info.pci_bus_id;
  if (!bus_id.empty()) {
    result.metadata.Add("pci_bus_id", bus_id);
  }

  gpu_device_out = std::move(result);
  return Status::OK();
}

} // namespace pci_device_discovery

namespace {

constexpr const char* kSysfsPciDevicesPath = "/sys/bus/pci/devices";

Status GetGpuDevices(std::vector<OrtHardwareDevice>& gpu_devices_out) {
std::vector<GpuSysfsPathInfo> gpu_sysfs_path_infos{};
ORT_RETURN_IF_ERROR(DetectGpuSysfsPaths(gpu_sysfs_path_infos));
Expand All @@ -188,6 +282,26 @@
gpu_devices.emplace_back(std::move(gpu_device));
}

// If DRM-based detection found no GPUs, fall back to PCI bus scanning.
// This handles containerized environments (e.g., AKS/Kubernetes) where the DRM
// subsystem (nvidia-drm) may not be available but GPU PCI devices are still
// exposed via /sys/bus/pci/devices/.
if (gpu_devices.empty()) {
LOGS_DEFAULT(VERBOSE) << "No GPUs found via /sys/class/drm. "
<< "Falling back to PCI bus scanning via " << kSysfsPciDevicesPath << ".";

Comment thread
baijumeswani marked this conversation as resolved.
std::vector<pci_device_discovery::GpuPciPathInfo> gpu_pci_path_infos{};
ORT_RETURN_IF_ERROR(pci_device_discovery::DetectGpuPciPaths(kSysfsPciDevicesPath, gpu_pci_path_infos));

gpu_devices.reserve(gpu_pci_path_infos.size());

for (const auto& gpu_pci_path_info : gpu_pci_path_infos) {
OrtHardwareDevice gpu_device{};
ORT_RETURN_IF_ERROR(pci_device_discovery::GetGpuDeviceFromPci(gpu_pci_path_info, gpu_device));
gpu_devices.emplace_back(std::move(gpu_device));
}
}

gpu_devices_out = std::move(gpu_devices);
return Status::OK();
}
Expand Down
33 changes: 33 additions & 0 deletions onnxruntime/core/platform/linux/pci_device_discovery.h
Comment thread
edgchen1 marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

// This header exposes Linux PCI device discovery internals for testing.

#pragma once

#include <filesystem>

Check warning on line 8 in onnxruntime/core/platform/linux/pci_device_discovery.h

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 <filesystem> is an unapproved C++17 header. [build/c++17] [5] Raw Output: onnxruntime/core/platform/linux/pci_device_discovery.h:8: <filesystem> is an unapproved C++17 header. [build/c++17] [5]
#include <string>
#include <vector>

#include "core/common/status.h"
#include "core/session/abi_devices.h"

namespace onnxruntime {
namespace pci_device_discovery {

// Describes one GPU located while scanning a sysfs PCI devices directory.
struct GpuPciPathInfo {
// Directory of the PCI device; GetGpuDeviceFromPci reads the `vendor` and `device` files under it.
std::filesystem::path path;
// PCI bus id, e.g. "0000:65:00.0". DetectGpuPciPaths sets this to the name of `path`'s directory.
// When non-empty, GetGpuDeviceFromPci reports it as the "pci_bus_id" metadata entry.
std::string pci_bus_id;
};

// Scans the given sysfs PCI devices directory for GPU devices.
// Filters by PCI class codes: 0x0300 (VGA) and 0x0302 (3D controller).
// A nonexistent directory is not an error: OK is returned and `gpu_pci_paths_out` is emptied.
// Directory entries whose `class` file cannot be read are skipped.
// On success, `gpu_pci_paths_out` is overwritten with the detected devices.
Status DetectGpuPciPaths(const std::filesystem::path& sysfs_pci_devices_path,
std::vector<GpuPciPathInfo>& gpu_pci_paths_out);

// Reads vendor/device IDs and populates an OrtHardwareDevice from a PCI device sysfs path.
// Returns a non-OK Status if reading the `vendor` or `device` file under `path_info.path` fails;
// `gpu_device_out` is only assigned on success.
// The device type is set to OrtHardwareDeviceType_GPU. `path_info.pci_bus_id`, when non-empty,
// is added to the device metadata under the "pci_bus_id" key.
Status GetGpuDeviceFromPci(const GpuPciPathInfo& path_info,
OrtHardwareDevice& gpu_device_out);

} // namespace pci_device_discovery
} // namespace onnxruntime
10 changes: 10 additions & 0 deletions onnxruntime/test/platform/device_discovery_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,15 @@ TEST(DeviceDiscoveryTest, HasCpuDevice) {
#endif // defined(CPUINFO_SUPPORTED)
}

TEST(DeviceDiscoveryTest, GpuDevicesHaveValidProperties) {
  // Enumerating GPUs must not crash. With no GPU present the loop below is simply a no-op.
  const auto discovered_gpus = GetDevicesByType(OrtHardwareDeviceType_GPU);

  for (const auto& gpu : discovered_gpus) {
    // Only vendor_id is validated: device_id may be 0 on some platforms (e.g., Apple Silicon)
    // where it is not populated.
    EXPECT_NE(gpu.vendor_id, 0u);
  }
}

} // namespace onnxruntime::test
#endif // !defined(ORT_MINIMAL_BUILD) && !defined(_GAMING_XBOX)
165 changes: 165 additions & 0 deletions onnxruntime/test/platform/linux/pci_device_discovery_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/platform/linux/pci_device_discovery.h"

#include <unistd.h>

#include <filesystem>
#include <fstream>
#include <string>
#include <vector>

#include "test/util/include/asserts.h"
#include "gtest/gtest.h"

namespace fs = std::filesystem;

namespace onnxruntime::test {

namespace {

// Makes `device_dir` look like a sysfs PCI device entry: creates the directory (and any missing
// parents) and writes the given strings verbatim into its `class`, `vendor`, and `device` files.
void CreateFakePciDevice(const fs::path& device_dir, const std::string& pci_class,
                         const std::string& vendor, const std::string& device) {
  fs::create_directories(device_dir);

  // Each attribute file is closed as soon as the stream goes out of scope.
  const auto write_attribute = [&device_dir](const char* attribute_name, const std::string& contents) {
    std::ofstream stream(device_dir / attribute_name);
    stream << contents;
  };

  write_attribute("class", pci_class);
  write_attribute("vendor", vendor);
  write_attribute("device", device);
}

// Fixture providing an empty scratch directory that tests populate with fake PCI device entries.
//
// The directory name includes the process id so that test processes running concurrently against
// the same system temp directory (e.g. parallel or sharded runs) cannot delete each other's files.
class PciDeviceDiscoveryTest : public ::testing::Test {
 protected:
  void SetUp() override {
    temp_dir_ = fs::temp_directory_path() /
                ("ort_pci_discovery_test_" + std::to_string(::getpid()));
    // Remove leftovers from an earlier run that did not clean up and happened to reuse this pid.
    fs::remove_all(temp_dir_);
    fs::create_directories(temp_dir_);
  }

  void TearDown() override {
    fs::remove_all(temp_dir_);
  }

  // Root of the fake PCI devices directory; recreated empty before every test.
  fs::path temp_dir_;
};

} // namespace

TEST_F(PciDeviceDiscoveryTest, DetectsNvidiaVgaController) {
  // Class code 0x030000: base class 0x03 (display), sub-class 0x00 (VGA compatible controller).
  const char* const bus_id = "0000:01:00.0";
  CreateFakePciDevice(temp_dir_ / bus_id, "0x030000", "0x10de", "0x2204");

  std::vector<pci_device_discovery::GpuPciPathInfo> detected{};
  ASSERT_STATUS_OK(pci_device_discovery::DetectGpuPciPaths(temp_dir_, detected));

  ASSERT_EQ(detected.size(), 1u);
  EXPECT_EQ(detected.front().pci_bus_id, bus_id);
}

TEST_F(PciDeviceDiscoveryTest, DetectsNvidia3DController) {
  // Class code 0x030200: base class 0x03 (display), sub-class 0x02 (3D controller), which is
  // common for NVIDIA datacenter GPUs like A100/H100.
  const char* const bus_id = "0000:65:00.0";
  CreateFakePciDevice(temp_dir_ / bus_id, "0x030200", "0x10de", "0x20b5");

  std::vector<pci_device_discovery::GpuPciPathInfo> detected{};
  ASSERT_STATUS_OK(pci_device_discovery::DetectGpuPciPaths(temp_dir_, detected));

  ASSERT_EQ(detected.size(), 1u);
  EXPECT_EQ(detected.front().pci_bus_id, bus_id);
}

TEST_F(PciDeviceDiscoveryTest, FiltersOutNonGpuDevices) {
  // Two devices that must be ignored: a network controller (class 0x020000) and a SATA
  // controller (class 0x010600).
  CreateFakePciDevice(temp_dir_ / "0000:02:00.0", "0x020000", "0x8086", "0x1533");
  CreateFakePciDevice(temp_dir_ / "0000:00:1f.2", "0x010600", "0x8086", "0xa102");

  // One device that must be reported: a VGA controller (class 0x030000).
  const char* const gpu_bus_id = "0000:01:00.0";
  CreateFakePciDevice(temp_dir_ / gpu_bus_id, "0x030000", "0x10de", "0x2204");

  std::vector<pci_device_discovery::GpuPciPathInfo> detected{};
  ASSERT_STATUS_OK(pci_device_discovery::DetectGpuPciPaths(temp_dir_, detected));

  ASSERT_EQ(detected.size(), 1u);
  EXPECT_EQ(detected.front().pci_bus_id, gpu_bus_id);
}

TEST_F(PciDeviceDiscoveryTest, ReturnsEmptyForNonexistentPath) {
  // A missing devices directory is reported as success with no GPUs, not as an error.
  const fs::path missing_dir = temp_dir_ / "nonexistent";

  std::vector<pci_device_discovery::GpuPciPathInfo> detected{};
  ASSERT_STATUS_OK(pci_device_discovery::DetectGpuPciPaths(missing_dir, detected));

  EXPECT_EQ(detected.size(), 0u);
}

TEST_F(PciDeviceDiscoveryTest, ReturnsEmptyForEmptyDirectory) {
  // The fixture directory exists but contains no device entries.
  std::vector<pci_device_discovery::GpuPciPathInfo> detected{};
  ASSERT_STATUS_OK(pci_device_discovery::DetectGpuPciPaths(temp_dir_, detected));

  EXPECT_EQ(detected.size(), 0u);
}

TEST_F(PciDeviceDiscoveryTest, DetectsMultipleGpus) {
  // Two NVIDIA GPUs
  CreateFakePciDevice(temp_dir_ / "0000:01:00.0", "0x030200", "0x10de", "0x20b5");
  CreateFakePciDevice(temp_dir_ / "0000:41:00.0", "0x030200", "0x10de", "0x20b5");

  std::vector<pci_device_discovery::GpuPciPathInfo> gpu_paths;
  ASSERT_STATUS_OK(pci_device_discovery::DetectGpuPciPaths(temp_dir_, gpu_paths));
  ASSERT_EQ(gpu_paths.size(), 2u);

  // Checking the count alone would also pass if one device were reported twice. Directory
  // iteration order is unspecified, so look for each bus id without assuming a position.
  bool found_first = false;
  bool found_second = false;
  for (const auto& gpu_path : gpu_paths) {
    found_first = found_first || gpu_path.pci_bus_id == "0000:01:00.0";
    found_second = found_second || gpu_path.pci_bus_id == "0000:41:00.0";
  }
  EXPECT_TRUE(found_first);
  EXPECT_TRUE(found_second);
}

TEST_F(PciDeviceDiscoveryTest, SkipsDevicesWithMissingClassFile) {
  // Build a device entry by hand that has a vendor file but, deliberately, no class file.
  const fs::path incomplete_device = temp_dir_ / "0000:03:00.0";
  fs::create_directories(incomplete_device);
  {
    std::ofstream vendor_file(incomplete_device / "vendor");
    vendor_file << "0x10de";
  }

  std::vector<pci_device_discovery::GpuPciPathInfo> detected{};
  ASSERT_STATUS_OK(pci_device_discovery::DetectGpuPciPaths(temp_dir_, detected));

  // The entry cannot be classified, so it is not reported and the scan still succeeds.
  EXPECT_EQ(detected.size(), 0u);
}

TEST_F(PciDeviceDiscoveryTest, GetGpuDeviceFromPciReadsVendorAndDevice) {
  // Create a fake NVIDIA GPU PCI device
  CreateFakePciDevice(temp_dir_ / "0000:65:00.0", "0x030200", "0x10de", "0x20b5");

  pci_device_discovery::GpuPciPathInfo path_info;
  path_info.path = temp_dir_ / "0000:65:00.0";
  path_info.pci_bus_id = "0000:65:00.0";

  OrtHardwareDevice gpu_device{};
  ASSERT_STATUS_OK(pci_device_discovery::GetGpuDeviceFromPci(path_info, gpu_device));

  EXPECT_EQ(gpu_device.type, OrtHardwareDeviceType_GPU);
  EXPECT_EQ(gpu_device.vendor_id, 0x10deu);
  EXPECT_EQ(gpu_device.device_id, 0x20b5u);

  const auto& entries = gpu_device.metadata.Entries();

  // The presence checks are fatal assertions: `at()` on a missing key would throw instead of
  // producing a readable test failure.
  ASSERT_NE(entries.find("pci_bus_id"), entries.end());
  EXPECT_EQ(entries.at("pci_bus_id"), "0000:65:00.0");

  ASSERT_NE(entries.find("Discrete"), entries.end());
  EXPECT_EQ(entries.at("Discrete"), "1");
}

TEST_F(PciDeviceDiscoveryTest, GetGpuDeviceFromPciNonNvidiaVendor) {
  // Fake AMD GPU (vendor 0x1002) exposed as a VGA controller.
  const fs::path amd_device_dir = temp_dir_ / "0000:03:00.0";
  CreateFakePciDevice(amd_device_dir, "0x030000", "0x1002", "0x731f");

  pci_device_discovery::GpuPciPathInfo amd_path_info;
  amd_path_info.path = amd_device_dir;
  amd_path_info.pci_bus_id = "0000:03:00.0";

  OrtHardwareDevice amd_gpu{};
  ASSERT_STATUS_OK(pci_device_discovery::GetGpuDeviceFromPci(amd_path_info, amd_gpu));

  EXPECT_EQ(amd_gpu.type, OrtHardwareDeviceType_GPU);
  EXPECT_EQ(amd_gpu.vendor_id, 0x1002u);
  EXPECT_EQ(amd_gpu.device_id, 0x731fu);

  // Non-NVIDIA vendor should not have the Discrete metadata entry
  const auto& metadata_entries = amd_gpu.metadata.Entries();
  EXPECT_EQ(metadata_entries.find("Discrete"), metadata_entries.end());
}

} // namespace onnxruntime::test
Loading