-
Notifications
You must be signed in to change notification settings - Fork 4k
Add PCI bus fallback for Linux GPU device discovery in containerized environments #27591
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 6 commits
Commits
Show all changes
12 commits
Select commit
Hold shift + click to select a range
ca28284
Initial plan
Copilot 296e12c
Add PCI bus fallback for GPU device discovery on Linux containers
Copilot 785535e
Address review feedback: use VERBOSE log level and add clarifying com…
Copilot a08594b
Remove device_id assertion from GPU test to fix Apple Silicon compati…
Copilot ca79b33
Refactor PCI detection into testable namespace with injectable sysfs …
Copilot 09ea516
Extract hardcoded sysfs PCI path to a named constant
Copilot 1dacf9f
Address review feedback: add PCI spec references, remove card_idx fro…
Copilot eff6b81
Rename device_discovery_linux.h to pci_device_discovery.h
Copilot 6ddce68
Fix reference link.
edgchen1 437da85
Restore CreateFakePciDevice calls in GetGpuDeviceFromPci tests
Copilot fb5b0a0
Apply suggestion from @edgchen1
edgchen1 5d7db8b
Merge branch 'main' of https://github.com/microsoft/onnxruntime into …
baijumeswani File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,33 @@ | ||
| // Copyright (c) Microsoft Corporation. All rights reserved. | ||
| // Licensed under the MIT License. | ||
|
|
||
| // This header exposes Linux PCI device discovery internals for testing. | ||
|
|
||
| #pragma once | ||
|
|
||
| #include <filesystem> | ||
|
Check warning on line 8 in onnxruntime/core/platform/linux/device_discovery_linux.h
|
||
| #include <string> | ||
| #include <vector> | ||
|
|
||
| #include "core/common/status.h" | ||
| #include "core/session/abi_devices.h" | ||
|
|
||
| namespace onnxruntime { | ||
| namespace pci_device_discovery { | ||
|
|
||
| struct GpuPciPathInfo { | ||
| std::filesystem::path path; | ||
| std::string pci_bus_id; | ||
| }; | ||
|
|
||
| // Scans the given sysfs PCI devices directory for GPU devices and reports each one found as a GpuPciPathInfo in gpu_pci_paths_out. | ||
| // Filters by PCI class codes: 0x0300 (VGA) and 0x0302 (3D controller); pci_device_discovery_test.cc feeds class files such as 0x030000 and 0x030200, and expects a missing directory or a device without a class file to produce an OK status with nothing reported for it. | ||
| Status DetectGpuPciPaths(const std::filesystem::path& sysfs_pci_devices_path, | ||
| std::vector<GpuPciPathInfo>& gpu_pci_paths_out); | ||
|
|
||
| // Reads vendor/device IDs from the PCI device sysfs directory path_info.path and populates gpu_device_out as a GPU device; pci_device_discovery_test.cc expects device_idx and path_info.pci_bus_id to show up as the card_idx and pci_bus_id metadata entries. | ||
| Status GetGpuDeviceFromPci(const GpuPciPathInfo& path_info, size_t device_idx, | ||
| OrtHardwareDevice& gpu_device_out); | ||
|
|
||
| } // namespace pci_device_discovery | ||
| } // namespace onnxruntime | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
176 changes: 176 additions & 0 deletions
176
onnxruntime/test/platform/linux/pci_device_discovery_test.cc
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,176 @@ | ||
| // Copyright (c) Microsoft Corporation. All rights reserved. | ||
| // Licensed under the MIT License. | ||
|
|
||
| #include "core/platform/linux/device_discovery_linux.h" | ||
|
|
||
| #include <filesystem> | ||
| #include <fstream> | ||
|
|
||
| #include "gtest/gtest.h" | ||
|
|
||
| namespace fs = std::filesystem; | ||
|
|
||
| namespace onnxruntime::test { | ||
|
|
||
| namespace { | ||
|
|
||
| // Test helper: creates device_dir (including missing parent directories) and writes the given strings as-is, without a trailing newline, into files named class, vendor, and device inside it. Stream errors are not checked. | ||
| void CreateFakePciDevice(const fs::path& device_dir, const std::string& pci_class, | ||
| const std::string& vendor, const std::string& device) { | ||
| fs::create_directories(device_dir); | ||
| { | ||
| std::ofstream f(device_dir / "class"); | ||
| f << pci_class; | ||
| } | ||
| { | ||
| std::ofstream f(device_dir / "vendor"); | ||
| f << vendor; | ||
| } | ||
| { | ||
| std::ofstream f(device_dir / "device"); | ||
| f << device; | ||
| } | ||
| } | ||
|
|
||
| class PciDeviceDiscoveryTest : public ::testing::Test { | ||
| protected: | ||
| void SetUp() override { | ||
| temp_dir_ = fs::temp_directory_path() / "ort_pci_discovery_test"; | ||
| fs::remove_all(temp_dir_); | ||
| fs::create_directories(temp_dir_); | ||
| } | ||
|
|
||
| void TearDown() override { | ||
| fs::remove_all(temp_dir_); | ||
| } | ||
|
|
||
| fs::path temp_dir_; | ||
| }; | ||
|
|
||
| } // namespace | ||
|
|
||
| TEST_F(PciDeviceDiscoveryTest, DetectsNvidiaVgaController) { | ||
| // PCI class 0x030000 = VGA compatible controller | ||
| CreateFakePciDevice(temp_dir_ / "0000:01:00.0", "0x030000", "0x10de", "0x2204"); | ||
|
|
||
| std::vector<pci_device_discovery::GpuPciPathInfo> gpu_paths; | ||
| auto status = pci_device_discovery::DetectGpuPciPaths(temp_dir_, gpu_paths); | ||
| ASSERT_TRUE(status.IsOK()) << status.ErrorMessage(); | ||
|
edgchen1 marked this conversation as resolved.
Outdated
|
||
| ASSERT_EQ(gpu_paths.size(), 1u); | ||
| EXPECT_EQ(gpu_paths[0].pci_bus_id, "0000:01:00.0"); | ||
| } | ||
|
|
||
| TEST_F(PciDeviceDiscoveryTest, DetectsNvidia3DController) { | ||
| // PCI class 0x030200 = 3D controller (common for NVIDIA datacenter GPUs like A100/H100) | ||
| CreateFakePciDevice(temp_dir_ / "0000:65:00.0", "0x030200", "0x10de", "0x20b5"); | ||
|
|
||
| std::vector<pci_device_discovery::GpuPciPathInfo> gpu_paths; | ||
| auto status = pci_device_discovery::DetectGpuPciPaths(temp_dir_, gpu_paths); | ||
| ASSERT_TRUE(status.IsOK()) << status.ErrorMessage(); | ||
| ASSERT_EQ(gpu_paths.size(), 1u); | ||
| EXPECT_EQ(gpu_paths[0].pci_bus_id, "0000:65:00.0"); | ||
| } | ||
|
|
||
| TEST_F(PciDeviceDiscoveryTest, FiltersOutNonGpuDevices) { | ||
| // PCI class 0x020000 = Ethernet controller, a network-class device (should be skipped) | ||
| CreateFakePciDevice(temp_dir_ / "0000:02:00.0", "0x020000", "0x8086", "0x1533"); | ||
| // PCI class 0x010600 = SATA controller (should be skipped) | ||
| CreateFakePciDevice(temp_dir_ / "0000:00:1f.2", "0x010600", "0x8086", "0xa102"); | ||
| // PCI class 0x030000 = VGA controller (should be detected) | ||
| CreateFakePciDevice(temp_dir_ / "0000:01:00.0", "0x030000", "0x10de", "0x2204"); | ||
|
|
||
| std::vector<pci_device_discovery::GpuPciPathInfo> gpu_paths; | ||
| auto status = pci_device_discovery::DetectGpuPciPaths(temp_dir_, gpu_paths); | ||
| ASSERT_TRUE(status.IsOK()) << status.ErrorMessage(); | ||
| ASSERT_EQ(gpu_paths.size(), 1u); | ||
| EXPECT_EQ(gpu_paths[0].pci_bus_id, "0000:01:00.0"); | ||
| } | ||
|
|
||
| TEST_F(PciDeviceDiscoveryTest, ReturnsEmptyForNonexistentPath) { | ||
| std::vector<pci_device_discovery::GpuPciPathInfo> gpu_paths; | ||
| auto status = pci_device_discovery::DetectGpuPciPaths(temp_dir_ / "nonexistent", gpu_paths); | ||
| ASSERT_TRUE(status.IsOK()) << status.ErrorMessage(); | ||
| EXPECT_TRUE(gpu_paths.empty()); | ||
| } | ||
|
|
||
| TEST_F(PciDeviceDiscoveryTest, ReturnsEmptyForEmptyDirectory) { | ||
| std::vector<pci_device_discovery::GpuPciPathInfo> gpu_paths; | ||
| auto status = pci_device_discovery::DetectGpuPciPaths(temp_dir_, gpu_paths); | ||
| ASSERT_TRUE(status.IsOK()) << status.ErrorMessage(); | ||
| EXPECT_TRUE(gpu_paths.empty()); | ||
| } | ||
|
|
||
| TEST_F(PciDeviceDiscoveryTest, DetectsMultipleGpus) { | ||
| // Two NVIDIA GPUs | ||
| CreateFakePciDevice(temp_dir_ / "0000:01:00.0", "0x030200", "0x10de", "0x20b5"); | ||
| CreateFakePciDevice(temp_dir_ / "0000:41:00.0", "0x030200", "0x10de", "0x20b5"); | ||
|
|
||
| std::vector<pci_device_discovery::GpuPciPathInfo> gpu_paths; | ||
| auto status = pci_device_discovery::DetectGpuPciPaths(temp_dir_, gpu_paths); | ||
| ASSERT_TRUE(status.IsOK()) << status.ErrorMessage(); | ||
| EXPECT_EQ(gpu_paths.size(), 2u); | ||
| } | ||
|
|
||
| TEST_F(PciDeviceDiscoveryTest, SkipsDevicesWithMissingClassFile) { | ||
| // Device directory without a class file | ||
| auto device_dir = temp_dir_ / "0000:03:00.0"; | ||
| fs::create_directories(device_dir); | ||
| { | ||
| std::ofstream f(device_dir / "vendor"); | ||
| f << "0x10de"; | ||
| } | ||
|
|
||
| std::vector<pci_device_discovery::GpuPciPathInfo> gpu_paths; | ||
| auto status = pci_device_discovery::DetectGpuPciPaths(temp_dir_, gpu_paths); | ||
| ASSERT_TRUE(status.IsOK()) << status.ErrorMessage(); | ||
| EXPECT_TRUE(gpu_paths.empty()); | ||
| } | ||
|
|
||
| TEST_F(PciDeviceDiscoveryTest, GetGpuDeviceFromPciReadsVendorAndDevice) { | ||
| // Create a fake NVIDIA GPU PCI device | ||
| CreateFakePciDevice(temp_dir_ / "0000:65:00.0", "0x030200", "0x10de", "0x20b5"); | ||
|
edgchen1 marked this conversation as resolved.
|
||
|
|
||
| pci_device_discovery::GpuPciPathInfo path_info; | ||
| path_info.path = temp_dir_ / "0000:65:00.0"; | ||
| path_info.pci_bus_id = "0000:65:00.0"; | ||
|
|
||
| OrtHardwareDevice gpu_device{}; | ||
| auto status = pci_device_discovery::GetGpuDeviceFromPci(path_info, 0, gpu_device); | ||
| ASSERT_TRUE(status.IsOK()) << status.ErrorMessage(); | ||
|
|
||
| EXPECT_EQ(gpu_device.type, OrtHardwareDeviceType_GPU); | ||
| EXPECT_EQ(gpu_device.vendor_id, 0x10deu); | ||
| EXPECT_EQ(gpu_device.device_id, 0x20b5u); | ||
|
|
||
| const auto& entries = gpu_device.metadata.Entries(); | ||
| EXPECT_NE(entries.find("card_idx"), entries.end()); | ||
| EXPECT_EQ(entries.at("card_idx"), "0"); | ||
| EXPECT_NE(entries.find("pci_bus_id"), entries.end()); | ||
| EXPECT_EQ(entries.at("pci_bus_id"), "0000:65:00.0"); | ||
| EXPECT_NE(entries.find("Discrete"), entries.end()); | ||
| EXPECT_EQ(entries.at("Discrete"), "1"); | ||
| } | ||
|
|
||
| TEST_F(PciDeviceDiscoveryTest, GetGpuDeviceFromPciNonNvidiaVendor) { | ||
| // Create a fake AMD GPU PCI device | ||
| CreateFakePciDevice(temp_dir_ / "0000:03:00.0", "0x030000", "0x1002", "0x731f"); | ||
|
|
||
| pci_device_discovery::GpuPciPathInfo path_info; | ||
| path_info.path = temp_dir_ / "0000:03:00.0"; | ||
| path_info.pci_bus_id = "0000:03:00.0"; | ||
|
|
||
| OrtHardwareDevice gpu_device{}; | ||
| auto status = pci_device_discovery::GetGpuDeviceFromPci(path_info, 2, gpu_device); | ||
| ASSERT_TRUE(status.IsOK()) << status.ErrorMessage(); | ||
|
|
||
| EXPECT_EQ(gpu_device.type, OrtHardwareDeviceType_GPU); | ||
| EXPECT_EQ(gpu_device.vendor_id, 0x1002u); | ||
| EXPECT_EQ(gpu_device.device_id, 0x731fu); | ||
|
|
||
| const auto& entries = gpu_device.metadata.Entries(); | ||
| EXPECT_EQ(entries.at("card_idx"), "2"); | ||
| // Non-NVIDIA vendor should not have the Discrete metadata entry | ||
| EXPECT_EQ(entries.find("Discrete"), entries.end()); | ||
| } | ||
|
|
||
| } // namespace onnxruntime::test | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.