From ac7412934161e8cdfaa474f59d0a17f8d3b35494 Mon Sep 17 00:00:00 2001 From: Kenny Ho Date: Mon, 9 Sep 2024 22:47:16 -0400 Subject: [PATCH] Add basic support for GPUs that can be partitioned --- cmd/k8s-device-plugin/main.go | 2 + internal/pkg/amdgpu/amdgpu.go | 72 ++++++++++++++++++- internal/pkg/amdgpu/amdgpu_test.go | 16 +++++ .../topology/nodes/2/properties | 2 +- 4 files changed, 88 insertions(+), 4 deletions(-) diff --git a/cmd/k8s-device-plugin/main.go b/cmd/k8s-device-plugin/main.go index 6d3ccc8a..37d00f03 100644 --- a/cmd/k8s-device-plugin/main.go +++ b/cmd/k8s-device-plugin/main.go @@ -128,6 +128,8 @@ func (p *Plugin) PreStartContainer(ctx context.Context, r *pluginapi.PreStartCon func (p *Plugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error { p.AMDGPUs = amdgpu.GetAMDGPUs() + glog.Infof("Found %d AMDGPUs", len(p.AMDGPUs)) + devs := make([]*pluginapi.Device, len(p.AMDGPUs)) // limit scope for hwloc diff --git a/internal/pkg/amdgpu/amdgpu.go b/internal/pkg/amdgpu/amdgpu.go index a245afac..3cfa8168 100644 --- a/internal/pkg/amdgpu/amdgpu.go +++ b/internal/pkg/amdgpu/amdgpu.go @@ -95,21 +95,54 @@ func GetAMDGPUs() map[string]map[string]int { matches, _ := filepath.Glob("/sys/module/amdgpu/drivers/pci:amdgpu/[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]:*") devices := make(map[string]map[string]int) + card, renderD := 0, 128 for _, path := range matches { glog.Info(path) devPaths, _ := filepath.Glob(path + "/drm/*") - devices[filepath.Base(path)] = make(map[string]int) for _, devPath := range devPaths { switch name := filepath.Base(devPath); { case name[0:4] == "card": - devices[filepath.Base(path)][name[0:4]], _ = strconv.Atoi(name[4:]) + card, _ = strconv.Atoi(name[4:]) case name[0:7] == "renderD": - devices[filepath.Base(path)][name[0:7]], _ = strconv.Atoi(name[7:]) + renderD, _ = strconv.Atoi(name[7:]) } } + + devices[filepath.Base(path)] = map[string]int{"card": card, "renderD": renderD} + } + + // certain products 
have additional devices (such as MI300's partitions)
+	//ex: /sys/devices/platform/amdgpu_xcp_30
+	platformMatches, _ := filepath.Glob("/sys/devices/platform/amdgpu_xcp_*")
+
+	// This is needed because some of the visible renderD are actually not valid
+	// Their validity depends on topology information from KFD
+	topoRenderNodes := renderNodeSetFromTopology()
+
+	for _, path := range platformMatches {
+		glog.Info(path)
+		devPaths, _ := filepath.Glob(path + "/drm/*")
+		card, renderD = 0, 0 // reset: never inherit the previous device's minors (0 is never in topoRenderNodes)
+		for _, devPath := range devPaths {
+			switch name := filepath.Base(devPath); {
+			case name[0:4] == "card":
+				card, _ = strconv.Atoi(name[4:])
+			case name[0:7] == "renderD":
+				renderD, _ = strconv.Atoi(name[7:])
+			}
+		}
+
+		glog.Info(renderD)
+		glog.Info(topoRenderNodes[renderD])
+		if !topoRenderNodes[renderD] {
+			continue
+		}
+
+		devices[filepath.Base(path)] = map[string]int{"card": card, "renderD": renderD}
 	}
+
 	return devices
 }
 
@@ -274,3 +307,42 @@ func parseDebugFSFirmwareInfo(path string) (map[string]uint32, map[string]uint32
 
 	return feat, fw
 }
+
+// renderNodeSetFromTopology returns the set of DRM render minors listed in the
+// KFD topology. It globs <topoRoot>/topology/nodes/*/properties, runs
+// ParseTopologyProperties on each file with the drm_render_minor pattern, and
+// marks every value greater than zero as present. Files that fail to parse and
+// values of zero or below are skipped.
+//
+// topoRootParam optionally replaces the default root /sys/class/kfd/kfd; it is
+// honoured only when exactly one value is supplied.
+func renderNodeSetFromTopology(topoRootParam ...string) map[int]bool {
+	topoRoot := "/sys/class/kfd/kfd"
+	if len(topoRootParam) == 1 {
+		topoRoot = topoRootParam[0]
+	}
+
+	renderNodes := make(map[int]bool)
+
+	nodeFiles, err := filepath.Glob(topoRoot + "/topology/nodes/*/properties")
+	if err != nil {
+		// Glob fails only on a malformed pattern (a bad topoRoot override). Log it
+		// and return the empty set; glog.Fatalf would exit the whole plugin and
+		// made the return below it unreachable.
+		glog.Errorf("glob error: %s", err)
+		return renderNodes
+	}
+
+	topoDrmRenderMinorRe := regexp.MustCompile(`drm_render_minor\s(\d+)`)
+	for _, nodeFile := range nodeFiles {
+		glog.Info("Parsing " + nodeFile)
+		v, e := ParseTopologyProperties(nodeFile, topoDrmRenderMinorRe)
+		if e != nil || v <= 0 {
+			continue
+		}
+
+		renderNodes[int(v)] = true
+	}
+
+	return renderNodes
+}
diff --git a/internal/pkg/amdgpu/amdgpu_test.go b/internal/pkg/amdgpu/amdgpu_test.go
index 1a682ae3..d41cf839 100644
--- a/internal/pkg/amdgpu/amdgpu_test.go
+++ b/internal/pkg/amdgpu/amdgpu_test.go
@@ -17,9 +17,11 @@
package amdgpu
 
 import (
+	"encoding/json"
 	"fmt"
 	"io/ioutil"
 	"path/filepath"
+	"reflect"
 	"regexp"
 	"strings"
 	"testing"
@@ -213,3 +215,21 @@ func TestParseDebugFSFirmwareInfo(t *testing.T) {
 		t.Errorf("Incorrect parsing of amdgpu firmware info from debugfs")
 	}
 }
+
+// TestRenderNodeSetFromTopology checks the render-minor set that
+// renderNodeSetFromTopology builds from the testdata/topology-parsing fixture.
+func TestRenderNodeSetFromTopology(t *testing.T) {
+	want := map[int]bool{128: true, 129: true}
+	got := renderNodeSetFromTopology("../../../testdata/topology-parsing")
+
+	if reflect.DeepEqual(got, want) {
+		return
+	}
+
+	gotJSON, _ := json.MarshalIndent(got, "", " ")
+	wantJSON, _ := json.MarshalIndent(want, "", " ")
+
+	t.Errorf("RenderNode set was incorrect")
+	t.Errorf("Got: %s", gotJSON)
+	t.Errorf("Want: %s", wantJSON)
+}
diff --git a/testdata/topology-parsing/topology/nodes/2/properties b/testdata/topology-parsing/topology/nodes/2/properties
index afabb2e5..97c46894 100644
--- a/testdata/topology-parsing/topology/nodes/2/properties
+++ b/testdata/topology-parsing/topology/nodes/2/properties
@@ -17,7 +17,7 @@ max_slots_scratch_cu 32
 vendor_id 4098
 device_id 26720
 location_id 6400
-drm_render_minor 128
+drm_render_minor 129
 max_engine_clk_fcompute 1500
 local_mem_size 17163091968
 fw_version 392