diff --git a/cluster-autoscaler/utils/gpu/gpu.go b/cluster-autoscaler/utils/gpu/gpu.go index a19efca46767..2a123c108c5b 100644 --- a/cluster-autoscaler/utils/gpu/gpu.go +++ b/cluster-autoscaler/utils/gpu/gpu.go @@ -26,6 +26,8 @@ import ( ) const ( + // ResourceAMDGPU is the name of the AMD GPU resource. + ResourceAMDGPU = "amd.com/gpu" // ResourceNvidiaGPU is the name of the Nvidia GPU resource. ResourceNvidiaGPU = "nvidia.com/gpu" // ResourceDirectX is the name of the DirectX resource on windows. @@ -35,6 +37,14 @@ const ( DefaultGPUType = "nvidia-tesla-k80" ) +// GPUVendorResourceNames centralized list of all known GPU vendor extended resource names. +// Extend this slice if new vendor resource names are added. +var GPUVendorResourceNames = []apiv1.ResourceName{ + ResourceNvidiaGPU, + ResourceAMDGPU, + ResourceDirectX, +} + const ( // MetricsGenericGPU - for when there is no information about GPU type MetricsGenericGPU = "generic" @@ -109,15 +119,41 @@ func validateGpuType(availableGPUTypes map[string]struct{}, gpu string) string { // if the drivers are installed and GPU is ready to use. func NodeHasGpu(GPULabel string, node *apiv1.Node) bool { _, hasGpuLabel := node.Labels[GPULabel] - gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[ResourceNvidiaGPU] - return hasGpuLabel || (hasGpuAllocatable && !gpuAllocatable.IsZero()) + if hasGpuLabel { + return true + } + // Check for extended resources as well + for _, gpuVendorResourceName := range GPUVendorResourceNames { + gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[gpuVendorResourceName] + if hasGpuAllocatable && !gpuAllocatable.IsZero() { + return true + } + } + return false } // PodRequestsGpu returns true if a given pod has GPU request. 
func PodRequestsGpu(pod *apiv1.Pod) bool { podRequests := podutils.PodRequests(pod) - _, gpuFound := podRequests[ResourceNvidiaGPU] - return gpuFound + for _, gpuVendorResourceName := range GPUVendorResourceNames { + if _, found := podRequests[gpuVendorResourceName]; found { + return true + } + } + return false +} + +// DetectNodeGPUResourceName inspects the node's allocatable resources and returns the first +// known GPU extended resource name that has non-zero allocatable. Falls back to Nvidia for +// backward compatibility if none are found. +func DetectNodeGPUResourceName(node *apiv1.Node) apiv1.ResourceName { + for _, rn := range GPUVendorResourceNames { + if qty, ok := node.Status.Allocatable[rn]; ok && !qty.IsZero() { + return rn + } + } + // Fallback: preserve previous behavior by defaulting to Nvidia when no known GPU allocatable is present + return ResourceNvidiaGPU } // GetNodeGPUFromCloudProvider returns the GPU the node has. Returned GPU has the GPU label of the 
"nvidia gpu", + node: &apiv1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-with-nvidia-gpu", + Labels: map[string]string{}, + }, + Status: apiv1.NodeStatus{ + Capacity: apiv1.ResourceList{ + gpu.ResourceNvidiaGPU: *resource.NewQuantity(1, resource.DecimalSI), + }, + Allocatable: apiv1.ResourceList{ + gpu.ResourceNvidiaGPU: *resource.NewQuantity(1, resource.DecimalSI), + }, + }, + }, + expectedResourceName: gpu.ResourceNvidiaGPU, + }, + { + name: "amd gpu", + node: &apiv1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-with-amd-gpu", + Labels: map[string]string{}, + }, + Status: apiv1.NodeStatus{ + Capacity: apiv1.ResourceList{ + gpu.ResourceAMDGPU: *resource.NewQuantity(8, resource.DecimalSI), + }, + Allocatable: apiv1.ResourceList{ + gpu.ResourceAMDGPU: *resource.NewQuantity(8, resource.DecimalSI), + }, + }, + }, + expectedResourceName: gpu.ResourceAMDGPU, + }, + { + name: "test default gpu resource name", + node: &apiv1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-without-gpu", + Labels: map[string]string{}, + }, + Status: apiv1.NodeStatus{ + Capacity: apiv1.ResourceList{}, + Allocatable: apiv1.ResourceList{}, + }, + }, + expectedResourceName: gpu.ResourceNvidiaGPU, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + resourceName := gpu.DetectNodeGPUResourceName(tc.node) + if resourceName != tc.expectedResourceName { + t.Errorf("expected resource name %s but got %s", tc.expectedResourceName, resourceName) + } + }) + } +}