Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 45 additions & 5 deletions cluster-autoscaler/utils/gpu/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ import (
)

const (
// ResourceAMDGPU is the name of the AMD GPU resource.
ResourceAMDGPU = "amd.com/gpu"
// ResourceNvidiaGPU is the name of the Nvidia GPU resource.
ResourceNvidiaGPU = "nvidia.com/gpu"
// ResourceDirectX is the name of the DirectX resource on windows.
Expand All @@ -35,6 +37,14 @@ const (
DefaultGPUType = "nvidia-tesla-k80"
)

// GPUVendorResourceNames is the centralized list of all known GPU vendor extended resource names.
// Extend this slice if new vendor resource names are added.
// Order is significant: DetectNodeGPUResourceName walks this slice front to back and
// returns the first entry with non-zero allocatable, so earlier entries take precedence.
var GPUVendorResourceNames = []apiv1.ResourceName{
ResourceNvidiaGPU,
ResourceAMDGPU,
ResourceDirectX,
}

const (
// MetricsGenericGPU - for when there is no information about GPU type
MetricsGenericGPU = "generic"
Expand Down Expand Up @@ -109,23 +119,53 @@ func validateGpuType(availableGPUTypes map[string]struct{}, gpu string) string {
// if the drivers are installed and GPU is ready to use.
func NodeHasGpu(GPULabel string, node *apiv1.Node) bool {
	// The presence of the GPU label is sufficient on its own; its value is not inspected.
	if _, hasGpuLabel := node.Labels[GPULabel]; hasGpuLabel {
		return true
	}
	// Otherwise look for a non-zero allocatable quantity of any known vendor
	// extended resource, not only the Nvidia one.
	for _, gpuVendorResourceName := range GPUVendorResourceNames {
		gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[gpuVendorResourceName]
		if hasGpuAllocatable && !gpuAllocatable.IsZero() {
			return true
		}
	}
	return false
}

// PodRequestsGpu returns true if a given pod has GPU request.
// The pod counts as requesting a GPU when the requests computed by
// podutils.PodRequests contain any resource name listed in
// GPUVendorResourceNames. Only the presence of the key is checked; the
// requested quantity is not inspected.
func PodRequestsGpu(pod *apiv1.Pod) bool {
	podRequests := podutils.PodRequests(pod)
	for _, gpuVendorResourceName := range GPUVendorResourceNames {
		if _, found := podRequests[gpuVendorResourceName]; found {
			return true
		}
	}
	return false
}

// DetectNodeGPUResourceName scans GPUVendorResourceNames in order and returns the
// first resource name for which the node reports a non-zero allocatable quantity.
// When no known GPU resource has non-zero allocatable, it returns ResourceNvidiaGPU,
// which keeps the previous Nvidia default. The node's labels are not consulted here.
func DetectNodeGPUResourceName(node *apiv1.Node) apiv1.ResourceName {
	allocatable := node.Status.Allocatable
	for i := range GPUVendorResourceNames {
		candidate := GPUVendorResourceNames[i]
		quantity, present := allocatable[candidate]
		if !present || quantity.IsZero() {
			continue
		}
		return candidate
	}
	// Nothing matched: keep the historical Nvidia default.
	return ResourceNvidiaGPU
}

// GetNodeGPUFromCloudProvider returns the GPU the node has. Returned GPU has the GPU label of the
// passed in cloud provider. If the node doesn't have a GPU, returns nil.
// The returned config's Type is the node's value for that label (empty when the label
// is absent), and ExtendedResourceName is chosen by DetectNodeGPUResourceName rather
// than being fixed to the Nvidia resource.
func GetNodeGPUFromCloudProvider(provider cloudprovider.CloudProvider, node *apiv1.Node) *cloudprovider.GpuConfig {
	gpuLabel := provider.GPULabel()
	if !NodeHasGpu(gpuLabel, node) {
		return nil
	}
	return &cloudprovider.GpuConfig{
		Label:                gpuLabel,
		Type:                 node.Labels[gpuLabel],
		ExtendedResourceName: DetectNodeGPUResourceName(node),
	}
}
67 changes: 67 additions & 0 deletions cluster-autoscaler/utils/gpu/gpu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -236,3 +236,70 @@ func TestGetGpuInfoForMetrics(t *testing.T) {
})
}
}

// TestDetectNodeGPUResourceName checks that DetectNodeGPUResourceName reports the
// vendor resource advertised in a node's allocatable, and that it falls back to the
// Nvidia resource name when the node advertises no GPU resource at all.
func TestDetectNodeGPUResourceName(t *testing.T) {
	// makeNode builds an unlabeled node whose capacity and allocatable both
	// advertise the given GPU counts; a nil map yields empty resource lists.
	makeNode := func(nodeName string, gpus map[apiv1.ResourceName]int64) *apiv1.Node {
		capacity := apiv1.ResourceList{}
		allocatable := apiv1.ResourceList{}
		for resourceName, count := range gpus {
			capacity[resourceName] = *resource.NewQuantity(count, resource.DecimalSI)
			allocatable[resourceName] = *resource.NewQuantity(count, resource.DecimalSI)
		}
		return &apiv1.Node{
			ObjectMeta: metav1.ObjectMeta{
				Name:   nodeName,
				Labels: map[string]string{},
			},
			Status: apiv1.NodeStatus{
				Capacity:    capacity,
				Allocatable: allocatable,
			},
		}
	}

	scenarios := []struct {
		name                 string
		node                 *apiv1.Node
		expectedResourceName apiv1.ResourceName
	}{
		{
			name:                 "nvidia gpu",
			node:                 makeNode("node-with-nvidia-gpu", map[apiv1.ResourceName]int64{gpu.ResourceNvidiaGPU: 1}),
			expectedResourceName: gpu.ResourceNvidiaGPU,
		},
		{
			name:                 "amd gpu",
			node:                 makeNode("node-with-amd-gpu", map[apiv1.ResourceName]int64{gpu.ResourceAMDGPU: 8}),
			expectedResourceName: gpu.ResourceAMDGPU,
		},
		{
			name:                 "test default gpu resource name",
			node:                 makeNode("node-without-gpu", nil),
			expectedResourceName: gpu.ResourceNvidiaGPU,
		},
	}

	for _, scenario := range scenarios {
		t.Run(scenario.name, func(t *testing.T) {
			got := gpu.DetectNodeGPUResourceName(scenario.node)
			if got == scenario.expectedResourceName {
				return
			}
			t.Errorf("expected resource name %s but got %s", scenario.expectedResourceName, got)
		})
	}
}
Loading