Merge pull request openshift#98 from elmiko/fix-a2-quotas

openshift-merge-bot[bot] · web-flow · commit c4ad0a9968e9 · 2024-12-12T10:29:01.000Z
OCPBUGS-45923: update a2 gpu detection logic to be dynamic
diff --git a/pkg/cloud/gcp/actuators/machine/reconciler.go b/pkg/cloud/gcp/actuators/machine/reconciler.go
@@ -49,13 +49,16 @@ func newReconciler(scope *machineScope) *Reconciler {
 }
 
 var (
+	// the keys have been sourced from https://cloud.google.com/compute/docs/gpus/
+	// the values have been sourced from https://github.com/googleapis/google-api-go-client/blob/main/compute/v1/compute-gen.go
 	supportedGpuTypes = map[string]string{
 		"nvidia-tesla-k80":  "NVIDIA_K80_GPUS",
 		"nvidia-tesla-p100": "NVIDIA_P100_GPUS",
 		"nvidia-tesla-v100": "NVIDIA_V100_GPUS",
 		"nvidia-tesla-a100": "NVIDIA_A100_GPUS",
 		"nvidia-tesla-p4":   "NVIDIA_P4_GPUS",
 		"nvidia-tesla-t4":   "NVIDIA_T4_GPUS",
+		"nvidia-a100-80gb":  "NVIDIA_A100_80GB_GPUS",
 	}
 )
 
@@ -86,20 +89,12 @@ func restartPolicyToBool(policy machinev1.GCPRestartPolicyType, preemptible bool
 }
 
 // checkQuota checks the given guestAccelerators against the quotas of the region; for the A2 machine family the GPU type and count are fixed by the machine type
-func (r *Reconciler) checkQuota(machineTypeAcceleratorCount int64) error {
+func (r *Reconciler) checkQuota(guestAccelerators []machinev1.GCPGPUConfig) error {
 	region, err := r.computeService.RegionGet(r.projectID, r.providerSpec.Region)
 	if err != nil {
 		return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("Failed to get region %s via compute service: %v", r.providerSpec.Region, err))
 	}
 	quotas := region.Quotas
-	var guestAccelerators = []machinev1.GCPGPUConfig{}
-	// When the machine type has associated accelerator instances (A2 machine family), accelerators will be nvidia-tesla-A100s.
-	// Additional guest accelerators are not allowed so ignore the providerSpec GuestAccelerators.
-	if machineTypeAcceleratorCount != 0 {
-		guestAccelerators = append(guestAccelerators, machinev1.GCPGPUConfig{Type: "nvidia-tesla-a100", Count: int32(machineTypeAcceleratorCount)})
-	} else {
-		guestAccelerators = r.providerSpec.GPUs
-	}
 	// validate zone and then quota
 	// guestAccelerators slice can not store more than 1 element.
 	// More than one accelerator included in request results in error -> googleapi: Error 413: Value for field 'resource.guestAccelerators' is too large: maximum size 1 element(s); actual size 2., fieldSizeTooLarge
@@ -132,6 +127,7 @@ func (r *Reconciler) checkQuota(machineTypeAcceleratorCount int64) error {
 }
 
 func (r *Reconciler) validateGuestAccelerators() error {
+	// Note(elmiko): known limitation - non-a2 instances with GPUs (e.g. a3 types) will bypass this check, which is acceptable for now.
 	if len(r.providerSpec.GPUs) == 0 && !strings.HasPrefix(r.providerSpec.MachineType, "a2-") {
 		// no accelerators to validate so return nil
 		return nil
@@ -144,17 +140,21 @@ func (r *Reconciler) validateGuestAccelerators() error {
 	}
 	a2MachineFamily, n1MachineFamily := r.computeService.GPUCompatibleMachineTypesList(r.providerSpec.ProjectID, r.providerSpec.Zone, r.Context)
 	machineType := r.providerSpec.MachineType
-	switch {
-	case a2MachineFamily[machineType] != 0:
+	if gpuInfo, ok := a2MachineFamily[machineType]; ok {
 		// a2 family machine - has fixed type and count of GPUs
-		return r.checkQuota(a2MachineFamily[machineType])
-	case containsString(n1MachineFamily, machineType):
-		// n1 family machine
-		return r.checkQuota(0)
-	default:
-		// any other machine type
-		return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("MachineType %s is not available in the zone %s.", r.providerSpec.MachineType, r.providerSpec.Zone))
+		guestAccelerators := []machinev1.GCPGPUConfig{
+			{
+				Type:  gpuInfo.Type,
+				Count: int32(gpuInfo.Count),
+			},
+		}
+		return r.checkQuota(guestAccelerators)
+	} else if containsString(n1MachineFamily, machineType) {
+		return r.checkQuota(r.providerSpec.GPUs)
 	}
+
+	// any other machine type
+	return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("MachineType %s is not available in the zone %s.", r.providerSpec.MachineType, r.providerSpec.Zone))
 }
 
 // Create creates machine if and only if machine exists, handled by cluster-api
diff --git a/pkg/cloud/gcp/actuators/services/compute/computeservice.go b/pkg/cloud/gcp/actuators/services/compute/computeservice.go
@@ -26,7 +26,7 @@ type GCPComputeService interface {
 	TargetPoolsRemoveInstance(project string, region string, name string, instance string) (*compute.Operation, error)
 	MachineTypesGet(project string, machineType string, zone string) (*compute.MachineType, error)
 	RegionGet(project string, region string) (*compute.Region, error)
-	GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]int64, []string)
+	GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]GpuInfo, []string)
 	AcceleratorTypeGet(project string, zone string, acceleratorType string) (*compute.AcceleratorType, error)
 	InstanceGroupsListInstances(project string, zone string, instanceGroup string, request *compute.InstanceGroupsListInstancesRequest) (*compute.InstanceGroupsListInstances, error)
 	InstanceGroupsAddInstances(project string, zone string, instance string, instanceGroup string) (*compute.Operation, error)
@@ -120,17 +120,25 @@ func (c *computeService) MachineTypesGet(project string, zone string, machineTyp
 	return c.service.MachineTypes.Get(project, zone, machineType).Do()
 }
 
+type GpuInfo struct {
+	Count int64
+	Type  string
+}
+
 // GPUCompatibleMachineTypesList lists the machineTypes available in the zone and returns a map of A2 family machineTypes to their GPU type and count, plus a slice of N1 family machineTypes
-func (c *computeService) GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]int64, []string) {
+func (c *computeService) GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]GpuInfo, []string) {
 	req := c.service.MachineTypes.List(project, zone)
 	var (
-		a2MachineFamily = map[string]int64{}
+		a2MachineFamily = map[string]GpuInfo{}
 		n1MachineFamily []string
 	)
 	if err := req.Pages(ctx, func(page *compute.MachineTypeList) error {
 		for _, machineType := range page.Items {
 			if strings.HasPrefix(machineType.Name, "a2") {
-				a2MachineFamily[machineType.Name] = machineType.Accelerators[0].GuestAcceleratorCount
+				a2MachineFamily[machineType.Name] = GpuInfo{
+					Count: machineType.Accelerators[0].GuestAcceleratorCount,
+					Type:  machineType.Accelerators[0].GuestAcceleratorType,
+				}
 			} else if strings.HasPrefix(machineType.Name, "n1") {
 				n1MachineFamily = append(n1MachineFamily, machineType.Name)
 			}
diff --git a/pkg/cloud/gcp/actuators/services/compute/computeservice_mock.go b/pkg/cloud/gcp/actuators/services/compute/computeservice_mock.go
@@ -149,7 +149,7 @@ func (c *GCPComputeServiceMock) RegionGet(project string, region string) (*compu
 	return &compute.Region{Quotas: nil}, nil
 }
 
-func (c *GCPComputeServiceMock) GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]int64, []string) {
+func (c *GCPComputeServiceMock) GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]GpuInfo, []string) {
 	var compatibleMachineType = []string{"n1-test-machineType"}
 	return nil, compatibleMachineType
 }

Original file line number	Diff line number	Diff line change
`@@ -149,7 +149,7 @@ func (c *GCPComputeServiceMock) RegionGet(project string, region string) (*compu`
`149`	`149`	`return &compute.Region{Quotas: nil}, nil`
`150`	`150`	`}`
`151`	`151`
`152`		`-func (c *GCPComputeServiceMock) GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]int64, []string) {`
	`152`	`+func (c *GCPComputeServiceMock) GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]GpuInfo, []string) {`
`153`	`153`	`var compatibleMachineType = []string{"n1-test-machineType"}`
`154`	`154`	`return nil, compatibleMachineType`
`155`	`155`	`}`