Skip to content

Commit c4ad0a9

Browse files
Merge pull request openshift#98 from elmiko/fix-a2-quotas
OCPBUGS-45923: update a2 gpu detection logic to be dynamic
2 parents ac81d76 + 7100df0 commit c4ad0a9

File tree

3 files changed

+31
-23
lines changed

3 files changed

+31
-23
lines changed

pkg/cloud/gcp/actuators/machine/reconciler.go

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -49,13 +49,16 @@ func newReconciler(scope *machineScope) *Reconciler {
4949
}
5050

5151
var (
52+
// the keys have been sourced from https://cloud.google.com/compute/docs/gpus/
53+
// the values have been sourced from https://github.com/googleapis/google-api-go-client/blob/main/compute/v1/compute-gen.go
5254
supportedGpuTypes = map[string]string{
5355
"nvidia-tesla-k80": "NVIDIA_K80_GPUS",
5456
"nvidia-tesla-p100": "NVIDIA_P100_GPUS",
5557
"nvidia-tesla-v100": "NVIDIA_V100_GPUS",
5658
"nvidia-tesla-a100": "NVIDIA_A100_GPUS",
5759
"nvidia-tesla-p4": "NVIDIA_P4_GPUS",
5860
"nvidia-tesla-t4": "NVIDIA_T4_GPUS",
61+
"nvidia-a100-80gb": "NVIDIA_A100_80GB_GPUS",
5962
}
6063
)
6164

@@ -86,20 +89,12 @@ func restartPolicyToBool(policy machinev1.GCPRestartPolicyType, preemptible bool
8689
}
8790

8891
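// checkQuota validates the requested accelerators against the region's quotas. guestAccelerators replaces the former machineTypeAcceleratorCount argument: A2 family machines pass the GPU type and count attached to the machine type, N1 family machines pass the providerSpec GPUs.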
89-
func (r *Reconciler) checkQuota(machineTypeAcceleratorCount int64) error {
92+
func (r *Reconciler) checkQuota(guestAccelerators []machinev1.GCPGPUConfig) error {
9093
region, err := r.computeService.RegionGet(r.projectID, r.providerSpec.Region)
9194
if err != nil {
9295
return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("Failed to get region %s via compute service: %v", r.providerSpec.Region, err))
9396
}
9497
quotas := region.Quotas
95-
var guestAccelerators = []machinev1.GCPGPUConfig{}
96-
// When the machine type has associated accelerator instances (A2 machine family), accelerators will be nvidia-tesla-A100s.
97-
// Additional guest accelerators are not allowed so ignore the providerSpec GuestAccelerators.
98-
if machineTypeAcceleratorCount != 0 {
99-
guestAccelerators = append(guestAccelerators, machinev1.GCPGPUConfig{Type: "nvidia-tesla-a100", Count: int32(machineTypeAcceleratorCount)})
100-
} else {
101-
guestAccelerators = r.providerSpec.GPUs
102-
}
10398
// validate zone and then quota
10499
// guestAccelerators slice can not store more than 1 element.
105100
// More than one accelerator included in request results in error -> googleapi: Error 413: Value for field 'resource.guestAccelerators' is too large: maximum size 1 element(s); actual size 2., fieldSizeTooLarge
@@ -132,6 +127,7 @@ func (r *Reconciler) checkQuota(machineTypeAcceleratorCount int64) error {
132127
}
133128

134129
func (r *Reconciler) validateGuestAccelerators() error {
130+
// Note(elmiko): this check has a known gap: non-a2 machine types with GPUs (e.g. a3 types) and no GPUs listed in the providerSpec bypass validation here, which is acceptable for now.
135131
if len(r.providerSpec.GPUs) == 0 && !strings.HasPrefix(r.providerSpec.MachineType, "a2-") {
136132
// no accelerators to validate so return nil
137133
return nil
@@ -144,17 +140,21 @@ func (r *Reconciler) validateGuestAccelerators() error {
144140
}
145141
a2MachineFamily, n1MachineFamily := r.computeService.GPUCompatibleMachineTypesList(r.providerSpec.ProjectID, r.providerSpec.Zone, r.Context)
146142
machineType := r.providerSpec.MachineType
147-
switch {
148-
case a2MachineFamily[machineType] != 0:
143+
if gpuInfo, ok := a2MachineFamily[machineType]; ok {
149144
// a2 family machine - has fixed type and count of GPUs
150-
return r.checkQuota(a2MachineFamily[machineType])
151-
case containsString(n1MachineFamily, machineType):
152-
// n1 family machine
153-
return r.checkQuota(0)
154-
default:
155-
// any other machine type
156-
return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("MachineType %s is not available in the zone %s.", r.providerSpec.MachineType, r.providerSpec.Zone))
145+
guestAccelerators := []machinev1.GCPGPUConfig{
146+
{
147+
Type: gpuInfo.Type,
148+
Count: int32(gpuInfo.Count),
149+
},
150+
}
151+
return r.checkQuota(guestAccelerators)
152+
} else if containsString(n1MachineFamily, machineType) {
153+
return r.checkQuota(r.providerSpec.GPUs)
157154
}
155+
156+
// any other machine type
157+
return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("MachineType %s is not available in the zone %s.", r.providerSpec.MachineType, r.providerSpec.Zone))
158158
}
159159

160160
// Create creates machine if and only if machine exists, handled by cluster-api

pkg/cloud/gcp/actuators/services/compute/computeservice.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ type GCPComputeService interface {
2626
TargetPoolsRemoveInstance(project string, region string, name string, instance string) (*compute.Operation, error)
2727
MachineTypesGet(project string, machineType string, zone string) (*compute.MachineType, error)
2828
RegionGet(project string, region string) (*compute.Region, error)
29-
GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]int64, []string)
29+
GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]GpuInfo, []string)
3030
AcceleratorTypeGet(project string, zone string, acceleratorType string) (*compute.AcceleratorType, error)
3131
InstanceGroupsListInstances(project string, zone string, instanceGroup string, request *compute.InstanceGroupsListInstancesRequest) (*compute.InstanceGroupsListInstances, error)
3232
InstanceGroupsAddInstances(project string, zone string, instance string, instanceGroup string) (*compute.Operation, error)
@@ -120,17 +120,25 @@ func (c *computeService) MachineTypesGet(project string, zone string, machineTyp
120120
return c.service.MachineTypes.Get(project, zone, machineType).Do()
121121
}
122122

123+
type GpuInfo struct {
124+
Count int64
125+
Type string
126+
}
127+
123128
// GPUCompatibleMachineTypesList lists the machine types available in the zone and returns a map of A2 family machine types to their GPU type and count, plus a slice of N1 family machine type names.
124-
func (c *computeService) GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]int64, []string) {
129+
func (c *computeService) GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]GpuInfo, []string) {
125130
req := c.service.MachineTypes.List(project, zone)
126131
var (
127-
a2MachineFamily = map[string]int64{}
132+
a2MachineFamily = map[string]GpuInfo{}
128133
n1MachineFamily []string
129134
)
130135
if err := req.Pages(ctx, func(page *compute.MachineTypeList) error {
131136
for _, machineType := range page.Items {
132137
if strings.HasPrefix(machineType.Name, "a2") {
133-
a2MachineFamily[machineType.Name] = machineType.Accelerators[0].GuestAcceleratorCount
138+
a2MachineFamily[machineType.Name] = GpuInfo{
139+
Count: machineType.Accelerators[0].GuestAcceleratorCount,
140+
Type: machineType.Accelerators[0].GuestAcceleratorType,
141+
}
134142
} else if strings.HasPrefix(machineType.Name, "n1") {
135143
n1MachineFamily = append(n1MachineFamily, machineType.Name)
136144
}

pkg/cloud/gcp/actuators/services/compute/computeservice_mock.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ func (c *GCPComputeServiceMock) RegionGet(project string, region string) (*compu
149149
return &compute.Region{Quotas: nil}, nil
150150
}
151151

152-
func (c *GCPComputeServiceMock) GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]int64, []string) {
152+
func (c *GCPComputeServiceMock) GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]GpuInfo, []string) {
153153
var compatibleMachineType = []string{"n1-test-machineType"}
154154
return nil, compatibleMachineType
155155
}

0 commit comments

Comments
 (0)