Skip to content

Commit

Permalink
Improve Google Batch support for GPUs (#5406)
Browse files Browse the repository at this point in the history
Signed-off-by: Siddhartha Bagaria <[email protected]>
Signed-off-by: Siddhartha Bagaria <[email protected]>
Signed-off-by: Paolo Di Tommaso <[email protected]>
Co-authored-by: Sid Bagaria <[email protected]>
Co-authored-by: Ben Sherman <[email protected]>
Co-authored-by: Paolo Di Tommaso <[email protected]>
  • Loading branch information
4 people authored Feb 13, 2025
1 parent d615faf commit 420fb17
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,12 @@ class GoogleBatchMachineTypeSelector {

private static final List<String> DEFAULT_FAMILIES = ['n1-*', 'n2-*', 'n2d-*', 'c2-*', 'c2d-*', 'm1-*', 'm2-*', 'm3-*', 'e2-*']

/*
* Accelerator optimized families. See: https://cloud.google.com/compute/docs/accelerator-optimized-machines
* LAST UPDATE 2024-10-16
*
* Each entry is a family pattern that installGpuDrivers() checks against the machine
* type name via matchType() when the machine type does not report any GPU.
*/
private static final List<String> ACCELERATOR_OPTIMIZED_FAMILIES = ['a2-*', 'a3-*', 'g2-*']

@Immutable
static class MachineType {
String type
Expand All @@ -86,6 +92,7 @@ class GoogleBatchMachineTypeSelector {
float onDemandPrice
int cpusPerVm
int memPerVm
int gpusPerVm
PriceModel priceModel
}

Expand All @@ -97,7 +104,7 @@ class GoogleBatchMachineTypeSelector {
if (families.size() == 1) {
final familyOrType = families.get(0)
if (familyOrType.contains("custom-"))
return new MachineType(type: familyOrType, family: 'custom', cpusPerVm: cpus, memPerVm: memoryMB, location: region, priceModel: spot ? PriceModel.spot : PriceModel.standard)
return new MachineType(type: familyOrType, family: 'custom', cpusPerVm: cpus, memPerVm: memoryMB, gpusPerVm: 0, location: region, priceModel: spot ? PriceModel.spot : PriceModel.standard)

final machineType = getAvailableMachineTypes(region, spot).find { it.type == familyOrType }
if( machineType )
Expand Down Expand Up @@ -156,6 +163,7 @@ class GoogleBatchMachineTypeSelector {
onDemandPrice: it.onDemandPrice as float,
cpusPerVm: it.cpusPerVm as int,
memPerVm: it.memPerVm as int,
gpusPerVm: it.gpusPerVm as int,
location: region,
priceModel: priceModel
)
Expand Down Expand Up @@ -222,7 +230,38 @@ class GoogleBatchMachineTypeSelector {
return findFirstValidSize(requested, [4,8])
}

// other special families the user must provide a valid size
if( machineType.family == "a2" ) {
if ( machineType.type == 'a2-highgpu-1g' )
return findFirstValidSize(requested, [1, 2, 4, 8])
if ( machineType.type == 'a2-highgpu-2g' )
return findFirstValidSize(requested, [2, 4, 8])
if ( machineType.type == 'a2-highgpu-4g' )
return findFirstValidSize(requested, [4, 8])
if ( machineType.type == 'a2-highgpu-8g' || machineType.type == 'a2-megagpu-16g' )
return findFirstValidSize(requested, [8])
}

if( machineType.family == "g2" ) {
if( machineType.type == 'g2-standard-4' || machineType.type == 'g2-standard-8' ||
machineType.type == 'g2-standard-12' || machineType.type == 'g2-standard-16' ||
machineType.type == 'g2-standard-32' )
return findFirstValidSize(requested, [1])
if( machineType.type == 'g2-standard-24' )
return findFirstValidSize(requested, [2])
if( machineType.type == 'g2-standard-48' )
return findFirstValidSize(requested, [4])
if( machineType.type == 'g2-standard-96' )
return findFirstValidSize(requested, [8])
}

// These families have a local SSD already attached, and it is not configurable.
if( ((machineType.family == "c3" || machineType.family == "c3d") && machineType.type.endsWith("-lssd")) ||
machineType.family == "a3" ||
machineType.type.startsWith("a2-ultragpu-") )
return new MemoryUnit( 0 )

// For other special families, the user must provide a valid size. If a family does not
// support local disks, then Google Batch shall return an appropriate error.
return requested
}

Expand All @@ -249,4 +288,19 @@ class GoogleBatchMachineTypeSelector {
return new MemoryUnit( numberOfDisks * 375L * (1<<30) )
}

/**
 * Tell whether the GPU drivers have to be installed on the given machine type.
 *
 * The drivers are required when the machine type reports at least one GPU, or when
 * its type name matches one of the known accelerator optimized families.
 *
 * @param machineType Machine type
 * @return {@code true} if GPU drivers should be installed, {@code false} otherwise.
 */
protected boolean installGpuDrivers(MachineType machineType) {
    final reportsGpus = machineType.gpusPerVm > 0
    // The Cloud Info service does not currently return gpusPerVm values (or the user
    // could have disabled use of the service), so the type is also checked against
    // a known set of families.
    return reportsGpus || ACCELERATOR_OPTIMIZED_FAMILIES.any { family -> matchType(family, machineType.type) }
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -233,19 +233,8 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask {
.addAllCommands( cmd )
.addAllVolumes( launcher.getContainerMounts() )

final accel = task.config.getAccelerator()
// add nvidia specific driver paths
// see https://cloud.google.com/batch/docs/create-run-job#create-job-gpu
if( accel && accel.type.toLowerCase().startsWith('nvidia-') ) {
container
.addVolumes('/var/lib/nvidia/lib64:/usr/local/nvidia/lib64')
.addVolumes('/var/lib/nvidia/bin:/usr/local/nvidia/bin')
}

def containerOptions = task.config.getContainerOptions() ?: ''
// accelerator requires privileged option
// https://cloud.google.com/batch/docs/create-run-job#create-job-gpu
if( task.config.getAccelerator() || fusionEnabled() ) {
if( fusionEnabled() ) {
if( containerOptions ) containerOptions += ' '
containerOptions += '--privileged'
}
Expand Down Expand Up @@ -333,17 +322,6 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask {
else {
final instancePolicy = AllocationPolicy.InstancePolicy.newBuilder()

if( task.config.getAccelerator() ) {
final accelerator = AllocationPolicy.Accelerator.newBuilder()
.setCount( task.config.getAccelerator().getRequest() )

if( task.config.getAccelerator().getType() )
accelerator.setType( task.config.getAccelerator().getType() )

instancePolicy.addAccelerators(accelerator)
instancePolicyOrTemplate.setInstallGpuDrivers(true)
}

if( executor.config.getBootDiskImage() )
instancePolicy.setBootDisk( AllocationPolicy.Disk.newBuilder().setImage( executor.config.getBootDiskImage() ) )

Expand All @@ -356,16 +334,34 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask {

if( machineType ) {
instancePolicy.setMachineType(machineType.type)
instancePolicyOrTemplate.setInstallGpuDrivers(
GoogleBatchMachineTypeSelector.INSTANCE.installGpuDrivers(machineType)
)
machineInfo = new CloudMachineInfo(
type: machineType.type,
zone: machineType.location,
priceModel: machineType.priceModel
)
}

if( task.config.getAccelerator() ) {
final accelerator = AllocationPolicy.Accelerator.newBuilder()
.setCount( task.config.getAccelerator().getRequest() )

if( task.config.getAccelerator().getType() )
accelerator.setType( task.config.getAccelerator().getType() )

instancePolicy.addAccelerators(accelerator)
instancePolicyOrTemplate.setInstallGpuDrivers(true)
}

// When using local SSD not all the disk sizes are valid and depends on the machine type
if( disk?.type == 'local-ssd' && machineType ) {
final validSize = GoogleBatchMachineTypeSelector.INSTANCE.findValidLocalSSDSize(disk.request, machineType)
if( validSize.toBytes() == 0 ) {
disk = new DiskResource(request: 0)
log.debug "[GOOGLE BATCH] Process `${task.lazyName()}` - ${machineType.type} does not allow configuring local disks"
}
if( validSize != disk.request ) {
disk = new DiskResource(request: validSize, type: 'local-ssd')
log.debug "[GOOGLE BATCH] Process `${task.lazyName()}` - adjusting local disk size to: $validSize"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,4 +98,18 @@ class GoogleBatchMachineTypeSelectorTest extends Specification {
'50 GB' | 'c2d-highmem-56' | 'c2d' | 56 | '1500 GB'
'750 GB' | 'm3-megamem-64' | 'm3' | 64 | '1500 GB'
}

// Data-driven check of installGpuDrivers(): each row of the table below builds a
// MachineType from TYPE and GPUS and compares the result with EXPECTED.
def 'should know when to install GPU drivers'() {
expect:
final machineType = new MachineType(type: TYPE, gpusPerVm: GPUS)
GoogleBatchMachineTypeSelector.INSTANCE.installGpuDrivers(machineType) == EXPECTED

where:
// The a2/a3/g2 rows declare zero GPUs and still expect the drivers to be installed;
// the n2 rows expect the result to follow the declared GPU count.
TYPE | GPUS | EXPECTED
'n2-standard-4' | 0 | false
'n2-standard-4' | 1 | true
'a2-highgpu-1g' | 0 | true
'a3-highgpu-1g' | 0 | true
'g2-standard-4' | 0 | true
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ class GoogleBatchTaskHandlerTest extends Specification {
and:
!instancePolicyOrTemplate.getInstanceTemplate()
and:
!instancePolicyOrTemplate.getInstallGpuDrivers()
instancePolicy.getAcceleratorsCount() == 0
instancePolicy.getDisksCount() == 0
!instancePolicy.getMachineType()
Expand Down Expand Up @@ -202,11 +203,9 @@ class GoogleBatchTaskHandlerTest extends Specification {
and:
runnable.getContainer().getCommandsList().join(' ') == '/bin/bash -o pipefail -c bash .command.run'
runnable.getContainer().getImageUri() == CONTAINER_IMAGE
runnable.getContainer().getOptions() == '--this --that --privileged'
runnable.getContainer().getOptions() == '--this --that'
runnable.getContainer().getVolumesList() == [
'/mnt/disks/foo/scratch:/mnt/disks/foo/scratch:rw',
'/var/lib/nvidia/lib64:/usr/local/nvidia/lib64',
'/var/lib/nvidia/bin:/usr/local/nvidia/bin'
]
and:
runnable.getEnvironment().getVariablesMap() == [:]
Expand Down

0 comments on commit 420fb17

Please sign in to comment.