From 420fb17ec319c0fe10dbffa4e5762c76237a08dd Mon Sep 17 00:00:00 2001 From: Siddhartha Bagaria Date: Thu, 13 Feb 2025 09:03:14 -0800 Subject: [PATCH] Improve Google Batch support for GPUs (#5406) Signed-off-by: Siddhartha Bagaria <1929612+siddharthab@users.noreply.github.com> Signed-off-by: Siddhartha Bagaria Signed-off-by: Paolo Di Tommaso Co-authored-by: Sid Bagaria Co-authored-by: Ben Sherman Co-authored-by: Paolo Di Tommaso --- .../GoogleBatchMachineTypeSelector.groovy | 58 ++++++++++++++++++- .../batch/GoogleBatchTaskHandler.groovy | 42 ++++++-------- .../GoogleBatchMachineTypeSelectorTest.groovy | 14 +++++ .../batch/GoogleBatchTaskHandlerTest.groovy | 5 +- 4 files changed, 91 insertions(+), 28 deletions(-) diff --git a/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelector.groovy b/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelector.groovy index 61b5c4cb8a..b2f497550e 100644 --- a/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelector.groovy +++ b/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelector.groovy @@ -77,6 +77,12 @@ class GoogleBatchMachineTypeSelector { private static final List DEFAULT_FAMILIES = ['n1-*', 'n2-*', 'n2d-*', 'c2-*', 'c2d-*', 'm1-*', 'm2-*', 'm3-*', 'e2-*'] + /* + * Accelerator optimized families. 
See: https://cloud.google.com/compute/docs/accelerator-optimized-machines + * LAST UPDATE 2024-10-16 + */ + private static final List ACCELERATOR_OPTIMIZED_FAMILIES = ['a2-*', 'a3-*', 'g2-*'] + @Immutable static class MachineType { String type @@ -86,6 +92,7 @@ class GoogleBatchMachineTypeSelector { float onDemandPrice int cpusPerVm int memPerVm + int gpusPerVm PriceModel priceModel } @@ -97,7 +104,7 @@ class GoogleBatchMachineTypeSelector { if (families.size() == 1) { final familyOrType = families.get(0) if (familyOrType.contains("custom-")) - return new MachineType(type: familyOrType, family: 'custom', cpusPerVm: cpus, memPerVm: memoryMB, location: region, priceModel: spot ? PriceModel.spot : PriceModel.standard) + return new MachineType(type: familyOrType, family: 'custom', cpusPerVm: cpus, memPerVm: memoryMB, gpusPerVm: 0, location: region, priceModel: spot ? PriceModel.spot : PriceModel.standard) final machineType = getAvailableMachineTypes(region, spot).find { it.type == familyOrType } if( machineType ) @@ -156,6 +163,7 @@ class GoogleBatchMachineTypeSelector { onDemandPrice: it.onDemandPrice as float, cpusPerVm: it.cpusPerVm as int, memPerVm: it.memPerVm as int, + gpusPerVm: it.gpusPerVm as int, location: region, priceModel: priceModel ) @@ -222,7 +230,38 @@ class GoogleBatchMachineTypeSelector { return findFirstValidSize(requested, [4,8]) } - // other special families the user must provide a valid size + if( machineType.family == "a2" ) { + if ( machineType.type == 'a2-highgpu-1g' ) + return findFirstValidSize(requested, [1, 2, 4, 8]) + if ( machineType.type == 'a2-highgpu-2g' ) + return findFirstValidSize(requested, [2, 4, 8]) + if ( machineType.type == 'a2-highgpu-4g' ) + return findFirstValidSize(requested, [4, 8]) + if ( machineType.type == 'a2-highgpu-8g' || machineType.type == 'a2-megagpu-16g' ) + return findFirstValidSize(requested, [8]) + } + + if( machineType.family == "g2" ) { + if( machineType.type == 'g2-standard-4' || machineType.type == 
'g2-standard-8' || + machineType.type == 'g2-standard-12' || machineType.type == 'g2-standard-16' || + machineType.type == 'g2-standard-32' ) + return findFirstValidSize(requested, [1]) + if( machineType.type == 'g2-standard-24' ) + return findFirstValidSize(requested, [2]) + if( machineType.type == 'g2-standard-48' ) + return findFirstValidSize(requested, [4]) + if( machineType.type == 'g2-standard-96' ) + return findFirstValidSize(requested, [8]) + } + + // These families have a local SSD already attached, and it is not configurable. + if( ((machineType.family == "c3" || machineType.family == "c3d") && machineType.type.endsWith("-lssd")) || + machineType.family == "a3" || + machineType.type.startsWith("a2-ultragpu-") ) + return new MemoryUnit( 0 ) + + // For other special families, the user must provide a valid size. If a family does not + // support local disks, then Google Batch shall return an appropriate error. return requested } @@ -249,4 +288,19 @@ class GoogleBatchMachineTypeSelector { return new MemoryUnit( numberOfDisks * 375L * (1<<30) ) } + /** + * Determine whether GPU drivers should be installed. + * + * @param machineType Machine type + * @return Boolean value indicating if GPU drivers should be installed. + */ + protected boolean installGpuDrivers(MachineType machineType) { + if ( machineType.gpusPerVm > 0 ) { + return true + } + // Cloud Info service does not currently return gpusPerVm values (or the user + // could have disabled use of the service) so also check against a known set of families. 
+ return ACCELERATOR_OPTIMIZED_FAMILIES.any { matchType(it, machineType.type) } + } + } diff --git a/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy b/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy index 92d2d7e266..442fea9a6a 100644 --- a/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy +++ b/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy @@ -233,19 +233,8 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask { .addAllCommands( cmd ) .addAllVolumes( launcher.getContainerMounts() ) - final accel = task.config.getAccelerator() - // add nvidia specific driver paths - // see https://cloud.google.com/batch/docs/create-run-job#create-job-gpu - if( accel && accel.type.toLowerCase().startsWith('nvidia-') ) { - container - .addVolumes('/var/lib/nvidia/lib64:/usr/local/nvidia/lib64') - .addVolumes('/var/lib/nvidia/bin:/usr/local/nvidia/bin') - } - def containerOptions = task.config.getContainerOptions() ?: '' - // accelerator requires privileged option - // https://cloud.google.com/batch/docs/create-run-job#create-job-gpu - if( task.config.getAccelerator() || fusionEnabled() ) { + if( fusionEnabled() ) { if( containerOptions ) containerOptions += ' ' containerOptions += '--privileged' } @@ -333,17 +322,6 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask { else { final instancePolicy = AllocationPolicy.InstancePolicy.newBuilder() - if( task.config.getAccelerator() ) { - final accelerator = AllocationPolicy.Accelerator.newBuilder() - .setCount( task.config.getAccelerator().getRequest() ) - - if( task.config.getAccelerator().getType() ) - accelerator.setType( task.config.getAccelerator().getType() ) - - instancePolicy.addAccelerators(accelerator) - instancePolicyOrTemplate.setInstallGpuDrivers(true) - } - if( executor.config.getBootDiskImage() ) instancePolicy.setBootDisk( 
AllocationPolicy.Disk.newBuilder().setImage( executor.config.getBootDiskImage() ) ) @@ -356,6 +334,9 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask { if( machineType ) { instancePolicy.setMachineType(machineType.type) + instancePolicyOrTemplate.setInstallGpuDrivers( + GoogleBatchMachineTypeSelector.INSTANCE.installGpuDrivers(machineType) + ) machineInfo = new CloudMachineInfo( type: machineType.type, zone: machineType.location, @@ -363,9 +344,24 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask { ) } + if( task.config.getAccelerator() ) { + final accelerator = AllocationPolicy.Accelerator.newBuilder() + .setCount( task.config.getAccelerator().getRequest() ) + + if( task.config.getAccelerator().getType() ) + accelerator.setType( task.config.getAccelerator().getType() ) + + instancePolicy.addAccelerators(accelerator) + instancePolicyOrTemplate.setInstallGpuDrivers(true) + } + // When using local SSD not all the disk sizes are valid and depends on the machine type if( disk?.type == 'local-ssd' && machineType ) { final validSize = GoogleBatchMachineTypeSelector.INSTANCE.findValidLocalSSDSize(disk.request, machineType) + if( validSize.toBytes() == 0 ) { + disk = new DiskResource(request: 0) + log.debug "[GOOGLE BATCH] Process `${task.lazyName()}` - ${machineType.type} does not allow configuring local disks" + } if( validSize != disk.request ) { disk = new DiskResource(request: validSize, type: 'local-ssd') log.debug "[GOOGLE BATCH] Process `${task.lazyName()}` - adjusting local disk size to: $validSize" diff --git a/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelectorTest.groovy b/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelectorTest.groovy index c36e13e3b4..35a8780f7d 100644 --- a/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelectorTest.groovy +++ 
b/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelectorTest.groovy @@ -98,4 +98,18 @@ class GoogleBatchMachineTypeSelectorTest extends Specification { '50 GB' | 'c2d-highmem-56' | 'c2d' | 56 | '1500 GB' '750 GB' | 'm3-megamem-64' | 'm3' | 64 | '1500 GB' } + + def 'should know when to install GPU drivers'() { + expect: + final machineType = new MachineType(type: TYPE, gpusPerVm: GPUS) + GoogleBatchMachineTypeSelector.INSTANCE.installGpuDrivers(machineType) == EXPECTED + + where: + TYPE | GPUS | EXPECTED + 'n2-standard-4' | 0 | false + 'n2-standard-4' | 1 | true + 'a2-highgpu-1g' | 0 | true + 'a3-highgpu-1g' | 0 | true + 'g2-standard-4' | 0 | true + } } diff --git a/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy b/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy index cefb0bfaad..37b27e0b5a 100644 --- a/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy +++ b/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy @@ -106,6 +106,7 @@ class GoogleBatchTaskHandlerTest extends Specification { and: !instancePolicyOrTemplate.getInstanceTemplate() and: + !instancePolicyOrTemplate.getInstallGpuDrivers() instancePolicy.getAcceleratorsCount() == 0 instancePolicy.getDisksCount() == 0 !instancePolicy.getMachineType() @@ -202,11 +203,9 @@ class GoogleBatchTaskHandlerTest extends Specification { and: runnable.getContainer().getCommandsList().join(' ') == '/bin/bash -o pipefail -c bash .command.run' runnable.getContainer().getImageUri() == CONTAINER_IMAGE - runnable.getContainer().getOptions() == '--this --that --privileged' + runnable.getContainer().getOptions() == '--this --that' runnable.getContainer().getVolumesList() == [ '/mnt/disks/foo/scratch:/mnt/disks/foo/scratch:rw', - '/var/lib/nvidia/lib64:/usr/local/nvidia/lib64', - '/var/lib/nvidia/bin:/usr/local/nvidia/bin' ] and: 
runnable.getEnvironment().getVariablesMap() == [:]