From e562ce061c0133ab2d16d8601127d86e1aba8d55 Mon Sep 17 00:00:00 2001 From: Paolo Di Tommaso Date: Tue, 16 Jul 2024 19:55:52 +0200 Subject: [PATCH] Make Google Batch auto retry codes configurable (#5148) Signed-off-by: Paolo Di Tommaso Signed-off-by: Ben Sherman Co-authored-by: Ben Sherman --- docs/config.md | 6 ++++ .../google/batch/client/BatchConfig.groovy | 6 ++++ .../batch/client/BatchConfigTest.groovy | 35 +++++++++++++++---- 3 files changed, 41 insertions(+), 6 deletions(-) diff --git a/docs/config.md b/docs/config.md index 645ec4f91e..ba97797cdb 100644 --- a/docs/config.md +++ b/docs/config.md @@ -845,6 +845,11 @@ Read the {ref}`google-page` page for more information. The following settings are available for Google Cloud Batch: +`google.batch.autoRetryExitCodes` +: :::{versionadded} 24.07.0-edge + ::: +: Defines the list of exit codes that will be automatically retried by Google Batch when `google.batch.maxSpotAttempts` is greater than 0 (default: `[50001]`). Refer to the [Google Batch documentation](https://cloud.google.com/batch/docs/troubleshooting#reserved-exit-codes) for the list of retryable exit codes. + `google.enableRequesterPaysBuckets` : When `true` uses the given Google Cloud project ID as the billing project for storage access. This is required when accessing data from *requester pays enabled* buckets. See [Requester Pays on Google Cloud Storage documentation](https://cloud.google.com/storage/docs/requester-pays) (default: `false`). @@ -865,6 +870,7 @@ The following settings are available for Google Cloud Batch: : :::{versionadded} 23.11.0-edge ::: : Max number of execution attempts of a job interrupted by a Compute Engine spot reclaim event (default: `5`). 
+: See also: `google.batch.autoRetryExitCodes` `google.project` : The Google Cloud project ID to use for pipeline execution diff --git a/plugins/nf-google/src/main/nextflow/cloud/google/batch/client/BatchConfig.groovy b/plugins/nf-google/src/main/nextflow/cloud/google/batch/client/BatchConfig.groovy index c481c4ec4d..6af5c0abb0 100644 --- a/plugins/nf-google/src/main/nextflow/cloud/google/batch/client/BatchConfig.groovy +++ b/plugins/nf-google/src/main/nextflow/cloud/google/batch/client/BatchConfig.groovy @@ -22,6 +22,7 @@ import groovy.transform.CompileStatic import groovy.util.logging.Slf4j import nextflow.Session import nextflow.cloud.google.GoogleOpts +import nextflow.exception.ProcessUnrecoverableException import nextflow.util.MemoryUnit /** * Model Google Batch config settings @@ -32,6 +33,8 @@ import nextflow.util.MemoryUnit @CompileStatic class BatchConfig { + static private List DEFAULT_RETRY_LIST = List.of(50001) + private GoogleOpts googleOpts private GoogleCredentials credentials private List allowedLocations @@ -46,6 +49,7 @@ class BatchConfig { private String subnetwork private String serviceAccountEmail private BatchRetryConfig retryConfig + private List autoRetryExitCodes GoogleOpts getGoogleOpts() { return googleOpts } GoogleCredentials getCredentials() { return credentials } @@ -61,6 +65,7 @@ class BatchConfig { String getSubnetwork() { subnetwork } String getServiceAccountEmail() { serviceAccountEmail } BatchRetryConfig getRetryConfig() { retryConfig } + List getAutoRetryExitCodes() { autoRetryExitCodes } static BatchConfig create(Session session) { final result = new BatchConfig() @@ -78,6 +83,7 @@ class BatchConfig { result.subnetwork = session.config.navigate('google.batch.subnetwork') result.serviceAccountEmail = session.config.navigate('google.batch.serviceAccountEmail') result.retryConfig = new BatchRetryConfig( session.config.navigate('google.batch.retryPolicy') as Map ?: Map.of() ) + result.autoRetryExitCodes = 
session.config.navigate('google.batch.autoRetryExitCodes',DEFAULT_RETRY_LIST) as List return result } diff --git a/plugins/nf-google/src/test/nextflow/cloud/google/batch/client/BatchConfigTest.groovy b/plugins/nf-google/src/test/nextflow/cloud/google/batch/client/BatchConfigTest.groovy index 0a88e6c2a0..e364a01667 100644 --- a/plugins/nf-google/src/test/nextflow/cloud/google/batch/client/BatchConfigTest.groovy +++ b/plugins/nf-google/src/test/nextflow/cloud/google/batch/client/BatchConfigTest.groovy @@ -30,11 +30,33 @@ class BatchConfigTest extends Specification { def 'should create batch config' () { given: def CONFIG = [google: [ - batch: [ - spot: true, - retryPolicy: [maxAttempts: 10] - ] - ] ] + batch: [ + spot: true + ] + ] ] + def session = Mock(Session) { getConfig()>>CONFIG } + + when: + def config = BatchConfig.create(session) + then: + config.getSpot() + and: + config.retryConfig.maxAttempts == 5 + config.maxSpotAttempts == 5 + config.autoRetryExitCodes == [50001] + } + + @Requires({System.getenv('GOOGLE_APPLICATION_CREDENTIALS')}) + def 'should create batch config with custom settings' () { + given: + def CONFIG = [google: [ + batch: [ + spot: true, + maxSpotAttempts: 8, + autoRetryExitCodes: [50001, 50003, 50005], + retryPolicy: [maxAttempts: 10] + ] + ] ] def session = Mock(Session) { getConfig()>>CONFIG } when: @@ -43,7 +65,8 @@ class BatchConfigTest extends Specification { config.getSpot() and: config.retryConfig.maxAttempts == 10 - + config.maxSpotAttempts == 8 + config.autoRetryExitCodes == [50001, 50003, 50005] } }