From 90afdb21cb3a6768fbe249eb3311fa9179b355b6 Mon Sep 17 00:00:00 2001 From: Paolo Di Tommaso Date: Mon, 15 Jul 2024 21:01:00 +0200 Subject: [PATCH 1/3] Add google.batch.autoRetryExitCodes config setting Signed-off-by: Paolo Di Tommaso --- docs/config.md | 7 +++ .../google/batch/client/BatchConfig.groovy | 34 +++++++++++++++ .../batch/client/BatchConfigTest.groovy | 43 +++++++++++++++++-- 3 files changed, 81 insertions(+), 3 deletions(-) diff --git a/docs/config.md b/docs/config.md index 645ec4f91e..0fcbee1bbd 100644 --- a/docs/config.md +++ b/docs/config.md @@ -845,6 +845,12 @@ Read the {ref}`google-page` page for more information. The following settings are available for Google Cloud Batch: +`google.batch.autoRetryExitCodes` +: :::{versionadded} 24.06.0-edge + ::: +: Defines the task exit codes that determine an automatic task execution retry when the setting `maxSpotAttempts` is set +to a value greater than 0. (default `[50001]`). For more details check out the [Google Batch documentation](https://cloud.google.com/batch/docs/troubleshooting#vm_preemption_50001). + `google.enableRequesterPaysBuckets` : When `true` uses the given Google Cloud project ID as the billing project for storage access. This is required when accessing data from *requester pays enabled* buckets. See [Requester Pays on Google Cloud Storage documentation](https://cloud.google.com/storage/docs/requester-pays) (default: `false`). @@ -865,6 +871,7 @@ The following settings are available for Google Cloud Batch: : :::{versionadded} 23.11.0-edge ::: : Max number of execution attempts of a job interrupted by a Compute Engine spot reclaim event (default: `5`). +See also `autoRetryExitCodes`. `google.project` : The Google Cloud project ID to use for pipeline execution diff --git a/plugins/nf-google/src/main/nextflow/cloud/google/batch/client/BatchConfig.groovy b/plugins/nf-google/src/main/nextflow/cloud/google/batch/client/BatchConfig.groovy index c481c4ec4d..7e9fbea3c5 100644 --- a/plugins/nf-google/src/main/nextflow/cloud/google/batch/client/BatchConfig.groovy +++ b/plugins/nf-google/src/main/nextflow/cloud/google/batch/client/BatchConfig.groovy @@ -22,6 +22,7 @@ import groovy.transform.CompileStatic import groovy.util.logging.Slf4j import nextflow.Session import nextflow.cloud.google.GoogleOpts +import nextflow.exception.ProcessUnrecoverableException import nextflow.util.MemoryUnit /** * Model Google Batch config settings @@ -46,6 +47,7 @@ class BatchConfig { private String subnetwork private String serviceAccountEmail private BatchRetryConfig retryConfig + private List autoRetryExitCodes GoogleOpts getGoogleOpts() { return googleOpts } GoogleCredentials getCredentials() { return credentials } @@ -61,6 +63,7 @@ class BatchConfig { String getSubnetwork() { subnetwork } String getServiceAccountEmail() { serviceAccountEmail } BatchRetryConfig getRetryConfig() { retryConfig } + List getAutoRetryExitCodes() { autoRetryExitCodes } static BatchConfig create(Session session) { final result = new BatchConfig() @@ -78,6 +81,7 @@ class BatchConfig { result.subnetwork = session.config.navigate('google.batch.subnetwork') result.serviceAccountEmail = session.config.navigate('google.batch.serviceAccountEmail') result.retryConfig = new BatchRetryConfig( session.config.navigate('google.batch.retryPolicy') as Map ?: Map.of() ) + result.autoRetryExitCodes = parseAutoRetryExitCodes0(session.config.navigate('google.batch.autoRetryExitCodes','50001')) return result } @@ -86,4 +90,34 @@ class BatchConfig { return "BatchConfig[googleOpts=$googleOpts" } + static private String _50001 = '50001' + + static private List DEFAULT_RETRY_LIST = List.of(_50001.toInteger()) + + static protected List parseAutoRetryExitCodes0(value) { + if(!value) + return List.of() + if( value instanceof List ) { + // it's expected to be a list of integer + return value as List + } + if( value instanceof CharSequence ) { + final v0 = value.toString() + if( v0==_50001 ) + return DEFAULT_RETRY_LIST + else + return v0.tokenize(',').toList().collect(it->parseCode(it)) + } + return null + } + + static private Integer parseCode(String v) { + try { + return v.toInteger() + } + catch (NumberFormatException e) { + throw new ProcessUnrecoverableException("Invalid exit code value: $v -- check the setting `google.batch.autoRetryExitCodes`") + } + } + } diff --git a/plugins/nf-google/src/test/nextflow/cloud/google/batch/client/BatchConfigTest.groovy b/plugins/nf-google/src/test/nextflow/cloud/google/batch/client/BatchConfigTest.groovy index 0a88e6c2a0..94394a9931 100644 --- a/plugins/nf-google/src/test/nextflow/cloud/google/batch/client/BatchConfigTest.groovy +++ b/plugins/nf-google/src/test/nextflow/cloud/google/batch/client/BatchConfigTest.groovy @@ -20,6 +20,8 @@ package nextflow.cloud.google.batch.client import nextflow.Session import spock.lang.Requires import spock.lang.Specification +import spock.lang.Unroll + /** * * @author Paolo Di Tommaso @@ -31,19 +33,54 @@ class BatchConfigTest extends Specification { given: def CONFIG = [google: [ batch: [ - spot: true, - retryPolicy: [maxAttempts: 10] + spot: true ] ] ] def session = Mock(Session) { getConfig()>>CONFIG } + when: + def config = BatchConfig.create(session) + then: + config.getSpot() + and: + config.retryConfig.maxAttempts == 5 + config.maxSpotAttempts == 5 + config.autoRetryExitCodes == [50001] + } + + @Requires({System.getenv('GOOGLE_APPLICATION_CREDENTIALS')}) + def 'should create batch config with custom settings' () { + given: + def CONFIG = [google: [ + batch: [ + spot: true, + maxSpotAttempts: 8, + autoRetryExitCodes: [50001, 50003, 50005], + retryPolicy: [maxAttempts: 10] + ] + ] ] + def session = Mock(Session) { getConfig()>>CONFIG } + when: def config = BatchConfig.create(session) then: config.getSpot() and: config.retryConfig.maxAttempts == 10 - + config.maxSpotAttempts == 8 + config.autoRetryExitCodes == [50001, 50003, 50005] + } + + @Unroll + def 'should should parse exit codes' () { + expect: + BatchConfig.parseAutoRetryExitCodes0(CODES) == EXPECTED + where: + CODES | EXPECTED + null | [] + '1' | [1] + '2,4,8' | [2,4,8] + [10,20] | [10,20] } } From 3f424544e6d5359f86121bb71b04ef4f5379de7a Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Thu, 11 Jul 2024 10:18:10 -0500 Subject: [PATCH 2/3] keep it simple Signed-off-by: Ben Sherman --- docs/config.md | 5 ++- .../google/batch/client/BatchConfig.groovy | 34 ++----------------- .../batch/client/BatchConfigTest.groovy | 22 +++--------- 3 files changed, 9 insertions(+), 52 deletions(-) diff --git a/docs/config.md b/docs/config.md index 0fcbee1bbd..b00a6efd72 100644 --- a/docs/config.md +++ b/docs/config.md @@ -848,8 +848,7 @@ The following settings are available for Google Cloud Batch: `google.batch.autoRetryExitCodes` : :::{versionadded} 24.06.0-edge ::: -: Defines the task exit codes that determine an automatic task execution retry when the setting `maxSpotAttempts` is set -to a value greater than 0. (default `[50001]`). For more details check out the [Google Batch documentation](https://cloud.google.com/batch/docs/troubleshooting#vm_preemption_50001). +: Defines the list of exit codes that will be automatically retried by Google Batch when `google.batch.maxSpotAttempts` is greater than 0 (default `[50001]`). Refer to the [Google Batch documentation](https://cloud.google.com/batch/docs/troubleshooting#reserved-exit-codes) for the list of retryable exit codes. `google.enableRequesterPaysBuckets` : When `true` uses the given Google Cloud project ID as the billing project for storage access. This is required when accessing data from *requester pays enabled* buckets. See [Requester Pays on Google Cloud Storage documentation](https://cloud.google.com/storage/docs/requester-pays) (default: `false`). @@ -871,7 +870,7 @@ to a value greater than 0. (default `[50001]`). For more details check out the [ : :::{versionadded} 23.11.0-edge ::: : Max number of execution attempts of a job interrupted by a Compute Engine spot reclaim event (default: `5`). -See also `autoRetryExitCodes`. +: See also: `google.batch.autoRetryExitCodes` `google.project` : The Google Cloud project ID to use for pipeline execution diff --git a/plugins/nf-google/src/main/nextflow/cloud/google/batch/client/BatchConfig.groovy b/plugins/nf-google/src/main/nextflow/cloud/google/batch/client/BatchConfig.groovy index 7e9fbea3c5..6af5c0abb0 100644 --- a/plugins/nf-google/src/main/nextflow/cloud/google/batch/client/BatchConfig.groovy +++ b/plugins/nf-google/src/main/nextflow/cloud/google/batch/client/BatchConfig.groovy @@ -33,6 +33,8 @@ import nextflow.util.MemoryUnit @CompileStatic class BatchConfig { + static private List DEFAULT_RETRY_LIST = List.of(50001) + private GoogleOpts googleOpts private GoogleCredentials credentials private List allowedLocations @@ -81,7 +83,7 @@ class BatchConfig { result.subnetwork = session.config.navigate('google.batch.subnetwork') result.serviceAccountEmail = session.config.navigate('google.batch.serviceAccountEmail') result.retryConfig = new BatchRetryConfig( session.config.navigate('google.batch.retryPolicy') as Map ?: Map.of() ) - result.autoRetryExitCodes = parseAutoRetryExitCodes0(session.config.navigate('google.batch.autoRetryExitCodes','50001')) + result.autoRetryExitCodes = session.config.navigate('google.batch.autoRetryExitCodes',DEFAULT_RETRY_LIST) as List return result } @@ -90,34 +92,4 @@ class BatchConfig { return "BatchConfig[googleOpts=$googleOpts" } - static private String _50001 = '50001' - - static private List DEFAULT_RETRY_LIST = List.of(_50001.toInteger()) - - static protected List parseAutoRetryExitCodes0(value) { - if(!value) - return List.of() - if( value instanceof List ) { - // it's expected to be a list of integer - return value as List - } - if( value instanceof CharSequence ) { - final v0 = value.toString() - if( v0==_50001 ) - return DEFAULT_RETRY_LIST - else - return v0.tokenize(',').toList().collect(it->parseCode(it)) - } - return null - } - - static private Integer parseCode(String v) { - try { - return v.toInteger() - } - catch (NumberFormatException e) { - throw new ProcessUnrecoverableException("Invalid exit code value: $v -- check the setting `google.batch.autoRetryExitCodes`") - } - } - } diff --git a/plugins/nf-google/src/test/nextflow/cloud/google/batch/client/BatchConfigTest.groovy b/plugins/nf-google/src/test/nextflow/cloud/google/batch/client/BatchConfigTest.groovy index 94394a9931..e364a01667 100644 --- a/plugins/nf-google/src/test/nextflow/cloud/google/batch/client/BatchConfigTest.groovy +++ b/plugins/nf-google/src/test/nextflow/cloud/google/batch/client/BatchConfigTest.groovy @@ -20,8 +20,6 @@ package nextflow.cloud.google.batch.client import nextflow.Session import spock.lang.Requires import spock.lang.Specification -import spock.lang.Unroll - /** * * @author Paolo Di Tommaso @@ -32,10 +30,10 @@ class BatchConfigTest extends Specification { def 'should create batch config' () { given: def CONFIG = [google: [ - batch: [ - spot: true - ] - ] ] + batch: [ + spot: true + ] + ] ] def session = Mock(Session) { getConfig()>>CONFIG } when: @@ -71,16 +69,4 @@ class BatchConfigTest extends Specification { config.autoRetryExitCodes == [50001, 50003, 50005] } - @Unroll - def 'should should parse exit codes' () { - expect: - BatchConfig.parseAutoRetryExitCodes0(CODES) == EXPECTED - where: - CODES | EXPECTED - null | [] - '1' | [1] - '2,4,8' | [2,4,8] - [10,20] | [10,20] - } - } From f655dc39b5f38ccf185b427e1145df10cadc0f70 Mon Sep 17 00:00:00 2001 From: Paolo Di Tommaso Date: Tue, 16 Jul 2024 19:54:58 +0200 Subject: [PATCH 3/3] Update docs/config.md [ci skip] Signed-off-by: Paolo Di Tommaso --- docs/config.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/config.md b/docs/config.md index b00a6efd72..ba97797cdb 100644 --- a/docs/config.md +++ b/docs/config.md @@ -846,7 +846,7 @@ Read the {ref}`google-page` page for more information. The following settings are available for Google Cloud Batch: `google.batch.autoRetryExitCodes` -: :::{versionadded} 24.06.0-edge +: :::{versionadded} 24.07.0-edge ::: : Defines the list of exit codes that will be automatically retried by Google Batch when `google.batch.maxSpotAttempts` is greater than 0 (default `[50001]`). Refer to the [Google Batch documentation](https://cloud.google.com/batch/docs/troubleshooting#reserved-exit-codes) for the list of retryable exit codes.