Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make auto retry codes configurable #5148

Merged
merged 4 commits into from
Jul 16, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -845,6 +845,12 @@ Read the {ref}`google-page` page for more information.

The following settings are available for Google Cloud Batch:

`google.batch.autoRetryExitCodes`
: :::{versionadded} 24.06.0-edge
pditommaso marked this conversation as resolved.
Show resolved Hide resolved
:::
: Defines the list of task exit codes that trigger an automatic retry of the task execution when the setting `maxSpotAttempts` is set
to a value greater than 0 (default: `[50001]`). For more details, see the [Google Batch documentation](https://cloud.google.com/batch/docs/troubleshooting#vm_preemption_50001).

`google.enableRequesterPaysBuckets`
: When `true` uses the given Google Cloud project ID as the billing project for storage access. This is required when accessing data from *requester pays enabled* buckets. See [Requester Pays on Google Cloud Storage documentation](https://cloud.google.com/storage/docs/requester-pays) (default: `false`).

Expand All @@ -865,6 +871,7 @@ The following settings are available for Google Cloud Batch:
: :::{versionadded} 23.11.0-edge
:::
: Max number of execution attempts of a job interrupted by a Compute Engine spot reclaim event (default: `5`).
See also `google.batch.autoRetryExitCodes`.
pditommaso marked this conversation as resolved.
Show resolved Hide resolved

`google.project`
: The Google Cloud project ID to use for pipeline execution
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import groovy.transform.CompileStatic
import groovy.util.logging.Slf4j
import nextflow.Session
import nextflow.cloud.google.GoogleOpts
import nextflow.exception.ProcessUnrecoverableException
import nextflow.util.MemoryUnit
/**
* Model Google Batch config settings
Expand All @@ -46,6 +47,7 @@ class BatchConfig {
private String subnetwork
private String serviceAccountEmail
private BatchRetryConfig retryConfig
private List<Integer> autoRetryExitCodes

/** @return the value of the {@code googleOpts} field */
GoogleOpts getGoogleOpts() {
    return googleOpts
}

/** @return the value of the {@code credentials} field */
GoogleCredentials getCredentials() {
    return credentials
}
Expand All @@ -61,6 +63,7 @@ class BatchConfig {
/** @return the value of the {@code subnetwork} field */
String getSubnetwork() {
    return subnetwork
}

/** @return the value of the {@code serviceAccountEmail} field */
String getServiceAccountEmail() {
    return serviceAccountEmail
}

/** @return the value of the {@code retryConfig} field */
BatchRetryConfig getRetryConfig() {
    return retryConfig
}

/** @return the value of the {@code autoRetryExitCodes} field */
List<Integer> getAutoRetryExitCodes() {
    return autoRetryExitCodes
}

static BatchConfig create(Session session) {
final result = new BatchConfig()
Expand All @@ -78,6 +81,7 @@ class BatchConfig {
result.subnetwork = session.config.navigate('google.batch.subnetwork')
result.serviceAccountEmail = session.config.navigate('google.batch.serviceAccountEmail')
result.retryConfig = new BatchRetryConfig( session.config.navigate('google.batch.retryPolicy') as Map ?: Map.of() )
result.autoRetryExitCodes = parseAutoRetryExitCodes0(session.config.navigate('google.batch.autoRetryExitCodes','50001'))
return result
}

Expand All @@ -86,4 +90,34 @@ class BatchConfig {
return "BatchConfig[googleOpts=$googleOpts"
}

/**
 * Textual form of exit code 50001. `parseAutoRetryExitCodes0` compares string
 * settings against it to short-circuit to {@link #DEFAULT_RETRY_LIST}.
 * Declared `final` so the shared constant cannot be reassigned.
 */
static private final String _50001 = '50001'

/**
 * Immutable single-element list holding the integer value of {@link #_50001}.
 * Declared `final` so the shared constant cannot be reassigned.
 */
static private final List<Integer> DEFAULT_RETRY_LIST = List.of(_50001.toInteger())

/**
 * Parse the raw value of the `google.batch.autoRetryExitCodes` setting into a
 * list of exit codes.
 *
 * @param value
 *      The raw setting value: a list of codes, a single numeric code, or a
 *      string holding one or more comma-separated codes
 * @return
 *      The list of exit codes. An empty list is returned when the value is
 *      Groovy-falsy (e.g. {@code null}, an empty string, an empty list or zero);
 *      {@code null} is returned when the value type is not one of the above
 * @throws ProcessUnrecoverableException
 *      When a token of a comma-separated string is not a valid integer
 */
static protected List<Integer> parseAutoRetryExitCodes0(value) {
    if( !value )
        return List.of()
    if( value instanceof List ) {
        // it's expected to be a list of integer
        return value as List<Integer>
    }
    if( value instanceof Number ) {
        // a single numeric code, e.g. `autoRetryExitCodes = 50001`, previously
        // fell through to the final `return null`; wrap it in a list instead
        return List.of(Integer.valueOf(((Number) value).intValue()))
    }
    if( value instanceof CharSequence ) {
        final v0 = value.toString()
        // fast path: the default value maps to the shared default list
        if( v0==_50001 )
            return DEFAULT_RETRY_LIST
        // `tokenize` already returns a list, no extra copy is needed
        return v0.tokenize(',').collect(it->parseCode(it))
    }
    // unsupported value type: behaviour kept as in the original implementation
    return null
}

/**
 * Convert the textual representation of a single exit code to an integer.
 *
 * @param v The exit code as a string
 * @return The exit code as an integer
 * @throws ProcessUnrecoverableException When the string is not a valid integer
 */
static private Integer parseCode(String v) {
    Integer code
    try {
        code = v.toInteger()
    }
    catch (NumberFormatException e) {
        final String message = "Invalid exit code value: $v -- check the setting `google.batch.autoRetryExitCodes`"
        throw new ProcessUnrecoverableException(message)
    }
    return code
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ package nextflow.cloud.google.batch.client
import nextflow.Session
import spock.lang.Requires
import spock.lang.Specification
import spock.lang.Unroll

/**
*
* @author Paolo Di Tommaso <[email protected]>
Expand All @@ -31,19 +33,54 @@ class BatchConfigTest extends Specification {
given:
def CONFIG = [google: [
batch: [
spot: true,
retryPolicy: [maxAttempts: 10]
spot: true
]
] ]
def session = Mock(Session) { getConfig()>>CONFIG }

when:
def config = BatchConfig.create(session)
then:
config.getSpot()
and:
config.retryConfig.maxAttempts == 5
config.maxSpotAttempts == 5
config.autoRetryExitCodes == [50001]
}

// Only runs when the GOOGLE_APPLICATION_CREDENTIALS environment variable is set.
// Checks that custom `google.batch` settings are reflected by the created BatchConfig.
@Requires({System.getenv('GOOGLE_APPLICATION_CREDENTIALS')})
def 'should create batch config with custom settings' () {
    given:
    // custom Google Batch settings, nested under `google.batch`
    def batchSettings = [
            spot: true,
            maxSpotAttempts: 8,
            autoRetryExitCodes: [50001, 50003, 50005],
            retryPolicy: [maxAttempts: 10]
    ]
    def sessionConfig = [google: [batch: batchSettings]]
    // session stub returning the above configuration
    def session = Mock(Session)
    session.getConfig() >> sessionConfig

    when:
    def batchConfig = BatchConfig.create(session)

    then:
    batchConfig.getSpot()
    batchConfig.retryConfig.maxAttempts == 10
    batchConfig.maxSpotAttempts == 8
    batchConfig.autoRetryExitCodes == [50001, 50003, 50005]
}

// Data-driven check of `BatchConfig.parseAutoRetryExitCodes0` over null,
// single-code string, comma-separated string and list inputs.
@Unroll
def 'should should parse exit codes' () {
    expect:
    BatchConfig.parseAutoRetryExitCodes0(CODES) == EXPECTED

    where:
    // the n-th input is paired with the n-th expected result
    CODES    << [ null, '1', '2,4,8', [10,20] ]
    EXPECTED << [ [],   [1], [2,4,8], [10,20] ]
}

}
Loading