From 4c2b012d7fc0ba6ed8ba4c8448b0e2661cc976d4 Mon Sep 17 00:00:00 2001 From: Paolo Di Tommaso Date: Thu, 1 Aug 2024 08:49:08 +0200 Subject: [PATCH] Improve fusion docs (#5166) [ci skip] Signed-off-by: Paolo Di Tommaso Co-authored-by: Jordi Deu-Pons Co-authored-by: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com> Signed-off-by: Niklas Schandry --- docker-scratch/Dockerfile | 11 ++ docker-scratch/make.sh | 58 ++++++++++ docs/fusion.md | 216 +++++++++++++++++++++++--------------- packing.gradle | 5 + 4 files changed, 205 insertions(+), 85 deletions(-) create mode 100644 docker-scratch/Dockerfile create mode 100644 docker-scratch/make.sh diff --git a/docker-scratch/Dockerfile b/docker-scratch/Dockerfile new file mode 100644 index 0000000000..c2919602af --- /dev/null +++ b/docker-scratch/Dockerfile @@ -0,0 +1,11 @@ +FROM amazoncorretto:17-al2023 +COPY .nextflow /.nextflow +COPY nextflow /usr/bin/nextflow +ENV NXF_HOME=/.nextflow +RUN nextflow info +RUN NXF_PLUGINS_DEFAULT=false nextflow plugin install nf-tower,nf-wave,nf-cloudcache,nf-azure,nf-google,nf-amazon,xpack-amzn,xpack-google,nf-cloudcache + +#FROM scratch +#COPY --from=0 /.nextflow /.nextflow +#COPY --from=0 /usr/bin/nextflow /usr/bin/nextflow + diff --git a/docker-scratch/make.sh b/docker-scratch/make.sh new file mode 100644 index 0000000000..8217e6b44b --- /dev/null +++ b/docker-scratch/make.sh @@ -0,0 +1,58 @@ +# +# Copyright 2013-2024, Seqera Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# + +# cleanup +rm -rf .nextflow && mkdir .nextflow +# copy nextflow dependencies +(cd .. +./gradlew compile assemble +BUILD_PACK=1 ./gradlew installScratch publishToMavenLocal -Dmaven.repo.local=${PWD}/docker-scratch/.nextflow/capsule/deps/ +) + +# copy nextflow launcher script +cp ../nextflow . && chmod +x nextflow +cp ../modules/nextflow/src/main/resources/META-INF/build-info.properties . +source build-info.properties + +if [ -z "$version" ]; then + echo "Error: version is empty or missing"; exit 1 +fi +if [ -z "$build" ]; then + echo "Error: build is empty or missing"; exit 1 +fi +if [ -z "$commitId" ]; then + echo "Error: commitId is empty or missing"; exit 1 +fi + + +TAG=${version}-${commitId} +repository=${repository:-'docker.io/pditommaso/nf-launcher-dev'} +image=${repository}:${TAG} +base=${base:-'docker.io/pditommaso/nf-lancher:j17-base'} + +docker buildx build \ + --no-cache \ + --platform linux/amd64 \ + --output=type=docker \ + --progress=plain \ + --tag ${image} \ + --build-arg TARGETPLATFORM=linux/amd64 \ + . + +#launcher=$(wave -i ${base} --include ${image} --config-env NXF_HOME=/.nextflow) +# +#echo $launcher diff --git a/docs/fusion.md b/docs/fusion.md index a10e7ab7cb..95cb14d33d 100644 --- a/docs/fusion.md +++ b/docs/fusion.md @@ -13,15 +13,95 @@ Support for Google Cloud Storage. Fusion is a distributed virtual file system for cloud-native data pipeline and optimised for Nextflow workloads. -It bridges the gap between cloud-native storage and data analysis workflow by implementing a thin client that allows any existing application to access object storage using the standard POSIX interface, thus simplifying and speeding up most operations. Currently it supports AWS S3 and Google Cloud Storage. 
+It bridges the gap between cloud-native storage and data analysis workflow by implementing a thin client that allows any existing application to access object storage using the standard POSIX interface, thus simplifying and speeding up most operations.
+Currently it supports AWS S3, Google Cloud Storage and Azure Blob containers.
 
 ## Getting started
 
+The Fusion file system implements a lazy download and upload algorithm that runs in the background to transfer files in
+parallel to and from object storage into a container-local temporary folder. This means that the performance of the disk
+volume used to carry out your computation is key to achieving maximum performance.
+
+By default Fusion uses the container `/tmp` directory as a temporary cache, so the size of the volume can be much lower
+than the actual needs of your pipeline processes. Fusion has a built-in garbage collector that constantly monitors remaining
+disk space on the temporary folder and immediately evicts old cached entries when necessary.
+
 ### Requirements
 
-Fusion file system is designed to work with containerised workloads, therefore it requires the use of a container engine such as Docker or a container native platform for the execution of your pipeline e.g. AWS Batch or Kubernetes. It also requires the use of {ref}`Wave containers`.
+Fusion file system is designed to work with containerised workloads, therefore it requires the use of a container engine
+such as Docker or a container native platform for the execution of your pipeline e.g. AWS Batch or Kubernetes. It also requires
+the use of {ref}`Wave containers`.
+
+### Azure Cloud
+
+Fusion provides built-in support for [Azure Blob Storage](https://azure.microsoft.com/en-us/products/storage/blobs/)
+when running in Azure Cloud.
+
+The support for Azure does not require any specific setting other than enabling Wave and Fusion in your Nextflow
+configuration. 
For example:
+
+```
+fusion.enabled = true
+wave.enabled = true
+process.executor = 'azure-batch'
+tower.accessToken = ''
+```
+
+Then run your pipeline using the usual command:
+
+```
+nextflow run -work-dir az:///scratch
+```
+
+Azure machines come with fast SSDs attached, therefore no additional storage configuration is required, however it is
+recommended to use the machine types with larger data disks attached, denoted by the suffix `d` after the core number
+(e.g. `Standard_E32*d*_v5`). These will increase the throughput of Fusion and reduce the chance of overloading the machine.
+
+### AWS Cloud
+
+Fusion file system allows the use of an S3 bucket as a pipeline work directory with the AWS Batch executor.
+The use of Fusion makes obsolete the need to create and configure a custom AMI that includes the `aws` command
+line tool, when setting up the AWS Batch compute environment.
+
+The configuration for this deployment scenario looks like the following:
+
+```groovy
+fusion.enabled = true
+wave.enabled = true
+process.executor = 'awsbatch'
+process.queue = ''
+aws.region = ''
+tower.accessToken = ''
+```
+
+Then you can run your pipeline using the following command:
+
+```bash
+nextflow run -work-dir s3:///scratch
+```
+
+For best performance make sure to use instance types that provide an NVMe disk as [instance storage](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html).
+If you are creating the AWS Batch compute environment by yourself, you will need to make sure the NVMe is properly formatted (see below).
+
+
+#### NVMe storage
+
+The recommended setup to get maximum performance is to mount an NVMe disk as the temporary folder and run the pipeline with
+the {ref}`scratch ` directive set to `false` to also avoid stage-out transfer time. 
+
+Example configuration for using AWS Batch with [NVMe disks](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ssd-instance-store.html) to maximize performance:
+
+```groovy
+aws.batch.volumes = '/path/to/ec2/nvme:/tmp'
+process.scratch = false
+```
+
+:::{tip}
+Seqera Platform is able to automatically format and configure the NVMe instance storage by enabling
+the option "Use Fast storage" when creating the Batch compute environment.
+:::
 
-### AWS S3 configuration
+#### AWS IAM permissions
 
 The AWS S3 bucket should be configured with the following IAM permissions:
 
@@ -54,75 +134,30 @@ The AWS S3 bucket should be configured with the following IAM permissions:
 }
 ```
 
-## Use cases
+### Google Cloud
 
-### Local execution with S3 bucket as work directory
+Fusion provides built-in support for [Google Storage](https://cloud.google.com/storage?hl=en)
+when running in Google Cloud.
 
-Fusion file system allows the use of an S3 bucket as a pipeline work directory with the Nextflow local executor. This configuration requires the use of Docker (or similar container engine) for the execution of your pipeline tasks.
-
-The AWS S3 bucket credentials should be made accessible via standard `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables.
+The support for Google does not require any specific setting other than enabling Wave and Fusion in your Nextflow
+configuration. 
For example:
 
-The following configuration should be added in your Nextflow configuration file:
-
-```groovy
-docker {
-    enabled = true
-}
-
-fusion {
-    enabled = true
-    exportStorageCredentials = true
-}
-
-wave {
-    enabled = true
-}
 ```
-
-Then you can run your pipeline using the following command:
-
-```bash
-nextflow run -work-dir s3:///scratch
+fusion.enabled = true
+wave.enabled = true
+process.executor = 'google-batch'
+tower.accessToken = ''
 ```
 
-Replace `` and `` with a pipeline script and bucket or your choice, for example:
+Then run your pipeline using the usual command:
 
-```bash
-nextflow run https://github.com/nextflow-io/rnaseq-nf -work-dir s3://nextflow-ci/scratch
 ```
-
-### AWS Batch execution with S3 bucket as work directory
-
-Fusion file system allows the use of an S3 bucket as a pipeline work directory with the AWS Batch executor. The use of Fusion makes obsolete the need to create and configure a custom AMI that includes the `aws` command line tool, when setting up the AWS Batch compute environment.
-
-The configuration for this deployment scenario looks like the following:
-
-```groovy
-fusion {
-    enabled = true
-}
-
-wave {
-    enabled = true
-}
-
-process {
-    executor = 'awsbatch'
-    queue = ''
-}
-
-aws {
-    region = ''
-}
+nextflow run -work-dir gs:///scratch
 ```
 
-Then you can run your pipeline using the following command:
+When using Fusion, if the `process.disk` is not set, Nextflow will attach a single local SSD disk to the machine. The size of this disk can be much lower than the actual needs of your pipeline processes because Fusion uses it only as a temporary cache. Fusion is also compatible with other types of `process.disk`, but better performance is achieved when using local SSD disks.
 
-```bash
-nextflow run -work-dir s3:///scratch
-```
-
-### Kubernetes execution with S3 bucket as work directory
+### Kubernetes
 
 Fusion file system allows the use of an S3 bucket as a pipeline work directory with the Kubernetes executor. 
@@ -131,23 +166,13 @@ The use of Fusion makes obsolete the need to create and manage and separate pers
 The configuration for this deployment scenario looks like the following:
 
 ```groovy
-wave {
-    enabled = true
-}
-
-fusion {
-    enabled = true
-}
-
-process {
-    executor = 'k8s'
-}
-
-k8s {
-    context = ''
-    namespace = ''
-    serviceAccount = ''
-}
+fusion.enabled = true
+wave.enabled = true
+process.executor = 'k8s'
+k8s.context = ''
+k8s.namespace = ''
+k8s.serviceAccount = ''
+tower.accessToken = ''
 ```
 
 The `k8s.context` represents the Kubernetes configuration context to be used for the pipeline execution. This setting can be omitted if Nextflow itself is run as a pod in the Kubernetes clusters.
@@ -162,21 +187,42 @@ Having the above configuration in place, you can run your pipeline using the fol
 nextflow run -work-dir s3:///scratch
 ```
 
-## NVMe storage
+:::{note}
+You can also use Fusion and Kubernetes with Azure Blob Storage and Google Storage using the same deployment approach.
+:::
 
-The Fusion file system implements a lazy download and upload algorithm that runs in the background to transfer files in parallel to and from object storage into a container-local temporary folder. This means that the performance of the temporary folder inside the container (`/tmp` in a default setup) is key to achieving maximum performance.
+### Local execution
 
-The temporary folder is used only as a temporary cache, so the size of the volume can be much lower than the actual needs of your pipeline processes. Fusion has a built-in garbage collector that constantly monitors remaining disk space on the temporary folder and immediately evicts old cached entries when necessary.
+Fusion file system allows the use of an S3 bucket as a pipeline work directory with the Nextflow local executor. This configuration requires the use of Docker (or similar container engine) for the execution of your pipeline tasks. 
-The recommended setup to get maximum performance is to mount a NVMe disk as the temporary folder and run the pipeline with the {ref}`scratch ` directive set to `false` to also avoid stage-out transfer time.
+The AWS S3 bucket credentials should be made accessible via standard `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables.
 
-Example configuration for using AWS Batch with [NVMe disks](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ssd-instance-store.html) to maximize performance:
+The following configuration should be added in your Nextflow configuration file:
 
 ```groovy
-aws.batch.volumes = '/path/to/ec2/nvme:/tmp'
-process.scratch = false
+docker.enabled = true
+fusion.enabled = true
+fusion.exportStorageCredentials = true
+wave.enabled = true
+```
+
+Then you can run your pipeline using the following command:
+
+```bash
+nextflow run -work-dir s3:///scratch
 ```
 
+Replace `` and `` with a pipeline script and bucket of your choice, for example:
+
+```bash
+nextflow run https://github.com/nextflow-io/rnaseq-nf -work-dir s3://nextflow-ci/scratch
+```
+
+:::{warning}
+The option `fusion.exportStorageCredentials` leaks the AWS credentials into the task launcher script created by Nextflow.
+This option should only be used for development purposes.
+:::
+
 ## Advanced settings
 
 Fusion advanced configuration settings are described in the {ref}`Fusion ` section on the Nextflow configuration page.
diff --git a/packing.gradle b/packing.gradle
index ccbfdaf40e..f2f29a53b5 100644
--- a/packing.gradle
+++ b/packing.gradle
@@ -166,6 +166,11 @@ task installLauncher(type: Copy, dependsOn: ['pack']) {
     into "$homeDir/.nextflow/framework/$version/"
 }
 
+task installScratch(type: Copy, dependsOn: ['pack']) {
+    from "$releaseDir/nextflow-$version-one.jar"
+    into "${rootProject.projectDir}/docker-scratch/.nextflow/framework/$version/"
+}
+
 /*
  * build, tag and publish a and new docker packaged nextflow release
 */