Merge pull request #16 from seqeralabs/dev

Dev -> Main for 0.2.0 release
seqeralabs · Dec 3, 2024 · b48ed56 · b48ed56
2 parents 11c14b8 + 191da24
commit b48ed56
Show file tree

Hide file tree

Showing 16 changed files with 176 additions and 24 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,7 +3,25 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## v1.0.0dev - [date]
+## 0.2.0
+
+### Credits
+
+Special thanks to the following for their contributions to the release:
+
+- [Adam Talbot](https://github.com/adamrtalbot)
+- [Esha Joshi](https://github.com/ejseqera)
+- [Florian Wuennemann](https://github.com/FloWuenne)
+
+### Enhancements & fixes
+
+- [PR #11](https://github.com/seqeralabs/nf-chai/pull/11) - Expose additional Chai-1 parameters in the pipeline
+- [PR #12](https://github.com/seqeralabs/nf-chai/pull/12) - Add log for GPU/CPU
+- [PR #13](https://github.com/seqeralabs/nf-chai/pull/13) - Bump `chai_lab` version to 0.4.2
+- [PR #14](https://github.com/seqeralabs/nf-chai/pull/14) - Add parameter to provide multiple sequence alignment directory to Chai-1
+- [PR #15](https://github.com/seqeralabs/nf-chai/pull/15) - Add `test_full_msa` profile to test provision of MSAs
+
+## 0.1.0
 
 Initial release of seqeralabs/nf-chai, created with the [nf-core](https://nf-co.re/) template.
 

diff --git a/README.md b/README.md
@@ -9,9 +9,11 @@
 [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)
 [![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/seqeralabs/nf-chai)
 
+## POC implementation of Chai-1 in Nextflow
+
 ## Introduction
 
-**nf-chai** is a bioinformatics pipeline for running the [Chai-1](https://github.com/chaidiscovery/chai-lab) protein prediction algorithm on an input set of protein sequences in FASTA format. The pipeline has been written in Nextflow to generate results for downstream analysis in a reproducible, scalable and portable way.
+**nf-chai** is a simple, proof-of-concept bioinformatics pipeline for running the [Chai-1](https://github.com/chaidiscovery/chai-lab) protein structure prediction algorithm on an input set of protein sequences in FASTA format. The pipeline has been written in Nextflow to generate results for downstream analysis in a reproducible, scalable and portable way.
 
 ## Usage
 
@@ -54,6 +56,8 @@ nextflow run seqeralabs/nf-chai \
 
 Set the `--weights_dir` parameter to a location with the pre-downloaded weights required by Chai-1 to avoid having to download them every time you run the pipeline.
 
+To further improve prediction performance with evolutionary information from pre-built multiple sequence alignments (MSAs), set the `--msa_dir` parameter to a directory containing MSA files in the [`*.aligned.pqt`](https://github.com/chaidiscovery/chai-lab/tree/main/examples/msas#adding-msa-evolutionary-information) format required by Chai-1.
+
 ## Credits
 
 nf-chai was originally written by the Seqera Team.

diff --git a/assets/multiple_entities.fa → assets/fasta/multiple_entities.fa b/assets/multiple_entities.fa → assets/fasta/multiple_entities.fa
diff --git a/assets/short_protein_sequence.fa → assets/fasta/short_protein_sequence.fa b/assets/short_protein_sequence.fa → assets/fasta/short_protein_sequence.fa
diff --git a/assets/msa/703adc2c74b8d7e613549b6efcf37126da7963522dc33852ad3c691eef1da06f.aligned.pqt b/assets/msa/703adc2c74b8d7e613549b6efcf37126da7963522dc33852ad3c691eef1da06f.aligned.pqt
diff --git a/assets/msa/952a89ff052afbe8cd1656a317de8a4aa2457d6d73f50d228961bb84efd17e02.aligned.pqt b/assets/msa/952a89ff052afbe8cd1656a317de8a4aa2457d6d73f50d228961bb84efd17e02.aligned.pqt
diff --git a/bin/run_chai_1.py b/bin/run_chai_1.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 from chai_lab.chai1 import run_inference
 import torch
+import logging
 
 def main():
     # Set up argument parser
@@ -22,6 +23,37 @@ def main():
         type=Path,
         help="Path to the input FASTA file."
     )
+    # Add optional arguments with current defaults
+    parser.add_argument(
+        "--num-trunk-recycles",
+        type=int,
+        default=3,
+        help="Number of trunk recycles (default: 3)"
+    )
+    parser.add_argument(
+        "--num-diffn-timesteps",
+        type=int,
+        default=200,
+        help="Number of diffusion timesteps (default: 200)"
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="Random seed for reproducibility (default: 42)"
+    )
+    parser.add_argument(
+        "--use-esm-embeddings",
+        action="store_true",
+        default=True,
+        help="Use ESM embeddings (enabled by default)"
+    )
+    parser.add_argument(
+        "--msa-dir",
+        type=str,
+        default=None,
+        help="Directory containing precomputed multiple sequence alignments (MSA)."
+    )
 
     # Parse arguments
     args = parser.parse_args()
@@ -34,17 +66,23 @@ def main():
     args.output_dir.mkdir(parents=True, exist_ok=True)
 
     # Set device for PyTorch
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    if torch.cuda.is_available():
+        logging.info("GPU found, using GPU")
+        device = torch.device("cuda")
+    else:
+        logging.info("No GPU found, using CPU")
+        device = "cpu"
 
     # Run structure prediction
     run_inference(
         fasta_file=args.fasta_file,
         output_dir=args.output_dir,
-        num_trunk_recycles=3,
-        num_diffn_timesteps=200,
-        seed=42,
+        num_trunk_recycles=args.num_trunk_recycles,
+        num_diffn_timesteps=args.num_diffn_timesteps,
+        seed=args.seed,
         device=device,
-        use_esm_embeddings=True,
+        use_esm_embeddings=args.use_esm_embeddings,
+        msa_directory=Path(args.msa_dir) if args.msa_dir else None,
     )
 
 if __name__ == "__main__":

diff --git a/conf/test.config b/conf/test.config
@@ -24,7 +24,8 @@ process {
 
 params {
 
-    // Input fasta file
-    input = "${projectDir}/assets/short_protein_sequence.fa"
+    // Input sequence for FASTA file obtained from chai-lab examples:
+    // https://github.com/chaidiscovery/chai-lab/blob/2d2646bde676da6c9b3fa23b38b47fef8fc0d420/examples/msas/predict_with_msas.py#L14-L15
+    input = "${projectDir}/assets/fasta/short_protein_sequence.fa"
 
 }
diff --git a/conf/test_full.config b/conf/test_full.config
@@ -12,7 +12,8 @@
 
 params {
 
-    // Input fasta file
-    input = "${projectDir}/assets/multiple_entities.fa"
+    // Input sequences for FASTA file obtained from chai-lab examples:
+    // https://github.com/chaidiscovery/chai-lab/blob/2d2646bde676da6c9b3fa23b38b47fef8fc0d420/examples/predict_structure.py#L16-L23
+    input = "${projectDir}/assets/fasta/multiple_entities.fa"
 
 }
diff --git a/conf/test_full_msa.config b/conf/test_full_msa.config
@@ -0,0 +1,23 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running full-size tests with precomputed MSAs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a full size pipeline test.
+
+    Use as follows:
+        nextflow run seqeralabs/nf-chai -profile test_full_msa,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+
+    // Input sequences for FASTA file obtained from chai-lab examples:
+    // https://github.com/chaidiscovery/chai-lab/blob/2d2646bde676da6c9b3fa23b38b47fef8fc0d420/examples/predict_structure.py#L16-L23
+    input = "${projectDir}/assets/fasta/multiple_entities.fa"
+
+    // Input MSA files obtained from chai-lab examples:
+    // https://github.com/chaidiscovery/chai-lab/tree/2d2646bde676da6c9b3fa23b38b47fef8fc0d420/examples/msas
+    msa_dir = "${projectDir}/assets/msa/"
+
+}
diff --git a/main.nf b/main.nf
@@ -43,7 +43,12 @@ workflow {
     //
     NF_CHAI (
         params.input,
-        params.weights_dir
+        params.weights_dir,
+        params.msa_dir,
+        params.num_trunk_recycles,
+        params.num_diffusion_timesteps,
+        params.seed,
+        params.use_esm_embeddings
     )
 
     //

diff --git a/modules/local/chai_1/environment.yml b/modules/local/chai_1/environment.yml
@@ -8,4 +8,4 @@ dependencies:
   - cuda=12.1
   - pip
   - pip:
-      - chai_lab==0.3.0
+      - chai_lab==0.4.2
diff --git a/modules/local/chai_1/main.nf b/modules/local/chai_1/main.nf
@@ -2,11 +2,16 @@ process CHAI_1 {
     tag "$meta.id"
     label 'process_high'
     conda "${moduleDir}/environment.yml"
-    container 'drpatelh/chai_lab:0.3.0'
+    container 'community.wave.seqera.io/library/gcc_linux-64_python_cuda_pip_chai_lab:44cb323409492b49'
 
     input:
     tuple val(meta), path(fasta)
     path weights_dir
+    path msa_dir
+    val num_trunk_recycles
+    val num_diffusion_timesteps
+    val seed
+    val use_esm_embeddings
 
     output:
     tuple val(meta), path("${meta.id}/*.cif"), emit: structures
@@ -15,17 +20,22 @@ process CHAI_1 {
 
     script:
     def downloads_dir = weights_dir ?: './downloads'
+    def msa_path = msa_dir ? "--msa-dir=$msa_dir" : ''
+    def use_esm = use_esm_embeddings ? '--use-esm-embeddings' : ''
     """
     CHAI_DOWNLOADS_DIR=$downloads_dir \\
     run_chai_1.py \\
+        --fasta-file ${fasta} \\
         --output-dir ${meta.id} \\
-        --fasta-file ${fasta}
+        --num-trunk-recycles ${num_trunk_recycles} \\
+        --num-diffn-timesteps ${num_diffusion_timesteps} \\
+        --seed ${seed} \\
+        ${use_esm} \\
+        ${msa_path}
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        python: \$(python --version | sed 's/Python //g')
         chai_lab: \$(python -c "import chai_lab; print(chai_lab.__version__)")
-        torch: \$(python -c "import torch; print(torch.__version__)")
     END_VERSIONS
     """
 

diff --git a/nextflow.config b/nextflow.config
@@ -12,7 +12,12 @@ params {
     // Input options
     input                        = null
     weights_dir                  = null
+    msa_dir                      = null
     use_gpus                     = false
+    num_trunk_recycles           = 3
+    num_diffusion_timesteps      = 200
+    seed                         = 42
+    use_esm_embeddings           = true
 
     // Boilerplate options
     outdir                       = null
@@ -26,6 +31,7 @@ params {
 
     // Schema validation default options
     validate_params              = true
+
 }
 
 // Default publishing settings for all processes
@@ -127,6 +133,7 @@ profiles {
     apptainer {
         apptainer.enabled       = true
         apptainer.autoMounts    = true
+        apptainer.runOptions    = params.use_gpus ? '--nv' : ''
         conda.enabled           = false
         docker.enabled          = false
         singularity.enabled     = false
@@ -141,8 +148,9 @@ profiles {
         wave.freeze             = true
         wave.strategy           = 'conda,container'
     }
-    test      { includeConfig 'conf/test.config'      }
-    test_full { includeConfig 'conf/test_full.config' }
+    test          { includeConfig 'conf/test.config'          }
+    test_full     { includeConfig 'conf/test_full.config'     }
+    test_full_msa { includeConfig 'conf/test_full_msa.config' }
 }
 
 // Export these variables to prevent local Python/R libraries from conflicting with those in the container
@@ -183,8 +191,7 @@ manifest {
     description     = """Nextflow pipeline to run the Chai-1, SOTA model for biomolecular structure prediction"""
     mainScript      = 'main.nf'
     nextflowVersion = '!>=24.04.2'
-    version         = '1.0.0dev'
-    doi             = ''
+    version         = '0.2.0'
 }
 
 // Nextflow plugins

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -35,10 +35,45 @@
                     "description": "Directory containing model weights and other artifacts required by Chai-1.",
                     "fa_icon": "fas fa-folder-open"
                 },
+                "msa_dir": {
+                    "type": "string",
+                    "format": "directory-path",
+                    "exists": true,
+                    "description": "Directory containing precomputed multiple-sequence alignments",
+                    "fa_icon": "fas fa-align-justify"
+                },
                 "use_gpus": {
                     "type": "boolean",
                     "description": "Run compatible tasks on GPUs rather than CPUs (default).",
                     "fa_icon": "fas fa-microchip"
+                },
+                "num_trunk_recycles": {
+                    "type": "integer",
+                    "default": 3,
+                    "fa_icon": "fas fa-recycle",
+                    "description": "Number of trunk recycles",
+                    "hidden": true
+                },
+                "num_diffusion_timesteps": {
+                    "type": "integer",
+                    "default": 200,
+                    "fa_icon": "fas fa-shoe-prints",
+                    "hidden": true,
+                    "description": "Number of diffusion steps to use."
+                },
+                "seed": {
+                    "type": "integer",
+                    "default": 42,
+                    "fa_icon": "fas fa-seedling",
+                    "hidden": true,
+                    "description": "Random seed to be used for Chai-1 calculations"
+                },
+                "use_esm_embeddings": {
+                    "type": "boolean",
+                    "default": true,
+                    "fa_icon": "fas fa-stamp",
+                    "hidden": true,
+                    "description": "Use ESM embeddings"
                 }
             }
         },

diff --git a/workflows/nf_chai/main.nf b/workflows/nf_chai/main.nf
@@ -16,8 +16,13 @@ include { CHAI_1                 } from '../../modules/local/chai_1'
 workflow NF_CHAI {
 
     take:
-    fasta_file  //  string: path to fasta file read provided via --input parameter
-    weights_dir //  string: path to model directory read provided via --weights_directory parameter
+    fasta_file              //  string: path to FASTA file provided via the --input parameter
+    weights_dir             //  string: path to model weights directory provided via the --weights_dir parameter
+    msa_dir                 //  string: path to the directory containing multiple sequence alignments (msa)
+    num_trunk_recycles      // integer: Number of trunk recycles
+    num_diffusion_timesteps // integer: Number of diffusion steps to use
+    seed                    // integer: Random seed to be used for Chai-1 calculations
+    use_esm_embeddings      // boolean: Use ESM embeddings
 
     main:
 
@@ -34,7 +39,12 @@ workflow NF_CHAI {
     // Run structure prediction with Chai-1
     CHAI_1 (
         ch_fasta,
-        weights_dir ? Channel.fromPath(weights_dir) : []
+        weights_dir ? Channel.fromPath(weights_dir) : [],
+        msa_dir ? Channel.fromPath(msa_dir) : [],
+        num_trunk_recycles,
+        num_diffusion_timesteps,
+        seed,
+        use_esm_embeddings
     )
     ch_versions = ch_versions.mix(CHAI_1.out.versions)