feat: bazam wrapper (snakemake#580)

### Description  ### QC  * [x] I confirm that: For all wrappers added by this PR, * there is a test case which covers any introduced changes, * `input:` and `output:` file paths in the resulting rule can be changed arbitrarily, * either the wrapper can only use a single core, or the example rule contains a `threads: x` statement with `x` being a reasonable default, * rule names in the test case are in [snake_case](https://en.wikipedia.org/wiki/Snake_case) and somehow tell what the rule is about or match the tools purpose or name (e.g., `map_reads` for a step that maps reads), * all `environment.yaml` specifications follow [the respective best practices](https://stackoverflow.com/a/64594513/2352071), * wherever possible, command line arguments are inferred and set automatically (e.g. based on file extensions in `input:` or `output:`), * all fields of the example rules in the `Snakefile`s and their entries are explained via comments (`input:`/`output:`/`params:` etc.), * `stderr` and/or `stdout` are logged correctly (`log:`), depending on the wrapped tool, * temporary files are either written to a unique hidden folder in the working directory, or (better) stored where the Python function `tempfile.gettempdir()` points to (see [here](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir); this also means that using any Python `tempfile` default behavior works), * the `meta.yaml` contains a link to the documentation of the respective tool or command, * `Snakefile`s pass the linting (`snakemake --lint`), * `Snakefile`s are formatted with [snakefmt](https://github.com/snakemake/snakefmt), * Python wrapper scripts are formatted with [black](https://black.readthedocs.io). * Conda environments use a minimal amount of channels, in recommended ordering. E.g. for bioconda, use (conda-forge, bioconda, nodefaults, as conda-forge should have highest priority and defaults channels are usually not needed because most packages are in conda-forge nowadays).
tdayris · Oct 13, 2022 · 17e58e6 · 17e58e6
1 parent 9b7c4fe
commit 17e58e6
Show file tree

Hide file tree

Showing 10 changed files with 95 additions and 0 deletions.
diff --git a/bio/bazam/environment.yaml b/bio/bazam/environment.yaml
@@ -0,0 +1,7 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - bazam =1.0
+  - snakemake-wrapper-utils ==0.5
diff --git a/bio/bazam/meta.yaml b/bio/bazam/meta.yaml
@@ -0,0 +1,10 @@
+name: "bazam"
+description: Bazam is a smarter way to realign reads from one genome to another. If you've tried to use Picard SAMtoFASTQ or samtools bam2fq before and ended up unsatisfied with complicated, long running inefficient pipelines, bazam might be what you wanted. Bazam will output FASTQ in a form that can stream directly into common aligners such as BWA or Bowtie2, so that you can quickly and easily realign reads without extraction to any intermediate format. Bazam can target a specific region of the genome, specified as a region or a gene name if you prefer.
+url: https://github.com/ssadedin/bazam
+authors:
+  - Christopher Schröder
+input:
+  - BAM/CRAM file
+  - reference genome
+output:
+  - fastq file
diff --git a/bio/bazam/test/Snakefile b/bio/bazam/test/Snakefile
@@ -0,0 +1,28 @@
+rule bazam_interleaved:
+    input:
+        bam="mapped/{sample}.bam",
+        bai="mapped/{sample}.bam.bai",
+    output:
+        reads="results/reads/{sample}.fastq.gz",
+    resources:
+        mem_mb=12000,
+    log:
+        "logs/bazam/{sample}.log",
+    wrapper:
+        "master/bio/bazam"
+
+
+rule bazam_separated:
+    input:
+        bam="mapped/{sample}.cram",
+        bai="mapped/{sample}.cram.crai",
+        reference="genome.fasta",
+    output:
+        r1="results/reads/{sample}.r1.fastq.gz",
+        r2="results/reads/{sample}.r2.fastq.gz",
+    resources:
+        mem_mb=12000,
+    log:
+        "logs/bazam/{sample}.log",
+    wrapper:
+        "master/bio/bazam"
diff --git a/bio/bazam/test/genome.fasta b/bio/bazam/test/genome.fasta
@@ -0,0 +1,2 @@
+>Sheila
+GCTAGCTCAGAAAAAAAAAA
diff --git a/bio/bazam/test/mapped/a.bam b/bio/bazam/test/mapped/a.bam
diff --git a/bio/bazam/test/mapped/a.bam.bai b/bio/bazam/test/mapped/a.bam.bai
diff --git a/bio/bazam/test/mapped/a.cram b/bio/bazam/test/mapped/a.cram
diff --git a/bio/bazam/test/mapped/a.cram.crai b/bio/bazam/test/mapped/a.cram.crai
diff --git a/bio/bazam/wrapper.py b/bio/bazam/wrapper.py
@@ -0,0 +1,34 @@
+__author__ = "Christopher Schröder"
+__copyright__ = "Copyright 2022, Christopher Schröder"
+__email__ = "[email protected]"
+__license__ = "MIT"
+
+from snakemake.shell import shell
+from snakemake_wrapper_utils.java import get_java_opts
+
+java_opts = get_java_opts(snakemake)
+
+log = snakemake.log_fmt_shell(stdout=False, stderr=True)
+bam = snakemake.input.bam
+
+# Extra parameters default value is an empty string
+extra = snakemake.params.get("extra", "")
+
+if bam.endswith(".cram"):
+    if not (reference := snakemake.input.get("reference", "")):
+        raise ValueError(
+            "input 'reference' is required when working with CRAM input files"
+        )
+    reference_cmd = f"-Dsamjdk.reference_fasta={reference}"
+else:
+    reference_cmd = ""
+
+# Extract arguments.
+if reads := snakemake.output.get("reads", ""):
+    out_cmd = f"-o {reads}"
+elif (r1 := snakemake.output.get("r1", "")) and (r2 := snakemake.output.get("r2", "")):
+    out_cmd = f"-r1 {r1} -r2 {r2}"
+else:
+    raise ValueError("either 'reads' or 'r1' and 'r2' must be specified in output")
+
+shell("(bazam {java_opts} {reference_cmd} {extra} -bam {bam} {out_cmd}) {log}")
diff --git a/test.py b/test.py
@@ -4853,4 +4853,18 @@ def test_calc_consensus_reads():
     run(
         "meta/bio/calc_consensus_reads/",
         ["snakemake", "--cores", "1", "--use-conda", "-F", "results/consensus/sampleA.bam"],
+    )
+
+@skip_if_not_modified
+def test_bazam_interleaved():
+    run(
+        "bio/bazam",
+        ["snakemake", "--cores", "1", "--use-conda", "-F", "results/reads/a.fastq.gz"],
+    )
+
+@skip_if_not_modified
+def test_bazam_separated():
+    run(
+        "bio/bazam",
+        ["snakemake", "--cores", "1", "--use-conda", "-F", "results/reads/a.r1.fastq.gz"],
     )