Commit 35676e4

feat: add flag to set snakemake rerun triggers (#27)
* feat: add flag to set snakemake rerun triggers
* chore: remove references to old coassembly methods
* fix: switch to space separated trigger setting
* fix: fix delimiter for csv files
1 parent 17a982b commit 35676e4

7 files changed: +148 -200 lines changed

metamorph (+57 -45)
@@ -29,37 +29,36 @@ USAGE:
     $ metamorph <command> [OPTIONS]

 EXAMPLES:
-    co-assembly dna-only:
-      $ metamorph run --coa --input *.R?.fastq.gz --output output
-      $ metamorph run -C --input *.R?.fastq.gz --output output
-
-    per-sample assembly dna-only:
-      $ metamorph run --input *.R?.fastq.gz --output output
-
-    co-assembly rna & dna:
-      $ metamorph run --coa --input *.R?.fastq.gz --rna rna/*.R?.fastq.gz --output output
-      $ metamorph run -C --input *.R?.fastq.gz --rna rna/*.R?.fastq.gz --output output
-
-    per-sample assembly rna & dna:
-      $ metamorph run --input *.R?.fastq.gz --rna rna/*.R?.fastq.gz --output output
-"""
+    dna-only:
+      $ metamorph run --input <sample_sheet> --output <output_dir>
+      sample sheet:
+        ________
+        | DNA |
+        | -------
+        pair1 | path |
+        pair2 | path |
+        --------
+
+    rna & dna:
+      $ metamorph run --input <sample_sheet> --output <output_dir>
+      sample sheet:
+        ________________
+        | DNA | RNA |
+        |---------------|
+        pair1 | path | path |
+        pair2 | path | path |
+        ---------------
+"""
 from __future__ import print_function
 from datetime import timezone, datetime
 import argparse, sys, os, subprocess, json, textwrap


 # Local imports
 from src import version
-from src.run import init, setup, bind, dryrun, runner, valid_input
-from src.utils import (
-    Colors,
-    err,
-    exists,
-    fatal,
-    check_cache,
-    require,
-    permissions
-)
+from src.run import init, setup, bind, dryrun, runner
+from src.utils import Colors, err, exists, fatal, check_cache, require, \
+    permissions, valid_trigger, valid_input


 # Pipeline Metadata
@@ -156,7 +155,6 @@ def run(sub_args):
         config = config
     )
     config['bindpaths'] = bindpaths
-    config['coassembly'] = False

     # Step 4b. Setup assembly mode
     # modes: 0 - megahit + metaspades assembly
@@ -190,6 +188,7 @@ def run(sub_args):
     if 'databases' in config:
         bindpaths.extend([mount['from']+':'+mount['to']+':'+mount['mode'] for mount in config['databases']])

+    triggers = sub_args.triggers if sub_args.triggers else None
     mjob = runner(mode = sub_args.mode,
                   outdir = sub_args.output,
                   alt_cache = sub_args.singularity_cache,
@@ -199,6 +198,7 @@
                   logger = logfh,
                   additional_bind_paths = ",".join(bindpaths),
                   tmp_dir = sub_args.tmp_dir,
+                  triggers = triggers
     )

     # Step 6. Wait for subprocess to complete,
@@ -391,7 +391,7 @@ def parsed_arguments(name, description):
        module load singularity snakemake

        # Step 2A.) Dry-run the pipeline
-       ./{0} run --input .tests/*.R?.fastq.gz \\
+       ./{0} run --input <sample_sheet> \\
                  --output /data/$USER/output \\
                  --mode slurm \\
                  --dry-run
@@ -400,30 +400,36 @@
        # The slurm mode will submit jobs to
        # the cluster. It is recommended running
        # the pipeline in this mode.
-       ./{0} run --input .tests/*.R?.fastq.gz \\
+       ./{0} run --input <sample_sheet> \\
                  --output /data/$USER/output \\
                  --mode slurm

        # Step 3B.) Run the {0} pipeline in co-assembly fashion
        # with slurm
-       ./{0} run --coa --input .tests/*.R?.fastq.gz \\
+       ./{0} run --input .tests/*.R?.fastq.gz \\
                  --output /data/$USER/output \\
                  --mode slurm

        {2}{3}EXAMPLES:{4}
-         co-assembly dna-only:
-           $ metamorph run --coa --input *.R?.fastq.gz --output output
-           $ metamorph run -C --input *.R?.fastq.gz --output output
-
-         per-sample assembly dna-only:
-           $ metamorph run --input *.R?.fastq.gz --output output
-
-         co-assembly rna & dna:
-           $ metamorph run --coa --input *.R?.fastq.gz --rna rna/*.R?.fastq.gz --output output
-           $ metamorph run -C --input *.R?.fastq.gz --rna rna/*.R?.fastq.gz --output output
-
-         per-sample assembly rna & dna:
-           $ metamorph run --input *.R?.fastq.gz --rna rna/*.R?.fastq.gz --output output
+         dna-only:
+           $ metamorph run --input <sample_sheet> --output <output_dir>
+           sample sheet:
+             ________
+             | DNA |
+             | -------
+             pair1 | path |
+             pair2 | path |
+             --------
+
+         rna & dna:
+           $ metamorph run --input <sample_sheet> --output <output_dir>
+           sample sheet:
+             ________________
+             | DNA | RNA |
+             |---------------|
+             pair1 | path | path |
+             pair2 | path | path |
+             ---------------


        {2}{3}VERSION:{4}
@@ -466,9 +472,6 @@ def parsed_arguments(name, description):
        action='help',
        help=argparse.SUPPRESS
    )
-
-    # Analysis options
-    # ... add here

    # Orchestration Options
    # Execution Method, run locally
@@ -492,6 +495,16 @@ def parsed_arguments(name, description):
        help = argparse.SUPPRESS
    )

+    # Snakemake rerun triggers
+    subparser_run.add_argument(
+        '-t', '--triggers',
+        type = valid_trigger,
+        required = False,
+        default = None,
+        nargs="*",
+        help = argparse.SUPPRESS
+    )
+
    # Dry-run
    # Do not execute the workflow,
    # prints what steps remain
@@ -744,7 +757,6 @@ def parsed_arguments(name, description):


 def main():
-
     # Sanity check for usage
     if len(sys.argv) == 1:
         # Nothing was provided
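
Note on the new option: '--triggers' accepts one or more space-separated values (for example, ./metamorph run --input samples.tsv --output output --triggers mtime params, where samples.tsv stands in for a real sample sheet), and each value is passed through valid_trigger, now imported from src.utils. That module is among the four changed files not shown on this page, so the following is only a sketch of what such a validator might look like; the allowed set is assumed from the default list in src/run.sh.

import argparse

def valid_trigger(value):
    """Hypothetical argparse `type=` callable for a single rerun trigger.

    Sketch only: the real implementation is in src/utils.py and is not
    visible in this commit view. With nargs="*", argparse calls the type
    callable once per supplied value.
    """
    # Assumed allowed set, mirroring run.sh's default:
    # 'code params software_env input mtime'
    allowed = ('code', 'params', 'software_env', 'input', 'mtime')
    if value not in allowed:
        raise argparse.ArgumentTypeError(
            f"Invalid rerun trigger '{value}'; expected one of: {', '.join(allowed)}"
        )
    return value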

src/run.py (+11 -54)
@@ -5,7 +5,7 @@
 from __future__ import print_function
 from shutil import copytree
 from pathlib import Path
-from csv import DictReader, Sniffer
+
 import os, re, json, sys, subprocess, argparse


@@ -664,53 +664,6 @@ def dryrun(outdir, config='config.json', snakefile=os.path.join('workflow', 'Sna
     return dryrun_output


-def valid_input(sheet):
-    """
-    Valid sample sheets should contain two columns: "DNA" and "RNA"
-
-    _________________
-    | DNA | RNA |
-    |---------------|
-    pair1 | path | path |
-    pair2 | path | path |
-    """
-    # check file permissions
-    sheet = os.path.abspath(sheet)
-    if not os.path.exists(sheet):
-        raise argparse.ArgumentTypeError(f'Sample sheet path {sheet} does not exist!')
-    if not os.access(sheet, os.R_OK):
-        raise argparse.ArgumentTypeError(f"Path `{sheet}` exists, but cannot read path due to permissions!")
-
-    # check format to make sure it's correct
-    if sheet.endswith('.tsv') or sheet.endswith('.txt'):
-        delim = '\t'
-    elif sheet.endswith('.csv'):
-        delim = '\t'
-
-    rdr = DictReader(open(sheet, 'r'), delimiter=delim)
-
-    if 'DNA' not in rdr.fieldnames:
-        raise argparse.ArgumentTypeError("Sample sheet does not contain `DNA` column")
-    if 'RNA' not in rdr.fieldnames:
-        print("-- Running in DNA only mode --")
-    else:
-        print("-- Running in paired DNA & RNA mode --")
-
-    data = [row for row in rdr]
-    RNA_included = False
-    for row in data:
-        row['DNA'] = os.path.abspath(row['DNA'])
-        if not os.path.exists(row['DNA']):
-            raise argparse.ArgumentTypeError(f"Sample sheet path `{row['DNA']}` does not exist")
-        if 'RNA' in row and not row['RNA'] in ('', None, 'None'):
-            RNA_included = True
-            row['RNA'] = os.path.abspath(row['RNA'])
-            if not os.path.exists(row['RNA']):
-                raise argparse.ArgumentTypeError(f"Sample sheet path `{row['RNA']}` does not exist")
-
-    return data, RNA_included
-
-
 try:
     __job_name__ = 'metamorph_' + os.getlogin() + ':master'
 except OSError:
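
The removed valid_input() above carried the bug named in the commit message ('fix delimiter for csv files'): both the .tsv/.txt branch and the .csv branch set delim = '\t'. The function has been moved out of src/run.py (the metamorph script now imports its validators from src.utils), but the relocated code is not among the three files shown here, so the corrected delimiter choice below is only an assumption:

def sheet_delimiter(sheet):
    """Hypothetical sketch: pick the field delimiter from the sheet's extension."""
    if sheet.endswith('.tsv') or sheet.endswith('.txt'):
        return '\t'
    elif sheet.endswith('.csv'):
        return ','   # the removed code returned '\t' here as well
    raise ValueError(f"Unrecognized sample sheet extension: {sheet}")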
@@ -726,6 +679,7 @@ def runner(
     threads=2,
     jobname=__job_name__,
     submission_script='run.sh',
+    triggers=None,
     tmp_dir = '/lscratch/$SLURM_JOB_ID/'
     ):
     """Runs the pipeline via selected executor: local or slurm.
@@ -833,11 +787,14 @@
     # --cluster "${CLUSTER_OPTS}" --keep-going --restart-times 3 -j 500 \
     # --rerun-incomplete --stats "$3"/logfiles/runtime_statistics.json \
     # --keep-remote --local-cores 30 2>&1 | tee -a "$3"/logfiles/master.log
-    masterjob = subprocess.Popen([
-        str(submission_script), mode,
-        '-j', jobname, '-b', str(bindpaths),
-        '-o', str(outdir), '-c', str(cache),
-        '-t', "'{}'".format(tmp_dir)
-    ], cwd = outdir, stderr=subprocess.STDOUT, stdout=logger, env=my_env)
+    cmd = [
+        str(submission_script), mode,
+        '-j', jobname, '-b', str(bindpaths),
+        '-o', str(outdir), '-c', str(cache),
+        '-t', "'{}'".format(tmp_dir),
+    ]
+    if triggers:
+        cmd.extend(['-r', ','.join(triggers)])
+    masterjob = subprocess.Popen(cmd, cwd = outdir, stderr=subprocess.STDOUT, stdout=logger, env=my_env)

     return masterjob
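
Taken together with the CLI change in metamorph, the hand-off works like this: argparse collects space-separated trigger values (nargs="*"), runner() re-joins them with commas for run.sh's -r flag, and run.sh turns them back into a space-separated list for Snakemake's --rerun-triggers. A minimal sketch of the Python side of that round trip, using made-up values:

# Values as argparse would deliver them for: --triggers mtime params
triggers = ['mtime', 'params']

# Fragment appended to the run.sh command only when triggers were given
cmd_fragment = ['-r', ','.join(triggers)]   # -> ['-r', 'mtime,params']

# run.sh's parser then converts the comma-separated string back into a
# space-separated list before building `--rerun-triggers ...` (see the
# -r | --triggers case and submit() in the src/run.sh diff below).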

src/run.sh (+20 -4)
@@ -8,7 +8,8 @@ USAGE:
           -o OUTDIR \\
           -j MASTER_JOB_NAME \\
           -b SINGULARITY_BIND_PATHS \\
-          -t TMP_DIR
+          -t TMP_DIR \\
+          -r RERUN_TRIGGERS
 SYNOPSIS:
   This script creates/submits the pipeline's master job to the
   cluster. The master job acts as the pipeline's main controller or
@@ -61,7 +62,12 @@ Required Arguments:
                                  this location. On Biowulf, it should be
                                  set to '/lscratch/\$SLURM_JOBID/'. On FRCE,
                                  this value should be set to the following:
-                                 '/scratch/cluster_scratch/\$USER/'.
+                                 '/scratch/cluster_scratch/\$USER/'.
+  -r, --triggers  [Type: Str]    Snakemake rerun triggers. See
+                                 description of flag '--rerun-triggers', at
+                                 https://snakemake.readthedocs.io/en/stable/executing/cli.html#all-options
+                                 for more details.
+                                 Default: code params software_env input mtime
 OPTIONS:
   -c, --cache     [Type: Path]   Path to singularity cache. If not provided,
                                  the path will default to the current working
@@ -97,6 +103,7 @@ function parser() {
      -t | --tmp-dir) provided "$key" "${2:-}"; Arguments["t"]="$2"; shift; shift;;
      -o | --outdir) provided "$key" "${2:-}"; Arguments["o"]="$2"; shift; shift;;
      -c | --cache) provided "$key" "${2:-}"; Arguments["c"]="$2"; shift; shift;;
+     -r | --triggers) provided "$key" "${2:-}"; Arguments["r"]="${2/,/' '}"; shift; shift;;
      -* | --*) err "Error: Failed to parse unsupported argument: '${key}'."; usage && exit 1;;
      *) err "Error: Failed to parse unrecognized argument: '${key}'. Do any of your inputs have spaces?"; usage && exit 1;;
    esac
@@ -159,6 +166,7 @@ function submit(){
  # INPUT $4 = Singularity Bind paths
  # INPUT $5 = Singularity cache directory
  # INPUT $6 = Temporary directory for output files
+ # INPUT $7 = rerun trigger values

  # Check if singularity and snakemake are in $PATH
  # If not, try to module load singularity as a last resort
@@ -191,6 +199,9 @@
    # --printshellcmds --keep-going --rerun-incomplete
    # --keep-remote --restart-times 3 -j 500 --use-singularity
    # --singularity-args -B {}.format({bindpaths}) --local-cores 24
+   triggers="${7:-'code params software_env input mtime'}"
+   rerun="--rerun-triggers $triggers"
+
    SLURM_DIR="$3/logfiles/slurmfiles"
    CLUSTER_OPTS="sbatch --gres {cluster.gres} --cpus-per-task {cluster.threads} -p {cluster.partition} -t {cluster.time} --mem {cluster.mem} --job-name={params.rname} -e $SLURM_DIR/slurm-%j_{params.rname}.out -o $SLURM_DIR/slurm-%j_{params.rname}.out"
    # Check if NOT running on Biowulf
@@ -228,6 +239,7 @@ snakemake \\
  -s "$3/workflow/Snakefile" \\
  -d "$3" \\
  --use-singularity \\
+ $rerun \\
  --singularity-args "\\-c \\-B '$4'" \\
  --use-envmodules \\
  --verbose \\
@@ -279,9 +291,9 @@ function main(){

  # Parses remaining user provided command-line arguments
  parser "${@:2}" # Remove first item of list
+
  outdir="$(abspath "$(dirname "${Arguments[o]}")")"
  Arguments[o]="${Arguments[o]%/}" # clean outdir path (remove trailing '/')
-
  # Setting defaults for non-required arguments
  # If singularity cache not provided, default to ${outdir}/.singularity
  cache="${Arguments[o]}/.singularity"
@@ -294,7 +306,11 @@ function main(){

  # Run pipeline and submit jobs to cluster using the defined executor
  mkdir -p "${Arguments[o]}/logfiles/"
- job_id=$(submit "${Arguments[e]}" "${Arguments[j]}" "${Arguments[o]}" "${Arguments[b]}" "${Arguments[c]}" "${Arguments[t]}")
+ if [[ ! -v Arguments[r] ]] ; then
+   job_id=$(submit "${Arguments[e]}" "${Arguments[j]}" "${Arguments[o]}" "${Arguments[b]}" "${Arguments[c]}" "${Arguments[t]}")
+ else
+   job_id=$(submit "${Arguments[e]}" "${Arguments[j]}" "${Arguments[o]}" "${Arguments[b]}" "${Arguments[c]}" "${Arguments[t]}" "${Arguments[r]}")
+ fi
  echo -e "[$(date)] Pipeline submitted to cluster.\nMaster Job ID: $job_id"
  echo "${job_id}" > "${Arguments[o]}/logfiles/mjobid.log"
