From e10ab571a63b36ce1295e50b6f6b7e4742e0beae Mon Sep 17 00:00:00 2001 From: "Filipe G. Vieira" <1151762+fgvieira@users.noreply.github.com> Date: Tue, 28 May 2024 14:02:46 +0200 Subject: [PATCH] fix: issue #366 and #2649 (#2928) Allow for custom URLs (fix issues #366 and #2649). ### QC * [x] I confirm that: For all wrappers added by this PR, * there is a test case which covers any introduced changes, * `input:` and `output:` file paths in the resulting rule can be changed arbitrarily, * either the wrapper can only use a single core, or the example rule contains a `threads: x` statement with `x` being a reasonable default, * rule names in the test case are in [snake_case](https://en.wikipedia.org/wiki/Snake_case) and somehow tell what the rule is about or match the tool's purpose or name (e.g., `map_reads` for a step that maps reads), * all `environment.yaml` specifications follow [the respective best practices](https://stackoverflow.com/a/64594513/2352071), * the `environment.yaml` pinning has been updated by running `snakedeploy pin-conda-envs environment.yaml` on a Linux machine, * wherever possible, command line arguments are inferred and set automatically (e.g. 
based on file extensions in `input:` or `output:`), * all fields of the example rules in the `Snakefile`s and their entries are explained via comments (`input:`/`output:`/`params:` etc.), * `stderr` and/or `stdout` are logged correctly (`log:`), depending on the wrapped tool, * temporary files are either written to a unique hidden folder in the working directory, or (better) stored where the Python function `tempfile.gettempdir()` points to (see [here](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir); this also means that using any Python `tempfile` default behavior works), * the `meta.yaml` contains a link to the documentation of the respective tool or command, * `Snakefile`s pass the linting (`snakemake --lint`), * `Snakefile`s are formatted with [snakefmt](https://github.com/snakemake/snakefmt), * Python wrapper scripts are formatted with [black](https://black.readthedocs.io). * Conda environments use a minimal number of channels, in the recommended order. E.g. for bioconda, use (conda-forge, bioconda, nodefaults), as conda-forge should have highest priority and defaults channels are usually not needed because most packages are in conda-forge nowadays. 
--- bio/reference/ensembl-annotation/meta.yaml | 2 ++ bio/reference/ensembl-annotation/test/Snakefile | 2 ++ bio/reference/ensembl-annotation/wrapper.py | 13 ++----------- bio/reference/ensembl-sequence/meta.yaml | 4 ++++ bio/reference/ensembl-sequence/test/Snakefile | 2 ++ bio/reference/ensembl-sequence/wrapper.py | 3 ++- bio/reference/ensembl-variation/meta.yaml | 4 ++++ bio/reference/ensembl-variation/test/Snakefile | 2 ++ bio/reference/ensembl-variation/wrapper.py | 10 +++------- bio/vep/cache/meta.yaml | 7 +++++++ bio/vep/cache/test/Snakefile | 15 +++++++++++++++ bio/vep/cache/wrapper.py | 11 ++++++----- test.py | 5 +++++ 13 files changed, 56 insertions(+), 24 deletions(-) diff --git a/bio/reference/ensembl-annotation/meta.yaml b/bio/reference/ensembl-annotation/meta.yaml index be0a0bb69b..b8fd0924a7 100644 --- a/bio/reference/ensembl-annotation/meta.yaml +++ b/bio/reference/ensembl-annotation/meta.yaml @@ -4,3 +4,5 @@ authors: - Johannes Köster output: - Ensemble GTF or GFF3 anotation file +params: + - url: URL from where to download cache data (optional; by default is ``ftp://ftp.ensembl.org/pub``) diff --git a/bio/reference/ensembl-annotation/test/Snakefile b/bio/reference/ensembl-annotation/test/Snakefile index fed87b2ee6..3a30ca70bd 100644 --- a/bio/reference/ensembl-annotation/test/Snakefile +++ b/bio/reference/ensembl-annotation/test/Snakefile @@ -25,6 +25,8 @@ rule get_annotation_gz: # branch="plants", # optional: specify branch log: "logs/get_annotation.log", + params: + url="http://ftp.ensembl.org/pub", cache: "omit-software" # save space and time with between workflow caching (see docs) wrapper: "master/bio/reference/ensembl-annotation" diff --git a/bio/reference/ensembl-annotation/wrapper.py b/bio/reference/ensembl-annotation/wrapper.py index 2f1b78c2a6..c3d655cbb2 100644 --- a/bio/reference/ensembl-annotation/wrapper.py +++ b/bio/reference/ensembl-annotation/wrapper.py @@ -48,17 +48,8 @@ ) -url = 
"ftp://ftp.ensembl.org/pub/{branch}release-{release}/{out_fmt}/{species}/{species_cap}.{build}.{gtf_release}.{flavor}{suffix}".format( - release=release, - gtf_release=gtf_release, - build=build, - species=species, - out_fmt=out_fmt, - species_cap=species.capitalize(), - suffix=suffix, - flavor=flavor, - branch=branch, -) +url = snakemake.params.get("url", "ftp://ftp.ensembl.org/pub") +url = f"{url}/{branch}release-{release}/{out_fmt}/{species}/{species.capitalize()}.{build}.{gtf_release}.{flavor}{suffix}" try: diff --git a/bio/reference/ensembl-sequence/meta.yaml b/bio/reference/ensembl-sequence/meta.yaml index 189912a20e..20c769a0d1 100644 --- a/bio/reference/ensembl-sequence/meta.yaml +++ b/bio/reference/ensembl-sequence/meta.yaml @@ -2,3 +2,7 @@ name: ensembl-sequence description: Download sequences (e.g. genome) from ENSEMBL FTP servers, and store them in a single .fasta file. authors: - Johannes Köster +output: + - fasta file +params: + - url: URL from where to download cache data (optional; by default is ``ftp://ftp.ensembl.org/pub``) diff --git a/bio/reference/ensembl-sequence/test/Snakefile b/bio/reference/ensembl-sequence/test/Snakefile index a8227f4cdd..fec1c746a4 100644 --- a/bio/reference/ensembl-sequence/test/Snakefile +++ b/bio/reference/ensembl-sequence/test/Snakefile @@ -25,6 +25,8 @@ rule get_single_chromosome: # branch="plants", # optional: specify branch log: "logs/get_genome.log", + params: + url="http://ftp.ensembl.org/pub", cache: "omit-software" # save space and time with between workflow caching (see docs) wrapper: "master/bio/reference/ensembl-sequence" diff --git a/bio/reference/ensembl-sequence/wrapper.py b/bio/reference/ensembl-sequence/wrapper.py index df9a6eef69..cb2956a6c0 100644 --- a/bio/reference/ensembl-sequence/wrapper.py +++ b/bio/reference/ensembl-sequence/wrapper.py @@ -50,8 +50,9 @@ "invalid datatype, to select a single chromosome the datatype must be dna" ) +url = snakemake.params.get("url", "ftp://ftp.ensembl.org/pub") 
spec = spec.format(build=build, release=release) -url_prefix = f"ftp://ftp.ensembl.org/pub/{branch}release-{release}/fasta/{species}/{datatype}/{species.capitalize()}.{spec}" +url_prefix = f"{url}/{branch}release-{release}/fasta/{species}/{datatype}/{species.capitalize()}.{spec}" success = False for suffix in suffixes: diff --git a/bio/reference/ensembl-variation/meta.yaml b/bio/reference/ensembl-variation/meta.yaml index b562872809..3f1a261ae4 100644 --- a/bio/reference/ensembl-variation/meta.yaml +++ b/bio/reference/ensembl-variation/meta.yaml @@ -2,3 +2,7 @@ name: ensembl-variation description: Download known genomic variants from ENSEMBL FTP servers, and store them in a single .vcf.gz file. authors: - Johannes Köster +output: + - VCF file +params: + - url: URL from where to download cache data (optional; by default is ``ftp://ftp.ensembl.org/pub``) diff --git a/bio/reference/ensembl-variation/test/Snakefile b/bio/reference/ensembl-variation/test/Snakefile index 27594273e5..9189f7de19 100644 --- a/bio/reference/ensembl-variation/test/Snakefile +++ b/bio/reference/ensembl-variation/test/Snakefile @@ -12,6 +12,8 @@ rule get_variation: type="all", # one of "all", "somatic", "structural_variation" # chromosome="21", # optionally constrain to chromosome, only supported for homo_sapiens # branch="plants", # optional: specify branch + params: + url="http://ftp.ensembl.org/pub", log: "logs/get_variation.log", cache: "omit-software" # save space and time with between workflow caching (see docs) diff --git a/bio/reference/ensembl-variation/wrapper.py b/bio/reference/ensembl-variation/wrapper.py index ee179c51c1..4e21292886 100644 --- a/bio/reference/ensembl-variation/wrapper.py +++ b/bio/reference/ensembl-variation/wrapper.py @@ -62,16 +62,12 @@ species_filename = species if release >= 91 else species.capitalize() +url = snakemake.params.get("url", "ftp://ftp.ensembl.org/pub") urls = [ - 
"ftp://ftp.ensembl.org/pub/{branch}release-{release}/variation/vcf/{species}/{species_filename}{suffix}.vcf.gz".format( - release=release, - species=species, - suffix=suffix, - species_filename=species_filename, - branch=branch, - ) + f"{url}/{branch}release-{release}/variation/vcf/{species}/{species_filename}{suffix}.vcf.gz" for suffix in suffixes ] + names = [os.path.basename(url) for url in urls] try: diff --git a/bio/vep/cache/meta.yaml b/bio/vep/cache/meta.yaml index a743fdd138..220cec2c1f 100644 --- a/bio/vep/cache/meta.yaml +++ b/bio/vep/cache/meta.yaml @@ -3,3 +3,10 @@ description: Download VEP cache for given species, build and release. url: http://www.ensembl.org/info/docs/tools/vep/index.html authors: - Johannes Köster +output: + - directory to store the VEP cache +params: + - url: URL from where to download cache data (optional; by default is ``ftp://ftp.ensembl.org/pub``) + - species: species to download cache data + - build: build to download cache data + - release: release to download cache data diff --git a/bio/vep/cache/test/Snakefile b/bio/vep/cache/test/Snakefile index 4f6285893c..c2666c9547 100644 --- a/bio/vep/cache/test/Snakefile +++ b/bio/vep/cache/test/Snakefile @@ -10,3 +10,18 @@ rule get_vep_cache: cache: "omit-software" # save space and time with between workflow caching (see docs) wrapper: "master/bio/vep/cache" + + +rule get_vep_cache_ebi: + output: + directory("resources/vep/cache_ebi"), + params: + url="ftp://ftp.ebi.ac.uk/ensemblgenomes/pub/plants", + species="cyanidioschyzon_merolae", + build="ASM9120v1", + release="58", + log: + "logs/vep/cache_ebi.log", + cache: "omit-software" # save space and time with between workflow caching (see docs) + wrapper: + "master/bio/vep/cache" diff --git a/bio/vep/cache/wrapper.py b/bio/vep/cache/wrapper.py index 543f1b2616..291d816121 100644 --- a/bio/vep/cache/wrapper.py +++ b/bio/vep/cache/wrapper.py @@ -9,24 +9,25 @@ extra = snakemake.params.get("extra", "") +log = 
snakemake.log_fmt_shell(stdout=True, stderr=True) + try: release = int(snakemake.params.release) except ValueError: raise ValueError("The parameter release is supposed to be an integer.") + with tempfile.TemporaryDirectory() as tmpdir: # We download the cache tarball manually because vep_install does not consider proxy settings (in contrast to curl). # See https://github.com/bcbio/bcbio-nextgen/issues/1080 - vep_dir = "vep" if release >= 97 else "VEP" + cache_url = snakemake.params.get("url", "ftp://ftp.ensembl.org/pub") cache_tarball = ( f"{snakemake.params.species}_vep_{release}_{snakemake.params.build}.tar.gz" ) - log = snakemake.log_fmt_shell(stdout=True, stderr=True) + vep_dir = "vep" if snakemake.params.get("url") or release >= 97 else "VEP" shell( - "curl -L ftp://ftp.ensembl.org/pub/release-{snakemake.params.release}/" - "variation/{vep_dir}/{cache_tarball} " - "-o {tmpdir}/{cache_tarball} {log}" + "curl -L {cache_url}/release-{release}/variation/{vep_dir}/{cache_tarball} -o {tmpdir}/{cache_tarball} {log}" ) log = snakemake.log_fmt_shell(stdout=True, stderr=True, append=True) diff --git a/test.py b/test.py index cab6ba917b..6e8aa7e5fe 100644 --- a/test.py +++ b/test.py @@ -5965,6 +5965,11 @@ def test_vep_cache(): ["snakemake", "--cores", "1", "resources/vep/cache", "--use-conda", "-F"], ) + run( + "bio/vep/cache", + ["snakemake", "--cores", "1", "resources/vep/cache_ebi", "--use-conda", "-F"], + ) + @skip_if_not_modified def test_vep_plugins():