diff --git a/download-data.sh b/download-data.sh index bdd3239..8f736b0 100644 --- a/download-data.sh +++ b/download-data.sh @@ -14,6 +14,8 @@ export QUIET=${QUIET-0} export VERBOSE=${VERBOSE-0} # Dry-run mode. export DRY_RUN=${DRY_RUN-0} +# No-verify-ssl mode: do not verify SSL certificates. +export NO_VERIFY_SSL=${NO_VERIFY_SSL-0} # Download options: reduced-dev (default), reduced-exomes, full. export DOWNLOAD=${DOWNLOAD-reduced-dev} # Directory for static data. @@ -21,14 +23,27 @@ export STATIC_INFIX=${STATIC_INFIX-varfish-static} # Overall directory prefix. export DIR_PREFIX=${DIR_PREFIX-.dev} # Overall static data directory. -export DATA_DIR=${DATA_DIR-$DIR_PREFIX/volumes/$STATIC_INFIX/data} +export DATA_DIR=${DATA_DIR-$DIR_PREFIX/volumes/$STATIC_DIR/data} # S3 endpoing URL. export S3_ENDPOINT_URL=https://ceph-s3-public.cubi.bihealth.org +# Grep regex expression for downloading data. +export LIST_GREP=${LIST_GREP-} +# Steps to execute +export STEPS=${STEPS-s3_sync,other} + +# Set S5CMD_NO_VERIFY_SSL_ARG based on NO_VERIFY_SSL +if [ "$NO_VERIFY_SSL" -eq 1 ]; then + S5CMD_NO_VERIFY_SSL_ARG="--no-verify-ssl" +else + S5CMD_NO_VERIFY_SSL_ARG="" +fi # -- Versions ----------------------------------------------------------------- # annonars export V_ANNONARS=${V_ANNONARS-0.33.0} +# annonars for annonars/genes +export V_ANNONARS_ANNONARS_GENES=${V_ANNONARS_ANNONARS_GENES-0.34.0} # viguno export V_VIGUNO=${V_VIGUNO-0.2.0} # VarFish Worker @@ -63,6 +78,8 @@ export V_HPO=${V_HPO-20230606} export V_ORPHAPACKETS=${V_ORPHAPACKETS-10.1} # VarFish DB Download Data export V_VARFISHDB=${V_VARFISHDB-20240105} +# VarFish DB Download Data for annonars/genes +export V_VARFISHDB_ANNONARS_GENES=${V_VARFISHDB_ANNONARS_GENES-20240119} # gnomAD exomes GRCh37 export V_GNOMAD_EXOMES_GRCH37=${V_GNOMAD_EXOMES_GRCH37-2.1.1} # gnomAD exomes GRCh38 @@ -200,10 +217,11 @@ run() # -- Downloading -------------------------------------------------------------- -log_info "Download data ..." +if [[ "${STEPS}" = *s3_sync* ]]; then + log_info "Download data ..." -# First, write out folders that we want to download. -cat </tmp/download-list.txt + # First, write out folders that we want to download. + cat </tmp/download-list.raw.txt annonars/cadd-grch37-$V_CADD+$V_ANNONARS annonars/cadd-grch38-$V_CADD+$V_ANNONARS annonars/cons-grch37-$V_UCSC_CONS_37+$V_ANNONARS @@ -216,7 +234,7 @@ annonars/dbsnp-grch37-$V_DBSNP+$V_ANNONARS annonars/dbsnp-grch38-$V_DBSNP+$V_ANNONARS annonars/functional-grch37-$V_REFSEQ_GRCH37+$V_ANNONARS annonars/functional-grch38-$V_REFSEQ_GRCH38+$V_ANNONARS -annonars/genes-$V_ACMG_SF+$V_GNOMAD_CONSTRAINTS+$V_DBNSFP_NO_SUFFIX+$V_HPO+$V_ORPHAPACKETS+$V_VARFISHDB+$V_ANNONARS +annonars/genes-$V_ACMG_SF+$V_GNOMAD_CONSTRAINTS+$V_DBNSFP_NO_SUFFIX+$V_HPO_ANNONARS_GENES+$V_VARFISHDB_ANNONARS_GENES+$V_ANNONARS_ANNONARS_GENES annonars/gnomad-exomes-grch37-$V_GNOMAD_EXOMES_GRCH37+$V_ANNONARS annonars/gnomad-exomes-grch38-$V_GNOMAD_EXOMES_GRCH38+$V_ANNONARS annonars/gnomad-genomes-grch37-$V_GNOMAD_EXOMES_GRCH37+$V_ANNONARS @@ -238,373 +256,378 @@ tracks worker viguno/hpo-$V_HPO+$V_VIGUNO EOF -# Create download directory. -mkdir -p $DATA_DIR/download -# Download each entry from download list. Note that we support commenting -# out lines with a leading "#". -grep -v ^# /tmp/download-list.txt >/tmp/download-list.nocomment.txt -while read -r line; do - # Create the download directory. - run mkdir -p $DATA_DIR/download/$line - # Actually download the data. - log_info "s3://varfish-public/$(prefix_for $line)/$line/* -> $DATA_DIR/download/$line" - run s5cmd \ - --endpoint-url=$S3_ENDPOINT_URL \ - --no-sign-request \ - --no-verify-ssl \ - sync \ - "s3://varfish-public/$(prefix_for $line)/$line/*" \ - $DATA_DIR/download/$line \ - &> >(tee /tmp/download.stderr >&2) - grep ^ERROR /tmp/download.stderr >/dev/null && exit 1 -done /tmp/download-list.txt fi - done -done - -log_info "... done setting up symlink structure." - -log_info "- mehari transcripts" - -mkdir -p $DATA_DIR/download/mehari-data-txs-grch3{7,8} - -for ext in .zst .zst.sha256 .zst.report .zst.report.sha256; do - for release in grch37 grch38; do - wget -q -c -O $DATA_DIR/download/mehari-data-txs-$release/mehari-data-txs-$release-$V_MEHARI_TXS.bin$ext \ - https://github.com/varfish-org/mehari-data-tx/releases/download/v$V_MEHARI_TXS/mehari-data-txs-$release-$V_MEHARI_TXS.bin$ext - done -done - -for release in grch37 grch38; do - rm -f $DATA_DIR/mehari/$release/txs.bin.zst - ln -sr $DATA_DIR/download/mehari-data-txs-$release/mehari-data-txs-$release-$V_MEHARI_TXS.bin.zst \ - $DATA_DIR/mehari/$release/txs.bin.zst -done - -log_info "- clinvar" - -wget -q -c -O /tmp/annonars-clinvar-minimal-grch37-$V_ANNONARS_DATA_CLINVAR_CLINVAR+$V_ANNONARS_DATA_CLINVAR_ANNONARS.tar.gz \ - https://github.com/varfish-org/annonars-data-clinvar/releases/download/annonars-data-clinvar-$V_ANNONARS_DATA_CLINVAR_CLINVAR/annonars-clinvar-minimal-grch37-$V_ANNONARS_DATA_CLINVAR_CLINVAR+$V_ANNONARS_DATA_CLINVAR_ANNONARS.tar.gz -wget -q -c -O /tmp/annonars-clinvar-minimal-grch38-$V_ANNONARS_DATA_CLINVAR_CLINVAR+$V_ANNONARS_DATA_CLINVAR_ANNONARS.tar.gz \ - https://github.com/varfish-org/annonars-data-clinvar/releases/download/annonars-data-clinvar-$V_ANNONARS_DATA_CLINVAR_CLINVAR/annonars-clinvar-minimal-grch38-$V_ANNONARS_DATA_CLINVAR_CLINVAR+$V_ANNONARS_DATA_CLINVAR_ANNONARS.tar.gz - -tar -C $DATA_DIR/download/annonars/ \ - -xf /tmp/annonars-clinvar-minimal-grch37-$V_ANNONARS_DATA_CLINVAR_CLINVAR+$V_ANNONARS_DATA_CLINVAR_ANNONARS.tar.gz -tar -C $DATA_DIR/download/annonars \ - -xf /tmp/annonars-clinvar-minimal-grch38-$V_ANNONARS_DATA_CLINVAR_CLINVAR+$V_ANNONARS_DATA_CLINVAR_ANNONARS.tar.gz - -rm -f $DATA_DIR/annonars/grch37/clinvar -ln -sr $DATA_DIR/download/annonars/annonars-clinvar-minimal-grch37-$V_ANNONARS_DATA_CLINVAR_CLINVAR+$V_ANNONARS_DATA_CLINVAR_ANNONARS \ - $DATA_DIR/annonars/grch37/clinvar -rm -f $DATA_DIR/annonars/grch38/clinvar -ln -sr $DATA_DIR/download/annonars/annonars-clinvar-minimal-grch38-$V_ANNONARS_DATA_CLINVAR_CLINVAR+$V_ANNONARS_DATA_CLINVAR_ANNONARS \ - $DATA_DIR/annonars/grch38/clinvar - -wget -q -c -O /tmp/annonars-clinvar-genes-$V_ANNONARS_DATA_CLINVAR_CLINVAR+$V_ANNONARS_DATA_CLINVAR_ANNONARS.tar.gz \ - https://github.com/varfish-org/annonars-data-clinvar/releases/download/annonars-data-clinvar-$V_ANNONARS_DATA_CLINVAR_CLINVAR/annonars-clinvar-genes-$V_ANNONARS_DATA_CLINVAR_CLINVAR+$V_ANNONARS_DATA_CLINVAR_ANNONARS.tar.gz -tar -C $DATA_DIR/download/annonars \ - -xf /tmp/annonars-clinvar-genes-$V_ANNONARS_DATA_CLINVAR_CLINVAR+$V_ANNONARS_DATA_CLINVAR_ANNONARS.tar.gz - -rm -f $DATA_DIR/annonars/clinvar-genes -ln -sr $DATA_DIR/download/annonars/annonars-clinvar-genes-$V_ANNONARS_DATA_CLINVAR_CLINVAR+$V_ANNONARS_DATA_CLINVAR_ANNONARS \ - $DATA_DIR/annonars/clinvar-genes - -wget -q -c -O /tmp/annonars-clinvar-sv-grch37-$V_ANNONARS_DATA_CLINVAR_CLINVAR+$V_ANNONARS_DATA_CLINVAR_ANNONARS.tar.gz \ - https://github.com/varfish-org/annonars-data-clinvar/releases/download/annonars-data-clinvar-$V_ANNONARS_DATA_CLINVAR_CLINVAR/annonars-clinvar-sv-grch37-$V_ANNONARS_DATA_CLINVAR_CLINVAR+$V_ANNONARS_DATA_CLINVAR_ANNONARS.tar.gz -wget -q -c -O /tmp/annonars-clinvar-sv-grch38-$V_ANNONARS_DATA_CLINVAR_CLINVAR+$V_ANNONARS_DATA_CLINVAR_ANNONARS.tar.gz \ - https://github.com/varfish-org/annonars-data-clinvar/releases/download/annonars-data-clinvar-$V_ANNONARS_DATA_CLINVAR_CLINVAR/annonars-clinvar-sv-grch38-$V_ANNONARS_DATA_CLINVAR_CLINVAR+$V_ANNONARS_DATA_CLINVAR_ANNONARS.tar.gz -tar -C $DATA_DIR/download/annonars/ \ - -xf /tmp/annonars-clinvar-sv-grch37-$V_ANNONARS_DATA_CLINVAR_CLINVAR+$V_ANNONARS_DATA_CLINVAR_ANNONARS.tar.gz -tar -C $DATA_DIR/download/annonars/ \ - -xf /tmp/annonars-clinvar-sv-grch38-$V_ANNONARS_DATA_CLINVAR_CLINVAR+$V_ANNONARS_DATA_CLINVAR_ANNONARS.tar.gz -rm -f $DATA_DIR/annonars/grch3{7,8}/clinvar-sv -ln -sr $DATA_DIR/download/annonars/annonars-clinvar-sv-grch37-$V_ANNONARS_DATA_CLINVAR_CLINVAR+$V_ANNONARS_DATA_CLINVAR_ANNONARS \ - $DATA_DIR/annonars/grch37/clinvar-sv -ln -sr $DATA_DIR/download/annonars/annonars-clinvar-sv-grch38-$V_ANNONARS_DATA_CLINVAR_CLINVAR+$V_ANNONARS_DATA_CLINVAR_ANNONARS \ - $DATA_DIR/annonars/grch38/clinvar-sv - -log_info "- dotty" - -mkdir -p $DATA_DIR/download/dotty -pushd $DATA_DIR/download/dotty >/dev/null -wget -q -c \ - https://github.com/SACGF/cdot/releases/download/v$V_DOTTY_CDOT_VERSION/cdot-$V_DOTTY_CDOT_VERSION.ensembl.grch37.json.gz \ - https://github.com/SACGF/cdot/releases/download/v$V_DOTTY_CDOT_VERSION/cdot-$V_DOTTY_CDOT_VERSION.ensembl.grch38.json.gz \ - https://github.com/SACGF/cdot/releases/download/v$V_DOTTY_CDOT_VERSION/cdot-$V_DOTTY_CDOT_VERSION.refseq.grch37.json.gz \ - https://github.com/SACGF/cdot/releases/download/v$V_DOTTY_CDOT_VERSION/cdot-$V_DOTTY_CDOT_VERSION.refseq.grch38.json.gz -wget -q -c \ - https://github.com/varfish-org/dotty/releases/download/v$V_DOTTY_SEQREPO/seqrepo.tar.gz-00 \ - https://github.com/varfish-org/dotty/releases/download/v$V_DOTTY_SEQREPO/seqrepo.tar.gz-01 -cat seqrepo.tar.gz-?? | tar -xzf - -popd >/dev/null - -mkdir -p $DATA_DIR/dotty -rm -f $DATA_DIR/dotty/{*.json.gz,seqrepo} -ln -sr $DATA_DIR/download/dotty/{*.json.gz,seqrepo} \ - $DATA_DIR/dotty - -log_info "- cada-prio" - -mkdir -p $DATA_DIR/download/cada -pushd $DATA_DIR/download/cada >/dev/null -wget -q -c \ - https://github.com/varfish-org/cada-prio-data/releases/download/cada-prio-data-$V_CADA_PRIO_MODEL/cada-prio-model-$V_CADA_PRIO_MODEL+$V_CADA_PRIO_VERSION.tar.gz -tar -xzf cada-prio-model-$V_CADA_PRIO_MODEL+$V_CADA_PRIO_VERSION.tar.gz -popd >/dev/null - -mkdir -p $DATA_DIR/cada -rm -f $DATA_DIR/cada/model - -source_dir="$DATA_DIR/download/cada/cada-prio-model-$V_CADA_PRIO_MODEL+$V_CADA_PRIO_VERSION/model" -for file in "${source_dir}"/*; do - rm -f "$DATA_DIR/cada/$(basename "$file")" - ln -sr "$file" "$DATA_DIR/cada/" -done + # Create download directory. + mkdir -p $DATA_DIR/download + # Download each entry from download list. Note that we support commenting + # out lines with a leading "#". + grep -v ^# /tmp/download-list.txt >/tmp/download-list.nocomment.txt + while read -r line; do + # Create the download directory. + run mkdir -p $DATA_DIR/download/$line + # Actually download the data. + log_info "s3://varfish-public/$(prefix_for $line)/$line/* -> $DATA_DIR/download/$line" + run s5cmd \ + --endpoint-url=$S3_ENDPOINT_URL \ + --no-sign-request \ + $S5CMD_NO_VERIFY_SSL_ARG \ + sync \ + "s3://varfish-public/$(prefix_for $line)/$line/*" \ + $DATA_DIR/download/$line \ + &> >(tee /tmp/download.stderr >&2) + grep ^ERROR /tmp/download.stderr >/dev/null && exit 1 + done /dev/null + wget -q -c \ + https://github.com/SACGF/cdot/releases/download/v$V_DOTTY_CDOT_VERSION/cdot-$V_DOTTY_CDOT_VERSION.ensembl.grch37.json.gz \ + https://github.com/SACGF/cdot/releases/download/v$V_DOTTY_CDOT_VERSION/cdot-$V_DOTTY_CDOT_VERSION.ensembl.grch38.json.gz \ + https://github.com/SACGF/cdot/releases/download/v$V_DOTTY_CDOT_VERSION/cdot-$V_DOTTY_CDOT_VERSION.refseq.grch37.json.gz \ + https://github.com/SACGF/cdot/releases/download/v$V_DOTTY_CDOT_VERSION/cdot-$V_DOTTY_CDOT_VERSION.refseq.grch38.json.gz + wget -q -c \ + https://github.com/bihealth/dotty/releases/download/v$V_DOTTY_SEQREPO/seqrepo.tar.gz-00 \ + https://github.com/bihealth/dotty/releases/download/v$V_DOTTY_SEQREPO/seqrepo.tar.gz-01 + cat seqrepo.tar.gz-?? | tar -xzf - + popd >/dev/null + + mkdir -p $DIR_PREFIX/volumes/$STATIC_DIR/data/dotty + rm -f $DIR_PREFIX/volumes/$STATIC_DIR/data/dotty/{*.json.gz,seqrepo} + ln -sr $DIR_PREFIX/volumes/$STATIC_DIR/data/download/dotty/{*.json.gz,seqrepo} \ + $DIR_PREFIX/volumes/$STATIC_DIR/data/dotty + + log_info "- cada-prio" + + mkdir -p $DIR_PREFIX/volumes/$STATIC_DIR/data/download/cada + pushd $DIR_PREFIX/volumes/$STATIC_DIR/data/download/cada >/dev/null + wget -q -c \ + https://github.com/bihealth/cada-prio-data/releases/download/cada-prio-data-$V_CADA_PRIO_MODEL/cada-prio-model-$V_CADA_PRIO_MODEL+$V_CADA_PRIO_VERSION.tar.gz + tar -xzf cada-prio-model-$V_CADA_PRIO_MODEL+$V_CADA_PRIO_VERSION.tar.gz + popd >/dev/null + + mkdir -p $DIR_PREFIX/volumes/$STATIC_DIR/data/cada + rm -f $DIR_PREFIX/volumes/$STATIC_DIR/data/cada/model + + source_dir="$DIR_PREFIX/volumes/$STATIC_DIR/data/download/cada/cada-prio-model-$V_CADA_PRIO_MODEL+$V_CADA_PRIO_VERSION/model" + for file in "${source_dir}"/*; do + rm -f "$DIR_PREFIX/volumes/$STATIC_DIR/data/cada/$(basename "$file")" + ln -sr "$file" "$DIR_PREFIX/volumes/$STATIC_DIR/data/cada/" + done +fi