generated from populationgenomics/cpg-python-template-repo
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathexample_usage.sh
126 lines (108 loc) · 4.12 KB
/
example_usage.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env bash
# This script is an example of how to run Talos, and is not intended to be run as-is
# It can be run from any directory within the Talos image, or any environment where the
# Talos package is installed
#
# If this is run end-to-end in one
set -e
# set the path to use for an output directory
OUTPUT_DIR=${1:-$(date +%F)}
mkdir -p "$OUTPUT_DIR"
# pass the populated Config TOML file, and export as an environment variable
CONFIG_FILE=${2:-config.toml}
export TALOS_CONFIG="$CONFIG_FILE"
# pass the Pedigree file to the script
PED_FILE=${3:-example_ped.txt}
# pass the MatrixTable of variants to the script
VARIANT_MT=${4:-variants.mt}
# pass both ClinVar tables to the script
CLINVAR_DECISIONS=${5:-clinvar_decisions.ht}
CLINVAR_PM5=${6:-clinvar_pm5.ht}
# pass the HPO OBO file from http://purl.obolibrary.org/obo/hp.obo
HPO_OBO=${7:-hp.obo}
# [optional] pass the MatrixTable of SVs to the script
SV_MT=${8:-sv.mt}
# identify the PanelApp data to use
MATCHED_PANELS="${OUTPUT_DIR}/matched_panels.json"
GeneratePanelData \
-i "$PED_FILE" \
--hpo "${HPO_OBO}" \
--out "$MATCHED_PANELS"
# query PanelApp for panels
PANELAPP_RESULTS="${OUTPUT_DIR}/panelapp_results.json"
QueryPanelapp \
--panels "$MATCHED_PANELS" \
--out "$PANELAPP_RESULTS"
# find the symbol~ENSG map for all genes in the panelapp results
GENE_LOOKUP="${OUTPUT_DIR}/gene_lookup.json"
FindGeneSymbolMap \
--panelapp "$PANELAPP_RESULTS" \
--out_path "$GENE_LOOKUP"
# run Hail filtering on the small variant MatrixTable
# this step is the most resource-intensive, so I'd recommend running it on a VM with more resources
# aim for a machine with at least 8-cores, 16GB RAM
# the config entry 'RunHailFiltering.cores.small_variants' is consulted when setting up the PySpark cluster
# with a default of 8, but you can override this if you have more cores available.
SMALL_VARIANT_VCF="${OUTPUT_DIR}/small_variants.vcf.bgz"
RunHailFiltering \
--mt "$VARIANT_MT" \
--panelapp "$PANELAPP_RESULTS" \
--pedigree "$PED_FILE" \
--vcf_out "$SMALL_VARIANT_VCF" \
--clinvar "$CLINVAR_DECISIONS" \
--pm5 "$CLINVAR_PM5" \
--checkpoint small_var_checkpoint.mt
# If you have SVs, run Hail filtering on the SV MatrixTable
SV_VARIANT_VCF="${OUTPUT_DIR}/sv_variants.vcf.bgz"
if [ -n "$SV_MT" ]; then
RunHailFilteringSV \
--mt "$SV_MT" \
--panelapp "$PANELAPP_RESULTS" \
--pedigree "$PED_FILE" \
--vcf_out "$SV_VARIANT_VCF"
fi
# run the MOI validation
MOI_RESULTS="${OUTPUT_DIR}/moi_results.json"
# If the SV MatrixTable was provided, run the SV version of the validation
if [ -n "$SV_VARIANT_VCF" ]; then
ValidateMOI \
--labelled_vcf "$SMALL_VARIANT_VCF" \
--labelled_sv "$SV_VARIANT_VCF" \
--out_json "$MOI_RESULTS" \
--panelapp "$PANELAPP_RESULTS" \
--pedigree "$PED_FILE" \
--participant_panels "$MATCHED_PANELS"
else
ValidateMOI \
--labelled_vcf "$SMALL_VARIANT_VCF" \
--out_json "$MOI_RESULTS" \
--panelapp "$PANELAPP_RESULTS" \
--pedigree "$PED_FILE" \
--participant_panels "$MATCHED_PANELS"
fi
# run the Phenotype-based flagging
# downloaded from https://data.monarchinitiative.org/monarch-kg/latest/phenio.db.gz and unzipped
PHENIO_DB=PLACEHOLDER
# download Genes to Phenotypes from https://hpo.jax.org/app/data/annotations
GENE_TO_PHENOTYPE=PLACEHOLDER
PHENO_ANNOTATED_RESULTS="${OUTPUT_DIR}/phenotype_annotated_results.json"
PHENO_FILTERED_RESULTS="${OUTPUT_DIR}/phenotype_filtered_results.json"
HPOFlagging \
--results "$MOI_RESULTS" \
--gene_map "$GENE_LOOKUP" \
--gen2phen "$GENE_TO_PHENOTYPE" \
--phenio "$PHENIO_DB" \
--out "$PHENO_ANNOTATED_RESULTS" \
--phenout "$PHENO_FILTERED_RESULTS"
# generate the HTML report
HTML_REPORT="${OUTPUT_DIR}/talos_results.html"
LATEST_HTML_REPORT="${OUTPUT_DIR}/latest_talos_results.html"
CreateTalosHTML \
--results "$PHENO_ANNOTATED_RESULTS" \
--panelapp "$PANELAPP_RESULTS" \
--output "$HTML_REPORT" \
--latest "$LATEST_HTML_REPORT"
# generate the Seqr file
SEQR_LABELS="${OUTPUT_DIR}/seqr_labels.json"
PHENOTYPE_SPECIFIC_SEQR_LABELS="${OUTPUT_DIR}/phenotype_match_seqr_labels.json"
GenerateSeqrFile "$MOI_RESULTS" "$PHENOTYPE_SPECIFIC_SEQR_LABELS"