-
Notifications
You must be signed in to change notification settings - Fork 5
/
silva-preprocess
executable file
·125 lines (104 loc) · 3.36 KB
/
silva-preprocess
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env bash
set -eu
set -o pipefail
source ${SILVA_PATH:-$(dirname $0)}/init.sh
control=$SILVA_CONTROL.mat
src=$SILVA_PATH/src
data=$SILVA_PATH/data
synonymous="$src/input/synonymous.py --genome=$data/hg19.2bit --genes=$data/refGene.ucsc.gz --cache-genes=$data/refGene.pkl"
gp1k="$src/input/1000gp.py $data/1000gp.refGene.vcf.gz $data/1000gp.refGene.pkl"
function usage {
cat <<EOF
Usage: $0 OUTDIR VCF
Annotates rare, synonymous variants in VCF file,
creating files necessary to run this tool.
OUTDIR should be a directory specific to the VCF file.
Existing files in OUTDIR are NOT overwritten, so this
script can be stopped and resumed.
Creates OUTDIR if it does not exit.
Will use TMPDIR='$TMPDIR' as needed.
EOF
exit 1
}
if [[ $# -ne 2 ]]; then
usage
fi
outdir="$1"
vcf="$2"
if [[ ! -s "$vcf" ]]; then
echo -e "Error: missing input file: $vcf\n" >&2
usage
fi
function skip_if_exists {
local f="$1"
if [[ -s "$f" ]]; then
echo "Found existing: $f..." >&2
fi
test ! -s "$f"
}
init_message "$0" "$@"
mkdir -pv $outdir
# Filter to just synonymous variants
# Adds gene and transcript information
if [[ "$vcf" == *.pcoord ]]; then
base=$(basename "$vcf" .pcoord)
pcoord="--protein-coords"
else
base=$(basename "$vcf" .vcf)
pcoord=
fi
if [[ -n "$pcoord" ]]; then
out=$base.flt
else
out=$base.syn
fi
skip_if_exists $outdir/$out \
&& echo "Filtering for synonymous exonic variants..." >&2 \
&& $synonymous filter $pcoord "$vcf" | egrep -v "^Y\b" > $TMPDIR/$out \
&& echo "Removing variants on chromosome Y..." >&2 \
&& mv $TMPDIR/$out $outdir/$out \
&& echo "Left with $(grep -v '^#' $outdir/$out | wc -l) variants..." >&2
test -s $outdir/$out
# Annotate 1000 Genomes Project AF
if [[ -z "$pcoord" ]]; then
out=$base.af
skip_if_exists $outdir/$out \
&& echo "Annotating with 1000 Genomes Project allele frequency..." >&2 \
&& cut -f 1,2,5 $outdir/$base.syn \
| $gp1k \
> $TMPDIR/$out \
&& mv $TMPDIR/$out $outdir/$out
test -s $outdir/$out
out=$base.flt
skip_if_exists $outdir/$out \
&& echo "Filtering by allele frequency [$SILVA_AF_MIN, $SILVA_AF_MAX]..." >&2 \
&& paste $outdir/$base.af $outdir/$base.syn \
| awk -F"\t" '/^#/; /^[^#]/{OFS="\t"; if ($1 == ".") {$1 = 0} if ($1 >= '"$SILVA_AF_MIN"' && $1 <= '"$SILVA_AF_MAX"') {print $0}}' \
| cut -f 2- \
> $TMPDIR/$out \
&& mv $TMPDIR/$out $outdir/$out \
&& echo "Left with $(grep -v '^#' $outdir/$out | wc -l) variants..." >&2
test -s $outdir/$out
fi
# Annotate with mrna sequence
out=$base.mrna
skip_if_exists $outdir/$out \
&& echo "Annotating with mRNA sequence..." >&2 \
&& $synonymous annotate $outdir/$base.flt > $TMPDIR/$out \
&& mv $TMPDIR/$out $outdir/$out
test -s $outdir/$out
# Annotate with features
out=$outdir/$base.mat
skip_if_exists $out \
&& echo "Annotating features from MRNA file..." >&2 \
&& $src/input/annotate_features.sh $outdir/$base.mrna $outdir/$base
test -s $out
# Standardize according to control dataset
out=$base.input
skip_if_exists $outdir/$out \
&& echo "Standardizing MAT file according to control data..." >&2 \
&& $src/input/standardize.py \
--class=0 --control=$control $outdir/$base.mat > $TMPDIR/$out \
&& mv $TMPDIR/$out $outdir/$out
test -s $outdir/$out
echo "$0: SUCCESS" >&2