-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path8H0ZB-HG004.sh
140 lines (94 loc) · 3.94 KB
/
8H0ZB-HG004.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
## MANUALLY SET THESE VARIABLES ##
WORKDIR="/home/kokyriakidis/Desktop/Submissions"
SAMPLENAME="HG004"
SUBMISSION="8H0ZB"
FQ1FILEPATH="path/to/HG004.novaseq.pcr-free.35x.R1.fastq.gz"
FQ2FILEPATH="path/to/HG004.novaseq.pcr-free.35x.R2.fastq.gz"
NTHREADS=18
## SUBMISSION PIPELINE ##
mkdir -p ${WORKDIR}
cd ${WORKDIR}
mkdir -p ${SUBMISSION}
cd ${SUBMISSION}
mkdir -p reference
cd reference
# Download the full hg38/GRCh38 reference genome distributed by 1000 genomes
# Derived from NCBI set with HLA and decoy alternative alleles
# http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/url=ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome
base=GRCh38_full_analysis_set_plus_decoy_hla
new=hg38
mkdir -p seq
for suffix in .dict .fa.fai
do
[[ -f seq/$new$suffix ]] || wget -c -O seq/$new$suffix $url/$base$suffix
done
# gzipped references for use with ensembl-vep HGVS
urlgz=https://s3.amazonaws.com/biodata/hg38_bundle
for suffix in .fa.gz .fa.gz.fai .fa.gz.gzi
do
[[ -f seq/$new$suffix ]] || wget --no-check-certificate -c -O seq/$new$suffix $urlgz/$new$suffix
done
gunzip -c seq/hg38.fa.gz > seq/hg38.fa
touch seq/hg38.fa.fai
touch seq/hg38.dict
## Download pre-build bwa indices ##
url=ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome
base=GRCh38_full_analysis_set_plus_decoy_hla.fa
new=hg38.fa
mkdir -p bwa
for suffix in .bwt .amb .ann .pac .sa .alt
do
[[ -f bwa/$new$suffix ]] || wget -c -O bwa/$new$suffix $url/$base$suffix
done
cd ${WORKDIR}
## ADAPTER TRIMMING WITH ATROPOS ##
atropos trim \
-a AACACTCTTTCCCT -A AGATCGGAAGAGCG \
--quality-base 64 --format fastq --overlap 8 \
--aligner adapter --quality-cutoff 5 \
-o ${WORKDIR}/${SUBMISSION}/ATROPOS/${SAMPLENAME}-${SUBMISSION}-trimmed.1.fq.gz -p ${WORKDIR}/${SUBMISSION}/ATROPOS/${SAMPLENAME}-${SUBMISSION}-trimmed.2.fq.gz \
-pe1 ${FQ1FILEPATH} -pe2 ${FQ2FILEPATH}
mkdir -p ${WORKDIR}/${SUBMISSION}/BWA
mkdir -p ${WORKDIR}/${SUBMISSION}/BWA/tmpdir
## READ MAPPING WITH BWA ##
bwa mem \
-c 250 \
-M \
-t ${NTHREADS} \
-R '@RG\tID:${SAMPLENAME}\tPL:illumina\tPU:${SAMPLENAME}\tSM:${SAMPLENAME}' \
-v 1 \
${WORKDIR}/${SUBMISSION}/reference/bwa/hg38.fa \
${WORKDIR}/${SUBMISSION}/ATROPOS/${SAMPLENAME}-${SUBMISSION}-trimmed.1.fq.gz \
${WORKDIR}/${SUBMISSION}/ATROPOS/${SAMPLENAME}-${SUBMISSION}-trimmed.2.fq.gz > ${WORKDIR}/${SUBMISSION}/BWA/${SAMPLENAME}-${SUBMISSION}-sorted.sam
## SORT BAM FILE BY CHROMOSOMAL COORDINATES ##
samtools sort -@ ${NTHREADS} -m 3G -O bam \
-T ${WORKDIR}/${SUBMISSION}/BWA/tmpdir \
-o ${WORKDIR}/${SUBMISSION}/BWA/${SAMPLENAME}-${SUBMISSION}-sorted.bam \
${WORKDIR}/${SUBMISSION}/BWA/${SAMPLENAME}-${SUBMISSION}-sorted.sam
## INDEX BAM FILE WITH SAMTOOLS ##
samtools index -@ ${NTHREADS} ${WORKDIR}/${SUBMISSION}/BWA/${SAMPLENAME}-${SUBMISSION}-sorted.bam
## REMOVE INTERMEDIATE FILES ##
rm ${WORKDIR}/${SUBMISSION}/BWA/${SAMPLENAME}-${SUBMISSION}-sorted.sam
rm -rf ${WORKDIR}/${SUBMISSION}/BWA/tmpdir
mkdir ${WORKDIR}/${SUBMISSION}/DEEPVARIANT
mkdir ${WORKDIR}/${SUBMISSION}/DEEPVARIANT/tmpdir
mkdir ${WORKDIR}/${SUBMISSION}/DEEPVARIANT/logdir
## VARIANT CALLING WITH DEEPVARIANT ##
dv_make_examples.py \
--ref ${WORKDIR}/${SUBMISSION}/reference/seq/hg38.fa \
--reads ${WORKDIR}/${SUBMISSION}/BWA/${SAMPLENAME}-${SUBMISSION}-sorted.bam \
--examples ${WORKDIR}/${SUBMISSION}/DEEPVARIANT/tmpdir \
--sample ${SAMPLENAME} \
--cores ${NTHREADS} \
--logdir ${WORKDIR}/${SUBMISSION}/DEEPVARIANT/logdir
dv_call_variants.py \
--cores ${NTHREADS} \
--outfile ${WORKDIR}/${SUBMISSION}/DEEPVARIANT/tmpdir/${SAMPLENAME}.tmp \
--sample ${SAMPLENAME} \
--examples ${WORKDIR}/${SUBMISSION}/DEEPVARIANT/tmpdir \
--model wgs
mkdir ${WORKDIR}/${SUBMISSION}/DEEPVARIANT/results
dv_postprocess_variants.py \
--ref ${WORKDIR}/${SUBMISSION}/reference/seq/hg38.fa \
--infile ${WORKDIR}/${SUBMISSION}/DEEPVARIANT/tmpdir/${SAMPLENAME}.tmp \
--outfile ${WORKDIR}/${SUBMISSION}/DEEPVARIANT/results/${SAMPLENAME}.vcf