Skip to content

Commit aae075a

Browse files
committed
updated to also use GATK HaplotypeCaller
1 parent 8a9dbbf commit aae075a

File tree

6 files changed

+108
-55
lines changed

6 files changed

+108
-55
lines changed

HG00096.lowcov.settings

+2-1
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,6 @@ NAME=HG00096
55
K_ANALYSIS=wgs
66
#CHROMOSOMES=chr22
77
#SWE_ENGINE=local
8-
#GATK_JAR=~/GenomeAnalysisTK.jar
8+
GATK_LITE_JAR=~/GenomeAnalysisTKLite.jar
9+
GATK_FULL_JAR=~/GenomeAnalysisTK.jar
910
INPUT_FASTQ="paired[s3://gapp-east/1000G/SRR062634_1.filt.fastq.gz,s3://gapp-east/1000G/SRR062634_2.filt.fastq.gz] paired[s3://gapp-east/1000G/SRR062635_1.filt.fastq.gz,s3://gapp-east/1000G/SRR062635_2.filt.fastq.gz] paired[s3://gapp-east/1000G/SRR062641_1.filt.fastq.gz,s3://gapp-east/1000G/SRR062641_2.filt.fastq.gz]"
File renamed without changes.

exome.settings

+3-1
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,6 @@ INPUT_FASTQ=paired[s3://gapp-west/[email protected]/sample_exome/025_Bioplanet_G
55
K_ANALYSIS=exome
66
CHROMOSOMES=chr22
77
#SWE_ENGINE=local
8-
#GATK_JAR=~/GenomeAnalysisTK.jar
8+
#GATK_JAR=~/GenomeAnalysisTK.jar
9+
GATK_LITE_JAR=~/GenomeAnalysisTKLite.jar
10+
GATK_FULL_JAR=~/GenomeAnalysisTK.jar

gatk-hc/gatk-hc.sh

+68
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/bin/bash
2+
set -e
3+
set -x
4+
set -o pipefail
5+
6+
input=$(./swe get input | ./swe fetch -)
7+
gatk_jar=$(./swe get gatk_jar | ./swe fetch -)
8+
interval_file=$(./swe get interval|./swe fetch -)
9+
interval=$(cat $interval_file)
10+
gatk_data=$(./swe get GATK_DATA)
11+
cpu_cores=32
12+
13+
tabix -h $gatk_data/Mills_and_1000G_gold_standard.indels.hg19.vcf.gz $interval > VCF1.vcf
14+
tabix -h $gatk_data/1000G_phase1.indels.hg19.vcf.gz $interval > VCF2.vcf
15+
tabix -h $gatk_data/dbsnp_137.hg19.vcf.gz $interval > interval.dbsnp_137.hg19.vcf
16+
17+
samtools index $input
18+
19+
java -Xmx2g -jar $gatk_jar \
20+
-I $input \
21+
-R $gatk_data/hg19/ucsc.hg19.fasta \
22+
-T RealignerTargetCreator \
23+
-o realigned.intervals \
24+
--known VCF1.vcf \
25+
--known VCF2.vcf \
26+
-L $interval
27+
28+
java -Xmx2g -jar $gatk_jar \
29+
-I $input \
30+
-R $gatk_data/hg19/ucsc.hg19.fasta \
31+
-T IndelRealigner \
32+
-targetIntervals realigned.intervals \
33+
-o realigned.bam \
34+
-known VCF1.vcf \
35+
-known VCF2.vcf \
36+
--consensusDeterminationModel KNOWNS_ONLY \
37+
-L $interval \
38+
-LOD 0.4 \
39+
--downsample_to_coverage 250 \
40+
-compress 0
41+
42+
# drop BQSR
43+
44+
#java -Xmx2g -jar $gatk_jar \
45+
# -R $gatk_data/hg19/ucsc.hg19.fasta \
46+
# -I realigned.bam \
47+
# -T PrintReads \
48+
# -o recalibrated.bam \
49+
# --disable_indel_quals \
50+
# -BQSR $bqsr \
51+
# -compress 0
52+
53+
# note that for GATK-UG we use the realigned bam, whereas for GATK-HC we use the raw input
54+
55+
#full featured GATK, run HaplotypeCaller
56+
java -Xmx6g -jar $gatk_jar \
57+
-R $gatk_data/hg19/ucsc.hg19.fasta \
58+
-T HaplotypeCaller \
59+
-I $input \
60+
--dbsnp interval.dbsnp_137.hg19.vcf \
61+
-stand_call_conf 30.0 \
62+
-stand_emit_conf 30.0 \
63+
-o raw.vcf \
64+
-L $interval \
65+
-nct $cpu_cores \
66+
-rf BadCigar
67+
68+
./swe emit file raw.vcf

gatk/gatk.sh gatk-ug/gatk-ug.sh

+12-41
Original file line numberDiff line numberDiff line change
@@ -53,46 +53,17 @@ java -Xmx2g -jar $gatk_jar \
5353

5454
# note that for GATK-UG we use the realigned bam, whereas for GATK-HC we use the raw input
5555

56-
if [[ $gatk_jar =~ GenomeAnalysisTKLite ]] ;
57-
then
58-
java -Xmx6g -jar $gatk_jar \
59-
-R $gatk_data/hg19/ucsc.hg19.fasta \
60-
-T UnifiedGenotyper \
61-
-I realigned.bam \
62-
--dbsnp interval.dbsnp_137.hg19.vcf \
63-
-o raw.vcf \
64-
-glm BOTH \
65-
-L $interval \
66-
-stand_call_conf 30.0 \
67-
-stand_emit_conf 30.0 \
68-
-nct $cpu_cores \
69-
-rf BadCigar
70-
else
71-
#full featured GATK, run HaplotypeCaller
72-
java -Xmx6g -jar $gatk_jar \
73-
-R $gatk_data/hg19/ucsc.hg19.fasta \
74-
-T HaplotypeCaller \
75-
-I $input \
76-
--dbsnp interval.dbsnp_137.hg19.vcf \
77-
-stand_call_conf 30.0 \
78-
-stand_emit_conf 30.0 \
79-
-o raw.hc.vcf \
80-
-L $interval \
81-
-nct $cpu_cores \
82-
-rf BadCigar
83-
84-
java -Xmx6g -jar $gatk_jar \
85-
-R $gatk_data/hg19/ucsc.hg19.fasta \
86-
-T VariantAnnotator \
87-
-I $input \
88-
--variant raw.hc.vcf \
89-
-A MappingQualityZero \
90-
--dbsnp interval.dbsnp_137.hg19.vcf \
91-
-o raw.vcf \
92-
-L $interval \
93-
-nt $cpu_cores \
94-
-rf BadCigar
95-
96-
fi
56+
java -Xmx6g -jar $gatk_jar \
57+
-R $gatk_data/hg19/ucsc.hg19.fasta \
58+
-T UnifiedGenotyper \
59+
-I realigned.bam \
60+
--dbsnp interval.dbsnp_137.hg19.vcf \
61+
-o raw.vcf \
62+
-glm BOTH \
63+
-L $interval \
64+
-stand_call_conf 30.0 \
65+
-stand_emit_conf 30.0 \
66+
-nct $cpu_cores \
67+
-rf BadCigar
9768

9869
./swe emit file raw.vcf

pipeline.sh

+23-12
Original file line numberDiff line numberDiff line change
@@ -45,16 +45,18 @@ fi
4545
#common_params="$common_params -j 1"
4646
#if a GATK 3.0 jar is specified - use it. Otherwise fall back to GATK_Lita
4747
#upload GATK_JAR
48-
if [ "$GATK_JAR" != "" ]
48+
if [ "$GATK_FULL_JAR" != "" ]
49+
then
50+
GATK_FULL_JAR=$(./swe store ./GenomeAnalysisTK.jar)
51+
fi
52+
if [ "$GATK_LITE_JAR" != "" ]
4953
then
50-
GATK_JAR=$(./swe store $GATK_JAR)
51-
else
5254
# if HTTP link for 3.0 is not provided fall back on free GATK_Lite
53-
GATK_JAR=$(./swe store ./GenomeAnalysisTKLite.jar)
55+
GATK_LITE_JAR=$(./swe store ./GenomeAnalysisTKLite.jar)
5456
fi
5557

5658

57-
[ "$GATK_JAR" != "" ] #GATK_JAR must be defined
59+
[ "$GATK_LITE_JAR" != "" ] #GATK_JAR must be defined
5860
[ -e bin.tar.gz ] || tar czvf bin.tar.gz ./bin
5961

6062
NAME_PREFIX="$NAME:$K_ANALYSIS";
@@ -176,7 +178,7 @@ do
176178
-ichr=$chr \
177179
-iinput="$input_array" \
178180
-u combine/combine.sh \
179-
-u gatk/MarkDuplicates.jar \
181+
-u MarkDuplicates.jar \
180182
-c 8 \
181183
-t NAME=$NAME_PREFIX:combine:$chr -t STAGE=combine \
182184
--wrap="bash combine.sh" )
@@ -223,14 +225,23 @@ do
223225
# output: $gatk_job_id:raw.vcf
224226
# -d $chr_split_id,$bqsr_job_id \
225227

226-
gatk_job_id=$(./swe submit $common_params \
228+
gatk_ug_job_id=$(./swe submit $common_params \
229+
-d $chr_split_id \
230+
-iinput=$chr_split_id:$split_id.bam \
231+
-iinterval=$chr_split_id:$split_id.interval \
232+
-t NAME=$NAME_PREFIX:gatk-ug:$chr:$split_id -t STAGE=gatk-ug \
233+
-u gatk-ug/gatk-ug.sh \
234+
-igatk_jar=$GATK_LITE_JAR \
235+
--wrap="bash gatk-ug.sh")
236+
237+
gatk_hc_job_id=$(./swe submit $common_params \
227238
-d $chr_split_id \
228239
-iinput=$chr_split_id:$split_id.bam \
229240
-iinterval=$chr_split_id:$split_id.interval \
230-
-t NAME=$NAME_PREFIX:gatk:$chr:$split_id -t STAGE=gatk \
231-
-u gatk/gatk.sh \
232-
-igatk_jar=$GATK_JAR \
233-
--wrap="bash gatk.sh")
241+
-t NAME=$NAME_PREFIX:gatk-hc:$chr:$split_id -t STAGE=gatk-hc \
242+
-u gatk-hc/gatk-hc.sh \
243+
-igatk_jar=$GATK_FULL_JAR \
244+
--wrap="bash gatk-hc.sh")
234245

235246
freebayes_job_id=$(./swe submit $common_params \
236247
-d $chr_split_id \
@@ -251,7 +262,7 @@ do
251262

252263
#[ "$swe_wait" == "" ] || ./swe wait $caller_job_id
253264

254-
caller_job_ids="$caller_job_ids $gatk_job_id $freebayes_job_id $platypus_job_id"
265+
caller_job_ids="$caller_job_ids $gatk_ug_job_id $gatk_hc_job_id $freebayes_job_id $platypus_job_id"
255266

256267
done
257268
done

0 commit comments

Comments
 (0)