-
Notifications
You must be signed in to change notification settings - Fork 35
Install and test SV callers
wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh # python 3.6
# wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh # python 2.7
bash Miniconda3-latest-Linux-x86_64.sh
# add conda to the PATH & source the env
source ~/.bashrc
conda update -y conda
- set up bioconda channels
conda config --add channels conda-forge
conda config --add channels defaults
conda config --add channels r
conda config --add channels bioconda
- create a new environment e.g.
sv-callers
and install the tools (latest versions)
conda create -n sv-callers manta delly lumpy-sv gridss sambamba samblaster bwa R
source activate sv-callers
Note: To install the latest samtools
(v1.5) with the correct bzip2
distribution use -c conda-forge -c bioconda
options.
or install the tools using env.yaml
:
channels:
- conda-forge
- bioconda
dependencies:
- manta=1.1.0
- delly=0.7.7
- lumpy-sv=0.2.13
- samblaster=0.1.24
- gridss=1.3.4
- bwa=0.7.15
- R
conda-env create -n sv-callers -f env.yaml
Compute infra: HPC/UMC Utrecht (CentOS Linux rel.7.3)
Installed software
conda env export -n sv-callers
name: sv-callers
channels:
- bioconda
- r
- defaults
- conda-forge
dependencies:
- bcftools=1.4.1=0
- bwa=0.7.15=1
- delly=0.7.7=boost1.61_1
- gawk=4.1.3=1
- gridss=1.3.4=py27_0
- htslib=1.4.1=0
- lumpy-sv=0.2.13=py27_0
- manta=1.1.0=py27_0
- pysam=0.11.2.2=py27_0
- sambamba=0.6.6=0
- samblaster=0.1.24=0
- samtools=1.4.1=0
- boost=1.61.0=py27_0
- bzip2=1.0.6=3
- cairo=1.14.8=0
- curl=7.52.1=0
- fontconfig=2.12.1=3
- freetype=2.5.5=2
- glib=2.50.2=1
- gmp=6.1.0=0
- gsl=2.2.1=0
- harfbuzz=0.9.39=2
- icu=54.1=0
- jbig=2.1=0
- jpeg=9b=0
- libffi=3.2.1=1
- libgcc=5.2.0=0
- libiconv=1.14=0
- libpng=1.6.27=0
- libtiff=4.0.6=3
- libxml2=2.9.4=0
- mkl=2017.0.1=0
- mpfr=3.1.5=0
- ncurses=5.9=10
- numpy=1.13.0=py27_0
- openjdk=8.0.121=1
- openssl=1.0.2l=0
- pango=1.40.3=1
- pcre=8.39=1
- pip=9.0.1=py27_1
- pixman=0.34.0=0
- python=2.7.13=0
- readline=6.2=2
- setuptools=27.2.0=py27_0
- sqlite=3.13.0=0
- tk=8.5.18=0
- wheel=0.29.0=py27_0
- xz=5.2.2=1
- zlib=1.2.8=3
- r=3.3.2=r3.3.2_0
- r-base=3.3.2=1
- r-boot=1.3_18=r3.3.2_0
- r-class=7.3_14=r3.3.2_0
- r-cluster=2.0.5=r3.3.2_0
- r-codetools=0.2_15=r3.3.2_0
- r-foreign=0.8_67=r3.3.2_0
- r-kernsmooth=2.23_15=r3.3.2_0
- r-lattice=0.20_34=r3.3.2_0
- r-mass=7.3_45=r3.3.2_0
- r-matrix=1.2_7.1=r3.3.2_0
- r-mgcv=1.8_16=r3.3.2_0
- r-nlme=3.1_128=r3.3.2_0
- r-nnet=7.3_12=r3.3.2_0
- r-recommended=3.3.2=r3.3.2_0
- r-rpart=4.1_10=r3.3.2_0
- r-spatial=7.3_11=r3.3.2_0
- r-survival=2.40_1=r3.3.2_0
prefix: /home/cog/akuzniar/miniconda2/envs/sv-caller
sv caller | implementation | parallelism | I/O file formats |
---|---|---|---|
Manta | C++/Python | pyFlow tasks, SIMD | BAM,CRAM / VCF |
DELLY | C++ | OpenMP threads | BAM / BCF |
LUMPY | C/C++/Python | - | BAM / VCF |
GRIDSS | Java/R/Python | threads, SIMD | BAM,CRAM / VCF,BCF |
Data sets
-
Prostate cancer sample
- based dir:
/hpc/cog_bioinf/ridder/users/akuzniar
- input files:
VCAP_dedup.realigned.bam
(79GB)->chr21_pairs.bam|cram
(1.6GB|0.7GB) orchr1_pairs.bam|cram
(5.8GB|...),Homo_sapiens.GRCh37.GATK.illumina.fasta
(3GB)
- based dir:
-
Tumor-Normal (blood) samples:
- base dir:
/hpc/cog_bioinf/ridder/users/akuzniar
- input files:
Z424-10B00A0_dedup.realigned.bam
(63GB; T) andZ424-00A00A0_dedup.realigned.bam
(61GB; N)
- base dir:
export DATA_DIR=/hpc/cog_bioinf/ridder/users/akuzniar
cd $DATA_DIR
Use BAM or CRAM
samtools view -bh -@ 8 -f 2 -o chr21_pairs.bam VCAP_dedup.realigned.bam 21
samtools index -@ 8 chr21_pairs.bam # create index file (.bai)
or
samtools view -C -@ 8 -f 2 -C -T Homo_sapiens.GRCh37.GATK.illumina.fasta -o chr21_pairs.cram chr21_pairs.bam #
samtools index -@ 8 chr21_pairs.cram # create index file (.crai)
Alternatively, use sambamba
(with BAM/CRAM) instead of samtools
.
sambamba view -o chr21.bam -f bam -F "proper_pair" -t 8 VCAP_dedup.realigned.bam 21 # or with -f cram
N.B. To work on a subset of the human genome (e.g. chr.21) select proper read pairs.
Check read pairs statistics
samtools flagstat chr21_pairs.bam
20672736 + 0 in total (QC-passed reads + QC-failed reads)
37338 + 0 secondary
0 + 0 supplementary
1637676 + 0 duplicates
20672736 + 0 mapped (100.00% : N/A)
20635398 + 0 paired in sequencing
10317699 + 0 read1
10317699 + 0 read2
20635398 + 0 properly paired (100.00% : N/A)
20635398 + 0 with itself and mate mapped
0 + 0 singletons (0.00% : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)
samtools flagstat chr1_pairs.bam
80671804 + 0 in total (QC-passed reads + QC-failed reads)
150516 + 0 secondary
0 + 0 supplementary
9041508 + 0 duplicates
80671804 + 0 mapped (100.00% : N/A)
80521288 + 0 paired in sequencing
40260644 + 0 read1
40260644 + 0 read2
80521288 + 0 properly paired (100.00% : N/A)
80521288 + 0 with itself and mate mapped
0 + 0 singletons (0.00% : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)
Run Manta
- tumor sample (chr.21)
configManta.py --tumorBam=chr21_pairs.bam --reference=Homo_sapiens.GRCh37.GATK.illumina.fasta --runDir=. # or with *.cram
THREADS=1
echo ". activate sv-callers; ./runWorkflow.py --quiet -m local -j ${THREADS}" | qsub -N manta-${THREADS} -cwd -V -l h_rt=00:30:00 -l h_vmem=1G -pe threaded ${THREADS}
# ./runWorkflow.py -m sge -j $THREADS # submit directly to SGE cluster as pyflowTask(s)
- tumor-normal samples
configManta.py --tumorBam=Z424-10B00A0_dedup.realigned.bam --normalBam=Z424-00A00A0_dedup.realigned.bam --reference=Homo_sapiens.GRCh37.GATK.illumina.fasta --runDir=.
echo ". activate sv-callers; ./runWorkflow.py --quiet -m local -j ${THREADS}" | qsub -N manta-${THREADS} -cwd -V -l h_rt=07:00:00 -l h_vmem=2G -pe threaded ${THREADS}
Run DELLY
- tumor sample (chr.21)
# enable multi-threading at sample-level, requires more than one BAM file
# export OMP_NUM_THREADS=1
# export OMP_PROC_BIND=true # for >1 OMP_NUM_THREADS
# qsub ... -pe threaded $OMP_NUM_THREADS
SV_TYPES=(DEL DUP INV BND INS)
for t in "${SV_TYPES[@]}"; do
echo ". activate sv-callers; delly call -t ${t} -g Homo_sapiens.GRCh37.GATK.illumina.fasta -o delly_chr21_${t}.bcf -x chr21.excl.tsv chr21_pairs.bam" | qsub -N delly-${t}-${OMP_NUM_THREADS} -cwd -V -l h_rt=01:00:00 -l h_vmem=1G
done
- tumor-normal samples
for t in "${SV_TYPES[@]}"
do
echo ". activate sv-callers; delly call -t ${t} -g Homo_sapiens.GRCh37.GATK.illumina.fasta -o delly-${t}_Z424.bcf Z424-10B00A0_dedup.realigned.bam Z424-00A00A0_dedup.realigned.bam" | qsub -N delly-${t}-${OMP_NUM_THREADS} -cwd -V -l h_rt=24:00:00 -l h_vmem=8G -pe threaded ${OMP_NUM_THREADS}
done
Note: DELLY must be called for each SV type separately. DELLY's runtime can be further improved by parallel execution per chromosome (for INS, INV, DUP and DEL) and per chromosome pairs (BND = translocation).
Run LUMPY
- tumor sample (chr.21)
echo ". activate sv-callers; lumpyexpress -B chr21_pairs.bam -o lumpy_chr21_pairs.vcf" | qsub -N lumpy -cwd -V -l h_rt=00:30:00 -l h_vmem=1G
- tumor-normal samples
echo ". activate sv-callers; lumpyexpress -B Z424-10B00A0_dedup.realigned.bam,Z424-00A00A0_dedup.realigned.bam -o lumpy-Z424.vcf" | qsub -N lumpy -cwd -V -l h_rt=10:00:00 -l h_vmem=8G
TODO: To (potentially) speed-up the execution, extract split and discordant reads using SAMBAMBA instead of SAMBLASTER prior running LUMPY(express). For this, use -S
and -D
options of LUMPY.
Run GRIDSS
- tumor sample (chr.21)
THREADS=1
# rm -fr *gridss* *.dict snappy* # clean-up tmp files & dirs
echo ". activate sv-callers; gridss -Xmx31g gridss.CallVariants WORKER_THREADS=${THREADS} TMP_DIR=. WORKING_DIR=. REFERENCE_SEQUENCE=Homo_sapiens.GRCh37.GATK.illumina.fasta INPUT=chr21_pairs.bam OUTPUT=gridss_chr21.vcf ASSEMBLY=gridss_assembly.bam" | qsub -N gridss-${THREADS} -cwd -V -l h_vmem=64G -l h_rt=02:00:00 -pe threaded ${THREADS}
- tumor-normal samples
echo ". activate sv-callers; gridss -Xmx31g gridss.CallVariants WORKER_THREADS=${THREADS} TMP_DIR=. WORKING_DIR=. REFERENCE_SEQUENCE=Homo_sapiens.GRCh37.GATK.illumina.fasta INPUT=Z424-00A00A0_dedup.realigned.bam INPUT=Z424-10B00A0_dedup.realigned.bam OUTPUT=gridss-${THREADS}_Z424.vcf ASSEMBLY=gridss-${THREADS}_Z424_assembly.bam" | qsub -N gridss-${THREADS} -cwd -V -l h_vmem=64G -l h_rt=48:00:00 -pe threaded ${THREADS}
Summary
- split/index BAM files using samtools (v1.5) with 1/8/16 threads
chromosome | runtime (s) | memory use (M) |
---|---|---|
1 | 3158/487/317 | 21/29/38 |
2 | 3377/486/217 | 21/30/49 |
3 | 2710/467/183 | 21/29/48 |
4 | 2429/396/167 | 21/30/49 |
5 | 2547/487/184 | 21/29/49 |
6 | 2243/487/136 | 21/29/38 |
7 | 2763/429/182 | 21/29/49 |
8 | 2866/487/190 | 21/29/38 |
9 | 2244/294/145 | 21/29/38 |
10 | 1976/277/145 | 21/29/39 |
11 | 2073/487/142 | 21/29/49 |
12 | 2284/487/149 | 21/29/38 |
13 | 1211/487/76 | 21/29/38 |
14 | 1186/136/74 | 21/29/38 |
15 | 1439/199/106 | 21/29/38 |
16 | 863/97/67 | 21/29/38 |
17 | 1182/135/82 | 21/29/38 |
18 | 1167/140/72 | 19/29/47 |
19 | 902/100/63 | 21/29/38 |
20 | 1141/134/80 | 21/29/38 |
21 | 871/101/62 | 21/29/47 |
22 | 531/62/46 | 21/29/38 |
MT | 96/15/7 | 21/29/37 |
X | 1434/157/92 | 21/29/38 |
Y | 468/66/42 | 21/34/37 |
Note: Samtools segfaults when requesting less than 1G h_vmem
for 16 threads.
- chromosome 21 (tumor sample)
sv caller | # threads | runtime | memory use |
---|---|---|---|
Manta | 1/8/16 | 18m 58s/2m 38s/1m 53s | 85.5/328.9/496.5M |
DELLY | 1 | DEL: 20m 19s DUP: 1m 29s INV: 1m 4s INS : 1h 3m BND: 56s | DEL: 57 DUP: 58.4 INV: 48.5 INS: 55 BND: 48.5M |
LUMPY | 1 | 9m 6s | 240M |
GRIDSS | 1/8/16 | 1h 42m/1h 7m/1h 2m | 12.8/10.4/14.7G |
- chromosome 1 (tumor sample)
sv caller | # threads | runtime | memory use |
---|---|---|---|
Manta | 1/8/16 | 23m 54s/3m 54s/2m 38s | 70/260.2/440.9M |
DELLY | 1 | DEL: 1h 12m DUP: 3m 21s INV: 2m 54s INS: 1h 23m BND: 2m 23s | DEL: 252 DUP: 272 INV: 49.6 INS: 250 BND: 49.6M |
LUMPY | 1 | 38m 21s | 1G |
GRIDSS | 1/8/16 | 2h 54m/1h 2m/58m 40s | 18.3/18/17.8G |
- chromosome pairs (tumor-normal samples)
sv caller | # threads | runtime | memory use |
---|---|---|---|
Manta | 24 | 1m 21s | 772.7M |
DELLY | 2 | DEL: 13m 5s DUP: 2m 37s INV: 2m INS: 12m 27s BND: 1m 54s | DEL: 113.1 DUP: 113 INV: 113.1 INS: 113.1 BND: 113.1M |
LUMPY | 1 | 15m 11s | 169.2M |
GRIDSS | 24 | 47m 10s | 25.8G |
Note: LUMPY failed due to an issue with tmpspace.
- all chromosomes (tumor-normal samples)
sv caller | # threads | runtime | memory use |
---|---|---|---|
Manta | 1/8/16 | 7h/1h 27m/34m 57s | 248M/1.6/3.1G |
DELLY | 1/2 | DEL: 24h 3m/17h 29m DUP: 5h 46m/3h 37m INV: 5h 49m/3h 58m INS: 24h 4m/19h 30m BND: >48h/>20h | DEL: 432.2/514.8M DUP: 409.7/465.8M INV: 417.5/468.5M INS: 318/324.5M BND: 1.8/2.6G |
LUMPY | 1 | 10h 45m | 9.4G |
GRIDSS | 1/8/16 | >48h/22h 20m/14h 33m | >18.9/40.5/42G |
Note: In DELLY the multi-threaded parallelism works only if multiple BAM files are used. Its runtime performance can be further improved (>15%) using core binding (data locality). runtime=ru_wallclock
and memory use=maxvmem
Monitor jobs: watch --interval 10 qstat
Show job accounting log: qacct -j [job_id|job_name]