-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path02peakWidthsPipeline.sh
85 lines (64 loc) · 2.62 KB
/
02peakWidthsPipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# generate k-mer spectrum according to genome and population parameters
echo "########################################"
echo "COALESCENT SIMULATION FOR K-MER SPRECTRA"
# activate conda environment (run 00setUpEnv.sh to set up)
#conda activate CSKS
# make a temp dir
#td=$(mktemp -d)
td=/tmp/tmp.PwXQCJ1tQi
echo "TMP DIR $td"
# simulation parameters
gs=1000000 # Size in nt of each chromosome. There are 10 chromosomes, so the overall haploid genome size is 'gs'*10!
#chrs=1
#ploy=2 # ploidy level
theta=0.001 # population-scaled mutation rate (= heterozygosity in a random-mating population)
gc=0.4 # genome GC-content (ignored for now)
#div=0.05 # genome divergence (allotetraploids only)
rl=150 # read length
dep=100
# run msprime (this produces the genomes)
python code/makeDiploidNoCoal.py $gs $theta $gc $td
# make reads from genomes
#python code/makeReads.py $td
wgsim -r 0 -R 0 -e 0.001 -d 550 -s 50 -1 $rl -2 $rl -N $((gs/2/$rl*$dep)) $td/genomes.fa $td/fw.fq $td/rw.fq > /dev/null
# run kmc
echo "########################################"
echo "KMC DB"
printf "$td/fw.fq\n$td/rw.fq" > $td/readFiles
kmc -k21 -t3 -cs500000000 -fq @$td/readFiles $td/db21 $td
echo "DB done."
echo ""
echo "########################################"
echo "MAKING SPECTRUM"
# run kmc_tools to make spectrum
kmc_tools transform $td/db21 histogram G$gs'P2T'$theta.hist -cx5000
echo "removing zero lines"
awk '{ if( $2 != 0 ){ print $0 } }' G$gs'P2T'$theta.hist > G$gs'P2T'$theta.hist.no0 && rm G$gs'P2T'$theta.hist
echo "Done."
echo ""
echo "########################################"
echo "INDEXING REFERENCE"
bwa-mem2 index $td/reference.fa
echo "########################################"
echo "MAPPING"
bwa-mem2 mem -t 6 $td/reference.fa $td/fw.fq $td/rw.fq | samtools view -b -o $td/ref.bam
echo "########################################"
echo "SORTING BAM"
samtools sort -o $td/refS.bam $td/ref.bam && rm $td/ref.bam
echo "########################################"
echo "MARKING DUPLICATES"
picard MarkDuplicates I=$td/refS.bam O=$td/refSD.bam M=$td/refSD.dup-metrics.txt && rm $td/refS.bam
echo "########################################"
echo "EXTRACTING DUPLICATES"
samtools view -f 1024 -b -o $td/refSDonly.bam $td/refSD.bam
samtools index $td/refSDonly.bam
samtools index $td/refSD.bam
echo "########################################"
echo "COMPUTING MAPPING DEPTHS"
mosdepth -b 1000 $td/refSD $td/refSD.bam
mosdepth -b 1000 $td/refSDonly $td/refSDonly.bam
#echo "########################################"
#echo "Removing temp files..."
#rm -rf $td # comment out for debug!
#echo "Histogram written to G"$gs"P"$ploy"T"$theta".hist.no0"
#echo "PIPELINE DONE."