-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathfilter-sam.py
executable file
·122 lines (108 loc) · 5.5 KB
/
filter-sam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python
"""
This script removes any reads from the SAM file that were not mapped.
For BWA it also keeps only reads with one best match and no suboptimal matches.
"""
import sys
from cmdline.cmdline import CommandLineApp
import pysam
class SamFilter(CommandLineApp):
def __init__(self):
CommandLineApp.__init__(self)
op = self.option_parser
op.set_usage('usage: filter-sam.py')
op.add_option('-i', '--infile', dest='infile', type='string', default=None,
help='Input .sam file')
op.add_option('-o', '--outfile', dest='outfile', type='string', default=None,
help='Output .sam file')
op.add_option('-a', '--bwa_alg', dest='bwa_alg', type='string', default='',
help='Output .sam file')
op.add_option('-s', '--use_stampy', dest='use_stampy', type='int', default=0,
help='SAM file was mapped with stampy')
def main(self):
#my best guess about what the SAM flags mean:
flags_to_consider = set([
0, #no information
4, #segment unmapped
16, #SEQ being reverse complemented
])
bwa_alg = self.options.bwa_alg
use_stampy = self.options.use_stampy
try: #Newer pysam deprecates Samfile object in favour of AlignmentFile object
infile = pysam.AlignmentFile(self.options.infile, 'r')
outfile = pysam.AlignmentFile(self.options.outfile, 'wh', template=infile)
except:
infile = pysam.Samfile(self.options.infile, 'r')
outfile = pysam.Samfile(self.options.outfile, 'wh', template=infile)
#outfile = pysam.Samfile(self.options.outfile, 'w', template=infile)
print('Filtering reads in %s to %s' % (infile, outfile))
i = 0
omit = 0
unmapped = 0
for read in infile.fetch(until_eof=True): #until_eof=True required by newer pysam, not detrimental to old
i += 1
if not (read.flag in flags_to_consider):
try: #Newer pysam deprecates .qname in favour of .query_name
print 'read %s flag %d not in {0,4,16}: omitting' % (read.query_name, read.flag)
except:
print 'read %s flag %d not in {0,4,16}: omitting' % (read.qname, read.flag)
continue
#skip unmapped reads
if read.flag == 4:
unmapped += 1
continue
try:
if bwa_alg == 'bwasw' or use_stampy == 1:
#The SAM files output by the bwasw algorithm (or stampy) don't include the X0 or X1 tags
#bwasw created files do seem to include at least AS,XN,XS,XF,XE if we want to filter by those later.
ok = True
elif bwa_alg == 'mem':
#SAM files generated by BWA-MEM do not get the same X* tags as produced by BWA-aln.
#We can use the AS and XS tags to check for repetitive mapping.
#AS = Alignment Score for reported alignment
#XS = alignment score for Suboptimal alignments
#MAPQ=0 also implies multiple mapping for BWA, but only if scores are same
#i.e. if scores of multiple hits are diff., MAPQ is small, but not 0
AS_XS_threshold = 6 #Empirically estimated from Dsim data?
try: #If pysam is recent, opt method is deprecated, use get_tag instead
par_AS_XS_diff = read.get_tag('AS')-read.get_tag('XS')
except:
par_AS_XS_diff = read.opt('AS')-read.opt('XS')
non_repetitive = par_AS_XS_diff > AS_XS_threshold
#Only allow read that do not map to repetitive regions:
ok = non_repetitive
elif bwa_alg == 'aln':
try: #Newer pysam deprecates opt method in favour of get_tag
one_best_match = read.get_tag('X0') == 1
no_suboptimal_matches = False
try:
no_suboptimal_matches = read.get_tag('X1') == 0
except:
no_suboptimal_matches = read.get_tag('XT') == 'U'
except:
one_best_match = read.opt('X0') == 1
no_suboptimal_matches = False
try: no_suboptimal_matches = read.opt('X1') == 0
except: no_suboptimal_matches = read.opt('XT')=='U'
ok = one_best_match and no_suboptimal_matches
else:
raise Exception('Invalid BWA algorithm (%s) specified. Please select from aln, mem, and bwasw.' % bwa_alg)
except Exception, e: ## KeyError
try: #Newer pysam deprecates .qname in favour of .query_name
print '%s %s' % (read.query_name, e)
except:
print '%s %s' % (read.qname, e)
print read
ok = False
if ok:
outfile.write(read)
else:
omit += 1
print 'Removed %d reads out of %d' % (omit, i)
print 'Omitted %d unmapped reads out of %d' % (unmapped, i)
if __name__ == '__main__':
try:
SamFilter().run()
except Exception, e:
print '%s' % e
sys.exit(2)