Skip to content

Commit a6b49e7

Browse files
authored
Create getpairedreads.py
1 parent 0637d5a commit a6b49e7

File tree

1 file changed

+195
-0
lines changed

1 file changed

+195
-0
lines changed

getpairedreads.py

+195
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
#!/usr/bin/env python3
2+
import sys
3+
import os
4+
import pysam
5+
from get_graph import getgraph
6+
7+
8+
def bool_paired(reada, readb):
9+
if reada.query_name != readb.query_name: # exclude different reads
10+
return 0
11+
if not (reada.cigartuples and readb.cigartuples): # unmapped end in paired-end reads
12+
return 0
13+
if len(reada.cigartuples) == 1 or len(readb.cigartuples) == 1: # exclude only include M
14+
return 0
15+
if reada.flag & 16 == readb.flag & 16: # mapped to the same strand
16+
if reada.cigartuples[0][0] == 0 and readb.cigartuples[-1][0] == 0 or \
17+
(reada.cigartuples[-1][0] == 0 and readb.cigartuples[0][0] == 0): # 32S43M & 32M43S
18+
return 1
19+
if len(reada.cigartuples) == 3 and reada.cigartuples[-1][0] != 0 and reada.cigartuples[-1][1] <= 3 and readb.cigartuples[0][0] == 0 or \
20+
(reada.cigartuples[0][0] == 0 and len(readb.cigartuples) == 3 and readb.cigartuples[-1][0] != 0 and readb.cigartuples[-1][1] <= 3) or \
21+
(len(reada.cigartuples) == 3 and reada.cigartuples[0][0] != 0 and reada.cigartuples[0][1] <= 3 and readb.cigartuples[-1][0] == 0) or \
22+
(reada.cigartuples[-1][0] == 0 and len(readb.cigartuples) == 3 and readb.cigartuples[0][0] != 0 and readb.cigartuples[0][1] <= 3): # 41S30M2S & 41MXXX | 2S30M41S & XXX41M
23+
return 1
24+
else:
25+
if reada.cigartuples[0][0] == 0 and readb.cigartuples[0][0] == 0 or \
26+
(reada.cigartuples[-1][0] == 0 and readb.cigartuples[-1][0] == 0): # 32S43M & 43S32M
27+
return 1
28+
if len(reada.cigartuples) == 3 and reada.cigartuples[-1][0] != 0 and reada.cigartuples[-1][1] <= 3 and readb.cigartuples[-1][0] == 0 or \
29+
(reada.cigartuples[-1][0] == 0 and len(readb.cigartuples) == 3 and readb.cigartuples[-1][0] != 0 and readb.cigartuples[-1][1] <= 3) or \
30+
(len(reada.cigartuples) == 3 and reada.cigartuples[0][0] != 0 and reada.cigartuples[0][1] <= 3 and readb.cigartuples[0][0] == 0) or \
31+
(reada.cigartuples[0][0] == 0 and len(readb.cigartuples) == 3 and readb.cigartuples[0][0] != 0 and readb.cigartuples[0][1] <= 3): # 41S30M2S & XXX41M | 2S30M41S & 41MXXX
32+
return 1
33+
34+
35+
def bool_bridge(pairreada,pairreadb):
36+
if pairreada[0].cigartuples and not pairreada[1].cigartuples and not pairreadb[0].cigartuples and pairreadb[1].cigartuples:
37+
return 1
38+
elif not pairreada[0].cigartuples and pairreada[1].cigartuples and pairreadb[0].cigartuples and not pairreadb[1].cigartuples:
39+
return 1
40+
else:
41+
return 0
42+
43+
44+
45+
def getpairedreads(samplenameU, samplenameV, pattern='single-end', fusion_idx_dir=''):
46+
samfile_U = pysam.AlignmentFile(samplenameU, 'r')
47+
48+
filtered_from_U = {}
49+
filtered_from_V = {}
50+
51+
fusion_dic = getgraph(os.path.join(fusion_idx_dir, "fusion_table.tsv"))
52+
samfile_V = pysam.AlignmentFile(samplenameV, 'r')
53+
54+
# single-end
55+
if pattern == 'single-end':
56+
reads_V = {}
57+
for one in samfile_V:
58+
if one.reference_name not in reads_V:
59+
reads_V[one.reference_name] = {}
60+
61+
# store the primary alignment
62+
if one.query_name not in reads_V[one.reference_name] or not one.flag & 256:
63+
reads_V[one.reference_name][one.query_name] = one
64+
65+
66+
for read in samfile_U:
67+
partners = fusion_dic[read.reference_name]
68+
for pa in partners:
69+
if pa not in reads_V:
70+
continue
71+
if read.query_name in reads_V[pa]:
72+
anotherread = reads_V[pa][read.query_name]
73+
if bool_paired(read, anotherread):
74+
filtered_from_U[read.query_name] = read
75+
filtered_from_V[anotherread.query_name] = anotherread
76+
break
77+
78+
79+
print("Find " + str(len(filtered_from_U)) + " Reads in U! " + str(len(filtered_from_V)) + " Reads in V!")
80+
81+
# write
82+
sam_filter_U_path = samplenameU.replace(".sam","_filtered.sam")
83+
Ureads = pysam.AlignmentFile(sam_filter_U_path, 'w', template=samfile_U)
84+
for oneU in filtered_from_U.keys():
85+
Ureads.write(filtered_from_U[oneU])
86+
Ureads.close()
87+
88+
sam_filter_V_path = samplenameV.replace(".sam", "_filtered.sam")
89+
Vreads = pysam.AlignmentFile(sam_filter_V_path, 'w', template=samfile_V)
90+
for oneV in filtered_from_V.keys():
91+
Vreads.write(filtered_from_V[oneV])
92+
Vreads.close()
93+
94+
95+
96+
# paired-end
97+
if pattern == 'paired-end':
98+
reads_V = {}
99+
temp_paired = []
100+
101+
for one in samfile_V: # reads sorted by query_name works
102+
temp_paired.append(one)
103+
if len(temp_paired) == 2:
104+
if one.query_name == temp_paired[0].query_name:
105+
if temp_paired[0].flag & 128 and temp_paired[1].flag & 64:
106+
temp_paired = [temp_paired[1],temp_paired[0]]
107+
if temp_paired[0].flag & 64 and temp_paired[1].flag & 128:
108+
# add into reads_V
109+
if temp_paired[0].reference_name != "*":
110+
if temp_paired[0].reference_name not in reads_V:
111+
reads_V[temp_paired[0].reference_name] = {}
112+
reads_V[temp_paired[0].reference_name][temp_paired[0].query_name] = tuple(temp_paired)
113+
114+
if temp_paired[1].reference_name != "*":
115+
if temp_paired[1].reference_name not in reads_V:
116+
reads_V[temp_paired[1].reference_name] = {}
117+
reads_V[temp_paired[1].reference_name][temp_paired[1].query_name] = tuple(temp_paired)
118+
temp_paired = []
119+
else:
120+
temp_paired = [temp_paired[0] if temp_paired[1].flag & 256 else temp_paired[1]]
121+
else:
122+
temp_paired = [one]
123+
124+
125+
126+
temp_paired = []
127+
for read in samfile_U:
128+
temp_paired.append(read)
129+
if len(temp_paired) == 2:
130+
if read.query_name == temp_paired[0].query_name:
131+
if temp_paired[0].flag & 128 and temp_paired[1].flag & 64:
132+
temp_paired = [temp_paired[1],temp_paired[0]]
133+
if temp_paired[0].flag & 64 and temp_paired[1].flag & 128:
134+
if temp_paired[0].reference_name != "*" and temp_paired[1].reference_name != "*" and \
135+
temp_paired[0].reference_name != temp_paired[1].reference_name:
136+
temp_paired = []
137+
continue
138+
if temp_paired[0].reference_name != "*":
139+
partners1 = fusion_dic[temp_paired[0].reference_name]
140+
for pa1 in partners1:
141+
if pa1 not in reads_V:
142+
continue
143+
if read.query_name in reads_V[pa1]:
144+
anotherpairedread = reads_V[pa1][read.query_name]
145+
if bool_paired(anotherpairedread[0], temp_paired[0]) or bool_bridge(temp_paired, anotherpairedread):
146+
filtered_from_U[read.query_name] = tuple(temp_paired)
147+
filtered_from_V[read.query_name] = anotherpairedread
148+
break
149+
150+
if temp_paired[1].reference_name != "*":
151+
partners2 = fusion_dic[temp_paired[1].reference_name]
152+
for pa2 in partners2:
153+
if pa2 not in reads_V:
154+
continue
155+
if read.query_name in reads_V[pa2]:
156+
anotherpairedread = reads_V[pa2][read.query_name]
157+
if bool_paired(anotherpairedread[1],temp_paired[1]) or bool_bridge(temp_paired, anotherpairedread):
158+
filtered_from_U[read.query_name] = tuple(temp_paired)
159+
filtered_from_V[read.query_name] = anotherpairedread
160+
break
161+
temp_paired = []
162+
else:
163+
temp_paired = [temp_paired[0] if temp_paired[1].flag & 256 else temp_paired[1]]
164+
else:
165+
temp_paired = [read]
166+
167+
print('Find ' + str(len(filtered_from_U)) + ' Reads in U! ' + str(len(filtered_from_V)) + ' Reads in V!')
168+
169+
# write
170+
sam_filter_U_path = samplenameU.replace(".sam", "_filtered.sam")
171+
Ureads = pysam.AlignmentFile(sam_filter_U_path, 'w', template=samfile_U)
172+
for paironeU in filtered_from_U.keys():
173+
for oneU in filtered_from_U[paironeU]:
174+
Ureads.write(oneU)
175+
Ureads.close()
176+
177+
sam_filter_V_path = samplenameV.replace(".sam", "_filtered.sam")
178+
Vreads = pysam.AlignmentFile(sam_filter_V_path, 'w', template=samfile_V)
179+
for paironeV in filtered_from_V.keys():
180+
for oneV in filtered_from_V[paironeV]:
181+
Vreads.write(oneV)
182+
Vreads.close()
183+
184+
185+
186+
samfile_U.close()
187+
samfile_V.close()
188+
189+
return sam_filter_U_path, sam_filter_V_path
190+
191+
192+
193+
194+
if __name__ == '__main__':
195+
getpairedreads(sys.argv[1], sys.argv[2], sys.argv[3])

0 commit comments

Comments
 (0)