|
| 1 | +#!/usr/bin/env python3 |
| 2 | +import sys |
| 3 | +import os |
| 4 | +import pysam |
| 5 | +from get_graph import getgraph |
| 6 | + |
| 7 | + |
| 8 | +def bool_paired(reada, readb): |
| 9 | + if reada.query_name != readb.query_name: # exclude different reads |
| 10 | + return 0 |
| 11 | + if not (reada.cigartuples and readb.cigartuples): # unmapped end in paired-end reads |
| 12 | + return 0 |
| 13 | + if len(reada.cigartuples) == 1 or len(readb.cigartuples) == 1: # exclude only include M |
| 14 | + return 0 |
| 15 | + if reada.flag & 16 == readb.flag & 16: # mapped to the same strand |
| 16 | + if reada.cigartuples[0][0] == 0 and readb.cigartuples[-1][0] == 0 or \ |
| 17 | + (reada.cigartuples[-1][0] == 0 and readb.cigartuples[0][0] == 0): # 32S43M & 32M43S |
| 18 | + return 1 |
| 19 | + if len(reada.cigartuples) == 3 and reada.cigartuples[-1][0] != 0 and reada.cigartuples[-1][1] <= 3 and readb.cigartuples[0][0] == 0 or \ |
| 20 | + (reada.cigartuples[0][0] == 0 and len(readb.cigartuples) == 3 and readb.cigartuples[-1][0] != 0 and readb.cigartuples[-1][1] <= 3) or \ |
| 21 | + (len(reada.cigartuples) == 3 and reada.cigartuples[0][0] != 0 and reada.cigartuples[0][1] <= 3 and readb.cigartuples[-1][0] == 0) or \ |
| 22 | + (reada.cigartuples[-1][0] == 0 and len(readb.cigartuples) == 3 and readb.cigartuples[0][0] != 0 and readb.cigartuples[0][1] <= 3): # 41S30M2S & 41MXXX | 2S30M41S & XXX41M |
| 23 | + return 1 |
| 24 | + else: |
| 25 | + if reada.cigartuples[0][0] == 0 and readb.cigartuples[0][0] == 0 or \ |
| 26 | + (reada.cigartuples[-1][0] == 0 and readb.cigartuples[-1][0] == 0): # 32S43M & 43S32M |
| 27 | + return 1 |
| 28 | + if len(reada.cigartuples) == 3 and reada.cigartuples[-1][0] != 0 and reada.cigartuples[-1][1] <= 3 and readb.cigartuples[-1][0] == 0 or \ |
| 29 | + (reada.cigartuples[-1][0] == 0 and len(readb.cigartuples) == 3 and readb.cigartuples[-1][0] != 0 and readb.cigartuples[-1][1] <= 3) or \ |
| 30 | + (len(reada.cigartuples) == 3 and reada.cigartuples[0][0] != 0 and reada.cigartuples[0][1] <= 3 and readb.cigartuples[0][0] == 0) or \ |
| 31 | + (reada.cigartuples[0][0] == 0 and len(readb.cigartuples) == 3 and readb.cigartuples[0][0] != 0 and readb.cigartuples[0][1] <= 3): # 41S30M2S & XXX41M | 2S30M41S & 41MXXX |
| 32 | + return 1 |
| 33 | + |
| 34 | + |
| 35 | +def bool_bridge(pairreada,pairreadb): |
| 36 | + if pairreada[0].cigartuples and not pairreada[1].cigartuples and not pairreadb[0].cigartuples and pairreadb[1].cigartuples: |
| 37 | + return 1 |
| 38 | + elif not pairreada[0].cigartuples and pairreada[1].cigartuples and pairreadb[0].cigartuples and not pairreadb[1].cigartuples: |
| 39 | + return 1 |
| 40 | + else: |
| 41 | + return 0 |
| 42 | + |
| 43 | + |
| 44 | + |
| 45 | +def getpairedreads(samplenameU, samplenameV, pattern='single-end', fusion_idx_dir=''): |
| 46 | + samfile_U = pysam.AlignmentFile(samplenameU, 'r') |
| 47 | + |
| 48 | + filtered_from_U = {} |
| 49 | + filtered_from_V = {} |
| 50 | + |
| 51 | + fusion_dic = getgraph(os.path.join(fusion_idx_dir, "fusion_table.tsv")) |
| 52 | + samfile_V = pysam.AlignmentFile(samplenameV, 'r') |
| 53 | + |
| 54 | + # single-end |
| 55 | + if pattern == 'single-end': |
| 56 | + reads_V = {} |
| 57 | + for one in samfile_V: |
| 58 | + if one.reference_name not in reads_V: |
| 59 | + reads_V[one.reference_name] = {} |
| 60 | + |
| 61 | + # store the primary alignment |
| 62 | + if one.query_name not in reads_V[one.reference_name] or not one.flag & 256: |
| 63 | + reads_V[one.reference_name][one.query_name] = one |
| 64 | + |
| 65 | + |
| 66 | + for read in samfile_U: |
| 67 | + partners = fusion_dic[read.reference_name] |
| 68 | + for pa in partners: |
| 69 | + if pa not in reads_V: |
| 70 | + continue |
| 71 | + if read.query_name in reads_V[pa]: |
| 72 | + anotherread = reads_V[pa][read.query_name] |
| 73 | + if bool_paired(read, anotherread): |
| 74 | + filtered_from_U[read.query_name] = read |
| 75 | + filtered_from_V[anotherread.query_name] = anotherread |
| 76 | + break |
| 77 | + |
| 78 | + |
| 79 | + print("Find " + str(len(filtered_from_U)) + " Reads in U! " + str(len(filtered_from_V)) + " Reads in V!") |
| 80 | + |
| 81 | + # write |
| 82 | + sam_filter_U_path = samplenameU.replace(".sam","_filtered.sam") |
| 83 | + Ureads = pysam.AlignmentFile(sam_filter_U_path, 'w', template=samfile_U) |
| 84 | + for oneU in filtered_from_U.keys(): |
| 85 | + Ureads.write(filtered_from_U[oneU]) |
| 86 | + Ureads.close() |
| 87 | + |
| 88 | + sam_filter_V_path = samplenameV.replace(".sam", "_filtered.sam") |
| 89 | + Vreads = pysam.AlignmentFile(sam_filter_V_path, 'w', template=samfile_V) |
| 90 | + for oneV in filtered_from_V.keys(): |
| 91 | + Vreads.write(filtered_from_V[oneV]) |
| 92 | + Vreads.close() |
| 93 | + |
| 94 | + |
| 95 | + |
| 96 | + # paired-end |
| 97 | + if pattern == 'paired-end': |
| 98 | + reads_V = {} |
| 99 | + temp_paired = [] |
| 100 | + |
| 101 | + for one in samfile_V: # reads sorted by query_name works |
| 102 | + temp_paired.append(one) |
| 103 | + if len(temp_paired) == 2: |
| 104 | + if one.query_name == temp_paired[0].query_name: |
| 105 | + if temp_paired[0].flag & 128 and temp_paired[1].flag & 64: |
| 106 | + temp_paired = [temp_paired[1],temp_paired[0]] |
| 107 | + if temp_paired[0].flag & 64 and temp_paired[1].flag & 128: |
| 108 | + # add into reads_V |
| 109 | + if temp_paired[0].reference_name != "*": |
| 110 | + if temp_paired[0].reference_name not in reads_V: |
| 111 | + reads_V[temp_paired[0].reference_name] = {} |
| 112 | + reads_V[temp_paired[0].reference_name][temp_paired[0].query_name] = tuple(temp_paired) |
| 113 | + |
| 114 | + if temp_paired[1].reference_name != "*": |
| 115 | + if temp_paired[1].reference_name not in reads_V: |
| 116 | + reads_V[temp_paired[1].reference_name] = {} |
| 117 | + reads_V[temp_paired[1].reference_name][temp_paired[1].query_name] = tuple(temp_paired) |
| 118 | + temp_paired = [] |
| 119 | + else: |
| 120 | + temp_paired = [temp_paired[0] if temp_paired[1].flag & 256 else temp_paired[1]] |
| 121 | + else: |
| 122 | + temp_paired = [one] |
| 123 | + |
| 124 | + |
| 125 | + |
| 126 | + temp_paired = [] |
| 127 | + for read in samfile_U: |
| 128 | + temp_paired.append(read) |
| 129 | + if len(temp_paired) == 2: |
| 130 | + if read.query_name == temp_paired[0].query_name: |
| 131 | + if temp_paired[0].flag & 128 and temp_paired[1].flag & 64: |
| 132 | + temp_paired = [temp_paired[1],temp_paired[0]] |
| 133 | + if temp_paired[0].flag & 64 and temp_paired[1].flag & 128: |
| 134 | + if temp_paired[0].reference_name != "*" and temp_paired[1].reference_name != "*" and \ |
| 135 | + temp_paired[0].reference_name != temp_paired[1].reference_name: |
| 136 | + temp_paired = [] |
| 137 | + continue |
| 138 | + if temp_paired[0].reference_name != "*": |
| 139 | + partners1 = fusion_dic[temp_paired[0].reference_name] |
| 140 | + for pa1 in partners1: |
| 141 | + if pa1 not in reads_V: |
| 142 | + continue |
| 143 | + if read.query_name in reads_V[pa1]: |
| 144 | + anotherpairedread = reads_V[pa1][read.query_name] |
| 145 | + if bool_paired(anotherpairedread[0], temp_paired[0]) or bool_bridge(temp_paired, anotherpairedread): |
| 146 | + filtered_from_U[read.query_name] = tuple(temp_paired) |
| 147 | + filtered_from_V[read.query_name] = anotherpairedread |
| 148 | + break |
| 149 | + |
| 150 | + if temp_paired[1].reference_name != "*": |
| 151 | + partners2 = fusion_dic[temp_paired[1].reference_name] |
| 152 | + for pa2 in partners2: |
| 153 | + if pa2 not in reads_V: |
| 154 | + continue |
| 155 | + if read.query_name in reads_V[pa2]: |
| 156 | + anotherpairedread = reads_V[pa2][read.query_name] |
| 157 | + if bool_paired(anotherpairedread[1],temp_paired[1]) or bool_bridge(temp_paired, anotherpairedread): |
| 158 | + filtered_from_U[read.query_name] = tuple(temp_paired) |
| 159 | + filtered_from_V[read.query_name] = anotherpairedread |
| 160 | + break |
| 161 | + temp_paired = [] |
| 162 | + else: |
| 163 | + temp_paired = [temp_paired[0] if temp_paired[1].flag & 256 else temp_paired[1]] |
| 164 | + else: |
| 165 | + temp_paired = [read] |
| 166 | + |
| 167 | + print('Find ' + str(len(filtered_from_U)) + ' Reads in U! ' + str(len(filtered_from_V)) + ' Reads in V!') |
| 168 | + |
| 169 | + # write |
| 170 | + sam_filter_U_path = samplenameU.replace(".sam", "_filtered.sam") |
| 171 | + Ureads = pysam.AlignmentFile(sam_filter_U_path, 'w', template=samfile_U) |
| 172 | + for paironeU in filtered_from_U.keys(): |
| 173 | + for oneU in filtered_from_U[paironeU]: |
| 174 | + Ureads.write(oneU) |
| 175 | + Ureads.close() |
| 176 | + |
| 177 | + sam_filter_V_path = samplenameV.replace(".sam", "_filtered.sam") |
| 178 | + Vreads = pysam.AlignmentFile(sam_filter_V_path, 'w', template=samfile_V) |
| 179 | + for paironeV in filtered_from_V.keys(): |
| 180 | + for oneV in filtered_from_V[paironeV]: |
| 181 | + Vreads.write(oneV) |
| 182 | + Vreads.close() |
| 183 | + |
| 184 | + |
| 185 | + |
| 186 | + samfile_U.close() |
| 187 | + samfile_V.close() |
| 188 | + |
| 189 | + return sam_filter_U_path, sam_filter_V_path |
| 190 | + |
| 191 | + |
| 192 | + |
| 193 | + |
| 194 | +if __name__ == '__main__': |
| 195 | + getpairedreads(sys.argv[1], sys.argv[2], sys.argv[3]) |
0 commit comments