1
1
#!/usr/bin/env python3
2
2
3
3
import re
4
- import os , getopt
5
- import time
4
+ import os
5
+ import getopt
6
6
import sys
7
7
from pysam import FastaFile
8
8
9
9
10
-
11
10
def getgraph (location ):
12
- '''
11
+ """
13
12
Load the fusion gene table from the local file
14
13
:param location: a string of fusion gene table location
15
14
:return: a dict of fusion genes
16
- '''
17
- if not (os .path .exists (os .curdir + '/' + location ) or os .path .exists (location )):
18
- print ('Error: There is no %s' % location )
15
+ """
16
+ if not (os .path .exists (os .curdir + '/' + location ) or os .path .exists (location )):
17
+ print ('Error: There is no %s' % location )
19
18
exit (1 )
20
19
with open (location , 'r' ) as f :
21
20
edge = {}
22
- nodes = []
23
21
for line in f :
24
22
if not line .startswith ('#' ):
25
23
try :
26
24
temp = line .strip ('\n ' ).split ('\t ' )[0 ].split ('--' )
27
25
start = temp [0 ]
28
26
end = temp [1 ]
29
27
except IndexError as e :
30
- print ("%s: gene pairs placed in first columns should be joined with '--' not '-' " % e )
28
+ print ("%s: gene pairs placed in first columns should be joined with '--' not '-' " % e )
31
29
exit (1 )
32
30
33
31
if start in edge :
@@ -41,17 +39,18 @@ def getgraph(location):
41
39
edge [end ] = [start ]
42
40
return edge
43
41
42
+
44
43
def find_bool (graph ):
45
44
thisside = set ()
46
45
thatside = set ()
47
46
nodes = list (graph .keys ())
48
47
thisside .add (nodes [0 ])
49
- while ( nodes != []) :
48
+ while nodes :
50
49
edges_count = sum ([len (graph [node ]) for node in nodes ])
51
50
if edges_count == 0 :
52
51
break
53
52
for start in graph .keys ():
54
- if graph [start ] == [ ]:
53
+ if not graph [start ]:
55
54
continue
56
55
if start in thisside :
57
56
for side in graph [start ]:
@@ -70,171 +69,194 @@ def find_bool(graph):
70
69
return list (thisside ), list (thatside )
71
70
72
71
73
-
74
-
75
-
76
-
77
- def Get_geneCoordinate (gtf ):
78
- if not (os .path .exists (gtf ) or (os .path .exists (os .curdir + '/' + gtf ))):
79
- print ("there is no gtf file,please check the path" )
72
+ def Get_geneCoordinate (gtf , od ):
73
+ synonyms_dict = {"SEPTIN5" : "SEPT5" ,
74
+ "SIKE1" : "SIKE" ,
75
+ "AC011462.1" : "TGFB1" ,
76
+ "SEPTIN9" : "SEPT9" ,
77
+ "SEPTIN6" : "SEPT6" ,
78
+ "H4C9" : "HIST1H4I" ,
79
+ "GET1" : "WRB" ,
80
+ "H2BC4" : "HIST1H2BC" ,
81
+ "SEPTIN11" : "SEPT11" ,
82
+ "ZFTA" : "C11orf95" ,
83
+ "H2AC17" : "HIST1H2AM" ,
84
+ "HLA-DMA" : "HLA_DMA" ,
85
+ "HLA-DMB" : "HLA_DMB" ,
86
+ "RNF213" : "AC124319.1" ,
87
+ "PEDS1" : "TMEM189" ,
88
+ "DYNLT2B" : "TCTEX1D2" ,
89
+ "TGFB1" : "AC011462.1" ,
90
+ "SEPTIN2" : "SEPT2" ,
91
+ "SEPTIN8" : "SEPT8" ,
92
+ "ERVK3-1" : "ERVK3_1" ,
93
+ "HI-6" : "HIST1H1T" ,
94
+ "CEP43" : "FGFR1OP" ,
95
+ "H3C12" : "HIST1H3J" ,
96
+ "CFAP251" : "WDR66" ,
97
+ "MAGEA5P" : "MAGEA5" ,
98
+ "BMERB1" : "C16orf45" ,
99
+ "EZHIP" : "CXorf67" ,
100
+ "H1-6" : "HIST1H1T" }
101
+ if not (os .path .exists (gtf ) or (os .path .exists (os .curdir + '/' + gtf ))):
102
+ print ("there is no gtf file, please check the path" )
80
103
exit (1 )
81
- Out = open ('fusion_genes_coordinate.txt' , 'w' )
82
- GeneCoordinate_dict = {}
83
- with open (gtf ,'r' ) as f :
104
+ Out = open ('{od}/ fusion_genes_coordinate.txt' . format ( od = od ), 'w' )
105
+ GeneCoordinate_dict = {}
106
+ with open (gtf , 'r' ) as f :
84
107
for line in f :
85
108
if not line .startswith ('#' ):
86
- Line = line .strip ('\n ' ).split ('\t ' )
87
- chr ,Type ,start ,end ,strand ,gene = Line [0 ],Line [2 ],Line [3 ],Line [4 ],Line [6 ],Line [- 1 ].split (';' )[2 ].replace ('gene_name "' ,'' ).replace ('"' ,'' ).strip ()
88
- if Type == 'gene' :
89
- if strand == '+' :
90
- strand = '1'
109
+ Line = line .strip ('\n ' ).split ('\t ' )
110
+ chrn , Type , start , end , strand , gene = Line [0 ], Line [2 ], Line [3 ], Line [4 ], Line [6 ], Line [- 1 ].split (';' )[
111
+ 2 ].replace ('gene_name "' , '' ).replace ('"' , '' ).strip ()
112
+ if Type == 'gene' :
113
+ if strand == '+' :
114
+ strand = '1'
91
115
else :
92
- strand = '-1'
93
- gene_coordinates = '\t ' .join ([chr ,strand ,start ,end ,gene ,'\n ' ])
116
+ strand = '-1'
117
+ gene_coordinates = '\t ' .join ([chrn , strand , start , end , gene , '\n ' ])
118
+ if gene in synonyms_dict :
119
+ gene = synonyms_dict [gene ]
94
120
if gene not in GeneCoordinate_dict :
95
- GeneCoordinate_dict [gene ]= [ chr , strand ,start ,end ,gene ]
121
+ GeneCoordinate_dict [gene ] = [ chrn , strand , start , end , gene ]
96
122
Out .write (gene_coordinates )
97
123
Out .close ()
98
124
return GeneCoordinate_dict
99
125
100
126
101
-
102
- def Get_fusionGene_seq (GenesU ,GenesV ,geneCoordniates_dict ,reference_fa = '' ):
103
- if not (os .path .exists (reference_fa ) or os .path .exists (os .curdir + '/' + reference_fa )):
127
+ def Get_fusionGene_seq (GenesU , GenesV , geneCoordniates_dict , reference_fa , od ):
128
+ if not (os .path .exists (reference_fa ) or os .path .exists (os .curdir + '/' + reference_fa )):
104
129
print ('Error: There is no reference fasta files' )
105
130
exit (1 )
106
- genome_name = os .path .basename (reference_fa )
131
+ genome_name = os .path .basename (reference_fa )
107
132
if not os .path .exists (genome_name ):
108
- genome = os .system ('ln -s %s %s' % ( reference_fa , genome_name ))
109
- genome_index = os . system ( 'samtools faidx %s' % genome_name )
110
- genome = FastaFile ( genome_name )
111
-
133
+ os .system ('ln -s %s %s' % ( os . path . abspath ( os . curdir + '/' + reference_fa ), od + '/' + genome_name ))
134
+ # fasta index
135
+ os . system ( 'samtools faidx %s' % od + '/' + genome_name )
136
+ genome = FastaFile ( od + '/' + genome_name )
112
137
113
- fusiongenes_ref_U = open ('fusion_total_index /fusiongenes_ref_U.fa' , 'w' )
138
+ fusiongenes_ref_U = open ('{od} /fusiongenes_ref_U.fa' . format ( od = od ), 'w' )
114
139
for gene in GenesU :
115
- chr , strand , start , end = None ,None ,None ,None
140
+ chrn , strand , start , end = None , None , None , None
116
141
try :
117
- chr , strand , start , end , gene = geneCoordniates_dict [gene ]
142
+ chrn , strand , start , end , gene = geneCoordniates_dict [gene ]
118
143
except KeyError as e :
119
- print ('%s: Input gene name wasnot found in Gtf,Check gene names' % e )
144
+ print ('%s: Input gene name wasnot found in Gtf, Check gene names' % e )
120
145
exit (1 )
121
- if strand != None :
146
+ if strand :
122
147
if strand == '1' :
123
- seq = genome .fetch (reference = chr , start = int (start ), end = int (end ))
148
+ seq = genome .fetch (reference = chrn , start = int (start ), end = int (end ))
124
149
else :
125
- seq_plus = genome .fetch (reference = chr , start = int (start ), end = int (end ))
150
+ seq_plus = genome .fetch (reference = chrn , start = int (start ), end = int (end ))
126
151
trantab = str .maketrans ('ACGTacgtNn' , 'TGCAtgcaNn' )
127
152
seq = seq_plus .translate (trantab )
128
153
seq = seq [::- 1 ]
129
- fusiongenes_ref_U .write ('>%s \n ' % gene )
154
+ fusiongenes_ref_U .write ('>%s\n ' % gene )
130
155
for line in re .findall (r'.{60}' , seq ):
131
156
fusiongenes_ref_U .write ('%s\n ' % line )
132
157
fusiongenes_ref_U .close ()
133
158
134
-
135
- fusiongenes_ref_V = open ('fusion_total_index/fusiongenes_ref_V.fa' ,'w' )
159
+ fusiongenes_ref_V = open ('{od}/fusiongenes_ref_V.fa' .format (od = od ), 'w' )
136
160
for gene in GenesV :
137
- chr , strand , start , end = None , None , None , None
161
+ chrn , strand , start , end = None , None , None , None
138
162
try :
139
- chr , strand , start , end , gene = geneCoordniates_dict [gene ]
163
+ chrn , strand , start , end , gene = geneCoordniates_dict [gene ]
140
164
except KeyError as e :
141
- print ('%s: Input gene name wasnot found in Gtf,Check gene names' % e )
165
+ print ('%s: Input gene name wasnot found in Gtf, Check gene names' % e )
142
166
exit (1 )
143
- if strand != None :
167
+ if strand :
144
168
if strand == '1' :
145
- seq = genome .fetch (reference = chr , start = int (start ), end = int (end ))
169
+ seq = genome .fetch (reference = chrn , start = int (start ), end = int (end ))
146
170
else :
147
- seq_plus = genome .fetch (reference = chr , start = int (start ), end = int (end ))
171
+ seq_plus = genome .fetch (reference = chrn , start = int (start ), end = int (end ))
148
172
trantab = str .maketrans ('ACGTacgtNn' , 'TGCAtgcaNn' )
149
173
seq = seq_plus .translate (trantab )
150
174
seq = seq [::- 1 ]
151
- fusiongenes_ref_V .write ('>%s \n ' % gene )
175
+ fusiongenes_ref_V .write ('>%s\n ' % gene )
152
176
for line in re .findall (r'.{60}' , seq ):
153
177
fusiongenes_ref_V .write ('%s\n ' % line )
154
178
fusiongenes_ref_V .close ()
155
179
return 0
156
180
157
181
158
-
159
- def main (fusion_tab ,gtf ,ref_genome ):
160
- if not os .path .exists ('fusion_total_index' ):
161
- os .mkdir ('fusion_total_index' )
182
+ def main (fusion_tab , gtf , ref_genome , od ):
183
+ if not os .path .exists (od ):
184
+ os .mkdir (od )
162
185
graph = getgraph (fusion_tab )
163
- bipgraph = find_bool (graph )
164
- Flag = False
186
+ bipgraph = find_bool (graph )
187
+ Flag = False
165
188
if bipgraph is not None :
166
- if ( len (bipgraph [0 ])+ len (bipgraph [1 ])== len (graph .keys () )):
167
- Flag = True
189
+ if len (bipgraph [0 ]) + len (bipgraph [1 ]) == len (graph .keys ()):
190
+ Flag = True
168
191
genesU , genesV = bipgraph [0 ], bipgraph [1 ]
169
- geneCoordinate = Get_geneCoordinate (gtf )
170
- fusion_seqs = Get_fusionGene_seq (genesU , genesV , geneCoordinate , ref_genome )
192
+ geneCoordinate = Get_geneCoordinate (gtf , od )
193
+ fusion_seqs = Get_fusionGene_seq (genesU , genesV , geneCoordinate , ref_genome , od )
171
194
if fusion_seqs == 0 :
172
195
print ("Sucessfully get the fusion genes sequence " )
173
196
else :
174
197
print ("Failed to get the fusion genes sequence" )
175
198
exit (1 )
199
+ os .system ('hisat2-build --quiet {od}/fusiongenes_ref_U.fa {od}/fusiongenes_ref_U' .format (od = od ))
200
+ os .system ('hisat2-build --quiet {od}/fusiongenes_ref_V.fa {od}/fusiongenes_ref_V' .format (od = od ))
176
201
if not Flag :
177
202
print ("Failed to build the bipartite graph pairs" )
178
203
else :
179
204
print ("Sucessfully building the bipartite graph pairs" )
180
205
181
206
182
-
183
-
184
-
185
-
186
207
if __name__ == '__main__' :
187
208
os .chdir (os .path .abspath (os .path .dirname (__file__ )))
188
209
usage_string = '''
189
- Usage: python Preprocess .py -genome <genome-reference-path> -gtf <gtf-reference-path> -tab <fusion-table>
210
+ Usage: python build_graph .py -- genome <genome-reference-path> -- gtf <gtf-reference-path> - -tab <fusion-table>
190
211
Arguments:
191
212
Required:
192
- -genome <genome-reference-path>
193
- genome reference fasta file, default: Homo_sapiens.GRCh38.dna.primary_assembly.fa
194
- -gtf <gtf-reference-path>
195
- gene transfer format file default: Homo_sapiens.GRCh38.95.gtf
196
- -tab <fusion-table>
197
- Inputed fusion gene pairs table,file format should be tsv, fusion pairs Gene pairs should be connected with -- not -, default: fusion_total_index/fusion_table.tsv
213
+ -g/--genome <genome-reference-path>
214
+ genome reference fasta file
215
+ -a/--gtf <gtf-reference-path>
216
+ gene annotation GTF format file
217
+ Optional:
218
+ -t/--tab <fusion-table>
219
+ Inputed fusion gene pairs table,file format should be tsv, fusion pairs Gene pairs should be connected with --, default: reference_fusion_info/fusion_table.tsv
220
+ -o/--outdir <outdir>
221
+ output directory fusion reference information and index
198
222
Other:
199
223
-h --help
200
224
help information
201
225
'''
202
- if len (sys .argv [1 :])== 0 :
226
+ if len (sys .argv [1 :]) == 0 :
203
227
print (usage_string )
204
228
sys .exit (1 )
205
229
try :
206
- opts , args = getopt .getopt (sys .argv [1 :], "h" , ["help" ,"genome=" ,"gtf=" ,"tab=" ])
230
+ opts , args = getopt .getopt (sys .argv [1 :], "hg:a:t:o:" , ["help" , "genome=" , "gtf=" , "tab=" , "outdir =" ])
207
231
except getopt .GetoptError as err :
208
232
print (err )
209
233
sys .exit (1 )
210
234
211
- genome = 'Homo_sapiens.GRCh38.dna.primary_assembly.fa'
212
- gtf = 'Homo_sapiens.GRCh38.95.gtf'
213
- fusion_tab = 'fusion_total_index/fusion_table.tsv'
235
+ genome = 'Homo_sapiens.GRCh38.dna.primary_assembly.fa'
236
+ gtf = 'Homo_sapiens.GRCh38.95.gtf'
237
+ fusion_tab = 'reference_fusion_info/fusion_table.tsv'
238
+ outdir = 'fusion_total_index'
214
239
215
240
for opt , arg in opts :
216
241
if opt in ("-h" , '--help' ):
217
242
print (usage_string )
218
243
sys .exit ()
219
- elif (opt == "--genome" ):
220
- genome = arg
221
- elif (opt == "--gtf" ):
222
- gtf = arg
223
- elif (opt == "--tab" ):
224
- fusion_tab = arg
225
- else :
226
- assert False , "unhandled option"
227
- main (fusion_tab ,gtf ,genome )
228
- movement = os .system ("mv %s fusion_total_index/" % (fusion_tab ))
229
- if movement == 0 :
244
+
245
+ if opt in ("-g" , "--genome" ):
246
+ genome = arg
247
+ if opt in ("-a" , "--gtf" ):
248
+ gtf = arg
249
+ if opt in ("-t" , "--tab" ):
250
+ fusion_tab = arg
251
+ if opt in ("-o" , "--outdir" ):
252
+ if arg != '' :
253
+ outdir = arg
254
+
255
+ print ("Outdir is {od}!" .format (od = outdir ))
256
+ main (fusion_tab , gtf , genome , outdir )
257
+ movement = os .system ("cp {ft} {od}/" .format (ft = fusion_tab , od = outdir ))
258
+ if movement == 0 :
230
259
print ("All processes done!" )
231
260
else :
232
- print ("Fail to move fusion_tab into fusion_total_index" )
261
+ print ("Fail to move fusion_tab into {od}" . format ( od = outdir ) )
233
262
exit (1 )
234
-
235
-
236
-
237
-
238
-
239
-
240
-
0 commit comments