Skip to content

Commit eac0e92

Browse files
committed
replace "-" with "--".
fix build_graph.py.
1 parent 7430f38 commit eac0e92

10 files changed

+1055
-1761808
lines changed

build_graph.py

+121-99
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,31 @@
11
#!/usr/bin/env python3
22

33
import re
4-
import os,getopt
5-
import time
4+
import os
5+
import getopt
66
import sys
77
from pysam import FastaFile
88

99

10-
1110
def getgraph(location):
12-
'''
11+
"""
1312
Load the fusion gene table from the local file
1413
:param location: a string of fusion gene table location
1514
:return: a dict of fusion genes
16-
'''
17-
if not (os.path.exists(os.curdir+'/'+location) or os.path.exists(location)):
18-
print('Error: There is no %s' %location)
15+
"""
16+
if not (os.path.exists(os.curdir + '/' + location) or os.path.exists(location)):
17+
print('Error: There is no %s' % location)
1918
exit(1)
2019
with open(location, 'r') as f:
2120
edge = {}
22-
nodes = []
2321
for line in f:
2422
if not line.startswith('#'):
2523
try:
2624
temp = line.strip('\n').split('\t')[0].split('--')
2725
start = temp[0]
2826
end = temp[1]
2927
except IndexError as e:
30-
print("%s: gene pairs placed in first columns should be joined with '--' not '-' "%e)
28+
print("%s: gene pairs placed in first columns should be joined with '--' not '-' " % e)
3129
exit(1)
3230

3331
if start in edge:
@@ -41,17 +39,18 @@ def getgraph(location):
4139
edge[end] = [start]
4240
return edge
4341

42+
4443
def find_bool(graph):
4544
thisside = set()
4645
thatside = set()
4746
nodes = list(graph.keys())
4847
thisside.add(nodes[0])
49-
while(nodes!= []):
48+
while nodes:
5049
edges_count = sum([len(graph[node]) for node in nodes])
5150
if edges_count == 0:
5251
break
5352
for start in graph.keys():
54-
if graph[start] == []:
53+
if not graph[start]:
5554
continue
5655
if start in thisside:
5756
for side in graph[start]:
@@ -70,171 +69,194 @@ def find_bool(graph):
7069
return list(thisside), list(thatside)
7170

7271

73-
74-
75-
76-
77-
def Get_geneCoordinate(gtf):
78-
if not (os.path.exists(gtf) or (os.path.exists(os.curdir+'/'+gtf))):
79-
print("there is no gtf file,please check the path")
72+
def Get_geneCoordinate(gtf, od):
73+
synonyms_dict = {"SEPTIN5": "SEPT5",
74+
"SIKE1": "SIKE",
75+
"AC011462.1": "TGFB1",
76+
"SEPTIN9": "SEPT9",
77+
"SEPTIN6": "SEPT6",
78+
"H4C9": "HIST1H4I",
79+
"GET1": "WRB",
80+
"H2BC4": "HIST1H2BC",
81+
"SEPTIN11": "SEPT11",
82+
"ZFTA": "C11orf95",
83+
"H2AC17": "HIST1H2AM",
84+
"HLA-DMA": "HLA_DMA",
85+
"HLA-DMB": "HLA_DMB",
86+
"RNF213": "AC124319.1",
87+
"PEDS1": "TMEM189",
88+
"DYNLT2B": "TCTEX1D2",
89+
"TGFB1": "AC011462.1",
90+
"SEPTIN2": "SEPT2",
91+
"SEPTIN8": "SEPT8",
92+
"ERVK3-1": "ERVK3_1",
93+
"HI-6": "HIST1H1T",
94+
"CEP43": "FGFR1OP",
95+
"H3C12": "HIST1H3J",
96+
"CFAP251": "WDR66",
97+
"MAGEA5P": "MAGEA5",
98+
"BMERB1": "C16orf45",
99+
"EZHIP": "CXorf67",
100+
"H1-6": "HIST1H1T"}
101+
if not (os.path.exists(gtf) or (os.path.exists(os.curdir + '/' + gtf))):
102+
print("there is no gtf file, please check the path")
80103
exit(1)
81-
Out=open('fusion_genes_coordinate.txt','w')
82-
GeneCoordinate_dict={}
83-
with open(gtf,'r') as f:
104+
Out = open('{od}/fusion_genes_coordinate.txt'.format(od=od), 'w')
105+
GeneCoordinate_dict = {}
106+
with open(gtf, 'r') as f:
84107
for line in f:
85108
if not line.startswith('#'):
86-
Line=line.strip('\n').split('\t')
87-
chr,Type,start,end,strand,gene=Line[0],Line[2],Line[3],Line[4],Line[6],Line[-1].split(';')[2].replace('gene_name "','').replace('"','').strip()
88-
if Type=='gene':
89-
if strand=='+':
90-
strand='1'
109+
Line = line.strip('\n').split('\t')
110+
chrn, Type, start, end, strand, gene = Line[0], Line[2], Line[3], Line[4], Line[6], Line[-1].split(';')[
111+
2].replace('gene_name "', '').replace('"', '').strip()
112+
if Type == 'gene':
113+
if strand == '+':
114+
strand = '1'
91115
else:
92-
strand='-1'
93-
gene_coordinates='\t'.join([chr,strand,start,end,gene,'\n'])
116+
strand = '-1'
117+
gene_coordinates = '\t'.join([chrn, strand, start, end, gene, '\n'])
118+
if gene in synonyms_dict:
119+
gene = synonyms_dict[gene]
94120
if gene not in GeneCoordinate_dict:
95-
GeneCoordinate_dict[gene]=[chr,strand,start,end,gene]
121+
GeneCoordinate_dict[gene] = [chrn, strand, start, end, gene]
96122
Out.write(gene_coordinates)
97123
Out.close()
98124
return GeneCoordinate_dict
99125

100126

101-
102-
def Get_fusionGene_seq(GenesU,GenesV,geneCoordniates_dict,reference_fa=''):
103-
if not (os.path.exists(reference_fa) or os.path.exists(os.curdir+'/'+reference_fa)):
127+
def Get_fusionGene_seq(GenesU, GenesV, geneCoordniates_dict, reference_fa, od):
128+
if not (os.path.exists(reference_fa) or os.path.exists(os.curdir + '/' + reference_fa)):
104129
print('Error: There is no reference fasta files')
105130
exit(1)
106-
genome_name=os.path.basename(reference_fa)
131+
genome_name = os.path.basename(reference_fa)
107132
if not os.path.exists(genome_name):
108-
genome=os.system('ln -s %s %s'%(reference_fa,genome_name))
109-
genome_index=os.system('samtools faidx %s'%genome_name)
110-
genome=FastaFile(genome_name)
111-
133+
os.system('ln -s %s %s' % (os.path.abspath(os.curdir + '/' + reference_fa), od + '/' + genome_name))
134+
# fasta index
135+
os.system('samtools faidx %s' % od + '/' + genome_name)
136+
genome = FastaFile(od + '/' + genome_name)
112137

113-
fusiongenes_ref_U=open('fusion_total_index/fusiongenes_ref_U.fa','w')
138+
fusiongenes_ref_U = open('{od}/fusiongenes_ref_U.fa'.format(od=od), 'w')
114139
for gene in GenesU:
115-
chr, strand, start, end=None,None,None,None
140+
chrn, strand, start, end = None, None, None, None
116141
try:
117-
chr, strand, start, end, gene = geneCoordniates_dict[gene]
142+
chrn, strand, start, end, gene = geneCoordniates_dict[gene]
118143
except KeyError as e:
119-
print('%s: Input gene name wasnot found in Gtf,Check gene names'%e)
144+
print('%s: Input gene name wasnot found in Gtf, Check gene names' % e)
120145
exit(1)
121-
if strand!=None:
146+
if strand:
122147
if strand == '1':
123-
seq = genome.fetch(reference=chr, start=int(start), end=int(end))
148+
seq = genome.fetch(reference=chrn, start=int(start), end=int(end))
124149
else:
125-
seq_plus = genome.fetch(reference=chr, start=int(start), end=int(end))
150+
seq_plus = genome.fetch(reference=chrn, start=int(start), end=int(end))
126151
trantab = str.maketrans('ACGTacgtNn', 'TGCAtgcaNn')
127152
seq = seq_plus.translate(trantab)
128153
seq = seq[::-1]
129-
fusiongenes_ref_U.write('>%s \n' % gene)
154+
fusiongenes_ref_U.write('>%s\n' % gene)
130155
for line in re.findall(r'.{60}', seq):
131156
fusiongenes_ref_U.write('%s\n' % line)
132157
fusiongenes_ref_U.close()
133158

134-
135-
fusiongenes_ref_V=open('fusion_total_index/fusiongenes_ref_V.fa','w')
159+
fusiongenes_ref_V = open('{od}/fusiongenes_ref_V.fa'.format(od=od), 'w')
136160
for gene in GenesV:
137-
chr, strand, start, end = None, None, None, None
161+
chrn, strand, start, end = None, None, None, None
138162
try:
139-
chr, strand, start, end, gene = geneCoordniates_dict[gene]
163+
chrn, strand, start, end, gene = geneCoordniates_dict[gene]
140164
except KeyError as e:
141-
print('%s: Input gene name wasnot found in Gtf,Check gene names'%e)
165+
print('%s: Input gene name wasnot found in Gtf, Check gene names' % e)
142166
exit(1)
143-
if strand != None:
167+
if strand:
144168
if strand == '1':
145-
seq = genome.fetch(reference=chr, start=int(start), end=int(end))
169+
seq = genome.fetch(reference=chrn, start=int(start), end=int(end))
146170
else:
147-
seq_plus = genome.fetch(reference=chr, start=int(start), end=int(end))
171+
seq_plus = genome.fetch(reference=chrn, start=int(start), end=int(end))
148172
trantab = str.maketrans('ACGTacgtNn', 'TGCAtgcaNn')
149173
seq = seq_plus.translate(trantab)
150174
seq = seq[::-1]
151-
fusiongenes_ref_V.write('>%s \n' % gene)
175+
fusiongenes_ref_V.write('>%s\n' % gene)
152176
for line in re.findall(r'.{60}', seq):
153177
fusiongenes_ref_V.write('%s\n' % line)
154178
fusiongenes_ref_V.close()
155179
return 0
156180

157181

158-
159-
def main(fusion_tab,gtf,ref_genome):
160-
if not os.path.exists('fusion_total_index'):
161-
os.mkdir('fusion_total_index')
182+
def main(fusion_tab, gtf, ref_genome, od):
183+
if not os.path.exists(od):
184+
os.mkdir(od)
162185
graph = getgraph(fusion_tab)
163-
bipgraph=find_bool(graph)
164-
Flag=False
186+
bipgraph = find_bool(graph)
187+
Flag = False
165188
if bipgraph is not None:
166-
if (len(bipgraph[0])+len(bipgraph[1])==len(graph.keys())):
167-
Flag=True
189+
if len(bipgraph[0]) + len(bipgraph[1]) == len(graph.keys()):
190+
Flag = True
168191
genesU, genesV = bipgraph[0], bipgraph[1]
169-
geneCoordinate = Get_geneCoordinate(gtf)
170-
fusion_seqs = Get_fusionGene_seq(genesU, genesV, geneCoordinate, ref_genome)
192+
geneCoordinate = Get_geneCoordinate(gtf, od)
193+
fusion_seqs = Get_fusionGene_seq(genesU, genesV, geneCoordinate, ref_genome, od)
171194
if fusion_seqs == 0:
172195
print("Sucessfully get the fusion genes sequence ")
173196
else:
174197
print("Failed to get the fusion genes sequence")
175198
exit(1)
199+
os.system('hisat2-build --quiet {od}/fusiongenes_ref_U.fa {od}/fusiongenes_ref_U'.format(od=od))
200+
os.system('hisat2-build --quiet {od}/fusiongenes_ref_V.fa {od}/fusiongenes_ref_V'.format(od=od))
176201
if not Flag:
177202
print("Failed to build the bipartite graph pairs")
178203
else:
179204
print("Sucessfully building the bipartite graph pairs")
180205

181206

182-
183-
184-
185-
186207
if __name__ == '__main__':
187208
os.chdir(os.path.abspath(os.path.dirname(__file__)))
188209
usage_string = '''
189-
Usage: python Preprocess.py -genome <genome-reference-path> -gtf <gtf-reference-path> -tab <fusion-table>
210+
Usage: python build_graph.py --genome <genome-reference-path> --gtf <gtf-reference-path> --tab <fusion-table>
190211
Arguments:
191212
Required:
192-
-genome <genome-reference-path>
193-
genome reference fasta file, default: Homo_sapiens.GRCh38.dna.primary_assembly.fa
194-
-gtf <gtf-reference-path>
195-
gene transfer format file default: Homo_sapiens.GRCh38.95.gtf
196-
-tab <fusion-table>
197-
Inputed fusion gene pairs table,file format should be tsv, fusion pairs Gene pairs should be connected with -- not -, default: fusion_total_index/fusion_table.tsv
213+
-g/--genome <genome-reference-path>
214+
genome reference fasta file
215+
-a/--gtf <gtf-reference-path>
216+
gene annotation GTF format file
217+
Optional:
218+
-t/--tab <fusion-table>
219+
Inputed fusion gene pairs table,file format should be tsv, fusion pairs Gene pairs should be connected with --, default: reference_fusion_info/fusion_table.tsv
220+
-o/--outdir <outdir>
221+
output directory fusion reference information and index
198222
Other:
199223
-h --help
200224
help information
201225
'''
202-
if len(sys.argv[1:])==0:
226+
if len(sys.argv[1:]) == 0:
203227
print(usage_string)
204228
sys.exit(1)
205229
try:
206-
opts, args = getopt.getopt(sys.argv[1:], "h",["help","genome=","gtf=","tab="])
230+
opts, args = getopt.getopt(sys.argv[1:], "hg:a:t:o:", ["help", "genome=", "gtf=", "tab=", "outdir="])
207231
except getopt.GetoptError as err:
208232
print(err)
209233
sys.exit(1)
210234

211-
genome='Homo_sapiens.GRCh38.dna.primary_assembly.fa'
212-
gtf='Homo_sapiens.GRCh38.95.gtf'
213-
fusion_tab='fusion_total_index/fusion_table.tsv'
235+
genome = 'Homo_sapiens.GRCh38.dna.primary_assembly.fa'
236+
gtf = 'Homo_sapiens.GRCh38.95.gtf'
237+
fusion_tab = 'reference_fusion_info/fusion_table.tsv'
238+
outdir = 'fusion_total_index'
214239

215240
for opt, arg in opts:
216241
if opt in ("-h", '--help'):
217242
print(usage_string)
218243
sys.exit()
219-
elif (opt =="--genome"):
220-
genome=arg
221-
elif (opt =="--gtf"):
222-
gtf=arg
223-
elif (opt =="--tab"):
224-
fusion_tab=arg
225-
else:
226-
assert False, "unhandled option"
227-
main(fusion_tab,gtf,genome)
228-
movement=os.system("mv %s fusion_total_index/"%(fusion_tab))
229-
if movement==0:
244+
245+
if opt in ("-g", "--genome"):
246+
genome = arg
247+
if opt in ("-a", "--gtf"):
248+
gtf = arg
249+
if opt in ("-t", "--tab"):
250+
fusion_tab = arg
251+
if opt in ("-o", "--outdir"):
252+
if arg != '':
253+
outdir = arg
254+
255+
print("Outdir is {od}!".format(od=outdir))
256+
main(fusion_tab, gtf, genome, outdir)
257+
movement = os.system("cp {ft} {od}/".format(ft=fusion_tab, od=outdir))
258+
if movement == 0:
230259
print("All processes done!")
231260
else:
232-
print("Fail to move fusion_tab into fusion_total_index")
261+
print("Fail to move fusion_tab into {od}".format(od=outdir))
233262
exit(1)
234-
235-
236-
237-
238-
239-
240-

0 commit comments

Comments
 (0)