Skip to content

Commit 33c2079

Browse files
speedup - counting only necessary distances
1 parent e970cd6 commit 33c2079

File tree

3 files changed

+146
-13
lines changed

3 files changed

+146
-13
lines changed

src/scripts/scaffolding/match_graph.py

+22-1
Original file line numberDiff line numberDiff line change
@@ -315,4 +315,25 @@ def isHomologousPath (self, paths, lens):
315315
self.logger.debug (f"Found homologous paths {paths} with homology size {hom_size}")
316316
return True
317317
else:
318-
return False
318+
return False
319+
320+
def getHomologousPaths(self, path_storage, path_id):
321+
homologous_paths = []
322+
homologous_nodes = set()
323+
for node in path_storage.getEdgeSequenceById(path_id):
324+
if node in self.matchGraph.nodes:
325+
for hom_node in self.matchGraph.neighbors(node):
326+
if self.matchGraph.edges[node, hom_node]['weight'] < 0:
327+
homologous_nodes.add(hom_node)
328+
self.logger.debug(f"For path {path_id} counted homologous nodes {homologous_nodes}")
329+
paths_to_check = set()
330+
for node in homologous_nodes:
331+
paths_to_check.update(path_storage.getPathsFromNode(node))
332+
self.logger.debug(f"For path {path_id} suspicious homologous paths {paths_to_check}")
333+
334+
for susp_path_id in paths_to_check:
335+
if self.isHomologousPath([path_storage.getPathById(path_id), path_storage.getPathById(susp_path_id)], [path_storage.getLength(path_id), path_storage.getLength(susp_path_id)]):
336+
homologous_paths.append(susp_path_id)
337+
self.logger.debug(f"For path {path_id} found {len(paths_to_check)} homologous paths {paths_to_check}")
338+
339+
return homologous_paths

src/scripts/scaffolding/path_storage.py

+16
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ def __init__(self, G):
2222
self.G = G
2323
self.pairwise_dists = {}
2424

25+
self.node_to_paths = {}
26+
self.node_to_paths_counted = False
27+
2528
def getPathById(self, path_id):
2629
return self.paths[path_id]
2730

@@ -150,3 +153,16 @@ def writePathAsFasta(self, input_fasta, output_fasta):
150153
else:
151154
overlap = 0
152155
f.write(f"\n")
156+
157+
def getPathsFromNode(self, node_id):
158+
if not self.node_to_paths_counted:
159+
for path_id in self.paths:
160+
for node in self.getEdgeSequenceById(path_id):
161+
if not node in self.node_to_paths:
162+
self.node_to_paths[node] = []
163+
self.node_to_paths[node].append(path_id)
164+
self.node_to_paths_counted = True
165+
if node_id in self.node_to_paths:
166+
return self.node_to_paths[node_id]
167+
else:
168+
return []

src/scripts/scaffolding/scaffold_graph.py

+108-12
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ class ScaffoldGraph:
103103
#Just too long/far
104104
TOO_FAR = 1000000000
105105

106+
106107
#efficiantly it is so because of bwa behaviour on XA tags but not used directly
107108
MAX_ALIGNMENTS = 6
108109

@@ -112,6 +113,10 @@ class ScaffoldGraph:
112113
#to check whether paths have similar lengths, in cheating t2t connection
113114
SIMILAR_LEN_FRAC = 0.7
114115

116+
#Constants to switch off distance counting for huge graphs
117+
MAX_GRAPH_FOR_DISTANCES = 1000000
118+
MAX_COMPONENT_FOR_DISTANCES = 50000
119+
115120
def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, matches_file, G, uncompressed_fasta, path_mashmap, porec, logger):
116121
self.logger = logger_wrap.UpdatedAdapter(logger, self.__class__.__name__)
117122
self.rukki_paths = rukki_paths
@@ -132,9 +137,11 @@ def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, mat
132137
all_connections, unique_connections = self.get_connections_porec(hic_alignment_file, True)
133138
else:
134139
self.logger.info ("Loading Hi-C alignments")
135-
all_connections, unique_connections = self.get_connections_bam(hic_alignment_file, True)
136-
self.output_basename = "scaff_rukki.paths"
140+
all_connections, unique_connections = self.get_connections_bam(hic_alignment_file, True)
141+
self.all_connections = all_connections
142+
self.unique_connections = unique_connections
137143

144+
self.output_basename = "scaff_rukki.paths"
138145
telomeric_ends = self.getTelomericEnds()
139146
self.hic_alignment_file = hic_alignment_file
140147

@@ -166,12 +173,25 @@ def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, mat
166173
#possilby unefficient but whelp
167174
scores = {}
168175
unique_scores = {}
169-
self.dists = dict(nx.all_pairs_dijkstra_path_length(self.upd_G, weight=lambda u, v, d: self.upd_G.edges[u, v]['mid_length']))
170-
self.logger.info("Pairwise distances in assembly graph calculated")
176+
self.dists = {}
177+
if G.number_of_nodes() > ScaffoldGraph.MAX_GRAPH_FOR_DISTANCES:
178+
self.logger.info(f"Graph is too big {G.number_of_nodes()}, not calculating pairwise distances")
179+
else:
180+
max_comp_size = len(max(nx.weakly_connected_components(G), key=len))
181+
if max_comp_size > ScaffoldGraph.MAX_COMPONENT_FOR_DISTANCES:
182+
self.logger.info(f"Biggest component is too big {len(max_comp_size)}, not calculating pairwise distances")
183+
else:
184+
self.logger.info("Calculating pairwise distances for assembly graph nodes")
185+
self.dists = dict(nx.all_pairs_dijkstra_path_length(self.upd_G, weight=lambda u, v, d: self.upd_G.edges[u, v]['mid_length']))
186+
self.logger.info("Pairwise distances in assembly graph calculated")
187+
171188
self.haploids = self.getHaploidPaths(self.rukki_paths)
172189
#bam should be prefiltered
173190
#all_connections = self.get_connections_bam("../", True)
174191

192+
self.scores = {}
193+
self.unique_scores = {}
194+
'''
175195
for from_path_id in self.rukki_paths.getPathIds():
176196
scores[from_path_id] = {}
177197
unique_scores[from_path_id] = {}
@@ -195,11 +215,9 @@ def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, mat
195215
or_from_path_id = from_path_id + from_dir
196216
or_to_path_id = to_path_id + to_dir
197217
self.scaffold_graph.add_edge(or_from_path_id, or_to_path_id, weight = scores[from_path_id][to_path_id][from_dir + to_dir], unique_weight = unique_scores[from_path_id][to_path_id][from_dir + to_dir])
198-
if self.rukki_paths.getLength(from_path_id) >= ScaffoldGraph.MIN_PATH_TO_SCAFFOLD and self.rukki_paths.getLength(from_path_id) >= ScaffoldGraph.MIN_PATH_TO_SCAFFOLD:
218+
if self.rukki_paths.getLength(from_path_id) >= ScaffoldGraph.MIN_PATH_TO_SCAFFOLD and self.rukki_paths.getLength(from_path_id) >= ScaffoldGraph.MIN_PATH_TO_SCAFFOLD:
199219
self.logger.debug (f"Final not-unique scores {from_path_id} {to_path_id} {scores[from_path_id][to_path_id]}")
200-
self.logger.debug (f"Final unique scores {from_path_id} {to_path_id} {unique_scores[from_path_id][to_path_id]}")
201-
202-
#TODO: move all rc_<smth> somewhere, not right place
220+
self.logger.debug (f"Final unique scores {from_path_id} {to_path_id} {unique_scores[from_path_id][to_path_id]}")'''
203221

204222

205223
def isHaploidPath(self, paths, nor_path_id):
@@ -393,6 +411,50 @@ def forbiddenPair(self, from_path_id, to_path_id):
393411
return True
394412
return False
395413

414+
415+
def getScores(self, or_path_id, path_storage, type):
416+
res = []
417+
if type == "weight":
418+
cur_scores = self.scores
419+
connections = self.all_connections
420+
elif type == "unique_weight":
421+
cur_scores = self.unique_scores
422+
connections = self.unique_connections
423+
else:
424+
self.logger.error(f"Unknown type {type}")
425+
return res
426+
nor_path_id = gf.nor_path_id(or_path_id)
427+
precounted = True
428+
429+
homologous_paths = self.match_graph.getHomologousPaths(path_storage, nor_path_id)
430+
#Counting only necessary scores, cashing in global dict
431+
if not (or_path_id in cur_scores):
432+
cur_scores[or_path_id] = {}
433+
cur_scores[gf.rc_path_id(or_path_id)] = {}
434+
precounted = False
435+
for next_or_path in self.scaffold_graph.nodes():
436+
#Counting all orientations together so no need to double count
437+
if next_or_path[-1] == "-":
438+
continue
439+
next_nor_path = gf.nor_path_id(next_or_path)
440+
#TODO: optimize homology counting, once per all paths
441+
if next_nor_path in homologous_paths:
442+
continue
443+
else:
444+
if not precounted:
445+
nor_scores = self.getPathPairConnections([nor_path_id, next_nor_path], connections, self.uncompressed_lens)
446+
for from_dir in ('-', '+'):
447+
for to_dir in ('-', '+'):
448+
or_from_path_id = nor_path_id + from_dir
449+
or_to_path_id = next_nor_path + to_dir
450+
if nor_scores[from_dir + to_dir] > 0:
451+
cur_scores[or_from_path_id][or_to_path_id] = nor_scores[from_dir + to_dir]
452+
for next_or_path in self.scaffold_graph.nodes():
453+
if next_or_path in cur_scores[or_path_id]:
454+
res.append([next_or_path, cur_scores[or_path_id][next_or_path]])
455+
res.sort(key=lambda x: x[1], reverse=True)
456+
return res
457+
396458
#Main logic is here!
397459
# returns NONE/DONE/next_path_id
398460
#type: weight/unique_weight
@@ -404,11 +466,21 @@ def findExtension(self, cur_path_id, type):
404466
if self.scaffold_graph.nodes[cur_path_id]['telomere'][1]:
405467
self.logger.info (f"Stopped at the telomere")
406468
return "DONE",0
469+
'''
407470
for next_node in self.scaffold_graph.successors(cur_path_id):
408471
#specific hacks to avoid
409472
local_scores.append([next_node, self.scaffold_graph.edges[cur_path_id, next_node][type]])
410-
411473
local_scores.sort(key=lambda x: x[1], reverse=True)
474+
alt_scores = self.getScores(cur_path_id, self.rukki_paths, type)
475+
476+
if len(alt_scores) > 0 and alt_scores[0]!= local_scores[0]:
477+
self.logger.warning(f"Difference! local scores {local_scores[0]} alt scores {alt_scores[0]}")
478+
for i in range (0,10):
479+
self.logger.warning(f"Alt scores {i} {alt_scores[i]}")
480+
elif len(alt_scores) == 0 and len(local_scores) > 0:
481+
self.logger.warning(f"Local scores but no alt scores, best extension {local_scores[0]}")
482+
'''
483+
local_scores = self.getScores(cur_path_id, self.rukki_paths, type)
412484
best_ind = -1
413485
second_best_ind = -1
414486
for i in range (0, len(local_scores)):
@@ -504,10 +576,12 @@ def generateScaffolds(self):
504576
break
505577
else:
506578
hom_before = False
579+
nor_new_path_id = gf.nor_path_id(next_path_id)
580+
homologous_paths = self.match_graph.getHomologousPaths(self.rukki_paths, gf.nor_path_id(nor_new_path_id))
507581
for prev_path_id in cur_scaffold:
508-
nor_new_path_id = gf.nor_path_id(next_path_id)
509582
nor_prev_path_id = gf.nor_path_id(prev_path_id)
510-
if self.match_graph.isHomologousPath([self.rukki_paths.getPathById(nor_new_path_id), self.rukki_paths.getPathById(nor_prev_path_id)], [self.rukki_paths.getLength(nor_new_path_id), self.rukki_paths.getLength(nor_prev_path_id)]):
583+
if nor_prev_path_id in homologous_paths:
584+
# self.match_graph.isHomologousPath([self.rukki_paths.getPathById(nor_new_path_id), self.rukki_paths.getPathById(nor_prev_path_id)], [self.rukki_paths.getLength(nor_new_path_id), self.rukki_paths.getLength(nor_prev_path_id)]):
511585
#TODO: really not normal if we see that best extension is homologous to some path in scaffold, deserves investigation
512586
self.logger.warning(f"Trying to extend, had homologous path in same scaffold before! {nor_new_path_id} {nor_prev_path_id}")
513587
hom_before = True
@@ -574,9 +648,11 @@ def getAdditionalRephase(self, scaffolded_rephased, used_ids, paths):
574648
for nor_old_id in scaffolded_rephased:
575649
if not self.isHaploidPath(paths, nor_old_id):
576650
#very unoptimal but this should not be frequent
651+
homologous_paths = self.match_graph.getHomologousPaths(paths, nor_old_id)
577652
for alt_path_id in paths.getPathIds():
578653
#scaffolded path should not be significantly shorter than homologous alternative to recolor
579-
if paths.getLength(nor_old_id) * 2 > paths.getLength(alt_path_id) and self.match_graph.isHomologousPath([paths.getPathById(nor_old_id), paths.getPathById(alt_path_id)], [paths.getLength(nor_old_id), paths.getLength(alt_path_id)]):
654+
if paths.getLength(nor_old_id) * 2 > paths.getLength(alt_path_id) and alt_path_id in homologous_paths:
655+
#and self.match_graph.isHomologousPath([paths.getPathById(nor_old_id), paths.getPathById(alt_path_id)], [paths.getLength(nor_old_id), paths.getLength(alt_path_id)]):
580656
#alt_path_id is homologous to something rephased and thus should be rephased too.
581657
old_label = paths.getLabel(alt_path_id)
582658
splitted_id = alt_path_id.split("_")
@@ -1333,3 +1409,23 @@ def getTelomericEnds(self):
13331409
tel_end = True
13341410
res[id] = [tel_start, tel_end]
13351411
return res
1412+
1413+
#For each paths returns its connected component. If multiple, then 0
1414+
def getPathColors(self, rukki_paths, graph):
1415+
components = sorted(nx.weakly_connected_components(self.upd_G),key=len, reverse=True)
1416+
node_colors = {}
1417+
path_colors = {}
1418+
for i in range (0, len(components)):
1419+
for node in components[i]:
1420+
node_colors[node] = i + 1
1421+
for path_id in rukki_paths.getPathIds():
1422+
current_colors = set()
1423+
for node in rukki_paths.getPathById(path_id):
1424+
current_colors.add(node_colors[gf.nor_node(node)])
1425+
if len (current_colors) == 1:
1426+
path_colors[path_id] = current_colors.pop()
1427+
else:
1428+
path_colors[path_id] = 0
1429+
return path_colors
1430+
1431+

0 commit comments

Comments
 (0)