@@ -103,6 +103,7 @@ class ScaffoldGraph:
103
103
#Just too long/far
104
104
TOO_FAR = 1000000000
105
105
106
+
106
107
#efficiantly it is so because of bwa behaviour on XA tags but not used directly
107
108
MAX_ALIGNMENTS = 6
108
109
@@ -112,6 +113,10 @@ class ScaffoldGraph:
112
113
#to check whether paths have similar lengths, in cheating t2t connection
113
114
SIMILAR_LEN_FRAC = 0.7
114
115
116
+ #Constants to switch off distance counting for huge graphs
117
+ MAX_GRAPH_FOR_DISTANCES = 1000000
118
+ MAX_COMPONENT_FOR_DISTANCES = 50000
119
+
115
120
def __init__ (self , rukki_paths , telomere_locations_file , hic_alignment_file , matches_file , G , uncompressed_fasta , path_mashmap , porec , logger ):
116
121
self .logger = logger_wrap .UpdatedAdapter (logger , self .__class__ .__name__ )
117
122
self .rukki_paths = rukki_paths
@@ -132,9 +137,11 @@ def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, mat
132
137
all_connections , unique_connections = self .get_connections_porec (hic_alignment_file , True )
133
138
else :
134
139
self .logger .info ("Loading Hi-C alignments" )
135
- all_connections , unique_connections = self .get_connections_bam (hic_alignment_file , True )
136
- self .output_basename = "scaff_rukki.paths"
140
+ all_connections , unique_connections = self .get_connections_bam (hic_alignment_file , True )
141
+ self .all_connections = all_connections
142
+ self .unique_connections = unique_connections
137
143
144
+ self .output_basename = "scaff_rukki.paths"
138
145
telomeric_ends = self .getTelomericEnds ()
139
146
self .hic_alignment_file = hic_alignment_file
140
147
@@ -166,12 +173,25 @@ def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, mat
166
173
#possilby unefficient but whelp
167
174
scores = {}
168
175
unique_scores = {}
169
- self .dists = dict (nx .all_pairs_dijkstra_path_length (self .upd_G , weight = lambda u , v , d : self .upd_G .edges [u , v ]['mid_length' ]))
170
- self .logger .info ("Pairwise distances in assembly graph calculated" )
176
+ self .dists = {}
177
+ if G .number_of_nodes () > ScaffoldGraph .MAX_GRAPH_FOR_DISTANCES :
178
+ self .logger .info (f"Graph is too big { G .number_of_nodes ()} , not calculating pairwise distances" )
179
+ else :
180
+ max_comp_size = len (max (nx .weakly_connected_components (G ), key = len ))
181
+ if max_comp_size > ScaffoldGraph .MAX_COMPONENT_FOR_DISTANCES :
182
+ self .logger .info (f"Biggest component is too big { len (max_comp_size )} , not calculating pairwise distances" )
183
+ else :
184
+ self .logger .info ("Calculating pairwise distances for assembly graph nodes" )
185
+ self .dists = dict (nx .all_pairs_dijkstra_path_length (self .upd_G , weight = lambda u , v , d : self .upd_G .edges [u , v ]['mid_length' ]))
186
+ self .logger .info ("Pairwise distances in assembly graph calculated" )
187
+
171
188
self .haploids = self .getHaploidPaths (self .rukki_paths )
172
189
#bam should be prefiltered
173
190
#all_connections = self.get_connections_bam("../", True)
174
191
192
+ self .scores = {}
193
+ self .unique_scores = {}
194
+ '''
175
195
for from_path_id in self.rukki_paths.getPathIds():
176
196
scores[from_path_id] = {}
177
197
unique_scores[from_path_id] = {}
@@ -195,11 +215,9 @@ def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, mat
195
215
or_from_path_id = from_path_id + from_dir
196
216
or_to_path_id = to_path_id + to_dir
197
217
self.scaffold_graph.add_edge(or_from_path_id, or_to_path_id, weight = scores[from_path_id][to_path_id][from_dir + to_dir], unique_weight = unique_scores[from_path_id][to_path_id][from_dir + to_dir])
198
- if self .rukki_paths .getLength (from_path_id ) >= ScaffoldGraph .MIN_PATH_TO_SCAFFOLD and self .rukki_paths .getLength (from_path_id ) >= ScaffoldGraph .MIN_PATH_TO_SCAFFOLD :
218
+ if self.rukki_paths.getLength(from_path_id) >= ScaffoldGraph.MIN_PATH_TO_SCAFFOLD and self.rukki_paths.getLength(from_path_id) >= ScaffoldGraph.MIN_PATH_TO_SCAFFOLD:
199
219
self.logger.debug (f"Final not-unique scores {from_path_id} {to_path_id} {scores[from_path_id][to_path_id]}")
200
- self .logger .debug (f"Final unique scores { from_path_id } { to_path_id } { unique_scores [from_path_id ][to_path_id ]} " )
201
-
202
- #TODO: move all rc_<smth> somewhere, not right place
220
+ self.logger.debug (f"Final unique scores {from_path_id} {to_path_id} {unique_scores[from_path_id][to_path_id]}")'''
203
221
204
222
205
223
def isHaploidPath (self , paths , nor_path_id ):
@@ -393,6 +411,50 @@ def forbiddenPair(self, from_path_id, to_path_id):
393
411
return True
394
412
return False
395
413
414
+
415
+ def getScores (self , or_path_id , path_storage , type ):
416
+ res = []
417
+ if type == "weight" :
418
+ cur_scores = self .scores
419
+ connections = self .all_connections
420
+ elif type == "unique_weight" :
421
+ cur_scores = self .unique_scores
422
+ connections = self .unique_connections
423
+ else :
424
+ self .logger .error (f"Unknown type { type } " )
425
+ return res
426
+ nor_path_id = gf .nor_path_id (or_path_id )
427
+ precounted = True
428
+
429
+ homologous_paths = self .match_graph .getHomologousPaths (path_storage , nor_path_id )
430
+ #Counting only necessary scores, cashing in global dict
431
+ if not (or_path_id in cur_scores ):
432
+ cur_scores [or_path_id ] = {}
433
+ cur_scores [gf .rc_path_id (or_path_id )] = {}
434
+ precounted = False
435
+ for next_or_path in self .scaffold_graph .nodes ():
436
+ #Counting all orientations together so no need to double count
437
+ if next_or_path [- 1 ] == "-" :
438
+ continue
439
+ next_nor_path = gf .nor_path_id (next_or_path )
440
+ #TODO: optimize homology counting, once per all paths
441
+ if next_nor_path in homologous_paths :
442
+ continue
443
+ else :
444
+ if not precounted :
445
+ nor_scores = self .getPathPairConnections ([nor_path_id , next_nor_path ], connections , self .uncompressed_lens )
446
+ for from_dir in ('-' , '+' ):
447
+ for to_dir in ('-' , '+' ):
448
+ or_from_path_id = nor_path_id + from_dir
449
+ or_to_path_id = next_nor_path + to_dir
450
+ if nor_scores [from_dir + to_dir ] > 0 :
451
+ cur_scores [or_from_path_id ][or_to_path_id ] = nor_scores [from_dir + to_dir ]
452
+ for next_or_path in self .scaffold_graph .nodes ():
453
+ if next_or_path in cur_scores [or_path_id ]:
454
+ res .append ([next_or_path , cur_scores [or_path_id ][next_or_path ]])
455
+ res .sort (key = lambda x : x [1 ], reverse = True )
456
+ return res
457
+
396
458
#Main logic is here!
397
459
# returns NONE/DONE/next_path_id
398
460
#type: weight/unique_weight
@@ -404,11 +466,21 @@ def findExtension(self, cur_path_id, type):
404
466
if self .scaffold_graph .nodes [cur_path_id ]['telomere' ][1 ]:
405
467
self .logger .info (f"Stopped at the telomere" )
406
468
return "DONE" ,0
469
+ '''
407
470
for next_node in self.scaffold_graph.successors(cur_path_id):
408
471
#specific hacks to avoid
409
472
local_scores.append([next_node, self.scaffold_graph.edges[cur_path_id, next_node][type]])
410
-
411
473
local_scores.sort(key=lambda x: x[1], reverse=True)
474
+ alt_scores = self.getScores(cur_path_id, self.rukki_paths, type)
475
+
476
+ if len(alt_scores) > 0 and alt_scores[0]!= local_scores[0]:
477
+ self.logger.warning(f"Difference! local scores {local_scores[0]} alt scores {alt_scores[0]}")
478
+ for i in range (0,10):
479
+ self.logger.warning(f"Alt scores {i} {alt_scores[i]}")
480
+ elif len(alt_scores) == 0 and len(local_scores) > 0:
481
+ self.logger.warning(f"Local scores but no alt scores, best extension {local_scores[0]}")
482
+ '''
483
+ local_scores = self .getScores (cur_path_id , self .rukki_paths , type )
412
484
best_ind = - 1
413
485
second_best_ind = - 1
414
486
for i in range (0 , len (local_scores )):
@@ -504,10 +576,12 @@ def generateScaffolds(self):
504
576
break
505
577
else :
506
578
hom_before = False
579
+ nor_new_path_id = gf .nor_path_id (next_path_id )
580
+ homologous_paths = self .match_graph .getHomologousPaths (self .rukki_paths , gf .nor_path_id (nor_new_path_id ))
507
581
for prev_path_id in cur_scaffold :
508
- nor_new_path_id = gf .nor_path_id (next_path_id )
509
582
nor_prev_path_id = gf .nor_path_id (prev_path_id )
510
- if self .match_graph .isHomologousPath ([self .rukki_paths .getPathById (nor_new_path_id ), self .rukki_paths .getPathById (nor_prev_path_id )], [self .rukki_paths .getLength (nor_new_path_id ), self .rukki_paths .getLength (nor_prev_path_id )]):
583
+ if nor_prev_path_id in homologous_paths :
584
+ # self.match_graph.isHomologousPath([self.rukki_paths.getPathById(nor_new_path_id), self.rukki_paths.getPathById(nor_prev_path_id)], [self.rukki_paths.getLength(nor_new_path_id), self.rukki_paths.getLength(nor_prev_path_id)]):
511
585
#TODO: really not normal if we see that best extension is homologous to some path in scaffold, deserves investigation
512
586
self .logger .warning (f"Trying to extend, had homologous path in same scaffold before! { nor_new_path_id } { nor_prev_path_id } " )
513
587
hom_before = True
@@ -574,9 +648,11 @@ def getAdditionalRephase(self, scaffolded_rephased, used_ids, paths):
574
648
for nor_old_id in scaffolded_rephased :
575
649
if not self .isHaploidPath (paths , nor_old_id ):
576
650
#very unoptimal but this should not be frequent
651
+ homologous_paths = self .match_graph .getHomologousPaths (paths , nor_old_id )
577
652
for alt_path_id in paths .getPathIds ():
578
653
#scaffolded path should not be significantly shorter than homologous alternative to recolor
579
- if paths .getLength (nor_old_id ) * 2 > paths .getLength (alt_path_id ) and self .match_graph .isHomologousPath ([paths .getPathById (nor_old_id ), paths .getPathById (alt_path_id )], [paths .getLength (nor_old_id ), paths .getLength (alt_path_id )]):
654
+ if paths .getLength (nor_old_id ) * 2 > paths .getLength (alt_path_id ) and alt_path_id in homologous_paths :
655
+ #and self.match_graph.isHomologousPath([paths.getPathById(nor_old_id), paths.getPathById(alt_path_id)], [paths.getLength(nor_old_id), paths.getLength(alt_path_id)]):
580
656
#alt_path_id is homologous to something rephased and thus should be rephased too.
581
657
old_label = paths .getLabel (alt_path_id )
582
658
splitted_id = alt_path_id .split ("_" )
@@ -1333,3 +1409,23 @@ def getTelomericEnds(self):
1333
1409
tel_end = True
1334
1410
res [id ] = [tel_start , tel_end ]
1335
1411
return res
1412
+
1413
+ #For each paths returns its connected component. If multiple, then 0
1414
+ def getPathColors (self , rukki_paths , graph ):
1415
+ components = sorted (nx .weakly_connected_components (self .upd_G ),key = len , reverse = True )
1416
+ node_colors = {}
1417
+ path_colors = {}
1418
+ for i in range (0 , len (components )):
1419
+ for node in components [i ]:
1420
+ node_colors [node ] = i + 1
1421
+ for path_id in rukki_paths .getPathIds ():
1422
+ current_colors = set ()
1423
+ for node in rukki_paths .getPathById (path_id ):
1424
+ current_colors .add (node_colors [gf .nor_node (node )])
1425
+ if len (current_colors ) == 1 :
1426
+ path_colors [path_id ] = current_colors .pop ()
1427
+ else :
1428
+ path_colors [path_id ] = 0
1429
+ return path_colors
1430
+
1431
+
0 commit comments