19
19
_LOG = logging .getLogger (__name__ )
20
20
21
21
22
- def parse_beds (options : Options , reference_dict : _IndexedSeqFileDict , average_mutation_rate : float ) -> list :
22
+ def parse_beds (options : Options , ref_keys_counts : dict , average_mutation_rate : float ) -> list :
23
23
"""
24
24
This single function parses the three possible bed file types for NEAT.
25
25
26
26
:param options: The options object for this run
27
- :param reference_dict: The reference dict generated by SeqIO for this run
27
+ :param ref_keys_counts: A dictionary containing the reference keys and the associated lengths.
28
28
:param average_mutation_rate: The average mutation rate from the model or user input for this run
29
29
:return: target_dict, discard_dict, mutation_rate_dict
30
30
"""
@@ -57,13 +57,13 @@ def parse_beds(options: Options, reference_dict: _IndexedSeqFileDict, average_mu
57
57
for i in range (len (bed_files )):
58
58
temp_bed_dict = parse_single_bed (
59
59
bed_files [i ],
60
- reference_dict ,
60
+ ref_keys_counts ,
61
61
processing_structure [i ]
62
62
)
63
63
# If there was a bed this function will fill in any gaps the bed might have missed, so we have a complete map
64
64
# of each chromosome. The uniform case in parse_single_bed handles this in cases of no input bed.
65
65
if processing_structure [i ][2 ]:
66
- final_dict = fill_out_bed_dict (reference_dict , temp_bed_dict , processing_structure [i ])
66
+ final_dict = fill_out_bed_dict (ref_keys_counts , temp_bed_dict , processing_structure [i ])
67
67
else :
68
68
final_dict = temp_bed_dict
69
69
@@ -73,7 +73,7 @@ def parse_beds(options: Options, reference_dict: _IndexedSeqFileDict, average_mu
73
73
74
74
75
75
def parse_single_bed (input_bed : str ,
76
- reference_dictionary : _IndexedSeqFileDict ,
76
+ ref_keys_counts : dict ,
77
77
process_key : tuple [str , float , bool ],
78
78
) -> dict :
79
79
"""
@@ -82,12 +82,13 @@ def parse_single_bed(input_bed: str,
82
82
then we will also import the mutation data.
83
83
84
84
:param input_bed: A bed file containing the regions of interest
85
- :param reference_dictionary : A list of chromosomes to check, along with their reads and lengths
85
+ :param ref_keys_counts : A dict of chromosomes to check (key) , along with their lengths (value)
86
86
:param process_key: Indicates which bed we are processing and the factor we need to process it.
87
87
:return: a dictionary of chromosomes: [(pos1, pos2, factor), etc...]
88
88
"""
89
- ret_dict = {x : [] for x in reference_dictionary }
89
+ ret_dict = {x : [] for x in ref_keys_counts }
90
90
in_bed_only = []
91
+ key_pool = list (ref_keys_counts .keys ())
91
92
92
93
# process_key[2] is True when there is an input bed, False otherwise.
93
94
if process_key [2 ]:
@@ -107,7 +108,7 @@ def parse_single_bed(input_bed: str,
107
108
sys .exit (1 )
108
109
# Bed file chromosome names must match the reference.
109
110
try :
110
- assert my_chr in reference_dictionary
111
+ assert my_chr in key_pool
111
112
except AssertionError :
112
113
in_bed_only .append (my_chr )
113
114
_LOG .warning (
@@ -149,7 +150,7 @@ def parse_single_bed(input_bed: str,
149
150
ret_dict [my_chr ].append ((int (pos1 ), int (pos2 ), True ))
150
151
151
152
# some validation
152
- in_ref_only = [k for k in reference_dictionary if k not in ret_dict ]
153
+ in_ref_only = [k for k in ref_keys_counts if k not in ret_dict ]
153
154
if in_ref_only :
154
155
_LOG .warning (f'Warning: Reference contains sequences not found in BED file { input_bed } . '
155
156
'These chromosomes will be omitted from the outputs.' )
@@ -161,13 +162,13 @@ def parse_single_bed(input_bed: str,
161
162
_LOG .debug (f'Regions ignored: { list (set (in_bed_only ))} ' )
162
163
163
164
else :
164
- for my_chr in reference_dictionary . keys () :
165
- ret_dict [my_chr ].append ((0 , len ( reference_dictionary [my_chr ]) , process_key [1 ]))
165
+ for my_chr in ref_keys_counts :
166
+ ret_dict [my_chr ].append ((0 , ref_keys_counts [my_chr ], process_key [1 ]))
166
167
167
168
return ret_dict
168
169
169
170
170
- def fill_out_bed_dict (ref_dict : _IndexedSeqFileDict ,
171
+ def fill_out_bed_dict (ref_keys_counts : dict ,
171
172
region_dict : dict | None ,
172
173
default_factor : tuple [str , bool , bool ],
173
174
) -> dict :
@@ -177,9 +178,9 @@ def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
177
178
178
179
The input to this function is the dict for a single chromosome.
179
180
180
- :param ref_dict: The reference dictionary for the run. Needed to calulate max values. We shouldn't need to access
181
- the data it refers to .
182
- :param region_dict: A dict based on the input from the bed file, with keys being all the chronmosomes
181
+ :param ref_keys_counts: A dictionary with the keys as the contigs from the reference dictionary, and the values as
182
+ the length of the contig. Needed to calculate max values .
183
+ :param region_dict: A dict based on the input from the bed file, with keys being all the chromosomes
183
184
present in the reference.
184
185
:param default_factor: This is a tuple representing the type and any extra information. The extra info is for
185
186
mutation rates. If beds were input, then these values come from the bed, else they are set to one value
@@ -191,7 +192,7 @@ def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
191
192
192
193
for contig in region_dict :
193
194
regions = []
194
- max_value = len ( ref_dict [contig ])
195
+ max_value = ref_keys_counts [contig ]
195
196
start = 0
196
197
197
198
# The trivial case of no data yet for this contig the region gets filled out as 0 to len(contig_sequence)
0 commit comments