4
4
"""
5
5
import pathlib
6
6
import logging
7
- import numpy as np
8
7
9
8
from Bio .File import _IndexedSeqFileDict
10
9
@@ -32,21 +31,34 @@ def parse_beds(options: Options, reference_dict: _IndexedSeqFileDict, average_mu
32
31
return_list = []
33
32
bed_files = (options .target_bed , options .discard_bed , options .mutation_bed )
34
33
# These numbers indicate the factors for target_bed, discard_bed, and mutation_bed, respectively
35
- # Since Average mutation rate might equal zero, we start the others at 1.
36
34
processing_structure = {
37
- 0 : ("target" , 1.0 , True if options .target_bed else False ),
38
- 1 : ("discard" , 0.0 , True if options .discard_bed else False ),
39
- 2 : ("mutation" , average_mutation_rate , True if options .mutation_bed else False )
35
+ 0 : ("target" ,
36
+ # For target dict, True will indicate region is to be included (either within a target bed or
37
+ # if there was no target bed entered.
38
+ # For example (0, 10, False), (10, 100, True) would indicade that 0-10 fell outside a target region and will
39
+ # be discarded, whereas 10-100 is within a targeted region and will be included. A single tuple of
40
+ # (0, chromosome_end, True) would indicate either the entire chromosome was targeted or there was no bed
41
+ # file. Default is to target all regions off all contigs.
42
+ True ,
43
+ True if options .target_bed else False ),
44
+ 1 : ("discard" ,
45
+ # For the discard dict, false indicates the region is not within a discard bed (i.e., it should be included
46
+ # in the final results), whereas True indicates a region to be discarded, according to the bed.
47
+ # For example (0, 10, False), (10, 100, True) would mean we keep reads from 0-10 and discard the
48
+ # ones from 10-100. Default is that there are no discard regions and we keep everything.
49
+ False ,
50
+ True if options .discard_bed else False ),
51
+ 2 : ("mutation" ,
52
+ average_mutation_rate , # Default value will be this for all sections
53
+ True if options .mutation_bed else False )
40
54
}
41
55
42
56
for i in range (len (bed_files )):
43
- # To distinguish between a targeted bed with chromosomes intentionally omitted and no target bed at all
44
57
temp_bed_dict = parse_single_bed (
45
58
bed_files [i ],
46
59
reference_dict ,
47
60
processing_structure [i ]
48
61
)
49
-
50
62
# If there was a bed this function will fill in any gaps the bed might have missed, so we have a complete map
51
63
# of each chromosome. The uniform case in parse_single_bed handles this in cases of no input bed.
52
64
if processing_structure [i ][2 ]:
@@ -80,8 +92,6 @@ def parse_single_bed(input_bed: str,
80
92
if process_key [2 ]:
81
93
# Pathlib will help us stay machine-agnostic to the degree possible
82
94
input_bed = pathlib .Path (input_bed )
83
- printed_chromosome_warning = False
84
- printed_mutation_rate_warning = False
85
95
with open_input (input_bed ) as f :
86
96
for line in f :
87
97
if not line .startswith (('@' , '#' , "\n " )):
@@ -94,33 +104,25 @@ def parse_single_bed(input_bed: str,
94
104
except ValueError :
95
105
_LOG .error (f"Improperly formatted bed file line { line } " )
96
106
raise
97
- # Trying not to 'fix' bed files, but an easy case is if the vcf uses 'chr' and the bed doesn't, in
98
- # which we frequently see in human data, we'll just put chr on there. Anything more complicated
99
- # will be on the user to correct their beds first.
107
+ # Bed file chromosome names must match the reference.
100
108
try :
101
109
assert my_chr in reference_dictionary
102
110
except AssertionError :
103
- try :
104
- assert my_chr in reference_dictionary
105
- except AssertionError :
106
- in_bed_only .append (my_chr )
107
- if not printed_chromosome_warning :
108
- _LOG .warning ("Found chromosome in BED file that isn't in Reference file, skipping" )
109
- printed_chromosome_warning = True
110
- continue
111
-
111
+ in_bed_only .append (my_chr )
112
+ _LOG .warning (
113
+ f"Found chromosome in BED file that isn't in Reference file, skipping { my_chr } "
114
+ )
115
+ continue
112
116
# If this is the mutation bed, we need to process an extra column.
113
117
if process_key [0 ] == "mutation" :
114
118
# here we append the metadata, if present
115
119
index = line_list [3 ].find ('mut_rate=' )
120
+ # Improperly formatted mutation rate bed file
116
121
if index == - 1 :
117
- if not printed_mutation_rate_warning :
118
- _LOG .warning (f"Found no mutation rates in bed" )
119
- _LOG .warning (f'4th column of mutation rate bed must be a semicolon list of key, value '
120
- f'pairs, with one key being mut_rate, '
121
- f'e.g., "foo=bar;mut_rate=0.001;do=re".' )
122
- printed_mutation_rate_warning = True
123
- continue
122
+ _LOG .error (f"Invalid mutation rate: { my_chr } : ({ pos1 } , { pos2 } )" )
123
+ _LOG .error (f'4th column of mutation rate bed must be a semicolon list of key, value '
124
+ f'pairs, with one key being mut_rate, e.g., "foo=bar;mut_rate=0.001;do=re".' )
125
+ raise ValueError
124
126
125
127
# +9 because that's len('mut_rate='). Whatever is that should be our mutation rate.
126
128
mut_rate = line_list [3 ][index + 9 :]
@@ -129,16 +131,21 @@ def parse_single_bed(input_bed: str,
129
131
mut_rate = float (mut_rate .split (';' )[0 ])
130
132
except ValueError :
131
133
_LOG .error (f"Invalid mutation rate: { my_chr } : ({ pos1 } , { pos2 } )" )
132
- _LOG .debug (f'4th column of mutation rate bed must be a semicolon list of key, value '
134
+ _LOG .error (f'4th column of mutation rate bed must be a semicolon list of key, value '
133
135
f'pairs, with one key being mut_rate, e.g., "foo=bar;mut_rate=0.001;do=re".' )
134
136
raise
135
137
136
138
if mut_rate > 0.3 :
137
139
_LOG .warning ("Found a mutation rate > 0.3. This is unusual." )
138
140
139
141
ret_dict [my_chr ].append ((int (pos1 ), int (pos2 ), mut_rate ))
140
- else :
141
- ret_dict [my_chr ].append ((int (pos1 ), int (pos2 ), process_key [1 ]))
142
+ elif process_key [0 ] == "discard" :
143
+ # True here means a discard bed was present and this region was marked for discard.
144
+ ret_dict [my_chr ].append ((int (pos1 ), int (pos2 ), True ))
145
+ else : # target dict
146
+ # True here means that this section is to be included (either within a target bed or because
147
+ # no targeted regions were defined)
148
+ ret_dict [my_chr ].append ((int (pos1 ), int (pos2 ), True ))
142
149
143
150
# some validation
144
151
in_ref_only = [k for k in reference_dictionary if k not in ret_dict ]
@@ -161,7 +168,7 @@ def parse_single_bed(input_bed: str,
161
168
162
169
def fill_out_bed_dict (ref_dict : _IndexedSeqFileDict ,
163
170
region_dict : dict | None ,
164
- factor : tuple [str , float , bool ],
171
+ default_factor : tuple [str , bool , bool ],
165
172
) -> dict :
166
173
"""
167
174
This parses the dict derived from the bed file and fills in any gaps, so it can be more easily cycled through
@@ -173,8 +180,9 @@ def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
173
180
the data it refers to.
174
181
:param region_dict: A dict based on the input from the bed file, with keys being all the chronmosomes
175
182
present in the reference.
176
- :param factor: This is a tuple representing the type and any extra information. The extra info is for mutation
177
- rates. If beds were input, then these values come from the bed, else they are set to one value across the contig
183
+ :param default_factor: This is a tuple representing the type and any extra information. The extra info is for
184
+ mutation rates. If beds were input, then these values come from the bed, else they are set to one value
185
+ across the contig
178
186
:return: A tuple with (start, end, some_factor) for each region in the genome.
179
187
"""
180
188
@@ -185,20 +193,29 @@ def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
185
193
max_value = len (ref_dict [contig ])
186
194
start = 0
187
195
188
- for region in region_dict [contig ]:
189
- if len (region ) > 2 :
190
- factor = region [2 ]
191
- if region [0 ] > start :
192
- regions .append ((start , region [0 ], factor [1 ]))
193
- start = region [1 ]
194
- regions .append ((region [0 ], region [1 ], factor [1 ]))
195
- elif region [0 ] == start :
196
- regions .append ((region [0 ], region [1 ], factor [1 ]))
197
- start = region [1 ]
198
-
199
- # If the region ends short of the end, this fills in to the end
200
- if regions [- 1 ][1 ] != max_value :
201
- regions .append ((start , max_value , factor [1 ]))
196
+ # The trivial case of no data yet for this contig the region gets filled out as 0 to len(contig_sequence)
197
+ # with the default value applied to the whole region
198
+ if not region_dict [contig ]:
199
+ regions .append ((start , max_value , default_factor [1 ]))
200
+ else :
201
+ for region in region_dict [contig ]:
202
+ if region [0 ] > start :
203
+ if default_factor [0 ] == "discard" or default_factor [0 ] == "mutation" :
204
+ regions .append ((start , region [0 ], default_factor [1 ]))
205
+ else : # target regions get assigned "False" values outside the targets, if bed is present
206
+ regions .append ((start , region [0 ], False ))
207
+ start = region [1 ]
208
+ regions .append (region )
209
+ elif region [0 ] == start :
210
+ regions .append (region )
211
+ start = region [1 ]
212
+
213
+ # If the region ends short of the end, this fills in to the end
214
+ if regions [- 1 ][1 ] != max_value :
215
+ if default_factor [0 ] == "discard" or default_factor [0 ] == "mutation" :
216
+ regions .append ((start , max_value , default_factor [1 ]))
217
+ else : # target regions get assigned "False" values outside the targets, if bed is present
218
+ regions .append ((start , max_value , False ))
202
219
203
220
ret_dict [contig ] = regions
204
221
0 commit comments