Skip to content

Commit d17945e

Browse files
Merge pull request #104 from ncsa/102-cant-disable-target-regions-with-discard
Reworked beds a bit, as they were not quite working correctly
2 parents 4bf762a + c4bcb55 commit d17945e

File tree

3 files changed

+74
-54
lines changed

3 files changed

+74
-54
lines changed

data/discard.bed

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
H1N1_MP 0 900
2+

neat/read_simulator/utils/bed_func.py

+65-48
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
"""
55
import pathlib
66
import logging
7-
import numpy as np
87

98
from Bio.File import _IndexedSeqFileDict
109

@@ -32,21 +31,34 @@ def parse_beds(options: Options, reference_dict: _IndexedSeqFileDict, average_mu
3231
return_list = []
3332
bed_files = (options.target_bed, options.discard_bed, options.mutation_bed)
3433
# These numbers indicate the factors for target_bed, discard_bed, and mutation_bed, respectively
35-
# Since Average mutation rate might equal zero, we start the others at 1.
3634
processing_structure = {
37-
0: ("target", 1.0, True if options.target_bed else False),
38-
1: ("discard", 0.0, True if options.discard_bed else False),
39-
2: ("mutation", average_mutation_rate, True if options.mutation_bed else False)
35+
0: ("target",
36+
# For target dict, True will indicate region is to be included (either within a target bed or
37+
# if there was no target bed entered.
38+
# For example (0, 10, False), (10, 100, True) would indicade that 0-10 fell outside a target region and will
39+
# be discarded, whereas 10-100 is within a targeted region and will be included. A single tuple of
40+
# (0, chromosome_end, True) would indicate either the entire chromosome was targeted or there was no bed
41+
# file. Default is to target all regions off all contigs.
42+
True,
43+
True if options.target_bed else False),
44+
1: ("discard",
45+
# For the discard dict, false indicates the region is not within a discard bed (i.e., it should be included
46+
# in the final results), whereas True indicates a region to be discarded, according to the bed.
47+
# For example (0, 10, False), (10, 100, True) would mean we keep reads from 0-10 and discard the
48+
# ones from 10-100. Default is that there are no discard regions and we keep everything.
49+
False,
50+
True if options.discard_bed else False),
51+
2: ("mutation",
52+
average_mutation_rate, # Default value will be this for all sections
53+
True if options.mutation_bed else False)
4054
}
4155

4256
for i in range(len(bed_files)):
43-
# To distinguish between a targeted bed with chromosomes intentionally omitted and no target bed at all
4457
temp_bed_dict = parse_single_bed(
4558
bed_files[i],
4659
reference_dict,
4760
processing_structure[i]
4861
)
49-
5062
# If there was a bed this function will fill in any gaps the bed might have missed, so we have a complete map
5163
# of each chromosome. The uniform case in parse_single_bed handles this in cases of no input bed.
5264
if processing_structure[i][2]:
@@ -80,8 +92,6 @@ def parse_single_bed(input_bed: str,
8092
if process_key[2]:
8193
# Pathlib will help us stay machine-agnostic to the degree possible
8294
input_bed = pathlib.Path(input_bed)
83-
printed_chromosome_warning = False
84-
printed_mutation_rate_warning = False
8595
with open_input(input_bed) as f:
8696
for line in f:
8797
if not line.startswith(('@', '#', "\n")):
@@ -94,33 +104,25 @@ def parse_single_bed(input_bed: str,
94104
except ValueError:
95105
_LOG.error(f"Improperly formatted bed file line {line}")
96106
raise
97-
# Trying not to 'fix' bed files, but an easy case is if the vcf uses 'chr' and the bed doesn't, in
98-
# which we frequently see in human data, we'll just put chr on there. Anything more complicated
99-
# will be on the user to correct their beds first.
107+
# Bed file chromosome names must match the reference.
100108
try:
101109
assert my_chr in reference_dictionary
102110
except AssertionError:
103-
try:
104-
assert my_chr in reference_dictionary
105-
except AssertionError:
106-
in_bed_only.append(my_chr)
107-
if not printed_chromosome_warning:
108-
_LOG.warning("Found chromosome in BED file that isn't in Reference file, skipping")
109-
printed_chromosome_warning = True
110-
continue
111-
111+
in_bed_only.append(my_chr)
112+
_LOG.warning(
113+
f"Found chromosome in BED file that isn't in Reference file, skipping {my_chr}"
114+
)
115+
continue
112116
# If this is the mutation bed, we need to process an extra column.
113117
if process_key[0] == "mutation":
114118
# here we append the metadata, if present
115119
index = line_list[3].find('mut_rate=')
120+
# Improperly formatted mutation rate bed file
116121
if index == -1:
117-
if not printed_mutation_rate_warning:
118-
_LOG.warning(f"Found no mutation rates in bed")
119-
_LOG.warning(f'4th column of mutation rate bed must be a semicolon list of key, value '
120-
f'pairs, with one key being mut_rate, '
121-
f'e.g., "foo=bar;mut_rate=0.001;do=re".')
122-
printed_mutation_rate_warning = True
123-
continue
122+
_LOG.error(f"Invalid mutation rate: {my_chr}: ({pos1}, {pos2})")
123+
_LOG.error(f'4th column of mutation rate bed must be a semicolon list of key, value '
124+
f'pairs, with one key being mut_rate, e.g., "foo=bar;mut_rate=0.001;do=re".')
125+
raise ValueError
124126

125127
# +9 because that's len('mut_rate='). Whatever is that should be our mutation rate.
126128
mut_rate = line_list[3][index + 9:]
@@ -129,16 +131,21 @@ def parse_single_bed(input_bed: str,
129131
mut_rate = float(mut_rate.split(';')[0])
130132
except ValueError:
131133
_LOG.error(f"Invalid mutation rate: {my_chr}: ({pos1}, {pos2})")
132-
_LOG.debug(f'4th column of mutation rate bed must be a semicolon list of key, value '
134+
_LOG.error(f'4th column of mutation rate bed must be a semicolon list of key, value '
133135
f'pairs, with one key being mut_rate, e.g., "foo=bar;mut_rate=0.001;do=re".')
134136
raise
135137

136138
if mut_rate > 0.3:
137139
_LOG.warning("Found a mutation rate > 0.3. This is unusual.")
138140

139141
ret_dict[my_chr].append((int(pos1), int(pos2), mut_rate))
140-
else:
141-
ret_dict[my_chr].append((int(pos1), int(pos2), process_key[1]))
142+
elif process_key[0] == "discard":
143+
# True here means a discard bed was present and this region was marked for discard.
144+
ret_dict[my_chr].append((int(pos1), int(pos2), True))
145+
else: # target dict
146+
# True here means that this section is to be included (either within a target bed or because
147+
# no targeted regions were defined)
148+
ret_dict[my_chr].append((int(pos1), int(pos2), True))
142149

143150
# some validation
144151
in_ref_only = [k for k in reference_dictionary if k not in ret_dict]
@@ -161,7 +168,7 @@ def parse_single_bed(input_bed: str,
161168

162169
def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
163170
region_dict: dict | None,
164-
factor: tuple[str, float, bool],
171+
default_factor: tuple[str, bool, bool],
165172
) -> dict:
166173
"""
167174
This parses the dict derived from the bed file and fills in any gaps, so it can be more easily cycled through
@@ -173,8 +180,9 @@ def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
173180
the data it refers to.
174181
:param region_dict: A dict based on the input from the bed file, with keys being all the chronmosomes
175182
present in the reference.
176-
:param factor: This is a tuple representing the type and any extra information. The extra info is for mutation
177-
rates. If beds were input, then these values come from the bed, else they are set to one value across the contig
183+
:param default_factor: This is a tuple representing the type and any extra information. The extra info is for
184+
mutation rates. If beds were input, then these values come from the bed, else they are set to one value
185+
across the contig
178186
:return: A tuple with (start, end, some_factor) for each region in the genome.
179187
"""
180188

@@ -185,20 +193,29 @@ def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
185193
max_value = len(ref_dict[contig])
186194
start = 0
187195

188-
for region in region_dict[contig]:
189-
if len(region) > 2:
190-
factor = region[2]
191-
if region[0] > start:
192-
regions.append((start, region[0], factor[1]))
193-
start = region[1]
194-
regions.append((region[0], region[1], factor[1]))
195-
elif region[0] == start:
196-
regions.append((region[0], region[1], factor[1]))
197-
start = region[1]
198-
199-
# If the region ends short of the end, this fills in to the end
200-
if regions[-1][1] != max_value:
201-
regions.append((start, max_value, factor[1]))
196+
# The trivial case of no data yet for this contig the region gets filled out as 0 to len(contig_sequence)
197+
# with the default value applied to the whole region
198+
if not region_dict[contig]:
199+
regions.append((start, max_value, default_factor[1]))
200+
else:
201+
for region in region_dict[contig]:
202+
if region[0] > start:
203+
if default_factor[0] == "discard" or default_factor[0] == "mutation":
204+
regions.append((start, region[0], default_factor[1]))
205+
else: # target regions get assigned "False" values outside the targets, if bed is present
206+
regions.append((start, region[0], False))
207+
start = region[1]
208+
regions.append(region)
209+
elif region[0] == start:
210+
regions.append(region)
211+
start = region[1]
212+
213+
# If the region ends short of the end, this fills in to the end
214+
if regions[-1][1] != max_value:
215+
if default_factor[0] == "discard" or default_factor[0] == "mutation":
216+
regions.append((start, max_value, default_factor[1]))
217+
else: # target regions get assigned "False" values outside the targets, if bed is present
218+
regions.append((start, max_value, False))
202219

203220
ret_dict[contig] = regions
204221

neat/read_simulator/utils/local_file_writer.py

+7-6
Original file line numberDiff line numberDiff line change
@@ -48,16 +48,17 @@ def write_local_file(vcf_filename: str or Path,
4848
for variant in variant_data[loc]:
4949
if not variant.is_input:
5050
target_region = find_region(loc, targeted_regions)
51-
# For runs without a targeted bed, target_region[2] is 1, so the following will always fail
52-
if options.rng.random() >= target_region[2]:
51+
# This will be True in targeted regions, if a bed is present, or everywhere if not bed is present.
52+
# Anything outside defined target regions will be marked false and this `if` will activate.
53+
if not target_region[2]:
5354
_LOG.debug(f'Variant filtered out by target regions bed: {reference.id}: '
5455
f'{variant}')
5556
filtered_by_target += 1
5657
continue
5758

58-
# Now we check the discard regions. All regions are set to a factor of 1 if there is not discard bed
5959
discard_region = find_region(loc, discarded_regions)
60-
if discard_region[2] == 0:
60+
# This will be True if the discard bed was present and this region was within a discard bed region.
61+
if discard_region[2]:
6162
_LOG.debug(f'Variant filtered out by discard regions bed: {reference.id}: '
6263
f'{variant}')
6364
filtered_by_discard += 1
@@ -87,11 +88,11 @@ def write_local_file(vcf_filename: str or Path,
8788
if options.produce_fasta:
8889
local_fasta.write_fasta()
8990

90-
if filtered_by_target:
91+
if filtered_by_target > 0:
9192
_LOG.info(f'{filtered_by_target} variants excluded because '
9293
f'of target regions with discard off-target enabled')
9394

94-
if filtered_by_discard:
95+
if filtered_by_discard > 0:
9596
_LOG.info(f'{filtered_by_discard} variants excluded because '
9697
f'of target regions with discard off-target enabled')
9798

0 commit comments

Comments
 (0)