Merge pull request #104 from ncsa/102-cant-disable-target-regions-with-discard

keshav-gandhi · web-flow · commit d17945e1e778 · 2024-05-02T17:42:32.000-05:00
Reworked beds a bit, as they were not quite working correctly
diff --git a/data/discard.bed b/data/discard.bed
@@ -0,0 +1,2 @@
+H1N1_MP	0	900
+
diff --git a/neat/read_simulator/utils/bed_func.py b/neat/read_simulator/utils/bed_func.py
@@ -4,7 +4,6 @@
 """
 import pathlib
 import logging
-import numpy as np
 
 from Bio.File import _IndexedSeqFileDict
 
@@ -32,21 +31,34 @@ def parse_beds(options: Options, reference_dict: _IndexedSeqFileDict, average_mu
     return_list = []
     bed_files = (options.target_bed, options.discard_bed, options.mutation_bed)
     # These numbers indicate the factors for target_bed, discard_bed, and mutation_bed, respectively
-    # Since Average mutation rate might equal zero, we start the others at 1.
     processing_structure = {
-        0: ("target", 1.0, True if options.target_bed else False),
-        1: ("discard", 0.0, True if options.discard_bed else False),
-        2: ("mutation", average_mutation_rate, True if options.mutation_bed else False)
+        0: ("target",
+            # For target dict, True will indicate region is to be included (either within a target bed or
+            # if there was no target bed entered.
+            # For example (0, 10, False), (10, 100, True) would indicade that 0-10 fell outside a target region and will
+            # be discarded, whereas 10-100 is within a targeted region and will be included. A single tuple of
+            # (0, chromosome_end, True) would indicate either the entire chromosome was targeted or there was no bed
+            # file. Default is to target all regions off all contigs.
+            True,
+            True if options.target_bed else False),
+        1: ("discard",
+            # For the discard dict, false indicates the region is not within a discard bed (i.e., it should be included
+            # in the final results), whereas True indicates a region to be discarded, according to the bed.
+            # For example (0, 10, False), (10, 100, True) would mean we keep reads from 0-10 and discard the
+            # ones from 10-100. Default is that there are no discard regions and we keep everything.
+            False,
+            True if options.discard_bed else False),
+        2: ("mutation",
+            average_mutation_rate,  # Default value will be this for all sections
+            True if options.mutation_bed else False)
     }
 
     for i in range(len(bed_files)):
-        # To distinguish between a targeted bed with chromosomes intentionally omitted and no target bed at all
         temp_bed_dict = parse_single_bed(
             bed_files[i],
             reference_dict,
             processing_structure[i]
         )
-
         # If there was a bed this function will fill in any gaps the bed might have missed, so we have a complete map
         # of each chromosome. The uniform case in parse_single_bed handles this in cases of no input bed.
         if processing_structure[i][2]:
@@ -80,8 +92,6 @@ def parse_single_bed(input_bed: str,
     if process_key[2]:
         # Pathlib will help us stay machine-agnostic to the degree possible
         input_bed = pathlib.Path(input_bed)
-        printed_chromosome_warning = False
-        printed_mutation_rate_warning = False
         with open_input(input_bed) as f:
             for line in f:
                 if not line.startswith(('@', '#', "\n")):
@@ -94,33 +104,25 @@ def parse_single_bed(input_bed: str,
                     except ValueError:
                         _LOG.error(f"Improperly formatted bed file line {line}")
                         raise
-                    # Trying not to 'fix' bed files, but an easy case is if the vcf uses 'chr' and the bed doesn't, in
-                    # which we frequently see in human data, we'll just put chr on there. Anything more complicated
-                    # will be on the user to correct their beds first.
+                    # Bed file chromosome names must match the reference.
                     try:
                         assert my_chr in reference_dictionary
                     except AssertionError:
-                        try:
-                            assert my_chr in reference_dictionary
-                        except AssertionError:
-                            in_bed_only.append(my_chr)
-                            if not printed_chromosome_warning:
-                                _LOG.warning("Found chromosome in BED file that isn't in Reference file, skipping")
-                                printed_chromosome_warning = True
-                            continue
-
+                        in_bed_only.append(my_chr)
+                        _LOG.warning(
+                            f"Found chromosome in BED file that isn't in Reference file, skipping {my_chr}"
+                        )
+                        continue
                     # If this is the mutation bed, we need to process an extra column.
                     if process_key[0] == "mutation":
                         # here we append the metadata, if present
                         index = line_list[3].find('mut_rate=')
+                        # Improperly formatted mutation rate bed file
                         if index == -1:
-                            if not printed_mutation_rate_warning:
-                                _LOG.warning(f"Found no mutation rates in bed")
-                                _LOG.warning(f'4th column of mutation rate bed must be a semicolon list of key, value '
-                                             f'pairs, with one key being mut_rate, '
-                                             f'e.g., "foo=bar;mut_rate=0.001;do=re".')
-                                printed_mutation_rate_warning = True
-                            continue
+                            _LOG.error(f"Invalid mutation rate: {my_chr}: ({pos1}, {pos2})")
+                            _LOG.error(f'4th column of mutation rate bed must be a semicolon list of key, value '
+                                       f'pairs, with one key being mut_rate, e.g., "foo=bar;mut_rate=0.001;do=re".')
+                            raise ValueError
 
                         # +9 because that's len('mut_rate='). Whatever is that should be our mutation rate.
                         mut_rate = line_list[3][index + 9:]
@@ -129,16 +131,21 @@ def parse_single_bed(input_bed: str,
                             mut_rate = float(mut_rate.split(';')[0])
                         except ValueError:
                             _LOG.error(f"Invalid mutation rate: {my_chr}: ({pos1}, {pos2})")
-                            _LOG.debug(f'4th column of mutation rate bed must be a semicolon list of key, value '
+                            _LOG.error(f'4th column of mutation rate bed must be a semicolon list of key, value '
                                        f'pairs, with one key being mut_rate, e.g., "foo=bar;mut_rate=0.001;do=re".')
                             raise
 
                         if mut_rate > 0.3:
                             _LOG.warning("Found a mutation rate > 0.3. This is unusual.")
 
                         ret_dict[my_chr].append((int(pos1), int(pos2), mut_rate))
-                    else:
-                        ret_dict[my_chr].append((int(pos1), int(pos2), process_key[1]))
+                    elif process_key[0] == "discard":
+                        # True here means a discard bed was present and this region was marked for discard.
+                        ret_dict[my_chr].append((int(pos1), int(pos2), True))
+                    else:  # target dict
+                        # True here means that this section is to be included (either within a target bed or because
+                        # no targeted regions were defined)
+                        ret_dict[my_chr].append((int(pos1), int(pos2), True))
 
         # some validation
         in_ref_only = [k for k in reference_dictionary if k not in ret_dict]
@@ -161,7 +168,7 @@ def parse_single_bed(input_bed: str,
 
 def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
                       region_dict: dict | None,
-                      factor: tuple[str, float, bool],
+                      default_factor: tuple[str, bool, bool],
                       ) -> dict:
     """
     This parses the dict derived from the bed file and fills in any gaps, so it can be more easily cycled through
@@ -173,8 +180,9 @@ def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
         the data it refers to.
     :param region_dict: A dict based on the input from the bed file, with keys being all the chronmosomes
         present in the reference.
-    :param factor: This is a tuple representing the type and any extra information. The extra info is for mutation
-        rates. If beds were input, then these values come from the bed, else they are set to one value across the contig
+    :param default_factor: This is a tuple representing the type and any extra information. The extra info is for
+        mutation rates. If beds were input, then these values come from the bed, else they are set to one value
+        across the contig
     :return: A tuple with (start, end, some_factor) for each region in the genome.
     """
 
@@ -185,20 +193,29 @@ def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
         max_value = len(ref_dict[contig])
         start = 0
 
-        for region in region_dict[contig]:
-            if len(region) > 2:
-                factor = region[2]
-            if region[0] > start:
-                regions.append((start, region[0], factor[1]))
-                start = region[1]
-                regions.append((region[0], region[1], factor[1]))
-            elif region[0] == start:
-                regions.append((region[0], region[1], factor[1]))
-                start = region[1]
-
-        # If the region ends short of the end, this fills in to the end
-        if regions[-1][1] != max_value:
-            regions.append((start, max_value, factor[1]))
+        # The trivial case of no data yet for this contig the region gets filled out as 0 to len(contig_sequence)
+        # with the default value applied to the whole region
+        if not region_dict[contig]:
+            regions.append((start, max_value, default_factor[1]))
+        else:
+            for region in region_dict[contig]:
+                if region[0] > start:
+                    if default_factor[0] == "discard" or default_factor[0] == "mutation":
+                        regions.append((start, region[0], default_factor[1]))
+                    else:  # target regions get assigned "False" values outside the targets, if bed is present
+                        regions.append((start, region[0], False))
+                    start = region[1]
+                    regions.append(region)
+                elif region[0] == start:
+                    regions.append(region)
+                    start = region[1]
+
+            # If the region ends short of the end, this fills in to the end
+            if regions[-1][1] != max_value:
+                if default_factor[0] == "discard" or default_factor[0] == "mutation":
+                    regions.append((start, max_value, default_factor[1]))
+                else:  # target regions get assigned "False" values outside the targets, if bed is present
+                    regions.append((start, max_value, False))
 
         ret_dict[contig] = regions
 
diff --git a/neat/read_simulator/utils/local_file_writer.py b/neat/read_simulator/utils/local_file_writer.py
@@ -48,16 +48,17 @@ def write_local_file(vcf_filename: str or Path,
             for variant in variant_data[loc]:
                 if not variant.is_input:
                     target_region = find_region(loc, targeted_regions)
-                    # For runs without a targeted bed, target_region[2] is 1, so the following will always fail
-                    if options.rng.random() >= target_region[2]:
+                    # This will be True in targeted regions, if a bed is present, or everywhere if not bed is present.
+                    # Anything outside defined target regions will be marked false and this `if` will activate.
+                    if not target_region[2]:
                         _LOG.debug(f'Variant filtered out by target regions bed: {reference.id}: '
                                    f'{variant}')
                         filtered_by_target += 1
                         continue
 
-                    # Now we check the discard regions. All regions are set to a factor of 1 if there is not discard bed
                     discard_region = find_region(loc, discarded_regions)
-                    if discard_region[2] == 0:
+                    # This will be True if the discard bed was present and this region was within a discard bed region.
+                    if discard_region[2]:
                         _LOG.debug(f'Variant filtered out by discard regions bed: {reference.id}: '
                                    f'{variant}')
                         filtered_by_discard += 1
@@ -87,11 +88,11 @@ def write_local_file(vcf_filename: str or Path,
     if options.produce_fasta:
         local_fasta.write_fasta()
 
-    if filtered_by_target:
+    if filtered_by_target > 0:
         _LOG.info(f'{filtered_by_target} variants excluded because '
                   f'of target regions with discard off-target enabled')
 
-    if filtered_by_discard:
+    if filtered_by_discard > 0:
         _LOG.info(f'{filtered_by_discard} variants excluded because '
                   f'of target regions with discard off-target enabled')