Fixing discard bed regions

joshfactorial · joshfactorial · commit 75b74da6121e · 2024-05-09T12:54:54.000-05:00
diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py
@@ -313,21 +313,30 @@ def replace_n(segment: Seq, rng: Generator) -> Seq:
     return Seq(modified_segment)
 
 
-def modify_target_coverage(included_regions: list, excluded_regions: list, coverage_vector: np.ndarray):
+def modify_target_coverage(target_regions: list, discard_regions: list, coverage_vector: np.ndarray, coverage: int):
     """
     Modifies the coverage vector by applying the list of regions. For this version, areas
     outside the regions have coverage adjusted by the off_target_percent
 
-    :param included_regions: A list of intervals to target, extracted from a bed file
-    :param excluded_regions: A list of regions to throw out, extracted from a bed file
+    :param target_regions: A list of intervals to target, extracted from a bed file
+    :param discard_regions: A list of regions to throw out, extracted from a bed file
     :param coverage_vector: The target coverage vector, which will be modified
     :return: The updated target coverage vector.
     """
 
     # this will tabulate values for included regions first, then for excluded regions. Hopefully both are not present.
     # If only one is present, the other should not change it.
-    for region in included_regions + excluded_regions:
-        coverage_vector[region[0]: region[1]] = coverage_vector[region[0]: region[1]] * region[2]
+
+    for region in target_regions:
+        # in the target regions, any region marked False is to be excluded. Everything else remains untouched
+        if not region[2]:
+            coverage_vector[region[0]: region[1]] = [0] * (region[1] - region[0])
+
+    for region in discard_regions:
+        # in the discard regions section, a True indicates this region falls
+        # within a discard bed and should be discarded
+        if region[2]:
+            coverage_vector[region[0]: region[1]] = [0] * (region[1] - region[0])
 
     return coverage_vector
 
@@ -388,18 +397,17 @@ def generate_reads(reference: SeqRecord,
     start = time.time()
 
     base_name = f'{Path(options.output).name}-{chrom}'
-    # We want to bin the coverage into window-sized segments to speed up calculations.
-    # This divides the segment into len(reference) // window_size rounded up to the nearest int.
-    target_shape = ceil(len(reference) // gc_bias.window_size)
     # Assume uniform coverage
-    target_coverage_vector = np.full(shape=target_shape, fill_value=options.coverage)
+    target_coverage_vector = np.full(shape=len(reference), fill_value=options.coverage)
     if not options.no_coverage_bias:
         pass
         # I'm trying to move this section into cover_dataset.
         # target_coverage_vector = gc_bias.create_coverage_bias_vector(target_coverage_vector, reference.seq.upper())
 
     # Apply the targeting/discarded rates.
-    target_coverage_vector = modify_target_coverage(targeted_regions, discarded_regions, target_coverage_vector)
+    target_coverage_vector = modify_target_coverage(
+        targeted_regions, discarded_regions, target_coverage_vector, options.coverage
+    )
 
     _LOG.debug("Covering dataset.")
     t = time.process_time()