Skip to content

Commit 8763a48

Browse files
committed
fixed a mistake with the target coverage vector
1 parent 75b74da commit 8763a48

File tree

1 file changed

+26
-8
lines changed

1 file changed

+26
-8
lines changed

neat/read_simulator/utils/generate_reads.py

+26-8
Original file line numberDiff line numberDiff line change
@@ -313,32 +313,46 @@ def replace_n(segment: Seq, rng: Generator) -> Seq:
313313
return Seq(modified_segment)
314314

315315

316-
def modify_target_coverage(target_regions: list, discard_regions: list, coverage_vector: np.ndarray, coverage: int):
316+
def modify_target_coverage(
317+
target_regions: list,
318+
discard_regions: list,
319+
coverage_vector: np.ndarray,
320+
coverage: int,
321+
target_shape: int):
317322
"""
318323
Modifies the coverage vector by applying the list of regions. For this version, areas
319324
outside the regions have coverage adjusted by the off_target_percent
320325
321326
:param target_regions: A list of intervals to target, extracted from a bed file
322327
:param discard_regions: A list of regions to throw out, extracted from a bed file
323328
:param coverage_vector: The target coverage vector, which will be modified
324-
:return: The updated target coverage vector.
329+
:param coverage: The target coverage value at this position
330+
:param target_shape: The size of the bins in the final output
331+
:return: The updated target coverage vector, binned into target_shape sized bins.
325332
"""
326333

327-
# this will tabulate values for included regions first, then for excluded regions. Hopefully both are not present.
328-
# If only one is present, the other should not change it.
329-
330334
for region in target_regions:
331335
# in the included regions, any are marked false is to be excluded. Everything else remains untouched
332336
if not region[2]:
333337
coverage_vector[region[0]: region[1]] = [0] * (region[1] - region[0])
334338

335339
for region in discard_regions:
336340
# in the discard regions section, a True indicates this region falls
337-
# within a discard bed and should be discarded
341+
# within a discard bed and should be discarded. Note that this will discard regions if the discard and target
342+
# beds overlap
338343
if region[2]:
339344
coverage_vector[region[0]: region[1]] = [0] * (region[1] - region[0])
340345

341-
return coverage_vector
346+
# now for the final coverage vector, we need to bin by the window bias in the gc-bias model
347+
start = 0
348+
final_coverage_vector = []
349+
for i in range(target_shape, len(coverage_vector), target_shape):
350+
subsection = coverage_vector[start: i]
351+
subsection_average = round(sum(subsection)/len(subsection)) # round() with no second argument returns an int
352+
final_coverage_vector.append(subsection_average)
353+
start = i
354+
355+
return final_coverage_vector
342356

343357

344358
def merge_sort(my_array: np.ndarray):
@@ -404,9 +418,13 @@ def generate_reads(reference: SeqRecord,
404418
# I'm trying to move this section into cover_dataset.
405419
# target_coverage_vector = gc_bias.create_coverage_bias_vector(target_coverage_vector, reference.seq.upper())
406420

421+
# We want to bin the coverage into window-sized segments to speed up calculations.
422+
# This divides the segment into len(reference) // window_size (integer division).
423+
target_shape = len(reference) // gc_bias.window_size
424+
407425
# Apply the targeting/discarded rates.
408426
target_coverage_vector = modify_target_coverage(
409-
targeted_regions, discarded_regions, target_coverage_vector, options.coverage
427+
targeted_regions, discarded_regions, target_coverage_vector, options.coverage, target_shape
410428
)
411429

412430
_LOG.debug("Covering dataset.")

0 commit comments

Comments
 (0)