@@ -313,32 +313,46 @@ def replace_n(segment: Seq, rng: Generator) -> Seq:
313
313
return Seq (modified_segment )
314
314
315
315
316
- def modify_target_coverage (target_regions : list , discard_regions : list , coverage_vector : np .ndarray , coverage : int ):
316
+ def modify_target_coverage (
317
+ target_regions : list ,
318
+ discard_regions : list ,
319
+ coverage_vector : np .ndarray ,
320
+ coverage : int ,
321
+ target_shape : int ):
317
322
"""
318
323
Modifies the coverage vector by applying the list of regions. For this version, areas
319
324
outside the regions have coverage adjusted by the off_target_percent
320
325
321
326
:param target_regions: A list of intervals to target, extracted from a bed file
322
327
:param discard_regions: A list of regions to throw out, extracted from a bed file
323
328
:param coverage_vector: The target coverage vector, which will be modified
324
- :return: The updated target coverage vector.
329
+ :param coverage: The target coverage value at this position
330
+ :param target_shape: The size of the bins in the final output
331
+ :return: The updated target coverage vector, binned into target_shape sized bins.
325
332
"""
326
333
327
- # this will tabulate values for included regions first, then for excluded regions. Hopefully both are not present.
328
- # If only one is present, the other should not change it.
329
-
330
334
for region in target_regions :
331
335
# in the included regions, any are marked false is to be excluded. Everything else remains untouched
332
336
if not region [2 ]:
333
337
coverage_vector [region [0 ]: region [1 ]] = [0 ] * (region [1 ] - region [0 ])
334
338
335
339
for region in discard_regions :
336
340
# in the discard regions section, a True indicates this region falls
337
- # within a discard bed and should be discarded
341
+ # within a discard bed and should be discarded. Note that this will discard regions if the discard and target
342
+ # beds overlap
338
343
if region [2 ]:
339
344
coverage_vector [region [0 ]: region [1 ]] = [0 ] * (region [1 ] - region [0 ])
340
345
341
- return coverage_vector
346
+ # now for the final coverage vector, we need to bin by the window bias in the gc-bias model
347
+ start = 0
348
+ final_coverage_vector = []
349
+ for i in range (target_shape , len (coverage_vector ), target_shape ):
350
+ subsection = coverage_vector [start : i ]
351
+ subsection_average = round (sum (subsection )/ len (subsection )) # round() with no second argument returns an int
352
+ final_coverage_vector .append (subsection_average )
353
+ start = i
354
+
355
+ return final_coverage_vector
342
356
343
357
344
358
def merge_sort (my_array : np .ndarray ):
@@ -404,9 +418,13 @@ def generate_reads(reference: SeqRecord,
404
418
# I'm trying to move this section into cover_dataset.
405
419
# target_coverage_vector = gc_bias.create_coverage_bias_vector(target_coverage_vector, reference.seq.upper())
406
420
421
+ # We want to bin the coverage into window-sized segments to speed up calculations.
422
+ # This divides the segment into len(reference) // window_size (integer division).
423
+ target_shape = len (reference ) // gc_bias .window_size
424
+
407
425
# Apply the targeting/discarded rates.
408
426
target_coverage_vector = modify_target_coverage (
409
- targeted_regions , discarded_regions , target_coverage_vector , options .coverage
427
+ targeted_regions , discarded_regions , target_coverage_vector , options .coverage , target_shape
410
428
)
411
429
412
430
_LOG .debug ("Covering dataset." )
0 commit comments