@@ -21,7 +21,6 @@
 from .default_mutation_model import *
 from .default_sequencing_error_model import *
 from .default_gc_bias_model import *
-from .default_fraglen_model import *
 from .utils import bin_scores, take_closest
 
 __all__ = [
@@ -389,16 +388,17 @@ def __init__(self,
 
     def get_sequencing_errors(self,
                               read_length: int,
+                              padding: int,
                               reference_segment: SeqRecord,
                               quality_scores: np.ndarray):
         """
         Inserts errors of type substitution, insertion, or deletion into read_data, and assigns a quality score
         based on the container model.
         :param read_length: The length of the read to generate errors for.
+        :param padding: The amount of space available in the read for deletions.
         :param reference_segment: The section of the reference from which the read is drawn
         :param quality_scores: Array of quality scores for the read
-        :return: modified sequence and associated quality scores
-
+        :return: Modified sequence and associated quality scores
         """
 
         error_indexes = []
@@ -413,7 +413,7 @@ def get_sequencing_errors(self,
             if self.rng.random() < self.quality_score_error_rate[quality_scores[i]]:
                 error_indexes.append(i)
 
-        num_indels_so_far = 0
+        total_indel_length = 0
         # To prevent deletion collisions
         del_blacklist = []
 
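The rename is worth a note: this accumulator sums indel lengths rather than counting indel events, which the old name num_indels_so_far obscured. A toy illustration of the distinction (not part of the change itself):

    # Toy illustration: the accumulator tracks total bases, not event count.
    indel_lengths = [3, 3]                     # two 3-base indels
    total_indel_length = sum(indel_lengths)    # 6 -- what the renamed variable tracks
    num_indel_events = len(indel_lengths)      # 2 -- what the old name suggested
    assert (total_indel_length, num_indel_events) == (6, 2)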
@@ -424,19 +424,24 @@ def get_sequencing_errors(self,
             # Not too sure about how realistic it is to model errors as indels, but I'm leaving the code in for now.
 
             # This is to prevent deletion error collisions and to keep there from being too many indel errors.
-            if 0 < index < self.read_length - max(self.deletion_len_model) and num_indels_so_far > self.read_length // 2:
+            if 0 < index < self.read_length - max(self.deletion_len_model) and total_indel_length > self.read_length // 4:
                 error_type = self.rng.choice(a=list(self.variant_probs), p=list(self.variant_probs.values()))
 
                 # Deletion error
                 if error_type == Deletion:
                     deletion_length = self.get_deletion_length()
+                    if padding - deletion_length < 0:
+                        # No space in this read to add this deletion
+                        continue
                     deletion_reference = reference_segment.seq[index: index + deletion_length + 1]
                     deletion_alternate = deletion_reference[0]
                     introduced_errors.append(
                         ErrorContainer(Deletion, index, deletion_length, deletion_reference, deletion_alternate)
                     )
-                    num_indels_so_far += deletion_length
+                    total_indel_length += deletion_length
+
                     del_blacklist.extend(list(range(index, index + deletion_length)))
+                    padding -= deletion_length
 
                 elif error_type == Insertion:
                     insertion_length = self.get_insertion_length()
@@ -446,7 +451,7 @@ def get_sequencing_errors(self,
                     introduced_errors.append(
                         ErrorContainer(Insertion, index, insertion_length, insertion_reference, insertion_alternate)
                     )
-                    num_indels_so_far += insertion_length
+                    total_indel_length += insertion_length
 
                 # Insert substitution error
                 # Programmer note: if you add new error types, they can be added as elifs above, leaving the final
@@ -465,7 +470,7 @@ def get_sequencing_errors(self,
             if introduced_errors[i].location in del_blacklist:
                 del introduced_errors[i]
 
-        return introduced_errors
+        return introduced_errors, max(padding, 0)
 
     def quality_index_remap(self, input_read_length):
         """
@@ -600,53 +605,27 @@ class FragmentLengthModel:
 
     :param fragment_mean: the mean of the collection of fragment lengths derived from data
     :param fragment_std: the standard deviation of the collection of fragment lengths derived from data
-    :param fragment_max: the largest fragment observed in the data
-    :param fragment_min: the smallest fragment observed in data
     :param rng: the random number generator for the run
     """
 
     def __init__(self,
-                 fragment_mean: float = None,
-                 fragment_std: float = None,
-                 fragment_max: int = None,
-                 fragment_min: int = None,
+                 fragment_mean: float,
+                 fragment_std: float,
                  rng: Generator = None):
-        self.fragment_mean = fragment_mean if fragment_mean else default_fragment_mean
-        self.fragment_st_dev = fragment_std if fragment_std else default_fragment_std
-        self.fragment_max = fragment_max if fragment_max else default_fragment_max
-        self.fragment_min = fragment_min if fragment_min else default_fragment_min
+        self.fragment_mean = fragment_mean
+        self.fragment_st_dev = fragment_std
         self.rng = rng
 
     def generate_fragments(self,
                            total_length: int,
-                           read_length: int,
-                           coverage: int) -> list:
+                           number_of_fragments: int) -> list:
         """
         Generates a number of fragments based on the total length needed, and the mean and standard deviation of the set
 
         :param total_length: Length of the reference segment we are covering.
-        :param read_length: average length of the reads
-        :param coverage: the target coverage number
+        :param number_of_fragments: The number of fragments needed.
         :return: A list of random fragment lengths sampled from the model.
         """
-        # Estimate the number of fragments needed (with a 2x padding)
-        number_of_fragments = int(round(total_length / read_length) * (coverage * 2))
-        # Check that we don't have unusable values for fragment mean. Too many fragments under the read length means
-        # NEAT will either get caught in an infinite cycle of sampling fragments but never finding one that works, or
-        # it will only find a few and will run very slowly.
-        if self.fragment_mean < read_length:
-            # Let's just reset the fragment mean to make up for this.
-            self.fragment_mean = read_length
         # generates a distribution, assuming normality, then rounds the result and converts to ints
         dist = np.round(self.rng.normal(self.fragment_mean, self.fragment_st_dev, size=number_of_fragments)).astype(int)
-        # filter the list to throw out outliers and to set anything under the read length to the read length.
-        dist = [max(x, read_length) for x in dist if x <= self.fragment_max]
-        # Just a sanity check to make sure our data isn't too thin:
-        while number_of_fragments - len(dist) > 0:
-            additional_fragment = self.rng.normal(loc=self.fragment_mean, scale=self.fragment_st_dev)
-            if additional_fragment < read_length:
-                continue
-            dist.append(round(additional_fragment))
-
-        # Now set a minimum on the dataset. Any fragment smaller than read_length gets turned into read_length
-        return dist
+        return [abs(x) for x in dist]
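The slimmed-down model drops the min/max clamping and the read-length coupling; callers now decide how many fragments they need. A hypothetical usage sketch (the mean, standard deviation, and count are made up, and np.random.default_rng stands in for the run's rng):

    import numpy as np

    rng = np.random.default_rng(1234)
    fragment_mean, fragment_st_dev = 350.0, 30.0
    number_of_fragments = 10

    # Mirrors the new generate_fragments body: sample a normal distribution,
    # round to ints, then take absolute values so no length is negative.
    dist = np.round(rng.normal(fragment_mean, fragment_st_dev, size=number_of_fragments)).astype(int)
    print([abs(x) for x in dist])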