Skip to content

Commit d389ada

Browse files
committed
Making progress simplifying and strengthening the generate reads section
1 parent 4c94542 commit d389ada

File tree

6 files changed

+151
-372
lines changed

6 files changed

+151
-372
lines changed

neat/models/default_fraglen_model.py

-8
This file was deleted.

neat/models/models.py

+8-34
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
from .default_mutation_model import *
2222
from .default_sequencing_error_model import *
2323
from .default_gc_bias_model import *
24-
from .default_fraglen_model import *
2524
from .utils import bin_scores, take_closest
2625

2726
__all__ = [
@@ -600,53 +599,28 @@ class FragmentLengthModel:
600599
601600
:param fragment_mean: the mean of the collection of fragment lengths derived from data
602601
:param fragment_std: the standard deviation of the collection of fragment lengths derived from data
603-
:param fragment_max: the largest fragment observed in the data
604-
:param fragment_min: the smallest fragment observed in data
605602
:param rng: the random number generator for the run
606603
"""
607604

608605
def __init__(self,
609-
fragment_mean: float = None,
610-
fragment_std: float = None,
611-
fragment_max: int = None,
612-
fragment_min: int = None,
606+
fragment_mean: float,
607+
fragment_std: float,
613608
rng: Generator = None):
614-
self.fragment_mean = fragment_mean if fragment_mean else default_fragment_mean
615-
self.fragment_st_dev = fragment_std if fragment_std else default_fragment_std
616-
self.fragment_max = fragment_max if fragment_max else default_fragment_max
617-
self.fragment_min = fragment_min if fragment_min else default_fragment_min
609+
self.fragment_mean = fragment_mean
610+
self.fragment_st_dev = fragment_std
618611
self.rng = rng
619612

620613
def generate_fragments(self,
621614
total_length: int,
622-
read_length: int,
623-
coverage: int) -> list:
615+
number_of_fragments: int) -> list:
624616
"""
625617
Generates a number of fragments based on the total length needed, and the mean and standard deviation of the set
626618
627619
:param total_length: Length of the reference segment we are covering.
628-
:param read_length: average length of the reads
629-
:param coverage: the target coverage number
620+
:param number_of_fragments: The number of fragments needed.
630621
:return: A list of random fragment lengths sampled from the model.
631622
"""
632-
# Estimate the number of fragments needed (with a 2x padding)
633-
number_of_fragments = int(round(total_length / read_length) * (coverage * 2))
634-
# Check that we don't have unusable values for fragment mean. Too many fragments under the read length means
635-
# NEAT will either get caught in an infinite cycle of sampling fragments but never finding one that works, or
636-
# it will only find a few and will run very slowly.
637-
if self.fragment_mean < read_length:
638-
# Let's just reset the fragment mean to make up for this.
639-
self.fragment_mean = read_length
640623
# generates a distribution, assuming normality, then rounds the result and converts to ints
641624
dist = np.round(self.rng.normal(self.fragment_mean, self.fragment_st_dev, size=number_of_fragments)).astype(int)
642-
# filter the list to throw out outliers and to set anything under the read length to the read length.
643-
dist = [max(x, read_length) for x in dist if x <= self.fragment_max]
644-
# Just a sanity check to make sure our data isn't too thin:
645-
while number_of_fragments - len(dist) > 0:
646-
additional_fragment = self.rng.normal(loc=self.fragment_mean, scale=self.fragment_st_dev)
647-
if additional_fragment < read_length:
648-
continue
649-
dist.append(round(additional_fragment))
650-
651-
# Now set a minimum on the dataset. Any fragment smaller than read_length gets turned into read_length
652-
return dist
625+
626+
return list(dist)

neat/read_simulator/runner.py

+9-8
Original file line numberDiff line numberDiff line change
@@ -98,15 +98,16 @@ def initialize_all_models(options: Options):
9898

9999
_LOG.debug('GC Bias model loaded')
100100

101-
fraglen_model = None
102-
if options.paired_ended:
103-
if options.fragment_model:
104-
fraglen_model = pickle.load(gzip.open(options.fragment_model))
105-
else:
106-
fraglen_model = FragmentLengthModel(options.fragment_mean, options.fragment_st_dev)
107-
108-
# Set the rng for the fragment length model
101+
if options.fragment_model:
102+
fraglen_model = pickle.load(gzip.open(options.fragment_model))
109103
fraglen_model.rng = options.rng
104+
elif options.fragment_mean:
105+
fraglen_model = FragmentLengthModel(options.fragment_mean, options.fragment_st_dev, rng=options.rng)
106+
else:
107+
# For single ended, fragment length will be based on read length
108+
fragment_mean = options.read_len * 1.5
109+
fragment_st_dev = fragment_mean * 0.2
110+
fraglen_model = FragmentLengthModel(fragment_mean, fragment_st_dev, options.rng)
110111

111112
_LOG.debug("Fragment length model loaded")
112113

0 commit comments

Comments
 (0)