Skip to content

Commit fc2aced

Browse files
committed
Fixing a bug in gen_mut_model where something got deleted
1 parent a305799 commit fc2aced

File tree

3 files changed

+53
-4
lines changed

3 files changed

+53
-4
lines changed

neat/cli/commands/gen_mut_model.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,12 @@ def add_arguments(self, parser: argparse.ArgumentParser):
6868
help="Includes a list of common variants, if you want to visualize "
6969
"common variants with plot_mut_model.py.")
7070

71+
parser.add_argument('--save-trinuc',
72+
action='store_true',
73+
required=False,
74+
default=False,
75+
help='Saves trinucleotide counts to a separate file')
76+
7177
parser.add_argument('--overwrite',
7278
action='store_true',
7379
required=False,
@@ -83,5 +89,5 @@ def execute(self, arguments: argparse.Namespace):
8389
:param arguments: The namespace with arguments and their values.
8490
"""
8591
compute_mut_runner(arguments.reference, arguments.mutations, arguments.bed, arguments.outcounts,
86-
arguments.show_trinuc, arguments.human_sample,
92+
arguments.show_trinuc, arguments.save_trinuc, arguments.human_sample,
8793
arguments.skip_common, arguments.output, arguments.overwrite)

neat/models/models.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,19 @@ def generate_fragments(self,
567567
:param number_of_fragments: The number of fragments needed.
568568
:return: A list of fragment random fragment lengths sampled from the model.
569569
"""
570+
# Due to natural variation in genome lengths, it's difficult to harden this code against all the possible
571+
# inputs. In order to harden this code against infinite loops caused by fragment means that are the wrong size for
572+
# the genome, we introduce a small number of standard fragments, to ensure enough variability that our code can
573+
# complete. A few small fragments should increase the variability of the set. Most of these are too small
574+
# to create a read, so they become spacers instead.
575+
extra_fragments = [10, 11, 12, 13, 14, 28, 31]
570576
# generates a distribution, assuming normality, then rounds the result and converts to ints
571577
dist = np.round(self.rng.normal(self.fragment_mean, self.fragment_st_dev, size=number_of_fragments)).astype(int)
572-
return [abs(x) for x in dist]
578+
dist = [abs(x) for x in dist]
579+
# We'll append enough fragments to pad out distribution and add variability. Don't know if the cost of doing
580+
# this is worth it though.
581+
number_extra = int(number_of_fragments * 0.1)
582+
for i in range(number_extra):
583+
dist.append(extra_fragments[i % len(extra_fragments)])
584+
self.rng.shuffle(dist) # this shuffle mixes extra fragments in.
585+
return dist[:number_of_fragments]

neat/read_simulator/utils/generate_reads.py

+32-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import logging
22
import time
33
import pickle
4+
import sys
45

56
from math import ceil
67
from pathlib import Path
@@ -37,6 +38,10 @@ def cover_dataset(
3738
"""
3839

3940
final_reads = set()
41+
# sanity check
42+
if span_length/options.fragment_mean < 5:
43+
_LOG.warning("The fragment mean is relatively large compared to the chromosome size. You may need to increase "
44+
"standard deviation, or decrease fragment mean, if NEAT cannot complete successfully.")
4045
# precompute how many reads we want
4146
# The numerator is the total number of base pair calls needed.
4247
# Divide that by read length gives the number of reads needed
@@ -49,9 +54,23 @@ def cover_dataset(
4954
# step 2: repeat above until number of reads exceeds number_reads * 1.5
5055
# step 3: shuffle pool, then draw number_reads (or number_reads/2 for paired ended) reads to be our reads
5156
read_count = 0
57+
loop_count = 0
5258
while read_count <= number_reads:
5359
start = 0
60+
loop_count += 1
61+
# if loop_count > options.coverage * 100:
62+
# _LOG.error("The selected fragment mean and standard deviation are causing NEAT to get stuck.")
63+
# _LOG.error("Please try adjusting fragment mean or standard deviation to see if that fixes the issue.")
64+
# _LOG.error(f"parameters:\n"
65+
# f"chromosome length: {span_length}\n"
66+
# f"read length: {options.read_len}\n"
67+
# f"fragment mean: {options.fragment_mean}\n"
68+
# f"fragment standard deviation: {options.fragment_st_dev}")
69+
# sys.exit(1)
5470
temp_fragments = []
71+
# trying to get enough variability to harden NEAT against edge cases.
72+
if loop_count % 10 == 0:
73+
fragment_model.rng.shuffle(fragment_pool)
5574
# Breaking the genome into fragments
5675
while start < span_length:
5776
# We take the first element and put it back on the end to create an endless pool of fragments to draw from
@@ -61,9 +80,9 @@ def cover_dataset(
6180
if end - start < options.read_len:
6281
# add some random flavor to try to keep it from falling into a loop
6382
if options.rng.normal() < 0.5:
64-
fragment_pool.insert(fragment, len(fragment_pool)//2)
83+
fragment_pool.insert(len(fragment_pool)//2, fragment)
6584
else:
66-
fragment_pool.insert(fragment, len(fragment_pool) - 3)
85+
fragment_pool.insert(len(fragment_pool) - 3, fragment)
6786
else:
6887
fragment_pool.append(fragment)
6988
temp_fragments.append((start, end))
@@ -87,6 +106,16 @@ def cover_dataset(
87106
# where start and end are ints with end > start. Reads can overlap, so right_start < left_end
88107
# is possible, but the reads cannot extend past each other, so right_start < left_start and
89108
# left_end > right_end are not possible.
109+
110+
# sanity check that we haven't created an unrealistic read:
111+
insert_size = read2[0] - read1[1]
112+
if insert_size > 2 * options.read_len:
113+
# Probably an outlier fragment length. We'll just pitch one of the reads
114+
# and consider it lost to the ages.
115+
if fragment_model.rng.choice((True, False)):
116+
read1 = (0, 0)
117+
else:
118+
read2 = (0, 0)
90119
read = read1 + read2
91120
if read not in final_reads:
92121
final_reads.add(read)
@@ -97,6 +126,7 @@ def cover_dataset(
97126
# Now we shuffle them to add some randomness
98127
options.rng.shuffle(final_reads)
99128
# And only return the number we needed
129+
_LOG.debug(f"Coverage required {loop_count} loops")
100130
if options.paired_ended:
101131
# Since each read is actually 2 reads, we only need to return half as many. But to cheat a few extra, we scale
102132
# that down slightly to 1.85 reads per read. This factor is arbitrary and may even be a function. But let's see

0 commit comments

Comments
 (0)