1
1
import logging
2
2
import time
3
3
import pickle
4
+ import sys
4
5
5
6
from math import ceil
6
7
from pathlib import Path
@@ -37,6 +38,10 @@ def cover_dataset(
37
38
"""
38
39
39
40
final_reads = set ()
41
+ # sanity check
42
+ if span_length / options .fragment_mean < 5 :
43
+ _LOG .warning ("The fragment mean is relatively large compared to the chromosome size. You may need to increase "
44
+ "standard deviation, or decrease fragment mean, if NEAT cannot complete successfully." )
40
45
# precompute how many reads we want
41
46
# The numerator is the total number of base pair calls needed.
42
47
# Dividing that by the read length gives the number of reads needed
@@ -49,9 +54,23 @@ def cover_dataset(
49
54
# step 2: repeat above until number of reads exceeds number_reads * 1.5
50
55
# step 3: shuffle pool, then draw number_reads (or number_reads/2 for paired ended) reads to be our reads
51
56
read_count = 0
57
+ loop_count = 0
52
58
while read_count <= number_reads :
53
59
start = 0
60
+ loop_count += 1
61
+ # if loop_count > options.coverage * 100:
62
+ # _LOG.error("The selected fragment mean and standard deviation are causing NEAT to get stuck.")
63
+ # _LOG.error("Please try adjusting fragment mean or standard deviation to see if that fixes the issue.")
64
+ # _LOG.error(f"parameters:\n"
65
+ # f"chromosome length: {span_length}\n"
66
+ # f"read length: {options.read_len}\n"
67
+ # f"fragment mean: {options.fragment_mean}\n"
68
+ # f"fragment standard deviation: {options.fragment_st_dev}")
69
+ # sys.exit(1)
54
70
temp_fragments = []
71
+ # trying to get enough variability to harden NEAT against edge cases.
72
+ if loop_count % 10 == 0 :
73
+ fragment_model .rng .shuffle (fragment_pool )
55
74
# Breaking the genome into fragments
56
75
while start < span_length :
57
76
# We take the first element and put it back on the end to create an endless pool of fragments to draw from
@@ -61,9 +80,9 @@ def cover_dataset(
61
80
if end - start < options .read_len :
62
81
# add some random flavor to try to keep it from falling into a loop
63
82
if options .rng .normal () < 0.5 :
64
- fragment_pool .insert (fragment , len (fragment_pool )// 2 )
83
+ fragment_pool .insert (len (fragment_pool )// 2 , fragment )
65
84
else :
66
- fragment_pool .insert (fragment , len (fragment_pool ) - 3 )
85
+ fragment_pool .insert (len (fragment_pool ) - 3 , fragment )
67
86
else :
68
87
fragment_pool .append (fragment )
69
88
temp_fragments .append ((start , end ))
@@ -87,6 +106,16 @@ def cover_dataset(
87
106
# where start and end are ints with end > start. Reads can overlap, so right_start < left_end
88
107
# is possible, but the reads cannot extend past each other, so right_start < left_start and
89
108
# left_end > right_end are not possible.
109
+
110
+ # sanity check that we haven't created an unrealistic read:
111
+ insert_size = read2 [0 ] - read1 [1 ]
112
+ if insert_size > 2 * options .read_len :
113
+ # Probably an outlier fragment length. We'll just pitch one of the reads
114
+ # and consider it lost to the ages.
115
+ if fragment_model .rng .choice ((True , False )):
116
+ read1 = (0 , 0 )
117
+ else :
118
+ read2 = (0 , 0 )
90
119
read = read1 + read2
91
120
if read not in final_reads :
92
121
final_reads .add (read )
@@ -97,6 +126,7 @@ def cover_dataset(
97
126
# Now we shuffle them to add some randomness
98
127
options .rng .shuffle (final_reads )
99
128
# And only return the number we needed
129
+ _LOG .debug (f"Coverage required { loop_count } loops" )
100
130
if options .paired_ended :
101
131
# Since each read is actually 2 reads, we only need to return half as many. But to cheat a few extra, we scale
102
132
# that down slightly to 1.85 reads per read. This factor is arbitrary and may even be a function. But let's see
0 commit comments