Skip to content

Commit 27bd768

Browse files
committed
The last commit was a lie. I knew it seemed too easy. Still need to separate out model generation in read simulator and sequencing error model generation (and maybe make a separate model generator for each regular and markov? I'm fine with straight replacement too.)
1 parent 62821b9 commit 27bd768

File tree

1 file changed

+53
-74
lines changed

1 file changed

+53
-74
lines changed

neat/models/error_models.py

+53-74
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88

99
import logging
1010

11-
from numpy.random import Generator
1211
from Bio.Seq import Seq
1312
from Bio import SeqRecord
1413

@@ -27,7 +26,7 @@
2726
_LOG = logging.getLogger(__name__)
2827

2928

30-
class TraditionalModel:
29+
class TraditionalQualityModel:
3130
"""
3231
This is the traditional NEAT model for generating quality scores. This will be replaced by
3332
a Markov model or an optional markov model
@@ -39,17 +38,20 @@ class TraditionalModel:
3938
:param is_uniform: Some machines use uniform quality scores. This makes simulation a little easier.
4039
"""
4140

42-
def __init__(self,
43-
transition_matrix: np.ndarray = default_error_transition_matrix,
44-
quality_scores: np.ndarray = default_quality_scores,
45-
qual_score_probs: np.ndarray = default_qual_score_probs,
46-
is_uniform: bool = False,
47-
):
41+
def __init__(
42+
self,
43+
average_error: float = default_avg_seq_error,
44+
transition_matrix: np.ndarray = default_error_transition_matrix,
45+
quality_scores: np.ndarray = default_quality_scores,
46+
qual_score_probs: np.ndarray = default_qual_score_probs,
47+
is_uniform: bool = False
48+
):
4849

4950
self.transition_matrix = transition_matrix
5051
self.quality_scores = quality_scores
5152
self.quality_score_probabilities = qual_score_probs
5253
self.is_uniform = is_uniform
54+
self.average_error = average_error
5355

5456
# pre-compute the error rate for each quality score. This is the inverse of the phred score equation
5557
self.quality_score_error_rate: dict[int, float] = {x: 10. ** (-x / 10) for x in self.quality_scores}
@@ -59,11 +61,12 @@ def __init__(self,
5961
# Set score to the lowest of the max of the quality scores and the input avg error.
6062
self.uniform_quality_score = min([max(self.quality_scores), int(-10. * np.log10(self.average_error) + 0.5)])
6163

62-
def get_quality_scores(self,
63-
input_read_length: int,
64-
orig_read_len: int,
65-
rng
66-
) -> list:
64+
def get_quality_scores(
65+
self,
66+
input_read_length: int,
67+
orig_read_len: int,
68+
rng
69+
) -> list:
6770
"""
6871
Takes a read_length and rng and returns an array of quality scores
6972
@@ -93,7 +96,8 @@ def get_quality_scores(self,
9396

9497
return temp_qual_array
9598

96-
def quality_index_remap(self, input_read_length, original_read_length):
99+
@staticmethod
100+
def quality_index_remap(input_read_length, original_read_length):
97101
"""
98102
Adjusts the quality map to the suitable read length.
99103
@@ -108,7 +112,7 @@ def quality_index_remap(self, input_read_length, original_read_length):
108112
return np.array([max([0, original_read_length * n // input_read_length]) for n in range(input_read_length)])
109113

110114

111-
class MarkovModel:
115+
class MarkovQualityModel:
112116
def __init__(self):
113117
# TODO
114118
pass
@@ -118,59 +122,60 @@ def get_quality_scores(self):
118122
pass
119123

120124

121-
class SequencingErrorModel(SnvModel, DeletionModel, InsertionModel, TraditionalModel):
125+
class SequencingErrorModel(SnvModel, DeletionModel, InsertionModel):
122126
"""
123-
This is a SequencingErrorModel class, based on the old SequencingError. This covers both errors and quality
124-
scores, since they are intimately related. There are three types of
125-
errors possible: substitutions, insertions, and deletions, similar to mutations, but
126-
simpler. Note that the three input probabilities must add up to 1.0 and the length the list of indel lengths
127-
must be equal to the length of its corresponding probabilities.
127+
This is a SequencingErrorModel class, based on the old SequencingError. Instead of handling everything, we've
128+
refactored this to only handle sequencing errors. There are three types of errors possible: substitutions,
129+
insertions, and deletions, similar to mutations, but simpler and with different underlying models. Note that the
130+
three input probabilities must add up to 1.0 and the length the list of indel lengths must be equal to the length
131+
of its corresponding probabilities.
128132
129133
:param read_length: The read length derived from real data.
130134
:param rescale_qualities: If set to true, NEAT will attempt to rescale the qualities based on the input error
131135
model, rather than using the qualities derived from the real data.
132-
:param variant_probs: Probability dict for each valid variant type
136+
:param variant_probs: Probability dict for each valid variant type.
133137
:param indel_len_model: Similar to mutation model, but simpler because errors tend not to be complicated. The
134-
three possible variant types for errors are Insertion, Deletion, and SNV
135-
:param rng: optional random number generator. For generating this model, no RNG is needed. But for a run,
136-
we'll need the rng to perform certain methods.
138+
three possible variant types for errors are Insertion, Deletion, and SNV.
139+
:param insertion_model: The specific parameters for the machine's insertion error rates.
140+
:param transition_matrix: The matrix for the transition for SNVs from one base to another.
141+
:param rescale_qualities: If qualities should be shifted because of a manually entered error rate override.
137142
:param avg_seq_error: A float giving the average rate of sequencing errors,
138143
either defined by data or user input.
139144
"""
140145

141-
def __init__(self,
142-
read_length: int = default_read_length,
143-
variant_probs: dict[variants: float] = default_error_variant_probs,
144-
indel_len_model: dict[int: float] = default_indel_len_model,
145-
insertion_model: np.ndarray = default_insertion_model,
146-
rescale_qualities: bool = False,
147-
avg_seq_error: float = default_avg_seq_error,
148-
rng: Generator = None):
146+
def __init__(
147+
self,
148+
read_length: int = default_read_length,
149+
variant_probs: dict[variants: float] = default_error_variant_probs,
150+
indel_len_model: dict[int: float] = default_indel_len_model,
151+
insertion_model: np.ndarray = default_insertion_model,
152+
transition_matrix: np.ndarray = default_error_transition_matrix,
153+
rescale_qualities: bool = False,
154+
avg_seq_error: float = default_avg_seq_error,
155+
):
149156

150157
SnvModel.__init__(self)
151158
InsertionModel.__init__(self, indel_len_model)
152159
DeletionModel.__init__(self, indel_len_model)
153160

154161
self.variant_probs = variant_probs
155162

156-
self.read_length = read_length
157163
self.read_length = read_length
158164
self.rescale_qualities = rescale_qualities
159165
self.insertion_model = insertion_model
166+
self.transition_matrix = transition_matrix
160167
self.average_error = avg_seq_error
161-
self.rng = rng
162-
163-
def get_sequencing_errors(self,
164-
read_length: int,
165-
padding: int,
166-
reference_segment: SeqRecord,
167-
quality_scores: np.ndarray,
168-
rng
169-
):
168+
169+
def get_sequencing_errors(
170+
self,
171+
padding: int,
172+
reference_segment: SeqRecord,
173+
quality_scores: np.ndarray,
174+
rng
175+
):
170176
"""
171177
Inserts errors of type substitution, insertion, or deletion into read_data, and assigns a quality score
172178
based on the container model.
173-
:param read_length: The length of the read to generate errors for.
174179
:param padding: this is the amount of space we have in the read for deletions.
175180
:param reference_segment: The section of the reference from which the read is drawn
176181
:param quality_scores: Array of quality scores for the read
@@ -180,14 +185,16 @@ def get_sequencing_errors(self,
180185

181186
error_indexes = []
182187
introduced_errors = []
188+
# pre-compute the error rate for each quality score. This is the inverse of the phred score equation
189+
quality_score_error_rate: dict[int, float] = {x: 10. ** (-x / 10) for x in quality_scores}
183190

184191
# The use case here would be someone running a simulation where they want no sequencing errors.
185192
# No need to run any loops in this case.
186193
if self.average_error == 0:
187194
return introduced_errors
188195
else:
189-
for i in range(read_length):
190-
if rng.random() < self.quality_score_error_rate[quality_scores[i]]:
196+
for i in range(self.read_length):
197+
if rng.random() < quality_score_error_rate[quality_scores[i]]:
191198
error_indexes.append(i)
192199

193200
total_indel_length = 0
@@ -250,34 +257,6 @@ def get_sequencing_errors(self,
250257

251258
return introduced_errors, max(padding, 0)
252259

253-
def generate_quality_scores(
254-
self,
255-
inp_read_len: int,
256-
):
257-
if self.uniform_quality_score:
258-
return np.array([self.uniform_quality_score] * inp_read_len)
259-
else:
260-
261-
temp_quality_array = self.get_quality_scores(
262-
inp_read_len,
263-
self.read_length,
264-
self.rng
265-
)
266-
267-
if self.rescale_qualities:
268-
# Note that rescaling won't have much effect on binned quality scores.
269-
# First we rescale the quality score literals then convert back to phred score (with the 0.5
270-
# causing borderline cases to take the next highest number).
271-
rescaled_quals = [max([0, int(-10. * np.log10(self.average_error *
272-
self.quality_score_error_rate[n]) + 0.5)])
273-
for n in temp_quality_array]
274-
# Now rebin the quality scores.
275-
temp_qual_array = np.array(rescaled_quals)
276-
else:
277-
temp_qual_array = np.array(temp_quality_array)
278-
279-
return temp_qual_array[:inp_read_len]
280-
281260

282261
class ErrorContainer:
283262
"""

0 commit comments

Comments
 (0)