8
8
9
9
import logging
10
10
11
- from numpy .random import Generator
12
11
from Bio .Seq import Seq
13
12
from Bio import SeqRecord
14
13
27
26
_LOG = logging .getLogger (__name__ )
28
27
29
28
30
- class TraditionalModel :
29
+ class TraditionalQualityModel :
31
30
"""
32
31
This is the traditional NEAT model for generating quality scores. This will be replaced by
33
32
a Markov model or an optional markov model
@@ -39,17 +38,20 @@ class TraditionalModel:
39
38
:param is_uniform: Some machines use uniform quality scores. This makes simulation a little easier.
40
39
"""
41
40
42
- def __init__ (self ,
43
- transition_matrix : np .ndarray = default_error_transition_matrix ,
44
- quality_scores : np .ndarray = default_quality_scores ,
45
- qual_score_probs : np .ndarray = default_qual_score_probs ,
46
- is_uniform : bool = False ,
47
- ):
41
+ def __init__ (
42
+ self ,
43
+ average_error : float = default_avg_seq_error ,
44
+ transition_matrix : np .ndarray = default_error_transition_matrix ,
45
+ quality_scores : np .ndarray = default_quality_scores ,
46
+ qual_score_probs : np .ndarray = default_qual_score_probs ,
47
+ is_uniform : bool = False
48
+ ):
48
49
49
50
self .transition_matrix = transition_matrix
50
51
self .quality_scores = quality_scores
51
52
self .quality_score_probabilities = qual_score_probs
52
53
self .is_uniform = is_uniform
54
+ self .average_error = average_error
53
55
54
56
# pre-compute the error rate for each quality score. This is the inverse of the phred score equation
55
57
self .quality_score_error_rate : dict [int , float ] = {x : 10. ** (- x / 10 ) for x in self .quality_scores }
@@ -59,11 +61,12 @@ def __init__(self,
59
61
# Set score to the lowest of the max of the quality scores and the input avg error.
60
62
self .uniform_quality_score = min ([max (self .quality_scores ), int (- 10. * np .log10 (self .average_error ) + 0.5 )])
61
63
62
- def get_quality_scores (self ,
63
- input_read_length : int ,
64
- orig_read_len : int ,
65
- rng
66
- ) -> list :
64
+ def get_quality_scores (
65
+ self ,
66
+ input_read_length : int ,
67
+ orig_read_len : int ,
68
+ rng
69
+ ) -> list :
67
70
"""
68
71
Takes a read_length and rng and returns an array of quality scores
69
72
@@ -93,7 +96,8 @@ def get_quality_scores(self,
93
96
94
97
return temp_qual_array
95
98
96
- def quality_index_remap (self , input_read_length , original_read_length ):
99
+ @staticmethod
100
+ def quality_index_remap (input_read_length , original_read_length ):
97
101
"""
98
102
Adjusts the quality map to the suitable read length.
99
103
@@ -108,7 +112,7 @@ def quality_index_remap(self, input_read_length, original_read_length):
108
112
return np .array ([max ([0 , original_read_length * n // input_read_length ]) for n in range (input_read_length )])
109
113
110
114
111
- class MarkovModel :
115
+ class MarkovQualityModel :
112
116
def __init__ (self ):
113
117
# TODO
114
118
pass
@@ -118,59 +122,60 @@ def get_quality_scores(self):
118
122
pass
119
123
120
124
121
- class SequencingErrorModel (SnvModel , DeletionModel , InsertionModel , TraditionalModel ):
125
+ class SequencingErrorModel (SnvModel , DeletionModel , InsertionModel ):
122
126
"""
123
- This is a SequencingErrorModel class, based on the old SequencingError. This covers both errors and quality
124
- scores, since they are intimately related . There are three types of
125
- errors possible: substitutions, insertions, and deletions, similar to mutations, but
126
- simpler. Note that the three input probabilities must add up to 1.0 and the length the list of indel lengths
127
- must be equal to the length of its corresponding probabilities.
127
+ This is a SequencingErrorModel class, based on the old SequencingError. Instead of handling everything, we've
128
+ refactored this to only handle sequencing errors . There are three types of errors possible: substitutions,
129
+ insertions, and deletions, similar to mutations, but simpler and with different underlying models. Note that the
130
+ three input probabilities must add up to 1.0 and the length the list of indel lengths must be equal to the length
131
+ of its corresponding probabilities.
128
132
129
133
:param read_length: The read length derived from real data.
130
134
:param rescale_qualities: If set to true, NEAT will attempt to rescale the qualities based on the input error
131
135
model, rather than using the qualities derived from the real data.
132
- :param variant_probs: Probability dict for each valid variant type
136
+ :param variant_probs: Probability dict for each valid variant type.
133
137
:param indel_len_model: Similar to mutation model, but simpler because errors tend not to be complicated. The
134
- three possible variant types for errors are Insertion, Deletion, and SNV
135
- :param rng: optional random number generator. For generating this model, no RNG is needed. But for a run,
136
- we'll need the rng to perform certain methods.
138
+ three possible variant types for errors are Insertion, Deletion, and SNV.
139
+ :param insertion_model: The specific parameters for the machine's insertion error rates.
140
+ :param transition_matrix: The matrix for the transition for SNVs from one base to another.
141
+ :param rescale_qualities: If qualities should be shifted because of a manually entered error rate override.
137
142
:param avg_seq_error: A float giving the average rate of sequencing errors,
138
143
either defined by data or user input.
139
144
"""
140
145
141
- def __init__ (self ,
142
- read_length : int = default_read_length ,
143
- variant_probs : dict [variants : float ] = default_error_variant_probs ,
144
- indel_len_model : dict [int : float ] = default_indel_len_model ,
145
- insertion_model : np .ndarray = default_insertion_model ,
146
- rescale_qualities : bool = False ,
147
- avg_seq_error : float = default_avg_seq_error ,
148
- rng : Generator = None ):
146
+ def __init__ (
147
+ self ,
148
+ read_length : int = default_read_length ,
149
+ variant_probs : dict [variants : float ] = default_error_variant_probs ,
150
+ indel_len_model : dict [int : float ] = default_indel_len_model ,
151
+ insertion_model : np .ndarray = default_insertion_model ,
152
+ transition_matrix : np .ndarray = default_error_transition_matrix ,
153
+ rescale_qualities : bool = False ,
154
+ avg_seq_error : float = default_avg_seq_error ,
155
+ ):
149
156
150
157
SnvModel .__init__ (self )
151
158
InsertionModel .__init__ (self , indel_len_model )
152
159
DeletionModel .__init__ (self , indel_len_model )
153
160
154
161
self .variant_probs = variant_probs
155
162
156
- self .read_length = read_length
157
163
self .read_length = read_length
158
164
self .rescale_qualities = rescale_qualities
159
165
self .insertion_model = insertion_model
166
+ self .transition_matrix = transition_matrix
160
167
self .average_error = avg_seq_error
161
- self .rng = rng
162
-
163
- def get_sequencing_errors (self ,
164
- read_length : int ,
165
- padding : int ,
166
- reference_segment : SeqRecord ,
167
- quality_scores : np .ndarray ,
168
- rng
169
- ):
168
+
169
+ def get_sequencing_errors (
170
+ self ,
171
+ padding : int ,
172
+ reference_segment : SeqRecord ,
173
+ quality_scores : np .ndarray ,
174
+ rng
175
+ ):
170
176
"""
171
177
Inserts errors of type substitution, insertion, or deletion into read_data, and assigns a quality score
172
178
based on the container model.
173
- :param read_length: The length of the read to generate errors for.
174
179
:param padding: this is the amount of space we have in the read for deletions.
175
180
:param reference_segment: The section of the reference from which the read is drawn
176
181
:param quality_scores: Array of quality scores for the read
@@ -180,14 +185,16 @@ def get_sequencing_errors(self,
180
185
181
186
error_indexes = []
182
187
introduced_errors = []
188
+ # pre-compute the error rate for each quality score. This is the inverse of the phred score equation
189
+ quality_score_error_rate : dict [int , float ] = {x : 10. ** (- x / 10 ) for x in quality_scores }
183
190
184
191
# The use case here would be someone running a simulation where they want no sequencing errors.
185
192
# No need to run any loops in this case.
186
193
if self .average_error == 0 :
187
194
return introduced_errors
188
195
else :
189
- for i in range (read_length ):
190
- if rng .random () < self . quality_score_error_rate [quality_scores [i ]]:
196
+ for i in range (self . read_length ):
197
+ if rng .random () < quality_score_error_rate [quality_scores [i ]]:
191
198
error_indexes .append (i )
192
199
193
200
total_indel_length = 0
@@ -250,34 +257,6 @@ def get_sequencing_errors(self,
250
257
251
258
return introduced_errors , max (padding , 0 )
252
259
253
- def generate_quality_scores (
254
- self ,
255
- inp_read_len : int ,
256
- ):
257
- if self .uniform_quality_score :
258
- return np .array ([self .uniform_quality_score ] * inp_read_len )
259
- else :
260
-
261
- temp_quality_array = self .get_quality_scores (
262
- inp_read_len ,
263
- self .read_length ,
264
- self .rng
265
- )
266
-
267
- if self .rescale_qualities :
268
- # Note that rescaling won't have much effect on binned quality scores.
269
- # First we rescale the quality score literals then convert back to phred score (with the 0.5
270
- # causing borderline cases to take the next highest number).
271
- rescaled_quals = [max ([0 , int (- 10. * np .log10 (self .average_error *
272
- self .quality_score_error_rate [n ]) + 0.5 )])
273
- for n in temp_quality_array ]
274
- # Now rebin the quality scores.
275
- temp_qual_array = np .array (rescaled_quals )
276
- else :
277
- temp_qual_array = np .array (temp_quality_array )
278
-
279
- return temp_qual_array [:inp_read_len ]
280
-
281
260
282
261
class ErrorContainer :
283
262
"""
0 commit comments