Skip to content

Commit 9ce447f

Browse files
committed
Updates to the models to allow for the replacement or addition of the Markov model.
1 parent 27bd768 commit 9ce447f

12 files changed

+465
-410
lines changed

neat/model_sequencing_error/runner.py

+24-16
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
from .utils import parse_file
1616
from ..common import validate_output_path, validate_input_path
17-
from ..models import SequencingErrorModel
17+
from ..models import SequencingErrorModel, TraditionalQualityModel
1818
from ..variants import Insertion, Deletion, SingleNucleotideVariant
1919

2020
__all__ = [
@@ -89,17 +89,7 @@ def model_seq_err_runner(
8989
validate_output_path(output_prefix, is_file=False)
9090
output_prefix = Path(output_prefix)
9191

92-
"""
93-
Previously, this was
94-
95-
output_file = Path(output_prefix).with_suffix('.p.gz')
96-
97-
But this tended to drop parts of the name, if they included dots:
98-
e.g., output_prefix = "/path/to/samplename.testrun" outputted to /path/to/samplename.p.gz
99-
100-
This way avoids that issue and outputs to /path/to/samplename.testrun.p.gz,
101-
even though it is a little more cumbersome
102-
"""
92+
# used string logic instead of pathlib here bc pathlib was cutting off extensions.
10393
output_file = output_prefix.parent / f'{output_prefix.name}.p.gz'
10494
validate_output_path(output_file, overwrite=overwrite)
10595
_LOG.info(f'Writing output to: {output_file}')
@@ -116,8 +106,7 @@ def model_seq_err_runner(
116106
final_quality_scores,
117107
num_records_to_process,
118108
offset,
119-
read_length,
120-
binned_scores
109+
read_length
121110
)
122111

123112
read_parameters.append(parameters_by_position)
@@ -157,23 +146,42 @@ def model_seq_err_runner(
157146
seq_err_model = SequencingErrorModel(
158147
avg_seq_error=average_error,
159148
read_length=read_length,
149+
)
150+
151+
# Just the default model
152+
qual_score_model = TraditionalQualityModel(
153+
average_error=average_error,
160154
quality_scores=np.array(final_quality_scores),
161155
qual_score_probs=read_parameters[0],
162156
)
163157

164158
if len(files) == 1:
165159
with gzip.open(output_file, 'w') as out_model:
166-
pickle.dump([seq_err_model, None], out_model)
160+
pickle.dump({
161+
"error_model1": seq_err_model,
162+
"error_model2": None,
163+
"qual_score_model1": qual_score_model,
164+
"qual_score_model2": None
165+
}, out_model)
167166

168167
else:
169168
# Second model if a second input was given
170169
seq_err_model_r2 = SequencingErrorModel(
171170
avg_seq_error=average_error,
172171
read_length=read_length,
172+
)
173+
174+
qual_score_model_r2 = TraditionalQualityModel(
175+
average_error=average_error,
173176
quality_scores=np.array(final_quality_scores),
174177
qual_score_probs=read_parameters[1]
175178
)
176179
with gzip.open(output_file, 'w') as out_model:
177-
pickle.dump([seq_err_model, seq_err_model_r2], out_model)
180+
pickle.dump({
181+
"error_model1": seq_err_model,
182+
"error_model2": seq_err_model_r2,
183+
"qual_score_model1": qual_score_model,
184+
"qual_score_model2": qual_score_model_r2
185+
}, out_model)
178186

179187
_LOG.info("Modeling sequencing errors is complete, have a nice day.")

neat/model_sequencing_error/utils.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -189,12 +189,12 @@ def plot_stuff(init_q, real_q, q_range, prob_q, actual_readlen, plot_path):
189189
plt.rcParams.update({'font.size': 14, 'font.weight': 'bold', 'lines.linewidth': 3})
190190

191191
plt.figure(1)
192-
Z = np.array(init_q).T
193-
X, Y = np.meshgrid(range(0, len(Z[0]) + 1), range(0, len(Z) + 1))
194-
plt.pcolormesh(X, Y, Z, vmin=0., vmax=0.25)
195-
plt.axis([0, len(Z[0]), 0, len(Z)])
196-
plt.yticks(range(0, len(Z), 10), range(0, len(Z), 10))
197-
plt.xticks(range(0, len(Z[0]), 10), range(0, len(Z[0]), 10))
192+
z = np.array(init_q).T
193+
x, y = np.meshgrid(range(0, len(z[0]) + 1), range(0, len(z) + 1))
194+
plt.pcolormesh(x, Y, z, vmin=0., vmax=0.25)
195+
plt.axis([0, len(z[0]), 0, len(z)])
196+
plt.yticks(range(0, len(z), 10), range(0, len(z), 10))
197+
plt.xticks(range(0, len(z[0]), 10), range(0, len(z[0]), 10))
198198
plt.xlabel('Read Position')
199199
plt.ylabel('Quality Score')
200200
plt.title('Q-Score Prior Probabilities')
@@ -233,7 +233,7 @@ def plot_stuff(init_q, real_q, q_range, prob_q, actual_readlen, plot_path):
233233

234234
plt.figure(p + 1)
235235
z = np.log10(current_data)
236-
x, y = np.meshgrid(range(0, len(Z[0]) + 1), range(0, len(Z) + 1))
236+
x, y = np.meshgrid(range(0, len(z[0]) + 1), range(0, len(z) + 1))
237237
plt.pcolormesh(x, y, z[::-1, :], vmin=v_min_log[0], vmax=v_min_log[1], cmap='jet')
238238
plt.xlim([q_range[0], q_range[1] + 1])
239239
plt.ylim([real_q - q_range[1] - 1, real_q - q_range[0]])
+170
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
import numpy as np
2+
3+
# The following default model parameters are here, for tweaking and revision.
4+
5+
# This is based on the input model NEAT was using previously
6+
default_error_transition_matrix = np.array(
7+
[[0.0, 0.4918, 0.3377, 0.1705],
8+
[0.5238, 0.0, 0.2661, 0.2101],
9+
[0.3755, 0.2355, 0.0, 0.389],
10+
[0.2505, 0.2552, 0.4943, 0.0]]
11+
)
12+
13+
# All numbers from 1-42
14+
default_quality_scores = np.arange(1, 43)
15+
16+
17+
default_qual_score_probs = np.array([
18+
[35.20551064, 5.39726466],
19+
[35.05310236, 5.63622973],
20+
[35.03595806, 5.66646784],
21+
[34.98022112, 5.74627755],
22+
[34.97572106, 5.75463045],
23+
[34.4884581, 6.41083843],
24+
[35.01230914, 5.69464369],
25+
[35.05956044, 5.61320501],
26+
[35.06796147, 5.57262371],
27+
[34.96093227, 5.72592418],
28+
[34.94813878, 5.75079686],
29+
[34.96537761, 5.74288137],
30+
[35.04818471, 5.64898391],
31+
[35.05231225, 5.64270542],
32+
[35.01482501, 5.69340856],
33+
[34.9825929, 5.73638614],
34+
[34.98442388, 5.7366054],
35+
[35.03409773, 5.66869673],
36+
[35.0844361, 5.54805317],
37+
[35.08961879, 5.53743692],
38+
[35.0466029, 5.60379104],
39+
[34.99117371, 5.69895578],
40+
[34.9815548, 5.71482919],
41+
[34.98911734, 5.7011922],
42+
[34.93751495, 5.78304943],
43+
[34.93036155, 5.79838573],
44+
[34.92922972, 5.8309752],
45+
[34.96684586, 5.76060028],
46+
[34.9708005, 5.74962913],
47+
[34.95329795, 5.7652556],
48+
[34.92443461, 5.83257523],
49+
[34.91492225, 5.84652014],
50+
[34.8549109, 5.91585614],
51+
[34.94560666, 5.76479635],
52+
[34.96711896, 5.72745932],
53+
[34.88901206, 5.85709602],
54+
[34.93905948, 5.76436017],
55+
[34.94363806, 5.7506911],
56+
[34.89343347, 5.80958086],
57+
[34.95898364, 5.77013784],
58+
[34.9618132, 5.77057502],
59+
[34.9055352, 5.82500424],
60+
[34.95546906, 5.78985858],
61+
[34.96132197, 5.77845029],
62+
[34.87531478, 5.88705352],
63+
[34.84007611, 5.98736108],
64+
[34.84061082, 5.98726021],
65+
[34.89358007, 5.83662558],
66+
[34.88615263, 5.87565349],
67+
[34.88569617, 5.87220656],
68+
[34.80492984, 6.03141243],
69+
[34.88045333, 5.87773523],
70+
[34.86089018, 5.92644241],
71+
[34.85899099, 5.93253766],
72+
[34.85281461, 5.95985895],
73+
[34.8233026, 5.92281158],
74+
[34.81999927, 5.93027502],
75+
[34.86737867, 5.89225314],
76+
[34.84964333, 5.94130026],
77+
[34.8416016, 5.95394121],
78+
[34.83298563, 5.93068537],
79+
[34.85175139, 5.91356723],
80+
[34.85312089, 5.91332882],
81+
[34.81363703, 5.99487116],
82+
[34.84683162, 5.91835211],
83+
[34.8503877, 5.90848694],
84+
[34.75132727, 6.05903999],
85+
[34.69121836, 6.10534185],
86+
[34.69027134, 6.10258631],
87+
[34.55838196, 6.33738628],
88+
[34.66787243, 6.16328276],
89+
[34.67600762, 6.14921298],
90+
[34.69141004, 6.12850504],
91+
[34.69170575, 6.11885946],
92+
[34.69627971, 6.11449364],
93+
[34.61730669, 6.25960228],
94+
[34.65572245, 6.15978888],
95+
[34.66272688, 6.15123117],
96+
[34.61673377, 6.26945453],
97+
[34.50737271, 6.36690753],
98+
[34.49273489, 6.38753577],
99+
[34.53277647, 6.35271811],
100+
[34.55186016, 6.29667187],
101+
[34.5545907, 6.29615587],
102+
[34.42162184, 6.48769325],
103+
[34.42162184, 6.48769325],
104+
[34.41120476, 6.51973428],
105+
[34.41001927, 6.52218756],
106+
[34.50872609, 6.36667517],
107+
[34.34054261, 6.55724021],
108+
[34.31472999, 6.59706435],
109+
[34.44800289, 6.370829],
110+
[34.38082743, 6.48116784],
111+
[34.36736783, 6.50225428],
112+
[34.39091046, 6.49333254],
113+
[34.14685981, 6.81443525],
114+
[34.11286757, 6.86040777],
115+
[34.03892364, 6.9167951],
116+
[34.09461999, 6.84357267],
117+
[34.11624969, 6.81739307],
118+
[34.03311938, 6.89976027],
119+
[34.19285701, 6.73526643],
120+
[34.0871172, 6.85069533],
121+
[34.06668596, 6.8773227],
122+
[33.96060364, 6.9642502],
123+
[33.96399568, 6.94576564],
124+
[33.96784324, 6.94562026],
125+
[33.90260425, 7.05523946],
126+
[33.84443023, 7.10595304],
127+
[33.82584028, 7.12368594],
128+
[33.83180939, 7.14712464],
129+
[33.80809054, 7.14842046],
130+
[33.80740632, 7.14768389],
131+
[33.77340509, 7.16367224],
132+
[33.68690403, 7.30024217],
133+
[33.67144094, 7.32120751],
134+
[33.71053312, 7.25308594],
135+
[33.67084687, 7.25185612],
136+
[33.67146605, 7.24695417],
137+
[33.50354352, 7.43740665],
138+
[33.43878783, 7.43451809],
139+
[33.41994711, 7.45122696],
140+
[33.41239779, 7.46949964],
141+
[33.2412823, 7.64587631],
142+
[33.21334153, 7.67937435],
143+
[33.26008323, 7.62315701],
144+
[33.07948592, 7.80559543],
145+
[33.04336633, 7.83646848],
146+
[33.06584588, 7.77135486],
147+
[32.75754503, 8.02442237],
148+
[32.69941899, 8.08132663],
149+
[32.32145073, 8.41139017],
150+
[32.29822443, 8.37790239],
151+
[32.27939786, 8.38937451],
152+
[32.12769178, 8.50162153],
153+
[31.96975037, 8.62588371],
154+
[31.91995927, 8.66055964],
155+
[31.90028244, 8.63664978],
156+
[31.95031161, 8.63975579],
157+
[31.9793546, 8.62708864],
158+
[32.01732911, 8.59965107],
159+
[31.90794492, 8.63745435],
160+
[31.86247331, 8.66495217],
161+
[31.8237584, 8.71014014],
162+
[31.7715014, 8.75323658],
163+
[31.75047688, 8.77579462],
164+
[31.50448208, 8.92324659],
165+
[31.49927017, 8.91928077],
166+
[31.48842743, 8.92882862],
167+
[30.88872613, 9.25271538],
168+
[28.51071465, 10.2517433]
169+
])
170+

0 commit comments

Comments
 (0)