Skip to content

Commit 0793f4d

Browse files
committed
Removed binned scoring
1 parent 5d23806 commit 0793f4d

File tree

8 files changed

+32
-114
lines changed

8 files changed

+32
-114
lines changed

neat/cli/commands/model_sequencing_error.py

+2-6
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,8 @@ def add_arguments(self, parser: argparse.ArgumentParser):
4848
dest="quality_scores",
4949
nargs='+',
5050
required=False,
51-
default=[2, 11, 25, 37],
52-
help="Quality score bins. Enter as a space separeted list for binned scores "
53-
"(-Q 2 11 24 37), or enter a single maximum score for a full range (-Q 42 gives all"
54-
"scores from 1-42). The default is binned quality scores: [2, 11, 24, 37]. Note that"
55-
"using quality score bins on an unbinned fastq will result in a binned model, at the"
56-
"cost of some inaccuracy.")
51+
default=42,
52+
help="Quality score max. The default 42. The lowest possible score is 1")
5753

5854
parser.add_argument('-m',
5955
type=int,

neat/model_sequencing_error/runner.py

+10-5
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,11 @@ def model_seq_err_runner(
6262
_LOG.debug(f"Quality offset: {offset}")
6363

6464
final_quality_scores: list
65+
binned_scores = False
6566
if len(qual_scores) == 1:
6667
final_quality_scores = list(range(1, qual_scores[0] + 1))
6768
else:
69+
binned_scores = True
6870
final_quality_scores = sorted(qual_scores)
6971

7072
_LOG.debug(f'Quality scores: {final_quality_scores}')
@@ -109,11 +111,14 @@ def model_seq_err_runner(
109111
for file in files:
110112
file_num += 1
111113
_LOG.info(f'Reading file {file_num} of {len(files)}')
112-
parameters_by_position, file_avg_error, read_length = parse_file(file,
113-
final_quality_scores,
114-
num_records_to_process,
115-
offset,
116-
read_length)
114+
parameters_by_position, file_avg_error, read_length = parse_file(
115+
file,
116+
final_quality_scores,
117+
num_records_to_process,
118+
offset,
119+
read_length,
120+
binned_scores
121+
)
117122

118123
read_parameters.append(parameters_by_position)
119124
average_errors.append(file_avg_error)

neat/model_sequencing_error/utils.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
from scipy.stats import mode
1212
from ..common import open_input
13-
from ..models import take_closest
1413

1514
__all__ = [
1615
"parse_file"
@@ -139,10 +138,7 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse
139138

140139
for j in range(read_length):
141140
# The qualities of each read_position_scores
142-
quality_bin = take_closest(quality_scores, qualities_to_check[j])
143-
bin_index = quality_scores.index(quality_bin)
144-
temp_q_count[j][bin_index] += 1
145-
qual_score_counter[quality_bin] += 1
141+
qual_score_counter[qualities_to_check[j]] += 1
146142

147143
if records_read % quarters == 0:
148144
_LOG.info(f'reading data: {(records_read / max_reads) * 100:.0f}%')

neat/models/__init__.py

-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1 @@
11
from .models import *
2-
from .utils import *

neat/models/default_sequencing_error_model.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@
1717
[0.2505, 0.2552, 0.4943, 0.0]]
1818
)
1919

20-
# This list may not be the final list
21-
default_quality_scores = np.array([2, 11, 25, 37])
20+
# All numbers from 1-42
21+
default_quality_scores = np.arange(1, 43)
2222

2323
# This puts a high probability toward getting a maximum quality score. The current values
2424
# should be considered temporary. We're working on final values.

neat/models/models.py

+11-9
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
from ..common import TRINUC_IND, ALLOWED_NUCL, NUC_IND, DINUC_IND
1919
from .default_mutation_model import *
2020
from .default_sequencing_error_model import *
21-
from .utils import bin_scores, take_closest
2221

2322
__all__ = [
2423
"MutationModel",
@@ -376,11 +375,8 @@ def __init__(self,
376375
self.insertion_model = insertion_model
377376
self.uniform_quality_score = None
378377
if self.is_uniform:
379-
# bin scores returns a list, so we need the first (only) element of the list
380-
converted_avg_err = bin_scores(self.quality_scores,
381-
[int(-10. * np.log10(self.average_error))])[0]
382-
# Set score to the lowest of the max of the quality scores and the bin closest to the input avg error.
383-
self.uniform_quality_score = min([max(self.quality_scores), converted_avg_err])
378+
# Set score to the lowest of the max of the quality scores and the input avg error.
379+
self.uniform_quality_score = min([max(self.quality_scores), int(-10. * np.log10(self.average_error) + 0.5)])
384380
self.rng = rng
385381

386382
def get_sequencing_errors(self,
@@ -498,7 +494,13 @@ def get_quality_scores(self,
498494
for i in quality_index_map:
499495
score = self.rng.normal(self.quality_score_probabilities[i][0],
500496
scale=self.quality_score_probabilities[i][1])
501-
score = take_closest(self.quality_scores, score)
497+
# make sure score is in range and an int
498+
score = round(score)
499+
if score > 42:
500+
score = 42
501+
if score < 1:
502+
score = 1
503+
502504
temp_qual_array.append(score)
503505

504506
if self.rescale_qualities:
@@ -509,9 +511,9 @@ def get_quality_scores(self,
509511
self.quality_score_error_rate[n]) + 0.5)])
510512
for n in temp_qual_array]
511513
# Now rebin the quality scores.
512-
temp_qual_array = np.array(bin_scores(self.quality_scores, rescaled_quals))
514+
temp_qual_array = np.array(rescaled_quals)
513515
else:
514-
temp_qual_array = np.array(bin_scores(self.quality_scores, temp_qual_array))
516+
temp_qual_array = np.array(temp_qual_array)
515517

516518
return temp_qual_array[:input_read_length]
517519

neat/models/utils.py

-75
This file was deleted.

neat/read_simulator/utils/generate_reads.py

+6-11
Original file line numberDiff line numberDiff line change
@@ -71,20 +71,15 @@ def cover_dataset(
7171
# trying to get enough variability to harden NEAT against edge cases.
7272
if loop_count % 10 == 0:
7373
fragment_model.rng.shuffle(fragment_pool)
74-
# Breaking the gename into fragments
74+
# Mapping random fragments onto genome
75+
i = 0
7576
while start < span_length:
7677
# We take the first element and put it back on the end to create an endless pool of fragments to draw from
77-
fragment = fragment_pool.pop(0)
78+
fragment = fragment_pool[i]
79+
i = (i + 1) % len(fragment_pool)
7880
end = min(start + fragment, span_length)
79-
# these are equivalent of reads we expect the machine to filter out, but we won't actually use it
80-
if end - start < options.read_len:
81-
# add some random flavor to try to keep it to falling into a loop
82-
if fragment_model.rng.normal() < 0.5:
83-
fragment_pool.insert(len(fragment_pool)//2, fragment)
84-
else:
85-
fragment_pool.insert(len(fragment_pool) - 3, fragment)
86-
else:
87-
fragment_pool.append(fragment)
81+
# Ensure the read is long enough to form a read, else we will not use it.
82+
if end - start > options.read_len:
8883
temp_fragments.append((start, end))
8984
start = end
9085

0 commit comments

Comments
 (0)