Removed binned scoring

joshfactorial · joshfactorial · commit 0793f4dd0e75 · 2024-07-15T18:03:30.000-05:00
diff --git a/neat/cli/commands/model_sequencing_error.py b/neat/cli/commands/model_sequencing_error.py
@@ -48,12 +48,8 @@ def add_arguments(self, parser: argparse.ArgumentParser):
                             dest="quality_scores",
                             nargs='+',
                             required=False,
-                            default=[2, 11, 25, 37],
-                            help="Quality score bins. Enter as a space separeted list for binned scores "
-                                 "(-Q 2 11 24 37), or enter a single maximum score for a full range (-Q 42 gives all"
-                                 "scores from 1-42). The default is binned quality scores: [2, 11, 24, 37]. Note that"
-                                 "using quality score bins on an unbinned fastq will result in a binned model, at the"
-                                 "cost of some inaccuracy.")
+                            default=42,
+                            help="Quality score max. The default 42. The lowest possible score is 1")
 
         parser.add_argument('-m',
                             type=int,
diff --git a/neat/model_sequencing_error/runner.py b/neat/model_sequencing_error/runner.py
@@ -62,9 +62,11 @@ def model_seq_err_runner(
     _LOG.debug(f"Quality offset: {offset}")
 
     final_quality_scores: list
+    binned_scores = False
     if len(qual_scores) == 1:
         final_quality_scores = list(range(1, qual_scores[0] + 1))
     else:
+        binned_scores = True
         final_quality_scores = sorted(qual_scores)
 
     _LOG.debug(f'Quality scores: {final_quality_scores}')
@@ -109,11 +111,14 @@ def model_seq_err_runner(
     for file in files:
         file_num += 1
         _LOG.info(f'Reading file {file_num} of {len(files)}')
-        parameters_by_position, file_avg_error, read_length = parse_file(file,
-                                                                         final_quality_scores,
-                                                                         num_records_to_process,
-                                                                         offset,
-                                                                         read_length)
+        parameters_by_position, file_avg_error, read_length = parse_file(
+            file,
+            final_quality_scores,
+            num_records_to_process,
+            offset,
+            read_length,
+            binned_scores
+        )
 
         read_parameters.append(parameters_by_position)
         average_errors.append(file_avg_error)
diff --git a/neat/model_sequencing_error/utils.py b/neat/model_sequencing_error/utils.py
@@ -10,7 +10,6 @@
 
 from scipy.stats import mode
 from ..common import open_input
-from ..models import take_closest
 
 __all__ = [
     "parse_file"
@@ -139,10 +138,7 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse
 
             for j in range(read_length):
                 # The qualities of each read_position_scores
-                quality_bin = take_closest(quality_scores, qualities_to_check[j])
-                bin_index = quality_scores.index(quality_bin)
-                temp_q_count[j][bin_index] += 1
-                qual_score_counter[quality_bin] += 1
+                qual_score_counter[qualities_to_check[j]] += 1
 
             if records_read % quarters == 0:
                 _LOG.info(f'reading data: {(records_read / max_reads) * 100:.0f}%')
diff --git a/neat/models/__init__.py b/neat/models/__init__.py
@@ -1,2 +1 @@
 from .models import *
-from .utils import *
diff --git a/neat/models/default_sequencing_error_model.py b/neat/models/default_sequencing_error_model.py
@@ -17,8 +17,8 @@
      [0.2505, 0.2552, 0.4943, 0.0]]
 )
 
-# This list may not be the final list
-default_quality_scores = np.array([2, 11, 25, 37])
+# All numbers from 1-42
+default_quality_scores = np.arange(1, 43)
 
 # This puts a high probability toward getting a maximum quality score. The current values
 # should be considered temporary. We're working on final values.
diff --git a/neat/models/models.py b/neat/models/models.py
@@ -18,7 +18,6 @@
 from ..common import TRINUC_IND, ALLOWED_NUCL, NUC_IND, DINUC_IND
 from .default_mutation_model import *
 from .default_sequencing_error_model import *
-from .utils import bin_scores, take_closest
 
 __all__ = [
     "MutationModel",
@@ -376,11 +375,8 @@ def __init__(self,
         self.insertion_model = insertion_model
         self.uniform_quality_score = None
         if self.is_uniform:
-            # bin scores returns a list, so we need the first (only) element of the list
-            converted_avg_err = bin_scores(self.quality_scores,
-                                           [int(-10. * np.log10(self.average_error))])[0]
-            # Set score to the lowest of the max of the quality scores and the bin closest to the input avg error.
-            self.uniform_quality_score = min([max(self.quality_scores), converted_avg_err])
+            # Set score to the lower of the max quality score and the average error converted to a score (-10 * log10, rounded).
+            self.uniform_quality_score = min([max(self.quality_scores), int(-10. * np.log10(self.average_error) + 0.5)])
         self.rng = rng
 
     def get_sequencing_errors(self,
@@ -498,7 +494,13 @@ def get_quality_scores(self,
             for i in quality_index_map:
                 score = self.rng.normal(self.quality_score_probabilities[i][0],
                                         scale=self.quality_score_probabilities[i][1])
-                score = take_closest(self.quality_scores, score)
+                # make sure score is in range and an int
+                score = round(score)
+                if score > 42:
+                    score = 42
+                if score < 1:
+                    score = 1
+
                 temp_qual_array.append(score)
 
         if self.rescale_qualities:
@@ -509,9 +511,9 @@ def get_quality_scores(self,
                                                           self.quality_score_error_rate[n]) + 0.5)])
                               for n in temp_qual_array]
             # Now rebin the quality scores.
-            temp_qual_array = np.array(bin_scores(self.quality_scores, rescaled_quals))
+            temp_qual_array = np.array(rescaled_quals)
         else:
-            temp_qual_array = np.array(bin_scores(self.quality_scores, temp_qual_array))
+            temp_qual_array = np.array(temp_qual_array)
 
         return temp_qual_array[:input_read_length]
 
diff --git a/neat/models/utils.py b/neat/models/utils.py
diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py
@@ -71,20 +71,15 @@ def cover_dataset(
         # trying to get enough variability to harden NEAT against edge cases.
         if loop_count % 10 == 0:
             fragment_model.rng.shuffle(fragment_pool)
-        # Breaking the gename into fragments
+        # Mapping random fragments onto genome
+        i = 0
         while start < span_length:
             # We take the first element and put it back on the end to create an endless pool of fragments to draw from
-            fragment = fragment_pool.pop(0)
+            fragment = fragment_pool[i]
+            i = (i + 1) % len(fragment_pool)
             end = min(start + fragment, span_length)
-            # these are equivalent of reads we expect the machine to filter out, but we won't actually use it
-            if end - start < options.read_len:
-                # add some random flavor to try to keep it to falling into a loop
-                if fragment_model.rng.normal() < 0.5:
-                    fragment_pool.insert(len(fragment_pool)//2, fragment)
-                else:
-                    fragment_pool.insert(len(fragment_pool) - 3, fragment)
-            else:
-                fragment_pool.append(fragment)
+            # Ensure the fragment is long enough to form a read, else we will not use it.
+            if end - start > options.read_len:
                 temp_fragments.append((start, end))
             start = end
 

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1 @@`
`1`	`1`	`from .models import *`
`2`		`-from .utils import *`
Original file line number	Diff line number	Diff line change
`@@ -17,8 +17,8 @@`
`17`	`17`	`[0.2505, 0.2552, 0.4943, 0.0]]`
`18`	`18`	`)`
`19`	`19`
`20`		`-# This list may not be the final list`
`21`		`-default_quality_scores = np.array([2, 11, 25, 37])`
	`20`	`+# All numbers from 1-42`
	`21`	`+default_quality_scores = np.arange(1, 43)`
`22`	`22`
`23`	`23`	`# This puts a high probability toward getting a maximum quality score. The current values`
`24`	`24`	`# should be considered temporary. We're working on final values.`