Fixing a bug when requested read length is different than the model

joshfactorial · joshfactorial · commit d297910c6e0f · 2024-10-24T12:36:43.000-05:00
diff --git a/neat/models/default_quality_score_model.py b/neat/models/default_quality_score_model.py
@@ -13,7 +13,6 @@
 # All numbers from 1-42
 default_quality_scores = np.arange(1, 43)
 
-
 default_qual_score_probs = np.array([
        [35.20551064,  5.39726466],
        [35.05310236,  5.63622973],
diff --git a/neat/models/error_models.py b/neat/models/error_models.py
@@ -67,27 +67,27 @@ def __init__(
 
     def get_quality_scores(
             self,
-            run_read_length: int,
             model_read_length: int,
+            length: int,
             rng
     ) -> np.ndarray:
         """
-        Takes a read_length and rng and returns an array of quality scores
+        Takes a length and rng and returns an array of quality scores
 
-        :param run_read_length: The desired length of the quality score array
         :param model_read_length: the original read length for the model
+        :param length: The desired length of the quality score array
         :param rng: random number generator.
         :return: An array of quality scores.
         """
         if self.uniform_quality_score:
-            return np.array([self.uniform_quality_score] * run_read_length)
+            return np.array([self.uniform_quality_score] * length)
         else:
-            if run_read_length == model_read_length:
+            if length == model_read_length:
                 quality_index_map = np.arange(model_read_length)
             else:
                 # This is basically a way to evenly spread the distribution across the number of bases in the read
                 quality_index_map = np.array(
-                    [max([0, model_read_length * n // run_read_length]) for n in range(run_read_length)]
+                    [max([0, model_read_length * n // length]) for n in range(length)]
                 )
 
             temp_qual_array = []
@@ -206,7 +206,7 @@ def get_sequencing_errors(
             # This is to prevent deletion error collisions and to keep there from being too many indel errors.
             if 0 < index < self.read_length - max(
                     self.deletion_len_model) and total_indel_length > self.read_length // 4:
-                error_type = self.rng.choice(a=list(self.variant_probs), p=list(self.variant_probs.values()))
+                error_type = rng.choice(a=list(self.variant_probs), p=list(self.variant_probs.values()))
 
             # Deletion error
             if error_type == Deletion:
@@ -227,7 +227,7 @@ def get_sequencing_errors(
             elif error_type == Insertion:
                 insertion_length = self.get_insertion_length()
                 insertion_reference = reference_segment[index]
-                insert_string = ''.join(self.rng.choice(ALLOWED_NUCL, size=insertion_length))
+                insert_string = ''.join(rng.choice(ALLOWED_NUCL, size=insertion_length))
                 insertion_alternate = insertion_reference + insert_string
                 introduced_errors.append(
                     ErrorContainer(Insertion, index, insertion_length, insertion_reference, insertion_alternate)
diff --git a/neat/read_simulator/utils/read.py b/neat/read_simulator/utils/read.py
@@ -321,7 +321,7 @@ def finalize_read_and_write(
         """
 
         # Generate quality scores for the read
-        self.quality_array = qual_model.get_quality_scores(err_model.read_length, self.run_read_length, rng)
+        self.quality_array = qual_model.get_quality_scores(err_model.read_length, len(self.reference_segment), rng)
 
         # This replaces either hard or soft-masked reference segment with upper case or a standard repeat
         # It updates the quality array and reference segment in place, including reversing them, if appropriate