Merge pull request #103 from ncsa/develop

joshfactorial · web-flow · commit 4bf762a30bb6 · 2024-04-18T14:19:59.000-05:00
Develop
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -5,9 +5,9 @@ name: NEAT unit tests
 
 on:
   push:
-    branches: [ master ]
+    branches: [ "main", "develop" ]
   pull_request:
-    branches: [ master ]
+    branches: [ "main" ]
 
 jobs:
   build:
@@ -26,6 +26,11 @@ jobs:
           conda activate test_neat
           poetry install
           neat
+
+      - name: run coverage tests
+        run: |
+          conda activate test_neat
+          python tests/coverage_tests.py
       
 #       - name: lint with flake8
 #         run: |
@@ -46,7 +51,3 @@ jobs:
 #           cd ${{ github.workspace }}
 #           neat --log-level ERROR --no-log model-seq-err -i data/baby.fastq
 #       - run: echo "This job's status is ${{ job.status }}." 
-
-      
-
-      
diff --git a/neat/models/models.py b/neat/models/models.py
@@ -631,15 +631,22 @@ def generate_fragments(self,
         """
         # Estimate the number of fragments needed (with a 2x padding)
         number_of_fragments = int(round(total_length / read_length) * (coverage * 2))
+        # Check that we don't have unusable values for fragment mean. Too many fragments under the read length means
+        # NEAT will either get caught in an infinite cycle of sampling fragments but never finding one that works, or
+        # it will only find a few and will run very slowly.
+        if self.fragment_mean < read_length:
+            # Let's just reset the fragment mean to make up for this.
+            self.fragment_mean = read_length
         # generates a distribution, assuming normality, then rounds the result and converts to ints
         dist = np.round(self.rng.normal(self.fragment_mean, self.fragment_st_dev, size=number_of_fragments)).astype(int)
-        # filter the list to throw out outliers.
-        dist = [x for x in dist if self.fragment_min <= x <= self.fragment_max]
+        # filter the list: drop anything above fragment_max and raise anything under the read length to the read length.
+        dist = [max(x, read_length) for x in dist if x <= self.fragment_max]
         # Just a sanity check to make sure our data isn't too thin:
         while number_of_fragments - len(dist) > 0:
             additional_fragment = self.rng.normal(loc=self.fragment_mean, scale=self.fragment_st_dev)
             if additional_fragment < read_length:
                 continue
             dist.append(round(additional_fragment))
 
+        # Nothing further to do here: fragments under read_length were already raised to read_length or skipped above.
         return dist
diff --git a/neat/read_simulator/utils/options.py b/neat/read_simulator/utils/options.py
@@ -105,7 +105,7 @@ def __init__(self,
         self.defs['gc_model'] = (str, None, 'exists', None)
         self.defs['paired_ended'] = (bool, False, None, None)
         self.defs['fragment_model'] = (str, None, 'exists', None)
-        self.defs['fragment_mean'] = (float, None, 1e-10, inf)
+        self.defs['fragment_mean'] = (float, None, 10, inf)
         self.defs['fragment_st_dev'] = (float, None, 1e-10, inf)
         self.defs['produce_bam'] = (bool, False, None, None)
         self.defs['produce_vcf'] = (bool, False, None, None)
@@ -196,14 +196,14 @@ def check_and_log_error(keyname, value_to_check, lowval, highval):
             pass
         elif lowval != "exists" and highval:
             if not (lowval <= value_to_check <= highval):
-                raise ValueError(f'@{keyname} must be between {lowval} and {highval}.')
+                raise ValueError(f'@{keyname} must be between {lowval} and {highval}.\nYour input: {value_to_check}')
         elif lowval == "exists" and value_to_check:
             validate_input_path(value_to_check)
 
     def read(self):
         """
-        This sets up the option attributes. It's not perfect, because it sort of kills
-        type hints. But I'm not sure how else to accomplish this.
+        This sets up the option attributes. It's not perfect, because it sort of kills type hints.
+        But I'm not sure how else to accomplish this.
         """
         # Skip trying to read the config for a test run
         config = yaml.load(open(self.config_file, 'r'), Loader=Loader)
diff --git a/tests/test_read_simulator/test_cover_dataset.py b/tests/test_read_simulator/test_cover_dataset.py
@@ -1,53 +1,101 @@
-"""
-Tests for sequencing error model in models
-"""
-
 import pytest
 import numpy as np
 
-from Bio.SeqRecord import SeqRecord
-from Bio.Seq import Seq
-
 from neat.models import FragmentLengthModel
-from neat.variants import SingleNucleotideVariant, Insertion, Deletion
 from neat.read_simulator.utils import Options, cover_dataset
 
 
 def test_cover_dataset():
-    """Test that a cover is successfully generated"""
+    """Test that a cover is successfully generated for different coverage values"""
     read_pool = [10] * 2000
     span_length = 100
     target_vector = np.full(100, fill_value=10, dtype=int)
     options = Options(rng_seed=0)
-    options.paired_ended = False
-    options.read_len = 10
-    options.coverage = 10
+    options.read_len = 101
+    options.paired_ended = True
+    options.fragment_mean = 250
+    options.fragment_st_dev = 100
+    options.output.overwrite_output = True
     fragment_model = FragmentLengthModel(rng=options.rng)
 
-    read1, read2 = cover_dataset(read_pool, span_length, target_vector, options, fragment_model)
-    coverage_check = []
-    for i in range(span_length):
-        # single ended test, only need read1
-        cover = [x for x in read1 if i in range(x[0], x[1])]
-        coverage_check.append(len(cover))
-    assert sum(coverage_check)/len(coverage_check) > 10
+    coverage_values = [1, 2, 5, 10, 25, 50]
+    for coverage in coverage_values:
+        options.coverage = coverage
+        read1, read2 = cover_dataset(read_pool, span_length, target_vector, options, fragment_model)
+        coverage_check = []
+        for i in range(span_length):
+            # this test only counts coverage from read1
+            cover = [x for x in read1 if i in range(x[0], x[1])]
+            coverage_check.append(len(cover))
+        assert sum(coverage_check)/len(coverage_check) > coverage, f"Coverage check failed for coverage {coverage}"
 
 
 def test_paired_cover_dataset():
-    """Test that a cover is successfully generated"""
+    """Test that a cover is successfully generated for different coverage values"""
     read_pool = [10] * 2000
     span_length = 100
     target_vector = np.full(100, fill_value=10, dtype=int)
     options = Options(rng_seed=0)
+    options.read_len = 101
     options.paired_ended = True
-    options.read_len = 10
-    options.coverage = 10
+    options.fragment_mean = 250
+    options.fragment_st_dev = 100
+    options.output.overwrite_output = True
     fragment_model = FragmentLengthModel(fragment_mean=20, fragment_std=2, fragment_min=10, fragment_max=30, rng=options.rng)
 
-    read1, read2 = cover_dataset(read_pool, span_length, target_vector, options, fragment_model)
-    coverage_check = []
-    for i in range(span_length):
-        # single ended test, only need read1
-        cover = [x for x in read1+read2 if i in range(x[0], x[1])]
-        coverage_check.append(len(cover))
-    assert sum(coverage_check) / len(coverage_check) > 10
+    coverage_values = [1, 2, 5, 10, 25, 50]
+    for coverage in coverage_values:
+        options.coverage = coverage
+        read1, read2 = cover_dataset(read_pool, span_length, target_vector, options, fragment_model)
+        coverage_check = []
+        for i in range(span_length):
+            # paired ended test, need both read1 and read2
+            cover = [x for x in read1 + read2 if i in range(x[0], x[1])]
+            coverage_check.append(len(cover))
+        assert sum(coverage_check) / len(coverage_check) > coverage, f"Coverage check failed for coverage {coverage}"
+
+
+def test_various_read_lengths():
+    """Test cover_dataset with various read lengths to ensure no errors"""
+    read_pool = [10] * 2000
+    span_length = 100
+    target_vector = np.full(100, fill_value=10, dtype=int)
+    options = Options(rng_seed=0)
+    options.paired_ended = True
+    options.coverage = 10
+    options.fragment_mean = 250
+    options.fragment_st_dev = 100
+    options.output.overwrite_output = True
+    fragment_model = FragmentLengthModel(rng=options.rng)
+
+    for read_len in range(10, 251, 10):
+        options.read_len = read_len
+        try:
+            read1, _ = cover_dataset(read_pool, span_length, target_vector, options, fragment_model)
+        except Exception as e:
+            pytest.fail(f"Test failed for read_len={read_len} with exception: {e}")
+
+
+def test_fragment_mean_st_dev_combinations():
+    """Test cover_dataset with combinations of fragment mean and standard deviation to ensure no errors"""
+    read_pool = [10] * 2000
+    span_length = 100
+    target_vector = np.full(100, fill_value=10, dtype=int)
+    options = Options(rng_seed=0)
+    options.paired_ended = True
+    options.read_len = 50
+    options.coverage = 10
+    options.output.overwrite_output = True
+
+    fragment_means = [1, 2, 5, 10, 25, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 750, 1000]
+    fragment_st_devs = [1, 2, 5, 10, 25, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 750, 1000]
+
+    for mean in fragment_means:
+        for st_dev in fragment_st_devs:
+            options.fragment_mean = mean
+            options.fragment_st_dev = st_dev
+            fragment_model = FragmentLengthModel(fragment_mean=mean, fragment_std=st_dev, rng=options.rng)
+            try:
+                read1, _ = cover_dataset(read_pool, span_length, target_vector, options, fragment_model)
+            except Exception as e:
+                pytest.fail(f"Test failed for mean={mean}, st_dev={st_dev} with exception: {e}")