Skip to content

Commit 4bf762a

Browse files
Merge pull request #103 from ncsa/develop
Develop
2 parents 59b754b + 21a10fe commit 4bf762a

File tree

4 files changed

+97
-41
lines changed

4 files changed

+97
-41
lines changed

.github/workflows/python-app.yml

+7-6
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@ name: NEAT unit tests
55

66
on:
77
push:
8-
branches: [ master ]
8+
branches: [ "main", "develop" ]
99
pull_request:
10-
branches: [ master ]
10+
branches: [ "main" ]
1111

1212
jobs:
1313
build:
@@ -26,6 +26,11 @@ jobs:
2626
conda activate test_neat
2727
poetry install
2828
neat
29+
30+
- name: run coverage tests
31+
run: |
32+
conda activate test_neat
33+
python tests/coverage_tests.py
2934
3035
# - name: lint with flake8
3136
# run: |
@@ -46,7 +51,3 @@ jobs:
4651
# cd ${{ github.workspace }}
4752
# neat --log-level ERROR --no-log model-seq-err -i data/baby.fastq
4853
# - run: echo "This job's status is ${{ job.status }}."
49-
50-
51-
52-

neat/models/models.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -631,15 +631,22 @@ def generate_fragments(self,
631631
"""
632632
# Estimate the number of fragments needed (with a 2x padding)
633633
number_of_fragments = int(round(total_length / read_length) * (coverage * 2))
634+
# Check that we don't have unusable values for fragment mean. Too many fragments under the read length means
635+
# NEAT will either get caught in an infinite cycle of sampling fragments but never finding one that works, or
636+
# it will only find a few and will run very slowly.
637+
if self.fragment_mean < read_length:
638+
# Let's just reset the fragment mean to make up for this.
639+
self.fragment_mean = read_length
634640
# generates a distribution, assuming normality, then rounds the result and converts to ints
635641
dist = np.round(self.rng.normal(self.fragment_mean, self.fragment_st_dev, size=number_of_fragments)).astype(int)
636-
# filter the list to throw out outliers.
637-
dist = [x for x in dist if self.fragment_min <= x <= self.fragment_max]
642+
# filter the list to throw out outliers and to set anything under the read length to the read length.
643+
dist = [max(x, read_length) for x in dist if x <= self.fragment_max]
638644
# Just a sanity check to make sure our data isn't too thin:
639645
while number_of_fragments - len(dist) > 0:
640646
additional_fragment = self.rng.normal(loc=self.fragment_mean, scale=self.fragment_st_dev)
641647
if additional_fragment < read_length:
642648
continue
643649
dist.append(round(additional_fragment))
644650

651+
# Now set a minimum on the dataset. Any fragment smaller than read_length gets turned into read_length
645652
return dist

neat/read_simulator/utils/options.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def __init__(self,
105105
self.defs['gc_model'] = (str, None, 'exists', None)
106106
self.defs['paired_ended'] = (bool, False, None, None)
107107
self.defs['fragment_model'] = (str, None, 'exists', None)
108-
self.defs['fragment_mean'] = (float, None, 1e-10, inf)
108+
self.defs['fragment_mean'] = (float, None, 10, inf)
109109
self.defs['fragment_st_dev'] = (float, None, 1e-10, inf)
110110
self.defs['produce_bam'] = (bool, False, None, None)
111111
self.defs['produce_vcf'] = (bool, False, None, None)
@@ -196,14 +196,14 @@ def check_and_log_error(keyname, value_to_check, lowval, highval):
196196
pass
197197
elif lowval != "exists" and highval:
198198
if not (lowval <= value_to_check <= highval):
199-
raise ValueError(f'@{keyname} must be between {lowval} and {highval}.')
199+
raise ValueError(f'@{keyname} must be between {lowval} and {highval}.\nYour input: {value_to_check}')
200200
elif lowval == "exists" and value_to_check:
201201
validate_input_path(value_to_check)
202202

203203
def read(self):
204204
"""
205-
This sets up the option attributes. It's not perfect, because it sort of kills
206-
type hints. But I'm not sure how else to accomplish this.
205+
This sets up the option attributes. It's not perfect, because it sort of kills type hints.
206+
But I'm not sure how else to accomplish this.
207207
"""
208208
# Skip trying to read the config for a test run
209209
config = yaml.load(open(self.config_file, 'r'), Loader=Loader)
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,101 @@
1-
"""
2-
Tests for sequencing error model in models
3-
"""
4-
51
import pytest
62
import numpy as np
73

8-
from Bio.SeqRecord import SeqRecord
9-
from Bio.Seq import Seq
10-
114
from neat.models import FragmentLengthModel
12-
from neat.variants import SingleNucleotideVariant, Insertion, Deletion
135
from neat.read_simulator.utils import Options, cover_dataset
146

157

168
def test_cover_dataset():
17-
"""Test that a cover is successfully generated"""
9+
"""Test that a cover is successfully generated for different coverage values"""
1810
read_pool = [10] * 2000
1911
span_length = 100
2012
target_vector = np.full(100, fill_value=10, dtype=int)
2113
options = Options(rng_seed=0)
22-
options.paired_ended = False
23-
options.read_len = 10
24-
options.coverage = 10
14+
options.read_len = 101
15+
options.paired_ended = True
16+
options.fragment_mean = 250
17+
options.fragment_st_dev = 100
18+
options.output.overwrite_output = True
2519
fragment_model = FragmentLengthModel(rng=options.rng)
2620

27-
read1, read2 = cover_dataset(read_pool, span_length, target_vector, options, fragment_model)
28-
coverage_check = []
29-
for i in range(span_length):
30-
# single ended test, only need read1
31-
cover = [x for x in read1 if i in range(x[0], x[1])]
32-
coverage_check.append(len(cover))
33-
assert sum(coverage_check)/len(coverage_check) > 10
21+
coverage_values = [1, 2, 5, 10, 25, 50]
22+
for coverage in coverage_values:
23+
options.coverage = coverage
24+
read1, read2 = cover_dataset(read_pool, span_length, target_vector, options, fragment_model)
25+
coverage_check = []
26+
for i in range(span_length):
27+
# single ended test, only need read1
28+
cover = [x for x in read1 if i in range(x[0], x[1])]
29+
coverage_check.append(len(cover))
30+
assert sum(coverage_check)/len(coverage_check) > coverage, f"Coverage check failed for coverage {coverage}"
3431

3532

3633
def test_paired_cover_dataset():
37-
"""Test that a cover is successfully generated"""
34+
"""Test that a cover is successfully generated for different coverage values"""
3835
read_pool = [10] * 2000
3936
span_length = 100
4037
target_vector = np.full(100, fill_value=10, dtype=int)
4138
options = Options(rng_seed=0)
39+
options.read_len = 101
4240
options.paired_ended = True
43-
options.read_len = 10
44-
options.coverage = 10
41+
options.fragment_mean = 250
42+
options.fragment_st_dev = 100
43+
options.output.overwrite_output = True
4544
fragment_model = FragmentLengthModel(fragment_mean=20, fragment_std=2, fragment_min=10, fragment_max=30, rng=options.rng)
4645

47-
read1, read2 = cover_dataset(read_pool, span_length, target_vector, options, fragment_model)
48-
coverage_check = []
49-
for i in range(span_length):
50-
# single ended test, only need read1
51-
cover = [x for x in read1+read2 if i in range(x[0], x[1])]
52-
coverage_check.append(len(cover))
53-
assert sum(coverage_check) / len(coverage_check) > 10
46+
coverage_values = [1, 2, 5, 10, 25, 50]
47+
for coverage in coverage_values:
48+
options.coverage = coverage
49+
read1, read2 = cover_dataset(read_pool, span_length, target_vector, options, fragment_model)
50+
coverage_check = []
51+
for i in range(span_length):
52+
# paired ended test, need both read1 and read2
53+
cover = [x for x in read1 + read2 if i in range(x[0], x[1])]
54+
coverage_check.append(len(cover))
55+
assert sum(coverage_check) / len(coverage_check) > coverage, f"Coverage check failed for coverage {coverage}"
56+
57+
58+
def test_various_read_lengths():
59+
"""Test cover_dataset with various read lengths to ensure no errors"""
60+
read_pool = [10] * 2000
61+
span_length = 100
62+
target_vector = np.full(100, fill_value=10, dtype=int)
63+
options = Options(rng_seed=0)
64+
options.paired_ended = True
65+
options.coverage = 10
66+
options.fragment_mean = 250
67+
options.fragment_st_dev = 100
68+
options.output.overwrite_output = True
69+
fragment_model = FragmentLengthModel(rng=options.rng)
70+
71+
for read_len in range(10, 251, 10):
72+
options.read_len = read_len
73+
try:
74+
read1, _ = cover_dataset(read_pool, span_length, target_vector, options, fragment_model)
75+
except Exception as e:
76+
pytest.fail(f"Test failed for read_len={read_len} with exception: {e}")
77+
78+
79+
def test_fragment_mean_st_dev_combinations():
80+
"""Test cover_dataset with combinations of fragment mean and standard deviation to ensure no errors"""
81+
read_pool = [10] * 2000
82+
span_length = 100
83+
target_vector = np.full(100, fill_value=10, dtype=int)
84+
options = Options(rng_seed=0)
85+
options.paired_ended = True
86+
options.read_len = 50
87+
options.coverage = 10
88+
options.output.overwrite_output = True
89+
90+
fragment_means = [1, 2, 5, 10, 25, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 750, 1000]
91+
fragment_st_devs = [1, 2, 5, 10, 25, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 750, 1000]
92+
93+
for mean in fragment_means:
94+
for st_dev in fragment_st_devs:
95+
options.fragment_mean = mean
96+
options.fragment_st_dev = st_dev
97+
fragment_model = FragmentLengthModel(fragment_mean=mean, fragment_std=st_dev, rng=options.rng)
98+
try:
99+
read1, _ = cover_dataset(read_pool, span_length, target_vector, options, fragment_model)
100+
except Exception as e:
101+
pytest.fail(f"Test failed for mean={mean}, st_dev={st_dev} with exception: {e}")

0 commit comments

Comments
 (0)