Skip to content

Commit bf36496

Browse files
Merge pull request #124 from ncsa/develop
Develop
2 parents baa0cb7 + 21407db commit bf36496

File tree

12 files changed

+95
-184
lines changed

12 files changed

+95
-184
lines changed

environment.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,4 @@ dependencies:
1717
- pip:
1818
- pysam
1919
- frozendict
20-
- poetry>=1.3.*
20+
- poetry==1.3.*

neat/cli/commands/model_sequencing_error.py

+3-6
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,9 @@ def add_arguments(self, parser: argparse.ArgumentParser):
4848
dest="quality_scores",
4949
nargs='+',
5050
required=False,
51-
default=[2, 11, 25, 37],
52-
help="Quality score bins. Enter as a space separeted list for binned scores "
53-
"(-Q 2 11 24 37), or enter a single maximum score for a full range (-Q 42 gives all"
54-
"scores from 1-42). The default is binned quality scores: [2, 11, 24, 37]. Note that"
55-
"using quality score bins on an unbinned fastq will result in a binned model, at the"
56-
"cost of some inaccuracy.")
51+
default=42,
52+
help="Quality score max. The default 42. The lowest possible score is 1. To used binned"
53+
"scoring, enter a space separated list of scores, e.g., -Q 2 15 23 37")
5754

5855
parser.add_argument('-m',
5956
type=int,

neat/gen_mut_model/runner.py

+28-52
Original file line numberDiff line numberDiff line change
@@ -85,18 +85,23 @@ def runner(reference_index,
8585

8686
# Pre-parsing to find all the matching chromosomes between ref and vcf
8787
_LOG.info('Processing VCF file...')
88-
matching_variants, matching_chromosomes = read_and_filter_variants(vcf_to_process,
89-
reference_index,
90-
ignore)
88+
matching_variants, matching_chromosomes = read_and_filter_variants(
89+
vcf_to_process,
90+
reference_index,
91+
ignore
92+
)
9193

9294
if not matching_variants or not matching_chromosomes:
9395
_LOG.error("No valid variants detected. Check names in vcf versus reference and/or bed.")
9496
sys.exit(1)
9597

96-
trinuc_ref_count, bed_track_length = count_trinucleotides(reference_index,
97-
bed,
98-
outcounts_file,
99-
matching_chromosomes)
98+
trinuc_ref_count, bed_track_length = count_trinucleotides(
99+
reference_index,
100+
bed,
101+
outcounts_file,
102+
matching_chromosomes,
103+
save_trinuc
104+
)
100105

101106
if not trinuc_ref_count:
102107
_LOG.error("No valid trinucleotides detected in reference.")
@@ -296,47 +301,6 @@ def runner(reference_index,
296301
for k in sorted(snp_trans_freq):
297302
_LOG.info(f'p({k[0]} --> {k[1]} | SNP occurs) = {snp_trans_freq[k]}')
298303

299-
# Save counts, if requested
300-
if save_trinuc:
301-
trinuc_output_data = {
302-
'p(snp)': average_snp_freq,
303-
'p(insertion)': average_insertion_frequency,
304-
'p(deletion)': average_deletion_frequency,
305-
'overall average mut rate': avg_mut_rate,
306-
'total variants processed': total_var,
307-
'homozygous probability': homozygous_frequency
308-
}
309-
310-
for k in sorted(trinuc_mut_prob):
311-
trinuc_output_data[f'p({k} mutates)'] = trinuc_mut_prob[k]
312-
313-
for k in sorted(trinuc_trans_probs):
314-
trinuc_output_data[f'p({k[0]} --> {k[1]} | {k[0]} mutates)'] = trinuc_trans_probs[k]
315-
316-
for k in sorted(deletion_counts):
317-
trinuc_output_data[f'p(del length = {abs(k)} | deletion occurs)'] = deletion_frequency[k]
318-
319-
for k in sorted(insertion_counts):
320-
trinuc_output_data[f'p(insert length = {abs(k)} | insertion occurs)'] = insertion_freqency[k]
321-
322-
for k in sorted(snp_trans_freq):
323-
trinuc_output_data[f'p({k[0]} --> {k[1]} | SNP occurs)'] = snp_trans_freq[k]
324-
325-
trinuc_output_path = Path(output)
326-
trinuc_output_path = trinuc_output_path.with_suffix('')
327-
trinuc_output_path = trinuc_output_path.with_suffix('.trinuc.pickle.gz')
328-
_LOG.info(f'Saving trinucleotide counts to: {trinuc_output_path}')
329-
330-
with open_output(trinuc_output_path, 'w+') as trinuc_outfile:
331-
332-
pickle.dump(trinuc_output_data, trinuc_outfile)
333-
334-
# Human-readable content of file below
335-
trinuc_outfile.write('\n')
336-
337-
for key, value in trinuc_output_data.items():
338-
trinuc_outfile.write(f'{key}: {value}\n')
339-
340304
_LOG.info(f'p(snp) = {average_snp_freq}')
341305
_LOG.info(f'p(insertion) = {average_insertion_frequency}')
342306
_LOG.info(f'p(deletion) = {average_deletion_frequency}')
@@ -360,8 +324,8 @@ def runner(reference_index,
360324
transition_matrix=snp_transition_bias,
361325
trinuc_trans_matrices=trinuc_transition_bias,
362326
trinuc_mut_bias=trinuc_mutation_probs,
363-
insert_len_model=insertion_counts,
364-
deletion_len_model=deletion_counts
327+
insert_len_model=insertion_freqency,
328+
deletion_len_model=deletion_frequency
365329
)
366330

367331
print('\nSaving model...')
@@ -405,6 +369,8 @@ def compute_mut_runner(reference,
405369
if outcounts:
406370
validate_input_path(outcounts)
407371
outcounts = Path(outcounts)
372+
elif save_trinuc:
373+
outcounts = Path(output + '.trinuc.pickle.gz')
408374

409375
print('Processing reference...')
410376
reference_index = SeqIO.index(reference, 'fasta')
@@ -477,8 +443,18 @@ def compute_mut_runner(reference,
477443
output = Path(output + '.pickle.gz')
478444
validate_output_path(output, overwrite=overwrite_output)
479445

480-
runner(reference_index, vcf_to_process, vcf_columns, outcounts, show_trinuc, save_trinuc,
481-
output, bed, human_sample, skip_common)
446+
runner(
447+
reference_index,
448+
vcf_to_process,
449+
vcf_columns,
450+
outcounts,
451+
show_trinuc,
452+
save_trinuc,
453+
output,
454+
bed,
455+
human_sample,
456+
skip_common
457+
)
482458

483459
if os.path.exists('temp.vcf'):
484460
os.remove('temp.vcf')

neat/gen_mut_model/utils.py

+10-8
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
import json
66
import sys
7+
import pickle
8+
import gzip
79

810
import numpy as np
911

@@ -165,14 +167,16 @@ def cluster_list(list_to_cluster, delta):
165167
def count_trinucleotides(reference_index,
166168
bed,
167169
trinuc_counts,
168-
matching_chroms):
170+
matching_chroms,
171+
save_trinuc_file):
169172
"""
170173
Counts the frequency of the various trinucleotide combinations in the dataset
171174
172175
:param dict reference_index: The index of the fasta, from SeqIO
173176
:param Path bed: Full path to bed file, if using
174177
:param Path trinuc_counts: Full path to existing trinculeotide counts file, if input, or the output path
175178
:param list matching_chroms: List of matching chromosomes
179+
:param save_trinuc_file: Boolean determines whether to save trinucleotide counts file.
176180
:return dict, int: A dictionary of trinculeotides and counts, the number of bases spanned
177181
"""
178182
# Data format: TRINUC: COUNT (e.g., "AAA": 12), where COUNT is the number of times trinuc was observed
@@ -183,11 +187,11 @@ def count_trinucleotides(reference_index,
183187
track_len = 0
184188

185189
trinuc_counts_exists = False
186-
save_trinuc_file = False
187190
if trinuc_counts:
188191
if trinuc_counts.is_file():
192+
_LOG.info("Trinucleotide counts file exists, skipping save to avoid overwriting data.")
189193
trinuc_counts_exists = True
190-
save_trinuc_file = True
194+
save_trinuc_file = False
191195

192196
if bed:
193197
_LOG.info("Counting trinucleotide combinations in bed regions")
@@ -207,7 +211,6 @@ def count_trinucleotides(reference_index,
207211
trinuc_ref_count = count_trinuc(trinuc, trinuc_ref_count)
208212

209213
# Solution to attribute error (needs to be checked)
210-
# TODO remove ref_name from this dict
211214
elif not trinuc_counts_exists:
212215
_LOG.info("Counting trinucleotide combinations in reference.")
213216
for ref_name in matching_chroms:
@@ -220,13 +223,12 @@ def count_trinucleotides(reference_index,
220223
with open_output(trinuc_counts, 'w') as countfile:
221224
_LOG.info('Saving trinuc counts to file...')
222225
# Convert all values to writable formats
223-
trinuc_ref_count = {str(x): int(y) for x, y in trinuc_ref_count.items()}
224-
countfile.write(json.dumps(trinuc_ref_count))
226+
pickle.dump(trinuc_ref_count, countfile)
225227

226228
else:
227229
_LOG.info(f'Loading file: {trinuc_counts}.')
228-
with open_input(trinuc_counts) as counts:
229-
trinuc_ref_count = json.load(counts)
230+
with gzip.open(trinuc_counts, 'rb') as counts:
231+
trinuc_ref_count = pickle.load(counts)
230232
if save_trinuc_file:
231233
_LOG.warning('Existing trinucelotide file will not be changed.')
232234

neat/model_sequencing_error/runner.py

+10-5
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,11 @@ def model_seq_err_runner(
6262
_LOG.debug(f"Quality offset: {offset}")
6363

6464
final_quality_scores: list
65+
binned_scores = False
6566
if len(qual_scores) == 1:
6667
final_quality_scores = list(range(1, qual_scores[0] + 1))
6768
else:
69+
binned_scores = True
6870
final_quality_scores = sorted(qual_scores)
6971

7072
_LOG.debug(f'Quality scores: {final_quality_scores}')
@@ -109,11 +111,14 @@ def model_seq_err_runner(
109111
for file in files:
110112
file_num += 1
111113
_LOG.info(f'Reading file {file_num} of {len(files)}')
112-
parameters_by_position, file_avg_error, read_length = parse_file(file,
113-
final_quality_scores,
114-
num_records_to_process,
115-
offset,
116-
read_length)
114+
parameters_by_position, file_avg_error, read_length = parse_file(
115+
file,
116+
final_quality_scores,
117+
num_records_to_process,
118+
offset,
119+
read_length,
120+
binned_scores
121+
)
117122

118123
read_parameters.append(parameters_by_position)
119124
average_errors.append(file_avg_error)

neat/model_sequencing_error/utils.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
from scipy.stats import mode
1212
from ..common import open_input
13-
from ..models import take_closest
1413

1514
__all__ = [
1615
"parse_file"
@@ -139,10 +138,7 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse
139138

140139
for j in range(read_length):
141140
# The qualities of each read_position_scores
142-
quality_bin = take_closest(quality_scores, qualities_to_check[j])
143-
bin_index = quality_scores.index(quality_bin)
144-
temp_q_count[j][bin_index] += 1
145-
qual_score_counter[quality_bin] += 1
141+
qual_score_counter[qualities_to_check[j]] += 1
146142

147143
if records_read % quarters == 0:
148144
_LOG.info(f'reading data: {(records_read / max_reads) * 100:.0f}%')

neat/models/__init__.py

-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1 @@
11
from .models import *
2-
from .utils import *

neat/models/default_sequencing_error_model.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@
1717
[0.2505, 0.2552, 0.4943, 0.0]]
1818
)
1919

20-
# This list may not be the final list
21-
default_quality_scores = np.array([2, 11, 25, 37])
20+
# All numbers from 1-42
21+
default_quality_scores = np.arange(1, 43)
2222

2323
# This puts a high probability toward getting a maximum quality score. The current values
2424
# should be considered temporary. We're working on final values.

neat/models/models.py

+22-13
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
from ..common import TRINUC_IND, ALLOWED_NUCL, NUC_IND, DINUC_IND
1919
from .default_mutation_model import *
2020
from .default_sequencing_error_model import *
21-
from .utils import bin_scores, take_closest
2221

2322
__all__ = [
2423
"MutationModel",
@@ -60,7 +59,9 @@ class InsertionModel(VariantModel):
6059
def __init__(self,
6160
insert_len_model: dict[int: float, ...],
6261
rng: Generator = None):
63-
self.insert_len_model = insert_len_model
62+
# Creating probabilities from the weights
63+
tot = sum(insert_len_model.values())
64+
self.insertion_len_model = {key: val / tot for key, val in insert_len_model.items()}
6465
self.rng = rng
6566

6667
def get_insertion_length(self, size: int = None) -> int | list[int, ...]:
@@ -71,8 +72,8 @@ def get_insertion_length(self, size: int = None) -> int | list[int, ...]:
7172
Greater than 1 returns a list of ints.
7273
:return: int or list of ints.
7374
"""
74-
return self.rng.choice(a=list(self.insert_len_model),
75-
p=[*self.insert_len_model.values()],
75+
return self.rng.choice(a=list(self.insertion_len_model),
76+
p=[*self.insertion_len_model.values()],
7677
size=size, shuffle=False)
7778

7879

@@ -91,7 +92,9 @@ class DeletionModel(VariantModel):
9192
def __init__(self,
9293
deletion_len_model: dict[int: float, ...],
9394
rng: Generator = None):
94-
self.deletion_len_model = deletion_len_model
95+
# Creating probabilities from the weights
96+
tot = sum(deletion_len_model.values())
97+
self.deletion_len_model = {key: val/tot for key, val in deletion_len_model.items()}
9598
self.rng = rng
9699

97100
def get_deletion_length(self, size: int = None) -> int | list[int, ...]:
@@ -281,6 +284,9 @@ def generate_snv(self, trinucleotide: Seq, reference_location: int) -> SingleNuc
281284
transition_matrix = self.trinuc_trans_matrices[DINUC_IND[trinucleotide[0] + "_" + trinucleotide[2]]]
282285
# then determine the trans probs based on the middle nucleotide
283286
transition_probs = transition_matrix[NUC_IND[trinucleotide[1]]]
287+
# Creating probabilities from the weights
288+
transition_sum = sum(transition_probs)
289+
transition_probs = [x/transition_sum for x in transition_probs]
284290
# Now pick a random alternate, weighted by the probabilities
285291
alt = self.rng.choice(ALLOWED_NUCL, p=transition_probs)
286292
temp_snv = SingleNucleotideVariant(reference_location, alt=alt)
@@ -376,11 +382,8 @@ def __init__(self,
376382
self.insertion_model = insertion_model
377383
self.uniform_quality_score = None
378384
if self.is_uniform:
379-
# bin scores returns a list, so we need the first (only) element of the list
380-
converted_avg_err = bin_scores(self.quality_scores,
381-
[int(-10. * np.log10(self.average_error))])[0]
382-
# Set score to the lowest of the max of the quality scores and the bin closest to the input avg error.
383-
self.uniform_quality_score = min([max(self.quality_scores), converted_avg_err])
385+
# Set score to the lowest of the max of the quality scores and the input avg error.
386+
self.uniform_quality_score = min([max(self.quality_scores), int(-10. * np.log10(self.average_error) + 0.5)])
384387
self.rng = rng
385388

386389
def get_sequencing_errors(self,
@@ -498,7 +501,13 @@ def get_quality_scores(self,
498501
for i in quality_index_map:
499502
score = self.rng.normal(self.quality_score_probabilities[i][0],
500503
scale=self.quality_score_probabilities[i][1])
501-
score = take_closest(self.quality_scores, score)
504+
# make sure score is in range and an int
505+
score = round(score)
506+
if score > 42:
507+
score = 42
508+
if score < 1:
509+
score = 1
510+
502511
temp_qual_array.append(score)
503512

504513
if self.rescale_qualities:
@@ -509,9 +518,9 @@ def get_quality_scores(self,
509518
self.quality_score_error_rate[n]) + 0.5)])
510519
for n in temp_qual_array]
511520
# Now rebin the quality scores.
512-
temp_qual_array = np.array(bin_scores(self.quality_scores, rescaled_quals))
521+
temp_qual_array = np.array(rescaled_quals)
513522
else:
514-
temp_qual_array = np.array(bin_scores(self.quality_scores, temp_qual_array))
523+
temp_qual_array = np.array(temp_qual_array)
515524

516525
return temp_qual_array[:input_read_length]
517526

0 commit comments

Comments
 (0)