Commit 1eafd28

Merge pull request #110 from ncsa/feature/simplifying_coverage
Feature/simplifying coverage
2 parents: 10c6022 + 5250f56

12 files changed (+715, -706 lines)


neat/cli/cli.py (+1 -1)

@@ -140,7 +140,7 @@ def main(parser: argparse.ArgumentParser, arguments: list[str]) -> int:
     else:
         end = time.time()
         log.info(
-            "command finished successfully; execution took %.3f sec", end - start
+            f"command finished successfully; execution took {(end - start)/60:.2f} m"
         )
     return 0
 
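The change above switches the elapsed-time report from seconds to minutes. A minimal standalone sketch of the new formatting (the sleep is a stand-in for the actual command):

import time

start = time.time()
time.sleep(0.1)  # stand-in for the command being timed
end = time.time()
# mirrors the new f-string; note a 0.1 s run prints "0.00 m" at two decimals
print(f"command finished successfully; execution took {(end - start)/60:.2f} m")
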
neat/common/logging.py (+2)

@@ -57,4 +57,6 @@ def setup_logging(
 
     logging.basicConfig(**kwargs)
 
+    logging.getLogger(__name__).info(f"writing log to: {log_file}")
+
 
neat/common/ploid_functions.py (+10 -16)

@@ -28,24 +28,18 @@ def pick_ploids(ploidy: int,
     :return: a list of strings representing the genotype of each ploid.
     """
     # number of ploids to make this mutation on (always at least 1)
+    how_many = 1
     if rng.random() < homozygous_frequency:
-        if ploidy <= 2:
-            # If it's homozygous with a ploidy of 2 it's on both.
-            how_many = ploidy
-        else:
-            # if it's polyploid, it's on more than one ploid
-            # TODO may need to improve the modeling for polyploid
-            how_many = 1
-            for i in range(ploidy):
-                # Not totally sure how to model this, so I'm counting each
-                # ploid as a separate homozygous event. That doesn't exactly make
-                # sense though, so we'll improve this later.
-                if rng.random() < homozygous_frequency:
-                    how_many += 1
-                else:
-                    break
+        # We'll consider this one to be homozygous
+        how_many = ploidy
     else:
-        how_many = 1
+        if ploidy == 1:
+            # special case where heterozygous makes no sense
+            how_many = 1
+        else:
+            # if it's polyploid, we'll consider it to be on roughly half the ploids
+            # TODO may need to improve the modeling for polyploid, maybe
+            how_many = ploidy//2
 
     # wp is just the temporary genotype list, a hat tip to the old version of NEAT
     wp = np.zeros(ploidy)
 
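The simplified branch logic above can be exercised in isolation. A minimal sketch, assuming a numpy Generator; how_many_ploids is a hypothetical helper covering only the ploid-count choice, not the genotype assignment that pick_ploids also does:

import numpy as np

def how_many_ploids(ploidy: int, homozygous_frequency: float, rng) -> int:
    # mirrors the new pick_ploids logic for choosing the ploid count
    if rng.random() < homozygous_frequency:
        return ploidy          # homozygous: mutation lands on every ploid
    if ploidy == 1:
        return 1               # heterozygous makes no sense for a haploid
    return ploidy // 2         # polyploid: roughly half the ploids

rng = np.random.default_rng(7)
print([how_many_ploids(4, 0.3, rng) for _ in range(5)])
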
neat/models/default_fraglen_model.py (-8)

This file was deleted.

neat/models/models.py (+20 -41)

@@ -21,7 +21,6 @@
 from .default_mutation_model import *
 from .default_sequencing_error_model import *
 from .default_gc_bias_model import *
-from .default_fraglen_model import *
 from .utils import bin_scores, take_closest
 
 __all__ = [

@@ -389,16 +388,17 @@ def __init__(self,
 
     def get_sequencing_errors(self,
                               read_length: int,
+                              padding: int,
                               reference_segment: SeqRecord,
                               quality_scores: np.ndarray):
         """
         Inserts errors of type substitution, insertion, or deletion into read_data, and assigns a quality score
         based on the container model.
         :param read_length: The length of the read to generate errors for.
+        :param padding: this is the amount of space we have in the read for deletions.
         :param reference_segment: The section of the reference from which the read is drawn
         :param quality_scores: Array of quality scores for the read
-        :return: modified sequence and associated quality scores
-
+        :return: Modified sequence and associated quality scores
         """
 
         error_indexes = []

@@ -413,7 +413,7 @@ def get_sequencing_errors(self,
             if self.rng.random() < self.quality_score_error_rate[quality_scores[i]]:
                 error_indexes.append(i)
 
-        num_indels_so_far = 0
+        total_indel_length = 0
         # To prevent deletion collisions
         del_blacklist = []
 

@@ -424,19 +424,24 @@ def get_sequencing_errors(self,
             # Not too sure about how realistic it is to model errors as indels, but I'm leaving the code in for now.
 
             # This is to prevent deletion error collisions and to keep there from being too many indel errors.
-            if 0 < index < self.read_length - max(self.deletion_len_model) and num_indels_so_far > self.read_length//2:
+            if 0 < index < self.read_length - max(self.deletion_len_model) and total_indel_length > self.read_length//4:
                 error_type = self.rng.choice(a=list(self.variant_probs), p=list(self.variant_probs.values()))
 
                 # Deletion error
                 if error_type == Deletion:
                     deletion_length = self.get_deletion_length()
+                    if padding - deletion_length < 0:
+                        # No space in this read to add this deletion
+                        continue
                     deletion_reference = reference_segment.seq[index: index + deletion_length + 1]
                     deletion_alternate = deletion_reference[0]
                     introduced_errors.append(
                         ErrorContainer(Deletion, index, deletion_length, deletion_reference, deletion_alternate)
                     )
-                    num_indels_so_far += deletion_length
+                    total_indel_length += deletion_length
+
                     del_blacklist.extend(list(range(index, index + deletion_length)))
+                    padding -= deletion_length
 
                 elif error_type == Insertion:
                     insertion_length = self.get_insertion_length()

@@ -446,7 +451,7 @@ def get_sequencing_errors(self,
                     introduced_errors.append(
                         ErrorContainer(Insertion, index, insertion_length, insertion_reference, insertion_alternate)
                     )
-                    num_indels_so_far += insertion_length
+                    total_indel_length += insertion_length
 
             # Insert substitution error
             # Programmer note: if you add new error types, they can be added as elifs above, leaving the final

@@ -465,7 +470,7 @@ def get_sequencing_errors(self,
             if introduced_errors[i].location in del_blacklist:
                 del introduced_errors[i]
 
-        return introduced_errors
+        return introduced_errors, max(padding, 0)
 
     def quality_index_remap(self, input_read_length):
         """
@@ -600,53 +605,27 @@ class FragmentLengthModel:
 
     :param fragment_mean: the mean of the collection of fragment lengths derived from data
     :param fragment_std: the standard deviation of the collection of fragment lengths derived from data
-    :param fragment_max: the largest fragment observed in the data
-    :param fragment_min: the smallest fragment observed in data
     :param rng: the random number generator for the run
     """
 
     def __init__(self,
-                 fragment_mean: float = None,
-                 fragment_std: float = None,
-                 fragment_max: int = None,
-                 fragment_min: int = None,
+                 fragment_mean: float,
+                 fragment_std: float,
                  rng: Generator = None):
-        self.fragment_mean = fragment_mean if fragment_mean else default_fragment_mean
-        self.fragment_st_dev = fragment_std if fragment_std else default_fragment_std
-        self.fragment_max = fragment_max if fragment_max else default_fragment_max
-        self.fragment_min = fragment_min if fragment_min else default_fragment_min
+        self.fragment_mean = fragment_mean
+        self.fragment_st_dev = fragment_std
         self.rng = rng
 
     def generate_fragments(self,
                            total_length: int,
-                           read_length: int,
-                           coverage: int) -> list:
+                           number_of_fragments: int) -> list:
         """
         Generates a number of fragments based on the total length needed, and the mean and standard deviation of the set
 
         :param total_length: Length of the reference segment we are covering.
-        :param read_length: average length of the reads
-        :param coverage: the target coverage number
+        :param number_of_fragments: The number of fragments needed.
         :return: A list of random fragment lengths sampled from the model.
         """
-        # Estimate the number of fragments needed (with a 2x padding)
-        number_of_fragments = int(round(total_length / read_length) * (coverage * 2))
-        # Check that we don't have unusable values for fragment mean. Too many fragments under the read length means
-        # NEAT will either get caught in an infinite cycle of sampling fragments but never finding one that works, or
-        # it will only find a few and will run very slowly.
-        if self.fragment_mean < read_length:
-            # Let's just reset the fragment mean to make up for this.
-            self.fragment_mean = read_length
         # generates a distribution, assuming normality, then rounds the result and converts to ints
         dist = np.round(self.rng.normal(self.fragment_mean, self.fragment_st_dev, size=number_of_fragments)).astype(int)
-        # filter the list to throw out outliers and to set anything under the read length to the read length.
-        dist = [max(x, read_length) for x in dist if x <= self.fragment_max]
-        # Just a sanity check to make sure our data isn't too thin:
-        while number_of_fragments - len(dist) > 0:
-            additional_fragment = self.rng.normal(loc=self.fragment_mean, scale=self.fragment_st_dev)
-            if additional_fragment < read_length:
-                continue
-            dist.append(round(additional_fragment))
-
-        # Now set a minimum on the dataset. Any fragment smaller than read_length gets turned into read_length
-        return dist
+        return [abs(x) for x in dist]
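
The rewritten generate_fragments is now a plain normal-distribution sampler. A minimal sketch of the same logic, assuming numpy and hypothetical parameter values:

import numpy as np

rng = np.random.default_rng(42)
fragment_mean, fragment_st_dev = 300.0, 30.0
number_of_fragments = 10

# sample assuming normality, round, and convert to ints, as in the diff above
dist = np.round(rng.normal(fragment_mean, fragment_st_dev, size=number_of_fragments)).astype(int)
fragments = [abs(x) for x in dist]  # abs() guards against rare negative draws
print(fragments)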

neat/read_simulator/runner.py (+19 -15)

@@ -9,8 +9,8 @@
 from Bio import SeqIO
 from pathlib import Path
 
-from .utils import Options, parse_input_vcf, parse_beds, fill_out_bed_dict, OutputFileWriter, find_file_breaks, \
-    map_chromosome, generate_variants, write_local_file, generate_reads
+from .utils import Options, parse_input_vcf, parse_beds, OutputFileWriter, find_file_breaks, \
+    generate_variants, write_local_file, generate_reads
 from ..common import validate_input_path, validate_output_path
 from ..models import MutationModel, SequencingErrorModel, FragmentLengthModel, GcModel
 from ..models.default_cancer_mutation_model import *

@@ -45,7 +45,7 @@ def initialize_all_models(options: Options):
 
     cancer_model = None
     if options.cancer and options.cancer_model:
-        cancer_model = pickle.load(gzip.open(options.cancer_model))
+        # cancer_model = pickle.load(gzip.open(options.cancer_model))
         # Set the rng for the cancer mutation model
         cancer_model.rng = options.rng
     elif options.cancer:

@@ -98,15 +98,16 @@ def initialize_all_models(options: Options):
 
     _LOG.debug('GC Bias model loaded')
 
-    fraglen_model = None
-    if options.paired_ended:
-        if options.fragment_model:
-            fraglen_model = pickle.load(gzip.open(options.fragment_model))
-        else:
-            fraglen_model = FragmentLengthModel(options.fragment_mean, options.fragment_st_dev)
-
-        # Set the rng for the fragment length model
+    if options.fragment_model:
+        fraglen_model = pickle.load(gzip.open(options.fragment_model))
         fraglen_model.rng = options.rng
+    elif options.fragment_mean:
+        fraglen_model = FragmentLengthModel(options.fragment_mean, options.fragment_st_dev, rng=options.rng)
+    else:
+        # For single ended, fragment length will be based on read length
+        fragment_mean = options.read_len * 1.5
+        fragment_st_dev = fragment_mean * 0.2
+        fraglen_model = FragmentLengthModel(fragment_mean, fragment_st_dev, options.rng)
 
     _LOG.debug("Fragment length model loaded")
 
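Fragment-model selection is now a three-way fallback: a pickled model wins, then an explicit fragment mean, then read-length-derived defaults. A quick check of that fallback arithmetic, assuming a hypothetical 150 bp read length:

read_len = 150
fragment_mean = read_len * 1.5         # 225.0
fragment_st_dev = fragment_mean * 0.2  # 45.0
print(fragment_mean, fragment_st_dev)
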
@@ -174,6 +175,7 @@ def read_simulator_runner(config: str, output: str):
     """
     _LOG.info(f'Reading {options.reference}.')
 
+    # TODO check into SeqIO.index_db()
     reference_index = SeqIO.index(str(options.reference), "fasta")
     _LOG.debug("Reference file indexed.")
 

@@ -300,7 +302,7 @@ def read_simulator_runner(config: str, output: str):
         pass
 
     if options.paired_ended:
-        max_qual_score = max(seq_error_model_1.quality_scores + seq_error_model_2.quality_scores)
+        max_qual_score = max(max(seq_error_model_1.quality_scores), max(seq_error_model_2.quality_scores))
     else:
         max_qual_score = max(seq_error_model_1.quality_scores)
 
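The max_qual_score fix matters if the quality_scores attributes are numpy arrays, which the nested-max rewrite suggests: "+" on ndarrays adds elementwise rather than concatenating, so the old expression took the max of the sums. A small illustration with hypothetical score arrays:

import numpy as np

q1 = np.array([2, 11, 25, 37])
q2 = np.array([2, 11, 25, 40])

print(max(q1 + q2))           # 77: max of the elementwise sums, not a real score
print(max(max(q1), max(q2)))  # 40: the intended maximum
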
@@ -329,11 +331,12 @@ def read_simulator_runner(config: str, output: str):
         fasta_files.extend(local_fasta_file)
 
         if options.produce_fastq or options.produce_bam:
-            read1_fastq, read2_fastq = \
+            read1_fastq_paired, read1_fastq_single, read2_fastq_paired, read2_fastq_single = \
                 generate_reads(local_reference,
                                local_bam_pickle_file,
                                seq_error_model_1,
                                seq_error_model_2,
+                               mut_model,
                                gc_bias_model,
                                fraglen_model,
                                local_variants,

@@ -343,7 +346,8 @@ def read_simulator_runner(config: str, output: str):
                                options,
                                contig)
 
-            fastq_files.append((read1_fastq, read2_fastq))
+            contig_temp_fastqs = ((read1_fastq_paired, read2_fastq_paired), (read1_fastq_single, read2_fastq_single))
+            fastq_files.append(contig_temp_fastqs)
             if options.produce_bam:
                 sam_reads_files.append(local_bam_pickle_file)
 
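generate_reads now returns paired and singleton fastq files separately, which the runner regroups per contig. A sketch of the new grouping, with hypothetical file names standing in for the temp-file paths:

# paired reads grouped together, singletons grouped together, per the diff above
read1_fastq_paired, read2_fastq_paired = "r1_paired.fq.gz", "r2_paired.fq.gz"
read1_fastq_single, read2_fastq_single = "r1_single.fq.gz", "r2_single.fq.gz"

contig_temp_fastqs = ((read1_fastq_paired, read2_fastq_paired),
                      (read1_fastq_single, read2_fastq_single))
fastq_files = [contig_temp_fastqs]
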
@@ -367,4 +371,4 @@ def read_simulator_runner(config: str, output: str):
         _LOG.info(f"Outputting golden bam file: {str(output_file_writer.bam_fn)}")
         contig_list = list(reference_index)
         contigs_by_index = {contig_list[n]: n for n in range(len(contig_list))}
-        output_file_writer.output_bam_file(sam_reads_files, contigs_by_index)
+        output_file_writer.output_bam_file(sam_reads_files, contigs_by_index, options.read_len)
