Skip to content

Commit fd00e28

Browse files
Merge pull request #138 from ncsa/feature/fix_output
Feature/fix output
2 parents 374f415 + ffa45a1 commit fd00e28

File tree

6 files changed

+44
-37
lines changed

6 files changed

+44
-37
lines changed

environment.yml

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
name: neat
22
channels:
3-
- bioconda
4-
- conda-forge
5-
- defaults
3+
- conda-forge
4+
- bioconda
5+
66
dependencies:
77
- python=3.10.*
88
- biopython=1.79

neat/models/error_models.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ def get_sequencing_errors(
189189
if self.average_error == 0:
190190
return introduced_errors
191191
else:
192-
for i in range(self.read_length):
192+
for i in range(len(quality_scores)):
193193
if rng.random() < quality_score_error_rate[quality_scores[i]]:
194194
error_indexes.append(i)
195195

neat/read_simulator/runner.py

+14-11
Original file line numberDiff line numberDiff line change
@@ -177,18 +177,20 @@ def read_simulator_runner(config: str, output: str):
177177

178178
# TODO check into SeqIO.index_db()
179179
reference_index = SeqIO.index(str(options.reference), "fasta")
180+
reference_keys_with_lens = {key: len(value) for key, value in reference_index.items()}
180181
_LOG.debug("Reference file indexed.")
181182

182183
if _LOG.getEffectiveLevel() < 20:
183184
count = 0
184-
for contig in reference_index:
185-
count += len(reference_index[contig])
185+
for contig in reference_keys_with_lens:
186+
count += reference_keys_with_lens[contig]
186187
_LOG.debug(f"Length of reference: {count / 1_000_000:.2f} Mb")
187188

188-
input_variants_dict = {x: ContigVariants() for x in reference_index}
189+
input_variants_dict = {x: ContigVariants() for x in reference_keys_with_lens}
189190
if options.include_vcf:
190191
_LOG.info(f"Reading input VCF: {options.include_vcf}.")
191192
if options.cancer:
193+
# TODO Check if we need full ref index or just keys and lens
192194
sample_names = parse_input_vcf(
193195
input_variants_dict,
194196
options.include_vcf,
@@ -202,6 +204,7 @@ def read_simulator_runner(config: str, output: str):
202204
tumor_ind = sample_names['tumor_sample']
203205
normal_ind = sample_names['normal_sample']
204206
else:
207+
# TODO Check if we need full ref index or just keys and lens
205208
sample_names = parse_input_vcf(
206209
input_variants_dict,
207210
options.include_vcf,
@@ -224,7 +227,7 @@ def read_simulator_runner(config: str, output: str):
224227
target_regions_dict,
225228
discard_regions_dict,
226229
mutation_rate_dict
227-
) = parse_beds(options, reference_index, mut_model.avg_mut_rate)
230+
) = parse_beds(options, reference_keys_with_lens, mut_model.avg_mut_rate)
228231

229232
if any(bed_files):
230233
_LOG.debug("Finished reading input beds.")
@@ -234,7 +237,7 @@ def read_simulator_runner(config: str, output: str):
234237
if options.produce_bam:
235238
# This is a dictionary that is the list of the contigs and the length of each.
236239
# This information will be needed later to create the bam header.
237-
bam_header = {key: len(reference_index[key]) for key in reference_index}
240+
bam_header = reference_keys_with_lens
238241

239242
# Creates files and sets up objects for files that can be written to as needed.
240243
# Also creates headers for bam and vcf.
@@ -259,7 +262,7 @@ def read_simulator_runner(config: str, output: str):
259262
"""
260263
_LOG.info("Beginning simulation.")
261264

262-
breaks = find_file_breaks(reference_index)
265+
breaks = find_file_breaks(reference_keys_with_lens)
263266

264267
_LOG.debug("Input reference partitioned for run")
265268

@@ -345,20 +348,20 @@ def read_simulator_runner(config: str, output: str):
345348

346349
if options.produce_bam:
347350
_LOG.info(f"Outputting golden bam file: {str(output_file_writer.bam_fn)}")
348-
contig_list = list(reference_index)
351+
contig_list = list(reference_keys_with_lens)
349352
contigs_by_index = {contig_list[n]: n for n in range(len(contig_list))}
350353
output_file_writer.output_bam_file(sam_reads_files, contigs_by_index, options.read_len)
351354

352355

353-
def find_file_breaks(reference_index: dict) -> dict:
356+
def find_file_breaks(reference_keys_with_lens: dict) -> dict:
354357
"""
355358
Returns a dictionary with the chromosomes as keys, which is the start of building the chromosome map
356359
357-
:param reference_index: a dictionary with chromosome keys and sequence values
360+
:param reference_keys_with_lens: a dictionary with chromosome keys and sequence values
358361
:return: a dictionary containing the chromosomes as keys and either "all" for values, or a list of indices
359362
"""
360363
partitions = {}
361-
for contig in reference_index.keys():
362-
partitions[contig] = [(0, len(reference_index[contig]))]
364+
for contig in reference_keys_with_lens:
365+
partitions[contig] = [(0, reference_keys_with_lens[contig])]
363366

364367
return partitions

neat/read_simulator/utils/bed_func.py

+17-16
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,12 @@
1919
_LOG = logging.getLogger(__name__)
2020

2121

22-
def parse_beds(options: Options, reference_dict: _IndexedSeqFileDict, average_mutation_rate: float) -> list:
22+
def parse_beds(options: Options, ref_keys_counts: dict, average_mutation_rate: float) -> list:
2323
"""
2424
This single function parses the three possible bed file types for NEAT.
2525
2626
:param options: The options object for this run
27-
:param reference_dict: The reference dict generated by SeqIO for this run
27+
:param ref_keys_counts: A dictionary containing the reference keys and the associated lengths.
2828
:param average_mutation_rate: The average mutation rate from the model or user input for this run
2929
:return: target_dict, discard_dict, mutation_rate_dict
3030
"""
@@ -57,13 +57,13 @@ def parse_beds(options: Options, reference_dict: _IndexedSeqFileDict, average_mu
5757
for i in range(len(bed_files)):
5858
temp_bed_dict = parse_single_bed(
5959
bed_files[i],
60-
reference_dict,
60+
ref_keys_counts,
6161
processing_structure[i]
6262
)
6363
# If there was a bed this function will fill in any gaps the bed might have missed, so we have a complete map
6464
# of each chromosome. The uniform case in parse_single_bed handles this in cases of no input bed.
6565
if processing_structure[i][2]:
66-
final_dict = fill_out_bed_dict(reference_dict, temp_bed_dict, processing_structure[i])
66+
final_dict = fill_out_bed_dict(ref_keys_counts, temp_bed_dict, processing_structure[i])
6767
else:
6868
final_dict = temp_bed_dict
6969

@@ -73,7 +73,7 @@ def parse_beds(options: Options, reference_dict: _IndexedSeqFileDict, average_mu
7373

7474

7575
def parse_single_bed(input_bed: str,
76-
reference_dictionary: _IndexedSeqFileDict,
76+
ref_keys_counts: dict,
7777
process_key: tuple[str, float, bool],
7878
) -> dict:
7979
"""
@@ -82,12 +82,13 @@ def parse_single_bed(input_bed: str,
8282
then we will also import the mutation data.
8383
8484
:param input_bed: A bed file containing the regions of interest
85-
:param reference_dictionary: A list of chromosomes to check, along with their reads and lengths
85+
:param ref_keys_counts: A dict of chromosomes to check (key), along with their lengths (value)
8686
:param process_key: Indicates which bed we are processing and the factor we need to process it.
8787
:return: a dictionary of chromosomes: [(pos1, pos2, factor), etc...]
8888
"""
89-
ret_dict = {x: [] for x in reference_dictionary}
89+
ret_dict = {x: [] for x in ref_keys_counts}
9090
in_bed_only = []
91+
key_pool = list(ref_keys_counts.keys())
9192

9293
# process_key[2] is True when there is an input bed, False otherwise.
9394
if process_key[2]:
@@ -107,7 +108,7 @@ def parse_single_bed(input_bed: str,
107108
sys.exit(1)
108109
# Bed file chromosome names must match the reference.
109110
try:
110-
assert my_chr in reference_dictionary
111+
assert my_chr in key_pool
111112
except AssertionError:
112113
in_bed_only.append(my_chr)
113114
_LOG.warning(
@@ -149,7 +150,7 @@ def parse_single_bed(input_bed: str,
149150
ret_dict[my_chr].append((int(pos1), int(pos2), True))
150151

151152
# some validation
152-
in_ref_only = [k for k in reference_dictionary if k not in ret_dict]
153+
in_ref_only = [k for k in ref_keys_counts if k not in ret_dict]
153154
if in_ref_only:
154155
_LOG.warning(f'Warning: Reference contains sequences not found in BED file {input_bed}. '
155156
'These chromosomes will be omitted from the outputs.')
@@ -161,13 +162,13 @@ def parse_single_bed(input_bed: str,
161162
_LOG.debug(f'Regions ignored: {list(set(in_bed_only))}')
162163

163164
else:
164-
for my_chr in reference_dictionary.keys():
165-
ret_dict[my_chr].append((0, len(reference_dictionary[my_chr]), process_key[1]))
165+
for my_chr in ref_keys_counts:
166+
ret_dict[my_chr].append((0, ref_keys_counts[my_chr], process_key[1]))
166167

167168
return ret_dict
168169

169170

170-
def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
171+
def fill_out_bed_dict(ref_keys_counts: dict,
171172
region_dict: dict | None,
172173
default_factor: tuple[str, bool, bool],
173174
) -> dict:
@@ -177,9 +178,9 @@ def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
177178
178179
The input to this function is the dict for a single chromosome.
179180
180-
:param ref_dict: The reference dictionary for the run. Needed to calulate max values. We shouldn't need to access
181-
the data it refers to.
182-
:param region_dict: A dict based on the input from the bed file, with keys being all the chronmosomes
181+
:param ref_keys_counts: A dictionary with the keys as the contigs from the reference dictionary, and the values as
182+
the length of the contig. Needed to calculate max values.
183+
:param region_dict: A dict based on the input from the bed file, with keys being all the chromosomes
183184
present in the reference.
184185
:param default_factor: This is a tuple representing the type and any extra information. The extra info is for
185186
mutation rates. If beds were input, then these values come from the bed, else they are set to one value
@@ -191,7 +192,7 @@ def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
191192

192193
for contig in region_dict:
193194
regions = []
194-
max_value = len(ref_dict[contig])
195+
max_value = ref_keys_counts[contig]
195196
start = 0
196197

197198
# The trivial case of no data yet for this contig the region gets filled out as 0 to len(contig_sequence)

neat/read_simulator/utils/output_file_writer.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
import gzip
1212
import re
13+
import shutil
1314
import time
1415
from struct import pack
1516
import logging
@@ -101,7 +102,7 @@ def __init__(self,
101102
files_to_write.extend(self.fastq_fns)
102103
elif self.write_fastq:
103104
self.fastq1_fn = options.output.parent / f'{options.output.stem}.fastq.gz'
104-
self.fastq2_fn = options.output.parent / "dummy.fastq.gz"
105+
self.fastq2_fn = self.temporary_dir / "dummy.fastq.gz"
105106
self.fastq_fns = [self.fastq1_fn, self.fastq2_fn]
106107
files_to_write.extend(self.fastq_fns)
107108
if self.write_bam:
@@ -115,7 +116,7 @@ def __init__(self,
115116
self.files_to_write = files_to_write
116117

117118
# Create files as applicable
118-
for file in [x for x in self.files_to_write if x != "dummy.fastq.gz"]:
119+
for file in [x for x in self.files_to_write if x.name != "dummy.fastq.gz"]:
119120
validate_output_path(file, True, options.overwrite_output)
120121

121122
mode = 'xt'

neat/read_simulator/utils/read.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -453,10 +453,12 @@ def make_cigar(self):
453453
cig_string, cig_count, curr_char, cig_length = self.align_seqs()
454454
if cig_length < self.run_read_length:
455455
_LOG.warning("Poor alignment, trying reversed")
456-
cig_string2, cig_count2, curr_char2, cig_length = self.align_seqs()
457-
if cig_length < self.run_read_length:
458-
_LOG.error("Alignment still not working")
459-
sys.exit(1)
456+
cig_string2, cig_count2, curr_char2, cig_length2 = self.align_seqs()
457+
if cig_length2 < cig_length:
458+
cig_length = cig_length2
459+
while cig_length < self.run_read_length:
460+
cig_string += "M"
461+
cig_length += 1
460462

461463
# append the final section as we return
462464
return cig_string + str(cig_count) + curr_char

0 commit comments

Comments
 (0)