1
1
"""
2
2
Runner for generate_reads task
3
3
"""
4
- import copy
4
+ import time
5
5
import logging
6
6
import pickle
7
7
import gzip
10
10
from pathlib import Path
11
11
12
12
from .utils import Options , parse_input_vcf , parse_beds , OutputFileWriter , \
13
- generate_variants , write_local_file , generate_reads
13
+ generate_variants , generate_reads
14
14
from ..common import validate_input_path , validate_output_path
15
15
from ..models import MutationModel , SequencingErrorModel , FragmentLengthModel
16
16
from ..models .default_cancer_mutation_model import *
@@ -249,33 +249,24 @@ def read_simulator_runner(config: str, output: str):
249
249
# these will be the features common to each contig, for multiprocessing
250
250
common_features = {}
251
251
252
- all_variants = {} # dict of all ContigVariants objects, indexed by contig, which we will collect at the end.
253
- vcf_files = []
252
+ local_variant_files = {}
254
253
fastq_files = []
255
254
256
255
sam_reads_files = []
257
256
258
257
for contig in breaks :
258
+ local_variant_files [contig ] = None
259
259
260
260
_LOG .info (f"Generating variants for { contig } " )
261
261
262
- # Todo genericize breaks
263
-
264
262
input_variants = input_variants_dict [contig ]
265
263
# TODO: add the ability to pick up input variants here from previous loop
266
264
267
265
local_reference = reference_index [contig ]
268
266
269
- # _LOG.info(f'Creating trinucleotide map for {contig}...')
270
- # local_trinuc_map = map_chromosome(local_reference, mut_model)
271
-
272
267
# Since we're only running single threaded for now:
273
268
threadidx = 1
274
269
275
- local_variant_file = options .temp_dir_path / f'{ options .output .stem } _tmp_{ contig } _{ threadidx } .vcf.gz'
276
-
277
- _LOG .debug (f'local vcf filename = { local_variant_file } ' )
278
-
279
270
local_bam_pickle_file = None
280
271
if options .produce_bam :
281
272
local_bam_pickle_file = options .temp_dir_path / f'{ options .output .stem } _tmp_{ contig } _{ threadidx } .p.gz'
@@ -296,20 +287,8 @@ def read_simulator_runner(config: str, output: str):
296
287
max_qual_score = max_qual_score ,
297
288
options = options )
298
289
299
- _LOG .info (f'Outputting temp vcf for { contig } for later use' )
300
- # This function produces the local vcf file.
301
- # TODO pickle dump the ContigVariants object instead. Combine them into one vcf
302
- # at the end.
303
- write_local_file (
304
- local_variant_file ,
305
- local_variants ,
306
- local_reference ,
307
- target_regions_dict [contig ],
308
- discard_regions_dict [contig ]
309
- )
310
-
311
- # The above function writes data to local_variant_file, so we need only store its location.
312
- vcf_files .append (local_variant_file )
290
+ # Store the local variant data for this contig in a dictionary. We may need to write this to file.
291
+ local_variant_files [contig ] = local_variants
313
292
314
293
if options .produce_fastq or options .produce_bam :
315
294
read1_fastq_paired , read1_fastq_single , read2_fastq_paired , read2_fastq_single = \
@@ -333,15 +312,15 @@ def read_simulator_runner(config: str, output: str):
333
312
334
313
if options .produce_vcf :
335
314
_LOG .info (f"Outputting golden vcf: { str (output_file_writer .vcf_fn )} " )
336
- output_file_writer .merge_temp_vcfs ( vcf_files )
315
+ output_file_writer .write_final_vcf ( local_variant_files , reference_index )
337
316
338
317
if options .produce_fastq :
339
318
if options .paired_ended :
340
319
_LOG .info (f"Outputting fastq files: "
341
320
f"{ ', ' .join ([str (x ) for x in output_file_writer .fastq_fns ]).strip (', ' )} " )
342
321
else :
343
322
_LOG .info (f"Outputting fastq file: { output_file_writer .fastq_fns [0 ]} " )
344
- output_file_writer .merge_temp_fastqs (fastq_files , options .paired_ended , options . rng )
323
+ output_file_writer .merge_temp_fastqs (fastq_files , options .rng )
345
324
346
325
if options .produce_bam :
347
326
_LOG .info (f"Outputting golden bam file: { str (output_file_writer .bam_fn )} " )
0 commit comments