@@ -25,6 +25,7 @@ def simulate_reads(
25
25
cpu_number ,
26
26
forward_handle ,
27
27
reverse_handle ,
28
+ mutations_handle ,
28
29
sequence_type ,
29
30
gc_bias = False ,
30
31
mode = "default" ,
@@ -42,6 +43,7 @@ def simulate_reads(
42
43
function. Is used for naming the output file
43
44
forward_handle (file): a file handle to write the forward reads to
44
45
reverse_handle (file): a file handle to write the reverse reads to
46
+ mutations_handle (file): a file handle to write the mutations to
45
47
sequencing_type (str): metagenomics or amplicon sequencing used
46
48
gc_bias (bool): if set, the function may skip a read due to abnormal
47
49
GC content
@@ -56,11 +58,12 @@ def simulate_reads(
56
58
57
59
logger .debug ("Cpu #%s: Generating %s read pairs" % (cpu_number , n_pairs ))
58
60
59
- for forward_record , reverse_record in reads_generator (
61
+ for forward_record , reverse_record , mutations in reads_generator (
60
62
n_pairs , record , error_model , cpu_number , gc_bias , sequence_type
61
63
):
62
64
SeqIO .write (forward_record , forward_handle , "fastq-sanger" )
63
65
SeqIO .write (reverse_record , reverse_handle , "fastq-sanger" )
66
+ write_mutations (mutations , mutations_handle )
64
67
65
68
66
69
def reads_generator (n_pairs , record , error_model , cpu_number , gc_bias , sequence_type ):
@@ -69,7 +72,9 @@ def reads_generator(n_pairs, record, error_model, cpu_number, gc_bias, sequence_
69
72
i = 0
70
73
while i < n_pairs :
71
74
try :
72
- forward , reverse = simulate_read (record , error_model , i , cpu_number , sequence_type )
75
+ # forward, reverse = simulate_read(record, error_model, i, cpu_number, sequence_type)
76
+ forward , reverse , mutations = simulate_read (record , error_model , i , cpu_number , sequence_type )
77
+
73
78
except AssertionError :
74
79
logger .warning ("%s shorter than read length for this ErrorModel" % record .id )
75
80
logger .warning ("Skipping %s. You will have less reads than specified" % record .id )
@@ -79,15 +84,15 @@ def reads_generator(n_pairs, record, error_model, cpu_number, gc_bias, sequence_
79
84
stiched_seq = forward .seq + reverse .seq
80
85
gc_content = gc_fraction (stiched_seq )
81
86
if 40 < gc_content < 60 :
82
- yield (forward , reverse )
87
+ yield (forward , reverse , mutations )
83
88
i += 1
84
89
elif np .random .rand () < 0.90 :
85
- yield (forward , reverse )
90
+ yield (forward , reverse , mutations )
86
91
i += 1
87
92
else :
88
93
continue
89
94
else :
90
- yield (forward , reverse )
95
+ yield (forward , reverse , mutations )
91
96
i += 1
92
97
93
98
@@ -145,6 +150,9 @@ def simulate_read(record, error_model, i, cpu_number, sequence_type):
145
150
forward = SeqRecord (
146
151
Seq (str (sequence [forward_start :forward_end ])), id = "%s_%s_%s/1" % (header , i , cpu_number ), description = ""
147
152
)
153
+ forward .annotations ["mutations" ] = []
154
+ forward .annotations ["original" ] = str (forward .seq )
155
+
148
156
# add the indels, the qual scores and modify the record accordingly
149
157
forward .seq = error_model .introduce_indels (forward , "forward" , sequence , bounds )
150
158
forward = error_model .introduce_error_scores (forward , "forward" )
@@ -174,13 +182,15 @@ def simulate_read(record, error_model, i, cpu_number, sequence_type):
174
182
id = "%s_%s_%s/2" % (header , i , cpu_number ),
175
183
description = "" ,
176
184
)
185
+ reverse .annotations ["mutations" ] = []
186
+ reverse .annotations ["original" ] = str (reverse .seq )
177
187
178
188
# add the indels, the qual scores and modify the record accordingly
179
189
reverse .seq = error_model .introduce_indels (reverse , "reverse" , sequence , bounds )
180
190
reverse = error_model .introduce_error_scores (reverse , "reverse" )
181
191
reverse .seq = error_model .mut_sequence (reverse , "reverse" )
182
192
183
- return (forward , reverse )
193
+ return (forward , reverse ) # mutations
184
194
185
195
186
196
def to_fastq (generator , output ):
@@ -217,6 +227,7 @@ def worker_iterator(work, error_model, cpu_number, worker_prefix, seed, sequence
217
227
try :
218
228
forward_handle = open (f"{ worker_prefix } _R1.fastq" , "w" )
219
229
reverse_handle = open (f"{ worker_prefix } _R2.fastq" , "w" )
230
+ mutation_handle = open (f"{ worker_prefix } .vcf" , "w" )
220
231
except PermissionError as e :
221
232
logger .error ("Failed to write temporary output file(s): %s" % e )
222
233
sys .exit (1 )
@@ -235,6 +246,7 @@ def worker_iterator(work, error_model, cpu_number, worker_prefix, seed, sequence
235
246
cpu_number = cpu_number ,
236
247
forward_handle = forward_handle ,
237
248
reverse_handle = reverse_handle ,
249
+ mutations_handle = mutation_handle ,
238
250
sequence_type = sequence_type ,
239
251
gc_bias = gc_bias ,
240
252
)
@@ -345,7 +357,7 @@ def generate_work_divider(
345
357
yield chunk_work
346
358
347
359
348
- def load_error_model (mode , seed , model , fragment_length , fragment_length_sd ):
360
+ def load_error_model (mode , seed , model , fragment_length , fragment_length_sd , store_mutations ):
349
361
"""
350
362
Load the error model based on the specified mode and parameters.
351
363
@@ -387,12 +399,12 @@ def load_error_model(mode, seed, model, fragment_length, fragment_length_sd):
387
399
npz = os .path .join (os .path .dirname (__file__ ), "profiles/MiSeq" )
388
400
else :
389
401
npz = model
390
- err_mod = kde .KDErrorModel (npz , fragment_length , fragment_length_sd )
402
+ err_mod = kde .KDErrorModel (npz , fragment_length , fragment_length_sd , store_mutations )
391
403
elif mode == "basic" :
392
404
if model is not None :
393
405
logger .warning ("--model %s will be ignored in --mode %s" % (model , mode ))
394
406
395
- err_mod = basic .BasicErrorModel (fragment_length , fragment_length_sd )
407
+ err_mod = basic .BasicErrorModel (fragment_length , fragment_length_sd , store_mutations )
396
408
elif mode == "perfect" :
397
409
if model is not None :
398
410
logger .warning ("--model %s will be ignored in --mode %s" % (model , mode ))
@@ -575,3 +587,28 @@ def load_readcount_or_abundance(
575
587
sys .exit (1 )
576
588
577
589
return readcount_dic , abundance_dic
590
+
591
+
592
def write_mutations(mutations, mutations_handle):
    """Write mutations to a file as tab-separated, VCF-style data lines.

    One line is written per mutation with eight columns, in this order:
    ``id``, ``position + 1``, ``.``, ``ref``, ``alt``, ``quality``, ``.``, ``.``.
    No header is written by this function.

    Args:
        mutations (list): List of mutations. Each item is a mapping with the
            keys ``"id"``, ``"position"`` (0-based), ``"ref"``, ``"alt"`` and
            ``"quality"``. ``None`` is treated as "nothing to write".
        mutations_handle (file): File handle to write the mutations to.

    Raises:
        KeyError: if a mutation is missing one of the required keys.
    """
    if mutations is None:
        return

    for vcf_dict in mutations:
        fields = (
            str(vcf_dict["id"]),
            str(vcf_dict["position"] + 1),  # vcf files have 1-based index
            ".",  # variant identifier column: none available
            str(vcf_dict["ref"]),  # str() so non-str sequences do not break join
            str(vcf_dict["alt"]),
            str(vcf_dict["quality"]),
            ".",  # FILTER: "." is the VCF missing-value marker
            ".",  # INFO: "." is the VCF missing-value marker
        )
        # Columns are separated by a single tab with no padding, so that
        # tab-delimited parsers see exactly eight fields per line.
        mutations_handle.write("\t".join(fields) + "\n")
0 commit comments