@@ -25,6 +25,7 @@ def simulate_reads(
25
25
cpu_number ,
26
26
forward_handle ,
27
27
reverse_handle ,
28
+ mutations_handle ,
28
29
sequence_type ,
29
30
gc_bias = False ,
30
31
mode = "default" ,
@@ -42,6 +43,7 @@ def simulate_reads(
42
43
function. Is used for naming the output file
43
44
forward_handle (file): a file handle to write the forward reads to
44
45
reverse_handle (file): a file handle to write the reverse reads to
46
+ mutations_handle (file): a file handle to write the mutations to
45
47
sequencing_type (str): metagenomics or amplicon sequencing used
46
48
gc_bias (bool): if set, the function may skip a read due to abnormal
47
49
GC content
@@ -56,11 +58,12 @@ def simulate_reads(
56
58
57
59
logger .debug ("Cpu #%s: Generating %s read pairs" % (cpu_number , n_pairs ))
58
60
59
- for forward_record , reverse_record in reads_generator (
61
+ for forward_record , reverse_record , mutations in reads_generator (
60
62
n_pairs , record , error_model , cpu_number , gc_bias , sequence_type
61
63
):
62
64
SeqIO .write (forward_record , forward_handle , "fastq-sanger" )
63
65
SeqIO .write (reverse_record , reverse_handle , "fastq-sanger" )
66
+ write_mutations (mutations , mutations_handle )
64
67
65
68
66
69
def reads_generator (n_pairs , record , error_model , cpu_number , gc_bias , sequence_type ):
@@ -69,7 +72,8 @@ def reads_generator(n_pairs, record, error_model, cpu_number, gc_bias, sequence_
69
72
i = 0
70
73
while i < n_pairs :
71
74
try :
72
- forward , reverse = simulate_read (record , error_model , i , cpu_number , sequence_type )
75
+ forward , reverse , mutations = simulate_read (record , error_model , i , cpu_number , sequence_type )
76
+
73
77
except AssertionError :
74
78
logger .warning ("%s shorter than read length for this ErrorModel" % record .id )
75
79
logger .warning ("Skipping %s. You will have less reads than specified" % record .id )
@@ -79,15 +83,15 @@ def reads_generator(n_pairs, record, error_model, cpu_number, gc_bias, sequence_
79
83
stiched_seq = forward .seq + reverse .seq
80
84
gc_content = gc_fraction (stiched_seq )
81
85
if 40 < gc_content < 60 :
82
- yield (forward , reverse )
86
+ yield (forward , reverse , mutations )
83
87
i += 1
84
88
elif np .random .rand () < 0.90 :
85
- yield (forward , reverse )
89
+ yield (forward , reverse , mutations )
86
90
i += 1
87
91
else :
88
92
continue
89
93
else :
90
- yield (forward , reverse )
94
+ yield (forward , reverse , mutations )
91
95
i += 1
92
96
93
97
@@ -145,10 +149,13 @@ def simulate_read(record, error_model, i, cpu_number, sequence_type):
145
149
forward = SeqRecord (
146
150
Seq (str (sequence [forward_start :forward_end ])), id = "%s_%s_%s/1" % (header , i , cpu_number ), description = ""
147
151
)
152
+ forward .annotations ["mutations" ] = []
153
+ forward .annotations ["original" ] = str (forward .seq )
154
+
148
155
# add the indels, the qual scores and modify the record accordingly
149
- forward . seq = error_model .introduce_indels (forward , "forward" , sequence , bounds )
156
+ forward = error_model .introduce_indels (forward , "forward" , sequence , bounds )
150
157
forward = error_model .introduce_error_scores (forward , "forward" )
151
- forward . seq = error_model .mut_sequence (forward , "forward" )
158
+ forward = error_model .mut_sequence (forward , "forward" )
152
159
153
160
# generate the reverse read
154
161
# assign start position reverse read
@@ -174,13 +181,15 @@ def simulate_read(record, error_model, i, cpu_number, sequence_type):
174
181
id = "%s_%s_%s/2" % (header , i , cpu_number ),
175
182
description = "" ,
176
183
)
184
+ reverse .annotations ["mutations" ] = []
185
+ reverse .annotations ["original" ] = str (reverse .seq )
177
186
178
187
# add the indels, the qual scores and modify the record accordingly
179
- reverse . seq = error_model .introduce_indels (reverse , "reverse" , sequence , bounds )
188
+ reverse = error_model .introduce_indels (reverse , "reverse" , sequence , bounds )
180
189
reverse = error_model .introduce_error_scores (reverse , "reverse" )
181
- reverse . seq = error_model .mut_sequence (reverse , "reverse" )
190
+ reverse = error_model .mut_sequence (reverse , "reverse" )
182
191
183
- return (forward , reverse )
192
+ return (forward , reverse , forward . annotations [ "mutations" ] + reverse . annotations [ "mutations" ] )
184
193
185
194
186
195
def to_fastq (generator , output ):
@@ -217,6 +226,7 @@ def worker_iterator(work, error_model, cpu_number, worker_prefix, seed, sequence
217
226
try :
218
227
forward_handle = open (f"{ worker_prefix } _R1.fastq" , "w" )
219
228
reverse_handle = open (f"{ worker_prefix } _R2.fastq" , "w" )
229
+ mutation_handle = open (f"{ worker_prefix } .vcf" , "w" )
220
230
except PermissionError as e :
221
231
logger .error ("Failed to write temporary output file(s): %s" % e )
222
232
sys .exit (1 )
@@ -225,7 +235,7 @@ def worker_iterator(work, error_model, cpu_number, worker_prefix, seed, sequence
225
235
random .seed (seed + cpu_number )
226
236
np .random .seed (seed + cpu_number )
227
237
228
- with forward_handle , reverse_handle :
238
+ with forward_handle , reverse_handle , mutation_handle :
229
239
for record , n_pairs , mode in work :
230
240
simulate_reads (
231
241
record = record ,
@@ -235,6 +245,7 @@ def worker_iterator(work, error_model, cpu_number, worker_prefix, seed, sequence
235
245
cpu_number = cpu_number ,
236
246
forward_handle = forward_handle ,
237
247
reverse_handle = reverse_handle ,
248
+ mutations_handle = mutation_handle ,
238
249
sequence_type = sequence_type ,
239
250
gc_bias = gc_bias ,
240
251
)
@@ -345,7 +356,7 @@ def generate_work_divider(
345
356
yield chunk_work
346
357
347
358
348
- def load_error_model (mode , seed , model , fragment_length , fragment_length_sd ):
359
+ def load_error_model (mode , seed , model , fragment_length , fragment_length_sd , store_mutations ):
349
360
"""
350
361
Load the error model based on the specified mode and parameters.
351
362
@@ -387,12 +398,12 @@ def load_error_model(mode, seed, model, fragment_length, fragment_length_sd):
387
398
npz = os .path .join (os .path .dirname (__file__ ), "profiles/MiSeq" )
388
399
else :
389
400
npz = model
390
- err_mod = kde .KDErrorModel (npz , fragment_length , fragment_length_sd )
401
+ err_mod = kde .KDErrorModel (npz , fragment_length , fragment_length_sd , store_mutations )
391
402
elif mode == "basic" :
392
403
if model is not None :
393
404
logger .warning ("--model %s will be ignored in --mode %s" % (model , mode ))
394
405
395
- err_mod = basic .BasicErrorModel (fragment_length , fragment_length_sd )
406
+ err_mod = basic .BasicErrorModel (fragment_length , fragment_length_sd , store_mutations )
396
407
elif mode == "perfect" :
397
408
if model is not None :
398
409
logger .warning ("--model %s will be ignored in --mode %s" % (model , mode ))
@@ -575,3 +586,28 @@ def load_readcount_or_abundance(
575
586
sys .exit (1 )
576
587
577
588
return readcount_dic , abundance_dic
589
+
590
+
591
def write_mutations(mutations, mutations_handle):
    """Write mutation records to a file as tab-separated VCF data lines.

    One line is written per mutation, using the eight fixed VCF columns in
    order: CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO. No header is
    written. Nothing is written when `mutations` is empty.

    Args:
        mutations (list): mutation dictionaries, each providing the keys
            "id", "position" (0-based), "ref", "alt" and "quality".
        mutations_handle (file): a file handle to write the mutations to.

    Raises:
        KeyError: if a mutation dictionary lacks one of the keys above.
    """
    # Build the lines lazily and hand them to the handle in a single call
    # instead of issuing one write() per record.
    mutations_handle.writelines(
        "\t".join(
            (
                str(vcf_dict["id"]),  # first column (CHROM)
                str(vcf_dict["position"] + 1),  # vcf files have 1-based index
                ".",  # ID column: no variant identifier is recorded
                # str() so a non-string reference base cannot break join()
                str(vcf_dict["ref"]),
                str(vcf_dict["alt"]),
                str(vcf_dict["quality"]),
                ".",  # FILTER: "." is the VCF missing-value marker
                ".",  # INFO: "." is the VCF missing-value marker
            )
        )
        + "\n"
        for vcf_dict in mutations
    )
0 commit comments