Skip to content

Commit e8ff9ca

Browse files
authored
Merge pull request #250 from HadrienG/storing-mutations
Saving inserted mutations/sequencing errors to a vcf file
2 parents fb425d1 + 886fdea commit e8ff9ca

File tree

8 files changed

+143
-40
lines changed

8 files changed

+143
-40
lines changed

Diff for: iss/app.py

+25-3
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,9 @@ def generate_reads(args):
3434
logger.debug("Using verbose logger")
3535
logger.info("Starting iss generate")
3636

37-
error_model = load_error_model(args.mode, args.seed, args.model, args.fragment_length, args.fragment_length_sd)
37+
error_model = load_error_model(
38+
args.mode, args.seed, args.model, args.fragment_length, args.fragment_length_sd, args.store_mutations
39+
)
3840

3941
genome_list, genome_file = load_genomes(
4042
args.genomes, args.draft, args.ncbi, args.n_genomes_ncbi, args.output, args.n_genomes
@@ -54,6 +56,9 @@ def generate_reads(args):
5456
error_model,
5557
)
5658

59+
if args.store_mutations:
60+
logger.info(f"Storing inserted sequence errors in {args.output}.vcf")
61+
5762
logger.info("Using %s cpus for read generation" % args.cpus)
5863

5964
if readcount_dic is not None:
@@ -104,7 +109,8 @@ def generate_reads(args):
104109
logger.error("iss generate interrupted: %s" % e)
105110
temp_R1 = [temp_file + "_R1.fastq" for temp_file in temp_file_list]
106111
temp_R2 = [temp_file + "_R2.fastq" for temp_file in temp_file_list]
107-
full_tmp_list = temp_R1 + temp_R2
112+
temp_mut = [temp_file + ".vcf" for temp_file in temp_file_list]
113+
full_tmp_list = temp_R1 + temp_R2 + temp_mut
108114
full_tmp_list.append(genome_file)
109115
if os.path.exists("%s.memmap" % args.output):
110116
full_tmp_list.append("%s.memmap" % args.output)
@@ -116,16 +122,25 @@ def generate_reads(args):
116122
# and reads were appended to the same temp file.
117123
temp_R1 = [temp_file + "_R1.fastq" for temp_file in temp_file_list]
118124
temp_R2 = [temp_file + "_R2.fastq" for temp_file in temp_file_list]
125+
temp_mut = [temp_file + ".vcf" for temp_file in temp_file_list] if args.store_mutations else []
119126
util.concatenate(temp_R1, args.output + "_R1.fastq")
120127
util.concatenate(temp_R2, args.output + "_R2.fastq")
121-
full_tmp_list = temp_R1 + temp_R2
128+
if args.store_mutations:
129+
util.concatenate(
130+
temp_mut,
131+
args.output + ".vcf",
132+
"##fileformat=VCFv4.1\n" + "\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]),
133+
)
134+
full_tmp_list = temp_R1 + temp_R2 + temp_mut
122135
full_tmp_list.append(genome_file)
123136
if os.path.exists("%s.memmap" % args.output):
124137
full_tmp_list.append("%s.memmap" % args.output)
125138
util.cleanup(full_tmp_list)
126139
if args.compress:
127140
util.compress(args.output + "_R1.fastq")
128141
util.compress(args.output + "_R2.fastq")
142+
if args.store_mutations:
143+
util.compress(args.output + ".vcf")
129144
logger.info("Read generation complete")
130145

131146

@@ -381,6 +396,13 @@ def main():
381396
type=int,
382397
help="Fragment length standard deviation for metagenomics sequencing (default: %(default)s).",
383398
)
399+
parser_gen.add_argument(
400+
"--store_mutations",
401+
"-M",
402+
action="store_true",
403+
default=False,
404+
help="Generates an additional VCF file with the mutations introduced in the reads",
405+
)
384406
parser_gen._optionals.title = "arguments"
385407
parser_gen.set_defaults(func=generate_reads)
386408

Diff for: iss/error_models/__init__.py

+42-5
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def mut_sequence(self, record, orientation):
7878
'reverse'
7979
8080
Returns:
81-
Seq: a sequence
81+
SeqRecord: the read record with substitution errors
8282
"""
8383

8484
# get the right subst_matrix
@@ -92,11 +92,24 @@ def mut_sequence(self, record, orientation):
9292
position = 0
9393
for nucl, qual in zip(mutable_seq, quality_list):
9494
if random.random() > util.phred_to_prob(qual) and nucl.upper() not in "RYWSMKHBVDN":
95-
mutable_seq[position] = str(
95+
mutated_nuc = str(
9696
np.random.choice(nucl_choices[position][nucl.upper()][0], p=nucl_choices[position][nucl.upper()][1])
9797
)
98+
if self.store_mutations and mutated_nuc != record.annotations["original"][position]:
99+
record.annotations["mutations"].append(
100+
{
101+
"id": record.id,
102+
"position": position,
103+
"ref": mutable_seq[position],
104+
"alt": mutated_nuc,
105+
"quality": qual,
106+
"type": "sub",
107+
}
108+
)
109+
mutable_seq[position] = mutated_nuc
98110
position += 1
99-
return Seq(mutable_seq)
111+
record.seq = Seq(mutable_seq)
112+
return record
100113

101114
def adjust_seq_length(self, mut_seq, orientation, full_sequence, bounds):
102115
"""Truncate or Extend reads to make them fit the read length
@@ -158,7 +171,7 @@ def introduce_indels(self, record, orientation, full_seq, bounds):
158171
bounds (tuple): the position of the read in the full_sequence
159172
160173
Returns:
161-
Seq: a sequence with (eventually) indels
174+
SeqRecord: a sequence record with indel errors
162175
"""
163176

164177
# get the right indel arrays
@@ -181,11 +194,35 @@ def introduce_indels(self, record, orientation, full_seq, bounds):
181194
if random.random() < prob:
182195
# we want to insert after the base read
183196
mutable_seq.insert(position + 1, str(nucl_to_insert))
197+
if self.store_mutations:
198+
record.annotations["mutations"].append(
199+
{
200+
"id": record.id,
201+
"position": position,
202+
"ref": mutable_seq[position],
203+
"alt": mutable_seq[position] + nucl_to_insert,
204+
"quality": ".",
205+
"type": "ins",
206+
}
207+
)
208+
184209
if random.random() < deletions[position][mutable_seq[nucl].upper()]:
185210
mutable_seq.pop(position)
211+
if self.store_mutations:
212+
record.annotations["mutations"].append(
213+
{
214+
"id": record.id,
215+
"position": position,
216+
"ref": mutable_seq[position],
217+
"alt": ".",
218+
"quality": ".",
219+
"type": "del",
220+
}
221+
)
186222
position += 1
187223
except IndexError:
188224
continue
189225

190226
seq = self.adjust_seq_length(mutable_seq, orientation, full_seq, bounds)
191-
return seq
227+
record.seq = seq
228+
return record

Diff for: iss/error_models/basic.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,13 @@ class BasicErrorModel(ErrorModel):
1515
equal between all nucleotides.
1616
"""
1717

18-
def __init__(self, fragment_length=None, fragment_sd=None):
18+
def __init__(self, fragment_length=None, fragment_sd=None, store_mutations=False):
1919
super().__init__()
2020
self.read_length = 125
2121
self.insert_size = 200
2222
self.fragment_length = fragment_length
2323
self.fragment_sd = fragment_sd
24+
self.store_mutations = store_mutations
2425
self.quality_forward = self.quality_reverse = 30
2526
self.subst_choices_for = self.subst_choices_rev = [
2627
{

Diff for: iss/error_models/kde.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,11 @@ class KDErrorModel(ErrorModel):
2121
- the insertion and deletion rates for each position (for R1 and R2)
2222
"""
2323

24-
def __init__(self, npz_path, fragment_length=None, fragment_sd=None):
24+
def __init__(self, npz_path, fragment_length=None, fragment_sd=None, store_mutations=False):
2525
super().__init__()
2626
self.npz_path = npz_path
2727
self.error_profile = self.load_npz(npz_path, "kde")
28+
self.store_mutations = store_mutations
2829

2930
self.read_length = self.error_profile["read_length"]
3031
self.i_size_cdf = self.error_profile["insert_size"]

Diff for: iss/generator.py

+50-14
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ def simulate_reads(
2525
cpu_number,
2626
forward_handle,
2727
reverse_handle,
28+
mutations_handle,
2829
sequence_type,
2930
gc_bias=False,
3031
mode="default",
@@ -42,6 +43,7 @@ def simulate_reads(
4243
function. Is used for naming the output file
4344
forward_handle (file): a file handle to write the forward reads to
4445
reverse_handle (file): a file handle to write the reverse reads to
46+
mutations_handle (file): a file handle to write the mutations to
4547
sequencing_type (str): metagenomics or amplicon sequencing used
4648
gc_bias (bool): if set, the function may skip a read due to abnormal
4749
GC content
@@ -56,11 +58,12 @@ def simulate_reads(
5658

5759
logger.debug("Cpu #%s: Generating %s read pairs" % (cpu_number, n_pairs))
5860

59-
for forward_record, reverse_record in reads_generator(
61+
for forward_record, reverse_record, mutations in reads_generator(
6062
n_pairs, record, error_model, cpu_number, gc_bias, sequence_type
6163
):
6264
SeqIO.write(forward_record, forward_handle, "fastq-sanger")
6365
SeqIO.write(reverse_record, reverse_handle, "fastq-sanger")
66+
write_mutations(mutations, mutations_handle)
6467

6568

6669
def reads_generator(n_pairs, record, error_model, cpu_number, gc_bias, sequence_type):
@@ -69,7 +72,8 @@ def reads_generator(n_pairs, record, error_model, cpu_number, gc_bias, sequence_
6972
i = 0
7073
while i < n_pairs:
7174
try:
72-
forward, reverse = simulate_read(record, error_model, i, cpu_number, sequence_type)
75+
forward, reverse, mutations = simulate_read(record, error_model, i, cpu_number, sequence_type)
76+
7377
except AssertionError:
7478
logger.warning("%s shorter than read length for this ErrorModel" % record.id)
7579
logger.warning("Skipping %s. You will have less reads than specified" % record.id)
@@ -79,15 +83,15 @@ def reads_generator(n_pairs, record, error_model, cpu_number, gc_bias, sequence_
7983
stiched_seq = forward.seq + reverse.seq
8084
gc_content = gc_fraction(stiched_seq)
8185
if 40 < gc_content < 60:
82-
yield (forward, reverse)
86+
yield (forward, reverse, mutations)
8387
i += 1
8488
elif np.random.rand() < 0.90:
85-
yield (forward, reverse)
89+
yield (forward, reverse, mutations)
8690
i += 1
8791
else:
8892
continue
8993
else:
90-
yield (forward, reverse)
94+
yield (forward, reverse, mutations)
9195
i += 1
9296

9397

@@ -145,10 +149,13 @@ def simulate_read(record, error_model, i, cpu_number, sequence_type):
145149
forward = SeqRecord(
146150
Seq(str(sequence[forward_start:forward_end])), id="%s_%s_%s/1" % (header, i, cpu_number), description=""
147151
)
152+
forward.annotations["mutations"] = []
153+
forward.annotations["original"] = str(forward.seq)
154+
148155
# add the indels, the qual scores and modify the record accordingly
149-
forward.seq = error_model.introduce_indels(forward, "forward", sequence, bounds)
156+
forward = error_model.introduce_indels(forward, "forward", sequence, bounds)
150157
forward = error_model.introduce_error_scores(forward, "forward")
151-
forward.seq = error_model.mut_sequence(forward, "forward")
158+
forward = error_model.mut_sequence(forward, "forward")
152159

153160
# generate the reverse read
154161
# assign start position reverse read
@@ -174,13 +181,15 @@ def simulate_read(record, error_model, i, cpu_number, sequence_type):
174181
id="%s_%s_%s/2" % (header, i, cpu_number),
175182
description="",
176183
)
184+
reverse.annotations["mutations"] = []
185+
reverse.annotations["original"] = str(reverse.seq)
177186

178187
# add the indels, the qual scores and modify the record accordingly
179-
reverse.seq = error_model.introduce_indels(reverse, "reverse", sequence, bounds)
188+
reverse = error_model.introduce_indels(reverse, "reverse", sequence, bounds)
180189
reverse = error_model.introduce_error_scores(reverse, "reverse")
181-
reverse.seq = error_model.mut_sequence(reverse, "reverse")
190+
reverse = error_model.mut_sequence(reverse, "reverse")
182191

183-
return (forward, reverse)
192+
return (forward, reverse, forward.annotations["mutations"] + reverse.annotations["mutations"])
184193

185194

186195
def to_fastq(generator, output):
@@ -217,6 +226,7 @@ def worker_iterator(work, error_model, cpu_number, worker_prefix, seed, sequence
217226
try:
218227
forward_handle = open(f"{worker_prefix}_R1.fastq", "w")
219228
reverse_handle = open(f"{worker_prefix}_R2.fastq", "w")
229+
mutation_handle = open(f"{worker_prefix}.vcf", "w")
220230
except PermissionError as e:
221231
logger.error("Failed to write temporary output file(s): %s" % e)
222232
sys.exit(1)
@@ -225,7 +235,7 @@ def worker_iterator(work, error_model, cpu_number, worker_prefix, seed, sequence
225235
random.seed(seed + cpu_number)
226236
np.random.seed(seed + cpu_number)
227237

228-
with forward_handle, reverse_handle:
238+
with forward_handle, reverse_handle, mutation_handle:
229239
for record, n_pairs, mode in work:
230240
simulate_reads(
231241
record=record,
@@ -235,6 +245,7 @@ def worker_iterator(work, error_model, cpu_number, worker_prefix, seed, sequence
235245
cpu_number=cpu_number,
236246
forward_handle=forward_handle,
237247
reverse_handle=reverse_handle,
248+
mutations_handle=mutation_handle,
238249
sequence_type=sequence_type,
239250
gc_bias=gc_bias,
240251
)
@@ -345,7 +356,7 @@ def generate_work_divider(
345356
yield chunk_work
346357

347358

348-
def load_error_model(mode, seed, model, fragment_length, fragment_length_sd):
359+
def load_error_model(mode, seed, model, fragment_length, fragment_length_sd, store_mutations):
349360
"""
350361
Load the error model based on the specified mode and parameters.
351362
@@ -387,12 +398,12 @@ def load_error_model(mode, seed, model, fragment_length, fragment_length_sd):
387398
npz = os.path.join(os.path.dirname(__file__), "profiles/MiSeq")
388399
else:
389400
npz = model
390-
err_mod = kde.KDErrorModel(npz, fragment_length, fragment_length_sd)
401+
err_mod = kde.KDErrorModel(npz, fragment_length, fragment_length_sd, store_mutations)
391402
elif mode == "basic":
392403
if model is not None:
393404
logger.warning("--model %s will be ignored in --mode %s" % (model, mode))
394405

395-
err_mod = basic.BasicErrorModel(fragment_length, fragment_length_sd)
406+
err_mod = basic.BasicErrorModel(fragment_length, fragment_length_sd, store_mutations)
396407
elif mode == "perfect":
397408
if model is not None:
398409
logger.warning("--model %s will be ignored in --mode %s" % (model, mode))
@@ -575,3 +586,28 @@ def load_readcount_or_abundance(
575586
sys.exit(1)
576587

577588
return readcount_dic, abundance_dic
589+
590+
591+
def write_mutations(mutations, mutations_handle):
592+
"""Write mutations to a file
593+
594+
Args:
595+
mutations (list): List of mutations.
596+
mutations_handle (file): File handle to write the mutations to.
597+
"""
598+
for vcf_dict in mutations:
599+
mutations_handle.write(
600+
"\t".join(
601+
[
602+
str(vcf_dict["id"]),
603+
str(vcf_dict["position"] + 1), # vcf files have 1-based index
604+
".",
605+
vcf_dict["ref"],
606+
str(vcf_dict["alt"]),
607+
str(vcf_dict["quality"]),
608+
"",
609+
"",
610+
]
611+
)
612+
+ "\n"
613+
)

Diff for: iss/test/test_error_model.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def test_mut_sequence():
5252

5353
read = SeqRecord(Seq(str("AAAAA" * 25)), id="read_1", description="test read")
5454
read.letter_annotations["phred_quality"] = [5] * 125
55-
read.seq = err_mod.mut_sequence(read, "forward")
55+
read = err_mod.mut_sequence(read, "forward")
5656
assert str(read.seq[:10]) == "AAAACAGAAA"
5757

5858

@@ -66,7 +66,7 @@ def test_introduce_indels():
6666
bounds = (5, 130)
6767
read = SeqRecord(Seq(str("ATATA" * 25)), id="read_1", description="test read")
6868
ref_genome = SeqRecord(Seq(str("ATATA" * 100)), id="ref_genome", description="test reference")
69-
read.seq = err_mod.introduce_indels(read, "forward", ref_genome, bounds)
69+
read = err_mod.introduce_indels(read, "forward", ref_genome, bounds)
7070
assert len(read.seq) == 125
7171
assert read.seq[:10] == "ATGATAATAT"
7272

@@ -81,7 +81,7 @@ def test_adjust_seq_length_extend():
8181
bounds = (480, 500)
8282
read = SeqRecord(Seq(str("ATTTA" * 4)), id="read_1", description="test read")
8383
ref_genome = SeqRecord(Seq(str("ATTTA" * 100)), id="ref_genome", description="test reference")
84-
read.seq = err_mod.introduce_indels(read, "forward", ref_genome, bounds)
84+
read = err_mod.introduce_indels(read, "forward", ref_genome, bounds)
8585
assert len(read.seq) == 20
8686
assert read.seq[:10] == "TTAATTTAAT"
8787
assert read.seq[10:] == "TTAATTTAAA"
@@ -100,7 +100,7 @@ def test_introduce_indels_rev():
100100

101101
ref_genome = SeqRecord(Seq("GG" + str("GTACC" * 100) + "GG"), id="ref_genome", description="test reference")
102102
read = SeqRecord(Seq(rev_comp(str(ref_genome.seq[484:504]))), id="read_1", description="test read")
103-
read.seq = err_mod.introduce_indels(read, "reverse", ref_genome, bounds)
103+
read = err_mod.introduce_indels(read, "reverse", ref_genome, bounds)
104104
assert len(read.seq) == 20
105105
assert read.seq == "CGTACGGTACGGTACGGTAC"
106106

0 commit comments

Comments
 (0)