Skip to content

Commit 8633b9f

Browse files
committed
init storing mutations
1 parent fb425d1 commit 8633b9f

File tree

6 files changed

+85
-16
lines changed

6 files changed

+85
-16
lines changed

Diff for: iss/app.py

+19-2
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def generate_reads(args):
3434
logger.debug("Using verbose logger")
3535
logger.info("Starting iss generate")
3636

37-
error_model = load_error_model(args.mode, args.seed, args.model, args.fragment_length, args.fragment_length_sd)
37+
error_model = load_error_model(args.mode, args.seed, args.model, args.fragment_length, args.fragment_length_sd, args.store_mutations)
3838

3939
genome_list, genome_file = load_genomes(
4040
args.genomes, args.draft, args.ncbi, args.n_genomes_ncbi, args.output, args.n_genomes
@@ -104,7 +104,8 @@ def generate_reads(args):
104104
logger.error("iss generate interrupted: %s" % e)
105105
temp_R1 = [temp_file + "_R1.fastq" for temp_file in temp_file_list]
106106
temp_R2 = [temp_file + "_R2.fastq" for temp_file in temp_file_list]
107-
full_tmp_list = temp_R1 + temp_R2
107+
temp_mut = [temp_file + ".vcf" for temp_file in temp_file_list]
108+
full_tmp_list = temp_R1 + temp_R2 + temp_mut
108109
full_tmp_list.append(genome_file)
109110
if os.path.exists("%s.memmap" % args.output):
110111
full_tmp_list.append("%s.memmap" % args.output)
@@ -116,8 +117,15 @@ def generate_reads(args):
116117
# and reads were appended to the same temp file.
117118
temp_R1 = [temp_file + "_R1.fastq" for temp_file in temp_file_list]
118119
temp_R2 = [temp_file + "_R2.fastq" for temp_file in temp_file_list]
120+
temp_mut = [temp_file + ".vcf" for temp_file in temp_file_list] if args.store_mutations else []
119121
util.concatenate(temp_R1, args.output + "_R1.fastq")
120122
util.concatenate(temp_R2, args.output + "_R2.fastq")
123+
if args.store_mutations:
124+
util.concatenate(
125+
temp_mut,
126+
args.output + '.vcf',
127+
"##fileformat=VCFv4.1\n" + "\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"])
128+
)
121129
full_tmp_list = temp_R1 + temp_R2
122130
full_tmp_list.append(genome_file)
123131
if os.path.exists("%s.memmap" % args.output):
@@ -126,6 +134,8 @@ def generate_reads(args):
126134
if args.compress:
127135
util.compress(args.output + "_R1.fastq")
128136
util.compress(args.output + "_R2.fastq")
137+
if args.store_mutations:
138+
util.compress(args.output + '.vcf')
129139
logger.info("Read generation complete")
130140

131141

@@ -381,6 +391,13 @@ def main():
381391
type=int,
382392
help="Fragment length standard deviation for metagenomics sequencing (default: %(default)s).",
383393
)
394+
parser_gen.add_argument(
395+
'--store_mutations',
396+
'-M',
397+
action='store_true',
398+
default=False,
399+
help='Generates an additional VCF file with the mutations introduced in the reads',
400+
)
384401
parser_gen._optionals.title = "arguments"
385402
parser_gen.set_defaults(func=generate_reads)
386403

Diff for: iss/error_models/__init__.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -92,11 +92,22 @@ def mut_sequence(self, record, orientation):
9292
position = 0
9393
for nucl, qual in zip(mutable_seq, quality_list):
9494
if random.random() > util.phred_to_prob(qual) and nucl.upper() not in "RYWSMKHBVDN":
95-
mutable_seq[position] = str(
95+
mutated_nuc = str(
9696
np.random.choice(nucl_choices[position][nucl.upper()][0], p=nucl_choices[position][nucl.upper()][1])
9797
)
98+
if self.store_mutations and mutated_nuc != record.annotations["original"][position]:
99+
record.annotations["mutations"].append({
100+
"id": record.id,
101+
"position": position,
102+
"ref": mutable_seq[position],
103+
"alt": mutated_nuc,
104+
"quality": qual,
105+
"type": "snp",
106+
})
107+
mutable_seq[position] = mutated_nuc
98108
position += 1
99-
return Seq(mutable_seq)
109+
record.seq = Seq(mutable_seq)
110+
return record
100111

101112
def adjust_seq_length(self, mut_seq, orientation, full_sequence, bounds):
102113
"""Truncate or Extend reads to make them fit the read length

Diff for: iss/error_models/basic.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,13 @@ class BasicErrorModel(ErrorModel):
1515
equal between all nucleotides.
1616
"""
1717

18-
def __init__(self, fragment_length=None, fragment_sd=None):
18+
def __init__(self, fragment_length=None, fragment_sd=None, store_mutations=False):
1919
super().__init__()
2020
self.read_length = 125
2121
self.insert_size = 200
2222
self.fragment_length = fragment_length
2323
self.fragment_sd = fragment_sd
24+
self.store_mutations = store_mutations
2425
self.quality_forward = self.quality_reverse = 30
2526
self.subst_choices_for = self.subst_choices_rev = [
2627
{

Diff for: iss/error_models/kde.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,11 @@ class KDErrorModel(ErrorModel):
2121
- the insertion and deletion rates for each position (for R1 and R2)
2222
"""
2323

24-
def __init__(self, npz_path, fragment_length=None, fragment_sd=None):
24+
def __init__(self, npz_path, fragment_length=None, fragment_sd=None, store_mutations=False):
2525
super().__init__()
2626
self.npz_path = npz_path
2727
self.error_profile = self.load_npz(npz_path, "kde")
28+
self.store_mutations = store_mutations
2829

2930
self.read_length = self.error_profile["read_length"]
3031
self.i_size_cdf = self.error_profile["insert_size"]

Diff for: iss/generator.py

+46-9
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ def simulate_reads(
2525
cpu_number,
2626
forward_handle,
2727
reverse_handle,
28+
mutations_handle,
2829
sequence_type,
2930
gc_bias=False,
3031
mode="default",
@@ -42,6 +43,7 @@ def simulate_reads(
4243
function. Is used for naming the output file
4344
forward_handle (file): a file handle to write the forward reads to
4445
reverse_handle (file): a file handle to write the reverse reads to
46+
mutations_handle (file): a file handle to write the mutations to
4547
sequencing_type (str): metagenomics or amplicon sequencing used
4648
gc_bias (bool): if set, the function may skip a read due to abnormal
4749
GC content
@@ -56,11 +58,12 @@ def simulate_reads(
5658

5759
logger.debug("Cpu #%s: Generating %s read pairs" % (cpu_number, n_pairs))
5860

59-
for forward_record, reverse_record in reads_generator(
61+
for forward_record, reverse_record, mutations in reads_generator(
6062
n_pairs, record, error_model, cpu_number, gc_bias, sequence_type
6163
):
6264
SeqIO.write(forward_record, forward_handle, "fastq-sanger")
6365
SeqIO.write(reverse_record, reverse_handle, "fastq-sanger")
66+
write_mutations(mutations, mutations_handle)
6467

6568

6669
def reads_generator(n_pairs, record, error_model, cpu_number, gc_bias, sequence_type):
@@ -69,7 +72,9 @@ def reads_generator(n_pairs, record, error_model, cpu_number, gc_bias, sequence_
6972
i = 0
7073
while i < n_pairs:
7174
try:
72-
forward, reverse = simulate_read(record, error_model, i, cpu_number, sequence_type)
75+
# forward, reverse = simulate_read(record, error_model, i, cpu_number, sequence_type)
76+
forward, reverse, mutations = simulate_read(record, error_model, i, cpu_number, sequence_type)
77+
7378
except AssertionError:
7479
logger.warning("%s shorter than read length for this ErrorModel" % record.id)
7580
logger.warning("Skipping %s. You will have less reads than specified" % record.id)
@@ -79,15 +84,15 @@ def reads_generator(n_pairs, record, error_model, cpu_number, gc_bias, sequence_
7984
stiched_seq = forward.seq + reverse.seq
8085
gc_content = gc_fraction(stiched_seq)
8186
if 40 < gc_content < 60:
82-
yield (forward, reverse)
87+
yield (forward, reverse, mutations)
8388
i += 1
8489
elif np.random.rand() < 0.90:
85-
yield (forward, reverse)
90+
yield (forward, reverse, mutations)
8691
i += 1
8792
else:
8893
continue
8994
else:
90-
yield (forward, reverse)
95+
yield (forward, reverse, mutations)
9196
i += 1
9297

9398

@@ -145,6 +150,9 @@ def simulate_read(record, error_model, i, cpu_number, sequence_type):
145150
forward = SeqRecord(
146151
Seq(str(sequence[forward_start:forward_end])), id="%s_%s_%s/1" % (header, i, cpu_number), description=""
147152
)
153+
forward.annotations["mutations"] = []
154+
forward.annotations["original"] = str(forward.seq)
155+
148156
# add the indels, the qual scores and modify the record accordingly
149157
forward.seq = error_model.introduce_indels(forward, "forward", sequence, bounds)
150158
forward = error_model.introduce_error_scores(forward, "forward")
@@ -174,13 +182,15 @@ def simulate_read(record, error_model, i, cpu_number, sequence_type):
174182
id="%s_%s_%s/2" % (header, i, cpu_number),
175183
description="",
176184
)
185+
reverse.annotations["mutations"] = []
186+
reverse.annotations["original"] = str(reverse.seq)
177187

178188
# add the indels, the qual scores and modify the record accordingly
179189
reverse.seq = error_model.introduce_indels(reverse, "reverse", sequence, bounds)
180190
reverse = error_model.introduce_error_scores(reverse, "reverse")
181191
reverse.seq = error_model.mut_sequence(reverse, "reverse")
182192

183-
return (forward, reverse)
193+
return (forward, reverse) # mutations
184194

185195

186196
def to_fastq(generator, output):
@@ -217,6 +227,7 @@ def worker_iterator(work, error_model, cpu_number, worker_prefix, seed, sequence
217227
try:
218228
forward_handle = open(f"{worker_prefix}_R1.fastq", "w")
219229
reverse_handle = open(f"{worker_prefix}_R2.fastq", "w")
230+
mutation_handle = open(f"{worker_prefix}.vcf", "w")
220231
except PermissionError as e:
221232
logger.error("Failed to write temporary output file(s): %s" % e)
222233
sys.exit(1)
@@ -235,6 +246,7 @@ def worker_iterator(work, error_model, cpu_number, worker_prefix, seed, sequence
235246
cpu_number=cpu_number,
236247
forward_handle=forward_handle,
237248
reverse_handle=reverse_handle,
249+
mutations_handle=mutation_handle,
238250
sequence_type=sequence_type,
239251
gc_bias=gc_bias,
240252
)
@@ -345,7 +357,7 @@ def generate_work_divider(
345357
yield chunk_work
346358

347359

348-
def load_error_model(mode, seed, model, fragment_length, fragment_length_sd):
360+
def load_error_model(mode, seed, model, fragment_length, fragment_length_sd, store_mutations):
349361
"""
350362
Load the error model based on the specified mode and parameters.
351363
@@ -387,12 +399,12 @@ def load_error_model(mode, seed, model, fragment_length, fragment_length_sd):
387399
npz = os.path.join(os.path.dirname(__file__), "profiles/MiSeq")
388400
else:
389401
npz = model
390-
err_mod = kde.KDErrorModel(npz, fragment_length, fragment_length_sd)
402+
err_mod = kde.KDErrorModel(npz, fragment_length, fragment_length_sd, store_mutations)
391403
elif mode == "basic":
392404
if model is not None:
393405
logger.warning("--model %s will be ignored in --mode %s" % (model, mode))
394406

395-
err_mod = basic.BasicErrorModel(fragment_length, fragment_length_sd)
407+
err_mod = basic.BasicErrorModel(fragment_length, fragment_length_sd, store_mutations)
396408
elif mode == "perfect":
397409
if model is not None:
398410
logger.warning("--model %s will be ignored in --mode %s" % (model, mode))
@@ -575,3 +587,28 @@ def load_readcount_or_abundance(
575587
sys.exit(1)
576588

577589
return readcount_dic, abundance_dic
590+
591+
592+
def write_mutations(mutations, mutations_handle):
593+
"""Write mutations to a file
594+
595+
Args:
596+
mutations (list): List of mutations.
597+
mutations_handle (file): File handle to write the mutations to.
598+
"""
599+
for vcf_dict in mutations:
600+
# pass
601+
mutations_handle.write(
602+
"\t".join(
603+
[
604+
str(vcf_dict["id"]),
605+
str(vcf_dict["position"] + 1), # vcf files have 1-based index
606+
".",
607+
vcf_dict["ref"],
608+
str(vcf_dict["alt"]),
609+
str(vcf_dict["quality"]),
610+
"",
611+
""
612+
]
613+
) + "\n"
614+
)

Diff for: iss/util.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ def reservoir(records, record_list, n=None):
210210
yield record
211211

212212

213-
def concatenate(file_list, output):
213+
def concatenate(file_list, output, header = None):
214214
"""Concatenate files together
215215
216216
Args:
@@ -226,6 +226,8 @@ def concatenate(file_list, output):
226226
sys.exit(1)
227227

228228
with out_file:
229+
if header is not None:
230+
out_file.write(str.encode(header + "\n"))
229231
for file_name in file_list:
230232
if file_name is not None:
231233
with open(file_name, "rb") as f:

0 commit comments

Comments
 (0)