Merge pull request #242 from ENPICOM/improve-amplicon-seq
Additional Amplicon sequencing features
HadrienG authored Oct 6, 2023
2 parents 9e75b2a + cb73c36 · commit 6f2368a
Showing 5 changed files with 126 additions and 72 deletions.
5 changes: 5 additions & 0 deletions data/readcounts.txt
@@ -0,0 +1,5 @@
+amplicon_A 2
+amplicon_T 2
+amplicon_GC 4
+amplicon_ATCG 1
+amplicon_TA 1
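Each line of the new fixture is a whitespace-separated pair: a FASTA record id and the exact number of read pairs to generate for it (10 in total here). This is the input format consumed by parse_readcount_file in iss/abundance.py and exercised by the new test in iss/test/test_abundance.py below.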
27 changes: 27 additions & 0 deletions iss/abundance.py
@@ -10,6 +10,33 @@
import numpy as np


+def parse_readcount_file(readcount_file):
+    logger = logging.getLogger(__name__)
+
+    readcount_dic = {}
+    try:
+        assert os.stat(readcount_file).st_size != 0
+        f = open(readcount_file, 'r')
+    except (IOError, OSError) as e:
+        logger.error('Failed to open file: %s' % e)
+        sys.exit(1)
+    except AssertionError:
+        logger.error('File seems empty: %s' % readcount_file)
+        sys.exit(1)
+    else:
+        with f:
+            for line in f:
+                try:
+                    genome_id = line.split()[0]
+                    read_count = int(line.split()[1])
+                except (IndexError, ValueError) as e:
+                    logger.error('Failed to read file: %s' % e)
+                    sys.exit(1)
+                else:
+                    readcount_dic[genome_id] = read_count
+    return readcount_dic


def parse_abundance_file(abundance_file):
"""Parse an abundance or coverage file
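For orientation, a minimal sketch of how the new parser is consumed; parse_readcount_file and the fixture path come straight from this diff, while the surrounding snippet is illustrative only:

    from iss import abundance

    # Maps each FASTA record id to its requested read-pair count.
    readcount_dic = abundance.parse_readcount_file('data/readcounts.txt')
    assert readcount_dic['amplicon_GC'] == 4
    print(sum(readcount_dic.values()))  # 10; app.py derives n_reads this way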
148 changes: 82 additions & 66 deletions iss/app.py
@@ -141,8 +141,17 @@ def generate_reads(args):
            'lognormal': abundance.lognormal,
            'zero_inflated_lognormal': abundance.zero_inflated_lognormal
        }
+        if args.readcount_file:
+            logger.info('Using readcount file: %s' % args.readcount_file)
+            logger.warning('--readcount_file disables --n_reads, n_reads will be calculated from the readcount file')
+            if args.draft:
+                raise RuntimeError(
+                    "readcount_file is only supported using --genomes, not --draft"
+                )
+            readcount_dic = abundance.parse_readcount_file(args.readcount_file)
+
        # read the abundance file
-        if args.abundance_file:
+        elif args.abundance_file:
            logger.info('Using abundance file: %s' % args.abundance_file)
            if args.draft:
                abundance_dic_short = abundance.parse_abundance_file(
@@ -212,13 +221,16 @@ def generate_reads(args):
                args.abundance](genome_list)
            abundance.to_file(abundance_dic, args.output)
        else:
-            logger.error('Could not get abundance')
+            logger.error('Could not get abundance, coverage or readcount information')
            sys.exit(1)

        cpus = args.cpus
        logger.info('Using %s cpus for read generation' % cpus)

-        if not (args.coverage or args.coverage_file):
+        if args.readcount_file:
+            n_reads = sum(readcount_dic.values())
+            logger.info('Generating %s reads' % n_reads)
+        elif not (args.coverage or args.coverage_file):
            n_reads = util.convert_n_reads(args.n_reads)
            logger.info('Generating %s reads' % n_reads)
@@ -228,16 +240,16 @@
        with f:
            fasta_file = SeqIO.parse(f, 'fasta')
            total_reads_generated = 0
-            total_reads_generated_unrouned = 0
+            total_reads_generated_unrounded = 0
            for record in fasta_file:
                # generate reads for records
-                try:
-                    species_abundance = abundance_dic[record.id]
-                except KeyError as e:
-                    logger.error(
-                        'Fasta record not found in abundance file: %s' % e)
-                    sys.exit(1)
+                if args.readcount_file:
+                    assert record.id in readcount_dic, f"Record {record.id} not found in readcount file"
+                    n_pairs = n_pairs_unrounded = readcount_dic[record.id]
+                else:
+                    assert record.id in abundance_dic, f"Record {record.id} not found in abundance file"
+                    species_abundance = abundance_dic[record.id]

                logger.info('Generating reads for record: %s'
                            % record.id)
                genome_size = len(record.seq)
@@ -256,62 +268,61 @@
                                     err_mod.read_length) / 2
                n_pairs = round(n_pairs_unrounded)

-                # check that the rounding does not cause to drop
-                # read pairs
-                total_reads_generated_unrouned += n_pairs_unrounded
-                total_reads_generated += n_pairs
-                if round(total_reads_generated_unrouned) > \
-                        total_reads_generated:
-                    logger.debug(
-                        "Adding a pair to correct rounding error")
-                    n_pairs += 1
-                    total_reads_generated += 1
-
-                # skip record if n_reads == 0
-                if n_pairs == 0:
-                    continue
-
-                # exact n_reads for each cpus
-                if n_pairs % cpus == 0:
-                    n_pairs_per_cpu = [(n_pairs // cpus)
-                                       for _ in range(cpus)]
-                else:
-                    n_pairs_per_cpu = [(n_pairs // cpus)
-                                       for _ in range(cpus)]
-                    n_pairs_per_cpu[-1] += n_pairs % cpus
-
-                # due to a bug in multiprocessing
-                # https://bugs.python.org/issue17560
-                # we can't send records taking more than 2**31 bytes
-                # through serialisation.
-                # In those cases we use memmapping
-                if sys.getsizeof(str(record.seq)) >= 2**31 - 1:
-                    logger.warning(
-                        "record %s unusually big." % record.id)
-                    logger.warning("Using a memory map.")
-                    mode = "memmap"
-
-                    record_mmap = "%s.memmap" % args.output
-                    if os.path.exists(record_mmap):
-                        os.unlink(record_mmap)
-                    util.dump(record, record_mmap)
-                    del record
-                    record = record_mmap
-                    gc.collect()
-                else:
-                    mode = "default"
-
-                record_file_name_list = Parallel(
-                    n_jobs=cpus)(
-                        delayed(generator.reads)(
-                            record, err_mod,
-                            n_pairs_per_cpu[i], i, args.output,
-                            args.seed, args.sequence_type,
-                            args.gc_bias, mode) for i in range(cpus))
-                temp_file_list.extend(record_file_name_list)
+                # check that the rounding does not cause to drop
+                # read pairs
+                total_reads_generated_unrounded += n_pairs_unrounded
+                total_reads_generated += n_pairs
+                if round(total_reads_generated_unrounded) > \
+                        total_reads_generated:
+                    logger.debug(
+                        "Adding a pair to correct rounding error")
+                    n_pairs += 1
+                    total_reads_generated += 1
+
+                # skip record if n_reads == 0
+                if n_pairs == 0:
+                    continue
+
+                # exact n_reads for each cpus
+                if n_pairs % cpus == 0:
+                    n_pairs_per_cpu = [(n_pairs // cpus)
+                                       for _ in range(cpus)]
+                else:
+                    n_pairs_per_cpu = [(n_pairs // cpus)
+                                       for _ in range(cpus)]
+                    n_pairs_per_cpu[-1] += n_pairs % cpus
+
+                # due to a bug in multiprocessing
+                # https://bugs.python.org/issue17560
+                # we can't send records taking more than 2**31 bytes
+                # through serialisation.
+                # In those cases we use memmapping
+                if sys.getsizeof(str(record.seq)) >= 2**31 - 1:
+                    logger.warning(
+                        "record %s unusually big." % record.id)
+                    logger.warning("Using a memory map.")
+                    mode = "memmap"
+
+                    record_mmap = "%s.memmap" % args.output
+                    if os.path.exists(record_mmap):
+                        os.unlink(record_mmap)
+                    util.dump(record, record_mmap)
+                    del record
+                    record = record_mmap
+                    gc.collect()
+                else:
+                    mode = "default"
+
+                record_file_name_list = Parallel(
+                    n_jobs=cpus)(
+                        delayed(generator.reads)(
+                            record, err_mod,
+                            n_pairs_per_cpu[i], i, args.output,
+                            args.seed, args.sequence_type,
+                            args.gc_bias, mode) for i in range(cpus))
+                temp_file_list.extend(record_file_name_list)
        except KeyboardInterrupt as e:
            logger.error('iss generate interrupted: %s' % e)
-            temp_file_unique = list(set(temp_file_list))
            temp_R1 = [temp_file + '_R1.fastq' for temp_file in temp_file_list]
            temp_R2 = [temp_file + '_R2.fastq' for temp_file in temp_file_list]
            full_tmp_list = temp_R1 + temp_R2
@@ -324,7 +335,6 @@ def generate_reads(args):
    # remove the duplicates in file list and cleanup
    # we remove the duplicates in case two records had the same header
    # and reads were appended to the same temp file.
-    temp_file_unique = list(set(temp_file_list))
    temp_R1 = [temp_file + '_R1.fastq' for temp_file in temp_file_list]
    temp_R2 = [temp_file + '_R2.fastq' for temp_file in temp_file_list]
    util.concatenate(temp_R1, args.output + '_R1.fastq')
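The rounding guard in the hunk above is easiest to see with numbers. A standalone sketch of the accumulator logic, reusing the variable names from the diff with made-up inputs:

    # Three records each wanting 0.4 read pairs round to zero individually;
    # the running totals recover one pair out of the expected 1.2.
    total_reads_generated = 0
    total_reads_generated_unrounded = 0
    for n_pairs_unrounded in [0.4, 0.4, 0.4]:
        n_pairs = round(n_pairs_unrounded)
        total_reads_generated_unrounded += n_pairs_unrounded
        total_reads_generated += n_pairs
        if round(total_reads_generated_unrounded) > total_reads_generated:
            n_pairs += 1  # add a pair to correct the rounding error
            total_reads_generated += 1
    print(total_reads_generated)  # 1

Apart from renaming the misspelled total_reads_generated_unrouned accumulator, the large block above is unchanged between the old and new versions.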
@@ -505,6 +515,12 @@ def main():
        metavar='<coverage.txt>',
        help='file containing coverage information (default: %(default)s).'
    )
+    input_abundance.add_argument(
+        "--readcount_file",
+        "-R",
+        metavar='<readcount.txt>',
+        help='file containing read count information (default: %(default)s).'
+    )
    parser_gen.add_argument(
        '--n_reads',
        '-n',
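With the flag wired into the parser, a hypothetical end-to-end invocation would look like iss generate --genomes amplicons.fasta --readcount_file data/readcounts.txt --model miseq --output amplicon_reads (the FASTA name is illustrative; --genomes, --model and --output are pre-existing flags). As the warning in generate_reads notes, --readcount_file disables --n_reads, the total being the sum of the counts in the file, and it is rejected in combination with --draft.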
11 changes: 11 additions & 0 deletions iss/test/test_abundance.py
@@ -33,6 +33,17 @@ def test_parsing():
        'genome_T': 0.2
    }

+
+def test_parsing_readcounts():
+    readcount_dic = abundance.parse_readcount_file('data/readcounts.txt')
+    assert readcount_dic == {
+        'amplicon_ATCG': 1,
+        'amplicon_TA': 1,
+        'amplicon_A': 2,
+        'amplicon_GC': 4,
+        'amplicon_T': 2
+    }



def test_parsing_empty():
    with pytest.raises(SystemExit):
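The new case can be run in isolation with pytest iss/test/test_abundance.py::test_parsing_readcounts, from the repository root, since the fixture path data/readcounts.txt is relative to the working directory.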
7 changes: 1 addition & 6 deletions iss/test/test_error_model.py
@@ -3,11 +3,9 @@

import pytest

-from iss.error_models import ErrorModel, basic, kde, perfect
+from iss.error_models import basic, kde, perfect
from iss.util import rev_comp

-from pickle import UnpicklingError
-
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

@@ -136,11 +134,8 @@ def test_introduce_indels_rev():
        id='read_1',
        description='test read'
    )
-    print(read.seq)
    read.seq = err_mod.introduce_indels(
        read, 'reverse', ref_genome, bounds)
-    print(read.seq)
-    print(ref_genome.seq)
    assert len(read.seq) == 20
    assert read.seq == 'CGTACGGTACGGTACGGTAC'

