From 9b960342a0da0c871488e5f4df1e40344aa9e876 Mon Sep 17 00:00:00 2001
From: Thijs
Date: Thu, 13 Jul 2023 13:36:08 +0200
Subject: [PATCH 1/3] ignore eggs

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 66b7e13..47f5c70 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,7 @@ __pycache__/
 
 # Distribution / packaging
 *.egg-info/
+.eggs/
 dist/
 build/
 

From cfd0ee09e7502d01e23a7a7ed092a74c04775175 Mon Sep 17 00:00:00 2001
From: Thijs
Date: Thu, 14 Sep 2023 10:40:25 +0200
Subject: [PATCH 2/3] Fix test

---
 iss/test/test_generator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/iss/test/test_generator.py b/iss/test/test_generator.py
index 0c903e8..fcc255c 100644
--- a/iss/test/test_generator.py
+++ b/iss/test/test_generator.py
@@ -69,7 +69,7 @@ def test_small_input():
         id='my_genome',
         description='test genome'
     )
-    generator.simulate_read(ref_genome, err_mod, 1, 0)
+    generator.simulate_read(ref_genome, err_mod, 1, 0, 'metagenomics')
 
 
 def test_basic():

From 4c151005a3a3aa5be805046e12705c11cde5b380 Mon Sep 17 00:00:00 2001
From: Thijs
Date: Thu, 21 Sep 2023 09:54:34 +0200
Subject: [PATCH 3/3] Add readcounts parser

Add readcount file input and an alternative n_reads flow driven by read
counts. Add a test.
---
 data/readcounts.txt        |   5 ++
 iss/abundance.py           |  35 ++++++++
 iss/app.py                 | 148 +++++++++++++++++++-----------------
 iss/test/test_abundance.py |  11 +++
 4 files changed, 133 insertions(+), 66 deletions(-)
 create mode 100644 data/readcounts.txt

diff --git a/data/readcounts.txt b/data/readcounts.txt
new file mode 100644
index 0000000..1ffe53b
--- /dev/null
+++ b/data/readcounts.txt
@@ -0,0 +1,5 @@
+amplicon_A 2
+amplicon_T 2
+amplicon_GC 4
+amplicon_ATCG 1
+amplicon_TA 1
\ No newline at end of file

diff --git a/iss/abundance.py b/iss/abundance.py
index c81598b..c0f49d6 100644
--- a/iss/abundance.py
+++ b/iss/abundance.py
@@ -10,6 +10,41 @@
 import numpy as np
 
 
+def parse_readcount_file(readcount_file):
+    """Parse a read count file
+
+    Args:
+        readcount_file (string): two-column file: record id, read count
+
+    Returns:
+        dict: record ids as keys, read counts as values
+    """
+    logger = logging.getLogger(__name__)
+
+    readcount_dic = {}
+    try:
+        assert os.stat(readcount_file).st_size != 0
+        f = open(readcount_file, 'r')
+    except (IOError, OSError) as e:
+        logger.error('Failed to open file:%s' % e)
+        sys.exit(1)
+    except AssertionError:
+        logger.error('File seems empty: %s' % readcount_file)
+        sys.exit(1)
+    else:
+        with f:
+            for line in f:
+                try:
+                    genome_id = line.split()[0]
+                    read_count = int(line.split()[1])
+                except (IndexError, ValueError) as e:
+                    logger.error('Failed to read file: %s' % e)
+                    sys.exit(1)
+                else:
+                    readcount_dic[genome_id] = read_count
+    return readcount_dic
+
+
 def parse_abundance_file(abundance_file):
     """Parse an abundance or coverage file
 
diff --git a/iss/app.py b/iss/app.py
index 6c38559..20dcace 100644
--- a/iss/app.py
+++ b/iss/app.py
@@ -141,8 +141,17 @@ def generate_reads(args):
         'lognormal': abundance.lognormal,
         'zero_inflated_lognormal': abundance.zero_inflated_lognormal
     }
+    if args.readcount_file:
+        logger.info('Using readcount file:%s' % args.readcount_file)
+        logger.warning('--readcount_file disables --n_reads; n_reads will be calculated from the readcount file')
+        if args.draft:
+            raise RuntimeError(
+                "readcount_file is only supported using --genomes, not --draft"
+            )
+        readcount_dic = abundance.parse_readcount_file(args.readcount_file)
+
     # read the abundance file
-    if args.abundance_file:
+    elif args.abundance_file:
         logger.info('Using abundance file:%s' % args.abundance_file)
         if args.draft:
             abundance_dic_short = abundance.parse_abundance_file(
@@ -212,13 +221,16 @@ def generate_reads(args):
                 args.abundance](genome_list)
             abundance.to_file(abundance_dic, args.output)
     else:
-        logger.error('Could not get abundance')
+        logger.error('Could not get abundance, coverage, or readcount information')
         sys.exit(1)
 
     cpus = args.cpus
     logger.info('Using %s cpus for read generation' % cpus)
-
-    if not (args.coverage or args.coverage_file):
+
+    if args.readcount_file:
+        n_reads = sum(readcount_dic.values())
+        logger.info('Generating %s reads' % n_reads)
+    elif not (args.coverage or args.coverage_file):
         n_reads = util.convert_n_reads(args.n_reads)
         logger.info('Generating %s reads' % n_reads)
 
@@ -228,16 +240,16 @@ def generate_reads(args):
         with f:
             fasta_file = SeqIO.parse(f, 'fasta')
             total_reads_generated = 0
-            total_reads_generated_unrouned = 0
+            total_reads_generated_unrounded = 0
             for record in fasta_file:
                 # generate reads for records
-                try:
-                    species_abundance = abundance_dic[record.id]
-                except KeyError as e:
-                    logger.error(
-                        'Fasta record not found in abundance file: %s' % e)
-                    sys.exit(1)
+                if args.readcount_file:
+                    assert record.id in readcount_dic, f"Record {record.id} not found in readcount file"
+                    n_pairs = n_pairs_unrounded = readcount_dic[record.id]
                 else:
+                    assert record.id in abundance_dic, f"Record {record.id} not found in abundance file"
+                    species_abundance = abundance_dic[record.id]
+
                     logger.info('Generating reads for record: %s' % record.id)
                     genome_size = len(record.seq)
 
@@ -256,62 +268,61 @@ def generate_reads(args):
                                         err_mod.read_length) / 2
                    n_pairs = round(n_pairs_unrounded)
 
-                    # check that the rounding does not cause to drop
-                    # read pairs
-                    total_reads_generated_unrouned += n_pairs_unrounded
-                    total_reads_generated += n_pairs
-                    if round(total_reads_generated_unrouned) > \
-                            total_reads_generated:
-                        logger.debug(
-                            "Adding a pair to correct rounding error")
-                        n_pairs += 1
-                        total_reads_generated += 1
-
-                    # skip record if n_reads == 0
-                    if n_pairs == 0:
-                        continue
-
-                    # exact n_reads for each cpus
-                    if n_pairs % cpus == 0:
-                        n_pairs_per_cpu = [(n_pairs // cpus)
-                                           for _ in range(cpus)]
-                    else:
-                        n_pairs_per_cpu = [(n_pairs // cpus)
-                                           for _ in range(cpus)]
-                        n_pairs_per_cpu[-1] += n_pairs % cpus
-
-                    # due to a bug in multiprocessing
-                    # https://bugs.python.org/issue17560
-                    # we can't send records taking more than 2**31 bytes
-                    # through serialisation.
-                    # In those cases we use memmapping
-                    if sys.getsizeof(str(record.seq)) >= 2**31 - 1:
-                        logger.warning(
-                            "record %s unusually big." % record.id)
-                        logger.warning("Using a memory map.")
-                        mode = "memmap"
-
-                        record_mmap = "%s.memmap" % args.output
-                        if os.path.exists(record_mmap):
-                            os.unlink(record_mmap)
-                        util.dump(record, record_mmap)
-                        del record
-                        record = record_mmap
-                        gc.collect()
-                    else:
-                        mode = "default"
-
-                    record_file_name_list = Parallel(
-                        n_jobs=cpus)(
-                            delayed(generator.reads)(
-                                record, err_mod,
-                                n_pairs_per_cpu[i], i, args.output,
-                                args.seed, args.sequence_type,
-                                args.gc_bias, mode) for i in range(cpus))
-                    temp_file_list.extend(record_file_name_list)
+                # check that the rounding does not cause to drop
+                # read pairs
+                total_reads_generated_unrounded += n_pairs_unrounded
+                total_reads_generated += n_pairs
+                if round(total_reads_generated_unrounded) > \
+                        total_reads_generated:
+                    logger.debug(
+                        "Adding a pair to correct rounding error")
+                    n_pairs += 1
+                    total_reads_generated += 1
+
+                # skip record if n_reads == 0
+                if n_pairs == 0:
+                    continue
+
+                # exact n_reads for each cpus
+                if n_pairs % cpus == 0:
+                    n_pairs_per_cpu = [(n_pairs // cpus)
+                                       for _ in range(cpus)]
+                else:
+                    n_pairs_per_cpu = [(n_pairs // cpus)
+                                       for _ in range(cpus)]
+                    n_pairs_per_cpu[-1] += n_pairs % cpus
+
+                # due to a bug in multiprocessing
+                # https://bugs.python.org/issue17560
+                # we can't send records taking more than 2**31 bytes
+                # through serialisation.
+                # In those cases we use memmapping
+                if sys.getsizeof(str(record.seq)) >= 2**31 - 1:
+                    logger.warning(
+                        "record %s unusually big." % record.id)
+                    logger.warning("Using a memory map.")
+                    mode = "memmap"
+
+                    record_mmap = "%s.memmap" % args.output
+                    if os.path.exists(record_mmap):
+                        os.unlink(record_mmap)
+                    util.dump(record, record_mmap)
+                    del record
+                    record = record_mmap
+                    gc.collect()
+                else:
+                    mode = "default"
+
+                record_file_name_list = Parallel(
+                    n_jobs=cpus)(
+                        delayed(generator.reads)(
+                            record, err_mod,
+                            n_pairs_per_cpu[i], i, args.output,
+                            args.seed, args.sequence_type,
+                            args.gc_bias, mode) for i in range(cpus))
+                temp_file_list.extend(record_file_name_list)
 
     except KeyboardInterrupt as e:
         logger.error('iss generate interrupted: %s' % e)
-        temp_file_unique = list(set(temp_file_list))
         temp_R1 = [temp_file + '_R1.fastq' for temp_file in temp_file_list]
         temp_R2 = [temp_file + '_R2.fastq' for temp_file in temp_file_list]
         full_tmp_list = temp_R1 + temp_R2
@@ -324,7 +335,6 @@ def generate_reads(args):
     # remove the duplicates in file list and cleanup
     # we remove the duplicates in case two records had the same header
     # and reads were appended to the same temp file.
-    temp_file_unique = list(set(temp_file_list))
     temp_R1 = [temp_file + '_R1.fastq' for temp_file in temp_file_list]
     temp_R2 = [temp_file + '_R2.fastq' for temp_file in temp_file_list]
     util.concatenate(temp_R1, args.output + '_R1.fastq')
@@ -505,6 +515,12 @@ def main():
         metavar='',
         help='file containing coverage information (default: %(default)s).'
     )
+    input_abundance.add_argument(
+        '--readcount_file',
+        '-R',
+        metavar='',
+        help='file containing read count information (default: %(default)s).'
+    )
     parser_gen.add_argument(
         '--n_reads',
         '-n',
diff --git a/iss/test/test_abundance.py b/iss/test/test_abundance.py
index 3dc57ea..3fc561e 100644
--- a/iss/test/test_abundance.py
+++ b/iss/test/test_abundance.py
@@ -33,6 +33,17 @@ def test_parsing():
         'genome_T': 0.2
     }
 
 
+def test_parsing_readcounts():
+    readcount_dic = abundance.parse_readcount_file('data/readcounts.txt')
+    assert readcount_dic == {
+        'amplicon_ATCG': 1,
+        'amplicon_TA': 1,
+        'amplicon_A': 2,
+        'amplicon_GC': 4,
+        'amplicon_T': 2
+    }
+
+
 def test_parsing_empty():
     with pytest.raises(SystemExit):
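
A minimal usage sketch of the readcount flow added in PATCH 3/3 (not part of
the patch series itself). The parsed dictionary matches the new test in
iss/test/test_abundance.py; the `iss generate` invocation and the
genomes.fasta file name are illustrative assumptions.

    # parse the example readcount file shipped with the patch
    from iss import abundance

    readcount_dic = abundance.parse_readcount_file('data/readcounts.txt')
    # {'amplicon_A': 2, 'amplicon_T': 2, 'amplicon_GC': 4,
    #  'amplicon_ATCG': 1, 'amplicon_TA': 1}

    # iss/app.py sums these counts instead of reading --n_reads, and each
    # record later receives exactly its requested number of read pairs
    n_reads = sum(readcount_dic.values())  # 2 + 2 + 4 + 1 + 1 = 10

    # hypothetical command line: record ids in genomes.fasta must match the
    # first column of the readcount file; --draft is not supported
    # iss generate --genomes genomes.fasta -R data/readcounts.txt --output reads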