diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..c94f6cb --- /dev/null +++ b/.flake8 @@ -0,0 +1,5 @@ +[flake8] +max-line-length = 120 +ignore = E203, W503, C901 +exclude = .git,__pycache__,docs/source/conf.py,old,build,dist +max-complexity = 10 \ No newline at end of file diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index eb8ad71..422941b 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -21,6 +21,11 @@ jobs: python -m pip install --upgrade pip pip install pipenv pipenv install --dev + - name: Style check + run: | + black . --check + isort . --check + flake8 . - name: Test with pytest run: | chmod -w data/read_only.fasta diff --git a/iss/__init__.py b/iss/__init__.py index 7675730..e69de29 100644 --- a/iss/__init__.py +++ b/iss/__init__.py @@ -1,4 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -from iss import * diff --git a/iss/__main__.py b/iss/__main__.py deleted file mode 100644 index a730590..0000000 --- a/iss/__main__.py +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -from iss.app import main -main() diff --git a/iss/abundance.py b/iss/abundance.py index c0f49d6..71fc93d 100644 --- a/iss/abundance.py +++ b/iss/abundance.py @@ -1,13 +1,13 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from Bio import SeqIO -from scipy import stats - +import logging import os import sys -import logging + import numpy as np +from Bio import SeqIO +from scipy import stats def parse_readcount_file(readcount_file): @@ -16,12 +16,12 @@ def parse_readcount_file(readcount_file): readcount_dic = {} try: assert os.stat(readcount_file).st_size != 0 - f = open(readcount_file, 'r') + f = open(readcount_file, "r") except (IOError, OSError) as e: - logger.error('Failed to open file:%s' % e) + logger.error("Failed to open file:%s" % e) sys.exit(1) - except AssertionError as e: - logger.error('File seems empty: %s' % readcount_file) + except AssertionError: + logger.error("File seems empty: %s" % readcount_file) sys.exit(1) else: with f: @@ -30,7 +30,7 @@ def parse_readcount_file(readcount_file): genome_id = line.split()[0] read_count = int(line.split()[1]) except IndexError as e: - logger.error('Failed to read file: %s' % e) + logger.error("Failed to read file: %s" % e) sys.exit(1) else: readcount_dic[genome_id] = read_count @@ -55,12 +55,12 @@ def parse_abundance_file(abundance_file): abundance_dic = {} try: assert os.stat(abundance_file).st_size != 0 - f = open(abundance_file, 'r') + f = open(abundance_file, "r") except (IOError, OSError) as e: - logger.error('Failed to open file:%s' % e) + logger.error("Failed to open file:%s" % e) sys.exit(1) - except AssertionError as e: - logger.error('File seems empty: %s' % abundance_file) + except AssertionError: + logger.error("File seems empty: %s" % abundance_file) sys.exit(1) else: with f: @@ -69,11 +69,11 @@ def parse_abundance_file(abundance_file): genome_id = line.split()[0] abundance = float(line.split()[1]) except IndexError as e: - logger.error('Failed to read file: %s' % e) + logger.error("Failed to read file: %s" % e) sys.exit(1) else: abundance_dic[genome_id] = abundance - logger.debug('Loaded abundance/coverage file: %s' % abundance_file) + logger.debug("Loaded abundance/coverage file: %s" % abundance_file) return abundance_dic @@ -206,16 +206,16 @@ def coverage_scaling(total_n_reads, abundance_dic, genome_file, read_length): Returns: dict: scaled coverage dictionary """ + logger = logging.getLogger(__name__) 
total_reads = 0 - f = open(genome_file, 'r') # re-opens the file + f = open(genome_file, "r") # re-opens the file with f: - fasta_file = SeqIO.parse(f, 'fasta') + fasta_file = SeqIO.parse(f, "fasta") for record in fasta_file: try: species_coverage = abundance_dic[record.id] except KeyError as e: - logger.error( - 'Fasta record not found in abundance file: %s' % e) + logger.error("Fasta record not found in abundance file: %s" % e) sys.exit(1) genome_size = len(record.seq) @@ -237,18 +237,18 @@ def to_file(abundance_dic, output, mode="abundance"): """ logger = logging.getLogger(__name__) if mode == "abundance": - output_abundance = output + '_abundance.txt' + output_abundance = output + "_abundance.txt" else: - output_abundance = output + '_coverage.txt' + output_abundance = output + "_coverage.txt" try: - f = open(output_abundance, 'w') + f = open(output_abundance, "w") except PermissionError as e: - logger.error('Failed to open output file: %s' % e) + logger.error("Failed to open output file: %s" % e) sys.exit(1) else: with f: for record, abundance in abundance_dic.items(): - f.write('%s\t%s\n' % (record, abundance)) + f.write("%s\t%s\n" % (record, abundance)) def draft(genomes, draft, distribution, output, mode="abundance"): @@ -267,13 +267,10 @@ def draft(genomes, draft, distribution, output, mode="abundance"): # first we get a list of contig names in draft genomes draft_records = [] for d in draft: - draft_records.extend( - [record.name for record in SeqIO.parse(d, 'fasta')]) + draft_records.extend([record.name for record in SeqIO.parse(d, "fasta")]) genomes = list(set(genomes) - set(draft_records)) abundance_dic = distribution(genomes + draft) - complete_genomes_abundance = {k: v for - k, v in abundance_dic.items() - if k not in draft} + complete_genomes_abundance = {k: v for k, v in abundance_dic.items() if k not in draft} to_file(abundance_dic, output) draft_dic = expand_draft_abundance(abundance_dic, draft, mode) full_abundance_dic = {**complete_genomes_abundance, **draft_dic} @@ -300,12 +297,12 @@ def expand_draft_abundance(abundance_dic, draft, mode="abundance"): for key, abundance in abundance_dic.items(): if key in draft: try: - f = open(key, 'r') + f = open(key, "r") with f: - fasta_file = SeqIO.parse(f, 'fasta') + fasta_file = SeqIO.parse(f, "fasta") draft_records = [record for record in fasta_file] total_length = sum(len(record) for record in draft_records) - except Exception as e: + except Exception: raise else: total_length = sum(len(record) for record in draft_records) diff --git a/iss/app.py b/iss/app.py index eb77ebd..852b299 100644 --- a/iss/app.py +++ b/iss/app.py @@ -1,22 +1,19 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from iss import util -from iss import download -from iss import abundance -from iss import generator -from iss.version import __version__ - -from Bio import SeqIO -from joblib import Parallel, delayed - +import argparse import gc +import logging import os -import sys import random -import logging -import argparse +import sys + import numpy as np +from Bio import SeqIO +from joblib import Parallel, delayed + +from iss import abundance, download, generator, util +from iss.version import __version__ def generate_reads(args): @@ -29,50 +26,45 @@ def generate_reads(args): args (object): the command-line arguments from argparse """ logger = logging.getLogger(__name__) - logger.debug('iss version %s' % __version__) - logger.debug('Using verbose logger') + logger.debug("iss version %s" % __version__) + logger.debug("Using verbose logger") try: # try to 
import and load the correct error model - logger.info('Starting iss generate') - logger.info('Using %s ErrorModel' % args.mode) + logger.info("Starting iss generate") + logger.info("Using %s ErrorModel" % args.mode) if args.seed: - logger.info('Setting random seed to %i' % args.seed) + logger.info("Setting random seed to %i" % args.seed) random.seed(args.seed) np.random.seed(args.seed) - if args.mode == 'kde': + if args.mode == "kde": from iss.error_models import kde + if args.model is None: - logger.error('--model is required in --mode kde') + logger.error("--model is required in --mode kde") sys.exit(1) - elif args.model.lower() == 'hiseq': - npz = os.path.join( - os.path.dirname(__file__), - 'profiles/HiSeq') - elif args.model.lower() == 'novaseq': - npz = os.path.join( - os.path.dirname(__file__), - 'profiles/NovaSeq') - elif args.model.lower() == 'miseq': - npz = os.path.join( - os.path.dirname(__file__), - 'profiles/MiSeq') + elif args.model.lower() == "hiseq": + npz = os.path.join(os.path.dirname(__file__), "profiles/HiSeq") + elif args.model.lower() == "novaseq": + npz = os.path.join(os.path.dirname(__file__), "profiles/NovaSeq") + elif args.model.lower() == "miseq": + npz = os.path.join(os.path.dirname(__file__), "profiles/MiSeq") else: npz = args.model err_mod = kde.KDErrorModel(npz) - elif args.mode == 'basic': + elif args.mode == "basic": if args.model is not None: - logger.warning('--model %s will be ignored in --mode %s' % ( - args.model, args.mode)) + logger.warning("--model %s will be ignored in --mode %s" % (args.model, args.mode)) from iss.error_models import basic + err_mod = basic.BasicErrorModel() - elif args.mode == 'perfect': + elif args.mode == "perfect": if args.model is not None: - logger.warning('--model %s will be ignored in --mode %s' % ( - args.model, args.mode)) + logger.warning("--model %s will be ignored in --mode %s" % (args.model, args.mode)) from iss.error_models import perfect + err_mod = perfect.PerfectErrorModel() except ImportError as e: - logger.error('Failed to import ErrorModel module: %s' % e) + logger.error("Failed to import ErrorModel module: %s" % e) sys.exit(1) try: # try to read genomes and concatenate --genomes and --ncbi genomes @@ -83,162 +75,137 @@ def generate_reads(args): if args.draft: genome_files.extend(args.draft) if args.ncbi and args.n_genomes_ncbi: - util.genome_file_exists(args.output + '_ncbi_genomes.fasta') - total_genomes_ncbi = [] + util.genome_file_exists(args.output + "_ncbi_genomes.fasta") try: assert len(*args.ncbi) == len(*args.n_genomes_ncbi) - except AssertionError as e: + except AssertionError: logger.error( - '--ncbi and --n_genomes_ncbi of unequal lengths. \ - Aborting') + "--ncbi and --n_genomes_ncbi of unequal lengths. \ + Aborting" + ) sys.exit(1) for g, n in zip(*args.ncbi, *args.n_genomes_ncbi): - genomes_ncbi = download.ncbi( - g, n, args.output + '_ncbi_genomes.fasta') + genomes_ncbi = download.ncbi(g, n, args.output + "_ncbi_genomes.fasta") genome_files.append(genomes_ncbi) if args.ncbi and not args.n_genomes_ncbi: - logger.error( - '--ncbi/-k requires --n_genomes_ncbi/-U. Aborting.') + logger.error("--ncbi/-k requires --n_genomes_ncbi/-U. 
Aborting.") sys.exit(1) else: logger.error("One of --genomes/-g, --draft, --ncbi/-k is required") sys.exit(1) - genome_file = args.output + '.iss.tmp.genomes.fasta' - util.concatenate( - genome_files, - output=genome_file) + genome_file = args.output + ".iss.tmp.genomes.fasta" + util.concatenate(genome_files, output=genome_file) # for n_genomes we use reservoir sampling to draw random genomes # from the concatenated genome file. We then override the file. if args.n_genomes and not args.draft and not args.ncbi: genome_count = util.count_records(genome_file) - genome_files = [genome for genome in util.reservoir( - SeqIO.parse(genome_file, 'fasta'), - genome_count, - args.n_genomes)] - SeqIO.write(genome_files, genome_file, 'fasta') + genome_files = [ + genome for genome in util.reservoir(SeqIO.parse(genome_file, "fasta"), genome_count, args.n_genomes) + ] + SeqIO.write(genome_files, genome_file, "fasta") assert os.stat(genome_file).st_size != 0 - f = open(genome_file, 'r') + f = open(genome_file, "r") with f: # count the number of records genome_list = util.count_records(f) except IOError as e: - logger.error('Failed to open genome(s) file:%s' % e) + logger.error("Failed to open genome(s) file:%s" % e) sys.exit(1) - except AssertionError as e: - logger.error('Genome(s) file seems empty: %s' % genome_file) + except AssertionError: + logger.error("Genome(s) file seems empty: %s" % genome_file) sys.exit(1) except KeyboardInterrupt as e: - logger.error('iss generate interrupted: %s' % e) + logger.error("iss generate interrupted: %s" % e) sys.exit(1) else: abundance_dispatch = { - 'uniform': abundance.uniform, - 'halfnormal': abundance.halfnormal, - 'exponential': abundance.exponential, - 'lognormal': abundance.lognormal, - 'zero_inflated_lognormal': abundance.zero_inflated_lognormal + "uniform": abundance.uniform, + "halfnormal": abundance.halfnormal, + "exponential": abundance.exponential, + "lognormal": abundance.lognormal, + "zero_inflated_lognormal": abundance.zero_inflated_lognormal, } if args.readcount_file: - logger.info('Using readcount file:%s' % args.readcount_file) - logger.warning('--readcount_file disables --n_reads, n_reads will be calculated from the readcount file') + logger.info("Using readcount file:%s" % args.readcount_file) + logger.warning("--readcount_file disables --n_reads, n_reads will be calculated from the readcount file") if args.draft: - raise RuntimeError( - "readcount_file is only supported using --genomes, not --draft" - ) - readcount_dic = abundance.parse_readcount_file(args.readcount_file) + raise RuntimeError("readcount_file is only supported using --genomes, not --draft") + readcount_dic = abundance.parse_readcount_file(args.readcount_file) # read the abundance file elif args.abundance_file: - logger.info('Using abundance file:%s' % args.abundance_file) + logger.info("Using abundance file:%s" % args.abundance_file) if args.draft: - abundance_dic_short = abundance.parse_abundance_file( - args.abundance_file) - complete_genomes_dic = {k: v for - k, v in abundance_dic_short.items() - if k not in args.draft} - draft_dic = abundance.expand_draft_abundance( - abundance_dic_short, - args.draft) - abundance_dic = {**complete_genomes_dic, - **draft_dic} + abundance_dic_short = abundance.parse_abundance_file(args.abundance_file) + complete_genomes_dic = {k: v for k, v in abundance_dic_short.items() if k not in args.draft} + draft_dic = abundance.expand_draft_abundance(abundance_dic_short, args.draft) + abundance_dic = {**complete_genomes_dic, **draft_dic} else: - 
abundance_dic = abundance.parse_abundance_file( - args.abundance_file) + abundance_dic = abundance.parse_abundance_file(args.abundance_file) elif args.coverage_file: - logger.warning('--coverage_file is an experimental feature') - logger.warning('--coverage_file disables --n_reads') - logger.info('Using coverage file:%s' % args.coverage_file) + logger.warning("--coverage_file is an experimental feature") + logger.warning("--coverage_file disables --n_reads") + logger.info("Using coverage file:%s" % args.coverage_file) if args.draft: - coverage_dic = abundance.parse_abundance_file( - args.coverage_file) - complete_genomes_dic = {k: v for - k, v in coverage_dic.items() - if k not in args.draft} - draft_dic = abundance.expand_draft_abundance( - coverage_dic, - args.draft, - mode="coverage") - abundance_dic = {**complete_genomes_dic, - **draft_dic} + coverage_dic = abundance.parse_abundance_file(args.coverage_file) + complete_genomes_dic = {k: v for k, v in coverage_dic.items() if k not in args.draft} + draft_dic = abundance.expand_draft_abundance(coverage_dic, args.draft, mode="coverage") + abundance_dic = {**complete_genomes_dic, **draft_dic} else: - abundance_dic = abundance.parse_abundance_file( - args.coverage_file) + abundance_dic = abundance.parse_abundance_file(args.coverage_file) elif args.coverage in abundance_dispatch: # todo coverage distribution with --draft - logger.warning('--coverage is an experimental feature') - logger.info('Using %s coverage distribution' % args.coverage) + logger.warning("--coverage is an experimental feature") + logger.info("Using %s coverage distribution" % args.coverage) if args.draft: abundance_dic = abundance.draft( genome_list, args.draft, abundance_dispatch[args.abundance], args.output, - mode="coverage") + mode="coverage", + ) else: - abundance_dic = abundance_dispatch[ - args.coverage](genome_list) + abundance_dic = abundance_dispatch[args.coverage](genome_list) if args.n_reads: n_reads = util.convert_n_reads(args.n_reads) - logger.info('scaling coverage to %s reads' % n_reads) - abundance_dic = abundance.coverage_scaling(n_reads, - abundance_dic, - genome_file, - err_mod.read_length) + logger.info("scaling coverage to %s reads" % n_reads) + abundance_dic = abundance.coverage_scaling(n_reads, abundance_dic, genome_file, err_mod.read_length) abundance.to_file(abundance_dic, args.output, mode="coverage") elif args.abundance in abundance_dispatch: - logger.info('Using %s abundance distribution' % args.abundance) + logger.info("Using %s abundance distribution" % args.abundance) if args.draft: abundance_dic = abundance.draft( genome_list, args.draft, abundance_dispatch[args.abundance], - args.output) + args.output, + ) else: - abundance_dic = abundance_dispatch[ - args.abundance](genome_list) + abundance_dic = abundance_dispatch[args.abundance](genome_list) abundance.to_file(abundance_dic, args.output) else: - logger.error('Could not get abundance, or coverage or readcount information') + logger.error("Could not get abundance, or coverage or readcount information") sys.exit(1) cpus = args.cpus - logger.info('Using %s cpus for read generation' % cpus) - + logger.info("Using %s cpus for read generation" % cpus) + if args.readcount_file: n_reads = sum(readcount_dic.values()) - logger.info('Generating %s reads' % n_reads) + logger.info("Generating %s reads" % n_reads) elif not (args.coverage or args.coverage_file): n_reads = util.convert_n_reads(args.n_reads) - logger.info('Generating %s reads' % n_reads) + logger.info("Generating %s reads" % n_reads) try: 
temp_file_list = [] # list holding the prefix of all temp files - f = open(genome_file, 'r') # re-opens the file + f = open(genome_file, "r") # re-opens the file with f: - fasta_file = SeqIO.parse(f, 'fasta') + fasta_file = SeqIO.parse(f, "fasta") total_reads_generated = 0 total_reads_generated_unrounded = 0 for record in fasta_file: @@ -249,9 +216,8 @@ def generate_reads(args): else: assert record.id in abundance_dic, f"Record {record.id} not found in abundance file" species_abundance = abundance_dic[record.id] - - logger.info('Generating reads for record: %s' - % record.id) + + logger.info("Generating reads for record: %s" % record.id) genome_size = len(record.seq) if args.coverage or args.coverage_file: @@ -261,21 +227,17 @@ def generate_reads(args): n_reads, species_abundance, err_mod.read_length, - genome_size + genome_size, ) - n_pairs_unrounded = ( - (coverage * len(record.seq)) / - err_mod.read_length) / 2 + n_pairs_unrounded = ((coverage * len(record.seq)) / err_mod.read_length) / 2 n_pairs = round(n_pairs_unrounded) # check that the rounding does not cause to drop # read pairs total_reads_generated_unrounded += n_pairs_unrounded total_reads_generated += n_pairs - if round(total_reads_generated_unrounded) > \ - total_reads_generated: - logger.debug( - "Adding a pair to correct rounding error") + if round(total_reads_generated_unrounded) > total_reads_generated: + logger.debug("Adding a pair to correct rounding error") n_pairs += 1 total_reads_generated += 1 @@ -285,11 +247,9 @@ def generate_reads(args): # exact n_reads for each cpus if n_pairs % cpus == 0: - n_pairs_per_cpu = [(n_pairs // cpus) - for _ in range(cpus)] + n_pairs_per_cpu = [(n_pairs // cpus) for _ in range(cpus)] else: - n_pairs_per_cpu = [(n_pairs // cpus) - for _ in range(cpus)] + n_pairs_per_cpu = [(n_pairs // cpus) for _ in range(cpus)] n_pairs_per_cpu[-1] += n_pairs % cpus # due to a bug in multiprocessing @@ -298,8 +258,7 @@ def generate_reads(args): # through serialisation. # In those cases we use memmapping if sys.getsizeof(str(record.seq)) >= 2**31 - 1: - logger.warning( - "record %s unusually big." % record.id) + logger.warning("record %s unusually big." % record.id) logger.warning("Using a memory map.") mode = "memmap" @@ -313,18 +272,25 @@ def generate_reads(args): else: mode = "default" - record_file_name_list = Parallel( - n_jobs=cpus)( - delayed(generator.reads)( - record, err_mod, - n_pairs_per_cpu[i], i, args.output, - args.seed, args.sequence_type, - args.gc_bias, mode) for i in range(cpus)) + record_file_name_list = Parallel(n_jobs=cpus)( + delayed(generator.reads)( + record, + err_mod, + n_pairs_per_cpu[i], + i, + args.output, + args.seed, + args.sequence_type, + args.gc_bias, + mode, + ) + for i in range(cpus) + ) temp_file_list.extend(record_file_name_list) except KeyboardInterrupt as e: - logger.error('iss generate interrupted: %s' % e) - temp_R1 = [temp_file + '_R1.fastq' for temp_file in temp_file_list] - temp_R2 = [temp_file + '_R2.fastq' for temp_file in temp_file_list] + logger.error("iss generate interrupted: %s" % e) + temp_R1 = [temp_file + "_R1.fastq" for temp_file in temp_file_list] + temp_R2 = [temp_file + "_R2.fastq" for temp_file in temp_file_list] full_tmp_list = temp_R1 + temp_R2 full_tmp_list.append(genome_file) if os.path.exists("%s.memmap" % args.output): @@ -335,19 +301,19 @@ def generate_reads(args): # remove the duplicates in file list and cleanup # we remove the duplicates in case two records had the same header # and reads were appended to the same temp file. 
- temp_R1 = [temp_file + '_R1.fastq' for temp_file in temp_file_list] - temp_R2 = [temp_file + '_R2.fastq' for temp_file in temp_file_list] - util.concatenate(temp_R1, args.output + '_R1.fastq') - util.concatenate(temp_R2, args.output + '_R2.fastq') + temp_R1 = [temp_file + "_R1.fastq" for temp_file in temp_file_list] + temp_R2 = [temp_file + "_R2.fastq" for temp_file in temp_file_list] + util.concatenate(temp_R1, args.output + "_R1.fastq") + util.concatenate(temp_R2, args.output + "_R2.fastq") full_tmp_list = temp_R1 + temp_R2 full_tmp_list.append(genome_file) if os.path.exists("%s.memmap" % args.output): full_tmp_list.append("%s.memmap" % args.output) util.cleanup(full_tmp_list) if args.compress: - util.compress(args.output + '_R1.fastq') - util.compress(args.output + '_R2.fastq') - logger.info('Read generation complete') + util.compress(args.output + "_R1.fastq") + util.compress(args.output + "_R2.fastq") + logger.info("Read generation complete") def model_from_bam(args): @@ -360,50 +326,47 @@ def model_from_bam(args): args (object): the command-line arguments from argparse """ logger = logging.getLogger(__name__) - logger.debug('iss version %s' % __version__) - logger.debug('Using verbose logger') + logger.debug("iss version %s" % __version__) + logger.debug("Using verbose logger") try: # try to import bam module and write model data to file - logger.info('Starting iss model') + logger.info("Starting iss model") from iss import bam except ImportError as e: - logger.error('Failed to import bam module: %s' % e) + logger.error("Failed to import bam module: %s" % e) sys.exit(1) else: - logger.info('Using KDE ErrorModel') + logger.info("Using KDE ErrorModel") bam.to_model(args.bam, args.output) - logger.info('Model generation complete') + logger.info("Model generation complete") def main(): parser = argparse.ArgumentParser( - prog='InSilicoSeq', - usage='iss [subcommand] [options]', - description='InSilicoSeq: A sequencing simulator' + prog="InSilicoSeq", + usage="iss [subcommand] [options]", + description="InSilicoSeq: A sequencing simulator", ) parser.add_argument( - '-v', - '--version', - action='store_true', + "-v", + "--version", + action="store_true", default=False, - help='print software version and exit' - ) - subparsers = parser.add_subparsers( - title='available subcommands', - metavar='' + help="print software version and exit", ) + subparsers = parser.add_subparsers(title="available subcommands", metavar="") parser_mod = subparsers.add_parser( - 'model', - prog='iss model', - description='generate an error model from a bam file', - help='generate an error model from a bam file' + "model", + prog="iss model", + description="generate an error model from a bam file", + help="generate an error model from a bam file", ) parser_gen = subparsers.add_parser( - 'generate', - prog='iss generate', - description='simulate reads from an error model', - help='simulate reads from an error model' + "generate", + prog="iss generate", + description="simulate reads from an error model", + help="simulate reads from an error model", ) # arguments form the read generator module @@ -412,213 +375,223 @@ def main(): # input_genomes = parser_gen.add_mutually_exclusive_group() input_abundance = parser_gen.add_mutually_exclusive_group() param_logging.add_argument( - '--quiet', - '-q', - action='store_true', + "--quiet", + "-q", + action="store_true", default=False, - help='Disable info logging. (default: %(default)s).' + help="Disable info logging. 
(default: %(default)s).", ) param_logging.add_argument( - '--debug', - '-d', - action='store_true', + "--debug", + "-d", + action="store_true", default=False, - help='Enable debug logging. (default: %(default)s).' + help="Enable debug logging. (default: %(default)s).", ) parser_gen.add_argument( - '--seed', + "--seed", type=int, - metavar='', - help='Seed all the random number generators', - default=None + metavar="", + help="Seed all the random number generators", + default=None, ) parser_gen.add_argument( - '--cpus', - '-p', + "--cpus", + "-p", default=2, type=int, - metavar='', - help='number of cpus to use. (default: %(default)s).' + metavar="", + help="number of cpus to use. (default: %(default)s).", ) parser_gen.add_argument( - '--genomes', - '-g', - metavar='', + "--genomes", + "-g", + metavar="", nargs="+", - help='Input genome(s) from where the reads will originate' + help="Input genome(s) from where the reads will originate", ) parser_gen.add_argument( - '--draft', - metavar='', + "--draft", + metavar="", nargs="+", - help='Input draft genome(s) from where the reads will originate' + help="Input draft genome(s) from where the reads will originate", ) parser_gen.add_argument( - '--n_genomes', - '-u', + "--n_genomes", + "-u", type=int, - metavar='', - help='How many genomes will be used for the simulation. is set with \ + metavar="", + help="How many genomes will be used for the simulation. is set with \ --genomes/-g or/and --draft to take random genomes from the \ - input multifasta' + input multifasta", ) parser_gen.add_argument( - '--ncbi', - '-k', - choices=['bacteria', 'viruses', 'archaea'], - action='append', - nargs='*', - metavar='', - help='Download input genomes from NCBI. Requires --n_genomes/-u\ + "--ncbi", + "-k", + choices=["bacteria", "viruses", "archaea"], + action="append", + nargs="*", + metavar="", + help="Download input genomes from NCBI. Requires --n_genomes/-u\ option. Can be bacteria, viruses, archaea or a combination of the\ - three (space-separated)' + three (space-separated)", ) parser_gen.add_argument( - '--n_genomes_ncbi', - '-U', + "--n_genomes_ncbi", + "-U", type=int, - action='append', - metavar='', - nargs='*', - help='How many genomes will be downloaded from NCBI. Required if\ + action="append", + metavar="", + nargs="*", + help="How many genomes will be downloaded from NCBI. Required if\ --ncbi/-k is set. If more than one kingdom is set with --ncbi,\ - multiple values are necessary (space-separated).' + multiple values are necessary (space-separated).", ) input_abundance.add_argument( - '--abundance', - '-a', - choices=['uniform', 'halfnormal', - 'exponential', 'lognormal', 'zero_inflated_lognormal'], - metavar='', - default='lognormal', - help='abundance distribution (default: %(default)s). Can be uniform,\ - halfnormal, exponential, lognormal or zero-inflated-lognormal.' + "--abundance", + "-a", + choices=[ + "uniform", + "halfnormal", + "exponential", + "lognormal", + "zero_inflated_lognormal", + ], + metavar="", + default="lognormal", + help="abundance distribution (default: %(default)s). Can be uniform,\ + halfnormal, exponential, lognormal or zero-inflated-lognormal.", ) input_abundance.add_argument( - '--abundance_file', - '-b', - metavar='', - help='abundance file for coverage calculations (default: %(default)s).' 
+ "--abundance_file", + "-b", + metavar="", + help="abundance file for coverage calculations (default: %(default)s).", ) input_abundance.add_argument( - '--coverage', - '-C', - choices=['uniform', 'halfnormal', - 'exponential', 'lognormal', 'zero_inflated_lognormal'], - metavar='', - help='coverage distribution. Can be uniform,\ - halfnormal, exponential, lognormal or zero-inflated-lognormal.' + "--coverage", + "-C", + choices=[ + "uniform", + "halfnormal", + "exponential", + "lognormal", + "zero_inflated_lognormal", + ], + metavar="", + help="coverage distribution. Can be uniform,\ + halfnormal, exponential, lognormal or zero-inflated-lognormal.", ) input_abundance.add_argument( - '--coverage_file', - '-D', - metavar='', - help='file containing coverage information (default: %(default)s).' + "--coverage_file", + "-D", + metavar="", + help="file containing coverage information (default: %(default)s).", ) input_abundance.add_argument( "--readcount_file", "-R", - metavar='', - help='file containing read_count information (default: %(default)s).' + metavar="", + help="file containing read_count information (default: %(default)s).", ) parser_gen.add_argument( - '--n_reads', - '-n', - metavar='', - default='1000000', - help='Number of reads to generate (default: %(default)s). Allows \ - suffixes k, K, m, M, g and G (ex 0.5M for 500000).' + "--n_reads", + "-n", + metavar="", + default="1000000", + help="Number of reads to generate (default: %(default)s). Allows \ + suffixes k, K, m, M, g and G (ex 0.5M for 500000).", ) parser_gen.add_argument( - '--mode', - '-e', - metavar='', - choices=['kde', 'basic', 'perfect'], - default='kde', - help='Error model. If not specified, using kernel density estimation \ - (default: %(default)s). Can be kde, basic or perfect' + "--mode", + "-e", + metavar="", + choices=["kde", "basic", "perfect"], + default="kde", + help="Error model. If not specified, using kernel density estimation \ + (default: %(default)s). Can be kde, basic or perfect", ) parser_gen.add_argument( - '--model', - '-m', - metavar='', + "--model", + "-m", + metavar="", default=None, - help='Error model file. (default: %(default)s). Use HiSeq, NovaSeq or \ + help="Error model file. (default: %(default)s). Use HiSeq, NovaSeq or \ MiSeq for a pre-computed error model provided with the software, or a \ file generated with iss model. If you do not wish to use a model, use \ --mode basic or --mode perfect. The name of the built-in models are \ - case insensitive.' + case insensitive.", ) parser_gen.add_argument( - '--gc_bias', - '-c', - action='store_true', + "--gc_bias", + "-c", + action="store_true", default=False, - help='If set, may fail to sequence reads with abnormal GC content. \ - (default: %(default)s)' + help="If set, may fail to sequence reads with abnormal GC content. \ + (default: %(default)s)", ) parser_gen.add_argument( - '--compress', - '-z', - action='store_true', + "--compress", + "-z", + action="store_true", default=False, - help='Compress the output in gzip format (default: %(default)s).' 
+ help="Compress the output in gzip format (default: %(default)s).", ) parser_gen.add_argument( - '--output', - '-o', - metavar='', - help='Output file path and prefix (Required)', - required=True + "--output", + "-o", + metavar="", + help="Output file path and prefix (Required)", + required=True, ) parser_gen.add_argument( - '--sequence_type', - '-t', - choices=['metagenomics', 'amplicon'], - default='metagenomics', + "--sequence_type", + "-t", + choices=["metagenomics", "amplicon"], + default="metagenomics", required=True, - help='Type of sequencing. Can be metagenomics or amplicon (default: %(default)s).' + help="Type of sequencing. Can be metagenomics or amplicon (default: %(default)s).", ) - parser_gen._optionals.title = 'arguments' + parser_gen._optionals.title = "arguments" parser_gen.set_defaults(func=generate_reads) # arguments for the error model module parser_mod.add_argument( - '--quiet', - '-q', - action='store_true', + "--quiet", + "-q", + action="store_true", default=False, - help='Disable info logging. (default: %(default)s).' + help="Disable info logging. (default: %(default)s).", ) parser_mod.add_argument( - '--debug', - '-d', - action='store_true', + "--debug", + "-d", + action="store_true", default=False, - help='Enable debug logging. (default: %(default)s).' + help="Enable debug logging. (default: %(default)s).", ) parser_mod.add_argument( - '--bam', - '-b', - metavar='', - help='aligned reads from which the model will be inferred (Required)', - required=True + "--bam", + "-b", + metavar="", + help="aligned reads from which the model will be inferred (Required)", + required=True, ) parser_mod.add_argument( - '--output', - '-o', - metavar='', - help='Output file path and prefix (Required)', - required=True + "--output", + "-o", + metavar="", + help="Output file path and prefix (Required)", + required=True, ) - parser_mod._optionals.title = 'arguments' + parser_mod._optionals.title = "arguments" parser_mod.set_defaults(func=model_from_bam) args = parser.parse_args() # set logger and display version if args.version try: if args.version: - print('iss version %s' % __version__) + print("iss version %s" % __version__) sys.exit(0) elif args.quiet: logging.basicConfig(level=logging.ERROR) diff --git a/iss/bam.py b/iss/bam.py index cac7397..150b21b 100644 --- a/iss/bam.py +++ b/iss/bam.py @@ -1,14 +1,14 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from iss import modeller - +import logging +import sys from random import random -import sys -import pysam -import logging import numpy as np +import pysam + +from iss import modeller def read_bam(bam_file, n_reads=1000000): @@ -24,35 +24,31 @@ def read_bam(bam_file, n_reads=1000000): try: lines = pysam.idxstats(bam_file).splitlines() - total_records = sum([int(l.split("\t")[2]) - for l in lines if not l.startswith("#")]) + total_records = sum([int(line.split("\t")[2]) for line in lines if not line.startswith("#")]) # total_records = sum(1 for _ in bam.fetch() if not _.is_unmapped) random_fraction = n_reads / total_records - bam = pysam.AlignmentFile(bam_file, 'rb') # reopen the file + bam = pysam.AlignmentFile(bam_file, "rb") # reopen the file - except (IOError, ValueError, - ZeroDivisionError, pysam.utils.SamtoolsError) as e: - logger.error('Failed to read bam file: %s' % e) + except (IOError, ValueError, ZeroDivisionError, pysam.utils.SamtoolsError) as e: + logger.error("Failed to read bam file: %s" % e) sys.exit(1) else: - logger.info('Reading bam file: %s' % bam_file) + logger.info("Reading bam file: %s" % bam_file) c = 0 
with bam: for read in bam.fetch(): if not read.is_unmapped and random() < random_fraction: c += 1 if logger.getEffectiveLevel() == 10: - print( - 'DEBUG:iss.bam:Subsampling %s / %s reads' % ( - c, n_reads), - end='\r') + print("DEBUG:iss.bam:Subsampling %s / %s reads" % (c, n_reads), end="\r") yield read elif c >= n_reads: break -def write_to_file(model, read_length, mean_f, mean_r, hist_f, hist_r, - sub_f, sub_r, ins_f, ins_r, del_f, del_r, i_size, output): +def write_to_file( + model, read_length, mean_f, mean_r, hist_f, hist_r, sub_f, sub_r, ins_f, ins_r, del_f, del_r, i_size, output +): """Write variables to a .npz file Args: @@ -82,7 +78,7 @@ def write_to_file(model, read_length, mean_f, mean_r, hist_f, hist_r, logger = logging.getLogger(__name__) try: - logger.info('Writing model to file: %s' % output) + logger.info("Writing model to file: %s" % output) np.savez_compressed( output, model=model, @@ -97,10 +93,10 @@ def write_to_file(model, read_length, mean_f, mean_r, hist_f, hist_r, ins_forward=ins_f, ins_reverse=ins_r, del_forward=del_f, - del_reverse=del_r + del_reverse=del_r, ) except PermissionError as e: - logger.error('Failed to open output file: %s' % e) + logger.error("Failed to open output file: %s" % e) sys.exit(1) @@ -122,7 +118,7 @@ def to_model(bam_path, output): qualities_reverse = [] subst_matrix_f = np.zeros([301, 16]) # we dont know the len of the reads subst_matrix_r = np.zeros([301, 16]) # yet. we will find out from the - indel_matrix_f = np.zeros([301, 9]) # len of the quality lists + indel_matrix_f = np.zeros([301, 9]) # len of the quality lists indel_matrix_r = np.zeros([301, 9]) # read the bam file and extract info needed for modelling @@ -136,38 +132,30 @@ def to_model(bam_path, output): # get qualities if read.is_read1: # get mean quality too - quality_means = [] read_quality = read.query_qualities mean_quality = np.mean(read_quality) if read.is_reverse: read_quality = read_quality[::-1] # reverse the list - quality_plus_mean = [ - (quality, mean_quality) for quality in read_quality] + quality_plus_mean = [(quality, mean_quality) for quality in read_quality] qualities_forward.append(np.asarray(quality_plus_mean)) # qualities_forward.append(read.query_qualities) elif read.is_read2: # get mean quality too - quality_means = [] read_quality = read.query_qualities mean_quality = np.mean(read_quality) if read.is_reverse: read_quality = read_quality[::-1] # reverse the list - quality_plus_mean = [ - (quality, mean_quality) for quality in read_quality] + quality_plus_mean = [(quality, mean_quality) for quality in read_quality] qualities_reverse.append(np.asarray(quality_plus_mean)) # qualities_reverse.append(read.query_qualities) # get mismatches - alignment = read.get_aligned_pairs( - matches_only=True, - with_seq=True - ) + alignment = read.get_aligned_pairs(matches_only=True, with_seq=True) read_has_indels = False for base in alignment: # dispatch mismatches in matrix - pos, subst, read_has_indels = modeller.dispatch_subst( - base, read, read_has_indels) + pos, subst, read_has_indels = modeller.dispatch_subst(base, read, read_has_indels) if read.is_read1 and subst is not None: subst_matrix_f[pos, subst] += 1 elif read.is_read2 and subst is not None: @@ -179,11 +167,11 @@ def to_model(bam_path, output): elif read.is_read2: indel_matrix_r[pos, indel] += 1 - logger.info('Calculating insert size distribution') + logger.info("Calculating insert size distribution") # insert_size = int(np.mean(insert_size_dist)) hist_insert_size = modeller.insert_size(insert_size_dist) - 
logger.info('Calculating mean and base quality distribution') + logger.info("Calculating mean and base quality distribution") quality_bins_f = modeller.divide_qualities_into_bins(qualities_forward) quality_bins_r = modeller.divide_qualities_into_bins(qualities_reverse) @@ -208,23 +196,21 @@ def to_model(bam_path, output): indel_matrix_f.resize([read_length, 9], refcheck=False) indel_matrix_r.resize([read_length, 9], refcheck=False) - logger.info('Calculating substitution rate') + logger.info("Calculating substitution rate") subst_f = modeller.subst_matrix_to_choices(subst_matrix_f, read_length) subst_r = modeller.subst_matrix_to_choices(subst_matrix_r, read_length) - logger.info('Calculating indel rate') + logger.info("Calculating indel rate") # update the base count in indel matrices for position in range(read_length): indel_matrix_f[position][0] = sum(subst_matrix_f[position][::4]) indel_matrix_r[position][0] = sum(subst_matrix_r[position][::4]) - ins_f, del_f = modeller.indel_matrix_to_choices( - indel_matrix_f, read_length) - ins_r, del_r = modeller.indel_matrix_to_choices( - indel_matrix_r, read_length) + ins_f, del_f = modeller.indel_matrix_to_choices(indel_matrix_f, read_length) + ins_r, del_r = modeller.indel_matrix_to_choices(indel_matrix_r, read_length) write_to_file( - 'kde', + "kde", read_length, mean_f, mean_r, @@ -237,4 +223,5 @@ def to_model(bam_path, output): del_f, del_r, hist_insert_size, - output + '.npz') + output + ".npz", + ) diff --git a/iss/download.py b/iss/download.py index a611753..b8ef7c7 100644 --- a/iss/download.py +++ b/iss/download.py @@ -1,25 +1,22 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from Bio import SeqIO -from Bio import Entrez - import io +import logging +import random import sys import time import zlib -import random -import logging import requests +from Bio import Entrez, SeqIO class BadRequestError(Exception): - """Exception to raise when a http request does not return 200 - """ + """Exception to raise when a http request does not return 200""" def __init__(self, url, status_code): - super().__init__('%s returned %d' % (url, status_code)) + super().__init__("%s returned %d" % (url, status_code)) def ncbi(kingdom, n_genomes, output): @@ -34,35 +31,36 @@ def ncbi(kingdom, n_genomes, output): str: the output file """ logger = logging.getLogger(__name__) - Entrez.email = 'hadrien.gourle@slu.se' - Entrez.tool = 'InSilicoSeq' - Entrez.api_key = 'd784b36672ca73601f4a19c3865775a17207' - full_id_list = Entrez.read(Entrez.esearch( - 'assembly', - term='%s[Organism] AND "latest refseq"[filter] AND "complete genome"[filter]' - % kingdom, retmax=100000))['IdList'] + Entrez.email = "hadrien.gourle@slu.se" + Entrez.tool = "InSilicoSeq" + Entrez.api_key = "d784b36672ca73601f4a19c3865775a17207" + full_id_list = Entrez.read( + Entrez.esearch( + "assembly", + term='%s[Organism] AND "latest refseq"[filter] AND "complete genome"[filter]' % kingdom, + retmax=100000, + ) + )["IdList"] n = 0 - logger.info('Searching for %s to download' % kingdom) + logger.info("Searching for %s to download" % kingdom) while n < n_genomes: ident = random.choice(full_id_list) - genome_info = Entrez.read( - Entrez.esummary( - db='assembly', - id=ident), - validate=False)["DocumentSummarySet"]["DocumentSummary"][0] - if genome_info['FtpPath_RefSeq']: - url = genome_info['FtpPath_RefSeq'] - url = "%s/%s_%s_genomic.fna.gz" \ - % (genome_info['FtpPath_RefSeq'], - genome_info['AssemblyAccession'], - genome_info['AssemblyName']) - logger.info('Downloading %s' % 
genome_info['AssemblyAccession']) + genome_info = Entrez.read(Entrez.esummary(db="assembly", id=ident), validate=False)["DocumentSummarySet"][ + "DocumentSummary" + ][0] + if genome_info["FtpPath_RefSeq"]: + url = genome_info["FtpPath_RefSeq"] + url = "%s/%s_%s_genomic.fna.gz" % ( + genome_info["FtpPath_RefSeq"], + genome_info["AssemblyAccession"], + genome_info["AssemblyName"], + ) + logger.info("Downloading %s" % genome_info["AssemblyAccession"]) try: assembly_to_fasta(url, output) - except BadRequestError as e: - logger.debug('Could not download %s' % - genome_info['AssemblyAccession']) - logger.debug('Skipping and waiting two seconds') + except BadRequestError: + logger.debug("Could not download %s" % genome_info["AssemblyAccession"]) + logger.debug("Skipping and waiting two seconds") time.sleep(2) else: n += 1 @@ -70,7 +68,7 @@ def ncbi(kingdom, n_genomes, output): return output -def assembly_to_fasta(url, output, chunk_size=1024): +def assembly_to_fasta(url, output): """download an assembly from the ncbi ftp and append the chromosome sequence to a fasta file. @@ -89,30 +87,28 @@ def assembly_to_fasta(url, output, chunk_size=1024): if url: request = requests.get(url) if request.status_code == 200: - request = zlib.decompress( - request.content, zlib.MAX_WBITS | 32).decode() + request = zlib.decompress(request.content, zlib.MAX_WBITS | 32).decode() else: raise BadRequestError(url, request.status_code) with io.StringIO(request) as fasta_io: - seq_handle = SeqIO.parse(fasta_io, 'fasta') + seq_handle = SeqIO.parse(fasta_io, "fasta") chromosome = filter_plasmids(seq_handle) try: - f = open(output, 'a') + f = open(output, "a") except (IOError, OSError) as e: - logger.error('Failed to open output file: %s' % e) + logger.error("Failed to open output file: %s" % e) sys.exit(1) else: - logger.debug('Writing genome to %s' % output) + logger.debug("Writing genome to %s" % output) with f: - SeqIO.write(chromosome, f, 'fasta') + SeqIO.write(chromosome, f, "fasta") return output def filter_plasmids(handle): - """returns the largest sequence from a sequence handle - """ + """returns the largest sequence from a sequence handle""" n = 0 for record in handle: if len(record) > n: diff --git a/iss/error_models/__init__.py b/iss/error_models/__init__.py index e4eff51..5378fd9 100644 --- a/iss/error_models/__init__.py +++ b/iss/error_models/__init__.py @@ -1,15 +1,15 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from iss import util - -import sys -import random import logging +import random +import sys + import _pickle import numpy as np +from Bio.Seq import MutableSeq, Seq -from Bio.Seq import Seq, MutableSeq +from iss import util class ErrorModel(object): @@ -18,6 +18,7 @@ class ErrorModel(object): This class is used to create inheriting classes and contains all the functions that are shared by all ErrorModel classes """ + @property def logger(self): component = "{}.{}".format(type(self).__module__, type(self).__name__) @@ -37,17 +38,15 @@ def load_npz(self, npz_path, model): """ try: error_profile = np.load(npz_path, allow_pickle=True) - assert error_profile['model'] == model + assert error_profile["model"] == model except (OSError, IOError, EOFError, _pickle.UnpicklingError) as e: - self.logger.error('Failed to read ErrorModel file: %s' % e) + self.logger.error("Failed to read ErrorModel file: %s" % e) sys.exit(1) - except AssertionError as e: - self.logger.error( - 'Trying to load a %s ErrorModel in %s mode' % ( - error_profile['model'], model)) + except AssertionError: + self.logger.error("Trying 
to load a %s ErrorModel in %s mode" % (error_profile["model"], model)) sys.exit(1) else: - self.logger.debug('Loaded ErrorProfile: %s' % npz_path) + self.logger.debug("Loaded ErrorProfile: %s" % npz_path) return error_profile def introduce_error_scores(self, record, orientation): @@ -57,16 +56,14 @@ def introduce_error_scores(self, record, orientation): record (SeqRecord): a read record orientation (string): orientation of the read. Can be 'forward' or 'reverse' - + Returns: SeqRecord: a read record with error scores """ - if orientation == 'forward': - record.letter_annotations["phred_quality"] = self.gen_phred_scores( - self.quality_forward, 'forward') - elif orientation == 'reverse': - record.letter_annotations["phred_quality"] = self.gen_phred_scores( - self.quality_reverse, 'reverse') + if orientation == "forward": + record.letter_annotations["phred_quality"] = self.gen_phred_scores(self.quality_forward, "forward") + elif orientation == "reverse": + record.letter_annotations["phred_quality"] = self.gen_phred_scores(self.quality_reverse, "reverse") return record def mut_sequence(self, record, orientation): @@ -85,20 +82,19 @@ def mut_sequence(self, record, orientation): """ # get the right subst_matrix - if orientation == 'forward': + if orientation == "forward": nucl_choices = self.subst_choices_for - elif orientation == 'reverse': + elif orientation == "reverse": nucl_choices = self.subst_choices_rev mutable_seq = MutableSeq(record.seq) quality_list = record.letter_annotations["phred_quality"] position = 0 for nucl, qual in zip(mutable_seq, quality_list): - if random.random() > util.phred_to_prob(qual) \ - and nucl.upper() not in 'RYWSMKHBVDN': - mutable_seq[position] = str(np.random.choice( - nucl_choices[position][nucl.upper()][0], - p=nucl_choices[position][nucl.upper()][1])) + if random.random() > util.phred_to_prob(qual) and nucl.upper() not in "RYWSMKHBVDN": + mutable_seq[position] = str( + np.random.choice(nucl_choices[position][nucl.upper()][0], p=nucl_choices[position][nucl.upper()][1]) + ) position += 1 return Seq(mutable_seq) @@ -130,20 +126,19 @@ def adjust_seq_length(self, mut_seq, orientation, full_sequence, bounds): return Seq(mut_seq) else: # len smaller to_add = self.read_length - len(mut_seq) - if orientation == 'forward': + if orientation == "forward": for i in range(to_add): if read_end + i >= len(full_sequence): - nucl_to_add = 'A' + nucl_to_add = "A" else: nucl_to_add = str(full_sequence[read_end + i]) mut_seq.append(nucl_to_add) - elif orientation == 'reverse': + elif orientation == "reverse": for i in range(to_add): if read_start - 1 - i < 0: - nucl_to_add = 'A' + nucl_to_add = "A" else: - nucl_to_add = util.rev_comp( - full_sequence[read_start-1 - i]) + nucl_to_add = util.rev_comp(full_sequence[read_start - 1 - i]) mut_seq.append(nucl_to_add) return Seq(mut_seq) @@ -167,10 +162,10 @@ def introduce_indels(self, record, orientation, full_seq, bounds): """ # get the right indel arrays - if orientation == 'forward': + if orientation == "forward": insertions = self.ins_for deletions = self.del_for - elif orientation == 'reverse': + elif orientation == "reverse": insertions = self.ins_rev deletions = self.del_rev @@ -179,7 +174,7 @@ def introduce_indels(self, record, orientation, full_seq, bounds): for nucl in range(self.read_length - 1): try: # skip ambiguous nucleotides - if mutable_seq[nucl].upper() in 'RYWSMKHBVDN': + if mutable_seq[nucl].upper() in "RYWSMKHBVDN": position += 1 continue for nucl_to_insert, prob in insertions[position].items(): @@ -189,9 +184,8 
@@ def introduce_indels(self, record, orientation, full_seq, bounds): if random.random() < deletions[position][mutable_seq[nucl].upper()]: mutable_seq.pop(position) position += 1 - except IndexError as e: + except IndexError: continue - seq = self.adjust_seq_length( - mutable_seq, orientation, full_seq, bounds) + seq = self.adjust_seq_length(mutable_seq, orientation, full_seq, bounds) return seq diff --git a/iss/error_models/basic.py b/iss/error_models/basic.py index a3efb70..6c30fc4 100644 --- a/iss/error_models/basic.py +++ b/iss/error_models/basic.py @@ -1,11 +1,11 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import numpy as np + from iss import util from iss.error_models import ErrorModel -import numpy as np - class BasicErrorModel(ErrorModel): """Basic Error Model class @@ -14,21 +14,25 @@ class BasicErrorModel(ErrorModel): Only substitutions errors occur. The substitution rate is assumed equal between all nucleotides. """ + def __init__(self): super().__init__() self.read_length = 125 self.insert_size = 200 self.quality_forward = self.quality_reverse = 30 - self.subst_choices_for = self.subst_choices_rev = [{ - 'A': (['T', 'C', 'G'], [1/3, 1/3, 1/3]), - 'T': (['A', 'C', 'G'], [1/3, 1/3, 1/3]), - 'C': (['A', 'T', 'G'], [1/3, 1/3, 1/3]), - 'G': (['A', 'T', 'C'], [1/3, 1/3, 1/3]) - } for _ in range(self.read_length)] - - self.ins_for = self.ins_rev = self.del_for = self.del_rev = [{ - 'A': 0.0, 'T': 0.0, 'C': 0.0, 'G': 0.0 - } for _ in range(125)] + self.subst_choices_for = self.subst_choices_rev = [ + { + "A": (["T", "C", "G"], [1 / 3, 1 / 3, 1 / 3]), + "T": (["A", "C", "G"], [1 / 3, 1 / 3, 1 / 3]), + "C": (["A", "T", "G"], [1 / 3, 1 / 3, 1 / 3]), + "G": (["A", "T", "C"], [1 / 3, 1 / 3, 1 / 3]), + } + for _ in range(self.read_length) + ] + + self.ins_for = self.ins_rev = self.del_for = self.del_rev = [ + {"A": 0.0, "T": 0.0, "C": 0.0, "G": 0.0} for _ in range(125) + ] def gen_phred_scores(self, mean_quality, orientation): """Generate a normal distribution, transform to phred scores @@ -42,8 +46,7 @@ def gen_phred_scores(self, mean_quality, orientation): Returns: list: list of phred scores following a normal distribution """ - norm = [min(q, 0.9999) for q in np.random.normal( - util.phred_to_prob(mean_quality), 0.01, self.read_length)] + norm = [min(q, 0.9999) for q in np.random.normal(util.phred_to_prob(mean_quality), 0.01, self.read_length)] phred = [util.prob_to_phred(p) for p in norm] return phred diff --git a/iss/error_models/kde.py b/iss/error_models/kde.py index f59f7c4..0cf8d88 100644 --- a/iss/error_models/kde.py +++ b/iss/error_models/kde.py @@ -1,10 +1,10 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from iss.error_models import ErrorModel - import numpy as np +from iss.error_models import ErrorModel + class KDErrorModel(ErrorModel): """KDErrorModel class. 
@@ -24,26 +24,26 @@ class KDErrorModel(ErrorModel): def __init__(self, npz_path): super().__init__() self.npz_path = npz_path - self.error_profile = self.load_npz(npz_path, 'kde') + self.error_profile = self.load_npz(npz_path, "kde") - self.read_length = self.error_profile['read_length'] - self.i_size_cdf = self.error_profile['insert_size'] + self.read_length = self.error_profile["read_length"] + self.i_size_cdf = self.error_profile["insert_size"] - self.mean_forward = self.error_profile['mean_count_forward'] - self.mean_reverse = self.error_profile['mean_count_reverse'] + self.mean_forward = self.error_profile["mean_count_forward"] + self.mean_reverse = self.error_profile["mean_count_reverse"] - self.quality_forward = self.error_profile['quality_hist_forward'] - self.quality_reverse = self.error_profile['quality_hist_reverse'] + self.quality_forward = self.error_profile["quality_hist_forward"] + self.quality_reverse = self.error_profile["quality_hist_reverse"] - self.subst_choices_for = self.error_profile['subst_choices_forward'] - self.subst_choices_rev = self.error_profile['subst_choices_reverse'] + self.subst_choices_for = self.error_profile["subst_choices_forward"] + self.subst_choices_rev = self.error_profile["subst_choices_reverse"] - self.ins_for = self.error_profile['ins_forward'] - self.ins_rev = self.error_profile['ins_reverse'] - self.del_for = self.error_profile['del_forward'] - self.del_rev = self.error_profile['del_reverse'] + self.ins_for = self.error_profile["ins_forward"] + self.ins_rev = self.error_profile["ins_reverse"] + self.del_for = self.error_profile["del_forward"] + self.del_rev = self.error_profile["del_reverse"] - self.error_profile = '' # discard the error profile after reading + self.error_profile = "" # discard the error profile after reading def gen_phred_scores(self, cdfs, orientation): """Generate a list of phred scores based on cdfs and mean bins @@ -55,14 +55,14 @@ def gen_phred_scores(self, cdfs, orientation): cdfs (ndarray): array containing the cdfs orientation (string): orientation of the read. 
Can be 'forward' or 'reverse' - + Returns: list: a list of phred scores """ # choose which mean bin to use - if orientation == 'forward': + if orientation == "forward": mean = self.mean_forward - elif orientation == 'reverse': + elif orientation == "reverse": mean = self.mean_reverse norm_mean = mean / sum(mean) @@ -72,14 +72,14 @@ def gen_phred_scores(self, cdfs, orientation): # is 0.95) to best quality bin if quality_bin == 4: quality_bin = 3 - + cdfs_bin = cdfs[quality_bin] phred_list = [] for cdf in cdfs_bin: random_quality = np.searchsorted(cdf, np.random.rand()) phred_list.append(random_quality) - return phred_list[:self.read_length] + return phred_list[: self.read_length] def random_insert_size(self): """Draw a random insert size from the insert size cdf diff --git a/iss/error_models/perfect.py b/iss/error_models/perfect.py index 48a876f..2704780 100644 --- a/iss/error_models/perfect.py +++ b/iss/error_models/perfect.py @@ -17,16 +17,19 @@ def __init__(self): self.insert_size = 200 self.quality_forward = self.quality_reverse = 40 - self.subst_choices_for = self.subst_choices_rev = [{ - 'A': (['A', 'T', 'C', 'G'], [1, 0, 0, 0]), - 'T': (['A', 'T', 'C', 'G'], [0, 1, 0, 0]), - 'C': (['A', 'T', 'C', 'G'], [0, 0, 1, 0]), - 'G': (['A', 'T', 'C', 'G'], [0, 0, 0, 1]) - } for _ in range(self.read_length)] - - self.ins_for = self.ins_rev = self.del_for = self.del_rev = [{ - 'A': 0.0, 'T': 0.0, 'C': 0.0, 'G': 0.0 - } for _ in range(self.read_length)] + self.subst_choices_for = self.subst_choices_rev = [ + { + "A": (["A", "T", "C", "G"], [1, 0, 0, 0]), + "T": (["A", "T", "C", "G"], [0, 1, 0, 0]), + "C": (["A", "T", "C", "G"], [0, 0, 1, 0]), + "G": (["A", "T", "C", "G"], [0, 0, 0, 1]), + } + for _ in range(self.read_length) + ] + + self.ins_for = self.ins_rev = self.del_for = self.del_rev = [ + {"A": 0.0, "T": 0.0, "C": 0.0, "G": 0.0} for _ in range(self.read_length) + ] def gen_phred_scores(self, mean_quality, orientation): """Fake randorm function returning the distribution of Phred diff --git a/iss/generator.py b/iss/generator.py index 7c70a41..dbcb7d2 100644 --- a/iss/generator.py +++ b/iss/generator.py @@ -1,21 +1,20 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from iss.util import load, rev_comp +import logging +import random +import sys +import numpy as np from Bio import SeqIO from Bio.Seq import Seq -from Bio.SeqUtils import gc_fraction from Bio.SeqRecord import SeqRecord +from Bio.SeqUtils import gc_fraction -import sys -import random -import logging -import numpy as np +from iss.util import load, rev_comp -def reads(record, ErrorModel, n_pairs, cpu_number, output, seed, sequence_type, - gc_bias=False, mode="default"): +def reads(record, ErrorModel, n_pairs, cpu_number, output, seed, sequence_type, gc_bias=False, mode="default"): """Simulate reads from one genome (or sequence) according to an ErrorModel This function makes use of the `simulate_read` function to simulate reads @@ -46,25 +45,20 @@ def reads(record, ErrorModel, n_pairs, cpu_number, output, seed, sequence_type, if seed is not None: random.seed(seed + cpu_number) np.random.seed(seed + cpu_number) - logger.debug( - 'Cpu #%s: Generating %s read pairs' - % (cpu_number, n_pairs)) + logger.debug("Cpu #%s: Generating %s read pairs" % (cpu_number, n_pairs)) read_tuple_list = [] i = 0 while i < n_pairs: # try: # forward, reverse = simulate_read(record, ErrorModel, i) - # except ValueError as e: + # except ValueError: # logger.error('Skipping this record: %s' % record.id) # return try: forward, reverse = simulate_read(record, 
ErrorModel, i, cpu_number, sequence_type) - except AssertionError as e: - logger.warning( - '%s shorter than read length for this ErrorModel' % record.id) - logger.warning( - 'Skipping %s. You will have less reads than specified' - % record.id) + except AssertionError: + logger.warning("%s shorter than read length for this ErrorModel" % record.id) + logger.warning("Skipping %s. You will have less reads than specified" % record.id) break else: if gc_bias: @@ -82,7 +76,7 @@ def reads(record, ErrorModel, n_pairs, cpu_number, output, seed, sequence_type, read_tuple_list.append((forward, reverse)) i += 1 - temp_file_name = output + '.iss.tmp.%s.%s' % (record.id, cpu_number) + temp_file_name = output + ".iss.tmp.%s.%s" % (record.id, cpu_number) to_fastq(read_tuple_list, temp_file_name) return temp_file_name @@ -118,35 +112,28 @@ def simulate_read(record, ErrorModel, i, cpu_number, sequence_type): # assign the start position of the forward read # if sequence_type == metagenomics, get a random start position # if sequence_type == amplicon, start position is the start of the read - if sequence_type == 'metagenomics': - forward_start = random.randrange( - 0, len(record.seq) - (2 * read_length + insert_size)) - elif sequence_type == 'amplicon': + if sequence_type == "metagenomics": + forward_start = random.randrange(0, len(record.seq) - (2 * read_length + insert_size)) + elif sequence_type == "amplicon": forward_start = 0 else: raise RuntimeError(f"sequence type '{sequence_type}' is not supported") - except AssertionError as e: + except AssertionError: raise except ValueError as e: - logger.debug( - '%s shorter than template length for this ErrorModel:%s' - % (record.id, e)) - forward_start = max(0, random.randrange( - 0, len(record.seq) - read_length)) + logger.debug("%s shorter than template length for this ErrorModel:%s" % (record.id, e)) + forward_start = max(0, random.randrange(0, len(record.seq) - read_length)) forward_end = forward_start + read_length bounds = (forward_start, forward_end) # create a perfect read forward = SeqRecord( - Seq(str(sequence[forward_start:forward_end])), - id='%s_%s_%s/1' % (header, i, cpu_number), - description='' + Seq(str(sequence[forward_start:forward_end])), id="%s_%s_%s/1" % (header, i, cpu_number), description="" ) # add the indels, the qual scores and modify the record accordingly - forward.seq = ErrorModel.introduce_indels( - forward, 'forward', sequence, bounds) - forward = ErrorModel.introduce_error_scores(forward, 'forward') - forward.seq = ErrorModel.mut_sequence(forward, 'forward') + forward.seq = ErrorModel.introduce_indels(forward, "forward", sequence, bounds) + forward = ErrorModel.introduce_error_scores(forward, "forward") + forward.seq = ErrorModel.mut_sequence(forward, "forward") # generate the reverse read # assign start position reverse read @@ -169,15 +156,14 @@ def simulate_read(record, ErrorModel, i, cpu_number, sequence_type): # create a perfect read reverse = SeqRecord( Seq(rev_comp(str(sequence[reverse_start:reverse_end]))), - id='%s_%s_%s/2' % (header, i, cpu_number), - description='' + id="%s_%s_%s/2" % (header, i, cpu_number), + description="", ) # add the indels, the qual scores and modify the record accordingly - reverse.seq = ErrorModel.introduce_indels( - reverse, 'reverse', sequence, bounds) - reverse = ErrorModel.introduce_error_scores(reverse, 'reverse') - reverse.seq = ErrorModel.mut_sequence(reverse, 'reverse') + reverse.seq = ErrorModel.introduce_indels(reverse, "reverse", sequence, bounds) + reverse = 
ErrorModel.introduce_error_scores(reverse, "reverse") + reverse.seq = ErrorModel.mut_sequence(reverse, "reverse") return (forward, reverse) @@ -194,17 +180,17 @@ def to_fastq(generator, output): """ logger = logging.getLogger(__name__) # define name of output files - output_forward = output + '_R1.fastq' - output_reverse = output + '_R2.fastq' + output_forward = output + "_R1.fastq" + output_reverse = output + "_R2.fastq" try: - f = open(output_forward, 'a') - r = open(output_reverse, 'a') + f = open(output_forward, "a") + r = open(output_reverse, "a") except PermissionError as e: - logger.error('Failed to open output file(s): %s' % e) + logger.error("Failed to open output file(s): %s" % e) sys.exit(1) else: with f, r: for read_tuple in generator: - SeqIO.write(read_tuple[0], f, 'fastq-sanger') - SeqIO.write(read_tuple[1], r, 'fastq-sanger') + SeqIO.write(read_tuple[0], f, "fastq-sanger") + SeqIO.write(read_tuple[1], r, "fastq-sanger") diff --git a/iss/modeller.py b/iss/modeller.py index a38741b..3d856a4 100644 --- a/iss/modeller.py +++ b/iss/modeller.py @@ -1,11 +1,12 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from iss import util -from scipy import stats - import logging + import numpy as np +from scipy import stats + +from iss import util def insert_size(insert_size_distribution): @@ -19,12 +20,8 @@ def insert_size(insert_size_distribution): Returns: 1darray: a cumulative density function """ - kde = stats.gaussian_kde( - insert_size_distribution, - bw_method=0.2 / np.std(insert_size_distribution, ddof=1)) - x_grid = np.linspace( - min(insert_size_distribution), - max(insert_size_distribution), 1000) + kde = stats.gaussian_kde(insert_size_distribution, bw_method=0.2 / np.std(insert_size_distribution, ddof=1)) + x_grid = np.linspace(min(insert_size_distribution), max(insert_size_distribution), 1000) kde = kde.evaluate(x_grid) cdf = np.cumsum(kde) cdf = cdf / cdf[-1] @@ -44,7 +41,7 @@ def divide_qualities_into_bins(qualities, n_bins=4): list: a list of lists containing the binned quality scores """ logger = logging.getLogger(__name__) - logger.debug('Dividing qualities into mean clusters') + logger.debug("Dividing qualities into mean clusters") bin_lists = [[] for _ in range(n_bins)] # create list of `n_bins` list ranges = np.split(np.array(range(40)), n_bins) for quality in qualities: @@ -76,15 +73,14 @@ def quality_bins_to_histogram(bin_lists): i = 0 for qual_bin in bin_lists: if len(qual_bin) > 1: - logger.debug('Transposing matrix for mean cluster #%s' % i) + logger.debug("Transposing matrix for mean cluster #%s" % i) # quals = np.asarray(qual_bin).T # seems to make clunkier models quals = [q for q in zip(*qual_bin)] - logger.debug( - 'Modelling quality distribution for mean cluster #%s' % i) + logger.debug("Modelling quality distribution for mean cluster #%s" % i) cdfs_list = raw_qualities_to_histogram(quals) cdf_bins.append(cdfs_list) else: - logger.debug('Mean quality bin #%s of length < 1. Skipping' % i) + logger.debug("Mean quality bin #%s of length < 1. Skipping" % i) cdf_bins.append([]) i += 1 return cdf_bins @@ -103,17 +99,17 @@ def raw_qualities_to_histogram(qualities): list: list of cumulative distribution functions. One cdf per base. 
The list has the size of the read length """ - logger = logging.getLogger(__name__) + # logger = logging.getLogger(__name__) # moved this in quality_bins_to_histogram to try parallelization # quals = util.split_list([i for i in zip(*qualities)], n_parts=cpus) cdfs_list = [] for q in qualities: - numpy_log_handler = np.seterrcall(util.nplog) - with np.errstate(under='ignore', divide='call'): + np.seterrcall(util.nplog) + with np.errstate(under="ignore", divide="call"): try: kde = stats.gaussian_kde(q, bw_method=0.2 / np.std(q, ddof=1)) - except np.linalg.linalg.LinAlgError as e: + except np.linalg.linalg.LinAlgError: # if np.std of array is 0, we modify the array slightly to be # able to divide by ~ np.std # this will print a FloatingPointError in DEBUG mode @@ -155,22 +151,22 @@ def dispatch_subst(base, read, read_has_indels): third element: True if an indel has been detected, False otherwise """ dispatch_dict = { - 'AA': 0, - 'aT': 1, - 'aG': 2, - 'aC': 3, - 'TT': 4, - 'tA': 5, - 'tG': 6, - 'tC': 7, - 'CC': 8, - 'cA': 9, - 'cT': 10, - 'cG': 11, - 'GG': 12, - 'gA': 13, - 'gT': 14, - 'gC': 15 + "AA": 0, + "aT": 1, + "aG": 2, + "aC": 3, + "TT": 4, + "tA": 5, + "tG": 6, + "tC": 7, + "CC": 8, + "cA": 9, + "cT": 10, + "cG": 11, + "GG": 12, + "gA": 13, + "gT": 14, + "gC": 15, } query_pos = base[0] @@ -210,53 +206,41 @@ def subst_matrix_to_choices(substitution_matrix, read_length): nucl_choices_list = [] for pos in range(read_length): sums = { - 'A': np.sum(substitution_matrix[pos][1:4]), - 'T': np.sum(substitution_matrix[pos][5:8]), - 'C': np.sum(substitution_matrix[pos][9:12]), - 'G': np.sum(substitution_matrix[pos][13:]) + "A": np.sum(substitution_matrix[pos][1:4]), + "T": np.sum(substitution_matrix[pos][5:8]), + "C": np.sum(substitution_matrix[pos][9:12]), + "G": np.sum(substitution_matrix[pos][13:]), } # we want to avoid 'na' in the data so we raise FloatingPointError # if we try to divide by 0 (no count data for that nucl at that pos) # we assume equal rate of substitution - with np.errstate(all='raise'): + with np.errstate(all="raise"): nucl_choices = {} try: - A = ( - ['T', 'C', 'G'], - [count / sums['A'] for - count in substitution_matrix[pos][1:4]]) + A = (["T", "C", "G"], [count / sums["A"] for count in substitution_matrix[pos][1:4]]) except FloatingPointError as e: logger.debug(e, exc_info=True) - A = (['T', 'C', 'G'], [1/3, 1/3, 1/3]) + A = (["T", "C", "G"], [1 / 3, 1 / 3, 1 / 3]) try: - T = ( - ['A', 'C', 'G'], - [count / sums['T'] for - count in substitution_matrix[pos][5:8]]) + T = (["A", "C", "G"], [count / sums["T"] for count in substitution_matrix[pos][5:8]]) except FloatingPointError as e: logger.debug(e, exc_info=True) - T = (['A', 'C', 'G'], [1/3, 1/3, 1/3]) + T = (["A", "C", "G"], [1 / 3, 1 / 3, 1 / 3]) try: - C = ( - ['A', 'T', 'G'], - [count / sums['C'] for - count in substitution_matrix[pos][9:12]]) + C = (["A", "T", "G"], [count / sums["C"] for count in substitution_matrix[pos][9:12]]) except FloatingPointError as e: logger.debug(e, exc_info=True) - C = (['A', 'T', 'G'], [1/3, 1/3, 1/3]) + C = (["A", "T", "G"], [1 / 3, 1 / 3, 1 / 3]) try: - G = ( - ['A', 'T', 'C'], - [count / sums['G'] for - count in substitution_matrix[pos][13:]]) + G = (["A", "T", "C"], [count / sums["G"] for count in substitution_matrix[pos][13:]]) except FloatingPointError as e: logger.debug(e, exc_info=True) - G = (['A', 'T', 'C'], [1/3, 1/3, 1/3]) + G = (["A", "T", "C"], [1 / 3, 1 / 3, 1 / 3]) - nucl_choices['A'] = A - nucl_choices['T'] = T - nucl_choices['C'] = C - nucl_choices['G'] = G + 
nucl_choices["A"] = A + nucl_choices["T"] = T + nucl_choices["C"] = C + nucl_choices["G"] = G nucl_choices_list.append(nucl_choices) return nucl_choices_list @@ -282,43 +266,33 @@ def dispatch_indels(read): """ logger = logging.getLogger(__name__) - dispatch_indels = { - 0: 0, - 'A1': 1, - 'T1': 2, - 'C1': 3, - 'G1': 4, - 'A2': 5, - 'T2': 6, - 'C2': 7, - 'G2': 8 - } + dispatch_indels = {0: 0, "A1": 1, "T1": 2, "C1": 3, "G1": 4, "A2": 5, "T2": 6, "C2": 7, "G2": 8} position = 0 - for (cigar_type, cigar_length) in read.cigartuples: + for cigar_type, cigar_length in read.cigartuples: if cigar_type == 0: # match position += cigar_length continue elif cigar_type == 1: # insertion query_base = read.query_sequence[position] - insertion = query_base.upper() + '1' + insertion = query_base.upper() + "1" try: indel = dispatch_indels[insertion] dispatch_tuple = (position, indel) position += cigar_length - except KeyError as e: # we avoid ambiguous bases + except KeyError: # we avoid ambiguous bases # logger.debug( # '%s not in dispatch: %s' % (insertion, e), exc_info=True) position += cigar_length continue elif cigar_type == 2: # deletion ref_base = read.query_alignment_sequence[position] - deletion = ref_base.upper() + '2' + deletion = ref_base.upper() + "2" try: indel = dispatch_indels[deletion] dispatch_tuple = (position, indel) position -= cigar_length - except KeyError as e: # we avoid ambiguous bases + except KeyError: # we avoid ambiguous bases # logger.debug( # '%s not in dispatch: %s' % (deletion, e), exc_info=True) position -= cigar_length @@ -352,16 +326,16 @@ def indel_matrix_to_choices(indel_matrix, read_length): del_choices = [] for pos in range(read_length): insertions = { - 'A': indel_matrix[pos][1] / indel_matrix[pos][0], - 'T': indel_matrix[pos][2] / indel_matrix[pos][0], - 'C': indel_matrix[pos][3] / indel_matrix[pos][0], - 'G': indel_matrix[pos][4] / indel_matrix[pos][0] + "A": indel_matrix[pos][1] / indel_matrix[pos][0], + "T": indel_matrix[pos][2] / indel_matrix[pos][0], + "C": indel_matrix[pos][3] / indel_matrix[pos][0], + "G": indel_matrix[pos][4] / indel_matrix[pos][0], } deletions = { - 'A': indel_matrix[pos][5] / indel_matrix[pos][0], - 'T': indel_matrix[pos][6] / indel_matrix[pos][0], - 'C': indel_matrix[pos][7] / indel_matrix[pos][0], - 'G': indel_matrix[pos][8] / indel_matrix[pos][0] + "A": indel_matrix[pos][5] / indel_matrix[pos][0], + "T": indel_matrix[pos][6] / indel_matrix[pos][0], + "C": indel_matrix[pos][7] / indel_matrix[pos][0], + "G": indel_matrix[pos][8] / indel_matrix[pos][0], } ins_choices.append(insertions) del_choices.append(deletions) diff --git a/iss/test/test_abundance.py b/iss/test/test_abundance.py index 3fc561e..4e42968 100644 --- a/iss/test/test_abundance.py +++ b/iss/test/test_abundance.py @@ -1,78 +1,51 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import pytest - -from iss import util -from iss import abundance - import numpy as np +import pytest +from iss import abundance, util -def setup_function(): - output_file_prefix = 'data/.test' - - -def teardown_cleanup(): - util.cleanup(['data/test_abundance.txt']) @pytest.fixture def setup_and_teardown(): - setup_function() yield - teardown_cleanup() + util.cleanup(["data/test_abundance.txt"]) def test_parsing(): - abundance_dic = abundance.parse_abundance_file('data/abundance.txt') - assert abundance_dic == { - 'genome_ATCG': 0.1, - 'genome_TA': 0.1, - 'genome_A': 0.2, - 'genome_GC': 0.4, - 'genome_T': 0.2 - } + abundance_dic = abundance.parse_abundance_file("data/abundance.txt") + assert 
abundance_dic == {"genome_ATCG": 0.1, "genome_TA": 0.1, "genome_A": 0.2, "genome_GC": 0.4, "genome_T": 0.2} -def test_parsing_readcounts(): - readcount_dic = abundance.parse_readcount_file('data/readcounts.txt') - assert readcount_dic == { - 'amplicon_ATCG': 1, - 'amplicon_TA': 1, - 'amplicon_A': 2, - 'amplicon_GC': 4, - 'amplicon_T': 2 - } +def test_parsing_readcounts(): + readcount_dic = abundance.parse_readcount_file("data/readcounts.txt") + assert readcount_dic == {"amplicon_ATCG": 1, "amplicon_TA": 1, "amplicon_A": 2, "amplicon_GC": 4, "amplicon_T": 2} def test_parsing_empty(): with pytest.raises(SystemExit): - abundance_dic = abundance.parse_abundance_file('data/empty_file') + abundance.parse_abundance_file("data/empty_file") def test_parsing_no_exists(): with pytest.raises(SystemExit): - abundance_dic = abundance.parse_abundance_file('data/does_not_exist') + abundance.parse_abundance_file("data/does_not_exist") def test_parsing_bad_abundance(): with pytest.raises(SystemExit): - abundance_dic = abundance.parse_abundance_file('data/bad_abundance.txt') + abundance.parse_abundance_file("data/bad_abundance.txt") def test_cov_calc(): - coverage_ecoli = abundance.to_coverage( - 10000000, - 0.08, - 150, - 4639221 - ) + coverage_ecoli = abundance.to_coverage(10000000, 0.08, 150, 4639221) assert round(coverage_ecoli, 3) == 25.866 def test_distributions(): np.random.seed(42) - f = open('data/genomes.fasta', 'r') + f = open("data/genomes.fasta", "r") with f: # count the number of records record_list = util.count_records(f) @@ -82,47 +55,44 @@ def test_distributions(): lognormal_dic = abundance.lognormal(record_list) np.random.seed(42) # reset the seed to get 0s in zero_inflated_lognormal - zero_inflated_lognormal_dic = abundance.zero_inflated_lognormal( - record_list) + zero_inflated_lognormal_dic = abundance.zero_inflated_lognormal(record_list) assert list(uniform_dic.values()) == [0.2] * 5 - assert round(halfnormal_dic['genome_A'], 2) == 0.16 - assert round(exponential_dic['genome_A'], 2) == 0.01 - assert round(lognormal_dic['genome_T'], 2) == 0.19 - assert zero_inflated_lognormal_dic['genome_T'] == 0.0 - assert round(zero_inflated_lognormal_dic['genome_A'], 2) == 0.44 + assert round(halfnormal_dic["genome_A"], 2) == 0.16 + assert round(exponential_dic["genome_A"], 2) == 0.01 + assert round(lognormal_dic["genome_T"], 2) == 0.19 + assert zero_inflated_lognormal_dic["genome_T"] == 0.0 + assert round(zero_inflated_lognormal_dic["genome_A"], 2) == 0.44 def test_abunance_draft(setup_and_teardown): - abundance_dic = {'genome_A': 0.15511887441170918, - 'genome_T': 0.08220476760848751, - 'genome_GC': 0.18039811160555874, - 'genome_ATCG': 0.4329003045949206, - 'genome_TA': 0.07468835777633397, - 'contig_1': 0.02776920430880394, - 'contig_2': 0.011490705231229217, - 'contig_3': 0.03542967446295675} + abundance_dic = { + "genome_A": 0.15511887441170918, + "genome_T": 0.08220476760848751, + "genome_GC": 0.18039811160555874, + "genome_ATCG": 0.4329003045949206, + "genome_TA": 0.07468835777633397, + "contig_1": 0.02776920430880394, + "contig_2": 0.011490705231229217, + "contig_3": 0.03542967446295675, + } np.random.seed(42) - f = open('data/genomes.fasta', 'r') + f = open("data/genomes.fasta", "r") with f: # count the number of records complete_genomes = util.count_records(f) - draft_genomes = ['data/draft.fasta'] - ab = abundance.draft( - complete_genomes, - draft_genomes, - abundance.lognormal, - 'data/test') + draft_genomes = ["data/draft.fasta"] + ab = abundance.draft(complete_genomes, draft_genomes, 
abundance.lognormal, "data/test") for tv, v in zip(abundance_dic.values(), ab.values()): assert round(tv) == round(v) # assert_almost_equal(ab, abundance_dic) def test_coverage_scaling(): - abundance_dic = abundance.parse_abundance_file('data/abundance.txt') - coverage_dic = abundance.coverage_scaling(10000, abundance_dic, - 'data/genomes.fasta', 25) - assert coverage_dic == {'genome_A': 136.6120218579235, - 'genome_T': 136.6120218579235, - 'genome_GC': 273.224043715847, - 'genome_ATCG': 68.30601092896175, - 'genome_TA': 68.30601092896175 - } + abundance_dic = abundance.parse_abundance_file("data/abundance.txt") + coverage_dic = abundance.coverage_scaling(10000, abundance_dic, "data/genomes.fasta", 25) + assert coverage_dic == { + "genome_A": 136.6120218579235, + "genome_T": 136.6120218579235, + "genome_GC": 273.224043715847, + "genome_ATCG": 68.30601092896175, + "genome_TA": 68.30601092896175, + } diff --git a/iss/test/test_bam.py b/iss/test/test_bam.py index aa52545..d67a02d 100644 --- a/iss/test/test_bam.py +++ b/iss/test/test_bam.py @@ -1,23 +1,23 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import os + import pytest from iss import bam -import os - def test_read_fail(): with pytest.raises(SystemExit): - bam_file = 'data/empty_file' + bam_file = "data/empty_file" bam_reader = bam.read_bam(bam_file) for read in bam_reader: print(read) def test_to_model(): - bam_file = 'data/ecoli.bam' - output = 'data/test_bam' + bam_file = "data/ecoli.bam" + output = "data/test_bam" bam.to_model(bam_file, output) - os.remove(output + '.npz') + os.remove(output + ".npz") diff --git a/iss/test/test_download.py b/iss/test/test_download.py index 60e0b5e..fbd60d2 100644 --- a/iss/test/test_download.py +++ b/iss/test/test_download.py @@ -3,31 +3,23 @@ import pytest -from Bio.Seq import Seq -from Bio.SeqRecord import SeqRecord - from iss import download from iss.util import cleanup -def setup_function(): - output_file_prefix = 'data/.test' - - -def teardown_cleanup(): - cleanup(['data/test_download.fasta']) - @pytest.fixture def setup_and_teardown(): - setup_function() yield - teardown_cleanup() + cleanup(["data/test_download.fasta"]) def download_to_fasta(setup_and_teardown): - ftp_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/737/615/GCF_000737615.1_ASM73761v1/GCF_000737615.1_ASM73761v1_genomic.fna.gz' - download.assembly_to_fasta(ftp_url, 'data/test_download.fasta') + ftp_url = ( + "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/737/615/GCF_000737615.1_ASM73761v1/" + "GCF_000737615.1_ASM73761v1_genomic.fna.gz" + ) + download.assembly_to_fasta(ftp_url, "data/test_download.fasta") def test_ncbi(setup_and_teardown): - genome_list = download.ncbi('bacteria', 2, 'data/test_download.fasta') + download.ncbi("bacteria", 2, "data/test_download.fasta") diff --git a/iss/test/test_error_model.py b/iss/test/test_error_model.py index b6bf441..6835665 100644 --- a/iss/test/test_error_model.py +++ b/iss/test/test_error_model.py @@ -1,22 +1,21 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import pytest - -from iss.error_models import basic, kde, perfect -from iss.util import rev_comp +import random +import numpy as np +import pytest from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord -import random -import numpy as np +from iss.error_models import basic, kde, perfect +from iss.util import rev_comp def test_perfect_phred(): err_mod = perfect.PerfectErrorModel() - distribution = err_mod.gen_phred_scores(20, 'forward')[:10] + distribution = err_mod.gen_phred_scores(20, "forward")[:10] assert 
distribution == [40, 40, 40, 40, 40, 40, 40, 40, 40, 40] @@ -24,15 +23,14 @@ def test_basic_phred(): np.random.seed(42) err_mod = basic.BasicErrorModel() - distribution = err_mod.gen_phred_scores(20, 'forward')[:10] + distribution = err_mod.gen_phred_scores(20, "forward")[:10] assert distribution == [23, 19, 25, 40, 19, 19, 40, 26, 18, 23] def test_kde_phred(): np.random.seed(42) - err_mod = kde.KDErrorModel('data/ecoli.npz') - distribution = err_mod.gen_phred_scores(err_mod.quality_reverse, - 'reverse')[10:] + err_mod = kde.KDErrorModel("data/ecoli.npz") + distribution = err_mod.gen_phred_scores(err_mod.quality_reverse, "reverse")[10:] assert distribution == [40, 40, 40, 40, 40, 40, 40, 40, 10, 10] @@ -40,12 +38,8 @@ def test_introduce_errors(): np.random.seed(42) err_mod = basic.BasicErrorModel() - read = SeqRecord( - Seq(str('AATGC' * 25)), - id='read_1', - description='test read' - ) - read = err_mod.introduce_error_scores(read, 'forward') + read = SeqRecord(Seq(str("AATGC" * 25)), id="read_1", description="test read") + read = err_mod.introduce_error_scores(read, "forward") qualities = read.letter_annotations["phred_quality"][:10] assert qualities == [40, 26, 40, 40, 25, 25, 40, 40, 22, 40] @@ -56,14 +50,10 @@ def test_mut_sequence(): err_mod = basic.BasicErrorModel() - read = SeqRecord( - Seq(str('AAAAA' * 25)), - id='read_1', - description='test read' - ) + read = SeqRecord(Seq(str("AAAAA" * 25)), id="read_1", description="test read") read.letter_annotations["phred_quality"] = [5] * 125 - read.seq = err_mod.mut_sequence(read, 'forward') - assert str(read.seq[:10]) == 'AAAACAGAAA' + read.seq = err_mod.mut_sequence(read, "forward") + assert str(read.seq[:10]) == "AAAACAGAAA" def test_introduce_indels(): @@ -71,75 +61,50 @@ def test_introduce_indels(): np.random.seed(42) err_mod = basic.BasicErrorModel() - err_mod.ins_for[1]['G'] = 1.0 - err_mod.del_for[0]['A'] = 1.0 + err_mod.ins_for[1]["G"] = 1.0 + err_mod.del_for[0]["A"] = 1.0 bounds = (5, 130) - read = SeqRecord( - Seq(str('ATATA' * 25)), - id='read_1', - description='test read' - ) - ref_genome = SeqRecord( - Seq(str('ATATA' * 100)), - id='ref_genome', - description='test reference' - ) - read.seq = err_mod.introduce_indels( - read, 'forward', ref_genome, bounds) + read = SeqRecord(Seq(str("ATATA" * 25)), id="read_1", description="test read") + ref_genome = SeqRecord(Seq(str("ATATA" * 100)), id="ref_genome", description="test reference") + read.seq = err_mod.introduce_indels(read, "forward", ref_genome, bounds) assert len(read.seq) == 125 - assert read.seq[:10] == 'ATGATAATAT' + assert read.seq[:10] == "ATGATAATAT" + def test_adjust_seq_length_extend(): random.seed(12) np.random.seed(12) - err_mod = kde.KDErrorModel('data/ecoli.npz') - err_mod.del_for[0]['A'] = 1.0 - err_mod.del_for[1]['T'] = 1.0 + err_mod = kde.KDErrorModel("data/ecoli.npz") + err_mod.del_for[0]["A"] = 1.0 + err_mod.del_for[1]["T"] = 1.0 bounds = (480, 500) - read = SeqRecord( - Seq(str('ATTTA' * 4)), - id='read_1', - description='test read' - ) - ref_genome = SeqRecord( - Seq(str('ATTTA' * 100)), - id='ref_genome', - description='test reference' - ) - read.seq = err_mod.introduce_indels( - read, 'forward', ref_genome, bounds) + read = SeqRecord(Seq(str("ATTTA" * 4)), id="read_1", description="test read") + ref_genome = SeqRecord(Seq(str("ATTTA" * 100)), id="ref_genome", description="test reference") + read.seq = err_mod.introduce_indels(read, "forward", ref_genome, bounds) assert len(read.seq) == 20 - assert read.seq[:10] == 'TTAATTTAAT' - assert read.seq[10:] 
== 'TTAATTTAAA' + assert read.seq[:10] == "TTAATTTAAT" + assert read.seq[10:] == "TTAATTTAAA" + def test_introduce_indels_rev(): random.seed(87) np.random.seed(87) - err_mod = kde.KDErrorModel('data/ecoli.npz') + err_mod = kde.KDErrorModel("data/ecoli.npz") - err_mod.del_rev[0]['C'] = 1.0 - err_mod.del_rev[1]['G'] = 1.0 + err_mod.del_rev[0]["C"] = 1.0 + err_mod.del_rev[1]["G"] = 1.0 bounds = (484, 504) - - ref_genome = SeqRecord( - Seq('GG'+ str('GTACC' * 100) + 'GG'), - id='ref_genome', - description='test reference' - ) - read = SeqRecord( - Seq(rev_comp(str(ref_genome.seq[484:504]))), - id='read_1', - description='test read' - ) - read.seq = err_mod.introduce_indels( - read, 'reverse', ref_genome, bounds) + + ref_genome = SeqRecord(Seq("GG" + str("GTACC" * 100) + "GG"), id="ref_genome", description="test reference") + read = SeqRecord(Seq(rev_comp(str(ref_genome.seq[484:504]))), id="read_1", description="test read") + read.seq = err_mod.introduce_indels(read, "reverse", ref_genome, bounds) assert len(read.seq) == 20 - assert read.seq == 'CGTACGGTACGGTACGGTAC' + assert read.seq == "CGTACGGTACGGTACGGTAC" def test_bad_err_mod(): with pytest.raises(SystemExit): - err_mod = kde.KDErrorModel('data/empty_file') + kde.KDErrorModel("data/empty_file") diff --git a/iss/test/test_generator.py b/iss/test/test_generator.py index fcc255c..7d4b1cc 100644 --- a/iss/test/test_generator.py +++ b/iss/test/test_generator.py @@ -1,75 +1,55 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import pytest - - -from iss import generator -from iss.util import cleanup -from iss.error_models import basic, kde +import os +import random +import sys +import numpy as np +import pytest from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord -import sys -import os -import random -import numpy as np +from iss import generator +from iss.error_models import basic, kde +from iss.util import cleanup # due to inconsistent seeding between python 2 and 3, some of the following # tests are disabled with python2 -def setup_function(): - output_file_prefix = 'data/.test' - - def teardown_cleanup(): - cleanup(['data/.test.iss.tmp.my_genome.0_R1.fastq', - 'data/.test.iss.tmp.my_genome.0_R2.fastq']) + cleanup(["data/.test.iss.tmp.my_genome.0_R1.fastq", "data/.test.iss.tmp.my_genome.0_R2.fastq"]) @pytest.fixture def setup_and_teardown(): - setup_function() yield - teardown_cleanup() + cleanup(["data/.test.iss.tmp.my_genome.0_R1.fastq", "data/.test.iss.tmp.my_genome.0_R2.fastq"]) def test_cleanup_fail(): with pytest.raises(SystemExit): - cleanup('data/does_not_exist') + cleanup("data/does_not_exist") def test_simulate_and_save(setup_and_teardown): err_mod = basic.BasicErrorModel() - ref_genome = SeqRecord( - Seq(str('AAAAACCCCC' * 100)), - id='my_genome', - description='test genome' - ) - generator.reads(ref_genome, err_mod, 1000, 0, 'data/.test', 0, 'metagenomics', gc_bias = True) + ref_genome = SeqRecord(Seq(str("AAAAACCCCC" * 100)), id="my_genome", description="test genome") + generator.reads(ref_genome, err_mod, 1000, 0, "data/.test", 0, "metagenomics", gc_bias=True) def test_simulate_and_save_short(setup_and_teardown): err_mod = basic.BasicErrorModel() - ref_genome = SeqRecord( - Seq(str('AACCC' * 100)), - id='my_genome', - description='test genome' - ) - generator.reads(ref_genome, err_mod, 1000, 0, 'data/.test', 0, 'metagenomics', gc_bias = True) + ref_genome = SeqRecord(Seq(str("AACCC" * 100)), id="my_genome", description="test genome") + generator.reads(ref_genome, err_mod, 1000, 0, "data/.test", 0, "metagenomics", 
gc_bias=True) def test_small_input(): with pytest.raises(AssertionError): - err_mod = kde.KDErrorModel('data/ecoli.npz') - ref_genome = SeqRecord( - Seq(str('AAAAACCCCC')), - id='my_genome', - description='test genome' - ) - generator.simulate_read(ref_genome, err_mod, 1, 0, 'metagenomics') + err_mod = kde.KDErrorModel("data/ecoli.npz") + ref_genome = SeqRecord(Seq(str("AAAAACCCCC")), id="my_genome", description="test genome") + generator.simulate_read(ref_genome, err_mod, 1, 0, "metagenomics") def test_basic(): @@ -77,72 +57,57 @@ def test_basic(): random.seed(42) np.random.seed(42) err_mod = basic.BasicErrorModel() - ref_genome = SeqRecord( - Seq(str('AAAAACCCCC' * 100)), - id='my_genome', - description='test genome' - ) - read_tuple = generator.simulate_read(ref_genome, err_mod, 1, 0, 'metagenomics') - big_read = ''.join(str(read_tuple[0].seq) + str(read_tuple[1].seq)) - assert big_read[-15:] == 'TTTTGGGGGTTTTTG' + ref_genome = SeqRecord(Seq(str("AAAAACCCCC" * 100)), id="my_genome", description="test genome") + read_tuple = generator.simulate_read(ref_genome, err_mod, 1, 0, "metagenomics") + big_read = "".join(str(read_tuple[0].seq) + str(read_tuple[1].seq)) + assert big_read[-15:] == "TTTTGGGGGTTTTTG" def test_kde(): if sys.version_info > (3,): random.seed(42) np.random.seed(42) - err_mod = kde.KDErrorModel('data/ecoli.npz') - ref_genome = SeqRecord( - Seq(str('CGTTTCAACC' * 400)), - id='my_genome', - description='test genome' - ) - read_tuple = generator.simulate_read(ref_genome, err_mod, 1, 0, 'metagenomics') - big_read = ''.join(str(read_tuple[0].seq) + str(read_tuple[1].seq)) - assert big_read[:15] == 'CCGTTTCAACCCGTT' + err_mod = kde.KDErrorModel("data/ecoli.npz") + ref_genome = SeqRecord(Seq(str("CGTTTCAACC" * 400)), id="my_genome", description="test genome") + read_tuple = generator.simulate_read(ref_genome, err_mod, 1, 0, "metagenomics") + big_read = "".join(str(read_tuple[0].seq) + str(read_tuple[1].seq)) + assert big_read[:15] == "CCGTTTCAACCCGTT" def test_kde_short(): if sys.version_info > (3,): random.seed(42) np.random.seed(42) - err_mod = kde.KDErrorModel('data/ecoli.npz') - ref_genome = SeqRecord( - Seq(str('AAACC' * 100)), - id='my_genome', - description='test genome' - ) - read_tuple = generator.simulate_read(ref_genome, err_mod, 1, 0, 'metagenomics') - big_read = ''.join(str(read_tuple[0].seq) + str(read_tuple[1].seq)) - assert big_read == 'ACCAAACCAAACCAAACCAAGGTTTGGTTTGGTTTGGTGT' + err_mod = kde.KDErrorModel("data/ecoli.npz") + ref_genome = SeqRecord(Seq(str("AAACC" * 100)), id="my_genome", description="test genome") + read_tuple = generator.simulate_read(ref_genome, err_mod, 1, 0, "metagenomics") + big_read = "".join(str(read_tuple[0].seq) + str(read_tuple[1].seq)) + assert big_read == "ACCAAACCAAACCAAACCAAGGTTTGGTTTGGTTTGGTGT" def test_amp(): if sys.version_info > (3,): random.seed(42) np.random.seed(42) - err_mod = kde.KDErrorModel( - os.path.join( - os.path.dirname(__file__), - '../profiles/MiSeq' - ) - ) + err_mod = kde.KDErrorModel(os.path.join(os.path.dirname(__file__), "../profiles/MiSeq")) print(err_mod.read_length) ref_genome = SeqRecord( - Seq(( - "TTTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" - "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" - "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" - "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGG" - "CCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" - "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" - "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" - 
"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTT" - )), - id='my_amplicon', - description='test amplicon' + Seq( + ( + "TTTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGG" + "CCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTT" + ) + ), + id="my_amplicon", + description="test amplicon", ) - read_tuple = generator.simulate_read(ref_genome, err_mod, 1, 0, 'amplicon') + read_tuple = generator.simulate_read(ref_genome, err_mod, 1, 0, "amplicon") assert len(read_tuple[0].seq) == 301 assert read_tuple[0].seq.startswith("TTTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA") assert len(read_tuple[1].seq) == 301 diff --git a/iss/test/test_modeller.py b/iss/test/test_modeller.py index 6ff7c86..8658b12 100644 --- a/iss/test/test_modeller.py +++ b/iss/test/test_modeller.py @@ -1,25 +1,16 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import pytest - -from builtins import next - -from iss import bam -from iss import modeller - - import sys + import numpy as np +import pytest + +from iss import bam, modeller def test_kde_qualities(): - quality_distribution = [ - [40, 30], - [40, 30], - [20, 20], - [40, 10], - [10, 10]] + quality_distribution = [[40, 30], [40, 30], [20, 20], [40, 10], [10, 10]] cdf_list = modeller.raw_qualities_to_histogram(quality_distribution) assert cdf_list[0][-2] == pytest.approx(0.5, rel=1e-3) assert cdf_list[-1][0] == 0.0 @@ -29,7 +20,7 @@ def test_kde_qualities(): def test_substitutions(): subst_matrix = np.zeros([20, 16]) - bam_file = 'data/substitutions_test.bam' + bam_file = "data/substitutions_test.bam" bam_reader = bam.read_bam(bam_file) if sys.version_info > (3,): for _ in range(2): @@ -42,18 +33,17 @@ def test_substitutions(): alignment = read.get_aligned_pairs(matches_only=True, with_seq=True) read_has_indels = False for base in alignment: - pos, subst, read_has_indels = modeller.dispatch_subst( - base, read, read_has_indels) + pos, subst, read_has_indels = modeller.dispatch_subst(base, read, read_has_indels) subst_matrix[pos, subst] += 1 choices = modeller.subst_matrix_to_choices(subst_matrix, 20) assert read_has_indels is False assert subst_matrix[0][1] == 1 - assert choices[0]['A'] == (['T', 'C', 'G'], [1.0, 0.0, 0.0]) + assert choices[0]["A"] == (["T", "C", "G"], [1.0, 0.0, 0.0]) def test_indels(): indel_matrix = np.zeros([20, 9]) - bam_file = 'data/substitutions_test.bam' + bam_file = "data/substitutions_test.bam" bam_reader = bam.read_bam(bam_file) if sys.version_info > (3,): for _ in range(8): @@ -67,7 +57,6 @@ def test_indels(): indel_matrix[pos, indel] += 1 for position in range(20): indel_matrix[position][0] = 5 - insertion, deletion = modeller.indel_matrix_to_choices( - indel_matrix, 20) - assert round(insertion[6]['T'], 2) == 0.2 + insertion, deletion = modeller.indel_matrix_to_choices(indel_matrix, 20) + assert round(insertion[6]["T"], 2) == 0.2 assert indel_matrix[6][2] == 1 diff --git a/iss/test/test_util.py b/iss/test/test_util.py index 152489e..304990e 100644 --- a/iss/test/test_util.py +++ b/iss/test/test_util.py @@ -2,35 +2,21 @@ # -*- coding: utf-8 -*- import pytest - -from iss import util - from Bio import SeqIO -def setup_function(): - output_file_prefix = 'data/.test' - - -def teardown_uncompress(): - 
util.cleanup(['data/test_concat.iss.tmp.genomes.fasta']) - - -def teardown_compress(): - util.cleanup(['data/ecoli.fasta.gz']) +from iss import util @pytest.fixture def setup_and_teardown(): - setup_function() yield - teardown_uncompress() + util.cleanup(["data/test_concat.iss.tmp.genomes.fasta"]) @pytest.fixture def setup_and_teardown_compress(): - setup_function() yield - teardown_compress() + util.cleanup(["data/ecoli.fasta.gz"]) def test_phred_conversions(): @@ -46,23 +32,22 @@ def test_phred_conversions(): def test_rev_comp(): - lowercase_seq = 'attgctat' - uppercase_seq = 'CCGATTAC' - assert util.rev_comp(lowercase_seq) == 'atagcaat' - assert util.rev_comp(uppercase_seq) == 'GTAATCGG' + lowercase_seq = "attgctat" + uppercase_seq = "CCGATTAC" + assert util.rev_comp(lowercase_seq) == "atagcaat" + assert util.rev_comp(uppercase_seq) == "GTAATCGG" def test_count_records(): - f = open('data/genomes.fasta', 'r') + f = open("data/genomes.fasta", "r") with f: # count the number of records record_list = util.count_records(f) - assert record_list == [ - 'genome_A', 'genome_T', 'genome_GC', 'genome_ATCG', 'genome_TA'] + assert record_list == ["genome_A", "genome_T", "genome_GC", "genome_ATCG", "genome_TA"] def test_count_records_empty(): with pytest.raises(SystemExit): - util.count_records('data/empty_file') + util.count_records("data/empty_file") def test_split_list(): @@ -70,35 +55,22 @@ def test_split_list(): test_list = list(range(10)) # tests on range assert util.split_list(test_range) == [range(10)] - assert util.split_list(test_range, n_parts=2) == [ - range(0, 5), - range(5, 10) - ] - assert util.split_list(test_range, n_parts=3) == [ - range(0, 3), - range(3, 6), - range(6, 10) - ] + assert util.split_list(test_range, n_parts=2) == [range(0, 5), range(5, 10)] + assert util.split_list(test_range, n_parts=3) == [range(0, 3), range(3, 6), range(6, 10)] # tests on lists assert util.split_list(test_list) == [list(range(10))] - assert util.split_list(test_list, n_parts=2) == [ - list(range(0, 5)), - list(range(5, 10))] - assert util.split_list(test_list, n_parts=3) == [ - list(range(0, 3)), - list(range(3, 6)), - list(range(6, 10)) - ] + assert util.split_list(test_list, n_parts=2) == [list(range(0, 5)), list(range(5, 10))] + assert util.split_list(test_list, n_parts=3) == [list(range(0, 3)), list(range(3, 6)), list(range(6, 10))] def test_convert_n_reads(): - simple_number = '10000' - kilo_lower = '100k' - kilo_upper = '100K' - mega_lower = '20m' - mega_upper = '20M' - giga_lower = '1.5g' - giga_upper = '0.5G' + simple_number = "10000" + kilo_lower = "100k" + kilo_upper = "100K" + mega_lower = "20m" + mega_upper = "20M" + giga_lower = "1.5g" + giga_upper = "0.5G" assert util.convert_n_reads(simple_number) == 10000 assert util.convert_n_reads(kilo_lower) == 100000 @@ -111,30 +83,30 @@ def test_convert_n_reads(): def test_convert_n_reads_bad_not_a_number(): with pytest.raises(SystemExit): - not_valid = 'rocket0' + not_valid = "rocket0" util.convert_n_reads(not_valid) def test_convert_n_reads_bad_suffix(): with pytest.raises(SystemExit): - not_valid = '0.12Q' + not_valid = "0.12Q" util.convert_n_reads(not_valid) def test_genome_file_exists(): with pytest.raises(SystemExit): - my_important_file = 'data/ecoli.fasta' + my_important_file = "data/ecoli.fasta" util.genome_file_exists(my_important_file) def test_reservoir(): samples = [] - genome_file = 'data/genomes.fasta' - with open(genome_file, 'r') as f: + genome_file = "data/genomes.fasta" + with open(genome_file, "r") as f: record_list = 
util.count_records(f) n = 2 - with open(genome_file, 'r') as f: - fasta_file = SeqIO.parse(f, 'fasta') + with open(genome_file, "r") as f: + fasta_file = SeqIO.parse(f, "fasta") for record in util.reservoir(fasta_file, record_list, n): samples.append(record.id) assert len(samples) == 2 @@ -142,28 +114,28 @@ def test_reservoir(): def test_reservoir_invalid_input(): with pytest.raises(SystemExit): - genome_file = 'data/ecoli.fasta' - record_list = ['NC_002695.1'] + genome_file = "data/ecoli.fasta" + record_list = ["NC_002695.1"] n = 4 - with open(genome_file, 'r') as f: - fasta_file = SeqIO.parse(f, 'fasta') + with open(genome_file, "r") as f: + fasta_file = SeqIO.parse(f, "fasta") for record in util.reservoir(fasta_file, record_list, n): pass def test_concatenate(setup_and_teardown): - genome_files = ['data/ecoli.fasta'] * 2 - util.concatenate(genome_files, 'data/test_concat.iss.tmp.genomes.fasta') - with open('data/test_concat.iss.tmp.genomes.fasta', 'rb') as f: + genome_files = ["data/ecoli.fasta"] * 2 + util.concatenate(genome_files, "data/test_concat.iss.tmp.genomes.fasta") + with open("data/test_concat.iss.tmp.genomes.fasta", "rb") as f: assert len(f.readlines()) == 40 def test_concatenate_read_only(): with pytest.raises(SystemExit): - genome_files = ['data/ecoli.fasta'] * 2 - util.concatenate(genome_files, 'data/read_only.fasta') + genome_files = ["data/ecoli.fasta"] * 2 + util.concatenate(genome_files, "data/read_only.fasta") def test_compress(setup_and_teardown_compress): - genome_file = 'data/ecoli.fasta' + genome_file = "data/ecoli.fasta" util.compress(genome_file, remove=False) diff --git a/iss/util.py b/iss/util.py index c89a847..cc64436 100644 --- a/iss/util.py +++ b/iss/util.py @@ -1,18 +1,17 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from Bio import SeqIO - -import os -import sys import gzip +import logging +import os import pickle import random -import logging -import numpy as np - +import sys from shutil import copyfileobj +import numpy as np +from Bio import SeqIO + def phred_to_prob(q): """Convert a phred score (Sanger or modern Illumina) in probabilty @@ -56,11 +55,37 @@ def rev_comp(s): list: reverse complement of the input sequence """ bases = { - "a": "t", "c": "g", "g": "c", "t": "a", "y": "r", "r": "y", "w": "w", - "s": "s", "k": "m", "m": "k", "n": "n", "b": "v", "v": "b", "d": "h", - "h": "d", "A": "T", "C": "G", "G": "C", "T": "A", "Y": "R", "R": "Y", - "W": "W", "S": "S", "K": "M", "M": "K", "N": "N", "B": "V", "V": "B", - "D": "H", "H": "D"} + "a": "t", + "c": "g", + "g": "c", + "t": "a", + "y": "r", + "r": "y", + "w": "w", + "s": "s", + "k": "m", + "m": "k", + "n": "n", + "b": "v", + "v": "b", + "d": "h", + "h": "d", + "A": "T", + "C": "G", + "G": "C", + "T": "A", + "Y": "R", + "R": "Y", + "W": "W", + "S": "S", + "K": "M", + "M": "K", + "N": "N", + "B": "V", + "V": "B", + "D": "H", + "H": "D", + } sequence = list(s) complement = "".join([bases[b] for b in sequence]) reverse_complement = complement[::-1] @@ -83,15 +108,14 @@ def count_records(fasta_file): record_list.append(record.id) try: assert len(record_list) != 0 - except AssertionError as e: - logger.error( - 'Failed to find records in genome(s) file:%s' % fasta_file) + except AssertionError: + logger.error("Failed to find records in genome(s) file:%s" % fasta_file) sys.exit(1) else: return record_list -def split_list(l, n_parts=1): +def split_list(lst, n_parts=1): """Split a list in a number of parts Args: @@ -101,9 +125,8 @@ def split_list(l, n_parts=1): Returns: list: a list of n_parts lists """ 
- length = len(l) - return [l[i * length // n_parts: (i + 1) * length // n_parts] - for i in range(n_parts)] + length = len(lst) + return [lst[i * length // n_parts : (i + 1) * length // n_parts] for i in range(n_parts)] def nplog(type, flag): @@ -121,19 +144,19 @@ def convert_n_reads(unit): float: a number of reads """ logger = logging.getLogger(__name__) - suffixes = {'k': 3, 'm': 6, 'g': 9} + suffixes = {"k": 3, "m": 6, "g": 9} if unit[-1].isdigit(): try: unit_int = int(unit) - except ValueError as e: - logger.error('%s is not a valid number of reads' % unit) + except ValueError: + logger.error("%s is not a valid number of reads" % unit) sys.exit(1) elif unit[-1].lower() in suffixes: number = unit[:-1] exponent = suffixes[unit[-1].lower()] unit_int = int(float(number) * 10**exponent) else: - logger.error('%s is not a valid number of reads' % unit) + logger.error("%s is not a valid number of reads" % unit) sys.exit(1) return unit_int @@ -147,9 +170,9 @@ def genome_file_exists(filename): logger = logging.getLogger(__name__) try: assert os.path.exists(filename) is False - except AssertionError as e: - logger.error('%s already exists. Aborting.' % filename) - logger.error('Maybe use another --output prefix') + except AssertionError: + logger.error("%s already exists. Aborting." % filename) + logger.error("Maybe use another --output prefix") sys.exit(1) @@ -167,9 +190,8 @@ def reservoir(records, record_list, n=None): try: total = len(record_list) assert n < total - except AssertionError as e: - logger.error( - '-u should be strictly smaller than total number of records.') + except AssertionError: + logger.error("-u should be strictly smaller than total number of records.") sys.exit(1) else: random.seed() @@ -196,17 +218,17 @@ def concatenate(file_list, output): output (string): the output file name """ logger = logging.getLogger(__name__) - logger.info('Stitching input files together') + logger.info("Stitching input files together") try: - out_file = open(output, 'wb') + out_file = open(output, "wb") except (IOError, OSError) as e: - logger.error('Failed to open output file: %s' % e) + logger.error("Failed to open output file: %s" % e) sys.exit(1) with out_file: for file_name in file_list: if file_name is not None: - with open(file_name, 'rb') as f: + with open(file_name, "rb") as f: copyfileobj(f, out_file) @@ -217,14 +239,14 @@ def cleanup(file_list): file_list (list): a list of files to be removed """ logger = logging.getLogger(__name__) - logger.info('Cleaning up') + logger.info("Cleaning up") for temp_file in file_list: if temp_file is not None: try: os.remove(temp_file) - except (IOError, OSError) as e: - logger.error('Could not read temporary file: %s' % temp_file) - logger.error('You may have to remove temporary files manually') + except (IOError, OSError): + logger.error("Could not read temporary file: %s" % temp_file) + logger.error("You may have to remove temporary files manually") sys.exit(1) @@ -235,9 +257,9 @@ def compress(filename, remove=True): filename (string): name of file to be compressed """ logger = logging.getLogger(__name__) - logger.info('Compressing %s' % filename) - outfile = filename + '.gz' - with open(filename, 'rb') as i, gzip.open(outfile, 'wb') as o: + logger.info("Compressing %s" % filename) + outfile = filename + ".gz" + with open(filename, "rb") as i, gzip.open(outfile, "wb") as o: copyfileobj(i, o) if remove: cleanup([filename]) @@ -255,9 +277,9 @@ def dump(object, output): pickled_object = pickle.dumps(object, protocol=pickle.HIGHEST_PROTOCOL) size = 
sys.getsizeof(pickled_object)
-    with open(output, 'wb') as out_file:
+    with open(output, "wb") as out_file:
        for i in range(0, size, MAX_BYTES):
-            out_file.write(pickled_object[i:i + MAX_BYTES])
+            out_file.write(pickled_object[i : i + MAX_BYTES])

def load(filename):
@@ -272,7 +294,7 @@ def load(filename):
    size = os.path.getsize(filename)
    bytes = bytearray(0)
-    with open(filename, 'rb') as f:
+    with open(filename, "rb") as f:
        for _ in range(0, size, MAX_BYTES):
            bytes += f.read(MAX_BYTES)
    object = pickle.loads(bytes)
diff --git a/iss/version.py b/iss/version.py
index bcd8d54..e4adfb8 100644
--- a/iss/version.py
+++ b/iss/version.py
@@ -1 +1 @@
-__version__ = '1.6.0'
+__version__ = "1.6.0"
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..436796c
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,29 @@
+[tool.black]
+line-length = 120
+target_version = ['py311']
+exclude = '''
+^/(
+  (
+      \.eggs # exclude a few common directories in the
+    | \.git # root of the project
+    | \.hg
+    | \.mypy_cache
+    | \.tox
+    | \.venv
+    | _build
+    | buck-out
+    | build
+    | dist
+    | doc
+    | data
+  )/
+  | foo.py # also separately exclude a file named foo.py in
+           # the root of the project
+)
+'''
+[tool.isort]
+profile = "black"
+line_length = 120
+known_first_party = 'iss'
+
+
diff --git a/setup.py b/setup.py
index cbf0e45..e84d4c0 100644
--- a/setup.py
+++ b/setup.py
@@ -1,39 +1,31 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-

-from iss.version import __version__
-
-from setuptools import setup, find_packages
+from setuptools import find_packages, setup

+from iss.version import __version__

-url = 'https://github.com/HadrienG/InSilicoSeq'
+url = "https://github.com/HadrienG/InSilicoSeq"

-with open('README.md') as f:
+with open("README.md") as f:
    long_description = f.read()

 setup(
-    name='InSilicoSeq',
+    name="InSilicoSeq",
    version=__version__,
-
-    description='a sequencing simulator',
+    description="a sequencing simulator",
    long_description=long_description,
-    long_description_content_type='text/markdown',
-
+    long_description_content_type="text/markdown",
    url=url,
-    download_url=url + '/tarball/' + __version__,
-    author='Hadrien Gourlé',
-    author_email='hadrien.gourle@slu.se',
-
-    license='MIT',
+    download_url=url + "/tarball/" + __version__,
+    author="Hadrien Gourlé",
+    author_email="hadrien.gourle@slu.se",
+    license="MIT",
    packages=find_packages(),
-
-    tests_require=['nose'],
-    install_requires=['numpy', 'scipy', 'biopython>=1.79',
-                      'pysam>=0.15.1', 'joblib',
-                      'requests'],
+    tests_require=["pytest"],
+    install_requires=["numpy", "scipy", "biopython>=1.79", "pysam>=0.15.1", "joblib", "requests"],
    include_package_data=True,
-    entry_points={
-        'console_scripts': ['iss = iss.app:main'],
-    }
+    entry_points={
+        "console_scripts": ["iss = iss.app:main"],
+    },
 )
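Note on `KDErrorModel.gen_phred_scores` above: it draws one Phred score per base position by inverse-CDF sampling, i.e. `np.searchsorted` on a per-position cumulative distribution with a uniform random draw. A minimal, self-contained sketch of that idea, using made-up uniform CDFs rather than a real `.npz` profile:

```python
import numpy as np

np.random.seed(42)
# toy CDFs over Phred scores 0..40, one per base position; the real model
# stores KDE-derived CDFs grouped into mean-quality bins
cdfs_bin = [np.cumsum(np.ones(41)) / 41 for _ in range(5)]

phred_list = []
for cdf in cdfs_bin:
    # find where the uniform draw lands in the cumulative distribution
    random_quality = int(np.searchsorted(cdf, np.random.rand()))
    phred_list.append(random_quality)
print(phred_list)  # five simulated quality scores between 0 and 40
```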
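The reflowed `subst_matrix_to_choices` keeps its fallback intact: per-position substitution counts are normalised into probabilities, and `np.errstate(all="raise")` turns the 0/0 of an unobserved position into a `FloatingPointError` that is caught and replaced by a uniform 1/3 split. A toy illustration of the pattern (the counts here are invented):

```python
import numpy as np

counts = np.zeros(3)  # say, A->T, A->C, A->G counts at one read position
with np.errstate(all="raise"):
    try:
        probs = [c / counts.sum() for c in counts]  # 0/0 raises here
    except FloatingPointError:
        probs = [1 / 3, 1 / 3, 1 / 3]  # no data: assume equal substitution rates
print(probs)
```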
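`test_cov_calc` pins down the arithmetic behind `abundance.to_coverage`. Assuming the usual definition, coverage = reads × relative abundance × read length / genome size, the expected value in the test checks out:

```python
n_reads = 10_000_000
rel_abundance = 0.08
read_length = 150
genome_size = 4_639_221  # length used in the test (E. coli K-12)

coverage = n_reads * rel_abundance * read_length / genome_size
print(round(coverage, 3))  # 25.866, the value asserted in test_cov_calc
```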
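`util.convert_n_reads` accepts k/m/g suffixes case-insensitively, as `test_convert_n_reads` shows. Assuming the package is importable, a quick sanity check mirroring those assertions:

```python
from iss import util

assert util.convert_n_reads("10000") == 10_000
assert util.convert_n_reads("100k") == 100_000
assert util.convert_n_reads("20M") == 20_000_000   # m/M: 10**6
assert util.convert_n_reads("0.5G") == 500_000_000  # g/G: 10**9
```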
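`util.split_list` had its parameter renamed from `l` to `lst`, presumably to silence flake8's E741 (ambiguous variable name). Behaviour is unchanged: the input is sliced into `n_parts` consecutive chunks, with the remainder absorbed by the last chunk, exactly as the untouched assertions in `test_split_list` state:

```python
from iss import util

# 10 items over 3 parts: the last chunk takes the remainder
assert util.split_list(list(range(10)), n_parts=3) == [[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]]
```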
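Finally, `util.rev_comp` complements IUPAC ambiguity codes and preserves case, per the reformatted lookup table above; the existing tests double as usage examples:

```python
from iss import util

assert util.rev_comp("attgctat") == "atagcaat"  # lowercase in, lowercase out
assert util.rev_comp("CCGATTAC") == "GTAATCGG"
```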