From 25b1a08c40ac57414947f97ceb36711a2c9c1e9c Mon Sep 17 00:00:00 2001 From: rfm-targa Date: Fri, 12 Apr 2024 16:43:23 +0100 Subject: [PATCH] Updated docstrings. --- CHEWBBACA/AlleleCall/allele_call.py | 13 +- .../AlleleCallEvaluator/evaluate_calls.py | 4 +- CHEWBBACA/CHEWBBACA_NS/README.md | 4 - CHEWBBACA/CHEWBBACA_NS/download_schema.py | 6 +- CHEWBBACA/CHEWBBACA_NS/stats_requests.py | 108 ++-------- CHEWBBACA/CHEWBBACA_NS/upload_schema.py | 199 ++++-------------- CHEWBBACA/CreateSchema/create_schema.py | 154 ++++++-------- CHEWBBACA/ExtractCgMLST/determine_cgmlst.py | 66 +----- CHEWBBACA/PrepExternalSchema/adapt_schema.py | 29 +++ CHEWBBACA/SchemaEvaluator/evaluate_schema.py | 30 +++ CHEWBBACA/UniprotFinder/annotate_schema.py | 52 ++++- CHEWBBACA/utils/blast_wrapper.py | 6 +- CHEWBBACA/utils/chewiens_requests.py | 2 +- CHEWBBACA/utils/core_functions.py | 4 +- CHEWBBACA/utils/distance_matrix.py | 36 +++- CHEWBBACA/utils/fasta_operations.py | 2 +- CHEWBBACA/utils/fasttree_wrapper.py | 9 +- CHEWBBACA/utils/file_operations.py | 12 +- CHEWBBACA/utils/gene_prediction.py | 2 +- CHEWBBACA/utils/iterables_manipulation.py | 2 +- CHEWBBACA/utils/join_profiles.py | 15 +- CHEWBBACA/utils/mafft_wrapper.py | 6 +- CHEWBBACA/utils/multiprocessing_operations.py | 2 +- CHEWBBACA/utils/parameters_validation.py | 4 +- CHEWBBACA/utils/process_datetime.py | 12 +- CHEWBBACA/utils/profile_hasher.py | 45 +++- CHEWBBACA/utils/profiles_sqlitedb.py | 73 +++---- CHEWBBACA/utils/remove_genes.py | 57 ++--- CHEWBBACA/utils/sequence_clustering.py | 10 +- CHEWBBACA/utils/uniprot_requests.py | 6 +- 30 files changed, 395 insertions(+), 575 deletions(-) delete mode 100644 CHEWBBACA/CHEWBBACA_NS/README.md diff --git a/CHEWBBACA/AlleleCall/allele_call.py b/CHEWBBACA/AlleleCall/allele_call.py index f08809f7..e1c1abf4 100644 --- a/CHEWBBACA/AlleleCall/allele_call.py +++ b/CHEWBBACA/AlleleCall/allele_call.py @@ -830,7 +830,7 @@ def write_results_alleles(classification_files, input_identifiers, def 
write_results_statistics(classification_files, input_identifiers, cds_counts, output_directory, classification_labels, - repeated_counts, invalid_data, loci_finder): + repeated_counts, invalid_data): """Write a TSV file with classification counts per input. Parameters @@ -856,9 +856,6 @@ def write_results_statistics(classification_files, input_identifiers, invalid_data : dict Dictionary with input identifiers as keys and the total number of invalid CDSs as values. - loci_finder : re.Pattern - Regular expression object to search for loci identifiers - in paths and filenames. Returns ------- @@ -870,7 +867,6 @@ def write_results_statistics(classification_files, input_identifiers, class_counts = {i: {c: 0 for c in classification_labels} for i in input_identifiers} for file in classification_files.values(): - locus_id = loci_finder.search(file).group() locus_results = fo.pickle_loader(file) for i in class_counts: @@ -1361,7 +1357,7 @@ def process_blast_results(blast_results, bsr_threshold, query_scores): bsr = cf.compute_bsr(raw_score, query_scores[query_id][1]) except Exception as e: print('Could not get the self-score for the representative ' - f'allele {query_id}') + f'allele {query_id}', e) continue # Only keep matches above BSR threshold if bsr >= bsr_threshold: @@ -1563,7 +1559,7 @@ def classify_inexact_matches(locus, genomes_matches, inv_map, int(rep_alleleid.replace('*', '').split('_')[-1]) rep_alleleid = rep_alleleid.split('_')[-1] except Exception as e: - pass + print(e) # Get hash of the CDS DNA sequence target_dna_hash = match[2] @@ -2952,8 +2948,7 @@ def main(input_file, loci_list, schema_directory, output_directory, output_directory, classification_labels, repeated_counts, - results['invalid_alleles'], - loci_finder) + results['invalid_alleles']) # Create file with class counts per locus called print(f'Creating file with class counts per locus ({ct.LOCI_STATS_BASENAME})...') diff --git a/CHEWBBACA/AlleleCallEvaluator/evaluate_calls.py 
b/CHEWBBACA/AlleleCallEvaluator/evaluate_calls.py index b8b39743..c7ef7643 100644 --- a/CHEWBBACA/AlleleCallEvaluator/evaluate_calls.py +++ b/CHEWBBACA/AlleleCallEvaluator/evaluate_calls.py @@ -214,7 +214,7 @@ def concatenate_loci_alignments(sample, loci, fasta_index, output_directory): try: alignment += str(fasta_index[seqid].seq) except Exception as e: - print(f'Could not get {sample} allele for locus {locus}.') + print(f'Could not get {sample} allele for locus {locus}.', e) # Save alignment for sample alignment_outfile = fo.join_paths(output_directory, [f'{sample}_cgMLST_alignment.fasta']) @@ -322,8 +322,6 @@ def main(input_files, schema_directory, output_directory, annotations, summary_rows = [total_samples, total_loci, total_cds, loci_sums[-1], *loci_sums[:-1]] - pa_lines = [] - dm_lines = [] phylo_data = {"phylo_data": []} if light is False: if False in [no_pa, no_dm, no_tree] or cg_alignment is True: diff --git a/CHEWBBACA/CHEWBBACA_NS/README.md b/CHEWBBACA/CHEWBBACA_NS/README.md deleted file mode 100644 index f87b0f9b..00000000 --- a/CHEWBBACA/CHEWBBACA_NS/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# chewBBACA - Chewie-NS modules - - - diff --git a/CHEWBBACA/CHEWBBACA_NS/download_schema.py b/CHEWBBACA/CHEWBBACA_NS/download_schema.py index f52f9a80..8f2240c3 100755 --- a/CHEWBBACA/CHEWBBACA_NS/download_schema.py +++ b/CHEWBBACA/CHEWBBACA_NS/download_schema.py @@ -4,11 +4,9 @@ Purpose ------- -This module enables the download of chewBBACA's schemas from the -Chewie-NS. - +This module enables the download of schemas from a Chewie-NS instance. The process enables the download of ZIP archives that contain ready-to-use -versions of any schema in the Chewie-NS. It also allows users to download +versions of any schema in Chewie-NS. It also allows users to download any schema with the structure it had at a specific time point. 
It is also possible to download the latest version of the schema through requests to the Chewie-NS API, if the compressed version that is available does not diff --git a/CHEWBBACA/CHEWBBACA_NS/stats_requests.py b/CHEWBBACA/CHEWBBACA_NS/stats_requests.py index 3d67c2e0..f22cf890 100644 --- a/CHEWBBACA/CHEWBBACA_NS/stats_requests.py +++ b/CHEWBBACA/CHEWBBACA_NS/stats_requests.py @@ -4,46 +4,13 @@ Purpose ------- -This module enables the retrieval of information/stats from the -Chewie-NS. Its main objective is to provide information about -the list of species and schemas in the Chewie-NS, so that users +This module enables the retrieval of information/stats from a +Chewie-NS instance. Its main objective is to provide information about +the list of species and schemas in Chewie-NS, so that users can quickly identify a schema of interest and download it (this process generates tables with species and schemas identifiers that can be passed to the `-sc` and `-sp` arguments of DownloadSchema). -Expected input --------------- - -The process expects the following variables whether through command line -execution or invocation of the :py:func:`main` function: - -- ``-m``, ``stats_mode`` : The process can retrieve the list of species - ("species" option) in the Chewie-NS, the list of schemas for a species - ("schemas" option and valid value for `--sp`) or information about a - single schema ("schemas" option and valid values for `--sp` and `--sc`). - - - e.g.: ``species`` or ``schemas`` - -- ``--ns_url``, ``nomenclature_server_url`` : The base URL for the Nomenclature Server. - The default value, "main", will establish a connection to "https://chewbbaca.online/", - "tutorial" to "https://tutorial.chewbbaca.online/"" and "local" to - "http://127.0.0.1:5000/NS/api/" (localhost). Users may also provide the IP address to - other Chewie-NS instances. 
- - - e.g.: ``http://127.0.0.1:5000/NS/api/`` (localhost) - -- ``--sp``, ``species_id`` : The integer identifier of a species - in the Chewie-NS. The process will retrieve the list of schemas - for the species with specified identifier. - - - e.g.: ``2`` - -- ``--sc``, ``schema_id`` : The integer identifier of a schema in - the Chewie-NS. The process will retrieve information about the - schema with specified identifier. - - - e.g.: ``4`` - Code documentation ------------------ """ @@ -51,17 +18,14 @@ import sys import requests -import argparse from urllib3.exceptions import InsecureRequestWarning try: from utils import (constants as ct, - chewiens_requests as cr, - parameters_validation as pv) + chewiens_requests as cr) except ModuleNotFoundError: from CHEWBBACA.utils import (constants as ct, - chewiens_requests as cr, - parameters_validation as pv) + chewiens_requests as cr) # Suppress only the single warning from urllib3 needed. @@ -384,7 +348,23 @@ def single_schema(species_id, schema_id, base_url, headers_get): def main(mode, nomenclature_server, species_id, schema_id): + """Get species and schema statistics from a Chewie-NS instance. + Parameters + ---------- + mode : str + The process can retrieve the list of species ("species" option) + from Chewie-NS, the list of schemas for a species ("schemas" + option and valid value for `species_id`) or information about a + single schema ("schemas" option and valid values for `species_id` + and `schema_id`). + nomenclature_server : str + The base URL for the Chewie-NS instance. + species_id : int + The integer identifier of a species in Chewie-NS. + schema_id : int + The integer identifier of a schema in Chewie-NS. 
+ """ headers_get = ct.HEADERS_GET_JSON print('\nRetrieving data...') @@ -402,50 +382,6 @@ def main(mode, nomenclature_server, species_id, schema_id): sys.exit('\nPlease provide a valid species identifier ' 'to get the list of available schemas.\n') - # print stats + # Print stats stats_text = '\n'.join(stats) print('\n{0}\n'.format(stats_text)) - - -def parse_arguments(): - - parser = argparse.ArgumentParser(description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - - parser.add_argument('-m', type=str, required=True, - dest='stats_mode', choices=['species', 'schemas'], - help='The process can retrieve the list of species ' - '("species" option) in the Chewie-NS or the ' - 'list of schemas for a species ' - '("schemas" option).') - - parser.add_argument('--ns', type=pv.validate_ns_url, required=False, - default='main', - dest='nomenclature_server', - help='The base URL for the Nomenclature Server. ' - 'The default value, "main", will establish a ' - 'connection to "https://chewbbaca.online/", ' - '"tutorial" to "https://tutorial.chewbbaca.online/" ' - 'and "local" to "http://127.0.0.1:5000/NS/api/" (localhost). 
' - 'Users may also provide the IP address to other ' - 'Chewie-NS instances.') - - parser.add_argument('--sp', type=str, required=False, - dest='species_id', default=None, - help='The integer identifier of a ' - 'species in the Chewie-NS.') - - parser.add_argument('--sc', type=str, required=False, - dest='schema_id', default=None, - help='The integer identifier of a schema in ' - 'the Chewie-NS.') - - args = parser.parse_args() - - return args - - -if __name__ == '__main__': - - args = parse_arguments() - main(**(vars(args))) diff --git a/CHEWBBACA/CHEWBBACA_NS/upload_schema.py b/CHEWBBACA/CHEWBBACA_NS/upload_schema.py index cad2400c..0c03fe87 100755 --- a/CHEWBBACA/CHEWBBACA_NS/upload_schema.py +++ b/CHEWBBACA/CHEWBBACA_NS/upload_schema.py @@ -5,86 +5,17 @@ ------- This module allows authorized users to upload chewBBACA's schemas -to the Chewie-NS. - -The process for schema upload has four stages: - - - User Permissions: Determines if the current user has permission - to upload schemas. Only Admin or Contributor level users can - upload schemas to the Chewie-NS. - - - Parameters Validation: Validation of the set of parameters associated - with the schema. Only schemas that have been used with a single valid - value per parameter can be uploaded. Invalid or multiple values - for a single parameter can lead to inconsistent results; thus, - it is strongly advised to always perform allele calling with - the same set of parameters and refrain from altering the initial - set of parameters values defined in the schema creation or - adaptation processes. - - - Schema Pre-processing: Applies quality control measures to identify - and exclude invalid alleles. Searches for annotations on UniProt - and imports annotations provided by users. - - - Schema Upload: Collects essential data and sends it to the Chewie-NS - for schema creation and data insertion. The process finishes when all - the necessary data has been uploaded. 
The Chewie-NS automatically - detects that all data has been received and finishes data insertion. - -Expected input --------------- - -The process expects the following variables whether through command line -execution or invocation of the :py:func:`main` function: - -- ``-i``, ``schema_directory`` : Path to the directory of the schema to upload. - - - e.g.: ``/home/user/schemas/ypestis_schema`` - -- ``-sp``, ``species_id`` : The integer identifier or name of the species that - the schema will be associated to in the Chewie-NS. - - - e.g.: ``1`` or ``'Yersinia pestis'`` - -- ``-sn``, ``schema_name`` : A brief and meaningful name that should help - understand the type and content of the schema. - - - e.g.: ``ypestis_cgMLST`` or ``ypestis cgMLST`` - -- ``-lp``, ``loci_prefix`` : Prefix included in the name of each locus of the - schema. - - - e.g.: ``ypestis`` - -- ``--df``, ``description_file`` : Path to a text file with a description - about the schema. Markdown syntax is supported in order to allow greater - customizability of the rendered description in the Frontend. Will default - to the schema's name if the user does not provide a valid path for a - file (default=None). - - - e.g.: ``/home/user/schemas/ypestis_description`` - -- ``--a``, ``annotations`` : Path to a TSV file with loci annotations. The - first column has loci identifiers (w/o .fasta extension), the second has - user annotations and the third has custom annotations (default=None). - - - e.g.: ``/home/user/schemas/ypestis_annotations`` - -- ``--cpu``, ``cpu_cores`` : Number of CPU cores that will be used in the - Schema Pre-processing step (default=1). - - - e.g.: ``4`` - -- ``--ns_url``, ``nomenclature_server`` : The base URL for the Nomenclature - Server. The default value, "main", will establish a connection to - "https://chewbbaca.online/", "tutorial" to "https://tutorial.chewbbaca.online/" - and "local" to "http://127.0.0.1:5000/NS/api/" (localhost). 
Users may also - provide the IP address to other Chewie-NS instances. - - - e.g.: ``http://127.0.0.1:5000/NS/api/`` (localhost) - -- ``--continue_up`` : If the process should check if the schema upload was - interrupted and try to resume it. ``True`` if provided, ``False`` otherwise. +to a Chewie-NS instance. The process requests the credentials of +the user trying to upload a schema and determines if the user is +allowed to upload the schema (only Admin and Contributor level +users can upload schemas). The schema config file is read to +validate the argument values used to create the schema. Only +schemas that have been used with a single valid value per parameter +can be uploaded. Invalid or multiple values for a single parameter +can lead to inconsistent results; thus, it is strongly advised to +always perform allele calling with the same set of parameters +and refrain from altering the initial set of parameters values +defined in the schema creation or adaptation processes. Code documentation ------------------ @@ -96,7 +27,6 @@ import json import time import hashlib -import argparse import requests import itertools import multiprocessing @@ -930,7 +860,39 @@ def upload_alleles_data(alleles_data, length_files, base_url, def main(schema_directory, species_id, schema_name, loci_prefix, description_file, annotations, cpu_cores, nomenclature_server, continue_up): + """Upload a schema to a Chewie-NS instance. + Parameters + ---------- + schema_directory : str + Path to the directory of the schema to upload. + species_id : int + The integer identifier or name of the species that + the schema will be associated to in Chewie-NS. + schema_name : str + A brief and meaningful name that should help + understand the type and content of the schema. + loci_prefix : str + A short prefix included in the name of each locus. + description_file : str + Path to a text file with a description about the schema. 
Markdown + syntax is supported in order to allow greater customizability of + the rendered description in the Frontend. Will default to the + schema's name if the user does not provide a valid path for a + file. + annotations : str + Path to a TSV file with loci annotations. The first column has + loci identifiers (w/o .fasta extension), the second has user + annotations and the third has custom annotations. + cpu_cores : int + Number of CPU cores that will be used in the pre-processing steps. + nomenclature_server : str + The base URL for the Chewie-NS instance. + continue_up : bool + If the process should check if the schema upload was interrupted + and try to resume it. + """ + if 'tutorial' not in nomenclature_server: token = cr.capture_login_credentials(nomenclature_server) else: @@ -1277,78 +1239,3 @@ def main(schema_directory, species_id, schema_name, loci_prefix, if len(absent_loci) > 0: os.remove(loci_file) os.remove('{0}.zip'.format(loci_file)) - - -def parse_arguments(): - - parser = argparse.ArgumentParser(description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - - parser.add_argument('-i', type=str, required=True, - dest='schema_directory', - help='Path to the directory of the schema to upload.') - - parser.add_argument('-sp', type=str, required=True, - dest='species_id', - help='The integer identifier or name of the species ' - 'that the schema will be associated to in ' - 'the NS.') - - parser.add_argument('-sn', type=str, required=True, - dest='schema_name', - help='A brief and meaningful name that ' - 'should help understand the type and content ' - 'of the schema.') - - parser.add_argument('-lp', type=str, required=True, - dest='loci_prefix', - help='Prefix included in the name of each locus of ' - 'the schema.') - - parser.add_argument('--df', type=str, required=False, - dest='description_file', default=None, - help='Path to a text file with a description ' - 'about the schema. 
Markdown syntax is supported ' - 'in order to offer greater customizability of ' - 'the rendered description in the Frontend. ' - 'Will default to the schema\'s name if the user ' - 'does not provide a valid path for a file.') - - parser.add_argument('--a', type=str, required=False, - dest='annotations', default=None, - help='Path to a TSV file with loci annotations. ' - 'The first column has loci identifiers ' - '(w/o .fasta extension), the second has user ' - 'annotations and the third has custom ' - 'annotations.') - - parser.add_argument('--cpu', type=int, required=False, - dest='cpu_cores', default=1, - help='Number of CPU cores that will ' - 'be used in the Schema Pre-processing step.') - - parser.add_argument('--ns', type=pv.validate_ns_url, required=False, - default='main', - dest='nomenclature_server', - help='The base URL for the Nomenclature Server. ' - 'The default value, "main", will establish a ' - 'connection to "https://chewbbaca.online/", ' - '"tutorial" to "https://tutorial.chewbbaca.online/" ' - 'and "local" to "http://127.0.0.1:5000/NS/api/" (localhost). ' - 'Users may also provide the IP address to other ' - 'Chewie-NS instances.') - - parser.add_argument('--continue_up', required=False, action='store_true', - dest='continue_up', - help='If the process should check if the schema ' - 'upload was interrupted and try to resume it.') - - args = parser.parse_args() - - return args - - -if __name__ == "__main__": - - args = parse_arguments() - main(**vars(args)) diff --git a/CHEWBBACA/CreateSchema/create_schema.py b/CHEWBBACA/CreateSchema/create_schema.py index 9c0e270f..99c55ccc 100644 --- a/CHEWBBACA/CreateSchema/create_schema.py +++ b/CHEWBBACA/CreateSchema/create_schema.py @@ -5,101 +5,10 @@ ------- This module enables the creation of a whole genome multi locus sequence -typing (wgMLST) schema seed. 
- -Expected input --------------- - -The process expects the following variables whether through command line -execution or invocation of the :py:func:`main` function: - -- ``-i``, ``input_files`` : Path to the directory that contains the input - FASTA files. Alternatively, a single file with a list of paths to FASTA - files, one per line. - - - e.g.: ``/home/user/genomes`` - -- ``-o``, ``output_directory`` : Output directory where the process will - store intermediate files and create the schema's directory. - - - e.g.: ``/home/user/schemas/new_schema`` - -- ``--n``, ``schema_name`` : Name given to the folder that will store the - schema files. - - - e.g.: ``my_schema`` - -- ``--ptf``, ``ptf_path`` : Path to the Prodigal training file. - - - e.g.: ``/home/user/training_files/species.trn`` - -- ``--bsr``, ``blast_score_ratio`` : BLAST Score Ratio value. - - - e.g.: ``0.6`` - -- ``--l``, ``minimum_length`` : Minimum sequence length. Coding sequences - shorter than this value are excluded. - - - e.g.: ``201`` - -- ``--t``, ``translation_table`` : Genetic code used to predict genes and - to translate coding sequences. - - - e.g.: ``11`` - -- ``--st``, ``size_threshold`` : CDS size variation threshold. Added to the - schema's config file and used to identify alleles with a length value that - deviates from the locus length mode during the allele calling process. - - - e.g.: ``0.2`` - -- ``--w``, ``word_size`` : word size used to generate k-mers during the - clustering step. - - - e.g.: ``5`` - -- ``--ws``, ``window_size`` : window size value. Number of consecutive - k-mers included in each window to determine a minimizer. - - - e.g.: ``5`` - -- ``--cs``, ``clustering_sim`` : clustering similarity threshold. Minimum - decimal proportion of shared distinct minimizers for a sequence to be - added to a cluster. - - - e.g.: ``0.2`` - -- ``--rf``, ``representative_filter`` : representative similarity threshold. 
- Clustered sequences are excluded if they share this proportion of distinct - minimizers with the cluster representative. - - - e.g.: ``0.9`` - -- ``--if``, ``intra_filter`` : intra-cluster similarity threshold. Clustered - sequences are excluded if they share this proportion of distinct minimizers - with another clustered sequence of equal or greater length. - - - e.g.: ``0.9`` - -- ``--cpu``, ``cpu_cores`` : Number of CPU cores used to run the process. - - - e.g.: ``4`` - -- ``--b``, ``blast_path`` : Path to the BLAST executables. - - - e.g.: ``/home/software/blast`` - -- ``--pm``, ``prodigal_mode`` : Prodigal running mode. - - - e.g.: ``single`` - -- ``--CDS``, ``cds_input`` : If provided, input is a single or several FASTA - files with coding sequences (skips gene prediction and CDS extraction). - - - e.g.: ``/home/user/coding_sequences_files`` - -- ``--no-cleanup``, ``no_cleanup`` : If provided, intermediate files - generated during process execution are not removed at the end. +typing (wgMLST) schema seed. The process selects one representative allele +per distinct locus identified in the input files. The schema seed corresponds +to a wgMLST schema with one FASTA file per distinct locus, each FASTA file +containing the representative allele selected by the process. Code documentation ------------------ @@ -516,7 +425,62 @@ def main(input_files, output_directory, schema_name, ptf_path, size_threshold, word_size, window_size, clustering_sim, representative_filter, intra_filter, cpu_cores, blast_path, cds_input, prodigal_mode, no_cleanup): + """Create a wgMLST schema seed. + Parameters + ---------- + input_files : str + Path to the directory that contains the input FASTA files. + Alternatively, a single file with a list of paths to FASTA + files, one per line. + output_directory : str + Output directory where the process will store intermediate + files and create the schema seed. 
+ schema_name : str + Name given to the folder that will store the schema seed files. + ptf_path : str + Path to the Prodigal training file. + blast_score_ratio : float + BLAST Score Ratio value. + minimum_length : int + Minimum sequence length. Coding sequences shorter than this + value are excluded. + translation_table : int + Genetic code used to predict genes and to translate coding + sequences. + size_threshold : float + CDS size variation threshold. Added to the schema's config + file and used to identify alleles with a length value that + deviates from the locus length mode during the allele calling + process. + word_size : int + K-mer size used during minimizer clustering. + window_size : int + Number of consecutive k-mers included in each window to + determine a minimizer. + clustering_sim : float + Minimum decimal proportion of shared distinct minimizers for + a sequence to be added to a cluster. + representative_filter : float + Clustered sequences are excluded if they share this proportion + of distinct minimizers with the cluster representative. + intra_filter : float + Clustered sequences are excluded if they share this proportion + of distinct minimizers with another clustered sequence of equal + or greater length. + cpu_cores : int + Number of CPU cores used to run the process. + blast_path : str + Path to the BLAST executables. + cds_input : bool + If provided, input is a single or several FASTA files with + coding sequences (skips gene prediction and CDS extraction). + prodigal_mode : str + Prodigal running mode ("single" or "meta"). + no_cleanup : bool + If provided, intermediate files generated during process + execution are not removed at the end. 
+ """ print(f'Prodigal training file: {ptf_path}') print(f'Prodigal mode: {prodigal_mode}') print(f'CPU cores: {cpu_cores}') diff --git a/CHEWBBACA/ExtractCgMLST/determine_cgmlst.py b/CHEWBBACA/ExtractCgMLST/determine_cgmlst.py index ec9439bb..f25b8162 100755 --- a/CHEWBBACA/ExtractCgMLST/determine_cgmlst.py +++ b/CHEWBBACA/ExtractCgMLST/determine_cgmlst.py @@ -4,44 +4,8 @@ Purpose ------- -This module determines the set of genes in the core genome based on -a matrix with allelic profiles and a threshold that defines the -proportion of genomes a gene must be present in to be included in -the core genome. - -Expected input --------------- - -The process expects the following variables whether through command line -execution or invocation of the :py:func:`main` function: - -- ``-i``, ``input_file`` : Path to input file containing a matrix with - 'allelic profiles. - - - e.g.: ``/home/user/chewie/results/matrix`` - -- ``-o``, ``output_directory`` : Path to the directory where the process - will store output files. - - - e.g.: ``/home/user/chewie/results/output_directory`` - -- ``--t``, ``threshold`` : Genes that constitute the core genome must be - in a proportion of genomes that is at least equal to this value. - - - e.g.: ``0.95`` - -- ``--s``, ``step`` : Number of genomes added to the cgMLST computation - at each step. - - - e.g.: ``5`` - -- ``--r``, ``genes2remove`` : Path to file with a list of genes/columns to - remove from the matrix (one gene identifier per line). - - - e.g.: ``home/user/results/genes.txt`` - -- ``--g``, ``genomes2remove`` : Path to file with a list of genomes/rows - to remove from the matrix (one genome identifier per line). +This module determines the set of loci that constitute the core genome +based on a matrix with allelic profiles and a loci presence threshold. 
Code documentation ------------------ @@ -272,30 +236,21 @@ def main(input_file, output_directory, threshold, step, Parameters ---------- input_file : str - Path a TSV file with allelic profiles for a set - of genomes. + Path to a TSV file with allelic profiles. output_directory : str Path to the directory where the process will store output files. threshold : list - Core genome determination thresholds. + Loci presence threshold used to determine the core genome. step : int Number of genomes added to the cgMLST computation at each step. genes2remove : str - Path to TXT file with the list of genomes to remove. + Path to TXT file with a list of loci to exclude from + the analysis. genomes2remove : str - Path to TXT file with the list of loci to remove. - - Returns - ------- - List with the paths to three files: - - - Path a TSV file with the cgMLST matrix. - - Path to a TXT file with the list of genes that - constitute the core genome. - - Path to a TSV file with the information about - missing data per genome. + Path to TXT file with a list of genomes to exclude from + the analysis. """ fo.create_directory(output_directory) @@ -418,8 +373,3 @@ def main(input_file, output_directory, threshold, step, plot(fig, filename=output_html, auto_open=False) print('HTML file with cgMLST per loci presence threshold ' 'and per step saved to {0}'.format(output_html)) - - -if __name__ == '__main__': - - main() diff --git a/CHEWBBACA/PrepExternalSchema/adapt_schema.py b/CHEWBBACA/PrepExternalSchema/adapt_schema.py index aeefe908..8e54d860 100644 --- a/CHEWBBACA/PrepExternalSchema/adapt_schema.py +++ b/CHEWBBACA/PrepExternalSchema/adapt_schema.py @@ -389,7 +389,36 @@ def adapt_loci(loci, schema_path, schema_short_path, bsr, min_len, def main(input_files, output_directories, cpu_cores, blast_score_ratio, minimum_length, translation_table, size_threshold, blast_path): + """ + Adapt a schema to be used with chewBBACA. 
+ Parameters + ---------- + input_files : str + Path to a TXT file with the list of schema loci to adapt. + output_directories : list + Path to the output directories to create (the main schema + directory and the 'short' directory to store representative + alleles). + cpu_cores : int + Number of CPU cores that will be used to run the process. + blast_score_ratio : float + The BLAST Score Ratio value that will be used to evaluate + allele similarity and select representative alleles. + minimum_length : int + Minimum sequence length value stored in the schema config file. + The schema adaptation process will only discard sequences smaller + than this value if the `--size-filter` parameter is provided. + translation_table : int + Genetic code used to translate alleles. + size_threshold : float + Allele size variation threshold value stored in the schema + config file. The schema adaptation process will only discard + alleles below or above the locus size threshold if the + `--size-filter` parameter is provided. + blast_path : str + Path to the directory that contains the BLAST executables. + """ schema_path, schema_short_path = output_directories # Import list of loci to adapt diff --git a/CHEWBBACA/SchemaEvaluator/evaluate_schema.py b/CHEWBBACA/SchemaEvaluator/evaluate_schema.py index d9d4fd30..d97a0706 100644 --- a/CHEWBBACA/SchemaEvaluator/evaluate_schema.py +++ b/CHEWBBACA/SchemaEvaluator/evaluate_schema.py @@ -564,7 +564,37 @@ def locus_report(locus_file, locus_data, annotation_columns, def main(schema_directory, output_directory, genes_list, annotations, translation_table, size_threshold, minimum_length, cpu_cores, loci_reports, light, add_sequences): + """Evaluate a schema and create an HTML report. + Parameters + ---------- + schema_directory : str + Path to the schema directory. + output_directory : str + Path to the output directory where the report files will be created. + genes_list : str + Path to a file with the list of loci to evaluate. 
+ annotations : str + Path to a TSV file created by the UniprotFinder module. + translation_table : int + Genetic code used to translate the alleles. + size_threshold : float + Allele size variation threshold. Used to determine if the size + of an allele is within the interval of the locus size mode +/- + the size threshold. + minimum_length : int + Minimum sequence length accepted for an allele to be included + in the schema. + cpu_cores : int + Number of CPU cores used to run the process. + loci_reports : bool + Create individual reports for the loci. + light : bool + Skips MSA and NJ tree computations for loci reports. + add_sequences : bool + Adds Code Editor components with the allele DNA and + protein sequences. + """ # Create directory to store intermediate files temp_directory = fo.join_paths(output_directory, ['temp']) fo.create_directory(temp_directory) diff --git a/CHEWBBACA/UniprotFinder/annotate_schema.py b/CHEWBBACA/UniprotFinder/annotate_schema.py index 04be16da..1fdb29e8 100755 --- a/CHEWBBACA/UniprotFinder/annotate_schema.py +++ b/CHEWBBACA/UniprotFinder/annotate_schema.py @@ -4,16 +4,13 @@ Purpose ------- -This module enables the creation of a TSV file with annotation -terms for the loci in a schema. - -The process queries UniProt's SPARQL endpoint to find exact -matches and retrieve the product name and page URL for those -matches. If users provide a taxon/taxa name/s, the process -will also search for reference proteomes for the specified -taxon/taxa and use BLASTp to align local sequences against -reference sequences to assign annotation terms based on the -BSR value computed for each alignment. +This module retrieves annotations for the loci in a schema. +The process can retrieve annotations through UniProt's SPARQL +endpoint to find exact matches. 
If users provide a taxon/taxa +name/s, the process will also search for reference proteomes +for the specified taxon/taxa and use BLASTp to align local +sequences against reference sequences to assign annotations +based on the BSR value computed for each alignment. Code documentation ------------------ @@ -331,7 +328,42 @@ def create_annotations_table(annotations, output_directory, header, def main(schema_directory, output_directory, genes_list, protein_table, blast_score_ratio, cpu_cores, taxa, proteome_matches, no_sparql, no_cleanup, blast_path): + """Annotate loci in a schema. + Parameters + ---------- + schema_directory + Path to the schema directory. + output_directory + Path to the output directory where the process will store + intermediate files and save the results. + genes_list + Path to a file that contains a list of schema loci to + annotate. + protein_table + Path to the 'cds_coordinates.tsv' file created by the + 'CreateSchema' process. + blast_score_ratio + BLAST Score Ratio value. This value is only used to evaluate + matches against reference proteomes when a taxon/taxa name/s + are provided. + cpu_cores + Number of CPU cores used by the process. + taxa + List of scientific names for a set of taxa. The process will + download reference proteomes from UniProt and align schema + translated alleles against the proteomes to find annotations + for the loci. + proteome_matches + Maximum number of proteome matches per locus to report. + no_sparql + Do not search for annotations through UniProt's SPARQL + endpoint. + no_cleanup + Keep intermediate files instead of deleting them at the end. + blast_path + Path to the directory that contains the BLAST executables. 
+ """ # Create output directory created = fo.create_directory(output_directory) if created is False: diff --git a/CHEWBBACA/utils/blast_wrapper.py b/CHEWBBACA/utils/blast_wrapper.py index b2c51ab5..d88bdc21 100644 --- a/CHEWBBACA/utils/blast_wrapper.py +++ b/CHEWBBACA/utils/blast_wrapper.py @@ -16,11 +16,9 @@ import subprocess try: - from utils import (constants as ct, - iterables_manipulation as im) + from utils import constants as ct except ModuleNotFoundError: - from CHEWBBACA.utils import (constants as ct, - iterables_manipulation as im) + from CHEWBBACA.utils import constants as ct def make_blast_db(makeblastdb_path, input_fasta, output_path, db_type): diff --git a/CHEWBBACA/utils/chewiens_requests.py b/CHEWBBACA/utils/chewiens_requests.py index f4555a36..6a546217 100644 --- a/CHEWBBACA/utils/chewiens_requests.py +++ b/CHEWBBACA/utils/chewiens_requests.py @@ -5,7 +5,7 @@ ------- This module contains functions to perform requests to -Chewie-NS (https://github.com/B-UMMI/Chewie-NS). +Chewie-NS instances (main: https://github.com/B-UMMI/Chewie-NS). Code documentation ------------------ diff --git a/CHEWBBACA/utils/core_functions.py b/CHEWBBACA/utils/core_functions.py index bcd288ee..0f0d97a6 100644 --- a/CHEWBBACA/utils/core_functions.py +++ b/CHEWBBACA/utils/core_functions.py @@ -4,7 +4,8 @@ Purpose ------- -This module contains functions related to +This module contains core functions used by chewBBACA's +modules. Code documentation ------------------ @@ -12,7 +13,6 @@ import os -import sys try: from utils import (constants as ct, diff --git a/CHEWBBACA/utils/distance_matrix.py b/CHEWBBACA/utils/distance_matrix.py index b5ed8acb..e32c6d25 100644 --- a/CHEWBBACA/utils/distance_matrix.py +++ b/CHEWBBACA/utils/distance_matrix.py @@ -4,13 +4,11 @@ Purpose ------- -Accepts a matrix with results from the AlleleCall process of -chewBBACA and determines the pairwise allelic differences to -create a distance matrix. 
It also determines the number of -shared loci to create a matrix with those values. The 'INF-' -prefix is removed and ASM, ALM, NIPH, NIPHEM, PLOT3, PLOT5, -LNF and LOTSC classifications are substituted by '0' before -performing pairwise comparisons. +Determines the pairwise allelic differences based on a TSV file +with allelic profiles determined by the AlleleCall module to +create a distance matrix. The 'INF-' prefix is removed and ASM, +ALM, NIPH, NIPHEM, PLOT3, PLOT5, LNF and LOTSC classifications +are substituted by '0' before computing the pairwise distances. Code documentation ------------------ @@ -337,8 +335,30 @@ def symmetrify_matrix(input_matrix, matrix_size, tmp_directory): def main(input_matrix, output_directory, cpu_cores, symmetric, masked): + """Compute a distance matrix based on allelic profiles. - # create output directory if it does not exist + Parameters + ---------- + input_matrix : str + Path to a TSV file with allelic profiles determined by + the AlleleCall module. + output_directory : str + Path to the output directory. + cpu_cores : int + Number of CPU cores used to compute distances. + symmetric : bool + Determine a symmetric pairwise distance matrix, instead + of a triangular matrix. + masked : bool + True if the input matrix values are already masked, False otherwise. + The process will mask the matrix values if this value is False. + + Returns + ------- + output_pairwise : str + Path to the TSV file that contains the distance matrix. + """ + # Create output directory if it does not exist if os.path.isdir(output_directory) is False: os.mkdir(output_directory) diff --git a/CHEWBBACA/utils/fasta_operations.py b/CHEWBBACA/utils/fasta_operations.py index 21ea79c8..6ab10c34 100644 --- a/CHEWBBACA/utils/fasta_operations.py +++ b/CHEWBBACA/utils/fasta_operations.py @@ -4,7 +4,7 @@ Purpose ------- -This module contains functions to work with FASTA files. +This module contains functions used to work with FASTA files. 
Code documentation ------------------ diff --git a/CHEWBBACA/utils/fasttree_wrapper.py b/CHEWBBACA/utils/fasttree_wrapper.py index de0956d7..53fc12dc 100644 --- a/CHEWBBACA/utils/fasttree_wrapper.py +++ b/CHEWBBACA/utils/fasttree_wrapper.py @@ -16,7 +16,14 @@ def call_fasttree(alignment_file, tree_file): - """ + """Compute a phylogenetic tree based on MSA data. + + Parameters + ---------- + alignment_file : str + Path to a file with a MSA. + tree_file : str + Path to the output tree file in Newick format. """ proc = subprocess.Popen(['FastTree', '-fastest', '-nosupport', '-noml', '-out', tree_file, alignment_file], diff --git a/CHEWBBACA/utils/file_operations.py b/CHEWBBACA/utils/file_operations.py index 3c644a02..1cc418b2 100644 --- a/CHEWBBACA/utils/file_operations.py +++ b/CHEWBBACA/utils/file_operations.py @@ -4,9 +4,9 @@ Purpose ------- -This module contains functions related with file operations, -such as read and write files, create and delete files, -manipulate file paths, compress files, verify file contents, +This module contains functions related to file operations, +such as reading and writing files, creating and deleting files, +manipulating file paths, compressing files, verifying file contents, etc. 
Code documentation @@ -33,11 +33,9 @@ import pandas as pd try: - from utils import (constants as ct, - iterables_manipulation as im) + from utils import iterables_manipulation as im except ModuleNotFoundError: - from CHEWBBACA.utils import (constants as ct, - iterables_manipulation as im) + from CHEWBBACA.utils import iterables_manipulation as im def file_basename(file_path, file_extension=True): diff --git a/CHEWBBACA/utils/gene_prediction.py b/CHEWBBACA/utils/gene_prediction.py index 9544f136..f2a5eb95 100644 --- a/CHEWBBACA/utils/gene_prediction.py +++ b/CHEWBBACA/utils/gene_prediction.py @@ -4,7 +4,7 @@ Purpose ------- -This module contains functions related with gene prediction +This module contains functions related to gene prediction with Pyrodigal. Code documentation diff --git a/CHEWBBACA/utils/iterables_manipulation.py b/CHEWBBACA/utils/iterables_manipulation.py index 3ee53789..7cfc94d6 100644 --- a/CHEWBBACA/utils/iterables_manipulation.py +++ b/CHEWBBACA/utils/iterables_manipulation.py @@ -4,7 +4,7 @@ Purpose ------- -This module contains functions to work with iterables. +This module contains functions used to work with iterables. Code documentation ------------------ diff --git a/CHEWBBACA/utils/join_profiles.py b/CHEWBBACA/utils/join_profiles.py index e55fc3b3..23bd2214 100755 --- a/CHEWBBACA/utils/join_profiles.py +++ b/CHEWBBACA/utils/join_profiles.py @@ -7,8 +7,7 @@ This module joins allele calling results from different runs. It can concatenate files with allelic profiles for the same set of loci or create a new file with the -allelic profiles for the loci that were common between -all input files. +allelic profiles for the loci shared by all input files. Code documentation ------------------ @@ -62,7 +61,18 @@ def concatenate_profiles(files, loci_list, output_file): def main(profiles, output_file, common): + """Join files with allelic profiles. + Parameters + ---------- + profiles : list + List with paths to TSV files with allelic profiles. 
+ output_file : str + Path to the output file. + common : bool + Whether the process should join profile data only for shared loci + when the profiles do not share the same loci sets. + """ if len(profiles) == 1: sys.exit('Provided a single file. Nothing to do.') @@ -75,7 +85,6 @@ def main(profiles, output_file, common): # check if headers are equal if all([set(headers[0]) == set(h) for h in headers[1:]]) is True: print('Profiles have {0} loci.'.format(len(headers[0])-1)) - total_profiles = concatenate_profiles(profiles, headers[0], output_file) diff --git a/CHEWBBACA/utils/mafft_wrapper.py b/CHEWBBACA/utils/mafft_wrapper.py index b9fd1f0d..9c754b54 100644 --- a/CHEWBBACA/utils/mafft_wrapper.py +++ b/CHEWBBACA/utils/mafft_wrapper.py @@ -16,11 +16,9 @@ import subprocess try: - from utils import (constants as ct, - iterables_manipulation as im) + from utils import constants as ct except ModuleNotFoundError: - from CHEWBBACA.utils import (constants as ct, - iterables_manipulation as im) + from CHEWBBACA.utils import constants as ct def call_mafft(input_file, output_file): diff --git a/CHEWBBACA/utils/multiprocessing_operations.py b/CHEWBBACA/utils/multiprocessing_operations.py index 5bf03247..c6c0ae34 100644 --- a/CHEWBBACA/utils/multiprocessing_operations.py +++ b/CHEWBBACA/utils/multiprocessing_operations.py @@ -4,7 +4,7 @@ Purpose ------- -This modules contains functions used to paralellize +This module contains functions used to parallelize function calls. Code documentation diff --git a/CHEWBBACA/utils/parameters_validation.py b/CHEWBBACA/utils/parameters_validation.py index 5124ae55..8f05e376 100644 --- a/CHEWBBACA/utils/parameters_validation.py +++ b/CHEWBBACA/utils/parameters_validation.py @@ -4,8 +4,8 @@ Purpose ------- -This module contains functions/classes related to parameter -and argument validation. +This module contains functions/classes related to the validation +of the arguments passed to chewBBACA's modules. 
Code documentation ------------------ diff --git a/CHEWBBACA/utils/process_datetime.py b/CHEWBBACA/utils/process_datetime.py index b192e3ef..fcb45304 100644 --- a/CHEWBBACA/utils/process_datetime.py +++ b/CHEWBBACA/utils/process_datetime.py @@ -128,24 +128,24 @@ def process_header(process): print(f'{hf}\n {header}\n{hf}') -# decorator to time main processes +# Decorator to time main processes def process_timer(func): - # use functools to preserve info about wrapped function + # Use functools to preserve info about wrapped function @functools.wraps(func) def wrapper(*args, **kwargs): - # get process name and print header + # Get process name and print header process_header(sys.argv[1]) - # do not measure time if it is only needed to print the help message + # Do not measure time if it is only needed to print the help message if any([option in ['-h', '--help'] for option in sys.argv]) is False: start = get_datetime() start_str = datetime_str(start) print(f'Started at: {start_str}\n') - # run function + # Run function func(*args, **kwargs) - # does not print elapsed time if the help message is printed + # Does not print elapsed time if the help message is printed end = get_datetime() end_str = datetime_str(end) print(f'\nFinished at: {end_str}') diff --git a/CHEWBBACA/utils/profile_hasher.py b/CHEWBBACA/utils/profile_hasher.py index 2ed4943a..c2048fc1 100644 --- a/CHEWBBACA/utils/profile_hasher.py +++ b/CHEWBBACA/utils/profile_hasher.py @@ -103,7 +103,7 @@ def hash_profiles(profiles_table, loci_ids, loci_files, hashing_function, current_rows = pd.read_csv(profiles_table, delimiter='\t', dtype=str, skiprows=skiprows, nrows=nrows, index_col=0) - # remove all 'INF-' prefixes, missing data and '*' from identifiers + # Remove all 'INF-' prefixes, missing data and '*' from identifiers current_rows = current_rows.apply(im.replace_chars, args=('-')) hashed_profiles = [] @@ -126,8 +126,33 @@ def hash_profiles(profiles_table, loci_ids, loci_files, hashing_function, def 
main(profiles_table, schema_directory, output_directory, hash_type, cpu_cores, nrows, updated_files, no_inferred): + """Hash allele identifiers in a matrix of allelic profiles. - # get hash function + Parameters + ---------- + profiles_table : str + Path to a TSV file with allelic profiles determined by the + AlleleCall module. + schema_directory : str + Path to the directory of the schema used to determine the + allelic profiles. + output_directory : str + Path to the output directory. + hash_type : str + Hashing algorithm to use. + cpu_cores : int + Number of CPU cores used by the process. + nrows : int + Divide input file into subsets to process more efficiently. + updated_files : dict + Dictionary with paths to schema FASTA files as keys and paths + to FASTA files updated by allele calling as values. Only used + if `no_inferred` is True. + no_inferred : bool + If the allele calling process did not add inferred alleles to + the schema. + """ + # Get hash function hashing_function = getattr(hashlib, hash_type, None) if hashing_function is None: hashing_function = getattr(zlib, hash_type, None) @@ -137,7 +162,7 @@ def main(profiles_table, schema_directory, output_directory, hash_type, 'hashlib or zlib modules.'.format(hash_type)) return False - # get loci identifiers + # Get loci identifiers with open(profiles_table, 'r') as infile: header = infile.readline() loci_ids = header.split()[1:] @@ -145,25 +170,25 @@ def main(profiles_table, schema_directory, output_directory, hash_type, loci_files = {} for locus in loci_ids: locus_file = fo.join_paths(schema_directory, [locus]) - # add .fasta extension if file headers did not include it + # Add .fasta extension if file headers did not include it if locus_file.endswith('.fasta') is False: locus_file += '.fasta' loci_files[locus] = [locus_file] if locus_file in updated_files and no_inferred is True: loci_files[locus].append(updated_files[locus_file][0]) - # get input/sample identifiers + # Get input/sample identifiers 
sample_ids = pd.read_csv(profiles_table, delimiter='\t', dtype=str, usecols=['FILE']) - # write file with header + # Write file with header header_basename = fo.file_basename(profiles_table).replace('.tsv', '_header.tsv') header_file = fo.join_paths(output_directory, [header_basename]) fo.write_to_file(header, header_file, 'w', '') - # create multiprocessing inputs + # Create multiprocessing inputs multi_inputs = [] - # divide and process by row chunks + # Divide and process by row chunks for i in range(0, len(sample_ids), nrows): multi_inputs.append([profiles_table, loci_ids, loci_files, hashing_function, nrows, range(1, i+1), @@ -172,12 +197,12 @@ def main(profiles_table, schema_directory, output_directory, hash_type, hashed_files = mo.map_async_parallelizer(multi_inputs, mo.function_helper, cpu_cores) - # concatenate all files + # Concatenate all files output_basename = fo.file_basename(profiles_table).replace('.tsv', '_hashed.tsv') output_file = fo.join_paths(output_directory, [output_basename]) fo.concatenate_files([header_file]+hashed_files, output_file) - # delete intermediate dataframes + # Delete intermediate dataframes fo.remove_files([header_file]+hashed_files) return output_file diff --git a/CHEWBBACA/utils/profiles_sqlitedb.py b/CHEWBBACA/utils/profiles_sqlitedb.py index 9a44765c..4c99e060 100755 --- a/CHEWBBACA/utils/profiles_sqlitedb.py +++ b/CHEWBBACA/utils/profiles_sqlitedb.py @@ -4,7 +4,7 @@ Purpose ------- -This module contains functions related with the creation and +This module contains functions related to the creation and manipulation of the SQLite database used to store allelic profiles. Notes @@ -15,6 +15,8 @@ (this was only implemented in SQLite 3.6.19). We need to take that into account when altering the data in the database. +Code documentation +------------------ """ @@ -30,9 +32,10 @@ def create_database_file(db_file): - """ Creates a SQLite database file. - If the database file already exists, - it will establish and close connection. 
+ """Create a SQLite database file. + + If the database file already exists, + it will establish and close connection. Parameters ---------- @@ -46,11 +49,10 @@ def create_database_file(db_file): successfully created, OperationalError if it could not create/establish connection """ - conn = None error = None try: - # creates db file if it does not exist + # Creates db file if it does not exist conn = sqlite3.connect(db_file) except Exception as e: error = e @@ -62,8 +64,7 @@ def create_database_file(db_file): def create_connection(db_file): - """ Creates a database connection to a SQLite - database. + """Create a database connection to a SQLite database. Parameters ---------- @@ -77,7 +78,6 @@ def create_connection(db_file): successfull or error if it was not possible to connect to the database. """ - try: conn = sqlite3.connect(db_file) except Exception as e: @@ -87,7 +87,7 @@ def create_connection(db_file): def execute_statement(conn, statement): - """ Executes a SQL statement. + """Execute a SQL statement. Parameters ---------- @@ -103,7 +103,6 @@ def execute_statement(conn, statement): successfully created, OperationalError if it could not create/establish connection """ - error = None try: c = conn.cursor() @@ -115,7 +114,7 @@ def execute_statement(conn, statement): def select_all_rows(db_file, table): - """ Retrieves all rows in a table. + """Retrieve all rows in a table. Parameters ---------- @@ -131,7 +130,6 @@ def select_all_rows(db_file, table): is represented by a tuple with the values for all columns. """ - conn = create_connection(db_file) cur = conn.cursor() cur.execute('SELECT * FROM {0}'.format(table)) @@ -146,7 +144,7 @@ def select_all_rows(db_file, table): def create_insert_statement(table, columns, placeholders): - """ Creates a base SQL insert statement. + """Create a base SQL insert statement. Parameters ---------- @@ -163,7 +161,6 @@ def create_insert_statement(table, columns, placeholders): used to insert values into `columns` of a `table`. 
""" - statement = ('INSERT OR IGNORE INTO {0}({1}) ' 'VALUES({2});'.format(table, ','.join(columns), ','.join(placeholders))) @@ -172,7 +169,7 @@ def create_insert_statement(table, columns, placeholders): def insert_loci(db_file, matrix_file): - """ Inserts loci into the loci table. + """Insert loci into the loci table. Parameters ---------- @@ -187,7 +184,6 @@ def insert_loci(db_file, matrix_file): The number of loci that were insert into the table. """ - matrix_lines = read_matrix(matrix_file) loci_list = [locus.rstrip('.fasta') for locus in matrix_lines[0][1:]] @@ -207,7 +203,7 @@ def insert_loci(db_file, matrix_file): def insert_multiple(db_file, base_statement, data): - """ Executes several insert statements. + """Execute several insert statements. Parameters ---------- @@ -225,7 +221,6 @@ def insert_multiple(db_file, base_statement, data): None if the SQL statement was successfully inserted, SQLite OperationalError otherwise. """ - error = None try: conn = create_connection(db_file) @@ -240,9 +235,7 @@ def insert_multiple(db_file, base_statement, data): def create_database(db_file): - """ Creates the database file and tables of a SQLite database - that will store the allelic profiles determined with - a schema. + """Create a SQLite database to store allelic profiles. Parameters ---------- @@ -254,7 +247,6 @@ def create_database(db_file): True if the SQLite database file and tables were successfully created, SQLite OperationalError otherwise. """ - message = create_database_file(db_file) # samples table @@ -308,8 +300,7 @@ def create_database(db_file): def read_matrix(matrix_file): - """ Reads a TSV file that contains a matrix with - allelic profiles. + """Read a TSV file that contains a matrix with allelic profiles. Parameters ---------- @@ -321,7 +312,6 @@ def read_matrix(matrix_file): matrix_lines : list of list A list with all the lines in the TSV file. 
""" - with open(matrix_file, 'r') as m: matrix_lines = list(csv.reader(m, delimiter='\t')) @@ -329,8 +319,7 @@ def read_matrix(matrix_file): def get_loci_ids(matrix_lines): - """ Extracts loci identifiers from a list - with lines from a matrix of allelic profiles. + """Extract loci identifiers from a list with allelic profiles. Parameters ---------- @@ -344,15 +333,13 @@ def get_loci_ids(matrix_lines): List with the identifiers of all loci represented in the allelic profiles. """ - loci_ids = [locus.rstrip('.fasta') for locus in matrix_lines[0][1:]] return loci_ids def get_sample_ids(matrix_lines): - """ Extracts sample identifiers from a list - with lines from a matrix of allelic profiles. + """Extract sample identifiers from a list with allelic profiles. Parameters ---------- @@ -366,15 +353,13 @@ def get_sample_ids(matrix_lines): List with the sample identifiers of all allelic profiles. """ - sample_ids = [l[0].rstrip('.fasta') for l in matrix_lines[1:]] return sample_ids def get_profiles(matrix_lines): - """ Extracts profiles from a list with lines from - a matrix of allelic profiles. + """Extract profiles from a list with allelic profiles. Parameters ---------- @@ -387,7 +372,6 @@ def get_profiles(matrix_lines): profiles : list of dict List with one dictionary per allelic profile. """ - profiles = [] loci_ids = matrix_lines[0][1:] for l in matrix_lines[1:]: @@ -399,7 +383,7 @@ def get_profiles(matrix_lines): def remove_inf(profile): - """ Remove 'INF-' prefix from inferred alleles. + """Remove the 'INF-' prefix from inferred alleles. Parameters ---------- @@ -413,7 +397,6 @@ def remove_inf(profile): List with allele identifiers stripped of the 'INF-' prefix. 
""" - clean_profile = [a.lstrip('INF-') if 'INF-' in a else a for a in profile] return clean_profile @@ -422,7 +405,6 @@ def remove_inf(profile): def jsonify_profile(profile, loci): """ """ - json_profile = '' for k, v in profile.items(): # add first entry to JSON only if locus value is not LNF @@ -444,7 +426,6 @@ def jsonify_profile(profile, loci): def store_allelecall_results(output_directory, schema_directory): """ """ - # add profiles to SQLite database # parent results folder might have several results folders results_folders = [os.path.join(output_directory, file) @@ -492,8 +473,7 @@ def store_allelecall_results(output_directory, schema_directory): def insert_allelecall_matrix(matrix_file, db_file, insert_date): - """ Inserts the data contained in a AlleleCall matrix into - the SQLite database of the schema. + """Insert profile data from a TSV file into a SQLite db. Parameters ---------- @@ -514,7 +494,6 @@ def insert_allelecall_matrix(matrix_file, db_file, insert_date): - Total number of profiles. - Number of unique profiles. """ - loci_list_db = select_all_rows(db_file, 'loci') loci_map = {t[1]: t[0] for t in loci_list_db} @@ -592,8 +571,7 @@ def insert_allelecall_matrix(matrix_file, db_file, insert_date): def select_outdated(loci, reassigned, cursor): - """ Retrives the allelic profiles that have outdated - allele identifiers. + """Retrieve the allelic profiles with outdated allele identifiers. Parameters ---------- @@ -620,7 +598,6 @@ def select_outdated(loci, reassigned, cursor): the outdated allele identifier and the updated allele identifier). """ - profiles = {} for locus, alleles in reassigned.items(): locus_id = loci[locus.split('-')[-1].rstrip('.fasta').lstrip('0')] @@ -643,8 +620,7 @@ def select_outdated(loci, reassigned, cursor): def alter_profiles(profiles, cursor): - """ Alters allele identifiers in allelic profiles - that are outdated. + """Update allele identifiers in allelic profiles. 
Parameters ---------- @@ -665,7 +641,6 @@ def alter_profiles(profiles, cursor): A dictionary with profiles hashes as keys and updated profiles as values. """ - results = {} for k, v in profiles.items(): profile = v[0] @@ -685,7 +660,7 @@ def alter_profiles(profiles, cursor): def update_profiles(schema_directory, reassigned): - """ Updates allele identifiers that have been changed. + """Update allele identifiers in allelic profiles. Parameters ---------- diff --git a/CHEWBBACA/utils/remove_genes.py b/CHEWBBACA/utils/remove_genes.py index cc1746be..cc95e2f6 100755 --- a/CHEWBBACA/utils/remove_genes.py +++ b/CHEWBBACA/utils/remove_genes.py @@ -4,8 +4,8 @@ Purpose ------- -This module removes a set of loci from a TSV file with -results from the AlleleCall process. +This module removes a set of loci from results of the +AlleleCall process. Code documentation ------------------ @@ -13,7 +13,6 @@ import csv -import argparse import pandas as pd @@ -24,7 +23,21 @@ def main(input_file, genes_list, output_file, inverse): - + """Remove loci from allele calling results. + + Parameters + ---------- + input_file : str + Path to a TSV file that contains allelic profiles + determined by the AlleleCall module. + genes_list : str + Path to a file with a list of loci to keep or remove. + output_file : str + Path to the output file. + inverse : bool + Keep the loci included in `genes_list` and remove the + rest instead. 
+ """ # Read genes list with open(genes_list, 'r') as infile: genes_list = list(csv.reader(infile, delimiter='\t')) @@ -51,39 +64,3 @@ def main(input_file, genes_list, output_file, inverse): # Save dataframe to file df.to_csv(output_file, header=True, sep='\t', index=False) - -def parse_arguments(): - - parser = argparse.ArgumentParser(description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - - parser.add_argument('-i', '--input-file', type=str, - required=True, dest='input_file', - help='TSV file that contains a matrix with ' - 'allelic profiles determined by the ' - 'AlleleCall process.') - - parser.add_argument('-gl', '--genes-list', type=str, - required=True, dest='genes_list', - help='File with the list of genes to ' - 'remove, one identifier per line.') - - parser.add_argument('-o', '--output-file', type=str, - required=True, dest='output_file', - help='Path to the output file.') - - parser.add_argument('--inverse', action='store_true', - required=False, dest='inverse', - help='List of genes that is provided ' - 'is the list of genes to keep and ' - 'all other genes should be removed.') - - args = parser.parse_args() - - return args - - -if __name__ == "__main__": - - args = parse_arguments() - main(**vars(args)) diff --git a/CHEWBBACA/utils/sequence_clustering.py b/CHEWBBACA/utils/sequence_clustering.py index 06715d82..30333a43 100644 --- a/CHEWBBACA/utils/sequence_clustering.py +++ b/CHEWBBACA/utils/sequence_clustering.py @@ -4,8 +4,8 @@ Purpose ------- -This module contains functions related with sequence clustering -based on k-mer sets. +This module contains functions related to sequence clustering +based on k-mers. 
Code documentation ------------------ @@ -19,14 +19,12 @@ from utils import (file_operations as fo, iterables_manipulation as im, blast_wrapper as bw, - fasta_operations as fao, - constants as ct) + fasta_operations as fao) except ModuleNotFoundError: from CHEWBBACA.utils import (file_operations as fo, iterables_manipulation as im, blast_wrapper as bw, - fasta_operations as fao, - constants as ct) + fasta_operations as fao) def select_representatives(kmers, reps_groups, clustering_sim): diff --git a/CHEWBBACA/utils/uniprot_requests.py b/CHEWBBACA/utils/uniprot_requests.py index 1934d658..ce375dce 100644 --- a/CHEWBBACA/utils/uniprot_requests.py +++ b/CHEWBBACA/utils/uniprot_requests.py @@ -5,7 +5,7 @@ ------- This module contains functions to perform requests to -UniProts's SPARQL endpoint and process retrieved data. +UniProt's SPARQL endpoint and process retrieved data. Code documentation ------------------ @@ -81,10 +81,10 @@ def select_name(result): i = 0 found = False - # get the entries with results + # Get the entries with results try: aux = result['results']['bindings'] - # response does not contain annotation data + # Response does not contain annotation data except Exception as e: aux = {}