From 25b1a08c40ac57414947f97ceb36711a2c9c1e9c Mon Sep 17 00:00:00 2001 From: rfm-targa Date: Fri, 12 Apr 2024 16:43:23 +0100 Subject: [PATCH] Updated docstrings. --- CHEWBBACA/AlleleCall/allele_call.py | 13 +- .../AlleleCallEvaluator/evaluate_calls.py | 4 +- CHEWBBACA/CHEWBBACA_NS/README.md | 4 - CHEWBBACA/CHEWBBACA_NS/download_schema.py | 6 +- CHEWBBACA/CHEWBBACA_NS/stats_requests.py | 108 ++-------- CHEWBBACA/CHEWBBACA_NS/upload_schema.py | 199 ++++-------------- CHEWBBACA/CreateSchema/create_schema.py | 154 ++++++-------- CHEWBBACA/ExtractCgMLST/determine_cgmlst.py | 66 +----- CHEWBBACA/PrepExternalSchema/adapt_schema.py | 29 +++ CHEWBBACA/SchemaEvaluator/evaluate_schema.py | 30 +++ CHEWBBACA/UniprotFinder/annotate_schema.py | 52 ++++- CHEWBBACA/utils/blast_wrapper.py | 6 +- CHEWBBACA/utils/chewiens_requests.py | 2 +- CHEWBBACA/utils/core_functions.py | 4 +- CHEWBBACA/utils/distance_matrix.py | 36 +++- CHEWBBACA/utils/fasta_operations.py | 2 +- CHEWBBACA/utils/fasttree_wrapper.py | 9 +- CHEWBBACA/utils/file_operations.py | 12 +- CHEWBBACA/utils/gene_prediction.py | 2 +- CHEWBBACA/utils/iterables_manipulation.py | 2 +- CHEWBBACA/utils/join_profiles.py | 15 +- CHEWBBACA/utils/mafft_wrapper.py | 6 +- CHEWBBACA/utils/multiprocessing_operations.py | 2 +- CHEWBBACA/utils/parameters_validation.py | 4 +- CHEWBBACA/utils/process_datetime.py | 12 +- CHEWBBACA/utils/profile_hasher.py | 45 +++- CHEWBBACA/utils/profiles_sqlitedb.py | 73 +++---- CHEWBBACA/utils/remove_genes.py | 57 ++--- CHEWBBACA/utils/sequence_clustering.py | 10 +- CHEWBBACA/utils/uniprot_requests.py | 6 +- 30 files changed, 395 insertions(+), 575 deletions(-) delete mode 100644 CHEWBBACA/CHEWBBACA_NS/README.md diff --git a/CHEWBBACA/AlleleCall/allele_call.py b/CHEWBBACA/AlleleCall/allele_call.py index f08809f7..e1c1abf4 100644 --- a/CHEWBBACA/AlleleCall/allele_call.py +++ b/CHEWBBACA/AlleleCall/allele_call.py @@ -830,7 +830,7 @@ def write_results_alleles(classification_files, input_identifiers, def 
write_results_statistics(classification_files, input_identifiers, cds_counts, output_directory, classification_labels, - repeated_counts, invalid_data, loci_finder): + repeated_counts, invalid_data): """Write a TSV file with classification counts per input. Parameters @@ -856,9 +856,6 @@ def write_results_statistics(classification_files, input_identifiers, invalid_data : dict Dictionary with input identifiers as keys and the total number of invalid CDSs as values. - loci_finder : re.Pattern - Regular expression object to search for loci identifiers - in paths and filenames. Returns ------- @@ -870,7 +867,6 @@ def write_results_statistics(classification_files, input_identifiers, class_counts = {i: {c: 0 for c in classification_labels} for i in input_identifiers} for file in classification_files.values(): - locus_id = loci_finder.search(file).group() locus_results = fo.pickle_loader(file) for i in class_counts: @@ -1361,7 +1357,7 @@ def process_blast_results(blast_results, bsr_threshold, query_scores): bsr = cf.compute_bsr(raw_score, query_scores[query_id][1]) except Exception as e: print('Could not get the self-score for the representative ' - f'allele {query_id}') + f'allele {query_id}', e) continue # Only keep matches above BSR threshold if bsr >= bsr_threshold: @@ -1563,7 +1559,7 @@ def classify_inexact_matches(locus, genomes_matches, inv_map, int(rep_alleleid.replace('*', '').split('_')[-1]) rep_alleleid = rep_alleleid.split('_')[-1] except Exception as e: - pass + print(e) # Get hash of the CDS DNA sequence target_dna_hash = match[2] @@ -2952,8 +2948,7 @@ def main(input_file, loci_list, schema_directory, output_directory, output_directory, classification_labels, repeated_counts, - results['invalid_alleles'], - loci_finder) + results['invalid_alleles']) # Create file with class counts per locus called print(f'Creating file with class counts per locus ({ct.LOCI_STATS_BASENAME})...') diff --git a/CHEWBBACA/AlleleCallEvaluator/evaluate_calls.py 
b/CHEWBBACA/AlleleCallEvaluator/evaluate_calls.py index b8b39743..c7ef7643 100644 --- a/CHEWBBACA/AlleleCallEvaluator/evaluate_calls.py +++ b/CHEWBBACA/AlleleCallEvaluator/evaluate_calls.py @@ -214,7 +214,7 @@ def concatenate_loci_alignments(sample, loci, fasta_index, output_directory): try: alignment += str(fasta_index[seqid].seq) except Exception as e: - print(f'Could not get {sample} allele for locus {locus}.') + print(f'Could not get {sample} allele for locus {locus}.', e) # Save alignment for sample alignment_outfile = fo.join_paths(output_directory, [f'{sample}_cgMLST_alignment.fasta']) @@ -322,8 +322,6 @@ def main(input_files, schema_directory, output_directory, annotations, summary_rows = [total_samples, total_loci, total_cds, loci_sums[-1], *loci_sums[:-1]] - pa_lines = [] - dm_lines = [] phylo_data = {"phylo_data": []} if light is False: if False in [no_pa, no_dm, no_tree] or cg_alignment is True: diff --git a/CHEWBBACA/CHEWBBACA_NS/README.md b/CHEWBBACA/CHEWBBACA_NS/README.md deleted file mode 100644 index f87b0f9b..00000000 --- a/CHEWBBACA/CHEWBBACA_NS/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# chewBBACA - Chewie-NS modules - - - diff --git a/CHEWBBACA/CHEWBBACA_NS/download_schema.py b/CHEWBBACA/CHEWBBACA_NS/download_schema.py index f52f9a80..8f2240c3 100755 --- a/CHEWBBACA/CHEWBBACA_NS/download_schema.py +++ b/CHEWBBACA/CHEWBBACA_NS/download_schema.py @@ -4,11 +4,9 @@ Purpose ------- -This module enables the download of chewBBACA's schemas from the -Chewie-NS. - +This module enables the download of schemas from a Chewie-NS instance. The process enables the download of ZIP archives that contain ready-to-use -versions of any schema in the Chewie-NS. It also allows users to download +versions of any schema in Chewie-NS. It also allows users to download any schema with the structure it had at a specific time point. 
It is also possible to download the latest version of the schema through requests to the Chewie-NS API, if the compressed version that is available does not diff --git a/CHEWBBACA/CHEWBBACA_NS/stats_requests.py b/CHEWBBACA/CHEWBBACA_NS/stats_requests.py index 3d67c2e0..f22cf890 100644 --- a/CHEWBBACA/CHEWBBACA_NS/stats_requests.py +++ b/CHEWBBACA/CHEWBBACA_NS/stats_requests.py @@ -4,46 +4,13 @@ Purpose ------- -This module enables the retrieval of information/stats from the -Chewie-NS. Its main objective is to provide information about -the list of species and schemas in the Chewie-NS, so that users +This module enables the retrieval of information/stats from a +Chewie-NS instance. Its main objective is to provide information about +the list of species and schemas in Chewie-NS, so that users can quickly identify a schema of interest and download it (this process generates tables with species and schemas identifiers that can be passed to the `-sc` and `-sp` arguments of DownloadSchema). -Expected input --------------- - -The process expects the following variables whether through command line -execution or invocation of the :py:func:`main` function: - -- ``-m``, ``stats_mode`` : The process can retrieve the list of species - ("species" option) in the Chewie-NS, the list of schemas for a species - ("schemas" option and valid value for `--sp`) or information about a - single schema ("schemas" option and valid values for `--sp` and `--sc`). - - - e.g.: ``species`` or ``schemas`` - -- ``--ns_url``, ``nomenclature_server_url`` : The base URL for the Nomenclature Server. - The default value, "main", will establish a connection to "https://chewbbaca.online/", - "tutorial" to "https://tutorial.chewbbaca.online/"" and "local" to - "http://127.0.0.1:5000/NS/api/" (localhost). Users may also provide the IP address to - other Chewie-NS instances. 
- - - e.g.: ``http://127.0.0.1:5000/NS/api/`` (localhost) - -- ``--sp``, ``species_id`` : The integer identifier of a species - in the Chewie-NS. The process will retrieve the list of schemas - for the species with specified identifier. - - - e.g.: ``2`` - -- ``--sc``, ``schema_id`` : The integer identifier of a schema in - the Chewie-NS. The process will retrieve information about the - schema with specified identifier. - - - e.g.: ``4`` - Code documentation ------------------ """ @@ -51,17 +18,14 @@ import sys import requests -import argparse from urllib3.exceptions import InsecureRequestWarning try: from utils import (constants as ct, - chewiens_requests as cr, - parameters_validation as pv) + chewiens_requests as cr) except ModuleNotFoundError: from CHEWBBACA.utils import (constants as ct, - chewiens_requests as cr, - parameters_validation as pv) + chewiens_requests as cr) # Suppress only the single warning from urllib3 needed. @@ -384,7 +348,23 @@ def single_schema(species_id, schema_id, base_url, headers_get): def main(mode, nomenclature_server, species_id, schema_id): + """Get species and schema statistics from a Chewie-NS instance. + Parameters + ---------- + mode : str + The process can retrieve the list of species ("species" option) + from Chewie-NS, the list of schemas for a species ("schemas" + option and valid value for `species_id`) or information about a + single schema ("schemas" option and valid values for `species_id` + and `schema_id`). + nomenclature_server : str + The base URL for the Chewie-NS instance. + species_id : int + The integer identifier of a species in Chewie-NS. + schema_id : int + The integer identifier of a schema in Chewie-NS. 
+ """ headers_get = ct.HEADERS_GET_JSON print('\nRetrieving data...') @@ -402,50 +382,6 @@ def main(mode, nomenclature_server, species_id, schema_id): sys.exit('\nPlease provide a valid species identifier ' 'to get the list of available schemas.\n') - # print stats + # Print stats stats_text = '\n'.join(stats) print('\n{0}\n'.format(stats_text)) - - -def parse_arguments(): - - parser = argparse.ArgumentParser(description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - - parser.add_argument('-m', type=str, required=True, - dest='stats_mode', choices=['species', 'schemas'], - help='The process can retrieve the list of species ' - '("species" option) in the Chewie-NS or the ' - 'list of schemas for a species ' - '("schemas" option).') - - parser.add_argument('--ns', type=pv.validate_ns_url, required=False, - default='main', - dest='nomenclature_server', - help='The base URL for the Nomenclature Server. ' - 'The default value, "main", will establish a ' - 'connection to "https://chewbbaca.online/", ' - '"tutorial" to "https://tutorial.chewbbaca.online/" ' - 'and "local" to "http://127.0.0.1:5000/NS/api/" (localhost). 
' - 'Users may also provide the IP address to other ' - 'Chewie-NS instances.') - - parser.add_argument('--sp', type=str, required=False, - dest='species_id', default=None, - help='The integer identifier of a ' - 'species in the Chewie-NS.') - - parser.add_argument('--sc', type=str, required=False, - dest='schema_id', default=None, - help='The integer identifier of a schema in ' - 'the Chewie-NS.') - - args = parser.parse_args() - - return args - - -if __name__ == '__main__': - - args = parse_arguments() - main(**(vars(args))) diff --git a/CHEWBBACA/CHEWBBACA_NS/upload_schema.py b/CHEWBBACA/CHEWBBACA_NS/upload_schema.py index cad2400c..0c03fe87 100755 --- a/CHEWBBACA/CHEWBBACA_NS/upload_schema.py +++ b/CHEWBBACA/CHEWBBACA_NS/upload_schema.py @@ -5,86 +5,17 @@ ------- This module allows authorized users to upload chewBBACA's schemas -to the Chewie-NS. - -The process for schema upload has four stages: - - - User Permissions: Determines if the current user has permission - to upload schemas. Only Admin or Contributor level users can - upload schemas to the Chewie-NS. - - - Parameters Validation: Validation of the set of parameters associated - with the schema. Only schemas that have been used with a single valid - value per parameter can be uploaded. Invalid or multiple values - for a single parameter can lead to inconsistent results; thus, - it is strongly advised to always perform allele calling with - the same set of parameters and refrain from altering the initial - set of parameters values defined in the schema creation or - adaptation processes. - - - Schema Pre-processing: Applies quality control measures to identify - and exclude invalid alleles. Searches for annotations on UniProt - and imports annotations provided by users. - - - Schema Upload: Collects essential data and sends it to the Chewie-NS - for schema creation and data insertion. The process finishes when all - the necessary data has been uploaded. 
The Chewie-NS automatically - detects that all data has been received and finishes data insertion. - -Expected input --------------- - -The process expects the following variables whether through command line -execution or invocation of the :py:func:`main` function: - -- ``-i``, ``schema_directory`` : Path to the directory of the schema to upload. - - - e.g.: ``/home/user/schemas/ypestis_schema`` - -- ``-sp``, ``species_id`` : The integer identifier or name of the species that - the schema will be associated to in the Chewie-NS. - - - e.g.: ``1`` or ``'Yersinia pestis'`` - -- ``-sn``, ``schema_name`` : A brief and meaningful name that should help - understand the type and content of the schema. - - - e.g.: ``ypestis_cgMLST`` or ``ypestis cgMLST`` - -- ``-lp``, ``loci_prefix`` : Prefix included in the name of each locus of the - schema. - - - e.g.: ``ypestis`` - -- ``--df``, ``description_file`` : Path to a text file with a description - about the schema. Markdown syntax is supported in order to allow greater - customizability of the rendered description in the Frontend. Will default - to the schema's name if the user does not provide a valid path for a - file (default=None). - - - e.g.: ``/home/user/schemas/ypestis_description`` - -- ``--a``, ``annotations`` : Path to a TSV file with loci annotations. The - first column has loci identifiers (w/o .fasta extension), the second has - user annotations and the third has custom annotations (default=None). - - - e.g.: ``/home/user/schemas/ypestis_annotations`` - -- ``--cpu``, ``cpu_cores`` : Number of CPU cores that will be used in the - Schema Pre-processing step (default=1). - - - e.g.: ``4`` - -- ``--ns_url``, ``nomenclature_server`` : The base URL for the Nomenclature - Server. The default value, "main", will establish a connection to - "https://chewbbaca.online/", "tutorial" to "https://tutorial.chewbbaca.online/" - and "local" to "http://127.0.0.1:5000/NS/api/" (localhost). 
Users may also - provide the IP address to other Chewie-NS instances. - - - e.g.: ``http://127.0.0.1:5000/NS/api/`` (localhost) - -- ``--continue_up`` : If the process should check if the schema upload was - interrupted and try to resume it. ``True`` if provided, ``False`` otherwise. +to a Chewie-NS instance. The process requests the credentials of +the user trying to upload a schema and determines if the user is +allowed to upload the schema (only Admin and Contributor level +users can upload schemas). The schema config file is read to +validate the argument values used to create the schema. Only +schemas that have been used with a single valid value per parameter +can be uploaded. Invalid or multiple values for a single parameter +can lead to inconsistent results; thus, it is strongly advised to +always perform allele calling with the same set of parameters +and refrain from altering the initial set of parameters values +defined in the schema creation or adaptation processes. Code documentation ------------------ @@ -96,7 +27,6 @@ import json import time import hashlib -import argparse import requests import itertools import multiprocessing @@ -930,7 +860,39 @@ def upload_alleles_data(alleles_data, length_files, base_url, def main(schema_directory, species_id, schema_name, loci_prefix, description_file, annotations, cpu_cores, nomenclature_server, continue_up): + """Upload a schema to a Chewie-NS instance. + Parameters + ---------- + schema_directory : str + Path to the directory of the schema to upload. + species_id : int + The integer identifier or name of the species that + the schema will be associated to in Chewie-NS. + schema_name : str + A brief and meaningful name that should help + understand the type and content of the schema. + loci_prefix : str + A short prefix included in the name of each locus. + description_file : str + Path to a text file with a description about the schema. 
Markdown + syntax is supported in order to allow greater customizability of + the rendered description in the Frontend. Will default to the + schema's name if the user does not provide a valid path for a + file. + annotations : str + Path to a TSV file with loci annotations. The first column has + loci identifiers (w/o .fasta extension), the second has user + annotations and the third has custom annotations. + cpu_cores : int + Number of CPU cores that will be used in the pre-processing steps. + nomenclature_server : str + The base URL for the Chewie-NS instance. + continue_up : bool + If the process should check if the schema upload was interrupted + and try to resume it. + """ + if 'tutorial' not in nomenclature_server: token = cr.capture_login_credentials(nomenclature_server) else: @@ -1277,78 +1239,3 @@ def main(schema_directory, species_id, schema_name, loci_prefix, if len(absent_loci) > 0: os.remove(loci_file) os.remove('{0}.zip'.format(loci_file)) - - -def parse_arguments(): - - parser = argparse.ArgumentParser(description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - - parser.add_argument('-i', type=str, required=True, - dest='schema_directory', - help='Path to the directory of the schema to upload.') - - parser.add_argument('-sp', type=str, required=True, - dest='species_id', - help='The integer identifier or name of the species ' - 'that the schema will be associated to in ' - 'the NS.') - - parser.add_argument('-sn', type=str, required=True, - dest='schema_name', - help='A brief and meaningful name that ' - 'should help understand the type and content ' - 'of the schema.') - - parser.add_argument('-lp', type=str, required=True, - dest='loci_prefix', - help='Prefix included in the name of each locus of ' - 'the schema.') - - parser.add_argument('--df', type=str, required=False, - dest='description_file', default=None, - help='Path to a text file with a description ' - 'about the schema. 
Markdown syntax is supported ' - 'in order to offer greater customizability of ' - 'the rendered description in the Frontend. ' - 'Will default to the schema\'s name if the user ' - 'does not provide a valid path for a file.') - - parser.add_argument('--a', type=str, required=False, - dest='annotations', default=None, - help='Path to a TSV file with loci annotations. ' - 'The first column has loci identifiers ' - '(w/o .fasta extension), the second has user ' - 'annotations and the third has custom ' - 'annotations.') - - parser.add_argument('--cpu', type=int, required=False, - dest='cpu_cores', default=1, - help='Number of CPU cores that will ' - 'be used in the Schema Pre-processing step.') - - parser.add_argument('--ns', type=pv.validate_ns_url, required=False, - default='main', - dest='nomenclature_server', - help='The base URL for the Nomenclature Server. ' - 'The default value, "main", will establish a ' - 'connection to "https://chewbbaca.online/", ' - '"tutorial" to "https://tutorial.chewbbaca.online/" ' - 'and "local" to "http://127.0.0.1:5000/NS/api/" (localhost). ' - 'Users may also provide the IP address to other ' - 'Chewie-NS instances.') - - parser.add_argument('--continue_up', required=False, action='store_true', - dest='continue_up', - help='If the process should check if the schema ' - 'upload was interrupted and try to resume it.') - - args = parser.parse_args() - - return args - - -if __name__ == "__main__": - - args = parse_arguments() - main(**vars(args)) diff --git a/CHEWBBACA/CreateSchema/create_schema.py b/CHEWBBACA/CreateSchema/create_schema.py index 9c0e270f..99c55ccc 100644 --- a/CHEWBBACA/CreateSchema/create_schema.py +++ b/CHEWBBACA/CreateSchema/create_schema.py @@ -5,101 +5,10 @@ ------- This module enables the creation of a whole genome multi locus sequence -typing (wgMLST) schema seed. 
- -Expected input --------------- - -The process expects the following variables whether through command line -execution or invocation of the :py:func:`main` function: - -- ``-i``, ``input_files`` : Path to the directory that contains the input - FASTA files. Alternatively, a single file with a list of paths to FASTA - files, one per line. - - - e.g.: ``/home/user/genomes`` - -- ``-o``, ``output_directory`` : Output directory where the process will - store intermediate files and create the schema's directory. - - - e.g.: ``/home/user/schemas/new_schema`` - -- ``--n``, ``schema_name`` : Name given to the folder that will store the - schema files. - - - e.g.: ``my_schema`` - -- ``--ptf``, ``ptf_path`` : Path to the Prodigal training file. - - - e.g.: ``/home/user/training_files/species.trn`` - -- ``--bsr``, ``blast_score_ratio`` : BLAST Score Ratio value. - - - e.g.: ``0.6`` - -- ``--l``, ``minimum_length`` : Minimum sequence length. Coding sequences - shorter than this value are excluded. - - - e.g.: ``201`` - -- ``--t``, ``translation_table`` : Genetic code used to predict genes and - to translate coding sequences. - - - e.g.: ``11`` - -- ``--st``, ``size_threshold`` : CDS size variation threshold. Added to the - schema's config file and used to identify alleles with a length value that - deviates from the locus length mode during the allele calling process. - - - e.g.: ``0.2`` - -- ``--w``, ``word_size`` : word size used to generate k-mers during the - clustering step. - - - e.g.: ``5`` - -- ``--ws``, ``window_size`` : window size value. Number of consecutive - k-mers included in each window to determine a minimizer. - - - e.g.: ``5`` - -- ``--cs``, ``clustering_sim`` : clustering similarity threshold. Minimum - decimal proportion of shared distinct minimizers for a sequence to be - added to a cluster. - - - e.g.: ``0.2`` - -- ``--rf``, ``representative_filter`` : representative similarity threshold. 
- Clustered sequences are excluded if they share this proportion of distinct - minimizers with the cluster representative. - - - e.g.: ``0.9`` - -- ``--if``, ``intra_filter`` : intra-cluster similarity threshold. Clustered - sequences are excluded if they share this proportion of distinct minimizers - with another clustered sequence of equal or greater length. - - - e.g.: ``0.9`` - -- ``--cpu``, ``cpu_cores`` : Number of CPU cores used to run the process. - - - e.g.: ``4`` - -- ``--b``, ``blast_path`` : Path to the BLAST executables. - - - e.g.: ``/home/software/blast`` - -- ``--pm``, ``prodigal_mode`` : Prodigal running mode. - - - e.g.: ``single`` - -- ``--CDS``, ``cds_input`` : If provided, input is a single or several FASTA - files with coding sequences (skips gene prediction and CDS extraction). - - - e.g.: ``/home/user/coding_sequences_files`` - -- ``--no-cleanup``, ``no_cleanup`` : If provided, intermediate files - generated during process execution are not removed at the end. +typing (wgMLST) schema seed. The process selects one representative allele +per distinct locus identified in the input files. The schema seed corresponds +to a wgMLST schema with one FASTA file per distinct locus, each FASTA file +containing the representative allele selected by the process. Code documentation ------------------ @@ -516,7 +425,62 @@ def main(input_files, output_directory, schema_name, ptf_path, size_threshold, word_size, window_size, clustering_sim, representative_filter, intra_filter, cpu_cores, blast_path, cds_input, prodigal_mode, no_cleanup): + """Create a wgMLST schema seed. + Parameters + ---------- + input_files : str + Path to the directory that contains the input FASTA files. + Alternatively, a single file with a list of paths to FASTA + files, one per line. + output_directory : str + Output directory where the process will store intermediate + files and create the schema seed. 
+ schema_name : str + Name given to the folder that will store the schema seed files. + ptf_path : str + Path to the Prodigal training file. + blast_score_ratio : float + BLAST Score Ratio value. + minimum_length : int + Minimum sequence length. Coding sequences shorter than this + value are excluded. + translation_table : int + Genetic code used to predict genes and to translate coding + sequences. + size_threshold : float + CDS size variation threshold. Added to the schema's config + file and used to identify alleles with a length value that + deviates from the locus length mode during the allele calling + process. + word_size : int + K-mer size used during minimizer clustering. + window_size : int + Number of consecutive k-mers included in each window to + determine a minimizer. + clustering_sim : float + Minimum decimal proportion of shared distinct minimizers for + a sequence to be added to a cluster. + representative_filter : float + Clustered sequences are excluded if they share this proportion + of distinct minimizers with the cluster representative. + intra_filter : float + Clustered sequences are excluded if they share this proportion + of distinct minimizers with another clustered sequence of equal + or greater length. + cpu_cores : int + Number of CPU cores used to run the process. + blast_path : str + Path to the BLAST executables. + cds_input : bool + If provided, input is a single or several FASTA files with + coding sequences (skips gene prediction and CDS extraction). + prodigal_mode : str + Prodigal running mode ("single" or "meta"). + no_cleanup : bool + If provided, intermediate files generated during process + execution are not removed at the end. 
+ """ print(f'Prodigal training file: {ptf_path}') print(f'Prodigal mode: {prodigal_mode}') print(f'CPU cores: {cpu_cores}') diff --git a/CHEWBBACA/ExtractCgMLST/determine_cgmlst.py b/CHEWBBACA/ExtractCgMLST/determine_cgmlst.py index ec9439bb..f25b8162 100755 --- a/CHEWBBACA/ExtractCgMLST/determine_cgmlst.py +++ b/CHEWBBACA/ExtractCgMLST/determine_cgmlst.py @@ -4,44 +4,8 @@ Purpose ------- -This module determines the set of genes in the core genome based on -a matrix with allelic profiles and a threshold that defines the -proportion of genomes a gene must be present in to be included in -the core genome. - -Expected input --------------- - -The process expects the following variables whether through command line -execution or invocation of the :py:func:`main` function: - -- ``-i``, ``input_file`` : Path to input file containing a matrix with - 'allelic profiles. - - - e.g.: ``/home/user/chewie/results/matrix`` - -- ``-o``, ``output_directory`` : Path to the directory where the process - will store output files. - - - e.g.: ``/home/user/chewie/results/output_directory`` - -- ``--t``, ``threshold`` : Genes that constitute the core genome must be - in a proportion of genomes that is at least equal to this value. - - - e.g.: ``0.95`` - -- ``--s``, ``step`` : Number of genomes added to the cgMLST computation - at each step. - - - e.g.: ``5`` - -- ``--r``, ``genes2remove`` : Path to file with a list of genes/columns to - remove from the matrix (one gene identifier per line). - - - e.g.: ``home/user/results/genes.txt`` - -- ``--g``, ``genomes2remove`` : Path to file with a list of genomes/rows - to remove from the matrix (one genome identifier per line). +This module determines the set of loci that constitute the core genome +based on a matrix with allelic profiles and a loci presence threshold. 
Code documentation ------------------ @@ -272,30 +236,21 @@ def main(input_file, output_directory, threshold, step, Parameters ---------- input_file : str - Path a TSV file with allelic profiles for a set - of genomes. + Path to a TSV file with allelic profiles. output_directory : str Path to the directory where the process will store output files. threshold : list - Core genome determination thresholds. + Loci presence threshold used to determine the core genome. step : int Number of genomes added to the cgMLST computation at each step. genes2remove : str - Path to TXT file with the list of genomes to remove. + Path to TXT file with a list of loci to exclude from + the analysis. genomes2remove : str - Path to TXT file with the list of loci to remove. - - Returns - ------- - List with the paths to three files: - - - Path a TSV file with the cgMLST matrix. - - Path to a TXT file with the list of genes that - constitute the core genome. - - Path to a TSV file with the information about - missing data per genome. + Path to TXT file with a list of genomes to exclude from + the analysis. """ fo.create_directory(output_directory) @@ -418,8 +373,3 @@ def main(input_file, output_directory, threshold, step, plot(fig, filename=output_html, auto_open=False) print('HTML file with cgMLST per loci presence threshold ' 'and per step saved to {0}'.format(output_html)) - - -if __name__ == '__main__': - - main() diff --git a/CHEWBBACA/PrepExternalSchema/adapt_schema.py b/CHEWBBACA/PrepExternalSchema/adapt_schema.py index aeefe908..8e54d860 100644 --- a/CHEWBBACA/PrepExternalSchema/adapt_schema.py +++ b/CHEWBBACA/PrepExternalSchema/adapt_schema.py @@ -389,7 +389,36 @@ def adapt_loci(loci, schema_path, schema_short_path, bsr, min_len, def main(input_files, output_directories, cpu_cores, blast_score_ratio, minimum_length, translation_table, size_threshold, blast_path): + """ + Adapt a schema to be used with chewBBACA. 
+ Parameters + ---------- + input_files : str + Path to a TXT file with the list of schema loci to adapt. + output_directories : list + Path to the output directories to create (the main schema + directory and the 'short' directory to store representative + alleles). + cpu_cores : int + Number of CPU cores that will be used to run the process. + blast_score_ratio : float + The BLAST Score Ratio value that will be used to evaluate + allele similarity and select representative alleles. + minimum_length : int + Minimum sequence length value stored in the schema config file. + The schema adaptation process will only discard sequences smaller + than this value if the `--size-filter` parameter is provided. + translation_table : int + Genetic code used to translate alleles. + size_threshold : float + Allele size variation threshold value stored in the schema + config file. The schema adaptation process will only discard + alleles below or above the locus size threshold if the + `--size-filter` parameter is provided. + blast_path : str + Path to the directory that contains the BLAST executables. + """ schema_path, schema_short_path = output_directories # Import list of loci to adapt diff --git a/CHEWBBACA/SchemaEvaluator/evaluate_schema.py b/CHEWBBACA/SchemaEvaluator/evaluate_schema.py index d9d4fd30..d97a0706 100644 --- a/CHEWBBACA/SchemaEvaluator/evaluate_schema.py +++ b/CHEWBBACA/SchemaEvaluator/evaluate_schema.py @@ -564,7 +564,37 @@ def locus_report(locus_file, locus_data, annotation_columns, def main(schema_directory, output_directory, genes_list, annotations, translation_table, size_threshold, minimum_length, cpu_cores, loci_reports, light, add_sequences): + """Evaluate a schema and create an HTML report. + Parameters + ---------- + schema_directory : str + Path to the schema directory. + output_directory : str + Path to the output directory where the report files will be created. + genes_list : str + Path to a file with the list of loci to evaluate. 
+ annotations : str + Path to a TSV file created by the UniprotFinder module. + translation_table : int + Genetic code used to translate the alleles. + size_threshold : float + Allele size variation threshold. Used to determine if the size + of an allele is within the interval of the locus size mode +/- + the size threshold. + minimum_length : int + Minimum sequence length accepted for an allele to be included + in the schema. + cpu_cores : int + Number of CPU cores used to run the process. + loci_reports : bool + Create individual reports for the loci. + light : bool + Skips MSA and NJ tree computations for loci reports. + add_sequences : bool + Adds Code Editor components with the allele DNA and + protein sequences. + """ # Create directory to store intermediate files temp_directory = fo.join_paths(output_directory, ['temp']) fo.create_directory(temp_directory) diff --git a/CHEWBBACA/UniprotFinder/annotate_schema.py b/CHEWBBACA/UniprotFinder/annotate_schema.py index 04be16da..1fdb29e8 100755 --- a/CHEWBBACA/UniprotFinder/annotate_schema.py +++ b/CHEWBBACA/UniprotFinder/annotate_schema.py @@ -4,16 +4,13 @@ Purpose ------- -This module enables the creation of a TSV file with annotation -terms for the loci in a schema. - -The process queries UniProt's SPARQL endpoint to find exact -matches and retrieve the product name and page URL for those -matches. If users provide a taxon/taxa name/s, the process -will also search for reference proteomes for the specified -taxon/taxa and use BLASTp to align local sequences against -reference sequences to assign annotation terms based on the -BSR value computed for each alignment. +This module retrieves annotations for the loci in a schema. +The process can retrieve annotations through UniProt's SPARQL +endpoint to find exact matches. 
If users provide a taxon/taxa +name/s, the process will also search for reference proteomes +for the specified taxon/taxa and use BLASTp to align local +sequences against reference sequences to assign annotations +based on the BSR value computed for each alignment. Code documentation ------------------ @@ -331,7 +328,42 @@ def create_annotations_table(annotations, output_directory, header, def main(schema_directory, output_directory, genes_list, protein_table, blast_score_ratio, cpu_cores, taxa, proteome_matches, no_sparql, no_cleanup, blast_path): + """Annotate loci in a schema. + Parameters + ---------- + schema_directory + Path to the schema directory. + output_directory + Path to the output directory where the process will store + intermediate files and save the results. + genes_list + Path to a file that contains a list of schema loci to + annotate. + protein_table + Path to the 'cds_coordinates.tsv' file created by the + 'CreateSchema' process. + blast_score_ratio + BLAST Score Ratio value. This value is only used to evaluate + matches against reference proteomes when a taxon/taxa name/s + are provided. + cpu_cores + Number of CPU cores used by the process. + taxa + List of scientific names for a set of taxa. The process will + download reference proteomes from UniProt and align schema + translated alleles against the proteomes to find annotations + for the loci. + proteome_matches + Maximum number of proteome matches per locus to report. + no_sparql + Do not search for annotations through UniProt's SPARQL + endpoint. + no_cleanup + Keep intermediate files instead of deleting them at the end. + blast_path + Path to the directory that contains the BLAST executables. 
+ """ # Create output directory created = fo.create_directory(output_directory) if created is False: diff --git a/CHEWBBACA/utils/blast_wrapper.py b/CHEWBBACA/utils/blast_wrapper.py index b2c51ab5..d88bdc21 100644 --- a/CHEWBBACA/utils/blast_wrapper.py +++ b/CHEWBBACA/utils/blast_wrapper.py @@ -16,11 +16,9 @@ import subprocess try: - from utils import (constants as ct, - iterables_manipulation as im) + from utils import constants as ct except ModuleNotFoundError: - from CHEWBBACA.utils import (constants as ct, - iterables_manipulation as im) + from CHEWBBACA.utils import constants as ct def make_blast_db(makeblastdb_path, input_fasta, output_path, db_type): diff --git a/CHEWBBACA/utils/chewiens_requests.py b/CHEWBBACA/utils/chewiens_requests.py index f4555a36..6a546217 100644 --- a/CHEWBBACA/utils/chewiens_requests.py +++ b/CHEWBBACA/utils/chewiens_requests.py @@ -5,7 +5,7 @@ ------- This module contains functions to perform requests to -Chewie-NS (https://github.com/B-UMMI/Chewie-NS). +Chewie-NS instances (main: https://github.com/B-UMMI/Chewie-NS). Code documentation ------------------ diff --git a/CHEWBBACA/utils/core_functions.py b/CHEWBBACA/utils/core_functions.py index bcd288ee..0f0d97a6 100644 --- a/CHEWBBACA/utils/core_functions.py +++ b/CHEWBBACA/utils/core_functions.py @@ -4,7 +4,8 @@ Purpose ------- -This module contains functions related to +This module contains core functions used by chewBBACA's +modules. Code documentation ------------------ @@ -12,7 +13,6 @@ import os -import sys try: from utils import (constants as ct, diff --git a/CHEWBBACA/utils/distance_matrix.py b/CHEWBBACA/utils/distance_matrix.py index b5ed8acb..e32c6d25 100644 --- a/CHEWBBACA/utils/distance_matrix.py +++ b/CHEWBBACA/utils/distance_matrix.py @@ -4,13 +4,11 @@ Purpose ------- -Accepts a matrix with results from the AlleleCall process of -chewBBACA and determines the pairwise allelic differences to -create a distance matrix. 
It also determines the number of -shared loci to create a matrix with those values. The 'INF-' -prefix is removed and ASM, ALM, NIPH, NIPHEM, PLOT3, PLOT5, -LNF and LOTSC classifications are substituted by '0' before -performing pairwise comparisons. +Determines the pairwise allelic differences based on a TSV file +with allelic profiles determined by the AlleleCall module to +create a distance matrix. The 'INF-' prefix is removed and ASM, +ALM, NIPH, NIPHEM, PLOT3, PLOT5, LNF and LOTSC classifications +are substituted by '0' before computing the pairwise distances. Code documentation ------------------ @@ -337,8 +335,30 @@ def symmetrify_matrix(input_matrix, matrix_size, tmp_directory): def main(input_matrix, output_directory, cpu_cores, symmetric, masked): + """Compute a distance matrix based on allelic profiles. - # create output directory if it does not exist + Parameters + ---------- + input_matrix : str + Path to a TSV file with allelic profiles determined by + the AlleleCall module. + output_directory : str + Path to the output directory. + cpu_cores : int + Number of CPU cores used to compute distances. + symmetric : bool + Determine a symmetric pairwise distance matrix, instead + of a triangular matrix. + masked : bool + True if the input matrix values are already masked, False otherwise. + The process will mask the matrix values if this value is False. + + Returns + ------- + output_pairwise : str + Path to the TSV file that contains the distance matrix. + """ + # Create output directory if it does not exist if os.path.isdir(output_directory) is False: os.mkdir(output_directory) diff --git a/CHEWBBACA/utils/fasta_operations.py b/CHEWBBACA/utils/fasta_operations.py index 21ea79c8..6ab10c34 100644 --- a/CHEWBBACA/utils/fasta_operations.py +++ b/CHEWBBACA/utils/fasta_operations.py @@ -4,7 +4,7 @@ Purpose ------- -This module contains functions to work with FASTA files. +This module contains functions used to work with FASTA files. 
Code documentation ------------------ diff --git a/CHEWBBACA/utils/fasttree_wrapper.py b/CHEWBBACA/utils/fasttree_wrapper.py index de0956d7..53fc12dc 100644 --- a/CHEWBBACA/utils/fasttree_wrapper.py +++ b/CHEWBBACA/utils/fasttree_wrapper.py @@ -16,7 +16,14 @@ def call_fasttree(alignment_file, tree_file): - """ + """Compute a phylogenetic tree based on MSA data. + + Parameters + ---------- + alignment_file : str + Path to a file with a MSA. + tree_file : str + Path to the output tree file in Newick format. """ proc = subprocess.Popen(['FastTree', '-fastest', '-nosupport', '-noml', '-out', tree_file, alignment_file], diff --git a/CHEWBBACA/utils/file_operations.py b/CHEWBBACA/utils/file_operations.py index 3c644a02..1cc418b2 100644 --- a/CHEWBBACA/utils/file_operations.py +++ b/CHEWBBACA/utils/file_operations.py @@ -4,9 +4,9 @@ Purpose ------- -This module contains functions related with file operations, -such as read and write files, create and delete files, -manipulate file paths, compress files, verify file contents, +This module contains functions related to file operations, +such as reading and writing files, creating and deleting files, +manipulating file paths, compressing files, verifying file contents, etc. 
Code documentation @@ -33,11 +33,9 @@ import pandas as pd try: - from utils import (constants as ct, - iterables_manipulation as im) + from utils import iterables_manipulation as im except ModuleNotFoundError: - from CHEWBBACA.utils import (constants as ct, - iterables_manipulation as im) + from CHEWBBACA.utils import iterables_manipulation as im def file_basename(file_path, file_extension=True): diff --git a/CHEWBBACA/utils/gene_prediction.py b/CHEWBBACA/utils/gene_prediction.py index 9544f136..f2a5eb95 100644 --- a/CHEWBBACA/utils/gene_prediction.py +++ b/CHEWBBACA/utils/gene_prediction.py @@ -4,7 +4,7 @@ Purpose ------- -This module contains functions related with gene prediction +This module contains functions related to gene prediction with Pyrodigal. Code documentation diff --git a/CHEWBBACA/utils/iterables_manipulation.py b/CHEWBBACA/utils/iterables_manipulation.py index 3ee53789..7cfc94d6 100644 --- a/CHEWBBACA/utils/iterables_manipulation.py +++ b/CHEWBBACA/utils/iterables_manipulation.py @@ -4,7 +4,7 @@ Purpose ------- -This module contains functions to work with iterables. +This module contains functions used to work with iterables. Code documentation ------------------ diff --git a/CHEWBBACA/utils/join_profiles.py b/CHEWBBACA/utils/join_profiles.py index e55fc3b3..23bd2214 100755 --- a/CHEWBBACA/utils/join_profiles.py +++ b/CHEWBBACA/utils/join_profiles.py @@ -7,8 +7,7 @@ This module joins allele calling results from different runs. It can concatenate files with allelic profiles for the same set of loci or create a new file with the -allelic profiles for the loci that were common between -all input files. +allelic profiles for the loci shared by all input files. Code documentation ------------------ @@ -62,7 +61,18 @@ def concatenate_profiles(files, loci_list, output_file): def main(profiles, output_file, common): + """Join files with allelic profiles. + Parameters + ---------- + profiles : list + List with paths to TSV files with allelic profiles. 
+ output_file : str + Path to the output file. + common : bool + Whether the process should join profile data only for shared loci + when the profiles do not share the same loci sets. + """ if len(profiles) == 1: sys.exit('Provided a single file. Nothing to do.') @@ -75,7 +85,6 @@ def main(profiles, output_file, common): # check if headers are equal if all([set(headers[0]) == set(h) for h in headers[1:]]) is True: print('Profiles have {0} loci.'.format(len(headers[0])-1)) - total_profiles = concatenate_profiles(profiles, headers[0], output_file) diff --git a/CHEWBBACA/utils/mafft_wrapper.py b/CHEWBBACA/utils/mafft_wrapper.py index b9fd1f0d..9c754b54 100644 --- a/CHEWBBACA/utils/mafft_wrapper.py +++ b/CHEWBBACA/utils/mafft_wrapper.py @@ -16,11 +16,9 @@ import subprocess try: - from utils import (constants as ct, - iterables_manipulation as im) + from utils import constants as ct except ModuleNotFoundError: - from CHEWBBACA.utils import (constants as ct, - iterables_manipulation as im) + from CHEWBBACA.utils import constants as ct def call_mafft(input_file, output_file): diff --git a/CHEWBBACA/utils/multiprocessing_operations.py b/CHEWBBACA/utils/multiprocessing_operations.py index 5bf03247..c6c0ae34 100644 --- a/CHEWBBACA/utils/multiprocessing_operations.py +++ b/CHEWBBACA/utils/multiprocessing_operations.py @@ -4,7 +4,7 @@ Purpose ------- -This modules contains functions used to paralellize +This module contains functions used to parallelize function calls. Code documentation diff --git a/CHEWBBACA/utils/parameters_validation.py b/CHEWBBACA/utils/parameters_validation.py index 5124ae55..8f05e376 100644 --- a/CHEWBBACA/utils/parameters_validation.py +++ b/CHEWBBACA/utils/parameters_validation.py @@ -4,8 +4,8 @@ Purpose ------- -This module contains functions/classes related to parameter -and argument validation. +This module contains functions/classes related to the validation +of the arguments passed to chewBBACA's modules. 
Code documentation ------------------ diff --git a/CHEWBBACA/utils/process_datetime.py b/CHEWBBACA/utils/process_datetime.py index b192e3ef..fcb45304 100644 --- a/CHEWBBACA/utils/process_datetime.py +++ b/CHEWBBACA/utils/process_datetime.py @@ -128,24 +128,24 @@ def process_header(process): print(f'{hf}\n {header}\n{hf}') -# decorator to time main processes +# Decorator to time main processes def process_timer(func): - # use functools to preserve info about wrapped function + # Use functools to preserve info about wrapped function @functools.wraps(func) def wrapper(*args, **kwargs): - # get process name and print header + # Get process name and print header process_header(sys.argv[1]) - # do not measure time if it is only needed to print the help message + # Do not measure time if it is only needed to print the help message if any([option in ['-h', '--help'] for option in sys.argv]) is False: start = get_datetime() start_str = datetime_str(start) print(f'Started at: {start_str}\n') - # run function + # Run function func(*args, **kwargs) - # does not print elapsed time if the help message is printed + # Does not print elapsed time if the help message is printed end = get_datetime() end_str = datetime_str(end) print(f'\nFinished at: {end_str}') diff --git a/CHEWBBACA/utils/profile_hasher.py b/CHEWBBACA/utils/profile_hasher.py index 2ed4943a..c2048fc1 100644 --- a/CHEWBBACA/utils/profile_hasher.py +++ b/CHEWBBACA/utils/profile_hasher.py @@ -103,7 +103,7 @@ def hash_profiles(profiles_table, loci_ids, loci_files, hashing_function, current_rows = pd.read_csv(profiles_table, delimiter='\t', dtype=str, skiprows=skiprows, nrows=nrows, index_col=0) - # remove all 'INF-' prefixes, missing data and '*' from identifiers + # Remove all 'INF-' prefixes, missing data and '*' from identifiers current_rows = current_rows.apply(im.replace_chars, args=('-')) hashed_profiles = [] @@ -126,8 +126,33 @@ def hash_profiles(profiles_table, loci_ids, loci_files, hashing_function, def 
main(profiles_table, schema_directory, output_directory, hash_type, cpu_cores, nrows, updated_files, no_inferred): + """Hash allele identifiers in a matrix of allelic profiles. - # get hash function + Parameters + ---------- + profiles_table : str + Path to a TSV file with allelic profiles determined by the + AlleleCall module. + schema_directory : str + Path to the directory of the schema used to determine the + allelic profiles. + output_directory : str + Path to the output directory. + hash_type : str + Hashing algorithm to use. + cpu_cores : int + Number of CPU cores used by the process. + nrows : int + Divide input file into subsets to process more efficiently. + updated_files : dict + Dictionary with paths to schema FASTA files as keys and paths + to FASTA files updated by allele calling as values. Only used + if `no_inferred` is True. + no_inferred : bool + If the allele calling process did not add inferred alleles to + the schema. + """ + # Get hash function hashing_function = getattr(hashlib, hash_type, None) if hashing_function is None: hashing_function = getattr(zlib, hash_type, None) @@ -137,7 +162,7 @@ def main(profiles_table, schema_directory, output_directory, hash_type, 'hashlib or zlib modules.'.format(hash_type)) return False - # get loci identifiers + # Get loci identifiers with open(profiles_table, 'r') as infile: header = infile.readline() loci_ids = header.split()[1:] @@ -145,25 +170,25 @@ def main(profiles_table, schema_directory, output_directory, hash_type, loci_files = {} for locus in loci_ids: locus_file = fo.join_paths(schema_directory, [locus]) - # add .fasta extension if file headers did not include it + # Add .fasta extension if file headers did not include it if locus_file.endswith('.fasta') is False: locus_file += '.fasta' loci_files[locus] = [locus_file] if locus_file in updated_files and no_inferred is True: loci_files[locus].append(updated_files[locus_file][0]) - # get input/sample identifiers + # Get input/sample identifiers 
sample_ids = pd.read_csv(profiles_table, delimiter='\t', dtype=str, usecols=['FILE']) - # write file with header + # Write file with header header_basename = fo.file_basename(profiles_table).replace('.tsv', '_header.tsv') header_file = fo.join_paths(output_directory, [header_basename]) fo.write_to_file(header, header_file, 'w', '') - # create multiprocessing inputs + # Create multiprocessing inputs multi_inputs = [] - # divide and process by row chunks + # Divide and process by row chunks for i in range(0, len(sample_ids), nrows): multi_inputs.append([profiles_table, loci_ids, loci_files, hashing_function, nrows, range(1, i+1), @@ -172,12 +197,12 @@ def main(profiles_table, schema_directory, output_directory, hash_type, hashed_files = mo.map_async_parallelizer(multi_inputs, mo.function_helper, cpu_cores) - # concatenate all files + # Concatenate all files output_basename = fo.file_basename(profiles_table).replace('.tsv', '_hashed.tsv') output_file = fo.join_paths(output_directory, [output_basename]) fo.concatenate_files([header_file]+hashed_files, output_file) - # delete intermediate dataframes + # Delete intermediate dataframes fo.remove_files([header_file]+hashed_files) return output_file diff --git a/CHEWBBACA/utils/profiles_sqlitedb.py b/CHEWBBACA/utils/profiles_sqlitedb.py index 9a44765c..4c99e060 100755 --- a/CHEWBBACA/utils/profiles_sqlitedb.py +++ b/CHEWBBACA/utils/profiles_sqlitedb.py @@ -4,7 +4,7 @@ Purpose ------- -This module contains functions related with the creation and +This module contains functions related to the creation and manipulation of the SQLite database used to store allelic profiles. Notes @@ -15,6 +15,8 @@ (this was only implemented in SQLite 3.6.19). We need to take that into account when altering the data in the database. +Code documentation +------------------ """ @@ -30,9 +32,10 @@ def create_database_file(db_file): - """ Creates a SQLite database file. - If the database file already exists, - it will establish and close connection. 
+ """Create a SQLite database file. + + If the database file already exists, + it will establish and close connection. Parameters ---------- @@ -46,11 +49,10 @@ def create_database_file(db_file): successfully created, OperationalError if it could not create/establish connection """ - conn = None error = None try: - # creates db file if it does not exist + # Creates db file if it does not exist conn = sqlite3.connect(db_file) except Exception as e: error = e @@ -62,8 +64,7 @@ def create_database_file(db_file): def create_connection(db_file): - """ Creates a database connection to a SQLite - database. + """Create a database connection to a SQLite database. Parameters ---------- @@ -77,7 +78,6 @@ def create_connection(db_file): successfull or error if it was not possible to connect to the database. """ - try: conn = sqlite3.connect(db_file) except Exception as e: @@ -87,7 +87,7 @@ def create_connection(db_file): def execute_statement(conn, statement): - """ Executes a SQL statement. + """Execute a SQL statement. Parameters ---------- @@ -103,7 +103,6 @@ def execute_statement(conn, statement): successfully created, OperationalError if it could not create/establish connection """ - error = None try: c = conn.cursor() @@ -115,7 +114,7 @@ def execute_statement(conn, statement): def select_all_rows(db_file, table): - """ Retrieves all rows in a table. + """Retrieve all rows in a table. Parameters ---------- @@ -131,7 +130,6 @@ def select_all_rows(db_file, table): is represented by a tuple with the values for all columns. """ - conn = create_connection(db_file) cur = conn.cursor() cur.execute('SELECT * FROM {0}'.format(table)) @@ -146,7 +144,7 @@ def select_all_rows(db_file, table): def create_insert_statement(table, columns, placeholders): - """ Creates a base SQL insert statement. + """Create a base SQL insert statement. Parameters ---------- @@ -163,7 +161,6 @@ def create_insert_statement(table, columns, placeholders): used to insert values into `columns` of a `table`. 
""" - statement = ('INSERT OR IGNORE INTO {0}({1}) ' 'VALUES({2});'.format(table, ','.join(columns), ','.join(placeholders))) @@ -172,7 +169,7 @@ def create_insert_statement(table, columns, placeholders): def insert_loci(db_file, matrix_file): - """ Inserts loci into the loci table. + """Insert loci into the loci table. Parameters ---------- @@ -187,7 +184,6 @@ def insert_loci(db_file, matrix_file): The number of loci that were insert into the table. """ - matrix_lines = read_matrix(matrix_file) loci_list = [locus.rstrip('.fasta') for locus in matrix_lines[0][1:]] @@ -207,7 +203,7 @@ def insert_loci(db_file, matrix_file): def insert_multiple(db_file, base_statement, data): - """ Executes several insert statements. + """Execute several insert statements. Parameters ---------- @@ -225,7 +221,6 @@ def insert_multiple(db_file, base_statement, data): None if the SQL statement was successfully inserted, SQLite OperationalError otherwise. """ - error = None try: conn = create_connection(db_file) @@ -240,9 +235,7 @@ def insert_multiple(db_file, base_statement, data): def create_database(db_file): - """ Creates the database file and tables of a SQLite database - that will store the allelic profiles determined with - a schema. + """Create a SQLite database to store allelic profiles. Parameters ---------- @@ -254,7 +247,6 @@ def create_database(db_file): True if the SQLite database file and tables were successfully created, SQLite OperationalError otherwise. """ - message = create_database_file(db_file) # samples table @@ -308,8 +300,7 @@ def create_database(db_file): def read_matrix(matrix_file): - """ Reads a TSV file that contains a matrix with - allelic profiles. + """Read a TSV file that contains a matrix with allelic profiles. Parameters ---------- @@ -321,7 +312,6 @@ def read_matrix(matrix_file): matrix_lines : list of list A list with all the lines in the TSV file. 
""" - with open(matrix_file, 'r') as m: matrix_lines = list(csv.reader(m, delimiter='\t')) @@ -329,8 +319,7 @@ def read_matrix(matrix_file): def get_loci_ids(matrix_lines): - """ Extracts loci identifiers from a list - with lines from a matrix of allelic profiles. + """Extract loci identifiers from a list with allelic profiles. Parameters ---------- @@ -344,15 +333,13 @@ def get_loci_ids(matrix_lines): List with the identifiers of all loci represented in the allelic profiles. """ - loci_ids = [locus.rstrip('.fasta') for locus in matrix_lines[0][1:]] return loci_ids def get_sample_ids(matrix_lines): - """ Extracts sample identifiers from a list - with lines from a matrix of allelic profiles. + """Extract sample identifiers from a list with allelic profiles. Parameters ---------- @@ -366,15 +353,13 @@ def get_sample_ids(matrix_lines): List with the sample identifiers of all allelic profiles. """ - sample_ids = [l[0].rstrip('.fasta') for l in matrix_lines[1:]] return sample_ids def get_profiles(matrix_lines): - """ Extracts profiles from a list with lines from - a matrix of allelic profiles. + """Extract profiles from a list with allelic profiles. Parameters ---------- @@ -387,7 +372,6 @@ def get_profiles(matrix_lines): profiles : list of dict List with one dictionary per allelic profile. """ - profiles = [] loci_ids = matrix_lines[0][1:] for l in matrix_lines[1:]: @@ -399,7 +383,7 @@ def get_profiles(matrix_lines): def remove_inf(profile): - """ Remove 'INF-' prefix from inferred alleles. + """Remove the 'INF-' prefix from inferred alleles. Parameters ---------- @@ -413,7 +397,6 @@ def remove_inf(profile): List with allele identifiers stripped of the 'INF-' prefix. 
""" - clean_profile = [a.lstrip('INF-') if 'INF-' in a else a for a in profile] return clean_profile @@ -422,7 +405,6 @@ def remove_inf(profile): def jsonify_profile(profile, loci): """ """ - json_profile = '' for k, v in profile.items(): # add first entry to JSON only if locus value is not LNF @@ -444,7 +426,6 @@ def jsonify_profile(profile, loci): def store_allelecall_results(output_directory, schema_directory): """ """ - # add profiles to SQLite database # parent results folder might have several results folders results_folders = [os.path.join(output_directory, file) @@ -492,8 +473,7 @@ def store_allelecall_results(output_directory, schema_directory): def insert_allelecall_matrix(matrix_file, db_file, insert_date): - """ Inserts the data contained in a AlleleCall matrix into - the SQLite database of the schema. + """Insert profile data from a TSV file into a SQLite db. Parameters ---------- @@ -514,7 +494,6 @@ def insert_allelecall_matrix(matrix_file, db_file, insert_date): - Total number of profiles. - Number of unique profiles. """ - loci_list_db = select_all_rows(db_file, 'loci') loci_map = {t[1]: t[0] for t in loci_list_db} @@ -592,8 +571,7 @@ def insert_allelecall_matrix(matrix_file, db_file, insert_date): def select_outdated(loci, reassigned, cursor): - """ Retrives the allelic profiles that have outdated - allele identifiers. + """Retrieve the allelic profiles with outdated allele identifiers. Parameters ---------- @@ -620,7 +598,6 @@ def select_outdated(loci, reassigned, cursor): the outdated allele identifier and the updated allele identifier). """ - profiles = {} for locus, alleles in reassigned.items(): locus_id = loci[locus.split('-')[-1].rstrip('.fasta').lstrip('0')] @@ -643,8 +620,7 @@ def select_outdated(loci, reassigned, cursor): def alter_profiles(profiles, cursor): - """ Alters allele identifiers in allelic profiles - that are outdated. + """Update allele identifiers in allelic profiles. 
Parameters ---------- @@ -665,7 +641,6 @@ def alter_profiles(profiles, cursor): A dictionary with profiles hashes as keys and updated profiles as values. """ - results = {} for k, v in profiles.items(): profile = v[0] @@ -685,7 +660,7 @@ def alter_profiles(profiles, cursor): def update_profiles(schema_directory, reassigned): - """ Updates allele identifiers that have been changed. + """Update allele identifiers in allelic profiles. Parameters ---------- diff --git a/CHEWBBACA/utils/remove_genes.py b/CHEWBBACA/utils/remove_genes.py index cc1746be..cc95e2f6 100755 --- a/CHEWBBACA/utils/remove_genes.py +++ b/CHEWBBACA/utils/remove_genes.py @@ -4,8 +4,8 @@ Purpose ------- -This module removes a set of loci from a TSV file with -results from the AlleleCall process. +This module removes a set of loci from results of the +AlleleCall process. Code documentation ------------------ @@ -13,7 +13,6 @@ import csv -import argparse import pandas as pd @@ -24,7 +23,21 @@ def main(input_file, genes_list, output_file, inverse): - + """Remove loci from allele calling results. + + Parameters + ---------- + input_file : str + Path to a TSV file that contains allelic profiles + determined by the AlleleCall module. + genes_list : str + Path to a file with a list of loci to keep or remove. + output_file : str + Path to the output file. + inverse : bool + Keep the loci included in `genes_list` and remove the + rest instead. 
+ """ # Read genes list with open(genes_list, 'r') as infile: genes_list = list(csv.reader(infile, delimiter='\t')) @@ -51,39 +64,3 @@ def main(input_file, genes_list, output_file, inverse): # Save dataframe to file df.to_csv(output_file, header=True, sep='\t', index=False) - -def parse_arguments(): - - parser = argparse.ArgumentParser(description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - - parser.add_argument('-i', '--input-file', type=str, - required=True, dest='input_file', - help='TSV file that contains a matrix with ' - 'allelic profiles determined by the ' - 'AlleleCall process.') - - parser.add_argument('-gl', '--genes-list', type=str, - required=True, dest='genes_list', - help='File with the list of genes to ' - 'remove, one identifier per line.') - - parser.add_argument('-o', '--output-file', type=str, - required=True, dest='output_file', - help='Path to the output file.') - - parser.add_argument('--inverse', action='store_true', - required=False, dest='inverse', - help='List of genes that is provided ' - 'is the list of genes to keep and ' - 'all other genes should be removed.') - - args = parser.parse_args() - - return args - - -if __name__ == "__main__": - - args = parse_arguments() - main(**vars(args)) diff --git a/CHEWBBACA/utils/sequence_clustering.py b/CHEWBBACA/utils/sequence_clustering.py index 06715d82..30333a43 100644 --- a/CHEWBBACA/utils/sequence_clustering.py +++ b/CHEWBBACA/utils/sequence_clustering.py @@ -4,8 +4,8 @@ Purpose ------- -This module contains functions related with sequence clustering -based on k-mer sets. +This module contains functions related to sequence clustering +based on k-mers. 
Code documentation ------------------ @@ -19,14 +19,12 @@ from utils import (file_operations as fo, iterables_manipulation as im, blast_wrapper as bw, - fasta_operations as fao, - constants as ct) + fasta_operations as fao) except ModuleNotFoundError: from CHEWBBACA.utils import (file_operations as fo, iterables_manipulation as im, blast_wrapper as bw, - fasta_operations as fao, - constants as ct) + fasta_operations as fao) def select_representatives(kmers, reps_groups, clustering_sim): diff --git a/CHEWBBACA/utils/uniprot_requests.py b/CHEWBBACA/utils/uniprot_requests.py index 1934d658..ce375dce 100644 --- a/CHEWBBACA/utils/uniprot_requests.py +++ b/CHEWBBACA/utils/uniprot_requests.py @@ -5,7 +5,7 @@ ------- This module contains functions to perform requests to -UniProts's SPARQL endpoint and process retrieved data. +UniProt's SPARQL endpoint and process retrieved data. Code documentation ------------------ @@ -81,10 +81,10 @@ def select_name(result): i = 0 found = False - # get the entries with results + # Get the entries with results try: aux = result['results']['bindings'] - # response does not contain annotation data + # Response does not contain annotation data except Exception as e: aux = {}