From 8abe70a783b4616d24cd826333d1b33729baaed2 Mon Sep 17 00:00:00 2001
From: rfm-targa <rmamede@medicina.ulisboa.pt>
Date: Fri, 22 Mar 2024 10:41:14 +0000
Subject: [PATCH] Passing loci_finder variable to each function instead of
 defining it as global.

---
 CHEWBBACA/AlleleCall/allele_call.py | 76 +++++++++++++++++++++--------
 1 file changed, 55 insertions(+), 21 deletions(-)

diff --git a/CHEWBBACA/AlleleCall/allele_call.py b/CHEWBBACA/AlleleCall/allele_call.py
index 6c462d1b..092f4485 100644
--- a/CHEWBBACA/AlleleCall/allele_call.py
+++ b/CHEWBBACA/AlleleCall/allele_call.py
@@ -663,7 +663,7 @@ def allele_size_classification(sequence_length, locus_mode, size_threshold):
 
 
 def write_loci_summary(classification_files, output_directory, total_inputs,
-                       classification_labels):
+                       classification_labels, loci_finder):
     """Write a TSV file with classification counts per locus.
 
     Parameters
@@ -679,6 +679,10 @@ def write_loci_summary(classification_files, output_directory, total_inputs,
     classification_labels : list
         List with the possible class labels assigned by
         chewBBACA.
+    loci_finder : re.Pattern
+        Regular expression object to search for loci identifiers
+        in paths and filenames.
+
 
     Returns
     -------
@@ -757,7 +761,7 @@ def write_logfile(start_time, end_time, total_inputs,
 
 
 def write_results_alleles(classification_files, input_identifiers,
-                          output_directory, missing_class):
+                          output_directory, missing_class, loci_finder):
     """Write a TSV file with the allelic profiles for the input samples.
 
     Parameters
@@ -770,6 +774,9 @@ def write_results_alleles(classification_files, input_identifiers,
         Path to the output directory.
     missing_class : str
         'LNF' if execution mode is 4, 'PLNF' otherwise.
+    loci_finder : re.Pattern
+        Regular expression object to search for loci identifiers
+        in paths and filenames.
 
     Returns
     -------
@@ -823,7 +830,7 @@ def write_results_alleles(classification_files, input_identifiers,
 
 def write_results_statistics(classification_files, input_identifiers,
                              cds_counts, output_directory, classification_labels,
-                             repeated_counts, invalid_counts):
+                             repeated_counts, invalid_counts, loci_finder):
     """Write a TSV file with classification counts per input.
 
     Parameters
@@ -849,6 +856,9 @@ def write_results_statistics(classification_files, input_identifiers,
     invalid_counts : dict
         Dictionary with input identifiers as keys and the total
         number of invalid CDSs as values.
+    loci_finder : re.Pattern
+        Regular expression object to search for loci identifiers
+        in paths and filenames.
 
     Returns
     -------
@@ -897,7 +907,7 @@ def write_results_statistics(classification_files, input_identifiers,
 
 def write_results_contigs(classification_files, input_identifiers,
                           output_directory, cds_coordinates_files,
-                          classification_labels):
+                          classification_labels, loci_finder):
     """Write a TSV file with the CDS coordinates for each input.
 
     Writes a TSV file with coding sequence coordinates (contig
@@ -921,6 +931,9 @@ def write_results_contigs(classification_files, input_identifiers,
         coordinates as values.
     classification_labels : list
         List with the class labels attributed by chewBBACA.
+    loci_finder : re.Pattern
+        Regular expression object to search for loci identifiers
+        in paths and filenames.
 
     Returns
     -------
@@ -1082,7 +1095,7 @@ def create_unclassified_fasta(fasta_file, prot_file, unclassified_protids,
     return output_file
 
 
-def assign_allele_ids(locus_files, ns, repeated, output_directory):
+def assign_allele_ids(locus_files, ns, repeated, output_directory, loci_finder):
     """Assign allele identifiers to CDSs classified as EXC or INF.
 
     Parameters
@@ -1170,7 +1183,7 @@ def assign_allele_ids(locus_files, ns, repeated, output_directory):
 
 
 def create_novel_fastas(inferred_alleles, inferred_representatives,
-                        sequences_file, output_directory):
+                        sequences_file, output_directory, loci_finder):
     """Create FASTA files with the novel alleles for each locus.
 
     Parameters
@@ -1189,6 +1202,9 @@ def create_novel_fastas(inferred_alleles, inferred_representatives,
     output_directory : str
         Path to the directory where the FASTA files that contain
         the novel alleles will be saved to.
+    loci_finder : re.Pattern
+        Regular expression object to search for loci identifiers
+        in paths and filenames.
 
     Returns
     -------
@@ -1238,7 +1254,7 @@ def create_novel_fastas(inferred_alleles, inferred_representatives,
     return [total_inferred, total_representatives, updated_novel]
 
 
-def add_inferred_alleles(inferred_alleles):
+def add_inferred_alleles(inferred_alleles, loci_finder):
     """Add inferred alleles to a schema.
 
     Prameters
@@ -1247,6 +1263,9 @@ def add_inferred_alleles(inferred_alleles):
         Dictionary with paths to loci FASTA files in the schema as
         keys and paths to the FASTA files that contain the novel
         alleles as values.
+    loci_finder : re.Pattern
+        Regular expression object to search for loci identifiers
+        in paths and filenames.
 
     Returns
     -------
@@ -1639,7 +1658,8 @@ def classify_inexact_matches(locus, genomes_matches, inv_map,
 
 
 def create_missing_fasta(class_files, fasta_file, input_map, dna_hashtable,
-                         output_directory, classification_labels, cds_input):
+                         output_directory, classification_labels, cds_input,
+						 loci_finder):
     """Create Fasta file with sequences for missing data classes.
 
     Parameters
@@ -1665,6 +1685,9 @@ def create_missing_fasta(class_files, fasta_file, input_map, dna_hashtable,
         List with the class labels attributed by chewBBACA.
     cds_input : bool
         False if there are files with CDS coordinates, True otherwise.
+    loci_finder : re.Pattern
+        Regular expression object to search for loci identifiers
+        in paths and filenames.
 
     Returns
     -------
@@ -1843,7 +1866,7 @@ def count_invalid(input_ids, invalid_seqids, cds_index, distinct_htable):
     return invalid_counts
 
 
-def merge_blast_results(blast_outfiles, output_directory):
+def merge_blast_results(blast_outfiles, output_directory, loci_finder):
     """Concatenate BLAST output files based on locus identifier.
 
     Parameters
@@ -1853,6 +1876,9 @@ def merge_blast_results(blast_outfiles, output_directory):
         the locus identifier.
     output_directory : str
         Path to the output directory.
+    loci_finder : re.Pattern
+        Regular expression object to search for loci identifiers
+        in paths and filenames.
 
     Returns
     -------
@@ -1877,7 +1903,8 @@ def merge_blast_results(blast_outfiles, output_directory):
 
 
 def allele_calling(fasta_files, schema_directory, temp_directory,
-                   loci_modes, loci_files, config, pre_computed_dir):
+                   loci_modes, loci_files, config, pre_computed_dir,
+				   loci_finder):
     """Perform allele calling for a set of inputs.
 
     Parameters
@@ -1901,6 +1928,9 @@ def allele_calling(fasta_files, schema_directory, temp_directory,
     pre_computed_dir : str
         Path to the the directory that contains the files with the
         pre-computed hash tables.
+    loci_finder : re.Pattern
+        Regular expression object to search for loci identifiers
+        in paths and filenames.
 
     Returns
     -------
@@ -2361,7 +2391,7 @@ def allele_calling(fasta_files, schema_directory, temp_directory,
         # Concatenate files based on locus identifier included in file paths
         blast_merged_dir = fo.join_paths(blast_results_dir, ['concatenated'])
         fo.create_directory(blast_merged_dir)
-        concatenated_files = merge_blast_results(blast_files, blast_merged_dir)
+        concatenated_files = merge_blast_results(blast_files, blast_merged_dir, loci_finder)
 
         # Select best hit per target, filter based on BSR, expand matches
         # and get relevant data for classification
@@ -2762,8 +2792,6 @@ def main(input_file, loci_list, schema_directory, output_directory,
     loci_ids = sorted(loci_ids, key=lambda x: len(x), reverse=True)
     # Create regex object to search for loci identifiers in paths/strings
     loci_finder = re.compile('|'.join(loci_ids))
-    # Define as global variable to use inside any function
-    globals()['loci_finder'] = loci_finder
 
     # Create directory to store intermediate files
     temp_directory = fo.join_paths(output_directory, ['temp'])
@@ -2800,7 +2828,8 @@ def main(input_file, loci_list, schema_directory, output_directory,
     # Perform allele calling
     results = allele_calling(input_files, schema_directory,
                              temp_directory, loci_modes.copy(),
-                             loci_to_call, config, pre_computed_dir)
+                             loci_to_call, config, pre_computed_dir,
+							 loci_finder)
 
     # Assign allele identifiers, add alleles to schema and create output files
     print(f'\n {ct.WRAPPING_UP} ')
@@ -2823,7 +2852,8 @@ def main(input_file, loci_list, schema_directory, output_directory,
                                             results['int_to_unique'],
                                             output_directory,
                                             results['cds_coordinates'],
-                                            classification_labels)
+                                            classification_labels,
+											loci_finder)
     outfile, repeated_info, repeated_counts = results_contigs
 
     # Identify paralogous loci
@@ -2840,7 +2870,7 @@ def main(input_file, loci_list, schema_directory, output_directory,
     fo.create_directory(novel_data_directory)
     assignment_inputs = list(results['classification_files'].items())
     repeated_hashes = set(repeated_info.keys())
-    assignment_inputs = [[g, ns, repeated_hashes, novel_data_directory, assign_allele_ids]
+    assignment_inputs = [[g, ns, repeated_hashes, novel_data_directory, loci_finder, assign_allele_ids]
                          for g in assignment_inputs]
 
     novel_alleles = mo.map_async_parallelizer(assignment_inputs,
@@ -2911,7 +2941,7 @@ def main(input_file, loci_list, schema_directory, output_directory,
             if no_inferred is False:
                 print('Adding new alleles to schema...')
                 # Add inferred alleles to schema
-                alleles_added = add_inferred_alleles(updated_novel)
+                alleles_added = add_inferred_alleles(updated_novel, loci_finder)
                 # Recompute mode for loci with novel alleles
                 print(f'Updating allele size mode values stored in {loci_modes_file}')
                 for locus_novel in novel_alleles:
@@ -2929,7 +2959,8 @@ def main(input_file, loci_list, schema_directory, output_directory,
     profiles_table = write_results_alleles(list(results['classification_files'].values()),
                                            list(results['int_to_unique'].values()),
                                            output_directory,
-                                           classification_labels[-1])
+                                           classification_labels[-1],
+										   loci_finder)
 
     # Create file with class counts per input file
     print(f'Creating file with class counts per input ({ct.RESULTS_STATISTICS_BASENAME})...')
@@ -2939,14 +2970,16 @@ def main(input_file, loci_list, schema_directory, output_directory,
                                                 output_directory,
                                                 classification_labels,
                                                 repeated_counts,
-                                                results['invalid_alleles'][1])
+                                                results['invalid_alleles'][1],
+												loci_finder)
 
     # Create file with class counts per locus called
     print(f'Creating file with class counts per locus ({ct.LOCI_STATS_BASENAME})...')
     loci_stats_file = write_loci_summary(results['classification_files'],
                                          output_directory,
                                          len(input_files),
-                                         classification_labels)
+                                         classification_labels,
+										 loci_finder)
 
     # Create FASTA file with unclassified CDSs
     if output_unclassified is True:
@@ -2968,7 +3001,8 @@ def main(input_file, loci_list, schema_directory, output_directory,
                                                 results['dna_hashtable'],
                                                 output_directory,
                                                 classification_labels,
-                                                config['CDS input'])
+                                                config['CDS input'],
+												loci_finder)
 
     # Create FASTA file with inferred alleles
     if len(novel_alleles) > 0 and output_novel is True: