Reserved word check (#151)

* Reserved word check * Added tests
biocore · Sep 12, 2024 · 775db1e · 775db1e
1 parent 79ca027
commit 775db1e
Show file tree

Hide file tree

Showing 3 changed files with 129 additions and 0 deletions.
diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py
@@ -14,6 +14,7 @@
 from collections import defaultdict
 from datetime import datetime
 from xml.etree import ElementTree as ET
+from metapool.prep import PREP_MF_COLUMNS
 
 
 logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)
@@ -235,6 +236,38 @@ def __init__(self, configuration_file_path, run_id, sample_sheet_path,
 
         self._configure_profile()
 
+    def identify_reserved_words(self, words):
+        '''
+        Returns the subset of words that must not appear as column names in
+        any project referenced in the Pipeline's sample-sheet/pre-prep file.
+        :param words: A list of words that may include reserved words.
+        :return: A list of the reserved words found, lower()ed and with
+                 duplicates removed; element order is not guaranteed.
+        '''
+
+        # Only strings used as column names in pre-prep files are currently
+        # considered 'reserved' as loading a pre-prep file containing these
+        # column names will fail if one or more of the strings already appears
+        # as a column name in a study's sample metadata table.
+
+        # This implementation assumes some understanding of metapool's impl,
+        # specifically how the proper set of prep-info file columns is
+        # generated. For now the functionality is defined here because this
+        # area of metapool is currently in flux.
+        if self.mapping_file is not None:
+            reserved = PREP_MF_COLUMNS
+        else:
+            # Results depend on the SheetType and SheetVersion of the
+            # sample-sheet. All columns in a prep-info file are lower()ed
+            # before being written out, so a word is reserved in every case
+            # form, e.g. 'Sample_Well' and 'Sample_well' both collide with
+            # 'sample_well'. Only CARRIED_PREP_COLUMNS is lower()ed below.
+            reserved = [x.lower() for x in
+                        self.sample_sheet.CARRIED_PREP_COLUMNS] + \
+                        self.sample_sheet.GENERATED_PREP_COLUMNS
+
+        return list(set([x.lower() for x in words]) & set(reserved))
+
     def _configure_profile(self):
         # extract the instrument type from self.run_dir and the assay type
         # from self.sample_sheet (or self.mapping_file).

diff --git a/sequence_processing_pipeline/tests/data/mgv90_test_sheet.csv b/sequence_processing_pipeline/tests/data/mgv90_test_sheet.csv
@@ -0,0 +1,40 @@
+[Header],,,,,,,,,,
+IEMFileVersion,4,,,,,,,,,
+SheetType,standard_metag,,,,,,,,,
+SheetVersion,90,,,,,,,,,
+Investigator Name,Caballero,,,,,,,,,
+Experiment Name,RKL0042,,,,,,,,,
+Date,2/26/20,,,,,,,,,
+Workflow,GenerateFASTQ,,,,,,,,,
+Application,FASTQ Only,,,,,,,,,
+Assay,Metagenomic,,,,,,,,,
+Description,,,,,,,,,,
+Chemistry,Default,,,,,,,,,
+,,,,,,,,,,
+[Reads],,,,,,,,,,
+150,,,,,,,,,,
+150,,,,,,,,,,
+,,,,,,,,,,
+[Settings],,,,,,,,,,
+ReverseComplement,0,,,,,,,,,
+,,,,,,,,,,
+[Data],,,,,,,,,,
+Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Well_description
+1,sample1,sample1,FooBar_666_p1,A1,iTru7_107_07,CCGACTAT,iTru5_01_A,ACCGACAA,Project_1111,s1
+1,sample2,sample2,FooBar_666_p1,A2,iTru7_107_08,CCGACTAT,iTru5_01_A,CTTCGCAA,Project_1111,s2
+3,sample1,sample1,FooBar_666_p1,A3,iTru7_107_09,GCCTTGTT,iTru5_01_A,AACACCAC,Project_1111,s1
+3,sample2,sample2,FooBar_666_p1,A4,iTru7_107_10,AACTTGCC,iTru5_01_A,CGTATCTC,Project_1111,s2
+3,sample3,sample3,FooBar_666_p1,A5,iTru7_107_11,CAATGTGG,iTru5_01_A,GGTACGAA,Trojecp_666,s5
+3,sample4,sample4,FooBar_666_p1,B6,iTru7_107_12,AAGGCTGA,iTru5_01_A,CGATCGAT,Trojecp_666,s6
+3,sample5,sample5,FooBar_666_p1,B8,iTru7_107_13,TTACCGAG,iTru5_01_A,AAGACACC,Trojecp_666,s7
+,,,,,,,,,,
+[Bioinformatics],,,,,,,,,,
+Sample_Project,QiitaID,BarcodesAreRC,ForwardAdapter,ReverseAdapter,HumanFiltering,library_construction_protocol,experiment_design_description,,,
+Project_1111,1111,False,AACC,GGTT,False,Knight Lab Kapa HP,Eqiiperiment,,,
+Trojecp_666,666,False,AACC,GGTT,False,Knight Lab Kapa HP,SomethingWitty,,,
+,,,,,,,,,,
+[Contact],,,,,,,,,,
+Email,Sample_Project,,,,,,,,,
+[email protected],Project_1111,,,,,,,,,
+[email protected],Trojecp_666,,,,,,,,,
+,,,,,,,,,,
diff --git a/sequence_processing_pipeline/tests/test_Pipeline.py b/sequence_processing_pipeline/tests/test_Pipeline.py
@@ -28,6 +28,7 @@ def setUp(self):
         makedirs(self.output_file_path, exist_ok=True)
         self.maxDiff = None
         self.good_sample_sheet_path = self.path('good-sample-sheet.csv')
+        self.good_legacy_sheet_path = self.path('mgv90_test_sheet.csv')
         self.mp_sheet_path = self.path('multi-project-sheet.csv')
         self.bad_sample_sheet_path = self.path('duplicate_sample-sample-sheet'
                                                '.csv')
@@ -1630,6 +1631,38 @@ def test_parse_project_name(self):
                     obs = pipeline._parse_project_name(test, t_set == 'True')
                     self.assertEqual(obs, exp)
 
+    def test_identify_reserved_words(self):
+        pipeline = Pipeline(self.good_config_file, self.good_run_id,
+                            self.good_sample_sheet_path, None,
+                            self.output_file_path, self.qiita_id,
+                            Pipeline.METAGENOMIC_PTYPE)
+
+        # assert that arbitrary strings are not reserved.
+        obs = pipeline.identify_reserved_words(['NOT_A_RESERVED_WORD',
+                                                'ANOTHER_WORD'])
+        self.assertEqual(obs, [])
+
+        # assert that 'well_id_384' is a reserved word.
+        obs = pipeline.identify_reserved_words(['well_id_384',
+                                                'NOT_A_RESERVED_WORD'])
+
+        self.assertEqual(obs, ['well_id_384'])
+
+        # create new pipeline using a legacy (v90) metagenomic sample-sheet.
+        pipeline = Pipeline(self.good_config_file, self.good_run_id,
+                            self.good_legacy_sheet_path, None,
+                            self.output_file_path, self.qiita_id,
+                            Pipeline.METAGENOMIC_PTYPE)
+
+        # assert that for legacy sample-sheets, well_id_384 is NOT reserved
+        # while 'Sample_well' and 'Sample_Well' both map to 'sample_well'.
+        obs = pipeline.identify_reserved_words(['well_id_384',
+                                                'NOT_A_RESERVED_WORD',
+                                                'Sample_well',
+                                                'Sample_Well'])
+
+        self.assertEqual(obs, ['sample_well'])
+
 
 class TestAmpliconPipeline(unittest.TestCase):
     def setUp(self):
@@ -2339,6 +2372,29 @@ def test_process_run_info_file(self):
         # These are indirectly tested as generate_dummy_sample_sheet() is
         # called by Pipeline's constructor.
 
+    def test_identify_reserved_words(self):
+        pipeline = Pipeline(self.good_config_file,
+                            self.good_run_id,
+                            None,
+                            self.good_mapping_file_path,
+                            self.output_file_path,
+                            self.qiita_id,
+                            Pipeline.AMPLICON_PTYPE)
+
+        # assert that arbitrary strings are not reserved.
+        obs = pipeline.identify_reserved_words(['NOT_A_RESERVED_WORD',
+                                                'ANOTHER_WORD'])
+        self.assertEqual(obs, [])
+
+        # assert that Sample_Well is okay for current pre-prep files but
+        # well_id_384 is reserved. All case forms of tm300_8_tool are also
+        # reserved; compare as sets since the result order is not fixed.
+        obs = pipeline.identify_reserved_words(['Sample_Well',
+                                                'TM300_8_Tool',
+                                                'tm300_8_tool',
+                                                'well_id_384'])
+        self.assertEqual(set(obs), {'tm300_8_tool', 'well_id_384'})
+
 
 class TestInstrumentUtils(unittest.TestCase):
     def setUp(self):