Skip to content

Commit

Permalink
Reserved word check (#151)
Browse files Browse the repository at this point in the history
* Reserved word check

* Added tests
  • Loading branch information
charles-cowart authored Sep 12, 2024
1 parent 79ca027 commit 775db1e
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 0 deletions.
33 changes: 33 additions & 0 deletions sequence_processing_pipeline/Pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from collections import defaultdict
from datetime import datetime
from xml.etree import ElementTree as ET
from metapool.prep import PREP_MF_COLUMNS


logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)
Expand Down Expand Up @@ -235,6 +236,38 @@ def __init__(self, configuration_file_path, run_id, sample_sheet_path,

self._configure_profile()

def identify_reserved_words(self, words):
    '''
    Returns a list of words that should not appear as column names in any
    project referenced in the Pipeline's sample-sheet/pre-prep file.
    :param words: A list of words that may include reserved words.
    :return: A sorted list, without duplicates, of the lower-cased form of
             every word in words that is reserved. A word is matched
             regardless of whether it was supplied in upper, lower, or
             mixed case. An empty list is returned if nothing is reserved.
    '''

    # Only strings used as column names in pre-prep files are currently
    # considered 'reserved' as loading a pre-prep file containing these
    # column names will fail if one or more of the strings already appears
    # as a column name in a study's sample metadata table.

    # This implementation assumes some understanding of metapool's impl,
    # specifically how the proper set of prep-info file columns are
    # generated. For now the functionality will be defined here as this
    # area of metapool is currently in flux.
    if self.mapping_file is not None:
        candidates = PREP_MF_COLUMNS
    else:
        # results will be dependent on SheetType and SheetVersion of
        # the sample-sheet. list() is applied to both operands so that the
        # concatenation works whether the attributes are lists or tuples.
        candidates = list(self.sample_sheet.CARRIED_PREP_COLUMNS) + \
            list(self.sample_sheet.GENERATED_PREP_COLUMNS)

    # Since all columns in a prep-info file are lower()ed before writing
    # out to file, the word must be reserved in all case forms. e.g.:
    # 'Sample_Well' and 'Sample_well' are both forms of 'sample_well'.
    # Every source of reserved names is therefore normalized the same way,
    # not just the carried columns.
    reserved = {x.lower() for x in candidates}

    # sorted() gives callers a stable ordering; a bare set intersection
    # would otherwise yield the matches in arbitrary order.
    return sorted(reserved.intersection(x.lower() for x in words))

def _configure_profile(self):
# extract the instrument type from self.run_dir and the assay type
# from self.sample_sheet (or self.mapping_file).
Expand Down
40 changes: 40 additions & 0 deletions sequence_processing_pipeline/tests/data/mgv90_test_sheet.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
[Header],,,,,,,,,,
IEMFileVersion,4,,,,,,,,,
SheetType,standard_metag,,,,,,,,,
SheetVersion,90,,,,,,,,,
Investigator Name,Caballero,,,,,,,,,
Experiment Name,RKL0042,,,,,,,,,
Date,2/26/20,,,,,,,,,
Workflow,GenerateFASTQ,,,,,,,,,
Application,FASTQ Only,,,,,,,,,
Assay,Metagenomic,,,,,,,,,
Description,,,,,,,,,,
Chemistry,Default,,,,,,,,,
,,,,,,,,,,
[Reads],,,,,,,,,,
150,,,,,,,,,,
150,,,,,,,,,,
,,,,,,,,,,
[Settings],,,,,,,,,,
ReverseComplement,0,,,,,,,,,
,,,,,,,,,,
[Data],,,,,,,,,,
Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Well_description
1,sample1,sample1,FooBar_666_p1,A1,iTru7_107_07,CCGACTAT,iTru5_01_A,ACCGACAA,Project_1111,s1
1,sample2,sample2,FooBar_666_p1,A2,iTru7_107_08,CCGACTAT,iTru5_01_A,CTTCGCAA,Project_1111,s2
3,sample1,sample1,FooBar_666_p1,A3,iTru7_107_09,GCCTTGTT,iTru5_01_A,AACACCAC,Project_1111,s1
3,sample2,sample2,FooBar_666_p1,A4,iTru7_107_10,AACTTGCC,iTru5_01_A,CGTATCTC,Project_1111,s2
3,sample3,sample3,FooBar_666_p1,A5,iTru7_107_11,CAATGTGG,iTru5_01_A,GGTACGAA,Trojecp_666,s5
3,sample4,sample4,FooBar_666_p1,B6,iTru7_107_12,AAGGCTGA,iTru5_01_A,CGATCGAT,Trojecp_666,s6
3,sample5,sample5,FooBar_666_p1,B8,iTru7_107_13,TTACCGAG,iTru5_01_A,AAGACACC,Trojecp_666,s7
,,,,,,,,,,
[Bioinformatics],,,,,,,,,,
Sample_Project,QiitaID,BarcodesAreRC,ForwardAdapter,ReverseAdapter,HumanFiltering,library_construction_protocol,experiment_design_description,,,
Project_1111,1111,False,AACC,GGTT,False,Knight Lab Kapa HP,Eqiiperiment,,,
Trojecp_666,666,False,AACC,GGTT,False,Knight Lab Kapa HP,SomethingWitty,,,
,,,,,,,,,,
[Contact],,,,,,,,,,
Email,Sample_Project,,,,,,,,,
[email protected],Project_1111,,,,,,,,,
[email protected],Trojecp_666,,,,,,,,,
,,,,,,,,,,
56 changes: 56 additions & 0 deletions sequence_processing_pipeline/tests/test_Pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def setUp(self):
makedirs(self.output_file_path, exist_ok=True)
self.maxDiff = None
self.good_sample_sheet_path = self.path('good-sample-sheet.csv')
self.good_legacy_sheet_path = self.path('mgv90_test_sheet.csv')
self.mp_sheet_path = self.path('multi-project-sheet.csv')
self.bad_sample_sheet_path = self.path('duplicate_sample-sample-sheet'
'.csv')
Expand Down Expand Up @@ -1630,6 +1631,38 @@ def test_parse_project_name(self):
obs = pipeline._parse_project_name(test, t_set == 'True')
self.assertEqual(obs, exp)

def test_identify_reserved_words(self):
    # pipeline built from the regular metagenomic sample-sheet fixture.
    current = Pipeline(self.good_config_file, self.good_run_id,
                       self.good_sample_sheet_path, None,
                       self.output_file_path, self.qiita_id,
                       Pipeline.METAGENOMIC_PTYPE)

    # arbitrary strings must not be reported as reserved.
    arbitrary = ['NOT_A_RESERVED_WORD', 'ANOTHER_WORD']
    self.assertEqual(current.identify_reserved_words(arbitrary), [])

    # 'well_id_384' is reserved; the arbitrary string beside it is not.
    mixed = ['well_id_384', 'NOT_A_RESERVED_WORD']
    self.assertEqual(current.identify_reserved_words(mixed),
                     ['well_id_384'])

    # second pipeline, built from a legacy (v90) metagenomic sample-sheet.
    legacy = Pipeline(self.good_config_file, self.good_run_id,
                      self.good_legacy_sheet_path, None,
                      self.output_file_path, self.qiita_id,
                      Pipeline.METAGENOMIC_PTYPE)

    # for legacy sample-sheets 'well_id_384' is NOT reserved. Both case
    # variants of 'Sample_well' should collapse into a single
    # 'sample_well' entry in the result.
    legacy_words = ['well_id_384', 'NOT_A_RESERVED_WORD',
                    'Sample_well', 'Sample_Well']
    self.assertEqual(legacy.identify_reserved_words(legacy_words),
                     ['sample_well'])


class TestAmpliconPipeline(unittest.TestCase):
def setUp(self):
Expand Down Expand Up @@ -2339,6 +2372,29 @@ def test_process_run_info_file(self):
# These are indirectly tested as generate_dummy_sample_sheet() is
# called by Pipeline's constructor.

def test_identify_reserved_words(self):
    # amplicon pipeline driven by a mapping (pre-prep) file rather than a
    # sample-sheet, hence None in the sample-sheet position.
    amplicon = Pipeline(self.good_config_file, self.good_run_id, None,
                        self.good_mapping_file_path,
                        self.output_file_path, self.qiita_id,
                        Pipeline.AMPLICON_PTYPE)

    # arbitrary strings must not be reported as reserved.
    arbitrary = ['NOT_A_RESERVED_WORD', 'ANOTHER_WORD']
    self.assertEqual(amplicon.identify_reserved_words(arbitrary), [])

    # Sample_Well is acceptable for current pre-prep files whereas
    # well_id_384 is reserved. Every case form of tm300_8_tool is reserved
    # as well and is reported once, in lower-case.
    candidates = ['Sample_Well', 'TM300_8_Tool', 'tm300_8_tool',
                  'well_id_384']
    found = amplicon.identify_reserved_words(candidates)

    # compared as a set because the order of the result is not asserted.
    self.assertEqual(set(found), {'tm300_8_tool', 'well_id_384'})


class TestInstrumentUtils(unittest.TestCase):
def setUp(self):
Expand Down

0 comments on commit 775db1e

Please sign in to comment.