updating tests

OpenBioML · Aug 14, 2024 · b5eb5e2
1 parent 79bc334
commit b5eb5e2
Show file tree

Hide file tree

Showing 6 changed files with 4,228 additions and 57 deletions.
diff --git a/data/tabular/lipophilicity/data_original.txt b/data/tabular/lipophilicity/data_original.txt
diff --git a/data/tabular/lipophilicity/meta.yaml b/data/tabular/lipophilicity/meta.yaml
@@ -1,8 +1,12 @@
 name: lipophilicity
-description: Experimental results of octanol/water distribution coefficient (logD at pH 7.4).
+description:
+  Experimental results of octanol/water distribution coefficient (logD
+  at pH 7.4).
 targets:
   - id: exp
-    description: experimental results of octanol/water distribution coefficient (logD at pH 7.4)
+    description:
+      experimental results of octanol/water distribution coefficient (logD
+      at pH 7.4)
     units: (dimensionless)
     type: continuous
     names:
@@ -47,7 +51,7 @@ templates:
     {exp%}
     Answer: {%multiple_choice_result}
   - |-
-    Question: Please {#estimate|guess|predict|provide!} the {exp__names__noun} of {SMILES#} by picking one choice of {%multiple_choice_enum%3-6%aA1}.
+    Question: Please estimate the {exp__names__noun} of the molecule with {SMILES__description} {SMILES#} by picking one choice of {%multiple_choice_enum%3-6%aA1}.
     Options:
     {exp%}
     Answer: {%multiple_choice_result}
@@ -57,40 +61,3 @@ templates:
     Options:
     {exp%}
     Answer:<EOI>{%multiple_choice_result}
-  - |-
-    Question: What is the {exp__names__noun} for the {#molecule|chemical|compound!} represented by the {SMILES__description} {SMILES#}?
-
-    Answer:<EOI>{exp}
-  - |-
-    Task: Determine the {exp__names__noun} for the given {SMILES__description}.
-
-    Molecule: {SMILES#}
-
-    Answer:<EOI>{exp}
-  - |-
-    Task: Please {#estimate|guess|predict|provide!} the {exp__names__noun} for the following {SMILES__description}.
-
-    Molecule: {SMILES#}
-
-    Answer:<EOI>{exp}
-  - |-
-    Question: What is the experimental {exp__names__noun} for the molecule with the {SMILES__description} {SMILES#}?
-
-    Answer:<EOI>{exp}
-  - |-
-    Task: Identify the {exp__names__noun} for the given {#molecule|chemical|compound!}.
-
-    Molecule: {SMILES#}
-
-    Answer:<EOI>{exp}
-  - |-
-    Task: Please select the correct {exp__names__noun} for the {#molecule|chemical|compound!} represented by the {SMILES__description} {SMILES#}.
-
-    Options:
-    {exp%}
-
-    Answer:<EOI>{%multiple_choice_result}
-  - |-
-    Task: {#Estimate|Guess|Predict|Provide!} the {exp__names__noun} for the {#molecule|chemical|compound!} with the {SMILES__description} {SMILES#}.
-
-    Answer:<EOI>{exp}
diff --git a/data/tabular/opv/meta.yaml b/data/tabular/opv/meta.yaml
@@ -117,19 +117,19 @@ bibtex:
 templates:
   - |-
     Question: What is the {PCE_ave__names__noun} of a {#non-fullerene|PC71BM|PCBM!} {#organic photovoltaics|OPV|organic solar cell|organic photovoltaics (OPV)!} device with a donor polymer with monomer {SMILES__description} {SMILES#} and {#Mw|weight-average molecular weight|weight-average molecular weight (Mw)!} {Mw#} g/mol and {#PDI|polydispersity index|polydispersity index (PDI)!} of {PDI#}?
-    Answer: {#The power conversion efficiency is |The PCE is!}{PCE_ave#} %.
+    Answer: {#The power conversion efficiency is |The PCE is !}{PCE_ave#} %.
   - |-
     Question: What is the {Voc__names__noun} of a {#non-fullerene|PC71BM|PCBM!} {#organic photovoltaics|OPV|organic solar cell|organic photovoltaics (OPV)!} device with a donor polymer with monomer {SMILES__description} {SMILES#} and {#Mw|weight-average molecular weight|weight-average molecular weight (Mw)!} {Mw#} g/mol and {#PDI|polydispersity index|polydispersity index (PDI)!} of {PDI#}?
-    Answer: {#The open-circuit voltage is |The Voc is!}{Voc#} {Voc__units}.
+    Answer: {#The open-circuit voltage is |The Voc is !}{Voc#} {Voc__units}.
   - |-
     Question: What is the {Jsc__names__noun} of a {#non-fullerene|PC71BM|PCBM!} {#organic photovoltaics|OPV|organic solar cell|organic photovoltaics (OPV)!} device with a donor polymer with monomer {SMILES__description} {SMILES#} and {#Mw|weight-average molecular weight|weight-average molecular weight (Mw)!} {Mw#} g/mol and {#PDI|polydispersity index|polydispersity index (PDI)!} of {PDI#}?
-    Answer: {#The short-circuit current density is |The Jsc is!}{Jsc#} {Jsc__units}.
+    Answer: {#The short-circuit current density is |The Jsc is !}{Jsc#} {Jsc__units}.
   - |-
     Question: What is the {FF__names__noun} of a {#non-fullerene|PC71BM|PCBM!} {#organic photovoltaics|OPV|organic solar cell|organic photovoltaics (OPV)!} device with a donor polymer with monomer {SMILES__description} {SMILES#} and {#Mw|weight-average molecular weight|weight-average molecular weight (Mw)!} {Mw#} g/mol and {#PDI|polydispersity index|polydispersity index (PDI)!} of {PDI#}?
     Answer: {#The fill factor is |The FF is !}{FF#}.
   - |-
     Question: What is the {bandgap__names__noun} of a polymer with monomer {SMILES__description} {SMILES#}?
-    Answer: {#The bandgap is |The bandgap of the polymer is!}{bandgap#} {bandgap__units}.
+    Answer: {#The bandgap is |The bandgap of the polymer is !}{bandgap#} {bandgap__units}.
   - |-
     Question: What is the {HOMO__names__noun} of a polymer with monomer {SMILES__description} {SMILES#}?
     Answer: The {HOMO__names__noun} {#of the polymer|!} is {HOMO#} {HOMO__units}.

diff --git a/src/chemnlp/data/sampler_cli.py b/src/chemnlp/data/sampler_cli.py
@@ -11,8 +11,8 @@
     EXCLUDE_FROM_STANDARD_TABULAR_TEXT_TEMPLATES,
     DEFAULT_SIGNIFICANT_DIGITS,
 )
-
-
+from loguru import logger
+from pathlib import Path
 def determine_balance_column(meta: dict, template: str) -> Optional[str]:
     """
     Determine which column to use for class balancing based on the template and metadata.
@@ -101,14 +101,14 @@ def process_dataset(
         templates = [t for t in templates if "<EOI>" in t]
     else:
         templates = [t for t in templates if "<EOI>" not in t]
-
-    output_dir = os.path.join(output_dir, os.path.dirname(data_dir))
-    os.makedirs(output_dir, exist_ok=True)
-
+    logger.debug(f"Outout directory: {output_dir}, {os.path.isdir(output_dir)}")
+    output_dir_ = os.path.join(Path(output_dir), os.path.basename(data_dir))
+    os.makedirs(output_dir_, exist_ok=True)
     for chunk_idx, df_chunk in enumerate(
         pd.read_csv(data_path, chunksize=chunksize, low_memory=False)
     ):
-        chunk_output_dir = os.path.join(output_dir, f"chunk_{chunk_idx}")
+        chunk_output_dir = os.path.join(output_dir_, f"chunk_{chunk_idx}")
+        logger.debug(f"Processing chunk {chunk_idx} to {chunk_output_dir}")
         os.makedirs(chunk_output_dir, exist_ok=True)
 
         sampler = TemplateSampler(df_chunk, meta, config, data_dir)

diff --git a/tests/data/test_sampler.py b/tests/data/test_sampler.py
@@ -230,6 +230,8 @@ def large_sample_meta(sample_meta):
 
 
 def test_basic_identifier_wrapping(sample_df, sample_meta, sample_config_with_wrapping):
+
+    sample_config_with_wrapping['excluded_from_wrapping'] = []
     sampler = TemplateSampler(sample_df, sample_meta, sample_config_with_wrapping)
     template = "SMILES: {SMILES#}, Name: {compound_name#}"
     result = sampler.sample(sample_df.iloc[0], template)
@@ -371,6 +373,7 @@ def test_random_sampling(large_sample_df, large_sample_meta, sample_config):
 
 
 def test_multiple_identifier_types(sample_df, sample_meta, sample_config_with_wrapping):
+    sample_config_with_wrapping['excluded_from_wrapping'] = []
     sampler = TemplateSampler(sample_df, sample_meta, sample_config_with_wrapping)
     template = "SMILES: {SMILES#}, Name: {compound_name#}"
     result = sampler.sample(sample_df.iloc[0], template)

diff --git a/tests/data/test_sampler_cli.py b/tests/data/test_sampler_cli.py
@@ -4,7 +4,7 @@
 import json
 from chemnlp.data.sampler_cli import process_dataset
 from chemnlp.data.constants import STANDARD_TABULAR_TEXT_TEMPLATES
-
+import os
 
 @pytest.fixture
 def temp_tabular_data_dir(tmp_path):
@@ -89,7 +89,7 @@ def test_process_dataset(temp_data_dir, temp_output_dir):
     )
 
     # Check that output files were created
-    chunk_dir = temp_output_dir / "chunk_0"
+    chunk_dir = temp_output_dir /  "data" / "chunk_0"
     template_dir = chunk_dir / "template_0"
     assert template_dir.exists()
 
@@ -114,7 +114,7 @@ def test_process_dataset_benchmarking(temp_data_dir, temp_output_dir):
     )
 
     # Check that output files were created
-    chunk_dir = temp_output_dir / "chunk_0"
+    chunk_dir = temp_output_dir / "data"/ "chunk_0"
     template_dir = chunk_dir / "template_0"
     assert template_dir.exists()
 
@@ -144,7 +144,7 @@ def test_process_dataset_class_balanced(temp_data_dir, temp_output_dir):
     )
 
     # Check that output files were created
-    chunk_dir = temp_output_dir / "chunk_0"
+    chunk_dir = temp_output_dir / "data"/ "chunk_0"
     template_dir = chunk_dir / "template_0"
     assert template_dir.exists()
 
@@ -172,12 +172,12 @@ def test_process_dataset_with_standard_templates(
     )
 
     # Check that output files were created
-    chunk_dir = temp_output_dir / "chunk_0"
+    chunk_dir = temp_output_dir / "test_dataset" / "chunk_0"
+
 
     # Count the number of template directories
     template_dirs = list(chunk_dir.glob("template_*"))
 
-    print(len(template_dirs))
 
     # Expected number of templates: 1 custom + len(STANDARD_TABULAR_TEXT_TEMPLATES)
     expected_template_count = 1 + len(