Skip to content

Commit

Permalink
updating tests
Browse files Browse the repository at this point in the history
  • Loading branch information
kjappelbaum committed Aug 14, 2024
1 parent 79bc334 commit b5eb5e2
Show file tree
Hide file tree
Showing 6 changed files with 4,228 additions and 57 deletions.
4,201 changes: 4,201 additions & 0 deletions data/tabular/lipophilicity/data_original.txt

Large diffs are not rendered by default.

47 changes: 7 additions & 40 deletions data/tabular/lipophilicity/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
name: lipophilicity
description: Experimental results of octanol/water distribution coefficient (logD at pH 7.4).
description:
Experimental results of octanol/water distribution coefficient (logD
at pH 7.4).
targets:
- id: exp
description: experimental results of octanol/water distribution coefficient (logD at pH 7.4)
description:
experimental results of octanol/water distribution coefficient (logD
at pH 7.4)
units: (dimensionless)
type: continuous
names:
Expand Down Expand Up @@ -47,7 +51,7 @@ templates:
{exp%}
Answer: {%multiple_choice_result}
- |-
Question: Please {#estimate|guess|predict|provide!} the {exp__names__noun} of {SMILES#} by picking one choice of {%multiple_choice_enum%3-6%aA1}.
Question: Please estimate the {exp__names__noun} of the molecule with {SMILES__description} {SMILES#} by picking one choice of {%multiple_choice_enum%3-6%aA1}.
Options:
{exp%}
Answer: {%multiple_choice_result}
Expand All @@ -57,40 +61,3 @@ templates:
Options:
{exp%}
Answer:<EOI>{%multiple_choice_result}
- |-
Question: What is the {exp__names__noun} for the {#molecule|chemical|compound!} represented by the {SMILES__description} {SMILES#}?
Answer:<EOI>{exp}
- |-
Task: Determine the {exp__names__noun} for the given {SMILES__description}.
Molecule: {SMILES#}
Answer:<EOI>{exp}
- |-
Task: Please {#estimate|guess|predict|provide!} the {exp__names__noun} for the following {SMILES__description}.
Molecule: {SMILES#}
Answer:<EOI>{exp}
- |-
Question: What is the experimental {exp__names__noun} for the molecule with the {SMILES__description} {SMILES#}?
Answer:<EOI>{exp}
- |-
Task: Identify the {exp__names__noun} for the given {#molecule|chemical|compound!}.
Molecule: {SMILES#}
Answer:<EOI>{exp}
- |-
Task: Please select the correct {exp__names__noun} for the {#molecule|chemical|compound!} represented by the {SMILES__description} {SMILES#}.
Options:
{exp%}
Answer:<EOI>{%multiple_choice_result}
- |-
Task: {#Estimate|Guess|Predict|Provide!} the {exp__names__noun} for the {#molecule|chemical|compound!} with the {SMILES__description} {SMILES#}.
Answer:<EOI>{exp}
8 changes: 4 additions & 4 deletions data/tabular/opv/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,19 +117,19 @@ bibtex:
templates:
- |-
Question: What is the {PCE_ave__names__noun} of a {#non-fullerene|PC71BM|PCBM!} {#organic photovoltaics|OPV|organic solar cell|organic photovoltaics (OPV)!} device with a donor polymer with monomer {SMILES__description} {SMILES#} and {#Mw|weight-average molecular weight|weight-average molecular weight (Mw)!} {Mw#} g/mol and {#PDI|polydispersity index|polydispersity index (PDI)!} of {PDI#}?
Answer: {#The power conversion efficiency is |The PCE is!}{PCE_ave#} %.
Answer: {#The power conversion efficiency is |The PCE is !}{PCE_ave#} %.
- |-
Question: What is the {Voc__names__noun} of a {#non-fullerene|PC71BM|PCBM!} {#organic photovoltaics|OPV|organic solar cell|organic photovoltaics (OPV)!} device with a donor polymer with monomer {SMILES__description} {SMILES#} and {#Mw|weight-average molecular weight|weight-average molecular weight (Mw)!} {Mw#} g/mol and {#PDI|polydispersity index|polydispersity index (PDI)!} of {PDI#}?
Answer: {#The open-circuit voltage is |The Voc is!}{Voc#} {Voc__units}.
Answer: {#The open-circuit voltage is |The Voc is !}{Voc#} {Voc__units}.
- |-
Question: What is the {Jsc__names__noun} of a {#non-fullerene|PC71BM|PCBM!} {#organic photovoltaics|OPV|organic solar cell|organic photovoltaics (OPV)!} device with a donor polymer with monomer {SMILES__description} {SMILES#} and {#Mw|weight-average molecular weight|weight-average molecular weight (Mw)!} {Mw#} g/mol and {#PDI|polydispersity index|polydispersity index (PDI)!} of {PDI#}?
Answer: {#The short-circuit current density is |The Jsc is!}{Jsc#} {Jsc__units}.
Answer: {#The short-circuit current density is |The Jsc is !}{Jsc#} {Jsc__units}.
- |-
Question: What is the {FF__names__noun} of a {#non-fullerene|PC71BM|PCBM!} {#organic photovoltaics|OPV|organic solar cell|organic photovoltaics (OPV)!} device with a donor polymer with monomer {SMILES__description} {SMILES#} and {#Mw|weight-average molecular weight|weight-average molecular weight (Mw)!} {Mw#} g/mol and {#PDI|polydispersity index|polydispersity index (PDI)!} of {PDI#}?
Answer: {#The fill factor is |The FF is !}{FF#}.
- |-
Question: What is the {bandgap__names__noun} of a polymer with monomer {SMILES__description} {SMILES#}?
Answer: {#The bandgap is |The bandgap of the polymer is!}{bandgap#} {bandgap__units}.
Answer: {#The bandgap is |The bandgap of the polymer is !}{bandgap#} {bandgap__units}.
- |-
Question: What is the {HOMO__names__noun} of a polymer with monomer {SMILES__description} {SMILES#}?
Answer: The {HOMO__names__noun} {#of the polymer|!} is {HOMO#} {HOMO__units}.
Expand Down
14 changes: 7 additions & 7 deletions src/chemnlp/data/sampler_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
EXCLUDE_FROM_STANDARD_TABULAR_TEXT_TEMPLATES,
DEFAULT_SIGNIFICANT_DIGITS,
)


from loguru import logger
from pathlib import Path
def determine_balance_column(meta: dict, template: str) -> Optional[str]:
"""
Determine which column to use for class balancing based on the template and metadata.
Expand Down Expand Up @@ -101,14 +101,14 @@ def process_dataset(
templates = [t for t in templates if "<EOI>" in t]
else:
templates = [t for t in templates if "<EOI>" not in t]

output_dir = os.path.join(output_dir, os.path.dirname(data_dir))
os.makedirs(output_dir, exist_ok=True)

logger.debug(f"Output directory: {output_dir}, {os.path.isdir(output_dir)}")
output_dir_ = os.path.join(Path(output_dir), os.path.basename(data_dir))
os.makedirs(output_dir_, exist_ok=True)
for chunk_idx, df_chunk in enumerate(
pd.read_csv(data_path, chunksize=chunksize, low_memory=False)
):
chunk_output_dir = os.path.join(output_dir, f"chunk_{chunk_idx}")
chunk_output_dir = os.path.join(output_dir_, f"chunk_{chunk_idx}")
logger.debug(f"Processing chunk {chunk_idx} to {chunk_output_dir}")
os.makedirs(chunk_output_dir, exist_ok=True)

sampler = TemplateSampler(df_chunk, meta, config, data_dir)
Expand Down
3 changes: 3 additions & 0 deletions tests/data/test_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,8 @@ def large_sample_meta(sample_meta):


def test_basic_identifier_wrapping(sample_df, sample_meta, sample_config_with_wrapping):

sample_config_with_wrapping['excluded_from_wrapping'] = []
sampler = TemplateSampler(sample_df, sample_meta, sample_config_with_wrapping)
template = "SMILES: {SMILES#}, Name: {compound_name#}"
result = sampler.sample(sample_df.iloc[0], template)
Expand Down Expand Up @@ -371,6 +373,7 @@ def test_random_sampling(large_sample_df, large_sample_meta, sample_config):


def test_multiple_identifier_types(sample_df, sample_meta, sample_config_with_wrapping):
sample_config_with_wrapping['excluded_from_wrapping'] = []
sampler = TemplateSampler(sample_df, sample_meta, sample_config_with_wrapping)
template = "SMILES: {SMILES#}, Name: {compound_name#}"
result = sampler.sample(sample_df.iloc[0], template)
Expand Down
12 changes: 6 additions & 6 deletions tests/data/test_sampler_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import json
from chemnlp.data.sampler_cli import process_dataset
from chemnlp.data.constants import STANDARD_TABULAR_TEXT_TEMPLATES

import os

@pytest.fixture
def temp_tabular_data_dir(tmp_path):
Expand Down Expand Up @@ -89,7 +89,7 @@ def test_process_dataset(temp_data_dir, temp_output_dir):
)

# Check that output files were created
chunk_dir = temp_output_dir / "chunk_0"
chunk_dir = temp_output_dir / "data" / "chunk_0"
template_dir = chunk_dir / "template_0"
assert template_dir.exists()

Expand All @@ -114,7 +114,7 @@ def test_process_dataset_benchmarking(temp_data_dir, temp_output_dir):
)

# Check that output files were created
chunk_dir = temp_output_dir / "chunk_0"
chunk_dir = temp_output_dir / "data"/ "chunk_0"
template_dir = chunk_dir / "template_0"
assert template_dir.exists()

Expand Down Expand Up @@ -144,7 +144,7 @@ def test_process_dataset_class_balanced(temp_data_dir, temp_output_dir):
)

# Check that output files were created
chunk_dir = temp_output_dir / "chunk_0"
chunk_dir = temp_output_dir / "data"/ "chunk_0"
template_dir = chunk_dir / "template_0"
assert template_dir.exists()

Expand Down Expand Up @@ -172,12 +172,12 @@ def test_process_dataset_with_standard_templates(
)

# Check that output files were created
chunk_dir = temp_output_dir / "chunk_0"
chunk_dir = temp_output_dir / "test_dataset" / "chunk_0"


# Count the number of template directories
template_dirs = list(chunk_dir.glob("template_*"))

print(len(template_dirs))

# Expected number of templates: 1 custom + len(STANDARD_TABULAR_TEXT_TEMPLATES)
expected_template_count = 1 + len(
Expand Down

0 comments on commit b5eb5e2

Please sign in to comment.