Skip to content

Commit

Permalink
New fixes (#522)
Browse files Browse the repository at this point in the history
* feat: add data/check_pandas.py

* add data/check_smiles_split.py

* update kg meta.yaml files

* add data/natural

* add dataset scripts

* update data/text_sampling/

* update meta and transform

* additional fixes

* apply pre-commit hook

* sort exclude_from_standard_tabular_text_templates

* more fixes (#517)

* fix: use SMILES__description instead of SMILES__names__noun for mol. repr. sampling

* fix: mol_repr_transl/transform.py w/o SMILES with H and with split col only

* fix: exclude uniprot binding ..

* fix: missing ! and wrong variable name in template

* fix: missing ! in templates

* fix: and rheadb and sort exclude_from_standard_tabular_text_templates

* fix: RedDB templates and fully export to meta.yaml

* fix: aminoacids templates var name

* fix: polymer_data templates var name

* fix: var dtypes

* fix: name and id

* fix: add rdkit_features to /exclude_from_standard_tabular_text_templates

* fix: # missing in template var

* fix: # missing in template var 2

* fix: fix dialogue template in compound_protein_compound_*

* fix: QC templates 1

* update reddb

* update zhu

* add description of representation

* orexin1_receptor_butkiewicz

* smiles__description

* molecule with SMILES

* fix: QC templates 2

* fix: QC templates 3

* fix: QC templates 3

* fix: QC templates

* fix: QC templates

* fix: QC templates

* fix: QC templates

* add organism

* make explicit

* add representation name

* add representation name

* update standard templates

* add representation name

* add representation names

* representation name use

* smiles usage

* must to

* add representation name

* Update data/kg/compound_protein_protein/meta.yaml

Co-authored-by: Michael Pieler <[email protected]>

* Update data/tabular/freesolv/meta.yaml

Co-authored-by: Michael Pieler <[email protected]>

* Update data/tabular/freesolv/transform.py

Co-authored-by: Michael Pieler <[email protected]>

* Update data/tabular/freesolv/transform.py

Co-authored-by: Michael Pieler <[email protected]>

* Update data/tabular/sr_p53_tox21/meta.yaml

Co-authored-by: Michael Pieler <[email protected]>

* Update data/kg/compound_protein_protein/meta.yaml

* Update data/kg/compound_protein_protein/meta.yaml

* Update data/kg/compound_protein_protein/meta.yaml

* Update data/tabular/freesolv/meta.yaml

* Update data/tabular/sr_p53_tox21/meta.yaml

* Update data/kg/compound_protein_compound_1/meta.yaml

* Update data/kg/compound_protein_compound_1/meta.yaml

* Update data/kg/compound_protein_compound_1/meta.yaml

* Update data/kg/compound_protein_compound_1/meta.yaml

* Update data/kg/compound_protein_compound_1/meta.yaml

* Update data/kg/compound_protein_compound_3/meta.yaml

* Update data/kg/compound_protein_compound_3/meta.yaml

* Update data/kg/compound_protein_compound_3/meta.yaml

* Update data/kg/compound_protein_compound_3/meta.yaml

* Update data/kg/compound_protein_compound_3/meta.yaml

* Update data/kg/compound_protein_go_term_1/meta.yaml

* Update data/kg/compound_protein_go_term_1/meta.yaml

* Update data/kg/compound_protein_go_term_2/meta.yaml

* Update data/kg/compound_protein_go_term_2/meta.yaml

* Update data/kg/compound_protein_go_term_3/meta.yaml

* Update data/kg/compound_protein_go_term_3/meta.yaml

* Update data/kg/compound_protein_go_term_4/meta.yaml

* Update data/kg/compound_protein_go_term_4/meta.yaml

* Update data/kg/compound_protein_pathway_disease_2/meta.yaml

* Update data/kg/drug_protein_hpo_disease/meta.yaml

* Update data/tabular/chemcaption_rdkit/meta.yaml

* Update data/tabular/mona/meta.yaml

* Update data/tabular/mona/transform.py

---------

Co-authored-by: Michael Pieler <[email protected]>
Co-authored-by: Kevin Maik Jablonka <[email protected]>
Co-authored-by: Michael Pieler <[email protected]>

* additional fixes

* Update data/check_pandas.py

Co-authored-by: Kevin M Jablonka <[email protected]>

* Update data/check_pandas.py

Co-authored-by: Kevin M Jablonka <[email protected]>

* Update data/check_smiles_split.py

Co-authored-by: Kevin M Jablonka <[email protected]>

* Update data/natural/preprocess_europepmc.py

Co-authored-by: Kevin M Jablonka <[email protected]>

* Update data/natural/preprocess_msds.py

Co-authored-by: Kevin M Jablonka <[email protected]>

* Update data/natural/preprocess_nougat.py

Co-authored-by: Kevin M Jablonka <[email protected]>

* Update data/postprocess_split.py

Co-authored-by: Kevin M Jablonka <[email protected]>

* additional fixes 2

* additional fixes 3

* additional fixes 4

* additional fixes 5

* additional fixes 6

* additional fixes 7

* additional fixes 8

* remove linebreak

* remove linebreak

* Delete data/tabular/bicerano_dataset/meta.yaml

those changes are incorrect, the CTE and density are not there for all polymers

* feat: update yamls

* Update data/text_sampling/preprocess_kg.py

* Update data/text_sampling/preprocess_kg.py

* Update data/text_sampling/preprocess_kg.py

---------

Co-authored-by: Kevin M Jablonka <[email protected]>
Co-authored-by: Kevin Maik Jablonka <[email protected]>
  • Loading branch information
3 people committed Feb 8, 2024
1 parent b445d1f commit 768f131
Show file tree
Hide file tree
Showing 56 changed files with 2,684 additions and 804 deletions.
145 changes: 145 additions & 0 deletions data/check_pandas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
"""
This check performs a basic check for data leakage. The checks in this script only focus on SMILES.
Train/test split needs to be run before running this script.
This script assumes that `test_smiles.txt` and `val_smiles.txt` exist in the current working directory.
If leakage is detected, an `AssertionError` will be thrown.
This script has a command line interface. You can run it using `python check_pandas.py <data_dir>`,
where `<data_dir>` points to a nested set of directories with `data_clean.csv` files.
"""
import os
from glob import glob
from pathlib import Path

import fire
import pandas as pd
from pandarallel import pandarallel
from tqdm import tqdm

# Enable parallel pandas operations across all cores.
# NOTE(review): pandarallel is initialized here but no parallel_apply call is
# visible in this file — confirm the initialization is still needed.
pandarallel.initialize(progress_bar=False)

# Global scaffold-split reference sets, read once at import time. Both files
# must exist in the current working directory (see module docstring); a
# missing file raises FileNotFoundError on import.
with open("test_smiles.txt", "r") as f:
    test_smiles_ref = f.readlines()
    test_smiles_ref = [x.strip() for x in test_smiles_ref]

with open("val_smiles.txt", "r") as f:
    valid_smiles_ref = f.readlines()
    valid_smiles_ref = [x.strip() for x in valid_smiles_ref]


def leakage_check(file, outdir="out"):
    """Check a single ``data_clean.csv`` for SMILES leakage between splits.

    Two checks are run:

    1. Internal: no SMILES string may occur in more than one of the file's
       own train/test/valid splits.
    2. Scaffold: no train/test SMILES may occur in the module-level scaffold
       reference sets ``test_smiles_ref`` / ``valid_smiles_ref``.

    When leakage is detected, every row whose SMILES belongs to the leaking
    test/valid set is reassigned to that split, the fixed file is written
    under ``outdir`` (mirroring the dataset subdirectory of ``file``), and
    the fixed copy is re-checked recursively.

    Parameters
    ----------
    file : str
        Path to a ``data_clean.csv`` with ``SMILES`` and ``split`` columns.
    outdir : str
        Directory that receives fixed copies of leaking files.

    Returns
    -------
    bool
        True once the file (or its fixed copy) is leakage-free.
    """
    # mirror subdir structures in outdir
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    print(f"Checking {file}")
    df = pd.read_csv(file, low_memory=False)
    print(df["split"].value_counts())
    train_smiles = set(df[df["split"] == "train"]["SMILES"].to_list())
    test_smiles = set(df[df["split"] == "test"]["SMILES"].to_list())
    valid_smiles = set(df[df["split"] == "valid"]["SMILES"].to_list())

    # NOTE: the AssertionError-based control flow below is deliberate (the
    # errors are caught and trigger the fix-up path), but asserts are
    # stripped under `python -O`; run this script without -O.
    try:
        assert (
            len(train_smiles.intersection(test_smiles)) == 0
        ), "Smiles in train and test"
        assert (
            len(train_smiles.intersection(valid_smiles)) == 0
        ), "Smiles in train and valid"
        assert (
            len(test_smiles.intersection(valid_smiles)) == 0
        ), "Smiles in test and valid"
    except AssertionError as e:
        path = os.path.join(outdir, Path(file).parts[-2], Path(file).name)
        print(f"Leakage in {file}: {e}. Fixing... {path}")
        # Move every row whose SMILES also occurs in test (resp. valid)
        # into that split; valid wins when a SMILES occurs in both.
        df.loc[df["SMILES"].isin(test_smiles), "split"] = "test"
        df.loc[df["SMILES"].isin(valid_smiles), "split"] = "valid"
        os.makedirs(os.path.dirname(path), exist_ok=True)
        df.to_csv(path, index=False)
        print(f"Saved fixed file to {path}")
        print("Checking fixed file...")
        leakage_check(path, outdir)

    try:
        assert (
            len(train_smiles.intersection(test_smiles_ref)) == 0
        ), "Smiles in train and scaffold test"

        assert (
            len(train_smiles.intersection(valid_smiles_ref)) == 0
        ), "Smiles in train and scaffold valid"

        assert (
            len(test_smiles.intersection(valid_smiles_ref)) == 0
        ), "Smiles in test and scaffold valid"
    except AssertionError as e:
        path = os.path.join(outdir, Path(file).parts[-2], Path(file).name)
        print(f"Leakage in {file}: {e}. Fixing... {path}")
        # BUGFIX: reassign rows based on the *scaffold reference* sets.
        # The original code reassigned using the file's own test/valid
        # sets (twice, redundantly), which cannot resolve a leak against
        # the scaffold reference split detected just above.
        df.loc[df["SMILES"].isin(test_smiles_ref), "split"] = "test"
        df.loc[df["SMILES"].isin(valid_smiles_ref), "split"] = "valid"
        os.makedirs(os.path.dirname(path), exist_ok=True)
        df.to_csv(path, index=False)
        print(f"Saved fixed file to {path}")
        print("Checking fixed file...")
        leakage_check(path, outdir)

    # NOTE(review): this success message is also reached after a fix-up
    # branch ran (the recursive call validates the fixed copy, not `file`).
    print(f"No leakage in {file}")
    with open("leakage_check.txt", "a") as f:
        f.write(f"No leakage in {file}\n")
        f.write(f"train: {len(train_smiles)}\n")
        f.write(f"test: {len(test_smiles)}\n")
        f.write(f"valid: {len(valid_smiles)}\n")
    return True


def check_all_files(data_dir):
    """Run ``leakage_check`` on every ``data_clean.csv`` below *data_dir*.

    Datasets that were already verified by hand are skipped, as are files
    of 35 GB or more (they are reported and left untouched).
    """
    # Dataset directories whose files were checked manually.
    manually_checked = {
        "odd_one_out",
        "uniprot_binding_single",
        "uniprot_binding_sites_multiple",
        "uniprot_organisms",
        "uniprot_reactions",
        "uniprot_sentences",
        "fda_adverse_reactions",
        "drugchat_liang_zhang_et_al",
        "herg_central",
    }
    size_limit = 35 * 1024 * 1024 * 1024  # 35 GB

    # NOTE(review): `**` without recursive=True behaves like `*`, so only
    # files exactly two directory levels below data_dir are found — confirm
    # this fixed depth is intended.
    all_csv_files = glob(os.path.join(data_dir, "**", "**", "data_clean.csv"))
    for csv_file in tqdm(all_csv_files):
        if Path(csv_file).parts[-2] in manually_checked:
            continue
        if os.path.getsize(csv_file) >= size_limit:
            print(f"Skipping {csv_file} due to size")
            continue
        try:
            leakage_check(csv_file)
        except Exception as e:
            print(f"Could not process {csv_file}: {e}")


if __name__ == "__main__":
    # CLI entry point: `python check_pandas.py <data_dir>` — fire exposes
    # check_all_files with the positional data_dir argument.
    fire.Fire(check_all_files)
Loading

0 comments on commit 768f131

Please sign in to comment.