Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add uspto data from drfp #95

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Open
41 changes: 41 additions & 0 deletions data/uspto_500k/meta.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
---
name: uspto_500k
description: United States Patent and Trademark Office reaction dataset with yields.
targets:
- id: yield
description: reaction yields
units: '%'
type: continuous
names:
- reaction yield
- yield
uris:
- http://purl.allotrope.org/ontologies/quality#AFQ_0000227
identifiers:
- id: reaction_SMILES
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there is a new entry for that

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, great I see it. I will edit the file and PR it again.

type: RXNSMILES
description: reaction SMILES
license: CC0
links:
- url: https://doi.org/10.17863/CAM.16293
description: corresponding publication
- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv
description: data source
- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv
description: data source
- url: https://tdcommons.ai/single_pred_tasks/yields/#uspto
description: other source
num_points: 853638
bibtex:
- |-
@article{https://doi.org/10.17863/cam.16293,
doi = {10.17863/CAM.16293},
url = {https://www.repository.cam.ac.uk/handle/1810/244727},
year = {2012},
publisher = {Apollo - University of Cambridge Repository},
keywords = {Name to structure, OPSIN, Chemical text mining, Text mining,
Patent reaction extraction, Reaction mining, Patents},
language = {en},
author = {Lowe, Daniel Mark},
title = {Extraction of chemical structures and reactions from the literature},
copyright = {All Rights Reserved}
}
135 changes: 135 additions & 0 deletions data/uspto_500k/transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import pandas as pd
import yaml
from tdc.single_pred import Yields


def get_and_transform_data():
    """Fetch the USPTO yields dataset, clean it, and write the CSV and meta files.

    Steps:
      1. Load ``USPTO_Yields`` through ``tdc.single_pred.Yields`` and merge its
         train/valid/test splits into one frame with a ``split`` column.
      2. Expand the ``Reaction`` mapping into ``catalyst``, ``reactant`` and
         ``product`` columns and drop the original column.
      3. Write the raw table to ``data_original.csv``, reload it, rename the
         columns, and write the result to ``data_clean.csv``.
      4. Write the dataset description to ``meta.yaml``.

    All three files are written relative to the current working directory.

    Returns:
        None.

    Raises:
        AssertionError: if the reloaded columns are not the expected ones or
            if the table contains fully duplicated rows.
    """
    # get raw data
    data = Yields(name="USPTO_Yields")
    splits = data.get_split()

    # tag every split with its name so the information survives the merge
    frames = []
    for split_name in ("train", "valid", "test"):
        df_split = splits[split_name]
        df_split["split"] = split_name
        frames.append(df_split)
    df = pd.concat(frames, axis=0)

    # each entry of the "Reaction" column is indexed by these three keys
    for part in ("catalyst", "reactant", "product"):
        df[part] = df.Reaction.apply(lambda x, key=part: x[key])
    df = df.drop("Reaction", axis=1)

    fn_data_original = "data_original.csv"
    df.to_csv(fn_data_original, index=False)
    del df

    # create dataframe
    df = pd.read_csv(
        fn_data_original, delimiter=","
    )  # not necessary but ensure we can load the saved data

    # check if fields are the same
    fields_orig = df.columns.tolist()
    assert fields_orig == [
        "Reaction_ID",
        "Y",
        "split",
        "catalyst",
        "reactant",
        "product",
    ]
    fields_clean = ["Reaction_ID", "yield", "split", "catalyst", "reactant", "product"]
    # overwrite column names = fields
    df.columns = fields_clean
    assert df.columns.tolist() == fields_clean

    # make sure the table contains no fully duplicated rows
    assert not df.duplicated().sum()

    # save to csv
    fn_data_csv = "data_clean.csv"
    df.to_csv(fn_data_csv, index=False)

    # create meta yaml
    meta = {
        "name": "uspto_500k",
        "description": "United States Patent and Trademark Office reaction dataset with yields.",
        "targets": [
            {
                "id": "yield",
                "description": "reaction yields",
                "units": "%",
                "type": "continuous",
                "names": [
                    "reaction yield",
                    "yield",
                ],
                "uris": [
                    "http://purl.allotrope.org/ontologies/quality#AFQ_0000227",
                ],
            },
        ],
        # NOTE(review): "reaction_SMILES" is not one of the columns written to
        # data_clean.csv (see fields_clean above) - confirm which column(s)
        # this identifier is meant to refer to.
        "identifiers": [
            {
                "id": "reaction_SMILES",
                "type": "RXNSMILES",
                "description": "reaction SMILES",
            },
        ],
        "license": "CC0",  # license under which the original dataset was published
        "links": [  # list of relevant links (original dataset, other uses, etc.)
            {
                "url": "https://doi.org/10.17863/CAM.16293",
                "description": "corresponding publication",
            },
            {
                "url": "https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv",
                "description": "data source",
            },
            {
                "url": "https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv",
                "description": "data source",
            },
            {
                "url": "https://tdcommons.ai/single_pred_tasks/yields/#uspto",
                "description": "other source",
            },
        ],
        "num_points": len(df),  # number of datapoints in this dataset
        # The entry is terminated with its own closing brace; without it the
        # "@article{" record is not valid BibTeX.
        "bibtex": [
            """@article{https://doi.org/10.17863/cam.16293,
doi = {10.17863/CAM.16293},
url = {https://www.repository.cam.ac.uk/handle/1810/244727},
year = {2012},
publisher = {Apollo - University of Cambridge Repository},
keywords = {Name to structure, OPSIN, Chemical text mining, Text mining,
Patent reaction extraction, Reaction mining, Patents},
language = {en},
author = {Lowe, Daniel Mark},
title = {Extraction of chemical structures and reactions from the literature},
copyright = {All Rights Reserved}
}""",
        ],
    }

    def str_presenter(dumper, data):
        """configures yaml for dumping multiline strings
        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
        """
        if data.count("\n") > 0:  # check for multiline string
            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
        return dumper.represent_scalar("tag:yaml.org,2002:str", data)

    # register the presenter for both the default and the safe dumper
    yaml.add_representer(str, str_presenter)
    yaml.representer.SafeRepresenter.add_representer(
        str, str_presenter
    )  # to use with safe_dump
    fn_meta = "meta.yaml"
    with open(fn_meta, "w", encoding="utf-8") as f:
        yaml.dump(meta, f, sort_keys=False)

    print(f"Finished processing {meta['name']} dataset!")


# Run the full download/clean/export pipeline only when this file is executed
# as a script, not when it is imported.
if __name__ == "__main__":
    get_and_transform_data()