Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add uspto data from drfp #95

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Open
46 changes: 46 additions & 0 deletions data/uspto_500k/meta.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
---
name: uspto_500k
description: United States Patent and Trademark Office reaction dataset with yields.
targets:
- id: yield
description: Reaction yields analyzed by UPLC
units: '%'
type: continuous
names:
- Reaction yield
- yield
uris:
- https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227
- https://en.wikipedia.org/wiki/Yield_(chemistry)
phalem marked this conversation as resolved.
Show resolved Hide resolved
benchmarks:
- name: TDC
link: https://tdcommons.ai/
split_column: split
MicPie marked this conversation as resolved.
Show resolved Hide resolved
identifiers:
- id: reaction_SMILES
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there is a new entry for that

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, great I see it. I will edit the file and PR it again.

type: SMILES
description: reaction SMILES
license: CC0
links:
- url: https://doi.org/10.17863/CAM.16293
description: corresponding publication
- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv
description: data source
- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv
description: data source
- url: https://tdcommons.ai/single_pred_tasks/yields/#uspto
description: other source
num_points: 498721
bibtex:
- |-
@article{https://doi.org/10.17863/cam.16293,
doi = {10.17863/CAM.16293},
url = {https://www.repository.cam.ac.uk/handle/1810/244727},
year = {2012},
publisher = {Apollo - University of Cambridge Repository},
keywords = {Name to structure, OPSIN, Chemical text mining, Text mining,
Patent reaction extraction, Reaction mining, Patents},
language = {en},
author = {Lowe, Daniel Mark},
title = {Extraction of chemical structures and reactions from the literature},
copyright = {All Rights Reserved}
130 changes: 130 additions & 0 deletions data/uspto_500k/transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import pandas as pd
import yaml
from tdc.single_pred import Tox


def get_and_transform_data():
# get raw data
df1 = pd.read_csv(
"https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_above.csv"
)
df2 = pd.read_csv(
"https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_below.csv"
)
data = pd.concat([df1, df2])
data = data[["rxn", "yield"]]
data = data.drop_duplicates(subset="rxn")
fn_data_original = "uptso.csv"
data.to_csv(fn_data_original, index=False)

# create dataframe
df = pd.read_csv(
fn_data_original, delimiter=","
) # not necessary but ensure we can load the saved data

# check if fields are the same
fields_orig = df.columns.tolist()
assert fields_orig == ["rxn", "yield"]
fields_clean = ["reaction_SMILES", "yield"]

# overwrite column names = fields
df.columns = fields_clean
assert fields_orig != fields_clean

# remove leading and trailing white space characters
assert not df.duplicated().sum()

# save to csv
fn_data_csv = "data_clean.csv"
df.to_csv(fn_data_csv, index=False)

# create meta yaml
meta = {
"name": "uspto_500k", # unique identifier, we will also use this for directory names
"description": """United States Patent and Trademark Office reaction dataset with yields.""",
"targets": [
{
"id": "yield", # name of the column in a tabular dataset
"description": "Reaction yields analyzed by UPLC", # description of what this column means
"units": "%", # units of the values in this column (leave empty if unitless)
"type": "continuous", # can be "categorical", "ordinal", "continuous"
"names": [ # names for the property (to sample from for building the prompts)
"Reaction yield",
"yield",
],
"uris": [
"https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227",
"https://en.wikipedia.org/wiki/Yield_(chemistry)",
],
},
],
"benchmarks": [
{
"name": "TDC", # unique benchmark name
"link": "https://tdcommons.ai/", # benchmark URL
"split_column": "split", # name of the column that contains the split information
},
],
"identifiers": [
{
"id": "reaction_SMILES", # column name
"type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other"
"description": "reaction SMILES", # description (optional, except for "Other")
},
],
"license": "CC0", # license under which the original dataset was published
"links": [ # list of relevant links (original dataset, other uses, etc.)
{
"url": "https://doi.org/10.17863/CAM.16293",
"description": "corresponding publication",
},
{
"url": "https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv",
"description": "data source",
},
{
"url": "https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv",
"description": "data source",
},
{
"url": "https://tdcommons.ai/single_pred_tasks/yields/#uspto",
"description": "other source",
},
],
"num_points": len(df), # number of datapoints in this dataset
"bibtex": [
"""@article{https://doi.org/10.17863/cam.16293,
doi = {10.17863/CAM.16293},
url = {https://www.repository.cam.ac.uk/handle/1810/244727},
year = {2012},
publisher = {Apollo - University of Cambridge Repository},
keywords = {Name to structure, OPSIN, Chemical text mining, Text mining,
Patent reaction extraction, Reaction mining, Patents},
language = {en},
author = {Lowe, Daniel Mark},
title = {Extraction of chemical structures and reactions from the literature},
copyright = {All Rights Reserved}""",
],
}

def str_presenter(dumper, data):
"""configures yaml for dumping multiline strings
Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
"""
if data.count("\n") > 0: # check for multiline string
return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
return dumper.represent_scalar("tag:yaml.org,2002:str", data)

yaml.add_representer(str, str_presenter)
yaml.representer.SafeRepresenter.add_representer(
str, str_presenter
) # to use with safe_dum
fn_meta = "meta.yaml"
with open(fn_meta, "w") as f:
yaml.dump(meta, f, sort_keys=False)

print(f"Finished processing {meta['name']} dataset!")


if __name__ == "__main__":
get_and_transform_data()