From 5c20bd914778b30f63b99a19c8392087ffef68ab Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Fri, 10 Mar 2023 19:34:09 +0200 Subject: [PATCH 01/13] Add uspto data from drfp --- data/USPTO_500k/meta.yaml | 40 ++++++++++++ data/USPTO_500k/transform.py | 122 +++++++++++++++++++++++++++++++++++ 2 files changed, 162 insertions(+) create mode 100644 data/USPTO_500k/meta.yaml create mode 100644 data/USPTO_500k/transform.py diff --git a/data/USPTO_500k/meta.yaml b/data/USPTO_500k/meta.yaml new file mode 100644 index 000000000..1c6010066 --- /dev/null +++ b/data/USPTO_500k/meta.yaml @@ -0,0 +1,40 @@ +name: USPTO_500k +description: United States Patent and Trademark Office reaction dataset with yields. +targets: +- id: yield + description: Reaction yields analyzed by UPLC + units: '%' + type: continuous + names: + - Reaction yield + - yield + uris: + - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227 + - https://en.wikipedia.org/wiki/Yield_(chemistry) +identifiers: +- id: reaction_SMILES + type: SMILES + description: reaction SMILES +license: CC0 +links: +- url: https://doi.org/10.17863/CAM.16293 + description: corresponding publication +- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv + description: data source +- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv + description: data source +- url: https://tdcommons.ai/single_pred_tasks/yields/#uspto + description: other source +num_points: 498721 +bibtex: +- |- + @article{https://doi.org/10.17863/cam.16293, + doi = {10.17863/CAM.16293}, + url = {https://www.repository.cam.ac.uk/handle/1810/244727}, + author = {Lowe, Daniel Mark}, + keywords = {Name to structure, OPSIN, Chemical text mining, Text mining, Patent reaction extraction, Reaction mining, Patents}, + language = {en}, + title = {Extraction of chemical structures 
and reactions from the literature}, + publisher = {Apollo - University of Cambridge Repository}, + year = {2012}, + copyright = {All Rights Reserved}} diff --git a/data/USPTO_500k/transform.py b/data/USPTO_500k/transform.py new file mode 100644 index 000000000..98e753076 --- /dev/null +++ b/data/USPTO_500k/transform.py @@ -0,0 +1,122 @@ +import pandas as pd +import yaml +from tdc.single_pred import Tox + + +def get_and_transform_data(): + # get raw data + df1 = pd.read_csv('https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_above.csv') + df2 = pd.read_csv('https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_below.csv') + data = pd.concat([df1,df2]) + data = data[['rxn','yield']] + data= data.drop_duplicates(subset='rxn') + fn_data_original = "uptso.csv" + data.to_csv(fn_data_original, index=False) + + # create dataframe + df = pd.read_csv(fn_data_original, + delimiter="," + )# not necessary but ensure we can load the saved data + + # check if fields are the same + fields_orig = df.columns.tolist() + assert fields_orig == ['rxn', 'yield'] + fields_clean = [ + "reaction_SMILES", + "yield" + ] + + # overwrite column names = fields + df.columns = fields_clean + assert fields_orig != fields_clean + + # remove leading and trailing white space characters + assert not df.duplicated().sum() + + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + + # create meta yaml + meta = { + "name": "USPTO_500k", # unique identifier, we will also use this for directory names + "description": """United States Patent and Trademark Office reaction dataset with yields.""", + "targets": [ + { + "id": "yield", # name of the column in a tabular dataset + "description": "Reaction yields analyzed by UPLC", # description of what this column means + "units": "%", # units of the values in this column (leave empty if unitless) + "type": "continuous", # can be "categorical", "ordinal", "continuous" + "names": [ # names for the property 
(to sample from for building the prompts) + "Reaction yield", + "yield", + ], + "uris":[ + "https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227", + "https://en.wikipedia.org/wiki/Yield_(chemistry)", + ], + }, + ], + "identifiers": [ + { + "id": "reaction_SMILES", # column name + "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "reaction SMILES", # description (optional, except for "Other") + }, + ], + "license": "CC0", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) + { + "url": "https://doi.org/10.17863/CAM.16293", + "description": "corresponding publication", + }, + { + "url": "https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv", + "description": "data source", + }, + { + "url": "https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv", + "description": "data source", + }, + { + "url": "https://tdcommons.ai/single_pred_tasks/yields/#uspto", + "description": "other source", + } + ], + "num_points": len(df), # number of datapoints in this dataset + "bibtex": [ + """@article{https://doi.org/10.17863/cam.16293, + doi = {10.17863/CAM.16293}, + url = {https://www.repository.cam.ac.uk/handle/1810/244727}, + author = {Lowe, Daniel Mark}, + keywords = {Name to structure, OPSIN, Chemical text mining, Text mining, Patent reaction extraction, Reaction mining, Patents}, + language = {en}, + title = {Extraction of chemical structures and reactions from the literature}, + publisher = {Apollo - University of Cambridge Repository}, + year = {2012}, + copyright = {All Rights Reserved}}""", + ], + } + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + if data.count("\n") > 0: # check for 
multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dum + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + + +if __name__ == "__main__": + get_and_transform_data() From 0303bf39134263ad581f65293d3984bbecfcff01 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Thu, 16 Mar 2023 16:28:00 +0200 Subject: [PATCH 02/13] Update data/USPTO_500k/meta.yaml Co-authored-by: Kevin M Jablonka <32935233+kjappelbaum@users.noreply.github.com> --- data/USPTO_500k/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/USPTO_500k/meta.yaml b/data/USPTO_500k/meta.yaml index 1c6010066..adbd80857 100644 --- a/data/USPTO_500k/meta.yaml +++ b/data/USPTO_500k/meta.yaml @@ -13,7 +13,7 @@ targets: - https://en.wikipedia.org/wiki/Yield_(chemistry) identifiers: - id: reaction_SMILES - type: SMILES + type: RXN-SMILES description: reaction SMILES license: CC0 links: From dfdf3fdccb07bf7b5433872b3bd4f61696f21ae3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 16 Mar 2023 14:28:15 +0000 Subject: [PATCH 03/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- data/USPTO_500k/meta.yaml | 65 ++++++++++++++++++------------------ data/USPTO_500k/transform.py | 44 ++++++++++++------------ 2 files changed, 55 insertions(+), 54 deletions(-) diff --git a/data/USPTO_500k/meta.yaml b/data/USPTO_500k/meta.yaml index adbd80857..06d9252c0 100644 --- a/data/USPTO_500k/meta.yaml +++ b/data/USPTO_500k/meta.yaml @@ -1,40 +1,41 @@ +--- name: USPTO_500k description: United States 
Patent and Trademark Office reaction dataset with yields. targets: -- id: yield - description: Reaction yields analyzed by UPLC - units: '%' - type: continuous - names: - - Reaction yield - - yield - uris: - - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227 - - https://en.wikipedia.org/wiki/Yield_(chemistry) + - id: yield + description: Reaction yields analyzed by UPLC + units: '%' + type: continuous + names: + - Reaction yield + - yield + uris: + - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227 + - https://en.wikipedia.org/wiki/Yield_(chemistry) identifiers: -- id: reaction_SMILES - type: RXN-SMILES - description: reaction SMILES + - id: reaction_SMILES + type: RXN-SMILES + description: reaction SMILES license: CC0 links: -- url: https://doi.org/10.17863/CAM.16293 - description: corresponding publication -- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv - description: data source -- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv - description: data source -- url: https://tdcommons.ai/single_pred_tasks/yields/#uspto - description: other source + - url: https://doi.org/10.17863/CAM.16293 + description: corresponding publication + - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv + description: data source + - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv + description: data source + - url: https://tdcommons.ai/single_pred_tasks/yields/#uspto + description: other source num_points: 498721 bibtex: -- |- - @article{https://doi.org/10.17863/cam.16293, - doi = {10.17863/CAM.16293}, - url = {https://www.repository.cam.ac.uk/handle/1810/244727}, - author = {Lowe, Daniel Mark}, - keywords = {Name to structure, OPSIN, Chemical text mining, Text mining, Patent 
reaction extraction, Reaction mining, Patents}, - language = {en}, - title = {Extraction of chemical structures and reactions from the literature}, - publisher = {Apollo - University of Cambridge Repository}, - year = {2012}, - copyright = {All Rights Reserved}} + - |- + @article{https://doi.org/10.17863/cam.16293, + doi = {10.17863/CAM.16293}, + url = {https://www.repository.cam.ac.uk/handle/1810/244727}, + author = {Lowe, Daniel Mark}, + keywords = {Name to structure, OPSIN, Chemical text mining, Text mining, Patent reaction extraction, Reaction mining, Patents}, + language = {en}, + title = {Extraction of chemical structures and reactions from the literature}, + publisher = {Apollo - University of Cambridge Repository}, + year = {2012}, + copyright = {All Rights Reserved}} diff --git a/data/USPTO_500k/transform.py b/data/USPTO_500k/transform.py index 98e753076..8596e43c6 100644 --- a/data/USPTO_500k/transform.py +++ b/data/USPTO_500k/transform.py @@ -5,41 +5,41 @@ def get_and_transform_data(): # get raw data - df1 = pd.read_csv('https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_above.csv') - df2 = pd.read_csv('https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_below.csv') - data = pd.concat([df1,df2]) - data = data[['rxn','yield']] - data= data.drop_duplicates(subset='rxn') + df1 = pd.read_csv( + "https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_above.csv" + ) + df2 = pd.read_csv( + "https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_below.csv" + ) + data = pd.concat([df1, df2]) + data = data[["rxn", "yield"]] + data = data.drop_duplicates(subset="rxn") fn_data_original = "uptso.csv" data.to_csv(fn_data_original, index=False) - + # create dataframe - df = pd.read_csv(fn_data_original, - delimiter="," - )# not necessary but ensure we can load the saved data - + df = pd.read_csv( + fn_data_original, delimiter="," + ) # not necessary but ensure we can load the saved data + # check if fields are the same 
fields_orig = df.columns.tolist() - assert fields_orig == ['rxn', 'yield'] - fields_clean = [ - "reaction_SMILES", - "yield" - ] - + assert fields_orig == ["rxn", "yield"] + fields_clean = ["reaction_SMILES", "yield"] + # overwrite column names = fields df.columns = fields_clean assert fields_orig != fields_clean - + # remove leading and trailing white space characters assert not df.duplicated().sum() - + # save to csv fn_data_csv = "data_clean.csv" df.to_csv(fn_data_csv, index=False) - # create meta yaml - meta = { + meta = { "name": "USPTO_500k", # unique identifier, we will also use this for directory names "description": """United States Patent and Trademark Office reaction dataset with yields.""", "targets": [ @@ -52,7 +52,7 @@ def get_and_transform_data(): "Reaction yield", "yield", ], - "uris":[ + "uris": [ "https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227", "https://en.wikipedia.org/wiki/Yield_(chemistry)", ], @@ -82,7 +82,7 @@ def get_and_transform_data(): { "url": "https://tdcommons.ai/single_pred_tasks/yields/#uspto", "description": "other source", - } + }, ], "num_points": len(df), # number of datapoints in this dataset "bibtex": [ From f5bf9109169ab16a7775b74b45c804a491fb5cb6 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Sat, 25 Mar 2023 00:26:08 +0200 Subject: [PATCH 04/13] Add files via upload --- data/uspto_500k/meta.yaml | 45 ++++++++++++ data/uspto_500k/transform.py | 130 +++++++++++++++++++++++++++++++++++ 2 files changed, 175 insertions(+) create mode 100644 data/uspto_500k/meta.yaml create mode 100644 data/uspto_500k/transform.py diff --git a/data/uspto_500k/meta.yaml b/data/uspto_500k/meta.yaml new file mode 100644 index 000000000..0270ec866 --- /dev/null +++ b/data/uspto_500k/meta.yaml @@ -0,0 +1,45 @@ +name: uspto_500k +description: United States Patent and Trademark Office reaction dataset with 
yields. +targets: +- id: yield + description: Reaction yields analyzed by UPLC + units: '%' + type: continuous + names: + - Reaction yield + - yield + uris: + - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227 + - https://en.wikipedia.org/wiki/Yield_(chemistry) +benchmarks: +- name: TDC + link: https://tdcommons.ai/ + split_column: split +identifiers: +- id: reaction_SMILES + type: SMILES + description: reaction SMILES +license: CC0 +links: +- url: https://doi.org/10.17863/CAM.16293 + description: corresponding publication +- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv + description: data source +- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv + description: data source +- url: https://tdcommons.ai/single_pred_tasks/yields/#uspto + description: other source +num_points: 498721 +bibtex: +- |- + @article{https://doi.org/10.17863/cam.16293, + doi = {10.17863/CAM.16293}, + url = {https://www.repository.cam.ac.uk/handle/1810/244727}, + year = {2012}, + publisher = {Apollo - University of Cambridge Repository}, + keywords = {Name to structure, OPSIN, Chemical text mining, Text mining, + Patent reaction extraction, Reaction mining, Patents}, + language = {en}, + author = {Lowe, Daniel Mark}, + title = {Extraction of chemical structures and reactions from the literature}, + copyright = {All Rights Reserved} diff --git a/data/uspto_500k/transform.py b/data/uspto_500k/transform.py new file mode 100644 index 000000000..0a322c1fd --- /dev/null +++ b/data/uspto_500k/transform.py @@ -0,0 +1,130 @@ +import pandas as pd +import yaml +from tdc.single_pred import Tox + + +def get_and_transform_data(): + # get raw data + df1 = pd.read_csv( + "https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_above.csv" + ) + df2 = pd.read_csv( + "https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_below.csv" + ) + 
data = pd.concat([df1, df2]) + data = data[["rxn", "yield"]] + data = data.drop_duplicates(subset="rxn") + fn_data_original = "uptso.csv" + data.to_csv(fn_data_original, index=False) + + # create dataframe + df = pd.read_csv( + fn_data_original, delimiter="," + ) # not necessary but ensure we can load the saved data + + # check if fields are the same + fields_orig = df.columns.tolist() + assert fields_orig == ["rxn", "yield"] + fields_clean = ["reaction_SMILES", "yield"] + + # overwrite column names = fields + df.columns = fields_clean + assert fields_orig != fields_clean + + # remove leading and trailing white space characters + assert not df.duplicated().sum() + + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + # create meta yaml + meta = { + "name": "uspto_500k", # unique identifier, we will also use this for directory names + "description": """United States Patent and Trademark Office reaction dataset with yields.""", + "targets": [ + { + "id": "yield", # name of the column in a tabular dataset + "description": "Reaction yields analyzed by UPLC", # description of what this column means + "units": "%", # units of the values in this column (leave empty if unitless) + "type": "continuous", # can be "categorical", "ordinal", "continuous" + "names": [ # names for the property (to sample from for building the prompts) + "Reaction yield", + "yield", + ], + "uris": [ + "https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227", + "https://en.wikipedia.org/wiki/Yield_(chemistry)", + ], + }, + ], + "benchmarks": [ + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, + ], + "identifiers": [ + { + "id": "reaction_SMILES", # column name + "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": 
"reaction SMILES", # description (optional, except for "Other") + }, + ], + "license": "CC0", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) + { + "url": "https://doi.org/10.17863/CAM.16293", + "description": "corresponding publication", + }, + { + "url": "https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv", + "description": "data source", + }, + { + "url": "https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv", + "description": "data source", + }, + { + "url": "https://tdcommons.ai/single_pred_tasks/yields/#uspto", + "description": "other source", + }, + ], + "num_points": len(df), # number of datapoints in this dataset + "bibtex": [ + """@article{https://doi.org/10.17863/cam.16293, +doi = {10.17863/CAM.16293}, +url = {https://www.repository.cam.ac.uk/handle/1810/244727}, +year = {2012}, +publisher = {Apollo - University of Cambridge Repository}, +keywords = {Name to structure, OPSIN, Chemical text mining, Text mining, +Patent reaction extraction, Reaction mining, Patents}, +language = {en}, +author = {Lowe, Daniel Mark}, +title = {Extraction of chemical structures and reactions from the literature}, +copyright = {All Rights Reserved}""", + ], + } + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dum + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + + 
+if __name__ == "__main__": + get_and_transform_data() From 69af75c2fc1ab2137e84c708fc9b3f7b4b1f2487 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Mar 2023 22:26:16 +0000 Subject: [PATCH 05/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- data/uspto_500k/meta.yaml | 73 ++++++++++++++++++------------------ data/uspto_500k/transform.py | 10 ++--- 2 files changed, 42 insertions(+), 41 deletions(-) diff --git a/data/uspto_500k/meta.yaml b/data/uspto_500k/meta.yaml index 0270ec866..59fd50cf2 100644 --- a/data/uspto_500k/meta.yaml +++ b/data/uspto_500k/meta.yaml @@ -1,45 +1,46 @@ +--- name: uspto_500k description: United States Patent and Trademark Office reaction dataset with yields. targets: -- id: yield - description: Reaction yields analyzed by UPLC - units: '%' - type: continuous - names: - - Reaction yield - - yield - uris: - - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227 - - https://en.wikipedia.org/wiki/Yield_(chemistry) + - id: yield + description: Reaction yields analyzed by UPLC + units: '%' + type: continuous + names: + - Reaction yield + - yield + uris: + - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227 + - https://en.wikipedia.org/wiki/Yield_(chemistry) benchmarks: -- name: TDC - link: https://tdcommons.ai/ - split_column: split + - name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: -- id: reaction_SMILES - type: SMILES - description: reaction SMILES + - id: reaction_SMILES + type: SMILES + description: reaction SMILES license: CC0 links: -- url: https://doi.org/10.17863/CAM.16293 - description: corresponding publication -- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv - description: 
data source -- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv - description: data source -- url: https://tdcommons.ai/single_pred_tasks/yields/#uspto - description: other source + - url: https://doi.org/10.17863/CAM.16293 + description: corresponding publication + - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv + description: data source + - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv + description: data source + - url: https://tdcommons.ai/single_pred_tasks/yields/#uspto + description: other source num_points: 498721 bibtex: -- |- - @article{https://doi.org/10.17863/cam.16293, - doi = {10.17863/CAM.16293}, - url = {https://www.repository.cam.ac.uk/handle/1810/244727}, - year = {2012}, - publisher = {Apollo - University of Cambridge Repository}, - keywords = {Name to structure, OPSIN, Chemical text mining, Text mining, - Patent reaction extraction, Reaction mining, Patents}, - language = {en}, - author = {Lowe, Daniel Mark}, - title = {Extraction of chemical structures and reactions from the literature}, - copyright = {All Rights Reserved} + - |- + @article{https://doi.org/10.17863/cam.16293, + doi = {10.17863/CAM.16293}, + url = {https://www.repository.cam.ac.uk/handle/1810/244727}, + year = {2012}, + publisher = {Apollo - University of Cambridge Repository}, + keywords = {Name to structure, OPSIN, Chemical text mining, Text mining, + Patent reaction extraction, Reaction mining, Patents}, + language = {en}, + author = {Lowe, Daniel Mark}, + title = {Extraction of chemical structures and reactions from the literature}, + copyright = {All Rights Reserved} diff --git a/data/uspto_500k/transform.py b/data/uspto_500k/transform.py index 0a322c1fd..6d407d93e 100644 --- a/data/uspto_500k/transform.py +++ b/data/uspto_500k/transform.py @@ -59,11 +59,11 @@ def get_and_transform_data(): }, ], "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": 
"https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, ], "identifiers": [ { From 8ec54afba8e3c521aacdae0416f65b77de95cd4d Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Sat, 25 Mar 2023 00:27:07 +0200 Subject: [PATCH 06/13] Delete data/USPTO_500k directory --- data/USPTO_500k/meta.yaml | 41 ------------ data/USPTO_500k/transform.py | 122 ----------------------------------- 2 files changed, 163 deletions(-) delete mode 100644 data/USPTO_500k/meta.yaml delete mode 100644 data/USPTO_500k/transform.py diff --git a/data/USPTO_500k/meta.yaml b/data/USPTO_500k/meta.yaml deleted file mode 100644 index 06d9252c0..000000000 --- a/data/USPTO_500k/meta.yaml +++ /dev/null @@ -1,41 +0,0 @@ ---- -name: USPTO_500k -description: United States Patent and Trademark Office reaction dataset with yields. 
-targets: - - id: yield - description: Reaction yields analyzed by UPLC - units: '%' - type: continuous - names: - - Reaction yield - - yield - uris: - - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227 - - https://en.wikipedia.org/wiki/Yield_(chemistry) -identifiers: - - id: reaction_SMILES - type: RXN-SMILES - description: reaction SMILES -license: CC0 -links: - - url: https://doi.org/10.17863/CAM.16293 - description: corresponding publication - - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv - description: data source - - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv - description: data source - - url: https://tdcommons.ai/single_pred_tasks/yields/#uspto - description: other source -num_points: 498721 -bibtex: - - |- - @article{https://doi.org/10.17863/cam.16293, - doi = {10.17863/CAM.16293}, - url = {https://www.repository.cam.ac.uk/handle/1810/244727}, - author = {Lowe, Daniel Mark}, - keywords = {Name to structure, OPSIN, Chemical text mining, Text mining, Patent reaction extraction, Reaction mining, Patents}, - language = {en}, - title = {Extraction of chemical structures and reactions from the literature}, - publisher = {Apollo - University of Cambridge Repository}, - year = {2012}, - copyright = {All Rights Reserved}} diff --git a/data/USPTO_500k/transform.py b/data/USPTO_500k/transform.py deleted file mode 100644 index 8596e43c6..000000000 --- a/data/USPTO_500k/transform.py +++ /dev/null @@ -1,122 +0,0 @@ -import pandas as pd -import yaml -from tdc.single_pred import Tox - - -def get_and_transform_data(): - # get raw data - df1 = pd.read_csv( - "https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_above.csv" - ) - df2 = pd.read_csv( - "https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_below.csv" - ) - data = pd.concat([df1, df2]) - data = data[["rxn", "yield"]] - data = 
data.drop_duplicates(subset="rxn") - fn_data_original = "uptso.csv" - data.to_csv(fn_data_original, index=False) - - # create dataframe - df = pd.read_csv( - fn_data_original, delimiter="," - ) # not necessary but ensure we can load the saved data - - # check if fields are the same - fields_orig = df.columns.tolist() - assert fields_orig == ["rxn", "yield"] - fields_clean = ["reaction_SMILES", "yield"] - - # overwrite column names = fields - df.columns = fields_clean - assert fields_orig != fields_clean - - # remove leading and trailing white space characters - assert not df.duplicated().sum() - - # save to csv - fn_data_csv = "data_clean.csv" - df.to_csv(fn_data_csv, index=False) - - # create meta yaml - meta = { - "name": "USPTO_500k", # unique identifier, we will also use this for directory names - "description": """United States Patent and Trademark Office reaction dataset with yields.""", - "targets": [ - { - "id": "yield", # name of the column in a tabular dataset - "description": "Reaction yields analyzed by UPLC", # description of what this column means - "units": "%", # units of the values in this column (leave empty if unitless) - "type": "continuous", # can be "categorical", "ordinal", "continuous" - "names": [ # names for the property (to sample from for building the prompts) - "Reaction yield", - "yield", - ], - "uris": [ - "https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227", - "https://en.wikipedia.org/wiki/Yield_(chemistry)", - ], - }, - ], - "identifiers": [ - { - "id": "reaction_SMILES", # column name - "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "reaction SMILES", # description (optional, except for "Other") - }, - ], - "license": "CC0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) 
- { - "url": "https://doi.org/10.17863/CAM.16293", - "description": "corresponding publication", - }, - { - "url": "https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv", - "description": "data source", - }, - { - "url": "https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv", - "description": "data source", - }, - { - "url": "https://tdcommons.ai/single_pred_tasks/yields/#uspto", - "description": "other source", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "bibtex": [ - """@article{https://doi.org/10.17863/cam.16293, - doi = {10.17863/CAM.16293}, - url = {https://www.repository.cam.ac.uk/handle/1810/244727}, - author = {Lowe, Daniel Mark}, - keywords = {Name to structure, OPSIN, Chemical text mining, Text mining, Patent reaction extraction, Reaction mining, Patents}, - language = {en}, - title = {Extraction of chemical structures and reactions from the literature}, - publisher = {Apollo - University of Cambridge Repository}, - year = {2012}, - copyright = {All Rights Reserved}}""", - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - - -if __name__ == "__main__": - get_and_transform_data() From 5c120787ed1ccf7b2c7235f5c0c07f48b1d6c152 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Sat, 25 Mar 
2023 00:40:10 +0200 Subject: [PATCH 07/13] Add files via upload --- data/uspto_500k/meta.yaml | 73 ++++++++++++++++++------------------ data/uspto_500k/transform.py | 10 ++--- 2 files changed, 41 insertions(+), 42 deletions(-) diff --git a/data/uspto_500k/meta.yaml b/data/uspto_500k/meta.yaml index 59fd50cf2..0270ec866 100644 --- a/data/uspto_500k/meta.yaml +++ b/data/uspto_500k/meta.yaml @@ -1,46 +1,45 @@ ---- name: uspto_500k description: United States Patent and Trademark Office reaction dataset with yields. targets: - - id: yield - description: Reaction yields analyzed by UPLC - units: '%' - type: continuous - names: - - Reaction yield - - yield - uris: - - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227 - - https://en.wikipedia.org/wiki/Yield_(chemistry) +- id: yield + description: Reaction yields analyzed by UPLC + units: '%' + type: continuous + names: + - Reaction yield + - yield + uris: + - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227 + - https://en.wikipedia.org/wiki/Yield_(chemistry) benchmarks: - - name: TDC - link: https://tdcommons.ai/ - split_column: split +- name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: - - id: reaction_SMILES - type: SMILES - description: reaction SMILES +- id: reaction_SMILES + type: SMILES + description: reaction SMILES license: CC0 links: - - url: https://doi.org/10.17863/CAM.16293 - description: corresponding publication - - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv - description: data source - - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv - description: data source - - url: https://tdcommons.ai/single_pred_tasks/yields/#uspto - description: other source +- url: https://doi.org/10.17863/CAM.16293 + description: corresponding publication +- url: 
https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv + description: data source +- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv + description: data source +- url: https://tdcommons.ai/single_pred_tasks/yields/#uspto + description: other source num_points: 498721 bibtex: - - |- - @article{https://doi.org/10.17863/cam.16293, - doi = {10.17863/CAM.16293}, - url = {https://www.repository.cam.ac.uk/handle/1810/244727}, - year = {2012}, - publisher = {Apollo - University of Cambridge Repository}, - keywords = {Name to structure, OPSIN, Chemical text mining, Text mining, - Patent reaction extraction, Reaction mining, Patents}, - language = {en}, - author = {Lowe, Daniel Mark}, - title = {Extraction of chemical structures and reactions from the literature}, - copyright = {All Rights Reserved} +- |- + @article{https://doi.org/10.17863/cam.16293, + doi = {10.17863/CAM.16293}, + url = {https://www.repository.cam.ac.uk/handle/1810/244727}, + year = {2012}, + publisher = {Apollo - University of Cambridge Repository}, + keywords = {Name to structure, OPSIN, Chemical text mining, Text mining, + Patent reaction extraction, Reaction mining, Patents}, + language = {en}, + author = {Lowe, Daniel Mark}, + title = {Extraction of chemical structures and reactions from the literature}, + copyright = {All Rights Reserved} diff --git a/data/uspto_500k/transform.py b/data/uspto_500k/transform.py index 6d407d93e..0a322c1fd 100644 --- a/data/uspto_500k/transform.py +++ b/data/uspto_500k/transform.py @@ -59,11 +59,11 @@ def get_and_transform_data(): }, ], "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, 
], "identifiers": [ { From 5716744d03a0e8a81cce9e91c3304f0c5debd187 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Mar 2023 22:41:09 +0000 Subject: [PATCH 08/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- data/uspto_500k/meta.yaml | 73 ++++++++++++++++++------------------ data/uspto_500k/transform.py | 10 ++--- 2 files changed, 42 insertions(+), 41 deletions(-) diff --git a/data/uspto_500k/meta.yaml b/data/uspto_500k/meta.yaml index 0270ec866..59fd50cf2 100644 --- a/data/uspto_500k/meta.yaml +++ b/data/uspto_500k/meta.yaml @@ -1,45 +1,46 @@ +--- name: uspto_500k description: United States Patent and Trademark Office reaction dataset with yields. targets: -- id: yield - description: Reaction yields analyzed by UPLC - units: '%' - type: continuous - names: - - Reaction yield - - yield - uris: - - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227 - - https://en.wikipedia.org/wiki/Yield_(chemistry) + - id: yield + description: Reaction yields analyzed by UPLC + units: '%' + type: continuous + names: + - Reaction yield + - yield + uris: + - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227 + - https://en.wikipedia.org/wiki/Yield_(chemistry) benchmarks: -- name: TDC - link: https://tdcommons.ai/ - split_column: split + - name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: -- id: reaction_SMILES - type: SMILES - description: reaction SMILES + - id: reaction_SMILES + type: SMILES + description: reaction SMILES license: CC0 links: -- url: https://doi.org/10.17863/CAM.16293 - description: corresponding publication -- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv - description: data source -- url: 
https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv - description: data source -- url: https://tdcommons.ai/single_pred_tasks/yields/#uspto - description: other source + - url: https://doi.org/10.17863/CAM.16293 + description: corresponding publication + - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv + description: data source + - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv + description: data source + - url: https://tdcommons.ai/single_pred_tasks/yields/#uspto + description: other source num_points: 498721 bibtex: -- |- - @article{https://doi.org/10.17863/cam.16293, - doi = {10.17863/CAM.16293}, - url = {https://www.repository.cam.ac.uk/handle/1810/244727}, - year = {2012}, - publisher = {Apollo - University of Cambridge Repository}, - keywords = {Name to structure, OPSIN, Chemical text mining, Text mining, - Patent reaction extraction, Reaction mining, Patents}, - language = {en}, - author = {Lowe, Daniel Mark}, - title = {Extraction of chemical structures and reactions from the literature}, - copyright = {All Rights Reserved} + - |- + @article{https://doi.org/10.17863/cam.16293, + doi = {10.17863/CAM.16293}, + url = {https://www.repository.cam.ac.uk/handle/1810/244727}, + year = {2012}, + publisher = {Apollo - University of Cambridge Repository}, + keywords = {Name to structure, OPSIN, Chemical text mining, Text mining, + Patent reaction extraction, Reaction mining, Patents}, + language = {en}, + author = {Lowe, Daniel Mark}, + title = {Extraction of chemical structures and reactions from the literature}, + copyright = {All Rights Reserved} diff --git a/data/uspto_500k/transform.py b/data/uspto_500k/transform.py index 0a322c1fd..6d407d93e 100644 --- a/data/uspto_500k/transform.py +++ b/data/uspto_500k/transform.py @@ -59,11 +59,11 @@ def get_and_transform_data(): }, ], "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # 
benchmark URL - "split_column": "split", # name of the column that contains the split information - }, + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, ], "identifiers": [ { From b00d2280747bb9f0d4dee92772ebaf3b39567be5 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Tue, 28 Mar 2023 20:43:14 +0200 Subject: [PATCH 09/13] Update data/uspto_500k/meta.yaml Co-authored-by: Kevin M Jablonka <32935233+kjappelbaum@users.noreply.github.com> --- data/uspto_500k/meta.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/data/uspto_500k/meta.yaml b/data/uspto_500k/meta.yaml index 59fd50cf2..fa04c3600 100644 --- a/data/uspto_500k/meta.yaml +++ b/data/uspto_500k/meta.yaml @@ -11,7 +11,6 @@ targets: - yield uris: - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227 - - https://en.wikipedia.org/wiki/Yield_(chemistry) benchmarks: - name: TDC link: https://tdcommons.ai/ From f231ca9e4418b784b4f920170b12850c5a76f844 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Wed, 29 Mar 2023 01:41:41 +0200 Subject: [PATCH 10/13] Remove Benchmark field I will add benchmark field on TDC version USPTO --- data/uspto_500k/meta.yaml | 70 +++++++++++++++++------------------- data/uspto_500k/transform.py | 25 +++++-------- 2 files changed, 42 insertions(+), 53 deletions(-) diff --git a/data/uspto_500k/meta.yaml b/data/uspto_500k/meta.yaml index fa04c3600..96bf0717e 100644 --- a/data/uspto_500k/meta.yaml +++ b/data/uspto_500k/meta.yaml @@ -1,45 +1,41 @@ ---- name: uspto_500k description: United States Patent and Trademark Office reaction dataset with yields.
targets: - - id: yield - description: Reaction yields analyzed by UPLC - units: '%' - type: continuous - names: - - Reaction yield - - yield - uris: - - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227 -benchmarks: - - name: TDC - link: https://tdcommons.ai/ - split_column: split +- id: yield + description: Reaction yields analyzed by UPLC + units: '%' + type: continuous + names: + - Reaction yield + - yield + uris: + - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227 + - https://en.wikipedia.org/wiki/Yield_(chemistry) identifiers: - - id: reaction_SMILES - type: SMILES - description: reaction SMILES +- id: reaction_SMILES + type: RXNSMILES + description: reaction SMILES license: CC0 links: - - url: https://doi.org/10.17863/CAM.16293 - description: corresponding publication - - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv - description: data source - - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv - description: data source - - url: https://tdcommons.ai/single_pred_tasks/yields/#uspto - description: other source +- url: https://doi.org/10.17863/CAM.16293 + description: corresponding publication +- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv + description: data source +- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv + description: data source +- url: https://tdcommons.ai/single_pred_tasks/yields/#uspto + description: other source num_points: 498721 bibtex: - - |- - @article{https://doi.org/10.17863/cam.16293, - doi = {10.17863/CAM.16293}, - url = {https://www.repository.cam.ac.uk/handle/1810/244727}, - year = {2012}, - publisher = {Apollo - University of Cambridge Repository}, - keywords = {Name to structure, OPSIN, Chemical text mining, Text mining, 
- Patent reaction extraction, Reaction mining, Patents}, - language = {en}, - author = {Lowe, Daniel Mark}, - title = {Extraction of chemical structures and reactions from the literature}, - copyright = {All Rights Reserved} +- |- + @article{https://doi.org/10.17863/cam.16293, + doi = {10.17863/CAM.16293}, + url = {https://www.repository.cam.ac.uk/handle/1810/244727}, + year = {2012}, + publisher = {Apollo - University of Cambridge Repository}, + keywords = {Name to structure, OPSIN, Chemical text mining, Text mining, + Patent reaction extraction, Reaction mining, Patents}, + language = {en}, + author = {Lowe, Daniel Mark}, + title = {Extraction of chemical structures and reactions from the literature}, + copyright = {All Rights Reserved} diff --git a/data/uspto_500k/transform.py b/data/uspto_500k/transform.py index 6d407d93e..308817a7a 100644 --- a/data/uspto_500k/transform.py +++ b/data/uspto_500k/transform.py @@ -40,15 +40,15 @@ def get_and_transform_data(): # create meta yaml meta = { - "name": "uspto_500k", # unique identifier, we will also use this for directory names + "name": "uspto_500k", "description": """United States Patent and Trademark Office reaction dataset with yields.""", "targets": [ { - "id": "yield", # name of the column in a tabular dataset - "description": "Reaction yields analyzed by UPLC", # description of what this column means - "units": "%", # units of the values in this column (leave empty if unitless) - "type": "continuous", # can be "categorical", "ordinal", "continuous" - "names": [ # names for the property (to sample from for building the prompts) + "id": "yield", + "description": "Reaction yields analyzed by UPLC", + "units": "%", + "type": "continuous", + "names": [ "Reaction yield", "yield", ], @@ -58,18 +58,11 @@ def get_and_transform_data(): ], }, ], - "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the 
split information - }, - ], "identifiers": [ { - "id": "reaction_SMILES", # column name - "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "reaction SMILES", # description (optional, except for "Other") + "id": "reaction_SMILES", + "type": "RXNSMILES", + "description": "reaction SMILES", }, ], "license": "CC0", # license under which the original dataset was published From 0bdd4aa1764b755558e5c7b86752c422e9204ab1 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Wed, 29 Mar 2023 01:42:21 +0200 Subject: [PATCH 11/13] Remove benchmark field I will add benchmark field on TDC version USPTO From d55bfd95f4aa23a0646b39dab7d7f69df4d46a84 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Wed, 29 Mar 2023 01:42:42 +0200 Subject: [PATCH 12/13] Remove benchmark field I will add benchmark field on TDC version USPTO From 8cf5f1a15e40d543bb39a40eb28baa346554b0af Mon Sep 17 00:00:00 2001 From: Michael Pieler Date: Fri, 14 Apr 2023 17:14:26 +0200 Subject: [PATCH 13/13] feat: uspto_500k clean up --- data/uspto_500k/meta.yaml | 68 ++++++++++++++++++------------------ data/uspto_500k/transform.py | 68 +++++++++++++++++++++--------------- 2 files changed, 74 insertions(+), 62 deletions(-) diff --git a/data/uspto_500k/meta.yaml b/data/uspto_500k/meta.yaml index 96bf0717e..0d1cfb0f1 100644 --- a/data/uspto_500k/meta.yaml +++ b/data/uspto_500k/meta.yaml @@ -1,41 +1,41 @@ +--- name: uspto_500k description: United States Patent and Trademark Office reaction dataset with yields.
targets: -- id: yield - description: Reaction yields analyzed by UPLC - units: '%' - type: continuous - names: - - Reaction yield - - yield - uris: - - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227 - - https://en.wikipedia.org/wiki/Yield_(chemistry) + - id: yield + description: reaction yields + units: '%' + type: continuous + names: + - reaction yield + - yield + uris: + - http://purl.allotrope.org/ontologies/quality#AFQ_0000227 identifiers: -- id: reaction_SMILES - type: RXNSMILES - description: reaction SMILES + - id: reaction_SMILES + type: RXNSMILES + description: reaction SMILES license: CC0 links: -- url: https://doi.org/10.17863/CAM.16293 - description: corresponding publication -- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv - description: data source -- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv - description: data source -- url: https://tdcommons.ai/single_pred_tasks/yields/#uspto - description: other source -num_points: 498721 + - url: https://doi.org/10.17863/CAM.16293 + description: corresponding publication + - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv + description: data source + - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv + description: data source + - url: https://tdcommons.ai/single_pred_tasks/yields/#uspto + description: other source +num_points: 853638 bibtex: -- |- - @article{https://doi.org/10.17863/cam.16293, - doi = {10.17863/CAM.16293}, - url = {https://www.repository.cam.ac.uk/handle/1810/244727}, - year = {2012}, - publisher = {Apollo - University of Cambridge Repository}, - keywords = {Name to structure, OPSIN, Chemical text mining, Text mining, - Patent reaction extraction, Reaction mining, Patents}, - language = {en}, - author = {Lowe, Daniel Mark}, - title = {Extraction of chemical structures 
and reactions from the literature}, - copyright = {All Rights Reserved} + - |- + @article{https://doi.org/10.17863/cam.16293, + doi = {10.17863/CAM.16293}, + url = {https://www.repository.cam.ac.uk/handle/1810/244727}, + year = {2012}, + publisher = {Apollo - University of Cambridge Repository}, + keywords = {Name to structure, OPSIN, Chemical text mining, Text mining, + Patent reaction extraction, Reaction mining, Patents}, + language = {en}, + author = {Lowe, Daniel Mark}, + title = {Extraction of chemical structures and reactions from the literature}, + copyright = {All Rights Reserved} diff --git a/data/uspto_500k/transform.py b/data/uspto_500k/transform.py index 308817a7a..f85e4a13d 100644 --- a/data/uspto_500k/transform.py +++ b/data/uspto_500k/transform.py @@ -1,21 +1,28 @@ import pandas as pd import yaml -from tdc.single_pred import Tox +from tdc.single_pred import Yields def get_and_transform_data(): # get raw data - df1 = pd.read_csv( - "https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_above.csv" - ) - df2 = pd.read_csv( - "https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_below.csv" - ) - data = pd.concat([df1, df2]) - data = data[["rxn", "yield"]] - data = data.drop_duplicates(subset="rxn") - fn_data_original = "uptso.csv" - data.to_csv(fn_data_original, index=False) + data = Yields(name="USPTO_Yields") + splits = data.get_split() + df_train = splits["train"] + df_valid = splits["valid"] + df_test = splits["test"] + df_train["split"] = "train" + df_valid["split"] = "valid" + df_test["split"] = "test" + df = pd.concat([df_train, df_valid, df_test], axis=0) + + df["catalyst"] = df.Reaction.apply(lambda x: x["catalyst"]) + df["reactant"] = df.Reaction.apply(lambda x: x["reactant"]) + df["product"] = df.Reaction.apply(lambda x: x["product"]) + df = df.drop("Reaction", axis=1) + + fn_data_original = "data_original.csv" + df.to_csv(fn_data_original, index=False) + del df # create dataframe df = pd.read_csv( @@ -24,12 +31,18 @@ 
def get_and_transform_data(): # check if fields are the same fields_orig = df.columns.tolist() - assert fields_orig == ["rxn", "yield"] - fields_clean = ["reaction_SMILES", "yield"] - + assert fields_orig == [ + "Reaction_ID", + "Y", + "split", + "catalyst", + "reactant", + "product", + ] + fields_clean = ["Reaction_ID", "yield", "split", "catalyst", "reactant", "product"] # overwrite column names = fields df.columns = fields_clean - assert fields_orig != fields_clean + assert df.columns.tolist() == fields_clean # remove leading and trailing white space characters assert not df.duplicated().sum() @@ -40,29 +53,28 @@ def get_and_transform_data(): # create meta yaml meta = { - "name": "uspto_500k", + "name": "uspto_500k", "description": """United States Patent and Trademark Office reaction dataset with yields.""", "targets": [ { - "id": "yield", - "description": "Reaction yields analyzed by UPLC", - "units": "%", - "type": "continuous", - "names": [ - "Reaction yield", + "id": "yield", + "description": "reaction yields", + "units": "%", + "type": "continuous", + "names": [ + "reaction yield", "yield", ], "uris": [ - "https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227", - "https://en.wikipedia.org/wiki/Yield_(chemistry)", + "http://purl.allotrope.org/ontologies/quality#AFQ_0000227", ], }, ], "identifiers": [ { - "id": "reaction_SMILES", - "type": "RXNSMILES", - "description": "reaction SMILES", + "id": "reaction_SMILES", + "type": "RXNSMILES", + "description": "reaction SMILES", }, ], "license": "CC0", # license under which the original dataset was published