From 5c20bd914778b30f63b99a19c8392087ffef68ab Mon Sep 17 00:00:00 2001
From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com>
Date: Fri, 10 Mar 2023 19:34:09 +0200
Subject: [PATCH 01/13] Add uspto data from drfp

---
 data/USPTO_500k/meta.yaml    |  40 ++++++++++++
 data/USPTO_500k/transform.py | 122 +++++++++++++++++++++++++++++++++++
 2 files changed, 162 insertions(+)
 create mode 100644 data/USPTO_500k/meta.yaml
 create mode 100644 data/USPTO_500k/transform.py

diff --git a/data/USPTO_500k/meta.yaml b/data/USPTO_500k/meta.yaml
new file mode 100644
index 000000000..1c6010066
--- /dev/null
+++ b/data/USPTO_500k/meta.yaml
@@ -0,0 +1,40 @@
+name: USPTO_500k
+description: United States Patent and Trademark Office reaction dataset with yields.
+targets:
+- id: yield
+  description: Reaction yields analyzed by UPLC
+  units: '%'
+  type: continuous
+  names:
+  - Reaction yield
+  - yield
+  uris:
+  - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227
+  - https://en.wikipedia.org/wiki/Yield_(chemistry)
+identifiers:
+- id: reaction_SMILES
+  type: SMILES
+  description: reaction SMILES
+license: CC0
+links:
+- url: https://doi.org/10.17863/CAM.16293
+  description: corresponding publication
+- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv
+  description: data source
+- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv
+  description: data source
+- url: https://tdcommons.ai/single_pred_tasks/yields/#uspto
+  description: other source
+num_points: 498721
+bibtex:
+- |-
+  @article{https://doi.org/10.17863/cam.16293,
+                doi = {10.17863/CAM.16293},
+                url = {https://www.repository.cam.ac.uk/handle/1810/244727},
+                author = {Lowe,  Daniel Mark},
+                keywords = {Name to structure,  OPSIN,  Chemical text mining,  Text mining,  Patent reaction extraction,  Reaction mining,  Patents},
+                language = {en},
+                title = {Extraction of chemical structures and reactions from the literature},
+                publisher = {Apollo - University of Cambridge Repository},
+                year = {2012},
+                copyright = {All Rights Reserved}}
diff --git a/data/USPTO_500k/transform.py b/data/USPTO_500k/transform.py
new file mode 100644
index 000000000..98e753076
--- /dev/null
+++ b/data/USPTO_500k/transform.py
@@ -0,0 +1,122 @@
+import pandas as pd
+import yaml
+from tdc.single_pred import Tox
+
+
+def get_and_transform_data():
+    # get raw data
+    df1 = pd.read_csv('https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_above.csv')
+    df2 = pd.read_csv('https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_below.csv')
+    data = pd.concat([df1,df2])
+    data = data[['rxn','yield']]
+    data= data.drop_duplicates(subset='rxn')
+    fn_data_original = "uptso.csv"
+    data.to_csv(fn_data_original, index=False)
+    
+    # create dataframe
+    df = pd.read_csv(fn_data_original, 
+                     delimiter=","
+        )# not necessary but ensure we can load the saved data
+    
+    # check if fields are the same
+    fields_orig = df.columns.tolist()
+    assert fields_orig == ['rxn', 'yield']
+    fields_clean = [
+        "reaction_SMILES",
+        "yield"
+    ]
+    
+    # overwrite column names = fields
+    df.columns = fields_clean
+    assert fields_orig != fields_clean
+    
+    # remove leading and trailing white space characters
+    assert not df.duplicated().sum()
+    
+    # save to csv
+    fn_data_csv = "data_clean.csv"
+    df.to_csv(fn_data_csv, index=False)
+
+    
+    # create meta yaml
+    meta =  {
+        "name": "USPTO_500k",  # unique identifier, we will also use this for directory names
+        "description": """United States Patent and Trademark Office reaction dataset with yields.""",
+        "targets": [
+            {
+                "id": "yield",  # name of the column in a tabular dataset
+                "description": "Reaction yields analyzed by UPLC",  # description of what this column means
+                "units": "%",  # units of the values in this column (leave empty if unitless)
+                "type": "continuous",  # can be "categorical", "ordinal", "continuous"
+                "names": [  # names for the property (to sample from for building the prompts)
+                    "Reaction yield",
+                    "yield",
+                ],
+                "uris":[
+                    "https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227",
+                    "https://en.wikipedia.org/wiki/Yield_(chemistry)",
+                ],
+            },
+        ],
+        "identifiers": [
+            {
+                "id": "reaction_SMILES",  # column name
+                "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
+                "description": "reaction SMILES",  # description (optional, except for "Other")
+            },
+        ],
+        "license": "CC0",  # license under which the original dataset was published
+        "links": [  # list of relevant links (original dataset, other uses, etc.)
+            {
+                "url": "https://doi.org/10.17863/CAM.16293",
+                "description": "corresponding publication",
+            },
+            {
+                "url": "https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv",
+                "description": "data source",
+            },
+            {
+                "url": "https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv",
+                "description": "data source",
+            },
+            {
+                "url": "https://tdcommons.ai/single_pred_tasks/yields/#uspto",
+                "description": "other source",
+            }
+        ],
+        "num_points": len(df),  # number of datapoints in this dataset
+        "bibtex": [
+            """@article{https://doi.org/10.17863/cam.16293,
+              doi = {10.17863/CAM.16293},
+              url = {https://www.repository.cam.ac.uk/handle/1810/244727},
+              author = {Lowe,  Daniel Mark},
+              keywords = {Name to structure,  OPSIN,  Chemical text mining,  Text mining,  Patent reaction extraction,  Reaction mining,  Patents},
+              language = {en},
+              title = {Extraction of chemical structures and reactions from the literature},
+              publisher = {Apollo - University of Cambridge Repository},
+              year = {2012},
+              copyright = {All Rights Reserved}}""",
+        ],
+    }
+
+    def str_presenter(dumper, data):
+        """configures yaml for dumping multiline strings
+        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
+        """
+        if data.count("\n") > 0:  # check for multiline string
+            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+        return dumper.represent_scalar("tag:yaml.org,2002:str", data)
+
+    yaml.add_representer(str, str_presenter)
+    yaml.representer.SafeRepresenter.add_representer(
+        str, str_presenter
+    )  # to use with safe_dum
+    fn_meta = "meta.yaml"
+    with open(fn_meta, "w") as f:
+        yaml.dump(meta, f, sort_keys=False)
+
+    print(f"Finished processing {meta['name']} dataset!")
+
+
+if __name__ == "__main__":
+    get_and_transform_data()

From 0303bf39134263ad581f65293d3984bbecfcff01 Mon Sep 17 00:00:00 2001
From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com>
Date: Thu, 16 Mar 2023 16:28:00 +0200
Subject: [PATCH 02/13] Update data/USPTO_500k/meta.yaml

Co-authored-by: Kevin M Jablonka <32935233+kjappelbaum@users.noreply.github.com>
---
 data/USPTO_500k/meta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/USPTO_500k/meta.yaml b/data/USPTO_500k/meta.yaml
index 1c6010066..adbd80857 100644
--- a/data/USPTO_500k/meta.yaml
+++ b/data/USPTO_500k/meta.yaml
@@ -13,7 +13,7 @@ targets:
   - https://en.wikipedia.org/wiki/Yield_(chemistry)
 identifiers:
 - id: reaction_SMILES
-  type: SMILES
+  type: RXN-SMILES
   description: reaction SMILES
 license: CC0
 links:

From dfdf3fdccb07bf7b5433872b3bd4f61696f21ae3 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 16 Mar 2023 14:28:15 +0000
Subject: [PATCH 03/13] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 data/USPTO_500k/meta.yaml    | 65 ++++++++++++++++++------------------
 data/USPTO_500k/transform.py | 44 ++++++++++++------------
 2 files changed, 55 insertions(+), 54 deletions(-)

diff --git a/data/USPTO_500k/meta.yaml b/data/USPTO_500k/meta.yaml
index adbd80857..06d9252c0 100644
--- a/data/USPTO_500k/meta.yaml
+++ b/data/USPTO_500k/meta.yaml
@@ -1,40 +1,41 @@
+---
 name: USPTO_500k
 description: United States Patent and Trademark Office reaction dataset with yields.
 targets:
-- id: yield
-  description: Reaction yields analyzed by UPLC
-  units: '%'
-  type: continuous
-  names:
-  - Reaction yield
-  - yield
-  uris:
-  - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227
-  - https://en.wikipedia.org/wiki/Yield_(chemistry)
+    - id: yield
+      description: Reaction yields analyzed by UPLC
+      units: '%'
+      type: continuous
+      names:
+          - Reaction yield
+          - yield
+      uris:
+          - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227
+          - https://en.wikipedia.org/wiki/Yield_(chemistry)
 identifiers:
-- id: reaction_SMILES
-  type: RXN-SMILES
-  description: reaction SMILES
+    - id: reaction_SMILES
+      type: RXN-SMILES
+      description: reaction SMILES
 license: CC0
 links:
-- url: https://doi.org/10.17863/CAM.16293
-  description: corresponding publication
-- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv
-  description: data source
-- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv
-  description: data source
-- url: https://tdcommons.ai/single_pred_tasks/yields/#uspto
-  description: other source
+    - url: https://doi.org/10.17863/CAM.16293
+      description: corresponding publication
+    - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv
+      description: data source
+    - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv
+      description: data source
+    - url: https://tdcommons.ai/single_pred_tasks/yields/#uspto
+      description: other source
 num_points: 498721
 bibtex:
-- |-
-  @article{https://doi.org/10.17863/cam.16293,
-                doi = {10.17863/CAM.16293},
-                url = {https://www.repository.cam.ac.uk/handle/1810/244727},
-                author = {Lowe,  Daniel Mark},
-                keywords = {Name to structure,  OPSIN,  Chemical text mining,  Text mining,  Patent reaction extraction,  Reaction mining,  Patents},
-                language = {en},
-                title = {Extraction of chemical structures and reactions from the literature},
-                publisher = {Apollo - University of Cambridge Repository},
-                year = {2012},
-                copyright = {All Rights Reserved}}
+    - |-
+      @article{https://doi.org/10.17863/cam.16293,
+                    doi = {10.17863/CAM.16293},
+                    url = {https://www.repository.cam.ac.uk/handle/1810/244727},
+                    author = {Lowe,  Daniel Mark},
+                    keywords = {Name to structure,  OPSIN,  Chemical text mining,  Text mining,  Patent reaction extraction,  Reaction mining,  Patents},
+                    language = {en},
+                    title = {Extraction of chemical structures and reactions from the literature},
+                    publisher = {Apollo - University of Cambridge Repository},
+                    year = {2012},
+                    copyright = {All Rights Reserved}}
diff --git a/data/USPTO_500k/transform.py b/data/USPTO_500k/transform.py
index 98e753076..8596e43c6 100644
--- a/data/USPTO_500k/transform.py
+++ b/data/USPTO_500k/transform.py
@@ -5,41 +5,41 @@
 
 def get_and_transform_data():
     # get raw data
-    df1 = pd.read_csv('https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_above.csv')
-    df2 = pd.read_csv('https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_below.csv')
-    data = pd.concat([df1,df2])
-    data = data[['rxn','yield']]
-    data= data.drop_duplicates(subset='rxn')
+    df1 = pd.read_csv(
+        "https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_above.csv"
+    )
+    df2 = pd.read_csv(
+        "https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_below.csv"
+    )
+    data = pd.concat([df1, df2])
+    data = data[["rxn", "yield"]]
+    data = data.drop_duplicates(subset="rxn")
     fn_data_original = "uptso.csv"
     data.to_csv(fn_data_original, index=False)
-    
+
     # create dataframe
-    df = pd.read_csv(fn_data_original, 
-                     delimiter=","
-        )# not necessary but ensure we can load the saved data
-    
+    df = pd.read_csv(
+        fn_data_original, delimiter=","
+    )  # not necessary but ensure we can load the saved data
+
     # check if fields are the same
     fields_orig = df.columns.tolist()
-    assert fields_orig == ['rxn', 'yield']
-    fields_clean = [
-        "reaction_SMILES",
-        "yield"
-    ]
-    
+    assert fields_orig == ["rxn", "yield"]
+    fields_clean = ["reaction_SMILES", "yield"]
+
     # overwrite column names = fields
     df.columns = fields_clean
     assert fields_orig != fields_clean
-    
+
     # remove leading and trailing white space characters
     assert not df.duplicated().sum()
-    
+
     # save to csv
     fn_data_csv = "data_clean.csv"
     df.to_csv(fn_data_csv, index=False)
 
-    
     # create meta yaml
-    meta =  {
+    meta = {
         "name": "USPTO_500k",  # unique identifier, we will also use this for directory names
         "description": """United States Patent and Trademark Office reaction dataset with yields.""",
         "targets": [
@@ -52,7 +52,7 @@ def get_and_transform_data():
                     "Reaction yield",
                     "yield",
                 ],
-                "uris":[
+                "uris": [
                     "https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227",
                     "https://en.wikipedia.org/wiki/Yield_(chemistry)",
                 ],
@@ -82,7 +82,7 @@ def get_and_transform_data():
             {
                 "url": "https://tdcommons.ai/single_pred_tasks/yields/#uspto",
                 "description": "other source",
-            }
+            },
         ],
         "num_points": len(df),  # number of datapoints in this dataset
         "bibtex": [

From f5bf9109169ab16a7775b74b45c804a491fb5cb6 Mon Sep 17 00:00:00 2001
From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com>
Date: Sat, 25 Mar 2023 00:26:08 +0200
Subject: [PATCH 04/13] Add lowercase uspto_500k dataset with TDC benchmark metadata

---
 data/uspto_500k/meta.yaml    |  45 ++++++++++++
 data/uspto_500k/transform.py | 130 +++++++++++++++++++++++++++++++++++
 2 files changed, 175 insertions(+)
 create mode 100644 data/uspto_500k/meta.yaml
 create mode 100644 data/uspto_500k/transform.py

diff --git a/data/uspto_500k/meta.yaml b/data/uspto_500k/meta.yaml
new file mode 100644
index 000000000..0270ec866
--- /dev/null
+++ b/data/uspto_500k/meta.yaml
@@ -0,0 +1,45 @@
+name: uspto_500k
+description: United States Patent and Trademark Office reaction dataset with yields.
+targets:
+- id: yield
+  description: Reaction yields analyzed by UPLC
+  units: '%'
+  type: continuous
+  names:
+  - Reaction yield
+  - yield
+  uris:
+  - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227
+  - https://en.wikipedia.org/wiki/Yield_(chemistry)
+benchmarks:
+- name: TDC
+  link: https://tdcommons.ai/
+  split_column: split
+identifiers:
+- id: reaction_SMILES
+  type: SMILES
+  description: reaction SMILES
+license: CC0
+links:
+- url: https://doi.org/10.17863/CAM.16293
+  description: corresponding publication
+- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv
+  description: data source
+- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv
+  description: data source
+- url: https://tdcommons.ai/single_pred_tasks/yields/#uspto
+  description: other source
+num_points: 498721
+bibtex:
+- |-
+  @article{https://doi.org/10.17863/cam.16293,
+  doi = {10.17863/CAM.16293},
+  url = {https://www.repository.cam.ac.uk/handle/1810/244727},
+  year = {2012},
+  publisher = {Apollo - University of Cambridge Repository},
+  keywords = {Name to structure,  OPSIN,  Chemical text mining,  Text mining,
+  Patent reaction extraction,  Reaction mining,  Patents},
+  language = {en},
+  author = {Lowe,  Daniel Mark},
+  title = {Extraction of chemical structures and reactions from the literature},
+  copyright = {All Rights Reserved}
diff --git a/data/uspto_500k/transform.py b/data/uspto_500k/transform.py
new file mode 100644
index 000000000..0a322c1fd
--- /dev/null
+++ b/data/uspto_500k/transform.py
@@ -0,0 +1,130 @@
+import pandas as pd
+import yaml
+from tdc.single_pred import Tox
+
+
+def get_and_transform_data():
+    # get raw data
+    df1 = pd.read_csv(
+        "https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_above.csv"
+    )
+    df2 = pd.read_csv(
+        "https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_below.csv"
+    )
+    data = pd.concat([df1, df2])
+    data = data[["rxn", "yield"]]
+    data = data.drop_duplicates(subset="rxn")
+    fn_data_original = "uptso.csv"
+    data.to_csv(fn_data_original, index=False)
+
+    # create dataframe
+    df = pd.read_csv(
+        fn_data_original, delimiter=","
+    )  # not necessary but ensure we can load the saved data
+
+    # check if fields are the same
+    fields_orig = df.columns.tolist()
+    assert fields_orig == ["rxn", "yield"]
+    fields_clean = ["reaction_SMILES", "yield"]
+
+    # overwrite column names = fields
+    df.columns = fields_clean
+    assert fields_orig != fields_clean
+
+    # sanity check: no fully duplicated rows may remain after the rxn de-duplication
+    assert not df.duplicated().sum()
+
+    # save to csv
+    fn_data_csv = "data_clean.csv"
+    df.to_csv(fn_data_csv, index=False)
+
+    # create meta yaml
+    meta = {
+        "name": "uspto_500k",  # unique identifier, we will also use this for directory names
+        "description": """United States Patent and Trademark Office reaction dataset with yields.""",
+        "targets": [
+            {
+                "id": "yield",  # name of the column in a tabular dataset
+                "description": "Reaction yields analyzed by UPLC",  # description of what this column means
+                "units": "%",  # units of the values in this column (leave empty if unitless)
+                "type": "continuous",  # can be "categorical", "ordinal", "continuous"
+                "names": [  # names for the property (to sample from for building the prompts)
+                    "Reaction yield",
+                    "yield",
+                ],
+                "uris": [
+                    "https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227",
+                    "https://en.wikipedia.org/wiki/Yield_(chemistry)",
+                ],
+            },
+        ],
+        "benchmarks": [
+        {
+            "name": "TDC",  # unique benchmark name
+            "link": "https://tdcommons.ai/",  # benchmark URL
+            "split_column": "split",  # name of the column that contains the split information
+        },
+        ],
+        "identifiers": [
+            {
+                "id": "reaction_SMILES",  # column name
+                "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
+                "description": "reaction SMILES",  # description (optional, except for "Other")
+            },
+        ],
+        "license": "CC0",  # license under which the original dataset was published
+        "links": [  # list of relevant links (original dataset, other uses, etc.)
+            {
+                "url": "https://doi.org/10.17863/CAM.16293",
+                "description": "corresponding publication",
+            },
+            {
+                "url": "https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv",
+                "description": "data source",
+            },
+            {
+                "url": "https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv",
+                "description": "data source",
+            },
+            {
+                "url": "https://tdcommons.ai/single_pred_tasks/yields/#uspto",
+                "description": "other source",
+            },
+        ],
+        "num_points": len(df),  # number of datapoints in this dataset
+        "bibtex": [
+            """@article{https://doi.org/10.17863/cam.16293,
+doi = {10.17863/CAM.16293},
+url = {https://www.repository.cam.ac.uk/handle/1810/244727},
+year = {2012},
+publisher = {Apollo - University of Cambridge Repository},
+keywords = {Name to structure,  OPSIN,  Chemical text mining,  Text mining,
+Patent reaction extraction,  Reaction mining,  Patents},
+language = {en},
+author = {Lowe,  Daniel Mark},
+title = {Extraction of chemical structures and reactions from the literature},
+copyright = {All Rights Reserved}}""",
+        ],
+    }
+
+    def str_presenter(dumper, data):
+        """configures yaml for dumping multiline strings
+        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
+        """
+        if data.count("\n") > 0:  # check for multiline string
+            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+        return dumper.represent_scalar("tag:yaml.org,2002:str", data)
+
+    yaml.add_representer(str, str_presenter)
+    yaml.representer.SafeRepresenter.add_representer(
+        str, str_presenter
+    )  # to use with safe_dump
+    fn_meta = "meta.yaml"
+    with open(fn_meta, "w") as f:
+        yaml.dump(meta, f, sort_keys=False)
+
+    print(f"Finished processing {meta['name']} dataset!")
+
+
+if __name__ == "__main__":
+    get_and_transform_data()

From 69af75c2fc1ab2137e84c708fc9b3f7b4b1f2487 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 24 Mar 2023 22:26:16 +0000
Subject: [PATCH 05/13] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 data/uspto_500k/meta.yaml    | 73 ++++++++++++++++++------------------
 data/uspto_500k/transform.py | 10 ++---
 2 files changed, 42 insertions(+), 41 deletions(-)

diff --git a/data/uspto_500k/meta.yaml b/data/uspto_500k/meta.yaml
index 0270ec866..59fd50cf2 100644
--- a/data/uspto_500k/meta.yaml
+++ b/data/uspto_500k/meta.yaml
@@ -1,45 +1,46 @@
+---
 name: uspto_500k
 description: United States Patent and Trademark Office reaction dataset with yields.
 targets:
-- id: yield
-  description: Reaction yields analyzed by UPLC
-  units: '%'
-  type: continuous
-  names:
-  - Reaction yield
-  - yield
-  uris:
-  - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227
-  - https://en.wikipedia.org/wiki/Yield_(chemistry)
+    - id: yield
+      description: Reaction yields analyzed by UPLC
+      units: '%'
+      type: continuous
+      names:
+          - Reaction yield
+          - yield
+      uris:
+          - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227
+          - https://en.wikipedia.org/wiki/Yield_(chemistry)
 benchmarks:
-- name: TDC
-  link: https://tdcommons.ai/
-  split_column: split
+    - name: TDC
+      link: https://tdcommons.ai/
+      split_column: split
 identifiers:
-- id: reaction_SMILES
-  type: SMILES
-  description: reaction SMILES
+    - id: reaction_SMILES
+      type: SMILES
+      description: reaction SMILES
 license: CC0
 links:
-- url: https://doi.org/10.17863/CAM.16293
-  description: corresponding publication
-- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv
-  description: data source
-- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv
-  description: data source
-- url: https://tdcommons.ai/single_pred_tasks/yields/#uspto
-  description: other source
+    - url: https://doi.org/10.17863/CAM.16293
+      description: corresponding publication
+    - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv
+      description: data source
+    - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv
+      description: data source
+    - url: https://tdcommons.ai/single_pred_tasks/yields/#uspto
+      description: other source
 num_points: 498721
 bibtex:
-- |-
-  @article{https://doi.org/10.17863/cam.16293,
-  doi = {10.17863/CAM.16293},
-  url = {https://www.repository.cam.ac.uk/handle/1810/244727},
-  year = {2012},
-  publisher = {Apollo - University of Cambridge Repository},
-  keywords = {Name to structure,  OPSIN,  Chemical text mining,  Text mining,
-  Patent reaction extraction,  Reaction mining,  Patents},
-  language = {en},
-  author = {Lowe,  Daniel Mark},
-  title = {Extraction of chemical structures and reactions from the literature},
-  copyright = {All Rights Reserved}
+    - |-
+      @article{https://doi.org/10.17863/cam.16293,
+      doi = {10.17863/CAM.16293},
+      url = {https://www.repository.cam.ac.uk/handle/1810/244727},
+      year = {2012},
+      publisher = {Apollo - University of Cambridge Repository},
+      keywords = {Name to structure,  OPSIN,  Chemical text mining,  Text mining,
+      Patent reaction extraction,  Reaction mining,  Patents},
+      language = {en},
+      author = {Lowe,  Daniel Mark},
+      title = {Extraction of chemical structures and reactions from the literature},
+      copyright = {All Rights Reserved}}
diff --git a/data/uspto_500k/transform.py b/data/uspto_500k/transform.py
index 0a322c1fd..6d407d93e 100644
--- a/data/uspto_500k/transform.py
+++ b/data/uspto_500k/transform.py
@@ -59,11 +59,11 @@ def get_and_transform_data():
             },
         ],
         "benchmarks": [
-        {
-            "name": "TDC",  # unique benchmark name
-            "link": "https://tdcommons.ai/",  # benchmark URL
-            "split_column": "split",  # name of the column that contains the split information
-        },
+            {
+                "name": "TDC",  # unique benchmark name
+                "link": "https://tdcommons.ai/",  # benchmark URL
+                "split_column": "split",  # name of the column that contains the split information
+            },
         ],
         "identifiers": [
             {

From 8ec54afba8e3c521aacdae0416f65b77de95cd4d Mon Sep 17 00:00:00 2001
From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com>
Date: Sat, 25 Mar 2023 00:27:07 +0200
Subject: [PATCH 06/13] Delete data/USPTO_500k directory

---
 data/USPTO_500k/meta.yaml    |  41 ------------
 data/USPTO_500k/transform.py | 122 -----------------------------------
 2 files changed, 163 deletions(-)
 delete mode 100644 data/USPTO_500k/meta.yaml
 delete mode 100644 data/USPTO_500k/transform.py

diff --git a/data/USPTO_500k/meta.yaml b/data/USPTO_500k/meta.yaml
deleted file mode 100644
index 06d9252c0..000000000
--- a/data/USPTO_500k/meta.yaml
+++ /dev/null
@@ -1,41 +0,0 @@
----
-name: USPTO_500k
-description: United States Patent and Trademark Office reaction dataset with yields.
-targets:
-    - id: yield
-      description: Reaction yields analyzed by UPLC
-      units: '%'
-      type: continuous
-      names:
-          - Reaction yield
-          - yield
-      uris:
-          - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227
-          - https://en.wikipedia.org/wiki/Yield_(chemistry)
-identifiers:
-    - id: reaction_SMILES
-      type: RXN-SMILES
-      description: reaction SMILES
-license: CC0
-links:
-    - url: https://doi.org/10.17863/CAM.16293
-      description: corresponding publication
-    - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv
-      description: data source
-    - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv
-      description: data source
-    - url: https://tdcommons.ai/single_pred_tasks/yields/#uspto
-      description: other source
-num_points: 498721
-bibtex:
-    - |-
-      @article{https://doi.org/10.17863/cam.16293,
-                    doi = {10.17863/CAM.16293},
-                    url = {https://www.repository.cam.ac.uk/handle/1810/244727},
-                    author = {Lowe,  Daniel Mark},
-                    keywords = {Name to structure,  OPSIN,  Chemical text mining,  Text mining,  Patent reaction extraction,  Reaction mining,  Patents},
-                    language = {en},
-                    title = {Extraction of chemical structures and reactions from the literature},
-                    publisher = {Apollo - University of Cambridge Repository},
-                    year = {2012},
-                    copyright = {All Rights Reserved}}
diff --git a/data/USPTO_500k/transform.py b/data/USPTO_500k/transform.py
deleted file mode 100644
index 8596e43c6..000000000
--- a/data/USPTO_500k/transform.py
+++ /dev/null
@@ -1,122 +0,0 @@
-import pandas as pd
-import yaml
-from tdc.single_pred import Tox
-
-
-def get_and_transform_data():
-    # get raw data
-    df1 = pd.read_csv(
-        "https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_above.csv"
-    )
-    df2 = pd.read_csv(
-        "https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_below.csv"
-    )
-    data = pd.concat([df1, df2])
-    data = data[["rxn", "yield"]]
-    data = data.drop_duplicates(subset="rxn")
-    fn_data_original = "uptso.csv"
-    data.to_csv(fn_data_original, index=False)
-
-    # create dataframe
-    df = pd.read_csv(
-        fn_data_original, delimiter=","
-    )  # not necessary but ensure we can load the saved data
-
-    # check if fields are the same
-    fields_orig = df.columns.tolist()
-    assert fields_orig == ["rxn", "yield"]
-    fields_clean = ["reaction_SMILES", "yield"]
-
-    # overwrite column names = fields
-    df.columns = fields_clean
-    assert fields_orig != fields_clean
-
-    # remove leading and trailing white space characters
-    assert not df.duplicated().sum()
-
-    # save to csv
-    fn_data_csv = "data_clean.csv"
-    df.to_csv(fn_data_csv, index=False)
-
-    # create meta yaml
-    meta = {
-        "name": "USPTO_500k",  # unique identifier, we will also use this for directory names
-        "description": """United States Patent and Trademark Office reaction dataset with yields.""",
-        "targets": [
-            {
-                "id": "yield",  # name of the column in a tabular dataset
-                "description": "Reaction yields analyzed by UPLC",  # description of what this column means
-                "units": "%",  # units of the values in this column (leave empty if unitless)
-                "type": "continuous",  # can be "categorical", "ordinal", "continuous"
-                "names": [  # names for the property (to sample from for building the prompts)
-                    "Reaction yield",
-                    "yield",
-                ],
-                "uris": [
-                    "https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227",
-                    "https://en.wikipedia.org/wiki/Yield_(chemistry)",
-                ],
-            },
-        ],
-        "identifiers": [
-            {
-                "id": "reaction_SMILES",  # column name
-                "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
-                "description": "reaction SMILES",  # description (optional, except for "Other")
-            },
-        ],
-        "license": "CC0",  # license under which the original dataset was published
-        "links": [  # list of relevant links (original dataset, other uses, etc.)
-            {
-                "url": "https://doi.org/10.17863/CAM.16293",
-                "description": "corresponding publication",
-            },
-            {
-                "url": "https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv",
-                "description": "data source",
-            },
-            {
-                "url": "https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv",
-                "description": "data source",
-            },
-            {
-                "url": "https://tdcommons.ai/single_pred_tasks/yields/#uspto",
-                "description": "other source",
-            },
-        ],
-        "num_points": len(df),  # number of datapoints in this dataset
-        "bibtex": [
-            """@article{https://doi.org/10.17863/cam.16293,
-              doi = {10.17863/CAM.16293},
-              url = {https://www.repository.cam.ac.uk/handle/1810/244727},
-              author = {Lowe,  Daniel Mark},
-              keywords = {Name to structure,  OPSIN,  Chemical text mining,  Text mining,  Patent reaction extraction,  Reaction mining,  Patents},
-              language = {en},
-              title = {Extraction of chemical structures and reactions from the literature},
-              publisher = {Apollo - University of Cambridge Repository},
-              year = {2012},
-              copyright = {All Rights Reserved}}""",
-        ],
-    }
-
-    def str_presenter(dumper, data):
-        """configures yaml for dumping multiline strings
-        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
-        """
-        if data.count("\n") > 0:  # check for multiline string
-            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
-        return dumper.represent_scalar("tag:yaml.org,2002:str", data)
-
-    yaml.add_representer(str, str_presenter)
-    yaml.representer.SafeRepresenter.add_representer(
-        str, str_presenter
-    )  # to use with safe_dum
-    fn_meta = "meta.yaml"
-    with open(fn_meta, "w") as f:
-        yaml.dump(meta, f, sort_keys=False)
-
-    print(f"Finished processing {meta['name']} dataset!")
-
-
-if __name__ == "__main__":
-    get_and_transform_data()

From 5c120787ed1ccf7b2c7235f5c0c07f48b1d6c152 Mon Sep 17 00:00:00 2001
From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com>
Date: Sat, 25 Mar 2023 00:40:10 +0200
Subject: [PATCH 07/13] Add files via upload

---
 data/uspto_500k/meta.yaml    | 73 ++++++++++++++++++------------------
 data/uspto_500k/transform.py | 10 ++---
 2 files changed, 41 insertions(+), 42 deletions(-)

diff --git a/data/uspto_500k/meta.yaml b/data/uspto_500k/meta.yaml
index 59fd50cf2..0270ec866 100644
--- a/data/uspto_500k/meta.yaml
+++ b/data/uspto_500k/meta.yaml
@@ -1,46 +1,45 @@
----
 name: uspto_500k
 description: United States Patent and Trademark Office reaction dataset with yields.
 targets:
-    - id: yield
-      description: Reaction yields analyzed by UPLC
-      units: '%'
-      type: continuous
-      names:
-          - Reaction yield
-          - yield
-      uris:
-          - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227
-          - https://en.wikipedia.org/wiki/Yield_(chemistry)
+- id: yield
+  description: Reaction yields analyzed by UPLC
+  units: '%'
+  type: continuous
+  names:
+  - Reaction yield
+  - yield
+  uris:
+  - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227
+  - https://en.wikipedia.org/wiki/Yield_(chemistry)
 benchmarks:
-    - name: TDC
-      link: https://tdcommons.ai/
-      split_column: split
+- name: TDC
+  link: https://tdcommons.ai/
+  split_column: split
 identifiers:
-    - id: reaction_SMILES
-      type: SMILES
-      description: reaction SMILES
+- id: reaction_SMILES
+  type: SMILES
+  description: reaction SMILES
 license: CC0
 links:
-    - url: https://doi.org/10.17863/CAM.16293
-      description: corresponding publication
-    - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv
-      description: data source
-    - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv
-      description: data source
-    - url: https://tdcommons.ai/single_pred_tasks/yields/#uspto
-      description: other source
+- url: https://doi.org/10.17863/CAM.16293
+  description: corresponding publication
+- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv
+  description: data source
+- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv
+  description: data source
+- url: https://tdcommons.ai/single_pred_tasks/yields/#uspto
+  description: other source
 num_points: 498721
 bibtex:
-    - |-
-      @article{https://doi.org/10.17863/cam.16293,
-      doi = {10.17863/CAM.16293},
-      url = {https://www.repository.cam.ac.uk/handle/1810/244727},
-      year = {2012},
-      publisher = {Apollo - University of Cambridge Repository},
-      keywords = {Name to structure,  OPSIN,  Chemical text mining,  Text mining,
-      Patent reaction extraction,  Reaction mining,  Patents},
-      language = {en},
-      author = {Lowe,  Daniel Mark},
-      title = {Extraction of chemical structures and reactions from the literature},
-      copyright = {All Rights Reserved}
+- |-
+  @article{https://doi.org/10.17863/cam.16293,
+  doi = {10.17863/CAM.16293},
+  url = {https://www.repository.cam.ac.uk/handle/1810/244727},
+  year = {2012},
+  publisher = {Apollo - University of Cambridge Repository},
+  keywords = {Name to structure,  OPSIN,  Chemical text mining,  Text mining,
+  Patent reaction extraction,  Reaction mining,  Patents},
+  language = {en},
+  author = {Lowe,  Daniel Mark},
+  title = {Extraction of chemical structures and reactions from the literature},
+  copyright = {All Rights Reserved}
diff --git a/data/uspto_500k/transform.py b/data/uspto_500k/transform.py
index 6d407d93e..0a322c1fd 100644
--- a/data/uspto_500k/transform.py
+++ b/data/uspto_500k/transform.py
@@ -59,11 +59,11 @@ def get_and_transform_data():
             },
         ],
         "benchmarks": [
-            {
-                "name": "TDC",  # unique benchmark name
-                "link": "https://tdcommons.ai/",  # benchmark URL
-                "split_column": "split",  # name of the column that contains the split information
-            },
+        {
+            "name": "TDC",  # unique benchmark name
+            "link": "https://tdcommons.ai/",  # benchmark URL
+            "split_column": "split",  # name of the column that contains the split information
+        },
         ],
         "identifiers": [
             {

From 5716744d03a0e8a81cce9e91c3304f0c5debd187 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 24 Mar 2023 22:41:09 +0000
Subject: [PATCH 08/13] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 data/uspto_500k/meta.yaml    | 73 ++++++++++++++++++------------------
 data/uspto_500k/transform.py | 10 ++---
 2 files changed, 42 insertions(+), 41 deletions(-)

diff --git a/data/uspto_500k/meta.yaml b/data/uspto_500k/meta.yaml
index 0270ec866..59fd50cf2 100644
--- a/data/uspto_500k/meta.yaml
+++ b/data/uspto_500k/meta.yaml
@@ -1,45 +1,46 @@
+---
 name: uspto_500k
 description: United States Patent and Trademark Office reaction dataset with yields.
 targets:
-- id: yield
-  description: Reaction yields analyzed by UPLC
-  units: '%'
-  type: continuous
-  names:
-  - Reaction yield
-  - yield
-  uris:
-  - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227
-  - https://en.wikipedia.org/wiki/Yield_(chemistry)
+    - id: yield
+      description: Reaction yields analyzed by UPLC
+      units: '%'
+      type: continuous
+      names:
+          - Reaction yield
+          - yield
+      uris:
+          - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227
+          - https://en.wikipedia.org/wiki/Yield_(chemistry)
 benchmarks:
-- name: TDC
-  link: https://tdcommons.ai/
-  split_column: split
+    - name: TDC
+      link: https://tdcommons.ai/
+      split_column: split
 identifiers:
-- id: reaction_SMILES
-  type: SMILES
-  description: reaction SMILES
+    - id: reaction_SMILES
+      type: SMILES
+      description: reaction SMILES
 license: CC0
 links:
-- url: https://doi.org/10.17863/CAM.16293
-  description: corresponding publication
-- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv
-  description: data source
-- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv
-  description: data source
-- url: https://tdcommons.ai/single_pred_tasks/yields/#uspto
-  description: other source
+    - url: https://doi.org/10.17863/CAM.16293
+      description: corresponding publication
+    - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv
+      description: data source
+    - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv
+      description: data source
+    - url: https://tdcommons.ai/single_pred_tasks/yields/#uspto
+      description: other source
 num_points: 498721
 bibtex:
-- |-
-  @article{https://doi.org/10.17863/cam.16293,
-  doi = {10.17863/CAM.16293},
-  url = {https://www.repository.cam.ac.uk/handle/1810/244727},
-  year = {2012},
-  publisher = {Apollo - University of Cambridge Repository},
-  keywords = {Name to structure,  OPSIN,  Chemical text mining,  Text mining,
-  Patent reaction extraction,  Reaction mining,  Patents},
-  language = {en},
-  author = {Lowe,  Daniel Mark},
-  title = {Extraction of chemical structures and reactions from the literature},
-  copyright = {All Rights Reserved}
+    - |-
+      @article{https://doi.org/10.17863/cam.16293,
+      doi = {10.17863/CAM.16293},
+      url = {https://www.repository.cam.ac.uk/handle/1810/244727},
+      year = {2012},
+      publisher = {Apollo - University of Cambridge Repository},
+      keywords = {Name to structure,  OPSIN,  Chemical text mining,  Text mining,
+      Patent reaction extraction,  Reaction mining,  Patents},
+      language = {en},
+      author = {Lowe,  Daniel Mark},
+      title = {Extraction of chemical structures and reactions from the literature},
+      copyright = {All Rights Reserved}
diff --git a/data/uspto_500k/transform.py b/data/uspto_500k/transform.py
index 0a322c1fd..6d407d93e 100644
--- a/data/uspto_500k/transform.py
+++ b/data/uspto_500k/transform.py
@@ -59,11 +59,11 @@ def get_and_transform_data():
             },
         ],
         "benchmarks": [
-        {
-            "name": "TDC",  # unique benchmark name
-            "link": "https://tdcommons.ai/",  # benchmark URL
-            "split_column": "split",  # name of the column that contains the split information
-        },
+            {
+                "name": "TDC",  # unique benchmark name
+                "link": "https://tdcommons.ai/",  # benchmark URL
+                "split_column": "split",  # name of the column that contains the split information
+            },
         ],
         "identifiers": [
             {

From b00d2280747bb9f0d4dee92772ebaf3b39567be5 Mon Sep 17 00:00:00 2001
From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com>
Date: Tue, 28 Mar 2023 20:43:14 +0200
Subject: [PATCH 09/13] Update data/uspto_500k/meta.yaml

Co-authored-by: Kevin M Jablonka <32935233+kjappelbaum@users.noreply.github.com>
---
 data/uspto_500k/meta.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/data/uspto_500k/meta.yaml b/data/uspto_500k/meta.yaml
index 59fd50cf2..fa04c3600 100644
--- a/data/uspto_500k/meta.yaml
+++ b/data/uspto_500k/meta.yaml
@@ -11,7 +11,6 @@ targets:
           - yield
       uris:
           - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227
-          - https://en.wikipedia.org/wiki/Yield_(chemistry)
 benchmarks:
     - name: TDC
       link: https://tdcommons.ai/

From f231ca9e4418b784b4f920170b12850c5a76f844 Mon Sep 17 00:00:00 2001
From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com>
Date: Wed, 29 Mar 2023 01:41:41 +0200
Subject: [PATCH 10/13] Remove Benchmark field

I will add the benchmark field to the TDC version of USPTO.
---
 data/uspto_500k/meta.yaml    | 70 +++++++++++++++++-------------------
 data/uspto_500k/transform.py | 25 +++++--------
 2 files changed, 42 insertions(+), 53 deletions(-)

diff --git a/data/uspto_500k/meta.yaml b/data/uspto_500k/meta.yaml
index fa04c3600..96bf0717e 100644
--- a/data/uspto_500k/meta.yaml
+++ b/data/uspto_500k/meta.yaml
@@ -1,45 +1,41 @@
----
 name: uspto_500k
 description: United States Patent and Trademark Office reaction dataset with yields.
 targets:
-    - id: yield
-      description: Reaction yields analyzed by UPLC
-      units: '%'
-      type: continuous
-      names:
-          - Reaction yield
-          - yield
-      uris:
-          - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227
-benchmarks:
-    - name: TDC
-      link: https://tdcommons.ai/
-      split_column: split
+- id: yield
+  description: Reaction yields analyzed by UPLC
+  units: '%'
+  type: continuous
+  names:
+  - Reaction yield
+  - yield
+  uris:
+  - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227
+  - https://en.wikipedia.org/wiki/Yield_(chemistry)
 identifiers:
-    - id: reaction_SMILES
-      type: SMILES
-      description: reaction SMILES
+- id: reaction_SMILES
+  type: RXNSMILES
+  description: reaction SMILES
 license: CC0
 links:
-    - url: https://doi.org/10.17863/CAM.16293
-      description: corresponding publication
-    - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv
-      description: data source
-    - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv
-      description: data source
-    - url: https://tdcommons.ai/single_pred_tasks/yields/#uspto
-      description: other source
+- url: https://doi.org/10.17863/CAM.16293
+  description: corresponding publication
+- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv
+  description: data source
+- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv
+  description: data source
+- url: https://tdcommons.ai/single_pred_tasks/yields/#uspto
+  description: other source
 num_points: 498721
 bibtex:
-    - |-
-      @article{https://doi.org/10.17863/cam.16293,
-      doi = {10.17863/CAM.16293},
-      url = {https://www.repository.cam.ac.uk/handle/1810/244727},
-      year = {2012},
-      publisher = {Apollo - University of Cambridge Repository},
-      keywords = {Name to structure,  OPSIN,  Chemical text mining,  Text mining,
-      Patent reaction extraction,  Reaction mining,  Patents},
-      language = {en},
-      author = {Lowe,  Daniel Mark},
-      title = {Extraction of chemical structures and reactions from the literature},
-      copyright = {All Rights Reserved}
+- |-
+  @article{https://doi.org/10.17863/cam.16293,
+  doi = {10.17863/CAM.16293},
+  url = {https://www.repository.cam.ac.uk/handle/1810/244727},
+  year = {2012},
+  publisher = {Apollo - University of Cambridge Repository},
+  keywords = {Name to structure,  OPSIN,  Chemical text mining,  Text mining,
+  Patent reaction extraction,  Reaction mining,  Patents},
+  language = {en},
+  author = {Lowe,  Daniel Mark},
+  title = {Extraction of chemical structures and reactions from the literature},
+  copyright = {All Rights Reserved}
diff --git a/data/uspto_500k/transform.py b/data/uspto_500k/transform.py
index 6d407d93e..308817a7a 100644
--- a/data/uspto_500k/transform.py
+++ b/data/uspto_500k/transform.py
@@ -40,15 +40,15 @@ def get_and_transform_data():
 
     # create meta yaml
     meta = {
-        "name": "uspto_500k",  # unique identifier, we will also use this for directory names
+        "name": "uspto_500k",  
         "description": """United States Patent and Trademark Office reaction dataset with yields.""",
         "targets": [
             {
-                "id": "yield",  # name of the column in a tabular dataset
-                "description": "Reaction yields analyzed by UPLC",  # description of what this column means
-                "units": "%",  # units of the values in this column (leave empty if unitless)
-                "type": "continuous",  # can be "categorical", "ordinal", "continuous"
-                "names": [  # names for the property (to sample from for building the prompts)
+                "id": "yield",  
+                "description": "Reaction yields analyzed by UPLC",
+                "units": "%",  
+                "type": "continuous",  
+                "names": [  
                     "Reaction yield",
                     "yield",
                 ],
@@ -58,18 +58,11 @@ def get_and_transform_data():
                 ],
             },
         ],
-        "benchmarks": [
-            {
-                "name": "TDC",  # unique benchmark name
-                "link": "https://tdcommons.ai/",  # benchmark URL
-                "split_column": "split",  # name of the column that contains the split information
-            },
-        ],
         "identifiers": [
             {
-                "id": "reaction_SMILES",  # column name
-                "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
-                "description": "reaction SMILES",  # description (optional, except for "Other")
+                "id": "reaction_SMILES",  
+                "type": "RXNSMILES",  
+                "description": "reaction SMILES", 
             },
         ],
         "license": "CC0",  # license under which the original dataset was published

From 0bdd4aa1764b755558e5c7b86752c422e9204ab1 Mon Sep 17 00:00:00 2001
From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com>
Date: Wed, 29 Mar 2023 01:42:21 +0200
Subject: [PATCH 11/13] Remove benchmark field

I will add the benchmark field to the TDC version of USPTO.

From d55bfd95f4aa23a0646b39dab7d7f69df4d46a84 Mon Sep 17 00:00:00 2001
From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com>
Date: Wed, 29 Mar 2023 01:42:42 +0200
Subject: [PATCH 12/13] Remove benchmark field

I will add the benchmark field to the TDC version of USPTO.

From 8cf5f1a15e40d543bb39a40eb28baa346554b0af Mon Sep 17 00:00:00 2001
From: Michael Pieler <Michael.Pieler@Gmail.com>
Date: Fri, 14 Apr 2023 17:14:26 +0200
Subject: [PATCH 13/13] feat: uspto_500k clean up

---
 data/uspto_500k/meta.yaml    | 68 ++++++++++++++++++------------------
 data/uspto_500k/transform.py | 68 +++++++++++++++++++++---------------
 2 files changed, 74 insertions(+), 62 deletions(-)

diff --git a/data/uspto_500k/meta.yaml b/data/uspto_500k/meta.yaml
index 96bf0717e..0d1cfb0f1 100644
--- a/data/uspto_500k/meta.yaml
+++ b/data/uspto_500k/meta.yaml
@@ -1,41 +1,41 @@
+---
 name: uspto_500k
 description: United States Patent and Trademark Office reaction dataset with yields.
 targets:
-- id: yield
-  description: Reaction yields analyzed by UPLC
-  units: '%'
-  type: continuous
-  names:
-  - Reaction yield
-  - yield
-  uris:
-  - https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227
-  - https://en.wikipedia.org/wiki/Yield_(chemistry)
+    - id: yield
+      description: reaction yields
+      units: '%'
+      type: continuous
+      names:
+          - reaction yield
+          - yield
+      uris:
+          - http://purl.allotrope.org/ontologies/quality#AFQ_0000227
 identifiers:
-- id: reaction_SMILES
-  type: RXNSMILES
-  description: reaction SMILES
+    - id: reaction_SMILES
+      type: RXNSMILES
+      description: reaction SMILES
 license: CC0
 links:
-- url: https://doi.org/10.17863/CAM.16293
-  description: corresponding publication
-- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv
-  description: data source
-- url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv
-  description: data source
-- url: https://tdcommons.ai/single_pred_tasks/yields/#uspto
-  description: other source
-num_points: 498721
+    - url: https://doi.org/10.17863/CAM.16293
+      description: corresponding publication
+    - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_below.csv
+      description: data source
+    - url: https://github.com/reymond-group/drfp/blob/main/data/uspto_yields_above.csv
+      description: data source
+    - url: https://tdcommons.ai/single_pred_tasks/yields/#uspto
+      description: other source
+num_points: 853638
 bibtex:
-- |-
-  @article{https://doi.org/10.17863/cam.16293,
-  doi = {10.17863/CAM.16293},
-  url = {https://www.repository.cam.ac.uk/handle/1810/244727},
-  year = {2012},
-  publisher = {Apollo - University of Cambridge Repository},
-  keywords = {Name to structure,  OPSIN,  Chemical text mining,  Text mining,
-  Patent reaction extraction,  Reaction mining,  Patents},
-  language = {en},
-  author = {Lowe,  Daniel Mark},
-  title = {Extraction of chemical structures and reactions from the literature},
-  copyright = {All Rights Reserved}
+    - |-
+      @article{https://doi.org/10.17863/cam.16293,
+      doi = {10.17863/CAM.16293},
+      url = {https://www.repository.cam.ac.uk/handle/1810/244727},
+      year = {2012},
+      publisher = {Apollo - University of Cambridge Repository},
+      keywords = {Name to structure,  OPSIN,  Chemical text mining,  Text mining,
+      Patent reaction extraction,  Reaction mining,  Patents},
+      language = {en},
+      author = {Lowe,  Daniel Mark},
+      title = {Extraction of chemical structures and reactions from the literature},
+      copyright = {All Rights Reserved}
diff --git a/data/uspto_500k/transform.py b/data/uspto_500k/transform.py
index 308817a7a..f85e4a13d 100644
--- a/data/uspto_500k/transform.py
+++ b/data/uspto_500k/transform.py
@@ -1,21 +1,28 @@
 import pandas as pd
 import yaml
-from tdc.single_pred import Tox
+from tdc.single_pred import Yields
 
 
 def get_and_transform_data():
     # get raw data
-    df1 = pd.read_csv(
-        "https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_above.csv"
-    )
-    df2 = pd.read_csv(
-        "https://github.com/reymond-group/drfp/raw/main/data/uspto_yields_below.csv"
-    )
-    data = pd.concat([df1, df2])
-    data = data[["rxn", "yield"]]
-    data = data.drop_duplicates(subset="rxn")
-    fn_data_original = "uptso.csv"
-    data.to_csv(fn_data_original, index=False)
+    data = Yields(name="USPTO_Yields")
+    splits = data.get_split()
+    df_train = splits["train"]
+    df_valid = splits["valid"]
+    df_test = splits["test"]
+    df_train["split"] = "train"
+    df_valid["split"] = "valid"
+    df_test["split"] = "test"
+    df = pd.concat([df_train, df_valid, df_test], axis=0)
+
+    df["catalyst"] = df.Reaction.apply(lambda x: x["catalyst"])
+    df["reactant"] = df.Reaction.apply(lambda x: x["reactant"])
+    df["product"] = df.Reaction.apply(lambda x: x["product"])
+    df = df.drop("Reaction", axis=1)
+
+    fn_data_original = "data_original.csv"
+    df.to_csv(fn_data_original, index=False)
+    del df
 
     # create dataframe
     df = pd.read_csv(
@@ -24,12 +31,18 @@ def get_and_transform_data():
 
     # check if fields are the same
     fields_orig = df.columns.tolist()
-    assert fields_orig == ["rxn", "yield"]
-    fields_clean = ["reaction_SMILES", "yield"]
-
+    assert fields_orig == [
+        "Reaction_ID",
+        "Y",
+        "split",
+        "catalyst",
+        "reactant",
+        "product",
+    ]
+    fields_clean = ["Reaction_ID", "yield", "split", "catalyst", "reactant", "product"]
     # overwrite column names = fields
     df.columns = fields_clean
-    assert fields_orig != fields_clean
+    assert df.columns.tolist() == fields_clean
 
     # remove leading and trailing white space characters
     assert not df.duplicated().sum()
@@ -40,29 +53,28 @@ def get_and_transform_data():
 
     # create meta yaml
     meta = {
-        "name": "uspto_500k",  
+        "name": "uspto_500k",
         "description": """United States Patent and Trademark Office reaction dataset with yields.""",
         "targets": [
             {
-                "id": "yield",  
-                "description": "Reaction yields analyzed by UPLC",
-                "units": "%",  
-                "type": "continuous",  
-                "names": [  
-                    "Reaction yield",
+                "id": "yield",
+                "description": "reaction yields",
+                "units": "%",
+                "type": "continuous",
+                "names": [
+                    "reaction yield",
                     "yield",
                 ],
                 "uris": [
-                    "https://bioportal.bioontology.org/ontologies/AFO?p=classes&conceptid=http%3A%2F%2Fpurl.allotrope.org%2Fontologies%2Fquality%23AFQ_0000227",
-                    "https://en.wikipedia.org/wiki/Yield_(chemistry)",
+                    "http://purl.allotrope.org/ontologies/quality#AFQ_0000227",
                 ],
             },
         ],
         "identifiers": [
             {
-                "id": "reaction_SMILES",  
-                "type": "RXNSMILES",  
-                "description": "reaction SMILES", 
+                "id": "reaction_SMILES",
+                "type": "RXNSMILES",
+                "description": "reaction SMILES",
             },
         ],
         "license": "CC0",  # license under which the original dataset was published