OpenBioML · pschwllr · Mar 8, 2023 · Mar 8, 2023 · May 6, 2023 · May 7, 2023
diff --git a/data/buchwald_hartwig/meta.yaml b/data/buchwald_hartwig/meta.yaml
@@ -0,0 +1,39 @@
+name: buchwald_hartwig_doyle
+description: High-throughput experimentation palladium-catalyzed Buchwald Hartwig
+  C-N cross-coupling data set with yields.
+targets:
+- id: yield
+  description: Reaction yields analyzed by UPLC
+  units: '%'
+  type: continuous
+  names:
+  - Reaction yield
+  - yield
+identifiers:
+- id: reaction_SMILES
+  type: RXNSMILES
+  description: RXNSMILES
+license: MIT
+links:
+- url: https://doi.org/10.1126/science.aar5169
+  description: corresponding publication
+- url: https://www.sciencedirect.com/science/article/pii/S2451929420300851
+  description: publication with data processing
+- url: https://github.com/rxn4chemistry/rxn_yields/blob/master/rxn_yields/data.py
+  description: preprocessing
+- url: https://github.com/reymond-group/drfp/tree/main/data
+  description: dataset
+num_points: 3955
+url: https://doi.org/10.1126/science.aar5169
+bibtex:
+- |-
+  @article{ahneman2018predicting,
+  title={Predicting reaction performance in C--N cross-coupling using machine learning},
+  author={Ahneman, Derek T and Estrada, Jes{\'u}s G and Lin, Shishi and Dreher, Spencer D and Doyle, Abigail G},
+  journal={Science},
+  volume={360},
+  number={6385},
+  pages={186--190},
+  year={2018},
+  publisher={American Association for the Advancement of Science},
+  }
diff --git a/data/buchwald_hartwig/transform.py b/data/buchwald_hartwig/transform.py
@@ -0,0 +1,161 @@
+import pandas as pd
+import yaml
+from rdkit import Chem # 2022.9.5
+from rdkit.Chem import rdChemReactions
+
+def generate_buchwald_hartwig_rxns(df):
+    """
+    Converts the entries in the excel files to reaction SMILES.
+    From: https://github.com/reymond-group/drfp/blob/main/scripts/encoding/encode_buchwald_hartwig_reactions.py
+    and https://github.com/rxn4chemistry/rxn_yields/blob/master/rxn_yields/data.py
+    """
+    df = df.copy()
+    fwd_template = "[F,Cl,Br,I]-[c;H0;D3;+0:1](:[c,n:2]):[c,n:3].[NH2;D1;+0:4]-[c:5]>>[c,n:2]:[c;H0;D3;+0:1](:[c,n:3])-[NH;D2;+0:4]-[c:5]"
+    methylaniline = "Cc1ccc(N)cc1"
+    pd_catalyst = "O=S(=O)(O[Pd]1~[NH2]C2C=CC=CC=2C2C=CC=CC1=2)C(F)(F)F"
+    methylaniline_mol = Chem.MolFromSmiles(methylaniline)
+    rxn = rdChemReactions.ReactionFromSmarts(fwd_template)
+    products = []
+
+    for i, row in df.iterrows():
+        reacts = (Chem.MolFromSmiles(row["aryl_halide"]), methylaniline_mol)
+        rxn_products = rxn.RunReactants(reacts)
+
+        rxn_products_smiles = set([Chem.MolToSmiles(mol[0]) for mol in rxn_products])
+        assert len(rxn_products_smiles) == 1
+        products.append(list(rxn_products_smiles)[0])
+
+    df["product"] = products
+    rxns = []
+
+    for i, row in df.iterrows():
+        reactants = Chem.MolToSmiles(
+            Chem.MolFromSmiles(
+                f"{row['aryl_halide']}.{methylaniline}.{pd_catalyst}.{row['ligand']}.{row['base']}.{row['additive']}"
+            )
+        )
+        rxns.append(f"{reactants.replace('N~', '[NH2]')}>>{row['product']}")
+
+    return rxns
+
+def get_and_transform_data():
+    # get raw data
+    fn_data_original = "Dreher_and_Doyle_input_data.csv"
+    data = pd.read_excel('https://github.com/reymond-group/drfp/raw/main/data/Dreher_and_Doyle_input_data.xlsx')
+    data.to_csv(fn_data_original, index=False)
+
+    # create dataframe
+    df = pd.read_csv(
+        fn_data_original,
+        delimiter=",",
+    )  # not necessary but ensure we can load the saved data
+
+    # check if fields are the same
+    fields_orig = df.columns.tolist()
+    assert fields_orig == [
+        'Ligand',
+        'Additive', 
+        'Base', 
+        'Aryl halide', 
+        'Output'
+    ]
+
+    # overwrite column names = fields
+    fields_clean = [
+    "ligand",
+    "additive",
+    "base",
+    "aryl_halide",
+    'yield'
+    ]
+    df.columns = fields_clean
+
+    # data cleaning
+    reaction_SMILES = generate_buchwald_hartwig_rxns(df) # compile reactions
+    df.insert(4, 'reaction_SMILES', reaction_SMILES) # add reaction SMILES column
+
+    assert not df.duplicated().sum()
+
+    # save to csv
+    fn_data_csv = "data_clean.csv"
+    df.to_csv(fn_data_csv, index=False)
+
+    # create meta yaml
+    meta = {
+    "name": "buchwald_hartwig_doyle",  # unique identifier, we will also use this for directory names
+    "description": """High-throughput experimentation palladium-catalyzed Buchwald Hardwig C-N cross-coupling data set with yields.""",
+    "targets": [
+        {
+            "id": "yield",  # name of the column in a tabular dataset
+            "description": "Reaction yields analyzed by UPLC",  # description of what this column means
+            "units": "%",  # units of the values in this column (leave empty if unitless)
+            "type": "continuous",  # can be "categorical", "ordinal", "continuous"
+            "names": [  # names for the property (to sample from for building the prompts)
+                "Reaction yield",
+                "yield",
+            ],
+        },
+    ],
+    "identifiers": [
+        {
+            "id": "reaction_SMILES",  # column name
+            "type": "RXNSMILES",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
+            "description": "RXNSMILES",  # description (optional, except for "Other")
+        },
+    ],
+    "license": "MIT",  # license under which the original dataset was published
+    "links": [  # list of relevant links (original dataset, other uses, etc.)
+        {
+            "url": "https://doi.org/10.1126/science.aar5169",
+            "description": "corresponding publication",
+        },
+        {
+            "url": "https://www.sciencedirect.com/science/article/pii/S2451929420300851",
+            "description": "publication with data processing",
+        },
+        {
+            "url": "https://github.com/rxn4chemistry/rxn_yields/blob/master/rxn_yields/data.py",
+            "description": "preprocessing",
+        },
+        {
+            "url": "https://github.com/reymond-group/drfp/tree/main/data",
+            "description": "dataset",
+        }
+    ],
+        "num_points": len(df),  # number of datapoints in this dataset
+        "url": "https://doi.org/10.1126/science.aar5169",
+        "bibtex": [
+            """@article{ahneman2018predicting,
+title={Predicting reaction performance in C--N cross-coupling using machine learning},
+author={Ahneman, Derek T and Estrada, Jes{\'u}s G and Lin, Shishi and Dreher, Spencer D and Doyle, Abigail G},
+journal={Science},
+volume={360},
+number={6385},
+pages={186--190},
+year={2018},
+publisher={American Association for the Advancement of Science},
+}""",
+    ],
+    }
+
+    def str_presenter(dumper, data):
+        """configures yaml for dumping multiline strings
+        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
+        """
+        if data.count("\n") > 0:  # check for multiline string
+            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+        return dumper.represent_scalar("tag:yaml.org,2002:str", data)
+
+    yaml.add_representer(str, str_presenter)
+    yaml.representer.SafeRepresenter.add_representer(
+        str, str_presenter
+    )  # to use with safe_dum
+    fn_meta = "meta.yaml"
+    with open(fn_meta, "w") as f:
+        yaml.dump(meta, f, sort_keys=False)
+
+    print(f"Finished processing {meta['name']} dataset!")
+
+
+if __name__ == "__main__":  # run the pipeline only when executed as a script, not on import
+    get_and_transform_data()