From b317b2c8b2eceec36b1e6f111fb201a9d9f843e1 Mon Sep 17 00:00:00 2001
From: Michael Pieler <36303596+MicPie@users.noreply.github.com>
Date: Sat, 10 Aug 2024 17:51:12 +0200
Subject: [PATCH 1/2] Class balanced sampling (#524)

Co-authored-by: Michael Pieler <36303596+MicPie@users.noreply.github.com>
Co-authored-by: Michael Pieler <Michael.Pieler@Gmail.com>
Co-authored-by: Kevin Maik Jablonka <kevin.jablonka@epfl.ch>
Co-authored-by: Kevin M Jablonka <32935233+kjappelbaum@users.noreply.github.com>
---
 data/text_sampling/text_sampling.py | 488 +++++++++++++++++++---------
 1 file changed, 329 insertions(+), 159 deletions(-)

diff --git a/data/text_sampling/text_sampling.py b/data/text_sampling/text_sampling.py
index 94dadbe33..0f2a5e522 100644
--- a/data/text_sampling/text_sampling.py
+++ b/data/text_sampling/text_sampling.py
@@ -542,6 +542,7 @@ def check_targets_and_identifiers(meta: dict, df: pd.DataFrame):
         if "split" not in df.columns:
             df["split"] = "train"
         self.df = df
+        self.df_orig = None  # only used for class_balanced sampling to keep a copy of the original self.df
 
         # text templates
         self.benchmarking_templates = benchmarking_templates
@@ -906,12 +907,59 @@ def sample(self, sample: pd.Series, template_idx: int = None):
 
     def __getitem__(self, sample_idx: int, template_idx: int = None):
         """Get item from data with sample and template index.
-        A random template will be ised if no template index is handed over."""
+        A random template will be used if no template index is handed over."""
         sample = self.df.iloc[sample_idx]
         return self.sample(sample, template_idx)
 
-    def apply_sampling(self, template_idx: int = None):
+    def apply_sampling(
+        self, template_idx: int = None, class_balanced: bool = True
+    ):  # TODO: set class_balanced to False !!!
         """Applies the sampling to the entire data frame."""
+        if template_idx is not None and class_balanced:
+            # keep a copy of the unbalanced self.df; every call resamples from it
+            if self.df_orig is None:
+                self.df_orig = self.df.copy()
+
+            # collect the ids of all targets the template variables refer to
+            template = self.get_prompt_template_from_template_idx(template_idx)
+            target_to_balance = []
+            for target in self.meta["targets"]:
+                for var in template.input_variables:
+                    if (target["id"] in var.replace("#", "")) or (
+                        target["id"] in var.replace("%", "")
+                    ):
+                        target_to_balance.append(target["id"])
+            target_to_balance = list(set(target_to_balance))
+
+            # balance on a single target, picked at random if there are several
+            if len(target_to_balance) > 1:
+                print("TEMPLATE USES MORE THAN ONE TARGET!")
+                print(f"{target_to_balance=}")
+                target_to_balance = random.sample(target_to_balance, k=1)[0]
+                print(f"{target_to_balance=}")
+            elif target_to_balance:
+                # unwrap list of length 1
+                target_to_balance = target_to_balance[0]
+            if not target_to_balance:
+                # no target in the template: nothing to balance on, use all rows
+                print("TEMPLATE USES NO TARGET, CLASS BALANCING IS SKIPPED!")
+                self.df = self.df_orig
+            else:
+                col = self.df_orig[target_to_balance]
+                df_vc = col.value_counts()
+                vc_min = df_vc.min()
+                if df_vc.max() > 1:
+                    # draw vc_min rows for every distinct target value
+                    self.df = pd.concat(
+                        [
+                            self.df_orig[col == value].sample(vc_min)
+                            for value in df_vc.index.tolist()
+                        ]
+                    )
+                else:
+                    self.df = self.df_orig
+                print(self.df[target_to_balance].value_counts())
+
         self.df["sample"] = self.df.apply(
             lambda sample: self.sample(sample, template_idx), axis=1
         )
@@ -1047,84 +1095,226 @@ def export(self, fn_suffix: str = None):
         return pd.DataFrame(print_data)
 
     def apply_sampling_and_export(
-        self, template_idx: int = None, fn_suffix: str = None
+        self,
+        template_idx: int = None,
+        fn_suffix: str = None,
+        class_balanced: bool = True,
     ):
         """Applies the sampling and exports the data."""
-        self.apply_sampling(template_idx=template_idx)
+        self.apply_sampling(template_idx=template_idx, class_balanced=class_balanced)
         df_results = self.export(fn_suffix=fn_suffix)
+
+        # restore the unbalanced df; df_orig is only set if balancing has been applied
+        if class_balanced and self.df_orig is not None:
+            self.df = self.df_orig
+
         print(f"\n### results\n{df_results.to_string()}")
 
 
 if __name__ == "__main__":
     path_base = __file__.replace("text_sampling/text_sampling.py", "")
-    path_data_dir = sorted(glob.glob(path_base + "tabular/*"))
-    path_data_dir += sorted(
-        [p for p in glob.glob(path_base + "kg/*") if os.path.isdir(p)]
-    )
-    path_lm_eval_data_dir = path_base + "text_sampling/export"
-
-    # index = [i for i, x in enumerate(path_data_dir) if x.find("RedDB") != -1][0]
+    # path_data_dir = sorted(glob.glob(path_base + "tabular/*"))
+    # path_data_dir += sorted(
+    #    [p for p in glob.glob(path_base + "kg/*") if os.path.isdir(p)]
+    # )
+    # path_lm_eval_data_dir = path_base + "text_sampling/export_class_balanced"
+    # path_lm_eval_data_dir = path_base + "text_sampling/export_class_balanced_benchmark"
+    # path_lm_eval_data_dir = path_base + "text_sampling/export_standard"
+    path_lm_eval_data_dir = path_base + "text_sampling/export_standard_benchmark"
+    # path_lm_eval_data_dir = path_base + "text_sampling/export_inverse"
+
+    path_data_dir = [
+        # CLASS BALANCED DATASETS, set class_balanced = True !!!
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/ames_mutagenicity",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/aminoacids",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/BACE",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/BBBP",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bc5chem",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bc5disease",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_10",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_11",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_12",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_13",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_14",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_15",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_16",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_17",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_18",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_19",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_2",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_20",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_21",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_22",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_23",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_24",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_25",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_26",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_27",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_28",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_29",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_3",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_30",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_31",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_33",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_34",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_35",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_36",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_37",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_38",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_39",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_4",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_40",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_47",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_48",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_5",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_52",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_57",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_58",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_6",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_7",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_8",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_9",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bioavailability_ma_et_al",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/block_polymers_morphology",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/blood_brain_barrier_martins_et_al",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/caco2_wang",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/carcinogens",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cav3_t-type_calcium_channels_butkiewicz",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/chemcaption_fragments",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/chemcaption_rdkit",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/chemdner",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/choline_transporter_butkiewicz",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/clearance_astrazeneca",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/clintox",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp_p450_1a2_inhibition_veith_et_al",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp_p450_2c19_inhibition_veith_et_al",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp_p450_2c9_inhibition_veith_et_al",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp_p450_2d6_inhibition_veith_et_al",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp_p450_3a4_inhibition_veith_et_al",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp2c9_substrate_carbonmangels",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp2d6_substrate_carbonmangels",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp3a4_substrate_carbonmangels",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/drug_induced_liver_injury",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/flashpoint",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/formation_energies",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/freesolv",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/half_life_obach",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/herg_blockers",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/herg_central_at_10uM",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/herg_central_at_1uM",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/herg_central_inhib",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/herg_karim_et_al",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/hiv",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/human_intestinal_absorption",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/kcnq2_potassium_channel_butkiewicz",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/ld50_zhu",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/lipophilicity",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/m1_muscarinic_receptor_agonists_butkiewicz",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/m1_muscarinic_receptor_antagonists_butkiewicz",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/melting_points",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/mona",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/mp_anisotropy",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/mp_bulk_modulus",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/mp_shear_modulus",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_466",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_548",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_600",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_644",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_652",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_689",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_692",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_712",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_713",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_733",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_737",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_810",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_832",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_846",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_852",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_858",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_859",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/ncbi_disease",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/nr_ahr_tox21",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/nr_ar_lbd_tox21",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/nr_ar_tox21",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/nr_aromatase_tox21",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/nr_er_lbd_tox21",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/nr_er_tox21",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/nr_ppar_gamma_tox21",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/ocp",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/opv",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/oqmd",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/ord_rxn_smiles_yield_pred",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/ord_steps_yield",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/orexin1_receptor_butkiewicz",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/p_glycoprotein_inhibition_broccatelli_et_al",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/pampa_ncats",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/peptides_hemolytic",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/peptides_nonfouling",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/peptides_soluble",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/potassium_ion_channel_kir2_1_butkiewicz",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/qm8",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/qm9",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/qmof_gcmc",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/qmof_quantum",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/rhea_db_predictions",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/sarscov2_3clpro_diamond",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/sarscov2_vitro_touret",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/serine_threonine_kinase_33_butkiewicz",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/SIDER",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/skin_reaction",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/solubility_aqsoldb",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/sr_are_tox21",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/sr_atad5_tox21",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/sr_hse_tox21",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/sr_mmp_tox21",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/sr_p53_tox21",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/thermosol",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/tyrosyl-dna_phosphodiesterase_butkiewicz",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/uniprot_binding_sites_multiple",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/uniprot_organisms",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/volume_of_distribution_at_steady_state_lombardo_et_al",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bicerano_dataset",
+        # STANDARD SAMPLING DATASETS, set class_balanced = False !!!
+        "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/RedDB",
+        "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/mattermodeling_stackexchange",
+        "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/mp_descriptions",
+        "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/mp_self_supervised",
+        "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/rdkit_features",
+        "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/smiles_to_3d",
+        "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/uniprot_binding_sites_multiple",
+        # INVERSE DATASETS, set class_balanced = False !!!
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/inverse_1",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/inverse_2",
+        # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/inverse_3",
+    ]
+    # index = [i for i, x in enumerate(path_data_dir) if x.find("qmof_quantum") != -1][0]
     # print(index)
     # path_data_dir = path_data_dir[index:]
+    # path_data_dir = path_data_dir[index + 1 :]
     # path_data_dir = [path_data_dir[index]]
-    # path_data_dir = [
-    #        '/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bioavailability_ma_et_al',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/RedDB',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/SIDER',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/aminoacids',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/bc5chem',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/bc5disease',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/bicerano_dataset',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/block_polymers_morphology',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/buchwald_hartwig',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/chem_caption_smarts',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/chemdner',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/chemistry_stackexchange',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/compound_chebi_chebi_chebi_1',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/compound_chebi_chebi_chebi_2',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/drug_chebi_chebi_chebi',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/fda_adverse_reactions',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/formation_energies',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/h2_storage_materials',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/mattermodeling_stackexchange',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/melting_points',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/mofdscribe',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/mol2svg',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/moses',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/mp_anisotropy',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/mp_bulk_modulus',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/mp_descriptions',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/mp_self_supervised',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/mp_shear_modulus',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/ncbi_disease',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/nomad_structure',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/ocp',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/opv',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/oqmd',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/ord_masked',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/ord_predictions',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/ord_procedure_steps',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/ord_rxn_smiles_procedure',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/ord_rxn_smiles_yield_pred',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/ord_steps_yield',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/perovskite_db',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/physics_stackexchange',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/qm8',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/qm9',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/suzuki_miyaura_sach',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/uspto',
-    #        '/fsx/proj-chemnlp/micpie/chemnlp/data/tabular/uspto_yield',
-    #        ]
 
     for path in path_data_dir:
         # subselect one path
-        # if path.find("data/tabular/") == -1: continue
         # if path.find("data/kg/") == -1: continue
-        # if path.find("chembl33") != -1: continue
-        # if path.find("data/kg/compound_chebi") == -1: continue
-        # if path.find("data/tabular/cyp3a4_substrate_carbonmangels") == -1: continue
-        # if path.find("data/tabular/bio_ner") == -1: continue
-        # if path.find("rdkit_features") != -1: continue
+        # if path.find("data/tabular/") == -1:
+        #     continue
+        # if path.find("data/kg/") == -1: continue
+
+        # exclude data_clean.csv files with more than 1GB
+        # if path.find("rdkit_features") != -1:
+        #     continue
+        # if path.find("iupac_smiles") != -1:
+        #     continue
+        # if path.find("orbnet_denali") != -1:
+        #     continue
+        # if path.find("ord_masked") != -1:
+        #     continue
+        # if path.find("ord_predictions") != -1:
+        #     continue
+        # if path.find("chembl_v29") != -1:
+        #     continue
 
         print(f"\n###### {path}")
         path_meta = path + "/meta.yaml"
@@ -1179,33 +1369,8 @@ def apply_sampling_and_export(
             if "templates" in meta:
                 multiple_choice_rnd_symbols = ["", ".", ".)", ")", ":", "()", "[]"]
                 print(f"Running sampling for: {path}")
-                # uncomment to randomly sample from all templates and save the output to a single file
-                # TemplateSampler(
-                #    path,
-                #    path_lm_eval_data_dir,
-                #    multiple_choice_rnd_symbols=multiple_choice_rnd_symbols,
-                #    additional_templates=additional_templates,
-                #    benchmarking_templates=False,
-                #    multiple_choice_benchmarking_templates=False,
-                # ).apply_sampling_and_export()
-
-                # tempsamp = TemplateSampler(
-                #    path,
-                #    path_lm_eval_data_dir,
-                #    multiple_choice_rnd_symbols=multiple_choice_rnd_symbols,
-                #    additional_templates=additional_templates,
-                #    benchmarking_templates=False,
-                #    multiple_choice_benchmarking_templates=False,
-                # )
-                # for i, template in enumerate(
-                #    [t for t in meta["templates"] if "<EOI>" not in t]
-                # ):
-                #    print(f"\nRunning sampling for template {i}:\n{template}")
-                #    tempsamp.apply_sampling_and_export(
-                #        template_idx=i,
-                #        fn_suffix=i,
-                #    )
 
+                # CHUNKED TRAIN SAMPLING
                 chunksize = 1_000_000
                 path_data_csv = path + "/data_clean.csv"
                 with pd.read_csv(
@@ -1213,74 +1378,79 @@ def apply_sampling_and_export(
                 ) as reader:
                     chunk_idx = 0
                     for df_chunk in reader:
-                        tempsamp = TemplateSampler(
-                            path,
-                            df_chunk,
-                            path_lm_eval_data_dir,
-                            multiple_choice_rnd_symbols=multiple_choice_rnd_symbols,
-                            additional_templates=additional_templates,
-                            benchmarking_templates=False,
-                            multiple_choice_benchmarking_templates=False,
-                        )
-                        for i, template in enumerate(
-                            [t for t in meta["templates"] if "<EOI>" not in t]
+                        #        tempsamp = TemplateSampler(
+                        #            path,
+                        #            df_chunk,
+                        #            path_lm_eval_data_dir,
+                        #            multiple_choice_rnd_symbols=multiple_choice_rnd_symbols,
+                        #            additional_templates=additional_templates,
+                        #            benchmarking_templates=False,
+                        #            multiple_choice_benchmarking_templates=False,
+                        #        )
+                        #        for i, template in enumerate(
+                        #            [t for t in meta["templates"] if "<EOI>" not in t]
+                        #        ):
+                        #            print(f"\nRunning sampling for template {i}:\n{template}")
+                        #            tempsamp.apply_sampling_and_export(
+                        #                template_idx=i,
+                        #                fn_suffix=f"{chunk_idx}-{i}",
+                        #            )
+
+                        # STANDARD BENCHMARKING SAMPLING
+                        if any(["<EOI>" in t for t in meta["templates"]]):
+                            tempsamp = TemplateSampler(
+                                path,
+                                df_chunk,
+                                path_lm_eval_data_dir,
+                                multiple_choice_rnd_symbols=multiple_choice_rnd_symbols,
+                                additional_templates=additional_templates,
+                                benchmarking_templates=True,
+                                multiple_choice_benchmarking_templates=False,
+                            )
+                            for i, template in enumerate(
+                                [
+                                    t
+                                    for t in meta["templates"]
+                                    if "<EOI>" in t and "%multiple_choice_" not in t
+                                ]
+                            ):
+                                print(
+                                    f"\nRunning sampling for template {i}:\n{template}"
+                                )
+                                tempsamp.apply_sampling_and_export(
+                                    template_idx=i,
+                                    fn_suffix=f"{chunk_idx}-{i}",
+                                )
+
+                        # MULTIPLE CHOICE BENCHMARKING SAMPLING
+                        if any(
+                            [
+                                "<EOI>" in t and "%multiple_choice_" in t
+                                for t in meta["templates"]
+                            ]
                         ):
-                            print(f"\nRunning sampling for template {i}:\n{template}")
-                            tempsamp.apply_sampling_and_export(
-                                template_idx=i,
-                                fn_suffix=f"{chunk_idx}-{i}",
+                            tempsamp = TemplateSampler(
+                                path,
+                                df_chunk,
+                                path_lm_eval_data_dir,
+                                multiple_choice_rnd_symbols=multiple_choice_rnd_symbols,
+                                additional_templates=additional_templates,
+                                benchmarking_templates=True,
+                                multiple_choice_benchmarking_templates=True,
                             )
-                        chunk_idx += 1
+                            for i, template in enumerate(
+                                [
+                                    t
+                                    for t in meta["templates"]
+                                    if "<EOI>" in t and "%multiple_choice_" in t
+                                ]
+                            ):
+                                print(
+                                    f"\nRunning sampling for template {i}:\n{template}"
+                                )
+                                tempsamp.apply_sampling_and_export(
+                                    template_idx=i,
+                                    fn_suffix=f"{chunk_idx}-{i}",
+                                )
 
-                # if any(["<EOI>" in t for t in meta["templates"]]):
-                #    # uncomment to randomly sample from all templates and save the output to a single file
-                #    # TemplateSampler(
-                #    #     path,
-                #    #     path_lm_eval_data_dir,
-                #    #     multiple_choice_rnd_symbols=multiple_choice_rnd_symbols,
-                #    #     additional_templates=additional_templates,
-                #    #     benchmarking_templates=True,
-                #    #     multiple_choice_benchmarking_templates=False,
-                #    # ).apply_sampling_and_export()
-
-                #    tempsamp = TemplateSampler(
-                #        path,
-                #        path_lm_eval_data_dir,
-                #        multiple_choice_rnd_symbols=multiple_choice_rnd_symbols,
-                #        additional_templates=additional_templates,
-                #        benchmarking_templates=True,
-                #        multiple_choice_benchmarking_templates=False,
-                #    )
-                #    for i, template in enumerate(
-                #        [
-                #            t
-                #            for t in meta["templates"]
-                #            if "<EOI>" in t and "%multiple_choice_" not in t
-                #        ]
-                #    ):
-                #        print(f"\nRunning sampling for template {i}:\n{template}")
-                #        tempsamp.apply_sampling_and_export(
-                #            template_idx=i,
-                #            fn_suffix=i,
-                #        )
-
-                #    if any(["%multiple_choice_" in t for t in meta["templates"]]):
-                #        TemplateSampler(
-                #            path,
-                #            path_lm_eval_data_dir,
-                #            multiple_choice_rnd_symbols=multiple_choice_rnd_symbols,
-                #            additional_templates=additional_templates,
-                #            benchmarking_templates=True,
-                #            multiple_choice_benchmarking_templates=True,
-                #        ).apply_sampling_and_export()
-
-                #        # for i, s in enumerate(multiple_choice_rnd_symbols):
-                #        #    TemplateSampler(
-                #        #        path,
-                #        #        path_lm_eval_data_dir,
-                #        #        multiple_choice_rnd_symbols=[s],
-                #        #        additional_templates=additional_templates,
-                #        #        benchmarking_templates=True,
-                #        #        multiple_choice_benchmarking_templates=True,
-                #        #        multiple_choice_benchmarking_format=i,
-                #        #    ).apply_sampling_and_export()
+                        chunk_idx += 1

From 1a844a7065cd5821c216a7ecaafeb183e0c3e0e4 Mon Sep 17 00:00:00 2001
From: Kevin M Jablonka <32935233+kjappelbaum@users.noreply.github.com>
Date: Sat, 10 Aug 2024 08:53:13 -0700
Subject: [PATCH 2/2] fix: in post-processing of split rewrite file (#531)

---
 data/postprocess_split.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/data/postprocess_split.py b/data/postprocess_split.py
index ef8999b0b..7ef160a54 100644
--- a/data/postprocess_split.py
+++ b/data/postprocess_split.py
@@ -4,6 +4,9 @@
 It also merges files that have been created by `dask` if they are chunks of one large dataset.
 
 This script needs to be run after the splitting script.
+
+An independent check (that does not rewrite files) is `check_smiles_split.py`;
+it also checks for compliance with the predetermined files.
 """
 
 import os
@@ -143,7 +146,7 @@ def process_file(file: Union[str, Path], id_cols):
         # appear multiple times
         ddf = read_ddf(file)
         ddf = ddf.drop_duplicates(subset=id_cols)
-        ddf.to_csv("data_clean-{*}.csv", index=False)
+        ddf.to_csv(os.path.join(dir, "data_clean-{*}.csv"), index=False)
         merge_files(dir)
 
     else:
@@ -154,7 +157,7 @@ def process_file(file: Union[str, Path], id_cols):
         for id in id_cols:
             test_smiles.extend(df[df["split"] == "test"][id].to_list())
             val_smiles.extend(df[df["split"] == "valid"][id].to_list())
-
+            df.drop_duplicates(subset=[id], inplace=True)
         test_smiles = set(test_smiles)
         val_smiles = set(val_smiles)
 
@@ -184,7 +187,7 @@ def process_file(file: Union[str, Path], id_cols):
                 len(this_test_smiles.intersection(this_val_smiles)) == 0
             ), f"Smiles in test and valid for {id}"
 
-        df.to_csv("data_clean.csv", index=False)
+        df.to_csv(os.path.join(dir, "data_clean.csv"), index=False)
 
 
 def process_all_files(data_dir):