diff --git a/data/text_sampling/text_sampling.py b/data/text_sampling/text_sampling.py index 2a7de61d4..a877fcfb8 100644 --- a/data/text_sampling/text_sampling.py +++ b/data/text_sampling/text_sampling.py @@ -1,4 +1,4 @@ -import glob +# import glob import math import os.path import random @@ -1114,176 +1114,177 @@ def apply_sampling_and_export( if __name__ == "__main__": path_base = __file__.replace("text_sampling/text_sampling.py", "") - path_data_dir = sorted(glob.glob(path_base + "tabular/*")) - path_data_dir += sorted( - [p for p in glob.glob(path_base + "kg/*") if os.path.isdir(p)] - ) + # path_data_dir = sorted(glob.glob(path_base + "tabular/*")) + # path_data_dir += sorted( + # [p for p in glob.glob(path_base + "kg/*") if os.path.isdir(p)] + # ) # path_lm_eval_data_dir = path_base + "text_sampling/export_class_balanced" - path_lm_eval_data_dir = path_base + "text_sampling/export_class_balanced_benchmark" + # path_lm_eval_data_dir = path_base + "text_sampling/export_class_balanced_benchmark" # path_lm_eval_data_dir = path_base + "text_sampling/export_standard" + path_lm_eval_data_dir = path_base + "text_sampling/export_standard_benchmark" # path_lm_eval_data_dir = path_base + "text_sampling/export_inverse" path_data_dir = [ # CLASS BALANCED DATASETS, set class_balanced = True !!! - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/ames_mutagenicity", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/aminoacids", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/BACE", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/BBBP", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bc5chem", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bc5disease", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_10", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_11", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_12", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_13", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_14", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_15", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_16", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_17", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_18", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_19", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_2", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_20", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_21", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_22", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_23", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_24", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_25", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_26", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_27", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_28", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_29", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_3", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_30", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_31", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_33", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_34", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_35", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_36", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_37", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_38", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_39", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_4", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_40", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_47", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_48", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_5", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_52", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_57", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_58", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_6", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_7", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_8", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_9", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bioavailability_ma_et_al", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/block_polymers_morphology", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/blood_brain_barrier_martins_et_al", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/caco2_wang", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/carcinogens", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cav3_t-type_calcium_channels_butkiewicz", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/chemcaption_fragments", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/chemcaption_rdkit", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/chemdner", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/choline_transporter_butkiewicz", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/clearance_astrazeneca", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/clintox", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp_p450_1a2_inhibition_veith_et_al", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp_p450_2c19_inhibition_veith_et_al", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp_p450_2c9_inhibition_veith_et_al", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp_p450_2d6_inhibition_veith_et_al", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp_p450_3a4_inhibition_veith_et_al", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp2c9_substrate_carbonmangels", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp2d6_substrate_carbonmangels", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp3a4_substrate_carbonmangels", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/drug_induced_liver_injury", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/flashpoint", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/formation_energies", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/freesolv", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/half_life_obach", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/herg_blockers", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/herg_central_at_10uM", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/herg_central_at_1uM", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/herg_central_inhib", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/herg_karim_et_al", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/hiv", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/human_intestinal_absorption", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/kcnq2_potassium_channel_butkiewicz", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/ld50_zhu", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/lipophilicity", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/m1_muscarinic_receptor_agonists_butkiewicz", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/m1_muscarinic_receptor_antagonists_butkiewicz", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/melting_points", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/mona", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/mp_anisotropy", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/mp_bulk_modulus", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/mp_shear_modulus", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_466", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_548", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_600", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_644", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_652", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_689", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_692", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_712", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_713", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_733", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_737", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_810", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_832", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_846", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_852", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_858", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_859", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/ncbi_disease", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/nr_ahr_tox21", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/nr_ar_lbd_tox21", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/nr_ar_tox21", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/nr_aromatase_tox21", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/nr_er_lbd_tox21", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/nr_er_tox21", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/nr_ppar_gamma_tox21", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/ocp", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/opv", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/oqmd", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/ord_rxn_smiles_yield_pred", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/ord_steps_yield", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/orexin1_receptor_butkiewicz", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/p_glycoprotein_inhibition_broccatelli_et_al", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/pampa_ncats", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/peptides_hemolytic", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/peptides_nonfouling", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/peptides_soluble", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/potassium_ion_channel_kir2_1_butkiewicz", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/qm8", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/qm9", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/qmof_gcmc", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/qmof_quantum", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/rhea_db_predictions", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/sarscov2_3clpro_diamond", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/sarscov2_vitro_touret", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/serine_threonine_kinase_33_butkiewicz", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/SIDER", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/skin_reaction", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/solubility_aqsoldb", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/sr_are_tox21", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/sr_atad5_tox21", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/sr_hse_tox21", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/sr_mmp_tox21", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/sr_p53_tox21", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/thermosol", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/tyrosyl-dna_phosphodiesterase_butkiewicz", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/uniprot_binding_sites_multiple", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/uniprot_organisms", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/volume_of_distribution_at_steady_state_lombardo_et_al", - "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bicerano_dataset", - # STANDARD SAMPLING DATASETS, set class_balanced = False !!! - # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/RedDB", - # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/mattermodeling_stackexchange", - # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/mp_descriptions", - # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/mp_self_supervised", - # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/rdkit_features", - # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/smiles_to_3d", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/ames_mutagenicity", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/aminoacids", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/BACE", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/BBBP", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bc5chem", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bc5disease", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_10", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_11", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_12", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_13", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_14", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_15", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_16", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_17", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_18", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_19", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_2", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_20", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_21", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_22", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_23", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_24", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_25", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_26", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_27", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_28", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_29", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_3", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_30", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_31", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_33", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_34", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_35", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_36", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_37", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_38", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_39", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_4", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_40", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_47", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_48", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_5", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_52", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_57", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_58", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_6", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_7", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_8", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bio_ner_9", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bioavailability_ma_et_al", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/block_polymers_morphology", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/blood_brain_barrier_martins_et_al", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/caco2_wang", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/carcinogens", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cav3_t-type_calcium_channels_butkiewicz", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/chemcaption_fragments", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/chemcaption_rdkit", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/chemdner", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/choline_transporter_butkiewicz", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/clearance_astrazeneca", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/clintox", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp_p450_1a2_inhibition_veith_et_al", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp_p450_2c19_inhibition_veith_et_al", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp_p450_2c9_inhibition_veith_et_al", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp_p450_2d6_inhibition_veith_et_al", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp_p450_3a4_inhibition_veith_et_al", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp2c9_substrate_carbonmangels", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp2d6_substrate_carbonmangels", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/cyp3a4_substrate_carbonmangels", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/drug_induced_liver_injury", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/flashpoint", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/formation_energies", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/freesolv", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/half_life_obach", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/herg_blockers", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/herg_central_at_10uM", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/herg_central_at_1uM", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/herg_central_inhib", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/herg_karim_et_al", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/hiv", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/human_intestinal_absorption", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/kcnq2_potassium_channel_butkiewicz", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/ld50_zhu", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/lipophilicity", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/m1_muscarinic_receptor_agonists_butkiewicz", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/m1_muscarinic_receptor_antagonists_butkiewicz", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/melting_points", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/mona", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/mp_anisotropy", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/mp_bulk_modulus", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/mp_shear_modulus", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_466", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_548", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_600", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_644", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_652", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_689", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_692", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_712", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_713", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_733", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_737", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_810", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_832", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_846", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_852", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_858", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/MUV_859", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/ncbi_disease", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/nr_ahr_tox21", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/nr_ar_lbd_tox21", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/nr_ar_tox21", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/nr_aromatase_tox21", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/nr_er_lbd_tox21", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/nr_er_tox21", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/nr_ppar_gamma_tox21", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/ocp", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/opv", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/oqmd", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/ord_rxn_smiles_yield_pred", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/ord_steps_yield", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/orexin1_receptor_butkiewicz", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/p_glycoprotein_inhibition_broccatelli_et_al", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/pampa_ncats", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/peptides_hemolytic", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/peptides_nonfouling", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/peptides_soluble", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/potassium_ion_channel_kir2_1_butkiewicz", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/qm8", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/qm9", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/qmof_gcmc", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/qmof_quantum", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/rhea_db_predictions", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/sarscov2_3clpro_diamond", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/sarscov2_vitro_touret", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/serine_threonine_kinase_33_butkiewicz", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/SIDER", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/skin_reaction", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/solubility_aqsoldb", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/sr_are_tox21", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/sr_atad5_tox21", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/sr_hse_tox21", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/sr_mmp_tox21", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/sr_p53_tox21", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/thermosol", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/tyrosyl-dna_phosphodiesterase_butkiewicz", # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/uniprot_binding_sites_multiple", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/uniprot_organisms", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/volume_of_distribution_at_steady_state_lombardo_et_al", + # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/bicerano_dataset", + # STANDARD SAMPLING DATASETS, set class_balanced = False !!! + "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/RedDB", + "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/mattermodeling_stackexchange", + "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/mp_descriptions", + "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/mp_self_supervised", + "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/rdkit_features", + "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/smiles_to_3d", + "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/uniprot_binding_sites_multiple", # INVERSE DATASETS, set class_balanced = False !!! # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/inverse_1", # "/weka/proj-chemnlp/micpie/chemnlp/data/tabular/inverse_2",