experiments_launcher.py
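"""Launcher for the LUS ordinal-classification experiments.

Parses the command-line arguments, builds the shared ExperimentSets
configuration, and runs every experiment listed in the given JSON file
through a Stratified Group K-Fold Cross-Validation with an optional
hyperparameter grid search followed by the network-training holdouts.
"""
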
import os
import argparse

import utilities

# set the TensorFlow log level (this must happen before TensorFlow is imported)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from experiment_sets import ExperimentSets
from experiment import Experiment


def main():
    parser = argparse.ArgumentParser(description="LUS Ordinal Classification")
    parser.add_argument("--exps_json", type=str, required=True, help="JSON file containing the experiments to be performed")
    parser.add_argument("--dataset", type=str, required=True, help="dataset folder")
    parser.add_argument("--results_dir", type=str, default="results/", help="directory used to store the results")
    parser.add_argument("--hpv_splits", type=int, default=3, help="number of splits for the grid-search phase")
    parser.add_argument("--hpt_splits", type=int, default=3, help="number of splits to train models with")
    parser.add_argument("--no_gridsearch", action='store_true', help="skip grid searching the best parameters")
    parser.add_argument("--mixedp", action='store_true', help="enable 'float16/32' mixed precision")
    parser.add_argument("--xla", action='store_true', help="enable XLA (Accelerated Linear Algebra)")
    parser.add_argument("--workers", type=int, default=1, help="processes employed when using process-based threading")
    parser.add_argument("--shuffle_bsize", type=int, default=300, help="buffer size used to shuffle the training dataset")
    parser.add_argument("--max_qsize", type=int, default=300, help="maximum size of the generator queue")
    parser.add_argument("--verbose", type=int, default=1, help="verbosity level for the whole experiment")
    parser.add_argument("--seed", type=int, default=42, help="seed used for reproducibility")
    args = parser.parse_args()

    print("\n★★★★★★★★★★★★★★★★★★★★★★★★★★★★ START ★★★★★★★★★★★★★★★★★★★★★★★★★★★★\n")

    # initialize the set of experiments with the global settings
    experiments_set = ExperimentSets(exps_json_path=args.exps_json,
                                     dataset_dir=args.dataset,
                                     results_dir=args.results_dir,
                                     mp_xla=(args.mixedp, args.xla),
                                     workers=args.workers,
                                     shuffle_bsize=args.shuffle_bsize,
                                     max_qsize=args.max_qsize,
                                     verbose=args.verbose,
                                     seed=args.seed)
    # build the set and get the global configuration
    set_config = experiments_set.build()
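    # `mp_xla` toggles mixed precision and XLA compilation inside ExperimentSets.
    # A minimal sketch of what enabling them usually looks like in TensorFlow,
    # assuming ExperimentSets does the equivalent (not shown in this file):
    #   tf.keras.mixed_precision.set_global_policy('mixed_float16')
    #   tf.config.optimizer.set_jit(True)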

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ EXPERIMENTS LOOP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    for exp_id, exp_settings in enumerate(experiments_set.exps, start=1):
        # initialize the experiment with its own configuration
        experiment = Experiment(settings=exp_settings, set_config=set_config)
        # build the experiment; it returns the experiment name
        exp_name = experiment.build(args.no_gridsearch)
        # log and print the built experiment name together with the experiments counter
        message = f"◆ experiment {exp_name} loaded [{exp_id}/{experiments_set.tot_exps}]"
        utilities.log_this(experiments_set.logs_path, message)

        # perform the Stratified Group K-Fold Cross-Validation
        exp_ds = experiment.dataset
        num_folds = exp_ds.sgkfold(num_folds=experiment.settings['folds'], shuffle_folds=True)
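        # For reference: a stratified group k-fold keeps the label distribution
        # balanced across folds while never splitting a group across folds
        # (here the samples are movies and the groups appear to be patients).
        # A minimal sketch, assuming sgkfold wraps the equivalent of
        # scikit-learn's StratifiedGroupKFold (an assumption, not confirmed here):
        #   from sklearn.model_selection import StratifiedGroupKFold
        #   sgkf = StratifiedGroupKFold(n_splits=num_folds, shuffle=True)
        #   exp_ds.folds = list(sgkf.split(movies, labels, groups=groups))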

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ K-FOLDING ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        for curr_fold, (train_folds, test_fold) in enumerate(exp_ds.folds, start=1):
            print(f"\n~~~~~~~~~~~~~~~ FOLD {curr_fold}/{num_folds} ~~~~~~~~~~~~~~~")
            utilities.log_this(experiments_set.logs_path, f"fold {curr_fold}/{num_folds}", p=False)
            # set the current fold in the experiment
            experiment.set_current_fold(curr_fold, train_folds, test_fold)
            # extract the groups and labels of the train folds
            train_groups = [exp_ds.groups[movie] for movie in train_folds]
            train_labels = [exp_ds.labels[movie] for movie in train_folds]
            # extract the training and test patients from the movies (and save the split to a JSON file)
            train_pats, test_pats = experiment.from_fold_split_to_pats(train_folds, test_fold)
            # extract the fold labels
            _, fold_train_y = exp_ds.build_tfrecord_from_patients(list(train_pats))
            _, fold_test_y = exp_ds.build_tfrecord_from_patients(list(test_pats))
            # plot the split charts of the current fold
            utilities.plot_pats_fold(experiment, train_pats, test_pats)
            utilities.plot_fdistr_per_class(experiment, fold_train_y, fold_test_y, phase='fold')
            # sample a label for each patient of the train folds based on the movie-level ranking
            labels_pats = exp_ds.sample_score_per_patient(train_groups, train_labels)
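            # (Each patient can own several movies, each with its own score, so a
            # single representative label per patient is sampled here; the
            # stratified shuffle splits below stratify on these patient labels.)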

            # ~~~~~~~~~~~~~~~~~~~~~~~~ GRID SEARCH HOLDOUTS ~~~~~~~~~~~~~~~~~~~~~~~~~
            if not args.no_gridsearch:
                # perform the hyperparameter grid search, if enabled
                hpv_sss = exp_ds.n_strat_shuffle_split(train_pats, labels_pats, val_ratio=0.15, splits=args.hpv_splits, state=f"{curr_fold}_hpv")
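                # A sketch of the split above, assuming n_strat_shuffle_split wraps
                # the equivalent of scikit-learn's StratifiedShuffleSplit and maps
                # the `state` string to a random seed (both are assumptions):
                #   from sklearn.model_selection import StratifiedShuffleSplit
                #   sss = StratifiedShuffleSplit(n_splits=args.hpv_splits, test_size=0.15)
                #   hpv_sss = sss.split(list(train_pats), labels_pats)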
                for hpv_curr_split, (hpv_train, hpv_val) in enumerate(hpv_sss, start=1):
                    print(f"******** Grid Search Holdout {hpv_curr_split}/{args.hpv_splits} *******")
                    utilities.log_this(experiments_set.logs_path, f"grid search holdout {hpv_curr_split}/{args.hpv_splits}", p=False)
                    # map the split indices back to patients
                    hpv_train = [train_pats[pat] for pat in hpv_train]
                    hpv_val = [train_pats[pat] for pat in hpv_val]
                    # set the current holdout for the hyperparameter grid search
                    experiment.set_current_hpv_holdout(hpv_curr_split, hpv_train, hpv_val)
                    # prepare the train and val sets from the holdout patients
                    experiment.prepare_hpv_sets()
                    # compute the grid of all hyperparameter combinations
                    hpv_grid = experiment.get_hyperparameters_grid()
                    # grid searching...
                    for hp_iter, hyperparameters in enumerate(hpv_grid, start=1):
                        print(f"testing HPs [{hp_iter}/{len(hpv_grid)}]")
                        experiment.evaluate_hyperparams(hyperparameters, epochs=6)
                    # save the MAE values of each parameter combination for the current holdout
                    experiment.splits_mae_scores[str(hpv_curr_split)] = experiment.mae_scores
                # find the best hyperparameter combination from the grid search
                best_params = experiment.compute_grid_search_results()
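                # (compute_grid_search_results presumably aggregates the MAE of each
                # combination across the holdouts and returns the combination with
                # the lowest error; this is a reading of the code, not confirmed.)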
            else:
                # load the default parameters based on the experiment's network model
                exp_sett = experiment.settings
                if experiment.settings['nn_model'] == 'obd':
                    best_params = {'batch_size': exp_sett["batch_size"][0],
                                   'dropout': exp_sett["dropout"][0],
                                   'learning_rate': exp_sett["learning_rate"][0],
                                   'hidden_size': exp_sett["hidden_size"][0]}
                elif experiment.settings['nn_model'] == 'clm':
                    best_params = {'batch_size': exp_sett["batch_size"][0],
                                   'dropout': exp_sett["dropout"][0],
                                   'learning_rate': exp_sett["learning_rate"][0],
                                   'link_function': exp_sett["link_function"][0],
                                   'use_tau': exp_sett["use_tau"][0]}
                else:
                    best_params = {'batch_size': exp_sett["batch_size"][0],
                                   'dropout': exp_sett["dropout"][0],
                                   'learning_rate': exp_sett["learning_rate"][0]}
            print(best_params)

            # ~~~~~~~~~~~~~~~~~~~~~~ NETWORK TRAINING HOLDOUTS ~~~~~~~~~~~~~~~~~~~~~~
            hpt_sss = exp_ds.n_strat_shuffle_split(train_pats, labels_pats, val_ratio=0.15, splits=args.hpt_splits, state=f"{curr_fold}_hpt")
            for hpt_curr_split, (hpt_train, hpt_val) in enumerate(hpt_sss, start=1):
                print(f"****** Model Training Holdout {hpt_curr_split}/{args.hpt_splits} ******")
                utilities.log_this(experiments_set.logs_path, f"training holdout {hpt_curr_split}/{args.hpt_splits}", p=False)
                # map the split indices back to patients
                hpt_train = [train_pats[pat] for pat in hpt_train]
                hpt_val = [train_pats[pat] for pat in hpt_val]
                # set the current training holdout in the experiment
                experiment.set_current_hpt_holdout(hpt_curr_split, hpt_train, hpt_val)
                # prepare the train and val sets from the holdout patients
                experiment.prepare_hpt_sets()
                # plot the per-class distribution of the current training holdout
                utilities.plot_fdistr_per_class(experiment, phase='hpt')
                # debug utilities: preview the augmented training batches
                # hpt_train = exp_ds.generate_tfrset(experiment.hpt_train, batch_size=experiment.settings['batch_size'][0], shuffle=True, augment=True)
                # exp_ds.plot_set_batches(hpt_train, experiment.settings['batch_size'][0])
                # continue
                # train the network with the best parameters on the holdout and get the training history
                hpt_model, hpt_history = experiment.hpt_train_network(best_params)
                # save the training graphs
                experiment.nn_train_graphs(hpt_history)
                # test the model on the test fold (from the initial k-fold)
                experiment.hpt_test_network(test_pats, hpt_model, best_params)
            print("\n")

    utilities.log_this(experiments_set.logs_path, "End", p=False)
    print("★★★★★★★★★★★★★★★★★★★★★★★★★★★★★ END ★★★★★★★★★★★★★★★★★★★★★★★★★★★★★\n")


if __name__ == "__main__":
    main()
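
# Example invocation (a sketch; the JSON and dataset paths are placeholders):
#   python experiments_launcher.py --exps_json experiments.json --dataset data/ \
#       --results_dir results/ --mixedp --xla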