import argparse
import numpy as np
import seaborn as sns

from analysis_utils.analyze_utils import analyze_clustering, analyze_weights_n_norms, analyze_matrix
from analysis_utils.calcs import calc_spectral_clustering, \
    calc_clustering_metrics, create_matrix, find_num_bins
from analysis_utils.data_utils import load_matrix, save_matrix, get_models_weights
from analysis_utils.plots import plot_clusters
from analysis_utils.saphra_code import barrier_height, connectivity_gap, get_clusters, get_sc_centroid_dists
from analysis_utils.constants import *
from utils.model_loading_utils import ModelLoadingInfo, load_model
from clearml import Task

sns.set_theme()


def run_saphra_analysis(datasets_and_seeds_str, data_num2name):
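    """
    Exploratory run of the saphra_code connectivity analysis on a few hard-coded MNLI loss curves:
    prints barrier height and connectivity gap per curve, builds a pairwise connectivity-gap matrix
    and clusters it with the saphra_code helpers, then re-clusters a Gaussian-kernel similarity
    version of the same matrix with this repo's spectral clustering utilities.
    """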
    # ------------------------------- saphra code
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #
    # xy_min = (0, 1)
    # xy_max = (1, 0)
    # coef_samples = np.linspace(xy_min, xy_max, args.n_sample + 2).tolist()
    # columns = ["point_num", "loss"]
    # for k in range(args.n_sample + 2):
    #     coeffs_t = coef_samples[k]
    #     print(f'{coeffs_t}')
    #     linear_comb(w1, w2, coeffs_t[0], coeffs_t[1], model)
    #     metrics = eval(input_target_loader, model,
    #                    criterion, pred_fn, metric)

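    # Hard-coded loss values for three MNLI model pairs, presumably sampled along the linear
    # interpolation paths evaluated by the commented-out loop above.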
    v1 = np.array([0.48, 0.43, 0.41, 0.405, 0.41, 0.408, 0.05, 0.41, 0.43, 0.48])  # mnli, gen-to-gen
    v2 = np.array([0.48, 0.43, 0.42, 0.445, 0.5, 0.51, 0.46, 0.44, 0.45, 0.5])  # mnli, gen-to-heur
    v3 = np.array([0.51, 0.45, 0.425, 0.43, 0.45, 0.45, 0.43, 0.425, 0.45, 0.5])  # mnli, heur-to-heur

    for losses in [v1, v2, v3]:
        bh_metric = barrier_height(losses)
        cg_metric = connectivity_gap(losses)
        print(f'bh: {bh_metric}')
        print(f'cg: {cg_metric}\n')

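    # Build a symmetric matrix over the three models: off-diagonal entries hold the connectivity
    # gap of the loss curve connecting the pair, and the diagonal is set to 1.0.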
    dist_matrix = np.zeros((len([v1, v2, v3]), len([v1, v2, v3])))
    models_losses = [[1.0, v1, v3], [v1, 1.0, v2], [v3, v2, 1.0]]
    for i in range(len(models_losses)):
        for j in range(len(models_losses)):
            if i == j:
                dist_matrix[i][j] = 1.0
            else:
                dist_matrix[i][j] = connectivity_gap(models_losses[i][j])

    # dist_matrix = np.array([[1.0, connectivity_gap(v1)], [connectivity_gap(v2), 1.0]])

    print('dist matrix:\n', dist_matrix)

    clusters = get_clusters(dist_matrix, 2)

    dists = get_sc_centroid_dists(dist_matrix)
    print('centroid dists:\n', dists)
    # indices = [[acc_keys.index(k) for k in cluster] for cluster in clusters]

    # clusters = [[suf_ordered_models[idx] for idx in cluster]
    #             for cluster in clusters]

    print("Clusters found:", clusters)

    # my clustering code
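    # Convert distances to similarities with a Gaussian (RBF) kernel, sigma = 1.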
    sim_matrix = np.exp(-dist_matrix ** 2 / (2. * 1 ** 2))
    models_names = [f'v{i}' for i in range(1, len(models_losses) + 1)]
    sim_labels = calc_spectral_clustering(sim_matrix, models_names, num_clusters=2,
                                          affinity='precomputed')
    metrics_df = calc_clustering_metrics(sim_matrix, sim_labels, datasets_and_seeds_str)
    plot_clusters(sim_labels, sim_matrix, datasets_and_seeds_str, num_dims=2, reduction_method='tsne',
                  matrix_method='cosine similarity', title_add='', metrics_df=metrics_df)


def run_analysis(depended_expr_dirs, base_model_name, desired_datasets, weights_reference_point,
                 weights_normalization_type, norm_thr, matrix_type, dim_reduction_method, save_matrix_flag,
                 with_base_model: bool, model_paths=None, is_classification=True, to_cluster=True,
                 matrix_file_name=None, return_matrix=False, clearml_task=None, dataset_group='GLUE_AND_SUPER_GLUE'):
    """
    Compute (or load) a pairwise model similarity/distance matrix and optionally cluster and plot it.

    :param matrix_file_name: file name of a ready matrix to load, or None to compute the matrix
        from the model weights found under depended_expr_dirs.
    :return: (matrix, models_names) if return_matrix is True, otherwise None.
    """
    if model_paths is None:
        model_paths = []
    print('\nGetting data')
    if matrix_file_name:  # load ready matrix
        print(f'\nLoading matrix from {matrix_file_name}')
        matrix, models_names = load_matrix(matrix_file_name)
        data_num2name = {num.split('_')[-1]: DATA_NUM2NAME[dataset_group][num.split('_')[-1]] for num in models_names}
        short_models_names = [s.replace('seed_', 's').replace('data', 'd') for s in models_names]
        short_models_names = [s.split('d_')[0] + data_num2name[s.split('d_')[-1]] for s in short_models_names]
        matrix_prefix = matrix_file_name.split('/')[-1].split('_')[0]
        matrix_type = {'sim': 'cosine similarity', 'euc': 'euclidean distance', 'mi': 'mutual information'}.get(matrix_prefix)
    else:  # get data
        print('\nCalculating matrix')
        if isinstance(base_model_name, str):  # needs to load model
            base_model_name = load_model(ModelLoadingInfo(classification=is_classification, name=base_model_name,
                                                          from_tf=False, tokenizer_name=base_model_name))

        models_weights, models_names = get_models_weights(depended_expr_dirs, base_model_name, desired_datasets,
                                                          is_classification, model_paths)

        short_models_names = [s.replace('seed_', 's').replace('data', 'd') for s in models_names]  # seed_0_data_1 -> s0_d_1
        if models_names[0].split('_')[-1].isdigit():
            data_num2name = {name.split('_')[-1]: DATA_NUM2NAME[dataset_group][name.split('_')[-1]] for name in models_names}
            short_models_names = [s.split('d_')[0] + data_num2name[s.split('d_')[-1]] for s in short_models_names]  # s0_mnli

        # add base model as a model
        if with_base_model:
            # assert weights_reference_point != 'base_model', 'reference point is base_model, but base model is one of the models'
            models_weights.append(tuple(base_model_name.base_model.parameters()))
            short_models_names.append('s0_basemodel')
            models_names.append('base_model')

        flat_models_weights_diff = analyze_weights_n_norms(models_weights, short_models_names, base_model_name,
                                                           weights_reference_point, weights_normalization_type, norm_thr)
        matrix = create_matrix(flat_models_weights_diff, matrix_type)
        matrix = matrix[0]  # keep only the matrix itself, the first element of create_matrix's return value
        matrix = [[float(d) for d in data] for data in matrix]
        if save_matrix_flag:
            matrix_type2short = {'cosine similarity': 'sim', 'euclidean distance': 'euc', 'mutual information': 'mi'}
            s_matrix_type = matrix_type2short.get(matrix_type, matrix_type)
            s_models_names = 'glue_super_glue_70' if not desired_datasets else '_'.join(desired_datasets)
            num_seeds = len(set(name.split('_')[0] for name in short_models_names)) - (1 * with_base_model)
            save_matrix_file_name = f'{s_matrix_type}_matrix_{s_models_names}_seeds_{num_seeds}.csv'
            save_matrix(matrix, save_matrix_file_name, models_names)

    if clearml_task:
        clearml_task.upload_artifact('matrix', matrix)
        clearml_task.upload_artifact('models names', models_names)
        if save_matrix_flag:
            clearml_task.upload_artifact('matrix save file path', save_matrix_file_name)

    if to_cluster:
        print('\nCluster models')
        is_sim = matrix_type in ['cosine similarity', 'mutual information', 'cka']
        analyze_matrix(matrix, short_models_names, is_sim)
        analyze_clustering(matrix, is_sim, short_models_names, dim_reduction_method, matrix_type, with_base_model=with_base_model)

    if return_matrix:
        return matrix, models_names


def run_find_num_bins(bins_range, depended_expr_dirs, base_model_name, desired_datasets, is_classification, model_paths,
                      weights_reference_point, weights_normalization_type, norm_thr, dataset_group='GLUE_AND_SUPER_GLUE'):
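    """
    Search for the best number of histogram bins (used when building the mutual-information matrix)
    by computing the flattened weight differences for the selected models and running find_num_bins
    over bins_range.
    """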

    models_weights, models_names = get_models_weights(depended_expr_dirs, base_model_name, desired_datasets,
                                                      is_classification, model_paths)
    data_num2name = {name.split('_')[-1]: DATA_NUM2NAME[dataset_group][name.split('_')[-1]] for name in models_names}
    short_models_names = [s.replace('seed_', 's').replace('data', 'd') for s in models_names]
    short_models_names = [s.split('d_')[0] + data_num2name[s.split('d_')[-1]] for s in short_models_names]
    flat_models_weights_diff = analyze_weights_n_norms(models_weights, short_models_names, base_model_name,
                                                       weights_reference_point, weights_normalization_type, norm_thr)

    best_num_bins = find_num_bins(flat_models_weights_diff, short_models_names, bins_range)
    print(best_num_bins)


if __name__ == '__main__':

    # get parser args
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", action="store_true")
    # store_true instead of type=bool: argparse's type=bool treats any non-empty string (even "False") as True
    parser.add_argument("--is_local_run", action="store_true")
    args, _ = parser.parse_known_args()

    if args.debug:
        from cvar_pyutils.debugging_tools import set_remote_debugger
        debug_ip = None  # '9.148.203.20'  # os.environ.get('SSH_CONNECTION', None)  # this is the default value for debug_ip
        debug_port = 12345  # this is the default value for debug_port
        set_remote_debugger(debug_ip, debug_port)

    # Define depended experiment dirs
    if args.is_local_run:
        glue_template_expr_dir = '/Users/almoggueta/data/fusion/outputs_almog/TrainIterateOverDataset_'
    else:
        glue_template_expr_dir = '/dccstor/fuse/outputs_almog/TrainIterateOverDataset_'

    seeds = range(1, 6)  # seeds 1-5 are for roberta with weight_decay=0.01, 6-10 for roberta with weight_decay=0.0
    glue_all_seeds_expr_dir = [glue_template_expr_dir + str(seed) for seed in seeds]
    depended_expr_dirs = glue_all_seeds_expr_dir

    # get base model
    base_model_name = 'roberta-base'
    base_model = load_model(
        ModelLoadingInfo(classification=True, name=base_model_name, from_tf=False, tokenizer_name=base_model_name))

    model_paths = []

    desired_datasets = ['mnli', 'sst2']  # , 'cola', 'qqp']

    matrix_type = 'cosine similarity'  # 'euclidean distance'  # 'mutual information'
    # run analysis
    # analyze_models(depended_expr_dirs,
    #                model_paths, base_model, weights_reference_point='base_model', weights_normalization_type=None,
    #                is_classification=True, desired_datasets=desired_datasets)


    run_analysis_kwargs = {
        'depended_expr_dirs': depended_expr_dirs,
        'base_model_name': base_model,
        'desired_datasets': desired_datasets,
        'weights_reference_point': 'avg weights',  # or 'base_model'
        'weights_normalization_type': None,
        'norm_thr': 0,
        'matrix_type': matrix_type,
        'dim_reduction_method': 'tsne',
        'save_matrix_flag': False,
        'with_base_model': False,
        'model_paths': model_paths,
        'is_classification': True,
        'to_cluster': True,
        'dataset_group': 'GLUE_AND_SUPER_GLUE',
        # set to None to recompute the matrix instead of loading a saved one; other saved options:
        # 'analysis_utils/saved_matrices/euc_dist_matrix_glue_super_glue_70.csv',
        # 'analysis_utils/saved_matrices/mi_matrix_mnli_sst2.csv'
        'matrix_file_name': 'analysis_utils/saved_matrices/sim_matrix_glue_super_glue_70.csv',
    }
    if run_analysis_kwargs['matrix_file_name'] is not None:
        matrix_name = run_analysis_kwargs['matrix_file_name'].split('/')[-1]
    elif desired_datasets is not None:
        matrix_name = desired_datasets
    else:
        matrix_name = 'glue super glue'
    task = Task.init(project_name='fusion', tags=[matrix_type, 'knn'],
                     task_name=f'analysis_clustering_{matrix_type}_on_{matrix_name}')
    task.upload_artifact('run_analysis_kwargs', run_analysis_kwargs)
    # task = None
    run_analysis(**run_analysis_kwargs, clearml_task=task)


    # MUTUAL INFORMATION FIND NUM BINS EXP
    # run_find_bins_kwargs = {}
    # for arg in ['depended_expr_dirs', 'base_model_name', 'desired_datasets', 'is_classification', 'model_paths',
    #             'weights_reference_point', 'weights_normalization_type', 'norm_thr']:
    #     run_find_bins_kwargs[arg] = run_analysis_kwargs[arg]
    # run_find_bins_kwargs['bins_range'] = range(100, 20000, int((20000-100)/15))
    # run_find_num_bins(**run_find_bins_kwargs)
