
Commit 1c1d422

clustering paper code

1 parent 70b9d2e commit 1c1d422

25 files changed: +3437 -0 lines changed

python_code/__init__.py

Whitespace-only changes.

python_code/clustering code/__init__.py

Whitespace-only changes.
@@ -0,0 +1,234 @@
import argparse

import numpy as np
import seaborn as sns

from analysis_utils.analyze_utils import analyze_clustering, analyze_weights_n_norms, analyze_matrix
from analysis_utils.calcs import calc_spectral_clustering, \
    calc_clustering_metrics, create_matrix, find_num_bins
from analysis_utils.data_utils import load_matrix, save_matrix, get_models_weights
from analysis_utils.plots import plot_clusters
from analysis_utils.saphra_code import barrier_height, connectivity_gap, get_clusters, get_sc_centroid_dists
from analysis_utils.constants import *
from utils.model_loading_utils import ModelLoadingInfo, load_model
from clearml import Task

sns.set_theme()

def run_saphra_analysis(datasets_and_seeds_str, data_num2name):
    # ------------------------------- saphra code
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #
    # xy_min = (0, 1)
    # xy_max = (1, 0)
    # coef_samples = np.linspace(xy_min, xy_max, args.n_sample + 2).tolist()
    # columns = ["point_num", "loss"]
    # for k in range(args.n_sample + 2):
    #     coeffs_t = coef_samples[k]
    #     print(f'{coeffs_t}')
    #     linear_comb(w1, w2, coeffs_t[0], coeffs_t[1], model)
    #     metrics = eval(input_target_loader, model,
    #                    criterion, pred_fn, metric)

    # Hard-coded losses along the linear interpolation path between pairs of
    # fine-tuned models (10 evenly spaced points per pair).
    v1 = np.array([0.48, 0.43, 0.41, 0.405, 0.41, 0.408, 0.05, 0.41, 0.43, 0.48])  # mnli, gen-to-gen
    v2 = np.array([0.48, 0.43, 0.42, 0.445, 0.5, 0.51, 0.46, 0.44, 0.45, 0.5])  # mnli, gen-to-heur
    v3 = np.array([0.51, 0.45, 0.425, 0.43, 0.45, 0.45, 0.43, 0.425, 0.45, 0.5])  # mnli, heur-to-heur

    for losses in [v1, v2, v3]:
        bh_metric = barrier_height(losses)
        cg_metric = connectivity_gap(losses)
        print(f'bh: {bh_metric}')
        print(f'cg: {cg_metric}\n')

    # Pairwise distance matrix: the connectivity gap of the loss path between
    # each pair of models; the diagonal is set to 1.0, as in the original code.
    models_losses = [[1.0, v1, v3], [v1, 1.0, v2], [v3, v2, 1.0]]
    dist_matrix = np.zeros((len(models_losses), len(models_losses)))
    for i in range(len(models_losses)):
        for j in range(len(models_losses)):
            if i == j:
                dist_matrix[i][j] = 1.0
            else:
                dist_matrix[i][j] = connectivity_gap(models_losses[i][j])

    # dist_matrix = np.array([[1.0, connectivity_gap(v1)], [connectivity_gap(v2), 1.0]])

    print('dist matrix:\n', dist_matrix)

    clusters = get_clusters(dist_matrix, 2)

    dists = get_sc_centroid_dists(dist_matrix)
    print('centroid dists:\n', dists)
    # indices = [[acc_keys.index(k) for k in cluster] for cluster in clusters]

    # clusters = [[suf_ordered_models[idx] for idx in cluster]
    #             for cluster in clusters]

    print("Clusters found:", clusters)

    # my clustering code: convert distances to similarities with an RBF kernel
    # (sigma = 1), then spectrally cluster the precomputed affinities.
    sim_matrix = np.exp(-dist_matrix ** 2 / (2. * 1 ** 2))
    models_names = [f'v{i}' for i in range(1, len(models_losses) + 1)]
    sim_labels = calc_spectral_clustering(sim_matrix, models_names, num_clusters=2,
                                          affinity='precomputed')
    metrics_df = calc_clustering_metrics(sim_matrix, sim_labels, datasets_and_seeds_str)
    plot_clusters(sim_labels, sim_matrix, datasets_and_seeds_str, num_dims=2, reduction_method='tsne',
                  matrix_method='cosine similarity', title_add='', metrics_df=metrics_df)

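# --- Hedged editorial sketch (not part of the original commit) -----------
# barrier_height and connectivity_gap are imported from
# analysis_utils.saphra_code, which this diff does not include. A minimal
# sketch of the usual mode-connectivity notion of a loss barrier, under the
# assumption that the code follows it: the barrier is the largest rise of
# the interpolation-loss curve above the straight line joining its endpoint
# losses. The name sketch_barrier_height is hypothetical.
def sketch_barrier_height(losses):
    losses = np.asarray(losses, dtype=float)
    alphas = np.linspace(0.0, 1.0, len(losses))
    # loss the path would have if the two endpoints were linearly connected
    baseline = (1 - alphas) * losses[0] + alphas * losses[-1]
    return float(np.max(losses - baseline))
# connectivity_gap presumably aggregates the same curve-minus-baseline gap;
# its exact definition is not shown in this diff.
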
def run_analysis(depended_expr_dirs, base_model_name, desired_datasets, weights_reference_point,
                 weights_normalization_type, norm_thr, matrix_type, dim_reduction_method, save_matrix_flag,
                 with_base_model: bool, model_paths=None, is_classification=True, to_cluster=True,
                 matrix_file_name=None, return_matrix=False, clearml_task=None, dataset_group='GLUE_AND_SUPER_GLUE'):
    """
    Build (or load) a pairwise model-comparison matrix and optionally cluster the models.

    :param matrix_file_name: file name of a ready matrix to load, or None to compute the matrix from model weights
    :return: (matrix, models_names) if return_matrix is True
    """
    if model_paths is None:
        model_paths = []
    print('\nGetting data')
    if matrix_file_name:  # load ready matrix
        print(f'\nLoading matrix from {matrix_file_name}')
        matrix, models_names = load_matrix(matrix_file_name)
        data_num2name = {num.split('_')[-1]: DATA_NUM2NAME[dataset_group][num.split('_')[-1]] for num in models_names}
        short_models_names = [s.replace('seed_', 's').replace('data', 'd') for s in models_names]
        short_models_names = [s.split('d_')[0] + data_num2name[s.split('d_')[-1]] for s in short_models_names]
        # infer the matrix type from the file-name prefix (e.g. sim_matrix_... -> cosine similarity)
        file_prefix = matrix_file_name.split('/')[-1].split('_')[0]
        matrix_type = {'sim': 'cosine similarity', 'euc': 'euclidean distance',
                       'mi': 'mutual information'}.get(file_prefix)
    else:  # get data
        print('\nCalculating matrix')
        if isinstance(base_model_name, str):  # needs to load model
            base_model_name = load_model(ModelLoadingInfo(classification=is_classification, name=base_model_name,
                                                          from_tf=False, tokenizer_name=base_model_name))

        models_weights, models_names = get_models_weights(depended_expr_dirs, base_model_name, desired_datasets,
                                                          is_classification, model_paths)

        short_models_names = [s.replace('seed_', 's').replace('data', 'd') for s in models_names]  # seed_0_data_1 -> s0_d_1
        if models_names[0].split('_')[-1].isdigit():
            data_num2name = {name.split('_')[-1]: DATA_NUM2NAME[dataset_group][name.split('_')[-1]] for name in models_names}
            short_models_names = [s.split('d_')[0] + data_num2name[s.split('d_')[-1]] for s in short_models_names]  # s0_mnli

        # add base model as a model
        if with_base_model:
            # assert weights_reference_point != 'base_model', 'reference point is base_model, but base model is one of the models'
            models_weights.append(tuple(base_model_name.base_model.parameters()))
            short_models_names.append('s0_basemodel')
            models_names.append('base_model')

        flat_models_weights_diff = analyze_weights_n_norms(models_weights, short_models_names, base_model_name,
                                                           weights_reference_point, weights_normalization_type, norm_thr)
        matrix = create_matrix(flat_models_weights_diff, matrix_type)
        matrix = matrix[0]
        matrix = [[float(d) for d in data] for data in matrix]
        if save_matrix_flag:
            s_matrix_type = {'cosine similarity': 'sim', 'euclidean distance': 'euc',
                             'mutual information': 'mi'}.get(matrix_type, matrix_type)
            s_models_names = 'glue_super_glue_70' if not desired_datasets else '_'.join(desired_datasets)
            num_seeds = len({name.split('_')[0] for name in short_models_names}) - int(with_base_model)
            save_matrix_file_name = f'{s_matrix_type}_matrix_{s_models_names}_seeds_{num_seeds}.csv'
            save_matrix(matrix, save_matrix_file_name, models_names)

    if clearml_task:
        clearml_task.upload_artifact('matrix', matrix)
        clearml_task.upload_artifact('models names', models_names)
        if save_matrix_flag and not matrix_file_name:  # the save path only exists when the matrix was computed here
            clearml_task.upload_artifact('matrix save file path', save_matrix_file_name)

    if to_cluster:
        print('\nCluster models')
        is_sim = matrix_type in ['cosine similarity', 'mutual information', 'cka']
        analyze_matrix(matrix, short_models_names, is_sim)
        analyze_clustering(matrix, is_sim, short_models_names, dim_reduction_method, matrix_type,
                           with_base_model=with_base_model)

    if return_matrix:
        return matrix, models_names

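# --- Hedged editorial sketch (not part of the original commit) -----------
# calc_spectral_clustering (analysis_utils.calcs) is not included in this
# diff. With affinity='precomputed' it presumably reduces to the scikit-learn
# call sketched below; sketch_spectral_labels is a hypothetical name.
def sketch_spectral_labels(sim_matrix, num_clusters=2):
    from sklearn.cluster import SpectralClustering
    sc = SpectralClustering(n_clusters=num_clusters, affinity='precomputed', random_state=0)
    # fit_predict expects a square affinity (similarity) matrix, not distances
    return sc.fit_predict(np.asarray(sim_matrix))
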
def run_find_num_bins(bins_range, depended_expr_dirs, base_model_name, desired_datasets, is_classification, model_paths,
                      weights_reference_point, weights_normalization_type, norm_thr, dataset_group='GLUE_AND_SUPER_GLUE'):

    models_weights, models_names = get_models_weights(depended_expr_dirs, base_model_name, desired_datasets,
                                                      is_classification, model_paths)
    data_num2name = {name.split('_')[-1]: DATA_NUM2NAME[dataset_group][name.split('_')[-1]] for name in models_names}
    short_models_names = [s.replace('seed_', 's').replace('data', 'd') for s in models_names]
    short_models_names = [s.split('d_')[0] + data_num2name[s.split('d_')[-1]] for s in short_models_names]
    flat_models_weights_diff = analyze_weights_n_norms(models_weights, short_models_names, base_model_name,
                                                       weights_reference_point, weights_normalization_type, norm_thr)

    best_num_bins = find_num_bins(flat_models_weights_diff, short_models_names, bins_range)
    print('best number of bins:', best_num_bins)

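# --- Hedged editorial sketch (not part of the original commit) -----------
# find_num_bins (analysis_utils.calcs) is not included in this diff. The bin
# count matters because the 'mutual information' matrix type estimates MI
# between flattened weight vectors, typically from a 2D histogram. A minimal
# histogram-based MI estimator, assuming that is the approach used;
# sketch_mi is a hypothetical name.
def sketch_mi(x, y, num_bins):
    joint, _, _ = np.histogram2d(x, y, bins=num_bins)
    pxy = joint / joint.sum()            # joint probabilities
    px = pxy.sum(axis=1, keepdims=True)  # marginal of x (column vector)
    py = pxy.sum(axis=0, keepdims=True)  # marginal of y (row vector)
    nz = pxy > 0                         # skip empty cells to avoid log(0)
    return float(np.sum(pxy[nz] * np.log(pxy[nz] / (px @ py)[nz])))
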
if __name__ == '__main__':

    # get parser args
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", action="store_true")
    # store_true instead of type=bool: argparse's type=bool treats any
    # non-empty string (including "False") as True
    parser.add_argument("--is_local_run", action="store_true")
    args, _ = parser.parse_known_args()

    if args.debug:
        from cvar_pyutils.debugging_tools import set_remote_debugger
        debug_ip = None  # '9.148.203.20' # os.environ.get('SSH_CONNECTION', None) # this is the default value for debug_ip
        debug_port = 12345  # this is the default value for debug_port
        set_remote_debugger(debug_ip, debug_port)

    # Define depended experiment dirs
    if args.is_local_run:
        glue_template_expr_dir = '/Users/almoggueta/data/fusion/outputs_almog/TrainIterateOverDataset_'
    else:
        glue_template_expr_dir = '/dccstor/fuse/outputs_almog/TrainIterateOverDataset_'

    seeds = range(1, 6)  # seeds 1-5 are roberta with weight_decay=0.01, 6-10 roberta with weight_decay=0.0
    glue_all_seeds_expr_dir = [glue_template_expr_dir + str(seed) for seed in seeds]
    depended_expr_dirs = glue_all_seeds_expr_dir

    # get base model
    base_model_name = 'roberta-base'
    base_model = load_model(
        ModelLoadingInfo(classification=True, name=base_model_name, from_tf=False, tokenizer_name=base_model_name))

    model_paths = []

    desired_datasets = ['mnli', 'sst2']  # , 'cola', 'qqp']

    matrix_type = 'cosine similarity'  # 'euclidean distance' # 'mutual information'
    # run analysis
    # analyze_models(depended_expr_dirs,
    #                model_paths, base_model, weights_reference_point='base_model', weights_normalization_type=None,
    #                is_classification=True, desired_datasets=desired_datasets)

    run_analysis_kwargs = {
        'depended_expr_dirs': depended_expr_dirs,
        'base_model_name': base_model,
        'desired_datasets': desired_datasets,
        'weights_reference_point': 'avg weights',  # 'base_model',
        'weights_normalization_type': None,
        'norm_thr': 0,
        'matrix_type': matrix_type,
        'dim_reduction_method': 'tsne',
        'save_matrix_flag': False,
        'with_base_model': False,
        'model_paths': model_paths,
        'is_classification': True,
        'to_cluster': True,
        'dataset_group': 'GLUE_AND_SUPER_GLUE',
        'matrix_file_name': 'analysis_utils/saved_matrices/sim_matrix_glue_super_glue_70.csv',  # None, # 'analysis_utils/saved_matrices/euc_dist_matrix_glue_super_glue_70.csv', # 'analysis_utils/saved_matrices/mi_matrix_mnli_sst2.csv',
    }
    if run_analysis_kwargs['matrix_file_name'] is not None:
        matrix_name = run_analysis_kwargs['matrix_file_name'].split('/')[-1]
    elif desired_datasets is not None:
        matrix_name = desired_datasets
    else:
        matrix_name = 'glue super glue'
    task = Task.init(project_name='fusion', tags=[matrix_type, 'knn'],
                     task_name=f'analysis_clustering_{matrix_type}_on_{matrix_name}')
    task.upload_artifact('run_analysis_kwargs', run_analysis_kwargs)
    # task = None
    run_analysis(**run_analysis_kwargs, clearml_task=task)


    # MUTUAL INFORMATION FIND NUM BINS EXP
    # run_find_bins_kwargs = {}
    # for arg in ['depended_expr_dirs', 'base_model_name', 'desired_datasets', 'is_classification', 'model_paths',
    #             'weights_reference_point', 'weights_normalization_type', 'norm_thr']:
    #     run_find_bins_kwargs[arg] = run_analysis_kwargs[arg]
    # run_find_bins_kwargs['bins_range'] = range(100, 20000, int((20000 - 100) / 15))
    # run_find_num_bins(**run_find_bins_kwargs)

python_code/clustering code/analysis_utils/__init__.py

Whitespace-only changes.
