From b2ffa54ba1c8b3932b4c68d0f3bd9123971a5382 Mon Sep 17 00:00:00 2001 From: Vijay Viswanathan Date: Sat, 22 Oct 2022 23:56:18 -0400 Subject: [PATCH 1/9] Wip --- CMVC_main_opiec.py | 1 + embeddings_multi_view.py | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/CMVC_main_opiec.py b/CMVC_main_opiec.py index db8d1f1..95288b7 100644 --- a/CMVC_main_opiec.py +++ b/CMVC_main_opiec.py @@ -210,6 +210,7 @@ def embedKG(self): parser.add_argument('--nentity', type=int, default=0, help='DO NOT MANUALLY SET') parser.add_argument('--nrelation', type=int, default=0, help='DO NOT MANUALLY SET') parser.add_argument('-embed_dims', dest='embed_dims', default=300, type=int, help='Embedding dimension') + parser.add_argument('--kmeans_initialization', default="k-means++", type=str, choices=["k-means++", "seeding", "pc"], help='Embedding dimension') parser.add_argument('--num_reinit', type=int, default=10, help="Number of reinitializations to try for k-Means clustering") # word2vec and iteration hyper-parameters diff --git a/embeddings_multi_view.py b/embeddings_multi_view.py index 3284f35..605315e 100644 --- a/embeddings_multi_view.py +++ b/embeddings_multi_view.py @@ -525,6 +525,18 @@ def fit(self): t0 = time.time() real_time = time.strftime("%Y_%m_%d") + ' ' + time.strftime("%H:%M:%S") print('time:', real_time) + + if self.p.kmeans_initialization == "seeding": + init = np.ndarray() + print("TODO(Vijay): Not implemented") + breakpoint() + elif self.p.kmeans_initialization == "pc": + init = np.ndarray() + print("TODO(Vijay): Not implemented") + breakpoint() + else: + init = "k-means++" + mv_skm = Multi_view_SphericalKMeans(n_clusters=n_cluster, init='k-means++', n_init=self.num_reinit, max_iter=10, n_jobs=5, verbose=0, p=self.p, side_info=self.side_info, true_ent2clust=self.true_ent2clust, From a3a03161b312d3ed50e04435eedfe5dd08d600e5 Mon Sep 17 00:00:00 2001 From: Vijay Viswanathan Date: Sat, 22 Oct 2022 23:56:18 -0400 Subject: [PATCH 2/9] Wip --- CMVC_main_opiec.py | 1 + embeddings_multi_view.py | 14 +++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/CMVC_main_opiec.py b/CMVC_main_opiec.py index db8d1f1..95288b7 100644 --- a/CMVC_main_opiec.py +++ b/CMVC_main_opiec.py @@ -210,6 +210,7 @@ def embedKG(self): parser.add_argument('--nentity', type=int, default=0, help='DO NOT MANUALLY SET') parser.add_argument('--nrelation', type=int, default=0, help='DO NOT MANUALLY SET') parser.add_argument('-embed_dims', dest='embed_dims', default=300, type=int, help='Embedding dimension') + parser.add_argument('--kmeans_initialization', default="k-means++", type=str, choices=["k-means++", "seeding", "pc"], help='Embedding dimension') parser.add_argument('--num_reinit', type=int, default=10, help="Number of reinitializations to try for k-Means clustering") # word2vec and iteration hyper-parameters diff --git a/embeddings_multi_view.py b/embeddings_multi_view.py index 3284f35..bf8f64f 100644 --- a/embeddings_multi_view.py +++ b/embeddings_multi_view.py @@ -525,7 +525,19 @@ def fit(self): t0 = time.time() real_time = time.strftime("%Y_%m_%d") + ' ' + time.strftime("%H:%M:%S") print('time:', real_time) - mv_skm = Multi_view_SphericalKMeans(n_clusters=n_cluster, init='k-means++', n_init=self.num_reinit, max_iter=10, + + if self.p.kmeans_initialization == "seeding": + init = np.ndarray() + print("TODO(Vijay): Not implemented") + breakpoint() + elif self.p.kmeans_initialization == "pc": + init = np.ndarray() + print("TODO(Vijay): Not implemented") + breakpoint() + else: + init = "k-means++" 
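+
+            # Editor's note: `init` is now either the literal string
+            # "k-means++" (sklearn's built-in seeding) or, once the stubbed
+            # "seeding"/"pc" branches above are implemented, an ndarray of
+            # starting centers. PATCH 1/9 adds this block but still
+            # hard-codes init='k-means++' at the call site below; passing
+            # init=init is the one-line fix PATCH 2/9 makes.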
+ + mv_skm = Multi_view_SphericalKMeans(n_clusters=n_cluster, init=init, n_init=self.num_reinit, max_iter=10, n_jobs=5, verbose=0, p=self.p, side_info=self.side_info, true_ent2clust=self.true_ent2clust, true_clust2ent=self.true_clust2ent) From d6fce7df40fbe00ef9daa5a9fa8fced2f5a4bfe6 Mon Sep 17 00:00:00 2001 From: Vijay Viswanathan Date: Tue, 25 Oct 2022 07:11:39 -0400 Subject: [PATCH 3/9] Save [wip] --- CMVC_main_opiec.py | 7 +- Multi_view_CH_kmeans.py | 231 +++++++++++++++++++++++++++++++++++++-- embeddings_multi_view.py | 82 ++++++++++---- 3 files changed, 289 insertions(+), 31 deletions(-) diff --git a/CMVC_main_opiec.py b/CMVC_main_opiec.py index 95288b7..c41305a 100644 --- a/CMVC_main_opiec.py +++ b/CMVC_main_opiec.py @@ -198,7 +198,7 @@ def embedKG(self): help='Otherwise use subsampling weighting like in word2vec', default=True) parser.add_argument('-lr', '--learning_rate', default=0.0001, type=float) - parser.add_argument('-cpu', '--cpu_num', default=12, type=int) + parser.add_argument('-cpu', '--cpu_num', default=4, type=int) parser.add_argument('-init', '--init_checkpoint', default=None, type=str) parser.add_argument('--warm_up_steps', default=None, type=int) @@ -210,8 +210,11 @@ def embedKG(self): parser.add_argument('--nentity', type=int, default=0, help='DO NOT MANUALLY SET') parser.add_argument('--nrelation', type=int, default=0, help='DO NOT MANUALLY SET') parser.add_argument('-embed_dims', dest='embed_dims', default=300, type=int, help='Embedding dimension') - parser.add_argument('--kmeans_initialization', default="k-means++", type=str, choices=["k-means++", "seeding", "pc"], help='Embedding dimension') + parser.add_argument('--kmeans_initialization', default="k-means++", type=str, choices=["k-means++", "seeded-k-means++", "pc"], help='Embedding dimension') + parser.add_argument('--num_cluster_seeds', default=None, type=int, help='Number of cluster seeds to use, if simulating user seed feedback') parser.add_argument('--num_reinit', type=int, default=10, help="Number of reinitializations to try for k-Means clustering") + parser.add_argument('--save_model', action="store_true", help="Whether or not to serialize and save model outputs and parameters to disk") + parser.add_argument('--unnormalize', action="store_true", help="Whether to normalize each point before clustering") # word2vec and iteration hyper-parameters parser.add_argument('-retrain_literal_embeds', dest='retrain_literal_embeds', default=True, diff --git a/Multi_view_CH_kmeans.py b/Multi_view_CH_kmeans.py index 0b97508..9d7f7dc 100644 --- a/Multi_view_CH_kmeans.py +++ b/Multi_view_CH_kmeans.py @@ -9,15 +9,17 @@ from sklearn.cluster._kmeans import ( # from sklearn.cluster.k_means_ import ( _check_sample_weight, - _init_centroids, + _k_init, _labels_inertia, _tolerance, _validate_center_shape, ) +import scipy.sparse as sp +from sklearn.metrics.pairwise import euclidean_distances from sklearn.preprocessing import normalize from sklearn.utils import check_array, check_random_state -from sklearn.utils.extmath import row_norms, squared_norm +from sklearn.utils.extmath import row_norms, squared_norm, stable_cumsum from sklearn.utils.validation import _num_samples, check_X_y from sklearn.preprocessing._label import LabelEncoder from sklearn.metrics.cluster._unsupervised import check_number_of_labels @@ -241,17 +243,215 @@ def multi_view_labels_inertia(X_view_1, X_view_2, sample_weight, x_view_1_square inertia = inertia_view_1 + inertia_view_2 return labels, inertia +def init_seeded_kmeans_plusplus(X, seed_set, n_clusters, 
x_squared_norms, random_state, n_local_trials=None): + """Init n_clusters seeds according to k-means++, honoring pre-chosen seed points. Modified from original at + https://github.com/scikit-learn/scikit-learn/blob/36958fb24/sklearn/cluster/_kmeans.py#L154. + + Parameters + ---------- + X : array or sparse matrix, shape (n_samples, n_features) + The data to pick seeds for. To avoid memory copy, the input data + should be double precision (dtype=np.float64). + + seed_set : tuple (seed indices, seed vectors) + Pre-initialized cluster seeds (indices into the dataset, plus their + vectors) chosen by a previous method (such as an oracle). The number + of initial cluster seeds N must not exceed n_clusters. + + n_clusters : integer + The number of seeds to choose. + + x_squared_norms : array, shape (n_samples,) + Squared Euclidean norm of each data point. + + random_state : int, RandomState instance + The generator used to initialize the centers. Use an int to make the + randomness deterministic. + See :term:`Glossary <random_state>`. + + n_local_trials : integer, optional + The number of seeding trials for each center (except the first), + of which the one reducing inertia the most is greedily chosen. + Set to None to make the number of trials depend logarithmically + on the number of seeds (2+log(k)); this is the default. + + Notes + ----- + Selects initial cluster centers for k-means clustering in a smart way + to speed up convergence. See: Arthur, D. and Vassilvitskii, S. + "k-means++: the advantages of careful seeding". ACM-SIAM symposium + on Discrete algorithms. 2007 + + Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip, + which is the implementation used in the aforementioned paper. + """ + n_samples, n_features = X.shape + + + print(f"n_clusters: {n_clusters}") + centers = np.empty((n_clusters, n_features), dtype=X.dtype) + print(f"(BEFORE) centers.shape: {centers.shape}") + + assert x_squared_norms is not None, 'x_squared_norms None in _k_init' + + # Set the number of local seeding trials if none is given + if n_local_trials is None: + # This is what Arthur/Vassilvitskii tried, but did not report + # specific results for other than mentioning in the conclusion + # that it helped. 
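+        # i.e. n_local_trials = 2 + log(k): each later center is drawn from
+        # a small pool of D^2-weighted candidates, and the candidate that
+        # most reduces the potential (the sum of squared distances to the
+        # nearest center) is kept.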
+ n_local_trials = 2 + int(np.log(n_clusters)) + + init_clusters_seeds, initial_cluster_vectors = seed_set + + # Pick first center randomly + centers[0] = X[init_clusters_seeds[0]] + + # Pick first N centers from seeds + + # Initialize list of closest distances and calculate current potential + closest_dist_sq = euclidean_distances( + centers[0, np.newaxis], X, Y_norm_squared=x_squared_norms, + squared=True) + current_pot = closest_dist_sq.sum() + + # Pick the remaining n_clusters-1 points + for c in range(1, n_clusters): + + if c < len(init_clusters_seeds): + true_cluster_seed = init_clusters_seeds[c] + candidate_ids = np.array([true_cluster_seed]) + else: + # Choose center candidates by sampling with probability proportional + # to the squared distance to the closest existing center + rand_vals = random_state.random_sample(n_local_trials) * current_pot + candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq), + rand_vals) + + # XXX: numerical imprecision can result in a candidate_id out of range + np.clip(candidate_ids, None, closest_dist_sq.size - 1, + out=candidate_ids) + + # Compute distances to center candidates + distance_to_candidates = euclidean_distances( + X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True) + + # update closest distances squared and potential for each candidate + np.minimum(closest_dist_sq, distance_to_candidates, + out=distance_to_candidates) + candidates_pot = distance_to_candidates.sum(axis=1) + + # Decide which candidate is the best + best_candidate = np.argmin(candidates_pot) + current_pot = candidates_pot[best_candidate] + closest_dist_sq = distance_to_candidates[best_candidate] + best_candidate = candidate_ids[best_candidate] + + # Permanently add best center candidate found in local tries + if sp.issparse(X): + centers[c] = X[best_candidate].toarray() + else: + centers[c] = X[best_candidate] + return centers + +def _init_centroids_with_seeding(X, k, init, seed_set=None, random_state=None, x_squared_norms=None, + init_size=None): + """Compute the initial centroids + + Parameters + ---------- + + X : array, shape (n_samples, n_features) + + k : int + number of centroids + + init : {'k-means++', 'random' or ndarray or callable} optional + Method for initialization + + random_state : int, RandomState instance or None (default) + Determines random number generation for centroid initialization. Use + an int to make the randomness deterministic. + See :term:`Glossary `. + + x_squared_norms : array, shape (n_samples,), optional + Squared euclidean norm of each data point. Pass it if you have it at + hands already to avoid it being recomputed here. Default: None + + init_size : int, optional + Number of samples to randomly sample for speeding up the + initialization (sometimes at the expense of accuracy): the + only algorithm is initialized by running a batch KMeans on a + random subset of the data. This needs to be larger than k. + + Returns + ------- + centers : array, shape(k, n_features) + """ + random_state = check_random_state(random_state) + n_samples = X.shape[0] + + if x_squared_norms is None: + x_squared_norms = row_norms(X, squared=True) + + if init_size is not None and init_size < n_samples: + if init_size < k: + warnings.warn( + "init_size=%d should be larger than k=%d. 
" + "Setting it to 3*k" % (init_size, k), + RuntimeWarning, stacklevel=2) + init_size = 3 * k + init_indices = random_state.randint(0, n_samples, init_size) + X = X[init_indices] + x_squared_norms = x_squared_norms[init_indices] + n_samples = X.shape[0] + elif n_samples < k: + raise ValueError( + "n_samples=%d should be larger than k=%d" % (n_samples, k)) + + print("(1)") + if isinstance(init, str) and init == 'k-means++': + centers = _k_init(X, k, random_state=random_state, + x_squared_norms=x_squared_norms) + elif isinstance(init, str) and init == 'seeded-k-means++': + print("(0)") + centers = init_seeded_kmeans_plusplus(X, seed_set, k, random_state=random_state, + x_squared_norms=x_squared_norms) + elif isinstance(init, str) and init == 'random': + seeds = random_state.permutation(n_samples)[:k] + centers = X[seeds] + elif hasattr(init, '__array__'): + # ensure that the centers have the same dtype as X + # this is a requirement of fused types of cython + centers = np.array(init, dtype=X.dtype) + elif callable(init): + centers = init(X, k, random_state=random_state) + centers = np.asarray(centers, dtype=X.dtype) + else: + raise ValueError("the init parameter for the k-means should " + "be 'k-means++' or 'random' or an ndarray, " + "'%s' (type '%s') was passed." % (init, type(init))) + + if sp.issparse(centers): + centers = centers.toarray() + + print(f"(AFTER) centers.shape: {centers.shape}") + + _validate_center_shape(X, k, centers) + return centers def multi_view_spherical_kmeans_single_lloyd( X_view_1, X_view_2, + X_view_2_unnorm, n_clusters, sample_weight=None, max_iter=300, init="k-means++", + seed_set=None, verbose=False, x_view_1_squared_norms=None, x_view_2_squared_norms=None, + X_view_2_unnorm_squared_norms=None, random_state=None, tol=1e-4, precompute_distances=True, @@ -269,8 +469,9 @@ def multi_view_spherical_kmeans_single_lloyd( best_labels, best_inertia, best_centers = None, None, None # init - centers_view_2 = _init_centroids( - X_view_2, n_clusters, init, random_state=random_state, x_squared_norms=x_view_2_squared_norms + print("(2)") + centers_view_2 = _init_centroids_with_seeding( + X_view_2_unnorm, n_clusters, init, seed_set=seed_set, random_state=random_state, x_squared_norms=X_view_2_unnorm_squared_norms ) if verbose: @@ -468,9 +669,11 @@ def multi_view_spherical_kmeans_single_lloyd( def multi_view_spherical_k_means( X_view_1, X_view_2, + X_view_2_unnorm, n_clusters, sample_weight=None, init="k-means++", + seed_set=None, n_init=10, max_iter=300, verbose=False, @@ -523,8 +726,6 @@ def multi_view_spherical_k_means( tol = (tol_view_1 + tol_view_2) / 2 if hasattr(init, "__array__"): - init = check_array(init, dtype=X_view_1.dtype.type, order="C", copy=True) - _validate_center_shape(X_view_1, n_clusters, init) init = check_array(init, dtype=X_view_2.dtype.type, order="C", copy=True) _validate_center_shape(X_view_2, n_clusters, init) @@ -540,6 +741,8 @@ def multi_view_spherical_k_means( # precompute squared norms of data points x_view_1_squared_norms = row_norms(X_view_1, squared=True) x_view_2_squared_norms = row_norms(X_view_2, squared=True) + X_view_2_unnorm_squared_norms = row_norms(X_view_2_unnorm, squared=True) + print("(3)") if n_jobs == 1: # For a single thread, less memory is needed if we just store one set @@ -549,14 +752,17 @@ def multi_view_spherical_k_means( labels, inertia, n_iter_ = multi_view_spherical_kmeans_single_lloyd( X_view_1, X_view_2, + X_view_2_unnorm, n_clusters, sample_weight, max_iter=max_iter, init=init, + seed_set=seed_set, verbose=verbose, 
tol=tol, x_view_1_squared_norms=x_view_1_squared_norms, x_view_2_squared_norms=x_view_2_squared_norms, + X_view_2_unnorm_squared_norms=X_view_2_unnorm_squared_norms, random_state=random_state, p=p, side_info=side_info, @@ -576,14 +782,17 @@ def multi_view_spherical_k_means( delayed(multi_view_spherical_kmeans_single_lloyd)( X_view_1, X_view_2, + X_view_2_unnorm, n_clusters, sample_weight, max_iter=max_iter, init=init, + seed_set=seed_set, verbose=verbose, tol=tol, x_view_1_squared_norms=x_view_1_squared_norms, x_view_2_squared_norms=x_view_2_squared_norms, + X_view_2_unnorm_squared_norms=X_view_2_unnorm_squared_norms, # Change seed to ensure variety random_state=seed, p=p, @@ -629,11 +838,12 @@ class Multi_view_SphericalKMeans(object): centroid seeds. The final results will be the best output of n_init consecutive runs in terms of inertia. - init : {'k-means++', 'random' or an ndarray} + init : {'k-means++', 'seeded-k-means++', 'random' or an ndarray} Method for initialization, defaults to 'k-means++': 'k-means++' : selects initial cluster centers for k-mean clustering in a smart way to speed up convergence. See section Notes in k_init for more details. + 'seeded-k-means++' : k-means++ with some fixed cluster seeds provided. 'random': choose k observations (rows) at random from data for the initial centroids. If an ndarray is passed, it should be of shape (n_clusters, n_features) @@ -685,6 +895,7 @@ def __init__( self, n_clusters=8, init="k-means++", + seed_set=None, n_init=10, max_iter=300, tol=1e-4, @@ -700,6 +911,7 @@ def __init__( ): self.n_clusters = n_clusters self.init = init + self.seed_set = seed_set self.max_iter = max_iter self.tol = tol self.n_init = n_init @@ -727,6 +939,7 @@ def fit(self, X_view_1, X_view_2, sample_weight=None): The weights for each observation in X. If None, all observations are assigned equal weight (default: None) """ + X_view_2_unnorm = check_array(X_view_2) if self.normalize: X_view_1 = normalize(X_view_1) X_view_2 = normalize(X_view_2) @@ -735,12 +948,16 @@ def fit(self, X_view_1, X_view_2, sample_weight=None): # TODO: add check that all data is unit-normalized + print("(4)") + self.labels_, self.inertia_, self.n_iter_ = multi_view_spherical_k_means( X_view_1, X_view_2, + X_view_2_unnorm, n_clusters=self.n_clusters, sample_weight=sample_weight, init=self.init, + seed_set=self.seed_set, n_init=self.n_init, max_iter=self.max_iter, verbose=self.verbose, diff --git a/embeddings_multi_view.py b/embeddings_multi_view.py index 0e4dabe..61ea965 100644 --- a/embeddings_multi_view.py +++ b/embeddings_multi_view.py @@ -1,4 +1,4 @@ -import gensim, itertools, pickle, time +import gensim, itertools, pickle, random, time from helper import * from utils import cos_sim from test_performance import cluster_test, HAC_getClusters @@ -182,6 +182,35 @@ def totol_cluster2pair(cluster_list): seed_pair_list += iter_list return seed_pair_list +def initialize_cluster_seeds(X, num_cluster_seeds, ent2id, ent2embed, id_to_embedding_rows, true_clust2ent): + """Init k-means clusters seeds according to k-means++ + + Parameters + ---------- + X : array or sparse matrix, shape (n_samples, n_features) + The data to pick seeds for. To avoid memory copy, the input data + should be double precision (dtype=np.float64). + + num_cluster_seeds : array or sparse matrix, shape (n_samples, n_features) + Pre-initialized cluster seeds chosen by a previous method (such as an + oracle). The number of initial cluster seeds N must be less than + num_clusters. + ... 
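+
+    ent2id : dict
+        Maps each entity surface form to its integer id.
+
+    ent2embed : dict or array
+        Maps each entity id to its embedding vector.
+
+    id_to_embedding_rows : dict
+        Maps each entity id to its row index in the stacked embedding matrix.
+
+    true_clust2ent : dict
+        Gold clustering (cluster name -> set of member entities), used here
+        to simulate oracle seeding feedback.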
+ + Return indices into the embedding matrix to use as cluster initializations + """ + seed_points = [] + seed_point_vectors = np.empty((num_cluster_seeds, len(X[0])), dtype=X[0].dtype) + assert num_cluster_seeds <= len(true_clust2ent) + cluster_names_to_sample = random.sample(true_clust2ent.keys(), num_cluster_seeds) + for i, cluster_name in enumerate(cluster_names_to_sample): + cluster_entities = true_clust2ent[cluster_name] + random_entity = random.choice(list(cluster_entities)) + entity_name = random_entity.split("|")[0] + seed_points.append(id_to_embedding_rows[ent2id[entity_name]]) + seed_point_vectors[i] = ent2embed[ent2id[entity_name]] + return seed_points, seed_point_vectors + class Embeddings(object): """ @@ -189,7 +218,7 @@ class Embeddings(object): """ def __init__(self, params, side_info, true_ent2clust, true_clust2ent, sub_uni2triple_dict=None, - triple_list=None, num_reinit=10): + triple_list=None, num_reinit=10, random_seed=None): self.p = params self.side_info = side_info @@ -201,6 +230,7 @@ def __init__(self, params, side_info, true_ent2clust, true_clust2ent, sub_uni2tr self.rel_id2sentence_list = dict() self.num_reinit = num_reinit + self.random_seed = random_seed ent_id2sentence_list = self.side_info.ent_id2sentence_list for rel in self.side_info.rel_list: @@ -415,8 +445,8 @@ def fit(self): K_init = len(list(set(self.sub_label))) print('K_init:', K_init) print('epochs:', self.epochs) - for i in range(self.epochs): - BERT_self_training_time = i + for random_seed in range(self.epochs): + BERT_self_training_time = random_seed if str(self.p.input) == 'entity': input_list = clean_ent_list else: @@ -450,10 +480,12 @@ def fit(self): print('self.BERT_CLS:', len(self.BERT_CLS)) self.relation_view_embed, self.context_view_embed = [], [] + id_to_embedding_rows = {} for ent in clean_ent_list: id = self.side_info.ent2id[ent] if id in self.side_info.isSub: self.relation_view_embed.append(self.ent2embed[id]) + id_to_embedding_rows[id] = len(self.context_view_embed) self.context_view_embed.append(self.BERT_CLS[id]) print('self.relation_view_embed:', len(self.relation_view_embed)) print('self.context_view_embed:', len(self.context_view_embed)) @@ -511,8 +543,8 @@ def fit(self): print('Model is multi-view spherical-k-means') - for i in range(3): - print('test time:', i) + for random_seed in range(3): + print('test time:', random_seed) if self.p.dataset == 'OPIEC59k': n_cluster = 490 elif self.p.dataset == 'reverb45k' or self.p.dataset == 'reverb45k_change': @@ -526,21 +558,26 @@ def fit(self): real_time = time.strftime("%Y_%m_%d") + ' ' + time.strftime("%H:%M:%S") print('time:', real_time) - if self.p.kmeans_initialization == "seeding": - init = np.ndarray() + if self.p.kmeans_initialization == "seeded-k-means++": + init = "seeded-k-means++" + assert self.p.num_cluster_seeds <= n_cluster + seed_set = initialize_cluster_seeds(self.context_view_embed, self.p.num_cluster_seeds, self.side_info.ent2id, self.BERT_CLS, id_to_embedding_rows, self.true_clust2ent) print("TODO(Vijay): Not implemented") - breakpoint() elif self.p.kmeans_initialization == "pc": - init = np.ndarray() - print("TODO(Vijay): Not implemented") breakpoint() + init = "seeded-k-means++" + seed_set = np.array([]) + print("TODO(Vijay): Not implemented") else: init = "k-means++" + seed_set = None - mv_skm = Multi_view_SphericalKMeans(n_clusters=n_cluster, init='k-means++', n_init=self.num_reinit, max_iter=10, - n_jobs=5, verbose=0, p=self.p, side_info=self.side_info, + print(f"(5): {init}") + mv_skm = 
Multi_view_SphericalKMeans(n_clusters=n_cluster, init=init, seed_set=seed_set, n_init=self.num_reinit, max_iter=10, + n_jobs=1, verbose=0, p=self.p, side_info=self.side_info, true_ent2clust=self.true_ent2clust, - true_clust2ent=self.true_clust2ent) + true_clust2ent=self.true_clust2ent, + normalize=not self.p.unnormalize) mv_skm.fit(self.relation_view_embed, self.context_view_embed) cluster_predict_list = mv_skm.labels_ time_cost = time.time() - t0 @@ -563,14 +600,15 @@ def fit(self): print('Gold: #Clusters: %d, #Singletons %d' % (gold_clusters, gold_Singletons)) - try: - metrics = (ave_prec, ave_recall, ave_f1, macro_prec, micro_prec, pair_prec, macro_recall, micro_recall, \ - pair_recall, macro_f1, micro_f1, pair_f1, model_clusters, model_Singletons, gold_clusters, gold_Singletons) - model_output = (self.p, self.side_info, cluster_predict_list, self.true_ent2clust, self.true_clust2ent, metrics) - model_output_fname = "../output/" + self.p.dataset + '_' + self.p.split + '_' + '1/' + "model_output_iter_" + str(i) + ".pkl" - pickle.dump(model_output, open(model_output_fname, 'wb')) - except: - breakpoint() + if self.p.save_model: + try: + metrics = (ave_prec, ave_recall, ave_f1, macro_prec, micro_prec, pair_prec, macro_recall, micro_recall, \ + pair_recall, macro_f1, micro_f1, pair_f1, model_clusters, model_Singletons, gold_clusters, gold_Singletons) + model_output = (self.p, self.side_info, cluster_predict_list, self.true_ent2clust, self.true_clust2ent, metrics) + model_output_fname = "../output/" + self.p.dataset + '_' + self.p.split + '_' + '1/' + "model_output_iter_" + str(random_seed) + ".pkl" + pickle.dump(model_output, open(model_output_fname, 'wb')) + except: + breakpoint() if show_memory: size, peak = tracemalloc.get_traced_memory() From 72f9eff5c2bb5f28556714dd0fa7ff817071012a Mon Sep 17 00:00:00 2001 From: Vijay Viswanathan Date: Sun, 6 Nov 2022 20:58:43 -0500 Subject: [PATCH 4/9] make feature normalization configurable --- Multi_view_CH_kmeans.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/Multi_view_CH_kmeans.py b/Multi_view_CH_kmeans.py index 9d7f7dc..b91e5bd 100644 --- a/Multi_view_CH_kmeans.py +++ b/Multi_view_CH_kmeans.py @@ -458,7 +458,8 @@ def multi_view_spherical_kmeans_single_lloyd( p=None, side_info=None, true_ent2clust=None, - true_clust2ent=None + true_clust2ent=None, + normalize_vectors=True, ): """ Modified from sklearn.cluster.k_means_.k_means_single_lloyd. 
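[Editor's note] The hunks below thread the new normalize_vectors flag through the Lloyd iteration. Re-projecting each centroid onto the unit sphere after the mean update is the one step that distinguishes spherical k-means from plain k-means, and it is exactly the step being made optional here. A minimal self-contained sketch of that M-step (illustrative only; update_centers is a hypothetical helper, not a function in this repo):

    import numpy as np
    from sklearn.preprocessing import normalize

    def update_centers(X, labels, n_clusters, normalize_vectors=True):
        # M step: each center is the mean of its assigned points
        centers = np.zeros((n_clusters, X.shape[1]), dtype=X.dtype)
        for k in range(n_clusters):
            members = X[labels == k]
            if len(members) > 0:
                centers[k] = members.mean(axis=0)
        # l2-normalizing the centroids turns Lloyd's algorithm into spherical
        # k-means (cosine geometry); the new --unnormalize flag skips this
        return normalize(centers) if normalize_vectors else centers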
@@ -518,6 +519,7 @@ def multi_view_spherical_kmeans_single_lloyd( # iterations inertia_totol = 0 + i = 0 for i in range(max_iter): # epoch inertia_totol = 0 for j in range(len(x_list)): # views @@ -535,7 +537,8 @@ def multi_view_spherical_kmeans_single_lloyd( ) # l2-normalize centers (this is the main contribution here) - centers = normalize(centers) + if normalize_vectors: + centers = normalize(centers) # E step: labels assignment # TODO: _labels_inertia should be done with cosine distance @@ -596,8 +599,9 @@ def multi_view_spherical_kmeans_single_lloyd( ) # l2-normalize centers (this is the main contribution here) - best_centers_view_1 = normalize(best_centers_view_1) - best_centers_view_2 = normalize(best_centers_view_2) + if normalize_vectors: + best_centers_view_1 = normalize(best_centers_view_1) + best_centers_view_2 = normalize(best_centers_view_2) # E step: labels assignment # TODO: _labels_inertia should be done with cosine distance @@ -643,8 +647,9 @@ def multi_view_spherical_kmeans_single_lloyd( ) # l2-normalize centers (this is the main contribution here) - best_centers_view_1 = normalize(best_centers_view_1) - best_centers_view_2 = normalize(best_centers_view_2) + if normalize_vectors: + best_centers_view_1 = normalize(best_centers_view_1) + best_centers_view_2 = normalize(best_centers_view_2) # E step: labels assignment # TODO: _labels_inertia should be done with cosine distance @@ -664,6 +669,10 @@ def multi_view_spherical_kmeans_single_lloyd( labels_view_2=labels_view_2 ) + if i == 0: + best_inertia = 0 + best_labels = labels + return best_labels, best_inertia, i + 1 def multi_view_spherical_k_means( @@ -686,7 +695,8 @@ def multi_view_spherical_k_means( p=None, side_info=None, true_ent2clust=None, - true_clust2ent=None + true_clust2ent=None, + normalize_vectors=True ): """Modified from sklearn.cluster.k_means_.k_means. 
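+
+    normalize_vectors : boolean, default: True
+        Whether to l2-normalize centroids during the Lloyd iterations
+        (spherical k-means); the --unnormalize command-line flag sets this
+        to False via the wrapper class.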
""" @@ -697,7 +707,7 @@ def multi_view_spherical_k_means( ) random_state = check_random_state(random_state) - if max_iter <= 0: + if max_iter < 0: raise ValueError( "Number of iterations should be a positive number," " got %d instead" % max_iter @@ -767,7 +777,8 @@ def multi_view_spherical_k_means( p=p, side_info=side_info, true_ent2clust=true_ent2clust, - true_clust2ent=true_clust2ent + true_clust2ent=true_clust2ent, + normalize_vectors=normalize_vectors ) # determine if these results are the best so far @@ -798,7 +809,8 @@ def multi_view_spherical_k_means( p=p, side_info=side_info, true_ent2clust=true_ent2clust, - true_clust2ent=true_clust2ent + true_clust2ent=true_clust2ent, + normalize_vectors=normalize_vectors ) for seed in seeds ) @@ -969,7 +981,8 @@ def fit(self, X_view_1, X_view_2, sample_weight=None): p=self.p, side_info=self.side_info, true_ent2clust=self.true_ent2clust, - true_clust2ent=self.true_clust2ent + true_clust2ent=self.true_clust2ent, + normalize_vectors=self.normalize ) return self From ac3ab5b216b0dbdb7588b5cd1d6e9f143167f96d Mon Sep 17 00:00:00 2001 From: Vijay Viswanathan Date: Mon, 21 Nov 2022 14:40:25 -0500 Subject: [PATCH 5/9] Allow the cluster-initialization-with-seeding function to work with no seeds --- CMVC_main_opiec.py | 10 +++++----- Multi_view_CH_kmeans.py | 9 ++++++--- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/CMVC_main_opiec.py b/CMVC_main_opiec.py index c41305a..4c48be5 100644 --- a/CMVC_main_opiec.py +++ b/CMVC_main_opiec.py @@ -20,10 +20,10 @@ def read_triples(self): self.amb_mentions = {} # Contains all ambiguous mentions self.isAcronym = {} # Contains all mentions which can be acronyms - print('dataset:', args.dataset) - if args.dataset == 'OPIEC59k': + print('dataset:', self.p.dataset) + if self.p.dataset == 'OPIEC59k': print('load OPIEC_dataset ... ') - self.triples_list = pickle.load(open(args.data_path, 'rb')) + self.triples_list = pickle.load(open(self.p.data_path, 'rb')) ''' Ground truth clustering ''' self.true_ent2clust = ddict(set) @@ -34,7 +34,7 @@ def read_triples(self): else: if not checkFile(fname): - with codecs.open(args.data_path, encoding='utf-8', errors='ignore') as f: + with codecs.open(self.p.data_path, encoding='utf-8', errors='ignore') as f: for line in f: trp = json.loads(line.strip()) @@ -253,4 +253,4 @@ def embedKG(self): cmvc = CMVC_Main(args) # Loading KG triples cmvc.get_sideInfo() # Side Information Acquisition - cmvc.embedKG() # Learning embedding for Noun and relation phrases \ No newline at end of file + cmvc.embedKG() # Learning embedding for Noun and relation phrases diff --git a/Multi_view_CH_kmeans.py b/Multi_view_CH_kmeans.py index b91e5bd..22cae2f 100644 --- a/Multi_view_CH_kmeans.py +++ b/Multi_view_CH_kmeans.py @@ -234,6 +234,7 @@ def multi_view_labels_inertia(X_view_1, X_view_2, sample_weight, x_view_1_square # distances will be changed in-place if precompute_distances: return multi_view_labels_inertia_precompute_dense(X_view_1, X_view_2, centers_view_1, centers_view_2, labels_view_1, labels_view_2) + inertia_view_1 = _k_means._assign_labels_array( X_view_1, sample_weight, x_view_1_squared_norms, centers_view_1, labels, distances=distances) @@ -301,7 +302,11 @@ def init_seeded_kmeans_plusplus(X, seed_set, n_clusters, x_squared_norms, random # that it helped. 
n_local_trials = 2 + int(np.log(n_clusters)) - init_clusters_seeds, initial_cluster_vectors = seed_set + if seed_set is None or len(seed_set) == 0: + random_index = np.random.choice(list(range(len(X)))) + init_clusters_seeds = [random_index] + else: + init_clusters_seeds, _ = seed_set # Pick first center randomly centers[0] = X[init_clusters_seeds[0]] @@ -408,12 +413,10 @@ def _init_centroids_with_seeding(X, k, init, seed_set=None, random_state=None, x raise ValueError( "n_samples=%d should be larger than k=%d" % (n_samples, k)) - print("(1)") if isinstance(init, str) and init == 'k-means++': centers = _k_init(X, k, random_state=random_state, x_squared_norms=x_squared_norms) elif isinstance(init, str) and init == 'seeded-k-means++': - print("(0)") centers = init_seeded_kmeans_plusplus(X, seed_set, k, random_state=random_state, x_squared_norms=x_squared_norms) elif isinstance(init, str) and init == 'random': From 32b6ba49451c4d0f3853966ace6c9108bd637125 Mon Sep 17 00:00:00 2001 From: Vijay Viswanathan Date: Mon, 21 Nov 2022 17:32:21 -0500 Subject: [PATCH 6/9] Simplify algorithm to use seeds --- Multi_view_CH_kmeans.py | 10 +++++----- embeddings_multi_view.py | 10 ++++------ 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/Multi_view_CH_kmeans.py b/Multi_view_CH_kmeans.py index 22cae2f..4ca72ff 100644 --- a/Multi_view_CH_kmeans.py +++ b/Multi_view_CH_kmeans.py @@ -304,12 +304,12 @@ def init_seeded_kmeans_plusplus(X, seed_set, n_clusters, x_squared_norms, random if seed_set is None or len(seed_set) == 0: random_index = np.random.choice(list(range(len(X)))) - init_clusters_seeds = [random_index] + seed_set = [random_index] else: - init_clusters_seeds, _ = seed_set + seed_set = seed_set # Pick first center randomly - centers[0] = X[init_clusters_seeds[0]] + centers[0] = X[seed_set[0]] # Pick first N centers from seeds @@ -322,8 +322,8 @@ def init_seeded_kmeans_plusplus(X, seed_set, n_clusters, x_squared_norms, random # Pick the remaining n_clusters-1 points for c in range(1, n_clusters): - if c < len(init_clusters_seeds): - true_cluster_seed = init_clusters_seeds[c] + if c < len(seed_set): + true_cluster_seed = seed_set[c] candidate_ids = np.array([true_cluster_seed]) else: # Choose center candidates by sampling with probability proportional diff --git a/embeddings_multi_view.py b/embeddings_multi_view.py index 61ea965..cb0664b 100644 --- a/embeddings_multi_view.py +++ b/embeddings_multi_view.py @@ -182,7 +182,7 @@ def totol_cluster2pair(cluster_list): seed_pair_list += iter_list return seed_pair_list -def initialize_cluster_seeds(X, num_cluster_seeds, ent2id, ent2embed, id_to_embedding_rows, true_clust2ent): +def initialize_cluster_seeds(num_cluster_seeds, ent2id, id_to_embedding_rows, true_clust2ent): """Init k-means clusters seeds according to k-means++ Parameters @@ -200,16 +200,14 @@ def initialize_cluster_seeds(X, num_cluster_seeds, ent2id, ent2embed, id_to_embe Return indices into the embedding matrix to use as cluster initializations """ seed_points = [] - seed_point_vectors = np.empty((num_cluster_seeds, len(X[0])), dtype=X[0].dtype) assert num_cluster_seeds <= len(true_clust2ent) cluster_names_to_sample = random.sample(true_clust2ent.keys(), num_cluster_seeds) - for i, cluster_name in enumerate(cluster_names_to_sample): + for cluster_name in cluster_names_to_sample: cluster_entities = true_clust2ent[cluster_name] random_entity = random.choice(list(cluster_entities)) entity_name = random_entity.split("|")[0] seed_points.append(id_to_embedding_rows[ent2id[entity_name]]) - 
seed_point_vectors[i] = ent2embed[ent2id[entity_name]] - return seed_points, seed_point_vectors + return seed_points class Embeddings(object): @@ -561,7 +559,7 @@ def fit(self): if self.p.kmeans_initialization == "seeded-k-means++": init = "seeded-k-means++" assert self.p.num_cluster_seeds <= n_cluster - seed_set = initialize_cluster_seeds(self.context_view_embed, self.p.num_cluster_seeds, self.side_info.ent2id, self.BERT_CLS, id_to_embedding_rows, self.true_clust2ent) + seed_set = initialize_cluster_seeds(self.p.num_cluster_seeds, self.side_info.ent2id, id_to_embedding_rows, self.true_clust2ent) print("TODO(Vijay): Not implemented") elif self.p.kmeans_initialization == "pc": breakpoint() From bd5136fc9bde2b6d09321ef1bfb54ee52cb3f111 Mon Sep 17 00:00:00 2001 From: Vijay Viswanathan Date: Tue, 28 Mar 2023 15:04:37 -0400 Subject: [PATCH 7/9] save --- CMVC_main_NYT.py | 2 +- CMVC_main_opiec.py | 3 +-- CMVC_main_reverb45k.py | 2 +- Multi_view_CH_kmeans.py | 27 +++++++++++++++++---------- embeddings_multi_view.py | 9 ++++++++- find_k_methods.py | 2 +- metrics.py | 28 ++++++++++++++++++++++++++-- preprocessing.py | 2 +- test_performance.py | 11 ++++++++--- train_embedding_model.py | 2 +- utils.py | 26 -------------------------- 11 files changed, 65 insertions(+), 49 deletions(-) delete mode 100644 utils.py diff --git a/CMVC_main_NYT.py b/CMVC_main_NYT.py index 166cd93..1adb26e 100644 --- a/CMVC_main_NYT.py +++ b/CMVC_main_NYT.py @@ -2,7 +2,7 @@ import gensim from preprocessing import SideInfo # For processing data and side information from embeddings_multi_view import Embeddings -from utils import * +from cmvc_utils import * import os, argparse, pickle, codecs from collections import defaultdict as ddict os.environ["CUDA_VISIBLE_DEVICES"] = "0" diff --git a/CMVC_main_opiec.py b/CMVC_main_opiec.py index 4c48be5..8e84ad9 100644 --- a/CMVC_main_opiec.py +++ b/CMVC_main_opiec.py @@ -1,10 +1,9 @@ from helper import * from preprocessing import SideInfo # For processing data and side information from embeddings_multi_view import Embeddings -from utils import * +from cmvc_utils import * import os, argparse, pickle, codecs from collections import defaultdict as ddict -os.environ["CUDA_VISIBLE_DEVICES"] = "1" ''' *************************************** DATASET PREPROCESSING **************************************** ''' class CMVC_Main(object): diff --git a/CMVC_main_reverb45k.py b/CMVC_main_reverb45k.py index acf9890..abb1428 100644 --- a/CMVC_main_reverb45k.py +++ b/CMVC_main_reverb45k.py @@ -2,7 +2,7 @@ import gensim from preprocessing import SideInfo # For processing data and side information from embeddings_multi_view import Embeddings -from utils import * +from cmvc_utils import * import os, argparse, pickle, codecs from collections import defaultdict as ddict os.environ["CUDA_VISIBLE_DEVICES"] = "0" diff --git a/Multi_view_CH_kmeans.py b/Multi_view_CH_kmeans.py index 4ca72ff..571c2dd 100644 --- a/Multi_view_CH_kmeans.py +++ b/Multi_view_CH_kmeans.py @@ -24,7 +24,7 @@ from sklearn.preprocessing._label import LabelEncoder from sklearn.metrics.cluster._unsupervised import check_number_of_labels -from utils import cosine_distance, cos_sim +from cmvc_utils import cosine_distance, cos_sim from test_performance import HAC_getClusters, cluster_test from warnings import simplefilter simplefilter(action='ignore', category=FutureWarning) @@ -303,7 +303,7 @@ def init_seeded_kmeans_plusplus(X, seed_set, n_clusters, x_squared_norms, random n_local_trials = 2 + int(np.log(n_clusters)) if seed_set is None or 
len(seed_set) == 0: - random_index = np.random.choice(list(range(len(X)))) + random_index = random_state.choice(list(range(len(X)))) seed_set = [random_index] else: seed_set = seed_set @@ -315,9 +315,8 @@ def init_seeded_kmeans_plusplus(X, seed_set, n_clusters, x_squared_norms, random # Initialize list of closest distances and calculate current potential closest_dist_sq = euclidean_distances( - centers[0, np.newaxis], X, Y_norm_squared=x_squared_norms, + X[seed_set[0], np.newaxis], X, Y_norm_squared=x_squared_norms, squared=True) - current_pot = closest_dist_sq.sum() # Pick the remaining n_clusters-1 points for c in range(1, n_clusters): @@ -328,9 +327,14 @@ def init_seeded_kmeans_plusplus(X, seed_set, n_clusters, x_squared_norms, random else: # Choose center candidates by sampling with probability proportional # to the squared distance to the closest existing center - rand_vals = random_state.random_sample(n_local_trials) * current_pot - candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq), - rand_vals) + # rand_vals = random_state.random_sample(n_local_trials) * current_pot + # candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq), rand_vals) + if len(closest_dist_sq.shape) == 2: + distances_normalized = closest_dist_sq[0] + else: + distances_normalized = closest_dist_sq + distances_normalized = distances_normalized / sum(distances_normalized) + candidate_ids = random_state.choice(range(len(distances_normalized)), p=distances_normalized, size=n_local_trials) # XXX: numerical imprecision can result in a candidate_id out of range np.clip(candidate_ids, None, closest_dist_sq.size - 1, @@ -342,15 +346,18 @@ def init_seeded_kmeans_plusplus(X, seed_set, n_clusters, x_squared_norms, random # update closest distances squared and potential for each candidate np.minimum(closest_dist_sq, distance_to_candidates, - out=distance_to_candidates) + out= distance_to_candidates) candidates_pot = distance_to_candidates.sum(axis=1) + # Decide which candidate is the best best_candidate = np.argmin(candidates_pot) current_pot = candidates_pot[best_candidate] closest_dist_sq = distance_to_candidates[best_candidate] best_candidate = candidate_ids[best_candidate] + breakpoint() + # Permanently add best center candidate found in local tries if sp.issparse(X): centers[c] = X[best_candidate].toarray() @@ -690,7 +697,7 @@ def multi_view_spherical_k_means( max_iter=300, verbose=False, tol=1e-4, - random_state=None, + random_state=0, copy_x=True, n_jobs=1, algorithm="auto", @@ -916,7 +923,7 @@ def __init__( tol=1e-4, n_jobs=1, verbose=0, - random_state=None, + random_state=0, copy_x=True, normalize=True, p=None, diff --git a/embeddings_multi_view.py b/embeddings_multi_view.py index cb0664b..366535c 100644 --- a/embeddings_multi_view.py +++ b/embeddings_multi_view.py @@ -1,6 +1,6 @@ import gensim, itertools, pickle, random, time from helper import * -from utils import cos_sim +from cmvc_utils import cos_sim from test_performance import cluster_test, HAC_getClusters from train_embedding_model import Train_Embedding_Model, pair2triples from Context_view import BERT_Model @@ -210,6 +210,7 @@ def initialize_cluster_seeds(num_cluster_seeds, ent2id, id_to_embedding_rows, tr return seed_points + class Embeddings(object): """ Learns embeddings for NPs and relation phrases @@ -540,6 +541,12 @@ def fit(self): print() print('Model is multi-view spherical-k-means') + breakpoint() + np.save(open("../data/OPIEC59k/relation_view_embed.npz", 'wb'), np.vstack(self.relation_view_embed)) + 
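+            # Editor's note: np.save always writes the single-array .npy
+            # format, even through an open handle named *.npz; np.savez
+            # would be needed to produce a real .npz archive.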
np.save(open("../data/OPIEC59k/context_view_embed.npz", 'wb'), np.vstack(self.context_view_embed)) + json.dump(self.side_info.id2sub, open("../data/OPIEC59k/entId2name.json", 'w'), indent=4) + + for random_seed in range(3): print('test time:', random_seed) diff --git a/find_k_methods.py b/find_k_methods.py index 2e5f949..0c425d7 100644 --- a/find_k_methods.py +++ b/find_k_methods.py @@ -3,7 +3,7 @@ from tqdm import tqdm from scipy.cluster.hierarchy import linkage, fcluster from scipy.spatial.distance import cdist, pdist, euclidean -from utils import cos_sim, normalize +from cmvc_utils import cos_sim, normalize def softmax(x): """Compute softmax values for each sets of scores in x.""" diff --git a/metrics.py b/metrics.py index 3294ae8..2cb24d8 100644 --- a/metrics.py +++ b/metrics.py @@ -55,6 +55,31 @@ def microPrecision(C_clust2ele, E_ele2clust): if total == 0: return 0 return float(num_prec) / float(total) + +''' + +for _, cluster in C_clust2ele.items(): + freq_map = {} + pair_map = defaultdict(list) + total += len(cluster) + + for ent in cluster: + if ent not in E_ele2clust: + # sys.stdout.write('.') + print(f"{ent} not found") + continue + for ele in E_ele2clust[ent]: + freq_map[ele] = freq_map.get(ele, 0) + freq_map[ele] += 1 + pair_map[ent].append(ele) + max_rep = 0 + for k, v in freq_map.items(): max_rep = max(max_rep, v) + print(freq_map) + break + +''' + + def pairPrecision(C_clust2ele, E_ele2clust): num_hit = 0 num_pairs = 0 @@ -98,7 +123,6 @@ def pairwiseMetric(C_clust2ele, E_ele2clust, E_clust2ent, max_pairs_from_cluster num_hit = 0 num_C_pairs = 0 num_E_pairs = 0 - for _, cluster in C_clust2ele.items(): all_pairs = capped_combinations(cluster, max_pairs_from_cluster) for e1, e2 in all_pairs: @@ -115,8 +139,8 @@ def pairwiseMetric(C_clust2ele, E_ele2clust, E_clust2ent, max_pairs_from_cluster if num_C_pairs == 0 or num_E_pairs == 0: return 1e-6, 1e-6 - # print( num_hit, num_C_pairs, num_E_pairs) + return float(num_hit) / float(num_C_pairs), float(num_hit) / float(num_E_pairs) diff --git a/preprocessing.py b/preprocessing.py index 11cd59b..35863b8 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -5,7 +5,7 @@ from helper import * import pdb, itertools from nltk.corpus import stopwords -from utils import * +from cmvc_utils import * import pickle '''*************************************** INPUT CLASS ********************************************''' diff --git a/test_performance.py b/test_performance.py index 51bc2cf..ea053e0 100644 --- a/test_performance.py +++ b/test_performance.py @@ -1,6 +1,7 @@ from helper import * -from utils import * +from cmvc_utils import * from metrics import evaluate # Evaluation metrics +from metrics import macroPrecision, calcF1, pairwiseMetric from scipy.cluster.hierarchy import linkage, fcluster from scipy.spatial.distance import pdist from tqdm import tqdm @@ -62,18 +63,22 @@ def cluster_test(params, side_info, cluster_predict_list, true_ent2clust, true_c triples = side_info.triples ent2id = side_info.ent2id + # Sub_cluster_predict_list is the same as cluster_predict_list. for eid in isSub.keys(): sub_cluster_predict_list.append(cluster_predict_list[eid]) + # Map each cluster to a list of indices of points in that cluster. for sub_id, cluster_id in enumerate(sub_cluster_predict_list): if cluster_id in clust2ent.keys(): clust2ent[cluster_id].append(sub_id) else: clust2ent[cluster_id] = [sub_id] + + # Map each cluster to a set of indices of points in that cluster. 
cesi_clust2ent = {} - for rep, cluster in clust2ent.items(): + for rep, cluster_points in clust2ent.items(): # cesi_clust2ent[rep] = list(cluster) - cesi_clust2ent[rep] = set(cluster) + cesi_clust2ent[rep] = set(cluster_points) cesi_ent2clust = invertDic(cesi_clust2ent, 'm2os') cesi_ent2clust_u = {} diff --git a/train_embedding_model.py b/train_embedding_model.py index e960f9f..7011e08 100644 --- a/train_embedding_model.py +++ b/train_embedding_model.py @@ -1,7 +1,7 @@ from torch.utils.data import DataLoader from helper import * -from utils import cos_sim +from cmvc_utils import cos_sim from dataloader_max_margin import * from model_max_margin import KGEModel import pickle diff --git a/utils.py b/utils.py deleted file mode 100644 index b3dc5af..0000000 --- a/utils.py +++ /dev/null @@ -1,26 +0,0 @@ -import numpy as np - -def normalize(v): - norm = np.linalg.norm(v) - if norm == 0: - return v - return v / norm - -def cos_sim(a, b): - a = np.array(a) - b = np.array(b) - a_norm = np.linalg.norm(a) - b_norm = np.linalg.norm(b) - cos_theta = float(np.dot(a, b) / (a_norm * b_norm)) - cos_theta = 0.5 + 0.5 * cos_theta - return cos_theta - -def cosine_distance(a, b): - a = np.array(a) - b = np.array(b) - a_norm = np.linalg.norm(a) - b_norm = np.linalg.norm(b) - cos_theta = float(np.dot(a, b) / (a_norm * b_norm)) - cos_distance = 1 - cos_theta - # cos_distance = 0.5 - 0.5 * cos_theta - return cos_distance \ No newline at end of file From 2e276fba9eb05758fc0e37b8338d0bd9560904ec Mon Sep 17 00:00:00 2001 From: Vijay Viswanathan Date: Sun, 18 Jun 2023 22:31:07 -0400 Subject: [PATCH 8/9] save --- CMVC_main_opiec.py | 4 +++- CMVC_main_reverb45k.py | 26 +++++++++++++++----------- Context_view.py | 12 ++++++++---- embeddings_multi_view.py | 10 ++++------ preprocessing.py | 10 +++++++++- train_embedding_model.py | 5 ++++- 6 files changed, 43 insertions(+), 24 deletions(-) diff --git a/CMVC_main_opiec.py b/CMVC_main_opiec.py index 8e84ad9..753d1c7 100644 --- a/CMVC_main_opiec.py +++ b/CMVC_main_opiec.py @@ -4,6 +4,8 @@ from cmvc_utils import * import os, argparse, pickle, codecs from collections import defaultdict as ddict +os.environ["CUDA_VISIBLE_DEVICES"] = "1" + ''' *************************************** DATASET PREPROCESSING **************************************** ''' class CMVC_Main(object): @@ -117,7 +119,7 @@ def embedKG(self): description='CESI: Canonicalizing Open Knowledge Bases using Embeddings and Side Information') parser.add_argument('-data', dest='dataset', default='OPIEC59k', help='Dataset to run CESI on') parser.add_argument('-split', dest='split', default='test', help='Dataset split for evaluation') - parser.add_argument('-data_dir', dest='data_dir', default='../data', help='Data directory') + parser.add_argument('-l', dest='data_dir', default='../data', help='Data directory') parser.add_argument('-out_dir', dest='out_dir', default='../output', help='Directory to store CESI output') parser.add_argument('-reset', dest="reset", action='store_true', default=True, help='Clear the cached files (Start a fresh run)') diff --git a/CMVC_main_reverb45k.py b/CMVC_main_reverb45k.py index abb1428..37001de 100644 --- a/CMVC_main_reverb45k.py +++ b/CMVC_main_reverb45k.py @@ -5,7 +5,6 @@ from cmvc_utils import * import os, argparse, pickle, codecs from collections import defaultdict as ddict -os.environ["CUDA_VISIBLE_DEVICES"] = "0" ''' *************************************** DATASET PREPROCESSING **************************************** ''' @@ -23,10 +22,10 @@ def read_triples(self): self.amb_mentions 
= {} # Contains all ambiguous mentions self.isAcronym = {} # Contains all mentions which can be acronyms - print('dataset:', args.dataset) - if args.dataset == 'OPIEC': + print('dataset:', self.p.dataset) + if self.p.dataset == 'OPIEC': print('load OPIEC_dataset ... ') - self.triples_list = pickle.load(open(args.data_path, 'rb')) + self.triples_list = pickle.load(open(self.p.data_path, 'rb')) ''' Ground truth clustering ''' self.true_ent2clust = ddict(set) @@ -37,7 +36,7 @@ def read_triples(self): else: if not checkFile(fname): - with codecs.open(args.data_path, encoding='utf-8', errors='ignore') as f: + with codecs.open(self.p.data_path, encoding='utf-8', errors='ignore') as f: for line in f: trp = json.loads(line.strip()) @@ -87,17 +86,17 @@ def read_triples(self): print('self.true_clust2ent:', len(self.true_clust2ent)) print('self.true_ent2clust:', len(self.true_ent2clust)) - folder = '../file/' + args.dataset + '/' + folder = '../file/' + self.p.dataset + '/' if not os.path.exists(folder): os.makedirs(folder) - fname1, fname2 = '../file/' + args.dataset + '/self.ent2true_link_list', '../file/' + args.dataset + '/self.ent2true_link' + fname1, fname2 = '../file/' + self.p.dataset + '/self.ent2true_link_list', '../file/' + self.p.dataset + '/self.ent2true_link' if not checkFile(fname1) or not checkFile(fname2): print('generate ent2true_link_dict') self.ent2true_link_list = dict() for trp in self.triples_list: sub, obj = trp['triple'][0], trp['triple'][2] - if args.dataset == 'OPIEC': + if self.p.dataset == 'OPIEC': true_sub_link, true_obj_link = trp['subject_wiki_link'], trp['object_wiki_link'] else: true_sub_link, true_obj_link = trp['true_sub_link'], trp['true_obj_link'] @@ -161,7 +160,7 @@ def embedKG(self): if not checkFile(fname1) or not checkFile(fname2): embed = Embeddings(self.p, self.side_info, true_ent2clust=self.true_ent2clust, - true_clust2ent=self.true_clust2ent, triple_list=self.triples_list) + true_clust2ent=self.true_clust2ent, triple_list=self.triples_list, num_reinit=self.p.num_reinit) embed.fit() self.ent2embed = embed.ent2embed # Get the learned NP embeddings @@ -259,11 +258,11 @@ def embedKG(self): help='Otherwise use subsampling weighting like in word2vec', default=True) parser.add_argument('-lr', '--learning_rate', default=0.0001, type=float) - parser.add_argument('-cpu', '--cpu_num', default=12, type=int) + parser.add_argument('-cpu', '--cpu_num', default=4, type=int) parser.add_argument('-init', '--init_checkpoint', default=None, type=str) parser.add_argument('--warm_up_steps', default=None, type=int) - parser.add_argument('--save_checkpoint_steps', default=10000, type=int) + parser.add_argument('--save_checkpoint_steps', default=1000, type=int) parser.add_argument('--valid_steps', default=10000, type=int) parser.add_argument('--log_steps', default=100, type=int, help='train log every xx steps') parser.add_argument('--test_log_steps', default=1000, type=int, help='valid/test log every xx steps') @@ -271,6 +270,11 @@ def embedKG(self): parser.add_argument('--nentity', type=int, default=0, help='DO NOT MANUALLY SET') parser.add_argument('--nrelation', type=int, default=0, help='DO NOT MANUALLY SET') parser.add_argument('-embed_dims', dest='embed_dims', default=300, type=int, help='Embedding dimension') + parser.add_argument('--kmeans_initialization', default="k-means++", type=str, choices=["k-means++", "seeded-k-means++", "pc"], help='Embedding dimension') + parser.add_argument('--num_cluster_seeds', default=None, type=int, help='Number of cluster seeds to use, if 
simulating user seed feedback') + parser.add_argument('--num_reinit', type=int, default=10, help="Number of reinitializations to try for k-Means clustering") + parser.add_argument('--save_model', action="store_true", help="Whether or not to serialize and save model outputs and parameters to disk") + parser.add_argument('--unnormalize', action="store_true", help="Whether to normalize each point before clustering") # word2vec and iteration hyper-parameters parser.add_argument('-retrain_literal_embeds', dest='retrain_literal_embeds', default=True, diff --git a/Context_view.py b/Context_view.py index ed72f36..6a97f78 100644 --- a/Context_view.py +++ b/Context_view.py @@ -22,9 +22,12 @@ def __del__(self): print("BertClassificationModel del ... ") def forward(self, batch_sentences): - batch_tokenized = self.tokenizer.batch_encode_plus(batch_sentences, add_special_tokens=True, - max_length=self.max_length, - pad_to_max_length=True) + try: + batch_tokenized = self.tokenizer.batch_encode_plus(batch_sentences, add_special_tokens=True, + max_length=self.max_length, + pad_to_max_length=True) + except: + breakpoint() input_ids = torch.tensor(batch_tokenized['input_ids']).cuda() attention_mask = torch.tensor(batch_tokenized['attention_mask']).cuda() bert_output = self.bert(input_ids, attention_mask=attention_mask) @@ -44,7 +47,7 @@ def __init__(self, params, side_info, input_list, cluster_predict_list, true_ent self.BERT_self_training_time = BERT_self_training_time self.sub_uni2triple_dict = sub_uni2triple_dict self.rel_id2sentence_list = rel_id2sentence_list - self.batch_size = 40 + self.batch_size = 30 if self.p.dataset == 'reverb45k_change': self.epochs = 100 else: @@ -156,6 +159,7 @@ def fine_tune(self): if i == (batch_count - 1): real_time = time.strftime("%Y_%m_%d") + ' ' + time.strftime("%H:%M:%S") print(real_time, "Epoch: %d, Loss: %.4f" % (epoch, avg_epoch_loss)) + breakpoint() self.BERT_CLS = cls_output.detach().cpu().numpy() pickle.dump(self.BERT_CLS, open(fname1, 'wb')) diff --git a/embeddings_multi_view.py b/embeddings_multi_view.py index 366535c..26c5edf 100644 --- a/embeddings_multi_view.py +++ b/embeddings_multi_view.py @@ -459,7 +459,7 @@ def fit(self): pickle.dump(self.label, open(fname1, 'wb')) else: print('load BERT_fine-tune_', fname1) - self.label = pickle.load(open(fname1, 'rb')) + self.label = pickle.load(open(fname1, 'srb')) context_view_label = self.label old_label, new_label = context_view_label, self.label print('old_label : ') @@ -541,10 +541,9 @@ def fit(self): print() print('Model is multi-view spherical-k-means') - breakpoint() - np.save(open("../data/OPIEC59k/relation_view_embed.npz", 'wb'), np.vstack(self.relation_view_embed)) - np.save(open("../data/OPIEC59k/context_view_embed.npz", 'wb'), np.vstack(self.context_view_embed)) - json.dump(self.side_info.id2sub, open("../data/OPIEC59k/entId2name.json", 'w'), indent=4) + np.save(open("../data/OPIEC59k/valid_relation_view_embed.npz", 'wb'), np.vstack(self.relation_view_embed)) + np.save(open("../data/OPIEC59k/valid_context_view_embed.npz", 'wb'), np.vstack(self.context_view_embed)) + json.dump(self.side_info.id2sub, open("../data/OPIEC59k/valid_entId2name.json", 'w'), indent=4) @@ -569,7 +568,6 @@ def fit(self): seed_set = initialize_cluster_seeds(self.p.num_cluster_seeds, self.side_info.ent2id, id_to_embedding_rows, self.true_clust2ent) print("TODO(Vijay): Not implemented") elif self.p.kmeans_initialization == "pc": - breakpoint() init = "seeded-k-means++" seed_set = np.array([]) print("TODO(Vijay): Not implemented") diff 
--git a/preprocessing.py b/preprocessing.py index 35863b8..70c89ff 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -32,6 +32,8 @@ def process(self): fname16, fname17, fname18 = self.folder_to_make + '/self.id2obj', self.folder_to_make + '/self.ent_id2sentence_list', self.folder_to_make + '/self.sentence_list' fname19, fname20 = self.folder_to_make + '/self.ent2triple_id_list', self.folder_to_make + '/self.rel2triple_id_list' + sentence_unprocessed_mapping = {} + if not checkFile(fname1) or not checkFile(fname2): print('generate side_info') ent1List, relList, ent2List = [], [], [] # temp variables @@ -68,8 +70,10 @@ def process(self): self.ent2triple_id_list[obj].append(triple_num) for sentence in triple['src_sentences']: + original_sentence = sentence if self.p.replace_h: sentence = sentence.replace(str(triple[self.triple_str][0]), '') + sentence_unprocessed_mapping[sentence] = original_sentence sentence_ = word_tokenize(sentence) sentence = str() for i in range(len(sentence_)): @@ -81,10 +85,12 @@ def process(self): sentence += str(w) if not i == len(sentence_) - 1: sentence += ' ' + sentence_unprocessed_mapping[sentence] = original_sentence # print('sentenceļ¼š', type(sentence), len(sentence), sentence) if len(sentence) == 0: sentence += str(triple[self.triple_str][0]) # print('sentenceļ¼š', type(sentence), len(sentence), sentence) + sentence_unprocessed_mapping[sentence] = original_sentence self.sentence_List.append(sentence) triple2sentence[triple_num].append(sentence_num) if len(self.sentence_List) == 0: @@ -93,6 +99,8 @@ def process(self): sentence_num += 1 triple_num += 1 + breakpoint() + print('relList:', len(relList)) # 35812 print('ent1List:', len(ent1List)) # 35812 print('ent2List:', len(ent2List)) # 35812 @@ -555,4 +563,4 @@ def initVariables(self): self.rel_freq = {} # Relation to its frequency self.ent2name_seed = {} - self.rel2name_seed = {} \ No newline at end of file + self.rel2name_seed = {} diff --git a/train_embedding_model.py b/train_embedding_model.py index 7011e08..294c172 100644 --- a/train_embedding_model.py +++ b/train_embedding_model.py @@ -1,4 +1,5 @@ +import logging from torch.utils.data import DataLoader from helper import * from cmvc_utils import cos_sim @@ -325,5 +326,7 @@ def train(self): KGEModel.log_metrics(self.p, 'Training average', step, metrics) training_logs = [] + breakpoint() self.entity_embedding = kge_model.entity_embedding.detach().cpu().numpy() - self.relation_embedding = kge_model.relation_embedding.detach().cpu().numpy() \ No newline at end of file + self.relation_embedding = kge_model.relation_embedding.detach().cpu().numpy() + return self.entity_embedding, self.relation_embedding \ No newline at end of file From 7509d7454c29f7bc345b828d8b302a1f0aadde0e Mon Sep 17 00:00:00 2001 From: Vijay Viswanathan Date: Thu, 10 Aug 2023 12:57:22 -0400 Subject: [PATCH 9/9] Update CMVC utils file --- cmvc_utils.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 cmvc_utils.py diff --git a/cmvc_utils.py b/cmvc_utils.py new file mode 100644 index 0000000..b3dc5af --- /dev/null +++ b/cmvc_utils.py @@ -0,0 +1,26 @@ +import numpy as np + +def normalize(v): + norm = np.linalg.norm(v) + if norm == 0: + return v + return v / norm + +def cos_sim(a, b): + a = np.array(a) + b = np.array(b) + a_norm = np.linalg.norm(a) + b_norm = np.linalg.norm(b) + cos_theta = float(np.dot(a, b) / (a_norm * b_norm)) + cos_theta = 0.5 + 0.5 * cos_theta + return cos_theta + +def cosine_distance(a, b): + a = np.array(a) + b = np.array(b) + 
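+    # returns cosine similarity rescaled from [-1, 1] to [0, 1] via
+    # 0.5 + 0.5 * cos(theta); zero-norm inputs trigger a divide-by-zero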
a_norm = np.linalg.norm(a) + b_norm = np.linalg.norm(b) + cos_theta = float(np.dot(a, b) / (a_norm * b_norm)) + cos_theta = 0.5 + 0.5 * cos_theta + return cos_theta + +def cosine_distance(a, b): + a = np.array(a) + b = np.array(b) + a_norm = np.linalg.norm(a) + b_norm = np.linalg.norm(b) + cos_theta = float(np.dot(a, b) / (a_norm * b_norm)) + cos_distance = 1 - cos_theta + # cos_distance = 0.5 - 0.5 * cos_theta + return cos_distance \ No newline at end of file
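[Editor's note] The new cmvc_utils.py is byte-identical to the utils.py deleted in PATCH 7/9 (same blob hash b3dc5af); the truncated tail of cos_sim and head of cosine_distance above were restored from that earlier listing. A quick sanity check for these helpers (editor's sketch, not part of the patch series):

    import numpy as np
    from cmvc_utils import normalize, cos_sim, cosine_distance

    a, b = np.array([1.0, 0.0]), np.array([1.0, 1.0])
    assert np.isclose(np.linalg.norm(normalize(b)), 1.0)
    # cos_sim is rescaled to [0, 1]: identical vectors give 1.0, opposite 0.0
    assert np.isclose(cos_sim(a, a), 1.0) and np.isclose(cos_sim(a, -a), 0.0)
    # cosine_distance keeps the conventional 1 - cos(theta) form
    assert np.isclose(cosine_distance(a, b), 1.0 - 1.0 / np.sqrt(2.0))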