-
Notifications
You must be signed in to change notification settings - Fork 29
/
kmeans.py
53 lines (42 loc) · 1.77 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import numpy as np
from sklearn.cluster import KMeans
from joblib import Parallel, delayed
def _parallel_compute_distance(X, cluster):
    """Return the Euclidean distance from every row of `X` to `cluster`.

    `X` is expected to be a 2-D array of shape (n_samples, n_features) and
    `cluster` a 1-D array of length n_features. The result is a float64
    column vector of shape (n_samples, 1) so that the per-cluster results
    can be stacked side by side by the caller.
    """
    n_samples = X.shape[0]
    dis_mat = np.zeros((n_samples, 1))
    # Broadcasting `cluster` against all rows at once replaces the per-sample
    # Python loop; the reduction runs over the feature axis.
    dis_mat[:, 0] = np.sqrt(np.sum((X - cluster) ** 2, axis=1))
    return dis_mat
class batch_KMeans(object):
    """K-means whose centroids are refined incrementally, batch by batch.

    Centroids live in `self.clusters` (one row per cluster) and
    `self.count` holds a per-cluster counter whose reciprocal is used as the
    step size when a sample is folded into its centroid.
    """

    def __init__(self, args):
        """Read `latent_dim`, `n_clusters` and `n_jobs` from `args`."""
        self.args = args
        self.n_features = args.latent_dim
        self.n_clusters = args.n_clusters
        # Centroids start at the origin until `init_cluster` is called.
        self.clusters = np.zeros((self.n_clusters, self.n_features))
        # Counters start at 100, so the first step size is 1/101.
        self.count = np.full(self.n_clusters, 100.0)
        self.n_jobs = args.n_jobs

    def _compute_dist(self, X):
        """Return the sample-to-centroid distance matrix for `X`.

        One joblib task is dispatched per centroid; each task yields a
        column, and the columns are stacked horizontally in cluster order.
        """
        tasks = (
            delayed(_parallel_compute_distance)(X, self.clusters[k])
            for k in range(self.n_clusters)
        )
        columns = Parallel(n_jobs=self.n_jobs)(tasks)
        return np.hstack(columns)

    def init_cluster(self, X, indices=None):
        """Seed the centroids by fitting sklearn's KMeans on `X`.

        `indices` is accepted but not used. The fitted model's
        `cluster_centers_` array is stored by reference, not copied.
        """
        fitted = KMeans(n_clusters=self.n_clusters, n_init=20).fit(X)
        self.clusters = fitted.cluster_centers_

    def update_cluster(self, X, cluster_idx):
        """Fold every sample of `X` into the centroid `cluster_idx`.

        Samples are applied one at a time: the counter is incremented
        first, then the centroid moves towards the sample by 1 / counter.
        """
        for sample_idx in range(X.shape[0]):
            self.count[cluster_idx] += 1
            step = 1.0 / self.count[cluster_idx]
            self.clusters[cluster_idx] = (
                (1 - step) * self.clusters[cluster_idx] + step * X[sample_idx]
            )

    def update_assign(self, X):
        """Return, for each sample in `X`, the index of its nearest centroid."""
        return self._compute_dist(X).argmin(axis=1)