-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathclustering.py
59 lines (48 loc) · 1.4 KB
/
clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from sklearn.cluster import KMeans
import sys
import json
import numpy as np
from sklearn import decomposition
import matplotlib.pyplot as plt
# Both input paths come from the command line: argv[1] is read here as a
# comma-separated feature matrix (one row per line); argv[2] is only opened
# here and stays open under the name `tweet_data` for the code that reads it.
feature_data = open(sys.argv[1], 'r')
tweet_data = open(sys.argv[2], 'r')

# Parse each row into floats up front instead of handing string cells
# (the last one still carrying its newline) to the estimator, and skip
# whitespace-only lines so they cannot produce ragged rows.
# The `with` block closes the feature file as soon as it has been read.
with feature_data:
    X = [[float(value) for value in line.split(',')]
         for line in feature_data if line.strip()]
X = np.array(X)
# Number of clusters requested from k-means.
num_clusters = 10

# NOTE(review): n_init=1 and max_iter=1 mean a single initialisation and a
# single iteration; left as-is -- confirm this is intentional and not a
# leftover from a quick test run.
kmeans = KMeans(n_clusters=num_clusters,
                random_state=0, n_init=1, init='k-means++',
                verbose=1, max_iter=1).fit(X)

# Single-argument print calls emit the same text on Python 2 and Python 3.
print("cluster centers:\n" + str(kmeans.cluster_centers_))

# Write one centre per line. The previous code passed a single joined string
# to writelines() without a line terminator, so every centre ended up on the
# same line with neighbouring values fused together. The `with` block also
# guarantees the file is flushed and closed.
with open('cluster_centres.csv', 'w') as cluster_file:
    for centre in kmeans.cluster_centers_:
        cluster_file.write(','.join(map(str, centre.tolist())) + '\n')

print("cluster inertia: \n" + str(kmeans.inertia_))

# Cluster index assigned to each input row, in row order.
cluster_labels = kmeans.labels_
# Collect the 'text' field of every JSON record (one record per line).
# Using the already-open handle as a context manager closes it once the
# file has been consumed, which the previous loop never did.
# Whitespace-only lines are skipped because json.loads() would raise on them.
# NOTE(review): assumes such blank lines have no matching feature row, so the
# tweets stay aligned with the feature matrix -- confirm against the producer
# of the feature file.
with tweet_data:
    tweets = [json.loads(line)['text'] for line in tweet_data if line.strip()]

# An array allows fancy indexing with arrays of row positions.
tweets = np.array(tweets)
# Placeholder for per-cluster sizes; nothing in this section fills it in.
cluster_counts = {}

# For each cluster, report how many rows it contains and show up to five of
# the tweets assigned to it.
for cluster_id in range(num_clusters):
    # Row positions whose k-means label equals this cluster.
    member_rows = np.nonzero(cluster_labels == cluster_id)[0]
    # Space-joined pieces reproduce the spacing of a multi-argument print.
    header = ' '.join(["\ntweets cluster: ", str(cluster_id), ' count: ',
                       str(len(member_rows)), '\n'])
    print(header)
    print(tweets[member_rows[:5]])
# Project the feature matrix onto its first two principal components and
# draw a scatter plot of the result.
Y = cluster_labels
pca = decomposition.PCA(n_components=2)
# fit() returns the estimator itself, so fitting and projecting can be chained;
# X is rebound to the 2-D projection from here on.
X = pca.fit(X).transform(X)

# Quick look at a few projected rows and their labels (rows 1 to 4).
print(X[1:5, :])
print(Y[1:5])

# Every point is drawn in white first, then clusters 1 and 0 are drawn on top
# in red and blue respectively.
# NOTE(review): the remaining clusters keep the white colour -- confirm that
# highlighting only two clusters is intended.
plt.scatter(X[:, 0], X[:, 1], color='w')
for label, colour in ((1, 'r'), (0, 'b')):
    selected = (Y == label)
    plt.scatter(X[selected, 0], X[selected, 1], color=colour)

plt.ylabel('PC2')
plt.xlabel('PC1')
plt.show()