k_means_clustering.py
"""
Analysis of Enron emails, with a focus on Tim Belden
Code to load tim beldens contacts, then get a list of their contacts which in turn we get emails from.
This amounts to a "neighbors of neighbors" search and will return a large subset of the enron mails.
Then performs a k means clustering and outputs the location of a few interesting parties emails, as well
as the topics of each of the clusters.
"""
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pymongo import MongoClient
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from time import time

sns.set_style("whitegrid")
# Some general inputs:
# persons_of_interest is a list including Tim Belden, our focus, and some
# other executives implicated in the scandal. The addresses were redacted
# ("[email protected]") in this copy of the file; tim.belden@enron.com is the
# address used in the Enron corpus, and the other executives' addresses
# could not be recovered.
persons_of_interest = ["tim.belden@enron.com"]
# and the number of clusters:
N_CLUSTERS = 10
# Wrap everything in an `if __name__ == "__main__":` guard to avoid
# multiprocessing problems (n_jobs=-1 below spawns worker processes).
if __name__ == "__main__":
    # Connect to mongodb and use the enron_mail db: all 500k+ emails.
    cn = MongoClient("localhost")
    db = cn.enron_mail
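    # The queries below assume each message document looks roughly like
    # this (field names taken from the queries in this script; the exact
    # shape of your import may differ):
    #   {"_id": ..., "senders": "alice@enron.com",
    #    "recipients": ["bob@enron.com", ...], "text": "..."}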
    # Get every message to or from Belden
    # (tim.belden@enron.com stands in for the redacted address; see above).
    belden_contacts = db.messages.find({"$or": [{"senders": "tim.belden@enron.com"},
                                                {"recipients": "tim.belden@enron.com"}],
                                        "senders": {"$ne": None}, "recipients": {"$ne": None}},
                                       {"_id": 0, "recipients": 1, "senders": 1})
    # Collect the recipients and senders of each of those messages into a
    # set: this is Belden's contact list.
    emails = set()
    for doc in belden_contacts:
        recs = doc["recipients"]
        sender = doc["senders"]
        emails.update(rec.replace("\n\t", "") for rec in recs)
        emails.add(sender.replace("\n\t", ""))
    print(len(emails), "addresses in Belden's contacts")
    # Get all of the messages sent or received by Belden's contacts,
    # i.e. his contacts' contacts' emails.
    belden_emails = db.messages.find({"$or": [{"senders": {"$in": list(emails)}},
                                              {"recipients": {"$in": list(emails)}}]},
                                     {"_id": 1, "text": 1, "recipients": 1, "senders": 1})
    # Cursor.count() was removed in PyMongo 4; on newer versions use
    # db.messages.count_documents(...) instead.
    print(belden_emails.count(), "emails in Belden's contacts' contacts")
    # Put it into a dataframe to feed into scikit-learn.
    emails_df = pd.DataFrame(list(belden_emails))
    print(emails_df.shape)
    # print(emails_df.head(5))
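    # The vectoriser below expects one string per row; if any documents
    # are missing a "text" field the DataFrame will hold NaN there, so it
    # is safer to fill those first (a defensive step, not in the original):
    # emails_df["text"] = emails_df["text"].fillna("")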
    # Use a tf-idf vectoriser: drop terms in fewer than 5% or more than
    # 50% of documents, and keep only alphabetic tokens of 3+ letters.
    vect = TfidfVectorizer(sublinear_tf=True,
                           min_df=0.05, max_df=0.5,
                           stop_words="english",
                           token_pattern=r"\b[A-Za-z]{3}[A-Za-z]*\b")
    # k-means clustering with N_CLUSTERS; 50 iterations should do it, and
    # we initialise 10 times to make sure. Note that n_jobs was removed
    # from KMeans in scikit-learn 1.0; drop it on newer versions.
    clf = KMeans(n_clusters=N_CLUSTERS,
                 random_state=0,
                 max_iter=50,
                 init="k-means++",
                 n_init=10,
                 verbose=True,
                 n_jobs=-1)
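    # On a corpus this size MiniBatchKMeans is a much faster drop-in
    # alternative (a sketch, not the original author's choice):
    # from sklearn.cluster import MiniBatchKMeans
    # clf = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=0, n_init=10)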
print("Clustering data with %s" % clf)
# run clustering, takes about 20 minutes on a fast PC.
t0 = time()
word_vec = vect.fit_transform(emails_df["text"])
labels = clf.fit_predict(word_vec)
print("done in %0.3fs" % (time() - t0))
word_vec_2d = word_vec.todense()
emails_df["label"] = labels
    # Sort each cluster centroid's term weights in descending order, then
    # print the top N_CLUSTERS terms for each cluster.
    order_centroids = clf.cluster_centers_.argsort()[:, ::-1]
    # get_feature_names() was removed in scikit-learn 1.2; use
    # get_feature_names_out() on newer versions.
    terms = vect.get_feature_names()
    for i in range(N_CLUSTERS):
        print("Cluster %d:" % i, end="")
        for ind in order_centroids[i, :N_CLUSTERS]:
            print(" %s" % terms[ind], end="")
        print()
    # For each person of interest, count how many of their emails fall in
    # each cluster. regex=False matches the address literally (the dots
    # would otherwise be regex wildcards), and na=False skips NaN senders.
    for person in persons_of_interest:
        summary_df = emails_df[emails_df["senders"].str.contains(person, regex=False, na=False)]
        summary_df = summary_df.groupby(["label"]).size()
        print(person + "'s categories:")
        print(summary_df)
    # Principal component analysis to reduce the emails down to 2 dimensions.
    pca = PCA(n_components=2).fit(word_vec_dense)
    datapoint = pca.transform(word_vec_dense)
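    # PCA needs the dense matrix above; TruncatedSVD works on the sparse
    # tf-idf matrix directly and avoids the densify step (a sketch, not
    # the original author's choice):
    # from sklearn.decomposition import TruncatedSVD
    # datapoint = TruncatedSVD(n_components=2, random_state=0).fit_transform(word_vec)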
    datapoint_df = pd.DataFrame(datapoint, columns=["x", "y"])
    # Add in the cluster labels to colour the points by.
    datapoint_df["label"] = labels
    # colour palette, one colour per cluster
    clrs = list(sns.color_palette("Set2", N_CLUSTERS))
    # Get the locations of the centroid points and print them for reference.
    X = clf.cluster_centers_
    centroidpoint = pd.DataFrame(pca.transform(X), columns=["x", "y"])
    print(centroidpoint)
    # Plot the emails as a scatter; with luck they are arranged in visible clusters.
    p1 = sns.lmplot(data=datapoint_df, x="x", y="y", fit_reg=False,
                    hue="label", palette=clrs, scatter_kws={"s": 10})
    plt.show()
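# Assumed usage (not stated in the original): a local mongod with the
# Enron corpus imported into the enron_mail.messages collection, then
#   python k_means_clustering.py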