-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmasked_wordcloud.py
53 lines (43 loc) · 1.35 KB
/
masked_wordcloud.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import nltk
from pymongo import MongoClient
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import datetime
from os import path
from os import stat
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
import matplotlib.pyplot as plt
# Build the plain-text corpus that the word cloud is generated from.
# all_text.txt doubles as a cache: the MongoDB collection is only scanned
# when that file is empty (or was just created), so re-runs reuse the
# earlier export instead of tokenizing every mail again.
cn = MongoClient('localhost')
db = cn.enron_mail_2
counter = 0
# "a+" creates the file when it does not exist yet, so the stat() call below
# cannot fail. The with-block guarantees that everything written here is
# flushed and the handle closed before the file is read back later on.
with open("all_text.txt", "a+") as f:
    if stat("all_text.txt").st_size == 0:
        # Load the stop-word list once; a set gives O(1) membership tests
        # instead of rebuilding the list for every single token.
        english_stopwords = set(stopwords.words('english'))
        for counter, document in enumerate(db.mail.find(), start=1):
            # Keep only the text before the first "To:" marker; hopefully
            # this removes forwarded emails and quoted older emails.
            body = document["text"].split("To:")[0]
            # Tokenize the mail so individual words can be filtered.
            mail = [
                word for word in nltk.word_tokenize(body)
                if word not in english_stopwords
            ]
            # The trailing space keeps the last word of this mail from
            # fusing with the first word of the next one in the file.
            f.write(" ".join(mail) + " ")
            # Progress report: counter is the number of mails processed.
            if counter % 100 == 0:
                print(datetime.datetime.utcnow(), counter)
# Render the word cloud from the exported corpus, save it, and show it next
# to the mask image it was shaped with.
# Read the corpus under a context manager so the handle is closed promptly.
with open("all_text.txt") as corpus_file:
    text = corpus_file.read()

# The logo image is converted to an array and passed as the WordCloud mask.
enron_mask = np.array(Image.open('EnronLogo.png'))
wc = WordCloud(background_color="white", max_words=2000, mask=enron_mask)
wc.generate(text)
wc.to_file("enron_logo_wc.png")

# First figure: the generated word cloud, without axes.
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
# Second figure: the raw mask image, for comparison.
plt.figure()
plt.imshow(enron_mask, interpolation='bilinear')
plt.axis("off")
plt.show()