-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathBiterm_sampler.py
116 lines (95 loc) · 3.55 KB
/
Biterm_sampler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
__author__ = 'jcapde87'
import sys, pickle
import numpy as np
from os.path import expanduser
PROJECT_PATH = "/Documents/Tweet-models/Biterm/"
def gibbs_sampler_LDA(It, V, B, num_topics, b, alpha=1., beta=0.1):
    """Gibbs-sample one topic assignment per biterm.

    Every biterm in ``b`` is first given a topic drawn from a single
    Dirichlet(alpha) sample.  Then, for ``It`` sweeps, each biterm's
    assignment is removed from the counts, a new topic is drawn from the
    conditional built from the remaining counts, and the counts are
    restored with the new topic.

    Args:
        It: number of sweeps over all biterms.
        V: vocabulary size; word ids in ``b`` index the rows of ``Nwz``.
        B: number of biterms; sizes the assignment vector ``Z``.
        num_topics: number of topics.
        b: sequence of biterms, each indexable as ``(word_id_1, word_id_2)``.
        alpha: smoothing added to the per-topic biterm counts.
        beta: smoothing added to the word-topic counts.

    Returns:
        Tuple ``(Nz, Nwz, Z)``:
        ``Nz`` -- array of length ``num_topics`` with biterms per topic,
        ``Nwz`` -- ``(V, num_topics)`` array of word-topic counts,
        ``Z`` -- integer array of length ``B`` with the topic of each biterm.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Biterm model ------ ")
    print("Corpus length: " + str(len(b)))
    print("Number of topics: " + str(num_topics))
    print("alpha: " + str(alpha) + " beta: " + str(beta))

    # Topic ids are used as array indices, so they must be integers
    # (NumPy rejects float indices).
    Z = np.zeros(B, dtype=int)
    Nwz = np.zeros((V, num_topics))
    Nz = np.zeros(num_topics)

    # Random initialisation: all biterms share one topic distribution draw.
    theta = np.random.dirichlet([alpha] * num_topics, 1)
    for ibi, bi in enumerate(b):
        topic = np.random.choice(num_topics, 1, p=theta[0, :])[0]
        Nwz[bi[0], topic] += 1
        Nwz[bi[1], topic] += 1
        Nz[topic] += 1
        Z[ibi] = topic

    for it in range(It):
        print("Iteration: " + str(it))
        Nzold = np.copy(Nz)
        for ibi, bi in enumerate(b):
            w1, w2 = bi[0], bi[1]

            # Take the current assignment of this biterm out of the counts.
            old_topic = Z[ibi]
            Nwz[w1, old_topic] -= 1
            Nwz[w2, old_topic] -= 1
            Nz[old_topic] -= 1

            # Each biterm contributes exactly two word counts to its topic,
            # so the column sums of Nwz always equal 2 * Nz; using that
            # avoids an O(V * num_topics) reduction for every biterm.
            words_per_topic = 2 * Nz
            pz = ((Nz + alpha) * (Nwz[w1, :] + beta) * (Nwz[w2, :] + beta)
                  / (words_per_topic + beta * V) ** 2)
            pz = pz / pz.sum()

            # choice(..., 1, ...) returns a 1-element array; keep the scalar.
            new_topic = np.random.choice(num_topics, 1, p=pz)[0]
            Z[ibi] = new_topic
            Nwz[w1, new_topic] += 1
            Nwz[w2, new_topic] += 1
            Nz[new_topic] += 1

        # Euclidean distance between topic counts before and after the sweep.
        print("Variation between iterations: "
              + str(np.sqrt(np.sum((Nz - Nzold) ** 2))))

    return Nz, Nwz, Z
def pbd(doc, names):
    """Relative frequency of each distinct term of ``doc``.

    Args:
        doc: sequence of hashable terms (the script passes a list of
            biterm tuples).
        names: when truthy return the distinct terms, otherwise return
            their relative frequencies.

    Returns:
        A list with one entry per distinct term, in the iteration order of
        ``set(doc)``.  Either the terms themselves (``names`` truthy) or
        ``count(term) / len(doc)`` as floats (``names`` falsy).  An empty
        ``doc`` yields an empty list.
    """
    from collections import Counter

    # Iterate set(doc) rather than the Counter so the output order stays
    # the order of set(doc), which callers iterating set(doc) line up with.
    distinct = list(set(doc))
    if names:
        return distinct

    occurrences = Counter(doc)
    total = len(doc)
    # float(...) keeps true division on Python 2 as well.
    return [float(occurrences[term]) / total for term in distinct]
if __name__ == "__main__":
    # Script entry point: load tokenised tweets, build biterms, run the
    # sampler and print the top words and top tweets of every topic.
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    home = expanduser("~")
    project = home + PROJECT_PATH
    print("Set project directory: " + project)

    INFILE = "14S2015_cl"
    # NOTE(security): pickle.load can execute arbitrary code contained in
    # the file -- only load pickles that come from a trusted source.
    with open(project + "data/" + INFILE + ".pkl", 'rb') as infile:
        tweets, hashtags = pickle.load(infile)

    # Keep only tweets with more than three tokens.
    tweets = [tweet for tweet in tweets if len(tweet) > 3]

    # Vocabulary: one id per distinct word.  The dict gives the same id as
    # searching the array would, without scanning it for every word.
    vocab = list(set(word for tweet in tweets for word in tweet))
    word_index = {word: idx for idx, word in enumerate(vocab)}
    dictionary = np.array(vocab)
    V = len(dictionary)

    alpha = 1.
    beta = 0.1

    # Biterms per tweet: every pair of word ids at positions i < j.
    btmp = []
    for tweet in tweets:
        ids = [word_index[word] for word in tweet]
        btmp.append([(first, second)
                     for pos, first in enumerate(ids)
                     for second in ids[pos + 1:]])

    # Flat list of all biterms of the corpus, in tweet order.
    b = [biterm for doc in btmp for biterm in doc]
    B = len(b)

    num_topics = 10

    # Per tweet: relative frequency of each distinct biterm and the
    # biterms themselves, both in the iteration order of set(doc).
    pbd_cts = [pbd(doc, False) for doc in btmp]
    pbd_names = [pbd(doc, True) for doc in btmp]

    Nz, Nwz, Z = gibbs_sampler_LDA(It=20, V=V, B=B, num_topics=num_topics,
                                   b=b, alpha=alpha, beta=beta)

    topics = [[dictionary[ident] for ident in np.argsort(-Nwz[:, k])[0:10]]
              for k in range(num_topics)]
    print("TOP 10 words per topic")
    for topic in topics:
        print(topic)
        print(" ---- ")

    # Smoothed estimates from the final counts.
    thetaz = (Nz + alpha) / (B + num_topics * alpha)
    # Broadcasting divides every row of Nwz by the per-topic totals.
    phiwz = (Nwz + beta) / (Nwz.sum(axis=0) + V * beta)

    # Normalised topic weights of every distinct biterm of every tweet.
    # The order of set(doc) must match the order used for pbd_cts.
    pzb = []
    for doc in btmp:
        doc_weights = []
        for term in set(doc):
            joint = thetaz * phiwz[term[0], :] * phiwz[term[1], :]
            doc_weights.append(list(joint / joint.sum()))
        pzb.append(doc_weights)

    # Per tweet: biterm topic weights averaged with the biterm frequencies.
    pdz = []
    for idoc, doc in enumerate(pzb):
        aux = 0
        for iterm, term in enumerate(doc):
            aux += np.array(term) * pbd_cts[idoc][iterm]
        pdz.append(aux)
    pdz = np.array(pdz)

    topics = [[tweets[ident] for ident in np.argsort(-pdz[:, k])[0:5]]
              for k in range(num_topics)]
    print("TOP 5 tweets per topic")
    for topic in topics:
        for tweet in topic:
            print(tweet)
        print(" ---- ")