forked from Open-Network-Insight/oni-ml
-
Notifications
You must be signed in to change notification settings - Fork 0
/
lda_pre.py
executable file
·97 lines (87 loc) · 2.29 KB
/
lda_pre.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/python
import numpy as np
import csv
#import statsmodels.tsa.stattools as st
import pickle
import collections
import linecache
import sys
# ---- Model dimensions ------------------------------------------------------
k = 20   # number of topics
tt = 6   # number of time intervals

# ---- File locations --------------------------------------------------------
# rpath comes verbatim from the first command-line argument and is prepended
# to every file name by plain string concatenation, so it must already end
# with a path separator when it names a directory.
rpath = sys.argv[1]

# Pickle caches; only written and re-read when `durable` is switched on.
wsdict = rpath + 'widx_dict.pkl'      # word -> word index
dsdict = rpath + 'docidx_dict.pkl'    # document name -> per-document record

# Input: comma-separated rows; column 0 is used as the document key, column 1
# as the word key, column 2 as the value paired with the word (presumably a
# count -- confirm against whatever produces this file).
tdmfile = rpath + 'doc_wc.dat'

# Outputs.
wfile = rpath + 'words.dat'           # "index,word" per distinct word
dfile = rpath + 'doc.dat'             # "index,name" per distinct document
mfile = rpath + 'model.dat'           # one line of word:value pairs per document

# ---- Stage switches --------------------------------------------------------
# The three processing stages are enabled by default ...
word_dict = doc_line = dump_doc_line = True
# ... and the intermediate dictionaries are not pickled between stages.
durable = False
# Stage 1: give every distinct word a 1-based index.
# wcdict maps word -> index (the index only, no per-word count). Each word is
# also written to wfile as "index,word" the first time it is seen, so wfile is
# ordered by first appearance in tdmfile.
if word_dict:
    with open(tdmfile, 'r') as csvfile:
        # Starts at 1, so the progress figure runs one ahead of the number of
        # data rows read (presumably to account for the header line).
        rowct = 1
        w_idx = 1
        wcdict = {}
        readr = csv.reader(csvfile, delimiter=',', quotechar='"')
        # The first line is assumed to be a header -- TODO confirm with the
        # producer of tdmfile. The default argument keeps an empty input file
        # from raising StopIteration.
        next(readr, None)
        with open(wfile, 'w') as f:
            for row in readr:
                word = row[1]
                if word not in wcdict:
                    wcdict[word] = w_idx
                    f.write("%s,%s\n" % (w_idx, word))
                    w_idx += 1
                rowct += 1
                if rowct % 1000000 == 0:
                    print(rowct,"rows processed")
    if durable:
        # Persist the word index so the next stage can reload it from disk.
        # The with-block closes the file even if pickling fails.
        with open(wsdict, 'wb') as output:
            pickle.dump(wcdict, output)
# Stage 2: group the rows of tdmfile by document.
# docdict maps document name ->
#     [document index, number of rows seen, [" <word index>:<value>", ...]]
# Each document is also written to dfile as "index,name" the first time it is
# seen, so dfile is ordered by first appearance in tdmfile.
if doc_line:
    if durable:
        # NOTE: pickle.load can execute arbitrary code from the file. This is
        # acceptable only because wsdict is this script's own stage-1 output;
        # do not point it at untrusted data.
        with open(wsdict, 'rb') as pkl_file:
            wcdict = pickle.load(pkl_file)
    with open(dfile, 'w') as f:
        rowct = 1
        doc_idx = 1
        docdict = {}
        with open(tdmfile, 'r') as csvfile:
            readr = csv.reader(csvfile, delimiter=',', quotechar='"')
            # Skip the first line (assumed header); the default argument keeps
            # an empty input file from raising StopIteration.
            next(readr, None)
            for row in readr:
                doc = row[0]
                entry = docdict.get(doc)
                if entry is None:
                    # First row for this document: register it and record its
                    # index in dfile.
                    entry = docdict[doc] = [doc_idx, 1, []]
                    f.write("%s,%s\n" % (doc_idx, doc))
                    doc_idx += 1
                else:
                    entry[1] += 1
                # Raises KeyError if the word is missing from the stage-1 index.
                entry[2].append(" %s:%s" % (wcdict[row[1]], row[2]))
                rowct += 1
                if rowct % 1000000 == 0:
                    print(rowct,"rows processed")
    # The word index is not needed past this point; drop the reference so the
    # memory can be reclaimed.
    wcdict = None
    if durable:
        # The with-block closes the file even if pickling fails.
        with open(dsdict, 'wb') as output:
            pickle.dump(docdict, output)
# Stage 3: write mfile, one line per document in dfile order:
#     "<rows seen for the document> <word index>:<value> <word index>:<value> ..."
if dump_doc_line:
    if durable:
        # NOTE: pickle.load can execute arbitrary code from the file. This is
        # acceptable only because dsdict is this script's own stage-2 output;
        # do not point it at untrusted data.
        with open(dsdict, 'rb') as pkl_file:
            docdict = pickle.load(pkl_file)
    with open(mfile, 'w') as f:
        with open(dfile, 'r') as doc_index:
            print("writing ...")
            for line in doc_index:
                # dfile lines are emitted raw as "index,name" with no CSV
                # quoting, so the name is everything after the FIRST comma.
                # Parsing them with csv.reader would split names containing
                # commas and strip or mis-handle quote characters, leading to
                # a KeyError (or the wrong document) on the lookup below.
                doc = line.rstrip('\n').split(',', 1)[1]
                entry = docdict[doc]
                f.write("%s%s\n" % (entry[1], ''.join(entry[2])))