# --- Legacy version (kept commented out): LDA over the 'Tag' column of tag_10_tfidf.csv ---
# # Importing modules
# import pandas as pd
# # Load the regular expression library
# import re
# import gensim
# from gensim.utils import simple_preprocess
# import nltk
# # nltk.download('stopwords')
# from nltk.corpus import stopwords
# import gensim.corpora as corpora
# from pprint import pprint
#
# def sent_to_words(sentences):
#     for sentence in sentences:
#         # deacc=True removes punctuation
#         yield gensim.utils.simple_preprocess(str(sentence), deacc=True)
#
# def remove_stopwords(texts):
#     return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
#
# if __name__ == '__main__':
#     papers = pd.read_csv('tag_10_tfidf.csv')
#     # Remove punctuation
#     papers['Title_processed'] = papers['Tag'].map(lambda x: re.sub('[,\.!?]', '', x))
#     # Convert the titles to lowercase
#     papers['Title_processed'] = papers['Title_processed'].map(lambda x: x.lower())
#     stop_words = stopwords.words('english')
#     stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
#     data = papers.Title_processed.values.tolist()
#     data_words = list(sent_to_words(data))
#     # Remove stop words
#     data_words = remove_stopwords(data_words)
#     # print(len(data_words))
#     # print(len(data_words[0]))
#     # print(len(data_words[0][0]))
#     # print(len(data_words[0][0][0]))
#     # Create Dictionary
#     id2word = corpora.Dictionary(data_words)
#     # Create Corpus
#     texts = data_words
#     # Term Document Frequency
#     corpus = [id2word.doc2bow(text) for text in texts]
#     # Number of topics
#     num_topics = 10
#     # Build LDA model
#     lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=num_topics)
#     # Print the keywords in the 10 topics
#     pprint(lda_model.print_topics())
#     doc_lda = lda_model[corpus]
import re
import json
import time

import nltk
# nltk.download('stopwords')  # uncomment on first run to download the NLTK stopword list
from nltk.corpus import stopwords
import gensim
import gensim.corpora as corpora
import spacy
import pandas as pd
from tqdm import tqdm
import pyLDAvis
# import pyLDAvis.gensim  # don't skip this if you want the interactive visualization
# import matplotlib.pyplot as plt
# %matplotlib inline


def tokenization_with_gen_stop(text):
    """Tokenize with gensim's simple_preprocess, dropping gensim stopwords and tokens of length <= 3."""
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(token)
    return result
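# Quick sanity check of the tokenizer (illustrative only; the exact output depends
# on the stopword list shipped with your gensim version):
# tokenization_with_gen_stop("The History of Ancient Rome!")  # -> ['history', 'ancient', 'rome']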
if __name__ == '__main__':
    # Load the scraped book data and build one document per product by
    # concatenating all of its review bodies.
    with open('books_result.json', 'r') as file:
        json_list = json.load(file)
    data = []
    for json_raw in tqdm(json_list):
        # For every product
        review_list = ''
        for rev in json_raw["reviews"]:
            if 'body' in rev.keys():
                review_list += rev['body']
            else:
                continue
        if review_list == '':
            continue
        data.append(review_list)
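    # Expected shape of books_result.json, inferred from the loop above (any field
    # names beyond "reviews" and "body" are whatever the scraper produced):
    # [
    #   {"reviews": [{"body": "Great read ..."}, {"body": "Not my thing ..."}], ...},
    #   ...
    # ]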
    ## Set up spaCy for lemmatization
    nlp = spacy.load("en_core_web_sm")
    # Load NLTK stopwords
    stop_words = stopwords.words('english')
    # Add some extra words if required
    stop_words.extend(['from', 'subject', 'use', 'pron'])
    # df = pd.read_csv('tag_10_tfidf.csv')
    # Convert into list
    # data = df.Title.values.tolist()

    ### Cleaning data
    # Remove emails
    data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]
    # Remove newline characters and extra whitespace
    data = [re.sub(r'\s+', ' ', sent) for sent in data]
    # Remove single quotes
    data = [re.sub("'", "", sent) for sent in data]
    ### Lemmatization with spaCy
    data_lemma = []
    for txt in tqdm(data):
        lis = []
        doc = nlp(txt)
        for token in doc:
            lis.append(token.lemma_)
        data_lemma.append(' '.join(lis))

    ## Apply tokenization function (gensim stopwords + short-token filter)
    data_words = []
    for txt in tqdm(data_lemma):
        data_words.append(tokenization_with_gen_stop(txt))

    ### NLTK stopword removal (extra stopwords)
    data_words_clean = []
    for word in tqdm(data_words):
        wrd = []
        for w in word:
            if w not in stop_words:
                wrd.append(w)
        data_words_clean.append(wrd)
    # Create Dictionary
    dictionary = corpora.Dictionary(data_words_clean)
    # Print dictionary
    print(dictionary.token2id)

    ## Create term-document frequency (corpus)
    corpus = [dictionary.doc2bow(text) for text in data_words_clean]
    # Print corpus for first document
    print(corpus[0])
    # Easier-to-read format of the corpus: (token, frequency) pairs for the first document
    print([[(dictionary[id], freq) for id, freq in cp] for cp in corpus[:1]])
    start_time = time.time()
    ## Build LDA model
    NUM_TOPICS = 15
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary,
                                               random_state=100, passes=10)
    # Save the trained model
    ldamodel.save('LDA_NYT')
    # Load the trained model
    ldamodel = gensim.models.ldamodel.LdaModel.load('LDA_NYT')
    ## Print time taken to train the model
    print("--- %s seconds ---" % (time.time() - start_time))
    print(ldamodel.print_topics(-1))

    # Compute perplexity (log_perplexity returns the per-word likelihood bound; perplexity = 2**(-bound))
    print('\nPerplexity Score: ', ldamodel.log_perplexity(corpus))
    # Compute coherence score (c_v)
    coherence_model_lda = gensim.models.CoherenceModel(model=ldamodel, texts=data_words_clean,
                                                       dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
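    # Optional sketch (not part of the original flow): an interactive topic
    # visualization with pyLDAvis, which is imported above but never used.
    # This assumes pyLDAvis >= 3.x, where the gensim helper lives in
    # pyLDAvis.gensim_models (older releases exposed it as pyLDAvis.gensim).
    # import pyLDAvis.gensim_models
    # vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary)
    # pyLDAvis.save_html(vis, 'lda_topics.html')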
    # Predict topics for a new document
    ## Using a single string as the new document
    new_doc = 'history'
    ### Cleaning data (same steps as for the training documents)
    # Remove emails
    data = re.sub(r'\S*@\S*\s?', '', new_doc)
    # Remove newline characters and extra whitespace
    data = re.sub(r'\s+', ' ', data)
    # Remove single quotes
    data = re.sub("'", "", data)

    ### Lemmatization
    data_lemma = []
    lis = []
    doc = nlp(data)
    for token in doc:
        lis.append(token.lemma_)
    data_lemma.append(' '.join(lis))

    ## Apply tokenization function
    data_words = []
    for txt in tqdm(data_lemma):
        data_words.append(tokenization_with_gen_stop(txt))
    ### NLTK stopword removal (extra stopwords)
    data_words_clean_new = []
    for word in tqdm(data_words):
        for w in word:
            if w not in stop_words:
                data_words_clean_new.append(w)

    # Create bag-of-words for the new document using the training dictionary
    corpus_new = dictionary.doc2bow(data_words_clean_new)
    print(corpus_new)
    print(ldamodel.get_document_topics(corpus_new))
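    # Sketch (assumption, not in the original script): report the single most
    # probable topic for the new document together with its top keywords,
    # instead of only printing the raw (topic_id, probability) pairs.
    topic_probs = ldamodel.get_document_topics(corpus_new)
    best_topic, best_prob = max(topic_probs, key=lambda tp: tp[1])
    print('Best topic %d (p=%.3f): %s' % (best_topic, best_prob, ldamodel.print_topic(best_topic)))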