-
Notifications
You must be signed in to change notification settings - Fork 0
/
evaluateDoc.py
43 lines (35 loc) · 1.05 KB
/
evaluateDoc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# -*- coding: utf-8 -*-
import nltk
# import tensorflow as tf
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import PlaintextCorpusReader
from removestopwords import removestopwords
# Inverted index filled in by evaluateDoc(): term -> list of file ids containing it.
termfreq = {}
# NOTE(review): not referenced anywhere in the visible code; confirm whether
# other modules rely on it before removing.
terms = []
# Directory holding the corpus; each file in it is treated as one document.
corpus_root = './Data/books'
# The '.*' pattern selects every file under corpus_root.
docs = PlaintextCorpusReader(corpus_root, '.*')
# File ids of the selected documents; evaluateDoc() iterates over these.
fields = docs.fileids()
# Number of documents in the corpus.
totalDoc = len(fields)
def evaluateDoc():
    """Build an inverted index mapping each term to the documents containing it.

    Every file id in the module-level ``fields`` list is read from
    ``./Data/books/`` as UTF-16 text, tokenized with ``word_tokenize`` and
    passed through ``removestopwords``. Each resulting term is recorded in
    the module-level ``termfreq`` dict, whose values are lists of file ids;
    a document is listed at most once per term.

    The finished index is printed to stdout and then returned.

    Returns:
        The module-level ``termfreq`` dict. It is mutated in place, so
        repeated calls keep adding to the same object.
    """
    for doc in fields:
        # Same directory that the module-level corpus_root points at.
        readPath = './Data/books/' + doc
        # The context manager closes the handle even if reading or decoding
        # raises, so no file descriptor is leaked per document.
        with open(readPath, 'r', encoding="utf16") as read_file:
            text = read_file.read()
        words = removestopwords(word_tokenize(text))
        # Terms are visited in sorted order so new keys enter termfreq in a
        # stable, sorted sequence for each document.
        for word in sorted(words):
            postings = termfreq.setdefault(word, [])
            # A term can occur many times in one document; list it only once.
            if doc not in postings:
                postings.append(doc)
    print(termfreq)
    return termfreq