-
Notifications
You must be signed in to change notification settings - Fork 0
/
newtest.py
59 lines (49 loc) · 1.32 KB
/
newtest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# -*- coding: utf-8 -*-
import nltk
from nltk import *
import re
from nltk.corpus import PlaintextCorpusReader
import MySQLdb
from evaluateDoc import *
from removestopwords import *
import math
# --- Corpus and database setup -------------------------------------------
# Load the plaintext book corpus and fetch the list of already-tagged
# words (PoSTag table) so they can be excluded from the new-word scan.
wordlist = []
corpus_root = './Data/books'
docs = PlaintextCorpusReader(corpus_root, '.*')
fields = docs.fileids()
totalDoc = len(fields)

# NOTE(review): credentials are hard-coded; move to config/env for anything
# beyond local experimentation.
db = MySQLdb.connect("localhost", "root", "root", "sparks",
                     charset='utf8', use_unicode=True)
cursor = db.cursor()
try:
    cursor.execute("SELECT * FROM PoSTag")
    # r[1] is assumed to be the word column of PoSTag -- TODO confirm schema.
    for r in cursor.fetchall():
        wordlist.append(r[1])
finally:
    # Original leaked both handles; nothing after this block uses them.
    cursor.close()
    db.close()

print(wordlist)

newlist = []
termfreq = evaluateDoc()
# --- Collect words not yet present in the PoSTag wordlist ----------------
# Hoist loop invariants: compile the HTML-tag-stripping regex once, and
# build a set for O(1) membership tests (original scanned a list per word).
tag_re = re.compile('<.*?>')
known_words = set(wordlist)
for doc in fields:
    read_path = './Data/books/' + doc
    # Corpus files are stored as UTF-16; the context manager guarantees the
    # handle is closed even if tokenization raises (original leaked handles).
    with open(read_path, 'r', encoding="utf16") as read_file:
        raw_text = read_file.read()
    clean_text = tag_re.sub('', raw_text)
    words = removestopwords(word_tokenize(clean_text))
    for word in words:
        if word not in known_words:
            # Duplicates are intentionally kept, matching the original
            # list-append behaviour.
            newlist.append(word)
# --- Write the collected new words to disk -------------------------------
# Context manager guarantees the file is flushed and closed; the original
# never closed write_file, so output could be truncated on interpreter exit.
with open("./word.txt", 'w', encoding="utf16") as write_file:
    for word in newlist:
        # Document-frequency lookup kept from the original, although the
        # value is currently unused -- presumably a work-in-progress metric.
        postings = termfreq.get(word)
        n = len(postings) if postings is not None else 0
        write_file.write(word + "\n")