parser.py
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import glob
from string import punctuation

import utils
from stopwords import stopwords_list


def RemoveStopWords(token_list):
    """Drop stopwords from a token list, matching both lowercase and Capitalized forms."""
    tmp = [w for w in token_list if w not in stopwords_list]
    cap_stopwords = [s.capitalize() for s in stopwords_list]
    return [w for w in tmp if w not in cap_stopwords]


def RemoveStrPunc(mystr):
    """Return the string with all punctuation characters removed."""
    return ''.join(ch for ch in mystr if ch not in punctuation)


def Tokenize(mystr):
    """Split raw text into sorted, lowercase, alphabetic terms with stopwords removed."""
    strip_str = mystr.strip()
    no_punc_str = RemoveStrPunc(strip_str)
    terms = [w.lower() for w in no_punc_str.split() if w.isalpha()]  # may contain duplicates
    terms = RemoveStopWords(terms)
    terms.sort()
    return terms

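# Illustrative example: for the raw text "The cat sat on the mat.",
# Tokenize would return a sorted list like ['cat', 'mat', 'sat'],
# assuming "the" and "on" appear in stopwords_list.
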
def GenrDocIndex():
    """Tokenize every file under docs/ and write the resulting index to log/corpus.txt."""
    doc_index = dict()
    files = glob.glob("docs/*")
    for f in files:
        with open(f, 'r') as infile:  # renamed from 'input' to avoid shadowing the built-in
            contents = infile.read()
            terms = Tokenize(contents)
            doc_index[f] = terms
    utils.WriteJSONObj('log/corpus.txt', doc_index)

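# Note: utils.WriteJSONObj comes from this repository's utils module. As a rough,
# hypothetical sketch (the real helper's signature and behavior may differ), it is
# assumed to serialize the dict to JSON on disk, along the lines of:
#
#     import json
#     def WriteJSONObj(path, obj):
#         with open(path, 'w') as out:
#             json.dump(obj, out, indent=2)
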
if __name__ == "__main__":
GenrDocIndex()
# End of File