-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwordstemming.py
executable file
·90 lines (70 loc) · 2.34 KB
/
wordstemming.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# - *- coding: utf- 8 - *-
import nltk
#import tensorflow as tf
from nltk import sent_tokenize,word_tokenize
from removestopwords import removestopwords
import re
# Load the suffix list used by checkSuffix() from the resources file.
readPath = './Resources/Suffixes.txt'
# Context manager ensures the file handle is closed promptly
# (the original left it open for the lifetime of the module).
with open(readPath, 'r', encoding="utf16") as read_file:
    file = read_file.read()
suffixes = word_tokenize(file)
# print(suffixes)
def stemwords(wordlist):
    """Stem a list of (Sinhala) word tokens.

    Cleans ``wordlist`` by removing punctuation/digit tokens and stopwords,
    then maps each remaining word to a stem: the first known stem the word
    starts with, or — failing that — the word with one known suffix stripped
    (that result is then remembered as a new stem for later words).

    Returns the list of stems, one entry per cleaned word (sorted order).
    """
    word_to_stem = {}   # word -> chosen stem, kept for the debug print below
    stemmedwords = []
    # Seed stems; suffix-stripped roots discovered below are appended.
    stems = ["මල්වාන", "මල්වාණ", "යුද්ධය", "අගල", "අගළ"]
    wordlist1 = removepunc(wordlist)
    wordlist2 = removestopwords(wordlist1)
    print("Words after removing punctuations and numbers")
    print(wordlist2)
    for word in sorted(wordlist2):
        # Use the first known stem the word starts with, if any.
        # NOTE: the original appended once per stem in ``stems`` (no break),
        # emitting each word many times; fixed to record exactly one stem.
        for stem in stems:
            if word.startswith(stem):
                word_to_stem[word] = stem
                stemmedwords.append(stem)
                break
        else:
            # No known stem matched: strip a suffix once (the original
            # called checkSuffix twice) and remember the new root.
            root = checkSuffix(word)
            word_to_stem[word] = root
            stemmedwords.append(root)
            stems.append(root)
    print(word_to_stem)
    return stemmedwords
def removepunc(words):
    """Filter *words*, dropping punctuation tokens and digit-leading tokens.

    A token is removed when it is one of the known punctuation marks, or
    when it begins with an ASCII digit (``re.match("[0-9]", token)``).
    Returns the surviving tokens in their original order.
    """
    punctuation = {".", ",", "?", "!", ";", ":", "-", "(", ")",
                   "[", "]", "{", "}", "'", '"', "..."}
    return [token for token in words
            if token not in punctuation
            and re.match("[0-9]", token) is None]
def checkSuffix(word, suffix_list=None):
    """Strip the first matching suffix from the END of *word*.

    Suffixes are tried in order; on the first match, only the trailing
    occurrence is removed (the original used ``str.replace``, which also
    deleted the same substring anywhere else inside the word — a bug).
    If no suffix matches, *word* is returned unchanged.

    ``suffix_list`` defaults to the module-level ``suffixes`` loaded from
    the suffix file; pass an explicit list to override.
    """
    if suffix_list is None:
        suffix_list = suffixes
    for suffix in suffix_list:
        # Skip empty suffixes: endswith("") is always True and slicing
        # with -0 would wrongly empty the word.
        if suffix and word.endswith(suffix):
            return word[:-len(suffix)]
    return word
# readPath = './Data/ලංකාවේ පුරාවෘත්ත/3-මලවාණේ මහ බළකොටුව.txt'
# read_file = open(readPath, 'r', encoding="utf16")
# file = read_file.read()
# list1 = word_tokenize(file)
# #print(list1)
# stemwords(list1)