-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwordSimilarity.py
111 lines (96 loc) · 2.97 KB
/
wordSimilarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# -*- coding: utf-8 -*-
# Character-n-gram word-similarity / stemming helper for a Sinhala corpus.
# NOTE(review): a coding cookie only takes effect on line 1 or 2 of the file.
import nltk
import MySQLdb
from nltk.corpus import PlaintextCorpusReader
from removestopwords import *  # provides removestopwords (and, presumably, word_tokenize) — wildcard import, verify
encoding = "utf-8"  # default text encoding label (unused in this chunk)
stemwords_list = {}  # NOTE(review): shadowed by a local of the same name in checkWordSimilarity — likely dead
word_cluster ={}  # NOTE(review): shadowed by a local of the same name in checkWordSimilarity — likely dead
def ngrams(input, n):
    """Build the character n-gram list used for word-similarity scoring.

    The output shape is deliberately non-standard and downstream code
    depends on it exactly:
      - position 0 contributes every prefix of length 0..n, INCLUDING the
        empty string,
      - positions 1..n contribute the n-character window starting there,
      - positions past n contribute the full remaining suffix.

    :param input: the word (any sequence of characters)
    :param n: the gram width
    :return: list of substrings as described above
    """
    grams = []
    length = len(input)
    for pos in range(length):
        if pos == 0:
            # prefixes of length 0..n (the empty string is kept on purpose)
            grams.extend(input[0:width] for width in range(n + 1))
        elif pos > n:
            # tail positions carry the whole remaining suffix
            grams.append(input[pos:])
        else:
            grams.append(input[pos:pos + n])
    return grams
ngram_list = {}
def checkWordSimilarity(file):
    """Group morphologically similar words of *file* into clusters.

    Tokenises the text, drops stopwords, then scores every new word against
    all previously seen words (module-level ``ngram_list`` persists across
    calls) with a Dice-style coefficient over character 3-grams. Pairs with
    weight strictly between 0.7 and 1 are treated as stem/variant pairs and
    merged into clusters keyed by the shorter-gram-list member.

    :param file: the document text as one string
    :return: dict mapping a cluster head word -> list of related words
    """
    stemwords_list = {}
    word_cluster = {}
    tokens = removestopwords(word_tokenize(file))

    for word in tokens:
        grams = ngrams(word, 3)
        gram_count = len(grams)
        # Compare against every word recorded so far (insertion order).
        for seen_word, seen_grams in ngram_list.items():
            shared = 0
            for gram in seen_grams:
                if gram in grams:
                    shared += 1
            # Dice-style weight; wt == 1 (identical gram lists, i.e. the
            # same word) is deliberately excluded by the open upper bound.
            wt = (2 * shared) / (gram_count + len(seen_grams))
            if 0.7 < wt < 1:
                # Later matches overwrite earlier ones: only the last
                # sufficiently-similar seen word is kept as the stem.
                stemwords_list[word] = seen_word
        ngram_list[word] = grams

    # Fold the word -> stem pairs into clusters. "key" is a sentinel for
    # "absent" (fragile if a token is literally "key", preserved as-is).
    for word, stem in stemwords_list.items():
        word_members = word_cluster.get(word, "key")
        if word_members != "key":
            if stem not in word_members:
                word_members.append(stem)
        else:
            stem_members = word_cluster.get(stem, "key")
            if stem_members != "key":
                if word not in stem_members:
                    stem_members.append(word)
            else:
                # New cluster: the member with the SHORTER gram list
                # (usually the shorter word) becomes the cluster head.
                if len(ngram_list[word]) > len(ngram_list[stem]):
                    word_cluster[stem] = [word]
                else:
                    word_cluster[word] = [stem]
    return word_cluster
# corpus_root = './Data/Data'
# docs = PlaintextCorpusReader(corpus_root, '.*')
# fields = docs.fileids()
#
# for doc in fields:
# # readPath = './Data/Data/' + doc
# readPath = './Data/Data/කෝට්ටේ නරපතීන් සහ පෘතුගීසීහු.txt'
# read_file = open(readPath, 'r', encoding="utf16")
# file = read_file.read()
# checkWordSimilarity(file)
# readPath = './Data/books/සංක්ෂිප්ත ලංකා ඉතිහාසය 2.txt'
# read_file = open(readPath, 'r', encoding="utf16")
# file = read_file.read()
# checkWordSimilarity(file)