Skip to content

Commit

Permalink
#60 compare 支持交换句子
Browse files Browse the repository at this point in the history
  • Loading branch information
hailiang-wang committed May 28, 2018
1 parent c580b3d commit 4a44eff
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 21 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# 3.6
* Fix Bug: compare 保证交换两个句子后分数一致 [#60](https://github.com/huyingxi/Synonyms/issues/60)

# 3.5
* 根据实际情况,降低向量距离对近似度分数的影响
Expand Down
2 changes: 1 addition & 1 deletion Requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
synonyms>=3.5
synonyms>=3.6
9 changes: 9 additions & 0 deletions demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,15 @@ def test_similarity(self):
r = synonyms.compare(sen1, sen2, seg=False)
print("%s vs %s" % (sen1, sen2), r)


def test_swap_sent(self):
    """Check that ``synonyms.compare`` is symmetric in its two arguments.

    Compares the same pair of words in both orders, prints each score,
    and fails if the two scores differ.
    """
    print("test_swap_sent")
    # Score the pair in the original order, then with the arguments swapped.
    forward_score = synonyms.compare("教学", "老师")
    swapped_score = synonyms.compare("老师", "教学")
    print('"教学", "老师": %s ' % forward_score)
    print('"老师", "教学": %s ' % swapped_score)
    assert forward_score == swapped_score, "Scores should be the same after swap sents"

def test_nearby(self):
synonyms.display("奥运") # synonyms.display calls synonyms.nearby
synonyms.display("北新桥") # synonyms.display calls synonyms.nearby
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

setup(
name='synonyms',
version='3.5.0',
version='3.6.0',
description='Chinese Synonyms for Natural Language Processing and Understanding',
long_description=LONGDOC,
author='Hai Liang Wang, Hu Ying Xi',
Expand Down
38 changes: 19 additions & 19 deletions synonyms/synonyms.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,28 +211,28 @@ def _nearby_levenshtein_distance(s1, s2):
使用空间距离近的词汇优化编辑距离计算
'''
s1_len, s2_len = len(s1), len(s2)
maxlen = max(s1_len, s2_len)
first, second = (s2, s1) if s1_len == maxlen else (s1, s2)
ft_1 = set() # all related words with first sentence
maxlen = s1_len
if s1_len == s2_len:
first, second = sorted([s1, s2])
elif s1_len < s2_len:
first = s1
second = s2
maxlen = s2_len
else:
first = s2
second = s1

ft = set() # all related words with first sentence
for x in first:
ft_1.add(x)
ft.add(x)
n, _ = nearby(x)
for o in n[:5]:
ft_1.add(o)

ft_2 = set() # all related words with second sentence
for x in second:
ft_2.add(x)
n, _ = nearby(x)
for o in n[:5]:
ft_2.add(0)

for o in n[:10]:
ft.add(o)

scores = []
if len(ft_1) == 0 or len(ft_2) == 0: return 0.0 # invalid length
for x in ft_1:
for y in ft_2:
scores.append([_levenshtein_distance(x, y)])
s = np.sum(scores) / (s1_len * s2_len)
for x in second:
scores.append(max([_levenshtein_distance(x, y) for y in ft]))
s = np.sum(scores) / maxlen
return s

def _similarity_distance(s1, s2, ignore):
Expand Down

0 comments on commit 4a44eff

Please sign in to comment.