Skip to content

Commit

Permalink
add compatible codes for py2
Browse files Browse the repository at this point in the history
  • Loading branch information
hailiang-wang committed Oct 18, 2017
1 parent b92ba06 commit 2316042
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 8 deletions.
2 changes: 1 addition & 1 deletion Requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
synonyms>=1.0
synonyms>=1.1
3 changes: 2 additions & 1 deletion demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import synonyms # https://github.com/huyingxi/Synonyms
import numpy
import unittest
import thulac

# run testcase: python /Users/hain/ai/Synonyms/demo.py Test.testExample
class Test(unittest.TestCase):
Expand Down Expand Up @@ -70,7 +71,7 @@ def testNearbyWords(self):
tags.append(_[1])
for (k,v) in enumerate(tags):
if v.startswith("n") or v.startswith("v"): # 去停,去标,去副词、形容词、代词 etc.
print("%s: %s" % (words[k], synonyms.nearby(words[k])))
synonyms.display(words[k]) # synonyms.display calls synonyms.nearby

def test():
    # Entry point: discover and run every unittest.TestCase in this module.
    unittest.main()
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"""

setup(name='synonyms',
version='1.0',
version='1.1',
description='Chinese Synonyms for Natural Language Processing and Understanding',
long_description=LONGDOC,
author='Hai Liang Wang, Hu Ying Xi',
Expand Down
27 changes: 22 additions & 5 deletions synonyms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,26 +27,39 @@
# Module bootstrap: resolve the package directory and make it importable.
curdir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(curdir)

# PLT: major Python version flag (2 or 3), used to pick py2/py3 code paths below.
PLT = 2

if sys.version_info[0] < 3:
    # Python 2: force utf-8 as the default codec so Chinese text round-trips.
    reload(sys)
    sys.setdefaultencoding("utf-8")
    # raise "Must be using Python 3"
else:
    PLT = 3

import gzip
import thulac # http://thulac.thunlp.org/
from collections import defaultdict
# NOTE(review): wn_raw_data opens the same gzip file as _fin below and appears
# unused past this point — looks like a leftover from an earlier revision, and
# the encoding kwarg would fail on py2's gzip.open; confirm and remove.
wn_raw_data=gzip.open(os.path.join(curdir, 'data', 'words.nearby.gz'),'rt', encoding='utf-8', errors = "ignore")

# In-memory vocabulary: word -> [list of nearby words, list of their scores].
_vocab = defaultdict(lambda: [[], []])
# Count of words loaded into _vocab.
_size = 0
_thulac = thulac.thulac() # default mode
_fin = []
_fin_path = os.path.join(curdir, 'data', 'words.nearby.gz')
if PLT == 2:
    # Python 2's gzip.open has no text mode / encoding kwargs; wrap the binary
    # stream in a TextIOWrapper to get decoded utf-8 lines.
    import io
    _fin=io.TextIOWrapper(io.BufferedReader(gzip.open(_fin_path)), encoding='utf8', errors='ignore')
else:
    _fin=gzip.open(_fin_path,'rt', encoding='utf-8', errors = "ignore")

def add_word_to_vocab(word, nearby, nearby_score):
    '''
    Add a word to the module-level vocab.

    word         -- the query word (ignored when None)
    nearby       -- list of nearby words for the query word
    nearby_score -- list of similarity scores, parallel to `nearby`

    Side effects: stores [nearby, nearby_score] under `word` in the global
    _vocab and increments the global _size counter.
    '''
    global _size
    # PEP 8 idiom fix: `word is not None` instead of `not word is None`.
    if word is not None:
        if PLT == 2:
            # Python 2: normalize keys and nearby words to utf-8 byte strings
            # so lookups behave consistently with py2 str input.
            word = word.encode("utf-8")
            nearby = [z.encode("utf-8") for z in nearby]
        _vocab[word] = [nearby, nearby_score]
        _size += 1

Expand All @@ -57,7 +70,7 @@ def _build_vocab():
c = None # current word
w = [] # word nearby
s = [] # score of word nearby
for v in wn_raw_data.readlines():
for v in _fin.readlines():
v = v.strip()
if v is None or len(v) == 0: continue
if v.startswith("query:"):
Expand Down Expand Up @@ -126,10 +139,14 @@ def compare(s1, s2):
w2, t2 = _segment_words(s2)
return max(_similarity(w1, t1, w2, t2), _similarity(w2, t2, w1, t1))

def display(word):
    '''
    Pretty-print the nearby words of `word`, one numbered line per neighbor
    with its similarity score.
    '''
    print("'%s'近义词:" % word)
    # Bug fix: query the word that was passed in — previously this was
    # hardcoded to nearby("人脸"), so every call displayed 人脸's neighbors.
    o = nearby(word)
    for k, v in enumerate(o[0]):
        print(" %d. %s:%s" %(k+1, v, o[1][k]))

def main():
    # Demo driver: smoke-test nearby() on two known words and one
    # out-of-vocabulary token, then show the formatted display helper.
    for query in ("人脸", "识别"):
        print(query, nearby(query))
    print("OOV", nearby("NOT_EXIST"))
    display("人脸")

# Script entry point: run the demo when executed directly.
if __name__ == '__main__':
    main()

0 comments on commit 2316042

Please sign in to comment.