Skip to content

Commit

Permalink
GH-528: fix word_token_bug (#537)
Browse files Browse the repository at this point in the history
  • Loading branch information
rain1024 authored Jun 28, 2022
1 parent 428f473 commit 89e0bc7
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 1 deletion.
8 changes: 8 additions & 0 deletions tests/pipeline/word_tokenize/test_word_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,11 @@ def test_url_1(self):
actual = word_tokenize(text, format='text')
expected = u"https://www.facebook.com/photo.php?fbid=1627680357512432&set=a.1406713109609159.1073741826.100008114498358&type=1 mình muốn chia_sẻ bài viết của một bác nói về thực_trạng của bộ giáo_dục bây_giờ ! mọi người vào đọc và chia_sẻ để Phạm_Vũ_Luận BIẾT !"
self.assertEqual(actual, expected)

# from issue 528
# link: https://github.com/undertheseanlp/underthesea/issues/528
def test_exception(self):
    # Input taken from issue 528: digit groups followed by a lone period.
    # NOTE(review): presumably this input made word_tokenize raise before
    # the fix in this commit; confirm against the issue thread.
    sentence = "000 85 ."
    # With format='text' the tokenizer is expected to hand the sentence
    # back unchanged (no tokens merged with underscores).
    self.assertEqual(word_tokenize(sentence, format='text'), "000 85 .")
4 changes: 3 additions & 1 deletion underthesea/pipeline/word_tokenize/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,13 @@ def word_tokenize(sentence, format=None):
tokens = [token[0] for token in output]
tags = [token[1] for token in output]
output = []
num_words = 0
for tag, token in zip(tags, tokens):
if tag == "I-W":
if tag == "I-W" and num_words > 0:
output[-1] = output[-1] + u" " + token
else:
output.append(token)
num_words += 1
if format == "text":
output = u" ".join([item.replace(" ", "_") for item in output])
return output

0 comments on commit 89e0bc7

Please sign in to comment.