Skip to content

Commit

Permalink
Word n-grams no longer span sentence boundaries when given several sentences
Browse files Browse the repository at this point in the history
  • Loading branch information
odek53r authored and miso-belica committed Mar 5, 2017
1 parent 8be86e5 commit bc489c6
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 8 deletions.
8 changes: 6 additions & 2 deletions sumy/evaluation/rouge.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,12 @@ def _get_word_ngrams(n, sentences):
assert (len(sentences) > 0)
assert (n > 0)

words = _split_into_words(sentences)
return _get_ngrams(n, words)
words = set()
for sentence in sentences:
words.update(_get_ngrams(n,_split_into_words([sentence])))

return words



def _get_index_of_lcs(x, y):
Expand Down
19 changes: 13 additions & 6 deletions tests/test_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,19 +216,26 @@ def test_split_into_words(self):
Tokenizer("english")).document.sentences
self.assertEqual(["One", "two", "two", "Two", "Three"],
_split_into_words(sentences1))

sentences2 = PlaintextParser.from_string("two two. Two. Three.",
Tokenizer("english")).document.sentences
self.assertEqual(["two", "two", "Two", "Three"],
_split_into_words(sentences2))


def test_get_word_ngrams(self):
    """Check that the bigrams of a single sentence are exactly its adjacent word pairs."""
    sentences = PlaintextParser.from_string("This is a test.", Tokenizer("english")).document.sentences
    expected_ngrams = {("This", "is"), ("is", "a"), ("a", "test")}
    found_ngrams = _get_word_ngrams(2, sentences)

    # assertTrue(a, b) uses b only as the failure message and passes for any
    # truthy a, so it never compared the two values. Compare them as sets:
    # this covers the old per-ngram membership check and also rejects
    # unexpected extra n-grams.
    self.assertEqual(expected_ngrams, set(found_ngrams))

def test_ngrams_for_more_sentences_should_not_return_words_at_boundaries(self):
    """Check that bigrams are built per sentence and never join words of two sentences."""
    sentences = PlaintextParser.from_string(
        "This is a pencil.\nThis is a eraser.\nThis is a book.",
        Tokenizer("english"),
    ).document.sentences
    expected_ngrams = {
        ("This", "is"),
        ("is", "a"),
        ("a", "pencil"),
        ("a", "eraser"),
        ("a", "book"),
    }
    found_ngrams = _get_word_ngrams(2, sentences)

    # assertTrue(a, b) uses b only as the failure message and passes for any
    # truthy a, so the original check could never fail. Exact set equality
    # fails if a cross-sentence pair such as ("pencil", "This") is returned.
    self.assertEqual(expected_ngrams, set(found_ngrams))


def test_len_lcs(self):
self.assertEqual(_len_lcs("1234", "1224533324"), 4)
Expand Down

0 comments on commit bc489c6

Please sign in to comment.