Word n-grams no longer span sentence boundaries when given multiple sentences

miso-belica · Mar 5, 2017 · bc489c6 · bc489c6
1 parent 8be86e5
commit bc489c6
Show file tree

Hide file tree

Showing 2 changed files with 19 additions and 8 deletions.
diff --git a/sumy/evaluation/rouge.py b/sumy/evaluation/rouge.py
@@ -28,8 +28,12 @@ def _get_word_ngrams(n, sentences):
 	assert (len(sentences) > 0)
 	assert (n > 0)
 
-	words = _split_into_words(sentences)
-	return _get_ngrams(n, words)
+	words = set()
+	for sentence in sentences:
+		words.update(_get_ngrams(n,_split_into_words([sentence])))
+
+	return words
+
 
 
 def _get_index_of_lcs(x, y):

diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py
@@ -216,19 +216,26 @@ def test_split_into_words(self):
             Tokenizer("english")).document.sentences
         self.assertEqual(["One", "two", "two", "Two", "Three"], 
             _split_into_words(sentences1))
-
         sentences2 = PlaintextParser.from_string("two two. Two. Three.", 
             Tokenizer("english")).document.sentences
         self.assertEqual(["two", "two", "Two", "Three"], 
             _split_into_words(sentences2))
 
+
     def test_get_word_ngrams(self):
-        sentences = PlaintextParser.from_string("This is a test.", 
-            Tokenizer("english")).document.sentences
-        correct_ngrams = [("This", "is"), ("is", "a"), ("a", "test")]
+        sentences = PlaintextParser.from_string("This is a test.", Tokenizer("english")).document.sentences
+        expected_ngrams = [("This", "is"), ("is", "a"), ("a", "test")]
         found_ngrams = _get_word_ngrams(2, sentences)
-        for ngram in correct_ngrams:
-            self.assertTrue(ngram in found_ngrams)      
+        # assertTrue(a, b) uses b only as the failure message; compare the sets explicitly.
+        self.assertEqual(set(expected_ngrams), found_ngrams)
+
+    def test_ngrams_for_more_sentences_should_not_return_words_at_boundaries(self):
+        sentences = PlaintextParser.from_string("This is a pencil.\nThis is a eraser.\nThis is a book.", Tokenizer("english")).document.sentences
+        expected_ngrams = [("This", "is"), ("is", "a"), ("a", "pencil"), ("a", "eraser"), ("a", "book")]
+        found_ngrams = _get_word_ngrams(2, sentences)
+        # Exact set equality is required: a bigram such as ("pencil", "This") must be absent.
+        self.assertEqual(set(expected_ngrams), found_ngrams)
+
 
     def test_len_lcs(self):
         self.assertEqual(_len_lcs("1234", "1224533324"), 4)