miso-belica · odek53r · Feb 25, 2017
diff --git a/sumy/evaluation/rouge.py b/sumy/evaluation/rouge.py
@@ -28,8 +28,12 @@ def _get_word_ngrams(n, sentences):
 	assert (len(sentences) > 0)
 	assert (n > 0)
 
-	words = _split_into_words(sentences)
-	return _get_ngrams(n, words)
+	words = set()
+	for sentence in sentences:
+		words.update(_get_ngrams(n,_split_into_words([sentence])))
+
+	return words
+
 
 
 def _get_index_of_lcs(x, y):

diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py
@@ -216,19 +216,28 @@ def test_split_into_words(self):
             Tokenizer("english")).document.sentences
         self.assertEqual(["One", "two", "two", "Two", "Three"], 
             _split_into_words(sentences1))
-
         sentences2 = PlaintextParser.from_string("two two. Two. Three.", 
             Tokenizer("english")).document.sentences
         self.assertEqual(["two", "two", "Two", "Three"], 
             _split_into_words(sentences2))
 
+
     def test_get_word_ngrams(self):
-        sentences = PlaintextParser.from_string("This is a test.", 
-            Tokenizer("english")).document.sentences
+        sentences = PlaintextParser.from_string("This is a test.",
+                                                Tokenizer("english")).document.sentences
         correct_ngrams = [("This", "is"), ("is", "a"), ("a", "test")]
         found_ngrams = _get_word_ngrams(2, sentences)
         for ngram in correct_ngrams:
-            self.assertTrue(ngram in found_ngrams)      
+            self.assertTrue(ngram in found_ngrams)
+
+        # test with a text consisting of several sentences
+        sentences = PlaintextParser.from_string("This is a pencil.\nThis is a eraser.\nThis is a book.",
+                                                Tokenizer("english")).document.sentences
+        correct_ngrams = [("This", "is"), ("is", "a"), ("a", "pencil"), ("a", "eraser"), ("a", "book")]
+        found_ngrams = _get_word_ngrams(2, sentences)
+        for ngram in correct_ngrams:
+            self.assertTrue(ngram in found_ngrams)
+
 
     def test_len_lcs(self):
         self.assertEqual(_len_lcs("1234", "1224533324"), 4)