Skip to content

Commit b7db8d4

Browse files
Merge pull request #12 from erdiari/main
Found a bug with stopword_remover
2 parents 1a53ba2 + cb3da31 commit b7db8d4

File tree

2 files changed

+13
-8
lines changed

2 files changed

+13
-8
lines changed

tests/test_general.py

+4-7
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@
1010
StopwordRemover,
1111
)
1212

13-
import sys
14-
1513

1614
class StemmerTest(unittest.TestCase):
1715
def setUp(self):
@@ -230,14 +228,13 @@ def test_remove_stopwords(self):
230228
)
231229

232230
def test_dynamic_stopwords(self):
233-
py_version = int(sys.version.split('.')[1])
234231
dsw = self.stopword_remover.dynamically_detect_stop_words(
235232
"ben bugün gidip aşı olacağım sonra da eve gelip telefon açacağım aşı nasıl etkiledi eve gelip anlatırım aşı olmak bu dönemde çok ama ama ama ama çok önemli".split()
236233
)
237-
expected = ["ama", "aşı", "çok", "eve"]
238-
if py_version <= 8: #Sorting algorithm returns different results from python 3.8+ on
239-
expected = ["ama", "aşı", "gelip", "eve"]
240-
self.assertEqual(dsw, expected)
234+
expected = ['ama', 'aşı', 'çok', 'eve', 'gelip']
235+
236+
# Converted to set since order is not stable
237+
self.assertEqual(set(dsw), set(expected))
241238
self.stopword_remover.add_to_stop_words(dsw)
242239
self.assertEqual(
243240
self.stopword_remover.drop_stop_words(

vnlp/stopword_remover/stopword_remover.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from typing import List
2+
23
from pathlib import Path
34

45
import numpy as np
@@ -59,6 +60,9 @@ def dynamically_detect_stop_words(
5960
['ama', 'aşı', 'gelip', 'eve']
6061
"""
6162
unq, cnts = np.unique(list_of_tokens, return_counts=True)
63+
# Edge case: every word is used exactly once
64+
if len(unq) == len(list_of_tokens):
65+
return []
6266
sorted_indices = cnts.argsort()[
6367
::-1
6468
] # I need them in descending order
@@ -83,8 +87,12 @@ def dynamically_detect_stop_words(
8387
] # removing nan
8488
argmax_second_der = np.argmax(pct_change_two)
8589

90+
# Correction term since argmax finds only the first occurrence
91+
amount_of_max = np.sum(cnts == cnts[argmax_second_der])
92+
8693
# +2 is due to shifting twice due to np.diff()
87-
detected_stop_words = unq[: argmax_second_der + 2].tolist()
94+
# amount_of_max extends the cutoff so all words tied at the peak count are included
95+
detected_stop_words = unq[: argmax_second_der + amount_of_max].tolist()
8896

8997
# Determine rare_words according to given rare_words_freq value
9098
# Add them to dynamic_stop_words list

0 commit comments

Comments
 (0)