Skip to content

Commit 208a46f

Browse files
authored
don't consume arbitrary unused kwargs in TerrierIndexer, update overridden properties, addresses #499 (#500)
1 parent d1dc8e5 commit 208a46f

File tree

2 files changed

+16
-2
lines changed

2 files changed

+16
-2
lines changed

pyterrier/terrier/index.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -225,8 +225,9 @@ def __init__(self, index_path : str, *args,
225225
stemmer : Union[None, str, TerrierStemmer] = TerrierStemmer.porter,
226226
stopwords : Union[None, TerrierStopwords, List[str]] = TerrierStopwords.terrier,
227227
tokeniser : Union[str,TerrierTokeniser] = TerrierTokeniser.english,
228-
type=IndexingType.CLASSIC,
229-
**kwargs):
228+
type=IndexingType.CLASSIC,
229+
properties : Dict[str,str] = {}
230+
):
230231
"""
231232
Constructor called by all indexer subclasses. All arguments listed below are available in
232233
IterDictIndexer, DFIndexer, TRECCollectionIndexer and FilesIndexer.
@@ -240,6 +241,7 @@ def __init__(self, index_path : str, *args,
240241
stopwords (TerrierStopwords): the stopwords list to apply. Default is ``TerrierStopwords.terrier``.
241242
tokeniser (TerrierTokeniser): the tokeniser to apply. Default is ``TerrierTokeniser.english``.
242243
type (IndexingType): the specific indexing procedure to use. Default is ``IndexingType.CLASSIC``.
244+
properties (dict): Terrier properties that you wish to override; these are applied after the indexer's default properties.
243245
"""
244246
if type is IndexingType.MEMORY:
245247
self.path = None
@@ -256,6 +258,8 @@ def __init__(self, index_path : str, *args,
256258
self.tokeniser = TerrierTokeniser._to_obj(tokeniser)
257259
self.properties = pt.java.J.Properties()
258260
self.setProperties(**self.default_properties)
261+
for k,v in properties.items():
262+
self.properties[k] = v
259263
self.overwrite = overwrite
260264
self.verbose = verbose
261265
self.meta_reverse = meta_reverse

tests/test_iterdictindex.py

+10
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,16 @@ def test_createindex3_single_pass_2fields(self):
150150
from pyterrier.terrier.index import IndexingType
151151
self._make_check_index(3, IndexingType.SINGLEPASS, fields=['text', 'title'])
152152

153+
def test_long_tokens(self):
154+
longword = 'abc' * (1+int(60/3))
155+
indexer = pt.IterDictIndexer(self.test_dir, tokeniser='identity', overwrite=True, properties={'max.term.length' : '80'})
156+
ref = indexer.index([ {"docno" : "d1", 'text': longword}])
157+
index = pt.IndexFactory.of(ref)
158+
self.assertEqual(1, len(index))
159+
self.assertEqual(1, index.getCollectionStatistics().getNumberOfUniqueTerms())
160+
self.assertEqual('80', pt.java.cast("org.terrier.structures.PropertiesIndex", index).getIndexProperty("max.term.length", None))
161+
self.assertTrue(longword in index.getLexicon())
162+
153163
def test_meta_init(self):
154164
it = [
155165
{'docno': '11', 'url': 'url1', 'text': 'He ran out of money, so he had to stop playing', 'title': 'Woes of playing poker'},

0 commit comments

Comments
 (0)