Skip to content

Commit

Permalink
Expose index statistics (#128)
Browse files Browse the repository at this point in the history
  • Loading branch information
PepijnBoers authored May 23, 2020
1 parent 8034a1a commit cdba0e9
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 0 deletions.
23 changes: 23 additions & 0 deletions pyserini/index/pyutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,3 +336,26 @@ def convert_collection_docid_to_internal_docid(self, docid: str) -> int:
The Lucene internal ``docid`` corresponding to the external collection ``docid``.
"""
return self.object.convertDocidToLuceneDocid(self.reader, docid)

def stats(self) -> Dict[str, int]:
    """Collect index-level statistics into a Python dictionary.

    Returns
    -------
    Dict[str, int]
        Mapping from each statistic's name to its value, or ``None`` when
        the underlying ``getIndexStats`` call returns ``None``.
        Documented entries:
        - documents: number of documents
        - non_empty_documents: number of non-empty documents
        - unique_terms: number of unique terms
        - total_terms: number of total terms
    """
    java_stats = self.object.getIndexStats(self.reader)
    if java_stats is None:
        return None

    # Copy every entry of the returned map into a plain dict. Each key is
    # UTF-8 encoded and wrapped in a JString before it is used for the
    # lookup on the map object.
    return {
        name: java_stats.get(JString(name.encode('utf-8')))
        for name in java_stats.keySet().toArray()
    }
4 changes: 4 additions & 0 deletions tests/test_indexutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,10 @@ def test_query_doc_score_custom_similarity(self):
self.index_utils.compute_query_document_score(
hits[i].docid, query, similarity=custom_qld), places=4)

def test_index_stats(self):
    """Check selected index statistics against the fixture's known counts."""
    # (statistic name, expected value) pairs, checked in this order; stats()
    # is called once per pair.
    expected_counts = (
        ('documents', 3204),
        ('unique_terms', 14363),
    )
    for stat_name, expected in expected_counts:
        self.assertEqual(expected, self.index_utils.stats()[stat_name])

def tearDown(self):
os.remove(self.tarball_name)
shutil.rmtree(self.index_dir)
Expand Down

0 comments on commit cdba0e9

Please sign in to comment.