From cdba0e924a7fca1896c88ac84800bb46ef51f812 Mon Sep 17 00:00:00 2001
From: Pepijn Boers
Date: Sat, 23 May 2020 15:37:22 +0200
Subject: [PATCH] Expose index statistics (#128)

---
Review notes (this commentary area is ignored when the patch is applied):

* Purpose: adds a `stats()` method in pyserini/index/pyutils.py that copies
  the map returned by `self.object.getIndexStats(self.reader)` into a Python
  dict, plus a test asserting the `documents` and `unique_terms` entries.
* `stats()` is annotated `-> Dict[str, int]`, but it has an explicit
  `return None` path when `getIndexStats` returns None, and the docstring
  does not mention it. Callers must handle None. Widening the annotation to
  `Optional[Dict[str, int]]` would be more accurate -- TODO confirm that
  `Optional` is imported in pyutils.py before doing so.
* `test_index_stats` calls `stats()` once per assertion (two calls), and
  covers only two of the four keys listed in the docstring.
* NOTE(review): line breaks and indentation in this patch were reconstructed
  from a whitespace-collapsed copy. If context lines fail to match, apply
  with `--ignore-whitespace`.

 pyserini/index/pyutils.py | 23 +++++++++++++++++++++++
 tests/test_indexutils.py  |  4 ++++
 2 files changed, 27 insertions(+)

diff --git a/pyserini/index/pyutils.py b/pyserini/index/pyutils.py
index 0deafba2f..cb91f3019 100644
--- a/pyserini/index/pyutils.py
+++ b/pyserini/index/pyutils.py
@@ -336,3 +336,26 @@ def convert_collection_docid_to_internal_docid(self, docid: str) -> int:
             The Lucene internal ``docid`` corresponding to the external collection ``docid``.
         """
         return self.object.convertDocidToLuceneDocid(self.reader, docid)
+
+    def stats(self) -> Dict[str, int]:
+        """Returns dictionary with index statistics.
+
+        Returns
+        -------
+        Dict[str, int]
+            Index statistics as a dictionary of statistic's name to statistic.
+            - documents: number of documents
+            - non_empty_documents: number of non-empty documents
+            - unique_terms: number of unique terms
+            - total_terms: number of total terms
+        """
+        index_stats_map = self.object.getIndexStats(self.reader)
+
+        if index_stats_map is None:
+            return None
+
+        index_stats_dict = {}
+        for term in index_stats_map.keySet().toArray():
+            index_stats_dict[term] = index_stats_map.get(JString(term.encode('utf-8')))
+
+        return index_stats_dict
diff --git a/tests/test_indexutils.py b/tests/test_indexutils.py
index d27d33efd..a967eb1b3 100644
--- a/tests/test_indexutils.py
+++ b/tests/test_indexutils.py
@@ -273,6 +273,10 @@ def test_query_doc_score_custom_similarity(self):
                                    self.index_utils.compute_query_document_score(
                                        hits[i].docid, query, similarity=custom_qld), places=4)
 
+    def test_index_stats(self):
+        self.assertEqual(3204, self.index_utils.stats()['documents'])
+        self.assertEqual(14363, self.index_utils.stats()['unique_terms'])
+
     def tearDown(self):
         os.remove(self.tarball_name)
         shutil.rmtree(self.index_dir)