Bugfix outputting doc lengths (#800)

The lossy count was calculated from the unique term count (`terms.size()`) instead of the real length of the document (the sum of its term frequencies, `terms.getSumTotalTermFreq()`).
castorini · Sep 10, 2019 · 085c6d1 · 085c6d1
1 parent f60961b
commit 085c6d1
Showing 1 changed file with 2 additions and 7 deletions.
diff --git a/src/main/java/io/anserini/util/ExtractDocumentLengths.java b/src/main/java/io/anserini/util/ExtractDocumentLengths.java
@@ -19,7 +19,6 @@
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.SmallFloat;
@@ -60,17 +59,13 @@ public static void main(String[] args) throws Exception {
     int numDocs = reader.numDocs();
     out.println("luceneID\tcount\tuniquecount\tlossycount");
     for (int i = 0; i < numDocs; i++) {
-      int total = 0;
       Terms terms = reader.getTermVector(i, "contents");
       if(terms == null) {
         out.println(i + "\t" + 0 + "\t" + 0 + "\t" + 0);
         continue;
       }
-      TermsEnum termsEnum = terms.iterator();
-      while ((termsEnum.next()) != null) {
-        total += termsEnum.totalTermFreq();
-      }
-      long length = SmallFloat.longToInt4(terms.size());
+      long total = terms.getSumTotalTermFreq();
+      long length = SmallFloat.longToInt4(total);
       out.println(i + "\t" + total + "\t" + terms.size() + "\t" + length) ;
     }
     out.close();