Skip to content

Commit

Permalink
Bugfix outputting doc lengths (#800)
Browse files Browse the repository at this point in the history
The lossy count was calculated from the unique term count (terms.size()) instead of the real length of the document (the sum of all term frequencies, terms.getSumTotalTermFreq()).
  • Loading branch information
chriskamphuis authored and lintool committed Sep 10, 2019
1 parent f60961b commit 085c6d1
Showing 1 changed file with 2 additions and 7 deletions.
9 changes: 2 additions & 7 deletions src/main/java/io/anserini/util/ExtractDocumentLengths.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.SmallFloat;
Expand Down Expand Up @@ -60,17 +59,13 @@ public static void main(String[] args) throws Exception {
int numDocs = reader.numDocs();
out.println("luceneID\tcount\tuniquecount\tlossycount");
for (int i = 0; i < numDocs; i++) {
int total = 0;
Terms terms = reader.getTermVector(i, "contents");
if(terms == null) {
out.println(i + "\t" + 0 + "\t" + 0 + "\t" + 0);
continue;
}
TermsEnum termsEnum = terms.iterator();
while ((termsEnum.next()) != null) {
total += termsEnum.totalTermFreq();
}
long length = SmallFloat.longToInt4(terms.size());
long total = terms.getSumTotalTermFreq();
long length = SmallFloat.longToInt4(total);
out.println(i + "\t" + total + "\t" + terms.size() + "\t" + length) ;
}
out.close();
Expand Down

0 comments on commit 085c6d1

Please sign in to comment.