Accurate BM25 support + arbitraryScoreTiesBreak fix (#803)
+ command-line option during both indexing and search
+ updated ScoreTiesAdjusterReranker to respect the arbitraryScoreTiesBreak argument (see the sketch below). Fixes #801
+ small fix in the AccurateBM25Similarity class
+ moved the IndexArgs class into its own file, for consistency with SearchArgs
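
A note on the #801 fix: with this change, tied scores are reordered deterministically by docid unless -arbitraryScoreTiesBreak is passed. Below is a minimal, self-contained sketch of that behavior; the class, the rounding scheme, and the numbers are illustrative assumptions, not Anserini's actual ScoreTiesAdjusterReranker:

import java.util.Arrays;
import java.util.Comparator;

// Hypothetical sketch of the tie-breaking the flag toggles.
public class TieBreakDemo {
  static final class Result {
    final String docid;
    final float score;
    Result(String docid, float score) { this.docid = docid; this.score = score; }
  }

  // When arbitraryScoreTiesBreak is false: round scores so floating-point
  // noise doesn't hide ties, sort by rounded score descending, and break the
  // remaining ties lexicographically by external docid. When true: keep
  // whatever order Lucene produced.
  static void breakTies(Result[] results, boolean arbitraryScoreTiesBreak) {
    if (arbitraryScoreTiesBreak) {
      return;
    }
    Arrays.sort(results, Comparator
        .comparingDouble((Result r) -> -Math.round(r.score * 1e4) / 1e4)
        .thenComparing(r -> r.docid));
  }

  public static void main(String[] args) {
    Result[] hits = {
        new Result("doc9", 2.70001f), new Result("doc1", 2.70002f), new Result("doc5", 3.1f)
    };
    breakTies(hits, false);
    for (Result r : hits) {
      System.out.println(r.docid + " " + r.score);  // doc5, then doc1/doc9 in docid order
    }
  }
}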
chriskamphuis authored and lintool committed Sep 12, 2019
1 parent 085c6d1 commit ca253a4
Showing 12 changed files with 202 additions and 157 deletions.
158 changes: 158 additions & 0 deletions src/main/java/io/anserini/index/IndexArgs.java
@@ -0,0 +1,158 @@
/**
* Anserini: A Lucene toolkit for replicable information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.index;

import org.kohsuke.args4j.Option;

public class IndexArgs {

private static final int TIMEOUT = 600 * 1000;

// required arguments

@Option(name = "-input", metaVar = "[Directory]", required = true, usage = "collection directory")
public String input;

@Option(name = "-threads", metaVar = "[Number]", required = true, usage = "Number of Threads")
public int threads;

@Option(name = "-collection", required = true, usage = "collection class in io.anserini.collection")
public String collectionClass;

@Option(name = "-generator", required = true, usage = "document generator in io.anserini.index.generator")
public String generatorClass;

// optional arguments

@Option(name = "-index", metaVar = "[Path]", forbids = {"-solr", "-es"}, usage = "index path")
public String index;

@Option(name = "-storePositions", usage = "boolean switch to index storePositions")
public boolean storePositions = false;

@Option(name = "-storeDocvectors", usage = "boolean switch to store document vectors")
public boolean storeDocvectors = false;

@Option(name = "-storeTransformedDocs", usage = "boolean switch to store transformed document text")
public boolean storeTransformedDocs = false;

@Option(name = "-storeRawDocs", usage = "boolean switch to store raw document text")
public boolean storeRawDocs = false;

@Option(name = "-optimize", usage = "boolean switch to optimize index (force merge)")
public boolean optimize = false;

@Option(name = "-keepStopwords", usage = "boolean switch to keep stopwords")
public boolean keepStopwords = false;

@Option(name = "-stemmer", usage = "Stemmer: one of the following porter,krovetz,none. Default porter")
public String stemmer = "porter";

@Option(name = "-uniqueDocid", usage = "remove duplicated documents with the same doc id when indexing. " +
"please note that this option may slow the indexing a lot and if you are sure there is no " +
"duplicated document ids in the corpus you shouldn't use this option.")
public boolean uniqueDocid = false;

@Option(name = "-memorybuffer", usage = "memory buffer size")
public int memorybufferSize = 2048;

@Option(name = "-whitelist", usage = "file containing docids, one per line; only specified docids will be indexed.")
public String whitelist = null;

@Option(name = "-bm25.accurate", usage = "Switch to use the accurate BM25 similarity)")
public boolean bm25Accurate = false;

@Option(name = "-tweet.keepRetweets", usage = "boolean switch to keep retweets while indexing")
public boolean tweetKeepRetweets = false;

@Option(name = "-tweet.keepUrls", usage = "boolean switch to keep URLs while indexing tweets")
public boolean tweetKeepUrls = false;

@Option(name = "-tweet.stemming", usage = "boolean switch to apply Porter stemming while indexing tweets")
public boolean tweetStemming = false;

@Option(name = "-tweet.maxId", usage = "the max tweet Id for indexing. Tweet Ids that are larger " +
" (when being parsed to Long type) than this value will NOT be indexed")
public long tweetMaxId = Long.MAX_VALUE;

@Option(name = "-tweet.deletedIdsFile", metaVar = "[Path]",
usage = "a file that contains deleted tweetIds, one per line. these tweeets won't be indexed")
public String tweetDeletedIdsFile = "";

@Option(name = "-solr", forbids = {"-index", "-es"}, usage = "boolean switch to determine if we should index into Solr")
public boolean solr = false;

@Option(name = "-solr.batch", usage = "the batch size for submitting documents to Solr")
public int solrBatch = 1000;

@Option(name = "-solr.commitWithin", usage = "the number of seconds to commitWithin")
public int solrCommitWithin = 60;

@Option(name = "-solr.index", usage = "the name of the index in Solr")
public String solrIndex = null;

@Option(name = "-solr.zkUrl", usage = "the URL of Solr's ZooKeeper (comma separated list of using ensemble)")
public String zkUrl = null;

@Option(name = "-solr.zkChroot", usage = "the ZooKeeper chroot")
public String zkChroot = "/";

@Option(name = "-solr.poolSize", metaVar = "[NUMBER]", usage = "the number of clients to keep in the pool")
public int solrPoolSize = 16;

@Option(name = "-es", forbids = {"-index", "-solr"}, usage = "boolean switch to determine if we should index through Elasticsearch")
public boolean es = false;

@Option(name = "-es.batch", usage = "the number of index requests in a bulk request sent to Elasticsearch")
public int esBatch = 1000;

@Option(name = "-es.index", usage = "the name of the index in Elasticsearch")
public String esIndex = null;

@Option(name = "-es.hostname", usage = "the name of Elasticsearch HTTP host")
public String esHostname = "localhost";

@Option(name = "-es.port", usage = "the port for Elasticsearch HTTP host")
public int esPort = 9200;

/**
 * The user and password default to those pre-configured for docker-elk
*/
@Option(name = "-es.user", usage = "the user of the ELK stack")
public String esUser = "elastic";

@Option(name = "-es.password", usage = "the password for the ELK stack")
public String esPassword = "changeme";

@Option(name = "-es.poolSize", metaVar = "[NUMBER]", usage = "the number of Elasticsearch clients to keep in the pool")
public int esPoolSize = 10;

@Option(name = "-es.connectTimeout", metaVar = "[NUMBER]", usage = "the Elasticsearch (low level) REST client connect timeout (in ms)")
public int esConnectTimeout = TIMEOUT;

@Option(name = "-es.socketTimeout", metaVar = "[NUMBER]", usage = "the Elasticsearch (low level) REST client socket timeout (in ms)")
public int esSocketTimeout = TIMEOUT;

@Option(name = "-shard.count", usage = "the number of shards for the index")
public int shardCount = -1;

@Option(name = "-shard.current", usage = "the current shard number to produce (indexed from 0)")
public int shardCurrent = -1;

@Option(name = "-dryRun", usage = "performs all analysis steps except Lucene / Solr indexing")
public boolean dryRun = false;
}
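
The extracted bean is consumed exactly like the old nested class: handed to args4j for parsing. A short usage sketch, mirroring the main method further down in this diff (the flag values shown are hypothetical examples):

import io.anserini.index.IndexArgs;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.ParserProperties;

public class ParseIndexArgsDemo {
  public static void main(String[] argv) {
    IndexArgs args = new IndexArgs();
    CmdLineParser parser = new CmdLineParser(args, ParserProperties.defaults().withUsageWidth(90));
    try {
      // e.g. argv = {"-input", "/path/to/collection", "-threads", "8",
      //              "-collection", "TrecCollection", "-generator", "JsoupGenerator",
      //              "-index", "lucene-index", "-bm25.accurate"}
      parser.parseArgument(argv);
    } catch (CmdLineException e) {
      parser.printUsage(System.err);
      return;
    }
    System.out.println("accurate BM25 at index time: " + args.bm25Accurate);
  }
}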
153 changes: 12 additions & 141 deletions src/main/java/io/anserini/index/IndexCollection.java
@@ -25,6 +25,7 @@
import io.anserini.collection.FileSegment;
import io.anserini.collection.SourceDocument;
import io.anserini.index.generator.LuceneDocumentGenerator;
import io.anserini.search.similarity.AccurateBM25Similarity;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.time.DurationFormatUtils;
import org.apache.commons.pool2.BasePooledObjectFactory;
@@ -76,140 +77,6 @@ public final class IndexCollection {

private static final int TIMEOUT = 600 * 1000;

public static final class Args {

// required arguments

@Option(name = "-input", metaVar = "[Directory]", required = true, usage = "collection directory")
public String input;

@Option(name = "-threads", metaVar = "[Number]", required = true, usage = "Number of Threads")
public int threads;

@Option(name = "-collection", required = true, usage = "collection class in io.anserini.collection")
public String collectionClass;

@Option(name = "-generator", required = true, usage = "document generator in io.anserini.index.generator")
public String generatorClass;

// optional arguments

@Option(name = "-index", metaVar = "[Path]", forbids = {"-solr", "-es"}, usage = "index path")
public String index;

@Option(name = "-storePositions", usage = "boolean switch to index storePositions")
public boolean storePositions = false;

@Option(name = "-storeDocvectors", usage = "boolean switch to store document vectors")
public boolean storeDocvectors = false;

@Option(name = "-storeTransformedDocs", usage = "boolean switch to store transformed document text")
public boolean storeTransformedDocs = false;

@Option(name = "-storeRawDocs", usage = "boolean switch to store raw document text")
public boolean storeRawDocs = false;

@Option(name = "-optimize", usage = "boolean switch to optimize index (force merge)")
public boolean optimize = false;

@Option(name = "-keepStopwords", usage = "boolean switch to keep stopwords")
public boolean keepStopwords = false;

@Option(name = "-stemmer", usage = "Stemmer: one of the following porter,krovetz,none. Default porter")
public String stemmer = "porter";

@Option(name = "-uniqueDocid", usage = "remove duplicated documents with the same doc id when indexing. " +
"please note that this option may slow the indexing a lot and if you are sure there is no " +
"duplicated document ids in the corpus you shouldn't use this option.")
public boolean uniqueDocid = false;

@Option(name = "-memorybuffer", usage = "memory buffer size")
public int memorybufferSize = 2048;

@Option(name = "-whitelist", usage = "file containing docids, one per line; only specified docids will be indexed.")
public String whitelist = null;

@Option(name = "-tweet.keepRetweets", usage = "boolean switch to keep retweets while indexing")
public boolean tweetKeepRetweets = false;

@Option(name = "-tweet.keepUrls", usage = "boolean switch to keep URLs while indexing tweets")
public boolean tweetKeepUrls = false;

@Option(name = "-tweet.stemming", usage = "boolean switch to apply Porter stemming while indexing tweets")
public boolean tweetStemming = false;

@Option(name = "-tweet.maxId", usage = "the max tweet Id for indexing. Tweet Ids that are larger " +
" (when being parsed to Long type) than this value will NOT be indexed")
public long tweetMaxId = Long.MAX_VALUE;

@Option(name = "-tweet.deletedIdsFile", metaVar = "[Path]",
usage = "a file that contains deleted tweetIds, one per line. these tweeets won't be indexed")
public String tweetDeletedIdsFile = "";

@Option(name = "-solr", forbids = {"-index", "-es"}, usage = "boolean switch to determine if we should index into Solr")
public boolean solr = false;

@Option(name = "-solr.batch", usage = "the batch size for submitting documents to Solr")
public int solrBatch = 1000;

@Option(name = "-solr.commitWithin", usage = "the number of seconds to commitWithin")
public int solrCommitWithin = 60;

@Option(name = "-solr.index", usage = "the name of the index in Solr")
public String solrIndex = null;

@Option(name = "-solr.zkUrl", usage = "the URL of Solr's ZooKeeper (comma separated list of using ensemble)")
public String zkUrl = null;

@Option(name = "-solr.zkChroot", usage = "the ZooKeeper chroot")
public String zkChroot = "/";

@Option(name = "-solr.poolSize", metaVar = "[NUMBER]", usage = "the number of clients to keep in the pool")
public int solrPoolSize = 16;

@Option(name = "-es", forbids = {"-index", "-solr"}, usage = "boolean switch to determine if we should index through Elasticsearch")
public boolean es = false;

@Option(name = "-es.batch", usage = "the number of index requests in a bulk request sent to Elasticsearch")
public int esBatch = 1000;

@Option(name = "-es.index", usage = "the name of the index in Elasticsearch")
public String esIndex = null;

@Option(name = "-es.hostname", usage = "the name of Elasticsearch HTTP host")
public String esHostname = "localhost";

@Option(name = "-es.port", usage = "the port for Elasticsearch HTTP host")
public int esPort = 9200;

/**
 * The user and password default to those pre-configured for docker-elk
*/
@Option(name = "-es.user", usage = "the user of the ELK stack")
public String esUser = "elastic";

@Option(name = "-es.password", usage = "the password for the ELK stack")
public String esPassword = "changeme";

@Option(name = "-es.poolSize", metaVar = "[NUMBER]", usage = "the number of Elasticsearch clients to keep in the pool")
public int esPoolSize = 10;

@Option(name = "-es.connectTimeout", metaVar = "[NUMBER]", usage = "the Elasticsearch (low level) REST client connect timeout (in ms)")
public int esConnectTimeout = TIMEOUT;

@Option(name = "-es.socketTimeout", metaVar = "[NUMBER]", usage = "the Elasticsearch (low level) REST client socket timeout (in ms)")
public int esSocketTimeout = TIMEOUT;

@Option(name = "-shard.count", usage = "the number of shards for the index")
public int shardCount = -1;

@Option(name = "-shard.current", usage = "the current shard number to produce (indexed from 0)")
public int shardCurrent = -1;

@Option(name = "-dryRun", usage = "performs all analysis steps except Lucene / Solr indexing")
public boolean dryRun = false;
}

public final class Counters {
/**
* Counter for successfully indexed documents.
@@ -266,7 +133,7 @@ public void run() {
@SuppressWarnings("unchecked")
LuceneDocumentGenerator generator =
(LuceneDocumentGenerator) generatorClass
.getDeclaredConstructor(Args.class, Counters.class)
.getDeclaredConstructor(IndexArgs.class, Counters.class)
.newInstance(args, counters);

int cnt = 0;
@@ -366,7 +233,7 @@ public void run() {
@SuppressWarnings("unchecked")
LuceneDocumentGenerator generator =
(LuceneDocumentGenerator) generatorClass
.getDeclaredConstructor(Args.class, Counters.class)
.getDeclaredConstructor(IndexArgs.class, Counters.class)
.newInstance(args, counters);

int cnt = 0;
@@ -505,7 +372,7 @@ public void run() {
@SuppressWarnings("unchecked")
LuceneDocumentGenerator generator =
(LuceneDocumentGenerator) generatorClass
.getDeclaredConstructor(Args.class, Counters.class)
.getDeclaredConstructor(IndexArgs.class, Counters.class)
.newInstance(args, counters);

@SuppressWarnings("unchecked")
@@ -640,7 +507,7 @@ private void sendBulkRequest() {
}


private final IndexCollection.Args args;
private final IndexArgs args;
private final Path collectionPath;
private final Set whitelistDocids;
private final Class collectionClass;
@@ -651,7 +518,7 @@ private void sendBulkRequest() {
private ObjectPool<SolrClient> solrPool;
private ObjectPool<RestHighLevelClient> esPool;

public IndexCollection(IndexCollection.Args args) throws Exception {
public IndexCollection(IndexArgs args) throws Exception {
this.args = args;

LOG.info("DocumentCollection path: " + args.input);
@@ -793,7 +660,7 @@ public void run() throws IOException {
new EnglishStemmingAnalyzer(args.stemmer, CharArraySet.EMPTY_SET) : new EnglishStemmingAnalyzer(args.stemmer);
final TweetAnalyzer tweetAnalyzer = new TweetAnalyzer(args.tweetStemming);
final IndexWriterConfig config = args.collectionClass.equals("TweetCollection") ? new IndexWriterConfig(tweetAnalyzer) : new IndexWriterConfig(analyzer);
config.setSimilarity(new BM25Similarity());
if (args.bm25Accurate) {
config.setSimilarity(new AccurateBM25Similarity()); // necessary during indexing as the norm used in BM25 is already determined at index time.
} else {
config.setSimilarity(new BM25Similarity());
}
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
config.setRAMBufferSizeMB(args.memorybufferSize);
config.setUseCompoundFile(false);
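
Why this branch must run at indexing time rather than only at search time: Lucene's stock BM25Similarity compresses each document's length into a single byte when writing norms, so an index built without -bm25.accurate cannot recover exact lengths later. A toy, plain-Java illustration of the precision at stake (the class name and all numbers are hypothetical):

// Illustration only: BM25 scored with an exact document length vs. a
// quantized length of the kind a one-byte norm encoding produces.
public class Bm25NormDemo {
  static double bm25(double tf, double docLen, double avgLen,
                     double idf, double k1, double b) {
    return idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * docLen / avgLen));
  }

  public static void main(String[] args) {
    double idf = 2.0, k1 = 0.9, b = 0.4, avgLen = 500;
    System.out.printf("exact length 517:     %.6f%n", bm25(3, 517, avgLen, idf, k1, b));
    System.out.printf("quantized length 512: %.6f%n", bm25(3, 512, avgLen, idf, k1, b));
  }
}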
@@ -901,7 +772,7 @@ public void run() throws IOException {
}

public static void main(String[] args) throws Exception {
IndexCollection.Args indexCollectionArgs = new IndexCollection.Args();
IndexArgs indexCollectionArgs = new IndexArgs();
CmdLineParser parser = new CmdLineParser(indexCollectionArgs, ParserProperties.defaults().withUsageWidth(90));

try {
src/main/java/io/anserini/index/generator/JsoupGenerator.java
@@ -16,14 +16,15 @@

package io.anserini.index.generator;

import io.anserini.index.IndexArgs;
import io.anserini.index.IndexCollection;
import io.anserini.index.transform.JsoupStringTransform;

public class JsoupGenerator extends LuceneDocumentGenerator {
public JsoupGenerator() {
super(new JsoupStringTransform());
}
public JsoupGenerator(IndexCollection.Args args, IndexCollection.Counters counters) {
public JsoupGenerator(IndexArgs args, IndexCollection.Counters counters) {
super(new JsoupStringTransform(), args, counters);
}
}
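
The one-line constructor change above is the pattern every generator now follows. A hypothetical sketch of a custom generator against the new signature (it assumes a StringTransform base class with a String-to-String apply method, as JsoupStringTransform suggests; all names here are illustrative, not part of the commit):

package io.anserini.index.generator;

import io.anserini.index.IndexArgs;
import io.anserini.index.IndexCollection;
import io.anserini.index.transform.StringTransform;

public class PassthroughGenerator extends LuceneDocumentGenerator {
  // Identity transform: hands the raw document text to Lucene unchanged.
  private static final class IdentityTransform extends StringTransform {
    @Override
    public String apply(String s) {
      return s;
    }
  }

  public PassthroughGenerator(IndexArgs args, IndexCollection.Counters counters) {
    super(new IdentityTransform(), args, counters);
  }
}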