diff --git a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md index e98ee2e74e..b08012b307 100644 --- a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md +++ b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md @@ -57,7 +57,7 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-cos-dpr-distil \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ - -threads 16 -M 16 -efC 100 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ >& logs/log.msmarco-passage-cos-dpr-distil & ``` diff --git a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md index aed4b16719..dc625e14d0 100644 --- a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md +++ b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md @@ -57,7 +57,7 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-cos-dpr-distil \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ - -threads 16 -M 16 -efC 100 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ >& logs/log.msmarco-passage-cos-dpr-distil & ``` diff --git a/docs/regressions/regressions-dl19-passage-openai-ada2.md b/docs/regressions/regressions-dl19-passage-openai-ada2.md index cecd54ea8e..c5382dc3c5 100644 --- a/docs/regressions/regressions-dl19-passage-openai-ada2.md +++ b/docs/regressions/regressions-dl19-passage-openai-ada2.md @@ -57,7 +57,7 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-openai-ada2 \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-openai-ada2/ \ - -threads 16 -M 16 -efC 100 -memorybuffer 65536 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ >& logs/log.msmarco-passage-openai-ada2 & ``` diff --git a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md index 045417dab5..f040a9ce41 100644 --- a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md +++ b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md @@ -57,7 +57,7 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-cos-dpr-distil \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ - -threads 16 -M 16 -efC 100 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ >& logs/log.msmarco-passage-cos-dpr-distil & ``` diff --git a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md index 7408687a8c..c2f46b422c 100644 --- a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md +++ b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md @@ -57,7 +57,7 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-cos-dpr-distil \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ - -threads 16 -M 16 -efC 100 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ >& logs/log.msmarco-passage-cos-dpr-distil & ``` diff --git a/docs/regressions/regressions-dl20-passage-openai-ada2.md b/docs/regressions/regressions-dl20-passage-openai-ada2.md index 
57220c1465..f3e93c63ef 100644 --- a/docs/regressions/regressions-dl20-passage-openai-ada2.md +++ b/docs/regressions/regressions-dl20-passage-openai-ada2.md @@ -57,7 +57,7 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-openai-ada2 \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-openai-ada2/ \ - -threads 16 -M 16 -efC 100 -memorybuffer 65536 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ >& logs/log.msmarco-passage-openai-ada2 & ``` diff --git a/docs/regressions/regressions-mb11.md b/docs/regressions/regressions-mb11.md index e748171dad..7da300a807 100644 --- a/docs/regressions/regressions-mb11.md +++ b/docs/regressions/regressions-mb11.md @@ -25,7 +25,7 @@ target/appassembler/bin/IndexCollection \ -input /path/to/mb11 \ -generator TweetGenerator \ -index indexes/lucene-index.mb11/ \ - -threads 44 -storePositions -storeDocvectors -storeRaw -uniqueDocid -tweet.keepUrls -tweet.stemming \ + -threads 44 -storePositions -storeDocvectors -storeRaw -tweet.keepUrls -tweet.stemming \ >& logs/log.mb11 & ``` diff --git a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md index a19e8abdd5..6df98970f1 100644 --- a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md +++ b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md @@ -54,7 +54,7 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-cos-dpr-distil \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ - -threads 16 -M 16 -efC 100 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ >& logs/log.msmarco-passage-cos-dpr-distil & ``` diff --git a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md index d813573d06..cf41e9645a 100644 --- a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md +++ b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md @@ -54,7 +54,7 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-cos-dpr-distil \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ - -threads 16 -M 16 -efC 100 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ >& logs/log.msmarco-passage-cos-dpr-distil & ``` diff --git a/docs/regressions/regressions-msmarco-passage-openai-ada2.md b/docs/regressions/regressions-msmarco-passage-openai-ada2.md index 8bd6c97658..c6ca60e2bc 100644 --- a/docs/regressions/regressions-msmarco-passage-openai-ada2.md +++ b/docs/regressions/regressions-msmarco-passage-openai-ada2.md @@ -54,7 +54,7 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-openai-ada2 \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-openai-ada2/ \ - -threads 16 -M 16 -efC 100 -memorybuffer 65536 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ >& logs/log.msmarco-passage-openai-ada2 & ``` diff --git a/src/main/java/io/anserini/index/AbstractIndexer.java b/src/main/java/io/anserini/index/AbstractIndexer.java new file mode 100644 index 0000000000..76a37d986d --- /dev/null +++ b/src/main/java/io/anserini/index/AbstractIndexer.java @@ -0,0 +1,338 @@ +/* + * Anserini: A Lucene toolkit for reproducible information retrieval research + * + * Licensed under the Apache License, Version 
2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.anserini.index; + +import io.anserini.collection.DocumentCollection; +import io.anserini.collection.FileSegment; +import io.anserini.collection.SourceDocument; +import io.anserini.index.generator.EmptyDocumentException; +import io.anserini.index.generator.InvalidDocumentException; +import io.anserini.index.generator.LuceneDocumentGenerator; +import io.anserini.index.generator.SkippedDocumentException; +import org.apache.commons.lang3.time.DurationFormatUtils; +import org.apache.logging.log4j.Level; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.core.config.Configurator; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.kohsuke.args4j.Option; + +import java.io.File; +import java.io.IOException; +import java.lang.reflect.InvocationTargetException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.List; +import java.util.Set; +import java.util.concurrent.Executors; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; + +public abstract class AbstractIndexer implements Runnable { + private static final Logger LOG = LogManager.getLogger(AbstractIndexer.class); + + public static class Args { + @Option(name = "-collection", metaVar = "[class]", required = true, usage = "Collection class in io.anserini.collection.") + public String collectionClass; + + @Option(name = "-input", metaVar = "[path]", required = true, usage = "Input collection.") + public String input; + + @Option(name = "-index", metaVar = "[path]", required = true, usage = "Index path.") + public String index; + + @Option(name = "-uniqueDocid", usage = "Removes duplicate documents with the same docid during indexing.") + public boolean uniqueDocid = false; + + @Option(name = "-optimize", usage = "Optimizes index by merging into a single index segment.") + public boolean optimize = false; + + @Option(name = "-memoryBuffer", metaVar = "[mb]", usage = "Memory buffer size in MB.") + public int memoryBuffer = 4096; + + @Option(name = "-threads", metaVar = "[num]", usage = "Number of indexing threads.") + public int threads = 4; + + @Option(name = "-verbose", forbids = {"-quiet"}, usage = "Enables verbose logging for each indexing thread.") + public boolean verbose = false; + + @Option(name = "-quiet", forbids = {"-verbose"}, usage = "Turns off all logging.") + public boolean quiet = false; + + @Option(name = "-options", usage = "Print information about options.") + public Boolean options = false; + + @Option(name = "-shard.count", metaVar = "[n]", + usage = "Number of shards to partition the document collection into.") + public int shardCount = -1; + + @Option(name = "-shard.current", metaVar = "[n]", + usage = "The current shard number to generate (indexed from 0).") + public int shardCurrent = -1; + } + + public class IndexerThread extends Thread { + private final 
Path inputFile; + private final LuceneDocumentGenerator<SourceDocument> generator; + private final Set<String> whitelistDocids; + + public IndexerThread(Path inputFile, LuceneDocumentGenerator<SourceDocument> generator) { + this(inputFile, generator, null); + } + + public IndexerThread(Path inputFile, LuceneDocumentGenerator<SourceDocument> generator, Set<String> docids) { + this.inputFile = inputFile; + this.generator = generator; + this.whitelistDocids = docids; + + setName(inputFile.getFileName().toString()); + } + + @Override + public void run() { + try (FileSegment<? extends SourceDocument> segment = collection.createFileSegment(inputFile)) { + // We keep track of two separate counts: the total count of documents in this file segment (cnt), + // and the number of documents in this current "batch" (batch). We update the global counter every + // 10k documents: this is so that we get intermediate updates, which is informative if a collection + // has only one file segment; see https://github.com/castorini/anserini/issues/683 + int cnt = 0; + int batch = 0; + + for (SourceDocument d : segment) { + if (!d.indexable()) { + counters.unindexable.incrementAndGet(); + continue; + } + + try { + if (whitelistDocids != null && !whitelistDocids.contains(d.id())) { + counters.skipped.incrementAndGet(); + continue; + } + + Document doc = generator.createDocument(d); + if (args.uniqueDocid) { + // Note that we're reading the config directly, which is within scope. + writer.updateDocument(new Term("id", d.id()), doc); + } else { + writer.addDocument(doc); + } + + cnt++; + batch++; + } catch (EmptyDocumentException e1) { + counters.empty.incrementAndGet(); + continue; + } catch (SkippedDocumentException e2) { + counters.skipped.incrementAndGet(); + continue; + } catch (InvalidDocumentException e3) { + counters.errors.incrementAndGet(); + continue; + } + + // Add the counts from this batch, reset batch counter. + if (batch % 10000 == 0) { + counters.indexed.addAndGet(batch); + batch = 0; + } + } + + // Add the remaining documents. + counters.indexed.addAndGet(batch); + + int skipped = segment.getSkippedCount(); + if (skipped > 0) { + counters.skipped.addAndGet(skipped); + LOG.warn(inputFile.getParent().getFileName().toString() + File.separator + + inputFile.getFileName().toString() + ": " + skipped + " docs skipped."); + } + + if (segment.getErrorStatus()) { + counters.errors.incrementAndGet(); + LOG.error(inputFile.getParent().getFileName().toString() + File.separator + + inputFile.getFileName().toString() + ": error iterating through segment."); + } + + // Log at the debug level because this can be quite noisy if there are lots of file segments. + LOG.debug(inputFile.getParent().getFileName().toString() + File.separator + + inputFile.getFileName().toString() + ": " + cnt + " docs added."); + } catch (Exception e) { + LOG.error(Thread.currentThread().getName() + ": Unexpected Exception:", e); + } + } + } + + protected final Args args; + protected Counters counters = new Counters(); + protected Path collectionPath; + protected DocumentCollection<? extends SourceDocument> collection; + protected Class<LuceneDocumentGenerator<? extends SourceDocument>> generatorClass; + protected IndexWriter writer; + + @SuppressWarnings("unchecked") + public AbstractIndexer(Args args) { + this.args = args; + + if (args.verbose) { + // If verbose logging is enabled, change the default log level to DEBUG so we get per-thread logging messages. + Configurator.setRootLevel(Level.DEBUG); + LOG.info("Setting log level to " + Level.DEBUG); + } else if (args.quiet) { + // If quiet mode enabled, only report warnings and above. 
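+ // At WARN, the per-segment skip and error messages that indexer threads emit via LOG.warn/LOG.error still appear; only INFO-level progress reporting is suppressed.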
+ Configurator.setRootLevel(Level.WARN); + } else { + // Otherwise, we get the standard set of log messages. + Configurator.setRootLevel(Level.INFO); + LOG.info("Setting log level to " + Level.INFO); + } + + LOG.info("============ Loading Index Configuration ============"); + LOG.info("AbstractIndexer settings:"); + LOG.info(" + DocumentCollection path: " + args.input); + LOG.info(" + CollectionClass: " + args.collectionClass); + LOG.info(" + Index path: " + args.index); + LOG.info(" + Threads: " + args.threads); + LOG.info(" + Optimize (merge segments)? " + args.optimize); + + // Our documentation uses /path/to/foo as a convention: to make copy and paste of the commands work, + // we assume collections/ as the path location. + String pathStr = args.input; + if (pathStr.startsWith("/path/to")) { + pathStr = pathStr.replace("/path/to", "collections"); + } + this.collectionPath = Paths.get(pathStr); + if (!Files.exists(collectionPath) || !Files.isReadable(collectionPath) || !Files.isDirectory(collectionPath)) { + throw new IllegalArgumentException(String.format("Invalid collection path \"%s\".", collectionPath)); + } + + try { + Class<? extends DocumentCollection<? extends SourceDocument>> collectionClass = (Class<? extends DocumentCollection<? extends SourceDocument>>) + Class.forName("io.anserini.collection." + args.collectionClass); + this.collection = collectionClass.getConstructor(Path.class).newInstance(collectionPath); + } catch (Exception e) { + throw new IllegalArgumentException(String.format("Unable to load collection class \"%s\".", args.collectionClass)); + } + } + + @Override + public void run() { + LOG.info("============ Indexing Collection ============"); + final long start = System.nanoTime(); + + final List<Path> segmentPaths = args.shardCount > 1 ? + collection.getSegmentPaths(args.shardCount, args.shardCurrent) : + collection.getSegmentPaths(); + final int segmentCnt = segmentPaths.size(); + + final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(args.threads); + LOG.info(String.format("Thread pool with %s threads initialized.", args.threads)); + LOG.info(String.format("%,d %s found in %s", segmentCnt, (segmentCnt == 1 ? "file" : "files"), collectionPath)); + LOG.info("Starting to index..."); + + // Dispatch to default method to process the segments; subclasses can override this method if desired. + processSegments(executor, segmentPaths); + executor.shutdown(); + + try { + // Wait for existing tasks to terminate. + while (!executor.awaitTermination(1, TimeUnit.MINUTES)) { + if (segmentCnt == 1) { + LOG.info(String.format("%,d documents indexed", counters.indexed.get())); + } else { + LOG.info(String.format("%.2f%% of files completed, %,d documents indexed", + (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d, counters.indexed.get())); + } + } + } catch (InterruptedException ie) { + // (Re-)Cancel if current thread also interrupted. + executor.shutdownNow(); + // Preserve interrupt status. + Thread.currentThread().interrupt(); + } + + if (segmentCnt != executor.getCompletedTaskCount()) { + throw new RuntimeException("totalFiles = " + segmentCnt + + " is not equal to completedTaskCount = " + executor.getCompletedTaskCount()); + } + + long numIndexed = writer.getDocStats().maxDoc; + if (numIndexed != counters.indexed.get()) { + // We want to log a warning here, as opposed to throwing an exception, because for certain collections, + // this might be expected. 
For example, when indexing tweets: if a tweet is delivered multiple times + // (i.e., same docid), with -uniqueDocid we're going to update the doc in the index in place, leading + // to differences between the counts. + LOG.warn(String.format("Unexpected difference between number of indexed documents (%d) and index maxDoc (%d).", + counters.indexed.get(), numIndexed)); + } + + // Do a final commit. + try { + writer.commit(); + if (args.optimize) { + writer.forceMerge(1); + } + } catch (IOException e) { + // It is possible that this happens... but nothing much we can do at this point, + // so just log the error and move on. + LOG.error(e); + } finally { + try { + writer.close(); + } catch (IOException e) { + // It is possible that this happens... but nothing much we can do at this point, + // so just log the error and move on. + LOG.error(e); + } + } + + LOG.info(String.format("Indexing Complete! %,d documents indexed", numIndexed)); + LOG.info("============ Final Counter Values ============"); + LOG.info(String.format("indexed: %,12d", counters.indexed.get())); + LOG.info(String.format("unindexable: %,12d", counters.unindexable.get())); + LOG.info(String.format("empty: %,12d", counters.empty.get())); + LOG.info(String.format("skipped: %,12d", counters.skipped.get())); + LOG.info(String.format("errors: %,12d", counters.errors.get())); + + final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); + LOG.info(String.format("Total %,d documents indexed in %s", numIndexed, + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"))); + } + + // Default method to process the segments; subclasses can override this method if desired. + protected void processSegments(ThreadPoolExecutor executor, List<Path> segmentPaths) { + segmentPaths.forEach((segmentPath) -> { + try { + // Each thread gets its own document generator, so we don't need to make any assumptions about its thread safety. 
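+ // As an illustration (assuming a hypothetical generator class Foo with a no-arg constructor), the reflective call below amounts to: LuceneDocumentGenerator<SourceDocument> generator = new Foo();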
+ @SuppressWarnings("unchecked") + LuceneDocumentGenerator<SourceDocument> generator = (LuceneDocumentGenerator<SourceDocument>) + generatorClass.getDeclaredConstructor((Class<?>[]) null).newInstance(); + + executor.execute(new IndexerThread(segmentPath, generator)); + } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) { + throw new IllegalArgumentException(String.format("Unable to load LuceneDocumentGenerator \"%s\".", generatorClass.getSimpleName())); + } + }); + } + + public Counters getCounters() { + return this.counters; + } +} diff --git a/src/main/java/io/anserini/index/IndexCollection.java b/src/main/java/io/anserini/index/IndexCollection.java index e2d6baa996..f8775d92f0 100644 --- a/src/main/java/io/anserini/index/IndexCollection.java +++ b/src/main/java/io/anserini/index/IndexCollection.java @@ -17,95 +17,50 @@ package io.anserini.index; import io.anserini.analysis.AnalyzerMap; +import io.anserini.analysis.AutoCompositeAnalyzer; import io.anserini.analysis.CompositeAnalyzer; -import io.anserini.analysis.HuggingFaceTokenizerAnalyzer; import io.anserini.analysis.DefaultEnglishAnalyzer; -import io.anserini.analysis.AutoCompositeAnalyzer; +import io.anserini.analysis.HuggingFaceTokenizerAnalyzer; import io.anserini.analysis.TweetAnalyzer; -import io.anserini.collection.DocumentCollection; -import io.anserini.collection.FileSegment; import io.anserini.collection.SourceDocument; -import io.anserini.index.generator.EmptyDocumentException; -import io.anserini.index.generator.InvalidDocumentException; import io.anserini.index.generator.LuceneDocumentGenerator; -import io.anserini.index.generator.SkippedDocumentException; import io.anserini.search.similarity.AccurateBM25Similarity; import io.anserini.search.similarity.ImpactSimilarity; import org.apache.commons.io.FileUtils; -import org.apache.commons.lang3.time.DurationFormatUtils; -import org.apache.logging.log4j.Level; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import org.apache.logging.log4j.core.config.Configurator; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; -import org.apache.lucene.document.Document; import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.Term; import org.apache.lucene.search.similarities.BM25Similarity; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.kohsuke.args4j.CmdLineException; import org.kohsuke.args4j.CmdLineParser; import org.kohsuke.args4j.Option; -import org.kohsuke.args4j.OptionHandlerFilter; import org.kohsuke.args4j.ParserProperties; import org.kohsuke.args4j.spi.StringArrayOptionHandler; import java.io.File; -import java.io.IOException; -import java.nio.file.Files; +import java.lang.reflect.InvocationTargetException; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; -import java.util.concurrent.Executors; import java.util.concurrent.ThreadPoolExecutor; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicLong; -public final class IndexCollection { +public final class IndexCollection extends AbstractIndexer { private static final Logger LOG = LogManager.getLogger(IndexCollection.class); // This is the default analyzer used, unless another stemming 
algorithm or language is specified. public static final Analyzer DEFAULT_ANALYZER = DefaultEnglishAnalyzer.newDefaultInstance(); - public static class Args { - - private static final int TIMEOUT = 600 * 1000; - - // required arguments - - @Option(name = "-input", metaVar = "[path]", required = true, - usage = "Location of input collection.") - public String input; - - @Option(name = "-collection", metaVar = "[class]", required = true, - usage = "Collection class in package 'io.anserini.collection'.") - public String collectionClass; - - @Option(name = "-index", metaVar = "[path]", usage = "Index path.", required = true) - public String index; - - // optional general arguments - - @Option(name = "-verbose", forbids = {"-quiet"}, - usage = "Enables verbose logging for each indexing thread; can be noisy if collection has many small file segments.") - public boolean verbose = false; - - @Option(name = "-quiet", forbids = {"-verbose"}, - usage = "Turns off all logging.") - public boolean quiet = false; - - // optional arguments - - @Option(name = "-threads", metaVar = "[num]", usage = "Number of indexing threads.") - public int threads = 8; - + public static class Args extends AbstractIndexer.Args { @Option(name = "-append", usage = "Append documents.") public boolean append = false; @@ -133,10 +88,6 @@ public static class Args { usage = "Boolean switch to store raw source documents.") public boolean storeRaw = false; - @Option(name = "-optimize", - usage = "Boolean switch to optimize index (i.e., force merge) into a single segment; costly for large collections.") - public boolean optimize = false; - @Option(name = "-keepStopwords", usage = "Boolean switch to keep stopwords.") public boolean keepStopwords = false; @@ -149,15 +100,6 @@ public static class Args { usage = "Stemmer: one of the following {porter, krovetz, none}; defaults to 'porter'.") public String stemmer = "porter"; - @Option(name = "-uniqueDocid", - usage = "Removes duplicate documents with the same docid during indexing. 
This significantly slows indexing throughput " + - "but may be needed for tweet collections since the streaming API might deliver a tweet multiple times.") - public boolean uniqueDocid = false; - - @Option(name = "-memorybuffer", metaVar = "[mb]", - usage = "Memory buffer size (in MB).") - public int memorybufferSize = 2048; - @Option(name = "-whitelist", metaVar = "[file]", usage = "File containing list of docids, one per line; only these docids will be indexed.") public String whitelist = null; @@ -211,189 +153,35 @@ public static class Args { @Option(name = "-tweet.deletedIdsFile", metaVar = "[file]", usage = "File that contains deleted tweet ids (longs), one per line; these tweets will be skipped during indexing.") public String tweetDeletedIdsFile = ""; - - // Sharding options - - @Option(name = "-shard.count", metaVar = "[n]", - usage = "Number of shards to partition the document collection into.") - public int shardCount = -1; - - @Option(name = "-shard.current", metaVar = "[n]", - usage = "The current shard number to generate (indexed from 0).") - public int shardCurrent = -1; - } - - private final class LocalIndexerThread extends Thread { - final private Path inputFile; - final private IndexWriter writer; - final private DocumentCollection collection; - private FileSegment fileSegment; - - private LocalIndexerThread(IndexWriter writer, DocumentCollection collection, Path inputFile) { - this.writer = writer; - this.collection = collection; - this.inputFile = inputFile; - setName(inputFile.getFileName().toString()); - } - - @Override - @SuppressWarnings("unchecked") - public void run() { - try { - LuceneDocumentGenerator generator = (LuceneDocumentGenerator) - generatorClass.getDeclaredConstructor(Args.class).newInstance(args); - - // We keep track of two separate counts: the total count of documents in this file segment (cnt), - // and the number of documents in this current "batch" (batch). We update the global counter every - // 10k documents: this is so that we get intermediate updates, which is informative if a collection - // has only one file segment; see https://github.com/castorini/anserini/issues/683 - int cnt = 0; - int batch = 0; - - FileSegment segment = collection.createFileSegment(inputFile); - // in order to call close() and clean up resources in case of exception - this.fileSegment = segment; - - for (SourceDocument d : segment) { - if (!d.indexable()) { - counters.unindexable.incrementAndGet(); - continue; - } - - Document doc; - try { - doc = generator.createDocument(d); - } catch (EmptyDocumentException e1) { - counters.empty.incrementAndGet(); - continue; - } catch (SkippedDocumentException e2) { - counters.skipped.incrementAndGet(); - continue; - } catch (InvalidDocumentException e3) { - counters.errors.incrementAndGet(); - continue; - } - - if (whitelistDocids != null && !whitelistDocids.contains(d.id())) { - counters.skipped.incrementAndGet(); - continue; - } - - if (args.uniqueDocid) { - writer.updateDocument(new Term("id", d.id()), doc); - } else { - writer.addDocument(doc); - } - cnt++; - batch++; - - // And the counts from this batch, reset batch counter. - if (batch % 10000 == 0) { - counters.indexed.addAndGet(batch); - batch = 0; - } - } - - // Add the remaining documents. - counters.indexed.addAndGet(batch); - - int skipped = segment.getSkippedCount(); - if (skipped > 0) { - // When indexing tweets, this is normal, because there are delete messages that are skipped over. 
- counters.skipped.addAndGet(skipped); - LOG.warn(inputFile.getParent().getFileName().toString() + File.separator + - inputFile.getFileName().toString() + ": " + skipped + " docs skipped."); - } - - if (segment.getErrorStatus()) { - counters.errors.incrementAndGet(); - LOG.error(inputFile.getParent().getFileName().toString() + File.separator + - inputFile.getFileName().toString() + ": error iterating through segment."); - } - - // Log at the debug level because this can be quite noisy if there are lots of file segments. - LOG.debug(inputFile.getParent().getFileName().toString() + File.separator + - inputFile.getFileName().toString() + ": " + cnt + " docs added."); - } catch (Exception e) { - LOG.error(Thread.currentThread().getName() + ": Unexpected Exception:", e); - } finally { - if (fileSegment != null) { - fileSegment.close(); - } - } - } } - private final Args args; - private final Path collectionPath; - private final Set whitelistDocids; - private final Class collectionClass; - private final Class generatorClass; - private final DocumentCollection collection; - private final Counters counters; - private Path indexPath; + private final Set<String> whitelistDocids; @SuppressWarnings("unchecked") public IndexCollection(Args args) throws Exception { - this.args = args; - - if (args.verbose) { - // If verbose logging enabled, changed default log level to DEBUG so we get per-thread logging messages. - Configurator.setRootLevel(Level.DEBUG); - LOG.info("Setting log level to " + Level.DEBUG); - } else if (args.quiet) { - // If quiet mode enabled, only report warnings and above. - Configurator.setRootLevel(Level.WARN); - } else { - // Otherwise, we get the standard set of log messages. - Configurator.setRootLevel(Level.INFO); - LOG.info("Setting log level to " + Level.INFO); - } - - LOG.info("Starting indexer..."); - LOG.info("============ Loading Parameters ============"); - LOG.info("DocumentCollection path: " + args.input); - LOG.info("CollectionClass: " + args.collectionClass); - LOG.info("Generator: " + args.generatorClass); - LOG.info("Threads: " + args.threads); - LOG.info("Language: " + args.language); - LOG.info("Stemmer: " + args.stemmer); - LOG.info("Keep stopwords? " + args.keepStopwords); - LOG.info("Stopwords: " + args.stopwords); - LOG.info("Store positions? " + args.storePositions); - LOG.info("Store docvectors? " + args.storeDocvectors); - LOG.info("Store document \"contents\" field? " + args.storeContents); - LOG.info("Store document \"raw\" field? " + args.storeRaw); - LOG.info("Additional fields to index: " + Arrays.toString(args.fields)); - LOG.info("Optimize (merge segments)? " + args.optimize); - LOG.info("Whitelist: " + args.whitelist); - LOG.info("Pretokenized?: " + args.pretokenized); - LOG.info("Index path: " + args.index); - - if (args.index != null) { - this.indexPath = Paths.get(args.index); - if (!Files.exists(this.indexPath)) { - Files.createDirectories(this.indexPath); - } - } + super(args); + + LOG.info("IndexCollection settings:"); + LOG.info(" + Generator: " + args.generatorClass); + LOG.info(" + Language: " + args.language); + LOG.info(" + Stemmer: " + args.stemmer); + LOG.info(" + Keep stopwords? " + args.keepStopwords); + LOG.info(" + Stopwords: " + args.stopwords); + LOG.info(" + Store positions? " + args.storePositions); + LOG.info(" + Store docvectors? " + args.storeDocvectors); + LOG.info(" + Store document \"contents\" field? " + args.storeContents); + LOG.info(" + Store document \"raw\" field? " + args.storeRaw); + LOG.info(" + Additional fields to index: " + Arrays.toString(args.fields)); + LOG.info(" + Whitelist: " + args.whitelist); + LOG.info(" + Pretokenized?: " + args.pretokenized); - // Our documentation uses /path/to/foo as a convention: to make copy and paste of the commands work, we assume - // collections/ as the path location. - String pathStr = args.input; - if (pathStr.startsWith("/path/to")) { - pathStr = pathStr.replace("/path/to", "collections"); - } - collectionPath = Paths.get(pathStr); - if (!Files.exists(collectionPath) || !Files.isReadable(collectionPath) || !Files.isDirectory(collectionPath)) { - throw new RuntimeException("Document directory " + collectionPath.toString() + " does not exist or is not readable, please check the path"); + try { + super.generatorClass = (Class<LuceneDocumentGenerator<? extends SourceDocument>>) + Class.forName("io.anserini.index.generator." + args.generatorClass); + } catch (Exception e) { + throw new IllegalArgumentException(String.format("Unable to load generator class \"%s\".", args.generatorClass)); } - this.generatorClass = Class.forName("io.anserini.index.generator." + args.generatorClass); - this.collectionClass = Class.forName("io.anserini.collection." + args.collectionClass); - - // Initialize the collection. - collection = (DocumentCollection) this.collectionClass.getConstructor(Path.class).newInstance(collectionPath); - if (args.whitelist != null) { List<String> lines = FileUtils.readLines(new File(args.whitelist), "utf-8"); this.whitelistDocids = new HashSet<>(lines); @@ -401,179 +189,110 @@ public IndexCollection(Args args) throws Exception { - this.whitelistDocids = null; } - this.counters = new Counters(); + final Directory dir = FSDirectory.open(Paths.get(args.index)); + final IndexWriterConfig config = new IndexWriterConfig(getAnalyzer()); + + if (args.bm25Accurate) { + // Necessary during indexing as the norm used in BM25 is already determined at index time. + config.setSimilarity(new AccurateBM25Similarity()); + } else if (args.impact) { + config.setSimilarity(new ImpactSimilarity()); + } else { + config.setSimilarity(new BM25Similarity()); + } + config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); + config.setRAMBufferSizeMB(args.memoryBuffer); + config.setUseCompoundFile(false); + config.setMergeScheduler(new ConcurrentMergeScheduler()); + + super.writer = new IndexWriter(dir, config); + } private Analyzer getAnalyzer() { try { - if (args.collectionClass.equals("TweetCollection")) { - return new TweetAnalyzer(args.tweetStemming); - } else if (args.useAutoCompositeAnalyzer) { + // args is stored in the super-class; here, explicitly get from super-class and down-cast. 
+ Args castedArgs = (Args) super.args; + if (castedArgs.collectionClass.equals("TweetCollection")) { + return new TweetAnalyzer(castedArgs.tweetStemming); + } else if (castedArgs.useAutoCompositeAnalyzer) { LOG.info("Using AutoCompositeAnalyzer"); - return AutoCompositeAnalyzer.getAnalyzer(args.language, args.analyzeWithHuggingFaceTokenizer); - } else if (args.useCompositeAnalyzer) { + return AutoCompositeAnalyzer.getAnalyzer(castedArgs.language, castedArgs.analyzeWithHuggingFaceTokenizer); + } else if (castedArgs.useCompositeAnalyzer) { final Analyzer languageSpecificAnalyzer; - if (AnalyzerMap.analyzerMap.containsKey(args.language)) { - languageSpecificAnalyzer = AnalyzerMap.getLanguageSpecificAnalyzer(args.language); - } else if (args.language.equals("en")) { - languageSpecificAnalyzer = DefaultEnglishAnalyzer.fromArguments(args.stemmer, args.keepStopwords, args.stopwords); + if (AnalyzerMap.analyzerMap.containsKey(castedArgs.language)) { + languageSpecificAnalyzer = AnalyzerMap.getLanguageSpecificAnalyzer(castedArgs.language); + } else if (castedArgs.language.equals("en")) { + languageSpecificAnalyzer = DefaultEnglishAnalyzer.fromArguments(castedArgs.stemmer, castedArgs.keepStopwords, castedArgs.stopwords); } else { languageSpecificAnalyzer = new WhitespaceAnalyzer(); } String message = "Using CompositeAnalyzer with HF Tokenizer: %s & Analyzer %s"; - LOG.info(String.format(message, args.analyzeWithHuggingFaceTokenizer, languageSpecificAnalyzer.getClass().getName())); - return new CompositeAnalyzer(args.analyzeWithHuggingFaceTokenizer, languageSpecificAnalyzer); - } else if (args.analyzeWithHuggingFaceTokenizer!= null) { - return new HuggingFaceTokenizerAnalyzer(args.analyzeWithHuggingFaceTokenizer); - } else if (AnalyzerMap.analyzerMap.containsKey(args.language)) { + LOG.info(String.format(message, castedArgs.analyzeWithHuggingFaceTokenizer, languageSpecificAnalyzer.getClass().getName())); + return new CompositeAnalyzer(castedArgs.analyzeWithHuggingFaceTokenizer, languageSpecificAnalyzer); + } else if (castedArgs.analyzeWithHuggingFaceTokenizer!= null) { + return new HuggingFaceTokenizerAnalyzer(castedArgs.analyzeWithHuggingFaceTokenizer); + } else if (AnalyzerMap.analyzerMap.containsKey(castedArgs.language)) { LOG.info("Using language-specific analyzer"); - LOG.info("Language: " + args.language); - return AnalyzerMap.getLanguageSpecificAnalyzer(args.language); - } else if ( Arrays.asList("ha","so","sw","yo").contains(args.language)) { + LOG.info("Language: " + castedArgs.language); + return AnalyzerMap.getLanguageSpecificAnalyzer(castedArgs.language); + } else if ( Arrays.asList("ha","so","sw","yo").contains(castedArgs.language)) { return new WhitespaceAnalyzer(); - } else if (args.pretokenized) { + } else if (castedArgs.pretokenized) { return new WhitespaceAnalyzer(); } else { // Default to English LOG.info("Using DefaultEnglishAnalyzer"); - LOG.info("Stemmer: " + args.stemmer); - LOG.info("Keep stopwords? " + args.keepStopwords); - LOG.info("Stopwords file: " + args.stopwords); - return DefaultEnglishAnalyzer.fromArguments(args.stemmer, args.keepStopwords, args.stopwords); + LOG.info("Stemmer: " + castedArgs.stemmer); + LOG.info("Keep stopwords? 
" + castedArgs.keepStopwords); + LOG.info("Stopwords file: " + castedArgs.stopwords); + return DefaultEnglishAnalyzer.fromArguments(castedArgs.stemmer, castedArgs.keepStopwords, castedArgs.stopwords); } } catch (Exception e) { return null; } } - public Counters run() throws IOException { - final long start = System.nanoTime(); - LOG.info("============ Indexing Collection ============"); - - int numThreads = args.threads; - IndexWriter writer = null; - - // Used for LocalIndexThread - if (indexPath != null) { - final Directory dir = FSDirectory.open(indexPath); - final IndexWriterConfig config; - final Analyzer analyzer; - analyzer = getAnalyzer(); - config = new IndexWriterConfig(analyzer); - - if (args.bm25Accurate) { - config.setSimilarity(new AccurateBM25Similarity()); // necessary during indexing as the norm used in BM25 is already determined at index time. - } if (args.impact ) { - config.setSimilarity(new ImpactSimilarity()); - } else { - config.setSimilarity(new BM25Similarity()); - } - config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); - config.setRAMBufferSizeMB(args.memorybufferSize); - config.setUseCompoundFile(false); - config.setMergeScheduler(new ConcurrentMergeScheduler()); - - writer = new IndexWriter(dir, config); - } - - final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads); - LOG.info("Thread pool with " + numThreads + " threads initialized."); - - LOG.info("Initializing collection in " + collectionPath.toString()); - - List segmentPaths = collection.getSegmentPaths(); - // when we want sharding to be done - if (args.shardCount > 1) { - segmentPaths = collection.getSegmentPaths(args.shardCount, args.shardCurrent); - } - final int segmentCnt = segmentPaths.size(); - - LOG.info(String.format("%,d %s found", segmentCnt, (segmentCnt == 1 ? "file" : "files" ))); - LOG.info("Starting to index..."); - - for (int i = 0; i < segmentCnt; i++) { - executor.execute(new LocalIndexerThread(writer, collection, (Path) segmentPaths.get(i))); - } - - executor.shutdown(); - - try { - // Wait for existing tasks to terminate - while (!executor.awaitTermination(1, TimeUnit.MINUTES)) { - if (segmentCnt == 1) { - LOG.info(String.format("%,d documents indexed", counters.indexed.get())); - } else { - LOG.info(String.format("%.2f%% of files completed, %,d documents indexed", - (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d, counters.indexed.get())); - } - } - } catch (InterruptedException ie) { - // (Re-)Cancel if current thread also interrupted - executor.shutdownNow(); - // Preserve interrupt status - Thread.currentThread().interrupt(); - } - - if (segmentCnt != executor.getCompletedTaskCount()) { - throw new RuntimeException("totalFiles = " + segmentCnt + - " is not equal to completedTaskCount = " + executor.getCompletedTaskCount()); - } - - long numIndexed = writer.getDocStats().maxDoc; - - // Do a final commit - try { - if (writer != null) { - writer.commit(); - if (args.optimize) { - writer.forceMerge(1); - } - } - } finally { + protected void processSegments(ThreadPoolExecutor executor, List segmentPaths) { + segmentPaths.forEach((segmentPath) -> { try { - if (writer != null) { - writer.close(); - } - } catch (IOException e) { - // It is possible that this happens... but nothing much we can do at this point, - // so just log the error and move on. - LOG.error(e); + // Each thread gets its own document generator, so we don't need to make any assumptions about its thread safety. 
+ @SuppressWarnings("unchecked") + LuceneDocumentGenerator<SourceDocument> generator = (LuceneDocumentGenerator<SourceDocument>) + generatorClass.getDeclaredConstructor(Args.class).newInstance(this.args); + + executor.execute(new AbstractIndexer.IndexerThread(segmentPath, generator, whitelistDocids)); + } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) { + throw new IllegalArgumentException(String.format("Unable to load LuceneDocumentGenerator \"%s\".", generatorClass.getSimpleName())); } - } - - if (numIndexed != counters.indexed.get()) { - LOG.warn("Unexpected difference between number of indexed documents and index maxDoc."); - } - - LOG.info(String.format("Indexing Complete! %,d documents indexed", numIndexed)); - LOG.info("============ Final Counter Values ============"); - LOG.info(String.format("indexed: %,12d", counters.indexed.get())); - LOG.info(String.format("unindexable: %,12d", counters.unindexable.get())); - LOG.info(String.format("empty: %,12d", counters.empty.get())); - LOG.info(String.format("skipped: %,12d", counters.skipped.get())); - LOG.info(String.format("errors: %,12d", counters.errors.get())); - - final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); - LOG.info(String.format("Total %,d documents indexed in %s", numIndexed, - DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"))); - - return counters; + }); } public static void main(String[] args) throws Exception { - Args indexCollectionArgs = new Args(); - CmdLineParser parser = new CmdLineParser(indexCollectionArgs, ParserProperties.defaults().withUsageWidth(100)); + Args indexArgs = new Args(); + CmdLineParser parser = new CmdLineParser(indexArgs, ParserProperties.defaults().withUsageWidth(120)); try { parser.parseArgument(args); } catch (CmdLineException e) { - System.err.println(e.getMessage()); - parser.printUsage(System.err); - System.err.println("Example: " + IndexCollection.class.getSimpleName() + - parser.printExample(OptionHandlerFilter.REQUIRED)); + if (indexArgs.options) { + System.err.printf("Options for %s:\n\n", IndexCollection.class.getSimpleName()); + parser.printUsage(System.err); + + List<String> required = new ArrayList<>(); + parser.getOptions().forEach((option) -> { + if (option.option.required()) { + required.add(option.option.toString()); + } + }); + + System.err.printf("\nRequired options are %s\n", required); + } else { + System.err.printf("Error: %s. 
For help, use "-options" to print out information about options.\n", e.getMessage()); + } + return; } - new IndexCollection(indexCollectionArgs).run(); + new IndexCollection(indexArgs).run(); } } diff --git a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java index 0dbd80a288..10e7c15640 100644 --- a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java +++ b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java @@ -16,18 +16,10 @@ package io.anserini.index; -import io.anserini.collection.DocumentCollection; -import io.anserini.collection.FileSegment; import io.anserini.collection.SourceDocument; -import io.anserini.index.generator.EmptyDocumentException; -import io.anserini.index.generator.InvalidDocumentException; import io.anserini.index.generator.LuceneDocumentGenerator; -import io.anserini.index.generator.SkippedDocumentException; -import org.apache.commons.lang3.time.DurationFormatUtils; -import org.apache.logging.log4j.Level; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import org.apache.logging.log4j.core.config.Configurator; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsWriter; @@ -44,215 +36,86 @@ import org.kohsuke.args4j.CmdLineException; import org.kohsuke.args4j.CmdLineParser; import org.kohsuke.args4j.Option; -import org.kohsuke.args4j.OptionHandlerFilter; import org.kohsuke.args4j.ParserProperties; -import java.io.File; import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; import java.nio.file.Paths; +import java.util.ArrayList; import java.util.List; -import java.util.concurrent.Executors; -import java.util.concurrent.ThreadPoolExecutor; -import java.util.concurrent.TimeUnit; -public final class IndexHnswDenseVectors { +public final class IndexHnswDenseVectors extends AbstractIndexer { private static final Logger LOG = LogManager.getLogger(IndexHnswDenseVectors.class); - public static final class Args { - @Option(name = "-collection", metaVar = "[class]", required = true, usage = "Collection class in io.anserini.collection.") - public String collectionClass; - - @Option(name = "-input", metaVar = "[path]", required = true, usage = "Input collection.") - public String input; - + public static final class Args extends AbstractIndexer.Args { @Option(name = "-generator", metaVar = "[class]", usage = "Document generator class in io.anserini.index.generator.") public String generatorClass = "HnswDenseVectorDocumentGenerator"; - @Option(name = "-index", metaVar = "[path]", required = true, usage = "Index path.") - public String index; - @Option(name = "-M", metaVar = "[num]", usage = "HNSW parameter M") public int M = 16; @Option(name = "-efC", metaVar = "[num]", usage = "HNSW parameter efC (ef at construction time)") public int efC = 100; - @Option(name = "-optimize", usage = "Optimizes index by merging into a single index segment.") - public boolean optimize = false; - - @Option(name = "-memorybuffer", metaVar = "[mb]", usage = "Memory buffer size in MB.") - public int memorybufferSize = 65536; - @Option(name = "-storeVectors", usage = "Boolean switch to store raw vectors.") public boolean storeVectors = false; - - @Option(name = "-threads", metaVar = "[num]", usage = "Number of indexing threads.") - public int threads = 4; - - @Option(name = "-verbose", forbids = {"-quiet"}, usage = "Enables verbose logging for each indexing thread.") - 
public boolean verbose = false; - - @Option(name = "-quiet", forbids = {"-verbose"}, usage = "Turns off all logging.") - public boolean quiet = false; - } - - private final class LocalIndexerThread extends Thread { - final private Path inputFile; - final private IndexWriter writer; - final private DocumentCollection collection; - - private LocalIndexerThread(IndexWriter writer, DocumentCollection collection, Path inputFile) { - this.writer = writer; - this.collection = collection; - this.inputFile = inputFile; - setName(inputFile.getFileName().toString()); - } - - @Override - public void run() { - FileSegment segment = null; - - try { - @SuppressWarnings("unchecked") - LuceneDocumentGenerator generator = (LuceneDocumentGenerator) - generatorClass.getDeclaredConstructor(Args.class).newInstance(args); - - // We keep track of two separate counts: the total count of documents in this file segment (cnt), - // and the number of documents in this current "batch" (batch). We update the global counter every - // 10k documents: this is so that we get intermediate updates, which is informative if a collection - // has only one file segment; see https://github.com/castorini/anserini/issues/683 - int cnt = 0; - int batch = 0; - - segment = collection.createFileSegment(inputFile); - - for (SourceDocument d : segment) { - if (!d.indexable()) { - counters.unindexable.incrementAndGet(); - continue; - } - - try { - writer.addDocument(generator.createDocument(d)); - - cnt++; - batch++; - } catch (EmptyDocumentException e1) { - counters.empty.incrementAndGet(); - continue; - } catch (SkippedDocumentException e2) { - counters.skipped.incrementAndGet(); - continue; - } catch (InvalidDocumentException e3) { - counters.errors.incrementAndGet(); - continue; - } - - // And the counts from this batch, reset batch counter. - if (batch % 10000 == 0) { - counters.indexed.addAndGet(batch); - batch = 0; - } - } - - // Add the remaining documents. - counters.indexed.addAndGet(batch); - - int skipped = segment.getSkippedCount(); - if (skipped > 0) { - // When indexing tweets, this is normal, because there are delete messages that are skipped over. - counters.skipped.addAndGet(skipped); - LOG.warn(inputFile.getParent().getFileName().toString() + File.separator + - inputFile.getFileName().toString() + ": " + skipped + " docs skipped."); - } - - if (segment.getErrorStatus()) { - counters.errors.incrementAndGet(); - LOG.error(inputFile.getParent().getFileName().toString() + File.separator + - inputFile.getFileName().toString() + ": error iterating through segment."); - } - - // Log at the debug level because this can be quite noisy if there are lots of file segments. - LOG.debug(inputFile.getParent().getFileName().toString() + File.separator + - inputFile.getFileName().toString() + ": " + cnt + " docs added."); - } catch (Exception e) { - LOG.error(Thread.currentThread().getName() + ": Unexpected Exception:", e); - } finally { - segment.close(); - } - } } - private final Args args; - private final Path collectionPath; - private final Class> generatorClass; - private final DocumentCollection collection; - private final Counters counters; - private final Path indexPath; - @SuppressWarnings("unchecked") public IndexHnswDenseVectors(Args args) throws Exception { - this.args = args; + super(args); - if (args.verbose) { - // If verbose logging enabled, changed default log level to DEBUG so we get per-thread logging messages. 
- Configurator.setRootLevel(Level.DEBUG); - LOG.info("Setting log level to " + Level.DEBUG); - } else if (args.quiet) { - // If quiet mode enabled, only report warnings and above. - Configurator.setRootLevel(Level.WARN); - } else { - // Otherwise, we get the standard set of log messages. - Configurator.setRootLevel(Level.INFO); - LOG.info("Setting log level to " + Level.INFO); - } + LOG.info("HnswIndexer settings:"); + LOG.info(" + Generator: " + args.generatorClass); + LOG.info(" + M: " + args.M); + LOG.info(" + efC: " + args.efC); + LOG.info(" + Store document vectors? " + args.storeVectors); - LOG.info("Starting indexer..."); - LOG.info("============ Loading Parameters ============"); - LOG.info("DocumentCollection path: " + args.input); - LOG.info("CollectionClass: " + args.collectionClass); - LOG.info("Generator: " + args.generatorClass); - LOG.info("Threads: " + args.threads); - LOG.info("Store document vectors? " + args.storeVectors); - LOG.info("Optimize (merge segments)? " + args.optimize); - LOG.info("Index path: " + args.index); - - this.indexPath = Paths.get(args.index); - if (!Files.exists(this.indexPath)) { - Files.createDirectories(this.indexPath); - } - - // Our documentation uses /path/to/foo as a convention: to make copy and paste of the commands work, we assume - // collections/ as the path location. - String pathStr = args.input; - if (pathStr.startsWith("/path/to")) { - pathStr = pathStr.replace("/path/to", "collections"); - } - this.collectionPath = Paths.get(pathStr); - if (!Files.exists(collectionPath) || !Files.isReadable(collectionPath) || !Files.isDirectory(collectionPath)) { - throw new RuntimeException("Invalid collection path " + collectionPath + "!"); + try { + super.generatorClass = (Class<LuceneDocumentGenerator<? extends SourceDocument>>) + Class.forName("io.anserini.index.generator." + args.generatorClass); + } catch (Exception e) { + throw new IllegalArgumentException(String.format("Unable to load generator class \"%s\".", args.generatorClass)); } - Class<? extends DocumentCollection<? extends SourceDocument>> collectionClass = (Class<? extends DocumentCollection<? extends SourceDocument>>) - Class.forName("io.anserini.collection." + args.collectionClass); - this.collection = collectionClass.getConstructor(Path.class).newInstance(collectionPath); + try { + final Directory dir = FSDirectory.open(Paths.get(args.index)); + final IndexWriterConfig config = new IndexWriterConfig().setCodec( + new Lucene95Codec() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return new DelegatingKnnVectorsFormat( + new Lucene95HnswVectorsFormat(args.M, args.efC), 4096); + } + }); + + config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); + config.setRAMBufferSizeMB(args.memoryBuffer); + config.setUseCompoundFile(false); + config.setMergeScheduler(new ConcurrentMergeScheduler()); - this.generatorClass = (Class<LuceneDocumentGenerator<? extends SourceDocument>>) - Class.forName("io.anserini.index.generator." + args.generatorClass); + if (args.optimize) { + // If we're going to merge down into a single segment at the end, skip intermediate merges, + // since they are a waste of time. 
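+ // Raising maxMergeAtOnce and segmentsPerTier to 256 effectively defers merging until the single forceMerge(1) in AbstractIndexer.run(); the value 256 is a heuristic, not a tuned constant.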
+ TieredMergePolicy mergePolicy = new TieredMergePolicy(); + mergePolicy.setMaxMergeAtOnce(256); + mergePolicy.setSegmentsPerTier(256); + config.setMergePolicy(mergePolicy); + } - this.counters = new Counters(); + this.writer = new IndexWriter(dir, config); + } catch (Exception e) { + throw new IllegalArgumentException(String.format("Unable to create IndexWriter: %s.", e.getMessage())); + } } // Solution provided by Solr, see https://www.mail-archive.com/java-user@lucene.apache.org/msg52149.html // This class exists because Lucene95HnswVectorsFormat's getMaxDimensions method is final and we // need to work around that constraint to allow more than the default number of dimensions. - private static final class OpenAiDelegatingKnnVectorsFormat extends KnnVectorsFormat { + private static final class DelegatingKnnVectorsFormat extends KnnVectorsFormat { private final KnnVectorsFormat delegate; private final int maxDimensions; - public OpenAiDelegatingKnnVectorsFormat(KnnVectorsFormat delegate, int maxDimensions) { + public DelegatingKnnVectorsFormat(KnnVectorsFormat delegate, int maxDimensions) { super(delegate.getName()); this.delegate = delegate; this.maxDimensions = maxDimensions; @@ -274,123 +137,32 @@ public int getMaxDimensions(String fieldName) { } } - public Counters run() throws IOException { - final long start = System.nanoTime(); - LOG.info("============ Indexing Collection ============"); + public static void main(String[] args) throws Exception { + Args indexArgs = new Args(); + CmdLineParser parser = new CmdLineParser(indexArgs, ParserProperties.defaults().withUsageWidth(120)); - final Directory dir = FSDirectory.open(indexPath); - final IndexWriterConfig config = new IndexWriterConfig().setCodec( - new Lucene95Codec() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new OpenAiDelegatingKnnVectorsFormat( - new Lucene95HnswVectorsFormat(args.M, args.efC), 4096); + try { + parser.parseArgument(args); + } catch (CmdLineException e) { + if (indexArgs.options) { + System.err.printf("Options for %s:\n\n", IndexHnswDenseVectors.class.getSimpleName()); + parser.printUsage(System.err); + + List<String> required = new ArrayList<>(); + parser.getOptions().forEach((option) -> { + if (option.option.required()) { + required.add(option.option.toString()); } }); - config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); - config.setRAMBufferSizeMB(args.memorybufferSize); - config.setUseCompoundFile(false); - config.setMergeScheduler(new ConcurrentMergeScheduler()); - - if (args.optimize) { - // If we're going to merge down into a single segment at the end, skip intermediate merges, - // since they are a waste of time. - TieredMergePolicy mergePolicy = new TieredMergePolicy(); - mergePolicy.setMaxMergeAtOnce(256); - mergePolicy.setSegmentsPerTier(256); - config.setMergePolicy(mergePolicy); - } - - IndexWriter writer = new IndexWriter(dir, config); - - final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(args.threads); - LOG.info("Thread pool with " + args.threads + " threads initialized."); - LOG.info("Initializing collection in " + collectionPath.toString()); - - List segmentPaths = collection.getSegmentPaths(); - final int segmentCnt = segmentPaths.size(); - - LOG.info(String.format("%,d %s found", segmentCnt, (segmentCnt == 1 ? 
"file" : "files" ))); - LOG.info("Starting to index..."); - - segmentPaths.forEach((segmentPath) -> executor.execute(new LocalIndexerThread(writer, collection, segmentPath))); - - executor.shutdown(); - - try { - // Wait for existing tasks to terminate - while (!executor.awaitTermination(1, TimeUnit.MINUTES)) { - if (segmentCnt == 1) { - LOG.info(String.format("%,d documents indexed", counters.indexed.get())); - } else { - LOG.info(String.format("%.2f%% of files completed, %,d documents indexed", - (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d, counters.indexed.get())); - } - } - } catch (InterruptedException ie) { - // (Re-)Cancel if current thread also interrupted - executor.shutdownNow(); - // Preserve interrupt status - Thread.currentThread().interrupt(); - } - - if (segmentCnt != executor.getCompletedTaskCount()) { - throw new RuntimeException("totalFiles = " + segmentCnt + - " is not equal to completedTaskCount = " + executor.getCompletedTaskCount()); - } - - long numIndexed = writer.getDocStats().maxDoc; - - // Do a final commit - try { - writer.commit(); - if (args.optimize) { - writer.forceMerge(1); - } - } finally { - try { - writer.close(); - } catch (IOException e) { - // It is possible that this happens... but nothing much we can do at this point, - // so just log the error and move on. - LOG.error(e); + System.err.printf("\nRequired options are %s\n", required); + } else { + System.err.printf("Error: %s. For help, use \"-options\" to print out information about options.\n", e.getMessage()); } - } - - if (numIndexed != counters.indexed.get()) { - LOG.warn("Unexpected difference between number of indexed documents and index maxDoc."); - } - LOG.info(String.format("Indexing Complete! %,d documents indexed", numIndexed)); - LOG.info("============ Final Counter Values ============"); - LOG.info(String.format("indexed: %,12d", counters.indexed.get())); - LOG.info(String.format("unindexable: %,12d", counters.unindexable.get())); - LOG.info(String.format("empty: %,12d", counters.empty.get())); - LOG.info(String.format("skipped: %,12d", counters.skipped.get())); - LOG.info(String.format("errors: %,12d", counters.errors.get())); - - final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); - LOG.info(String.format("Total %,d documents indexed in %s", numIndexed, - DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"))); - - return counters; - } - - public static void main(String[] args) throws Exception { - Args indexCollectionArgs = new Args(); - CmdLineParser parser = new CmdLineParser(indexCollectionArgs, ParserProperties.defaults().withUsageWidth(100)); - - try { - parser.parseArgument(args); - } catch (CmdLineException e) { - System.err.println(e.getMessage()); - parser.printUsage(System.err); - System.err.println("Example: " + IndexHnswDenseVectors.class.getSimpleName() + - parser.printExample(OptionHandlerFilter.REQUIRED)); return; } - new IndexHnswDenseVectors(indexCollectionArgs).run(); + new IndexHnswDenseVectors(indexArgs).run(); } } diff --git a/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java b/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java index 54ed6b560d..e3a0c8e810 100644 --- a/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java +++ b/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java @@ -18,18 +18,10 @@ import io.anserini.analysis.fw.FakeWordsEncoderAnalyzer; import io.anserini.analysis.lexlsh.LexicalLshAnalyzer; -import 
diff --git a/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java b/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java
index 54ed6b560d..e3a0c8e810 100644
--- a/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java
+++ b/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java
@@ -18,18 +18,10 @@
 import io.anserini.analysis.fw.FakeWordsEncoderAnalyzer;
 import io.anserini.analysis.lexlsh.LexicalLshAnalyzer;
-import io.anserini.collection.DocumentCollection;
-import io.anserini.collection.FileSegment;
 import io.anserini.collection.SourceDocument;
-import io.anserini.index.generator.EmptyDocumentException;
-import io.anserini.index.generator.InvalidDocumentException;
 import io.anserini.index.generator.LuceneDocumentGenerator;
-import io.anserini.index.generator.SkippedDocumentException;
-import org.apache.commons.lang3.time.DurationFormatUtils;
-import org.apache.logging.log4j.Level;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
-import org.apache.logging.log4j.core.config.Configurator;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
@@ -42,41 +34,24 @@
 import org.kohsuke.args4j.CmdLineException;
 import org.kohsuke.args4j.CmdLineParser;
 import org.kohsuke.args4j.Option;
-import org.kohsuke.args4j.OptionHandlerFilter;
 import org.kohsuke.args4j.ParserProperties;
 
-import java.io.File;
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
 import java.nio.file.Paths;
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.concurrent.Executors;
-import java.util.concurrent.ThreadPoolExecutor;
-import java.util.concurrent.TimeUnit;
 
-public final class IndexInvertedDenseVectors {
+public final class IndexInvertedDenseVectors extends AbstractIndexer {
   private static final Logger LOG = LogManager.getLogger(IndexInvertedDenseVectors.class);
 
   public static final String FW = "fw";
   public static final String LEXLSH = "lexlsh";
 
-  public static final class Args {
-    @Option(name = "-collection", metaVar = "[class]", required = true, usage = "Collection class in io.anserini.collection.")
-    public String collectionClass;
-
-    @Option(name = "-input", metaVar = "[path]", required = true, usage = "Input collection.")
-    public String input;
-
+  public static final class Args extends AbstractIndexer.Args {
     @Option(name = "-generator", metaVar = "[class]", usage = "Document generator class in io.anserini.index.generator.")
     public String generatorClass = "InvertedDenseVectorDocumentGenerator";
 
-    @Option(name = "-index", metaVar = "[path]", required = true, usage = "Index path.")
-    public String index;
-
     @Option(name = "-encoding", metaVar = "[word]", usage = "Encoding method: {'fw', 'lexlsh'}.")
     public String encoding = FW;
@@ -97,281 +72,75 @@ public static final class Args {
     @Option(name = "-lexlsh.b", metaVar = "[int]", usage = "LexLSH encoding: bucket count.")
     public int bucketCount = 300;
-
-    @Option(name = "-optimize", usage = "Optimizes index by merging into a single index segment.")
-    public boolean optimize = false;
-
-    @Option(name = "-memorybuffer", metaVar = "[mb]", usage = "Memory buffer size in MB.")
-    public int memorybufferSize = 4096;
-
-    @Option(name = "-threads", metaVar = "[num]", usage = "Number of indexing threads.")
-    public int threads = 4;
-
-    @Option(name = "-verbose", forbids = {"-quiet"}, usage = "Enables verbose logging for each indexing thread.")
-    public boolean verbose = false;
-
-    @Option(name = "-quiet", forbids = {"-verbose"}, usage = "Turns off all logging.")
-    public boolean quiet = false;
-  }
-
-  private final class LocalIndexerThread extends Thread {
-    final private Path inputFile;
-    final private IndexWriter writer;
-    final private DocumentCollection collection;
-
-    private LocalIndexerThread(IndexWriter writer, DocumentCollection collection, Path inputFile) {
-      this.writer = writer;
-      this.collection = collection;
-      this.inputFile = inputFile;
-      setName(inputFile.getFileName().toString());
-    }
-
-    @Override
-    public void run() {
-      FileSegment segment = null;
-
-      try {
-        @SuppressWarnings("unchecked")
-        LuceneDocumentGenerator generator = (LuceneDocumentGenerator)
-            generatorClass.getDeclaredConstructor(Args.class).newInstance(args);
-
-        // We keep track of two separate counts: the total count of documents in this file segment (cnt),
-        // and the number of documents in this current "batch" (batch). We update the global counter every
-        // 10k documents: this is so that we get intermediate updates, which is informative if a collection
-        // has only one file segment; see https://github.com/castorini/anserini/issues/683
-        int cnt = 0;
-        int batch = 0;
-
-        segment = collection.createFileSegment(inputFile);
-
-        for (SourceDocument d : segment) {
-          if (!d.indexable()) {
-            counters.unindexable.incrementAndGet();
-            continue;
-          }
-
-          try {
-            writer.addDocument(generator.createDocument(d));
-
-            cnt++;
-            batch++;
-          } catch (EmptyDocumentException e1) {
-            counters.empty.incrementAndGet();
-            continue;
-          } catch (SkippedDocumentException e2) {
-            counters.skipped.incrementAndGet();
-            continue;
-          } catch (InvalidDocumentException e3) {
-            counters.errors.incrementAndGet();
-            continue;
-          }
-
-          // Add the counts from this batch, reset batch counter.
-          if (batch % 10000 == 0) {
-            counters.indexed.addAndGet(batch);
-            batch = 0;
-          }
-        }
-
-        // Add the remaining documents.
-        counters.indexed.addAndGet(batch);
-
-        int skipped = segment.getSkippedCount();
-        if (skipped > 0) {
-          counters.skipped.addAndGet(skipped);
-          LOG.warn(inputFile.getParent().getFileName().toString() + File.separator +
-              inputFile.getFileName().toString() + ": " + skipped + " docs skipped.");
-        }
-
-        if (segment.getErrorStatus()) {
-          counters.errors.incrementAndGet();
-          LOG.error(inputFile.getParent().getFileName().toString() + File.separator +
-              inputFile.getFileName().toString() + ": error iterating through segment.");
-        }
-
-        // Log at the debug level because this can be quite noisy if there are lots of file segments.
-        LOG.debug(inputFile.getParent().getFileName().toString() + File.separator +
-            inputFile.getFileName().toString() + ": " + cnt + " docs added.");
-      } catch (Exception e) {
-        LOG.error(Thread.currentThread().getName() + ": Unexpected Exception:", e);
-      } finally {
-        segment.close();
-      }
-    }
-  }
-
-  private final Args args;
-  private final Path collectionPath;
-  private final Class<LuceneDocumentGenerator<? extends SourceDocument>> generatorClass;
-  private final DocumentCollection collection;
-  private final Counters counters;
-  private final Path indexPath;
-
   @SuppressWarnings("unchecked")
-  public IndexInvertedDenseVectors(Args args) throws Exception {
-    this.args = args;
-
-    if (args.verbose) {
-      // If verbose logging enabled, changed default log level to DEBUG so we get per-thread logging messages.
-      Configurator.setRootLevel(Level.DEBUG);
-      LOG.info("Setting log level to " + Level.DEBUG);
-    } else if (args.quiet) {
-      // If quiet mode enabled, only report warnings and above.
-      Configurator.setRootLevel(Level.WARN);
-    } else {
-      // Otherwise, we get the standard set of log messages.
-      Configurator.setRootLevel(Level.INFO);
-      LOG.info("Setting log level to " + Level.INFO);
-    }
+  public IndexInvertedDenseVectors(Args args) {
+    super(args);
 
-    LOG.info("Starting indexer...");
-    LOG.info("============ Loading Parameters ============");
-    LOG.info("Collection class: " + args.collectionClass);
-    LOG.info("Collection path: " + args.input);
-    LOG.info("Generator: " + args.generatorClass);
-    LOG.info("Index path: " + args.index);
-    LOG.info("Encoding: " + args.encoding);
-    LOG.info("Threads: " + args.threads);
-    LOG.info("Optimize? " + args.optimize);
+    LOG.info("InvertedDenseIndexer settings:");
+    LOG.info(" + Generator: " + args.generatorClass);
+    LOG.info(" + Encoding: " + args.encoding);
 
-    this.indexPath = Paths.get(args.index);
-    if (!Files.exists(this.indexPath)) {
-      Files.createDirectories(this.indexPath);
-    }
-
-    // Our documentation uses /path/to/foo as a convention: to make copy and paste of the commands work,
-    // we assume collections/ as the path location.
-    String pathStr = args.input;
-    if (pathStr.startsWith("/path/to")) {
-      pathStr = pathStr.replace("/path/to", "collections");
-    }
-    this.collectionPath = Paths.get(pathStr);
-    if (!Files.exists(collectionPath) || !Files.isReadable(collectionPath) || !Files.isDirectory(collectionPath)) {
-      throw new RuntimeException("Invalid collection path " + collectionPath + "!");
+    try {
+      super.generatorClass = (Class<LuceneDocumentGenerator<? extends SourceDocument>>)
+          Class.forName("io.anserini.index.generator." + args.generatorClass);
+    } catch (Exception e) {
+      throw new IllegalArgumentException(String.format("Unable to load generator class \"%s\".", args.generatorClass));
     }
 
-    Class<? extends DocumentCollection<?>> collectionClass = (Class<? extends DocumentCollection<?>>)
-        Class.forName("io.anserini.collection." + args.collectionClass);
-    this.collection = collectionClass.getConstructor(Path.class).newInstance(collectionPath);
-
-    this.generatorClass = (Class<LuceneDocumentGenerator<? extends SourceDocument>>)
-        Class.forName("io.anserini.index.generator." + args.generatorClass);
-
-    this.counters = new Counters();
-  }
-
-  public Counters run() throws IOException {
-    LOG.info("============ Indexing Collection ============");
-    final long start = System.nanoTime();
-
     Analyzer vectorAnalyzer;
     if (args.encoding.equalsIgnoreCase(FW)) {
       vectorAnalyzer = new FakeWordsEncoderAnalyzer(args.q);
     } else if (args.encoding.equalsIgnoreCase(LEXLSH)) {
       vectorAnalyzer = new LexicalLshAnalyzer(args.decimals, args.ngrams, args.hashCount, args.bucketCount, args.hashSetSize);
     } else {
-      throw new RuntimeException("Invalid encoding scheme!");
+      throw new IllegalArgumentException("Invalid encoding scheme.");
     }
 
     Map<String, Analyzer> map = new HashMap<>();
     map.put(Constants.VECTOR, vectorAnalyzer);
     Analyzer analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(), map);
 
-    final Directory dir = FSDirectory.open(indexPath);
-    final IndexWriterConfig config = new IndexWriterConfig(analyzer).setCodec(new Lucene95Codec());
-    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
-    config.setRAMBufferSizeMB(args.memorybufferSize);
-    config.setUseCompoundFile(false);
-    config.setMergeScheduler(new ConcurrentMergeScheduler());
-    IndexWriter writer = new IndexWriter(dir, config);
-
-    final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(args.threads);
-    LOG.info("Thread pool with " + args.threads + " threads initialized.");
-    LOG.info("Initializing collection in " + collectionPath);
-
-    List<Path> segmentPaths = collection.getSegmentPaths();
-    final int segmentCnt = segmentPaths.size();
-
-    LOG.info(String.format("%,d %s found", segmentCnt, (segmentCnt == 1 ? "file" : "files")));
-    LOG.info("Starting to index...");
-
-    segmentPaths.forEach((segmentPath) -> executor.execute(new LocalIndexerThread(writer, collection, segmentPath)));
-
-    executor.shutdown();
 
     try {
-      // Wait for existing tasks to terminate.
-      while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
-        if (segmentCnt == 1) {
-          LOG.info(String.format("%,d documents indexed", counters.indexed.get()));
-        } else {
-          LOG.info(String.format("%.2f%% of files completed, %,d documents indexed",
-              (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d, counters.indexed.get()));
-        }
-      }
-    } catch (InterruptedException ie) {
-      // (Re-)Cancel if current thread also interrupted.
-      executor.shutdownNow();
-      // Preserve interrupt status.
-      Thread.currentThread().interrupt();
+      final Directory dir = FSDirectory.open(Paths.get(args.index));
+      final IndexWriterConfig config = new IndexWriterConfig(analyzer).setCodec(new Lucene95Codec());
+      config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
+      config.setRAMBufferSizeMB(args.memoryBuffer);
+      config.setUseCompoundFile(false);
+      config.setMergeScheduler(new ConcurrentMergeScheduler());
+      this.writer = new IndexWriter(dir, config);
+    } catch (Exception e) {
+      throw new IllegalArgumentException(String.format("Unable to create IndexWriter: %s.", e.getMessage()));
     }
-
-    if (segmentCnt != executor.getCompletedTaskCount()) {
-      throw new RuntimeException("totalFiles = " + segmentCnt +
-          " is not equal to completedTaskCount = " + executor.getCompletedTaskCount());
-    }
-
-    long numIndexed = writer.getDocStats().maxDoc;
-
-    // Do a final commit.
-    try {
-      writer.commit();
-      if (args.optimize) {
-        writer.forceMerge(1);
-      }
-    } finally {
-      try {
-        writer.close();
-      } catch (IOException e) {
-        // It is possible that this happens... but nothing much we can do at this point,
-        // so just log the error and move on.
-        LOG.error(e);
-      }
-    }
-
-    if (numIndexed != counters.indexed.get()) {
-      LOG.warn("Unexpected difference between number of indexed documents and index maxDoc.");
-    }
-
-    LOG.info(String.format("Indexing Complete! %,d documents indexed", numIndexed));
-    LOG.info("============ Final Counter Values ============");
-    LOG.info(String.format("indexed:     %,12d", counters.indexed.get()));
-    LOG.info(String.format("unindexable: %,12d", counters.unindexable.get()));
-    LOG.info(String.format("empty:       %,12d", counters.empty.get()));
-    LOG.info(String.format("skipped:     %,12d", counters.skipped.get()));
-    LOG.info(String.format("errors:      %,12d", counters.errors.get()));
-
-    final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
-    LOG.info(String.format("Total %,d documents indexed in %s", numIndexed,
-        DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")));
-
-    return counters;
   }
 
   public static void main(String[] args) throws Exception {
-    Args indexCollectionArgs = new Args();
-    CmdLineParser parser = new CmdLineParser(indexCollectionArgs, ParserProperties.defaults().withUsageWidth(100));
+    Args indexArgs = new Args();
+    CmdLineParser parser = new CmdLineParser(indexArgs, ParserProperties.defaults().withUsageWidth(120));
 
     try {
       parser.parseArgument(args);
     } catch (CmdLineException e) {
-      System.err.println(e.getMessage());
-      parser.printUsage(System.err);
-      System.err.println("Example: " + IndexInvertedDenseVectors.class.getSimpleName() +
-          parser.printExample(OptionHandlerFilter.REQUIRED));
+      if (indexArgs.options) {
+        System.err.printf("Options for %s:\n\n", IndexInvertedDenseVectors.class.getSimpleName());
+        parser.printUsage(System.err);
+
+        List<String> required = new ArrayList<>();
+        parser.getOptions().forEach((option) -> {
+          if (option.option.required()) {
+            required.add(option.option.toString());
+          }
+        });
+
+        System.err.printf("\nRequired options are %s\n", required);
+      } else {
+        System.err.printf("Error: %s. For help, use \"-options\" to print out information about options.\n", e.getMessage());
+      }
+
       return;
     }
 
-    new IndexInvertedDenseVectors(indexCollectionArgs).run();
+    new IndexInvertedDenseVectors(indexArgs).run();
   }
-}
\ No newline at end of file
+}
diff --git a/src/main/java/io/anserini/index/SimpleIndexer.java b/src/main/java/io/anserini/index/SimpleIndexer.java
index a4e78632f7..d3a6302afc 100644
--- a/src/main/java/io/anserini/index/SimpleIndexer.java
+++ b/src/main/java/io/anserini/index/SimpleIndexer.java
@@ -20,12 +20,10 @@
 import io.anserini.analysis.AnalyzerMap;
 import io.anserini.analysis.DefaultEnglishAnalyzer;
 import io.anserini.analysis.HuggingFaceTokenizerAnalyzer;
-import io.anserini.collection.FileSegment;
 import io.anserini.collection.JsonCollection;
 import io.anserini.index.IndexCollection.Args;
 import io.anserini.index.generator.GeneratorException;
 import io.anserini.index.generator.LuceneDocumentGenerator;
-import org.apache.commons.lang3.time.DurationFormatUtils;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.apache.lucene.analysis.Analyzer;
@@ -37,8 +35,6 @@
 import org.apache.lucene.store.FSDirectory;
 import org.kohsuke.args4j.CmdLineException;
 import org.kohsuke.args4j.CmdLineParser;
-import org.kohsuke.args4j.OptionHandlerFilter;
-import org.kohsuke.args4j.ParserProperties;
 
 import java.io.IOException;
 import java.nio.file.Files;
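The inverted-dense indexer above routes only the vector field through the encoding analyzer while every other field stays on `StandardAnalyzer`. A compact sketch of that per-field wiring (the field name and the stand-in analyzer below are illustrative, not Anserini's actual declarations):

```java
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

final class PerFieldWiring {
  static Analyzer build() {
    // Stand-in for FakeWordsEncoderAnalyzer / LexicalLshAnalyzer: any analyzer that
    // tokenizes the pre-encoded vector surrogate text.
    Analyzer vectorAnalyzer = new WhitespaceAnalyzer();

    // All fields default to StandardAnalyzer; only "vector" uses the special analyzer.
    return new PerFieldAnalyzerWrapper(new StandardAnalyzer(), Map.of("vector", vectorAnalyzer));
  }
}
```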
diff --git a/src/main/java/io/anserini/index/generator/HnswDenseVectorDocumentGenerator.java b/src/main/java/io/anserini/index/generator/HnswDenseVectorDocumentGenerator.java
index 958425361c..86ccde63d0 100644
--- a/src/main/java/io/anserini/index/generator/HnswDenseVectorDocumentGenerator.java
+++ b/src/main/java/io/anserini/index/generator/HnswDenseVectorDocumentGenerator.java
@@ -40,18 +40,7 @@
  * @param <T> type of the source document
  */
 public class HnswDenseVectorDocumentGenerator<T extends SourceDocument> implements LuceneDocumentGenerator<T> {
-  protected IndexHnswDenseVectors.Args args;
-
-  protected HnswDenseVectorDocumentGenerator() {
-  }
-
-  /**
-   * Constructor with config and counters
-   *
-   * @param args configuration arguments
-   */
-  public HnswDenseVectorDocumentGenerator(IndexHnswDenseVectors.Args args) {
-    this.args = args;
+  public HnswDenseVectorDocumentGenerator() {
   }
 
   private float[] convertJsonArray(String vectorString) throws JsonProcessingException {
@@ -87,9 +76,7 @@ public Document createDocument(T src) throws InvalidDocumentException {
     document.add(new BinaryDocValuesField(Constants.ID, new BytesRef(id)));
     document.add(new KnnFloatVectorField(Constants.VECTOR, contents, VectorSimilarityFunction.DOT_PRODUCT));
 
-    if (args.storeVectors) {
-      document.add(new StoredField(Constants.RAW, src.raw()));
-    }
+
     return document;
   }
 }
diff --git a/src/main/java/io/anserini/index/generator/InvertedDenseVectorDocumentGenerator.java b/src/main/java/io/anserini/index/generator/InvertedDenseVectorDocumentGenerator.java
index a6ab22a0f4..487f296262 100644
--- a/src/main/java/io/anserini/index/generator/InvertedDenseVectorDocumentGenerator.java
+++ b/src/main/java/io/anserini/index/generator/InvertedDenseVectorDocumentGenerator.java
@@ -37,18 +37,7 @@
  * @param <T> type of the source document
  */
 public class InvertedDenseVectorDocumentGenerator<T extends SourceDocument> implements LuceneDocumentGenerator<T> {
-  protected IndexInvertedDenseVectors.Args args;
-
-  protected InvertedDenseVectorDocumentGenerator() {
-  }
-
-  /**
-   * Constructor with config and counters
-   *
-   * @param args configuration arguments
-   */
-  public InvertedDenseVectorDocumentGenerator(IndexInvertedDenseVectors.Args args) {
-    this.args = args;
+  public InvertedDenseVectorDocumentGenerator() {
   }
 
   private float[] convertJsonArray(String vectorString) throws JsonProcessingException {
diff --git a/src/main/java/io/anserini/search/SearchHnswDenseVectors.java b/src/main/java/io/anserini/search/SearchHnswDenseVectors.java
index f2b599e156..dd54d5c566 100644
--- a/src/main/java/io/anserini/search/SearchHnswDenseVectors.java
+++ b/src/main/java/io/anserini/search/SearchHnswDenseVectors.java
@@ -36,7 +36,6 @@
 import org.kohsuke.args4j.CmdLineException;
 import org.kohsuke.args4j.CmdLineParser;
 import org.kohsuke.args4j.Option;
-import org.kohsuke.args4j.OptionHandlerFilter;
 import org.kohsuke.args4j.ParserProperties;
 import org.kohsuke.args4j.spi.StringArrayOptionHandler;
 
@@ -235,7 +234,6 @@ public void close() throws IOException {
     reader.close();
   }
 
-  @SuppressWarnings("unchecked")
   @Override
   public void run() {
     LOG.info("============ Launching Search Threads ============");
@@ -343,20 +341,13 @@ public static void main(String[] args) throws Exception {
       return;
     }
 
-    final long start = System.nanoTime();
-
     // We're at top-level already inside a main; makes no sense to propagate exceptions further, so reformat the
     // exception messages and display on console.
-    try {
-      SearchHnswDenseVectors searcher = new SearchHnswDenseVectors(searchArgs);
+    try (SearchHnswDenseVectors searcher = new SearchHnswDenseVectors(searchArgs)) {
       searcher.run();
-      searcher.close();
     } catch (IllegalArgumentException e) {
       System.err.printf("Error: %s\n", e.getMessage());
       return;
     }
-
-    final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
-    LOG.info("Total run time: " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
   }
 }
diff --git a/src/main/java/io/anserini/search/SearchInvertedDenseVectors.java b/src/main/java/io/anserini/search/SearchInvertedDenseVectors.java
index 1e696eaf22..0a70e36c14 100644
--- a/src/main/java/io/anserini/search/SearchInvertedDenseVectors.java
+++ b/src/main/java/io/anserini/search/SearchInvertedDenseVectors.java
@@ -327,20 +327,13 @@ public static void main(String[] args) throws Exception {
       return;
     }
 
-    final long start = System.nanoTime();
-
     // We're at top-level already inside a main; makes no sense to propagate exceptions further, so reformat the
     // exception messages and display on console.
-    try {
-      SearchInvertedDenseVectors searcher = new SearchInvertedDenseVectors(searchArgs);
+    try (SearchInvertedDenseVectors searcher = new SearchInvertedDenseVectors(searchArgs)) {
       searcher.run();
-      searcher.close();
     } catch (IllegalArgumentException e) {
       System.err.printf("Error: %s\n", e.getMessage());
       return;
     }
-
-    final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
-    LOG.info("Total run time: " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
   }
 }
\ No newline at end of file
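Both searcher mains above swap an explicit `close()` for try-with-resources, which guarantees cleanup even when `run()` throws. The shape of that refactoring, reduced to essentials (class and method names here are illustrative stand-ins):

```java
import java.io.Closeable;
import java.io.IOException;

final class VectorSearcher implements Closeable {
  void run() { /* issue queries against the index */ }

  @Override
  public void close() throws IOException { /* release the underlying IndexReader */ }
}

final class SearcherDemo {
  public static void main(String[] args) throws IOException {
    // close() now runs on normal exit and on exceptions alike; the old pattern,
    // run() followed by an explicit close(), leaked the reader when run() threw.
    try (VectorSearcher searcher = new VectorSearcher()) {
      searcher.run();
    } catch (IllegalArgumentException e) {
      System.err.printf("Error: %s%n", e.getMessage());
    }
  }
}
```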
diff --git a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-onnx.yaml b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-onnx.yaml
index 416e8efc63..31966a5457 100644
--- a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-onnx.yaml
+++ b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-onnx.yaml
@@ -10,7 +10,7 @@ index_type: hnsw
 collection_class: JsonDenseVectorCollection
 generator_class: HnswDenseVectorDocumentGenerator
 index_threads: 16
-index_options: -M 16 -efC 100
+index_options: -M 16 -efC 100 -memoryBuffer 65536
 metrics:
   - metric: AP@1000
diff --git a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw.yaml b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw.yaml
index 67abbbf149..8b38c5c1b2 100644
--- a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw.yaml
+++ b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw.yaml
@@ -10,7 +10,7 @@ index_type: hnsw
 collection_class: JsonDenseVectorCollection
 generator_class: HnswDenseVectorDocumentGenerator
 index_threads: 16
-index_options: -M 16 -efC 100
+index_options: -M 16 -efC 100 -memoryBuffer 65536
 metrics:
   - metric: AP@1000
diff --git a/src/main/resources/regression/dl19-passage-openai-ada2.yaml b/src/main/resources/regression/dl19-passage-openai-ada2.yaml
index ea0701a5d0..9667d0907c 100644
--- a/src/main/resources/regression/dl19-passage-openai-ada2.yaml
+++ b/src/main/resources/regression/dl19-passage-openai-ada2.yaml
@@ -10,7 +10,7 @@ index_type: hnsw
 collection_class: JsonDenseVectorCollection
 generator_class: HnswDenseVectorDocumentGenerator
 index_threads: 16
-index_options: -M 16 -efC 100 -memorybuffer 65536
+index_options: -M 16 -efC 100 -memoryBuffer 65536
 metrics:
   - metric: AP@1000
diff --git a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-onnx.yaml b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-onnx.yaml
index 15daab5abc..d2e0f89991 100644
--- a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-onnx.yaml
+++ b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-onnx.yaml
@@ -10,7 +10,7 @@ index_type: hnsw
 collection_class: JsonDenseVectorCollection
 generator_class: HnswDenseVectorDocumentGenerator
 index_threads: 16
-index_options: -M 16 -efC 100
+index_options: -M 16 -efC 100 -memoryBuffer 65536
 metrics:
   - metric: AP@1000
diff --git a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw.yaml b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw.yaml
index ba016059d6..f120eadd61 100644
--- a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw.yaml
+++ b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw.yaml
@@ -10,7 +10,7 @@ index_type: hnsw
 collection_class: JsonDenseVectorCollection
 generator_class: HnswDenseVectorDocumentGenerator
 index_threads: 16
-index_options: -M 16 -efC 100
+index_options: -M 16 -efC 100 -memoryBuffer 65536
 metrics:
   - metric: AP@1000
diff --git a/src/main/resources/regression/dl20-passage-openai-ada2.yaml b/src/main/resources/regression/dl20-passage-openai-ada2.yaml
index e151b4f91b..152d18765c 100644
--- a/src/main/resources/regression/dl20-passage-openai-ada2.yaml
+++ b/src/main/resources/regression/dl20-passage-openai-ada2.yaml
@@ -10,7 +10,7 @@ index_type: hnsw
 collection_class: JsonDenseVectorCollection
 generator_class: HnswDenseVectorDocumentGenerator
 index_threads: 16
-index_options: -M 16 -efC 100 -memorybuffer 65536
+index_options: -M 16 -efC 100 -memoryBuffer 65536
 metrics:
   - metric: AP@1000
diff --git a/src/main/resources/regression/mb11.yaml b/src/main/resources/regression/mb11.yaml
index 4a1a15e1f2..84db5dd7e7 100644
--- a/src/main/resources/regression/mb11.yaml
+++ b/src/main/resources/regression/mb11.yaml
@@ -6,7 +6,7 @@ index_path: indexes/lucene-index.mb11/
 collection_class: TweetCollection
 generator_class: TweetGenerator
 index_threads: 44
-index_options: -storePositions -storeDocvectors -storeRaw -uniqueDocid -tweet.keepUrls -tweet.stemming
+index_options: -storePositions -storeDocvectors -storeRaw -tweet.keepUrls -tweet.stemming
 index_stats:
   documents: 14950477
   documents (non-empty): 14950449
diff --git a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml
index b333652350..4746f389f3 100644
--- a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml
+++ b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml
@@ -10,7 +10,7 @@ index_type: hnsw
 collection_class: JsonDenseVectorCollection
 generator_class: HnswDenseVectorDocumentGenerator
 index_threads: 16
-index_options: -M 16 -efC 100
+index_options: -M 16 -efC 100 -memoryBuffer 65536
 metrics:
   - metric: AP@1000
diff --git a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw.yaml b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw.yaml
index 65ffa3502d..c63c3b7972 100644
--- a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw.yaml
+++ b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw.yaml
@@ -10,7 +10,7 @@ index_type: hnsw
 collection_class: JsonDenseVectorCollection
 generator_class: HnswDenseVectorDocumentGenerator
 index_threads: 16
-index_options: -M 16 -efC 100
+index_options: -M 16 -efC 100 -memoryBuffer 65536
 metrics:
   - metric: AP@1000
diff --git a/src/main/resources/regression/msmarco-passage-openai-ada2.yaml b/src/main/resources/regression/msmarco-passage-openai-ada2.yaml
index 16d1cd8b5a..08289ce2c0 100644
--- a/src/main/resources/regression/msmarco-passage-openai-ada2.yaml
+++ b/src/main/resources/regression/msmarco-passage-openai-ada2.yaml
@@ -10,7 +10,7 @@ index_type: hnsw
 collection_class: JsonDenseVectorCollection
 generator_class: HnswDenseVectorDocumentGenerator
 index_threads: 16
-index_options: -M 16 -efC 100 -memorybuffer 65536
+index_options: -M 16 -efC 100 -memoryBuffer 65536
 metrics:
   - metric: AP@1000
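All regression configs now spell the option `-memoryBuffer` (camelCase), replacing the inconsistent `-memorybuffer` and adding it where it was missing. A sketch of how such a flag plausibly plugs into the writer configuration; the class and field below are illustrative, not the actual Anserini declaration:

```java
import org.apache.lucene.index.IndexWriterConfig;
import org.kohsuke.args4j.Option;

final class BufferArgs {
  @Option(name = "-memoryBuffer", metaVar = "[mb]", usage = "Memory buffer size in MB.")
  public int memoryBuffer = 4096;
}

final class BufferWiring {
  static IndexWriterConfig configure(BufferArgs args) {
    // A larger RAM buffer means fewer segment flushes; the dense-vector regressions
    // above pass 65536 MB, trading memory for fewer, larger HNSW segments.
    return new IndexWriterConfig().setRAMBufferSizeMB(args.memoryBuffer);
  }
}
```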
"target/idx-sample-trec-index" + System.currentTimeMillis(), + "-generator", "DefaultLuceneDocumentGenerator", + }; + + IndexCollection.main(indexArgs); + } + + @Test(expected = IllegalArgumentException.class) + public void testInvalidGenerator() throws Exception { + String[] indexArgs = new String[] { + "-collection", "TrecCollection", + "-input", "src/test/resources/sample_docs/trec/collection2", + "-index", "target/idx-sample-trec-index" + System.currentTimeMillis(), + "-generator", "FakeDefaultLuceneDocumentGenerator", + }; + + IndexCollection.main(indexArgs); + } + + @Test + public void testDefaultGenerator() throws Exception { + String[] indexArgs = new String[] { + "-collection", "TrecCollection", + "-input", "src/test/resources/sample_docs/trec/collection2", + "-index", "target/idx-sample-trec-index" + System.currentTimeMillis() + }; + + IndexCollection.main(indexArgs); + // If this succeeded, then the default -generator of DefaultLuceneDocumentGenerator must have worked. + } +} \ No newline at end of file diff --git a/src/test/java/io/anserini/index/IndexHnswDenseVectorsTest.java b/src/test/java/io/anserini/index/IndexHnswDenseVectorsTest.java index 35ea033dfc..bade9e0366 100644 --- a/src/test/java/io/anserini/index/IndexHnswDenseVectorsTest.java +++ b/src/test/java/io/anserini/index/IndexHnswDenseVectorsTest.java @@ -49,6 +49,7 @@ private void restoreStderr() { @BeforeClass public static void setupClass() { + Configurator.setLevel(AbstractIndexer.class.getName(), Level.ERROR); Configurator.setLevel(IndexHnswDenseVectors.class.getName(), Level.ERROR); } @@ -58,12 +59,23 @@ public void testEmptyInvocation() throws Exception { String[] indexArgs = new String[] {}; IndexHnswDenseVectors.main(indexArgs); - assertTrue(err.toString().contains("Example: IndexHnswDenseVectors")); + assertTrue(err.toString().contains("Error")); + assertTrue(err.toString().contains("is required")); restoreStderr(); } - @Test(expected = ClassNotFoundException.class) + @Test + public void testAskForHelp() throws Exception { + redirectStderr(); + + IndexHnswDenseVectors.main(new String[] {"-options"}); + assertTrue(err.toString().contains("Options for")); + + restoreStderr(); + } + + @Test(expected = IllegalArgumentException.class) public void testInvalidCollection() throws Exception { String[] indexArgs = new String[] { "-collection", "FakeJsonDenseVectorCollection", @@ -77,7 +89,7 @@ public void testInvalidCollection() throws Exception { IndexHnswDenseVectors.main(indexArgs); } - @Test(expected = RuntimeException.class) + @Test(expected = IllegalArgumentException.class) public void testCollectionPath() throws Exception { String[] indexArgs = new String[] { "-collection", "JsonDenseVectorCollection", @@ -91,7 +103,7 @@ public void testCollectionPath() throws Exception { IndexHnswDenseVectors.main(indexArgs); } - @Test(expected = ClassNotFoundException.class) + @Test(expected = IllegalArgumentException.class) public void testInvalidGenerator() throws Exception { String[] indexArgs = new String[] { "-collection", "JsonDenseVectorCollection", @@ -105,6 +117,20 @@ public void testInvalidGenerator() throws Exception { IndexHnswDenseVectors.main(indexArgs); } + @Test + public void testDefaultGenerator() throws Exception { + String[] indexArgs = new String[] { + "-collection", "JsonDenseVectorCollection", + "-input", "src/test/resources/sample_docs/openai_ada2/json_vector", + "-index", "target/idx-sample-hnsw" + System.currentTimeMillis(), + "-threads", "1", + "-M", "16", "-efC", "100" + }; + + 
IndexHnswDenseVectors.main(indexArgs); + // If this succeeded, then the default -generator of HnswDenseVectorDocumentGenerator must have worked. + } + @Test public void test1() throws Exception { String indexPath = "target/idx-sample-hnsw" + System.currentTimeMillis(); diff --git a/src/test/java/io/anserini/index/IndexInvertedDenseVectorsTest.java b/src/test/java/io/anserini/index/IndexInvertedDenseVectorsTest.java index b210c44074..2fda6f6cd6 100644 --- a/src/test/java/io/anserini/index/IndexInvertedDenseVectorsTest.java +++ b/src/test/java/io/anserini/index/IndexInvertedDenseVectorsTest.java @@ -49,6 +49,7 @@ private void restoreStderr() { @BeforeClass public static void setupClass() { + Configurator.setLevel(AbstractIndexer.class.getName(), Level.ERROR); Configurator.setLevel(IndexInvertedDenseVectors.class.getName(), Level.ERROR); } @@ -58,12 +59,23 @@ public void testEmptyInvocation() throws Exception { String[] indexArgs = new String[] {}; IndexInvertedDenseVectors.main(indexArgs); - assertTrue(err.toString().contains("Example: IndexInvertedDenseVectors")); + assertTrue(err.toString().contains("Error")); + assertTrue(err.toString().contains("is required")); restoreStderr(); } - @Test(expected = ClassNotFoundException.class) + @Test + public void testAskForHelp() throws Exception { + redirectStderr(); + + IndexInvertedDenseVectors.main(new String[] {"-options"}); + assertTrue(err.toString().contains("Options for")); + + restoreStderr(); + } + + @Test(expected = IllegalArgumentException.class) public void testInvalidCollection() throws Exception { String[] indexArgs = new String[] { "-collection", "FakeCollection", @@ -76,7 +88,7 @@ public void testInvalidCollection() throws Exception { IndexInvertedDenseVectors.main(indexArgs); } - @Test(expected = RuntimeException.class) + @Test(expected = IllegalArgumentException.class) public void testCollectionPath() throws Exception { String[] indexArgs = new String[] { "-collection", "JsonDenseVectorCollection", @@ -89,7 +101,7 @@ public void testCollectionPath() throws Exception { IndexInvertedDenseVectors.main(indexArgs); } - @Test(expected = ClassNotFoundException.class) + @Test(expected = IllegalArgumentException.class) public void testInvalidGenerator() throws Exception { String[] indexArgs = new String[] { "-collection", "JsonDenseVectorCollection", @@ -102,7 +114,20 @@ public void testInvalidGenerator() throws Exception { IndexInvertedDenseVectors.main(indexArgs); } - @Test(expected = RuntimeException.class) + @Test + public void testDefaultGenerator() throws Exception { + String[] indexArgs = new String[] { + "-collection", "JsonDenseVectorCollection", + "-input", "src/test/resources/sample_docs/openai_ada2/json_vector", + "-index", "target/idx-sample-ll-vector" + System.currentTimeMillis(), + "-encoding", "lexlsh" + }; + + IndexInvertedDenseVectors.main(indexArgs); + // If this succeeded, then the default -generator of InvertedDenseVectorDocumentGenerator must have worked. 
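The invocation tests above all follow the same stderr-capture idiom; distilled to its essentials (the printed message stands in for an actual CLI call such as `SomeTool.main(new String[] {})`):

```java
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;

final class StderrCaptureDemo {
  public static void main(String[] args) {
    PrintStream saved = System.err;
    ByteArrayOutputStream err = new ByteArrayOutputStream();
    System.setErr(new PrintStream(err));  // swap in a buffer before invoking the CLI
    try {
      System.err.println("Error: option \"-input\" is required");  // stand-in for the tool's usage error
      if (!err.toString().contains("is required")) {
        throw new AssertionError("expected usage error on stderr");
      }
    } finally {
      System.setErr(saved);  // always restore, or later tests lose their stderr
    }
  }
}
```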
diff --git a/src/test/java/io/anserini/index/IndexInvertedDenseVectorsTest.java b/src/test/java/io/anserini/index/IndexInvertedDenseVectorsTest.java
index b210c44074..2fda6f6cd6 100644
--- a/src/test/java/io/anserini/index/IndexInvertedDenseVectorsTest.java
+++ b/src/test/java/io/anserini/index/IndexInvertedDenseVectorsTest.java
@@ -49,6 +49,7 @@ private void restoreStderr() {
 
   @BeforeClass
   public static void setupClass() {
+    Configurator.setLevel(AbstractIndexer.class.getName(), Level.ERROR);
     Configurator.setLevel(IndexInvertedDenseVectors.class.getName(), Level.ERROR);
   }
 
@@ -58,12 +59,23 @@ public void testEmptyInvocation() throws Exception {
     String[] indexArgs = new String[] {};
 
     IndexInvertedDenseVectors.main(indexArgs);
-    assertTrue(err.toString().contains("Example: IndexInvertedDenseVectors"));
+    assertTrue(err.toString().contains("Error"));
+    assertTrue(err.toString().contains("is required"));
 
     restoreStderr();
   }
 
-  @Test(expected = ClassNotFoundException.class)
+  @Test
+  public void testAskForHelp() throws Exception {
+    redirectStderr();
+
+    IndexInvertedDenseVectors.main(new String[] {"-options"});
+    assertTrue(err.toString().contains("Options for"));
+
+    restoreStderr();
+  }
+
+  @Test(expected = IllegalArgumentException.class)
   public void testInvalidCollection() throws Exception {
     String[] indexArgs = new String[] {
         "-collection", "FakeCollection",
@@ -76,7 +88,7 @@ public void testInvalidCollection() throws Exception {
     IndexInvertedDenseVectors.main(indexArgs);
   }
 
-  @Test(expected = RuntimeException.class)
+  @Test(expected = IllegalArgumentException.class)
   public void testCollectionPath() throws Exception {
     String[] indexArgs = new String[] {
         "-collection", "JsonDenseVectorCollection",
@@ -89,7 +101,7 @@ public void testCollectionPath() throws Exception {
     IndexInvertedDenseVectors.main(indexArgs);
   }
 
-  @Test(expected = ClassNotFoundException.class)
+  @Test(expected = IllegalArgumentException.class)
   public void testInvalidGenerator() throws Exception {
     String[] indexArgs = new String[] {
         "-collection", "JsonDenseVectorCollection",
@@ -102,7 +114,20 @@ public void testInvalidGenerator() throws Exception {
     IndexInvertedDenseVectors.main(indexArgs);
   }
 
-  @Test(expected = RuntimeException.class)
+  @Test
+  public void testDefaultGenerator() throws Exception {
+    String[] indexArgs = new String[] {
+        "-collection", "JsonDenseVectorCollection",
+        "-input", "src/test/resources/sample_docs/openai_ada2/json_vector",
+        "-index", "target/idx-sample-ll-vector" + System.currentTimeMillis(),
+        "-encoding", "lexlsh"
+    };
+
+    IndexInvertedDenseVectors.main(indexArgs);
+    // If this succeeded, then the default -generator of InvertedDenseVectorDocumentGenerator must have worked.
+  }
+
+  @Test(expected = IllegalArgumentException.class)
   public void testInvalidEncoding() throws Exception {
     String[] indexArgs = new String[] {
         "-collection", "JsonDenseVectorCollection",
diff --git a/src/test/java/io/anserini/integration/TweetEndToEndTest.java b/src/test/java/io/anserini/integration/TweetEndToEndTest.java
index cfcd30f750..0a47ae35ab 100644
--- a/src/test/java/io/anserini/integration/TweetEndToEndTest.java
+++ b/src/test/java/io/anserini/integration/TweetEndToEndTest.java
@@ -17,6 +17,7 @@
 package io.anserini.integration;
 
 import io.anserini.collection.TweetCollection;
+import io.anserini.index.AbstractIndexer;
 import io.anserini.index.IndexCollection;
 import io.anserini.index.generator.TweetGenerator;
 import org.apache.logging.log4j.Level;
@@ -28,6 +29,7 @@ public class TweetEndToEndTest extends EndToEndTest {
   @BeforeClass
   public static void setupClass() {
+    Configurator.setLevel(AbstractIndexer.class.getName(), Level.ERROR);
     Configurator.setLevel(IndexCollection.class.getName(), Level.ERROR);
   }
 
diff --git a/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java b/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java
index d2f72b65b6..2f9bb98032 100644
--- a/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java
+++ b/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java
@@ -17,6 +17,7 @@
 package io.anserini.search;
 
 import io.anserini.TestUtils;
+import io.anserini.index.AbstractIndexer;
 import io.anserini.index.IndexHnswDenseVectors;
 import org.apache.logging.log4j.Level;
 import org.apache.logging.log4j.core.config.Configurator;
@@ -49,6 +50,7 @@ private void restoreStderr() {
 
   @BeforeClass
   public static void setupClass() {
+    Configurator.setLevel(AbstractIndexer.class.getName(), Level.ERROR);
     Configurator.setLevel(IndexHnswDenseVectors.class.getName(), Level.ERROR);
     Configurator.setLevel(SearchHnswDenseVectors.class.getName(), Level.ERROR);
   }
 
diff --git a/src/test/java/io/anserini/search/SearchInvertedDenseVectorsTest.java b/src/test/java/io/anserini/search/SearchInvertedDenseVectorsTest.java
index e6c514ee16..7189a8c6f6 100644
--- a/src/test/java/io/anserini/search/SearchInvertedDenseVectorsTest.java
+++ b/src/test/java/io/anserini/search/SearchInvertedDenseVectorsTest.java
@@ -17,6 +17,7 @@
 package io.anserini.search;
 
 import io.anserini.TestUtils;
+import io.anserini.index.AbstractIndexer;
 import io.anserini.index.IndexInvertedDenseVectors;
 import org.apache.logging.log4j.Level;
 import org.apache.logging.log4j.core.config.Configurator;
@@ -49,6 +50,7 @@ private void restoreStderr() {
 
   @BeforeClass
   public static void setupClass() {
+    Configurator.setLevel(AbstractIndexer.class.getName(), Level.ERROR);
    Configurator.setLevel(IndexInvertedDenseVectors.class.getName(), Level.ERROR);
     Configurator.setLevel(SearchInvertedDenseVectors.class.getName(), Level.ERROR);
   }
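A recurring detail in the test diffs above: because shared progress logging moved into `AbstractIndexer`, silencing only the concrete indexer's logger no longer quiets test output; Log4j2 levels are per logger name, so each class's logger must be set explicitly. A small helper sketch of the pattern (hypothetical; the tests inline these calls):

```java
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.core.config.Configurator;

final class QuietLogs {
  // Setting a subclass's logger does not affect its base class's logger,
  // so every class that logs must be silenced by name.
  static void silence(Class<?>... classes) {
    for (Class<?> clazz : classes) {
      Configurator.setLevel(clazz.getName(), Level.ERROR);
    }
  }
}
```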