diff --git a/src/main/perf/SearchPerfTest.java b/src/main/perf/SearchPerfTest.java index fd8310412..3de21be4c 100755 --- a/src/main/perf/SearchPerfTest.java +++ b/src/main/perf/SearchPerfTest.java @@ -236,8 +236,13 @@ private static void _main(String[] clArgs) throws Exception { final int topN = args.getInt("-topN"); final boolean doStoredLoads = args.getFlag("-loadStoredFields"); final boolean exitable = args.getFlag("-exitable"); + final boolean pollute = args.getFlag("-pollute"); final TestContext testContext = TestContext.parse(args.getString("-context", "")); + if (pollute) { + TypePolluter.pollute(); + } + if (searchConcurrency == -1) { searchConcurrency = Runtime.getRuntime().availableProcessors(); } diff --git a/src/main/perf/TypePolluter.java b/src/main/perf/TypePolluter.java new file mode 100644 index 000000000..d698b2227 --- /dev/null +++ b/src/main/perf/TypePolluter.java @@ -0,0 +1,196 @@ +package perf; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.ExitableDirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.QueryTimeoutImpl; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FieldExistsQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.KnnFloatVectorQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.similarities.BooleanSimilarity; +import org.apache.lucene.search.similarities.ClassicSimilarity; +import org.apache.lucene.store.ByteBuffersDirectory; +import org.apache.lucene.store.Directory; + +/** + * This helper tries to pollute a bit the types that are typically seen by queries at call sites to + * help better simulate production systems that may + *
+ * exercise a wider mix of concrete implementations than the benchmark alone would.
+ * This matters because polymorphic call sites are much more expensive than bimorphic call sites,
+ * and bimorphic call sites may be noticeably more expensive than monomorphic call sites.
+ */
+public class TypePolluter {
+
+ /**
+ * Builds a small throwaway in-memory index and runs a set of queries against it before the
+ * benchmark starts, so that the hot query/reader call sites have already seen multiple concrete
+ * implementations (ByteBuffersDirectory inputs, deleted-docs branches, ExitableDirectoryReader
+ * wrappers) rather than being profiled as monomorphic.
+ *
+ * @throws IOException if indexing or searching the temporary in-memory index fails
+ */
+ public static void pollute() throws IOException {
+ // Use ByteBuffersDirectory instead of MMapDirectory to have multiple IndexInput sub-classes used by queries
+ try (Directory dir = new ByteBuffersDirectory()) {
+
+ // TODO: configure a non-default codec?
+ // NOTE(review): the analyzer is null — assumes nothing indexed below is tokenized
+ // (only StringField, doc values and vector fields are added); confirm if fields change.
+ IndexWriterConfig config = new IndexWriterConfig(null);
+
+ try (IndexWriter w = new IndexWriter(dir, config)) {
+ // Add enough documents for the inverted index to have full blocks (128 postings)
+ int docCount = 1024;
+ for (int i = 0; i < docCount; ++i) {
+ Document doc = new Document();
+ // Unique per-doc key, used later to target deletions.
+ doc.add(new StringField("id", Integer.toString(i), Store.NO));
+ // Co-prime moduli (3, 7, 11, 13) give each field a different density, so
+ // postings, doc values and vectors all get both denser and sparser shapes.
+ if (i % 3 != 0) {
+ doc.add(new StringField("body", "a", Store.NO));
+ doc.add(new NumericDocValuesField("int1", i % 3));
+ }
+ if (i % 7 != 0) {
+ doc.add(new StringField("body", "b", Store.NO));
+ doc.add(new NumericDocValuesField("int2", i % 7));
+ }
+ if (i % 11 != 0) {
+ doc.add(new StringField("body", "c", Store.NO));
+ doc.add(new NumericDocValuesField("int3", i % 11));
+ }
+ if (i % 13 != 0) {
+ doc.add(new KnnFloatVectorField("vector", new float[] { i % 7 }));
+ }
+ w.addDocument(doc);
+ }
+ // Collapse to a single segment before the first query pass.
+ w.forceMerge(1);
+
+ try (DirectoryReader reader = DirectoryReader.open(w)) {
+ // Run queries with no deletions
+ runQueries(reader);
+ }
+ // Add deleted docs to make sure that branches that exercise deleted docs are used even
+ // though the benchmark may be running with no deleted docs
+ for (int i = 0; i < docCount; i += 23) {
+ w.deleteDocuments(new Term("id", Integer.toString(i)));
+ }
+ // Reopen so the new reader sees the deletions made above.
+ try (DirectoryReader reader = DirectoryReader.open(w)) {
+ // Now run queries with deletions
+ runQueries(reader);
+ // ExitableDirectoryReader adds lots of wrappers everywhere
+ runQueries(new ExitableDirectoryReader(reader, new QueryTimeoutImpl(Long.MAX_VALUE)));
+ }
+ }
+ }
+ }
+
+ private static void runQueries(DirectoryReader reader) throws IOException {
+ IndexSearcher searcher = new IndexSearcher(reader);
+ // Exercise multiple similarities
+ IndexSearcher booleanSearcher = new IndexSearcher(reader);
+ booleanSearcher.setSimilarity(new BooleanSimilarity());
+ IndexSearcher classicSearcher = new IndexSearcher(reader);
+ classicSearcher.setSimilarity(new ClassicSimilarity());
+
+ Query query1 = new TermQuery(new Term("body", "a"));
+ Query query2 = new TermQuery(new Term("body", "b"));
+ Query query3 = new FieldExistsQuery("int1");
+ Query query4 = new FieldExistsQuery("int2");
+ Query query5 = new BooleanQuery.Builder()
+ .add(query1, Occur.SHOULD)
+ .add(query2, Occur.SHOULD)
+ .build();
+ Query query6 = new BooleanQuery.Builder()
+ .add(query1, Occur.MUST)
+ .add(query2, Occur.MUST)
+ .build();
+ Query query7 = new BooleanQuery.Builder()
+ .add(query3, Occur.SHOULD)
+ .add(query4, Occur.SHOULD)
+ .build();
+ Query query8 = new BooleanQuery.Builder()
+ .add(query3, Occur.MUST)
+ .add(query4, Occur.MUST)
+ .build();
+
+ Query[] baseQueries = new Query[] { query1, query2, query3, query4, query5, query6, query7, query8 };
+
+ // dense filter
+ Query filter1 = new TermQuery(new Term("body", "c"));
+ // sparse filter (especially useful to make sure that the vector search query exercises exact search)
+ Query filter2 = new TermQuery(new Term("id", "1"));
+ // filter not based on postings
+ Query filter3 = new FieldExistsQuery("int3");
+
+ List