diff --git a/src/main/perf/SearchPerfTest.java b/src/main/perf/SearchPerfTest.java index fd8310412..3de21be4c 100755 --- a/src/main/perf/SearchPerfTest.java +++ b/src/main/perf/SearchPerfTest.java @@ -236,8 +236,13 @@ private static void _main(String[] clArgs) throws Exception { final int topN = args.getInt("-topN"); final boolean doStoredLoads = args.getFlag("-loadStoredFields"); final boolean exitable = args.getFlag("-exitable"); + final boolean pollute = args.getFlag("-pollute"); final TestContext testContext = TestContext.parse(args.getString("-context", "")); + if (pollute) { + TypePolluter.pollute(); + } + if (searchConcurrency == -1) { searchConcurrency = Runtime.getRuntime().availableProcessors(); } diff --git a/src/main/perf/TypePolluter.java b/src/main/perf/TypePolluter.java new file mode 100644 index 000000000..d698b2227 --- /dev/null +++ b/src/main/perf/TypePolluter.java @@ -0,0 +1,196 @@ +package perf; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.ExitableDirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.QueryTimeoutImpl; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FieldExistsQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.KnnFloatVectorQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.similarities.BooleanSimilarity; +import org.apache.lucene.search.similarities.ClassicSimilarity; +import org.apache.lucene.store.ByteBuffersDirectory; +import org.apache.lucene.store.Directory; + +/** + * This helper tries to pollute a bit the types that are typically seen by queries at call sites to + * help better simulate production systems that may + * + *

This matters because polymorphic call sites are much more expensive than bimorphic call sites, + * and bimorphic call sites may be noticeably more expensive than monomorphic call sites. + */ +public class TypePolluter { + + public static void pollute() throws IOException { + // Use ByteBuffersDirectory instead of MMapDirectory to have multiple IndexInput sub-classes used by queries + try (Directory dir = new ByteBuffersDirectory()) { + + // TODO: configure a non-default codec? + IndexWriterConfig config = new IndexWriterConfig(null); + + try (IndexWriter w = new IndexWriter(dir, config)) { + // Add enough documents for the inverted index to have full blocks (128 postings) + int docCount = 1024; + for (int i = 0; i < docCount; ++i) { + Document doc = new Document(); + doc.add(new StringField("id", Integer.toString(i), Store.NO)); + if (i % 3 != 0) { + doc.add(new StringField("body", "a", Store.NO)); + doc.add(new NumericDocValuesField("int1", i % 3)); + } + if (i % 7 != 0) { + doc.add(new StringField("body", "b", Store.NO)); + doc.add(new NumericDocValuesField("int2", i % 7)); + } + if (i % 11 != 0) { + doc.add(new StringField("body", "c", Store.NO)); + doc.add(new NumericDocValuesField("int3", i % 11)); + } + if (i % 13 != 0) { + doc.add(new KnnFloatVectorField("vector", new float[] { i % 7 })); + } + w.addDocument(doc); + } + w.forceMerge(1); + + try (DirectoryReader reader = DirectoryReader.open(w)) { + // Run queries with no deletions + runQueries(reader); + } + // Add deleted docs to make sure that branches that exercise deleted docs are used even + // though the benchmark may be running with no deleted docs + for (int i = 0; i < docCount; i += 23) { + w.deleteDocuments(new Term("id", Integer.toString(i))); + } + try (DirectoryReader reader = DirectoryReader.open(w)) { + // Now run queries with deletions + runQueries(reader); + // ExitableDirectoryReader adds lots of wrappers everywhere + runQueries(new ExitableDirectoryReader(reader, new QueryTimeoutImpl(Long.MAX_VALUE))); + } + } + } + } + + private static void runQueries(DirectoryReader reader) throws IOException { + IndexSearcher searcher = new IndexSearcher(reader); + // Exercise multiple similarities + IndexSearcher booleanSearcher = new IndexSearcher(reader); + booleanSearcher.setSimilarity(new BooleanSimilarity()); + IndexSearcher classicSearcher = new IndexSearcher(reader); + classicSearcher.setSimilarity(new ClassicSimilarity()); + + Query query1 = new TermQuery(new Term("body", "a")); + Query query2 = new TermQuery(new Term("body", "b")); + Query query3 = new FieldExistsQuery("int1"); + Query query4 = new FieldExistsQuery("int2"); + Query query5 = new BooleanQuery.Builder() + .add(query1, Occur.SHOULD) + .add(query2, Occur.SHOULD) + .build(); + Query query6 = new BooleanQuery.Builder() + .add(query1, Occur.MUST) + .add(query2, Occur.MUST) + .build(); + Query query7 = new BooleanQuery.Builder() + .add(query3, Occur.SHOULD) + .add(query4, Occur.SHOULD) + .build(); + Query query8 = new BooleanQuery.Builder() + .add(query3, Occur.MUST) + .add(query4, Occur.MUST) + .build(); + + Query[] baseQueries = new Query[] { query1, query2, query3, query4, query5, query6, query7, query8 }; + + // dense filter + Query filter1 = new TermQuery(new Term("body", "c")); + // sparse filter (especially useful to make sure that the vector search query exercises exact search) + Query filter2 = new TermQuery(new Term("id", "1")); + // filter not based on postings + Query filter3 = new FieldExistsQuery("int3"); + + List queries = new ArrayList<>(); + + for (Query query : baseQueries) { + queries.add(query); + for (Query filter : new Query[] { filter1, filter2, filter3 }) { + Query filteredQuery = new BooleanQuery.Builder() + .add(query, Occur.MUST) + .add(filter, Occur.FILTER) + .build(); + queries.add(filteredQuery); + } + } + + // Handle vector search separately since filters need to be applied differently + { + Query vectorQuery = new KnnFloatVectorQuery("vector", new float[] { 1.5f }, 10); + queries.add(vectorQuery); + for (Query filter : new Query[] { filter1, filter2, filter3 }) { + Query filteredQuery = new KnnFloatVectorQuery("vector", new float[] { 1.5f }, 10, filter); + queries.add(filteredQuery); + } + } + + for (Query query : queries) { + // Exhaustive evaluation, no scoring + int count = searcher.count(query); + // top-k evaluation, by score + TopDocs hits1 = searcher.search(query, 10); + TopDocs hits2 = booleanSearcher.search(query, 10); + TopDocs hits3 = classicSearcher.search(query, 10); + // top-k evaluation, by field + TopDocs hits4 = searcher.search(query, 10, new Sort(new SortField("int", SortField.Type.INT))); + + if (count == 0 + || hits1.totalHits.value() == 0 + || hits2.totalHits.value() == 0 + || hits3.totalHits.value() == 0 + || hits4.totalHits.value() == 0) { + // This helps catch errors if queries are malformed, and also prevents the JVM from skipping + // the query if we don't use the result + throw new Error("" + query); + } + } + } +} diff --git a/src/python/benchUtil.py b/src/python/benchUtil.py index 7430879d0..c1b6d2de2 100644 --- a/src/python/benchUtil.py +++ b/src/python/benchUtil.py @@ -1238,6 +1238,8 @@ def runSimpleSearchBench(self, iter, id, c, coldRun, seed, staticSeed, filter=No w("-vectorScale", c.vectorScale) if c.exitable: w("-exitable") + if c.pollute: + w("-pollute") print(" log: %s + stdout" % logFile) t0 = time.time() diff --git a/src/python/competition.py b/src/python/competition.py index 6e75a763d..a84d7a2e5 100644 --- a/src/python/competition.py +++ b/src/python/competition.py @@ -299,6 +299,7 @@ def __init__( javacCommand=constants.JAVAC_EXE, topN=100, testContext="", + pollute=True, ): self.name = name self.checkout = checkout @@ -350,6 +351,9 @@ def __init__( # See also TestContext#parse self.testContext = testContext + # Whether to pollute call sites so that they are not all magically monomorphic + self.pollute = pollute + def getAggregateProfilerResult(self, id, mode, count=30, stackSize=1): # we accept a sequence of stack sizes and will re-aggregate JFR results at each if type(stackSize) is int: