-
Notifications
You must be signed in to change notification settings - Fork 25.8k
Adding bulkSize for benchmarking to better reflect real-world usage #142480
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b981bee
eed5d7a
f0162be
346aa60
6c96bf5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -59,7 +59,7 @@ | |
| * Benchmark that compares bulk scoring of various scalar quantized vector similarity function | ||
| * implementations (scalar, lucene's panama-ized, and Elasticsearch's native) against sequential | ||
| * and random access target vectors. | ||
| * Run with ./gradlew -p benchmarks run --args 'VectorScorerInt7uBulkScorerBenchmark' | ||
| * Run with ./gradlew -p benchmarks run --args 'VectorScorerInt7uBulkBenchmark' | ||
| */ | ||
| @Fork(value = 1, jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" }) | ||
| @Warmup(iterations = 3, time = 3) | ||
|
|
@@ -80,14 +80,22 @@ public class VectorScorerInt7uBulkBenchmark { | |
| @Param({ "1024" }) | ||
| public int dims; | ||
|
|
||
| // 128k is typically enough to not fit in L1 (core) cache for most processors; | ||
| // 1.5M is typically enough to not fit in L2 (core) cache; | ||
| // 130M is enough to not fit in L3 cache | ||
| // 128kb is typically enough to not fit in L1 (core) cache for most processors; | ||
| // 1.5Mb is typically enough to not fit in L2 (core) cache; | ||
| // 130Mb is enough to not fit in L3 cache | ||
| @Param({ "128", "1500", "130000" }) | ||
| public int numVectors; | ||
| public int numVectorsToScore; | ||
|
|
||
| @Param | ||
| // Bulk sizes to test. | ||
| // DiskBBQ has two bulk sizes, 16 and 32 | ||
| // HNSW params will have the distributed ordinal bulk sizes depending on the number of connections in the graph | ||
| // The default is 16, maximum is 512, and the bottom layer is 2x that the configured setting, so 1024 is a maximum | ||
| // the MOST common case here is 32 | ||
| @Param({ "16", "32", "64", "256", "1024" }) | ||
| public int bulkSize; | ||
|
|
||
| @Param({ "SCALAR", "LUCENE", "NATIVE" }) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: unless we want to explicitly exclude an entry, just @param will do (this way we don't need to worry about keeping this list updated, in case we add a new implementation) |
||
| public VectorImplementation implementation; | ||
thecoop marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| @Param({ "DOT_PRODUCT", "EUCLIDEAN" }) | ||
|
|
@@ -163,6 +171,7 @@ public void setScoringOrdinal(int targetOrd) throws IOException { | |
| private float[] scores; | ||
| private int[] ordinals; | ||
| private int[] ids; | ||
| private int[] toScore; // scratch array for bulk scoring | ||
|
|
||
| private UpdateableRandomVectorScorer scorer; | ||
| private RandomVectorScorer queryScorer; | ||
|
|
@@ -213,7 +222,8 @@ void setup(VectorData vectorData) throws IOException { | |
| writeInt7VectorData(dir, vectorData.vectorData, vectorData.offsets); | ||
|
|
||
| numVectorsToScore = vectorData.numVectorsToScore; | ||
| scores = new float[numVectorsToScore]; | ||
| scores = new float[bulkSize]; | ||
| toScore = new int[bulkSize]; // scratch array for ordinal slices | ||
| ids = IntStream.range(0, numVectors).toArray(); | ||
| ordinals = vectorData.ordinals; | ||
|
|
||
|
|
@@ -255,43 +265,67 @@ public void teardown() throws IOException { | |
|
|
||
| @Benchmark | ||
| public float[] scoreMultipleSequential() throws IOException { | ||
| for (int v = 0; v < numVectorsToScore; v++) { | ||
| scores[v] = scorer.score(v); | ||
| int v = 0; | ||
| while (v < numVectorsToScore) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the same as before, but I'm OK with the change as it highlights that we always have smaller batches. |
||
| for (int i = 0; i < bulkSize && v < numVectorsToScore; i++, v++) { | ||
| scores[i] = scorer.score(v); | ||
| } | ||
| } | ||
| return scores; | ||
| } | ||
|
|
||
| @Benchmark | ||
| public float[] scoreMultipleRandom() throws IOException { | ||
| for (int v = 0; v < numVectorsToScore; v++) { | ||
| scores[v] = scorer.score(ordinals[v]); | ||
| int v = 0; | ||
| while (v < numVectorsToScore) { | ||
| for (int i = 0; i < bulkSize && v < numVectorsToScore; i++, v++) { | ||
| scores[i] = scorer.score(ordinals[v]); | ||
| } | ||
| } | ||
| return scores; | ||
| } | ||
|
|
||
| @Benchmark | ||
| public float[] scoreQueryMultipleRandom() throws IOException { | ||
| for (int v = 0; v < numVectorsToScore; v++) { | ||
| scores[v] = queryScorer.score(ordinals[v]); | ||
| int v = 0; | ||
| while (v < numVectorsToScore) { | ||
| for (int i = 0; i < bulkSize && v < numVectorsToScore; i++, v++) { | ||
| scores[i] = queryScorer.score(ordinals[v]); | ||
| } | ||
| } | ||
| return scores; | ||
| } | ||
|
|
||
| @Benchmark | ||
| public float[] scoreMultipleSequentialBulk() throws IOException { | ||
| scorer.bulkScore(ids, scores, ordinals.length); | ||
| for (int i = 0; i < numVectorsToScore; i += bulkSize) { | ||
| int toScoreInThisBatch = Math.min(bulkSize, numVectorsToScore - i); | ||
| // Copy the slice of sequential IDs to the scratch array | ||
| System.arraycopy(ids, i, toScore, 0, toScoreInThisBatch); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is probably not negligible in terms of impact on the benchmark, or is it?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It has an impact. I don't know of a better way other than pre-allocating ALL the batches ahead of time. If we are ok with the heap usage, we can do that instead
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How does the real code do this? Does it create a new array? Use a scratch like the benchmark? I think we should do the same. (or add it, with the existing implementation calling
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But that's a problem for another day :)
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
For a query, it creates a new score & batch array and those single arrays are used during the entire time of the query, which means over many score runs. However, that also means that the IDs are that are USED for batch are indeed copied in (prod is actually much slower, popping from a queue individually for HNSW) |
||
| scorer.bulkScore(toScore, scores, toScoreInThisBatch); | ||
| } | ||
| return scores; | ||
| } | ||
|
|
||
| @Benchmark | ||
| public float[] scoreMultipleRandomBulk() throws IOException { | ||
| scorer.bulkScore(ordinals, scores, ordinals.length); | ||
| for (int i = 0; i < numVectorsToScore; i += bulkSize) { | ||
| int toScoreInThisBatch = Math.min(bulkSize, numVectorsToScore - i); | ||
| // Copy the slice of random ordinals to the scratch array | ||
| System.arraycopy(ordinals, i, toScore, 0, toScoreInThisBatch); | ||
| scorer.bulkScore(toScore, scores, toScoreInThisBatch); | ||
| } | ||
| return scores; | ||
| } | ||
|
|
||
| @Benchmark | ||
| public float[] scoreQueryMultipleRandomBulk() throws IOException { | ||
| queryScorer.bulkScore(ordinals, scores, ordinals.length); | ||
| for (int i = 0; i < numVectorsToScore; i += bulkSize) { | ||
| int toScoreInThisBatch = Math.min(bulkSize, numVectorsToScore - i); | ||
| // Copy the slice of random ordinals to the scratch array | ||
| System.arraycopy(ordinals, i, toScore, 0, toScoreInThisBatch); | ||
| queryScorer.bulkScore(toScore, scores, toScoreInThisBatch); | ||
| } | ||
| return scores; | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -49,6 +49,7 @@ public void testSequential() throws Exception { | |
| bench.dims = dims; | ||
| bench.numVectors = 1000; | ||
| bench.numVectorsToScore = 200; | ||
| bench.bulkSize = 200; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would use a number < numVectorsToScore to better exercise the two nested loops
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I do this because testing makes so many different assumptions about the return value. It will be a significant rewrite |
||
| bench.setup(vectorData); | ||
|
|
||
| try { | ||
|
|
@@ -79,6 +80,7 @@ public void testRandom() throws Exception { | |
| bench.dims = dims; | ||
| bench.numVectors = 1000; | ||
| bench.numVectorsToScore = 200; | ||
| bench.bulkSize = 200; | ||
| bench.setup(vectorData); | ||
|
|
||
| try { | ||
|
|
@@ -110,6 +112,7 @@ public void testQueryRandom() throws Exception { | |
| bench.dims = dims; | ||
| bench.numVectors = 1000; | ||
| bench.numVectorsToScore = 200; | ||
| bench.bulkSize = 200; | ||
| bench.setup(vectorData); | ||
|
|
||
| try { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@ChrisHegarty FYI, I think Lucene benchmarks should be updated in the same way?