Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
* Benchmark that compares bulk scoring of various scalar quantized vector similarity function
* implementations (scalar, lucene's panama-ized, and Elasticsearch's native) against sequential
* and random access target vectors.
* Run with ./gradlew -p benchmarks run --args 'VectorScorerInt7uBulkScorerBenchmark'
* Run with ./gradlew -p benchmarks run --args 'VectorScorerInt7uBulkBenchmark'
*/
@Fork(value = 1, jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
@Warmup(iterations = 3, time = 3)
Expand All @@ -80,14 +80,22 @@ public class VectorScorerInt7uBulkBenchmark {
@Param({ "1024" })
public int dims;

// 128k is typically enough to not fit in L1 (core) cache for most processors;
// 1.5M is typically enough to not fit in L2 (core) cache;
// 130M is enough to not fit in L3 cache
// 128kb is typically enough to not fit in L1 (core) cache for most processors;
// 1.5Mb is typically enough to not fit in L2 (core) cache;
// 130Mb is enough to not fit in L3 cache
@Param({ "128", "1500", "130000" })
public int numVectors;
public int numVectorsToScore;

@Param
// Bulk sizes to test.
// DiskBBQ has two bulk sizes, 16 and 32
// HNSW params will have the distributed ordinal bulk sizes depending on the number of connections in the graph
// The default is 16, maximum is 512, and the bottom layer is 2x that the configured setting, so 1024 is a maximum
// the MOST common case here is 32
@Param({ "16", "32", "64", "256", "1024" })
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ChrisHegarty FYI, I think Lucene benchmarks should be updated in the same way?

public int bulkSize;

@Param({ "SCALAR", "LUCENE", "NATIVE" })
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: unless we want to explicitly exclude an entry, just @Param will do (this way we don't need to worry about keeping this list updated, in case we add a new implementation)

public VectorImplementation implementation;

@Param({ "DOT_PRODUCT", "EUCLIDEAN" })
Expand Down Expand Up @@ -163,6 +171,7 @@ public void setScoringOrdinal(int targetOrd) throws IOException {
private float[] scores;
private int[] ordinals;
private int[] ids;
private int[] toScore; // scratch array for bulk scoring

private UpdateableRandomVectorScorer scorer;
private RandomVectorScorer queryScorer;
Expand Down Expand Up @@ -213,7 +222,8 @@ void setup(VectorData vectorData) throws IOException {
writeInt7VectorData(dir, vectorData.vectorData, vectorData.offsets);

numVectorsToScore = vectorData.numVectorsToScore;
scores = new float[numVectorsToScore];
scores = new float[bulkSize];
toScore = new int[bulkSize]; // scratch array for ordinal slices
ids = IntStream.range(0, numVectors).toArray();
ordinals = vectorData.ordinals;

Expand Down Expand Up @@ -255,43 +265,67 @@ public void teardown() throws IOException {

@Benchmark
public float[] scoreMultipleSequential() throws IOException {
for (int v = 0; v < numVectorsToScore; v++) {
scores[v] = scorer.score(v);
int v = 0;
while (v < numVectorsToScore) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the same as before, but I'm OK with the change as it highlights that we always have smaller batches.

for (int i = 0; i < bulkSize && v < numVectorsToScore; i++, v++) {
scores[i] = scorer.score(v);
}
}
return scores;
}

@Benchmark
public float[] scoreMultipleRandom() throws IOException {
for (int v = 0; v < numVectorsToScore; v++) {
scores[v] = scorer.score(ordinals[v]);
int v = 0;
while (v < numVectorsToScore) {
for (int i = 0; i < bulkSize && v < numVectorsToScore; i++, v++) {
scores[i] = scorer.score(ordinals[v]);
}
}
return scores;
}

@Benchmark
public float[] scoreQueryMultipleRandom() throws IOException {
for (int v = 0; v < numVectorsToScore; v++) {
scores[v] = queryScorer.score(ordinals[v]);
int v = 0;
while (v < numVectorsToScore) {
for (int i = 0; i < bulkSize && v < numVectorsToScore; i++, v++) {
scores[i] = queryScorer.score(ordinals[v]);
}
}
return scores;
}

@Benchmark
public float[] scoreMultipleSequentialBulk() throws IOException {
scorer.bulkScore(ids, scores, ordinals.length);
for (int i = 0; i < numVectorsToScore; i += bulkSize) {
int toScoreInThisBatch = Math.min(bulkSize, numVectorsToScore - i);
// Copy the slice of sequential IDs to the scratch array
System.arraycopy(ids, i, toScore, 0, toScoreInThisBatch);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably not negligible in terms of impact on the benchmark, or is it?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it has an impact. I don't know of a better way other than pre-allocating ALL the batches ahead of time. If we are OK with the heap usage, we can do that instead

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does the real code do this? Does it creates a new array? Use a scratch like the benchmark? I think we should do the same.
Also, probably, there is room for improvement here; we can avoid copies if we change the API to

void bulkScore(int[] nodes, float[] scores, int offset, int bulkSize)

(or add it, with the existing implementation calling bulkScore(nodes, scores, 0, int bulkSize) or something like that)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But that's a problem for another day :)

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does the real code do this? Does it creates a new array? Use a scratch like the benchmark? I think we should do the same.

For a query, it creates a new score & batch array and those single arrays are used during the entire time of the query, which means over many score runs.

However, that also means that the IDs that are USED for the batch are indeed copied in (prod is actually much slower, popping from a queue individually for HNSW)

scorer.bulkScore(toScore, scores, toScoreInThisBatch);
}
return scores;
}

@Benchmark
public float[] scoreMultipleRandomBulk() throws IOException {
    // Bulk-score random ordinals in slices of bulkSize, copying each slice into
    // the pre-allocated scratch array — matching how production code feeds the
    // bulk scorer per-batch ID arrays rather than the whole candidate list.
    // NOTE: the scrape fused the old one-shot bulkScore call with this loop;
    // this is the post-change (merged) version.
    for (int i = 0; i < numVectorsToScore; i += bulkSize) {
        // Last batch may be shorter than bulkSize.
        int toScoreInThisBatch = Math.min(bulkSize, numVectorsToScore - i);
        // Copy the slice of random ordinals to the scratch array
        System.arraycopy(ordinals, i, toScore, 0, toScoreInThisBatch);
        scorer.bulkScore(toScore, scores, toScoreInThisBatch);
    }
    return scores;
}

@Benchmark
public float[] scoreQueryMultipleRandomBulk() throws IOException {
    // Same batched bulk-scoring pattern as scoreMultipleRandomBulk, but using
    // the query-vector scorer.
    // NOTE: the scrape fused the old one-shot bulkScore call with this loop;
    // this is the post-change (merged) version.
    for (int i = 0; i < numVectorsToScore; i += bulkSize) {
        // Last batch may be shorter than bulkSize.
        int toScoreInThisBatch = Math.min(bulkSize, numVectorsToScore - i);
        // Copy the slice of random ordinals to the scratch array
        System.arraycopy(ordinals, i, toScore, 0, toScoreInThisBatch);
        queryScorer.bulkScore(toScore, scores, toScoreInThisBatch);
    }
    return scores;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ public void testSequential() throws Exception {
bench.dims = dims;
bench.numVectors = 1000;
bench.numVectorsToScore = 200;
bench.bulkSize = 200;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would use a number < numVectorsToScore to exercise better the 2 nested loops

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do this because testing makes so many different assumptions on the return value. it will be a significant rewrite

bench.setup(vectorData);

try {
Expand Down Expand Up @@ -79,6 +80,7 @@ public void testRandom() throws Exception {
bench.dims = dims;
bench.numVectors = 1000;
bench.numVectorsToScore = 200;
bench.bulkSize = 200;
bench.setup(vectorData);

try {
Expand Down Expand Up @@ -110,6 +112,7 @@ public void testQueryRandom() throws Exception {
bench.dims = dims;
bench.numVectors = 1000;
bench.numVectorsToScore = 200;
bench.bulkSize = 200;
bench.setup(vectorData);

try {
Expand Down