Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion libs/native/libraries/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ configurations {
}

var zstdVersion = "1.5.7"
var vecVersion = "1.0.42"
var vecVersion = "1.0.44"

repositories {
exclusiveContent {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ public static Iterable<Object[]> parametersFactory() {
// remove all square distance (not implemented yet)
baseParams.removeIf(os -> os[0] == VectorSimilarityFunctions.Function.SQUARE_DISTANCE);

// duplicate for int1 & int2
return () -> Stream.of(VectorSimilarityFunctions.BBQType.values())
.flatMap(bbq -> baseParams.stream().map(os -> CollectionUtils.concatLists(List.of(bbq), Arrays.asList(os))))
.map(List::toArray)
Expand Down
2 changes: 1 addition & 1 deletion libs/simdvec/native/publish_vec_binaries.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ if [ -z "$ARTIFACTORY_API_KEY" ]; then
exit 1;
fi

VERSION="1.0.42"
VERSION="1.0.44"
ARTIFACTORY_REPOSITORY="${ARTIFACTORY_REPOSITORY:-https://artifactory.elastic.dev/artifactory/elasticsearch-native/}"
TEMP=$(mktemp -d)

Expand Down
185 changes: 162 additions & 23 deletions libs/simdvec/native/src/vec/c/amd64/vec_1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,44 @@ static inline void sqri7u_inner_bulk(
const int32_t count,
f32_t* results
) {
for (int c = 0; c < count; c++) {
const int blk = dims & ~(STRIDE_BYTES_LEN - 1);
Copy link
Copy Markdown
Member Author

@thecoop thecoop Feb 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This bulk pattern seems to be template-able, but it needs:

  • the 'inner' method to be a template param (here sqri7u_inner)
  • the vectors tail method can be a template param (vec_sqri7u)
  • the dimension tail. This is a lot less obvious, as it's in-line to the method, and there are several method variables updated by that bit of code. How could that be templated out sensibly?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alternatively, we could use classes, but then I'm not sure what that does to inline-ability and (potentially virtual) method calls

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have tried to stay away from classes, and especially virtual methods, up to now, and to focus on what can be done with templating instead.
A template would probably not be easy on the eyes; as you point out, it would require three template functions.

And it would be good to look at the cost of virtual calls for bulk; the overhead for single distance/score made it not feasible, but maybe with bulk operations we are OK?

Let's split and try both; I can give a template-based version a go, you could try a class-based one.

const int lines_to_fetch = dims / CACHE_LINE_SIZE + 1;
int c = 0;

const int8_t* a0 = safe_mapper_offset<int8_t, 0, mapper>(a, pitch, offsets, count);
const int8_t* a1 = safe_mapper_offset<int8_t, 1, mapper>(a, pitch, offsets, count);

// Process a batch of 2 vectors at a time, after instructing the CPU to
// prefetch the next batch.
for (; c + 3 < count; c += 2) {
const int8_t* next_a0 = a + mapper(c + 2, offsets) * pitch;
const int8_t* next_a1 = a + mapper(c + 3, offsets) * pitch;

prefetch(next_a0, lines_to_fetch);
prefetch(next_a1, lines_to_fetch);

int32_t res0 = 0;
int32_t res1 = 0;
int i = 0;
if (dims > STRIDE_BYTES_LEN) {
i = blk;
res0 = sqri7u_inner(a0, b, i);
res1 = sqri7u_inner(a1, b, i);
}
for (; i < dims; i++) {
int32_t dist0 = a0[i] - b[i];
int32_t dist1 = a1[i] - b[i];
res0 += dist0 * dist0;
res1 += dist1 * dist1;
}
results[c + 0] = (f32_t)res0;
results[c + 1] = (f32_t)res1;
a0 = next_a0;
a1 = next_a1;
}

// Tail-handling: remaining vectors
for (; c < count; c++) {
const int8_t* a0 = a + mapper(c, offsets) * pitch;
results[c] = (f32_t)vec_sqri7u(a0, b, dims);
}
Expand Down Expand Up @@ -346,7 +383,43 @@ static inline void doti8_inner_bulk(
const int32_t count,
f32_t* results
) {
for (int c=0; c<count; c++) {
const int blk = dims & ~(STRIDE_BYTES_LEN - 1);
const int lines_to_fetch = dims / CACHE_LINE_SIZE + 1;
int c = 0;

const int8_t* a0 = safe_mapper_offset<int8_t, 0, mapper>(a, pitch, offsets, count);
const int8_t* a1 = safe_mapper_offset<int8_t, 1, mapper>(a, pitch, offsets, count);

// Process a batch of 2 vectors at a time, after instructing the CPU to
// prefetch the next batch.
for (; c + 3 < count; c += 2) {
const int8_t* next_a0 = a + mapper(c + 2, offsets) * pitch;
const int8_t* next_a1 = a + mapper(c + 3, offsets) * pitch;

prefetch(next_a0, lines_to_fetch);
prefetch(next_a1, lines_to_fetch);

int32_t res0 = 0;
int32_t res1 = 0;
int i = 0;
if (dims > STRIDE_BYTES_LEN) {
i = blk;
res0 = doti8_inner(a0, b, i);
res1 = doti8_inner(a1, b, i);
}
for (; i < dims; i++) {
const int8_t bb = b[i];
res0 += a0[i] * bb;
res1 += a1[i] * bb;
}
results[c + 0] = (f32_t)res0;
results[c + 1] = (f32_t)res1;
a0 = next_a0;
a1 = next_a1;
}

// Tail-handling: remaining vectors
for (; c<count; c++) {
const int8_t* a0 = a + mapper(c, offsets) * pitch;
results[c] = vec_doti8(a0, b, dims);
}
Expand Down Expand Up @@ -415,7 +488,44 @@ static inline void sqri8_inner_bulk(
const int32_t count,
f32_t* results
) {
for (int c=0; c<count; c++) {
const int blk = dims & ~(STRIDE_BYTES_LEN - 1);
const int lines_to_fetch = dims / CACHE_LINE_SIZE + 1;
int c = 0;

const int8_t* a0 = safe_mapper_offset<int8_t, 0, mapper>(a, pitch, offsets, count);
const int8_t* a1 = safe_mapper_offset<int8_t, 1, mapper>(a, pitch, offsets, count);

// Process a batch of 2 vectors at a time, after instructing the CPU to
// prefetch the next batch.
for (; c + 3 < count; c += 2) {
const int8_t* next_a0 = a + mapper(c + 2, offsets) * pitch;
const int8_t* next_a1 = a + mapper(c + 3, offsets) * pitch;

prefetch(next_a0, lines_to_fetch);
prefetch(next_a1, lines_to_fetch);

int32_t res0 = 0;
int32_t res1 = 0;
int i = 0;
if (dims > STRIDE_BYTES_LEN) {
i = blk;
res0 = sqri8_inner(a0, b, i);
res1 = sqri8_inner(a1, b, i);
}
for (; i < dims; i++) {
int32_t dist0 = a0[i] - b[i];
int32_t dist1 = a1[i] - b[i];
res0 += dist0 * dist0;
res1 += dist1 * dist1;
}
results[c + 0] = (f32_t)res0;
results[c + 1] = (f32_t)res1;
a0 = next_a0;
a1 = next_a1;
}

// Tail-handling: remaining vectors
for (; c<count; c++) {
const int8_t* a0 = a + mapper(c, offsets) * pitch;
results[c] = vec_sqri8(a0, b, dims);
}
Expand Down Expand Up @@ -758,16 +868,6 @@ EXPORT int64_t vec_dotd1q4(
return dotd1q4_inner(a_ptr, query_ptr, length);
}

// Dot product of a 2-bit quantized vector against a query.
// The 2-bit vector is laid out as two consecutive stripes of length/2
// bytes each; the second (higher) stripe contributes with weight 2,
// hence the shift by one before summing.
EXPORT int64_t vec_dotd2q4(
    const int8_t* a_ptr,
    const int8_t* query_ptr,
    const int32_t length
) {
    const int32_t half = length / 2;
    const int64_t bit0 = dotd1q4_inner(a_ptr, query_ptr, half);
    const int64_t bit1 = dotd1q4_inner(a_ptr + half, query_ptr, half);
    return bit0 + (bit1 << 1);
}

template <int64_t(*mapper)(const int32_t, const int32_t*)>
static inline void dotd1q4_inner_bulk(
const int8_t* a,
Expand Down Expand Up @@ -839,6 +939,16 @@ EXPORT void vec_dotd1q4_bulk_offsets(
dotd1q4_inner_bulk<array_mapper>(a, query, length, pitch, offsets, count, results);
}

// Computes the dot product for a 2-bit vector by combining two 1-bit
// stripe dot products: the first length/2 bytes hold the low stripe and
// the next length/2 bytes the high stripe, which is weighted by 2.
EXPORT int64_t vec_dotd2q4(
    const int8_t* a_ptr,
    const int8_t* query_ptr,
    const int32_t length
) {
    int64_t acc = dotd1q4_inner(a_ptr, query_ptr, length / 2);
    acc += dotd1q4_inner(a_ptr + length / 2, query_ptr, length / 2) << 1;
    return acc;
}

template <int64_t(*mapper)(const int32_t, const int32_t*)>
static inline void dotd2q4_inner_bulk(
const int8_t* a,
Expand All @@ -856,23 +966,23 @@ static inline void dotd2q4_inner_bulk(
const int8_t* a0 = safe_mapper_offset<int8_t, 0, mapper>(a, pitch, offsets, count);
const int8_t* a1 = safe_mapper_offset<int8_t, 1, mapper>(a, pitch, offsets, count);

// Process a batch of 2 vectors at a time, after instructing the CPU to
// prefetch the next batch.
// Prefetching multiple memory locations while computing keeps the CPU
// execution units busy.
for (; c + 3 < count; c += 2) {
// Process 2 vectors at a time, after instructing the CPU to
// prefetch the next vectors (both stripes).
for (; c + 2 < count; c+=2) {
const int8_t* next_a0 = a + mapper(c + 2, offsets) * pitch;
const int8_t* next_a1 = a + mapper(c + 3, offsets) * pitch;

prefetch(next_a0, lines_to_fetch);
prefetch(next_a0 + bit_length, lines_to_fetch);
prefetch(next_a1, lines_to_fetch);
prefetch(next_a1 + bit_length, lines_to_fetch);

int64_t lower0 = dotd1q4_inner(a0, query, bit_length);
int64_t lower1 = dotd1q4_inner(a1, query, bit_length);
int64_t upper0 = dotd1q4_inner(a0 + bit_length, query, bit_length);
int64_t lower1 = dotd1q4_inner(a1, query, bit_length);
int64_t upper1 = dotd1q4_inner(a1 + bit_length, query, bit_length);

results[c + 0] = (f32_t)(lower0 + (upper0 << 1));
results[c] = (f32_t)(lower0 + (upper0 << 1));
results[c + 1] = (f32_t)(lower1 + (upper1 << 1));

a0 = next_a0;
Expand All @@ -882,8 +992,8 @@ static inline void dotd2q4_inner_bulk(
// Tail-handling: remaining vectors
for (; c < count; c++) {
const int8_t* a0 = a + mapper(c, offsets) * pitch;
int64_t lower = dotd1q4_inner(a0, query, length/2);
int64_t upper = dotd1q4_inner(a0 + length/2, query, length/2);
int64_t lower = dotd1q4_inner(a0, query, bit_length);
int64_t upper = dotd1q4_inner(a0 + bit_length, query, bit_length);
results[c] = (f32_t)(lower + (upper << 1));
}
}
Expand Down Expand Up @@ -927,8 +1037,37 @@ static inline void dotd4q4_inner_bulk(
const int32_t count,
f32_t* results
) {
const int lines_to_fetch = length / CACHE_LINE_SIZE + 1;
const int32_t bit_length = length / 4;
for (int c = 0; c < count; c++) {
int c = 0;

const int8_t* a0 = safe_mapper_offset<int8_t, 0, mapper>(a, pitch, offsets, count);

// Process one vector, after instructing the CPU to prefetch the next vector
for (; c + 1 < count; c++) {
const int8_t* next_a0 = a + mapper(c + 1, offsets) * pitch;

// prefetch stripes 2 and 3 now
prefetch(a0 + 2 * bit_length, lines_to_fetch);
prefetch(a0 + 3 * bit_length, lines_to_fetch);

int64_t p0 = dotd1q4_inner(a0, query, bit_length);
int64_t p1 = dotd1q4_inner(a0 + bit_length, query, bit_length);

// and 0 and 1 of the next vector
prefetch(next_a0, lines_to_fetch);
prefetch(next_a0 + bit_length, lines_to_fetch);

int64_t p2 = dotd1q4_inner(a0 + 2 * bit_length, query, bit_length);
int64_t p3 = dotd1q4_inner(a0 + 3 * bit_length, query, bit_length);

results[c] = (f32_t)(p0 + (p1 << 1) + (p2 << 2) + (p3 << 3));

a0 = next_a0;
}

// Tail-handling: remaining vector
for (; c < count; c++) {
const int8_t* a0 = a + mapper(c, offsets) * pitch;

int64_t p0 = dotd1q4_inner(a0 + 0 * bit_length, query, bit_length);
Expand Down
Loading