Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion libs/native/libraries/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ configurations {
}

var zstdVersion = "1.5.7"
var vecVersion = "1.0.42"
var vecVersion = "1.0.44"

repositories {
exclusiveContent {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ public static Iterable<Object[]> parametersFactory() {
// remove all square distance (not implemented yet)
baseParams.removeIf(os -> os[0] == VectorSimilarityFunctions.Function.SQUARE_DISTANCE);

// duplicate for int1 & int2
return () -> Stream.of(VectorSimilarityFunctions.BBQType.values())
.flatMap(bbq -> baseParams.stream().map(os -> CollectionUtils.concatLists(List.of(bbq), Arrays.asList(os))))
.map(List::toArray)
Expand Down
2 changes: 1 addition & 1 deletion libs/simdvec/native/publish_vec_binaries.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ if [ -z "$ARTIFACTORY_API_KEY" ]; then
exit 1;
fi

VERSION="1.0.42"
VERSION="1.0.44"
ARTIFACTORY_REPOSITORY="${ARTIFACTORY_REPOSITORY:-https://artifactory.elastic.dev/artifactory/elasticsearch-native/}"
TEMP=$(mktemp -d)

Expand Down
185 changes: 162 additions & 23 deletions libs/simdvec/native/src/vec/c/amd64/vec_1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,44 @@ static inline void sqri7u_inner_bulk(
const int32_t count,
f32_t* results
) {
for (int c = 0; c < count; c++) {
const int blk = dims & ~(STRIDE_BYTES_LEN - 1);
Copy link
Copy Markdown
Member Author

@thecoop thecoop Feb 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This bulk pattern seems to be template-able, but it needs:

  • the 'inner' method to be a template param (here sqri7u_inner)
  • the vectors tail method can be a template param (vec_sqri7u)
  • the dimension tail. This is a lot less obvious, as it's in-line to the method, and there are several method variables updated by that bit of code. How could that be templated out sensibly?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alternatively, we could use classes, but then I'm not sure what that does to inline-ability and (potentially virtual) method calls

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have tried to stay away from classes, and especially virtual methods, up to now, and to focus on what can be done with templating instead.
A template would probably not be easy on the eyes; as you point out, it would require three template functions.

And it would be good to look at the cost of virtual calls for bulk; the overhead for single distance/score made it not feasible, but maybe with bulk operations we are OK?

Let's split and try both; I can give a template-based version a go, you could try a class-based one.

const int lines_to_fetch = dims / CACHE_LINE_SIZE + 1;
int c = 0;

const int8_t* a0 = safe_mapper_offset<int8_t, 0, mapper>(a, pitch, offsets, count);
const int8_t* a1 = safe_mapper_offset<int8_t, 1, mapper>(a, pitch, offsets, count);

// Process a batch of 2 vectors at a time, after instructing the CPU to
// prefetch the next batch.
for (; c + 3 < count; c += 2) {
const int8_t* next_a0 = a + mapper(c + 2, offsets) * pitch;
const int8_t* next_a1 = a + mapper(c + 3, offsets) * pitch;

prefetch(next_a0, lines_to_fetch);
prefetch(next_a1, lines_to_fetch);

int32_t res0 = 0;
int32_t res1 = 0;
int i = 0;
if (dims > STRIDE_BYTES_LEN) {
i = blk;
res0 = sqri7u_inner(a0, b, i);
res1 = sqri7u_inner(a1, b, i);
}
for (; i < dims; i++) {
int32_t dist0 = a0[i] - b[i];
int32_t dist1 = a1[i] - b[i];
res0 += dist0 * dist0;
res1 += dist1 * dist1;
}
results[c + 0] = (f32_t)res0;
results[c + 1] = (f32_t)res1;
a0 = next_a0;
a1 = next_a1;
}

// Tail-handling: remaining vectors
for (; c < count; c++) {
const int8_t* a0 = a + mapper(c, offsets) * pitch;
results[c] = (f32_t)vec_sqri7u(a0, b, dims);
}
Expand Down Expand Up @@ -346,7 +383,43 @@ static inline void doti8_inner_bulk(
const int32_t count,
f32_t* results
) {
for (int c=0; c<count; c++) {
const int blk = dims & ~(STRIDE_BYTES_LEN - 1);
const int lines_to_fetch = dims / CACHE_LINE_SIZE + 1;
int c = 0;

const int8_t* a0 = safe_mapper_offset<int8_t, 0, mapper>(a, pitch, offsets, count);
const int8_t* a1 = safe_mapper_offset<int8_t, 1, mapper>(a, pitch, offsets, count);

// Process a batch of 2 vectors at a time, after instructing the CPU to
// prefetch the next batch.
for (; c + 3 < count; c += 2) {
const int8_t* next_a0 = a + mapper(c + 2, offsets) * pitch;
const int8_t* next_a1 = a + mapper(c + 3, offsets) * pitch;

prefetch(next_a0, lines_to_fetch);
prefetch(next_a1, lines_to_fetch);

int32_t res0 = 0;
int32_t res1 = 0;
int i = 0;
if (dims > STRIDE_BYTES_LEN) {
i = blk;
res0 = doti8_inner(a0, b, i);
res1 = doti8_inner(a1, b, i);
}
for (; i < dims; i++) {
const int8_t bb = b[i];
res0 += a0[i] * bb;
res1 += a1[i] * bb;
}
results[c + 0] = (f32_t)res0;
results[c + 1] = (f32_t)res1;
a0 = next_a0;
a1 = next_a1;
}

// Tail-handling: remaining vectors
for (; c<count; c++) {
const int8_t* a0 = a + mapper(c, offsets) * pitch;
results[c] = vec_doti8(a0, b, dims);
}
Expand Down Expand Up @@ -415,7 +488,44 @@ static inline void sqri8_inner_bulk(
const int32_t count,
f32_t* results
) {
for (int c=0; c<count; c++) {
const int blk = dims & ~(STRIDE_BYTES_LEN - 1);
const int lines_to_fetch = dims / CACHE_LINE_SIZE + 1;
int c = 0;

const int8_t* a0 = safe_mapper_offset<int8_t, 0, mapper>(a, pitch, offsets, count);
const int8_t* a1 = safe_mapper_offset<int8_t, 1, mapper>(a, pitch, offsets, count);

// Process a batch of 2 vectors at a time, after instructing the CPU to
// prefetch the next batch.
for (; c + 3 < count; c += 2) {
const int8_t* next_a0 = a + mapper(c + 2, offsets) * pitch;
const int8_t* next_a1 = a + mapper(c + 3, offsets) * pitch;

prefetch(next_a0, lines_to_fetch);
prefetch(next_a1, lines_to_fetch);

int32_t res0 = 0;
int32_t res1 = 0;
int i = 0;
if (dims > STRIDE_BYTES_LEN) {
i = blk;
res0 = sqri8_inner(a0, b, i);
res1 = sqri8_inner(a1, b, i);
}
for (; i < dims; i++) {
int32_t dist0 = a0[i] - b[i];
int32_t dist1 = a1[i] - b[i];
res0 += dist0 * dist0;
res1 += dist1 * dist1;
}
results[c + 0] = (f32_t)res0;
results[c + 1] = (f32_t)res1;
a0 = next_a0;
a1 = next_a1;
}

// Tail-handling: remaining vectors
for (; c<count; c++) {
const int8_t* a0 = a + mapper(c, offsets) * pitch;
results[c] = vec_sqri8(a0, b, dims);
}
Expand Down Expand Up @@ -758,16 +868,6 @@ EXPORT int64_t vec_dotd1q4(
return dotd1q4_inner(a_ptr, query_ptr, length);
}

// Dot product of a 2-bit quantized vector against a query.
// The 2-bit vector is laid out as two consecutive stripes of length/2
// bytes each; the second (higher) stripe contributes with weight 2,
// hence the shift by one before summing.
EXPORT int64_t vec_dotd2q4(
    const int8_t* a_ptr,
    const int8_t* query_ptr,
    const int32_t length
) {
    const int32_t half = length / 2;
    const int64_t bit0 = dotd1q4_inner(a_ptr, query_ptr, half);
    const int64_t bit1 = dotd1q4_inner(a_ptr + half, query_ptr, half);
    return bit0 + (bit1 << 1);
}

template <int64_t(*mapper)(const int32_t, const int32_t*)>
static inline void dotd1q4_inner_bulk(
const int8_t* a,
Expand Down Expand Up @@ -839,6 +939,16 @@ EXPORT void vec_dotd1q4_bulk_offsets(
dotd1q4_inner_bulk<array_mapper>(a, query, length, pitch, offsets, count, results);
}

// Computes the dot product for a 2-bit vector by combining two 1-bit
// stripe dot products: the first length/2 bytes hold the low stripe and
// the next length/2 bytes the high stripe, which is weighted by 2.
EXPORT int64_t vec_dotd2q4(
    const int8_t* a_ptr,
    const int8_t* query_ptr,
    const int32_t length
) {
    int64_t acc = dotd1q4_inner(a_ptr, query_ptr, length / 2);
    acc += dotd1q4_inner(a_ptr + length / 2, query_ptr, length / 2) << 1;
    return acc;
}

template <int64_t(*mapper)(const int32_t, const int32_t*)>
static inline void dotd2q4_inner_bulk(
const int8_t* a,
Expand All @@ -856,23 +966,23 @@ static inline void dotd2q4_inner_bulk(
const int8_t* a0 = safe_mapper_offset<int8_t, 0, mapper>(a, pitch, offsets, count);
const int8_t* a1 = safe_mapper_offset<int8_t, 1, mapper>(a, pitch, offsets, count);

// Process a batch of 2 vectors at a time, after instructing the CPU to
// prefetch the next batch.
// Prefetching multiple memory locations while computing keeps the CPU
// execution units busy.
for (; c + 3 < count; c += 2) {
// Process 2 vectors at a time, after instructing the CPU to
// prefetch the next vectors (both stripes).
for (; c + 2 < count; c+=2) {
const int8_t* next_a0 = a + mapper(c + 2, offsets) * pitch;
const int8_t* next_a1 = a + mapper(c + 3, offsets) * pitch;

prefetch(next_a0, lines_to_fetch);
prefetch(next_a0 + bit_length, lines_to_fetch);
prefetch(next_a1, lines_to_fetch);
prefetch(next_a1 + bit_length, lines_to_fetch);

int64_t lower0 = dotd1q4_inner(a0, query, bit_length);
int64_t lower1 = dotd1q4_inner(a1, query, bit_length);
int64_t upper0 = dotd1q4_inner(a0 + bit_length, query, bit_length);
int64_t lower1 = dotd1q4_inner(a1, query, bit_length);
int64_t upper1 = dotd1q4_inner(a1 + bit_length, query, bit_length);

results[c + 0] = (f32_t)(lower0 + (upper0 << 1));
results[c] = (f32_t)(lower0 + (upper0 << 1));
results[c + 1] = (f32_t)(lower1 + (upper1 << 1));

a0 = next_a0;
Expand All @@ -882,8 +992,8 @@ static inline void dotd2q4_inner_bulk(
// Tail-handling: remaining vectors
for (; c < count; c++) {
const int8_t* a0 = a + mapper(c, offsets) * pitch;
int64_t lower = dotd1q4_inner(a0, query, length/2);
int64_t upper = dotd1q4_inner(a0 + length/2, query, length/2);
int64_t lower = dotd1q4_inner(a0, query, bit_length);
int64_t upper = dotd1q4_inner(a0 + bit_length, query, bit_length);
results[c] = (f32_t)(lower + (upper << 1));
}
}
Expand Down Expand Up @@ -927,8 +1037,37 @@ static inline void dotd4q4_inner_bulk(
const int32_t count,
f32_t* results
) {
const int lines_to_fetch = length / CACHE_LINE_SIZE + 1;
const int32_t bit_length = length / 4;
for (int c = 0; c < count; c++) {
int c = 0;

const int8_t* a0 = safe_mapper_offset<int8_t, 0, mapper>(a, pitch, offsets, count);

// Process one vector, after instructing the CPU to prefetch the next vector
for (; c + 1 < count; c++) {
const int8_t* next_a0 = a + mapper(c + 1, offsets) * pitch;

// prefetch stripes 2 and 3 now
prefetch(a0 + 2 * bit_length, lines_to_fetch);
prefetch(a0 + 3 * bit_length, lines_to_fetch);

int64_t p0 = dotd1q4_inner(a0, query, bit_length);
int64_t p1 = dotd1q4_inner(a0 + bit_length, query, bit_length);

// and 0 and 1 of the next vector
prefetch(next_a0, lines_to_fetch);
prefetch(next_a0 + bit_length, lines_to_fetch);

int64_t p2 = dotd1q4_inner(a0 + 2 * bit_length, query, bit_length);
int64_t p3 = dotd1q4_inner(a0 + 3 * bit_length, query, bit_length);

results[c] = (f32_t)(p0 + (p1 << 1) + (p2 << 2) + (p3 << 3));

a0 = next_a0;
}

// Tail-handling: remaining vector
for (; c < count; c++) {
const int8_t* a0 = a + mapper(c, offsets) * pitch;

int64_t p0 = dotd1q4_inner(a0 + 0 * bit_length, query, bit_length);
Expand Down
Loading