diff --git a/libs/native/libraries/build.gradle b/libs/native/libraries/build.gradle
index eb3b86d54d74d..b63b1982b016d 100644
--- a/libs/native/libraries/build.gradle
+++ b/libs/native/libraries/build.gradle
@@ -19,7 +19,7 @@ configurations {
 }
 
 var zstdVersion = "1.5.7"
-var vecVersion = "1.0.42"
+var vecVersion = "1.0.44"
 
 repositories {
   exclusiveContent {
diff --git a/libs/native/src/test/java/org/elasticsearch/nativeaccess/jdk/JDKVectorLibraryBBQTests.java b/libs/native/src/test/java/org/elasticsearch/nativeaccess/jdk/JDKVectorLibraryBBQTests.java
index e8b60a44b7a0d..0d6b30b9d037c 100644
--- a/libs/native/src/test/java/org/elasticsearch/nativeaccess/jdk/JDKVectorLibraryBBQTests.java
+++ b/libs/native/src/test/java/org/elasticsearch/nativeaccess/jdk/JDKVectorLibraryBBQTests.java
@@ -52,7 +52,6 @@ public static Iterable<Object[]> parametersFactory() {
         // remove all square distance (not implemented yet)
         baseParams.removeIf(os -> os[0] == VectorSimilarityFunctions.Function.SQUARE_DISTANCE);
 
-        // duplicate for int1 & int2
         return () -> Stream.of(VectorSimilarityFunctions.BBQType.values())
             .flatMap(bbq -> baseParams.stream().map(os -> CollectionUtils.concatLists(List.of(bbq), Arrays.asList(os))))
             .map(List::toArray)
diff --git a/libs/simdvec/native/publish_vec_binaries.sh b/libs/simdvec/native/publish_vec_binaries.sh
index 27cfbc2960e18..432ac31b24b93 100755
--- a/libs/simdvec/native/publish_vec_binaries.sh
+++ b/libs/simdvec/native/publish_vec_binaries.sh
@@ -20,7 +20,7 @@ if [ -z "$ARTIFACTORY_API_KEY" ]; then
   exit 1;
 fi
 
-VERSION="1.0.42"
+VERSION="1.0.44"
 ARTIFACTORY_REPOSITORY="${ARTIFACTORY_REPOSITORY:-https://artifactory.elastic.dev/artifactory/elasticsearch-native/}"
 TEMP=$(mktemp -d)
 
diff --git a/libs/simdvec/native/src/vec/c/amd64/vec_1.cpp b/libs/simdvec/native/src/vec/c/amd64/vec_1.cpp
index 85c239b3ba97b..4949cc22f5fe4 100644
--- a/libs/simdvec/native/src/vec/c/amd64/vec_1.cpp
+++ b/libs/simdvec/native/src/vec/c/amd64/vec_1.cpp
@@ -180,7 +180,44 @@ static inline void sqri7u_inner_bulk(
     const int32_t count,
     f32_t* results
 ) {
-    for (int c = 0; c < count; c++) {
+    const int blk = dims & ~(STRIDE_BYTES_LEN - 1);
+    const int lines_to_fetch = dims / CACHE_LINE_SIZE + 1;
+    int c = 0;
+
+    const int8_t* a0 = safe_mapper_offset<int8_t, 0, mapper>(a, pitch, offsets, count);
+    const int8_t* a1 = safe_mapper_offset<int8_t, 1, mapper>(a, pitch, offsets, count);
+
+    // Process a batch of 2 vectors at a time, after instructing the CPU to
+    // prefetch the next batch.
+    for (; c + 3 < count; c += 2) {
+        const int8_t* next_a0 = a + mapper(c + 2, offsets) * pitch;
+        const int8_t* next_a1 = a + mapper(c + 3, offsets) * pitch;
+
+        prefetch(next_a0, lines_to_fetch);
+        prefetch(next_a1, lines_to_fetch);
+
+        int32_t res0 = 0;
+        int32_t res1 = 0;
+        int i = 0;
+        if (dims > STRIDE_BYTES_LEN) {
+            i = blk;
+            res0 = sqri7u_inner(a0, b, i);
+            res1 = sqri7u_inner(a1, b, i);
+        }
+        for (; i < dims; i++) {
+            int32_t dist0 = a0[i] - b[i];
+            int32_t dist1 = a1[i] - b[i];
+            res0 += dist0 * dist0;
+            res1 += dist1 * dist1;
+        }
+        results[c + 0] = (f32_t)res0;
+        results[c + 1] = (f32_t)res1;
+        a0 = next_a0;
+        a1 = next_a1;
+    }
+
+    // Tail-handling: remaining vectors
+    for (; c < count; c++) {
         const int8_t* a0 = a + mapper(c, offsets) * pitch;
         results[c] = (f32_t)vec_sqri7u(a0, b, dims);
     }
@@ -346,7 +383,43 @@ static inline void doti8_inner_bulk(
     const int32_t count,
     f32_t* results
 ) {
-    for (int c=0; c<count; c++) {
+    const int blk = dims & ~(STRIDE_BYTES_LEN - 1);
+    const int lines_to_fetch = dims / CACHE_LINE_SIZE + 1;
+    int c = 0;
+
+    const int8_t* a0 = safe_mapper_offset<int8_t, 0, mapper>(a, pitch, offsets, count);
+    const int8_t* a1 = safe_mapper_offset<int8_t, 1, mapper>(a, pitch, offsets, count);
+
+    // Process a batch of 2 vectors at a time, after instructing the CPU to
+    // prefetch the next batch.
+    for (; c + 3 < count; c += 2) {
+        const int8_t* next_a0 = a + mapper(c + 2, offsets) * pitch;
+        const int8_t* next_a1 = a + mapper(c + 3, offsets) * pitch;
+
+        prefetch(next_a0, lines_to_fetch);
+        prefetch(next_a1, lines_to_fetch);
+
+        int32_t res0 = 0;
+        int32_t res1 = 0;
+        int i = 0;
+        if (dims > STRIDE_BYTES_LEN) {
+            i = blk;
+            res0 = doti8_inner(a0, b, i);
+            res1 = doti8_inner(a1, b, i);
+        }
+        for (; i < dims; i++) {
+            const int8_t bb = b[i];
+            res0 += a0[i] * bb;
+            res1 += a1[i] * bb;
+        }
+        results[c + 0] = (f32_t)res0;
+        results[c + 1] = (f32_t)res1;
+        a0 = next_a0;
+        a1 = next_a1;
+    }
+
+    // Tail-handling: remaining vectors
+    for (; c<count; c++) {
         const int8_t* a0 = a + mapper(c, offsets) * pitch;
         results[c] = vec_doti8(a0, b, dims);
     }
@@ -415,7 +488,44 @@ static inline void sqri8_inner_bulk(
     const int32_t count,
     f32_t* results
 ) {
-    for (int c=0; c<count; c++) {
+    const int blk = dims & ~(STRIDE_BYTES_LEN - 1);
+    const int lines_to_fetch = dims / CACHE_LINE_SIZE + 1;
+    int c = 0;
+
+    const int8_t* a0 = safe_mapper_offset<int8_t, 0, mapper>(a, pitch, offsets, count);
+    const int8_t* a1 = safe_mapper_offset<int8_t, 1, mapper>(a, pitch, offsets, count);
+
+    // Process a batch of 2 vectors at a time, after instructing the CPU to
+    // prefetch the next batch.
+    for (; c + 3 < count; c += 2) {
+        const int8_t* next_a0 = a + mapper(c + 2, offsets) * pitch;
+        const int8_t* next_a1 = a + mapper(c + 3, offsets) * pitch;
+
+        prefetch(next_a0, lines_to_fetch);
+        prefetch(next_a1, lines_to_fetch);
+
+        int32_t res0 = 0;
+        int32_t res1 = 0;
+        int i = 0;
+        if (dims > STRIDE_BYTES_LEN) {
+            i = blk;
+            res0 = sqri8_inner(a0, b, i);
+            res1 = sqri8_inner(a1, b, i);
+        }
+        for (; i < dims; i++) {
+            int32_t dist0 = a0[i] - b[i];
+            int32_t dist1 = a1[i] - b[i];
+            res0 += dist0 * dist0;
+            res1 += dist1 * dist1;
+        }
+        results[c + 0] = (f32_t)res0;
+        results[c + 1] = (f32_t)res1;
+        a0 = next_a0;
+        a1 = next_a1;
+    }
+
+    // Tail-handling: remaining vectors
+    for (; c<count; c++) {
         const int8_t* a0 = a + mapper(c, offsets) * pitch;
         results[c] = vec_sqri8(a0, b, dims);
     }
@@ -758,16 +868,6 @@ EXPORT int64_t vec_dotd1q4(
     return dotd1q4_inner(a_ptr, query_ptr, length);
 }
 
-EXPORT int64_t vec_dotd2q4(
-    const int8_t* a_ptr,
-    const int8_t* query_ptr,
-    const int32_t length
-) {
-    int64_t lower = dotd1q4_inner(a_ptr, query_ptr, length/2);
-    int64_t upper = dotd1q4_inner(a_ptr + length/2, query_ptr, length/2);
-    return lower + (upper << 1);
-}
-
 template <int64_t(*mapper)(const int32_t, const int32_t*)>
 static inline void dotd1q4_inner_bulk(
     const int8_t* a,
@@ -839,6 +939,16 @@ EXPORT void vec_dotd1q4_bulk_offsets(
     dotd1q4_inner_bulk<array_mapper>(a, query, length, pitch, offsets, count, results);
 }
 
+EXPORT int64_t vec_dotd2q4(
+    const int8_t* a_ptr,
+    const int8_t* query_ptr,
+    const int32_t length
+) {
+    int64_t lower = dotd1q4_inner(a_ptr, query_ptr, length/2);
+    int64_t upper = dotd1q4_inner(a_ptr + length/2, query_ptr, length/2);
+    return lower + (upper << 1);
+}
+
 template <int64_t(*mapper)(const int32_t, const int32_t*)>
 static inline void dotd2q4_inner_bulk(
     const int8_t* a,
@@ -856,23 +966,23 @@ static inline void dotd2q4_inner_bulk(
     const int8_t* a0 = safe_mapper_offset<int8_t, 0, mapper>(a, pitch, offsets, count);
     const int8_t* a1 = safe_mapper_offset<int8_t, 1, mapper>(a, pitch, offsets, count);
 
-    // Process a batch of 2 vectors at a time, after instructing the CPU to
-    // prefetch the next batch.
-    // Prefetching multiple memory locations while computing keeps the CPU
-    // execution units busy.
-    for (; c + 3 < count; c += 2) {
+    // Process 2 vectors at a time, after instructing the CPU to
+    // prefetch the next vectors (both stripes).
+    for (; c + 2 < count; c+=2) {
         const int8_t* next_a0 = a + mapper(c + 2, offsets) * pitch;
         const int8_t* next_a1 = a + mapper(c + 3, offsets) * pitch;
 
         prefetch(next_a0, lines_to_fetch);
+        prefetch(next_a0 + bit_length, lines_to_fetch);
         prefetch(next_a1, lines_to_fetch);
+        prefetch(next_a1 + bit_length, lines_to_fetch);
 
         int64_t lower0 = dotd1q4_inner(a0, query, bit_length);
-        int64_t lower1 = dotd1q4_inner(a1, query, bit_length);
         int64_t upper0 = dotd1q4_inner(a0 + bit_length, query, bit_length);
+        int64_t lower1 = dotd1q4_inner(a1, query, bit_length);
         int64_t upper1 = dotd1q4_inner(a1 + bit_length, query, bit_length);
 
-        results[c + 0] = (f32_t)(lower0 + (upper0 << 1));
+        results[c] = (f32_t)(lower0 + (upper0 << 1));
         results[c + 1] = (f32_t)(lower1 + (upper1 << 1));
 
         a0 = next_a0;
@@ -882,8 +992,8 @@ static inline void dotd2q4_inner_bulk(
     // Tail-handling: remaining vectors
     for (; c < count; c++) {
         const int8_t* a0 = a + mapper(c, offsets) * pitch;
-        int64_t lower = dotd1q4_inner(a0, query, length/2);
-        int64_t upper = dotd1q4_inner(a0 + length/2, query, length/2);
+        int64_t lower = dotd1q4_inner(a0, query, bit_length);
+        int64_t upper = dotd1q4_inner(a0 + bit_length, query, bit_length);
         results[c] = (f32_t)(lower + (upper << 1));
     }
 }
@@ -927,8 +1037,37 @@ static inline void dotd4q4_inner_bulk(
     const int32_t count,
     f32_t* results
 ) {
+    const int lines_to_fetch = length / CACHE_LINE_SIZE + 1;
     const int32_t bit_length = length / 4;
-    for (int c = 0; c < count; c++) {
+    int c = 0;
+
+    const int8_t* a0 = safe_mapper_offset<int8_t, 0, mapper>(a, pitch, offsets, count);
+
+    // Process one vector, after instructing the CPU to prefetch the next vector
+    for (; c + 1 < count; c++) {
+        const int8_t* next_a0 = a + mapper(c + 1, offsets) * pitch;
+
+        // prefetch stripes 2 and 3 now
+        prefetch(a0 + 2 * bit_length, lines_to_fetch);
+        prefetch(a0 + 3 * bit_length, lines_to_fetch);
+
+        int64_t p0 = dotd1q4_inner(a0, query, bit_length);
+        int64_t p1 = dotd1q4_inner(a0 + bit_length, query, bit_length);
+
+        // and 0 and 1 of the next vector
+        prefetch(next_a0, lines_to_fetch);
+        prefetch(next_a0 + bit_length, lines_to_fetch);
+
+        int64_t p2 = dotd1q4_inner(a0 + 2 * bit_length, query, bit_length);
+        int64_t p3 = dotd1q4_inner(a0 + 3 * bit_length, query, bit_length);
+
+        results[c] = (f32_t)(p0 + (p1 << 1) + (p2 << 2) + (p3 << 3));
+
+        a0 = next_a0;
+    }
+
+    // Tail-handling: remaining vector
+    for (; c < count; c++) {
         const int8_t* a0 = a + mapper(c, offsets) * pitch;
 
         int64_t p0 = dotd1q4_inner(a0 + 0 * bit_length, query, bit_length);
diff --git a/libs/simdvec/native/src/vec/c/amd64/vec_2.cpp b/libs/simdvec/native/src/vec/c/amd64/vec_2.cpp
index 4685ecac4bca0..6e7c8f1a84b68 100644
--- a/libs/simdvec/native/src/vec/c/amd64/vec_2.cpp
+++ b/libs/simdvec/native/src/vec/c/amd64/vec_2.cpp
@@ -306,86 +306,8 @@ static inline void sqr7u_inner_bulk(
     const int32_t count,
     f32_t* results
 ) {
-    for (int c = 0; c < count; c++) {
-        const int8_t* a0 = a + mapper(c, offsets) * pitch;
-        results[c] = (f32_t)vec_sqr7u_2(a0, b, dims);
-    }
-}
-
-EXPORT void vec_sqr7u_bulk_2(const int8_t* a, const int8_t* b, const int32_t dims, const int32_t count, f32_t* results) {
-    sqr7u_inner_bulk<identity_mapper>(a, b, dims, dims, NULL, count, results);
-}
-
-EXPORT void vec_sqr7u_bulk_offsets_2(
-    const int8_t* a,
-    const int8_t* b,
-    const int32_t dims,
-    const int32_t pitch,
-    const int32_t* offsets,
-    const int32_t count,
-    f32_t* results) {
-    sqr7u_inner_bulk<array_mapper>(a, b, dims, pitch, offsets, count, results);
-}
-
-static inline __m512i dot_bit_512(const __m512i a, const int8_t* b) {
-    const __m512i q0 = _mm512_loadu_si512((const __m512i *)b);
-    return _mm512_popcnt_epi64(_mm512_and_si512(q0, a));
-}
-
-static inline int64_t dotd1q4_inner(const int8_t* a, const int8_t* query, const int32_t length) {
-    int r = 0;
-
-    // Init accumulator(s) with 0
-    __m512i acc0 = _mm512_setzero_si512();
-    __m512i acc1 = _mm512_setzero_si512();
-    __m512i acc2 = _mm512_setzero_si512();
-    __m512i acc3 = _mm512_setzero_si512();
-
-    int upperBound = length & ~(STRIDE_BYTES_LEN - 1);
-    for (; r < upperBound; r += STRIDE_BYTES_LEN) {
-        const __m512i value = _mm512_loadu_si512((const __m512i*)(a + r));
-
-        acc0 = _mm512_add_epi64(acc0, dot_bit_512(value, query + r));
-        acc1 = _mm512_add_epi64(acc1, dot_bit_512(value, query + r + length));
-        acc2 = _mm512_add_epi64(acc2, dot_bit_512(value, query + r + 2 * length));
-        acc3 = _mm512_add_epi64(acc3, dot_bit_512(value, query + r + 3 * length));
-    }
-
-    int64_t subRet0 = _mm512_reduce_add_epi64(acc0);
-    int64_t subRet1 = _mm512_reduce_add_epi64(acc1);
-    int64_t subRet2 = _mm512_reduce_add_epi64(acc2);
-    int64_t subRet3 = _mm512_reduce_add_epi64(acc3);
-
-    for (; r < length; r++) {
-        int8_t value = *(a + r);
-        int8_t q0 = *(query + r);
-        subRet0 += __builtin_popcount(q0 & value & 0xFF);
-        int8_t q1 = *(query + r + length);
-        subRet1 += __builtin_popcount(q1 & value & 0xFF);
-        int8_t q2 = *(query + r + 2 * length);
-        subRet2 += __builtin_popcount(q2 & value & 0xFF);
-        int8_t q3 = *(query + r + 3 * length);
-        subRet3 += __builtin_popcount(q3 & value & 0xFF);
-    }
-
-    return subRet0 + (subRet1 << 1) + (subRet2 << 2) + (subRet3 << 3);
-}
-
-EXPORT int64_t vec_dotd1q4_2(const int8_t* a, const int8_t* query, const int32_t length) {
-    return dotd1q4_inner(a, query, length);
-}
-
-template <int64_t(*mapper)(const int32_t, const int32_t*)>
-static inline void dotd1q4_inner_bulk(
-    const int8_t* a,
-    const int8_t* query,
-    const int32_t length,
-    const int32_t pitch,
-    const int32_t* offsets,
-    const int32_t count,
-    f32_t* results
-) {
-    const int lines_to_fetch = length / CACHE_LINE_SIZE + 1;
+    const int blk = dims & ~(STRIDE_BYTES_LEN - 1);
+    const int lines_to_fetch = dims / CACHE_LINE_SIZE + 1;
     int c = 0;
 
     const int8_t* a0 = safe_mapper_offset<int8_t, 0, mapper>(a, pitch, offsets, count);
@@ -408,11 +330,34 @@ static inline void dotd1q4_inner_bulk(
         prefetch(next_a2, lines_to_fetch);
         prefetch(next_a3, lines_to_fetch);
 
-        results[c + 0] = (f32_t)dotd1q4_inner(a0, query, length);
-        results[c + 1] = (f32_t)dotd1q4_inner(a1, query, length);
-        results[c + 2] = (f32_t)dotd1q4_inner(a2, query, length);
-        results[c + 3] = (f32_t)dotd1q4_inner(a3, query, length);
-
+        int32_t res0 = 0;
+        int32_t res1 = 0;
+        int32_t res2 = 0;
+        int32_t res3 = 0;
+        int i = 0;
+        if (dims > STRIDE_BYTES_LEN) {
+            i = blk;
+            res0 = sqr7u_inner_avx512(a0, b, i);
+            res1 = sqr7u_inner_avx512(a1, b, i);
+            res2 = sqr7u_inner_avx512(a2, b, i);
+            res3 = sqr7u_inner_avx512(a3, b, i);
+        }
+        for (; i < dims; i++) {
+            const int8_t bb = b[i];
+            int32_t dist0 = a0[i] - bb;
+            int32_t dist1 = a0[i] - bb;
+            int32_t dist2 = a0[i] - bb;
+            int32_t dist3 = a0[i] - bb;
+
+            res0 += dist0 * dist0;
+            res1 += dist1 * dist1;
+            res2 += dist2 * dist2;
+            res3 += dist3 * dist3;
+        }
+        results[c + 0] = (f32_t)res0;
+        results[c + 1] = (f32_t)res1;
+        results[c + 2] = (f32_t)res2;
+        results[c + 3] = (f32_t)res3;
         a0 = next_a0;
         a1 = next_a1;
         a2 = next_a2;
@@ -422,28 +367,23 @@ static inline void dotd1q4_inner_bulk(
     // Tail-handling: remaining vectors
     for (; c < count; c++) {
         const int8_t* a0 = a + mapper(c, offsets) * pitch;
-        results[c] = (f32_t)dotd1q4_inner(a0, query, length);
+        results[c] = (f32_t)vec_sqr7u_2(a0, b, dims);
     }
 }
 
-EXPORT void vec_dotd1q4_bulk_2(
-    const int8_t* a,
-    const int8_t* query,
-    const int32_t length,
-    const int32_t count,
-    f32_t* results) {
-    dotd1q4_inner_bulk<identity_mapper>(a, query, length, length, NULL, count, results);
+EXPORT void vec_sqr7u_bulk_2(const int8_t* a, const int8_t* b, const int32_t dims, const int32_t count, f32_t* results) {
+    sqr7u_inner_bulk<identity_mapper>(a, b, dims, dims, NULL, count, results);
 }
 
-EXPORT void vec_dotd1q4_bulk_offsets_2(
+EXPORT void vec_sqr7u_bulk_offsets_2(
     const int8_t* a,
-    const int8_t* query,
-    const int32_t length,
+    const int8_t* b,
+    const int32_t dims,
     const int32_t pitch,
     const int32_t* offsets,
     const int32_t count,
     f32_t* results) {
-    dotd1q4_inner_bulk<array_mapper>(a, query, length, pitch, offsets, count, results);
+    sqr7u_inner_bulk<array_mapper>(a, b, dims, pitch, offsets, count, results);
 }
 
 #ifdef __clang__