diff --git a/libs/native/libraries/build.gradle b/libs/native/libraries/build.gradle index eb3b86d54d74d..b63b1982b016d 100644 --- a/libs/native/libraries/build.gradle +++ b/libs/native/libraries/build.gradle @@ -19,7 +19,7 @@ configurations { } var zstdVersion = "1.5.7" -var vecVersion = "1.0.42" +var vecVersion = "1.0.44" repositories { exclusiveContent { diff --git a/libs/native/src/test/java/org/elasticsearch/nativeaccess/jdk/JDKVectorLibraryBBQTests.java b/libs/native/src/test/java/org/elasticsearch/nativeaccess/jdk/JDKVectorLibraryBBQTests.java index e8b60a44b7a0d..0d6b30b9d037c 100644 --- a/libs/native/src/test/java/org/elasticsearch/nativeaccess/jdk/JDKVectorLibraryBBQTests.java +++ b/libs/native/src/test/java/org/elasticsearch/nativeaccess/jdk/JDKVectorLibraryBBQTests.java @@ -52,7 +52,6 @@ public static Iterable parametersFactory() { // remove all square distance (not implemented yet) baseParams.removeIf(os -> os[0] == VectorSimilarityFunctions.Function.SQUARE_DISTANCE); - // duplicate for int1 & int2 return () -> Stream.of(VectorSimilarityFunctions.BBQType.values()) .flatMap(bbq -> baseParams.stream().map(os -> CollectionUtils.concatLists(List.of(bbq), Arrays.asList(os)))) .map(List::toArray) diff --git a/libs/simdvec/native/publish_vec_binaries.sh b/libs/simdvec/native/publish_vec_binaries.sh index 27cfbc2960e18..432ac31b24b93 100755 --- a/libs/simdvec/native/publish_vec_binaries.sh +++ b/libs/simdvec/native/publish_vec_binaries.sh @@ -20,7 +20,7 @@ if [ -z "$ARTIFACTORY_API_KEY" ]; then exit 1; fi -VERSION="1.0.42" +VERSION="1.0.44" ARTIFACTORY_REPOSITORY="${ARTIFACTORY_REPOSITORY:-https://artifactory.elastic.dev/artifactory/elasticsearch-native/}" TEMP=$(mktemp -d) diff --git a/libs/simdvec/native/src/vec/c/amd64/vec_1.cpp b/libs/simdvec/native/src/vec/c/amd64/vec_1.cpp index 85c239b3ba97b..4949cc22f5fe4 100644 --- a/libs/simdvec/native/src/vec/c/amd64/vec_1.cpp +++ b/libs/simdvec/native/src/vec/c/amd64/vec_1.cpp @@ -180,7 +180,44 @@ static inline void sqri7u_inner_bulk( const int32_t count, f32_t* results ) { - for (int c = 0; c < count; c++) { + const int blk = dims & ~(STRIDE_BYTES_LEN - 1); + const int lines_to_fetch = dims / CACHE_LINE_SIZE + 1; + int c = 0; + + const int8_t* a0 = safe_mapper_offset(a, pitch, offsets, count); + const int8_t* a1 = safe_mapper_offset(a, pitch, offsets, count); + + // Process a batch of 2 vectors at a time, after instructing the CPU to + // prefetch the next batch. + for (; c + 3 < count; c += 2) { + const int8_t* next_a0 = a + mapper(c + 2, offsets) * pitch; + const int8_t* next_a1 = a + mapper(c + 3, offsets) * pitch; + + prefetch(next_a0, lines_to_fetch); + prefetch(next_a1, lines_to_fetch); + + int32_t res0 = 0; + int32_t res1 = 0; + int i = 0; + if (dims > STRIDE_BYTES_LEN) { + i = blk; + res0 = sqri7u_inner(a0, b, i); + res1 = sqri7u_inner(a1, b, i); + } + for (; i < dims; i++) { + int32_t dist0 = a0[i] - b[i]; + int32_t dist1 = a1[i] - b[i]; + res0 += dist0 * dist0; + res1 += dist1 * dist1; + } + results[c + 0] = (f32_t)res0; + results[c + 1] = (f32_t)res1; + a0 = next_a0; + a1 = next_a1; + } + + // Tail-handling: remaining vectors + for (; c < count; c++) { const int8_t* a0 = a + mapper(c, offsets) * pitch; results[c] = (f32_t)vec_sqri7u(a0, b, dims); } @@ -346,7 +383,43 @@ static inline void doti8_inner_bulk( const int32_t count, f32_t* results ) { - for (int c=0; c(a, pitch, offsets, count); + const int8_t* a1 = safe_mapper_offset(a, pitch, offsets, count); + + // Process a batch of 2 vectors at a time, after instructing the CPU to + // prefetch the next batch. + for (; c + 3 < count; c += 2) { + const int8_t* next_a0 = a + mapper(c + 2, offsets) * pitch; + const int8_t* next_a1 = a + mapper(c + 3, offsets) * pitch; + + prefetch(next_a0, lines_to_fetch); + prefetch(next_a1, lines_to_fetch); + + int32_t res0 = 0; + int32_t res1 = 0; + int i = 0; + if (dims > STRIDE_BYTES_LEN) { + i = blk; + res0 = doti8_inner(a0, b, i); + res1 = doti8_inner(a1, b, i); + } + for (; i < dims; i++) { + const int8_t bb = b[i]; + res0 += a0[i] * bb; + res1 += a1[i] * bb; + } + results[c + 0] = (f32_t)res0; + results[c + 1] = (f32_t)res1; + a0 = next_a0; + a1 = next_a1; + } + + // Tail-handling: remaining vectors + for (; c(a, pitch, offsets, count); + const int8_t* a1 = safe_mapper_offset(a, pitch, offsets, count); + + // Process a batch of 2 vectors at a time, after instructing the CPU to + // prefetch the next batch. + for (; c + 3 < count; c += 2) { + const int8_t* next_a0 = a + mapper(c + 2, offsets) * pitch; + const int8_t* next_a1 = a + mapper(c + 3, offsets) * pitch; + + prefetch(next_a0, lines_to_fetch); + prefetch(next_a1, lines_to_fetch); + + int32_t res0 = 0; + int32_t res1 = 0; + int i = 0; + if (dims > STRIDE_BYTES_LEN) { + i = blk; + res0 = sqri8_inner(a0, b, i); + res1 = sqri8_inner(a1, b, i); + } + for (; i < dims; i++) { + int32_t dist0 = a0[i] - b[i]; + int32_t dist1 = a1[i] - b[i]; + res0 += dist0 * dist0; + res1 += dist1 * dist1; + } + results[c + 0] = (f32_t)res0; + results[c + 1] = (f32_t)res1; + a0 = next_a0; + a1 = next_a1; + } + + // Tail-handling: remaining vectors + for (; c static inline void dotd1q4_inner_bulk( const int8_t* a, @@ -839,6 +939,16 @@ EXPORT void vec_dotd1q4_bulk_offsets( dotd1q4_inner_bulk(a, query, length, pitch, offsets, count, results); } +EXPORT int64_t vec_dotd2q4( + const int8_t* a_ptr, + const int8_t* query_ptr, + const int32_t length +) { + int64_t lower = dotd1q4_inner(a_ptr, query_ptr, length/2); + int64_t upper = dotd1q4_inner(a_ptr + length/2, query_ptr, length/2); + return lower + (upper << 1); +} + template static inline void dotd2q4_inner_bulk( const int8_t* a, @@ -856,23 +966,23 @@ static inline void dotd2q4_inner_bulk( const int8_t* a0 = safe_mapper_offset(a, pitch, offsets, count); const int8_t* a1 = safe_mapper_offset(a, pitch, offsets, count); - // Process a batch of 2 vectors at a time, after instructing the CPU to - // prefetch the next batch. - // Prefetching multiple memory locations while computing keeps the CPU - // execution units busy. - for (; c + 3 < count; c += 2) { + // Process 2 vectors at a time, after instructing the CPU to + // prefetch the next vectors (both stripes). + for (; c + 2 < count; c+=2) { const int8_t* next_a0 = a + mapper(c + 2, offsets) * pitch; const int8_t* next_a1 = a + mapper(c + 3, offsets) * pitch; prefetch(next_a0, lines_to_fetch); + prefetch(next_a0 + bit_length, lines_to_fetch); prefetch(next_a1, lines_to_fetch); + prefetch(next_a1 + bit_length, lines_to_fetch); int64_t lower0 = dotd1q4_inner(a0, query, bit_length); - int64_t lower1 = dotd1q4_inner(a1, query, bit_length); int64_t upper0 = dotd1q4_inner(a0 + bit_length, query, bit_length); + int64_t lower1 = dotd1q4_inner(a1, query, bit_length); int64_t upper1 = dotd1q4_inner(a1 + bit_length, query, bit_length); - results[c + 0] = (f32_t)(lower0 + (upper0 << 1)); + results[c] = (f32_t)(lower0 + (upper0 << 1)); results[c + 1] = (f32_t)(lower1 + (upper1 << 1)); a0 = next_a0; @@ -882,8 +992,8 @@ static inline void dotd2q4_inner_bulk( // Tail-handling: remaining vectors for (; c < count; c++) { const int8_t* a0 = a + mapper(c, offsets) * pitch; - int64_t lower = dotd1q4_inner(a0, query, length/2); - int64_t upper = dotd1q4_inner(a0 + length/2, query, length/2); + int64_t lower = dotd1q4_inner(a0, query, bit_length); + int64_t upper = dotd1q4_inner(a0 + bit_length, query, bit_length); results[c] = (f32_t)(lower + (upper << 1)); } } @@ -927,8 +1037,37 @@ static inline void dotd4q4_inner_bulk( const int32_t count, f32_t* results ) { + const int lines_to_fetch = length / CACHE_LINE_SIZE + 1; const int32_t bit_length = length / 4; - for (int c = 0; c < count; c++) { + int c = 0; + + const int8_t* a0 = safe_mapper_offset(a, pitch, offsets, count); + + // Process one vector, after instructing the CPU to prefetch the next vector + for (; c + 1 < count; c++) { + const int8_t* next_a0 = a + mapper(c + 1, offsets) * pitch; + + // prefetch stripes 2 and 3 now + prefetch(a0 + 2 * bit_length, lines_to_fetch); + prefetch(a0 + 3 * bit_length, lines_to_fetch); + + int64_t p0 = dotd1q4_inner(a0, query, bit_length); + int64_t p1 = dotd1q4_inner(a0 + bit_length, query, bit_length); + + // and 0 and 1 of the next vector + prefetch(next_a0, lines_to_fetch); + prefetch(next_a0 + bit_length, lines_to_fetch); + + int64_t p2 = dotd1q4_inner(a0 + 2 * bit_length, query, bit_length); + int64_t p3 = dotd1q4_inner(a0 + 3 * bit_length, query, bit_length); + + results[c] = (f32_t)(p0 + (p1 << 1) + (p2 << 2) + (p3 << 3)); + + a0 = next_a0; + } + + // Tail-handling: remaining vector + for (; c < count; c++) { const int8_t* a0 = a + mapper(c, offsets) * pitch; int64_t p0 = dotd1q4_inner(a0 + 0 * bit_length, query, bit_length); diff --git a/libs/simdvec/native/src/vec/c/amd64/vec_2.cpp b/libs/simdvec/native/src/vec/c/amd64/vec_2.cpp index 4685ecac4bca0..6e7c8f1a84b68 100644 --- a/libs/simdvec/native/src/vec/c/amd64/vec_2.cpp +++ b/libs/simdvec/native/src/vec/c/amd64/vec_2.cpp @@ -306,86 +306,8 @@ static inline void sqr7u_inner_bulk( const int32_t count, f32_t* results ) { - for (int c = 0; c < count; c++) { - const int8_t* a0 = a + mapper(c, offsets) * pitch; - results[c] = (f32_t)vec_sqr7u_2(a0, b, dims); - } -} - -EXPORT void vec_sqr7u_bulk_2(const int8_t* a, const int8_t* b, const int32_t dims, const int32_t count, f32_t* results) { - sqr7u_inner_bulk(a, b, dims, dims, NULL, count, results); -} - -EXPORT void vec_sqr7u_bulk_offsets_2( - const int8_t* a, - const int8_t* b, - const int32_t dims, - const int32_t pitch, - const int32_t* offsets, - const int32_t count, - f32_t* results) { - sqr7u_inner_bulk(a, b, dims, pitch, offsets, count, results); -} - -static inline __m512i dot_bit_512(const __m512i a, const int8_t* b) { - const __m512i q0 = _mm512_loadu_si512((const __m512i *)b); - return _mm512_popcnt_epi64(_mm512_and_si512(q0, a)); -} - -static inline int64_t dotd1q4_inner(const int8_t* a, const int8_t* query, const int32_t length) { - int r = 0; - - // Init accumulator(s) with 0 - __m512i acc0 = _mm512_setzero_si512(); - __m512i acc1 = _mm512_setzero_si512(); - __m512i acc2 = _mm512_setzero_si512(); - __m512i acc3 = _mm512_setzero_si512(); - - int upperBound = length & ~(STRIDE_BYTES_LEN - 1); - for (; r < upperBound; r += STRIDE_BYTES_LEN) { - const __m512i value = _mm512_loadu_si512((const __m512i*)(a + r)); - - acc0 = _mm512_add_epi64(acc0, dot_bit_512(value, query + r)); - acc1 = _mm512_add_epi64(acc1, dot_bit_512(value, query + r + length)); - acc2 = _mm512_add_epi64(acc2, dot_bit_512(value, query + r + 2 * length)); - acc3 = _mm512_add_epi64(acc3, dot_bit_512(value, query + r + 3 * length)); - } - - int64_t subRet0 = _mm512_reduce_add_epi64(acc0); - int64_t subRet1 = _mm512_reduce_add_epi64(acc1); - int64_t subRet2 = _mm512_reduce_add_epi64(acc2); - int64_t subRet3 = _mm512_reduce_add_epi64(acc3); - - for (; r < length; r++) { - int8_t value = *(a + r); - int8_t q0 = *(query + r); - subRet0 += __builtin_popcount(q0 & value & 0xFF); - int8_t q1 = *(query + r + length); - subRet1 += __builtin_popcount(q1 & value & 0xFF); - int8_t q2 = *(query + r + 2 * length); - subRet2 += __builtin_popcount(q2 & value & 0xFF); - int8_t q3 = *(query + r + 3 * length); - subRet3 += __builtin_popcount(q3 & value & 0xFF); - } - - return subRet0 + (subRet1 << 1) + (subRet2 << 2) + (subRet3 << 3); -} - -EXPORT int64_t vec_dotd1q4_2(const int8_t* a, const int8_t* query, const int32_t length) { - return dotd1q4_inner(a, query, length); -} - -template -static inline void dotd1q4_inner_bulk( - const int8_t* a, - const int8_t* query, - const int32_t length, - const int32_t pitch, - const int32_t* offsets, - const int32_t count, - f32_t* results -) { - const int lines_to_fetch = length / CACHE_LINE_SIZE + 1; + const int blk = dims & ~(STRIDE_BYTES_LEN - 1); + const int lines_to_fetch = dims / CACHE_LINE_SIZE + 1; int c = 0; const int8_t* a0 = safe_mapper_offset(a, pitch, offsets, count); @@ -408,11 +330,34 @@ static inline void dotd1q4_inner_bulk( prefetch(next_a2, lines_to_fetch); prefetch(next_a3, lines_to_fetch); - results[c + 0] = (f32_t)dotd1q4_inner(a0, query, length); - results[c + 1] = (f32_t)dotd1q4_inner(a1, query, length); - results[c + 2] = (f32_t)dotd1q4_inner(a2, query, length); - results[c + 3] = (f32_t)dotd1q4_inner(a3, query, length); - + int32_t res0 = 0; + int32_t res1 = 0; + int32_t res2 = 0; + int32_t res3 = 0; + int i = 0; + if (dims > STRIDE_BYTES_LEN) { + i = blk; + res0 = sqr7u_inner_avx512(a0, b, i); + res1 = sqr7u_inner_avx512(a1, b, i); + res2 = sqr7u_inner_avx512(a2, b, i); + res3 = sqr7u_inner_avx512(a3, b, i); + } + for (; i < dims; i++) { + const int8_t bb = b[i]; + int32_t dist0 = a0[i] - bb; + int32_t dist1 = a0[i] - bb; + int32_t dist2 = a0[i] - bb; + int32_t dist3 = a0[i] - bb; + + res0 += dist0 * dist0; + res1 += dist1 * dist1; + res2 += dist2 * dist2; + res3 += dist3 * dist3; + } + results[c + 0] = (f32_t)res0; + results[c + 1] = (f32_t)res1; + results[c + 2] = (f32_t)res2; + results[c + 3] = (f32_t)res3; a0 = next_a0; a1 = next_a1; a2 = next_a2; @@ -422,28 +367,23 @@ static inline void dotd1q4_inner_bulk( // Tail-handling: remaining vectors for (; c < count; c++) { const int8_t* a0 = a + mapper(c, offsets) * pitch; - results[c] = (f32_t)dotd1q4_inner(a0, query, length); + results[c] = (f32_t)vec_sqr7u_2(a0, b, dims); } } -EXPORT void vec_dotd1q4_bulk_2( - const int8_t* a, - const int8_t* query, - const int32_t length, - const int32_t count, - f32_t* results) { - dotd1q4_inner_bulk(a, query, length, length, NULL, count, results); +EXPORT void vec_sqr7u_bulk_2(const int8_t* a, const int8_t* b, const int32_t dims, const int32_t count, f32_t* results) { + sqr7u_inner_bulk(a, b, dims, dims, NULL, count, results); } -EXPORT void vec_dotd1q4_bulk_offsets_2( +EXPORT void vec_sqr7u_bulk_offsets_2( const int8_t* a, - const int8_t* query, - const int32_t length, + const int8_t* b, + const int32_t dims, const int32_t pitch, const int32_t* offsets, const int32_t count, f32_t* results) { - dotd1q4_inner_bulk(a, query, length, pitch, offsets, count, results); + sqr7u_inner_bulk(a, b, dims, pitch, offsets, count, results); } #ifdef __clang__