Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion faiss/impl/LocalSearchQuantizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -628,7 +628,9 @@ void LocalSearchQuantizer::icm_encode_step(
{
size_t binary_idx = (other_m + 1) * M * K * K +
m * K * K + code2 * K + code;
_mm_prefetch(binaries + binary_idx, _MM_HINT_T0);
_mm_prefetch(
(const char*)(binaries + binary_idx),
_MM_HINT_T0);
}
}
#endif
Expand Down
9 changes: 9 additions & 0 deletions faiss/impl/platform_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,13 @@

#include <intrin.h>

#ifndef __clang__
inline int __builtin_ctzll(uint64_t x) {
unsigned long ret;
_BitScanForward64(&ret, x);
return (int)ret;
}
#endif

// cudatoolkit provides __builtin_ctz for NVCC >= 11.0
#if !defined(__CUDACC__) || __CUDACC_VER_MAJOR__ < 11
Expand All @@ -55,13 +57,20 @@ inline int __builtin_ctz(unsigned long x) {
}
#endif

#ifndef __clang__
inline int __builtin_clzll(uint64_t x) {
return (int)__lzcnt64(x);
}
#endif

#define __builtin_popcount __popcnt
#define __builtin_popcountl __popcnt64

#ifndef __clang__
#define __m128i_u __m128i
#define __m256i_u __m256i
#endif

// MSVC does not define __SSEx__, and _M_IX86_FP is only defined on 32-bit
// processors cf.
// https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
Expand Down
8 changes: 4 additions & 4 deletions faiss/utils/distances.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -417,8 +417,8 @@ void exhaustive_L2sqr_blas_cmax_avx2(
for (int64_t i = i0; i < i1; i++) {
float* ip_line = ip_block.get() + (i - i0) * (j1 - j0);

_mm_prefetch(ip_line, _MM_HINT_NTA);
_mm_prefetch(ip_line + 16, _MM_HINT_NTA);
_mm_prefetch((const char*)ip_line, _MM_HINT_NTA);
_mm_prefetch((const char*)(ip_line + 16), _MM_HINT_NTA);

// constant
const __m256 mul_minus2 = _mm256_set1_ps(-2);
Expand All @@ -445,8 +445,8 @@ void exhaustive_L2sqr_blas_cmax_avx2(

// process 16 elements per loop
for (; idx_j < (count / 16) * 16; idx_j += 16, ip_line += 16) {
_mm_prefetch(ip_line + 32, _MM_HINT_NTA);
_mm_prefetch(ip_line + 48, _MM_HINT_NTA);
_mm_prefetch((const char*)(ip_line + 32), _MM_HINT_NTA);
_mm_prefetch((const char*)(ip_line + 48), _MM_HINT_NTA);

// load values for norms
const __m256 y_norm_0 =
Expand Down
2 changes: 1 addition & 1 deletion faiss/utils/distances_fused/simdlib_based.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ void kernel(

// prefetch the next point
#if defined(__AVX2__)
_mm_prefetch(xd_0 + DIM * sizeof(float), _MM_HINT_NTA);
_mm_prefetch((const char*)(xd_0 + DIM * sizeof(float)), _MM_HINT_NTA);
#endif

// load a single point from x
Expand Down
18 changes: 9 additions & 9 deletions faiss/utils/distances_simd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -439,14 +439,14 @@ void fvec_op_ny_D2<ElementOpIP>(

if (ny8 > 0) {
// process 8 D2-vectors per loop.
_mm_prefetch(y, _MM_HINT_T0);
_mm_prefetch(y + 16, _MM_HINT_T0);
_mm_prefetch((const char*)y, _MM_HINT_T0);
_mm_prefetch((const char*)(y + 16), _MM_HINT_T0);

const __m256 m0 = _mm256_set1_ps(x[0]);
const __m256 m1 = _mm256_set1_ps(x[1]);

for (i = 0; i < ny8 * 8; i += 8) {
_mm_prefetch(y + 32, _MM_HINT_T0);
_mm_prefetch((const char*)(y + 32), _MM_HINT_T0);

// load 8x2 matrix and transpose it in registers.
// the typical bottleneck is memory access, so
Expand Down Expand Up @@ -496,14 +496,14 @@ void fvec_op_ny_D2<ElementOpL2>(

if (ny8 > 0) {
// process 8 D2-vectors per loop.
_mm_prefetch(y, _MM_HINT_T0);
_mm_prefetch(y + 16, _MM_HINT_T0);
_mm_prefetch((const char*)y, _MM_HINT_T0);
_mm_prefetch((const char*)(y + 16), _MM_HINT_T0);

const __m256 m0 = _mm256_set1_ps(x[0]);
const __m256 m1 = _mm256_set1_ps(x[1]);

for (i = 0; i < ny8 * 8; i += 8) {
_mm_prefetch(y + 32, _MM_HINT_T0);
_mm_prefetch((const char*)(y + 32), _MM_HINT_T0);

// load 8x2 matrix and transpose it in registers.
// the typical bottleneck is memory access, so
Expand Down Expand Up @@ -1084,8 +1084,8 @@ size_t fvec_L2sqr_ny_nearest_D2(
// process 8 D2-vectors per loop.
const size_t ny8 = ny / 8;
if (ny8 > 0) {
_mm_prefetch(y, _MM_HINT_T0);
_mm_prefetch(y + 16, _MM_HINT_T0);
_mm_prefetch((const char*)y, _MM_HINT_T0);
_mm_prefetch((const char*)(y + 16), _MM_HINT_T0);

// track min distance and the closest vector independently
// for each of 8 AVX2 components.
Expand All @@ -1100,7 +1100,7 @@ size_t fvec_L2sqr_ny_nearest_D2(
const __m256 m1 = _mm256_set1_ps(x[1]);

for (; i < ny8 * 8; i += 8) {
_mm_prefetch(y + 32, _MM_HINT_T0);
_mm_prefetch((const char*)(y + 32), _MM_HINT_T0);

__m256 v0;
__m256 v1;
Expand Down