Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion faiss/gpu/impl/DistanceUtils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ __global__ void incrementIndex(
int k,
idx_t increment) {
for (idx_t i = blockIdx.y; i < indices.getSize(0); i += gridDim.y) {
for (int j = threadIdx.x; j < k; j += blockDim.x) {
for (auto j = threadIdx.x; j < k; j += blockDim.x) {
indices[i][idx_t(blockIdx.x) * k + j] += blockIdx.x * increment;
}
}
Expand Down
6 changes: 3 additions & 3 deletions faiss/gpu/impl/GpuScalarQuantizer.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,7 @@ struct Codec<ScalarQuantizer::QuantizerType::QT_8bit, DimMultiple> {
smemVmin = smem;
smemVdiff = smem + dim;

for (int i = threadIdx.x; i < dim; i += blockDim.x) {
for (auto i = threadIdx.x; i < dim; i += blockDim.x) {
// We are performing vmin + vdiff * (v + 0.5) / (2^bits - 1)
// This can be simplified to vmin' + vdiff' * v where:
// vdiff' = vdiff / (2^bits - 1)
Expand Down Expand Up @@ -587,7 +587,7 @@ struct Codec<ScalarQuantizer::QuantizerType::QT_6bit, 1> {
smemVmin = smem;
smemVdiff = smem + dim;

for (int i = threadIdx.x; i < dim; i += blockDim.x) {
for (auto i = threadIdx.x; i < dim; i += blockDim.x) {
// We are performing vmin + vdiff * (v + 0.5) / (2^bits - 1)
// This can be simplified to vmin' + vdiff' * v where:
// vdiff' = vdiff / (2^bits - 1)
Expand Down Expand Up @@ -753,7 +753,7 @@ struct Codec<ScalarQuantizer::QuantizerType::QT_4bit, 1> {
smemVmin = smem;
smemVdiff = smem + dim;

for (int i = threadIdx.x; i < dim; i += blockDim.x) {
for (auto i = threadIdx.x; i < dim; i += blockDim.x) {
// We are performing vmin + vdiff * (v + 0.5) / (2^bits - 1)
// This can be simplified to vmin' + vdiff' * v where:
// vdiff' = vdiff / (2^bits - 1)
Expand Down
6 changes: 3 additions & 3 deletions faiss/gpu/impl/IVFAppend.cu
Original file line number Diff line number Diff line change
Expand Up @@ -368,9 +368,9 @@ __global__ void ivfInterleavedAppend(
// The set of addresses for each of the lists
void** listData) {
// FIXME: some issue with getLaneId() and CUDA 10.1 and P4 GPUs?
int laneId = threadIdx.x % kWarpSize;
int warpId = threadIdx.x / kWarpSize;
int warpsPerBlock = blockDim.x / kWarpSize;
auto laneId = threadIdx.x % kWarpSize;
auto warpId = threadIdx.x / kWarpSize;
auto warpsPerBlock = blockDim.x / kWarpSize;

// Each block is dedicated to a separate list
idx_t listId = uniqueLists[blockIdx.x];
Expand Down
4 changes: 2 additions & 2 deletions faiss/gpu/impl/IVFFlatScan.cu
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,9 @@ struct IVFFlatScan {
int limit = utils::divDown(dim, Codec::kDimPerIter);

// Each warp handles a separate chunk of vectors
int warpId = threadIdx.x / kWarpSize;
auto warpId = threadIdx.x / kWarpSize;
// FIXME: why does getLaneId() not work when we write out below!?!?!
int laneId = threadIdx.x % kWarpSize; // getLaneId();
auto laneId = threadIdx.x % kWarpSize; // getLaneId();

// Divide the set of vectors among the warps
idx_t vecsPerWarp = utils::divUp(numVecs, kIVFFlatScanWarps);
Expand Down
4 changes: 2 additions & 2 deletions faiss/gpu/impl/IVFInterleaved.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ __global__ void ivfInterleavedScan2(
Tensor<float, 2, true> distanceOut,
Tensor<idx_t, 2, true> indicesOut) {
if constexpr ((NumWarpQ == 1 && NumThreadQ == 1) || NumWarpQ >= kWarpSize) {
int queryId = blockIdx.x;
auto queryId = blockIdx.x;

constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;

Expand Down Expand Up @@ -99,7 +99,7 @@ __global__ void ivfInterleavedScan2(
// Merge all final results
heap.reduce();

for (int i = threadIdx.x; i < k; i += blockDim.x) {
for (auto i = threadIdx.x; i < k; i += blockDim.x) {
// Re-adjust the value we are selecting based on the sorting order
distanceOut[queryId][i] = smemK[i] * adj;
auto packedIndex = smemV[i];
Expand Down
8 changes: 4 additions & 4 deletions faiss/gpu/impl/IVFInterleaved.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ __global__ void ivfInterleavedScan(

for (idx_t queryId = blockIdx.y; queryId < queries.getSize(0);
queryId += gridDim.y) {
int probeId = blockIdx.x;
auto probeId = blockIdx.x;
idx_t listId = listIds[queryId][probeId];

// Safety guard in case NaNs in input cause no list ID to be
Expand All @@ -69,8 +69,8 @@ __global__ void ivfInterleavedScan(
int dim = queries.getSize(1);

// FIXME: some issue with getLaneId() and CUDA 10.1 and P4 GPUs?
int laneId = threadIdx.x % kWarpSize;
int warpId = threadIdx.x / kWarpSize;
auto laneId = threadIdx.x % kWarpSize;
auto warpId = threadIdx.x / kWarpSize;

using EncodeT = typename Codec::EncodeT;

Expand Down Expand Up @@ -215,7 +215,7 @@ __global__ void ivfInterleavedScan(
auto distanceOutBase = distanceOut[queryId][probeId].data();
auto indicesOutBase = indicesOut[queryId][probeId].data();

for (int i = threadIdx.x; i < k; i += blockDim.x) {
for (auto i = threadIdx.x; i < k; i += blockDim.x) {
distanceOutBase[i] = smemK[i];
indicesOutBase[i] = smemV[i];
}
Expand Down
2 changes: 1 addition & 1 deletion faiss/gpu/impl/IVFUtilsSelect1.cu
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ __global__ void pass1SelectLists(

// Write out the final k-selected values; they should be all
// together
for (int i = threadIdx.x; i < k; i += blockDim.x) {
for (auto i = threadIdx.x; i < k; i += blockDim.x) {
heapDistances[queryId][sliceId][i] = smemK[i];
heapIndices[queryId][sliceId][i] = idx_t(smemV[i]);
}
Expand Down
2 changes: 1 addition & 1 deletion faiss/gpu/impl/IVFUtilsSelect2.cu
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ __global__ void pass2SelectLists(
// Merge all final results
heap.reduce();

for (int i = threadIdx.x; i < k; i += blockDim.x) {
for (auto i = threadIdx.x; i < k; i += blockDim.x) {
outDistances[queryId][i] = smemK[i];

// `v` is the index in `heapIndices`
Expand Down
16 changes: 8 additions & 8 deletions faiss/gpu/impl/IcmEncoder.cu
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ __global__ void runIcmEncodeStep(
int m) {
using KVPair = Pair<float, int>;

int id = blockIdx.x; // each block takes care of one vector
int code = threadIdx.x; // each thread takes care of one possible code
auto id = blockIdx.x; // each block takes care of one vector
auto code = threadIdx.x; // each thread takes care of one possible code

// compute the objective value by look-up tables
KVPair obj(0.0f, code);
Expand Down Expand Up @@ -94,8 +94,8 @@ __global__ void runEvaluation(
int M,
int K,
int dims) {
int id = blockIdx.x; // each block takes care of one vector
int d = threadIdx.x; // each thread takes care of one dimension
auto id = blockIdx.x; // each block takes care of one vector
auto d = threadIdx.x; // each thread takes care of one dimension
float acc = 0.0f;

#pragma unroll
Expand Down Expand Up @@ -136,7 +136,7 @@ __global__ void runCodesPerturbation(
int K,
int nperts) {
// each thread takes care of one vector
int id = blockIdx.x * blockDim.x + threadIdx.x;
auto id = blockIdx.x * blockDim.x + threadIdx.x;

if (id >= n) {
return;
Expand Down Expand Up @@ -173,7 +173,7 @@ __global__ void runCodesSelection(
int n,
int M) {
// each thread takes care of one vector
int id = blockIdx.x * blockDim.x + threadIdx.x;
auto id = blockIdx.x * blockDim.x + threadIdx.x;

if (id >= n || objs[id] >= bestObjs[id]) {
return;
Expand All @@ -195,8 +195,8 @@ __global__ void runCodesSelection(
* @param K number of codewords in a codebook
*/
__global__ void runNormAddition(float* uterm, const float* norm, int K) {
// Adds norm[code] onto the element uterm[id * K + code], where `id` comes
// from the block's x-index and `code` from the thread's x-index.
//
// NOTE(review): this listing is a rendered diff hunk with its +/- markers
// stripped, so both sides of the change appear below. Presumably the two
// `int` declarations are the removed lines and the two `auto` declarations
// are their replacements (diffs list deletions first) - confirm against the
// actual file. Only one pair can exist in real code, since `id` and `code`
// cannot be declared twice in the same scope.
int id = blockIdx.x;
int code = threadIdx.x;
// With `auto`, both variables take the type of blockIdx.x / threadIdx.x
// instead of being converted to int, so `id * K + code` below is evaluated
// in that type rather than in int.
auto id = blockIdx.x;
auto code = threadIdx.x;

// No bounds guard is visible here: every launched thread performs the
// update. Presumably the kernel is launched with exactly K threads per
// block and one block per row of uterm - confirm at the call site.
uterm[id * K + code] += norm[code];
}
Expand Down
2 changes: 1 addition & 1 deletion faiss/gpu/impl/L2Norm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ __global__ void l2NormRowMajor(
// these are fine to be int (just based on block dimensions)
int numWarps = utils::divUp(blockDim.x, kWarpSize);
int laneId = getLaneId();
int warpId = threadIdx.x / kWarpSize;
auto warpId = threadIdx.x / kWarpSize;

bool lastRowTile = (blockIdx.x == (gridDim.x - 1));
idx_t rowStart = idx_t(blockIdx.x) * RowTileSize;
Expand Down
Loading