From bbcacafad25acc69104813dc7eee10df1fbfc2a7 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Tue, 18 Mar 2025 14:25:14 -0700 Subject: [PATCH] Fix CUDA kernel index data type in faiss/gpu/impl/DistanceUtils.cuh +10 Summary: CUDA kernel variables matching the type `(thread|block|grid).(Idx|Dim).(x|y|z)` [have the data type `uint`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#built-in-variables). Many programmers mistakenly use implicit casts to turn these data types into `int`. In fact, the [CUDA Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) itself is inconsistent and incorrect in its use of data types in programming examples. The result of these implicit casts is that our kernels may give unexpected results when exposed to large datasets, i.e., those exceeding >~2B items. While we now have linters in place to prevent simple mistakes (D71236150), our codebase has many problematic instances. This diff fixes some of them. Reviewed By: dtolnay Differential Revision: D71355340 --- faiss/gpu/impl/DistanceUtils.cuh | 2 +- faiss/gpu/impl/GpuScalarQuantizer.cuh | 6 +++--- faiss/gpu/impl/IVFAppend.cu | 6 +++--- faiss/gpu/impl/IVFFlatScan.cu | 4 ++-- faiss/gpu/impl/IVFInterleaved.cu | 4 ++-- faiss/gpu/impl/IVFInterleaved.cuh | 8 ++++---- faiss/gpu/impl/IVFUtilsSelect1.cu | 2 +- faiss/gpu/impl/IVFUtilsSelect2.cu | 2 +- faiss/gpu/impl/IcmEncoder.cu | 16 ++++++++-------- faiss/gpu/impl/L2Norm.cu | 2 +- 10 files changed, 26 insertions(+), 26 deletions(-) diff --git a/faiss/gpu/impl/DistanceUtils.cuh b/faiss/gpu/impl/DistanceUtils.cuh index fd894ae3bd..4ea899c8ec 100644 --- a/faiss/gpu/impl/DistanceUtils.cuh +++ b/faiss/gpu/impl/DistanceUtils.cuh @@ -303,7 +303,7 @@ __global__ void incrementIndex( int k, idx_t increment) { for (idx_t i = blockIdx.y; i < indices.getSize(0); i += gridDim.y) { - for (int j = threadIdx.x; j < k; j += blockDim.x) { + for (auto j = threadIdx.x; j < k; j += blockDim.x) { indices[i][idx_t(blockIdx.x) * k + j] += 
blockIdx.x * increment; } } diff --git a/faiss/gpu/impl/GpuScalarQuantizer.cuh b/faiss/gpu/impl/GpuScalarQuantizer.cuh index c2d781419d..186ecac1c2 100644 --- a/faiss/gpu/impl/GpuScalarQuantizer.cuh +++ b/faiss/gpu/impl/GpuScalarQuantizer.cuh @@ -377,7 +377,7 @@ struct Codec { smemVmin = smem; smemVdiff = smem + dim; - for (int i = threadIdx.x; i < dim; i += blockDim.x) { + for (auto i = threadIdx.x; i < dim; i += blockDim.x) { // We are performing vmin + vdiff * (v + 0.5) / (2^bits - 1) // This can be simplified to vmin' + vdiff' * v where: // vdiff' = vdiff / (2^bits - 1) @@ -587,7 +587,7 @@ struct Codec { smemVmin = smem; smemVdiff = smem + dim; - for (int i = threadIdx.x; i < dim; i += blockDim.x) { + for (auto i = threadIdx.x; i < dim; i += blockDim.x) { // We are performing vmin + vdiff * (v + 0.5) / (2^bits - 1) // This can be simplified to vmin' + vdiff' * v where: // vdiff' = vdiff / (2^bits - 1) @@ -753,7 +753,7 @@ struct Codec { smemVmin = smem; smemVdiff = smem + dim; - for (int i = threadIdx.x; i < dim; i += blockDim.x) { + for (auto i = threadIdx.x; i < dim; i += blockDim.x) { // We are performing vmin + vdiff * (v + 0.5) / (2^bits - 1) // This can be simplified to vmin' + vdiff' * v where: // vdiff' = vdiff / (2^bits - 1) diff --git a/faiss/gpu/impl/IVFAppend.cu b/faiss/gpu/impl/IVFAppend.cu index ba5cedf3c7..dd1c9073b5 100644 --- a/faiss/gpu/impl/IVFAppend.cu +++ b/faiss/gpu/impl/IVFAppend.cu @@ -368,9 +368,9 @@ __global__ void ivfInterleavedAppend( // The set of addresses for each of the lists void** listData) { // FIXME: some issue with getLaneId() and CUDA 10.1 and P4 GPUs? 
- int laneId = threadIdx.x % kWarpSize; - int warpId = threadIdx.x / kWarpSize; - int warpsPerBlock = blockDim.x / kWarpSize; + auto laneId = threadIdx.x % kWarpSize; + auto warpId = threadIdx.x / kWarpSize; + auto warpsPerBlock = blockDim.x / kWarpSize; // Each block is dedicated to a separate list idx_t listId = uniqueLists[blockIdx.x]; diff --git a/faiss/gpu/impl/IVFFlatScan.cu b/faiss/gpu/impl/IVFFlatScan.cu index 457d0afeb6..5c6307b032 100644 --- a/faiss/gpu/impl/IVFFlatScan.cu +++ b/faiss/gpu/impl/IVFFlatScan.cu @@ -65,9 +65,9 @@ struct IVFFlatScan { int limit = utils::divDown(dim, Codec::kDimPerIter); // Each warp handles a separate chunk of vectors - int warpId = threadIdx.x / kWarpSize; + auto warpId = threadIdx.x / kWarpSize; // FIXME: why does getLaneId() not work when we write out below!?!?! - int laneId = threadIdx.x % kWarpSize; // getLaneId(); + auto laneId = threadIdx.x % kWarpSize; // getLaneId(); // Divide the set of vectors among the warps idx_t vecsPerWarp = utils::divUp(numVecs, kIVFFlatScanWarps); diff --git a/faiss/gpu/impl/IVFInterleaved.cu b/faiss/gpu/impl/IVFInterleaved.cu index e5b13f3aa8..fc99a49163 100644 --- a/faiss/gpu/impl/IVFInterleaved.cu +++ b/faiss/gpu/impl/IVFInterleaved.cu @@ -27,7 +27,7 @@ __global__ void ivfInterleavedScan2( Tensor distanceOut, Tensor indicesOut) { if constexpr ((NumWarpQ == 1 && NumThreadQ == 1) || NumWarpQ >= kWarpSize) { - int queryId = blockIdx.x; + auto queryId = blockIdx.x; constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; @@ -99,7 +99,7 @@ __global__ void ivfInterleavedScan2( // Merge all final results heap.reduce(); - for (int i = threadIdx.x; i < k; i += blockDim.x) { + for (auto i = threadIdx.x; i < k; i += blockDim.x) { // Re-adjust the value we are selecting based on the sorting order distanceOut[queryId][i] = smemK[i] * adj; auto packedIndex = smemV[i]; diff --git a/faiss/gpu/impl/IVFInterleaved.cuh b/faiss/gpu/impl/IVFInterleaved.cuh index f1da8342d4..1b7fbbe7a1 100644 --- 
a/faiss/gpu/impl/IVFInterleaved.cuh +++ b/faiss/gpu/impl/IVFInterleaved.cuh @@ -56,7 +56,7 @@ __global__ void ivfInterleavedScan( for (idx_t queryId = blockIdx.y; queryId < queries.getSize(0); queryId += gridDim.y) { - int probeId = blockIdx.x; + auto probeId = blockIdx.x; idx_t listId = listIds[queryId][probeId]; // Safety guard in case NaNs in input cause no list ID to be @@ -69,8 +69,8 @@ __global__ void ivfInterleavedScan( int dim = queries.getSize(1); // FIXME: some issue with getLaneId() and CUDA 10.1 and P4 GPUs? - int laneId = threadIdx.x % kWarpSize; - int warpId = threadIdx.x / kWarpSize; + auto laneId = threadIdx.x % kWarpSize; + auto warpId = threadIdx.x / kWarpSize; using EncodeT = typename Codec::EncodeT; @@ -215,7 +215,7 @@ __global__ void ivfInterleavedScan( auto distanceOutBase = distanceOut[queryId][probeId].data(); auto indicesOutBase = indicesOut[queryId][probeId].data(); - for (int i = threadIdx.x; i < k; i += blockDim.x) { + for (auto i = threadIdx.x; i < k; i += blockDim.x) { distanceOutBase[i] = smemK[i]; indicesOutBase[i] = smemV[i]; } diff --git a/faiss/gpu/impl/IVFUtilsSelect1.cu b/faiss/gpu/impl/IVFUtilsSelect1.cu index 3cb88bd9c7..c4f65bab8f 100644 --- a/faiss/gpu/impl/IVFUtilsSelect1.cu +++ b/faiss/gpu/impl/IVFUtilsSelect1.cu @@ -90,7 +90,7 @@ __global__ void pass1SelectLists( // Write out the final k-selected values; they should be all // together - for (int i = threadIdx.x; i < k; i += blockDim.x) { + for (auto i = threadIdx.x; i < k; i += blockDim.x) { heapDistances[queryId][sliceId][i] = smemK[i]; heapIndices[queryId][sliceId][i] = idx_t(smemV[i]); } diff --git a/faiss/gpu/impl/IVFUtilsSelect2.cu b/faiss/gpu/impl/IVFUtilsSelect2.cu index 3a94101bdb..2dbf3c0f00 100644 --- a/faiss/gpu/impl/IVFUtilsSelect2.cu +++ b/faiss/gpu/impl/IVFUtilsSelect2.cu @@ -100,7 +100,7 @@ __global__ void pass2SelectLists( // Merge all final results heap.reduce(); - for (int i = threadIdx.x; i < k; i += blockDim.x) { + for (auto i = threadIdx.x; i < k; i 
+= blockDim.x) { outDistances[queryId][i] = smemK[i]; // `v` is the index in `heapIndices` diff --git a/faiss/gpu/impl/IcmEncoder.cu b/faiss/gpu/impl/IcmEncoder.cu index 20ff36e2d2..b86e390f85 100644 --- a/faiss/gpu/impl/IcmEncoder.cu +++ b/faiss/gpu/impl/IcmEncoder.cu @@ -46,8 +46,8 @@ __global__ void runIcmEncodeStep( int m) { using KVPair = Pair; - int id = blockIdx.x; // each block takes care of one vector - int code = threadIdx.x; // each thread takes care of one possible code + auto id = blockIdx.x; // each block takes care of one vector + auto code = threadIdx.x; // each thread takes care of one possible code // compute the objective value by look-up tables KVPair obj(0.0f, code); @@ -94,8 +94,8 @@ __global__ void runEvaluation( int M, int K, int dims) { - int id = blockIdx.x; // each block takes care of one vector - int d = threadIdx.x; // each thread takes care of one dimension + auto id = blockIdx.x; // each block takes care of one vector + auto d = threadIdx.x; // each thread takes care of one dimension float acc = 0.0f; #pragma unroll @@ -136,7 +136,7 @@ __global__ void runCodesPerturbation( int K, int nperts) { // each thread takes care of one vector - int id = blockIdx.x * blockDim.x + threadIdx.x; + auto id = blockIdx.x * blockDim.x + threadIdx.x; if (id >= n) { return; @@ -173,7 +173,7 @@ __global__ void runCodesSelection( int n, int M) { // each thread takes care of one vector - int id = blockIdx.x * blockDim.x + threadIdx.x; + auto id = blockIdx.x * blockDim.x + threadIdx.x; if (id >= n || objs[id] >= bestObjs[id]) { return; @@ -195,8 +195,8 @@ __global__ void runCodesSelection( * @param K number of codewords in a codebook */ __global__ void runNormAddition(float* uterm, const float* norm, int K) { - int id = blockIdx.x; - int code = threadIdx.x; + auto id = blockIdx.x; + auto code = threadIdx.x; uterm[id * K + code] += norm[code]; } diff --git a/faiss/gpu/impl/L2Norm.cu b/faiss/gpu/impl/L2Norm.cu index e76a0831ff..0e65015e44 100644 --- 
a/faiss/gpu/impl/L2Norm.cu +++ b/faiss/gpu/impl/L2Norm.cu @@ -40,7 +40,7 @@ __global__ void l2NormRowMajor( // these are fine to be int (just based on block dimensions) int numWarps = utils::divUp(blockDim.x, kWarpSize); int laneId = getLaneId(); - int warpId = threadIdx.x / kWarpSize; + auto warpId = threadIdx.x / kWarpSize; bool lastRowTile = (blockIdx.x == (gridDim.x - 1)); idx_t rowStart = idx_t(blockIdx.x) * RowTileSize;