diff --git a/faiss/gpu/impl/DistanceUtils.cuh b/faiss/gpu/impl/DistanceUtils.cuh
index fd894ae3bd..4ea899c8ec 100644
--- a/faiss/gpu/impl/DistanceUtils.cuh
+++ b/faiss/gpu/impl/DistanceUtils.cuh
@@ -303,7 +303,7 @@ __global__ void incrementIndex(
         int k,
         idx_t increment) {
     for (idx_t i = blockIdx.y; i < indices.getSize(0); i += gridDim.y) {
-        for (int j = threadIdx.x; j < k; j += blockDim.x) {
+        for (auto j = threadIdx.x; j < k; j += blockDim.x) {
             indices[i][idx_t(blockIdx.x) * k + j] += blockIdx.x * increment;
         }
     }
diff --git a/faiss/gpu/impl/GpuScalarQuantizer.cuh b/faiss/gpu/impl/GpuScalarQuantizer.cuh
index c2d781419d..186ecac1c2 100644
--- a/faiss/gpu/impl/GpuScalarQuantizer.cuh
+++ b/faiss/gpu/impl/GpuScalarQuantizer.cuh
@@ -377,7 +377,7 @@ struct Codec<ScalarQuantizer::QuantizerType::QT_8bit, DimMultiple> {
         smemVmin = smem;
         smemVdiff = smem + dim;
 
-        for (int i = threadIdx.x; i < dim; i += blockDim.x) {
+        for (auto i = threadIdx.x; i < dim; i += blockDim.x) {
             // We are performing vmin + vdiff * (v + 0.5) / (2^bits - 1)
             // This can be simplified to vmin' + vdiff' * v where:
             // vdiff' = vdiff / (2^bits - 1)
@@ -587,7 +587,7 @@ struct Codec<ScalarQuantizer::QuantizerType::QT_6bit, 1> {
         smemVmin = smem;
         smemVdiff = smem + dim;
 
-        for (int i = threadIdx.x; i < dim; i += blockDim.x) {
+        for (auto i = threadIdx.x; i < dim; i += blockDim.x) {
             // We are performing vmin + vdiff * (v + 0.5) / (2^bits - 1)
             // This can be simplified to vmin' + vdiff' * v where:
             // vdiff' = vdiff / (2^bits - 1)
@@ -753,7 +753,7 @@ struct Codec<ScalarQuantizer::QuantizerType::QT_4bit, 1> {
         smemVmin = smem;
         smemVdiff = smem + dim;
 
-        for (int i = threadIdx.x; i < dim; i += blockDim.x) {
+        for (auto i = threadIdx.x; i < dim; i += blockDim.x) {
             // We are performing vmin + vdiff * (v + 0.5) / (2^bits - 1)
             // This can be simplified to vmin' + vdiff' * v where:
             // vdiff' = vdiff / (2^bits - 1)
diff --git a/faiss/gpu/impl/IVFAppend.cu b/faiss/gpu/impl/IVFAppend.cu
index ba5cedf3c7..dd1c9073b5 100644
--- a/faiss/gpu/impl/IVFAppend.cu
+++ b/faiss/gpu/impl/IVFAppend.cu
@@ -368,9 +368,9 @@ __global__ void ivfInterleavedAppend(
         // The set of addresses for each of the lists
         void** listData) {
     // FIXME: some issue with getLaneId() and CUDA 10.1 and P4 GPUs?
-    int laneId = threadIdx.x % kWarpSize;
-    int warpId = threadIdx.x / kWarpSize;
-    int warpsPerBlock = blockDim.x / kWarpSize;
+    auto laneId = threadIdx.x % kWarpSize;
+    auto warpId = threadIdx.x / kWarpSize;
+    auto warpsPerBlock = blockDim.x / kWarpSize;
 
     // Each block is dedicated to a separate list
     idx_t listId = uniqueLists[blockIdx.x];
diff --git a/faiss/gpu/impl/IVFFlatScan.cu b/faiss/gpu/impl/IVFFlatScan.cu
index 457d0afeb6..5c6307b032 100644
--- a/faiss/gpu/impl/IVFFlatScan.cu
+++ b/faiss/gpu/impl/IVFFlatScan.cu
@@ -65,9 +65,9 @@ struct IVFFlatScan {
         int limit = utils::divDown(dim, Codec::kDimPerIter);
 
         // Each warp handles a separate chunk of vectors
-        int warpId = threadIdx.x / kWarpSize;
+        auto warpId = threadIdx.x / kWarpSize;
         // FIXME: why does getLaneId() not work when we write out below!?!?!
-        int laneId = threadIdx.x % kWarpSize; // getLaneId();
+        auto laneId = threadIdx.x % kWarpSize; // getLaneId();
 
         // Divide the set of vectors among the warps
         idx_t vecsPerWarp = utils::divUp(numVecs, kIVFFlatScanWarps);
diff --git a/faiss/gpu/impl/IVFInterleaved.cu b/faiss/gpu/impl/IVFInterleaved.cu
index e5b13f3aa8..fc99a49163 100644
--- a/faiss/gpu/impl/IVFInterleaved.cu
+++ b/faiss/gpu/impl/IVFInterleaved.cu
@@ -27,7 +27,7 @@ __global__ void ivfInterleavedScan2(
         Tensor<float, 2, true> distanceOut,
         Tensor<idx_t, 2, true> indicesOut) {
     if constexpr ((NumWarpQ == 1 && NumThreadQ == 1) || NumWarpQ >= kWarpSize) {
-        int queryId = blockIdx.x;
+        auto queryId = blockIdx.x;
 
         constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
 
@@ -99,7 +99,7 @@ __global__ void ivfInterleavedScan2(
         // Merge all final results
         heap.reduce();
 
-        for (int i = threadIdx.x; i < k; i += blockDim.x) {
+        for (auto i = threadIdx.x; i < k; i += blockDim.x) {
             // Re-adjust the value we are selecting based on the sorting order
             distanceOut[queryId][i] = smemK[i] * adj;
             auto packedIndex = smemV[i];
diff --git a/faiss/gpu/impl/IVFInterleaved.cuh b/faiss/gpu/impl/IVFInterleaved.cuh
index f1da8342d4..1b7fbbe7a1 100644
--- a/faiss/gpu/impl/IVFInterleaved.cuh
+++ b/faiss/gpu/impl/IVFInterleaved.cuh
@@ -56,7 +56,7 @@ __global__ void ivfInterleavedScan(
 
         for (idx_t queryId = blockIdx.y; queryId < queries.getSize(0);
              queryId += gridDim.y) {
-            int probeId = blockIdx.x;
+            auto probeId = blockIdx.x;
             idx_t listId = listIds[queryId][probeId];
 
             // Safety guard in case NaNs in input cause no list ID to be
@@ -69,8 +69,8 @@ __global__ void ivfInterleavedScan(
             int dim = queries.getSize(1);
 
             // FIXME: some issue with getLaneId() and CUDA 10.1 and P4 GPUs?
-            int laneId = threadIdx.x % kWarpSize;
-            int warpId = threadIdx.x / kWarpSize;
+            auto laneId = threadIdx.x % kWarpSize;
+            auto warpId = threadIdx.x / kWarpSize;
 
             using EncodeT = typename Codec::EncodeT;
 
@@ -215,7 +215,7 @@ __global__ void ivfInterleavedScan(
             auto distanceOutBase = distanceOut[queryId][probeId].data();
             auto indicesOutBase = indicesOut[queryId][probeId].data();
 
-            for (int i = threadIdx.x; i < k; i += blockDim.x) {
+            for (auto i = threadIdx.x; i < k; i += blockDim.x) {
                 distanceOutBase[i] = smemK[i];
                 indicesOutBase[i] = smemV[i];
             }
diff --git a/faiss/gpu/impl/IVFUtilsSelect1.cu b/faiss/gpu/impl/IVFUtilsSelect1.cu
index 3cb88bd9c7..c4f65bab8f 100644
--- a/faiss/gpu/impl/IVFUtilsSelect1.cu
+++ b/faiss/gpu/impl/IVFUtilsSelect1.cu
@@ -90,7 +90,7 @@ __global__ void pass1SelectLists(
 
             // Write out the final k-selected values; they should be all
             // together
-            for (int i = threadIdx.x; i < k; i += blockDim.x) {
+            for (auto i = threadIdx.x; i < k; i += blockDim.x) {
                 heapDistances[queryId][sliceId][i] = smemK[i];
                 heapIndices[queryId][sliceId][i] = idx_t(smemV[i]);
             }
diff --git a/faiss/gpu/impl/IVFUtilsSelect2.cu b/faiss/gpu/impl/IVFUtilsSelect2.cu
index 3a94101bdb..2dbf3c0f00 100644
--- a/faiss/gpu/impl/IVFUtilsSelect2.cu
+++ b/faiss/gpu/impl/IVFUtilsSelect2.cu
@@ -100,7 +100,7 @@ __global__ void pass2SelectLists(
         // Merge all final results
         heap.reduce();
 
-        for (int i = threadIdx.x; i < k; i += blockDim.x) {
+        for (auto i = threadIdx.x; i < k; i += blockDim.x) {
             outDistances[queryId][i] = smemK[i];
 
             // `v` is the index in `heapIndices`
diff --git a/faiss/gpu/impl/IcmEncoder.cu b/faiss/gpu/impl/IcmEncoder.cu
index 20ff36e2d2..b86e390f85 100644
--- a/faiss/gpu/impl/IcmEncoder.cu
+++ b/faiss/gpu/impl/IcmEncoder.cu
@@ -46,8 +46,8 @@ __global__ void runIcmEncodeStep(
         int m) {
     using KVPair = Pair<float, int>;
 
-    int id = blockIdx.x;    // each block takes care of one vector
-    int code = threadIdx.x; // each thread takes care of one possible code
+    auto id = blockIdx.x;    // each block takes care of one vector
+    auto code = threadIdx.x; // each thread takes care of one possible code
 
     // compute the objective value by look-up tables
     KVPair obj(0.0f, code);
@@ -94,8 +94,8 @@ __global__ void runEvaluation(
         int M,
         int K,
         int dims) {
-    int id = blockIdx.x; // each block takes care of one vector
-    int d = threadIdx.x; // each thread takes care of one dimension
+    auto id = blockIdx.x; // each block takes care of one vector
+    auto d = threadIdx.x; // each thread takes care of one dimension
     float acc = 0.0f;
 
 #pragma unroll
@@ -136,7 +136,7 @@ __global__ void runCodesPerturbation(
         int K,
         int nperts) {
     // each thread takes care of one vector
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    auto id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (id >= n) {
         return;
@@ -173,7 +173,7 @@ __global__ void runCodesSelection(
         int n,
         int M) {
     // each thread takes care of one vector
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    auto id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (id >= n || objs[id] >= bestObjs[id]) {
         return;
@@ -195,8 +195,8 @@ __global__ void runCodesSelection(
  * @param K     number of codewords in a codebook
  */
 __global__ void runNormAddition(float* uterm, const float* norm, int K) {
-    int id = blockIdx.x;
-    int code = threadIdx.x;
+    auto id = blockIdx.x;
+    auto code = threadIdx.x;
 
     uterm[id * K + code] += norm[code];
 }
diff --git a/faiss/gpu/impl/L2Norm.cu b/faiss/gpu/impl/L2Norm.cu
index e76a0831ff..0e65015e44 100644
--- a/faiss/gpu/impl/L2Norm.cu
+++ b/faiss/gpu/impl/L2Norm.cu
@@ -40,7 +40,7 @@ __global__ void l2NormRowMajor(
     // these are fine to be int (just based on block dimensions)
     int numWarps = utils::divUp(blockDim.x, kWarpSize);
     int laneId = getLaneId();
-    int warpId = threadIdx.x / kWarpSize;
+    auto warpId = threadIdx.x / kWarpSize;
 
     bool lastRowTile = (blockIdx.x == (gridDim.x - 1));
     idx_t rowStart = idx_t(blockIdx.x) * RowTileSize;