From 384fac572de0306b2fba886eaf020a42e7e22ccf Mon Sep 17 00:00:00 2001
From: Itay Etelis <itay.etelis@ibm.com>
Date: Tue, 5 May 2026 12:40:19 +0300
Subject: [PATCH 1/2] [Perf][ROCm] Use hipMemcpyBatchAsync in swap_blocks_batch

ROCm 7.1+ exposes the analog of cuMemcpyBatchAsync; route AMD builds
through it for the same single-driver-call fast path.

Signed-off-by: Itay Etelis <itay.etelis@ibm.com>
---
 csrc/cache_kernels.cu | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 7e456d32598b..1334fa3b5db2 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -134,18 +134,26 @@ void swap_blocks_batch(const torch::Tensor& src_ptrs,
                                &fail_idx, static_cast<CUstream>(stream));
     TORCH_CHECK(result == CUDA_SUCCESS, "cuMemcpyBatchAsync failed at index ",
                 fail_idx, " with error ", result);
-  } else
+    return;
+  }
+#elif defined(USE_ROCM) && defined(HIP_VERSION) && HIP_VERSION >= 70100000
+  // ROCm 7.1 does not yet honor hipMemcpyAttributes; pass nullptr.
+  size_t fail_idx = 0;
+  hipError_t result = hipMemcpyBatchAsync(
+      reinterpret_cast<void**>(dst_data), reinterpret_cast<void**>(src_data),
+      reinterpret_cast<size_t*>(size_data), static_cast<size_t>(n), nullptr,
+      nullptr, 0, &fail_idx, stream);
+  TORCH_CHECK(result == hipSuccess, "hipMemcpyBatchAsync failed at index ",
+              fail_idx, " with error ", result);
+  return;
 #endif
-  {
-    // Fallback for CUDA < 12.8, older drivers, and ROCm:
-    // individual async copies.
-    // cudaMemcpyDefault lets the driver infer direction from pointer types.
-    for (int64_t i = 0; i < n; i++) {
-      cudaMemcpyAsync(reinterpret_cast<void*>(dst_data[i]),
-                      reinterpret_cast<void*>(src_data[i]),
-                      static_cast<size_t>(size_data[i]), cudaMemcpyDefault,
-                      stream);
-    }
+  // Fallback for CUDA < 12.8, older drivers, and ROCm < 7.1.
+  // cudaMemcpyDefault lets the driver infer direction from pointer types.
+  for (int64_t i = 0; i < n; i++) {
+    cudaMemcpyAsync(reinterpret_cast<void*>(dst_data[i]),
+                    reinterpret_cast<void*>(src_data[i]),
+                    static_cast<size_t>(size_data[i]), cudaMemcpyDefault,
+                    stream);
   }
 }
 

From fea5520b9ea6482db288f464fe352d46941850cd Mon Sep 17 00:00:00 2001
From: Itay Etelis <itay.etelis@ibm.com>
Date: Tue, 5 May 2026 18:27:02 +0300
Subject: [PATCH 2/2] Assert sizeof(void*) parity in swap_blocks_batch

The ROCm branch reinterpret_casts int64_t* to void**; make the size
assumption explicit alongside the existing CUdeviceptr / size_t asserts.

Signed-off-by: Itay Etelis <itay.etelis@ibm.com>
---
 csrc/cache_kernels.cu | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 1334fa3b5db2..1fcc89cb5b26 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -99,9 +99,11 @@ void swap_blocks_batch(const torch::Tensor& src_ptrs,
 
   // Use cuMemcpyBatchAsync (CUDA 12.8+) to submit all copies in a single
   // driver call, amortizing per-copy submission overhead.
-  // int64_t and CUdeviceptr/size_t are both 8 bytes on 64-bit platforms,
-  // so we reinterpret_cast the tensor data directly to avoid copies.
+  // int64_t, CUdeviceptr, void*, and size_t are all 8 bytes on 64-bit
+  // platforms, so we reinterpret_cast the tensor data directly to avoid
+  // copies.
   static_assert(sizeof(CUdeviceptr) == sizeof(int64_t));
+  static_assert(sizeof(void*) == sizeof(int64_t));
   static_assert(sizeof(size_t) == sizeof(int64_t));
 #if !defined(USE_ROCM) && defined(CUDA_VERSION) && CUDA_VERSION >= 12080
   // Resolve cuMemcpyBatchAsync at runtime via cuGetProcAddress so that