From b6cb209e464c56a9a8fb7b60529cd205be9ff305 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Thu, 5 Mar 2026 18:11:10 +0200
Subject: [PATCH 01/29] NIXL/EP: Use vmm API instead of cudaMalloc

---
 examples/device/ep/csrc/nixl_ep.cpp |  36 ++++++----
 examples/device/ep/csrc/nixl_ep.hpp | 107 ++++++++++++++++++++++++++++
 2 files changed, 128 insertions(+), 15 deletions(-)
diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp
index 8834b2b213..00884602f7 100644
--- a/examples/device/ep/csrc/nixl_ep.cpp
+++ b/examples/device/ep/csrc/nixl_ep.cpp
@@ -27,6 +27,7 @@
 #include <cstddef>
 #include <cstdint>
 #include <limits>
+#include <cuda.h>
 #include <cuda_runtime.h>
 #include <memory>
 #include <optional>
@@ -103,25 +104,28 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte
     EP_HOST_ASSERT(per_channel_bytes < std::numeric_limits<int>::max());
 
     // Create 32 MiB workspace
-    CUDA_CHECK(cudaMalloc(&workspace, NUM_WORKSPACE_BYTES));
+    m_workspace_alloc = std::make_unique<cuda_allocator>(NUM_WORKSPACE_BYTES);
+    workspace         = m_workspace_alloc->ptr();
     CUDA_CHECK(cudaMemsetAsync(workspace, 0, NUM_WORKSPACE_BYTES, comm_stream));
 
     EP_HOST_ASSERT(max_experts_per_rank > 0);
-    CUDA_CHECK(cudaMalloc(&rdma_buffer_ptr, num_rdma_bytes));
+    m_rdma_alloc    = std::make_unique<cuda_allocator>(num_rdma_bytes);
+    rdma_buffer_ptr = m_rdma_alloc->ptr();
     CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes));
 
     // Allocate and clean shrink buffer
-    mask_buffer_ptr = nullptr;
-    sync_buffer_ptr = nullptr;
     int num_mask_buffer_bytes = max_num_ranks * sizeof(int);
-    CUDA_CHECK(cudaMalloc(&mask_buffer_ptr, num_mask_buffer_bytes));
+    m_mask_alloc    = std::make_unique<cuda_allocator>(num_mask_buffer_bytes);
+    mask_buffer_ptr = static_cast<int*>(m_mask_alloc->ptr());
     CUDA_CHECK(cudaMemset(mask_buffer_ptr, 0xff, num_mask_buffer_bytes));
     CUDA_CHECK(cudaMemset(mask_buffer_ptr + rank, 0, sizeof(int)));
 
     int num_sync_buffer_bytes = max_num_ranks * sizeof(int);
-    CUDA_CHECK(cudaMalloc(&sync_buffer_ptr, num_sync_buffer_bytes));
+    m_sync_alloc      = std::make_unique<cuda_allocator>(num_sync_buffer_bytes);
+    sync_buffer_ptr   = static_cast<int*>(m_sync_alloc->ptr());
+    m_sync_count_alloc = std::make_unique<cuda_allocator>(num_sync_buffer_bytes);
+    sync_count_ptr    = static_cast<int*>(m_sync_count_alloc->ptr());
     CUDA_CHECK(cudaMemset(sync_buffer_ptr, 0, num_sync_buffer_bytes));
-    CUDA_CHECK(cudaMalloc(&sync_count_ptr, num_sync_buffer_bytes));
     CUDA_CHECK(cudaMemset(sync_count_ptr, 0, num_sync_buffer_bytes));
     CUDA_CHECK(cudaDeviceSynchronize());
 
@@ -174,20 +178,22 @@ void Buffer::destroy() {
 
     _nixl_ep_destroy();
 
-    cudaFree(rdma_buffer_ptr);
+    m_rdma_alloc.reset();
+    rdma_buffer_ptr = nullptr;
 
     if (nixl_agent_info and nixl_agent_info->agent != nullptr and getenv("NIXL_ETCD_ENDPOINTS")) {
         nixl_agent_info->agent->invalidateLocalMD();
     }
 
-    rdma_buffer_ptr = nullptr;
-
-    cudaFree(mask_buffer_ptr);
-    cudaFree(sync_buffer_ptr);
-    cudaFree(sync_count_ptr);
+    m_mask_alloc.reset();
+    mask_buffer_ptr = nullptr;
+    m_sync_alloc.reset();
+    sync_buffer_ptr = nullptr;
+    m_sync_count_alloc.reset();
+    sync_count_ptr = nullptr;
 
-    // Free workspace
-    CUDA_CHECK(cudaFree(workspace));
+    m_workspace_alloc.reset();
+    workspace = nullptr;
 
     destroyed = true;
     available = false;
diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp
index 3a405fb7ee..c52f2a86f3 100644
--- a/examples/device/ep/csrc/nixl_ep.hpp
+++ b/examples/device/ep/csrc/nixl_ep.hpp
@@ -35,7 +35,9 @@
 #include <vector>
 #include <string>
 
+#include <cuda.h>
 #include <memory>
+#include <stdexcept>
 #include "config.hpp"
 #include "event.hpp"
 #include "kernels/configs.cuh"
@@ -47,6 +49,104 @@
 #define TORCH_EXTENSION_NAME nixl_ep_cpp
 #endif
 
+/* CUDA memory allocator using VMM. Uses fabric handle type if the device
+ * supports it (CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED), otherwise
+ * falls back to CU_MEM_HANDLE_TYPE_NONE. */
+class cuda_allocator {
+public:
+    cuda_allocator(size_t size) : m_size(0), m_ptr(0), m_alloc_handle(0)
+    {
+        if (size == 0) {
+            throw std::invalid_argument("cuda_allocator: size must be non-zero");
+        }
+
+        CUdevice device;
+        if (cuCtxGetDevice(&device) != CUDA_SUCCESS) {
+            throw std::runtime_error("Failed to get CUDA device handle");
+        }
+
+        int fabric_supported = 0;
+        cuDeviceGetAttribute(&fabric_supported,
+                             CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED,
+                             device);
+
+        CUmemAllocationProp prop = {};
+        prop.type                 = CU_MEM_ALLOCATION_TYPE_PINNED;
+        prop.location.type        = CU_MEM_LOCATION_TYPE_DEVICE;
+        prop.location.id          = device;
+        prop.requestedHandleTypes = fabric_supported ? CU_MEM_HANDLE_TYPE_FABRIC
+                                                     : CU_MEM_HANDLE_TYPE_NONE;
+
+        size_t granularity = 0;
+        if (cuMemGetAllocationGranularity(&granularity, &prop,
+                                          CU_MEM_ALLOC_GRANULARITY_MINIMUM) !=
+            CUDA_SUCCESS) {
+            throw std::runtime_error("Failed to get CUDA allocation granularity");
+        }
+
+        init_vmm(size, device, prop, granularity);
+    }
+
+    ~cuda_allocator()
+    {
+        if (m_ptr) {
+            cuMemUnmap(m_ptr, m_size);
+            cuMemAddressFree(m_ptr, m_size);
+        }
+        if (m_alloc_handle) {
+            cuMemRelease(m_alloc_handle);
+        }
+    }
+
+    void*  ptr()  const { return reinterpret_cast<void*>(m_ptr); }
+    size_t size() const { return m_size; }
+
+    cuda_allocator(const cuda_allocator&)            = delete;
+    cuda_allocator& operator=(const cuda_allocator&) = delete;
+
+private:
+    void init_vmm(size_t size, CUdevice device, const CUmemAllocationProp &prop,
+                  size_t granularity)
+    {
+        m_size = (size + granularity - 1) / granularity * granularity;
+
+        if (cuMemCreate(&m_alloc_handle, m_size, &prop, 0) != CUDA_SUCCESS) {
+            throw std::runtime_error("Failed to create CUDA VMM allocation");
+        }
+
+        if (cuMemAddressReserve(&m_ptr, m_size, 0, 0, 0) != CUDA_SUCCESS) {
+            cuMemRelease(m_alloc_handle);
+            m_alloc_handle = 0;
+            throw std::runtime_error("Failed to reserve CUDA virtual address");
+        }
+
+        if (cuMemMap(m_ptr, m_size, 0, m_alloc_handle, 0) != CUDA_SUCCESS) {
+            cuMemAddressFree(m_ptr, m_size);
+            m_ptr = 0;
+            cuMemRelease(m_alloc_handle);
+            m_alloc_handle = 0;
+            throw std::runtime_error("Failed to map CUDA VMM memory");
+        }
+
+        CUmemAccessDesc access_desc = {};
+        access_desc.location.type   = CU_MEM_LOCATION_TYPE_DEVICE;
+        access_desc.location.id     = device;
+        access_desc.flags           = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+        if (cuMemSetAccess(m_ptr, m_size, &access_desc, 1) != CUDA_SUCCESS) {
+            cuMemUnmap(m_ptr, m_size);
+            cuMemAddressFree(m_ptr, m_size);
+            m_ptr = 0;
+            cuMemRelease(m_alloc_handle);
+            m_alloc_handle = 0;
+            throw std::runtime_error("Failed to set CUDA memory access");
+        }
+    }
+
+    size_t                        m_size;
+    CUdeviceptr                   m_ptr;
+    CUmemGenericAllocationHandle  m_alloc_handle;
+};
+
 namespace nixl_ep {
 
 struct NixlPeerInfo {
@@ -83,6 +183,13 @@ struct Buffer {
     int *sync_buffer_ptr = nullptr;
     int *sync_count_ptr = nullptr;
 
+    // Owning allocators (keep raw ptrs above as aliases for use throughout)
+    std::unique_ptr<cuda_allocator> m_rdma_alloc;
+    std::unique_ptr<cuda_allocator> m_mask_alloc;
+    std::unique_ptr<cuda_allocator> m_sync_alloc;
+    std::unique_ptr<cuda_allocator> m_sync_count_alloc;
+    std::unique_ptr<cuda_allocator> m_workspace_alloc;
+
     // Device info and communication
     int device_id;
     int num_device_sms;

From c36a4c2655dbea1a742dbcfe9786c6516029849e Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@funk11.mec01.labs.mlnx>
Date: Sat, 7 Mar 2026 17:33:33 +0200
Subject: [PATCH 02/29] NIXL/EP: Use vmm API instead of cudaMalloc

---
 examples/device/ep/csrc/nixl_ep.hpp | 65 ++++++++++++++++++-----------
 1 file changed, 40 insertions(+), 25 deletions(-)

diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp
index c52f2a86f3..003c9a317f 100644
--- a/examples/device/ep/csrc/nixl_ep.hpp
+++ b/examples/device/ep/csrc/nixl_ep.hpp
@@ -36,6 +36,7 @@
 #include <string>
 
 #include <cuda.h>
+#include <cuda_runtime.h>
 #include <memory>
 #include <stdexcept>
 #include "config.hpp"
@@ -49,12 +50,15 @@
 #define TORCH_EXTENSION_NAME nixl_ep_cpp
 #endif
 
-/* CUDA memory allocator using VMM. Uses fabric handle type if the device
- * supports it (CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED), otherwise
- * falls back to CU_MEM_HANDLE_TYPE_NONE. */
+/* CUDA memory allocator. Uses fabric VMM (cuMemCreate) if the device supports
+ * CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, otherwise falls back to
+ * cudaMalloc. The cudaMalloc fallback is intentional: VMM memory without a
+ * fabric handle provides no benefit and is incompatible with GDRCopy, which
+ * requires traditionally allocated memory for kernel-level pinning. */
 class cuda_allocator {
 public:
-    cuda_allocator(size_t size) : m_size(0), m_ptr(0), m_alloc_handle(0)
+    cuda_allocator(size_t size) : m_size(size), m_ptr(0), m_alloc_handle(0),
+                                  m_cuda_ptr(nullptr)
     {
         if (size == 0) {
             throw std::invalid_argument("cuda_allocator: size must be non-zero");
@@ -70,21 +74,11 @@ class cuda_allocator {
                              CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED,
                              device);
 
-        CUmemAllocationProp prop = {};
-        prop.type                 = CU_MEM_ALLOCATION_TYPE_PINNED;
-        prop.location.type        = CU_MEM_LOCATION_TYPE_DEVICE;
-        prop.location.id          = device;
-        prop.requestedHandleTypes = fabric_supported ? CU_MEM_HANDLE_TYPE_FABRIC
-                                                     : CU_MEM_HANDLE_TYPE_NONE;
-
-        size_t granularity = 0;
-        if (cuMemGetAllocationGranularity(&granularity, &prop,
-                                          CU_MEM_ALLOC_GRANULARITY_MINIMUM) !=
-            CUDA_SUCCESS) {
-            throw std::runtime_error("Failed to get CUDA allocation granularity");
+        if (fabric_supported) {
+            init_vmm(size, device);
+        } else {
+            init_regular(size);
         }
-
-        init_vmm(size, device, prop, granularity);
     }
 
     ~cuda_allocator()
@@ -92,26 +86,39 @@ class cuda_allocator {
         if (m_ptr) {
             cuMemUnmap(m_ptr, m_size);
             cuMemAddressFree(m_ptr, m_size);
-        }
-        if (m_alloc_handle) {
             cuMemRelease(m_alloc_handle);
+        } else if (m_cuda_ptr) {
+            cudaFree(m_cuda_ptr);
         }
     }
 
-    void*  ptr()  const { return reinterpret_cast<void*>(m_ptr); }
+    void*  ptr()  const { return m_ptr ? reinterpret_cast<void*>(m_ptr)
+                                       : m_cuda_ptr; }
     size_t size() const { return m_size; }
 
     cuda_allocator(const cuda_allocator&)            = delete;
     cuda_allocator& operator=(const cuda_allocator&) = delete;
 
 private:
-    void init_vmm(size_t size, CUdevice device, const CUmemAllocationProp &prop,
-                  size_t granularity)
+    void init_vmm(size_t size, CUdevice device)
     {
+        CUmemAllocationProp prop = {};
+        prop.type                 = CU_MEM_ALLOCATION_TYPE_PINNED;
+        prop.location.type        = CU_MEM_LOCATION_TYPE_DEVICE;
+        prop.location.id          = device;
+        prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
+
+        size_t granularity = 0;
+        if (cuMemGetAllocationGranularity(&granularity, &prop,
+                                          CU_MEM_ALLOC_GRANULARITY_MINIMUM) !=
+            CUDA_SUCCESS) {
+            throw std::runtime_error("Failed to get CUDA allocation granularity");
+        }
+
         m_size = (size + granularity - 1) / granularity * granularity;
 
         if (cuMemCreate(&m_alloc_handle, m_size, &prop, 0) != CUDA_SUCCESS) {
-            throw std::runtime_error("Failed to create CUDA VMM allocation");
+            throw std::runtime_error("Failed to create CUDA fabric VMM allocation");
         }
 
         if (cuMemAddressReserve(&m_ptr, m_size, 0, 0, 0) != CUDA_SUCCESS) {
@@ -125,7 +132,7 @@ class cuda_allocator {
             m_ptr = 0;
             cuMemRelease(m_alloc_handle);
             m_alloc_handle = 0;
-            throw std::runtime_error("Failed to map CUDA VMM memory");
+            throw std::runtime_error("Failed to map CUDA fabric VMM memory");
         }
 
         CUmemAccessDesc access_desc = {};
@@ -142,9 +149,17 @@ class cuda_allocator {
         }
     }
 
+    void init_regular(size_t size)
+    {
+        if (cudaMalloc(&m_cuda_ptr, size) != cudaSuccess) {
+            throw std::runtime_error("Failed to allocate CUDA memory");
+        }
+    }
+
     size_t                        m_size;
     CUdeviceptr                   m_ptr;
     CUmemGenericAllocationHandle  m_alloc_handle;
+    void                         *m_cuda_ptr;
 };
 
 namespace nixl_ep {

From 29c2c7ae79f7bb13f80ad351774c6f30134cbfc5 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@funk11.mec01.labs.mlnx>
Date: Sat, 7 Mar 2026 18:13:13 +0200
Subject: [PATCH 03/29] NIXL/EP: revert

---
 examples/device/ep/csrc/nixl_ep.hpp | 65 +++++++++++------------------
 1 file changed, 25 insertions(+), 40 deletions(-)

diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp
index 003c9a317f..c52f2a86f3 100644
--- a/examples/device/ep/csrc/nixl_ep.hpp
+++ b/examples/device/ep/csrc/nixl_ep.hpp
@@ -36,7 +36,6 @@
 #include <string>
 
 #include <cuda.h>
-#include <cuda_runtime.h>
 #include <memory>
 #include <stdexcept>
 #include "config.hpp"
@@ -50,15 +49,12 @@
 #define TORCH_EXTENSION_NAME nixl_ep_cpp
 #endif
 
-/* CUDA memory allocator. Uses fabric VMM (cuMemCreate) if the device supports
- * CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, otherwise falls back to
- * cudaMalloc. The cudaMalloc fallback is intentional: VMM memory without a
- * fabric handle provides no benefit and is incompatible with GDRCopy, which
- * requires traditionally allocated memory for kernel-level pinning. */
+/* CUDA memory allocator using VMM. Uses fabric handle type if the device
+ * supports it (CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED), otherwise
+ * falls back to CU_MEM_HANDLE_TYPE_NONE. */
 class cuda_allocator {
 public:
-    cuda_allocator(size_t size) : m_size(size), m_ptr(0), m_alloc_handle(0),
-                                  m_cuda_ptr(nullptr)
+    cuda_allocator(size_t size) : m_size(0), m_ptr(0), m_alloc_handle(0)
     {
         if (size == 0) {
             throw std::invalid_argument("cuda_allocator: size must be non-zero");
@@ -74,11 +70,21 @@ class cuda_allocator {
                              CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED,
                              device);
 
-        if (fabric_supported) {
-            init_vmm(size, device);
-        } else {
-            init_regular(size);
+        CUmemAllocationProp prop = {};
+        prop.type                 = CU_MEM_ALLOCATION_TYPE_PINNED;
+        prop.location.type        = CU_MEM_LOCATION_TYPE_DEVICE;
+        prop.location.id          = device;
+        prop.requestedHandleTypes = fabric_supported ? CU_MEM_HANDLE_TYPE_FABRIC
+                                                     : CU_MEM_HANDLE_TYPE_NONE;
+
+        size_t granularity = 0;
+        if (cuMemGetAllocationGranularity(&granularity, &prop,
+                                          CU_MEM_ALLOC_GRANULARITY_MINIMUM) !=
+            CUDA_SUCCESS) {
+            throw std::runtime_error("Failed to get CUDA allocation granularity");
         }
+
+        init_vmm(size, device, prop, granularity);
     }
 
     ~cuda_allocator()
@@ -86,39 +92,26 @@ class cuda_allocator {
         if (m_ptr) {
             cuMemUnmap(m_ptr, m_size);
             cuMemAddressFree(m_ptr, m_size);
+        }
+        if (m_alloc_handle) {
             cuMemRelease(m_alloc_handle);
-        } else if (m_cuda_ptr) {
-            cudaFree(m_cuda_ptr);
         }
     }
 
-    void*  ptr()  const { return m_ptr ? reinterpret_cast<void*>(m_ptr)
-                                       : m_cuda_ptr; }
+    void*  ptr()  const { return reinterpret_cast<void*>(m_ptr); }
     size_t size() const { return m_size; }
 
     cuda_allocator(const cuda_allocator&)            = delete;
     cuda_allocator& operator=(const cuda_allocator&) = delete;
 
 private:
-    void init_vmm(size_t size, CUdevice device)
+    void init_vmm(size_t size, CUdevice device, const CUmemAllocationProp &prop,
+                  size_t granularity)
     {
-        CUmemAllocationProp prop = {};
-        prop.type                 = CU_MEM_ALLOCATION_TYPE_PINNED;
-        prop.location.type        = CU_MEM_LOCATION_TYPE_DEVICE;
-        prop.location.id          = device;
-        prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
-
-        size_t granularity = 0;
-        if (cuMemGetAllocationGranularity(&granularity, &prop,
-                                          CU_MEM_ALLOC_GRANULARITY_MINIMUM) !=
-            CUDA_SUCCESS) {
-            throw std::runtime_error("Failed to get CUDA allocation granularity");
-        }
-
         m_size = (size + granularity - 1) / granularity * granularity;
 
         if (cuMemCreate(&m_alloc_handle, m_size, &prop, 0) != CUDA_SUCCESS) {
-            throw std::runtime_error("Failed to create CUDA fabric VMM allocation");
+            throw std::runtime_error("Failed to create CUDA VMM allocation");
         }
 
         if (cuMemAddressReserve(&m_ptr, m_size, 0, 0, 0) != CUDA_SUCCESS) {
@@ -132,7 +125,7 @@ class cuda_allocator {
             m_ptr = 0;
             cuMemRelease(m_alloc_handle);
             m_alloc_handle = 0;
-            throw std::runtime_error("Failed to map CUDA fabric VMM memory");
+            throw std::runtime_error("Failed to map CUDA VMM memory");
         }
 
         CUmemAccessDesc access_desc = {};
@@ -149,17 +142,9 @@ class cuda_allocator {
         }
     }
 
-    void init_regular(size_t size)
-    {
-        if (cudaMalloc(&m_cuda_ptr, size) != cudaSuccess) {
-            throw std::runtime_error("Failed to allocate CUDA memory");
-        }
-    }
-
     size_t                        m_size;
     CUdeviceptr                   m_ptr;
     CUmemGenericAllocationHandle  m_alloc_handle;
-    void                         *m_cuda_ptr;
 };
 
 namespace nixl_ep {

From daecca92684a6d58662e7af006d02aba8af72682 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@funk11.mec01.labs.mlnx>
Date: Sun, 8 Mar 2026 08:20:24 +0200
Subject: [PATCH 04/29] NIXL/EP: Support gdr copy with vmm.

---
 examples/device/ep/csrc/nixl_ep.hpp | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp
index c52f2a86f3..98a0993494 100644
--- a/examples/device/ep/csrc/nixl_ep.hpp
+++ b/examples/device/ep/csrc/nixl_ep.hpp
@@ -36,6 +36,7 @@
 #include <string>
 
 #include <cuda.h>
+#include <cuda_runtime.h>
 #include <memory>
 #include <stdexcept>
 #include "config.hpp"
@@ -49,9 +50,10 @@
 #define TORCH_EXTENSION_NAME nixl_ep_cpp
 #endif
 
-/* CUDA memory allocator using VMM. Uses fabric handle type if the device
- * supports it (CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED), otherwise
- * falls back to CU_MEM_HANDLE_TYPE_NONE. */
+/* CUDA memory allocator using VMM with gpuDirectRDMACapable. Requires
+ * CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED. Uses a fabric
+ * handle type if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is also set,
+ * otherwise uses CU_MEM_HANDLE_TYPE_NONE. */
 class cuda_allocator {
 public:
     cuda_allocator(size_t size) : m_size(0), m_ptr(0), m_alloc_handle(0)
@@ -65,17 +67,27 @@ class cuda_allocator {
             throw std::runtime_error("Failed to get CUDA device handle");
         }
 
+        int rdma_vmm_supported = 0;
+        cuDeviceGetAttribute(&rdma_vmm_supported,
+                             CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
+                             device);
+        if (!rdma_vmm_supported) {
+            throw std::runtime_error("GPUDirect RDMA with CUDA VMM is not supported on this device");
+        }
+
         int fabric_supported = 0;
         cuDeviceGetAttribute(&fabric_supported,
                              CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED,
                              device);
 
         CUmemAllocationProp prop = {};
-        prop.type                 = CU_MEM_ALLOCATION_TYPE_PINNED;
-        prop.location.type        = CU_MEM_LOCATION_TYPE_DEVICE;
-        prop.location.id          = device;
-        prop.requestedHandleTypes = fabric_supported ? CU_MEM_HANDLE_TYPE_FABRIC
-                                                     : CU_MEM_HANDLE_TYPE_NONE;
+        prop.type                          = CU_MEM_ALLOCATION_TYPE_PINNED;
+        prop.location.type                 = CU_MEM_LOCATION_TYPE_DEVICE;
+        prop.location.id                   = device;
+        prop.allocFlags.gpuDirectRDMACapable = 1;
+        prop.requestedHandleTypes          = fabric_supported ?
+                                             CU_MEM_HANDLE_TYPE_FABRIC :
+                                             CU_MEM_HANDLE_TYPE_NONE;
 
         size_t granularity = 0;
         if (cuMemGetAllocationGranularity(&granularity, &prop,

From 3f5f55ff27718d186b2f4b36c698de0f1a71d152 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Sun, 8 Mar 2026 18:17:38 +0200
Subject: [PATCH 05/29] NIXL/EP: Improve.

---
 examples/device/ep/csrc/nixl_ep.hpp | 50 ++++++++++++++++-------------
 1 file changed, 27 insertions(+), 23 deletions(-)

diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp
index 98a0993494..ba9d3742da 100644
--- a/examples/device/ep/csrc/nixl_ep.hpp
+++ b/examples/device/ep/csrc/nixl_ep.hpp
@@ -50,10 +50,7 @@
 #define TORCH_EXTENSION_NAME nixl_ep_cpp
 #endif
 
-/* CUDA memory allocator using VMM with gpuDirectRDMACapable. Requires
- * CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED. Uses a fabric
- * handle type if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is also set,
- * otherwise uses CU_MEM_HANDLE_TYPE_NONE. */
+/* CUDA memory allocator using VMM. */
 class cuda_allocator {
 public:
     cuda_allocator(size_t size) : m_size(0), m_ptr(0), m_alloc_handle(0)
@@ -105,6 +102,7 @@ class cuda_allocator {
             cuMemUnmap(m_ptr, m_size);
             cuMemAddressFree(m_ptr, m_size);
         }
+
         if (m_alloc_handle) {
             cuMemRelease(m_alloc_handle);
         }
@@ -120,38 +118,44 @@ class cuda_allocator {
     void init_vmm(size_t size, CUdevice device, const CUmemAllocationProp &prop,
                   size_t granularity)
     {
-        m_size = (size + granularity - 1) / granularity * granularity;
+        CUmemAccessDesc access_desc = {};
+        const char *err_msg;
+
+        m_size = nixl_ep::align_up<size_t>(size, granularity);
 
         if (cuMemCreate(&m_alloc_handle, m_size, &prop, 0) != CUDA_SUCCESS) {
             throw std::runtime_error("Failed to create CUDA VMM allocation");
         }
 
         if (cuMemAddressReserve(&m_ptr, m_size, 0, 0, 0) != CUDA_SUCCESS) {
-            cuMemRelease(m_alloc_handle);
-            m_alloc_handle = 0;
-            throw std::runtime_error("Failed to reserve CUDA virtual address");
+            err_msg = "Failed to reserve CUDA virtual address";
+            goto err_release;
         }
 
         if (cuMemMap(m_ptr, m_size, 0, m_alloc_handle, 0) != CUDA_SUCCESS) {
-            cuMemAddressFree(m_ptr, m_size);
-            m_ptr = 0;
-            cuMemRelease(m_alloc_handle);
-            m_alloc_handle = 0;
-            throw std::runtime_error("Failed to map CUDA VMM memory");
+            err_msg = "Failed to map CUDA VMM memory";
+            goto err_free;
         }
 
-        CUmemAccessDesc access_desc = {};
-        access_desc.location.type   = CU_MEM_LOCATION_TYPE_DEVICE;
-        access_desc.location.id     = device;
-        access_desc.flags           = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+        access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        access_desc.location.id   = device;
+        access_desc.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
         if (cuMemSetAccess(m_ptr, m_size, &access_desc, 1) != CUDA_SUCCESS) {
-            cuMemUnmap(m_ptr, m_size);
-            cuMemAddressFree(m_ptr, m_size);
-            m_ptr = 0;
-            cuMemRelease(m_alloc_handle);
-            m_alloc_handle = 0;
-            throw std::runtime_error("Failed to set CUDA memory access");
+            err_msg = "Failed to set CUDA memory access";
+            goto err_unmap;
         }
+
+        return;
+
+err_unmap:
+        cuMemUnmap(m_ptr, m_size);
+err_free:
+        cuMemAddressFree(m_ptr, m_size);
+        m_ptr = 0;
+err_release:
+        cuMemRelease(m_alloc_handle);
+        m_alloc_handle = 0;
+        throw std::runtime_error(err_msg);
     }
 
     size_t                        m_size;

From 07674eef76c02af4d4649789d28a71f15afb5790 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Sun, 8 Mar 2026 18:43:43 +0200
Subject: [PATCH 06/29] NIXL/EP: Format.

---
 examples/device/ep/csrc/nixl_ep.cpp | 14 +++----
 examples/device/ep/csrc/nixl_ep.hpp | 65 +++++++++++++++--------------
 2 files changed, 41 insertions(+), 38 deletions(-)

diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp
index 00884602f7..386a66675c 100644
--- a/examples/device/ep/csrc/nixl_ep.cpp
+++ b/examples/device/ep/csrc/nixl_ep.cpp
@@ -105,26 +105,26 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte
 
     // Create 32 MiB workspace
     m_workspace_alloc = std::make_unique<cuda_allocator>(NUM_WORKSPACE_BYTES);
-    workspace         = m_workspace_alloc->ptr();
+    workspace = m_workspace_alloc->ptr();
     CUDA_CHECK(cudaMemsetAsync(workspace, 0, NUM_WORKSPACE_BYTES, comm_stream));
 
     EP_HOST_ASSERT(max_experts_per_rank > 0);
-    m_rdma_alloc    = std::make_unique<cuda_allocator>(num_rdma_bytes);
+    m_rdma_alloc = std::make_unique<cuda_allocator>(num_rdma_bytes);
     rdma_buffer_ptr = m_rdma_alloc->ptr();
     CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes));
 
     // Allocate and clean shrink buffer
     int num_mask_buffer_bytes = max_num_ranks * sizeof(int);
-    m_mask_alloc    = std::make_unique<cuda_allocator>(num_mask_buffer_bytes);
-    mask_buffer_ptr = static_cast<int*>(m_mask_alloc->ptr());
+    m_mask_alloc = std::make_unique<cuda_allocator>(num_mask_buffer_bytes);
+    mask_buffer_ptr = static_cast<int *>(m_mask_alloc->ptr());
     CUDA_CHECK(cudaMemset(mask_buffer_ptr, 0xff, num_mask_buffer_bytes));
     CUDA_CHECK(cudaMemset(mask_buffer_ptr + rank, 0, sizeof(int)));
 
     int num_sync_buffer_bytes = max_num_ranks * sizeof(int);
-    m_sync_alloc      = std::make_unique<cuda_allocator>(num_sync_buffer_bytes);
-    sync_buffer_ptr   = static_cast<int*>(m_sync_alloc->ptr());
+    m_sync_alloc = std::make_unique<cuda_allocator>(num_sync_buffer_bytes);
+    sync_buffer_ptr = static_cast<int *>(m_sync_alloc->ptr());
     m_sync_count_alloc = std::make_unique<cuda_allocator>(num_sync_buffer_bytes);
-    sync_count_ptr    = static_cast<int*>(m_sync_count_alloc->ptr());
+    sync_count_ptr = static_cast<int *>(m_sync_count_alloc->ptr());
     CUDA_CHECK(cudaMemset(sync_buffer_ptr, 0, num_sync_buffer_bytes));
     CUDA_CHECK(cudaMemset(sync_count_ptr, 0, num_sync_buffer_bytes));
     CUDA_CHECK(cudaDeviceSynchronize());
diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp
index ba9d3742da..f811bd3ff6 100644
--- a/examples/device/ep/csrc/nixl_ep.hpp
+++ b/examples/device/ep/csrc/nixl_ep.hpp
@@ -53,8 +53,7 @@
 /* CUDA memory allocator using VMM. */
 class cuda_allocator {
 public:
-    cuda_allocator(size_t size) : m_size(0), m_ptr(0), m_alloc_handle(0)
-    {
+    cuda_allocator(size_t size) : m_size(0), m_ptr(0), m_alloc_handle(0) {
         if (size == 0) {
             throw std::invalid_argument("cuda_allocator: size must be non-zero");
         }
@@ -69,26 +68,24 @@ class cuda_allocator {
                              CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
                              device);
         if (!rdma_vmm_supported) {
-            throw std::runtime_error("GPUDirect RDMA with CUDA VMM is not supported on this device");
+            throw std::runtime_error(
+                "GPUDirect RDMA with CUDA VMM is not supported on this device");
         }
 
         int fabric_supported = 0;
-        cuDeviceGetAttribute(&fabric_supported,
-                             CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED,
-                             device);
+        cuDeviceGetAttribute(
+            &fabric_supported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device);
 
         CUmemAllocationProp prop = {};
-        prop.type                          = CU_MEM_ALLOCATION_TYPE_PINNED;
-        prop.location.type                 = CU_MEM_LOCATION_TYPE_DEVICE;
-        prop.location.id                   = device;
+        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        prop.location.id = device;
         prop.allocFlags.gpuDirectRDMACapable = 1;
-        prop.requestedHandleTypes          = fabric_supported ?
-                                             CU_MEM_HANDLE_TYPE_FABRIC :
-                                             CU_MEM_HANDLE_TYPE_NONE;
+        prop.requestedHandleTypes =
+            fabric_supported ? CU_MEM_HANDLE_TYPE_FABRIC : CU_MEM_HANDLE_TYPE_NONE;
 
         size_t granularity = 0;
-        if (cuMemGetAllocationGranularity(&granularity, &prop,
-                                          CU_MEM_ALLOC_GRANULARITY_MINIMUM) !=
+        if (cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) !=
             CUDA_SUCCESS) {
             throw std::runtime_error("Failed to get CUDA allocation granularity");
         }
@@ -96,8 +93,7 @@ class cuda_allocator {
         init_vmm(size, device, prop, granularity);
     }
 
-    ~cuda_allocator()
-    {
+    ~cuda_allocator() {
         if (m_ptr) {
             cuMemUnmap(m_ptr, m_size);
             cuMemAddressFree(m_ptr, m_size);
@@ -108,16 +104,23 @@ class cuda_allocator {
         }
     }
 
-    void*  ptr()  const { return reinterpret_cast<void*>(m_ptr); }
-    size_t size() const { return m_size; }
+    void *
+    ptr() const {
+        return reinterpret_cast<void *>(m_ptr);
+    }
+
+    size_t
+    size() const {
+        return m_size;
+    }
 
-    cuda_allocator(const cuda_allocator&)            = delete;
-    cuda_allocator& operator=(const cuda_allocator&) = delete;
+    cuda_allocator(const cuda_allocator &) = delete;
+    cuda_allocator &
+    operator=(const cuda_allocator &) = delete;
 
 private:
-    void init_vmm(size_t size, CUdevice device, const CUmemAllocationProp &prop,
-                  size_t granularity)
-    {
+    void
+    init_vmm(size_t size, CUdevice device, const CUmemAllocationProp &prop, size_t granularity) {
         CUmemAccessDesc access_desc = {};
         const char *err_msg;
 
@@ -138,8 +141,8 @@ class cuda_allocator {
         }
 
         access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-        access_desc.location.id   = device;
-        access_desc.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+        access_desc.location.id = device;
+        access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
         if (cuMemSetAccess(m_ptr, m_size, &access_desc, 1) != CUDA_SUCCESS) {
             err_msg = "Failed to set CUDA memory access";
             goto err_unmap;
@@ -147,20 +150,20 @@ class cuda_allocator {
 
         return;
 
-err_unmap:
+    err_unmap:
         cuMemUnmap(m_ptr, m_size);
-err_free:
+    err_free:
         cuMemAddressFree(m_ptr, m_size);
         m_ptr = 0;
-err_release:
+    err_release:
         cuMemRelease(m_alloc_handle);
         m_alloc_handle = 0;
         throw std::runtime_error(err_msg);
     }
 
-    size_t                        m_size;
-    CUdeviceptr                   m_ptr;
-    CUmemGenericAllocationHandle  m_alloc_handle;
+    size_t m_size;
+    CUdeviceptr m_ptr;
+    CUmemGenericAllocationHandle m_alloc_handle;
 };
 
 namespace nixl_ep {

From bbb8d277d08c0819b6366c998ff26ea59f34e468 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Sun, 8 Mar 2026 22:56:20 +0200
Subject: [PATCH 07/29] NIXL/EP: check return val.

---
 examples/device/ep/csrc/nixl_ep.hpp | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp
index f811bd3ff6..46d66bdec3 100644
--- a/examples/device/ep/csrc/nixl_ep.hpp
+++ b/examples/device/ep/csrc/nixl_ep.hpp
@@ -64,17 +64,24 @@ class cuda_allocator {
         }
 
         int rdma_vmm_supported = 0;
-        cuDeviceGetAttribute(&rdma_vmm_supported,
-                             CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
-                             device);
+        if (cuDeviceGetAttribute(&rdma_vmm_supported,
+                                 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
+                                 device) != CUDA_SUCCESS) {
+            throw std::runtime_error(
+                "Failed to query GPUDirect RDMA with VMM support attribute");
+        }
         if (!rdma_vmm_supported) {
             throw std::runtime_error(
                 "GPUDirect RDMA with CUDA VMM is not supported on this device");
         }
 
         int fabric_supported = 0;
-        cuDeviceGetAttribute(
-            &fabric_supported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device);
+        if (cuDeviceGetAttribute(&fabric_supported,
+                                 CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED,
+                                 device) != CUDA_SUCCESS) {
+            throw std::runtime_error(
+                "Failed to query fabric handle type support attribute");
+        }
 
         CUmemAllocationProp prop = {};
         prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;

From 7b60106d5480edc80689699ef9a5a311c958aec0 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Sun, 8 Mar 2026 22:57:52 +0200
Subject: [PATCH 08/29] NIXL/EP: Format.

---
 examples/device/ep/csrc/nixl_ep.hpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp
index 46d66bdec3..a8551d4384 100644
--- a/examples/device/ep/csrc/nixl_ep.hpp
+++ b/examples/device/ep/csrc/nixl_ep.hpp
@@ -67,9 +67,9 @@ class cuda_allocator {
         if (cuDeviceGetAttribute(&rdma_vmm_supported,
                                  CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
                                  device) != CUDA_SUCCESS) {
-            throw std::runtime_error(
-                "Failed to query GPUDirect RDMA with VMM support attribute");
+            throw std::runtime_error("Failed to query GPUDirect RDMA with VMM support attribute");
         }
+
         if (!rdma_vmm_supported) {
             throw std::runtime_error(
                 "GPUDirect RDMA with CUDA VMM is not supported on this device");
@@ -79,8 +79,7 @@ class cuda_allocator {
         if (cuDeviceGetAttribute(&fabric_supported,
                                  CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED,
                                  device) != CUDA_SUCCESS) {
-            throw std::runtime_error(
-                "Failed to query fabric handle type support attribute");
+            throw std::runtime_error("Failed to query fabric handle type support attribute");
         }
 
         CUmemAllocationProp prop = {};

From 25b2e5bb6e394848b60a3bf226ca331b55c9c0c0 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Mon, 9 Mar 2026 18:28:40 +0200
Subject: [PATCH 09/29] NIXL/EP:Improve.

---
 examples/device/ep/csrc/nixl_ep.cpp | 149 +++++++++++++++++++++++++---
 examples/device/ep/csrc/nixl_ep.hpp | 136 ++-----------------------
 2 files changed, 144 insertions(+), 141 deletions(-)

diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp
index 386a66675c..8104cbe815 100644
--- a/examples/device/ep/csrc/nixl_ep.cpp
+++ b/examples/device/ep/csrc/nixl_ep.cpp
@@ -61,6 +61,125 @@
 
 namespace nixl_ep {
 
+vmm_region vmm_init(size_t size)
+{
+    if (size == 0) {
+        throw std::invalid_argument("vmm_init: size must be non-zero");
+    }
+
+    struct cuda_alloc_ctx {
+        CUdevice            device;
+        CUmemAllocationProp prop;
+        size_t              granularity;
+
+        cuda_alloc_ctx() : device(0), prop({}), granularity(0)
+        {
+            int version;
+            if (cuDriverGetVersion(&version) != CUDA_SUCCESS) {
+                throw std::runtime_error("Failed to get CUDA driver version");
+            }
+            if (version < 11000) {
+                throw std::runtime_error(
+                    "VMM with RDMA is not supported in this CUDA version");
+            }
+
+            if (cuCtxGetDevice(&device) != CUDA_SUCCESS) {
+                throw std::runtime_error("Failed to get CUDA device handle");
+            }
+
+            int rdma_vmm_supported = 0;
+            if (cuDeviceGetAttribute(
+                        &rdma_vmm_supported,
+                        CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
+                        device) != CUDA_SUCCESS) {
+                throw std::runtime_error(
+                    "Failed to query GPUDirect RDMA with VMM support attribute");
+            }
+            if (!rdma_vmm_supported) {
+                throw std::runtime_error(
+                    "GPUDirect RDMA with CUDA VMM is not supported on this device");
+            }
+
+            int fabric_supported = 0;
+            if (cuDeviceGetAttribute(&fabric_supported,
+                                     CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED,
+                                     device) != CUDA_SUCCESS) {
+                throw std::runtime_error(
+                    "Failed to query fabric handle type support attribute");
+            }
+
+            prop.type                            = CU_MEM_ALLOCATION_TYPE_PINNED;
+            prop.location.type                   = CU_MEM_LOCATION_TYPE_DEVICE;
+            prop.location.id                     = device;
+            prop.allocFlags.gpuDirectRDMACapable = 1;
+            prop.requestedHandleTypes            = fabric_supported ?
+                                                   CU_MEM_HANDLE_TYPE_FABRIC :
+                                                   CU_MEM_HANDLE_TYPE_NONE;
+
+            if (cuMemGetAllocationGranularity(&granularity, &prop,
+                                              CU_MEM_ALLOC_GRANULARITY_MINIMUM) !=
+                CUDA_SUCCESS) {
+                throw std::runtime_error(
+                    "Failed to get CUDA allocation granularity");
+            }
+        }
+    };
+    static cuda_alloc_ctx ctx;
+
+    vmm_region      region      = {};
+    CUmemAccessDesc access_desc = {};
+    const char     *err_msg;
+
+    region.size = align_up<size_t>(size, ctx.granularity);
+
+    if (cuMemCreate(&region.handle, region.size, &ctx.prop, 0) != CUDA_SUCCESS) {
+        throw std::runtime_error("Failed to create CUDA VMM allocation");
+    }
+
+    if (cuMemAddressReserve(&region.ptr, region.size, 0, 0, 0) != CUDA_SUCCESS) {
+        err_msg = "Failed to reserve CUDA virtual address";
+        goto err_release;
+    }
+
+    if (cuMemMap(region.ptr, region.size, 0, region.handle, 0) != CUDA_SUCCESS) {
+        err_msg = "Failed to map CUDA VMM memory";
+        goto err_free;
+    }
+
+    access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    access_desc.location.id   = ctx.device;
+    access_desc.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+    if (cuMemSetAccess(region.ptr, region.size, &access_desc, 1) != CUDA_SUCCESS) {
+        err_msg = "Failed to set CUDA memory access";
+        goto err_unmap;
+    }
+
+    return region;
+
+err_unmap:
+    cuMemUnmap(region.ptr, region.size);
+err_free:
+    cuMemAddressFree(region.ptr, region.size);
+    region.ptr = 0;
+err_release:
+    cuMemRelease(region.handle);
+    region.handle = 0;
+    throw std::runtime_error(err_msg);
+}
+
+void vmm_free(vmm_region &region)
+{
+    if (region.ptr) {
+        cuMemUnmap(region.ptr, region.size);
+        cuMemAddressFree(region.ptr, region.size);
+        region.ptr = 0;
+    }
+    if (region.handle) {
+        cuMemRelease(region.handle);
+        region.handle = 0;
+    }
+}
+
 static void sleep_ms(int milliseconds) {
     std::this_thread::sleep_for(std::chrono::milliseconds(milliseconds));
 }
@@ -104,27 +223,27 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte
     EP_HOST_ASSERT(per_channel_bytes < std::numeric_limits<int>::max());
 
     // Create 32 MiB workspace
-    m_workspace_alloc = std::make_unique<cuda_allocator>(NUM_WORKSPACE_BYTES);
-    workspace = m_workspace_alloc->ptr();
+    m_workspace_alloc = vmm_init(NUM_WORKSPACE_BYTES);
+    workspace         = reinterpret_cast<void *>(m_workspace_alloc.ptr);
     CUDA_CHECK(cudaMemsetAsync(workspace, 0, NUM_WORKSPACE_BYTES, comm_stream));
 
     EP_HOST_ASSERT(max_experts_per_rank > 0);
-    m_rdma_alloc = std::make_unique<cuda_allocator>(num_rdma_bytes);
-    rdma_buffer_ptr = m_rdma_alloc->ptr();
+    m_rdma_alloc    = vmm_init(num_rdma_bytes);
+    rdma_buffer_ptr = reinterpret_cast<void *>(m_rdma_alloc.ptr);
     CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes));
 
     // Allocate and clean shrink buffer
     int num_mask_buffer_bytes = max_num_ranks * sizeof(int);
-    m_mask_alloc = std::make_unique<cuda_allocator>(num_mask_buffer_bytes);
-    mask_buffer_ptr = static_cast<int *>(m_mask_alloc->ptr());
+    m_mask_alloc    = vmm_init(num_mask_buffer_bytes);
+    mask_buffer_ptr = reinterpret_cast<int *>(m_mask_alloc.ptr);
     CUDA_CHECK(cudaMemset(mask_buffer_ptr, 0xff, num_mask_buffer_bytes));
     CUDA_CHECK(cudaMemset(mask_buffer_ptr + rank, 0, sizeof(int)));
 
     int num_sync_buffer_bytes = max_num_ranks * sizeof(int);
-    m_sync_alloc = std::make_unique<cuda_allocator>(num_sync_buffer_bytes);
-    sync_buffer_ptr = static_cast<int *>(m_sync_alloc->ptr());
-    m_sync_count_alloc = std::make_unique<cuda_allocator>(num_sync_buffer_bytes);
-    sync_count_ptr = static_cast<int *>(m_sync_count_alloc->ptr());
+    m_sync_alloc       = vmm_init(num_sync_buffer_bytes);
+    sync_buffer_ptr    = reinterpret_cast<int *>(m_sync_alloc.ptr);
+    m_sync_count_alloc = vmm_init(num_sync_buffer_bytes);
+    sync_count_ptr     = reinterpret_cast<int *>(m_sync_count_alloc.ptr);
     CUDA_CHECK(cudaMemset(sync_buffer_ptr, 0, num_sync_buffer_bytes));
     CUDA_CHECK(cudaMemset(sync_count_ptr, 0, num_sync_buffer_bytes));
     CUDA_CHECK(cudaDeviceSynchronize());
@@ -178,21 +297,21 @@ void Buffer::destroy() {
 
     _nixl_ep_destroy();
 
-    m_rdma_alloc.reset();
+    vmm_free(m_rdma_alloc);
     rdma_buffer_ptr = nullptr;
 
     if (nixl_agent_info and nixl_agent_info->agent != nullptr and getenv("NIXL_ETCD_ENDPOINTS")) {
         nixl_agent_info->agent->invalidateLocalMD();
     }
 
-    m_mask_alloc.reset();
+    vmm_free(m_mask_alloc);
     mask_buffer_ptr = nullptr;
-    m_sync_alloc.reset();
+    vmm_free(m_sync_alloc);
     sync_buffer_ptr = nullptr;
-    m_sync_count_alloc.reset();
+    vmm_free(m_sync_count_alloc);
     sync_count_ptr = nullptr;
 
-    m_workspace_alloc.reset();
+    vmm_free(m_workspace_alloc);
     workspace = nullptr;
 
     destroyed = true;
diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp
index a8551d4384..083c0027eb 100644
--- a/examples/device/ep/csrc/nixl_ep.hpp
+++ b/examples/device/ep/csrc/nixl_ep.hpp
@@ -50,126 +50,10 @@
 #define TORCH_EXTENSION_NAME nixl_ep_cpp
 #endif
 
-/* CUDA memory allocator using VMM. */
-class cuda_allocator {
-public:
-    cuda_allocator(size_t size) : m_size(0), m_ptr(0), m_alloc_handle(0) {
-        if (size == 0) {
-            throw std::invalid_argument("cuda_allocator: size must be non-zero");
-        }
-
-        CUdevice device;
-        if (cuCtxGetDevice(&device) != CUDA_SUCCESS) {
-            throw std::runtime_error("Failed to get CUDA device handle");
-        }
-
-        int rdma_vmm_supported = 0;
-        if (cuDeviceGetAttribute(&rdma_vmm_supported,
-                                 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
-                                 device) != CUDA_SUCCESS) {
-            throw std::runtime_error("Failed to query GPUDirect RDMA with VMM support attribute");
-        }
-
-        if (!rdma_vmm_supported) {
-            throw std::runtime_error(
-                "GPUDirect RDMA with CUDA VMM is not supported on this device");
-        }
-
-        int fabric_supported = 0;
-        if (cuDeviceGetAttribute(&fabric_supported,
-                                 CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED,
-                                 device) != CUDA_SUCCESS) {
-            throw std::runtime_error("Failed to query fabric handle type support attribute");
-        }
-
-        CUmemAllocationProp prop = {};
-        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-        prop.location.id = device;
-        prop.allocFlags.gpuDirectRDMACapable = 1;
-        prop.requestedHandleTypes =
-            fabric_supported ? CU_MEM_HANDLE_TYPE_FABRIC : CU_MEM_HANDLE_TYPE_NONE;
-
-        size_t granularity = 0;
-        if (cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) !=
-            CUDA_SUCCESS) {
-            throw std::runtime_error("Failed to get CUDA allocation granularity");
-        }
-
-        init_vmm(size, device, prop, granularity);
-    }
-
-    ~cuda_allocator() {
-        if (m_ptr) {
-            cuMemUnmap(m_ptr, m_size);
-            cuMemAddressFree(m_ptr, m_size);
-        }
-
-        if (m_alloc_handle) {
-            cuMemRelease(m_alloc_handle);
-        }
-    }
-
-    void *
-    ptr() const {
-        return reinterpret_cast<void *>(m_ptr);
-    }
-
-    size_t
-    size() const {
-        return m_size;
-    }
-
-    cuda_allocator(const cuda_allocator &) = delete;
-    cuda_allocator &
-    operator=(const cuda_allocator &) = delete;
-
-private:
-    void
-    init_vmm(size_t size, CUdevice device, const CUmemAllocationProp &prop, size_t granularity) {
-        CUmemAccessDesc access_desc = {};
-        const char *err_msg;
-
-        m_size = nixl_ep::align_up<size_t>(size, granularity);
-
-        if (cuMemCreate(&m_alloc_handle, m_size, &prop, 0) != CUDA_SUCCESS) {
-            throw std::runtime_error("Failed to create CUDA VMM allocation");
-        }
-
-        if (cuMemAddressReserve(&m_ptr, m_size, 0, 0, 0) != CUDA_SUCCESS) {
-            err_msg = "Failed to reserve CUDA virtual address";
-            goto err_release;
-        }
-
-        if (cuMemMap(m_ptr, m_size, 0, m_alloc_handle, 0) != CUDA_SUCCESS) {
-            err_msg = "Failed to map CUDA VMM memory";
-            goto err_free;
-        }
-
-        access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-        access_desc.location.id = device;
-        access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-        if (cuMemSetAccess(m_ptr, m_size, &access_desc, 1) != CUDA_SUCCESS) {
-            err_msg = "Failed to set CUDA memory access";
-            goto err_unmap;
-        }
-
-        return;
-
-    err_unmap:
-        cuMemUnmap(m_ptr, m_size);
-    err_free:
-        cuMemAddressFree(m_ptr, m_size);
-        m_ptr = 0;
-    err_release:
-        cuMemRelease(m_alloc_handle);
-        m_alloc_handle = 0;
-        throw std::runtime_error(err_msg);
-    }
-
-    size_t m_size;
-    CUdeviceptr m_ptr;
-    CUmemGenericAllocationHandle m_alloc_handle;
+struct vmm_region {
+    CUdeviceptr                  ptr;
+    size_t                       size;
+    CUmemGenericAllocationHandle handle;
 };
 
 namespace nixl_ep {
@@ -208,12 +92,12 @@ struct Buffer {
     int *sync_buffer_ptr = nullptr;
     int *sync_count_ptr = nullptr;
 
-    // Owning allocators (keep raw ptrs above as aliases for use throughout)
-    std::unique_ptr<cuda_allocator> m_rdma_alloc;
-    std::unique_ptr<cuda_allocator> m_mask_alloc;
-    std::unique_ptr<cuda_allocator> m_sync_alloc;
-    std::unique_ptr<cuda_allocator> m_sync_count_alloc;
-    std::unique_ptr<cuda_allocator> m_workspace_alloc;
+    /* Owning VMM allocations (keep raw ptrs above as aliases) */
+    vmm_region m_rdma_alloc;
+    vmm_region m_mask_alloc;
+    vmm_region m_sync_alloc;
+    vmm_region m_sync_count_alloc;
+    vmm_region m_workspace_alloc;
 
     // Device info and communication
     int device_id;

From 02e379b0c6f9dae2fe575820241396669b1a0152 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Mon, 9 Mar 2026 18:46:08 +0200
Subject: [PATCH 10/29] NIXL/EP: Format.

---
 examples/device/ep/csrc/nixl_ep.cpp | 68 +++++++++++++----------------
 examples/device/ep/csrc/nixl_ep.hpp |  4 +-
 2 files changed, 33 insertions(+), 39 deletions(-)

diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp
index 8104cbe815..942692f30f 100644
--- a/examples/device/ep/csrc/nixl_ep.cpp
+++ b/examples/device/ep/csrc/nixl_ep.cpp
@@ -61,26 +61,24 @@
 
 namespace nixl_ep {
 
-vmm_region vmm_init(size_t size)
-{
+vmm_region
+vmm_init(size_t size) {
     if (size == 0) {
         throw std::invalid_argument("vmm_init: size must be non-zero");
     }
 
     struct cuda_alloc_ctx {
-        CUdevice            device;
+        CUdevice device;
         CUmemAllocationProp prop;
-        size_t              granularity;
+        size_t granularity;
 
-        cuda_alloc_ctx() : device(0), prop({}), granularity(0)
-        {
+        cuda_alloc_ctx() : device(0), prop({}), granularity(0) {
             int version;
             if (cuDriverGetVersion(&version) != CUDA_SUCCESS) {
                 throw std::runtime_error("Failed to get CUDA driver version");
             }
             if (version < 11000) {
-                throw std::runtime_error(
-                    "VMM with RDMA is not supported in this CUDA version");
+                throw std::runtime_error("VMM with RDMA is not supported in this CUDA version");
             }
 
             if (cuCtxGetDevice(&device) != CUDA_SUCCESS) {
@@ -88,10 +86,9 @@ vmm_region vmm_init(size_t size)
             }
 
             int rdma_vmm_supported = 0;
-            if (cuDeviceGetAttribute(
-                        &rdma_vmm_supported,
-                        CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
-                        device) != CUDA_SUCCESS) {
+            if (cuDeviceGetAttribute(&rdma_vmm_supported,
+                                     CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
+                                     device) != CUDA_SUCCESS) {
                 throw std::runtime_error(
                     "Failed to query GPUDirect RDMA with VMM support attribute");
             }
@@ -104,31 +101,28 @@ vmm_region vmm_init(size_t size)
             if (cuDeviceGetAttribute(&fabric_supported,
                                      CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED,
                                      device) != CUDA_SUCCESS) {
-                throw std::runtime_error(
-                    "Failed to query fabric handle type support attribute");
+                throw std::runtime_error("Failed to query fabric handle type support attribute");
             }
 
-            prop.type                            = CU_MEM_ALLOCATION_TYPE_PINNED;
-            prop.location.type                   = CU_MEM_LOCATION_TYPE_DEVICE;
-            prop.location.id                     = device;
+            prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+            prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+            prop.location.id = device;
             prop.allocFlags.gpuDirectRDMACapable = 1;
-            prop.requestedHandleTypes            = fabric_supported ?
-                                                   CU_MEM_HANDLE_TYPE_FABRIC :
-                                                   CU_MEM_HANDLE_TYPE_NONE;
+            prop.requestedHandleTypes =
+                fabric_supported ? CU_MEM_HANDLE_TYPE_FABRIC : CU_MEM_HANDLE_TYPE_NONE;
 
-            if (cuMemGetAllocationGranularity(&granularity, &prop,
-                                              CU_MEM_ALLOC_GRANULARITY_MINIMUM) !=
-                CUDA_SUCCESS) {
-                throw std::runtime_error(
-                    "Failed to get CUDA allocation granularity");
+            if (cuMemGetAllocationGranularity(
+                    &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) {
+                throw std::runtime_error("Failed to get CUDA allocation granularity");
             }
         }
     };
+
     static cuda_alloc_ctx ctx;
 
-    vmm_region      region      = {};
+    vmm_region region = {};
     CUmemAccessDesc access_desc = {};
-    const char     *err_msg;
+    const char *err_msg;
 
     region.size = align_up<size_t>(size, ctx.granularity);
 
@@ -147,8 +141,8 @@ vmm_region vmm_init(size_t size)
     }
 
     access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-    access_desc.location.id   = ctx.device;
-    access_desc.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+    access_desc.location.id = ctx.device;
+    access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
     if (cuMemSetAccess(region.ptr, region.size, &access_desc, 1) != CUDA_SUCCESS) {
         err_msg = "Failed to set CUDA memory access";
         goto err_unmap;
@@ -167,8 +161,8 @@ vmm_region vmm_init(size_t size)
     throw std::runtime_error(err_msg);
 }
 
-void vmm_free(vmm_region &region)
-{
+void
+vmm_free(vmm_region &region) {
     if (region.ptr) {
         cuMemUnmap(region.ptr, region.size);
         cuMemAddressFree(region.ptr, region.size);
@@ -224,26 +218,26 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte
 
     // Create 32 MiB workspace
     m_workspace_alloc = vmm_init(NUM_WORKSPACE_BYTES);
-    workspace         = reinterpret_cast<void *>(m_workspace_alloc.ptr);
+    workspace = reinterpret_cast<void *>(m_workspace_alloc.ptr);
     CUDA_CHECK(cudaMemsetAsync(workspace, 0, NUM_WORKSPACE_BYTES, comm_stream));
 
     EP_HOST_ASSERT(max_experts_per_rank > 0);
-    m_rdma_alloc    = vmm_init(num_rdma_bytes);
+    m_rdma_alloc = vmm_init(num_rdma_bytes);
     rdma_buffer_ptr = reinterpret_cast<void *>(m_rdma_alloc.ptr);
     CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes));
 
     // Allocate and clean shrink buffer
     int num_mask_buffer_bytes = max_num_ranks * sizeof(int);
-    m_mask_alloc    = vmm_init(num_mask_buffer_bytes);
+    m_mask_alloc = vmm_init(num_mask_buffer_bytes);
     mask_buffer_ptr = reinterpret_cast<int *>(m_mask_alloc.ptr);
     CUDA_CHECK(cudaMemset(mask_buffer_ptr, 0xff, num_mask_buffer_bytes));
     CUDA_CHECK(cudaMemset(mask_buffer_ptr + rank, 0, sizeof(int)));
 
     int num_sync_buffer_bytes = max_num_ranks * sizeof(int);
-    m_sync_alloc       = vmm_init(num_sync_buffer_bytes);
-    sync_buffer_ptr    = reinterpret_cast<int *>(m_sync_alloc.ptr);
+    m_sync_alloc = vmm_init(num_sync_buffer_bytes);
+    sync_buffer_ptr = reinterpret_cast<int *>(m_sync_alloc.ptr);
     m_sync_count_alloc = vmm_init(num_sync_buffer_bytes);
-    sync_count_ptr     = reinterpret_cast<int *>(m_sync_count_alloc.ptr);
+    sync_count_ptr = reinterpret_cast<int *>(m_sync_count_alloc.ptr);
     CUDA_CHECK(cudaMemset(sync_buffer_ptr, 0, num_sync_buffer_bytes));
     CUDA_CHECK(cudaMemset(sync_count_ptr, 0, num_sync_buffer_bytes));
     CUDA_CHECK(cudaDeviceSynchronize());
diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp
index 083c0027eb..f411a89ca2 100644
--- a/examples/device/ep/csrc/nixl_ep.hpp
+++ b/examples/device/ep/csrc/nixl_ep.hpp
@@ -51,8 +51,8 @@
 #endif
 
 struct vmm_region {
-    CUdeviceptr                  ptr;
-    size_t                       size;
+    CUdeviceptr ptr;
+    size_t size;
     CUmemGenericAllocationHandle handle;
 };
 

From 2c701be7f97520bd2c4b94665c5493d12213feef Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Mon, 9 Mar 2026 18:51:47 +0200
Subject: [PATCH 11/29] NIXL/EP: Improve.

---
 examples/device/ep/csrc/nixl_ep.cpp | 59 +++++++++++++++--------------
 1 file changed, 30 insertions(+), 29 deletions(-)

diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp
index 942692f30f..1f4fe13f03 100644
--- a/examples/device/ep/csrc/nixl_ep.cpp
+++ b/examples/device/ep/csrc/nixl_ep.cpp
@@ -62,33 +62,31 @@
 namespace nixl_ep {
 
 vmm_region
-vmm_init(size_t size) {
+vmm_init(size_t size, CUdevice device) {
     if (size == 0) {
         throw std::invalid_argument("vmm_init: size must be non-zero");
     }
 
     struct cuda_alloc_ctx {
-        CUdevice device;
         CUmemAllocationProp prop;
-        size_t granularity;
+        size_t              granularity;
 
-        cuda_alloc_ctx() : device(0), prop({}), granularity(0) {
+        cuda_alloc_ctx(CUdevice dev) : prop({}), granularity(0)
+        {
             int version;
             if (cuDriverGetVersion(&version) != CUDA_SUCCESS) {
                 throw std::runtime_error("Failed to get CUDA driver version");
             }
             if (version < 11000) {
-                throw std::runtime_error("VMM with RDMA is not supported in this CUDA version");
-            }
-
-            if (cuCtxGetDevice(&device) != CUDA_SUCCESS) {
-                throw std::runtime_error("Failed to get CUDA device handle");
+                throw std::runtime_error(
+                    "VMM with RDMA is not supported in this CUDA version");
             }
 
             int rdma_vmm_supported = 0;
-            if (cuDeviceGetAttribute(&rdma_vmm_supported,
-                                     CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
-                                     device) != CUDA_SUCCESS) {
+            if (cuDeviceGetAttribute(
+                        &rdma_vmm_supported,
+                        CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
+                        dev) != CUDA_SUCCESS) {
                 throw std::runtime_error(
                     "Failed to query GPUDirect RDMA with VMM support attribute");
             }
@@ -100,25 +98,28 @@ vmm_init(size_t size) {
             int fabric_supported = 0;
             if (cuDeviceGetAttribute(&fabric_supported,
                                      CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED,
-                                     device) != CUDA_SUCCESS) {
-                throw std::runtime_error("Failed to query fabric handle type support attribute");
+                                     dev) != CUDA_SUCCESS) {
+                throw std::runtime_error(
+                    "Failed to query fabric handle type support attribute");
             }
 
-            prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-            prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-            prop.location.id = device;
+            prop.type                            = CU_MEM_ALLOCATION_TYPE_PINNED;
+            prop.location.type                   = CU_MEM_LOCATION_TYPE_DEVICE;
+            prop.location.id                     = dev;
             prop.allocFlags.gpuDirectRDMACapable = 1;
-            prop.requestedHandleTypes =
-                fabric_supported ? CU_MEM_HANDLE_TYPE_FABRIC : CU_MEM_HANDLE_TYPE_NONE;
+            prop.requestedHandleTypes            = fabric_supported ?
+                                                   CU_MEM_HANDLE_TYPE_FABRIC :
+                                                   CU_MEM_HANDLE_TYPE_NONE;
 
-            if (cuMemGetAllocationGranularity(
-                    &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) {
+            if (cuMemGetAllocationGranularity(&granularity, &prop,
+                                              CU_MEM_ALLOC_GRANULARITY_MINIMUM) !=
+                CUDA_SUCCESS) {
                 throw std::runtime_error("Failed to get CUDA allocation granularity");
             }
         }
     };
 
-    static cuda_alloc_ctx ctx;
+    static cuda_alloc_ctx ctx(device);
 
     vmm_region region = {};
     CUmemAccessDesc access_desc = {};
@@ -141,8 +142,8 @@ vmm_init(size_t size) {
     }
 
     access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-    access_desc.location.id = ctx.device;
-    access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+    access_desc.location.id   = device;
+    access_desc.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
     if (cuMemSetAccess(region.ptr, region.size, &access_desc, 1) != CUDA_SUCCESS) {
         err_msg = "Failed to set CUDA memory access";
         goto err_unmap;
@@ -217,26 +218,26 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte
     EP_HOST_ASSERT(per_channel_bytes < std::numeric_limits<int>::max());
 
     // Create 32 MiB workspace
-    m_workspace_alloc = vmm_init(NUM_WORKSPACE_BYTES);
+    m_workspace_alloc = vmm_init(NUM_WORKSPACE_BYTES, device_id);
     workspace = reinterpret_cast<void *>(m_workspace_alloc.ptr);
     CUDA_CHECK(cudaMemsetAsync(workspace, 0, NUM_WORKSPACE_BYTES, comm_stream));
 
     EP_HOST_ASSERT(max_experts_per_rank > 0);
-    m_rdma_alloc = vmm_init(num_rdma_bytes);
+    m_rdma_alloc = vmm_init(num_rdma_bytes, device_id);
     rdma_buffer_ptr = reinterpret_cast<void *>(m_rdma_alloc.ptr);
     CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes));
 
     // Allocate and clean shrink buffer
     int num_mask_buffer_bytes = max_num_ranks * sizeof(int);
-    m_mask_alloc = vmm_init(num_mask_buffer_bytes);
+    m_mask_alloc = vmm_init(num_mask_buffer_bytes, device_id);
     mask_buffer_ptr = reinterpret_cast<int *>(m_mask_alloc.ptr);
     CUDA_CHECK(cudaMemset(mask_buffer_ptr, 0xff, num_mask_buffer_bytes));
     CUDA_CHECK(cudaMemset(mask_buffer_ptr + rank, 0, sizeof(int)));
 
     int num_sync_buffer_bytes = max_num_ranks * sizeof(int);
-    m_sync_alloc = vmm_init(num_sync_buffer_bytes);
+    m_sync_alloc = vmm_init(num_sync_buffer_bytes, device_id);
     sync_buffer_ptr = reinterpret_cast<int *>(m_sync_alloc.ptr);
-    m_sync_count_alloc = vmm_init(num_sync_buffer_bytes);
+    m_sync_count_alloc = vmm_init(num_sync_buffer_bytes, device_id);
     sync_count_ptr = reinterpret_cast<int *>(m_sync_count_alloc.ptr);
     CUDA_CHECK(cudaMemset(sync_buffer_ptr, 0, num_sync_buffer_bytes));
     CUDA_CHECK(cudaMemset(sync_count_ptr, 0, num_sync_buffer_bytes));

From 329fb22b319d4fb9fea9754eb42d248c644336d7 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Mon, 9 Mar 2026 18:52:06 +0200
Subject: [PATCH 12/29] NIXL/EP: Format.

---
 examples/device/ep/csrc/nixl_ep.cpp | 38 ++++++++++++-----------------
 1 file changed, 16 insertions(+), 22 deletions(-)

diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp
index 1f4fe13f03..8d4e754398 100644
--- a/examples/device/ep/csrc/nixl_ep.cpp
+++ b/examples/device/ep/csrc/nixl_ep.cpp
@@ -69,24 +69,21 @@ vmm_init(size_t size, CUdevice device) {
 
     struct cuda_alloc_ctx {
         CUmemAllocationProp prop;
-        size_t              granularity;
+        size_t granularity;
 
-        cuda_alloc_ctx(CUdevice dev) : prop({}), granularity(0)
-        {
+        cuda_alloc_ctx(CUdevice dev) : prop({}), granularity(0) {
             int version;
             if (cuDriverGetVersion(&version) != CUDA_SUCCESS) {
                 throw std::runtime_error("Failed to get CUDA driver version");
             }
             if (version < 11000) {
-                throw std::runtime_error(
-                    "VMM with RDMA is not supported in this CUDA version");
+                throw std::runtime_error("VMM with RDMA is not supported in this CUDA version");
             }
 
             int rdma_vmm_supported = 0;
-            if (cuDeviceGetAttribute(
-                        &rdma_vmm_supported,
-                        CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
-                        dev) != CUDA_SUCCESS) {
+            if (cuDeviceGetAttribute(&rdma_vmm_supported,
+                                     CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
+                                     dev) != CUDA_SUCCESS) {
                 throw std::runtime_error(
                     "Failed to query GPUDirect RDMA with VMM support attribute");
             }
@@ -99,21 +96,18 @@ vmm_init(size_t size, CUdevice device) {
             if (cuDeviceGetAttribute(&fabric_supported,
                                      CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED,
                                      dev) != CUDA_SUCCESS) {
-                throw std::runtime_error(
-                    "Failed to query fabric handle type support attribute");
+                throw std::runtime_error("Failed to query fabric handle type support attribute");
             }
 
-            prop.type                            = CU_MEM_ALLOCATION_TYPE_PINNED;
-            prop.location.type                   = CU_MEM_LOCATION_TYPE_DEVICE;
-            prop.location.id                     = dev;
+            prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+            prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+            prop.location.id = dev;
             prop.allocFlags.gpuDirectRDMACapable = 1;
-            prop.requestedHandleTypes            = fabric_supported ?
-                                                   CU_MEM_HANDLE_TYPE_FABRIC :
-                                                   CU_MEM_HANDLE_TYPE_NONE;
+            prop.requestedHandleTypes =
+                fabric_supported ? CU_MEM_HANDLE_TYPE_FABRIC : CU_MEM_HANDLE_TYPE_NONE;
 
-            if (cuMemGetAllocationGranularity(&granularity, &prop,
-                                              CU_MEM_ALLOC_GRANULARITY_MINIMUM) !=
-                CUDA_SUCCESS) {
+            if (cuMemGetAllocationGranularity(
+                    &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) {
                 throw std::runtime_error("Failed to get CUDA allocation granularity");
             }
         }
@@ -142,8 +136,8 @@ vmm_init(size_t size, CUdevice device) {
     }
 
     access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-    access_desc.location.id   = device;
-    access_desc.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+    access_desc.location.id = device;
+    access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
     if (cuMemSetAccess(region.ptr, region.size, &access_desc, 1) != CUDA_SUCCESS) {
         err_msg = "Failed to set CUDA memory access";
         goto err_unmap;

From ee2fd647e9667f761a519510c36186ac46f5fe83 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Wed, 11 Mar 2026 11:14:32 +0200
Subject: [PATCH 13/29] NIXL/EP: fallback to cudaMalloc if fabric not supported

---
 examples/device/ep/csrc/nixl_ep.cpp | 49 ++++++++++++++++++++---------
 examples/device/ep/csrc/nixl_ep.hpp |  1 +
 2 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp
index 8d4e754398..be8da2079a 100644
--- a/examples/device/ep/csrc/nixl_ep.cpp
+++ b/examples/device/ep/csrc/nixl_ep.cpp
@@ -68,16 +68,26 @@ vmm_init(size_t size, CUdevice device) {
     }
 
     struct cuda_alloc_ctx {
+        bool fabric_supported;
         CUmemAllocationProp prop;
         size_t granularity;
 
-        cuda_alloc_ctx(CUdevice dev) : prop({}), granularity(0) {
+        cuda_alloc_ctx(CUdevice dev) : fabric_supported(false), prop({}), granularity(0) {
             int version;
             if (cuDriverGetVersion(&version) != CUDA_SUCCESS) {
                 throw std::runtime_error("Failed to get CUDA driver version");
             }
+
             if (version < 11000) {
-                throw std::runtime_error("VMM with RDMA is not supported in this CUDA version");
+                return; /* too old — fall back to cudaMalloc */
+            }
+
+            int fab = 0;
+            if ((cuDeviceGetAttribute(&fab,
+                                      CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED,
+                                      dev) != CUDA_SUCCESS) ||
+                (!fab)) {
+                return; /* no fabric — fall back to cudaMalloc */
             }
 
             int rdma_vmm_supported = 0;
@@ -87,34 +97,39 @@ vmm_init(size_t size, CUdevice device) {
                 throw std::runtime_error(
                     "Failed to query GPUDirect RDMA with VMM support attribute");
             }
+
             if (!rdma_vmm_supported) {
                 throw std::runtime_error(
                     "GPUDirect RDMA with CUDA VMM is not supported on this device");
             }
 
-            int fabric_supported = 0;
-            if (cuDeviceGetAttribute(&fabric_supported,
-                                     CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED,
-                                     dev) != CUDA_SUCCESS) {
-                throw std::runtime_error("Failed to query fabric handle type support attribute");
-            }
-
             prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
             prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
             prop.location.id = dev;
             prop.allocFlags.gpuDirectRDMACapable = 1;
-            prop.requestedHandleTypes =
-                fabric_supported ? CU_MEM_HANDLE_TYPE_FABRIC : CU_MEM_HANDLE_TYPE_NONE;
+            prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
 
             if (cuMemGetAllocationGranularity(
                     &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) {
                 throw std::runtime_error("Failed to get CUDA allocation granularity");
             }
+
+            fabric_supported = true;
         }
     };
 
     static cuda_alloc_ctx ctx(device);
 
+    if (!ctx.fabric_supported) {
+        vmm_region region = {};
+        region.size = size;
+        region.is_cuda_malloc = true;
+        if (cudaMalloc(reinterpret_cast<void **>(&region.ptr), size) != cudaSuccess) {
+            throw std::runtime_error("cudaMalloc fallback failed (fabric not supported)");
+        }
+        return region;
+    }
+
     vmm_region region = {};
     CUmemAccessDesc access_desc = {};
     const char *err_msg;
@@ -158,11 +173,17 @@ vmm_init(size_t size, CUdevice device) {
 
 void
 vmm_free(vmm_region &region) {
-    if (region.ptr) {
-        cuMemUnmap(region.ptr, region.size);
-        cuMemAddressFree(region.ptr, region.size);
+    if (!region.ptr) {
+        return;
+    }
+    if (region.is_cuda_malloc) {
+        cudaFree(reinterpret_cast<void *>(region.ptr));
         region.ptr = 0;
+        return;
     }
+    cuMemUnmap(region.ptr, region.size);
+    cuMemAddressFree(region.ptr, region.size);
+    region.ptr = 0;
     if (region.handle) {
         cuMemRelease(region.handle);
         region.handle = 0;
diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp
index f411a89ca2..54b0207f8d 100644
--- a/examples/device/ep/csrc/nixl_ep.hpp
+++ b/examples/device/ep/csrc/nixl_ep.hpp
@@ -54,6 +54,7 @@ struct vmm_region {
     CUdeviceptr ptr;
     size_t size;
     CUmemGenericAllocationHandle handle;
+    bool is_cuda_malloc = false;
 };
 
 namespace nixl_ep {

From 7677e592e897d6dbc459191762755f7e868f20e6 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Wed, 11 Mar 2026 12:02:11 +0200
Subject: [PATCH 14/29] NIXL/EP: set default vals

---
 examples/device/ep/csrc/nixl_ep.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp
index 54b0207f8d..d8dd9b835d 100644
--- a/examples/device/ep/csrc/nixl_ep.hpp
+++ b/examples/device/ep/csrc/nixl_ep.hpp
@@ -51,9 +51,9 @@
 #endif
 
 struct vmm_region {
-    CUdeviceptr ptr;
-    size_t size;
-    CUmemGenericAllocationHandle handle;
+    CUdeviceptr ptr = 0;
+    size_t size = 0;
+    CUmemGenericAllocationHandle handle = 0;
     bool is_cuda_malloc = false;
 };
 

From 8b4b5674c14b3bb3459d4755d5d0f812849a5cb3 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Wed, 11 Mar 2026 12:16:50 +0200
Subject: [PATCH 15/29] NIXL/EP: Fix.

---
 examples/device/ep/csrc/nixl_ep.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp
index be8da2079a..191fe23107 100644
--- a/examples/device/ep/csrc/nixl_ep.cpp
+++ b/examples/device/ep/csrc/nixl_ep.cpp
@@ -61,7 +61,7 @@
 
 namespace nixl_ep {
 
-vmm_region
+static vmm_region
 vmm_init(size_t size, CUdevice device) {
     if (size == 0) {
         throw std::invalid_argument("vmm_init: size must be non-zero");
@@ -171,7 +171,7 @@ vmm_init(size_t size, CUdevice device) {
     throw std::runtime_error(err_msg);
 }
 
-void
+static void
 vmm_free(vmm_region &region) {
     if (!region.ptr) {
         return;

From cbe8d1df814af7b35adbf07af009adfd8edfbf07 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Tue, 17 Mar 2026 22:33:49 +0200
Subject: [PATCH 16/29] NIXL/EP: Fix comments.

---
 examples/device/ep/csrc/nixl_ep.cpp | 129 -----------------------
 examples/device/ep/csrc/nixl_ep.hpp |   8 +-
 examples/device/ep/csrc/vmm.cpp     | 155 ++++++++++++++++++++++++++++
 examples/device/ep/csrc/vmm.hpp     |  38 +++++++
 examples/device/ep/meson.build      |   1 +
 5 files changed, 195 insertions(+), 136 deletions(-)
 create mode 100644 examples/device/ep/csrc/vmm.cpp
 create mode 100644 examples/device/ep/csrc/vmm.hpp

diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp
index 191fe23107..011d0f7343 100644
--- a/examples/device/ep/csrc/nixl_ep.cpp
+++ b/examples/device/ep/csrc/nixl_ep.cpp
@@ -61,135 +61,6 @@
 
 namespace nixl_ep {
 
-static vmm_region
-vmm_init(size_t size, CUdevice device) {
-    if (size == 0) {
-        throw std::invalid_argument("vmm_init: size must be non-zero");
-    }
-
-    struct cuda_alloc_ctx {
-        bool fabric_supported;
-        CUmemAllocationProp prop;
-        size_t granularity;
-
-        cuda_alloc_ctx(CUdevice dev) : fabric_supported(false), prop({}), granularity(0) {
-            int version;
-            if (cuDriverGetVersion(&version) != CUDA_SUCCESS) {
-                throw std::runtime_error("Failed to get CUDA driver version");
-            }
-
-            if (version < 11000) {
-                return; /* too old — fall back to cudaMalloc */
-            }
-
-            int fab = 0;
-            if ((cuDeviceGetAttribute(&fab,
-                                      CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED,
-                                      dev) != CUDA_SUCCESS) ||
-                (!fab)) {
-                return; /* no fabric — fall back to cudaMalloc */
-            }
-
-            int rdma_vmm_supported = 0;
-            if (cuDeviceGetAttribute(&rdma_vmm_supported,
-                                     CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
-                                     dev) != CUDA_SUCCESS) {
-                throw std::runtime_error(
-                    "Failed to query GPUDirect RDMA with VMM support attribute");
-            }
-
-            if (!rdma_vmm_supported) {
-                throw std::runtime_error(
-                    "GPUDirect RDMA with CUDA VMM is not supported on this device");
-            }
-
-            prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-            prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-            prop.location.id = dev;
-            prop.allocFlags.gpuDirectRDMACapable = 1;
-            prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
-
-            if (cuMemGetAllocationGranularity(
-                    &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) {
-                throw std::runtime_error("Failed to get CUDA allocation granularity");
-            }
-
-            fabric_supported = true;
-        }
-    };
-
-    static cuda_alloc_ctx ctx(device);
-
-    if (!ctx.fabric_supported) {
-        vmm_region region = {};
-        region.size = size;
-        region.is_cuda_malloc = true;
-        if (cudaMalloc(reinterpret_cast<void **>(&region.ptr), size) != cudaSuccess) {
-            throw std::runtime_error("cudaMalloc fallback failed (fabric not supported)");
-        }
-        return region;
-    }
-
-    vmm_region region = {};
-    CUmemAccessDesc access_desc = {};
-    const char *err_msg;
-
-    region.size = align_up<size_t>(size, ctx.granularity);
-
-    if (cuMemCreate(&region.handle, region.size, &ctx.prop, 0) != CUDA_SUCCESS) {
-        throw std::runtime_error("Failed to create CUDA VMM allocation");
-    }
-
-    if (cuMemAddressReserve(&region.ptr, region.size, 0, 0, 0) != CUDA_SUCCESS) {
-        err_msg = "Failed to reserve CUDA virtual address";
-        goto err_release;
-    }
-
-    if (cuMemMap(region.ptr, region.size, 0, region.handle, 0) != CUDA_SUCCESS) {
-        err_msg = "Failed to map CUDA VMM memory";
-        goto err_free;
-    }
-
-    access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-    access_desc.location.id = device;
-    access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-    if (cuMemSetAccess(region.ptr, region.size, &access_desc, 1) != CUDA_SUCCESS) {
-        err_msg = "Failed to set CUDA memory access";
-        goto err_unmap;
-    }
-
-    return region;
-
-err_unmap:
-    cuMemUnmap(region.ptr, region.size);
-err_free:
-    cuMemAddressFree(region.ptr, region.size);
-    region.ptr = 0;
-err_release:
-    cuMemRelease(region.handle);
-    region.handle = 0;
-    throw std::runtime_error(err_msg);
-}
-
-static void
-vmm_free(vmm_region &region) {
-    if (!region.ptr) {
-        return;
-    }
-    if (region.is_cuda_malloc) {
-        cudaFree(reinterpret_cast<void *>(region.ptr));
-        region.ptr = 0;
-        return;
-    }
-    cuMemUnmap(region.ptr, region.size);
-    cuMemAddressFree(region.ptr, region.size);
-    region.ptr = 0;
-    if (region.handle) {
-        cuMemRelease(region.handle);
-        region.handle = 0;
-    }
-}
-
 static void sleep_ms(int milliseconds) {
     std::this_thread::sleep_for(std::chrono::milliseconds(milliseconds));
 }
diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp
index d8dd9b835d..e8275e1283 100644
--- a/examples/device/ep/csrc/nixl_ep.hpp
+++ b/examples/device/ep/csrc/nixl_ep.hpp
@@ -43,6 +43,7 @@
 #include "event.hpp"
 #include "kernels/configs.cuh"
 #include "kernels/exception.cuh"
+#include "vmm.hpp"
 
 #include "nixl.h"
 
@@ -50,13 +51,6 @@
 #define TORCH_EXTENSION_NAME nixl_ep_cpp
 #endif
 
-struct vmm_region {
-    CUdeviceptr ptr = 0;
-    size_t size = 0;
-    CUmemGenericAllocationHandle handle = 0;
-    bool is_cuda_malloc = false;
-};
-
 namespace nixl_ep {
 
 struct NixlPeerInfo {
diff --git a/examples/device/ep/csrc/vmm.cpp b/examples/device/ep/csrc/vmm.cpp
new file mode 100644
index 0000000000..0ef73b2916
--- /dev/null
+++ b/examples/device/ep/csrc/vmm.cpp
@@ -0,0 +1,155 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 DeepSeek
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * This file incorporates material from the DeepSeek project, licensed under the MIT License.
+ * The modifications made by NVIDIA are licensed under the Apache License, Version 2.0.
+ *
+ * SPDX-License-Identifier: MIT AND Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdexcept>
+
+#include "config.hpp"
+#include "vmm.hpp"
+
+vmm_region
+vmm_init(size_t size, CUdevice device) {
+    if (size == 0) {
+        throw std::invalid_argument("vmm_init: size must be non-zero");
+    }
+
+    struct cuda_alloc_ctx {
+        bool fabric_supported;
+        CUmemAllocationProp prop;
+        size_t granularity;
+
+        cuda_alloc_ctx(CUdevice dev) : fabric_supported(false), prop({}), granularity(0) {
+            int version;
+            if (cuDriverGetVersion(&version) != CUDA_SUCCESS) {
+                throw std::runtime_error("Failed to get CUDA driver version");
+            }
+
+            if (version < 11000) {
+                return; /* too old — fall back to cudaMalloc */
+            }
+
+            int fab = 0;
+            if ((cuDeviceGetAttribute(&fab,
+                                      CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED,
+                                      dev) != CUDA_SUCCESS) ||
+                (!fab)) {
+                return; /* no fabric — fall back to cudaMalloc */
+            }
+
+            int rdma_vmm_supported = 0;
+            if (cuDeviceGetAttribute(&rdma_vmm_supported,
+                                     CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
+                                     dev) != CUDA_SUCCESS) {
+                throw std::runtime_error(
+                    "Failed to query GPUDirect RDMA with VMM support attribute");
+            }
+
+            if (!rdma_vmm_supported) {
+                throw std::runtime_error(
+                    "GPUDirect RDMA with CUDA VMM is not supported on this device");
+            }
+
+            prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+            prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+            prop.location.id = dev;
+            prop.allocFlags.gpuDirectRDMACapable = 1;
+            prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
+
+            if (cuMemGetAllocationGranularity(
+                    &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) {
+                throw std::runtime_error("Failed to get CUDA allocation granularity");
+            }
+
+            fabric_supported = true;
+        }
+    };
+
+    static cuda_alloc_ctx ctx(device);
+
+    if (!ctx.fabric_supported) {
+        vmm_region region = {};
+        region.size = size;
+        region.is_cuda_malloc = true;
+        if (cudaMalloc(reinterpret_cast<void **>(&region.ptr), size) != cudaSuccess) {
+            throw std::runtime_error("cudaMalloc fallback failed (fabric not supported)");
+        }
+        return region;
+    }
+
+    vmm_region region = {};
+    CUmemAccessDesc access_desc = {};
+    const char *err_msg;
+
+    region.size = align_up<size_t>(size, ctx.granularity);
+
+    if (cuMemCreate(&region.handle, region.size, &ctx.prop, 0) != CUDA_SUCCESS) {
+        throw std::runtime_error("Failed to create CUDA VMM allocation");
+    }
+
+    if (cuMemAddressReserve(&region.ptr, region.size, 0, 0, 0) != CUDA_SUCCESS) {
+        err_msg = "Failed to reserve CUDA virtual address";
+        goto err_release;
+    }
+
+    if (cuMemMap(region.ptr, region.size, 0, region.handle, 0) != CUDA_SUCCESS) {
+        err_msg = "Failed to map CUDA VMM memory";
+        goto err_free;
+    }
+
+    access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    access_desc.location.id = device;
+    access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+    if (cuMemSetAccess(region.ptr, region.size, &access_desc, 1) != CUDA_SUCCESS) {
+        err_msg = "Failed to set CUDA memory access";
+        goto err_unmap;
+    }
+
+    return region;
+
+err_unmap:
+    cuMemUnmap(region.ptr, region.size);
+err_free:
+    cuMemAddressFree(region.ptr, region.size);
+    region.ptr = 0;
+err_release:
+    cuMemRelease(region.handle);
+    region.handle = 0;
+    throw std::runtime_error(err_msg);
+}
+
+void
+vmm_free(vmm_region &region) {
+    if (!region.ptr) {
+        return;
+    }
+    if (region.is_cuda_malloc) {
+        cudaFree(reinterpret_cast<void *>(region.ptr));
+        region.ptr = 0;
+        return;
+    }
+    cuMemUnmap(region.ptr, region.size);
+    cuMemAddressFree(region.ptr, region.size);
+    region.ptr = 0;
+    if (region.handle) {
+        cuMemRelease(region.handle);
+        region.handle = 0;
+    }
+}
diff --git a/examples/device/ep/csrc/vmm.hpp b/examples/device/ep/csrc/vmm.hpp
new file mode 100644
index 0000000000..af63ac643a
--- /dev/null
+++ b/examples/device/ep/csrc/vmm.hpp
@@ -0,0 +1,38 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 DeepSeek
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * This file incorporates material from the DeepSeek project, licensed under the MIT License.
+ * The modifications made by NVIDIA are licensed under the Apache License, Version 2.0.
+ *
+ * SPDX-License-Identifier: MIT AND Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+struct vmm_region {
+    CUdeviceptr ptr = 0;
+    size_t size = 0;
+    CUmemGenericAllocationHandle handle = 0;
+    bool is_cuda_malloc = false;
+};
+
+vmm_region
+vmm_init(size_t size, CUdevice device);
+void
+vmm_free(vmm_region &region);
diff --git a/examples/device/ep/meson.build b/examples/device/ep/meson.build
index 84ddad229c..fdc879d902 100644
--- a/examples/device/ep/meson.build
+++ b/examples/device/ep/meson.build
@@ -63,6 +63,7 @@ endif
 
 nixl_ep_sources = [
     'csrc/nixl_ep.cpp',
+    'csrc/vmm.cpp',
     'csrc/kernels/nixl_ep.cu',
 ]
 

From 088c96001ad0e7a8a4501b62ef62fffd15d09dee Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Tue, 17 Mar 2026 22:40:26 +0200
Subject: [PATCH 17/29] NIXL/EP: Fix.

---
 examples/device/ep/csrc/vmm.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/device/ep/csrc/vmm.cpp b/examples/device/ep/csrc/vmm.cpp
index 0ef73b2916..289fa9ab91 100644
--- a/examples/device/ep/csrc/vmm.cpp
+++ b/examples/device/ep/csrc/vmm.cpp
@@ -98,7 +98,7 @@ vmm_init(size_t size, CUdevice device) {
     CUmemAccessDesc access_desc = {};
     const char *err_msg;
 
-    region.size = align_up<size_t>(size, ctx.granularity);
+    region.size = nixl_ep::align_up<size_t>(size, ctx.granularity);
 
     if (cuMemCreate(&region.handle, region.size, &ctx.prop, 0) != CUDA_SUCCESS) {
         throw std::runtime_error("Failed to create CUDA VMM allocation");

From 15f9174ddb82b5d268ef35974e07c01b4a428468 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Tue, 17 Mar 2026 22:43:30 +0200
Subject: [PATCH 18/29] NIXL/EP: Fix.

---
 examples/device/ep/csrc/config.hpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/examples/device/ep/csrc/config.hpp b/examples/device/ep/csrc/config.hpp
index f17ac113d7..89b49cc419 100644
--- a/examples/device/ep/csrc/config.hpp
+++ b/examples/device/ep/csrc/config.hpp
@@ -122,7 +122,11 @@ struct EPLayout {
     }
 };
 
-size_t get_rdma_size_hint(int num_max_dispatch_tokens_per_rank, int hidden, int num_ranks, int num_experts) {
+inline size_t
+get_rdma_size_hint(int num_max_dispatch_tokens_per_rank,
+                   int hidden,
+                   int num_ranks,
+                   int num_experts) {
     auto num_bytes = EPLayout(nullptr, num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts).total_bytes;
     return ((num_bytes + NUM_BUFFER_ALIGNMENT_BYTES) / NUM_BUFFER_ALIGNMENT_BYTES) * NUM_BUFFER_ALIGNMENT_BYTES;
 }

From a44a0947afa55a2675ce5b39aeac1eb51371df59 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Tue, 17 Mar 2026 22:53:28 +0200
Subject: [PATCH 19/29] NIXL/EP: not needed.

---
 examples/device/ep/csrc/nixl_ep.cpp | 1 -
 examples/device/ep/csrc/nixl_ep.hpp | 3 ---
 2 files changed, 4 deletions(-)

diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp
index 011d0f7343..910bb6d2f6 100644
--- a/examples/device/ep/csrc/nixl_ep.cpp
+++ b/examples/device/ep/csrc/nixl_ep.cpp
@@ -27,7 +27,6 @@
 #include <cstddef>
 #include <cstdint>
 #include <limits>
-#include <cuda.h>
 #include <cuda_runtime.h>
 #include <memory>
 #include <optional>
diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp
index e8275e1283..92059c5611 100644
--- a/examples/device/ep/csrc/nixl_ep.hpp
+++ b/examples/device/ep/csrc/nixl_ep.hpp
@@ -35,10 +35,7 @@
 #include <vector>
 #include <string>
 
-#include <cuda.h>
-#include <cuda_runtime.h>
 #include <memory>
-#include <stdexcept>
 #include "config.hpp"
 #include "event.hpp"
 #include "kernels/configs.cuh"

From f120473097f59a5ab5c1a03bf814763b5e49b9f7 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Tue, 17 Mar 2026 23:15:38 +0200
Subject: [PATCH 20/29] NIXL/EP: new files terms.

---
 examples/device/ep/csrc/vmm.cpp | 9 ++-------
 examples/device/ep/csrc/vmm.hpp | 9 ++-------
 2 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/examples/device/ep/csrc/vmm.cpp b/examples/device/ep/csrc/vmm.cpp
index 289fa9ab91..62cb10114c 100644
--- a/examples/device/ep/csrc/vmm.cpp
+++ b/examples/device/ep/csrc/vmm.cpp
@@ -1,17 +1,12 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 DeepSeek
  * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- *
- * This file incorporates material from the DeepSeek project, licensed under the MIT License.
- * The modifications made by NVIDIA are licensed under the Apache License, Version 2.0.
- *
- * SPDX-License-Identifier: MIT AND Apache-2.0
+ * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
diff --git a/examples/device/ep/csrc/vmm.hpp b/examples/device/ep/csrc/vmm.hpp
index af63ac643a..42e0c7a87a 100644
--- a/examples/device/ep/csrc/vmm.hpp
+++ b/examples/device/ep/csrc/vmm.hpp
@@ -1,17 +1,12 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 DeepSeek
  * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- *
- * This file incorporates material from the DeepSeek project, licensed under the MIT License.
- * The modifications made by NVIDIA are licensed under the Apache License, Version 2.0.
- *
- * SPDX-License-Identifier: MIT AND Apache-2.0
+ * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,

From 9ef9dcac07519d35c9162eee341a5ae44104657b Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Tue, 24 Mar 2026 18:06:50 +0200
Subject: [PATCH 21/29] NIXL/EP: Fix comments.

---
 examples/device/ep/csrc/nixl_ep.cpp |  30 +++---
 examples/device/ep/csrc/vmm.cpp     | 137 ++++++++++++++++++----------
 examples/device/ep/csrc/vmm.hpp     |  52 +++++++++--
 3 files changed, 146 insertions(+), 73 deletions(-)

diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp
index 910bb6d2f6..4f8dd30fd5 100644
--- a/examples/device/ep/csrc/nixl_ep.cpp
+++ b/examples/device/ep/csrc/nixl_ep.cpp
@@ -103,27 +103,27 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte
     EP_HOST_ASSERT(per_channel_bytes < std::numeric_limits<int>::max());
 
     // Create 32 MiB workspace
-    m_workspace_alloc = vmm_init(NUM_WORKSPACE_BYTES, device_id);
-    workspace = reinterpret_cast<void *>(m_workspace_alloc.ptr);
+    m_workspace_alloc = vmm_region::allocate(NUM_WORKSPACE_BYTES, device_id);
+    workspace = reinterpret_cast<void *>(m_workspace_alloc.ptr());
     CUDA_CHECK(cudaMemsetAsync(workspace, 0, NUM_WORKSPACE_BYTES, comm_stream));
 
     EP_HOST_ASSERT(max_experts_per_rank > 0);
-    m_rdma_alloc = vmm_init(num_rdma_bytes, device_id);
-    rdma_buffer_ptr = reinterpret_cast<void *>(m_rdma_alloc.ptr);
+    m_rdma_alloc = vmm_region::allocate(num_rdma_bytes, device_id);
+    rdma_buffer_ptr = reinterpret_cast<void *>(m_rdma_alloc.ptr());
     CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes));
 
     // Allocate and clean shrink buffer
     int num_mask_buffer_bytes = max_num_ranks * sizeof(int);
-    m_mask_alloc = vmm_init(num_mask_buffer_bytes, device_id);
-    mask_buffer_ptr = reinterpret_cast<int *>(m_mask_alloc.ptr);
+    m_mask_alloc = vmm_region::allocate(num_mask_buffer_bytes, device_id);
+    mask_buffer_ptr = reinterpret_cast<int *>(m_mask_alloc.ptr());
     CUDA_CHECK(cudaMemset(mask_buffer_ptr, 0xff, num_mask_buffer_bytes));
     CUDA_CHECK(cudaMemset(mask_buffer_ptr + rank, 0, sizeof(int)));
 
     int num_sync_buffer_bytes = max_num_ranks * sizeof(int);
-    m_sync_alloc = vmm_init(num_sync_buffer_bytes, device_id);
-    sync_buffer_ptr = reinterpret_cast<int *>(m_sync_alloc.ptr);
-    m_sync_count_alloc = vmm_init(num_sync_buffer_bytes, device_id);
-    sync_count_ptr = reinterpret_cast<int *>(m_sync_count_alloc.ptr);
+    m_sync_alloc = vmm_region::allocate(num_sync_buffer_bytes, device_id);
+    sync_buffer_ptr = reinterpret_cast<int *>(m_sync_alloc.ptr());
+    m_sync_count_alloc = vmm_region::allocate(num_sync_buffer_bytes, device_id);
+    sync_count_ptr = reinterpret_cast<int *>(m_sync_count_alloc.ptr());
     CUDA_CHECK(cudaMemset(sync_buffer_ptr, 0, num_sync_buffer_bytes));
     CUDA_CHECK(cudaMemset(sync_count_ptr, 0, num_sync_buffer_bytes));
     CUDA_CHECK(cudaDeviceSynchronize());
@@ -177,21 +177,21 @@ void Buffer::destroy() {
 
     _nixl_ep_destroy();
 
-    vmm_free(m_rdma_alloc);
+    m_rdma_alloc = vmm_region{};
     rdma_buffer_ptr = nullptr;
 
     if (nixl_agent_info and nixl_agent_info->agent != nullptr and getenv("NIXL_ETCD_ENDPOINTS")) {
         nixl_agent_info->agent->invalidateLocalMD();
     }
 
-    vmm_free(m_mask_alloc);
+    m_mask_alloc = vmm_region{};
     mask_buffer_ptr = nullptr;
-    vmm_free(m_sync_alloc);
+    m_sync_alloc = vmm_region{};
     sync_buffer_ptr = nullptr;
-    vmm_free(m_sync_count_alloc);
+    m_sync_count_alloc = vmm_region{};
     sync_count_ptr = nullptr;
 
-    vmm_free(m_workspace_alloc);
+    m_workspace_alloc = vmm_region{};
     workspace = nullptr;
 
     destroyed = true;
diff --git a/examples/device/ep/csrc/vmm.cpp b/examples/device/ep/csrc/vmm.cpp
index 62cb10114c..ac37a5ec4f 100644
--- a/examples/device/ep/csrc/vmm.cpp
+++ b/examples/device/ep/csrc/vmm.cpp
@@ -16,14 +16,83 @@
  */
 
 #include <stdexcept>
+#include <utility>
 
 #include "config.hpp"
 #include "vmm.hpp"
 
+void
+vmm_region::release() noexcept {
+    if (is_cuda_malloc_) {
+        if (ptr_) {
+            cudaFree(reinterpret_cast<void *>(ptr_));
+        }
+        ptr_ = 0;
+        size_ = 0;
+        is_cuda_malloc_ = false;
+        return;
+    }
+
+    if (vmm_mapped_) {
+        cuMemUnmap(ptr_, size_);
+        vmm_mapped_ = false;
+    }
+    if (vmm_addr_reserved_ && ptr_) {
+        cuMemAddressFree(ptr_, size_);
+        ptr_ = 0;
+        vmm_addr_reserved_ = false;
+    }
+    if (handle_) {
+        cuMemRelease(handle_);
+        handle_ = 0;
+    }
+    size_ = 0;
+}
+
+vmm_region::~vmm_region() {
+    release();
+}
+
+vmm_region::vmm_region(vmm_region &&other) noexcept
+    : ptr_(other.ptr_),
+      size_(other.size_),
+      handle_(other.handle_),
+      is_cuda_malloc_(other.is_cuda_malloc_),
+      vmm_addr_reserved_(other.vmm_addr_reserved_),
+      vmm_mapped_(other.vmm_mapped_) {
+    other.ptr_ = 0;
+    other.size_ = 0;
+    other.handle_ = 0;
+    other.is_cuda_malloc_ = false;
+    other.vmm_addr_reserved_ = false;
+    other.vmm_mapped_ = false;
+}
+
+vmm_region &
+vmm_region::operator=(vmm_region &&other) noexcept {
+    if (this == &other) {
+        return *this;
+    }
+    release();
+    ptr_ = other.ptr_;
+    size_ = other.size_;
+    handle_ = other.handle_;
+    is_cuda_malloc_ = other.is_cuda_malloc_;
+    vmm_addr_reserved_ = other.vmm_addr_reserved_;
+    vmm_mapped_ = other.vmm_mapped_;
+    other.ptr_ = 0;
+    other.size_ = 0;
+    other.handle_ = 0;
+    other.is_cuda_malloc_ = false;
+    other.vmm_addr_reserved_ = false;
+    other.vmm_mapped_ = false;
+    return *this;
+}
+
 vmm_region
-vmm_init(size_t size, CUdevice device) {
+vmm_region::allocate(size_t size, CUdevice device) {
     if (size == 0) {
-        throw std::invalid_argument("vmm_init: size must be non-zero");
+        throw std::invalid_argument("vmm_region::allocate: size must be non-zero");
     }
 
     struct cuda_alloc_ctx {
@@ -79,72 +148,44 @@ vmm_init(size_t size, CUdevice device) {
 
     static cuda_alloc_ctx ctx(device);
 
+    vmm_region region;
+
     if (!ctx.fabric_supported) {
-        vmm_region region = {};
-        region.size = size;
-        region.is_cuda_malloc = true;
-        if (cudaMalloc(reinterpret_cast<void **>(&region.ptr), size) != cudaSuccess) {
+        region.size_ = size;
+        region.is_cuda_malloc_ = true;
+        if (cudaMalloc(reinterpret_cast<void **>(&region.ptr_), size) != cudaSuccess) {
             throw std::runtime_error("cudaMalloc fallback failed (fabric not supported)");
         }
         return region;
     }
 
-    vmm_region region = {};
     CUmemAccessDesc access_desc = {};
-    const char *err_msg;
 
-    region.size = nixl_ep::align_up<size_t>(size, ctx.granularity);
+    region.size_ = nixl_ep::align_up<size_t>(size, ctx.granularity);
 
-    if (cuMemCreate(&region.handle, region.size, &ctx.prop, 0) != CUDA_SUCCESS) {
+    if (cuMemCreate(&region.handle_, region.size_, &ctx.prop, 0) != CUDA_SUCCESS) {
         throw std::runtime_error("Failed to create CUDA VMM allocation");
     }
 
-    if (cuMemAddressReserve(&region.ptr, region.size, 0, 0, 0) != CUDA_SUCCESS) {
-        err_msg = "Failed to reserve CUDA virtual address";
-        goto err_release;
+    if (cuMemAddressReserve(&region.ptr_, region.size_, 0, 0, 0) != CUDA_SUCCESS) {
+        region.release();
+        throw std::runtime_error("Failed to reserve CUDA virtual address");
     }
+    region.vmm_addr_reserved_ = true;
 
-    if (cuMemMap(region.ptr, region.size, 0, region.handle, 0) != CUDA_SUCCESS) {
-        err_msg = "Failed to map CUDA VMM memory";
-        goto err_free;
+    if (cuMemMap(region.ptr_, region.size_, 0, region.handle_, 0) != CUDA_SUCCESS) {
+        region.release();
+        throw std::runtime_error("Failed to map CUDA VMM memory");
     }
+    region.vmm_mapped_ = true;
 
     access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
     access_desc.location.id = device;
     access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-    if (cuMemSetAccess(region.ptr, region.size, &access_desc, 1) != CUDA_SUCCESS) {
-        err_msg = "Failed to set CUDA memory access";
-        goto err_unmap;
+    if (cuMemSetAccess(region.ptr_, region.size_, &access_desc, 1) != CUDA_SUCCESS) {
+        region.release();
+        throw std::runtime_error("Failed to set CUDA memory access");
     }
 
     return region;
-
-err_unmap:
-    cuMemUnmap(region.ptr, region.size);
-err_free:
-    cuMemAddressFree(region.ptr, region.size);
-    region.ptr = 0;
-err_release:
-    cuMemRelease(region.handle);
-    region.handle = 0;
-    throw std::runtime_error(err_msg);
-}
-
-void
-vmm_free(vmm_region &region) {
-    if (!region.ptr) {
-        return;
-    }
-    if (region.is_cuda_malloc) {
-        cudaFree(reinterpret_cast<void *>(region.ptr));
-        region.ptr = 0;
-        return;
-    }
-    cuMemUnmap(region.ptr, region.size);
-    cuMemAddressFree(region.ptr, region.size);
-    region.ptr = 0;
-    if (region.handle) {
-        cuMemRelease(region.handle);
-        region.handle = 0;
-    }
 }
diff --git a/examples/device/ep/csrc/vmm.hpp b/examples/device/ep/csrc/vmm.hpp
index 42e0c7a87a..b91ae811b5 100644
--- a/examples/device/ep/csrc/vmm.hpp
+++ b/examples/device/ep/csrc/vmm.hpp
@@ -19,15 +19,47 @@
 
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <cstddef>
 
-struct vmm_region {
-    CUdeviceptr ptr = 0;
-    size_t size = 0;
-    CUmemGenericAllocationHandle handle = 0;
-    bool is_cuda_malloc = false;
-};
+class vmm_region {
+public:
+    vmm_region() = default;
+    ~vmm_region();
+
+    vmm_region(vmm_region &&other) noexcept;
+    vmm_region &
+    operator=(vmm_region &&other) noexcept;
+
+    vmm_region(const vmm_region &) = delete;
+    vmm_region &
+    operator=(const vmm_region &) = delete;
+
+    [[nodiscard]] static vmm_region
+    allocate(size_t size, CUdevice device);
+
+    [[nodiscard]] CUdeviceptr
+    ptr() const noexcept {
+        return ptr_;
+    }
 
-vmm_region
-vmm_init(size_t size, CUdevice device);
-void
-vmm_free(vmm_region &region);
+    [[nodiscard]] size_t
+    size() const noexcept {
+        return size_;
+    }
+
+    [[nodiscard]] CUmemGenericAllocationHandle
+    handle() const noexcept {
+        return handle_;
+    }
+
+private:
+    void
+    release() noexcept;
+
+    CUdeviceptr ptr_ = 0;
+    size_t size_ = 0;
+    CUmemGenericAllocationHandle handle_ = 0;
+    bool is_cuda_malloc_ = false;
+    bool vmm_addr_reserved_ = false;
+    bool vmm_mapped_ = false;
+};

From 22c59f70f87b59aa2177e1b81ffe9dbef553caba Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Tue, 24 Mar 2026 19:29:05 +0200
Subject: [PATCH 22/29] NIXL/EP: Fix comment.

---
 examples/device/ep/csrc/cuda_utils.hpp | 48 ++++++++++++++
 examples/device/ep/csrc/nixl_ep.cpp    | 36 ++++++-----
 examples/device/ep/csrc/nixl_ep.hpp    | 10 +--
 examples/device/ep/csrc/vmm.cpp        | 87 ++++++++------------------
 examples/device/ep/csrc/vmm.hpp        | 13 ++--
 5 files changed, 103 insertions(+), 91 deletions(-)
 create mode 100644 examples/device/ep/csrc/cuda_utils.hpp

diff --git a/examples/device/ep/csrc/cuda_utils.hpp b/examples/device/ep/csrc/cuda_utils.hpp
new file mode 100644
index 0000000000..4ec5d4e0b4
--- /dev/null
+++ b/examples/device/ep/csrc/cuda_utils.hpp
@@ -0,0 +1,48 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <iostream>
+
+namespace nixl_ep {
+
+/** Log a non-fatal warning if a CUDA runtime API call failed (e.g. during teardown). */
+inline void
+warn_cuda_api(cudaError_t status, const char *context, const char *operation) noexcept {
+    if (status != cudaSuccess) {
+        std::cerr << "WARNING: " << context << " failed to " << operation << ": "
+                  << cudaGetErrorString(status) << '\n';
+    }
+}
+
+/** Log a non-fatal warning if a CUDA driver API call failed (e.g. during teardown). */
+inline void
+warn_cu_api(CUresult status, const char *context, const char *operation) noexcept {
+    if (status != CUDA_SUCCESS) {
+        const char *msg = nullptr;
+        if (cuGetErrorString(status, &msg) != CUDA_SUCCESS || msg == nullptr) {
+            msg = "unknown CUDA driver error";
+        }
+        std::cerr << "WARNING: " << context << " failed to " << operation << ": " << msg << '\n';
+    }
+}
+
+} // namespace nixl_ep
diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp
index da1d0758e0..39f6f64986 100644
--- a/examples/device/ep/csrc/nixl_ep.cpp
+++ b/examples/device/ep/csrc/nixl_ep.cpp
@@ -34,6 +34,7 @@
 #include <torch/python.h>
 
 #include "nixl_ep.hpp"
+#include "cuda_utils.hpp"
 #include "kernels/api.cuh"
 #include "kernels/configs.cuh"
 #include <cstdio>
@@ -102,28 +103,31 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte
     auto per_channel_bytes = ceil_div<int64_t>(num_rdma_bytes, denom_sms);
     EP_HOST_ASSERT(per_channel_bytes < std::numeric_limits<int>::max());
 
+    const CUdevice cu_dev = static_cast<CUdevice>(device_id);
+
     // Create 32 MiB workspace
-    m_workspace_alloc = vmm_region::allocate(NUM_WORKSPACE_BYTES, device_id);
-    workspace = reinterpret_cast<void *>(m_workspace_alloc.ptr());
+    m_workspace_alloc = std::make_unique<vmm_region>(NUM_WORKSPACE_BYTES, cu_dev);
+    workspace = reinterpret_cast<void *>(m_workspace_alloc->ptr());
     CUDA_CHECK(cudaMemsetAsync(workspace, 0, NUM_WORKSPACE_BYTES, comm_stream));
 
     EP_HOST_ASSERT(max_experts_per_rank > 0);
-    m_rdma_alloc = vmm_region::allocate(num_rdma_bytes, device_id);
-    rdma_buffer_ptr = reinterpret_cast<void *>(m_rdma_alloc.ptr());
+    m_rdma_alloc = std::make_unique<vmm_region>(static_cast<size_t>(num_rdma_bytes), cu_dev);
+    rdma_buffer_ptr = reinterpret_cast<void *>(m_rdma_alloc->ptr());
     CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes));
 
     // Allocate and clean shrink buffer
     int num_mask_buffer_bytes = max_num_ranks * sizeof(int);
-    m_mask_alloc = vmm_region::allocate(num_mask_buffer_bytes, device_id);
-    mask_buffer_ptr = reinterpret_cast<int *>(m_mask_alloc.ptr());
+    m_mask_alloc = std::make_unique<vmm_region>(static_cast<size_t>(num_mask_buffer_bytes), cu_dev);
+    mask_buffer_ptr = reinterpret_cast<int *>(m_mask_alloc->ptr());
     CUDA_CHECK(cudaMemset(mask_buffer_ptr, 0xff, num_mask_buffer_bytes));
     CUDA_CHECK(cudaMemset(mask_buffer_ptr + rank, 0, sizeof(int)));
 
     int num_sync_buffer_bytes = max_num_ranks * sizeof(int);
-    m_sync_alloc = vmm_region::allocate(num_sync_buffer_bytes, device_id);
-    sync_buffer_ptr = reinterpret_cast<int *>(m_sync_alloc.ptr());
-    m_sync_count_alloc = vmm_region::allocate(num_sync_buffer_bytes, device_id);
-    sync_count_ptr = reinterpret_cast<int *>(m_sync_count_alloc.ptr());
+    m_sync_alloc = std::make_unique<vmm_region>(static_cast<size_t>(num_sync_buffer_bytes), cu_dev);
+    sync_buffer_ptr = reinterpret_cast<int *>(m_sync_alloc->ptr());
+    m_sync_count_alloc =
+        std::make_unique<vmm_region>(static_cast<size_t>(num_sync_buffer_bytes), cu_dev);
+    sync_count_ptr = reinterpret_cast<int *>(m_sync_count_alloc->ptr());
     CUDA_CHECK(cudaMemset(sync_buffer_ptr, 0, num_sync_buffer_bytes));
     CUDA_CHECK(cudaMemset(sync_count_ptr, 0, num_sync_buffer_bytes));
     CUDA_CHECK(cudaDeviceSynchronize());
@@ -189,7 +193,7 @@ void Buffer::destroy() {
     }
 
     // Synchronize
-    warn_cuda(cudaDeviceSynchronize(), "synchronize device");
+    warn_cuda_api(cudaDeviceSynchronize(), "destroy()", "synchronize device");
 
     _nixl_ep_destroy();
 
@@ -214,16 +218,16 @@ void Buffer::destroy() {
         nixl_agent_info.reset();
     }
 
-    m_rdma_alloc = vmm_region{};
+    m_rdma_alloc.reset();
     rdma_buffer_ptr = nullptr;
-    m_mask_alloc = vmm_region{};
+    m_mask_alloc.reset();
     mask_buffer_ptr = nullptr;
-    m_sync_alloc = vmm_region{};
+    m_sync_alloc.reset();
     sync_buffer_ptr = nullptr;
-    m_sync_count_alloc = vmm_region{};
+    m_sync_count_alloc.reset();
     sync_count_ptr = nullptr;
 
-    m_workspace_alloc = vmm_region{};
+    m_workspace_alloc.reset();
     workspace = nullptr;
 
     destroyed = true;
diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp
index 9d520a195a..1053ab9d40 100644
--- a/examples/device/ep/csrc/nixl_ep.hpp
+++ b/examples/device/ep/csrc/nixl_ep.hpp
@@ -88,11 +88,11 @@ struct Buffer {
     int *sync_count_ptr = nullptr;
 
     /* Owning VMM allocations (keep raw ptrs above as aliases) */
-    vmm_region m_rdma_alloc;
-    vmm_region m_mask_alloc;
-    vmm_region m_sync_alloc;
-    vmm_region m_sync_count_alloc;
-    vmm_region m_workspace_alloc;
+    std::unique_ptr<vmm_region> m_rdma_alloc;
+    std::unique_ptr<vmm_region> m_mask_alloc;
+    std::unique_ptr<vmm_region> m_sync_alloc;
+    std::unique_ptr<vmm_region> m_sync_count_alloc;
+    std::unique_ptr<vmm_region> m_workspace_alloc;
 
     // Device info and communication
     int device_id;
diff --git a/examples/device/ep/csrc/vmm.cpp b/examples/device/ep/csrc/vmm.cpp
index ac37a5ec4f..63dd2699b8 100644
--- a/examples/device/ep/csrc/vmm.cpp
+++ b/examples/device/ep/csrc/vmm.cpp
@@ -16,16 +16,20 @@
  */
 
 #include <stdexcept>
-#include <utility>
 
 #include "config.hpp"
+#include "cuda_utils.hpp"
 #include "vmm.hpp"
 
+namespace {
+constexpr const char *k_vmm_ctx = "vmm_region";
+}
+
 void
 vmm_region::release() noexcept {
     if (is_cuda_malloc_) {
         if (ptr_) {
-            cudaFree(reinterpret_cast<void *>(ptr_));
+            nixl_ep::warn_cuda_api(cudaFree(reinterpret_cast<void *>(ptr_)), k_vmm_ctx, "cudaFree");
         }
         ptr_ = 0;
         size_ = 0;
@@ -34,16 +38,16 @@ vmm_region::release() noexcept {
     }
 
     if (vmm_mapped_) {
-        cuMemUnmap(ptr_, size_);
+        nixl_ep::warn_cu_api(cuMemUnmap(ptr_, size_), k_vmm_ctx, "cuMemUnmap");
         vmm_mapped_ = false;
     }
     if (vmm_addr_reserved_ && ptr_) {
-        cuMemAddressFree(ptr_, size_);
+        nixl_ep::warn_cu_api(cuMemAddressFree(ptr_, size_), k_vmm_ctx, "cuMemAddressFree");
         ptr_ = 0;
         vmm_addr_reserved_ = false;
     }
     if (handle_) {
-        cuMemRelease(handle_);
+        nixl_ep::warn_cu_api(cuMemRelease(handle_), k_vmm_ctx, "cuMemRelease");
         handle_ = 0;
     }
     size_ = 0;
@@ -53,46 +57,9 @@ vmm_region::~vmm_region() {
     release();
 }
 
-vmm_region::vmm_region(vmm_region &&other) noexcept
-    : ptr_(other.ptr_),
-      size_(other.size_),
-      handle_(other.handle_),
-      is_cuda_malloc_(other.is_cuda_malloc_),
-      vmm_addr_reserved_(other.vmm_addr_reserved_),
-      vmm_mapped_(other.vmm_mapped_) {
-    other.ptr_ = 0;
-    other.size_ = 0;
-    other.handle_ = 0;
-    other.is_cuda_malloc_ = false;
-    other.vmm_addr_reserved_ = false;
-    other.vmm_mapped_ = false;
-}
-
-vmm_region &
-vmm_region::operator=(vmm_region &&other) noexcept {
-    if (this == &other) {
-        return *this;
-    }
-    release();
-    ptr_ = other.ptr_;
-    size_ = other.size_;
-    handle_ = other.handle_;
-    is_cuda_malloc_ = other.is_cuda_malloc_;
-    vmm_addr_reserved_ = other.vmm_addr_reserved_;
-    vmm_mapped_ = other.vmm_mapped_;
-    other.ptr_ = 0;
-    other.size_ = 0;
-    other.handle_ = 0;
-    other.is_cuda_malloc_ = false;
-    other.vmm_addr_reserved_ = false;
-    other.vmm_mapped_ = false;
-    return *this;
-}
-
-vmm_region
-vmm_region::allocate(size_t size, CUdevice device) {
+vmm_region::vmm_region(size_t size, CUdevice device) {
     if (size == 0) {
-        throw std::invalid_argument("vmm_region::allocate: size must be non-zero");
+        throw std::invalid_argument("vmm_region: size must be non-zero");
     }
 
     struct cuda_alloc_ctx {
@@ -148,44 +115,40 @@ vmm_region::allocate(size_t size, CUdevice device) {
 
     static cuda_alloc_ctx ctx(device);
 
-    vmm_region region;
-
     if (!ctx.fabric_supported) {
-        region.size_ = size;
-        region.is_cuda_malloc_ = true;
-        if (cudaMalloc(reinterpret_cast<void **>(&region.ptr_), size) != cudaSuccess) {
+        size_ = size;
+        is_cuda_malloc_ = true;
+        if (cudaMalloc(reinterpret_cast<void **>(&ptr_), size) != cudaSuccess) {
             throw std::runtime_error("cudaMalloc fallback failed (fabric not supported)");
         }
-        return region;
+        return;
     }
 
     CUmemAccessDesc access_desc = {};
 
-    region.size_ = nixl_ep::align_up<size_t>(size, ctx.granularity);
+    size_ = nixl_ep::align_up<size_t>(size, ctx.granularity);
 
-    if (cuMemCreate(&region.handle_, region.size_, &ctx.prop, 0) != CUDA_SUCCESS) {
+    if (cuMemCreate(&handle_, size_, &ctx.prop, 0) != CUDA_SUCCESS) {
         throw std::runtime_error("Failed to create CUDA VMM allocation");
     }
 
-    if (cuMemAddressReserve(&region.ptr_, region.size_, 0, 0, 0) != CUDA_SUCCESS) {
-        region.release();
+    if (cuMemAddressReserve(&ptr_, size_, 0, 0, 0) != CUDA_SUCCESS) {
+        release();
         throw std::runtime_error("Failed to reserve CUDA virtual address");
     }
-    region.vmm_addr_reserved_ = true;
+    vmm_addr_reserved_ = true;
 
-    if (cuMemMap(region.ptr_, region.size_, 0, region.handle_, 0) != CUDA_SUCCESS) {
-        region.release();
+    if (cuMemMap(ptr_, size_, 0, handle_, 0) != CUDA_SUCCESS) {
+        release();
         throw std::runtime_error("Failed to map CUDA VMM memory");
     }
-    region.vmm_mapped_ = true;
+    vmm_mapped_ = true;
 
     access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
     access_desc.location.id = device;
     access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-    if (cuMemSetAccess(region.ptr_, region.size_, &access_desc, 1) != CUDA_SUCCESS) {
-        region.release();
+    if (cuMemSetAccess(ptr_, size_, &access_desc, 1) != CUDA_SUCCESS) {
+        release();
         throw std::runtime_error("Failed to set CUDA memory access");
     }
-
-    return region;
 }
diff --git a/examples/device/ep/csrc/vmm.hpp b/examples/device/ep/csrc/vmm.hpp
index b91ae811b5..8b3edf8a3b 100644
--- a/examples/device/ep/csrc/vmm.hpp
+++ b/examples/device/ep/csrc/vmm.hpp
@@ -23,19 +23,16 @@
 
 class vmm_region {
 public:
-    vmm_region() = default;
-    ~vmm_region();
+    explicit vmm_region(size_t size, CUdevice device);
 
-    vmm_region(vmm_region &&other) noexcept;
-    vmm_region &
-    operator=(vmm_region &&other) noexcept;
+    ~vmm_region();
 
     vmm_region(const vmm_region &) = delete;
     vmm_region &
     operator=(const vmm_region &) = delete;
-
-    [[nodiscard]] static vmm_region
-    allocate(size_t size, CUdevice device);
+    vmm_region(vmm_region &&) = delete;
+    vmm_region &
+    operator=(vmm_region &&) = delete;
 
     [[nodiscard]] CUdeviceptr
     ptr() const noexcept {

From 0a58f96f70a9a734ee270cff29fe10c8af79a6e1 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Tue, 24 Mar 2026 19:30:41 +0200
Subject: [PATCH 23/29] NIXL/EP: Fix.

---
 examples/device/ep/csrc/nixl_ep.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp
index 39f6f64986..aa43e9f60c 100644
--- a/examples/device/ep/csrc/nixl_ep.cpp
+++ b/examples/device/ep/csrc/nixl_ep.cpp
@@ -174,13 +174,6 @@ torch::Stream Buffer::get_comm_stream() const {
 }
 
 void Buffer::destroy() {
-    auto warn_cuda = [](cudaError_t status, const char* operation) noexcept {
-        if (status != cudaSuccess) {
-            std::cerr << "WARNING: destroy() failed to " << operation
-                      << ": " << cudaGetErrorString(status) << '\n';
-        }
-    };
-
     auto warn_nixl = [](nixl_status_t status, const char* operation) noexcept {
         if (status != NIXL_SUCCESS) {
             std::cerr << "WARNING: destroy() failed to " << operation

From d2bdfeac7c79d1953dda929a8683671c9c3ec4eb Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Tue, 24 Mar 2026 21:11:39 +0200
Subject: [PATCH 24/29] NIXL/EP: fix.

---
 examples/device/ep/csrc/vmm.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/examples/device/ep/csrc/vmm.cpp b/examples/device/ep/csrc/vmm.cpp
index 63dd2699b8..0310fc4a56 100644
--- a/examples/device/ep/csrc/vmm.cpp
+++ b/examples/device/ep/csrc/vmm.cpp
@@ -32,8 +32,6 @@ vmm_region::release() noexcept {
             nixl_ep::warn_cuda_api(cudaFree(reinterpret_cast<void *>(ptr_)), k_vmm_ctx, "cudaFree");
         }
         ptr_ = 0;
-        size_ = 0;
-        is_cuda_malloc_ = false;
         return;
     }
 
@@ -50,7 +48,6 @@ vmm_region::release() noexcept {
         nixl_ep::warn_cu_api(cuMemRelease(handle_), k_vmm_ctx, "cuMemRelease");
         handle_ = 0;
     }
-    size_ = 0;
 }
 
 vmm_region::~vmm_region() {

From 040589972c619aa193d6496bb48528d3f5e01973 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Wed, 25 Mar 2026 19:26:33 +0200
Subject: [PATCH 25/29] NIXL/EP: Fix comment.

---
 examples/device/ep/csrc/cuda_utils.hpp | 48 --------------------------
 examples/device/ep/csrc/nixl_ep.cpp    | 15 +++++---
 examples/device/ep/csrc/vmm.cpp        | 37 ++++++++++++++------
 examples/device/ep/csrc/vmm.hpp        | 25 ++++++--------
 4 files changed, 48 insertions(+), 77 deletions(-)
 delete mode 100644 examples/device/ep/csrc/cuda_utils.hpp

diff --git a/examples/device/ep/csrc/cuda_utils.hpp b/examples/device/ep/csrc/cuda_utils.hpp
deleted file mode 100644
index 4ec5d4e0b4..0000000000
--- a/examples/device/ep/csrc/cuda_utils.hpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-#include <iostream>
-
-namespace nixl_ep {
-
-/** Log a non-fatal warning if a CUDA runtime API call failed (e.g. during teardown). */
-inline void
-warn_cuda_api(cudaError_t status, const char *context, const char *operation) noexcept {
-    if (status != cudaSuccess) {
-        std::cerr << "WARNING: " << context << " failed to " << operation << ": "
-                  << cudaGetErrorString(status) << '\n';
-    }
-}
-
-/** Log a non-fatal warning if a CUDA driver API call failed (e.g. during teardown). */
-inline void
-warn_cu_api(CUresult status, const char *context, const char *operation) noexcept {
-    if (status != CUDA_SUCCESS) {
-        const char *msg = nullptr;
-        if (cuGetErrorString(status, &msg) != CUDA_SUCCESS || msg == nullptr) {
-            msg = "unknown CUDA driver error";
-        }
-        std::cerr << "WARNING: " << context << " failed to " << operation << ": " << msg << '\n';
-    }
-}
-
-} // namespace nixl_ep
diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp
index aa43e9f60c..26d40a9351 100644
--- a/examples/device/ep/csrc/nixl_ep.cpp
+++ b/examples/device/ep/csrc/nixl_ep.cpp
@@ -107,7 +107,7 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte
 
     // Create 32 MiB workspace
     m_workspace_alloc = std::make_unique<vmm_region>(NUM_WORKSPACE_BYTES, cu_dev);
-    workspace = reinterpret_cast<void *>(m_workspace_alloc->ptr());
+    workspace = m_workspace_alloc->ptr();
     CUDA_CHECK(cudaMemsetAsync(workspace, 0, NUM_WORKSPACE_BYTES, comm_stream));
 
     EP_HOST_ASSERT(max_experts_per_rank > 0);
@@ -118,16 +118,16 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte
     // Allocate and clean shrink buffer
     int num_mask_buffer_bytes = max_num_ranks * sizeof(int);
     m_mask_alloc = std::make_unique<vmm_region>(static_cast<size_t>(num_mask_buffer_bytes), cu_dev);
-    mask_buffer_ptr = reinterpret_cast<int *>(m_mask_alloc->ptr());
+    mask_buffer_ptr = static_cast<int *>(m_mask_alloc->ptr());
     CUDA_CHECK(cudaMemset(mask_buffer_ptr, 0xff, num_mask_buffer_bytes));
     CUDA_CHECK(cudaMemset(mask_buffer_ptr + rank, 0, sizeof(int)));
 
     int num_sync_buffer_bytes = max_num_ranks * sizeof(int);
     m_sync_alloc = std::make_unique<vmm_region>(static_cast<size_t>(num_sync_buffer_bytes), cu_dev);
-    sync_buffer_ptr = reinterpret_cast<int *>(m_sync_alloc->ptr());
+    sync_buffer_ptr = static_cast<int *>(m_sync_alloc->ptr());
     m_sync_count_alloc =
         std::make_unique<vmm_region>(static_cast<size_t>(num_sync_buffer_bytes), cu_dev);
-    sync_count_ptr = reinterpret_cast<int *>(m_sync_count_alloc->ptr());
+    sync_count_ptr = static_cast<int *>(m_sync_count_alloc->ptr());
     CUDA_CHECK(cudaMemset(sync_buffer_ptr, 0, num_sync_buffer_bytes));
     CUDA_CHECK(cudaMemset(sync_count_ptr, 0, num_sync_buffer_bytes));
     CUDA_CHECK(cudaDeviceSynchronize());
@@ -174,6 +174,13 @@ torch::Stream Buffer::get_comm_stream() const {
 }
 
 void Buffer::destroy() {
+    auto warn_cuda = [](cudaError_t status, const char *operation) noexcept {
+        if (status != cudaSuccess) {
+            std::cerr << "WARNING: destroy() failed to " << operation << ": "
+                      << cudaGetErrorString(status) << '\n';
+        }
+    };
+
     auto warn_nixl = [](nixl_status_t status, const char* operation) noexcept {
         if (status != NIXL_SUCCESS) {
             std::cerr << "WARNING: destroy() failed to " << operation
diff --git a/examples/device/ep/csrc/vmm.cpp b/examples/device/ep/csrc/vmm.cpp
index 0310fc4a56..d72db7bf27 100644
--- a/examples/device/ep/csrc/vmm.cpp
+++ b/examples/device/ep/csrc/vmm.cpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,37 +15,51 @@
  * limitations under the License.
  */
 
+#include <iostream>
 #include <stdexcept>
 
 #include "config.hpp"
-#include "cuda_utils.hpp"
 #include "vmm.hpp"
 
 namespace {
+
 constexpr const char *k_vmm_ctx = "vmm_region";
 }
 
+namespace nixl_ep {
+
+/** Log a non-fatal warning if a CUDA driver API call failed (e.g. during teardown). */
+void
+vmm_region::warn_cu_api(CUresult status, const char *context, const char *operation) noexcept {
+    if (status != CUDA_SUCCESS) {
+        const char *msg = nullptr;
+        if (cuGetErrorString(status, &msg) != CUDA_SUCCESS || msg == nullptr) {
+            msg = "unknown CUDA driver error";
+        }
+        std::cerr << "WARNING: " << context << " failed to " << operation << ": " << msg << '\n';
+    }
+}
+
 void
 vmm_region::release() noexcept {
     if (is_cuda_malloc_) {
         if (ptr_) {
-            nixl_ep::warn_cuda_api(cudaFree(reinterpret_cast<void *>(ptr_)), k_vmm_ctx, "cudaFree");
+            warn_cu_api(cuMemFree(ptr_), k_vmm_ctx, "cuMemFree");
         }
         ptr_ = 0;
         return;
     }
 
     if (vmm_mapped_) {
-        nixl_ep::warn_cu_api(cuMemUnmap(ptr_, size_), k_vmm_ctx, "cuMemUnmap");
+        warn_cu_api(cuMemUnmap(ptr_, size_), k_vmm_ctx, "cuMemUnmap");
         vmm_mapped_ = false;
     }
-    if (vmm_addr_reserved_ && ptr_) {
-        nixl_ep::warn_cu_api(cuMemAddressFree(ptr_, size_), k_vmm_ctx, "cuMemAddressFree");
+    if (ptr_) {
+        warn_cu_api(cuMemAddressFree(ptr_, size_), k_vmm_ctx, "cuMemAddressFree");
         ptr_ = 0;
-        vmm_addr_reserved_ = false;
     }
     if (handle_) {
-        nixl_ep::warn_cu_api(cuMemRelease(handle_), k_vmm_ctx, "cuMemRelease");
+        warn_cu_api(cuMemRelease(handle_), k_vmm_ctx, "cuMemRelease");
         handle_ = 0;
     }
 }
@@ -115,8 +129,8 @@ vmm_region::vmm_region(size_t size, CUdevice device) {
     if (!ctx.fabric_supported) {
         size_ = size;
         is_cuda_malloc_ = true;
-        if (cudaMalloc(reinterpret_cast<void **>(&ptr_), size) != cudaSuccess) {
-            throw std::runtime_error("cudaMalloc fallback failed (fabric not supported)");
+        if (cuMemAlloc(&ptr_, size) != CUDA_SUCCESS) {
+            throw std::runtime_error("cuMemAlloc fallback failed (fabric not supported)");
         }
         return;
     }
@@ -133,7 +147,6 @@ vmm_region::vmm_region(size_t size, CUdevice device) {
         release();
         throw std::runtime_error("Failed to reserve CUDA virtual address");
     }
-    vmm_addr_reserved_ = true;
 
     if (cuMemMap(ptr_, size_, 0, handle_, 0) != CUDA_SUCCESS) {
         release();
@@ -149,3 +162,5 @@ vmm_region::vmm_region(size_t size, CUdevice device) {
         throw std::runtime_error("Failed to set CUDA memory access");
     }
 }
+
+} // namespace nixl_ep
diff --git a/examples/device/ep/csrc/vmm.hpp b/examples/device/ep/csrc/vmm.hpp
index 8b3edf8a3b..7b71092f76 100644
--- a/examples/device/ep/csrc/vmm.hpp
+++ b/examples/device/ep/csrc/vmm.hpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,8 +18,10 @@
 #pragma once
 
 #include <cuda.h>
-#include <cuda_runtime.h>
 #include <cstddef>
+#include <cstdint>
+
+namespace nixl_ep {
 
 class vmm_region {
 public:
@@ -34,22 +36,15 @@ class vmm_region {
     vmm_region &
     operator=(vmm_region &&) = delete;
 
-    [[nodiscard]] CUdeviceptr
+    [[nodiscard]] void *
     ptr() const noexcept {
-        return ptr_;
-    }
-
-    [[nodiscard]] size_t
-    size() const noexcept {
-        return size_;
-    }
-
-    [[nodiscard]] CUmemGenericAllocationHandle
-    handle() const noexcept {
-        return handle_;
+        return reinterpret_cast<void *>(static_cast<std::uintptr_t>(ptr_));
     }
 
 private:
+    static void
+    warn_cu_api(CUresult status, const char *context, const char *operation) noexcept;
+
     void
     release() noexcept;
 
@@ -60,3 +55,5 @@ class vmm_region {
     bool vmm_addr_reserved_ = false;
     bool vmm_mapped_ = false;
 };
+
+} // namespace nixl_ep

From e82d42277a5888ceef663b583c5f427c5f62d163 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Wed, 25 Mar 2026 19:57:22 +0200
Subject: [PATCH 26/29] NIXL/EP: Fix comment.

---
 examples/device/ep/csrc/nixl_ep.cpp | 16 ++++++----------
 examples/device/ep/csrc/vmm.cpp     | 29 +++++++++++++++++------------
 examples/device/ep/csrc/vmm.hpp     |  2 +-
 3 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp
index 26d40a9351..0a0d4069ce 100644
--- a/examples/device/ep/csrc/nixl_ep.cpp
+++ b/examples/device/ep/csrc/nixl_ep.cpp
@@ -34,7 +34,6 @@
 #include <torch/python.h>
 
 #include "nixl_ep.hpp"
-#include "cuda_utils.hpp"
 #include "kernels/api.cuh"
 #include "kernels/configs.cuh"
 #include <cstdio>
@@ -103,30 +102,27 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte
     auto per_channel_bytes = ceil_div<int64_t>(num_rdma_bytes, denom_sms);
     EP_HOST_ASSERT(per_channel_bytes < std::numeric_limits<int>::max());
 
-    const CUdevice cu_dev = static_cast<CUdevice>(device_id);
-
     // Create 32 MiB workspace
-    m_workspace_alloc = std::make_unique<vmm_region>(NUM_WORKSPACE_BYTES, cu_dev);
+    m_workspace_alloc = std::make_unique<vmm_region>(NUM_WORKSPACE_BYTES);
     workspace = m_workspace_alloc->ptr();
     CUDA_CHECK(cudaMemsetAsync(workspace, 0, NUM_WORKSPACE_BYTES, comm_stream));
 
     EP_HOST_ASSERT(max_experts_per_rank > 0);
-    m_rdma_alloc = std::make_unique<vmm_region>(static_cast<size_t>(num_rdma_bytes), cu_dev);
+    m_rdma_alloc = std::make_unique<vmm_region>(static_cast<size_t>(num_rdma_bytes));
     rdma_buffer_ptr = reinterpret_cast<void *>(m_rdma_alloc->ptr());
     CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes));
 
     // Allocate and clean shrink buffer
     int num_mask_buffer_bytes = max_num_ranks * sizeof(int);
-    m_mask_alloc = std::make_unique<vmm_region>(static_cast<size_t>(num_mask_buffer_bytes), cu_dev);
+    m_mask_alloc = std::make_unique<vmm_region>(static_cast<size_t>(num_mask_buffer_bytes));
     mask_buffer_ptr = static_cast<int *>(m_mask_alloc->ptr());
     CUDA_CHECK(cudaMemset(mask_buffer_ptr, 0xff, num_mask_buffer_bytes));
     CUDA_CHECK(cudaMemset(mask_buffer_ptr + rank, 0, sizeof(int)));
 
     int num_sync_buffer_bytes = max_num_ranks * sizeof(int);
-    m_sync_alloc = std::make_unique<vmm_region>(static_cast<size_t>(num_sync_buffer_bytes), cu_dev);
+    m_sync_alloc = std::make_unique<vmm_region>(static_cast<size_t>(num_sync_buffer_bytes));
     sync_buffer_ptr = static_cast<int *>(m_sync_alloc->ptr());
-    m_sync_count_alloc =
-        std::make_unique<vmm_region>(static_cast<size_t>(num_sync_buffer_bytes), cu_dev);
+    m_sync_count_alloc = std::make_unique<vmm_region>(static_cast<size_t>(num_sync_buffer_bytes));
     sync_count_ptr = static_cast<int *>(m_sync_count_alloc->ptr());
     CUDA_CHECK(cudaMemset(sync_buffer_ptr, 0, num_sync_buffer_bytes));
     CUDA_CHECK(cudaMemset(sync_count_ptr, 0, num_sync_buffer_bytes));
@@ -193,7 +189,7 @@ void Buffer::destroy() {
     }
 
     // Synchronize
-    warn_cuda_api(cudaDeviceSynchronize(), "destroy()", "synchronize device");
+    warn_cuda(cudaDeviceSynchronize(), "synchronize device");
 
     _nixl_ep_destroy();
 
diff --git a/examples/device/ep/csrc/vmm.cpp b/examples/device/ep/csrc/vmm.cpp
index d72db7bf27..dfb4160ce3 100644
--- a/examples/device/ep/csrc/vmm.cpp
+++ b/examples/device/ep/csrc/vmm.cpp
@@ -68,7 +68,7 @@ vmm_region::~vmm_region() {
     release();
 }
 
-vmm_region::vmm_region(size_t size, CUdevice device) {
+vmm_region::vmm_region(size_t size) {
     if (size == 0) {
         throw std::invalid_argument("vmm_region: size must be non-zero");
     }
@@ -77,9 +77,16 @@ vmm_region::vmm_region(size_t size, CUdevice device) {
         bool fabric_supported;
         CUmemAllocationProp prop;
         size_t granularity;
+        CUdevice device;
+        CUmemAccessDesc access_desc = {};
 
-        cuda_alloc_ctx(CUdevice dev) : fabric_supported(false), prop({}), granularity(0) {
+        cuda_alloc_ctx() : fabric_supported(false), prop({}), granularity(0) {
             int version;
+
+            if (cuCtxGetDevice(&device) != CUDA_SUCCESS) {
+                throw std::runtime_error("CUDA device should be set before creating a vmm_region");
+            }
+
             if (cuDriverGetVersion(&version) != CUDA_SUCCESS) {
                 throw std::runtime_error("Failed to get CUDA driver version");
             }
@@ -91,7 +98,7 @@ vmm_region::vmm_region(size_t size, CUdevice device) {
             int fab = 0;
             if ((cuDeviceGetAttribute(&fab,
                                       CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED,
-                                      dev) != CUDA_SUCCESS) ||
+                                      device) != CUDA_SUCCESS) ||
                 (!fab)) {
                 return; /* no fabric — fall back to cudaMalloc */
             }
@@ -99,7 +106,7 @@ vmm_region::vmm_region(size_t size, CUdevice device) {
             int rdma_vmm_supported = 0;
             if (cuDeviceGetAttribute(&rdma_vmm_supported,
                                      CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
-                                     dev) != CUDA_SUCCESS) {
+                                     device) != CUDA_SUCCESS) {
                 throw std::runtime_error(
                     "Failed to query GPUDirect RDMA with VMM support attribute");
             }
@@ -111,7 +118,7 @@ vmm_region::vmm_region(size_t size, CUdevice device) {
 
             prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
             prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-            prop.location.id = dev;
+            prop.location.id = device;
             prop.allocFlags.gpuDirectRDMACapable = 1;
             prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
 
@@ -120,11 +127,14 @@ vmm_region::vmm_region(size_t size, CUdevice device) {
                 throw std::runtime_error("Failed to get CUDA allocation granularity");
             }
 
+            access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+            access_desc.location.id = device;
+            access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
             fabric_supported = true;
         }
     };
 
-    static cuda_alloc_ctx ctx(device);
+    static cuda_alloc_ctx ctx{};
 
     if (!ctx.fabric_supported) {
         size_ = size;
@@ -135,8 +145,6 @@ vmm_region::vmm_region(size_t size, CUdevice device) {
         return;
     }
 
-    CUmemAccessDesc access_desc = {};
-
     size_ = nixl_ep::align_up<size_t>(size, ctx.granularity);
 
     if (cuMemCreate(&handle_, size_, &ctx.prop, 0) != CUDA_SUCCESS) {
@@ -154,10 +162,7 @@ vmm_region::vmm_region(size_t size, CUdevice device) {
     }
     vmm_mapped_ = true;
 
-    access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-    access_desc.location.id = device;
-    access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-    if (cuMemSetAccess(ptr_, size_, &access_desc, 1) != CUDA_SUCCESS) {
+    if (cuMemSetAccess(ptr_, size_, &ctx.access_desc, 1) != CUDA_SUCCESS) {
         release();
         throw std::runtime_error("Failed to set CUDA memory access");
     }
diff --git a/examples/device/ep/csrc/vmm.hpp b/examples/device/ep/csrc/vmm.hpp
index 7b71092f76..be203da569 100644
--- a/examples/device/ep/csrc/vmm.hpp
+++ b/examples/device/ep/csrc/vmm.hpp
@@ -25,7 +25,7 @@ namespace nixl_ep {
 
 class vmm_region {
 public:
-    explicit vmm_region(size_t size, CUdevice device);
+    explicit vmm_region(size_t size);
 
     ~vmm_region();
 

From 5b30ea49b7ef3367901d42bdcc4382c60d6f6efa Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Wed, 25 Mar 2026 19:59:44 +0200
Subject: [PATCH 27/29] NIXL/EP: Fix comment.

---
 examples/device/ep/csrc/nixl_ep.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp
index 0a0d4069ce..e8f2055d49 100644
--- a/examples/device/ep/csrc/nixl_ep.cpp
+++ b/examples/device/ep/csrc/nixl_ep.cpp
@@ -109,7 +109,7 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte
 
     EP_HOST_ASSERT(max_experts_per_rank > 0);
     m_rdma_alloc = std::make_unique<vmm_region>(static_cast<size_t>(num_rdma_bytes));
-    rdma_buffer_ptr = reinterpret_cast<void *>(m_rdma_alloc->ptr());
+    rdma_buffer_ptr = m_rdma_alloc->ptr();
     CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes));
 
     // Allocate and clean shrink buffer

From 32cd73928bdd1f6c2a2dc28f2e2bfc8a5cfb9c03 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Mon, 30 Mar 2026 19:51:31 +0300
Subject: [PATCH 28/29] NIXL/EP: Fix comment.

---
 examples/device/ep/csrc/vmm.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/device/ep/csrc/vmm.hpp b/examples/device/ep/csrc/vmm.hpp
index be203da569..1efbfeb528 100644
--- a/examples/device/ep/csrc/vmm.hpp
+++ b/examples/device/ep/csrc/vmm.hpp
@@ -52,7 +52,6 @@ class vmm_region {
     size_t size_ = 0;
     CUmemGenericAllocationHandle handle_ = 0;
     bool is_cuda_malloc_ = false;
-    bool vmm_addr_reserved_ = false;
     bool vmm_mapped_ = false;
 };
 

From b1cab66f355462495e2208beb852ede2f7e70531 Mon Sep 17 00:00:00 2001
From: Ofir Farjon <ofarjon@nvidia.com>
Date: Mon, 30 Mar 2026 20:22:01 +0300
Subject: [PATCH 29/29] NIXL/EP: Fix comment.

---
 examples/device/ep/csrc/vmm.cpp | 19 +++++++++++--------
 examples/device/ep/csrc/vmm.hpp |  3 ---
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/examples/device/ep/csrc/vmm.cpp b/examples/device/ep/csrc/vmm.cpp
index dfb4160ce3..93c7f28b64 100644
--- a/examples/device/ep/csrc/vmm.cpp
+++ b/examples/device/ep/csrc/vmm.cpp
@@ -24,13 +24,10 @@
 namespace {
 
 constexpr const char *k_vmm_ctx = "vmm_region";
-}
-
-namespace nixl_ep {
 
 /** Log a non-fatal warning if a CUDA driver API call failed (e.g. during teardown). */
 void
-vmm_region::warn_cu_api(CUresult status, const char *context, const char *operation) noexcept {
+warn_cu_api(CUresult status, const char *context, const char *operation) noexcept {
     if (status != CUDA_SUCCESS) {
         const char *msg = nullptr;
         if (cuGetErrorString(status, &msg) != CUDA_SUCCESS || msg == nullptr) {
@@ -40,6 +37,10 @@ vmm_region::warn_cu_api(CUresult status, const char *context, const char *operat
     }
 }
 
+} // namespace
+
+namespace nixl_ep {
+
 void
 vmm_region::release() noexcept {
     if (is_cuda_malloc_) {
@@ -112,8 +113,10 @@ vmm_region::vmm_region(size_t size) {
             }
 
             if (!rdma_vmm_supported) {
-                throw std::runtime_error(
-                    "GPUDirect RDMA with CUDA VMM is not supported on this device");
+                std::cerr << "DIAG: " << k_vmm_ctx
+                          << " - GPUDirect RDMA with CUDA VMM not supported; falling back to "
+                             "cuMemAlloc\n";
+                return;
             }
 
             prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
@@ -134,13 +137,13 @@ vmm_region::vmm_region(size_t size) {
         }
     };
 
-    static cuda_alloc_ctx ctx{};
+    static cuda_alloc_ctx ctx;
 
     if (!ctx.fabric_supported) {
         size_ = size;
         is_cuda_malloc_ = true;
         if (cuMemAlloc(&ptr_, size) != CUDA_SUCCESS) {
-            throw std::runtime_error("cuMemAlloc fallback failed (fabric not supported)");
+            throw std::runtime_error("cuMemAlloc fallback failed");
         }
         return;
     }
diff --git a/examples/device/ep/csrc/vmm.hpp b/examples/device/ep/csrc/vmm.hpp
index 1efbfeb528..9b007b1243 100644
--- a/examples/device/ep/csrc/vmm.hpp
+++ b/examples/device/ep/csrc/vmm.hpp
@@ -42,9 +42,6 @@ class vmm_region {
     }
 
 private:
-    static void
-    warn_cu_api(CUresult status, const char *context, const char *operation) noexcept;
-
     void
     release() noexcept;