From b6cb209e464c56a9a8fb7b60529cd205be9ff305 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Thu, 5 Mar 2026 18:11:10 +0200 Subject: [PATCH 01/29] NIXL/EP: Use vmm API instead of cudaMalloc --- examples/device/ep/csrc/nixl_ep.cpp | 36 ++++++---- examples/device/ep/csrc/nixl_ep.hpp | 107 ++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+), 15 deletions(-) diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp index 8834b2b213..00884602f7 100644 --- a/examples/device/ep/csrc/nixl_ep.cpp +++ b/examples/device/ep/csrc/nixl_ep.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -103,25 +104,28 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte EP_HOST_ASSERT(per_channel_bytes < std::numeric_limits::max()); // Create 32 MiB workspace - CUDA_CHECK(cudaMalloc(&workspace, NUM_WORKSPACE_BYTES)); + m_workspace_alloc = std::make_unique(NUM_WORKSPACE_BYTES); + workspace = m_workspace_alloc->ptr(); CUDA_CHECK(cudaMemsetAsync(workspace, 0, NUM_WORKSPACE_BYTES, comm_stream)); EP_HOST_ASSERT(max_experts_per_rank > 0); - CUDA_CHECK(cudaMalloc(&rdma_buffer_ptr, num_rdma_bytes)); + m_rdma_alloc = std::make_unique(num_rdma_bytes); + rdma_buffer_ptr = m_rdma_alloc->ptr(); CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes)); // Allocate and clean shrink buffer - mask_buffer_ptr = nullptr; - sync_buffer_ptr = nullptr; int num_mask_buffer_bytes = max_num_ranks * sizeof(int); - CUDA_CHECK(cudaMalloc(&mask_buffer_ptr, num_mask_buffer_bytes)); + m_mask_alloc = std::make_unique(num_mask_buffer_bytes); + mask_buffer_ptr = static_cast(m_mask_alloc->ptr()); CUDA_CHECK(cudaMemset(mask_buffer_ptr, 0xff, num_mask_buffer_bytes)); CUDA_CHECK(cudaMemset(mask_buffer_ptr + rank, 0, sizeof(int))); int num_sync_buffer_bytes = max_num_ranks * sizeof(int); - CUDA_CHECK(cudaMalloc(&sync_buffer_ptr, num_sync_buffer_bytes)); + m_sync_alloc = std::make_unique(num_sync_buffer_bytes); + sync_buffer_ptr = static_cast(m_sync_alloc->ptr()); + m_sync_count_alloc = std::make_unique(num_sync_buffer_bytes); + sync_count_ptr = static_cast(m_sync_count_alloc->ptr()); CUDA_CHECK(cudaMemset(sync_buffer_ptr, 0, num_sync_buffer_bytes)); - CUDA_CHECK(cudaMalloc(&sync_count_ptr, num_sync_buffer_bytes)); CUDA_CHECK(cudaMemset(sync_count_ptr, 0, num_sync_buffer_bytes)); CUDA_CHECK(cudaDeviceSynchronize()); @@ -174,20 +178,22 @@ void Buffer::destroy() { _nixl_ep_destroy(); - cudaFree(rdma_buffer_ptr); + m_rdma_alloc.reset(); + rdma_buffer_ptr = nullptr; if (nixl_agent_info and nixl_agent_info->agent != nullptr and getenv("NIXL_ETCD_ENDPOINTS")) { nixl_agent_info->agent->invalidateLocalMD(); } - rdma_buffer_ptr = nullptr; - - cudaFree(mask_buffer_ptr); - cudaFree(sync_buffer_ptr); - cudaFree(sync_count_ptr); + m_mask_alloc.reset(); + mask_buffer_ptr = nullptr; + m_sync_alloc.reset(); + sync_buffer_ptr = nullptr; + m_sync_count_alloc.reset(); + sync_count_ptr = nullptr; - // Free workspace - CUDA_CHECK(cudaFree(workspace)); + m_workspace_alloc.reset(); + workspace = nullptr; destroyed = true; available = false; diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp index 3a405fb7ee..c52f2a86f3 100644 --- a/examples/device/ep/csrc/nixl_ep.hpp +++ b/examples/device/ep/csrc/nixl_ep.hpp @@ -35,7 +35,9 @@ #include #include +#include #include +#include #include "config.hpp" #include "event.hpp" #include "kernels/configs.cuh" @@ -47,6 +49,104 @@ #define TORCH_EXTENSION_NAME nixl_ep_cpp #endif +/* CUDA memory allocator using VMM. Uses fabric handle type if the device + * supports it (CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED), otherwise + * falls back to CU_MEM_HANDLE_TYPE_NONE. */ +class cuda_allocator { +public: + cuda_allocator(size_t size) : m_size(0), m_ptr(0), m_alloc_handle(0) + { + if (size == 0) { + throw std::invalid_argument("cuda_allocator: size must be non-zero"); + } + + CUdevice device; + if (cuCtxGetDevice(&device) != CUDA_SUCCESS) { + throw std::runtime_error("Failed to get CUDA device handle"); + } + + int fabric_supported = 0; + cuDeviceGetAttribute(&fabric_supported, + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, + device); + + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; + prop.requestedHandleTypes = fabric_supported ? CU_MEM_HANDLE_TYPE_FABRIC + : CU_MEM_HANDLE_TYPE_NONE; + + size_t granularity = 0; + if (cuMemGetAllocationGranularity(&granularity, &prop, + CU_MEM_ALLOC_GRANULARITY_MINIMUM) != + CUDA_SUCCESS) { + throw std::runtime_error("Failed to get CUDA allocation granularity"); + } + + init_vmm(size, device, prop, granularity); + } + + ~cuda_allocator() + { + if (m_ptr) { + cuMemUnmap(m_ptr, m_size); + cuMemAddressFree(m_ptr, m_size); + } + if (m_alloc_handle) { + cuMemRelease(m_alloc_handle); + } + } + + void* ptr() const { return reinterpret_cast(m_ptr); } + size_t size() const { return m_size; } + + cuda_allocator(const cuda_allocator&) = delete; + cuda_allocator& operator=(const cuda_allocator&) = delete; + +private: + void init_vmm(size_t size, CUdevice device, const CUmemAllocationProp &prop, + size_t granularity) + { + m_size = (size + granularity - 1) / granularity * granularity; + + if (cuMemCreate(&m_alloc_handle, m_size, &prop, 0) != CUDA_SUCCESS) { + throw std::runtime_error("Failed to create CUDA VMM allocation"); + } + + if (cuMemAddressReserve(&m_ptr, m_size, 0, 0, 0) != CUDA_SUCCESS) { + cuMemRelease(m_alloc_handle); + m_alloc_handle = 0; + throw std::runtime_error("Failed to reserve CUDA virtual address"); + } + + if (cuMemMap(m_ptr, m_size, 0, m_alloc_handle, 0) != CUDA_SUCCESS) { + cuMemAddressFree(m_ptr, m_size); + m_ptr = 0; + cuMemRelease(m_alloc_handle); + m_alloc_handle = 0; + throw std::runtime_error("Failed to map CUDA VMM memory"); + } + + CUmemAccessDesc access_desc = {}; + access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + access_desc.location.id = device; + access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + if (cuMemSetAccess(m_ptr, m_size, &access_desc, 1) != CUDA_SUCCESS) { + cuMemUnmap(m_ptr, m_size); + cuMemAddressFree(m_ptr, m_size); + m_ptr = 0; + cuMemRelease(m_alloc_handle); + m_alloc_handle = 0; + throw std::runtime_error("Failed to set CUDA memory access"); + } + } + + size_t m_size; + CUdeviceptr m_ptr; + CUmemGenericAllocationHandle m_alloc_handle; +}; + namespace nixl_ep { struct NixlPeerInfo { @@ -83,6 +183,13 @@ struct Buffer { int *sync_buffer_ptr = nullptr; int *sync_count_ptr = nullptr; + // Owning allocators (keep raw ptrs above as aliases for use throughout) + std::unique_ptr m_rdma_alloc; + std::unique_ptr m_mask_alloc; + std::unique_ptr m_sync_alloc; + std::unique_ptr m_sync_count_alloc; + std::unique_ptr m_workspace_alloc; + // Device info and communication int device_id; int num_device_sms; From c36a4c2655dbea1a742dbcfe9786c6516029849e Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Sat, 7 Mar 2026 17:33:33 +0200 Subject: [PATCH 02/29] NIXL/EP: Use vmm API instead of cudaMalloc --- examples/device/ep/csrc/nixl_ep.hpp | 65 ++++++++++++++++++----------- 1 file changed, 40 insertions(+), 25 deletions(-) diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp index c52f2a86f3..003c9a317f 100644 --- a/examples/device/ep/csrc/nixl_ep.hpp +++ b/examples/device/ep/csrc/nixl_ep.hpp @@ -36,6 +36,7 @@ #include #include +#include #include #include #include "config.hpp" @@ -49,12 +50,15 @@ #define TORCH_EXTENSION_NAME nixl_ep_cpp #endif -/* CUDA memory allocator using VMM. Uses fabric handle type if the device - * supports it (CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED), otherwise - * falls back to CU_MEM_HANDLE_TYPE_NONE. */ +/* CUDA memory allocator. Uses fabric VMM (cuMemCreate) if the device supports + * CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, otherwise falls back to + * cudaMalloc. The cudaMalloc fallback is intentional: VMM memory without a + * fabric handle provides no benefit and is incompatible with GDRCopy, which + * requires traditionally allocated memory for kernel-level pinning. */ class cuda_allocator { public: - cuda_allocator(size_t size) : m_size(0), m_ptr(0), m_alloc_handle(0) + cuda_allocator(size_t size) : m_size(size), m_ptr(0), m_alloc_handle(0), + m_cuda_ptr(nullptr) { if (size == 0) { throw std::invalid_argument("cuda_allocator: size must be non-zero"); @@ -70,21 +74,11 @@ class cuda_allocator { CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device); - CUmemAllocationProp prop = {}; - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = device; - prop.requestedHandleTypes = fabric_supported ? CU_MEM_HANDLE_TYPE_FABRIC - : CU_MEM_HANDLE_TYPE_NONE; - - size_t granularity = 0; - if (cuMemGetAllocationGranularity(&granularity, &prop, - CU_MEM_ALLOC_GRANULARITY_MINIMUM) != - CUDA_SUCCESS) { - throw std::runtime_error("Failed to get CUDA allocation granularity"); + if (fabric_supported) { + init_vmm(size, device); + } else { + init_regular(size); } - - init_vmm(size, device, prop, granularity); } ~cuda_allocator() @@ -92,26 +86,39 @@ class cuda_allocator { if (m_ptr) { cuMemUnmap(m_ptr, m_size); cuMemAddressFree(m_ptr, m_size); - } - if (m_alloc_handle) { cuMemRelease(m_alloc_handle); + } else if (m_cuda_ptr) { + cudaFree(m_cuda_ptr); } } - void* ptr() const { return reinterpret_cast(m_ptr); } + void* ptr() const { return m_ptr ? reinterpret_cast(m_ptr) + : m_cuda_ptr; } size_t size() const { return m_size; } cuda_allocator(const cuda_allocator&) = delete; cuda_allocator& operator=(const cuda_allocator&) = delete; private: - void init_vmm(size_t size, CUdevice device, const CUmemAllocationProp &prop, - size_t granularity) + void init_vmm(size_t size, CUdevice device) { + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; + + size_t granularity = 0; + if (cuMemGetAllocationGranularity(&granularity, &prop, + CU_MEM_ALLOC_GRANULARITY_MINIMUM) != + CUDA_SUCCESS) { + throw std::runtime_error("Failed to get CUDA allocation granularity"); + } + m_size = (size + granularity - 1) / granularity * granularity; if (cuMemCreate(&m_alloc_handle, m_size, &prop, 0) != CUDA_SUCCESS) { - throw std::runtime_error("Failed to create CUDA VMM allocation"); + throw std::runtime_error("Failed to create CUDA fabric VMM allocation"); } if (cuMemAddressReserve(&m_ptr, m_size, 0, 0, 0) != CUDA_SUCCESS) { @@ -125,7 +132,7 @@ class cuda_allocator { m_ptr = 0; cuMemRelease(m_alloc_handle); m_alloc_handle = 0; - throw std::runtime_error("Failed to map CUDA VMM memory"); + throw std::runtime_error("Failed to map CUDA fabric VMM memory"); } CUmemAccessDesc access_desc = {}; @@ -142,9 +149,17 @@ class cuda_allocator { } } + void init_regular(size_t size) + { + if (cudaMalloc(&m_cuda_ptr, size) != cudaSuccess) { + throw std::runtime_error("Failed to allocate CUDA memory"); + } + } + size_t m_size; CUdeviceptr m_ptr; CUmemGenericAllocationHandle m_alloc_handle; + void *m_cuda_ptr; }; namespace nixl_ep { From 29c2c7ae79f7bb13f80ad351774c6f30134cbfc5 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Sat, 7 Mar 2026 18:13:13 +0200 Subject: [PATCH 03/29] NIXL/EP: revert --- examples/device/ep/csrc/nixl_ep.hpp | 65 +++++++++++------------------ 1 file changed, 25 insertions(+), 40 deletions(-) diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp index 003c9a317f..c52f2a86f3 100644 --- a/examples/device/ep/csrc/nixl_ep.hpp +++ b/examples/device/ep/csrc/nixl_ep.hpp @@ -36,7 +36,6 @@ #include #include -#include #include #include #include "config.hpp" @@ -50,15 +49,12 @@ #define TORCH_EXTENSION_NAME nixl_ep_cpp #endif -/* CUDA memory allocator. Uses fabric VMM (cuMemCreate) if the device supports - * CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, otherwise falls back to - * cudaMalloc. The cudaMalloc fallback is intentional: VMM memory without a - * fabric handle provides no benefit and is incompatible with GDRCopy, which - * requires traditionally allocated memory for kernel-level pinning. */ +/* CUDA memory allocator using VMM. Uses fabric handle type if the device + * supports it (CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED), otherwise + * falls back to CU_MEM_HANDLE_TYPE_NONE. */ class cuda_allocator { public: - cuda_allocator(size_t size) : m_size(size), m_ptr(0), m_alloc_handle(0), - m_cuda_ptr(nullptr) + cuda_allocator(size_t size) : m_size(0), m_ptr(0), m_alloc_handle(0) { if (size == 0) { throw std::invalid_argument("cuda_allocator: size must be non-zero"); @@ -74,11 +70,21 @@ class cuda_allocator { CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device); - if (fabric_supported) { - init_vmm(size, device); - } else { - init_regular(size); + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; + prop.requestedHandleTypes = fabric_supported ? CU_MEM_HANDLE_TYPE_FABRIC + : CU_MEM_HANDLE_TYPE_NONE; + + size_t granularity = 0; + if (cuMemGetAllocationGranularity(&granularity, &prop, + CU_MEM_ALLOC_GRANULARITY_MINIMUM) != + CUDA_SUCCESS) { + throw std::runtime_error("Failed to get CUDA allocation granularity"); } + + init_vmm(size, device, prop, granularity); } ~cuda_allocator() @@ -86,39 +92,26 @@ class cuda_allocator { if (m_ptr) { cuMemUnmap(m_ptr, m_size); cuMemAddressFree(m_ptr, m_size); + } + if (m_alloc_handle) { cuMemRelease(m_alloc_handle); - } else if (m_cuda_ptr) { - cudaFree(m_cuda_ptr); } } - void* ptr() const { return m_ptr ? reinterpret_cast(m_ptr) - : m_cuda_ptr; } + void* ptr() const { return reinterpret_cast(m_ptr); } size_t size() const { return m_size; } cuda_allocator(const cuda_allocator&) = delete; cuda_allocator& operator=(const cuda_allocator&) = delete; private: - void init_vmm(size_t size, CUdevice device) + void init_vmm(size_t size, CUdevice device, const CUmemAllocationProp &prop, + size_t granularity) { - CUmemAllocationProp prop = {}; - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = device; - prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; - - size_t granularity = 0; - if (cuMemGetAllocationGranularity(&granularity, &prop, - CU_MEM_ALLOC_GRANULARITY_MINIMUM) != - CUDA_SUCCESS) { - throw std::runtime_error("Failed to get CUDA allocation granularity"); - } - m_size = (size + granularity - 1) / granularity * granularity; if (cuMemCreate(&m_alloc_handle, m_size, &prop, 0) != CUDA_SUCCESS) { - throw std::runtime_error("Failed to create CUDA fabric VMM allocation"); + throw std::runtime_error("Failed to create CUDA VMM allocation"); } if (cuMemAddressReserve(&m_ptr, m_size, 0, 0, 0) != CUDA_SUCCESS) { @@ -132,7 +125,7 @@ class cuda_allocator { m_ptr = 0; cuMemRelease(m_alloc_handle); m_alloc_handle = 0; - throw std::runtime_error("Failed to map CUDA fabric VMM memory"); + throw std::runtime_error("Failed to map CUDA VMM memory"); } CUmemAccessDesc access_desc = {}; @@ -149,17 +142,9 @@ class cuda_allocator { } } - void init_regular(size_t size) - { - if (cudaMalloc(&m_cuda_ptr, size) != cudaSuccess) { - throw std::runtime_error("Failed to allocate CUDA memory"); - } - } - size_t m_size; CUdeviceptr m_ptr; CUmemGenericAllocationHandle m_alloc_handle; - void *m_cuda_ptr; }; namespace nixl_ep { From daecca92684a6d58662e7af006d02aba8af72682 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Sun, 8 Mar 2026 08:20:24 +0200 Subject: [PATCH 04/29] NIXL/EP: Support gdr copy with vmm. --- examples/device/ep/csrc/nixl_ep.hpp | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp index c52f2a86f3..98a0993494 100644 --- a/examples/device/ep/csrc/nixl_ep.hpp +++ b/examples/device/ep/csrc/nixl_ep.hpp @@ -36,6 +36,7 @@ #include #include +#include #include #include #include "config.hpp" @@ -49,9 +50,10 @@ #define TORCH_EXTENSION_NAME nixl_ep_cpp #endif -/* CUDA memory allocator using VMM. Uses fabric handle type if the device - * supports it (CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED), otherwise - * falls back to CU_MEM_HANDLE_TYPE_NONE. */ +/* CUDA memory allocator using VMM with gpuDirectRDMACapable. Requires + * CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED. Uses a fabric + * handle type if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is also set, + * otherwise uses CU_MEM_HANDLE_TYPE_NONE. */ class cuda_allocator { public: cuda_allocator(size_t size) : m_size(0), m_ptr(0), m_alloc_handle(0) @@ -65,17 +67,27 @@ class cuda_allocator { throw std::runtime_error("Failed to get CUDA device handle"); } + int rdma_vmm_supported = 0; + cuDeviceGetAttribute(&rdma_vmm_supported, + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, + device); + if (!rdma_vmm_supported) { + throw std::runtime_error("GPUDirect RDMA with CUDA VMM is not supported on this device"); + } + int fabric_supported = 0; cuDeviceGetAttribute(&fabric_supported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device); CUmemAllocationProp prop = {}; - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = device; - prop.requestedHandleTypes = fabric_supported ? CU_MEM_HANDLE_TYPE_FABRIC - : CU_MEM_HANDLE_TYPE_NONE; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; + prop.allocFlags.gpuDirectRDMACapable = 1; + prop.requestedHandleTypes = fabric_supported ? + CU_MEM_HANDLE_TYPE_FABRIC : + CU_MEM_HANDLE_TYPE_NONE; size_t granularity = 0; if (cuMemGetAllocationGranularity(&granularity, &prop, From 3f5f55ff27718d186b2f4b36c698de0f1a71d152 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Sun, 8 Mar 2026 18:17:38 +0200 Subject: [PATCH 05/29] NIXL/EP: Improve. --- examples/device/ep/csrc/nixl_ep.hpp | 50 ++++++++++++++++------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp index 98a0993494..ba9d3742da 100644 --- a/examples/device/ep/csrc/nixl_ep.hpp +++ b/examples/device/ep/csrc/nixl_ep.hpp @@ -50,10 +50,7 @@ #define TORCH_EXTENSION_NAME nixl_ep_cpp #endif -/* CUDA memory allocator using VMM with gpuDirectRDMACapable. Requires - * CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED. Uses a fabric - * handle type if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is also set, - * otherwise uses CU_MEM_HANDLE_TYPE_NONE. */ +/* CUDA memory allocator using VMM. */ class cuda_allocator { public: cuda_allocator(size_t size) : m_size(0), m_ptr(0), m_alloc_handle(0) @@ -105,6 +102,7 @@ class cuda_allocator { cuMemUnmap(m_ptr, m_size); cuMemAddressFree(m_ptr, m_size); } + if (m_alloc_handle) { cuMemRelease(m_alloc_handle); } @@ -120,38 +118,44 @@ class cuda_allocator { void init_vmm(size_t size, CUdevice device, const CUmemAllocationProp &prop, size_t granularity) { - m_size = (size + granularity - 1) / granularity * granularity; + CUmemAccessDesc access_desc = {}; + const char *err_msg; + + m_size = nixl_ep::align_up(size, granularity); if (cuMemCreate(&m_alloc_handle, m_size, &prop, 0) != CUDA_SUCCESS) { throw std::runtime_error("Failed to create CUDA VMM allocation"); } if (cuMemAddressReserve(&m_ptr, m_size, 0, 0, 0) != CUDA_SUCCESS) { - cuMemRelease(m_alloc_handle); - m_alloc_handle = 0; - throw std::runtime_error("Failed to reserve CUDA virtual address"); + err_msg = "Failed to reserve CUDA virtual address"; + goto err_release; } if (cuMemMap(m_ptr, m_size, 0, m_alloc_handle, 0) != CUDA_SUCCESS) { - cuMemAddressFree(m_ptr, m_size); - m_ptr = 0; - cuMemRelease(m_alloc_handle); - m_alloc_handle = 0; - throw std::runtime_error("Failed to map CUDA VMM memory"); + err_msg = "Failed to map CUDA VMM memory"; + goto err_free; } - CUmemAccessDesc access_desc = {}; - access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - access_desc.location.id = device; - access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + access_desc.location.id = device; + access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; if (cuMemSetAccess(m_ptr, m_size, &access_desc, 1) != CUDA_SUCCESS) { - cuMemUnmap(m_ptr, m_size); - cuMemAddressFree(m_ptr, m_size); - m_ptr = 0; - cuMemRelease(m_alloc_handle); - m_alloc_handle = 0; - throw std::runtime_error("Failed to set CUDA memory access"); + err_msg = "Failed to set CUDA memory access"; + goto err_unmap; } + + return; + +err_unmap: + cuMemUnmap(m_ptr, m_size); +err_free: + cuMemAddressFree(m_ptr, m_size); + m_ptr = 0; +err_release: + cuMemRelease(m_alloc_handle); + m_alloc_handle = 0; + throw std::runtime_error(err_msg); } size_t m_size; From 07674eef76c02af4d4649789d28a71f15afb5790 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Sun, 8 Mar 2026 18:43:43 +0200 Subject: [PATCH 06/29] NIXL/EP: Format. --- examples/device/ep/csrc/nixl_ep.cpp | 14 +++---- examples/device/ep/csrc/nixl_ep.hpp | 65 +++++++++++++++-------------- 2 files changed, 41 insertions(+), 38 deletions(-) diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp index 00884602f7..386a66675c 100644 --- a/examples/device/ep/csrc/nixl_ep.cpp +++ b/examples/device/ep/csrc/nixl_ep.cpp @@ -105,26 +105,26 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte // Create 32 MiB workspace m_workspace_alloc = std::make_unique(NUM_WORKSPACE_BYTES); - workspace = m_workspace_alloc->ptr(); + workspace = m_workspace_alloc->ptr(); CUDA_CHECK(cudaMemsetAsync(workspace, 0, NUM_WORKSPACE_BYTES, comm_stream)); EP_HOST_ASSERT(max_experts_per_rank > 0); - m_rdma_alloc = std::make_unique(num_rdma_bytes); + m_rdma_alloc = std::make_unique(num_rdma_bytes); rdma_buffer_ptr = m_rdma_alloc->ptr(); CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes)); // Allocate and clean shrink buffer int num_mask_buffer_bytes = max_num_ranks * sizeof(int); - m_mask_alloc = std::make_unique(num_mask_buffer_bytes); - mask_buffer_ptr = static_cast(m_mask_alloc->ptr()); + m_mask_alloc = std::make_unique(num_mask_buffer_bytes); + mask_buffer_ptr = static_cast(m_mask_alloc->ptr()); CUDA_CHECK(cudaMemset(mask_buffer_ptr, 0xff, num_mask_buffer_bytes)); CUDA_CHECK(cudaMemset(mask_buffer_ptr + rank, 0, sizeof(int))); int num_sync_buffer_bytes = max_num_ranks * sizeof(int); - m_sync_alloc = std::make_unique(num_sync_buffer_bytes); - sync_buffer_ptr = static_cast(m_sync_alloc->ptr()); + m_sync_alloc = std::make_unique(num_sync_buffer_bytes); + sync_buffer_ptr = static_cast(m_sync_alloc->ptr()); m_sync_count_alloc = std::make_unique(num_sync_buffer_bytes); - sync_count_ptr = static_cast(m_sync_count_alloc->ptr()); + sync_count_ptr = static_cast(m_sync_count_alloc->ptr()); CUDA_CHECK(cudaMemset(sync_buffer_ptr, 0, num_sync_buffer_bytes)); CUDA_CHECK(cudaMemset(sync_count_ptr, 0, num_sync_buffer_bytes)); CUDA_CHECK(cudaDeviceSynchronize()); diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp index ba9d3742da..f811bd3ff6 100644 --- a/examples/device/ep/csrc/nixl_ep.hpp +++ b/examples/device/ep/csrc/nixl_ep.hpp @@ -53,8 +53,7 @@ /* CUDA memory allocator using VMM. */ class cuda_allocator { public: - cuda_allocator(size_t size) : m_size(0), m_ptr(0), m_alloc_handle(0) - { + cuda_allocator(size_t size) : m_size(0), m_ptr(0), m_alloc_handle(0) { if (size == 0) { throw std::invalid_argument("cuda_allocator: size must be non-zero"); } @@ -69,26 +68,24 @@ class cuda_allocator { CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, device); if (!rdma_vmm_supported) { - throw std::runtime_error("GPUDirect RDMA with CUDA VMM is not supported on this device"); + throw std::runtime_error( + "GPUDirect RDMA with CUDA VMM is not supported on this device"); } int fabric_supported = 0; - cuDeviceGetAttribute(&fabric_supported, - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, - device); + cuDeviceGetAttribute( + &fabric_supported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device); CUmemAllocationProp prop = {}; - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = device; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; prop.allocFlags.gpuDirectRDMACapable = 1; - prop.requestedHandleTypes = fabric_supported ? - CU_MEM_HANDLE_TYPE_FABRIC : - CU_MEM_HANDLE_TYPE_NONE; + prop.requestedHandleTypes = + fabric_supported ? CU_MEM_HANDLE_TYPE_FABRIC : CU_MEM_HANDLE_TYPE_NONE; size_t granularity = 0; - if (cuMemGetAllocationGranularity(&granularity, &prop, - CU_MEM_ALLOC_GRANULARITY_MINIMUM) != + if (cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) { throw std::runtime_error("Failed to get CUDA allocation granularity"); } @@ -96,8 +93,7 @@ class cuda_allocator { init_vmm(size, device, prop, granularity); } - ~cuda_allocator() - { + ~cuda_allocator() { if (m_ptr) { cuMemUnmap(m_ptr, m_size); cuMemAddressFree(m_ptr, m_size); @@ -108,16 +104,23 @@ class cuda_allocator { } } - void* ptr() const { return reinterpret_cast(m_ptr); } - size_t size() const { return m_size; } + void * + ptr() const { + return reinterpret_cast(m_ptr); + } + + size_t + size() const { + return m_size; + } - cuda_allocator(const cuda_allocator&) = delete; - cuda_allocator& operator=(const cuda_allocator&) = delete; + cuda_allocator(const cuda_allocator &) = delete; + cuda_allocator & + operator=(const cuda_allocator &) = delete; private: - void init_vmm(size_t size, CUdevice device, const CUmemAllocationProp &prop, - size_t granularity) - { + void + init_vmm(size_t size, CUdevice device, const CUmemAllocationProp &prop, size_t granularity) { CUmemAccessDesc access_desc = {}; const char *err_msg; @@ -138,8 +141,8 @@ class cuda_allocator { } access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - access_desc.location.id = device; - access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + access_desc.location.id = device; + access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; if (cuMemSetAccess(m_ptr, m_size, &access_desc, 1) != CUDA_SUCCESS) { err_msg = "Failed to set CUDA memory access"; goto err_unmap; @@ -147,20 +150,20 @@ class cuda_allocator { return; -err_unmap: + err_unmap: cuMemUnmap(m_ptr, m_size); -err_free: + err_free: cuMemAddressFree(m_ptr, m_size); m_ptr = 0; -err_release: + err_release: cuMemRelease(m_alloc_handle); m_alloc_handle = 0; throw std::runtime_error(err_msg); } - size_t m_size; - CUdeviceptr m_ptr; - CUmemGenericAllocationHandle m_alloc_handle; + size_t m_size; + CUdeviceptr m_ptr; + CUmemGenericAllocationHandle m_alloc_handle; }; namespace nixl_ep { From bbb8d277d08c0819b6366c998ff26ea59f34e468 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Sun, 8 Mar 2026 22:56:20 +0200 Subject: [PATCH 07/29] NIXL/EP: check return val. --- examples/device/ep/csrc/nixl_ep.hpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp index f811bd3ff6..46d66bdec3 100644 --- a/examples/device/ep/csrc/nixl_ep.hpp +++ b/examples/device/ep/csrc/nixl_ep.hpp @@ -64,17 +64,24 @@ class cuda_allocator { } int rdma_vmm_supported = 0; - cuDeviceGetAttribute(&rdma_vmm_supported, - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, - device); + if (cuDeviceGetAttribute(&rdma_vmm_supported, + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, + device) != CUDA_SUCCESS) { + throw std::runtime_error( + "Failed to query GPUDirect RDMA with VMM support attribute"); + } if (!rdma_vmm_supported) { throw std::runtime_error( "GPUDirect RDMA with CUDA VMM is not supported on this device"); } int fabric_supported = 0; - cuDeviceGetAttribute( - &fabric_supported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device); + if (cuDeviceGetAttribute(&fabric_supported, + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, + device) != CUDA_SUCCESS) { + throw std::runtime_error( + "Failed to query fabric handle type support attribute"); + } CUmemAllocationProp prop = {}; prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; From 7b60106d5480edc80689699ef9a5a311c958aec0 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Sun, 8 Mar 2026 22:57:52 +0200 Subject: [PATCH 08/29] NIXL/EP: Format. --- examples/device/ep/csrc/nixl_ep.hpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp index 46d66bdec3..a8551d4384 100644 --- a/examples/device/ep/csrc/nixl_ep.hpp +++ b/examples/device/ep/csrc/nixl_ep.hpp @@ -67,9 +67,9 @@ class cuda_allocator { if (cuDeviceGetAttribute(&rdma_vmm_supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, device) != CUDA_SUCCESS) { - throw std::runtime_error( - "Failed to query GPUDirect RDMA with VMM support attribute"); + throw std::runtime_error("Failed to query GPUDirect RDMA with VMM support attribute"); } + if (!rdma_vmm_supported) { throw std::runtime_error( "GPUDirect RDMA with CUDA VMM is not supported on this device"); @@ -79,8 +79,7 @@ class cuda_allocator { if (cuDeviceGetAttribute(&fabric_supported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device) != CUDA_SUCCESS) { - throw std::runtime_error( - "Failed to query fabric handle type support attribute"); + throw std::runtime_error("Failed to query fabric handle type support attribute"); } CUmemAllocationProp prop = {}; From 25b2e5bb6e394848b60a3bf226ca331b55c9c0c0 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Mon, 9 Mar 2026 18:28:40 +0200 Subject: [PATCH 09/29] NIXL/EP:Improve. --- examples/device/ep/csrc/nixl_ep.cpp | 149 +++++++++++++++++++++++++--- examples/device/ep/csrc/nixl_ep.hpp | 136 ++----------------------- 2 files changed, 144 insertions(+), 141 deletions(-) diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp index 386a66675c..8104cbe815 100644 --- a/examples/device/ep/csrc/nixl_ep.cpp +++ b/examples/device/ep/csrc/nixl_ep.cpp @@ -61,6 +61,125 @@ namespace nixl_ep { +vmm_region vmm_init(size_t size) +{ + if (size == 0) { + throw std::invalid_argument("vmm_init: size must be non-zero"); + } + + struct cuda_alloc_ctx { + CUdevice device; + CUmemAllocationProp prop; + size_t granularity; + + cuda_alloc_ctx() : device(0), prop({}), granularity(0) + { + int version; + if (cuDriverGetVersion(&version) != CUDA_SUCCESS) { + throw std::runtime_error("Failed to get CUDA driver version"); + } + if (version < 11000) { + throw std::runtime_error( + "VMM with RDMA is not supported in this CUDA version"); + } + + if (cuCtxGetDevice(&device) != CUDA_SUCCESS) { + throw std::runtime_error("Failed to get CUDA device handle"); + } + + int rdma_vmm_supported = 0; + if (cuDeviceGetAttribute( + &rdma_vmm_supported, + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, + device) != CUDA_SUCCESS) { + throw std::runtime_error( + "Failed to query GPUDirect RDMA with VMM support attribute"); + } + if (!rdma_vmm_supported) { + throw std::runtime_error( + "GPUDirect RDMA with CUDA VMM is not supported on this device"); + } + + int fabric_supported = 0; + if (cuDeviceGetAttribute(&fabric_supported, + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, + device) != CUDA_SUCCESS) { + throw std::runtime_error( + "Failed to query fabric handle type support attribute"); + } + + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; + prop.allocFlags.gpuDirectRDMACapable = 1; + prop.requestedHandleTypes = fabric_supported ? + CU_MEM_HANDLE_TYPE_FABRIC : + CU_MEM_HANDLE_TYPE_NONE; + + if (cuMemGetAllocationGranularity(&granularity, &prop, + CU_MEM_ALLOC_GRANULARITY_MINIMUM) != + CUDA_SUCCESS) { + throw std::runtime_error( + "Failed to get CUDA allocation granularity"); + } + } + }; + static cuda_alloc_ctx ctx; + + vmm_region region = {}; + CUmemAccessDesc access_desc = {}; + const char *err_msg; + + region.size = align_up(size, ctx.granularity); + + if (cuMemCreate(®ion.handle, region.size, &ctx.prop, 0) != CUDA_SUCCESS) { + throw std::runtime_error("Failed to create CUDA VMM allocation"); + } + + if (cuMemAddressReserve(®ion.ptr, region.size, 0, 0, 0) != CUDA_SUCCESS) { + err_msg = "Failed to reserve CUDA virtual address"; + goto err_release; + } + + if (cuMemMap(region.ptr, region.size, 0, region.handle, 0) != CUDA_SUCCESS) { + err_msg = "Failed to map CUDA VMM memory"; + goto err_free; + } + + access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + access_desc.location.id = ctx.device; + access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + if (cuMemSetAccess(region.ptr, region.size, &access_desc, 1) != CUDA_SUCCESS) { + err_msg = "Failed to set CUDA memory access"; + goto err_unmap; + } + + return region; + +err_unmap: + cuMemUnmap(region.ptr, region.size); +err_free: + cuMemAddressFree(region.ptr, region.size); + region.ptr = 0; +err_release: + cuMemRelease(region.handle); + region.handle = 0; + throw std::runtime_error(err_msg); +} + +void vmm_free(vmm_region ®ion) +{ + if (region.ptr) { + cuMemUnmap(region.ptr, region.size); + cuMemAddressFree(region.ptr, region.size); + region.ptr = 0; + } + if (region.handle) { + cuMemRelease(region.handle); + region.handle = 0; + } +} + static void sleep_ms(int milliseconds) { std::this_thread::sleep_for(std::chrono::milliseconds(milliseconds)); } @@ -104,27 +223,27 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte EP_HOST_ASSERT(per_channel_bytes < std::numeric_limits::max()); // Create 32 MiB workspace - m_workspace_alloc = std::make_unique(NUM_WORKSPACE_BYTES); - workspace = m_workspace_alloc->ptr(); + m_workspace_alloc = vmm_init(NUM_WORKSPACE_BYTES); + workspace = reinterpret_cast(m_workspace_alloc.ptr); CUDA_CHECK(cudaMemsetAsync(workspace, 0, NUM_WORKSPACE_BYTES, comm_stream)); EP_HOST_ASSERT(max_experts_per_rank > 0); - m_rdma_alloc = std::make_unique(num_rdma_bytes); - rdma_buffer_ptr = m_rdma_alloc->ptr(); + m_rdma_alloc = vmm_init(num_rdma_bytes); + rdma_buffer_ptr = reinterpret_cast(m_rdma_alloc.ptr); CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes)); // Allocate and clean shrink buffer int num_mask_buffer_bytes = max_num_ranks * sizeof(int); - m_mask_alloc = std::make_unique(num_mask_buffer_bytes); - mask_buffer_ptr = static_cast(m_mask_alloc->ptr()); + m_mask_alloc = vmm_init(num_mask_buffer_bytes); + mask_buffer_ptr = reinterpret_cast(m_mask_alloc.ptr); CUDA_CHECK(cudaMemset(mask_buffer_ptr, 0xff, num_mask_buffer_bytes)); CUDA_CHECK(cudaMemset(mask_buffer_ptr + rank, 0, sizeof(int))); int num_sync_buffer_bytes = max_num_ranks * sizeof(int); - m_sync_alloc = std::make_unique(num_sync_buffer_bytes); - sync_buffer_ptr = static_cast(m_sync_alloc->ptr()); - m_sync_count_alloc = std::make_unique(num_sync_buffer_bytes); - sync_count_ptr = static_cast(m_sync_count_alloc->ptr()); + m_sync_alloc = vmm_init(num_sync_buffer_bytes); + sync_buffer_ptr = reinterpret_cast(m_sync_alloc.ptr); + m_sync_count_alloc = vmm_init(num_sync_buffer_bytes); + sync_count_ptr = reinterpret_cast(m_sync_count_alloc.ptr); CUDA_CHECK(cudaMemset(sync_buffer_ptr, 0, num_sync_buffer_bytes)); CUDA_CHECK(cudaMemset(sync_count_ptr, 0, num_sync_buffer_bytes)); CUDA_CHECK(cudaDeviceSynchronize()); @@ -178,21 +297,21 @@ void Buffer::destroy() { _nixl_ep_destroy(); - m_rdma_alloc.reset(); + vmm_free(m_rdma_alloc); rdma_buffer_ptr = nullptr; if (nixl_agent_info and nixl_agent_info->agent != nullptr and getenv("NIXL_ETCD_ENDPOINTS")) { nixl_agent_info->agent->invalidateLocalMD(); } - m_mask_alloc.reset(); + vmm_free(m_mask_alloc); mask_buffer_ptr = nullptr; - m_sync_alloc.reset(); + vmm_free(m_sync_alloc); sync_buffer_ptr = nullptr; - m_sync_count_alloc.reset(); + vmm_free(m_sync_count_alloc); sync_count_ptr = nullptr; - m_workspace_alloc.reset(); + vmm_free(m_workspace_alloc); workspace = nullptr; destroyed = true; diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp index a8551d4384..083c0027eb 100644 --- a/examples/device/ep/csrc/nixl_ep.hpp +++ b/examples/device/ep/csrc/nixl_ep.hpp @@ -50,126 +50,10 @@ #define TORCH_EXTENSION_NAME nixl_ep_cpp #endif -/* CUDA memory allocator using VMM. */ -class cuda_allocator { -public: - cuda_allocator(size_t size) : m_size(0), m_ptr(0), m_alloc_handle(0) { - if (size == 0) { - throw std::invalid_argument("cuda_allocator: size must be non-zero"); - } - - CUdevice device; - if (cuCtxGetDevice(&device) != CUDA_SUCCESS) { - throw std::runtime_error("Failed to get CUDA device handle"); - } - - int rdma_vmm_supported = 0; - if (cuDeviceGetAttribute(&rdma_vmm_supported, - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, - device) != CUDA_SUCCESS) { - throw std::runtime_error("Failed to query GPUDirect RDMA with VMM support attribute"); - } - - if (!rdma_vmm_supported) { - throw std::runtime_error( - "GPUDirect RDMA with CUDA VMM is not supported on this device"); - } - - int fabric_supported = 0; - if (cuDeviceGetAttribute(&fabric_supported, - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, - device) != CUDA_SUCCESS) { - throw std::runtime_error("Failed to query fabric handle type support attribute"); - } - - CUmemAllocationProp prop = {}; - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = device; - prop.allocFlags.gpuDirectRDMACapable = 1; - prop.requestedHandleTypes = - fabric_supported ? CU_MEM_HANDLE_TYPE_FABRIC : CU_MEM_HANDLE_TYPE_NONE; - - size_t granularity = 0; - if (cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) != - CUDA_SUCCESS) { - throw std::runtime_error("Failed to get CUDA allocation granularity"); - } - - init_vmm(size, device, prop, granularity); - } - - ~cuda_allocator() { - if (m_ptr) { - cuMemUnmap(m_ptr, m_size); - cuMemAddressFree(m_ptr, m_size); - } - - if (m_alloc_handle) { - cuMemRelease(m_alloc_handle); - } - } - - void * - ptr() const { - return reinterpret_cast(m_ptr); - } - - size_t - size() const { - return m_size; - } - - cuda_allocator(const cuda_allocator &) = delete; - cuda_allocator & - operator=(const cuda_allocator &) = delete; - -private: - void - init_vmm(size_t size, CUdevice device, const CUmemAllocationProp &prop, size_t granularity) { - CUmemAccessDesc access_desc = {}; - const char *err_msg; - - m_size = nixl_ep::align_up(size, granularity); - - if (cuMemCreate(&m_alloc_handle, m_size, &prop, 0) != CUDA_SUCCESS) { - throw std::runtime_error("Failed to create CUDA VMM allocation"); - } - - if (cuMemAddressReserve(&m_ptr, m_size, 0, 0, 0) != CUDA_SUCCESS) { - err_msg = "Failed to reserve CUDA virtual address"; - goto err_release; - } - - if (cuMemMap(m_ptr, m_size, 0, m_alloc_handle, 0) != CUDA_SUCCESS) { - err_msg = "Failed to map CUDA VMM memory"; - goto err_free; - } - - access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - access_desc.location.id = device; - access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - if (cuMemSetAccess(m_ptr, m_size, &access_desc, 1) != CUDA_SUCCESS) { - err_msg = "Failed to set CUDA memory access"; - goto err_unmap; - } - - return; - - err_unmap: - cuMemUnmap(m_ptr, m_size); - err_free: - cuMemAddressFree(m_ptr, m_size); - m_ptr = 0; - err_release: - cuMemRelease(m_alloc_handle); - m_alloc_handle = 0; - throw std::runtime_error(err_msg); - } - - size_t m_size; - CUdeviceptr m_ptr; - CUmemGenericAllocationHandle m_alloc_handle; +struct vmm_region { + CUdeviceptr ptr; + size_t size; + CUmemGenericAllocationHandle handle; }; namespace nixl_ep { @@ -208,12 +92,12 @@ struct Buffer { int *sync_buffer_ptr = nullptr; int *sync_count_ptr = nullptr; - // Owning allocators (keep raw ptrs above as aliases for use throughout) - std::unique_ptr m_rdma_alloc; - std::unique_ptr m_mask_alloc; - std::unique_ptr m_sync_alloc; - std::unique_ptr m_sync_count_alloc; - std::unique_ptr m_workspace_alloc; + /* Owning VMM allocations (keep raw ptrs above as aliases) */ + vmm_region m_rdma_alloc; + vmm_region m_mask_alloc; + vmm_region m_sync_alloc; + vmm_region m_sync_count_alloc; + vmm_region m_workspace_alloc; // Device info and communication int device_id; From 02e379b0c6f9dae2fe575820241396669b1a0152 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Mon, 9 Mar 2026 18:46:08 +0200 Subject: [PATCH 10/29] NIXL/EP: Format. --- examples/device/ep/csrc/nixl_ep.cpp | 68 +++++++++++++---------------- examples/device/ep/csrc/nixl_ep.hpp | 4 +- 2 files changed, 33 insertions(+), 39 deletions(-) diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp index 8104cbe815..942692f30f 100644 --- a/examples/device/ep/csrc/nixl_ep.cpp +++ b/examples/device/ep/csrc/nixl_ep.cpp @@ -61,26 +61,24 @@ namespace nixl_ep { -vmm_region vmm_init(size_t size) -{ +vmm_region +vmm_init(size_t size) { if (size == 0) { throw std::invalid_argument("vmm_init: size must be non-zero"); } struct cuda_alloc_ctx { - CUdevice device; + CUdevice device; CUmemAllocationProp prop; - size_t granularity; + size_t granularity; - cuda_alloc_ctx() : device(0), prop({}), granularity(0) - { + cuda_alloc_ctx() : device(0), prop({}), granularity(0) { int version; if (cuDriverGetVersion(&version) != CUDA_SUCCESS) { throw std::runtime_error("Failed to get CUDA driver version"); } if (version < 11000) { - throw std::runtime_error( - "VMM with RDMA is not supported in this CUDA version"); + throw std::runtime_error("VMM with RDMA is not supported in this CUDA version"); } if (cuCtxGetDevice(&device) != CUDA_SUCCESS) { @@ -88,10 +86,9 @@ vmm_region vmm_init(size_t size) } int rdma_vmm_supported = 0; - if (cuDeviceGetAttribute( - &rdma_vmm_supported, - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, - device) != CUDA_SUCCESS) { + if (cuDeviceGetAttribute(&rdma_vmm_supported, + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, + device) != CUDA_SUCCESS) { throw std::runtime_error( "Failed to query GPUDirect RDMA with VMM support attribute"); } @@ -104,31 +101,28 @@ vmm_region vmm_init(size_t size) if (cuDeviceGetAttribute(&fabric_supported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device) != CUDA_SUCCESS) { - throw std::runtime_error( - "Failed to query fabric handle type support attribute"); + throw std::runtime_error("Failed to query fabric handle type support attribute"); } - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = device; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; prop.allocFlags.gpuDirectRDMACapable = 1; - prop.requestedHandleTypes = fabric_supported ? - CU_MEM_HANDLE_TYPE_FABRIC : - CU_MEM_HANDLE_TYPE_NONE; + prop.requestedHandleTypes = + fabric_supported ? CU_MEM_HANDLE_TYPE_FABRIC : CU_MEM_HANDLE_TYPE_NONE; - if (cuMemGetAllocationGranularity(&granularity, &prop, - CU_MEM_ALLOC_GRANULARITY_MINIMUM) != - CUDA_SUCCESS) { - throw std::runtime_error( - "Failed to get CUDA allocation granularity"); + if (cuMemGetAllocationGranularity( + &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) { + throw std::runtime_error("Failed to get CUDA allocation granularity"); } } }; + static cuda_alloc_ctx ctx; - vmm_region region = {}; + vmm_region region = {}; CUmemAccessDesc access_desc = {}; - const char *err_msg; + const char *err_msg; region.size = align_up(size, ctx.granularity); @@ -147,8 +141,8 @@ vmm_region vmm_init(size_t size) } access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - access_desc.location.id = ctx.device; - access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + access_desc.location.id = ctx.device; + access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; if (cuMemSetAccess(region.ptr, region.size, &access_desc, 1) != CUDA_SUCCESS) { err_msg = "Failed to set CUDA memory access"; goto err_unmap; @@ -167,8 +161,8 @@ vmm_region vmm_init(size_t size) throw std::runtime_error(err_msg); } -void vmm_free(vmm_region ®ion) -{ +void +vmm_free(vmm_region ®ion) { if (region.ptr) { cuMemUnmap(region.ptr, region.size); cuMemAddressFree(region.ptr, region.size); @@ -224,26 +218,26 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte // Create 32 MiB workspace m_workspace_alloc = vmm_init(NUM_WORKSPACE_BYTES); - workspace = reinterpret_cast(m_workspace_alloc.ptr); + workspace = reinterpret_cast(m_workspace_alloc.ptr); CUDA_CHECK(cudaMemsetAsync(workspace, 0, NUM_WORKSPACE_BYTES, comm_stream)); EP_HOST_ASSERT(max_experts_per_rank > 0); - m_rdma_alloc = vmm_init(num_rdma_bytes); + m_rdma_alloc = vmm_init(num_rdma_bytes); rdma_buffer_ptr = reinterpret_cast(m_rdma_alloc.ptr); CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes)); // Allocate and clean shrink buffer int num_mask_buffer_bytes = max_num_ranks * sizeof(int); - m_mask_alloc = vmm_init(num_mask_buffer_bytes); + m_mask_alloc = vmm_init(num_mask_buffer_bytes); mask_buffer_ptr = reinterpret_cast(m_mask_alloc.ptr); CUDA_CHECK(cudaMemset(mask_buffer_ptr, 0xff, num_mask_buffer_bytes)); CUDA_CHECK(cudaMemset(mask_buffer_ptr + rank, 0, sizeof(int))); int num_sync_buffer_bytes = max_num_ranks * sizeof(int); - m_sync_alloc = vmm_init(num_sync_buffer_bytes); - sync_buffer_ptr = reinterpret_cast(m_sync_alloc.ptr); + m_sync_alloc = vmm_init(num_sync_buffer_bytes); + sync_buffer_ptr = reinterpret_cast(m_sync_alloc.ptr); m_sync_count_alloc = vmm_init(num_sync_buffer_bytes); - sync_count_ptr = reinterpret_cast(m_sync_count_alloc.ptr); + sync_count_ptr = reinterpret_cast(m_sync_count_alloc.ptr); CUDA_CHECK(cudaMemset(sync_buffer_ptr, 0, num_sync_buffer_bytes)); CUDA_CHECK(cudaMemset(sync_count_ptr, 0, num_sync_buffer_bytes)); CUDA_CHECK(cudaDeviceSynchronize()); diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp index 083c0027eb..f411a89ca2 100644 --- a/examples/device/ep/csrc/nixl_ep.hpp +++ b/examples/device/ep/csrc/nixl_ep.hpp @@ -51,8 +51,8 @@ #endif struct vmm_region { - CUdeviceptr ptr; - size_t size; + CUdeviceptr ptr; + size_t size; CUmemGenericAllocationHandle handle; }; From 2c701be7f97520bd2c4b94665c5493d12213feef Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Mon, 9 Mar 2026 18:51:47 +0200 Subject: [PATCH 11/29] NIXL/EP: Improve. --- examples/device/ep/csrc/nixl_ep.cpp | 59 +++++++++++++++-------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp index 942692f30f..1f4fe13f03 100644 --- a/examples/device/ep/csrc/nixl_ep.cpp +++ b/examples/device/ep/csrc/nixl_ep.cpp @@ -62,33 +62,31 @@ namespace nixl_ep { vmm_region -vmm_init(size_t size) { +vmm_init(size_t size, CUdevice device) { if (size == 0) { throw std::invalid_argument("vmm_init: size must be non-zero"); } struct cuda_alloc_ctx { - CUdevice device; CUmemAllocationProp prop; - size_t granularity; + size_t granularity; - cuda_alloc_ctx() : device(0), prop({}), granularity(0) { + cuda_alloc_ctx(CUdevice dev) : prop({}), granularity(0) + { int version; if (cuDriverGetVersion(&version) != CUDA_SUCCESS) { throw std::runtime_error("Failed to get CUDA driver version"); } if (version < 11000) { - throw std::runtime_error("VMM with RDMA is not supported in this CUDA version"); - } - - if (cuCtxGetDevice(&device) != CUDA_SUCCESS) { - throw std::runtime_error("Failed to get CUDA device handle"); + throw std::runtime_error( + "VMM with RDMA is not supported in this CUDA version"); } int rdma_vmm_supported = 0; - if (cuDeviceGetAttribute(&rdma_vmm_supported, - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, - device) != CUDA_SUCCESS) { + if (cuDeviceGetAttribute( + &rdma_vmm_supported, + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, + dev) != CUDA_SUCCESS) { throw std::runtime_error( "Failed to query GPUDirect RDMA with VMM support attribute"); } @@ -100,25 +98,28 @@ vmm_init(size_t size) { int fabric_supported = 0; if (cuDeviceGetAttribute(&fabric_supported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, - device) != CUDA_SUCCESS) { - throw std::runtime_error("Failed to query fabric handle type support attribute"); + dev) != CUDA_SUCCESS) { + throw std::runtime_error( + "Failed to query fabric handle type support attribute"); } - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = device; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = dev; prop.allocFlags.gpuDirectRDMACapable = 1; - prop.requestedHandleTypes = - fabric_supported ? CU_MEM_HANDLE_TYPE_FABRIC : CU_MEM_HANDLE_TYPE_NONE; + prop.requestedHandleTypes = fabric_supported ? + CU_MEM_HANDLE_TYPE_FABRIC : + CU_MEM_HANDLE_TYPE_NONE; - if (cuMemGetAllocationGranularity( - &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) { + if (cuMemGetAllocationGranularity(&granularity, &prop, + CU_MEM_ALLOC_GRANULARITY_MINIMUM) != + CUDA_SUCCESS) { throw std::runtime_error("Failed to get CUDA allocation granularity"); } } }; - static cuda_alloc_ctx ctx; + static cuda_alloc_ctx ctx(device); vmm_region region = {}; CUmemAccessDesc access_desc = {}; @@ -141,8 +142,8 @@ vmm_init(size_t size) { } access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - access_desc.location.id = ctx.device; - access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + access_desc.location.id = device; + access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; if (cuMemSetAccess(region.ptr, region.size, &access_desc, 1) != CUDA_SUCCESS) { err_msg = "Failed to set CUDA memory access"; goto err_unmap; @@ -217,26 +218,26 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte EP_HOST_ASSERT(per_channel_bytes < std::numeric_limits::max()); // Create 32 MiB workspace - m_workspace_alloc = vmm_init(NUM_WORKSPACE_BYTES); + m_workspace_alloc = vmm_init(NUM_WORKSPACE_BYTES, device_id); workspace = reinterpret_cast(m_workspace_alloc.ptr); CUDA_CHECK(cudaMemsetAsync(workspace, 0, NUM_WORKSPACE_BYTES, comm_stream)); EP_HOST_ASSERT(max_experts_per_rank > 0); - m_rdma_alloc = vmm_init(num_rdma_bytes); + m_rdma_alloc = vmm_init(num_rdma_bytes, device_id); rdma_buffer_ptr = reinterpret_cast(m_rdma_alloc.ptr); CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes)); // Allocate and clean shrink buffer int num_mask_buffer_bytes = max_num_ranks * sizeof(int); - m_mask_alloc = vmm_init(num_mask_buffer_bytes); + m_mask_alloc = vmm_init(num_mask_buffer_bytes, device_id); mask_buffer_ptr = reinterpret_cast(m_mask_alloc.ptr); CUDA_CHECK(cudaMemset(mask_buffer_ptr, 0xff, num_mask_buffer_bytes)); CUDA_CHECK(cudaMemset(mask_buffer_ptr + rank, 0, sizeof(int))); int num_sync_buffer_bytes = max_num_ranks * sizeof(int); - m_sync_alloc = vmm_init(num_sync_buffer_bytes); + m_sync_alloc = vmm_init(num_sync_buffer_bytes, device_id); sync_buffer_ptr = reinterpret_cast(m_sync_alloc.ptr); - m_sync_count_alloc = vmm_init(num_sync_buffer_bytes); + m_sync_count_alloc = vmm_init(num_sync_buffer_bytes, device_id); sync_count_ptr = reinterpret_cast(m_sync_count_alloc.ptr); CUDA_CHECK(cudaMemset(sync_buffer_ptr, 0, num_sync_buffer_bytes)); CUDA_CHECK(cudaMemset(sync_count_ptr, 0, num_sync_buffer_bytes)); From 329fb22b319d4fb9fea9754eb42d248c644336d7 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Mon, 9 Mar 2026 18:52:06 +0200 Subject: [PATCH 12/29] NIXL/EP: Format. --- examples/device/ep/csrc/nixl_ep.cpp | 38 ++++++++++++----------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp index 1f4fe13f03..8d4e754398 100644 --- a/examples/device/ep/csrc/nixl_ep.cpp +++ b/examples/device/ep/csrc/nixl_ep.cpp @@ -69,24 +69,21 @@ vmm_init(size_t size, CUdevice device) { struct cuda_alloc_ctx { CUmemAllocationProp prop; - size_t granularity; + size_t granularity; - cuda_alloc_ctx(CUdevice dev) : prop({}), granularity(0) - { + cuda_alloc_ctx(CUdevice dev) : prop({}), granularity(0) { int version; if (cuDriverGetVersion(&version) != CUDA_SUCCESS) { throw std::runtime_error("Failed to get CUDA driver version"); } if (version < 11000) { - throw std::runtime_error( - "VMM with RDMA is not supported in this CUDA version"); + throw std::runtime_error("VMM with RDMA is not supported in this CUDA version"); } int rdma_vmm_supported = 0; - if (cuDeviceGetAttribute( - &rdma_vmm_supported, - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, - dev) != CUDA_SUCCESS) { + if (cuDeviceGetAttribute(&rdma_vmm_supported, + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, + dev) != CUDA_SUCCESS) { throw std::runtime_error( "Failed to query GPUDirect RDMA with VMM support attribute"); } @@ -99,21 +96,18 @@ vmm_init(size_t size, CUdevice device) { if (cuDeviceGetAttribute(&fabric_supported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, dev) != CUDA_SUCCESS) { - throw std::runtime_error( - "Failed to query fabric handle type support attribute"); + throw std::runtime_error("Failed to query fabric handle type support attribute"); } - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = dev; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = dev; prop.allocFlags.gpuDirectRDMACapable = 1; - prop.requestedHandleTypes = fabric_supported ? - CU_MEM_HANDLE_TYPE_FABRIC : - CU_MEM_HANDLE_TYPE_NONE; + prop.requestedHandleTypes = + fabric_supported ? CU_MEM_HANDLE_TYPE_FABRIC : CU_MEM_HANDLE_TYPE_NONE; - if (cuMemGetAllocationGranularity(&granularity, &prop, - CU_MEM_ALLOC_GRANULARITY_MINIMUM) != - CUDA_SUCCESS) { + if (cuMemGetAllocationGranularity( + &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) { throw std::runtime_error("Failed to get CUDA allocation granularity"); } } @@ -142,8 +136,8 @@ vmm_init(size_t size, CUdevice device) { } access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - access_desc.location.id = device; - access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + access_desc.location.id = device; + access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; if (cuMemSetAccess(region.ptr, region.size, &access_desc, 1) != CUDA_SUCCESS) { err_msg = "Failed to set CUDA memory access"; goto err_unmap; From ee2fd647e9667f761a519510c36186ac46f5fe83 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Wed, 11 Mar 2026 11:14:32 +0200 Subject: [PATCH 13/29] NIXL/EP: fallback to cudaMalloc if fabric not supported --- examples/device/ep/csrc/nixl_ep.cpp | 49 ++++++++++++++++++++--------- examples/device/ep/csrc/nixl_ep.hpp | 1 + 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp index 8d4e754398..be8da2079a 100644 --- a/examples/device/ep/csrc/nixl_ep.cpp +++ b/examples/device/ep/csrc/nixl_ep.cpp @@ -68,16 +68,26 @@ vmm_init(size_t size, CUdevice device) { } struct cuda_alloc_ctx { + bool fabric_supported; CUmemAllocationProp prop; size_t granularity; - cuda_alloc_ctx(CUdevice dev) : prop({}), granularity(0) { + cuda_alloc_ctx(CUdevice dev) : fabric_supported(false), prop({}), granularity(0) { int version; if (cuDriverGetVersion(&version) != CUDA_SUCCESS) { throw std::runtime_error("Failed to get CUDA driver version"); } + if (version < 11000) { - throw std::runtime_error("VMM with RDMA is not supported in this CUDA version"); + return; /* too old — fall back to cudaMalloc */ + } + + int fab = 0; + if ((cuDeviceGetAttribute(&fab, + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, + dev) != CUDA_SUCCESS) || + (!fab)) { + return; /* no fabric — fall back to cudaMalloc */ } int rdma_vmm_supported = 0; @@ -87,34 +97,39 @@ vmm_init(size_t size, CUdevice device) { throw std::runtime_error( "Failed to query GPUDirect RDMA with VMM support attribute"); } + if (!rdma_vmm_supported) { throw std::runtime_error( "GPUDirect RDMA with CUDA VMM is not supported on this device"); } - int fabric_supported = 0; - if (cuDeviceGetAttribute(&fabric_supported, - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, - dev) != CUDA_SUCCESS) { - throw std::runtime_error("Failed to query fabric handle type support attribute"); - } - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; prop.location.id = dev; prop.allocFlags.gpuDirectRDMACapable = 1; - prop.requestedHandleTypes = - fabric_supported ? CU_MEM_HANDLE_TYPE_FABRIC : CU_MEM_HANDLE_TYPE_NONE; + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; if (cuMemGetAllocationGranularity( &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) { throw std::runtime_error("Failed to get CUDA allocation granularity"); } + + fabric_supported = true; } }; static cuda_alloc_ctx ctx(device); + if (!ctx.fabric_supported) { + vmm_region region = {}; + region.size = size; + region.is_cuda_malloc = true; + if (cudaMalloc(reinterpret_cast(®ion.ptr), size) != cudaSuccess) { + throw std::runtime_error("cudaMalloc fallback failed (fabric not supported)"); + } + return region; + } + vmm_region region = {}; CUmemAccessDesc access_desc = {}; const char *err_msg; @@ -158,11 +173,17 @@ vmm_init(size_t size, CUdevice device) { void vmm_free(vmm_region ®ion) { - if (region.ptr) { - cuMemUnmap(region.ptr, region.size); - cuMemAddressFree(region.ptr, region.size); + if (!region.ptr) { + return; + } + if (region.is_cuda_malloc) { + cudaFree(reinterpret_cast(region.ptr)); region.ptr = 0; + return; } + cuMemUnmap(region.ptr, region.size); + cuMemAddressFree(region.ptr, region.size); + region.ptr = 0; if (region.handle) { cuMemRelease(region.handle); region.handle = 0; diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp index f411a89ca2..54b0207f8d 100644 --- a/examples/device/ep/csrc/nixl_ep.hpp +++ b/examples/device/ep/csrc/nixl_ep.hpp @@ -54,6 +54,7 @@ struct vmm_region { CUdeviceptr ptr; size_t size; CUmemGenericAllocationHandle handle; + bool is_cuda_malloc = false; }; namespace nixl_ep { From 7677e592e897d6dbc459191762755f7e868f20e6 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Wed, 11 Mar 2026 12:02:11 +0200 Subject: [PATCH 14/29] NIXL/EP: set default vals --- examples/device/ep/csrc/nixl_ep.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp index 54b0207f8d..d8dd9b835d 100644 --- a/examples/device/ep/csrc/nixl_ep.hpp +++ b/examples/device/ep/csrc/nixl_ep.hpp @@ -51,9 +51,9 @@ #endif struct vmm_region { - CUdeviceptr ptr; - size_t size; - CUmemGenericAllocationHandle handle; + CUdeviceptr ptr = 0; + size_t size = 0; + CUmemGenericAllocationHandle handle = 0; bool is_cuda_malloc = false; }; From 8b4b5674c14b3bb3459d4755d5d0f812849a5cb3 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Wed, 11 Mar 2026 12:16:50 +0200 Subject: [PATCH 15/29] NIXL/EP: Fix. --- examples/device/ep/csrc/nixl_ep.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp index be8da2079a..191fe23107 100644 --- a/examples/device/ep/csrc/nixl_ep.cpp +++ b/examples/device/ep/csrc/nixl_ep.cpp @@ -61,7 +61,7 @@ namespace nixl_ep { -vmm_region +static vmm_region vmm_init(size_t size, CUdevice device) { if (size == 0) { throw std::invalid_argument("vmm_init: size must be non-zero"); @@ -171,7 +171,7 @@ vmm_init(size_t size, CUdevice device) { throw std::runtime_error(err_msg); } -void +static void vmm_free(vmm_region ®ion) { if (!region.ptr) { return; From cbe8d1df814af7b35adbf07af009adfd8edfbf07 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Tue, 17 Mar 2026 22:33:49 +0200 Subject: [PATCH 16/29] NIXL/EP: Fix comments. --- examples/device/ep/csrc/nixl_ep.cpp | 129 ----------------------- examples/device/ep/csrc/nixl_ep.hpp | 8 +- examples/device/ep/csrc/vmm.cpp | 155 ++++++++++++++++++++++++++++ examples/device/ep/csrc/vmm.hpp | 38 +++++++ examples/device/ep/meson.build | 1 + 5 files changed, 195 insertions(+), 136 deletions(-) create mode 100644 examples/device/ep/csrc/vmm.cpp create mode 100644 examples/device/ep/csrc/vmm.hpp diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp index 191fe23107..011d0f7343 100644 --- a/examples/device/ep/csrc/nixl_ep.cpp +++ b/examples/device/ep/csrc/nixl_ep.cpp @@ -61,135 +61,6 @@ namespace nixl_ep { -static vmm_region -vmm_init(size_t size, CUdevice device) { - if (size == 0) { - throw std::invalid_argument("vmm_init: size must be non-zero"); - } - - struct cuda_alloc_ctx { - bool fabric_supported; - CUmemAllocationProp prop; - size_t granularity; - - cuda_alloc_ctx(CUdevice dev) : fabric_supported(false), prop({}), granularity(0) { - int version; - if (cuDriverGetVersion(&version) != CUDA_SUCCESS) { - throw std::runtime_error("Failed to get CUDA driver version"); - } - - if (version < 11000) { - return; /* too old — fall back to cudaMalloc */ - } - - int fab = 0; - if ((cuDeviceGetAttribute(&fab, - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, - dev) != CUDA_SUCCESS) || - (!fab)) { - return; /* no fabric — fall back to cudaMalloc */ - } - - int rdma_vmm_supported = 0; - if (cuDeviceGetAttribute(&rdma_vmm_supported, - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, - dev) != CUDA_SUCCESS) { - throw std::runtime_error( - "Failed to query GPUDirect RDMA with VMM support attribute"); - } - - if (!rdma_vmm_supported) { - throw std::runtime_error( - "GPUDirect RDMA with CUDA VMM is not supported on this device"); - } - - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = dev; - prop.allocFlags.gpuDirectRDMACapable = 1; - prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; - - if (cuMemGetAllocationGranularity( - &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) { - throw std::runtime_error("Failed to get CUDA allocation granularity"); - } - - fabric_supported = true; - } - }; - - static cuda_alloc_ctx ctx(device); - - if (!ctx.fabric_supported) { - vmm_region region = {}; - region.size = size; - region.is_cuda_malloc = true; - if (cudaMalloc(reinterpret_cast(®ion.ptr), size) != cudaSuccess) { - throw std::runtime_error("cudaMalloc fallback failed (fabric not supported)"); - } - return region; - } - - vmm_region region = {}; - CUmemAccessDesc access_desc = {}; - const char *err_msg; - - region.size = align_up(size, ctx.granularity); - - if (cuMemCreate(®ion.handle, region.size, &ctx.prop, 0) != CUDA_SUCCESS) { - throw std::runtime_error("Failed to create CUDA VMM allocation"); - } - - if (cuMemAddressReserve(®ion.ptr, region.size, 0, 0, 0) != CUDA_SUCCESS) { - err_msg = "Failed to reserve CUDA virtual address"; - goto err_release; - } - - if (cuMemMap(region.ptr, region.size, 0, region.handle, 0) != CUDA_SUCCESS) { - err_msg = "Failed to map CUDA VMM memory"; - goto err_free; - } - - access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - access_desc.location.id = device; - access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - if (cuMemSetAccess(region.ptr, region.size, &access_desc, 1) != CUDA_SUCCESS) { - err_msg = "Failed to set CUDA memory access"; - goto err_unmap; - } - - return region; - -err_unmap: - cuMemUnmap(region.ptr, region.size); -err_free: - cuMemAddressFree(region.ptr, region.size); - region.ptr = 0; -err_release: - cuMemRelease(region.handle); - region.handle = 0; - throw std::runtime_error(err_msg); -} - -static void -vmm_free(vmm_region ®ion) { - if (!region.ptr) { - return; - } - if (region.is_cuda_malloc) { - cudaFree(reinterpret_cast(region.ptr)); - region.ptr = 0; - return; - } - cuMemUnmap(region.ptr, region.size); - cuMemAddressFree(region.ptr, region.size); - region.ptr = 0; - if (region.handle) { - cuMemRelease(region.handle); - region.handle = 0; - } -} - static void sleep_ms(int milliseconds) { std::this_thread::sleep_for(std::chrono::milliseconds(milliseconds)); } diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp index d8dd9b835d..e8275e1283 100644 --- a/examples/device/ep/csrc/nixl_ep.hpp +++ b/examples/device/ep/csrc/nixl_ep.hpp @@ -43,6 +43,7 @@ #include "event.hpp" #include "kernels/configs.cuh" #include "kernels/exception.cuh" +#include "vmm.hpp" #include "nixl.h" @@ -50,13 +51,6 @@ #define TORCH_EXTENSION_NAME nixl_ep_cpp #endif -struct vmm_region { - CUdeviceptr ptr = 0; - size_t size = 0; - CUmemGenericAllocationHandle handle = 0; - bool is_cuda_malloc = false; -}; - namespace nixl_ep { struct NixlPeerInfo { diff --git a/examples/device/ep/csrc/vmm.cpp b/examples/device/ep/csrc/vmm.cpp new file mode 100644 index 0000000000..0ef73b2916 --- /dev/null +++ b/examples/device/ep/csrc/vmm.cpp @@ -0,0 +1,155 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 DeepSeek + * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This file incorporates material from the DeepSeek project, licensed under the MIT License. + * The modifications made by NVIDIA are licensed under the Apache License, Version 2.0. + * + * SPDX-License-Identifier: MIT AND Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "config.hpp" +#include "vmm.hpp" + +vmm_region +vmm_init(size_t size, CUdevice device) { + if (size == 0) { + throw std::invalid_argument("vmm_init: size must be non-zero"); + } + + struct cuda_alloc_ctx { + bool fabric_supported; + CUmemAllocationProp prop; + size_t granularity; + + cuda_alloc_ctx(CUdevice dev) : fabric_supported(false), prop({}), granularity(0) { + int version; + if (cuDriverGetVersion(&version) != CUDA_SUCCESS) { + throw std::runtime_error("Failed to get CUDA driver version"); + } + + if (version < 11000) { + return; /* too old — fall back to cudaMalloc */ + } + + int fab = 0; + if ((cuDeviceGetAttribute(&fab, + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, + dev) != CUDA_SUCCESS) || + (!fab)) { + return; /* no fabric — fall back to cudaMalloc */ + } + + int rdma_vmm_supported = 0; + if (cuDeviceGetAttribute(&rdma_vmm_supported, + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, + dev) != CUDA_SUCCESS) { + throw std::runtime_error( + "Failed to query GPUDirect RDMA with VMM support attribute"); + } + + if (!rdma_vmm_supported) { + throw std::runtime_error( + "GPUDirect RDMA with CUDA VMM is not supported on this device"); + } + + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = dev; + prop.allocFlags.gpuDirectRDMACapable = 1; + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; + + if (cuMemGetAllocationGranularity( + &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) { + throw std::runtime_error("Failed to get CUDA allocation granularity"); + } + + fabric_supported = true; + } + }; + + static cuda_alloc_ctx ctx(device); + + if (!ctx.fabric_supported) { + vmm_region region = {}; + region.size = size; + region.is_cuda_malloc = true; + if (cudaMalloc(reinterpret_cast(®ion.ptr), size) != cudaSuccess) { + throw std::runtime_error("cudaMalloc fallback failed (fabric not supported)"); + } + return region; + } + + vmm_region region = {}; + CUmemAccessDesc access_desc = {}; + const char *err_msg; + + region.size = align_up(size, ctx.granularity); + + if (cuMemCreate(®ion.handle, region.size, &ctx.prop, 0) != CUDA_SUCCESS) { + throw std::runtime_error("Failed to create CUDA VMM allocation"); + } + + if (cuMemAddressReserve(®ion.ptr, region.size, 0, 0, 0) != CUDA_SUCCESS) { + err_msg = "Failed to reserve CUDA virtual address"; + goto err_release; + } + + if (cuMemMap(region.ptr, region.size, 0, region.handle, 0) != CUDA_SUCCESS) { + err_msg = "Failed to map CUDA VMM memory"; + goto err_free; + } + + access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + access_desc.location.id = device; + access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + if (cuMemSetAccess(region.ptr, region.size, &access_desc, 1) != CUDA_SUCCESS) { + err_msg = "Failed to set CUDA memory access"; + goto err_unmap; + } + + return region; + +err_unmap: + cuMemUnmap(region.ptr, region.size); +err_free: + cuMemAddressFree(region.ptr, region.size); + region.ptr = 0; +err_release: + cuMemRelease(region.handle); + region.handle = 0; + throw std::runtime_error(err_msg); +} + +void +vmm_free(vmm_region ®ion) { + if (!region.ptr) { + return; + } + if (region.is_cuda_malloc) { + cudaFree(reinterpret_cast(region.ptr)); + region.ptr = 0; + return; + } + cuMemUnmap(region.ptr, region.size); + cuMemAddressFree(region.ptr, region.size); + region.ptr = 0; + if (region.handle) { + cuMemRelease(region.handle); + region.handle = 0; + } +} diff --git a/examples/device/ep/csrc/vmm.hpp b/examples/device/ep/csrc/vmm.hpp new file mode 100644 index 0000000000..af63ac643a --- /dev/null +++ b/examples/device/ep/csrc/vmm.hpp @@ -0,0 +1,38 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 DeepSeek + * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This file incorporates material from the DeepSeek project, licensed under the MIT License. + * The modifications made by NVIDIA are licensed under the Apache License, Version 2.0. + * + * SPDX-License-Identifier: MIT AND Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +struct vmm_region { + CUdeviceptr ptr = 0; + size_t size = 0; + CUmemGenericAllocationHandle handle = 0; + bool is_cuda_malloc = false; +}; + +vmm_region +vmm_init(size_t size, CUdevice device); +void +vmm_free(vmm_region ®ion); diff --git a/examples/device/ep/meson.build b/examples/device/ep/meson.build index 84ddad229c..fdc879d902 100644 --- a/examples/device/ep/meson.build +++ b/examples/device/ep/meson.build @@ -63,6 +63,7 @@ endif nixl_ep_sources = [ 'csrc/nixl_ep.cpp', + 'csrc/vmm.cpp', 'csrc/kernels/nixl_ep.cu', ] From 088c96001ad0e7a8a4501b62ef62fffd15d09dee Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Tue, 17 Mar 2026 22:40:26 +0200 Subject: [PATCH 17/29] NIXL/EP: Fix. --- examples/device/ep/csrc/vmm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/device/ep/csrc/vmm.cpp b/examples/device/ep/csrc/vmm.cpp index 0ef73b2916..289fa9ab91 100644 --- a/examples/device/ep/csrc/vmm.cpp +++ b/examples/device/ep/csrc/vmm.cpp @@ -98,7 +98,7 @@ vmm_init(size_t size, CUdevice device) { CUmemAccessDesc access_desc = {}; const char *err_msg; - region.size = align_up(size, ctx.granularity); + region.size = nixl_ep::align_up(size, ctx.granularity); if (cuMemCreate(®ion.handle, region.size, &ctx.prop, 0) != CUDA_SUCCESS) { throw std::runtime_error("Failed to create CUDA VMM allocation"); From 15f9174ddb82b5d268ef35974e07c01b4a428468 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Tue, 17 Mar 2026 22:43:30 +0200 Subject: [PATCH 18/29] NIXL/EP: Fix. --- examples/device/ep/csrc/config.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/device/ep/csrc/config.hpp b/examples/device/ep/csrc/config.hpp index f17ac113d7..89b49cc419 100644 --- a/examples/device/ep/csrc/config.hpp +++ b/examples/device/ep/csrc/config.hpp @@ -122,7 +122,11 @@ struct EPLayout { } }; -size_t get_rdma_size_hint(int num_max_dispatch_tokens_per_rank, int hidden, int num_ranks, int num_experts) { +inline size_t +get_rdma_size_hint(int num_max_dispatch_tokens_per_rank, + int hidden, + int num_ranks, + int num_experts) { auto num_bytes = EPLayout(nullptr, num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts).total_bytes; return ((num_bytes + NUM_BUFFER_ALIGNMENT_BYTES) / NUM_BUFFER_ALIGNMENT_BYTES) * NUM_BUFFER_ALIGNMENT_BYTES; } From a44a0947afa55a2675ce5b39aeac1eb51371df59 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Tue, 17 Mar 2026 22:53:28 +0200 Subject: [PATCH 19/29] NIXL/EP: not needed. --- examples/device/ep/csrc/nixl_ep.cpp | 1 - examples/device/ep/csrc/nixl_ep.hpp | 3 --- 2 files changed, 4 deletions(-) diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp index 011d0f7343..910bb6d2f6 100644 --- a/examples/device/ep/csrc/nixl_ep.cpp +++ b/examples/device/ep/csrc/nixl_ep.cpp @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp index e8275e1283..92059c5611 100644 --- a/examples/device/ep/csrc/nixl_ep.hpp +++ b/examples/device/ep/csrc/nixl_ep.hpp @@ -35,10 +35,7 @@ #include #include -#include -#include #include -#include #include "config.hpp" #include "event.hpp" #include "kernels/configs.cuh" From f120473097f59a5ab5c1a03bf814763b5e49b9f7 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Tue, 17 Mar 2026 23:15:38 +0200 Subject: [PATCH 20/29] NIXL/EP: new files terms. --- examples/device/ep/csrc/vmm.cpp | 9 ++------- examples/device/ep/csrc/vmm.hpp | 9 ++------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/examples/device/ep/csrc/vmm.cpp b/examples/device/ep/csrc/vmm.cpp index 289fa9ab91..62cb10114c 100644 --- a/examples/device/ep/csrc/vmm.cpp +++ b/examples/device/ep/csrc/vmm.cpp @@ -1,17 +1,12 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025 DeepSeek * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This file incorporates material from the DeepSeek project, licensed under the MIT License. - * The modifications made by NVIDIA are licensed under the Apache License, Version 2.0. - * - * SPDX-License-Identifier: MIT AND Apache-2.0 + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, diff --git a/examples/device/ep/csrc/vmm.hpp b/examples/device/ep/csrc/vmm.hpp index af63ac643a..42e0c7a87a 100644 --- a/examples/device/ep/csrc/vmm.hpp +++ b/examples/device/ep/csrc/vmm.hpp @@ -1,17 +1,12 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025 DeepSeek * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This file incorporates material from the DeepSeek project, licensed under the MIT License. - * The modifications made by NVIDIA are licensed under the Apache License, Version 2.0. - * - * SPDX-License-Identifier: MIT AND Apache-2.0 + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, From 9ef9dcac07519d35c9162eee341a5ae44104657b Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Tue, 24 Mar 2026 18:06:50 +0200 Subject: [PATCH 21/29] NIXL/EP: Fix comments. --- examples/device/ep/csrc/nixl_ep.cpp | 30 +++--- examples/device/ep/csrc/vmm.cpp | 137 ++++++++++++++++++---------- examples/device/ep/csrc/vmm.hpp | 52 +++++++++-- 3 files changed, 146 insertions(+), 73 deletions(-) diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp index 910bb6d2f6..4f8dd30fd5 100644 --- a/examples/device/ep/csrc/nixl_ep.cpp +++ b/examples/device/ep/csrc/nixl_ep.cpp @@ -103,27 +103,27 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte EP_HOST_ASSERT(per_channel_bytes < std::numeric_limits::max()); // Create 32 MiB workspace - m_workspace_alloc = vmm_init(NUM_WORKSPACE_BYTES, device_id); - workspace = reinterpret_cast(m_workspace_alloc.ptr); + m_workspace_alloc = vmm_region::allocate(NUM_WORKSPACE_BYTES, device_id); + workspace = reinterpret_cast(m_workspace_alloc.ptr()); CUDA_CHECK(cudaMemsetAsync(workspace, 0, NUM_WORKSPACE_BYTES, comm_stream)); EP_HOST_ASSERT(max_experts_per_rank > 0); - m_rdma_alloc = vmm_init(num_rdma_bytes, device_id); - rdma_buffer_ptr = reinterpret_cast(m_rdma_alloc.ptr); + m_rdma_alloc = vmm_region::allocate(num_rdma_bytes, device_id); + rdma_buffer_ptr = reinterpret_cast(m_rdma_alloc.ptr()); CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes)); // Allocate and clean shrink buffer int num_mask_buffer_bytes = max_num_ranks * sizeof(int); - m_mask_alloc = vmm_init(num_mask_buffer_bytes, device_id); - mask_buffer_ptr = reinterpret_cast(m_mask_alloc.ptr); + m_mask_alloc = vmm_region::allocate(num_mask_buffer_bytes, device_id); + mask_buffer_ptr = reinterpret_cast(m_mask_alloc.ptr()); CUDA_CHECK(cudaMemset(mask_buffer_ptr, 0xff, num_mask_buffer_bytes)); CUDA_CHECK(cudaMemset(mask_buffer_ptr + rank, 0, sizeof(int))); int num_sync_buffer_bytes = max_num_ranks * sizeof(int); - m_sync_alloc = vmm_init(num_sync_buffer_bytes, device_id); - sync_buffer_ptr = reinterpret_cast(m_sync_alloc.ptr); - m_sync_count_alloc = vmm_init(num_sync_buffer_bytes, device_id); - sync_count_ptr = reinterpret_cast(m_sync_count_alloc.ptr); + m_sync_alloc = vmm_region::allocate(num_sync_buffer_bytes, device_id); + sync_buffer_ptr = reinterpret_cast(m_sync_alloc.ptr()); + m_sync_count_alloc = vmm_region::allocate(num_sync_buffer_bytes, device_id); + sync_count_ptr = reinterpret_cast(m_sync_count_alloc.ptr()); CUDA_CHECK(cudaMemset(sync_buffer_ptr, 0, num_sync_buffer_bytes)); CUDA_CHECK(cudaMemset(sync_count_ptr, 0, num_sync_buffer_bytes)); CUDA_CHECK(cudaDeviceSynchronize()); @@ -177,21 +177,21 @@ void Buffer::destroy() { _nixl_ep_destroy(); - vmm_free(m_rdma_alloc); + m_rdma_alloc = vmm_region{}; rdma_buffer_ptr = nullptr; if (nixl_agent_info and nixl_agent_info->agent != nullptr and getenv("NIXL_ETCD_ENDPOINTS")) { nixl_agent_info->agent->invalidateLocalMD(); } - vmm_free(m_mask_alloc); + m_mask_alloc = vmm_region{}; mask_buffer_ptr = nullptr; - vmm_free(m_sync_alloc); + m_sync_alloc = vmm_region{}; sync_buffer_ptr = nullptr; - vmm_free(m_sync_count_alloc); + m_sync_count_alloc = vmm_region{}; sync_count_ptr = nullptr; - vmm_free(m_workspace_alloc); + m_workspace_alloc = vmm_region{}; workspace = nullptr; destroyed = true; diff --git a/examples/device/ep/csrc/vmm.cpp b/examples/device/ep/csrc/vmm.cpp index 62cb10114c..ac37a5ec4f 100644 --- a/examples/device/ep/csrc/vmm.cpp +++ b/examples/device/ep/csrc/vmm.cpp @@ -16,14 +16,83 @@ */ #include +#include #include "config.hpp" #include "vmm.hpp" +void +vmm_region::release() noexcept { + if (is_cuda_malloc_) { + if (ptr_) { + cudaFree(reinterpret_cast(ptr_)); + } + ptr_ = 0; + size_ = 0; + is_cuda_malloc_ = false; + return; + } + + if (vmm_mapped_) { + cuMemUnmap(ptr_, size_); + vmm_mapped_ = false; + } + if (vmm_addr_reserved_ && ptr_) { + cuMemAddressFree(ptr_, size_); + ptr_ = 0; + vmm_addr_reserved_ = false; + } + if (handle_) { + cuMemRelease(handle_); + handle_ = 0; + } + size_ = 0; +} + +vmm_region::~vmm_region() { + release(); +} + +vmm_region::vmm_region(vmm_region &&other) noexcept + : ptr_(other.ptr_), + size_(other.size_), + handle_(other.handle_), + is_cuda_malloc_(other.is_cuda_malloc_), + vmm_addr_reserved_(other.vmm_addr_reserved_), + vmm_mapped_(other.vmm_mapped_) { + other.ptr_ = 0; + other.size_ = 0; + other.handle_ = 0; + other.is_cuda_malloc_ = false; + other.vmm_addr_reserved_ = false; + other.vmm_mapped_ = false; +} + +vmm_region & +vmm_region::operator=(vmm_region &&other) noexcept { + if (this == &other) { + return *this; + } + release(); + ptr_ = other.ptr_; + size_ = other.size_; + handle_ = other.handle_; + is_cuda_malloc_ = other.is_cuda_malloc_; + vmm_addr_reserved_ = other.vmm_addr_reserved_; + vmm_mapped_ = other.vmm_mapped_; + other.ptr_ = 0; + other.size_ = 0; + other.handle_ = 0; + other.is_cuda_malloc_ = false; + other.vmm_addr_reserved_ = false; + other.vmm_mapped_ = false; + return *this; +} + vmm_region -vmm_init(size_t size, CUdevice device) { +vmm_region::allocate(size_t size, CUdevice device) { if (size == 0) { - throw std::invalid_argument("vmm_init: size must be non-zero"); + throw std::invalid_argument("vmm_region::allocate: size must be non-zero"); } struct cuda_alloc_ctx { @@ -79,72 +148,44 @@ vmm_init(size_t size, CUdevice device) { static cuda_alloc_ctx ctx(device); + vmm_region region; + if (!ctx.fabric_supported) { - vmm_region region = {}; - region.size = size; - region.is_cuda_malloc = true; - if (cudaMalloc(reinterpret_cast(®ion.ptr), size) != cudaSuccess) { + region.size_ = size; + region.is_cuda_malloc_ = true; + if (cudaMalloc(reinterpret_cast(®ion.ptr_), size) != cudaSuccess) { throw std::runtime_error("cudaMalloc fallback failed (fabric not supported)"); } return region; } - vmm_region region = {}; CUmemAccessDesc access_desc = {}; - const char *err_msg; - region.size = nixl_ep::align_up(size, ctx.granularity); + region.size_ = nixl_ep::align_up(size, ctx.granularity); - if (cuMemCreate(®ion.handle, region.size, &ctx.prop, 0) != CUDA_SUCCESS) { + if (cuMemCreate(®ion.handle_, region.size_, &ctx.prop, 0) != CUDA_SUCCESS) { throw std::runtime_error("Failed to create CUDA VMM allocation"); } - if (cuMemAddressReserve(®ion.ptr, region.size, 0, 0, 0) != CUDA_SUCCESS) { - err_msg = "Failed to reserve CUDA virtual address"; - goto err_release; + if (cuMemAddressReserve(®ion.ptr_, region.size_, 0, 0, 0) != CUDA_SUCCESS) { + region.release(); + throw std::runtime_error("Failed to reserve CUDA virtual address"); } + region.vmm_addr_reserved_ = true; - if (cuMemMap(region.ptr, region.size, 0, region.handle, 0) != CUDA_SUCCESS) { - err_msg = "Failed to map CUDA VMM memory"; - goto err_free; + if (cuMemMap(region.ptr_, region.size_, 0, region.handle_, 0) != CUDA_SUCCESS) { + region.release(); + throw std::runtime_error("Failed to map CUDA VMM memory"); } + region.vmm_mapped_ = true; access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; access_desc.location.id = device; access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - if (cuMemSetAccess(region.ptr, region.size, &access_desc, 1) != CUDA_SUCCESS) { - err_msg = "Failed to set CUDA memory access"; - goto err_unmap; + if (cuMemSetAccess(region.ptr_, region.size_, &access_desc, 1) != CUDA_SUCCESS) { + region.release(); + throw std::runtime_error("Failed to set CUDA memory access"); } return region; - -err_unmap: - cuMemUnmap(region.ptr, region.size); -err_free: - cuMemAddressFree(region.ptr, region.size); - region.ptr = 0; -err_release: - cuMemRelease(region.handle); - region.handle = 0; - throw std::runtime_error(err_msg); -} - -void -vmm_free(vmm_region ®ion) { - if (!region.ptr) { - return; - } - if (region.is_cuda_malloc) { - cudaFree(reinterpret_cast(region.ptr)); - region.ptr = 0; - return; - } - cuMemUnmap(region.ptr, region.size); - cuMemAddressFree(region.ptr, region.size); - region.ptr = 0; - if (region.handle) { - cuMemRelease(region.handle); - region.handle = 0; - } } diff --git a/examples/device/ep/csrc/vmm.hpp b/examples/device/ep/csrc/vmm.hpp index 42e0c7a87a..b91ae811b5 100644 --- a/examples/device/ep/csrc/vmm.hpp +++ b/examples/device/ep/csrc/vmm.hpp @@ -19,15 +19,47 @@ #include #include +#include -struct vmm_region { - CUdeviceptr ptr = 0; - size_t size = 0; - CUmemGenericAllocationHandle handle = 0; - bool is_cuda_malloc = false; -}; +class vmm_region { +public: + vmm_region() = default; + ~vmm_region(); + + vmm_region(vmm_region &&other) noexcept; + vmm_region & + operator=(vmm_region &&other) noexcept; + + vmm_region(const vmm_region &) = delete; + vmm_region & + operator=(const vmm_region &) = delete; + + [[nodiscard]] static vmm_region + allocate(size_t size, CUdevice device); + + [[nodiscard]] CUdeviceptr + ptr() const noexcept { + return ptr_; + } -vmm_region -vmm_init(size_t size, CUdevice device); -void -vmm_free(vmm_region ®ion); + [[nodiscard]] size_t + size() const noexcept { + return size_; + } + + [[nodiscard]] CUmemGenericAllocationHandle + handle() const noexcept { + return handle_; + } + +private: + void + release() noexcept; + + CUdeviceptr ptr_ = 0; + size_t size_ = 0; + CUmemGenericAllocationHandle handle_ = 0; + bool is_cuda_malloc_ = false; + bool vmm_addr_reserved_ = false; + bool vmm_mapped_ = false; +}; From 22c59f70f87b59aa2177e1b81ffe9dbef553caba Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Tue, 24 Mar 2026 19:29:05 +0200 Subject: [PATCH 22/29] NIXL/EP: Fix comment. --- examples/device/ep/csrc/cuda_utils.hpp | 48 ++++++++++++++ examples/device/ep/csrc/nixl_ep.cpp | 36 ++++++----- examples/device/ep/csrc/nixl_ep.hpp | 10 +-- examples/device/ep/csrc/vmm.cpp | 87 ++++++++------------------ examples/device/ep/csrc/vmm.hpp | 13 ++-- 5 files changed, 103 insertions(+), 91 deletions(-) create mode 100644 examples/device/ep/csrc/cuda_utils.hpp diff --git a/examples/device/ep/csrc/cuda_utils.hpp b/examples/device/ep/csrc/cuda_utils.hpp new file mode 100644 index 0000000000..4ec5d4e0b4 --- /dev/null +++ b/examples/device/ep/csrc/cuda_utils.hpp @@ -0,0 +1,48 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +namespace nixl_ep { + +/** Log a non-fatal warning if a CUDA runtime API call failed (e.g. during teardown). */ +inline void +warn_cuda_api(cudaError_t status, const char *context, const char *operation) noexcept { + if (status != cudaSuccess) { + std::cerr << "WARNING: " << context << " failed to " << operation << ": " + << cudaGetErrorString(status) << '\n'; + } +} + +/** Log a non-fatal warning if a CUDA driver API call failed (e.g. during teardown). */ +inline void +warn_cu_api(CUresult status, const char *context, const char *operation) noexcept { + if (status != CUDA_SUCCESS) { + const char *msg = nullptr; + if (cuGetErrorString(status, &msg) != CUDA_SUCCESS || msg == nullptr) { + msg = "unknown CUDA driver error"; + } + std::cerr << "WARNING: " << context << " failed to " << operation << ": " << msg << '\n'; + } +} + +} // namespace nixl_ep diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp index da1d0758e0..39f6f64986 100644 --- a/examples/device/ep/csrc/nixl_ep.cpp +++ b/examples/device/ep/csrc/nixl_ep.cpp @@ -34,6 +34,7 @@ #include #include "nixl_ep.hpp" +#include "cuda_utils.hpp" #include "kernels/api.cuh" #include "kernels/configs.cuh" #include @@ -102,28 +103,31 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte auto per_channel_bytes = ceil_div(num_rdma_bytes, denom_sms); EP_HOST_ASSERT(per_channel_bytes < std::numeric_limits::max()); + const CUdevice cu_dev = static_cast(device_id); + // Create 32 MiB workspace - m_workspace_alloc = vmm_region::allocate(NUM_WORKSPACE_BYTES, device_id); - workspace = reinterpret_cast(m_workspace_alloc.ptr()); + m_workspace_alloc = std::make_unique(NUM_WORKSPACE_BYTES, cu_dev); + workspace = reinterpret_cast(m_workspace_alloc->ptr()); CUDA_CHECK(cudaMemsetAsync(workspace, 0, NUM_WORKSPACE_BYTES, comm_stream)); EP_HOST_ASSERT(max_experts_per_rank > 0); - m_rdma_alloc = vmm_region::allocate(num_rdma_bytes, device_id); - rdma_buffer_ptr = reinterpret_cast(m_rdma_alloc.ptr()); + m_rdma_alloc = std::make_unique(static_cast(num_rdma_bytes), cu_dev); + rdma_buffer_ptr = reinterpret_cast(m_rdma_alloc->ptr()); CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes)); // Allocate and clean shrink buffer int num_mask_buffer_bytes = max_num_ranks * sizeof(int); - m_mask_alloc = vmm_region::allocate(num_mask_buffer_bytes, device_id); - mask_buffer_ptr = reinterpret_cast(m_mask_alloc.ptr()); + m_mask_alloc = std::make_unique(static_cast(num_mask_buffer_bytes), cu_dev); + mask_buffer_ptr = reinterpret_cast(m_mask_alloc->ptr()); CUDA_CHECK(cudaMemset(mask_buffer_ptr, 0xff, num_mask_buffer_bytes)); CUDA_CHECK(cudaMemset(mask_buffer_ptr + rank, 0, sizeof(int))); int num_sync_buffer_bytes = max_num_ranks * sizeof(int); - m_sync_alloc = vmm_region::allocate(num_sync_buffer_bytes, device_id); - sync_buffer_ptr = reinterpret_cast(m_sync_alloc.ptr()); - m_sync_count_alloc = vmm_region::allocate(num_sync_buffer_bytes, device_id); - sync_count_ptr = reinterpret_cast(m_sync_count_alloc.ptr()); + m_sync_alloc = std::make_unique(static_cast(num_sync_buffer_bytes), cu_dev); + sync_buffer_ptr = reinterpret_cast(m_sync_alloc->ptr()); + m_sync_count_alloc = + std::make_unique(static_cast(num_sync_buffer_bytes), cu_dev); + sync_count_ptr = reinterpret_cast(m_sync_count_alloc->ptr()); CUDA_CHECK(cudaMemset(sync_buffer_ptr, 0, num_sync_buffer_bytes)); CUDA_CHECK(cudaMemset(sync_count_ptr, 0, num_sync_buffer_bytes)); CUDA_CHECK(cudaDeviceSynchronize()); @@ -189,7 +193,7 @@ void Buffer::destroy() { } // Synchronize - warn_cuda(cudaDeviceSynchronize(), "synchronize device"); + warn_cuda_api(cudaDeviceSynchronize(), "destroy()", "synchronize device"); _nixl_ep_destroy(); @@ -214,16 +218,16 @@ void Buffer::destroy() { nixl_agent_info.reset(); } - m_rdma_alloc = vmm_region{}; + m_rdma_alloc.reset(); rdma_buffer_ptr = nullptr; - m_mask_alloc = vmm_region{}; + m_mask_alloc.reset(); mask_buffer_ptr = nullptr; - m_sync_alloc = vmm_region{}; + m_sync_alloc.reset(); sync_buffer_ptr = nullptr; - m_sync_count_alloc = vmm_region{}; + m_sync_count_alloc.reset(); sync_count_ptr = nullptr; - m_workspace_alloc = vmm_region{}; + m_workspace_alloc.reset(); workspace = nullptr; destroyed = true; diff --git a/examples/device/ep/csrc/nixl_ep.hpp b/examples/device/ep/csrc/nixl_ep.hpp index 9d520a195a..1053ab9d40 100644 --- a/examples/device/ep/csrc/nixl_ep.hpp +++ b/examples/device/ep/csrc/nixl_ep.hpp @@ -88,11 +88,11 @@ struct Buffer { int *sync_count_ptr = nullptr; /* Owning VMM allocations (keep raw ptrs above as aliases) */ - vmm_region m_rdma_alloc; - vmm_region m_mask_alloc; - vmm_region m_sync_alloc; - vmm_region m_sync_count_alloc; - vmm_region m_workspace_alloc; + std::unique_ptr m_rdma_alloc; + std::unique_ptr m_mask_alloc; + std::unique_ptr m_sync_alloc; + std::unique_ptr m_sync_count_alloc; + std::unique_ptr m_workspace_alloc; // Device info and communication int device_id; diff --git a/examples/device/ep/csrc/vmm.cpp b/examples/device/ep/csrc/vmm.cpp index ac37a5ec4f..63dd2699b8 100644 --- a/examples/device/ep/csrc/vmm.cpp +++ b/examples/device/ep/csrc/vmm.cpp @@ -16,16 +16,20 @@ */ #include -#include #include "config.hpp" +#include "cuda_utils.hpp" #include "vmm.hpp" +namespace { +constexpr const char *k_vmm_ctx = "vmm_region"; +} + void vmm_region::release() noexcept { if (is_cuda_malloc_) { if (ptr_) { - cudaFree(reinterpret_cast(ptr_)); + nixl_ep::warn_cuda_api(cudaFree(reinterpret_cast(ptr_)), k_vmm_ctx, "cudaFree"); } ptr_ = 0; size_ = 0; @@ -34,16 +38,16 @@ vmm_region::release() noexcept { } if (vmm_mapped_) { - cuMemUnmap(ptr_, size_); + nixl_ep::warn_cu_api(cuMemUnmap(ptr_, size_), k_vmm_ctx, "cuMemUnmap"); vmm_mapped_ = false; } if (vmm_addr_reserved_ && ptr_) { - cuMemAddressFree(ptr_, size_); + nixl_ep::warn_cu_api(cuMemAddressFree(ptr_, size_), k_vmm_ctx, "cuMemAddressFree"); ptr_ = 0; vmm_addr_reserved_ = false; } if (handle_) { - cuMemRelease(handle_); + nixl_ep::warn_cu_api(cuMemRelease(handle_), k_vmm_ctx, "cuMemRelease"); handle_ = 0; } size_ = 0; @@ -53,46 +57,9 @@ vmm_region::~vmm_region() { release(); } -vmm_region::vmm_region(vmm_region &&other) noexcept - : ptr_(other.ptr_), - size_(other.size_), - handle_(other.handle_), - is_cuda_malloc_(other.is_cuda_malloc_), - vmm_addr_reserved_(other.vmm_addr_reserved_), - vmm_mapped_(other.vmm_mapped_) { - other.ptr_ = 0; - other.size_ = 0; - other.handle_ = 0; - other.is_cuda_malloc_ = false; - other.vmm_addr_reserved_ = false; - other.vmm_mapped_ = false; -} - -vmm_region & -vmm_region::operator=(vmm_region &&other) noexcept { - if (this == &other) { - return *this; - } - release(); - ptr_ = other.ptr_; - size_ = other.size_; - handle_ = other.handle_; - is_cuda_malloc_ = other.is_cuda_malloc_; - vmm_addr_reserved_ = other.vmm_addr_reserved_; - vmm_mapped_ = other.vmm_mapped_; - other.ptr_ = 0; - other.size_ = 0; - other.handle_ = 0; - other.is_cuda_malloc_ = false; - other.vmm_addr_reserved_ = false; - other.vmm_mapped_ = false; - return *this; -} - -vmm_region -vmm_region::allocate(size_t size, CUdevice device) { +vmm_region::vmm_region(size_t size, CUdevice device) { if (size == 0) { - throw std::invalid_argument("vmm_region::allocate: size must be non-zero"); + throw std::invalid_argument("vmm_region: size must be non-zero"); } struct cuda_alloc_ctx { @@ -148,44 +115,40 @@ vmm_region::allocate(size_t size, CUdevice device) { static cuda_alloc_ctx ctx(device); - vmm_region region; - if (!ctx.fabric_supported) { - region.size_ = size; - region.is_cuda_malloc_ = true; - if (cudaMalloc(reinterpret_cast(®ion.ptr_), size) != cudaSuccess) { + size_ = size; + is_cuda_malloc_ = true; + if (cudaMalloc(reinterpret_cast(&ptr_), size) != cudaSuccess) { throw std::runtime_error("cudaMalloc fallback failed (fabric not supported)"); } - return region; + return; } CUmemAccessDesc access_desc = {}; - region.size_ = nixl_ep::align_up(size, ctx.granularity); + size_ = nixl_ep::align_up(size, ctx.granularity); - if (cuMemCreate(®ion.handle_, region.size_, &ctx.prop, 0) != CUDA_SUCCESS) { + if (cuMemCreate(&handle_, size_, &ctx.prop, 0) != CUDA_SUCCESS) { throw std::runtime_error("Failed to create CUDA VMM allocation"); } - if (cuMemAddressReserve(®ion.ptr_, region.size_, 0, 0, 0) != CUDA_SUCCESS) { - region.release(); + if (cuMemAddressReserve(&ptr_, size_, 0, 0, 0) != CUDA_SUCCESS) { + release(); throw std::runtime_error("Failed to reserve CUDA virtual address"); } - region.vmm_addr_reserved_ = true; + vmm_addr_reserved_ = true; - if (cuMemMap(region.ptr_, region.size_, 0, region.handle_, 0) != CUDA_SUCCESS) { - region.release(); + if (cuMemMap(ptr_, size_, 0, handle_, 0) != CUDA_SUCCESS) { + release(); throw std::runtime_error("Failed to map CUDA VMM memory"); } - region.vmm_mapped_ = true; + vmm_mapped_ = true; access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; access_desc.location.id = device; access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - if (cuMemSetAccess(region.ptr_, region.size_, &access_desc, 1) != CUDA_SUCCESS) { - region.release(); + if (cuMemSetAccess(ptr_, size_, &access_desc, 1) != CUDA_SUCCESS) { + release(); throw std::runtime_error("Failed to set CUDA memory access"); } - - return region; } diff --git a/examples/device/ep/csrc/vmm.hpp b/examples/device/ep/csrc/vmm.hpp index b91ae811b5..8b3edf8a3b 100644 --- a/examples/device/ep/csrc/vmm.hpp +++ b/examples/device/ep/csrc/vmm.hpp @@ -23,19 +23,16 @@ class vmm_region { public: - vmm_region() = default; - ~vmm_region(); + explicit vmm_region(size_t size, CUdevice device); - vmm_region(vmm_region &&other) noexcept; - vmm_region & - operator=(vmm_region &&other) noexcept; + ~vmm_region(); vmm_region(const vmm_region &) = delete; vmm_region & operator=(const vmm_region &) = delete; - - [[nodiscard]] static vmm_region - allocate(size_t size, CUdevice device); + vmm_region(vmm_region &&) = delete; + vmm_region & + operator=(vmm_region &&) = delete; [[nodiscard]] CUdeviceptr ptr() const noexcept { From 0a58f96f70a9a734ee270cff29fe10c8af79a6e1 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Tue, 24 Mar 2026 19:30:41 +0200 Subject: [PATCH 23/29] NIXL/EP: Fix. --- examples/device/ep/csrc/nixl_ep.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp index 39f6f64986..aa43e9f60c 100644 --- a/examples/device/ep/csrc/nixl_ep.cpp +++ b/examples/device/ep/csrc/nixl_ep.cpp @@ -174,13 +174,6 @@ torch::Stream Buffer::get_comm_stream() const { } void Buffer::destroy() { - auto warn_cuda = [](cudaError_t status, const char* operation) noexcept { - if (status != cudaSuccess) { - std::cerr << "WARNING: destroy() failed to " << operation - << ": " << cudaGetErrorString(status) << '\n'; - } - }; - auto warn_nixl = [](nixl_status_t status, const char* operation) noexcept { if (status != NIXL_SUCCESS) { std::cerr << "WARNING: destroy() failed to " << operation From d2bdfeac7c79d1953dda929a8683671c9c3ec4eb Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Tue, 24 Mar 2026 21:11:39 +0200 Subject: [PATCH 24/29] NIXL/EP: fix. --- examples/device/ep/csrc/vmm.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/device/ep/csrc/vmm.cpp b/examples/device/ep/csrc/vmm.cpp index 63dd2699b8..0310fc4a56 100644 --- a/examples/device/ep/csrc/vmm.cpp +++ b/examples/device/ep/csrc/vmm.cpp @@ -32,8 +32,6 @@ vmm_region::release() noexcept { nixl_ep::warn_cuda_api(cudaFree(reinterpret_cast(ptr_)), k_vmm_ctx, "cudaFree"); } ptr_ = 0; - size_ = 0; - is_cuda_malloc_ = false; return; } @@ -50,7 +48,6 @@ vmm_region::release() noexcept { nixl_ep::warn_cu_api(cuMemRelease(handle_), k_vmm_ctx, "cuMemRelease"); handle_ = 0; } - size_ = 0; } vmm_region::~vmm_region() { From 040589972c619aa193d6496bb48528d3f5e01973 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Wed, 25 Mar 2026 19:26:33 +0200 Subject: [PATCH 25/29] NIXL/EP: Fix comment. --- examples/device/ep/csrc/cuda_utils.hpp | 48 -------------------------- examples/device/ep/csrc/nixl_ep.cpp | 15 +++++--- examples/device/ep/csrc/vmm.cpp | 37 ++++++++++++++------ examples/device/ep/csrc/vmm.hpp | 25 ++++++-------- 4 files changed, 48 insertions(+), 77 deletions(-) delete mode 100644 examples/device/ep/csrc/cuda_utils.hpp diff --git a/examples/device/ep/csrc/cuda_utils.hpp b/examples/device/ep/csrc/cuda_utils.hpp deleted file mode 100644 index 4ec5d4e0b4..0000000000 --- a/examples/device/ep/csrc/cuda_utils.hpp +++ /dev/null @@ -1,48 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include - -namespace nixl_ep { - -/** Log a non-fatal warning if a CUDA runtime API call failed (e.g. during teardown). */ -inline void -warn_cuda_api(cudaError_t status, const char *context, const char *operation) noexcept { - if (status != cudaSuccess) { - std::cerr << "WARNING: " << context << " failed to " << operation << ": " - << cudaGetErrorString(status) << '\n'; - } -} - -/** Log a non-fatal warning if a CUDA driver API call failed (e.g. during teardown). */ -inline void -warn_cu_api(CUresult status, const char *context, const char *operation) noexcept { - if (status != CUDA_SUCCESS) { - const char *msg = nullptr; - if (cuGetErrorString(status, &msg) != CUDA_SUCCESS || msg == nullptr) { - msg = "unknown CUDA driver error"; - } - std::cerr << "WARNING: " << context << " failed to " << operation << ": " << msg << '\n'; - } -} - -} // namespace nixl_ep diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp index aa43e9f60c..26d40a9351 100644 --- a/examples/device/ep/csrc/nixl_ep.cpp +++ b/examples/device/ep/csrc/nixl_ep.cpp @@ -107,7 +107,7 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte // Create 32 MiB workspace m_workspace_alloc = std::make_unique(NUM_WORKSPACE_BYTES, cu_dev); - workspace = reinterpret_cast(m_workspace_alloc->ptr()); + workspace = m_workspace_alloc->ptr(); CUDA_CHECK(cudaMemsetAsync(workspace, 0, NUM_WORKSPACE_BYTES, comm_stream)); EP_HOST_ASSERT(max_experts_per_rank > 0); @@ -118,16 +118,16 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte // Allocate and clean shrink buffer int num_mask_buffer_bytes = max_num_ranks * sizeof(int); m_mask_alloc = std::make_unique(static_cast(num_mask_buffer_bytes), cu_dev); - mask_buffer_ptr = reinterpret_cast(m_mask_alloc->ptr()); + mask_buffer_ptr = static_cast(m_mask_alloc->ptr()); CUDA_CHECK(cudaMemset(mask_buffer_ptr, 0xff, num_mask_buffer_bytes)); CUDA_CHECK(cudaMemset(mask_buffer_ptr + rank, 0, sizeof(int))); int num_sync_buffer_bytes = max_num_ranks * sizeof(int); m_sync_alloc = std::make_unique(static_cast(num_sync_buffer_bytes), cu_dev); - sync_buffer_ptr = reinterpret_cast(m_sync_alloc->ptr()); + sync_buffer_ptr = static_cast(m_sync_alloc->ptr()); m_sync_count_alloc = std::make_unique(static_cast(num_sync_buffer_bytes), cu_dev); - sync_count_ptr = reinterpret_cast(m_sync_count_alloc->ptr()); + sync_count_ptr = static_cast(m_sync_count_alloc->ptr()); CUDA_CHECK(cudaMemset(sync_buffer_ptr, 0, num_sync_buffer_bytes)); CUDA_CHECK(cudaMemset(sync_count_ptr, 0, num_sync_buffer_bytes)); CUDA_CHECK(cudaDeviceSynchronize()); @@ -174,6 +174,13 @@ torch::Stream Buffer::get_comm_stream() const { } void Buffer::destroy() { + auto warn_cuda = [](cudaError_t status, const char *operation) noexcept { + if (status != cudaSuccess) { + std::cerr << "WARNING: destroy() failed to " << operation << ": " + << cudaGetErrorString(status) << '\n'; + } + }; + auto warn_nixl = [](nixl_status_t status, const char* operation) noexcept { if (status != NIXL_SUCCESS) { std::cerr << "WARNING: destroy() failed to " << operation diff --git a/examples/device/ep/csrc/vmm.cpp b/examples/device/ep/csrc/vmm.cpp index 0310fc4a56..d72db7bf27 100644 --- a/examples/device/ep/csrc/vmm.cpp +++ b/examples/device/ep/csrc/vmm.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,37 +15,51 @@ * limitations under the License. */ +#include #include #include "config.hpp" -#include "cuda_utils.hpp" #include "vmm.hpp" namespace { + constexpr const char *k_vmm_ctx = "vmm_region"; } +namespace nixl_ep { + +/** Log a non-fatal warning if a CUDA driver API call failed (e.g. during teardown). */ +void +vmm_region::warn_cu_api(CUresult status, const char *context, const char *operation) noexcept { + if (status != CUDA_SUCCESS) { + const char *msg = nullptr; + if (cuGetErrorString(status, &msg) != CUDA_SUCCESS || msg == nullptr) { + msg = "unknown CUDA driver error"; + } + std::cerr << "WARNING: " << context << " failed to " << operation << ": " << msg << '\n'; + } +} + void vmm_region::release() noexcept { if (is_cuda_malloc_) { if (ptr_) { - nixl_ep::warn_cuda_api(cudaFree(reinterpret_cast(ptr_)), k_vmm_ctx, "cudaFree"); + warn_cu_api(cuMemFree(ptr_), k_vmm_ctx, "cuMemFree"); } ptr_ = 0; return; } if (vmm_mapped_) { - nixl_ep::warn_cu_api(cuMemUnmap(ptr_, size_), k_vmm_ctx, "cuMemUnmap"); + warn_cu_api(cuMemUnmap(ptr_, size_), k_vmm_ctx, "cuMemUnmap"); vmm_mapped_ = false; } - if (vmm_addr_reserved_ && ptr_) { - nixl_ep::warn_cu_api(cuMemAddressFree(ptr_, size_), k_vmm_ctx, "cuMemAddressFree"); + if (ptr_) { + warn_cu_api(cuMemAddressFree(ptr_, size_), k_vmm_ctx, "cuMemAddressFree"); ptr_ = 0; - vmm_addr_reserved_ = false; } if (handle_) { - nixl_ep::warn_cu_api(cuMemRelease(handle_), k_vmm_ctx, "cuMemRelease"); + warn_cu_api(cuMemRelease(handle_), k_vmm_ctx, "cuMemRelease"); handle_ = 0; } } @@ -115,8 +129,8 @@ vmm_region::vmm_region(size_t size, CUdevice device) { if (!ctx.fabric_supported) { size_ = size; is_cuda_malloc_ = true; - if (cudaMalloc(reinterpret_cast(&ptr_), size) != cudaSuccess) { - throw std::runtime_error("cudaMalloc fallback failed (fabric not supported)"); + if (cuMemAlloc(&ptr_, size) != CUDA_SUCCESS) { + throw std::runtime_error("cuMemAlloc fallback failed (fabric not supported)"); } return; } @@ -133,7 +147,6 @@ vmm_region::vmm_region(size_t size, CUdevice device) { release(); throw std::runtime_error("Failed to reserve CUDA virtual address"); } - vmm_addr_reserved_ = true; if (cuMemMap(ptr_, size_, 0, handle_, 0) != CUDA_SUCCESS) { release(); @@ -149,3 +162,5 @@ vmm_region::vmm_region(size_t size, CUdevice device) { throw std::runtime_error("Failed to set CUDA memory access"); } } + +} // namespace nixl_ep diff --git a/examples/device/ep/csrc/vmm.hpp b/examples/device/ep/csrc/vmm.hpp index 8b3edf8a3b..7b71092f76 100644 --- a/examples/device/ep/csrc/vmm.hpp +++ b/examples/device/ep/csrc/vmm.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,8 +18,10 @@ #pragma once #include -#include #include +#include + +namespace nixl_ep { class vmm_region { public: @@ -34,22 +36,15 @@ class vmm_region { vmm_region & operator=(vmm_region &&) = delete; - [[nodiscard]] CUdeviceptr + [[nodiscard]] void * ptr() const noexcept { - return ptr_; - } - - [[nodiscard]] size_t - size() const noexcept { - return size_; - } - - [[nodiscard]] CUmemGenericAllocationHandle - handle() const noexcept { - return handle_; + return reinterpret_cast(static_cast(ptr_)); } private: + static void + warn_cu_api(CUresult status, const char *context, const char *operation) noexcept; + void release() noexcept; @@ -60,3 +55,5 @@ class vmm_region { bool vmm_addr_reserved_ = false; bool vmm_mapped_ = false; }; + +} // namespace nixl_ep From e82d42277a5888ceef663b583c5f427c5f62d163 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Wed, 25 Mar 2026 19:57:22 +0200 Subject: [PATCH 26/29] NIXL/EP: Fix comment. --- examples/device/ep/csrc/nixl_ep.cpp | 16 ++++++---------- examples/device/ep/csrc/vmm.cpp | 29 +++++++++++++++++------------ examples/device/ep/csrc/vmm.hpp | 2 +- 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp index 26d40a9351..0a0d4069ce 100644 --- a/examples/device/ep/csrc/nixl_ep.cpp +++ b/examples/device/ep/csrc/nixl_ep.cpp @@ -34,7 +34,6 @@ #include #include "nixl_ep.hpp" -#include "cuda_utils.hpp" #include "kernels/api.cuh" #include "kernels/configs.cuh" #include @@ -103,30 +102,27 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte auto per_channel_bytes = ceil_div(num_rdma_bytes, denom_sms); EP_HOST_ASSERT(per_channel_bytes < std::numeric_limits::max()); - const CUdevice cu_dev = static_cast(device_id); - // Create 32 MiB workspace - m_workspace_alloc = std::make_unique(NUM_WORKSPACE_BYTES, cu_dev); + m_workspace_alloc = std::make_unique(NUM_WORKSPACE_BYTES); workspace = m_workspace_alloc->ptr(); CUDA_CHECK(cudaMemsetAsync(workspace, 0, NUM_WORKSPACE_BYTES, comm_stream)); EP_HOST_ASSERT(max_experts_per_rank > 0); - m_rdma_alloc = std::make_unique(static_cast(num_rdma_bytes), cu_dev); + m_rdma_alloc = std::make_unique(static_cast(num_rdma_bytes)); rdma_buffer_ptr = reinterpret_cast(m_rdma_alloc->ptr()); CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes)); // Allocate and clean shrink buffer int num_mask_buffer_bytes = max_num_ranks * sizeof(int); - m_mask_alloc = std::make_unique(static_cast(num_mask_buffer_bytes), cu_dev); + m_mask_alloc = std::make_unique(static_cast(num_mask_buffer_bytes)); mask_buffer_ptr = static_cast(m_mask_alloc->ptr()); CUDA_CHECK(cudaMemset(mask_buffer_ptr, 0xff, num_mask_buffer_bytes)); CUDA_CHECK(cudaMemset(mask_buffer_ptr + rank, 0, sizeof(int))); int num_sync_buffer_bytes = max_num_ranks * sizeof(int); - m_sync_alloc = std::make_unique(static_cast(num_sync_buffer_bytes), cu_dev); + m_sync_alloc = std::make_unique(static_cast(num_sync_buffer_bytes)); sync_buffer_ptr = static_cast(m_sync_alloc->ptr()); - m_sync_count_alloc = - std::make_unique(static_cast(num_sync_buffer_bytes), cu_dev); + m_sync_count_alloc = std::make_unique(static_cast(num_sync_buffer_bytes)); sync_count_ptr = static_cast(m_sync_count_alloc->ptr()); CUDA_CHECK(cudaMemset(sync_buffer_ptr, 0, num_sync_buffer_bytes)); CUDA_CHECK(cudaMemset(sync_count_ptr, 0, num_sync_buffer_bytes)); @@ -193,7 +189,7 @@ void Buffer::destroy() { } // Synchronize - warn_cuda_api(cudaDeviceSynchronize(), "destroy()", "synchronize device"); + warn_cuda(cudaDeviceSynchronize(), "synchronize device"); _nixl_ep_destroy(); diff --git a/examples/device/ep/csrc/vmm.cpp b/examples/device/ep/csrc/vmm.cpp index d72db7bf27..dfb4160ce3 100644 --- a/examples/device/ep/csrc/vmm.cpp +++ b/examples/device/ep/csrc/vmm.cpp @@ -68,7 +68,7 @@ vmm_region::~vmm_region() { release(); } -vmm_region::vmm_region(size_t size, CUdevice device) { +vmm_region::vmm_region(size_t size) { if (size == 0) { throw std::invalid_argument("vmm_region: size must be non-zero"); } @@ -77,9 +77,16 @@ vmm_region::vmm_region(size_t size, CUdevice device) { bool fabric_supported; CUmemAllocationProp prop; size_t granularity; + CUdevice device; + CUmemAccessDesc access_desc = {}; - cuda_alloc_ctx(CUdevice dev) : fabric_supported(false), prop({}), granularity(0) { + cuda_alloc_ctx() : fabric_supported(false), prop({}), granularity(0) { int version; + + if (cuCtxGetDevice(&device) != CUDA_SUCCESS) { + throw std::runtime_error("CUDA device should be set before creating a vmm_region"); + } + if (cuDriverGetVersion(&version) != CUDA_SUCCESS) { throw std::runtime_error("Failed to get CUDA driver version"); } @@ -91,7 +98,7 @@ vmm_region::vmm_region(size_t size, CUdevice device) { int fab = 0; if ((cuDeviceGetAttribute(&fab, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, - dev) != CUDA_SUCCESS) || + device) != CUDA_SUCCESS) || (!fab)) { return; /* no fabric — fall back to cudaMalloc */ } @@ -99,7 +106,7 @@ vmm_region::vmm_region(size_t size, CUdevice device) { int rdma_vmm_supported = 0; if (cuDeviceGetAttribute(&rdma_vmm_supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, - dev) != CUDA_SUCCESS) { + device) != CUDA_SUCCESS) { throw std::runtime_error( "Failed to query GPUDirect RDMA with VMM support attribute"); } @@ -111,7 +118,7 @@ vmm_region::vmm_region(size_t size, CUdevice device) { prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = dev; + prop.location.id = device; prop.allocFlags.gpuDirectRDMACapable = 1; prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; @@ -120,11 +127,14 @@ vmm_region::vmm_region(size_t size, CUdevice device) { throw std::runtime_error("Failed to get CUDA allocation granularity"); } + access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + access_desc.location.id = device; + access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; fabric_supported = true; } }; - static cuda_alloc_ctx ctx(device); + static cuda_alloc_ctx ctx{}; if (!ctx.fabric_supported) { size_ = size; @@ -135,8 +145,6 @@ vmm_region::vmm_region(size_t size, CUdevice device) { return; } - CUmemAccessDesc access_desc = {}; - size_ = nixl_ep::align_up(size, ctx.granularity); if (cuMemCreate(&handle_, size_, &ctx.prop, 0) != CUDA_SUCCESS) { @@ -154,10 +162,7 @@ vmm_region::vmm_region(size_t size, CUdevice device) { } vmm_mapped_ = true; - access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - access_desc.location.id = device; - access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - if (cuMemSetAccess(ptr_, size_, &access_desc, 1) != CUDA_SUCCESS) { + if (cuMemSetAccess(ptr_, size_, &ctx.access_desc, 1) != CUDA_SUCCESS) { release(); throw std::runtime_error("Failed to set CUDA memory access"); } diff --git a/examples/device/ep/csrc/vmm.hpp b/examples/device/ep/csrc/vmm.hpp index 7b71092f76..be203da569 100644 --- a/examples/device/ep/csrc/vmm.hpp +++ b/examples/device/ep/csrc/vmm.hpp @@ -25,7 +25,7 @@ namespace nixl_ep { class vmm_region { public: - explicit vmm_region(size_t size, CUdevice device); + explicit vmm_region(size_t size); ~vmm_region(); From 5b30ea49b7ef3367901d42bdcc4382c60d6f6efa Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Wed, 25 Mar 2026 19:59:44 +0200 Subject: [PATCH 27/29] NIXL/EP: Fix comment. --- examples/device/ep/csrc/nixl_ep.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/device/ep/csrc/nixl_ep.cpp b/examples/device/ep/csrc/nixl_ep.cpp index 0a0d4069ce..e8f2055d49 100644 --- a/examples/device/ep/csrc/nixl_ep.cpp +++ b/examples/device/ep/csrc/nixl_ep.cpp @@ -109,7 +109,7 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte EP_HOST_ASSERT(max_experts_per_rank > 0); m_rdma_alloc = std::make_unique(static_cast(num_rdma_bytes)); - rdma_buffer_ptr = reinterpret_cast(m_rdma_alloc->ptr()); + rdma_buffer_ptr = m_rdma_alloc->ptr(); CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes)); // Allocate and clean shrink buffer From 32cd73928bdd1f6c2a2dc28f2e2bfc8a5cfb9c03 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Mon, 30 Mar 2026 19:51:31 +0300 Subject: [PATCH 28/29] NIXL/EP: Fix comment. --- examples/device/ep/csrc/vmm.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/device/ep/csrc/vmm.hpp b/examples/device/ep/csrc/vmm.hpp index be203da569..1efbfeb528 100644 --- a/examples/device/ep/csrc/vmm.hpp +++ b/examples/device/ep/csrc/vmm.hpp @@ -52,7 +52,6 @@ class vmm_region { size_t size_ = 0; CUmemGenericAllocationHandle handle_ = 0; bool is_cuda_malloc_ = false; - bool vmm_addr_reserved_ = false; bool vmm_mapped_ = false; }; From b1cab66f355462495e2208beb852ede2f7e70531 Mon Sep 17 00:00:00 2001 From: Ofir Farjon Date: Mon, 30 Mar 2026 20:22:01 +0300 Subject: [PATCH 29/29] NIXL/EP: Fix comment. --- examples/device/ep/csrc/vmm.cpp | 19 +++++++++++-------- examples/device/ep/csrc/vmm.hpp | 3 --- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/device/ep/csrc/vmm.cpp b/examples/device/ep/csrc/vmm.cpp index dfb4160ce3..93c7f28b64 100644 --- a/examples/device/ep/csrc/vmm.cpp +++ b/examples/device/ep/csrc/vmm.cpp @@ -24,13 +24,10 @@ namespace { constexpr const char *k_vmm_ctx = "vmm_region"; -} - -namespace nixl_ep { /** Log a non-fatal warning if a CUDA driver API call failed (e.g. during teardown). */ void -vmm_region::warn_cu_api(CUresult status, const char *context, const char *operation) noexcept { +warn_cu_api(CUresult status, const char *context, const char *operation) noexcept { if (status != CUDA_SUCCESS) { const char *msg = nullptr; if (cuGetErrorString(status, &msg) != CUDA_SUCCESS || msg == nullptr) { @@ -40,6 +37,10 @@ vmm_region::warn_cu_api(CUresult status, const char *context, const char *operat } } +} // namespace + +namespace nixl_ep { + void vmm_region::release() noexcept { if (is_cuda_malloc_) { @@ -112,8 +113,10 @@ vmm_region::vmm_region(size_t size) { } if (!rdma_vmm_supported) { - throw std::runtime_error( - "GPUDirect RDMA with CUDA VMM is not supported on this device"); + std::cerr << "DIAG: " << k_vmm_ctx + << " - GPUDirect RDMA with CUDA VMM not supported; falling back to " + "cuMemAlloc\n"; + return; } prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; @@ -134,13 +137,13 @@ vmm_region::vmm_region(size_t size) { } }; - static cuda_alloc_ctx ctx{}; + static cuda_alloc_ctx ctx; if (!ctx.fabric_supported) { size_ = size; is_cuda_malloc_ = true; if (cuMemAlloc(&ptr_, size) != CUDA_SUCCESS) { - throw std::runtime_error("cuMemAlloc fallback failed (fabric not supported)"); + throw std::runtime_error("cuMemAlloc fallback failed"); } return; } diff --git a/examples/device/ep/csrc/vmm.hpp b/examples/device/ep/csrc/vmm.hpp index 1efbfeb528..9b007b1243 100644 --- a/examples/device/ep/csrc/vmm.hpp +++ b/examples/device/ep/csrc/vmm.hpp @@ -42,9 +42,6 @@ class vmm_region { } private: - static void - warn_cu_api(CUresult status, const char *context, const char *operation) noexcept; - void release() noexcept;