Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
b6cb209
NIXL/EP: Use vmm API instead of cudaMalloc
ofirfarjun7 Mar 5, 2026
c36a4c2
NIXL/EP: Use vmm API instead of cudaMalloc
Mar 7, 2026
29c2c7a
NIXL/EP: revert
Mar 7, 2026
daecca9
NIXL/EP: Support gdr copy with vmm.
Mar 8, 2026
3f5f55f
NIXL/EP: Improve.
ofirfarjun7 Mar 8, 2026
07674ee
NIXL/EP: Format.
ofirfarjun7 Mar 8, 2026
0771375
NIXL/EP: Merge branch 'main' into topic/nixl-ep-use-vmm-api
ofirfarjun7 Mar 8, 2026
bbb8d27
NIXL/EP: check return val.
ofirfarjun7 Mar 8, 2026
7b60106
NIXL/EP: Format.
ofirfarjun7 Mar 8, 2026
25b2e5b
NIXL/EP: Improve.
ofirfarjun7 Mar 9, 2026
02e379b
NIXL/EP: Format.
ofirfarjun7 Mar 9, 2026
2c701be
NIXL/EP: Improve.
ofirfarjun7 Mar 9, 2026
329fb22
NIXL/EP: Format.
ofirfarjun7 Mar 9, 2026
ee2fd64
NIXL/EP: fallback to cudaMalloc if fabric not supported
ofirfarjun7 Mar 11, 2026
7677e59
NIXL/EP: set default vals
ofirfarjun7 Mar 11, 2026
8b4b567
NIXL/EP: Fix.
ofirfarjun7 Mar 11, 2026
cbe8d1d
NIXL/EP: Fix comments.
ofirfarjun7 Mar 17, 2026
088c960
NIXL/EP: Fix.
ofirfarjun7 Mar 17, 2026
15f9174
NIXL/EP: Fix.
ofirfarjun7 Mar 17, 2026
a44a094
NIXL/EP: not needed.
ofirfarjun7 Mar 17, 2026
f120473
NIXL/EP: new files terms.
ofirfarjun7 Mar 17, 2026
9ef9dca
NIXL/EP: Fix comments.
ofirfarjun7 Mar 24, 2026
eab7509
NIXL/EP: Merge main
ofirfarjun7 Mar 24, 2026
22c59f7
NIXL/EP: Fix comment.
ofirfarjun7 Mar 24, 2026
0a58f96
NIXL/EP: Fix.
ofirfarjun7 Mar 24, 2026
d2bdfea
NIXL/EP: fix.
ofirfarjun7 Mar 24, 2026
0405899
NIXL/EP: Fix comment.
ofirfarjun7 Mar 25, 2026
e82d422
NIXL/EP: Fix comment.
ofirfarjun7 Mar 25, 2026
5b30ea4
NIXL/EP: Fix comment.
ofirfarjun7 Mar 25, 2026
437b97a
NIXL/EP: Merge branch 'main'
ofirfarjun7 Mar 30, 2026
32cd739
NIXL/EP: Fix comment.
ofirfarjun7 Mar 30, 2026
b1cab66
NIXL/EP: Fix comment.
ofirfarjun7 Mar 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion examples/device/ep/csrc/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,11 @@ struct EPLayout {
}
};

size_t get_rdma_size_hint(int num_max_dispatch_tokens_per_rank, int hidden, int num_ranks, int num_experts) {
/**
 * Size hint (in bytes) for the RDMA buffer backing EP dispatch/combine.
 *
 * Computes the total byte footprint of an EPLayout for the given shape
 * (no buffer is allocated: the layout is built over a nullptr base purely
 * to measure `total_bytes`), then rounds up to a multiple of
 * NUM_BUFFER_ALIGNMENT_BYTES.
 *
 * NOTE(review): `(n + A) / A * A` always adds at least one extra alignment
 * block — even when `n` is already a multiple of A. If strict round-up was
 * intended, `(n + A - 1) / A * A` is the usual form; if the extra block is
 * deliberate headroom, confirm and keep as-is.
 */
inline size_t
get_rdma_size_hint(int num_max_dispatch_tokens_per_rank,
                   int hidden,
                   int num_ranks,
                   int num_experts) {
    auto num_bytes = EPLayout(nullptr, num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts).total_bytes;
    return ((num_bytes + NUM_BUFFER_ALIGNMENT_BYTES) / NUM_BUFFER_ALIGNMENT_BYTES) * NUM_BUFFER_ALIGNMENT_BYTES;
}
Expand Down
40 changes: 23 additions & 17 deletions examples/device/ep/csrc/nixl_ep.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,25 +103,28 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte
EP_HOST_ASSERT(per_channel_bytes < std::numeric_limits<int>::max());

// Create 32 MiB workspace
CUDA_CHECK(cudaMalloc(&workspace, NUM_WORKSPACE_BYTES));
m_workspace_alloc = std::make_unique<vmm_region>(NUM_WORKSPACE_BYTES);
workspace = m_workspace_alloc->ptr();
CUDA_CHECK(cudaMemsetAsync(workspace, 0, NUM_WORKSPACE_BYTES, comm_stream));

EP_HOST_ASSERT(max_experts_per_rank > 0);
CUDA_CHECK(cudaMalloc(&rdma_buffer_ptr, num_rdma_bytes));
m_rdma_alloc = std::make_unique<vmm_region>(static_cast<size_t>(num_rdma_bytes));
rdma_buffer_ptr = m_rdma_alloc->ptr();
CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes));

// Allocate and clean shrink buffer
mask_buffer_ptr = nullptr;
sync_buffer_ptr = nullptr;
int num_mask_buffer_bytes = max_num_ranks * sizeof(int);
CUDA_CHECK(cudaMalloc(&mask_buffer_ptr, num_mask_buffer_bytes));
m_mask_alloc = std::make_unique<vmm_region>(static_cast<size_t>(num_mask_buffer_bytes));
mask_buffer_ptr = static_cast<int *>(m_mask_alloc->ptr());
CUDA_CHECK(cudaMemset(mask_buffer_ptr, 0xff, num_mask_buffer_bytes));
CUDA_CHECK(cudaMemset(mask_buffer_ptr + rank, 0, sizeof(int)));

int num_sync_buffer_bytes = max_num_ranks * sizeof(int);
CUDA_CHECK(cudaMalloc(&sync_buffer_ptr, num_sync_buffer_bytes));
m_sync_alloc = std::make_unique<vmm_region>(static_cast<size_t>(num_sync_buffer_bytes));
sync_buffer_ptr = static_cast<int *>(m_sync_alloc->ptr());
m_sync_count_alloc = std::make_unique<vmm_region>(static_cast<size_t>(num_sync_buffer_bytes));
sync_count_ptr = static_cast<int *>(m_sync_count_alloc->ptr());
CUDA_CHECK(cudaMemset(sync_buffer_ptr, 0, num_sync_buffer_bytes));
CUDA_CHECK(cudaMalloc(&sync_count_ptr, num_sync_buffer_bytes));
CUDA_CHECK(cudaMemset(sync_count_ptr, 0, num_sync_buffer_bytes));
CUDA_CHECK(cudaDeviceSynchronize());

Expand Down Expand Up @@ -167,10 +170,10 @@ torch::Stream Buffer::get_comm_stream() const {
}

void Buffer::destroy() {
auto warn_cuda = [](cudaError_t status, const char* operation) noexcept {
auto warn_cuda = [](cudaError_t status, const char *operation) noexcept {
if (status != cudaSuccess) {
std::cerr << "WARNING: destroy() failed to " << operation
<< ": " << cudaGetErrorString(status) << '\n';
std::cerr << "WARNING: destroy() failed to " << operation << ": "
<< cudaGetErrorString(status) << '\n';
}
};

Expand All @@ -195,7 +198,6 @@ void Buffer::destroy() {
warn_nixl(nixl_agent_info->agent->invalidateLocalMD(),
"invalidate local metadata");
}

warn_nixl(nixl_agent_info->agent->deregisterMem(
nixl_agent_info->rdma_reg_descs,
&nixl_agent_info->extra_params),
Expand All @@ -212,13 +214,17 @@ void Buffer::destroy() {
nixl_agent_info.reset();
}

warn_cuda(cudaFree(rdma_buffer_ptr), "free RDMA buffer");
warn_cuda(cudaFree(mask_buffer_ptr), "free mask buffer");
warn_cuda(cudaFree(sync_buffer_ptr), "free sync buffer");
warn_cuda(cudaFree(sync_count_ptr), "free sync-count buffer");
m_rdma_alloc.reset();
rdma_buffer_ptr = nullptr;
m_mask_alloc.reset();
mask_buffer_ptr = nullptr;
m_sync_alloc.reset();
sync_buffer_ptr = nullptr;
m_sync_count_alloc.reset();
sync_count_ptr = nullptr;

// Free workspace
warn_cuda(cudaFree(workspace), "free workspace");
m_workspace_alloc.reset();
workspace = nullptr;

destroyed = true;
available = false;
Expand Down
8 changes: 8 additions & 0 deletions examples/device/ep/csrc/nixl_ep.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include "event.hpp"
#include "kernels/configs.cuh"
#include "kernels/exception.cuh"
#include "vmm.hpp"

#include "nixl.h"

Expand Down Expand Up @@ -86,6 +87,13 @@ struct Buffer {
int *sync_buffer_ptr = nullptr;
int *sync_count_ptr = nullptr;

/* Owning VMM allocations (keep raw ptrs above as aliases) */
std::unique_ptr<vmm_region> m_rdma_alloc;
std::unique_ptr<vmm_region> m_mask_alloc;
std::unique_ptr<vmm_region> m_sync_alloc;
std::unique_ptr<vmm_region> m_sync_count_alloc;
std::unique_ptr<vmm_region> m_workspace_alloc;

// Device info and communication
int device_id;
int num_device_sms;
Expand Down
174 changes: 174 additions & 0 deletions examples/device/ep/csrc/vmm.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <iostream>
#include <stdexcept>

#include "config.hpp"
#include "vmm.hpp"

namespace {

constexpr const char *k_vmm_ctx = "vmm_region";

/**
 * Emit a non-fatal stderr warning when a CUDA driver API call fails.
 * Used on teardown paths where throwing is not an option (noexcept contexts).
 */
void
warn_cu_api(CUresult status, const char *context, const char *operation) noexcept {
    if (status == CUDA_SUCCESS) {
        return;
    }
    const char *description = nullptr;
    const bool have_text =
        (cuGetErrorString(status, &description) == CUDA_SUCCESS) && (description != nullptr);
    if (!have_text) {
        description = "unknown CUDA driver error";
    }
    std::cerr << "WARNING: " << context << " failed to " << operation << ": " << description
              << '\n';
}

} // namespace

namespace nixl_ep {

/**
 * Free all driver resources owned by this region. Never throws; failures
 * are logged via warn_cu_api so teardown can continue.
 *
 * Safe to call on a partially constructed region: each step is guarded by
 * its own state flag/handle, so only acquired resources are released.
 */
void
vmm_region::release() noexcept {
    // Fallback allocations (cuMemAlloc) have no mapping/handle to undo.
    if (is_cuda_malloc_) {
        if (ptr_) {
            warn_cu_api(cuMemFree(ptr_), k_vmm_ctx, "cuMemFree");
        }
        ptr_ = 0;
        return;
    }

    // VMM path: teardown must mirror construction in reverse order —
    // unmap first, then free the VA reservation, then release the
    // physical allocation handle.
    if (vmm_mapped_) {
        warn_cu_api(cuMemUnmap(ptr_, size_), k_vmm_ctx, "cuMemUnmap");
        vmm_mapped_ = false;
    }
    if (ptr_) {
        warn_cu_api(cuMemAddressFree(ptr_, size_), k_vmm_ctx, "cuMemAddressFree");
        ptr_ = 0;
    }
    if (handle_) {
        warn_cu_api(cuMemRelease(handle_), k_vmm_ctx, "cuMemRelease");
        handle_ = 0;
    }
}

/** Releases all owned driver resources (noexcept via release()). */
vmm_region::~vmm_region() {
    release();
}

/**
 * Allocate a device memory region of at least @p size bytes.
 *
 * Preferred path: CUDA VMM (cuMemCreate + cuMemAddressReserve + cuMemMap +
 * cuMemSetAccess) with fabric-exportable, GPUDirect-RDMA-capable physical
 * memory. If the driver is too old, the device lacks fabric-handle support,
 * or GPUDirect RDMA with VMM is unsupported, falls back to a plain
 * cuMemAlloc allocation (tracked via is_cuda_malloc_).
 *
 * Throws std::invalid_argument for size == 0 and std::runtime_error on
 * unrecoverable driver failures; resources acquired before a failure are
 * released before throwing.
 */
vmm_region::vmm_region(size_t size) {
    if (size == 0) {
        throw std::invalid_argument("vmm_region: size must be non-zero");
    }

    // One-time (per process) device capability probe; see the `static`
    // instantiation below.
    // NOTE(review): `device`, `granularity` and `prop` are captured from
    // whichever device is current at the FIRST construction. If one process
    // ever allocates on multiple devices, this cache is stale for the
    // others — confirm the one-device-per-process assumption.
    struct cuda_alloc_ctx {
        bool fabric_supported;             // true => take the VMM/fabric path
        CUmemAllocationProp prop;          // allocation properties for cuMemCreate
        size_t granularity;                // minimum granularity for `prop`
        CUdevice device;                   // device current at first construction
        CUmemAccessDesc access_desc = {};  // read/write access for `device`

        cuda_alloc_ctx() : fabric_supported(false), prop({}), granularity(0) {
            int version;

            if (cuCtxGetDevice(&device) != CUDA_SUCCESS) {
                throw std::runtime_error("CUDA device should be set before creating a vmm_region");
            }

            if (cuDriverGetVersion(&version) != CUDA_SUCCESS) {
                throw std::runtime_error("Failed to get CUDA driver version");
            }

            // The VMM API (cuMemCreate etc.) requires CUDA 11.0+.
            if (version < 11000) {
                return; /* too old — fall back to cuMemAlloc */
            }

            // A failed attribute query (e.g. driver predates the fabric
            // attribute) is treated the same as "unsupported": fall back.
            int fab = 0;
            if ((cuDeviceGetAttribute(&fab,
                                      CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED,
                                      device) != CUDA_SUCCESS) ||
                (!fab)) {
                return; /* no fabric — fall back to cuMemAlloc */
            }

            int rdma_vmm_supported = 0;
            if (cuDeviceGetAttribute(&rdma_vmm_supported,
                                     CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
                                     device) != CUDA_SUCCESS) {
                throw std::runtime_error(
                    "Failed to query GPUDirect RDMA with VMM support attribute");
            }

            if (!rdma_vmm_supported) {
                std::cerr << "DIAG: " << k_vmm_ctx
                          << " - GPUDirect RDMA with CUDA VMM not supported; falling back to "
                             "cuMemAlloc\n";
                return;
            }

            // Pinned device memory, RDMA-capable, exportable as a fabric
            // handle so peers can import the allocation.
            prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
            prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
            prop.location.id = device;
            prop.allocFlags.gpuDirectRDMACapable = 1;
            prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;

            if (cuMemGetAllocationGranularity(
                    &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) {
                throw std::runtime_error("Failed to get CUDA allocation granularity");
            }

            access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
            access_desc.location.id = device;
            access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
            fabric_supported = true;
        }
    };

    // Probe once per process; subsequent constructions reuse the result.
    static cuda_alloc_ctx ctx;

    if (!ctx.fabric_supported) {
        // Fallback: plain driver allocation, no VA reservation or mapping.
        size_ = size;
        is_cuda_malloc_ = true;
        if (cuMemAlloc(&ptr_, size) != CUDA_SUCCESS) {
            throw std::runtime_error("cuMemAlloc fallback failed");
        }
        return;
    }

    // VMM path: size must be a multiple of the device's minimum granularity.
    size_ = nixl_ep::align_up<size_t>(size, ctx.granularity);

    if (cuMemCreate(&handle_, size_, &ctx.prop, 0) != CUDA_SUCCESS) {
        throw std::runtime_error("Failed to create CUDA VMM allocation");
    }

    // From here on, release() cleans up whatever has been acquired so far
    // before we rethrow.
    if (cuMemAddressReserve(&ptr_, size_, 0, 0, 0) != CUDA_SUCCESS) {
        release();
        throw std::runtime_error("Failed to reserve CUDA virtual address");
    }

    if (cuMemMap(ptr_, size_, 0, handle_, 0) != CUDA_SUCCESS) {
        release();
        throw std::runtime_error("Failed to map CUDA VMM memory");
    }
    vmm_mapped_ = true;

    if (cuMemSetAccess(ptr_, size_, &ctx.access_desc, 1) != CUDA_SUCCESS) {
        release();
        throw std::runtime_error("Failed to set CUDA memory access");
    }
}

} // namespace nixl_ep
55 changes: 55 additions & 0 deletions examples/device/ep/csrc/vmm.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cuda.h>
#include <cstddef>
#include <cstdint>

namespace nixl_ep {

/**
 * RAII owner of a device memory allocation.
 *
 * Implemented in vmm.cpp: prefers the CUDA VMM API (cuMemCreate /
 * cuMemAddressReserve / cuMemMap) and falls back to cuMemAlloc when VMM
 * with fabric handles is unavailable. Non-copyable and non-movable: the
 * raw pointer handed out by ptr() must stay stable for the region's
 * lifetime.
 */
class vmm_region {
public:
    // Allocates at least `size` bytes on the current device; throws on failure.
    explicit vmm_region(size_t size);

    // Releases mapping, virtual address range, and physical handle (noexcept path).
    ~vmm_region();

    vmm_region(const vmm_region &) = delete;
    vmm_region &
    operator=(const vmm_region &) = delete;
    vmm_region(vmm_region &&) = delete;
    vmm_region &
    operator=(vmm_region &&) = delete;

    // Device pointer to the start of the region (valid until destruction).
    [[nodiscard]] void *
    ptr() const noexcept {
        return reinterpret_cast<void *>(static_cast<std::uintptr_t>(ptr_));
    }

private:
    // Frees whatever subset of resources was acquired; used by the
    // destructor and by constructor error paths.
    void
    release() noexcept;

    CUdeviceptr ptr_ = 0;                      // mapped device address (0 = none)
    size_t size_ = 0;                          // size actually reserved/allocated
    CUmemGenericAllocationHandle handle_ = 0;  // physical allocation (VMM path only)
    bool is_cuda_malloc_ = false;              // true = cuMemAlloc fallback path
    bool vmm_mapped_ = false;                  // true once cuMemMap succeeded
};

} // namespace nixl_ep
1 change: 1 addition & 0 deletions examples/device/ep/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ endif

nixl_ep_sources = [
'csrc/nixl_ep.cpp',
'csrc/vmm.cpp',
'csrc/kernels/nixl_ep.cu',
]

Expand Down
Loading