Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
b6cb209
NIXL/EP: Use vmm API instead of cudaMalloc
ofirfarjun7 Mar 5, 2026
c36a4c2
NIXL/EP: Use vmm API instead of cudaMalloc
Mar 7, 2026
29c2c7a
NIXL/EP: revert
Mar 7, 2026
daecca9
NIXL/EP: Support gdr copy with vmm.
Mar 8, 2026
3f5f55f
NIXL/EP: Improve.
ofirfarjun7 Mar 8, 2026
07674ee
NIXL/EP: Format.
ofirfarjun7 Mar 8, 2026
0771375
NIXL/EP: Merge branch 'main' into topic/nixl-ep-use-vmm-api
ofirfarjun7 Mar 8, 2026
bbb8d27
NIXL/EP: check return val.
ofirfarjun7 Mar 8, 2026
7b60106
NIXL/EP: Format.
ofirfarjun7 Mar 8, 2026
25b2e5b
NIXL/EP: Improve.
ofirfarjun7 Mar 9, 2026
02e379b
NIXL/EP: Format.
ofirfarjun7 Mar 9, 2026
2c701be
NIXL/EP: Improve.
ofirfarjun7 Mar 9, 2026
329fb22
NIXL/EP: Format.
ofirfarjun7 Mar 9, 2026
ee2fd64
NIXL/EP: fallback to cudaMalloc if fabric not supported
ofirfarjun7 Mar 11, 2026
7677e59
NIXL/EP: set default vals
ofirfarjun7 Mar 11, 2026
8b4b567
NIXL/EP: Fix.
ofirfarjun7 Mar 11, 2026
cbe8d1d
NIXL/EP: Fix comments.
ofirfarjun7 Mar 17, 2026
088c960
NIXL/EP: Fix.
ofirfarjun7 Mar 17, 2026
15f9174
NIXL/EP: Fix.
ofirfarjun7 Mar 17, 2026
a44a094
NIXL/EP: not needed.
ofirfarjun7 Mar 17, 2026
f120473
NIXL/EP: new files terms.
ofirfarjun7 Mar 17, 2026
9ef9dca
NIXL/EP: Fix comments.
ofirfarjun7 Mar 24, 2026
eab7509
NIXL/EP: Merge main
ofirfarjun7 Mar 24, 2026
22c59f7
NIXL/EP: Fix comment.
ofirfarjun7 Mar 24, 2026
0a58f96
NIXL/EP: Fix.
ofirfarjun7 Mar 24, 2026
d2bdfea
NIXL/EP: fix.
ofirfarjun7 Mar 24, 2026
0405899
NIXL/EP: Fix comment.
ofirfarjun7 Mar 25, 2026
e82d422
NIXL/EP: Fix comment.
ofirfarjun7 Mar 25, 2026
5b30ea4
NIXL/EP: Fix comment.
ofirfarjun7 Mar 25, 2026
437b97a
NIXL/EP: Merge branch 'main'
ofirfarjun7 Mar 30, 2026
32cd739
NIXL/EP: Fix comment.
ofirfarjun7 Mar 30, 2026
b1cab66
NIXL/EP: Fix comment.
ofirfarjun7 Mar 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion examples/device/ep/csrc/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,11 @@ struct EPLayout {
}
};

size_t get_rdma_size_hint(int num_max_dispatch_tokens_per_rank, int hidden, int num_ranks, int num_experts) {
/**
 * Size hint (in bytes) for the RDMA buffer backing EP dispatch/combine.
 *
 * Computes the total byte footprint of an EPLayout for the given shape
 * (no buffer is allocated: the layout is built over a nullptr base purely
 * to measure `total_bytes`), then rounds up to a multiple of
 * NUM_BUFFER_ALIGNMENT_BYTES.
 *
 * NOTE(review): `(n + A) / A * A` always adds at least one extra alignment
 * block — even when `n` is already a multiple of A. If strict round-up was
 * intended, `(n + A - 1) / A * A` is the usual form; if the extra block is
 * deliberate headroom, confirm and keep as-is.
 */
inline size_t
get_rdma_size_hint(int num_max_dispatch_tokens_per_rank,
                   int hidden,
                   int num_ranks,
                   int num_experts) {
    auto num_bytes = EPLayout(nullptr, num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts).total_bytes;
    return ((num_bytes + NUM_BUFFER_ALIGNMENT_BYTES) / NUM_BUFFER_ALIGNMENT_BYTES) * NUM_BUFFER_ALIGNMENT_BYTES;
}
Expand Down
40 changes: 23 additions & 17 deletions examples/device/ep/csrc/nixl_ep.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,25 +103,28 @@ void Buffer::init(int num_ranks, int num_experts_per_rank, int64_t num_rdma_byte
EP_HOST_ASSERT(per_channel_bytes < std::numeric_limits<int>::max());

// Create 32 MiB workspace
CUDA_CHECK(cudaMalloc(&workspace, NUM_WORKSPACE_BYTES));
m_workspace_alloc = std::make_unique<vmm_region>(NUM_WORKSPACE_BYTES);
workspace = m_workspace_alloc->ptr();
CUDA_CHECK(cudaMemsetAsync(workspace, 0, NUM_WORKSPACE_BYTES, comm_stream));

EP_HOST_ASSERT(max_experts_per_rank > 0);
CUDA_CHECK(cudaMalloc(&rdma_buffer_ptr, num_rdma_bytes));
m_rdma_alloc = std::make_unique<vmm_region>(static_cast<size_t>(num_rdma_bytes));
rdma_buffer_ptr = m_rdma_alloc->ptr();
CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes));

// Allocate and clean shrink buffer
mask_buffer_ptr = nullptr;
sync_buffer_ptr = nullptr;
int num_mask_buffer_bytes = max_num_ranks * sizeof(int);
CUDA_CHECK(cudaMalloc(&mask_buffer_ptr, num_mask_buffer_bytes));
m_mask_alloc = std::make_unique<vmm_region>(static_cast<size_t>(num_mask_buffer_bytes));
mask_buffer_ptr = static_cast<int *>(m_mask_alloc->ptr());
CUDA_CHECK(cudaMemset(mask_buffer_ptr, 0xff, num_mask_buffer_bytes));
CUDA_CHECK(cudaMemset(mask_buffer_ptr + rank, 0, sizeof(int)));

int num_sync_buffer_bytes = max_num_ranks * sizeof(int);
CUDA_CHECK(cudaMalloc(&sync_buffer_ptr, num_sync_buffer_bytes));
m_sync_alloc = std::make_unique<vmm_region>(static_cast<size_t>(num_sync_buffer_bytes));
sync_buffer_ptr = static_cast<int *>(m_sync_alloc->ptr());
m_sync_count_alloc = std::make_unique<vmm_region>(static_cast<size_t>(num_sync_buffer_bytes));
sync_count_ptr = static_cast<int *>(m_sync_count_alloc->ptr());
CUDA_CHECK(cudaMemset(sync_buffer_ptr, 0, num_sync_buffer_bytes));
CUDA_CHECK(cudaMalloc(&sync_count_ptr, num_sync_buffer_bytes));
CUDA_CHECK(cudaMemset(sync_count_ptr, 0, num_sync_buffer_bytes));
CUDA_CHECK(cudaDeviceSynchronize());

Expand Down Expand Up @@ -167,10 +170,10 @@ torch::Stream Buffer::get_comm_stream() const {
}

void Buffer::destroy() {
auto warn_cuda = [](cudaError_t status, const char* operation) noexcept {
auto warn_cuda = [](cudaError_t status, const char *operation) noexcept {
if (status != cudaSuccess) {
std::cerr << "WARNING: destroy() failed to " << operation
<< ": " << cudaGetErrorString(status) << '\n';
std::cerr << "WARNING: destroy() failed to " << operation << ": "
<< cudaGetErrorString(status) << '\n';
}
};

Expand All @@ -195,7 +198,6 @@ void Buffer::destroy() {
warn_nixl(nixl_agent_info->agent->invalidateLocalMD(),
"invalidate local metadata");
}

warn_nixl(nixl_agent_info->agent->deregisterMem(
nixl_agent_info->rdma_reg_descs,
&nixl_agent_info->extra_params),
Expand All @@ -212,13 +214,17 @@ void Buffer::destroy() {
nixl_agent_info.reset();
}

warn_cuda(cudaFree(rdma_buffer_ptr), "free RDMA buffer");
warn_cuda(cudaFree(mask_buffer_ptr), "free mask buffer");
warn_cuda(cudaFree(sync_buffer_ptr), "free sync buffer");
warn_cuda(cudaFree(sync_count_ptr), "free sync-count buffer");
m_rdma_alloc.reset();
rdma_buffer_ptr = nullptr;
m_mask_alloc.reset();
mask_buffer_ptr = nullptr;
m_sync_alloc.reset();
sync_buffer_ptr = nullptr;
m_sync_count_alloc.reset();
sync_count_ptr = nullptr;

// Free workspace
warn_cuda(cudaFree(workspace), "free workspace");
m_workspace_alloc.reset();
workspace = nullptr;

destroyed = true;
available = false;
Expand Down
8 changes: 8 additions & 0 deletions examples/device/ep/csrc/nixl_ep.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include "event.hpp"
#include "kernels/configs.cuh"
#include "kernels/exception.cuh"
#include "vmm.hpp"

#include "nixl.h"

Expand Down Expand Up @@ -86,6 +87,13 @@ struct Buffer {
int *sync_buffer_ptr = nullptr;
int *sync_count_ptr = nullptr;

/* Owning VMM allocations (keep raw ptrs above as aliases) */
std::unique_ptr<vmm_region> m_rdma_alloc;
std::unique_ptr<vmm_region> m_mask_alloc;
std::unique_ptr<vmm_region> m_sync_alloc;
std::unique_ptr<vmm_region> m_sync_count_alloc;
std::unique_ptr<vmm_region> m_workspace_alloc;

// Device info and communication
int device_id;
int num_device_sms;
Expand Down
174 changes: 174 additions & 0 deletions examples/device/ep/csrc/vmm.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <iostream>
#include <stdexcept>

#include "config.hpp"
#include "vmm.hpp"

namespace {

constexpr const char *k_vmm_ctx = "vmm_region";

/**
 * Emit a non-fatal stderr warning when a CUDA driver API call fails.
 * Used on teardown paths where throwing is not an option (noexcept contexts).
 */
void
warn_cu_api(CUresult status, const char *context, const char *operation) noexcept {
    if (status == CUDA_SUCCESS) {
        return;
    }
    const char *description = nullptr;
    const bool have_text =
        (cuGetErrorString(status, &description) == CUDA_SUCCESS) && (description != nullptr);
    if (!have_text) {
        description = "unknown CUDA driver error";
    }
    std::cerr << "WARNING: " << context << " failed to " << operation << ": " << description
              << '\n';
}

} // namespace

namespace nixl_ep {

/**
 * Free all driver resources owned by this region. Never throws; failures
 * are logged via warn_cu_api so teardown can continue.
 *
 * Safe to call on a partially constructed region: each step is guarded by
 * its own state flag/handle, so only acquired resources are released.
 */
void
vmm_region::release() noexcept {
    // Fallback allocations (cuMemAlloc) have no mapping/handle to undo.
    if (is_cuda_malloc_) {
        if (ptr_) {
            warn_cu_api(cuMemFree(ptr_), k_vmm_ctx, "cuMemFree");
        }
        ptr_ = 0;
        return;
    }

    // VMM path: teardown must mirror construction in reverse order —
    // unmap first, then free the VA reservation, then release the
    // physical allocation handle.
    if (vmm_mapped_) {
        warn_cu_api(cuMemUnmap(ptr_, size_), k_vmm_ctx, "cuMemUnmap");
        vmm_mapped_ = false;
    }
    if (ptr_) {
        warn_cu_api(cuMemAddressFree(ptr_, size_), k_vmm_ctx, "cuMemAddressFree");
        ptr_ = 0;
    }
    if (handle_) {
        warn_cu_api(cuMemRelease(handle_), k_vmm_ctx, "cuMemRelease");
        handle_ = 0;
    }
}

/** Releases all owned driver resources (noexcept via release()). */
vmm_region::~vmm_region() {
    release();
}

/**
 * Allocate a device memory region of at least @p size bytes.
 *
 * Preferred path: CUDA VMM (cuMemCreate + cuMemAddressReserve + cuMemMap +
 * cuMemSetAccess) with fabric-exportable, GPUDirect-RDMA-capable physical
 * memory. If the driver is too old, the device lacks fabric-handle support,
 * or GPUDirect RDMA with VMM is unsupported, falls back to a plain
 * cuMemAlloc allocation (tracked via is_cuda_malloc_).
 *
 * Throws std::invalid_argument for size == 0 and std::runtime_error on
 * unrecoverable driver failures; resources acquired before a failure are
 * released before throwing.
 */
vmm_region::vmm_region(size_t size) {
    if (size == 0) {
        throw std::invalid_argument("vmm_region: size must be non-zero");
    }

    // One-time (per process) device capability probe; see the `static`
    // instantiation below.
    // NOTE(review): `device`, `granularity` and `prop` are captured from
    // whichever device is current at the FIRST construction. If one process
    // ever allocates on multiple devices, this cache is stale for the
    // others — confirm the one-device-per-process assumption.
    struct cuda_alloc_ctx {
        bool fabric_supported;             // true => take the VMM/fabric path
        CUmemAllocationProp prop;          // allocation properties for cuMemCreate
        size_t granularity;                // minimum granularity for `prop`
        CUdevice device;                   // device current at first construction
        CUmemAccessDesc access_desc = {};  // read/write access for `device`

        cuda_alloc_ctx() : fabric_supported(false), prop({}), granularity(0) {
            int version;

            if (cuCtxGetDevice(&device) != CUDA_SUCCESS) {
                throw std::runtime_error("CUDA device should be set before creating a vmm_region");
            }

            if (cuDriverGetVersion(&version) != CUDA_SUCCESS) {
                throw std::runtime_error("Failed to get CUDA driver version");
            }

            // The VMM API (cuMemCreate etc.) requires CUDA 11.0+.
            if (version < 11000) {
                return; /* too old — fall back to cuMemAlloc */
            }

            // A failed attribute query (e.g. driver predates the fabric
            // attribute) is treated the same as "unsupported": fall back.
            int fab = 0;
            if ((cuDeviceGetAttribute(&fab,
                                      CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED,
                                      device) != CUDA_SUCCESS) ||
                (!fab)) {
                return; /* no fabric — fall back to cuMemAlloc */
            }

            int rdma_vmm_supported = 0;
            if (cuDeviceGetAttribute(&rdma_vmm_supported,
                                     CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
                                     device) != CUDA_SUCCESS) {
                throw std::runtime_error(
                    "Failed to query GPUDirect RDMA with VMM support attribute");
            }

            if (!rdma_vmm_supported) {
                std::cerr << "DIAG: " << k_vmm_ctx
                          << " - GPUDirect RDMA with CUDA VMM not supported; falling back to "
                             "cuMemAlloc\n";
                return;
            }

            // Pinned device memory, RDMA-capable, exportable as a fabric
            // handle so peers can import the allocation.
            prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
            prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
            prop.location.id = device;
            prop.allocFlags.gpuDirectRDMACapable = 1;
            prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;

            if (cuMemGetAllocationGranularity(
                    &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) {
                throw std::runtime_error("Failed to get CUDA allocation granularity");
            }

            access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
            access_desc.location.id = device;
            access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
            fabric_supported = true;
        }
    };

    // Probe once per process; subsequent constructions reuse the result.
    static cuda_alloc_ctx ctx;

    if (!ctx.fabric_supported) {
        // Fallback: plain driver allocation, no VA reservation or mapping.
        size_ = size;
        is_cuda_malloc_ = true;
        if (cuMemAlloc(&ptr_, size) != CUDA_SUCCESS) {
            throw std::runtime_error("cuMemAlloc fallback failed");
        }
        return;
    }

    // VMM path: size must be a multiple of the device's minimum granularity.
    size_ = nixl_ep::align_up<size_t>(size, ctx.granularity);

    if (cuMemCreate(&handle_, size_, &ctx.prop, 0) != CUDA_SUCCESS) {
        throw std::runtime_error("Failed to create CUDA VMM allocation");
    }

    // From here on, release() cleans up whatever has been acquired so far
    // before we rethrow.
    if (cuMemAddressReserve(&ptr_, size_, 0, 0, 0) != CUDA_SUCCESS) {
        release();
        throw std::runtime_error("Failed to reserve CUDA virtual address");
    }

    if (cuMemMap(ptr_, size_, 0, handle_, 0) != CUDA_SUCCESS) {
        release();
        throw std::runtime_error("Failed to map CUDA VMM memory");
    }
    vmm_mapped_ = true;

    if (cuMemSetAccess(ptr_, size_, &ctx.access_desc, 1) != CUDA_SUCCESS) {
        release();
        throw std::runtime_error("Failed to set CUDA memory access");
    }
}

} // namespace nixl_ep
55 changes: 55 additions & 0 deletions examples/device/ep/csrc/vmm.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cuda.h>
#include <cstddef>
#include <cstdint>

namespace nixl_ep {

/**
 * RAII owner of a device memory allocation.
 *
 * Implemented in vmm.cpp: prefers the CUDA VMM API (cuMemCreate /
 * cuMemAddressReserve / cuMemMap) and falls back to cuMemAlloc when VMM
 * with fabric handles is unavailable. Non-copyable and non-movable: the
 * raw pointer handed out by ptr() must stay stable for the region's
 * lifetime.
 */
class vmm_region {
public:
    // Allocates at least `size` bytes on the current device; throws on failure.
    explicit vmm_region(size_t size);

    // Releases mapping, virtual address range, and physical handle (noexcept path).
    ~vmm_region();

    vmm_region(const vmm_region &) = delete;
    vmm_region &
    operator=(const vmm_region &) = delete;
    vmm_region(vmm_region &&) = delete;
    vmm_region &
    operator=(vmm_region &&) = delete;

    // Device pointer to the start of the region (valid until destruction).
    [[nodiscard]] void *
    ptr() const noexcept {
        return reinterpret_cast<void *>(static_cast<std::uintptr_t>(ptr_));
    }

private:
    // Frees whatever subset of resources was acquired; used by the
    // destructor and by constructor error paths.
    void
    release() noexcept;

    CUdeviceptr ptr_ = 0;                      // mapped device address (0 = none)
    size_t size_ = 0;                          // size actually reserved/allocated
    CUmemGenericAllocationHandle handle_ = 0;  // physical allocation (VMM path only)
    bool is_cuda_malloc_ = false;              // true = cuMemAlloc fallback path
    bool vmm_mapped_ = false;                  // true once cuMemMap succeeded
};

} // namespace nixl_ep
1 change: 1 addition & 0 deletions examples/device/ep/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ endif

nixl_ep_sources = [
'csrc/nixl_ep.cpp',
'csrc/vmm.cpp',
'csrc/kernels/nixl_ep.cu',
]

Expand Down
Loading