10 changes: 10 additions & 0 deletions cpp/CMakeLists.txt
@@ -212,6 +212,7 @@ endif()
include_directories(
SYSTEM
${CUDAToolkit_INCLUDE_DIRS}
/usr/local/cuda-13.0/targets/x86_64-linux/include/cccl
${CUDNN_ROOT_DIR}/include
$<TARGET_PROPERTY:TensorRT::NvInfer,INTERFACE_INCLUDE_DIRECTORIES>
${3RDPARTY_DIR}/cutlass/include
@@ -249,6 +250,14 @@ if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "12.8")
)
endif()

if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0")
message(
STATUS
"CUDAToolkit_VERSION ${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR} is greater or equal than 13.0, adding visibility flags"
)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --device-entity-has-hidden-visibility=false -cudart=shared")
endif()

if(ENABLE_MULTI_DEVICE)
# MPI MPI isn't used until tensorrt_llm/CMakeLists.txt is invoked. However, if
# it's not called before "CMAKE_CXX_FLAGS" is set, it breaks on Windows for
@@ -365,6 +374,7 @@ if(NVCC_TIMING)
set(CMAKE_CUDA_FLAGS
"${CMAKE_CUDA_FLAGS} --time ${CMAKE_CURRENT_BINARY_DIR}/nvcc-timing.csv")
endif()

message("CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")

set(COMMON_HEADER_DIRS ${PROJECT_SOURCE_DIR} ${CUDAToolkit_INCLUDE_DIR})
6 changes: 3 additions & 3 deletions cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh
@@ -95,7 +95,7 @@ constexpr CUtensorMapDataType get_CUtensorMapDataType()
}
}

PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled()
{
// Get pointer to `cuTensorMapEncodeTiled`
cudaDriverEntryPointQueryResult driver_status;
@@ -110,12 +110,12 @@ PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()

if (driver_status != cudaDriverEntryPointSuccess)
throw std::runtime_error("driver_status != cudaDriverEntryPointSuccess");
return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(cuTensorMapEncodeTiled_ptr);
return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(cuTensorMapEncodeTiled_ptr);
}

template <typename T>
CUtensorMap make_2d_tma_copy_desc(T* global_address, uint64_t gmem_dim[2], uint64_t stride_in_bytes,
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
{
Comment on lines 116 to 119

💡 Verification agent

🧩 Analysis chain

Default parameter type should match the stable PFN type

Align the encode_func param with the stable, unversioned type to avoid exposing versioned types.

-    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
+    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)

Double-check the other call sites for signature consistency.


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== show file (if exists) ==="
FILE="cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh"
if [ -f "$FILE" ]; then
  sed -n '1,240p' "$FILE" || true
else
  echo "File not found: $FILE"
fi

echo
echo "=== rg: PFN_cuTensorMapEncodeTiled_v12000 ==="
rg --hidden -n "PFN_cuTensorMapEncodeTiled_v12000" || true

echo
echo "=== rg: PFN_cuTensorMapEncodeTiled ==="
rg --hidden -n "PFN_cuTensorMapEncodeTiled(?!_v12000)" || true

echo
echo "=== rg: make_2d_tma_copy_desc ==="
rg --hidden -n "make_2d_tma_copy_desc" || true

echo
echo "=== rg: cuTensorMapEncodeTiled occurrences ==="
rg --hidden -n "cuTensorMapEncodeTiled" || true

echo
echo "=== rg: cuTensorMap related typedefs/usages ==="
rg --hidden -n "cuTensorMap" || true

Length of output: 25549


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== fp8_blockscale_tma_utils.cuh (head) ==="
sed -n '1,240p' cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh || true

echo
echo "=== cudaDriverWrapper.h (head) ==="
sed -n '1,320p' cpp/tensorrt_llm/common/cudaDriverWrapper.h || true

echo
echo "=== cudaDriverWrapper.cpp (head) ==="
sed -n '1,360p' cpp/tensorrt_llm/common/cudaDriverWrapper.cpp || true

echo
echo "=== tma_utils.cuh (relevant region) ==="
sed -n '1,220p' cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh || true

echo
echo "=== rg: where PFN_cuTensorMapEncodeTiled is defined/mentioned ==="
rg --hidden -n "PFN_cuTensorMapEncodeTiled" || true

Length of output: 32409


Prefer the stable PFN_cuTensorMapEncodeTiled type for the default parameter

Use the unversioned PFN to avoid leaking the versioned typedef; callers I checked do not pass encode_func explicitly.

Files to update:

  • cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh

    • Replace:
      • uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
    • With:
      • uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
  • cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh

    • Make the same signature change (or rely on the existing
      #define PFN_cuTensorMapEncodeTiled PFN_cuTensorMapEncodeTiled_v12000 and prefer using the unversioned name).

Call sites verified (no explicit encode_func argument): cpp/include/tensorrt_llm/deep_gemm/fp8_gemm.cuh, cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm_kernel.cuh, etc.

Optional safety: add a local fallback in headers to map the unversioned name to the v12000 typedef if the unversioned PFN is not present:

#ifndef PFN_cuTensorMapEncodeTiled
#define PFN_cuTensorMapEncodeTiled PFN_cuTensorMapEncodeTiled_v12000
#endif

🤖 Prompt for AI Agents
In cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh around lines 116-119, the
function signature uses the versioned typedef PFN_cuTensorMapEncodeTiled_v12000
as the default parameter; change that parameter type to the unversioned
PFN_cuTensorMapEncodeTiled (i.e., replace PFN_cuTensorMapEncodeTiled_v12000 with
PFN_cuTensorMapEncodeTiled) so callers that don’t pass encode_func keep using
the stable name, and make the same signature change in
cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh
(or rely on an existing #define mapping the unversioned name to the v12000
typedef); optionally add a local fallback macro guard at the top of the headers:
if PFN_cuTensorMapEncodeTiled is not defined, #define it to
PFN_cuTensorMapEncodeTiled_v12000.

CUtensorMap tensor_map{};
constexpr uint32_t rank = 2;
95 changes: 11 additions & 84 deletions cpp/tensorrt_llm/deep_ep/CMakeLists.txt
@@ -38,6 +38,13 @@ endif()

# Ensure that dependent libraries are installed
find_library(MLX5_lib NAMES mlx5 REQUIRED)
set(NVSHMEM_INSTALL_PREFIX "${TORCH_INSTALL_PREFIX}/../nvidia/shmem")
# message("Torch Install Prefix: ${TORCH_INSTALL_PREFIX}")

find_path(NVSHMEM_INCLUDE_DIR nvshmem.h HINTS ${NVSHMEM_INSTALL_PREFIX}/include)
find_library(NVSHMEM_HOST_LIBRARY nvshmem_host.so.3 HINTS ${NVSHMEM_INSTALL_PREFIX}/lib)
find_library(NVSHMEM_DEVICE_LIBRARY nvshmem_device HINTS ${NVSHMEM_INSTALL_PREFIX}/lib)


💡 Verification agent

🧩 Analysis chain

NVSHMEM discovery is brittle: hard-coded path, not REQUIRED, and host SONAME pinned to .so.3

  • Hard-coding NVSHMEM_INSTALL_PREFIX to a Torch-relative path without allowing override will break on non-standard layouts.
  • find_library(NVSHMEM_HOST_LIBRARY nvshmem_host.so.3 ...) pins a Linux SONAME and will fail on other platforms or future versions. Use base names via NAMES and search both lib and lib64.
  • None of the NVSHMEM finds are REQUIRED; a missing lib will fail late at link time.

Apply this diff to make discovery robust and fail-fast:

-set(NVSHMEM_INSTALL_PREFIX "${TORCH_INSTALL_PREFIX}/../nvidia/shmem")
-# message("Torch Install Prefix: ${TORCH_INSTALL_PREFIX}")
-
-find_path(NVSHMEM_INCLUDE_DIR nvshmem.h HINTS ${NVSHMEM_INSTALL_PREFIX}/include)
-find_library(NVSHMEM_HOST_LIBRARY nvshmem_host.so.3 HINTS ${NVSHMEM_INSTALL_PREFIX}/lib)
-find_library(NVSHMEM_DEVICE_LIBRARY nvshmem_device HINTS ${NVSHMEM_INSTALL_PREFIX}/lib)
+set(NVSHMEM_INSTALL_PREFIX "${TORCH_INSTALL_PREFIX}/../nvidia/shmem" CACHE PATH "NVSHMEM install root")
+
+find_path(NVSHMEM_INCLUDE_DIR nvshmem.h
+          HINTS ${NVSHMEM_INSTALL_PREFIX}/include
+          REQUIRED)
+find_library(NVSHMEM_HOST_LIBRARY
+             NAMES nvshmem_host nvshmem
+             HINTS ${NVSHMEM_INSTALL_PREFIX}/lib ${NVSHMEM_INSTALL_PREFIX}/lib64
+             REQUIRED)
+find_library(NVSHMEM_DEVICE_LIBRARY
+             NAMES nvshmem_device
+             HINTS ${NVSHMEM_INSTALL_PREFIX}/lib ${NVSHMEM_INSTALL_PREFIX}/lib64
+             REQUIRED)

Please confirm whether your deployment layout sometimes places NVSHMEM under lib64; if so, the added hint is required.


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "Repository root: $(pwd)"
echo
echo "=== Show cpp/tensorrt_llm/deep_ep/CMakeLists.txt (first 300 lines) ==="
sed -n '1,300p' cpp/tensorrt_llm/deep_ep/CMakeLists.txt || true

echo
echo "=== Search for NVSHMEM references ==="
rg "NVSHMEM|nvshmem" -n || true

echo
echo "=== Show surrounding context where NVSHMEM_INSTALL_PREFIX might be used ==="
rg -n "NVSHMEM_INSTALL_PREFIX" -n || true
rg -n "TORCH_INSTALL_PREFIX" -n || true

echo
echo "=== Search for usages of nvshmem_host or nvshmem_device in repo ==="
rg -n "nvshmem_host|nvshmem_device" -n || true

echo
echo "=== List typical lib directories referenced elsewhere (lib64) ==="
rg -n "lib64" -n || true

Length of output: 18877


Make NVSHMEM discovery robust and fail-fast

Current code in cpp/tensorrt_llm/deep_ep/CMakeLists.txt hard-codes a Torch-relative install path, pins a specific SONAME (nvshmem_host.so.3), and does not use REQUIRED — this is brittle and will fail late on non-standard layouts or future NVSHMEM versions.

Files/locations to fix:

  • cpp/tensorrt_llm/deep_ep/CMakeLists.txt — lines ~41–47 (NVSHMEM_INSTALL_PREFIX and the three find_* calls).
  • Note: cpp/tensorrt_llm/CMakeLists.txt already uses find_package(NVSHMEM REQUIRED ...); also setup.py and scripts/build_wheel.py reference specific NVSHMEM .so files (e.g. nvshmem_bootstrap_uid.so.3), so ensure packaging and discovery remain consistent.

Apply this diff to make discovery configurable, search lib64, avoid SONAME pinning, and fail fast:

-set(NVSHMEM_INSTALL_PREFIX "${TORCH_INSTALL_PREFIX}/../nvidia/shmem")
-# message("Torch Install Prefix: ${TORCH_INSTALL_PREFIX}")
-
-find_path(NVSHMEM_INCLUDE_DIR nvshmem.h HINTS ${NVSHMEM_INSTALL_PREFIX}/include)
-find_library(NVSHMEM_HOST_LIBRARY nvshmem_host.so.3 HINTS ${NVSHMEM_INSTALL_PREFIX}/lib)
-find_library(NVSHMEM_DEVICE_LIBRARY nvshmem_device HINTS ${NVSHMEM_INSTALL_PREFIX}/lib)
+set(NVSHMEM_INSTALL_PREFIX "${TORCH_INSTALL_PREFIX}/../nvidia/shmem" CACHE PATH "NVSHMEM install root")
+
+find_path(NVSHMEM_INCLUDE_DIR nvshmem.h
+          HINTS ${NVSHMEM_INSTALL_PREFIX}/include
+          REQUIRED)
+find_library(NVSHMEM_HOST_LIBRARY
+             NAMES nvshmem_host nvshmem
+             HINTS ${NVSHMEM_INSTALL_PREFIX}/lib ${NVSHMEM_INSTALL_PREFIX}/lib64
+             REQUIRED)
+find_library(NVSHMEM_DEVICE_LIBRARY
+             NAMES nvshmem_device
+             HINTS ${NVSHMEM_INSTALL_PREFIX}/lib ${NVSHMEM_INSTALL_PREFIX}/lib64
+             REQUIRED)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
set(NVSHMEM_INSTALL_PREFIX "${TORCH_INSTALL_PREFIX}/../nvidia/shmem")
# message("Torch Install Prefix: ${TORCH_INSTALL_PREFIX}")
find_path(NVSHMEM_INCLUDE_DIR nvshmem.h HINTS ${NVSHMEM_INSTALL_PREFIX}/include)
find_library(NVSHMEM_HOST_LIBRARY nvshmem_host.so.3 HINTS ${NVSHMEM_INSTALL_PREFIX}/lib)
find_library(NVSHMEM_DEVICE_LIBRARY nvshmem_device HINTS ${NVSHMEM_INSTALL_PREFIX}/lib)
set(NVSHMEM_INSTALL_PREFIX "${TORCH_INSTALL_PREFIX}/../nvidia/shmem" CACHE PATH "NVSHMEM install root")
find_path(NVSHMEM_INCLUDE_DIR nvshmem.h
HINTS ${NVSHMEM_INSTALL_PREFIX}/include
REQUIRED)
find_library(NVSHMEM_HOST_LIBRARY
NAMES nvshmem_host nvshmem
HINTS ${NVSHMEM_INSTALL_PREFIX}/lib ${NVSHMEM_INSTALL_PREFIX}/lib64
REQUIRED)
find_library(NVSHMEM_DEVICE_LIBRARY
NAMES nvshmem_device
HINTS ${NVSHMEM_INSTALL_PREFIX}/lib ${NVSHMEM_INSTALL_PREFIX}/lib64
REQUIRED)
🤖 Prompt for AI Agents
In cpp/tensorrt_llm/deep_ep/CMakeLists.txt around lines 41–47, the NVSHMEM
discovery is brittle: it hard-codes a Torch-relative NVSHMEM_INSTALL_PREFIX,
pins a specific SONAME, doesn't search lib64, and doesn't fail-fast. Replace the
fixed install prefix with a cache variable NVSHMEM_INSTALL_PREFIX (allowing
override), use find_path and find_library with NAMES (no SONAME suffix) and
HINTS ${NVSHMEM_INSTALL_PREFIX}/lib ${NVSHMEM_INSTALL_PREFIX}/lib64
${NVSHMEM_INSTALL_PREFIX}/include and system paths, mark the find_* calls
REQUIRED so configuration errors fail early, and prefer using
find_package(NVSHMEM REQUIRED ...) if available for consistency with the
top-level CMake; ensure variables (NVSHMEM_INCLUDE_DIR, NVSHMEM_HOST_LIBRARY,
NVSHMEM_DEVICE_LIBRARY) are set from those robust searches.


# Prepare files
# =============
@@ -81,87 +88,6 @@ foreach(_f IN LISTS _files)
PROPERTY CMAKE_CONFIGURE_DEPENDS ${_src})
endforeach()

# Delete stale nvshmem on patch update
set(NVSHMEM_STAMP_FILE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_stamp.txt)
file(SHA256 ${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch NVSHMEM_PATCH_HASH)
file(SHA256 ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch
NVSHMEM_PATCH_2_HASH)
set(NVSHMEM_STAMP_CONTENT "${NVSHMEM_URL_HASH}")
string(APPEND NVSHMEM_STAMP_CONTENT " PATCH_COMMAND v1")
string(APPEND NVSHMEM_STAMP_CONTENT " ${NVSHMEM_PATCH_HASH}")
string(APPEND NVSHMEM_STAMP_CONTENT " 103")
string(APPEND NVSHMEM_STAMP_CONTENT " ${NVSHMEM_PATCH_2_HASH}")
set(OLD_NVSHMEM_STAMP_CONTENT "")
if(EXISTS ${NVSHMEM_STAMP_FILE})
file(READ ${NVSHMEM_STAMP_FILE} OLD_NVSHMEM_STAMP_CONTENT)
endif()
if(NOT OLD_NVSHMEM_STAMP_CONTENT STREQUAL NVSHMEM_STAMP_CONTENT)
file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_project-prefix)
file(WRITE ${NVSHMEM_STAMP_FILE} "${NVSHMEM_STAMP_CONTENT}")
endif()
set_property(
DIRECTORY APPEND
PROPERTY CMAKE_CONFIGURE_DEPENDS
${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch
${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch)

# Add NVSHMEM
# ===========

# NVSHMEM only works with GCC. Building NVSHMEM with Clang results in
# compilation errors. Using NVSHMEM with Clang results in slow builds and device
# link issues.
if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(CMAKE_C_COMPILER gcc)
set(CMAKE_CXX_COMPILER g++)
set(CMAKE_CUDA_HOST_COMPILER g++)
endif()

# Add nvshmem external project
include(ExternalProject)
ExternalProject_Add(
nvshmem_project
URL file://${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_src_3.2.5-1.txz
URL_HASH ${NVSHMEM_URL_HASH}
PATCH_COMMAND patch -p1 --forward --batch -i
${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch
COMMAND sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i
src/CMakeLists.txt
COMMAND patch -p1 --forward --batch -i
${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch
CMAKE_CACHE_ARGS
-DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
-DCMAKE_C_COMPILER_LAUNCHER:STRING=${CMAKE_C_COMPILER_LAUNCHER}
-DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
-DCMAKE_CXX_COMPILER_LAUNCHER:STRING=${CMAKE_CXX_COMPILER_LAUNCHER}
-DCMAKE_CUDA_ARCHITECTURES:STRING=${DEEP_EP_CUDA_ARCHITECTURES}
-DCMAKE_CUDA_HOST_COMPILER:STRING=${CMAKE_CUDA_HOST_COMPILER}
-DCMAKE_CUDA_COMPILER_LAUNCHER:STRING=${CMAKE_CUDA_COMPILER_LAUNCHER}
-DNVSHMEM_BUILD_EXAMPLES:BOOL=0
-DNVSHMEM_BUILD_PACKAGES:BOOL=0
-DNVSHMEM_BUILD_TESTS:BOOL=0
-DNVSHMEM_IBGDA_SUPPORT:BOOL=1
-DNVSHMEM_IBRC_SUPPORT:BOOL=0
-DNVSHMEM_MPI_SUPPORT:BOOL=0
-DNVSHMEM_PMIX_SUPPORT:BOOL=0
-DNVSHMEM_SHMEM_SUPPORT:BOOL=0
-DNVSHMEM_TIMEOUT_DEVICE_POLLING:BOOL=0
-DNVSHMEM_UCX_SUPPORT:BOOL=0
-DNVSHMEM_USE_GDRCOPY:BOOL=0
-DNVSHMEM_USE_NCCL:BOOL=0
INSTALL_COMMAND ""
BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build
BUILD_BYPRODUCTS
${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/lib/libnvshmem.a)
add_library(nvshmem_project::nvshmem STATIC IMPORTED)
add_dependencies(nvshmem_project::nvshmem nvshmem_project)
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/include)
set_target_properties(
nvshmem_project::nvshmem
PROPERTIES IMPORTED_LOCATION
${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/lib/libnvshmem.a
INTERFACE_INCLUDE_DIRECTORIES
${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/include)

# Add DeepEP cpp
# ==============
@@ -188,7 +114,7 @@ set_target_properties(
CUDA_SEPARABLE_COMPILATION ON
CUDA_ARCHITECTURES "${DEEP_EP_CUDA_ARCHITECTURES}"
LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/deep_ep_cpp_tllm.version
INSTALL_RPATH "$ORIGIN/libs/nvshmem;${TORCH_INSTALL_PREFIX}/lib"
INSTALL_RPATH "${TORCH_INSTALL_PREFIX}/lib"
BUILD_WITH_INSTALL_RPATH TRUE)
target_compile_options(
deep_ep_cpp_tllm
@@ -197,8 +123,9 @@ target_compile_options(
target_compile_definitions(
deep_ep_cpp_tllm PRIVATE DISABLE_AGGRESSIVE_PTX_INSTRS
TORCH_EXTENSION_NAME=deep_ep_cpp_tllm)
target_include_directories(deep_ep_cpp_tllm PRIVATE ${NVSHMEM_INCLUDE_DIR})
target_link_libraries(
deep_ep_cpp_tllm PRIVATE nvshmem_project::nvshmem ${TORCH_LIBRARIES}
deep_ep_cpp_tllm PRIVATE ${NVSHMEM_DEVICE_LIBRARY} ${TORCH_LIBRARIES}
${TORCH_PYTHON_LIB})
target_link_options(
deep_ep_cpp_tllm PRIVATE
@@ -207,4 +134,4 @@ target_link_options(

# Set targets
# ===========
add_dependencies(deep_ep deep_ep_cpp_tllm nvshmem_project)
add_dependencies(deep_ep deep_ep_cpp_tllm)
26 changes: 0 additions & 26 deletions cpp/tensorrt_llm/kernels/beamSearchKernels.cu
@@ -134,32 +134,6 @@ void invokeUpdateCacheIndirection(int* tgtCI, int const* srcCI, BeamHypotheses&
sync_check_cuda_error(stream);
}

template <typename T>
__global__ void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM)
{
int const bid = blockIdx.x; // Index of request in batch
runtime::SizeType32 const slot = batchSlots[bid];
float const diversityRate{diversityRates[slot]};
T* pLocalLogProbs = pStage1LogProbs + bid * nBMIn * nBMOut * 2;

for (int i = threadIdx.x; i < nBMIn * nBMOut * 2; i += blockDim.x)
{
int const iBMIn = i / (nBMOut * 2);
if (finished[slot * nBMIn + iBMIn].isFinished())
{
pLocalLogProbs[i] += (i == endIds[slot]) ? 1.0f : 0.0f;
}
else
{
// nBM is used in VBWS since `cumLogProbs` is initialized with kMaxBeamWidth earlier than BeamSearchLayer
pLocalLogProbs[i] += cumLogProbs[slot * nBM + iBMIn] + diversityRate * iBMIn;
}
}
return;
}

template __global__ void addCumLogProbs<float>(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
35 changes: 34 additions & 1 deletion cpp/tensorrt_llm/kernels/beamSearchKernels.h
@@ -131,9 +131,42 @@ void invokeUpdateCacheIndirection(int* tgtCI, int const* srcCI, BeamHypotheses&
runtime::SizeType32 const maxAttentionWindow, runtime::SizeType32 sinkTokenLength, cudaStream_t stream);

template <typename T>
__global__ void addCumLogProbs(T* __restrict pStage1Probs, float const* __restrict cumLogProbs,
__global__ __attribute__((visibility("default"))) void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM)
#ifdef __CUDACC__
{
int const bid = blockIdx.x; // Index of request in batch
runtime::SizeType32 const slot = batchSlots[bid];
float const diversityRate{diversityRates[slot]};
T* pLocalLogProbs = pStage1LogProbs + bid * nBMIn * nBMOut * 2;

for (int i = threadIdx.x; i < nBMIn * nBMOut * 2; i += blockDim.x)
{
int const iBMIn = i / (nBMOut * 2);
if (finished[slot * nBMIn + iBMIn].isFinished())
{
pLocalLogProbs[i] += (i == endIds[slot]) ? 1.0f : 0.0f;
}
else
Comment on lines +147 to +151

💡 Verification agent

🧩 Analysis chain

EOS/endIds indexing appears incorrect and likely out-of-bounds for [BS, BM] layout

endIds is documented as [BS, BM], but the code indexes it as endIds[slot] and compares it to i (a flattened [BM_in, BM_out, 2] index). Two issues:

  • Indexing: endIds[slot] ignores the beam dimension; for per-beam EOS, it should likely be endIds[slot * nBM + iBMIn] (or nBMIn, depending on layout).
  • Semantics: i indexes the flattened candidate grid, not the vocabulary/token ID dimension. Comparing i to an EOS token ID does not look meaningful and may never match.

This can cause incorrect scoring for finished beams and undefined behavior if endIds has only [BS, BM] elements.

Please confirm shapes and intended semantics. If indeed per-beam EOS, a minimal indexing fix would be:

  • Use the per-beam EOS id:

    • int const eosId = endIds[slot * nBM + iBMIn]; // or slot * nBMIn + iBMIn if finished is also sized that way.
  • Compare against the actual token index in pStage1LogProbs, not the flattened [BM_in, BM_out, 2] index. If stage-1 buffer includes a vocab dimension elsewhere, determine the local token index before comparison.

To help locate references and prior logic, run:


🏁 Script executed:

#!/bin/bash
# Look for other usages of endIds indexing and EOS handling in kernels
rg -n -A3 -B3 $'endIds\\[' cpp | sed -n '1,200p'

# Find all write sites for pStage1LogProbs to infer its layout
rg -n -A3 -B3 'pStage1(Log)?Probs' cpp | sed -n '1,200p'

# Find all callers of addCumLogProbs and their launch grids to reason about mapping
rg -n -A3 -B3 'addCumLogProbs\s*<|<<<' cpp | sed -n '1,200p'

Length of output: 45150


🏁 Script executed:

#!/bin/bash
sed -n '1,240p' cpp/tensorrt_llm/kernels/beamSearchKernels.h

Length of output: 15146


🏁 Script executed:

#!/bin/bash
sed -n '1,420p' cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernelsTemplate.h

Length of output: 17803


Fix required — addCumLogProbs compares candidate index to EOS token and ignores beam dim

Short summary: addCumLogProbs (cpp/tensorrt_llm/kernels/beamSearchKernels.h) iterates i over the flattened candidate grid (nBMIn * nBMOut * 2) and does (i == endIds[slot]). i is a candidate index, not a token id; also the code ignores the beam index in endIds. This will almost never match the EOS token and therefore fails to prefer EOS for finished beams (and is inconsistent with other kernels that treat endIds as per-slot token ids).

Locations to fix:

  • cpp/tensorrt_llm/kernels/beamSearchKernels.h
    • Kernel addCumLogProbs — replace the incorrect comparison and add access to the stage-1 ids.
  • cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernelsTemplate.h
    • Call site that launches addCumLogProbs must pass pStage1Ids (pStage1Ids is already allocated in this scope).

Suggested minimal changes (apply and adjust to your code style):

  • Change kernel signature to accept the stage-1 ids:

    • before: addCumLogProbs(T* pStage1LogProbs, float const* cumLogProbs, FinishedState const* finished, int const* endIds, float const* diversityRates, ...)
    • after: addCumLogProbs(T* pStage1LogProbs, float const* cumLogProbs, FinishedState const* finished, int const* endIds, int const* pStage1Ids, float const* diversityRates, ...)
  • In the kernel body replace the block:

    • before:
      pLocalLogProbs[i] += (i == endIds[slot]) ? 1.0f : 0.0f;
    • after (minimal, assuming endIds is per-slot token id; if endIds is per-beam use the per-beam index shown below):
      int const* pLocalIds = pStage1Ids + bid * nBMIn * nBMOut * 2;
      int const eosId = endIds[slot]; // or: endIds[slot * nBM + iBMIn] if endIds is [BS, BM]
      pLocalLogProbs[i] += (pLocalIds[i] == eosId) ? (T)1.0f : (T)0.0f;
  • Update the kernel launch in beamSearchKernelsTemplate.h:

    • before:
      addCumLogProbs<<<nBS, nThread>>>(pStage1LogProbs, bh.cumLogProbs, bh.finished, bh.endIds, bh.diversityRates, bh.batchSlots, ...);
    • after:
      addCumLogProbs<<<nBS, nThread>>>(pStage1LogProbs, bh.cumLogProbs, bh.finished, bh.endIds, pStage1Ids, bh.diversityRates, bh.batchSlots, ...);

Extra: clarify the declared shape of endIds in BeamHypotheses (cpp/tensorrt_llm/kernels/beamSearchKernels.h) — comments currently read [BS, BM] but most kernels use endIds[slot] (per-slot). Confirm whether endIds should be [BS] or [BS, BM] and update code/comments and indexing across kernels accordingly.

Reason: The kernel must compare the candidate's token id (from stage-1 ids) to the EOS token id; comparing the flattened candidate index is incorrect and will break finished-beam handling.
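For illustration, a minimal sketch of the corrected kernel under the per-slot assumption. pStage1Ids and its layout are taken from the suggestion above and are assumptions to verify against beamSearchKernelsTemplate.h, not the confirmed buffer layout:

template <typename T>
__global__ void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
    FinishedState const* finished, int const* endIds, int const* __restrict pStage1Ids,
    float const* diversityRates, runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn,
    size_t const nBMOut, size_t const nBM)
{
    int const bid = blockIdx.x; // Index of request in batch
    runtime::SizeType32 const slot = batchSlots[bid];
    float const diversityRate{diversityRates[slot]};
    T* pLocalLogProbs = pStage1LogProbs + bid * nBMIn * nBMOut * 2;
    // Assumed: stage-1 ids mirror the [nBMIn, nBMOut, 2] candidate layout per request.
    int const* pLocalIds = pStage1Ids + bid * nBMIn * nBMOut * 2;
    int const eosId = endIds[slot]; // move inside the loop and index per beam if endIds is [BS, BM]

    for (int i = threadIdx.x; i < nBMIn * nBMOut * 2; i += blockDim.x)
    {
        int const iBMIn = i / (nBMOut * 2);
        if (finished[slot * nBMIn + iBMIn].isFinished())
        {
            // Compare the candidate's token id to EOS, not the flattened candidate index.
            pLocalLogProbs[i] += (pLocalIds[i] == eosId) ? 1.0f : 0.0f;
        }
        else
        {
            // nBM is used in VBWS since `cumLogProbs` is initialized with kMaxBeamWidth earlier than BeamSearchLayer
            pLocalLogProbs[i] += cumLogProbs[slot * nBM + iBMIn] + diversityRate * iBMIn;
        }
    }
}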

🤖 Prompt for AI Agents
In cpp/tensorrt_llm/kernels/beamSearchKernels.h around lines 147-151,
addCumLogProbs currently compares the flattened candidate index i to
endIds[slot] which is wrong — you must compare the candidate token id from
stage-1 ids and account for beam dim; update the kernel signature to accept int
const* pStage1Ids, inside the kernel compute a pointer to the local stage1 ids
for this batch/beam region (e.g. pStage1Ids + bid * nBMIn * nBMOut * 2) and
replace the (i == endIds[slot]) test with a comparison of pLocalIds[i] to the
appropriate eosId (use endIds[slot] if endIds is per-slot, or endIds[slot * nBM
+ iBMIn] if per-beam), then add pStage1Ids to the addCumLogProbs kernel launch
in cpp/tensorrt_llm/kernels/beamSearchKernelsTemplate.h so the kernel receives
the stage-1 ids; also verify and if needed fix/comment the declared
shape/indexing of endIds across BeamHypotheses to be consistently [BS] or [BS,
BM].

{
// nBM is used in VBWS since `cumLogProbs` is initialized with kMaxBeamWidth earlier than BeamSearchLayer
pLocalLogProbs[i] += cumLogProbs[slot * nBM + iBMIn] + diversityRate * iBMIn;
}
}
return;
}
#else
;
extern template __global__ __attribute__((visibility("default"))) void addCumLogProbs<float>(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);

extern template __global__ __attribute__((visibility("default"))) void addCumLogProbs<half>(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
#endif


__global__ void gatherId(int const* __restrict pStage1Id, int* __restrict pStage2Id, size_t const nBS,
size_t const nBMIn, size_t const nBMOut, size_t const nV);
2 changes: 1 addition & 1 deletion cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt
@@ -190,7 +190,7 @@ set_cuda_architectures(fb_gemm_src 89 90 120f)
# ${INSTANTIATION_GENERATION_DIR}/fp8_rowwise_gemm)

add_library(fp8_blockscale_gemm_src STATIC ${FP8_BLOCKSCALE_GEMM_SRC_CU})
set_cuda_architectures(fp8_blockscale_gemm_src 89 90 100f)
set_cuda_architectures(fp8_blockscale_gemm_src 90)

set(GEMM_SWIGLU_SM90_SRC_CU
${CMAKE_CURRENT_SOURCE_DIR}/fused_gated_gemm/gemm_swiglu_e4m3.cu)
cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh
@@ -24,6 +24,8 @@
#include <cuda/barrier>
#include <cute/arch/util.hpp>

#define PFN_cuTensorMapEncodeTiled PFN_cuTensorMapEncodeTiled_v12000


🛠️ Refactor suggestion

⚠️ Potential issue

Avoid redefining SDK typedefs via macros

#define PFN_cuTensorMapEncodeTiled PFN_cuTensorMapEncodeTiled_v12000 is brittle, violates macro naming guidelines, and can unintentionally affect other headers. Prefer explicit types or a local alias.

Remove the macro:

-#define PFN_cuTensorMapEncodeTiled PFN_cuTensorMapEncodeTiled_v12000
-
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
#define PFN_cuTensorMapEncodeTiled PFN_cuTensorMapEncodeTiled_v12000
🤖 Prompt for AI Agents
In
cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh
around lines 27-28, remove the macro definition that rebinds
PFN_cuTensorMapEncodeTiled to PFN_cuTensorMapEncodeTiled_v12000; instead delete
the #define and create a local, scoped alias (preferably a typedef or C++
using-alias) that maps PFN_cuTensorMapEncodeTiled to
PFN_cuTensorMapEncodeTiled_v12000 within this translation unit or an appropriate
namespace to avoid leaking a macro into other headers. Ensure the alias is only
visible where needed and that no preprocessor macro remains.
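As a hedged sketch of that alias approach, assuming the unversioned name is not also macro-defined by the CUDA headers this file includes:

namespace tensorrt_llm::kernels::fp8_blockscale_gemm
{
// Scoped alias: maps the stable spelling to the versioned typedef without a macro,
// so it cannot leak into other headers.
using PFN_cuTensorMapEncodeTiled = ::PFN_cuTensorMapEncodeTiled_v12000;

// Existing signatures in this namespace can then keep the unversioned spelling, e.g.
// PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled();
} // namespace tensorrt_llm::kernels::fp8_blockscale_gemm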

namespace tensorrt_llm::kernels::fp8_blockscale_gemm
{

@@ -84,7 +86,7 @@ inline CUtensorMapDataType get_CUtensorMapDataType()
}
}

PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled()
{
// Get pointer to cuTensorMapEncodeTiled
cudaDriverEntryPointQueryResult driver_status;
@@ -101,12 +103,12 @@ PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
throw std::runtime_error("driver_status != cudaDriverEntryPointSuccess");
}

return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(cuTensorMapEncodeTiled_ptr);
return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(cuTensorMapEncodeTiled_ptr);
}

template <typename data_type>
CUtensorMap make_2d_tma_copy_desc(data_type* global_address, uint64_t gmem_dim[2], uint64_t stride_in_bytes,
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
{
Comment on lines +109 to 110

🛠️ Refactor suggestion

Unify encode_func parameter type with stable PFN

Avoid exposing the versioned PFN type in public signatures.

-    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
+    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
{
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
{
🤖 Prompt for AI Agents
In
cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh
around lines 111-112, the function signature exposes the versioned PFN type
PFN_cuTensorMapEncodeTiled_v12000; change the parameter type to the stable,
unversioned PFN alias (PFN_cuTensorMapEncodeTiled) so the public signature
doesn't leak a versioned typedef, and update any forward declarations or
includes so the unversioned PFN type is visible where this header is compiled.

CUtensorMap tensor_map{};
constexpr uint32_t rank = 2;
@@ -2597,7 +2597,7 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
__shared__ typename BlockReduce::TempStorage temp_storage;
// Obtain a segment of consecutive items that are blocked across threads (final_max from above)
// Compute the block-wide max for thread0
final_max = BlockReduce(temp_storage).Reduce(thread_partial_max, cub::Max(), gridDim.z);
final_max = BlockReduce(temp_storage).Reduce(thread_partial_max, cuda::maximum(), gridDim.z);

__shared__ float final_max_smem;
if (tidx == 0)
4 changes: 2 additions & 2 deletions cpp/tensorrt_llm/kernels/sageAttentionKernels.cu
@@ -250,7 +250,7 @@ __global__ void sage_quant_kernel(void const* q, void const* k, void const* v, i

// Compute the block-wide max for thread0
// cuda::maximum<>{}
float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cub::Max{});
float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cuda::maximum{});

if (row_id == 0 && col_id == 0)
s_block_amax = static_cast<float>(aggregate);
@@ -429,7 +429,7 @@ __global__ void sage_quant_kernel(void const* q, void const* k, void const* v, i

// Compute the block-wide max for thread0
// cuda::maximum<>{}
float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cub::Max{});
float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cuda::maximum{});

if (row_id == 0 && col_id == 0)
s_block_amax = static_cast<float>(aggregate);
@@ -504,7 +504,7 @@ __global__ void prepareGenEagleNetInputsKernel(SizeType32* nextSequenceLengths,
BlockScan(tempStorage.scan).ExclusiveSum(numNextLogits, outputLastIndicesBase);
// Sync because tempStorage is reused.
__syncthreads();
auto const maxGenLength = BlockReduce(tempStorage.reduce).Reduce(nextDraftLen, cub::Max());
auto const maxGenLength = BlockReduce(tempStorage.reduce).Reduce(nextDraftLen, cuda::maximum());

// Thread 0 has the result.
if (bid == 0)
13 changes: 7 additions & 6 deletions cpp/tensorrt_llm/kernels/topkLastDim.cu
@@ -25,6 +25,8 @@
#include "topkLastDim.h"
#include <cub/cub.cuh>
#include <cuda/atomic>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

namespace tensorrt_llm
{
@@ -1221,9 +1223,9 @@ void standalone_stable_radix_topk_(void* buf, size_t& buf_size, T const* in, Idx
IdxT* sort_in_idx = nullptr;

air_topk_stable::ComputeOffset<IdxT> computeoffset(k);
cub::CountingInputIterator<IdxT> counting_iter(0);
cub::TransformInputIterator<IdxT, air_topk_stable::ComputeOffset<IdxT>, cub::CountingInputIterator<IdxT>>
transform_iter(counting_iter, computeoffset);
auto counting_iter = thrust::make_counting_iterator<IdxT>(0);
auto transform_iter = thrust::make_transform_iterator(counting_iter, computeoffset);

cub::DeviceSegmentedSort::SortPairs(NULL, temp_storage_bytes, out_idx, out_idx, out, out, k * batch_size,
batch_size, transform_iter, transform_iter + 1, stream);
if (sorted)
@@ -1348,9 +1350,8 @@ void standalone_stable_radix_topk_one_block_(void* buf, size_t& buf_size, T cons
const IdxT buf_len = air_topk_stable::calc_buf_len<T, IdxT, unsigned>(len);

air_topk_stable::ComputeOffset<IdxT> computeoffset(k);
cub::CountingInputIterator<IdxT> counting_iter(0);
cub::TransformInputIterator<IdxT, air_topk_stable::ComputeOffset<IdxT>, cub::CountingInputIterator<IdxT>>
transform_iter(counting_iter, computeoffset);
auto counting_iter = thrust::make_counting_iterator<IdxT>(0);
auto transform_iter = thrust::make_transform_iterator(counting_iter, computeoffset);

cub::DeviceSegmentedSort::SortPairs(NULL, temp_storage_bytes, out_idx, out_idx, out, out, k * batch_size,
batch_size, transform_iter, transform_iter + 1, stream);
@@ -154,7 +154,7 @@ __global__ void activationDeepSeekKernel(KernelParams params)
float constexpr E4m3MaxVal{448.f};

// Compute the absolute max
float aMax = BlockReduce(temp_storage).Reduce(fabsf(out), cub::Max());
float aMax = BlockReduce(temp_storage).Reduce(fabsf(out), cuda::maximum());
if (threadIdx.x == 0)
{
s_scaleOut = aMax / E4m3MaxVal;
@@ -657,7 +657,7 @@ __global__ void finalizeDeepSeekKernel(KernelParams params)
float constexpr E4m3MaxVal{448.f};

// Compute the absolute max
float aMax = BlockReduce(temp_storage).Reduce(fabsf(acc), cub::Max());
float aMax = BlockReduce(temp_storage).Reduce(fabsf(acc), cuda::maximum());

if (threadIdx.x == 0)
{
cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp
@@ -364,7 +364,8 @@ void* HostAccessibleDeviceAllocator::allocate(size_t memorySize)
TLLM_CHECK_WITH_INFO(
mAllowManagedFallback, "HostAccessibleDeviceAllocator is not supported on the current system.");
TLLM_CUDA_CHECK(cudaMallocManaged(&devPtr, memorySize));
TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, currentDevId));
cudaMemLocation location {cudaMemLocationTypeDevice, currentDevId};
TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
hostPtr = devPtr;
Comment on lines +367 to 369

💡 Verification agent

🧩 Analysis chain

Guard new cudaMemAdvise location-API for backward compatibility and fix indentation

The location-based cudaMemAdvise overload requires newer CUDA. Given CMake still allows CUDA >= 11.2, gate this with CUDART_VERSION to avoid compile errors with older toolkits. Also replace the tab with spaces to match file style.

Apply:

-	cudaMemLocation location {cudaMemLocationTypeDevice, currentDevId};
-        TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
+    #if CUDART_VERSION >= 12000
+    cudaMemLocation location{cudaMemLocationTypeDevice, currentDevId};
+    TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
+    #else
+    TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, currentDevId));
+    #endif

Alternatively, if you intend to raise the project’s minimum CUDA to 13, update find_package(CUDAToolkit ...) accordingly and remove the fallback.


🏁 Script executed:

#!/bin/bash
set -euo pipefail
FILE="cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp"
echo "PWD: $(pwd)"
if [ -f "$FILE" ]; then
  echo "Found file: $FILE"
  echo "---- File header (first 120 lines) ----"
  sed -n '1,120p' "$FILE"
  echo "---- Context lines 340-400 ----"
  sed -n '340,400p' "$FILE"
  echo "---- Search for relevant symbols in file ----"
  rg -n "TLLM_CUDA_CHECK|cudaMemAdvise|cudaMemLocation|CUDART_VERSION" "$FILE" || true
  echo "---- Lines containing tabs (show up to 200) ----"
  nl -ba "$FILE" | sed -n '1,520p' | rg --line-number $'\t' || true
else
  echo "File not found: $FILE"
  rg -n "hostAccessibleDeviceAllocator.cpp" || true
fi

echo "---- Search for find_package(CUDAToolkit in repo ----"
rg -n "find_package\\(CUDAToolkit" || true
echo "---- Search for CUDART_VERSION usage in repo ----"
rg -n "CUDART_VERSION" || true

Length of output: 13048


Guard cudaMemAdvise(location) for older CUDA toolkits & fix indentation

The cudaMemLocation overload requires CUDA ≥ 12, and cpp/CMakeLists.txt still allows CUDA 11.2 — gate the new API with CUDART_VERSION and replace the stray tab with spaces.

  • File: cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp — replace the cudaMemLocation / cudaMemAdvise lines (around lines ~366–369).
  • CMake: cpp/CMakeLists.txt currently uses find_package(CUDAToolkit 11.2 ...), so keep the fallback unless you bump the min CUDA version.

Apply:

-	cudaMemLocation location {cudaMemLocationTypeDevice, currentDevId};
-        TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
+#if defined(CUDART_VERSION) && (CUDART_VERSION >= 12000)
+        cudaMemLocation location{cudaMemLocationTypeDevice, currentDevId};
+        TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
+#else
+        TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, currentDevId));
+#endif

Alternatively, if you intend to require CUDA ≥ 12 project-wide, update find_package(CUDAToolkit ...) in cpp/CMakeLists.txt and remove the fallback.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
cudaMemLocation location {cudaMemLocationTypeDevice, currentDevId};
TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
hostPtr = devPtr;
#if defined(CUDART_VERSION) && (CUDART_VERSION >= 12000)
cudaMemLocation location{cudaMemLocationTypeDevice, currentDevId};
TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
#else
TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, currentDevId));
#endif
hostPtr = devPtr;
🤖 Prompt for AI Agents
In cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp
around lines 367–369, the current cudaMemLocation/cudaMemAdvise usage requires
CUDA ≥ 12 and also contains a stray tab; guard the new API with a preprocessor
check (e.g., #if defined(CUDART_VERSION) && CUDART_VERSION >= 12000) to call
cudaMemLocation and cudaMemAdvise only when available and provide the existing
fallback for older CUDA versions in the #else branch, and replace the stray tab
with spaces to fix indentation; do not change cpp/CMakeLists.txt unless you
intend to raise the minimum CUDA version project-wide.

}
recordAllocation(devPtr, memorySize, hostPtr, memDesc);