10 changes: 10 additions & 0 deletions cpp/CMakeLists.txt
@@ -212,6 +212,7 @@ endif()
include_directories(
SYSTEM
${CUDAToolkit_INCLUDE_DIRS}
/usr/local/cuda-13.0/targets/x86_64-linux/include/cccl
${CUDNN_ROOT_DIR}/include
$<TARGET_PROPERTY:TensorRT::NvInfer,INTERFACE_INCLUDE_DIRECTORIES>
${3RDPARTY_DIR}/cutlass/include
@@ -249,6 +250,14 @@ if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "12.8")
)
endif()

if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0")
message(
STATUS
"CUDAToolkit_VERSION ${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR} is greater or equal than 13.0, adding visibility flags"
)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --device-entity-has-hidden-visibility=false -cudart=shared")
endif()

if(ENABLE_MULTI_DEVICE)
# MPI MPI isn't used until tensorrt_llm/CMakeLists.txt is invoked. However, if
# it's not called before "CMAKE_CXX_FLAGS" is set, it breaks on Windows for
@@ -365,6 +374,7 @@ if(NVCC_TIMING)
set(CMAKE_CUDA_FLAGS
"${CMAKE_CUDA_FLAGS} --time ${CMAKE_CURRENT_BINARY_DIR}/nvcc-timing.csv")
endif()

message("CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")

set(COMMON_HEADER_DIRS ${PROJECT_SOURCE_DIR} ${CUDAToolkit_INCLUDE_DIR})
6 changes: 3 additions & 3 deletions cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh
@@ -95,7 +95,7 @@ constexpr CUtensorMapDataType get_CUtensorMapDataType()
}
}

PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled()
{
// Get pointer to `cuTensorMapEncodeTiled`
cudaDriverEntryPointQueryResult driver_status;
@@ -110,12 +110,12 @@ PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()

if (driver_status != cudaDriverEntryPointSuccess)
throw std::runtime_error("driver_status != cudaDriverEntryPointSuccess");
return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(cuTensorMapEncodeTiled_ptr);
return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(cuTensorMapEncodeTiled_ptr);
}

template <typename T>
CUtensorMap make_2d_tma_copy_desc(T* global_address, uint64_t gmem_dim[2], uint64_t stride_in_bytes,
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
{
Comment on lines 116 to 119

💡 Verification agent

🧩 Analysis chain

Default parameter type should match the stable PFN type

Align the encode_func param with the stable, unversioned type to avoid exposing versioned types.

-    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
+    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)

Double-check the other call sites for signature consistency.


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== show file (if exists) ==="
FILE="cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh"
if [ -f "$FILE" ]; then
  sed -n '1,240p' "$FILE" || true
else
  echo "File not found: $FILE"
fi

echo
echo "=== rg: PFN_cuTensorMapEncodeTiled_v12000 ==="
rg --hidden -n "PFN_cuTensorMapEncodeTiled_v12000" || true

echo
echo "=== rg: PFN_cuTensorMapEncodeTiled ==="
rg --hidden -n "PFN_cuTensorMapEncodeTiled(?!_v12000)" || true

echo
echo "=== rg: make_2d_tma_copy_desc ==="
rg --hidden -n "make_2d_tma_copy_desc" || true

echo
echo "=== rg: cuTensorMapEncodeTiled occurrences ==="
rg --hidden -n "cuTensorMapEncodeTiled" || true

echo
echo "=== rg: cuTensorMap related typedefs/usages ==="
rg --hidden -n "cuTensorMap" || true



🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== fp8_blockscale_tma_utils.cuh (head) ==="
sed -n '1,240p' cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh || true

echo
echo "=== cudaDriverWrapper.h (head) ==="
sed -n '1,320p' cpp/tensorrt_llm/common/cudaDriverWrapper.h || true

echo
echo "=== cudaDriverWrapper.cpp (head) ==="
sed -n '1,360p' cpp/tensorrt_llm/common/cudaDriverWrapper.cpp || true

echo
echo "=== tma_utils.cuh (relevant region) ==="
sed -n '1,220p' cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh || true

echo
echo "=== rg: where PFN_cuTensorMapEncodeTiled is defined/mentioned ==="
rg --hidden -n "PFN_cuTensorMapEncodeTiled" || true



Prefer the stable PFN_cuTensorMapEncodeTiled type for the default parameter

Use the unversioned PFN to avoid leaking the versioned typedef; callers I checked do not pass encode_func explicitly.

Files to update:

  • cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh

    • Replace:
      • uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
    • With:
      • uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
  • cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh

    • Make the same signature change (or rely on the existing
      #define PFN_cuTensorMapEncodeTiled PFN_cuTensorMapEncodeTiled_v12000 and prefer using the unversioned name).

Call sites verified (no explicit encode_func argument): cpp/include/tensorrt_llm/deep_gemm/fp8_gemm.cuh, cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm_kernel.cuh, etc.

Optional safety: add a local fallback in headers to map the unversioned name to the v12000 typedef if the unversioned PFN is not present:

#ifndef PFN_cuTensorMapEncodeTiled
#define PFN_cuTensorMapEncodeTiled PFN_cuTensorMapEncodeTiled_v12000
#endif

🤖 Prompt for AI Agents
In cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh around lines 116-119, the
function signature uses the versioned typedef PFN_cuTensorMapEncodeTiled_v12000
as the default parameter; change that parameter type to the unversioned
PFN_cuTensorMapEncodeTiled (i.e., replace PFN_cuTensorMapEncodeTiled_v12000 with
PFN_cuTensorMapEncodeTiled) so callers that don’t pass encode_func keep using
the stable name, and make the same signature change in
cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh
(or rely on an existing #define mapping the unversioned name to the v12000
typedef); optionally add a local fallback macro guard at the top of the headers:
if PFN_cuTensorMapEncodeTiled is not defined, #define it to
PFN_cuTensorMapEncodeTiled_v12000.
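For context, here is a minimal usage sketch of such a call site that relies on the default encode_func; the buffer, extents, stride, and swizzle value below are invented for illustration, and any namespace qualification on make_2d_tma_copy_desc is omitted:

#include <cstdint>
#include <cuda.h> // CUtensorMap, CUtensorMapSwizzle

#include "tensorrt_llm/deep_gemm/tma_utils.cuh" // declares make_2d_tma_copy_desc

CUtensorMap build_example_desc(float* gmem_ptr)
{
    uint64_t gmem_dim[2] = {4096, 4096};             // global extents (illustrative)
    uint32_t smem_dim[2] = {64, 64};                 // shared-memory tile shape (illustrative)
    uint64_t stride_in_bytes = 4096 * sizeof(float); // leading-dimension stride in bytes
    // encode_func is omitted, so the header's internal driver lookup is used and
    // no PFN typedef (versioned or not) ever appears at the call site.
    return make_2d_tma_copy_desc(gmem_ptr, gmem_dim, stride_in_bytes, smem_dim,
                                 CU_TENSOR_MAP_SWIZZLE_NONE);
}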

CUtensorMap tensor_map{};
constexpr uint32_t rank = 2;
95 changes: 11 additions & 84 deletions cpp/tensorrt_llm/deep_ep/CMakeLists.txt
@@ -38,6 +38,13 @@ endif()

# Ensure that dependent libraries are installed
find_library(MLX5_lib NAMES mlx5 REQUIRED)
set(NVSHMEM_INSTALL_PREFIX "${TORCH_INSTALL_PREFIX}/../nvidia/shmem")
# message("Torch Install Prefix: ${TORCH_INSTALL_PREFIX}")

find_path(NVSHMEM_INCLUDE_DIR nvshmem.h HINTS ${NVSHMEM_INSTALL_PREFIX}/include)
find_library(NVSHMEM_HOST_LIBRARY nvshmem_host.so.3 HINTS ${NVSHMEM_INSTALL_PREFIX}/lib)
find_library(NVSHMEM_DEVICE_LIBRARY nvshmem_device HINTS ${NVSHMEM_INSTALL_PREFIX}/lib)


💡 Verification agent

🧩 Analysis chain

NVSHMEM discovery is brittle: hard-coded path, not REQUIRED, and host SONAME pinned to .so.3

  • Hard-coding NVSHMEM_INSTALL_PREFIX to a Torch-relative path without allowing override will break on non-standard layouts.
  • find_library(NVSHMEM_HOST_LIBRARY nvshmem_host.so.3 ...) pins a Linux SONAME and will fail on other platforms or future versions. Use base names via NAMES and search both lib and lib64.
  • None of the NVSHMEM finds are REQUIRED; a missing lib will fail late at link time.

Apply this diff to make discovery robust and fail-fast:

-set(NVSHMEM_INSTALL_PREFIX "${TORCH_INSTALL_PREFIX}/../nvidia/shmem")
-# message("Torch Install Prefix: ${TORCH_INSTALL_PREFIX}")
-
-find_path(NVSHMEM_INCLUDE_DIR nvshmem.h HINTS ${NVSHMEM_INSTALL_PREFIX}/include)
-find_library(NVSHMEM_HOST_LIBRARY nvshmem_host.so.3 HINTS ${NVSHMEM_INSTALL_PREFIX}/lib)
-find_library(NVSHMEM_DEVICE_LIBRARY nvshmem_device HINTS ${NVSHMEM_INSTALL_PREFIX}/lib)
+set(NVSHMEM_INSTALL_PREFIX "${TORCH_INSTALL_PREFIX}/../nvidia/shmem" CACHE PATH "NVSHMEM install root")
+
+find_path(NVSHMEM_INCLUDE_DIR nvshmem.h
+          HINTS ${NVSHMEM_INSTALL_PREFIX}/include
+          REQUIRED)
+find_library(NVSHMEM_HOST_LIBRARY
+             NAMES nvshmem_host nvshmem
+             HINTS ${NVSHMEM_INSTALL_PREFIX}/lib ${NVSHMEM_INSTALL_PREFIX}/lib64
+             REQUIRED)
+find_library(NVSHMEM_DEVICE_LIBRARY
+             NAMES nvshmem_device
+             HINTS ${NVSHMEM_INSTALL_PREFIX}/lib ${NVSHMEM_INSTALL_PREFIX}/lib64
+             REQUIRED)

Please confirm whether your deployment layout sometimes places NVSHMEM under lib64; if so, the added hint is required.


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "Repository root: $(pwd)"
echo
echo "=== Show cpp/tensorrt_llm/deep_ep/CMakeLists.txt (first 300 lines) ==="
sed -n '1,300p' cpp/tensorrt_llm/deep_ep/CMakeLists.txt || true

echo
echo "=== Search for NVSHMEM references ==="
rg "NVSHMEM|nvshmem" -n || true

echo
echo "=== Show surrounding context where NVSHMEM_INSTALL_PREFIX might be used ==="
rg -n "NVSHMEM_INSTALL_PREFIX" -n || true
rg -n "TORCH_INSTALL_PREFIX" -n || true

echo
echo "=== Search for usages of nvshmem_host or nvshmem_device in repo ==="
rg -n "nvshmem_host|nvshmem_device" -n || true

echo
echo "=== List typical lib directories referenced elsewhere (lib64) ==="
rg -n "lib64" -n || true



Make NVSHMEM discovery robust and fail-fast

Current code in cpp/tensorrt_llm/deep_ep/CMakeLists.txt hard-codes a Torch-relative install path, pins a specific SONAME (nvshmem_host.so.3), and does not use REQUIRED — this is brittle and will fail late on non-standard layouts or future NVSHMEM versions.

Files/locations to fix:

  • cpp/tensorrt_llm/deep_ep/CMakeLists.txt — lines ~41–47 (NVSHMEM_INSTALL_PREFIX and the three find_* calls).
  • Note: cpp/tensorrt_llm/CMakeLists.txt already uses find_package(NVSHMEM REQUIRED ...); also setup.py and scripts/build_wheel.py reference specific NVSHMEM .so files (e.g. nvshmem_bootstrap_uid.so.3), so ensure packaging and discovery remain consistent.

Apply this diff to make discovery configurable, search lib64, avoid SONAME pinning, and fail fast:

-set(NVSHMEM_INSTALL_PREFIX "${TORCH_INSTALL_PREFIX}/../nvidia/shmem")
-# message("Torch Install Prefix: ${TORCH_INSTALL_PREFIX}")
-
-find_path(NVSHMEM_INCLUDE_DIR nvshmem.h HINTS ${NVSHMEM_INSTALL_PREFIX}/include)
-find_library(NVSHMEM_HOST_LIBRARY nvshmem_host.so.3 HINTS ${NVSHMEM_INSTALL_PREFIX}/lib)
-find_library(NVSHMEM_DEVICE_LIBRARY nvshmem_device HINTS ${NVSHMEM_INSTALL_PREFIX}/lib)
+set(NVSHMEM_INSTALL_PREFIX "${TORCH_INSTALL_PREFIX}/../nvidia/shmem" CACHE PATH "NVSHMEM install root")
+
+find_path(NVSHMEM_INCLUDE_DIR nvshmem.h
+          HINTS ${NVSHMEM_INSTALL_PREFIX}/include
+          REQUIRED)
+find_library(NVSHMEM_HOST_LIBRARY
+             NAMES nvshmem_host nvshmem
+             HINTS ${NVSHMEM_INSTALL_PREFIX}/lib ${NVSHMEM_INSTALL_PREFIX}/lib64
+             REQUIRED)
+find_library(NVSHMEM_DEVICE_LIBRARY
+             NAMES nvshmem_device
+             HINTS ${NVSHMEM_INSTALL_PREFIX}/lib ${NVSHMEM_INSTALL_PREFIX}/lib64
+             REQUIRED)
🤖 Prompt for AI Agents
In cpp/tensorrt_llm/deep_ep/CMakeLists.txt around lines 41–47, the NVSHMEM
discovery is brittle: it hard-codes a Torch-relative NVSHMEM_INSTALL_PREFIX,
pins a specific SONAME, doesn't search lib64, and doesn't fail-fast. Replace the
fixed install prefix with a cache variable NVSHMEM_INSTALL_PREFIX (allowing
override), use find_path and find_library with NAMES (no SONAME suffix) and
HINTS ${NVSHMEM_INSTALL_PREFIX}/lib ${NVSHMEM_INSTALL_PREFIX}/lib64
${NVSHMEM_INSTALL_PREFIX}/include and system paths, mark the find_* calls
REQUIRED so configuration errors fail early, and prefer using
find_package(NVSHMEM REQUIRED ...) if available for consistency with the
top-level CMake; ensure variables (NVSHMEM_INCLUDE_DIR, NVSHMEM_HOST_LIBRARY,
NVSHMEM_DEVICE_LIBRARY) are set from those robust searches.


# Prepare files
# =============
@@ -81,87 +88,6 @@ foreach(_f IN LISTS _files)
PROPERTY CMAKE_CONFIGURE_DEPENDS ${_src})
endforeach()

# Delete stale nvshmem on patch update
set(NVSHMEM_STAMP_FILE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_stamp.txt)
file(SHA256 ${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch NVSHMEM_PATCH_HASH)
file(SHA256 ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch
NVSHMEM_PATCH_2_HASH)
set(NVSHMEM_STAMP_CONTENT "${NVSHMEM_URL_HASH}")
string(APPEND NVSHMEM_STAMP_CONTENT " PATCH_COMMAND v1")
string(APPEND NVSHMEM_STAMP_CONTENT " ${NVSHMEM_PATCH_HASH}")
string(APPEND NVSHMEM_STAMP_CONTENT " 103")
string(APPEND NVSHMEM_STAMP_CONTENT " ${NVSHMEM_PATCH_2_HASH}")
set(OLD_NVSHMEM_STAMP_CONTENT "")
if(EXISTS ${NVSHMEM_STAMP_FILE})
file(READ ${NVSHMEM_STAMP_FILE} OLD_NVSHMEM_STAMP_CONTENT)
endif()
if(NOT OLD_NVSHMEM_STAMP_CONTENT STREQUAL NVSHMEM_STAMP_CONTENT)
file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_project-prefix)
file(WRITE ${NVSHMEM_STAMP_FILE} "${NVSHMEM_STAMP_CONTENT}")
endif()
set_property(
DIRECTORY APPEND
PROPERTY CMAKE_CONFIGURE_DEPENDS
${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch
${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch)

# Add NVSHMEM
# ===========

# NVSHMEM only works with GCC. Building NVSHMEM with Clang results in
# compilation errors. Using NVSHMEM with Clang results in slow builds and device
# link issues.
if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(CMAKE_C_COMPILER gcc)
set(CMAKE_CXX_COMPILER g++)
set(CMAKE_CUDA_HOST_COMPILER g++)
endif()

# Add nvshmem external project
include(ExternalProject)
ExternalProject_Add(
nvshmem_project
URL file://${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_src_3.2.5-1.txz
URL_HASH ${NVSHMEM_URL_HASH}
PATCH_COMMAND patch -p1 --forward --batch -i
${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch
COMMAND sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i
src/CMakeLists.txt
COMMAND patch -p1 --forward --batch -i
${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch
CMAKE_CACHE_ARGS
-DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
-DCMAKE_C_COMPILER_LAUNCHER:STRING=${CMAKE_C_COMPILER_LAUNCHER}
-DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
-DCMAKE_CXX_COMPILER_LAUNCHER:STRING=${CMAKE_CXX_COMPILER_LAUNCHER}
-DCMAKE_CUDA_ARCHITECTURES:STRING=${DEEP_EP_CUDA_ARCHITECTURES}
-DCMAKE_CUDA_HOST_COMPILER:STRING=${CMAKE_CUDA_HOST_COMPILER}
-DCMAKE_CUDA_COMPILER_LAUNCHER:STRING=${CMAKE_CUDA_COMPILER_LAUNCHER}
-DNVSHMEM_BUILD_EXAMPLES:BOOL=0
-DNVSHMEM_BUILD_PACKAGES:BOOL=0
-DNVSHMEM_BUILD_TESTS:BOOL=0
-DNVSHMEM_IBGDA_SUPPORT:BOOL=1
-DNVSHMEM_IBRC_SUPPORT:BOOL=0
-DNVSHMEM_MPI_SUPPORT:BOOL=0
-DNVSHMEM_PMIX_SUPPORT:BOOL=0
-DNVSHMEM_SHMEM_SUPPORT:BOOL=0
-DNVSHMEM_TIMEOUT_DEVICE_POLLING:BOOL=0
-DNVSHMEM_UCX_SUPPORT:BOOL=0
-DNVSHMEM_USE_GDRCOPY:BOOL=0
-DNVSHMEM_USE_NCCL:BOOL=0
INSTALL_COMMAND ""
BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build
BUILD_BYPRODUCTS
${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/lib/libnvshmem.a)
add_library(nvshmem_project::nvshmem STATIC IMPORTED)
add_dependencies(nvshmem_project::nvshmem nvshmem_project)
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/include)
set_target_properties(
nvshmem_project::nvshmem
PROPERTIES IMPORTED_LOCATION
${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/lib/libnvshmem.a
INTERFACE_INCLUDE_DIRECTORIES
${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/include)

# Add DeepEP cpp
# ==============
@@ -188,7 +114,7 @@ set_target_properties(
CUDA_SEPARABLE_COMPILATION ON
CUDA_ARCHITECTURES "${DEEP_EP_CUDA_ARCHITECTURES}"
LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/deep_ep_cpp_tllm.version
INSTALL_RPATH "$ORIGIN/libs/nvshmem;${TORCH_INSTALL_PREFIX}/lib"
INSTALL_RPATH "${TORCH_INSTALL_PREFIX}/lib"
BUILD_WITH_INSTALL_RPATH TRUE)
target_compile_options(
deep_ep_cpp_tllm
@@ -197,8 +123,9 @@ target_compile_options(
target_compile_definitions(
deep_ep_cpp_tllm PRIVATE DISABLE_AGGRESSIVE_PTX_INSTRS
TORCH_EXTENSION_NAME=deep_ep_cpp_tllm)
target_include_directories(deep_ep_cpp_tllm PRIVATE ${NVSHMEM_INCLUDE_DIR})
target_link_libraries(
deep_ep_cpp_tllm PRIVATE nvshmem_project::nvshmem ${TORCH_LIBRARIES}
deep_ep_cpp_tllm PRIVATE ${NVSHMEM_DEVICE_LIBRARY} ${TORCH_LIBRARIES}
${TORCH_PYTHON_LIB})
target_link_options(
deep_ep_cpp_tllm PRIVATE
@@ -207,4 +134,4 @@ target_link_options(

# Set targets
# ===========
add_dependencies(deep_ep deep_ep_cpp_tllm nvshmem_project)
add_dependencies(deep_ep deep_ep_cpp_tllm)
2 changes: 1 addition & 1 deletion cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt
@@ -190,7 +190,7 @@ set_cuda_architectures(fb_gemm_src 89 90 120f)
# ${INSTANTIATION_GENERATION_DIR}/fp8_rowwise_gemm)

add_library(fp8_blockscale_gemm_src STATIC ${FP8_BLOCKSCALE_GEMM_SRC_CU})
set_cuda_architectures(fp8_blockscale_gemm_src 89 90 100f)
set_cuda_architectures(fp8_blockscale_gemm_src 90)

set(GEMM_SWIGLU_SM90_SRC_CU
${CMAKE_CURRENT_SOURCE_DIR}/fused_gated_gemm/gemm_swiglu_e4m3.cu)
cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh
@@ -24,6 +24,8 @@
#include <cuda/barrier>
#include <cute/arch/util.hpp>

#define PFN_cuTensorMapEncodeTiled PFN_cuTensorMapEncodeTiled_v12000


🛠️ Refactor suggestion

⚠️ Potential issue

Avoid redefining SDK typedefs via macros

#define PFN_cuTensorMapEncodeTiled PFN_cuTensorMapEncodeTiled_v12000 is brittle, violates macro naming guidelines, and can unintentionally affect other headers. Prefer explicit types or a local alias.

Remove the macro:

-#define PFN_cuTensorMapEncodeTiled PFN_cuTensorMapEncodeTiled_v12000
-
🤖 Prompt for AI Agents
In
cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh
around lines 27-28, remove the macro definition that rebinds
PFN_cuTensorMapEncodeTiled to PFN_cuTensorMapEncodeTiled_v12000; instead delete
the #define and create a local, scoped alias (preferably a typedef or C++
using-alias) that maps PFN_cuTensorMapEncodeTiled to
PFN_cuTensorMapEncodeTiled_v12000 within this translation unit or an appropriate
namespace to avoid leaking a macro into other headers. Ensure the alias is only
visible where needed and that no preprocessor macro remains.
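As a sketch of that alias-based alternative (not a committed change; it assumes the versioned typedef is visible, e.g. via cudaTypedefs.h in recent toolkits):

#include <cudaTypedefs.h> // assumed to provide PFN_cuTensorMapEncodeTiled_v12000

namespace tensorrt_llm::kernels::fp8_blockscale_gemm
{
// Scoped alias instead of a #define: only code in this namespace sees the
// versioned driver typedef, and nothing leaks into other headers.
using EncodeTiledFn = PFN_cuTensorMapEncodeTiled_v12000;

// The header's helpers can then use the alias in their signatures, e.g.:
EncodeTiledFn get_cuTensorMapEncodeTiled();
} // namespace tensorrt_llm::kernels::fp8_blockscale_gemm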

namespace tensorrt_llm::kernels::fp8_blockscale_gemm
{

@@ -84,7 +86,7 @@ inline CUtensorMapDataType get_CUtensorMapDataType()
}
}

PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled()
{
// Get pointer to cuTensorMapEncodeTiled
cudaDriverEntryPointQueryResult driver_status;
@@ -101,12 +103,12 @@ PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
throw std::runtime_error("driver_status != cudaDriverEntryPointSuccess");
}

return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(cuTensorMapEncodeTiled_ptr);
return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(cuTensorMapEncodeTiled_ptr);
}

template <typename data_type>
CUtensorMap make_2d_tma_copy_desc(data_type* global_address, uint64_t gmem_dim[2], uint64_t stride_in_bytes,
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
{
Comment on lines +109 to 110

🛠️ Refactor suggestion

Unify encode_func parameter type with stable PFN

Avoid exposing the versioned PFN type in public signatures.

-    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
+    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
🤖 Prompt for AI Agents
In
cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh
around lines 111-112, the function signature exposes the versioned PFN type
PFN_cuTensorMapEncodeTiled_v12000; change the parameter type to the stable,
unversioned PFN alias (PFN_cuTensorMapEncodeTiled) so the public signature
doesn't leak a versioned typedef, and update any forward declarations or
includes so the unversioned PFN type is visible where this header is compiled.

CUtensorMap tensor_map{};
constexpr uint32_t rank = 2;
@@ -2597,7 +2597,7 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
__shared__ typename BlockReduce::TempStorage temp_storage;
// Obtain a segment of consecutive items that are blocked across threads (final_max from above)
// Compute the block-wide max for thread0
final_max = BlockReduce(temp_storage).Reduce(thread_partial_max, cub::Max(), gridDim.z);
final_max = BlockReduce(temp_storage).Reduce(thread_partial_max, cuda::maximum(), gridDim.z);

__shared__ float final_max_smem;
if (tidx == 0)
4 changes: 2 additions & 2 deletions cpp/tensorrt_llm/kernels/sageAttentionKernels.cu
@@ -250,7 +250,7 @@ __global__ void sage_quant_kernel(void const* q, void const* k, void const* v, i

// Compute the block-wide max for thread0
// cuda::maximum<>{}
float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cub::Max{});
float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cuda::maximum{});

if (row_id == 0 && col_id == 0)
s_block_amax = static_cast<float>(aggregate);
@@ -429,7 +429,7 @@ __global__ void sage_quant_kernel(void const* q, void const* k, void const* v, i

// Compute the block-wide max for thread0
// cuda::maximum<>{}
float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cub::Max{});
float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cuda::maximum{});

if (row_id == 0 && col_id == 0)
s_block_amax = static_cast<float>(aggregate);
@@ -504,7 +504,7 @@ __global__ void prepareGenEagleNetInputsKernel(SizeType32* nextSequenceLengths,
BlockScan(tempStorage.scan).ExclusiveSum(numNextLogits, outputLastIndicesBase);
// Sync because tempStorage is reused.
__syncthreads();
auto const maxGenLength = BlockReduce(tempStorage.reduce).Reduce(nextDraftLen, cub::Max());
auto const maxGenLength = BlockReduce(tempStorage.reduce).Reduce(nextDraftLen, cuda::maximum());

// Thread 0 has the result.
if (bid == 0)
13 changes: 7 additions & 6 deletions cpp/tensorrt_llm/kernels/topkLastDim.cu
@@ -25,6 +25,8 @@
#include "topkLastDim.h"
#include <cub/cub.cuh>
#include <cuda/atomic>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

namespace tensorrt_llm
{
@@ -1221,9 +1223,9 @@ void standalone_stable_radix_topk_(void* buf, size_t& buf_size, T const* in, Idx
IdxT* sort_in_idx = nullptr;

air_topk_stable::ComputeOffset<IdxT> computeoffset(k);
cub::CountingInputIterator<IdxT> counting_iter(0);
cub::TransformInputIterator<IdxT, air_topk_stable::ComputeOffset<IdxT>, cub::CountingInputIterator<IdxT>>
transform_iter(counting_iter, computeoffset);
auto counting_iter = thrust::make_counting_iterator<IdxT>(0);
auto transform_iter = thrust::make_transform_iterator(counting_iter, computeoffset);

cub::DeviceSegmentedSort::SortPairs(NULL, temp_storage_bytes, out_idx, out_idx, out, out, k * batch_size,
batch_size, transform_iter, transform_iter + 1, stream);
if (sorted)
@@ -1348,9 +1350,8 @@ void standalone_stable_radix_topk_one_block_(void* buf, size_t& buf_size, T cons
const IdxT buf_len = air_topk_stable::calc_buf_len<T, IdxT, unsigned>(len);

air_topk_stable::ComputeOffset<IdxT> computeoffset(k);
cub::CountingInputIterator<IdxT> counting_iter(0);
cub::TransformInputIterator<IdxT, air_topk_stable::ComputeOffset<IdxT>, cub::CountingInputIterator<IdxT>>
transform_iter(counting_iter, computeoffset);
auto counting_iter = thrust::make_counting_iterator<IdxT>(0);
auto transform_iter = thrust::make_transform_iterator(counting_iter, computeoffset);

cub::DeviceSegmentedSort::SortPairs(NULL, temp_storage_bytes, out_idx, out_idx, out, out, k * batch_size,
batch_size, transform_iter, transform_iter + 1, stream);
@@ -154,7 +154,7 @@ __global__ void activationDeepSeekKernel(KernelParams params)
float constexpr E4m3MaxVal{448.f};

// Compute the absolute max
float aMax = BlockReduce(temp_storage).Reduce(fabsf(out), cub::Max());
float aMax = BlockReduce(temp_storage).Reduce(fabsf(out), cuda::maximum());
if (threadIdx.x == 0)
{
s_scaleOut = aMax / E4m3MaxVal;
@@ -657,7 +657,7 @@ __global__ void finalizeDeepSeekKernel(KernelParams params)
float constexpr E4m3MaxVal{448.f};

// Compute the absolute max
float aMax = BlockReduce(temp_storage).Reduce(fabsf(acc), cub::Max());
float aMax = BlockReduce(temp_storage).Reduce(fabsf(acc), cuda::maximum());

if (threadIdx.x == 0)
{
cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp
@@ -364,7 +364,7 @@ void* HostAccessibleDeviceAllocator::allocate(size_t memorySize)
TLLM_CHECK_WITH_INFO(
mAllowManagedFallback, "HostAccessibleDeviceAllocator is not supported on the current system.");
TLLM_CUDA_CHECK(cudaMallocManaged(&devPtr, memorySize));
TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, currentDevId));
cudaMemLocation location {cudaMemLocationTypeDevice, currentDevId};
TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
hostPtr = devPtr;
Comment on lines +367 to 369

💡 Verification agent

🧩 Analysis chain

Guard new cudaMemAdvise location-API for backward compatibility and fix indentation

The location-based cudaMemAdvise overload requires newer CUDA. Given CMake still allows CUDA >= 11.2, gate this with CUDART_VERSION to avoid compile errors with older toolkits. Also replace the tab with spaces to match file style.

Apply:

-	cudaMemLocation location {cudaMemLocationTypeDevice, currentDevId};
-        TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
+    #if CUDART_VERSION >= 12000
+    cudaMemLocation location{cudaMemLocationTypeDevice, currentDevId};
+    TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
+    #else
+    TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, currentDevId));
+    #endif

Alternatively, if you intend to raise the project’s minimum CUDA to 13, update find_package(CUDAToolkit ...) accordingly and remove the fallback.


🏁 Script executed:

#!/bin/bash
set -euo pipefail
FILE="cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp"
echo "PWD: $(pwd)"
if [ -f "$FILE" ]; then
  echo "Found file: $FILE"
  echo "---- File header (first 120 lines) ----"
  sed -n '1,120p' "$FILE"
  echo "---- Context lines 340-400 ----"
  sed -n '340,400p' "$FILE"
  echo "---- Search for relevant symbols in file ----"
  rg -n "TLLM_CUDA_CHECK|cudaMemAdvise|cudaMemLocation|CUDART_VERSION" "$FILE" || true
  echo "---- Lines containing tabs (show up to 200) ----"
  nl -ba "$FILE" | sed -n '1,520p' | rg --line-number $'\t' || true
else
  echo "File not found: $FILE"
  rg -n "hostAccessibleDeviceAllocator.cpp" || true
fi

echo "---- Search for find_package(CUDAToolkit in repo ----"
rg -n "find_package\\(CUDAToolkit" || true
echo "---- Search for CUDART_VERSION usage in repo ----"
rg -n "CUDART_VERSION" || true



Guard cudaMemAdvise(location) for older CUDA toolkits & fix indentation

The cudaMemLocation overload requires CUDA ≥ 12, and cpp/CMakeLists.txt still allows CUDA 11.2 — gate the new API with CUDART_VERSION and replace the stray tab with spaces.

  • File: cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp — replace the cudaMemLocation / cudaMemAdvise lines (around lines ~366–369).
  • CMake: cpp/CMakeLists.txt currently uses find_package(CUDAToolkit 11.2 ...), so keep the fallback unless you bump the min CUDA version.

Apply:

-	cudaMemLocation location {cudaMemLocationTypeDevice, currentDevId};
-        TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
+#if defined(CUDART_VERSION) && (CUDART_VERSION >= 12000)
+        cudaMemLocation location{cudaMemLocationTypeDevice, currentDevId};
+        TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
+#else
+        TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, currentDevId));
+#endif

Alternatively, if you intend to require CUDA ≥ 12 project-wide, update find_package(CUDAToolkit ...) in cpp/CMakeLists.txt and remove the fallback.

🤖 Prompt for AI Agents
In cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp
around lines 367–369, the current cudaMemLocation/cudaMemAdvise usage requires
CUDA ≥ 12 and also contains a stray tab; guard the new API with a preprocessor
check (e.g., #if defined(CUDART_VERSION) && CUDART_VERSION >= 12000) to call
cudaMemLocation and cudaMemAdvise only when available and provide the existing
fallback for older CUDA versions in the #else branch, and replace the stray tab
with spaces to fix indentation; do not change cpp/CMakeLists.txt unless you
intend to raise the minimum CUDA version project-wide.

}
recordAllocation(devPtr, memorySize, hostPtr, memDesc);
2 changes: 1 addition & 1 deletion cpp/tensorrt_llm/runtime/utils/debugUtils.cu
@@ -54,7 +54,7 @@ __global__ void checkTensorInvalidKernel(T const* data, std::size_t size, int* f
__shared__ typename BlockReduceT::TempStorage tempStorage;

// Compute block-wide maximum
int blockFound = BlockReduceT(tempStorage).Reduce(found, cub::Max());
int blockFound = BlockReduceT(tempStorage).Reduce(found, cuda::maximum());

// Have thread 0 write out block's result
if (threadIdx.x == 0)