Skip to content
Closed
11 changes: 11 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,7 @@ endif()
include_directories(
SYSTEM
${CUDAToolkit_INCLUDE_DIRS}
/usr/local/cuda-13.0/targets/x86_64-linux/include/cccl
${CUDNN_ROOT_DIR}/include
$<TARGET_PROPERTY:TensorRT::NvInfer,INTERFACE_INCLUDE_DIRECTORIES>
${3RDPARTY_DIR}/cutlass/include
Expand Down Expand Up @@ -249,6 +250,15 @@ if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "12.8")
)
endif()

if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0")
  message(
    STATUS
    "CUDAToolkit_VERSION ${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR} is greater than or equal to 13.0, adding visibility flags"
  )
  # --device-entity-has-hidden-visibility=false keeps device entities visible
  # across DSO boundaries; -cudart=shared links the shared CUDA runtime.
  # NOTE(review): CMAKE_CUDA_RUNTIME_LIBRARY=Shared would be the idiomatic way
  # to request the shared runtime instead of hand-rolling -cudart=shared.
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --device-entity-has-hidden-visibility=false -cudart=shared")
endif()

if(ENABLE_MULTI_DEVICE)
# MPI MPI isn't used until tensorrt_llm/CMakeLists.txt is invoked. However, if
# it's not called before "CMAKE_CXX_FLAGS" is set, it breaks on Windows for
Expand Down Expand Up @@ -365,6 +375,7 @@ if(NVCC_TIMING)
set(CMAKE_CUDA_FLAGS
"${CMAKE_CUDA_FLAGS} --time ${CMAKE_CURRENT_BINARY_DIR}/nvcc-timing.csv")
endif()

message("CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")

set(COMMON_HEADER_DIRS ${PROJECT_SOURCE_DIR} ${CUDAToolkit_INCLUDE_DIR})
Expand Down
6 changes: 3 additions & 3 deletions cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ constexpr CUtensorMapDataType get_CUtensorMapDataType()
}
}

PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled()
{
// Get pointer to `cuTensorMapEncodeTiled`
cudaDriverEntryPointQueryResult driver_status;
Expand All @@ -110,12 +110,12 @@ PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()

if (driver_status != cudaDriverEntryPointSuccess)
throw std::runtime_error("driver_status != cudaDriverEntryPointSuccess");
return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(cuTensorMapEncodeTiled_ptr);
return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(cuTensorMapEncodeTiled_ptr);
}

template <typename T>
CUtensorMap make_2d_tma_copy_desc(T* global_address, uint64_t gmem_dim[2], uint64_t stride_in_bytes,
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
{
Comment on lines 116 to 119
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Verification agent

🧩 Analysis chain

Default parameter type should match the stable PFN type

Align the encode_func param with the stable, unversioned type to avoid exposing versioned types.

-    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
+    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)

Double-check the other call sites for signature consistency.


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== show file (if exists) ==="
FILE="cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh"
if [ -f "$FILE" ]; then
  sed -n '1,240p' "$FILE" || true
else
  echo "File not found: $FILE"
fi

echo
echo "=== rg: PFN_cuTensorMapEncodeTiled_v12000 ==="
rg --hidden -n "PFN_cuTensorMapEncodeTiled_v12000" || true

echo
echo "=== rg: PFN_cuTensorMapEncodeTiled ==="
rg --hidden -nP "PFN_cuTensorMapEncodeTiled(?!_v12000)" || true

echo
echo "=== rg: make_2d_tma_copy_desc ==="
rg --hidden -n "make_2d_tma_copy_desc" || true

echo
echo "=== rg: cuTensorMapEncodeTiled occurrences ==="
rg --hidden -n "cuTensorMapEncodeTiled" || true

echo
echo "=== rg: cuTensorMap related typedefs/usages ==="
rg --hidden -n "cuTensorMap" || true

Length of output: 25549


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== fp8_blockscale_tma_utils.cuh (head) ==="
sed -n '1,240p' cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh || true

echo
echo "=== cudaDriverWrapper.h (head) ==="
sed -n '1,320p' cpp/tensorrt_llm/common/cudaDriverWrapper.h || true

echo
echo "=== cudaDriverWrapper.cpp (head) ==="
sed -n '1,360p' cpp/tensorrt_llm/common/cudaDriverWrapper.cpp || true

echo
echo "=== tma_utils.cuh (relevant region) ==="
sed -n '1,220p' cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh || true

echo
echo "=== rg: where PFN_cuTensorMapEncodeTiled is defined/mentioned ==="
rg --hidden -n "PFN_cuTensorMapEncodeTiled" || true

Length of output: 32409


Prefer the stable PFN_cuTensorMapEncodeTiled type for the default parameter

Use the unversioned PFN to avoid leaking the versioned typedef; callers I checked do not pass encode_func explicitly.

Files to update:

  • cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh

    • Replace:
      • uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
    • With:
      • uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
  • cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh

    • Make the same signature change (or rely on the existing
      #define PFN_cuTensorMapEncodeTiled PFN_cuTensorMapEncodeTiled_v12000 and prefer using the unversioned name).

Call sites verified (no explicit encode_func argument): cpp/include/tensorrt_llm/deep_gemm/fp8_gemm.cuh, cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm_kernel.cuh, etc.

Optional safety: add a local fallback in headers to map the unversioned name to the v12000 typedef if the unversioned PFN is not present:

#ifndef PFN_cuTensorMapEncodeTiled
#define PFN_cuTensorMapEncodeTiled PFN_cuTensorMapEncodeTiled_v12000
#endif

🤖 Prompt for AI Agents
In cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh around lines 116-119, the
function signature uses the versioned typedef PFN_cuTensorMapEncodeTiled_v12000
as the default parameter; change that parameter type to the unversioned
PFN_cuTensorMapEncodeTiled (i.e., replace PFN_cuTensorMapEncodeTiled_v12000 with
PFN_cuTensorMapEncodeTiled) so callers that don’t pass encode_func keep using
the stable name, and make the same signature change in
cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh
(or rely on an existing #define mapping the unversioned name to the v12000
typedef); optionally add a local fallback macro guard at the top of the headers:
if PFN_cuTensorMapEncodeTiled is not defined, #define it to
PFN_cuTensorMapEncodeTiled_v12000.

CUtensorMap tensor_map{};
constexpr uint32_t rank = 2;
Expand Down
96 changes: 12 additions & 84 deletions cpp/tensorrt_llm/deep_ep/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,14 @@ endif()

# Ensure that dependent libraries are installed
find_library(MLX5_lib NAMES mlx5 REQUIRED)
# NVSHMEM is expected to ship alongside the Torch wheel set under
# <torch>/../nvidia/nvshmem; resolve headers and libraries from there.
set(NVSHMEM_INSTALL_PREFIX "${TORCH_INSTALL_PREFIX}/../nvidia/nvshmem")

# Fail at configure time (REQUIRED) rather than at link time if NVSHMEM is
# missing, matching the REQUIRED find_library(MLX5_lib ...) above.
find_path(NVSHMEM_INCLUDE_DIR nvshmem.h HINTS ${NVSHMEM_INSTALL_PREFIX}/include
          REQUIRED)
find_library(NVSHMEM_DEVICE_LIBRARY NAMES nvshmem_device
             HINTS ${NVSHMEM_INSTALL_PREFIX}/lib REQUIRED)
# Prefer the conventional library name; fall back to the versioned soname that
# the pip wheel ships (no unversioned .so symlink is guaranteed there).
find_library(NVSHMEM_HOST_LIBRARY NAMES nvshmem_host libnvshmem_host.so.3
             HINTS ${NVSHMEM_INSTALL_PREFIX}/lib REQUIRED)


# Prepare files
# =============
Expand Down Expand Up @@ -81,87 +89,6 @@ foreach(_f IN LISTS _files)
PROPERTY CMAKE_CONFIGURE_DEPENDS ${_src})
endforeach()

# Delete stale nvshmem on patch update
set(NVSHMEM_STAMP_FILE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_stamp.txt)
file(SHA256 ${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch NVSHMEM_PATCH_HASH)
file(SHA256 ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch
NVSHMEM_PATCH_2_HASH)
set(NVSHMEM_STAMP_CONTENT "${NVSHMEM_URL_HASH}")
string(APPEND NVSHMEM_STAMP_CONTENT " PATCH_COMMAND v1")
string(APPEND NVSHMEM_STAMP_CONTENT " ${NVSHMEM_PATCH_HASH}")
string(APPEND NVSHMEM_STAMP_CONTENT " 103")
string(APPEND NVSHMEM_STAMP_CONTENT " ${NVSHMEM_PATCH_2_HASH}")
set(OLD_NVSHMEM_STAMP_CONTENT "")
if(EXISTS ${NVSHMEM_STAMP_FILE})
file(READ ${NVSHMEM_STAMP_FILE} OLD_NVSHMEM_STAMP_CONTENT)
endif()
if(NOT OLD_NVSHMEM_STAMP_CONTENT STREQUAL NVSHMEM_STAMP_CONTENT)
file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_project-prefix)
file(WRITE ${NVSHMEM_STAMP_FILE} "${NVSHMEM_STAMP_CONTENT}")
endif()
set_property(
DIRECTORY APPEND
PROPERTY CMAKE_CONFIGURE_DEPENDS
${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch
${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch)

# Add NVSHMEM
# ===========

# NVSHMEM only works with GCC. Building NVSHMEM with Clang results in
# compilation errors. Using NVSHMEM with Clang results in slow builds and device
# link issues.
if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(CMAKE_C_COMPILER gcc)
set(CMAKE_CXX_COMPILER g++)
set(CMAKE_CUDA_HOST_COMPILER g++)
endif()

# Add nvshmem external project
include(ExternalProject)
ExternalProject_Add(
nvshmem_project
URL file://${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_src_3.2.5-1.txz
URL_HASH ${NVSHMEM_URL_HASH}
PATCH_COMMAND patch -p1 --forward --batch -i
${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch
COMMAND sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i
src/CMakeLists.txt
COMMAND patch -p1 --forward --batch -i
${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch
CMAKE_CACHE_ARGS
-DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
-DCMAKE_C_COMPILER_LAUNCHER:STRING=${CMAKE_C_COMPILER_LAUNCHER}
-DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
-DCMAKE_CXX_COMPILER_LAUNCHER:STRING=${CMAKE_CXX_COMPILER_LAUNCHER}
-DCMAKE_CUDA_ARCHITECTURES:STRING=${DEEP_EP_CUDA_ARCHITECTURES}
-DCMAKE_CUDA_HOST_COMPILER:STRING=${CMAKE_CUDA_HOST_COMPILER}
-DCMAKE_CUDA_COMPILER_LAUNCHER:STRING=${CMAKE_CUDA_COMPILER_LAUNCHER}
-DNVSHMEM_BUILD_EXAMPLES:BOOL=0
-DNVSHMEM_BUILD_PACKAGES:BOOL=0
-DNVSHMEM_BUILD_TESTS:BOOL=0
-DNVSHMEM_IBGDA_SUPPORT:BOOL=1
-DNVSHMEM_IBRC_SUPPORT:BOOL=0
-DNVSHMEM_MPI_SUPPORT:BOOL=0
-DNVSHMEM_PMIX_SUPPORT:BOOL=0
-DNVSHMEM_SHMEM_SUPPORT:BOOL=0
-DNVSHMEM_TIMEOUT_DEVICE_POLLING:BOOL=0
-DNVSHMEM_UCX_SUPPORT:BOOL=0
-DNVSHMEM_USE_GDRCOPY:BOOL=0
-DNVSHMEM_USE_NCCL:BOOL=0
INSTALL_COMMAND ""
BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build
BUILD_BYPRODUCTS
${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/lib/libnvshmem.a)
add_library(nvshmem_project::nvshmem STATIC IMPORTED)
add_dependencies(nvshmem_project::nvshmem nvshmem_project)
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/include)
set_target_properties(
nvshmem_project::nvshmem
PROPERTIES IMPORTED_LOCATION
${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/lib/libnvshmem.a
INTERFACE_INCLUDE_DIRECTORIES
${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/include)

# Add DeepEP cpp
# ==============
Expand All @@ -188,7 +115,7 @@ set_target_properties(
CUDA_SEPARABLE_COMPILATION ON
CUDA_ARCHITECTURES "${DEEP_EP_CUDA_ARCHITECTURES}"
LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/deep_ep_cpp_tllm.version
INSTALL_RPATH "$ORIGIN/libs/nvshmem;${TORCH_INSTALL_PREFIX}/lib"
INSTALL_RPATH "${TORCH_INSTALL_PREFIX}/lib;${NVSHMEM_INSTALL_PREFIX}/lib"
BUILD_WITH_INSTALL_RPATH TRUE)
target_compile_options(
deep_ep_cpp_tllm
Expand All @@ -197,8 +124,9 @@ target_compile_options(
target_compile_definitions(
deep_ep_cpp_tllm PRIVATE DISABLE_AGGRESSIVE_PTX_INSTRS
TORCH_EXTENSION_NAME=deep_ep_cpp_tllm)
target_include_directories(deep_ep_cpp_tllm PRIVATE ${NVSHMEM_INCLUDE_DIR})
target_link_libraries(
deep_ep_cpp_tllm PRIVATE nvshmem_project::nvshmem ${TORCH_LIBRARIES}
deep_ep_cpp_tllm PRIVATE ${NVSHMEM_DEVICE_LIBRARY} ${NVSHMEM_HOST_LIBRARY} ${TORCH_LIBRARIES}
${TORCH_PYTHON_LIB})
target_link_options(
deep_ep_cpp_tllm PRIVATE
Expand All @@ -207,4 +135,4 @@ target_link_options(

# Set targets
# ===========
add_dependencies(deep_ep deep_ep_cpp_tllm nvshmem_project)
add_dependencies(deep_ep deep_ep_cpp_tllm)
26 changes: 0 additions & 26 deletions cpp/tensorrt_llm/kernels/beamSearchKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -134,32 +134,6 @@ void invokeUpdateCacheIndirection(int* tgtCI, int const* srcCI, BeamHypotheses&
sync_check_cuda_error(stream);
}

// Adds the per-beam cumulative log-probability (plus a diversity bonus) onto the
// stage-1 candidate scores produced by beam search, one CUDA block per request.
// pStage1LogProbs is laid out per request as [nBMIn, nBMOut, 2] flattened
// (nBMIn * nBMOut * 2 candidates); cumLogProbs is indexed with stride nBM
// (kMaxBeamWidth) — see the inline comment about VBWS below.
template <typename T>
__global__ void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM)
{
int const bid = blockIdx.x; // Index of request in batch
runtime::SizeType32 const slot = batchSlots[bid]; // Batch slot owning this request's state
float const diversityRate{diversityRates[slot]};
// Base of this request's flattened candidate scores.
T* pLocalLogProbs = pStage1LogProbs + bid * nBMIn * nBMOut * 2;

// Threads stride over all nBMIn * nBMOut * 2 candidates of the request.
for (int i = threadIdx.x; i < nBMIn * nBMOut * 2; i += blockDim.x)
{
int const iBMIn = i / (nBMOut * 2); // Input beam this candidate came from
if (finished[slot * nBMIn + iBMIn].isFinished())
{
// NOTE(review): `i` is a flattened candidate index, not a token id, so
// comparing it against endIds[slot] (an EOS token id) looks suspect and
// may never match — confirm intended semantics before relying on this.
pLocalLogProbs[i] += (i == endIds[slot]) ? 1.0f : 0.0f;
}
else
{
// nBM is used in VBWS since `cumLogProbs` is initialized with kMaxBeamWidth earlier than BeamSearchLayer
pLocalLogProbs[i] += cumLogProbs[slot * nBM + iBMIn] + diversityRate * iBMIn;
}
}
return;
}

template __global__ void addCumLogProbs<float>(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
Expand Down
28 changes: 26 additions & 2 deletions cpp/tensorrt_llm/kernels/beamSearchKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,34 @@ void invokeTopkBeamSearch(T const* logProbs, T const* bias, void* workspace, Bea
void invokeUpdateCacheIndirection(int* tgtCI, int const* srcCI, BeamHypotheses& bh,
runtime::SizeType32 const maxAttentionWindow, runtime::SizeType32 sinkTokenLength, cudaStream_t stream);

#ifdef __CUDACC__
template <typename T>
__global__ void addCumLogProbs(T* __restrict pStage1Probs, float const* __restrict cumLogProbs,
__global__ __attribute__((visibility("default"))) void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM)
{
int const bid = blockIdx.x; // Index of request in batch
runtime::SizeType32 const slot = batchSlots[bid];
float const diversityRate{diversityRates[slot]};
T* pLocalLogProbs = pStage1LogProbs + bid * nBMIn * nBMOut * 2;

for (int i = threadIdx.x; i < nBMIn * nBMOut * 2; i += blockDim.x)
{
int const iBMIn = i / (nBMOut * 2);
if (finished[slot * nBMIn + iBMIn].isFinished())
{
pLocalLogProbs[i] += (i == endIds[slot]) ? 1.0f : 0.0f;
}
else
Comment on lines +147 to +151
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Verification agent

🧩 Analysis chain

EOS/endIds indexing appears incorrect and likely out-of-bounds for [BS, BM] layout

endIds is documented as [BS, BM], but the code indexes it as endIds[slot] and compares it to i (a flattened [BM_in, BM_out, 2] index). Two issues:

  • Indexing: endIds[slot] ignores the beam dimension; for per-beam EOS, it should likely be endIds[slot * nBM + iBMIn] (or nBMIn, depending on layout).
  • Semantics: i indexes the flattened candidate grid, not the vocabulary/token ID dimension. Comparing i to an EOS token ID does not look meaningful and may never match.

This can cause incorrect scoring for finished beams and undefined behavior if endIds has only [BS, BM] elements.

Please confirm shapes and intended semantics. If indeed per-beam EOS, a minimal indexing fix would be:

  • Use the per-beam EOS id:

    • int const eosId = endIds[slot * nBM + iBMIn]; // or slot * nBMIn + iBMIn if finished is also sized that way.
  • Compare against the actual token index in pStage1LogProbs, not the flattened [BM_in, BM_out, 2] index. If stage-1 buffer includes a vocab dimension elsewhere, determine the local token index before comparison.

To help locate references and prior logic, run:


🏁 Script executed:

#!/bin/bash
# Look for other usages of endIds indexing and EOS handling in kernels
rg -n -A3 -B3 $'endIds\\[' cpp | sed -n '1,200p'

# Find all write sites for pStage1LogProbs to infer its layout
rg -n -A3 -B3 'pStage1(Log)?Probs' cpp | sed -n '1,200p'

# Find all callers of addCumLogProbs and their launch grids to reason about mapping
rg -n -A3 -B3 'addCumLogProbs\\s*<|<<<' cpp | sed -n '1,200p'

Length of output: 45150


🏁 Script executed:

#!/bin/bash
sed -n '1,240p' cpp/tensorrt_llm/kernels/beamSearchKernels.h

Length of output: 15146


🏁 Script executed:

#!/bin/bash
sed -n '1,420p' cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernelsTemplate.h

Length of output: 17803


Fix required — addCumLogProbs compares candidate index to EOS token and ignores beam dim

Short summary: addCumLogProbs (cpp/tensorrt_llm/kernels/beamSearchKernels.h) iterates i over the flattened candidate grid (nBMIn * nBMOut * 2) and does (i == endIds[slot]). i is a candidate index, not a token id; also the code ignores the beam index in endIds. This will almost never match the EOS token and therefore fails to prefer EOS for finished beams (and is inconsistent with other kernels that treat endIds as per-slot token ids).

Locations to fix:

  • cpp/tensorrt_llm/kernels/beamSearchKernels.h
    • Kernel addCumLogProbs — replace the incorrect comparison and add access to the stage-1 ids.
  • cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernelsTemplate.h
    • Call site that launches addCumLogProbs must pass pStage1Ids (pStage1Ids is already allocated in this scope).

Suggested minimal changes (apply and adjust to your code style):

  • Change kernel signature to accept the stage-1 ids:

    • before: addCumLogProbs(T* pStage1LogProbs, float const* cumLogProbs, FinishedState const* finished, int const* endIds, float const* diversityRates, ...)
    • after: addCumLogProbs(T* pStage1LogProbs, float const* cumLogProbs, FinishedState const* finished, int const* endIds, int const* pStage1Ids, float const* diversityRates, ...)
  • In the kernel body replace the block:

    • before:
      pLocalLogProbs[i] += (i == endIds[slot]) ? 1.0f : 0.0f;
    • after (minimal, assuming endIds is per-slot token id; if endIds is per-beam use the per-beam index shown below):
      int const* pLocalIds = pStage1Ids + bid * nBMIn * nBMOut * 2;
      int const eosId = endIds[slot]; // or: endIds[slot * nBM + iBMIn] if endIds is [BS, BM]
      pLocalLogProbs[i] += (pLocalIds[i] == eosId) ? (T)1.0f : (T)0.0f;
  • Update the kernel launch in beamSearchKernelsTemplate.h:

    • before:
      addCumLogProbs<<<nBS, nThread>>>(pStage1LogProbs, bh.cumLogProbs, bh.finished, bh.endIds, bh.diversityRates, bh.batchSlots, ...);
    • after:
      addCumLogProbs<<<nBS, nThread>>>(pStage1LogProbs, bh.cumLogProbs, bh.finished, bh.endIds, pStage1Ids, bh.diversityRates, bh.batchSlots, ...);

Extra: clarify the declared shape of endIds in BeamHypotheses (cpp/tensorrt_llm/kernels/beamSearchKernels.h) — comments currently read [BS, BM] but most kernels use endIds[slot] (per-slot). Confirm whether endIds should be [BS] or [BS, BM] and update code/comments and indexing across kernels accordingly.

Reason: The kernel must compare the candidate's token id (from stage-1 ids) to the EOS token id; comparing the flattened candidate index is incorrect and will break finished-beam handling.

🤖 Prompt for AI Agents
In cpp/tensorrt_llm/kernels/beamSearchKernels.h around lines 147-151,
addCumLogProbs currently compares the flattened candidate index i to
endIds[slot] which is wrong — you must compare the candidate token id from
stage-1 ids and account for beam dim; update the kernel signature to accept int
const* pStage1Ids, inside the kernel compute a pointer to the local stage1 ids
for this batch/beam region (e.g. pStage1Ids + bid * nBMIn * nBMOut * 2) and
replace the (i == endIds[slot]) test with a comparison of pLocalIds[i] to the
appropriate eosId (use endIds[slot] if endIds is per-slot, or endIds[slot * nBM
+ iBMIn] if per-beam), then add pStage1Ids to the addCumLogProbs kernel launch
in cpp/tensorrt_llm/kernels/beamSearchKernelsTemplate.h so the kernel receives
the stage-1 ids; also verify and if needed fix/comment the declared
shape/indexing of endIds across BeamHypotheses to be consistently [BS] or [BS,
BM].

{
// nBM is used in VBWS since `cumLogProbs` is initialized with kMaxBeamWidth earlier than BeamSearchLayer
pLocalLogProbs[i] += cumLogProbs[slot * nBM + iBMIn] + diversityRate * iBMIn;
}
}
return;
}
#endif
Comment on lines +133 to +159
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Verification agent

🧩 Analysis chain

Avoid ODR/duplicate definitions: add extern templates or move the kernel out of the header

Defining a templated __global__ kernel in a header while also explicitly instantiating it in a .cu risks multiple definitions if any TU implicitly instantiates it via a launch. Either:

  • Keep the definition in the .cu and leave only a declaration in the header, or
  • Keep the definition in the header and prevent implicit instantiation with extern template declarations, keeping explicit instantiations in the .cu.

Minimal change (add extern templates under the CUDA guard):

 #ifdef __CUDACC__
 template <typename T>
 __global__ __attribute__((visibility("default"))) void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
     FinishedState const* finished, int const* endIds, float const* diversityRates,
     runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM)
 {
@@
 }
-#endif
+// Prevent implicit instantiation; explicit instantiations live in the .cu
+extern template __global__ __attribute__((visibility("default"))) void addCumLogProbs<float>(
+    float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs, FinishedState const* finished,
+    int const* __restrict endIds, int const* __restrict pStage1Ids, float const* __restrict diversityRates,
+    runtime::SizeType32 const* __restrict batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
+
+extern template __global__ __attribute__((visibility("default"))) void addCumLogProbs<half>(
+    half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs, FinishedState const* finished,
+    int const* __restrict endIds, int const* __restrict pStage1Ids, float const* __restrict diversityRates,
+    runtime::SizeType32 const* __restrict batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
+#endif

And update the explicit instantiations in the .cu to match the new signature:

// cpp/tensorrt_llm/kernels/beamSearchKernels.cu (adjust signature)
template __global__ void addCumLogProbs<float>(float* __restrict, float const* __restrict,
    FinishedState const*, int const* __restrict, int const* __restrict, float const* __restrict,
    runtime::SizeType32 const* __restrict, size_t const, size_t const, size_t const, size_t const);

template __global__ void addCumLogProbs<half>(half* __restrict, float const* __restrict,
    FinishedState const*, int const* __restrict, int const* __restrict, float const* __restrict,
    runtime::SizeType32 const* __restrict, size_t const, size_t const, size_t const, size_t const);

Alternative: Move the kernel body to the .cu and leave a guarded declaration in the header to avoid any template instantiation from headers.

To confirm there are explicit instantiations today and avoid duplicates, run:


🏁 Script executed:

#!/bin/bash
rg -n -A2 -B2 $'template\\s+__global__\\s+void\\s+addCumLogProbs' cpp | sed -n '1,200p'

Length of output: 1181


Move the CUDA kernel body out of the header (or add correct extern template specializations) — explicit instantiations exist in the .cu

The header currently defines a templated global kernel while cpp/tensorrt_llm/kernels/beamSearchKernels.cu already contains explicit instantiations for float and half (rg found instantiations at lines ~137 and ~141). This risks ODR/duplicate-definition errors. Fix by removing the definition from the header and leaving only a declaration, or (less preferred) add matching extern template specializations.

Files to change

  • cpp/tensorrt_llm/kernels/beamSearchKernels.h — replace the kernel definition (lines ~133–159) with a declaration.
  • cpp/tensorrt_llm/kernels/beamSearchKernels.cu — already has explicit instantiations; verify they match the declaration (they do: float and half specializations exist).

Suggested minimal change (preferred: move body to .cu; in the header leave only the declaration):

 #ifdef __CUDACC__
-template <typename T>
-__global__ __attribute__((visibility("default"))) void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
-    FinishedState const* finished, int const* endIds, float const* diversityRates,
-    runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM)
-{
-    ...
-}
+#ifdef __CUDACC__
+template <typename T>
+__global__ __attribute__((visibility("default"))) void addCumLogProbs(
+    T* __restrict pStage1LogProbs,
+    float const* __restrict cumLogProbs,
+    FinishedState const* finished,
+    int const* endIds,
+    float const* diversityRates,
+    runtime::SizeType32 const* batchSlots,
+    size_t const nBS,
+    size_t const nBMIn,
+    size_t const nBMOut,
+    size_t const nBM);
 #endif

Optional alternative (if you must keep the body in the header): add extern template declarations that exactly match the explicit instantiations in the .cu (use the same parameter list and types). Example externs (place under the CUDA guard after the definition if you keep it, though moving the body is safer):

extern template __global__ void addCumLogProbs<float>(
    float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
    FinishedState const* finished, int const* endIds, float const* diversityRates,
    runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);

extern template __global__ void addCumLogProbs<half>(
    half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
    FinishedState const* finished, int const* endIds, float const* diversityRates,
    runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);

Action: prefer moving the kernel body to the .cu and keep the declaration in the header; that aligns with the existing explicit instantiations and avoids ODR issues.



__global__ void gatherId(int const* __restrict pStage1Id, int* __restrict pStage2Id, size_t const nBS,
size_t const nBMIn, size_t const nBMOut, size_t const nV);
Expand Down
2 changes: 1 addition & 1 deletion cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ set_cuda_architectures(fb_gemm_src 89 90 120f)
# ${INSTANTIATION_GENERATION_DIR}/fp8_rowwise_gemm)

add_library(fp8_blockscale_gemm_src STATIC ${FP8_BLOCKSCALE_GEMM_SRC_CU})
set_cuda_architectures(fp8_blockscale_gemm_src 89 90 100f)
set_cuda_architectures(fp8_blockscale_gemm_src 90)

set(GEMM_SWIGLU_SM90_SRC_CU
${CMAKE_CURRENT_SOURCE_DIR}/fused_gated_gemm/gemm_swiglu_e4m3.cu)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ inline CUtensorMapDataType get_CUtensorMapDataType()
}
}

PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled()
{
// Get pointer to cuTensorMapEncodeTiled
cudaDriverEntryPointQueryResult driver_status;
Expand All @@ -101,12 +101,12 @@ PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
throw std::runtime_error("driver_status != cudaDriverEntryPointSuccess");
}

return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(cuTensorMapEncodeTiled_ptr);
return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(cuTensorMapEncodeTiled_ptr);
}

template <typename data_type>
CUtensorMap make_2d_tma_copy_desc(data_type* global_address, uint64_t gmem_dim[2], uint64_t stride_in_bytes,
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
{
Comment on lines +109 to 110
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Unify encode_func parameter type with stable PFN

Avoid exposing the versioned PFN type in public signatures.

-    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
+    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
{
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
{
🤖 Prompt for AI Agents
In
cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh
around lines 111-112, the function signature exposes the versioned PFN type
PFN_cuTensorMapEncodeTiled_v12000; change the parameter type to the stable,
unversioned PFN alias (PFN_cuTensorMapEncodeTiled) so the public signature
doesn't leak a versioned typedef, and update any forward declarations or
includes so the unversioned PFN type is visible where this header is compiled.

CUtensorMap tensor_map{};
constexpr uint32_t rank = 2;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2597,7 +2597,7 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
__shared__ typename BlockReduce::TempStorage temp_storage;
// Obtain a segment of consecutive items that are blocked across threads (final_max from above)
// Compute the block-wide max for thread0
final_max = BlockReduce(temp_storage).Reduce(thread_partial_max, cub::Max(), gridDim.z);
final_max = BlockReduce(temp_storage).Reduce(thread_partial_max, cuda::maximum(), gridDim.z);

__shared__ float final_max_smem;
if (tidx == 0)
Expand Down
4 changes: 2 additions & 2 deletions cpp/tensorrt_llm/kernels/sageAttentionKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ __global__ void sage_quant_kernel(void const* q, void const* k, void const* v, i

// Compute the block-wide max for thread0
// cuda::maximum<>{}
float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cub::Max{});
float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cuda::maximum{});

if (row_id == 0 && col_id == 0)
s_block_amax = static_cast<float>(aggregate);
Expand Down Expand Up @@ -429,7 +429,7 @@ __global__ void sage_quant_kernel(void const* q, void const* k, void const* v, i

// Compute the block-wide max for thread0
// cuda::maximum<>{}
float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cub::Max{});
float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cuda::maximum{});

if (row_id == 0 && col_id == 0)
s_block_amax = static_cast<float>(aggregate);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -504,7 +504,7 @@ __global__ void prepareGenEagleNetInputsKernel(SizeType32* nextSequenceLengths,
BlockScan(tempStorage.scan).ExclusiveSum(numNextLogits, outputLastIndicesBase);
// Sync because tempStorage is reused.
__syncthreads();
auto const maxGenLength = BlockReduce(tempStorage.reduce).Reduce(nextDraftLen, cub::Max());
auto const maxGenLength = BlockReduce(tempStorage.reduce).Reduce(nextDraftLen, cuda::maximum());

// Thread 0 has the result.
if (bid == 0)
Expand Down
13 changes: 7 additions & 6 deletions cpp/tensorrt_llm/kernels/topkLastDim.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
#include "topkLastDim.h"
#include <cub/cub.cuh>
#include <cuda/atomic>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

namespace tensorrt_llm
{
Expand Down Expand Up @@ -1221,9 +1223,9 @@ void standalone_stable_radix_topk_(void* buf, size_t& buf_size, T const* in, Idx
IdxT* sort_in_idx = nullptr;

air_topk_stable::ComputeOffset<IdxT> computeoffset(k);
cub::CountingInputIterator<IdxT> counting_iter(0);
cub::TransformInputIterator<IdxT, air_topk_stable::ComputeOffset<IdxT>, cub::CountingInputIterator<IdxT>>
transform_iter(counting_iter, computeoffset);
auto counting_iter = thrust::make_counting_iterator<IdxT>(0);
auto transform_iter = thrust::make_transform_iterator(counting_iter, computeoffset);

cub::DeviceSegmentedSort::SortPairs(NULL, temp_storage_bytes, out_idx, out_idx, out, out, k * batch_size,
batch_size, transform_iter, transform_iter + 1, stream);
if (sorted)
Expand Down Expand Up @@ -1348,9 +1350,8 @@ void standalone_stable_radix_topk_one_block_(void* buf, size_t& buf_size, T cons
const IdxT buf_len = air_topk_stable::calc_buf_len<T, IdxT, unsigned>(len);

air_topk_stable::ComputeOffset<IdxT> computeoffset(k);
cub::CountingInputIterator<IdxT> counting_iter(0);
cub::TransformInputIterator<IdxT, air_topk_stable::ComputeOffset<IdxT>, cub::CountingInputIterator<IdxT>>
transform_iter(counting_iter, computeoffset);
auto counting_iter = thrust::make_counting_iterator<IdxT>(0);
auto transform_iter = thrust::make_transform_iterator(counting_iter, computeoffset);

cub::DeviceSegmentedSort::SortPairs(NULL, temp_storage_bytes, out_idx, out_idx, out, out, k * batch_size,
batch_size, transform_iter, transform_iter + 1, stream);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ __global__ void activationDeepSeekKernel(KernelParams params)
float constexpr E4m3MaxVal{448.f};

// Compute the absolute max
float aMax = BlockReduce(temp_storage).Reduce(fabsf(out), cub::Max());
float aMax = BlockReduce(temp_storage).Reduce(fabsf(out), cuda::maximum());
if (threadIdx.x == 0)
{
s_scaleOut = aMax / E4m3MaxVal;
Expand Down Expand Up @@ -657,7 +657,7 @@ __global__ void finalizeDeepSeekKernel(KernelParams params)
float constexpr E4m3MaxVal{448.f};

// Compute the absolute max
float aMax = BlockReduce(temp_storage).Reduce(fabsf(acc), cub::Max());
float aMax = BlockReduce(temp_storage).Reduce(fabsf(acc), cuda::maximum());

if (threadIdx.x == 0)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,8 @@ void* HostAccessibleDeviceAllocator::allocate(size_t memorySize)
TLLM_CHECK_WITH_INFO(
mAllowManagedFallback, "HostAccessibleDeviceAllocator is not supported on the current system.");
TLLM_CUDA_CHECK(cudaMallocManaged(&devPtr, memorySize));
TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, currentDevId));
cudaMemLocation location {cudaMemLocationTypeDevice, currentDevId};
TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
hostPtr = devPtr;
Comment on lines +367 to 369
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Verification agent

🧩 Analysis chain

Guard new cudaMemAdvise location-API for backward compatibility and fix indentation

The location-based cudaMemAdvise overload requires newer CUDA. Given CMake still allows CUDA >= 11.2, gate this with CUDART_VERSION to avoid compile errors with older toolkits. Also replace the tab with spaces to match file style.

Apply:

-	cudaMemLocation location {cudaMemLocationTypeDevice, currentDevId};
-        TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
+    #if CUDART_VERSION >= 12000
+    cudaMemLocation location{cudaMemLocationTypeDevice, currentDevId};
+    TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
+    #else
+    TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, currentDevId));
+    #endif

Alternatively, if you intend to raise the project’s minimum CUDA to 13, update find_package(CUDAToolkit ...) accordingly and remove the fallback.


🏁 Script executed:

#!/bin/bash
set -euo pipefail
FILE="cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp"
echo "PWD: $(pwd)"
if [ -f "$FILE" ]; then
  echo "Found file: $FILE"
  echo "---- File header (first 120 lines) ----"
  sed -n '1,120p' "$FILE"
  echo "---- Context lines 340-400 ----"
  sed -n '340,400p' "$FILE"
  echo "---- Search for relevant symbols in file ----"
  rg -n "TLLM_CUDA_CHECK|cudaMemAdvise|cudaMemLocation|CUDART_VERSION" "$FILE" || true
  echo "---- Lines containing tabs (show up to 200) ----"
  nl -ba "$FILE" | sed -n '1,520p' | rg --line-number $'\t' || true
else
  echo "File not found: $FILE"
  rg -n "hostAccessibleDeviceAllocator.cpp" || true
fi

echo "---- Search for find_package(CUDAToolkit in repo ----"
rg -n "find_package\\(CUDAToolkit" || true
echo "---- Search for CUDART_VERSION usage in repo ----"
rg -n "CUDART_VERSION" || true

Length of output: 13048


Guard cudaMemAdvise(location) for older CUDA toolkits & fix indentation

The cudaMemLocation overload requires CUDA ≥ 12, and cpp/CMakeLists.txt still allows CUDA 11.2 — gate the new API with CUDART_VERSION and replace the stray tab with spaces.

  • File: cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp — replace the cudaMemLocation / cudaMemAdvise lines (around lines ~366–369).
  • CMake: cpp/CMakeLists.txt currently uses find_package(CUDAToolkit 11.2 ...), so keep the fallback unless you bump the min CUDA version.

Apply:

-	cudaMemLocation location {cudaMemLocationTypeDevice, currentDevId};
-        TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
+#if defined(CUDART_VERSION) && (CUDART_VERSION >= 12000)
+        cudaMemLocation location{cudaMemLocationTypeDevice, currentDevId};
+        TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
+#else
+        TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, currentDevId));
+#endif

Alternatively, if you intend to require CUDA ≥ 12 project-wide, update find_package(CUDAToolkit ...) in cpp/CMakeLists.txt and remove the fallback.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
cudaMemLocation location {cudaMemLocationTypeDevice, currentDevId};
TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
hostPtr = devPtr;
#if defined(CUDART_VERSION) && (CUDART_VERSION >= 12000)
cudaMemLocation location{cudaMemLocationTypeDevice, currentDevId};
TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
#else
TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, currentDevId));
#endif
hostPtr = devPtr;
🤖 Prompt for AI Agents
In cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp
around lines 367–369, the current cudaMemLocation/cudaMemAdvise usage requires
CUDA ≥ 12 and also contains a stray tab; guard the new API with a preprocessor
check (e.g., #if defined(CUDART_VERSION) && CUDART_VERSION >= 12000) to call
cudaMemLocation and cudaMemAdvise only when available and provide the existing
fallback for older CUDA versions in the #else branch, and replace the stray tab
with spaces to fix indentation; do not change cpp/CMakeLists.txt unless you
intend to raise the minimum CUDA version project-wide.

}
recordAllocation(devPtr, memorySize, hostPtr, memDesc);
Expand Down
2 changes: 1 addition & 1 deletion cpp/tensorrt_llm/runtime/utils/debugUtils.cu
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ __global__ void checkTensorInvalidKernel(T const* data, std::size_t size, int* f
__shared__ typename BlockReduceT::TempStorage tempStorage;

// Compute block-wide maximum
int blockFound = BlockReduceT(tempStorage).Reduce(found, cub::Max());
int blockFound = BlockReduceT(tempStorage).Reduce(found, cuda::maximum());

// Have thread 0 write out block's result
if (threadIdx.x == 0)
Expand Down
Loading