-
Couldn't load subscription status.
- Fork 1.8k
[WIP][chore] : CUDA13 build #6890
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
bb12940
b18be91
e172f96
c61e9ba
c4c7958
32309ae
c47c7b5
0fdaec9
bcca984
4eb7618
0d70377
b5aa676
155f262
e331ef0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -95,7 +95,7 @@ constexpr CUtensorMapDataType get_CUtensorMapDataType() | |
| } | ||
| } | ||
|
|
||
| PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled() | ||
| PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled() | ||
| { | ||
| // Get pointer to `cuTensorMapEncodeTiled` | ||
| cudaDriverEntryPointQueryResult driver_status; | ||
|
|
@@ -110,12 +110,12 @@ PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled() | |
|
|
||
| if (driver_status != cudaDriverEntryPointSuccess) | ||
| throw std::runtime_error("driver_status != cudaDriverEntryPointSuccess"); | ||
| return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(cuTensorMapEncodeTiled_ptr); | ||
| return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(cuTensorMapEncodeTiled_ptr); | ||
| } | ||
|
|
||
| template <typename T> | ||
| CUtensorMap make_2d_tma_copy_desc(T* global_address, uint64_t gmem_dim[2], uint64_t stride_in_bytes, | ||
| uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr) | ||
| uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr) | ||
| { | ||
|
Comment on lines
116
to
119
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
💡 Verification agent
🧩 Analysis chain
Default parameter type should match the stable PFN type
Align the encode_func param with the stable, unversioned type to avoid exposing versioned types.
- uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
+ uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
Double-check the other call sites for signature consistency.
🏁 Script executed:
#!/bin/bash
set -euo pipefail
echo "=== show file (if exists) ==="
FILE="cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh"
if [ -f "$FILE" ]; then
sed -n '1,240p' "$FILE" || true
else
echo "File not found: $FILE"
fi
echo
echo "=== rg: PFN_cuTensorMapEncodeTiled_v12000 ==="
rg --hidden -n "PFN_cuTensorMapEncodeTiled_v12000" || true
echo
echo "=== rg: PFN_cuTensorMapEncodeTiled ==="
rg --hidden -n "PFN_cuTensorMapEncodeTiled(?!_v12000)" || true
echo
echo "=== rg: make_2d_tma_copy_desc ==="
rg --hidden -n "make_2d_tma_copy_desc" || true
echo
echo "=== rg: cuTensorMapEncodeTiled occurrences ==="
rg --hidden -n "cuTensorMapEncodeTiled" || true
echo
echo "=== rg: cuTensorMap related typedefs/usages ==="
rg --hidden -n "cuTensorMap" || true
Length of output: 25549
🏁 Script executed:
#!/bin/bash
set -euo pipefail
echo "=== fp8_blockscale_tma_utils.cuh (head) ==="
sed -n '1,240p' cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh || true
echo
echo "=== cudaDriverWrapper.h (head) ==="
sed -n '1,320p' cpp/tensorrt_llm/common/cudaDriverWrapper.h || true
echo
echo "=== cudaDriverWrapper.cpp (head) ==="
sed -n '1,360p' cpp/tensorrt_llm/common/cudaDriverWrapper.cpp || true
echo
echo "=== tma_utils.cuh (relevant region) ==="
sed -n '1,220p' cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh || true
echo
echo "=== rg: where PFN_cuTensorMapEncodeTiled is defined/mentioned ==="
rg --hidden -n "PFN_cuTensorMapEncodeTiled" || true
Length of output: 32409
Prefer the stable PFN_cuTensorMapEncodeTiled type for the default parameter
Use the unversioned PFN to avoid leaking the versioned typedef; the callers I checked do not pass encode_func explicitly.
Files to update:
Call sites verified (no explicit encode_func argument): cpp/include/tensorrt_llm/deep_gemm/fp8_gemm.cuh, cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm_kernel.cuh, etc.
Optional safety: add a local fallback in headers to map the unversioned name to the v12000 typedef if the unversioned PFN is not present:
#ifndef PFN_cuTensorMapEncodeTiled
🤖 Prompt for AI Agents |
||
| CUtensorMap tensor_map{}; | ||
| constexpr uint32_t rank = 2; | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -38,6 +38,13 @@ endif() | |||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||
| # Ensure that dependent libraries are installed | ||||||||||||||||||||||||||||||||||||||||
| find_library(MLX5_lib NAMES mlx5 REQUIRED) | ||||||||||||||||||||||||||||||||||||||||
| set(NVSHMEM_INSTALL_PREFIX "${TORCH_INSTALL_PREFIX}/../nvidia/shmem") | ||||||||||||||||||||||||||||||||||||||||
| # message("Torch Install Prefix: ${TORCH_INSTALL_PREFIX}") | ||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||
| find_path(NVSHMEM_INCLUDE_DIR nvshmem.h HINTS ${NVSHMEM_INSTALL_PREFIX}/include) | ||||||||||||||||||||||||||||||||||||||||
| find_library(NVSHMEM_HOST_LIBRARY nvshmem_host.so.3 HINTS ${NVSHMEM_INSTALL_PREFIX}/lib) | ||||||||||||||||||||||||||||||||||||||||
| find_library(NVSHMEM_DEVICE_LIBRARY nvshmem_device HINTS ${NVSHMEM_INSTALL_PREFIX}/lib) | ||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||||
| set(NVSHMEM_INSTALL_PREFIX "${TORCH_INSTALL_PREFIX}/../nvidia/shmem") | |
| # message("Torch Install Prefix: ${TORCH_INSTALL_PREFIX}") | |
| find_path(NVSHMEM_INCLUDE_DIR nvshmem.h HINTS ${NVSHMEM_INSTALL_PREFIX}/include) | |
| find_library(NVSHMEM_HOST_LIBRARY nvshmem_host.so.3 HINTS ${NVSHMEM_INSTALL_PREFIX}/lib) | |
| find_library(NVSHMEM_DEVICE_LIBRARY nvshmem_device HINTS ${NVSHMEM_INSTALL_PREFIX}/lib) | |
| set(NVSHMEM_INSTALL_PREFIX "${TORCH_INSTALL_PREFIX}/../nvidia/shmem" CACHE PATH "NVSHMEM install root") | |
| find_path(NVSHMEM_INCLUDE_DIR nvshmem.h | |
| HINTS ${NVSHMEM_INSTALL_PREFIX}/include | |
| REQUIRED) | |
| find_library(NVSHMEM_HOST_LIBRARY | |
| NAMES nvshmem_host nvshmem | |
| HINTS ${NVSHMEM_INSTALL_PREFIX}/lib ${NVSHMEM_INSTALL_PREFIX}/lib64 | |
| REQUIRED) | |
| find_library(NVSHMEM_DEVICE_LIBRARY | |
| NAMES nvshmem_device | |
| HINTS ${NVSHMEM_INSTALL_PREFIX}/lib ${NVSHMEM_INSTALL_PREFIX}/lib64 | |
| REQUIRED) |
🤖 Prompt for AI Agents
In cpp/tensorrt_llm/deep_ep/CMakeLists.txt around lines 41–47, the NVSHMEM
discovery is brittle: it hard-codes a Torch-relative NVSHMEM_INSTALL_PREFIX,
pins a specific SONAME, doesn't search lib64, and doesn't fail-fast. Replace the
fixed install prefix with a cache variable NVSHMEM_INSTALL_PREFIX (allowing
override), use find_path and find_library with NAMES (no SONAME suffix) and
HINTS ${NVSHMEM_INSTALL_PREFIX}/lib ${NVSHMEM_INSTALL_PREFIX}/lib64
${NVSHMEM_INSTALL_PREFIX}/include and system paths, mark the find_* calls
REQUIRED so configuration errors fail early, and prefer using
find_package(NVSHMEM REQUIRED ...) if available for consistency with the
top-level CMake; ensure variables (NVSHMEM_INCLUDE_DIR, NVSHMEM_HOST_LIBRARY,
NVSHMEM_DEVICE_LIBRARY) are set from those robust searches.
borisfom marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -24,6 +24,8 @@ | |||||||||
| #include <cuda/barrier> | ||||||||||
| #include <cute/arch/util.hpp> | ||||||||||
|
|
||||||||||
| #define PFN_cuTensorMapEncodeTiled PFN_cuTensorMapEncodeTiled_v12000 | ||||||||||
|
|
||||||||||
|
||||||||||
| #define PFN_cuTensorMapEncodeTiled PFN_cuTensorMapEncodeTiled_v12000 |
🤖 Prompt for AI Agents
In
cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh
around lines 27-28, remove the macro definition that rebinds
PFN_cuTensorMapEncodeTiled to PFN_cuTensorMapEncodeTiled_v12000; instead delete
the #define and create a local, scoped alias (preferably a typedef or C++
using-alias) that maps PFN_cuTensorMapEncodeTiled to
PFN_cuTensorMapEncodeTiled_v12000 within this translation unit or an appropriate
namespace to avoid leaking a macro into other headers. Ensure the alias is only
visible where needed and that no preprocessor macro remains.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🛠️ Refactor suggestion
Unify encode_func parameter type with stable PFN
Avoid exposing the versioned PFN type in public signatures.
- uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
+ uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
📝 Committable suggestion
‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.
| uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr) | |
| { | |
| uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr) | |
| { |
🤖 Prompt for AI Agents
In
cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh
around lines 111-112, the function signature exposes the versioned PFN type
PFN_cuTensorMapEncodeTiled_v12000; change the parameter type to the stable,
unversioned PFN alias (PFN_cuTensorMapEncodeTiled) so the public signature
doesn't leak a versioned typedef, and update any forward declarations or
includes so the unversioned PFN type is visible where this header is compiled.
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -364,7 +364,8 @@ void* HostAccessibleDeviceAllocator::allocate(size_t memorySize) | |||||||||||||||||||||
| TLLM_CHECK_WITH_INFO( | ||||||||||||||||||||||
| mAllowManagedFallback, "HostAccessibleDeviceAllocator is not supported on the current system."); | ||||||||||||||||||||||
| TLLM_CUDA_CHECK(cudaMallocManaged(&devPtr, memorySize)); | ||||||||||||||||||||||
| TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, currentDevId)); | ||||||||||||||||||||||
| cudaMemLocation location {cudaMemLocationTypeDevice, currentDevId}; | ||||||||||||||||||||||
| TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location)); | ||||||||||||||||||||||
| hostPtr = devPtr; | ||||||||||||||||||||||
|
Comment on lines
+367
to
369
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
💡 Verification agent
🧩 Analysis chain
Guard new cudaMemAdvise location-API for backward compatibility and fix indentation
The location-based cudaMemAdvise overload requires newer CUDA. Given CMake still allows CUDA >= 11.2, gate this with CUDART_VERSION to avoid compile errors with older toolkits. Also replace the tab with spaces to match file style.
Apply:
- cudaMemLocation location {cudaMemLocationTypeDevice, currentDevId};
- TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
+ #if CUDART_VERSION >= 12000
+ cudaMemLocation location{cudaMemLocationTypeDevice, currentDevId};
+ TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
+ #else
+ TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, currentDevId));
+ #endif
Alternatively, if you intend to raise the project’s minimum CUDA to 13, update find_package(CUDAToolkit ...) accordingly and remove the fallback.
🏁 Script executed:
#!/bin/bash
set -euo pipefail
FILE="cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp"
echo "PWD: $(pwd)"
if [ -f "$FILE" ]; then
echo "Found file: $FILE"
echo "---- File header (first 120 lines) ----"
sed -n '1,120p' "$FILE"
echo "---- Context lines 340-400 ----"
sed -n '340,400p' "$FILE"
echo "---- Search for relevant symbols in file ----"
rg -n "TLLM_CUDA_CHECK|cudaMemAdvise|cudaMemLocation|CUDART_VERSION" "$FILE" || true
echo "---- Lines containing tabs (show up to 200) ----"
nl -ba "$FILE" | sed -n '1,520p' | rg --line-number $'\t' || true
else
echo "File not found: $FILE"
rg -n "hostAccessibleDeviceAllocator.cpp" || true
fi
echo "---- Search for find_package(CUDAToolkit in repo ----"
rg -n "find_package\\(CUDAToolkit" || true
echo "---- Search for CUDART_VERSION usage in repo ----"
rg -n "CUDART_VERSION" || true
Length of output: 13048
Guard cudaMemAdvise(location) for older CUDA toolkits & fix indentation
The cudaMemLocation overload requires CUDA ≥ 12, and cpp/CMakeLists.txt still allows CUDA 11.2 — gate the new API with CUDART_VERSION and replace the stray tab with spaces.
Apply:
- cudaMemLocation location {cudaMemLocationTypeDevice, currentDevId};
- TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
+#if defined(CUDART_VERSION) && (CUDART_VERSION >= 12000)
+ cudaMemLocation location{cudaMemLocationTypeDevice, currentDevId};
+ TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location));
+#else
+ TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, currentDevId));
+#endif
Alternatively, if you intend to require CUDA ≥ 12 project-wide, update find_package(CUDAToolkit ...) in cpp/CMakeLists.txt and remove the fallback.
📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||||||||||
| } | ||||||||||||||||||||||
| recordAllocation(devPtr, memorySize, hostPtr, memDesc); | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
Uh oh!
There was an error while loading. Please reload this page.