From 92fce7dabcd7bb2af9529e3e90cf8b260e9b0f96 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Wed, 18 Jun 2025 10:49:32 +0000 Subject: [PATCH 01/40] Add deep_ep/CMakeLists.txt Signed-off-by: Tailing Yuan --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 35 +++++++++++++++++++ .../deep_ep/deep_ep_cpp_tllm.version | 4 +++ 2 files changed, 39 insertions(+) create mode 100644 cpp/tensorrt_llm/deep_ep/CMakeLists.txt create mode 100644 cpp/tensorrt_llm/deep_ep/deep_ep_cpp_tllm.version diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt new file mode 100644 index 00000000000..e29f808dca2 --- /dev/null +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -0,0 +1,35 @@ +set(NVSHMEM_ROOT_DIR /opt/custom_nvshmem) +set(DEEP_EP_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../3rdparty/DeepEP) + +find_package(NVSHMEM REQUIRED HINTS ${NVSHMEM_ROOT_DIR}/lib/cmake/nvshmem) +find_library(TORCH_PYTHON_LIB torch_python REQUIRED + HINTS ${TORCH_INSTALL_PREFIX}/lib) + +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) + +pybind11_add_module( + deep_ep_cpp_tllm + ${DEEP_EP_SOURCE_DIR}/csrc/deep_ep.cpp + ${DEEP_EP_SOURCE_DIR}/csrc/kernels/internode.cu + ${DEEP_EP_SOURCE_DIR}/csrc/kernels/internode_ll.cu + ${DEEP_EP_SOURCE_DIR}/csrc/kernels/intranode.cu + ${DEEP_EP_SOURCE_DIR}/csrc/kernels/layout.cu + ${DEEP_EP_SOURCE_DIR}/csrc/kernels/runtime.cu) + +set_target_properties( + deep_ep_cpp_tllm + PROPERTIES CXX_STANDARD_REQUIRED ON CUDA_STANDARD_REQUIRED ON CXX_STANDARD 17 + CUDA_STANDARD 17 CUDA_SEPARABLE_COMPILATION ON) +target_compile_options( + deep_ep_cpp_tllm + PRIVATE ${TORCH_CXX_FLAGS} -O3 + $<$:--ptxas-options=--register-usage-level=10>) +target_compile_definitions(deep_ep_cpp_tllm + PRIVATE TORCH_EXTENSION_NAME=deep_ep_cpp_tllm) +target_link_libraries( + deep_ep_cpp_tllm PRIVATE nvshmem::nvshmem ${TORCH_LIBRARIES} + ${TORCH_PYTHON_LIB}) +target_link_options( + deep_ep_cpp_tllm PRIVATE + -Wl,--version-script,${CMAKE_CURRENT_SOURCE_DIR}/deep_ep_cpp_tllm.version + -Wl,--no-undefined-version) diff --git a/cpp/tensorrt_llm/deep_ep/deep_ep_cpp_tllm.version b/cpp/tensorrt_llm/deep_ep/deep_ep_cpp_tllm.version new file mode 100644 index 00000000000..f883136167b --- /dev/null +++ b/cpp/tensorrt_llm/deep_ep/deep_ep_cpp_tllm.version @@ -0,0 +1,4 @@ +{ + global: PyInit_deep_ep_cpp_tllm; + local: *; +}; From e67c4ea99ad2063e8a57fa17a3fb7c6c02e32377 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Thu, 19 Jun 2025 09:59:44 +0000 Subject: [PATCH 02/40] Compile nvshmem Signed-off-by: Tailing Yuan --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 58 ++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 5 deletions(-) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index e29f808dca2..04d5d3522a2 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -1,12 +1,61 @@ -set(NVSHMEM_ROOT_DIR /opt/custom_nvshmem) set(DEEP_EP_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../3rdparty/DeepEP) -find_package(NVSHMEM REQUIRED HINTS ${NVSHMEM_ROOT_DIR}/lib/cmake/nvshmem) +# Find libmlx5.so.1 +execute_process( + COMMAND + bash -c + "dirname $(ldconfig -p | grep libmlx5.so.1 | head -n1 | awk '{print $NF}')" + RESULT_VARIABLE _LIBMLX5_DIR_SUCCESS + OUTPUT_VARIABLE LIBMLX5_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE) +if(NOT _LIBMLX5_DIR_SUCCESS EQUAL 0) + message(FATAL_ERROR "Failed to locate libmlx5.so.1") +endif() + +# Add nvshmem +include(ExternalProject) +ExternalProject_Add( + nvshmem_project + URL https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz + URL_HASH + SHA256=eb2c8fb3b7084c2db86bd9fd905387909f1dfd483e7b45f7b3c3d5fcf5374b5a + PATCH_COMMAND patch -p1 -i ${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch + COMMAND sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i + src/CMakeLists.txt + CMAKE_ARGS -DNVSHMEM_IBGDA_SUPPORT=1 + -DNVSHMEM_IBRC_SUPPORT=0 + -DNVSHMEM_MPI_SUPPORT=0 + -DNVSHMEM_PMIX_SUPPORT=0 + -DNVSHMEM_SHMEM_SUPPORT=0 + -DNVSHMEM_TIMEOUT_DEVICE_POLLING=0 + -DNVSHMEM_UCX_SUPPORT=0 + -DNVSHMEM_USE_NCCL=0 + -DNVSHMEM_USE_GDRCOPY=0 + -DNVSHMEM_BUILD_HYDRA_LAUNCHER=0 + -DCMAKE_CUDA_ARCHITECTURES=90-real + -DMLX5_lib=${LIBMLX5_DIR}/libmlx5.so.1 + -DNVSHMEM_BUILD_BITCODE_LIBRARY=0 + -DNVSHMEM_BUILD_EXAMPLES=0 + -DNVSHMEM_BUILD_TESTS=0 + INSTALL_COMMAND "" + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build + EXCLUDE_FROM_ALL TRUE) +add_library(nvshmem_project::nvshmem STATIC IMPORTED) +add_dependencies(nvshmem_project::nvshmem nvshmem_project) +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/include) +set_target_properties( + nvshmem_project::nvshmem + PROPERTIES IMPORTED_LOCATION + ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/lib/libnvshmem.a + INTERFACE_INCLUDE_DIRECTORIES + ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/include) + +# Find torch_python find_library(TORCH_PYTHON_LIB torch_python REQUIRED HINTS ${TORCH_INSTALL_PREFIX}/lib) +# Add deep_ep_cpp_tllm set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) - pybind11_add_module( deep_ep_cpp_tllm ${DEEP_EP_SOURCE_DIR}/csrc/deep_ep.cpp @@ -15,7 +64,6 @@ pybind11_add_module( ${DEEP_EP_SOURCE_DIR}/csrc/kernels/intranode.cu ${DEEP_EP_SOURCE_DIR}/csrc/kernels/layout.cu ${DEEP_EP_SOURCE_DIR}/csrc/kernels/runtime.cu) - set_target_properties( deep_ep_cpp_tllm PROPERTIES CXX_STANDARD_REQUIRED ON CUDA_STANDARD_REQUIRED ON CXX_STANDARD 17 @@ -27,7 +75,7 @@ target_compile_options( target_compile_definitions(deep_ep_cpp_tllm PRIVATE TORCH_EXTENSION_NAME=deep_ep_cpp_tllm) target_link_libraries( - deep_ep_cpp_tllm PRIVATE nvshmem::nvshmem ${TORCH_LIBRARIES} + deep_ep_cpp_tllm PRIVATE nvshmem_project::nvshmem ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIB}) target_link_options( deep_ep_cpp_tllm PRIVATE From 732c3b34e8573e9d4f5465064c1c4cb052cc774f Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Thu, 19 Jun 2025 12:54:41 +0000 Subject: [PATCH 03/40] Download DeepEP as tar.gz rather than submodule Signed-off-by: Tailing Yuan --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 34 ++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 04d5d3522a2..8d9ede7e6f8 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -1,4 +1,31 @@ -set(DEEP_EP_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../3rdparty/DeepEP) +set(DEEP_EP_COMMIT 205817d98b692ed32a1d104775251292091cfab7) + +# Download DeepEP +include(FetchContent) +if(DEFINED $ENV{GITHUB_MIRROR}) + set(GITHUB_URL "$ENV{GITHUB_MIRROR}") +else() + set(GITHUB_URL "https://github.com") +endif() +FetchContent_Declare( + deep_ep_download + URL ${GITHUB_URL}/deepseek-ai/DeepEP/archive/${DEEP_EP_COMMIT}.tar.gz) +FetchContent_MakeAvailable(deep_ep_download) +set(DEEP_EP_SOURCE_DIR ${deep_ep_download_SOURCE_DIR}) + +# Delete stale nvshmem +set(NVSHMEM_STAMP_FILE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_stamp.txt) +file(SHA256 "${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch" + NVSHMEM_PATCH_HASH) +set(NVSHMEM_STAMP_CONTENT "3.2.5-1 ${NVSHMEM_PATCH_HASH}") +set(OLD_NVSHMEM_STAMP_CONTENT "") +if(EXISTS ${NVSHMEM_STAMP_FILE}) + file(READ ${NVSHMEM_STAMP_FILE} OLD_NVSHMEM_STAMP_CONTENT) +endif() +if(NOT "${OLD_NVSHMEM_STAMP_CONTENT}" STREQUAL "${NVSHMEM_STAMP_CONTENT}") + file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-src) + file(WRITE ${NVSHMEM_STAMP_FILE} "${NVSHMEM_STAMP_CONTENT}") +endif() # Find libmlx5.so.1 execute_process( @@ -17,8 +44,8 @@ include(ExternalProject) ExternalProject_Add( nvshmem_project URL https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz - URL_HASH - SHA256=eb2c8fb3b7084c2db86bd9fd905387909f1dfd483e7b45f7b3c3d5fcf5374b5a + SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-src + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build PATCH_COMMAND patch -p1 -i ${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch COMMAND sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i src/CMakeLists.txt @@ -38,7 +65,6 @@ ExternalProject_Add( -DNVSHMEM_BUILD_EXAMPLES=0 -DNVSHMEM_BUILD_TESTS=0 INSTALL_COMMAND "" - BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build EXCLUDE_FROM_ALL TRUE) add_library(nvshmem_project::nvshmem STATIC IMPORTED) add_dependencies(nvshmem_project::nvshmem nvshmem_project) From a3e2d33d6fdeeefdf6fd8fb431035b328bec1417 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Thu, 19 Jun 2025 13:28:11 +0000 Subject: [PATCH 04/40] Copy DeepEP python files Signed-off-by: Tailing Yuan --- .gitignore | 1 + cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/.gitignore b/.gitignore index 88abffd4782..aaa482ffea2 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,7 @@ tensorrt_llm/libs tensorrt_llm/bindings.*.so tensorrt_llm/bindings.pyi tensorrt_llm/bindings/**/*.pyi +tensorrt_llm/deep_ep/ *docs/cpp_docs* *docs/source/_cpp_gen* docs/source/**/*.rst diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 8d9ede7e6f8..38637250323 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -13,6 +13,26 @@ FetchContent_Declare( FetchContent_MakeAvailable(deep_ep_download) set(DEEP_EP_SOURCE_DIR ${deep_ep_download_SOURCE_DIR}) +# Copy files +set(DEEP_EP_PYTHON_DEST ${PROJECT_SOURCE_DIR}/../tensorrt_llm/deep_ep) +file(REMOVE_RECURSE ${DEEP_EP_PYTHON_DEST}) +file(MAKE_DIRECTORY ${DEEP_EP_PYTHON_DEST}) +file(COPY ${DEEP_EP_SOURCE_DIR}/LICENSE DESTINATION ${DEEP_EP_PYTHON_DEST}) +set(_files __init__.py buffer.py utils.py) +foreach(_f IN LISTS _files) + set(_src "${DEEP_EP_SOURCE_DIR}/deep_ep/${_f}") + set(_dst "${DEEP_EP_PYTHON_DEST}/${_f}") + file(READ "${_src}" _content) + string(REPLACE "deep_ep_cpp" "tensorrt_llm.deep_ep_cpp_tllm" _content + "${_content}") + string( + PREPEND + _content + "# Adapted from https://github.com/deepseek-ai/DeepEP/blob/${DEEP_EP_COMMIT}/deep_ep/${_f}\n" + ) + file(WRITE "${_dst}" "${_content}") +endforeach() + # Delete stale nvshmem set(NVSHMEM_STAMP_FILE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_stamp.txt) file(SHA256 "${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch" From 0fbbb91a74df5e9ef4ad9e67b6461d84ab14e741 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Tue, 24 Jun 2025 09:14:29 +0000 Subject: [PATCH 05/40] Update build_wheel.py Signed-off-by: Tailing Yuan --- .gitignore | 2 + cpp/tensorrt_llm/CMakeLists.txt | 4 ++ cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 17 +++-- scripts/build_wheel.py | 70 +++++++++++++++---- .../_torch/modules/fused_moe/deep_ep_utils.py | 2 +- 5 files changed, 76 insertions(+), 19 deletions(-) diff --git a/.gitignore b/.gitignore index aaa482ffea2..beef20746af 100644 --- a/.gitignore +++ b/.gitignore @@ -41,6 +41,8 @@ tensorrt_llm/bindings.*.so tensorrt_llm/bindings.pyi tensorrt_llm/bindings/**/*.pyi tensorrt_llm/deep_ep/ +tensorrt_llm/deep_ep_cpp_tllm.*.so +tensorrt_llm/deep_ep_cpp_tllm.pyi *docs/cpp_docs* *docs/source/_cpp_gen* docs/source/**/*.rst diff --git a/cpp/tensorrt_llm/CMakeLists.txt b/cpp/tensorrt_llm/CMakeLists.txt index 5c2c3f0d8f7..f1a617b793f 100644 --- a/cpp/tensorrt_llm/CMakeLists.txt +++ b/cpp/tensorrt_llm/CMakeLists.txt @@ -297,4 +297,8 @@ if(BUILD_PYBIND) add_subdirectory(pybind) endif() +if(BUILD_DEEP_EP) + add_subdirectory(deep_ep) +endif() + add_subdirectory(plugins) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 38637250323..32a602aee43 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -13,8 +13,8 @@ FetchContent_Declare( FetchContent_MakeAvailable(deep_ep_download) set(DEEP_EP_SOURCE_DIR ${deep_ep_download_SOURCE_DIR}) -# Copy files -set(DEEP_EP_PYTHON_DEST ${PROJECT_SOURCE_DIR}/../tensorrt_llm/deep_ep) +# Copy and update files +set(DEEP_EP_PYTHON_DEST ${CMAKE_CURRENT_BINARY_DIR}/python/deep_ep) file(REMOVE_RECURSE ${DEEP_EP_PYTHON_DEST}) file(MAKE_DIRECTORY ${DEEP_EP_PYTHON_DEST}) file(COPY ${DEEP_EP_SOURCE_DIR}/LICENSE DESTINATION ${DEEP_EP_PYTHON_DEST}) @@ -112,8 +112,13 @@ pybind11_add_module( ${DEEP_EP_SOURCE_DIR}/csrc/kernels/runtime.cu) set_target_properties( deep_ep_cpp_tllm - PROPERTIES CXX_STANDARD_REQUIRED ON CUDA_STANDARD_REQUIRED ON CXX_STANDARD 17 - CUDA_STANDARD 17 CUDA_SEPARABLE_COMPILATION ON) + PROPERTIES CXX_STANDARD_REQUIRED ON + CUDA_STANDARD_REQUIRED ON + CXX_STANDARD 17 + CUDA_STANDARD 17 + CUDA_SEPARABLE_COMPILATION ON + INSTALL_RPATH "$ORIGIN/libs/nvshmem" + BUILD_WITH_INSTALL_RPATH TRUE) target_compile_options( deep_ep_cpp_tllm PRIVATE ${TORCH_CXX_FLAGS} -O3 @@ -127,3 +132,7 @@ target_link_options( deep_ep_cpp_tllm PRIVATE -Wl,--version-script,${CMAKE_CURRENT_SOURCE_DIR}/deep_ep_cpp_tllm.version -Wl,--no-undefined-version) + +# Add deep_ep +add_custom_target(deep_ep) +add_dependencies(deep_ep deep_ep_cpp_tllm nvshmem_project) diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index 5f50e49fe3c..a844f7bb0b2 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -413,10 +413,16 @@ def main(*, if cpp_only: build_pyt = "OFF" build_pybind = "OFF" + build_deep_ep = "OFF" else: targets.extend(["bindings", "th_common"]) build_pyt = "ON" build_pybind = "ON" + if on_windows: + build_deep_ep = "OFF" + else: + build_deep_ep = "ON" + targets.append("deep_ep") if benchmarks: targets.append("benchmarks") @@ -455,7 +461,7 @@ def main(*, ) cmake_def_args = " ".join(cmake_def_args) cmake_configure_command = ( - f'cmake -DCMAKE_BUILD_TYPE="{build_type}" -DBUILD_PYT="{build_pyt}" -DBUILD_PYBIND="{build_pybind}"' + f'cmake -DCMAKE_BUILD_TYPE="{build_type}" -DBUILD_PYT="{build_pyt}" -DBUILD_PYBIND="{build_pybind}" -DBUILD_DEEP_EP="{build_deep_ep}"' f' -DNVTX_DISABLE="{disable_nvtx}" -DBUILD_MICRO_BENCHMARKS={build_micro_benchmarks}' f' -DBUILD_WHEEL_TARGETS="{";".join(targets)}"' f' -DPython_EXECUTABLE={venv_python} -DPython3_EXECUTABLE={venv_python}' @@ -505,14 +511,17 @@ def main(*, install_tree = copytree if skip_building_wheel and linking_install_binary: - def symlink_remove_dst(src, dst): + def symlink_remove_dst(src, dst, *, follow_symlinks=True): src = os.path.abspath(src) dst = os.path.abspath(dst) if os.path.isdir(dst): dst = os.path.join(dst, os.path.basename(src)) if os.path.exists(dst): os.remove(dst) - os.symlink(src, dst) + if follow_symlinks: + os.symlink(src, dst) + else: + copy(src, dst, follow_symlinks=follow_symlinks) install_file = symlink_remove_dst @@ -595,6 +604,12 @@ def symlink_remove_dst_tree(src, dst, dirs_exist_ok=True): "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/libdecoder_attention_1.so", lib_dir / "libdecoder_attention_1.so") + deep_ep_dir = pkg_dir / "deep_ep" + if deep_ep_dir.is_symlink(): + deep_ep_dir.unlink() + elif deep_ep_dir.is_dir(): + clear_folder(deep_ep_dir) + bin_dir = pkg_dir / "bin" if bin_dir.exists(): clear_folder(bin_dir) @@ -606,19 +621,47 @@ def symlink_remove_dst_tree(src, dst, dirs_exist_ok=True): if not cpp_only: - def get_pybind_lib(): - pybind_build_dir = (build_dir / "tensorrt_llm" / "pybind") + def get_pybind_lib(subdirectory, name): + pybind_build_dir = (build_dir / "tensorrt_llm" / subdirectory) if on_windows: - pybind_lib = list(pybind_build_dir.glob("bindings.*.pyd")) + pybind_lib = list(pybind_build_dir.glob(f"{name}.*.pyd")) else: - pybind_lib = list(pybind_build_dir.glob("bindings.*.so")) + pybind_lib = list(pybind_build_dir.glob(f"{name}.*.so")) assert len( pybind_lib ) == 1, f"Exactly one pybind library should be present: {pybind_lib}" return pybind_lib[0] - install_file(get_pybind_lib(), pkg_dir) + install_file(get_pybind_lib("pybind", "bindings"), pkg_dir) + if build_deep_ep == "ON": + install_file(get_pybind_lib("deep_ep", "deep_ep_cpp_tllm"), pkg_dir) + install_tree(build_dir / "tensorrt_llm" / "deep_ep" / "python" / + "deep_ep", + deep_ep_dir, + dirs_exist_ok=True) + (lib_dir / "nvshmem").mkdir(exist_ok=True) + install_file( + build_dir / "tensorrt_llm/deep_ep/nvshmem-src/License.txt", + lib_dir / "nvshmem") + install_file( + build_dir / + "tensorrt_llm/deep_ep/nvshmem-build/src/lib/nvshmem_bootstrap_uid.so.3.0.0", + lib_dir / "nvshmem") + install_file( + build_dir / + "tensorrt_llm/deep_ep/nvshmem-build/src/lib/nvshmem_bootstrap_uid.so.3", + lib_dir / "nvshmem", + follow_symlinks=False) + install_file( + build_dir / + "tensorrt_llm/deep_ep/nvshmem-build/src/lib/nvshmem_transport_ibgda.so.103.0.0", + lib_dir / "nvshmem") + install_file( + build_dir / + "tensorrt_llm/deep_ep/nvshmem-build/src/lib/nvshmem_transport_ibgda.so.103", + lib_dir / "nvshmem", + follow_symlinks=False) if not skip_stubs: with working_directory(project_dir): build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen") @@ -652,14 +695,13 @@ def get_pybind_lib(): if 'LD_LIBRARY_PATH' in env_ld: new_library_path += f":{env_ld['LD_LIBRARY_PATH']}" env_ld["LD_LIBRARY_PATH"] = new_library_path - try: + build_run( + f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code", + env=env_ld) + if build_deep_ep == "ON": build_run( - f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code", + f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code", env=env_ld) - except CalledProcessError as ex: - print(f"Failed to build pybind11 stubgen: {ex}", - file=sys.stderr) - exit(1) if not skip_building_wheel: if dist_dir is None: diff --git a/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py b/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py index e0c7c67748f..4dcf72b2dad 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py +++ b/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py @@ -9,7 +9,7 @@ from tensorrt_llm.mapping import Mapping try: - from deep_ep import Buffer + from tensorrt_llm.deep_ep import Buffer deep_ep_installed = True except ModuleNotFoundError: deep_ep_installed = False From 821756de27f867ced69b056d18a1dc2c0efca552 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Tue, 24 Jun 2025 10:30:01 +0000 Subject: [PATCH 06/40] Fix: no symlinks in .whl Signed-off-by: Tailing Yuan --- scripts/build_wheel.py | 19 +++---------------- setup.py | 5 ++++- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index a844f7bb0b2..ae9e8acb41a 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -511,17 +511,14 @@ def main(*, install_tree = copytree if skip_building_wheel and linking_install_binary: - def symlink_remove_dst(src, dst, *, follow_symlinks=True): + def symlink_remove_dst(src, dst): src = os.path.abspath(src) dst = os.path.abspath(dst) if os.path.isdir(dst): dst = os.path.join(dst, os.path.basename(src)) if os.path.exists(dst): os.remove(dst) - if follow_symlinks: - os.symlink(src, dst) - else: - copy(src, dst, follow_symlinks=follow_symlinks) + os.symlink(src, dst) install_file = symlink_remove_dst @@ -644,24 +641,14 @@ def get_pybind_lib(subdirectory, name): install_file( build_dir / "tensorrt_llm/deep_ep/nvshmem-src/License.txt", lib_dir / "nvshmem") - install_file( - build_dir / - "tensorrt_llm/deep_ep/nvshmem-build/src/lib/nvshmem_bootstrap_uid.so.3.0.0", - lib_dir / "nvshmem") install_file( build_dir / "tensorrt_llm/deep_ep/nvshmem-build/src/lib/nvshmem_bootstrap_uid.so.3", - lib_dir / "nvshmem", - follow_symlinks=False) - install_file( - build_dir / - "tensorrt_llm/deep_ep/nvshmem-build/src/lib/nvshmem_transport_ibgda.so.103.0.0", lib_dir / "nvshmem") install_file( build_dir / "tensorrt_llm/deep_ep/nvshmem-build/src/lib/nvshmem_transport_ibgda.so.103", - lib_dir / "nvshmem", - follow_symlinks=False) + lib_dir / "nvshmem") if not skip_stubs: with working_directory(project_dir): build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen") diff --git a/setup.py b/setup.py index 1b06f0700fc..97a32ba1b96 100644 --- a/setup.py +++ b/setup.py @@ -104,7 +104,10 @@ def has_ext_modules(self): 'libs/libnvinfer_plugin_tensorrt_llm.so', 'libs/libtensorrt_llm_ucx_wrapper.so', 'libs/libdecoder_attention_0.so', 'libs/libtensorrt_llm_nixl_wrapper.so', - 'libs/libdecoder_attention_1.so', 'bindings.*.so', "include/**/*" + 'libs/libdecoder_attention_1.so', 'libs/nvshmem/License.txt', + 'libs/nvshmem/nvshmem_bootstrap_uid.so.3', + 'libs/nvshmem/nvshmem_transport_ibgda.so.103', 'bindings.*.so', + 'deep_ep/LICENSE', 'deep_ep_cpp_tllm.*.so', "include/**/*" ] package_data += [ From 57c321592ec30e6d955bf6116c26603bb8175d20 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Fri, 27 Jun 2025 02:43:15 +0000 Subject: [PATCH 07/40] Support more CUDA archs Signed-off-by: Tailing Yuan --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 82 ++++++++++++++++++++----- scripts/build_wheel.py | 18 +++--- 2 files changed, 76 insertions(+), 24 deletions(-) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 32a602aee43..2c1d927bebf 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -1,4 +1,40 @@ set(DEEP_EP_COMMIT 205817d98b692ed32a1d104775251292091cfab7) +set(NVSHMEM_URL_HASH + SHA256=eb2c8fb3b7084c2db86bd9fd905387909f1dfd483e7b45f7b3c3d5fcf5374b5a) + +add_custom_target(deep_ep) + +# CUDA architectures +# ================== + +# Filter CUDA arch >= 9.0 +set(DEEP_EP_CUDA_ARCHITECTURES "") +foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES_NORMALIZED) + string(REGEX MATCHALL "^([1-9][0-9]*)[0-9]a?(-real|-virtual)?$" MATCHES + ${CUDA_ARCH}) + if(NOT CMAKE_MATCH_0) + message(FATAL_ERROR "Invalid CUDA arch format: \"${CUDA_ARCH}\"") + endif() + set(CUDA_ARCH_MAJOR ${CMAKE_MATCH_1}) + if(${CUDA_ARCH_MAJOR} GREATER_EQUAL 9) + list(APPEND DEEP_EP_CUDA_ARCHITECTURES ${CMAKE_MATCH_0}) + endif() +endforeach() + +# Skip build if there is no suitable CUDA arch +if(WIN32) + set(DEEP_EP_CUDA_ARCHITECTURES "") +endif() +message( + STATUS "deep_ep DEEP_EP_CUDA_ARCHITECTURES: ${DEEP_EP_CUDA_ARCHITECTURES}") +file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/cuda_architectures.txt + "${DEEP_EP_CUDA_ARCHITECTURES}") +if(NOT DEEP_EP_CUDA_ARCHITECTURES) + return() +endif() + +# Prepare files +# ============= # Download DeepEP include(FetchContent) @@ -13,7 +49,7 @@ FetchContent_Declare( FetchContent_MakeAvailable(deep_ep_download) set(DEEP_EP_SOURCE_DIR ${deep_ep_download_SOURCE_DIR}) -# Copy and update files +# Copy and update python files set(DEEP_EP_PYTHON_DEST ${CMAKE_CURRENT_BINARY_DIR}/python/deep_ep) file(REMOVE_RECURSE ${DEEP_EP_PYTHON_DEST}) file(MAKE_DIRECTORY ${DEEP_EP_PYTHON_DEST}) @@ -33,20 +69,24 @@ foreach(_f IN LISTS _files) file(WRITE "${_dst}" "${_content}") endforeach() -# Delete stale nvshmem +# Delete stale nvshmem on patch update set(NVSHMEM_STAMP_FILE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_stamp.txt) -file(SHA256 "${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch" - NVSHMEM_PATCH_HASH) -set(NVSHMEM_STAMP_CONTENT "3.2.5-1 ${NVSHMEM_PATCH_HASH}") +file(COPY ${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch + DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) +file(SHA256 ${CMAKE_CURRENT_BINARY_DIR}/nvshmem.patch NVSHMEM_PATCH_HASH) +set(NVSHMEM_STAMP_CONTENT "${NVSHMEM_URL_HASH} patch ${NVSHMEM_PATCH_HASH}") set(OLD_NVSHMEM_STAMP_CONTENT "") if(EXISTS ${NVSHMEM_STAMP_FILE}) file(READ ${NVSHMEM_STAMP_FILE} OLD_NVSHMEM_STAMP_CONTENT) endif() if(NOT "${OLD_NVSHMEM_STAMP_CONTENT}" STREQUAL "${NVSHMEM_STAMP_CONTENT}") - file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-src) + file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_project-prefix) file(WRITE ${NVSHMEM_STAMP_FILE} "${NVSHMEM_STAMP_CONTENT}") endif() +# Add NVSHMEM +# =========== + # Find libmlx5.so.1 execute_process( COMMAND @@ -59,14 +99,14 @@ if(NOT _LIBMLX5_DIR_SUCCESS EQUAL 0) message(FATAL_ERROR "Failed to locate libmlx5.so.1") endif() -# Add nvshmem +# Add nvshmem external project include(ExternalProject) ExternalProject_Add( nvshmem_project URL https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz - SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-src + URL_HASH ${NVSHMEM_URL_HASH} BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build - PATCH_COMMAND patch -p1 -i ${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch + PATCH_COMMAND patch -p1 -i ${CMAKE_CURRENT_BINARY_DIR}/nvshmem.patch COMMAND sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i src/CMakeLists.txt CMAKE_ARGS -DNVSHMEM_IBGDA_SUPPORT=1 @@ -79,13 +119,13 @@ ExternalProject_Add( -DNVSHMEM_USE_NCCL=0 -DNVSHMEM_USE_GDRCOPY=0 -DNVSHMEM_BUILD_HYDRA_LAUNCHER=0 - -DCMAKE_CUDA_ARCHITECTURES=90-real -DMLX5_lib=${LIBMLX5_DIR}/libmlx5.so.1 -DNVSHMEM_BUILD_BITCODE_LIBRARY=0 -DNVSHMEM_BUILD_EXAMPLES=0 -DNVSHMEM_BUILD_TESTS=0 - INSTALL_COMMAND "" - EXCLUDE_FROM_ALL TRUE) + CMAKE_CACHE_ARGS + -DCMAKE_CUDA_ARCHITECTURES:STRING=${DEEP_EP_CUDA_ARCHITECTURES} + INSTALL_COMMAND "") add_library(nvshmem_project::nvshmem STATIC IMPORTED) add_dependencies(nvshmem_project::nvshmem nvshmem_project) file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/include) @@ -96,12 +136,23 @@ set_target_properties( INTERFACE_INCLUDE_DIRECTORIES ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/include) +# Add DeepEP cpp +# ============== + +# Remove -gencode, use CUDA_ARCHITECTURES +string(REGEX REPLACE "-gencode arch=[^ ]+ " "" CMAKE_CUDA_FLAGS + "${CMAKE_CUDA_FLAGS}") + +# Let CMake generate `fatbinData` for -rdc. Set to FALSE and TRUE are both OK, +# but it generates `code=lto_90a` rather than `code=sm_90a` for +# CUDA_ARCHITECTURES 90a-real if set to TRUE. +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION FALSE) + # Find torch_python find_library(TORCH_PYTHON_LIB torch_python REQUIRED HINTS ${TORCH_INSTALL_PREFIX}/lib) # Add deep_ep_cpp_tllm -set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) pybind11_add_module( deep_ep_cpp_tllm ${DEEP_EP_SOURCE_DIR}/csrc/deep_ep.cpp @@ -117,6 +168,7 @@ set_target_properties( CXX_STANDARD 17 CUDA_STANDARD 17 CUDA_SEPARABLE_COMPILATION ON + CUDA_ARCHITECTURES "${DEEP_EP_CUDA_ARCHITECTURES}" INSTALL_RPATH "$ORIGIN/libs/nvshmem" BUILD_WITH_INSTALL_RPATH TRUE) target_compile_options( @@ -133,6 +185,6 @@ target_link_options( -Wl,--version-script,${CMAKE_CURRENT_SOURCE_DIR}/deep_ep_cpp_tllm.version -Wl,--no-undefined-version) -# Add deep_ep -add_custom_target(deep_ep) +# Set targets +# =========== add_dependencies(deep_ep deep_ep_cpp_tllm nvshmem_project) diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index ae9e8acb41a..46166252704 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -415,14 +415,10 @@ def main(*, build_pybind = "OFF" build_deep_ep = "OFF" else: - targets.extend(["bindings", "th_common"]) + targets.extend(["th_common", "bindings", "deep_ep"]) build_pyt = "ON" build_pybind = "ON" - if on_windows: - build_deep_ep = "OFF" - else: - build_deep_ep = "ON" - targets.append("deep_ep") + build_deep_ep = "ON" if benchmarks: targets.append("benchmarks") @@ -631,7 +627,11 @@ def get_pybind_lib(subdirectory, name): return pybind_lib[0] install_file(get_pybind_lib("pybind", "bindings"), pkg_dir) - if build_deep_ep == "ON": + + with (build_dir / "tensorrt_llm" / "deep_ep" / + "cuda_architectures.txt").open() as f: + deep_ep_cuda_architectures = f.read().strip().split(";") + if deep_ep_cuda_architectures: install_file(get_pybind_lib("deep_ep", "deep_ep_cpp_tllm"), pkg_dir) install_tree(build_dir / "tensorrt_llm" / "deep_ep" / "python" / "deep_ep", @@ -639,7 +639,7 @@ def get_pybind_lib(subdirectory, name): dirs_exist_ok=True) (lib_dir / "nvshmem").mkdir(exist_ok=True) install_file( - build_dir / "tensorrt_llm/deep_ep/nvshmem-src/License.txt", + build_dir / "tensorrt_llm/deep_ep/nvshmem-build/License.txt", lib_dir / "nvshmem") install_file( build_dir / @@ -685,7 +685,7 @@ def get_pybind_lib(subdirectory, name): build_run( f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code", env=env_ld) - if build_deep_ep == "ON": + if deep_ep_cuda_architectures: build_run( f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code", env=env_ld) From 1a86b3aecce3645ea80ab5dd4e1342d74d44c9bd Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Mon, 30 Jun 2025 02:48:59 +0000 Subject: [PATCH 08/40] Speed up NVSHMEM build Signed-off-by: Tailing Yuan --- .gitignore | 1 + .pre-commit-config.yaml | 1 + cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 14 ++-- .../deep_ep/nvshmem_fast_build.patch | 66 +++++++++++++++++++ 4 files changed, 78 insertions(+), 4 deletions(-) create mode 100644 cpp/tensorrt_llm/deep_ep/nvshmem_fast_build.patch diff --git a/.gitignore b/.gitignore index beef20746af..b7238740cd3 100644 --- a/.gitignore +++ b/.gitignore @@ -58,6 +58,7 @@ llm-test-workspace/ *.safetensors */tllm_debug/** *.patch +!cpp/tensorrt_llm/deep_ep/nvshmem_fast_build.patch # Generated files cpp/include/tensorrt_llm/executor/version.h diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7f943c71141..00516b1afa7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,6 +27,7 @@ repos: args: [--allow-multiple-documents] exclude: ".*/gitlab/.*.yml" - id: trailing-whitespace + exclude: '\.patch$' - id: check-toml - id: mixed-line-ending args: [--fix=lf] diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 2c1d927bebf..c472e2c9ba6 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -74,7 +74,12 @@ set(NVSHMEM_STAMP_FILE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_stamp.txt) file(COPY ${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) file(SHA256 ${CMAKE_CURRENT_BINARY_DIR}/nvshmem.patch NVSHMEM_PATCH_HASH) -set(NVSHMEM_STAMP_CONTENT "${NVSHMEM_URL_HASH} patch ${NVSHMEM_PATCH_HASH}") +file(SHA256 ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch + NVSHMEM_FAST_BUILD_PATCH_HASH) +set(NVSHMEM_STAMP_CONTENT "${NVSHMEM_URL_HASH}") +string(APPEND NVSHMEM_STAMP_CONTENT " patch ${NVSHMEM_PATCH_HASH}") +string(APPEND NVSHMEM_STAMP_CONTENT " TRANSPORT_VERSION_MAJOR 103") +string(APPEND NVSHMEM_STAMP_CONTENT " patch ${NVSHMEM_FAST_BUILD_PATCH_HASH}") set(OLD_NVSHMEM_STAMP_CONTENT "") if(EXISTS ${NVSHMEM_STAMP_FILE}) file(READ ${NVSHMEM_STAMP_FILE} OLD_NVSHMEM_STAMP_CONTENT) @@ -109,6 +114,7 @@ ExternalProject_Add( PATCH_COMMAND patch -p1 -i ${CMAKE_CURRENT_BINARY_DIR}/nvshmem.patch COMMAND sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i src/CMakeLists.txt + COMMAND patch -p1 -i ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch CMAKE_ARGS -DNVSHMEM_IBGDA_SUPPORT=1 -DNVSHMEM_IBRC_SUPPORT=0 -DNVSHMEM_MPI_SUPPORT=0 @@ -143,9 +149,9 @@ set_target_properties( string(REGEX REPLACE "-gencode arch=[^ ]+ " "" CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") -# Let CMake generate `fatbinData` for -rdc. Set to FALSE and TRUE are both OK, -# but it generates `code=lto_90a` rather than `code=sm_90a` for -# CUDA_ARCHITECTURES 90a-real if set to TRUE. +# Let CMake generate `fatbinData` for CUDA separable compilation. Set to FALSE +# or TRUE are both OK, but it generates `code=lto_90a` rather than `code=sm_90a` +# for arch `90a-real` if set to TRUE. set(CMAKE_INTERPROCEDURAL_OPTIMIZATION FALSE) # Find torch_python diff --git a/cpp/tensorrt_llm/deep_ep/nvshmem_fast_build.patch b/cpp/tensorrt_llm/deep_ep/nvshmem_fast_build.patch new file mode 100644 index 00000000000..1b79c85841f --- /dev/null +++ b/cpp/tensorrt_llm/deep_ep/nvshmem_fast_build.patch @@ -0,0 +1,66 @@ +diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt +index cba899bba..c27337601 100644 +--- a/src/CMakeLists.txt ++++ b/src/CMakeLists.txt +@@ -264,48 +264,20 @@ set(NVSHMEM_HOST_SOURCES_NOMAXREGCOUNT + host/comm/rma.cu + host/stream/comm/quiet_on_stream.cu + host/stream/comm/cuda_interface_sync.cu +- host/stream/coll/alltoall/alltoall.cu + host/stream/coll/barrier/barrier.cu +- host/stream/coll/broadcast/broadcast.cu +- host/stream/coll/fcollect/fcollect.cu +- host/stream/coll/rdxn/reduce_and.cu +- host/stream/coll/rdxn/reduce_or.cu +- host/stream/coll/rdxn/reduce_xor.cu +- host/stream/coll/rdxn/reduce_min.cu + host/stream/coll/rdxn/reduce_max.cu +- host/stream/coll/rdxn/reduce_prod.cu +- host/stream/coll/rdxn/reduce_sum.cu + host/stream/coll/rdxn/reduce_team.cu +- host/stream/coll/reducescatter/reducescatter_and.cu +- host/stream/coll/reducescatter/reducescatter_or.cu +- host/stream/coll/reducescatter/reducescatter_xor.cu +- host/stream/coll/reducescatter/reducescatter_min.cu +- host/stream/coll/reducescatter/reducescatter_max.cu +- host/stream/coll/reducescatter/reducescatter_prod.cu +- host/stream/coll/reducescatter/reducescatter_sum.cu + ) + + set(NVSHMEM_HOST_SOURCES + host/bootstrap/bootstrap.cpp + host/bootstrap/bootstrap_loader.cpp + host/coll/cpu_coll.cpp +- host/coll/alltoall/alltoall.cpp +- host/coll/alltoall/alltoall_on_stream.cpp + host/coll/barrier/barrier.cpp + host/coll/barrier/barrier_on_stream.cpp +- host/coll/broadcast/broadcast.cpp +- host/coll/broadcast/broadcast_on_stream.cpp +- host/coll/fcollect/fcollect.cpp +- host/coll/fcollect/fcollect_on_stream.cpp +- host/coll/rdxn/rdxn.cpp +- host/coll/rdxn/rdxn_on_stream.cpp +- host/coll/reducescatter/reducescatter.cpp +- host/coll/reducescatter/reducescatter_on_stream.cpp + host/comm/putget.cpp +- host/comm/fence.cpp + host/comm/quiet.cpp + host/comm/sync.cpp +- host/comm/amo.cpp + host/proxy/proxy.cpp + host/transport/transport.cpp + host/transport/p2p/p2p.cpp +@@ -1006,3 +978,12 @@ set(CPACK_RPM_PACKAGE_REQUIRES_PREUN "/sbin/ldconfig") + + include(CPack) + # End Installation definitions ++ ++set_target_properties( ++ git_commit ++ nvshmem_device_project ++ nvshmem_bootstrap_pmi ++ nvshmem_bootstrap_pmi2 ++ nvshmem_host ++ nvshmem-info ++ PROPERTIES EXCLUDE_FROM_ALL TRUE) From 99e733d3787eddd0286adab617b7dd4254ab2f2c Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Mon, 30 Jun 2025 03:52:10 +0000 Subject: [PATCH 09/40] Fix NVSHMEM clang aarch64 build Signed-off-by: Tailing Yuan --- .gitignore | 2 +- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 9 ++++++-- .../nvshmem_fix_clang_aarch64_build.patch | 22 +++++++++++++++++++ 3 files changed, 30 insertions(+), 3 deletions(-) create mode 100644 cpp/tensorrt_llm/deep_ep/nvshmem_fix_clang_aarch64_build.patch diff --git a/.gitignore b/.gitignore index b7238740cd3..7899a349c21 100644 --- a/.gitignore +++ b/.gitignore @@ -58,7 +58,7 @@ llm-test-workspace/ *.safetensors */tllm_debug/** *.patch -!cpp/tensorrt_llm/deep_ep/nvshmem_fast_build.patch +!cpp/tensorrt_llm/deep_ep/*.patch # Generated files cpp/include/tensorrt_llm/executor/version.h diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index c472e2c9ba6..649aff646af 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -74,12 +74,15 @@ set(NVSHMEM_STAMP_FILE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_stamp.txt) file(COPY ${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) file(SHA256 ${CMAKE_CURRENT_BINARY_DIR}/nvshmem.patch NVSHMEM_PATCH_HASH) +file(SHA256 ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fix_clang_aarch64_build.patch + NVSHMEM_PATCH_2_HASH) file(SHA256 ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch - NVSHMEM_FAST_BUILD_PATCH_HASH) + NVSHMEM_PATCH_3_HASH) set(NVSHMEM_STAMP_CONTENT "${NVSHMEM_URL_HASH}") string(APPEND NVSHMEM_STAMP_CONTENT " patch ${NVSHMEM_PATCH_HASH}") string(APPEND NVSHMEM_STAMP_CONTENT " TRANSPORT_VERSION_MAJOR 103") -string(APPEND NVSHMEM_STAMP_CONTENT " patch ${NVSHMEM_FAST_BUILD_PATCH_HASH}") +string(APPEND NVSHMEM_STAMP_CONTENT " patch ${NVSHMEM_PATCH_2_HASH}") +string(APPEND NVSHMEM_STAMP_CONTENT " patch ${NVSHMEM_PATCH_3_HASH}") set(OLD_NVSHMEM_STAMP_CONTENT "") if(EXISTS ${NVSHMEM_STAMP_FILE}) file(READ ${NVSHMEM_STAMP_FILE} OLD_NVSHMEM_STAMP_CONTENT) @@ -114,6 +117,8 @@ ExternalProject_Add( PATCH_COMMAND patch -p1 -i ${CMAKE_CURRENT_BINARY_DIR}/nvshmem.patch COMMAND sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i src/CMakeLists.txt + COMMAND patch -p1 -i + ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fix_clang_aarch64_build.patch COMMAND patch -p1 -i ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch CMAKE_ARGS -DNVSHMEM_IBGDA_SUPPORT=1 -DNVSHMEM_IBRC_SUPPORT=0 diff --git a/cpp/tensorrt_llm/deep_ep/nvshmem_fix_clang_aarch64_build.patch b/cpp/tensorrt_llm/deep_ep/nvshmem_fix_clang_aarch64_build.patch new file mode 100644 index 00000000000..20b6064e101 --- /dev/null +++ b/cpp/tensorrt_llm/deep_ep/nvshmem_fix_clang_aarch64_build.patch @@ -0,0 +1,22 @@ +diff --git a/src/include/non_abi/device/pt-to-pt/utils_device.h b/src/include/non_abi/device/pt-to-pt/utils_device.h +index 8342ebe06..01667c858 100644 +--- a/src/include/non_abi/device/pt-to-pt/utils_device.h ++++ b/src/include/non_abi/device/pt-to-pt/utils_device.h +@@ -27,6 +27,8 @@ + *x = ((*(x)&0xFF000000) >> 24 | (*(x)&0x00FF0000) >> 8 | (*(x)&0x0000FF00) << 8 | \ + (*(x)&0x000000FF) << 24) + ++#ifdef __CUDA_ARCH__ ++ + #ifdef NVSHMEMI_COMM_DEVICE_UTILS_USE_PTX + + __device__ static inline uint64_t BSWAP64(uint64_t x) { +@@ -99,6 +101,8 @@ __device__ static inline uint16_t BSWAP16(uint16_t x) { + + #endif /* NVSHMEMI_COMM_DEVICE_UTILS_USE_PTX */ + ++#endif /* __CUDA_ARCH__ */ ++ + #define HTOBE64(x) BSWAP64(x) + #define HTOBE32(x) BSWAP32(x) + #define HTOBE16(x) BSWAP16(x) From 19c07b56eab4c0e7ff9e74432607be15bbca1682 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Mon, 30 Jun 2025 10:08:19 +0000 Subject: [PATCH 10/40] Build NVSHMEM with g++; Minor fixes Signed-off-by: Tailing Yuan --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 30 ++++++++++++------- .../nvshmem_fix_clang_aarch64_build.patch | 22 -------------- 2 files changed, 20 insertions(+), 32 deletions(-) delete mode 100644 cpp/tensorrt_llm/deep_ep/nvshmem_fix_clang_aarch64_build.patch diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 649aff646af..a5b2bece756 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -10,7 +10,7 @@ add_custom_target(deep_ep) # Filter CUDA arch >= 9.0 set(DEEP_EP_CUDA_ARCHITECTURES "") foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES_NORMALIZED) - string(REGEX MATCHALL "^([1-9][0-9]*)[0-9]a?(-real|-virtual)?$" MATCHES + string(REGEX MATCHALL "^([1-9][0-9]*)[0-9][af]?(-real|-virtual)?$" MATCHES ${CUDA_ARCH}) if(NOT CMAKE_MATCH_0) message(FATAL_ERROR "Invalid CUDA arch format: \"${CUDA_ARCH}\"") @@ -74,20 +74,17 @@ set(NVSHMEM_STAMP_FILE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_stamp.txt) file(COPY ${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) file(SHA256 ${CMAKE_CURRENT_BINARY_DIR}/nvshmem.patch NVSHMEM_PATCH_HASH) -file(SHA256 ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fix_clang_aarch64_build.patch - NVSHMEM_PATCH_2_HASH) file(SHA256 ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch - NVSHMEM_PATCH_3_HASH) + NVSHMEM_PATCH_2_HASH) set(NVSHMEM_STAMP_CONTENT "${NVSHMEM_URL_HASH}") string(APPEND NVSHMEM_STAMP_CONTENT " patch ${NVSHMEM_PATCH_HASH}") string(APPEND NVSHMEM_STAMP_CONTENT " TRANSPORT_VERSION_MAJOR 103") string(APPEND NVSHMEM_STAMP_CONTENT " patch ${NVSHMEM_PATCH_2_HASH}") -string(APPEND NVSHMEM_STAMP_CONTENT " patch ${NVSHMEM_PATCH_3_HASH}") set(OLD_NVSHMEM_STAMP_CONTENT "") if(EXISTS ${NVSHMEM_STAMP_FILE}) file(READ ${NVSHMEM_STAMP_FILE} OLD_NVSHMEM_STAMP_CONTENT) endif() -if(NOT "${OLD_NVSHMEM_STAMP_CONTENT}" STREQUAL "${NVSHMEM_STAMP_CONTENT}") +if(NOT OLD_NVSHMEM_STAMP_CONTENT STREQUAL NVSHMEM_STAMP_CONTENT) file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_project-prefix) file(WRITE ${NVSHMEM_STAMP_FILE} "${NVSHMEM_STAMP_CONTENT}") endif() @@ -95,6 +92,15 @@ endif() # Add NVSHMEM # =========== +# NVSHMEM only works with GCC. Building NVSHMEM with Clang results in +# compilation errors. Using NVSHMEM with Clang results in slow builds and device +# link issues. +if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set(CMAKE_C_COMPILER gcc) + set(CMAKE_CXX_COMPILER g++) + set(CMAKE_CUDA_HOST_COMPILER g++) +endif() + # Find libmlx5.so.1 execute_process( COMMAND @@ -113,12 +119,9 @@ ExternalProject_Add( nvshmem_project URL https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz URL_HASH ${NVSHMEM_URL_HASH} - BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build PATCH_COMMAND patch -p1 -i ${CMAKE_CURRENT_BINARY_DIR}/nvshmem.patch COMMAND sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i src/CMakeLists.txt - COMMAND patch -p1 -i - ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fix_clang_aarch64_build.patch COMMAND patch -p1 -i ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch CMAKE_ARGS -DNVSHMEM_IBGDA_SUPPORT=1 -DNVSHMEM_IBRC_SUPPORT=0 @@ -135,8 +138,15 @@ ExternalProject_Add( -DNVSHMEM_BUILD_EXAMPLES=0 -DNVSHMEM_BUILD_TESTS=0 CMAKE_CACHE_ARGS + -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER} + -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER} + -DCMAKE_CUDA_HOST_COMPILER:STRING=${CMAKE_CUDA_HOST_COMPILER} + -DCMAKE_C_COMPILER_LAUNCHER:STRING=${CMAKE_C_COMPILER_LAUNCHER} + -DCMAKE_CXX_COMPILER_LAUNCHER:STRING=${CMAKE_CXX_COMPILER_LAUNCHER} + -DCMAKE_CUDA_COMPILER_LAUNCHER:STRING=${CMAKE_CUDA_COMPILER_LAUNCHER} -DCMAKE_CUDA_ARCHITECTURES:STRING=${DEEP_EP_CUDA_ARCHITECTURES} - INSTALL_COMMAND "") + INSTALL_COMMAND "" + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build) add_library(nvshmem_project::nvshmem STATIC IMPORTED) add_dependencies(nvshmem_project::nvshmem nvshmem_project) file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/include) diff --git a/cpp/tensorrt_llm/deep_ep/nvshmem_fix_clang_aarch64_build.patch b/cpp/tensorrt_llm/deep_ep/nvshmem_fix_clang_aarch64_build.patch deleted file mode 100644 index 20b6064e101..00000000000 --- a/cpp/tensorrt_llm/deep_ep/nvshmem_fix_clang_aarch64_build.patch +++ /dev/null @@ -1,22 +0,0 @@ -diff --git a/src/include/non_abi/device/pt-to-pt/utils_device.h b/src/include/non_abi/device/pt-to-pt/utils_device.h -index 8342ebe06..01667c858 100644 ---- a/src/include/non_abi/device/pt-to-pt/utils_device.h -+++ b/src/include/non_abi/device/pt-to-pt/utils_device.h -@@ -27,6 +27,8 @@ - *x = ((*(x)&0xFF000000) >> 24 | (*(x)&0x00FF0000) >> 8 | (*(x)&0x0000FF00) << 8 | \ - (*(x)&0x000000FF) << 24) - -+#ifdef __CUDA_ARCH__ -+ - #ifdef NVSHMEMI_COMM_DEVICE_UTILS_USE_PTX - - __device__ static inline uint64_t BSWAP64(uint64_t x) { -@@ -99,6 +101,8 @@ __device__ static inline uint16_t BSWAP16(uint16_t x) { - - #endif /* NVSHMEMI_COMM_DEVICE_UTILS_USE_PTX */ - -+#endif /* __CUDA_ARCH__ */ -+ - #define HTOBE64(x) BSWAP64(x) - #define HTOBE32(x) BSWAP32(x) - #define HTOBE16(x) BSWAP16(x) From 84e6e35fbaadbf36b8bd7243379e56e889584ae3 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Tue, 1 Jul 2025 03:28:30 +0000 Subject: [PATCH 11/40] Remove DeepEP from the Docker image Signed-off-by: Tailing Yuan --- docker/Dockerfile.multi | 4 --- docker/common/install_deep_ep.sh | 47 -------------------------------- 2 files changed, 51 deletions(-) delete mode 100644 docker/common/install_deep_ep.sh diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index 2fe4991fe8e..8075b616140 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -72,10 +72,6 @@ RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh RUN pip3 uninstall -y opencv && rm -rf /usr/local/lib/python3*/dist-packages/cv2/ RUN pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir -# Install DeepEP -COPY docker/common/install_deep_ep.sh install_deep_ep.sh -RUN bash ./install_deep_ep.sh && rm install_deep_ep.sh - # WARs against security issues inherited from pytorch:25.04 # * https://github.com/advisories/GHSA-vqfr-h8mv-ghfj # * https://github.com/advisories/GHSA-7cx3-6m66-7c5m diff --git a/docker/common/install_deep_ep.sh b/docker/common/install_deep_ep.sh deleted file mode 100644 index c6c572eff9e..00000000000 --- a/docker/common/install_deep_ep.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -set -euxo pipefail - -GITHUB_URL=${GITHUB_MIRROR:-https://github.com} -DEEP_EP_COMMIT=2b266cf6452134f993ab0fcb3ef2d5de7683c561 - -if [ "$(. /etc/os-release && echo $ID)" == "rocky" ]; then - echo "Skipping DeepEP installation in the Rocky distribution." - exit 0 -fi -libmlx5_dir=$(dirname $(ldconfig -p | grep libmlx5.so.1 | head -n1 | awk '{print $NF}')) - -export NVCC_APPEND_FLAGS="--threads 4" - -# Custom NVSHMEM -curl -fsSL https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz | tar xz -pushd nvshmem_src -curl -fsSL $GITHUB_URL/deepseek-ai/DeepEP/raw/$DEEP_EP_COMMIT/third-party/nvshmem.patch | patch -p1 -sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i src/CMakeLists.txt -ln -s libmlx5.so.1 "$libmlx5_dir/libmlx5.so" -cmake -S . -B build \ - -DCMAKE_INSTALL_PREFIX=/opt/custom_nvshmem \ - -DGDRCOPY_HOME=/usr/include \ - -DNVSHMEM_SHMEM_SUPPORT=0 \ - -DNVSHMEM_UCX_SUPPORT=0 \ - -DNVSHMEM_USE_NCCL=0 \ - -DNVSHMEM_MPI_SUPPORT=0 \ - -DNVSHMEM_IBGDA_SUPPORT=1 \ - -DNVSHMEM_PMIX_SUPPORT=0 \ - -DNVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ - -DNVSHMEM_USE_GDRCOPY=1 \ - -DCMAKE_CUDA_ARCHITECTURES="90-real;100-real;120-real" \ - -DNVSHMEM_BUILD_TESTS=0 \ - -DNVSHMEM_BUILD_EXAMPLES=0 -cmake --build build -j`nproc` -make -C build install -popd - -# DeepEP -curl -fsSL $GITHUB_URL/deepseek-ai/DeepEP/archive/$DEEP_EP_COMMIT.tar.gz | tar xz -TORCH_CUDA_ARCH_LIST="9.0;10.0;12.0" NVSHMEM_DIR=/opt/custom_nvshmem pip install -v --no-cache-dir ./DeepEP-$DEEP_EP_COMMIT - -# Clean up -rm -r nvshmem_src -rm "$libmlx5_dir/libmlx5.so" -rm -r DeepEP-$DEEP_EP_COMMIT From 0c35157ba6f2c20ec6910722f66753154dd0b88a Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Tue, 1 Jul 2025 04:10:38 +0000 Subject: [PATCH 12/40] Move CMAKE_ARGS to CMAKE_CACHE_ARGS Signed-off-by: Tailing Yuan --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 44 ++++++++++++------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index a5b2bece756..a6a4aad77ff 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -103,13 +103,12 @@ endif() # Find libmlx5.so.1 execute_process( - COMMAND - bash -c - "dirname $(ldconfig -p | grep libmlx5.so.1 | head -n1 | awk '{print $NF}')" - RESULT_VARIABLE _LIBMLX5_DIR_SUCCESS - OUTPUT_VARIABLE LIBMLX5_DIR + COMMAND bash -c + "ldconfig -p | grep libmlx5.so.1 | head -n1 | awk '{print $NF}'" + RESULT_VARIABLE _LIBMLX5_PATH_SUCCESS + OUTPUT_VARIABLE LIBMLX5_PATH OUTPUT_STRIP_TRAILING_WHITESPACE) -if(NOT _LIBMLX5_DIR_SUCCESS EQUAL 0) +if(NOT _LIBMLX5_PATH_SUCCESS EQUAL 0) message(FATAL_ERROR "Failed to locate libmlx5.so.1") endif() @@ -123,28 +122,27 @@ ExternalProject_Add( COMMAND sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i src/CMakeLists.txt COMMAND patch -p1 -i ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch - CMAKE_ARGS -DNVSHMEM_IBGDA_SUPPORT=1 - -DNVSHMEM_IBRC_SUPPORT=0 - -DNVSHMEM_MPI_SUPPORT=0 - -DNVSHMEM_PMIX_SUPPORT=0 - -DNVSHMEM_SHMEM_SUPPORT=0 - -DNVSHMEM_TIMEOUT_DEVICE_POLLING=0 - -DNVSHMEM_UCX_SUPPORT=0 - -DNVSHMEM_USE_NCCL=0 - -DNVSHMEM_USE_GDRCOPY=0 - -DNVSHMEM_BUILD_HYDRA_LAUNCHER=0 - -DMLX5_lib=${LIBMLX5_DIR}/libmlx5.so.1 - -DNVSHMEM_BUILD_BITCODE_LIBRARY=0 - -DNVSHMEM_BUILD_EXAMPLES=0 - -DNVSHMEM_BUILD_TESTS=0 CMAKE_CACHE_ARGS -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER} - -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER} - -DCMAKE_CUDA_HOST_COMPILER:STRING=${CMAKE_CUDA_HOST_COMPILER} -DCMAKE_C_COMPILER_LAUNCHER:STRING=${CMAKE_C_COMPILER_LAUNCHER} + -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER} -DCMAKE_CXX_COMPILER_LAUNCHER:STRING=${CMAKE_CXX_COMPILER_LAUNCHER} - -DCMAKE_CUDA_COMPILER_LAUNCHER:STRING=${CMAKE_CUDA_COMPILER_LAUNCHER} -DCMAKE_CUDA_ARCHITECTURES:STRING=${DEEP_EP_CUDA_ARCHITECTURES} + -DCMAKE_CUDA_HOST_COMPILER:STRING=${CMAKE_CUDA_HOST_COMPILER} + -DCMAKE_CUDA_COMPILER_LAUNCHER:STRING=${CMAKE_CUDA_COMPILER_LAUNCHER} + -DMLX5_lib:FILEPATH=${LIBMLX5_PATH} + -DNVSHMEM_BUILD_EXAMPLES:BOOL=0 + -DNVSHMEM_BUILD_PACKAGES:BOOL=0 + -DNVSHMEM_BUILD_TESTS:BOOL=0 + -DNVSHMEM_IBGDA_SUPPORT:BOOL=1 + -DNVSHMEM_IBRC_SUPPORT:BOOL=0 + -DNVSHMEM_MPI_SUPPORT:BOOL=0 + -DNVSHMEM_PMIX_SUPPORT:BOOL=0 + -DNVSHMEM_SHMEM_SUPPORT:BOOL=0 + -DNVSHMEM_TIMEOUT_DEVICE_POLLING:BOOL=0 + -DNVSHMEM_UCX_SUPPORT:BOOL=0 + -DNVSHMEM_USE_GDRCOPY:BOOL=0 + -DNVSHMEM_USE_NCCL:BOOL=0 INSTALL_COMMAND "" BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build) add_library(nvshmem_project::nvshmem STATIC IMPORTED) From 39c2fac7eb0f995197eb1a61703f2030c9eebeec Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Tue, 1 Jul 2025 04:55:40 +0000 Subject: [PATCH 13/40] Update DeepEP version Signed-off-by: Tailing Yuan --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 11 ++++++----- .../_torch/modules/fused_moe/deep_ep_utils.py | 3 +-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index a6a4aad77ff..1a8a87872b1 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -1,4 +1,4 @@ -set(DEEP_EP_COMMIT 205817d98b692ed32a1d104775251292091cfab7) +set(DEEP_EP_COMMIT c381dadf43a85062f6a8947592017ee513abc70b) set(NVSHMEM_URL_HASH SHA256=eb2c8fb3b7084c2db86bd9fd905387909f1dfd483e7b45f7b3c3d5fcf5374b5a) @@ -43,9 +43,10 @@ if(DEFINED $ENV{GITHUB_MIRROR}) else() set(GITHUB_URL "https://github.com") endif() -FetchContent_Declare( - deep_ep_download - URL ${GITHUB_URL}/deepseek-ai/DeepEP/archive/${DEEP_EP_COMMIT}.tar.gz) +set(DEEP_EP_URL + "${GITHUB_URL}/deepseek-ai/DeepEP/archive/${DEEP_EP_COMMIT}.tar.gz") +message(STATUS "deep_ep DEEP_EP_URL: ${DEEP_EP_URL}") +FetchContent_Declare(deep_ep_download URL ${DEEP_EP_URL}) FetchContent_MakeAvailable(deep_ep_download) set(DEEP_EP_SOURCE_DIR ${deep_ep_download_SOURCE_DIR}) @@ -192,7 +193,7 @@ set_target_properties( BUILD_WITH_INSTALL_RPATH TRUE) target_compile_options( deep_ep_cpp_tllm - PRIVATE ${TORCH_CXX_FLAGS} -O3 + PRIVATE ${TORCH_CXX_FLAGS} -O3 $<$:-Xcompiler=-O3> $<$:--ptxas-options=--register-usage-level=10>) target_compile_definitions(deep_ep_cpp_tllm PRIVATE TORCH_EXTENSION_NAME=deep_ep_cpp_tllm) diff --git a/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py b/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py index 4dcf72b2dad..f669bb64563 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py +++ b/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py @@ -5,7 +5,7 @@ import torch -from tensorrt_llm._utils import local_mpi_size, mpi_comm +from tensorrt_llm._utils import mpi_comm from tensorrt_llm.mapping import Mapping try: @@ -54,7 +54,6 @@ def reserve(self, hidden_size: int, hidden_dtype: torch.dtype): self.buffer = Buffer(None, num_nvl_bytes, num_rdma_bytes, - num_nvl_peers=local_mpi_size(), comm=self.comm) def dispatch(self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], From 2696352c70370ef0473bc22bed3a493cc14d2dd2 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Tue, 1 Jul 2025 12:09:08 +0000 Subject: [PATCH 14/40] Refine patch command and file glob Signed-off-by: Tailing Yuan --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 1a8a87872b1..a9506d67f31 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -119,10 +119,12 @@ ExternalProject_Add( nvshmem_project URL https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz URL_HASH ${NVSHMEM_URL_HASH} - PATCH_COMMAND patch -p1 -i ${CMAKE_CURRENT_BINARY_DIR}/nvshmem.patch + PATCH_COMMAND patch -p1 --forward --batch -i + ${CMAKE_CURRENT_BINARY_DIR}/nvshmem.patch COMMAND sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i src/CMakeLists.txt - COMMAND patch -p1 -i ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch + COMMAND patch -p1 --forward --batch -i + ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch CMAKE_CACHE_ARGS -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER} -DCMAKE_C_COMPILER_LAUNCHER:STRING=${CMAKE_C_COMPILER_LAUNCHER} @@ -173,14 +175,9 @@ find_library(TORCH_PYTHON_LIB torch_python REQUIRED HINTS ${TORCH_INSTALL_PREFIX}/lib) # Add deep_ep_cpp_tllm -pybind11_add_module( - deep_ep_cpp_tllm - ${DEEP_EP_SOURCE_DIR}/csrc/deep_ep.cpp - ${DEEP_EP_SOURCE_DIR}/csrc/kernels/internode.cu - ${DEEP_EP_SOURCE_DIR}/csrc/kernels/internode_ll.cu - ${DEEP_EP_SOURCE_DIR}/csrc/kernels/intranode.cu - ${DEEP_EP_SOURCE_DIR}/csrc/kernels/layout.cu - ${DEEP_EP_SOURCE_DIR}/csrc/kernels/runtime.cu) +file(GLOB_RECURSE SRC_CPP ${DEEP_EP_SOURCE_DIR}/csrc/*.cpp) +file(GLOB_RECURSE SRC_CU ${DEEP_EP_SOURCE_DIR}/csrc/*.cu) +pybind11_add_module(deep_ep_cpp_tllm ${SRC_CPP} ${SRC_CU}) set_target_properties( deep_ep_cpp_tllm PROPERTIES CXX_STANDARD_REQUIRED ON From e6ad386a23bb8b5997eb5a57b10af1357d6fcb65 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Tue, 1 Jul 2025 13:01:08 +0000 Subject: [PATCH 15/40] Store nvshmem_src_3.2.5-1.txz in LFS Signed-off-by: Tailing Yuan --- .gitattributes | 3 ++- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 2 +- cpp/tensorrt_llm/deep_ep/nvshmem_src_3.2.5-1.txz | 3 +++ 3 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 cpp/tensorrt_llm/deep_ep/nvshmem_src_3.2.5-1.txz diff --git a/.gitattributes b/.gitattributes index de0b56a7310..e72ba0fe7b7 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,7 +1,8 @@ *.a filter=lfs diff=lfs merge=lfs -text +*.dll filter=lfs diff=lfs merge=lfs -text *.lib filter=lfs diff=lfs merge=lfs -text *.so filter=lfs diff=lfs merge=lfs -text -*.dll filter=lfs diff=lfs merge=lfs -text +*.txz filter=lfs diff=lfs merge=lfs -text *.xz filter=lfs diff=lfs merge=lfs -text triton_backend/tools/gpt/input_data.json filter=lfs diff=lfs merge=lfs -text *cubin.cpp filter=lfs diff=lfs merge=lfs -text diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index a9506d67f31..6ccf2873df3 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -117,7 +117,7 @@ endif() include(ExternalProject) ExternalProject_Add( nvshmem_project - URL https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz + URL file://${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_src_3.2.5-1.txz URL_HASH ${NVSHMEM_URL_HASH} PATCH_COMMAND patch -p1 --forward --batch -i ${CMAKE_CURRENT_BINARY_DIR}/nvshmem.patch diff --git a/cpp/tensorrt_llm/deep_ep/nvshmem_src_3.2.5-1.txz b/cpp/tensorrt_llm/deep_ep/nvshmem_src_3.2.5-1.txz new file mode 100644 index 00000000000..76a5a2b78ed --- /dev/null +++ b/cpp/tensorrt_llm/deep_ep/nvshmem_src_3.2.5-1.txz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb2c8fb3b7084c2db86bd9fd905387909f1dfd483e7b45f7b3c3d5fcf5374b5a +size 618175 From 314522d9cb73b5ba19f30afce56958b71a718f97 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Tue, 1 Jul 2025 13:04:42 +0000 Subject: [PATCH 16/40] Update staging Docker image Signed-off-by: Tailing Yuan --- jenkins/L0_MergeRequest.groovy | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy index 04b0e401e3a..cd0e75f9752 100644 --- a/jenkins/L0_MergeRequest.groovy +++ b/jenkins/L0_MergeRequest.groovy @@ -28,10 +28,10 @@ UPLOAD_PATH = env.uploadPath ? env.uploadPath : "sw-tensorrt-generic/llm-artifac // Container configuration // available tags can be found in: https://urm.nvidia.com/artifactory/sw-tensorrt-docker/tensorrt-llm/ // [base_image_name]-[arch]-[os](-[python_version])-[trt_version]-[torch_install_type]-[stage]-[date]-[mr_id] -LLM_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202506271620-5539" -LLM_SBSA_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202506271620-5539" -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202506271620-5539" -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202506271620-5539" +LLM_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/tritondevel:x86_64-tritondevel-torch_skip-acbaf3f-github-pr-5534-317" +LLM_SBSA_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/tritondevel:sbsa-tritondevel-torch_skip-acbaf3f-github-pr-5534-317" +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/tritondevel:x86_64-rockylinux8-torch_skip-py310-acbaf3f-github-pr-5534-317" +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/tritondevel:x86_64-rockylinux8-torch_skip-py312-acbaf3f-github-pr-5534-317" // TODO: Move common variables to an unified location BUILD_CORES_REQUEST = "8" From 3d62063b224793b39bb43aad1aeebe8a05ae4271 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Wed, 2 Jul 2025 02:42:58 +0000 Subject: [PATCH 17/40] Fix the use of `split` Signed-off-by: Tailing Yuan --- scripts/build_wheel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index 46166252704..1a5c50fc016 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -630,7 +630,7 @@ def get_pybind_lib(subdirectory, name): with (build_dir / "tensorrt_llm" / "deep_ep" / "cuda_architectures.txt").open() as f: - deep_ep_cuda_architectures = f.read().strip().split(";") + deep_ep_cuda_architectures = f.read().strip().strip(";") if deep_ep_cuda_architectures: install_file(get_pybind_lib("deep_ep", "deep_ep_cpp_tllm"), pkg_dir) install_tree(build_dir / "tensorrt_llm" / "deep_ep" / "python" / From 6dd5661598771b3c08b0f125478be708c83fabff Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Wed, 2 Jul 2025 03:11:35 +0000 Subject: [PATCH 18/40] Add `patch` to Docker image Signed-off-by: Tailing Yuan --- docker/common/install_base.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/common/install_base.sh b/docker/common/install_base.sh index 0b4eb91ca8a..e825e58edcb 100644 --- a/docker/common/install_base.sh +++ b/docker/common/install_base.sh @@ -115,6 +115,7 @@ install_gcctoolset_rockylinux() { # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/cuda echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> "${ENV}" dnf install \ + patch \ vim \ wget \ git-lfs \ From 77c49e0a564db6d5da7ee7e011ad56f2af79ffd7 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Wed, 2 Jul 2025 04:14:13 +0000 Subject: [PATCH 19/40] Fix if(DEFINED ...) Signed-off-by: Tailing Yuan --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 6ccf2873df3..5f256452699 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -38,7 +38,7 @@ endif() # Download DeepEP include(FetchContent) -if(DEFINED $ENV{GITHUB_MIRROR}) +if(DEFINED ENV{GITHUB_MIRROR}) set(GITHUB_URL "$ENV{GITHUB_MIRROR}") else() set(GITHUB_URL "https://github.com") From f59d2beab5d39ab1c53b3280d4a3cf7a328209cf Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Wed, 2 Jul 2025 06:20:57 +0000 Subject: [PATCH 20/40] Update staging Docker image Signed-off-by: Tailing Yuan --- jenkins/L0_MergeRequest.groovy | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy index cd0e75f9752..1f612517b3e 100644 --- a/jenkins/L0_MergeRequest.groovy +++ b/jenkins/L0_MergeRequest.groovy @@ -28,10 +28,10 @@ UPLOAD_PATH = env.uploadPath ? env.uploadPath : "sw-tensorrt-generic/llm-artifac // Container configuration // available tags can be found in: https://urm.nvidia.com/artifactory/sw-tensorrt-docker/tensorrt-llm/ // [base_image_name]-[arch]-[os](-[python_version])-[trt_version]-[torch_install_type]-[stage]-[date]-[mr_id] -LLM_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/tritondevel:x86_64-tritondevel-torch_skip-acbaf3f-github-pr-5534-317" -LLM_SBSA_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/tritondevel:sbsa-tritondevel-torch_skip-acbaf3f-github-pr-5534-317" -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/tritondevel:x86_64-rockylinux8-torch_skip-py310-acbaf3f-github-pr-5534-317" -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/tritondevel:x86_64-rockylinux8-torch_skip-py312-acbaf3f-github-pr-5534-317" +LLM_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/tritondevel:x86_64-tritondevel-torch_skip-8be509a-github-pr-5534-323" +LLM_SBSA_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/tritondevel:sbsa-tritondevel-torch_skip-8be509a-github-pr-5534-323" +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/tritondevel:x86_64-rockylinux8-torch_skip-py310-8be509a-github-pr-5534-323" +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/tritondevel:x86_64-rockylinux8-torch_skip-py312-8be509a-github-pr-5534-323" // TODO: Move common variables to an unified location BUILD_CORES_REQUEST = "8" From 8ec209a60d30a47d80cf1b9c71b31c6d660852e4 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Wed, 2 Jul 2025 09:04:53 +0000 Subject: [PATCH 21/40] Add a readme about generating nvshmem_fast_build.patch Signed-off-by: Tailing Yuan --- cpp/tensorrt_llm/deep_ep/README.md | 8 +++ .../deep_ep/strip_nvshmem_helper.py | 55 +++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 cpp/tensorrt_llm/deep_ep/README.md create mode 100644 cpp/tensorrt_llm/deep_ep/strip_nvshmem_helper.py diff --git a/cpp/tensorrt_llm/deep_ep/README.md b/cpp/tensorrt_llm/deep_ep/README.md new file mode 100644 index 00000000000..c11e6bec3e2 --- /dev/null +++ b/cpp/tensorrt_llm/deep_ep/README.md @@ -0,0 +1,8 @@ +How to generate `nvshmem_fast_build.patch`? + +1. Build without `nvshmem_fast_build.patch`. +2. Try linking DeepEP to NVSHMEM while omitting one object file. +3. Repeat step 2 until no more object files can be omitted. +4. Remove the unused files from NVSHMEM's `CMakelists.txt`, and save the differences as `nvshmem_fast_build.patch`. + +The script `strip_nvshmem_helper.py` automatically performs steps 2 and 3. diff --git a/cpp/tensorrt_llm/deep_ep/strip_nvshmem_helper.py b/cpp/tensorrt_llm/deep_ep/strip_nvshmem_helper.py new file mode 100644 index 00000000000..97c38f1ced1 --- /dev/null +++ b/cpp/tensorrt_llm/deep_ep/strip_nvshmem_helper.py @@ -0,0 +1,55 @@ +import pathlib +import re +import subprocess + +project_dir = pathlib.Path(__file__).parent.parent.parent.parent + +# Run `find cpp/build | grep kernels/internode_ll.cu.o$` to get the directory +deep_ep_obj_dir = project_dir / "cpp/build/tensorrt_llm/deep_ep/CMakeFiles/deep_ep_cpp_tllm.dir/__/__/_deps/deep_ep_download-src/csrc" +assert deep_ep_obj_dir.is_dir() + +# Run `find cpp/build | grep host/bootstrap/bootstrap.cpp.o$` to get the directory +# Please set to `nvshmem.dir` rather than `nvshmem_host.dir` +nvshmem_obj_dir = project_dir / "cpp/build/tensorrt_llm/deep_ep/nvshmem-build/src/CMakeFiles/nvshmem.dir" +assert nvshmem_obj_dir.is_dir() + +# Parse -gencode arguments +with (project_dir / + "cpp/build/tensorrt_llm/deep_ep/cuda_architectures.txt").open() as f: + cuda_architectures = f.read() +pattern = re.compile(r'^([1-9][0-9]*[0-9][af]?)(-real|-virtual)?$') +gencode_args = [] +for cuda_arch in cuda_architectures.split(";"): + matches = re.match(pattern, cuda_arch) + assert matches is not None, f"Invalid cuda arch \"{cuda_arch}\"" + sm_version = matches.group(1) + postfix = matches.group(2) or "" + code = { + "": f"[compute_{sm_version},sm_{sm_version}]", + "-real": f"[sm_{sm_version}]", + "-virtual": f"[compute_{sm_version}]", + }[postfix] + gencode_args.append(f"-gencode=arch=compute_{sm_version},{code=:s}") + +temp_dir = project_dir / "cpp/build/tensorrt_llm/deep_ep/strip_nvshmem_helper" +temp_dir.mkdir(exist_ok=True) +ranlib = temp_dir / "liba.a" +if ranlib.exists(): + ranlib.unlink() + +deep_ep_obj_list = sorted(deep_ep_obj_dir.glob("kernels/**/*.o")) +nvshmem_obj_set = set(nvshmem_obj_dir.glob("**/*.o")) +for exclude_obj in sorted(nvshmem_obj_set): + # Create liba.a with one object file less + subprocess.check_call( + ["ar", "rcs", ranlib, *(nvshmem_obj_set - {exclude_obj})]) + # Test whether there are undefined symbols + res = subprocess.call([ + "/usr/local/cuda/bin/nvcc", *gencode_args, "-Xlinker", "--no-undefined", + "-shared", *deep_ep_obj_list, ranlib, "-o", temp_dir / "a.out" + ]) + # If there is no undefined symbols, then print "-" indicating the file could be omitted. + print("-" if res == 0 else "+", + str(exclude_obj.relative_to(nvshmem_obj_dir))[:-2]) + # Unlink the ranlib because `ar` does append + ranlib.unlink() From 940c901157655c50ba9bd5c62d231ae992ede655 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Thu, 3 Jul 2025 02:44:02 +0000 Subject: [PATCH 22/40] Add `libmlx5.so` and headers to Docker image Signed-off-by: Tailing Yuan --- docker/common/install_base.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/common/install_base.sh b/docker/common/install_base.sh index e825e58edcb..be9cd4cf0f1 100644 --- a/docker/common/install_base.sh +++ b/docker/common/install_base.sh @@ -53,6 +53,7 @@ init_ubuntu() { llvm \ libclang-rt-dev \ libffi-dev \ + libibverbs-dev \ libnuma1 \ libnuma-dev \ python3-dev \ @@ -126,6 +127,7 @@ install_gcctoolset_rockylinux() { openmpi \ openmpi-devel \ pigz \ + rdma-core-devel \ -y echo "source scl_source enable gcc-toolset-11" >> "${ENV}" echo 'export PATH=/usr/lib64/openmpi/bin:$PATH' >> "${ENV}" From f358ab803651123df9e97c317bddc79d3883bb8f Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Thu, 3 Jul 2025 03:40:12 +0000 Subject: [PATCH 23/40] Fix for license check Signed-off-by: Tailing Yuan --- jenkins/license_cpp.json | 1 + 1 file changed, 1 insertion(+) diff --git a/jenkins/license_cpp.json b/jenkins/license_cpp.json index e29d577d80e..d7cf0eea984 100644 --- a/jenkins/license_cpp.json +++ b/jenkins/license_cpp.json @@ -31,6 +31,7 @@ ], "skip": { "": "", + "tensorrt_llm/deep_ep/strip_nvshmem_helper.py": "py", "tensorrt_llm/kernels/selectiveScan/selectiveScan.h": "external", "tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py": "py", "tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm90.h": "dual license", From 5e23a5667935158eaff6cf706836d03afe2b1191 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Thu, 3 Jul 2025 05:26:27 +0000 Subject: [PATCH 24/40] Fix cmake configure dependencies Signed-off-by: Tailing Yuan --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 5f256452699..f03bfc3b8ab 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -54,7 +54,8 @@ set(DEEP_EP_SOURCE_DIR ${deep_ep_download_SOURCE_DIR}) set(DEEP_EP_PYTHON_DEST ${CMAKE_CURRENT_BINARY_DIR}/python/deep_ep) file(REMOVE_RECURSE ${DEEP_EP_PYTHON_DEST}) file(MAKE_DIRECTORY ${DEEP_EP_PYTHON_DEST}) -file(COPY ${DEEP_EP_SOURCE_DIR}/LICENSE DESTINATION ${DEEP_EP_PYTHON_DEST}) +configure_file(${DEEP_EP_SOURCE_DIR}/LICENSE ${DEEP_EP_PYTHON_DEST}/LICENSE + COPYONLY) set(_files __init__.py buffer.py utils.py) foreach(_f IN LISTS _files) set(_src "${DEEP_EP_SOURCE_DIR}/deep_ep/${_f}") @@ -68,19 +69,22 @@ foreach(_f IN LISTS _files) "# Adapted from https://github.com/deepseek-ai/DeepEP/blob/${DEEP_EP_COMMIT}/deep_ep/${_f}\n" ) file(WRITE "${_dst}" "${_content}") + set_property( + DIRECTORY + APPEND + PROPERTY CMAKE_CONFIGURE_DEPENDS ${_src}) endforeach() # Delete stale nvshmem on patch update set(NVSHMEM_STAMP_FILE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_stamp.txt) -file(COPY ${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch - DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) -file(SHA256 ${CMAKE_CURRENT_BINARY_DIR}/nvshmem.patch NVSHMEM_PATCH_HASH) +file(SHA256 ${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch NVSHMEM_PATCH_HASH) file(SHA256 ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch NVSHMEM_PATCH_2_HASH) set(NVSHMEM_STAMP_CONTENT "${NVSHMEM_URL_HASH}") -string(APPEND NVSHMEM_STAMP_CONTENT " patch ${NVSHMEM_PATCH_HASH}") -string(APPEND NVSHMEM_STAMP_CONTENT " TRANSPORT_VERSION_MAJOR 103") -string(APPEND NVSHMEM_STAMP_CONTENT " patch ${NVSHMEM_PATCH_2_HASH}") +string(APPEND NVSHMEM_STAMP_CONTENT " PATCH_COMMAND v1") +string(APPEND NVSHMEM_STAMP_CONTENT " ${NVSHMEM_PATCH_HASH}") +string(APPEND NVSHMEM_STAMP_CONTENT " 103") +string(APPEND NVSHMEM_STAMP_CONTENT " ${NVSHMEM_PATCH_2_HASH}") set(OLD_NVSHMEM_STAMP_CONTENT "") if(EXISTS ${NVSHMEM_STAMP_FILE}) file(READ ${NVSHMEM_STAMP_FILE} OLD_NVSHMEM_STAMP_CONTENT) @@ -89,6 +93,11 @@ if(NOT OLD_NVSHMEM_STAMP_CONTENT STREQUAL NVSHMEM_STAMP_CONTENT) file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_project-prefix) file(WRITE ${NVSHMEM_STAMP_FILE} "${NVSHMEM_STAMP_CONTENT}") endif() +set_property( + DIRECTORY APPEND + PROPERTY CMAKE_CONFIGURE_DEPENDS + ${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch + ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch) # Add NVSHMEM # =========== @@ -120,7 +129,7 @@ ExternalProject_Add( URL file://${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_src_3.2.5-1.txz URL_HASH ${NVSHMEM_URL_HASH} PATCH_COMMAND patch -p1 --forward --batch -i - ${CMAKE_CURRENT_BINARY_DIR}/nvshmem.patch + ${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch COMMAND sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i src/CMakeLists.txt COMMAND patch -p1 --forward --batch -i From a2cf24ed80f146bd80ec7b72c2776a05145bbbe5 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Thu, 3 Jul 2025 05:36:30 +0000 Subject: [PATCH 25/40] No specify `libmlx5.so.1` Signed-off-by: Tailing Yuan --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index f03bfc3b8ab..5db319a9cf7 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -111,17 +111,6 @@ if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set(CMAKE_CUDA_HOST_COMPILER g++) endif() -# Find libmlx5.so.1 -execute_process( - COMMAND bash -c - "ldconfig -p | grep libmlx5.so.1 | head -n1 | awk '{print $NF}'" - RESULT_VARIABLE _LIBMLX5_PATH_SUCCESS - OUTPUT_VARIABLE LIBMLX5_PATH - OUTPUT_STRIP_TRAILING_WHITESPACE) -if(NOT _LIBMLX5_PATH_SUCCESS EQUAL 0) - message(FATAL_ERROR "Failed to locate libmlx5.so.1") -endif() - # Add nvshmem external project include(ExternalProject) ExternalProject_Add( @@ -142,7 +131,6 @@ ExternalProject_Add( -DCMAKE_CUDA_ARCHITECTURES:STRING=${DEEP_EP_CUDA_ARCHITECTURES} -DCMAKE_CUDA_HOST_COMPILER:STRING=${CMAKE_CUDA_HOST_COMPILER} -DCMAKE_CUDA_COMPILER_LAUNCHER:STRING=${CMAKE_CUDA_COMPILER_LAUNCHER} - -DMLX5_lib:FILEPATH=${LIBMLX5_PATH} -DNVSHMEM_BUILD_EXAMPLES:BOOL=0 -DNVSHMEM_BUILD_PACKAGES:BOOL=0 -DNVSHMEM_BUILD_TESTS:BOOL=0 From dce177ee10e9391698707c7b79eca396daae57a7 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Thu, 3 Jul 2025 05:50:19 +0000 Subject: [PATCH 26/40] Update staging Docker images Signed-off-by: Tailing Yuan --- jenkins/current_image_tags.properties | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index 4ac0d51d3e7..ea63fdf7c50 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -4,9 +4,9 @@ # https://code.visualstudio.com/remote/advancedcontainers/environment-variables#_option-2-use-an-env-file # for reuse in Dev Containers configuration. # Also, the file needs to be parseable by 'sh' for reuse by docker/Makefile. -LLM_DOCKER_IMAGE_URI=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm -LLM_DOCKER_IMAGE_TAG_SUFFIX=-trt10.11.0.33-skip-tritondevel-202506271620-5539 -LLM_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:pytorch-25.05-py3-x86_64-ubuntu24.04${LLM_DOCKER_IMAGE_TAG_SUFFIX} -LLM_SBSA_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:pytorch-25.05-py3-aarch64-ubuntu24.04${LLM_DOCKER_IMAGE_TAG_SUFFIX} -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310${LLM_DOCKER_IMAGE_TAG_SUFFIX} -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312${LLM_DOCKER_IMAGE_TAG_SUFFIX} +LLM_DOCKER_IMAGE_URI=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/tritondevel +LLM_DOCKER_IMAGE_TAG_SUFFIX=-f358ab8-github-pr-5534-336 +LLM_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:x86_64-tritondevel-torch_skip${LLM_DOCKER_IMAGE_TAG_SUFFIX} +LLM_SBSA_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:sbsa-tritondevel-torch_skip${LLM_DOCKER_IMAGE_TAG_SUFFIX} +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:x86_64-rockylinux8-torch_skip-py310${LLM_DOCKER_IMAGE_TAG_SUFFIX} +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:x86_64-rockylinux8-torch_skip-py312${LLM_DOCKER_IMAGE_TAG_SUFFIX} From 635f0d9fc165286d741453c67849fadd4201c3b1 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Fri, 4 Jul 2025 04:17:19 +0000 Subject: [PATCH 27/40] Add torch/lib to rpath Signed-off-by: Tailing Yuan --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 5db319a9cf7..700e48c1503 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -183,7 +183,7 @@ set_target_properties( CUDA_STANDARD 17 CUDA_SEPARABLE_COMPILATION ON CUDA_ARCHITECTURES "${DEEP_EP_CUDA_ARCHITECTURES}" - INSTALL_RPATH "$ORIGIN/libs/nvshmem" + INSTALL_RPATH "$ORIGIN/libs/nvshmem;${TORCH_INSTALL_PREFIX}/lib" BUILD_WITH_INSTALL_RPATH TRUE) target_compile_options( deep_ep_cpp_tllm From c0b07598a4e20efb33e5efe1fdccc1df7fa53a58 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Fri, 4 Jul 2025 09:05:41 +0000 Subject: [PATCH 28/40] Update Docker images Signed-off-by: Tailing Yuan --- jenkins/controlCCache.groovy | 2 +- jenkins/current_image_tags.properties | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/jenkins/controlCCache.groovy b/jenkins/controlCCache.groovy index aa839a1264c..4932b5ae59a 100644 --- a/jenkins/controlCCache.groovy +++ b/jenkins/controlCCache.groovy @@ -1,7 +1,7 @@ import java.lang.InterruptedException -DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202506271620-5539" +DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202507041650-5534" def createKubernetesPodConfig(image, arch = "amd64") { diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index ea63fdf7c50..bb293b50e2c 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -4,9 +4,9 @@ # https://code.visualstudio.com/remote/advancedcontainers/environment-variables#_option-2-use-an-env-file # for reuse in Dev Containers configuration. # Also, the file needs to be parseable by 'sh' for reuse by docker/Makefile. -LLM_DOCKER_IMAGE_URI=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/tritondevel -LLM_DOCKER_IMAGE_TAG_SUFFIX=-f358ab8-github-pr-5534-336 -LLM_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:x86_64-tritondevel-torch_skip${LLM_DOCKER_IMAGE_TAG_SUFFIX} -LLM_SBSA_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:sbsa-tritondevel-torch_skip${LLM_DOCKER_IMAGE_TAG_SUFFIX} -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:x86_64-rockylinux8-torch_skip-py310${LLM_DOCKER_IMAGE_TAG_SUFFIX} -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:x86_64-rockylinux8-torch_skip-py312${LLM_DOCKER_IMAGE_TAG_SUFFIX} +LLM_DOCKER_IMAGE_URI=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm +LLM_DOCKER_IMAGE_TAG_SUFFIX=-trt10.11.0.33-skip-tritondevel-202507041650-5534 +LLM_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:pytorch-25.05-py3-x86_64-ubuntu24.04${LLM_DOCKER_IMAGE_TAG_SUFFIX} +LLM_SBSA_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:pytorch-25.05-py3-aarch64-ubuntu24.04${LLM_DOCKER_IMAGE_TAG_SUFFIX} +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310${LLM_DOCKER_IMAGE_TAG_SUFFIX} +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312${LLM_DOCKER_IMAGE_TAG_SUFFIX} From d154d203cb6c4616a033a292ae3931b4cbf6fe05 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Fri, 4 Jul 2025 10:05:13 +0000 Subject: [PATCH 29/40] Remove GITHUB_MIRROR from Docker image environment Signed-off-by: Tailing Yuan --- docker/Dockerfile.multi | 11 +++++------ jenkins/Build.groovy | 1 + jenkins/L0_Test.groovy | 1 + 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index 8075b616140..95cdb133fd4 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -16,7 +16,6 @@ LABEL com.nvidia.ai-terms="https://www.nvidia.com/en-us/agreements/enterprise-so ENV BASH_ENV=${BASH_ENV:-/etc/bash.bashrc} ENV ENV=${ENV:-/etc/shinit_v2} ARG GITHUB_MIRROR="" -ENV GITHUB_MIRROR=$GITHUB_MIRROR RUN echo "Using GitHub mirror: $GITHUB_MIRROR" SHELL ["/bin/bash", "-c"] @@ -28,13 +27,13 @@ FROM base AS devel ARG PYTHON_VERSION="3.12.3" RUN echo "Using Python version: $PYTHON_VERSION" COPY docker/common/install_base.sh install_base.sh -RUN bash ./install_base.sh $PYTHON_VERSION && rm install_base.sh +RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_base.sh $PYTHON_VERSION && rm install_base.sh COPY docker/common/install_cmake.sh install_cmake.sh -RUN bash ./install_cmake.sh && rm install_cmake.sh +RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_cmake.sh && rm install_cmake.sh COPY docker/common/install_ccache.sh install_ccache.sh -RUN bash ./install_ccache.sh && rm install_ccache.sh +RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_ccache.sh && rm install_ccache.sh # Only take effect when the base image is Rocky Linux 8 with old CUDA version. COPY docker/common/install_cuda_toolkit.sh install_cuda_toolkit.sh @@ -61,7 +60,7 @@ RUN bash ./install_polygraphy.sh && rm install_polygraphy.sh # Install mpi4py COPY docker/common/install_mpi4py.sh install_mpi4py.sh -RUN bash ./install_mpi4py.sh && rm install_mpi4py.sh +RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_mpi4py.sh && rm install_mpi4py.sh # Install PyTorch ARG TORCH_INSTALL_TYPE="skip" @@ -118,7 +117,7 @@ ENV CCACHE_DIR=/root/.cache/ccache # Build the TRT-LLM wheel ARG BUILD_WHEEL_ARGS="--clean --python_bindings --benchmarks" RUN --mount=type=cache,target=/root/.cache/pip --mount=type=cache,target=${CCACHE_DIR} \ - python3 scripts/build_wheel.py ${BUILD_WHEEL_ARGS} + GITHUB_MIRROR=$GITHUB_MIRROR python3 scripts/build_wheel.py ${BUILD_WHEEL_ARGS} FROM ${DEVEL_IMAGE} AS release diff --git a/jenkins/Build.groovy b/jenkins/Build.groovy index 2c60ec813c9..946dc5ce848 100644 --- a/jenkins/Build.groovy +++ b/jenkins/Build.groovy @@ -591,6 +591,7 @@ pipeline { //Workspace normally is: /home/jenkins/agent/workspace/LLM/L0_MergeRequest@tmp/ HF_HOME="${env.WORKSPACE_TMP}/.cache/huggingface" CCACHE_DIR="${CCACHE_DIR}" + GITHUB_MIRROR="https://urm.nvidia.com/artifactory/github-go-remote" PIP_INDEX_URL="https://urm.nvidia.com/artifactory/api/pypi/pypi-remote/simple" // force datasets to be offline mode, to prevent CI jobs are downloading HF dataset causing test failures HF_DATASETS_OFFLINE=1 diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index e7b34f1ad97..00f2adeb449 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -2219,6 +2219,7 @@ pipeline { //Workspace normally is: /home/jenkins/agent/workspace/LLM/L0_MergeRequest@tmp/ HF_HOME="${env.WORKSPACE_TMP}/.cache/huggingface" CCACHE_DIR="${CCACHE_DIR}" + GITHUB_MIRROR="https://urm.nvidia.com/artifactory/github-go-remote" PIP_INDEX_URL="https://urm.nvidia.com/artifactory/api/pypi/pypi-remote/simple" // force datasets to be offline mode, to prevent CI jobs are downloading HF dataset causing test failures HF_DATASETS_OFFLINE=1 From 0dd84426e0eab494b519a08997c1a78d55df7c7d Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Fri, 4 Jul 2025 12:12:56 +0000 Subject: [PATCH 30/40] Fit to #5476 Signed-off-by: Tailing Yuan --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 6 +----- scripts/build_wheel.py | 1 + 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 700e48c1503..036dac8302a 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -9,7 +9,7 @@ add_custom_target(deep_ep) # Filter CUDA arch >= 9.0 set(DEEP_EP_CUDA_ARCHITECTURES "") -foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES_NORMALIZED) +foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES) string(REGEX MATCHALL "^([1-9][0-9]*)[0-9][af]?(-real|-virtual)?$" MATCHES ${CUDA_ARCH}) if(NOT CMAKE_MATCH_0) @@ -158,10 +158,6 @@ set_target_properties( # Add DeepEP cpp # ============== -# Remove -gencode, use CUDA_ARCHITECTURES -string(REGEX REPLACE "-gencode arch=[^ ]+ " "" CMAKE_CUDA_FLAGS - "${CMAKE_CUDA_FLAGS}") - # Let CMake generate `fatbinData` for CUDA separable compilation. Set to FALSE # or TRUE are both OK, but it generates `code=lto_90a` rather than `code=sm_90a` # for arch `90a-real` if set to TRUE. diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index 9de5f6a6408..fd2d7f252f3 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -601,6 +601,7 @@ def symlink_remove_dst_tree(src, dst, dirs_exist_ok=True): deep_ep_dir.unlink() elif deep_ep_dir.is_dir(): clear_folder(deep_ep_dir) + deep_ep_dir.rmdir() bin_dir = pkg_dir / "bin" if bin_dir.exists(): From 9bab62293ab69fcbb4aa83b1c995e953ed920102 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Fri, 4 Jul 2025 15:41:21 +0000 Subject: [PATCH 31/40] Fix for Ninja Signed-off-by: Tailing Yuan --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 036dac8302a..9099eafad3b 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -10,14 +10,17 @@ add_custom_target(deep_ep) # Filter CUDA arch >= 9.0 set(DEEP_EP_CUDA_ARCHITECTURES "") foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES) - string(REGEX MATCHALL "^([1-9][0-9]*)[0-9][af]?(-real|-virtual)?$" MATCHES + string(REGEX MATCHALL "^([1-9][0-9]*)([0-9])[af]?(-real|-virtual)?$" MATCHES ${CUDA_ARCH}) if(NOT CMAKE_MATCH_0) message(FATAL_ERROR "Invalid CUDA arch format: \"${CUDA_ARCH}\"") endif() set(CUDA_ARCH_MAJOR ${CMAKE_MATCH_1}) + set(CUDA_ARCH_MINOR ${CMAKE_MATCH_2}) + set(CUDA_ARCH_POSTFIX ${CMAKE_MATCH_3}) if(${CUDA_ARCH_MAJOR} GREATER_EQUAL 9) - list(APPEND DEEP_EP_CUDA_ARCHITECTURES ${CMAKE_MATCH_0}) + list(APPEND DEEP_EP_CUDA_ARCHITECTURES + "${CUDA_ARCH_MAJOR}${CUDA_ARCH_MINOR}${CUDA_ARCH_POSTFIX}") endif() endforeach() @@ -144,7 +147,9 @@ ExternalProject_Add( -DNVSHMEM_USE_GDRCOPY:BOOL=0 -DNVSHMEM_USE_NCCL:BOOL=0 INSTALL_COMMAND "" - BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build) + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build + BUILD_BYPRODUCTS + ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/lib/libnvshmem.a) add_library(nvshmem_project::nvshmem STATIC IMPORTED) add_dependencies(nvshmem_project::nvshmem nvshmem_project) file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/include) From c625e99ad2b903a9cd9ea92e3f5a8bdce386e033 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Fri, 4 Jul 2025 19:15:11 +0000 Subject: [PATCH 32/40] Check empty GITHUB_MIRROR Signed-off-by: Tailing Yuan --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 9099eafad3b..40d7261058d 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -41,7 +41,7 @@ endif() # Download DeepEP include(FetchContent) -if(DEFINED ENV{GITHUB_MIRROR}) +if(DEFINED ENV{GITHUB_MIRROR} AND NOT "$ENV{GITHUB_MIRROR}" STREQUAL "") set(GITHUB_URL "$ENV{GITHUB_MIRROR}") else() set(GITHUB_URL "https://github.com") From 869627f66cd29baa1ff9371effae5388aead6230 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Fri, 4 Jul 2025 19:42:45 +0000 Subject: [PATCH 33/40] Fix Dockerfile ARG Signed-off-by: Tailing Yuan --- docker/Dockerfile.multi | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index 95cdb133fd4..b456be578da 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -115,6 +115,7 @@ COPY .gitmodules setup.py requirements.txt requirements-dev.txt constraints.txt RUN mkdir -p /root/.cache/pip /root/.cache/ccache ENV CCACHE_DIR=/root/.cache/ccache # Build the TRT-LLM wheel +ARG GITHUB_MIRROR="" ARG BUILD_WHEEL_ARGS="--clean --python_bindings --benchmarks" RUN --mount=type=cache,target=/root/.cache/pip --mount=type=cache,target=${CCACHE_DIR} \ GITHUB_MIRROR=$GITHUB_MIRROR python3 scripts/build_wheel.py ${BUILD_WHEEL_ARGS} From bfd4818b8b8f060f7810dbf2d0000f6d6bc56fa4 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Sat, 5 Jul 2025 22:48:35 +0800 Subject: [PATCH 34/40] Update staging Docker images Signed-off-by: Tailing Yuan --- jenkins/current_image_tags.properties | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index bb293b50e2c..0edfb9a9c9b 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -4,9 +4,9 @@ # https://code.visualstudio.com/remote/advancedcontainers/environment-variables#_option-2-use-an-env-file # for reuse in Dev Containers configuration. # Also, the file needs to be parseable by 'sh' for reuse by docker/Makefile. -LLM_DOCKER_IMAGE_URI=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm -LLM_DOCKER_IMAGE_TAG_SUFFIX=-trt10.11.0.33-skip-tritondevel-202507041650-5534 -LLM_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:pytorch-25.05-py3-x86_64-ubuntu24.04${LLM_DOCKER_IMAGE_TAG_SUFFIX} -LLM_SBSA_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:pytorch-25.05-py3-aarch64-ubuntu24.04${LLM_DOCKER_IMAGE_TAG_SUFFIX} -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310${LLM_DOCKER_IMAGE_TAG_SUFFIX} -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312${LLM_DOCKER_IMAGE_TAG_SUFFIX} +LLM_DOCKER_IMAGE_URI=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/tritondevel +LLM_DOCKER_IMAGE_TAG_SUFFIX=-869627f-github-pr-5534-357 +LLM_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:x86_64-tritondevel-torch_skip${LLM_DOCKER_IMAGE_TAG_SUFFIX} +LLM_SBSA_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:sbsa-tritondevel-torch_skip${LLM_DOCKER_IMAGE_TAG_SUFFIX} +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:x86_64-rockylinux8-torch_skip-py310${LLM_DOCKER_IMAGE_TAG_SUFFIX} +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:x86_64-rockylinux8-torch_skip-py312${LLM_DOCKER_IMAGE_TAG_SUFFIX} From 1427067fb726d0e945beb3a4f28ba2451a064042 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Mon, 7 Jul 2025 02:43:41 +0000 Subject: [PATCH 35/40] Fix indentation Signed-off-by: Tailing Yuan --- docker/common/install_base.sh | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docker/common/install_base.sh b/docker/common/install_base.sh index be9cd4cf0f1..d1c2f036d63 100644 --- a/docker/common/install_base.sh +++ b/docker/common/install_base.sh @@ -117,18 +117,18 @@ install_gcctoolset_rockylinux() { echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> "${ENV}" dnf install \ patch \ - vim \ - wget \ - git-lfs \ - gcc-toolset-11 \ - libffi-devel \ - -y + vim \ + wget \ + git-lfs \ + gcc-toolset-11 \ + libffi-devel \ + -y dnf install \ - openmpi \ - openmpi-devel \ - pigz \ + openmpi \ + openmpi-devel \ + pigz \ rdma-core-devel \ - -y + -y echo "source scl_source enable gcc-toolset-11" >> "${ENV}" echo 'export PATH=/usr/lib64/openmpi/bin:$PATH' >> "${ENV}" } From 671a9fa88921801cb246ef88ee2af16874676edd Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Mon, 7 Jul 2025 02:52:54 +0000 Subject: [PATCH 36/40] Add LINK_DEPENDS Signed-off-by: Tailing Yuan --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 40d7261058d..76fd96fa9ca 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -183,7 +183,8 @@ set_target_properties( CXX_STANDARD 17 CUDA_STANDARD 17 CUDA_SEPARABLE_COMPILATION ON - CUDA_ARCHITECTURES "${DEEP_EP_CUDA_ARCHITECTURES}" + CUDA_ARCHITECTURES ${DEEP_EP_CUDA_ARCHITECTURES} + LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/deep_ep_cpp_tllm.version INSTALL_RPATH "$ORIGIN/libs/nvshmem;${TORCH_INSTALL_PREFIX}/lib" BUILD_WITH_INSTALL_RPATH TRUE) target_compile_options( From 670a2bfdac8377705f0df1f1269c1ea94ca42b52 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Mon, 7 Jul 2025 03:47:54 +0000 Subject: [PATCH 37/40] Polish comments Signed-off-by: Tailing Yuan --- cpp/tensorrt_llm/deep_ep/strip_nvshmem_helper.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/tensorrt_llm/deep_ep/strip_nvshmem_helper.py b/cpp/tensorrt_llm/deep_ep/strip_nvshmem_helper.py index 97c38f1ced1..d97582d83ca 100644 --- a/cpp/tensorrt_llm/deep_ep/strip_nvshmem_helper.py +++ b/cpp/tensorrt_llm/deep_ep/strip_nvshmem_helper.py @@ -9,11 +9,11 @@ assert deep_ep_obj_dir.is_dir() # Run `find cpp/build | grep host/bootstrap/bootstrap.cpp.o$` to get the directory -# Please set to `nvshmem.dir` rather than `nvshmem_host.dir` +# Please set it to `nvshmem.dir` rather than `nvshmem_host.dir` nvshmem_obj_dir = project_dir / "cpp/build/tensorrt_llm/deep_ep/nvshmem-build/src/CMakeFiles/nvshmem.dir" assert nvshmem_obj_dir.is_dir() -# Parse -gencode arguments +# Parse the `-gencode` arguments with (project_dir / "cpp/build/tensorrt_llm/deep_ep/cuda_architectures.txt").open() as f: cuda_architectures = f.read() @@ -40,7 +40,7 @@ deep_ep_obj_list = sorted(deep_ep_obj_dir.glob("kernels/**/*.o")) nvshmem_obj_set = set(nvshmem_obj_dir.glob("**/*.o")) for exclude_obj in sorted(nvshmem_obj_set): - # Create liba.a with one object file less + # Create liba.a with one fewer object file subprocess.check_call( ["ar", "rcs", ranlib, *(nvshmem_obj_set - {exclude_obj})]) # Test whether there are undefined symbols @@ -48,8 +48,8 @@ "/usr/local/cuda/bin/nvcc", *gencode_args, "-Xlinker", "--no-undefined", "-shared", *deep_ep_obj_list, ranlib, "-o", temp_dir / "a.out" ]) - # If there is no undefined symbols, then print "-" indicating the file could be omitted. + # If there are no undefined symbols, print "-" to indicate the file could be omitted. print("-" if res == 0 else "+", str(exclude_obj.relative_to(nvshmem_obj_dir))[:-2]) - # Unlink the ranlib because `ar` does append + # Unlink the ranlib because `ar` appends existing archives ranlib.unlink() From 9d41946fa13192cd5c17d453ba58efe6b4060ccc Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Mon, 7 Jul 2025 04:18:25 +0000 Subject: [PATCH 38/40] Update Docker images Signed-off-by: Tailing Yuan --- jenkins/controlCCache.groovy | 2 +- jenkins/current_image_tags.properties | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/jenkins/controlCCache.groovy b/jenkins/controlCCache.groovy index 4932b5ae59a..32e000bc263 100644 --- a/jenkins/controlCCache.groovy +++ b/jenkins/controlCCache.groovy @@ -1,7 +1,7 @@ import java.lang.InterruptedException -DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202507041650-5534" +DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202507071100-5534" def createKubernetesPodConfig(image, arch = "amd64") { diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index 0edfb9a9c9b..cf86b5292b2 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -4,9 +4,9 @@ # https://code.visualstudio.com/remote/advancedcontainers/environment-variables#_option-2-use-an-env-file # for reuse in Dev Containers configuration. # Also, the file needs to be parseable by 'sh' for reuse by docker/Makefile. -LLM_DOCKER_IMAGE_URI=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/tritondevel -LLM_DOCKER_IMAGE_TAG_SUFFIX=-869627f-github-pr-5534-357 -LLM_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:x86_64-tritondevel-torch_skip${LLM_DOCKER_IMAGE_TAG_SUFFIX} -LLM_SBSA_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:sbsa-tritondevel-torch_skip${LLM_DOCKER_IMAGE_TAG_SUFFIX} -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:x86_64-rockylinux8-torch_skip-py310${LLM_DOCKER_IMAGE_TAG_SUFFIX} -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:x86_64-rockylinux8-torch_skip-py312${LLM_DOCKER_IMAGE_TAG_SUFFIX} +LLM_DOCKER_IMAGE_URI=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm +LLM_DOCKER_IMAGE_TAG_SUFFIX=-trt10.11.0.33-skip-tritondevel-202507071100-5534 +LLM_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:pytorch-25.05-py3-x86_64-ubuntu24.04${LLM_DOCKER_IMAGE_TAG_SUFFIX} +LLM_SBSA_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:pytorch-25.05-py3-aarch64-ubuntu24.04${LLM_DOCKER_IMAGE_TAG_SUFFIX} +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310${LLM_DOCKER_IMAGE_TAG_SUFFIX} +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312${LLM_DOCKER_IMAGE_TAG_SUFFIX} From c9211426fc12944ab234c334ef2c8afff3282047 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Mon, 7 Jul 2025 06:12:11 +0000 Subject: [PATCH 39/40] Set DISABLE_AGGRESSIVE_PTX_INSTRS Signed-off-by: Tailing Yuan --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 76fd96fa9ca..3e9874973b8 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -191,8 +191,9 @@ target_compile_options( deep_ep_cpp_tllm PRIVATE ${TORCH_CXX_FLAGS} -O3 $<$:-Xcompiler=-O3> $<$:--ptxas-options=--register-usage-level=10>) -target_compile_definitions(deep_ep_cpp_tllm - PRIVATE TORCH_EXTENSION_NAME=deep_ep_cpp_tllm) +target_compile_definitions( + deep_ep_cpp_tllm PRIVATE DISABLE_AGGRESSIVE_PTX_INSTRS + TORCH_EXTENSION_NAME=deep_ep_cpp_tllm) target_link_libraries( deep_ep_cpp_tllm PRIVATE nvshmem_project::nvshmem ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIB}) From ca3cdcf51d381b7fe2064f42b40d7dbf7d2d321d Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Mon, 7 Jul 2025 10:32:52 +0000 Subject: [PATCH 40/40] Update comments Signed-off-by: Tailing Yuan --- cpp/tensorrt_llm/deep_ep/README.md | 4 ++-- cpp/tensorrt_llm/deep_ep/strip_nvshmem_helper.py | 12 +++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/cpp/tensorrt_llm/deep_ep/README.md b/cpp/tensorrt_llm/deep_ep/README.md index c11e6bec3e2..5c1ea0c797f 100644 --- a/cpp/tensorrt_llm/deep_ep/README.md +++ b/cpp/tensorrt_llm/deep_ep/README.md @@ -1,7 +1,7 @@ How to generate `nvshmem_fast_build.patch`? -1. Build without `nvshmem_fast_build.patch`. -2. Try linking DeepEP to NVSHMEM while omitting one object file. +1. Build the project without applying the `nvshmem_fast_build.patch`. +2. Link NVSHMEM to DeepEP with one NVSHMEM object file omitted. 3. Repeat step 2 until no more object files can be omitted. 4. Remove the unused files from NVSHMEM's `CMakelists.txt`, and save the differences as `nvshmem_fast_build.patch`. diff --git a/cpp/tensorrt_llm/deep_ep/strip_nvshmem_helper.py b/cpp/tensorrt_llm/deep_ep/strip_nvshmem_helper.py index d97582d83ca..e9901558970 100644 --- a/cpp/tensorrt_llm/deep_ep/strip_nvshmem_helper.py +++ b/cpp/tensorrt_llm/deep_ep/strip_nvshmem_helper.py @@ -1,3 +1,9 @@ +# A helper script to detect unused NVSHMEM object files. +# +# The script links NVSHMEM to DeepEP with one object file removed at a time and +# checks whether there are any undefined symbols. See README.md for details. +# This script is not tested or QA'ed, so you may need to update this script if +# the project structure changes or compilation options change. import pathlib import re import subprocess @@ -40,7 +46,7 @@ deep_ep_obj_list = sorted(deep_ep_obj_dir.glob("kernels/**/*.o")) nvshmem_obj_set = set(nvshmem_obj_dir.glob("**/*.o")) for exclude_obj in sorted(nvshmem_obj_set): - # Create liba.a with one fewer object file + # Create liba.a with one object file removed subprocess.check_call( ["ar", "rcs", ranlib, *(nvshmem_obj_set - {exclude_obj})]) # Test whether there are undefined symbols @@ -48,8 +54,8 @@ "/usr/local/cuda/bin/nvcc", *gencode_args, "-Xlinker", "--no-undefined", "-shared", *deep_ep_obj_list, ranlib, "-o", temp_dir / "a.out" ]) - # If there are no undefined symbols, print "-" to indicate the file could be omitted. + # If there are no undefined symbols, print "-" to indicate the file can be omitted print("-" if res == 0 else "+", str(exclude_obj.relative_to(nvshmem_obj_dir))[:-2]) - # Unlink the ranlib because `ar` appends existing archives + # Unlink the archive file because `ar` appends existing archives ranlib.unlink()