
Commit 30817d5

Merge branch 'main' into mtp_optimizations_round1
2 parents: 8d3981d + 5aa958a · commit: 30817d5

181 files changed: +4674 / -2302 lines changed


.github/workflows/blossom-ci.yml

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ jobs:
       startsWith(github.event.comment.body, '/bot skip --comment') ||
       startsWith(github.event.comment.body, '/bot reuse-pipeline') ||
       startsWith(github.event.comment.body, '/bot kill')) && contains(
-      fromJson('["byshiue","chuangz0","funatiq","hypdeb","jdemouth-nvidia","joyang-nv","lowsfer","Tabrizian","yweng0828","Shixiaowei02","MartinMarciniszyn","schetlur-nv","dcampora","pcastonguay","Naveassaf","lfr-0531","nekorobov","PerkzZheng","kaiyux","nv-guomingz","LinPoly","thorjohnsen","jiahanc","latency1024","tburt-nv","zeroepoch","chzblych","niukuo","ZhanruiSunCh","EmmaQiaoCh","yiqingy0","achartier","suyoggupta","amukkara","mk-nvidia","QiJune","lucaslie","davidmlw","hlu1","nvzhou","syuoni","NVGaryJi","symphonylyh","hello-11","zongfeijing","Jackch-NV","jinyangyuan-nvidia","LarryXFly","crazydemo","jaedeok-nvidia","wm2012011492","rosenrodt","zhuoyao1012","xinhe-nv","Yuening-wa","Shunkangz","zhengd-nv","yibinl-nvidia","StanleySun639","KingsleyLiu-NV","kxdc","yingcanw","BestJuly","ChristinaZ","bobboli","xueweilnvidia","kunlunl","cherichy","lucifer1004","Autumn1998","litaotju","peaceh-nv","liji-nv","SimengLiu-nv","yuxianq","yechank-nvidia","vallis-neria","DylanChen-NV","Tracin","zhhuang-nv","ISEEKYAN","xupinjie","tongyuantongyu","laikhtewari","zhuolingwang","dominicshanshan","jershi425","shifangx","StudyingShao","Superjomn","dongjiyingdjy","guangyunh-nv","wili-65535","tiffany940107","DanBlanaru","mikeiovine","djns99","ruodil","xiaoweiw-nv","xuwchen","bashimao","yizhang-nv","hyukn","nvpohanh","yuki-666","juney-nvidia","barry-delaney","Kefeng-Duan","MinaHuai","yilin-void","jhaotingc","jmydurant","katec846","CarstyYou","Njuapp","Jie-Fang","nvbrantz","inocsin","ruoqianguo","chenfeiz0326","ming-wei","eopXD","longlee0622","dongfengy","georgeliu95","evezhier","rakib-hasan","shangz-ai","JyChang012","wangsiping1997","yuanjings-nvda","tomeras91","roikoren755","amirkl94","shaharmor98","danielafrimi","amitz-nv","hijkzzz","rzilberstein-nvidia","dc3671","hchings","yuhengxnv","dongxuy04","qiaoxj07","omera-nv","DomBrown","brb-nv","FrankD412","yuhsuan-t","Fridah-nv","a-mccarthy","HuiGao-NV","alexmsettle","meenchen","sugunav14","cjluo-nv","kyleliang-nv","chang-l","WeiHaocheng","qixiang-99","BatshevaBlack","ebarilanM","xmchen1987","lingjiew","heyuhhh","netanel-haber","jiefangz-nv","wyw1267","yunruis","sklevtsov-nvidia","jgangani","pamelap-nvidia","ixlmar","GalSha","Dido0o0","rabiel","nvzhihanj","milesial","fzmu727","zackyoray","RoeyAzran1992","viraatc","v-shobhit","yuanjingx87","uchihatmtkinu","nvrohanv","vegaluisjose","qsang-nv","ChunhuanLin","timlee0212","venkywonka","zbpatel","tijyojwad","shyeh25","zihaok","nv-yilinf","ttyio","farazkh80","yuantailing","JennyLiu-nv","moraxu","IzzyPutterman","nvchenghaoz","nvxuanyuc","poweiw","stnie","zhanga5","nzmora-nvidia","greg-kwasniewski1","linda-stadter","Tom-Zheng","vanshilshah97","ixlmar","MatthiasKohl","Wanli-Jiang", "arekay", "davidclark-nv", "2ez4bz", "tcherckez-nvidia", "MrGeva", "galagam", "limin2021", "dhansen-nvidia","talorabr","kanghui0204","wu6u3tw","hvagadia","xavier-nvidia","raayandhar"]'),
+      fromJson('["byshiue","chuangz0","funatiq","hypdeb","jdemouth-nvidia","joyang-nv","lowsfer","Tabrizian","yweng0828","Shixiaowei02","MartinMarciniszyn","schetlur-nv","dcampora","pcastonguay","Naveassaf","lfr-0531","nekorobov","PerkzZheng","kaiyux","nv-guomingz","LinPoly","thorjohnsen","jiahanc","latency1024","tburt-nv","zeroepoch","chzblych","niukuo","ZhanruiSunCh","EmmaQiaoCh","yiqingy0","achartier","suyoggupta","amukkara","mk-nvidia","QiJune","lucaslie","davidmlw","hlu1","nvzhou","syuoni","NVGaryJi","symphonylyh","hello-11","zongfeijing","Jackch-NV","jinyangyuan-nvidia","LarryXFly","crazydemo","jaedeok-nvidia","wm2012011492","rosenrodt","zhuoyao1012","xinhe-nv","Yuening-wa","Shunkangz","zhengd-nv","yibinl-nvidia","StanleySun639","KingsleyLiu-NV","kxdc","yingcanw","BestJuly","ChristinaZ","bobboli","xueweilnvidia","kunlunl","cherichy","lucifer1004","Autumn1998","litaotju","peaceh-nv","liji-nv","SimengLiu-nv","yuxianq","yechank-nvidia","vallis-neria","DylanChen-NV","Tracin","zhhuang-nv","ISEEKYAN","xupinjie","tongyuantongyu","laikhtewari","zhuolingwang","dominicshanshan","jershi425","shifangx","StudyingShao","Superjomn","dongjiyingdjy","guangyunh-nv","wili-65535","tiffany940107","DanBlanaru","mikeiovine","djns99","ruodil","xiaoweiw-nv","xuwchen","bashimao","yizhang-nv","hyukn","nvpohanh","yuki-666","juney-nvidia","barry-delaney","Kefeng-Duan","MinaHuai","yilin-void","jhaotingc","jmydurant","katec846","CarstyYou","Njuapp","Jie-Fang","nvbrantz","inocsin","ruoqianguo","chenfeiz0326","ming-wei","eopXD","longlee0622","dongfengy","georgeliu95","evezhier","rakib-hasan","shangz-ai","JyChang012","wangsiping1997","yuanjings-nvda","tomeras91","roikoren755","amirkl94","shaharmor98","danielafrimi","amitz-nv","hijkzzz","rzilberstein-nvidia","dc3671","hchings","yuhengxnv","dongxuy04","qiaoxj07","omera-nv","DomBrown","brb-nv","FrankD412","yuhsuan-t","Fridah-nv","a-mccarthy","HuiGao-NV","alexmsettle","meenchen","sugunav14","cjluo-nv","kyleliang-nv","chang-l","WeiHaocheng","qixiang-99","BatshevaBlack","ebarilanM","xmchen1987","lingjiew","heyuhhh","netanel-haber","jiefangz-nv","wyw1267","yunruis","sklevtsov-nvidia","jgangani","pamelap-nvidia","ixlmar","GalSha","Dido0o0","rabiel","nvzhihanj","milesial","fzmu727","zackyoray","RoeyAzran1992","viraatc","v-shobhit","yuanjingx87","uchihatmtkinu","nvrohanv","vegaluisjose","qsang-nv","ChunhuanLin","timlee0212","venkywonka","zbpatel","tijyojwad","shyeh25","zihaok","nv-yilinf","ttyio","farazkh80","yuantailing","JennyLiu-nv","moraxu","IzzyPutterman","nvchenghaoz","nvxuanyuc","poweiw","stnie","zhanga5","nzmora-nvidia","greg-kwasniewski1","linda-stadter","Tom-Zheng","vanshilshah97","ixlmar","MatthiasKohl","Wanli-Jiang", "arekay", "davidclark-nv", "2ez4bz", "tcherckez-nvidia", "MrGeva", "galagam", "limin2021", "dhansen-nvidia","talorabr","kanghui0204","wu6u3tw","hvagadia","xavier-nvidia","raayandhar","dbari","nvjullin","elvischenv","zhenhuaw-me","weireweire","yifeizhang-c","jiaganc","ziyixiong-nv","FelixXidddd","JunyiXu-nv","bo-nv","zerollzeng","RayenTian","ameynaik-hub"]'),
       github.actor)
     steps:
       - name: Check if comment is issued by authorized person

.github/workflows/label_community_pr.yml

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ on:
 jobs:
   label_pr:
     runs-on: ubuntu-latest
+    if: github.repository == 'NVIDIA/TensorRT-LLM'
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3

README.md

Lines changed: 13 additions & 14 deletions
@@ -9,7 +9,7 @@ TensorRT-LLM
 [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-12.9.0-green)](https://developer.nvidia.com/cuda-downloads)
 [![trt](https://img.shields.io/badge/TRT-10.11.0-green)](https://developer.nvidia.com/tensorrt)
-[![version](https://img.shields.io/badge/release-1.0.0rc2-green)](./tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-1.0.0rc3-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
 
 [Architecture](./docs/source/torch/arch_overview.md)   |   [Performance](./docs/source/performance/perf-overview.md)   |   [Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)   |   [Documentation](./docs/source/)   |   [Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)
@@ -61,21 +61,22 @@ TensorRT-LLM
 * [02/12] 🌟 How Scaling Laws Drive Smarter, More Powerful AI
 [➡️ link](https://blogs.nvidia.com/blog/ai-scaling-laws/?ncid=so-link-889273&linkId=100000338837832)
 
-* [01/25] Nvidia moves AI focus to inference cost, efficiency [➡️ link](https://www.fierceelectronics.com/ai/nvidia-moves-ai-focus-inference-cost-efficiency?linkId=100000332985606)
 
-* [01/24] 🏎️ Optimize AI Inference Performance with NVIDIA Full-Stack Solutions [➡️ link](https://developer.nvidia.com/blog/optimize-ai-inference-performance-with-nvidia-full-stack-solutions/?ncid=so-twit-400810&linkId=100000332621049)
+<details close>
+<summary>Previous News</summary>
 
-* [01/23] 🚀 Fast, Low-Cost Inference Offers Key to Profitable AI [➡️ link](https://blogs.nvidia.com/blog/ai-inference-platform/?ncid=so-twit-693236-vt04&linkId=100000332307804)
+* [2025/01/25] Nvidia moves AI focus to inference cost, efficiency [➡️ link](https://www.fierceelectronics.com/ai/nvidia-moves-ai-focus-inference-cost-efficiency?linkId=100000332985606)
 
-* [01/16] Introducing New KV Cache Reuse Optimizations in TensorRT-LLM [➡️ link](https://developer.nvidia.com/blog/introducing-new-kv-cache-reuse-optimizations-in-nvidia-tensorrt-llm/?ncid=so-twit-363876&linkId=100000330323229)
+* [2025/01/24] 🏎️ Optimize AI Inference Performance with NVIDIA Full-Stack Solutions [➡️ link](https://developer.nvidia.com/blog/optimize-ai-inference-performance-with-nvidia-full-stack-solutions/?ncid=so-twit-400810&linkId=100000332621049)
 
-* [01/14] 📣 Bing's Transition to LLM/SLM Models: Optimizing Search with TensorRT-LLM [➡️ link](https://blogs.bing.com/search-quality-insights/December-2024/Bing-s-Transition-to-LLM-SLM-Models-Optimizing-Search-with-TensorRT-LLM)
+* [2025/01/23] 🚀 Fast, Low-Cost Inference Offers Key to Profitable AI [➡️ link](https://blogs.nvidia.com/blog/ai-inference-platform/?ncid=so-twit-693236-vt04&linkId=100000332307804)
 
-* [01/04] ⚡Boost Llama 3.3 70B Inference Throughput 3x with TensorRT-LLM Speculative Decoding
-[➡️ link](https://developer.nvidia.com/blog/boost-llama-3-3-70b-inference-throughput-3x-with-nvidia-tensorrt-llm-speculative-decoding/)
+* [2025/01/16] Introducing New KV Cache Reuse Optimizations in TensorRT-LLM [➡️ link](https://developer.nvidia.com/blog/introducing-new-kv-cache-reuse-optimizations-in-nvidia-tensorrt-llm/?ncid=so-twit-363876&linkId=100000330323229)
 
-<details close>
-<summary>Previous News</summary>
+* [2025/01/14] 📣 Bing's Transition to LLM/SLM Models: Optimizing Search with TensorRT-LLM [➡️ link](https://blogs.bing.com/search-quality-insights/December-2024/Bing-s-Transition-to-LLM-SLM-Models-Optimizing-Search-with-TensorRT-LLM)
+
+* [2025/01/04] ⚡Boost Llama 3.3 70B Inference Throughput 3x with TensorRT-LLM Speculative Decoding
+[➡️ link](https://developer.nvidia.com/blog/boost-llama-3-3-70b-inference-throughput-3x-with-nvidia-tensorrt-llm-speculative-decoding/)
 
 * [2024/12/10] ⚡ Llama 3.3 70B from AI at Meta is accelerated by TensorRT-LLM. 🌟 State-of-the-art model on par with Llama 3.1 405B for reasoning, math, instruction following and tool use. Explore the preview
 [➡️ link](https://build.nvidia.com/meta/llama-3_3-70b-instruct)
@@ -204,11 +205,9 @@ Serverless TensorRT-LLM (LLaMA 3 8B) | Modal Docs [➡️ link](https://modal.co
 
 TensorRT-LLM is an open-sourced library for optimizing Large Language Model (LLM) inference. It provides state-of-the-art optimizations, including custom attention kernels, inflight batching, paged KV caching, quantization (FP8, [FP4](https://www.nvidia.com/en-us/data-center/technologies/blackwell-architecture/), INT4 [AWQ](https://arxiv.org/abs/2306.00978), INT8 [SmoothQuant](https://arxiv.org/abs/2211.10438), ...), speculative decoding, and much more, to perform inference efficiently on NVIDIA GPUs.
 
-Recently [re-architected with a **PyTorch backend**](https://nvidia.github.io/TensorRT-LLM/torch.html), TensorRT-LLM now combines peak performance with a more flexible and developer-friendly workflow. The original [TensorRT](https://developer.nvidia.com/tensorrt)-based backend remains supported and continues to provide an ahead-of-time compilation path for building highly optimized "[Engines](https://docs.nvidia.com/deeplearning/tensorrt/quick-start-guide/index.html#ecosystem)" for deployment. The PyTorch backend complements this by enabling faster development iteration and rapid experimentation.
-
-TensorRT-LLM provides a flexible [**LLM API**](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html#llm-api) to simplify model setup and inference across both PyTorch and TensorRT backends. It supports a wide range of inference use cases from a single GPU to multiple nodes with multiple GPUs using [Tensor Parallelism](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/features/parallelisms.html#tensor-parallelism) and/or [Pipeline Parallelism](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/features/parallelisms.html#pipeline-parallelism). It also includes a [backend](https://github.com/triton-inference-server/tensorrtllm_backend) for integration with the [NVIDIA Triton Inference Server](https://developer.nvidia.com/nvidia-triton-inference-server).
+[Architected on PyTorch](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/torch/arch_overview.md), TensorRT-LLM provides a high-level Python [LLM API](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html#llm-api) that supports a wide range of inference setups - from single-GPU to multi-GPU or multi-node deployments. It includes built-in support for various parallelism strategies and advanced features. The LLM API integrates seamlessly with the broader inference ecosystem, including NVIDIA [Dynamo](https://github.com/ai-dynamo/dynamo) and the [Triton Inference Server](https://github.com/triton-inference-server/server).
 
-Several popular models are pre-defined and can be easily customized or extended using [native PyTorch code](./tensorrt_llm/_torch/models/modeling_deepseekv3.py) (for the PyTorch backend) or a [PyTorch-style Python API](./tensorrt_llm/models/llama/model.py) (for the TensorRT backend).
+TensorRT-LLM is designed to be modular and easy to modify. Its PyTorch-native architecture allows developers to experiment with the runtime or extend functionality. Several popular models are also pre-defined and can be customized using [native PyTorch code](./tensorrt_llm/_torch/models/modeling_deepseekv3.py), making it easy to adapt the system to specific needs.
 
 
 ## Getting Started

cpp/CMakeLists.txt

Lines changed: 18 additions & 2 deletions
@@ -45,6 +45,7 @@ option(ENABLE_MULTI_DEVICE
 option(ENABLE_UCX "Enable building with UCX (Uniform Communication X) support"
        ON)
 option(NVRTC_DYNAMIC_LINKING "Link against the dynamic NVRTC libraries" OFF)
+option(ENABLE_NVSHMEM "Enable building with NVSHMEM support" OFF)
 option(USING_OSS_CUTLASS_LOW_LATENCY_GEMM
        "Using open sourced Cutlass low latency gemm kernel" ON)
 option(USING_OSS_CUTLASS_FP4_GEMM "Using open sourced Cutlass fp4 gemm kernel"
@@ -54,6 +55,8 @@ option(USING_OSS_CUTLASS_MOE_GEMM "Using open sourced Cutlass moe gemm kernel"
 option(USING_OSS_CUTLASS_ALLREDUCE_GEMM
        "Using open sourced Cutlass AR gemm kernel" ON)
 
+message(STATUS "ENABLE_NVSHMEM is ${ENABLE_NVSHMEM}")
+
 if(NVTX_DISABLE)
   add_compile_definitions("NVTX_DISABLE")
   message(STATUS "NVTX is disabled")
@@ -171,6 +174,7 @@ message(STATUS "CUDA library status:")
 message(STATUS " version: ${CUDAToolkit_VERSION}")
 message(STATUS " libraries: ${CUDAToolkit_LIBRARY_DIR}")
 message(STATUS " include path: ${CUDAToolkit_INCLUDE_DIRS}")
+message(STATUS "CUDA_NVML_LIB: ${CUDA_NVML_LIB}")
 
 # Prevent CMake from creating a response file for CUDA compiler, so clangd can
 # pick up on the includes
@@ -262,9 +266,21 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss ")
 # note: cmake expr generation $<BOOL:${ENABLE_MULTI_DEVICE}> is a build time
 # evaluation so hard to debug at cmake time
 if(ENABLE_MULTI_DEVICE)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_MULTI_DEVICE=1")
+  # Add target definitions for both C++ and CUDA
+  add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:ENABLE_MULTI_DEVICE=1>
+                          $<$<COMPILE_LANGUAGE:CUDA>:ENABLE_MULTI_DEVICE=1>)
+else()
+  # Add target definitions for both C++ and CUDA
+  add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:ENABLE_MULTI_DEVICE=0>
+                          $<$<COMPILE_LANGUAGE:CUDA>:ENABLE_MULTI_DEVICE=0>)
+endif()
+
+if(ENABLE_NVSHMEM)
+  add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:ENABLE_NVSHMEM=1>
+                          $<$<COMPILE_LANGUAGE:CUDA>:ENABLE_NVSHMEM=1>)
 else()
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_MULTI_DEVICE=0")
+  add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:ENABLE_NVSHMEM=0>
+                          $<$<COMPILE_LANGUAGE:CUDA>:ENABLE_NVSHMEM=0>)
 endif()
 
 # Fix linking issue with TRT 10, the detailed description about `--mcmodel` can
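Note on the change above: appending -DENABLE_MULTI_DEVICE=... to CMAKE_CXX_FLAGS only reaches C++ compilation, whereas add_compile_definitions() with $<COMPILE_LANGUAGE:CXX> / $<COMPILE_LANGUAGE:CUDA> generator expressions defines the macro for both host and CUDA translation units. The sketch below is illustrative only, not code from this commit; initCommunication() is a hypothetical name.

// Illustrative sketch only (not part of this commit): how a translation unit
// compiled as either C++ or CUDA might consume the ENABLE_MULTI_DEVICE and
// ENABLE_NVSHMEM macros that the CMake change now defines for both languages.
#if ENABLE_MULTI_DEVICE
#include <mpi.h>
#endif
#if ENABLE_NVSHMEM
#include <nvshmem.h>
#endif

inline void initCommunication() // hypothetical helper, for illustration
{
#if ENABLE_MULTI_DEVICE
    int initialized = 0;
    MPI_Initialized(&initialized);
    if (!initialized)
    {
        MPI_Init(nullptr, nullptr); // real code may need MPI_Init_thread
    }
#endif
#if ENABLE_NVSHMEM
    nvshmem_init(); // collective: every participating PE must reach this call
#endif
}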

cpp/include/tensorrt_llm/common/cudaUtils.h

Lines changed: 6 additions & 1 deletion
@@ -303,7 +303,12 @@ inline int getSMVersion()
     int sm_minor = 0;
     check_cuda_error(cudaDeviceGetAttribute(&sm_major, cudaDevAttrComputeCapabilityMajor, device));
     check_cuda_error(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device));
-    return sm_major * 10 + sm_minor;
+    int sm = sm_major * 10 + sm_minor;
+    if (sm == 121)
+    {
+        return 120;
+    }
+    return sm;
 }
 
 inline int getDevice()
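Note on the change above: getSMVersion() now reports compute capability 12.1 as 120, so callers that dispatch on the SM version treat SM 121 devices the same as SM 120 ones. A minimal illustration of that effect follows; selectKernelFamily() is a hypothetical dispatcher, not TensorRT-LLM code.

#include <stdexcept>
#include <string>

// Hypothetical dispatcher, for illustration only: with getSMVersion() clamping
// 121 to 120, an SM 12.1 device falls into the existing SM 120 branch instead
// of reaching the unsupported-architecture path.
inline std::string selectKernelFamily(int smVersion)
{
    if (smVersion >= 120)
    {
        return "sm120";
    }
    if (smVersion >= 100)
    {
        return "sm100";
    }
    if (smVersion >= 90)
    {
        return "sm90";
    }
    if (smVersion >= 80)
    {
        return "sm80";
    }
    throw std::runtime_error("Unsupported SM version: " + std::to_string(smVersion));
}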

cpp/tensorrt_llm/CMakeLists.txt

Lines changed: 10 additions & 1 deletion
@@ -72,6 +72,12 @@ if(ENABLE_MULTI_DEVICE)
   include_directories(${MPI_C_INCLUDE_DIRS})
 endif()
 
+if(ENABLE_NVSHMEM)
+  # Add hints for aarch64
+  find_package(NVSHMEM REQUIRED HINTS /usr/lib/sbsa-linux-gnu/cmake/nvshmem/)
+  include_directories(/usr/include/nvshmem/)
+endif()
+
 if(NOT WIN32)
   set(DECODER_SHARED_TARGET_0 decoder_attention_0)
   set(DECODER_SHARED_TARGET_1 decoder_attention_1)
@@ -231,7 +237,10 @@ if(ENABLE_MULTI_DEVICE)
   set(TRTLLM_LINK_LIBS ${TRTLLM_LINK_LIBS} ${MPI_C_LIBRARIES} ${NCCL_LIB})
 endif()
 
-message("TRTLLM_LINK_LIBS: ${TRTLLM_LINK_LIBS}")
+if(ENABLE_NVSHMEM)
+  set(TRTLLM_LINK_LIBS ${TRTLLM_LINK_LIBS} nvshmem::nvshmem_host
+                       nvshmem::nvshmem_device)
+endif()
 
 if(NOT WIN32) # Unix-like compilers
   set(UNDEFINED_FLAG "-Wl,--no-undefined")
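Note on the change above: with ENABLE_NVSHMEM on, the library links the imported targets nvshmem::nvshmem_host (host-side API) and nvshmem::nvshmem_device (device-callable library). As rough orientation only, and assuming the standard NVSHMEM host API rather than anything in this repository, a minimal program built against those targets might look like this:

#include <cstdio>
#include <nvshmem.h>

// Minimal NVSHMEM host-API sketch (assumed standard NVSHMEM API, not
// TensorRT-LLM code). Symmetric-heap allocations are what enable one-sided
// GPU-to-GPU puts/gets in device code.
int main()
{
    nvshmem_init();

    std::printf("PE %d of %d\n", nvshmem_my_pe(), nvshmem_n_pes());

    // Every PE must allocate the same size from the symmetric heap.
    float* buffer = static_cast<float*>(nvshmem_malloc(1024 * sizeof(float)));

    nvshmem_free(buffer);
    nvshmem_finalize();
    return 0;
}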

cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp

Lines changed: 3 additions & 1 deletion
@@ -517,7 +517,9 @@ void CacheTransceiver::checkGenTransferStatus(std::optional<int> const& atLeastR
     // Gather the kv cache transfer time from all workers and update to leader rank
     if (!common::getEnvKVCacheTransferOutputPath().empty())
     {
-        updateKVCacheTransferBW(*mMpiGroupComm, it->first);
+        auto syncComm
+            = mCacheState->getParallelConfig().mEnableAttentionDP ? mMpiGroupDataComm.get() : mMpiGroupComm;
+        updateKVCacheTransferBW(*syncComm, it->first);
     }
     TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(),
         "**** it->first->mRequestId: %ld, context request ID: %ld ******** get feature ***",
