
Commit 60b3ad9

Merge branch 'main' into jthomson04/connector-api
2 parents f9a3960 + 344bc45 · commit 60b3ad9

78 files changed: +2249, -791 lines


.gitattributes

Lines changed: 2 additions & 0 deletions
@@ -7,3 +7,5 @@
 triton_backend/tools/gpt/input_data.json filter=lfs diff=lfs merge=lfs -text
 *cubin.cpp filter=lfs diff=lfs merge=lfs -text
 docs/source/blogs/media/tech_blog3_mla_absorb.png filter=lfs diff=lfs merge=lfs -text
+tests/integration/test_input_files/*.png filter=lfs diff=lfs merge=lfs -text
+tests/integration/test_input_files/*.jpg filter=lfs diff=lfs merge=lfs -text

README.md

Lines changed: 2 additions & 2 deletions
@@ -18,10 +18,9 @@ TensorRT-LLM
 <div align="left">
 
 ## Tech Blogs
-* [08/06] Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM
+* [08/05] Running a High-Performance GPT-OSS-120B Inference Server with TensorRT-LLM
 [➡️ link](./docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md)
 
-
 * [08/01] Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)
 [➡️ link](./docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md)
 
@@ -44,6 +43,7 @@ TensorRT-LLM
 [➡️ link](./docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md)
 
 ## Latest News
+* [08/05] 🌟 TensorRT-LLM delivers Day-0 support for OpenAI's latest open-weights models: GPT-OSS-120B [➡️ link](https://huggingface.co/openai/gpt-oss-120b) and GPT-OSS-20B [➡️ link](https://huggingface.co/openai/gpt-oss-20b)
 * [07/15] 🌟 TensorRT-LLM delivers Day-0 support for LG AI Research's latest model, EXAONE 4.0 [➡️ link](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B)
 * [06/17] Join NVIDIA and DeepInfra for a developer meetup on June 26 ✨ [➡️ link](https://events.nvidia.com/scaletheunscalablenextgenai)
 * [05/22] Blackwell Breaks the 1,000 TPS/User Barrier With Meta’s Llama 4 Maverick

cpp/include/tensorrt_llm/common/logger.h

Lines changed: 7 additions & 6 deletions
@@ -54,20 +54,21 @@ class Logger
 
 #if defined(_MSC_VER)
     template <typename... Args>
-    void log(Level level, char const* format, Args const&... args);
+    void log(Level const level, char const* format, Args const&... args);
 
     template <typename... Args>
-    void log(Level level, int rank, char const* format, Args const&... args);
+    void log(Level const level, int const rank, char const* format, Args const&... args);
#else
     template <typename... Args>
-    void log(Level level, char const* format, Args const&... args) __attribute__((format(printf, 3, 0)));
+    void log(Level const level, char const* format, Args const&... args) __attribute__((format(printf, 3, 0)));
 
     template <typename... Args>
-    void log(Level level, int rank, char const* format, Args const&... args) __attribute__((format(printf, 4, 0)));
+    void log(Level const level, int const rank, char const* format, Args const&... args)
+        __attribute__((format(printf, 4, 0)));
#endif
 
     template <typename... Args>
-    void log(Level level, std::string const& format, Args const&... args)
+    void log(Level const level, std::string const& format, Args const&... args)
     {
         return log(level, format.c_str(), args...);
     }
@@ -134,7 +135,7 @@ class Logger
 };
 
 template <typename... Args>
-void Logger::log(Logger::Level level, char const* format, Args const&... args)
+void Logger::log(Logger::Level const level, char const* format, Args const&... args)
 {
     if (isEnabled(level))
     {
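
A note on the format-attribute indices preserved by this hunk: for a non-static member function, GCC and Clang count the implicit this pointer as argument 1, which is consistent with the format string of log(Level, char const*, ...) sitting at index 3 (and at 4 for the overload that also takes a rank). The sketch below only illustrates that convention; the Example class and its C-style variadic log are hypothetical and not part of TensorRT-LLM:

#include <cstdarg>
#include <cstdio>

class Example
{
public:
    // this = 1, level = 2, format = 3; the trailing 0 tells the compiler not to
    // type-check the variadic arguments at the call site (they are forwarded below).
    void log(int level, char const* format, ...) __attribute__((format(printf, 3, 0)));
};

void Example::log(int level, char const* format, ...)
{
    if (level <= 0)
    {
        return;
    }
    va_list args;
    va_start(args, format);
    std::vfprintf(stderr, format, args); // forward to a printf-family sink
    va_end(args);
}

int main()
{
    Example{}.log(1, "hello %s, rank %d\n", "world", 0);
    return 0;
}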

cpp/include/tensorrt_llm/executor/dataTransceiverState.h

Lines changed: 20 additions & 12 deletions
@@ -52,29 +52,30 @@ class CacheState final
         AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2)
         : mModelConfig(std::move(modelConfig))
         , mParallelConfig{worldConfig.getTensorParallelism(), worldConfig.getPipelineParallelism(),
-              worldConfig.enableAttentionDP(), worldConfig.getTensorParallelRank(), worldConfig.getTensorParallelism()}
+              worldConfig.getContextParallelism(), worldConfig.enableAttentionDP(), worldConfig.getTensorParallelRank(),
+              worldConfig.getTensorParallelism()}
         , mDataType{dataType}
         , mAttentionConfig(attentionType, kvFactor)
     {
     }
 
     CacheState(std::vector<SizeType32> nbKvHeadPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
-        SizeType32 tensorParallelism, SizeType32 pipelineParallelism, nvinfer1::DataType dataType,
-        AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableAttentionDP = false,
-        int DPrank = 0, int DPsize = 0)
+        SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism,
+        nvinfer1::DataType dataType, AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2,
+        bool enableAttentionDP = false, int DPrank = 0, int DPsize = 0)
         : mModelConfig{std::move(nbKvHeadPerLayer), sizePerHead, tokensPerBlock}
-        , mParallelConfig{tensorParallelism, pipelineParallelism, enableAttentionDP, DPrank, DPsize}
+        , mParallelConfig{tensorParallelism, pipelineParallelism, contextParallelism, enableAttentionDP, DPrank, DPsize}
         , mDataType{dataType}
         , mAttentionConfig(attentionType, kvFactor)
     {
     }
 
     CacheState(SizeType32 nbAttentionLayers, SizeType32 nbKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
-        SizeType32 tensorParallelism, SizeType32 pipelineParallelism, nvinfer1::DataType dataType,
-        AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableAttentionDP = false,
-        int DPrank = 0, int DPsize = 0)
+        SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism,
+        nvinfer1::DataType dataType, AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2,
+        bool enableAttentionDP = false, int DPrank = 0, int DPsize = 0)
         : mModelConfig{std::vector(nbAttentionLayers, nbKvHeads), sizePerHead, tokensPerBlock}
-        , mParallelConfig{tensorParallelism, pipelineParallelism, enableAttentionDP, DPrank, DPsize}
+        , mParallelConfig{tensorParallelism, pipelineParallelism, contextParallelism, enableAttentionDP, DPrank, DPsize}
         , mDataType{dataType}
         , mAttentionConfig(attentionType, kvFactor)
     {
@@ -83,7 +84,7 @@ class CacheState final
     [[nodiscard]] bool operator==(kv_cache::CacheState const& other) const noexcept
     {
         return mModelConfig == other.mModelConfig && mParallelConfig == other.mParallelConfig
-            && mDataType == other.mDataType;
+            && mAttentionConfig == other.mAttentionConfig && mDataType == other.mDataType;
     }
 
     struct ModelConfig
@@ -103,15 +104,16 @@ class CacheState final
     {
         SizeType32 mTensorParallelism;
         SizeType32 mPipelineParallelism;
+        SizeType32 mContextParallelism;
         bool mEnableAttentionDP;
         SizeType32 mDPrank;
         SizeType32 mDPsize;
 
         [[nodiscard]] bool operator==(ParallelConfig const& other) const noexcept
         {
             return mTensorParallelism == other.mTensorParallelism && mPipelineParallelism == other.mPipelineParallelism
-                && mEnableAttentionDP == other.mEnableAttentionDP && mDPrank == other.mDPrank
-                && mDPsize == other.mDPsize;
+                && mContextParallelism == other.mContextParallelism && mEnableAttentionDP == other.mEnableAttentionDP
+                && mDPrank == other.mDPrank && mDPsize == other.mDPsize;
         }
     };
 
@@ -125,6 +127,11 @@ class CacheState final
     {
     }
 
+    [[nodiscard]] bool operator==(AttentionConfig const& other) const noexcept
+    {
+        return mAttentionType == other.mAttentionType && mKvFactor == other.mKvFactor;
+    }
+
     // attentionType ;
     AttentionType mAttentionType;
     int mKvFactor;
@@ -162,6 +169,7 @@ class CacheState final
         sstring << "mTokensPerBlock:" << mModelConfig.mTokensPerBlock << "\n";
         sstring << "tp:" << mParallelConfig.mTensorParallelism << "\n";
         sstring << "pp:" << mParallelConfig.mPipelineParallelism << "\n";
+        sstring << "cp:" << mParallelConfig.mContextParallelism << "\n";
         sstring << "enableAttentionDP:" << mParallelConfig.mEnableAttentionDP << "\n";
         sstring << "datatype:" << static_cast<int32_t>(mDataType) << "\n";
         sstring << "attentionType:" << static_cast<int32_t>(mAttentionConfig.mAttentionType) << "\n";
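
Taken together, the changes above do two things: ParallelConfig gains an mContextParallelism field that participates in its equality check, and CacheState::operator== now also compares the attention configuration, so two states that differ only in attentionType or kvFactor no longer compare equal. Below is a simplified, self-contained sketch of that comparison behavior; the Toy* structs only mirror the shape of the real types and are not part of TensorRT-LLM:

#include <iostream>

// Toy mirrors of the structs touched in this header; field names follow the diff.
struct ToyParallelConfig
{
    int mTensorParallelism;
    int mPipelineParallelism;
    int mContextParallelism; // newly added field
    bool mEnableAttentionDP;

    bool operator==(ToyParallelConfig const& other) const noexcept
    {
        return mTensorParallelism == other.mTensorParallelism
            && mPipelineParallelism == other.mPipelineParallelism
            && mContextParallelism == other.mContextParallelism
            && mEnableAttentionDP == other.mEnableAttentionDP;
    }
};

struct ToyAttentionConfig
{
    int mAttentionType;
    int mKvFactor;

    bool operator==(ToyAttentionConfig const& other) const noexcept
    {
        return mAttentionType == other.mAttentionType && mKvFactor == other.mKvFactor;
    }
};

struct ToyCacheState
{
    ToyParallelConfig mParallelConfig;
    ToyAttentionConfig mAttentionConfig;

    bool operator==(ToyCacheState const& other) const noexcept
    {
        // The attention config is now part of the comparison as well.
        return mParallelConfig == other.mParallelConfig && mAttentionConfig == other.mAttentionConfig;
    }
};

int main()
{
    ToyCacheState a{{8, 2, 1, false}, {0, 2}};
    ToyCacheState b{{8, 2, 1, false}, {0, 1}}; // differs only in kvFactor
    std::cout << std::boolalpha << (a == b) << "\n"; // false under the new comparison
    return 0;
}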

cpp/kernels/xqa/mha_sm90.cu

Lines changed: 7 additions & 7 deletions
@@ -1012,7 +1012,7 @@ CUBIN_EXPORT __global__
     if (threadIdx.x < smem.gemm1AccColMax.size)
     {
         auto const idx = threadIdx.x;
-        smem.gemm1AccColMax[idx] = mha::numeric_limits<float>::lowest();
+        smem.gemm1AccColMax[idx] = safeInitRowMax;
         smem.gemm1AccColSum[idx] = 0;
     }
     smem.gemm1WarpGrpBar.arrive_and_wait();
@@ -1949,15 +1949,15 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec,
                 uint32_t const globalRow = tileStartRow + row;
                 if (globalRow >= cacheSeqLen)
                 {
-                    acc(m, n)(i, j) = mha::numeric_limits<float>::lowest();
+                    acc(m, n)(i, j) = safeInitRowMax;
                     continue;
                 }
                 if (globalRow >= maskStartRow)
                 {
                     uint32_t const maskRow = globalRow - maskStartRow;
                     if ((bit_mask >> maskRow) == 0)
                     {
-                        acc(m, n)(i, j) = mha::numeric_limits<float>::lowest();
+                        acc(m, n)(i, j) = safeInitRowMax;
                     }
                 }
             }
@@ -2087,7 +2087,7 @@ __device__ inline void warpGrpApplyMask(uint32_t warpRank, Gemm0Acc& acc, uint32
 #pragma unroll
                 for (uint32_t j = 0; j < GmmaAccCoreMat::cols; j++)
                 {
-                    acc(m, n)(i, j) = mha::numeric_limits<float>::lowest();
+                    acc(m, n)(i, j) = safeInitRowMax;
                 }
             }
         }
@@ -2380,9 +2380,9 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec,
                 {
                     uint32_t const col = GmmaAccCoreMat::cols * (4 * n + idxInQuad) + j;
                     assert((col < nbValidCols) == bool(endMask & (1ULL << col)));
-                    if (((mask >> col) & 1) == 0)
+                    if ((mask & (1ULL << col)) == 0)
                     {
-                        acc(m, n)(i, j) = mha::numeric_limits<float>::lowest();
+                        acc(m, n)(i, j) = safeInitRowMax;
                     }
                 }
             }
@@ -2410,7 +2410,7 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, uint32_t validColBeg, uin
 #pragma unroll
             for (uint32_t i = 0; i < GmmaAccCoreMat::rows; i++)
             {
-                acc(m, n)(i, j) = mha::numeric_limits<float>::lowest();
+                acc(m, n)(i, j) = safeInitRowMax;
             }
         }
     }
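
Two kinds of changes land in this kernel: masked accumulator entries are now initialized with safeInitRowMax rather than mha::numeric_limits<float>::lowest(), and the column-mask test becomes (mask & (1ULL << col)) == 0. The declared type of mask is not visible in this hunk, so the following is only a hedged illustration of why the rewritten test is the more robust form: right-shifting the mask by col is undefined once col reaches the width of the mask's (promoted) type, while AND-ing against 1ULL << col promotes the mask to 64 bits and stays well defined for any col below 64.

#include <cstdint>
#include <cstdio>

// Returns true when bit `col` of `mask` is clear, without shifting `mask` itself.
// `mask` is promoted to the 64-bit type of `1ULL << col`, so the test is well
// defined for any col in [0, 64), even when `mask` is a 32-bit value.
template <typename Mask>
bool bitClear(Mask mask, uint32_t col)
{
    return (mask & (1ULL << col)) == 0;
}

int main()
{
    uint32_t mask32 = 0xFFFFFFFFu;
    // Bit 40 does not exist in a 32-bit mask, so it reads as clear.
    std::printf("bit 40 clear: %d\n", bitClear(mask32, 40)); // prints 1
    // By contrast, (mask32 >> 40) would be undefined behavior (shift >= type width).
    return 0;
}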

cpp/tensorrt_llm/CMakeLists.txt

Lines changed: 1 addition & 2 deletions
@@ -294,8 +294,7 @@ if(TARGET ${NIXL_WRAPPER_TARGET})
 endif()
 
 if(NOT WIN32)
-  set_target_properties(${SHARED_TARGET} PROPERTIES LINK_FLAGS
-                        "-Wl,-rpath='$ORIGIN'")
+  set_target_properties(${SHARED_TARGET} PROPERTIES BUILD_RPATH "$ORIGIN")
 endif()
 
 if(BUILD_PYT)

cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp

Lines changed: 8 additions & 0 deletions
@@ -822,6 +822,14 @@ void CacheFormatter::unformat(TransferSession& session)
         TLLM_LOG_WARNING("CacheFormatter::inquireSupport: only support non-MLA");
         return false;
     }
+    if (selfConfig.getParallelConfig().mContextParallelism != 1
+        || destConfig.getParallelConfig().mContextParallelism != 1)
+    {
+        TLLM_LOG_WARNING(
+            "CacheFormatter::inquireSupport: context parallelism is not currently supported (selfCP=%d, destCP=%d).",
+            selfConfig.getParallelConfig().mContextParallelism, destConfig.getParallelConfig().mContextParallelism);
+        return false;
+    }
 
     std::unordered_set<int> setVecDest{
         destConfig.getModelConfig().mNbKvHeadsPerLayer.begin(), destConfig.getModelConfig().mNbKvHeadsPerLayer.end()};

cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp

Lines changed: 8 additions & 6 deletions
@@ -558,18 +558,20 @@ void MLACacheFormatter::unformat(TransferSession& session)
         TLLM_LOG_WARNING("MLACacheFormatter::inquireSupport: only support MLA");
         return false;
     }
-
-    if (selfConfig.getAttentionConfig().mKvFactor != destConfig.getAttentionConfig().mKvFactor)
-    {
-        TLLM_LOG_WARNING("MLACacheFormatter::inquireSupport: only support same kv factor");
-        return false;
-    }
     if (selfConfig.getParallelConfig().mEnableAttentionDP
         && (selfConfig.getParallelConfig().mTensorParallelism % selfConfig.getParallelConfig().mDPsize != 0))
     {
         TLLM_LOG_WARNING("MLACacheFormatter::inquireSupport: TP size must be divisible by DP size");
         return false;
     }
+    if (selfConfig.getParallelConfig().mContextParallelism != 1
+        || destConfig.getParallelConfig().mContextParallelism != 1)
+    {
+        TLLM_LOG_WARNING(
+            "MLACacheFormatter::inquireSupport: context parallelism is not currently supported (selfCP=%d, destCP=%d).",
+            selfConfig.getParallelConfig().mContextParallelism, destConfig.getParallelConfig().mContextParallelism);
+        return false;
+    }
     if (destConfig.getParallelConfig().mEnableAttentionDP
         && (destConfig.getParallelConfig().mTensorParallelism % destConfig.getParallelConfig().mDPsize != 0))
     {
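
Both cache-formatter hunks add the same gate: KV-cache transfer support is refused whenever either endpoint runs with context parallelism greater than one. A minimal standalone sketch of that gate, with ToyParallelConfig and a plain fprintf standing in for the real config type and TLLM_LOG_WARNING:

#include <cstdio>

struct ToyParallelConfig
{
    int mContextParallelism;
};

// Mirrors the new check: refuse support unless both sides use CP == 1.
bool inquireSupport(ToyParallelConfig const& selfConfig, ToyParallelConfig const& destConfig)
{
    if (selfConfig.mContextParallelism != 1 || destConfig.mContextParallelism != 1)
    {
        std::fprintf(stderr, "context parallelism is not currently supported (selfCP=%d, destCP=%d)\n",
            selfConfig.mContextParallelism, destConfig.mContextParallelism);
        return false;
    }
    return true;
}

int main()
{
    std::printf("%d\n", inquireSupport({1}, {1})); // 1: supported
    std::printf("%d\n", inquireSupport({2}, {1})); // 0: refused, CP > 1 on one side
    return 0;
}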

cpp/tensorrt_llm/executor/serialization.cpp

Lines changed: 5 additions & 2 deletions
@@ -531,14 +531,15 @@ kv_cache::CacheState Serialization::deserializeCacheState(std::istream& is)
     auto tokensPerBlock = su::deserialize<decltype(CacheState::ModelConfig::mTokensPerBlock)>(is);
     auto tensorParallelism = su::deserialize<decltype(CacheState::ParallelConfig::mTensorParallelism)>(is);
     auto pipelineParallelism = su::deserialize<decltype(CacheState::ParallelConfig::mPipelineParallelism)>(is);
+    auto contextParallelism = su::deserialize<decltype(CacheState::ParallelConfig::mContextParallelism)>(is);
     auto enableAttentionDP = su::deserialize<decltype(CacheState::ParallelConfig::mEnableAttentionDP)>(is);
     auto DPrank = su::deserialize<decltype(CacheState::ParallelConfig::mDPrank)>(is);
     auto DPsize = su::deserialize<decltype(CacheState::ParallelConfig::mDPsize)>(is);
     auto dataType = su::deserialize<decltype(CacheState::mDataType)>(is);
     auto attentionType = su::deserialize<decltype(CacheState::AttentionConfig::mAttentionType)>(is);
     auto kvFactor = su::deserialize<decltype(CacheState::AttentionConfig::mKvFactor)>(is);
-    return CacheState{nbKvHeadsPerLayer, sizePerHead, tokensPerBlock, tensorParallelism, pipelineParallelism, dataType,
-        attentionType, kvFactor, enableAttentionDP, DPrank, DPsize};
+    return CacheState{nbKvHeadsPerLayer, sizePerHead, tokensPerBlock, tensorParallelism, pipelineParallelism,
+        contextParallelism, dataType, attentionType, kvFactor, enableAttentionDP, DPrank, DPsize};
 }
 
 void Serialization::serialize(kv_cache::CacheState const& state, std::ostream& os)
@@ -548,6 +549,7 @@ void Serialization::serialize(kv_cache::CacheState const& state, std::ostream& o
     su::serialize(state.mModelConfig.mTokensPerBlock, os);
     su::serialize(state.mParallelConfig.mTensorParallelism, os);
     su::serialize(state.mParallelConfig.mPipelineParallelism, os);
+    su::serialize(state.mParallelConfig.mContextParallelism, os);
     su::serialize(state.mParallelConfig.mEnableAttentionDP, os);
     su::serialize(state.mParallelConfig.mDPrank, os);
     su::serialize(state.mParallelConfig.mDPsize, os);
@@ -564,6 +566,7 @@ size_t Serialization::serializedSize(kv_cache::CacheState const& state)
     totalSize += su::serializedSize(state.mModelConfig.mTokensPerBlock);
     totalSize += su::serializedSize(state.mParallelConfig.mTensorParallelism);
     totalSize += su::serializedSize(state.mParallelConfig.mPipelineParallelism);
+    totalSize += su::serializedSize(state.mParallelConfig.mContextParallelism);
     totalSize += su::serializedSize(state.mParallelConfig.mEnableAttentionDP);
     totalSize += su::serializedSize(state.mParallelConfig.mDPrank);
     totalSize += su::serializedSize(state.mParallelConfig.mDPsize);
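
The invariant to preserve in this file is that deserializeCacheState, serialize, and serializedSize all handle mContextParallelism in the same position, immediately after mPipelineParallelism. The sketch below shows that write/read-in-the-same-order pattern in a self-contained form; the su::serialize helpers are TensorRT-LLM internals not shown here, so plain stream writes stand in for them:

#include <cstdint>
#include <iostream>
#include <sstream>

// Toy stand-ins for the real serialization utilities.
template <typename T>
void serializeValue(T const& value, std::ostream& os)
{
    os.write(reinterpret_cast<char const*>(&value), sizeof(T));
}

template <typename T>
T deserializeValue(std::istream& is)
{
    T value{};
    is.read(reinterpret_cast<char*>(&value), sizeof(T));
    return value;
}

struct ToyParallelConfig
{
    int32_t tensorParallelism;
    int32_t pipelineParallelism;
    int32_t contextParallelism; // new field: must keep the same slot on both paths
    bool enableAttentionDP;
};

void serialize(ToyParallelConfig const& cfg, std::ostream& os)
{
    serializeValue(cfg.tensorParallelism, os);
    serializeValue(cfg.pipelineParallelism, os);
    serializeValue(cfg.contextParallelism, os); // written right after PP ...
    serializeValue(cfg.enableAttentionDP, os);
}

ToyParallelConfig deserialize(std::istream& is)
{
    ToyParallelConfig cfg{};
    cfg.tensorParallelism = deserializeValue<int32_t>(is);
    cfg.pipelineParallelism = deserializeValue<int32_t>(is);
    cfg.contextParallelism = deserializeValue<int32_t>(is); // ... and read right after PP
    cfg.enableAttentionDP = deserializeValue<bool>(is);
    return cfg;
}

int main()
{
    std::stringstream buffer;
    serialize(ToyParallelConfig{8, 2, 4, true}, buffer);
    auto const cfg = deserialize(buffer);
    std::cout << "tp=" << cfg.tensorParallelism << " pp=" << cfg.pipelineParallelism
              << " cp=" << cfg.contextParallelism << " dp=" << cfg.enableAttentionDP << "\n";
    return 0;
}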
