
Commit e1e212e

Merge branch 'NVIDIA:main' into mnnvl_partial
2 parents 31999d0 + aae5d22

File tree

187 files changed, +7055 −1941 lines


.coderabbit.yaml

Lines changed: 3 additions & 1 deletion
@@ -29,8 +29,10 @@ reviews:
   suggested_labels: true
   suggested_reviewers: true
   poem: false
+  review_status: false
   auto_review:
-    drafts: true
+    auto_incremental_review: false
+    drafts: false
     base_branches: ["main", "release/.+"]
 knowledge_base:
   code_guidelines:

.dockerignore

Lines changed: 1 addition & 0 deletions
@@ -9,5 +9,6 @@ examples/**/.git
 examples/**/*.bin
 examples/**/*.engine
 examples/**/*.onnx
+examples/**/*.safetensors
 examples/**/c-model
 examples/models/core/gpt/gpt*

.github/CODEOWNERS

Lines changed: 6 additions & 5 deletions
@@ -1,10 +1,5 @@
 # This file defines code ownership rules for the repository.
 
-# The following rule should only be uncommented on release branches (e.g., release/0.19).
-# The rule below requires that any PR to release/**/* branches must be approved by at least one member
-# of the NVIDIA/trt-llm-release-branch-approval team, regardless of who else approves the PR.
-# Without approval from a member of this team, PRs cannot be merged to release branches.
-# * @NVIDIA/trt-llm-release-branch-approval
 
 ## TensorRT-LLM Infra
 ### CI
@@ -160,3 +155,9 @@ docs/source/performance/perf-benchmarking.md @NVIDIA/trtllm-bench-reviewers
 # from a member of this team, PRs affecting public APIs cannot be merged to main or release branches.
 /tests/unittest/api_stability/ @NVIDIA/trt-llm-noncommitted-api-review-committee
 /tests/unittest/api_stability/references_committed/ @NVIDIA/trt-llm-committed-api-review-committee
+
+# The following rule should only be uncommented on release branches (e.g., release/0.19).
+# The rule below requires that any PR to release/**/* branches must be approved by at least one member
+# of the NVIDIA/trt-llm-release-branch-approval team, regardless of who else approves the PR.
+# Without approval from a member of this team, PRs cannot be merged to release branches.
+# * @NVIDIA/trt-llm-release-branch-approval

README.md

Lines changed: 4 additions & 1 deletion
@@ -9,7 +9,7 @@ TensorRT-LLM
 [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-12.9.1-green)](https://developer.nvidia.com/cuda-downloads)
 [![trt](https://img.shields.io/badge/TRT-10.11.0-green)](https://developer.nvidia.com/tensorrt)
-[![version](https://img.shields.io/badge/release-1.1.0rc2-green)](./tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-1.1.0rc3-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
 
 [Architecture](./docs/source/torch/arch_overview.md)   |   [Performance](./docs/source/performance/perf-overview.md)   |   [Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)   |   [Documentation](./docs/source/)   |   [Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)
@@ -18,6 +18,9 @@ TensorRT-LLM
 <div align="left">
 
 ## Tech Blogs
+* [08/29] ADP Balance Strategy
+[➡️ link](./docs/source/blogs/tech_blog/blog10_ADP_Balance_Strategy.md)
+
 * [08/05] Running a High-Performance GPT-OSS-120B Inference Server with TensorRT-LLM
 [➡️ link](./docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md)

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "tensorrt_llm/batch_manager/common.h"
+#include "tensorrt_llm/batch_manager/llmRequest.h"
+#include "tensorrt_llm/runtime/common.h"
+
+#include <utility>
+#include <vector>
+
+using SizeType32 = tensorrt_llm::runtime::SizeType32;
+using RequestIdType = tensorrt_llm::batch_manager::LlmRequest::RequestIdType;
+
+/// See tensorrt_llm/_torch/pyexecutor/connector.py for details on the Connector API.
+
+namespace tensorrt_llm::batch_manager::kv_connector
+{
+
+/// @brief The KV connector manager. This is passed into the C++ KV Cache Manager when adding sequences.
+class KvCacheConnectorManager
+{
+public:
+    KvCacheConnectorManager() = default;
+    virtual ~KvCacheConnectorManager() = default;
+
+    /// @brief Handle the getNumNewMatchedTokens call inside the C++ KV Cache Manager.
+    /// @return The number of tokens that can be loaded from remote KV cache.
+    virtual SizeType32 getNumNewMatchedTokens(LlmRequest const& request, SizeType32 numComputedTokens) = 0;
+};
+
+} // namespace tensorrt_llm::batch_manager::kv_connector
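
This new connector header (apparently cpp/include/tensorrt_llm/batch_manager/kvCacheConnector.h, judging by the include added to kvCacheManager.h below) exposes a single pure-virtual hook, getNumNewMatchedTokens, which the C++ KV cache manager calls when a sequence is added. For orientation only, a concrete implementation might look like the following minimal sketch; the class name and the fixed return value are hypothetical and not part of this commit:

// Hypothetical sketch (not from this commit): a connector manager that reports a
// caller-supplied number of tokens as loadable from a remote KV cache.
#include "tensorrt_llm/batch_manager/kvCacheConnector.h"

namespace tensorrt_llm::batch_manager::kv_connector
{

class FixedCountConnectorManager final : public KvCacheConnectorManager
{
public:
    explicit FixedCountConnectorManager(SizeType32 numMatchedTokens)
        : mNumMatchedTokens(numMatchedTokens)
    {
    }

    // Called by the C++ KV cache manager when a sequence is added; the return value
    // tells it how many tokens can be loaded from the remote KV cache.
    SizeType32 getNumNewMatchedTokens(LlmRequest const& /*request*/, SizeType32 /*numComputedTokens*/) override
    {
        return mNumMatchedTokens;
    }

private:
    SizeType32 mNumMatchedTokens;
};

} // namespace tensorrt_llm::batch_manager::kv_connector

A real implementation would presumably inspect the request and consult the Python-side connector described in tensorrt_llm/_torch/pyexecutor/connector.py rather than return a fixed count.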

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 29 additions & 10 deletions
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include "tensorrt_llm/batch_manager/kvCacheConnector.h"
 #include "tensorrt_llm/batch_manager/kvCacheEventManager.h"
 #include "tensorrt_llm/batch_manager/kvCacheType.h"
 #include "tensorrt_llm/batch_manager/llmRequest.h" // TODO forward declare
@@ -479,7 +480,6 @@ class KVCacheBlockPool
     SizeType32 numKvHeads;
     SizeType32 sizePerHead;
     SizeType32 tokensPerBlock;
-    SizeType32 quantSize;
     SizeType32 blockSize;
 
     // Memory pools. Primary is fast memory, secondary is slower memory used for offloading.
@@ -490,15 +490,14 @@
     bool containsBlockScales;
 
     KVCacheBlockPool(SizeType32 numLayers, SizeType32 kvFactor, SizeType32 numKvHeads, SizeType32 sizePerHead,
-        SizeType32 tokensPerBlock, SizeType32 quantSize, runtime::ITensor::SharedPtr primaryPtr = nullptr,
+        SizeType32 tokensPerBlock, runtime::ITensor::SharedPtr primaryPtr = nullptr,
         runtime::ITensor::SharedPtr secondaryPtr = nullptr, bool containsBlockScales = false)
         : numLayers(numLayers)
         , kvFactor(kvFactor)
         , numKvHeads(numKvHeads)
         , sizePerHead(sizePerHead)
         , tokensPerBlock(tokensPerBlock)
-        , quantSize(quantSize)
-        , blockSize((numKvHeads * sizePerHead * tokensPerBlock) / quantSize)
+        , blockSize(numKvHeads * sizePerHead * tokensPerBlock)
         , primaryPtr(std::move(primaryPtr))
         , secondaryPtr(std::move(secondaryPtr))
         , containsBlockScales(containsBlockScales)
@@ -538,7 +537,8 @@ class WindowBlockManager
         SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool,
         SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream,
         bool onboardBlocks, CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
-        std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse);
+        std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse,
+        std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager);
 
     ~WindowBlockManager();
 
@@ -646,6 +646,15 @@
         return mPools.at(poolIdx).blockSize;
     }
 
+    [[nodiscard]] SizeType32 getNumEltsPerContainer() const
+    {
+#ifdef ENABLE_FP4
+        return mDataType == nvinfer1::DataType::kFP4 ? 2 : 1;
+#else
+        return 1;
+#endif
+    }
+
     [[nodiscard]] SizeType32 getNumPools(bool includeBlockScalePools = true) const noexcept
     {
         if (includeBlockScalePools)
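
The KVCacheBlockPool hunks above drop the quantSize divisor, so blockSize is now expressed in elements, and the packing factor is queried separately via the new getNumEltsPerContainer(), which returns 2 when the data type is FP4 (presumably two 4-bit values per storage container) and 1 otherwise. A standalone sketch of the resulting arithmetic, with made-up sizes purely for illustration:

// Illustrative arithmetic only; the sizes below are invented, not taken from this commit.
#include <cstdint>
#include <iostream>

int main()
{
    std::int32_t const numKvHeads = 8;
    std::int32_t const sizePerHead = 128;
    std::int32_t const tokensPerBlock = 32;

    // blockSize as computed by the updated KVCacheBlockPool constructor: an element count.
    std::int32_t const blockSizeElts = numKvHeads * sizePerHead * tokensPerBlock; // 32768

    // With FP4, getNumEltsPerContainer() reports 2 elements per storage container.
    std::int32_t const eltsPerContainer = 2;
    std::int32_t const blockSizeContainers = blockSizeElts / eltsPerContainer; // 16384

    std::cout << blockSizeElts << " elements per block -> " << blockSizeContainers << " containers\n";
    return 0;
}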
@@ -835,6 +844,8 @@
     bool mEnablePartialReuse;
     // Whether partially matched blocks that are already in use should be copied and reused.
     bool mCopyOnPartialReuse;
+    // The kv cache connector manager
+    std::shared_ptr<kv_connector::KvCacheConnectorManager> mKvCacheConnectorManager;
 };
 
 class BlockManager
@@ -852,7 +863,8 @@ class BlockManager
         SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
         std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
-        bool copyOnPartialReuse = true);
+        bool copyOnPartialReuse = true,
+        std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager = nullptr);
 
     BlockManager(BlockManager const&) = delete;
     BlockManager& operator=(BlockManager const&) = delete;
@@ -1238,6 +1250,8 @@ class BaseKVCacheManager
 
     [[nodiscard]] virtual runtime::ITensor::SharedPtr getBlockPoolPointers() const = 0;
 
+    [[nodiscard]] virtual runtime::ITensor::SharedPtr getBlockScalePoolPointers() const = 0;
+
     [[nodiscard]] virtual runtime::ITensor::SharedPtr getLayerToPoolMapping() const = 0;
 
     virtual void getBlockOffsetsOfBatch(
@@ -1287,6 +1301,7 @@
         LlmRequest::RequestIdType requestId, SizeType32 windowSize) const
         = 0;
 
+    [[nodiscard]] virtual runtime::ITensor::SharedPtr getUniquePrimaryPool() const = 0;
     [[nodiscard]] virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0;
     [[nodiscard]] virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0;
 
@@ -1373,7 +1388,8 @@ class KVCacheManager : public BaseKVCacheManager
         bool enableBlockReuse = false, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
         std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
-        bool copyOnpartialReuse = true);
+        bool copyOnpartialReuse = true,
+        std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager = nullptr);
 
     KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
         BlocksPerWindow const& blocksPerWindow, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
@@ -1383,7 +1399,8 @@ class KVCacheManager : public BaseKVCacheManager
         bool enableBlockReuse = false, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
         std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
-        bool copyOnpartialReuse = true);
+        bool copyOnpartialReuse = true,
+        std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager = nullptr);
 
     KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
         BlocksPerWindow const& blocksPerWindow, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
@@ -1393,7 +1410,8 @@ class KVCacheManager : public BaseKVCacheManager
         bool enableBlockReuse = true, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
         std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
-        bool copyOnpartialReuse = true);
+        bool copyOnpartialReuse = true,
+        std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager = nullptr);
 
     KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
         BlocksPerWindow const& blocksPerWindow, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
@@ -1543,7 +1561,7 @@ class KVCacheManager : public BaseKVCacheManager
         return mLayerToPoolMapping;
     }
 
-    [[nodiscard]] runtime::ITensor::SharedPtr getBlockScalePoolPointers() const
+    [[nodiscard]] runtime::ITensor::SharedPtr getBlockScalePoolPointers() const override
     {
         // TODO: add a new optional model input so the attention plugin can access these
         return mBlockScalePoolPointers;
@@ -1624,6 +1642,7 @@ class KVCacheManager : public BaseKVCacheManager
     std::vector<SizeType32> getNewlyAllocatedBlockIds(
         LlmRequest::RequestIdType requestId, SizeType32 windowSize) const override;
 
+    runtime::ITensor::SharedPtr getUniquePrimaryPool() const override;
    runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const override;
 
     SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const override
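
Throughout the hunks above, the connector manager is threaded into WindowBlockManager, BlockManager, and the KVCacheManager constructors as an optional shared_ptr that defaults to nullptr. The following is a minimal sketch of how such an optional hook might be consulted by a caller; the helper function and its name are hypothetical, and only the KvCacheConnectorManager interface comes from this commit:

// Hypothetical helper (not from this commit) showing null-guarded use of the
// optional connector manager declared in kvCacheConnector.h.
#include <memory>

#include "tensorrt_llm/batch_manager/kvCacheConnector.h"

namespace
{

using tensorrt_llm::batch_manager::LlmRequest;
using tensorrt_llm::batch_manager::kv_connector::KvCacheConnectorManager;

SizeType32 numTokensLoadableFromConnector(std::shared_ptr<KvCacheConnectorManager> const& connectorManager,
    LlmRequest const& request, SizeType32 numComputedTokens)
{
    if (!connectorManager)
    {
        // The constructors default the connector manager to nullptr, so the common case is
        // "no connector configured" and nothing can be loaded from a remote KV cache.
        return 0;
    }
    return connectorManager->getNumNewMatchedTokens(request, numComputedTokens);
}

} // anonymous namespace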

cpp/include/tensorrt_llm/deep_gemm/scheduler.cuh

Lines changed: 1 addition & 1 deletion
@@ -379,7 +379,7 @@ struct GroupedMaskedScheduler
     }
 };
 
-// Need to keep the same as the one in tests/unittest/_torch/thop/deep_gemm_tests.py
+// Need to keep the same as the one in tests/unittest/_torch/thop/parallel/deep_gemm_tests.py
 template <typename T_offset, typename T_index>
 __host__ __device__ __forceinline__ T_offset compute_padded_offset(T_offset offset, T_index problem_idx)
 {
