Skip to content

Commit 70feb7c

Browse files
committed
[kv cache manager] Expose knob TRTLLM_WINDOW_SIZE_SHARES to adjust the share of memory allocated to each window size
Usage example: export TRTLLM_WINDOW_SIZE_SHARES=0.4,0.6 Signed-off-by: eopXD <[email protected]>
1 parent 259cc66 commit 70feb7c

File tree

1 file changed

+48
-5
lines changed

1 file changed

+48
-5
lines changed

cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp

Lines changed: 48 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2450,12 +2450,55 @@ BlocksPerWindow BaseKVCacheManager::calculateMaxNumBlocks(executor::KvCacheConfi
24502450
};
24512451

24522452
std::map<SizeType32, float> windowSizeToShare;
2453-
// NOTE: Righteously, blocks allocated should be proportional with
2454-
// regard to window size. Currently, we are first allocating identical
2455-
// number of blocks for all layers to achieve identical performance.
2456-
for (auto const& [windowSize, _] : windowSizeToLayers)
2453+
// By default, we allocate equal proportion shares of memory for all
2454+
// window sizes (see the else case). With TRTLLM_WINDOW_SIZE_SHARES,
2455+
// we can override this behavior to adjust the memory share of each
2456+
// window size. For example, if we have window size of [512, 32768],
2457+
// then setting TRTLLM_WINDOW_SIZE_SHARES=0.4,0.6 will allocate
2458+
// 40% of the memory to window size 512 and 60% of the memory to window
2459+
// size 32768.
2460+
if (auto envStr = std::getenv("TRTLLM_WINDOW_SIZE_SHARES"))
2461+
{
2462+
std::stringstream ss(envStr);
2463+
std::vector<float> shares;
2464+
float share;
2465+
while (ss >> share)
2466+
{
2467+
shares.push_back(share);
2468+
if (ss.peek() == ',')
2469+
ss.ignore();
2470+
}
2471+
2472+
TLLM_CHECK_WITH_INFO(shares.size() == windowSizeToLayers.size(),
2473+
"Number of shares in TRTLLM_WINDOW_SIZE_SHARES (%ld) must match number of window sizes (%ld)",
2474+
shares.size(), windowSizeToLayers.size());
2475+
float sumShares = 0.0f;
2476+
for (auto s : shares)
2477+
{
2478+
TLLM_CHECK_WITH_INFO(0.0f <= s && s <= 1.0f, "Shares must be in value range [0,1], got %f", s);
2479+
sumShares += s;
2480+
}
2481+
TLLM_CHECK_WITH_INFO(sumShares > 0.0f, "Sum of shares must be > 0.");
2482+
// Normalize shares to 1.0
2483+
for (auto& s : shares)
2484+
{
2485+
s /= sumShares;
2486+
}
2487+
size_t i = 0;
2488+
for (auto const& [windowSize, _] : windowSizeToLayers)
2489+
{
2490+
windowSizeToShare[windowSize] = shares[i++];
2491+
}
2492+
}
2493+
else
24572494
{
2458-
windowSizeToShare[windowSize] = 1.0f / windowSizeToLayers.size();
2495+
// NOTE: Ideally, blocks allocated should be proportional with
2496+
// regard to window size. Currently, we are first allocating an identical
2497+
// number of blocks for all layers to achieve identical performance.
2498+
for (auto const& [windowSize, _] : windowSizeToLayers)
2499+
{
2500+
windowSizeToShare[windowSize] = 1.0f / windowSizeToLayers.size();
2501+
}
24592502
}
24602503

24612504
std::vector<SizeType32> blocksPrimary;

0 commit comments

Comments
 (0)