@@ -2450,12 +2450,40 @@ BlocksPerWindow BaseKVCacheManager::calculateMaxNumBlocks(executor::KvCacheConfi
24502450 };
24512451
24522452 std::map<SizeType32, float > windowSizeToShare;
2453- // NOTE: Righteously, blocks allocated should be proportional with
2454- // regard to window size. Currently, we are first allocating identical
2455- // number of blocks for all layers to achieve identical performance.
2456- for (auto const & [windowSize, _] : windowSizeToLayers)
2453+ if (auto envStr = std::getenv (" TRTLLM_WINDOW_SIZE_SHARES" ))
24572454 {
2458- windowSizeToShare[windowSize] = 1 .0f / windowSizeToLayers.size ();
2455+ float const fraction = windowSizeSum / windowSizesTotalSum;
2456+ TLLM_CHECK (0 .0f < fraction && fraction <= 1 .0f );
2457+ windowSizeToShare[windowSize] = fraction;
2458+ std::stringstream ss (envStr);
2459+ std::vector<float > shares;
2460+ float share;
2461+ while (ss >> share)
2462+ {
2463+ shares.push_back (share);
2464+ if (ss.peek () == ' ,' )
2465+ ss.ignore ();
2466+ }
2467+
2468+ TLLM_CHECK_WITH_INFO (shares.size () == windowSizeToLayers.size (),
2469+ " Number of shares in TRTLLM_WINDOW_SIZE_SHARES (%ld) must match number of window sizes (%ld)" ,
2470+ shares.size (), windowSizeToLayers.size ());
2471+
2472+ size_t i = 0 ;
2473+ for (auto const & [windowSize, _] : windowSizeToLayers)
2474+ {
2475+ windowSizeToShare[windowSize] = shares[i++];
2476+ }
2477+ }
2478+ else
2479+ {
2480+ // NOTE: Righteously, blocks allocated should be proportional with
2481+ // regard to window size. Currently, we are first allocating identical
2482+ // number of blocks for all layers to achieve identical performance.
2483+ for (auto const & [windowSize, _] : windowSizeToLayers)
2484+ {
2485+ windowSizeToShare[windowSize] = 1 .0f / windowSizeToLayers.size ();
2486+ }
24592487 }
24602488
24612489 std::vector<SizeType32> blocksPrimary;
0 commit comments