@@ -2450,12 +2450,55 @@ BlocksPerWindow BaseKVCacheManager::calculateMaxNumBlocks(executor::KvCacheConfi
24502450 };
24512451
24522452 std::map<SizeType32, float > windowSizeToShare;
2453- // NOTE: Righteously, blocks allocated should be proportional with
2454- // regard to window size. Currently, we are first allocating identical
2455- // number of blocks for all layers to achieve identical performance.
2456- for (auto const & [windowSize, _] : windowSizeToLayers)
2453+ // By default, every window size receives an equal share of memory
2454+ // (see the else case). Setting TRTLLM_WINDOW_SIZE_SHARES overrides
2455+ // this with comma-separated per-window-size shares, which are
2456+ // normalized to sum to 1. For example, with window sizes
2457+ // [512, 32768], TRTLLM_WINDOW_SIZE_SHARES=0.4,0.6 allocates 40% of
2458+ // the memory to window size 512 and 60% of the memory to window
2459+ // size 32768.
2460+ if (auto envStr = std::getenv (" TRTLLM_WINDOW_SIZE_SHARES" ))
2461+ {
2462+ std::stringstream ss (envStr);
2463+ std::vector<float > shares;
2464+ float share;
2465+ while (ss >> share)
2466+ {
2467+ shares.push_back (share);
2468+ if (ss.peek () == ' ,' )
2469+ ss.ignore ();
2470+ }
2471+
2472+ TLLM_CHECK_WITH_INFO (shares.size () == windowSizeToLayers.size (),
2473+ " Number of shares in TRTLLM_WINDOW_SIZE_SHARES (%ld) must match number of window sizes (%ld)" ,
2474+ shares.size (), windowSizeToLayers.size ());
2475+ float sumShares = 0 .0f ;
2476+ for (auto s : shares)
2477+ {
2478+ TLLM_CHECK_WITH_INFO (0 .0f <= s && s <= 1 .0f , " Shares must be in value range [0,1], got %f" , s);
2479+ sumShares += s;
2480+ }
2481+ TLLM_CHECK_WITH_INFO (sumShares > 0 .0f , " Sum of shares must be > 0." );
2482+ // Normalize shares to 1.0
2483+ for (auto & s : shares)
2484+ {
2485+ s /= sumShares;
2486+ }
2487+ size_t i = 0 ;
2488+ for (auto const & [windowSize, _] : windowSizeToLayers)
2489+ {
2490+ windowSizeToShare[windowSize] = shares[i++];
2491+ }
2492+ }
2493+ else
24572494 {
2458- windowSizeToShare[windowSize] = 1 .0f / windowSizeToLayers.size ();
2495+ // NOTE: Ideally, the blocks allocated should be proportional to the
2496+ // window size. Currently, we first allocate an identical number of
2497+ // blocks for all layers to achieve identical performance.
2498+ for (auto const & [windowSize, _] : windowSizeToLayers)
2499+ {
2500+ windowSizeToShare[windowSize] = 1 .0f / windowSizeToLayers.size ();
2501+ }
24592502 }
24602503
24612504 std::vector<SizeType32> blocksPrimary;
0 commit comments