Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,8 @@ struct KvCacheStats
float cacheHitRate;
// Number of free blocks for every configured attention-window size.
std::map<SizeType32, SizeType32> numFreeBlocksPerWindowSize;
// GPU bytes allocated for KV-cache
std::size_t allocatedBytes{};
};

// Basic building block of a paged KV cache - a single
Expand Down Expand Up @@ -1474,6 +1476,7 @@ class KVCacheManager : public BaseKVCacheManager
: static_cast<float>(kvCacheStats.reusedBlocks)
/ static_cast<float>(kvCacheStats.reusedBlocks + kvCacheStats.missedBlocks);
kvCacheStats.numFreeBlocksPerWindowSize = getNumFreeBlocksPerWindowSize();
kvCacheStats.allocatedBytes = mAllocatedBytes;
return kvCacheStats;
}

Expand Down Expand Up @@ -1677,6 +1680,8 @@ class KVCacheManager : public BaseKVCacheManager
runtime::ITensor::SharedPtr mBlockPoolPointers;
runtime::ITensor::SharedPtr mLayerToPoolMapping;
runtime::ITensor::SharedPtr mBlockScalePoolPointers;
// GPU bytes allocated for KV-cache
std::size_t mAllocatedBytes{0};
};

} // namespace tensorrt_llm::batch_manager::kv_cache_manager
11 changes: 9 additions & 2 deletions cpp/include/tensorrt_llm/executor/executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -1006,7 +1006,8 @@ class KvCacheConfig
std::optional<RetentionPriority> secondaryOffloadMinPriority = std::nullopt, size_t eventBufferMaxSize = 0,
bool enablePartialReuse = true, bool copyOnPartialReuse = true, bool useUvm = false,
SizeType32 attentionDpEventsGatherPeriodMs = 5,
std::optional<tensorrt_llm::runtime::RuntimeDefaults> const& runtimeDefaults = std::nullopt);
std::optional<tensorrt_llm::runtime::RuntimeDefaults> const& runtimeDefaults = std::nullopt,
uint64_t const& maxGpuTotalBytes = 0);

[[nodiscard]] bool getEnableBlockReuse() const;
[[nodiscard]] bool getEnablePartialReuse() const;
Expand All @@ -1022,11 +1023,12 @@ class KvCacheConfig
[[nodiscard]] size_t getEventBufferMaxSize() const;
[[nodiscard]] bool getUseUvm() const;
[[nodiscard]] SizeType32 getAttentionDpEventsGatherPeriodMs() const;
[[nodiscard]] uint64_t getMaxGpuTotalBytes() const;

void setEnableBlockReuse(bool enableBlockReuse);
void setEnablePartialReuse(bool enablePartialReuse);
void setCopyOnPartialReuse(bool copyOnPartialReuse);
void setMaxTokens(SizeType32 maxTokens);
void setMaxTokens(std::optional<SizeType32> maxTokens);
void setMaxAttentionWindowVec(std::vector<SizeType32> maxAttentionWindowVec);
void setSinkTokenLength(SizeType32 sinkTokenLength);
void setFreeGpuMemoryFraction(FloatType freeGpuMemoryFraction);
Expand All @@ -1037,6 +1039,7 @@ class KvCacheConfig
void setEventBufferMaxSize(size_t eventBufferMaxSize);
void setUseUvm(bool useUvm);
void setAttentionDpEventsGatherPeriodMs(SizeType32 attentionDpEventsGatherPeriodMs);
void setMaxGpuTotalBytes(uint64_t maxGpuTotalBytes);

void fillEmptyFieldsFromRuntimeDefaults(tensorrt_llm::runtime::RuntimeDefaults const& runtimeDefaults);

Expand Down Expand Up @@ -1095,6 +1098,10 @@ class KvCacheConfig

/// @brief The period in milliseconds to gather attention DP events across ranks
SizeType32 mAttentionDpEventsGatherPeriodMs;
/// @brief The maximum size in bytes of GPU memory that can be allocated for the KV cache.
    /// If both mMaxGpuTotalBytes and mFreeGpuMemoryFraction are specified, the smaller of the two
    /// resulting sizes is used for the allocation.
uint64_t mMaxGpuTotalBytes;
};

/// @brief Configuration class for the runtime perf knobs
Expand Down
34 changes: 18 additions & 16 deletions cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1681,27 +1681,29 @@ void KVCacheManager::allocatePools(bool useUvm)
mBlockManager.allocatePools(useUvm);
auto const numPools = mBlockManager.getNumPools();

if (tc::Logger::getLogger()->getLevel() <= tc::Logger::INFO)
uint64_t cacheSizeBytes = 0;
for (SizeType32 poolIdx = 0; poolIdx < numPools; poolIdx++)
{
uint64_t cacheSizeBytes = 0;
for (SizeType32 poolIdx = 0; poolIdx < numPools; poolIdx++)
{
auto const cacheShape = mBlockManager.getPrimaryPool(poolIdx)->getShape();
auto const cacheVolume = ITensor::volume(cacheShape);
auto const cacheShape = mBlockManager.getPrimaryPool(poolIdx)->getShape();
auto const cacheVolume = ITensor::volume(cacheShape);
#ifdef ENABLE_FP4
auto const isFp4 = mDataType == nvinfer1::DataType::kFP4;
auto const isFp4 = mDataType == nvinfer1::DataType::kFP4;
#else
auto const isFp4 = false;
auto const isFp4 = false;
#endif
if (!isFp4)
{
cacheSizeBytes += cacheVolume * BufferDataType(mDataType).getSize();
}
else
{
cacheSizeBytes += (cacheVolume * 4) / 8;
}
if (!isFp4)
{
cacheSizeBytes += cacheVolume * BufferDataType(mDataType).getSize();
}
else
{
cacheSizeBytes += (cacheVolume * 4) / 8;
}
}
// Save the total number of bytes allocated for the KV-cache for KvCacheStats
mAllocatedBytes = cacheSizeBytes;
if (tc::Logger::getLogger()->getLevel() <= tc::Logger::INFO)
{

TLLM_LOG_INFO("Number of tokens per block: %d.", mBlockManager.getTokensPerBlock());
auto const maxNumTokens = mBlockManager.getNumPrimaryBlocks() * mBlockManager.getTokensPerBlock();
Expand Down
24 changes: 21 additions & 3 deletions cpp/tensorrt_llm/executor/kvCacheConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ KvCacheConfig::KvCacheConfig(bool enableBlockReuse, std::optional<SizeType32> co
std::optional<FloatType> const& crossKvCacheFraction, std::optional<RetentionPriority> secondaryOffloadMinPriority,
size_t eventBufferMaxSize, bool enablePartialReuse, bool copyOnPartialReuse, bool useUvm,
SizeType32 attentionDpEventsGatherPeriodMs,
std::optional<tensorrt_llm::runtime::RuntimeDefaults> const& runtimeDefaults)
std::optional<tensorrt_llm::runtime::RuntimeDefaults> const& runtimeDefaults, uint64_t const& maxGpuTotalBytes)
: mEnableBlockReuse(enableBlockReuse)
, mHostCacheSize(hostCacheSize)
, mOnboardBlocks(onboardBlocks)
Expand All @@ -38,6 +38,7 @@ KvCacheConfig::KvCacheConfig(bool enableBlockReuse, std::optional<SizeType32> co
, mCopyOnPartialReuse{copyOnPartialReuse}
, mUseUvm{useUvm}
, mAttentionDpEventsGatherPeriodMs(attentionDpEventsGatherPeriodMs)
, mMaxGpuTotalBytes{maxGpuTotalBytes}
{
if (maxTokens)
{
Expand All @@ -63,6 +64,10 @@ KvCacheConfig::KvCacheConfig(bool enableBlockReuse, std::optional<SizeType32> co
{
fillEmptyFieldsFromRuntimeDefaults(runtimeDefaults.value());
}
if (maxGpuTotalBytes)
{
setMaxGpuTotalBytes(maxGpuTotalBytes);
}
TLLM_CHECK_WITH_INFO(
mAttentionDpEventsGatherPeriodMs > 0, "Attention DP events gather period must be greater than 0");
}
Expand Down Expand Up @@ -137,6 +142,11 @@ SizeType32 KvCacheConfig::getAttentionDpEventsGatherPeriodMs() const
return mAttentionDpEventsGatherPeriodMs;
}

/// @brief Returns the configured upper bound, in bytes, of GPU memory for the KV cache.
/// A value of 0 appears to mean "no explicit byte limit" (the constructor default) — confirm with callers.
uint64_t KvCacheConfig::getMaxGpuTotalBytes() const
{
    return mMaxGpuTotalBytes;
}

void KvCacheConfig::setEnableBlockReuse(bool enableBlockReuse)
{
mEnableBlockReuse = enableBlockReuse;
Expand All @@ -152,9 +162,12 @@ void KvCacheConfig::setCopyOnPartialReuse(bool copyOnPartialReuse)
mCopyOnPartialReuse = copyOnPartialReuse;
}

void KvCacheConfig::setMaxTokens(SizeType32 maxTokens)
void KvCacheConfig::setMaxTokens(std::optional<SizeType32> maxTokens)
{
TLLM_CHECK(maxTokens > 0);
if (maxTokens)
{
TLLM_CHECK(maxTokens.value() > 0);
}
mMaxTokens = maxTokens;
}

Expand Down Expand Up @@ -219,6 +232,11 @@ void KvCacheConfig::setAttentionDpEventsGatherPeriodMs(SizeType32 attentionDpEve
mAttentionDpEventsGatherPeriodMs = attentionDpEventsGatherPeriodMs;
}

/// @brief Sets the upper bound, in bytes, of GPU memory that may be allocated for the KV cache.
/// No validation is performed here; 0 is accepted (treated as the unset default elsewhere — verify).
void KvCacheConfig::setMaxGpuTotalBytes(uint64_t maxGpuTotalBytes)
{
    mMaxGpuTotalBytes = maxGpuTotalBytes;
}

void KvCacheConfig::fillEmptyFieldsFromRuntimeDefaults(tensorrt_llm::runtime::RuntimeDefaults const& runtimeDefaults)
{
if (!mMaxAttentionWindowVec && runtimeDefaults.maxAttentionWindowVec)
Expand Down
3 changes: 2 additions & 1 deletion cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,8 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(nb::module_& m)
.def_rw("reused_blocks", &tbk::KvCacheStats::reusedBlocks)
.def_rw("missed_blocks", &tbk::KvCacheStats::missedBlocks)
.def_rw("cache_hit_rate", &tbk::KvCacheStats::cacheHitRate)
.def_rw("num_free_blocks_per_window_size", &tbk::KvCacheStats::numFreeBlocksPerWindowSize);
.def_rw("num_free_blocks_per_window_size", &tbk::KvCacheStats::numFreeBlocksPerWindowSize)
.def_ro("allocated_bytes", &tbk::KvCacheStats::allocatedBytes);

nb::class_<tbk::TempAttentionWindowInputs>(m, "TempAttentionWindowInputs")
.def(nb::init<>())
Expand Down
14 changes: 9 additions & 5 deletions cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "tensorrt_llm/nanobind/common/customCasters.h"
#include "tensorrt_llm/runtime/cudaStream.h"
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
#include <cstdint>
#include <nanobind/nanobind.h>
#include <nanobind/stl/function.h>
#include <nanobind/stl/map.h>
Expand Down Expand Up @@ -111,11 +112,11 @@ void initConfigBindings(nb::module_& m)
self.getSinkTokenLength(), self.getFreeGpuMemoryFraction(), self.getHostCacheSize(),
self.getOnboardBlocks(), self.getCrossKvCacheFraction(), self.getSecondaryOffloadMinPriority(),
self.getEventBufferMaxSize(), self.getEnablePartialReuse(), self.getCopyOnPartialReuse(), self.getUseUvm(),
self.getAttentionDpEventsGatherPeriodMs());
self.getAttentionDpEventsGatherPeriodMs(), self.getMaxGpuTotalBytes());
};
auto kvCacheConfigSetstate = [](tle::KvCacheConfig& self, nb::tuple const& state)
{
if (state.size() != 14)
if (state.size() != 15)
{
throw std::runtime_error("Invalid state!");
}
Expand All @@ -125,20 +126,21 @@ void initConfigBindings(nb::module_& m)
nb::cast<bool>(state[6]), nb::cast<std::optional<float>>(state[7]),
nb::cast<std::optional<tle::RetentionPriority>>(state[8]), nb::cast<size_t>(state[9]),
nb::cast<bool>(state[10]), nb::cast<bool>(state[11]), nb::cast<bool>(state[12]),
nb::cast<SizeType32>(state[13]));
nb::cast<SizeType32>(state[13]), std::nullopt, nb::cast<uint64_t>(state[14]));
};
nb::class_<tle::KvCacheConfig>(m, "KvCacheConfig")
.def(nb::init<bool, std::optional<SizeType32> const&, std::optional<std::vector<SizeType32>> const&,
std::optional<SizeType32> const&, std::optional<float> const&, std::optional<size_t> const&, bool,
std::optional<float> const&, std::optional<tle::RetentionPriority>, size_t const&, bool, bool, bool,
SizeType32, std::optional<RuntimeDefaults> const&>(),
SizeType32, std::optional<RuntimeDefaults> const&, uint64_t const&>(),
nb::arg("enable_block_reuse") = true, nb::arg("max_tokens") = nb::none(),
nb::arg("max_attention_window") = nb::none(), nb::arg("sink_token_length") = nb::none(),
nb::arg("free_gpu_memory_fraction") = nb::none(), nb::arg("host_cache_size") = nb::none(),
nb::arg("onboard_blocks") = true, nb::arg("cross_kv_cache_fraction") = nb::none(),
nb::arg("secondary_offload_min_priority") = nb::none(), nb::arg("event_buffer_max_size") = 0, nb::kw_only(),
nb::arg("enable_partial_reuse") = true, nb::arg("copy_on_partial_reuse") = true, nb::arg("use_uvm") = false,
nb::arg("attention_dp_events_gather_period_ms") = 5, nb::arg("runtime_defaults") = nb::none())
nb::arg("attention_dp_events_gather_period_ms") = 5, nb::arg("runtime_defaults") = nb::none(),
nb::arg("max_gpu_total_bytes") = 0)
.def_prop_rw(
"enable_block_reuse", &tle::KvCacheConfig::getEnableBlockReuse, &tle::KvCacheConfig::setEnableBlockReuse)
.def_prop_rw("max_tokens", &tle::KvCacheConfig::getMaxTokens, &tle::KvCacheConfig::setMaxTokens)
Expand All @@ -163,6 +165,8 @@ void initConfigBindings(nb::module_& m)
.def_prop_rw("use_uvm", &tle::KvCacheConfig::getUseUvm, &tle::KvCacheConfig::setUseUvm)
.def_prop_rw("attention_dp_events_gather_period_ms", &tle::KvCacheConfig::getAttentionDpEventsGatherPeriodMs,
&tle::KvCacheConfig::setAttentionDpEventsGatherPeriodMs)
.def_prop_rw(
"max_gpu_total_bytes", &tle::KvCacheConfig::getMaxGpuTotalBytes, &tle::KvCacheConfig::setMaxGpuTotalBytes)
.def("fill_empty_fields_from_runtime_defaults", &tle::KvCacheConfig::fillEmptyFieldsFromRuntimeDefaults)
.def("__getstate__", kvCacheConfigGetstate)
.def("__setstate__", kvCacheConfigSetstate);
Expand Down
3 changes: 2 additions & 1 deletion cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,8 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(py::module_& m)
.def_readwrite("reused_blocks", &tbk::KvCacheStats::reusedBlocks)
.def_readwrite("missed_blocks", &tbk::KvCacheStats::missedBlocks)
.def_readwrite("cache_hit_rate", &tbk::KvCacheStats::cacheHitRate)
.def_readwrite("num_free_blocks_per_window_size", &tbk::KvCacheStats::numFreeBlocksPerWindowSize);
.def_readwrite("num_free_blocks_per_window_size", &tbk::KvCacheStats::numFreeBlocksPerWindowSize)
.def_readonly("allocated_bytes", &tbk::KvCacheStats::allocatedBytes);

py::class_<tbk::TempAttentionWindowInputs>(m, "TempAttentionWindowInputs")
.def(py::init<>())
Expand Down
13 changes: 8 additions & 5 deletions cpp/tensorrt_llm/pybind/executor/executorConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,11 +104,11 @@ void initConfigBindings(pybind11::module_& m)
self.getSinkTokenLength(), self.getFreeGpuMemoryFraction(), self.getHostCacheSize(),
self.getOnboardBlocks(), self.getCrossKvCacheFraction(), self.getSecondaryOffloadMinPriority(),
self.getEventBufferMaxSize(), self.getEnablePartialReuse(), self.getCopyOnPartialReuse(), self.getUseUvm(),
self.getAttentionDpEventsGatherPeriodMs());
self.getAttentionDpEventsGatherPeriodMs(), self.getMaxGpuTotalBytes());
};
auto kvCacheConfigSetstate = [](py::tuple const& state)
{
if (state.size() != 14)
if (state.size() != 15)
{
throw std::runtime_error("Invalid state!");
}
Expand All @@ -117,20 +117,21 @@ void initConfigBindings(pybind11::module_& m)
state[4].cast<std::optional<float>>(), state[5].cast<std::optional<size_t>>(), state[6].cast<bool>(),
state[7].cast<std::optional<float>>(), state[8].cast<std::optional<tle::RetentionPriority>>(),
state[9].cast<size_t>(), state[10].cast<bool>(), state[11].cast<bool>(), state[12].cast<bool>(),
state[13].cast<SizeType32>());
state[13].cast<SizeType32>(), std::nullopt, state[14].cast<uint64_t>());
};
py::class_<tle::KvCacheConfig>(m, "KvCacheConfig")
.def(py::init<bool, std::optional<SizeType32> const&, std::optional<std::vector<SizeType32>> const&,
std::optional<SizeType32> const&, std::optional<float> const&, std::optional<size_t> const&, bool,
std::optional<float> const&, std::optional<tle::RetentionPriority>, size_t const&, bool, bool, bool,
SizeType32, std::optional<RuntimeDefaults> const&>(),
SizeType32, std::optional<RuntimeDefaults> const&, uint64_t const&>(),
py::arg("enable_block_reuse") = true, py::arg("max_tokens") = py::none(),
py::arg("max_attention_window") = py::none(), py::arg("sink_token_length") = py::none(),
py::arg("free_gpu_memory_fraction") = py::none(), py::arg("host_cache_size") = py::none(),
py::arg("onboard_blocks") = true, py::arg("cross_kv_cache_fraction") = py::none(),
py::arg("secondary_offload_min_priority") = py::none(), py::arg("event_buffer_max_size") = 0, py::kw_only(),
py::arg("enable_partial_reuse") = true, py::arg("copy_on_partial_reuse") = true, py::arg("use_uvm") = false,
py::arg("attention_dp_events_gather_period_ms") = 5, py::arg("runtime_defaults") = py::none())
py::arg("attention_dp_events_gather_period_ms") = 5, py::arg("runtime_defaults") = py::none(),
py::arg("max_gpu_total_bytes") = 0)
.def_property(
"enable_block_reuse", &tle::KvCacheConfig::getEnableBlockReuse, &tle::KvCacheConfig::setEnableBlockReuse)
.def_property("max_tokens", &tle::KvCacheConfig::getMaxTokens, &tle::KvCacheConfig::setMaxTokens)
Expand All @@ -140,6 +141,8 @@ void initConfigBindings(pybind11::module_& m)
"sink_token_length", &tle::KvCacheConfig::getSinkTokenLength, &tle::KvCacheConfig::setSinkTokenLength)
.def_property("free_gpu_memory_fraction", &tle::KvCacheConfig::getFreeGpuMemoryFraction,
&tle::KvCacheConfig::setFreeGpuMemoryFraction)
.def_property(
"max_gpu_total_bytes", &tle::KvCacheConfig::getMaxGpuTotalBytes, &tle::KvCacheConfig::setMaxGpuTotalBytes)
.def_property("host_cache_size", &tle::KvCacheConfig::getHostCacheSize, &tle::KvCacheConfig::setHostCacheSize)
.def_property("onboard_blocks", &tle::KvCacheConfig::getOnboardBlocks, &tle::KvCacheConfig::setOnboardBlocks)
.def_property("cross_kv_cache_fraction", &tle::KvCacheConfig::getCrossKvCacheFraction,
Expand Down
Loading