diff --git a/.buildkite/vllm_lkg.version b/.buildkite/vllm_lkg.version
index de04a3da0e..caf74571f7 100644
--- a/.buildkite/vllm_lkg.version
+++ b/.buildkite/vllm_lkg.version
@@ -1 +1 @@
-a3299c3d1d6c260c35a866599bdf4d3e7b7d84dd
+c3598d02fa638119ae4ac933850dbcd3d629fa1c
diff --git a/tpu_inference/runner/kv_cache_manager.py b/tpu_inference/runner/kv_cache_manager.py
index b0941a6f3d..7e094f1029 100644
--- a/tpu_inference/runner/kv_cache_manager.py
+++ b/tpu_inference/runner/kv_cache_manager.py
@@ -240,7 +240,7 @@ def maybe_reinitialize_input_batch(self,
             for kv_cache_group in kv_cache_config.kv_cache_groups
         ]
         if block_sizes != [self.runner.cache_config.block_size]:
-            assert self.runner.cache_config.cpu_offload_gb == 0, (
+            assert self.runner.vllm_config.offload_config.uva.cpu_offload_gb == 0, (
                 "Cannot re-initialize the input batch when CPU weight "
                 "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
                 "for more details.")