Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 4 additions & 7 deletions vllm/config/cache.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -19,7 +19,6 @@

logger = init_logger(__name__)

BlockSize = Literal[1, 8, 16, 32, 64, 128, 256]
CacheDType = Literal[
"auto",
"bfloat16",
Expand All @@ -39,13 +38,11 @@
class CacheConfig:
"""Configuration for the KV cache."""

block_size: SkipValidation[BlockSize] = None # type: ignore[assignment]
"""Size of a contiguous cache block in number of tokens. On CUDA devices,
only block sizes up to 32 are supported.
block_size: SkipValidation[int] = None # type: ignore[assignment]
"""Size of a contiguous cache block in number of tokens.

This config has no static default. If left unspecified by the user, it will
be set in `Platform.check_and_update_config()` based on the current
platform."""
This is None until `Platform.check_and_update_config()` sets it based on
the current platform. Always an int by the time the engine starts."""
gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)
"""The fraction of GPU memory to be used for the model executor, which can
range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
Expand Down
3 changes: 1 addition & 2 deletions vllm/engine/arg_utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -59,7 +59,6 @@
get_attr_docs,
)
from vllm.config.cache import (
BlockSize,
CacheDType,
KVOffloadingBackend,
MambaCacheMode,
Expand Down Expand Up @@ -431,7 +430,7 @@ class EngineArgs:
max_parallel_loading_workers: int | None = (
ParallelConfig.max_parallel_loading_workers
)
block_size: BlockSize = CacheConfig.block_size
block_size: int = None # type: ignore[assignment]
enable_prefix_caching: bool | None = None
prefix_caching_hash_algo: PrefixCachingHashAlgo = (
CacheConfig.prefix_caching_hash_algo
Expand Down
Loading