diff --git a/.buildkite/performance-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md index 289877e504bb..3a321c0fefdf 100644 --- a/.buildkite/performance-benchmarks/README.md +++ b/.buildkite/performance-benchmarks/README.md @@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co "server_parameters": { "model": "meta-llama/Meta-Llama-3-8B", "tensor_parallel_size": 1, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy" }, diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json index a2e42aa16fd3..3929aa5fbbe0 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json @@ -10,7 +10,6 @@ "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy", "max-model-len": 2048, @@ -37,7 +36,6 @@ "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "tensor_parallel_size": 4, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy", "max-model-len": 2048, @@ -64,7 +62,6 @@ "server_parameters": { "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", "tensor_parallel_size": 2, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy", "max-model-len": 2048, @@ -91,7 +88,6 @@ "server_parameters": { "model": "deepseek-ai/DeepSeek-R1", "tensor_parallel_size": 8, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy", "max-model-len": 2048, diff --git a/.buildkite/performance-benchmarks/tests/serving-tests.json b/.buildkite/performance-benchmarks/tests/serving-tests.json index a6d4141d5c2d..66d52abc1206 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests.json @@ -5,7 +5,6 @@ "server_parameters": { "model": 
"meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy" }, @@ -23,7 +22,6 @@ "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "tensor_parallel_size": 4, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy" }, @@ -41,7 +39,6 @@ "server_parameters": { "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", "tensor_parallel_size": 2, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy" }, @@ -59,7 +56,6 @@ "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "tensor_parallel_size": 4, - "swap_space": 16, "speculative_config": { "model": "turboderp/Qwama-0.5B-Instruct", "num_speculative_tokens": 4, diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index 867f55fa9ef7..110f580fb7bd 100644 --- a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -145,7 +145,6 @@ def create_minimal_vllm_config( cache_config = CacheConfig( block_size=block_size, gpu_memory_utilization=0.9, - swap_space=0, cache_dtype="auto", enable_prefix_caching=False, ) diff --git a/benchmarks/attention_benchmarks/runner.py b/benchmarks/attention_benchmarks/runner.py index 9744b857d96b..7f968cfec148 100644 --- a/benchmarks/attention_benchmarks/runner.py +++ b/benchmarks/attention_benchmarks/runner.py @@ -141,7 +141,6 @@ def _create_vllm_config( cache_config = CacheConfig( block_size=config.block_size, cache_dtype="auto", - swap_space=0, ) cache_config.num_gpu_blocks = max_num_blocks cache_config.num_cpu_blocks = 0 diff --git a/docs/design/metrics.md b/docs/design/metrics.md index a977ce9b9bb2..b24ff64b6783 100644 --- a/docs/design/metrics.md +++ b/docs/design/metrics.md @@ -507,10 +507,10 @@ longer relevant in v1: - `vllm:num_requests_swapped` - `vllm:cpu_cache_usage_perc` -In this mode, when a request is preempted (e.g. 
to make room in KV -cache to complete other requests), we swap kv cache blocks out to CPU -memory. This is also known as "KV cache offloading" and is configured -with `--swap-space` and `--preemption-mode`. +In this mode, when a request was preempted (e.g. to make room in KV +cache to complete other requests), kv cache blocks were swapped out to +CPU memory. The `--swap-space` flag has been removed as this feature +is no longer used in V1. Historically, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The SequenceGroup encapsulated the idea of N Sequences which diff --git a/docs/serving/integrations/llamaindex.md b/docs/serving/integrations/llamaindex.md index 4b838cbcaa9d..3d669f169e01 100644 --- a/docs/serving/integrations/llamaindex.md +++ b/docs/serving/integrations/llamaindex.md @@ -17,7 +17,7 @@ llm = Vllm( model="microsoft/Orca-2-7b", tensor_parallel_size=4, max_new_tokens=100, - vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5}, + vllm_kwargs={"gpu_memory_utilization": 0.5}, ) ``` diff --git a/tests/conftest.py b/tests/conftest.py index 1e9d46d3c169..4b907b7dd760 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -794,7 +794,6 @@ def __init__( tensor_parallel_size: int = 1, block_size: int = 16 if not torch.xpu.is_available() else 64, enable_chunked_prefill: bool | None = False, - swap_space: int = 4, enforce_eager: bool | None = False, # Set this to avoid hanging issue default_torch_num_threads: int | None = None, @@ -831,7 +830,6 @@ def __init__( trust_remote_code=trust_remote_code, dtype=dtype, seed=seed, - swap_space=swap_space, enforce_eager=enforce_eager, disable_log_stats=disable_log_stats, tensor_parallel_size=tensor_parallel_size, diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py index f415409d7b37..8c9898ca20f3 100644 --- a/tests/distributed/test_torchrun_example.py +++ b/tests/distributed/test_torchrun_example.py @@ -22,7 +22,7 @@ 
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -# set different `gpu_memory_utilization` and `swap_space` for different ranks, +# set different `gpu_memory_utilization` for different ranks, # to test if all ranks agree on the same kv cache configuration. llm = LLM( model="facebook/opt-125m", @@ -30,7 +30,6 @@ pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)), distributed_executor_backend="external_launcher", gpu_memory_utilization=random.uniform(0.7, 0.9), - swap_space=random.randint(1, 4), seed=0, ) diff --git a/tests/distributed/test_torchrun_example_moe.py b/tests/distributed/test_torchrun_example_moe.py index 1aa7f1793570..a6298d1b6739 100644 --- a/tests/distributed/test_torchrun_example_moe.py +++ b/tests/distributed/test_torchrun_example_moe.py @@ -28,7 +28,7 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -# set different `gpu_memory_utilization` and `swap_space` for different ranks, +# set different `gpu_memory_utilization` for different ranks, # to test if all ranks agree on the same kv cache configuration. 
llm = LLM( model="microsoft/Phi-mini-MoE-instruct", @@ -37,7 +37,6 @@ enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1, distributed_executor_backend="external_launcher", gpu_memory_utilization=random.uniform(0.7, 0.9), - swap_space=random.randint(1, 4), seed=0, ) diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 274142e8d66e..4af3ccf893ff 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -64,7 +64,6 @@ def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]): device_config=DeviceConfig("cuda"), cache_config=CacheConfig( block_size=16, - swap_space=0, cache_dtype="auto", ), lora_config=LoRAConfig( diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index 3cff52929146..91decf6658a5 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -182,7 +182,6 @@ def create_vllm_config( cache_config = CacheConfig( block_size=block_size, cache_dtype="auto", - swap_space=0, ) # Set cache blocks for testing # (these may be set during initialization normally) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 24edfadb9b53..bbeca6ef7dba 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1776,7 +1776,6 @@ def create_scheduler_with_priority( cache_config = CacheConfig( block_size=block_size, gpu_memory_utilization=0.9, - swap_space=0, cache_dtype="auto", enable_prefix_caching=enable_prefix_caching, ) @@ -3726,7 +3725,6 @@ def _create_encoder_decoder_scheduler( cache_config = CacheConfig( block_size=block_size, gpu_memory_utilization=0.9, - swap_space=0, cache_dtype="auto", enable_prefix_caching=False, ) diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index 90c174adf8c8..92122bcb0ba4 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -94,7 +94,6 @@ def create_scheduler( cache_config = CacheConfig( block_size=block_size, gpu_memory_utilization=0.9, - swap_space=0, 
cache_dtype="auto", enable_prefix_caching=enable_prefix_caching, ) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 8d7377c286ac..ae674919ae91 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -506,7 +506,6 @@ def test_encoder_instance_zero_kv_cache( cache_config = CacheConfig( block_size=16, gpu_memory_utilization=gpu_memory_utilization, - swap_space=0, cache_dtype="auto", enable_prefix_caching=enable_prefix_caching, ) diff --git a/tests/v1/kv_connector/unit/test_moriio_connector.py b/tests/v1/kv_connector/unit/test_moriio_connector.py index 7aa824609b7e..2ee224013131 100644 --- a/tests/v1/kv_connector/unit/test_moriio_connector.py +++ b/tests/v1/kv_connector/unit/test_moriio_connector.py @@ -206,7 +206,6 @@ def create_vllm_config( cache_config = CacheConfig( block_size=block_size, gpu_memory_utilization=0.9, - swap_space=0, cache_dtype="auto", enable_prefix_caching=True, ) diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index d267299815a6..f03d7c479eb2 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -118,7 +118,6 @@ def create_vllm_config( cache_config = CacheConfig( block_size=block_size, gpu_memory_utilization=0.9, - swap_space=0, cache_dtype=cache_dtype, enable_prefix_caching=True, ) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index a2c1466ca61a..c8a6c1301444 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -96,7 +96,6 @@ def get_vllm_config(): cache_config = CacheConfig( block_size=BLOCK_SIZE, gpu_memory_utilization=0.9, - swap_space=0, cache_dtype="auto", ) parallel_config = ParallelConfig() @@ -809,7 +808,6 @@ def test_hybrid_attention_mamba_tensor_shapes(): cache_config = CacheConfig( block_size=BLOCK_SIZE, gpu_memory_utilization=0.9, - swap_space=0, cache_dtype="auto", ) 
parallel_config = ParallelConfig() @@ -1242,7 +1240,6 @@ def test_cudagraph_sizes_capped_for_mamba_cache(): cache_config = CacheConfig( block_size=BLOCK_SIZE, gpu_memory_utilization=0.9, - swap_space=0, cache_dtype="auto", ) parallel_config = ParallelConfig() diff --git a/vllm/config/cache.py b/vllm/config/cache.py index 8a94141c91b6..71603d8c883e 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -1,21 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import math from dataclasses import field -from typing import TYPE_CHECKING, Any, Literal +from typing import Literal from pydantic import Field, SkipValidation, field_validator from vllm.config.utils import config from vllm.logger import init_logger -from vllm.utils.mem_constants import GiB_bytes -from vllm.utils.mem_utils import format_gib, get_cpu_memory - -if TYPE_CHECKING: - from vllm.config.parallel import ParallelConfig -else: - ParallelConfig = Any logger = init_logger(__name__) @@ -53,8 +45,6 @@ class CacheConfig: not matter if you have another vLLM instance running on the same GPU. For example, if you have two vLLM instances running on the same GPU, you can set the GPU memory utilization to 0.5 for each instance.""" - swap_space: float = Field(default=4, ge=0) - """Size of the CPU swap space per GPU (in GiB).""" cache_dtype: CacheDType = "auto" """Data type for kv cache storage. If "auto", will use model data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports @@ -173,7 +163,6 @@ def compute_hash(self) -> str: ignored_factors = { # Runtime/derived knobs that don't affect compiled graph shape "gpu_memory_utilization", - "swap_space", "is_attention_free", "num_gpu_blocks_override", "enable_prefix_caching", @@ -208,24 +197,3 @@ def _validate_cache_dtype(cls, cache_dtype: CacheDType) -> CacheDType: "scaling factor." 
) return cache_dtype - - def verify_with_parallel_config( - self, - parallel_config: ParallelConfig, - ) -> None: - swap_space_bytes = math.ceil(self.swap_space * GiB_bytes) - total_cpu_memory = get_cpu_memory() - # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel - # group are in the same node. However, the GPUs may span multiple nodes. - num_gpus_per_node = parallel_config.tensor_parallel_size - cpu_memory_usage = swap_space_bytes * num_gpus_per_node - - msg = ( - f"{format_gib(cpu_memory_usage)} GiB out of the " - f"{format_gib(total_cpu_memory)} GiB total CPU memory " - "is allocated for the swap space." - ) - if cpu_memory_usage > 0.7 * total_cpu_memory: - raise ValueError("Too large swap space. " + msg) - elif cpu_memory_usage > 0.4 * total_cpu_memory: - logger.warning("Possibly too large swap space. %s", msg) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 34c668362d40..d5b60a566fd3 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -674,8 +674,6 @@ def __post_init__(self): self.parallel_config.is_moe_model = self.model_config.is_moe - self.cache_config.verify_with_parallel_config(self.parallel_config) - if self.lora_config is not None: self.lora_config.verify_with_model_config(self.model_config) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 09ffd5e121cc..dc1735a01788 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -447,7 +447,6 @@ class EngineArgs: ) disable_sliding_window: bool = ModelConfig.disable_sliding_window disable_cascade_attn: bool = ModelConfig.disable_cascade_attn - swap_space: float = CacheConfig.swap_space offload_backend: str = OffloadConfig.offload_backend cpu_offload_gb: float = UVAOffloadConfig.cpu_offload_gb cpu_offload_params: set[str] = get_field(UVAOffloadConfig, "cpu_offload_params") @@ -961,7 +960,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: cache_group.add_argument( "--kv-cache-memory-bytes", 
**cache_kwargs["kv_cache_memory_bytes"] ) - cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"]) cache_group.add_argument("--kv-cache-dtype", **cache_kwargs["cache_dtype"]) cache_group.add_argument( "--num-gpu-blocks-override", **cache_kwargs["num_gpu_blocks_override"] @@ -1526,7 +1524,6 @@ def create_engine_config( block_size=self.block_size, gpu_memory_utilization=self.gpu_memory_utilization, kv_cache_memory_bytes=self.kv_cache_memory_bytes, - swap_space=self.swap_space, cache_dtype=resolved_cache_dtype, # type: ignore[arg-type] is_attention_free=model_config.is_attention_free, num_gpu_blocks_override=self.num_gpu_blocks_override, diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index eb1d4dbeb365..9c6d6ddcdf75 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -164,12 +164,6 @@ class LLM: compared with using gpu_memory_utilization. Note that kv_cache_memory_bytes (when not-None) ignores gpu_memory_utilization - swap_space: The size (GiB) of CPU memory per GPU to use as swap space. - This can be used for temporarily storing the states of the requests - when their `best_of` sampling parameters are larger than 1. If all - requests will have `best_of=1`, you can safely set this to 0. - Noting that `best_of` is only supported in V0. Otherwise, too small - values may cause out-of-memory (OOM) errors. cpu_offload_gb: The size (GiB) of CPU memory to use for offloading the model weights. 
This virtually increases the GPU memory space you can use to hold the model weights, at the cost of CPU-GPU data @@ -240,7 +234,6 @@ def __init__( chat_template: Path | str | None = None, seed: int = 0, gpu_memory_utilization: float = 0.9, - swap_space: float = 4, cpu_offload_gb: float = 0, offload_group_size: int = 0, offload_num_in_group: int = 1, @@ -265,6 +258,17 @@ def __init__( ) -> None: """LLM constructor.""" + if "swap_space" in kwargs: + kwargs.pop("swap_space") + import warnings + + warnings.warn( + "The 'swap_space' parameter is deprecated and ignored. " + "It will be removed in a future version.", + DeprecationWarning, + stacklevel=2, + ) + if "disable_log_stats" not in kwargs: kwargs["disable_log_stats"] = True @@ -353,7 +357,6 @@ def _make_config(value: Any, cls: type[_R]) -> _R: seed=seed, gpu_memory_utilization=gpu_memory_utilization, kv_cache_memory_bytes=kv_cache_memory_bytes, - swap_space=swap_space, cpu_offload_gb=cpu_offload_gb, offload_group_size=offload_group_size, offload_num_in_group=offload_num_in_group,