Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .buildkite/performance-benchmarks/README.md
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we remove the newlines added in this file?

Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
Expand Down
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we remove the newlines added in this file?

Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
Expand All @@ -37,7 +36,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
Expand All @@ -64,7 +62,6 @@
"server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
Expand All @@ -91,7 +88,6 @@
"server_parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"tensor_parallel_size": 8,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
Expand Down
4 changes: 0 additions & 4 deletions .buildkite/performance-benchmarks/tests/serving-tests.json
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we remove the newlines added in this file?

Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
Expand All @@ -23,7 +22,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
Expand All @@ -41,7 +39,6 @@
"server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
Expand All @@ -59,7 +56,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_config": {
"model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
Expand Down
1 change: 0 additions & 1 deletion benchmarks/attention_benchmarks/mla_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,6 @@ def create_minimal_vllm_config(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=False,
)
Expand Down
1 change: 0 additions & 1 deletion benchmarks/attention_benchmarks/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,6 @@ def _create_vllm_config(
cache_config = CacheConfig(
block_size=config.block_size,
cache_dtype="auto",
swap_space=0,
)
cache_config.num_gpu_blocks = max_num_blocks
cache_config.num_cpu_blocks = 0
Expand Down
8 changes: 4 additions & 4 deletions docs/design/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -507,10 +507,10 @@ longer relevant in v1:
- `vllm:num_requests_swapped`
- `vllm:cpu_cache_usage_perc`

In this mode, when a request is preempted (e.g. to make room in KV
cache to complete other requests), we swap kv cache blocks out to CPU
memory. This is also known as "KV cache offloading" and is configured
with `--swap-space` and `--preemption-mode`.
In this mode, when a request was preempted (e.g. to make room in the KV
cache to complete other requests), KV cache blocks were swapped out to
CPU memory. This feature is no longer used in V1, so the `--swap-space`
flag has been removed.

Historically, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The
SequenceGroup encapsulated the idea of N Sequences which
Expand Down
2 changes: 1 addition & 1 deletion docs/serving/integrations/llamaindex.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ llm = Vllm(
model="microsoft/Orca-2-7b",
tensor_parallel_size=4,
max_new_tokens=100,
vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5},
vllm_kwargs={"gpu_memory_utilization": 0.5},
)
```

Expand Down
2 changes: 0 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -794,7 +794,6 @@ def __init__(
tensor_parallel_size: int = 1,
block_size: int = 16 if not torch.xpu.is_available() else 64,
enable_chunked_prefill: bool | None = False,
swap_space: int = 4,
enforce_eager: bool | None = False,
# Set this to avoid hanging issue
default_torch_num_threads: int | None = None,
Expand Down Expand Up @@ -831,7 +830,6 @@ def __init__(
trust_remote_code=trust_remote_code,
dtype=dtype,
seed=seed,
swap_space=swap_space,
enforce_eager=enforce_eager,
disable_log_stats=disable_log_stats,
tensor_parallel_size=tensor_parallel_size,
Expand Down
3 changes: 1 addition & 2 deletions tests/distributed/test_torchrun_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,14 @@

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# set different `gpu_memory_utilization` and `swap_space` for different ranks,
# set different `gpu_memory_utilization` for different ranks,
# to test if all ranks agree on the same kv cache configuration.
llm = LLM(
model="facebook/opt-125m",
tensor_parallel_size=2,
pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)),
distributed_executor_backend="external_launcher",
gpu_memory_utilization=random.uniform(0.7, 0.9),
swap_space=random.randint(1, 4),
seed=0,
)

Expand Down
3 changes: 1 addition & 2 deletions tests/distributed/test_torchrun_example_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# set different `gpu_memory_utilization` and `swap_space` for different ranks,
# set different `gpu_memory_utilization` for different ranks,
# to test if all ranks agree on the same kv cache configuration.
llm = LLM(
model="microsoft/Phi-mini-MoE-instruct",
Expand All @@ -37,7 +37,6 @@
enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1,
distributed_executor_backend="external_launcher",
gpu_memory_utilization=random.uniform(0.7, 0.9),
swap_space=random.randint(1, 4),
seed=0,
)

Expand Down
1 change: 0 additions & 1 deletion tests/lora/test_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]):
device_config=DeviceConfig("cuda"),
cache_config=CacheConfig(
block_size=16,
swap_space=0,
cache_dtype="auto",
),
lora_config=LoRAConfig(
Expand Down
1 change: 0 additions & 1 deletion tests/v1/attention/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,6 @@ def create_vllm_config(
cache_config = CacheConfig(
block_size=block_size,
cache_dtype="auto",
swap_space=0,
)
# Set cache blocks for testing
# (these may be set during initialization normally)
Expand Down
2 changes: 0 additions & 2 deletions tests/v1/core/test_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1776,7 +1776,6 @@ def create_scheduler_with_priority(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=enable_prefix_caching,
)
Expand Down Expand Up @@ -3726,7 +3725,6 @@ def _create_encoder_decoder_scheduler(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=False,
)
Expand Down
1 change: 0 additions & 1 deletion tests/v1/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,6 @@ def create_scheduler(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=enable_prefix_caching,
)
Expand Down
1 change: 0 additions & 1 deletion tests/v1/engine/test_engine_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,7 +506,6 @@ def test_encoder_instance_zero_kv_cache(
cache_config = CacheConfig(
block_size=16,
gpu_memory_utilization=gpu_memory_utilization,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=enable_prefix_caching,
)
Expand Down
1 change: 0 additions & 1 deletion tests/v1/kv_connector/unit/test_moriio_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,6 @@ def create_vllm_config(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=True,
)
Expand Down
1 change: 0 additions & 1 deletion tests/v1/kv_connector/unit/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,6 @@ def create_vllm_config(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype=cache_dtype,
enable_prefix_caching=True,
)
Expand Down
3 changes: 0 additions & 3 deletions tests/v1/worker/test_gpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ def get_vllm_config():
cache_config = CacheConfig(
block_size=BLOCK_SIZE,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
)
parallel_config = ParallelConfig()
Expand Down Expand Up @@ -809,7 +808,6 @@ def test_hybrid_attention_mamba_tensor_shapes():
cache_config = CacheConfig(
block_size=BLOCK_SIZE,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
)
parallel_config = ParallelConfig()
Expand Down Expand Up @@ -1242,7 +1240,6 @@ def test_cudagraph_sizes_capped_for_mamba_cache():
cache_config = CacheConfig(
block_size=BLOCK_SIZE,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
)
parallel_config = ParallelConfig()
Expand Down
34 changes: 1 addition & 33 deletions vllm/config/cache.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import math
from dataclasses import field
from typing import TYPE_CHECKING, Any, Literal
from typing import Literal

from pydantic import Field, SkipValidation, field_validator

from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.mem_utils import format_gib, get_cpu_memory

if TYPE_CHECKING:
from vllm.config.parallel import ParallelConfig
else:
ParallelConfig = Any

logger = init_logger(__name__)

Expand Down Expand Up @@ -53,8 +45,6 @@ class CacheConfig:
not matter if you have another vLLM instance running on the same GPU. For
example, if you have two vLLM instances running on the same GPU, you can
set the GPU memory utilization to 0.5 for each instance."""
swap_space: float = Field(default=4, ge=0)
"""Size of the CPU swap space per GPU (in GiB)."""
cache_dtype: CacheDType = "auto"
"""Data type for kv cache storage. If "auto", will use model data type.
CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports
Expand Down Expand Up @@ -173,7 +163,6 @@ def compute_hash(self) -> str:
ignored_factors = {
# Runtime/derived knobs that don't affect compiled graph shape
"gpu_memory_utilization",
"swap_space",
"is_attention_free",
"num_gpu_blocks_override",
"enable_prefix_caching",
Expand Down Expand Up @@ -208,24 +197,3 @@ def _validate_cache_dtype(cls, cache_dtype: CacheDType) -> CacheDType:
"scaling factor."
)
return cache_dtype

def verify_with_parallel_config(
self,
parallel_config: ParallelConfig,
) -> None:
swap_space_bytes = math.ceil(self.swap_space * GiB_bytes)
total_cpu_memory = get_cpu_memory()
# FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
# group are in the same node. However, the GPUs may span multiple nodes.
num_gpus_per_node = parallel_config.tensor_parallel_size
cpu_memory_usage = swap_space_bytes * num_gpus_per_node

msg = (
f"{format_gib(cpu_memory_usage)} GiB out of the "
f"{format_gib(total_cpu_memory)} GiB total CPU memory "
"is allocated for the swap space."
)
if cpu_memory_usage > 0.7 * total_cpu_memory:
raise ValueError("Too large swap space. " + msg)
elif cpu_memory_usage > 0.4 * total_cpu_memory:
logger.warning("Possibly too large swap space. %s", msg)
2 changes: 0 additions & 2 deletions vllm/config/vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,8 +674,6 @@ def __post_init__(self):

self.parallel_config.is_moe_model = self.model_config.is_moe

self.cache_config.verify_with_parallel_config(self.parallel_config)

if self.lora_config is not None:
self.lora_config.verify_with_model_config(self.model_config)

Expand Down
3 changes: 0 additions & 3 deletions vllm/engine/arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,6 @@ class EngineArgs:
)
disable_sliding_window: bool = ModelConfig.disable_sliding_window
disable_cascade_attn: bool = ModelConfig.disable_cascade_attn
swap_space: float = CacheConfig.swap_space
offload_backend: str = OffloadConfig.offload_backend
cpu_offload_gb: float = UVAOffloadConfig.cpu_offload_gb
cpu_offload_params: set[str] = get_field(UVAOffloadConfig, "cpu_offload_params")
Expand Down Expand Up @@ -961,7 +960,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
cache_group.add_argument(
"--kv-cache-memory-bytes", **cache_kwargs["kv_cache_memory_bytes"]
)
cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"])
cache_group.add_argument("--kv-cache-dtype", **cache_kwargs["cache_dtype"])
cache_group.add_argument(
"--num-gpu-blocks-override", **cache_kwargs["num_gpu_blocks_override"]
Expand Down Expand Up @@ -1526,7 +1524,6 @@ def create_engine_config(
block_size=self.block_size,
gpu_memory_utilization=self.gpu_memory_utilization,
kv_cache_memory_bytes=self.kv_cache_memory_bytes,
swap_space=self.swap_space,
cache_dtype=resolved_cache_dtype, # type: ignore[arg-type]
is_attention_free=model_config.is_attention_free,
num_gpu_blocks_override=self.num_gpu_blocks_override,
Expand Down
19 changes: 11 additions & 8 deletions vllm/entrypoints/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,12 +164,6 @@ class LLM:
compared with using gpu_memory_utilization. Note that
kv_cache_memory_bytes (when not-None) ignores
gpu_memory_utilization
swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
This can be used for temporarily storing the states of the requests
when their `best_of` sampling parameters are larger than 1. If all
requests will have `best_of=1`, you can safely set this to 0.
Noting that `best_of` is only supported in V0. Otherwise, too small
values may cause out-of-memory (OOM) errors.
cpu_offload_gb: The size (GiB) of CPU memory to use for offloading
the model weights. This virtually increases the GPU memory space
you can use to hold the model weights, at the cost of CPU-GPU data
Expand Down Expand Up @@ -240,7 +234,6 @@ def __init__(
chat_template: Path | str | None = None,
seed: int = 0,
gpu_memory_utilization: float = 0.9,
swap_space: float = 4,
cpu_offload_gb: float = 0,
offload_group_size: int = 0,
offload_num_in_group: int = 1,
Expand All @@ -265,6 +258,17 @@ def __init__(
) -> None:
"""LLM constructor."""

if "swap_space" in kwargs:
kwargs.pop("swap_space")
import warnings

warnings.warn(
"The 'swap_space' parameter is deprecated and ignored. "
"It will be removed in a future version.",
DeprecationWarning,
stacklevel=2,
)

if "disable_log_stats" not in kwargs:
kwargs["disable_log_stats"] = True

Expand Down Expand Up @@ -353,7 +357,6 @@ def _make_config(value: Any, cls: type[_R]) -> _R:
seed=seed,
gpu_memory_utilization=gpu_memory_utilization,
kv_cache_memory_bytes=kv_cache_memory_bytes,
swap_space=swap_space,
cpu_offload_gb=cpu_offload_gb,
offload_group_size=offload_group_size,
offload_num_in_group=offload_num_in_group,
Expand Down