Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .buildkite/performance-benchmarks/README.md
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we remove the newlines added in this file?

Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
Expand Down
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we remove the newlines added in this file?

Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
Expand All @@ -37,7 +36,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
Expand All @@ -64,7 +62,6 @@
"server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
Expand All @@ -91,7 +88,6 @@
"server_parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"tensor_parallel_size": 8,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
Expand Down
4 changes: 0 additions & 4 deletions .buildkite/performance-benchmarks/tests/serving-tests.json
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we remove the newlines added in this file?

Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
Expand All @@ -23,7 +22,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
Expand All @@ -41,7 +39,6 @@
"server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
Expand All @@ -59,7 +56,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_config": {
"model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
Expand Down
1 change: 0 additions & 1 deletion benchmarks/attention_benchmarks/mla_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,6 @@ def create_minimal_vllm_config(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=False,
)
Expand Down
1 change: 0 additions & 1 deletion benchmarks/attention_benchmarks/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,6 @@ def _create_vllm_config(
cache_config = CacheConfig(
block_size=config.block_size,
cache_dtype="auto",
swap_space=0,
)
cache_config.num_gpu_blocks = max_num_blocks
cache_config.num_cpu_blocks = 0
Expand Down
8 changes: 4 additions & 4 deletions docs/design/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -507,10 +507,10 @@ longer relevant in v1:
- `vllm:num_requests_swapped`
- `vllm:cpu_cache_usage_perc`

In this mode, when a request is preempted (e.g. to make room in KV
cache to complete other requests), we swap kv cache blocks out to CPU
memory. This is also known as "KV cache offloading" and is configured
with `--swap-space` and `--preemption-mode`.
In this mode, when a request was preempted (e.g. to make room in the KV
cache to complete other requests), KV cache blocks were swapped out to
CPU memory. This feature is no longer used in V1, so the `--swap-space`
flag has been removed.

Historically, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The
SequenceGroup encapsulated the idea of N Sequences which
Expand Down
2 changes: 1 addition & 1 deletion docs/serving/integrations/llamaindex.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ llm = Vllm(
model="microsoft/Orca-2-7b",
tensor_parallel_size=4,
max_new_tokens=100,
vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5},
vllm_kwargs={"gpu_memory_utilization": 0.5},
)
```

Expand Down
2 changes: 0 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -794,7 +794,6 @@ def __init__(
tensor_parallel_size: int = 1,
block_size: int = 16 if not torch.xpu.is_available() else 64,
enable_chunked_prefill: bool | None = False,
swap_space: int = 4,
enforce_eager: bool | None = False,
# Set this to avoid hanging issue
default_torch_num_threads: int | None = None,
Expand Down Expand Up @@ -831,7 +830,6 @@ def __init__(
trust_remote_code=trust_remote_code,
dtype=dtype,
seed=seed,
swap_space=swap_space,
enforce_eager=enforce_eager,
disable_log_stats=disable_log_stats,
tensor_parallel_size=tensor_parallel_size,
Expand Down
3 changes: 1 addition & 2 deletions tests/distributed/test_torchrun_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,14 @@

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# set different `gpu_memory_utilization` and `swap_space` for different ranks,
# set different `gpu_memory_utilization` for different ranks,
# to test if all ranks agree on the same kv cache configuration.
llm = LLM(
model="facebook/opt-125m",
tensor_parallel_size=2,
pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)),
distributed_executor_backend="external_launcher",
gpu_memory_utilization=random.uniform(0.7, 0.9),
swap_space=random.randint(1, 4),
seed=0,
)

Expand Down
3 changes: 1 addition & 2 deletions tests/distributed/test_torchrun_example_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# set different `gpu_memory_utilization` and `swap_space` for different ranks,
# set different `gpu_memory_utilization` for different ranks,
# to test if all ranks agree on the same kv cache configuration.
llm = LLM(
model="microsoft/Phi-mini-MoE-instruct",
Expand All @@ -37,7 +37,6 @@
enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1,
distributed_executor_backend="external_launcher",
gpu_memory_utilization=random.uniform(0.7, 0.9),
swap_space=random.randint(1, 4),
seed=0,
)

Expand Down
1 change: 0 additions & 1 deletion tests/lora/test_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]):
device_config=DeviceConfig("cuda"),
cache_config=CacheConfig(
block_size=16,
swap_space=0,
cache_dtype="auto",
),
lora_config=LoRAConfig(
Expand Down
1 change: 0 additions & 1 deletion tests/v1/attention/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,6 @@ def create_vllm_config(
cache_config = CacheConfig(
block_size=block_size,
cache_dtype="auto",
swap_space=0,
)
# Set cache blocks for testing
# (these may be set during initialization normally)
Expand Down
2 changes: 0 additions & 2 deletions tests/v1/core/test_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1776,7 +1776,6 @@ def create_scheduler_with_priority(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=enable_prefix_caching,
)
Expand Down Expand Up @@ -3726,7 +3725,6 @@ def _create_encoder_decoder_scheduler(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=False,
)
Expand Down
1 change: 0 additions & 1 deletion tests/v1/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,6 @@ def create_scheduler(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=enable_prefix_caching,
)
Expand Down
1 change: 0 additions & 1 deletion tests/v1/engine/test_engine_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,7 +506,6 @@ def test_encoder_instance_zero_kv_cache(
cache_config = CacheConfig(
block_size=16,
gpu_memory_utilization=gpu_memory_utilization,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=enable_prefix_caching,
)
Expand Down
1 change: 0 additions & 1 deletion tests/v1/kv_connector/unit/test_moriio_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,6 @@ def create_vllm_config(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=True,
)
Expand Down
1 change: 0 additions & 1 deletion tests/v1/kv_connector/unit/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,6 @@ def create_vllm_config(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype=cache_dtype,
enable_prefix_caching=True,
)
Expand Down
3 changes: 0 additions & 3 deletions tests/v1/worker/test_gpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ def get_vllm_config():
cache_config = CacheConfig(
block_size=BLOCK_SIZE,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
)
parallel_config = ParallelConfig()
Expand Down Expand Up @@ -809,7 +808,6 @@ def test_hybrid_attention_mamba_tensor_shapes():
cache_config = CacheConfig(
block_size=BLOCK_SIZE,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
)
parallel_config = ParallelConfig()
Expand Down Expand Up @@ -1242,7 +1240,6 @@ def test_cudagraph_sizes_capped_for_mamba_cache():
cache_config = CacheConfig(
block_size=BLOCK_SIZE,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
)
parallel_config = ParallelConfig()
Expand Down
34 changes: 1 addition & 33 deletions vllm/config/cache.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import math
from dataclasses import field
from typing import TYPE_CHECKING, Any, Literal
from typing import Literal

from pydantic import Field, SkipValidation, field_validator

from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.mem_utils import format_gib, get_cpu_memory

if TYPE_CHECKING:
from vllm.config.parallel import ParallelConfig
else:
ParallelConfig = Any

logger = init_logger(__name__)

Expand Down Expand Up @@ -53,8 +45,6 @@ class CacheConfig:
not matter if you have another vLLM instance running on the same GPU. For
example, if you have two vLLM instances running on the same GPU, you can
set the GPU memory utilization to 0.5 for each instance."""
swap_space: float = Field(default=4, ge=0)
"""Size of the CPU swap space per GPU (in GiB)."""
cache_dtype: CacheDType = "auto"
"""Data type for kv cache storage. If "auto", will use model data type.
CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports
Expand Down Expand Up @@ -173,7 +163,6 @@ def compute_hash(self) -> str:
ignored_factors = {
# Runtime/derived knobs that don't affect compiled graph shape
"gpu_memory_utilization",
"swap_space",
"is_attention_free",
"num_gpu_blocks_override",
"enable_prefix_caching",
Expand Down Expand Up @@ -208,24 +197,3 @@ def _validate_cache_dtype(cls, cache_dtype: CacheDType) -> CacheDType:
"scaling factor."
)
return cache_dtype

def verify_with_parallel_config(
self,
parallel_config: ParallelConfig,
) -> None:
swap_space_bytes = math.ceil(self.swap_space * GiB_bytes)
total_cpu_memory = get_cpu_memory()
# FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
# group are in the same node. However, the GPUs may span multiple nodes.
num_gpus_per_node = parallel_config.tensor_parallel_size
cpu_memory_usage = swap_space_bytes * num_gpus_per_node

msg = (
f"{format_gib(cpu_memory_usage)} GiB out of the "
f"{format_gib(total_cpu_memory)} GiB total CPU memory "
"is allocated for the swap space."
)
if cpu_memory_usage > 0.7 * total_cpu_memory:
raise ValueError("Too large swap space. " + msg)
elif cpu_memory_usage > 0.4 * total_cpu_memory:
logger.warning("Possibly too large swap space. %s", msg)
2 changes: 0 additions & 2 deletions vllm/config/vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,8 +674,6 @@ def __post_init__(self):

self.parallel_config.is_moe_model = self.model_config.is_moe

self.cache_config.verify_with_parallel_config(self.parallel_config)

if self.lora_config is not None:
self.lora_config.verify_with_model_config(self.model_config)

Expand Down
3 changes: 0 additions & 3 deletions vllm/engine/arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,6 @@ class EngineArgs:
)
disable_sliding_window: bool = ModelConfig.disable_sliding_window
disable_cascade_attn: bool = ModelConfig.disable_cascade_attn
swap_space: float = CacheConfig.swap_space
offload_backend: str = OffloadConfig.offload_backend
cpu_offload_gb: float = UVAOffloadConfig.cpu_offload_gb
cpu_offload_params: set[str] = get_field(UVAOffloadConfig, "cpu_offload_params")
Expand Down Expand Up @@ -961,7 +960,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
cache_group.add_argument(
"--kv-cache-memory-bytes", **cache_kwargs["kv_cache_memory_bytes"]
)
cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"])
cache_group.add_argument("--kv-cache-dtype", **cache_kwargs["cache_dtype"])
cache_group.add_argument(
"--num-gpu-blocks-override", **cache_kwargs["num_gpu_blocks_override"]
Expand Down Expand Up @@ -1526,7 +1524,6 @@ def create_engine_config(
block_size=self.block_size,
gpu_memory_utilization=self.gpu_memory_utilization,
kv_cache_memory_bytes=self.kv_cache_memory_bytes,
swap_space=self.swap_space,
cache_dtype=resolved_cache_dtype, # type: ignore[arg-type]
is_attention_free=model_config.is_attention_free,
num_gpu_blocks_override=self.num_gpu_blocks_override,
Expand Down
19 changes: 11 additions & 8 deletions vllm/entrypoints/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,12 +164,6 @@ class LLM:
compared with using gpu_memory_utilization. Note that
kv_cache_memory_bytes (when not-None) ignores
gpu_memory_utilization
swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
This can be used for temporarily storing the states of the requests
when their `best_of` sampling parameters are larger than 1. If all
requests will have `best_of=1`, you can safely set this to 0.
Noting that `best_of` is only supported in V0. Otherwise, too small
values may cause out-of-memory (OOM) errors.
cpu_offload_gb: The size (GiB) of CPU memory to use for offloading
the model weights. This virtually increases the GPU memory space
you can use to hold the model weights, at the cost of CPU-GPU data
Expand Down Expand Up @@ -240,7 +234,6 @@ def __init__(
chat_template: Path | str | None = None,
seed: int = 0,
gpu_memory_utilization: float = 0.9,
swap_space: float = 4,
cpu_offload_gb: float = 0,
offload_group_size: int = 0,
offload_num_in_group: int = 1,
Expand All @@ -265,6 +258,17 @@ def __init__(
) -> None:
"""LLM constructor."""

if "swap_space" in kwargs:
kwargs.pop("swap_space")
import warnings

warnings.warn(
"The 'swap_space' parameter is deprecated and ignored. "
"It will be removed in a future version.",
DeprecationWarning,
stacklevel=2,
)

if "disable_log_stats" not in kwargs:
kwargs["disable_log_stats"] = True

Expand Down Expand Up @@ -353,7 +357,6 @@ def _make_config(value: Any, cls: type[_R]) -> _R:
seed=seed,
gpu_memory_utilization=gpu_memory_utilization,
kv_cache_memory_bytes=kv_cache_memory_bytes,
swap_space=swap_space,
cpu_offload_gb=cpu_offload_gb,
offload_group_size=offload_group_size,
offload_num_in_group=offload_num_in_group,
Expand Down