Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 141 additions & 0 deletions tests/engine/test_arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from contextlib import nullcontext
from dataclasses import dataclass, field
from typing import Annotated, Literal
from unittest.mock import patch

import pytest

Expand All @@ -22,6 +23,7 @@
optional_type,
parse_type,
)
from vllm.platforms import CpuArchEnum
from vllm.utils.argparse_utils import FlexibleArgumentParser


Expand Down Expand Up @@ -343,3 +345,142 @@ def test_human_readable_model_len():
for invalid in ["1a", "pwd", "10.24", "1.23M"]:
with pytest.raises(ArgumentError):
args = parser.parse_args(["--max-model-len", invalid])


# Each case: a restricted CPU architecture, the EngineArgs flag being forced
# on, the value passed, and the substring the raised error must contain.
@pytest.mark.parametrize(
    ("cpu_arch", "setting", "value", "error_match"),
    [
        (
            CpuArchEnum.POWERPC,
            "enable_chunked_prefill",
            True,
            "Chunked prefill is not supported",
        ),
        (
            CpuArchEnum.S390X,
            "enable_chunked_prefill",
            True,
            "Chunked prefill is not supported",
        ),
        (
            CpuArchEnum.ARM,
            "enable_prefix_caching",
            True,
            "Prefix caching is not supported",
        ),
        (
            CpuArchEnum.RISCV,
            "enable_prefix_caching",
            True,
            "Prefix caching is not supported",
        ),
    ],
)
# Innermost patch is passed first to the test function, so the argument
# order below is (mock_is_cpu, mock_get_cpu_arch, ...).
@patch("vllm.engine.arg_utils.current_platform.get_cpu_architecture")
@patch("vllm.engine.arg_utils.current_platform.is_cpu")
def test_restricted_cpu_enable_features_error(
    mock_is_cpu, mock_get_cpu_arch, cpu_arch, setting, value, error_match
):
    """Test that enabling features on restricted CPUs raises errors."""
    # Simulate running on a CPU platform with the restricted architecture.
    mock_is_cpu.return_value = True
    mock_get_cpu_arch.return_value = cpu_arch

    # Explicitly request the unsupported feature via the parametrized flag.
    engine_args = EngineArgs(model="facebook/opt-125m", **{setting: value})

    # Validation happens when the engine config is built, not at construction.
    with pytest.raises(ValueError, match=error_match):
        engine_args.create_engine_config()


@pytest.mark.parametrize(
    "cpu_arch",
    [CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM, CpuArchEnum.RISCV],
)
@patch("vllm.engine.arg_utils.current_platform.device_type", "cpu")
@patch("vllm.engine.arg_utils.current_platform.get_cpu_architecture")
@patch("vllm.engine.arg_utils.current_platform.is_cpu")
def test_restricted_cpu_auto_disable(mock_is_cpu, mock_get_cpu_arch, cpu_arch):
    """Chunked prefill and prefix caching default to off on restricted CPUs."""
    # Pretend we are on a CPU platform with the parametrized architecture.
    mock_get_cpu_arch.return_value = cpu_arch
    mock_is_cpu.return_value = True

    built = EngineArgs(model="facebook/opt-125m").create_engine_config()

    # Neither feature was requested, so both must have been auto-disabled.
    assert built.scheduler_config.enable_chunked_prefill is False
    assert built.cache_config.enable_prefix_caching is False


@patch("vllm.engine.arg_utils.current_platform.is_cpu")
def test_generation_model_disable_chunked_prefill_error(mock_is_cpu):
    """Disabling chunked prefill for a generation model must raise a ValueError."""
    # Non-CPU platform, so no architecture-based restrictions apply here.
    mock_is_cpu.return_value = False

    args = EngineArgs(model="facebook/opt-125m", enable_chunked_prefill=False)

    # The conflict is only detected when the engine config is built.
    with pytest.raises(
        ValueError, match="Chunked prefill is required for generation models"
    ):
        args.create_engine_config()


@pytest.mark.parametrize(
    "cpu_arch",
    [CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM, CpuArchEnum.RISCV],
)
@patch("vllm.engine.arg_utils.current_platform.device_type", "cpu")
@patch("vllm.engine.arg_utils.current_platform.get_cpu_architecture")
@patch("vllm.engine.arg_utils.current_platform.is_cpu")
def test_restricted_cpu_generation_model_no_error(
    mock_is_cpu, mock_get_cpu_arch, cpu_arch
):
    """Platform restrictions take precedence over model requirements.

    On a restricted CPU a generation model is allowed to run with chunked
    prefill disabled, even though that combination errors elsewhere.
    """
    # Simulate the restricted CPU platform for this parametrized case.
    mock_get_cpu_arch.return_value = cpu_arch
    mock_is_cpu.return_value = True

    cfg = EngineArgs(
        model="facebook/opt-125m",
        enable_chunked_prefill=False,
    ).create_engine_config()

    # No ValueError above, and the flag stays disabled as requested.
    assert cfg.scheduler_config.enable_chunked_prefill is False


@pytest.mark.parametrize(
    ("setting", "value", "config_path"),
    [
        ("enable_chunked_prefill", True, "scheduler_config.enable_chunked_prefill"),
        ("enable_chunked_prefill", None, "scheduler_config.enable_chunked_prefill"),
        ("enable_prefix_caching", False, "cache_config.enable_prefix_caching"),
    ],
)
@patch("vllm.engine.arg_utils.current_platform.is_cpu")
def test_non_restricted_platform_settings(mock_is_cpu, setting, value, config_path):
    """Settings behave as requested on platforms without CPU restrictions."""
    mock_is_cpu.return_value = False

    # A value of None means "leave the setting at its default".
    overrides = {} if value is None else {setting: value}
    config = EngineArgs(model="facebook/opt-125m", **overrides).create_engine_config()

    # Walk the dotted attribute path down to the effective config value.
    actual = config
    for part in config_path.split("."):
        actual = getattr(actual, part)

    if value is None and setting == "enable_chunked_prefill":
        # Generation models default chunked prefill to enabled.
        assert actual is True
    else:
        assert actual == value
137 changes: 85 additions & 52 deletions vllm/engine/arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1353,26 +1353,6 @@ def create_engine_config(

# Set default arguments for V1 Engine.
self._set_default_args(usage_context, model_config)
# Disable chunked prefill and prefix caching for:
# POWER (ppc64le)/ARM/s390x/RISCV CPUs in V1
if current_platform.is_cpu() and current_platform.get_cpu_architecture() in (
CpuArchEnum.POWERPC,
CpuArchEnum.S390X,
CpuArchEnum.ARM,
CpuArchEnum.RISCV,
):
logger.info(
"Chunked prefill is not supported for ARM and POWER, "
"S390X and RISC-V CPUs; "
"disabling it for V1 backend."
)
self.enable_chunked_prefill = False
logger.info(
"Prefix caching is not supported for ARM and POWER, "
"S390X and RISC-V CPUs; "
"disabling it for V1 backend."
)
self.enable_prefix_caching = False

assert self.enable_chunked_prefill is not None

Expand Down Expand Up @@ -1935,46 +1915,99 @@ def _set_default_args(
self, usage_context: UsageContext, model_config: ModelConfig
) -> None:
"""Set Default Arguments for V1 Engine."""
# Check if running on CPU architecture with feature restrictions
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not check these after applying defaults?

is_restricted_cpu = (
current_platform.is_cpu()
and current_platform.get_cpu_architecture()
in (
CpuArchEnum.POWERPC,
CpuArchEnum.S390X,
CpuArchEnum.ARM,
CpuArchEnum.RISCV,
)
)
restricted_cpu_names = "ARM, POWER, S390X, and RISC-V CPUs"

# Validate platform-specific restrictions
if is_restricted_cpu:
if self.enable_chunked_prefill is True:
raise ValueError(
f"Chunked prefill is not supported for {restricted_cpu_names}."
)
if self.enable_prefix_caching is True:
raise ValueError(
f"Prefix caching is not supported for {restricted_cpu_names}."
)

# Validate model-specific requirements
# (except on restricted CPUs where chunked prefill must be disabled)
if (
not is_restricted_cpu
and model_config.runner_type == "generate"
and self.enable_chunked_prefill is False
):
raise ValueError("Chunked prefill is required for generation models. ")
Copy link

Copilot AI Nov 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error message has a trailing space before the closing quote. This should be removed for consistency with other error messages.

Suggested change
raise ValueError("Chunked prefill is required for generation models. ")
raise ValueError("Chunked prefill is required for generation models.")

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would suggest changing the message to "Chunked prefill cannot be disabled for generation models."


# Handle defaults for chunked prefill and prefix caching
(
default_chunked_prefill,
default_prefix_caching,
) = self.get_chunked_prefill_prefix_caching_defaults(model_config)

if self.enable_chunked_prefill is None:
self.enable_chunked_prefill = default_chunked_prefill
# Override defaults for restricted CPUs
if is_restricted_cpu:
if self.enable_chunked_prefill is None:
logger.info(
"Chunked prefill is not supported for %s; "
"disabling it for V1 backend.",
Comment on lines +1961 to +1962
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldn't mention V1 anymore ...

Suggested change
"Chunked prefill is not supported for %s; "
"disabling it for V1 backend.",
"Chunked prefill is not supported for %s "
"and will be disabled.",

restricted_cpu_names,
)
self.enable_chunked_prefill = False

logger.debug(
"%s chunked prefill by default",
"Enabling" if default_chunked_prefill else "Disabling",
)
elif (
model_config.runner_type == "pooling"
and self.enable_chunked_prefill
and not default_chunked_prefill
):
logger.warning(
"This model does not officially support chunked prefill. "
"Enabling this manually may cause the engine to crash "
"or produce incorrect outputs.",
)
if self.enable_prefix_caching is None:
logger.info(
"Prefix caching is not supported for %s; "
"disabling it for V1 backend.",
Comment on lines +1969 to +1970
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"Prefix caching is not supported for %s; "
"disabling it for V1 backend.",
"Prefix caching is not supported for %s; "
"and will be disabled.",

restricted_cpu_names,
)
self.enable_prefix_caching = False
else:
# Set defaults for supported platforms
if self.enable_chunked_prefill is None:
self.enable_chunked_prefill = default_chunked_prefill

logger.debug(
"%s chunked prefill by default",
"Enabling" if default_chunked_prefill else "Disabling",
)
elif (
model_config.runner_type == "pooling"
and self.enable_chunked_prefill
and not default_chunked_prefill
):
logger.warning(
"This model does not officially support chunked prefill. "
"Enabling this manually may cause the engine to crash "
"or produce incorrect outputs.",
)

if self.enable_prefix_caching is None:
self.enable_prefix_caching = default_prefix_caching
if self.enable_prefix_caching is None:
self.enable_prefix_caching = default_prefix_caching

logger.debug(
"%s prefix caching by default",
"Enabling" if default_prefix_caching else "Disabling",
)
elif (
model_config.runner_type == "pooling"
and self.enable_prefix_caching
and not default_prefix_caching
):
logger.warning(
"This model does not officially support prefix caching. "
"Enabling this manually may cause the engine to crash "
"or produce incorrect outputs.",
)
logger.debug(
"%s prefix caching by default",
"Enabling" if default_prefix_caching else "Disabling",
)
elif (
model_config.runner_type == "pooling"
and self.enable_prefix_caching
and not default_prefix_caching
):
logger.warning(
"This model does not officially support prefix caching. "
"Enabling this manually may cause the engine to crash "
"or produce incorrect outputs.",
)

world_size = self.pipeline_parallel_size * self.tensor_parallel_size
(
Expand Down
Loading