Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 141 additions & 0 deletions tests/engine/test_arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from contextlib import nullcontext
from dataclasses import dataclass, field
from typing import Annotated, Literal
from unittest.mock import patch

import pytest

Expand All @@ -22,6 +23,7 @@
optional_type,
parse_type,
)
from vllm.platforms import CpuArchEnum
from vllm.utils.argparse_utils import FlexibleArgumentParser


Expand Down Expand Up @@ -343,3 +345,142 @@ def test_human_readable_model_len():
for invalid in ["1a", "pwd", "10.24", "1.23M"]:
with pytest.raises(ArgumentError):
args = parser.parse_args(["--max-model-len", invalid])


# Each case: a restricted CPU architecture, the EngineArgs flag being forced
# on, the value passed, and the substring the raised error must contain.
@pytest.mark.parametrize(
    ("cpu_arch", "setting", "value", "error_match"),
    [
        (
            CpuArchEnum.POWERPC,
            "enable_chunked_prefill",
            True,
            "Chunked prefill is not supported",
        ),
        (
            CpuArchEnum.S390X,
            "enable_chunked_prefill",
            True,
            "Chunked prefill is not supported",
        ),
        (
            CpuArchEnum.ARM,
            "enable_prefix_caching",
            True,
            "Prefix caching is not supported",
        ),
        (
            CpuArchEnum.RISCV,
            "enable_prefix_caching",
            True,
            "Prefix caching is not supported",
        ),
    ],
)
# Innermost patch is passed first to the test function, so the argument
# order below is (mock_is_cpu, mock_get_cpu_arch, ...).
@patch("vllm.engine.arg_utils.current_platform.get_cpu_architecture")
@patch("vllm.engine.arg_utils.current_platform.is_cpu")
def test_restricted_cpu_enable_features_error(
    mock_is_cpu, mock_get_cpu_arch, cpu_arch, setting, value, error_match
):
    """Test that enabling features on restricted CPUs raises errors."""
    # Simulate running on a CPU platform with the restricted architecture.
    mock_is_cpu.return_value = True
    mock_get_cpu_arch.return_value = cpu_arch

    # Explicitly request the unsupported feature via the parametrized flag.
    engine_args = EngineArgs(model="facebook/opt-125m", **{setting: value})

    # Validation happens when the engine config is built, not at construction.
    with pytest.raises(ValueError, match=error_match):
        engine_args.create_engine_config()


@pytest.mark.parametrize(
    "cpu_arch",
    [CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM, CpuArchEnum.RISCV],
)
@patch("vllm.engine.arg_utils.current_platform.device_type", "cpu")
@patch("vllm.engine.arg_utils.current_platform.get_cpu_architecture")
@patch("vllm.engine.arg_utils.current_platform.is_cpu")
def test_restricted_cpu_auto_disable(mock_is_cpu, mock_get_cpu_arch, cpu_arch):
    """Chunked prefill and prefix caching default to off on restricted CPUs."""
    # Pretend we are on a CPU platform with the parametrized architecture.
    mock_get_cpu_arch.return_value = cpu_arch
    mock_is_cpu.return_value = True

    built = EngineArgs(model="facebook/opt-125m").create_engine_config()

    # Neither feature was requested, so both must have been auto-disabled.
    assert built.scheduler_config.enable_chunked_prefill is False
    assert built.cache_config.enable_prefix_caching is False


@patch("vllm.engine.arg_utils.current_platform.is_cpu")
def test_generation_model_disable_chunked_prefill_error(mock_is_cpu):
    """Disabling chunked prefill for a generation model must raise a ValueError."""
    # Non-CPU platform, so no architecture-based restrictions apply here.
    mock_is_cpu.return_value = False

    args = EngineArgs(model="facebook/opt-125m", enable_chunked_prefill=False)

    # The conflict is only detected when the engine config is built.
    with pytest.raises(
        ValueError, match="Chunked prefill is required for generation models"
    ):
        args.create_engine_config()


@pytest.mark.parametrize(
    "cpu_arch",
    [CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM, CpuArchEnum.RISCV],
)
@patch("vllm.engine.arg_utils.current_platform.device_type", "cpu")
@patch("vllm.engine.arg_utils.current_platform.get_cpu_architecture")
@patch("vllm.engine.arg_utils.current_platform.is_cpu")
def test_restricted_cpu_generation_model_no_error(
    mock_is_cpu, mock_get_cpu_arch, cpu_arch
):
    """Platform restrictions take precedence over model requirements.

    On a restricted CPU a generation model is allowed to run with chunked
    prefill disabled, even though that combination errors elsewhere.
    """
    # Simulate the restricted CPU platform for this parametrized case.
    mock_get_cpu_arch.return_value = cpu_arch
    mock_is_cpu.return_value = True

    cfg = EngineArgs(
        model="facebook/opt-125m",
        enable_chunked_prefill=False,
    ).create_engine_config()

    # No ValueError above, and the flag stays disabled as requested.
    assert cfg.scheduler_config.enable_chunked_prefill is False


@pytest.mark.parametrize(
    ("setting", "value", "config_path"),
    [
        ("enable_chunked_prefill", True, "scheduler_config.enable_chunked_prefill"),
        ("enable_chunked_prefill", None, "scheduler_config.enable_chunked_prefill"),
        ("enable_prefix_caching", False, "cache_config.enable_prefix_caching"),
    ],
)
@patch("vllm.engine.arg_utils.current_platform.is_cpu")
def test_non_restricted_platform_settings(mock_is_cpu, setting, value, config_path):
    """Settings behave as requested on platforms without CPU restrictions."""
    mock_is_cpu.return_value = False

    # A value of None means "leave the setting at its default".
    overrides = {} if value is None else {setting: value}
    config = EngineArgs(model="facebook/opt-125m", **overrides).create_engine_config()

    # Walk the dotted attribute path down to the effective config value.
    actual = config
    for part in config_path.split("."):
        actual = getattr(actual, part)

    if value is None and setting == "enable_chunked_prefill":
        # Generation models default chunked prefill to enabled.
        assert actual is True
    else:
        assert actual == value
137 changes: 85 additions & 52 deletions vllm/engine/arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1353,26 +1353,6 @@ def create_engine_config(

# Set default arguments for V1 Engine.
self._set_default_args(usage_context, model_config)
# Disable chunked prefill and prefix caching for:
# POWER (ppc64le)/ARM/s390x/RISCV CPUs in V1
if current_platform.is_cpu() and current_platform.get_cpu_architecture() in (
CpuArchEnum.POWERPC,
CpuArchEnum.S390X,
CpuArchEnum.ARM,
CpuArchEnum.RISCV,
):
logger.info(
"Chunked prefill is not supported for ARM and POWER, "
"S390X and RISC-V CPUs; "
"disabling it for V1 backend."
)
self.enable_chunked_prefill = False
logger.info(
"Prefix caching is not supported for ARM and POWER, "
"S390X and RISC-V CPUs; "
"disabling it for V1 backend."
)
self.enable_prefix_caching = False

assert self.enable_chunked_prefill is not None

Expand Down Expand Up @@ -1935,46 +1915,99 @@ def _set_default_args(
self, usage_context: UsageContext, model_config: ModelConfig
) -> None:
"""Set Default Arguments for V1 Engine."""
# Check if running on CPU architecture with feature restrictions
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not check these after applying defaults?

is_restricted_cpu = (
current_platform.is_cpu()
and current_platform.get_cpu_architecture()
in (
CpuArchEnum.POWERPC,
CpuArchEnum.S390X,
CpuArchEnum.ARM,
CpuArchEnum.RISCV,
)
)
restricted_cpu_names = "ARM, POWER, S390X, and RISC-V CPUs"

# Validate platform-specific restrictions
if is_restricted_cpu:
if self.enable_chunked_prefill is True:
raise ValueError(
f"Chunked prefill is not supported for {restricted_cpu_names}."
)
if self.enable_prefix_caching is True:
raise ValueError(
f"Prefix caching is not supported for {restricted_cpu_names}."
)

# Validate model-specific requirements
# (except on restricted CPUs where chunked prefill must be disabled)
if (
not is_restricted_cpu
and model_config.runner_type == "generate"
and self.enable_chunked_prefill is False
):
raise ValueError("Chunked prefill is required for generation models. ")
Copy link

Copilot AI Nov 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error message has a trailing space before the closing quote. This should be removed for consistency with other error messages.

Suggested change
raise ValueError("Chunked prefill is required for generation models. ")
raise ValueError("Chunked prefill is required for generation models.")

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would suggest changing the message to "Chunked prefill cannot be disabled for generation models."


# Handle defaults for chunked prefill and prefix caching
(
default_chunked_prefill,
default_prefix_caching,
) = self.get_chunked_prefill_prefix_caching_defaults(model_config)

if self.enable_chunked_prefill is None:
self.enable_chunked_prefill = default_chunked_prefill
# Override defaults for restricted CPUs
if is_restricted_cpu:
if self.enable_chunked_prefill is None:
logger.info(
"Chunked prefill is not supported for %s; "
"disabling it for V1 backend.",
Comment on lines +1961 to +1962
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldn't mention V1 anymore ...

Suggested change
"Chunked prefill is not supported for %s; "
"disabling it for V1 backend.",
"Chunked prefill is not supported for %s "
"and will be disabled.",

restricted_cpu_names,
)
self.enable_chunked_prefill = False

logger.debug(
"%s chunked prefill by default",
"Enabling" if default_chunked_prefill else "Disabling",
)
elif (
model_config.runner_type == "pooling"
and self.enable_chunked_prefill
and not default_chunked_prefill
):
logger.warning(
"This model does not officially support chunked prefill. "
"Enabling this manually may cause the engine to crash "
"or produce incorrect outputs.",
)
if self.enable_prefix_caching is None:
logger.info(
"Prefix caching is not supported for %s; "
"disabling it for V1 backend.",
Comment on lines +1969 to +1970
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"Prefix caching is not supported for %s; "
"disabling it for V1 backend.",
"Prefix caching is not supported for %s; "
"and will be disabled.",

restricted_cpu_names,
)
self.enable_prefix_caching = False
else:
# Set defaults for supported platforms
if self.enable_chunked_prefill is None:
self.enable_chunked_prefill = default_chunked_prefill

logger.debug(
"%s chunked prefill by default",
"Enabling" if default_chunked_prefill else "Disabling",
)
elif (
model_config.runner_type == "pooling"
and self.enable_chunked_prefill
and not default_chunked_prefill
):
logger.warning(
"This model does not officially support chunked prefill. "
"Enabling this manually may cause the engine to crash "
"or produce incorrect outputs.",
)

if self.enable_prefix_caching is None:
self.enable_prefix_caching = default_prefix_caching
if self.enable_prefix_caching is None:
self.enable_prefix_caching = default_prefix_caching

logger.debug(
"%s prefix caching by default",
"Enabling" if default_prefix_caching else "Disabling",
)
elif (
model_config.runner_type == "pooling"
and self.enable_prefix_caching
and not default_prefix_caching
):
logger.warning(
"This model does not officially support prefix caching. "
"Enabling this manually may cause the engine to crash "
"or produce incorrect outputs.",
)
logger.debug(
"%s prefix caching by default",
"Enabling" if default_prefix_caching else "Disabling",
)
elif (
model_config.runner_type == "pooling"
and self.enable_prefix_caching
and not default_prefix_caching
):
logger.warning(
"This model does not officially support prefix caching. "
"Enabling this manually may cause the engine to crash "
"or produce incorrect outputs.",
)

world_size = self.pipeline_parallel_size * self.tensor_parallel_size
(
Expand Down
Loading