From 8d74b5aee9e780852de870c936b59707835e84f5 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Sat, 16 Nov 2024 23:14:23 -0800
Subject: [PATCH] [platforms] refactor cpu code (#10402)

Signed-off-by: youkaichao
---
 vllm/executor/cpu_executor.py | 68 +----------------------------------
 vllm/platforms/cpu.py         | 60 +++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+), 67 deletions(-)

diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index 4ceb5a837dd7f..1542a2ae367eb 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -2,9 +2,6 @@
 from functools import partial
 from typing import Any, Awaitable, List, Optional, Set, Tuple, Union
 
-import vllm.envs as envs
-from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig)
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
                                                   ResultHandler, WorkerMonitor)
@@ -13,7 +10,7 @@
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (GiB_bytes, get_distributed_init_method, get_open_port,
+from vllm.utils import (get_distributed_init_method, get_open_port,
                         get_vllm_instance_id, make_async)
 from vllm.worker.worker_base import WorkerWrapperBase
 
@@ -57,13 +54,6 @@ def _init_executor(self) -> None:
         os.environ["LOCAL_WORLD_SIZE"] = str(
             self.parallel_config.tensor_parallel_size)
 
-        self.model_config = _verify_and_get_model_config(self.model_config)
-        self.cache_config = _verify_and_get_cache_config(self.cache_config)
-        self.scheduler_config = _verify_and_get_scheduler_config(
-            self.scheduler_config)
-        self.parallel_config = _verify_and_get_parallel_config(
-            self.parallel_config)
-
         # Multiprocessing-based executor does not support multi-node setting.
         # Since it only works for single node, we can use the loopback address
         # 127.0.0.1 for communication.
@@ -313,62 +303,6 @@ async def check_health_async(self) -> None:
         self.check_health()
 
 
-def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
-    # Reminder: Please update docs/source/serving/compatibility_matrix.rst
-    # If the feature combo become valid
-    if not config.enforce_eager:
-        logger.warning(
-            "CUDA graph is not supported on CPU, fallback to the eager "
-            "mode.")
-        config.enforce_eager = True
-    return config
-
-
-def _verify_and_get_scheduler_config(
-        config: SchedulerConfig) -> SchedulerConfig:
-    # Reminder: Please update docs/source/serving/compatibility_matrix.rst
-    # If the feature combo become valid
-    if config.chunked_prefill_enabled:
-        logger.warning("Chunked prefill is not supported on CPU, disable it.")
-        config.chunked_prefill_enabled = False
-
-    return config
-
-
-def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
-    # Reminder: Please update docs/source/serving/compatibility_matrix.rst
-    # If the feature combo become valid
-    if config.enable_prefix_caching:
-        logger.warning("Prefix caching is not supported on CPU, disable it.")
-        config.enable_prefix_caching = False
-
-    kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
-
-    if kv_cache_space >= 0:
-        if kv_cache_space == 0:
-            config.cpu_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
-            logger.warning("Environment variable VLLM_CPU_KVCACHE_SPACE (GB) "
-                           "for CPU backend is not set, using 4 by default.")
-        else:
-            config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes  # type: ignore
-    else:
-        raise RuntimeError(
-            "Invalid environment variable VLLM_CPU_KVCACHE_SPACE"
-            f" {kv_cache_space}, expect a positive integer value.")
-
-    return config
-
-
-def _verify_and_get_parallel_config(config: ParallelConfig) -> ParallelConfig:
-    if (config.distributed_executor_backend is not None
-            and config.distributed_executor_backend != "mp"):
-        logger.warning(
-            "%s is not supported on CPU, fallback to mp distributed executor "
-            "backend.", config.distributed_executor_backend)
-        config.distributed_executor_backend = "mp"
-    return config
-
-
 def _driver_method_invoker(driver, method: str, *args, **kwargs):
     return getattr(driver, method)(*args, **kwargs)
 
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 5243f59203afc..42bee31dfb0e9 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -1,8 +1,19 @@
+from typing import TYPE_CHECKING
+
 import psutil
 import torch
 
+from vllm.logger import init_logger
+
 from .interface import Platform, PlatformEnum
 
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+else:
+    VllmConfig = None
+
+logger = init_logger(__name__)
+
 
 class CpuPlatform(Platform):
     _enum = PlatformEnum.CPU
@@ -18,3 +29,52 @@ def get_device_total_memory(cls, device_id: int = 0) -> int:
     @classmethod
     def inference_mode(cls):
         return torch.no_grad()
+
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        import vllm.envs as envs
+        from vllm.utils import GiB_bytes
+        model_config = vllm_config.model_config
+        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # If the feature combo become valid
+        if not model_config.enforce_eager:
+            logger.warning(
+                "CUDA graph is not supported on CPU, fallback to the eager "
+                "mode.")
+            model_config.enforce_eager = True
+
+        cache_config = vllm_config.cache_config
+
+        if cache_config.enable_prefix_caching:
+            logger.warning(
+                "Prefix caching is not supported on CPU, disable it.")
+            cache_config.enable_prefix_caching = False
+
+        kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
+
+        if kv_cache_space >= 0:
+            if kv_cache_space == 0:
+                cache_config.cpu_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
+                logger.warning(
+                    "Environment variable VLLM_CPU_KVCACHE_SPACE (GB) "
+                    "for CPU backend is not set, using 4 by default.")
+            else:
+                cache_config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes  # type: ignore # noqa
+        else:
+            raise RuntimeError(
+                "Invalid environment variable VLLM_CPU_KVCACHE_SPACE"
+                f" {kv_cache_space}, expect a positive integer value.")
+
+        scheduler_config = vllm_config.scheduler_config
+        if scheduler_config.chunked_prefill_enabled:
+            logger.warning(
+                "Chunked prefill is not supported on CPU, disable it.")
+            scheduler_config.chunked_prefill_enabled = False
+
+        parallel_config = vllm_config.parallel_config
+        if (parallel_config.distributed_executor_backend is not None
+                and parallel_config.distributed_executor_backend != "mp"):
+            logger.warning(("%s is not supported on CPU, fallback to mp "
+                            "distributed executor backend."),
+                           parallel_config.distributed_executor_backend)
+            parallel_config.distributed_executor_backend = "mp"
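
Note on the pattern: this patch moves the CPU-only config fix-ups out of the
executor's private _verify_and_get_* helpers and behind a per-platform
check_and_update_config hook, so platform-agnostic code can normalize a config
without hard-coding backend checks. The following is a minimal, self-contained
sketch of that hook pattern; the ModelConfig/VllmConfig stand-ins and the call
site are illustrative assumptions, not part of this diff (the real classes
live in vllm/config.py and vllm/platforms/).

from dataclasses import dataclass, field


# Hypothetical stand-ins for vLLM's config objects, only so the sketch runs
# on its own; the real definitions live in vllm/config.py.
@dataclass
class ModelConfig:
    enforce_eager: bool = False


@dataclass
class VllmConfig:
    model_config: ModelConfig = field(default_factory=ModelConfig)


class Platform:
    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        # Default: platforms with no special constraints leave the config
        # untouched.
        pass


class CpuPlatform(Platform):
    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        # Mirrors one CPU rule from the patch: CUDA graphs are unavailable
        # on CPU, so force eager mode instead of failing later.
        if not vllm_config.model_config.enforce_eager:
            vllm_config.model_config.enforce_eager = True


# Illustrative call site: the engine resolves the active platform once and
# lets it normalize the config before any worker is created.
config = VllmConfig()
CpuPlatform.check_and_update_config(config)
assert config.model_config.enforce_eager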