From 8d74b5aee9e780852de870c936b59707835e84f5 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Sat, 16 Nov 2024 23:14:23 -0800
Subject: [PATCH] [platforms] refactor cpu code (#10402)

Signed-off-by: youkaichao
---
 vllm/executor/cpu_executor.py | 68 +----------------------------------
 vllm/platforms/cpu.py         | 60 +++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+), 67 deletions(-)

diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index 4ceb5a837dd7f..1542a2ae367eb 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -2,9 +2,6 @@
 from functools import partial
 from typing import Any, Awaitable, List, Optional, Set, Tuple, Union
 
-import vllm.envs as envs
-from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig)
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
                                                   ResultHandler, WorkerMonitor)
@@ -13,7 +10,7 @@
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (GiB_bytes, get_distributed_init_method, get_open_port,
+from vllm.utils import (get_distributed_init_method, get_open_port,
                         get_vllm_instance_id, make_async)
 from vllm.worker.worker_base import WorkerWrapperBase
 
@@ -57,13 +54,6 @@ def _init_executor(self) -> None:
         os.environ["LOCAL_WORLD_SIZE"] = str(
             self.parallel_config.tensor_parallel_size)
 
-        self.model_config = _verify_and_get_model_config(self.model_config)
-        self.cache_config = _verify_and_get_cache_config(self.cache_config)
-        self.scheduler_config = _verify_and_get_scheduler_config(
-            self.scheduler_config)
-        self.parallel_config = _verify_and_get_parallel_config(
-            self.parallel_config)
-
         # Multiprocessing-based executor does not support multi-node setting.
         # Since it only works for single node, we can use the loopback address
         # 127.0.0.1 for communication.
@@ -313,62 +303,6 @@ async def check_health_async(self) -> None:
         self.check_health()
 
 
-def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
-    # Reminder: Please update docs/source/serving/compatibility_matrix.rst
-    # If the feature combo become valid
-    if not config.enforce_eager:
-        logger.warning(
-            "CUDA graph is not supported on CPU, fallback to the eager "
-            "mode.")
-        config.enforce_eager = True
-    return config
-
-
-def _verify_and_get_scheduler_config(
-        config: SchedulerConfig) -> SchedulerConfig:
-    # Reminder: Please update docs/source/serving/compatibility_matrix.rst
-    # If the feature combo become valid
-    if config.chunked_prefill_enabled:
-        logger.warning("Chunked prefill is not supported on CPU, disable it.")
-        config.chunked_prefill_enabled = False
-
-    return config
-
-
-def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
-    # Reminder: Please update docs/source/serving/compatibility_matrix.rst
-    # If the feature combo become valid
-    if config.enable_prefix_caching:
-        logger.warning("Prefix caching is not supported on CPU, disable it.")
-        config.enable_prefix_caching = False
-
-    kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
-
-    if kv_cache_space >= 0:
-        if kv_cache_space == 0:
-            config.cpu_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
-            logger.warning("Environment variable VLLM_CPU_KVCACHE_SPACE (GB) "
-                           "for CPU backend is not set, using 4 by default.")
-        else:
-            config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes  # type: ignore
-    else:
-        raise RuntimeError(
-            "Invalid environment variable VLLM_CPU_KVCACHE_SPACE"
-            f" {kv_cache_space}, expect a positive integer value.")
-
-    return config
-
-
-def _verify_and_get_parallel_config(config: ParallelConfig) -> ParallelConfig:
-    if (config.distributed_executor_backend is not None
-            and config.distributed_executor_backend != "mp"):
-        logger.warning(
-            "%s is not supported on CPU, fallback to mp distributed executor "
-            "backend.", config.distributed_executor_backend)
-        config.distributed_executor_backend = "mp"
-    return config
-
-
 def _driver_method_invoker(driver, method: str, *args, **kwargs):
     return getattr(driver, method)(*args, **kwargs)
 
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 5243f59203afc..42bee31dfb0e9 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -1,8 +1,19 @@
+from typing import TYPE_CHECKING
+
 import psutil
 import torch
 
+from vllm.logger import init_logger
+
 from .interface import Platform, PlatformEnum
 
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+else:
+    VllmConfig = None
+
+logger = init_logger(__name__)
+
 
 class CpuPlatform(Platform):
     _enum = PlatformEnum.CPU
@@ -18,3 +29,52 @@ def get_device_total_memory(cls, device_id: int = 0) -> int:
     @classmethod
     def inference_mode(cls):
         return torch.no_grad()
+
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        import vllm.envs as envs
+        from vllm.utils import GiB_bytes
+        model_config = vllm_config.model_config
+        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # If the feature combo become valid
+        if not model_config.enforce_eager:
+            logger.warning(
+                "CUDA graph is not supported on CPU, fallback to the eager "
+                "mode.")
+            model_config.enforce_eager = True
+
+        cache_config = vllm_config.cache_config
+
+        if cache_config.enable_prefix_caching:
+            logger.warning(
+                "Prefix caching is not supported on CPU, disable it.")
+            cache_config.enable_prefix_caching = False
+
+        kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
+
+        if kv_cache_space >= 0:
+            if kv_cache_space == 0:
+                cache_config.cpu_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
+                logger.warning(
+                    "Environment variable VLLM_CPU_KVCACHE_SPACE (GB) "
+                    "for CPU backend is not set, using 4 by default.")
+            else:
+                cache_config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes  # type: ignore # noqa
+        else:
+            raise RuntimeError(
+                "Invalid environment variable VLLM_CPU_KVCACHE_SPACE"
+                f" {kv_cache_space}, expect a positive integer value.")
+
+        scheduler_config = vllm_config.scheduler_config
+        if scheduler_config.chunked_prefill_enabled:
+            logger.warning(
+                "Chunked prefill is not supported on CPU, disable it.")
+            scheduler_config.chunked_prefill_enabled = False
+
+        parallel_config = vllm_config.parallel_config
+        if (parallel_config.distributed_executor_backend is not None
+                and parallel_config.distributed_executor_backend != "mp"):
+            logger.warning(("%s is not supported on CPU, fallback to mp "
+                            "distributed executor backend."),
+                           parallel_config.distributed_executor_backend)
+            parallel_config.distributed_executor_backend = "mp"
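
Note on the pattern: this patch moves the CPU-only config fix-ups out of the
executor's private _verify_and_get_* helpers and behind a per-platform
check_and_update_config hook, so platform-agnostic code can normalize a config
without hard-coding backend checks. The following is a minimal, self-contained
sketch of that hook pattern; the ModelConfig/VllmConfig stand-ins and the call
site are illustrative assumptions, not part of this diff (the real classes
live in vllm/config.py and vllm/platforms/).

from dataclasses import dataclass, field


# Hypothetical stand-ins for vLLM's config objects, only so the sketch runs
# on its own; the real definitions live in vllm/config.py.
@dataclass
class ModelConfig:
    enforce_eager: bool = False


@dataclass
class VllmConfig:
    model_config: ModelConfig = field(default_factory=ModelConfig)


class Platform:
    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        # Default: platforms with no special constraints leave the config
        # untouched.
        pass


class CpuPlatform(Platform):
    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        # Mirrors one CPU rule from the patch: CUDA graphs are unavailable
        # on CPU, so force eager mode instead of failing later.
        if not vllm_config.model_config.enforce_eager:
            vllm_config.model_config.enforce_eager = True


# Illustrative call site: the engine resolves the active platform once and
# lets it normalize the config before any worker is created.
config = VllmConfig()
CpuPlatform.check_and_update_config(config)
assert config.model_config.enforce_eager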