diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index 7132681050e1..67b834533b7d 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -174,6 +174,15 @@ def _report_usage_once(self, model_architecture: str,
                 cuda_get_device_properties(0, ("name", "total_memory")))
         if current_platform.is_cuda():
             self.cuda_runtime = torch.version.cuda
+        if current_platform.is_tpu():
+            try:
+                import torch_xla
+                self.gpu_count = torch_xla.runtime.world_size()
+                self.gpu_type = torch_xla.tpu.get_tpu_type()
+                self.gpu_memory_per_device = (
+                    torch_xla.core.xla_model.get_memory_info()["bytes_limit"])
+            except Exception:
+                pass
         self.provider = _detect_cloud_provider()
         self.architecture = platform.machine()
         self.platform = platform.platform()
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 54f0232da2b2..c33535b3d360 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -36,7 +36,6 @@
 from vllm.v1.metrics.loggers import (LoggingStatLogger, PrometheusStatLogger,
                                      StatLoggerBase)
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
-from vllm.v1.utils import report_usage_stats
 
 logger = init_logger(__name__)
 
@@ -113,9 +112,6 @@ def __init__(
         except RuntimeError:
             pass
 
-        # If usage stat is enabled, collect relevant info.
-        report_usage_stats(vllm_config, usage_context)
-
     @classmethod
     def from_vllm_config(
         cls,
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index af67408097ab..a07595a552af 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -28,7 +28,6 @@
 from vllm.v1.engine.parallel_sampling import ParentRequest
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
-from vllm.v1.utils import report_usage_stats
 
 logger = init_logger(__name__)
 
@@ -97,9 +96,6 @@ def __init__(
         # for v0 compatibility
         self.model_executor = self.engine_core.engine_core.model_executor  # type: ignore
 
-        # If usage stat is enabled, collect relevant info.
-        report_usage_stats(vllm_config, usage_context)
-
     @classmethod
     def from_vllm_config(
         cls,
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index dc6457bf9032..9c238c3aad8e 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -205,7 +205,9 @@ def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor,
     return to_tensor[:length].copy_(from_tensor[:length], non_blocking=True)
 
 
-def report_usage_stats(vllm_config, usage_context: UsageContext) -> None:
+def report_usage_stats(
+        vllm_config,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT) -> None:
     """Report usage statistics if enabled."""
 
     if not is_usage_stats_enabled():
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 424c73e3ab7f..68c4e94fcd73 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -23,6 +23,7 @@
 from vllm.utils import GiB_bytes
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
+from vllm.v1.utils import report_usage_stats
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 from vllm.v1.worker.worker_base import WorkerBase
 
@@ -141,6 +142,10 @@ def init_device(self):
         self.model_runner: GPUModelRunner = GPUModelRunner(
             self.vllm_config, self.device)
 
+        if self.rank == 0:
+            # If usage stat is enabled, collect relevant info.
+            report_usage_stats(self.vllm_config)
+
     # FIXME(youkaichao & ywang96): Use TorchDispatchMode instead of memory pool
     # to hijack tensor allocation.
     def load_model(self) -> None:
diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py
index 2204f037a6d5..de676541effa 100644
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -21,7 +21,7 @@
 from vllm.v1.kv_cache_interface import (AttentionSpec, KVCacheConfig,
                                         KVCacheSpec)
 from vllm.v1.outputs import ModelRunnerOutput
-from vllm.v1.utils import bind_kv_cache
+from vllm.v1.utils import bind_kv_cache, report_usage_stats
 from vllm.v1.worker.tpu_model_runner import TPUModelRunner
 
 logger = init_logger(__name__)
@@ -133,6 +133,10 @@ def init_device(self):
         # Init ModelRunner here, so that we have access to self.device.
         self.model_runner = TPUModelRunner(self.vllm_config, self.device)
 
+        if self.rank == 0:
+            # If usage stat is enabled, collect relevant info.
+            report_usage_stats(self.vllm_config)
+
     def determine_available_memory(self) -> int:
         kv_caches: dict[str, torch.Tensor] = {}
         kv_cache_spec = self.model_runner.get_kv_cache_spec()