From 67308e4b7464fdd8383e34ed39fd46de66afafb3 Mon Sep 17 00:00:00 2001 From: Divakar Verma Date: Mon, 17 Feb 2025 11:42:19 -0600 Subject: [PATCH 1/2] use amdsmi for device name on rocm Signed-off-by: Divakar Verma --- vllm/platforms/rocm.py | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 393b8a18527f..1ff06c14dc2f 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -1,9 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 -from functools import lru_cache +import os +from functools import lru_cache, wraps from typing import TYPE_CHECKING, Dict, List, Optional import torch +from amdsmi import (amdsmi_get_gpu_asic_info, amdsmi_get_processor_handles, + amdsmi_init, amdsmi_shut_down) import vllm.envs as envs from vllm.logger import init_logger @@ -54,6 +57,28 @@ } +def with_amdsmi_context(fn): + + @wraps(fn) + def wrapper(*args, **kwargs): + amdsmi_init() + try: + return fn(*args, **kwargs) + finally: + amdsmi_shut_down() + + return wrapper + + +def device_id_to_physical_device_id(device_id: int) -> int: + if "CUDA_VISIBLE_DEVICES" in os.environ: + device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",") + physical_device_id = device_ids[device_id] + return int(physical_device_id) + else: + return device_id + + class RocmPlatform(Platform): _enum = PlatformEnum.ROCM device_name: str = "rocm" @@ -96,13 +121,12 @@ def get_device_capability(cls, device_id: int = 0) -> DeviceCapability: return DeviceCapability(major=major, minor=minor) @classmethod + @with_amdsmi_context @lru_cache(maxsize=8) def get_device_name(cls, device_id: int = 0) -> str: - # NOTE: When using V1 this function is called when overriding the - # engine args. Calling torch.cuda.get_device_name(device_id) here - # will result in the ROCm context being initialized before other - # processes can be created. 
- return "AMD" + physical_device_id = device_id_to_physical_device_id(device_id) + handle = amdsmi_get_processor_handles()[physical_device_id] + return amdsmi_get_gpu_asic_info(handle)["market_name"] @classmethod def get_device_total_memory(cls, device_id: int = 0) -> int: From 3f73843df14bc154edd95de6c96afc86b3b2fb70 Mon Sep 17 00:00:00 2001 From: Divakar Verma Date: Mon, 17 Feb 2025 18:29:36 -0600 Subject: [PATCH 2/2] resolve HIP_VISIBLE_DEVICES and CUDA_VISIBLE_DEVICES Signed-off-by: Divakar Verma --- vllm/platforms/rocm.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 1ff06c14dc2f..e506689dc33c 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -56,6 +56,19 @@ "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`") } +# Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES` +if "HIP_VISIBLE_DEVICES" in os.environ: + val = os.environ["HIP_VISIBLE_DEVICES"] + if cuda_val := os.environ.get("CUDA_VISIBLE_DEVICES", None): + assert val == cuda_val + else: + os.environ["CUDA_VISIBLE_DEVICES"] = val + +# AMDSMI utils +# Note that amdsmi is not affected by `{CUDA/HIP}_VISIBLE_DEVICES`; +# all the related functions work on real physical device ids. +# The major benefit of using AMDSMI is that it will not initialize CUDA + def with_amdsmi_context(fn):