From 67308e4b7464fdd8383e34ed39fd46de66afafb3 Mon Sep 17 00:00:00 2001 From: Divakar Verma Date: Mon, 17 Feb 2025 11:42:19 -0600 Subject: [PATCH 1/2] use amdsmi for device name on rocm Signed-off-by: Divakar Verma --- vllm/platforms/rocm.py | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 393b8a18527f..1ff06c14dc2f 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -1,9 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 -from functools import lru_cache +import os +from functools import lru_cache, wraps from typing import TYPE_CHECKING, Dict, List, Optional import torch +from amdsmi import (amdsmi_get_gpu_asic_info, amdsmi_get_processor_handles, + amdsmi_init, amdsmi_shut_down) import vllm.envs as envs from vllm.logger import init_logger @@ -54,6 +57,28 @@ } +def with_amdsmi_context(fn): + + @wraps(fn) + def wrapper(*args, **kwargs): + amdsmi_init() + try: + return fn(*args, **kwargs) + finally: + amdsmi_shut_down() + + return wrapper + + +def device_id_to_physical_device_id(device_id: int) -> int: + if "CUDA_VISIBLE_DEVICES" in os.environ: + device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",") + physical_device_id = device_ids[device_id] + return int(physical_device_id) + else: + return device_id + + class RocmPlatform(Platform): _enum = PlatformEnum.ROCM device_name: str = "rocm" @@ -96,13 +121,12 @@ def get_device_capability(cls, device_id: int = 0) -> DeviceCapability: return DeviceCapability(major=major, minor=minor) @classmethod + @with_amdsmi_context @lru_cache(maxsize=8) def get_device_name(cls, device_id: int = 0) -> str: - # NOTE: When using V1 this function is called when overriding the - # engine args. Calling torch.cuda.get_device_name(device_id) here - # will result in the ROCm context being initialized before other - # processes can be created. 
- return "AMD" + physical_device_id = device_id_to_physical_device_id(device_id) + handle = amdsmi_get_processor_handles()[physical_device_id] + return amdsmi_get_gpu_asic_info(handle)["market_name"] @classmethod def get_device_total_memory(cls, device_id: int = 0) -> int: From 3f73843df14bc154edd95de6c96afc86b3b2fb70 Mon Sep 17 00:00:00 2001 From: Divakar Verma Date: Mon, 17 Feb 2025 18:29:36 -0600 Subject: [PATCH 2/2] resolve HIP_VISIBLE_DEVICES and CUDA_VISIBLE_DEVICES Signed-off-by: Divakar Verma --- vllm/platforms/rocm.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 1ff06c14dc2f..e506689dc33c 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -56,6 +56,19 @@ "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`") } +# Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES` +if "HIP_VISIBLE_DEVICES" in os.environ: + val = os.environ["HIP_VISIBLE_DEVICES"] + if cuda_val := os.environ.get("CUDA_VISIBLE_DEVICES", None): + assert val == cuda_val + else: + os.environ["CUDA_VISIBLE_DEVICES"] = val + +# AMDSMI utils +# Note that amdsmi is not affected by `{CUDA/HIP}_VISIBLE_DEVICES`; +# all the related functions work on real physical device ids. +# The major benefit of using AMDSMI is that it will not initialize CUDA + def with_amdsmi_context(fn):