From 71f237378a01b48e7d78870fe3d59f0d2e46b66b Mon Sep 17 00:00:00 2001
From: raviguptaamd
Date: Tue, 17 Feb 2026 17:29:25 +0000
Subject: [PATCH 1/2] [ROCm] Enable DBO (Dual Batch Overlap) on ROCm

Relax the CUDA-only assertion in SMControlContextManager to also allow
ROCm platforms. The HIP runtime exposes CU count via
torch.cuda.get_device_properties().multi_processor_count, so the existing
SM control logic works unchanged on AMD GPUs.

Set the VLLM_DBO_COMM_SMS default to 64 on ROCm (vs 20 on CUDA), since
MI300X has 304 CUs compared to H100's 132 SMs and benefits from a higher
communication allocation. Users can still override via the
VLLM_DBO_COMM_SMS env var.

Signed-off-by: raviguptaamd
---
 vllm/envs.py                         | 12 +++++++++---
 vllm/v1/worker/gpu_ubatch_wrapper.py |  4 ++--
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index 5c2a01482ffe..6ede94bfee39 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1545,9 +1545,15 @@ def _get_or_set_default() -> str:
     "VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL": lambda: bool(
         int(os.getenv("VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL", "0"))
     ),
-    # The number of SMs to allocate for communication kernels when running DBO
-    # the rest of the SMs on the device will be allocated to compute
-    "VLLM_DBO_COMM_SMS": lambda: int(os.getenv("VLLM_DBO_COMM_SMS", "20")),
+    # The number of SMs/CUs to allocate for communication kernels when
+    # running DBO; the rest will be allocated to compute.
+    # Default: 20 on CUDA (SMs), 64 on ROCm (CUs). 
+    "VLLM_DBO_COMM_SMS": lambda: int(os.getenv(
+        "VLLM_DBO_COMM_SMS",
+        "64" if hasattr(__import__("torch").version, "hip")
+        and __import__("torch").version.hip is not None
+        else "20"
+    )),
     # Enable max_autotune & coordinate_descent_tuning in inductor_config
     # to compile static shapes passed from compile_sizes in compilation_config
     # If set to 1, enable max_autotune; By default, this is enabled (1)

diff --git a/vllm/v1/worker/gpu_ubatch_wrapper.py b/vllm/v1/worker/gpu_ubatch_wrapper.py
index 52faa2e88005..ca60426a628d 100644
--- a/vllm/v1/worker/gpu_ubatch_wrapper.py
+++ b/vllm/v1/worker/gpu_ubatch_wrapper.py
@@ -70,8 +70,8 @@ def __init__(
             A function that sets the number of SMs for computation.
         """
 
-        assert current_platform.is_cuda(), (
-            "SM control is currently only supported on CUDA"
+        assert current_platform.is_cuda() or current_platform.is_rocm(), (
+            "SM/CU control is supported on CUDA and ROCm platforms"
         )
         device = torch.accelerator.current_device_index()
         total_sms = num_compute_units(device)

From 403bdda575ba928131fb44e64457f20958b0d06e Mon Sep 17 00:00:00 2001
From: raviguptaamd
Date: Tue, 28 Apr 2026 16:02:18 +0000
Subject: [PATCH 2/2] [ROCm] Fix pre-commit ruff-format on vllm/envs.py

Reformat the VLLM_DBO_COMM_SMS lambda to satisfy ruff-format (no logic
change). Addresses pre-commit CI failure on PR #34726.

Signed-off-by: raviguptaamd
Made-with: Cursor
---
 vllm/envs.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index 5bf91ee18a16..3055324a6f2a 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1623,12 +1623,15 @@ def _get_or_set_default() -> str:
     # The number of SMs/CUs to allocate for communication kernels when
     # running DBO; the rest will be allocated to compute.
     # Default: 20 on CUDA (SMs), 64 on ROCm (CUs). 
-    "VLLM_DBO_COMM_SMS": lambda: int(os.getenv(
-        "VLLM_DBO_COMM_SMS",
-        "64" if hasattr(__import__("torch").version, "hip")
-        and __import__("torch").version.hip is not None
-        else "20"
-    )),
+    "VLLM_DBO_COMM_SMS": lambda: int(
+        os.getenv(
+            "VLLM_DBO_COMM_SMS",
+            "64"
+            if hasattr(__import__("torch").version, "hip")
+            and __import__("torch").version.hip is not None
+            else "20",
+        )
+    ),
     # Enable max_autotune & coordinate_descent_tuning in inductor_config
     # to compile static shapes passed from compile_sizes in compilation_config
     # If set to 1, enable max_autotune; By default, this is enabled (1)