From 71f237378a01b48e7d78870fe3d59f0d2e46b66b Mon Sep 17 00:00:00 2001
From: raviguptaamd
Date: Tue, 17 Feb 2026 17:29:25 +0000
Subject: [PATCH 1/2] [ROCm] Enable DBO (Dual Batch Overlap) on ROCm

Relax the CUDA-only assertion in SMControlContextManager to also allow
ROCm platforms. The HIP runtime exposes CU count via
torch.cuda.get_device_properties().multi_processor_count, so the existing
SM control logic works unchanged on AMD GPUs.

Set the VLLM_DBO_COMM_SMS default to 64 on ROCm (vs 20 on CUDA), since
MI300X has 304 CUs compared to H100's 132 SMs and benefits from a higher
communication allocation. Users can still override via the
VLLM_DBO_COMM_SMS env var.

Signed-off-by: raviguptaamd
---
 vllm/envs.py                         | 12 +++++++++---
 vllm/v1/worker/gpu_ubatch_wrapper.py |  4 ++--
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index 5c2a01482ffe..6ede94bfee39 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1545,9 +1545,15 @@ def _get_or_set_default() -> str:
     "VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL": lambda: bool(
         int(os.getenv("VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL", "0"))
     ),
-    # The number of SMs to allocate for communication kernels when running DBO
-    # the rest of the SMs on the device will be allocated to compute
-    "VLLM_DBO_COMM_SMS": lambda: int(os.getenv("VLLM_DBO_COMM_SMS", "20")),
+    # The number of SMs/CUs to allocate for communication kernels when
+    # running DBO; the rest will be allocated to compute.
+    # Default: 20 on CUDA (SMs), 64 on ROCm (CUs). 
+    "VLLM_DBO_COMM_SMS": lambda: int(os.getenv(
+        "VLLM_DBO_COMM_SMS",
+        "64" if hasattr(__import__("torch").version, "hip")
+        and __import__("torch").version.hip is not None
+        else "20"
+    )),
     # Enable max_autotune & coordinate_descent_tuning in inductor_config
     # to compile static shapes passed from compile_sizes in compilation_config
     # If set to 1, enable max_autotune; By default, this is enabled (1)

diff --git a/vllm/v1/worker/gpu_ubatch_wrapper.py b/vllm/v1/worker/gpu_ubatch_wrapper.py
index 52faa2e88005..ca60426a628d 100644
--- a/vllm/v1/worker/gpu_ubatch_wrapper.py
+++ b/vllm/v1/worker/gpu_ubatch_wrapper.py
@@ -70,8 +70,8 @@ def __init__(
             A function that sets the number of SMs for computation.
         """
 
-        assert current_platform.is_cuda(), (
-            "SM control is currently only supported on CUDA"
+        assert current_platform.is_cuda() or current_platform.is_rocm(), (
+            "SM/CU control is supported on CUDA and ROCm platforms"
         )
         device = torch.accelerator.current_device_index()
         total_sms = num_compute_units(device)

From 403bdda575ba928131fb44e64457f20958b0d06e Mon Sep 17 00:00:00 2001
From: raviguptaamd
Date: Tue, 28 Apr 2026 16:02:18 +0000
Subject: [PATCH 2/2] [ROCm] Fix pre-commit ruff-format on vllm/envs.py

Reformat the VLLM_DBO_COMM_SMS lambda to satisfy ruff-format (no logic
change). Addresses pre-commit CI failure on PR #34726.

Signed-off-by: raviguptaamd
Made-with: Cursor
---
 vllm/envs.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index 5bf91ee18a16..3055324a6f2a 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1623,12 +1623,15 @@ def _get_or_set_default() -> str:
     # The number of SMs/CUs to allocate for communication kernels when
     # running DBO; the rest will be allocated to compute.
     # Default: 20 on CUDA (SMs), 64 on ROCm (CUs). 
-    "VLLM_DBO_COMM_SMS": lambda: int(os.getenv(
-        "VLLM_DBO_COMM_SMS",
-        "64" if hasattr(__import__("torch").version, "hip")
-        and __import__("torch").version.hip is not None
-        else "20"
-    )),
+    "VLLM_DBO_COMM_SMS": lambda: int(
+        os.getenv(
+            "VLLM_DBO_COMM_SMS",
+            "64"
+            if hasattr(__import__("torch").version, "hip")
+            and __import__("torch").version.hip is not None
+            else "20",
+        )
+    ),
     # Enable max_autotune & coordinate_descent_tuning in inductor_config
     # to compile static shapes passed from compile_sizes in compilation_config
     # If set to 1, enable max_autotune; By default, this is enabled (1)