Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .buildkite/test-amd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ steps:
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- export VLLM_TEST_CLEAN_GPU_MEMORY="1"
- export VLLM_ROCM_USE_AITER=0
- pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
- pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py

Expand Down
4 changes: 2 additions & 2 deletions vllm_omni/engine/async_omni_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,9 +350,9 @@ def _launch_llm_stage(
)
finally:
if previous_visible_devices is None:
os.environ.pop(device_control_env, None)
current_omni_platform.unset_device_control_env_var()
else:
os.environ[device_control_env] = previous_visible_devices
current_omni_platform.set_device_control_env_var(previous_visible_devices)
Comment on lines 352 to +355

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Restore ROCm visibility from both env vars after stage launch

On ROCm this block only snapshots os.environ[device_control_env_var] before launch, but unset_device_control_env_var() now clears both HIP and CUDA. In environments that enter with only HIP_VISIBLE_DEVICES set (our AMD wrapper does this), previous_visible_devices is None, so the first stage launch drops the inherited HIP mask entirely. Any later stage/worker then inherits full-node visibility, and the multi-stage ROCm configs under tests/e2e/stage_configs/rocm/ get remapped against physical GPUs instead of the shard-local subset.

Useful? React with 👍 / 👎.


logger.info("[AsyncOmniEngine] Stage %s engine launch started", metadata.stage_id)
launch_cm.__exit__(None, None, None)
Expand Down
6 changes: 3 additions & 3 deletions vllm_omni/entrypoints/stage_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def set_stage_devices(
else:
mapped_devices.append(str(idx))
mapped_devices_str = ",".join(mapped_devices)
os.environ[env_var] = mapped_devices_str
current_omni_platform.set_device_control_env_var(mapped_devices_str)
if toks:
try:
selected_physical = int(mapped_devices[0])
Expand All @@ -99,7 +99,7 @@ def set_stage_devices(
selected_physical = None
if selected_physical is None:
selected_physical = int(logical_idx)
os.environ[env_var] = str(selected_physical)
current_omni_platform.set_device_control_env_var(str(selected_physical))

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Map ROCm logical ids from HIP visibility before mirroring both vars

This assignment happens only after selected_physical has been derived from os.environ.get(env_var) above, and on ROCm device_control_env_var is still CUDA_VISIBLE_DEVICES. If the parent process only exports HIP_VISIBLE_DEVICES (for example .buildkite/scripts/hardware_ci/run-amd-test.sh), a stage configured as devices: "0" or "1" never sees the shard-local mapping and this line rewrites both env vars to the raw logical id. A container pinned to GPU 1 therefore re-exports HIP_VISIBLE_DEVICES=0, so its workers launch on the wrong physical GPU instead of the assigned shard.

Useful? React with 👍 / 👎.

logger.debug(
"[Stage-%s] Logical index %d -> physical %s; set %s to single device",
stage_id,
Expand All @@ -111,7 +111,7 @@ def set_stage_devices(
logger.debug("[Stage-%s] Using default device visibility (devices=%s)", stage_id, devices)
else:
selected_physical = int(str(devices))
os.environ[env_var] = str(selected_physical)
current_omni_platform.set_device_control_env_var(str(selected_physical))
logger.debug("[Stage-%s] Set %s to single device %s (fallback)", stage_id, env_var, selected_physical)
except Exception as e:
logger.warning("Failed to interpret devices for stage %s: %s", stage_id, e)
Expand Down
12 changes: 12 additions & 0 deletions vllm_omni/platforms/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,18 @@ def get_free_memory(cls, device: torch.device | None = None) -> int:
def supports_cpu_offload(cls) -> bool:
return True

@classmethod
def set_device_control_env_var(cls, devices: str | int | None) -> None:
import os

os.environ[cls.device_control_env_var] = devices

@classmethod
def unset_device_control_env_var(cls) -> None:
    """Remove this platform's device-control variable from the environment.

    Does nothing when the variable named by ``cls.device_control_env_var``
    is not currently set.
    """
    from os import environ

    env_name = cls.device_control_env_var
    if env_name in environ:
        del environ[env_name]


class UnspecifiedOmniPlatform(OmniPlatform):
_omni_enum = OmniPlatformEnum.UNSPECIFIED
Expand Down
14 changes: 14 additions & 0 deletions vllm_omni/platforms/rocm/platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,17 @@ def synchronize(cls) -> None:
def get_free_memory(cls, device: torch.device | None = None) -> int:
free, _ = torch.cuda.mem_get_info(device)
return free

@classmethod
def set_device_control_env_var(cls, devices: str | int | None) -> None:
import os

os.environ["HIP_VISIBLE_DEVICES"] = devices

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does ROCm also need CUDA_VISIBLE_DEVICES?

@tjtanaa tjtanaa Mar 18, 2026

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gcanlin Yes. Under certain conditions, some libraries (such as the Ray backend in vLLM) look for CUDA_VISIBLE_DEVICES instead of HIP_VISIBLE_DEVICES. We will try to find a better way to fix this in vLLM core.

The latest vLLM platform code sets both CUDA_VISIBLE_DEVICES and HIP_VISIBLE_DEVICES, keeping them in sync, during platform module import.

os.environ["CUDA_VISIBLE_DEVICES"] = devices

@classmethod
def unset_device_control_env_var(cls) -> None:
    """Clear both ROCm device-visibility variables from the environment.

    Removes ``HIP_VISIBLE_DEVICES`` and ``CUDA_VISIBLE_DEVICES``. A variable
    that is already absent is ignored.
    """
    from os import environ

    for env_name in ("HIP_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES"):
        environ.pop(env_name, None)
Loading