Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 85 additions & 25 deletions .buildkite/test-amd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3210,31 +3210,91 @@ steps:
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

# - label: Quantized MoE Test (B200-MI355) # TBD
# timeout_in_minutes: 180
# mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
# agent_pool: mi355_1
# working_dir: "/vllm-workspace/"
# source_file_dependencies:
# - tests/quantization/test_gfx950_moe.py
# - vllm/model_executor/models/deepseek_v2.py
# - vllm/model_executor/models/gpt_oss.py
# - vllm/model_executor/models/llama4.py
# - vllm/model_executor/layers/fused_moe
# - vllm/model_executor/layers/quantization/compressed_tensors
# - vllm/model_executor/layers/quantization/modelopt.py
# - vllm/model_executor/layers/quantization/mxfp4.py
# - vllm/v1/attention/backends/triton_attn.py
# - vllm/v1/attention/backends/rocm_attn.py
# - vllm/v1/attention/backends/rocm_aiter_fa.py
# - vllm/v1/attention/backends/mla/
# - vllm/v1/attention/selector.py
# - vllm/model_executor/layers/layernorm.py
# - vllm/_aiter_ops.py
# - vllm/platforms/rocm.py
# - vllm/model_executor/model_loader/
# commands:
# - pytest -s -v tests/quantization/test_gfx950_moe.py
- label: Quantized MoE Test (2xB200-1xMI355) # TBD
timeout_in_minutes: 180
mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
agent_pool: mi355_1
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/model_executor/layers/quantization/quark/
- vllm/model_executor/models/deepseek_v2.py
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/models/llama4.py
- vllm/model_executor/layers/fused_moe
- vllm/model_executor/layers/quantization/compressed_tensors
- vllm/model_executor/layers/quantization/modelopt.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/triton_attn.py
- vllm/v1/attention/backends/rocm_attn.py
- vllm/v1/attention/backends/rocm_aiter_fa.py
- vllm/v1/attention/backends/mla/
- vllm/v1/attention/selector.py
- vllm/model_executor/layers/layernorm.py
- vllm/_aiter_ops.py
- vllm/platforms/rocm.py
- vllm/model_executor/model_loader/
- tests/quantization/test_gfx950_moe.py
- tests/quantization/test_quark.py
commands:
- pytest -s -v quantization/test_gfx950_moe.py

- label: Quantization Quark Eval (2xMI355) # TBD
timeout_in_minutes: 180
mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
agent_pool: mi355_2
num_gpus: 2
optional: true
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization/
- vllm/model_executor/layers/quantization/quark/
- vllm/model_executor/layers/fused_moe/
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/models/
- vllm/model_executor/models/qwen2_moe.py
- vllm/model_executor/model_loader/
- vllm/v1/attention/backends/
- vllm/v1/attention/backends/mla/
- vllm/v1/attention/selector.py
- tests/quantization
- tests/quantization/test_quark.py
- vllm/_aiter_ops.py
- vllm/platforms/rocm.py
commands:
- uv pip install --system torchao==0.17.0
- uv pip install --system conch-triton-kernels
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/test_quark.py -k "ocp_mx_wikitext_correctness and tp2"

- label: Quantization Quark Eval (8xMI355) # TBD
timeout_in_minutes: 180
mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
agent_pool: mi355_8
num_gpus: 8
optional: true
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization/
- vllm/model_executor/layers/quantization/quark/
- vllm/model_executor/layers/fused_moe/
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/models/
- vllm/model_executor/models/deepseek_v2.py
- vllm/model_executor/model_loader/
- vllm/v1/attention/backends/
- vllm/v1/attention/backends/mla/
- vllm/v1/attention/selector.py
- tests/quantization
- tests/quantization/test_quark.py
- vllm/_aiter_ops.py
- vllm/platforms/rocm.py
commands:
- uv pip install --system torchao==0.17.0
- uv pip install --system conch-triton-kernels
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/test_quark.py::test_mxfp4_gsm8k_correctness

#------------------------------------------------------------ mi355 · v1 -------------------------------------------------------------#

Expand Down
232 changes: 230 additions & 2 deletions tests/quantization/test_gfx950_moe.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,234 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""ROCm gfx950 quantized-MoE initialization coverage.

This file mirrors the intent of ``test_blackwell_moe.py`` using ROCm-native
features instead of CUDA-only backends:

- public Neural Magic compressed-tensors MoE models
- public Quark INT8 MoE smoke coverage
- ROCm Quark MXFP4/BF16 MoE with explicit ``aiter`` and ``triton`` backends
- ROCm GPT-OSS MXFP4/FP8 MoE in the same shape the repo already advertises
- ROCm DeepSeek Quark MXFP4/UINT8 MoE with explicit backend coverage
"""

import importlib.metadata
import importlib.util
from typing import Any

import huggingface_hub
import pytest
from packaging import version

from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
from vllm.platforms.rocm import on_mi3xx

# Module-wide guard: every test in this file requires a ROCm build running on
# an MI300-series / MI350-series accelerator; elsewhere the whole module skips.
pytestmark = pytest.mark.skipif(
    not current_platform.is_rocm() or not on_mi3xx(),
    reason="MI300/MI350 ROCm only",
)


def _has_quark_mxfp4_support() -> bool:
if importlib.util.find_spec("quark") is None:
return False
try:
return version.parse(importlib.metadata.version("amd-quark")) >= version.parse(
"0.9.0"
)
except importlib.metadata.PackageNotFoundError:
return False


# Feature probes evaluated once at import time.
QUARK_MXFP4_AVAILABLE = _has_quark_mxfp4_support()
QUARK_AVAILABLE = importlib.util.find_spec("quark") is not None

# HF config overrides that shrink models to 4 layers so that server
# initialization (with dummy weights) stays fast; both key spellings are
# provided because model families disagree on the attribute name.
HF_OVERRIDE_TEXT = {
    "num_layers": 4,
    "num_hidden_layers": 4,
}
# The two ROCm attention backends that are meaningful on MI3xx hardware.
ROCM_ATTENTION_BACKENDS = [
    pytest.param("ROCM_ATTN", id="rocm_attn"),
    pytest.param("ROCM_AITER_UNIFIED_ATTN", id="rocm_aiter_unified_attn"),
]


def _has_huggingface_access(repo_id: str) -> bool:
    """Return ``True`` when *repo_id* on the Hugging Face Hub is readable
    with the current credentials/network, ``False`` otherwise."""
    try:
        huggingface_hub.list_repo_refs(repo_id)
    except (
        huggingface_hub.errors.RepositoryNotFoundError,
        huggingface_hub.errors.HfHubHTTPError,
    ):
        return False
    return True


def _require_repo_access(repo_id: str) -> None:
    """Skip the calling test when *repo_id* cannot be read from this host."""
    if _has_huggingface_access(repo_id):
        return
    pytest.skip(f"Read access to huggingface.co/{repo_id} is required.")


def _can_initialize(
    model: str,
    *,
    hf_overrides: dict[str, Any] | None = None,
    extra_args: list[str] | None = None,
    env: dict[str, str] | None = None,
) -> None:
    """Start a vLLM OpenAI-compatible server for *model* and assert that a
    trivial completion succeeds.

    The server runs with the ``dummy`` load format (weights are materialized
    without disk I/O), a tiny context window, batch size 1, and eager
    execution, so the check exercises model/quantization initialization
    rather than inference quality.

    Args:
        model: Hugging Face repo id (or local path) to serve.
        hf_overrides: Optional HF config overrides (e.g. shrink layer count).
        extra_args: Extra CLI arguments appended to the server invocation.
        env: Extra environment variables for the server process.

    Raises:
        AssertionError: If the completion comes back without text.
    """
    server_args = [
        "--max-model-len",
        "2048",
        "--max-num-batched-tokens",
        "256",
        "--max-num-seqs",
        "1",
        "--load-format",
        "dummy",
        "--trust-remote-code",
        "--enforce-eager",
        "--disable-uvicorn-access-log",
        *(extra_args or []),
    ]

    # NOTE(review): 1500s mirrors the Blackwell variant of this helper; dummy
    # loading should normally finish in minutes — consider tightening so a
    # hung server fails CI faster.
    with RemoteOpenAIServer(
        model,
        server_args,
        env_dict=env,
        max_wait_seconds=1500,
        override_hf_configs=hf_overrides,
    ) as server:
        client = server.get_client()
        completion = client.completions.create(
            model=model,
            prompt=["Hello, World!"],
            temperature=0,
            max_tokens=2,
        )
        print(completion)
        assert completion.choices[0].text is not None


@pytest.mark.parametrize("attention_backend", ROCM_ATTENTION_BACKENDS)
def test_nm_qwen15_w4a16_moe_initializes_across_rocm_attention_backends(
    attention_backend: str,
):
    """A public Neural Magic W4A16 MoE model should initialize under each
    ROCm attention backend that is meaningful on MI3xx."""
    repo_id = "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
    _require_repo_access(repo_id)
    backend_args = ["--attention-backend", attention_backend]
    _can_initialize(
        repo_id,
        hf_overrides=HF_OVERRIDE_TEXT,
        extra_args=backend_args,
    )


def test_nm_mixtral_w4a16_moe_initializes():
    """A second public Neural Magic MoE family should initialize on ROCm."""
    repo_id = "nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized"
    _require_repo_access(repo_id)
    _can_initialize(repo_id, hf_overrides=HF_OVERRIDE_TEXT)


@pytest.mark.skipif(
    not QUARK_AVAILABLE,
    reason="quark package is required for ROCm Quark MoE tests",
)
def test_tiny_quark_int8_moe_initializes():
    """A small public Quark INT8 MoE model should initialize on MI3xx."""
    model_id = "nameistoken/tiny-qwen3-moe-w8a8-int8-quark"
    _can_initialize(model_id, hf_overrides=HF_OVERRIDE_TEXT)


@pytest.mark.skipif(
    not QUARK_MXFP4_AVAILABLE,
    reason="amd-quark>=0.9.0 is required for ROCm MXFP4 MoE tests",
)
@pytest.mark.parametrize("moe_backend", ["aiter", "triton"])
def test_gptoss_rocm_quark_mxfp4_bf16_moe_backends_initialize(
    moe_backend: str,
):
    """The ROCm GPT-OSS MXFP4/BF16 Quark MoE path should initialize with the
    two real ROCm MoE backends exposed at the CLI."""
    repo_id = "amd/gpt-oss-20b-w-mxfp4-a-bf16"
    _require_repo_access(repo_id)
    # AITER kernels are only exercised when the env flag is set.
    aiter_env = {"VLLM_ROCM_USE_AITER": "1"} if moe_backend == "aiter" else None
    server_args = [
        "--attention-backend",
        "ROCM_AITER_UNIFIED_ATTN",
        "--moe-backend",
        moe_backend,
        "--tokenizer",
        "openai/gpt-oss-20b",
        "--tensor-parallel-size",
        "1",
    ]
    _can_initialize(
        repo_id,
        hf_overrides=HF_OVERRIDE_TEXT,
        extra_args=server_args,
        env=aiter_env,
    )


@pytest.mark.skipif(
    not current_platform.supports_fp8(),
    reason="FP8 not supported on this hardware",
)
@pytest.mark.skipif(
    not QUARK_MXFP4_AVAILABLE,
    reason="amd-quark>=0.9.0 is required for ROCm MXFP4 MoE tests",
)
def test_gptoss_rocm_quark_mxfp4_fp8_moe_initializes():
    """The ROCm GPT-OSS MXFP4/FP8 Quark MoE path should initialize in the
    same form the repo already advertises for ROCm evals."""
    repo_id = "amd/gpt-oss-20b-MoE-Quant-W-MXFP4-A-FP8-KV-FP8"
    _require_repo_access(repo_id)
    server_args = [
        "--attention-backend",
        "ROCM_AITER_UNIFIED_ATTN",
        "--tokenizer",
        "openai/gpt-oss-20b",
        "--tensor-parallel-size",
        "1",
    ]
    _can_initialize(
        repo_id,
        hf_overrides=HF_OVERRIDE_TEXT,
        extra_args=server_args,
        env={"VLLM_ROCM_USE_AITER": "1"},
    )


@pytest.mark.skipif(
    not QUARK_MXFP4_AVAILABLE,
    reason="amd-quark>=0.9.0 is required for ROCm MXFP4 MoE tests",
)
@pytest.mark.parametrize(
    "moe_backend",
    [
        pytest.param(None, id="auto"),
        pytest.param("aiter", id="aiter"),
        pytest.param("triton", id="triton"),
    ],
)
def test_deepseek_rocm_quark_mxfp4_uint8_moe_backends_initialize(
    moe_backend: str | None,
):
    """The ROCm DeepSeek MXFP4/UINT8 Quark MoE path should initialize across
    the real ROCm backend choices for the MXFP4 MoE oracle."""
    repo_id = "amd/DeepSeek-R1-WMXFP4-AMXFP4-Scale-UINT8-MoE-Quant"
    _require_repo_access(repo_id)
    # None means "let vLLM pick" — no explicit --moe-backend flag is passed.
    server_args = ["--tensor-parallel-size", "1"]
    if moe_backend is not None:
        server_args += ["--moe-backend", moe_backend]
    aiter_env = {"VLLM_ROCM_USE_AITER": "1"} if moe_backend == "aiter" else None
    _can_initialize(
        repo_id,
        hf_overrides=HF_OVERRIDE_TEXT,
        extra_args=server_args,
        env=aiter_env,
    )
Loading
Loading