Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 85 additions & 25 deletions .buildkite/test-amd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3210,31 +3210,91 @@ steps:
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

# - label: Quantized MoE Test (B200-MI355) # TBD
# timeout_in_minutes: 180
# mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
# agent_pool: mi355_1
# working_dir: "/vllm-workspace/"
# source_file_dependencies:
# - tests/quantization/test_gfx950_moe.py
# - vllm/model_executor/models/deepseek_v2.py
# - vllm/model_executor/models/gpt_oss.py
# - vllm/model_executor/models/llama4.py
# - vllm/model_executor/layers/fused_moe
# - vllm/model_executor/layers/quantization/compressed_tensors
# - vllm/model_executor/layers/quantization/modelopt.py
# - vllm/model_executor/layers/quantization/mxfp4.py
# - vllm/v1/attention/backends/triton_attn.py
# - vllm/v1/attention/backends/rocm_attn.py
# - vllm/v1/attention/backends/rocm_aiter_fa.py
# - vllm/v1/attention/backends/mla/
# - vllm/v1/attention/selector.py
# - vllm/model_executor/layers/layernorm.py
# - vllm/_aiter_ops.py
# - vllm/platforms/rocm.py
# - vllm/model_executor/model_loader/
# commands:
# - pytest -s -v tests/quantization/test_gfx950_moe.py
- label: Quantized MoE Test (2xB200-1xMI355) # TBD
timeout_in_minutes: 180
mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
agent_pool: mi355_1
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/model_executor/layers/quantization/quark/
- vllm/model_executor/models/deepseek_v2.py
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/models/llama4.py
- vllm/model_executor/layers/fused_moe
- vllm/model_executor/layers/quantization/compressed_tensors
- vllm/model_executor/layers/quantization/modelopt.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/triton_attn.py
- vllm/v1/attention/backends/rocm_attn.py
- vllm/v1/attention/backends/rocm_aiter_fa.py
- vllm/v1/attention/backends/mla/
- vllm/v1/attention/selector.py
- vllm/model_executor/layers/layernorm.py
- vllm/_aiter_ops.py
- vllm/platforms/rocm.py
- vllm/model_executor/model_loader/
- tests/quantization/test_gfx950_moe.py
- tests/quantization/test_quark.py
commands:
- pytest -s -v quantization/test_gfx950_moe.py

- label: Quantization Quark Eval (2xMI355) # TBD
timeout_in_minutes: 180
mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
agent_pool: mi355_2
num_gpus: 2
optional: true
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization/
- vllm/model_executor/layers/quantization/quark/
- vllm/model_executor/layers/fused_moe/
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/models/
- vllm/model_executor/models/qwen2_moe.py
- vllm/model_executor/model_loader/
- vllm/v1/attention/backends/
- vllm/v1/attention/backends/mla/
- vllm/v1/attention/selector.py
- tests/quantization
- tests/quantization/test_quark.py
- vllm/_aiter_ops.py
- vllm/platforms/rocm.py
commands:
- uv pip install --system torchao==0.17.0
- uv pip install --system conch-triton-kernels
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/test_quark.py -k "ocp_mx_wikitext_correctness and tp2"

- label: Quantization Quark Eval (8xMI355) # TBD
timeout_in_minutes: 180
mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
agent_pool: mi355_8
num_gpus: 8
optional: true
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization/
- vllm/model_executor/layers/quantization/quark/
- vllm/model_executor/layers/fused_moe/
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/models/
- vllm/model_executor/models/deepseek_v2.py
- vllm/model_executor/model_loader/
- vllm/v1/attention/backends/
- vllm/v1/attention/backends/mla/
- vllm/v1/attention/selector.py
- tests/quantization
- tests/quantization/test_quark.py
- vllm/_aiter_ops.py
- vllm/platforms/rocm.py
commands:
- uv pip install --system torchao==0.17.0
- uv pip install --system conch-triton-kernels
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/test_quark.py::test_mxfp4_gsm8k_correctness

#------------------------------------------------------------ mi355 · v1 -------------------------------------------------------------#

Expand Down
232 changes: 230 additions & 2 deletions tests/quantization/test_gfx950_moe.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,234 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""ROCm gfx950 quantized-MoE initialization coverage.

This file mirrors the intent of ``test_blackwell_moe.py`` using ROCm-native
features instead of CUDA-only backends:

- public Neural Magic compressed-tensors MoE models
- public Quark INT8 MoE smoke coverage
- ROCm Quark MXFP4/BF16 MoE with explicit ``aiter`` and ``triton`` backends
- ROCm GPT-OSS MXFP4/FP8 MoE in the same shape the repo already advertises
- ROCm DeepSeek Quark MXFP4/UINT8 MoE with explicit backend coverage
"""

import importlib.metadata
import importlib.util
from typing import Any

import huggingface_hub
import pytest
from packaging import version

from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
from vllm.platforms.rocm import on_mi3xx

# Module-wide guard: every test in this file requires a ROCm build running on
# an MI300-series / MI350-series accelerator; elsewhere the whole module skips.
pytestmark = pytest.mark.skipif(
    not current_platform.is_rocm() or not on_mi3xx(),
    reason="MI300/MI350 ROCm only",
)


def _has_quark_mxfp4_support() -> bool:
if importlib.util.find_spec("quark") is None:
return False
try:
return version.parse(importlib.metadata.version("amd-quark")) >= version.parse(
"0.9.0"
)
except importlib.metadata.PackageNotFoundError:
return False


# Feature probes evaluated once at import time.
QUARK_MXFP4_AVAILABLE = _has_quark_mxfp4_support()
QUARK_AVAILABLE = importlib.util.find_spec("quark") is not None

# HF config overrides that shrink models to 4 layers so that server
# initialization (with dummy weights) stays fast; both key spellings are
# provided because model families disagree on the attribute name.
HF_OVERRIDE_TEXT = {
    "num_layers": 4,
    "num_hidden_layers": 4,
}
# The two ROCm attention backends that are meaningful on MI3xx hardware.
ROCM_ATTENTION_BACKENDS = [
    pytest.param("ROCM_ATTN", id="rocm_attn"),
    pytest.param("ROCM_AITER_UNIFIED_ATTN", id="rocm_aiter_unified_attn"),
]


def _has_huggingface_access(repo_id: str) -> bool:
    """Return ``True`` when *repo_id* on the Hugging Face Hub is readable
    with the current credentials/network, ``False`` otherwise."""
    try:
        huggingface_hub.list_repo_refs(repo_id)
    except (
        huggingface_hub.errors.RepositoryNotFoundError,
        huggingface_hub.errors.HfHubHTTPError,
    ):
        return False
    return True


def _require_repo_access(repo_id: str) -> None:
    """Skip the calling test when *repo_id* cannot be read from this host."""
    if _has_huggingface_access(repo_id):
        return
    pytest.skip(f"Read access to huggingface.co/{repo_id} is required.")


def _can_initialize(
    model: str,
    *,
    hf_overrides: dict[str, Any] | None = None,
    extra_args: list[str] | None = None,
    env: dict[str, str] | None = None,
) -> None:
    """Start a vLLM OpenAI-compatible server for *model* and assert that a
    trivial completion succeeds.

    The server runs with the ``dummy`` load format (weights are materialized
    without disk I/O), a tiny context window, batch size 1, and eager
    execution, so the check exercises model/quantization initialization
    rather than inference quality.

    Args:
        model: Hugging Face repo id (or local path) to serve.
        hf_overrides: Optional HF config overrides (e.g. shrink layer count).
        extra_args: Extra CLI arguments appended to the server invocation.
        env: Extra environment variables for the server process.

    Raises:
        AssertionError: If the completion comes back without text.
    """
    server_args = [
        "--max-model-len",
        "2048",
        "--max-num-batched-tokens",
        "256",
        "--max-num-seqs",
        "1",
        "--load-format",
        "dummy",
        "--trust-remote-code",
        "--enforce-eager",
        "--disable-uvicorn-access-log",
        *(extra_args or []),
    ]

    # NOTE(review): 1500s mirrors the Blackwell variant of this helper; dummy
    # loading should normally finish in minutes — consider tightening so a
    # hung server fails CI faster.
    with RemoteOpenAIServer(
        model,
        server_args,
        env_dict=env,
        max_wait_seconds=1500,
        override_hf_configs=hf_overrides,
    ) as server:
        client = server.get_client()
        completion = client.completions.create(
            model=model,
            prompt=["Hello, World!"],
            temperature=0,
            max_tokens=2,
        )
        print(completion)
        assert completion.choices[0].text is not None


@pytest.mark.parametrize("attention_backend", ROCM_ATTENTION_BACKENDS)
def test_nm_qwen15_w4a16_moe_initializes_across_rocm_attention_backends(
    attention_backend: str,
):
    """A public Neural Magic W4A16 MoE model should initialize under each
    ROCm attention backend that is meaningful on MI3xx."""
    repo_id = "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
    _require_repo_access(repo_id)
    backend_args = ["--attention-backend", attention_backend]
    _can_initialize(
        repo_id,
        hf_overrides=HF_OVERRIDE_TEXT,
        extra_args=backend_args,
    )


def test_nm_mixtral_w4a16_moe_initializes():
    """A second public Neural Magic MoE family should initialize on ROCm."""
    repo_id = "nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized"
    _require_repo_access(repo_id)
    _can_initialize(repo_id, hf_overrides=HF_OVERRIDE_TEXT)


@pytest.mark.skipif(
    not QUARK_AVAILABLE,
    reason="quark package is required for ROCm Quark MoE tests",
)
def test_tiny_quark_int8_moe_initializes():
    """A small public Quark INT8 MoE model should initialize on MI3xx."""
    model_id = "nameistoken/tiny-qwen3-moe-w8a8-int8-quark"
    _can_initialize(model_id, hf_overrides=HF_OVERRIDE_TEXT)


@pytest.mark.skipif(
    not QUARK_MXFP4_AVAILABLE,
    reason="amd-quark>=0.9.0 is required for ROCm MXFP4 MoE tests",
)
@pytest.mark.parametrize("moe_backend", ["aiter", "triton"])
def test_gptoss_rocm_quark_mxfp4_bf16_moe_backends_initialize(
    moe_backend: str,
):
    """The ROCm GPT-OSS MXFP4/BF16 Quark MoE path should initialize with the
    two real ROCm MoE backends exposed at the CLI."""
    repo_id = "amd/gpt-oss-20b-w-mxfp4-a-bf16"
    _require_repo_access(repo_id)
    # AITER kernels are only exercised when the env flag is set.
    aiter_env = {"VLLM_ROCM_USE_AITER": "1"} if moe_backend == "aiter" else None
    server_args = [
        "--attention-backend",
        "ROCM_AITER_UNIFIED_ATTN",
        "--moe-backend",
        moe_backend,
        "--tokenizer",
        "openai/gpt-oss-20b",
        "--tensor-parallel-size",
        "1",
    ]
    _can_initialize(
        repo_id,
        hf_overrides=HF_OVERRIDE_TEXT,
        extra_args=server_args,
        env=aiter_env,
    )


@pytest.mark.skipif(
    not current_platform.supports_fp8(),
    reason="FP8 not supported on this hardware",
)
@pytest.mark.skipif(
    not QUARK_MXFP4_AVAILABLE,
    reason="amd-quark>=0.9.0 is required for ROCm MXFP4 MoE tests",
)
def test_gptoss_rocm_quark_mxfp4_fp8_moe_initializes():
    """The ROCm GPT-OSS MXFP4/FP8 Quark MoE path should initialize in the
    same form the repo already advertises for ROCm evals."""
    repo_id = "amd/gpt-oss-20b-MoE-Quant-W-MXFP4-A-FP8-KV-FP8"
    _require_repo_access(repo_id)
    server_args = [
        "--attention-backend",
        "ROCM_AITER_UNIFIED_ATTN",
        "--tokenizer",
        "openai/gpt-oss-20b",
        "--tensor-parallel-size",
        "1",
    ]
    _can_initialize(
        repo_id,
        hf_overrides=HF_OVERRIDE_TEXT,
        extra_args=server_args,
        env={"VLLM_ROCM_USE_AITER": "1"},
    )


@pytest.mark.skipif(
    not QUARK_MXFP4_AVAILABLE,
    reason="amd-quark>=0.9.0 is required for ROCm MXFP4 MoE tests",
)
@pytest.mark.parametrize(
    "moe_backend",
    [
        pytest.param(None, id="auto"),
        pytest.param("aiter", id="aiter"),
        pytest.param("triton", id="triton"),
    ],
)
def test_deepseek_rocm_quark_mxfp4_uint8_moe_backends_initialize(
    moe_backend: str | None,
):
    """The ROCm DeepSeek MXFP4/UINT8 Quark MoE path should initialize across
    the real ROCm backend choices for the MXFP4 MoE oracle."""
    repo_id = "amd/DeepSeek-R1-WMXFP4-AMXFP4-Scale-UINT8-MoE-Quant"
    _require_repo_access(repo_id)
    # None means "let vLLM pick" — no explicit --moe-backend flag is passed.
    server_args = ["--tensor-parallel-size", "1"]
    if moe_backend is not None:
        server_args += ["--moe-backend", moe_backend]
    aiter_env = {"VLLM_ROCM_USE_AITER": "1"} if moe_backend == "aiter" else None
    _can_initialize(
        repo_id,
        hf_overrides=HF_OVERRIDE_TEXT,
        extra_args=server_args,
        env=aiter_env,
    )
Loading
Loading