Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
105 commits
Select commit Hold shift + click to select a range
372cb3e
add quark format moe in WeightsMapper
xuebwang-amd Nov 19, 2025
36657bc
add _load_weights_mxfp4 and refactor get_expert_mapping
xuebwang-amd Nov 20, 2025
900c3ad
add create bias weights
xuebwang-amd Nov 20, 2025
24dd58e
update create weights for moe bias
xuebwang-amd Nov 26, 2025
34517af
adjust w13 & w2 weight shape to fix aiter shuffle weights issue
xuebwang-amd Nov 26, 2025
3b35794
maybe_roundup_hidden_size for amd quark senarios
xuebwang-amd Dec 9, 2025
f7db770
support both fp8 and mxfp4 quark model loading (would probobaly refac…
xuebwang-amd Dec 11, 2025
939aea6
clean and add TODO
xuebwang-amd Dec 11, 2025
4a1c93a
add moe bias into quant config
xuebwang-amd Dec 12, 2025
4adf440
This is a main commit containing ensential changes
xuebwang-amd Dec 18, 2025
4ae0583
Resolved merge conflicts and associated typings
xuebwang-amd Dec 19, 2025
145c462
remove some comments and fix some pre-commit issues
xuebwang-amd Dec 19, 2025
c4a05ed
fix a derived problem from issue #30621
xuebwang-amd Dec 19, 2025
abd7003
update leveraging latest apply
xuebwang-amd Dec 19, 2025
09ff1b9
remove emulation condition to fix tp4/8 model loading
xuebwang-amd Dec 19, 2025
bec9310
unify original mxfp4 model loading and quark model (both mxfp4 and fp…
xuebwang-amd Dec 22, 2025
52f5662
unify OCPMX_W4A16, OCPMX_W4AFP8 into QuarkOCP_MX_MoEMethod
xuebwang-amd Dec 22, 2025
8cceb81
fix some pre-commit issues (to be continue)
xuebwang-amd Dec 22, 2025
be16ad6
fix all pre-commit errors at once
xuebwang-amd Dec 23, 2025
fc029e0
quark w8a8 fp8 for gpt_oss
xuebwang-amd Dec 23, 2025
d28c80d
rename test_gpt_oss_attn_quantization.py to include all tests for gpt…
xuebwang-amd Dec 23, 2025
accd286
udpate test script
xuebwang-amd Dec 25, 2025
d686235
fix tiny pre-commit issue in test_gpt_oss.py
xuebwang-amd Dec 25, 2025
8bec3f1
fix fp8 weight loading
xuebwang-amd Jan 4, 2026
8e0b767
a tiny refactor to make input_scale loading compact
xuebwang-amd Jan 4, 2026
8e3bc67
fix a (last) fp8 loading issue
xuebwang-amd Jan 5, 2026
6a3dfcc
Merge remote-tracking branch 'origin/main' into xuebin_add_quark_form…
xuebwang-amd Jan 5, 2026
85a98ec
fix docs/readthedocs.org error
xuebwang-amd Jan 6, 2026
16fdecb
fix ci issues
xuebwang-amd Jan 6, 2026
eb2b024
Merge remote-tracking branch 'origin/main' into xuebin_add_quark_form…
xuebwang-amd Jan 6, 2026
aac4075
undo one fixing associated with code lint
xuebwang-amd Jan 6, 2026
2ae8be3
fix a mxfp4 loading issue
xuebwang-amd Jan 6, 2026
4b5714f
add quant_config variables to avoid NotImplementedError
xuebwang-amd Jan 7, 2026
506cb6c
fix using prepared moe quant config
xuebwang-amd Jan 7, 2026
382c40a
Resolved merge conflict
xuebwang-amd Jan 7, 2026
09c8bc5
tiny update after rebase
xuebwang-amd Jan 7, 2026
6e05a00
Merge remote-tracking branch 'origin/main' into xuebin_add_quark_form…
xuebwang-amd Jan 7, 2026
e23834d
update tests/models/quantization/test_gpt_oss.py
xuebwang-amd Jan 9, 2026
4ae66d1
rename mxfp4_fp8_moe_quant_config to mxfp4_w4a8_moe_quant_config
xuebwang-amd Jan 12, 2026
10c2323
fix a typo
xuebwang-amd Jan 12, 2026
887c716
update using correct/reasonable fixings from cursor
xuebwang-amd Jan 12, 2026
1bece5d
add weight-only mxfp6 by cursor
xuebwang-amd Jan 12, 2026
e1e52ea
update using fixings from cursor
xuebwang-amd Jan 12, 2026
04bec4c
update fixing/changes from cursor
xuebwang-amd Jan 12, 2026
1075a62
Resolved merge conflicts
xuebwang-amd Jan 13, 2026
41c90cc
tiny update from cursor side
xuebwang-amd Jan 13, 2026
b6dadb0
update using fixings from cursor
xuebwang-amd Jan 13, 2026
bde3fd8
refactor: moving ocp_mx based emulation activation qdq out of fused_m…
xuebwang-amd Jan 13, 2026
6e439e7
remove tensor to PrecisionConfig cast
xuebwang-amd Jan 13, 2026
986f331
update according to cursor findings
xuebwang-amd Jan 13, 2026
ab5ed9c
tiny update from cursor
xuebwang-amd Jan 13, 2026
480abd7
Resolved merge conflicts
xuebwang-amd Jan 16, 2026
16c6803
one lint fixing
xuebwang-amd Jan 16, 2026
e0a0395
Resolved merge conflicts
xuebwang-amd Jan 16, 2026
09cc096
Merge remote-tracking branch 'origin/main' into xuebin_add_quark_form…
xuebwang-amd Jan 16, 2026
d5fa792
Merge remote-tracking branch 'origin/main' into xuebin_add_quark_form…
xuebwang-amd Jan 16, 2026
905f9ad
Merge remote-tracking branch 'origin/main' into xuebin_add_quark_form…
xuebwang-amd Jan 17, 2026
dfb0867
Resolved merge conflicts
xuebwang-amd Jan 19, 2026
1e72f89
refactor for rounded_hidden_size logics
xuebwang-amd Jan 21, 2026
d4b08b1
Resolved merge conflicts
xuebwang-amd Jan 21, 2026
7b24bb7
Merge remote-tracking branch 'origin/main' into xuebin_add_quark_form…
xuebwang-amd Jan 22, 2026
89810eb
update refactored layer.py
xuebwang-amd Jan 22, 2026
1d589d9
improve fp8 loading for gpt_oss
xuebwang-amd Jan 23, 2026
66f06dc
SIMPLIFY AND REFACTOR loading flow
xuebwang-amd Jan 23, 2026
c593eda
update test case
xuebwang-amd Jan 23, 2026
09d8e6f
seperate _load_weights_mxfp4 and _load_weights_quark, and fix per_ran…
xuebwang-amd Jan 27, 2026
434096f
Resolved merge conflicts
xuebwang-amd Jan 27, 2026
75fb697
simplify and fixings
xuebwang-amd Jan 27, 2026
a6252a3
refactor FusedMoE init and revert gpt_oss_triton_kernels_moe
xuebwang-amd Jan 29, 2026
39dcd24
large refactoring: remove mxfp4_or_bias centered special loading and …
xuebwang-amd Jan 29, 2026
831d2e2
some further refactors
xuebwang-amd Feb 2, 2026
0330e89
Resolved merge conflicts
xuebwang-amd Feb 2, 2026
8b664d3
refactor is_mxfp4_quant as a method of the QuantizationConfig
xuebwang-amd Feb 3, 2026
2e54937
fix a pre-commit error
xuebwang-amd Feb 3, 2026
b85d2b9
revert tp_size, tp_rank for others loading
xuebwang-amd Feb 3, 2026
6d19d9c
use narrow_weight naming
xuebwang-amd Feb 4, 2026
813dd82
update _load_weights_mxfp4
xuebwang-amd Feb 4, 2026
671f36c
refactor maybe_roundup_hidden_size call chain
xuebwang-amd Feb 4, 2026
443ab7a
updated
Feb 5, 2026
fd26137
revert maybe_roundup_hidden_size
Feb 5, 2026
318e8c1
revert breakage
Feb 5, 2026
0ff9e99
attempt to simplify weight loading logic
Feb 5, 2026
475d30f
attempt to simplify weight loading logic
Feb 5, 2026
59de1b6
attempt to simplify weight loading logic
Feb 5, 2026
2612a3b
attempt to simplify weight loading logic
Feb 5, 2026
dd450d9
fixed
Feb 5, 2026
9c4f959
fixed
Feb 5, 2026
22beb32
rever the kv cache scale loader into gpt-oss
Feb 5, 2026
36306b5
reduce loc changed
Feb 5, 2026
ca6af3d
update comment
Feb 5, 2026
daa2305
Merge branch 'main' into xuebin_add_quark_format_mapping_in_gpt_oss
robertgshaw2-redhat Feb 5, 2026
548b183
Merge remote-tracking branch 'origin/main' into xuebin_add_quark_form…
xuebwang-amd Feb 6, 2026
4c4ec3d
keep shard_id=None for mxfp4
xuebwang-amd Feb 6, 2026
53c1020
tiny update
xuebwang-amd Feb 6, 2026
0eda07a
Merge remote-tracking branch 'origin/main' into xuebin_add_quark_form…
xuebwang-amd Feb 6, 2026
af1ae15
Merge remote-tracking branch 'origin/main' into xuebin_add_quark_form…
xuebwang-amd Feb 6, 2026
e8cfdf0
Merge remote-tracking branch 'origin/main' into xuebin_add_quark_form…
xuebwang-amd Feb 6, 2026
03eb487
Merge remote-tracking branch 'origin/main' into xuebin_add_quark_form…
xuebwang-amd Feb 9, 2026
0886d8c
Merge remote-tracking branch 'origin/main' into xuebin_add_quark_form…
xuebwang-amd Feb 9, 2026
a5e872a
Merge remote-tracking branch 'origin/main' into xuebin_add_quark_form…
xuebwang-amd Feb 9, 2026
b590751
Merge remote-tracking branch 'origin/main' into xuebin_add_quark_form…
xuebwang-amd Feb 9, 2026
58e52e1
Merge remote-tracking branch 'origin/main' into xuebin_add_quark_form…
xuebwang-amd Feb 10, 2026
11eb2fd
fix an error exposed from CI
xuebwang-amd Feb 10, 2026
315a2f0
Merge remote-tracking branch 'origin/main' into xuebin_add_quark_form…
xuebwang-amd Feb 10, 2026
c62f664
remove model_type as an attribute of FusedMoE
xuebwang-amd Feb 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 13 additions & 7 deletions tests/kernels/moe/test_gpt_oss_triton_kernels.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from triton_kernels.tensor_details import layout
from triton_kernels.testing import assert_close

from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.model_executor.layers.fused_moe.config import mxfp4_w4a16_moe_quant_config
from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
triton_kernel_moe_forward,
)
Expand Down Expand Up @@ -298,12 +298,18 @@ def test_equiv(num_token, a_dtype, w_dtype, tp, workspace_init):
pc2,
) = init_compute_data(M, K, N, E, a_dtype, w_dtype, num_warps=8)

quant_config = FusedMoEQuantConfig.make(
w1_bias=w1_bias_tri,
w2_bias=w2_bias_tri,
w1_scale=pc1,
w2_scale=pc2,
)
if a_dtype == "bf16" and w_dtype == "mx4":
quant_config = mxfp4_w4a16_moe_quant_config(
w1_scale=pc1,
w2_scale=pc2,
w1_bias=w1_bias_tri,
w2_bias=w2_bias_tri,
)
else:
raise NotImplementedError(
f"Quantization configuration for activation={a_dtype} and weight={w_dtype} "
f"has not been implemented."
)

out_triton_monolithic = triton_kernel_moe_forward(
hidden_states=x_tri,
Expand Down
110 changes: 110 additions & 0 deletions tests/models/quantization/test_gpt_oss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
End-to-end accuracy test for GPT-OSS model quantization.

Config:
Task: gsm8k_platinum
Filter: flexible-extract
n-shot: 5
Metric: exact_match

Run: pytest tests/models/quantization/test_gpt_oss.py
"""

import importlib
import importlib.metadata
from dataclasses import dataclass

import huggingface_hub
import lm_eval
import pytest
from packaging import version

MODEL_ACCURACIES = {
# Full quantization: attention linears and MoE linears
"amd/gpt-oss-20b-WFP8-AFP8-KVFP8": 0.89,
# MoE linears only quantization
"amd/gpt-oss-20b-MoE-Quant-W-MXFP4-A-FP8-KV-FP8": 0.89,
# MoE linears only quantization
# "amd/gpt-oss-20b-MoE-Quant-W-MXFP4-A-MXFP4-KV-FP8": 0.90,
}

QUARK_MXFP4_AVAILABLE = importlib.util.find_spec("quark") is not None and version.parse(
importlib.metadata.version("amd-quark")
) >= version.parse("0.9.0")


def has_huggingface_access(repo):
    """Return True when *repo* can be listed on the Hugging Face Hub.

    A missing (or otherwise unreadable) repository surfaces as
    ``RepositoryNotFoundError``, which is translated into ``False``;
    any other error propagates to the caller.
    """
    try:
        huggingface_hub.list_repo_refs(repo)
    except huggingface_hub.errors.RepositoryNotFoundError:
        return False
    return True


# Computed once at import time so the skipif decorators below can use it.
# True only if every model repo in MODEL_ACCURACIES is readable.
# Generator (not a list) inside all(): avoids materializing the results and
# short-circuits on the first inaccessible repo (ruff C419).
HF_HUB_AMD_ORG_ACCESS = all(
    has_huggingface_access(model_name) for model_name in MODEL_ACCURACIES
)


@dataclass
class ModelCase:
    """Pairs a model repo id with the tensor-parallel size to run it at.

    NOTE(review): this dataclass is not referenced anywhere in this file;
    confirm whether it is still needed or can be removed.
    """

    # Hugging Face repo id of the model under test.
    model_id: str
    # Tensor-parallel degree to launch vLLM with.
    tp: int


@dataclass
class EvaluationConfig:
    """Builds the vLLM ``model_args`` mapping for one model under evaluation."""

    # Hugging Face repo id of the model to evaluate.
    model_name: str

    def get_model_args(self, tp_size: int):
        """Return the model_args dict passed to ``lm_eval.simple_evaluate``."""
        model_args = {"pretrained": self.model_name}
        model_args.update(
            chat_template_args={"reasoning_effort": "low"},
            enable_thinking=True,
            think_end_token="200008",
            tensor_parallel_size=tp_size,
            dtype="auto",
            gpu_memory_utilization=0.95,
            trust_remote_code=False,
            enable_prefix_caching=False,
            enforce_eager=False,
        )
        return model_args


@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
@pytest.mark.skipif(
    not HF_HUB_AMD_ORG_ACCESS,
    reason="Read access to huggingface.co/amd is required for this test.",
)
@pytest.mark.parametrize("tp_size", [1, 2, 4, 8])
@pytest.mark.parametrize("model_name, expected_accuracy", MODEL_ACCURACIES.items())
def test_gpt_oss_attention_quantization(
    model_name: str, tp_size: int, expected_accuracy: float
):
    """Run gsm8k_platinum through lm-eval on vLLM and check accuracy.

    The measured exact-match (flexible-extract) score must fall within an
    absolute tolerance of the expected accuracy in MODEL_ACCURACIES.
    """
    model_args = EvaluationConfig(model_name).get_model_args(tp_size)

    extra_run_kwargs = {
        "gen_kwargs": {"max_gen_toks": 8000},
        "apply_chat_template": True,
        "fewshot_as_multiturn": True,
        "num_fewshot": 5,
    }

    lm_eval_out = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks="gsm8k_platinum",
        batch_size="auto",
        **extra_run_kwargs,
    )
    measured_accuracy = float(
        lm_eval_out["results"]["gsm8k_platinum"]["exact_match,flexible-extract"]
    )

    # This is an *absolute* tolerance (the original code named it `rtol`
    # and spelled the check as a two-sided strict comparison); `abs(...)`
    # expresses the same condition directly.
    atol = 0.02
    assert abs(measured_accuracy - expected_accuracy) < atol, (
        f"Expected: {expected_accuracy} | Measured: {measured_accuracy}"
    )
80 changes: 0 additions & 80 deletions tests/models/quantization/test_gpt_oss_attn_quantization.py

This file was deleted.

36 changes: 36 additions & 0 deletions vllm/model_executor/layers/fused_moe/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,10 @@ def use_mxfp4_w4a4(self) -> bool:
def use_nvfp4_w4a4(self) -> bool:
return self.quant_dtype == "nvfp4"

@property
def use_mxfp4_w4a8(self) -> bool:
    """True when activations quantize to fp8 and weights to mxfp4."""
    act_is_fp8 = self._a1.dtype == "fp8"
    weight_is_mxfp4 = self._w1.dtype == "mxfp4"
    return act_is_fp8 and weight_is_mxfp4

def config_name(self, dtype: torch.dtype) -> str | None:
"""
Return a string used to construct the filename that contains the
Expand Down Expand Up @@ -532,6 +536,8 @@ def fp8_w8a8_moe_quant_config(
w2_scale: torch.Tensor,
a1_scale: torch.Tensor | None = None,
a2_scale: torch.Tensor | None = None,
w1_bias: torch.Tensor | None = None,
w2_bias: torch.Tensor | None = None,
per_act_token_quant: bool = False,
per_out_ch_quant: bool = False,
block_shape: list[int] | None = None,
Expand All @@ -549,6 +555,8 @@ def fp8_w8a8_moe_quant_config(
g1_alphas=g1_alphas,
w2_scale=w2_scale,
g2_alphas=g2_alphas,
w1_bias=w1_bias,
w2_bias=w2_bias,
a1_scale=a1_scale,
a1_gscale=a1_gscale,
a2_scale=a2_scale,
Expand All @@ -564,6 +572,8 @@ def int8_w8a8_moe_quant_config(
w2_scale: torch.Tensor,
a1_scale: torch.Tensor | None,
a2_scale: torch.Tensor | None,
w1_bias: torch.Tensor | None = None,
w2_bias: torch.Tensor | None = None,
per_act_token_quant: bool = False,
) -> FusedMoEQuantConfig:
"""
Expand All @@ -575,6 +585,8 @@ def int8_w8a8_moe_quant_config(
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
w1_bias=w1_bias,
w2_bias=w2_bias,
per_act_token_quant=per_act_token_quant,
per_out_ch_quant=False,
block_shape=None,
Expand Down Expand Up @@ -654,6 +666,26 @@ def mxfp4_mxfp8_moe_quant_config(
)


def mxfp4_w4a8_moe_quant_config(
    w1_scale: Union[torch.Tensor, "PrecisionConfig"],
    w2_scale: Union[torch.Tensor, "PrecisionConfig"],
    a1_scale: torch.Tensor | None = None,
    a2_scale: torch.Tensor | None = None,
    w1_bias: torch.Tensor | None = None,
    w2_bias: torch.Tensor | None = None,
    block_shape: list[int] | None = None,
) -> FusedMoEQuantConfig:
    """
    Construct a quant config for fp8 activations and mxfp4 weights.

    w1_scale/w2_scale: per-weight scales (tensor or triton PrecisionConfig).
    a1_scale/a2_scale: optional activation scales for the two MoE GEMMs.
    w1_bias/w2_bias: optional per-expert biases attached to the weight descs.

    NOTE(review): ``block_shape`` is accepted but never forwarded to either
    FusedMoEQuantDesc below — confirm whether it should be wired through or
    dropped from the signature.
    """
    return FusedMoEQuantConfig(
        _a1=FusedMoEQuantDesc("fp8", None, a1_scale, None, None, None),
        _a2=FusedMoEQuantDesc("fp8", None, a2_scale, None, None, None),
        _w1=FusedMoEQuantDesc("mxfp4", None, w1_scale, None, None, w1_bias),
        _w2=FusedMoEQuantDesc("mxfp4", None, w2_scale, None, None, w2_bias),
    )


def ocp_mx_moe_quant_config(
quant_dtype: str,
w1_scale: Union[torch.Tensor, "PrecisionConfig"],
Expand Down Expand Up @@ -691,6 +723,8 @@ def nvfp4_moe_quant_config(
a2_gscale: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
w1_bias: torch.Tensor | None = None,
w2_bias: torch.Tensor | None = None,
) -> FusedMoEQuantConfig:
"""
Construct a quant config for nvfp4 activations and nvfp4 weights.
Expand All @@ -699,6 +733,8 @@ def nvfp4_moe_quant_config(
"nvfp4",
w1_scale=w1_scale,
w2_scale=w2_scale,
w1_bias=w1_bias,
w2_bias=w2_bias,
a1_gscale=a1_gscale,
a2_gscale=a2_gscale,
g1_alphas=g1_alphas,
Expand Down
Loading