 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
-import unittest
-from typing import ClassVar
-
 import numpy as np
+import pytest
 import torch
 from compressed_tensors.transform.utils.hadamard import (
     deterministic_hadamard_matrix)

 from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
+from vllm.platforms import current_platform
 from vllm.qutlass_utils.utils import to_blocked

+if not torch.cuda.is_available():
+    pytest.skip("CUDA required for these tests.", allow_module_level=True)
+
+if not (current_platform.has_device_capability(100)
+        or current_platform.has_device_capability(120)):
+    pytest.skip(
+        reason="Tests require compute capability 10.0 (100) or 12.0 (120).",
+        allow_module_level=True,
+    )
+

+# ----- Helpers -----
 def get_hadamard_matrix(group_size: int, dtype: torch.dtype,
                         device: torch.device):
     return (
@@ -176,141 +185,119 @@ def _forward_quantize_ref(x: torch.Tensor,
     )


-@unittest.skipUnless(torch.cuda.is_available(),
-                     "CUDA required for these tests")
-class Test(unittest.TestCase):
-    dtype: ClassVar[torch.dtype]
-    device: ClassVar[torch.device]
-
-    @classmethod
-    def setUpClass(cls):
-        seed = 0
-        np.random.seed(seed)
-        torch.random.manual_seed(seed)
-        cls.dtype = torch.bfloat16
-        cls.device = torch.device("cuda:0")
-
-    def run_problem(self, m, n, k, had_size):
-        print(m, n, k)
-        hadamard_matrix = get_hadamard_matrix(had_size, self.dtype,
-                                              self.device)
-
-        a = torch.rand(m, k, dtype=self.dtype, device=self.device) * 25.0
-        b = torch.rand(n, k, dtype=self.dtype, device=self.device) * 25.0
-
-        a_e2m1, a_e8m0 = fusedQuantizeMx(a, hadamard_matrix, method="quest")
-        b_e2m1, b_e8m0 = fusedQuantizeMx(b, hadamard_matrix, method="quest")
-
-        a_dq, *_ = _dq_fp4(a_e2m1, a_e8m0[:m, :k], alpha=1.0)
-        b_dq, *_ = _dq_fp4(b_e2m1, b_e8m0[:n, :k], alpha=1.0)
-        out_ref = a_dq @ b_dq.transpose(-2, -1)
-
-        a_scale_block = to_blocked(a_e8m0, True)
-        b_scale_block = to_blocked(b_e8m0, True)
-        alpha = torch.Tensor([1.0]).to(self.device)
-        out = matmul_mxf4_bf16_tn(a_e2m1, b_e2m1, a_scale_block, b_scale_block,
-                                  alpha)
-        assert out.equal(out_ref.to(dtype=out.dtype))
-
-    def test_fused_quantization(self):
-        dtype, device = self.dtype, self.device
-
-        def _absmax_case(rot_size: int):
-            h = get_hadamard_matrix(rot_size, dtype, device)
-            x = torch.randn(2, 4096, 4096, dtype=dtype, device=device) * 25.0
-
-            xh_dq_ref, _, _ = _forward_quantize_ref(x, h, rot_size, False)
-            xh_e2m1, xh_e8m0 = fusedQuantizeMx(x, h, method="abs_max")
-            xh_e8m0 = xh_e8m0.reshape(2, 4096, 4096 // 32)  #
-            xh_dq, *_ = _dq_fp4(xh_e2m1, xh_e8m0, alpha=3.0)
-
-            torch.testing.assert_close(xh_dq, xh_dq_ref, rtol=0.34, atol=100)
-            assert (xh_dq != xh_dq_ref).float().mean() <= 1e-4
-
-            m, n, k = 1, 504, 4096
-            a = torch.randn(m, k, dtype=dtype, device=device) * 25.0
-            b = torch.randn(n, k, dtype=dtype, device=device) * 25.0
-
-            a_e2m1, a_e8m0 = fusedQuantizeMx(a, h, method="abs_max")
-            b_e2m1, b_e8m0 = fusedQuantizeMx(b, h, method="abs_max")
-            a_dq, *_ = _dq_fp4(a_e2m1, a_e8m0[:m, :k], alpha=1.0)
-            b_dq, *_ = _dq_fp4(b_e2m1, b_e8m0[:n, :k], alpha=1.0)
-            out_ref = a_dq @ b_dq.transpose(-2, -1)
-
-            a_scale_block = to_blocked(a_e8m0)
-            b_scale_block = to_blocked(b_e8m0)
-            alpha = torch.Tensor([1.0]).to(device)
-            out = matmul_mxf4_bf16_tn(a_e2m1, b_e2m1, a_scale_block,
-                                      b_scale_block, alpha)
-            assert out.equal(out_ref.to(dtype=out.dtype))
-
-        def _quest_case(rot_size: int):
-            h = get_hadamard_matrix(rot_size, dtype, device)
-            x = torch.randn(2, 4096, 4096, dtype=dtype, device=device) * 25.0
-
-            xh_dq_ref, _, _ = _forward_quantize_ref(x, h, rot_size, True)
-            xh_e2m1, xh_e8m0 = fusedQuantizeMx(x, h, method="quest")
-            xh_e8m0 = xh_e8m0.reshape(2, 4096, 4096 // 32)  #
-            xh_dq, *_ = _dq_fp4(xh_e2m1, xh_e8m0, alpha=1.0)
-            torch.testing.assert_close(xh_dq, xh_dq_ref, rtol=0.34, atol=100)
-            assert (xh_dq != xh_dq_ref).float().mean() <= 1e-4
-
-            m, n, k = 504, 504, 2048
-            a = torch.randn(m, k, dtype=dtype, device=device) * 25.0
-            b = torch.randn(n, k, dtype=dtype, device=device) * 25.0
-
-            a_e2m1, a_e8m0 = fusedQuantizeMx(a, h, method="quest")
-            b_e2m1, b_e8m0 = fusedQuantizeMx(b, h, method="quest")
-            a_dq, *_ = _dq_fp4(a_e2m1, a_e8m0[:m, :k], alpha=1.0)
-            b_dq, *_ = _dq_fp4(b_e2m1, b_e8m0[:n, :k], alpha=1.0)
-            out_ref = a_dq @ b_dq.transpose(-2, -1)
-
-            a_scale_block = to_blocked(a_e8m0, True)
-            b_scale_block = to_blocked(b_e8m0, True)
-            alpha = torch.Tensor([1.0]).to(device)
-            out = matmul_mxf4_bf16_tn(a_e2m1, b_e2m1, a_scale_block,
-                                      b_scale_block, alpha)
-            assert out.equal(out_ref.to(dtype=out.dtype))
-
-        for rs in (32, 64, 128):
-            _absmax_case(rs)
-        for rs in (32, 64, 128):
-            _quest_case(rs)
-
-    def test_llama_shapes(self):
-        print()
-        MODELS = {
-            " 7B": [
-                (4096, 3 * 4096),
-                (4096, 4096),
-                (4096, 2 * 10752),
-                (10752, 4096),
-            ],
-            "13B": [
-                (5120, 3 * 5120),
-                (5120, 5120),
-                (5120, 2 * 13568),
-                (13568, 5120),
-            ],
-            "33B": [
-                (6656, 3 * 6656),
-                (6656, 6656),
-                (6656, 2 * 17664),
-                (17664, 6656),
-            ],
-            "70B": [
-                (8192, 3 * 8192),
-                (8192, 8192),
-                (8192, 2 * 21760),
-                (21760, 8192),
-            ],
-        }
-        for _, layers in MODELS.items():
-            for layer in layers:
-                for batch in [1, 16]:
-                    for had_size in [32, 64, 128]:
-                        self.run_problem(batch, layer[1], layer[0], had_size)
-
-
-if __name__ == "__main__":
-    unittest.main()
+DTYPE = torch.bfloat16
+DEVICE = torch.device("cuda:0")
+
+ROT_SIZES = [32, 64, 128]
+SEEDS = [0]
+BATCHES = [1, 16]
+
+LLAMA_MODELS = {
+    "7B": [(4096, 3 * 4096), (4096, 4096), (4096, 2 * 10752), (10752, 4096)],
+    "13B": [(5120, 3 * 5120), (5120, 5120), (5120, 2 * 13568), (13568, 5120)],
+    "33B": [(6656, 3 * 6656), (6656, 6656), (6656, 2 * 17664), (17664, 6656)],
+    "70B": [(8192, 3 * 8192), (8192, 8192), (8192, 2 * 21760), (21760, 8192)],
+}
+
+
+@pytest.fixture(autouse=True)
+def _seed_each_test():
+    current_platform.seed_everything(0)
+    np.random.seed(0)
+    torch.random.manual_seed(0)
+
+
+@pytest.mark.parametrize("rot_size", ROT_SIZES)
+@torch.inference_mode()
+def test_fused_quantization_absmax(rot_size: int):
+    dtype, device = DTYPE, DEVICE
+    h = get_hadamard_matrix(rot_size, dtype, device)
+    x = torch.randn(2, 4096, 4096, dtype=dtype, device=device) * 25.0
+
+    xh_dq_ref, _, _ = _forward_quantize_ref(x, h, rot_size, quest=False)
+    xh_e2m1, xh_e8m0 = fusedQuantizeMx(x, h, method="abs_max")
+    xh_e8m0 = xh_e8m0.reshape(2, 4096, 4096 // 32)
+    xh_dq, *_ = _dq_fp4(xh_e2m1, xh_e8m0, alpha=3.0)
+
+    torch.testing.assert_close(xh_dq, xh_dq_ref, rtol=0.34, atol=100)
+    assert (xh_dq != xh_dq_ref).float().mean() <= 1e-4
+
+    m, n, k = 1, 504, 4096
+    a = torch.randn(m, k, dtype=dtype, device=device) * 25.0
+    b = torch.randn(n, k, dtype=dtype, device=device) * 25.0
+
+    a_e2m1, a_e8m0 = fusedQuantizeMx(a, h, method="abs_max")
+    b_e2m1, b_e8m0 = fusedQuantizeMx(b, h, method="abs_max")
+    a_dq, *_ = _dq_fp4(a_e2m1, a_e8m0[:m, :k], alpha=1.0)
+    b_dq, *_ = _dq_fp4(b_e2m1, b_e8m0[:n, :k], alpha=1.0)
+    out_ref = a_dq @ b_dq.transpose(-2, -1)
+
+    a_scale_block = to_blocked(a_e8m0)
+    b_scale_block = to_blocked(b_e8m0)
+    alpha = torch.tensor([1.0], device=device)
+    out = matmul_mxf4_bf16_tn(a_e2m1, b_e2m1, a_scale_block, b_scale_block,
+                              alpha)
+    assert out.equal(out_ref.to(dtype=out.dtype))
+
+
+@pytest.mark.parametrize("rot_size", ROT_SIZES)
+@torch.inference_mode()
+def test_fused_quantization_quest(rot_size: int):
+    dtype, device = DTYPE, DEVICE
+    h = get_hadamard_matrix(rot_size, dtype, device)
+    x = torch.randn(2, 4096, 4096, dtype=dtype, device=device) * 25.0
+
+    xh_dq_ref, _, _ = _forward_quantize_ref(x, h, rot_size, quest=True)
+    xh_e2m1, xh_e8m0 = fusedQuantizeMx(x, h, method="quest")
+    xh_e8m0 = xh_e8m0.reshape(2, 4096, 4096 // 32)
+    xh_dq, *_ = _dq_fp4(xh_e2m1, xh_e8m0, alpha=1.0)
+
+    torch.testing.assert_close(xh_dq, xh_dq_ref, rtol=0.34, atol=100)
+    assert (xh_dq != xh_dq_ref).float().mean() <= 1e-4
+
+    m, n, k = 504, 504, 2048
+    a = torch.randn(m, k, dtype=dtype, device=device) * 25.0
+    b = torch.randn(n, k, dtype=dtype, device=device) * 25.0
+
+    a_e2m1, a_e8m0 = fusedQuantizeMx(a, h, method="quest")
+    b_e2m1, b_e8m0 = fusedQuantizeMx(b, h, method="quest")
+    a_dq, *_ = _dq_fp4(a_e2m1, a_e8m0[:m, :k], alpha=1.0)
+    b_dq, *_ = _dq_fp4(b_e2m1, b_e8m0[:n, :k], alpha=1.0)
+    out_ref = a_dq @ b_dq.transpose(-2, -1)
+
+    a_scale_block = to_blocked(a_e8m0, True)
+    b_scale_block = to_blocked(b_e8m0, True)
+    alpha = torch.tensor([1.0], device=device)
+    out = matmul_mxf4_bf16_tn(a_e2m1, b_e2m1, a_scale_block, b_scale_block,
+                              alpha)
+    assert out.equal(out_ref.to(dtype=out.dtype))
+
+
+@pytest.mark.parametrize("model", list(LLAMA_MODELS.keys()))
+@pytest.mark.parametrize("layer_idx", [0, 1, 2, 3])
+@pytest.mark.parametrize("batch", [1, 16])
+@pytest.mark.parametrize("had_size", ROT_SIZES)
+@torch.inference_mode()
+def test_llama_shapes(model: str, layer_idx: int, batch: int, had_size: int):
+    dtype, device = DTYPE, DEVICE
+    m = batch
+    k, n = LLAMA_MODELS[model][layer_idx]
+
+    h = get_hadamard_matrix(had_size, dtype, device)
+
+    a = torch.rand(m, k, dtype=dtype, device=device) * 25.0
+    b = torch.rand(n, k, dtype=dtype, device=device) * 25.0
+
+    a_e2m1, a_e8m0 = fusedQuantizeMx(a, h, method="quest")
+    b_e2m1, b_e8m0 = fusedQuantizeMx(b, h, method="quest")
+
+    a_dq, *_ = _dq_fp4(a_e2m1, a_e8m0[:m, :k], alpha=1.0)
+    b_dq, *_ = _dq_fp4(b_e2m1, b_e8m0[:n, :k], alpha=1.0)
+    out_ref = a_dq @ b_dq.transpose(-2, -1)
+
+    a_scale_block = to_blocked(a_e8m0, True)
+    b_scale_block = to_blocked(b_e8m0, True)
+    alpha = torch.tensor([1.0], device=device)
+    out = matmul_mxf4_bf16_tn(a_e2m1, b_e2m1, a_scale_block, b_scale_block,
+                              alpha)
+    assert out.equal(out_ref.to(dtype=out.dtype))
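
As a usage sketch (not part of the commit): after this conversion the tests are collected by pytest and skipped at module level on machines without CUDA or without compute capability 10.0/12.0. The snippet below shows one way to invoke them programmatically; the test file path is an assumption made for illustration, not something stated in the diff.

# Minimal sketch: run the converted pytest tests from Python.
# The path "tests/kernels/quantization/test_mxfp4_qutlass.py" is an assumed
# location for this module.
import sys

import pytest

if __name__ == "__main__":
    ret = pytest.main([
        "-q",
        "tests/kernels/quantization/test_mxfp4_qutlass.py",
        "-k", "fused_quantization or llama_shapes",
    ])
    sys.exit(int(ret))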