vllm-project · tjtanaa · Jan 20, 2026 · Oct 28, 2025 · Oct 30, 2025 · Oct 30, 2025
@@ -0,0 +1,5 @@
+Qwen2.5-1.5B-Instruct.yaml
+Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
+Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+Qwen1.5-MoE-W4A16-compressed-tensors.yaml
diff --git a/tests/compile/distributed/test_fusion_all_reduce.py b/tests/compile/distributed/test_fusion_all_reduce.py
@@ -26,15 +26,14 @@
     initialize_model_parallel,
 )
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    Fp8LinearOp,
-    GroupShape,
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    kFp8StaticTensorSym,
 )
 from vllm.platforms import current_platform
 from vllm.utils.system_utils import update_environment_variables
 from vllm.utils.torch_utils import set_random_seed
 
-from ...utils import has_module_attribute, multi_gpu_test
+from ...utils import TestFP8Layer, has_module_attribute, multi_gpu_test
 from ..backend import TestBackend
 
 
@@ -76,49 +75,40 @@ def ops_in_model_after(self):
 
 
 class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module):
+    quant_key = kFp8StaticTensorSym
+
     def __init__(self, hidden_size=16, token_num=16, eps=1e-6):
         super().__init__()
         self.hidden_size = hidden_size
         self.eps = eps
         self.norm = [RMSNorm(hidden_size, eps) for i in range(4)]
-        self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
-        self.w = [
-            torch.rand(hidden_size, hidden_size)
-            .to(dtype=current_platform.fp8_dtype())
-            .t()
-            for _ in range(3)
+        self.fp8_linear_layers = [
+            TestFP8Layer(
+                weight_shape=(hidden_size, hidden_size),
+                activation_quant_key=self.quant_key,
+                weight_quant_key=self.quant_key,
+            )
+            for i in range(3)
         ]
 
-        self.fp8_linear = Fp8LinearOp(
-            act_quant_static=True,
-            act_quant_group_shape=GroupShape.PER_TENSOR,
-        )
-
-        self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
-
     def forward(self, hidden_states):
         # avoid having graph input be an arg to a pattern directly
         z = torch.relu(hidden_states)
         x = resid = tensor_model_parallel_all_reduce(z)
         y = self.norm[0](x)
 
-        z2 = self.fp8_linear.apply(
-            y, self.w[0], self.wscale[0], input_scale=self.scale[0]
-        )
+        z2 = self.fp8_linear_layers[0](y)
 
         x2 = tensor_model_parallel_all_reduce(z2)
         y2, resid = self.norm[1](x2, resid)
 
-        z3 = self.fp8_linear.apply(
-            y2, self.w[1], self.wscale[1], input_scale=self.scale[1]
-        )
+        z3 = self.fp8_linear_layers[1](y2)
 
         x3 = tensor_model_parallel_all_reduce(z3)
         y3, resid = self.norm[2](x3, resid)  # use resid here
 
-        z4 = self.fp8_linear.apply(
-            y3, self.w[2], self.wscale[2], input_scale=self.scale[2]
-        )
+        z4 = self.fp8_linear_layers[2](y3)
+
         x4 = tensor_model_parallel_all_reduce(z4)
         y4, resid = self.norm[3](x4, resid)  # use resid here
         return y4
@@ -130,7 +120,7 @@ def ops_in_model_before(self):
         return [
             torch.ops.vllm.all_reduce.default,
             torch.ops._C.static_scaled_fp8_quant.default
-            if self.fp8_linear.quant_fp8.enabled()
+            if self.fp8_linear_layers[0].is_quant_fp8_enabled()
             else torch.ops.aten.reciprocal.default,
         ]
 

diff --git a/tests/compile/distributed/test_sequence_parallelism.py b/tests/compile/distributed/test_sequence_parallelism.py
@@ -27,13 +27,14 @@
     initialize_model_parallel,
 )
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
-from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    kFp8StaticTensorSym,
+)
 from vllm.platforms import current_platform
 from vllm.utils.system_utils import update_environment_variables
 from vllm.utils.torch_utils import set_random_seed
 
-from ...utils import multi_gpu_test
+from ...utils import TestFP8Layer, multi_gpu_test
 from ..backend import TestBackend
 
 FP8_DTYPE = current_platform.fp8_dtype()
@@ -94,50 +95,40 @@ def ops_in_model(self):
 
 
 class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module):
+    quant_key = kFp8StaticTensorSym
+
     def __init__(self, hidden_size=16, eps=1e-6):
         super().__init__()
         self.vllm_config = get_current_vllm_config()
         self.hidden_size = hidden_size
         self.eps = eps
         self.norm = [RMSNorm(hidden_size, eps) for i in range(4)]
-        self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
-        self.w = [
-            torch.rand(hidden_size, hidden_size)
-            .to(dtype=current_platform.fp8_dtype())
-            .t()
-            for _ in range(3)
+        self.fp8_linear_layers = [
+            TestFP8Layer(
+                weight_shape=(hidden_size, hidden_size),
+                activation_quant_key=self.quant_key,
+                weight_quant_key=self.quant_key,
+            )
+            for i in range(3)
         ]
 
-        self.fp8_linear = Fp8LinearOp(
-            act_quant_static=True,
-            act_quant_group_shape=GroupShape.PER_TENSOR,
-        )
-
-        self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
-
     def forward(self, hidden_states):
         # avoid having graph input be an arg to a pattern directly
         z = torch.relu(hidden_states)
         x = resid = tensor_model_parallel_all_reduce(z)
         y = self.norm[0](x)
 
-        z2 = self.fp8_linear.apply(
-            y, self.w[0], self.wscale[0], input_scale=self.scale[0]
-        )
+        z2 = self.fp8_linear_layers[0](y)
 
         x2 = tensor_model_parallel_all_reduce(z2)
         y2, resid = self.norm[1](x2, resid)
 
-        z3 = self.fp8_linear.apply(
-            y2, self.w[1], self.wscale[1], input_scale=self.scale[1]
-        )
+        z3 = self.fp8_linear_layers[1](y2)
 
         x3 = tensor_model_parallel_all_reduce(z3)
         y3, resid = self.norm[2](x3, resid)  # use resid here
 
-        z4 = self.fp8_linear.apply(
-            y3, self.w[2], self.wscale[2], input_scale=self.scale[2]
-        )
+        z4 = self.fp8_linear_layers[2](y3)
         x4 = tensor_model_parallel_all_reduce(z4)
         y4, resid = self.norm[3](x4, resid)  # use resid here
         return y4
@@ -160,7 +151,7 @@ def ops_in_model(self):
             return [
                 torch.ops._C.fused_add_rms_norm.default,
             ]
-        elif self.fp8_linear.quant_fp8.enabled():
+        elif any(layer.is_quant_fp8_enabled() for layer in self.fp8_linear_layers):
             return [
                 torch.ops._C.static_scaled_fp8_quant.default,
             ]

diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py
@@ -20,36 +20,36 @@
 )
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
-from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    kFp8StaticTensorSym,
+)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.platforms import current_platform
 
+from ..utils import TestFP8Layer
 from .backend import TestBackend
 
 TEST_FP8 = current_platform.supports_fp8()
 FP8_DTYPE = current_platform.fp8_dtype()
 
 
 class TestSiluMul(torch.nn.Module):
+    quant_key = kFp8StaticTensorSym
+
     def __init__(self, hidden_size: int = 128):
         super().__init__()
         self.silu_and_mul = SiluAndMul()
-        self.wscale = torch.rand(1, dtype=torch.float32)
-        self.scale = torch.rand(1, dtype=torch.float32)
-
         if TEST_FP8:
-            self.w = torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
-            self.fp8_linear = Fp8LinearOp(
-                act_quant_static=True,
-                act_quant_group_shape=GroupShape.PER_TENSOR,
+            self.fp8_linear = TestFP8Layer(
+                weight_shape=(hidden_size, hidden_size),
+                activation_quant_key=self.quant_key,
+                weight_quant_key=self.quant_key,
             )
 
     def forward(self, x):
         y = self.silu_and_mul(x)
         if TEST_FP8:
-            x2 = self.fp8_linear.apply(y, self.w, self.wscale, input_scale=self.wscale)
-            return x2
+            return self.fp8_linear(y)
         else:
             return y
 
@@ -67,6 +67,8 @@ def ops_not_in_model(self):
 
 
 class TestFusedAddRMSNorm(torch.nn.Module):
+    quant_key = kFp8StaticTensorSym
+
     def __init__(self, hidden_size=16, intermediate_size=32):
         super().__init__()
         self.hidden_size = hidden_size
@@ -81,11 +83,11 @@ def __init__(self, hidden_size=16, intermediate_size=32):
         torch.nn.init.normal_(self.gate_proj, std=0.02)
 
         if TEST_FP8:
-            self.fp8_linear = Fp8LinearOp(act_quant_static=True)
-
-            self.scale = torch.rand(1, dtype=torch.float32)
-            self.w = torch.rand(hidden_size, intermediate_size).to(dtype=FP8_DTYPE).t()
-            self.wscale = torch.rand(1, dtype=torch.float32)
+            self.fp8_linear = TestFP8Layer(
+                weight_shape=(hidden_size, intermediate_size),
+                activation_quant_key=self.quant_key,
+                weight_quant_key=self.quant_key,
+            )
 
     def forward(self, hidden_states, residual):
         # Reshape input
@@ -100,12 +102,7 @@ def forward(self, hidden_states, residual):
 
         if TEST_FP8:
             # scaled_mm with static input quantization
-            fp8_linear_result = self.fp8_linear.apply(
-                norm_output,
-                self.w,
-                self.wscale,
-                input_scale=self.scale.to(norm_output.device),
-            )
+            fp8_linear_result = self.fp8_linear(norm_output)
 
             return fp8_linear_result, residual_output