
Commit 4734bfe

address comments

Signed-off-by: cascade812 <[email protected]>
1 parent 623862d commit 4734bfe


7 files changed: +75 -84 lines changed


tests/compile/backend.py

Lines changed: 18 additions & 0 deletions
@@ -5,6 +5,8 @@

 from torch import fx

+from vllm.compilation.fx_utils import (find_specified_fn,
+                                       find_specified_fn_maybe)
 from vllm.compilation.inductor_pass import InductorPass
 from vllm.config import get_current_vllm_config

@@ -44,3 +46,19 @@ def post_pass(self, graph: fx.Graph):
         self.graph_post_pass = deepcopy(graph)
         # assign by reference, will reflect the final state of the graph
         self.final_graph = graph
+
+    def check_before_ops(self, ops,
+                         find_fn=find_specified_fn,
+                         find_fn_maybe=find_specified_fn_maybe,
+                         ops_fully_replaced=True):
+        for op in ops:
+            find_fn(self.graph_pre_pass.nodes, op)
+            if ops_fully_replaced:
+                assert find_fn_maybe(self.graph_post_pass.nodes, op) is None
+
+    def check_after_ops(self, ops,
+                        find_fn=find_specified_fn,
+                        find_fn_maybe=find_specified_fn_maybe):
+        for op in ops:
+            find_fn(self.graph_post_pass.nodes, op)
+            assert find_fn_maybe(self.graph_pre_pass.nodes, op) is None
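
Note: a minimal usage sketch of the new helpers (my_pass, model, and
example_input are placeholders, not from this commit). A pass test wraps an
InductorPass in TestBackend, compiles through it, and then asserts the
rewrite in both directions:

    backend = TestBackend(my_pass)  # my_pass: any InductorPass under test
    compiled = torch.compile(model, backend=backend)
    compiled(example_input)

    # Ops the pass consumes must appear pre-pass (and, when fully
    # replaced, disappear post-pass); fused ops must appear only post-pass.
    backend.check_before_ops(model.ops_in_model_before())
    backend.check_after_ops(model.ops_in_model_after())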

tests/compile/test_async_tp.py

Lines changed: 14 additions & 30 deletions
@@ -7,8 +7,6 @@

 import vllm.envs as envs
 from vllm.compilation.collective_fusion import AsyncTPPass
-from vllm.compilation.fx_utils import (find_specified_fn,
-                                       find_specified_fn_maybe)
 from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig,
                          PassConfig, VllmConfig)
 from vllm.distributed import (tensor_model_parallel_all_gather,

@@ -93,7 +91,7 @@ def ops_in_model_after(self):


 @multi_gpu_test(num_gpus=2)
-@pytest.mark.parametrize("test_model", ["TestMMRSModel", "TestAGMMModel"])
+@pytest.mark.parametrize("test_model", [TestMMRSModel, TestAGMMModel])
 @pytest.mark.parametrize("batch_size", [8])
 @pytest.mark.parametrize("seq_len", [16])
 @pytest.mark.parametrize("hidden_size", [16])

@@ -117,7 +115,8 @@ def run_torch_spawn(fn, nprocs):


 def async_tp_pass_on_test_model(local_rank: int, world_size: int,
-                                test_model: str, batch_size: int, seq_len: int,
+                                test_model_cls: type[torch.nn.Module],
+                                batch_size: int, seq_len: int,
                                 hidden_size: int, dtype: torch.dtype):
     current_platform.seed_everything(0)

@@ -158,12 +157,7 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int,
     async_tp_pass = AsyncTPPass(vllm_config)
     backend = TestBackend(async_tp_pass)

-    if test_model == "TestMMRSModel":
-        model = TestMMRSModel(hidden_size)
-    elif test_model == "TestAGMMModel":
-        model = TestAGMMModel(hidden_size)
-    else:
-        raise ValueError(f"Unknown model: {test_model}")
+    model = test_model_cls(hidden_size)

     hidden_states = torch.randn((batch_size * seq_len, hidden_size),
                                 dtype=dtype,

@@ -172,21 +166,14 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int,
     compiled_model = torch.compile(model, backend=backend)
     compiled_model(hidden_states)

-    # Check substitution worked
-    pre_nodes = backend.graph_pre_pass.nodes
-    post_nodes = backend.graph_post_pass.nodes
-
-    # In pre-nodes, all reduce should exist,
+    # In pre-nodes, all gather or reduce scatter should exist,
     # fused_matmul_reduce_scatter or fused_all_gather_matmul should not
-    for op in model.ops_in_model_before():
-        find_specified_fn(pre_nodes, op)
-    for op in model.ops_in_model_after():
-        assert find_specified_fn_maybe(pre_nodes, op) is None
+    backend.check_before_ops(model.ops_in_model_before(),
+                             ops_fully_replaced=False)

     # In post-nodes, fused_matmul_reduce_scatter or
     # fused_all_gather_matmul should exist
-    for op in model.ops_in_model_after():
-        find_specified_fn(post_nodes, op)
+    backend.check_after_ops(model.ops_in_model_after())


 @create_new_process_for_each_test()

@@ -258,12 +245,9 @@ def test_async_tp_pass_correctness(
         "mp",
     ]

-    try:
-        compare_two_settings(model_id,
-                             aysnc_tp_args,
-                             tp_args,
-                             async_tp_env,
-                             tp_env,
-                             method="generate")
-    except Exception:
-        raise
+    compare_two_settings(model_id,
+                         aysnc_tp_args,
+                         tp_args,
+                         async_tp_env,
+                         tp_env,
+                         method="generate")
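
Note: parametrizing with the model classes themselves (rather than string
names) removes the name-to-class dispatch entirely. A minimal,
self-contained sketch of the pattern (the classes here are illustrative):

    import pytest

    class ModelA:
        def __init__(self, hidden_size):
            self.hidden_size = hidden_size

    class ModelB:
        def __init__(self, hidden_size):
            self.hidden_size = hidden_size

    # The test body instantiates the class directly; no if/elif on names.
    @pytest.mark.parametrize("model_cls", [ModelA, ModelB])
    def test_instantiation(model_cls):
        model = model_cls(hidden_size=16)
        assert model.hidden_size == 16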

tests/compile/test_fusion.py

Lines changed: 17 additions & 19 deletions
@@ -29,6 +29,10 @@ def __init__(self, hidden_size: int, eps: float, static: bool,
         self.cutlass_fp8_enabled = cutlass_fp8_enabled
         self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)]
         self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
+        self.key = QuantKey(dtype=FP8_DTYPE,
+                            static=static,
+                            per_tensor=static,
+                            symmetric=True)
         if static:
             self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
         else:

@@ -59,6 +63,15 @@ def forward(self, x):
         y3, resid = self.norm[2](x3, resid)  # use resid here
         return y3

+    def ops_in_model_before(self):
+        return [QUANT_OPS[self.key]]
+
+    def ops_in_model_after(self):
+        return [
+            FUSED_OPS[FusedRMSQuantKey(self.key, False)],
+            FUSED_OPS[FusedRMSQuantKey(self.key, True)]
+        ]
+

 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("hidden_size", [64, 3392, 4096])

@@ -107,25 +120,10 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,

     torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL)

-    # Check substitution worked
-    pre_nodes = backend.graph_pre_pass.nodes
-    post_nodes = backend.graph_post_pass.nodes
-
-    # static is per-tensor, dynamic is per-token
-    key = QuantKey(dtype=FP8_DTYPE,
-                   static=static,
-                   per_tensor=static,
-                   symmetric=True)
-    rms_quant = FUSED_OPS[FusedRMSQuantKey(key, False)]
-    add_rms_quant = FUSED_OPS[FusedRMSQuantKey(key, True)]
-    fp8_quant = QUANT_OPS[key]
-
     # In pre-nodes, fp8 quant should be there and fused kernels should not
-    assert find_auto_fn_maybe(pre_nodes, rms_quant) is None
-    assert find_auto_fn_maybe(pre_nodes, add_rms_quant) is None
-    find_auto_fn(pre_nodes, fp8_quant)
+    backend.check_before_ops(model.ops_in_model_before(), find_auto_fn,
+                             find_auto_fn_maybe)

     # In post-nodes, fused kernels should be there and fp8 quant should not
-    find_auto_fn(post_nodes, rms_quant)
-    find_auto_fn(post_nodes, add_rms_quant)
-    assert find_auto_fn_maybe(post_nodes, fp8_quant) is None
+    backend.check_after_ops(model.ops_in_model_after(), find_auto_fn,
+                            find_auto_fn_maybe)
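
Note: the hoisted self.key restates what the deleted local computed; as the
removed comment said, static quantization is per-tensor and dynamic is
per-token, so the same flag fills both fields. A sketch using the test's own
names:

    key = QuantKey(dtype=FP8_DTYPE, static=static, per_tensor=static,
                   symmetric=True)
    fp8_quant = QUANT_OPS[key]                              # expected before fusion
    rms_quant = FUSED_OPS[FusedRMSQuantKey(key, False)]     # rms_norm + quant
    add_rms_quant = FUSED_OPS[FusedRMSQuantKey(key, True)]  # fused_add_rms_norm + quant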

tests/compile/test_sequence_parallelism.py

Lines changed: 18 additions & 29 deletions
@@ -5,9 +5,7 @@

 import vllm.envs as envs
 from vllm.compilation.fix_functionalization import FixFunctionalizationPass
-from vllm.compilation.fx_utils import (find_auto_fn, find_auto_fn_maybe,
-                                       find_specified_fn,
-                                       find_specified_fn_maybe, is_func)
+from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
 from vllm.compilation.sequence_parallelism import SequenceParallelismPass
 from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig,
                          PassConfig, VllmConfig)

@@ -21,17 +19,6 @@
 from ..utils import multi_gpu_test
 from .backend import TestBackend

-OPS_IN_MODEL_BEFORE = [
-    torch.ops.vllm.all_reduce.default,
-]
-
-OPS_IN_MODEL_AFTER = [
-    torch.ops.vllm.reduce_scatter.default,
-    torch.ops.vllm.all_gather.default,
-]
-
-OPS_IN_MODEL = [torch.ops._C.fused_add_rms_norm.default]
-
 prompts = [
     "Hello, my name is",
     "The president of the United States is",

@@ -78,6 +65,18 @@ def forward(self, hidden_states, residual):

         return norm_output, residual_output

+    def ops_in_model_before(self):
+        return [torch.ops.vllm.all_reduce.default]
+
+    def ops_in_model_after(self):
+        return [
+            torch.ops.vllm.reduce_scatter.default,
+            torch.ops.vllm.all_gather.default
+        ]
+
+    def ops_in_model(self):
+        return [torch.ops._C.fused_add_rms_norm.default]
+

 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize("batch_size", [8])

@@ -156,34 +155,24 @@ def sequence_parallelism_pass_on_test_model(local_rank: int, world_size: int,
     compiled_model_func = torch.compile(model, backend=backend_func)
     compiled_model_func(hidden_states, residual)

-    # Check substitution worked
-    pre_nodes = backend_no_func.graph_pre_pass.nodes
-    post_nodes = backend_no_func.graph_post_pass.nodes
-
     # In pre-nodes, all reduce should be there,
     # reduce scatter and all gather should not
-    for op in OPS_IN_MODEL_BEFORE:
-        find_specified_fn(pre_nodes, op)
-    for op in OPS_IN_MODEL_AFTER:
-        assert find_specified_fn_maybe(pre_nodes, op) is None
+    backend_no_func.check_before_ops(model.ops_in_model_before())

     # In post-nodes, reduce scatter and all gather should be there,
     # all reduce should not
-    for op in OPS_IN_MODEL_AFTER:
-        find_specified_fn(post_nodes, op)
-    for op in OPS_IN_MODEL_BEFORE:
-        assert find_specified_fn_maybe(post_nodes, op) is None
+    backend_no_func.check_after_ops(model.ops_in_model_after())

     # check if the functionalization pass is applied
-    for op in OPS_IN_MODEL:
+    for op in model.ops_in_model():
         find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
         assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes,
                                   op) is None  # noqa: E501

     # make sure the ops were all de-functionalized
     found = dict()
     for node in backend_func.graph_post_pass.nodes:
-        for op in OPS_IN_MODEL:
+        for op in model.ops_in_model():
             if is_func(node, op):
                 found[op] = True
-    assert all(found[op] for op in OPS_IN_MODEL)
+    assert all(found[op] for op in model.ops_in_model())
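
Note: found[op] raises KeyError when an op never appears in the graph, so a
missing op surfaces as an error rather than a clean assertion failure. A
defensive variant (a sketch, not part of this commit) would use dict.get:

    found = {}
    for node in backend_func.graph_post_pass.nodes:
        for op in model.ops_in_model():
            if is_func(node, op):  # node calls op directly, i.e. de-functionalized
                found[op] = True
    assert all(found.get(op, False) for op in model.ops_in_model())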

vllm/compilation/collective_fusion.py

Lines changed: 3 additions & 2 deletions
@@ -106,10 +106,11 @@ def __init__(self, config: VllmConfig):
         enable_symm_mem_for_group(get_tp_group().device_group.group_name)
         self.patterns: PatternMatcherPass = PatternMatcherPass(
             pass_name="async_tp_pass")
-        GEMMReduceScatterPattern(self.dtype,
+        GEMMReduceScatterPattern(self.model_dtype,
                                  self.device).register(self.patterns)

-        AllGatherGEMMPattern(self.dtype, self.device).register(self.patterns)
+        AllGatherGEMMPattern(self.model_dtype,
+                             self.device).register(self.patterns)

     def is_applicable_for_shape(self, shape: Optional[int]) -> bool:
         # only do replace for specific shapes

vllm/compilation/sequence_parallelism.py

Lines changed: 3 additions & 3 deletions
@@ -243,12 +243,12 @@ def __init__(self, config: VllmConfig):
             pass_name="sequence_parallelism_pass")
         for epsilon in [1e-5, 1e-6]:
             EmbeddingAllReduceRMSNormPattern(
-                epsilon, self.dtype, self.device).register(self.patterns)
+                epsilon, self.model_dtype, self.device).register(self.patterns)

-            MiddleAllReduceRMSNormPattern(epsilon, self.dtype,
+            MiddleAllReduceRMSNormPattern(epsilon, self.model_dtype,
                                           self.device).register(self.patterns)

-            LastAllReduceRMSNormPattern(epsilon, self.dtype,
+            LastAllReduceRMSNormPattern(epsilon, self.model_dtype,
                                         self.device).register(self.patterns)
         # WARNING: This is a hack to clear the pattern matcher cache
         # and allow multiple values of epsilon.

vllm/compilation/vllm_inductor_pass.py

Lines changed: 2 additions & 1 deletion
@@ -26,7 +26,8 @@ class VllmInductorPass(InductorPass):

     def __init__(self, config: VllmConfig):
         self.pass_config = config.compilation_config.pass_config
-        self.dtype = config.model_config.dtype if config.model_config else None
+        self.model_dtype = config.model_config.dtype if config.model_config \
+            else None
         self.device = config.device_config.device if config.device_config \
             else None
         self.pass_name = self.__class__.__name__
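
Note: the rename from self.dtype to self.model_dtype propagates to every
pass built on VllmInductorPass (see the two pattern files above); the more
specific name distinguishes the model's parameter dtype from other dtypes a
pass may handle. A hypothetical subclass sketch (MyPattern and the
PatternMatcherPass wiring are placeholders, not from this commit):

    class MyPatternPass(VllmInductorPass):
        def __init__(self, config: VllmConfig):
            super().__init__(config)
            self.patterns = PatternMatcherPass(pass_name="my_pass")
            # model_dtype is None when no model config is present, so
            # dtype-specific pattern registration guards on it.
            if self.model_dtype is not None:
                MyPattern(self.model_dtype,
                          self.device).register(self.patterns)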
