Commit 3fe31bf

[compile] Turn on TP/SP when use_inductor_graph_partition=True

Signed-off-by: angelayi <[email protected]>

Parent: 95e66d7
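
The commit lets the sequence-parallelism and collective-fusion passes apply when use_inductor_graph_partition is set, not only when no Dynamo splitting ops are used. As a rough usage sketch, the combination might be expressed in a compilation config along the following lines; apart from use_inductor_graph_partition and splitting_ops, which appear in the diffs below, the field names are assumptions and may not match the actual vLLM API:

# Hypothetical sketch only: field names other than use_inductor_graph_partition
# are assumptions, not the confirmed vLLM configuration surface.
compilation_config = {
    "use_inductor_graph_partition": True,     # partition inside Inductor, no Dynamo splitting ops
    "pass_config": {
        "enable_sequence_parallelism": True,  # assumed flag name for the SP/TP fusion passes
    },
}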

File tree

4 files changed: +17, -8 lines

  vllm/compilation/collective_fusion.py
  vllm/compilation/pass_manager.py
  vllm/compilation/sequence_parallelism.py
  vllm/compilation/vllm_inductor_pass.py

vllm/compilation/collective_fusion.py

Lines changed: 5 additions & 1 deletion

@@ -435,7 +435,11 @@ def is_applicable(self, shape: int | None) -> bool:
         # This pass is applied on top of the sequence parallelism pass.
         # It inherits the same applicability condition as `SequenceParallelismPass`.
         # See `SequenceParallelismPass.is_applicable` for more details.
-        if self.splitting_ops is None or self.splitting_ops == []:
+        splitting_ops = self.compilation_config.splitting_ops  # type: ignore[attr-defined]
+        use_inductor_graph_partition = (
+            self.compilation_config.use_inductor_graph_partition  # type: ignore[attr-defined]
+        )
+        if not splitting_ops or use_inductor_graph_partition:
             return True
         tp_size = get_tensor_model_parallel_world_size()
         return shape is not None and shape % tp_size == 0
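
The net effect of the hunk above is that an empty or missing splitting_ops list and use_inductor_graph_partition=True are now treated the same way: in both cases the pass sees the whole graph and always applies. A minimal standalone sketch of the resulting predicate (plain Python, not vLLM code; tp_size stands in for get_tensor_model_parallel_world_size()):

def is_applicable(
    shape: int | None,
    splitting_ops: list[str] | None,
    use_inductor_graph_partition: bool,
    tp_size: int,
) -> bool:
    # Full graph available: no Dynamo splitting ops, or the partitioning is
    # done inside Inductor instead. The pass always applies.
    if not splitting_ops or use_inductor_graph_partition:
        return True
    # Otherwise the concrete shape must divide evenly across TP ranks.
    return shape is not None and shape % tp_size == 0

# `not splitting_ops` covers both None and [] (the old two-branch check).
assert is_applicable(None, [], False, tp_size=2)
assert is_applicable(None, ["some.op"], True, tp_size=2)
assert is_applicable(8, ["some.op"], False, tp_size=2)
assert not is_applicable(7, ["some.op"], False, tp_size=2)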

vllm/compilation/pass_manager.py

Lines changed: 2 additions & 0 deletions

@@ -74,6 +74,8 @@ def __call__(self, graph: fx.Graph):
             if pass_.is_applicable(shape):
                 pass_(graph)
                 VllmInductorPass.dump_prefix += 1
+            else:
+                logger.debug("Skipping %s with shape %s", pass_, shape)
 
         # post-cleanup goes before fix_functionalization
         # because it requires a functional graph
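
The new else branch passes pass_ and shape as logger arguments instead of formatting them inline, so the message is only built when debug logging is enabled. A self-contained sketch of that pattern with the standard-library logger (generic code, not the vLLM pass manager):

import logging

logger = logging.getLogger("pass_manager_demo")
logging.basicConfig(level=logging.DEBUG)

passes = ["collective_fusion", "sequence_parallelism"]  # placeholder names
shape = 7

for pass_ in passes:
    applicable = shape % 2 == 0  # stand-in for pass_.is_applicable(shape)
    if applicable:
        pass  # the real manager would run pass_(graph) here
    else:
        # Lazy %-style interpolation: args are only formatted if DEBUG is enabled.
        logger.debug("Skipping %s with shape %s", pass_, shape)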

vllm/compilation/sequence_parallelism.py

Lines changed: 6 additions & 2 deletions

@@ -490,13 +490,17 @@ def is_applicable(self, shape: int | None) -> bool:
         #
         # This pass is therefore only applied when the sequence dimension is
         # concrete:
-        # 1. In full-graph compilation mode (no splitting ops are used).
+        # 1. In full-graph compilation mode (no Dynamo splitting ops are used).
         #    For this case we always pad num_tokens to be a multiple of
         #    tensor_parallel_size, so there's no need to check shape % tp_size == 0.
         # 2. For specific shape provided during compilation (e.g., from
         #    `compile_sizes`), which must be divisible by the tensor-parallel
         #    size.
-        if self.splitting_ops is None or self.splitting_ops == []:
+        splitting_ops = self.compilation_config.splitting_ops  # type: ignore[attr-defined]
+        use_inductor_graph_partition = (
+            self.compilation_config.use_inductor_graph_partition  # type: ignore[attr-defined]
+        )
+        if not splitting_ops or use_inductor_graph_partition:
             return True
         tp_size = get_tensor_model_parallel_world_size()
         return shape is not None and shape % tp_size == 0
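
Case 1 in the comment above relies on num_tokens being padded to a multiple of tensor_parallel_size, which is what makes the shape % tp_size check unnecessary in full-graph mode. A minimal illustration of that padding argument (plain Python; pad_to_multiple is a hypothetical helper, not a vLLM function):

def pad_to_multiple(num_tokens: int, tp_size: int) -> int:
    # Round num_tokens up to the next multiple of tp_size.
    return -(-num_tokens // tp_size) * tp_size

# After padding, the token dimension always divides evenly across TP ranks,
# so case 1 never needs the explicit shape % tp_size == 0 check.
for num_tokens in (1, 7, 8, 13):
    assert pad_to_multiple(num_tokens, tp_size=4) % 4 == 0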

vllm/compilation/vllm_inductor_pass.py

Lines changed: 4 additions & 5 deletions

@@ -3,6 +3,7 @@
 import functools
 import operator
 import time
+import weakref
 from typing import ClassVar
 
 import regex as re

@@ -28,12 +29,10 @@ class VllmInductorPass(InductorPass):
     """Keep track of pass index for debug dump ordering."""
 
     def __init__(self, config: VllmConfig):
+        self.compilation_config = weakref.ref(config.compilation_config)
         self.pass_config = config.compilation_config.pass_config
-        self.splitting_ops = config.compilation_config.splitting_ops
-        self.model_dtype = config.model_config.dtype if config.model_config \
-            else None
-        self.device = config.device_config.device if config.device_config \
-            else None
+        self.model_dtype = config.model_config.dtype if config.model_config else None
+        self.device = config.device_config.device if config.device_config else None
         self.pass_name = self.__class__.__name__
 
     @staticmethod
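
The constructor now stores the compilation config behind weakref.ref, so the pass object does not by itself keep the config alive. A minimal illustration of the stdlib behavior involved (generic Python, not vLLM classes): the referent is obtained by calling the reference, and the call returns None once the referent has been garbage-collected.

import gc
import weakref


class DummyConfig:  # stand-in for the real compilation config object
    splitting_ops = ["example_op"]  # placeholder value


config = DummyConfig()
ref = weakref.ref(config)

# Dereference by calling the weak reference.
assert ref() is config
assert ref().splitting_ops == ["example_op"]

# Once the last strong reference is dropped, the weak reference returns None.
del config
gc.collect()
assert ref() is None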
