[Compile] Conditional compilation. Introduce compile_ranges #24252
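To make the PR title concrete: "compile ranges" here means compiling one graph specialization per batch-size range instead of per exact size, then dispatching on the runtime batch size. The sketch below derives half-open ranges from split points the way the numbers in the test file suggest (split points `[8, 32]` with `max_num_batched_tokens=8192` yielding ranges ending at 8, 32, and 8193); the `Range` class and helper are invented for illustration and are not vLLM's implementation, whose exact range semantics may differ.

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class Range:
    # Inclusive start, exclusive end — an assumption consistent with the
    # test's Range(start=1, end=8), Range(start=8, end=32), Range(start=32, end=8193).
    start: int
    end: int

    def __contains__(self, n: int) -> bool:
        return self.start <= n < self.end


def ranges_from_split_points(split_points: list[int], max_size: int) -> list[Range]:
    # Split points [8, 32] with max_size 8192 become
    # [1, 8), [8, 32), [32, 8193) — one compiled graph per range.
    bounds = [1, *split_points, max_size + 1]
    return [Range(a, b) for a, b in zip(bounds, bounds[1:])]


ranges = ranges_from_split_points([8, 32], 8192)
```

With split points `[8, 32]`, batch sizes 1-7, 8-31, and 32-8192 each map to one of three compiled graphs, which is consistent with the `num_backend_compilations=3` expectation in the test file in this PR.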
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open: ilmarkov wants to merge 164 commits into vllm-project:main from neuralmagic:imarkov/conditional_compilation_ranges
Commits (164):

- 21d7d67 Functionalized patterns in prep for utility (ProExpertProg)
- f3b4cf1 TEMP Mostly working (ProExpertProg)
- cdad3c0 TEMP: fixed rmsnorm issue (TODO assert dtypes in fused norm_quant ker… (ProExpertProg)
- 8e4a56f rms works fully now, had to remove more conversions (and add them in … (ProExpertProg)
- e151e6d quant works except (torch,torch) (ProExpertProg)
- 14fdc8b quant with fix for pure torch, broke others (ProExpertProg)
- 05a65f3 ALL WORKS (ProExpertProg)
- e6b394e Add TODO (ProExpertProg)
- d96913a Cleanup test_fusion.py, added extra layer of rms/quant (ProExpertProg)
- b172747 Functionalize attn+quant patterns (ProExpertProg)
- 1ae80c6 Move global vllm_config to pass manager (ProExpertProg)
- 77835fd Attention fusion works with custom ops (ProExpertProg)
- 1277999 Remove V0 attn fusion test (ProExpertProg)
- d843a67 Add triton attn test to attn+quant fusion (ProExpertProg)
- cdd1529 Flat product for better test names/visibility (ProExpertProg)
- 141a37e Fix rmsnorm (ProExpertProg)
- c6d6c3b Refactor E2E attn fusion test (ProExpertProg)
- 490ac86 Add TP=2 test (untested) (ProExpertProg)
- d0b1b56 improve tests by adding more cases (ProExpertProg)
- 47b4688 TEMP working on caplog (ProExpertProg)
- ae7f56f Temp MP workaround P2 (ProExpertProg)
- eb899a4 Temp MP workaround P3 (ProExpertProg)
- a2aa978 Test for caplog utils (ProExpertProg)
- 21a9f9f Fixed tests, passing with 2.8, 2.9 tbd (ProExpertProg)
- 66a35a9 Update tests/compile/backend.py (ProExpertProg)
- 7eb1364 Update csrc/layernorm_kernels.cu (ProExpertProg)
- 5fef180 clean up fullgraph tests (ProExpertProg)
- db479ae TEMP allreduce fusion (ProExpertProg)
- 54189a9 allreduce fusion working (custom ops on) (ProExpertProg)
- b7f52bf allreduce fusion working with/without custom ops (except fp4) (ProExpertProg)
- d09a278 allreduce fusion working with/without custom ops (with fp4) (ProExpertProg)
- c8675ff log depyf folder, fix context for TestBackend, fix pattern dump (ProExpertProg)
- d3f95fe fullgraph allreduce test update requirements (ProExpertProg)
- 4dbfcf7 Move e2e tests to new file, add to test pipeline (ProExpertProg)
- 31d0127 Add e2e fusions to fullgraph test (should work with Triton backend), … (ProExpertProg)
- c653d24 Fix spelling, precommit (ProExpertProg)
- 1756f67 add back fp4 (ProExpertProg)
- 5619bc3 clean up e2e tests (ProExpertProg)
- 32989d8 add pattern for final allreduce in model (ProExpertProg)
- 46ee626 add more comprehensive testing for quantfp8 (-rmsnorm+-quant still fa… (ProExpertProg)
- a1c7fdb add more comprehensive testing for allreduce-rmsnorm, fix fp4 (-rmsno… (ProExpertProg)
- c3264d8 Fix partial match rmsnorm+quant, fix allreduce+rmsnorm match (ProExpertProg)
- 095277c Simplify matcher utils by using RMSNorm.forward_static (ProExpertProg)
- 52f78ce Add allreduce test to 2-gpu test (ProExpertProg)
- 1b1a63e Fix e2e allreduce fusion test (ProExpertProg)
- 0d6e550 fix func test (ProExpertProg)
- 26892df fix pass manager test (ProExpertProg)
- 3547b87 fix sequence parallelism test (ProExpertProg)
- af1ffa7 PR review (ProExpertProg)
- 97b3ff2 Merge remote-tracking branch 'upstream/main' into luka/custom-op-matc… (ProExpertProg)
- b5f89e5 Cleanup test_full_graph.py (ProExpertProg)
- f6429e4 Cleanup test_fusion_attn.py (ProExpertProg)
- 8a363d3 Slight improvement for E2E fusion (ProExpertProg)
- 12a7c6d Tests & docs for flat_product (ProExpertProg)
- db16ee1 Merge branch 'main' into luka/custom-op-matching-2 (ProExpertProg)
- 8ffb474 Remove/fix TODOs (ProExpertProg)
- 2a6299c Fix e2e test patterns (ProExpertProg)
- 465ce58 Update tests/compile/test_fusion.py (ProExpertProg)
- bb0254a Merge branch 'main' into luka/custom-op-matching-2 (ProExpertProg)
- bcd95b5 Fix func test (ProExpertProg)
- db2b1c7 Smaller model for e2e fusion test (ProExpertProg)
- a3ebf0a fix fp8 quant tests (ProExpertProg)
- 3943257 Restore original torch.Parameter behavior in RMSNorm (ProExpertProg)
- 532cbcf Add comment to test_logger (ProExpertProg)
- 7e6f5b3 add flat_product example (ProExpertProg)
- 24f1298 PR comments: cleanup fusion passes, & matching (ProExpertProg)
- de7405b PR comments: add _custom_op suffix (ProExpertProg)
- 6253d5b Add e2e to L40 distributed, move tests to start of B200 distributed (ProExpertProg)
- 876ef22 Fix tests, PR feedback (ProExpertProg)
- e99a759 Break up B200 tests, move allreduce to H200 (ProExpertProg)
- a226864 Merge branch 'main' into luka/custom-op-matching-2 (ProExpertProg)
- ae581e1 Fix attention fusion test numerics (ProExpertProg)
- c03b29b Remove inductor graph partition from unit test (included in e2e tests) (ProExpertProg)
- d2e0489 Relax tolerance for L40 fusion test (ProExpertProg)
- 65ef5fd Merge branch 'main' into luka/custom-op-matching-2 (ProExpertProg)
- d4fe977 Fix NamedTuple (ProExpertProg)
- 6319e39 Update test durations (ProExpertProg)
- e34d36d More tweaking of precision (ProExpertProg)
- f72ee43 Split original pr (ilmarkov)
- c4c0215 Update bench (ilmarkov)
- 309d79e Update threshold configuration (ilmarkov)
- afcfd73 Move all_reduce from custom op in fused_moe (ilmarkov)
- 0248dcd Linter fixes (ilmarkov)
- 18e4771 Upd (ilmarkov)
- 1debd8e Merge branch 'main' into imarkov/fused_allreduce_torch_native (ilmarkov)
- 9516d2b Upd after review (ilmarkov)
- b789044 Update fused_moe (ilmarkov)
- 4001935 Merge branch 'main' into imarkov/fused_allreduce_torch_native (ilmarkov)
- 6077616 Address comments (ilmarkov)
- afc8af8 Remove bench_compile (ilmarkov)
- c3af2af Split PR. Second part. Compile ranges (ilmarkov)
- 0cbb065 Remove general shape graph (ilmarkov)
- d5392f5 Add test to test pipeline (ilmarkov)
- 027c9eb Fix pre-commit (ilmarkov)
- b2992d3 Upd (ilmarkov)
- 3499384 Upd config (ilmarkov)
- 5336ee6 Fix (ilmarkov)
- 4958474 Priotitize compile_sizes (ilmarkov)
- 04306ed Fix inductor config (ilmarkov)
- 9dc4eea Laith's fix (ilmarkov)
- 2c63f0b Upd (ilmarkov)
- 67f7ae1 Update config (ilmarkov)
- 8b8d01d Merge branch 'imarkov/fused_allreduce_torch_native' into imarkov/cond… (ilmarkov)
- fcebc21 Add caching (ilmarkov)
- 65151bc Address comments (ilmarkov)
- 1f7afdb Add debug log (ilmarkov)
- 8da1585 Merge branch 'main' into imarkov/fused_allreduce_torch_native (ilmarkov)
- df22202 Update benchmark (ilmarkov)
- a21de2b Fix (ilmarkov)
- 45f4093 Update bench and constants (ilmarkov)
- c26e056 Rename in benchmark (ilmarkov)
- 1bee5a6 Merge branch 'main' into imarkov/fused_allreduce_torch_native (ilmarkov)
- bcc0cc0 Add max_token_num to object (ilmarkov)
- 43b163c Add test (ilmarkov)
- 71c6b72 Update comments (ilmarkov)
- ada24e6 Merge branch 'imarkov/fused_allreduce_torch_native' into imarkov/cond… (ilmarkov)
- 6766e4f Update fakify for compile sizes (ilmarkov)
- af87d7a Linter fix (ilmarkov)
- 56273da Merge branch 'main' into imarkov/fused_allreduce_torch_native (ilmarkov)
- 459f71c Merge branch 'imarkov/fused_allreduce_torch_native' into imarkov/cond… (ilmarkov)
- 2785e4d Minor updates (ilmarkov)
- 1f83a66 Merge branch 'main' into imarkov/fused_allreduce_torch_native (ProExpertProg)
- b4c1b1d Address the review (ilmarkov)
- ab33605 Merge branch 'main' into imarkov/fused_allreduce_torch_native (robertgshaw2-redhat)
- 3fac39b Merge branch 'main' into imarkov/fused_allreduce_torch_native (ilmarkov)
- b0a3884 Fix SP (ilmarkov)
- a3e7bdc Merge branch 'imarkov/fused_allreduce_torch_native' into imarkov/cond… (ilmarkov)
- a810969 Merge branch 'main' into imarkov/conditional_compilation_ranges (ilmarkov)
- 319abd5 Remove dynamic shape (ilmarkov)
- d168de0 Make ranges inclusive-inclusive (ilmarkov)
- b65e752 Merge branch 'main' into imarkov/conditional_compilation_ranges (ilmarkov)
- af10400 Merge branch 'main' into imarkov/conditional_compilation_ranges (ilmarkov)
- 6c05919 Add test for inductor cache hits (ilmarkov)
- 03637e7 Merge branch 'main' into imarkov/conditional_compilation_ranges (ilmarkov)
- 3f72483 Address comments (ilmarkov)
- 9b00ebc Address comments (ilmarkov)
- 8a40ac6 Update test (ilmarkov)
- ef05682 Address comments (ilmarkov)
- 63af962 Merge branch 'main' into imarkov/conditional_compilation_ranges (ilmarkov)
- 7647089 Merge branch 'main' into imarkov/conditional_compilation_ranges (ilmarkov)
- ee89388 Update test utils (ilmarkov)
- 925e87d Fix pre-commit after merge (ilmarkov)
- 809e170 Fix tests (ilmarkov)
- e07c939 Add fixture instead of decorator (ilmarkov)
- f4db45c Fix re-used compilation config (ilmarkov)
- 97a8d58 Merge branch 'main' into imarkov/conditional_compilation_ranges (ilmarkov)
- 4f280ce Fix e2e (ilmarkov)
- f714957 Merge branch 'main' into imarkov/conditional_compilation_ranges (ilmarkov)
- b27f89d Fix e2e adapt to number of compile ranges (ilmarkov)
- eedc70e Merge branch 'main' into imarkov/conditional_compilation_ranges (ilmarkov)
- cc8f2f8 Slight fix of test (ilmarkov)
- d1dd4db Fix tests after refactor (ilmarkov)
- a2b67a4 Simplify (ilmarkov)
- 0776364 Address comments (ilmarkov)
- 42bf355 Merge branch 'main' into imarkov/conditional_compilation_ranges (ilmarkov)
- ca832fc Merge remote-tracking branch 'upstream/main' into imarkov/conditional… (ProExpertProg)
- ba90b9e Only warm up model if mode=VLLM_COMPILE (ProExpertProg)
- 771203f Fix capture-sizes (ProExpertProg)
- 0e0eab9 Fix doc range (ProExpertProg)
- 3d2c36b pre-commit (ProExpertProg)
- 18ff16e Fix types for precommit (ProExpertProg)
- 6bc8258 Update vllm/v1/worker/gpu_worker.py (ProExpertProg)
- c43458b Merge remote-tracking branch 'upstream/main' into imarkov/conditional… (ProExpertProg)
- f4c0ae7 Check that the pass was skipped in other range (ProExpertProg)
New test file added in this PR (104 lines):

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from torch import fx as fx
from torch import nn

# This import automatically registers `torch.ops.silly.attention`
import tests.compile.silly_attention  # noqa
from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile
from vllm.compilation.inductor_pass import (
    InductorPass,
    get_pass_context,
)
from vllm.config import (
    VllmConfig,
    set_current_vllm_config,
)
from vllm.config.compilation import CompilationConfig, CompilationMode
from vllm.config.scheduler import SchedulerConfig
from vllm.config.utils import Range
from vllm.forward_context import set_forward_context

BATCH_SIZE = 64
MLP_SIZE = 128


@support_torch_compile
class TestModel(nn.Module):
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> None:
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + x
        attn_output = torch.empty_like(x)
        torch.ops.silly.attention(x, x, x, attn_output)
        x = attn_output
        x = x * 3
        return x


@torch.inference_mode
def run_model(vllm_config: VllmConfig, model: nn.Module, batch_sizes: list[int]):
    with set_forward_context({}, vllm_config=vllm_config):
        model(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())
        for batch_size in batch_sizes:
            model(torch.randn(batch_size, MLP_SIZE).cuda())


class PostGradPassManagerCheckRanges(InductorPass):
    def __init__(self, ranges: list[Range]):
        self.ranges = ranges
        self.num_calls = 0

    def __call__(self, graph: fx.Graph):
        compile_range = get_pass_context().compile_range
        assert compile_range in self.ranges, (
            f"Compile range {compile_range} not in {self.ranges}"
        )
        self.num_calls += 1

    def uuid(self) -> str:
        state = {
            "ranges": [str(range) for range in self.ranges],
            "current_compile_range": str(get_pass_context().compile_range),
        }
        return InductorPass.hash_dict(state)


def test_compile_ranges():
    post_grad_pass_manager = PostGradPassManagerCheckRanges(
        [
            Range(start=1, end=8),
            Range(start=8, end=32),
            Range(start=32, end=8193),
        ]
    )
    vllm_config = VllmConfig(
        scheduler_config=SchedulerConfig(
            max_num_batched_tokens=8192,
        ),
        compilation_config=CompilationConfig(
            mode=CompilationMode.VLLM_COMPILE,
            compile_ranges_split_points=[8, 32],
            inductor_compile_config={
                "post_grad_custom_post_pass": post_grad_pass_manager,
                # Disable inductor cache to get the number of passes correctly
                "force_disable_caches": True,
            },
        ),
    )

    with set_current_vllm_config(vllm_config):
        model = TestModel(vllm_config=vllm_config, prefix="").eval().cuda()
        batch_sizes = [1, 4, 16, 24, 48, 64]

        # TestModel has support_torch_compile
        with compilation_counter.expect(
            num_graphs_seen=1,
            num_piecewise_graphs_seen=1,
            num_backend_compilations=3,
            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
        ):
            run_model(vllm_config, model, batch_sizes)
    assert post_grad_pass_manager.num_calls == 3
```
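One subtlety in the test's pass: `uuid()` folds the current compile range into the hashed state, presumably so that Inductor's cache key differs per range and an artifact compiled for one range is never reused for another. Below is a standalone sketch of this kind of stable dict hashing; `hash_dict` here is an invented stand-in, and vLLM's actual `InductorPass.hash_dict` may be implemented differently.

```python
import hashlib
import json


def hash_dict(d: dict) -> str:
    # Canonical JSON serialization (sorted keys) followed by SHA-256 gives a
    # stable fingerprint of the pass state: equal dicts always hash equal,
    # any change to the state changes the digest.
    encoded = json.dumps(d, sort_keys=True).encode()
    return hashlib.sha256(encoded).hexdigest()


uuid_a = hash_dict({"ranges": ["[1, 8)"], "current_compile_range": "[1, 8)"})
uuid_b = hash_dict({"ranges": ["[1, 8)"], "current_compile_range": "[8, 32)"})
assert uuid_a != uuid_b  # different compile ranges -> different cache keys
```

Because the serialization sorts keys, key insertion order is irrelevant, while any change to the range strings produces a new key and forces a fresh compile.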