import torch
import torch.nn as nn
from torch.cuda import CUDAGraph
- from torch.utils._pytree import TreeSpec, tree_flatten
+ from torch.fx._pytree import tree_flatten_spec
+ from torch.utils._pytree import PyTree, TreeSpec, tree_flatten

from tensorrt_llm._torch.autotuner import autotune

from ...utils.cuda_graph import CudaGraphWarmUpPhase
from ...utils.logger import ad_logger
- from ..compiler import BackendCompiler, BackendRegistry, _flatten_args
+ from ..compiler import CompileBackendRegistry, CompilerBackend
+
+
+ def _args_kwargs_flatten_spec(in_spec: TreeSpec, *args, **kwargs) -> List[Any]:
+     """Flatten inputs according to provided in_spec."""
+     all_args: PyTree = (args, kwargs)
+     return tree_flatten_spec(all_args, in_spec)
+
+
+ def _args_kwargs_flatten(*args, **kwargs) -> Tuple[List[Any], TreeSpec]:
+     """Flatten inputs and return flattened inputs together with the TreeSpec."""
+     all_args: PyTree = (args, kwargs)
+     return tree_flatten(all_args)


class CapturedGraph(nn.Module):
    def __init__(
        self,
        model: nn.Module,
-         in_spec: TreeSpec,
-         out_spec: TreeSpec,
        cuda_graph_batch_sizes: List[int],
-         num_batched_inputs: Optional[int] = 1,  # number of batched, dynamic inputs...
+         num_batched_inputs: int,  # number of batched, dynamic inputs...
    ):
        super().__init__()
-         self._in_spec = in_spec
-         self._out_spec = out_spec
        self.model = model
        self.cuda_graph_max_batch_size = max(cuda_graph_batch_sizes)
        ad_logger.info(f"Setting {self.cuda_graph_max_batch_size=}")
        self.num_batched_inputs = num_batched_inputs if num_batched_inputs is not None else 1
-         self.graphs: Dict[Tuple[int, ...], CUDAGraph] = {}
+         self.cudagraphs: Dict[Tuple[int, ...], CUDAGraph] = {}
        self._input_buffers: List[torch.Tensor] = [
            torch.empty(0, 1) for _ in range(self.num_batched_inputs)
        ]
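The new module-level helpers split flattening into a spec-recording first pass (`_args_kwargs_flatten`) and a spec-reusing hot path (`_args_kwargs_flatten_spec`). A minimal standalone sketch of that round trip; the tensor values and shapes are illustrative:

import torch
from torch.fx._pytree import tree_flatten_spec
from torch.utils._pytree import tree_flatten

# First call: flatten (args, kwargs) once and record the structure.
args, kwargs = (torch.ones(4, 8),), {"scale": 2.0}
flat, in_spec = tree_flatten((args, kwargs))

# Later calls with the same structure reuse the recorded spec,
# skipping spec construction on the hot path.
flat_again = tree_flatten_spec(((torch.zeros(2, 8),), {"scale": 3.0}), in_spec)
assert len(flat) == len(flat_again) == 2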
@@ -39,6 +48,10 @@ def __init__(
        self.cuda_graph_batch_sizes = sorted(cuda_graph_batch_sizes, reverse=True)
        self._cuda_graph_mem_pool = None

+         # store the in_spec and out_spec during graph capture
+         self._in_spec = None
+         self._out_spec = None
+
    def _get_hash(self, flat_args: List[Any]) -> Tuple[int, ...]:
        return tuple(hash(a) for a in flat_args)

@@ -67,8 +80,7 @@ def _capture_one_graph(self, *args, **kwargs) -> torch.cuda.CUDAGraph:
        # compute output
        out = self.model(*args, **kwargs)
        # write out into output buffer up to out batch size
-         out_flat, out_spec = tree_flatten(out)
-         assert out_spec == self._out_spec, "Output spec mismatch."
+         out_flat = tree_flatten_spec(out, self._out_spec)
        for o_buffer, o in zip(self._out_buffer_flat, out_flat):
            o_buffer[: o.shape[0]] = o
        torch.cuda.synchronize()
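The rest of `_capture_one_graph` falls outside this hunk; it presumably follows the standard `torch.cuda.graph` capture pattern and shares one memory pool across captures via `self._cuda_graph_mem_pool`. A hedged sketch of that pattern, not the actual elided code:

graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph, pool=self._cuda_graph_mem_pool):
    out = self.model(*args, **kwargs)  # kernels are recorded here, not executed
self._cuda_graph_mem_pool = self._cuda_graph_mem_pool or graph.pool()
return graph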
@@ -77,8 +89,11 @@ def _capture_one_graph(self, *args, **kwargs) -> torch.cuda.CUDAGraph:

    def capture_graph(self, *args, **kwargs):
        """Capture and pre-fetch the graph for variable batch size."""
-         # flatten args, kwargs
-         all_args_flat = _flatten_args(self._in_spec, *args, **kwargs)
+         # check this is the first time we capture the graph
+         assert not self.cudagraphs, "Graphs already captured."
+
+         # flatten args, kwargs for the first time and record in_spec
+         all_args_flat, self._in_spec = _args_kwargs_flatten(*args, **kwargs)

        # extract the batched input tensors
        args_batched = all_args_flat[: self.num_batched_inputs]
@@ -96,10 +111,8 @@ def capture_graph(self, *args, **kwargs):
            f"than the max_batch_size? It will fall back to non-CUDA graph forward pass for "
            f"batch sizes exceeding the max_batch_size."
        )
-         msg_ndim = "Expecting at least a 2D for batched input tensors."
        if any(self.cuda_graph_max_batch_size < input.shape[0] for input in args_batched):
            ad_logger.info(msg_bs)
-         assert all(input.ndim > 1 for input in args_batched), msg_ndim

        # repeat the batched input tensors to the cuda_graph_max_batch_size
        self._input_buffers = [
@@ -111,11 +124,11 @@ def capture_graph(self, *args, **kwargs):
        args, kwargs = self._in_spec.unflatten(self._input_buffers + args_static)

        # capture output once with cuda_graph_max_batch_size to capture output buffers
+         # store the out_spec at this point
        with CudaGraphWarmUpPhase():
            ad_logger.info(f"Warm up with {self.cuda_graph_max_batch_size=} before graph capture")
            out = self.model(*args, **kwargs)
-         self._out_buffer_flat, out_spec = tree_flatten(out)
-         assert out_spec == self._out_spec, "Output spec mismatch."
+         self._out_buffer_flat, self._out_spec = tree_flatten(out)

        # capture graph now for a range of batch sizes
        for bs in self.cuda_graph_batch_sizes:
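The body of this capture loop is elided between hunks. Presumably each batch size takes a bs-sized view of the shared input buffers, unflattens it with the recorded `in_spec`, and stores the captured graph under its concatenated-shape key. A hypothetical sketch; the exact key format is an assumption inferred from `forward` below:

for bs in self.cuda_graph_batch_sizes:
    # Hypothetical loop body: capture on a bs-sized slice of the shared buffers.
    inputs_truncated = [buf[:bs] for buf in self._input_buffers]
    args_bs, kwargs_bs = self._in_spec.unflatten(inputs_truncated + args_static)
    combined_shape = sum((tuple(t.shape) for t in inputs_truncated), start=())
    self.cudagraphs[combined_shape] = self._capture_one_graph(*args_bs, **kwargs_bs)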
@@ -132,7 +145,7 @@ def capture_graph(self, *args, **kwargs):
    def forward(self, *args, **kwargs) -> Any:
        """Run the compiled graph."""
        # flatten args, kwargs
-         all_args_flat = _flatten_args(self._in_spec, *args, **kwargs)
+         all_args_flat = _args_kwargs_flatten_spec(self._in_spec, *args, **kwargs)

        # extract the batched input tensors
        args_batched = all_args_flat[: self.num_batched_inputs]
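The computation of `rounded_shapes` is elided between these hunks; presumably each batched input's shape is rounded up to the smallest captured batch size that fits, falling back to the true shape when none fits. A hypothetical helper to illustrate the idea (the name `_round_up_shape` is not from the source):

# Hypothetical: round a batched tensor's shape up to the smallest captured size that fits.
def _round_up_shape(t: torch.Tensor, sizes: List[int]) -> Tuple[int, ...]:
    fitting = [s for s in sizes if s >= t.shape[0]]
    bs = min(fitting) if fitting else t.shape[0]
    return (bs, *t.shape[1:])

rounded_shapes = [_round_up_shape(t, self.cuda_graph_batch_sizes) for t in args_batched]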
@@ -150,30 +163,44 @@ def forward(self, *args, **kwargs) -> Any:
        combined_shape = sum(rounded_shapes, start=())

        # regular forward for non-matching shapes
-         if combined_shape not in self.graphs:
+         if combined_shape not in self.cudagraphs:
            return self.model(*args, **kwargs)

        # copy inputs to input buffers
        for i, input_tensor in enumerate(args_batched):
            self._input_buffers[i][: input_tensor.shape[0]].copy_(input_tensor, non_blocking=True)

        # run forward pass via graph
-         self.graphs[combined_shape].replay()
+         self.cudagraphs[combined_shape].replay()

        # retrieve output from buffer, cut to batch size, and unflatten
        bs = args_batched[0].shape[0]
        out_flat = [o_b[:bs].detach().clone() for o_b in self._out_buffer_flat]
        return self._out_spec.unflatten(out_flat)


- @BackendRegistry.register("torch-cudagraph")
- class TorchCudagraphCompiler(BackendCompiler):
+ @CompileBackendRegistry.register("torch-cudagraph")
+ class TorchCudagraphCompiler(CompilerBackend):
    """Compiler that uses only CUDA graphs."""

-     def __init__(self, *args, **kwargs):
-         super().__init__(*args, **kwargs)
-         requested = self.compiler_kwargs.get("cuda_graph_batch_sizes")
-         if not requested:
+     def __init__(
+         self,
+         *args_for_init,
+         cuda_graph_batch_sizes: Optional[List[int]] = None,
+         num_batched_inputs: int = 1,
+         max_batch_size: Optional[int] = None,
+         **kwargs_for_init,
+     ):
+         super().__init__(*args_for_init, **kwargs_for_init)
+
+         # heuristic to identify max batch size
+         assert max_batch_size or cuda_graph_batch_sizes, (
+             "At least one of max_batch_size or cuda_graph_batch_sizes must be provided."
+         )
+         self.max_batch_size = max_batch_size or max(cuda_graph_batch_sizes)
+
+         self.num_batched_inputs = num_batched_inputs
+         if not cuda_graph_batch_sizes:
            # Use heuristic which includes commonly-used sizes like 1 and max_bs
            self.cuda_graph_batch_sizes = self._get_graph_batch_sizes(self.max_batch_size)
            ad_logger.info(f"Using heuristic cuda_graph_batch_sizes: {self.cuda_graph_batch_sizes}")
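`_get_graph_batch_sizes` itself is not shown in this diff. One plausible reading of the heuristic the comment describes, padding the commonly-used sizes 1 and max_bs with powers of two, is sketched below; the actual implementation may differ:

@staticmethod
def _get_graph_batch_sizes(max_bs: int) -> List[int]:
    # Hypothetical heuristic: 1, powers of two below max_bs, plus max_bs itself.
    sizes = {1, max_bs}
    bs = 2
    while bs < max_bs:
        sizes.add(bs)
        bs *= 2
    # Descending order, matching how the class sorts its batch sizes elsewhere.
    return sorted(sizes, reverse=True)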
@@ -182,39 +209,34 @@ def __init__(self, *args, **kwargs):
            # No point capturing CUDA graphs for batch sizes larger than max_batch_size
            effective = {
                min(max(1, int(b)), int(self.max_batch_size))
-                 for b in requested
+                 for b in cuda_graph_batch_sizes
                if isinstance(b, (int, float)) and b > 0
            }
            self.cuda_graph_batch_sizes = sorted(effective, reverse=True)

            # Log if we clamped any values
-             original_values = [int(b) for b in requested if isinstance(b, (int, float)) and b > 0]
+             original_values = [
+                 int(b) for b in cuda_graph_batch_sizes if isinstance(b, (int, float)) and b > 0
+             ]
            clamped_values = [v for v in original_values if v > self.max_batch_size]
            if clamped_values:
                ad_logger.info(
                    f"Clamped CUDA graph batch sizes {clamped_values} to max_batch_size={self.max_batch_size}"
                )

            ad_logger.info(
-                 f"Using explicit cuda_graph_batch_sizes: requested={requested} "
+                 f"Using explicit cuda_graph_batch_sizes: requested={cuda_graph_batch_sizes} "
                f" -> effective={self.cuda_graph_batch_sizes} "
                f" (clamped to [1, {self.max_batch_size}])"
            )

-     def _init_captured_graph(
-         self, gm: nn.Module, in_spec: TreeSpec, out_spec: TreeSpec
-     ) -> CapturedGraph:
-         return CapturedGraph(
-             gm,
-             in_spec=in_spec,
-             out_spec=out_spec,
-             cuda_graph_batch_sizes=self.cuda_graph_batch_sizes,
-             num_batched_inputs=self.compiler_kwargs.get("num_batched_inputs"),
-         )
-
    @torch.inference_mode()
    def compile(self) -> CapturedGraph:
-         captured_model = self._init_captured_graph(self.gm, self.gm._in_spec, self.gm._out_spec)
+         captured_model = CapturedGraph(
+             self.model,
+             cuda_graph_batch_sizes=self.cuda_graph_batch_sizes,
+             num_batched_inputs=self.num_batched_inputs,
+         )

        # try capturing cudagraph
        if self.args is not None or self.kwargs is not None:
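Taken together, the new constructor and capture flow are self-contained enough for a standalone usage sketch; `CapturedGraph` and its arguments come from this diff, while `MyModel` and the tensor shapes are illustrative:

model = MyModel().cuda().eval()  # hypothetical nn.Module with one batched input
captured = CapturedGraph(model, cuda_graph_batch_sizes=[32, 8, 1], num_batched_inputs=1)

with torch.inference_mode():
    # One-time capture: records in_spec/out_spec and captures one graph per batch size.
    captured.capture_graph(torch.randn(32, 64, device="cuda"))
    # Replays the bs=8 graph; shapes without a captured graph fall back to eager forward.
    out = captured(torch.randn(8, 64, device="cuda"))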