NVIDIA
diff --git a/tensorrt_llm/_torch/auto_deploy/config/default.yaml b/tensorrt_llm/_torch/auto_deploy/config/default.yaml
Lines changed: 4 additions & 2 deletions
diff --git a/tensorrt_llm/_torch/auto_deploy/transform/interface.py b/tensorrt_llm/_torch/auto_deploy/transform/interface.py
Lines changed: 8 additions & 2 deletions
diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/attention.py b/tensorrt_llm/_torch/auto_deploy/transform/library/attention.py
Lines changed: 0 additions & 3 deletions
diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/collectives.py b/tensorrt_llm/_torch/auto_deploy/transform/library/collectives.py
Lines changed: 79 additions & 166 deletions
diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/fused_moe.py b/tensorrt_llm/_torch/auto_deploy/transform/library/fused_moe.py
Lines changed: 3 additions & 3 deletions
@@ -32,6 +32,7 @@ transforms:
     stage: pattern_matcher
   match_repeat_kv:
     stage: pattern_matcher
+    run_shape_prop: true
   match_eager_attention:
     stage: pattern_matcher
   match_grouped_attention:
@@ -111,13 +112,14 @@ transforms:
     enabled: true
   fuse_allreduce_residual_rmsnorm:
     stage: post_load_fusion
-  fuse_collectives:
-    stage: post_load_fusion
+  # TODO (lucaslie): add backend selection as part of configurable inference optimizers
+  # check if we can fuse rmsnorm
   fuse_rmsnorm:
     # TODO (lucaslie): add backend selection as part of configurable inference optimizers
     # check if we can fuse rmsnorm
     stage: post_load_fusion
     backend: flashinfer
+    requires_shape_prop: true
   ############################################################################################
   # SWITCH TO CACHED+FLATTENED ATTENTION + INITIALIZE CACHES
   ############################################################################################
 
@@ -5,6 +5,7 @@
 
 import time
 from abc import ABC, abstractmethod
+from contextlib import nullcontext
 from enum import Enum
 from functools import total_ordering, wraps
 from typing import Any, Callable, Dict, Mapping, Tuple, Type, Union, final
@@ -19,6 +20,7 @@
     canonicalize_graph,
     lift_to_meta,
     named_graphmodules,
+    placeholders_on_meta,
     run_shape_prop,
 )
 from ..utils.logger import ad_logger
@@ -416,11 +418,13 @@ def _run_pre_cleanup(self, gm: GraphModule, info: TransformInfo) -> Tuple[bool,
         is_clean = info.is_clean
         has_valid_shapes = is_clean and info.has_valid_shapes
 
+        use_meta = isinstance(gm, GraphModule) and placeholders_on_meta(gm)
+
         # check if run cleanup depending on the config and info
         if self.config.requires_shape_prop and not has_valid_shapes:
             self._log_info("running pre-cleanup with shape_prop")
             canonicalize_graph(gm)
-            with lift_to_meta(gm):
+            with lift_to_meta(gm) if use_meta else nullcontext():
                 run_shape_prop(gm)
             is_clean = True
             has_valid_shapes = True
@@ -444,11 +448,13 @@ def _run_post_cleanup(self, gm: GraphModule, info: TransformInfo) -> TransformIn
         if not self.config.run_graph_cleanup:
             return info
 
+        use_meta = isinstance(gm, GraphModule) and placeholders_on_meta(gm)
+
         # check if run cleanup depending on the config and info
         if self.config.run_shape_prop and not (info.is_clean and info.has_valid_shapes):
             self._log_info("running post-cleanup with shape_prop")
             canonicalize_graph(gm)
-            with lift_to_meta(gm):
+            with lift_to_meta(gm) if use_meta else nullcontext():
                 run_shape_prop(gm)
         elif self.config.run_graph_cleanup and not info.is_clean:
             self._log_info("running post-cleanup (no shape_prop)")
 
@@ -303,9 +303,6 @@ def register_repeat_kv(patterns: ADPatternMatcherPass):
 
         num_kv_patterns = _apply_pattern(gm, "Repeat KV", register_repeat_kv)
 
-        if num_kv_patterns > 0:
-            self.config.run_shape_prop = True
-
         info = TransformInfo(
             skipped=False,
             num_matches=num_kv_patterns,
 
@@ -1,13 +1,11 @@
-import operator
 from typing import Tuple
 
 import torch
 from torch.fx import GraphModule
 
-from ...distributed.trtllm import is_trtllm_op_available
 from ...models.factory import ModelFactory
 from ...shim.interface import CachedSequenceInterface
-from ...utils.node_utils import get_op_overload_packet, get_user_if_pattern_match, is_op
+from ...utils.pattern_matcher import ADPatternMatcherPass, register_ad_pattern
 from ..interface import BaseTransform, SharedConfig, TransformInfo, TransformRegistry
 
 # TODO: This is an overly simplified model that works well for vanilla Llama models.
@@ -18,187 +16,102 @@
 # * ...
 
 
-@TransformRegistry.register("fuse_collectives")
-class FuseCollectives(BaseTransform):
+def _allreduce_residual_rmsnorm_pattern(
+    x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, eps: float = 0.1253
+):
     """
-    Fuses all_reduce ops with preceding (quantized) linear ops into a single fused node for improved performance.
+    Reference PyTorch composition of:
+        y = all_reduce(x)
+        z = residual + y
+        normed = RMSNorm(z, weight, eps)
+    Returns (normed, z)
     """
 
-    def _apply(
-        self,
-        gm: GraphModule,
-        cm: CachedSequenceInterface,
-        factory: ModelFactory,
-        shared_config: SharedConfig,
-    ) -> Tuple[GraphModule, TransformInfo]:
-        num_gemm_collective_fusions = 0
-
-        # lookup for fused ops
-        # TODO: avoid this hardcoded lookup, e.g., by generating fused ops on the fly.
-        lookup = {
-            torch.ops.auto_deploy.torch_linear_simple: torch.ops.auto_deploy.trtllm_dist_fused_linear_all_reduce,
-            torch.ops.aten.linear: torch.ops.auto_deploy.trtllm_dist_fused_linear_all_reduce,
-            torch.ops.auto_deploy.torch_quant_fp8_linear: torch.ops.auto_deploy.torch_quant_fused_fp8_linear_all_reduce,
-        }
-
-        # go through all nodes and find all_reduce nodes
-        for node in gm.graph.nodes:
-            if not is_op(node, torch.ops.auto_deploy.torch_dist_all_reduce):
-                continue
-
-            # check if args are as expected
-            assert len(node.args) == 1 and not len(node.kwargs), (
-                "Unexpected args/kwargs for all_reduce"
-            )
-
-            # retrieve parent and check a few conditions on the parent node
-            parent_node = node.args[0]
-            if not is_op(parent_node, lookup.keys()):
-                continue
-            if len(parent_node.users) > 1:
-                continue
-
-            with gm.graph.inserting_before(node):
-                # insert fused node
-                fused_linear_collective_node = gm.graph.call_function(
-                    lookup[get_op_overload_packet(parent_node.target)],
-                    args=parent_node.args,
-                    kwargs=parent_node.kwargs,
-                )
-            node.replace_all_uses_with(fused_linear_collective_node)
-            gm.graph.erase_node(node)
-            gm.graph.erase_node(parent_node)
-            num_gemm_collective_fusions += 1
+    input_dtype = x.dtype
+    hidden_states = torch.ops.auto_deploy.torch_dist_all_reduce(x)
+    add = residual + hidden_states
 
-        info = TransformInfo(
-            skipped=False,
-            num_matches=num_gemm_collective_fusions,
-            is_clean=False,
-            has_valid_shapes=False,
-        )
+    hidden_states = add.to(torch.float32)
+    variance = hidden_states.pow(2).mean(-1, keepdim=True)
+    hidden_states = hidden_states * torch.rsqrt(variance + eps)
 
-        return gm, info
+    normed = weight * hidden_states.to(input_dtype)
 
+    return normed, add
 
-@TransformRegistry.register("fuse_allreduce_residual_rmsnorm")
-class FuseAllreduceResidualRMSNorm(BaseTransform):
-    """Essentially, this transformation fuses the following operators into one allreduce trtllm implementation.
-
-    * target pattern:
-        x = all_reduce(x)
-        y = x + residual
-        return rmsnorm(y), y
-    * replacement:
-        fused_allreduce_residual_rmsnorm(x, residual, rmsnorm_weight, rmsnorm_eps)
 
+def _allreduce_residual_rmsnorm_pattern2(
+    x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, eps: float = 0.1253
+):
+    """
+    Reference PyTorch composition of:
+        y = all_reduce(x)
+        z = y + residual
+        normed = RMSNorm(z, weight, eps)
+    Returns (normed, z)
     """
 
+    input_dtype = x.dtype
+    hidden_states = torch.ops.auto_deploy.torch_dist_all_reduce(x)
+    add = hidden_states + residual
+
+    hidden_states = add.to(torch.float32)
+    variance = hidden_states.pow(2).mean(-1, keepdim=True)
+    hidden_states = hidden_states * torch.rsqrt(variance + eps)
+
+    normed = weight * hidden_states.to(input_dtype)
+
+    return normed, add
+
+
+def _allreduce_residual_rmsnorm_repl(
+    x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, eps: float
+):
+    return torch.ops.dist.fused_allreduce_residual_rmsnorm(x, residual, weight, eps)
+
+
+@TransformRegistry.register("fuse_allreduce_residual_rmsnorm")
+class FuseAllreduceResidualRMSNorm(BaseTransform):
+    """Fuse (allreduce + residual add + RMSNorm) into one fused op with tuple output."""
+
     def _apply(
         self,
         gm: GraphModule,
         cm: CachedSequenceInterface,
         factory: ModelFactory,
         shared_config: SharedConfig,
     ) -> Tuple[GraphModule, TransformInfo]:
-        if not is_trtllm_op_available():
-            return gm, TransformInfo(
-                skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True
-            )
-
-        num_ar_r_rms_fusions = 0
-
-        def trace_and_fuse(allreduce_node, graph):
-            # Check if all_reduce is followed by addition
-            users = list(allreduce_node.users.keys())
-            if len(users) != 1:
-                return  # Skip if all_reduce has more than one consumer
-            add_node = users[0]
-
-            # Traverse nodes for RMSNorm pattern which is composed of to_copy, pow, mean, add, refer
-            # the Huggingface LlamaRMSNorm implementation as example for more details
-            to_copy_1 = get_user_if_pattern_match(add_node, [torch.ops.aten.add, operator.add], 2)
-            # operand of pow and mul
-            pow_node = get_user_if_pattern_match(
-                to_copy_1, [torch.ops.aten._to_copy, torch.ops.aten.to], 2
-            )
-            mean_node = get_user_if_pattern_match(pow_node, torch.ops.aten.pow, 1)
-            add_eps_node = get_user_if_pattern_match(mean_node, torch.ops.aten.mean, 1)
-            rsqrt_node = get_user_if_pattern_match(
-                add_eps_node, [torch.ops.aten.add, operator.add], 1
-            )
-            mul_node_1 = get_user_if_pattern_match(rsqrt_node, torch.ops.aten.rsqrt, 1)
-            to_copy_2 = get_user_if_pattern_match(mul_node_1, torch.ops.aten.mul, 1)
-            mul_node_2 = get_user_if_pattern_match(
-                to_copy_2, [torch.ops.aten._to_copy, torch.ops.aten.to], 1
-            )
-            # check args of ops: pow(2) and mean(-1)
-            ARGS_MATCH = pow_node is not None and pow_node.args[1] == 2  # exponent
-            ARGS_MATCH &= mean_node is not None and mean_node.args[1] == [-1]  # dimensions
-
-            # Match found: Replace with fused operation
-            if (
-                to_copy_1
-                and pow_node
-                and mean_node
-                and add_eps_node
-                and rsqrt_node
-                and mul_node_1
-                and to_copy_2
-                and mul_node_2
-                and ARGS_MATCH
-            ):
-                # Gather the inputs for the custom operation
-                tensor = allreduce_node.args[0]
-                # Identify the residual argument in the add operation
-                # One of the args in add_node.args is the output of all_reduce
-                # The same idea also applies to norm_weight
-                residual = (
-                    add_node.args[0] if add_node.args[1] is allreduce_node else add_node.args[1]
-                )
-                norm_weight = (
-                    mul_node_2.args[0] if mul_node_2.args[1] is to_copy_2 else mul_node_2.args[1]
-                )
-                eps = add_eps_node.args[1]
-
-                # Insert nodes
-                with graph.inserting_before(allreduce_node):
-                    fused_node = graph.call_function(
-                        torch.ops.dist.fused_allreduce_residual_rmsnorm,
-                        args=(
-                            tensor,
-                            residual,
-                            norm_weight,
-                            eps,
-                        ),
-                    )
-                    # Extract outputs from the tuple returned by `fused_node`
-                    final_output_node = gm.graph.create_node(
-                        "call_function",
-                        target=operator.getitem,
-                        args=(fused_node, 0),
-                    )
-                    add_output_node = gm.graph.create_node(
-                        "call_function",
-                        target=operator.getitem,
-                        args=(fused_node, 1),
-                    )
-
-                    # Replace all uses of rmsnorm_node with final_output_node
-                    mul_node_2.replace_all_uses_with(final_output_node)
-
-                    # Replace all uses of add_node with add_output_node
-                    add_node.replace_all_uses_with(add_output_node)
-
-                nonlocal num_ar_r_rms_fusions
-                num_ar_r_rms_fusions += 1
-
-        # Traverse all nodes
-        for node in gm.graph.nodes:
-            if is_op(node, torch.ops.auto_deploy.torch_dist_all_reduce):
-                trace_and_fuse(allreduce_node=node, graph=gm.graph)
+        patterns = ADPatternMatcherPass()
+
+        # Dummy shapes for tracing
+        bsz, hidden = 8, 512
+        dummy_args = [
+            torch.randn(bsz, hidden, device="meta", dtype=torch.bfloat16),  # x
+            torch.randn(bsz, hidden, device="meta", dtype=torch.bfloat16),  # residual
+            torch.randn(hidden, device="meta", dtype=torch.bfloat16),  # weight
+            0.1253,  # eps
+        ]
+
+        register_ad_pattern(
+            search_fn=_allreduce_residual_rmsnorm_pattern,
+            replace_fn=_allreduce_residual_rmsnorm_repl,
+            patterns=patterns,
+            dummy_args=dummy_args,
+            op_ignore_types={torch.ops.aten.to.dtype: (torch.dtype,)},
+            scalar_workaround={"eps": 0.1253},
+        )
+        register_ad_pattern(
+            search_fn=_allreduce_residual_rmsnorm_pattern2,
+            replace_fn=_allreduce_residual_rmsnorm_repl,
+            patterns=patterns,
+            dummy_args=dummy_args,
+            op_ignore_types={torch.ops.aten.to.dtype: (torch.dtype,)},
+            scalar_workaround={"eps": 0.1253},
+        )
+
+        num_matches = patterns.apply(gm.graph)
 
         info = TransformInfo(
-            skipped=False, num_matches=num_ar_r_rms_fusions, is_clean=False, has_valid_shapes=False
+            skipped=False, num_matches=num_matches, is_clean=False, has_valid_shapes=False
         )
-
         return gm, info
@@ -500,7 +500,7 @@ def target_op(self):
         return torch.ops.auto_deploy.torch_linear_simple
 
     def moe_op(self):
-        return torch.ops.auto_deploy.torch_moe
+        return torch.ops.auto_deploy.torch_moe.default
 
     def scale_arg_indices(self) -> Dict[str, int]:
         return {}
@@ -517,7 +517,7 @@ def target_op(self):
         return torch.ops.auto_deploy.torch_quant_fp8_linear
 
     def moe_op(self):
-        return torch.ops.auto_deploy.torch_quant_fp8_moe
+        return torch.ops.auto_deploy.torch_quant_fp8_moe.default
 
     def scale_arg_indices(self) -> Dict[str, int]:
         return {"input_scale": 3, "weight_scale": 4}
@@ -534,7 +534,7 @@ def target_op(self):
         return torch.ops.auto_deploy.torch_quant_nvfp4_linear
 
     def moe_op(self):
-        return torch.ops.auto_deploy.torch_quant_nvfp4_moe
+        return torch.ops.auto_deploy.torch_quant_nvfp4_moe.default
 
     def scale_arg_indices(self) -> Dict[str, int]:
         return {"input_scale": 3, "weight_scale": 4, "alpha": 5}