Commit fe89c0b

ALL WORKS
Signed-off-by: Luka Govedič <[email protected]>
1 parent cec037e commit fe89c0b

File tree

2 files changed (+110 -62 lines)

vllm/compilation/fusion.py

Lines changed: 17 additions & 30 deletions
```diff
@@ -17,7 +17,7 @@
 from vllm.platforms import current_platform
 
 from .inductor_pass import enable_fake_mode
-from .matcher_utils import MatcherQuant, MatcherRMSNorm
+from .matcher_utils import MatcherFusedAddRMSNorm, MatcherQuant, MatcherRMSNorm
 from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
 
 logger = init_logger(__name__)
@@ -92,7 +92,8 @@ def __init__(self, epsilon: float, key: FusedRMSQuantKey):
             f"unsupported fused rmsnorm+quant op for {key}"
         self.FUSED_OP = FUSED_OPS[key]
 
-        self.rmsnorm_matcher = MatcherRMSNorm(epsilon)
+        self.rmsnorm_matcher = MatcherRMSNorm(epsilon) if not key.fused_add \
+            else MatcherFusedAddRMSNorm(epsilon)
         self.quant_matcher = MatcherQuant(key.quant)
 
 
```
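
The dispatch above keeps the residual-free and residual-fusing patterns on separate matcher classes. A standalone sketch of the selection, with stand-in stubs for the key and the matchers (everything suffixed `Stub` is illustrative, not part of the diff):

```python
# Stand-in stubs; only the selection expression mirrors the diff above.
from dataclasses import dataclass


@dataclass(frozen=True)
class FusedRMSQuantKeyStub:
    fused_add: bool  # True if the fused kernel also adds a residual input


class MatcherRMSNormStub:
    def __init__(self, epsilon: float) -> None:
        self.epsilon = epsilon


class MatcherFusedAddRMSNormStub(MatcherRMSNormStub):
    pass


def make_matcher(key: FusedRMSQuantKeyStub, epsilon: float = 1e-6):
    # Same shape as the diff: plain matcher unless key.fused_add is set.
    return (MatcherRMSNormStub(epsilon) if not key.fused_add
            else MatcherFusedAddRMSNormStub(epsilon))


assert isinstance(make_matcher(FusedRMSQuantKeyStub(fused_add=True)),
                  MatcherFusedAddRMSNormStub)
```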

```diff
@@ -133,8 +134,8 @@ def replacement(input: torch.Tensor, weight: torch.Tensor,
             return at[1]
 
         inputs = [
-            empty_fp32(5, 4),  # input  # TODO: rms_input
-            empty_bf16(4, ),  # weight
+            # input, weight
+            *self.rmsnorm_matcher.inputs(),
             empty_fp32(1, 1)  # scale
         ]
         pattern(*inputs)
@@ -157,16 +158,16 @@ def __init__(self,
 
     def register(self, pm_pass: PatternMatcherPass):
 
-        def pattern(input: torch.Tensor, residual: torch.Tensor,
-                    weight: torch.Tensor, scale: torch.Tensor):
+        def pattern(input: torch.Tensor, weight: torch.Tensor,
+                    residual: torch.Tensor, scale: torch.Tensor):
             result_rms, residual = self.rmsnorm_matcher(
                 input, weight, residual)
             result, _ = self.quant_matcher(result_rms, scale)
 
             return result, residual
 
-        def replacement(input: torch.Tensor, residual: torch.Tensor,
-                        weight: torch.Tensor, scale: torch.Tensor):
+        def replacement(input: torch.Tensor, weight: torch.Tensor,
+                        residual: torch.Tensor, scale: torch.Tensor):
             # In case we're matching native rms-norm, conversions might be
             # optimized out. We convert here just to be safe.
             input = input.to(dtype=torch.float16)  # TODO model dtype
@@ -185,11 +186,8 @@ def replacement(input: torch.Tensor, residual: torch.Tensor,
             return at[1], at[2]
 
         inputs = [
-            # TODO: maybe 32bit for torch impl? yes to resolve bug
-            # TODO dtype doesn't seem to matter? it does matter for what cvts get traced
-            empty_bf16(5, 4),  # input
-            empty_bf16(5, 4),  # residual
-            empty_bf16(4, ),  # weight
+            # input, weight, residual
+            *self.rmsnorm_matcher.inputs(),
             empty_fp32(1, 1)  # scale
         ]
 
@@ -242,15 +240,10 @@ def replacement(input: torch.Tensor, weight: torch.Tensor):
             # result, scale
             return at[1], at[2]
 
-        inputs = [
-            empty_bf16(5, 4),  # input
-            empty_bf16(4),  # weight
-        ]
-
         pm.register_replacement(
             pattern,
             replacement,
-            inputs,
+            self.rmsnorm_matcher.inputs(),
             pm.fwd_only,
             pm_pass,
         )
@@ -272,16 +265,16 @@ def __init__(self,
 
     def register(self, pm_pass: PatternMatcherPass):
 
-        def pattern(input: torch.Tensor, residual: torch.Tensor,
-                    weight: torch.Tensor):
+        def pattern(input: torch.Tensor, weight: torch.Tensor,
+                    residual: torch.Tensor):
             result_rms, residual = self.rmsnorm_matcher(
                 input, weight, residual)
             result, scale = self.quant_matcher(result_rms)
 
             return result, residual, scale
 
-        def replacement(input: torch.Tensor, residual: torch.Tensor,
-                        weight: torch.Tensor):
+        def replacement(input: torch.Tensor, weight: torch.Tensor,
+                        residual: torch.Tensor):
             # In case we're matching native rms-norm, conversions might be
             # optimized out. We convert here just to be safe.
             input = input.to(dtype=torch.float16)  # TODO model dtype
@@ -301,16 +294,10 @@ def replacement(input: torch.Tensor, residual: torch.Tensor,
             # result, residual, scale
             return at[1], at[3], at[2]
 
-        inputs = [
-            empty_bf16(5, 4),  # input
-            empty_bf16(5, 4),  # residual
-            empty_bf16(4),  # weight
-        ]
-
         pm.register_replacement(
             pattern,
             replacement,
-            inputs,
+            self.rmsnorm_matcher.inputs(),
             pm.fwd_only,
             pm_pass,
         )
```
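
Across all four patterns, the hand-built `empty_bf16`/`empty_fp32` example tensors are replaced by `self.rmsnorm_matcher.inputs()`, so the tracing inputs always carry the dtypes the matcher itself will trace with. A standalone sketch of that contract (the class and shapes are illustrative, not the vLLM API):

```python
# Illustrative sketch: a matcher owns its example-input construction so the
# pattern matcher traces with dtypes consistent with the matched ops.
import torch


class SketchMatcher:
    def __init__(self, model_dtype: torch.dtype, enabled: bool):
        self.model_dtype = model_dtype
        self.enabled = enabled  # custom op traced vs. native fp32 path

    def inputs(self) -> list[torch.Tensor]:
        in_dtype = self.model_dtype if self.enabled else torch.float32
        return [
            torch.empty(5, 16, dtype=in_dtype),       # input
            torch.empty(16, dtype=self.model_dtype),  # weight
        ]


matcher = SketchMatcher(torch.bfloat16, enabled=False)
inputs = [*matcher.inputs(), torch.empty(1, 1, dtype=torch.float32)]  # + scale
print([(tuple(t.shape), t.dtype) for t in inputs])
```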

vllm/compilation/matcher_utils.py

Lines changed: 93 additions & 32 deletions
```diff
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional, Union
+from abc import ABC, abstractmethod
+from typing import Optional
 
 import torch
 from torch._higher_order_ops import auto_functionalized
@@ -31,55 +32,71 @@
 # kNvfp4Quant] = torch.ops._C.scaled_fp4_quant.default  # noqa: E501
 
 
-class MatcherRMSNorm:  # TODO separate residual and not residual
+class MatcherCustomOp(ABC):
 
-    def __init__(self, epsilon: float, enabled: Optional[bool] = None):
-        self.epsilon = epsilon
+    def __init__(self, enabled: bool):
+        self.model_dtype = get_current_vllm_config().model_config.dtype
+
+        self.enabled = enabled
+        self.forward = self.forward_custom if enabled else self.forward_native
+
+    @abstractmethod
+    def forward_custom(self, *args, **kws):
+        pass
+
+    @abstractmethod
+    def forward_native(self, *args, **kws):
+        pass
 
+    def __call__(self, *args, **kws):
+        return self.forward(*args, **kws)
+
+    def empty(self, *args, **kws):
+        return torch.empty(*args, dtype=self.model_dtype, device="cuda", **kws)
+
+    def empty_f32(self, *args, **kws):
+        return torch.empty(*args, dtype=torch.float32, device="cuda", **kws)
+
+
+class MatcherRMSNorm(MatcherCustomOp):
+
+    def __init__(self, epsilon: float, enabled: Optional[bool] = None):
         if enabled is None:
             # TODO either pass config to enabled or set it globally
             # (global during pass init seems reasonable)
             enabled = RMSNorm.enabled()
 
-        self.forward = self.forward_custom if enabled else self.forward_native
-        self.model_dtype = get_current_vllm_config().model_config.dtype
-        print(self.model_dtype)
+        super().__init__(enabled)
+        self.epsilon = epsilon
 
     def inputs(self):
-        return
+        input = self.empty(5, 16) if self.enabled else self.empty_f32(5, 16)
+        weight = self.empty(16, )
+        return [input, weight]
 
     def forward_custom(
         self,
         input: torch.Tensor,
         weight: torch.Tensor,
         residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        if residual is None:
-            result = torch.empty_like(input)
-            _, result = auto_functionalized(
-                RMS_OP,
-                result=result,
-                input=input,
-                weight=weight,
-                epsilon=self.epsilon,
-            )
-
-            return result
-        else:
-            _, result, residual = auto_functionalized(RMS_ADD_OP,
-                                                      input=input,
-                                                      residual=residual,
-                                                      weight=weight,
-                                                      epsilon=self.epsilon)
+    ) -> torch.Tensor:
+        result = torch.empty_like(input)
+        _, result = auto_functionalized(
+            RMS_OP,
+            result=result,
+            input=input,
+            weight=weight,
+            epsilon=self.epsilon,
+        )
 
-            return result, residual
+        return result
 
     def forward_native(
         self,
         input: torch.Tensor,
         weight: torch.Tensor,
         residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+    ) -> torch.Tensor:
         x = input.to(torch.float32)
         if residual is not None:
             x = x + residual
```
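
The new `MatcherCustomOp` base binds `forward` once at construction time: the custom-op graph when the op is enabled, the native decomposition otherwise. A minimal standalone sketch of that dispatch pattern (stub classes, not the vLLM base):

```python
# Minimal sketch of the enabled-based dispatch in MatcherCustomOp: forward is
# chosen once in __init__ rather than branching on every call.
from abc import ABC, abstractmethod


class DispatchSketch(ABC):
    def __init__(self, enabled: bool):
        self.enabled = enabled
        self.forward = self.forward_custom if enabled else self.forward_native

    @abstractmethod
    def forward_custom(self, x): ...

    @abstractmethod
    def forward_native(self, x): ...

    def __call__(self, x):
        return self.forward(x)


class Doubler(DispatchSketch):
    def forward_custom(self, x):
        return 2 * x  # stands in for the fused custom op

    def forward_native(self, x):
        return x + x  # stands in for the traced native decomposition


assert Doubler(enabled=True)(3) == Doubler(enabled=False)(3) == 6
```
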
```diff
@@ -94,13 +111,57 @@ def forward_native(
 
         return x if residual is None else (x, residual)
 
-    def __call__(
+
+class MatcherFusedAddRMSNorm(MatcherCustomOp):
+
+    def __init__(self, epsilon: float, enabled: Optional[bool] = None):
+        if enabled is None:
+            # TODO either pass config to enabled or set it globally
+            # (global during pass init seems reasonable)
+            enabled = RMSNorm.enabled()
+
+        super().__init__(enabled)
+        self.epsilon = epsilon
+
+    def inputs(self):
+        input = self.empty(5, 16) if self.enabled else self.empty_f32(5, 16)
+        weight = self.empty(16, )
+        residual = self.empty(5, 16)
+        return [input, weight, residual]
+
+    def forward_custom(
         self,
         input: torch.Tensor,
         weight: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        return self.forward(input, weight, residual)
+        residual: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        _, result, residual = auto_functionalized(RMS_ADD_OP,
+                                                  input=input,
+                                                  residual=residual,
+                                                  weight=weight,
+                                                  epsilon=self.epsilon)
+
+        return result, residual
+
+    def forward_native(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        residual: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        x = input.to(torch.float32)
+        if residual is not None:
+            x = x + residual
+            residual = x.to(self.model_dtype)
+
+        variance = x.pow(2).mean(dim=-1, keepdim=True)
+
+        x = x * torch.rsqrt(variance + self.epsilon)
+        x = x.to(self.model_dtype)
+        if weight is not None:
+            x = x * weight
+
+        return x if residual is None else (x, residual)
 
 
 class MatcherQuant:
```
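
For reference, the math the new `forward_native` traces is standard fused-add RMSNorm: the residual add happens in fp32, the updated residual is captured in the model dtype, and the output is scaled by `rsqrt(mean(x^2) + eps)`. A self-contained sketch under an assumed bfloat16 model dtype (the helper name is illustrative, not from the diff):

```python
# Standalone sketch of the fused-add RMSNorm reference traced by
# MatcherFusedAddRMSNorm.forward_native, without the vLLM config plumbing.
import torch


def fused_add_rms_norm_ref(
    x: torch.Tensor,
    weight: torch.Tensor,
    residual: torch.Tensor,
    epsilon: float = 1e-6,
    model_dtype: torch.dtype = torch.bfloat16,  # assumed model dtype
) -> tuple[torch.Tensor, torch.Tensor]:
    x = x.to(torch.float32) + residual       # residual add in fp32
    residual = x.to(model_dtype)             # new residual, model dtype
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    x = x * torch.rsqrt(variance + epsilon)  # normalize by RMS
    return x.to(model_dtype) * weight, residual


out, res = fused_add_rms_norm_ref(
    torch.randn(5, 16, dtype=torch.bfloat16),
    torch.ones(16, dtype=torch.bfloat16),
    torch.randn(5, 16, dtype=torch.bfloat16),
)
print(out.shape, res.shape)  # torch.Size([5, 16]) torch.Size([5, 16])
```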
