diff --git a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
index 6cb0c8f49f3d..83423c02a6f1 100644
--- a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
+++ b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
@@ -939,7 +939,7 @@ def replacement(self):
         def _replacement(
             input: torch.Tensor, weight: torch.Tensor
         ) -> tuple[torch.Tensor, torch.Tensor]:
-            residual = torch.empty_like(input)
+            residual = torch.zeros_like(input)
             allreduce = self.FUSED_AR_RMSNORM_OP(
                 input_=input,
                 residual=residual,
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index a5d4e4db79fe..f9897a22c8d5 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -90,7 +90,11 @@ def forward_native(
                 x,
                 self.weight.data if self.pass_weight else None,
                 self.variance_epsilon,
-                self.variance_size_override,
+                *(
+                    (self.variance_size_override,)
+                    if self.variance_size_override is not None
+                    else ()
+                ),
             )
         else:
             return ir.ops.fused_add_rms_norm.maybe_inplace(
@@ -98,7 +102,11 @@
                 residual,
                 self.weight.data if self.pass_weight_add else None,
                 self.variance_epsilon,
-                self.variance_size_override,
+                *(
+                    (self.variance_size_override,)
+                    if self.variance_size_override is not None
+                    else ()
+                ),
             )
 
     def forward_cuda(
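
Note (not part of the diff): below is a minimal, self-contained sketch of the two patterns this change relies on. The names `rms_norm_op` and `forward` are hypothetical stand-ins, not vLLM's real operators; the sketch only illustrates the conditional positional-argument forwarding and the zero-initialized residual buffer in isolation.

```python
# Minimal sketch only -- `rms_norm_op` and `forward` are hypothetical
# stand-ins, not vLLM's real operators.
import torch


def rms_norm_op(
    x: torch.Tensor,
    weight: torch.Tensor | None,
    eps: float,
    variance_size: int | None = None,
) -> torch.Tensor:
    """Stand-in for an op whose trailing positional argument is optional."""
    if variance_size is None:
        variance_size = x.shape[-1]
    # Compute the variance over (a prefix of) the hidden dimension.
    var = x[..., :variance_size].float().pow(2).mean(dim=-1, keepdim=True)
    out = x * torch.rsqrt(var + eps)
    return out * weight if weight is not None else out


def forward(
    x: torch.Tensor,
    weight: torch.Tensor | None,
    eps: float,
    variance_size_override: int | None,
) -> torch.Tensor:
    # Pattern from layernorm.py: forward the optional argument only when it
    # is set, by unpacking either a one-element tuple or an empty tuple, so
    # the callee never receives an explicit positional `None`.
    return rms_norm_op(
        x,
        weight,
        eps,
        *((variance_size_override,) if variance_size_override is not None else ()),
    )


if __name__ == "__main__":
    x = torch.randn(2, 8)
    # Pattern from allreduce_rms_fusion.py: a residual buffer that a fused
    # kernel may read before writing should be zero-initialized;
    # torch.empty_like(x) would hold arbitrary memory contents.
    residual = torch.zeros_like(x)
    print(forward(x + residual, torch.ones(8), 1e-6, None).shape)
    print(forward(x + residual, torch.ones(8), 1e-6, 4).shape)
```

The tuple-unpacking form keeps the call compatible with op signatures that may not declare the trailing argument at all, rather than requiring every backend to accept an explicit `None`. Likewise, `torch.zeros_like` appears to be chosen because the fused allreduce + RMSNorm op is handed the residual buffer and presumably reads it, so it must hold a deterministic value rather than uninitialized memory.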