Commit 59db0d6

gdn tests
1 parent cff4a5c commit 59db0d6

3 files changed: +22 -20 lines changed

fast_llm/layers/common/normalization/normalization.py

Lines changed: 6 additions & 6 deletions
@@ -311,14 +311,14 @@ def __init__(self, config: ConfigType, hidden_dim: TensorDim, lr_scale: float |
         super().__init__(config, hidden_dim, lr_scale)
 
         if rms_norm_gated is not None:
-            self._forward = self._forward_fused
+            self._forward_gated = self._forward_fla
         else:
-            self._forward = self._forward
+            self._forward_gated = self._forward_local
 
     def forward(self, input_: torch.Tensor, gate: torch.Tensor) -> torch.Tensor:
-        return self._forward(input_.view(-1, *self._normalized_shape), gate).view_as(input_)
+        return self._forward_gated(input_.view(-1, *self._normalized_shape), gate).view_as(input_)
 
-    def _forward_fused(self, input_: torch.Tensor, gate: torch.Tensor) -> torch.Tensor:
+    def _forward_fla(self, input_: torch.Tensor, gate: torch.Tensor) -> torch.Tensor:
         return rms_norm_gated(
             input_,
             gate,
@@ -331,6 +331,6 @@ def _forward_fused(self, input_: torch.Tensor, gate: torch.Tensor) -> torch.Tensor:
             residual_in_fp32=False,
         )
 
-    def _forward(self, input_: torch.Tensor, gate: torch.Tensor) -> torch.Tensor:
-        normalized = self.rmsnorm(input_)
+    def _forward_local(self, input_: torch.Tensor, gate: torch.Tensor) -> torch.Tensor:
+        normalized = self._forward(input_)
         return normalized * F.silu(gate)
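For reference, the local (non-fla) path is an RMS normalization followed by a SiLU gate on a separate tensor. A minimal standalone sketch of that behaviour, with the weight and eps handling assumed rather than copied from the Fast-LLM class:

import torch
import torch.nn.functional as F

def gated_rms_norm(input_: torch.Tensor, gate: torch.Tensor, weight: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # RMS-normalize over the last dimension and scale by the learned weight.
    variance = input_.float().pow(2).mean(dim=-1, keepdim=True)
    normalized = (input_.float() * torch.rsqrt(variance + eps)).to(input_.dtype) * weight
    # Gate with SiLU, matching `normalized * F.silu(gate)` in the diff above.
    return normalized * F.silu(gate)

The fused `_forward_fla` path computes the same thing with the `rms_norm_gated` kernel from fla; the dispatch in `__init__` simply falls back to the local implementation when `rms_norm_gated` is unavailable (None).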

fast_llm/layers/ssm/gdn.py

Lines changed: 4 additions & 5 deletions
@@ -210,9 +210,9 @@ def __init__(
             lr_scale=self._lr_scale,
             peft=self._peft,
         )
-        # self.norm = self._config.normalization.get_layer(
-        #     self._value_head_dim, lr_scale=self._lr_scale, peft=self._peft
-        # )
+        self.norm = self._config.normalization.get_layer(
+            self._value_head_dim, lr_scale=self._lr_scale, peft=self._peft
+        )
 
         self.chunk_gated_delta_rule = chunk_gated_delta_rule or torch_chunk_gated_delta_rule
 
@@ -259,7 +259,6 @@ def fix_query_key_value_ordering(self, mixed_qkvz, mixed_ba):
         Derives `query`, `key` and `value` tensors from `mixed_qkvz` and `mixed_ba`.
         """
 
-        # Split contiguous q/k/v/z blocks and only then project them into per-head shapes.
         local_qkv_sizes = (
             self._local_key_heads * self._config.key_head_dim,
             self._local_key_heads * self._config.key_head_dim,
@@ -370,7 +369,7 @@ def _forward(
 
         core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
         z = z.reshape(-1, z.shape[-1])
-        # core_attn_out = self.norm(core_attn_out, z)
+        core_attn_out = self.norm(core_attn_out, z)
         core_attn_out = core_attn_out.reshape(z_shape_og)
         core_attn_out = core_attn_out.reshape(core_attn_out.shape[0], core_attn_out.shape[1], -1)
         if sequence_first:
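Re-enabling `self.norm` means the core attention output is now gated-normalized against `z` over the value head dimension before being reshaped back. A rough, self-contained shape walkthrough of that step (dimensions are illustrative, and `F.rms_norm` followed by a SiLU gate merely stands in for the actual `self.norm` layer; requires a recent PyTorch):

import torch
import torch.nn.functional as F

tokens, value_heads, value_head_dim = 16, 8, 128  # illustrative sizes only
core_attn_out = torch.randn(tokens, value_heads, value_head_dim)
z = torch.randn(tokens, value_heads, value_head_dim)
z_shape_og = z.shape

# Flatten everything except the normalized (value_head_dim) axis so the norm layer sees a 2D tensor.
core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
z = z.reshape(-1, z.shape[-1])
# Stand-in for `self.norm(core_attn_out, z)`: RMS-normalize, then gate with SiLU(z).
core_attn_out = F.rms_norm(core_attn_out, (value_head_dim,)) * F.silu(z)
# Restore the original shape and merge the head dimensions, as in the diff above.
core_attn_out = core_attn_out.reshape(z_shape_og)
core_attn_out = core_attn_out.reshape(core_attn_out.shape[0], core_attn_out.shape[1], -1)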

tests/utils/model_configs.py

Lines changed: 12 additions & 9 deletions
@@ -3,6 +3,7 @@
 import enum
 import functools
 import os
+import re
 import typing
 
 import pytest
@@ -76,7 +77,7 @@ class ModelTestingConfig:
     groups: dict[ModelTestingGroup, ModelTestingGroupAction]
     # Scale the comparison thresholds for specific models.
     compare_factor: float = 1.0
-    # Option to skip specific distributed configuration with name containing any of the provided strings.
+    # Option to skip specific distributed configurations with names matching any of the provided regex patterns.
     skip_tests: tuple[str] = ()
 
     @functools.cached_property
@@ -125,7 +126,7 @@ def base_model_config_class(self):
         return self.model_config_class.get_base_model_config_class()
 
     def should_skip(self, distributed_config: DistributedTestingConfig) -> bool:
-        return any(key in distributed_config.name for key in self.skip_tests)
+        return any(re.search(pattern, distributed_config.name) for pattern in self.skip_tests)
 
 
 def _update_and_add_testing_config(
@@ -470,7 +471,7 @@ def _update_and_add_testing_config(
     },
     compare_factor=2.0,
     # Arg update for cross-entropy splits doesn't work here.
-    skip_tests=("ce4", "ms"),
+    skip_tests=(r"ce4", r"ms"),
 )
 
 _update_and_add_testing_config(
@@ -603,7 +604,7 @@ def _update_and_add_testing_config(
     },
     compare_factor=2.0,
     # Micro-sequence split not supported.
-    skip_tests=("sdp", "ms"),
+    skip_tests=(r"sdp", r"ms"),
 )
 
 _update_and_add_testing_config(
@@ -645,8 +646,8 @@ def _update_and_add_testing_config(
     compare_factor=2.0,
     # Micro-sequence split not supported.
     skip_tests=(
-        "sdp",
-        "ms",
+        r"sdp",
+        r"ms",
     ),  # "pp","dp", "ce","16", "bf", "df", "stp"),
 )
 
@@ -690,7 +691,7 @@ def _update_and_add_testing_config(
     },
     compare_factor=2.0,
     # Micro-sequence split and sequence-first not supported.
-    skip_tests=("sdp", "ms"),
+    skip_tests=(r"sdp", r"ms"),
 )
 
 
@@ -728,8 +729,10 @@ def _update_and_add_testing_config(
         ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented,
         ModelTestingGroup.distributed: ModelTestingGroupAction.normal,
     },
-    compare_factor=16,
-    skip_tests=("sdp", "ms", "stp"),
+    compare_factor=10.0,  # With compare_factor=2, the fp16 and bf16 tests fail in the normalization layer when using rms_norm_gated from fla (they pass with the local non-fla norm).
+    # Note: tp is excluded because there is currently no gradient reduction implemented for the TP norm in gdn.py (STP works, though).
+    # We should be using STP with this model!
+    skip_tests=(r"sdp", r"ms", r"^tp2$"),
 )
 
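With `should_skip` now using `re.search`, the entries in `skip_tests` are regular expressions rather than plain substrings, so a pattern like `r"^tp2$"` can target a single configuration name without also catching names that merely contain it. A small illustration with hypothetical configuration names:

import re

skip_tests = (r"sdp", r"ms", r"^tp2$")

def should_skip(name: str) -> bool:
    # Mirrors ModelTestingConfig.should_skip, with the config name passed directly for brevity.
    return any(re.search(pattern, name) for pattern in skip_tests)

print(should_skip("tp2"))     # True: the anchored pattern matches the whole name
print(should_skip("stp2"))    # False: ^tp2$ rejects it, whereas substring matching on "tp2" would not
print(should_skip("ms_dp2"))  # True: an unanchored pattern still behaves like a substring check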
