aqlaboratory · GMNGeoffrey · May 22, 2026 · May 26, 2026
diff --git a/openfold3/core/model/primitives/normalization.py b/openfold3/core/model/primitives/normalization.py
@@ -16,13 +16,15 @@
 """Normalization layers. Includes LayerNorm and AdaptiveLayerNorm."""
 
 import importlib
+import math
 
 import torch
 import torch.nn as nn
 from ml_collections import ConfigDict
 
 import openfold3.core.config.default_linear_init_config as lin_init
 from openfold3.core.model.primitives.linear import Linear
+from openfold3.core.utils.chunk_utils import chunk_layer
 
 deepspeed_is_installed = importlib.util.find_spec("deepspeed") is not None
 if deepspeed_is_installed:
@@ -65,24 +67,55 @@ def forward(self, x) -> torch.Tensor:
             with torch.amp.autocast("cuda", enabled=False):
                 weight = self.weight.to(dtype=d) if self.weight is not None else None
                 bias = self.bias.to(dtype=d) if self.bias is not None else None
-
-                out = nn.functional.layer_norm(
-                    input=x,
-                    normalized_shape=self.c_in,
-                    weight=weight,
-                    bias=bias,
-                    eps=self.eps,
-                )
+                out = self._layer_norm(x, weight, bias)
         else:
-            out = nn.functional.layer_norm(
+            out = self._layer_norm(x, self.weight, self.bias)
+
+        return out
+
+    # There are at least two bugs in the Torch layer norm kernel for very
+    # large inputs at the time of writing:
+
+    # - A: https://github.com/pytorch/pytorch/issues/181555: when numel > 2^32
+    #   and the channel dimension is a multiple of 4 (so the vectorized kernel
+    #   is used) there is a uint32 overflow in the row offset. Fixed in
+    #   https://github.com/pytorch/pytorch/pull/181600, but that is not yet in
+    #   the stable release.
+    # - B: https://github.com/pytorch/pytorch/issues/184826: when the combined
+    #   batch dim is > 2^23 and the channel dimension is not a multiple of 4 (so
+    #   the non-vectorized kernel is used), a failure that has not yet been
+    #   root-caused leaves some outputs unwritten, so they are filled with
+    #   garbage values. At a batch dim of *exactly* 2^23, there is instead an
+    #   "invalid configuration argument" error rather than bad output. This is
+    #   not yet fixed at the time of writing.
+
+    # We chunk over the leading (batch) dims to keep every kernel call below
+    # both cliffs: max_chunk < 2^23 AND max_chunk * prod(c_in) < 2^32.
+    _SAFE_NUMEL = 2**32 - 1
+    _SAFE_BATCH = 2**23 - 1
+
+    def _layer_norm(self, x, weight, bias):
+        def _ln(x):
+            return nn.functional.layer_norm(
                 input=x,
                 normalized_shape=self.c_in,
-                weight=self.weight,
-                bias=self.bias,
+                weight=weight,
+                bias=bias,
                 eps=self.eps,
             )
 
-        return out
+        no_batch_dims = x.dim() - len(self.c_in)
+        per_slice_numel = math.prod(self.c_in)
+        flat_batch = x.numel() // per_slice_numel
+        max_chunk = min(self._SAFE_BATCH, self._SAFE_NUMEL // per_slice_numel)
+        if flat_batch <= max_chunk:
+            return _ln(x)
+        return chunk_layer(
+            _ln,
+            {"x": x},
+            chunk_size=max_chunk,
+            no_batch_dims=no_batch_dims,
+        )
 
 
 class AdaLN(nn.Module):

diff --git a/openfold3/tests/test_normalization.py b/openfold3/tests/test_normalization.py
@@ -0,0 +1,60 @@
+# Copyright 2026 AlQuraishi Laboratory
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import pytest
+import torch
+
+import openfold3.tests.compare_utils as compare_utils
+from openfold3.core.model.primitives.normalization import LayerNorm
+
+
+@compare_utils.skip_unless_cuda_available()
+# Test both the vectorized (C%4==0) and non-vectorized (C%4!=0) code paths for both batch>=2^23 and numel>=2^32
+@pytest.mark.parametrize(
+    ("batch", "C"),
+    [
+        pytest.param(2**23, 4, id="2^23_4"),
+        pytest.param(2**23 - 1, 3, id="2^23-1_3"),
+        # In testing with HIP, there's actually an "invalid configuration argument" error at exactly batch=2^23 as opposed to bad output
+        pytest.param(2**23, 3, id="2^23_3"),
+        pytest.param(2**23 + 1, 3, id="2^23+1_3"),
+        pytest.param(2**23 + 1, 4, id="2^23+1_4"),
+        pytest.param(2**22, 2**10, marks=pytest.mark.slow, id="2^22_2^10"),
+        pytest.param(2**22 + 1, 2**10, marks=pytest.mark.slow, id="2^22+1_2^10"),
+    ],
+)
+def test_layer_norm_overflow_bug_workaround(batch, C, seeded_rng):
+    """Test we don't hit torch bugs in very large layernorms.
+
+    See comments in layer norm implementation for details.
+    """
+    ln = LayerNorm(C).cuda()
+
+    input_row = torch.randn(C, device="cuda")
+    x = input_row.to(torch.bfloat16).unsqueeze(0).expand(batch, C).contiguous()
+
+    expected_row = (input_row - torch.mean(input_row)) / torch.sqrt(
+        torch.var(input_row, correction=0) + ln.eps
+    )
+    expected = expected_row.to(torch.bfloat16).unsqueeze(0).expand(batch, C)
+
+    out = ln(x)
+
+    torch.testing.assert_close(out, expected)
+
+
+if __name__ == "__main__":
+    unittest.main()