From bfb319b06917ecc5f5d3d8b36e784c13b4606bb7 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Fri, 17 Apr 2026 17:15:47 +0000
Subject: [PATCH 1/8] Add Gemma-4 UNSLOTH_FORCE_FLOAT32 patches for float16
 GRPO stability

Gemma-4 E2B/E4B GRPO training hits a CUDA device-side assert at step 2
when dtype=torch.float16 because the text-decoder MLP saturates in fp16:
gate_proj and up_proj outputs reach fp16_max under the fp16 autocast
context, the gate*up product overflows on downcast, and the subsequent
down_proj fp16 matmul accumulator tips to +inf. Generation with an inf
residual stream produces NaN logits and the categorical sampler aborts.

bf16 training is unaffected because the same intermediate magnitudes fit
in bf16's range. Mirror the Gemma-3 UNSLOTH_FORCE_FLOAT32 recipe with
four Gemma-4 specific patches, all gated on the env flag:

  * Gemma4RMSNorm: fp32 norm and scale, clamp to 65280 before fp16 cast
    (65280 is one bf16 ulp below fp16_max, avoiding the bf16 round-up
    that otherwise produces +-inf on later fp16 conversions).
  * Gemma4TextScaledWordEmbedding: embed lookup and scale in fp32.
  * Gemma4TextMLP: execute gate_proj and up_proj with autocast disabled
    so they see bf16 weights (the actual model dtype under
    UNSLOTH_FORCE_FLOAT32), run activation plus multiply in fp32, clamp
    to the safe fp16 bound, then down_proj in fp16. A final nan_to_num
    rescues the rare case where down_proj's fp16 accumulator overflows
    for wide intermediate dims.
  * Gemma4TextAttention: same autocast-disabled pattern for Q/K/V
    projections, fp32 q_norm/k_norm/v_norm, fp32 RoPE, fp32 SDPA, clamp
    and fp16 cast before o_proj, plus nan_to_num safety net.

Repro recipe (see PR description for full table):

  python scripts/gemma4_grpo_sudoku_repro.py \
    --model unsloth/gemma-4-E2B-it --dtype float16 \
    --max-steps 8 --grad-accum 4 --num-generations 2

Without the patches this crashes at step 2 with CUDA device-side assert
and overflow flagged in layers.0.mlp.down_proj. With the patches
unsloth/gemma-4-E2B-it and unsloth/gemma-4-E4B-it both complete 8 GRPO
steps with finite loss and grad_norm values close to the bf16 baseline,
under both gradient_checkpointing="unsloth" and gradient_checkpointing
=False. bf16 behavior is unaffected because the patches are gated on
UNSLOTH_FORCE_FLOAT32=1 which only fires for float16 requests.
---
 unsloth_zoo/temporary_patches/gemma4.py | 326 +++++++++++++++++++++++-
 1 file changed, 324 insertions(+), 2 deletions(-)

diff --git a/unsloth_zoo/temporary_patches/gemma4.py b/unsloth_zoo/temporary_patches/gemma4.py
index 3a91bba0c..8a9dbcc0a 100644
--- a/unsloth_zoo/temporary_patches/gemma4.py
+++ b/unsloth_zoo/temporary_patches/gemma4.py
@@ -14,10 +14,20 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
+import inspect
+from typing import Optional
 import torch
 import os
-from .common import TEMPORARY_PATCHES
-from .utils import raise_error
+from .common import TEMPORARY_PATCHES, torch_compile
+from .utils import (
+    raise_error,
+    patch_function,
+    patch_function_past_key_values,
+    KWARGS_TYPE,
+    Cache,
+)
+
+_UNSLOTH_FLEX_ATTENTION_DISABLED = os.environ.get("UNSLOTH_ENABLE_FLEX_ATTENTION", "1") == "0"
 
 
 # ============================================================================
@@ -606,3 +616,315 @@ def forward(self, hidden_states, position_embeddings, attention_mask=None, **kwa
     Gemma4AudioAttention.forward = forward
 pass
 TEMPORARY_PATCHES.append(patch_Gemma4AudioAttention)
+
+
+# ============================================================================
+# Gemma-4 float16 force-fp32 patches.
+#
+# Goal: make float16 training (notably GRPO) numerically stable without
+# falling back to a blanket float32 compute path. Bisection on E2B GRPO
+# (see PR description) showed the fp16 NaN originates in the text-decoder
+# MLP gate*up product and propagates through the attention projections.
+# The patches below mirror the proven Gemma-3 UNSLOTH_FORCE_FLOAT32 recipe,
+# adapted for Gemma-4's specific forward shapes:
+#
+#   * Gemma4RMSNorm           -> fp32 norm, fp32 (weight * hidden), clamp to
+#                                fp16 range, return fp16.
+#   * Gemma4TextMLP           -> fp32 act_fn(gate) * up, cast fp16 before
+#                                down_proj.
+#   * Gemma4TextAttention     -> fp32 Q/K/V, fp32 q_norm/k_norm/v_norm, fp32
+#                                RoPE, fp32 SDPA, cast fp16 before o_proj.
+#   * Gemma4TextScaledWordEmbedding -> fp32 embed lookup * embed_scale,
+#                                return fp32 so the first RMSNorm gets a
+#                                full-precision input.
+#
+# Every patch is gated on UNSLOTH_FORCE_FLOAT32=1 and registered through
+# TEMPORARY_PATCHES so the registry order matches the existing Gemma-3
+# behavior: the loader flips the env flag for fp16 + gemma4, and these
+# patches become active.
+#
+# Minimality policy: the plan mandates keeping the set as small as
+# bisection evidence allows. The four patches below are the analogues of
+# the four Gemma-3 patches that ship today; Gemma-4 does not have a
+# Gemma-3-style `_update_causal_mask`, so no causal-mask patch is added.
+# ============================================================================
+
+
+def _gemma4_modeling():
+    try:
+        import transformers.models.gemma4.modeling_gemma4 as mod
+        return mod
+    except ImportError:
+        return None
+
+
+def patch_Gemma4RMSNorm():
+    if os.environ.get("UNSLOTH_FORCE_FLOAT32", "0") == "0":
+        return
+    mod = _gemma4_modeling()
+    if mod is None:
+        return
+    try:
+        Gemma4RMSNorm = mod.Gemma4RMSNorm
+    except AttributeError as e:
+        return raise_error("Gemma4RMSNorm.forward", e)
+
+    # Clamp below fp16_max so the subsequent fp16 cast (plus any PEFT LoRA
+    # round-trip to fp16) can never materialize +-inf. 65280 is exactly
+    # representable in both fp16 and bf16 and sits one bf16 ulp below fp16_max.
+    _SAFE_FP16 = 65280.0
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        x_fp32 = hidden_states.to(torch.float32)
+        mean_squared = x_fp32.pow(2).mean(-1, keepdim=True) + self.eps
+        normed = x_fp32 * torch.rsqrt(mean_squared)
+        if self.with_scale:
+            normed = normed * self.weight.to(torch.float32)
+        normed = torch.clamp(normed, min=-_SAFE_FP16, max=_SAFE_FP16)
+        return normed.to(torch.float16)
+    try:
+        patch_function(Gemma4RMSNorm, "forward", forward, fullgraph=True)
+    except Exception as e:
+        return raise_error("Gemma4RMSNorm.forward", e)
+pass
+TEMPORARY_PATCHES.append(patch_Gemma4RMSNorm)
+
+
+def patch_Gemma4TextScaledWordEmbedding():
+    if os.environ.get("UNSLOTH_FORCE_FLOAT32", "0") == "0":
+        return
+    mod = _gemma4_modeling()
+    if mod is None:
+        return
+    try:
+        Gemma4TextScaledWordEmbedding = mod.Gemma4TextScaledWordEmbedding
+    except AttributeError as e:
+        return raise_error("Gemma4TextScaledWordEmbedding.forward", e)
+
+    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
+        emb = torch.nn.functional.embedding(
+            input_ids,
+            weight=self.weight,
+            padding_idx=self.padding_idx,
+        )
+        scale = self.embed_scale.to(torch.float32)
+        return emb.to(torch.float32) * scale
+    try:
+        patch_function(
+            Gemma4TextScaledWordEmbedding, "forward", forward, fullgraph=True,
+        )
+    except Exception as e:
+        return raise_error("Gemma4TextScaledWordEmbedding.forward", e)
+pass
+TEMPORARY_PATCHES.append(patch_Gemma4TextScaledWordEmbedding)
+
+
+def patch_Gemma4TextMLP():
+    if os.environ.get("UNSLOTH_FORCE_FLOAT32", "0") == "0":
+        return
+    mod = _gemma4_modeling()
+    if mod is None:
+        return
+    try:
+        Gemma4TextMLP = mod.Gemma4TextMLP
+    except AttributeError as e:
+        return raise_error("Gemma4TextMLP.forward", e)
+
+    # Clamp below fp16_max with a 224-unit safety margin so that subsequent
+    # bf16 and fp16 casts round DOWN (65280 is exactly representable in both).
+    # Without the margin bf16 rounds 65504 -> 65536 -> +inf on fp16 cast.
+    _SAFE_FP16 = 65280.0
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Disable fp16 autocast so gate_proj/up_proj execute at bf16 (the
+        # actual model weight dtype under UNSLOTH_FORCE_FLOAT32). Running them
+        # inside the GRPO fp16 autocast context otherwise saturates the
+        # projection outputs to fp16_max, and the subsequent gate*up product
+        # + down_proj matmul then tips into +-inf - this is the NaN trigger
+        # on Gemma-4 E2B fp16 GRPO at step ~1-2.
+        device_type = x.device.type
+        with torch.amp.autocast(device_type=device_type, enabled=False):
+            x_bf = x.to(torch.bfloat16)
+            gate_bf = self.gate_proj(x_bf)
+            up_bf = self.up_proj(x_bf)
+
+            # Activation + multiplication in fp32 to avoid bf16 mantissa loss,
+            # since gate values can be in the 10^4 range post-saturation.
+            activated_fp32 = self.act_fn(gate_bf.to(torch.float32))
+            product_fp32 = activated_fp32 * up_bf.to(torch.float32)
+
+            # Clamp strictly below fp16_max so the subsequent fp16 cast (and
+            # any PEFT LoRA internal cast to fp16) never materialize +-inf.
+            product_fp32 = torch.clamp(product_fp32, min=-_SAFE_FP16, max=_SAFE_FP16)
+
+            # Cast directly to fp16 (not bf16). bf16 rounds up near fp16_max
+            # which poisons downstream fp16 casts; fp16 by construction has
+            # max 65504 so the downstream path cannot see +-inf.
+            out = self.down_proj(product_fp32.to(torch.float16))
+
+            # Safety net: even with clamped input, down_proj's fp16 matmul
+            # accumulator can overflow for layers with wide intermediate
+            # dimensions, producing +-inf. Rescue to a safe fp16 magnitude so
+            # the residual stream never carries non-finite values into
+            # subsequent norms / attention. NaN is treated as 0 (structural
+            # defect) and inf as the signed safe bound (saturation).
+            out = torch.nan_to_num(
+                out, nan=0.0, posinf=_SAFE_FP16, neginf=-_SAFE_FP16,
+            )
+        return out
+    try:
+        patch_function(
+            Gemma4TextMLP, "forward", forward, fullgraph=False,
+        )
+    except Exception as e:
+        return raise_error("Gemma4TextMLP.forward", e)
+pass
+TEMPORARY_PATCHES.append(patch_Gemma4TextMLP)
+
+
+def patch_Gemma4TextAttention():
+    if os.environ.get("UNSLOTH_FORCE_FLOAT32", "0") == "0":
+        return
+    mod = _gemma4_modeling()
+    if mod is None:
+        return
+    try:
+        Gemma4TextAttention = mod.Gemma4TextAttention
+        apply_rotary_pos_emb = mod.apply_rotary_pos_emb
+        ALL_ATTENTION_FUNCTIONS = mod.ALL_ATTENTION_FUNCTIONS
+    except AttributeError as e:
+        return raise_error("Gemma4TextAttention.forward", e)
+
+    scaled_dot_product_attention = torch.compiler.disable(
+        torch.nn.functional.scaled_dot_product_attention, recursive=True,
+    )
+
+    fp16_max = float(torch.finfo(torch.float16).max)
+    fp16_min = float(torch.finfo(torch.float16).min)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: torch.Tensor,
+        attention_mask: Optional[torch.Tensor],
+        shared_kv_states: dict,
+        past_key_values: Optional[Cache] = None,
+        **kwargs,
+    ):
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        device_type = hidden_states.device.type
+
+        # Same motivation as patch_Gemma4TextMLP: GRPO sets fp16 autocast but
+        # the model weights live in bf16. Without disabling autocast here the
+        # q/k/v projections saturate to fp16_max and SDPA overflows. We run
+        # the projections in bf16 (match weights), normalisation + RoPE + SDPA
+        # in fp32 for precision, then clamp and cast to the o_proj weight
+        # dtype for the output projection.
+        with torch.amp.autocast(device_type=device_type, enabled=False):
+            h_bf = hidden_states.to(torch.bfloat16)
+            cos, sin = position_embeddings
+            cos_fp32 = cos.to(torch.float32)
+            sin_fp32 = sin.to(torch.float32)
+
+            query_fp32 = self.q_proj(h_bf).view(hidden_shape).to(torch.float32)
+            query_fp32 = self.q_norm(query_fp32).to(torch.float32)
+            query_fp32 = apply_rotary_pos_emb(query_fp32, cos_fp32, sin_fp32, unsqueeze_dim=2)
+            query_fp32 = query_fp32.transpose(1, 2)
+
+            if self.is_kv_shared_layer:
+                key_states, value_states = shared_kv_states[self.kv_shared_layer_index]
+                key_states = key_states.to(query_fp32.device).to(torch.float32)
+                value_states = value_states.to(query_fp32.device).to(torch.float32)
+            else:
+                key_fp32 = self.k_proj(h_bf).view(hidden_shape).to(torch.float32)
+                if self.v_proj is not None:
+                    value_fp32 = self.v_proj(h_bf).view(hidden_shape).to(torch.float32)
+                else:
+                    value_fp32 = key_fp32
+
+                key_fp32 = self.k_norm(key_fp32).to(torch.float32)
+                key_fp32 = apply_rotary_pos_emb(key_fp32, cos_fp32, sin_fp32, unsqueeze_dim=2)
+                key_fp32 = key_fp32.transpose(1, 2)
+
+                value_fp32 = self.v_norm(value_fp32).to(torch.float32)
+                value_fp32 = value_fp32.transpose(1, 2)
+
+                key_states, value_states = key_fp32, value_fp32
+
+            if past_key_values is not None and not self.is_kv_shared_layer:
+                key_states, value_states = past_key_values.update(
+                    key_states, value_states, self.layer_idx,
+                )
+            if self.store_full_length_kv:
+                shared_kv_states[self.layer_idx] = key_states, value_states
+
+            attn_impl = getattr(self.config, "_attn_implementation", "sdpa")
+            if _UNSLOTH_FLEX_ATTENTION_DISABLED and attn_impl == "flex_attention":
+                attn_impl = "sdpa"
+
+            attn_mask = attention_mask
+            if isinstance(attn_mask, torch.Tensor) and attn_mask.dtype != torch.bool:
+                attn_mask = attn_mask.to(torch.float32)
+
+            if attn_impl in ("sdpa", "eager") or attn_impl not in ALL_ATTENTION_FUNCTIONS:
+                is_causal = (
+                    query_fp32.shape[2] > 1
+                    and attn_mask is None
+                    and getattr(self, "is_causal", True)
+                )
+                attn_output_fp32 = scaled_dot_product_attention(
+                    query_fp32.contiguous(),
+                    key_states.contiguous(),
+                    value_states.contiguous(),
+                    attn_mask=attn_mask,
+                    dropout_p=self.attention_dropout if self.training else 0.0,
+                    is_causal=bool(is_causal),
+                    scale=getattr(self, "scaling", None),
+                    enable_gqa=getattr(self, "num_key_value_groups", 1) != 1,
+                )
+                attn_weights = None
+            else:
+                attention_interface = ALL_ATTENTION_FUNCTIONS[attn_impl]
+                attn_output_fp32, attn_weights = attention_interface(
+                    self,
+                    query_fp32,
+                    key_states,
+                    value_states,
+                    attn_mask,
+                    dropout=self.attention_dropout if self.training else 0.0,
+                    scaling=getattr(self, "scaling", None),
+                    sliding_window=getattr(self, "sliding_window", None),
+                    **kwargs,
+                )
+
+            if attn_impl != "flex_attention":
+                attn_output_fp32 = attn_output_fp32.transpose(1, 2).contiguous()
+            attn_output_fp32 = attn_output_fp32.reshape(*input_shape, -1)
+
+            # Clamp below fp16_max with safety margin; cast to fp16 directly
+            # so any PEFT LoRA internal cast to fp16 stays finite.
+            _SAFE_FP16 = 65280.0
+            attn_output_fp32 = torch.clamp(
+                attn_output_fp32, min=-_SAFE_FP16, max=_SAFE_FP16,
+            )
+            attn_output = self.o_proj(attn_output_fp32.to(torch.float16))
+            # Safety net mirroring the MLP patch: o_proj's fp16 accumulator
+            # can still overflow for long sequences; saturate non-finite
+            # values so downstream norms never see +-inf / NaN.
+            attn_output = torch.nan_to_num(
+                attn_output, nan=0.0, posinf=_SAFE_FP16, neginf=-_SAFE_FP16,
+            )
+        return attn_output, None if attn_impl in ("sdpa", "eager") else attn_weights
+
+    try:
+        # relaxed match: Gemma-4's signature contains KW-only shared_kv_states;
+        # the patched forward uses **kwargs to stay compatible with any
+        # additional parameters a future transformers build may add.
+        patch_function(
+            Gemma4TextAttention, "forward", forward, match_level="relaxed",
+        )
+    except Exception as e:
+        return raise_error("Gemma4TextAttention.forward", e)
+pass
+TEMPORARY_PATCHES.append(patch_Gemma4TextAttention)

From 513ce6d27d99cf2ef24957f97d63182d8ca4490e Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sat, 18 Apr 2026 01:41:17 +0000
Subject: [PATCH 2/8] Reduce Gemma-4 fp16 stability to a single MLP patch

Bisection showed the NaN chain originates exclusively in the text-decoder
MLP gate*up -> down_proj path:

  * Gemma4RMSNorm already computes internally in fp32 and returns
    type_as(hidden_states), which is finite for the trained weight ranges.
  * Gemma4TextScaledWordEmbedding scales by sqrt(hidden_size) which is
    ~45 to 60 for E2B/E4B and fits in fp16.
  * Gemma4TextAttention projections did not overflow in any run.

So the earlier RMSNorm, Embedding, and Attention patches were redundant
once the MLP is stabilized. Removing them lets the text pipeline keep its
original dtype contract (no autocast gymnastics, no KV cache dtype
mismatch), which in turn keeps the gate/up/down matmuls on fp16 tensor
cores.

The MLP patch itself is reduced to the three operations that matter:

  1. Compute act_fn(gate) * up in fp32 so the product cannot overflow.
  2. Clamp to 65280 (one bf16 ulp below fp16_max) before down_proj so
     the fp16 cast cannot produce +-inf.
  3. nan_to_num on the output as defense-in-depth for the rare fp16
     accumulator overflow in down_proj for wide intermediate dims.

Verified on B200 (fp16 autocast under GRPO) - all runs complete 8 steps
with no NaN and no bad params:

  | model                       | gc       | step1 grad | step8 grad | step8 kl |
  | unsloth/gemma-4-E2B-it      | unsloth  | 0.1110     | 0.4077     | 6.03e-05 |
  | unsloth/gemma-4-E2B-it      | off      | 0.1110     | 0.2655     | 1.18e-04 |
  | unsloth/gemma-4-E4B-it      | unsloth  | 0.0987     | 0.0000     | 1.47e-05 |

Tesla T4 compatibility: T4 has no bf16 tensor cores so the loader's
FORCE_FLOAT32 path lands on fp16 weights. The patch's fp32 gate*up is an
elementwise op (runs on CUDA cores at 8.1 TFLOPS); gate_proj, up_proj,
and down_proj matmuls stay on fp16 tensor cores (65 TFLOPS). Net perf
overhead is minimal and the overflow prevention cost is paid only once
per MLP forward.

UNSLOTH_ENABLE_FLEX_ATTENTION on Gemma-4: flex is enabled by default but
only used when sdpa is unavailable. Gemma-4 supports sdpa so flex is not
selected for the text decoder in practice. Setting the env var to 0
produces identical E4B trajectories and near-identical E2B (diverges only
at step 8, still finite, no crash).
---
 unsloth_zoo/temporary_patches/gemma4.py | 359 +++++-------------------
 1 file changed, 76 insertions(+), 283 deletions(-)

diff --git a/unsloth_zoo/temporary_patches/gemma4.py b/unsloth_zoo/temporary_patches/gemma4.py
index 8a9dbcc0a..cb9b92a2b 100644
--- a/unsloth_zoo/temporary_patches/gemma4.py
+++ b/unsloth_zoo/temporary_patches/gemma4.py
@@ -14,20 +14,10 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
-import inspect
-from typing import Optional
 import torch
 import os
-from .common import TEMPORARY_PATCHES, torch_compile
-from .utils import (
-    raise_error,
-    patch_function,
-    patch_function_past_key_values,
-    KWARGS_TYPE,
-    Cache,
-)
-
-_UNSLOTH_FLEX_ATTENTION_DISABLED = os.environ.get("UNSLOTH_ENABLE_FLEX_ATTENTION", "1") == "0"
+from .common import TEMPORARY_PATCHES
+from .utils import raise_error, patch_function
 
 
 # ============================================================================
@@ -619,34 +609,40 @@ def forward(self, hidden_states, position_embeddings, attention_mask=None, **kwa
 
 
 # ============================================================================
-# Gemma-4 float16 force-fp32 patches.
+# Gemma-4 float16 stability patch.
+#
+# Goal: make float16 training (notably GRPO on E2B and E4B) numerically
+# stable without paying blanket fp32 compute cost, so that Tesla T4 - which
+# has no bf16 tensor cores - can run Gemma-4 GRPO at full fp16 matmul speed.
+#
+# The NaN chain (verified via TensorStatisticsHooks on E2B GRPO):
 #
-# Goal: make float16 training (notably GRPO) numerically stable without
-# falling back to a blanket float32 compute path. Bisection on E2B GRPO
-# (see PR description) showed the fp16 NaN originates in the text-decoder
-# MLP gate*up product and propagates through the attention projections.
-# The patches below mirror the proven Gemma-3 UNSLOTH_FORCE_FLOAT32 recipe,
-# adapted for Gemma-4's specific forward shapes:
+#   1. Under UNSLOTH_FORCE_FLOAT32 the loader stores linear weights as fp16
+#      (see unsloth_zoo/patching_utils.py do_forced_float32 block).
+#   2. `down_proj(act_fn(gate_proj(x)) * up_proj(x))` saturates in fp16 at
+#      `layers.0.mlp.down_proj` (E2B) / `layers.1.mlp.down_proj` (E4B).
+#   3. The +-inf in down_proj output poisons the residual stream; the next
+#      generation step samples NaN logits and the CUDA categorical sampler
+#      trips a device-side assert.
 #
-#   * Gemma4RMSNorm           -> fp32 norm, fp32 (weight * hidden), clamp to
-#                                fp16 range, return fp16.
-#   * Gemma4TextMLP           -> fp32 act_fn(gate) * up, cast fp16 before
-#                                down_proj.
-#   * Gemma4TextAttention     -> fp32 Q/K/V, fp32 q_norm/k_norm/v_norm, fp32
-#                                RoPE, fp32 SDPA, cast fp16 before o_proj.
-#   * Gemma4TextScaledWordEmbedding -> fp32 embed lookup * embed_scale,
-#                                return fp32 so the first RMSNorm gets a
-#                                full-precision input.
+# Minimal fix: a single patch on Gemma4TextMLP. The product is computed in
+# fp32, clamped to a safe fp16 bound, and down_proj's output is rescued
+# with `nan_to_num` for the rare tail overflow from its fp16 accumulator.
 #
-# Every patch is gated on UNSLOTH_FORCE_FLOAT32=1 and registered through
-# TEMPORARY_PATCHES so the registry order matches the existing Gemma-3
-# behavior: the loader flips the env flag for fp16 + gemma4, and these
-# patches become active.
+# Why we do NOT patch RMSNorm / Attention / Embedding:
+#   * Gemma4RMSNorm already casts to fp32 internally and returns
+#     `type_as(hidden_states)` - that dtype contract is fine for fp16.
+#   * Gemma4TextScaledWordEmbedding multiplies by sqrt(hidden_size) which
+#     is ~45-60 for E2B/E4B, well within fp16 range.
+#   * Gemma4TextAttention projections did not overflow in any run; the
+#     failing path is exclusively the MLP gate*up product + down_proj
+#     accumulator.
 #
-# Minimality policy: the plan mandates keeping the set as small as
-# bisection evidence allows. The four patches below are the analogues of
-# the four Gemma-3 patches that ship today; Gemma-4 does not have a
-# Gemma-3-style `_update_causal_mask`, so no causal-mask patch is added.
+# Bisection evidence: 8-step GRPO on unsloth/gemma-4-E2B-it and
+# unsloth/gemma-4-E4B-it, with gradient_checkpointing on and off, completes
+# cleanly with just patch_Gemma4TextMLP. Adding Attention, RMSNorm, or
+# Embedding patches on top produces byte-identical loss / grad_norm / kl
+# trajectories.
 # ============================================================================
 
 
@@ -658,68 +654,43 @@ def _gemma4_modeling():
         return None
 
 
-def patch_Gemma4RMSNorm():
-    if os.environ.get("UNSLOTH_FORCE_FLOAT32", "0") == "0":
-        return
-    mod = _gemma4_modeling()
-    if mod is None:
-        return
-    try:
-        Gemma4RMSNorm = mod.Gemma4RMSNorm
-    except AttributeError as e:
-        return raise_error("Gemma4RMSNorm.forward", e)
-
-    # Clamp below fp16_max so the subsequent fp16 cast (plus any PEFT LoRA
-    # round-trip to fp16) can never materialize +-inf. 65280 is exactly
-    # representable in both fp16 and bf16 and sits one bf16 ulp below fp16_max.
-    _SAFE_FP16 = 65280.0
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        x_fp32 = hidden_states.to(torch.float32)
-        mean_squared = x_fp32.pow(2).mean(-1, keepdim=True) + self.eps
-        normed = x_fp32 * torch.rsqrt(mean_squared)
-        if self.with_scale:
-            normed = normed * self.weight.to(torch.float32)
-        normed = torch.clamp(normed, min=-_SAFE_FP16, max=_SAFE_FP16)
-        return normed.to(torch.float16)
-    try:
-        patch_function(Gemma4RMSNorm, "forward", forward, fullgraph=True)
-    except Exception as e:
-        return raise_error("Gemma4RMSNorm.forward", e)
-pass
-TEMPORARY_PATCHES.append(patch_Gemma4RMSNorm)
-
-
-def patch_Gemma4TextScaledWordEmbedding():
-    if os.environ.get("UNSLOTH_FORCE_FLOAT32", "0") == "0":
-        return
-    mod = _gemma4_modeling()
-    if mod is None:
-        return
-    try:
-        Gemma4TextScaledWordEmbedding = mod.Gemma4TextScaledWordEmbedding
-    except AttributeError as e:
-        return raise_error("Gemma4TextScaledWordEmbedding.forward", e)
-
-    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
-        emb = torch.nn.functional.embedding(
-            input_ids,
-            weight=self.weight,
-            padding_idx=self.padding_idx,
-        )
-        scale = self.embed_scale.to(torch.float32)
-        return emb.to(torch.float32) * scale
-    try:
-        patch_function(
-            Gemma4TextScaledWordEmbedding, "forward", forward, fullgraph=True,
-        )
-    except Exception as e:
-        return raise_error("Gemma4TextScaledWordEmbedding.forward", e)
-pass
-TEMPORARY_PATCHES.append(patch_Gemma4TextScaledWordEmbedding)
-
-
 def patch_Gemma4TextMLP():
+    """Stabilize Gemma-4 MLP under fp16 autocast (GRPO on fp16, Tesla T4).
+
+    Root cause: `down_proj(act_fn(gate_proj(x)) * up_proj(x))` summed over
+    the wide intermediate dimension can exceed `fp16_max` = 65504 so the
+    fp16 matmul cast produces +-inf. That inf poisons the residual stream
+    and generation then samples NaN logits, tripping the categorical
+    assert at GRPO step ~2 on E2B/E4B with `dtype=torch.float16`.
+
+    Minimal surgical fix (single patch, no attention/norm/embedding
+    changes required):
+
+      1. Run the gate + up activation and multiply in fp32 so the product
+         cannot overflow before we clamp. gate_proj / up_proj stay as fp16
+         tensor-core matmuls (fast on T4 at 65 TFLOPS, B200, etc).
+      2. Clamp the product to 65280 (one bf16 ulp below fp16_max) so
+         down_proj never sees a saturated input - prevents the main failure
+         mode and preserves numerical fidelity for typical activations.
+      3. `nan_to_num` the output as defense-in-depth against the rare case
+         where the fp16 accumulator in down_proj still overflows for long
+         sequences or weights that have drifted during GRPO.
+
+    Dtype contract: preserves the upstream one (input dtype -> input dtype).
+    No KV cache dtype mismatch, so no attention patch is needed. Upstream
+    Gemma4RMSNorm already computes internally in fp32 and returns
+    `type_as(hidden_states)`, so no RMSNorm patch is needed either.
+
+    Tesla T4 (SM 7.5, no bf16) notes:
+      - Our FORCE_FLOAT32 path falls back to fp16 weights on T4 (bf16
+        unsupported), so the autocast context is fp16 and the patch's
+        fp32 product runs on CUDA cores (8.1 TFLOPS) while the
+        gate/up/down matmuls stay on fp16 tensor cores (65 TFLOPS).
+        Net: minimal perf overhead, matmul throughput preserved.
+      - int8 tensor cores (130 TOPS on T4) via bitsandbytes Linear8bitLt
+        could further accelerate the matmuls; out of scope for this patch
+        since it requires a different module path and calibration flow.
+    """
     if os.environ.get("UNSLOTH_FORCE_FLOAT32", "0") == "0":
         return
     mod = _gemma4_modeling()
@@ -730,47 +701,17 @@ def patch_Gemma4TextMLP():
     except AttributeError as e:
         return raise_error("Gemma4TextMLP.forward", e)
 
-    # Clamp below fp16_max with a 224-unit safety margin so that subsequent
-    # bf16 and fp16 casts round DOWN (65280 is exactly representable in both).
-    # Without the margin bf16 rounds 65504 -> 65536 -> +inf on fp16 cast.
     _SAFE_FP16 = 65280.0
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        # Disable fp16 autocast so gate_proj/up_proj execute at bf16 (the
-        # actual model weight dtype under UNSLOTH_FORCE_FLOAT32). Running them
-        # inside the GRPO fp16 autocast context otherwise saturates the
-        # projection outputs to fp16_max, and the subsequent gate*up product
-        # + down_proj matmul then tips into +-inf - this is the NaN trigger
-        # on Gemma-4 E2B fp16 GRPO at step ~1-2.
-        device_type = x.device.type
-        with torch.amp.autocast(device_type=device_type, enabled=False):
-            x_bf = x.to(torch.bfloat16)
-            gate_bf = self.gate_proj(x_bf)
-            up_bf = self.up_proj(x_bf)
-
-            # Activation + multiplication in fp32 to avoid bf16 mantissa loss,
-            # since gate values can be in the 10^4 range post-saturation.
-            activated_fp32 = self.act_fn(gate_bf.to(torch.float32))
-            product_fp32 = activated_fp32 * up_bf.to(torch.float32)
-
-            # Clamp strictly below fp16_max so the subsequent fp16 cast (and
-            # any PEFT LoRA internal cast to fp16) never materialize +-inf.
-            product_fp32 = torch.clamp(product_fp32, min=-_SAFE_FP16, max=_SAFE_FP16)
-
-            # Cast directly to fp16 (not bf16). bf16 rounds up near fp16_max
-            # which poisons downstream fp16 casts; fp16 by construction has
-            # max 65504 so the downstream path cannot see +-inf.
-            out = self.down_proj(product_fp32.to(torch.float16))
-
-            # Safety net: even with clamped input, down_proj's fp16 matmul
-            # accumulator can overflow for layers with wide intermediate
-            # dimensions, producing +-inf. Rescue to a safe fp16 magnitude so
-            # the residual stream never carries non-finite values into
-            # subsequent norms / attention. NaN is treated as 0 (structural
-            # defect) and inf as the signed safe bound (saturation).
-            out = torch.nan_to_num(
-                out, nan=0.0, posinf=_SAFE_FP16, neginf=-_SAFE_FP16,
-            )
+        gate = self.gate_proj(x)
+        up = self.up_proj(x)
+        product = self.act_fn(gate.float()) * up.float()
+        product = torch.clamp(product, min=-_SAFE_FP16, max=_SAFE_FP16)
+        out = self.down_proj(product.to(x.dtype))
+        out = torch.nan_to_num(
+            out, nan=0.0, posinf=_SAFE_FP16, neginf=-_SAFE_FP16,
+        )
         return out
     try:
         patch_function(
@@ -780,151 +721,3 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return raise_error("Gemma4TextMLP.forward", e)
 pass
 TEMPORARY_PATCHES.append(patch_Gemma4TextMLP)
-
-
-def patch_Gemma4TextAttention():
-    if os.environ.get("UNSLOTH_FORCE_FLOAT32", "0") == "0":
-        return
-    mod = _gemma4_modeling()
-    if mod is None:
-        return
-    try:
-        Gemma4TextAttention = mod.Gemma4TextAttention
-        apply_rotary_pos_emb = mod.apply_rotary_pos_emb
-        ALL_ATTENTION_FUNCTIONS = mod.ALL_ATTENTION_FUNCTIONS
-    except AttributeError as e:
-        return raise_error("Gemma4TextAttention.forward", e)
-
-    scaled_dot_product_attention = torch.compiler.disable(
-        torch.nn.functional.scaled_dot_product_attention, recursive=True,
-    )
-
-    fp16_max = float(torch.finfo(torch.float16).max)
-    fp16_min = float(torch.finfo(torch.float16).min)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        position_embeddings: torch.Tensor,
-        attention_mask: Optional[torch.Tensor],
-        shared_kv_states: dict,
-        past_key_values: Optional[Cache] = None,
-        **kwargs,
-    ):
-        input_shape = hidden_states.shape[:-1]
-        hidden_shape = (*input_shape, -1, self.head_dim)
-        device_type = hidden_states.device.type
-
-        # Same motivation as patch_Gemma4TextMLP: GRPO sets fp16 autocast but
-        # the model weights live in bf16. Without disabling autocast here the
-        # q/k/v projections saturate to fp16_max and SDPA overflows. We run
-        # the projections in bf16 (match weights), normalisation + RoPE + SDPA
-        # in fp32 for precision, then clamp and cast to the o_proj weight
-        # dtype for the output projection.
-        with torch.amp.autocast(device_type=device_type, enabled=False):
-            h_bf = hidden_states.to(torch.bfloat16)
-            cos, sin = position_embeddings
-            cos_fp32 = cos.to(torch.float32)
-            sin_fp32 = sin.to(torch.float32)
-
-            query_fp32 = self.q_proj(h_bf).view(hidden_shape).to(torch.float32)
-            query_fp32 = self.q_norm(query_fp32).to(torch.float32)
-            query_fp32 = apply_rotary_pos_emb(query_fp32, cos_fp32, sin_fp32, unsqueeze_dim=2)
-            query_fp32 = query_fp32.transpose(1, 2)
-
-            if self.is_kv_shared_layer:
-                key_states, value_states = shared_kv_states[self.kv_shared_layer_index]
-                key_states = key_states.to(query_fp32.device).to(torch.float32)
-                value_states = value_states.to(query_fp32.device).to(torch.float32)
-            else:
-                key_fp32 = self.k_proj(h_bf).view(hidden_shape).to(torch.float32)
-                if self.v_proj is not None:
-                    value_fp32 = self.v_proj(h_bf).view(hidden_shape).to(torch.float32)
-                else:
-                    value_fp32 = key_fp32
-
-                key_fp32 = self.k_norm(key_fp32).to(torch.float32)
-                key_fp32 = apply_rotary_pos_emb(key_fp32, cos_fp32, sin_fp32, unsqueeze_dim=2)
-                key_fp32 = key_fp32.transpose(1, 2)
-
-                value_fp32 = self.v_norm(value_fp32).to(torch.float32)
-                value_fp32 = value_fp32.transpose(1, 2)
-
-                key_states, value_states = key_fp32, value_fp32
-
-            if past_key_values is not None and not self.is_kv_shared_layer:
-                key_states, value_states = past_key_values.update(
-                    key_states, value_states, self.layer_idx,
-                )
-            if self.store_full_length_kv:
-                shared_kv_states[self.layer_idx] = key_states, value_states
-
-            attn_impl = getattr(self.config, "_attn_implementation", "sdpa")
-            if _UNSLOTH_FLEX_ATTENTION_DISABLED and attn_impl == "flex_attention":
-                attn_impl = "sdpa"
-
-            attn_mask = attention_mask
-            if isinstance(attn_mask, torch.Tensor) and attn_mask.dtype != torch.bool:
-                attn_mask = attn_mask.to(torch.float32)
-
-            if attn_impl in ("sdpa", "eager") or attn_impl not in ALL_ATTENTION_FUNCTIONS:
-                is_causal = (
-                    query_fp32.shape[2] > 1
-                    and attn_mask is None
-                    and getattr(self, "is_causal", True)
-                )
-                attn_output_fp32 = scaled_dot_product_attention(
-                    query_fp32.contiguous(),
-                    key_states.contiguous(),
-                    value_states.contiguous(),
-                    attn_mask=attn_mask,
-                    dropout_p=self.attention_dropout if self.training else 0.0,
-                    is_causal=bool(is_causal),
-                    scale=getattr(self, "scaling", None),
-                    enable_gqa=getattr(self, "num_key_value_groups", 1) != 1,
-                )
-                attn_weights = None
-            else:
-                attention_interface = ALL_ATTENTION_FUNCTIONS[attn_impl]
-                attn_output_fp32, attn_weights = attention_interface(
-                    self,
-                    query_fp32,
-                    key_states,
-                    value_states,
-                    attn_mask,
-                    dropout=self.attention_dropout if self.training else 0.0,
-                    scaling=getattr(self, "scaling", None),
-                    sliding_window=getattr(self, "sliding_window", None),
-                    **kwargs,
-                )
-
-            if attn_impl != "flex_attention":
-                attn_output_fp32 = attn_output_fp32.transpose(1, 2).contiguous()
-            attn_output_fp32 = attn_output_fp32.reshape(*input_shape, -1)
-
-            # Clamp below fp16_max with safety margin; cast to fp16 directly
-            # so any PEFT LoRA internal cast to fp16 stays finite.
-            _SAFE_FP16 = 65280.0
-            attn_output_fp32 = torch.clamp(
-                attn_output_fp32, min=-_SAFE_FP16, max=_SAFE_FP16,
-            )
-            attn_output = self.o_proj(attn_output_fp32.to(torch.float16))
-            # Safety net mirroring the MLP patch: o_proj's fp16 accumulator
-            # can still overflow for long sequences; saturate non-finite
-            # values so downstream norms never see +-inf / NaN.
-            attn_output = torch.nan_to_num(
-                attn_output, nan=0.0, posinf=_SAFE_FP16, neginf=-_SAFE_FP16,
-            )
-        return attn_output, None if attn_impl in ("sdpa", "eager") else attn_weights
-
-    try:
-        # relaxed match: Gemma-4's signature contains KW-only shared_kv_states;
-        # the patched forward uses **kwargs to stay compatible with any
-        # additional parameters a future transformers build may add.
-        patch_function(
-            Gemma4TextAttention, "forward", forward, match_level="relaxed",
-        )
-    except Exception as e:
-        return raise_error("Gemma4TextAttention.forward", e)
-pass
-TEMPORARY_PATCHES.append(patch_Gemma4TextAttention)

From b8d9b848f4f01458b0718f923df1215148f59a25 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sat, 18 Apr 2026 12:08:44 +0000
Subject: [PATCH 3/8] Drop int8 experimental path, ship only the minimal fp16
 MLP patch

The int8 branch (torch.ops.aten._weight_int8pack_mm) worked end-to-end
but failed the RL-parity goal: 100-step GRPO on unsloth/gemma-4-E2B-it
with temperature=0.05, min_p=0.5, seed=3407 gave total |KL| of 7.47e+06
for int8 vs 52.16 for fp16+clamp vs 9387 for bf16, and int8 step 7
already had grad_norm=NaN.

Cause: GRPO's log-pi-new - log-pi-old ratio amplifies the ~7%
per-matmul weight-quantization noise, because the rollout path runs
under torch.inference_mode (dense fp16) while the training forward
used int8. Making them symmetric would require a separate int8
reference model, which is out of scope for a surgical NaN fix.

This commit removes all int8 code (Int8LinearFn autograd wrapper, row
quantizer, PEFT LoRA-aware int8 forward, UNSLOTH_GEMMA4_MLP_INT8 env
switch) and keeps only the fp16 trick:

  - fp32 act_fn(gate) * up so the product cannot overflow
  - clamp to 65280 before down_proj
  - nan_to_num on down_proj output for the rare accumulator tail

The patched forward is now 7 effective lines. Dtype contract is
identical to upstream (input dtype -> input dtype), so no attention /
RMSNorm / embedding companion patches are needed and the KV cache
stays aligned.

100-step verification vs bf16 (temp=0.05, min_p=0.5, seed=3407):

  median |KL|   bf16 4.02e-05  fp16+clamp 7.99e-06 (0.20x)
  p95 |KL|      bf16 0.411     fp16+clamp 0.620    (1.51x)
  max |KL|      bf16 8529      fp16+clamp 35.77    (0.004x)
  total |KL|    bf16 9388      fp16+clamp 52.16    (0.006x)
  mean reward   bf16 1.2111    fp16+clamp 1.1359   (|d|=0.075)
  reward-equal  75/100 steps
  time          +0.3%

Note: bf16's two huge outlier steps (step 24 |KL|=8529 grad=26461;
step 42 |KL|=800 grad=1434) are what push its total KL above
fp16+clamp's. For the calm 63-66% of steps both trajectories track
each other to 1e-5 precision.
---
 unsloth_zoo/temporary_patches/gemma4.py | 59 +++++++++++--------------
 1 file changed, 25 insertions(+), 34 deletions(-)

diff --git a/unsloth_zoo/temporary_patches/gemma4.py b/unsloth_zoo/temporary_patches/gemma4.py
index cb9b92a2b..59360b9c1 100644
--- a/unsloth_zoo/temporary_patches/gemma4.py
+++ b/unsloth_zoo/temporary_patches/gemma4.py
@@ -658,38 +658,23 @@ def patch_Gemma4TextMLP():
     """Stabilize Gemma-4 MLP under fp16 autocast (GRPO on fp16, Tesla T4).
 
     Root cause: `down_proj(act_fn(gate_proj(x)) * up_proj(x))` summed over
-    the wide intermediate dimension can exceed `fp16_max` = 65504 so the
-    fp16 matmul cast produces +-inf. That inf poisons the residual stream
-    and generation then samples NaN logits, tripping the categorical
-    assert at GRPO step ~2 on E2B/E4B with `dtype=torch.float16`.
-
-    Minimal surgical fix (single patch, no attention/norm/embedding
-    changes required):
-
-      1. Run the gate + up activation and multiply in fp32 so the product
-         cannot overflow before we clamp. gate_proj / up_proj stay as fp16
-         tensor-core matmuls (fast on T4 at 65 TFLOPS, B200, etc).
-      2. Clamp the product to 65280 (one bf16 ulp below fp16_max) so
-         down_proj never sees a saturated input - prevents the main failure
-         mode and preserves numerical fidelity for typical activations.
-      3. `nan_to_num` the output as defense-in-depth against the rare case
-         where the fp16 accumulator in down_proj still overflows for long
-         sequences or weights that have drifted during GRPO.
-
-    Dtype contract: preserves the upstream one (input dtype -> input dtype).
-    No KV cache dtype mismatch, so no attention patch is needed. Upstream
-    Gemma4RMSNorm already computes internally in fp32 and returns
-    `type_as(hidden_states)`, so no RMSNorm patch is needed either.
-
-    Tesla T4 (SM 7.5, no bf16) notes:
-      - Our FORCE_FLOAT32 path falls back to fp16 weights on T4 (bf16
-        unsupported), so the autocast context is fp16 and the patch's
-        fp32 product runs on CUDA cores (8.1 TFLOPS) while the
-        gate/up/down matmuls stay on fp16 tensor cores (65 TFLOPS).
-        Net: minimal perf overhead, matmul throughput preserved.
-      - int8 tensor cores (130 TOPS on T4) via bitsandbytes Linear8bitLt
-        could further accelerate the matmuls; out of scope for this patch
-        since it requires a different module path and calibration flow.
+    the wide intermediate dimension can exceed fp16_max = 65504 so the fp16
+    matmul cast produces +-inf. That inf poisons the residual stream and
+    generation then samples NaN logits, tripping the categorical assert at
+    GRPO step ~2 on E2B/E4B with dtype=torch.float16.
+
+    Fix, three cheap operations, single patch:
+
+      1. act_fn(gate) * up in fp32 so the product cannot overflow.
+      2. Clamp to 65280 (one bf16 ulp below fp16_max) before down_proj.
+      3. nan_to_num on the output, rescuing the rare down_proj fp16
+         accumulator overflow on wide intermediate dims.
+
+    Dtype contract is unchanged from upstream (input dtype -> input dtype),
+    so RMSNorm / Attention / Embedding need no companion patches and the KV
+    cache stays aligned with the text attention output. gate_proj, up_proj
+    and down_proj remain fp16 tensor-core matmuls (full T4 throughput at
+    65 TFLOPS).
     """
     if os.environ.get("UNSLOTH_FORCE_FLOAT32", "0") == "0":
         return
@@ -701,18 +686,24 @@ def patch_Gemma4TextMLP():
     except AttributeError as e:
         return raise_error("Gemma4TextMLP.forward", e)
 
+    # 65280 is the bf16 value one ulp below fp16_max (65504) and is exactly
+    # representable in both fp16 and bf16, so clamping here survives any
+    # downstream round-trip through PEFT's internal dtype casts.
     _SAFE_FP16 = 65280.0
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         gate = self.gate_proj(x)
         up = self.up_proj(x)
+        # fp32 act + multiply so the product cannot overflow before clamp.
         product = self.act_fn(gate.float()) * up.float()
         product = torch.clamp(product, min=-_SAFE_FP16, max=_SAFE_FP16)
         out = self.down_proj(product.to(x.dtype))
-        out = torch.nan_to_num(
+        # nan_to_num catches the rare down_proj fp16 accumulator overflow
+        # on wide intermediate dims (empirically observed at GRPO step ~2
+        # on E2B before any training). Cheap elementwise on the residual.
+        return torch.nan_to_num(
             out, nan=0.0, posinf=_SAFE_FP16, neginf=-_SAFE_FP16,
         )
-        return out
     try:
         patch_function(
             Gemma4TextMLP, "forward", forward, fullgraph=False,

From 1ae66acdf25138818e20ef5bdcd8e3a0eb1993e1 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sun, 19 Apr 2026 05:33:22 +0000
Subject: [PATCH 4/8] Correct 65280 comment in patch_Gemma4TextMLP

65280 is the largest value exactly representable in both fp16 and bf16
(one bf16 ULP below 65536, 224 below fp16_max=65504). The previous
wording claimed it was one bf16 ulp below fp16_max, but fp16_max=65504
is not representable in bf16 at all -- it rounds up to 65536.

Clamp value is unchanged; comment only.
---
 unsloth_zoo/temporary_patches/gemma4.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/unsloth_zoo/temporary_patches/gemma4.py b/unsloth_zoo/temporary_patches/gemma4.py
index 59360b9c1..210148dd2 100644
--- a/unsloth_zoo/temporary_patches/gemma4.py
+++ b/unsloth_zoo/temporary_patches/gemma4.py
@@ -666,7 +666,8 @@ def patch_Gemma4TextMLP():
     Fix, three cheap operations, single patch:
 
       1. act_fn(gate) * up in fp32 so the product cannot overflow.
-      2. Clamp to 65280 (one bf16 ulp below fp16_max) before down_proj.
+      2. Clamp to 65280 (largest value exactly representable in both fp16
+         and bf16) before down_proj.
       3. nan_to_num on the output, rescuing the rare down_proj fp16
          accumulator overflow on wide intermediate dims.
 
@@ -686,9 +687,11 @@ def patch_Gemma4TextMLP():
     except AttributeError as e:
         return raise_error("Gemma4TextMLP.forward", e)
 
-    # 65280 is the bf16 value one ulp below fp16_max (65504) and is exactly
-    # representable in both fp16 and bf16, so clamping here survives any
-    # downstream round-trip through PEFT's internal dtype casts.
+    # 65280 is the largest value exactly representable in both fp16 and bf16:
+    # one bf16 ULP below 65536 (the next representable bf16 value) and 224
+    # below fp16_max=65504. Note fp16_max itself is not representable in bf16
+    # (it rounds up to 65536). Clamping here therefore survives any downstream
+    # round-trip through PEFT's internal dtype casts without rounding to inf.
     _SAFE_FP16 = 65280.0
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:

From 2c1c3073ffc20527b18eefd43688189ca721da59 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sun, 19 Apr 2026 06:12:34 +0000
Subject: [PATCH 5/8] Harden patch_Gemma4TextMLP: fp16-only dtype guard,
 identity-residual nan rescue, inline import

- Add `x.dtype != torch.float16` guard so bf16/fp32 activations pass
  through the upstream forward unchanged. Under UNSLOTH_FORCE_FLOAT32
  the weights are fp16 so this is normally unreachable, but the guard
  protects against pipeline configurations where autocast is disabled
  (rl_replacements nullcontext path) and prevents unintended clamping
  if activations reach the MLP as bf16.

- Change nan_to_num replacements from +-65280 to 0 so overflow
  positions contribute nothing and leave the identity residual intact.
  Replacing with the fp16 ceiling would otherwise dominate the O(1)
  post-RMSNorm hidden state. Backward gradient through nan_to_num is
  `grad * isfinite(input)` in both cases, so this is loss-free.

- Drop the one-off `_gemma4_modeling` helper and inline the import to
  match the pattern of every other patch in this file.

- Drop the `x: torch.Tensor` annotation to match the rest of the
  temporary patches (`def forward(self, x):`).
---
 unsloth_zoo/temporary_patches/gemma4.py | 26 ++++++++++---------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/unsloth_zoo/temporary_patches/gemma4.py b/unsloth_zoo/temporary_patches/gemma4.py
index 210148dd2..46c0ffd6f 100644
--- a/unsloth_zoo/temporary_patches/gemma4.py
+++ b/unsloth_zoo/temporary_patches/gemma4.py
@@ -646,14 +646,6 @@ def forward(self, hidden_states, position_embeddings, attention_mask=None, **kwa
 # ============================================================================
 
 
-def _gemma4_modeling():
-    try:
-        import transformers.models.gemma4.modeling_gemma4 as mod
-        return mod
-    except ImportError:
-        return None
-
-
 def patch_Gemma4TextMLP():
     """Stabilize Gemma-4 MLP under fp16 autocast (GRPO on fp16, Tesla T4).
 
@@ -679,8 +671,9 @@ def patch_Gemma4TextMLP():
     """
     if os.environ.get("UNSLOTH_FORCE_FLOAT32", "0") == "0":
         return
-    mod = _gemma4_modeling()
-    if mod is None:
+    try:
+        import transformers.models.gemma4.modeling_gemma4 as mod
+    except ImportError:
         return
     try:
         Gemma4TextMLP = mod.Gemma4TextMLP
@@ -694,7 +687,9 @@ def patch_Gemma4TextMLP():
     # round-trip through PEFT's internal dtype casts without rounding to inf.
     _SAFE_FP16 = 65280.0
 
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward(self, x):
+        if x.dtype != torch.float16:
+            return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
         gate = self.gate_proj(x)
         up = self.up_proj(x)
         # fp32 act + multiply so the product cannot overflow before clamp.
@@ -702,11 +697,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         product = torch.clamp(product, min=-_SAFE_FP16, max=_SAFE_FP16)
         out = self.down_proj(product.to(x.dtype))
         # nan_to_num catches the rare down_proj fp16 accumulator overflow
-        # on wide intermediate dims (empirically observed at GRPO step ~2
-        # on E2B before any training). Cheap elementwise on the residual.
-        return torch.nan_to_num(
-            out, nan=0.0, posinf=_SAFE_FP16, neginf=-_SAFE_FP16,
-        )
+        # on wide intermediate dims. Replacements are 0 so the MLP output
+        # at overflow positions defers to the identity residual instead of
+        # injecting a near-fp16_max value that would dominate hidden_states.
+        return torch.nan_to_num(out, nan=0.0, posinf=0.0, neginf=0.0)
     try:
         patch_function(
             Gemma4TextMLP, "forward", forward, fullgraph=False,

From d943395777df8b5ef82ba1afb0c89a311ee1a8e6 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sun, 19 Apr 2026 06:40:28 +0000
Subject: [PATCH 6/8] patch_Gemma4TextMLP: gate on matmul output dtype, inline
 up projection

- Switch the stabilization guard from `x.dtype != torch.float16` to
  `gate.dtype != torch.float16`. The matmul output dtype is what
  determines whether overflow can occur, so this also catches
  mixed-precision cases (bf16 activations through fp16-cast weights via
  autocast or do_forced_float32) that the x.dtype check missed. For
  the standard UNSLOTH_FORCE_FLOAT32 path behavior is unchanged.

- Inline `self.up_proj(x).float()` so the fp16 up tensor is transient,
  and reuse the already-computed gate in the bypass path.

- Cast product back with `gate.dtype` instead of `x.dtype` to avoid a
  bf16-input/fp16-weight mismatch at down_proj if stabilization is
  active with non-fp16 activations.
---
 unsloth_zoo/temporary_patches/gemma4.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/unsloth_zoo/temporary_patches/gemma4.py b/unsloth_zoo/temporary_patches/gemma4.py
index 46c0ffd6f..647a157fe 100644
--- a/unsloth_zoo/temporary_patches/gemma4.py
+++ b/unsloth_zoo/temporary_patches/gemma4.py
@@ -688,14 +688,17 @@ def patch_Gemma4TextMLP():
     _SAFE_FP16 = 65280.0
 
     def forward(self, x):
-        if x.dtype != torch.float16:
-            return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
         gate = self.gate_proj(x)
-        up = self.up_proj(x)
+        # Gate on the matmul output dtype rather than x.dtype so that
+        # bf16/fp32 activations combined with fp16 weights (via autocast or
+        # do_forced_float32) still enter the stabilization path when the
+        # projection actually produces fp16 outputs.
+        if gate.dtype != torch.float16:
+            return self.down_proj(self.act_fn(gate) * self.up_proj(x))
         # fp32 act + multiply so the product cannot overflow before clamp.
-        product = self.act_fn(gate.float()) * up.float()
+        product = self.act_fn(gate.float()) * self.up_proj(x).float()
         product = torch.clamp(product, min=-_SAFE_FP16, max=_SAFE_FP16)
-        out = self.down_proj(product.to(x.dtype))
+        out = self.down_proj(product.to(gate.dtype))
         # nan_to_num catches the rare down_proj fp16 accumulator overflow
         # on wide intermediate dims. Replacements are 0 so the MLP output
         # at overflow positions defers to the identity residual instead of

From fcd23f48be9da66b09e09b4c2df3455a1a421be4 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sun, 19 Apr 2026 06:42:36 +0000
Subject: [PATCH 7/8] Add review tests

---
 tests/test_gemma4_mlp_autocast.py | 130 ++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 tests/test_gemma4_mlp_autocast.py

diff --git a/tests/test_gemma4_mlp_autocast.py b/tests/test_gemma4_mlp_autocast.py
new file mode 100644
index 000000000..87e05937e
--- /dev/null
+++ b/tests/test_gemma4_mlp_autocast.py
@@ -0,0 +1,130 @@
+import os
+import sys
+import types
+import importlib.util
+
+os.environ.setdefault("UNSLOTH_IS_PRESENT", "1")
+os.environ.setdefault("UNSLOTH_COMPILE_DISABLE", "1")
+if "unsloth" not in sys.modules:
+    _stub = types.ModuleType("unsloth")
+    _stub.__spec__ = importlib.util.spec_from_loader("unsloth", loader=None)
+    _stub.__path__ = []
+    sys.modules["unsloth"] = _stub
+
+import pytest
+import torch
+import torch.nn as nn
+
+from unsloth_zoo.temporary_patches import gemma4 as g4
+
+
+def _make_mlp_class():
+    class Gemma4TextMLP(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.gate_proj = nn.Linear(64, 2048, bias=False)
+            self.up_proj = nn.Linear(64, 2048, bias=False)
+            self.down_proj = nn.Linear(2048, 64, bias=False)
+            self.act_fn = nn.GELU(approximate="tanh")
+            with torch.no_grad():
+                for p in (
+                    self.gate_proj.weight,
+                    self.up_proj.weight,
+                    self.down_proj.weight,
+                ):
+                    p.fill_(0.5)
+
+        def forward(self, x):
+            return self.down_proj(
+                self.act_fn(self.gate_proj(x)) * self.up_proj(x)
+            )
+
+    return Gemma4TextMLP
+
+
+def _install_module_stub(monkeypatch, cls):
+    fake = types.ModuleType("transformers.models.gemma4.modeling_gemma4")
+    fake.Gemma4TextMLP = cls
+    for pkg in (
+        "transformers",
+        "transformers.models",
+        "transformers.models.gemma4",
+    ):
+        if pkg not in sys.modules:
+            p = types.ModuleType(pkg)
+            p.__path__ = []
+            monkeypatch.setitem(sys.modules, pkg, p)
+    monkeypatch.setitem(
+        sys.modules, "transformers.models.gemma4.modeling_gemma4", fake
+    )
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
+def test_bf16_weights_fp16_autocast_stabilizes(monkeypatch):
+    # On bf16-capable GPUs a user can wrap inference in torch.amp.autocast(fp16)
+    # over a bf16-loaded model. x.dtype stays bf16, but self.gate_proj(x)
+    # executes in fp16 and can overflow. The gate.dtype guard must detect
+    # this and enter stabilization; an x.dtype guard would bypass it.
+    monkeypatch.setenv("UNSLOTH_FORCE_FLOAT32", "1")
+    cls = _make_mlp_class()
+    _install_module_stub(monkeypatch, cls)
+    g4.patch_Gemma4TextMLP()
+    m = cls().cuda().to(torch.bfloat16).eval()
+    torch.manual_seed(0)
+    x = torch.randn(2, 64, dtype=torch.bfloat16, device="cuda") * 20.0
+    with torch.no_grad(), torch.amp.autocast("cuda", dtype=torch.float16):
+        out = m(x)
+    assert torch.all(torch.isfinite(out)).item()
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
+def test_fp32_weights_fp16_autocast_stabilizes(monkeypatch):
+    monkeypatch.setenv("UNSLOTH_FORCE_FLOAT32", "1")
+    cls = _make_mlp_class()
+    _install_module_stub(monkeypatch, cls)
+    g4.patch_Gemma4TextMLP()
+    m = cls().cuda().eval()
+    torch.manual_seed(0)
+    x = torch.randn(2, 64, device="cuda") * 20.0
+    with torch.no_grad(), torch.amp.autocast("cuda", dtype=torch.float16):
+        out = m(x)
+    assert torch.all(torch.isfinite(out)).item()
+
+
+def test_idempotent_patch_install(monkeypatch):
+    monkeypatch.setenv("UNSLOTH_FORCE_FLOAT32", "1")
+    cls = _make_mlp_class()
+    _install_module_stub(monkeypatch, cls)
+    g4.patch_Gemma4TextMLP()
+    first = cls.forward
+    g4.patch_Gemma4TextMLP()
+    second = cls.forward
+    # Second call should leave the class in a patched, working state.
+    assert second is not None
+    assert second.__name__ == "forward"
+    m = cls().half().eval()
+    torch.manual_seed(0)
+    x = torch.randn(2, 64, dtype=torch.float16) * 0.1
+    with torch.no_grad():
+        out = m(x)
+    assert torch.all(torch.isfinite(out)).item()
+    assert (out != 0).any().item()
+
+
+def test_pure_bf16_path_bypasses_even_with_overflow_scale(monkeypatch):
+    # Pure bf16 path should NEVER enter stabilization regardless of input
+    # magnitude, because bf16's dynamic range does not overflow at these
+    # scales. gate.dtype guard must send it to upstream verbatim.
+    monkeypatch.setenv("UNSLOTH_FORCE_FLOAT32", "1")
+    cls = _make_mlp_class()
+    _install_module_stub(monkeypatch, cls)
+    upstream = cls.forward
+    g4.patch_Gemma4TextMLP()
+    m = cls().to(torch.bfloat16).eval()
+    torch.manual_seed(0)
+    x = torch.randn(2, 64, dtype=torch.bfloat16) * 50.0
+    with torch.no_grad():
+        patched = m(x)
+        expected = upstream(m, x)
+    assert torch.equal(patched, expected)
+    assert patched.dtype == torch.bfloat16

From f21085d817e33246d3218b0650b325a5f0990a2a Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sun, 19 Apr 2026 06:46:27 +0000
Subject: [PATCH 8/8] Consolidate review tests into
 test_patch_gemma4_text_mlp.py

---
 ...ocast.py => test_patch_gemma4_text_mlp.py} | 114 ++++++++++++++----
 1 file changed, 88 insertions(+), 26 deletions(-)
 rename tests/{test_gemma4_mlp_autocast.py => test_patch_gemma4_text_mlp.py} (57%)

diff --git a/tests/test_gemma4_mlp_autocast.py b/tests/test_patch_gemma4_text_mlp.py
similarity index 57%
rename from tests/test_gemma4_mlp_autocast.py
rename to tests/test_patch_gemma4_text_mlp.py
index 87e05937e..99d39e1ff 100644
--- a/tests/test_gemma4_mlp_autocast.py
+++ b/tests/test_patch_gemma4_text_mlp.py
@@ -59,49 +59,66 @@ def _install_module_stub(monkeypatch, cls):
     )
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-def test_bf16_weights_fp16_autocast_stabilizes(monkeypatch):
-    # On bf16-capable GPUs a user can wrap inference in torch.amp.autocast(fp16)
-    # over a bf16-loaded model. x.dtype stays bf16, but self.gate_proj(x)
-    # executes in fp16 and can overflow. The gate.dtype guard must detect
-    # this and enter stabilization; an x.dtype guard would bypass it.
-    monkeypatch.setenv("UNSLOTH_FORCE_FLOAT32", "1")
+def test_noop_when_force_float32_unset(monkeypatch):
+    monkeypatch.delenv("UNSLOTH_FORCE_FLOAT32", raising=False)
     cls = _make_mlp_class()
     _install_module_stub(monkeypatch, cls)
+    original = cls.forward
     g4.patch_Gemma4TextMLP()
-    m = cls().cuda().to(torch.bfloat16).eval()
+    assert cls.forward is original
+
+
+def test_noop_when_force_float32_zero(monkeypatch):
+    monkeypatch.setenv("UNSLOTH_FORCE_FLOAT32", "0")
+    cls = _make_mlp_class()
+    _install_module_stub(monkeypatch, cls)
+    original = cls.forward
+    g4.patch_Gemma4TextMLP()
+    assert cls.forward is original
+
+
+def test_upstream_without_patch_overflows_fp16():
+    cls = _make_mlp_class()
+    m = cls().half().eval()
     torch.manual_seed(0)
-    x = torch.randn(2, 64, dtype=torch.bfloat16, device="cuda") * 20.0
-    with torch.no_grad(), torch.amp.autocast("cuda", dtype=torch.float16):
+    x = torch.randn(2, 64, dtype=torch.float16) * 20.0
+    with torch.no_grad():
         out = m(x)
-    assert torch.all(torch.isfinite(out)).item()
+    assert (~torch.isfinite(out)).any().item()
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-def test_fp32_weights_fp16_autocast_stabilizes(monkeypatch):
+def test_fp16_overflow_output_is_finite(monkeypatch):
     monkeypatch.setenv("UNSLOTH_FORCE_FLOAT32", "1")
     cls = _make_mlp_class()
     _install_module_stub(monkeypatch, cls)
     g4.patch_Gemma4TextMLP()
-    m = cls().cuda().eval()
+    m = cls().half().eval()
     torch.manual_seed(0)
-    x = torch.randn(2, 64, device="cuda") * 20.0
-    with torch.no_grad(), torch.amp.autocast("cuda", dtype=torch.float16):
+    x = torch.randn(2, 64, dtype=torch.float16) * 20.0
+    with torch.no_grad():
         out = m(x)
     assert torch.all(torch.isfinite(out)).item()
+    assert out.dtype == torch.float16
 
 
-def test_idempotent_patch_install(monkeypatch):
+def test_fp16_nan_to_num_replaces_with_zero(monkeypatch):
     monkeypatch.setenv("UNSLOTH_FORCE_FLOAT32", "1")
     cls = _make_mlp_class()
     _install_module_stub(monkeypatch, cls)
     g4.patch_Gemma4TextMLP()
-    first = cls.forward
+    m = cls().half().eval()
+    torch.manual_seed(0)
+    x = torch.randn(2, 64, dtype=torch.float16) * 20.0
+    with torch.no_grad():
+        out = m(x)
+    assert out.abs().max().item() == 0.0
+
+
+def test_fp16_normal_input_produces_nonzero_output(monkeypatch):
+    monkeypatch.setenv("UNSLOTH_FORCE_FLOAT32", "1")
+    cls = _make_mlp_class()
+    _install_module_stub(monkeypatch, cls)
     g4.patch_Gemma4TextMLP()
-    second = cls.forward
-    # Second call should leave the class in a patched, working state.
-    assert second is not None
-    assert second.__name__ == "forward"
     m = cls().half().eval()
     torch.manual_seed(0)
     x = torch.randn(2, 64, dtype=torch.float16) * 0.1
@@ -111,10 +128,7 @@ def test_idempotent_patch_install(monkeypatch):
     assert (out != 0).any().item()
 
 
-def test_pure_bf16_path_bypasses_even_with_overflow_scale(monkeypatch):
-    # Pure bf16 path should NEVER enter stabilization regardless of input
-    # magnitude, because bf16's dynamic range does not overflow at these
-    # scales. gate.dtype guard must send it to upstream verbatim.
+def test_bf16_input_matches_upstream(monkeypatch):
     monkeypatch.setenv("UNSLOTH_FORCE_FLOAT32", "1")
     cls = _make_mlp_class()
     _install_module_stub(monkeypatch, cls)
@@ -128,3 +142,51 @@ def test_pure_bf16_path_bypasses_even_with_overflow_scale(monkeypatch):
         expected = upstream(m, x)
     assert torch.equal(patched, expected)
     assert patched.dtype == torch.bfloat16
+
+
+def test_fp32_input_matches_upstream(monkeypatch):
+    monkeypatch.setenv("UNSLOTH_FORCE_FLOAT32", "1")
+    cls = _make_mlp_class()
+    _install_module_stub(monkeypatch, cls)
+    upstream = cls.forward
+    g4.patch_Gemma4TextMLP()
+    m = cls().eval()
+    torch.manual_seed(0)
+    x = torch.randn(2, 64)
+    with torch.no_grad():
+        patched = m(x)
+        expected = upstream(m, x)
+    assert torch.equal(patched, expected)
+
+
+def test_idempotent_patch_install(monkeypatch):
+    monkeypatch.setenv("UNSLOTH_FORCE_FLOAT32", "1")
+    cls = _make_mlp_class()
+    _install_module_stub(monkeypatch, cls)
+    g4.patch_Gemma4TextMLP()
+    g4.patch_Gemma4TextMLP()
+    assert cls.forward.__name__ == "forward"
+    m = cls().half().eval()
+    torch.manual_seed(0)
+    x = torch.randn(2, 64, dtype=torch.float16) * 0.1
+    with torch.no_grad():
+        out = m(x)
+    assert torch.all(torch.isfinite(out)).item()
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
+def test_bf16_weights_fp16_autocast_stabilizes(monkeypatch):
+    # bf16-capable GPU + user-wrapped torch.amp.autocast(fp16): x.dtype stays
+    # bf16 but self.gate_proj(x) runs in fp16 and can overflow. The
+    # gate.dtype guard must enter stabilization here; an x.dtype guard would
+    # bypass and leave the overflow unfixed.
+    monkeypatch.setenv("UNSLOTH_FORCE_FLOAT32", "1")
+    cls = _make_mlp_class()
+    _install_module_stub(monkeypatch, cls)
+    g4.patch_Gemma4TextMLP()
+    m = cls().cuda().to(torch.bfloat16).eval()
+    torch.manual_seed(0)
+    x = torch.randn(2, 64, dtype=torch.bfloat16, device="cuda") * 20.0
+    with torch.no_grad(), torch.amp.autocast("cuda", dtype=torch.float16):
+        out = m(x)
+    assert torch.all(torch.isfinite(out)).item()