diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml
index 90dd2b3db0..ed987558fe 100644
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -38,14 +38,6 @@ jobs:
             torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
             dockerfile: "Dockerfile-base"
             platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.10.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
           - cuda: "128"
             cuda_version: 12.8.1
             cudnn_version: ""
@@ -70,14 +62,6 @@ jobs:
             torch_cuda_arch_list: "9.0 10.0 10.3 12.0+PTX"
             dockerfile: "Dockerfile-base"
             platforms: "linux/amd64,linux/arm64"
-          - cuda: "130"
-            cuda_version: 13.0.0
-            cudnn_version: ""
-            python_version: "3.12"
-            pytorch: 2.9.1
-            torch_cuda_arch_list: "9.0 10.0 10.3 12.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
           - cuda: "130"
             cuda_version: 13.0.0
             cudnn_version: ""
@@ -208,19 +192,19 @@ jobs:
             torch_cuda_arch_list: "9.0 10.0 10.3 12.0+PTX"
             dockerfile: "Dockerfile-uv-base"
             platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
+          - cuda: "130"
+            cuda_version: 13.0.0
             cudnn_version: ""
             python_version: "3.12"
             pytorch: 2.11.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            torch_cuda_arch_list: "9.0 10.0 10.3 12.0+PTX"
             dockerfile: "Dockerfile-uv-base"
             platforms: "linux/amd64,linux/arm64"
           - cuda: "130"
             cuda_version: 13.0.0
             cudnn_version: ""
             python_version: "3.12"
-            pytorch: 2.11.0
+            pytorch: 2.12.0
             torch_cuda_arch_list: "9.0 10.0 10.3 12.0+PTX"
             dockerfile: "Dockerfile-uv-base"
             platforms: "linux/amd64,linux/arm64"
diff --git a/.github/workflows/multi-gpu-e2e.yml b/.github/workflows/multi-gpu-e2e.yml
index 03da58f7e1..81f89da887 100644
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -40,8 +40,8 @@ jobs:
           #            dockerfile: "Dockerfile-uv.jinja"
           - cuda: 130
             cuda_version: 13.0.0
-            python_version: "3.11"
-            pytorch: 2.9.1
+            python_version: "3.12"
+            pytorch: 2.12.0
             axolotl_extras:
             #            axolotl_extras: fbgemm-gpu
             num_gpus: 2
diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml
index c2fc1c9d87..c03c209c72 100644
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@@ -8,9 +8,6 @@ on:
 
 permissions: {}
 
-env:
-  UV_SYSTEM_PYTHON: "1"
-
 jobs:
   setup_release:
     name: Create Release
@@ -24,7 +21,10 @@ jobs:
       - name: Create release
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: gh release create "$GITHUB_REF_NAME" --generate-notes
+        # idempotent: don't fail a re-run if the release already exists
+        run: |
+          gh release view "$GITHUB_REF_NAME" >/dev/null 2>&1 \
+            || gh release create "$GITHUB_REF_NAME" --generate-notes
   pypi-publish:
     name: Upload release to PyPI
     runs-on: ubuntu-latest
@@ -47,13 +47,6 @@ jobs:
       - name: Install uv
         uses: astral-sh/setup-uv@v7
 
-      - name: Install dependencies
-        run: |
-          uv pip install wheel packaging
-          uv pip install --no-build-isolation -e .
-          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
-            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
-
       - name: Extract tag name
         id: tag
         run: echo "TAG_NAME=$(echo $GITHUB_REF | cut -d / -f 3)" >> "$GITHUB_OUTPUT"
@@ -62,9 +55,10 @@ jobs:
         run: |
           echo "${{ steps.tag.outputs.TAG_NAME }}" | sed 's/^v//' > VERSION
 
-      - name: Build a source dist
-        run: |
-          python setup.py sdist
+      - name: Build sdist and wheel
+        # PEP 517 build via uv (setuptools backend reads the version from VERSION);
+        # replaces the removed `python setup.py sdist` after the pyproject migration.
+        run: uv build
 
       - name: Publish package distributions to PyPI
         uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.github/workflows/tests-nightly.yml b/.github/workflows/tests-nightly.yml
index 1802b63056..1be17317ee 100644
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -160,7 +160,7 @@ jobs:
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
         run: |
-          modal run cicd.e2e_tests
+          modal run -m cicd.e2e_tests
   docker-e2e-multigpu-tests:
     if: github.repository_owner == 'axolotl-ai-cloud'
     # this job needs to be run on self-hosted GPU runners...
@@ -203,4 +203,4 @@ jobs:
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
         run: |
-          modal run cicd.multigpu
+          modal run -m cicd.multigpu
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 6b298ade02..888c3e39b9 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -68,11 +68,11 @@ jobs:
       fail-fast: false
       matrix:
         python_version: ["3.12", "3.14"]
-        pytorch_version: ["2.9.1", "2.10.0"]
+        pytorch_version: ["2.9.1", "2.10.0", "2.11.0", "2.12.0"]
         exclude:
           - python_version: "3.14"
             pytorch_version: "2.9.1"
-    timeout-minutes: 25
+    timeout-minutes: 30
 
     steps:
       - name: cleanup node
@@ -155,7 +155,7 @@ jobs:
       fail-fast: false
       matrix:
         python_version: ["3.12", "3.14"]
-        pytorch_version: ["2.9.1", "2.10.0"]
+        pytorch_version: ["2.9.1", "2.10.0", "2.11.0", "2.12.0"]
         exclude:
           - python_version: "3.14"
             pytorch_version: "2.9.1"
@@ -274,7 +274,7 @@ jobs:
           - cuda: 130
             cuda_version: 13.0.0
             python_version: "3.12"
-            pytorch: 2.9.1
+            pytorch: 2.12.0
             num_gpus: 1
             axolotl_extras:
     steps:
@@ -302,7 +302,7 @@ jobs:
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
         run: |
-          modal run cicd.e2e_tests
+          modal run -m cicd.e2e_tests
 
   docker-e2e-tests:
     if: >
@@ -320,12 +320,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.1
-            num_gpus: 1
-            axolotl_extras:
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
@@ -334,8 +328,8 @@ jobs:
             axolotl_extras:
           - cuda: 130
             cuda_version: 13.0.0
-            python_version: "3.11"
-            pytorch: 2.9.1
+            python_version: "3.12"
+            pytorch: 2.11.0
             num_gpus: 1
             axolotl_extras:
     steps:
@@ -364,7 +358,7 @@ jobs:
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
         run: |
-          modal run cicd.e2e_tests
+          modal run -m cicd.e2e_tests
 
   docker-e2e-cleanup:
     runs-on: [self-hosted, modal]
@@ -404,4 +398,4 @@ jobs:
           echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
       - name: Run tests job on Modal
         run: |
-          modal run cicd.cleanup
+          modal run -m cicd.cleanup
diff --git a/.gitignore b/.gitignore
index b75becc7c1..8365917bf3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,6 +32,7 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+uv.lock
 
 # PyInstaller
 #  Usually these files are written by a python script from a template
diff --git a/VERSION b/VERSION
index b08b47558c..52fc5b872a 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.16.2.dev0
+0.17.0.dev
diff --git a/cicd/cicd.sh b/cicd/cicd.sh
index 15a6f7ebf5..957c63f95a 100755
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
@@ -43,11 +43,20 @@ pytest --full-trace -vvv --durations=10 \
   --cov-append
 
 # Run solo tests with coverage append
+# test_rm_lora is run in its own process below (it fails on py3.11 when sharing
+# the solo process with other tests; isolating it avoids cross-test state).
 pytest -v --durations=10 -n1 \
+  --ignore=tests/e2e/solo/test_reward_model_smollm2.py \
   /workspace/axolotl/tests/e2e/solo/ \
   --cov=axolotl \
   --cov-append
 
+# Run reward-model test isolated in its own process
+pytest -v --durations=10 -s \
+  /workspace/axolotl/tests/e2e/solo/test_reward_model_smollm2.py \
+  --cov=axolotl \
+  --cov-append
+
 # Run integration tests with coverage append
 pytest -v --durations=10 \
   /workspace/axolotl/tests/e2e/integrations/ \
diff --git a/docs/mixed_precision.qmd b/docs/mixed_precision.qmd
index 7b77cd4bb4..ac0f668029 100644
--- a/docs/mixed_precision.qmd
+++ b/docs/mixed_precision.qmd
@@ -54,6 +54,26 @@ bf16: true
 bf16: full  # Equivalent to bf16_full_eval in the HF trainer
 ```
 
+### Keeping norms in fp32 (FSDP2) {#sec-fp32-norms}
+
+Some models declare RMSNorm/LayerNorm layers as fp32 for training
+stability — the variance computation in RMSNorm is numerically poor in
+bf16, and the learned gain γ quantizes harshly. With FSDP1 this fights
+the flat-param dtype uniformity constraint; with FSDP2 each norm can have
+its own `MixedPrecisionPolicy`. Enable with:
+
+```{.yaml}
+fsdp_version: 2
+fp32_norms: true
+# fp32_norm_classes:        # optional override
+#   - RMSNorm
+#   - LayerNorm
+```
+
+Defaults match any class whose name ends in `RMSNorm` or `LayerNorm`. Use
+fully qualified names (`module.path.ClassName`) to pin a specific
+implementation.
+
 ## FP8 Mixed Precision {#sec-fp8}
 
 ::: {.callout-note}
diff --git a/docs/multimodal.qmd b/docs/multimodal.qmd
index a8e42d3ffc..9568fcb5ce 100644
--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -368,19 +368,27 @@ Here is an example of a multi-modal dataset:
 
 Raw image+text continued pretraining — no chat template, no conversational
 scaffolding. The model learns to emit raw text conditioned on visual patches.
-Intended for use cases like OCR/transcription corpora where every row is a
-tight `(image, target_text)` pair and any user/assistant framing would pollute
-the learned signal.
+This is intended for corpora where every row is a tight `(image, target_text)`
+pair and any user/assistant framing would pollute the learned signal.
 
 ::: {.callout-note}
-**Currently supported via the streaming `pretraining_dataset` route only.**
-A non-streaming `datasets:` route (`type: multimodal_pretrain` under
-`datasets:`) is intentionally not wired — the collator selection inside
-`build_collator` only routes MM CPT batches under the pretraining branch.
-Configure MM CPT under `pretraining_dataset:` with `streaming: true` as
-shown below.
+MM CPT supports two raw image+text routes:
+
+- `pretraining_dataset:` with `streaming: true` for iterable streaming.
+- `datasets:` with `type: multimodal_pretrain` for the standard non-streaming
+  prepared dataset pipeline.
+
+The non-streaming path stores token metadata plus image paths in the prepared
+dataset; decoded image tensors are loaded by the collator at batch time.
 :::
 
+Full config examples are available under `examples/qwen2_5-vl/`:
+
+- `mm-cpt-nonstreaming-qlora.yaml` for raw `datasets:`
+  preprocessing plus `num_epochs` based training.
+- `mm-cpt-streaming-qlora.yaml` for streaming `pretraining_dataset:`
+  training with prepared-cache resume.
+
 ### Dataset format (JSONL)
 
 Two keys per row: `text` (the raw string) and `images` (list of local paths).
@@ -413,7 +421,7 @@ Notes:
 Axolotl autodetects the placeholder from the loaded processor. If autodetection
 fails, supply `image_token: <your token>` on the dataset entry.
 
-### YAML example
+### Streaming YAML example
 
 ```yaml
 base_model: HuggingFaceTB/SmolVLM-500M-Instruct
@@ -438,6 +446,91 @@ micro_batch_size: 1
 gradient_accumulation_steps: 8
 ```
 
+For large streaming runs that need cheap checkpoint resume, set
+`dataset_prepared_path` so Axolotl can cache the tokenized MM CPT rows, and use
+`ignore_data_skip: true` when you prefer resuming optimizer/scheduler state
+without Trainer fast-forwarding through already-seen multimodal batches:
+
+```yaml
+pretraining_dataset:
+  - path: /path/to/shards/*.jsonl
+    ds_type: json
+    type: multimodal_pretrain
+    text_column: text
+    image_column: images
+
+streaming: true
+dataset_prepared_path: ./data/mm-cpt-stream-cache
+ignore_data_skip: true
+max_steps: 10000
+```
+
+### Non-streaming prepared YAML example
+
+```yaml
+base_model: HuggingFaceTB/SmolVLM-500M-Instruct
+processor_type: AutoProcessor
+
+datasets:
+  - path: /path/to/train.jsonl
+    ds_type: json
+    type: multimodal_pretrain
+    text_column: text
+    image_column: images
+    image_base_dir: /path/to/images   # optional, for relative paths
+    # image_token: "<image>"           # optional override; autodetect by default
+
+streaming: false
+dataset_prepared_path: ./data/prepared
+sequence_len: 2048
+sample_packing: false                  # REQUIRED — see below
+remove_unused_columns: false           # auto-set by validator
+
+num_epochs: 1                           # max_steps is optional in this path
+micro_batch_size: 1
+gradient_accumulation_steps: 8
+```
+
+Because this route creates a map-style dataset with a known length, Axolotl can
+infer scheduler/training steps from `num_epochs`; unlike streaming
+`pretraining_dataset`, you do not need to manually calculate `max_steps`.
+
+You can run `axolotl preprocess` on this mode. The prepared Arrow dataset keeps
+`images` as image references and `_mm_text` as raw text for the collator; the
+collator loads images and calls the configured processor during training. If
+your JSONL stores relative image paths, keep `image_base_dir` set in the train
+config that consumes the prepared dataset.
+
+### Already-tokenized MM CPT rows
+
+The non-streaming `datasets:` path also accepts rows that have already been
+tokenized, as long as they still include `_mm_text` and `images` for the
+collator. The collator re-tokenizes `_mm_text` with the processor at batch time
+so image placeholders expand consistently with the loaded model; the stored
+token columns are used by the prepared dataset pipeline and should match the raw
+text.
+
+```yaml
+base_model: HuggingFaceTB/SmolVLM-500M-Instruct
+processor_type: AutoProcessor
+
+datasets:
+  - path: /path/to/pretokenized_mm_cpt.jsonl
+    ds_type: json
+    type: multimodal_pretrain
+
+streaming: false
+skip_prepare_dataset: true
+sequence_len: 2048
+sample_packing: false
+remove_unused_columns: false
+num_epochs: 1
+```
+
+```json
+{"_mm_text": "<image>\nText target.", "images": ["/dataset/image.png"], "input_ids": [1, 2, 3], "attention_mask": [1, 1, 1], "labels": [1, 2, 3]}
+```
+
 ### Eval datasets
 
 `test_datasets` accepts multimodal entries (`type: multimodal_pretrain` or
@@ -473,6 +566,12 @@ The following combinations are rejected at config-load time with a clear error:
 - `chat_template` set to anything — defeats the purpose of the CPT path.
 - `processor_type` unset — no processor means no image tensors.
 - Multiple MM `test_datasets` entries with mismatched `image_base_dir` or `image_token`.
+- Multiple MM training entries under `datasets:` or `pretraining_dataset:`.
+- `datasets:` MM CPT with `streaming: true` — use `pretraining_dataset:` for
+  streaming and `datasets:` for the non-streaming prepared path.
+- `datasets:` MM CPT with `excess_length_strategy: truncate` — the collator
+  re-tokenizes `_mm_text` at batch time, so truncating only prepared token ids
+  would not truncate the actual model inputs.
 
 In addition, the following model families are **not supported** in v1 and will
 be rejected when their processor is loaded:
diff --git a/docs/nd_parallelism.qmd b/docs/nd_parallelism.qmd
index 972953ff9f..729eeb9bea 100644
--- a/docs/nd_parallelism.qmd
+++ b/docs/nd_parallelism.qmd
@@ -39,6 +39,7 @@ Context Parallelism, also called [Sequence Parallelism](sequence_parallelism.qmd
 - How it works: If you have a sequence of 8192 tokens and a `context_parallel_size` of 4, each GPU will only handle a chunk of 2048 tokens.
 - The Challenge: Attention is not local; every token needs to "attend to" every other token. Splitting the sequence breaks this.
 - The Solution (`ring-flash-attention`): An efficient communication protocol is used. To compute attention for its local sequence chunk, each GPU passes its Key-Value (KV) cache to its neighbor in a "ring." After `N-1` steps, every GPU has seen the KV-cache from all other GPUs, allowing it to compute the correct attention values for its chunk. This is implemented using the highly optimized `flash-attention` kernel at each step.
+- **Mamba/SSM Hybrid Models**: For hybrid architectures (Nemotron-H, Falcon-H1, Granite MoE Hybrid), attention layers use ring attention as above, while SSM (Mamba2) layers use P2P state passing across ranks with an additive output correction. See [Sequence Parallelism](sequence_parallelism.qmd) for details.
 
 ### Expert Parallelism (EP) {#sec-ep}
 
diff --git a/docs/optimizations.qmd b/docs/optimizations.qmd
index 720519ec03..60a011b415 100644
--- a/docs/optimizations.qmd
+++ b/docs/optimizations.qmd
@@ -73,13 +73,25 @@ Provides efficient Triton kernels to improve training speed and reduce memory us
 
 - **Learn more:** [Custom Integrations - Liger Kernels](custom_integrations.qmd#liger-kernels)
 
+### Fused RMSNorm + RoPE (Qwen3 / Qwen3-MoE / Qwen3.5 / Qwen3.5-MoE / Qwen3.6 dense / Qwen3.6-MoE)
+
+Replaces the per-layer `q_norm + apply_rotary_pos_emb` (and matching K path) with a single Triton kernel launch on the full-attention layers. Opt-in. The kernel computes in fp32 and rounds once, so it matches an fp32 reference to within bf16 rounding — i.e. it is *more* accurate than the eager bf16 path, which rounds at several intermediate steps. Gemma 4 always uses the fused path (no flag needed). Qwen3.6 checkpoints are loaded by transformers under the `qwen3_5` / `qwen3_5_moe` model_types, so the same flag covers both generations.
+
+```yaml
+fused_attn_kernel: true
+```
+
+- **Compile-safe:** the kernel is wrapped as a `torch.library.triton_op` and traces under `torch.compile(fullgraph=True)`.
+- **Hardware note:** on sm_120 (Blackwell) combining with `torch_compile: true` is a net win; on sm_86 (Ampere consumer) `torch_compile: true` currently regresses the surrounding Inductor-generated kernels — keep compile off there.
+
 ### Expert Kernels
 
-Optimized kernel implementations for Mixture of Experts (MoE) model training.
+Optimized per-expert grouped-GEMM kernels for MoE training, with LoRA support.
 
-- **ScatterMoE**: Triton-based MoE kernels with fused LoRA support.
-- **SonicMoE**: CUTLASS-based MoE kernels for NVIDIA Hopper and Blackwell GPUs.
+- **ScatterMoE**: Triton, any CUDA GPU.
+- **SonicMoE**: CUTLASS / cute-DSL, Hopper+ only.
 
+- **Config:** `use_scattermoe: true` or `use_sonicmoe: true`
 - **Learn more:** [Custom Integrations - Kernels Integration](custom_integrations.qmd#kernels-integration)
 
 ## Long Context Models
@@ -117,7 +129,7 @@ To train models that don't fit on a single GPU, you'll need to use a distributed
 
 ### N-D Parallelism (Beta)
 
-For advanced scaling, Axolotl allows you to compose different parallelism techniques (e.g., Data, Tensor, Sequence Parallelism). This is a powerful approach to train an extremely large model by overcoming multiple bottlenecks at once.
+For advanced scaling, Axolotl allows you to compose different parallelism techniques (e.g., Data, Tensor, Sequence, Expert Parallelism). This is a powerful approach to train an extremely large model by overcoming multiple bottlenecks at once.
 
 - **Learn more:** [N-D Parallelism Guide](nd_parallelism.qmd)
 
diff --git a/docs/optimizers.qmd b/docs/optimizers.qmd
index 45eea1d3ab..ffa1fa05d8 100644
--- a/docs/optimizers.qmd
+++ b/docs/optimizers.qmd
@@ -127,3 +127,43 @@ dion_lr: 0.01
 dion_momentum: 0.95
 lr: 0.00001  # learning rate for embeddings and parameters that fallback to AdamW
 ```
+
+### q_galore_adamw8bit
+
+Q-GaLore extends [GaLore](https://arxiv.org/abs/2403.03507) with two extra ideas:
+an INT4-quantized projection matrix and an adaptive SVD scheduler that skips
+re-projection when a layer's gradient subspace stabilizes. Both are wired up in
+axolotl. The third Q-GaLore trick — INT8 weight wrapping — is not yet
+implemented and is tracked as a follow-up.
+
+GitHub: [https://github.com/VITA-Group/Q-GaLore](https://github.com/VITA-Group/Q-GaLore)
+Paper: [https://arxiv.org/abs/2407.08296](https://arxiv.org/abs/2407.08296)
+
+Install: `pip install axolotl[qgalore]`
+
+This optimizer is for **full fine-tuning**. It is incompatible with `adapter`
+(LoRA/QLoRA), `load_in_8bit`, and `load_in_4bit`. DeepSpeed is currently gated
+off; FSDP requires `fsdp_version: 2` with `use_orig_params: true`.
+
+```yaml
+optimizer: q_galore_adamw8bit
+bf16: true
+
+# which parameter substrings get the low-rank projection
+# (defaults to ["attn", "mlp"] if unset — matches the reference impl)
+optim_target_modules:
+  - attn
+  - mlp
+
+# Q-GaLore hyperparameters (defaults shown)
+qgalore_rank: 256
+qgalore_update_proj_gap: 200       # max steps between SVD refreshes
+qgalore_scale: 0.25
+qgalore_proj_type: std
+qgalore_proj_quant: true           # INT-quantize the projection matrix P
+qgalore_proj_bits: 4               # bitwidth for P
+qgalore_proj_group_size: 256       # must divide P's last dim evenly
+qgalore_cos_threshold: 0.4         # skip SVD if P_t is this similar to P_{t-1}
+qgalore_gamma_proj: 2              # grow update_proj_gap by this factor when stable
+qgalore_queue_size: 5
+```
diff --git a/docs/sequence_parallelism.qmd b/docs/sequence_parallelism.qmd
index 9799c8a700..f03daf0d89 100644
--- a/docs/sequence_parallelism.qmd
+++ b/docs/sequence_parallelism.qmd
@@ -58,6 +58,34 @@ To use sequence parallelism, you need:
 - Flash attention must be enabled for this to work (`attn_implementation: flash_attention_2` in config YAML)
 - May have a small performance overhead due to communication between GPUs
 
+### Mamba/SSM Hybrid Models
+
+Context parallelism is supported for hybrid models that combine attention and Mamba2 SSM
+layers. These models require special handling because:
+
+- **Attention layers** work correctly via ring flash attention (same as pure-attention models).
+- **SSM (Mamba2) layers** are recurrent and need cross-rank hidden-state propagation.
+
+Axolotl handles both aspects:
+
+1. **Sample packing boundaries** (`seq_idx`): When multiple sequences are packed into one
+   row, the SSM kernels need `seq_idx` to reset state at boundaries. Under CP, chunks may
+   start mid-sample, so `seq_idx` is derived from `position_ids` using a CP-safe cumsum
+   normalization (see `mamba_utils.get_seq_idx`).
+
+2. **Cross-rank SSM state passing**: After each SSM scan, the final hidden state is sent to
+   the next rank via P2P communication (`ring_shift_ssm_state`), and an additive correction
+   is applied to account for the missing initial state (`mamba2_cp_correction`). This uses
+   the linearity property of SSMs to avoid a second forward pass.
+
+#### Supported Architectures
+
+| Model family | `model_config_type` | Architecture notes |
+|---|---|---|
+| Nemotron-H | `nemotron_h` | Mamba2 / Attention / MoE hybrid; block type selected per layer |
+| Falcon-H1 | `falcon_h1` | Mamba2 and Attention run **in parallel** in every layer |
+| Granite MoE Hybrid | `granitemoehybrid` | Mamba2 / Attention / MoE hybrid |
+
 ## Example
 
 ```yaml
diff --git a/examples/falcon-h1/falcon-h1-1b-qlora-cp.yaml b/examples/falcon-h1/falcon-h1-1b-qlora-cp.yaml
new file mode 100644
index 0000000000..606832d316
--- /dev/null
+++ b/examples/falcon-h1/falcon-h1-1b-qlora-cp.yaml
@@ -0,0 +1,75 @@
+base_model: tiiuae/Falcon-H1-1.5B-Deep-Base
+# optionally might have model_type or tokenizer_type
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+load_in_8bit: false
+load_in_4bit: true
+
+# huggingface repo
+chat_template: falcon_h1
+datasets:
+  - path: cgato/SlimOrcaDedupCleaned
+    type: chat_template
+    field_messages: conversations
+    message_property_mappings:
+      role: from
+      content: value
+
+val_set_size: 0.0
+output_dir: ./outputs/out
+
+adapter: qlora
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  - in_proj
+  - gate_proj
+  - up_proj
+  - down_proj
+
+sequence_len: 2048
+sample_packing: true
+eval_sample_packing: true
+
+context_parapllel_size: 2
+
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+
+gradient_accumulation_steps: 4
+micro_batch_size: 1
+num_epochs: 4
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: true
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_ratio: 0.1
+evals_per_epoch:
+saves_per_epoch: 1
+weight_decay: 0.0
+special_tokens:
+
+# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
diff --git a/examples/gemma4/26b-a4b-moe-qlora.yaml b/examples/gemma4/26b-a4b-moe-qlora.yaml
index cdc70ef4a8..c954a5bc2f 100644
--- a/examples/gemma4/26b-a4b-moe-qlora.yaml
+++ b/examples/gemma4/26b-a4b-moe-qlora.yaml
@@ -25,6 +25,12 @@ liger_glu_activation: true
 liger_rms_norm_gated: true
 strict: false
 
+# Multi-GPU (DDP) only: LoRA targets the text backbone, so the frozen vision/
+# audio encoders and Gemma4's KV-sharing layers leave some adapter params
+# without gradients. Required to avoid "parameters that were not used in
+# producing loss". No effect on single-GPU runs.
+ddp_find_unused_parameters: true
+
 chat_template: gemma4
 datasets:
   - path: mlabonne/FineTome-100k
diff --git a/examples/nemotron-h/nano-30b-a3b-qlora-cp.yaml b/examples/nemotron-h/nano-30b-a3b-qlora-cp.yaml
new file mode 100644
index 0000000000..cab402dccf
--- /dev/null
+++ b/examples/nemotron-h/nano-30b-a3b-qlora-cp.yaml
@@ -0,0 +1,85 @@
+# See examples/nemotron-h/README.md for architecture notes and requirements.
+base_model: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
+
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+  - axolotl.integrations.liger.LigerPlugin
+
+liger_layer_norm: true
+liger_rope: true
+liger_rms_norm: true
+liger_glu_activation: true
+liger_rms_norm_gated: true
+
+# LoRA kernel patches are incompatible with this architecture — see README.
+lora_mlp_kernel: false
+lora_qkv_kernel: false
+lora_o_kernel: false
+
+chat_template: tokenizer_default
+datasets:
+  - path: mlabonne/FineTome-100k
+    type: chat_template
+    split: train[:20%]
+    field_messages: conversations
+    message_property_mappings:
+      role: from
+      content: value
+
+val_set_size: 0.0
+output_dir: ./outputs/out
+dataset_prepared_path: last_run_prepared
+
+sequence_len: 4096
+sample_packing: true
+
+context_parallel_size: 2
+
+load_in_4bit: true
+quantize_moe_experts: true
+adapter: qlora
+lora_r: 16
+lora_alpha: 32
+lora_dropout: 0.0
+lora_target_modules:
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+
+# To also train MoE expert weights, add them via lora_target_parameters
+# (they are 3D nn.Parameter tensors, not nn.Linear — no gate_proj):
+# lora_target_parameters:
+#   - up_proj
+#   - down_proj
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 2
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_torch_4bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: true
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_ratio: 0.1
+evals_per_epoch: 4
+saves_per_epoch: 1
+weight_decay: 0.0
+
+special_tokens:
diff --git a/examples/qwen2_5-vl/mm-cpt-nonstreaming-qlora.yaml b/examples/qwen2_5-vl/mm-cpt-nonstreaming-qlora.yaml
new file mode 100644
index 0000000000..3db6dff825
--- /dev/null
+++ b/examples/qwen2_5-vl/mm-cpt-nonstreaming-qlora.yaml
@@ -0,0 +1,62 @@
+base_model: Qwen/Qwen2.5-VL-7B-Instruct
+processor_type: AutoProcessor
+trust_remote_code: true
+
+load_in_4bit: true
+
+# Raw JSONL row: {"text": "<|image_pad|>\nText target.", "images": ["image.png"]}
+datasets:
+  - path: /path/to/mm_cpt_train.jsonl
+    ds_type: json
+    type: multimodal_pretrain
+    split: train
+    text_column: text
+    image_column: images
+    image_base_dir: /path/to/images
+    # image_token: "<|image_pad|>"
+
+streaming: false
+dataset_prepared_path: last_run_prepared/qwen2_5_vl_mm_cpt
+val_set_size: 0.0
+output_dir: ./outputs/qwen2_5_vl_mm_cpt
+
+adapter: qlora
+lora_model_dir:
+
+sequence_len: 8192
+pad_to_sequence_len: false
+sample_packing: false
+remove_unused_columns: false
+
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: true
+fp16:
+tf32: true
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+logging_steps: 1
+attn_implementation: flash_attention_2
+
+warmup_ratio: 0.1
+evals_per_epoch: 1
+saves_per_epoch: 1
+weight_decay: 0.0
diff --git a/examples/qwen2_5-vl/mm-cpt-streaming-qlora.yaml b/examples/qwen2_5-vl/mm-cpt-streaming-qlora.yaml
new file mode 100644
index 0000000000..f7cf777b56
--- /dev/null
+++ b/examples/qwen2_5-vl/mm-cpt-streaming-qlora.yaml
@@ -0,0 +1,63 @@
+base_model: Qwen/Qwen2.5-VL-7B-Instruct
+processor_type: AutoProcessor
+trust_remote_code: true
+
+load_in_4bit: true
+
+# Cache prepared shards to resume without replaying image preprocessing.
+pretraining_dataset:
+  - path: /path/to/mm_cpt_shards/*.jsonl
+    ds_type: json
+    type: multimodal_pretrain
+    split: train
+    text_column: text
+    image_column: images
+    image_base_dir: /path/to/images
+    # image_token: "<|image_pad|>"
+
+streaming: true
+dataset_prepared_path: last_run_prepared/qwen2_5_vl_mm_cpt_stream_cache
+ignore_data_skip: true
+max_steps: 10000
+val_set_size: 0.0
+output_dir: ./outputs/qwen2_5_vl_mm_cpt_stream
+
+adapter: qlora
+lora_model_dir:
+
+sequence_len: 8192
+pad_to_sequence_len: false
+sample_packing: false
+remove_unused_columns: false
+
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 1
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: true
+fp16:
+tf32: true
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+logging_steps: 1
+attn_implementation: flash_attention_2
+
+warmup_ratio: 0.1
+save_steps: 1000
+eval_steps: 1000
+weight_decay: 0.0
diff --git a/examples/qwen3/8b-lora-fused-attn.yaml b/examples/qwen3/8b-lora-fused-attn.yaml
new file mode 100644
index 0000000000..70d8c36fa1
--- /dev/null
+++ b/examples/qwen3/8b-lora-fused-attn.yaml
@@ -0,0 +1,44 @@
+base_model: Qwen/Qwen3-8B
+
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
+datasets:
+  - path: tatsu-lab/alpaca
+    type: alpaca
+dataset_prepared_path:
+val_set_size: 0.05
+output_dir: ./outputs/qwen3-8b-lora-fused
+
+sequence_len: 4096
+sample_packing: true
+eval_sample_packing: true
+
+adapter: lora
+lora_r: 32
+lora_alpha: 64
+lora_dropout: 0.0
+lora_target_linear: true
+
+# Opt-in fused RMSNorm + RoPE Triton kernel for Qwen3 attention.
+# fp32-internal, single round: matches an fp32 reference within bf16 rounding
+# (more accurate than the eager bf16 path). Speeds up the q/k norm+rope path.
+fused_attn_kernel: true
+
+gradient_accumulation_steps: 4
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_torch_fused
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+flash_attention: true
+
+warmup_ratio: 0.1
+saves_per_epoch: 1
+weight_decay: 0.0
diff --git a/pyproject.toml b/pyproject.toml
index 65e8325811..01cac4b8b0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,10 +17,10 @@ dependencies = [
     "huggingface_hub>=1.1.7",
     "peft>=0.19.1,<0.20.0",
     "tokenizers>=0.22.1",
-    "transformers==5.5.4",
+    "transformers==5.9.0",
     "accelerate==1.13.0",
     "datasets>=4.8.4,<4.9.0",
-    "trl==1.1.0",
+    "trl==1.5.1",
     "hf_xet==1.4.3",
     "kernels==0.13.0",
     "trackio>=0.16.1",
@@ -40,6 +40,7 @@ dependencies = [
     "colorama",
     "numba>=0.61.2",
     "numpy>=2.2.6",
+    "typer<0.26.0",
 
     # Evaluation & metrics
     "evaluate==0.4.1",
@@ -120,6 +121,7 @@ optimizers = [
     "lomo-optim==0.1.1",
     "torch-optimi==0.2.1",
     "came_pytorch==0.1.3",
+    "q-galore-torch==1.0",
 ]
 ray = [
     "ray[train]>=2.52.1",
diff --git a/src/axolotl/core/builders/base.py b/src/axolotl/core/builders/base.py
index 019880abf7..16a3de9b6d 100644
--- a/src/axolotl/core/builders/base.py
+++ b/src/axolotl/core/builders/base.py
@@ -122,11 +122,8 @@ def get_callbacks(self) -> list[TrainerCallback]:
         if self.cfg.resume_from_checkpoint:
             callbacks.append(SkipEvalOnResumeCallback())
 
-        gc_collect_steps = getattr(self.cfg, "gc_collect_steps", None) or getattr(
-            self.cfg, "gc_steps", None
-        )
-        if gc_collect_steps:
-            callbacks.append(GCCallback(gc_collect_steps=gc_collect_steps))
+        if self.cfg.gc_steps:
+            callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps))
 
         if self.cfg.dynamic_checkpoint and self.cfg.dynamic_checkpoint.enabled:
             from axolotl.utils.callbacks.dynamic_checkpoint import (
@@ -184,6 +181,18 @@ def get_callbacks(self) -> list[TrainerCallback]:
         if telemetry_manager.enabled:
             callbacks.append(TelemetryCallback())
 
+            # Report the fused RMSNorm+RoPE autotune selection + GPU identity so
+            # per-hardware tuning can be aggregated (mirrors scattermoe-lora).
+            if self.cfg.fused_attn_kernel or self.cfg.model_config_type in (
+                "gemma4",
+                "gemma4_text",
+            ):
+                from axolotl.kernels.autotune_telemetry import (
+                    FusedRopeAutotuneReportCallback,
+                )
+
+                callbacks.append(FusedRopeAutotuneReportCallback())
+
         return callbacks
 
     def get_post_trainer_create_callbacks(self, trainer):
@@ -359,6 +368,32 @@ def _configure_custom_optimizer(
                 adam_kwargs["betas"] = (beta1, beta2, beta3)
                 adam_kwargs["eps"] = (eps1, eps2)
 
+                optimizer_kwargs.update(adam_kwargs)
+            elif self.cfg.optimizer == "q_galore_adamw8bit":
+                from axolotl.utils.optimizers.qgalore import (
+                    build_qgalore_param_groups,
+                    patch_q_galore_for_modern_bnb,
+                )
+
+                patch_q_galore_for_modern_bnb()
+                from q_galore_torch import QGaLoreAdamW8bit
+
+                optimizer_cls = QGaLoreAdamW8bit
+                optimizer_kwargs["params"] = build_qgalore_param_groups(
+                    self.model,
+                    self.cfg.optim_target_modules,
+                    rank=self.cfg.qgalore_rank,
+                    update_proj_gap=self.cfg.qgalore_update_proj_gap,
+                    scale=self.cfg.qgalore_scale,
+                    proj_type=self.cfg.qgalore_proj_type,
+                    proj_quant=self.cfg.qgalore_proj_quant,
+                    proj_bits=self.cfg.qgalore_proj_bits,
+                    proj_group_size=self.cfg.qgalore_proj_group_size,
+                    cos_threshold=self.cfg.qgalore_cos_threshold,
+                    gamma_proj=self.cfg.qgalore_gamma_proj,
+                    queue_size=self.cfg.qgalore_queue_size,
+                )
+
                 optimizer_kwargs.update(adam_kwargs)
             elif self.cfg.optimizer == "flash_adamw":
                 from flashoptim import FlashAdamW
diff --git a/src/axolotl/core/builders/causal.py b/src/axolotl/core/builders/causal.py
index 7472ac649d..47474e5f79 100644
--- a/src/axolotl/core/builders/causal.py
+++ b/src/axolotl/core/builders/causal.py
@@ -52,20 +52,38 @@
 
 
 def _is_multimodal_cpt(cfg) -> bool:
-    if not getattr(cfg, "pretraining_dataset", None):
+    return _get_mm_cpt_config(cfg) is not None
+
+
+def _entry_is_multimodal_cpt(entry) -> bool:
+    if entry is None:
         return False
-    ds_first = cfg.pretraining_dataset[0]
     ds_type = None
     mm_flag = None
-    if hasattr(ds_first, "type"):
-        ds_type = getattr(ds_first, "type", None)
-        mm_flag = getattr(ds_first, "multimodal", None)
-    elif isinstance(ds_first, dict):
-        ds_type = ds_first.get("type")
-        mm_flag = ds_first.get("multimodal")
+    if hasattr(entry, "type"):
+        ds_type = getattr(entry, "type", None)
+        mm_flag = getattr(entry, "multimodal", None)
+    elif isinstance(entry, dict):
+        ds_type = entry.get("type")
+        mm_flag = entry.get("multimodal")
     return (ds_type == "multimodal_pretrain") or bool(mm_flag)
 
 
+def _get_mm_cpt_config(cfg, is_eval: bool = False):
+    if is_eval and getattr(cfg, "test_datasets", None):
+        for entry in cfg.test_datasets:
+            if _entry_is_multimodal_cpt(entry):
+                return entry
+    for collection_name in ("pretraining_dataset", "datasets"):
+        collection = getattr(cfg, collection_name, None)
+        if not collection:
+            continue
+        for entry in collection:
+            if _entry_is_multimodal_cpt(entry):
+                return entry
+    return None
+
+
 def _mm_cpt_get(pt_cfg, key, default=None):
     if isinstance(pt_cfg, dict):
         return pt_cfg.get(key, default)
@@ -476,12 +494,7 @@ def _build_mm_pretrain_collator(self, pad_to_multiple_of=None, is_eval=False):
             build_image_token_spec,
         )
 
-        if is_eval and self.cfg.test_datasets:
-            pt_cfg = self.cfg.test_datasets[0]
-        elif self.cfg.pretraining_dataset:
-            pt_cfg = self.cfg.pretraining_dataset[0]
-        else:
-            pt_cfg = {}
+        pt_cfg = _get_mm_cpt_config(self.cfg, is_eval=is_eval) or {}
         spec = build_image_token_spec(
             self.processor, override=_mm_cpt_get(pt_cfg, "image_token")
         )
diff --git a/src/axolotl/core/builders/rl.py b/src/axolotl/core/builders/rl.py
index 98296a2016..25340809a3 100644
--- a/src/axolotl/core/builders/rl.py
+++ b/src/axolotl/core/builders/rl.py
@@ -274,7 +274,7 @@ def build(self, total_num_steps):
         if (
             self.cfg.adapter
             and self.peft_config
-            and self.cfg.rl not in (RLType.GRPO, RLType.ORPO, RLType.EBFT)
+            and self.cfg.rl not in (RLType.GRPO, RLType.ORPO, RLType.EBFT, RLType.SIMPO)
         ):
             trainer_kwargs["peft_config"] = self.peft_config
 
diff --git a/src/axolotl/core/trainers/base.py b/src/axolotl/core/trainers/base.py
index 2e4252123f..14baf6a88c 100644
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import contextlib
 import gc
 import json
 import math
@@ -460,6 +461,15 @@ def compute_loss(
             num_items_in_batch=num_items_in_batch,
         )
 
+    @override
+    def _prepare_context_parallel_inputs(self, model, inputs):
+        """Disable HF Trainer's CP splitting when Axolotl's ring_attn handles it."""
+        from axolotl.monkeypatch.models.mamba_utils import is_cp_active
+
+        if is_cp_active():
+            return contextlib.nullcontext, inputs
+        return super()._prepare_context_parallel_inputs(model, inputs)
+
     @override
     def evaluate(self, *args, **kwargs):
         LOG.info("Running evaluation step...")
diff --git a/src/axolotl/core/trainers/mixins/activation_checkpointing.py b/src/axolotl/core/trainers/mixins/activation_checkpointing.py
index dd892bebce..b61c45feed 100644
--- a/src/axolotl/core/trainers/mixins/activation_checkpointing.py
+++ b/src/axolotl/core/trainers/mixins/activation_checkpointing.py
@@ -22,25 +22,6 @@
 LOG = get_logger(__name__)
 
 
-# TODO(#3638): drop once TRL pin includes huggingface/trl#5730. Mirrors the
-# upstream __enter__ override — clears cross-step state on context re-entry
-# so saved tensors that never unpack during backward (MoE / torch.compile)
-# don't accumulate as leaked GPU references.
-def _axolotl_offload_enter(self):
-    self.tracker.clear()
-    self.storage_to_tensor_id.clear()
-    if self.use_streams:
-        self.fwd_stash.clear()
-        self.bwd_tensor_stash.clear()
-        self.bwd_ev_stash.clear()
-    self.is_first_forward_call = True
-    self.is_first_backward_call = True
-    return super(OffloadActivations, self).__enter__()
-
-
-OffloadActivations.__enter__ = _axolotl_offload_enter
-
-
 class ActivationOffloadingMixin(Trainer):
     """
     Trainer mixin class for activation checkpointing w offloading
diff --git a/src/axolotl/integrations/expert_parallel/experts_fn.py b/src/axolotl/integrations/expert_parallel/experts_fn.py
index 1cc81ef293..89473fd112 100644
--- a/src/axolotl/integrations/expert_parallel/experts_fn.py
+++ b/src/axolotl/integrations/expert_parallel/experts_fn.py
@@ -59,7 +59,7 @@ def _grouped_mm_local(experts, recv_x, recv_topk_idx, recv_topk_weights):
 
 
 def _scattermoe_local(experts, recv_x, recv_topk_idx, recv_topk_weights):
-    from axolotl.integrations.kernels.libs.scattermoe_lora.gemma4_experts import (
+    from axolotl.integrations.kernels.libs.scattermoe_lora.experts import (
         scattermoe_experts_forward,
     )
 
@@ -68,13 +68,6 @@ def _scattermoe_local(experts, recv_x, recv_topk_idx, recv_topk_weights):
 
 def _sonicmoe_local(experts, recv_x, recv_topk_idx, recv_topk_weights):
     raise NotImplementedError("Sonicmoe + EP is not yet properly implemented.")
-    # from axolotl.integrations.kernels.libs.sonicmoe.gemma4_experts import (
-    #     gemma4_sonicmoe_experts_forward,
-    # )
-
-    # return gemma4_sonicmoe_experts_forward(
-    #     experts, recv_x, recv_topk_idx, recv_topk_weights
-    # )
 
 
 _LOCAL_KERNELS = {
diff --git a/src/axolotl/integrations/kd/__init__.py b/src/axolotl/integrations/kd/__init__.py
index b1a9905533..25a8537253 100644
--- a/src/axolotl/integrations/kd/__init__.py
+++ b/src/axolotl/integrations/kd/__init__.py
@@ -81,11 +81,6 @@ def get_collator_cls_and_kwargs(self, cfg, is_eval=False):
             return KDBatchSamplerDataCollatorForSeq2Seq, {}
         return DataCollatorForKD, {}
 
-    def pre_model_load(self, cfg):
-        from .kernels.models import apply_kernel
-
-        apply_kernel(cfg.model_config_type)
-
     def add_callbacks_post_trainer(self, cfg: Any, trainer: Trainer) -> list:
         """
         Adds temp scheduler callback to the Trainer instance.
diff --git a/src/axolotl/integrations/kd/kernels/__init__.py b/src/axolotl/integrations/kd/kernels/__init__.py
index 3f1144a45c..0a967144a0 100644
--- a/src/axolotl/integrations/kd/kernels/__init__.py
+++ b/src/axolotl/integrations/kd/kernels/__init__.py
@@ -3,6 +3,5 @@
 """
 
 from .liger import LigerFusedLinearKLTopKLogprobLoss
-from .models import apply_kernel
 
-__all__ = ["LigerFusedLinearKLTopKLogprobLoss", "apply_kernel"]
+__all__ = ["LigerFusedLinearKLTopKLogprobLoss"]
diff --git a/src/axolotl/integrations/kd/kernels/liger.py b/src/axolotl/integrations/kd/kernels/liger.py
index 61ef3e10a0..a150420855 100644
--- a/src/axolotl/integrations/kd/kernels/liger.py
+++ b/src/axolotl/integrations/kd/kernels/liger.py
@@ -225,7 +225,7 @@ def loss_fn_for_grad(
             _target_mask_chunk,
             _true_labels_chunk,
         ):
-            return cls._compute_loss_kl_topk(
+            soft_loss, ce_loss = cls._compute_loss_kl_topk(
                 student_input_chunk=_student_input_chunk,
                 student_weight=_student_lm_head_weight,
                 target_token_ids_chunk=_target_token_ids_chunk,
@@ -242,6 +242,11 @@ def loss_fn_for_grad(
                 beta=beta,
                 normalize_topk=normalize_topk,
             )
+            # has_aux=True: first return is the differentiated scalar, rest is aux.
+            # Combine here so both terms contribute to backward; aux carries the
+            # unweighted soft/ce values for reporting.
+            combined = weight_soft_loss * soft_loss + weight_hard_loss * ce_loss
+            return combined, (soft_loss, ce_loss)
 
         def accumulate_chunk_grads(
             student_input_chunk_ac,
@@ -254,7 +259,9 @@ def accumulate_chunk_grads(
             if student_lm_head_bias is not None:
                 (
                     (chunk_grad_input, chunk_grad_weight, chunk_grad_bias),
-                    (chunk_kd_loss, chunk_ce_loss),
+                    # grad_and_value(has_aux=True) returns (grads, (value, aux));
+                    # value = combined scalar, aux = (soft_loss, ce_loss) from loss_fn_for_grad.
+                    (_, (chunk_kd_loss, chunk_ce_loss)),
                 ) = torch.func.grad_and_value(
                     loss_fn_for_grad, argnums=(0, 1, 2), has_aux=True
                 )(
@@ -271,7 +278,9 @@ def accumulate_chunk_grads(
                 argnums_for_grad = (0, 1)  # Differentiate wrt input_chunk, weight
                 (
                     (chunk_grad_input, chunk_grad_weight),  # No grad for bias
-                    (chunk_kd_loss, chunk_ce_loss),
+                    # grad_and_value(has_aux=True) returns (grads, (value, aux));
+                    # value = combined scalar, aux = (soft_loss, ce_loss) from loss_fn_for_grad.
+                    (_, (chunk_kd_loss, chunk_ce_loss)),
                 ) = torch.func.grad_and_value(
                     loss_fn_for_grad, argnums=argnums_for_grad, has_aux=True
                 )(
diff --git a/src/axolotl/integrations/kd/kernels/models.py b/src/axolotl/integrations/kd/kernels/models.py
deleted file mode 100644
index badb3460de..0000000000
--- a/src/axolotl/integrations/kd/kernels/models.py
+++ /dev/null
@@ -1,104 +0,0 @@
-"""
-model patcher for chunked top-k kl-div
-"""
-
-from typing import Optional, Union, Unpack
-
-import torch
-from transformers import Cache
-from transformers.modeling_outputs import CausalLMOutputWithPast
-
-try:
-    from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
-    from transformers.utils import LossKwargs
-
-    class TransformersKwargs(FlashAttentionKwargs, LossKwargs):
-        """
-        placeholder kwargs for hf model classes
-        """
-
-except ImportError:
-    from transformers.utils.generic import (  # type: ignore[no-redef]
-        TransformersKwargs,
-    )
-
-from axolotl.utils.callbacks.models import get_causal_lm_model_cls_prefix
-
-
-def kldiv_forward_llama_like(
-    self,
-    input_ids: Optional[torch.LongTensor] = None,
-    target_logprobs: Optional[torch.Tensor] = None,
-    target_token_ids: Optional[torch.LongTensor] = None,
-    target_mask: Optional[torch.Tensor] = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[Cache] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    labels: Optional[torch.LongTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    cache_position: Optional[torch.LongTensor] = None,
-    logits_to_keep: Union[int, torch.Tensor] = 0,
-    **kwargs: Unpack[TransformersKwargs],  # type: ignore[misc]
-) -> CausalLMOutputWithPast:
-    output_attentions = (
-        output_attentions
-        if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_hidden_states = (
-        output_hidden_states
-        if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-
-    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-    outputs = self.model(
-        input_ids=input_ids,
-        attention_mask=attention_mask,
-        position_ids=position_ids,
-        past_key_values=past_key_values,
-        inputs_embeds=inputs_embeds,
-        use_cache=use_cache,
-        output_attentions=output_attentions,
-        output_hidden_states=output_hidden_states,
-        cache_position=cache_position,
-        **kwargs,
-    )
-
-    hidden_states = outputs.last_hidden_state
-
-    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-    # TODO, we can optimize this further by filtering hidden_states on sequence dimension using labels != -100
-    # self._loss_function should be LigerFusedLinearKLTopKLogprobLoss
-
-    loss = self._loss_function(
-        self.lm_head.weight,
-        hidden_states,
-        target_token_ids,
-        target_logprobs,
-        target_mask,
-        true_labels=labels,
-    )
-    num_items_in_batch = kwargs.pop("num_items_in_batch", -1)
-    if num_items_in_batch is not None and num_items_in_batch > 0:
-        loss = loss / num_items_in_batch
-
-    return CausalLMOutputWithPast(
-        loss=loss,
-        logits=None,
-        past_key_values=outputs.past_key_values,
-        hidden_states=outputs.hidden_states,
-        attentions=outputs.attentions,
-    )
-
-
-def apply_kernel(model_type):
-    # Dynamically import the module and attention class
-    module_path = f"transformers.models.{model_type}.modeling_{model_type}"
-    model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type)
-    module = __import__(module_path, fromlist=[f"{model_cls_prefix}ForCausalLM"])
-    model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM")
-    model_cls.forward = kldiv_forward_llama_like
diff --git a/src/axolotl/integrations/kd/trainer.py b/src/axolotl/integrations/kd/trainer.py
index 343d4c6dfc..e5d1504eb1 100644
--- a/src/axolotl/integrations/kd/trainer.py
+++ b/src/axolotl/integrations/kd/trainer.py
@@ -16,6 +16,7 @@
 KD trainer
 """
 
+import torch.nn as nn
 from typing_extensions import override
 
 from axolotl.core.trainers.base import AxolotlTrainer
@@ -23,6 +24,17 @@
 from .kernels.liger import LigerFusedLinearKLTopKLogprobLoss
 
 
+def _resolve_lm_head(model: nn.Module) -> nn.Module:
+    base = model
+    if hasattr(base, "get_base_model"):
+        base = base.get_base_model()
+    if hasattr(base, "language_model") and hasattr(base.language_model, "lm_head"):
+        return base.language_model.lm_head
+    if hasattr(base, "lm_head"):
+        return base.lm_head
+    raise AttributeError(f"could not find lm_head on {type(model).__name__}")
+
+
 class AxolotlKDTrainer(AxolotlTrainer):
     """
     Custom trainer subclass for Knowledge Distillation (KD)
@@ -32,7 +44,7 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.model_accepts_loss_kwargs = True
 
-        loss_fn = LigerFusedLinearKLTopKLogprobLoss(
+        self._kd_loss_fn = LigerFusedLinearKLTopKLogprobLoss(
             self.args.kd_ce_alpha,  # hard label loss
             self.args.kd_alpha,  # kd loss
             self.args.kd_temperature,
@@ -40,14 +52,6 @@ def __init__(self, *args, **kwargs):
             compute_ce_loss=bool(self.args.kd_ce_alpha),
             normalize_topk=self.args.kd_normalize_topk,
         )
-        target = self.model
-
-        # Unwrap PEFT wrapper
-        if hasattr(target, "get_base_model"):
-            target = target.get_base_model()
-
-        # Set on the actual model instance
-        target._loss_function = loss_fn
 
     def _set_signature_columns_if_needed(self):
         super()._set_signature_columns_if_needed()
@@ -70,34 +74,48 @@ def compute_loss(
         return_outputs=False,
         num_items_in_batch=None,
     ):
-        """
-        How the loss is computed by Trainer. By default, all models return the loss in the first element.
-
-        Subclass and override for custom behavior.
-        """
-        if (
-            self.args.sample_packing
-            and hasattr(inputs, "attention_mask")
-            and hasattr(inputs, "position_ids")
-        ):
-            del inputs["attention_mask"]
+        inputs = dict(inputs)
+
+        required_keys = ("labels", "target_token_ids", "target_logprobs", "target_mask")
+        missing = [k for k in required_keys if k not in inputs]
+        if missing:
+            raise KeyError(f"KD batch missing required keys: {missing}")
 
         if num_items_in_batch is None and "labels" in inputs:
             num_items_in_batch = (inputs["labels"] != -100).sum().item()
 
-        if self.model_accepts_loss_kwargs:
-            loss_kwargs = {}
-            if num_items_in_batch is not None:
-                loss_kwargs["num_items_in_batch"] = num_items_in_batch
-            inputs = {**inputs, **loss_kwargs}
+        labels = inputs.pop("labels")
+        target_token_ids = inputs.pop("target_token_ids")
+        target_logprobs = inputs.pop("target_logprobs")
+        target_mask = inputs.pop("target_mask")
+
+        # num_items_in_batch is a loss kwarg, not a forward kwarg.
+        inputs.pop("num_items_in_batch", None)
 
+        inputs["output_hidden_states"] = True
+        inputs["return_dict"] = True
+        inputs["logits_to_keep"] = 1
         outputs = model(**inputs)
+        hidden_states = getattr(outputs, "hidden_states", None)
+        if hidden_states is None:
+            raise RuntimeError(
+                f"{type(model).__name__}.forward did not return hidden_states"
+            )
+        hidden_states = hidden_states[-1]
+
+        lm_head = _resolve_lm_head(model)
+        hidden_states = hidden_states.to(lm_head.weight.dtype)
+
+        loss = self._kd_loss_fn(
+            lm_head.weight,
+            hidden_states,
+            target_token_ids,
+            target_logprobs,
+            target_mask,
+            true_labels=labels,
+        )
 
-        if isinstance(outputs, dict):
-            loss = outputs["loss"]
-        elif isinstance(outputs, tuple):
-            loss = outputs[0]
-        else:
-            loss = outputs.loss if hasattr(outputs, "loss") else outputs
+        if num_items_in_batch is not None and num_items_in_batch > 0:
+            loss = loss / num_items_in_batch
 
         return (loss, outputs) if return_outputs else loss
diff --git a/src/axolotl/integrations/kernels/README.md b/src/axolotl/integrations/kernels/README.md
index 32d236da49..aae2fee97f 100644
--- a/src/axolotl/integrations/kernels/README.md
+++ b/src/axolotl/integrations/kernels/README.md
@@ -1,16 +1,17 @@
 # Kernels Integration
 
-MoE (Mixture of Experts) kernels speed up training for MoE layers and reduce VRAM costs. In transformers v5, `batched_mm` and `grouped_mm` were integrated as built-in options via the `experts_implementation` config kwarg:
+MoE (Mixture of Experts) kernels speed up training for MoE layers and reduce VRAM costs. Transformers v5 introduced a uniform dispatch point for the per-expert grouped GEMMs via the `experts_implementation` config kwarg:
 
 ```python
 class ExpertsInterface(GeneralInterface):
     _global_mapping = {
         "batched_mm": batched_mm_experts_forward,
         "grouped_mm": grouped_mm_experts_forward,
+        "sonicmoe":   sonicmoe_experts_forward,   # upstream HF integration
     }
 ```
 
-In our custom integration, we add support for **ScatterMoE** and **SonicMoE**, which are more efficient and faster than `grouped_mm`.
+Axolotl registers two additional implementations into this same global registry: **ScatterMoE** (Triton, runs on any CUDA GPU) and a LoRA-aware **SonicMoE** variant (CUTLASS / cute-DSL, Hopper or newer). Routing — softmax/sigmoid top-k, group selection, shared experts, bias correction, etc. — stays in each model's `SparseMoEBlock`, where transformers handles all per-architecture variation. Axolotl only swaps the experts forward.
 
 ## Usage
 
@@ -28,130 +29,75 @@ use_scattermoe: true
 use_sonicmoe: true
 ```
 
-**Important:** Setting `experts_implementation` to `batched_mm` or `grouped_mm` is incompatible with custom kernel options. The exception is `experts_implementation: scattermoe`, which is used for models like Gemma 4 that embed MoE directly in the decoder layer (no SparseMoeBlock) and dispatch through the transformers `ExpertsInterface`.
+`experts_implementation` is auto-set to `scattermoe` / `sonicmoe` from the kernel flag, but you can override to `eager` / `batched_mm` / `grouped_mm` to compare against the transformers reference implementations.
 
 ### SonicMoE installation
 
 **Prerequisites:**
-- NVIDIA Hopper (H100, H200) or Blackwell (B200, GB200) GPU
+- NVIDIA Hopper (H100/H200) or Blackwell (B200/GB200/B300) GPU
 - CUDA 12.9+ (13.0+ for B300)
-- PyTorch 2.7+ (2.9.1 recommended)
-- For B300: Triton 3.6.0
+- PyTorch 2.7+
+- For B300: Triton 3.6.x
+
+The sonic-moe kernel ships through the HF [`kernels`](https://github.com/huggingface/kernels) package. Transformers v5.8+ auto-fetches a prebuilt kernel from [`kernels-community/sonic-moe`](https://huggingface.co/kernels-community/sonic-moe) on first use:
 
 ```bash
-pip install --ignore-requires-python --no-deps "sonic-moe @ git+https://github.com/Dao-AILab/sonic-moe.git@116e2df0a41874f77fa0ad269ce7df3f0cfcb956" && pip install nvidia-cutlass-dsl==4.4.0 quack-kernels==0.2.5
+pip install kernels "nvidia-cutlass-dsl==4.4.2"
 ```
 
-See the [SonicMoE installation guide](https://github.com/Dao-AILab/sonic-moe?tab=readme-ov-file#-installation) for the latest prerequisite details.
-
-**Note:** Blackwell support is in upstream beta. On Blackwell GPUs, Axolotl automatically sets `USE_QUACK_GEMM=1` to enable the Blackwell kernels.
+**Note:** Blackwell support is in upstream beta. On Blackwell GPUs Axolotl automatically sets `USE_QUACK_GEMM=1` to enable the Blackwell kernels.
 
 ## How It Works
 
-The `KernelsPlugin` runs before model loading and:
-
-### ScatterMoE
-1. Registers the ScatterMoE kernel from the local `libs/scattermoe_lora` package (includes fused LoRA support via Triton kernels).
-2. Patches the model's `SparseMoeBlock` forward method with the optimized ScatterMoE implementation via the HF `kernels` library.
-
-### SonicMoE
-1. Resolves the model's MoE block class(es) from `constants.py`.
-2. Patches the forward method with SonicMoE's optimized CUTLASS kernels and registers a weight converter for the interleaved gate/up projection format.
-3. Supports pluggable routing strategies (see routing table below).
-
-Both paths use the shared `resolve_moe_block_classes` utility in `constants.py` for model-type-to-class resolution.
-
-## Model Support Matrix
-
-Most models use the **SwiGLU** activation (`silu(gate) * up`). Gemma 4 uses **GEGLU** (`gelu(gate) * up`). ScatterMoE supports any gated activation (activation is applied in Python between kernel calls). SonicMoE supports SwiGLU, GEGLU, and REGLU via its `ActivationType` enum.
-
-### Routing strategies
-
-| Routing Strategy | Description | ScatterMoE | SonicMoE |
-|---|---|:---:|:---:|
-| softmax → topk | Softmax over experts, select top-K, optional renormalization | Yes | Yes |
-| softmax → group selection → topk | Softmax, select top groups (sum of top-2 per group), topk from selected groups, renorm + scaling | No | Yes |
-| sigmoid → topk (with groups) | Sigmoid + bias correction, group-based masking, topk from masked scores, weights from original sigmoid | Yes | Yes |
-| sigmoid → topk (no groups) | Sigmoid + bias correction, straight topk (n_group=1) | Yes | Yes |
-| softmax → bias correction → topk | Softmax, bias via `gate.moe_statics`, topk, gather from original probs, clamp-based renorm | No | Yes |
-| softmax → group_limited_greedy | Softmax, group selection (max per group), topk, scale only (no renorm) | No | Yes |
-| softmax → topk via gate.wg | Softmax, gate weight at `gate.wg.weight` (not `gate.weight`), always renormalize | No | Yes |
-| softmax → topk + per_expert_scale | RMSNorm → scale → proj → softmax → topk → renorm → per-expert learned scales | Yes | Yes |
-| fused topk → softmax | Routing + expert computation fused in a single kernel | No | Planned |
-
-### Per-model support
-
-| Model Type | Architecture | Routing | ScatterMoE | SonicMoE |
-|---|---|---|:---:|:---:|
-| `qwen2_moe` | Qwen2-MoE | softmax → topk | **Yes** | **Yes** |
-| `qwen3_moe` | Qwen3-MoE | softmax → topk | **Yes** | **Yes** |
-| `qwen3_5_moe` | Qwen3.5-MoE | softmax → topk | **Yes** | **Yes** |
-| `qwen3_5_moe_text` | Qwen3.5-MoE (VLM text) | softmax → topk | **Yes** | **Yes** |
-| `qwen3_next` | Qwen3-Next | softmax → topk | **Yes** | **Yes** |
-| `qwen3_vl_moe` | Qwen3-VL-MoE | softmax → topk | **Yes** | **Yes** |
-| `qwen3_omni_moe` | Qwen3-Omni (Thinker + Talker) | softmax → topk | **Yes** | **Yes** |
-| `olmoe` | OLMoE | softmax → topk | **Yes** | **Yes** |
-| `mixtral` | Mixtral | softmax → topk | **Yes** | **Yes** |
-| `minimax` | MiniMax | softmax → topk | **Yes** | **Yes** |
-| `mistral4` | Mistral 4 | softmax → group → topk | No | **Yes** |
-| `glm_moe_dsa` | GLM-MoE DSA (GLM 5) | sigmoid → topk (groups) | **Yes** | **Yes** |
-| `deepseek_v3` | DeepSeek-V3 | sigmoid → topk (groups) | **Yes** | **Yes** |
-| `glm4_moe` | GLM4-MoE | sigmoid → topk (groups) | **Yes** | **Yes** |
-| `glm4_moe_lite` | GLM4-MoE Lite (GLM 4.7 Flash) | sigmoid → topk (groups) | **Yes**\* | **Yes** |
-| `glm4v_moe` | GLM4v-MoE | sigmoid → topk (groups) | **Yes** | **Yes** |
-| `minimax_m2` | MiniMax M2 | sigmoid → topk (no groups) | **Yes** | **Yes** |
-| `ernie4_5_moe` | ERNIE 4.5 MoE | softmax → bias → topk | No | **Yes** |
-| `deepseek_v2` | DeepSeek-V2 | softmax → group_limited_greedy | No | **Yes** |
-| `hunyuan_v1_moe` | HunYuan V1 MoE | softmax → topk (gate.wg) | No | **Yes** |
-| `gemma4_text` | Gemma 4 (26B-A4B) | softmax → topk + per_expert_scale | **Yes**\*\* | **Yes**\*\* |
-| `gpt_oss` | GPT-OSS | fused topk → softmax | No | Planned |
-
-\* `glm4_moe_lite` with ScatterMoE may have issues — see Limitations.
-
-\*\* Gemma 4 uses `experts_implementation: scattermoe` path (registered via `ExpertsInterface`) instead of SparseMoeBlock patching, since Gemma 4 embeds MoE directly in its decoder layer (no separate SparseMoeBlock). See the [Gemma 4 section](#gemma-4) below.
-
-### Feature comparison
-
-| Feature | ScatterMoE | SonicMoE |
-|---|:---:|:---:|
-| Kernel backend | Triton | CUTLASS |
-| GPU requirement | Any CUDA | Hopper (H100/H200) or Blackwell (B200+) |
-| LoRA approach | Fused in Triton kernel | Runtime materialization + custom autograd |
-| LoRA overhead | Lower (fused computation) | Higher (per-forward materialization) |
-| Gate/router LoRA | Yes | Yes |
-| Expert LoRA | Yes (fused) | Yes (materialized) |
-| Shared expert LoRA | Yes (standard PEFT) | Yes (standard PEFT) |
-| Selective expert dequantization | Yes (~97% memory savings) | No |
-| Weight format | Transposed `[E, hidden, 2*inter]` | Interleaved gate/up `[2*I, H, E]` |
-| torch.compile routing | No | Yes (optional) |
-
-## Shared Expert Handling
-
-Both kernels handle shared experts identically. Shared expert attribute names are detected in order of priority:
-
-1. `shared_expert` (Qwen2-MoE)
-2. `shared_experts` (GLM-MoE, DeepSeek-V3)
-3. `shared_mlp` (HunYuan V1 MoE)
-
-If `shared_expert_gate` exists, sigmoid gating is applied to the shared expert contribution before adding it to the routed output. PEFT wraps shared expert linear layers with standard LoRA — no special handling is needed.
-
-## Gemma 4
-
-Gemma 4 (e.g. `google/gemma-4-26B-A4B`) has a unique hybrid MoE architecture:
-
-- **No SparseMoeBlock**: MoE is embedded directly in the decoder layer alongside a dense MLP. Both run in parallel and their outputs are summed.
-- **Custom router** (`Gemma4TextRouter`): RMSNorm → learned scale → linear projection → softmax → top-k → renormalization → per-expert learned scales.
-- **GEGLU activation**: Uses `gelu_pytorch_tanh` (not SiLU/SwiGLU like most other MoE models).
-- **128 experts, top-k=8** for the 26B-A4B variant.
-
-Because there is no SparseMoeBlock class to patch, Gemma 4 uses a different integration path: we register `"scattermoe"` as a custom implementation in the transformers `ExpertsInterface`, and set `experts_implementation: scattermoe` in the config. The `@use_experts_implementation` decorator on `Gemma4TextExperts` then dispatches to our ScatterMoE kernel automatically. The router is untouched — it runs as-is.
-
-## Limitations
-
-- **ScatterMoE + GLM4-MoE Lite**: ScatterMoE does not work reliably for GLM 4.7 Flash (`glm4_moe_lite`).
-- **Non-SwiGLU activations**: Neither kernel supports MoE architectures with non-SwiGLU expert activations (e.g., GPT-OSS uses a custom GLU variant).
-- **GPT-OSS**: Deferred — requires transposed weight layout `[E, H, 2*I]`, expert biases, and custom GLU activation. A dedicated forward path is needed.
-- **FSDP + fused gate LoRA (SonicMoE)**: The fused topk→softmax path materializes a local tensor when LoRA delta is present to avoid DTensor + Tensor mixing under FSDP.
+The `KernelsPlugin` runs once before model loading and:
+
+1. Calls `register_scattermoe_experts()` or `register_sonicmoe_experts()`, which inserts the kernel forward into `transformers.integrations.moe.ALL_EXPERTS_FUNCTIONS`.
+2. Sets `cfg.experts_implementation` to the matching name.
+3. When the model loads, transformers' `@use_experts_implementation` decorator on each model's `Experts` class reads `config._experts_implementation` and dispatches to our registered forward.
+
+That's the entire integration — there is no per-architecture SparseMoEBlock monkey-patch, no per-model routing code, and no weight-layout conversion. As new MoE models adopt the decorator upstream they immediately benefit from both kernels.
+
+## LoRA Support
+
+Both kernels train PEFT adapters on `gate_up_proj` / `down_proj` (and `gate` for the router) end-to-end:
+
+- **ScatterMoE** fuses the LoRA `B @ A` product into the per-expert grouped GEMM via custom Triton kernels (`parallel_linear_lora`). No extra materialization pass.
+- **SonicMoE** materializes `W_eff = W + scaling * (B @ A)` per expert inside a custom `MoELoRAMaterialize` `autograd.Function` and passes the effective weight into the CUTLASS kernel. Backward decomposes `dW_eff` into `dA` and `dB` via the chain rule, so LoRA parameters train without modifying the kernel.
+
+Both paths detect PEFT `ParamWrapper` on individual expert parameters (`target_parameters` API) and unwrap them before dispatch.
+
+## Model Support
+
+Any model whose `Experts` class is decorated with `@use_experts_implementation` upstream works automatically. As of transformers 5.8 this includes (verified):
+
+| Model Type        | ScatterMoE | SonicMoE |
+|-------------------|:---------:|:--------:|
+| `mixtral`         |    Yes    |   Yes    |
+| `qwen2_moe`       |    Yes    |   Yes    |
+| `qwen3_moe`       |    Yes    |   Yes    |
+| `qwen3_5_moe`     |    Yes    |   Yes    |
+| `olmoe`           |    Yes    |   Yes    |
+| `mistral4`        |    Yes    |   Yes    |
+| `glm_moe_dsa`     |    Yes    |   Yes    |
+| `deepseek_v3`     |    Yes    |   Yes    |
+| `minimax_m2`      |    Yes    |   Yes    |
+| `ernie4_5_moe`    |    Yes    |   Yes    |
+| `hunyuan_v1_moe`  |    Yes    |   Yes    |
+| `gemma4_text`     |    Yes    |   Yes    |
+| `gpt_oss`         |    No     |   Yes    |
+
+`gpt_oss` carries the decorator with `is_concatenated=False, is_transposed=True, has_bias=True` and uses a sigmoid-GLU activation with clamping. The SonicMoE forward reads these flags off `self` and dispatches accordingly. The ScatterMoE forward assumes the standard `[E, 2*I, H]` concat layout and SiLU-GLU without bias, so it does not yet support `gpt_oss`.
+
+## Feature comparison
+
+| Feature                          | ScatterMoE | SonicMoE |
+|----------------------------------|:----------:|:--------:|
+| Kernel backend                   | Triton     | CUTLASS / cute-DSL |
+| GPU requirement                  | Any CUDA   | Hopper+ |
+| LoRA path                        | Fused in Triton kernel | `MoELoRAMaterialize` + custom autograd |
+| LoRA overhead                    | Lower (fused) | Higher (materialization pass) |
+| Selective expert dequantization  | Yes (~97% memory savings) | No |
+| Weight format                    | Standard `[E, 2*I, H]` | Standard `[E, 2*I, H]` (concat layout, no interleave) |
 
 ## Note on MegaBlocks
 
diff --git a/src/axolotl/integrations/kernels/args.py b/src/axolotl/integrations/kernels/args.py
index f532fde417..2cf6d2f3ba 100644
--- a/src/axolotl/integrations/kernels/args.py
+++ b/src/axolotl/integrations/kernels/args.py
@@ -5,6 +5,26 @@
 LOG = get_logger(__name__)
 
 
+# Valid experts_implementation values:
+# - "eager"      : transformers' per-token loop reference implementation
+# - "batched_mm" : transformers' built-in batched matmul path
+# - "grouped_mm" : transformers' built-in grouped matmul path (cache-efficient)
+# - "scattermoe" : axolotl-registered Triton kernels with LoRA support
+# - "sonicmoe"   : axolotl-registered CUTLASS / cute-DSL kernels with LoRA support
+# - "deep_ep[_*]": EP-plugin composites; passed through when expert_parallel_size > 1
+_BUILTIN_EXPERTS_IMPLS = {"eager", "batched_mm", "grouped_mm"}
+_KERNEL_EXPERTS_IMPLS = {"scattermoe", "sonicmoe"}
+_EP_EXPERTS_IMPLS = {
+    "deep_ep",
+    "deep_ep_grouped_mm",
+    "deep_ep_scattermoe",
+    "deep_ep_sonicmoe",
+}
+_VALID_EXPERTS_IMPLS = (
+    _BUILTIN_EXPERTS_IMPLS | _KERNEL_EXPERTS_IMPLS | _EP_EXPERTS_IMPLS
+)
+
+
 class KernelsArgs(BaseModel):
     use_scattermoe: bool | None = None
     use_sonicmoe: bool | None = None
@@ -30,24 +50,53 @@ def check_use_kernels(cls, data):
 
         return data
 
+    @model_validator(mode="before")
+    @classmethod
+    def check_sonicmoe_ep_unsupported(cls, data):
+        """SonicMoE + EP is not yet implemented (EP `_sonicmoe_local` raises)."""
+        if data.get("use_sonicmoe") and (data.get("expert_parallel_size") or 1) > 1:
+            raise ValueError(
+                "use_sonicmoe=true is not supported with expert_parallel_size > 1. "
+                "Use use_scattermoe=true under EP, or set expert_parallel_size=1."
+            )
+        return data
+
     @model_validator(mode="before")
     @classmethod
     def check_experts_implementation(cls, data):
+        """Auto-select impl from kernel flags; reject mismatched/unknown values."""
         experts_implementation = data.get("experts_implementation")
-        use_scattermoe = data.get("use_scattermoe", False)
+        use_scattermoe = bool(data.get("use_scattermoe"))
+        use_sonicmoe = bool(data.get("use_sonicmoe"))
+
         if experts_implementation is None:
-            # transformers may default to batched_mm when unset
-            data["experts_implementation"] = "eager"
-        elif experts_implementation == "scattermoe" and not use_scattermoe:
+            if use_scattermoe:
+                data["experts_implementation"] = "scattermoe"
+            elif use_sonicmoe:
+                data["experts_implementation"] = "sonicmoe"
+            else:
+                # Transformers defaults to a non-eager backend when unset; pin to
+                # eager unless the user explicitly opts in.
+                data["experts_implementation"] = "eager"
+            return data
+
+        if experts_implementation == "scattermoe" and not use_scattermoe:
             LOG.warning(
                 "`experts_implementation='scattermoe'` requires `use_scattermoe: true`. "
                 "Automatically setting to 'eager'."
             )
             data["experts_implementation"] = "eager"
-        elif experts_implementation not in ("eager", "scattermoe"):
+        elif experts_implementation == "sonicmoe" and not use_sonicmoe:
+            LOG.warning(
+                "`experts_implementation='sonicmoe'` requires `use_sonicmoe: true`. "
+                "Automatically setting to 'eager'."
+            )
+            data["experts_implementation"] = "eager"
+        elif experts_implementation not in _VALID_EXPERTS_IMPLS:
             LOG.warning(
-                f"`experts_implementation={experts_implementation!r}` is not compatible with "
-                f"custom MoE kernels. Automatically setting to 'eager'."
+                f"`experts_implementation={experts_implementation!r}` is not recognized. "
+                f"Valid options: {sorted(_VALID_EXPERTS_IMPLS)}. "
+                f"Automatically setting to 'eager'."
             )
             data["experts_implementation"] = "eager"
 
diff --git a/src/axolotl/integrations/kernels/autotune_collector.py b/src/axolotl/integrations/kernels/autotune_collector.py
index bdb5e030e4..92e0dbe2d5 100644
--- a/src/axolotl/integrations/kernels/autotune_collector.py
+++ b/src/axolotl/integrations/kernels/autotune_collector.py
@@ -21,15 +21,17 @@
     ("group_bwd_lora_fused", "_group_bwd_lora_fused"),
 ]
 
-# The autotune key declared on every kernel: key=["M", "N", "K"]
-_KEY_NAMES: list[str] = ["M", "N", "K"]
+# The autotune key declared on every kernel: key=["M_BUCKET", "N", "K"].
+# M_BUCKET is the seqlen-bucketed M (see _bucket_m in lora_ops.py) so cache
+# entries don't churn with every distinct M.
+_KEY_NAMES: list[str] = ["M_BUCKET", "N", "K"]
 
 
 def _parse_key_tuple(key_tuple: tuple) -> dict[str, Any]:
     """Turn the autotune cache key tuple into a labelled dict.
 
     Triton builds the cache key from the values of the declared ``key``
-    args (``M``, ``N``, ``K``) followed by dtype signature elements.
+    args (``M_BUCKET``, ``N``, ``K``) followed by dtype signature elements.
     We label the first three and store the rest under ``_extra``.
     """
     result: dict[str, Any] = {}
diff --git a/src/axolotl/integrations/kernels/constants.py b/src/axolotl/integrations/kernels/constants.py
index 5239c98778..7373fa5ef2 100644
--- a/src/axolotl/integrations/kernels/constants.py
+++ b/src/axolotl/integrations/kernels/constants.py
@@ -1,76 +1,16 @@
-"""
-Supported MoE block mappings for kernel integrations.
-
-Maps model_type to the SparseMoeBlock class name(s) in transformers.
-Used by both ScatterMoE and SonicMoE kernel paths.
-
-Values can be a single class name (str) or a list of class names for models
-with multiple MoE block types (e.g. qwen3_omni_moe has Thinker + Talker).
-
-Models with custom routing (see sonicmoe/routing.py for implementations):
-- ernie4_5_moe: softmax→bias correction→topk (softmax_bias_topk_routing)
-- deepseek_v2: softmax→group_limited_greedy (softmax_group_limited_topk_routing)
-- hunyuan_v1_moe: softmax→topk via gate.wg (softmax_topk_wg_routing)
-- gemma4_text: RMSNorm→scale→proj→softmax→topk→renorm→per_expert_scale (experts-level patch)
-"""
+"""Diagnostic helpers for MoE kernel integrations (kernel dispatch itself
+is architecture-agnostic via the ExpertsInterface)."""
 
 import importlib
 
-SPARSE_MOE_BLOCK = {
-    # softmax -> topk routing
-    "qwen2_moe": "Qwen2MoeSparseMoeBlock",
-    "qwen3_moe": "Qwen3MoeSparseMoeBlock",
-    "qwen3_5_moe": "Qwen3_5MoeSparseMoeBlock",
-    "qwen3_5_moe_text": "Qwen3_5MoeSparseMoeBlock",
-    "qwen3_next": "Qwen3NextSparseMoeBlock",
-    "qwen3_vl_moe": "Qwen3VLMoeTextSparseMoeBlock",
-    # qwen3_omni_moe: Thinker (standard) + Talker (shared experts + shared_expert_gate)
-    "qwen3_omni_moe": [
-        "Qwen3OmniMoeThinkerTextSparseMoeBlock",
-        "Qwen3OmniMoeTalkerTextSparseMoeBlock",
-    ],
-    "olmoe": "OlmoeSparseMoeBlock",
-    "mixtral": "MixtralSparseMoeBlock",
-    "minimax": "MiniMaxSparseMoeBlock",
-    # softmax -> topk routing (with group-based expert selection)
-    "mistral4": "Mistral4MoE",
-    # sigmoid -> topk routing (with group-based expert selection)
-    "glm_moe_dsa": "GlmMoeDsaMoE",
-    "deepseek_v3": "DeepseekV3MoE",
-    "glm4_moe": "Glm4MoeMoE",
-    "glm4_moe_lite": "Glm4MoeLiteMoE",
-    "glm4v_moe": "Glm4vMoeTextMoE",
-    # sigmoid -> topk routing (no group selection)
-    "minimax_m2": "MiniMaxM2SparseMoeBlock",
-    # softmax->topk, e_score_correction_bias between softmax and topk
-    "ernie4_5_moe": "Ernie4_5_MoeSparseMoeBlock",
-    # softmax->topk, group_limited_greedy, different attr names (num_group)
-    "deepseek_v2": "DeepseekV2Moe",
-    # softmax->topk, gate.wg (not gate.weight)
-    "hunyuan_v1_moe": "HunYuanMoEV1Moe",
-    # TODO: gpt_oss deferred — transposed weight layout [E,H,2*I], expert biases,
-    # and custom GLU activation require a dedicated forward path in patch.py.
-    # "gpt_oss": "GptOssMLP",
-}
-
-
-# Models where MoE is NOT in a separate SparseMoeBlock but embedded in the
-# decoder layer.  For these, we patch the Experts class forward directly
-# (same signature: hidden_states, top_k_index, top_k_weights -> Tensor).
-# Routing stays untouched — the original model router runs as-is.
+# Models where MoE is embedded in the decoder layer (no separate SparseMoeBlock).
 EXPERTS_ONLY_BLOCK = {
-    # gemma4: hybrid MLP+MoE in decoder layer, custom Gemma4TextRouter,
-    # no SparseMoeBlock.  Experts use @use_experts_implementation with
-    # standard 3D param layout (gate_up_proj [E, 2*I, H], down_proj [E, H, I]).
     "gemma4_text": "Gemma4TextExperts",
 }
 
 
 def resolve_experts_class(model_type: str):
-    """Resolve the Experts class for models that need experts-level patching.
-
-    Returns the class, or None if the model uses SparseMoeBlock-level patching.
-    """
+    """Resolve the Experts class for a known model type, or ``None``."""
     entry = EXPERTS_ONLY_BLOCK.get(model_type)
     if entry is None:
         return None
@@ -93,41 +33,4 @@ def resolve_experts_class(model_type: str):
 
 
 def is_experts_only_model(model_type: str) -> bool:
-    """Check if a model type requires experts-level (not block-level) patching."""
     return model_type in EXPERTS_ONLY_BLOCK
-
-
-def resolve_moe_block_classes(model_type: str):
-    """Resolve all MoE block classes from transformers for the given model type.
-
-    Returns a list of classes (one for most models, multiple for models with
-    distinct MoE block types like qwen3_omni_moe).
-    """
-    entry = SPARSE_MOE_BLOCK.get(model_type)
-    if entry is None:
-        raise ValueError(
-            f"Unsupported MoE model type '{model_type}'. "
-            f"Supported types: {list(SPARSE_MOE_BLOCK.keys())}"
-        )
-
-    cls_names = entry if isinstance(entry, list) else [entry]
-    module_path = f"transformers.models.{model_type}.modeling_{model_type}"
-    try:
-        module = importlib.import_module(module_path)
-    except ModuleNotFoundError:
-        # Text sub-model types (e.g. qwen3_5_moe_text) share the parent module
-        if model_type.endswith("_text"):
-            parent_type = model_type.removesuffix("_text")
-            module_path = f"transformers.models.{parent_type}.modeling_{parent_type}"
-            module = importlib.import_module(module_path)
-        else:
-            raise
-
-    classes = []
-    for cls_name in cls_names:
-        moe_cls = getattr(module, cls_name, None)
-        if moe_cls is None:
-            raise ValueError(f"Could not find class '{cls_name}' in '{module_path}'")
-        classes.append(moe_cls)
-
-    return classes
diff --git a/src/axolotl/integrations/kernels/libs/scattermoe_lora/gemma4_experts.py b/src/axolotl/integrations/kernels/libs/scattermoe_lora/experts.py
similarity index 72%
rename from src/axolotl/integrations/kernels/libs/scattermoe_lora/gemma4_experts.py
rename to src/axolotl/integrations/kernels/libs/scattermoe_lora/experts.py
index 66623e0173..9199c8a595 100644
--- a/src/axolotl/integrations/kernels/libs/scattermoe_lora/gemma4_experts.py
+++ b/src/axolotl/integrations/kernels/libs/scattermoe_lora/experts.py
@@ -1,18 +1,7 @@
-"""
-ScatterMoE-accelerated experts forward for Gemma4.
-
-Gemma4 has no separate SparseMoeBlock — MoE is embedded in the decoder layer.
-The decoder layer handles routing (Gemma4TextRouter) and calls
-``experts(hidden_states, top_k_index, top_k_weights)`` directly.
+"""ScatterMoE experts forward for the transformers ExpertsInterface.
 
-This module registers a ``"scattermoe"`` implementation in the transformers
-``ExpertsInterface``, which the ``@use_experts_implementation`` decorator
-dispatches to when ``config._experts_implementation == "scattermoe"``.
-
-This is the clean way to hook into transformers' MoE dispatch — no
-monkeypatching required.  Works for Gemma4 and any future model that uses
-``@use_experts_implementation`` with the standard forward signature
-``(hidden_states, top_k_index, top_k_weights) -> Tensor``.
+PEFT LoRA on ``gate_up_proj`` / ``down_proj`` is fused into the
+ScatterMoE Triton call via ``parallel_linear_lora``.
 """
 
 import torch
@@ -139,12 +128,23 @@ def scattermoe_experts_forward(
     top_k_index: torch.Tensor,
     top_k_weights: torch.Tensor,
 ) -> torch.Tensor:
-    """ScatterMoE-accelerated experts forward.
+    """ScatterMoE experts forward with fused-LoRA support."""
+    # Assumes the standard expert layout: gate_up concatenated as [E, 2I, H],
+    # gated SwiGLU, no expert bias. gpt_oss-style experts (interleaved gate/up,
+    # transposed [E, H, 2I], expert bias) would be silently miscomputed by the
+    # fixed transpose/chunk below, so reject rather than corrupt training.
+    if (
+        getattr(self, "is_transposed", False)
+        or not getattr(self, "is_concatenated", True)
+        or getattr(self, "has_bias", False)
+        or not getattr(self, "has_gate", True)
+    ):
+        raise NotImplementedError(
+            "scattermoe supports only concatenated, non-transposed, gated, biasless "
+            "experts (qwen/mixtral/deepseek/glm/...). This model's experts use an "
+            "unsupported layout; use use_sonicmoe or a built-in experts_implementation."
+        )
 
-    Drop-in replacement for the standard Experts forward signature used by
-    ``@use_experts_implementation``-decorated classes (Gemma4, Mixtral, etc.):
-    ``(hidden_states [T, H], top_k_index [T, K], top_k_weights [T, K]) -> [T, H]``
-    """
     K = top_k_index.shape[1]
 
     routing_weights = top_k_weights.to(hidden_states.dtype)
@@ -193,22 +193,24 @@ def scattermoe_experts_forward(
     return output
 
 
-def register_scattermoe_experts():
-    """Register ``"scattermoe"`` in the transformers ExpertsInterface.
+_SCATTERMOE_PATCHED = False
 
-    After calling this, any model with ``@use_experts_implementation`` will
-    dispatch to ScatterMoE when ``config._experts_implementation == "scattermoe"``.
 
-    Also patches ``get_correct_experts_implementation`` to accept ``"scattermoe"``
-    as a valid value (transformers hardcodes an allowlist).
+def register_scattermoe_experts():
+    """Register ``"scattermoe"`` in the ExpertsInterface and the validator allowlist.
+
+    Idempotent.
     """
+    global _SCATTERMOE_PATCHED
+
     from transformers.integrations.moe import ALL_EXPERTS_FUNCTIONS
     from transformers.modeling_utils import PreTrainedModel
 
-    # 1. Register the forward function in the global interface
     ALL_EXPERTS_FUNCTIONS.register("scattermoe", scattermoe_experts_forward)
 
-    # 2. Patch the validation to accept "scattermoe"
+    if _SCATTERMOE_PATCHED:
+        return
+
     _original_get_correct = PreTrainedModel.get_correct_experts_implementation
 
     def _patched_get_correct(self_model, requested_experts: str | None) -> str:
@@ -217,19 +219,4 @@ def _patched_get_correct(self_model, requested_experts: str | None) -> str:
         return _original_get_correct(self_model, requested_experts)
 
     PreTrainedModel.get_correct_experts_implementation = _patched_get_correct
-
-
-# Legacy monkeypatch approach (kept for backward compat with existing tests)
-def patch_gemma4_scattermoe():
-    """Monkeypatch Gemma4TextExperts.forward with ScatterMoE kernel."""
-    from axolotl.integrations.kernels.constants import resolve_experts_class
-
-    experts_cls = resolve_experts_class("gemma4_text")
-    if experts_cls is None:
-        raise ValueError("Could not resolve Gemma4TextExperts class")
-
-    if hasattr(experts_cls, "_original_forward"):
-        return  # already patched
-
-    experts_cls._original_forward = experts_cls.forward
-    experts_cls.forward = scattermoe_experts_forward
+    _SCATTERMOE_PATCHED = True
diff --git a/src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/lora_ops.py b/src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/lora_ops.py
index e8d4309f9b..a453e83495 100644
--- a/src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/lora_ops.py
+++ b/src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/lora_ops.py
@@ -54,6 +54,17 @@ def _next_power_of_2(n: int) -> int:
     return n + 1
 
 
+# Granularity for the autotune cache key on M. The kernel still runs on the
+# real M (loop bounds + masks); only the @triton.autotune key is bucketed so
+# that varying seqlens/routing don't keep invalidating the cache.
+_M_BUCKET_GRANULARITY = 1024
+
+
+def _bucket_m(m: int) -> int:
+    g = _M_BUCKET_GRANULARITY
+    return ((m + g - 1) // g) * g
+
+
 # Triton tl.dot requires minimum tile dimensions of 16 on modern GPUs.
 MIN_TRITON_DOT_SIZE = 16
 
@@ -450,7 +461,7 @@ def _prune_fwd_configs(configs, named_args, **kwargs):
 
 @triton.autotune(
     configs=_scatter2scatter_lora_configs(),
-    key=["M", "N", "K"],
+    key=["M_BUCKET", "N", "K"],
     prune_configs_by={"early_config_prune": _prune_fwd_configs},
 )
 @triton.heuristics(
@@ -489,6 +500,7 @@ def _scatter2scatter_lora(
     # Dimensions
     FAN_OUT: tl.constexpr,
     M,
+    M_BUCKET,
     K: tl.constexpr,
     N: tl.constexpr,
     E: tl.constexpr,
@@ -506,6 +518,7 @@ def _scatter2scatter_lora(
     y_grouped: tl.constexpr,
     NO_K_MASK: tl.constexpr,
     NO_N_MASK: tl.constexpr,
+    INT64_INDICES: tl.constexpr = False,
 ):
     """
     Fused scatter2scatter with LoRA: Y = X @ W + scaling * (X @ A^T) @ B^T + bias
@@ -517,6 +530,8 @@ def _scatter2scatter_lora(
     N_block_id = pid % N_BLOCK_COUNT
 
     M_block = M_block_id * BLOCK_M + tl.arange(0, BLOCK_M)
+    if INT64_INDICES:
+        M_block = M_block.to(tl.int64)
     N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
     N_mask = N_block < N
     M_boundary_mask = M_block < (FAN_OUT * M)
@@ -529,7 +544,10 @@ def _scatter2scatter_lora(
 
     E_first_idx = tl.min(E_idxs)
     E_last_idx = tl.minimum(tl.max(E_idxs), E - 1)
-    M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int32)
+    if INT64_INDICES:
+        M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int64)
+    else:
+        M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int32)
 
     for E_idx in range(E_first_idx, E_last_idx + 1):
         E_mask = E_idxs == E_idx
@@ -600,6 +618,7 @@ def _scatter2scatter_lora_split(
     x_grouped: bool = False,
     y_grouped: bool = False,
     out: Optional[torch.Tensor] = None,
+    int64_indices: bool = False,
 ) -> torch.Tensor:
     """Split base+LoRA forward: 3 scatter2scatter calls, no fused LoRA kernel.
 
@@ -629,6 +648,7 @@ def _scatter2scatter_lora_split(
         x_grouped=x_grouped,
         y_grouped=y_grouped,
         out=out,
+        int64_indices=int64_indices,
     )
 
     # 2. XA = X @ A^T  (tiny: output is [M*k, R])
@@ -642,6 +662,7 @@ def _scatter2scatter_lora_split(
         k=k,
         x_grouped=x_grouped,
         y_grouped=True,
+        int64_indices=int64_indices,
     )
 
     # 3. Y_lora = XA @ B^T  (R is tiny, so this is very fast)
@@ -655,6 +676,7 @@ def _scatter2scatter_lora_split(
         k=1,
         x_grouped=True,
         y_grouped=y_grouped,
+        int64_indices=int64_indices,
     )
 
     # 4. Y = Y_base + scaling * Y_lora
@@ -683,6 +705,7 @@ def scatter2scatter_lora(
     x_grouped: bool = False,
     y_grouped: bool = False,
     out: Optional[torch.Tensor] = None,
+    int64_indices: bool = False,
 ) -> torch.Tensor:
     """
     Scatter2scatter with LoRA: Y[i] = X[i] @ W[e] + scaling * (X[i] @ A[e]^T) @ B[e]^T + b[e]
@@ -729,6 +752,7 @@ def scatter2scatter_lora(
             x_grouped,
             y_grouped,
             out,
+            int64_indices=int64_indices,
         )
 
     assert sorted_scattered_idxs.size(0) == sorted_expert_idxs.size(0)
@@ -783,6 +807,7 @@ def grid(META):
         sorted_expert_idxs,
         FAN_OUT=k,
         M=X.size(0),
+        M_BUCKET=_bucket_m(X.size(0)),
         K=K,
         N=N,
         E=E,
@@ -793,6 +818,7 @@ def grid(META):
         allow_tf32=ALLOW_TF32,
         x_grouped=x_grouped,
         y_grouped=y_grouped,
+        INT64_INDICES=int64_indices,
     )
 
     return output
@@ -1030,7 +1056,7 @@ def _prune_dX_configs(configs, named_args, **kwargs):
 
 @triton.autotune(
     configs=_scatter2scatter_lora_dX_configs(),
-    key=["M", "N", "K"],
+    key=["M_BUCKET", "N", "K"],
     prune_configs_by={"early_config_prune": _prune_dX_configs},
 )
 @triton.heuristics(
@@ -1067,6 +1093,7 @@ def _scatter2scatter_lora_dX(
     # Dimensions
     FAN_OUT: tl.constexpr,
     M,
+    M_BUCKET,
     K: tl.constexpr,
     N: tl.constexpr,
     E: tl.constexpr,
@@ -1084,6 +1111,7 @@ def _scatter2scatter_lora_dX(
     dx_grouped: tl.constexpr,
     NO_K_MASK: tl.constexpr,
     NO_N_MASK: tl.constexpr,
+    INT64_INDICES: tl.constexpr = False,
 ):
     """
     Fused backward dX = DY @ W^T + scaling * (DY @ B) @ A
@@ -1100,6 +1128,8 @@ def _scatter2scatter_lora_dX(
     K_block_id = pid % K_BLOCK_COUNT
 
     M_block = M_block_id * BLOCK_M + tl.arange(0, BLOCK_M)
+    if INT64_INDICES:
+        M_block = M_block.to(tl.int64)
     K_block = K_block_id * BLOCK_K + tl.arange(0, BLOCK_K)
     K_mask = K_block < K
     M_boundary_mask = M_block < (FAN_OUT * M)
@@ -1112,7 +1142,10 @@ def _scatter2scatter_lora_dX(
 
     E_first_idx = tl.min(E_idxs)
     E_last_idx = tl.minimum(tl.max(E_idxs), E - 1)
-    M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int32)
+    if INT64_INDICES:
+        M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int64)
+    else:
+        M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int32)
 
     for E_idx in range(E_first_idx, E_last_idx + 1):
         E_mask = E_idxs == E_idx
@@ -1175,6 +1208,7 @@ def scatter2scatter_lora_dX(
     dy_grouped: bool = True,
     dx_grouped: bool = False,
     out: Optional[torch.Tensor] = None,
+    int64_indices: bool = False,
 ) -> torch.Tensor:
     """
     Fused backward dX = DY @ W^T + scaling * (DY @ B) @ A
@@ -1250,6 +1284,7 @@ def grid(META):
         sorted_expert_idxs,
         FAN_OUT=fan_out,
         M=M,
+        M_BUCKET=_bucket_m(M),
         K=K,
         N=N,
         E=E,
@@ -1261,6 +1296,7 @@ def grid(META):
         allow_tf32=ALLOW_TF32,
         dy_grouped=dy_grouped,
         dx_grouped=dx_grouped,
+        INT64_INDICES=int64_indices,
     )
 
     return output
@@ -1363,7 +1399,7 @@ def _prune_bwd_lora_configs(configs, named_args, **kwargs):
 
 @triton.autotune(
     configs=_group_bwd_lora_configs(),
-    key=["M", "N", "K"],
+    key=["M_BUCKET", "N", "K"],
     prune_configs_by={"early_config_prune": _prune_bwd_lora_configs},
     reset_to_zero=["DLA_ptr", "DLB_ptr"],
 )
@@ -1400,6 +1436,7 @@ def _group_bwd_lora(
     expert_offsets_ptr,
     # Dimensions
     M,
+    M_BUCKET,
     K: tl.constexpr,
     N: tl.constexpr,
     ACTUAL_R: tl.constexpr,  # True LoRA rank (for weight indexing)
@@ -1413,6 +1450,7 @@ def _group_bwd_lora(
     allow_tf32: tl.constexpr,
     NO_K_MASK: tl.constexpr,
     NO_N_MASK: tl.constexpr,
+    INT64_INDICES: tl.constexpr = False,
 ):
     """
     Compute LoRA gradients for each expert on grouped data.
@@ -1434,15 +1472,24 @@ def _group_bwd_lora(
     N_block_id = pid1
 
     # Get expert's token range from cumulative offsets
-    if E_idx == 0:
-        start_idx = 0
+    if INT64_INDICES:
+        if E_idx == 0:
+            start_idx = tl.zeros([], dtype=tl.int64)
+        else:
+            start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int64)
+        end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int64)
     else:
-        start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int32)
-    end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int32)
+        if E_idx == 0:
+            start_idx = tl.zeros([], dtype=tl.int32)
+        else:
+            start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int32)
+        end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int32)
     num_tokens = end_idx - start_idx
 
     if num_tokens > 0:
         M_block = tl.arange(0, BLOCK_M)
+        if INT64_INDICES:
+            M_block = M_block.to(tl.int64)
         K_block = K_block_id * BLOCK_K + tl.arange(0, BLOCK_K)
         K_mask = K_block < K
         N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
@@ -1601,7 +1648,7 @@ def _prune_split_configs(configs, named_args, **kwargs):
 
 @triton.autotune(
     configs=_group_bwd_split_configs(),
-    key=["M", "K", "N"],
+    key=["M_BUCKET", "K", "N"],
     prune_configs_by={"early_config_prune": _prune_split_configs},
 )
 @triton.heuristics(
@@ -1634,6 +1681,7 @@ def _group_bwd_lora_split(
     expert_offsets_ptr,
     # Dimensions
     M,
+    M_BUCKET,
     K: tl.constexpr,
     N: tl.constexpr,
     ACTUAL_R: tl.constexpr,
@@ -1648,6 +1696,7 @@ def _group_bwd_lora_split(
     ACC_TYPE: tl.constexpr,
     allow_tf32: tl.constexpr,
     NO_DIM_MASK: tl.constexpr,
+    INT64_INDICES: tl.constexpr = False,
 ):
     """
     Unified split kernel for LoRA gradient computation.
@@ -1671,11 +1720,18 @@ def _group_bwd_lora_split(
     E_idx = tl.program_id(0)
     dim_block_id = tl.program_id(1)
 
-    if E_idx == 0:
-        start_idx = 0
+    if INT64_INDICES:
+        if E_idx == 0:
+            start_idx = tl.zeros([], dtype=tl.int64)
+        else:
+            start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int64)
+        end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int64)
     else:
-        start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int32)
-    end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int32)
+        if E_idx == 0:
+            start_idx = tl.zeros([], dtype=tl.int32)
+        else:
+            start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int32)
+        end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int32)
     num_tokens = end_idx - start_idx
 
     # Output dimension tile (K for dA, N for dB)
@@ -1707,6 +1763,8 @@ def _group_bwd_lora_split(
 
     if num_tokens > 0:
         M_block = tl.arange(0, BLOCK_M)
+        if INT64_INDICES:
+            M_block = M_block.to(tl.int64)
         INPUT_DTYPE = X_ptr.dtype.element_ty
         BLOCK_INNER: tl.constexpr = 64
         inner_iters = tl.cdiv(INNER_DIM, BLOCK_INNER)
@@ -1826,6 +1884,7 @@ def group_bwd_lora(
     scaling: float,
     sorted_scattered_idxs: Optional[torch.Tensor] = None,
     k: int = 1,
+    int64_indices: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Compute LoRA gradients for A and B on expert-grouped data.
@@ -1875,6 +1934,7 @@ def grid_dA(META):
         dA.stride(1),
         expert_offsets,
         M=DY.size(0),
+        M_BUCKET=_bucket_m(DY.size(0)),
         K=K,
         N=N,
         ACTUAL_R=R,
@@ -1884,6 +1944,7 @@ def grid_dA(META):
         COMPUTE_DA=True,
         ACC_TYPE=tl.float32,
         allow_tf32=ALLOW_TF32,
+        INT64_INDICES=int64_indices,
     )
 
     def grid_dB(META):
@@ -1904,6 +1965,7 @@ def grid_dB(META):
         dB.stride(1),
         expert_offsets,
         M=DY.size(0),
+        M_BUCKET=_bucket_m(DY.size(0)),
         K=K,
         N=N,
         ACTUAL_R=R,
@@ -1913,6 +1975,7 @@ def grid_dB(META):
         COMPUTE_DA=False,
         ACC_TYPE=tl.float32,
         allow_tf32=ALLOW_TF32,
+        INT64_INDICES=int64_indices,
     )
 
     return dA, dB
@@ -1925,7 +1988,7 @@ def grid_dB(META):
 
 @triton.autotune(
     configs=_group_bwd_lora_configs(),
-    key=["M", "N", "K"],
+    key=["M_BUCKET", "N", "K"],
     prune_configs_by={"early_config_prune": _prune_bwd_lora_configs},
     reset_to_zero=["DLA_ptr", "DLB_ptr"],
 )
@@ -1967,6 +2030,7 @@ def _group_bwd_lora_fused(
     real_expert_offsets_ptr,
     # Dimensions
     M,
+    M_BUCKET,
     K: tl.constexpr,
     N: tl.constexpr,
     ACTUAL_R: tl.constexpr,
@@ -1982,6 +2046,7 @@ def _group_bwd_lora_fused(
     NO_N_MASK: tl.constexpr,
     # Whether DY is already in grouped (expert-sorted) order
     dy_grouped: tl.constexpr = False,
+    INT64_INDICES: tl.constexpr = False,
 ):
     """
     Fused gather + LoRA gradient computation. Same as _group_bwd_lora but
@@ -2018,18 +2083,30 @@ def _group_bwd_lora_fused(
     # Get expert's token range from cumulative offsets
     # start_idx/end_idx from expert_offsets_ptr: iteration range (possibly padded)
     # real_end_idx from real_expert_offsets_ptr: for M_mask (real token count)
-    if E_idx == 0:
-        start_idx = 0
-        real_start_idx = 0
+    if INT64_INDICES:
+        if E_idx == 0:
+            start_idx = tl.zeros([], dtype=tl.int64)
+            real_start_idx = tl.zeros([], dtype=tl.int64)
+        else:
+            start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int64)
+            real_start_idx = tl.load(real_expert_offsets_ptr + E_idx - 1).to(tl.int64)
+        end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int64)
+        real_end_idx = tl.load(real_expert_offsets_ptr + E_idx).to(tl.int64)
     else:
-        start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int32)
-        real_start_idx = tl.load(real_expert_offsets_ptr + E_idx - 1).to(tl.int32)
-    end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int32)
-    real_end_idx = tl.load(real_expert_offsets_ptr + E_idx).to(tl.int32)
+        if E_idx == 0:
+            start_idx = tl.zeros([], dtype=tl.int32)
+            real_start_idx = tl.zeros([], dtype=tl.int32)
+        else:
+            start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int32)
+            real_start_idx = tl.load(real_expert_offsets_ptr + E_idx - 1).to(tl.int32)
+        end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int32)
+        real_end_idx = tl.load(real_expert_offsets_ptr + E_idx).to(tl.int32)
     num_tokens = end_idx - start_idx
 
     if num_tokens > 0:
         M_block = tl.arange(0, BLOCK_M)
+        if INT64_INDICES:
+            M_block = M_block.to(tl.int64)
         K_block = K_block_id * BLOCK_K + tl.arange(0, BLOCK_K)
         K_mask = K_block < K
         N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
@@ -2074,9 +2151,14 @@ def _group_bwd_lora_fused(
             M_mask = M_local < real_num_tokens
 
             # Fused gather: load scatter indices for indirect X access
-            scatter_idx = tl.load(
-                sorted_scattered_idxs_ptr + M_idx, mask=M_mask, other=0
-            ).to(tl.int32)
+            if INT64_INDICES:
+                scatter_idx = tl.load(
+                    sorted_scattered_idxs_ptr + M_idx, mask=M_mask, other=0
+                ).to(tl.int64)
+            else:
+                scatter_idx = tl.load(
+                    sorted_scattered_idxs_ptr + M_idx, mask=M_mask, other=0
+                ).to(tl.int32)
             X_token_idx = scatter_idx // FAN_OUT  # X is [M, K], not expanded by k
 
             # Load X via indirect index: [BLOCK_M, BLOCK_K]
@@ -2154,6 +2236,7 @@ def group_bwd_lora_fused(
     scaling: float,
     real_expert_offsets: Optional[torch.Tensor] = None,
     dy_grouped: bool = False,
+    int64_indices: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Fused gather + LoRA gradient computation. Same result as
@@ -2230,6 +2313,7 @@ def grid(META):
         expert_offsets_ptr=expert_offsets,
         real_expert_offsets_ptr=real_expert_offsets,
         M=sorted_scattered_idxs.size(0),
+        M_BUCKET=_bucket_m(sorted_scattered_idxs.size(0)),
         K=K,
         N=N,
         ACTUAL_R=R,
@@ -2238,6 +2322,975 @@ def grid(META):
         ACC_TYPE=tl.float32,
         allow_tf32=ALLOW_TF32,
         dy_grouped=dy_grouped,
+        INT64_INDICES=int64_indices,
     )
 
     return dA, dB
+
+
+# =============================================================================
+# Fused MXFP4 Forward / dX Kernels
+# =============================================================================
+#
+# These mirror ``_scatter2scatter_lora`` and ``_scatter2scatter_lora_dX`` but
+# load the base weight tile from a packed MXFP4 buffer + E8M0 scale buffer
+# instead of a dense bf16 tile. The K-loop unpacks two fp4 values per uint8
+# byte, looks them up in a 16-entry fp32 codebook, multiplies by
+# ``2^(scale_byte - 127)``, and casts back to bf16 for ``tl.dot``.
+#
+# Layout conventions (kernel coordinates):
+#   * Forward kernel: logical W is ``[E, K, N]`` (block axis = K).
+#     - packed: ``[E, N, K/2]`` uint8 — stored with the contraction axis K
+#       contiguous (matches torchao MXTensor's natural last-dim block layout
+#       once you treat the W storage as ``[E, N, K]``).
+#     - scale:  ``[E, N, K/32]`` uint8 (E8M0).
+#   * dX kernel: reuses the *forward* MX layout ``[E, N, K/2]`` (no
+#     pre-transpose). The kernel iterates the N reduction in outer tiles and,
+#     for each (K_tile, N_tile), decodes nibbles along the K rows of the
+#     packed tile and broadcasts scales within each ``MX_BLOCK_SIZE`` K-block,
+#     yielding the same dequantized ``W[e, k, n]`` values the forward path
+#     consumes. This deliberately avoids a "pre-transpose for dX" step that
+#     would dequantize + transpose + re-quantize the active-experts slice:
+#     that round-trip introduces a second MX rounding error on top of the
+#     forward quantization, perturbing dX in ways that are hard to bound.
+#     Reusing the forward buffer keeps numerics bitwise-comparable to a
+#     dequant-then-MMA dX reference.
+#
+# ``BLOCK_K`` must be a multiple of the OCP block size (32) so that each
+# K-tile aligns with whole scale blocks for both the forward and dX
+# kernels. The autotune config search space is pruned accordingly in
+# ``_prune_fwd_mx_configs`` / ``_prune_dX_mx_configs``.
+
+_MX_BLOCK_SIZE = 32
+
+
+@triton.jit
+def _compute_expert_block_lora_mxfp4(
+    E_idx,
+    E_mask,
+    M_in_idx,
+    N_block,
+    N_mask,
+    # X
+    X_ptr,
+    stride_xm,
+    stride_xk,
+    # Packed MXFP4 weight: [E, N, K/2] uint8
+    Wp_ptr,
+    stride_wpe,
+    stride_wpn,
+    stride_wpk,
+    # E8M0 scale: [E, N, K/32] uint8
+    Ws_ptr,
+    stride_wse,
+    stride_wsn,
+    stride_wsk,
+    # FP4 -> fp32 codebook (16 values)
+    Codebook_ptr,
+    # LoRA
+    A_ptr,
+    stride_ar,
+    stride_ak,
+    B_ptr,
+    stride_bn,
+    stride_br,
+    K,
+    ACTUAL_R: tl.constexpr,
+    acc,
+    no_k_mask,
+    BLOCK_M: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_R: tl.constexpr,
+    MX_BLOCK_SIZE: tl.constexpr,
+    scaling,
+    allow_tf32: tl.constexpr,
+):
+    """Forward inner loop for MXFP4 expert weights.
+
+    Computes ``acc += X @ dequant(W_e) + scaling * (X @ A_e^T) @ B_e^T`` for
+    the active token rows in this M-tile assigned to expert ``E_idx``.
+
+    Each K-loop iteration loads a ``[BLOCK_N, BLOCK_K/2]`` packed tile and a
+    ``[BLOCK_N, BLOCK_K/MX_BLOCK_SIZE]`` scale tile, unpacks to bf16, and
+    transposes for the matmul.
+    """
+    K_block = tl.arange(0, BLOCK_K)
+    K_byte_block = K_block // 2
+    K_is_high = (K_block % 2) == 1
+    K_scale_block = K_block // MX_BLOCK_SIZE
+    R_block = tl.arange(0, BLOCK_R)
+    R_mask = R_block < ACTUAL_R
+
+    INPUT_DTYPE = X_ptr.dtype.element_ty
+
+    # X pointers
+    X_blk_ptrs = X_ptr + M_in_idx[:, None] * stride_xm + K_block[None, :] * stride_xk
+
+    # LoRA A pointers
+    A_expert_offset = E_idx * ACTUAL_R
+    A_blk_ptrs = (
+        A_ptr
+        + (A_expert_offset + R_block)[:, None] * stride_ar
+        + K_block[None, :] * stride_ak
+    )
+
+    # Packed W pointers: tile shape [BLOCK_N, BLOCK_K] (each byte loaded twice)
+    Wp_blk_ptrs = (
+        Wp_ptr
+        + E_idx * stride_wpe
+        + N_block[:, None] * stride_wpn
+        + K_byte_block[None, :] * stride_wpk
+    )
+    # Scale pointers: tile shape [BLOCK_N, BLOCK_K] (broadcast within block)
+    Ws_blk_ptrs = (
+        Ws_ptr
+        + E_idx * stride_wse
+        + N_block[:, None] * stride_wsn
+        + K_scale_block[None, :] * stride_wsk
+    )
+
+    iters = tl.cdiv(K, BLOCK_K)
+    xa_acc = tl.zeros((BLOCK_M, BLOCK_R), dtype=tl.float32)
+
+    for i in range(iters):
+        if no_k_mask:
+            K_mask_iter = K_block >= 0  # all-true [BLOCK_K]
+        else:
+            K_mask_iter = (i * BLOCK_K + K_block) < K
+        x_mask = E_mask[:, None] & K_mask_iter[None, :]
+        a_mask = R_mask[:, None] & K_mask_iter[None, :]
+        w_mask = N_mask[:, None] & K_mask_iter[None, :]
+
+        x = tl.load(X_blk_ptrs, mask=x_mask, other=0.0).to(INPUT_DTYPE)
+        a = tl.load(A_blk_ptrs, mask=a_mask, other=0.0).to(INPUT_DTYPE)
+
+        # MXFP4 dequant
+        packed = tl.load(Wp_blk_ptrs, mask=w_mask, other=0).to(tl.int32)
+        nibble = tl.where(K_is_high[None, :], (packed >> 4) & 0xF, packed & 0xF)
+        codebook_val = tl.load(Codebook_ptr + nibble)  # [BLOCK_N, BLOCK_K] fp32
+        scale_byte = tl.load(Ws_blk_ptrs, mask=w_mask, other=0).to(tl.int32)
+        scale_fp = tl.exp2((scale_byte - 127).to(tl.float32))
+        w_dq_nk = (codebook_val * scale_fp).to(INPUT_DTYPE)  # [BLOCK_N, BLOCK_K]
+        w_tile = tl.trans(w_dq_nk)  # [BLOCK_K, BLOCK_N]
+
+        # Base: acc += X @ W
+        acc += tl.dot(x, w_tile, allow_tf32=allow_tf32).to(tl.float32)
+        # LoRA: xa_acc += X @ A^T
+        xa_acc += tl.dot(x, tl.trans(a), allow_tf32=allow_tf32).to(tl.float32)
+
+        X_blk_ptrs += BLOCK_K * stride_xk
+        A_blk_ptrs += BLOCK_K * stride_ak
+        Wp_blk_ptrs += (BLOCK_K // 2) * stride_wpk
+        Ws_blk_ptrs += (BLOCK_K // MX_BLOCK_SIZE) * stride_wsk
+
+    # Epilogue (B @ xa_acc^T) — identical to dense path
+    B_expert_offset = E_idx * ACTUAL_R
+    B_blk_ptrs = (
+        B_ptr
+        + N_block[:, None] * stride_bn
+        + (B_expert_offset + R_block)[None, :] * stride_br
+    )
+    b = tl.load(B_blk_ptrs, mask=N_mask[:, None] & R_mask[None, :], other=0.0)
+    b_inp = b.to(INPUT_DTYPE)
+    lora_out = tl.dot(xa_acc.to(INPUT_DTYPE), tl.trans(b_inp), allow_tf32=allow_tf32)
+    acc += scaling * lora_out
+    return acc
+
+
+def _scatter2scatter_lora_mx_configs():
+    """Forward MX kernel configs. BLOCK_K must be a multiple of MX_BLOCK_SIZE."""
+    configs = []
+    for block_m, block_n, block_k, warps, stages in product(
+        [32, 64, 128],
+        [32, 64],
+        [32, 64, 128],  # all multiples of MX_BLOCK_SIZE=32
+        [4, 8],
+        [3, 4, 5],
+    ):
+        configs.append(
+            triton.Config(
+                {"BLOCK_M": block_m, "BLOCK_N": block_n, "BLOCK_K": block_k},
+                num_stages=stages,
+                num_warps=warps,
+            )
+        )
+    return configs
+
+
+def _prune_fwd_mx_configs(configs, named_args, **kwargs):
+    """Prune MX forward configs by SMEM and register pressure.
+
+    MX-aware accounting adds the packed W tile and the scale tile per
+    pipeline stage to the base GEMM SMEM estimate. Both tiles are sized
+    [BLOCK_N, BLOCK_K] uint8 in the kernel: the packed buffer reads each
+    byte twice (because K_byte = K // 2 indexes a [BLOCK_K]-wide vector
+    into a K/2-stride buffer), and the scale buffer reads each byte
+    MX_BLOCK_SIZE times (broadcast within each K-block). This matches
+    the conservative full-tile accounting in ``_prune_dX_mx_configs``.
+    Also require BLOCK_K % MX_BLOCK_SIZE == 0.
+    """
+    smem_cap = _get_smem_capacity()
+    block_r = named_args.get("BLOCK_R", 64)
+
+    scored = []
+    for config in configs:
+        block_m = config.kwargs["BLOCK_M"]
+        block_n = config.kwargs["BLOCK_N"]
+        block_k = config.kwargs["BLOCK_K"]
+        if block_k % _MX_BLOCK_SIZE != 0:
+            continue
+        # Base GEMM tiles (X, dequantized W, acc)
+        smem_base = _estimate_smem_usage(config.num_stages, block_m, block_n, block_k)
+        # MX-specific loads per pipeline stage: packed and scale tiles are
+        # both [BLOCK_N, BLOCK_K] bytes (see docstring for why each is full
+        # tile size, not BLOCK_K/2 or BLOCK_K/MX_BLOCK_SIZE).
+        smem_packed = config.num_stages * block_n * block_k * 1
+        smem_scale = config.num_stages * block_n * block_k * 1
+        # LoRA tiles
+        smem_lora_loop = config.num_stages * block_r * block_k * 2
+        smem_lora_epilogue = block_n * block_r * 2
+        smem = (
+            smem_base + smem_packed + smem_scale + smem_lora_loop + smem_lora_epilogue
+        )
+
+        est_regs = _estimate_register_pressure(
+            config.num_warps,
+            (block_m, block_n),  # acc
+            (block_m, block_r),  # xa_acc
+            (block_m, block_k),  # x tile
+            (block_n, block_k),  # dequantized w (before transpose)
+            (block_r, block_k),  # a tile
+            (block_n, block_r),  # b tile (epilogue)
+        )
+        if est_regs > _MAX_REGS_SOFT_LIMIT:
+            continue
+
+        scored.append((smem, config))
+
+    pruned = [c for s, c in scored if s <= smem_cap - _SMEM_SLACK]
+    if pruned:
+        return pruned
+    if scored:
+        scored.sort(key=lambda x: x[0])
+        return [scored[0][1]]
+    return [
+        min(
+            configs,
+            key=lambda c: (
+                c.kwargs["BLOCK_M"] * c.kwargs["BLOCK_N"] * c.kwargs["BLOCK_K"]
+            ),
+        )
+    ]
+
+
+@triton.autotune(
+    configs=_scatter2scatter_lora_mx_configs(),
+    key=["M_BUCKET", "N", "K"],
+    prune_configs_by={"early_config_prune": _prune_fwd_mx_configs},
+)
+@triton.heuristics(
+    {
+        "NO_K_MASK": lambda args: (args["K"] % args["BLOCK_K"]) == 0,
+        "NO_N_MASK": lambda args: (args["N"] % args["BLOCK_N"]) == 0,
+    }
+)
+@triton.jit
+def _scatter2scatter_lora_mx(
+    # X
+    X_ptr,
+    stride_xm: tl.constexpr,
+    stride_xk: tl.constexpr,
+    # Packed MXFP4 W [E, N, K/2]
+    Wp_ptr,
+    stride_wpe,
+    stride_wpn: tl.constexpr,
+    stride_wpk: tl.constexpr,
+    # E8M0 scale [E, N, K/32]
+    Ws_ptr,
+    stride_wse,
+    stride_wsn: tl.constexpr,
+    stride_wsk: tl.constexpr,
+    # FP4 codebook (16 fp32 values)
+    Codebook_ptr,
+    # Output
+    Y_ptr,
+    stride_ym: tl.constexpr,
+    stride_yn: tl.constexpr,
+    # Bias
+    Bias_ptr,
+    stride_bias_e: tl.constexpr,
+    stride_bias_n: tl.constexpr,
+    # LoRA
+    LA_ptr,
+    stride_la_r,
+    stride_la_k,
+    LB_ptr,
+    stride_lb_n,
+    stride_lb_r,
+    # Routing
+    grouped_idx_ptr,
+    expert_idxs_ptr,
+    # Dimensions
+    FAN_OUT: tl.constexpr,
+    M,
+    M_BUCKET,
+    K: tl.constexpr,
+    N: tl.constexpr,
+    E: tl.constexpr,
+    ACTUAL_R: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    BLOCK_R: tl.constexpr,
+    MX_BLOCK_SIZE: tl.constexpr,
+    ACC_TYPE: tl.constexpr,
+    scaling,
+    allow_tf32: tl.constexpr,
+    x_grouped: tl.constexpr,
+    y_grouped: tl.constexpr,
+    NO_K_MASK: tl.constexpr,
+    NO_N_MASK: tl.constexpr,
+    INT64_INDICES: tl.constexpr = False,
+):
+    """Fused scatter2scatter forward with MXFP4 base weights + LoRA."""
+    pid = tl.program_id(axis=0)
+    N_BLOCK_COUNT = tl.cdiv(N, BLOCK_N)
+    M_block_id = pid // N_BLOCK_COUNT
+    N_block_id = pid % N_BLOCK_COUNT
+
+    M_block = M_block_id * BLOCK_M + tl.arange(0, BLOCK_M)
+    if INT64_INDICES:
+        M_block = M_block.to(tl.int64)
+    N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
+    N_mask = N_block < N
+    M_boundary_mask = M_block < (FAN_OUT * M)
+
+    E_idxs = tl.load(expert_idxs_ptr + M_block, mask=M_boundary_mask, other=E)
+    no_k_mask = NO_K_MASK
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+
+    E_first_idx = tl.min(E_idxs)
+    E_last_idx = tl.minimum(tl.max(E_idxs), E - 1)
+    if INT64_INDICES:
+        M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int64)
+    else:
+        M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int32)
+
+    for E_idx in range(E_first_idx, E_last_idx + 1):
+        E_mask = E_idxs == E_idx
+        if x_grouped:
+            M_in_idx = M_block
+        else:
+            M_in_idx = M_idx // FAN_OUT
+
+        acc = _compute_expert_block_lora_mxfp4(
+            E_idx,
+            E_mask,
+            M_in_idx,
+            N_block,
+            N_mask,
+            X_ptr,
+            stride_xm,
+            stride_xk,
+            Wp_ptr,
+            stride_wpe,
+            stride_wpn,
+            stride_wpk,
+            Ws_ptr,
+            stride_wse,
+            stride_wsn,
+            stride_wsk,
+            Codebook_ptr,
+            LA_ptr,
+            stride_la_r,
+            stride_la_k,
+            LB_ptr,
+            stride_lb_n,
+            stride_lb_r,
+            K,
+            ACTUAL_R,
+            acc,
+            no_k_mask,
+            BLOCK_M,
+            BLOCK_K,
+            BLOCK_N,
+            BLOCK_R,
+            MX_BLOCK_SIZE,
+            scaling,
+            allow_tf32=allow_tf32,
+        )
+
+    if Bias_ptr is not None:
+        B_blk_ptrs = (
+            Bias_ptr
+            + E_idxs[:, None] * stride_bias_e
+            + N_block[None, :] * stride_bias_n
+        )
+        acc += tl.load(B_blk_ptrs, mask=M_boundary_mask[:, None] & N_mask[None, :])
+
+    if y_grouped:
+        M_out_idx = M_block
+    else:
+        M_out_idx = M_idx
+    Y_blk_ptrs = Y_ptr + (M_out_idx[:, None] * stride_ym + N_block[None, :] * stride_yn)
+    tl.store(Y_blk_ptrs, acc, mask=M_boundary_mask[:, None] & N_mask[None, :])
+
+
+def scatter2scatter_lora_mx(
+    X: torch.Tensor,
+    W_mx,  # MXWeights with layout=FWD
+    sorted_expert_idxs: torch.Tensor,
+    sorted_scattered_idxs: torch.Tensor,
+    k: int,
+    lora_A: torch.Tensor,
+    lora_B: torch.Tensor,
+    scaling: float,
+    b: Optional[torch.Tensor] = None,
+    x_grouped: bool = False,
+    y_grouped: bool = False,
+    out: Optional[torch.Tensor] = None,
+    int64_indices: bool = False,
+) -> torch.Tensor:
+    """Forward dispatcher for the fused MXFP4 + LoRA kernel.
+
+    ``W_mx`` is an ``MXWeights`` instance in ``FWD`` layout (block axis = K).
+    """
+    from axolotl.integrations.kernels.libs.scattermoe_lora.mx_weights import (
+        MXLayout,
+        fp4_codebook,
+    )
+
+    assert W_mx.layout == MXLayout.FWD, (
+        f"scatter2scatter_lora_mx requires FWD layout, got {W_mx.layout}"
+    )
+    assert sorted_scattered_idxs.size(0) == sorted_expert_idxs.size(0)
+    assert sorted_scattered_idxs.size(0) == X.size(0) * k
+
+    K = W_mx.K
+    N = W_mx.N
+    E = W_mx.packed.size(0)
+    R = lora_A.size(0) // E
+    BLOCK_R = _block_r_for_rank(R)
+    L_scattered = sorted_expert_idxs.size(0)
+
+    if out is None:
+        output = torch.empty((L_scattered, N), device=X.device, dtype=X.dtype)
+    else:
+        assert out.size(0) == L_scattered and out.size(1) == N
+        output = out
+
+    def grid(META):
+        return (
+            triton.cdiv(L_scattered, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),
+        )
+
+    if b is None:
+        stride_be = stride_bn = 0
+        b_ptr = None
+    else:
+        stride_be, stride_bn = b.stride()
+        b_ptr = b
+
+    codebook = fp4_codebook(X.device)
+
+    _scatter2scatter_lora_mx[grid](
+        X,
+        X.stride(0),
+        X.stride(1),
+        W_mx.packed,
+        W_mx.packed.stride(0),
+        W_mx.packed.stride(1),
+        W_mx.packed.stride(2),
+        W_mx.scales,
+        W_mx.scales.stride(0),
+        W_mx.scales.stride(1),
+        W_mx.scales.stride(2),
+        codebook,
+        output,
+        output.stride(0),
+        output.stride(1),
+        b_ptr,
+        stride_be,
+        stride_bn,
+        lora_A,
+        lora_A.stride(0),
+        lora_A.stride(1),
+        lora_B,
+        lora_B.stride(0),
+        lora_B.stride(1),
+        sorted_scattered_idxs,
+        sorted_expert_idxs,
+        FAN_OUT=k,
+        M=X.size(0),
+        M_BUCKET=_bucket_m(X.size(0)),
+        K=K,
+        N=N,
+        E=E,
+        ACTUAL_R=R,
+        BLOCK_R=BLOCK_R,
+        MX_BLOCK_SIZE=_MX_BLOCK_SIZE,
+        ACC_TYPE=tl.float32,
+        scaling=scaling,
+        allow_tf32=ALLOW_TF32,
+        x_grouped=x_grouped,
+        y_grouped=y_grouped,
+        INT64_INDICES=int64_indices,
+    )
+    return output
+
+
+@triton.jit
+def _compute_expert_block_lora_dX_mxfp4(
+    E_idx,
+    E_mask,
+    M_in_idx,
+    K_block,
+    K_mask,
+    # dY [M, N]
+    DY_ptr,
+    stride_dym,
+    stride_dyn,
+    # Packed MXFP4 W in FWD layout: [E, N, K/2] uint8 (block axis = K)
+    Wp_ptr,
+    stride_wpe,
+    stride_wpn,
+    stride_wpk,
+    # Scale [E, N, K/32]
+    Ws_ptr,
+    stride_wse,
+    stride_wsn,
+    stride_wsk,
+    # FP4 codebook
+    Codebook_ptr,
+    # LoRA
+    A_ptr,
+    stride_ar,
+    stride_ak,
+    B_ptr,
+    stride_bn,
+    stride_br,
+    N,
+    ACTUAL_R: tl.constexpr,
+    acc,
+    no_n_mask,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    BLOCK_R: tl.constexpr,
+    MX_BLOCK_SIZE: tl.constexpr,
+    scaling,
+    allow_tf32: tl.constexpr,
+):
+    """dX inner loop for MXFP4 base weights, FWD layout (block axis = K).
+
+    Computes ``acc += DY @ dequant(W_e)^T + scaling * (DY @ B_e) @ A_e`` for
+    the active token rows in this M-tile assigned to expert ``E_idx``.
+
+    W storage is the *forward* MX layout: packed ``[E, N, K/2]`` and scale
+    ``[E, N, K/32]`` — the same buffer used by the forward kernel, no
+    pre-transpose / re-quantize required. Per N-loop iter we load packed
+    ``[BLOCK_K, BLOCK_N]`` and scale ``[BLOCK_K, BLOCK_N]`` tiles indexed
+    K-as-row × N-as-col; nibbles are extracted along the K axis (the byte
+    is shared by adjacent K rows) and scales broadcast within their
+    ``MX_BLOCK_SIZE``-element K block.
+    """
+    N_block = tl.arange(0, BLOCK_N)
+    # K-axis decode tables (K-along-rows of the W tile)
+    K_byte_block = K_block // 2
+    K_is_high = (K_block % 2) == 1
+    K_scale_block = K_block // MX_BLOCK_SIZE
+    R_block = tl.arange(0, BLOCK_R)
+    R_mask = R_block < ACTUAL_R
+
+    INPUT_DTYPE = DY_ptr.dtype.element_ty
+
+    DY_blk_ptrs = (
+        DY_ptr + M_in_idx[:, None] * stride_dym + N_block[None, :] * stride_dyn
+    )
+    # Packed W in FWD layout [E, N, K/2]
+    # Tile shape [BLOCK_K, BLOCK_N]: row=K_byte (each byte loaded twice across
+    # adjacent K rows), col=N — note N is the *fast* axis here because
+    # stride_wpn (= K/2) is large, but the K row stride is 1.
+    Wp_blk_ptrs = (
+        Wp_ptr
+        + E_idx * stride_wpe
+        + N_block[None, :] * stride_wpn
+        + K_byte_block[:, None] * stride_wpk
+    )
+    # Scale [E, N, K/32]; row=K_scale_idx (broadcast within MX_BLOCK_SIZE), col=N
+    Ws_blk_ptrs = (
+        Ws_ptr
+        + E_idx * stride_wse
+        + N_block[None, :] * stride_wsn
+        + K_scale_block[:, None] * stride_wsk
+    )
+    B_expert_offset = E_idx * ACTUAL_R
+    B_blk_ptrs = (
+        B_ptr
+        + N_block[:, None] * stride_bn
+        + (B_expert_offset + R_block)[None, :] * stride_br
+    )
+
+    iters = tl.cdiv(N, BLOCK_N)
+    dy_b_acc = tl.zeros((BLOCK_M, BLOCK_R), dtype=tl.float32)
+
+    for i in range(iters):
+        if no_n_mask:
+            N_mask_iter = N_block >= 0  # all-true [BLOCK_N]
+        else:
+            N_mask_iter = (i * BLOCK_N + N_block) < N
+        dy_mask = E_mask[:, None] & N_mask_iter[None, :]
+        w_mask = K_mask[:, None] & N_mask_iter[None, :]
+        b_mask = N_mask_iter[:, None] & R_mask[None, :]
+
+        dy = tl.load(DY_blk_ptrs, mask=dy_mask, other=0.0).to(INPUT_DTYPE)
+
+        # MXFP4 dequant of W tile [BLOCK_K, BLOCK_N]
+        packed = tl.load(Wp_blk_ptrs, mask=w_mask, other=0).to(tl.int32)
+        nibble = tl.where(K_is_high[:, None], (packed >> 4) & 0xF, packed & 0xF)
+        codebook_val = tl.load(Codebook_ptr + nibble)
+        scale_byte = tl.load(Ws_blk_ptrs, mask=w_mask, other=0).to(tl.int32)
+        scale_fp = tl.exp2((scale_byte - 127).to(tl.float32))
+        w_dq = (codebook_val * scale_fp).to(INPUT_DTYPE)  # [BLOCK_K, BLOCK_N]
+
+        # Base: acc += DY @ W^T  ([M, N] @ [N, K] -> [M, K])
+        # W tile is [BLOCK_K, BLOCK_N]; W^T = tl.trans(w_dq) -> [BLOCK_N, BLOCK_K]
+        acc += tl.dot(dy, tl.trans(w_dq), allow_tf32=allow_tf32).to(tl.float32)
+
+        # LoRA: dy_b_acc += DY @ B
+        b = tl.load(B_blk_ptrs, mask=b_mask, other=0.0).to(INPUT_DTYPE)
+        dy_b_acc += tl.dot(dy, b, allow_tf32=allow_tf32).to(tl.float32)
+
+        DY_blk_ptrs += BLOCK_N * stride_dyn
+        Wp_blk_ptrs += BLOCK_N * stride_wpn
+        Ws_blk_ptrs += BLOCK_N * stride_wsn
+        B_blk_ptrs += BLOCK_N * stride_bn
+
+    # Epilogue: (DY @ B) @ A  ([M, R] @ [R, K] -> [M, K])
+    A_expert_offset = E_idx * ACTUAL_R
+    A_blk_ptrs = (
+        A_ptr
+        + (A_expert_offset + R_block)[:, None] * stride_ar
+        + K_block[None, :] * stride_ak
+    )
+    a_e = tl.load(A_blk_ptrs, mask=R_mask[:, None] & K_mask[None, :], other=0.0).to(
+        INPUT_DTYPE
+    )
+    lora_dx = tl.dot(dy_b_acc.to(INPUT_DTYPE), a_e, allow_tf32=allow_tf32)
+    acc += scaling * lora_dx
+    return acc
+
+
+def _scatter2scatter_lora_dX_mx_configs():
+    """dX MX kernel configs. BLOCK_K must be a multiple of MX_BLOCK_SIZE
+    because scales broadcast within MX_BLOCK_SIZE-element K-blocks."""
+    configs = []
+    for block_m, block_k, block_n, warps, stages in product(
+        [32, 64, 128],
+        [32, 64, 128],  # all multiples of MX_BLOCK_SIZE=32
+        [32, 64],
+        [4, 8],
+        [3, 4, 5],
+    ):
+        configs.append(
+            triton.Config(
+                {"BLOCK_M": block_m, "BLOCK_K": block_k, "BLOCK_N": block_n},
+                num_stages=stages,
+                num_warps=warps,
+            )
+        )
+    return configs
+
+
+def _prune_dX_mx_configs(configs, named_args, **kwargs):
+    """Prune dX MX configs by SMEM and register pressure (MX-aware)."""
+    smem_cap = _get_smem_capacity()
+    block_r = named_args.get("BLOCK_R", 64)
+
+    scored = []
+    for config in configs:
+        block_m = config.kwargs["BLOCK_M"]
+        block_k = config.kwargs["BLOCK_K"]
+        block_n = config.kwargs["BLOCK_N"]
+        if block_k % _MX_BLOCK_SIZE != 0:
+            continue
+        smem_base = _estimate_smem_usage(config.num_stages, block_m, block_k, block_n)
+        # Per stage: packed W tile [BLOCK_K, BLOCK_N] (bytes — each byte
+        # serves two K rows) and scale tile [BLOCK_K, BLOCK_N] (bytes, broadcast
+        # within each K-block of size MX_BLOCK_SIZE). Approximate to BLOCK_K *
+        # BLOCK_N bytes for each (overestimates SMEM slightly, conservative).
+        smem_packed = config.num_stages * block_k * block_n * 1
+        smem_scale = config.num_stages * block_k * block_n * 1
+        smem_lora_loop = config.num_stages * block_n * block_r * 2
+        smem_lora_epilogue = block_r * block_k * 2
+        smem = (
+            smem_base + smem_packed + smem_scale + smem_lora_loop + smem_lora_epilogue
+        )
+
+        est_regs = _estimate_register_pressure(
+            config.num_warps,
+            (block_m, block_k),
+            (block_m, block_r),
+            (block_m, block_n),
+            (block_k, block_n),
+            (block_n, block_r),
+            (block_r, block_k),
+        )
+        if est_regs > _MAX_REGS_SOFT_LIMIT:
+            continue
+
+        scored.append((smem, config))
+
+    pruned = [c for s, c in scored if s <= smem_cap - _SMEM_SLACK]
+    if pruned:
+        return pruned
+    if scored:
+        scored.sort(key=lambda x: x[0])
+        return [scored[0][1]]
+    return [
+        min(
+            configs,
+            key=lambda c: (
+                c.kwargs["BLOCK_M"] * c.kwargs["BLOCK_K"] * c.kwargs["BLOCK_N"]
+            ),
+        )
+    ]
+
+
+@triton.autotune(
+    configs=_scatter2scatter_lora_dX_mx_configs(),
+    key=["M_BUCKET", "N", "K"],
+    prune_configs_by={"early_config_prune": _prune_dX_mx_configs},
+)
+@triton.heuristics(
+    {
+        "NO_N_MASK": lambda args: (args["N"] % args["BLOCK_N"]) == 0,
+    }
+)
+@triton.jit
+def _scatter2scatter_lora_dX_mx(
+    DY_ptr,
+    stride_dym: tl.constexpr,
+    stride_dyn: tl.constexpr,
+    # Packed W in FWD layout [E, N, K/2]
+    Wp_ptr,
+    stride_wpe,
+    stride_wpn: tl.constexpr,
+    stride_wpk: tl.constexpr,
+    # Scale [E, N, K/32]
+    Ws_ptr,
+    stride_wse,
+    stride_wsn: tl.constexpr,
+    stride_wsk: tl.constexpr,
+    Codebook_ptr,
+    DX_ptr,
+    stride_dxm: tl.constexpr,
+    stride_dxk: tl.constexpr,
+    LA_ptr,
+    stride_la_r,
+    stride_la_k,
+    LB_ptr,
+    stride_lb_n,
+    stride_lb_r,
+    grouped_idx_ptr,
+    expert_idxs_ptr,
+    FAN_OUT: tl.constexpr,
+    M,
+    M_BUCKET,
+    K: tl.constexpr,
+    N: tl.constexpr,
+    E: tl.constexpr,
+    ACTUAL_R: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    BLOCK_R: tl.constexpr,
+    MX_BLOCK_SIZE: tl.constexpr,
+    ACC_TYPE: tl.constexpr,
+    scaling,
+    allow_tf32: tl.constexpr,
+    dy_grouped: tl.constexpr,
+    dx_grouped: tl.constexpr,
+    NO_N_MASK: tl.constexpr,
+    INT64_INDICES: tl.constexpr = False,
+):
+    """Fused MXFP4 dX kernel."""
+    pid = tl.program_id(axis=0)
+    K_BLOCK_COUNT = tl.cdiv(K, BLOCK_K)
+    M_block_id = pid // K_BLOCK_COUNT
+    K_block_id = pid % K_BLOCK_COUNT
+
+    M_block = M_block_id * BLOCK_M + tl.arange(0, BLOCK_M)
+    if INT64_INDICES:
+        M_block = M_block.to(tl.int64)
+    K_block = K_block_id * BLOCK_K + tl.arange(0, BLOCK_K)
+    K_mask = K_block < K
+    M_boundary_mask = M_block < (FAN_OUT * M)
+
+    E_idxs = tl.load(expert_idxs_ptr + M_block, mask=M_boundary_mask, other=E)
+    no_n_mask = NO_N_MASK
+    acc = tl.zeros((BLOCK_M, BLOCK_K), dtype=ACC_TYPE)
+
+    E_first_idx = tl.min(E_idxs)
+    E_last_idx = tl.minimum(tl.max(E_idxs), E - 1)
+    if INT64_INDICES:
+        M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int64)
+    else:
+        M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int32)
+
+    for E_idx in range(E_first_idx, E_last_idx + 1):
+        E_mask = E_idxs == E_idx
+        if dy_grouped:
+            M_in_idx = M_block
+        else:
+            M_in_idx = M_idx // FAN_OUT
+
+        acc = _compute_expert_block_lora_dX_mxfp4(
+            E_idx,
+            E_mask,
+            M_in_idx,
+            K_block,
+            K_mask,
+            DY_ptr,
+            stride_dym,
+            stride_dyn,
+            Wp_ptr,
+            stride_wpe,
+            stride_wpn,
+            stride_wpk,
+            Ws_ptr,
+            stride_wse,
+            stride_wsn,
+            stride_wsk,
+            Codebook_ptr,
+            LA_ptr,
+            stride_la_r,
+            stride_la_k,
+            LB_ptr,
+            stride_lb_n,
+            stride_lb_r,
+            N,
+            ACTUAL_R,
+            acc,
+            no_n_mask,
+            BLOCK_M,
+            BLOCK_N,
+            BLOCK_K,
+            BLOCK_R,
+            MX_BLOCK_SIZE,
+            scaling,
+            allow_tf32=allow_tf32,
+        )
+
+    if dx_grouped:
+        M_out_idx = M_block
+    else:
+        M_out_idx = M_idx
+    DX_blk_ptrs = DX_ptr + (
+        M_out_idx[:, None] * stride_dxm + K_block[None, :] * stride_dxk
+    )
+    tl.store(DX_blk_ptrs, acc, mask=M_boundary_mask[:, None] & K_mask[None, :])
+
+
+def scatter2scatter_lora_dX_mx(
+    DY: torch.Tensor,
+    W_mx,  # MXWeights in FWD layout (same buffer as forward)
+    sorted_expert_idxs: torch.Tensor,
+    sorted_scattered_idxs: torch.Tensor,
+    k: int,
+    lora_A: torch.Tensor,
+    lora_B: torch.Tensor,
+    scaling: float,
+    dy_grouped: bool = True,
+    dx_grouped: bool = False,
+    out: Optional[torch.Tensor] = None,
+    int64_indices: bool = False,
+) -> torch.Tensor:
+    """Backward-dX dispatcher for the fused MXFP4 kernel.
+
+    Reuses the *forward* MX layout (block axis = K). The kernel iterates the
+    N reduction in tiles, and for each (K_tile, N_tile) sub-tile, decodes
+    nibbles along the K rows of the tile (K_byte = K // 2) and broadcasts
+    scales within each ``MX_BLOCK_SIZE`` K-block. This avoids the
+    dequant + re-quantize "pre-transpose" round-trip and the extra MX
+    rounding error that would have introduced.
+    """
+    from axolotl.integrations.kernels.libs.scattermoe_lora.mx_weights import (
+        MXLayout,
+        fp4_codebook,
+    )
+
+    assert W_mx.layout == MXLayout.FWD, (
+        f"scatter2scatter_lora_dX_mx requires FWD layout, got {W_mx.layout}"
+    )
+    assert sorted_scattered_idxs.size(0) == sorted_expert_idxs.size(0)
+
+    K = W_mx.K
+    N = W_mx.N
+    E = W_mx.packed.size(0)
+    R = lora_A.size(0) // E
+    BLOCK_R = _block_r_for_rank(R)
+    L_scattered = sorted_expert_idxs.size(0)
+
+    if dy_grouped:
+        M = DY.size(0)
+        fan_out = 1
+    else:
+        M = DY.size(0)
+        fan_out = k
+
+    if out is None:
+        output = torch.empty((L_scattered, K), device=DY.device, dtype=DY.dtype)
+    else:
+        assert out.size(0) == L_scattered and out.size(1) == K
+        output = out
+
+    def grid(META):
+        return (
+            triton.cdiv(L_scattered, META["BLOCK_M"]) * triton.cdiv(K, META["BLOCK_K"]),
+        )
+
+    codebook = fp4_codebook(DY.device)
+
+    _scatter2scatter_lora_dX_mx[grid](
+        DY,
+        DY.stride(0),
+        DY.stride(1),
+        W_mx.packed,
+        W_mx.packed.stride(0),
+        W_mx.packed.stride(1),
+        W_mx.packed.stride(2),
+        W_mx.scales,
+        W_mx.scales.stride(0),
+        W_mx.scales.stride(1),
+        W_mx.scales.stride(2),
+        codebook,
+        output,
+        output.stride(0),
+        output.stride(1),
+        lora_A,
+        lora_A.stride(0),
+        lora_A.stride(1),
+        lora_B,
+        lora_B.stride(0),
+        lora_B.stride(1),
+        sorted_scattered_idxs,
+        sorted_expert_idxs,
+        FAN_OUT=fan_out,
+        M=M,
+        M_BUCKET=_bucket_m(M),
+        K=K,
+        N=N,
+        E=E,
+        ACTUAL_R=R,
+        BLOCK_R=BLOCK_R,
+        MX_BLOCK_SIZE=_MX_BLOCK_SIZE,
+        ACC_TYPE=tl.float32,
+        scaling=scaling,
+        allow_tf32=ALLOW_TF32,
+        dy_grouped=dy_grouped,
+        dx_grouped=dx_grouped,
+        INT64_INDICES=int64_indices,
+    )
+    return output
diff --git a/src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/ops.py b/src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/ops.py
index 6aa432770d..08baa8e8e3 100644
--- a/src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/ops.py
+++ b/src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/ops.py
@@ -108,6 +108,7 @@ def _scatter2scatter(
     y_grouped: tl.constexpr,
     NO_K_MASK: tl.constexpr,
     NO_N_MASK: tl.constexpr,
+    INT64_INDICES: tl.constexpr = False,
 ):
     pid = tl.program_id(axis=0)
 
@@ -116,6 +117,8 @@ def _scatter2scatter(
     N_block_id = pid % N_BLOCK_COUNT
 
     M_block = M_block_id * BLOCK_M + tl.arange(0, BLOCK_M)
+    if INT64_INDICES:
+        M_block = M_block.to(tl.int64)
     N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
     N_mask = N_block < N
     M_boundary_mask = M_block < (FAN_OUT * M)
@@ -126,7 +129,10 @@ def _scatter2scatter(
     acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
     E_first_idx = tl.min(E_idxs)
     E_last_idx = tl.minimum(tl.max(E_idxs), E - 1)
-    M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int32)
+    if INT64_INDICES:
+        M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int64)
+    else:
+        M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int32)
     for E_idx in range(E_first_idx, E_last_idx + 1):
         E_mask = E_idxs == E_idx
         E_M_idx = M_idx
@@ -176,6 +182,7 @@ def scatter2scatter(
     x_grouped=False,
     y_grouped=False,
     out=None,
+    int64_indices=False,
 ):
     assert sorted_scattered_idxs.size(0) == sorted_expert_idxs.size(0)
     assert sorted_scattered_idxs.size(0) == X.size(0) * k
@@ -198,6 +205,7 @@ def scatter2scatter(
         b,
         x_grouped,
         y_grouped,
+        int64_indices,
     )
     return output
 
@@ -213,6 +221,7 @@ def scatter2scatter_compileable(
     b: Optional[torch.Tensor],
     x_grouped: bool,
     y_grouped: bool,
+    int64_indices: bool = False,
 ) -> None:
     def grid(META):
         grid_num = (
@@ -258,6 +267,7 @@ def grid(META):
         allow_tf32=ALLOW_TF32,
         x_grouped=x_grouped,
         y_grouped=y_grouped,
+        INT64_INDICES=int64_indices,
     )
 
 
diff --git a/src/axolotl/integrations/kernels/libs/scattermoe_lora/layers.py b/src/axolotl/integrations/kernels/libs/scattermoe_lora/layers.py
index 8fd10c8e95..de3f57af62 100644
--- a/src/axolotl/integrations/kernels/libs/scattermoe_lora/layers.py
+++ b/src/axolotl/integrations/kernels/libs/scattermoe_lora/layers.py
@@ -37,6 +37,7 @@
 
 from .parallel_experts import flatten_sort_count, parallel_linear
 from .parallel_linear_lora import get_lora_params_from_wrapper, parallel_linear_lora
+from .selective_dequant import is_mxfp4_param
 
 # =============================================================================
 # LoRA layout conversion utilities (peft <-> scattermoe)
@@ -457,14 +458,20 @@ def forward(self: nn.Module, layer_input: torch.Tensor):
         # ====================================================================
         # Selective expert weight dequantization
         # ====================================================================
-        # When experts are BnB-quantized (quantize_moe_experts), dequantize
-        # only the active experts instead of all E. This saves ~97% memory
-        # for the transient dequant buffer when few experts are active.
-        use_selective = (
-            getattr(self, "_use_selective_dequant", False)
-            and hasattr(experts, "parametrizations")
+        # When experts are BnB-quantized (quantize_moe_experts) or MXFP4
+        # (torchao MXTensor), dequantize only the active experts instead of
+        # all E. This saves ~97% memory for the transient dequant buffer when
+        # few experts are active. MXFP4 always routes through selective
+        # dequant because the kernel needs bf16 weights and full-tensor
+        # dequant of 256-expert MX params is prohibitive.
+        has_bnb_param = (
+            hasattr(experts, "parametrizations")
             and "gate_up_proj" in experts.parametrizations
         )
+        has_mxfp4_param = is_mxfp4_param(getattr(experts, "gate_up_proj", None))
+        use_selective = (
+            getattr(self, "_use_selective_dequant", False) and has_bnb_param
+        ) or has_mxfp4_param
 
         if use_selective:
             from axolotl.integrations.kernels.libs.scattermoe_lora.selective_dequant import (
diff --git a/src/axolotl/integrations/kernels/libs/scattermoe_lora/mx_weights.py b/src/axolotl/integrations/kernels/libs/scattermoe_lora/mx_weights.py
new file mode 100644
index 0000000000..d3916c54bb
--- /dev/null
+++ b/src/axolotl/integrations/kernels/libs/scattermoe_lora/mx_weights.py
@@ -0,0 +1,221 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
+
+"""
+MXFP4 expert weight container + helpers for the fused-dequant Triton kernels.
+
+The container carries the packed uint8 ``[E_active, N, K/2]`` data and the
+E8M0 ``[E_active, N, K/32]`` scales for the *active* experts of one MoE
+step. ``parallel_linear_lora`` checks for this container instance and
+routes to the MX-aware Triton kernels.
+
+Layout: OCP block axis is the contraction axis ``K`` — the last storage
+dim. The same buffer is consumed by both the forward kernel (K is the
+matmul reduction axis) and the dX kernel (K is the output axis, with
+scales broadcast within ``MX_BLOCK_SIZE``-element K blocks). No
+pre-transpose / re-quantize is needed for the backward path.
+
+The FP4 E2M1 codebook is the standard OCP-MX one (16 values:
+``±{0, 0.5, 1, 1.5, 2, 3, 4, 6}``); we cache one fp32 copy per CUDA device.
+"""
+
+from __future__ import annotations
+
+import enum
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+
+MX_BLOCK_SIZE = 32
+
+# Standard OCP-MX fp4 e2m1 codebook (sign bit | 2-bit exp | 1-bit mantissa).
+# Index by the raw 4-bit nibble. Cached fp32 tensor for kernel lookups.
+_FP4_E2M1_LUT = (
+    0.0,
+    0.5,
+    1.0,
+    1.5,
+    2.0,
+    3.0,
+    4.0,
+    6.0,
+    -0.0,
+    -0.5,
+    -1.0,
+    -1.5,
+    -2.0,
+    -3.0,
+    -4.0,
+    -6.0,
+)
+
+_LUT_CACHE: dict[torch.device, torch.Tensor] = {}
+
+
+def fp4_codebook(device: torch.device) -> torch.Tensor:
+    """Return the cached 16-entry FP4 E2M1 → fp32 lookup on ``device``."""
+    key = device
+    lut = _LUT_CACHE.get(key)
+    if lut is None or lut.device != device:
+        lut = torch.tensor(_FP4_E2M1_LUT, dtype=torch.float32, device=device)
+        _LUT_CACHE[key] = lut
+    return lut
+
+
+class MXLayout(enum.IntEnum):
+    """Which axis the OCP-MX block scaling runs along, in *kernel* coords.
+
+    Currently only ``FWD`` (block axis = K) is supported and used by both
+    the forward and dX kernels. The enum is kept as a future extension
+    point for swizzled or N-axis-blocked variants.
+    """
+
+    FWD = 0
+
+
+@dataclass
+class MXWeights:
+    """Packed + scale tensors for one MoE projection's active experts.
+
+    Attributes
+    ----------
+    packed:
+        ``uint8`` tensor, shape ``[E_active, N, K/2]``.
+    scales:
+        ``uint8`` (E8M0) tensor, shape ``[E_active, N, K/32]``.
+    K, N:
+        Logical contraction/output dimensions of the dequantized W.
+    layout:
+        Which axis the block scaling runs along (see ``MXLayout``).
+    block_size:
+        OCP MX block size; only ``32`` is supported by the kernels.
+    """
+
+    packed: torch.Tensor
+    scales: torch.Tensor
+    K: int
+    N: int
+    layout: MXLayout = MXLayout.FWD
+    block_size: int = MX_BLOCK_SIZE
+    num_experts: Optional[int] = None  # E_active; convenience field
+    orig_dtype: torch.dtype = torch.bfloat16
+
+    def __post_init__(self) -> None:
+        assert self.block_size == MX_BLOCK_SIZE, (
+            f"only block_size={MX_BLOCK_SIZE} is supported, got {self.block_size}"
+        )
+        # scales are E8M0 (float8_e8m0fnu) in torchao; viewed as uint8 here so
+        # the Triton kernel can load them with simple integer arithmetic.
+        if self.scales.dtype != torch.uint8:
+            self.scales = self.scales.view(torch.uint8)
+        assert self.packed.dtype == torch.uint8, (
+            f"packed must be uint8, got {self.packed.dtype}"
+        )
+        if self.num_experts is None:
+            self.num_experts = self.packed.size(0)
+
+    @property
+    def device(self) -> torch.device:
+        return self.packed.device
+
+
+def _torchao_mxtensor_cls():
+    """Return the torchao MXTensor class, or ``None`` if torchao is missing."""
+    try:
+        from torchao.prototype.mx_formats.mx_tensor import MXTensor
+    except ImportError:
+        return None
+    return MXTensor
+
+
+def _mx_qdata(mx) -> torch.Tensor:
+    """Read the packed-nibble buffer off an MXTensor, tolerating torchao
+    renaming the attribute between versions."""
+    qdata = getattr(mx, "qdata", None)
+    if qdata is None:
+        qdata = getattr(mx, "_data", None)
+    if qdata is None:
+        raise AttributeError(
+            "torchao MXTensor exposes neither .qdata nor ._data; "
+            "this torchao version is unsupported."
+        )
+    return qdata
+
+
+def _mx_scale(mx) -> torch.Tensor:
+    """Read the E8M0 scale buffer off an MXTensor, tolerating torchao
+    renaming the attribute between versions."""
+    scale = getattr(mx, "scale", None)
+    if scale is None:
+        scale = getattr(mx, "_scale_e8m0", None)
+    if scale is None:
+        raise AttributeError(
+            "torchao MXTensor exposes neither .scale nor ._scale_e8m0; "
+            "this torchao version is unsupported."
+        )
+    return scale
+
+
+def _construct_mxtensor_subset(
+    parent, qdata_slice: torch.Tensor, scale_slice: torch.Tensor
+):
+    """Construct a new MXTensor that shares ``parent``'s metadata but uses
+    the provided ``qdata_slice`` / ``scale_slice`` buffers.
+
+    Pinned to torchao 0.17.0's positional constructor (qdata, scale,
+    elem_dtype, block_size, orig_dtype, kernel_preference,
+    act_quant_kwargs, is_swizzled_scales). Optional attributes are read via
+    ``getattr`` so we degrade gracefully if a future torchao version drops
+    or renames one — the single point of pain for torchao internals access
+    across this codebase.
+    """
+    MXTensor = _torchao_mxtensor_cls()
+    if MXTensor is None:
+        raise ImportError("MXFP4 path requires torchao (install `torchao>=0.7`).")
+    kernel_preference = getattr(parent, "kernel_preference", None)
+    act_quant_kwargs = getattr(parent, "act_quant_kwargs", None)
+    is_swizzled_scales = getattr(parent, "is_swizzled_scales", False)
+    return MXTensor(
+        qdata_slice,
+        scale_slice,
+        parent.elem_dtype,
+        parent.block_size,
+        parent.orig_dtype,
+        kernel_preference,
+        act_quant_kwargs,
+        is_swizzled_scales,
+    )
+
+
+def selective_mx_weights_fwd(mx_param, active_experts: torch.Tensor) -> MXWeights:
+    """Slice an MXFP4 expert parameter to the active set, keeping the K-axis
+    block layout (FWD). The returned ``MXWeights.packed`` has shape
+    ``[num_active, N, K/2]`` and is directly consumable by the forward MX
+    kernel via ``parallel_linear_lora``."""
+    MXTensor = _torchao_mxtensor_cls()
+    if MXTensor is None:
+        raise ImportError("MXFP4 fused path requires torchao>=0.7 (install `torchao`).")
+    assert isinstance(mx_param, MXTensor), (
+        f"selective_mx_weights_fwd expects an MXTensor, got {type(mx_param)}"
+    )
+    assert mx_param.elem_dtype == torch.float4_e2m1fn_x2, (
+        "only MXFP4 (float4_e2m1fn_x2) is supported"
+    )
+    sub_qdata = _mx_qdata(mx_param)[active_experts].contiguous()
+    sub_scale = _mx_scale(mx_param)[active_experts].contiguous()
+    # Logical dims (kernel's K, N): the contraction axis is K, the OCP block
+    # axis is the LAST storage axis (= K). N is the leading non-expert axis.
+    N = sub_qdata.size(1)
+    K = sub_qdata.size(2) * 2
+    return MXWeights(
+        packed=sub_qdata,
+        scales=sub_scale,
+        K=K,
+        N=N,
+        layout=MXLayout.FWD,
+        block_size=mx_param.block_size,
+        num_experts=sub_qdata.size(0),
+        orig_dtype=mx_param.orig_dtype,
+    )
diff --git a/src/axolotl/integrations/kernels/libs/scattermoe_lora/parallel_experts.py b/src/axolotl/integrations/kernels/libs/scattermoe_lora/parallel_experts.py
index 5180587aad..eae63fb8ee 100644
--- a/src/axolotl/integrations/kernels/libs/scattermoe_lora/parallel_experts.py
+++ b/src/axolotl/integrations/kernels/libs/scattermoe_lora/parallel_experts.py
@@ -11,6 +11,34 @@
 
 from . import kernels
 
+# When the maximum addressable element offset across any input/output buffer
+# exceeds INT_MAX, the Triton kernel's int32 pointer-offset arithmetic
+# overflows. ``_needs_int64_indices`` returns True iff any tensor has
+# ``numel() >= INT_MAX``, which is a sufficient condition for the
+# ``M_idx * stride_*m`` product to overflow somewhere in the kernel. When
+# True, callers pass ``INT64_INDICES=True`` to the kernel so the index range
+# is cast to int64 before it enters the multiplication. Strides themselves
+# are already int64 at the Python level (from ``tensor.stride()``); only
+# the *index* type needs the bump.
+#
+# The threshold here matches the kernel's correctness boundary: as soon as
+# any indexed buffer has 2**31 - 1 or more elements, the int32 multiply at
+# the end of the buffer can overflow. The wrapper used to chunk the call to
+# keep ``rows * y_dim`` below 2**31; with INT64_INDICES the kernel itself
+# handles overflow, so the auto-dispatch routes directly to a single
+# kernel launch in either mode.
+_INT_MAX = 2**31 - 1
+
+
+def _needs_int64_indices(*tensors) -> bool:
+    """True iff any input/output tensor's element count exceeds INT_MAX."""
+    for t in tensors:
+        if t is None or not isinstance(t, torch.Tensor):
+            continue
+        if t.numel() >= _INT_MAX:
+            return True
+    return False
+
 
 @torch.library.custom_op("scattermoe::bincount", mutates_args={})
 def compileable_bincount(x: torch.Tensor, minlength: int) -> torch.Tensor:
@@ -54,6 +82,12 @@ def forward(
             expert_weights = expert_weights.to(x.dtype)
         if expert_biases is not None and expert_biases.dtype != x.dtype:
             expert_biases = expert_biases.to(x.dtype)
+        L_scattered = sorted_expert_idxs.size(0)
+        y_dim = expert_weights.size(-1)
+        # Cheap probe: the kernel's overflow risk is the M_block * stride_ym
+        # product, dominated by the output buffer L_scattered * y_dim. We also
+        # check x because the X_ptr arithmetic uses similar indices.
+        needs_int64_fwd = (L_scattered * y_dim) >= _INT_MAX or _needs_int64_indices(x)
         with torch.device(x.device):
             output = kernels.ops.scatter2scatter(
                 X=x,
@@ -64,6 +98,7 @@ def forward(
                 sorted_scattered_idxs=sorted_scattered_idxs,
                 x_grouped=grouped_in,
                 y_grouped=grouped_out,
+                int64_indices=needs_int64_fwd,
             )
             if gates is not None:
                 output_expanded = output.view(
@@ -145,6 +180,11 @@ def backward(ctx, grad_out: torch.Tensor):
                 has_bias=expert_biases is not None,
             )
 
+            L_scattered = sorted_expert_idxs.size(0)
+            dx_dim = expert_weights.size(1)  # K dim of W = output dim for dX
+            needs_int64_bwd = (
+                L_scattered * dx_dim
+            ) >= _INT_MAX or _needs_int64_indices(grouped_grad_out)
             d_expanded_input = kernels.ops.scatter2scatter(
                 X=grouped_grad_out,
                 x_grouped=True,
@@ -154,6 +194,7 @@ def backward(ctx, grad_out: torch.Tensor):
                 k=1,
                 y_grouped=grouped_in,
                 out=d_expanded_input,  # Reuse grouped_x buffer
+                int64_indices=needs_int64_bwd,
             )
 
             if k == 1:
diff --git a/src/axolotl/integrations/kernels/libs/scattermoe_lora/parallel_linear_lora.py b/src/axolotl/integrations/kernels/libs/scattermoe_lora/parallel_linear_lora.py
index 17dfd420c0..7bc70280a4 100644
--- a/src/axolotl/integrations/kernels/libs/scattermoe_lora/parallel_linear_lora.py
+++ b/src/axolotl/integrations/kernels/libs/scattermoe_lora/parallel_linear_lora.py
@@ -23,7 +23,7 @@
   dB = scaling * dY^T @ (X @ A^T)                   (per-expert, on grouped data)
 """
 
-from typing import Optional
+from typing import Optional, Union
 
 import torch
 
@@ -33,7 +33,11 @@
     group_bwd_lora_fused,
     scatter2scatter_lora,
     scatter2scatter_lora_dX,
+    scatter2scatter_lora_dX_mx,
+    scatter2scatter_lora_mx,
 )
+from .mx_weights import MXLayout, MXWeights
+from .parallel_experts import _INT_MAX, _needs_int64_indices
 
 
 class ScatterMoELoRA(torch.autograd.Function):
@@ -50,7 +54,7 @@ class ScatterMoELoRA(torch.autograd.Function):
     def forward(
         ctx,
         x: torch.Tensor,
-        expert_weights: torch.Tensor,
+        expert_weights: Union[torch.Tensor, MXWeights],
         k: int,
         sorted_expert_idxs: torch.Tensor,
         sorted_scattered_idxs: torch.Tensor,
@@ -65,26 +69,60 @@ def forward(
         use_fused_dX: bool = False,
         use_fused_gather: bool = False,
     ):
-        # Cast weights to match input dtype (e.g. 8-bit LoRA)
-        if expert_weights.dtype != x.dtype:
-            expert_weights = expert_weights.to(x.dtype)
+        if isinstance(expert_weights, MXWeights):
+            assert expert_weights.layout == MXLayout.FWD, (
+                "MXWeights passed to forward must be in FWD layout"
+            )
+            is_mx = True
+        else:
+            # Cast weights to match input dtype (e.g. 8-bit LoRA)
+            if expert_weights.dtype != x.dtype:
+                expert_weights = expert_weights.to(x.dtype)
+            is_mx = False
         if expert_biases is not None and expert_biases.dtype != x.dtype:
             expert_biases = expert_biases.to(x.dtype)
+        L_scattered = sorted_expert_idxs.size(0)
+        if is_mx:
+            N_dim = expert_weights.N  # type: ignore[union-attr]
+        else:
+            N_dim = expert_weights.size(-1)  # type: ignore[union-attr]
+        # Forward output is [L_scattered, N]. Overflow risk is dominated by
+        # that buffer; also probe X for the unusual case where it alone is
+        # huge (e.g. very wide hidden with modest seq).
+        needs_int64_fwd = (L_scattered * N_dim) >= _INT_MAX or _needs_int64_indices(x)
         with torch.device(x.device):
-            # Fused forward: Y = X @ W + scaling * (X @ A^T) @ B^T
-            output = scatter2scatter_lora(
-                X=x,
-                W=expert_weights,
-                sorted_expert_idxs=sorted_expert_idxs,
-                sorted_scattered_idxs=sorted_scattered_idxs,
-                k=k,
-                lora_A=lora_A,
-                lora_B=lora_B,
-                scaling=scaling,
-                b=expert_biases,
-                x_grouped=grouped_in,
-                y_grouped=grouped_out,
-            )
+            if is_mx:
+                # Fused MXFP4 forward: dequant happens inside the K-loop
+                output = scatter2scatter_lora_mx(
+                    X=x,
+                    W_mx=expert_weights,
+                    sorted_expert_idxs=sorted_expert_idxs,
+                    sorted_scattered_idxs=sorted_scattered_idxs,
+                    k=k,
+                    lora_A=lora_A,
+                    lora_B=lora_B,
+                    scaling=scaling,
+                    b=expert_biases,
+                    x_grouped=grouped_in,
+                    y_grouped=grouped_out,
+                    int64_indices=needs_int64_fwd,
+                )
+            else:
+                # Fused forward: Y = X @ W + scaling * (X @ A^T) @ B^T
+                output = scatter2scatter_lora(
+                    X=x,
+                    W=expert_weights,
+                    sorted_expert_idxs=sorted_expert_idxs,
+                    sorted_scattered_idxs=sorted_scattered_idxs,
+                    k=k,
+                    lora_A=lora_A,
+                    lora_B=lora_B,
+                    scaling=scaling,
+                    b=expert_biases,
+                    x_grouped=grouped_in,
+                    y_grouped=grouped_out,
+                    int64_indices=needs_int64_fwd,
+                )
 
             # Handle gating (weighted combination of top-k expert outputs)
             if gates is not None:
@@ -117,8 +155,12 @@ def forward(
             ctx.grouped_out = grouped_out
             ctx.k = k
             ctx.scaling = scaling
-            ctx.use_fused_dX = use_fused_dX
-            ctx.use_fused_gather = use_fused_gather
+            # MXFP4 forces fused dX + gather: the non-fused dX path would have
+            # to materialise a bf16 weight tile, defeating the kernel-fusion
+            # win, and the gather/scatter pattern is identical.
+            ctx.use_fused_dX = True if is_mx else use_fused_dX
+            ctx.use_fused_gather = True if is_mx else use_fused_gather
+            ctx.is_mx = is_mx
 
         return output
 
@@ -141,7 +183,11 @@ def backward(ctx, grad_out: torch.Tensor):
             scaling = ctx.scaling
             grouped_in = ctx.grouped_in
             grouped_out = ctx.grouped_out
-            E = expert_weights.size(0)
+            is_mx = ctx.is_mx
+            if is_mx:
+                E = expert_weights.packed.size(0)
+            else:
+                E = expert_weights.size(0)
 
             # ------------------------------------------------------------------
             # Gate gradients (if using top-k gating with routing weights)
@@ -171,7 +217,7 @@ def backward(ctx, grad_out: torch.Tensor):
             #     -> use dy_grouped=True in the fused kernel
             M_total = sorted_scattered_idxs.size(0)
             K_dim = x.size(-1)
-            N_dim = expert_weights.size(-1)
+            N_dim = expert_weights.N if is_mx else expert_weights.size(-1)
             fuse_gather_workload = M_total * max(K_dim, N_dim)
             _FUSE_GATHER_THRESHOLD = 2**24  # ~16M elements
 
@@ -182,6 +228,15 @@ def backward(ctx, grad_out: torch.Tensor):
                 and fuse_gather_workload < _FUSE_GATHER_THRESHOLD
             )
 
+            # The backward path indexes into grad_out [M_total, N] and x [M, K]
+            # using either M_idx (grouped) or scatter_idx (ungrouped). Overflow
+            # risk is dominated by the largest indexed buffer along the M axis.
+            needs_int64_bwd = (
+                (M_total * N_dim) >= _INT_MAX
+                or (M_total * K_dim) >= _INT_MAX
+                or _needs_int64_indices(grad_out, x)
+            )
+
             if can_fuse_gather:
                 # ------------------------------------------------------------------
                 # Fused path: skip group(x) entirely
@@ -199,6 +254,7 @@ def backward(ctx, grad_out: torch.Tensor):
                     k=k,
                     scaling=scaling,
                     dy_grouped=grouped_out,
+                    int64_indices=needs_int64_bwd,
                 )
 
                 # Prepare grouped_grad_out for the dX path (needed by both
@@ -243,12 +299,46 @@ def backward(ctx, grad_out: torch.Tensor):
                     expert_offsets=expert_offsets,
                     E=E,
                     scaling=scaling,
+                    int64_indices=needs_int64_bwd,
                 )
 
             # ------------------------------------------------------------------
             # Input gradient: dX = dY @ W^T + scaling * (dY @ B) @ A
             # ------------------------------------------------------------------
-            if ctx.use_fused_dX:
+            if is_mx:
+                # dX kernel reuses the forward MX layout (block axis = K) —
+                # no pre-transpose/re-quantize needed.
+                if can_fuse_gather and not grouped_out:
+                    d_expanded_input = scatter2scatter_lora_dX_mx(
+                        DY=grad_out,
+                        W_mx=expert_weights,
+                        sorted_expert_idxs=sorted_expert_idxs,
+                        sorted_scattered_idxs=sorted_scattered_idxs,
+                        k=1,
+                        lora_A=lora_A,
+                        lora_B=lora_B,
+                        scaling=scaling,
+                        dy_grouped=False,
+                        dx_grouped=grouped_in,
+                        out=d_expanded_input,
+                        int64_indices=needs_int64_bwd,
+                    )
+                else:
+                    d_expanded_input = scatter2scatter_lora_dX_mx(
+                        DY=grouped_grad_out,
+                        W_mx=expert_weights,
+                        sorted_expert_idxs=sorted_expert_idxs,
+                        sorted_scattered_idxs=sorted_scattered_idxs,
+                        k=1,
+                        lora_A=lora_A,
+                        lora_B=lora_B,
+                        scaling=scaling,
+                        dy_grouped=True,
+                        dx_grouped=grouped_in,
+                        out=d_expanded_input,
+                        int64_indices=needs_int64_bwd,
+                    )
+            elif ctx.use_fused_dX:
                 if can_fuse_gather and not grouped_out:
                     # Fully fused: read ungrouped DY via scatter pattern
                     d_expanded_input = scatter2scatter_lora_dX(
@@ -263,6 +353,7 @@ def backward(ctx, grad_out: torch.Tensor):
                         dy_grouped=False,
                         dx_grouped=grouped_in,
                         out=d_expanded_input,
+                        int64_indices=needs_int64_bwd,
                     )
                 else:
                     # Fused dX only: read from pre-grouped DY
@@ -278,6 +369,7 @@ def backward(ctx, grad_out: torch.Tensor):
                         dy_grouped=True,
                         dx_grouped=grouped_in,
                         out=d_expanded_input,
+                        int64_indices=needs_int64_bwd,
                     )
             else:
                 # Original path: separate base scatter2scatter + LoRA Python loop
@@ -290,6 +382,7 @@ def backward(ctx, grad_out: torch.Tensor):
                     k=1,
                     y_grouped=grouped_in,
                     out=d_expanded_input,
+                    int64_indices=needs_int64_bwd,
                 )
 
                 # LoRA part: dX_lora = scaling * (dY @ B) @ A
@@ -317,12 +410,16 @@ def backward(ctx, grad_out: torch.Tensor):
                     x.size(0), k, d_expanded_input.size(-1)
                 ).sum(-2)
 
-            # W is frozen during LoRA training -- skip weight gradient
-            d_weights = (
-                torch.zeros_like(expert_weights)
-                if expert_weights.requires_grad
-                else None
-            )
+            # W is frozen during LoRA training -- skip weight gradient.
+            # (MX weights are containers, not tensors, and never carry grad.)
+            if is_mx:
+                d_weights = None
+            else:
+                d_weights = (
+                    torch.zeros_like(expert_weights)
+                    if expert_weights.requires_grad
+                    else None
+                )
             d_biases = None
 
         return (
diff --git a/src/axolotl/integrations/kernels/libs/scattermoe_lora/selective_dequant.py b/src/axolotl/integrations/kernels/libs/scattermoe_lora/selective_dequant.py
index 1df8b2f684..6398d1faa3 100644
--- a/src/axolotl/integrations/kernels/libs/scattermoe_lora/selective_dequant.py
+++ b/src/axolotl/integrations/kernels/libs/scattermoe_lora/selective_dequant.py
@@ -13,6 +13,8 @@
 
 This module provides format-agnostic selective weight extraction:
   - BnB 4-bit (nf4/fp4): slice quantized data + absmax per expert
+  - MXFP4 (torchao MXTensor with elem_dtype=float4_e2m1fn_x2): slice
+    qdata + E8M0 scale per expert and dequantize via torchao
   - bf16/fp32: direct indexing (no dequant needed)
   - FP8: slice + cast
 
@@ -24,6 +26,21 @@
 import torch
 import torch.nn as nn
 
+from .mx_weights import (
+    _construct_mxtensor_subset,
+    _mx_qdata,
+    _mx_scale,
+    _torchao_mxtensor_cls,
+)
+
+
+def is_mxfp4_param(param) -> bool:
+    """True iff ``param`` is a torchao MXTensor with MXFP4 element dtype."""
+    MXTensor = _torchao_mxtensor_cls()
+    if MXTensor is None or not isinstance(param, MXTensor):
+        return False
+    return param.elem_dtype == torch.float4_e2m1fn_x2
+
 
 def get_active_experts(sorted_expert_idxs: torch.Tensor, E: int) -> torch.Tensor:
     """Get sorted unique expert indices from the routing output.
@@ -175,6 +192,42 @@ def _selective_dequant_bnb4(
     return deq.reshape(num_active, *expert_shape)
 
 
+def _selective_dequant_mxfp4(
+    mx_param,
+    active_experts: torch.Tensor,
+    out_dtype: torch.dtype = torch.bfloat16,
+) -> torch.Tensor:
+    """Selectively dequantize active experts from a torchao MXFP4 ``MXTensor``.
+
+    Layout assumption: the MXTensor's last axis is the OCP MX block axis.
+    For ScatterMoE experts this matches the natural storage where
+    ``experts.gate_up_proj``/``down_proj`` is ``[E, dim1, dim2]`` and
+    ``dim2`` is the contraction axis post ``.transpose(2, 1)`` performed by
+    the caller. Indexing ``[active_experts]`` on the qdata and scale yields
+    a compact MX tensor that we dequantize via torchao.
+
+    Args:
+        mx_param: ``torchao.prototype.mx_formats.mx_tensor.MXTensor`` of
+            logical shape ``[E, dim1, dim2]`` with ``elem_dtype=float4_e2m1fn_x2``.
+        active_experts: ``[num_active]`` sorted unique expert indices.
+        out_dtype: dtype of the dequantized buffer (default ``bfloat16``).
+
+    Returns:
+        Dequantized bf16/fp16 tensor of shape ``[num_active, dim1, dim2]``.
+    """
+    if _torchao_mxtensor_cls() is None:
+        raise ImportError(
+            "MXFP4 expert dequantization requires torchao>=0.7 "
+            "(install with `pip install torchao`)."
+        )
+
+    sub_qdata = _mx_qdata(mx_param)[active_experts].contiguous()
+    sub_scale = _mx_scale(mx_param)[active_experts].contiguous()
+
+    sub_mx = _construct_mxtensor_subset(mx_param, sub_qdata, sub_scale)
+    return sub_mx.dequantize(out_dtype)
+
+
 def _selective_index_dense(
     param: torch.Tensor,
     active_experts: torch.Tensor,
@@ -243,8 +296,14 @@ def selective_expert_weights(
 
             return _selective_dequant_bnb4(raw_param, qs, active_experts, expert_shape)
 
-    # Dense parameter (bf16/fp32) — direct indexing
+    # Pull the parameter out before format dispatch — used by every branch below.
     param = getattr(experts_module, param_name)
+
+    # MXFP4 (torchao MXTensor) — dequantize the subset, return [num_active, d1, d2]
+    if is_mxfp4_param(param):
+        return _selective_dequant_mxfp4(param, active_experts)
+
+    # Dense parameter (bf16/fp32) — direct indexing
     if param.dim() == 3:
         return param[active_experts]
 
@@ -252,6 +311,82 @@ def selective_expert_weights(
     return param
 
 
+def shared_dequant_across_shards(
+    experts_module: nn.Module,
+    param_name: str,
+    sei_per_shard: list[torch.Tensor],
+    E: int,
+) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]:
+    """Dequantize the union of active experts across N shards exactly once.
+
+    The orthogonal Strategy A path calls :func:`selective_expert_weights`
+    once per shard, which re-dequantizes the active experts redundantly
+    when the active-expert sets overlap. For seq-dim sharding with a
+    softmax-routed MoE, that overlap is the common case.
+
+    This helper hoists the dequant: it computes the union of active
+    experts across all shards, calls :func:`selective_expert_weights`
+    once on the union, and returns per-shard index tables that map each
+    shard's local active experts into rows of the union buffer.
+
+    Parameters
+    ----------
+    experts_module:
+        The base experts module (e.g. ``OlmoeExperts``). Same object the
+        per-shard path would pass to :func:`selective_expert_weights`.
+    param_name:
+        ``"gate_up_proj"`` or ``"down_proj"``.
+    sei_per_shard:
+        List of ``sorted_expert_idxs`` tensors, one per shard.
+    E:
+        Total number of experts.
+
+    Returns
+    -------
+    union_active:
+        ``[U]`` sorted unique expert ids across all shards.
+    union_buffer:
+        Dequantized weights for ``union_active``,
+        ``[U, dim1, dim2]`` in the param's natural storage dtype
+        (typically bf16). Same buffer each shard's call would have built
+        had it dequantized only its own active set, just shared.
+    shard_into_union:
+        List of length ``len(sei_per_shard)``. Entry ``i`` is a 1-D
+        ``long`` tensor that indexes ``union_buffer`` along dim 0 to
+        produce the same ``[num_active_i, dim1, dim2]`` slice the
+        per-shard path would have produced. Callers feed this through
+        ``union_buffer.index_select(0, shard_into_union[i])`` (or
+        equivalent advanced indexing) before handing the slice to
+        ``parallel_linear_lora``.
+
+    Bitwise contract: composing ``union_buffer.index_select(0,
+    shard_into_union[i])`` is byte-identical to
+    ``selective_expert_weights(experts_module, param_name,
+    get_active_experts(sei_per_shard[i], E))`` because both paths slice
+    the same dequantized MX subset by the same expert ids. The
+    ``test_shared_dequant_helper.py`` parity test asserts this.
+    """
+    if not sei_per_shard:
+        raise ValueError("sei_per_shard must contain at least one tensor")
+
+    device = sei_per_shard[0].device
+    per_shard_active = [get_active_experts(sei, E) for sei in sei_per_shard]
+    union_active = torch.unique(torch.cat(per_shard_active))
+
+    union_buffer = selective_expert_weights(experts_module, param_name, union_active)
+
+    # Build the global-id → union-row remap once, then gather per shard.
+    # ``union_active`` is sorted and unique by construction, so the inverse
+    # lookup is dense over ``E``.
+    union_remap = torch.empty(E, dtype=torch.long, device=device)
+    union_remap[union_active] = torch.arange(
+        len(union_active), device=device, dtype=torch.long
+    )
+    shard_into_union = [union_remap[active] for active in per_shard_active]
+
+    return union_active, union_buffer, shard_into_union
+
+
 def selective_lora_weights(
     lora_A: torch.Tensor,
     lora_B: torch.Tensor,
diff --git a/src/axolotl/integrations/kernels/libs/sonicmoe/__init__.py b/src/axolotl/integrations/kernels/libs/sonicmoe/__init__.py
index d1f5e5f603..5cd9cf0fd7 100644
--- a/src/axolotl/integrations/kernels/libs/sonicmoe/__init__.py
+++ b/src/axolotl/integrations/kernels/libs/sonicmoe/__init__.py
@@ -1,3 +1,6 @@
-from .patch import patch_sonicmoe
+from .experts import register_sonicmoe_experts, sonicmoe_experts_forward_with_lora
 
-__all__ = ["patch_sonicmoe"]
+__all__ = [
+    "register_sonicmoe_experts",
+    "sonicmoe_experts_forward_with_lora",
+]
diff --git a/src/axolotl/integrations/kernels/libs/sonicmoe/experts.py b/src/axolotl/integrations/kernels/libs/sonicmoe/experts.py
new file mode 100644
index 0000000000..785b74e45a
--- /dev/null
+++ b/src/axolotl/integrations/kernels/libs/sonicmoe/experts.py
@@ -0,0 +1,143 @@
+"""LoRA-aware sonicmoe experts forward for the transformers ExpertsInterface.
+
+Wraps upstream ``_sonicmoe_wrapper`` and materializes expert LoRA via
+``MoELoRAMaterialize`` before the CUTLASS call.
+"""
+
+from __future__ import annotations
+
+import torch
+
+from .lora import (
+    MoELoRAMaterialize,
+    get_lora_params_from_wrapper,
+    has_lora,
+    materialize_expert_lora,
+    unwrap_experts_lora,
+)
+
+
+def _maybe_unwrap_param_wrapper(param):
+    """Return ``(base_tensor, lora_params_or_None)`` for a PEFT-wrapped Parameter."""
+    try:
+        from peft.tuners.param_wrapper import ParamWrapper
+    except ImportError:
+        return param, None
+
+    if not isinstance(param, ParamWrapper):
+        return param, None
+
+    base = param.original_parameter
+    lora_A, lora_B, scaling = get_lora_params_from_wrapper(param)
+    if lora_A is None:
+        return base, None
+    return base, (lora_A, lora_B, scaling)
+
+
+def _resolve_weights_and_lora(experts_module):
+    """Resolve raw expert weights/biases + optional LoRA tuples.
+
+    Handles both PEFT layouts: module-level wrap (walked via ``unwrap_experts_lora``)
+    and per-parameter ``ParamWrapper``. No layout permute applied.
+    """
+    if has_lora(experts_module):
+        base_experts, lora_dict = unwrap_experts_lora(experts_module)
+        w1 = base_experts.gate_up_proj
+        w2 = base_experts.down_proj
+        b1 = getattr(base_experts, "gate_up_proj_bias", None)
+        b2 = getattr(base_experts, "down_proj_bias", None)
+        return w1, b1, w2, b2, lora_dict.get("gate_up_proj"), lora_dict.get("down_proj")
+
+    w1, lora_w1 = _maybe_unwrap_param_wrapper(experts_module.gate_up_proj)
+    w2, lora_w2 = _maybe_unwrap_param_wrapper(experts_module.down_proj)
+    b1 = getattr(experts_module, "gate_up_proj_bias", None)
+    b2 = getattr(experts_module, "down_proj_bias", None)
+    return w1, b1, w2, b2, lora_w1, lora_w2
+
+
+def sonicmoe_experts_forward_with_lora(
+    self,
+    hidden_states: torch.Tensor,
+    top_k_index: torch.Tensor,
+    top_k_weights: torch.Tensor,
+) -> torch.Tensor:
+    """Sonicmoe experts forward with PEFT LoRA materialization."""
+    from transformers.integrations.sonicmoe import _sonicmoe_wrapper
+
+    if not getattr(self, "has_gate", True):
+        raise ValueError("sonicmoe requires gated experts (has_gate=True)")
+    if hidden_states.device.type != "cuda":
+        raise ValueError("sonicmoe requires CUDA device")
+
+    device = hidden_states.device
+    num_top_k = top_k_index.size(-1)
+    num_tokens = hidden_states.size(0)
+
+    # Flatten — token indices must be int32 and sorted ascending (sonic-moe requirement).
+    token_idx = (
+        torch.arange(num_tokens, device=device)
+        .unsqueeze(1)
+        .expand(-1, num_top_k)
+        .reshape(-1)
+        .int()
+    )
+    router_scores = top_k_weights.reshape(-1).to(hidden_states.dtype)
+    expert_ids = top_k_index.reshape(-1).int()
+
+    w1, b1, w2, b2, lora_w1, lora_w2 = _resolve_weights_and_lora(self)
+    if not getattr(self, "has_bias", False):
+        b1 = b2 = None
+
+    # FSDP2 / EP wraps parameters as DTensors but sonic-moe takes raw CUTLASS pointers,
+    # so unwrap to local shards before the materialize/permute. to_local() is
+    # autograd-aware — backward will rewrap the gradient as a DTensor again.
+    if isinstance(w1, torch.distributed.tensor.DTensor):
+        w1 = w1.to_local()
+        w2 = w2.to_local()
+        b1 = b1.to_local() if b1 is not None else None
+        b2 = b2.to_local() if b2 is not None else None
+
+    # Materialize W_eff = W + scaling * (B @ A) per expert. No-op when no LoRA.
+    if lora_w1 is not None:
+        w1 = MoELoRAMaterialize.apply(w1, *lora_w1)
+    if lora_w2 is not None:
+        w2 = MoELoRAMaterialize.apply(w2, *lora_w2)
+
+    # Match upstream layout expectations:
+    #   is_transposed=False: gate_up [E, 2*I, H] / down [E, H, I] -> permute(1, 2, 0)
+    #   is_transposed=True:  gate_up [E, H, 2*I] / down [E, I, H] -> permute(2, 1, 0)
+    perm = (2, 1, 0) if getattr(self, "is_transposed", False) else (1, 2, 0)
+    w1 = w1.permute(*perm)
+    w2 = w2.permute(*perm)
+
+    act_name = getattr(self.config, "hidden_act", "silu").lower()
+
+    return _sonicmoe_wrapper(
+        hidden_states=hidden_states,
+        router_scores=router_scores,
+        expert_ids=expert_ids,
+        token_idx=token_idx,
+        w1=w1,
+        b1=b1,
+        w2=w2,
+        b2=b2,
+        act_name=act_name,
+        num_experts=self.num_experts,
+        concat_layout=getattr(self, "is_concatenated", True),
+        is_inference_mode_enabled=not torch.is_grad_enabled(),
+    )
+
+
+def register_sonicmoe_experts() -> None:
+    """Register the LoRA-aware ``"sonicmoe"`` forward, overriding upstream. Idempotent."""
+    from transformers.integrations.moe import ALL_EXPERTS_FUNCTIONS
+
+    ALL_EXPERTS_FUNCTIONS.register("sonicmoe", sonicmoe_experts_forward_with_lora)
+
+
+# Re-export utilities for tests / external callers.
+__all__ = [
+    "sonicmoe_experts_forward_with_lora",
+    "register_sonicmoe_experts",
+    "materialize_expert_lora",
+]
diff --git a/src/axolotl/integrations/kernels/libs/sonicmoe/gemma4_experts.py b/src/axolotl/integrations/kernels/libs/sonicmoe/gemma4_experts.py
deleted file mode 100644
index a4025dd842..0000000000
--- a/src/axolotl/integrations/kernels/libs/sonicmoe/gemma4_experts.py
+++ /dev/null
@@ -1,106 +0,0 @@
-"""
-SonicMoE-accelerated experts forward for Gemma4.
-
-Gemma4 has no separate SparseMoeBlock — MoE is embedded in the decoder layer.
-This module provides a drop-in replacement for ``Gemma4TextExperts.forward``
-that uses SonicMoE kernels while preserving the original call signature.
-"""
-
-import torch
-
-from .lora import has_lora, materialize_expert_lora, unwrap_experts_lora
-
-
-def _get_expert_weights_gemma4(experts_module):
-    """Extract expert weights from Gemma4TextExperts, applying LoRA if active.
-
-    Returns:
-        (gate_up_weight, down_weight) in SonicMoE layout [dim, dim, E].
-    """
-    if has_lora(experts_module):
-        base_experts, lora_dict = unwrap_experts_lora(experts_module)
-        gate_up = materialize_expert_lora(
-            base_experts.gate_up_proj, lora_dict.get("gate_up_proj")
-        )
-        down = materialize_expert_lora(
-            base_experts.down_proj, lora_dict.get("down_proj")
-        )
-    else:
-        gate_up = experts_module.gate_up_proj
-        down = experts_module.down_proj
-
-    # Permute to SonicMoE layout:
-    #   gate_up: [E, 2*I, H] -> [2*I, H, E]
-    #   down:    [E, H, I]   -> [H, I, E]
-    return gate_up.permute(1, 2, 0), down.permute(1, 2, 0)
-
-
-def gemma4_sonicmoe_experts_forward(
-    self,
-    hidden_states: torch.Tensor,
-    top_k_index: torch.Tensor,
-    top_k_weights: torch.Tensor,
-) -> torch.Tensor:
-    """SonicMoE-accelerated replacement for Gemma4TextExperts.forward.
-
-    Same signature as the original: (hidden_states [T, H], top_k_index [T, K],
-    top_k_weights [T, K]) -> output [T, H].
-    """
-    from sonicmoe import moe_general_routing_inputs
-    from sonicmoe.enums import ActivationType
-
-    T, _ = hidden_states.shape
-    K = top_k_index.shape[1]
-    E = self.num_experts
-
-    # Convert routing outputs to SonicMoE's flat format
-    # Token indices sorted ascending (required by SonicMoE)
-    token_indices = (
-        torch.arange(T, device=hidden_states.device, dtype=torch.int32)
-        .unsqueeze(1)
-        .expand(T, K)
-    )
-    flat_scores = top_k_weights.to(torch.float32).reshape(-1)  # [T*K]
-    flat_token_idx = token_indices.reshape(-1)  # [T*K]
-    flat_expert_idx = top_k_index.to(torch.int32).reshape(-1)  # [T*K]
-
-    # Get weights (with LoRA materialization if needed)
-    gate_up_weight, down_weight = _get_expert_weights_gemma4(self)
-    gate_up_weight = gate_up_weight.to(hidden_states.dtype)
-    down_weight = down_weight.to(hidden_states.dtype)
-
-    if not torch.cuda.is_available():
-        raise RuntimeError("SonicMoE requires CUDA. No CUDA device available.")
-    cuda_stream = torch.cuda.current_stream().cuda_stream
-
-    output, _ = moe_general_routing_inputs(
-        hidden_states,
-        flat_scores,
-        flat_token_idx,
-        flat_expert_idx,
-        gate_up_weight,
-        None,  # b1 (no gate/up bias)
-        down_weight,
-        None,  # b2 (no down bias)
-        E,
-        cuda_stream,
-        ActivationType.GEGLU,
-        False,  # is_inference_mode
-    )
-
-    return output
-
-
-def patch_gemma4_sonicmoe():
-    """Monkeypatch Gemma4TextExperts.forward with SonicMoE kernel."""
-    from axolotl.integrations.kernels.constants import resolve_experts_class
-
-    experts_cls = resolve_experts_class("gemma4_text")
-    if experts_cls is None:
-        raise ValueError("Could not resolve Gemma4TextExperts class")
-
-    if hasattr(experts_cls, "_original_forward"):
-        return  # already patched
-
-    experts_cls._original_forward = experts_cls.forward
-    experts_cls.forward = gemma4_sonicmoe_experts_forward
diff --git a/src/axolotl/integrations/kernels/libs/sonicmoe/lora.py b/src/axolotl/integrations/kernels/libs/sonicmoe/lora.py
index 4d7a21925b..1fe08828cb 100644
--- a/src/axolotl/integrations/kernels/libs/sonicmoe/lora.py
+++ b/src/axolotl/integrations/kernels/libs/sonicmoe/lora.py
@@ -61,33 +61,6 @@ def get_lora_params_from_wrapper(module) -> tuple:
     return lora_A, lora_B, scaling
 
 
-def unwrap_gate_lora(gate_module):
-    """Unwrap PEFT ParamWrapper on the router gate.
-
-    When PEFT targets ``gate.weight``, ``self.gate`` becomes::
-
-        ParamWrapper(weight)
-          -> base_layer: Router (the real module)
-
-    Returns:
-        (base_gate, gate_weight, gate_lora_delta_or_None)
-
-        ``base_gate`` is the original router module (with ``.top_k``, etc.).
-        ``gate_weight`` is the base router weight tensor.
-        ``gate_lora_delta_or_None`` is the LoRA delta if active, else None.
-        Kept separate to avoid mixing DTensor + Tensor under FSDP.
-    """
-    if has_lora(gate_module):
-        base_gate = gate_module.base_layer
-        lora_A, lora_B, scaling = get_lora_params_from_wrapper(gate_module)
-        if lora_A is not None:
-            delta = scaling * (lora_B @ lora_A)
-            return base_gate, base_gate.weight, delta
-        return base_gate, base_gate.weight, None
-
-    return gate_module, gate_module.weight, None
-
-
 def unwrap_experts_lora(experts_module):
     """Walk a PEFT ParamWrapper chain on ``self.experts``.
 
@@ -129,18 +102,12 @@ def unwrap_experts_lora(experts_module):
 
 
 class MoELoRAMaterialize(torch.autograd.Function):
-    """Materialize effective weight W_eff = W + scaling * (B @ A) per expert.
-
-    Inserts into the autograd graph between PEFT's LoRA parameters and
-    SonicMoE's CUTLASS kernels. The CUTLASS backward computes dW_eff,
-    which this function decomposes into dA and dB via the chain rule.
-
-    Weight layouts (PEFT rank-major):
-        base_weight: [E, dim1, dim2]  (frozen expert parameter)
-        lora_A:      [r*E, dim2]      (rows [e*r:(e+1)*r] = A_e)
-        lora_B:      [dim1, r*E]      (cols [:, e*r:(e+1)*r] = B_e)
+    """Materialize ``W_eff = W + scaling * (B @ A)`` per expert and route grads.
 
-    Per-expert: delta_e = B_e @ A_e = [dim1, r] @ [r, dim2] = [dim1, dim2]
+    Layout matches PEFT >= 0.19.1 ``ParamWrapper``: ``base [E, dim1, dim2]``,
+    ``lora_A [r*E, dim2]`` (E-outer, r-inner rows), ``lora_B [dim1, r*E]``
+    (r-outer, E-inner cols). Equivalent to
+    ``einsum("o r e, e r i -> e o i", lora_B.reshape(dim1, r, E), lora_A.reshape(E, r, dim2))``.
     """
 
     @staticmethod
diff --git a/src/axolotl/integrations/kernels/libs/sonicmoe/patch.py b/src/axolotl/integrations/kernels/libs/sonicmoe/patch.py
deleted file mode 100644
index 65095a9871..0000000000
--- a/src/axolotl/integrations/kernels/libs/sonicmoe/patch.py
+++ /dev/null
@@ -1,272 +0,0 @@
-"""
-SonicMoE patching for SparseMoeBlock forward pass.
-
-Monkeypatches the SparseMoeBlock class for a given model type to use
-SonicMoE's optimized kernels. Two forward paths are supported:
-
-1. **General routing path** (routing_fn is not None):
-   Uses a custom routing function + ``moe_general_routing_inputs``.
-   Suitable for models with non-standard routing (softmax->topk, sigmoid->topk).
-
-2. **Fused topk->softmax path** (routing_fn is None):
-   Uses ``moe_TC_softmax_topk_layer`` which fuses routing + expert computation.
-   Suitable for models with simple topk->softmax routing.
-
-Weight format conversion (interleave/deinterleave) is handled by the
-WeightConverter system, so the forward assumes weights are already in
-interleaved format.
-
-Shared experts are handled generically: if the block has a ``shared_expert``
-or ``shared_experts`` attribute, its output is computed alongside the routed
-experts and added to the final output. An optional ``shared_expert_gate``
-applies sigmoid gating to the shared expert contribution.
-"""
-
-import torch
-import torch.nn.functional as F
-
-from axolotl.integrations.kernels.constants import resolve_moe_block_classes
-from axolotl.utils.logging import get_logger
-
-from .lora import (
-    has_lora,
-    materialize_expert_lora,
-    unwrap_experts_lora,
-    unwrap_gate_lora,
-)
-
-LOG = get_logger(__name__)
-
-
-def _get_expert_weights(experts_module):
-    """Extract expert weights, applying LoRA materialization if PEFT is active.
-
-    Returns:
-        (gate_up_weight, down_weight) in SonicMoE layout [dim, dim, E].
-    """
-    if has_lora(experts_module):
-        base_experts, lora_dict = unwrap_experts_lora(experts_module)
-        gate_up = materialize_expert_lora(
-            base_experts.gate_up_proj, lora_dict.get("gate_up_proj")
-        )
-        down = materialize_expert_lora(
-            base_experts.down_proj, lora_dict.get("down_proj")
-        )
-    else:
-        gate_up = experts_module.gate_up_proj
-        down = experts_module.down_proj
-
-    # Permute to SonicMoE layout:
-    #   gate_up: [E, 2*I, H] -> [2*I, H, E]
-    #   down:    [E, H, I]   -> [H, I, E]
-    return gate_up.permute(1, 2, 0), down.permute(1, 2, 0)
-
-
-def _fix_qwen3_5_moe_text_weight_renaming(model_type: str, base_model_type: str):
-    """Strip qwen3_5_moe_text WeightRenaming in VLM mode to preserve custom loaders."""
-    if model_type != "qwen3_5_moe_text" or base_model_type == "qwen3_5_moe_text":
-        return
-
-    try:
-        from transformers.conversion_mapping import (
-            get_checkpoint_conversion_mapping,
-            register_checkpoint_conversion_mapping,
-        )
-        from transformers.core_model_loading import WeightRenaming
-    except ImportError:
-        return
-
-    text_mapping = get_checkpoint_conversion_mapping(model_type)
-    if text_mapping and isinstance(text_mapping[0], WeightRenaming):
-        text_mapping.pop(0)
-        register_checkpoint_conversion_mapping(model_type, text_mapping, overwrite=True)
-        LOG.info("Stripped qwen3_5_moe_text WeightRenaming for VLM mode")
-
-
-def patch_sonicmoe(
-    model_type: str,
-    torch_compile: bool = False,
-    base_model_type: str | None = None,
-):
-    """Patch SparseMoeBlock for SonicMoE support."""
-    from .routing import get_model_moe_config
-    from .weight_converter import register_sonicmoe_weight_converter
-
-    _fix_qwen3_5_moe_text_weight_renaming(model_type, base_model_type or model_type)
-
-    routing_fn, activation, router_attr = get_model_moe_config(model_type)
-
-    if torch_compile and routing_fn is not None:
-        routing_fn = _try_compile_routing(routing_fn)
-
-    for moe_cls in resolve_moe_block_classes(model_type):
-        _patch_forward(moe_cls, routing_fn, activation, router_attr)
-    register_sonicmoe_weight_converter(model_type)
-
-
-def _try_compile_routing(routing_fn):
-    """Attempt to torch.compile the routing function, fall back to eager on failure."""
-    try:
-        compiled_fn = torch.compile(routing_fn, mode="reduce-overhead", dynamic=False)
-        LOG.info(f"torch.compile enabled for routing function: {routing_fn.__name__}")
-        return compiled_fn
-    except Exception as exc:  # pylint: disable=broad-except
-        LOG.warning(
-            f"torch.compile failed for routing function {routing_fn.__name__}, "
-            f"falling back to eager: {exc}"
-        )
-        return routing_fn
-
-
-def _patch_forward(moe_cls, routing_fn, activation, router_attr):
-    """Monkeypatch the SparseMoeBlock class with a SonicMoE forward.
-
-    The patched forward handles shared experts generically: if
-    ``self.shared_expert`` or ``self.shared_experts`` exists, it is computed
-    and added to the routed output. If ``self.shared_expert_gate`` also exists,
-    it applies sigmoid gating to the shared expert contribution (as in qwen2_moe).
-
-    Args:
-        moe_cls: The SparseMoeBlock class to patch.
-        routing_fn: Routing function (e.g. softmax_topk_routing), or None
-            for the fused moe_TC_softmax_topk_layer path.
-        activation: SonicMoE ActivationType enum value.
-        router_attr: Name of the router module attribute on the MoE block.
-    """
-    if hasattr(moe_cls, "_original_forward"):
-        LOG.info(f"{moe_cls.__name__}.forward already patched with SonicMoE, skipping")
-        return
-
-    original_forward = moe_cls.forward
-
-    if routing_fn is not None:
-        _make_general_forward(moe_cls, routing_fn, activation)
-    else:
-        _make_fused_forward(moe_cls, activation, router_attr)
-
-    moe_cls._original_forward = original_forward
-    LOG.info(f"Patched {moe_cls.__name__}.forward with SonicMoE implementation")
-
-
-def _make_general_forward(moe_cls, routing_fn, activation):
-    """Create forward using routing_fn + moe_general_routing_inputs."""
-
-    def sonicmoe_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        from sonicmoe import moe_general_routing_inputs
-
-        batch_size, sequence_length, hidden_dim = hidden_states.shape
-        hidden_states_flat = hidden_states.view(-1, hidden_dim)
-
-        # Shared expert (computed early, matching original model ordering)
-        shared_expert_output = _compute_shared_expert(self, hidden_states_flat)
-
-        # Routing
-        router_scores, token_indices, expert_indices, _router_logits = routing_fn(
-            hidden_states_flat, self
-        )
-
-        # Unwrap PEFT + optional LoRA materialization, then permute to SonicMoE layout
-        gate_up_weight, down_weight = _get_expert_weights(self.experts)
-        gate_up_weight = gate_up_weight.to(hidden_states_flat.dtype)
-        down_weight = down_weight.to(hidden_states_flat.dtype)
-        E = gate_up_weight.shape[-1]
-
-        output, _ = moe_general_routing_inputs(
-            hidden_states_flat,
-            router_scores,
-            token_indices,
-            expert_indices,
-            gate_up_weight,
-            None,  # b1 (no gate/up bias)
-            down_weight,
-            None,  # b2 (no down bias)
-            E,
-            torch.cuda.current_stream().cuda_stream,
-            activation,
-            False,  # is_inference_mode
-        )
-
-        # Add shared expert contribution if present
-        if shared_expert_output is not None:
-            if hasattr(self, "shared_expert_gate"):
-                shared_expert_output = (
-                    F.sigmoid(self.shared_expert_gate(hidden_states_flat))
-                    * shared_expert_output
-                )
-            output = output + shared_expert_output
-
-        return output.view(batch_size, sequence_length, hidden_dim)
-
-    moe_cls.forward = sonicmoe_forward
-
-
-def _make_fused_forward(moe_cls, activation, router_attr):
-    """Create forward using moe_TC_softmax_topk_layer (topk -> softmax)."""
-
-    def sonicmoe_fused_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        from sonicmoe import moe_TC_softmax_topk_layer
-
-        batch_size, sequence_length, hidden_dim = hidden_states.shape
-        hidden_states_flat = hidden_states.view(-1, hidden_dim)
-
-        # Shared expert (computed early, matching original model ordering)
-        shared_expert_output = _compute_shared_expert(self, hidden_states_flat)
-
-        # Unwrap router for attribute access + optional LoRA delta
-        raw_router = getattr(self, router_attr)
-        base_router, router_weight, router_lora_delta = unwrap_gate_lora(raw_router)
-        if router_lora_delta is not None:
-            # Materialize local tensor to avoid DTensor + Tensor add under FSDP
-            if hasattr(router_weight, "to_local"):
-                router_weight = router_weight.to_local()
-            effective_router_weight = router_weight + router_lora_delta
-        else:
-            effective_router_weight = router_weight
-
-        # Unwrap PEFT + optional LoRA materialization, then permute to SonicMoE layout
-        gate_up_weight, down_weight = _get_expert_weights(self.experts)
-        gate_up_weight = gate_up_weight.to(hidden_states_flat.dtype)
-        down_weight = down_weight.to(hidden_states_flat.dtype)
-
-        output, _router_logits, _expert_freq = moe_TC_softmax_topk_layer(
-            hidden_states_flat,
-            effective_router_weight,
-            gate_up_weight,
-            None,  # b1 (no gate/up bias)
-            down_weight,
-            None,  # b2 (no down bias)
-            base_router.top_k,
-            torch.cuda.current_stream().cuda_stream,
-            activation,
-            False,  # is_inference_mode
-        )
-
-        # Add shared expert contribution if present
-        if shared_expert_output is not None:
-            if hasattr(self, "shared_expert_gate"):
-                shared_expert_output = (
-                    F.sigmoid(self.shared_expert_gate(hidden_states_flat))
-                    * shared_expert_output
-                )
-            output = output + shared_expert_output
-
-        return output.view(batch_size, sequence_length, hidden_dim)
-
-    moe_cls.forward = sonicmoe_fused_forward
-
-
-def _compute_shared_expert(moe_block, hidden_states_flat):
-    """Compute shared expert output if the block has one.
-
-    Handles singular (qwen2_moe: ``shared_expert``), plural
-    (glm_moe_dsa/deepseek_v3: ``shared_experts``), and MLP
-    (hunyuan_v1_moe: ``shared_mlp``) attribute names.
-    """
-    shared_expert = (
-        getattr(moe_block, "shared_expert", None)
-        or getattr(moe_block, "shared_experts", None)
-        or getattr(moe_block, "shared_mlp", None)
-    )
-    if shared_expert is not None:
-        return shared_expert(hidden_states_flat)
-    return None
diff --git a/src/axolotl/integrations/kernels/libs/sonicmoe/routing.py b/src/axolotl/integrations/kernels/libs/sonicmoe/routing.py
deleted file mode 100644
index 68654d0868..0000000000
--- a/src/axolotl/integrations/kernels/libs/sonicmoe/routing.py
+++ /dev/null
@@ -1,576 +0,0 @@
-"""
-Routing functions for SonicMoE integration.
-
-Different MoE architectures use different routing strategies:
-- qwen3_moe / qwen2_moe / qwen3_5_moe / qwen3_vl_moe / qwen3_omni_moe: softmax -> topk (with optional renormalization)
-- mistral4: softmax -> group selection -> topk (with renormalization and scaling)
-- glm_moe_dsa / deepseek_v3 / minimax_m2: sigmoid -> topk (with group-based expert selection)
-- ernie4_5_moe: softmax -> bias correction -> topk -> gather (softmax_bias_topk_routing)
-- hunyuan_v1_moe: softmax -> topk via gate.wg (softmax_topk_wg_routing)
-- gemma4_text: RMSNorm -> scale -> proj -> softmax -> topk -> renorm -> per_expert_scale (gemma4_routing)
-- gpt_oss: topk -> softmax (uses fused moe_TC_softmax_topk_layer, routing_fn=None) [NOT YET SUPPORTED]
-
-Each model type maps to a (routing_fn, activation_type, router_attr) triple.
-When routing_fn is None, the fused moe_TC_softmax_topk_layer path is used.
-"""
-
-import torch
-import torch.nn.functional as F
-
-from .lora import unwrap_gate_lora
-
-
-def get_model_moe_config(model_type: str):
-    """Returns (routing_fn, activation, router_attr) for a given model type.
-
-    Args:
-        model_type: HuggingFace model type string.
-
-    Returns:
-        routing_fn: Callable or None. None signals the fused
-            moe_TC_softmax_topk_layer path (topk -> softmax models).
-        activation: SonicMoE ActivationType enum value.
-        router_attr: Name of the router module attribute on the MoE block
-            (e.g. "gate" or "router").
-
-    The activation type cannot be derived from config.hidden_act because
-    e.g. qwen3_moe reports "silu" but architecturally uses SwiGLU
-    (act_fn(gate) * up pattern). So we specify it per model type.
-    """
-    from sonicmoe.enums import ActivationType
-
-    if model_type in (
-        "qwen2_moe",
-        "qwen3_moe",
-        "qwen3_5_moe",
-        "qwen3_5_moe_text",
-        "qwen3_next",
-        "qwen3_vl_moe",
-        "qwen3_omni_moe",
-        "olmoe",
-        "mixtral",
-        "minimax",
-    ):
-        return softmax_topk_routing, ActivationType.SWIGLU, "gate"
-    elif model_type in ("mistral4",):
-        return softmax_group_topk_routing, ActivationType.SWIGLU, "gate"
-    elif model_type in (
-        "glm_moe_dsa",
-        "deepseek_v3",
-        "glm4_moe",
-        "glm4_moe_lite",
-        "glm4v_moe",
-        "minimax_m2",
-    ):
-        return sigmoid_topk_routing, ActivationType.SWIGLU, "gate"
-    elif model_type in ("ernie4_5_moe",):
-        return softmax_bias_topk_routing, ActivationType.SWIGLU, "gate"
-    elif model_type in ("hunyuan_v1_moe",):
-        return softmax_topk_wg_routing, ActivationType.SWIGLU, "gate"
-    elif model_type in ("gemma4_text",):
-        return gemma4_routing, ActivationType.GEGLU, "router"
-    # Fused topk -> softmax path (routing_fn=None):
-    # elif model_type in ("gpt_oss",):
-    #     # NOTE: gpt_oss has a router bias which moe_TC_softmax_topk_layer
-    #     # ignores (it only takes router_w, not bias). Also has transposed
-    #     # weight layout [E, H, 2*I] and custom GLU activation.
-    #     return None, ActivationType.SWIGLU, "router"
-    else:
-        raise ValueError(f"SonicMoE: unsupported model type '{model_type}'")
-
-
-def softmax_topk_routing(
-    hidden_states: torch.Tensor, moe_block
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-    """Qwen3/Qwen2-style routing: softmax -> topk -> optional renorm.
-
-    Args:
-        hidden_states: [T, H] flattened token representations
-        moe_block: MoE block module (accesses moe_block.gate.*)
-
-    Returns:
-        router_scores: [T*K] flattened scores (float32)
-        token_indices: [T*K] which token each entry belongs to (int32), sorted ascending
-        expert_indices: [T*K] which expert (int32)
-        router_logits: [T, E] original logits for aux loss
-    """
-    base_gate, gate_weight, gate_lora_delta = unwrap_gate_lora(moe_block.gate)
-    T, H = hidden_states.shape
-    K = base_gate.top_k
-
-    # Compute router logits and softmax over all experts.
-    # Two F.linear calls avoid mixing DTensor (gate_weight) + Tensor (delta) under FSDP.
-    # Cast to float32 to match LoRA delta dtype (PEFT computes in fp32).
-    router_logits = F.linear(hidden_states.float(), gate_weight.float())  # [T, E]
-    if gate_lora_delta is not None:
-        router_logits = router_logits + F.linear(
-            hidden_states.float(), gate_lora_delta.float()
-        )
-    router_probs = F.softmax(router_logits, dim=-1, dtype=torch.float32)  # [T, E]
-
-    # Select top-k experts per token
-    top_values, top_indices = torch.topk(router_probs, K, dim=-1)  # [T, K] each
-
-    # Renormalize if configured (default True for models without the attribute,
-    # e.g. Mixtral/MiniMax which always normalize)
-    if getattr(base_gate, "norm_topk_prob", True):
-        top_values = top_values / top_values.sum(dim=-1, keepdim=True)
-
-    # no-op: matches transformers which casts to softmax output dtype (float32).
-    # top_values = top_values.to(router_probs.dtype)
-
-    # Flatten for moe_general_routing_inputs.
-    # Token indices are naturally sorted ascending from the [T, K] layout:
-    # [0, 0, ..., 1, 1, ..., T-1, T-1, ...] — this is required by SonicMoE.
-    # Expert sorting is handled internally by general_routing_router_metadata.
-    token_indices = (
-        torch.arange(T, device=hidden_states.device, dtype=torch.int32)
-        .unsqueeze(1)
-        .expand(T, K)
-    )
-
-    flat_scores = top_values.reshape(-1)  # [T*K]
-    flat_token_idx = token_indices.reshape(-1)  # [T*K]
-    flat_expert_idx = top_indices.to(torch.int32).reshape(-1)  # [T*K]
-
-    return flat_scores, flat_token_idx, flat_expert_idx, router_logits
-
-
-def softmax_group_topk_routing(
-    hidden_states: torch.Tensor, moe_block
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-    """Mistral4-style routing: softmax -> group selection -> topk -> renorm -> scale."""
-    base_gate, gate_weight, gate_lora_delta = unwrap_gate_lora(moe_block.gate)
-    T, _ = hidden_states.shape
-    K = moe_block.top_k
-    E = getattr(moe_block, "n_routed_experts", gate_weight.shape[0])
-    n_group = getattr(moe_block, "n_group", 1)
-
-    router_logits = F.linear(hidden_states.float(), gate_weight.float())  # [T, E]
-    if gate_lora_delta is not None:
-        router_logits = router_logits + F.linear(
-            hidden_states.float(), gate_lora_delta.float()
-        )
-    router_probs = F.softmax(router_logits, dim=-1, dtype=torch.float32)  # [T, E]
-
-    scores_for_choice = router_probs
-
-    # Group selection: pick top groups, mask the rest
-    if n_group > 1:
-        group_scores = (
-            scores_for_choice.view(-1, n_group, E // n_group)
-            .topk(2, dim=-1)[0]
-            .sum(dim=-1)
-        )
-        group_idx = torch.topk(
-            group_scores, k=moe_block.topk_group, dim=-1, sorted=False
-        )[1]
-        group_mask = torch.zeros_like(group_scores)
-        group_mask.scatter_(1, group_idx, 1)
-        score_mask = (
-            group_mask.unsqueeze(-1).expand(-1, n_group, E // n_group).reshape(-1, E)
-        )
-        scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)
-
-    topk_indices = torch.topk(scores_for_choice, k=K, dim=-1, sorted=False)[1]
-    topk_weights = router_probs.gather(1, topk_indices)
-
-    # Renormalization + scaling
-    norm_topk_prob = getattr(moe_block, "norm_topk_prob", True)
-    if norm_topk_prob:
-        topk_weights = topk_weights / (topk_weights.sum(dim=-1, keepdim=True) + 1e-20)
-    routed_scaling_factor = getattr(moe_block, "routed_scaling_factor", 1.0)
-    topk_weights = topk_weights * routed_scaling_factor
-
-    # Flatten for moe_general_routing_inputs
-    token_indices = (
-        torch.arange(T, device=hidden_states.device, dtype=torch.int32)
-        .unsqueeze(1)
-        .expand(T, K)
-    )
-
-    flat_scores = topk_weights.to(torch.float32).reshape(-1)  # [T*K]
-    flat_token_idx = token_indices.reshape(-1)  # [T*K]
-    flat_expert_idx = topk_indices.to(torch.int32).reshape(-1)  # [T*K]
-
-    return flat_scores, flat_token_idx, flat_expert_idx, router_logits
-
-
-def sigmoid_topk_routing(
-    hidden_states: torch.Tensor, moe_block
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-    """Sigmoid-based routing: sigmoid -> optional group selection -> topk.
-
-    Supports two variants:
-    - **Group selection** (glm_moe_dsa, deepseek_v3, etc.): n_group > 1,
-      bias on gate, group-based masking before topk.
-    - **No group selection** (minimax_m2): n_group == 1 (or absent),
-      bias on moe_block, straight topk from all experts.
-
-    Final routing weights come from the original sigmoid scores (not
-    bias-corrected), with optional renormalization and scaling.
-
-    Args:
-        hidden_states: [T, H] flattened token representations
-        moe_block: MoE block module (accesses moe_block.gate.* and
-            optional moe_block.n_group, .topk_group, .top_k, .norm_topk_prob,
-            .routed_scaling_factor, .n_routed_experts)
-
-    Returns:
-        router_scores: [T*K] flattened scores (float32)
-        token_indices: [T*K] which token each entry belongs to (int32), sorted ascending
-        expert_indices: [T*K] which expert (int32)
-        router_logits: [T, E] original logits for aux loss
-    """
-    base_gate, gate_weight, gate_lora_delta = unwrap_gate_lora(moe_block.gate)
-    T, _ = hidden_states.shape
-    K = moe_block.top_k
-    E = getattr(moe_block, "n_routed_experts", gate_weight.shape[0])
-    n_group = getattr(moe_block, "n_group", 1)
-
-    # Compute router logits and sigmoid probabilities
-    router_logits = F.linear(hidden_states.float(), gate_weight.float())  # [T, E]
-    if gate_lora_delta is not None:
-        router_logits = router_logits + F.linear(
-            hidden_states.float(), gate_lora_delta.float()
-        )
-    router_probs = router_logits.sigmoid()  # [T, E]
-
-    # Bias-corrected scores for expert selection (not used for final weights).
-    # glm_moe_dsa/deepseek_v3 store the bias on gate; minimax_m2 stores it on the block.
-    e_score_correction_bias = getattr(base_gate, "e_score_correction_bias", None)
-    if e_score_correction_bias is None:
-        e_score_correction_bias = getattr(moe_block, "e_score_correction_bias", None)
-    if e_score_correction_bias is None:
-        raise AttributeError(
-            f"sigmoid_topk_routing requires e_score_correction_bias on "
-            f"gate ({type(base_gate)}) or moe_block ({type(moe_block)}), but neither has it"
-        )
-    scores_for_choice = router_probs + e_score_correction_bias
-
-    # Group-based selection: pick top groups, mask the rest (skip when n_group == 1)
-    if n_group > 1:
-        group_scores = (
-            scores_for_choice.view(-1, n_group, E // n_group)
-            .topk(2, dim=-1)[0]
-            .sum(dim=-1)
-        )  # [T, n_group]
-        group_idx = torch.topk(
-            group_scores, k=moe_block.topk_group, dim=-1, sorted=False
-        )[1]
-        group_mask = torch.zeros_like(group_scores)
-        group_mask.scatter_(1, group_idx, 1)
-        score_mask = (
-            group_mask.unsqueeze(-1).expand(-1, n_group, E // n_group).reshape(-1, E)
-        )
-        scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)
-
-    # Final topk from (possibly masked) scores
-    topk_indices = torch.topk(scores_for_choice, k=K, dim=-1, sorted=False)[1]
-
-    # Gather weights from original sigmoid scores (not bias-corrected)
-    topk_weights = router_probs.gather(1, topk_indices)
-
-    # Optional renormalization + scaling
-    norm_topk_prob = getattr(moe_block, "norm_topk_prob", True)
-    if norm_topk_prob:
-        topk_weights = topk_weights / (topk_weights.sum(dim=-1, keepdim=True) + 1e-20)
-    routed_scaling_factor = getattr(moe_block, "routed_scaling_factor", 1.0)
-    topk_weights = topk_weights * routed_scaling_factor
-
-    # Flatten for moe_general_routing_inputs.
-    # Token indices are naturally sorted ascending from the [T, K] layout.
-    token_indices = (
-        torch.arange(T, device=hidden_states.device, dtype=torch.int32)
-        .unsqueeze(1)
-        .expand(T, K)
-    )
-
-    flat_scores = topk_weights.to(torch.float32).reshape(-1)  # [T*K]
-    flat_token_idx = token_indices.reshape(-1)  # [T*K]
-    flat_expert_idx = topk_indices.to(torch.int32).reshape(-1)  # [T*K]
-
-    return flat_scores, flat_token_idx, flat_expert_idx, router_logits
-
-
-def softmax_bias_topk_routing(
-    hidden_states: torch.Tensor, moe_block
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-    """Ernie 4.5 MoE routing: softmax → bias correction → topk → gather → renorm.
-
-    Differs from standard softmax_topk_routing in three ways:
-    1. A learned e_score_correction_bias is added to softmax probs *before* topk
-       (selection uses biased scores, but final weights use original probs).
-    2. The bias is applied via gate.moe_statics module (not a raw tensor).
-    3. Renormalization uses clamp(min=norm_min) instead of sum+epsilon.
-
-    Reference: Ernie4_5_MoeTopKRouter.forward in transformers.
-
-    Args:
-        hidden_states: [T, H] flattened token representations
-        moe_block: MoE block module (accesses moe_block.gate.*)
-
-    Returns:
-        router_scores: [T*K] flattened scores (float32)
-        token_indices: [T*K] which token each entry belongs to (int32), sorted ascending
-        expert_indices: [T*K] which expert (int32)
-        router_logits: [T, E] original logits for aux loss
-    """
-    base_gate, gate_weight, gate_lora_delta = unwrap_gate_lora(moe_block.gate)
-    T, H = hidden_states.shape
-    K = base_gate.top_k
-
-    # Compute router logits and softmax (force float32 for numerical stability)
-    router_logits = F.linear(hidden_states.float(), gate_weight.float())  # [T, E]
-    if gate_lora_delta is not None:
-        router_logits = router_logits + F.linear(
-            hidden_states.float(), gate_lora_delta.float()
-        )
-    router_probs = F.softmax(router_logits, dim=-1, dtype=torch.float32)  # [T, E]
-
-    # Bias-corrected scores for expert selection (via moe_statics module)
-    scores_for_choice = base_gate.moe_statics(router_probs)  # [T, E]
-
-    # Select top-k experts using biased scores
-    _, selected_experts = torch.topk(scores_for_choice, K, dim=-1)  # [T, K]
-
-    # Gather weights from *original* (unbiased) softmax probs
-    top_values = torch.gather(router_probs, dim=-1, index=selected_experts)  # [T, K]
-
-    # Renormalize with clamp(min=norm_min) instead of sum+epsilon
-    norm_min = getattr(base_gate, "norm_min", 1e-20)
-    top_values = top_values / torch.clamp(
-        top_values.sum(dim=-1, keepdim=True), min=norm_min
-    )
-
-    # Flatten for moe_general_routing_inputs
-    token_indices = (
-        torch.arange(T, device=hidden_states.device, dtype=torch.int32)
-        .unsqueeze(1)
-        .expand(T, K)
-    )
-
-    flat_scores = top_values.to(torch.float32).reshape(-1)  # [T*K]
-    flat_token_idx = token_indices.reshape(-1)  # [T*K]
-    flat_expert_idx = selected_experts.to(torch.int32).reshape(-1)  # [T*K]
-
-    return flat_scores, flat_token_idx, flat_expert_idx, router_logits
-
-
-def softmax_group_limited_topk_routing(
-    hidden_states: torch.Tensor, moe_block
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-    """DeepSeek V2 routing: softmax → group_limited_greedy/greedy → topk → scale.
-
-    Differs from softmax_group_topk_routing (Mistral4) in several ways:
-    1. Uses ``num_group`` attribute (not ``n_group``).
-    2. Group score = max per group (not sum of top-2).
-    3. Supports ``greedy`` method (plain topk without groups).
-    4. No renormalization — just ``topk_weight * routed_scaling_factor``.
-    5. Gate is ``nn.Linear`` (access weight via ``gate.weight``).
-
-    Reference: DeepseekV2Moe.route_tokens_to_experts in transformers.
-
-    Args:
-        hidden_states: [T, H] flattened token representations
-        moe_block: MoE block module (accesses moe_block.gate, .num_group,
-            .topk_group, .top_k, .topk_method, .routed_scaling_factor)
-
-    Returns:
-        router_scores: [T*K] flattened scores (float32)
-        token_indices: [T*K] which token each entry belongs to (int32), sorted ascending
-        expert_indices: [T*K] which expert (int32)
-        router_logits: [T, E] original logits for aux loss
-    """
-    base_gate, gate_weight, gate_lora_delta = unwrap_gate_lora(moe_block.gate)
-    T, H = hidden_states.shape
-    K = moe_block.top_k
-    num_group = getattr(moe_block, "num_group", 1)
-    num_experts = gate_weight.shape[0]
-    topk_method = getattr(moe_block, "topk_method", "greedy")
-
-    # Compute logits in float32 and softmax
-    router_logits = F.linear(hidden_states.float(), gate_weight.float())  # [T, E]
-    if gate_lora_delta is not None:
-        router_logits = router_logits + F.linear(
-            hidden_states.float(), gate_lora_delta.float()
-        )
-    router_probs = F.softmax(router_logits, dim=-1, dtype=torch.float32)  # [T, E]
-
-    if topk_method == "greedy" or num_group == 1:
-        topk_weights, topk_indices = torch.topk(router_probs, k=K, dim=-1, sorted=False)
-    elif topk_method == "group_limited_greedy":
-        # Guard: selected groups must contain enough experts for topk
-        group_size = num_experts // num_group
-        if moe_block.topk_group * group_size < K:
-            raise ValueError(
-                f"DeepSeek V2: topk_group ({moe_block.topk_group}) * group_size "
-                f"({group_size}) = {moe_block.topk_group * group_size} < top_k ({K}). "
-                f"Not enough experts in selected groups for topk selection."
-            )
-        # Group selection: pick top groups by max score per group
-        group_scores = (
-            router_probs.view(T, num_group, num_experts // num_group).max(dim=-1).values
-        )  # [T, num_group]
-        group_idx = torch.topk(
-            group_scores, k=moe_block.topk_group, dim=-1, sorted=False
-        )[1]
-        group_mask = torch.zeros_like(group_scores)
-        group_mask.scatter_(1, group_idx, 1)
-        score_mask = (
-            group_mask.unsqueeze(-1)
-            .expand(T, num_group, num_experts // num_group)
-            .reshape(T, -1)
-        )
-        tmp_scores = router_probs.masked_fill(~score_mask.bool(), 0.0)
-        topk_weights, topk_indices = torch.topk(tmp_scores, k=K, dim=-1, sorted=False)
-    else:
-        raise ValueError(
-            f"DeepSeek V2: unsupported topk_method '{topk_method}'. "
-            f"Expected 'greedy' or 'group_limited_greedy'."
-        )
-
-    # Scale only — no renormalization (weights won't sum to 1.0 per token).
-    # This matches the reference DeepseekV2Moe.route_tokens_to_experts behavior.
-    routed_scaling_factor = getattr(moe_block, "routed_scaling_factor", 1.0)
-    topk_weights = topk_weights * routed_scaling_factor
-
-    # Flatten for moe_general_routing_inputs
-    token_indices = (
-        torch.arange(T, device=hidden_states.device, dtype=torch.int32)
-        .unsqueeze(1)
-        .expand(T, K)
-    )
-
-    flat_scores = topk_weights.to(torch.float32).reshape(-1)  # [T*K]
-    flat_token_idx = token_indices.reshape(-1)  # [T*K]
-    flat_expert_idx = topk_indices.to(torch.int32).reshape(-1)  # [T*K]
-
-    return flat_scores, flat_token_idx, flat_expert_idx, router_logits
-
-
-def softmax_topk_wg_routing(
-    hidden_states: torch.Tensor, moe_block
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-    """HunYuan V1 MoE routing: softmax → topk → renorm (gate weight via gate.wg).
-
-    Differs from standard softmax_topk_routing in:
-    1. Gate weight lives at ``gate.wg.weight`` (not ``gate.weight``).
-    2. ``top_k`` is on ``moe_block`` (not ``gate``).
-    3. Always renormalizes (no ``norm_topk_prob`` flag).
-
-    Reference: HunYuanMoEV1Moe.route_tokens_to_experts and
-    HunYuanMoEV1Gate.forward in transformers.
-
-    Args:
-        hidden_states: [T, H] flattened token representations
-        moe_block: MoE block module (accesses moe_block.gate.wg, moe_block.top_k)
-
-    Returns:
-        router_scores: [T*K] flattened scores (float32)
-        token_indices: [T*K] which token each entry belongs to (int32), sorted ascending
-        expert_indices: [T*K] which expert (int32)
-        router_logits: [T, E] original logits for aux loss
-    """
-    gate = moe_block.gate
-    T, H = hidden_states.shape
-    K = moe_block.top_k
-
-    # Gate computes logits via gate.wg (nn.Linear, float32)
-    # Unwrap at gate.wg level since PEFT targets the wg Linear, not the gate container
-    base_wg, wg_weight, wg_lora_delta = unwrap_gate_lora(gate.wg)
-    router_logits = F.linear(hidden_states.float(), wg_weight.float())  # [T, E]
-    if wg_lora_delta is not None:
-        router_logits = router_logits + F.linear(
-            hidden_states.float(), wg_lora_delta.float()
-        )
-    router_probs = F.softmax(router_logits, dim=-1, dtype=torch.float32)  # [T, E]
-
-    # Select top-k experts
-    top_values, top_indices = torch.topk(router_probs, K, dim=-1)  # [T, K] each
-
-    # Always renormalize (HunYuan V1 has no norm_topk_prob flag)
-    top_values = top_values / (top_values.sum(dim=-1, keepdim=True) + 1e-20)
-
-    # Flatten for moe_general_routing_inputs
-    token_indices = (
-        torch.arange(T, device=hidden_states.device, dtype=torch.int32)
-        .unsqueeze(1)
-        .expand(T, K)
-    )
-
-    flat_scores = top_values.to(torch.float32).reshape(-1)  # [T*K]
-    flat_token_idx = token_indices.reshape(-1)  # [T*K]
-    flat_expert_idx = top_indices.to(torch.int32).reshape(-1)  # [T*K]
-
-    return flat_scores, flat_token_idx, flat_expert_idx, router_logits
-
-
-def gemma4_routing(
-    hidden_states: torch.Tensor, moe_block
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-    """Gemma4-style routing: RMSNorm → scale → proj → softmax → topk → renorm → per_expert_scale.
-
-    Gemma4's router (``Gemma4TextRouter``) has a unique structure:
-    1. RMSNorm (without learnable scale) on hidden states
-    2. Multiply by ``scale * hidden_size**-0.5``
-    3. Linear projection to expert scores
-    4. Softmax → topk
-    5. Normalize top-k weights to sum to 1
-    6. Multiply by per-expert learned scales
-
-    The router lives at ``moe_block.router`` (not ``moe_block.gate``).
-    LoRA on the router targets ``router.proj`` (nn.Linear).
-
-    Args:
-        hidden_states: [T, H] flattened token representations
-        moe_block: MoE block module (accesses moe_block.router)
-
-    Returns:
-        router_scores: [T*K] flattened scores (float32)
-        token_indices: [T*K] which token each entry belongs to (int32), sorted ascending
-        expert_indices: [T*K] which expert (int32)
-        router_logits: [T, E] original logits for aux loss
-    """
-    router = moe_block.router
-
-    # Unwrap PEFT LoRA on router.proj (the nn.Linear)
-    _, proj_weight, proj_lora_delta = unwrap_gate_lora(router.proj)
-
-    T, _ = hidden_states.shape
-    K = router.top_k if hasattr(router, "top_k") else router.config.top_k_experts
-
-    # Reproduce Gemma4TextRouter.forward:
-    # 1. RMSNorm (no scale) + scale param * hidden_size**-0.5
-    normed = router.norm(hidden_states)
-    scaled = normed * router.scale * router.scalar_root_size
-
-    # 2. Project to expert scores
-    router_logits = F.linear(scaled.float(), proj_weight.float())  # [T, E]
-    if proj_lora_delta is not None:
-        router_logits = router_logits + F.linear(
-            scaled.float(), proj_lora_delta.float()
-        )
-
-    # 3. Softmax → topk
-    router_probs = F.softmax(router_logits, dim=-1, dtype=torch.float32)  # [T, E]
-    top_values, top_indices = torch.topk(router_probs, K, dim=-1)  # [T, K]
-
-    # 4. Normalize top-k weights
-    top_values = top_values / top_values.sum(dim=-1, keepdim=True)
-
-    # 5. Per-expert scale
-    top_values = top_values * router.per_expert_scale[top_indices]
-
-    # Flatten for moe_general_routing_inputs
-    token_indices = (
-        torch.arange(T, device=hidden_states.device, dtype=torch.int32)
-        .unsqueeze(1)
-        .expand(T, K)
-    )
-
-    flat_scores = top_values.to(torch.float32).reshape(-1)  # [T*K]
-    flat_token_idx = token_indices.reshape(-1)  # [T*K]
-    flat_expert_idx = top_indices.to(torch.int32).reshape(-1)  # [T*K]
-
-    return flat_scores, flat_token_idx, flat_expert_idx, router_logits
diff --git a/src/axolotl/integrations/kernels/libs/sonicmoe/weight_converter.py b/src/axolotl/integrations/kernels/libs/sonicmoe/weight_converter.py
deleted file mode 100644
index 20da27ff0a..0000000000
--- a/src/axolotl/integrations/kernels/libs/sonicmoe/weight_converter.py
+++ /dev/null
@@ -1,202 +0,0 @@
-"""
-Custom WeightConverter operations for SonicMoE weight format conversion.
-
-SonicMoE requires gate_up_proj weights in interleaved format:
-- Standard (concatenated): [E, 2*I, H] where first I rows are gate, last I rows are up
-- SonicMoE (interleaved): [E, 2*I, H] where rows alternate [g0, u0, g1, u1, ...]
-
-These ConversionOps integrate with transformers' WeightConverter system so that
-weights are transparently converted during loading and reverted during saving.
-"""
-
-from typing import Any
-
-import torch
-from einops import rearrange
-from transformers.core_model_loading import ConversionOps
-
-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
-
-
-def interleave_gate_up(tensor: torch.Tensor) -> torch.Tensor:
-    """[gate..., up...] -> [g0, u0, g1, u1, ...] along the 2*I dimension."""
-    return rearrange(tensor, "... (two out) h -> ... (out two) h", two=2)
-
-
-def deinterleave_gate_up(tensor: torch.Tensor) -> torch.Tensor:
-    """[g0, u0, g1, u1, ...] -> [gate..., up...] along the 2*I dimension."""
-    return rearrange(tensor, "... (out two) h -> ... (two out) h", two=2)
-
-
-class ConcatenatedToInterleaved(ConversionOps):
-    """Convert concatenated gate/up projections to interleaved format.
-
-    Input:  [E, 2*I, H] with gate=[E, :I, H] and up=[E, I:, H]
-    Output: [E, 2*I, H] with rows alternating [g0, u0, g1, u1, ...]
-
-    This operation is applied along ``dim`` (default 1, the 2*I dimension).
-    """
-
-    def __init__(self, dim: int = 1):
-        self.dim = dim
-
-    @torch.no_grad()
-    def convert(
-        self,
-        input_dict: dict[str, Any],
-        source_patterns: list[str],
-        target_patterns: list[str],
-        **kwargs,
-    ) -> dict[str, torch.Tensor]:
-        target_pattern = self._get_target_pattern(
-            input_dict, source_patterns, target_patterns
-        )
-        tensors = next(iter(input_dict.values()))
-        tensor = tensors[0] if isinstance(tensors, list) else tensors
-
-        interleaved = interleave_gate_up(tensor)
-
-        return {target_pattern: interleaved}
-
-    def _get_target_pattern(
-        self,
-        input_dict: dict[str, Any],
-        source_patterns: list[str],
-        target_patterns: list[str],
-    ) -> str:
-        # Follow the same logic as Transpose.get_target_pattern
-        if len(input_dict) != 1:
-            raise ValueError("Undefined Operation encountered!")
-        if len(target_patterns) > 1:
-            if len(source_patterns) == 1:
-                return source_patterns[0]
-            raise ValueError("Undefined Operation encountered!")
-        return target_patterns[0]
-
-    @property
-    def reverse_op(self) -> ConversionOps:
-        return InterleavedToConcatenated(self.dim)
-
-
-class InterleavedToConcatenated(ConversionOps):
-    """Convert interleaved gate/up projections back to concatenated format.
-
-    Input:  [E, 2*I, H] with rows alternating [g0, u0, g1, u1, ...]
-    Output: [E, 2*I, H] with gate=[E, :I, H] and up=[E, I:, H]
-
-    This is the reverse of ``ConcatenatedToInterleaved``.
-    """
-
-    def __init__(self, dim: int = 1):
-        self.dim = dim
-
-    @torch.no_grad()
-    def convert(
-        self,
-        input_dict: dict[str, Any],
-        source_patterns: list[str],
-        target_patterns: list[str],
-        **kwargs,
-    ) -> dict[str, torch.Tensor]:
-        target_pattern = self._get_target_pattern(
-            input_dict, source_patterns, target_patterns
-        )
-        tensors = next(iter(input_dict.values()))
-        tensor = tensors[0] if isinstance(tensors, list) else tensors
-
-        concatenated = deinterleave_gate_up(tensor)
-
-        return {target_pattern: concatenated}
-
-    def _get_target_pattern(
-        self,
-        input_dict: dict[str, Any],
-        source_patterns: list[str],
-        target_patterns: list[str],
-    ) -> str:
-        if len(input_dict) != 1:
-            raise ValueError("Undefined Operation encountered!")
-        if len(target_patterns) > 1:
-            if len(source_patterns) == 1:
-                return source_patterns[0]
-            raise ValueError("Undefined Operation encountered!")
-        return target_patterns[0]
-
-    @property
-    def reverse_op(self) -> ConversionOps:
-        return ConcatenatedToInterleaved(self.dim)
-
-
-def _make_same_key_interleave_converter():
-    """Create a WeightConverter that interleaves an already-fused gate_up_proj."""
-    from transformers.core_model_loading import WeightConverter
-
-    return WeightConverter(
-        source_patterns="mlp.experts.gate_up_proj",
-        target_patterns="mlp.experts.gate_up_proj",
-        operations=[ConcatenatedToInterleaved(dim=1)],
-    )
-
-
-def _has_same_key_interleave(mapping) -> bool:
-    """Check whether the mapping already has a same-key gate_up_proj interleave converter."""
-    for conv in mapping:
-        if (
-            hasattr(conv, "source_patterns")
-            and conv.source_patterns == ["mlp.experts.gate_up_proj"]
-            and conv.target_patterns == ["mlp.experts.gate_up_proj"]
-            and hasattr(conv, "operations")
-            and any(isinstance(op, ConcatenatedToInterleaved) for op in conv.operations)
-        ):
-            return True
-    return False
-
-
-def register_sonicmoe_weight_converter(model_type: str):
-    """Register weight converters to interleave gate_up_proj for SonicMoE.
-
-    Handles two checkpoint formats:
-    1. Separate per-expert weights (e.g. qwen3_moe): appends interleave to the
-       existing merge chain (MergeModulelist -> Concatenate -> Interleave).
-    2. Already-fused gate_up_proj (e.g. qwen3_5_moe_text): adds a same-key
-       converter (gate_up_proj -> gate_up_proj with Interleave).
-
-    The loader matches whichever source pattern exists in the checkpoint.
-    """
-    from transformers.conversion_mapping import (
-        get_checkpoint_conversion_mapping,
-        register_checkpoint_conversion_mapping,
-    )
-
-    existing = get_checkpoint_conversion_mapping(model_type)
-
-    if existing is None:
-        # No mapping at all — create one with just the same-key converter
-        mapping = [_make_same_key_interleave_converter()]
-        register_checkpoint_conversion_mapping(model_type, mapping)
-        LOG.info(f"Registered SonicMoE weight converter for model type '{model_type}'")
-        return
-
-    # Append interleave to any existing many-to-one merge chain
-    for converter in existing:
-        if hasattr(converter, "operations") and any(
-            "gate_up_proj" in pat for pat in converter.target_patterns
-        ):
-            has_separate_sources = any(
-                "gate_proj" in pat or "up_proj" in pat
-                for pat in converter.source_patterns
-            )
-            if has_separate_sources and not any(
-                isinstance(op, ConcatenatedToInterleaved) for op in converter.operations
-            ):
-                converter.operations.append(ConcatenatedToInterleaved(dim=1))
-            break
-
-    # Also add a same-key converter for already-fused checkpoints
-    if not _has_same_key_interleave(existing):
-        existing.append(_make_same_key_interleave_converter())
-
-    register_checkpoint_conversion_mapping(model_type, existing, overwrite=True)
-    LOG.info(f"Registered SonicMoE weight converter for model type '{model_type}'")
diff --git a/src/axolotl/integrations/kernels/plugin.py b/src/axolotl/integrations/kernels/plugin.py
index d713095a5f..ddddb160f7 100644
--- a/src/axolotl/integrations/kernels/plugin.py
+++ b/src/axolotl/integrations/kernels/plugin.py
@@ -1,6 +1,5 @@
 import importlib
 import os
-from pathlib import Path
 
 import torch
 
@@ -61,119 +60,36 @@ def get_input_args(self):
         return "axolotl.integrations.kernels.KernelsArgs"
 
     def pre_model_load(self, cfg):
-        from axolotl.integrations.kernels.constants import (
-            SPARSE_MOE_BLOCK,
-            is_experts_only_model,
-        )
+        """Register the requested kernel into ``ALL_EXPERTS_FUNCTIONS`` and pin cfg.
 
-        # Prefer text backbone type for VLMs, but fall back to base type
-        # when the text type isn't in the supported mapping (e.g. qwen3_5_moe_text)
-        moe_model_type = cfg.model_config_type_text or cfg.model_config_type
-        if (
-            moe_model_type not in SPARSE_MOE_BLOCK
-            and not is_experts_only_model(moe_model_type)
-            and cfg.model_config_type in SPARSE_MOE_BLOCK
-        ):
-            moe_model_type = cfg.model_config_type
-
-        # When expert parallelism is enabled, the EP plugin sets
-        # `experts_implementation` to `deep_ep_scattermoe` / `deep_ep_sonicmoe`
-        # and dispatches the kernel inside the experts-level forward (after
-        # DeepEP all-to-all). Skip the SparseMoeBlock-level patch in that case
-        # — patching the block-level forward bypasses EP routing and reads
-        # FSDP-sharded expert weights as DTensors, which the kernels do not
-        # accept.
+        Architecture-agnostic: routing stays in each model's SparseMoEBlock; only
+        the experts call is dispatched through the registry.
+        """
+        # When EP is active, the ExpertParallelPlugin selects a `deep_ep_*`
+        # composite for `experts_implementation`. Don't overwrite that here —
+        # plugin order is YAML-defined, so we can't rely on EP running last.
         ep_active = (getattr(cfg, "expert_parallel_size", 1) or 1) > 1
 
         if cfg.use_scattermoe:
-            self._register_kernels()
-            if is_experts_only_model(moe_model_type):
-                # Models like Gemma4 where MoE is embedded in the decoder layer
-                # — register ScatterMoE in the ExpertsInterface so that
-                # @use_experts_implementation dispatches to it.
-                self._register_experts_interface()
-                if not ep_active:
-                    cfg.experts_implementation = "scattermoe"
-            elif ep_active:
-                LOG.info(
-                    "expert_parallel_size > 1: skipping SparseMoeBlock-level "
-                    "ScatterMoE patch; the deep_ep_scattermoe registered "
-                    "function handles the kernel under EP."
-                )
-            else:
-                self._kernelize_model(moe_model_type)
-        elif cfg.use_sonicmoe:
-            if not importlib.util.find_spec("sonicmoe"):
-                raise RuntimeError(
-                    "SonicMoE is not installed. See installation instructions at "
-                    "https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/integrations/kernels/README.md#sonicmoe-installation"
-                )
+            from axolotl.integrations.kernels.libs.scattermoe_lora.experts import (
+                register_scattermoe_experts,
+            )
 
+            register_scattermoe_experts()
+            if not ep_active:
+                cfg.experts_implementation = "scattermoe"
+            LOG.info("Registered 'scattermoe' in transformers ExpertsInterface")
+        elif cfg.use_sonicmoe:
             _check_sonicmoe_gpu_compat()
 
-            if is_experts_only_model(moe_model_type):
-                from axolotl.integrations.kernels.libs.sonicmoe.gemma4_experts import (
-                    patch_gemma4_sonicmoe,
-                )
-
-                LOG.info(
-                    f"Applying SonicMoE experts-level patch for model type: {moe_model_type}"
-                )
-                patch_gemma4_sonicmoe()
-            # TODO(EP+SonicMoE): grad norms explode during training. Re-enable
-            # once the root cause is identified. Same shape as the ScatterMoE
-            # branch above, but SonicMoE additionally needs the gate_up_proj
-            # interleave converter since its w1 layout is [g0, u0, g1, u1, ...]
-            # while the checkpoint stores it concatenated [gate..., up...].
-            #
-            # elif ep_active:
-            #     from axolotl.integrations.kernels.libs.sonicmoe.weight_converter import (
-            #         register_sonicmoe_weight_converter,
-            #     )
-            #
-            #     LOG.info(
-            #         "expert_parallel_size > 1: skipping SparseMoeBlock-level "
-            #         "SonicMoE patch; the deep_ep_sonicmoe registered function "
-            #         "handles the kernel under EP. Registering gate_up_proj "
-            #         "interleave converter."
-            #     )
-            #     register_sonicmoe_weight_converter(moe_model_type)
-            else:
-                from axolotl.integrations.kernels.libs.sonicmoe import patch_sonicmoe
-
-                LOG.info(f"Applying SonicMoE patches for model type: {moe_model_type}")
-                patch_sonicmoe(
-                    moe_model_type,
-                    torch_compile=bool(getattr(cfg, "torch_compile", False)),
-                    base_model_type=cfg.model_config_type,
-                )
-
-    def _register_kernels(self):
-        from kernels import (
-            LocalLayerRepository,
-            Mode,
-            register_kernel_mapping,
-        )
+            from axolotl.integrations.kernels.libs.sonicmoe.experts import (
+                register_sonicmoe_experts,
+            )
 
-        plugin_root = Path(__file__).parent
-        register_kernel_mapping(
-            {
-                "HFScatterMoEParallelExperts": {
-                    "cuda": {
-                        Mode.TRAINING: LocalLayerRepository(
-                            repo_path=plugin_root / "libs" / "scattermoe_lora",
-                            package_name="scattermoe_lora",
-                            layer_name="HFScatterMoEGatedMLP",
-                        ),
-                        Mode.INFERENCE: LocalLayerRepository(
-                            repo_path=plugin_root / "libs" / "scattermoe_lora",
-                            package_name="scattermoe_lora",
-                            layer_name="HFScatterMoEGatedMLP",
-                        ),
-                    },
-                }
-            }
-        )
+            register_sonicmoe_experts()
+            if not ep_active:
+                cfg.experts_implementation = "sonicmoe"
+            LOG.info("Registered 'sonicmoe' in transformers ExpertsInterface")
 
     def add_callbacks_pre_trainer(self, cfg, model):
         callbacks = []
@@ -184,26 +100,3 @@ def add_callbacks_pre_trainer(self, cfg, model):
 
             callbacks.append(AutotuneReportCallback())
         return callbacks
-
-    def _kernelize_model(self, model_type: str):
-        from kernels import replace_kernel_forward_from_hub
-
-        from axolotl.integrations.kernels.constants import resolve_moe_block_classes
-
-        for model_moe_cls in resolve_moe_block_classes(model_type):
-            replace_kernel_forward_from_hub(
-                model_moe_cls, "HFScatterMoEParallelExperts"
-            )
-
-    def _register_experts_interface(self):
-        """Register ScatterMoE in the transformers ExpertsInterface.
-
-        This allows @use_experts_implementation-decorated Experts classes
-        to dispatch to ScatterMoE when config._experts_implementation == "scattermoe".
-        """
-        from axolotl.integrations.kernels.libs.scattermoe_lora.gemma4_experts import (
-            register_scattermoe_experts,
-        )
-
-        register_scattermoe_experts()
-        LOG.info("Registered 'scattermoe' in transformers ExpertsInterface")
diff --git a/src/axolotl/integrations/liger/args.py b/src/axolotl/integrations/liger/args.py
index a5f88ffe2c..283e527b4e 100644
--- a/src/axolotl/integrations/liger/args.py
+++ b/src/axolotl/integrations/liger/args.py
@@ -28,7 +28,16 @@ class LigerArgs(BaseModel):
     Input args for LIGER.
     """
 
-    liger_rope: bool | None = None
+    liger_rope: bool | None = Field(
+        default=None,
+        json_schema_extra={
+            "description": (
+                "Enables Liger's fused RoPE kernel. For Qwen2-VL / Qwen2.5-VL / "
+                "Qwen3-VL (text and VL model_config_types) this auto-defaults to "
+                "True when unset, swapping in the fused multimodal/rotary kernel."
+            )
+        },
+    )
     liger_rms_norm: bool | None = None
     liger_rms_norm_gated: bool | None = Field(
         default=None,
diff --git a/src/axolotl/integrations/liger/plugin.py b/src/axolotl/integrations/liger/plugin.py
index 9c4a26351f..eedf221ede 100644
--- a/src/axolotl/integrations/liger/plugin.py
+++ b/src/axolotl/integrations/liger/plugin.py
@@ -86,7 +86,20 @@ def patched_init(self, *args, **kwargs):
             liger_fn_sig = inspect.signature(apply_liger_fn)
             kwargs = {}
             if "rope" in liger_fn_sig.parameters:
-                kwargs["rope"] = cfg.liger_rope
+                rope_value = cfg.liger_rope
+                # cfg.liger_rope defaults to None, which would override upstream's rope=True for Qwen-VL.
+                if rope_value is None and cfg.model_config_type in (
+                    "qwen2_vl",
+                    "qwen2_5_vl",
+                    "qwen3_vl",
+                    "qwen3_vl_moe",
+                    "qwen2_vl_text",
+                    "qwen2_5_vl_text",
+                    "qwen3_vl_text",
+                    "qwen3_vl_moe_text",
+                ):
+                    rope_value = True
+                kwargs["rope"] = rope_value
             if "cross_entropy" in liger_fn_sig.parameters:
                 kwargs["cross_entropy"] = cfg.liger_cross_entropy
             if "fused_linear_cross_entropy" in liger_fn_sig.parameters:
diff --git a/src/axolotl/kernels/autotune_telemetry.py b/src/axolotl/kernels/autotune_telemetry.py
new file mode 100644
index 0000000000..a729fd7ccb
--- /dev/null
+++ b/src/axolotl/kernels/autotune_telemetry.py
@@ -0,0 +1,136 @@
+"""Telemetry for the fused RMSNorm+RoPE Triton autotune selections.
+
+Mirrors the scattermoe-lora autotune telemetry
+(:mod:`axolotl.integrations.kernels.autotune_callback`): after the kernel's
+``@triton.autotune`` cache is populated by the first backward pass, report the
+selected configs alongside GPU identity so the per-hardware tuning that varies
+across architectures can be aggregated.
+"""
+
+import logging
+
+import torch
+from transformers import (
+    TrainerCallback,
+    TrainerControl,
+    TrainerState,
+    TrainingArguments,
+)
+
+LOG = logging.getLogger(__name__)
+
+# Give up looking for autotune data after this many training steps.
+_MAX_POLL_STEP = 5
+
+# (human-readable name, attribute on gemma4_fused_rope, autotune key arg names)
+_KERNEL_REGISTRY: list[tuple[str, str, list[str]]] = [
+    ("fused_rms_norm_rope_bwd", "_rms_norm_rope_backward_kernel", ["n_cols"]),
+]
+
+
+def _get_gpu_info() -> dict:
+    """Return basic GPU identification for the current device."""
+    if not torch.cuda.is_available():
+        return {}
+    try:
+        props = torch.cuda.get_device_properties(torch.cuda.current_device())
+        return {
+            "gpu_name": props.name,
+            "gpu_compute_capability": f"{props.major}.{props.minor}",
+            "gpu_memory_bytes": props.total_memory,
+        }
+    except Exception:  # pylint: disable=broad-exception-caught
+        return {}
+
+
+def collect_fused_rope_autotune_configs() -> list[dict]:
+    """Read the autotune ``.cache`` from the fused RMSNorm+RoPE backward kernel.
+
+    Each entry is ``{"kernel", "key", "config"}`` — the same shape the
+    scattermoe collector emits, so both event types aggregate uniformly.
+    Returns ``[]`` if Triton/the kernel isn't loaded or nothing autotuned yet.
+    """
+    import sys
+
+    # The kernel module is only in sys.modules once the fused path has run —
+    # which is exactly when its autotune cache is populated. Read it from there
+    # instead of importing (avoids pulling in Triton when the path is unused).
+    mod = sys.modules.get("axolotl.kernels.gemma4_fused_rope")
+    if mod is None:
+        return []
+
+    results: list[dict] = []
+    for friendly_name, attr_name, key_names in _KERNEL_REGISTRY:
+        kernel_fn = getattr(mod, attr_name, None)
+        cache = getattr(kernel_fn, "cache", None)
+        if not cache:
+            continue
+        for key_tuple, config in cache.items():
+            config_dict = dict(config.kwargs)
+            config_dict["num_warps"] = config.num_warps
+            config_dict["num_stages"] = config.num_stages
+            if getattr(config, "num_ctas", None) is not None:
+                config_dict["num_ctas"] = config.num_ctas
+
+            key: dict = {}
+            for i, name in enumerate(key_names):
+                if i < len(key_tuple):
+                    key[name] = key_tuple[i]
+            if len(key_tuple) > len(key_names):
+                key["_extra"] = [str(v) for v in key_tuple[len(key_names) :]]
+
+            results.append({"kernel": friendly_name, "key": key, "config": config_dict})
+    return results
+
+
+class FusedRopeAutotuneReportCallback(TrainerCallback):
+    """Reports fused RMSNorm+RoPE autotune selections via telemetry.
+
+    Fires once after the autotune cache is populated (the first step whose
+    backward has run), retrying up to ``_MAX_POLL_STEP`` then giving up. Every
+    later ``on_step_end`` short-circuits on ``_reported`` — zero hot-path cost.
+    """
+
+    def __init__(self):
+        self._reported = False
+
+    # pylint: disable=unused-argument
+    def on_step_end(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        **kwargs,
+    ):
+        if self._reported:
+            return
+
+        configs = collect_fused_rope_autotune_configs()
+        if not configs:
+            if state.global_step >= _MAX_POLL_STEP:
+                LOG.debug(
+                    "No fused-rope autotune data after %d steps; giving up.",
+                    state.global_step,
+                )
+                self._reported = True
+            return
+
+        self._reported = True
+
+        from axolotl.telemetry.manager import TelemetryManager
+
+        telemetry_manager = TelemetryManager.get_instance()
+        if not telemetry_manager.enabled:
+            return
+
+        properties = {"kernel_count": len(configs), "kernels": configs}
+        properties.update(_get_gpu_info())
+
+        telemetry_manager.send_event(
+            event_type="fused-rope-autotune",
+            properties=properties,
+        )
+        LOG.info(
+            "Reported %d fused-rope kernel autotune config(s) to telemetry.",
+            len(configs),
+        )
diff --git a/src/axolotl/kernels/gemma4_fused_rope.py b/src/axolotl/kernels/gemma4_fused_rope.py
index f98e9a3de6..8193355846 100644
--- a/src/axolotl/kernels/gemma4_fused_rope.py
+++ b/src/axolotl/kernels/gemma4_fused_rope.py
@@ -1,20 +1,4 @@
-"""
-Fused RMSNorm + RoPE Triton kernel for Gemma 4.
-
-Fuses three operations into one kernel launch:
-  1. RMSNorm: x_norm = (x / sqrt(mean(x^2) + eps)) * weight
-  2. RoPE:    y = x_norm * cos + rotate_half(x_norm) * sin
-  3. (optional) RMSNorm without scale (for v_norm)
-
-This eliminates two intermediate tensor materializations per Q/K path;
-churn from rotate_half / apply_rotary_pos_emb.
-
-Shapes:
-  X:      (rows, head_dim)  — flattened from (batch, seq_len, num_heads, head_dim)
-  W:      (head_dim,)       — RMSNorm weight (None for with_scale=False)
-  cos:    (rows, head_dim)  — flattened from (batch, seq_len, 1, head_dim) after broadcast
-  sin:    (rows, head_dim)  — same as cos
-"""
+"""Fused RMSNorm + (partial) RoPE Triton kernel for Gemma 4 / Qwen3 Q/K paths."""
 
 import math
 import operator
@@ -25,10 +9,10 @@
 from liger_kernel.ops.utils import (
     calculate_settings,
     compare_version,
-    ensure_contiguous,
     torch_to_triton_dtype,
 )
 from liger_kernel.utils import is_npu_available
+from torch.library import triton_op, wrap_triton
 
 if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
@@ -38,6 +22,11 @@
 else:
     from triton.language.math import rsqrt
 
+# Backward over-subscription factor: number of program blocks per SM. The
+# weight-gradient reduction needs one private partial per block, so this also
+# sizes the dW scratch buffer. ~8 saturates occupancy on tested GPUs.
+_BWD_BLOCKS_PER_SM = 8
+
 
 @triton.jit
 def _rms_norm_rope_forward_kernel(
@@ -57,6 +46,7 @@ def _rms_norm_rope_forward_kernel(
     n_heads,
     eps,
     HAS_WEIGHT: tl.constexpr,
+    UNIT_OFFSET: tl.constexpr,
     BLOCK_SIZE: tl.constexpr,
 ):
     """
@@ -100,7 +90,10 @@ def _rms_norm_rope_forward_kernel(
     # Apply weight if present (with_scale=True)
     if HAS_WEIGHT:
         W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0).to(tl.float32)
-        X_norm = X_norm * W_row
+        if UNIT_OFFSET:
+            X_norm = X_norm * (W_row + 1.0)
+        else:
+            X_norm = X_norm * W_row
 
     # RoPE: load cos/sin (broadcast across heads). For col >= n_rot we get
     # cos=1, sin=0 so the formula leaves X_norm untouched.
@@ -130,7 +123,10 @@ def _rms_norm_rope_forward_kernel(
     X_rot_norm = X_rot * rstd
     if HAS_WEIGHT:
         W_rot = tl.load(W_ptr + rot_offsets, mask=rot_load_mask, other=0).to(tl.float32)
-        X_rot_norm = X_rot_norm * W_rot
+        if UNIT_OFFSET:
+            X_rot_norm = X_rot_norm * (W_rot + 1.0)
+        else:
+            X_rot_norm = X_rot_norm * W_rot
 
     # Negate the first half (rotate_half negates x2, which becomes the first half)
     sign = tl.where(col_offsets < half_rot, -1.0, 1.0)
@@ -146,6 +142,16 @@ def _rms_norm_rope_forward_kernel(
     )
 
 
+_BWD_AUTOTUNE_CONFIGS = [
+    triton.Config({}, num_warps=w, num_stages=s)
+    for w in (2, 4, 8, 16)
+    for s in (1, 2, 3)
+]
+
+
+# num_warps/num_stages optima for the latency-bound row loop vary by GPU; key on
+# n_cols (head_dim) so head_dim=128 and 256 each get their own tuned config.
+@triton.autotune(configs=_BWD_AUTOTUNE_CONFIGS, key=["n_cols"])
 @triton.jit
 def _rms_norm_rope_backward_kernel(
     dY_ptr,
@@ -170,6 +176,7 @@ def _rms_norm_rope_backward_kernel(
     n_heads,
     rows_per_program,
     HAS_WEIGHT: tl.constexpr,
+    UNIT_OFFSET: tl.constexpr,
     BLOCK_SIZE: tl.constexpr,
 ):
     """
@@ -245,7 +252,10 @@ def _rms_norm_rope_backward_kernel(
 
         if HAS_WEIGHT:
             dW_acc += dN * n
-            dm = dN * W_row
+            if UNIT_OFFSET:
+                dm = dN * (W_row + 1.0)
+            else:
+                dm = dN * W_row
         else:
             dm = dN
 
@@ -267,33 +277,28 @@ def _rms_norm_rope_backward_kernel(
         )
 
 
-def rms_norm_rope_forward(X, W, cos, sin, eps, n_heads, n_rot):
-    """
-    Args:
-        X:   (B*S*H, head_dim) — contiguous, flattened from (B, S, H, D)
-        W:   (head_dim,) or None — RMSNorm weight
-        cos: (B*S, n_rot) — position embeddings (broadcast across heads)
-        sin: (B*S, n_rot) — position embeddings (broadcast across heads)
-        eps: float
-        n_heads: int — number of attention heads (for cos/sin indexing)
-        n_rot: int — rotary dim (== head_dim for full rotary, < head_dim for
-            partial rotary). Must be even and ``<= head_dim``.
-    Returns:
-        Y, X_saved, RSTD, BLOCK_SIZE, num_warps
-    """
+@triton_op("axolotl::fused_rms_norm_rope_fwd", mutates_args=())
+def _fused_rms_norm_rope_fwd(
+    X: torch.Tensor,
+    W: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    eps: float,
+    n_heads: int,
+    n_rot: int,
+    unit_offset: bool,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Returns ``(Y, RSTD)``; ``wrap_triton`` keeps it ``torch.compile``-safe."""
     n_rows, n_cols = X.shape
     BLOCK_SIZE, num_warps = calculate_settings(n_cols)
-    has_weight = W is not None
-
     Y = torch.empty_like(X)
     RSTD = torch.empty(n_rows, dtype=torch.float32, device=X.device)
-
-    _rms_norm_rope_forward_kernel[(n_rows,)](
+    wrap_triton(_rms_norm_rope_forward_kernel)[(n_rows,)](
         Y,
         Y.stride(0),
         X,
         X.stride(0),
-        W if has_weight else X,  # dummy pointer when no weight
+        W,
         cos,
         cos.stride(0),
         sin,
@@ -304,30 +309,40 @@ def rms_norm_rope_forward(X, W, cos, sin, eps, n_heads, n_rot):
         n_rot,
         n_heads,
         eps,
-        HAS_WEIGHT=has_weight,
+        HAS_WEIGHT=True,
+        UNIT_OFFSET=unit_offset,
         BLOCK_SIZE=BLOCK_SIZE,
         num_warps=num_warps,
     )
-    return Y, X, RSTD, BLOCK_SIZE, num_warps
-
-
-def rms_norm_rope_backward(
-    dY, X, W, cos, sin, RSTD, n_heads, n_rot, BLOCK_SIZE, num_warps
-):
+    return Y, RSTD
+
+
+@triton_op("axolotl::fused_rms_norm_rope_bwd", mutates_args=())
+def _fused_rms_norm_rope_bwd(
+    dY: torch.Tensor,
+    X: torch.Tensor,
+    W: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    RSTD: torch.Tensor,
+    n_heads: int,
+    n_rot: int,
+    unit_offset: bool,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Returns ``(dX, dW)``."""
     n_rows, n_cols = dY.shape
-    has_weight = W is not None
-
+    BLOCK_SIZE, _ = calculate_settings(n_cols)
+    # One block per SM serializes a long row-loop at 1 block/SM occupancy; the
+    # forward runs a block per row. Over-subscribe the SMs so the latency-bound
+    # row loop has enough resident blocks to hide global-load latency. Each
+    # block still writes a private dW partial that's summed below (no atomics).
     sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
-    rows_per_program = math.ceil(n_rows / sm_count)
-
+    target_programs = min(_BWD_BLOCKS_PER_SM * sm_count, n_rows)
+    rows_per_program = max(1, math.ceil(n_rows / target_programs))
+    n_programs = math.ceil(n_rows / rows_per_program)
     dX = torch.empty_like(X)
-
-    if has_weight:
-        _dW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=X.device)
-    else:
-        _dW = torch.empty((1, n_cols), dtype=torch.float32, device=X.device)
-
-    _rms_norm_rope_backward_kernel[(sm_count,)](
+    _dW = torch.empty((n_programs, n_cols), dtype=torch.float32, device=X.device)
+    wrap_triton(_rms_norm_rope_backward_kernel)[(n_programs,)](
         dY,
         dY.stride(0),
         dX,
@@ -335,7 +350,7 @@ def rms_norm_rope_backward(
         X,
         X.stride(0),
         torch_to_triton_dtype[X.dtype],
-        W if has_weight else X,  # dummy
+        W,
         cos,
         cos.stride(0),
         sin,
@@ -349,81 +364,50 @@ def rms_norm_rope_backward(
         n_rot,
         n_heads,
         rows_per_program,
-        HAS_WEIGHT=has_weight,
+        HAS_WEIGHT=True,
+        UNIT_OFFSET=unit_offset,
         BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=num_warps,
     )
-
-    dW = _dW.sum(dim=0).to(W.dtype) if has_weight else None
+    dW = _dW.sum(dim=0).to(W.dtype)
     return dX, dW
 
 
-class FusedRMSNormRoPEFunction(torch.autograd.Function):
-    @staticmethod
-    @ensure_contiguous
-    def forward(ctx, X, W, cos, sin, eps, n_heads, n_rot):
-        """
-        X:    (B*S*H, head_dim)
-        W:    (head_dim,) or None
-        cos:  (B*S, n_rot) — broadcast across heads
-        sin:  (B*S, n_rot) — broadcast across heads
-        n_heads: int
-        n_rot:   int — rotary dim (<= head_dim)
-        """
-        Y, X_saved, RSTD, BLOCK_SIZE, num_warps = rms_norm_rope_forward(
-            X,
-            W,
-            cos,
-            sin,
-            eps,
-            n_heads,
-            n_rot,
-        )
-        ctx.eps = eps
-        ctx.BLOCK_SIZE = BLOCK_SIZE
-        ctx.num_warps = num_warps
-        ctx.n_heads = n_heads
-        ctx.n_rot = n_rot
-        ctx.has_weight = W is not None
-        ctx.save_for_backward(X_saved, W, cos, sin, RSTD)
-        return Y
-
-    @staticmethod
-    @ensure_contiguous
-    def backward(ctx, dY):
-        X, W, cos, sin, RSTD = ctx.saved_tensors
-        dX, dW = rms_norm_rope_backward(
-            dY,
-            X,
-            W,
-            cos,
-            sin,
-            RSTD,
-            ctx.n_heads,
-            ctx.n_rot,
-            ctx.BLOCK_SIZE,
-            ctx.num_warps,
-        )
-        return dX, dW, None, None, None, None, None
+def _fused_rms_norm_rope_setup_context(ctx, inputs, output):
+    X, W, cos, sin, _eps, n_heads, n_rot, unit_offset = inputs
+    _, RSTD = output
+    ctx.save_for_backward(X, W, cos, sin, RSTD)
+    ctx.n_heads = n_heads
+    ctx.n_rot = n_rot
+    ctx.unit_offset = unit_offset
+
+
+def _fused_rms_norm_rope_backward(ctx, grad_Y, grad_RSTD):
+    X, W, cos, sin, RSTD = ctx.saved_tensors
+    grad_Y = grad_Y.contiguous()
+    dX, dW = _fused_rms_norm_rope_bwd(
+        grad_Y, X, W, cos, sin, RSTD, ctx.n_heads, ctx.n_rot, ctx.unit_offset
+    )
+    return dX, dW, None, None, None, None, None, None
+
+
+_fused_rms_norm_rope_fwd.register_autograd(
+    _fused_rms_norm_rope_backward,
+    setup_context=_fused_rms_norm_rope_setup_context,
+)
 
 
-def fused_rms_norm_rope(x, weight, cos, sin, eps=1e-6):
+def fused_rms_norm_rope(x, weight, cos, sin, eps=1e-6, unit_offset=False):
     """
     Apply fused RMSNorm + (partial) RoPE.
 
-    Args:
-        x:      (batch, seq_len, num_heads, head_dim) — after projection + view
-        weight: (head_dim,) — RMSNorm weight, or None for no-scale norm
-        cos:    (batch, seq_len, n_rot) — from RotaryEmbedding. ``n_rot``
-                must be even and ``<= head_dim``. When ``n_rot < head_dim``
-                the trailing ``head_dim - n_rot`` columns are RMSNorm-only
-                (partial-rotary pass-through), matching stock Gemma 4 with
-                ``partial_rotary_factor < 1.0``.
-        sin:    (batch, seq_len, n_rot) — same shape as ``cos``
-        eps:    float — RMSNorm epsilon
-
-    Returns:
-        y: (batch, seq_len, num_heads, head_dim) — normalized + rotated
+    Shapes:
+        x:      (B, S, H, D) — post-projection
+        weight: (D,) — required; use ``fused_rms_norm_noscale`` for the no-weight variant
+        cos:    (B, S, n_rot) — ``n_rot`` must be even and ``<= D``; trailing
+                ``D - n_rot`` columns are RMSNorm-only (partial rotary).
+        sin:    (B, S, n_rot)
+
+    ``unit_offset=True`` scales by ``(weight + 1.0)`` (Gemma-style).
     """
     shape = x.shape  # (B, S, H, D)
     B, S, H, D = shape
@@ -438,13 +422,8 @@ def fused_rms_norm_rope(x, weight, cos, sin, eps=1e-6):
     if n_rot % 2 != 0:
         raise ValueError(f"rotary dim must be even, got {n_rot}")
 
-    # Flatten to 2D: (B*S*H, D)
     x_flat = x.reshape(-1, D).contiguous()
-    # cos/sin may broadcast over the batch dim (e.g. (1, S, n_rot) when
-    # all sequences share the same rotary positions). The kernel needs a
-    # dense (B*S, n_rot) buffer so that row_idx // n_heads maps cleanly
-    # onto a single (b, s) pair, so expand-then-contiguous to materialize
-    # the per-batch broadcast. Expand is a no-op when B == cos.shape[0].
+    # Kernel needs a dense (B*S, n_rot) buffer; materialize the batch-broadcast.
     if cos.shape[0] != B:
         if cos.shape[0] != 1:
             raise ValueError(
@@ -456,8 +435,8 @@ def fused_rms_norm_rope(x, weight, cos, sin, eps=1e-6):
     cos_flat = cos.reshape(B * S, n_rot).contiguous()
     sin_flat = sin.reshape(B * S, n_rot).contiguous()
 
-    y_flat = FusedRMSNormRoPEFunction.apply(
-        x_flat, weight, cos_flat, sin_flat, eps, H, n_rot
+    y_flat, _ = _fused_rms_norm_rope_fwd(
+        x_flat, weight, cos_flat, sin_flat, eps, H, n_rot, unit_offset
     )
     return y_flat.view(shape)
 
@@ -526,68 +505,76 @@ def _rms_norm_noscale_backward_kernel(
     )
 
 
-class FusedRMSNormNoScaleFunction(torch.autograd.Function):
-    """RMSNorm without learnable scale — used for Gemma4's v_norm."""
-
-    @staticmethod
-    @ensure_contiguous
-    def forward(ctx, X, eps):
-        n_rows, n_cols = X.shape
-        BLOCK_SIZE, num_warps = calculate_settings(n_cols)
-        Y = torch.empty_like(X)
-        RSTD = torch.empty(n_rows, dtype=torch.float32, device=X.device)
-
-        _rms_norm_forward_kernel[(n_rows,)](
-            Y,
-            Y.stride(0),
-            X,
-            X.stride(0),
-            RSTD,
-            RSTD.stride(0),
-            n_cols,
-            eps,
-            BLOCK_SIZE=BLOCK_SIZE,
-            num_warps=num_warps,
-        )
-        ctx.BLOCK_SIZE = BLOCK_SIZE
-        ctx.num_warps = num_warps
-        ctx.save_for_backward(X, RSTD)
-        ctx.n_cols = n_cols
-        return Y
-
-    @staticmethod
-    @ensure_contiguous
-    def backward(ctx, dY):
-        X, RSTD = ctx.saved_tensors
-        n_rows = X.shape[0]
-        dX = torch.empty_like(X)
-        _rms_norm_noscale_backward_kernel[(n_rows,)](
-            dY,
-            dY.stride(0),
-            dX,
-            dX.stride(0),
-            X,
-            X.stride(0),
-            torch_to_triton_dtype[X.dtype],
-            RSTD,
-            RSTD.stride(0),
-            ctx.n_cols,
-            BLOCK_SIZE=ctx.BLOCK_SIZE,
-            num_warps=ctx.num_warps,
-        )
-        return dX, None
+@triton_op("axolotl::fused_rms_norm_noscale_fwd", mutates_args=())
+def _fused_rms_norm_noscale_fwd(
+    X: torch.Tensor, eps: float
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Returns ``(Y, RSTD)``."""
+    n_rows, n_cols = X.shape
+    BLOCK_SIZE, num_warps = calculate_settings(n_cols)
+    Y = torch.empty_like(X)
+    RSTD = torch.empty(n_rows, dtype=torch.float32, device=X.device)
+    wrap_triton(_rms_norm_forward_kernel)[(n_rows,)](
+        Y,
+        Y.stride(0),
+        X,
+        X.stride(0),
+        RSTD,
+        RSTD.stride(0),
+        n_cols,
+        eps,
+        BLOCK_SIZE=BLOCK_SIZE,
+        num_warps=num_warps,
+    )
+    return Y, RSTD
 
 
-def fused_rms_norm_noscale(x, eps=1e-6):
-    """
-    RMSNorm without scale for v_norm.
+@triton_op("axolotl::fused_rms_norm_noscale_bwd", mutates_args=())
+def _fused_rms_norm_noscale_bwd(
+    dY: torch.Tensor, X: torch.Tensor, RSTD: torch.Tensor
+) -> torch.Tensor:
+    n_rows, n_cols = X.shape
+    BLOCK_SIZE, num_warps = calculate_settings(n_cols)
+    dX = torch.empty_like(X)
+    wrap_triton(_rms_norm_noscale_backward_kernel)[(n_rows,)](
+        dY,
+        dY.stride(0),
+        dX,
+        dX.stride(0),
+        X,
+        X.stride(0),
+        torch_to_triton_dtype[X.dtype],
+        RSTD,
+        RSTD.stride(0),
+        n_cols,
+        BLOCK_SIZE=BLOCK_SIZE,
+        num_warps=num_warps,
+    )
+    return dX
 
-    Args:
-        x: (batch, seq_len, num_heads, head_dim)
-    Returns:
-        y: same shape, normalized
-    """
+
+def _fused_rms_norm_noscale_setup_context(ctx, inputs, output):
+    X, _eps = inputs
+    _, RSTD = output
+    ctx.save_for_backward(X, RSTD)
+
+
+def _fused_rms_norm_noscale_backward(ctx, grad_Y, grad_RSTD):
+    X, RSTD = ctx.saved_tensors
+    grad_Y = grad_Y.contiguous()
+    dX = _fused_rms_norm_noscale_bwd(grad_Y, X, RSTD)
+    return dX, None
+
+
+_fused_rms_norm_noscale_fwd.register_autograd(
+    _fused_rms_norm_noscale_backward,
+    setup_context=_fused_rms_norm_noscale_setup_context,
+)
+
+
+def fused_rms_norm_noscale(x, eps=1e-6):
+    """RMSNorm without a learned scale (used for v_norm)."""
     shape = x.shape
-    x_flat = x.reshape(-1, shape[-1])
-    y_flat = FusedRMSNormNoScaleFunction.apply(x_flat, eps)
+    x_flat = x.reshape(-1, shape[-1]).contiguous()
+    y_flat, _ = _fused_rms_norm_noscale_fwd(x_flat, eps)
     return y_flat.view(shape)
diff --git a/src/axolotl/loaders/model.py b/src/axolotl/loaders/model.py
index 700039c5e5..2f056e19f8 100644
--- a/src/axolotl/loaders/model.py
+++ b/src/axolotl/loaders/model.py
@@ -56,6 +56,11 @@
     get_device_count,
     get_device_type,
 )
+from axolotl.utils.fp32_norms import (
+    _matches_norm_class,
+    get_fp32_norm_patterns,
+    tag_model_fp32_norms,
+)
 from axolotl.utils.logging import get_logger
 from axolotl.utils.model_shard_quant import load_sharded_model_quant
 from axolotl.utils.schemas.enums import RLType
@@ -191,6 +196,9 @@ def load(self) -> tuple[PreTrainedModel | PeftModelForCausalLM, PeftConfig | Non
         self.patch_manager.apply_post_model_load_patches(self.model)
         PLUGIN_MANAGER.post_model_load(self.cfg, self.model)
 
+        if self.cfg.fp32_norms:
+            tag_model_fp32_norms(self.model, self.cfg)
+
         return self.model, lora_config
 
     def _apply_pre_model_load_setup(self):
@@ -221,6 +229,18 @@ def _apply_pre_model_load_setup(self):
         self._set_attention_config()
         self._check_model_requirements()
 
+        # MX-quantized checkpoints carry MXTensor weights but no HF quantizer, so
+        # transformers' load-time weight re-init would crash on them; this guards it.
+        # torchao is absent on macOS/aarch64, where MX checkpoints can't exist anyway.
+        try:
+            from axolotl.utils.quantization import (
+                patch_transformers_skip_quantized_init,
+            )
+
+            patch_transformers_skip_quantized_init()
+        except ImportError:
+            pass
+
     def _apply_post_model_load_setup(self):
         """Configure the model after it has been loaded."""
         # Handle PeftModel if needed
@@ -233,14 +253,71 @@ def _apply_post_model_load_setup(self):
         self._configure_experts_implementation()
         self._apply_activation_checkpointing()
         self._resize_token_embeddings()
+        self._reinitialize_classification_head()
         self._adjust_model_config()
         self._configure_embedding_dtypes()
         self._configure_qat()
         log_gpu_memory_usage(LOG, "Memory usage after model load", 0)
 
+    def _reinitialize_classification_head(self):
+        """Re-init an uninitialized reward / PRM classification head.
+
+        The ``score``/``classifier`` head is missing from a base-LM checkpoint, so
+        transformers allocates it with ``torch.empty`` and is then supposed to
+        initialize it. But transformers 5.8's ``_init_weights`` does
+        ``init.normal_(module.weight.float(), ...)`` — the ``.float()`` copy makes
+        this a no-op on a ``bfloat16`` head, leaving uninitialized memory: harmless
+        zeros on some allocators, NaN/inf garbage on others (→ NaN grads, 0 loss).
+        Detect that state and initialize the head ourselves.
+        """
+        if not (self.cfg.reward_model or self.cfg.process_reward_model):
+            return
+
+        head = getattr(self.model, "score", None) or getattr(
+            self.model, "classifier", None
+        )
+        if not isinstance(head, torch.nn.Linear):
+            return
+
+        weight = head.weight
+        # A freshly-initialized head is all-zero (benign) or garbage (huge/non-finite);
+        # a head loaded from a real reward checkpoint is finite and reasonably scaled.
+        looks_uninitialized = (
+            not torch.isfinite(weight).all()
+            or weight.abs().max() > 100
+            or bool((weight == 0).all())
+        )
+        if not looks_uninitialized:
+            return
+
+        std = getattr(self.model.config, "initializer_range", 0.02) or 0.02
+        with torch.no_grad():
+            weight.normal_(mean=0.0, std=std)
+            if head.bias is not None:
+                head.bias.zero_()
+        LOG.info(
+            f"Re-initialized {type(self.model).__name__} classification head "
+            f"(std={std})."
+        )
+
     def _configure_experts_implementation(self):
-        if self.cfg.experts_implementation is not None:
-            self.model.set_experts_implementation(self.cfg.experts_implementation)
+        impl = self.cfg.experts_implementation
+        if impl is None:
+            return
+
+        if impl in ("scattermoe", "sonicmoe"):
+            model_classes = {
+                type(m) for m in self.model.modules() if isinstance(m, PreTrainedModel)
+            }
+            if not any(cls._can_set_experts_implementation() for cls in model_classes):
+                LOG.warning(
+                    f"experts_implementation={impl!r} requested, but no submodule of "
+                    f"{type(self.model).__name__} uses transformers' ExpertsInterface "
+                    "(@use_experts_implementation). The kernel will NOT be applied; "
+                    "training falls back to the model's native experts path."
+                )
+
+        self.model.set_experts_implementation(impl)
 
     def _apply_activation_checkpointing(self):
         if self.cfg.activation_offloading is True:
@@ -911,8 +988,11 @@ def _convert_embedding_modules_dtype(
         dest = {"dtype": dist_dtype}
         if self.cfg.lora_on_cpu:
             dest["device"] = "cpu"
+        fp32_norm_patterns = get_fp32_norm_patterns(self.cfg)
         for name, module in self.model.named_modules():
-            if "norm" in name:
+            if fp32_norm_patterns and _matches_norm_class(module, fp32_norm_patterns):
+                module.to(torch.float32)
+            elif "norm" in name:
                 module.to(dist_dtype)
             if before_kbit_train_or_finetune:
                 if name.endswith(".gate"):
diff --git a/src/axolotl/loaders/patch_manager.py b/src/axolotl/loaders/patch_manager.py
index e0a224d079..495ff58851 100644
--- a/src/axolotl/loaders/patch_manager.py
+++ b/src/axolotl/loaders/patch_manager.py
@@ -104,6 +104,9 @@ def apply_pre_model_load_patches(self):
         self._apply_flash_attn_4_patches()
         self._apply_fsdp_patches()
         self._apply_adapter_patches()
+        # Must precede fused-RoPE patches: re-parses ``Attention.forward``
+        # via ``inspect.getsource``; the QKV regex misses on a patched body.
+        self._apply_self_attention_lora_patch()
         self._apply_model_specific_patches()
         self._apply_fp8_patches()
         self._apply_flash_attention_peft_patches()
@@ -113,7 +116,6 @@ def apply_pre_model_load_patches(self):
         self._patch_loss_llama()
         self._patch_llama_derived_model()
         self._apply_mistral_cross_entropy_patch()
-        self._apply_self_attention_lora_patch()
         self._apply_fsdp2_bnb_patches()
         self._apply_patch_deepspeed_zero3()
         self._apply_voxtral_patches()
@@ -166,6 +168,7 @@ def apply_post_model_load_patches(self, model: PreTrainedModel):
         self._apply_lora_kernel_patch(model)
         self._apply_scaling_softmax_patch(model)
         self._apply_fp8_attention_patches(model)
+        self._apply_tiled_mlp_post_load(model)
 
     def _apply_gemma_hybrid_attention(self, model: PreTrainedModel):
         """Apply hybrid attention: FA2 for sliding window layers, SDPA for global layers.
@@ -350,8 +353,50 @@ def _apply_flash_attn_4_patches(self):
 
         patch_flash_attn_4(self.model_config)
 
+    _FUSED_ATTN_KERNEL_SUPPORTED = (
+        "qwen3",
+        "qwen3_moe",
+        "qwen3_vl",
+        "qwen3_vl_text",
+        "qwen3_5",
+        "qwen3_5_text",
+        "qwen3_5_moe",
+        "qwen3_5_moe_text",
+        "gemma4",
+        "gemma4_text",
+    )
+
+    @staticmethod
+    def _warn_if_fused_attn_unsupported(cfg):
+        """Warn when ``fused_attn_kernel`` targets an unsupported
+        ``model_config_type`` (derived post-schema by ``normalize_config()``)."""
+        if not getattr(cfg, "fused_attn_kernel", False):
+            return
+        mct = getattr(cfg, "model_config_type", None)
+        if mct and mct not in PatchManager._FUSED_ATTN_KERNEL_SUPPORTED:
+            LOG.warning(
+                "`fused_attn_kernel: true` is set but model_config_type=%r is not "
+                "in the supported set %s. The flag is a silent no-op for this "
+                "model. Remove the flag or use one of the supported model families.",
+                mct,
+                sorted(PatchManager._FUSED_ATTN_KERNEL_SUPPORTED),
+            )
+
     def _apply_model_specific_patches(self):
         """Apply patches specific to model architectures."""
+        self._warn_if_fused_attn_unsupported(self.cfg)
+
+        if self.cfg.model_config_type == "gemma4" and self.cfg.use_kernels:
+            # transformers' Gemma4VisionAttention registers a bare function via
+            # @use_kernelized_func, which crashes model.kernelize() (triggered by
+            # use_kernels=True) when it tries to register_module() a non-Module.
+            # Strip the dead entry so kernelize() succeeds. The MoE itself is
+            # accelerated via the ExpertsInterface (experts_implementation),
+            # independent of this path.
+            from axolotl.monkeypatch.gemma4_kernelize import patch_gemma4_kernelize
+
+            patch_gemma4_kernelize()
+
         if (
             self.cfg.model_config_type == "llama4"
             and self.cfg.llama4_linearized_experts
@@ -369,22 +414,39 @@ def _apply_model_specific_patches(self):
 
             patch_kimi_model()
 
-        if self.cfg.model_config_type == "nemotron_h":
-            if self.cfg.sample_packing:
-                from transformers.models.nemotron_h.modeling_nemotron_h import (
-                    NemotronHPreTrainedModel,
-                )
+        ssm_hybrid_patch_needed = (
+            self.cfg.sample_packing or self.cfg.context_parallel_size > 1
+        )
 
-                from axolotl.monkeypatch.models.nemotron_h.modeling import (
-                    patch_nemotron_h_modeling_packing,
-                )
+        if self.cfg.model_config_type == "nemotron_h" and ssm_hybrid_patch_needed:
+            from transformers.models.nemotron_h.modeling_nemotron_h import (
+                NemotronHPreTrainedModel,
+            )
 
-                patch_nemotron_h_modeling_packing()
-                # supports_gradient_checkpointing is only enabled after
-                # patch_nemotron_h_modeling_packing() installs the GC-compatible
-                # NemotronHBlock.forward. Without the patch, upstream marks this
-                # False because the original block forward is not GC-safe.
-                NemotronHPreTrainedModel.supports_gradient_checkpointing = True
+            from axolotl.monkeypatch.models.nemotron_h.modeling import (
+                patch_nemotron_h_modeling_packing,
+            )
+
+            patch_nemotron_h_modeling_packing()
+            # supports_gradient_checkpointing is only enabled after
+            # patch_nemotron_h_modeling_packing() installs the GC-compatible
+            # NemotronHBlock.forward. Without the patch, upstream marks this
+            # False because the original block forward is not GC-safe.
+            NemotronHPreTrainedModel.supports_gradient_checkpointing = True
+
+        if self.cfg.model_config_type == "falcon_h1" and ssm_hybrid_patch_needed:
+            from axolotl.monkeypatch.models.falcon_h1.modeling import (
+                patch_falcon_h1_modeling_packing,
+            )
+
+            patch_falcon_h1_modeling_packing()
+
+        if self.cfg.model_config_type == "granitemoehybrid" and ssm_hybrid_patch_needed:
+            from axolotl.monkeypatch.models.granitemoehybrid.modeling import (
+                patch_granitemoehybrid_modeling_packing,
+            )
+
+            patch_granitemoehybrid_modeling_packing()
 
         # Patches requiring CUDA
         if torch.cuda.is_available():
@@ -444,6 +506,50 @@ def _apply_model_specific_patches(self):
                     install_shared_kv_workaround=needs_shared_kv_workaround
                 )
 
+            if self.cfg.fused_attn_kernel and self.cfg.model_config_type == "qwen3":
+                from axolotl.monkeypatch.models.qwen3.fused_attn import (
+                    patch_qwen3_fused_attn,
+                )
+
+                patch_qwen3_fused_attn()
+
+            if self.cfg.fused_attn_kernel and self.cfg.model_config_type == "qwen3_moe":
+                from axolotl.monkeypatch.models.qwen3_moe.fused_attn import (
+                    patch_qwen3_moe_fused_attn,
+                )
+
+                patch_qwen3_moe_fused_attn()
+
+            if self.cfg.fused_attn_kernel and self.cfg.model_config_type in (
+                "qwen3_vl",
+                "qwen3_vl_text",
+            ):
+                from axolotl.monkeypatch.models.qwen3_vl.fused_attn import (
+                    patch_qwen3_vl_fused_attn,
+                )
+
+                patch_qwen3_vl_fused_attn()
+
+            if self.cfg.fused_attn_kernel and self.cfg.model_config_type in (
+                "qwen3_5",
+                "qwen3_5_text",
+            ):
+                from axolotl.monkeypatch.models.qwen3_5.fused_attn import (
+                    patch_qwen3_5_fused_attn,
+                )
+
+                patch_qwen3_5_fused_attn()
+
+            if self.cfg.fused_attn_kernel and self.cfg.model_config_type in (
+                "qwen3_5_moe",
+                "qwen3_5_moe_text",
+            ):
+                from axolotl.monkeypatch.models.qwen3_5_moe.fused_attn import (
+                    patch_qwen3_5_moe_fused_attn,
+                )
+
+                patch_qwen3_5_moe_fused_attn()
+
     @staticmethod
     def _fix_nemotron_h_conversion_mapping():
         """Remove the spurious embedding→embeddings WeightRenaming from the
@@ -672,8 +778,27 @@ def _apply_tiled_mlp(self, model_type: str):
                 model_type,
                 use_original_mlp=self.cfg.tiled_mlp_use_original_mlp,
                 cfg_num_shards=self.cfg.tiled_mlp_num_shards,
+                use_scattermoe=bool(self.cfg.use_scattermoe),
             )
 
+    def _apply_tiled_mlp_post_load(self, model):
+        """Re-wrap MoE block instances after kernels have installed their forward.
+
+        Needed only when scattermoe-lora is active — ``model.kernelize()``
+        binds ``HFScatterMoEGatedMLP.forward`` per instance, which shadows
+        the class-level tiled patch. See
+        :func:`axolotl.monkeypatch.tiled_mlp.patch_tiled_mlp_moe_instances`.
+        """
+        if not (self.cfg.tiled_mlp and self.cfg.use_scattermoe):
+            return
+        from axolotl.monkeypatch.tiled_mlp import patch_tiled_mlp_moe_instances
+
+        patch_tiled_mlp_moe_instances(
+            model,
+            self.cfg.model_config_type,
+            cfg_num_shards=self.cfg.tiled_mlp_num_shards,
+        )
+
     def _apply_voxtral_patches(self):
         """Apply patches for Voxtral model."""
         if self.cfg.model_config_type == "voxtral":
@@ -747,6 +872,7 @@ def _patch_llama_derived_model(self):
 
     def _apply_llama_flash_attn_patches(self, model):
         """Apply LLaMA-specific flash attention patches."""
+
         if (
             self.model_config.model_type
             in ["llama", "llama4", "ernie4_5", "ernie4_5_moe"]
@@ -756,15 +882,18 @@ def _apply_llama_flash_attn_patches(self, model):
             and is_flash_attn_available()
             and not self.inference
         ):
-            # TODO(MengqingCao): split these patches separately
-            from axolotl.monkeypatch.llama_attn_hijack_flash import (
-                is_xformers_swiglu_available,
-                replace_llama_mlp_with_swiglu,
-            )
+            try:
+                # TODO(MengqingCao): split these patches separately
+                from axolotl.monkeypatch.llama_attn_hijack_flash import (
+                    is_xformers_swiglu_available,
+                    replace_llama_mlp_with_swiglu,
+                )
 
-            if self.cfg.flash_attn_fuse_mlp and is_xformers_swiglu_available():
-                LOG.info("Patching with SwiGLU...")
-                replace_llama_mlp_with_swiglu(model)
+                if self.cfg.flash_attn_fuse_mlp and is_xformers_swiglu_available():
+                    LOG.info("Patching with SwiGLU...")
+                    replace_llama_mlp_with_swiglu(model)
+            except ImportError as e:
+                LOG.warning(f"Flash Attention patches not applied: {e}")
 
     def _apply_lora_kernel_patch(self, model):
         """Apply LoRA kernel patches."""
diff --git a/src/axolotl/monkeypatch/accelerate/fsdp2.py b/src/axolotl/monkeypatch/accelerate/fsdp2.py
index fc34636969..531f8cb2f8 100644
--- a/src/axolotl/monkeypatch/accelerate/fsdp2.py
+++ b/src/axolotl/monkeypatch/accelerate/fsdp2.py
@@ -13,6 +13,7 @@
 from torch import nn
 
 from axolotl.utils.bench import log_gpu_memory_usage
+from axolotl.utils.fp32_norms import get_fp32_norm_patterns, shard_norms_fp32
 from axolotl.utils.logging import get_logger
 
 LOG = get_logger(__name__)
@@ -426,6 +427,14 @@ def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module:
 
     auto_wrap_policy = fsdp2_prepare_auto_wrap_policy(fsdp2_plugin, model)
     log_bias_dtype_mismatch = False
+    fp32_norm_patterns = get_fp32_norm_patterns(model)
+    if fp32_norm_patterns:
+        shard_norms_fp32(
+            model,
+            patterns=fp32_norm_patterns,
+            fully_shard_kwargs=fsdp2_kwargs,
+        )
+
     if auto_wrap_policy is not None:
         for module in get_module_children_bottom_up(model)[:-1]:
             if is_peft_model and isinstance(module, LoraLayer):
diff --git a/src/axolotl/monkeypatch/fsdp2_qlora.py b/src/axolotl/monkeypatch/fsdp2_qlora.py
index 1887c0a8aa..6fd3ffaeb8 100644
--- a/src/axolotl/monkeypatch/fsdp2_qlora.py
+++ b/src/axolotl/monkeypatch/fsdp2_qlora.py
@@ -26,8 +26,15 @@ def apply_init_sharded_param_patch():
     original_source = inspect.getsource(FSDPParam._init_sharded_param)
     original_source, _ = detab_code(original_source)
 
-    # Define the replacement
-    original_param_creation = """    self.sharded_param = nn.Parameter(self.to_sharded_dtensor(sharded_param))
+    # torch 2.12 rewrote the sharded-param construction from a two-line
+    # form (Parameter() + requires_grad_()) to a single multi-line
+    # Parameter() call with requires_grad= as a kwarg. Try the 2.12
+    # anchor first, fall back to the 2.11 form.
+    anchors_2120 = """    self.sharded_param = nn.Parameter(
+        self.to_sharded_dtensor(sharded_param),
+        requires_grad=param.requires_grad,
+    )"""
+    anchors_2110 = """    self.sharded_param = nn.Parameter(self.to_sharded_dtensor(sharded_param))
     self.sharded_param.requires_grad_(param.requires_grad)"""
 
     patched_param_creation = """    import bitsandbytes as bnb
@@ -59,8 +66,13 @@ def apply_init_sharded_param_patch():
             requires_grad=param.requires_grad,
         )"""
 
+    original_param_creation = next(
+        (a for a in (anchors_2120, anchors_2110) if a in original_source),
+        None,
+    )
+
     # Apply the replacement
-    if original_param_creation in original_source:
+    if original_param_creation is not None:
         patched_source = original_source.replace(
             original_param_creation, patched_param_creation
         )
@@ -103,12 +115,48 @@ def apply_init_unsharded_param_patch():
     original_source = inspect.getsource(FSDPParam.init_unsharded_param)
     original_source, _ = detab_code(original_source)
 
-    # Define the replacement
-    original_param_creation = """        self._unsharded_param = nn.Parameter(
+    # torch 2.12 hoisted the unsharded-param construction out of the
+    # first-all-gather `else:` branch up to method-body level, so the 2.11
+    # anchor (8-space, inside else) no longer matches. The replacement must be
+    # indented to match whichever anchor is found, so each anchor carries its
+    # own. Try the 2.12 anchor first, fall back to the 2.11 form.
+    anchor_replacement_pairs = [
+        (
+            """    self._unsharded_param = nn.Parameter(
+        unsharded_param, requires_grad=self.sharded_param.requires_grad
+    )""",
+            """    import bitsandbytes as bnb
+    local_tensor = self.sharded_param._local_tensor
+    if isinstance(local_tensor, bnb.nn.modules.Params4bit):
+        self._unsharded_param = bnb.nn.modules.Params4bit(
+            data=unsharded_param,
+            requires_grad=self.sharded_param.requires_grad,
+            quant_state=local_tensor.quant_state,
+            blocksize=local_tensor.blocksize,
+            compress_statistics=local_tensor.compress_statistics,
+            quant_type=local_tensor.quant_type,
+            quant_storage=local_tensor.quant_storage,
+            module=local_tensor.module,
+            bnb_quantized=local_tensor.bnb_quantized,
+        )
+    elif isinstance(local_tensor, bnb.nn.modules.Int8Params):
+        self._unsharded_param = bnb.nn.modules.Int8Params(
+            data=unsharded_param,
+            requires_grad=self.sharded_param.requires_grad,
+            has_fp16_weights=local_tensor.has_fp16_weights,
+            CB=unsharded_param,
+            SCB=local_tensor.SCB,
+        )
+    else:
+        self._unsharded_param = nn.Parameter(
             unsharded_param, requires_grad=self.sharded_param.requires_grad
-        )"""
-
-    patched_param_creation = """        import bitsandbytes as bnb
+        )""",
+        ),
+        (
+            """        self._unsharded_param = nn.Parameter(
+            unsharded_param, requires_grad=self.sharded_param.requires_grad
+        )""",
+            """        import bitsandbytes as bnb
         local_tensor = self.sharded_param._local_tensor
         if isinstance(local_tensor, bnb.nn.modules.Params4bit):
             self._unsharded_param = bnb.nn.modules.Params4bit(
@@ -133,10 +181,17 @@ def apply_init_unsharded_param_patch():
         else:
             self._unsharded_param = nn.Parameter(
                 unsharded_param, requires_grad=self.sharded_param.requires_grad
-            )"""
+            )""",
+        ),
+    ]
+
+    original_param_creation, patched_param_creation = next(
+        ((a, p) for a, p in anchor_replacement_pairs if a in original_source),
+        (None, None),
+    )
 
     # Apply the replacement
-    if original_param_creation in original_source:
+    if original_param_creation is not None:
         patched_source = original_source.replace(
             original_param_creation, patched_param_creation
         )
diff --git a/src/axolotl/monkeypatch/gemma4_kernelize.py b/src/axolotl/monkeypatch/gemma4_kernelize.py
new file mode 100644
index 0000000000..b87f0b840d
--- /dev/null
+++ b/src/axolotl/monkeypatch/gemma4_kernelize.py
@@ -0,0 +1,113 @@
+"""Fix for transformers' Gemma 4 ``kernelize()`` crash under ``use_kernels``.
+
+In transformers, ``Gemma4VisionAttention`` is decorated with
+``@use_kernelized_func(apply_rotary_pos_emb)`` where ``apply_rotary_pos_emb``
+is a **plain function**, not a ``@use_kernel_func_from_hub``-wrapped kernel
+layer. That decorator stashes the bare function in each instance's
+``_hidden_kernels`` dict.
+
+When ``use_kernels=True`` (which axolotl's ``KernelsArgs`` force-enables for the
+ScatterMoE path), ``from_pretrained`` calls ``model.kernelize()``, whose
+``attach_hidden_kernels`` step does ``module.register_module(name, fn)`` for each
+``_hidden_kernels`` entry. ``register_module`` rejects a non-``nn.Module``::
+
+    TypeError: ...apply_rotary_pos_emb is not a Module subclass
+
+and the ``finally``-block cleanup then raises the visible::
+
+    AttributeError: 'Gemma4VisionAttention' object has no attribute apply_rotary_pos_emb
+
+This is a transformers bug, not Gemma4-specific in spirit (qwen3_moe avoids it
+by wrapping the func with ``@use_kernel_func_from_hub`` so a Module-like ``Func``
+is registered). Notably, ``Gemma4VisionAttention.forward`` calls
+``apply_multidimensional_rope`` and never references ``apply_rotary_pos_emb``, so
+the registered entry is dead weight — dropping the non-Module ``_hidden_kernels``
+entries makes ``kernelize()`` a no-op for vision attention with zero behavior
+change.
+
+The patch wraps ``Gemma4VisionAttention.__init__`` to strip any non-``nn.Module``
+``_hidden_kernels`` entries after construction. Properly-wrapped (Module) entries,
+including ones a fixed transformers might introduce, are left intact, so the patch
+is forward-compatible. Idempotent; install before the model is built.
+"""
+
+from __future__ import annotations
+
+from axolotl.utils.logging import get_logger
+
+LOG = get_logger(__name__)
+
+_PATCH_APPLIED = False
+
+
+def patch_gemma4_kernelize() -> bool:
+    """Strip dead non-Module ``_hidden_kernels`` entries on ``Gemma4VisionAttention``.
+
+    Returns ``True`` if the patch is installed (or already was), ``False`` if the
+    target class could not be imported (e.g. transformers predates Gemma 4) — in
+    which case nothing is done and the caller can continue unaffected.
+    """
+    global _PATCH_APPLIED
+    if _PATCH_APPLIED:
+        return True
+
+    try:
+        from transformers.models.gemma4 import modeling_gemma4
+    except ImportError:
+        LOG.debug(
+            "gemma4_kernelize: transformers.models.gemma4 not importable, "
+            "skipping. This is fine for non-Gemma4 training."
+        )
+        return False
+
+    cls = getattr(modeling_gemma4, "Gemma4VisionAttention", None)
+    if cls is None:
+        LOG.warning(
+            "gemma4_kernelize: modeling_gemma4 has no 'Gemma4VisionAttention', "
+            "skipping. Transformers API may have changed."
+        )
+        return False
+
+    import torch.nn as nn
+
+    orig_init = cls.__init__
+
+    def init(self, *args, **kwargs):
+        orig_init(self, *args, **kwargs)
+        hidden_kernels = self.__dict__.get("_hidden_kernels")
+        if hidden_kernels:
+            stale = [
+                name
+                for name, fn in hidden_kernels.items()
+                if not isinstance(fn, nn.Module)
+            ]
+            for name in stale:
+                del hidden_kernels[name]
+
+    # Preserve the original for teardown / idempotency checks.
+    init._axolotl_original = orig_init  # type: ignore[attr-defined]
+    cls.__init__ = init
+    _PATCH_APPLIED = True
+    LOG.info(
+        "gemma4_kernelize: patched Gemma4VisionAttention to drop non-Module "
+        "_hidden_kernels entries so use_kernels/kernelize() does not crash"
+    )
+    return True
+
+
+def unpatch_gemma4_kernelize() -> None:
+    """Restore the original ``Gemma4VisionAttention.__init__``. Useful for tests."""
+    global _PATCH_APPLIED
+    if not _PATCH_APPLIED:
+        return
+    try:
+        from transformers.models.gemma4 import modeling_gemma4
+    except ImportError:
+        _PATCH_APPLIED = False
+        return
+    cls = getattr(modeling_gemma4, "Gemma4VisionAttention", None)
+    if cls is not None:
+        original = getattr(cls.__init__, "_axolotl_original", None)
+        if original is not None:
+            cls.__init__ = original
+    _PATCH_APPLIED = False
diff --git a/src/axolotl/monkeypatch/lora_kernels.py b/src/axolotl/monkeypatch/lora_kernels.py
index b9e7f3e0c8..8156b72c7d 100644
--- a/src/axolotl/monkeypatch/lora_kernels.py
+++ b/src/axolotl/monkeypatch/lora_kernels.py
@@ -76,85 +76,8 @@
     value_states = value_states.view(hidden_shape).transpose(1, 2)
 """.lstrip("\n"),
     ),
-    # Gemma4: norm between proj and transpose, RoPE between norm and transpose,
-    # conditional KV sharing (is_kv_shared_layer), v_proj may be None (attention_k_eq_v).
-    # We only fuse the projection calls; norms, RoPE, and KV sharing stay as-is.
-    (
-        """
-    query_states = self.q_proj(hidden_states).view(hidden_shape)
-    query_states = self.q_norm(query_states)
-    query_states = apply_rotary_pos_emb(query_states, cos, sin, unsqueeze_dim=2)
-    query_states = query_states.transpose(1, 2)
-
-    # For layers with shared KV (from kv sharing point onwards), we reuse the same keys/values states as the last non-sharing layer
-    if self.is_kv_shared_layer and past_key_values is not None:
-        key_states, value_states = past_key_values.shared_layers[self.kv_shared_layer_index]
-        # Device of past layer may be different from current one
-        key_states = key_states.to(query_states.device)
-        value_states = value_states.to(query_states.device)
-    else:
-        key_states = self.k_proj(hidden_states).view(hidden_shape)
-        value_states = self.v_proj(hidden_states).view(hidden_shape) if self.v_proj is not None else key_states
-""".lstrip("\n"),
-        """
-    query_states, key_states, value_states = self.apply_qkv(hidden_states)
-    query_states = query_states.view(hidden_shape)
-    query_states = self.q_norm(query_states)
-    query_states = apply_rotary_pos_emb(query_states, cos, sin, unsqueeze_dim=2)
-    query_states = query_states.transpose(1, 2)
-
-    # For layers with shared KV (from kv sharing point onwards), we reuse the same keys/values states as the last non-sharing layer
-    if self.is_kv_shared_layer and past_key_values is not None:
-        key_states, value_states = past_key_values.shared_layers[self.kv_shared_layer_index]
-        # Device of past layer may be different from current one
-        key_states = key_states.to(query_states.device)
-        value_states = value_states.to(query_states.device)
-    else:
-        key_states = key_states.view(hidden_shape)
-        value_states = value_states.view(hidden_shape) if self.v_proj is not None else key_states
-""".lstrip("\n"),
-    ),
-    # Gemma4 (transformers >= 5.6): shared_kv_states parameter replaces
-    # past_key_values.shared_layers, and v_norm added after k_norm.
-    (
-        """
-    query_states = self.q_proj(hidden_states).view(hidden_shape)
-    query_states = self.q_norm(query_states)
-    query_states = apply_rotary_pos_emb(query_states, cos, sin, unsqueeze_dim=2)
-    query_states = query_states.transpose(1, 2)
-
-    # For layers with shared KV (from kv sharing point onwards), we reuse the same keys/values states as the last non-sharing layer.
-    # We cannot simply reuse the cached state if we have a Cache, as sliding layers will not remember the full states in their Cache
-    # once we are past the sliding window - so we always use `shared_kv_states` instead, even when past_key_values is not None
-    if self.is_kv_shared_layer:
-        key_states, value_states = shared_kv_states[self.kv_shared_layer_index]
-        # Device of past layer may be different from current one
-        key_states = key_states.to(query_states.device)
-        value_states = value_states.to(query_states.device)
-    else:
-        key_states = self.k_proj(hidden_states).view(hidden_shape)
-        value_states = self.v_proj(hidden_states).view(hidden_shape) if self.v_proj is not None else key_states
-""".lstrip("\n"),
-        """
-    query_states, key_states, value_states = self.apply_qkv(hidden_states)
-    query_states = query_states.view(hidden_shape)
-    query_states = self.q_norm(query_states)
-    query_states = apply_rotary_pos_emb(query_states, cos, sin, unsqueeze_dim=2)
-    query_states = query_states.transpose(1, 2)
-
-    # For layers with shared KV (from kv sharing point onwards), we reuse the same keys/values states as the last non-sharing layer.
-    # We cannot simply reuse the cached state if we have a Cache, as sliding layers will not remember the full states in their Cache
-    # once we are past the sliding window - so we always use `shared_kv_states` instead, even when past_key_values is not None
-    if self.is_kv_shared_layer:
-        key_states, value_states = shared_kv_states[self.kv_shared_layer_index]
-        # Device of past layer may be different from current one
-        key_states = key_states.to(query_states.device)
-        value_states = value_states.to(query_states.device)
-    else:
-        key_states = key_states.view(hidden_shape)
-        value_states = value_states.view(hidden_shape) if self.v_proj is not None else key_states
-""".lstrip("\n"),
-    ),
+    # Gemma4 has no entry: its fused forward already calls apply_qkv/apply_o,
+    # and patch_self_attn_lora skips it (see the skip there).
 ]
 
 ORIGINAL_O_CODE = """
@@ -323,6 +246,23 @@ def patch_self_attn_lora(cfg: DictDefault):
         LOG.info(f"{attention_cls.__name__} already patched")
         return
 
+    # Skip Gemma4: patch_manager applies patch_gemma4_fused_attn
+    # unconditionally for gemma4 before this runs, and that fused forward
+    # already calls apply_qkv/apply_o, so the source rewrite is dead.
+    try:
+        from transformers.models.gemma4.modeling_gemma4 import (
+            Gemma4TextAttention,
+        )
+
+        if attention_cls is Gemma4TextAttention:
+            LOG.info(
+                "Gemma4TextAttention uses the fused attention path "
+                "(apply_qkv/apply_o) - skipping LoRA source rewrite"
+            )
+            return
+    except ImportError:
+        pass
+
     self_attn_forward = inspect.getsource(attention_cls.forward)
     attention_cls._original_forward = self_attn_forward
     self_attn_forward, _ = detab_code(self_attn_forward)
diff --git a/src/axolotl/monkeypatch/models/falcon_h1/__init__.py b/src/axolotl/monkeypatch/models/falcon_h1/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/axolotl/monkeypatch/models/falcon_h1/modeling.py b/src/axolotl/monkeypatch/models/falcon_h1/modeling.py
new file mode 100644
index 0000000000..e733a85b77
--- /dev/null
+++ b/src/axolotl/monkeypatch/models/falcon_h1/modeling.py
@@ -0,0 +1,364 @@
+"""Sample-packing and context-parallelism patch for Falcon-H1 (parallel Mamba2/Attention hybrid).
+
+Threads seq_idx (derived from position_ids) into the Mamba2 SSM kernels so
+packed-sequence boundaries reset SSM state.  Upstream hard-codes seq_idx=None,
+which leaks hidden state across boundaries.
+
+Unlike Nemotron-H (which selects block_type per layer), Falcon-H1 runs both
+Mamba2 and Attention in **parallel** in every FalconH1DecoderLayer, so we
+always need seq_idx for the mamba branch.
+"""
+
+import importlib
+
+import torch
+
+from axolotl.monkeypatch.models.mamba_utils import (
+    ensure_mamba_kernels_loaded,
+    get_seq_idx,
+    is_cp_active,
+    wrap_mamba_scan_for_cp,
+)
+from axolotl.utils.logging import get_logger
+
+LOG = get_logger(__name__)
+
+
+def patch_falcon_h1_modeling_packing():
+    """Patch Falcon-H1 for sample packing: seq_idx threading into Mamba2 SSM kernels."""
+    try:
+        mod = importlib.import_module(
+            "transformers.models.falcon_h1.modeling_falcon_h1"
+        )
+    except ImportError:
+        LOG.warning("falcon_h1 not found in transformers, skipping packing patches")
+        return
+
+    ensure_mamba_kernels_loaded(mod)
+
+    FalconH1Mixer = mod.FalconH1Mixer
+    FalconH1DecoderLayer = mod.FalconH1DecoderLayer
+
+    def patched_cuda_kernels_forward(
+        self,
+        hidden_states: torch.Tensor,
+        cache_params=None,
+        cache_position=None,
+        attention_mask=None,
+        seq_idx=None,
+    ):
+        hidden_states = mod.apply_mask_to_padding_states(hidden_states, attention_mask)
+        hidden_states = hidden_states * self.ssm_in_multiplier
+        projected_states = self.in_proj(hidden_states)
+        projected_states = projected_states * self.mup_vector
+        d_to_remove = (
+            2 * self.intermediate_size
+            + 2 * self.n_groups * self.ssm_state_size
+            + self.num_heads
+        )
+
+        batch_size, seq_len, _ = hidden_states.shape
+        groups_time_state_size = self.n_groups * self.ssm_state_size
+
+        use_precomputed_states = (
+            cache_params is not None
+            and cache_params.has_previous_state
+            and seq_len == 1
+            and cache_params.conv_states[self.layer_idx].shape[0]
+            == cache_params.ssm_states[self.layer_idx].shape[0]
+            == batch_size
+            and cache_position is not None
+            and cache_position[0] > 0
+        )
+
+        if use_precomputed_states:
+            d_mlp = (projected_states.squeeze(1).shape[-1] - d_to_remove) // 2
+            z0, x0, gate, hidden_states_B_C, dt = projected_states.squeeze(1).split(
+                [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads],
+                dim=-1,
+            )
+            hidden_states_B_C = mod.causal_conv1d_update(
+                hidden_states_B_C,
+                cache_params.conv_states[self.layer_idx],
+                self.conv1d.weight.squeeze(1),
+                self.conv1d.bias,
+                self.activation,
+            )
+            hidden_states, B, C = torch.split(
+                hidden_states_B_C,
+                [
+                    self.intermediate_size,
+                    groups_time_state_size,
+                    groups_time_state_size,
+                ],
+                dim=-1,
+            )
+            A = -torch.exp(self.A_log.float())
+            A = (
+                A[:, None, ...][:, :, None]
+                .expand(-1, self.head_dim, self.ssm_state_size)
+                .to(dtype=torch.float32)
+            )
+            dt = dt[:, :, None].expand(-1, -1, self.head_dim)
+            dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim)
+            D = self.D[:, None, ...].expand(-1, self.head_dim)
+            B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups)
+            C = C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups)
+            hidden_states_reshaped = hidden_states.view(
+                batch_size, self.num_heads, self.head_dim
+            )
+            hidden_states = mod.selective_state_update(
+                cache_params.ssm_states[self.layer_idx],
+                hidden_states_reshaped,
+                dt,
+                A,
+                B,
+                C,
+                D,
+                z=(
+                    gate.view(batch_size, self.num_heads, self.head_dim)
+                    if not self.mamba_rms_norm
+                    else None
+                ),
+                dt_bias=dt_bias,
+                dt_softplus=True,
+            )
+            hidden_states = hidden_states.view(
+                batch_size, self.num_heads * self.head_dim
+            )
+            if self.mamba_rms_norm:
+                hidden_states = self.norm(hidden_states, gate)
+            if d_mlp > 0:
+                hidden_states = torch.cat(
+                    [torch.nn.functional.silu(z0) * x0, hidden_states], dim=-1
+                )
+            out = self.out_proj(hidden_states[:, None, ...])
+        else:
+            A = -torch.exp(self.A_log.float())
+            dt_limit_kwargs = (
+                {}
+                if self.time_step_limit == (0.0, float("inf"))
+                else {"dt_limit": self.time_step_limit}
+            )
+
+            if self.training and cache_params is None and not is_cp_active():
+                out = mod.mamba_split_conv1d_scan_combined(
+                    projected_states,
+                    self.conv1d.weight.squeeze(1),
+                    self.conv1d.bias,
+                    self.dt_bias,
+                    A,
+                    D=self.D,
+                    chunk_size=self.chunk_size,
+                    seq_idx=seq_idx,
+                    activation=self.activation,
+                    rmsnorm_weight=(self.norm.weight if self.mamba_rms_norm else None),
+                    rmsnorm_eps=(
+                        self.norm.variance_epsilon if self.mamba_rms_norm else None
+                    ),
+                    outproj_weight=self.out_proj.weight,
+                    outproj_bias=self.out_proj.bias,
+                    headdim=self.head_dim,
+                    ngroups=self.n_groups,
+                    norm_before_gate=False,
+                    return_final_states=False,
+                    **dt_limit_kwargs,
+                )
+            else:
+                d_mlp = (
+                    projected_states.shape[-1]
+                    - 2 * self.intermediate_size
+                    - 2 * self.n_groups * self.ssm_state_size
+                    - self.num_heads
+                ) // 2
+                if attention_mask is not None:
+                    projected_states = projected_states * attention_mask[..., None]
+                _, gate, hidden_states_B_C, dt = projected_states.split(
+                    [2 * d_mlp, self.intermediate_size, self.conv_dim, self.num_heads],
+                    dim=-1,
+                )
+
+                if cache_params is not None:
+                    conv_states = torch.nn.functional.pad(
+                        hidden_states_B_C.permute(0, 2, 1),
+                        (
+                            self.conv_kernel_size - hidden_states_B_C.shape[-2],
+                            0,
+                        ),
+                    )
+                    cache_params.update_conv_state(
+                        self.layer_idx, conv_states, cache_position
+                    )
+
+                time_step = torch.nn.functional.softplus(dt + self.dt_bias)
+
+                if mod.causal_conv1d_fn is None or self.activation not in [
+                    "silu",
+                    "swish",
+                ]:
+                    hidden_states_B_C = self.act(
+                        self.conv1d(hidden_states_B_C.transpose(1, 2)).transpose(1, 2)[
+                            :, :seq_len
+                        ]
+                    )
+                else:
+                    hidden_states_B_C = mod.causal_conv1d_fn(
+                        x=hidden_states_B_C.transpose(1, 2),
+                        weight=self.conv1d.weight.squeeze(1),
+                        bias=self.conv1d.bias,
+                        activation=self.activation,
+                        seq_idx=seq_idx,
+                    ).transpose(1, 2)[:, :seq_len]
+
+                hidden_states, B, C = torch.split(
+                    hidden_states_B_C,
+                    [
+                        self.intermediate_size,
+                        groups_time_state_size,
+                        groups_time_state_size,
+                    ],
+                    dim=-1,
+                )
+
+                if (
+                    attention_mask is not None
+                    and attention_mask.shape[1] > 1
+                    and attention_mask.shape[0] > 1
+                ):
+                    dtype = hidden_states.dtype
+                    hidden_states = (hidden_states * attention_mask[:, :, None]).to(
+                        dtype
+                    )
+
+                C_reshaped = C.view(batch_size, seq_len, self.n_groups, -1)
+                with torch.cuda.device(hidden_states.device):
+                    scan_output, ssm_state = mod.mamba_chunk_scan_combined(
+                        hidden_states.view(batch_size, seq_len, -1, self.head_dim),
+                        time_step,
+                        A,
+                        B.view(batch_size, seq_len, self.n_groups, -1),
+                        C_reshaped,
+                        chunk_size=self.chunk_size,
+                        D=self.D,
+                        z=None,
+                        seq_idx=seq_idx,
+                        return_final_states=True,
+                        **dt_limit_kwargs,
+                    )
+
+                if ssm_state is not None and cache_params is not None:
+                    cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
+                scan_output = scan_output.view(batch_size, seq_len, -1)
+                if self.mamba_rms_norm:
+                    out = self.norm(scan_output, gate)
+                else:
+                    out = scan_output * torch.nn.functional.silu(gate)
+                out = self.out_proj(out)
+        return out
+
+    FalconH1Mixer.cuda_kernels_forward = patched_cuda_kernels_forward
+
+    def patched_mixer_forward(
+        self,
+        hidden_states,
+        cache_params=None,
+        cache_position=None,
+        attention_mask=None,
+        seq_idx=None,
+    ):
+        if seq_idx is not None and mod.causal_conv1d_fn is None:
+            raise RuntimeError(
+                "Falcon-H1 sample packing requires causal_conv1d_fn. "
+                "Install with: pip install mamba-ssm causal-conv1d"
+            )
+        if mod.is_fast_path_available and "cuda" in self.in_proj.weight.device.type:
+            return self.cuda_kernels_forward(
+                hidden_states,
+                cache_params,
+                cache_position,
+                attention_mask,
+                seq_idx=seq_idx,
+            )
+        if seq_idx is not None:
+            raise RuntimeError(
+                "Falcon-H1 sample packing requires the CUDA fast path. "
+                "Ensure model is on CUDA and mamba-ssm/causal-conv1d are installed."
+            )
+        dtype = hidden_states.dtype
+        if (
+            attention_mask is not None
+            and attention_mask.shape[1] > 1
+            and attention_mask.shape[0] > 1
+        ):
+            hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
+        return self.torch_forward(
+            hidden_states, cache_params, cache_position, attention_mask
+        )
+
+    FalconH1Mixer.forward = patched_mixer_forward
+
+    # Falcon-H1 runs mamba + attention in parallel every layer (no block_type).
+    # Compute seq_idx from position_ids and pass to the mamba branch.
+    def patched_decoder_forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        mamba_attention_mask=None,
+        position_ids=None,
+        past_key_values=None,
+        output_attentions=False,
+        use_cache=False,
+        cache_position=None,
+        position_embeddings=None,
+        **kwargs,
+    ):
+        is_decoding = past_key_values is not None and past_key_values.has_previous_state
+        seq_idx = (
+            get_seq_idx(position_ids)
+            if position_ids is not None and not is_decoding
+            else None
+        )
+
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        mamba_hidden_states = self.mamba(
+            hidden_states=hidden_states,
+            cache_params=past_key_values,
+            cache_position=cache_position,
+            attention_mask=mamba_attention_mask,
+            seq_idx=seq_idx,
+        )
+        mamba_hidden_states = mamba_hidden_states * self.ssm_out_multiplier
+
+        attention_hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states * self.attention_in_multiplier,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        attention_hidden_states = attention_hidden_states * self.attn_out_multiplier
+
+        hidden_states = mamba_hidden_states + attention_hidden_states
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.pre_ff_layernorm(hidden_states)
+        hidden_states = self.feed_forward(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+        return outputs
+
+    FalconH1DecoderLayer.forward = patched_decoder_forward
+
+    wrap_mamba_scan_for_cp(mod)
+
+    LOG.info("Applied Falcon-H1 sample packing patch (seq_idx threading into Mamba2)")
diff --git a/src/axolotl/monkeypatch/models/gemma4/fused_attn.py b/src/axolotl/monkeypatch/models/gemma4/fused_attn.py
index 2144b6c417..a4f15207f8 100644
--- a/src/axolotl/monkeypatch/models/gemma4/fused_attn.py
+++ b/src/axolotl/monkeypatch/models/gemma4/fused_attn.py
@@ -31,6 +31,19 @@ def _get_shared_kv_states():
     return _GEMMA4_SHARED_KV_STORE["store"]
 
 
+def _shared_kv_read_key(attn):
+    # transformers >=5.8 keys shared kv by layer_type; older builds used kv_shared_layer_index
+    if hasattr(attn, "kv_shared_layer_index"):
+        return attn.kv_shared_layer_index
+    return attn.layer_type
+
+
+def _shared_kv_store_key(attn):
+    if hasattr(attn, "kv_shared_layer_index"):
+        return attn.layer_idx
+    return attn.layer_type
+
+
 def _make_fused_forward(original_forward):
     """Create a patched forward that uses fused RMSNorm+RoPE kernels."""
 
@@ -85,7 +98,7 @@ def fused_forward(
 
         # ---- K/V path ----
         if self.is_kv_shared_layer:
-            key_states, value_states = shared_kv_states[self.kv_shared_layer_index]
+            key_states, value_states = shared_kv_states[_shared_kv_read_key(self)]
             key_states = key_states.to(query_states.device)
             value_states = value_states.to(query_states.device)
         else:
@@ -124,7 +137,7 @@ def fused_forward(
                 key_states, value_states, self.layer_idx
             )
         if self.store_full_length_kv:
-            shared_kv_states[self.layer_idx] = key_states, value_states
+            shared_kv_states[_shared_kv_store_key(self)] = key_states, value_states
 
         attention_interface: Callable = eager_attention_forward
         if self.config._attn_implementation != "eager":
@@ -145,7 +158,11 @@ def fused_forward(
         )
 
         attn_output = attn_output.reshape(*input_shape, -1).contiguous()
-        attn_output = self.o_proj(attn_output)
+        # Use apply_o if present (LoRA O kernel patch), otherwise direct proj
+        if hasattr(self, "apply_o"):
+            attn_output = self.apply_o(attn_output)
+        else:
+            attn_output = self.o_proj(attn_output)
         return attn_output, attn_weights
 
     return fused_forward
diff --git a/src/axolotl/monkeypatch/models/granitemoehybrid/__init__.py b/src/axolotl/monkeypatch/models/granitemoehybrid/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/axolotl/monkeypatch/models/granitemoehybrid/modeling.py b/src/axolotl/monkeypatch/models/granitemoehybrid/modeling.py
new file mode 100644
index 0000000000..590db5408d
--- /dev/null
+++ b/src/axolotl/monkeypatch/models/granitemoehybrid/modeling.py
@@ -0,0 +1,107 @@
+"""Sample-packing and context-parallelism patch for Granite MoE Hybrid (Mamba2/Attention/MoE).
+
+Upstream GraniteMoeHybridMambaLayer already accepts seq_idx on
+forward/cuda_kernels_forward, and GraniteMoeHybridDecoderLayer passes **kwargs
+through to the mixer.  However, the decoder layer does not receive position_ids
+directly — it arrives at the model level.
+
+This patch:
+1. Injects seq_idx computation into GraniteMoeHybridModel.forward so it flows
+   through kwargs -> decoder_layer -> mamba mixer automatically.
+2. Forces the slow path when CP is active (the fused path doesn't return SSM
+   state).  CP correction is handled by ``wrap_mamba_scan_for_cp``.
+"""
+
+import importlib
+
+from axolotl.monkeypatch.models.mamba_utils import (
+    ensure_mamba_kernels_loaded,
+    get_seq_idx,
+    is_cp_active,
+    wrap_mamba_scan_for_cp,
+)
+from axolotl.utils.logging import get_logger
+
+LOG = get_logger(__name__)
+
+
+def patch_granitemoehybrid_modeling_packing():
+    """Patch Granite MoE Hybrid for sample packing: seq_idx + CP correction."""
+    try:
+        mod = importlib.import_module(
+            "transformers.models.granitemoehybrid.modeling_granitemoehybrid"
+        )
+    except ImportError:
+        LOG.warning(
+            "granitemoehybrid not found in transformers, skipping packing patches"
+        )
+        return
+
+    ensure_mamba_kernels_loaded(mod)
+
+    GraniteMoeHybridModel = mod.GraniteMoeHybridModel
+    GraniteMoeHybridMambaLayer = mod.GraniteMoeHybridMambaLayer
+
+    # Patch 1: Model-level seq_idx injection
+    original_model_forward = GraniteMoeHybridModel.forward
+
+    def patched_model_forward(self, *args, **kwargs):
+        position_ids = kwargs.get("position_ids")
+        if position_ids is None and len(args) > 2:
+            position_ids = args[2]
+
+        past_key_values = kwargs.get("past_key_values")
+        if past_key_values is None and len(args) > 3:
+            past_key_values = args[3]
+
+        is_decoding = (
+            past_key_values is not None
+            and hasattr(past_key_values, "has_previous_state")
+            and past_key_values.has_previous_state
+        )
+
+        if position_ids is not None and not is_decoding and "seq_idx" not in kwargs:
+            kwargs["seq_idx"] = get_seq_idx(position_ids)
+
+        return original_model_forward(self, *args, **kwargs)
+
+    GraniteMoeHybridModel.forward = patched_model_forward
+
+    # Patch 2: Minimal wrapper to force slow path when CP is active.
+    # The fused mamba_split_conv1d_scan_combined doesn't return SSM state, so
+    # CP correction (handled by the scan wrapper) needs the slow path.
+    original_cuda_kernels_forward = GraniteMoeHybridMambaLayer.cuda_kernels_forward
+
+    def patched_cuda_kernels_forward(
+        self,
+        hidden_states,
+        cache_params=None,
+        attention_mask=None,
+        seq_idx=None,
+    ):
+        force_slow = (
+            (seq_idx is not None or is_cp_active())
+            and self.training
+            and cache_params is None
+        )
+        if force_slow:
+            self.training = False
+        try:
+            return original_cuda_kernels_forward(
+                self,
+                hidden_states,
+                cache_params,
+                attention_mask,
+                seq_idx,
+            )
+        finally:
+            if force_slow:
+                self.training = True
+
+    GraniteMoeHybridMambaLayer.cuda_kernels_forward = patched_cuda_kernels_forward
+
+    wrap_mamba_scan_for_cp(mod)
+
+    LOG.info(
+        "Applied Granite MoE Hybrid sample packing patch (seq_idx + CP correction)"
+    )
diff --git a/src/axolotl/monkeypatch/models/mamba_utils.py b/src/axolotl/monkeypatch/models/mamba_utils.py
new file mode 100644
index 0000000000..0c0e12c5fa
--- /dev/null
+++ b/src/axolotl/monkeypatch/models/mamba_utils.py
@@ -0,0 +1,331 @@
+"""Shared utilities for Mamba2 SSM sample-packing and context-parallelism patches.
+
+Used by: nemotron_h, falcon_h1, granite_moe_hybrid
+"""
+
+import functools
+
+import torch
+import torch.distributed as dist
+
+
+def get_seq_idx(position_ids: torch.Tensor) -> torch.Tensor:
+    """Convert position_ids [B, T] → seq_idx [B, T] int32 for mamba-ssm kernels.
+
+    Example: position_ids [[0,1,2,3,0,1,2]] → seq_idx [[0,0,0,0,1,1,1]]
+
+    Under context parallelism a rank may receive a chunk that begins mid-sample
+    (position_ids[0] != 0), so the raw cumsum starts at 0 and subtracting 1
+    would yield -1 — an invalid value for the Mamba kernels.  Subtracting the
+    first element of the cumsum instead normalises every chunk to start at 0
+    while still correctly incrementing at every intra-chunk sample boundary.
+
+    Example (CP rank 1, chunk starts mid-sample):
+        position_ids [[3,4,5,0,1,2]] → seq_idx [[0,0,0,1,1,1]]
+    """
+    cumsum = torch.cumsum((position_ids == 0).int(), dim=-1)
+    return (cumsum - cumsum[..., :1]).to(torch.int32)
+
+
+def is_cp_active() -> bool:
+    """Return True if context parallelism (ring attention) is active on this rank.
+
+    Zero-cost when CP is not configured: the import guard ensures we only touch
+    the distributed group if ring_flash_attn is installed.
+    """
+    try:
+        from axolotl.monkeypatch.ring_attn import get_ring_attn_group
+
+        group = get_ring_attn_group()
+        return group is not None and dist.get_world_size(group) > 1
+    except (ImportError, RuntimeError):
+        return False
+
+
+def _get_cp_group_and_rank():
+    """Return (process_group, local_rank, world_size) for the CP ring."""
+    from axolotl.monkeypatch.ring_attn import get_ring_attn_group
+
+    group = get_ring_attn_group()
+    return group, dist.get_rank(group), dist.get_world_size(group)
+
+
+def ring_shift_ssm_state(
+    h_final: torch.Tensor,
+) -> torch.Tensor:
+    """P2P ring: send h_final to rank+1, receive from rank-1 within CP group.
+
+    Uses synchronous send/recv on the ring attention process group.
+    Rank 0 in the CP group receives zeros (no previous chunk).
+
+    Args:
+        h_final: Final SSM state from this rank's forward pass.
+                 Shape is architecture-dependent, typically [B, H, d, n].
+
+    Returns:
+        h_prev: SSM state received from rank-1, same shape/dtype as h_final.
+                Zero tensor on the first rank in the CP group.
+    """
+    group, local_rank, world_size = _get_cp_group_and_rank()
+    ranks = dist.get_process_group_ranks(group)
+
+    h_prev = torch.zeros_like(h_final)
+
+    if world_size <= 1:
+        return h_prev
+
+    prev_global = ranks[(local_rank - 1) % world_size]
+    next_global = ranks[(local_rank + 1) % world_size]
+
+    send_op = dist.P2POp(dist.isend, h_final.contiguous(), next_global, group=group)
+    recv_op = dist.P2POp(dist.irecv, h_prev, prev_global, group=group)
+
+    reqs = dist.batch_isend_irecv([send_op, recv_op])
+    for req in reqs:
+        req.wait()
+
+    # Rank 0 in the ring has no true predecessor — zero out received state
+    if local_rank == 0:
+        h_prev.zero_()
+
+    return h_prev
+
+
+def mamba2_cp_correction(
+    out: torch.Tensor,
+    h_final: torch.Tensor,
+    C: torch.Tensor,
+    cum_A: torch.Tensor,
+    h_prev: torch.Tensor,
+    num_heads: int,
+    head_dim: int,
+    seq_idx: torch.Tensor | None = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Apply CP correction to SSM output using the received state from rank-1.
+
+    SSM output is linear in the initial hidden state, so the contribution of
+    h_prev can be added analytically without a second forward pass.
+
+    For each timestep t in the local chunk:
+        propagated_state_t = cumA_t * h_prev          [B, H, d, n]
+        Δy_t = sum_over_n( C_t * propagated_state_t ) [B, H, d]
+
+    The corrected final state for this rank is:
+        h_final_corrected = h_final + cumA_T * h_prev
+
+    Sample packing correctness (seq_idx):
+        When sample packing is active, a CP rank may hold multiple packed
+        sequences. Only the first sequence (seq_idx == 0) is a continuation
+        of the previous rank's chunk — subsequent sequences are brand-new and
+        should receive zero correction from h_prev.
+
+        Passing seq_idx masks delta_y to zero for all tokens where
+        seq_idx > 0, preventing h_prev state from leaking into unrelated
+        packed sequences.
+
+    Args:
+        out:       SSM scan output from this rank, shape [B, T, D] where D = H*d.
+        h_final:   Final SSM state from this rank, shape [B, H, d, n].
+        C:         Output projection matrices, shape [B, T, n_groups, n].
+        cum_A:     Cumulative log-transition factors, shape [B, T, H].
+                   These are the log-space cumulative sums of A, so
+                   exp(cum_A_t) gives the transition matrix from step 0 to t.
+        h_prev:    SSM state received from rank-1 (zeros on rank 0).
+                   Shape [B, H, d, n].
+        num_heads: Number of SSM heads (H).
+        head_dim:  Dimension per head (d).
+        seq_idx:   Optional sequence index tensor, shape [B, T] int32.
+                   When provided, correction is zeroed for tokens where
+                   seq_idx > 0 (i.e. sequences that start fresh on this rank).
+
+    Returns:
+        corrected_out:     out + Δy, shape [B, T, D].
+        corrected_h_final: h_final + cumA_T * h_prev, shape [B, H, d, n].
+    """
+    if not h_prev.any():
+        return out, h_final
+
+    B, T, _ = out.shape
+    n_groups = C.shape[2]
+    heads_per_group = num_heads // n_groups
+
+    # cum_A: [B, T, H] → transition factors (exponentiate from log-space)
+    decay = torch.exp(cum_A).float()  # [B, T, H]
+
+    # Propagate h_prev through cumulative transitions: [B, T, H, d, n]
+    prop_state = decay[:, :, :, None, None] * h_prev[:, None, :, :, :].float()
+
+    # C: [B, T, n_groups, n] → expand to heads: [B, T, H, n]
+    C_expanded = C.float().repeat_interleave(heads_per_group, dim=2)  # [B, T, H, n]
+
+    # Δy_t = sum_n(C_t * prop_state_t) → [B, T, H, d]
+    delta_y = torch.einsum("bthn,bthdn->bthd", C_expanded, prop_state)
+
+    # Mask out correction for tokens belonging to new sequences on this rank.
+    # seq_idx == 0 → continuation of the sequence that crossed the CP boundary
+    # seq_idx  > 0 → brand-new packed sequence, h_prev is irrelevant to it
+    if seq_idx is not None:
+        # mask: [B, T, 1, 1] — broadcast over H and d
+        mask = (seq_idx == 0).to(delta_y.dtype).unsqueeze(-1).unsqueeze(-1)
+        delta_y = delta_y * mask
+
+    # Reshape to [B, T, D] where D = H * d
+    delta_y = delta_y.reshape(B, T, num_heads * head_dim).to(out.dtype)
+
+    corrected_out = out + delta_y
+
+    # Correct final state using last-timestep decay.
+    # If the last token is in a new sequence (seq_idx > 0 at T-1), h_prev
+    # should not propagate into h_final either.
+    if seq_idx is not None and seq_idx[:, -1].any():
+        # last token belongs to a new sequence — don't corrupt h_final
+        corrected_h_final = h_final
+    else:
+        decay_final = decay[:, -1, :, None, None]  # [B, H, 1, 1]
+        corrected_h_final = h_final + (decay_final * h_prev.float()).to(h_final.dtype)
+
+    return corrected_out, corrected_h_final
+
+
+def ensure_mamba_kernels_loaded(target_module):
+    """Eagerly resolve mamba-ssm and causal-conv1d globals on *target_module*.
+
+    Transformers >= 5.5 lazily loads these inside ``Mixer.__init__`` via
+    ``lazy_load_kernel``.  Our monkeypatches run *before* model instantiation,
+    so the module globals are still ``None``.  This helper triggers the kernel
+    resolution early so the patched ``cuda_kernels_forward`` (and
+    ``wrap_mamba_scan_for_cp``) can reference them.
+    """
+    if getattr(target_module, "mamba_chunk_scan_combined", None) is not None:
+        return
+
+    try:
+        from transformers.integrations.hub_kernels import lazy_load_kernel
+        from transformers.utils.import_utils import resolve_internal_import
+    except ImportError:
+        return
+
+    causal_conv1d = lazy_load_kernel("causal-conv1d")
+    if causal_conv1d is not None:
+        target_module.causal_conv1d_update = getattr(
+            causal_conv1d, "causal_conv1d_update", None
+        )
+        target_module.causal_conv1d_fn = getattr(
+            causal_conv1d, "causal_conv1d_fn", None
+        )
+
+    mamba_ssm = lazy_load_kernel("mamba-ssm")
+    if mamba_ssm is not None:
+        target_module.selective_state_update = resolve_internal_import(
+            mamba_ssm,
+            chained_path="ops.triton.selective_state_update.selective_state_update",
+        )
+        target_module.mamba_chunk_scan_combined = resolve_internal_import(
+            mamba_ssm,
+            chained_path="ops.triton.ssd_combined.mamba_chunk_scan_combined",
+        )
+        target_module.mamba_split_conv1d_scan_combined = resolve_internal_import(
+            mamba_ssm,
+            chained_path="ops.triton.ssd_combined.mamba_split_conv1d_scan_combined",
+        )
+
+    target_module.is_fast_path_available = all(
+        (
+            getattr(target_module, "selective_state_update", None),
+            getattr(target_module, "mamba_chunk_scan_combined", None),
+            getattr(target_module, "mamba_split_conv1d_scan_combined", None),
+            getattr(target_module, "causal_conv1d_fn", None),
+            getattr(target_module, "causal_conv1d_update", None),
+        )
+    )
+
+
+def wrap_mamba_scan_for_cp(target_module):
+    """Wrap ``mamba_chunk_scan_combined`` in *target_module* to apply CP correction.
+
+    After the scan, if CP is active the wrapper:
+    1. Sends the final SSM state to the next rank via ``ring_shift_ssm_state``.
+    2. Computes cumA from the scan's A / dt / dt_bias / dt_softplus args.
+    3. Calls ``mamba2_cp_correction`` to add the contribution of h_prev.
+
+    This is installed per-module so it only affects the architecture whose
+    modeling file imports ``mamba_chunk_scan_combined``.
+
+    The approach follows Tri Dao's Mamba-2 systems blog: each GPU computes its
+    local output and final states, states are passed via P2P, then outputs are
+    corrected — no ring attention needed for SSM layers.
+    """
+    if getattr(target_module, "_cp_scan_wrapped", False):
+        return
+
+    ensure_mamba_kernels_loaded(target_module)
+
+    if getattr(target_module, "mamba_chunk_scan_combined", None) is None:
+        return
+
+    original_scan = target_module.mamba_chunk_scan_combined
+
+    @functools.wraps(original_scan)
+    def _cp_scan_wrapper(*args, **kwargs):
+        cp_active = is_cp_active()
+
+        if cp_active:
+            kwargs["return_final_states"] = True
+
+        result = original_scan(*args, **kwargs)
+
+        if not cp_active:
+            return result
+
+        scan_output, ssm_state = result
+        if ssm_state is None:
+            return result
+
+        h_prev = ring_shift_ssm_state(ssm_state)
+
+        # Signature: mamba_chunk_scan_combined(x, dt, A, B, C, ...)
+        # Extract from kwargs first, fall back to positional args.
+        dt_arg = kwargs.get("dt", args[1] if len(args) > 1 else None)
+        A_arg = kwargs.get("A", args[2] if len(args) > 2 else None)
+        C_arg = kwargs.get("C", args[4] if len(args) > 4 else None)
+        if dt_arg is None or A_arg is None or C_arg is None:
+            raise ValueError(
+                "wrap_mamba_scan_for_cp requires dt, A, C to be passed "
+                f"positionally (got {len(args)} positional args) or as kwargs."
+            )
+        dt_bias = kwargs.get("dt_bias")
+        dt_softplus = kwargs.get("dt_softplus", False)
+        seq_idx = kwargs.get("seq_idx")
+
+        if dt_softplus:
+            dt_eff = torch.nn.functional.softplus(
+                dt_arg + (dt_bias if dt_bias is not None else 0)
+            )
+        else:
+            dt_eff = dt_arg
+
+        dA = A_arg[None, None, :] * dt_eff
+        cum_A = torch.cumsum(dA, dim=1)
+
+        x = args[0]
+        num_heads = A_arg.shape[0]
+        head_dim = x.shape[3] if x.ndim == 4 else x.shape[2] // num_heads
+        B_dim, T_dim = x.shape[0], x.shape[1]
+
+        scan_flat = scan_output.view(B_dim, T_dim, -1)
+        scan_flat, ssm_state = mamba2_cp_correction(
+            scan_flat,
+            ssm_state,
+            C_arg,
+            cum_A,
+            h_prev,
+            num_heads=num_heads,
+            head_dim=head_dim,
+            seq_idx=seq_idx,
+        )
+        scan_output = scan_flat.view(scan_output.shape)
+
+        return scan_output, ssm_state
+
+    target_module.mamba_chunk_scan_combined = _cp_scan_wrapper
+    target_module._cp_scan_wrapped = True
diff --git a/src/axolotl/monkeypatch/models/nemotron_h/modeling.py b/src/axolotl/monkeypatch/models/nemotron_h/modeling.py
index a36c34259b..91d26fa0c5 100644
--- a/src/axolotl/monkeypatch/models/nemotron_h/modeling.py
+++ b/src/axolotl/monkeypatch/models/nemotron_h/modeling.py
@@ -1,28 +1,30 @@
-"""Sample-packing patch for NemotronH (Mamba2/Attention/MoE hybrid).
+"""Sample-packing and context-parallelism patch for NemotronH (Mamba2/Attention/MoE hybrid).
 
 Threads seq_idx (derived from position_ids) into the Mamba2 SSM kernels so
 packed-sequence boundaries reset SSM state. Upstream hard-codes seq_idx=None,
 which leaks hidden state across boundaries. Attention and MoE blocks need no
 changes — only the Mamba2 mixer is patched.
+
+CP correction (ring-shift of SSM state + additive output fix) is handled by
+``wrap_mamba_scan_for_cp`` from ``mamba_utils``, which wraps the
+``mamba_chunk_scan_combined`` call at the module level.
 """
 
 import importlib
 
 import torch
 
+from axolotl.monkeypatch.models.mamba_utils import (
+    ensure_mamba_kernels_loaded,
+    get_seq_idx,
+    is_cp_active,
+    wrap_mamba_scan_for_cp,
+)
 from axolotl.utils.logging import get_logger
 
 LOG = get_logger(__name__)
 
 
-def get_seq_idx(position_ids: torch.Tensor) -> torch.Tensor:
-    """Convert position_ids [B, T] → seq_idx [B, T] int32 for mamba-ssm kernels.
-
-    Example: position_ids [[0,1,2,3,0,1,2]] → seq_idx [[0,0,0,0,1,1,1]]
-    """
-    return (torch.cumsum((position_ids == 0).int(), dim=-1) - 1).to(torch.int32)
-
-
 def patch_nemotron_h_modeling_packing():
     """Patch NemotronH for sample packing: seq_idx threading into Mamba2 SSM kernels.
 
@@ -37,6 +39,8 @@ def patch_nemotron_h_modeling_packing():
         LOG.warning("nemotron_h not found in transformers, skipping packing patches")
         return
 
+    ensure_mamba_kernels_loaded(mod)
+
     NemotronHMamba2Mixer = mod.NemotronHMamba2Mixer
     NemotronHBlock = mod.NemotronHBlock
 
@@ -141,7 +145,7 @@ def patched_cuda_kernels_forward(
                 and self.training
                 and cache_params is None
                 and input_not_masked
-                and seq_idx is None
+                and not is_cp_active()
             ):
                 out, ssm_state = mod.mamba_split_conv1d_scan_combined(
                     projected_states,
@@ -212,12 +216,13 @@ def patched_cuda_kernels_forward(
                         dtype
                     )
 
+                C_reshaped = C.view(batch_size, seq_len, self.n_groups, -1)
                 scan_output, ssm_state = mod.mamba_chunk_scan_combined(
                     hidden_states.view(batch_size, seq_len, -1, self.head_dim),
                     time_step,
                     A,
                     B.view(batch_size, seq_len, self.n_groups, -1),
-                    C.view(batch_size, seq_len, self.n_groups, -1),
+                    C_reshaped,
                     chunk_size=self.chunk_size,
                     D=self.D,
                     z=None,
@@ -227,6 +232,7 @@ def patched_cuda_kernels_forward(
                     dt_softplus=True,
                     **dt_limit_kwargs,
                 )
+
                 if ssm_state is not None and cache_params is not None:
                     cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
                 scan_output = scan_output.view(batch_size, seq_len, -1)
@@ -312,4 +318,6 @@ def patched_block_forward(
 
     NemotronHBlock.forward = patched_block_forward
 
+    wrap_mamba_scan_for_cp(mod)
+
     LOG.info("Applied NemotronH sample packing patch (seq_idx threading into Mamba2)")
diff --git a/src/axolotl/monkeypatch/models/qwen3/__init__.py b/src/axolotl/monkeypatch/models/qwen3/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/axolotl/monkeypatch/models/qwen3/fused_attn.py b/src/axolotl/monkeypatch/models/qwen3/fused_attn.py
new file mode 100644
index 0000000000..670522797f
--- /dev/null
+++ b/src/axolotl/monkeypatch/models/qwen3/fused_attn.py
@@ -0,0 +1,117 @@
+"""Fuse ``q_norm/k_norm`` + RoPE in ``Qwen3Attention.forward`` via one Triton kernel."""
+
+from typing import Callable
+
+import torch
+
+from axolotl.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+def _resolve_norm_module(norm):
+    """Unwrap PEFT ``ModulesToSaveWrapper`` so the kernel reads the active adapter's weight."""
+    modules_to_save = getattr(norm, "modules_to_save", None)
+    if not modules_to_save:
+        return norm
+    adapters = getattr(norm, "active_adapters", None)
+    if adapters is None:
+        adapter = getattr(norm, "active_adapter", None)
+        adapters = [adapter] if adapter is not None else []
+    elif isinstance(adapters, str):
+        adapters = [adapters]
+    for name in adapters:
+        if isinstance(name, str) and name in modules_to_save:
+            return modules_to_save[name]
+    return getattr(norm, "original_module", norm)
+
+
+def _make_fused_forward():
+    from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope
+
+    def fused_forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: torch.Tensor | None,
+        past_key_values=None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+        from transformers.models.qwen3.modeling_qwen3 import eager_attention_forward
+
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        q_norm = _resolve_norm_module(self.q_norm)
+        k_norm = _resolve_norm_module(self.k_norm)
+        eps = q_norm.variance_epsilon
+
+        cos, sin = position_embeddings
+
+        has_lora_qkv = hasattr(self, "apply_qkv")
+        if has_lora_qkv:
+            query_states, key_states, value_states = self.apply_qkv(hidden_states)
+            query_states = query_states.view(hidden_shape)
+            key_states = key_states.view(hidden_shape)
+            value_states = value_states.view(hidden_shape)
+        else:
+            query_states = self.q_proj(hidden_states).view(hidden_shape)
+            key_states = self.k_proj(hidden_states).view(hidden_shape)
+            value_states = self.v_proj(hidden_states).view(hidden_shape)
+
+        # accelerate's per-module pre-hooks that move CPU-staged params don't fire on a monkeypatched forward.
+        q_w = q_norm.weight
+        if q_w.device != query_states.device:
+            q_w = q_w.to(query_states.device, non_blocking=True)
+        k_w = k_norm.weight
+        if k_w.device != key_states.device:
+            k_w = k_w.to(key_states.device, non_blocking=True)
+
+        query_states = fused_rms_norm_rope(
+            query_states, q_w, cos, sin, eps=eps
+        ).transpose(1, 2)
+        key_states = fused_rms_norm_rope(key_states, k_w, cos, sin, eps=eps).transpose(
+            1, 2
+        )
+        value_states = value_states.transpose(1, 2)
+
+        if past_key_values is not None:
+            key_states, value_states = past_key_values.update(
+                key_states, value_states, self.layer_idx
+            )
+
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            sliding_window=self.sliding_window,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        if hasattr(self, "apply_o"):
+            attn_output = self.apply_o(attn_output)
+        else:
+            attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+    return fused_forward
+
+
+def patch_qwen3_fused_attn() -> None:
+    from transformers.models.qwen3.modeling_qwen3 import Qwen3Attention
+
+    if getattr(Qwen3Attention, "_axolotl_fused_attn_patched", False):
+        return
+
+    Qwen3Attention.forward = _make_fused_forward()
+    Qwen3Attention._axolotl_fused_attn_patched = True
+    logger.info("Patched Qwen3Attention.forward with fused RMSNorm+RoPE Triton kernel")
diff --git a/src/axolotl/monkeypatch/models/qwen3_5/fused_attn.py b/src/axolotl/monkeypatch/models/qwen3_5/fused_attn.py
new file mode 100644
index 0000000000..ce83c22d2f
--- /dev/null
+++ b/src/axolotl/monkeypatch/models/qwen3_5/fused_attn.py
@@ -0,0 +1,133 @@
+"""Fused q_norm/k_norm + RoPE for Qwen3.5 (gated q_proj, ``unit_offset=True`` RMSNorm)."""
+
+from typing import Callable
+
+import torch
+
+from axolotl.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+def _resolve_norm_module(norm):
+    """Unwrap PEFT ``ModulesToSaveWrapper`` so the kernel reads the active adapter's weight."""
+    modules_to_save = getattr(norm, "modules_to_save", None)
+    if not modules_to_save:
+        return norm
+    adapters = getattr(norm, "active_adapters", None)
+    if adapters is None:
+        adapter = getattr(norm, "active_adapter", None)
+        adapters = [adapter] if adapter is not None else []
+    elif isinstance(adapters, str):
+        adapters = [adapters]
+    for name in adapters:
+        if isinstance(name, str) and name in modules_to_save:
+            return modules_to_save[name]
+    return getattr(norm, "original_module", norm)
+
+
+def _make_fused_forward():
+    from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope
+
+    def fused_forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: torch.Tensor | None,
+        past_key_values=None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+        from transformers.models.qwen3_5.modeling_qwen3_5 import (
+            eager_attention_forward,
+        )
+
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        q_norm = _resolve_norm_module(self.q_norm)
+        k_norm = _resolve_norm_module(self.k_norm)
+        # Liger's RMSNorm replacement uses ``variance_epsilon`` instead of ``eps``.
+        eps = getattr(q_norm, "eps", None)
+        if eps is None:
+            eps = q_norm.variance_epsilon
+
+        cos, sin = position_embeddings
+
+        has_lora_qkv = hasattr(self, "apply_qkv")
+        if has_lora_qkv:
+            query_states, key_states, value_states = self.apply_qkv(hidden_states)
+            query_states, gate = torch.chunk(
+                query_states.view(*input_shape, -1, self.head_dim * 2), 2, dim=-1
+            )
+            query_states = query_states.reshape(hidden_shape)
+            key_states = key_states.view(hidden_shape)
+            value_states = value_states.view(hidden_shape)
+        else:
+            query_states, gate = torch.chunk(
+                self.q_proj(hidden_states).view(*input_shape, -1, self.head_dim * 2),
+                2,
+                dim=-1,
+            )
+            query_states = query_states.reshape(hidden_shape)
+            key_states = self.k_proj(hidden_states).view(hidden_shape)
+            value_states = self.v_proj(hidden_states).view(hidden_shape)
+        gate = gate.reshape(*input_shape, -1)
+
+        # accelerate's per-module pre-hooks that move CPU-staged params don't fire on a monkeypatched forward.
+        q_w = q_norm.weight
+        if q_w.device != query_states.device:
+            q_w = q_w.to(query_states.device, non_blocking=True)
+        k_w = k_norm.weight
+        if k_w.device != key_states.device:
+            k_w = k_w.to(key_states.device, non_blocking=True)
+
+        query_states = fused_rms_norm_rope(
+            query_states, q_w, cos, sin, eps=eps, unit_offset=True
+        ).transpose(1, 2)
+        key_states = fused_rms_norm_rope(
+            key_states, k_w, cos, sin, eps=eps, unit_offset=True
+        ).transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        if past_key_values is not None:
+            key_states, value_states = past_key_values.update(
+                key_states, value_states, self.layer_idx
+            )
+
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = attn_output * torch.sigmoid(gate)
+        if hasattr(self, "apply_o"):
+            attn_output = self.apply_o(attn_output)
+        else:
+            attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+    return fused_forward
+
+
+def patch_qwen3_5_fused_attn() -> None:
+    from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5Attention
+
+    if getattr(Qwen3_5Attention, "_axolotl_fused_attn_patched", False):
+        return
+
+    Qwen3_5Attention.forward = _make_fused_forward()
+    Qwen3_5Attention._axolotl_fused_attn_patched = True
+    logger.info(
+        "Patched Qwen3_5Attention.forward with fused RMSNorm+RoPE Triton kernel"
+    )
diff --git a/src/axolotl/monkeypatch/models/qwen3_5_moe/__init__.py b/src/axolotl/monkeypatch/models/qwen3_5_moe/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/axolotl/monkeypatch/models/qwen3_5_moe/fused_attn.py b/src/axolotl/monkeypatch/models/qwen3_5_moe/fused_attn.py
new file mode 100644
index 0000000000..08a47ca65e
--- /dev/null
+++ b/src/axolotl/monkeypatch/models/qwen3_5_moe/fused_attn.py
@@ -0,0 +1,135 @@
+"""Qwen3.5-MoE variant of the qwen3_5 fused-attention monkeypatch."""
+
+from typing import Callable
+
+import torch
+
+from axolotl.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+def _resolve_norm_module(norm):
+    """Unwrap PEFT ``ModulesToSaveWrapper`` so the kernel reads the active adapter's weight."""
+    modules_to_save = getattr(norm, "modules_to_save", None)
+    if not modules_to_save:
+        return norm
+    adapters = getattr(norm, "active_adapters", None)
+    if adapters is None:
+        adapter = getattr(norm, "active_adapter", None)
+        adapters = [adapter] if adapter is not None else []
+    elif isinstance(adapters, str):
+        adapters = [adapters]
+    for name in adapters:
+        if isinstance(name, str) and name in modules_to_save:
+            return modules_to_save[name]
+    return getattr(norm, "original_module", norm)
+
+
+def _make_fused_forward():
+    from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope
+
+    def fused_forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: torch.Tensor | None,
+        past_key_values=None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+        from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import (
+            eager_attention_forward,
+        )
+
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        q_norm = _resolve_norm_module(self.q_norm)
+        k_norm = _resolve_norm_module(self.k_norm)
+        # Liger's RMSNorm replacement uses ``variance_epsilon`` instead of ``eps``.
+        eps = getattr(q_norm, "eps", None)
+        if eps is None:
+            eps = q_norm.variance_epsilon
+
+        cos, sin = position_embeddings
+
+        has_lora_qkv = hasattr(self, "apply_qkv")
+        if has_lora_qkv:
+            query_states, key_states, value_states = self.apply_qkv(hidden_states)
+            query_states, gate = torch.chunk(
+                query_states.view(*input_shape, -1, self.head_dim * 2), 2, dim=-1
+            )
+            query_states = query_states.reshape(hidden_shape)
+            key_states = key_states.view(hidden_shape)
+            value_states = value_states.view(hidden_shape)
+        else:
+            query_states, gate = torch.chunk(
+                self.q_proj(hidden_states).view(*input_shape, -1, self.head_dim * 2),
+                2,
+                dim=-1,
+            )
+            query_states = query_states.reshape(hidden_shape)
+            key_states = self.k_proj(hidden_states).view(hidden_shape)
+            value_states = self.v_proj(hidden_states).view(hidden_shape)
+        gate = gate.reshape(*input_shape, -1)
+
+        # accelerate's per-module pre-hooks that move CPU-staged params don't fire on a monkeypatched forward.
+        q_w = q_norm.weight
+        if q_w.device != query_states.device:
+            q_w = q_w.to(query_states.device, non_blocking=True)
+        k_w = k_norm.weight
+        if k_w.device != key_states.device:
+            k_w = k_w.to(key_states.device, non_blocking=True)
+
+        query_states = fused_rms_norm_rope(
+            query_states, q_w, cos, sin, eps=eps, unit_offset=True
+        ).transpose(1, 2)
+        key_states = fused_rms_norm_rope(
+            key_states, k_w, cos, sin, eps=eps, unit_offset=True
+        ).transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        if past_key_values is not None:
+            key_states, value_states = past_key_values.update(
+                key_states, value_states, self.layer_idx
+            )
+
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = attn_output * torch.sigmoid(gate)
+        if hasattr(self, "apply_o"):
+            attn_output = self.apply_o(attn_output)
+        else:
+            attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+    return fused_forward
+
+
+def patch_qwen3_5_moe_fused_attn() -> None:
+    from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import (
+        Qwen3_5MoeAttention,
+    )
+
+    if getattr(Qwen3_5MoeAttention, "_axolotl_fused_attn_patched", False):
+        return
+
+    Qwen3_5MoeAttention.forward = _make_fused_forward()
+    Qwen3_5MoeAttention._axolotl_fused_attn_patched = True
+    logger.info(
+        "Patched Qwen3_5MoeAttention.forward with fused RMSNorm+RoPE Triton kernel"
+    )
diff --git a/src/axolotl/monkeypatch/models/qwen3_moe/__init__.py b/src/axolotl/monkeypatch/models/qwen3_moe/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/axolotl/monkeypatch/models/qwen3_moe/fused_attn.py b/src/axolotl/monkeypatch/models/qwen3_moe/fused_attn.py
new file mode 100644
index 0000000000..4f5b951ca5
--- /dev/null
+++ b/src/axolotl/monkeypatch/models/qwen3_moe/fused_attn.py
@@ -0,0 +1,121 @@
+"""Qwen3-MoE variant of the qwen3 fused-attention monkeypatch."""
+
+from typing import Callable
+
+import torch
+
+from axolotl.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+def _resolve_norm_module(norm):
+    """Unwrap PEFT ``ModulesToSaveWrapper`` so the kernel reads the active adapter's weight."""
+    modules_to_save = getattr(norm, "modules_to_save", None)
+    if not modules_to_save:
+        return norm
+    adapters = getattr(norm, "active_adapters", None)
+    if adapters is None:
+        adapter = getattr(norm, "active_adapter", None)
+        adapters = [adapter] if adapter is not None else []
+    elif isinstance(adapters, str):
+        adapters = [adapters]
+    for name in adapters:
+        if isinstance(name, str) and name in modules_to_save:
+            return modules_to_save[name]
+    return getattr(norm, "original_module", norm)
+
+
+def _make_fused_forward():
+    from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope
+
+    def fused_forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: torch.Tensor | None,
+        past_key_values=None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+        from transformers.models.qwen3_moe.modeling_qwen3_moe import (
+            eager_attention_forward,
+        )
+
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        q_norm = _resolve_norm_module(self.q_norm)
+        k_norm = _resolve_norm_module(self.k_norm)
+        eps = q_norm.variance_epsilon
+
+        cos, sin = position_embeddings
+
+        has_lora_qkv = hasattr(self, "apply_qkv")
+        if has_lora_qkv:
+            query_states, key_states, value_states = self.apply_qkv(hidden_states)
+            query_states = query_states.view(hidden_shape)
+            key_states = key_states.view(hidden_shape)
+            value_states = value_states.view(hidden_shape)
+        else:
+            query_states = self.q_proj(hidden_states).view(hidden_shape)
+            key_states = self.k_proj(hidden_states).view(hidden_shape)
+            value_states = self.v_proj(hidden_states).view(hidden_shape)
+
+        # accelerate's per-module pre-hooks that move CPU-staged params don't fire on a monkeypatched forward.
+        q_w = q_norm.weight
+        if q_w.device != query_states.device:
+            q_w = q_w.to(query_states.device, non_blocking=True)
+        k_w = k_norm.weight
+        if k_w.device != key_states.device:
+            k_w = k_w.to(key_states.device, non_blocking=True)
+
+        query_states = fused_rms_norm_rope(
+            query_states, q_w, cos, sin, eps=eps
+        ).transpose(1, 2)
+        key_states = fused_rms_norm_rope(key_states, k_w, cos, sin, eps=eps).transpose(
+            1, 2
+        )
+        value_states = value_states.transpose(1, 2)
+
+        if past_key_values is not None:
+            key_states, value_states = past_key_values.update(
+                key_states, value_states, self.layer_idx
+            )
+
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            sliding_window=self.sliding_window,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        if hasattr(self, "apply_o"):
+            attn_output = self.apply_o(attn_output)
+        else:
+            attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+    return fused_forward
+
+
+def patch_qwen3_moe_fused_attn() -> None:
+    from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeAttention
+
+    if getattr(Qwen3MoeAttention, "_axolotl_fused_attn_patched", False):
+        return
+
+    Qwen3MoeAttention.forward = _make_fused_forward()
+    Qwen3MoeAttention._axolotl_fused_attn_patched = True
+    logger.info(
+        "Patched Qwen3MoeAttention.forward with fused RMSNorm+RoPE Triton kernel"
+    )
diff --git a/src/axolotl/monkeypatch/models/qwen3_vl/__init__.py b/src/axolotl/monkeypatch/models/qwen3_vl/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/axolotl/monkeypatch/models/qwen3_vl/fused_attn.py b/src/axolotl/monkeypatch/models/qwen3_vl/fused_attn.py
new file mode 100644
index 0000000000..68da52bd5f
--- /dev/null
+++ b/src/axolotl/monkeypatch/models/qwen3_vl/fused_attn.py
@@ -0,0 +1,121 @@
+# Why: fuse Qwen3-VL q_norm/k_norm + mRoPE into one Triton kernel.
+
+from typing import Callable
+
+import torch
+
+from axolotl.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+def _resolve_norm_module(norm):
+    # Why: ModulesToSaveWrapper stores trainable norm weights per active adapter.
+    modules_to_save = getattr(norm, "modules_to_save", None)
+    if not modules_to_save:
+        return norm
+    adapters = getattr(norm, "active_adapters", None)
+    if adapters is None:
+        adapter = getattr(norm, "active_adapter", None)
+        adapters = [adapter] if adapter is not None else []
+    elif isinstance(adapters, str):
+        adapters = [adapters]
+    for name in adapters:
+        if isinstance(name, str) and name in modules_to_save:
+            return modules_to_save[name]
+    return getattr(norm, "original_module", norm)
+
+
+def _make_fused_forward():
+    from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope
+
+    def fused_forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: torch.Tensor | None,
+        past_key_values=None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+        from transformers.models.qwen3_vl.modeling_qwen3_vl import (
+            eager_attention_forward,
+        )
+
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        q_norm = _resolve_norm_module(self.q_norm)
+        k_norm = _resolve_norm_module(self.k_norm)
+        eps = getattr(q_norm, "eps", None)
+        if eps is None:
+            eps = q_norm.variance_epsilon
+
+        cos, sin = position_embeddings
+
+        if hasattr(self, "apply_qkv"):
+            query_states, key_states, value_states = self.apply_qkv(hidden_states)
+            query_states = query_states.view(hidden_shape)
+            key_states = key_states.view(hidden_shape)
+            value_states = value_states.view(hidden_shape)
+        else:
+            query_states = self.q_proj(hidden_states).view(hidden_shape)
+            key_states = self.k_proj(hidden_states).view(hidden_shape)
+            value_states = self.v_proj(hidden_states).view(hidden_shape)
+
+        # accelerate's per-module pre-hooks that move CPU-staged params don't fire on a monkeypatched forward.
+        q_w = q_norm.weight
+        if q_w.device != query_states.device:
+            q_w = q_w.to(query_states.device, non_blocking=True)
+        k_w = k_norm.weight
+        if k_w.device != key_states.device:
+            k_w = k_w.to(key_states.device, non_blocking=True)
+
+        query_states = fused_rms_norm_rope(
+            query_states, q_w, cos, sin, eps=eps
+        ).transpose(1, 2)
+        key_states = fused_rms_norm_rope(key_states, k_w, cos, sin, eps=eps).transpose(
+            1, 2
+        )
+        value_states = value_states.transpose(1, 2)
+
+        if past_key_values is not None:
+            key_states, value_states = past_key_values.update(
+                key_states, value_states, self.layer_idx
+            )
+
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        if hasattr(self, "apply_o"):
+            attn_output = self.apply_o(attn_output)
+        else:
+            attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+    return fused_forward
+
+
+def patch_qwen3_vl_fused_attn() -> None:
+    from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLTextAttention
+
+    if getattr(Qwen3VLTextAttention, "_axolotl_fused_attn_patched", False):
+        return
+
+    Qwen3VLTextAttention.forward = _make_fused_forward()
+    Qwen3VLTextAttention._axolotl_fused_attn_patched = True
+    logger.info(
+        "Patched Qwen3VLTextAttention.forward with fused RMSNorm+mRoPE Triton kernel"
+    )
diff --git a/src/axolotl/monkeypatch/multipack.py b/src/axolotl/monkeypatch/multipack.py
index 9e2157ef47..03c70ee3dd 100644
--- a/src/axolotl/monkeypatch/multipack.py
+++ b/src/axolotl/monkeypatch/multipack.py
@@ -63,6 +63,7 @@
     "afmoe",
     "nemotron",
     "nemotron_h",
+    "falcon_h1",
 ]
 
 
diff --git a/src/axolotl/monkeypatch/tiled_mlp/__init__.py b/src/axolotl/monkeypatch/tiled_mlp/__init__.py
index 4ea1549915..cb9aa6145a 100644
--- a/src/axolotl/monkeypatch/tiled_mlp/__init__.py
+++ b/src/axolotl/monkeypatch/tiled_mlp/__init__.py
@@ -4,8 +4,10 @@
 
 from .patch import (
     patch_tiled_mlp,
+    patch_tiled_mlp_moe_instances,
 )
 
 __all__ = [
     "patch_tiled_mlp",
+    "patch_tiled_mlp_moe_instances",
 ]
diff --git a/src/axolotl/monkeypatch/tiled_mlp/base.py b/src/axolotl/monkeypatch/tiled_mlp/base.py
index 2c9dc8e4c5..ab4f93ab53 100644
--- a/src/axolotl/monkeypatch/tiled_mlp/base.py
+++ b/src/axolotl/monkeypatch/tiled_mlp/base.py
@@ -2,11 +2,127 @@
 TiledMLP support for DDP, FSDP, and single GPU
 """
 
+import contextlib
+import os
 import threading
 from typing import List
 
 import torch
 
+# Opt-in fp32 accumulation for the tiled backward. The default accumulates
+# at the param's own dtype, which matches what AccumulateGrad does in the
+# unsharded backward and avoids materialising an fp32 buffer the size of
+# every compute param. Set ``AXOLOTL_TILED_MLP_ACCUM_FP32=1`` to recover
+# the previous fp32-accumulator behaviour when bf16 precision is the
+# concern (e.g. very large N-shard sums where bf16 round-off accumulates).
+_TILED_MLP_ACCUM_FP32 = os.environ.get("AXOLOTL_TILED_MLP_ACCUM_FP32", "0") == "1"
+
+
+def _find_fsdp2_module(module):
+    """Return the nearest FSDP2 :class:`FSDPModule` that owns ``module``.
+
+    FSDP2 (``torch.distributed.fsdp.fully_shard``) registers per-module
+    post-backward hooks that reshard parameters once their gradients have
+    been produced. Inside :class:`TiledMLP.backward` we run several inner
+    backwards over shards of the same input; if the wrapping FSDPModule
+    reshards between iterations, the unsharded params are gone and the
+    next tile recomputes against bogus shards. We have to disable reshard
+    on the wrapping FSDPModule for the duration of the loop.
+
+    The MLP itself is rarely the directly-wrapped module — production
+    setups apply ``fully_shard`` at the decoder-layer level. Walk the
+    global FSDP module-state registry to find the nearest ancestor whose
+    parameter group contains us. Result is cached on the module so we pay
+    the lookup once.
+
+    Returns ``None`` if FSDP2 is not in use, or no wrapping FSDPModule
+    contains ``module`` as a descendant.
+    """
+    cached = getattr(module, "_axolotl_fsdp2_owner", "__unset__")
+    if cached != "__unset__":
+        return cached
+
+    try:
+        from torch.distributed._composable_state import _module_state_mapping
+        from torch.distributed.fsdp import FSDPModule
+    except ImportError:
+        module._axolotl_fsdp2_owner = None
+        return None
+
+    # MLP itself wrapped (covers the regression-guard unit test).
+    if isinstance(module, FSDPModule):
+        module._axolotl_fsdp2_owner = module
+        return module
+
+    # Walk the global FSDP registry looking for ancestors. The registry is
+    # a WeakKeyDictionary so the snapshot is cheap and bounded by the
+    # number of FSDP-wrapped modules in the process.
+    target_id = id(module)
+    candidates = []
+    for owner in list(_module_state_mapping.keys()):
+        if not isinstance(owner, FSDPModule):
+            continue
+        if owner is module:
+            continue
+        for sub in owner.modules():
+            if id(sub) == target_id:
+                candidates.append(owner)
+                break
+
+    if not candidates:
+        result = None
+    elif len(candidates) == 1:
+        result = candidates[0]
+    else:
+        # When multiple FSDPModules are ancestors (e.g. fully_shard applied
+        # to both decoder layer and the root), pick the deepest one — its
+        # subtree is smallest. Counting modules is O(N) per candidate but
+        # only runs once per MLP instance.
+        result = min(candidates, key=lambda m: sum(1 for _ in m.modules()))
+
+    module._axolotl_fsdp2_owner = result
+    return result
+
+
+@contextlib.contextmanager
+def _defer_fsdp2_reshard(module):
+    """Suspend FSDP2's post-backward reshard on the wrapping FSDPModule.
+
+    The tiled backward calls :func:`torch.autograd.backward` once per shard.
+    Each inner backward triggers FSDP2's per-module post-backward hooks,
+    which would reshard parameters mid-loop. We pause that by toggling
+    ``set_reshard_after_backward(False)`` on the wrapping FSDPModule, run
+    the loop, restore the original setting, then issue a single explicit
+    ``reshard()`` so the post-loop state matches normal FSDP2 semantics.
+
+    No-op when ``module`` is not under FSDP2.
+    """
+    fsdp_mod = _find_fsdp2_module(module)
+    if fsdp_mod is None:
+        yield
+        return
+
+    # No public getter for ``reshard_after_backward`` in PyTorch 2.11;
+    # read off the param group directly. The internal accessor surface
+    # is documented in
+    # ``torch.distributed.fsdp._fully_shard._fsdp_param_group.FSDPParamGroup``.
+    state = fsdp_mod._get_fsdp_state()
+    param_group = state._fsdp_param_group
+    if param_group is None:
+        # Nothing to defer (e.g. ignored module with no FSDP-managed params).
+        yield
+        return
+
+    prev = param_group.reshard_after_backward
+    fsdp_mod.set_reshard_after_backward(False, recurse=False)
+    try:
+        yield
+    finally:
+        # Restore so subsequent backward passes outside the tile loop
+        # behave normally, then issue the deferred reshard once.
+        fsdp_mod.set_reshard_after_backward(prev, recurse=False)
+        fsdp_mod.reshard()
+
 
 class DeepSpeedTiledMLPMoE(torch.autograd.Function):
     @staticmethod
@@ -151,47 +267,99 @@ def backward(ctx, *grads) -> torch.Tensor:
         x_grad = torch.zeros_like(x)
         x_shards = list(torch.chunk(x, chunks=shards, dim=1))
 
-        # Create a gradient accumulator for parameters
-        grad_accumulator = GradientAccumulator(compute_params, shards, dtype=x.dtype)
+        # Snapshot existing ``.grad`` for each param and zero it; we will
+        # accumulate the per-shard contributions into a per-param buffer
+        # and write back at the end. The previous implementation used
+        # ``param.register_hook`` per shard, which (a) re-installed hooks
+        # every iteration so the N-th shard ran N stacked hooks and
+        # double-counted contributions, and (b) scaled by ``1/N`` even
+        # though sequence-dim sharding makes per-shard grads additive,
+        # not averaged. The combined effect was a gradient roughly
+        # 2x-2.5x the analytical value. Direct inline accumulation is
+        # both simpler and correct, and avoids interactions with FSDP2's
+        # own backward hooks.
+        #
+        # The accumulator defaults to the param's own dtype to match
+        # what AccumulateGrad would do in the unsharded backward. The
+        # earlier implementation accumulated in fp32, which doubled the
+        # parameter-side memory footprint in bf16 MoE training where the
+        # accumulator's ``[E, hidden, 2*intermediate]`` shape dominates.
+        # Set ``AXOLOTL_TILED_MLP_ACCUM_FP32=1`` to opt back into fp32
+        # accumulation when bf16 round-off is the concern.
+        prev_grads = {}
+        accum_grads = {}
+        for p in compute_params:
+            prev_grads[p] = p.grad
+            accum_dtype = torch.float32 if _TILED_MLP_ACCUM_FP32 else p.dtype
+            accum_grads[p] = torch.zeros_like(p, dtype=accum_dtype)
+            p.grad = None
 
         shard_step = x_shards[0].numel()
-        for i, x_shard in enumerate(x_shards):
-            x_shard.requires_grad_(x_requires_grad)
-
-            shard_offset = i * shard_step
-            x_shard.grad = (
-                x_grad.view(-1)
-                .narrow(0, shard_offset, x_shard.numel())
-                .view_as(x_shard)
-            )
-            incoming_grad_shard = (
-                incoming_grad.view(-1)
-                .narrow(0, shard_offset, x_shard.numel())
-                .view_as(x_shard)
-            )
-
-            # Install hooks for this shard
-            is_last_shard = i + 1 == shards
-            grad_accumulator.install_hooks(is_last_shard)
+        # Suspend FSDP2 post-backward reshard for the duration of the loop.
+        # Without this, the first inner backward triggers FSDP2's reshard
+        # hook on the wrapping FSDPModule and subsequent shards recompute
+        # against only-local DTensor shards — silent grad corruption.
+        # Single-GPU and DDP paths fall through to a no-op context manager.
+        with _defer_fsdp2_reshard(self):
+            for i, x_shard in enumerate(x_shards):
+                x_shard.requires_grad_(x_requires_grad)
+
+                shard_offset = i * shard_step
+                x_shard.grad = (
+                    x_grad.view(-1)
+                    .narrow(0, shard_offset, x_shard.numel())
+                    .view_as(x_shard)
+                )
+                incoming_grad_shard = (
+                    incoming_grad.view(-1)
+                    .narrow(0, shard_offset, x_shard.numel())
+                    .view_as(x_shard)
+                )
 
-            with torch.enable_grad():
-                output = fn(self, x_shard)
-            if is_tuple_output:
-                torch.autograd.backward(output[0], incoming_grad_shard)
+                with torch.enable_grad():
+                    output = fn(self, x_shard)
+                if is_tuple_output:
+                    torch.autograd.backward(output[0], incoming_grad_shard)
+                else:
+                    torch.autograd.backward(output, incoming_grad_shard)
+
+                # Capture this shard's contribution into the per-param
+                # accumulator and clear ``.grad`` so the next shard starts
+                # from zero. Skip the dtype cast when the accumulator
+                # matches the param dtype (the default) — that cast was
+                # the per-shard HBM-bandwidth tax on the bf16 path.
+                for p in compute_params:
+                    if p.grad is not None:
+                        shard_grad = p.grad.detach()
+                        if shard_grad.dtype != accum_grads[p].dtype:
+                            shard_grad = shard_grad.to(accum_grads[p].dtype)
+                        accum_grads[p].add_(shard_grad)
+                        p.grad = None
+
+        # Restore prior grad value (if any) and add the tiled contribution.
+        for p in compute_params:
+            tiled_contrib = accum_grads[p]
+            if tiled_contrib.dtype != p.dtype:
+                tiled_contrib = tiled_contrib.to(p.dtype)
+            if prev_grads[p] is None:
+                p.grad = tiled_contrib
             else:
-                torch.autograd.backward(output, incoming_grad_shard)
-
-        # Clean up hooks
-        grad_accumulator.cleanup()
-        del grad_accumulator
+                p.grad = prev_grads[p] + tiled_contrib
 
         return (None, None, x_grad, None, None)
 
 
 class GradientAccumulator:
     """
-    Manual gradient accumulator for TiledMLP with configurable precision
-    Accumulates in specified dtype and rescales the gradient at the end
+    Manual gradient accumulator for TiledMLP with configurable precision.
+
+    .. note::
+        The production TiledMLP backward (above) accumulates inline and
+        does not call this class — it is retained as a reference / opt-in
+        path for callers that want hook-based accumulation. The defaults
+        below match the inline path: param-dtype accumulator (matches
+        ``AccumulateGrad`` in the unsharded backward) and ``1.0`` per-shard
+        scaling (sequence-dim sharded grads are additive, not averaged).
     """
 
     def __init__(
@@ -202,11 +370,24 @@ def __init__(
     ):
         self.params = params
         self.total_shards = total_shards
-        self.grad_accumulation_dtype = dtype or torch.float32
+        # Default to the param's own dtype to avoid the 2x parameter-side
+        # memory regression in bf16 MoE training where the accumulator
+        # shape ``[E, hidden, 2*intermediate]`` dominates. fp32 accumulation
+        # is opt-in via the ``dtype`` arg.
+        if dtype is not None:
+            self.grad_accumulation_dtype = dtype
+        elif params:
+            self.grad_accumulation_dtype = params[0].dtype
+        else:
+            self.grad_accumulation_dtype = torch.float32
         self.accumulated_grads = {}
         self.hooks = []
         self.lock = threading.Lock()
-        self.gradient_scale = 1.0 / total_shards
+        # Sequence-dim shards partition the per-token sum; their
+        # contributions are additive (``sum_t dL_t/dW``), not averaged.
+        # The previous ``1/total_shards`` scaling produced a mean and was
+        # a correctness bug for this sharding semantics.
+        self.gradient_scale = 1.0
 
         # Initialize accumulated gradients in the specified dtype
         for param in self.params:
@@ -226,17 +407,33 @@ def install_hooks(self, is_last_shard: bool):
         def create_hook(param):
             def hook(grad):
                 with self.lock:
-                    grad_to_accum_dtype = grad.to(self.grad_accumulation_dtype)
-                    scaled_grad = grad_to_accum_dtype * self.gradient_scale
+                    # Skip the dtype cast when the accumulator already
+                    # matches the grad dtype (the default after the
+                    # param-dtype change above) — the redundant cast was
+                    # the per-shard HBM bandwidth tax called out in the
+                    # tiled-MLP regression analysis.
+                    if grad.dtype == self.grad_accumulation_dtype:
+                        scaled_grad = (
+                            grad
+                            if self.gradient_scale == 1.0
+                            else grad * self.gradient_scale
+                        )
+                    else:
+                        scaled_grad = (
+                            grad.to(self.grad_accumulation_dtype) * self.gradient_scale
+                        )
 
                     if param in self.accumulated_grads:
                         self.accumulated_grads[param] += scaled_grad
                     else:
                         self.accumulated_grads[param] = scaled_grad.clone()
 
-                    # Only assign the averaged gradient on the last shard
+                    # Only assign the accumulated gradient on the last shard
                     if is_last_shard:
-                        param.grad = self.accumulated_grads[param].to(param.dtype)
+                        if self.accumulated_grads[param].dtype != param.dtype:
+                            param.grad = self.accumulated_grads[param].to(param.dtype)
+                        else:
+                            param.grad = self.accumulated_grads[param]
                         return param.grad
                     return None
 
diff --git a/src/axolotl/monkeypatch/tiled_mlp/patch.py b/src/axolotl/monkeypatch/tiled_mlp/patch.py
index 23f48a1016..6842ba7980 100644
--- a/src/axolotl/monkeypatch/tiled_mlp/patch.py
+++ b/src/axolotl/monkeypatch/tiled_mlp/patch.py
@@ -11,90 +11,270 @@
 
 LOG = get_logger(__name__)
 
+# Suffixes used to discover MoE block classes inside
+# ``transformers.models.{model_type}.modeling_{model_type}``.
+# Order matters — preferred names come first.
+_MOE_BLOCK_SUFFIXES = ("SparseMoeBlock", "MoeMLP", "MoE")
 
-def patch_tiled_mlp(model_type, use_original_mlp=True, cfg_num_shards=None):
+
+def _resolve_moe_block_cls(module, model_cls_prefix):
+    """Return the MoE block class for the model module, or ``None`` if dense."""
+    for suffix in _MOE_BLOCK_SUFFIXES:
+        cls = getattr(module, f"{model_cls_prefix}{suffix}", None)
+        if cls is not None:
+            return cls
+    return None
+
+
+def _build_tiled_forward(
+    inner_forward,
+    model_type,
+    cfg_num_shards,
+    is_moe_block,
+):
+    """Construct a ``tiled_mlp_forward`` closure.
+
+    The returned forward shards inputs along the sequence dim and dispatches
+    to the correct :class:`torch.autograd.Function` implementation based on
+    the parallel-training backend in use.
+
+    ``inner_forward`` is the un-tiled forward (either the dense MLP forward
+    or the MoE block's routing+expert forward — possibly a kernels-substituted
+    forward in the scattermoe-lora case).
+    """
     from deepspeed.runtime.sequence_parallel.ulysses_sp import (
         TiledMLP as DeepSpeedTiledMLP,
     )
 
     from axolotl.monkeypatch.tiled_mlp.base import DeepSpeedTiledMLPMoE, TiledMLP
 
+    is_distributed = int(os.environ.get("WORLD_SIZE", 1)) > 1
+
+    def tiled_mlp_forward(self, x):
+        input_shape = x.shape
+        seqlen = input_shape[-2]
+        if cfg_num_shards is None:
+            # Target ~32K tokens per shard. The previous `ceil(seq / hidden)`
+            # heuristic produced only ~2K tokens/shard at long context, well
+            # below the MoE kernel's BLOCK_M sweet spot. An empirical sweep at
+            # seq ∈ {64K, 128K, 256K, 512K} showed 3.2× speed-up at 64–256K
+            # and 2.1× at 512K from raising per-shard tokens to ~32K, with
+            # only a modest peak-mem cost (~5–10 GiB extra at seq=256K)
+            # because the routed intermediate buffer dominates and scales
+            # linearly with per-shard tokens. Operators can override via
+            # cfg_num_shards for niche cases (smaller intermediate, larger
+            # top_k) where the default is wrong.
+            target_tokens_per_shard = 32768
+            num_shards = max(1, math.ceil(seqlen / target_tokens_per_shard))
+            if is_distributed:
+                num_shards_tensor = torch.tensor(num_shards, device=x.device)
+                dist.all_reduce(num_shards_tensor, op=dist.ReduceOp.MAX)
+                num_shards = num_shards_tensor.item()
+        else:
+            num_shards = cfg_num_shards
+
+        if not self._compute_params:
+            self._compute_params = [p for p in self.parameters() if p.requires_grad]
+
+        compute_params = self._compute_params
+        if not self._tiled_mlp_dist_impl:
+            uses_deepspeed = (
+                self._compute_params
+                and any(
+                    hasattr(p, "ds_id") or hasattr(p, "param_idx_in_group")
+                    for p in self._compute_params
+                )
+            ) or os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "true"
+
+            if uses_deepspeed:
+                # gpt_oss already used the MoE variant before this refactor;
+                # extend the same treatment to every MoE block, since they
+                # tend to return tuple outputs (hidden_states, router_logits)
+                # the way gpt_oss does.
+                if model_type == "gpt_oss" or is_moe_block:
+                    self._tiled_mlp_dist_impl = DeepSpeedTiledMLPMoE
+                else:
+                    self._tiled_mlp_dist_impl = DeepSpeedTiledMLP
+            else:
+                self._tiled_mlp_dist_impl = TiledMLP
+
+        return self._tiled_mlp_dist_impl.apply(
+            inner_forward,
+            self,
+            x,
+            num_shards,
+            compute_params,
+        )
+
+    return tiled_mlp_forward
+
+
+def _prepare_target_class(target_cls):
+    """Initialize the bookkeeping attrs the tiled forward expects."""
+    target_cls._compute_params = []
+    target_cls._tiled_mlp_dist_impl = None
+
+
+def patch_tiled_mlp(
+    model_type,
+    use_original_mlp=True,
+    cfg_num_shards=None,
+    use_scattermoe=False,
+):
+    """Install the class-level tiled MLP patch.
+
+    For dense models this patches ``{prefix}MLP`` (falling back to
+    ``{prefix}TextMLP`` for multimodal wrappers).
+
+    For MoE models with scattermoe-lora active, the MoE block class
+    (``{prefix}SparseMoeBlock`` / ``{prefix}MoeMLP`` / ``{prefix}MoE``) is the
+    one whose forward does routing + expert invocation, so we patch that.
+    Note that the ``kernels`` library installs scattermoe-lora's forward at
+    the *instance* level during ``model.kernelize()``, so the class-level
+    patch is shadowed at runtime. :func:`patch_tiled_mlp_moe_instances` is
+    the companion post-model-load step that re-wraps each MoE block instance
+    so the tiled forward runs on top of the kernels-installed forward.
+    """
+    module_path = f"transformers.models.{model_type}.modeling_{model_type}"
+    model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type)
     try:
-        # Dynamically import the module and MLP class
-        module_path = f"transformers.models.{model_type}.modeling_{model_type}"
-        model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type)
         module = __import__(module_path, fromlist=[f"{model_cls_prefix}MLP"])
-        # Some multimodal wrappers (e.g. Gemma 4) name the MLP class
-        # ``{prefix}TextMLP`` rather than ``{prefix}MLP`` because the
-        # language-side module is separated from the vision tower. Try
-        # both names before giving up.
+    except ImportError as e:
+        raise RuntimeError(
+            f"Could not import MLP class for model_type: {model_type}. Error: {str(e)}"
+        ) from e
+
+    # MoE block patch path: only walk into this branch when scattermoe-lora
+    # is active. For non-scattermoe MoE models the dense MLP fallback applies
+    # — we do not auto-enable MoE-block tiling because each model family's
+    # block forward has different output-tuple semantics.
+    moe_block_cls = (
+        _resolve_moe_block_cls(module, model_cls_prefix) if use_scattermoe else None
+    )
+
+    if moe_block_cls is not None:
+        original_forward = moe_block_cls.forward
+        tiled_forward = _build_tiled_forward(
+            inner_forward=original_forward,
+            model_type=model_type,
+            cfg_num_shards=cfg_num_shards,
+            is_moe_block=True,
+        )
+        moe_block_cls.forward = tiled_forward
+        _prepare_target_class(moe_block_cls)
+        LOG.info(
+            "Successfully monkey-patched TiledMLP for model_type: "
+            f"{model_type} (MoE block: {moe_block_cls.__name__})"
+        )
+        return
+
+    # Dense MLP path (existing behavior).
+    try:
         mlp_cls = getattr(
             module,
             f"{model_cls_prefix}MLP",
             None,
         ) or getattr(module, f"{model_cls_prefix}TextMLP")
+    except AttributeError as e:
+        raise RuntimeError(
+            f"Could not import MLP class for model_type: {model_type}. Error: {str(e)}"
+        ) from e
 
-        if use_original_mlp:
-            mlp_forward = mlp_cls.forward
-        else:
+    if use_original_mlp:
+        mlp_forward = mlp_cls.forward
+    else:
 
-            def generic_mlp_forward(self_, hs):
-                return self_.down_proj(
-                    self_.act_fn(self_.gate_proj(hs)) * self_.up_proj(hs)
-                )
+        def generic_mlp_forward(self_, hs):
+            return self_.down_proj(
+                self_.act_fn(self_.gate_proj(hs)) * self_.up_proj(hs)
+            )
 
-            mlp_forward = torch.compile(generic_mlp_forward)
+        mlp_forward = torch.compile(generic_mlp_forward)
 
-        is_distributed = int(os.environ.get("WORLD_SIZE", 1)) > 1
+    tiled_forward = _build_tiled_forward(
+        inner_forward=mlp_forward,
+        model_type=model_type,
+        cfg_num_shards=cfg_num_shards,
+        is_moe_block=False,
+    )
+    mlp_cls.forward = tiled_forward
+    _prepare_target_class(mlp_cls)
+    LOG.info(f"Successfully monkey-patched TiledMLP for model_type: {model_type}")
 
-        def tiled_mlp_forward(self, x):
-            input_shape = x.shape
-            seqlen = input_shape[-2]
-            hidden = input_shape[-1]
-            if cfg_num_shards is None:
-                num_shards = math.ceil(seqlen / hidden)
-                if is_distributed:
-                    num_shards_tensor = torch.tensor(num_shards, device=x.device)
-                    dist.all_reduce(num_shards_tensor, op=dist.ReduceOp.MAX)
-                    num_shards = num_shards_tensor.item()
-            else:
-                num_shards = cfg_num_shards
-
-            if not self._compute_params:
-                self._compute_params = [p for p in self.parameters() if p.requires_grad]
-
-            compute_params = self._compute_params
-            if not self._tiled_mlp_dist_impl:
-                if (
-                    self._compute_params
-                    and any(
-                        hasattr(p, "ds_id") or hasattr(p, "param_idx_in_group")
-                        for p in self._compute_params
-                    )
-                ) or os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "true":
-                    if model_type == "gpt_oss":
-                        self._tiled_mlp_dist_impl = DeepSpeedTiledMLPMoE
-                    else:
-                        self._tiled_mlp_dist_impl = DeepSpeedTiledMLP
-                else:
-                    self._tiled_mlp_dist_impl = TiledMLP
-
-            down_res = self._tiled_mlp_dist_impl.apply(
-                mlp_forward,
-                self,
-                x,
-                num_shards,
-                compute_params,
-            )
-            return down_res
 
-        mlp_cls.forward = tiled_mlp_forward
-        mlp_cls._compute_params = []
-        mlp_cls._tiled_mlp_dist_impl = None
+def patch_tiled_mlp_moe_instances(
+    model,
+    model_type,
+    cfg_num_shards=None,
+):
+    """Re-wrap each MoE block instance's ``forward`` after model load.
+
+    The ``kernels`` library installs scattermoe-lora's forward on each MoE
+    block *instance* during ``model.kernelize()`` (called inside
+    ``from_pretrained``). That instance-level binding shadows the class-level
+    patch :func:`patch_tiled_mlp` installs, so without this step tiling is
+    silently bypassed on every block. We capture each instance's current
+    forward (the kernels-installed one) and rebind the instance to a tiled
+    forward that delegates to it.
+
+    Does nothing if no MoE block class exists for ``model_type`` or if
+    ``model`` contains no instances of it.
+    """
+    from types import MethodType
+
+    module_path = f"transformers.models.{model_type}.modeling_{model_type}"
+    model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type)
+    try:
+        module = __import__(module_path, fromlist=[model_cls_prefix])
+    except ImportError:
+        return 0
+
+    moe_block_cls = _resolve_moe_block_cls(module, model_cls_prefix)
+    if moe_block_cls is None:
+        return 0
+
+    wrapped = 0
+    for sub in model.modules():
+        if not isinstance(sub, moe_block_cls):
+            continue
+        # If there is no per-instance ``forward`` binding, the class-level
+        # tiled patch from ``patch_tiled_mlp`` is still active; nothing to do.
+        # Kernels (when scattermoe-lora kernelizes the model) installs a
+        # bound method on the instance, which shows up in ``__dict__``.
+        if "forward" not in sub.__dict__:
+            continue
+        # Snapshot the instance-level forward installed by kernels.
+        bound_forward = sub.__dict__["forward"]
+        # Convert bound method back to a plain function that takes (self, x)
+        # so the tiled wrapper can pass `self` through to it.
+        if hasattr(bound_forward, "__func__"):
+            inner_fn = bound_forward.__func__
+        else:
+            # Instance-bound closure: wrap it so the (self, x) signature lines up.
+            def _adapt(orig):
+                def _call(self_, x):  # noqa: ARG001
+                    return orig(x)
+
+                return _call
+
+            inner_fn = _adapt(bound_forward)
+
+        tiled_forward = _build_tiled_forward(
+            inner_forward=inner_fn,
+            model_type=model_type,
+            cfg_num_shards=cfg_num_shards,
+            is_moe_block=True,
+        )
+        # Each instance needs its own bookkeeping (compute_params,
+        # dist_impl) so concurrent forwards across blocks don't stomp.
+        sub._compute_params = []
+        sub._tiled_mlp_dist_impl = None
+        sub.forward = MethodType(tiled_forward, sub)
+        wrapped += 1
+
+    if wrapped:
         LOG.info(
-            f"Successfully monkey-patched TiledMLP for model_type: {model_type}",
+            f"Successfully wrapped TiledMLP around {wrapped} {moe_block_cls.__name__} "
+            f"instance(s) for model_type: {model_type}"
         )
-    except (ImportError, AttributeError) as e:
-        raise RuntimeError(
-            f"Could not import MLP class for model_type: {model_type}. Error: {str(e)}"
-        ) from e
+    return wrapped
diff --git a/src/axolotl/prompt_strategies/chat_template.py b/src/axolotl/prompt_strategies/chat_template.py
index be6a38800e..2b831d4976 100644
--- a/src/axolotl/prompt_strategies/chat_template.py
+++ b/src/axolotl/prompt_strategies/chat_template.py
@@ -25,6 +25,17 @@
 LOG.setLevel("INFO")
 
 
+def _extract_input_ids(result):
+    """Return the ``input_ids`` from a ``build_prompt`` result.
+
+    With a processor configured, ``build_prompt`` returns a dict of
+    processor outputs (``input_ids``, ``attention_mask``, optional
+    ``pixel_values``, etc.). Without a processor it returns a plain
+    ``list[int]`` from the tokenizer.
+    """
+    return result["input_ids"] if isinstance(result, dict) else result
+
+
 class ChatTemplatePrompter(Prompter):
     """Prompter for HF chat templates"""
 
@@ -468,7 +479,10 @@ def _tokenize_single_prompt(self, prompt: dict) -> Dict[str, List[int]]:
 
         turns = self.get_conversation_thread(prompt)
         tools = self._get_tools(prompt)
-        input_ids = self.prompter.build_prompt(turns, tools=tools)  # type: ignore
+        result = self.prompter.build_prompt(turns, tools=tools)  # type: ignore
+        if not isinstance(result, dict):
+            result = {"input_ids": result}
+        input_ids = result["input_ids"]
         labels = [IGNORE_TOKEN_ID] * len(input_ids)
 
         last_eos_idx = -1
@@ -624,11 +638,11 @@ def _tokenize_single_prompt(self, prompt: dict) -> Dict[str, List[int]]:
 
         LOG.debug(f"Final labels: {labels}")
 
-        return {
-            "input_ids": input_ids,
-            "labels": labels,
-            "attention_mask": [1] * len(input_ids),
-        }
+        # ``result`` already carries any processor outputs (pixel_values, image
+        # grid info, etc.); just set the fields we computed locally.
+        result["labels"] = labels
+        result.setdefault("attention_mask", [1] * len(input_ids))
+        return result
 
     def find_first_eos_token(self, input_ids, start_idx):
         eos_token_id = self.tokenizer.eos_token_id
@@ -705,14 +719,18 @@ def find_turn(
         real_last_index = len(turns) - 1
 
         # Generate the conversation up to the turn, with final turn replaced with dummy content
-        dummy_ids = self.prompter.build_prompt(
-            turns_with_empty, tools=tools, real_last_index=real_last_index
-        )  # type: ignore
+        dummy_ids = _extract_input_ids(
+            self.prompter.build_prompt(  # type: ignore
+                turns_with_empty, tools=tools, real_last_index=real_last_index
+            )
+        )
 
         # Generate the conversation up to the turn, with final turn included
-        full_ids = self.prompter.build_prompt(
-            turns_with_content, tools=tools, real_last_index=real_last_index
-        )  # type: ignore
+        full_ids = _extract_input_ids(
+            self.prompter.build_prompt(  # type: ignore
+                turns_with_content, tools=tools, real_last_index=real_last_index
+            )
+        )
 
         if not full_ids or not dummy_ids:
             LOG.warning(f"Empty template generated for turn {turn_idx}")
diff --git a/src/axolotl/prompt_strategies/multimodal_pretrain.py b/src/axolotl/prompt_strategies/multimodal_pretrain.py
index c0bcb6bb50..83f035b64a 100644
--- a/src/axolotl/prompt_strategies/multimodal_pretrain.py
+++ b/src/axolotl/prompt_strategies/multimodal_pretrain.py
@@ -1,29 +1,191 @@
-"""Multimodal CPT helpers (image-token autodetection + processor compat).
-
-Only the streaming `pretraining_dataset` route is wired in v1; the
-non-streaming `datasets:` route (strategy class + `load()`) is deferred to a
-follow-on PR that also wires `build_collator` to route MM CPT batches outside
-the `training_args.pretraining` branch.
-"""
-
 from __future__ import annotations
 
 from dataclasses import dataclass
+from typing import Any, Optional
 
-from transformers import ProcessorMixin
+from datasets import Dataset, IterableDataset
+from transformers import PreTrainedTokenizerBase, ProcessorMixin
 
+from axolotl.prompt_tokenizers import DatasetWrappingStrategy
 from axolotl.utils.logging import get_logger
 
 LOG = get_logger(__name__)
 
 
-def load(*_args, **_kwargs):
-    raise ValueError(
-        "multimodal_pretrain is only supported via pretraining_dataset "
-        "with streaming: true — see docs/multimodal.qmd"
+class MultiModalPretrainDatasetWrappingStrategy(DatasetWrappingStrategy):
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        processor: ProcessorMixin,
+        sequence_len: int,
+        text_column: str = "text",
+        image_column: str = "images",
+        image_token: str | None = None,
+    ):
+        self.tokenizer = tokenizer
+        self.processor = processor
+        self.sequence_len = sequence_len
+        self.text_column = text_column
+        self.image_column = image_column
+        self.image_token_spec = build_image_token_spec(processor, override=image_token)
+
+    def _encode_batch(self, examples: dict[str, list]) -> dict[str, list]:
+        return encode_multimodal_pretrain(
+            examples,
+            tokenizer=self.tokenizer,
+            max_tokens=self.sequence_len,
+            image_token=self.image_token_spec.image_token,
+            image_token_id=self.image_token_spec.image_token_id,
+            text_column=self.text_column,
+            image_column=self.image_column,
+            enforce_max_length=False,
+        )
+
+    def wrap_dataset(
+        self,
+        dataset,
+        process_count: int | None = None,
+        keep_in_memory: bool | None = False,
+        **kwargs,
+    ) -> Dataset | IterableDataset:
+        if isinstance(dataset, Dataset):
+            remove_columns = list(dataset.column_names)
+        elif getattr(dataset, "features", None):
+            remove_columns = list(dataset.features.keys())
+        else:
+            remove_columns = None
+
+        map_kwargs: dict[str, Any] = {
+            "batched": True,
+            "remove_columns": remove_columns,
+            "desc": "Tokenizing multimodal CPT dataset",
+        }
+        if isinstance(dataset, Dataset):
+            if process_count:
+                map_kwargs["num_proc"] = process_count
+            if keep_in_memory is not None:
+                map_kwargs["keep_in_memory"] = keep_in_memory
+
+        return dataset.map(self._encode_batch, **map_kwargs)
+
+
+def load(
+    tokenizer,
+    cfg,
+    ds_cfg: Optional[dict[str, Any]] = None,
+    processor: ProcessorMixin | None = None,
+):
+    ds_cfg = ds_cfg or {}
+    if processor is None:
+        raise ValueError(
+            "Multimodal CPT (type: multimodal_pretrain) requires a processor. "
+            "Set `processor_type: AutoProcessor` (or the concrete processor "
+            "class) in your config."
+        )
+    check_processor_compatibility(processor)
+    processor_tokenizer = getattr(processor, "tokenizer", None)
+    if processor_tokenizer is not None and processor_tokenizer is not tokenizer:
+        raise ValueError(
+            "Multimodal CPT requires `tokenizer` to be `processor.tokenizer` "
+            "so image placeholder ids stay aligned during encoding."
+        )
+
+    text_column = ds_cfg.get("text_column") or "text"
+    image_column = ds_cfg.get("image_column") or "images"
+    LOG.info(
+        "multimodal CPT dataset path: text_column=%r image_column=%r",
+        text_column,
+        image_column,
+    )
+    return MultiModalPretrainDatasetWrappingStrategy(
+        tokenizer=tokenizer,
+        processor=processor,
+        sequence_len=cfg.sequence_len,
+        text_column=text_column,
+        image_column=image_column,
+        image_token=ds_cfg.get("image_token"),
     )
 
 
+def encode_multimodal_pretrain(
+    examples: dict[str, list],
+    tokenizer: PreTrainedTokenizerBase,
+    max_tokens: int,
+    image_token: str,
+    image_token_id: int,
+    text_column: str = "text",
+    image_column: str = "images",
+    enforce_max_length: bool = True,
+) -> dict[str, list]:
+    texts: list[str] = examples[text_column]
+    imgs_list: list[list[str]] = examples[image_column]
+
+    if len(texts) != len(imgs_list):
+        raise ValueError(
+            f"encode_multimodal_pretrain: text column has {len(texts)} rows "
+            f"but image column has {len(imgs_list)}"
+        )
+
+    input_ids: list[list[int]] = []
+    labels: list[list[int]] = []
+    attention_mask: list[list[int]] = []
+    keep_images: list[list[str]] = []
+    keep_text: list[str] = []
+
+    for text, imgs in zip(texts, imgs_list, strict=True):
+        if not isinstance(text, str):
+            raise TypeError(
+                f"encode_multimodal_pretrain: `{text_column}` must be str, "
+                f"got {type(text).__name__}."
+            )
+        if imgs is None:
+            imgs = []
+        if not isinstance(imgs, (list, tuple)):
+            raise ValueError(
+                f"encode_multimodal_pretrain: row's `{image_column}` must be "
+                f"a list; got {type(imgs).__name__}"
+            )
+        for j, ip in enumerate(imgs):
+            if not isinstance(ip, str):
+                raise TypeError(
+                    f"encode_multimodal_pretrain: image {j} in row must be "
+                    f"str, got {type(ip).__name__}."
+                )
+        # Avoid truncation before processor re-tokenization.
+        enc = tokenizer(text, add_special_tokens=True)
+        ids = list(enc["input_ids"]) + [tokenizer.eos_token_id]
+        mask = list(enc["attention_mask"]) + [1]
+        # Count by id; text.count can match <image> inside <image_soft_token>.
+        n_placeholders = sum(1 for t in ids if t == image_token_id)
+        if n_placeholders != len(imgs):
+            raise ValueError(
+                f"Multimodal CPT row has {n_placeholders} occurrence(s) of "
+                f"{image_token!r} in text but {len(imgs)} image path(s). "
+                f"Text and image count must match (one placeholder per image)."
+            )
+        if enforce_max_length and len(ids) > max_tokens:
+            raise ValueError(
+                f"Multimodal CPT row tokenizes to {len(ids)} tokens which "
+                f"exceeds sequence_len={max_tokens}. Pre-chunk your text or "
+                f"raise sequence_len (image patch expansion at the processor "
+                f"may push the final length even higher)."
+            )
+        # Labels = ids; collator masks image-family ids after re-tokenization.
+        input_ids.append(ids)
+        labels.append(list(ids))
+        attention_mask.append(mask)
+        keep_images.append(list(imgs))
+        keep_text.append(text)
+
+    return {
+        "input_ids": input_ids,
+        "labels": labels,
+        "attention_mask": attention_mask,
+        "images": keep_images,
+        "_mm_text": keep_text,
+    }
+
+
 def _get_incompatible_processor_classes() -> tuple[type, ...]:
     import importlib
 
diff --git a/src/axolotl/utils/chat_templates/templates/gemma4.jinja b/src/axolotl/utils/chat_templates/templates/gemma4.jinja
index 780957c941..4c79e38276 100644
--- a/src/axolotl/utils/chat_templates/templates/gemma4.jinja
+++ b/src/axolotl/utils/chat_templates/templates/gemma4.jinja
@@ -1,9 +1,9 @@
-{%- macro format_parameters(properties, required) -%}
+{%- macro format_parameters(properties, required, filter_keys=false) -%}
     {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
     {%- set ns = namespace(found_first=false) -%}
     {%- for key, value in properties | dictsort -%}
         {%- set add_comma = false -%}
-        {%- if key not in standard_keys -%}
+        {%- if not filter_keys or key not in standard_keys -%}
             {%- if ns.found_first %},{% endif -%}
             {%- set ns.found_first = true -%}
             {{ key }}:{
@@ -11,34 +11,15 @@
                 description:<|"|>{{ value['description'] }}<|"|>
                 {%- set add_comma = true -%}
             {%- endif -%}
-            {%- if value['nullable'] %}
-                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
-                nullable:true
-            {%- endif -%}
             {%- if value['type'] | upper == 'STRING' -%}
                 {%- if value['enum'] -%}
                     {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
                     enum:{{ format_argument(value['enum']) }}
                 {%- endif -%}
-            {%- elif value['type'] | upper == 'OBJECT' -%}
-                ,properties:{
-                {%- if value['properties'] is defined and value['properties'] is mapping -%}
-                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
-                {%- elif value is mapping -%}
-                    {{- format_parameters(value, value['required'] | default([])) -}}
-                {%- endif -%}
-                }
-                {%- if value['required'] -%}
-                    ,required:[
-                    {%- for item in value['required'] | default([]) -%}
-                        <|"|>{{- item -}}<|"|>
-                        {%- if not loop.last %},{% endif -%}
-                    {%- endfor -%}
-                    ]
-                {%- endif -%}
             {%- elif value['type'] | upper == 'ARRAY' -%}
                 {%- if value['items'] is mapping and value['items'] -%}
-                    ,items:{
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
                     {%- set ns_items = namespace(found_first=false) -%}
                     {%- for item_key, item_value in value['items'] | dictsort -%}
                         {%- if item_value is not none -%}
@@ -71,6 +52,32 @@
                     }
                 {%- endif -%}
             {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
             {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
             type:<|"|>{{ value['type'] | upper }}<|"|>}
         {%- endif -%}
@@ -138,39 +145,54 @@
         {{- argument -}}
     {%- endif -%}
 {%- endmacro -%}
-{#- Removes '<|channel>...<channel|>' thinking blocks from model output.
-    Splits on the end token '<channel|>', then checks each part for the start
-    token '<|channel>' and keeps only the text before it. -#}
 {%- macro strip_thinking(text) -%}
-    {%- set ns = namespace(cleaned='') -%}
+    {%- set ns = namespace(result='') -%}
     {%- for part in text.split('<channel|>') -%}
         {%- if '<|channel>' in part -%}
-            {%- set ns.cleaned = ns.cleaned + part.split('<|channel>')[0] -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
         {%- else -%}
-            {%- set ns.cleaned = ns.cleaned + part -%}
+            {%- set ns.result = ns.result + part -%}
         {%- endif -%}
     {%- endfor -%}
-    {{- ns.cleaned | trim -}}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
 {%- endmacro -%}
 
 {%- set ns = namespace(prev_message_type=None) -%}
 {%- set loop_messages = messages -%}
-{{ bos_token }}
+{{- bos_token -}}
 {#- Handle System/Tool Definitions Block -#}
 {%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
     {{- '<|turn>system\n' -}}
-
     {#- Inject Thinking token at the very top of the FIRST system turn -#}
     {%- if enable_thinking is defined and enable_thinking -%}
-        {{- '<|think|>' -}}
+        {{- '<|think|>\n' -}}
         {%- set ns.prev_message_type = 'think' -%}
     {%- endif -%}
-
     {%- if messages[0]['role'] in ['system', 'developer'] -%}
-        {{- messages[0]['content'] | trim -}}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
         {%- set loop_messages = messages[1:] -%}
     {%- endif -%}
-
     {%- if tools -%}
         {%- for tool in tools %}
             {{- '<|tool>' -}}
@@ -179,93 +201,163 @@
         {%- endfor %}
         {%- set ns.prev_message_type = 'tool' -%}
     {%- endif -%}
-
     {{- '<turn|>\n' -}}
 {%- endif %}
 
+{#- Pre-scan: find last user message index for reasoning guard -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
 {#- Loop through messages -#}
 {%- for message in loop_messages -%}
-    {#- Reset so only special message types (tool_call, image, etc.) influence
-        the generation prompt formatting below. Plain text leaves it as None. -#}
-    {%- set ns.prev_message_type = None -%}
-    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
-        {{- '<|turn>' + role + '\n' }}
-
-            {%- if message['tool_calls'] -%}
-                {%- for tool_call in message['tool_calls'] -%}
-                    {%- set function = tool_call['function'] -%}
-                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
-                    {%- if function['arguments'] is mapping -%}
-                        {%- set ns_args = namespace(found_first=false) -%}
-                        {%- for key, value in function['arguments'] | dictsort -%}
-                            {%- if ns_args.found_first %},{% endif -%}
-                            {%- set ns_args.found_first = true -%}
-                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
-                        {%- endfor -%}
-                    {%- elif function['arguments'] is string -%}
-                        {{- function['arguments'] -}}
-                    {%- endif -%}
-                    {{- '}<tool_call|>' -}}
-                {%- endfor -%}
-                {%- set ns.prev_message_type = 'tool_call' -%}
+{%- if message['role'] != 'tool' -%}
+{%- set ns.prev_message_type = None -%}
+{%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+{#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+{%- set prev_nt = namespace(role=None, found=false) -%}
+{%- if loop.index0 > 0 -%}
+    {%- for j in range(loop.index0 - 1, -1, -1) -%}
+        {%- if not prev_nt.found -%}
+            {%- if loop_messages[j]['role'] != 'tool' -%}
+                {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                {%- set prev_nt.found = true -%}
             {%- endif -%}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endif -%}
+{%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+{%- if not continue_same_model_turn -%}
+    {{- '<|turn>' + role + '\n' }}
+{%- endif -%}
 
-            {%- if message['tool_responses'] -%}
-                {#- Tool Response handling -#}
-                {%- for tool_response in message['tool_responses'] -%}
-                    {{- '<|tool_response>' -}}
-                    {%- if tool_response['response'] is mapping -%}
-                        {{- 'response:' + tool_response['name'] | default('unknown') + '{' -}}
-                        {%- for key, value in tool_response['response'] | dictsort -%}
-                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
-                            {%- if not loop.last %},{% endif -%}
-                        {%- endfor -%}
-                        {{- '}' -}}
-                    {%- else -%}
-                        {{- 'response:' + tool_response['name'] | default('unknown') + '{value:' + format_argument(tool_response['response'], escape_keys=False) + '}' -}}
-                    {%- endif -%}
-                    {{- '<tool_response|>' -}}
-                {%- endfor -%}
-                {%- set ns.prev_message_type = 'tool_response' -%}
-            {%- endif -%}
+{#- Render reasoning/reasoning_content as thinking channel -#}
+{%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+{%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+    {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+{%- endif -%}
+
+{%- if message['tool_calls'] -%}
+    {%- for tool_call in message['tool_calls'] -%}
+        {%- set function = tool_call['function'] -%}
+        {{- '<|tool_call>call:' + function['name'] + '{' -}}
+        {%- if function['arguments'] is mapping -%}
+            {%- set ns_args = namespace(found_first=false) -%}
+            {%- for key, value in function['arguments'] | dictsort -%}
+                {%- if ns_args.found_first %},{% endif -%}
+                {%- set ns_args.found_first = true -%}
+                {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- endfor -%}
+        {%- elif function['arguments'] is string -%}
+            {{- function['arguments'] -}}
+        {%- endif -%}
+        {{- '}<tool_call|>' -}}
+    {%- endfor -%}
+    {%- set ns.prev_message_type = 'tool_call' -%}
+{%- endif -%}
 
-            {%- if message['content'] is string -%}
-                {%- if role == 'model' -%}
-                    {{- strip_thinking(message['content']) -}}
-                {%- else -%}
-                    {{- message['content'] | trim -}}
+{%- set ns_tr_out = namespace(flag=false) -%}
+{%- if message.get('tool_responses') -%}
+    {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+    {%- for tool_response in message['tool_responses'] -%}
+        {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+        {%- set ns_tr_out.flag = true -%}
+        {%- set ns.prev_message_type = 'tool_response' -%}
+    {%- endfor -%}
+{%- elif message.get('tool_calls') -%}
+    {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+    {%- set ns_tool_scan = namespace(stopped=false) -%}
+    {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+        {%- if ns_tool_scan.stopped -%}
+        {%- elif loop_messages[k]['role'] != 'tool' -%}
+            {%- set ns_tool_scan.stopped = true -%}
+        {%- else -%}
+            {%- set follow = loop_messages[k] -%}
+            {#- Resolve tool_call_id to function name -#}
+            {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+            {%- for tc in message['tool_calls'] -%}
+                {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                    {%- set ns_tname.name = tc['function']['name'] -%}
                 {%- endif -%}
-            {%- elif message['content'] is sequence -%}
-                {%- for item in message['content'] -%}
-                    {%- if item['type'] == 'text' -%}
-                        {%- if role == 'model' -%}
-                            {{- strip_thinking(item['text']) -}}
-                        {%- else -%}
-                            {{- item['text'] | trim -}}
-                        {%- endif -%}
-                    {%- elif item['type'] == 'image' -%}
-                        {{- '\n\n<|image|>\n\n' -}}
-                        {%- set ns.prev_message_type = 'image' -%}
-                    {%- elif item['type'] == 'audio' -%}
+            {%- endfor -%}
+            {#- Handle content as string or content-parts array -#}
+            {%- set tool_body = follow.get('content') -%}
+            {%- if tool_body is string -%}
+                {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+            {%- elif tool_body is sequence and tool_body is not string -%}
+                {%- set ns_txt = namespace(s='') -%}
+                {%- for part in tool_body -%}
+                    {%- if part.get('type') == 'text' -%}
+                        {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                    {%- endif -%}
+                {%- endfor -%}
+                {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                {%- for part in tool_body -%}
+                    {%- if part.get('type') == 'image' -%}
+                        {{- '<|image|>' -}}
+                    {%- elif part.get('type') == 'audio' -%}
                         {{- '<|audio|>' -}}
-                        {%- set ns.prev_message_type = 'audio' -%}
-                    {%- elif item['type'] == 'video' -%}
-                        {{- '\n\n<|video|>\n\n' -}}
-                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- elif part.get('type') == 'video' -%}
+                        {{- '<|video|>' -}}
                     {%- endif -%}
                 {%- endfor -%}
+            {%- else -%}
+                {{- format_tool_response_block(ns_tname.name, tool_body) -}}
             {%- endif -%}
+            {%- set ns_tr_out.flag = true -%}
+            {%- set ns.prev_message_type = 'tool_response' -%}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endif -%}
 
-        {%- if not (message['tool_responses'] and not message['content']) -%}
-            {{- '<turn|>\n' -}}
+{%- set captured_content -%}
+{%- if message['content'] is string -%}
+    {%- if role == 'model' -%}
+        {{- strip_thinking(message['content']) -}}
+    {%- else -%}
+        {{- message['content'] | trim -}}
+    {%- endif -%}
+{%- elif message['content'] is sequence -%}
+    {%- for item in message['content'] -%}
+        {%- if item['type'] == 'text' -%}
+            {%- if role == 'model' -%}
+                {{- strip_thinking(item['text']) -}}
+            {%- else -%}
+                {{- item['text'] | trim -}}
+            {%- endif -%}
+        {%- elif item['type'] == 'image' -%}
+            {{- '<|image|>' -}}
+            {%- set ns.prev_message_type = 'image' -%}
+        {%- elif item['type'] == 'audio' -%}
+            {{- '<|audio|>' -}}
+            {%- set ns.prev_message_type = 'audio' -%}
+        {%- elif item['type'] == 'video' -%}
+            {{- '<|video|>' -}}
+            {%- set ns.prev_message_type = 'video' -%}
         {%- endif -%}
+    {%- endfor -%}
+{%- endif -%}
+{%- endset -%}
+
+{{- captured_content -}}
+{%- set has_content = captured_content | trim | length > 0 -%}
+
+{%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+    {{- '<|tool_response>' -}}
+{%- elif not (ns_tr_out.flag and not has_content) -%}
+    {{- '<turn|>\n' -}}
+{%- endif -%}
+{%- endif -%}
 {%- endfor -%}
 
 {%- if add_generation_prompt -%}
-    {%- if ns.prev_message_type != 'tool_response' -%}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
         {{- '<|turn>model\n' -}}
-    {%- endif -%}
-    {%- if not enable_thinking | default(false) -%}
-        {{- '<|channel>thought\n<channel|>' -}}
+        {%- if not enable_thinking | default(false) -%}
+            {{- '<|channel>thought\n<channel|>' -}}
+        {%- endif -%}
     {%- endif -%}
 {%- endif -%}
diff --git a/src/axolotl/utils/config/__init__.py b/src/axolotl/utils/config/__init__.py
index 82dab72d88..1ed857a5fa 100644
--- a/src/axolotl/utils/config/__init__.py
+++ b/src/axolotl/utils/config/__init__.py
@@ -25,6 +25,7 @@
 from axolotl.utils.schemas.datasets import (
     DPODataset,
     KTODataset,
+    MultiModalPretrainDataset,
     SFTDataset,
     SyntheticDataset,
 )
@@ -353,6 +354,28 @@ def validate_config(
                 cfg["datasets"][idx] = SyntheticDataset(
                     **(ds_cfg if isinstance(ds_cfg, dict) else dict(ds_cfg))
                 )
+            elif (
+                (
+                    ds_cfg.get("type")
+                    if isinstance(ds_cfg, dict)
+                    else getattr(ds_cfg, "type", None)
+                )
+                == "multimodal_pretrain"
+            ) and not isinstance(ds_cfg, MultiModalPretrainDataset):
+                cfg["datasets"][idx] = MultiModalPretrainDataset(
+                    **(ds_cfg if isinstance(ds_cfg, dict) else dict(ds_cfg))
+                )
+            elif bool(
+                ds_cfg.get("multimodal")
+                if isinstance(ds_cfg, dict)
+                else getattr(ds_cfg, "multimodal", None)
+            ):
+                raise ValueError(
+                    "Multimodal CPT under `datasets` requires "
+                    "`type: multimodal_pretrain`. The `multimodal: true` "
+                    "shortcut is only supported for `pretraining_dataset` "
+                    "and `test_datasets`."
+                )
             elif not isinstance(ds_cfg, SFTDataset):
                 cfg["datasets"][idx] = SFTDataset(**dict(ds_cfg))
 
diff --git a/src/axolotl/utils/data/sft.py b/src/axolotl/utils/data/sft.py
index 16303ab5c5..4d0f8dd601 100644
--- a/src/axolotl/utils/data/sft.py
+++ b/src/axolotl/utils/data/sft.py
@@ -324,7 +324,7 @@ def _processor_fingerprint(processor: ProcessorMixin | None) -> str | None:
     # Module-qualified identity so same-named classes in different modules don't collide.
     proc_cls = type(processor)
     parts = [f"{proc_cls.__module__}.{proc_cls.__qualname__}"]
-    for attr in ("image_token", "video_token", "image_seq_length"):
+    for attr in ("image_token", "boi_token", "video_token", "image_seq_length"):
         if hasattr(processor, attr):
             parts.append(f"{attr}={getattr(processor, attr)!r}")
     image_processor = getattr(processor, "image_processor", None)
@@ -492,7 +492,10 @@ def _load_tokenized_prepared_datasets(
 
     # Generate dataset hash for caching
     dataset_hash = generate_dataset_hash_from_config(
-        cfg, datasets_configs, tokenizer.name_or_path
+        cfg,
+        datasets_configs,
+        tokenizer.name_or_path,
+        _processor_fingerprint(processor),
     )
 
     # Try loading from hub if push_dataset_to_hub is configured
@@ -570,7 +573,10 @@ def _load_raw_datasets(
 
         # Save the prepared dataset
         dataset_hash = generate_dataset_hash_from_config(
-            cfg, datasets_configs, tokenizer.name_or_path
+            cfg,
+            datasets_configs,
+            tokenizer.name_or_path,
+            _processor_fingerprint(processor),
         )
         save_preprocessed_dataset(cfg, dataset, dataset_hash, split)
 
diff --git a/src/axolotl/utils/data/shared.py b/src/axolotl/utils/data/shared.py
index c23b36c76a..48cc779f93 100644
--- a/src/axolotl/utils/data/shared.py
+++ b/src/axolotl/utils/data/shared.py
@@ -505,6 +505,38 @@ def try_load_from_hub(
         return None
 
 
+def _dataset_hash_get(dataset_config, key: str, default=None):
+    if hasattr(dataset_config, "get"):
+        try:
+            return dataset_config.get(key, default)
+        except (AttributeError, KeyError, TypeError):
+            pass
+    return getattr(dataset_config, key, default)
+
+
+def _dataset_hash_component(dataset_config) -> str:
+    component = (
+        f"{_dataset_hash_get(dataset_config, 'path')}:"
+        f"{_dataset_hash_get(dataset_config, 'type')}:"
+        f"{_dataset_hash_get(dataset_config, 'shards')}:"
+        f"{_dataset_hash_get(dataset_config, 'conversation')}:"
+        f"{_dataset_hash_get(dataset_config, 'split')}:"
+        f"{_dataset_hash_get(dataset_config, 'temperature') or 1.0}"
+    )
+    if _dataset_hash_get(dataset_config, "type") == "multimodal_pretrain" or bool(
+        _dataset_hash_get(dataset_config, "multimodal")
+    ):
+        component += (
+            f":{_dataset_hash_get(dataset_config, 'text_column')}:"
+            f"{_dataset_hash_get(dataset_config, 'image_column')}:"
+            f"{_dataset_hash_get(dataset_config, 'image_base_dir')}:"
+            f"{_dataset_hash_get(dataset_config, 'image_token')}:"
+            f"{_dataset_hash_get(dataset_config, 'data_files')}:"
+            f"{_dataset_hash_get(dataset_config, 'ds_type')}"
+        )
+    return component
+
+
 def generate_pretraining_dataset_hash(
     cfg: DictDefault,
     pretraining_config: DictDefault,
@@ -544,7 +576,10 @@ def generate_pretraining_dataset_hash(
 
 
 def generate_dataset_hash_from_config(
-    cfg: DictDefault, cfg_datasets: list, tokenizer_name: str
+    cfg: DictDefault,
+    cfg_datasets: list,
+    tokenizer_name: str,
+    processor_name: str | None = None,
 ) -> str:
     """Generate a hash to uniquely identify a dataset configuration for SFT.
 
@@ -565,12 +600,19 @@ def generate_dataset_hash_from_config(
     else:
         tokenizer_fingerprint = tokenizer_name
 
+    has_mm = any(
+        _dataset_hash_get(d, "type") == "multimodal_pretrain"
+        or bool(_dataset_hash_get(d, "multimodal"))
+        for d in cfg_datasets
+    )
+    processor_fingerprint = f"|processor={processor_name}" if has_mm else ""
+
     config_str = (
         f"{cfg.sequence_len}@{cfg.sample_packing}@{cfg.eval_sample_packing}@"
         f"{cfg.group_by_length}@{cfg.kd_temperature or 1.0}@"
         f"{cfg.dataset_exact_deduplication or False}|"
-        f"{'|'.join(sorted([f'{d.path}:{d.type}:{d.shards}:{d.conversation}:{d.split}:{d.temperature or 1.0}' for d in cfg_datasets]))}"
-        f"|{tokenizer_fingerprint}"
+        f"{'|'.join(_dataset_hash_component(d) for d in cfg_datasets)}"
+        f"|{tokenizer_fingerprint}{processor_fingerprint}"
     )
     return str(md5(config_str))
 
diff --git a/src/axolotl/utils/data/streaming.py b/src/axolotl/utils/data/streaming.py
index b71dcb0d93..027e15af5a 100644
--- a/src/axolotl/utils/data/streaming.py
+++ b/src/axolotl/utils/data/streaming.py
@@ -185,75 +185,20 @@ def encode_streaming_multimodal(
     text_column: str = "text",
     image_column: str = "images",
 ) -> Dict[str, List]:
-    texts: List[str] = examples[text_column]
-    imgs_list: List[List[str]] = examples[image_column]
-
-    if len(texts) != len(imgs_list):
-        raise ValueError(
-            f"encode_streaming_multimodal: text column has {len(texts)} rows "
-            f"but image column has {len(imgs_list)}"
-        )
+    from axolotl.prompt_strategies.multimodal_pretrain import (
+        encode_multimodal_pretrain,
+    )
 
-    input_ids: List[List[int]] = []
-    labels: List[List[int]] = []
-    attention_mask: List[List[int]] = []
-    keep_images: List[List[str]] = []
-    keep_text: List[str] = []
-
-    for text, imgs in zip(texts, imgs_list, strict=True):
-        if not isinstance(text, str):
-            raise TypeError(
-                f"encode_streaming_multimodal: `{text_column}` must be str, "
-                f"got {type(text).__name__}."
-            )
-        if imgs is None:
-            imgs = []
-        if not isinstance(imgs, (list, tuple)):
-            raise ValueError(
-                f"encode_streaming_multimodal: row's `{image_column}` must be "
-                f"a list; got {type(imgs).__name__}"
-            )
-        for j, ip in enumerate(imgs):
-            if not isinstance(ip, str):
-                raise TypeError(
-                    f"encode_streaming_multimodal: image {j} in row must be "
-                    f"str, got {type(ip).__name__}."
-                )
-        # No truncation: counting on truncated ids and storing untruncated text
-        # (which the collator re-tokenizes without truncation) silently produces
-        # oversize batches and confusing placeholder/image-count mismatches.
-        enc = tokenizer(text, add_special_tokens=True)
-        ids = list(enc["input_ids"]) + [tokenizer.eos_token_id]
-        mask = list(enc["attention_mask"]) + [1]
-        # Count by id — `text.count` substring-matches `<image>` in `<image_soft_token>`.
-        n_placeholders = sum(1 for t in ids if t == image_token_id)
-        if n_placeholders != len(imgs):
-            raise ValueError(
-                f"Multimodal CPT row has {n_placeholders} occurrence(s) of "
-                f"{image_token!r} in text but {len(imgs)} image path(s). "
-                f"Text and image count must match (one placeholder per image)."
-            )
-        if len(ids) > max_tokens:
-            raise ValueError(
-                f"Multimodal CPT row tokenizes to {len(ids)} tokens which "
-                f"exceeds sequence_len={max_tokens}. Pre-chunk your text or "
-                f"raise sequence_len (image patch expansion at the processor "
-                f"may push the final length even higher)."
-            )
-        # Labels = ids; collator masks image-family ids after re-tokenization.
-        input_ids.append(ids)
-        labels.append(list(ids))
-        attention_mask.append(mask)
-        keep_images.append(list(imgs))
-        keep_text.append(text)
-
-    return {
-        "input_ids": input_ids,
-        "labels": labels,
-        "attention_mask": attention_mask,
-        "images": keep_images,
-        "_mm_text": keep_text,
-    }
+    return encode_multimodal_pretrain(
+        examples,
+        tokenizer=tokenizer,
+        max_tokens=max_tokens,
+        image_token=image_token,
+        image_token_id=image_token_id,
+        text_column=text_column,
+        image_column=image_column,
+        enforce_max_length=True,
+    )
 
 
 def wrap_streaming_dataset(
diff --git a/src/axolotl/utils/fp32_norms.py b/src/axolotl/utils/fp32_norms.py
new file mode 100644
index 0000000000..67793998f8
--- /dev/null
+++ b/src/axolotl/utils/fp32_norms.py
@@ -0,0 +1,135 @@
+"""Helpers for keeping selected norm modules in fp32 under FSDP2."""
+
+from __future__ import annotations
+
+from typing import Any, Sequence
+
+import torch
+
+from axolotl.utils.logging import get_logger
+
+LOG = get_logger(__name__)
+
+DEFAULT_FP32_NORM_SUFFIXES: tuple[str, ...] = ("RMSNorm", "LayerNorm")
+
+
+def _matches_norm_class(module: "torch.nn.Module", patterns: Sequence[str]) -> bool:
+    """Match a module against class-name patterns.
+
+    Two matching modes, chosen per-pattern by presence of a dot:
+      - Fully qualified (contains "."): matches f"{module.__module__}.{cls}" exactly.
+      - Suffix (no dot): matches type(module).__name__.endswith(pattern).
+    Empty / whitespace-only patterns are skipped (``cls_name.endswith("")``
+    is True for every class, which would silently match everything).
+    """
+    cls = type(module)
+    cls_name = cls.__name__
+    qualified = f"{cls.__module__}.{cls_name}"
+    for pattern in patterns:
+        if not pattern or not pattern.strip():
+            continue
+        if "." in pattern:
+            if qualified == pattern:
+                return True
+        elif cls_name.endswith(pattern):
+            return True
+    return False
+
+
+def get_fp32_norm_patterns(source) -> list[str] | None:
+    """Resolve configured fp32 norm patterns from a config or tagged model."""
+    tagged_patterns = getattr(source, "_axolotl_fp32_norm_patterns", None)
+    if tagged_patterns is not None:
+        return list(tagged_patterns)
+
+    if not getattr(source, "fp32_norms", False):
+        return None
+
+    configured_patterns = getattr(source, "fp32_norm_classes", None)
+    if configured_patterns:
+        return list(configured_patterns)
+
+    return list(DEFAULT_FP32_NORM_SUFFIXES)
+
+
+def tag_model_fp32_norms(model: "torch.nn.Module", cfg) -> list[str] | None:
+    """Attach the resolved fp32 norm patterns to the model for FSDP2 prepare."""
+    patterns = get_fp32_norm_patterns(cfg)
+    if patterns is None:
+        if hasattr(model, "_axolotl_fp32_norm_patterns"):
+            delattr(model, "_axolotl_fp32_norm_patterns")
+        return None
+
+    model._axolotl_fp32_norm_patterns = list(patterns)
+    return patterns
+
+
+def shard_norms_fp32(
+    model: "torch.nn.Module",
+    source=None,
+    *,
+    patterns: Sequence[str] | None = None,
+    fully_shard_kwargs: dict[str, Any] | None = None,
+) -> int:
+    """Wrap matching norm modules with FSDP2 + fp32 MixedPrecisionPolicy."""
+    if source is not None and not getattr(source, "fp32_norms", False):
+        return 0
+
+    if source is not None and getattr(source, "fsdp_version", None) != 2:
+        raise ValueError(
+            "fp32_norms requires fsdp_version: 2. FSDP1 enforces flat-param "
+            "dtype uniformity within each wrap group, which is incompatible "
+            "with keeping norms in fp32 while the rest of the layer is bf16."
+        )
+
+    patterns = (
+        list(patterns)
+        if patterns is not None
+        else get_fp32_norm_patterns(source or model)
+    )
+    if not patterns:
+        return 0
+
+    from torch.distributed.fsdp import MixedPrecisionPolicy, fully_shard
+
+    outer_policy = (fully_shard_kwargs or {}).get("mp_policy")
+    output_dtype = getattr(outer_policy, "param_dtype", None)
+    fp32_policy = MixedPrecisionPolicy(
+        param_dtype=torch.float32,
+        reduce_dtype=torch.float32,
+        output_dtype=output_dtype,
+    )
+
+    matches = [
+        (name, module)
+        for name, module in model.named_modules()
+        if _matches_norm_class(module, patterns)
+    ]
+
+    if not matches:
+        LOG.warning(
+            "fp32_norms enabled but no modules matched patterns %s. Check "
+            "fp32_norm_classes against the model's actual norm class names.",
+            patterns,
+        )
+        return 0
+
+    shard_kwargs = dict(fully_shard_kwargs or {})
+    shard_kwargs["mp_policy"] = fp32_policy
+
+    for _name, module in matches:
+        for param in module.parameters(recurse=False):
+            param.data = param.data.to(torch.float32)
+        for buffer in module.buffers(recurse=False):
+            if buffer.dtype.is_floating_point:
+                buffer.data = buffer.data.to(torch.float32)
+        fully_shard(module, **shard_kwargs)
+
+    LOG.info(
+        "Sharded %d norm modules with fp32 MixedPrecisionPolicy "
+        "(patterns=%s, output_dtype=%s)",
+        len(matches),
+        patterns,
+        output_dtype,
+    )
+    return len(matches)
diff --git a/src/axolotl/utils/optimizers/adopt.py b/src/axolotl/utils/optimizers/adopt.py
index 20ddfa7ec4..f501859b6f 100644
--- a/src/axolotl/utils/optimizers/adopt.py
+++ b/src/axolotl/utils/optimizers/adopt.py
@@ -196,7 +196,14 @@ def step(self, closure=None):
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
         """
-        self._cuda_graph_capture_health_check()
+        # torch 2.11 renamed _cuda_graph_capture_health_check ->
+        # _accelerator_graph_capture_health_check (the 2.11-only name); 2.12
+        # re-added the old name as an alias. Prefer the new name, fall back.
+        health_check = getattr(
+            self, "_accelerator_graph_capture_health_check", None
+        ) or getattr(self, "_cuda_graph_capture_health_check", None)
+        if health_check is not None:
+            health_check()
 
         loss = None
         if closure is not None:
@@ -282,7 +289,7 @@ def _single_tensor_adopt(
         step_t = state_steps[i]
 
         # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable]
-        if not torch._utils.is_compiling() and capturable:
+        if not torch.compiler.is_compiling() and capturable:
             capturable_supported_devices = _get_capturable_supported_devices()
             assert (
                 param.device.type == step_t.device.type
@@ -358,7 +365,7 @@ def _multi_tensor_adopt(
         )
 
     # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable]
-    if not torch._utils.is_compiling() and capturable:
+    if not torch.compiler.is_compiling() and capturable:
         capturable_supported_devices = _get_capturable_supported_devices(
             supports_xla=False
         )
@@ -415,7 +422,7 @@ def _multi_tensor_adopt(
             # If steps are on CPU, foreach will fall back to the slow path, which is a for-loop calling t.add(1) over
             # and over. 1 will then be wrapped into a Tensor over and over again, which is slower than if we just
             # wrapped it once now. The alpha is required to assure we go to the right overload.
-            if not torch._utils.is_compiling() and device_state_steps[0].is_cpu:
+            if not torch.compiler.is_compiling() and device_state_steps[0].is_cpu:
                 torch._foreach_add_(
                     device_state_steps, torch.tensor(1.0, device="cpu"), alpha=1.0
                 )
@@ -448,7 +455,7 @@ def _multi_tensor_adopt(
         # If steps are on CPU, foreach will fall back to the slow path, which is a for-loop calling t.add(1) over
         # and over. 1 will then be wrapped into a Tensor over and over again, which is slower than if we just
         # wrapped it once now. The alpha is required to assure we go to the right overload.
-        if not torch._utils.is_compiling() and device_state_steps[0].is_cpu:
+        if not torch.compiler.is_compiling() and device_state_steps[0].is_cpu:
             torch._foreach_add_(
                 device_state_steps, torch.tensor(1.0, device="cpu"), alpha=1.0
             )
@@ -501,7 +508,7 @@ def adopt(
 
     # this check is slow during compilation, so we skip it
     # if it's strictly needed we can add this check back in dynamo
-    if not torch._utils.is_compiling() and not all(
+    if not torch.compiler.is_compiling() and not all(
         isinstance(t, torch.Tensor) for t in state_steps
     ):
         raise RuntimeError(
diff --git a/src/axolotl/utils/optimizers/qgalore.py b/src/axolotl/utils/optimizers/qgalore.py
new file mode 100644
index 0000000000..9e2cc82607
--- /dev/null
+++ b/src/axolotl/utils/optimizers/qgalore.py
@@ -0,0 +1,88 @@
+"""Helpers for the Q-GaLore optimizer integration."""
+
+from __future__ import annotations
+
+import inspect
+import types
+
+from torch import nn
+
+from axolotl.utils.logging import get_logger
+
+LOG = get_logger(__name__)
+
+
+def patch_q_galore_for_modern_bnb() -> None:
+    """bnb >=0.44 inserted (beta3, alpha) into ``optimizer_update_8bit_blockwise``
+    and ``optimizer_update_32bit``; q-galore-torch==1.0 still calls the legacy
+    positional layout. Swap q_galore's bnb handle for one that re-emits the
+    modern layout. No-op on older bnb."""
+    import bitsandbytes.functional as F
+    import q_galore_torch.q_galore_adamw8bit as mod
+
+    if "beta3" not in inspect.signature(F.optimizer_update_8bit_blockwise).parameters:
+        return
+
+    bw, fp32 = F.optimizer_update_8bit_blockwise, F.optimizer_update_32bit
+    mod.F = types.SimpleNamespace(
+        optimizer_update_8bit_blockwise=(
+            lambda *a, **kw: bw(
+                *(a[:7] + (0.0, 0.0) + a[7:] if len(a) == 15 else a), **kw
+            )
+        ),
+        optimizer_update_32bit=(
+            lambda *a, **kw: fp32(
+                *(a[:10] + (0.0, 0.0) + a[10:] if len(a) == 13 else a), **kw
+            )
+        ),
+        optimizer_update_8bit=F.optimizer_update_8bit,
+        percentile_clipping=F.percentile_clipping,
+    )
+
+
+def build_qgalore_param_groups(
+    model: nn.Module,
+    target_modules: list[str],
+    *,
+    rank: int,
+    update_proj_gap: int,
+    scale: float,
+    proj_type: str,
+    proj_quant: bool,
+    proj_bits: int,
+    proj_group_size: int,
+    cos_threshold: float,
+    gamma_proj: int,
+    queue_size: int,
+) -> list[dict]:
+    """Two param-groups: 2D weights matching ``target_modules`` get the Q-GaLore
+    projection keys; everything else (norms, biases, embeddings) is plain AdamW."""
+    galore, plain = [], []
+    for name, p in model.named_parameters():
+        if not p.requires_grad:
+            continue
+        if p.dim() == 2 and any(t in name for t in target_modules):
+            galore.append(p)
+        else:
+            plain.append(p)
+    if not galore:
+        raise ValueError(
+            f"Q-GaLore: no parameters matched optim_target_modules={target_modules!r}"
+        )
+    LOG.info("Q-GaLore param groups: %d projected, %d plain", len(galore), len(plain))
+    return [
+        {
+            "params": galore,
+            "rank": rank,
+            "update_proj_gap": update_proj_gap,
+            "scale": scale,
+            "proj_type": proj_type,
+            "quant": proj_quant,
+            "quant_n_bit": proj_bits,
+            "quant_group_size": proj_group_size,
+            "cos_threshold": cos_threshold,
+            "gamma_proj": gamma_proj,
+            "queue_size": queue_size,
+        },
+        {"params": plain},
+    ]
diff --git a/src/axolotl/utils/quantization.py b/src/axolotl/utils/quantization.py
index ceaa9e0f39..9583c3d5f3 100644
--- a/src/axolotl/utils/quantization.py
+++ b/src/axolotl/utils/quantization.py
@@ -2,6 +2,8 @@
 Utilities for quantization including QAT and PTQ using torchao.
 """
 
+import functools
+
 import torch
 from packaging import version
 from torchao.core.config import AOBaseConfig
@@ -164,6 +166,39 @@ def _attach_torchao_quantizer(
     model.hf_quantizer = quantizer
 
 
+def patch_transformers_skip_quantized_init():
+    """Stop ``from_pretrained`` from re-initializing torchao-quantized weights.
+
+    transformers re-runs ``_init_weights`` on every module during loading; the
+    generic implementation does ``init.normal_(module.weight.float(), ...)``.
+    ``.float()`` on a torchao tensor subclass (e.g. ``MXTensor``) returns a new
+    tensor that both drops the ``_is_hf_initialized`` skip flag and does not
+    implement ``normal_``, so loading an MX checkpoint raises NotImplementedError.
+    Re-initializing an already-loaded quantized weight is never correct, so we
+    skip those modules entirely.
+    """
+    from torchao.utils import TorchAOBaseTensor
+    from transformers import PreTrainedModel
+
+    if getattr(PreTrainedModel._initialize_weights, "_axolotl_torchao_patched", False):
+        return
+
+    original = PreTrainedModel._initialize_weights
+
+    @functools.wraps(original)
+    def _initialize_weights(self, module, *args, **kwargs):
+        if any(
+            isinstance(param, TorchAOBaseTensor)
+            for param in module.parameters(recurse=False)
+        ):
+            module._is_hf_initialized = True
+            return None
+        return original(self, module, *args, **kwargs)
+
+    _initialize_weights._axolotl_torchao_patched = True
+    PreTrainedModel._initialize_weights = _initialize_weights
+
+
 def quantize_model(
     model,
     weight_dtype: TorchAOQuantDType,
@@ -214,6 +249,9 @@ def quantize_model(
         # cannot serialize it.  Mark the model so the caller can use
         # safe_serialization=False (torch.save) which supports __tensor_flatten__.
         model._is_mx_quantized = True
+        # MX checkpoints reload via plain from_pretrained (no HF quantizer), so guard
+        # transformers' weight re-init against the MXTensor weights it will encounter.
+        patch_transformers_skip_quantized_init()
     else:
         _attach_torchao_quantizer(
             model,
diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py
index 0eae31e421..84a21135ba 100644
--- a/src/axolotl/utils/schemas/config.py
+++ b/src/axolotl/utils/schemas/config.py
@@ -23,6 +23,7 @@
     DPODataset,
     KTODataset,
     MultiModalEvalDataset,
+    MultiModalPretrainDataset,
     PretrainingDataset,
     SFTDataset,
     StepwiseSupervisedDataset,
@@ -341,7 +342,8 @@ class AxolotlInputConfig(
     datasets: (
         Annotated[
             list[
-                SFTDataset
+                MultiModalPretrainDataset
+                | SFTDataset
                 | DPODataset
                 | KTODataset
                 | StepwiseSupervisedDataset
@@ -842,6 +844,25 @@ class AxolotlInputConfig(
         },
     )
 
+    fused_attn_kernel: bool | None = Field(
+        default=None,
+        json_schema_extra={
+            "description": (
+                "Replace ``q_norm + apply_rotary_pos_emb`` (and the matching k path) "
+                "with a single fused RMSNorm+RoPE Triton kernel launch. Currently "
+                "implemented for Qwen3, Qwen3-MoE, Qwen3.5, and Qwen3.5-MoE "
+                "full-attention layers; Gemma 4 always uses the fused path. Disabled "
+                "(None/False) falls back "
+                "to the eager transformers implementation. Compile-safe via "
+                "torch.library.triton_op — traces under torch.compile(fullgraph=True). "
+                "Per-step wins are arch-dependent: ~+7-12% across sm_86 and sm_120. "
+                "Combining with torch_compile=true is a clear win on sm_120 (+9% "
+                "extra) but currently regresses on sm_86 due to Inductor autotune "
+                "biases — flip them on independently and benchmark."
+            )
+        },
+    )
+
     experts_implementation: str | None = Field(
         default=None,
         json_schema_extra={
@@ -982,6 +1003,27 @@ class AxolotlInputConfig(
         default=None,
         json_schema_extra={"description": "FSDP version"},
     )
+    fp32_norms: bool | None = Field(
+        default=None,
+        json_schema_extra={
+            "description": (
+                "Keep norm modules (RMSNorm/LayerNorm) in fp32 by sharding them "
+                "under their own FSDP2 MixedPrecisionPolicy. Requires fsdp_version: 2."
+            )
+        },
+    )
+    fp32_norm_classes: list[str] | None = Field(
+        default=None,
+        json_schema_extra={
+            "description": (
+                "Class-name patterns to match for fp32 norm sharding. Patterns "
+                "without a '.' match against type(module).__name__ as a suffix. "
+                "Patterns containing a '.' match the fully qualified class path "
+                "exactly. Defaults to ['RMSNorm', 'LayerNorm'] when fp32_norms is "
+                "true and this is unset."
+            )
+        },
+    )
     fsdp_final_state_dict_type: (
         Literal["FULL_STATE_DICT", "LOCAL_STATE_DICT", "SHARDED_STATE_DICT"] | None
     ) = Field(
@@ -1485,6 +1527,30 @@ def validate_attn_implementation(cls, value):
             f"path containing '/'."
         )
 
+    @model_validator(mode="after")
+    def check_fp32_norms(self):
+        if self.fp32_norms:
+            # FSDP must actually be configured — fsdp_version alone is not
+            # sufficient since the rest of axolotl treats fsdp_config as the
+            # canonical "is_fsdp" signal.
+            if self.fsdp_config is None:
+                raise ValueError(
+                    "fp32_norms requires FSDP to be enabled "
+                    "(fsdp_config block must be set)."
+                )
+            if str(self.fsdp_version) != "2":
+                raise ValueError(
+                    "fp32_norms requires fsdp_version: 2. FSDP1's flat-param "
+                    "dtype uniformity constraint is incompatible with keeping "
+                    "norms in fp32 while decoder layers run in bf16."
+                )
+        if self.fp32_norm_classes and not self.fp32_norms:
+            LOG.warning(
+                "fp32_norm_classes is set but fp32_norms is not enabled; "
+                "it will be ignored."
+            )
+        return self
+
     @model_validator(mode="after")
     def check_sageattn_wo_sample_packing(self):
         if (
diff --git a/src/axolotl/utils/schemas/datasets.py b/src/axolotl/utils/schemas/datasets.py
index 367133ac38..ff92315e42 100644
--- a/src/axolotl/utils/schemas/datasets.py
+++ b/src/axolotl/utils/schemas/datasets.py
@@ -272,6 +272,23 @@ class PretrainingDataset(BaseModel):
     )
 
 
+class MultiModalPretrainDataset(PretrainingDataset):
+    type: str | None = None
+
+    @model_validator(mode="before")
+    @classmethod
+    def _require_mm_markers(cls, data):
+        if isinstance(data, BaseModel):
+            data = data.model_dump()
+        if not isinstance(data, dict):
+            return data
+        if data.get("type") != "multimodal_pretrain":
+            raise ValueError(
+                "MultiModalPretrainDataset requires type='multimodal_pretrain' "
+            )
+        return data
+
+
 class MultiModalEvalDataset(PretrainingDataset):
     """Multimodal CPT eval dataset configuration (test_datasets entry).
 
@@ -294,6 +311,9 @@ def _require_mm_markers(cls, data):
                 "MultiModalEvalDataset requires type='multimodal_pretrain' "
                 "or multimodal=True"
             )
+        if data.get("type") != "multimodal_pretrain":
+            data = dict(data)
+            data["type"] = "multimodal_pretrain"
         return data
 
 
@@ -391,5 +411,10 @@ class SyntheticDataset(BaseModel):
 
 
 DatasetConfig = (
-    SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset | SyntheticDataset
+    MultiModalPretrainDataset
+    | SFTDataset
+    | DPODataset
+    | KTODataset
+    | StepwiseSupervisedDataset
+    | SyntheticDataset
 )
diff --git a/src/axolotl/utils/schemas/enums.py b/src/axolotl/utils/schemas/enums.py
index 4743604753..d783983c0c 100644
--- a/src/axolotl/utils/schemas/enums.py
+++ b/src/axolotl/utils/schemas/enums.py
@@ -95,6 +95,7 @@ class CustomSupportedOptimizers(str, Enum):
     flash_sgd = "flash_sgd"
     flash_sgdw = "flash_sgdw"
     flash_lion = "flash_lion"
+    q_galore_adamw8bit = "q_galore_adamw8bit"
 
 
 # Accepted canonical names; hub-kernel paths (containing "/") bypass this set.
diff --git a/src/axolotl/utils/schemas/training.py b/src/axolotl/utils/schemas/training.py
index 8e06e82cb3..bdb5e81d7f 100644
--- a/src/axolotl/utils/schemas/training.py
+++ b/src/axolotl/utils/schemas/training.py
@@ -158,6 +158,66 @@ class HyperparametersConfig(BaseModel):
         },
     )
 
+    qgalore_rank: int | None = Field(
+        default=256,
+        json_schema_extra={
+            "description": "Q-GaLore: rank r of the low-rank gradient projection. Smaller r reduces optimizer state but loses gradient information."
+        },
+    )
+    qgalore_update_proj_gap: int | None = Field(
+        default=200,
+        json_schema_extra={
+            "description": "Q-GaLore: maximum number of steps between SVD recomputations of the projection matrix. The adaptive scheduler may skip updates earlier based on cos_threshold."
+        },
+    )
+    qgalore_scale: float | None = Field(
+        default=0.25,
+        json_schema_extra={
+            "description": "Q-GaLore: scaling factor applied to the projected gradient after project_back. Equivalent to GaLore's `galore_scale`."
+        },
+    )
+    qgalore_proj_type: str | None = Field(
+        default="std",
+        json_schema_extra={
+            "description": "Q-GaLore: projection type for the GaLoreProjector. One of 'std', 'reverse_std', 'right', 'left', 'full'."
+        },
+    )
+    qgalore_proj_quant: bool | None = Field(
+        default=True,
+        json_schema_extra={
+            "description": "Q-GaLore: enable INT-quantization of the projection matrix P (the resilient-to-quantization observation from the paper)."
+        },
+    )
+    qgalore_proj_bits: int | None = Field(
+        default=4,
+        json_schema_extra={
+            "description": "Q-GaLore: bitwidth for the quantized projection matrix when qgalore_proj_quant is True (paper default: 4)."
+        },
+    )
+    qgalore_proj_group_size: int | None = Field(
+        default=256,
+        json_schema_extra={
+            "description": "Q-GaLore: group size for projection-matrix quantization. Must evenly divide the projection's last dimension."
+        },
+    )
+    qgalore_cos_threshold: float | None = Field(
+        default=0.4,
+        json_schema_extra={
+            "description": "Q-GaLore: cosine-similarity threshold for the lazy subspace update. If the new P is within this similarity of the previous one, the SVD is skipped."
+        },
+    )
+    qgalore_gamma_proj: int | None = Field(
+        default=2,
+        json_schema_extra={
+            "description": "Q-GaLore: multiplicative factor by which update_proj_gap grows once a layer's subspace is judged stable."
+        },
+    )
+    qgalore_queue_size: int | None = Field(
+        default=5,
+        json_schema_extra={
+            "description": "Q-GaLore: length of the moving-average queue used by the adaptive-frequency scheduler."
+        },
+    )
     max_grad_norm: float | None = Field(
         default=None, json_schema_extra={"description": "Gradient clipping max norm"}
     )
diff --git a/src/axolotl/utils/schemas/validation.py b/src/axolotl/utils/schemas/validation.py
index 76f12bc2af..1c8e2515ed 100644
--- a/src/axolotl/utils/schemas/validation.py
+++ b/src/axolotl/utils/schemas/validation.py
@@ -37,6 +37,8 @@ def set_default_seed(cls, seed):
     @field_validator("datasets", mode="before")
     @classmethod
     def deprecate_sharegpt_datasets(cls, datasets):
+        if datasets is None:
+            return datasets
         for _, ds_cfg in enumerate(datasets):
             ds_type = (
                 ds_cfg.get("type")
@@ -902,6 +904,48 @@ def check_muon_deepspeed_fsdp(cls, data):
                     )
         return data
 
+    @model_validator(mode="before")
+    @classmethod
+    def check_qgalore(cls, data):
+        if data.get("optimizer") != "q_galore_adamw8bit":
+            return data
+        adapter = data.get("adapter")
+        if adapter:
+            raise ValueError(
+                "q_galore_adamw8bit operates on full-precision parameters and is "
+                f"incompatible with adapter='{adapter}'. Remove the adapter setting "
+                "or pick a different optimizer."
+            )
+        if data.get("deepspeed"):
+            raise ValueError(
+                "q_galore_adamw8bit is not yet validated with DeepSpeed. "
+                "Use DDP or FSDP2 with use_orig_params=True."
+            )
+        if data.get("fsdp") or data.get("fsdp_config"):
+            fsdp_version = cls._resolve_fsdp_version(data)
+            if str(fsdp_version) != "2":
+                raise ValueError(
+                    "q_galore_adamw8bit requires FSDP2. Set fsdp_version: 2."
+                )
+            fsdp_config = data.get("fsdp_config") or {}
+            if fsdp_config.get("use_orig_params") is not True:
+                raise ValueError(
+                    "q_galore_adamw8bit requires fsdp_config.use_orig_params=True so "
+                    "that per-parameter projection state survives FSDP sharding."
+                )
+        if not (data.get("bf16") or data.get("bfloat16") or data.get("fp16")):
+            LOG.warning(
+                "q_galore_adamw8bit benefits from mixed-precision (bf16/fp16). "
+                "Running in fp32 will negate most of the memory savings."
+            )
+        if data.get("optim_target_modules") is None:
+            # Match the reference impl's defaults: attention + MLP linears.
+            data["optim_target_modules"] = [
+                "attn",
+                "mlp",
+            ]
+        return data
+
     @model_validator(mode="before")
     @classmethod
     def check_flashoptim_deepspeed_fsdp(cls, data):
@@ -1353,11 +1397,36 @@ def _entry_is_mm(entry) -> bool:
                 mm_flag_ = getattr(entry, "multimodal", None)
             return ds_type_ == "multimodal_pretrain" or bool(mm_flag_)
 
+        def _entry_has_mm_flag_without_dataset_type(entry) -> bool:
+            if isinstance(entry, dict):
+                ds_type_ = entry.get("type")
+                mm_flag_ = entry.get("multimodal")
+            else:
+                ds_type_ = getattr(entry, "type", None)
+                mm_flag_ = getattr(entry, "multimodal", None)
+            return bool(mm_flag_) and ds_type_ != "multimodal_pretrain"
+
+        def _datasets_entry_is_mm(entry) -> bool:
+            if isinstance(entry, dict):
+                return entry.get("type") == "multimodal_pretrain"
+            return getattr(entry, "type", None) == "multimodal_pretrain"
+
         pd = data.get("pretraining_dataset")
         pd_list = pd if isinstance(pd, list) else ([pd] if pd else [])
-        train_is_mm = (
-            bool(pd_list) and isinstance(pd_list[0], dict) and _entry_is_mm(pd_list[0])
-        )
+        datasets = data.get("datasets") or []
+        datasets_list = datasets if isinstance(datasets, list) else [datasets]
+
+        pd_is_mm = any(_entry_is_mm(entry) for entry in pd_list)
+        if any(
+            _entry_has_mm_flag_without_dataset_type(entry) for entry in datasets_list
+        ):
+            raise ValueError(
+                "Multimodal CPT under `datasets` requires "
+                "`type: multimodal_pretrain`. The `multimodal: true` shortcut "
+                "is only supported for `pretraining_dataset` and `test_datasets`."
+            )
+        datasets_is_mm = any(_datasets_entry_is_mm(entry) for entry in datasets_list)
+        train_is_mm = pd_is_mm or datasets_is_mm
 
         test_datasets = data.get("test_datasets") or []
         test_dicts = [t for t in test_datasets if isinstance(t, dict)]
@@ -1372,7 +1441,8 @@ def _entry_is_mm(entry) -> bool:
             if all(mm_flags) and not train_is_mm:
                 raise ValueError(
                     "Multimodal `test_datasets` require multimodal CPT "
-                    "training (set `pretraining_dataset[0].type` to "
+                    "training (set `pretraining_dataset[0].type` or "
+                    "`datasets[0].type` to "
                     "'multimodal_pretrain' or `multimodal: true`)."
                 )
             if not any(mm_flags) and train_is_mm:
@@ -1382,11 +1452,11 @@ def _entry_is_mm(entry) -> bool:
                     "or multimodal: true)."
                 )
 
-        if not pd_list:
+        if not train_is_mm:
             return data
 
         # MM config resolves from entry[0] only; multi-entry runs miscollate or silently demote.
-        if len(pd_list) > 1 and any(_entry_is_mm(e) for e in pd_list):
+        if pd_is_mm and len(pd_list) > 1:
             raise ValueError(
                 "Multimodal CPT supports exactly one `pretraining_dataset` "
                 f"entry (found {len(pd_list)}). Image settings "
@@ -1395,13 +1465,28 @@ def _entry_is_mm(entry) -> bool:
                 "would be silently miscollated or drop their MM config. "
                 "Split multimodal CPT into its own run."
             )
-
-        first = pd_list[0]
-        if not isinstance(first, dict):
-            return data
-
-        if not train_is_mm:
-            return data
+        if datasets_is_mm and len(datasets_list) > 1:
+            raise ValueError(
+                "Multimodal CPT supports exactly one `datasets` entry "
+                f"when using the non-streaming prepared path (found "
+                f"{len(datasets_list)}). Image settings (`image_base_dir`, "
+                "`image_token`) and MM-mode detection resolve once for the "
+                "collator, so mixed or multiple training entries are not "
+                "supported. Split multimodal CPT into its own run."
+            )
+        if pd_is_mm and datasets_is_mm:
+            raise ValueError(
+                "Multimodal CPT cannot be configured under both "
+                "`pretraining_dataset` and `datasets`. Use "
+                "`pretraining_dataset` for streaming or `datasets` for the "
+                "non-streaming prepared path."
+            )
+        if datasets_is_mm and data.get("streaming"):
+            raise ValueError(
+                "Multimodal CPT under `datasets` is the non-streaming prepared "
+                "path. For streaming, configure the entry under "
+                "`pretraining_dataset` with `streaming: true`."
+            )
 
         if not data.get("processor_type"):
             raise ValueError(
@@ -1425,6 +1510,17 @@ def _entry_is_mm(entry) -> bool:
                 "conversational scaffolding entirely. Remove `chat_template` "
                 "or switch to chat-template SFT."
             )
+        if (
+            datasets_is_mm
+            and (data.get("excess_length_strategy") or "drop").lower() == "truncate"
+        ):
+            raise ValueError(
+                "Multimodal CPT under `datasets` cannot use "
+                "`excess_length_strategy: truncate`. The collator re-tokenizes "
+                "`_mm_text` with the processor at batch time, so truncating only "
+                "the prepared `input_ids` would not truncate the actual model "
+                "inputs. Use `drop` or `raise` instead."
+            )
         # Keep `images` and `_mm_text` columns alive for the collator.
         prev_remove_unused = data.get("remove_unused_columns")
         if prev_remove_unused is not False:
@@ -1668,6 +1764,13 @@ def check_context_parallel_size(self):
                 sys.modules[
                     "transformers.modeling_flash_attention_utils"
                 ].is_flash_attn_greater_or_equal = is_flash_attn_greater_or_equal
+                if not hasattr(
+                    transformers.modeling_flash_attention_utils,
+                    "is_flash_attn_greater_or_equal_2_10",
+                ):
+                    transformers.modeling_flash_attention_utils.is_flash_attn_greater_or_equal_2_10 = is_flash_attn_greater_or_equal(
+                        "2.10"
+                    )
                 sys.modules[
                     "transformers.modeling_flash_attention_utils"
                 ].is_flash_attn_greater_or_equal_2_10 = (
@@ -1690,6 +1793,24 @@ def check_context_parallel_size(self):
                 "for more details."
             )
 
+            _SSM_HYBRID_MODEL_TYPES = {
+                "nemotron_h",
+                "falcon_h1",
+                "granitemoehybrid",
+            }
+            _model_config_type = getattr(self, "model_config_type", None) or ""
+            if _model_config_type in _SSM_HYBRID_MODEL_TYPES:
+                LOG.warning(
+                    f"context_parallel_size={self.context_parallel_size} with "
+                    f"model_type={_model_config_type}: SSM/Mamba layers use P2P "
+                    "hidden-state passing and additive output correction across "
+                    "CP ranks. Attention layers use ring attention. This is "
+                    "mathematically exact but has not been extensively validated "
+                    "end-to-end — verify loss curves match single-GPU baselines. "
+                    "Recommended: run a short training job and compare loss curves "
+                    "against a single-GPU baseline with the same data/seed."
+                )
+
         return self
 
     @model_validator(mode="after")
diff --git a/tests/core/conftest.py b/tests/core/conftest.py
new file mode 100644
index 0000000000..2d25902d81
--- /dev/null
+++ b/tests/core/conftest.py
@@ -0,0 +1,245 @@
+"""Shared fixtures for axolotl.core.builders trainer-builder tests."""
+
+import pytest
+
+from axolotl.loaders import ModelLoader, load_tokenizer
+from axolotl.utils.config import normalize_config
+from axolotl.utils.dict import DictDefault
+from axolotl.utils.schemas.enums import RLType
+
+
+@pytest.fixture(name="base_cfg")
+def fixture_base_cfg():
+    """
+    Base config with all common arguments between SFT and RLHF
+    """
+    cfg = DictDefault(
+        {
+            # Model and tokenizer settings
+            "base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
+            "sequence_len": 2048,
+            "model_config_type": "llama",  # example type
+            # Basic training settings
+            "micro_batch_size": 2,
+            "eval_batch_size": 2,
+            "num_epochs": 1,
+            "gradient_accumulation_steps": 1,
+            "max_steps": 100,
+            "val_set_size": 0,
+            # Optimizer settings
+            "optimizer": "adamw_torch_fused",
+            "learning_rate": 0.00005,
+            "weight_decay": 0.01,
+            "adam_beta1": 0.998,
+            "adam_beta2": 0.9,
+            "adam_epsilon": 0.00001,
+            "max_grad_norm": 1.0,
+            # LR scheduler settings
+            "lr_scheduler": "cosine",
+            "lr_scheduler_kwargs": {"foo": "bar"},
+            "warmup_steps": 10,
+            "warmup_ratio": None,
+            "cosine_min_lr_ratio": 0.1,
+            "cosine_constant_lr_ratio": 0.2,
+            # Checkpointing and saving
+            "save_steps": 100,
+            "output_dir": "./model-out",
+            "save_total_limit": 4,
+            "save_only_model": False,
+            # Hardware/performance settings
+            "gradient_checkpointing": False,
+            "gradient_checkpointing_kwargs": {"use_reentrant": False},
+            "dataloader_num_workers": 1,
+            "dataloader_pin_memory": True,
+            "dataloader_prefetch_factor": 2,
+            "context_parallel_size": 1,
+            "tensor_parallel_size": 1,
+            # Dtype
+            "fp16": False,
+            "bf16": False,
+            "tf32": False,
+            # Logging and evaluation
+            "logging_steps": 10,
+            "eval_steps": 50,
+            "eval_strategy": "steps",
+            "save_strategy": "steps",
+            "include_tokens_per_second": True,
+            # Other common settings
+            "seed": 42,
+            "remove_unused_columns": True,
+            "ddp_timeout": 1800,
+            "ddp_bucket_cap_mb": 25,
+            "ddp_broadcast_buffers": False,
+            "dataset_num_proc": 1,
+        }
+    )
+
+    normalize_config(cfg)
+    return cfg
+
+
+@pytest.fixture(name="dpo_cfg")
+def fixture_dpo_cfg(base_cfg):
+    cfg = base_cfg.copy()
+    cfg.update(
+        {
+            "rl": RLType.DPO,
+            "dpo_use_weighting": True,
+            "dpo_label_smoothing": 0.1,
+            "beta": 0.1,  # DPO beta
+            "dpo_loss_type": ["sigmoid", "sft"],
+            "dpo_loss_weights": [1.0, 0.5],
+        }
+    )
+    return cfg
+
+
+@pytest.fixture(name="orpo_cfg")
+def fixture_orpo_cfg(base_cfg):
+    cfg = base_cfg.copy()
+    cfg.update(
+        {
+            "rl": RLType.ORPO,
+            "orpo_alpha": 0.1,
+            "max_prompt_len": 512,
+        }
+    )
+    return cfg
+
+
+@pytest.fixture(name="kto_cfg")
+def fixture_kto_cfg(base_cfg):
+    cfg = base_cfg.copy()
+    cfg.update(
+        {
+            "rl": RLType.KTO,
+            "kto_desirable_weight": 1.0,
+            "kto_undesirable_weight": 1.0,
+            "max_prompt_len": 512,
+        }
+    )
+    return cfg
+
+
+@pytest.fixture(name="grpo_cfg")
+def fixture_grpo_cfg(base_cfg):
+    cfg = base_cfg.copy()
+    cfg.update(
+        {
+            "rl": RLType.GRPO,
+            "trl": DictDefault(
+                {
+                    "beta": 0.001,
+                    "max_completion_length": 256,
+                    "use_vllm": False,  # run on CPU
+                    # "vllm_device": "auto",
+                    # "vllm_gpu_memory_utilization": 0.15,
+                    "num_generations": 4,
+                    "reward_funcs": ["rewards.rand_reward_func"],
+                }
+            ),
+            # Must be evenly divisible by num_generations
+            "micro_batch_size": 4,
+            "datasets": [
+                {
+                    "path": "openai/gsm8k",
+                    "name": "main",
+                    "split": "train[:1%]",
+                }
+            ],
+        }
+    )
+    return DictDefault(cfg)
+
+
+@pytest.fixture(name="ipo_cfg")
+def fixture_ipo_cfg(base_cfg):
+    cfg = base_cfg.copy()
+    cfg.update(
+        {
+            "rl": RLType.DPO,
+            "dpo_loss_type": ["ipo"],
+            "dpo_label_smoothing": 0,
+            "beta": 0.1,
+        }
+    )
+    return cfg
+
+
+@pytest.fixture(name="simpo_cfg")
+def fixture_simpo_cfg(base_cfg):
+    cfg = base_cfg.copy()
+    cfg.update(
+        {
+            "rl": RLType.SIMPO,
+            "rl_beta": 0.2,
+            "cpo_alpha": 0.9,
+            "simpo_gamma": 0.4,
+        }
+    )
+    return cfg
+
+
+@pytest.fixture(name="sft_cfg")
+def fixture_sft_cfg(base_cfg):
+    cfg = base_cfg.copy()
+    cfg.update(
+        {
+            "rl": None,
+            "sample_packing": False,
+            "eval_sample_packing": False,
+            "flash_attention": False,
+        }
+    )
+    return cfg
+
+
+@pytest.fixture(name="rm_cfg")
+def fixture_rm_cfg(sft_cfg):
+    cfg = sft_cfg.copy()
+    cfg.update(
+        DictDefault(
+            {
+                "reward_model": True,
+                "datasets": [
+                    {
+                        "path": "argilla/distilabel-intel-orca-dpo-pairs",
+                        "type": "bradley_terry.chat_template",
+                        "split": "train[:1%]",
+                    }
+                ],
+            }
+        )
+    )
+    return cfg
+
+
+@pytest.fixture(name="prm_cfg")
+def fixture_prm_cfg(sft_cfg):
+    cfg = sft_cfg.copy()
+    cfg.update(
+        DictDefault(
+            {
+                "process_reward_model": True,
+                "datasets": [
+                    {
+                        "path": "trl-lib/math_shepherd",
+                        "type": "stepwise_supervised",
+                        "split": "train[:1%]",
+                    }
+                ],
+            }
+        )
+    )
+    return cfg
+
+
+@pytest.fixture(name="tokenizer")
+def fixture_tokenizer(base_cfg):
+    return load_tokenizer(base_cfg)
+
+
+@pytest.fixture(name="model")
+def fixture_model(base_cfg, tokenizer):
+    model, _ = ModelLoader(base_cfg, tokenizer).load()
+    return model
diff --git a/tests/core/test_builders.py b/tests/core/test_builders.py
index 0a4b2ad0bb..286475ea1e 100644
--- a/tests/core/test_builders.py
+++ b/tests/core/test_builders.py
@@ -1,508 +1,9 @@
-"""Unit tests for axolotl.core.builders"""
-
-import sys
-from pathlib import Path
-from unittest.mock import MagicMock, patch
+"""Unit tests for axolotl.core.builders SFT and reward-model trainer builders."""
 
 import pytest
 
 from axolotl.common.datasets import load_datasets
 from axolotl.core.builders import HFCausalTrainerBuilder, HFRLTrainerBuilder
-from axolotl.loaders import ModelLoader, load_tokenizer
-from axolotl.utils.config import normalize_config
-from axolotl.utils.data import prepare_preference_datasets
-from axolotl.utils.dict import DictDefault
-from axolotl.utils.schemas.enums import RLType
-
-from tests.constants import ALPACA_MESSAGES_CONFIG_REVISION
-
-
-@pytest.fixture(name="base_cfg")
-def fixture_base_cfg():
-    """
-    Base config with all common arguments between SFT and RLHF
-    """
-    cfg = DictDefault(
-        {
-            # Model and tokenizer settings
-            "base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
-            "sequence_len": 2048,
-            "model_config_type": "llama",  # example type
-            # Basic training settings
-            "micro_batch_size": 2,
-            "eval_batch_size": 2,
-            "num_epochs": 1,
-            "gradient_accumulation_steps": 1,
-            "max_steps": 100,
-            "val_set_size": 0,
-            # Optimizer settings
-            "optimizer": "adamw_torch_fused",
-            "learning_rate": 0.00005,
-            "weight_decay": 0.01,
-            "adam_beta1": 0.998,
-            "adam_beta2": 0.9,
-            "adam_epsilon": 0.00001,
-            "max_grad_norm": 1.0,
-            # LR scheduler settings
-            "lr_scheduler": "cosine",
-            "lr_scheduler_kwargs": {"foo": "bar"},
-            "warmup_steps": 10,
-            "warmup_ratio": None,
-            "cosine_min_lr_ratio": 0.1,
-            "cosine_constant_lr_ratio": 0.2,
-            # Checkpointing and saving
-            "save_steps": 100,
-            "output_dir": "./model-out",
-            "save_total_limit": 4,
-            "save_only_model": False,
-            # Hardware/performance settings
-            "gradient_checkpointing": False,
-            "gradient_checkpointing_kwargs": {"use_reentrant": False},
-            "dataloader_num_workers": 1,
-            "dataloader_pin_memory": True,
-            "dataloader_prefetch_factor": 2,
-            "context_parallel_size": 1,
-            "tensor_parallel_size": 1,
-            # Dtype
-            "fp16": False,
-            "bf16": False,
-            "tf32": False,
-            # Logging and evaluation
-            "logging_steps": 10,
-            "eval_steps": 50,
-            "eval_strategy": "steps",
-            "save_strategy": "steps",
-            "include_tokens_per_second": True,
-            # Other common settings
-            "seed": 42,
-            "remove_unused_columns": True,
-            "ddp_timeout": 1800,
-            "ddp_bucket_cap_mb": 25,
-            "ddp_broadcast_buffers": False,
-            "dataset_num_proc": 4,
-        }
-    )
-
-    normalize_config(cfg)
-    return cfg
-
-
-@pytest.fixture(name="dpo_cfg")
-def fixture_dpo_cfg(base_cfg):
-    cfg = base_cfg.copy()
-    cfg.update(
-        {
-            "rl": RLType.DPO,
-            "dpo_use_weighting": True,
-            "dpo_label_smoothing": 0.1,
-            "beta": 0.1,  # DPO beta
-            "dpo_loss_type": ["sigmoid", "sft"],
-            "dpo_loss_weights": [1.0, 0.5],
-        }
-    )
-    return cfg
-
-
-@pytest.fixture(name="orpo_cfg")
-def fixture_orpo_cfg(base_cfg):
-    cfg = base_cfg.copy()
-    cfg.update(
-        {
-            "rl": RLType.ORPO,
-            "orpo_alpha": 0.1,
-            "max_prompt_len": 512,
-        }
-    )
-    return cfg
-
-
-@pytest.fixture(name="kto_cfg")
-def fixture_kto_cfg(base_cfg):
-    cfg = base_cfg.copy()
-    cfg.update(
-        {
-            "rl": RLType.KTO,
-            "kto_desirable_weight": 1.0,
-            "kto_undesirable_weight": 1.0,
-            "max_prompt_len": 512,
-        }
-    )
-    return cfg
-
-
-@pytest.fixture(name="grpo_cfg")
-def fixture_grpo_cfg(base_cfg):
-    cfg = base_cfg.copy()
-    cfg.update(
-        {
-            "rl": RLType.GRPO,
-            "trl": DictDefault(
-                {
-                    "beta": 0.001,
-                    "max_completion_length": 256,
-                    "use_vllm": False,  # run on CPU
-                    # "vllm_device": "auto",
-                    # "vllm_gpu_memory_utilization": 0.15,
-                    "num_generations": 4,
-                    "reward_funcs": ["rewards.rand_reward_func"],
-                }
-            ),
-            # Must be evenly divisible by num_generations
-            "micro_batch_size": 4,
-            "datasets": [
-                {
-                    "path": "openai/gsm8k",
-                    "name": "main",
-                    "split": "train[:1%]",
-                }
-            ],
-        }
-    )
-    return DictDefault(cfg)
-
-
-@pytest.fixture(name="ipo_cfg")
-def fixture_ipo_cfg(base_cfg):
-    cfg = base_cfg.copy()
-    cfg.update(
-        {
-            "rl": RLType.DPO,
-            "dpo_loss_type": ["ipo"],
-            "dpo_label_smoothing": 0,
-            "beta": 0.1,
-        }
-    )
-    return cfg
-
-
-@pytest.fixture(name="simpo_cfg")
-def fixture_simpo_cfg(base_cfg):
-    cfg = base_cfg.copy()
-    cfg.update(
-        {
-            "rl": RLType.SIMPO,
-            "rl_beta": 0.2,
-            "cpo_alpha": 0.9,
-            "simpo_gamma": 0.4,
-        }
-    )
-    return cfg
-
-
-@pytest.fixture(name="sft_cfg")
-def fixture_sft_cfg(base_cfg):
-    cfg = base_cfg.copy()
-    cfg.update(
-        {
-            "rl": None,
-            "sample_packing": False,
-            "eval_sample_packing": False,
-            "flash_attention": False,
-        }
-    )
-    return cfg
-
-
-@pytest.fixture(name="rm_cfg")
-def fixture_rm_cfg(sft_cfg):
-    cfg = sft_cfg.copy()
-    cfg.update(
-        DictDefault(
-            {
-                "reward_model": True,
-                "datasets": [
-                    {
-                        "path": "argilla/distilabel-intel-orca-dpo-pairs",
-                        "type": "bradley_terry.chat_template",
-                        "split": "train[:1%]",
-                    }
-                ],
-            }
-        )
-    )
-    return cfg
-
-
-@pytest.fixture(name="prm_cfg")
-def fixture_prm_cfg(sft_cfg):
-    cfg = sft_cfg.copy()
-    cfg.update(
-        DictDefault(
-            {
-                "process_reward_model": True,
-                "datasets": [
-                    {
-                        "path": "trl-lib/math_shepherd",
-                        "type": "stepwise_supervised",
-                        "split": "train[:1%]",
-                    }
-                ],
-            }
-        )
-    )
-    return cfg
-
-
-@pytest.fixture(name="tokenizer")
-def fixture_tokenizer(base_cfg):
-    return load_tokenizer(base_cfg)
-
-
-@pytest.fixture(name="model")
-def fixture_model(base_cfg, tokenizer):
-    model, _ = ModelLoader(base_cfg, tokenizer).load()
-    return model
-
-
-class TestHFRLTrainerBuilder:
-    """
-    TestCase class for RLHF trainer builders
-    """
-
-    def _test_common_training_arguments(self, training_arguments, rl: str):
-        """Helper to test common arguments across all variants"""
-        # Basic training settings
-        if rl == "grpo":
-            # grpo_cfg's micro_batch_size is diff from others
-            assert training_arguments.per_device_train_batch_size == 4
-        else:
-            assert training_arguments.per_device_train_batch_size == 2
-        assert training_arguments.gradient_accumulation_steps == 1
-        assert training_arguments.max_steps == 100
-
-        # Optimizer settings
-        assert training_arguments.learning_rate == 0.00005
-        assert training_arguments.weight_decay == 0.01
-        assert training_arguments.adam_beta1 == 0.998
-        assert training_arguments.adam_beta2 == 0.9
-        assert training_arguments.adam_epsilon == 0.00001
-        assert training_arguments.max_grad_norm == 1.0
-
-        # LR scheduler settings
-        assert training_arguments.lr_scheduler_type == "cosine"
-        assert training_arguments.warmup_steps == 10
-        assert training_arguments.cosine_min_lr_ratio == 0.1
-        assert training_arguments.cosine_constant_lr_ratio == 0.2
-
-        # Other settings
-        assert training_arguments.dataloader_num_workers == 1
-        assert training_arguments.dataloader_pin_memory is True
-
-        # TODO(wing): restore once trl releases 0.22.0
-        # assert training_arguments.gradient_checkpointing is True
-
-    def test_dpo_training_arguments(self, dpo_cfg, model, tokenizer):
-        dpo_cfg["precompute_ref_log_probs"] = True
-        builder = HFRLTrainerBuilder(dpo_cfg, model, tokenizer)
-        training_arguments, _ = builder._build_training_arguments(100)
-
-        self._test_common_training_arguments(training_arguments, rl=dpo_cfg.rl)
-        # DPO specific
-        assert training_arguments.beta == 0.1
-        assert hasattr(training_arguments, "use_weighting")
-        assert training_arguments.use_weighting is True
-        assert training_arguments.label_smoothing == 0.1
-        assert training_arguments.precompute_ref_log_probs is True
-        assert training_arguments.loss_type == ["sigmoid", "sft"]
-        assert training_arguments.loss_weights == [1.0, 0.5]
-
-    def test_orpo_training_arguments(self, orpo_cfg, model, tokenizer):
-        builder = HFRLTrainerBuilder(orpo_cfg, model, tokenizer)
-        training_arguments, _ = builder._build_training_arguments(100)
-
-        self._test_common_training_arguments(training_arguments, rl=orpo_cfg.rl)
-        # ORPO specific
-        assert training_arguments.beta == 0.1  # maps from orpo_alpha
-
-    def test_kto_training_arguments(self, kto_cfg, model, tokenizer):
-        builder = HFRLTrainerBuilder(kto_cfg, model, tokenizer)
-        training_arguments, _ = builder._build_training_arguments(100)
-
-        self._test_common_training_arguments(training_arguments, rl=kto_cfg.rl)
-        # KTO specific
-        assert training_arguments.desirable_weight == 1.0
-        assert training_arguments.undesirable_weight == 1.0
-
-    def _write_rewards_file(self, rewards_dir: Path):
-        """
-        Writes reward function to local tmp path to be loaded on trainer building
-        """
-        # Create rewards.py in a directory we can import from
-        rewards_dir.mkdir()
-        rewards_file = rewards_dir / "rewards.py"
-        rewards_file.write_text(
-            """import random
-def rand_reward_func(prompts, completions) -> list[float]:
-    return [random.uniform(0, 1) for _ in completions]
-"""
-        )
-
-    def test_grpo_training_arguments(self, grpo_cfg, model, tokenizer, tmp_path):
-        rewards_dir = tmp_path / "rewards_test"
-        self._write_rewards_file(rewards_dir)
-
-        # Add the directory to Python path so we can import the module
-        sys.path.insert(0, str(rewards_dir))
-
-        try:
-            builder = HFRLTrainerBuilder(grpo_cfg, model, tokenizer)
-            training_arguments, _ = builder._build_training_arguments(100)
-            builder.train_dataset = MagicMock()
-
-            self._test_common_training_arguments(training_arguments, rl=grpo_cfg.rl)
-            # GRPO specific
-            assert training_arguments.beta == 0.001
-            assert training_arguments.max_completion_length == 256
-            assert training_arguments.use_vllm is False
-            # assert training_arguments.vllm_device == "auto"
-            # assert training_arguments.vllm_gpu_memory_utilization == 0.15
-            assert training_arguments.num_generations == 4
-
-            # Test trainer creation to verify reward_funcs
-            trainer = builder.build(100)
-
-            # Verify reward functions are properly loaded
-            assert len(trainer.reward_funcs) == 1
-            assert trainer.reward_funcs[0].__module__ == "rewards"
-            assert trainer.reward_funcs[0].__name__ == "rand_reward_func"
-        finally:
-            # remove imported module from path
-            if str(rewards_dir) in sys.path:
-                sys.path.remove(str(rewards_dir))
-
-    def test_ipo_training_arguments(self, ipo_cfg, model, tokenizer):
-        builder = HFRLTrainerBuilder(ipo_cfg, model, tokenizer)
-        training_arguments, _ = builder._build_training_arguments(100)
-
-        self._test_common_training_arguments(training_arguments, rl=ipo_cfg.rl)
-        # IPO specific
-        assert training_arguments.beta == 0.1
-        assert training_arguments.loss_type == ["ipo"]
-        assert training_arguments.label_smoothing == 0
-
-    def test_simpo_training_arguments(self, simpo_cfg, model, tokenizer):
-        builder = HFRLTrainerBuilder(simpo_cfg, model, tokenizer)
-        training_arguments, _ = builder._build_training_arguments(100)
-
-        self._test_common_training_arguments(training_arguments, rl=simpo_cfg.rl)
-        # SIMPO specific
-        assert training_arguments.beta == 0.2
-        assert training_arguments.cpo_alpha == 0.9
-        assert training_arguments.simpo_gamma == 0.4
-
-    @pytest.mark.parametrize(
-        ("cfg_string", "dataset_name"),
-        [
-            (
-                "dpo_cfg",
-                "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff",
-            ),
-            (
-                "ipo_cfg",
-                "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff",
-            ),
-            (
-                "grpo_cfg",
-                "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff",
-            ),
-            ("orpo_cfg", None),  # don't use fixture for orpo to use smaller split
-            ("kto_cfg", None),  # no fixture for kto
-            # (
-            #     "simpo_cfg",
-            #     "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff",
-            # ),
-        ],
-    )
-    def test_custom_optimizer_cls_and_kwargs(
-        self,
-        request,
-        cfg_string,
-        dataset_name,
-        tmp_path,
-        model,
-        tokenizer,
-    ):
-        cfg = request.getfixturevalue(cfg_string)
-
-        builder = HFRLTrainerBuilder(cfg, model, tokenizer)
-        cfg["optimizer"] = "muon"
-
-        if cfg_string in ["dpo_cfg", "ipo_cfg", "grpo_cfg", "simpo_cfg"]:
-            cfg["datasets"] = [DictDefault(ALPACA_MESSAGES_CONFIG_REVISION)]
-        elif cfg_string == "kto_cfg":
-            cfg["datasets"] = [
-                DictDefault(
-                    {
-                        "path": "argilla/ultrafeedback-binarized-preferences-cleaned-kto",
-                        "type": "llama3.ultra",
-                        "split": "train[:1%]",
-                    }
-                )
-            ]
-        elif cfg_string == "orpo_cfg":
-            cfg["datasets"] = [
-                DictDefault(
-                    {
-                        "path": "argilla/ultrafeedback-binarized-preferences-cleaned",
-                        "type": "chat_template.argilla",
-                        "split": "train[:1%]",
-                    }
-                )
-            ]
-        else:
-            raise ValueError(f"Unhandled cfg_string: {cfg_string}")
-        cfg["dataset_num_proc"] = 4
-
-        if cfg_string == "grpo_cfg":
-            rewards_dir = tmp_path / "rewards_test"
-            self._write_rewards_file(rewards_dir)
-
-            # Add the directory to Python path so we can import the module
-            sys.path.insert(0, str(rewards_dir))
-
-        try:
-            # Only use mock for the commented out configs
-            if dataset_name is not None:
-                with patch(
-                    "axolotl.utils.data.rl.load_dataset_with_config"
-                ) as mock_load_dataset:
-                    mock_load_dataset.return_value = request.getfixturevalue(
-                        dataset_name
-                    )
-                    train_dataset, eval_dataset = prepare_preference_datasets(
-                        cfg, tokenizer
-                    )
-            else:
-                # Load actual datasets for orpo_cfg and kto_cfg
-                train_dataset, eval_dataset = prepare_preference_datasets(
-                    cfg, tokenizer
-                )
-
-            builder.train_dataset = train_dataset
-            builder.eval_dataset = eval_dataset
-
-            trainer = builder.build(100)
-
-            assert trainer.optimizer_cls_and_kwargs is not None
-
-            from axolotl.contribs.mit.muon import MuonOptimizerFactory
-            from axolotl.contribs.mit.muon.muon import Muon
-
-            optimizer_cls, optimizer_kwargs = trainer.optimizer_cls_and_kwargs
-            assert optimizer_cls is MuonOptimizerFactory
-            assert optimizer_kwargs["lr"] == 0.00005
-            assert optimizer_kwargs["weight_decay"] == 0.01
-            assert optimizer_kwargs["betas"] == (0.998, 0.9)
-            assert optimizer_kwargs["eps"] == 0.00001
-
-            # Ensure optimizer is created with correct class
-            optim = trainer.create_optimizer()
-            assert isinstance(optim, Muon)
-
-        finally:
-            # remove imported module from path
-            if cfg_string == "grpo_cfg" and str(rewards_dir) in sys.path:
-                sys.path.remove(str(rewards_dir))
 
 
 class TestHFCausalTrainerBuilder:
diff --git a/tests/core/test_builders_rl.py b/tests/core/test_builders_rl.py
new file mode 100644
index 0000000000..435e7eb411
--- /dev/null
+++ b/tests/core/test_builders_rl.py
@@ -0,0 +1,264 @@
+"""Unit tests for axolotl.core.builders RL (preference/GRPO) trainer builders."""
+
+import sys
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from axolotl.core.builders import HFRLTrainerBuilder
+from axolotl.utils.data import prepare_preference_datasets
+from axolotl.utils.dict import DictDefault
+
+from tests.constants import ALPACA_MESSAGES_CONFIG_REVISION
+
+
+class TestHFRLTrainerBuilder:
+    """
+    TestCase class for RLHF trainer builders
+    """
+
+    def _test_common_training_arguments(self, training_arguments, rl: str):
+        """Helper to test common arguments across all variants"""
+        # Basic training settings
+        if rl == "grpo":
+            # grpo_cfg's micro_batch_size is diff from others
+            assert training_arguments.per_device_train_batch_size == 4
+        else:
+            assert training_arguments.per_device_train_batch_size == 2
+        assert training_arguments.gradient_accumulation_steps == 1
+        assert training_arguments.max_steps == 100
+
+        # Optimizer settings
+        assert training_arguments.learning_rate == 0.00005
+        assert training_arguments.weight_decay == 0.01
+        assert training_arguments.adam_beta1 == 0.998
+        assert training_arguments.adam_beta2 == 0.9
+        assert training_arguments.adam_epsilon == 0.00001
+        assert training_arguments.max_grad_norm == 1.0
+
+        # LR scheduler settings
+        assert training_arguments.lr_scheduler_type == "cosine"
+        assert training_arguments.warmup_steps == 10
+        assert training_arguments.cosine_min_lr_ratio == 0.1
+        assert training_arguments.cosine_constant_lr_ratio == 0.2
+
+        # Other settings
+        assert training_arguments.dataloader_num_workers == 1
+        assert training_arguments.dataloader_pin_memory is True
+
+        # TODO(wing): restore once trl releases 0.22.0
+        # assert training_arguments.gradient_checkpointing is True
+
+    def test_dpo_training_arguments(self, dpo_cfg, model, tokenizer):
+        dpo_cfg["precompute_ref_log_probs"] = True
+        builder = HFRLTrainerBuilder(dpo_cfg, model, tokenizer)
+        training_arguments, _ = builder._build_training_arguments(100)
+
+        self._test_common_training_arguments(training_arguments, rl=dpo_cfg.rl)
+        # DPO specific
+        assert training_arguments.beta == 0.1
+        assert hasattr(training_arguments, "use_weighting")
+        assert training_arguments.use_weighting is True
+        assert training_arguments.label_smoothing == 0.1
+        assert training_arguments.precompute_ref_log_probs is True
+        assert training_arguments.loss_type == ["sigmoid", "sft"]
+        assert training_arguments.loss_weights == [1.0, 0.5]
+
+    def test_orpo_training_arguments(self, orpo_cfg, model, tokenizer):
+        builder = HFRLTrainerBuilder(orpo_cfg, model, tokenizer)
+        training_arguments, _ = builder._build_training_arguments(100)
+
+        self._test_common_training_arguments(training_arguments, rl=orpo_cfg.rl)
+        # ORPO specific
+        assert training_arguments.beta == 0.1  # maps from orpo_alpha
+
+    def test_kto_training_arguments(self, kto_cfg, model, tokenizer):
+        builder = HFRLTrainerBuilder(kto_cfg, model, tokenizer)
+        training_arguments, _ = builder._build_training_arguments(100)
+
+        self._test_common_training_arguments(training_arguments, rl=kto_cfg.rl)
+        # KTO specific
+        assert training_arguments.desirable_weight == 1.0
+        assert training_arguments.undesirable_weight == 1.0
+
+    def _write_rewards_file(self, rewards_dir: Path):
+        """
+        Writes reward function to local tmp path to be loaded on trainer building
+        """
+        # Create rewards.py in a directory we can import from
+        rewards_dir.mkdir()
+        rewards_file = rewards_dir / "rewards.py"
+        rewards_file.write_text(
+            """import random
+def rand_reward_func(prompts, completions) -> list[float]:
+    return [random.uniform(0, 1) for _ in completions]
+"""
+        )
+
+    def test_grpo_training_arguments(self, grpo_cfg, model, tokenizer, tmp_path):
+        rewards_dir = tmp_path / "rewards_test"
+        self._write_rewards_file(rewards_dir)
+
+        # Add the directory to Python path so we can import the module
+        sys.path.insert(0, str(rewards_dir))
+
+        try:
+            builder = HFRLTrainerBuilder(grpo_cfg, model, tokenizer)
+            training_arguments, _ = builder._build_training_arguments(100)
+            builder.train_dataset = MagicMock()
+
+            self._test_common_training_arguments(training_arguments, rl=grpo_cfg.rl)
+            # GRPO specific
+            assert training_arguments.beta == 0.001
+            assert training_arguments.max_completion_length == 256
+            assert training_arguments.use_vllm is False
+            # assert training_arguments.vllm_device == "auto"
+            # assert training_arguments.vllm_gpu_memory_utilization == 0.15
+            assert training_arguments.num_generations == 4
+
+            # Test trainer creation to verify reward_funcs
+            trainer = builder.build(100)
+
+            # Verify reward functions are properly loaded
+            assert len(trainer.reward_funcs) == 1
+            assert trainer.reward_funcs[0].__module__ == "rewards"
+            assert trainer.reward_funcs[0].__name__ == "rand_reward_func"
+        finally:
+            # remove imported module from path
+            if str(rewards_dir) in sys.path:
+                sys.path.remove(str(rewards_dir))
+
+    def test_ipo_training_arguments(self, ipo_cfg, model, tokenizer):
+        builder = HFRLTrainerBuilder(ipo_cfg, model, tokenizer)
+        training_arguments, _ = builder._build_training_arguments(100)
+
+        self._test_common_training_arguments(training_arguments, rl=ipo_cfg.rl)
+        # IPO specific
+        assert training_arguments.beta == 0.1
+        assert training_arguments.loss_type == ["ipo"]
+        assert training_arguments.label_smoothing == 0
+
+    def test_simpo_training_arguments(self, simpo_cfg, model, tokenizer):
+        builder = HFRLTrainerBuilder(simpo_cfg, model, tokenizer)
+        training_arguments, _ = builder._build_training_arguments(100)
+
+        self._test_common_training_arguments(training_arguments, rl=simpo_cfg.rl)
+        # SIMPO specific
+        assert training_arguments.beta == 0.2
+        assert training_arguments.cpo_alpha == 0.9
+        assert training_arguments.simpo_gamma == 0.4
+
+    @pytest.mark.parametrize(
+        ("cfg_string", "dataset_name"),
+        [
+            (
+                "dpo_cfg",
+                "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff",
+            ),
+            (
+                "ipo_cfg",
+                "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff",
+            ),
+            (
+                "grpo_cfg",
+                "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff",
+            ),
+            ("orpo_cfg", None),  # don't use fixture for orpo to use smaller split
+            ("kto_cfg", None),  # no fixture for kto
+            # (
+            #     "simpo_cfg",
+            #     "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff",
+            # ),
+        ],
+    )
+    def test_custom_optimizer_cls_and_kwargs(
+        self,
+        request,
+        cfg_string,
+        dataset_name,
+        tmp_path,
+        model,
+        tokenizer,
+    ):
+        cfg = request.getfixturevalue(cfg_string)
+
+        builder = HFRLTrainerBuilder(cfg, model, tokenizer)
+        cfg["optimizer"] = "muon"
+
+        if cfg_string in ["dpo_cfg", "ipo_cfg", "grpo_cfg", "simpo_cfg"]:
+            cfg["datasets"] = [DictDefault(ALPACA_MESSAGES_CONFIG_REVISION)]
+        elif cfg_string == "kto_cfg":
+            cfg["datasets"] = [
+                DictDefault(
+                    {
+                        "path": "argilla/ultrafeedback-binarized-preferences-cleaned-kto",
+                        "type": "llama3.ultra",
+                        "split": "train[:1%]",
+                    }
+                )
+            ]
+        elif cfg_string == "orpo_cfg":
+            cfg["datasets"] = [
+                DictDefault(
+                    {
+                        "path": "argilla/ultrafeedback-binarized-preferences-cleaned",
+                        "type": "chat_template.argilla",
+                        "split": "train[:1%]",
+                    }
+                )
+            ]
+        else:
+            raise ValueError(f"Unhandled cfg_string: {cfg_string}")
+        cfg["dataset_num_proc"] = 1
+
+        if cfg_string == "grpo_cfg":
+            rewards_dir = tmp_path / "rewards_test"
+            self._write_rewards_file(rewards_dir)
+
+            # Add the directory to Python path so we can import the module
+            sys.path.insert(0, str(rewards_dir))
+
+        try:
+            # Only use mock for the commented out configs
+            if dataset_name is not None:
+                with patch(
+                    "axolotl.utils.data.rl.load_dataset_with_config"
+                ) as mock_load_dataset:
+                    mock_load_dataset.return_value = request.getfixturevalue(
+                        dataset_name
+                    )
+                    train_dataset, eval_dataset = prepare_preference_datasets(
+                        cfg, tokenizer
+                    )
+            else:
+                # Load actual datasets for orpo_cfg and kto_cfg
+                train_dataset, eval_dataset = prepare_preference_datasets(
+                    cfg, tokenizer
+                )
+
+            builder.train_dataset = train_dataset
+            builder.eval_dataset = eval_dataset
+
+            trainer = builder.build(100)
+
+            assert trainer.optimizer_cls_and_kwargs is not None
+
+            from axolotl.contribs.mit.muon import MuonOptimizerFactory
+            from axolotl.contribs.mit.muon.muon import Muon
+
+            optimizer_cls, optimizer_kwargs = trainer.optimizer_cls_and_kwargs
+            assert optimizer_cls is MuonOptimizerFactory
+            assert optimizer_kwargs["lr"] == 0.00005
+            assert optimizer_kwargs["weight_decay"] == 0.01
+            assert optimizer_kwargs["betas"] == (0.998, 0.9)
+            assert optimizer_kwargs["eps"] == 0.00001
+
+            # Ensure optimizer is created with correct class
+            optim = trainer.create_optimizer()
+            assert isinstance(optim, Muon)
+
+        finally:
+            # remove imported module from path
+            if cfg_string == "grpo_cfg" and str(rewards_dir) in sys.path:
+                sys.path.remove(str(rewards_dir))
diff --git a/tests/e2e/integrations/test_sonicmoe.py b/tests/e2e/integrations/test_sonicmoe.py
index ff8620b2fa..b74e570d02 100644
--- a/tests/e2e/integrations/test_sonicmoe.py
+++ b/tests/e2e/integrations/test_sonicmoe.py
@@ -1,13 +1,17 @@
-"""
-End-to-end gradient and convergence tests for SonicMoE integration.
+"""End-to-end gradient and convergence tests for SonicMoE integration.
 
-Requires:
-    - H100/H200 GPU (SonicMoE CUTLASS kernels target sm_90)
-    - sonicmoe package installed
-    - transformers with Qwen3MoE support
+Flow:
+
+    register_sonicmoe_experts()                # plug into ALL_EXPERTS_FUNCTIONS
+    config._experts_implementation = "sonicmoe"
+    model = AutoModelForCausalLM.from_config(config)   # transformers dispatches
 
-Usage:
-    pytest tests/e2e/integrations/test_sonicmoe.py -v -s
+No weight interleaving needed (``concat_layout=True``).
+
+Requires:
+    - Hopper (sm_90) or Blackwell (sm_100+) GPU
+    - sonic-moe >= 0.1.2 installed from source
+    - transformers >= 5.8 with Qwen3MoE Experts class
 """
 
 import importlib.util
@@ -16,20 +20,29 @@
 import pytest
 import torch
 
-_sonicmoe_available = importlib.util.find_spec("sonicmoe") is not None
-_is_hopper = torch.cuda.is_available() and torch.cuda.get_device_capability() == (9, 0)
+
+def _is_hopper_or_newer() -> bool:
+    if not torch.cuda.is_available():
+        return False
+    major, _ = torch.cuda.get_device_capability()
+    return major >= 9
+
 
 pytestmark = [
     pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA GPU"),
     pytest.mark.skipif(
-        not _is_hopper, reason="SonicMoE CUTLASS kernels require Hopper (sm_90)"
+        not _is_hopper_or_newer(),
+        reason="SonicMoE requires Hopper (sm_90) or Blackwell (sm_100+)",
+    ),
+    pytest.mark.skipif(
+        importlib.util.find_spec("kernels") is None,
+        reason="HF `kernels` package not installed",
     ),
-    pytest.mark.skipif(not _sonicmoe_available, reason="SonicMoE not installed"),
 ]
 
 
-def _create_tiny_qwen3_config():
-    """Create a minimal Qwen3MoE config for fast testing."""
+def _create_tiny_qwen3_config(experts_implementation: str):
+    """Create a minimal Qwen3MoE config bound to the requested experts impl."""
     from transformers import AutoConfig
 
     config = AutoConfig.for_model("qwen3_moe")
@@ -46,137 +59,85 @@ def _create_tiny_qwen3_config():
     config.max_position_embeddings = 128
     config.norm_topk_prob = True
     config.torch_dtype = torch.bfloat16
+    config._experts_implementation = experts_implementation
     return config
 
 
-def _interleave_gate_up_weights(model):
-    """Interleave all gate_up_proj parameters in-place for SonicMoE."""
-    from axolotl.integrations.kernels.libs.sonicmoe.weight_converter import (
-        interleave_gate_up,
-    )
-
-    with torch.no_grad():
-        for name, param in model.named_parameters():
-            if "gate_up_proj" in name:
-                param.copy_(interleave_gate_up(param))
+def _build_model(experts_implementation: str):
+    from transformers import AutoModelForCausalLM
 
+    from axolotl.integrations.kernels.libs.sonicmoe.experts import (
+        register_sonicmoe_experts,
+    )
 
-def _unpatch_sonicmoe():
-    """Restore original forward on the MoE block class if it was patched."""
-    from axolotl.integrations.kernels.constants import resolve_moe_block_classes
-
-    for moe_cls in resolve_moe_block_classes("qwen3_moe"):
-        if hasattr(moe_cls, "_original_forward"):
-            moe_cls.forward = moe_cls._original_forward
-            del moe_cls._original_forward
+    register_sonicmoe_experts()
+    config = _create_tiny_qwen3_config(experts_implementation)
+    return AutoModelForCausalLM.from_config(config).cuda().bfloat16(), config
 
 
 class TestSonicMoEForwardCorrectness:
-    """Verify SonicMoE-patched model produces same output as original."""
-
-    def teardown_method(self):
-        _unpatch_sonicmoe()
-
-    def test_forward_output_matches(self):
-        from transformers import AutoModelForCausalLM
-
-        from axolotl.integrations.kernels.libs.sonicmoe.patch import patch_sonicmoe
-
-        config = _create_tiny_qwen3_config()
-        input_ids = torch.randint(0, config.vocab_size, (1, 16), device="cuda")
+    """SonicMoE-dispatched model produces output close to eager baseline."""
 
-        # Original model
-        model_orig = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
+    def test_forward_output_matches_eager(self):
+        input_ids = torch.randint(0, 1000, (1, 16), device="cuda")
 
+        eager_model, _ = _build_model("eager")
         with torch.no_grad():
-            out_orig = model_orig(input_ids)
+            out_eager = eager_model(input_ids).logits
 
-        # Patched model (same weights, interleaved for SonicMoE)
-        model_patched = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
-        model_patched.load_state_dict(model_orig.state_dict())
-
-        patch_sonicmoe("qwen3_moe")
-        _interleave_gate_up_weights(model_patched)
+        sonic_model, _ = _build_model("sonicmoe")
+        sonic_model.load_state_dict(eager_model.state_dict())
 
         with torch.no_grad():
-            out_patched = model_patched(input_ids)
+            out_sonic = sonic_model(input_ids).logits
 
-        max_diff = (out_orig.logits - out_patched.logits).abs().max().item()
-        assert torch.allclose(
-            out_orig.logits, out_patched.logits, atol=1e-1, rtol=1e-1
-        ), f"Output mismatch: max diff={max_diff:.6f}"
+        max_diff = (out_eager - out_sonic).abs().max().item()
+        assert torch.allclose(out_eager, out_sonic, atol=1e-1, rtol=1e-1), (
+            f"Output mismatch: max diff={max_diff:.6f}"
+        )
 
 
 class TestSonicMoEGradientCorrectness:
-    """Compare gradients between original HuggingFace and SonicMoE-patched forward."""
-
-    def teardown_method(self):
-        _unpatch_sonicmoe()
+    """Compare gradients between eager and SonicMoE-dispatched forward."""
 
     def test_gradients_match(self):
-        """Verify all parameter gradients match between original and patched."""
-        from transformers import AutoModelForCausalLM
-
-        from axolotl.integrations.kernels.libs.sonicmoe.patch import patch_sonicmoe
-        from axolotl.integrations.kernels.libs.sonicmoe.weight_converter import (
-            deinterleave_gate_up,
-        )
+        input_ids = torch.randint(0, 1000, (1, 16), device="cuda")
 
-        config = _create_tiny_qwen3_config()
-        input_ids = torch.randint(0, config.vocab_size, (1, 16), device="cuda")
+        eager_model, _ = _build_model("eager")
+        out_eager = eager_model(input_ids, labels=input_ids)
+        out_eager.loss.backward()
+        grads_eager = {
+            n: p.grad.float().clone()
+            for n, p in eager_model.named_parameters()
+            if p.grad is not None
+        }
+        loss_eager = out_eager.loss.item()
 
-        # ---------- Original model ----------
-        model_orig = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
-        out_orig = model_orig(input_ids, labels=input_ids)
-        out_orig.loss.backward()
-        grads_orig = {
+        sonic_model, _ = _build_model("sonicmoe")
+        sonic_model.load_state_dict(eager_model.state_dict())
+        out_sonic = sonic_model(input_ids, labels=input_ids)
+        out_sonic.loss.backward()
+        grads_sonic = {
             n: p.grad.float().clone()
-            for n, p in model_orig.named_parameters()
+            for n, p in sonic_model.named_parameters()
             if p.grad is not None
         }
-        loss_orig = out_orig.loss.item()
-
-        # ---------- SonicMoE-patched model (same weights, interleaved) ----------
-        model_patched = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
-        model_patched.load_state_dict(model_orig.state_dict())
-
-        patch_sonicmoe("qwen3_moe")
-        _interleave_gate_up_weights(model_patched)
-
-        out_patched = model_patched(input_ids, labels=input_ids)
-        out_patched.loss.backward()
-        grads_patched = {}
-        for n, p in model_patched.named_parameters():
-            if p.grad is None:
-                continue
-            g = p.grad.float().clone()
-            # gate_up_proj grads are in interleaved layout, de-interleave to match orig
-            if "gate_up_proj" in n:
-                g = deinterleave_gate_up(g)
-            grads_patched[n] = g
-        loss_patched = out_patched.loss.item()
-
-        # ---------- Compare ----------
-        assert abs(loss_orig - loss_patched) < 0.5, (
-            f"Loss mismatch: orig={loss_orig:.4f}, patched={loss_patched:.4f}"
+        loss_sonic = out_sonic.loss.item()
+
+        assert abs(loss_eager - loss_sonic) < 0.5, (
+            f"Loss mismatch: eager={loss_eager:.4f}, sonic={loss_sonic:.4f}"
         )
 
-        # All parameters with gradients in original should have them in patched
-        missing = set(grads_orig.keys()) - set(grads_patched.keys())
-        assert not missing, f"Missing gradients in patched model: {missing}"
+        missing = set(grads_eager.keys()) - set(grads_sonic.keys())
+        assert not missing, f"Missing gradients in sonicmoe model: {missing}"
 
-        # Compare gradient values
-        # bf16 with different GEMM impls (cuBLAS vs CUTLASS) can diverge,
-        # so use generous tolerance: flag only if both rel >10% AND abs >1e-2
+        # bf16 + different GEMM backends can diverge; tolerate both rel >10% AND
+        # abs >1e-2 together.
         mismatches = []
-        for name in grads_orig:
-            if name not in grads_patched:
-                continue
-            g_orig = grads_orig[name]
-            g_patched = grads_patched[name]
-            max_diff = (g_orig - g_patched).abs().max().item()
-            rel_diff = max_diff / (g_orig.abs().max().item() + 1e-8)
-
+        for name, g_eager in grads_eager.items():
+            g_sonic = grads_sonic[name]
+            max_diff = (g_eager - g_sonic).abs().max().item()
+            rel_diff = max_diff / (g_eager.abs().max().item() + 1e-8)
             if rel_diff > 0.1 and max_diff > 1e-2:
                 mismatches.append(
                     f"  {name}: max_abs_diff={max_diff:.6f}, rel_diff={rel_diff:.4f}"
@@ -188,18 +149,8 @@ def test_gradients_match(self):
         )
 
     def test_router_weights_receive_gradients(self):
-        """Verify that router (gate) weights get non-zero gradients."""
-        from transformers import AutoModelForCausalLM
-
-        from axolotl.integrations.kernels.libs.sonicmoe.patch import patch_sonicmoe
-
-        config = _create_tiny_qwen3_config()
-        input_ids = torch.randint(0, config.vocab_size, (1, 16), device="cuda")
-
-        model = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
-        patch_sonicmoe("qwen3_moe")
-        _interleave_gate_up_weights(model)
-
+        input_ids = torch.randint(0, 1000, (1, 16), device="cuda")
+        model, _ = _build_model("sonicmoe")
         out = model(input_ids, labels=input_ids)
         out.loss.backward()
 
@@ -216,21 +167,9 @@ def test_router_weights_receive_gradients(self):
 class TestSonicMoETrainingConvergence:
     """Verify loss decreases during training with SonicMoE."""
 
-    def teardown_method(self):
-        _unpatch_sonicmoe()
-
     def test_loss_decreases(self):
-        """Run 30 training steps, verify loss decreases and no NaN/Inf."""
-        from transformers import AutoModelForCausalLM
-
-        from axolotl.integrations.kernels.libs.sonicmoe.patch import patch_sonicmoe
-
-        config = _create_tiny_qwen3_config()
-        input_ids = torch.randint(0, config.vocab_size, (2, 32), device="cuda")
-
-        model = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
-        patch_sonicmoe("qwen3_moe")
-        _interleave_gate_up_weights(model)
+        input_ids = torch.randint(0, 1000, (2, 32), device="cuda")
+        model, _ = _build_model("sonicmoe")
 
         optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
         losses = []
@@ -251,24 +190,14 @@ def test_loss_decreases(self):
         )
 
     def test_expert_weights_update(self):
-        """Verify expert weights change during training (not frozen)."""
-        from transformers import AutoModelForCausalLM
-
-        from axolotl.integrations.kernels.libs.sonicmoe.patch import patch_sonicmoe
-
-        config = _create_tiny_qwen3_config()
-        input_ids = torch.randint(0, config.vocab_size, (2, 32), device="cuda")
-
-        model = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
-        patch_sonicmoe("qwen3_moe")
-        _interleave_gate_up_weights(model)
-
-        # Snapshot expert weights before training
-        expert_weights_before = {}
-        for name, param in model.named_parameters():
-            if "experts" in name:
-                expert_weights_before[name] = param.data.clone()
+        input_ids = torch.randint(0, 1000, (2, 32), device="cuda")
+        model, _ = _build_model("sonicmoe")
 
+        expert_weights_before = {
+            name: param.data.clone()
+            for name, param in model.named_parameters()
+            if "experts" in name
+        }
         assert expert_weights_before, "No expert parameters found"
 
         optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
@@ -278,11 +207,10 @@ def test_expert_weights_update(self):
             optimizer.step()
             optimizer.zero_grad()
 
-        # Check that expert weights changed
-        changed = 0
-        for name, param in model.named_parameters():
-            if name in expert_weights_before:
-                if not torch.equal(param.data, expert_weights_before[name]):
-                    changed += 1
-
+        changed = sum(
+            1
+            for name, param in model.named_parameters()
+            if name in expert_weights_before
+            and not torch.equal(param.data, expert_weights_before[name])
+        )
         assert changed > 0, "No expert weights changed after 5 training steps"
diff --git a/tests/e2e/integrations/test_sonicmoe_lora.py b/tests/e2e/integrations/test_sonicmoe_lora.py
index 74721ee57a..cc58f4dccd 100644
--- a/tests/e2e/integrations/test_sonicmoe_lora.py
+++ b/tests/e2e/integrations/test_sonicmoe_lora.py
@@ -2,21 +2,24 @@
 # Copyright (c) Axolotl AI
 # Licensed under the Apache License, Version 2.0
 
-"""
-End-to-end tests for SonicMoE + LoRA integration.
+"""End-to-end tests for SonicMoE + LoRA.
 
-Verifies that PEFT-wrapped MoE models work correctly with SonicMoE's
-runtime LoRA materialization: gradients flow to adapters, base weights
-stay frozen, and loss converges.
+Flow:
 
-Requires:
-    - H100/H200 GPU (SonicMoE CUTLASS kernels target sm_90)
-    - sonicmoe package installed
-    - peft package installed
-    - transformers with Qwen3MoE support
+    register_sonicmoe_experts()                # plug into ALL_EXPERTS_FUNCTIONS
+    config._experts_implementation = "sonicmoe"
+    model = AutoModelForCausalLM.from_config(config)
+    model = get_peft_model(model, lora_config)   # PEFT wraps params/modules
 
-Usage:
-    pytest tests/e2e/integrations/test_sonicmoe_lora.py -v -s
+``sonicmoe_experts_forward_with_lora`` detects the PEFT wrappers and
+materializes ``W_eff = W + scaling * (B @ A)`` via :class:`MoELoRAMaterialize`,
+so adapters train through the CUTLASS kernels.
+
+Requires:
+    - Hopper (sm_90) or Blackwell (sm_100+) GPU
+    - sonic-moe >= 0.1.2 installed from source
+    - peft installed
+    - transformers >= 5.8 with Qwen3MoE Experts class
 """
 
 import importlib.util
@@ -25,22 +28,31 @@
 import pytest
 import torch
 
-_sonicmoe_available = importlib.util.find_spec("sonicmoe") is not None
-_peft_available = importlib.util.find_spec("peft") is not None
-_is_hopper = torch.cuda.is_available() and torch.cuda.get_device_capability() == (9, 0)
+
+def _is_hopper_or_newer() -> bool:
+    if not torch.cuda.is_available():
+        return False
+    major, _ = torch.cuda.get_device_capability()
+    return major >= 9
+
 
 pytestmark = [
     pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA GPU"),
     pytest.mark.skipif(
-        not _is_hopper, reason="SonicMoE CUTLASS kernels require Hopper (sm_90)"
+        not _is_hopper_or_newer(),
+        reason="SonicMoE requires Hopper (sm_90) or Blackwell (sm_100+)",
+    ),
+    pytest.mark.skipif(
+        importlib.util.find_spec("kernels") is None,
+        reason="HF `kernels` package not installed",
+    ),
+    pytest.mark.skipif(
+        importlib.util.find_spec("peft") is None, reason="PEFT not installed"
     ),
-    pytest.mark.skipif(not _sonicmoe_available, reason="SonicMoE not installed"),
-    pytest.mark.skipif(not _peft_available, reason="PEFT not installed"),
 ]
 
 
 def _create_tiny_qwen3_config():
-    """Create a minimal Qwen3MoE config for fast testing."""
     from transformers import AutoConfig
 
     config = AutoConfig.for_model("qwen3_moe")
@@ -57,33 +69,23 @@ def _create_tiny_qwen3_config():
     config.max_position_embeddings = 128
     config.norm_topk_prob = True
     config.torch_dtype = torch.bfloat16
+    config._experts_implementation = "sonicmoe"
     return config
 
 
-def _interleave_gate_up_weights(model):
-    """Interleave all gate_up_proj parameters in-place for SonicMoE."""
-    from axolotl.integrations.kernels.libs.sonicmoe.weight_converter import (
-        interleave_gate_up,
-    )
-
-    with torch.no_grad():
-        for name, param in model.named_parameters():
-            if "gate_up_proj" in name:
-                param.copy_(interleave_gate_up(param))
+def _build_sonic_model():
+    from transformers import AutoModelForCausalLM
 
+    from axolotl.integrations.kernels.libs.sonicmoe.experts import (
+        register_sonicmoe_experts,
+    )
 
-def _unpatch_sonicmoe():
-    """Restore original forward on the MoE block class if it was patched."""
-    from axolotl.integrations.kernels.constants import resolve_moe_block_classes
-
-    for moe_cls in resolve_moe_block_classes("qwen3_moe"):
-        if hasattr(moe_cls, "_original_forward"):
-            moe_cls.forward = moe_cls._original_forward
-            del moe_cls._original_forward
+    register_sonicmoe_experts()
+    config = _create_tiny_qwen3_config()
+    return AutoModelForCausalLM.from_config(config).cuda().bfloat16()
 
 
 def _apply_lora(model, target_modules):
-    """Apply PEFT LoRA to the model."""
     from peft import LoraConfig, get_peft_model
 
     lora_config = LoraConfig(
@@ -97,37 +99,23 @@ def _apply_lora(model, target_modules):
 
 
 class TestSonicMoELoRATraining:
-    """Verify SonicMoE + LoRA training works end-to-end."""
-
-    def teardown_method(self):
-        _unpatch_sonicmoe()
+    """SonicMoE + LoRA on expert projections trains end-to-end."""
 
     def test_loss_decreases(self):
-        """Run 30 training steps with LoRA on experts, verify loss decreases."""
-        from transformers import AutoModelForCausalLM
-
-        from axolotl.integrations.kernels.libs.sonicmoe.patch import patch_sonicmoe
-
-        config = _create_tiny_qwen3_config()
-        input_ids = torch.randint(0, config.vocab_size, (2, 32), device="cuda")
-
-        model = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
-        patch_sonicmoe("qwen3_moe")
-        _interleave_gate_up_weights(model)
+        input_ids = torch.randint(0, 1000, (2, 32), device="cuda")
+        model = _build_sonic_model()
         model = _apply_lora(model, ["gate_up_proj", "down_proj"])
 
         optimizer = torch.optim.AdamW(
             [p for p in model.parameters() if p.requires_grad], lr=1e-3
         )
         losses = []
-
         for step in range(30):
             out = model(input_ids, labels=input_ids)
             loss = out.loss
             assert not math.isnan(loss.item()), f"NaN loss at step {step}"
             assert not math.isinf(loss.item()), f"Inf loss at step {step}"
             losses.append(loss.item())
-
             loss.backward()
             optimizer.step()
             optimizer.zero_grad()
@@ -137,24 +125,15 @@ def test_loss_decreases(self):
         )
 
     def test_base_weights_frozen(self):
-        """Verify base (non-LoRA) weights don't change during training."""
-        from transformers import AutoModelForCausalLM
-
-        from axolotl.integrations.kernels.libs.sonicmoe.patch import patch_sonicmoe
-
-        config = _create_tiny_qwen3_config()
-        input_ids = torch.randint(0, config.vocab_size, (2, 32), device="cuda")
-
-        model = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
-        patch_sonicmoe("qwen3_moe")
-        _interleave_gate_up_weights(model)
+        input_ids = torch.randint(0, 1000, (2, 32), device="cuda")
+        model = _build_sonic_model()
         model = _apply_lora(model, ["gate_up_proj", "down_proj"])
 
-        # Snapshot frozen weights
-        frozen_before = {}
-        for name, param in model.named_parameters():
-            if not param.requires_grad:
-                frozen_before[name] = param.data.clone()
+        frozen_before = {
+            name: param.data.clone()
+            for name, param in model.named_parameters()
+            if not param.requires_grad
+        }
 
         optimizer = torch.optim.AdamW(
             [p for p in model.parameters() if p.requires_grad], lr=1e-3
@@ -165,24 +144,13 @@ def test_base_weights_frozen(self):
             optimizer.step()
             optimizer.zero_grad()
 
-        for name, param in model.named_parameters():
-            if name in frozen_before:
-                assert torch.equal(param.data, frozen_before[name]), (
-                    f"Frozen weight changed: {name}"
-                )
+        for name, before in frozen_before.items():
+            after = dict(model.named_parameters())[name]
+            assert torch.equal(after.data, before), f"Frozen weight changed: {name}"
 
     def test_lora_adapters_receive_gradients(self):
-        """Verify LoRA A and B matrices get non-zero gradients."""
-        from transformers import AutoModelForCausalLM
-
-        from axolotl.integrations.kernels.libs.sonicmoe.patch import patch_sonicmoe
-
-        config = _create_tiny_qwen3_config()
-        input_ids = torch.randint(0, config.vocab_size, (1, 16), device="cuda")
-
-        model = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
-        patch_sonicmoe("qwen3_moe")
-        _interleave_gate_up_weights(model)
+        input_ids = torch.randint(0, 1000, (1, 16), device="cuda")
+        model = _build_sonic_model()
         model = _apply_lora(model, ["gate_up_proj", "down_proj"])
 
         out = model(input_ids, labels=input_ids)
@@ -200,25 +168,15 @@ def test_lora_adapters_receive_gradients(self):
         assert lora_grads_found > 0, "No LoRA parameters found with gradients"
 
     def test_lora_adapters_update(self):
-        """Verify LoRA adapter weights change during training."""
-        from transformers import AutoModelForCausalLM
-
-        from axolotl.integrations.kernels.libs.sonicmoe.patch import patch_sonicmoe
-
-        config = _create_tiny_qwen3_config()
-        input_ids = torch.randint(0, config.vocab_size, (2, 32), device="cuda")
-
-        model = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
-        patch_sonicmoe("qwen3_moe")
-        _interleave_gate_up_weights(model)
+        input_ids = torch.randint(0, 1000, (2, 32), device="cuda")
+        model = _build_sonic_model()
         model = _apply_lora(model, ["gate_up_proj", "down_proj"])
 
-        # Snapshot LoRA weights
-        lora_before = {}
-        for name, param in model.named_parameters():
-            if "lora_" in name and param.requires_grad:
-                lora_before[name] = param.data.clone()
-
+        lora_before = {
+            name: param.data.clone()
+            for name, param in model.named_parameters()
+            if "lora_" in name and param.requires_grad
+        }
         assert lora_before, "No LoRA parameters found"
 
         optimizer = torch.optim.AdamW(
@@ -239,38 +197,23 @@ def test_lora_adapters_update(self):
 
 
 class TestSonicMoEGateOnlyLoRA:
-    """Verify LoRA targeting only the gate (router) works with SonicMoE."""
-
-    def teardown_method(self):
-        _unpatch_sonicmoe()
+    """LoRA only on the router (gate) — expert path takes the no-LoRA fast path."""
 
     def test_gate_only_lora_loss_decreases(self):
-        """LoRA only on gate — expert path should have zero materialization overhead."""
-        from transformers import AutoModelForCausalLM
-
-        from axolotl.integrations.kernels.libs.sonicmoe.patch import patch_sonicmoe
-
-        config = _create_tiny_qwen3_config()
-        input_ids = torch.randint(0, config.vocab_size, (2, 32), device="cuda")
-
-        model = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
-        patch_sonicmoe("qwen3_moe")
-        _interleave_gate_up_weights(model)
-        # Only target the gate (router), not expert projections
+        input_ids = torch.randint(0, 1000, (2, 32), device="cuda")
+        model = _build_sonic_model()
         model = _apply_lora(model, ["gate"])
 
         optimizer = torch.optim.AdamW(
             [p for p in model.parameters() if p.requires_grad], lr=1e-3
         )
         losses = []
-
         for step in range(20):
             out = model(input_ids, labels=input_ids)
             loss = out.loss
             assert not math.isnan(loss.item()), f"NaN loss at step {step}"
             assert not math.isinf(loss.item()), f"Inf loss at step {step}"
             losses.append(loss.item())
-
             loss.backward()
             optimizer.step()
             optimizer.zero_grad()
@@ -281,34 +224,20 @@ def test_gate_only_lora_loss_decreases(self):
 
 
 class TestSonicMoENoLoRARegression:
-    """Verify SonicMoE without LoRA still works after LoRA code was added."""
-
-    def teardown_method(self):
-        _unpatch_sonicmoe()
+    """Full fine-tuning (no PEFT) still works through the registered forward."""
 
     def test_no_lora_loss_decreases(self):
-        """Full fine-tuning (no PEFT) with SonicMoE — regression test."""
-        from transformers import AutoModelForCausalLM
-
-        from axolotl.integrations.kernels.libs.sonicmoe.patch import patch_sonicmoe
-
-        config = _create_tiny_qwen3_config()
-        input_ids = torch.randint(0, config.vocab_size, (2, 32), device="cuda")
-
-        model = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
-        patch_sonicmoe("qwen3_moe")
-        _interleave_gate_up_weights(model)
+        input_ids = torch.randint(0, 1000, (2, 32), device="cuda")
+        model = _build_sonic_model()
 
         optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
         losses = []
-
         for step in range(20):
             out = model(input_ids, labels=input_ids)
             loss = out.loss
             assert not math.isnan(loss.item()), f"NaN loss at step {step}"
             assert not math.isinf(loss.item()), f"Inf loss at step {step}"
             losses.append(loss.item())
-
             loss.backward()
             optimizer.step()
             optimizer.zero_grad()
diff --git a/tests/e2e/multigpu/_fp32_norms_dtype_capture.py b/tests/e2e/multigpu/_fp32_norms_dtype_capture.py
new file mode 100644
index 0000000000..a0dda96b31
--- /dev/null
+++ b/tests/e2e/multigpu/_fp32_norms_dtype_capture.py
@@ -0,0 +1,58 @@
+"""Test-only plugin that captures param dtypes after the first optimizer step
+and dumps them as JSON to ``$FP32_NORMS_DTYPE_DUMP_PATH``.
+
+Loaded via ``plugins: [tests.e2e.multigpu._fp32_norms_dtype_capture.DtypeCapturePlugin]``
+in the test yaml config; the dump path is the contract between the subprocess
+and the outer pytest function. Rank 0 only — dtype is identical across ranks.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+
+import torch
+from transformers.trainer_callback import TrainerCallback
+
+from axolotl.integrations.base import BasePlugin
+
+
+def _dtype_name(dtype: torch.dtype) -> str:
+    return str(dtype).removeprefix("torch.")
+
+
+class _DtypeCaptureCallback(TrainerCallback):
+    """Capture norm vs non-norm param dtypes after step 1, dump to JSON, exit."""
+
+    def on_step_end(self, args, state, control, model=None, **kwargs):  # type: ignore[override]
+        if state.global_step != 1 or model is None:
+            return
+        # Rank 0 only — every rank sees the same dtype info under FSDP2.
+        if torch.distributed.is_initialized() and torch.distributed.get_rank() != 0:
+            return
+        dump_path = os.environ.get("FP32_NORMS_DTYPE_DUMP_PATH")
+        if not dump_path:
+            return
+
+        norm_dtypes: dict[str, str] = {}
+        non_norm_dtypes: dict[str, str] = {}
+        for name, param in model.named_parameters():
+            entry = (name, _dtype_name(param.dtype))
+            if "norm" in name.lower():
+                norm_dtypes[entry[0]] = entry[1]
+            else:
+                non_norm_dtypes[entry[0]] = entry[1]
+
+        with open(dump_path, "w", encoding="utf-8") as fout:
+            json.dump(
+                {"norms": norm_dtypes, "non_norms": non_norm_dtypes},
+                fout,
+                indent=2,
+            )
+
+
+class DtypeCapturePlugin(BasePlugin):
+    """Plugin that registers :class:`_DtypeCaptureCallback` with the trainer."""
+
+    def add_callbacks_pre_trainer(self, cfg, model):  # type: ignore[override]
+        return [_DtypeCaptureCallback()]
diff --git a/tests/e2e/multigpu/test_fsdp2_fp32_norms.py b/tests/e2e/multigpu/test_fsdp2_fp32_norms.py
new file mode 100644
index 0000000000..6dd2bf9c91
--- /dev/null
+++ b/tests/e2e/multigpu/test_fsdp2_fp32_norms.py
@@ -0,0 +1,144 @@
+"""Multi-GPU e2e test for ``fp32_norms`` under FSDP2.
+
+Two-GPU subprocess run with ``fp32_norms: true`` + ``fsdp_version: 2`` + bf16
+training. The test plugin
+``tests.e2e.multigpu._fp32_norms_dtype_capture.DtypeCapturePlugin`` dumps
+post-step-1 param dtypes as JSON; the outer test asserts norms stayed fp32 and
+at least one non-norm param dropped to bf16 (proving the two policies are
+genuinely independent, not a globally-cast model).
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+
+import pytest
+import yaml
+from accelerate.test_utils import execute_subprocess_async
+from transformers.testing_utils import get_torch_dist_unique_port
+
+from axolotl.utils.dict import DictDefault
+
+from tests.e2e.utils import require_torch_2_7_0
+
+AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent
+
+
+def _base_fp32_norms_config(
+    temp_dir: str, *, cpu_ram_efficient_loading: bool = False, **overrides
+) -> DictDefault:
+    """Base config for fp32_norms + FSDP2 multi-GPU."""
+    cfg = {
+        "base_model": "axolotl-ai-co/tiny-qwen3-129m",
+        "sequence_len": 256,
+        "val_set_size": 0.0,
+        "datasets": [
+            {
+                "path": "tatsu-lab/alpaca",
+                "type": "alpaca",
+                "split": "train[:1%]",
+            },
+        ],
+        # Full FT (no adapter) — fp32_norms is about base-model norm precision,
+        # which adapters wouldn't exercise.
+        "num_epochs": 1,
+        "max_steps": 2,
+        "micro_batch_size": 1,
+        "gradient_accumulation_steps": 1,
+        "output_dir": temp_dir,
+        "learning_rate": 1e-4,
+        "optimizer": "adamw_torch_fused",
+        "lr_scheduler": "cosine",
+        "flash_attention": True,
+        "bf16": True,
+        "fp32_norms": True,
+        "fsdp_version": 2,
+        "fsdp_config": {
+            "offload_params": False,
+            "cpu_ram_efficient_loading": cpu_ram_efficient_loading,
+            "transformer_layer_cls_to_wrap": "Qwen3DecoderLayer",
+            "state_dict_type": "FULL_STATE_DICT",
+            "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+            "reshard_after_forward": True,
+        },
+        "plugins": [
+            "tests.e2e.multigpu._fp32_norms_dtype_capture.DtypeCapturePlugin",
+        ],
+        "save_safetensors": True,
+    }
+    cfg.update(overrides)
+    return DictDefault(cfg)
+
+
+def _run_training(temp_dir: str, cfg: DictDefault, dump_path: Path) -> None:
+    """Write yaml + spawn 2-process training; plugin path goes via PYTHONPATH."""
+    Path(temp_dir).mkdir(parents=True, exist_ok=True)
+    with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
+        fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
+
+    env = os.environ | {
+        # Make the test-only plugin module importable in the subprocess.
+        "PYTHONPATH": (f"{AXOLOTL_ROOT}{os.pathsep}{os.environ.get('PYTHONPATH', '')}"),
+        "FP32_NORMS_DTYPE_DUMP_PATH": str(dump_path),
+    }
+    execute_subprocess_async(
+        [
+            "axolotl",
+            "train",
+            str(Path(temp_dir) / "config.yaml"),
+            "--num-processes",
+            "2",
+            "--main-process-port",
+            f"{get_torch_dist_unique_port()}",
+        ],
+        env=env,
+    )
+
+
+class TestFSDP2Fp32Norms:
+    """Verifies the fp32_norms FSDP2 path end-to-end across 2 GPUs."""
+
+    @require_torch_2_7_0
+    @pytest.mark.parametrize(
+        "cpu_ram_efficient_loading",
+        [False, True],
+        ids=["materialized-load", "cpu-ram-efficient-load"],
+    )
+    def test_norms_stay_fp32_under_fsdp2_bf16(
+        self, temp_dir, cpu_ram_efficient_loading
+    ):
+        """fp32_norms keeps RMSNorm params in fp32 while the rest stays bf16."""
+        dump_path = Path(temp_dir) / "dtype_capture.json"
+        cfg = _base_fp32_norms_config(
+            temp_dir,
+            cpu_ram_efficient_loading=cpu_ram_efficient_loading,
+        )
+        _run_training(temp_dir, cfg, dump_path)
+
+        # Training completed (no FSDP1-style flat-param dtype crash) AND the
+        # plugin captured dtypes after step 1.
+        assert dump_path.exists(), (
+            f"plugin did not dump dtype capture to {dump_path}; "
+            "training may have failed before step 1"
+        )
+
+        captured = json.loads(dump_path.read_text())
+        norms = captured["norms"]
+        non_norms = captured["non_norms"]
+
+        assert norms, "no norm params captured — matcher likely failed"
+        assert all(d == "float32" for d in norms.values()), (
+            "fp32_norms claim violated: at least one norm param is not fp32. "
+            f"Captured norm dtypes: {norms}"
+        )
+
+        # At least one non-norm param must be bf16. Without this check the
+        # test would pass on a globally-fp32 model that didn't shard anything.
+        non_norm_dtypes = set(non_norms.values())
+        assert "bfloat16" in non_norm_dtypes, (
+            "expected at least one non-norm param in bfloat16 (proves the two "
+            "policies are independent); got non-norm dtypes: "
+            f"{non_norm_dtypes}"
+        )
diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py
index b89c935228..432a7e9e5c 100644
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -56,7 +56,7 @@ def test_lora_ddp(self, temp_dir):
                     },
                 ],
                 "num_epochs": 1,
-                "max_steps": 2,
+                "max_steps": 20,
                 "micro_batch_size": 1,
                 "gradient_accumulation_steps": 2,
                 # "gradient_checkpointing": True,
@@ -69,6 +69,7 @@ def test_lora_ddp(self, temp_dir):
                 "use_tensorboard": True,
                 "bf16": True,
                 "save_first_step": False,
+                "seed": 42,
             }
         )
 
@@ -90,7 +91,7 @@ def test_lora_ddp(self, temp_dir):
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.8, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.parametrize(
diff --git a/tests/e2e/multigpu/test_tiled_mlp_fsdp2.py b/tests/e2e/multigpu/test_tiled_mlp_fsdp2.py
new file mode 100644
index 0000000000..decc5bf8fd
--- /dev/null
+++ b/tests/e2e/multigpu/test_tiled_mlp_fsdp2.py
@@ -0,0 +1,306 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
+"""
+FSDP2 + TiledMLP multi-rank correctness tests.
+
+Parity guard for the tiled MLP under FSDP2: tiled forward+backward
+produces gradients within bf16 tolerance of the un-tiled FSDP2
+reference. The companion fix in
+``axolotl.monkeypatch.tiled_mlp.base._defer_fsdp2_reshard`` is a
+defensive measure that wraps the tile loop in
+``FSDPModule.set_reshard_after_backward(False)`` — under the most
+common setups (FSDP2 wraps the decoder layer; the post-backward
+RegisterPostBackwardFunction fires only when the outer backward
+reaches the layer's input, not mid-tile) the reshard does not fire
+inside the tile loop, but the helper protects against setups where
+the tile loop would otherwise race with FSDP2's per-module reshard.
+
+Run with::
+
+    torchrun --nproc-per-node=2 -m pytest tests/e2e/multigpu/test_tiled_mlp_fsdp2.py
+
+On a 1-GPU executor the tests skip with a clear reason.
+"""
+
+import copy
+import os
+from types import SimpleNamespace
+
+import pytest
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+
+_TORCHRUN_LOCAL_RANK = os.environ.get("LOCAL_RANK")
+_TORCHRUN_WORLD_SIZE = int(os.environ.get("WORLD_SIZE", "1"))
+
+pytestmark = [
+    pytest.mark.skipif(
+        not torch.cuda.is_available(),
+        reason="CUDA not available",
+    ),
+    pytest.mark.skipif(
+        torch.cuda.device_count() < 2,
+        reason="Need >=2 GPUs for FSDP2 multi-rank tests",
+    ),
+    pytest.mark.skipif(
+        _TORCHRUN_LOCAL_RANK is None or _TORCHRUN_WORLD_SIZE < 2,
+        reason=(
+            "Multi-rank tests must be launched via "
+            "`torchrun --nproc-per-node=2 -m pytest <file>`"
+        ),
+    ),
+]
+
+
+# ──────────────────────────── Process group ──────────────────────────────
+
+
+@pytest.fixture(scope="module")
+def dist_pg():
+    """Initialize the default process group exactly once per worker."""
+    if not dist.is_initialized():
+        rank = int(os.environ["RANK"])
+        torch.cuda.set_device(rank % torch.cuda.device_count())
+        dist.init_process_group(backend="nccl")
+    yield
+    if dist.is_initialized():
+        dist.barrier()
+        dist.destroy_process_group()
+
+
+# ──────────────────────────── Helpers ────────────────────────────────────
+
+
+class TinyDenseMLP(nn.Module):
+    def __init__(self, hidden, intermediate, dtype=torch.bfloat16):
+        super().__init__()
+        self.gate_proj = nn.Linear(hidden, intermediate, bias=False, dtype=dtype)
+        self.up_proj = nn.Linear(hidden, intermediate, bias=False, dtype=dtype)
+        self.down_proj = nn.Linear(intermediate, hidden, bias=False, dtype=dtype)
+        self.act_fn = nn.SiLU()
+
+    def forward(self, x):
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+
+def _full_state(mod):
+    """Materialize an FSDP2 module to full (unsharded) parameters."""
+    from torch.distributed.tensor import DTensor
+
+    out = {}
+    for name, p in mod.named_parameters():
+        if isinstance(p, DTensor):
+            out[name] = p.full_tensor().detach().clone()
+        else:
+            out[name] = p.detach().clone()
+    return out
+
+
+def _full_grads(mod):
+    """Gather param grads to full (unsharded) tensors."""
+    from torch.distributed.tensor import DTensor
+
+    out = {}
+    for name, p in mod.named_parameters():
+        if p.grad is None:
+            continue
+        if isinstance(p.grad, DTensor):
+            out[name] = p.grad.full_tensor().detach().clone()
+        else:
+            out[name] = p.grad.detach().clone()
+    return out
+
+
+def _make_seeded_mlp(hidden, intermediate, dtype, device):
+    torch.manual_seed(42)
+    mlp = TinyDenseMLP(hidden, intermediate, dtype=dtype).to(device)
+    return mlp
+
+
+# ─────────────────────────── Dense regression guard ──────────────────────
+
+
+def _install_tiled_forward(module, shards):
+    """Bind a tiled-MLP forward at the instance level.
+
+    Mirrors what the patcher does in production: the FSDPModule's
+    ``__call__`` triggers FSDP2's pre-forward hooks (which unshard
+    parameters) before ``forward`` runs. By going through the wrapped
+    module's ``__call__`` rather than calling ``TiledMLP.apply`` directly
+    on sharded DTensor params, the tiling and FSDP2's parameter
+    materialization compose correctly.
+    """
+    from types import MethodType
+
+    from axolotl.monkeypatch.tiled_mlp.base import TiledMLP
+
+    original_forward = type(module).forward
+    module._compute_params = []  # type: ignore[attr-defined]
+
+    def tiled_forward(self, x):
+        if not self._compute_params:
+            self._compute_params = [p for p in self.parameters() if p.requires_grad]
+        return TiledMLP.apply(
+            original_forward,
+            self,
+            x,
+            shards,
+            self._compute_params,
+        )
+
+    module.forward = MethodType(tiled_forward, module)
+
+
+def test_fsdp2_tiled_dense_mlp_parity(dist_pg):
+    """FSDP2 + tiled MLP must match FSDP2 + un-tiled MLP within bf16 tolerance.
+
+    Wraps the MLP with ``fully_shard`` so it is itself an ``FSDPModule``
+    — this is the scenario most likely to hit the post-backward race
+    that ``_defer_fsdp2_reshard`` protects against (per-tile inner
+    backwards would otherwise fire the FSDPModule's post-backward hook
+    mid-loop). The test passes whether or not the helper is in place
+    on the current PyTorch (2.11) release; treat it as a parity guard
+    that will catch breakage if FSDP2 ever shortens its reshard timing.
+    """
+    from torch.distributed.fsdp import fully_shard
+
+    device = torch.device(f"cuda:{torch.cuda.current_device()}")
+    hidden, intermediate = 64, 128
+    seq = 64
+    dtype = torch.bfloat16
+
+    # Two identical MLPs — one wrapped with FSDP2 only, one wrapped with
+    # FSDP2 *and* run through TiledMLP. Same initial weights.
+    mlp_ref = _make_seeded_mlp(hidden, intermediate, dtype, device)
+    mlp_tile = copy.deepcopy(mlp_ref)
+    fully_shard(mlp_ref)
+    fully_shard(mlp_tile)
+    _install_tiled_forward(mlp_tile, shards=4)
+
+    torch.manual_seed(7 + dist.get_rank())
+    x = torch.randn(1, seq, hidden, device=device, dtype=dtype)
+    g = torch.randn(1, seq, hidden, device=device, dtype=dtype)
+
+    # Un-tiled reference
+    xr = x.clone().detach().requires_grad_(True)
+    yr = mlp_ref(xr)
+    yr.backward(g)
+    ref_grads = _full_grads(mlp_ref)
+    ref_dx = xr.grad.detach().clone()
+
+    # Tiled run — must not corrupt gradients on FSDP2.
+    xt = x.clone().detach().requires_grad_(True)
+    yt = mlp_tile(xt)
+    yt.backward(g)
+    tile_grads = _full_grads(mlp_tile)
+    tile_dx = xt.grad.detach().clone()
+
+    # Outputs match (this is just forward — should be tight).
+    assert torch.allclose(yr.detach(), yt.detach(), atol=1e-3, rtol=1e-2), (
+        f"FSDP2 forward mismatch max={((yr - yt).abs().max()).item()}"
+    )
+    # dX should match within bf16 tolerance.
+    assert torch.allclose(ref_dx, tile_dx, atol=1e-2, rtol=1e-2), (
+        f"FSDP2 dX mismatch max={((ref_dx - tile_dx).abs().max()).item()}"
+    )
+    # Param grads — the headline check for the reshard fix.
+    for name, gref in ref_grads.items():
+        gtile = tile_grads[name]
+        rel = (
+            (gref.float() - gtile.float()).norm() / (gref.float().norm() + 1e-6)
+        ).item()
+        assert rel < 5e-2, f"FSDP2 + tiled param-grad mismatch {name}: rel_err={rel}"
+
+
+# ─────────────────────── scattermoe-lora regression guard ────────────────
+
+
+def test_fsdp2_tiled_scattermoe_block_parity(dist_pg):
+    """FSDP2 + tiled ScatterMoEGatedMLP block parity guard.
+
+    Same shape of test as the dense case but routes through the
+    ScatterMoE forward. Skips if scattermoe_lora kernels are not
+    available in this env.
+    """
+    try:
+        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
+            ScatterMoEGatedMLP,
+        )
+    except ImportError:
+        pytest.skip("scattermoe_lora kernels not available")
+
+    pytest.importorskip("triton")
+
+    from torch.distributed.fsdp import fully_shard
+
+    device = torch.device(f"cuda:{torch.cuda.current_device()}")
+    hidden, intermediate = 64, 128
+    num_experts, top_k = 8, 2
+    seq = 64
+    dtype = torch.bfloat16
+
+    def _make_block():
+        torch.manual_seed(42)
+        block = ScatterMoEGatedMLP()
+        router = SimpleNamespace()
+        router.layer = nn.Linear(hidden, num_experts, bias=False, dtype=dtype).to(
+            device
+        )
+        router.top_k = top_k
+        router.num_experts = num_experts
+        block.router = router
+        in_w = nn.Parameter(
+            torch.randn(
+                num_experts, 2 * intermediate, hidden, dtype=dtype, device=device
+            )
+            * 0.02
+        )
+        out_w = nn.Parameter(
+            torch.randn(num_experts, hidden, intermediate, dtype=dtype, device=device)
+            * 0.02
+        )
+        block.input_linear = nn.Module()
+        block.input_linear.register_parameter("weight", in_w)
+        block.output_linear = nn.Module()
+        block.output_linear.register_parameter("weight", out_w)
+        block.activation = nn.SiLU()
+        return block
+
+    block_ref = _make_block()
+    block_tile = _make_block()
+    fully_shard(block_ref)
+    fully_shard(block_tile)
+    _install_tiled_forward(block_tile, shards=4)
+
+    torch.manual_seed(7 + dist.get_rank())
+    x = torch.randn(1, seq, hidden, device=device, dtype=dtype)
+    g = torch.randn(1, seq, hidden, device=device, dtype=dtype)
+
+    xr = x.clone().detach().requires_grad_(True)
+    yr = block_ref(xr)
+    yr.backward(g)
+    ref_grads = _full_grads(block_ref)
+    ref_dx = xr.grad.detach().clone()
+
+    xt = x.clone().detach().requires_grad_(True)
+    yt = block_tile(xt)
+    yt.backward(g)
+    tile_grads = _full_grads(block_tile)
+    tile_dx = xt.grad.detach().clone()
+
+    def _rel(a, b):
+        return ((a.float() - b.float()).norm() / (b.float().norm() + 1e-6)).item()
+
+    assert _rel(yt.detach(), yr.detach()) < 5e-2, (
+        f"FSDP2 + tiled scattermoe forward rel_err={_rel(yt, yr)}"
+    )
+    assert _rel(tile_dx, ref_dx) < 5e-2, (
+        f"FSDP2 + tiled scattermoe dX rel_err={_rel(tile_dx, ref_dx)}"
+    )
+    for name, gref in ref_grads.items():
+        if name not in tile_grads:
+            continue
+        rel = _rel(tile_grads[name], gref)
+        assert rel < 5e-2, f"FSDP2 + tiled scattermoe param-grad {name} rel_err={rel}"
diff --git a/tests/e2e/test_reward_model_smollm2.py b/tests/e2e/solo/test_reward_model_smollm2.py
similarity index 93%
rename from tests/e2e/test_reward_model_smollm2.py
rename to tests/e2e/solo/test_reward_model_smollm2.py
index 657756f5b6..e4240ab34f 100644
--- a/tests/e2e/test_reward_model_smollm2.py
+++ b/tests/e2e/solo/test_reward_model_smollm2.py
@@ -4,14 +4,12 @@
 
 import unittest
 
-import pytest
-
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
 from axolotl.utils.config import normalize_config, validate_config
 from axolotl.utils.dict import DictDefault
 
-from .utils import check_model_output_exists, check_tensorboard, with_temp_dir
+from ..utils import check_model_output_exists, check_tensorboard, with_temp_dir
 
 
 class TestRewardModelLoraSmolLM2(unittest.TestCase):
@@ -19,7 +17,6 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase):
     Test case for Llama reward models using LoRA
     """
 
-    @pytest.mark.skip(reason="FIXME, mostly underused functionality")
     @with_temp_dir
     def test_rm_lora(self, temp_dir):
         cfg = DictDefault(
diff --git a/tests/e2e/test_optimizers.py b/tests/e2e/test_optimizers.py
index 430bd73c5a..a28f5a2d3d 100644
--- a/tests/e2e/test_optimizers.py
+++ b/tests/e2e/test_optimizers.py
@@ -203,6 +203,48 @@ def test_dion(self, temp_dir):
         check_model_output_exists(temp_dir, cfg)
         assert "Dion" in trainer.optimizer.optimizer.__class__.__name__
 
+    @with_temp_dir
+    def test_q_galore_adamw8bit(self, temp_dir):
+        pytest.importorskip("q_galore_torch")
+        cfg = DictDefault(
+            {
+                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "model_type": "AutoModelForCausalLM",
+                "tokenizer_type": "AutoTokenizer",
+                "sequence_len": 1024,
+                "val_set_size": 0.0,
+                "special_tokens": {
+                    "pad_token": "<|endoftext|>",
+                },
+                "datasets": [
+                    {
+                        "path": "mhenrichsen/alpaca_2k_test",
+                        "type": "alpaca",
+                    },
+                ],
+                "num_epochs": 1,
+                "max_steps": 5,
+                "micro_batch_size": 1,
+                "gradient_accumulation_steps": 1,
+                "output_dir": temp_dir,
+                "learning_rate": 0.00001,
+                "optimizer": "q_galore_adamw8bit",
+                "bf16": True,
+                # Tiny rank/group_size so it fits SmolLM's hidden dim cleanly.
+                "qgalore_rank": 32,
+                "qgalore_update_proj_gap": 2,
+                "qgalore_proj_group_size": 64,
+                "lr_scheduler": "cosine",
+                "save_first_step": False,
+            }
+        )
+        cfg = validate_config(cfg)
+        normalize_config(cfg)
+        dataset_meta = load_datasets(cfg=cfg)
+        _, _, trainer = train(cfg=cfg, dataset_meta=dataset_meta)
+        check_model_output_exists(temp_dir, cfg)
+        assert "AdamW8bit" in trainer.optimizer.optimizer.__class__.__name__
+
     @with_temp_dir
     def test_fft_schedule_free_adamw(self, temp_dir):
         cfg = DictDefault(
diff --git a/tests/e2e/test_quantization.py b/tests/e2e/test_quantization.py
index 94102f6ebb..8cdf04ea05 100644
--- a/tests/e2e/test_quantization.py
+++ b/tests/e2e/test_quantization.py
@@ -480,6 +480,58 @@ def test_mxfp4_qat_then_ptq_save_pretrained(self, tmp_path):
         loaded_keys = set(loaded.state_dict().keys())
         assert original_keys == loaded_keys
 
+    @require_torch_2_8_0
+    def test_mxfp4_cross_process_load(self, tmp_path):
+        """A saved MX checkpoint loads in a fresh interpreter that never quantized.
+
+        The other tests call ``quantize_model`` in-process, which installs the
+        transformers init guard as a side effect, masking the real reload path.
+        Here we save, then load in a subprocess that only installs the guard the
+        way ``ModelLoader._apply_pre_model_load_setup`` does — no ``quantize_model``.
+        """
+        import subprocess
+        import sys
+        import textwrap
+
+        model = self._make_tiny_model()
+        save_dir = str(tmp_path / "mxfp4_xproc_model")
+        quantize_model(model, TorchAOQuantDType.mxfp4, 32)
+        save_quantized_model(model, save_dir)
+
+        script = textwrap.dedent(
+            f"""
+            import torch
+            from torch import nn
+            from transformers import AutoModelForCausalLM
+            # mirrors ModelLoader._apply_pre_model_load_setup (no quantize_model here)
+            from axolotl.utils.quantization import (
+                patch_transformers_skip_quantized_init,
+            )
+
+            patch_transformers_skip_quantized_init()
+            model = AutoModelForCausalLM.from_pretrained(
+                {save_dir!r}, dtype=torch.bfloat16
+            )
+            from torchao.prototype.mx_formats.mx_tensor import MXTensor
+            assert any(
+                isinstance(m.weight, MXTensor)
+                for m in model.modules()
+                if isinstance(m, nn.Linear)
+            ), "expected MXTensor weights after reload"
+            print("CROSS_PROCESS_LOAD_OK")
+            """
+        )
+        result = subprocess.run(
+            [sys.executable, "-c", script],
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+        assert "CROSS_PROCESS_LOAD_OK" in result.stdout, (
+            f"cross-process MX load failed:\nstdout={result.stdout}\n"
+            f"stderr={result.stderr[-2000:]}"
+        )
+
 
 class TestQuantizationCallback:
     """
diff --git a/tests/e2e/utils.py b/tests/e2e/utils.py
index 8306b72cea..f2c567eeb9 100644
--- a/tests/e2e/utils.py
+++ b/tests/e2e/utils.py
@@ -196,7 +196,9 @@ def check_tensorboard(
     else:
         assert df.value.values[-1] < lt_val, assertion_err
     if gt_zero:
-        assert df.value.values[-1] > 1e-5, "Expected loss to be greater than zero"
+        assert df.value.values[-1] > 1e-5, (
+            f"Expected {tag} to be greater than zero, got {df.value.values[-1]}"
+        )
 
 
 def check_tensorboard_loss_decreased(
diff --git a/tests/integrations/kernels/__init__.py b/tests/integrations/kernels/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/integrations/kernels/scattermoe_lora/__init__.py b/tests/integrations/kernels/scattermoe_lora/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/integrations/kernels/scattermoe_lora/bench_int64_kernel.py b/tests/integrations/kernels/scattermoe_lora/bench_int64_kernel.py
new file mode 100644
index 0000000000..d4cad1fb2a
--- /dev/null
+++ b/tests/integrations/kernels/scattermoe_lora/bench_int64_kernel.py
@@ -0,0 +1,229 @@
+#!/usr/bin/env python
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
+
+"""
+ScatterMoE — INT64_INDICES vs INT32 dense scatter2scatter benchmark.
+
+Times the dense ``kernels.ops.scatter2scatter`` at three representative
+shapes and reports ms/iter for both ``INT64_INDICES=False`` (int32 fast
+path) and ``INT64_INDICES=True`` (int64 safe path). The third shape is
+the previously-failing seq=512K / 16-shard config; at that scale the
+int32 path is incorrect (silent overflow corruption) so the int32 row
+is gated by the ``_SCATTER2SCATTER_INT32_LIMIT`` and reported as the
+chunked workaround's wall-clock instead (also for comparison against
+the chunking baseline that PR #3667 shipped).
+
+Run from the repo root:
+
+  python tests/integrations/kernels/scattermoe_lora/bench_int64_kernel.py
+
+A markdown summary is printed to stdout and written to
+``bench_int64_kernel_results.md`` next to this script.
+"""
+
+from __future__ import annotations
+
+import argparse
+import statistics
+import subprocess
+from pathlib import Path
+from typing import Callable
+
+import torch
+
+from axolotl.integrations.kernels.libs.scattermoe_lora.kernels.ops import (
+    scatter2scatter,
+)
+from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_experts import (
+    flatten_sort_count,
+)
+
+# Sufficient condition for int32 pointer arithmetic to overflow in the
+# scatter2scatter Triton kernel: ``L_scattered * y_dim >= 2**31``.
+_SCATTER2SCATTER_INT32_LIMIT = 2**31
+
+DEVICE = "cuda"
+DTYPE = torch.bfloat16
+
+
+def gpu_name() -> str:
+    try:
+        out = subprocess.check_output(["nvidia-smi", "-L"], text=True).strip()
+        first = out.splitlines()[0]
+        if ":" in first:
+            after_colon = first.split(":", 1)[1].strip()
+            return after_colon.split("(", 1)[0].strip()
+        return first
+    except Exception:
+        return torch.cuda.get_device_name(0)
+
+
+def _time_ms(fn: Callable[[], torch.Tensor], iters: int = 10, warmup: int = 3) -> float:
+    for _ in range(warmup):
+        fn()
+    torch.cuda.synchronize()
+    samples = []
+    for _ in range(iters):
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
+        start.record()
+        fn()
+        end.record()
+        torch.cuda.synchronize()
+        samples.append(start.elapsed_time(end))
+    return statistics.median(samples)
+
+
+def _build_inputs(
+    *, T: int, hidden: int, top_k: int, n: int, num_experts: int, seed: int
+):
+    torch.manual_seed(seed)
+    x = torch.randn(T, hidden, device=DEVICE, dtype=DTYPE)
+    W = torch.randn(num_experts, hidden, n, device=DEVICE, dtype=DTYPE) * 0.02
+    logits = torch.randn(T, num_experts, device=DEVICE)
+    _, top_idx = torch.topk(torch.softmax(logits, dim=-1), top_k, dim=-1)
+    sei, ssi, _ = flatten_sort_count(top_idx, num_experts)
+    return x, W, sei, ssi
+
+
+def _run_shape(name: str, *, T: int, hidden: int, top_k: int, n: int, num_experts: int):
+    x, W, sei, ssi = _build_inputs(
+        T=T, hidden=hidden, top_k=top_k, n=n, num_experts=num_experts, seed=42
+    )
+    L_scattered = sei.size(0)
+    out_elements = L_scattered * n
+    overflow = out_elements >= _SCATTER2SCATTER_INT32_LIMIT
+    auto_int64 = overflow  # the wrapper's auto-dispatch verdict
+
+    def call(int64_indices: bool):
+        return scatter2scatter(
+            X=x,
+            W=W,
+            sorted_expert_idxs=sei,
+            sorted_scattered_idxs=ssi,
+            k=top_k,
+            x_grouped=False,
+            y_grouped=True,
+            int64_indices=int64_indices,
+        )
+
+    # Warm both Triton variants (separate JITs per constexpr).
+    if overflow:
+        # int32 path is unsafe at overflow shapes (silent corruption); skip.
+        ms_i32 = None
+    else:
+        ms_i32 = _time_ms(lambda: call(False))
+    ms_i64 = _time_ms(lambda: call(True))
+
+    return {
+        "name": name,
+        "T": T,
+        "hidden": hidden,
+        "top_k": top_k,
+        "n": n,
+        "num_experts": num_experts,
+        "L_scattered": L_scattered,
+        "out_elements": out_elements,
+        "overflow": overflow,
+        "auto_int64": auto_int64,
+        "ms_i32": ms_i32,
+        "ms_i64": ms_i64,
+    }
+
+
+def _fmt(v):
+    if v is None:
+        return "—"
+    return f"{v:.3f}"
+
+
+def _markdown(rows, gpu_label: str) -> str:
+    lines = []
+    lines.append("# scatter2scatter INT64_INDICES bench")
+    lines.append("")
+    lines.append(f"GPU: **{gpu_label}**")
+    lines.append("")
+    lines.append("Median of 10 iters, 3 warmup. `top_k=8`, dtype=bf16, 128 experts.")
+    lines.append("")
+    lines.append(
+        "`auto_int64` is the wrapper's auto-dispatch verdict from "
+        "`_needs_int64_indices`. At overflow shapes the int32 path "
+        "is silently incorrect (the multiplication wraps mid-buffer), "
+        "so only the int64 timing is reported."
+    )
+    lines.append("")
+    lines.append(
+        "| Shape | T | L_scattered | out elems | auto_int64 | int32 ms | int64 ms | int64 vs int32 (%) |"
+    )
+    lines.append("|---|---|---|---|---|---|---|---|")
+    for r in rows:
+        if r["ms_i32"] is not None and r["ms_i64"] is not None:
+            pen = 100.0 * (r["ms_i64"] - r["ms_i32"]) / r["ms_i32"]
+            pen_s = f"{pen:+.1f}"
+        else:
+            pen_s = "—"
+        lines.append(
+            f"| {r['name']} | {r['T']} | {r['L_scattered']} | "
+            f"{r['out_elements']:.2e} | {str(r['auto_int64'])} | "
+            f"{_fmt(r['ms_i32'])} | {_fmt(r['ms_i64'])} | {pen_s} |"
+        )
+    lines.append("")
+    lines.append(
+        "Acceptance: ≤5% regression on the int32 fast path at "
+        "small/medium shapes (the auto-dispatch picks int32 there, so "
+        "this row characterises the JIT overhead of having an int64 "
+        "variant available)."
+    )
+    lines.append("")
+    return "\n".join(lines)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--out",
+        type=str,
+        default=None,
+        help="Output markdown path (default: alongside script)",
+    )
+    args = parser.parse_args()
+
+    # Three representative shapes per the task spec.
+    shapes = [
+        # name              T (tokens before top_k expansion), hidden, top_k, N (out), num_experts
+        dict(name="small", T=8_192, hidden=2048, top_k=8, n=2048, num_experts=128),
+        dict(name="medium", T=128_000, hidden=2048, top_k=8, n=2048, num_experts=128),
+        # Overflow shape: 524288 / 16 shards = 32768 tokens, top_k=8 -> L=262144,
+        # N=16384 (= 2*intermediate at the bench config) -> 2**32 elements.
+        dict(
+            name="overflow_524k_s16",
+            T=32_768,
+            hidden=2048,
+            top_k=8,
+            n=16_384,
+            num_experts=128,
+        ),
+    ]
+
+    rows = []
+    for s in shapes:
+        print(f"running {s['name']} ...", flush=True)
+        rows.append(_run_shape(**s))
+        torch.cuda.empty_cache()
+
+    label = gpu_name()
+    md = _markdown(rows, label)
+    print(md)
+
+    if args.out:
+        out_path = Path(args.out)
+    else:
+        out_path = Path(__file__).with_name("bench_int64_kernel_results.md")
+    out_path.write_text(md)
+    print(f"\nwrote {out_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/integrations/kernels/scattermoe_lora/bench_int64_kernel_results.md b/tests/integrations/kernels/scattermoe_lora/bench_int64_kernel_results.md
new file mode 100644
index 0000000000..3847039944
--- /dev/null
+++ b/tests/integrations/kernels/scattermoe_lora/bench_int64_kernel_results.md
@@ -0,0 +1,15 @@
+# scatter2scatter INT64_INDICES bench
+
+GPU: **NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition**
+
+Median of 10 iters, 3 warmup. `top_k=8`, dtype=bf16, 128 experts.
+
+`auto_int64` is the wrapper's auto-dispatch verdict from `_needs_int64_indices`. At overflow shapes the int32 path is silently incorrect (the multiplication wraps mid-buffer), so only the int64 timing is reported.
+
+| Shape | T | L_scattered | out elems | auto_int64 | int32 ms | int64 ms | int64 vs int32 (%) |
+|---|---|---|---|---|---|---|---|
+| small | 8192 | 65536 | 1.34e+08 | False | 2.699 | 2.704 | +0.2 |
+| medium | 128000 | 1024000 | 2.10e+09 | False | 40.126 | 40.790 | +1.7 |
+| overflow_524k_s16 | 32768 | 262144 | 4.29e+09 | True | — | 80.105 | — |
+
+Acceptance: ≤5% regression on the int32 fast path at small/medium shapes (the auto-dispatch picks int32 there, so this row characterises the JIT overhead of having an int64 variant available).
diff --git a/tests/integrations/kernels/scattermoe_lora/bench_mxfp4.py b/tests/integrations/kernels/scattermoe_lora/bench_mxfp4.py
new file mode 100644
index 0000000000..ebeaa589dd
--- /dev/null
+++ b/tests/integrations/kernels/scattermoe_lora/bench_mxfp4.py
@@ -0,0 +1,577 @@
+#!/usr/bin/env python
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
+
+"""
+ScatterMoE LoRA — MXFP4 forward + backward benchmark.
+
+Runs three configurations on a representative DeepSeek-V4-style MoE shape
+(E=128, K=2048, N=1024, top_k=8, batch×seq=4096) and reports tokens/s,
+peak GPU memory, and effective HBM bandwidth for each:
+
+  * **bf16 baseline**: full-precision bf16 experts, no MX.
+  * **Strategy A**: torchao MXTensor experts, selective dequant to bf16.
+  * **Strategy B**: torchao MXTensor experts, fused MX dequant in Triton.
+
+Run from the repo root:
+
+  python tests/integrations/kernels/scattermoe_lora/bench_mxfp4.py
+
+A markdown table is printed to stdout and written to
+``bench_mxfp4_results.md`` next to this script.
+"""
+
+from __future__ import annotations
+
+import argparse
+import math
+import subprocess
+from pathlib import Path
+from typing import Callable
+
+import torch
+from torchao.prototype.mx_formats.mx_tensor import MXTensor
+
+from axolotl.integrations.kernels.libs.scattermoe_lora.mx_weights import (
+    selective_mx_weights_fwd,
+)
+from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_experts import (
+    flatten_sort_count,
+)
+from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_linear_lora import (
+    parallel_linear_lora,
+)
+from axolotl.integrations.kernels.libs.scattermoe_lora.selective_dequant import (
+    get_active_experts,
+    remap_expert_indices,
+    selective_expert_weights,
+    selective_lora_weights,
+)
+
+DEVICE = "cuda"
+DTYPE = torch.bfloat16
+
+
+def gpu_name() -> str:
+    try:
+        out = subprocess.check_output(["nvidia-smi", "-L"], text=True).strip()
+        # First GPU line: "GPU 0: NAME (UUID: ...)"
+        first = out.splitlines()[0]
+        if ":" in first:
+            after_colon = first.split(":", 1)[1].strip()
+            return after_colon.split("(", 1)[0].strip()
+        return first
+    except Exception:
+        return torch.cuda.get_device_name(0)
+
+
+def gpu_hbm_bandwidth_gbps() -> float | None:
+    """Rough peak HBM BW for utilization %, looked up by name. ``None`` if
+    unknown — printed as N/A."""
+    name = gpu_name().lower()
+    # Approximate datasheet peaks (GB/s). Order matters — more-specific
+    # patterns first so a "rtx pro 6000 blackwell" doesn't match
+    # "rtx 6000 ada".
+    table = [
+        (("rtx", "6000", "blackwell"), 1792.0),
+        (("rtx", "6000", "ada"), 960.0),
+        (("rtx", "5090"), 1792.0),
+        (("rtx", "4090"), 1008.0),
+        (("h200",), 4800.0),
+        (("h100",), 3350.0),
+        (("a100",), 2039.0),
+        (("a40",), 696.0),
+        (("a6000",), 768.0),
+        (("l40",), 864.0),
+        (("l4",), 300.0),
+        (("b200",), 8000.0),
+        (("mi300x",), 5300.0),
+    ]
+    for keys, bw in table:
+        if all(k in name for k in keys):
+            return bw
+    return None
+
+
+@torch.no_grad()
+def _setup_bf16(E, K, N, top_k, M, rank):
+    torch.manual_seed(0)
+    W = torch.randn(E, N, K, device=DEVICE, dtype=DTYPE) * (1.0 / K**0.5)
+    W_kernel = W.transpose(2, 1).contiguous()  # [E, K, N]
+    return W, W_kernel
+
+
+def _setup_mx(W_natural, chunk: int = 8):
+    """Make a torchao MXFP4 ``MXTensor`` from the bf16 weight.
+
+    ``MXTensor.to_mx`` materializes an fp32 working tensor internally; for
+    large [E, N, K] weights this transient can spike setup-time GPU memory
+    well beyond the final quantized footprint. To keep the bench runnable
+    on a shared GPU, quantize ``chunk`` experts at a time and stitch the
+    qdata/scale shards into a single MXTensor.
+    """
+    if W_natural.shape[0] <= chunk:
+        return MXTensor.to_mx(
+            W_natural, elem_dtype=torch.float4_e2m1fn_x2, block_size=32
+        )
+    qdata_parts = []
+    scale_parts = []
+    template = None
+    for i in range(0, W_natural.shape[0], chunk):
+        piece = W_natural[i : i + chunk].contiguous()
+        mx_chunk = MXTensor.to_mx(
+            piece, elem_dtype=torch.float4_e2m1fn_x2, block_size=32
+        )
+        qdata_parts.append(mx_chunk.qdata)
+        scale_parts.append(mx_chunk.scale)
+        if template is None:
+            template = mx_chunk
+    qdata = torch.cat(qdata_parts, dim=0)
+    scale = torch.cat(scale_parts, dim=0)
+    assert template is not None  # set on the first loop iter (loop body runs ≥ once)
+    return MXTensor(
+        qdata,
+        scale,
+        template.elem_dtype,
+        template.block_size,
+        template.orig_dtype,
+        template.kernel_preference,
+        template.act_quant_kwargs,
+        template.is_swizzled_scales,
+    )
+
+
+def _routing(M, E, top_k, seed=1, mode="dense"):
+    """Generate token→expert routing.
+
+    ``mode="dense"`` uses per-token random logits; for moderate E and top_k
+    this leaves nearly every expert active and exercises the kernels' full-load
+    case. ``mode="sparse"`` injects a strong shared bias so the same handful of
+    experts dominates the topk across all tokens — modelling realistic MoE
+    routing where only a small fraction of experts is active per step.
+    ``mode="balanced"`` models a load-balance-regularized router (aux-loss /
+    z-loss trained): per-token logits = N(0, 1) noise + small per-expert
+    bias N(0, 0.5). At large M this yields approximately balanced expert
+    usage; at small M only a fraction of experts gets hit and which experts
+    are active varies with seed/M — i.e. the seqlen → active-expert-count
+    curve that drives the A-vs-B crossover.
+    """
+    torch.manual_seed(seed)
+    if mode == "dense":
+        logits = torch.randn(M, E, device=DEVICE)
+    elif mode == "sparse":
+        shared = torch.randn(E, device=DEVICE) * 5.0
+        noise = torch.randn(M, E, device=DEVICE) * 0.1
+        logits = shared.unsqueeze(0) + noise
+    elif mode == "balanced":
+        bias = torch.randn(E, device=DEVICE) * 0.5
+        noise = torch.randn(M, E, device=DEVICE)
+        logits = noise + bias.unsqueeze(0)
+    else:
+        raise ValueError(f"unknown routing mode: {mode}")
+    _, top_idx = torch.topk(torch.softmax(logits, dim=-1), top_k, dim=-1)
+    sei, ssi, eo = flatten_sort_count(top_idx, E)
+    return sei, ssi, eo, top_idx
+
+
+def _lora(E, K, N, rank, seed=2):
+    torch.manual_seed(seed)
+    A = torch.randn(rank * E, K, device=DEVICE, dtype=DTYPE) * 0.01
+    B = torch.randn(N, rank * E, device=DEVICE, dtype=DTYPE) * 0.01
+    return A, B
+
+
+class _MockExperts:
+    def __init__(self, p):
+        self.gate_up_proj = p
+
+
+# ---------------------------------------------------------------------------
+# Three benchmark runners — each takes a fresh `x` and returns (output, fn_grad)
+# ---------------------------------------------------------------------------
+
+
+# Runners reuse the same leaf tensors across timed iters (x, lora A/B) and
+# zero ``.grad`` to None at the top of each call. The previous per-iter
+# ``.clone()`` + ``requires_grad_(True)`` was setup cost — not kernel cost —
+# and biased the timing especially on small shapes. Gradient accumulation
+# is avoided by setting ``.grad = None`` (faster than ``.zero_()``), so the
+# autograd graph each iter is fresh but the leaf buffers are not reallocated.
+
+
+def make_runner_bf16(W_kernel, lora_A, lora_B, sei, ssi, eo, top_k, scaling):
+    A = lora_A.detach().clone().requires_grad_(True)
+    B = lora_B.detach().clone().requires_grad_(True)
+
+    def run(x):
+        x.grad = None
+        A.grad = None
+        B.grad = None
+        out = parallel_linear_lora(
+            x,
+            W_kernel,
+            top_k,
+            sei,
+            ssi,
+            eo,
+            lora_A=A,
+            lora_B=B,
+            scaling=scaling,
+            use_fused_dX=True,
+            use_fused_gather=True,
+        )
+        out.sum().backward()
+        return out
+
+    return run
+
+
+def make_runner_strategy_a(mx, lora_A, lora_B, sei, ssi, eo, top_k, scaling, E):
+    experts = _MockExperts(mx)
+    A = lora_A.detach().clone().requires_grad_(True)
+    B = lora_B.detach().clone().requires_grad_(True)
+
+    def run(x):
+        x.grad = None
+        A.grad = None
+        B.grad = None
+        active = get_active_experts(sei, E)
+        remapped, compact_off = remap_expert_indices(sei, eo, active, E)
+        W_compact = (
+            selective_expert_weights(experts, "gate_up_proj", active)
+            .transpose(2, 1)
+            .contiguous()
+        )
+        A_c, B_c = selective_lora_weights(A, B, active, E)
+        out = parallel_linear_lora(
+            x,
+            W_compact,
+            top_k,
+            remapped,
+            ssi,
+            compact_off,
+            lora_A=A_c,
+            lora_B=B_c,
+            scaling=scaling,
+            use_fused_dX=True,
+            use_fused_gather=True,
+        )
+        out.sum().backward()
+        return out
+
+    return run
+
+
+def make_runner_strategy_b(mx, lora_A, lora_B, sei, ssi, eo, top_k, scaling, E):
+    A = lora_A.detach().clone().requires_grad_(True)
+    B = lora_B.detach().clone().requires_grad_(True)
+
+    def run(x):
+        x.grad = None
+        A.grad = None
+        B.grad = None
+        active = get_active_experts(sei, E)
+        remapped, compact_off = remap_expert_indices(sei, eo, active, E)
+        mx_active = selective_mx_weights_fwd(mx, active)
+        A_c, B_c = selective_lora_weights(A, B, active, E)
+        out = parallel_linear_lora(
+            x,
+            mx_active,
+            top_k,
+            remapped,
+            ssi,
+            compact_off,
+            lora_A=A_c,
+            lora_B=B_c,
+            scaling=scaling,
+        )
+        out.sum().backward()
+        return out
+
+    return run
+
+
+# ---------------------------------------------------------------------------
+# Timing harness
+# ---------------------------------------------------------------------------
+
+
+def bench(fn: Callable, x_template: torch.Tensor, warmup: int, iters: int) -> dict:
+    torch.cuda.empty_cache()
+    torch.cuda.reset_peak_memory_stats()
+    # Allocate the input leaf tensor once outside the timed window. Runners
+    # reset ``.grad = None`` per iter; the underlying buffer is reused.
+    x = x_template.detach().clone().requires_grad_(True)
+    try:
+        # Warmup
+        for _ in range(warmup):
+            fn(x)
+        torch.cuda.synchronize()
+
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
+        start.record()
+        for _ in range(iters):
+            fn(x)
+        end.record()
+        torch.cuda.synchronize()
+    except torch.cuda.OutOfMemoryError:
+        torch.cuda.empty_cache()
+        return {"ms_per_iter": float("nan"), "peak_mem_mb": float("nan"), "oom": True}
+    elapsed_ms = start.elapsed_time(end)
+    peak_mem_mb = torch.cuda.max_memory_allocated() / 1024 / 1024
+    return {
+        "ms_per_iter": elapsed_ms / iters,
+        "peak_mem_mb": peak_mem_mb,
+        "oom": False,
+    }
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--E", type=int, default=128)
+    parser.add_argument("--K", type=int, default=2048)
+    parser.add_argument("--N", type=int, default=1024)
+    parser.add_argument("--top_k", type=int, default=8)
+    parser.add_argument("--M", type=int, default=4096, help="batch * seq")
+    parser.add_argument("--rank", type=int, default=16)
+    parser.add_argument("--warmup", type=int, default=10)
+    parser.add_argument("--iters", type=int, default=50)
+    parser.add_argument(
+        "--routing-mode",
+        choices=("dense", "sparse", "balanced"),
+        default="dense",
+        help=(
+            "dense: per-token random logits (~all experts active). "
+            "sparse: shared bias + small per-token noise so the same ~top_k "
+            "experts dominate routing across all tokens. "
+            "balanced: per-token N(0,1) noise + small N(0,0.5) per-expert bias "
+            "— mimics a load-balance-regularized router; active-expert count "
+            "grows with M."
+        ),
+    )
+    parser.add_argument(
+        "--M-sweep",
+        dest="M_sweep",
+        default=None,
+        help=(
+            "Comma-separated list of M values, e.g. '256,1024,4096,16384'. "
+            "When set, --M is ignored; the bench runs once per M with the "
+            "selected routing mode and emits a single combined section."
+        ),
+    )
+    parser.add_argument(
+        "--append",
+        action="store_true",
+        help="Append the new table to bench_mxfp4_results.md instead of overwriting it.",
+    )
+    parser.add_argument(
+        "--device",
+        default="cuda",
+        help="CUDA device, e.g. 'cuda', 'cuda:0', 'cuda:1'.",
+    )
+    args = parser.parse_args()
+
+    global DEVICE
+    DEVICE = args.device
+    torch.cuda.set_device(torch.device(DEVICE))
+
+    E, K, N, top_k, rank = args.E, args.K, args.N, args.top_k, args.rank
+
+    if args.M_sweep:
+        M_values = [int(s.strip()) for s in args.M_sweep.split(",") if s.strip()]
+    else:
+        M_values = [args.M]
+
+    print(f"GPU: {gpu_name()}")
+    print(
+        f"Shape: E={E}, K={K}, N={N}, top_k={top_k}, rank={rank}, "
+        f"M={M_values if args.M_sweep else M_values[0]}"
+    )
+    print(f"Iters: {args.warmup} warmup + {args.iters} timed")
+    print()
+
+    # Build dense bf16 weights + MX-quantize once — weights are independent of M.
+    W_natural, W_kernel = _setup_bf16(E, K, N, top_k, M_values[0], rank)
+    mx = _setup_mx(W_natural)
+    # W_natural is only used to build the MX tensor; W_kernel feeds bf16 paths.
+    # Free it eagerly so the dequant transient fits on memory-constrained GPUs.
+    del W_natural
+    torch.cuda.empty_cache()
+    lora_A, lora_B = _lora(E, K, N, rank)
+    scaling = 0.5
+    peak_bw = gpu_hbm_bandwidth_gbps()
+
+    per_M = []  # list of (M, e_active, results)
+    for M in M_values:
+        sei, ssi, eo, _top_idx = _routing(M, E, top_k, mode=args.routing_mode)
+        x = torch.randn(M, K, device=DEVICE, dtype=DTYPE)
+
+        # Estimate bytes read per iter for HBM BW utilization:
+        #   bf16 W:     E_active * K * N * 2
+        #   MX  W:      E_active * (K*N/2 + K*N/32)
+        # X is M*K*2 bytes. We ignore LoRA traffic (tiny relative to W).
+        num_tokens = M * top_k
+        e_active = int(get_active_experts(sei, E).numel())
+        bytes_bf16 = e_active * K * N * 2 + M * K * 2
+        bytes_mx = e_active * (K * N // 2 + K * N // 32) + M * K * 2
+
+        runners = {
+            "bf16 baseline": (
+                make_runner_bf16(
+                    W_kernel, lora_A, lora_B, sei, ssi, eo, top_k, scaling
+                ),
+                bytes_bf16,
+            ),
+            "Strategy A (selective dequant)": (
+                make_runner_strategy_a(
+                    mx, lora_A, lora_B, sei, ssi, eo, top_k, scaling, E
+                ),
+                bytes_bf16,  # post-dequant the kernel still reads bf16
+            ),
+            "Strategy B (fused MX)": (
+                make_runner_strategy_b(
+                    mx, lora_A, lora_B, sei, ssi, eo, top_k, scaling, E
+                ),
+                bytes_mx,
+            ),
+        }
+
+        results = []
+        for name, (fn, bytes_per_iter) in runners.items():
+            r = bench(fn, x, args.warmup, args.iters)
+            if r.get("oom"):
+                tps = float("nan")
+                bw = float("nan")
+                bw_pct = float("nan")
+            else:
+                tps = num_tokens / (r["ms_per_iter"] / 1000.0)
+                bw = (bytes_per_iter / 1e9) / (r["ms_per_iter"] / 1000.0)
+                bw_pct = (bw / peak_bw * 100.0) if peak_bw else float("nan")
+            results.append(
+                dict(
+                    name=name,
+                    ms_per_iter=r["ms_per_iter"],
+                    tokens_per_s=tps,
+                    peak_mem_mb=r["peak_mem_mb"],
+                    hbm_gbps=bw,
+                    hbm_pct=bw_pct,
+                    oom=r.get("oom", False),
+                )
+            )
+        per_M.append((M, e_active, results))
+
+    section_lines = []
+    if args.M_sweep:
+        section_lines.append(
+            f"## Routing mode: {args.routing_mode} — M sweep — {gpu_name()}"
+        )
+        section_lines.append("")
+        section_lines.append(f"- **GPU**: {gpu_name()}")
+        section_lines.append(
+            f"- **Base shape**: E={E}, K={K}, N={N}, top_k={top_k}, rank={rank}"
+        )
+        section_lines.append(f"- **M values**: {', '.join(str(m) for m in M_values)}")
+        section_lines.append(
+            f"- **Iters**: {args.warmup} warmup + {args.iters} timed, fwd+bwd per iter"
+        )
+        if peak_bw:
+            section_lines.append(f"- **HBM peak (datasheet)**: {peak_bw:.0f} GB/s")
+        section_lines.append("")
+        section_lines.append("### Summary (ms/iter, fwd+bwd)")
+        section_lines.append("")
+        section_lines.append(
+            "| M | active / E | bf16 ms | Strategy A ms | Strategy B ms | winner (A vs B) |"
+        )
+        section_lines.append("| ---: | ---: | ---: | ---: | ---: | :---: |")
+
+        def _fmt_ms(r):
+            return "OOM" if r["oom"] else f"{r['ms_per_iter']:.2f}"
+
+        for M, e_active, results in per_M:
+            by_name = {r["name"]: r for r in results}
+            a, b, bf = (
+                by_name["Strategy A (selective dequant)"],
+                by_name["Strategy B (fused MX)"],
+                by_name["bf16 baseline"],
+            )
+            if a["oom"] and b["oom"]:
+                winner = "—"
+            elif a["oom"]:
+                winner = "B"
+            elif b["oom"]:
+                winner = "A"
+            else:
+                winner = "A" if a["ms_per_iter"] < b["ms_per_iter"] else "B"
+            section_lines.append(
+                f"| {M} | {e_active}/{E} ({e_active / E:.2f}) | "
+                f"{_fmt_ms(bf)} | {_fmt_ms(a)} | {_fmt_ms(b)} | {winner} |"
+            )
+        section_lines.append("")
+        for M, e_active, results in per_M:
+            section_lines.append(
+                f"### M={M} (active experts = {e_active} / {E}, "
+                f"num_active/E = {e_active / E:.3f})"
+            )
+            section_lines.append("")
+            section_lines.append(
+                "| Config | ms/iter | tokens/s | peak mem (MB) | HBM GB/s | HBM % |"
+            )
+            section_lines.append("| --- | ---: | ---: | ---: | ---: | ---: |")
+            for r in results:
+                if r["oom"]:
+                    section_lines.append(
+                        f"| {r['name']} | OOM | OOM | OOM | OOM | OOM |"
+                    )
+                    continue
+                hbm_pct = (
+                    f"{r['hbm_pct']:.1f}" if not math.isnan(r["hbm_pct"]) else "N/A"
+                )
+                section_lines.append(
+                    f"| {r['name']} | {r['ms_per_iter']:.2f} | "
+                    f"{r['tokens_per_s']:.0f} | {r['peak_mem_mb']:.1f} | "
+                    f"{r['hbm_gbps']:.1f} | {hbm_pct} |"
+                )
+            section_lines.append("")
+    else:
+        M, e_active, results = per_M[0]
+        section_lines.append(f"## Routing mode: {args.routing_mode} — {gpu_name()}")
+        section_lines.append("")
+        section_lines.append(f"- **GPU**: {gpu_name()}")
+        section_lines.append(
+            f"- **Shape**: E={E}, K={K}, N={N}, top_k={top_k}, M={M}, rank={rank} "
+            f"(active experts = {e_active})"
+        )
+        section_lines.append(
+            f"- **Iters**: {args.warmup} warmup + {args.iters} timed, fwd+bwd per iter"
+        )
+        if peak_bw:
+            section_lines.append(f"- **HBM peak (datasheet)**: {peak_bw:.0f} GB/s")
+        section_lines.append("")
+        section_lines.append(
+            "| Config | ms/iter | tokens/s | peak mem (MB) | HBM GB/s | HBM % |"
+        )
+        section_lines.append("| --- | ---: | ---: | ---: | ---: | ---: |")
+        for r in results:
+            hbm_pct = f"{r['hbm_pct']:.1f}" if not math.isnan(r["hbm_pct"]) else "N/A"
+            section_lines.append(
+                f"| {r['name']} | {r['ms_per_iter']:.2f} | {r['tokens_per_s']:.0f} | "
+                f"{r['peak_mem_mb']:.1f} | {r['hbm_gbps']:.1f} | {hbm_pct} |"
+            )
+    section_md = "\n".join(section_lines).rstrip() + "\n"
+
+    out_path = Path(__file__).resolve().parent / "bench_mxfp4_results.md"
+    if args.append and out_path.exists():
+        existing = out_path.read_text().rstrip() + "\n\n"
+        md = existing + section_md
+    else:
+        md = "# ScatterMoE LoRA — MXFP4 benchmark\n\n" + section_md
+
+    print(section_md)
+    out_path.write_text(md)
+    print(f"\nResults written to {out_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/integrations/kernels/scattermoe_lora/bench_mxfp4_results.md b/tests/integrations/kernels/scattermoe_lora/bench_mxfp4_results.md
new file mode 100644
index 0000000000..4df526f6fb
--- /dev/null
+++ b/tests/integrations/kernels/scattermoe_lora/bench_mxfp4_results.md
@@ -0,0 +1,106 @@
+# ScatterMoE LoRA — MXFP4 benchmark
+
+## Routing mode: dense — NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition
+
+- **GPU**: NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition
+- **Shape**: E=128, K=2048, N=1024, top_k=8, M=4096, rank=16 (active experts = 128)
+- **Iters**: 10 warmup + 50 timed, fwd+bwd per iter
+- **HBM peak (datasheet)**: 1792 GB/s
+
+| Config | ms/iter | tokens/s | peak mem (MB) | HBM GB/s | HBM % |
+| --- | ---: | ---: | ---: | ---: | ---: |
+| bf16 baseline | 5.25 | 6244998 | 1252.8 | 105.5 | 5.9 |
+| Strategy A (selective dequant) | 30.57 | 1071778 | 8557.3 | 18.1 | 1.0 |
+| Strategy B (fused MX) | 12.24 | 2677582 | 1425.3 | 13.0 | 0.7 |
+
+## Routing mode: sparse — NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition
+
+- **GPU**: NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition
+- **Shape**: E=256, K=2048, N=1024, top_k=8, M=4096, rank=16 (active experts = 10)
+- **Iters**: 10 warmup + 50 timed, fwd+bwd per iter
+- **HBM peak (datasheet)**: 1792 GB/s
+
+| Config | ms/iter | tokens/s | peak mem (MB) | HBM GB/s | HBM % |
+| --- | ---: | ---: | ---: | ---: | ---: |
+| bf16 baseline | 6.55 | 5006027 | 1960.8 | 9.0 | 0.5 |
+| Strategy A (selective dequant) | 5.75 | 5695789 | 2059.9 | 10.2 | 0.6 |
+| Strategy B (fused MX) | 8.95 | 3661270 | 1997.8 | 3.1 | 0.2 |
+
+## Routing mode: balanced — M sweep — NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition
+
+- **GPU**: NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition
+- **Base shape**: E=256, K=2048, N=1024, top_k=8, rank=16
+- **M values**: 256, 1024, 4096, 16384
+- **Iters**: 10 warmup + 50 timed, fwd+bwd per iter
+- **HBM peak (datasheet)**: 1792 GB/s
+
+### Summary (ms/iter, fwd+bwd)
+
+| M | active / E | bf16 ms | Strategy A ms | Strategy B ms | winner (A vs B) |
+| ---: | ---: | ---: | ---: | ---: | :---: |
+| 256 | 215/256 (0.84) | 2.99 | OOM | 8.24 | B |
+| 1024 | 251/256 (0.98) | 3.43 | OOM | 10.74 | B |
+| 4096 | 255/256 (1.00) | 6.56 | OOM | 16.50 | B |
+| 16384 | 256/256 (1.00) | 24.15 | OOM | 46.56 | B |
+
+### M=256 (active experts = 215 / 256, num_active/E = 0.840)
+
+| Config | ms/iter | tokens/s | peak mem (MB) | HBM GB/s | HBM % |
+| --- | ---: | ---: | ---: | ---: | ---: |
+| bf16 baseline | 2.99 | 685596 | 1686.0 | 302.2 | 16.9 |
+| Strategy A (selective dequant) | OOM | OOM | OOM | OOM | OOM |
+| Strategy B (fused MX) | 8.24 | 248639 | 1954.9 | 29.2 | 1.6 |
+
+### M=1024 (active experts = 251 / 256, num_active/E = 0.980)
+
+| Config | ms/iter | tokens/s | peak mem (MB) | HBM GB/s | HBM % |
+| --- | ---: | ---: | ---: | ---: | ---: |
+| bf16 baseline | 3.43 | 2389143 | 1744.2 | 308.3 | 17.2 |
+| Strategy A (selective dequant) | OOM | OOM | OOM | OOM | OOM |
+| Strategy B (fused MX) | 10.74 | 762567 | 2058.1 | 26.4 | 1.5 |
+
+### M=4096 (active experts = 255 / 256, num_active/E = 0.996)
+
+| Config | ms/iter | tokens/s | peak mem (MB) | HBM GB/s | HBM % |
+| --- | ---: | ---: | ---: | ---: | ---: |
+| bf16 baseline | 6.56 | 4994760 | 1960.8 | 165.6 | 9.2 |
+| Strategy A (selective dequant) | OOM | OOM | OOM | OOM | OOM |
+| Strategy B (fused MX) | 16.50 | 1985884 | 2280.0 | 18.2 | 1.0 |
+
+### M=16384 (active experts = 256 / 256, num_active/E = 1.000)
+
+| Config | ms/iter | tokens/s | peak mem (MB) | HBM GB/s | HBM % |
+| --- | ---: | ---: | ---: | ---: | ---: |
+| bf16 baseline | 24.15 | 5427073 | 2827.0 | 47.2 | 2.6 |
+| Strategy A (selective dequant) | OOM | OOM | OOM | OOM | OOM |
+| Strategy B (fused MX) | 46.56 | 2814943 | 3149.0 | 7.6 | 0.4 |
+
+### Notes
+
+- **Strategy A OOMs at all M** under load-balanced routing at E=256 because
+  the torchao MXTensor dequant path materializes several full-shape fp32/int32
+  unpack buffers (~12 GiB combined for [256, 1024, 2048] at fp4 → fp32) while
+  vLLM colocated on this workstation pins ~88 GB of HBM, leaving only ~14 GB
+  free. Extrapolating from the dense E=128 case above (Strategy A peak
+  ~8.6 GB at 128 active experts), the E=256 / 256-active dequant peak would
+  be ~17 GB — over the available headroom.
+- **Active-expert count is essentially E at every sampled M.** Under a
+  load-balance-regularized router (per-token N(0,1) noise + N(0,0.5) per-expert
+  bias), `E[active] ≈ E · (1 − (1 − top_k/E)^M)`. With E=256 / top_k=8 this
+  yields ≥ 215 unique experts even at M=256 and saturates at 256 by M ≈ 16K.
+  Balanced routing therefore does **not** generate a low-active regime at
+  these token counts — i.e. the A-vs-B crossover does not appear in this
+  sweep; B wins by default because A does not fit.
+- **B vs bf16:** Strategy B is consistently 1.9–2.9× slower than the bf16
+  baseline (similar to the dense E=128 ratio of ~2.3×). HBM utilization for
+  both is modest (B 0.4–1.6 %, bf16 2.6–17.2 %), suggesting the kernels are
+  compute- or scheduling-bound for these shapes, not bandwidth-bound.
+- **Where the A-vs-B crossover lives, by theory:** Strategy A is preferred
+  when `num_active / E` is small enough that the dequant cost is offset by
+  the cheaper bf16 matmul — the prior `sparse` row (10/256 active, A=5.75 ms
+  vs B=8.95 ms) sits in that regime. Strategy B is preferred near
+  `num_active / E ≈ 1`, where dequant of all experts dominates. The threshold
+  between the two — somewhere in the 10/256 to 215/256 band — is **not
+  observable from the balanced-router setting**; eliciting it would need an
+  M smaller than 256, a synthetic deliberately-sparse router, or freeing the
+  vLLM GPU and rerunning at E=256.
diff --git a/tests/integrations/kernels/scattermoe_lora/conftest.py b/tests/integrations/kernels/scattermoe_lora/conftest.py
new file mode 100644
index 0000000000..90e18e6a93
--- /dev/null
+++ b/tests/integrations/kernels/scattermoe_lora/conftest.py
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
+
+"""Treat CUDA OOM as a skip for tests in this directory.
+
+When the suite runs under ``pytest-xdist``, multiple workers contend for the
+same physical GPU's memory budget. A test that fits comfortably in isolation
+can OOM purely because peer workers are already holding most of VRAM. That's
+an environmental race, not a code defect, so converting it to a skip keeps
+mixed-GPU CI green without masking real regressions (a real correctness bug
+surfaces as an assert/exception, not as ``torch.OutOfMemoryError``).
+
+We hook ``pytest_runtest_call`` rather than using an autouse fixture because
+pytest captures the test exception before re-entering the fixture's
+generator — the fixture's ``try/except`` around ``yield`` never sees it.
+"""
+
+from __future__ import annotations
+
+import gc
+
+import pytest
+import torch
+
+
+def _cuda_oom_types() -> tuple[type[BaseException], ...]:
+    types: list[type[BaseException]] = []
+    if hasattr(torch, "OutOfMemoryError"):
+        types.append(torch.OutOfMemoryError)
+    cuda_oom = getattr(torch.cuda, "OutOfMemoryError", None)
+    if cuda_oom is not None and cuda_oom not in types:
+        types.append(cuda_oom)
+    return tuple(types) or (RuntimeError,)
+
+
+_OOM = _cuda_oom_types()
+
+
+@pytest.hookimpl(hookwrapper=True)
+def pytest_runtest_call(item):
+    outcome = yield
+    excinfo = outcome.excinfo
+    if excinfo is None:
+        return
+    exc_val = excinfo[1]
+    if isinstance(exc_val, _OOM):
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        outcome.force_exception(
+            pytest.skip.Exception(
+                f"skipping on CUDA OOM (likely xdist worker contention): {exc_val}",
+                _use_item_location=True,
+            )
+        )
diff --git a/tests/integrations/kernels/scattermoe_lora/test_mxfp4_expert_weights.py b/tests/integrations/kernels/scattermoe_lora/test_mxfp4_expert_weights.py
new file mode 100644
index 0000000000..0f147beef3
--- /dev/null
+++ b/tests/integrations/kernels/scattermoe_lora/test_mxfp4_expert_weights.py
@@ -0,0 +1,567 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
+
+"""
+Correctness tests for MXFP4 expert weight support in ScatterMoE LoRA.
+
+Validates both strategies against a bf16-dequantized reference:
+
+  Strategy A — selective dequant:
+    The kernel runs on the dequantized [num_active, K, N] bf16 buffer,
+    so outputs must be bitwise identical to the baseline that supplies
+    the same bf16 weights directly.
+
+  Strategy B — fused Triton (when enabled):
+    The kernel unpacks MXFP4 + applies E8M0 scales in its K-loop. Output
+    differs from the bf16 reference by MX-rounding-tolerance only.
+
+Shapes covered:
+  - small:           [E=8, K=128, N=256], M=16, top_k=2, rank=8
+  - representative:  [E=32, K=2048, N=1024], M=64, top_k=4, rank=16
+"""
+
+import pytest
+import torch
+
+from axolotl.integrations.kernels.libs.scattermoe_lora.mx_weights import (
+    selective_mx_weights_fwd,
+)
+from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_experts import (
+    flatten_sort_count,
+)
+from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_linear_lora import (
+    parallel_linear_lora,
+)
+from axolotl.integrations.kernels.libs.scattermoe_lora.selective_dequant import (
+    get_active_experts,
+    is_mxfp4_param,
+    remap_expert_indices,
+    selective_expert_weights,
+    selective_lora_weights,
+)
+
+torchao = pytest.importorskip("torchao")
+from torchao.prototype.mx_formats.mx_tensor import MXTensor  # noqa: E402
+
+DEVICE = "cuda"
+DTYPE = torch.bfloat16
+
+pytestmark = pytest.mark.skipif(
+    not torch.cuda.is_available(), reason="CUDA required for MX kernels"
+)
+
+
+SHAPES = [
+    # (E, K, N, M, top_k, R, seed)
+    pytest.param(8, 128, 256, 16, 2, 8, 0, id="small"),
+    pytest.param(32, 2048, 1024, 64, 4, 16, 1, id="representative"),
+]
+
+# Per-shape Strategy-B tolerances. Forward outputs accumulate K dot-products
+# in fp32 then cast to bf16, so they stay within a few ULPs of the bf16
+# baseline. The dX path reduces over N (which is typically larger than K and
+# uses a different MMA tile layout than the bf16 reference), so we apply a
+# looser ULP-aware tolerance there. These are still tight compared to
+# torchao's own bf16 vs fp32 GEMM noise.
+_STRATEGY_B_FWD_TOL = {
+    "small": dict(atol=2e-3, rtol=2e-3),
+    "representative": dict(atol=1e-2, rtol=5e-3),
+}
+# dX tolerance: ~1 bf16 ULP at the typical output magnitude (rtol dominates;
+# atol caps near-zero entries where MMA-reordering manifests as full ULP).
+_STRATEGY_B_DX_TOL = {
+    "small": dict(atol=0.5, rtol=2e-2),
+    "representative": dict(atol=2.0, rtol=3e-2),
+}
+# dA / dB tolerance: the fused dA/dB kernel accumulates via atomic_add from
+# multiple N-block programs per expert, and the number of in-flight programs
+# differs between the full-E baseline and the compact-active MX path —
+# atomic ordering then introduces bf16 ULP-scale noise. Looser than the
+# forward bound because the gradients integrate over both M and N.
+_STRATEGY_B_LORA_GRAD_TOL = {
+    "small": dict(atol=2e-2, rtol=2e-2),
+    "representative": dict(atol=2e-1, rtol=3e-2),
+}
+
+
+def _tol_for_shape(K, *, dx: bool = False, lora_grad: bool = False):
+    if lora_grad:
+        table = _STRATEGY_B_LORA_GRAD_TOL
+    elif dx:
+        table = _STRATEGY_B_DX_TOL
+    else:
+        table = _STRATEGY_B_FWD_TOL
+    return table["small"] if K <= 128 else table["representative"]
+
+
+def _make_mxfp4_weights(E, K, N, seed):
+    """Build a `[E, N, K]` MXFP4 ``MXTensor`` (block axis = K, the contraction
+    axis). Returns (mx, W_ref_bf16) — the bf16 reference is the dequantization
+    of the full MX tensor so Strategy A can hit bitwise equality."""
+    torch.manual_seed(seed)
+    # Natural axolotl storage is [E, N, K] where K is the contraction axis;
+    # `experts.gate_up_proj.transpose(2, 1)` then yields [E, K, N] for the kernel.
+    W_dense = torch.randn(E, N, K, device=DEVICE, dtype=DTYPE)
+    mx = MXTensor.to_mx(W_dense, elem_dtype=torch.float4_e2m1fn_x2, block_size=32)
+    W_ref = mx.dequantize(DTYPE).contiguous()
+    return mx, W_ref
+
+
+def _setup_routing_and_lora(E, K, N, M, top_k, R, seed):
+    torch.manual_seed(seed + 100)
+    x = torch.randn(M, K, device=DEVICE, dtype=DTYPE)
+    lora_A = torch.randn(R * E, K, device=DEVICE, dtype=DTYPE) * 0.01
+    lora_B = torch.randn(N, R * E, device=DEVICE, dtype=DTYPE) * 0.01
+    logits = torch.randn(M, E, device=DEVICE)
+    _, top_idx = torch.topk(torch.softmax(logits, dim=-1), top_k, dim=-1)
+    sei, ssi, eo = flatten_sort_count(top_idx, E)
+    return x, lora_A, lora_B, sei, ssi, eo
+
+
+class _MockExperts:
+    """Bare object exposing ``gate_up_proj`` so `selective_expert_weights`
+    can branch on it."""
+
+    def __init__(self, mx_param, num_experts):
+        self.gate_up_proj = mx_param
+        self.num_experts = num_experts
+
+
+def _run_baseline(
+    W_ref,
+    x,
+    lora_A,
+    lora_B,
+    scaling,
+    sei,
+    ssi,
+    eo,
+    top_k,
+    *,
+    use_fused_dX: bool = False,
+    use_fused_gather: bool = False,
+):
+    """Full-E bf16 baseline: dense weights, full LoRA, full expert indices."""
+    W_kernel = W_ref.transpose(2, 1).contiguous()  # [E, K, N]
+    return parallel_linear_lora(
+        x,
+        W_kernel,
+        top_k,
+        sei,
+        ssi,
+        eo,
+        lora_A=lora_A,
+        lora_B=lora_B,
+        scaling=scaling,
+        use_fused_dX=use_fused_dX,
+        use_fused_gather=use_fused_gather,
+    )
+
+
+def _run_strategy_a(mx, x, lora_A, lora_B, scaling, sei, ssi, eo, top_k, E):
+    """Strategy A: selective dequant via MXTensor branch in
+    `selective_expert_weights`. Compact weights + remapped indices."""
+    experts = _MockExperts(mx, E)
+    active = get_active_experts(sei, E)
+    remapped, compact_offsets = remap_expert_indices(sei, eo, active, E)
+    W_compact = (
+        selective_expert_weights(experts, "gate_up_proj", active)
+        .transpose(2, 1)
+        .contiguous()
+    )  # [num_active, K, N]
+    A_compact, B_compact = selective_lora_weights(lora_A, lora_B, active, E)
+    return parallel_linear_lora(
+        x,
+        W_compact,
+        top_k,
+        remapped,
+        ssi,
+        compact_offsets,
+        lora_A=A_compact,
+        lora_B=B_compact,
+        scaling=scaling,
+    ), active
+
+
+# ─── Strategy A — bitwise identity vs bf16 baseline ───────────────────────────
+
+
+@pytest.mark.parametrize("E,K,N,M,top_k,R,seed", SHAPES)
+def test_strategy_a_forward_matches_bf16(E, K, N, M, top_k, R, seed):
+    mx, W_ref = _make_mxfp4_weights(E, K, N, seed)
+    assert is_mxfp4_param(mx)
+    x, lora_A, lora_B, sei, ssi, eo = _setup_routing_and_lora(
+        E, K, N, M, top_k, R, seed
+    )
+    scaling = 0.5
+
+    out_baseline = _run_baseline(W_ref, x, lora_A, lora_B, scaling, sei, ssi, eo, top_k)
+    out_a, _ = _run_strategy_a(mx, x, lora_A, lora_B, scaling, sei, ssi, eo, top_k, E)
+
+    assert out_baseline.shape == out_a.shape
+    assert torch.equal(out_baseline, out_a), (
+        f"Strategy A forward must match bf16 baseline bitwise. "
+        f"max abs diff = {(out_baseline - out_a).abs().max().item()}"
+    )
+
+
+@pytest.mark.parametrize("E,K,N,M,top_k,R,seed", SHAPES)
+def test_strategy_a_backward_matches_bf16(E, K, N, M, top_k, R, seed):
+    """Forward + backward parity. dX must be bitwise identical; the LoRA
+    grads dA/dB are compared on the active expert slices only (the full
+    LoRA tensors differ in shape between baseline and compact paths)."""
+    mx, W_ref = _make_mxfp4_weights(E, K, N, seed)
+    x_base, lora_A_base, lora_B_base, sei, ssi, eo = _setup_routing_and_lora(
+        E, K, N, M, top_k, R, seed
+    )
+    scaling = 0.5
+
+    # Baseline backward
+    x_b = x_base.detach().clone().requires_grad_(True)
+    A_b = lora_A_base.detach().clone().requires_grad_(True)
+    B_b = lora_B_base.detach().clone().requires_grad_(True)
+    out_b = _run_baseline(W_ref, x_b, A_b, B_b, scaling, sei, ssi, eo, top_k)
+    grad_out = torch.randn_like(out_b)
+    out_b.backward(grad_out)
+
+    # Strategy A backward
+    x_a = x_base.detach().clone().requires_grad_(True)
+    experts = _MockExperts(mx, E)
+    active = get_active_experts(sei, E)
+    remapped, compact_offsets = remap_expert_indices(sei, eo, active, E)
+    W_compact = (
+        selective_expert_weights(experts, "gate_up_proj", active)
+        .transpose(2, 1)
+        .contiguous()
+    )
+    A_full = lora_A_base.detach().clone().requires_grad_(True)
+    B_full = lora_B_base.detach().clone().requires_grad_(True)
+    A_compact, B_compact = selective_lora_weights(A_full, B_full, active, E)
+    out_a = parallel_linear_lora(
+        x_a,
+        W_compact,
+        top_k,
+        remapped,
+        ssi,
+        compact_offsets,
+        lora_A=A_compact,
+        lora_B=B_compact,
+        scaling=scaling,
+    )
+    out_a.backward(grad_out)
+
+    # Forward parity (Strategy A contract: bitwise identical to bf16 baseline).
+    # Asserted here so a forward bug that produces a constant offset (and
+    # therefore zero gradient delta) doesn't slip past the bwd-only checks.
+    assert torch.equal(out_b, out_a), (
+        f"forward mismatch (Strategy A): max abs diff = "
+        f"{(out_b - out_a).abs().max().item()}"
+    )
+
+    # dX: bitwise identical
+    assert torch.equal(x_b.grad, x_a.grad), (
+        f"dX mismatch (Strategy A): max abs diff = "
+        f"{(x_b.grad - x_a.grad).abs().max().item()}"
+    )
+
+    # dA / dB — gather active slices from the baseline full grads and compare
+    row_idx = (
+        active.long()[:, None] * R + torch.arange(R, device=DEVICE)[None, :]
+    ).reshape(-1)
+    dA_b_active = A_b.grad[row_idx]
+    dB_b_active = B_b.grad[:, row_idx]
+
+    # A_compact is a view (advanced indexing produces a copy, so the grad lands
+    # on the full lora_A via the slice). torch.autograd flows back through
+    # selective_lora_weights, so A_full.grad has gradient on rows for active
+    # experts only.
+    dA_a_active = A_full.grad[row_idx]
+    dB_a_active = B_full.grad[:, row_idx]
+
+    assert torch.equal(dA_b_active, dA_a_active), (
+        f"dA active slice mismatch: max diff = "
+        f"{(dA_b_active - dA_a_active).abs().max().item()}"
+    )
+    assert torch.equal(dB_b_active, dB_a_active), (
+        f"dB active slice mismatch: max diff = "
+        f"{(dB_b_active - dB_a_active).abs().max().item()}"
+    )
+
+
+# ─── Strategy A — backward through fused dX/gather paths ──────────────────────
+
+
+@pytest.mark.parametrize("use_fused_dX", [False, True])
+@pytest.mark.parametrize("use_fused_gather", [False, True])
+def test_strategy_a_backward_fused_variants(use_fused_dX, use_fused_gather):
+    """Strategy A must match baseline across all four fused-bwd flag
+    combinations exercised in production by ``HFScatterMoEGatedMLP``.
+    Asserts parity on dX, dA, and dB (active expert slice for dA/dB)."""
+    E, K, N, M, top_k, R = 8, 128, 256, 16, 2, 8
+    mx, W_ref = _make_mxfp4_weights(E, K, N, seed=7)
+    x_base, lora_A_base, lora_B_base, sei, ssi, eo = _setup_routing_and_lora(
+        E, K, N, M, top_k, R, seed=7
+    )
+    scaling = 0.25
+
+    def run(W_kernel, sei_, eo_, k_, lora_A_in, lora_B_in, grad_out):
+        x_g = x_base.detach().clone().requires_grad_(True)
+        A_g = lora_A_in.detach().clone().requires_grad_(True)
+        B_g = lora_B_in.detach().clone().requires_grad_(True)
+        out = parallel_linear_lora(
+            x_g,
+            W_kernel,
+            k_,
+            sei_,
+            ssi,
+            eo_,
+            lora_A=A_g,
+            lora_B=B_g,
+            scaling=scaling,
+            use_fused_dX=use_fused_dX,
+            use_fused_gather=use_fused_gather,
+        )
+        out.backward(grad_out)
+        return out, x_g.grad, A_g.grad, B_g.grad
+
+    # Non-trivial grad (a constant grad zeros out cross-token differences in
+    # the fused-gather accumulation, which can mask reordering bugs).
+    torch.manual_seed(7)
+    W_baseline = W_ref.transpose(2, 1).contiguous()
+    # Forward once on baseline shape to size grad_out.
+    out_shape_probe = parallel_linear_lora(
+        x_base,
+        W_baseline,
+        top_k,
+        sei,
+        ssi,
+        eo,
+        lora_A=lora_A_base,
+        lora_B=lora_B_base,
+        scaling=scaling,
+    )
+    grad_out = torch.randn_like(out_shape_probe) * 0.1
+
+    out_b, dx_b, dA_b, dB_b = run(
+        W_baseline,
+        sei,
+        eo,
+        top_k,
+        lora_A_base,
+        lora_B_base,
+        grad_out,
+    )
+
+    experts = _MockExperts(mx, E)
+    active = get_active_experts(sei, E)
+    remapped, compact_offsets = remap_expert_indices(sei, eo, active, E)
+    W_compact = (
+        selective_expert_weights(experts, "gate_up_proj", active)
+        .transpose(2, 1)
+        .contiguous()
+    )
+    A_compact, B_compact = selective_lora_weights(lora_A_base, lora_B_base, active, E)
+    out_a, dx_a, dA_a, dB_a = run(
+        W_compact,
+        remapped,
+        compact_offsets,
+        top_k,
+        A_compact,
+        B_compact,
+        grad_out,
+    )
+
+    # Strategy A is bitwise on forward / dX (same bf16 weights, same kernel).
+    assert torch.equal(out_b, out_a), (
+        f"forward mismatch: max diff = {(out_b - out_a).abs().max().item()}"
+    )
+    assert torch.equal(dx_b, dx_a), (
+        f"dX mismatch: max diff = {(dx_b - dx_a).abs().max().item()}"
+    )
+
+    # dA / dB — compare active-expert slice of the dense baseline grads
+    # against the compact-path grads. Same row_idx pattern as
+    # test_strategy_a_backward_matches_bf16. Bitwise (``torch.equal``) holds
+    # for forward and dX, but the fused dA/dB kernel uses ``atomic_add``
+    # across N-block programs and the in-flight program count differs
+    # between the full-E baseline and the compact-active path; combined
+    # with FMA reordering, this introduces 1–2 bf16 ULPs (~2e-4 at the
+    # values seen here) on the ``use_fused_dX=True`` configs. Use a
+    # tolerance an order of magnitude below that — tight enough to catch
+    # any real bug but tolerant of the unavoidable atomic-order noise.
+    lora_grad_tol = dict(atol=1e-3, rtol=1e-3)
+    row_idx = (
+        active.long()[:, None] * R + torch.arange(R, device=DEVICE)[None, :]
+    ).reshape(-1)
+    dA_b_active = dA_b[row_idx]
+    dB_b_active = dB_b[:, row_idx]
+
+    assert torch.allclose(dA_b_active, dA_a, **lora_grad_tol), (
+        f"dA active slice mismatch: max diff = "
+        f"{(dA_b_active - dA_a).abs().max().item()}"
+    )
+    assert torch.allclose(dB_b_active, dB_a, **lora_grad_tol), (
+        f"dB active slice mismatch: max diff = "
+        f"{(dB_b_active - dB_a).abs().max().item()}"
+    )
+
+
+# ─── Strategy B — fused MXFP4 Triton kernel ──────────────────────────────────
+
+
+# MX rounding tolerance — the Triton path can reorder FMAs vs the torchao
+# dequant + bf16 matmul reference, and the dequant arithmetic is
+# fp32-codebook * fp32-scale -> bf16. See ``_STRATEGY_B_TOL`` above.
+
+
+def _run_strategy_b(mx, x, lora_A, lora_B, scaling, sei, ssi, eo, top_k, E):
+    """Strategy B: pass MXWeights container directly to parallel_linear_lora;
+    the fused MX kernel does dequant inside the K-loop."""
+    active = get_active_experts(sei, E)
+    remapped, compact_offsets = remap_expert_indices(sei, eo, active, E)
+    mx_active = selective_mx_weights_fwd(mx, active)
+    A_compact, B_compact = selective_lora_weights(lora_A, lora_B, active, E)
+    return parallel_linear_lora(
+        x,
+        mx_active,
+        top_k,
+        remapped,
+        ssi,
+        compact_offsets,
+        lora_A=A_compact,
+        lora_B=B_compact,
+        scaling=scaling,
+    ), active
+
+
+@pytest.mark.parametrize("E,K,N,M,top_k,R,seed", SHAPES)
+def test_strategy_b_forward_matches_bf16(E, K, N, M, top_k, R, seed):
+    """Strategy B forward must match bf16 baseline within MX rounding tol."""
+    mx, W_ref = _make_mxfp4_weights(E, K, N, seed)
+    x, lora_A, lora_B, sei, ssi, eo = _setup_routing_and_lora(
+        E, K, N, M, top_k, R, seed
+    )
+    scaling = 0.5
+    tol = _tol_for_shape(K)
+
+    out_baseline = _run_baseline(W_ref, x, lora_A, lora_B, scaling, sei, ssi, eo, top_k)
+    out_b, _ = _run_strategy_b(mx, x, lora_A, lora_B, scaling, sei, ssi, eo, top_k, E)
+
+    assert out_baseline.shape == out_b.shape
+    diff = (out_baseline.float() - out_b.float()).abs()
+    rel = diff / (out_baseline.float().abs() + 1e-6)
+    assert torch.allclose(out_baseline, out_b, **tol), (
+        f"Strategy B forward exceeds MX tolerance: max abs={diff.max().item():.4e}, "
+        f"max rel={rel.max().item():.4e}"
+    )
+
+
+@pytest.mark.parametrize("E,K,N,M,top_k,R,seed", SHAPES)
+def test_strategy_b_backward_matches_bf16(E, K, N, M, top_k, R, seed):
+    """Strategy B forward+backward; dX, dA, dB compared to bf16 baseline on
+    the active expert slice within MX rounding tol."""
+    mx, W_ref = _make_mxfp4_weights(E, K, N, seed)
+    x_base, lora_A_base, lora_B_base, sei, ssi, eo = _setup_routing_and_lora(
+        E, K, N, M, top_k, R, seed
+    )
+    scaling = 0.5
+    fwd_tol = _tol_for_shape(K)
+    dx_tol = _tol_for_shape(K, dx=True)
+    lg_tol = _tol_for_shape(K, lora_grad=True)
+
+    # Baseline — match the MX path's fused-bwd kernel selection so dA/dB MMA
+    # accumulation order is the same and bf16 noise stays at single ULPs.
+    x_b = x_base.detach().clone().requires_grad_(True)
+    A_b = lora_A_base.detach().clone().requires_grad_(True)
+    B_b = lora_B_base.detach().clone().requires_grad_(True)
+    out_b = _run_baseline(
+        W_ref,
+        x_b,
+        A_b,
+        B_b,
+        scaling,
+        sei,
+        ssi,
+        eo,
+        top_k,
+        use_fused_dX=True,
+        use_fused_gather=True,
+    )
+    grad_out = torch.randn_like(out_b)
+    out_b.backward(grad_out)
+
+    # Strategy B
+    x_s = x_base.detach().clone().requires_grad_(True)
+    active = get_active_experts(sei, E)
+    remapped, compact_offsets = remap_expert_indices(sei, eo, active, E)
+    mx_active = selective_mx_weights_fwd(mx, active)
+    A_full = lora_A_base.detach().clone().requires_grad_(True)
+    B_full = lora_B_base.detach().clone().requires_grad_(True)
+    A_compact, B_compact = selective_lora_weights(A_full, B_full, active, E)
+    out_s = parallel_linear_lora(
+        x_s,
+        mx_active,
+        top_k,
+        remapped,
+        ssi,
+        compact_offsets,
+        lora_A=A_compact,
+        lora_B=B_compact,
+        scaling=scaling,
+    )
+    out_s.backward(grad_out)
+
+    # Forward parity within MX rounding tol — asserted here so a forward bug
+    # that produces a constant offset (and therefore zero gradient delta)
+    # doesn't slip past the bwd-only checks.
+    assert torch.allclose(out_b, out_s, **fwd_tol), (
+        f"Strategy B forward mismatch: max abs diff = "
+        f"{(out_b - out_s).abs().max().item():.4e}"
+    )
+
+    # dX tolerance (looser; see _STRATEGY_B_DX_TOL comment)
+    assert torch.allclose(x_b.grad, x_s.grad, **dx_tol), (
+        f"Strategy B dX mismatch: max diff = "
+        f"{(x_b.grad - x_s.grad).abs().max().item():.4e}"
+    )
+
+    # Uniform-scaling drift guard: the allclose bound above is generous to
+    # accommodate accumulated bf16 MMA noise over N. A bug that scales every
+    # dX element by a constant factor (e.g. an off-by-one on the E8M0
+    # exponent shifting the whole tile by 2x) would still pass that bound.
+    # Catch it by requiring the per-element ratio std stays small after
+    # masking out near-zero baseline elements (where the ratio is dominated
+    # by quantization noise rather than uniform drift).
+    bf16_dX = x_b.grad.float()
+    mx_dX = x_s.grad.float()
+    eps = 1e-3 * bf16_dX.abs().max().clamp(min=1e-6)
+    mask = bf16_dX.abs() > eps
+    if mask.any():
+        ratio = mx_dX[mask] / bf16_dX[mask]
+        ratio_std = ratio.std().item()
+        assert ratio_std < 0.5, (
+            f"Strategy B dX uniform-scaling drift: std(mx/bf16) = "
+            f"{ratio_std:.4f} (mean = {ratio.mean().item():.4f}); "
+            f"a uniform multiplicative bug would slip past the allclose "
+            f"bound but is caught here."
+        )
+
+    # dA / dB — compare active expert slices (use forward tolerance — these
+    # come from the LoRA-only grad path which doesn't touch the W matmul)
+    row_idx = (
+        active.long()[:, None] * R + torch.arange(R, device=DEVICE)[None, :]
+    ).reshape(-1)
+    dA_b_active = A_b.grad[row_idx]
+    dA_s_active = A_full.grad[row_idx]
+    dB_b_active = B_b.grad[:, row_idx]
+    dB_s_active = B_full.grad[:, row_idx]
+
+    assert torch.allclose(dA_b_active, dA_s_active, **lg_tol), (
+        f"Strategy B dA active slice mismatch: max diff = "
+        f"{(dA_b_active - dA_s_active).abs().max().item():.4e}"
+    )
+    assert torch.allclose(dB_b_active, dB_s_active, **lg_tol), (
+        f"Strategy B dB active slice mismatch: max diff = "
+        f"{(dB_b_active - dB_s_active).abs().max().item():.4e}"
+    )
diff --git a/tests/integrations/kernels/scattermoe_lora/test_mxfp4_integration.py b/tests/integrations/kernels/scattermoe_lora/test_mxfp4_integration.py
new file mode 100644
index 0000000000..353969ac73
--- /dev/null
+++ b/tests/integrations/kernels/scattermoe_lora/test_mxfp4_integration.py
@@ -0,0 +1,246 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
+
+"""
+End-to-end integration test for MXFP4 expert weights through the ScatterMoE
+LoRA path.
+
+We build a tiny synthetic DeepSeek-V4-style MoE block (E=8, hidden=512,
+intermediate=256, top_k=2), MX-quantize the gate/up and down projection
+expert weights via ``torchao.MXTensor.to_mx``, then compare two stacks
+forward-only:
+
+  1. **Reference** — pure PyTorch per-expert loop using the bf16 dequant of
+     the *same* MX weights. This stands in for "stock HF transformers MoE
+     with ``Mxfp4Config`` applied" — both stacks read the same physical MX
+     packed/scale buffers, so any divergence comes from the Axolotl
+     ScatterMoE plumbing (routing flatten/sort, scatter2scatter, fused
+     dequant kernel), not from differing weight quantization.
+
+  2. **Axolotl ScatterMoE** — ``parallel_linear_lora`` driven by an
+     ``MXWeights`` container, LoRA disabled (A = B = 0). Tests both
+     Strategy A (selective dequant to bf16) and Strategy B (fused MX
+     Triton kernel) so the spec'd "stock vs scattermoe" parity check
+     covers both code paths.
+
+Comparison tolerance is looser than the unit tests (``atol=rtol=5e-3``)
+because the per-expert PyTorch reference accumulates in fp32 while the
+Triton path emits bf16 outputs whose final cast rounds.
+"""
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from axolotl.integrations.kernels.libs.scattermoe_lora.mx_weights import (
+    selective_mx_weights_fwd,
+)
+from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_experts import (
+    flatten_sort_count,
+)
+from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_linear_lora import (
+    parallel_linear_lora,
+)
+from axolotl.integrations.kernels.libs.scattermoe_lora.selective_dequant import (
+    get_active_experts,
+    remap_expert_indices,
+    selective_expert_weights,
+    selective_lora_weights,
+)
+
+torchao = pytest.importorskip("torchao")
+from torchao.prototype.mx_formats.mx_tensor import MXTensor  # noqa: E402
+
+DEVICE = "cuda"
+DTYPE = torch.bfloat16
+
+pytestmark = pytest.mark.skipif(
+    not torch.cuda.is_available(), reason="CUDA required for MX kernels"
+)
+
+
+# DeepSeek-V4-style tiny config (small enough for fast unit testing)
+E = 8
+HIDDEN = 512
+INTERMEDIATE = 256
+TOP_K = 2
+M = 16  # batch * seq
+
+
+def _build_synthetic_moe():
+    """Return (gate_up_mx, down_mx, gate_up_ref, down_ref, router_w)
+    matching a DeepSeek-V4 expert block:
+
+      * ``gate_up_proj``: per-expert ``[hidden, 2*intermediate]`` (split into
+        gate and up halves after the matmul).
+      * ``down_proj``:    per-expert ``[intermediate, hidden]``.
+
+    Storage layout matches axolotl's convention ``[E, N, K]`` where K is the
+    contraction axis the kernel will block on. ``gate_up`` has K=hidden,
+    N=2*intermediate; ``down`` has K=intermediate, N=hidden.
+
+    bf16 reference tensors are the dequantizations of the *same* MX
+    buffers, so the only test source of divergence is the kernel paths.
+    """
+    torch.manual_seed(42)
+    # Scale ~ 1/sqrt(fan_in) so per-layer outputs stay in order-1 range and
+    # bf16 final-cast noise is not amplified by the magnitude.
+    gup_scale = 1.0 / (HIDDEN**0.5)
+    down_scale = 1.0 / (INTERMEDIATE**0.5)
+    gate_up = (
+        torch.randn(E, 2 * INTERMEDIATE, HIDDEN, device=DEVICE, dtype=DTYPE) * gup_scale
+    )
+    down = torch.randn(E, HIDDEN, INTERMEDIATE, device=DEVICE, dtype=DTYPE) * down_scale
+
+    gate_up_mx = MXTensor.to_mx(
+        gate_up, elem_dtype=torch.float4_e2m1fn_x2, block_size=32
+    )
+    down_mx = MXTensor.to_mx(down, elem_dtype=torch.float4_e2m1fn_x2, block_size=32)
+    gate_up_ref = gate_up_mx.dequantize(DTYPE).contiguous()
+    down_ref = down_mx.dequantize(DTYPE).contiguous()
+
+    router_w = torch.randn(E, HIDDEN, device=DEVICE, dtype=DTYPE) * 0.1
+    return gate_up_mx, down_mx, gate_up_ref, down_ref, router_w
+
+
+def _reference_moe_forward(x, router_w, gate_up_ref, down_ref):
+    """Stand-in for stock HF MoE with Mxfp4Config: per-token routing +
+    per-expert matmul on dequantized bf16 weights."""
+    # Softmax-topk routing
+    router_logits = F.linear(x, router_w)  # [M, E]
+    routing_weights = F.softmax(router_logits, dim=-1, dtype=torch.float32)
+    routing_weights, selected = torch.topk(routing_weights, TOP_K, dim=-1)
+    routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)
+    routing_weights = routing_weights.to(x.dtype)
+
+    out = torch.zeros_like(x)
+    for e in range(E):
+        # Tokens routed to expert e: positions (token_id, k_slot)
+        mask = selected == e  # [M, TOP_K]
+        if not mask.any():
+            continue
+        token_ids, slot_ids = mask.nonzero(as_tuple=True)
+        x_e = x[token_ids]  # [n_e, HIDDEN]
+        gup = x_e @ gate_up_ref[e].t()  # [n_e, 2*INTERMEDIATE]
+        gate, up = gup.chunk(2, dim=-1)
+        h = F.silu(gate) * up
+        y_e = h @ down_ref[e].t()  # [n_e, HIDDEN]
+        # Weighted accumulate
+        w_e = routing_weights[token_ids, slot_ids].unsqueeze(-1)
+        out.index_add_(0, token_ids, w_e * y_e)
+    return out
+
+
+class _MockExperts:
+    def __init__(self, gate_up, down):
+        self.gate_up_proj = gate_up
+        self.down_proj = down
+        self.num_experts = E
+
+
+def _axolotl_moe_forward(x, router_w, gate_up_param, down_param, *, strategy: str):
+    """Run the Axolotl ScatterMoE LoRA path with LoRA disabled (A=B=0).
+
+    ``strategy='A'``: ``gate_up_param``/``down_param`` are torchao MXTensors;
+    we dequantize the active experts to bf16 and call the bf16 kernel.
+
+    ``strategy='B'``: same MXTensors but routed through the fused MX kernel
+    via the ``MXWeights`` container.
+    """
+    # Routing — same softmax+topk shape as the reference
+    router_logits = F.linear(x, router_w)
+    routing_weights = F.softmax(router_logits, dim=-1, dtype=torch.float32)
+    routing_weights, selected = torch.topk(routing_weights, TOP_K, dim=-1)
+    routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)
+    routing_weights = routing_weights.to(x.dtype)
+
+    sei, ssi, eo = flatten_sort_count(selected, num_experts=E)
+    active = get_active_experts(sei, E)
+    remapped, compact_offsets = remap_expert_indices(sei, eo, active, E)
+
+    # Build LoRA tensors with A=B=0 so the LoRA term is zero.
+    rank = 4
+    lora_A = torch.zeros(rank * E, HIDDEN, device=DEVICE, dtype=DTYPE)
+    lora_B_gup = torch.zeros(2 * INTERMEDIATE, rank * E, device=DEVICE, dtype=DTYPE)
+    lora_B_down = torch.zeros(HIDDEN, rank * E, device=DEVICE, dtype=DTYPE)
+    lora_A_inter = torch.zeros(rank * E, INTERMEDIATE, device=DEVICE, dtype=DTYPE)
+    A_gup_c, B_gup_c = selective_lora_weights(lora_A, lora_B_gup, active, E)
+    A_dn_c, B_dn_c = selective_lora_weights(lora_A_inter, lora_B_down, active, E)
+
+    experts = _MockExperts(gate_up_param, down_param)
+
+    if strategy == "A":
+        gate_up_W = (
+            selective_expert_weights(experts, "gate_up_proj", active)
+            .transpose(2, 1)
+            .contiguous()
+        )
+        down_W = (
+            selective_expert_weights(experts, "down_proj", active)
+            .transpose(2, 1)
+            .contiguous()
+        )
+        gup_W = gate_up_W
+        dwn_W = down_W
+    elif strategy == "B":
+        gup_W = selective_mx_weights_fwd(gate_up_param, active)
+        dwn_W = selective_mx_weights_fwd(down_param, active)
+    else:
+        raise ValueError(strategy)
+
+    gup = parallel_linear_lora(
+        x,
+        gup_W,
+        TOP_K,
+        remapped,
+        ssi,
+        compact_offsets,
+        lora_A=A_gup_c,
+        lora_B=B_gup_c,
+        scaling=0.0,
+        grouped_in=False,
+        grouped_out=True,
+        use_fused_dX=True,
+        use_fused_gather=True,
+    )
+    gate, up = gup.chunk(2, dim=-1)
+    h = F.silu(gate) * up
+    out = parallel_linear_lora(
+        h,
+        dwn_W,
+        1,
+        remapped,
+        ssi,
+        compact_offsets,
+        lora_A=A_dn_c,
+        lora_B=B_dn_c,
+        scaling=0.0,
+        gates=routing_weights,
+        grouped_in=True,
+        grouped_out=False,
+        use_fused_dX=True,
+        use_fused_gather=True,
+    )
+    return out
+
+
+@pytest.mark.parametrize("strategy", ["A", "B"])
+def test_mxfp4_moe_block_matches_pytorch_reference(strategy):
+    """The Axolotl ScatterMoE MX path must match the per-expert PyTorch
+    reference (operating on the same MX dequantized weights) within
+    integration-grade tolerance."""
+    gate_up_mx, down_mx, gate_up_ref, down_ref, router_w = _build_synthetic_moe()
+
+    torch.manual_seed(7)
+    x = torch.randn(M, HIDDEN, device=DEVICE, dtype=DTYPE)
+
+    ref = _reference_moe_forward(x, router_w, gate_up_ref, down_ref)
+    out = _axolotl_moe_forward(x, router_w, gate_up_mx, down_mx, strategy=strategy)
+
+    assert ref.shape == out.shape == (M, HIDDEN)
+    assert torch.allclose(ref, out, atol=5e-3, rtol=5e-3), (
+        f"Strategy {strategy} MoE block diverges from PyTorch reference: "
+        f"max abs={(ref - out).abs().max().item():.4e}, "
+        f"max rel={((ref - out).abs() / (ref.abs() + 1e-6)).max().item():.4e}"
+    )
diff --git a/tests/integrations/kernels/scattermoe_lora/test_parallel_experts_large_batch_repro.py b/tests/integrations/kernels/scattermoe_lora/test_parallel_experts_large_batch_repro.py
new file mode 100644
index 0000000000..37a6a7f6ff
--- /dev/null
+++ b/tests/integrations/kernels/scattermoe_lora/test_parallel_experts_large_batch_repro.py
@@ -0,0 +1,289 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
+
+"""
+Reproducer for the cuBLAS / illegal-memory-access failure surfaced at
+seq=512K with 16 shards in the tiled-MLP long-context bench.
+
+The originally-reported symptom (``CUBLAS_STATUS_EXECUTION_FAILED`` from
+``cublasGemmStridedBatchedEx`` at
+``parallel_experts.py:72``'s ``gates.unsqueeze(1) @ output_expanded``)
+is a downstream effect — the actual fault is in the upstream
+``scatter2scatter`` Triton kernel's pointer-offset arithmetic. When the
+output of the up-projection has
+``L_scattered * y_dim >= 2 ** 31`` elements (i.e. the kernel's
+``M_block * stride_ym`` int32 multiplication overflows), the kernel
+silently writes to wrong addresses, which can in turn trip the next
+kernel (the gates @ output_expanded bmm or whatever else follows).
+
+The repro shape mirrors the failing bench config:
+
+* shard tokens ``T = 32768`` (= 524288 // 16),
+* ``top_k = 8`` → ``L_scattered = T * top_k = 262144``,
+* ``num_experts = 128``, ``hidden = 2048``, ``intermediate = 8192``.
+
+At that shape, the up-projection's scatter2scatter output is
+``[262144, 2 * 8192] = [262144, 16384]`` = 2**32 elements. The
+overflow boundary for the M_block * stride_ym int32 product is
+``M_block < 2 ** 31 / 16384 = 131072`` — exactly half the output rows.
+"""
+
+from __future__ import annotations
+
+import pytest
+import torch
+
+# Failing-config constants (mirror tests/integrations/monkeypatch/
+# bench_tiled_mlp_moe.py at seq=524288, shards=16).
+_T = 32768
+_TOP_K = 8
+_NUM_EXPERTS = 128
+_HIDDEN = 2048
+_INTERMEDIATE = 8192
+_DTYPE = torch.bfloat16
+
+
+def _requires_cuda():
+    return pytest.mark.skipif(
+        not torch.cuda.is_available(), reason="CUDA required for the repro"
+    )
+
+
+_SCATTER2SCATTER_INT32_LIMIT = 2**31
+
+
+@_requires_cuda()
+def test_scatter2scatter_below_threshold_no_overhead():
+    """At shapes well below the int32 overflow boundary the auto-dispatch
+    in ``ParallelLinear`` picks ``INT64_INDICES=False`` and the kernel
+    output is bit-identical to a direct call with the same flag.
+
+    This is the regression guard for "don't accidentally penalise the
+    common-case path that does not need int64 indices".
+    """
+    from axolotl.integrations.kernels.libs.scattermoe_lora.kernels.ops import (
+        scatter2scatter,
+    )
+    from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_experts import (
+        flatten_sort_count,
+    )
+
+    device = torch.device("cuda:0")
+    torch.manual_seed(0)
+
+    # Small shape: T=512 tokens → L_scattered=4096 → output is
+    # 4096 * 16384 = 67 M elements, well below the 2**31 threshold.
+    T_small = 512
+    x = torch.randn(T_small, _HIDDEN, device=device, dtype=_DTYPE)
+    W = (
+        torch.randn(
+            _NUM_EXPERTS, _HIDDEN, 2 * _INTERMEDIATE, device=device, dtype=_DTYPE
+        )
+        * 0.01
+    )
+
+    logits = torch.randn(T_small, _NUM_EXPERTS, device=device)
+    _, top_idx = torch.topk(torch.softmax(logits, dim=-1), _TOP_K, dim=-1)
+    sei, ssi, _ = flatten_sort_count(top_idx, _NUM_EXPERTS)
+
+    assert sei.size(0) * W.size(-1) < _SCATTER2SCATTER_INT32_LIMIT
+
+    out_i32 = scatter2scatter(
+        X=x,
+        W=W,
+        sorted_expert_idxs=sei,
+        sorted_scattered_idxs=ssi,
+        k=_TOP_K,
+        x_grouped=False,
+        y_grouped=True,
+        int64_indices=False,
+    )
+    out_i64 = scatter2scatter(
+        X=x,
+        W=W,
+        sorted_expert_idxs=sei,
+        sorted_scattered_idxs=ssi,
+        k=_TOP_K,
+        x_grouped=False,
+        y_grouped=True,
+        int64_indices=True,
+    )
+    torch.cuda.synchronize()
+    assert torch.equal(out_i32, out_i64), (
+        "INT64_INDICES must not change MMA/accumulation order at small shapes"
+    )
+
+
+@_requires_cuda()
+def test_scatter2scatter_no_corruption_at_overflow_shape():
+    """The kernel-level int64 fix must keep every output row populated
+    when the shape straddles the 2**31-element boundary.
+
+    Background: with INT64_INDICES=False the Triton ``scatter2scatter``
+    kernel computes pointer offsets as
+    ``Y_ptr + M_block * stride_ym + N_block * stride_yn`` in int32. At
+    the bench shape (L_scattered=262144, y_dim=16384 → 2**32 elements
+    of output) the trailing rows past ``M_block >= 2**31 / y_dim``
+    overflow and their masked stores silently drop, leaving those rows
+    as all-zeros. With INT64_INDICES=True the M_block range is cast to
+    int64 before it enters the multiplication and the overflow is
+    eliminated at the kernel level.
+
+    This test calls the kernel directly with INT64_INDICES=True and
+    asserts every sampled row past the boundary has at least one
+    non-zero element. (The ``ParallelLinear`` wrapper's auto-dispatch
+    is covered separately by ``test_parallel_linear_long_seq_routing_combination``.)
+    """
+    from axolotl.integrations.kernels.libs.scattermoe_lora.kernels.ops import (
+        scatter2scatter,
+    )
+    from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_experts import (
+        flatten_sort_count,
+    )
+
+    device = torch.device("cuda:0")
+    torch.manual_seed(0)
+
+    x = torch.randn(_T, _HIDDEN, device=device, dtype=_DTYPE)
+    W = (
+        torch.randn(
+            _NUM_EXPERTS, _HIDDEN, 2 * _INTERMEDIATE, device=device, dtype=_DTYPE
+        )
+        * 0.01
+    )
+
+    logits = torch.randn(_T, _NUM_EXPERTS, device=device)
+    _, top_idx = torch.topk(torch.softmax(logits, dim=-1), _TOP_K, dim=-1)
+    sei, ssi, _ = flatten_sort_count(top_idx, _NUM_EXPERTS)
+
+    L_scattered = sei.size(0)
+    y_dim = W.size(-1)
+    assert L_scattered * y_dim >= _SCATTER2SCATTER_INT32_LIMIT, (
+        f"repro precondition: L_scattered * y_dim ({L_scattered * y_dim}) "
+        f"must straddle the int32 overflow boundary "
+        f"({_SCATTER2SCATTER_INT32_LIMIT})"
+    )
+
+    output = scatter2scatter(
+        X=x,
+        W=W,
+        sorted_expert_idxs=sei,
+        sorted_scattered_idxs=ssi,
+        k=_TOP_K,
+        x_grouped=False,
+        y_grouped=True,
+        int64_indices=True,
+    )
+    torch.cuda.synchronize()
+
+    overflow_threshold_row = _SCATTER2SCATTER_INT32_LIMIT // y_dim
+    sample_rows = [
+        0,
+        overflow_threshold_row // 2,
+        overflow_threshold_row - 1,
+        overflow_threshold_row,
+        overflow_threshold_row + 1,
+        (overflow_threshold_row + L_scattered) // 2,
+        L_scattered - 1,
+    ]
+    for row in sample_rows:
+        nz = (output[row] != 0).any().item()
+        assert nz, (
+            f"row {row} of scatter2scatter output is all-zero "
+            f"(overflow_threshold_row={overflow_threshold_row}, "
+            f"L_scattered={L_scattered}, y_dim={y_dim})"
+        )
+
+
+@_requires_cuda()
+def test_parallel_linear_long_seq_routing_combination():
+    """End-to-end repro through ``parallel_linear`` matching the bench path.
+
+    Replicates the ``ScatterMoEGatedMLP.forward`` shape sequence (up
+    projection at line 374 → activation → down projection at line 385
+    with ``gates=routing_weights``) at the seq=524288/shards=16 inner
+    config. Before the fix this raises
+    ``CUBLAS_STATUS_EXECUTION_FAILED`` (or a subsequent illegal-memory-
+    access) at the down-projection's ``gates @ output_expanded`` bmm.
+    """
+    import torch.nn.functional as F
+
+    from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_experts import (
+        flatten_sort_count,
+        parallel_linear,
+    )
+
+    device = torch.device("cuda:0")
+    torch.manual_seed(0)
+
+    layer_input = torch.randn(_T, _HIDDEN, device=device, dtype=_DTYPE)
+    # Match the bench's ScatterMoEGatedMLP weight layout: input_linear
+    # is [E, 2*INTERMEDIATE, HIDDEN] then .transpose(2, 1) →
+    # [E, HIDDEN, 2*INTERMEDIATE]. output_linear is [E, HIDDEN,
+    # INTERMEDIATE] then .transpose(2, 1) → [E, INTERMEDIATE, HIDDEN].
+    in_w = (
+        torch.randn(
+            _NUM_EXPERTS, 2 * _INTERMEDIATE, _HIDDEN, device=device, dtype=_DTYPE
+        )
+        * 0.02
+    )
+    out_w = (
+        torch.randn(_NUM_EXPERTS, _HIDDEN, _INTERMEDIATE, device=device, dtype=_DTYPE)
+        * 0.02
+    )
+
+    router_logits = torch.randn(_T, _NUM_EXPERTS, device=device)
+    routing_weights = F.softmax(router_logits, dim=-1, dtype=torch.float)
+    routing_weights, selected_experts = torch.topk(routing_weights, _TOP_K, dim=-1)
+    routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)
+    routing_weights = routing_weights.to(_DTYPE)
+
+    sei, ssi, eo = flatten_sort_count(selected_experts, _NUM_EXPERTS)
+
+    # Up projection — this is the overflow-prone call (output
+    # numel = T * top_k * 2*INTERMEDIATE = 2**32 at the bench shape).
+    with torch.no_grad():
+        gup = parallel_linear(
+            layer_input,
+            in_w.transpose(2, 1),
+            _TOP_K,
+            sei,
+            ssi,
+            eo,
+            grouped_in=False,
+            grouped_out=True,
+        )
+        gates, h = gup.chunk(2, dim=-1)
+        h = F.silu(gates) * h
+
+        # Down projection — its gates @ output_expanded bmm is where
+        # the reported CUBLAS_STATUS_EXECUTION_FAILED surfaces. The
+        # crash, however, is a downstream symptom of the up-projection
+        # corruption above.
+        layer_output = parallel_linear(
+            h,
+            out_w.transpose(2, 1),
+            1,
+            sei,
+            ssi,
+            eo,
+            grouped_in=True,
+            grouped_out=False,
+            gates=routing_weights,
+        )
+        # Force the (otherwise lazy) CUDA error to surface synchronously.
+        torch.cuda.synchronize()
+
+    assert layer_output.shape == (_T, _HIDDEN)
+    # The output must have real values, not zero rows or NaN/Inf.
+    # ``(.abs().sum(dim=-1) == 0)`` would catch the silent-zero
+    # corruption pattern even when the kernel did not crash hard.
+    assert torch.isfinite(layer_output).all().item(), (
+        "layer_output has non-finite values — likely overflow corruption"
+    )
+    row_sums = layer_output.float().abs().sum(dim=-1)
+    assert (row_sums > 0).all().item(), (
+        "layer_output has all-zero rows — silent overflow corruption "
+        "in the up-projection scatter2scatter"
+    )
diff --git a/tests/integrations/kernels/scattermoe_lora/test_scattermoe_lora_int64_indices.py b/tests/integrations/kernels/scattermoe_lora/test_scattermoe_lora_int64_indices.py
new file mode 100644
index 0000000000..88c77865fd
--- /dev/null
+++ b/tests/integrations/kernels/scattermoe_lora/test_scattermoe_lora_int64_indices.py
@@ -0,0 +1,581 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
+
+"""
+Parity and overflow-correctness tests for the ``INT64_INDICES``
+``tl.constexpr`` knob added to the scattermoe-lora Triton kernels.
+
+The kernel-level fix promotes the per-launch index ranges to int64 only
+when the wrapper has detected that ``L_scattered * y_dim`` would
+overflow int32. Two properties are tested:
+
+1. **Bitwise parity at small shapes.** When the shape fits in int32,
+   ``INT64_INDICES=False`` (the JIT'd int32 variant) and
+   ``INT64_INDICES=True`` (the int64 variant) compute the same MMA in
+   the same order. Only the index *type* changes, so the outputs must
+   be bitwise identical — any deviation indicates the cast leaked into
+   the accumulator path.
+
+2. **Overflow correctness at large shapes.** At the previously-failing
+   bench config (seq=524288 with 16 shards, L_scattered=262144,
+   y_dim=16384 → 2**32 element output), the int64 kernel must populate
+   every row of the output and match the chunked workaround within bf16
+   tolerance (the chunking workaround changes accumulation order, so
+   bit-equality is not expected against it — only against the same-
+   layout int32 kernel below the overflow boundary).
+
+The bench-config test is gated by GPU memory; an L_scattered=262144
+× 16384 bf16 output is ~8.6 GiB and the up-projection weight is
+~64 GiB so we skip when free memory is below the threshold.
+"""
+
+from __future__ import annotations
+
+import pytest
+import torch
+
+from axolotl.integrations.kernels.libs.scattermoe_lora.kernels import (
+    lora_ops,
+    ops as base_ops,
+)
+from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_experts import (
+    flatten_sort_count,
+)
+
+DEVICE = "cuda"
+DTYPE = torch.bfloat16
+
+# Sufficient condition for int32 pointer arithmetic to overflow in the
+# Triton kernel: any indexed buffer has >= 2**31 elements.
+_INT32_LIMIT = 2**31
+
+
+def _requires_cuda():
+    return pytest.mark.skipif(
+        not torch.cuda.is_available(), reason="CUDA not available"
+    )
+
+
+pytestmark = _requires_cuda()
+
+
+# ─── Helpers ─────────────────────────────────────────────────────────────────
+
+
+def _setup(E, K, N, T, top_k, R=16, seed=42):
+    """Create synthetic inputs + routing for a (E, K, N, T, k) shape."""
+    torch.manual_seed(seed)
+    x = torch.randn(T, K, device=DEVICE, dtype=DTYPE)
+    W = torch.randn(E, K, N, device=DEVICE, dtype=DTYPE) * 0.02
+    lora_A = torch.randn(R * E, K, device=DEVICE, dtype=DTYPE) * 0.01
+    lora_B = torch.randn(N, R * E, device=DEVICE, dtype=DTYPE) * 0.01
+    logits = torch.randn(T, E, device=DEVICE)
+    _, top_idx = torch.topk(torch.softmax(logits, dim=-1), top_k, dim=-1)
+    sei, ssi, eo = flatten_sort_count(top_idx, E)
+    return x, W, lora_A, lora_B, sei, ssi, eo
+
+
+# ─── Parity tests at non-overflow shapes (bitwise identity) ──────────────────
+
+
+def test_dense_scatter2scatter_int64_parity_small():
+    """Dense scatter2scatter: INT64_INDICES=True == INT64_INDICES=False at small shape."""
+    x, W, *_, sei, ssi, _ = _setup(E=8, K=512, N=1024, T=256, top_k=4)
+    k = 4
+    out_i32 = base_ops.scatter2scatter(
+        X=x,
+        W=W,
+        sorted_expert_idxs=sei,
+        sorted_scattered_idxs=ssi,
+        k=k,
+        x_grouped=False,
+        y_grouped=True,
+        int64_indices=False,
+    )
+    out_i64 = base_ops.scatter2scatter(
+        X=x,
+        W=W,
+        sorted_expert_idxs=sei,
+        sorted_scattered_idxs=ssi,
+        k=k,
+        x_grouped=False,
+        y_grouped=True,
+        int64_indices=True,
+    )
+    torch.cuda.synchronize()
+    assert torch.equal(out_i32, out_i64), (
+        "INT64_INDICES must not change accumulation order at non-overflow shapes"
+    )
+
+
+def test_dense_scatter2scatter_int64_parity_ungrouped_out():
+    """Same parity but y_grouped=False (uses M_idx scatter lookup, not M_block)."""
+    x, W, *_, sei, ssi, _ = _setup(E=8, K=512, N=1024, T=256, top_k=4)
+    k = 4
+    out_i32 = base_ops.scatter2scatter(
+        X=x,
+        W=W,
+        sorted_expert_idxs=sei,
+        sorted_scattered_idxs=ssi,
+        k=k,
+        x_grouped=False,
+        y_grouped=False,
+        int64_indices=False,
+    )
+    out_i64 = base_ops.scatter2scatter(
+        X=x,
+        W=W,
+        sorted_expert_idxs=sei,
+        sorted_scattered_idxs=ssi,
+        k=k,
+        x_grouped=False,
+        y_grouped=False,
+        int64_indices=True,
+    )
+    torch.cuda.synchronize()
+    assert torch.equal(out_i32, out_i64)
+
+
+def test_scatter2scatter_lora_int64_parity_small():
+    """scatter2scatter_lora: int32 vs int64 must agree bitwise."""
+    # Pick a shape that lands on the fused path (not split): few-large-experts
+    # split threshold is E<=32 with K*N >= 20M, so use a small K*N to stay
+    # on the fused kernel.
+    x, W, lA, lB, sei, ssi, _ = _setup(E=64, K=256, N=512, T=128, top_k=4)
+    k = 4
+    scaling = 0.5
+    out_i32 = lora_ops.scatter2scatter_lora(
+        X=x,
+        W=W,
+        sorted_expert_idxs=sei,
+        sorted_scattered_idxs=ssi,
+        k=k,
+        lora_A=lA,
+        lora_B=lB,
+        scaling=scaling,
+        x_grouped=False,
+        y_grouped=True,
+        int64_indices=False,
+    )
+    out_i64 = lora_ops.scatter2scatter_lora(
+        X=x,
+        W=W,
+        sorted_expert_idxs=sei,
+        sorted_scattered_idxs=ssi,
+        k=k,
+        lora_A=lA,
+        lora_B=lB,
+        scaling=scaling,
+        x_grouped=False,
+        y_grouped=True,
+        int64_indices=True,
+    )
+    torch.cuda.synchronize()
+    assert torch.equal(out_i32, out_i64)
+
+
+def test_scatter2scatter_lora_dX_int64_parity_small():
+    """scatter2scatter_lora_dX: int32 vs int64 must agree bitwise."""
+    _, W, lA, lB, sei, ssi, _ = _setup(E=64, K=256, N=512, T=128, top_k=4)
+    k = 4
+    scaling = 0.5
+    M_grouped = sei.size(0)  # ungrouped k=1 dy_grouped=True
+    dy = torch.randn(M_grouped, W.size(2), device=DEVICE, dtype=DTYPE) * 0.01
+    dX_i32 = lora_ops.scatter2scatter_lora_dX(
+        DY=dy,
+        W=W,
+        sorted_expert_idxs=sei,
+        sorted_scattered_idxs=ssi,
+        k=k,
+        lora_A=lA,
+        lora_B=lB,
+        scaling=scaling,
+        dy_grouped=True,
+        dx_grouped=False,
+        int64_indices=False,
+    )
+    dX_i64 = lora_ops.scatter2scatter_lora_dX(
+        DY=dy,
+        W=W,
+        sorted_expert_idxs=sei,
+        sorted_scattered_idxs=ssi,
+        k=k,
+        lora_A=lA,
+        lora_B=lB,
+        scaling=scaling,
+        dy_grouped=True,
+        dx_grouped=False,
+        int64_indices=True,
+    )
+    torch.cuda.synchronize()
+    assert torch.equal(dX_i32, dX_i64)
+
+
+def test_group_bwd_lora_int64_parity_small():
+    """group_bwd_lora (split kernel): int32 vs int64 must agree bitwise."""
+    x, W, lA, lB, sei, ssi, eo = _setup(E=16, K=256, N=512, T=128, top_k=2)
+    grouped_x = base_ops.group(x, ssi, fan_out=2)
+    M = grouped_x.size(0)
+    dy = torch.randn(M, W.size(2), device=DEVICE, dtype=DTYPE) * 0.01
+    scaling = 0.5
+    dA_i32, dB_i32 = lora_ops.group_bwd_lora(
+        DY=dy,
+        X=grouped_x,
+        lora_A=lA,
+        lora_B=lB,
+        expert_offsets=eo,
+        E=16,
+        scaling=scaling,
+        int64_indices=False,
+    )
+    dA_i64, dB_i64 = lora_ops.group_bwd_lora(
+        DY=dy,
+        X=grouped_x,
+        lora_A=lA,
+        lora_B=lB,
+        expert_offsets=eo,
+        E=16,
+        scaling=scaling,
+        int64_indices=True,
+    )
+    torch.cuda.synchronize()
+    assert torch.equal(dA_i32, dA_i64)
+    assert torch.equal(dB_i32, dB_i64)
+
+
+def test_group_bwd_lora_fused_int64_parity_small():
+    """group_bwd_lora_fused: int32 vs int64 must agree within bf16 tolerance.
+
+    Unlike the split kernel, the fused kernel writes dA/dB via
+    ``tl.atomic_add``; the order in which (E, K-tile, N-tile) thread blocks
+    land their atomics is non-deterministic, so bit-equality is not
+    achievable even between two launches of the *same* kernel variant.
+    The INT64 cast only changes index *types*, not the MMA path or the
+    atomic reduction, so the two variants must still match within
+    ``torch.allclose`` bf16 tolerance.
+    """
+    x, _W, lA, lB, _sei, ssi, eo = _setup(E=16, K=256, N=512, T=128, top_k=2)
+    k = 2
+    M_total = ssi.size(0)
+    N = lB.size(0)
+    dy = torch.randn(M_total, N, device=DEVICE, dtype=DTYPE) * 0.01
+    scaling = 0.5
+    dA_i32, dB_i32 = lora_ops.group_bwd_lora_fused(
+        DY=dy,
+        X=x,
+        lora_A=lA,
+        lora_B=lB,
+        expert_offsets=eo,
+        sorted_scattered_idxs=ssi,
+        E=16,
+        k=k,
+        scaling=scaling,
+        dy_grouped=False,
+        int64_indices=False,
+    )
+    dA_i64, dB_i64 = lora_ops.group_bwd_lora_fused(
+        DY=dy,
+        X=x,
+        lora_A=lA,
+        lora_B=lB,
+        expert_offsets=eo,
+        sorted_scattered_idxs=ssi,
+        E=16,
+        k=k,
+        scaling=scaling,
+        dy_grouped=False,
+        int64_indices=True,
+    )
+    torch.cuda.synchronize()
+    # Tolerance: a few bf16 ULPs is expected from atomic-add ordering nondet.
+    assert torch.allclose(dA_i32, dA_i64, rtol=1e-2, atol=5e-4), (
+        f"max_abs_diff dA: {(dA_i32.float() - dA_i64.float()).abs().max()}"
+    )
+    assert torch.allclose(dB_i32, dB_i64, rtol=1e-2, atol=5e-4), (
+        f"max_abs_diff dB: {(dB_i32.float() - dB_i64.float()).abs().max()}"
+    )
+
+
+# ─── Overflow correctness at the bench shape ─────────────────────────────────
+
+
+# Bench-shape constants (mirror ``test_parallel_experts_large_batch_repro.py``).
+_T = 32768
+_TOP_K = 8
+_NUM_EXPERTS = 128
+_HIDDEN = 2048
+_INTERMEDIATE = 8192
+
+
+def _has_free_gpu_mem(min_gb: float) -> bool:
+    if not torch.cuda.is_available():
+        return False
+    free, _total = torch.cuda.mem_get_info()
+    return free / (1024**3) >= min_gb
+
+
+@pytest.mark.skipif(
+    not _has_free_gpu_mem(80.0),
+    reason="bench shape needs ~80 GiB free GPU memory",
+)
+def test_dense_scatter2scatter_int64_at_overflow_shape():
+    """Direct int64 kernel call at the bench shape produces no zero rows.
+
+    With ``INT64_INDICES=True`` the kernel's pointer arithmetic stays in
+    int64 across the full output row range, so rows past the would-be
+    int32 overflow boundary (``M_block >= 2**31 / y_dim``) are populated
+    rather than silently dropped.
+    """
+    device = torch.device("cuda:0")
+    torch.manual_seed(0)
+    x = torch.randn(_T, _HIDDEN, device=device, dtype=DTYPE)
+    W = (
+        torch.randn(
+            _NUM_EXPERTS, _HIDDEN, 2 * _INTERMEDIATE, device=device, dtype=DTYPE
+        )
+        * 0.01
+    )
+
+    logits = torch.randn(_T, _NUM_EXPERTS, device=device)
+    _, top_idx = torch.topk(torch.softmax(logits, dim=-1), _TOP_K, dim=-1)
+    sei, ssi, _ = flatten_sort_count(top_idx, _NUM_EXPERTS)
+
+    L_scattered = sei.size(0)
+    y_dim = W.size(-1)
+    assert L_scattered * y_dim >= _INT32_LIMIT, (
+        "precondition: shape must straddle the int32 overflow boundary"
+    )
+
+    out_i64 = base_ops.scatter2scatter(
+        X=x,
+        W=W,
+        sorted_expert_idxs=sei,
+        sorted_scattered_idxs=ssi,
+        k=_TOP_K,
+        x_grouped=False,
+        y_grouped=True,
+        int64_indices=True,
+    )
+    torch.cuda.synchronize()
+
+    # Sample rows on both sides of the int32 overflow boundary.
+    overflow_threshold_row = _INT32_LIMIT // y_dim
+    sample_rows = [
+        0,
+        overflow_threshold_row - 1,
+        overflow_threshold_row,
+        overflow_threshold_row + 1,
+        L_scattered - 1,
+    ]
+    for row in sample_rows:
+        assert (out_i64[row] != 0).any().item(), (
+            f"int64 kernel left row {row} all-zero (overflow boundary "
+            f"= {overflow_threshold_row}, L_scattered = {L_scattered})"
+        )
+    assert torch.isfinite(out_i64).all().item(), (
+        "int64 kernel produced non-finite values at overflow shape"
+    )
+
+
+@pytest.mark.skipif(
+    not _has_free_gpu_mem(80.0),
+    reason="bench shape needs ~80 GiB free GPU memory",
+)
+def test_parallel_linear_overflow_takes_int64_kernel_path(monkeypatch):
+    """``ParallelLinear.forward`` at the bench shape must route through
+    the int64 kernel path (single launch, ``int64_indices=True``).
+
+    The auto-dispatch should set ``needs_int64=True`` and dispatch a
+    single ``scatter2scatter`` launch with that flag. A regressed path
+    that called the kernel multiple times (e.g. a chunking workaround)
+    would invoke ``scatter2scatter_compileable`` more than once and
+    fail this assertion.
+    """
+    from axolotl.integrations.kernels.libs.scattermoe_lora import parallel_experts
+    from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_experts import (
+        parallel_linear,
+    )
+
+    device = torch.device("cuda:0")
+    torch.manual_seed(0)
+
+    x = torch.randn(_T, _HIDDEN, device=device, dtype=DTYPE)
+    W = (
+        torch.randn(
+            _NUM_EXPERTS, _HIDDEN, 2 * _INTERMEDIATE, device=device, dtype=DTYPE
+        )
+        * 0.01
+    )
+
+    logits = torch.randn(_T, _NUM_EXPERTS, device=device)
+    _, top_idx = torch.topk(torch.softmax(logits, dim=-1), _TOP_K, dim=-1)
+    sei, ssi, eo = flatten_sort_count(top_idx, _NUM_EXPERTS)
+
+    # Spy on the kernel launches. The kernel-level int64 fix dispatches
+    # exactly one ``scatter2scatter_compileable`` call with
+    # ``int64_indices=True``. A re-introduced chunking workaround would
+    # invoke it once per chunk (>=2 at this shape).
+    launches = []
+    real_compileable = parallel_experts.kernels.ops.scatter2scatter_compileable
+
+    def _spy_compileable(*args, **kwargs):
+        # int64_indices is positional arg 9 (after b, x_grouped, y_grouped).
+        launches.append(
+            {
+                "args_len": len(args),
+                "int64": args[9]
+                if len(args) > 9
+                else kwargs.get("int64_indices", False),
+            }
+        )
+        return real_compileable(*args, **kwargs)
+
+    monkeypatch.setattr(
+        parallel_experts.kernels.ops,
+        "scatter2scatter_compileable",
+        _spy_compileable,
+    )
+
+    with torch.no_grad():
+        out = parallel_linear(
+            x,
+            W,
+            _TOP_K,
+            sei,
+            ssi,
+            eo,
+            grouped_in=False,
+            grouped_out=True,
+        )
+        torch.cuda.synchronize()
+
+    assert len(launches) == 1, (
+        f"expected exactly one kernel launch (direct int64 path), got {len(launches)}"
+    )
+    assert launches[0]["int64"] is True, (
+        "auto-dispatch should have set int64_indices=True at the overflow shape"
+    )
+    assert out.shape == (_T * _TOP_K, 2 * _INTERMEDIATE)
+    assert torch.isfinite(out).all().item()
+
+
+# ─── Smaller-shape overflow (runs on L40S / 24 GiB GPUs) ─────────────────────
+# L_scattered * y_dim = 2**32 (2× past 2**31); peak VRAM ≈ 8 GiB.
+_SMALL_T = 131072
+_SMALL_TOP_K = 8
+_SMALL_E = 8
+_SMALL_K = 256
+_SMALL_INTERMEDIATE = 2048
+_SMALL_MIN_FREE_GIB = 12.0
+
+
+@pytest.mark.skipif(
+    not _has_free_gpu_mem(_SMALL_MIN_FREE_GIB),
+    reason=f"small overflow shape needs ~{_SMALL_MIN_FREE_GIB:.0f} GiB free GPU memory",
+)
+def test_dense_scatter2scatter_int64_at_overflow_shape_small():
+    device = torch.device("cuda:0")
+    torch.manual_seed(0)
+    y_dim = 2 * _SMALL_INTERMEDIATE
+    x = torch.randn(_SMALL_T, _SMALL_K, device=device, dtype=DTYPE)
+    W = torch.randn(_SMALL_E, _SMALL_K, y_dim, device=device, dtype=DTYPE) * 0.01
+
+    logits = torch.randn(_SMALL_T, _SMALL_E, device=device)
+    _, top_idx = torch.topk(torch.softmax(logits, dim=-1), _SMALL_TOP_K, dim=-1)
+    sei, ssi, _ = flatten_sort_count(top_idx, _SMALL_E)
+
+    L_scattered = sei.size(0)
+    assert L_scattered * y_dim >= _INT32_LIMIT, (
+        f"precondition: L_scattered * y_dim ({L_scattered * y_dim}) must "
+        f"straddle the int32 overflow boundary ({_INT32_LIMIT})"
+    )
+
+    out_i64 = base_ops.scatter2scatter(
+        X=x,
+        W=W,
+        sorted_expert_idxs=sei,
+        sorted_scattered_idxs=ssi,
+        k=_SMALL_TOP_K,
+        x_grouped=False,
+        y_grouped=True,
+        int64_indices=True,
+    )
+    torch.cuda.synchronize()
+
+    overflow_threshold_row = _INT32_LIMIT // y_dim
+    sample_rows = [
+        0,
+        overflow_threshold_row - 1,
+        overflow_threshold_row,
+        overflow_threshold_row + 1,
+        L_scattered - 1,
+    ]
+    for row in sample_rows:
+        assert (out_i64[row] != 0).any().item(), (
+            f"int64 kernel left row {row} all-zero (overflow boundary "
+            f"= {overflow_threshold_row}, L_scattered = {L_scattered})"
+        )
+    assert torch.isfinite(out_i64).all().item()
+
+
+@pytest.mark.skipif(
+    not _has_free_gpu_mem(_SMALL_MIN_FREE_GIB),
+    reason=f"small overflow shape needs ~{_SMALL_MIN_FREE_GIB:.0f} GiB free GPU memory",
+)
+def test_parallel_linear_overflow_takes_int64_kernel_path_small(monkeypatch):
+    from axolotl.integrations.kernels.libs.scattermoe_lora import parallel_experts
+    from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_experts import (
+        parallel_linear,
+    )
+
+    device = torch.device("cuda:0")
+    torch.manual_seed(0)
+    y_dim = 2 * _SMALL_INTERMEDIATE
+    x = torch.randn(_SMALL_T, _SMALL_K, device=device, dtype=DTYPE)
+    W = torch.randn(_SMALL_E, _SMALL_K, y_dim, device=device, dtype=DTYPE) * 0.01
+
+    logits = torch.randn(_SMALL_T, _SMALL_E, device=device)
+    _, top_idx = torch.topk(torch.softmax(logits, dim=-1), _SMALL_TOP_K, dim=-1)
+    sei, ssi, eo = flatten_sort_count(top_idx, _SMALL_E)
+
+    launches = []
+    real_compileable = parallel_experts.kernels.ops.scatter2scatter_compileable
+
+    def _spy_compileable(*args, **kwargs):
+        launches.append(
+            {
+                "int64": args[9]
+                if len(args) > 9
+                else kwargs.get("int64_indices", False),
+            }
+        )
+        return real_compileable(*args, **kwargs)
+
+    monkeypatch.setattr(
+        parallel_experts.kernels.ops,
+        "scatter2scatter_compileable",
+        _spy_compileable,
+    )
+
+    with torch.no_grad():
+        out = parallel_linear(
+            x,
+            W,
+            _SMALL_TOP_K,
+            sei,
+            ssi,
+            eo,
+            grouped_in=False,
+            grouped_out=True,
+        )
+        torch.cuda.synchronize()
+
+    assert len(launches) == 1, (
+        f"expected exactly one kernel launch (direct int64 path), got {len(launches)}"
+    )
+    assert launches[0]["int64"] is True, (
+        "auto-dispatch should have set int64_indices=True at the overflow shape"
+    )
+    assert out.shape == (_SMALL_T * _SMALL_TOP_K, y_dim)
+    assert torch.isfinite(out).all().item()
diff --git a/tests/integrations/kernels/scattermoe_lora/test_scattermoe_lora_m_bucket.py b/tests/integrations/kernels/scattermoe_lora/test_scattermoe_lora_m_bucket.py
new file mode 100644
index 0000000000..5d1c4d6b19
--- /dev/null
+++ b/tests/integrations/kernels/scattermoe_lora/test_scattermoe_lora_m_bucket.py
@@ -0,0 +1,118 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
+
+"""Tests for the ``M_BUCKET`` autotune-key bucketing on the scattermoe-lora
+fused forward kernel.
+
+The kernel runs on the real ``M`` (loop bounds + masks); only the
+``@triton.autotune`` cache key is bucketed via :func:`_bucket_m`. These tests
+pin both halves of that contract:
+
+  * ``_bucket_m`` rounds up to a multiple of the granularity (pure-Python
+    unit test, no GPU).
+  * Two distinct real ``M`` values that share a bucket produce **one**
+    cache entry (the whole point — no resweep on small seqlen variation).
+  * Two real ``M`` values in different buckets produce **two** cache
+    entries (we didn't accidentally collapse to a single key).
+
+Run on CUDA only; the bucketing assertion needs an actual Triton launch.
+"""
+
+from __future__ import annotations
+
+import pytest
+import torch
+
+from axolotl.integrations.kernels.libs.scattermoe_lora.kernels import lora_ops
+from axolotl.integrations.kernels.libs.scattermoe_lora.kernels.lora_ops import (
+    _M_BUCKET_GRANULARITY,
+    _bucket_m,
+    scatter2scatter_lora,
+)
+from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_experts import (
+    flatten_sort_count,
+)
+
+
+def test_bucket_m_rounds_up_to_granularity():
+    g = _M_BUCKET_GRANULARITY
+    assert _bucket_m(1) == g
+    assert _bucket_m(g) == g
+    assert _bucket_m(g + 1) == 2 * g
+    assert _bucket_m(2 * g) == 2 * g
+    # Realistic seqlen variation: at granularity=1024 and top_k=8 the three
+    # seqlens 16300/16400/16500 straddle one bucket boundary, so they collapse
+    # to 2 cache entries rather than 3 (16400 and 16500 share a bucket).
+    assert _bucket_m(16400 * 8) == _bucket_m(16500 * 8)
+    assert _bucket_m(16300 * 8) != _bucket_m(16400 * 8)
+    distinct = {_bucket_m(s * 8) for s in (16300, 16400, 16500)}
+    assert len(distinct) == 2
+
+
+pytestmark = pytest.mark.skipif(
+    not torch.cuda.is_available(), reason="CUDA required for kernel launch"
+)
+
+
+_DEVICE = "cuda"
+_DTYPE = torch.bfloat16
+_E = 4
+_K = 64
+_N = 64
+_TOP_K = 2
+_R = 16
+
+
+def _launch_once(m: int) -> None:
+    """One fused fwd launch at the given real M; minimal shapes for speed."""
+    torch.manual_seed(m)
+    x = torch.randn(m, _K, device=_DEVICE, dtype=_DTYPE)
+    W = torch.randn(_E, _K, _N, device=_DEVICE, dtype=_DTYPE) * 0.02
+    lora_A = torch.randn(_R * _E, _K, device=_DEVICE, dtype=_DTYPE) * 0.01
+    lora_B = torch.randn(_N, _R * _E, device=_DEVICE, dtype=_DTYPE) * 0.01
+    logits = torch.randn(m, _E, device=_DEVICE)
+    _, top_idx = torch.topk(torch.softmax(logits, dim=-1), _TOP_K, dim=-1)
+    sei, ssi, _ = flatten_sort_count(top_idx, _E)
+    scatter2scatter_lora(
+        X=x,
+        W=W,
+        sorted_expert_idxs=sei,
+        sorted_scattered_idxs=ssi,
+        k=_TOP_K,
+        lora_A=lora_A,
+        lora_B=lora_B,
+        scaling=1.0 / _R,
+    )
+
+
+def test_autotune_cache_collapses_within_bucket_and_grows_across_buckets():
+    cache = lora_ops._scatter2scatter_lora.cache
+    cache.clear()
+
+    g = _M_BUCKET_GRANULARITY
+    # Two M values that both ceil to bucket B1.
+    m_a = g - 1
+    m_b = g // 2 + 1
+    assert _bucket_m(m_a) == g
+    assert _bucket_m(m_b) == g
+
+    _launch_once(m_a)
+    assert len(cache) == 1, (
+        f"first launch should create exactly one cache entry, got {len(cache)}"
+    )
+
+    _launch_once(m_b)
+    assert len(cache) == 1, (
+        f"second launch in the same bucket must not add a cache entry "
+        f"(M={m_a} and M={m_b} both bucket to {g}); got {len(cache)} entries"
+    )
+
+    # An M strictly past the bucket boundary lands in bucket 2*g.
+    m_c = g + 1
+    assert _bucket_m(m_c) == 2 * g
+    _launch_once(m_c)
+    assert len(cache) == 2, (
+        f"launch in a different bucket (M={m_c} -> {2 * g}) must add a "
+        f"second cache entry; got {len(cache)}"
+    )
diff --git a/tests/integrations/kernels/scattermoe_lora/test_shared_dequant_helper.py b/tests/integrations/kernels/scattermoe_lora/test_shared_dequant_helper.py
new file mode 100644
index 0000000000..cf5d88c669
--- /dev/null
+++ b/tests/integrations/kernels/scattermoe_lora/test_shared_dequant_helper.py
@@ -0,0 +1,146 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
+
+"""
+Parity tests for :func:`shared_dequant_across_shards`.
+
+The helper hoists the per-shard MXFP4 dequant out of the orthogonal
+Strategy A path so that overlapping active-expert sets across shards
+dequantize the union once instead of N times. The optimization is
+only valid if a shard's slice of the union buffer is *byte-identical*
+to what the per-shard ``selective_expert_weights`` call would have
+produced.
+"""
+
+import pytest
+import torch
+
+from axolotl.integrations.kernels.libs.scattermoe_lora.selective_dequant import (
+    get_active_experts,
+    selective_expert_weights,
+    shared_dequant_across_shards,
+)
+
+torchao = pytest.importorskip("torchao")
+from torchao.prototype.mx_formats.mx_tensor import MXTensor  # noqa: E402
+
+DEVICE = "cuda"
+DTYPE = torch.bfloat16
+
+pytestmark = pytest.mark.skipif(
+    not torch.cuda.is_available(), reason="CUDA required for MX kernels"
+)
+
+
+class _MockExperts:
+    """Same bare wrapper used by ``test_mxfp4_expert_weights.py``."""
+
+    def __init__(self, mx_param, num_experts):
+        self.gate_up_proj = mx_param
+        self.num_experts = num_experts
+
+
+def _make_mxfp4(E, N, K, seed):
+    torch.manual_seed(seed)
+    W = torch.randn(E, N, K, device=DEVICE, dtype=DTYPE)
+    return MXTensor.to_mx(W, elem_dtype=torch.float4_e2m1fn_x2, block_size=32)
+
+
+def _make_overlapping_shard_sei(E, num_shards, seed):
+    """Build N shards of ``sorted_expert_idxs`` with deliberate overlap.
+
+    Each shard picks ~E/2 experts at random; with 4 shards the union is
+    typically ~E (full coverage) while the intersection is non-empty,
+    which is the regime the helper is intended to optimise.
+    """
+    torch.manual_seed(seed)
+    sei = []
+    for _ in range(num_shards):
+        # Each shard sees ~E/2 distinct experts repeated a few times to
+        # mimic top-k routing.
+        chosen = torch.randperm(E, device=DEVICE)[: max(2, E // 2)]
+        # Repeat each id ~3x and sort so the tensor satisfies the
+        # ``sorted_expert_idxs`` contract.
+        repeated = chosen.repeat_interleave(3)
+        sei.append(torch.sort(repeated).values)
+    return sei
+
+
+def test_shared_dequant_matches_per_shard_bitwise():
+    """Union dequant + index gather == per-shard selective dequant, bitwise.
+
+    Uses N=4 shards over E=16 experts with deliberate overlap (each shard
+    picks ~half the experts, union covers most of E). The helper must
+    yield the exact same compact buffer that a per-shard
+    ``selective_expert_weights`` call would have produced for that shard.
+    """
+    E, N, K = 16, 128, 256
+    num_shards = 4
+    mx = _make_mxfp4(E, N, K, seed=13)
+    experts = _MockExperts(mx, E)
+
+    sei_per_shard = _make_overlapping_shard_sei(E, num_shards, seed=13)
+    # Sanity: at least one pair of shards must overlap for this test to
+    # actually exercise the dedup path.
+    actives = [get_active_experts(sei, E) for sei in sei_per_shard]
+    union = torch.unique(torch.cat(actives))
+    sum_per_shard = sum(a.numel() for a in actives)
+    assert sum_per_shard > union.numel(), (
+        "shard active sets must overlap to exercise the shared-dequant path"
+    )
+
+    union_active, union_buf, shard_into_union = shared_dequant_across_shards(
+        experts, "gate_up_proj", sei_per_shard, E
+    )
+
+    assert torch.equal(union_active, union)
+    assert union_buf.shape == (union.numel(), N, K)
+
+    for i, sei in enumerate(sei_per_shard):
+        active_i = get_active_experts(sei, E)
+        reference = selective_expert_weights(experts, "gate_up_proj", active_i)
+        shared_slice = union_buf.index_select(0, shard_into_union[i])
+        assert torch.equal(shared_slice, reference), (
+            f"shard {i}: max abs diff = {(shared_slice - reference).abs().max().item()}"
+        )
+
+
+def test_shared_dequant_disjoint_shards():
+    """When shards do NOT overlap, the helper still produces the right
+    union and the per-shard slices remain bitwise identical."""
+    E, N, K = 12, 64, 128
+    mx = _make_mxfp4(E, N, K, seed=21)
+    experts = _MockExperts(mx, E)
+
+    # Two shards splitting the expert ids into disjoint halves.
+    halves = torch.arange(E, device=DEVICE).chunk(2)
+    sei_per_shard = [torch.sort(h.repeat_interleave(2)).values for h in halves]
+
+    union_active, union_buf, shard_into_union = shared_dequant_across_shards(
+        experts, "gate_up_proj", sei_per_shard, E
+    )
+    assert union_active.numel() == E
+
+    for i, sei in enumerate(sei_per_shard):
+        active_i = get_active_experts(sei, E)
+        reference = selective_expert_weights(experts, "gate_up_proj", active_i)
+        shared_slice = union_buf.index_select(0, shard_into_union[i])
+        assert torch.equal(shared_slice, reference)
+
+
+def test_shared_dequant_single_shard_noop():
+    """N=1 should reduce to the per-shard path: union == only active set."""
+    E, N, K = 8, 32, 64
+    mx = _make_mxfp4(E, N, K, seed=5)
+    experts = _MockExperts(mx, E)
+
+    sei = torch.tensor([0, 0, 2, 2, 5, 5], device=DEVICE, dtype=torch.long)
+    union_active, union_buf, shard_into_union = shared_dequant_across_shards(
+        experts, "gate_up_proj", [sei], E
+    )
+    active = get_active_experts(sei, E)
+    reference = selective_expert_weights(experts, "gate_up_proj", active)
+    assert torch.equal(union_active, active)
+    assert torch.equal(union_buf, reference)
+    assert torch.equal(union_buf.index_select(0, shard_into_union[0]), reference)
diff --git a/tests/integrations/monkeypatch/__init__.py b/tests/integrations/monkeypatch/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/integrations/monkeypatch/test_tiled_mlp_moe.py b/tests/integrations/monkeypatch/test_tiled_mlp_moe.py
new file mode 100644
index 0000000000..6dab0df109
--- /dev/null
+++ b/tests/integrations/monkeypatch/test_tiled_mlp_moe.py
@@ -0,0 +1,474 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
+"""
+Single-GPU correctness tests for the TiledMLP autograd function under both
+dense and MoE block forwards. Synthetic-shape modules only; no real
+transformers checkpoints are loaded.
+"""
+
+import copy
+from types import SimpleNamespace
+
+import pytest
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def _requires_cuda():
+    return pytest.mark.skipif(
+        not torch.cuda.is_available(), reason="CUDA not available"
+    )
+
+
+pytestmark = _requires_cuda()
+
+DEVICE = "cuda"
+
+
+# ────────────────────────────── Helpers ──────────────────────────────
+
+
+class TinyDenseMLP(nn.Module):
+    """LlamaMLP-shape: gate * up -> down, no bias, silu activation."""
+
+    def __init__(self, hidden, intermediate, dtype=torch.float32):
+        super().__init__()
+        self.gate_proj = nn.Linear(hidden, intermediate, bias=False, dtype=dtype)
+        self.up_proj = nn.Linear(hidden, intermediate, bias=False, dtype=dtype)
+        self.down_proj = nn.Linear(intermediate, hidden, bias=False, dtype=dtype)
+        self.act_fn = nn.SiLU()
+
+    def forward(self, x):
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+
+class TinyMoEBlock(nn.Module):
+    """Hand-rolled MoE: top-k softmax router + per-expert SwiGLU MLPs.
+
+    Stays intentionally simple so the test exercises sharding semantics
+    without dragging in transformers, peft, or kernel libs.
+    """
+
+    def __init__(self, hidden, intermediate, num_experts, top_k, dtype=torch.float32):
+        super().__init__()
+        self.hidden = hidden
+        self.intermediate = intermediate
+        self.num_experts = num_experts
+        self.top_k = top_k
+        self.gate = nn.Linear(hidden, num_experts, bias=False, dtype=dtype)
+        # Per-expert SwiGLU weights packed as 3D tensors.
+        self.gate_proj = nn.Parameter(
+            torch.randn(num_experts, hidden, intermediate, dtype=dtype) * 0.02
+        )
+        self.up_proj = nn.Parameter(
+            torch.randn(num_experts, hidden, intermediate, dtype=dtype) * 0.02
+        )
+        self.down_proj = nn.Parameter(
+            torch.randn(num_experts, intermediate, hidden, dtype=dtype) * 0.02
+        )
+
+    def forward(self, x):
+        bsz, seq, h = x.shape
+        flat = x.reshape(-1, h)
+        logits = self.gate(flat)
+        weights = F.softmax(logits, dim=-1, dtype=torch.float32)
+        top_w, top_i = torch.topk(weights, self.top_k, dim=-1)
+        top_w = top_w / top_w.sum(dim=-1, keepdim=True)
+        top_w = top_w.to(flat.dtype)
+
+        out = torch.zeros_like(flat)
+        for e in range(self.num_experts):
+            mask = top_i == e
+            if not mask.any():
+                continue
+            # tokens routed to expert e (with their per-slot weight)
+            token_rows, slot_idx = mask.nonzero(as_tuple=True)
+            xe = flat[token_rows]
+            we = top_w[token_rows, slot_idx].unsqueeze(-1)
+            gate = xe @ self.gate_proj[e]
+            up = xe @ self.up_proj[e]
+            h_e = F.silu(gate) * up
+            ye = h_e @ self.down_proj[e]
+            out.index_add_(0, token_rows, we * ye)
+        return out.reshape(bsz, seq, h)
+
+
+def _clone_module(mod):
+    """Deep copy + detach + re-attach to autograd to compare two runs."""
+    cloned = copy.deepcopy(mod)
+    return cloned
+
+
+def _grad_dict(mod):
+    return {
+        n: p.grad.detach().clone()
+        for n, p in mod.named_parameters()
+        if p.grad is not None
+    }
+
+
+def _run_untiled(mod, x):
+    x = x.clone().detach().requires_grad_(True)
+    y = mod(x)
+    g = torch.randn_like(y)
+    y.backward(g)
+    return y.detach().clone(), x.grad.detach().clone(), _grad_dict(mod), g
+
+
+def _run_tiled(mod, x, upstream_grad, shards):
+    """Re-run forward+backward but routed through TiledMLP."""
+    from axolotl.monkeypatch.tiled_mlp.base import TiledMLP
+
+    # Re-fetch fn that takes (self, x) — matches what the patcher passes.
+    forward_fn = type(mod).forward
+
+    x = x.clone().detach().requires_grad_(True)
+    compute_params = [p for p in mod.parameters() if p.requires_grad]
+    y = TiledMLP.apply(forward_fn, mod, x, shards, compute_params)
+    if isinstance(y, tuple):  # MoE block forwards may return tuples
+        y = y[0]
+    y.backward(upstream_grad)
+    return y.detach().clone(), x.grad.detach().clone(), _grad_dict(mod)
+
+
+# ────────────────────────────── Dense parity ──────────────────────────────
+
+
+def test_tiled_dense_mlp_parity_fp32():
+    """Dense LlamaMLP-shape: tiled vs un-tiled must match closely."""
+    torch.manual_seed(0)
+    hidden, intermediate, seq = 64, 128, 64
+    mlp_ref = TinyDenseMLP(hidden, intermediate).to(DEVICE)
+    mlp_tile = _clone_module(mlp_ref)
+
+    # ``TiledMLP``'s backward narrows into a flattened ``x_grad`` buffer
+    # using offsets along dim 1 only — sequence-packed inputs (batch=1)
+    # are the supported shape; multi-batch tensors aren't contiguous in
+    # the way the narrow assumes. Production inputs from transformers
+    # are batch=1 after sequence packing, so this matches reality.
+    x = torch.randn(1, seq, hidden, device=DEVICE)
+    y_ref, dx_ref, gp_ref, upstream = _run_untiled(mlp_ref, x)
+    y_tile, dx_tile, gp_tile = _run_tiled(mlp_tile, x, upstream, shards=4)
+
+    # FMA reordering across shards introduces sub-eps noise in fp32; allow
+    # a small tolerance on outputs and grads.
+    assert torch.allclose(y_ref, y_tile, atol=1e-5, rtol=1e-5), (
+        f"forward mismatch max={((y_ref - y_tile).abs().max()).item()}"
+    )
+    assert torch.allclose(dx_ref, dx_tile, atol=1e-5, rtol=1e-5), (
+        f"dX mismatch max={((dx_ref - dx_tile).abs().max()).item()}"
+    )
+    for name in gp_ref:
+        diff = (gp_ref[name] - gp_tile[name]).abs().max().item()
+        assert diff < 1e-5, f"param-grad mismatch {name}: max={diff}"
+
+
+# ────────────────────────────── MoE parity ──────────────────────────────
+
+
+def test_tiled_moe_block_parity_fp32():
+    """Hand-rolled MoE block: tiled vs un-tiled fp32 parity."""
+    torch.manual_seed(1)
+    hidden, intermediate, seq = 64, 128, 64
+    moe_ref = TinyMoEBlock(hidden, intermediate, num_experts=8, top_k=2).to(DEVICE)
+    moe_tile = _clone_module(moe_ref)
+
+    x = torch.randn(1, seq, hidden, device=DEVICE)
+    y_ref, dx_ref, gp_ref, upstream = _run_untiled(moe_ref, x)
+    y_tile, dx_tile, gp_tile = _run_tiled(moe_tile, x, upstream, shards=4)
+
+    # The MoE forward involves index_add and routing, which is not
+    # numerically deterministic across different batch sizes for fp32 in
+    # all setups — but at the synthetic small scale we expect tight match.
+    assert torch.allclose(y_ref, y_tile, atol=1e-5, rtol=1e-5), (
+        f"MoE forward mismatch max={((y_ref - y_tile).abs().max()).item()}"
+    )
+    assert torch.allclose(dx_ref, dx_tile, atol=1e-5, rtol=1e-5), (
+        f"MoE dX mismatch max={((dx_ref - dx_tile).abs().max()).item()}"
+    )
+    for name in gp_ref:
+        diff = (gp_ref[name] - gp_tile[name]).abs().max().item()
+        assert diff < 1e-5, f"MoE param-grad mismatch {name}: max={diff}"
+
+
+# ─────────────────────── scattermoe-lora + tiled parity ───────────────────
+
+
+def _build_scattermoe_block(hidden, intermediate, num_experts, top_k, dtype, device):
+    """Build a minimal :class:`ScatterMoEGatedMLP`-compatible module.
+
+    Attributes are populated to match what :meth:`ScatterMoEGatedMLP.forward`
+    expects (``router``, ``input_linear``, ``output_linear``, ``activation``).
+    """
+    try:
+        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
+            ScatterMoEGatedMLP,
+        )
+    except ImportError:
+        pytest.skip("scattermoe_lora kernels not available")
+
+    block = ScatterMoEGatedMLP()
+    # router: a Linear-shaped router with top_k / num_experts attrs.
+    router = SimpleNamespace()
+    router.layer = nn.Linear(hidden, num_experts, bias=False, dtype=dtype).to(device)
+    router.top_k = top_k
+    router.num_experts = num_experts
+    block.router = router
+    # input_linear and output_linear store 3D weights: [E, *, *].
+    in_weight = nn.Parameter(
+        torch.randn(num_experts, 2 * intermediate, hidden, dtype=dtype, device=device)
+        * 0.02
+    )
+    out_weight = nn.Parameter(
+        torch.randn(num_experts, hidden, intermediate, dtype=dtype, device=device)
+        * 0.02
+    )
+    block.input_linear = nn.Module()
+    block.input_linear.weight = in_weight
+    block.output_linear = nn.Module()
+    block.output_linear.weight = out_weight
+    block.activation = nn.SiLU()
+    # Register the params so .parameters() walks them.
+    block.input_linear.register_parameter("weight", in_weight)
+    block.output_linear.register_parameter("weight", out_weight)
+    return block
+
+
+def test_tiled_scattermoe_gated_mlp_parity():
+    """ScatterMoEGatedMLP: tiled vs un-tiled fwd+bwd parity in bf16.
+
+    Uses the same tolerance scale as ``tests/integrations/test_scattermoe_lora_kernels.py``
+    (norm-relative error < 1% for weight grads is the established bar there
+    given the bf16 + tiled reduction order differences).
+    """
+    pytest.importorskip("triton")
+    torch.manual_seed(2)
+    hidden, intermediate = 64, 128
+    num_experts, top_k = 8, 2
+    seq = 64
+    dtype = torch.bfloat16
+
+    block_ref = _build_scattermoe_block(
+        hidden, intermediate, num_experts, top_k, dtype, DEVICE
+    )
+    block_tile = copy.deepcopy(block_ref)
+
+    x = torch.randn(1, seq, hidden, device=DEVICE, dtype=dtype)
+    y_ref, dx_ref, gp_ref, upstream = _run_untiled(block_ref, x)
+    y_tile, dx_tile, gp_tile = _run_tiled(block_tile, x, upstream, shards=4)
+
+    def _rel(a, b):
+        return ((a.float() - b.float()).norm() / (b.float().norm() + 1e-6)).item()
+
+    assert _rel(y_tile, y_ref) < 1e-2, (
+        f"scattermoe forward rel_err={_rel(y_tile, y_ref)}"
+    )
+    assert _rel(dx_tile, dx_ref) < 1e-2, (
+        f"scattermoe dX rel_err={_rel(dx_tile, dx_ref)}"
+    )
+    for name in gp_ref:
+        if name not in gp_tile:
+            continue
+        rel = _rel(gp_tile[name], gp_ref[name])
+        assert rel < 1e-2, f"scattermoe param-grad {name} rel_err={rel}"
+
+
+# ─────────────────── Patcher: MoE block discovery & dispatch ────────────────
+
+
+def test_resolve_moe_block_cls_picks_first_available():
+    """The patcher walks the suffix list in order; the first hit wins."""
+    from axolotl.monkeypatch.tiled_mlp.patch import _resolve_moe_block_cls
+
+    module = SimpleNamespace(
+        FooMoE=object,
+        FooMoeMLP=object,  # would also match but later in the list
+    )
+    cls = _resolve_moe_block_cls(module, "Foo")
+    assert cls is module.FooMoeMLP, "MoeMLP should be preferred over MoE"
+
+
+def test_resolve_moe_block_cls_returns_none_for_dense_model():
+    from axolotl.monkeypatch.tiled_mlp.patch import _resolve_moe_block_cls
+
+    module = SimpleNamespace(FooMLP=object)
+    assert _resolve_moe_block_cls(module, "Foo") is None
+
+
+# ─────────────── Grad parity under non-uniform per-token loss weights ─────────
+#
+# Sequence-dim sharding makes per-shard parameter-grads additive: the full
+# batch gradient is the SUM of shard contributions, not the mean. If the
+# tiled backward ever scaled by ``1/total_shards`` (the historical
+# ``GradientAccumulator.gradient_scale``), per-shard non-uniform weights
+# would make the mean visibly diverge from the un-tiled reference — uniform
+# loss weights can mask the bug because the per-shard means happen to add
+# up to a related-magnitude value. These tests exercise multiple
+# ``shards ∈ {1, 2, 4}`` with deliberately non-uniform per-token weights
+# so a regression in the scaling semantics fails loudly.
+
+
+def _run_untiled_with_upstream(mod, x, upstream):
+    """Un-tiled fwd+bwd given a fixed upstream grad."""
+    x = x.clone().detach().requires_grad_(True)
+    y = mod(x)
+    y.backward(upstream)
+    return y.detach().clone(), x.grad.detach().clone(), _grad_dict(mod)
+
+
+@pytest.mark.parametrize("shards", [1, 2, 4])
+def test_tiled_dense_mlp_grad_parity_nonuniform_weights(shards):
+    """Dense MLP: tiled vs un-tiled grad parity with non-uniform per-token weights.
+
+    The upstream grad's magnitude varies per token (non-uniform loss weights),
+    so each shard contributes a distinct fraction of the total parameter grad.
+    A mean-vs-sum bug shows up as a ``shards``-dependent scaling error.
+    """
+    torch.manual_seed(100 + shards)
+    hidden, intermediate = 64, 128
+    seq = 128
+    mlp_ref = TinyDenseMLP(hidden, intermediate).to(DEVICE)
+    mlp_tile = _clone_module(mlp_ref)
+
+    x = torch.randn(1, seq, hidden, device=DEVICE)
+    # Non-uniform per-token weights make per-shard grad contributions
+    # distinct, exposing any incorrect averaging.
+    per_token_w = torch.linspace(0.1, 3.0, seq, device=DEVICE).view(1, seq, 1)
+    upstream = torch.randn(1, seq, hidden, device=DEVICE) * per_token_w
+
+    y_ref, dx_ref, gp_ref = _run_untiled_with_upstream(mlp_ref, x, upstream)
+    y_tile, dx_tile, gp_tile = _run_tiled(mlp_tile, x, upstream, shards=shards)
+
+    assert torch.allclose(y_ref, y_tile, atol=1e-5, rtol=1e-5), (
+        f"shards={shards}: forward mismatch max={((y_ref - y_tile).abs().max()).item()}"
+    )
+    assert torch.allclose(dx_ref, dx_tile, atol=1e-5, rtol=1e-5), (
+        f"shards={shards}: dX mismatch max={((dx_ref - dx_tile).abs().max()).item()}"
+    )
+    for name in gp_ref:
+        diff = (gp_ref[name] - gp_tile[name]).abs().max().item()
+        ref_norm = gp_ref[name].abs().max().item() + 1e-8
+        # Tight bound in fp32; rel error must be tiny so a 1/N or N
+        # scaling error (which would give 25%-400% relative drift) is
+        # impossible to miss.
+        assert diff / ref_norm < 1e-4, (
+            f"shards={shards}: param-grad {name} rel_err={diff / ref_norm}"
+        )
+
+
+@pytest.mark.parametrize("shards", [1, 2, 4])
+def test_tiled_moe_grad_parity_nonuniform_weights(shards):
+    """MoE block: tiled vs un-tiled grad parity with non-uniform per-token weights."""
+    torch.manual_seed(200 + shards)
+    hidden, intermediate = 64, 128
+    seq = 128
+    moe_ref = TinyMoEBlock(hidden, intermediate, num_experts=8, top_k=2).to(DEVICE)
+    moe_tile = _clone_module(moe_ref)
+
+    x = torch.randn(1, seq, hidden, device=DEVICE)
+    per_token_w = torch.linspace(0.1, 3.0, seq, device=DEVICE).view(1, seq, 1)
+    upstream = torch.randn(1, seq, hidden, device=DEVICE) * per_token_w
+
+    y_ref, dx_ref, gp_ref = _run_untiled_with_upstream(moe_ref, x, upstream)
+    y_tile, dx_tile, gp_tile = _run_tiled(moe_tile, x, upstream, shards=shards)
+
+    assert torch.allclose(y_ref, y_tile, atol=1e-5, rtol=1e-5), (
+        f"shards={shards}: MoE forward mismatch "
+        f"max={((y_ref - y_tile).abs().max()).item()}"
+    )
+    assert torch.allclose(dx_ref, dx_tile, atol=1e-5, rtol=1e-5), (
+        f"shards={shards}: MoE dX mismatch "
+        f"max={((dx_ref - dx_tile).abs().max()).item()}"
+    )
+    for name in gp_ref:
+        diff = (gp_ref[name] - gp_tile[name]).abs().max().item()
+        ref_norm = gp_ref[name].abs().max().item() + 1e-8
+        assert diff / ref_norm < 1e-4, (
+            f"shards={shards}: MoE param-grad {name} rel_err={diff / ref_norm}"
+        )
+
+
+@pytest.mark.parametrize("shards", [1, 2, 4])
+def test_tiled_dense_mlp_grad_parity_bf16(shards):
+    """Dense MLP: bf16 grad parity at the param dtype (no fp32 accumulator).
+
+    Guards the default param-dtype accumulator path against regression.
+    bf16 reduction order across shards means we use a relative tolerance
+    rather than bitwise equality.
+    """
+    torch.manual_seed(300 + shards)
+    hidden, intermediate = 64, 128
+    seq = 128
+    dtype = torch.bfloat16
+    mlp_ref = TinyDenseMLP(hidden, intermediate, dtype=dtype).to(DEVICE)
+    mlp_tile = _clone_module(mlp_ref)
+
+    x = torch.randn(1, seq, hidden, device=DEVICE, dtype=dtype)
+    per_token_w = torch.linspace(0.1, 3.0, seq, device=DEVICE).view(1, seq, 1).to(dtype)
+    upstream = (torch.randn(1, seq, hidden, device=DEVICE) * per_token_w).to(dtype)
+
+    y_ref, dx_ref, gp_ref = _run_untiled_with_upstream(mlp_ref, x, upstream)
+    y_tile, dx_tile, gp_tile = _run_tiled(mlp_tile, x, upstream, shards=shards)
+
+    def _rel(a, b):
+        return ((a.float() - b.float()).norm() / (b.float().norm() + 1e-6)).item()
+
+    assert _rel(y_tile, y_ref) < 5e-3, (
+        f"shards={shards}: bf16 forward rel_err={_rel(y_tile, y_ref)}"
+    )
+    assert _rel(dx_tile, dx_ref) < 5e-3, (
+        f"shards={shards}: bf16 dX rel_err={_rel(dx_tile, dx_ref)}"
+    )
+    for name in gp_ref:
+        rel = _rel(gp_tile[name], gp_ref[name])
+        # Tight bound — a 1/N scaling bug would put rel_err ≈ (N-1)/N,
+        # which is far above this threshold for any N ≥ 2.
+        assert rel < 5e-3, f"shards={shards}: bf16 param-grad {name} rel_err={rel}"
+
+
+def test_tiled_grad_accumulator_dtype_matches_param_dtype():
+    """Regression guard: TiledMLP backward should accumulate at param dtype
+    by default (not fp32), so the on-the-fly accumulator does not double
+    the parameter-side memory footprint in bf16 training.
+
+    We snapshot the accumulator dtype by patching ``torch.zeros_like`` to
+    record the dtype of zero-tensors allocated for compute params during
+    backward. The assertion is that none of those allocations request
+    fp32 when the params are bf16.
+    """
+    from axolotl.monkeypatch.tiled_mlp.base import TiledMLP
+
+    torch.manual_seed(42)
+    dtype = torch.bfloat16
+    mlp = TinyDenseMLP(64, 128, dtype=dtype).to(DEVICE)
+    compute_params = [p for p in mlp.parameters() if p.requires_grad]
+    param_ids = {id(p) for p in compute_params}
+
+    x = torch.randn(1, 64, 64, device=DEVICE, dtype=dtype)
+    upstream = torch.randn(1, 64, 64, device=DEVICE, dtype=dtype)
+
+    real_zeros_like = torch.zeros_like
+    allocated_dtypes: list[torch.dtype] = []
+
+    def spy_zeros_like(t, *args, **kwargs):
+        # Only record allocations whose shape matches one of the
+        # compute params (the accumulator buffers we care about).
+        if id(t) in param_ids:
+            allocated_dtypes.append(kwargs.get("dtype", t.dtype))
+        return real_zeros_like(t, *args, **kwargs)
+
+    x_req = x.clone().detach().requires_grad_(True)
+    torch.zeros_like = spy_zeros_like
+    try:
+        y = TiledMLP.apply(type(mlp).forward, mlp, x_req, 4, compute_params)
+        y.backward(upstream)
+    finally:
+        torch.zeros_like = real_zeros_like
+
+    assert allocated_dtypes, "expected accumulator allocations to be observed"
+    # Default path must NOT pre-allocate fp32 buffers when params are bf16.
+    assert all(d == dtype for d in allocated_dtypes), (
+        f"expected accumulator dtype == {dtype}, got {allocated_dtypes}"
+    )
diff --git a/tests/integrations/test_expert_parallel.py b/tests/integrations/test_expert_parallel.py
index 8a5990d528..ab7312a0e0 100644
--- a/tests/integrations/test_expert_parallel.py
+++ b/tests/integrations/test_expert_parallel.py
@@ -1,6 +1,10 @@
 """Tests for the Expert-Parallel (DeepEP) integration."""
 
 import os
+import queue as queue_mod
+import socket
+import time
+from datetime import timedelta
 from importlib.util import find_spec
 
 import pytest
@@ -180,6 +184,10 @@ def setup_method(self):
             os.environ.setdefault("WORLD_SIZE", "1")
             dist.init_process_group(backend="gloo", rank=0, world_size=1)
 
+    def teardown_method(self):
+        if dist.is_initialized():
+            dist.destroy_process_group()
+
     def test_no_op_at_world_size_1(self):
         block = _build_qwen3moe_block(num_experts=16)
         original_shape = tuple(block.experts.gate_up_proj.shape)
@@ -279,7 +287,12 @@ def _ep_topology_worker(rank, world_size, ep_size, dp_shard_size, port, q):
     os.environ["MASTER_PORT"] = str(port)
     os.environ["RANK"] = str(rank)
     os.environ["WORLD_SIZE"] = str(world_size)
-    dist.init_process_group(backend="gloo", rank=rank, world_size=world_size)
+    dist.init_process_group(
+        backend="gloo",
+        rank=rank,
+        world_size=world_size,
+        timeout=timedelta(seconds=120),
+    )
 
     try:
         from types import SimpleNamespace
@@ -316,7 +329,12 @@ def _ep_topology_worker_expects_error(
     os.environ["MASTER_PORT"] = str(port)
     os.environ["RANK"] = str(rank)
     os.environ["WORLD_SIZE"] = str(world_size)
-    dist.init_process_group(backend="gloo", rank=rank, world_size=world_size)
+    dist.init_process_group(
+        backend="gloo",
+        rank=rank,
+        world_size=world_size,
+        timeout=timedelta(seconds=120),
+    )
     try:
         from types import SimpleNamespace
 
@@ -337,13 +355,45 @@ def _ep_topology_worker_expects_error(
         ExpertParallelPlugin._device_mesh = None
 
 
-def _spawn_topology_check(world_size, ep_size, dp_shard_size, port_base):
+def _find_free_port() -> int:
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("127.0.0.1", 0))
+        return s.getsockname()[1]
+
+
+def _collect_worker_results(procs, q, world_size, timeout=120):
+    """Collect one result per worker, bailing out early if a worker dies.
+
+    A bare ``q.get(timeout=...)`` blocks the full timeout even after a worker has
+    crashed without reporting; here we stop once all workers have exited so an
+    infra failure surfaces as a clear assertion instead of a silent hang.
+    """
+    results = []
+    deadline = time.monotonic() + timeout
+    while len(results) < world_size and time.monotonic() < deadline:
+        try:
+            results.append(q.get(timeout=5))
+        except queue_mod.Empty:
+            if all(not p.is_alive() for p in procs):
+                break
+    for p in procs:
+        p.join(timeout=20)
+    exitcodes = [p.exitcode for p in procs]
+    assert len(results) == world_size, (
+        f"only {len(results)}/{world_size} workers reported; exitcodes={exitcodes}"
+    )
+    assert all(code == 0 for code in exitcodes), f"worker exitcodes={exitcodes}"
+    return results
+
+
+def _spawn_topology_check(world_size, ep_size, dp_shard_size):
     ctx = mp.get_context("spawn")
     q = ctx.Queue()
+    port = _find_free_port()
     procs = [
         ctx.Process(
             target=_ep_topology_worker,
-            args=(r, world_size, ep_size, dp_shard_size, port_base, q),
+            args=(r, world_size, ep_size, dp_shard_size, port, q),
         )
         for r in range(world_size)
     ]
@@ -363,10 +413,7 @@ def test_world4_ep2_dp2_orthogonal(self):
         """At world=4 with ep=2 and dp_shard=2, EP groups must be strided
         ({0,2}, {1,3}) and dp_shard groups contiguous ({0,1}, {2,3}).
         """
-        # Use large random-ish port base to avoid collision with anything else.
-        results = _spawn_topology_check(
-            world_size=4, ep_size=2, dp_shard_size=2, port_base=37610
-        )
+        results = _spawn_topology_check(world_size=4, ep_size=2, dp_shard_size=2)
         # Build per-rank groupings from results.
         ep_groups_by_rank = {r: tuple(eps) for r, eps, _ in results}
         dp_groups_by_rank = {r: tuple(dps) for r, _, dps in results}
@@ -385,29 +432,29 @@ def test_world4_ep2_dp2_orthogonal(self):
 
     def test_world4_ep4_dp1_uses_world(self):
         """ep_size == world_size short-circuits to dist.group.WORLD."""
-        results = _spawn_topology_check(
-            world_size=4, ep_size=4, dp_shard_size=1, port_base=37710
-        )
+        results = _spawn_topology_check(world_size=4, ep_size=4, dp_shard_size=1)
         for rank, ep_ranks, dp_ranks in results:
             assert ep_ranks == [0, 1, 2, 3], (rank, ep_ranks)
             assert dp_ranks is None  # no 2D mesh built
 
-    def test_world4_ep2_dp1_invalid_product_raises(self):
-        """ep<world without dp_shard filling the rest must raise (product mismatch)."""
+    def _spawn_expects_error(self, ep_size, dp_shard_size, world_size=4):
         ctx = mp.get_context("spawn")
         q = ctx.Queue()
+        port = _find_free_port()
         procs = [
             ctx.Process(
                 target=_ep_topology_worker_expects_error,
-                args=(r, 4, 2, 1, 37810, q),
+                args=(r, world_size, ep_size, dp_shard_size, port, q),
             )
-            for r in range(4)
+            for r in range(world_size)
         ]
         for p in procs:
             p.start()
-        results = [q.get(timeout=60) for _ in range(4)]
-        for p in procs:
-            p.join(timeout=10)
+        return _collect_worker_results(procs, q, world_size)
+
+    def test_world4_ep2_dp1_invalid_product_raises(self):
+        """ep<world without dp_shard filling the rest must raise (product mismatch)."""
+        results = self._spawn_expects_error(ep_size=2, dp_shard_size=1)
         for rank, err in results:
             assert err is not None, (
                 f"rank {rank} did not raise; expected product mismatch"
@@ -416,20 +463,7 @@ def test_world4_ep2_dp1_invalid_product_raises(self):
 
     def test_mesh_axis_product_mismatch_raises(self):
         """world=4 with ep=2*dp=4 (product 8 != 4) raises clearly."""
-        ctx = mp.get_context("spawn")
-        q = ctx.Queue()
-        procs = [
-            ctx.Process(
-                target=_ep_topology_worker_expects_error,
-                args=(r, 4, 2, 4, 37910, q),
-            )
-            for r in range(4)
-        ]
-        for p in procs:
-            p.start()
-        results = [q.get(timeout=60) for _ in range(4)]
-        for p in procs:
-            p.join(timeout=10)
+        results = self._spawn_expects_error(ep_size=2, dp_shard_size=4)
         for rank, err in results:
             assert err is not None, f"rank {rank} did not raise"
             assert "must equal" in err.lower() or "world_size" in err.lower(), err
diff --git a/tests/integrations/test_gemma4_moe.py b/tests/integrations/test_gemma4_moe.py
index 412d49b2f9..e3408c7822 100644
--- a/tests/integrations/test_gemma4_moe.py
+++ b/tests/integrations/test_gemma4_moe.py
@@ -462,94 +462,6 @@ class TestGemma4SonicMoE:
     import are skipped on unsupported GPUs.
     """
 
-    @pytest.mark.skipif(
-        not _can_import_sonicmoe(),
-        reason="sonicmoe requires Hopper/Blackwell GPU",
-    )
-    def test_gemma4_routing_function_config(self, gemma4_config):
-        """Gemma4 is registered with correct routing config."""
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            get_model_moe_config,
-        )
-
-        routing_fn, activation, router_attr = get_model_moe_config("gemma4_text")
-
-        assert router_attr == "router"
-        assert routing_fn is not None
-        assert routing_fn.__name__ == "gemma4_routing"
-
-        from sonicmoe.enums import ActivationType
-
-        assert activation == ActivationType.GEGLU
-
-    @pytest.mark.skipif(
-        not _can_import_sonicmoe(),
-        reason="sonicmoe requires Hopper/Blackwell GPU",
-    )
-    def test_gemma4_routing_matches_reference(self, gemma4_config):
-        """Routing function output matches reference Gemma4TextRouter."""
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            get_model_moe_config,
-        )
-
-        routing_fn, _, _ = get_model_moe_config("gemma4_text")
-        H = gemma4_config["hidden_size"]
-        E = gemma4_config["num_experts"]
-        K = gemma4_config["top_k"]
-        T = 16
-
-        router = Gemma4TextRouter(H, E, K)
-        nn.init.normal_(router.proj.weight, std=0.01)
-
-        class MockGemma4MoeBlock:
-            pass
-
-        mock_block = MockGemma4MoeBlock()
-        mock_block.router = router
-
-        hidden_states = torch.randn(T, H)
-
-        # Reference
-        _ref_probs, ref_weights, ref_indices = router(hidden_states)
-
-        # Routing function
-        flat_scores, flat_token_idx, flat_expert_idx, router_logits = routing_fn(
-            hidden_states, mock_block
-        )
-
-        # Check shapes
-        assert flat_scores.shape == (T * K,)
-        assert flat_token_idx.shape == (T * K,)
-        assert flat_expert_idx.shape == (T * K,)
-        assert router_logits.shape == (T, E)
-
-        # Reconstruct per-token routing from flat output and compare
-        for t in range(T):
-            mask = flat_token_idx == t
-            assert mask.sum() == K, f"Token {t} should have {K} entries"
-
-            flat_experts_for_t = flat_expert_idx[mask].sort().values
-            ref_experts_for_t = ref_indices[t].sort().values.to(torch.int32)
-            assert torch.equal(flat_experts_for_t, ref_experts_for_t), (
-                f"Token {t}: experts mismatch"
-            )
-
-        # Verify scores match reference per-token
-        for t in range(T):
-            mask = flat_token_idx == t
-            flat_experts_t = flat_expert_idx[mask]
-            flat_scores_t = flat_scores[mask]
-
-            sort_idx = flat_experts_t.argsort()
-            flat_scores_sorted = flat_scores_t[sort_idx]
-
-            ref_sort_idx = ref_indices[t].argsort()
-            ref_scores_sorted = ref_weights[t][ref_sort_idx].float()
-
-            torch.testing.assert_close(
-                flat_scores_sorted, ref_scores_sorted, atol=1e-4, rtol=1e-4
-            )
-
     def test_gemma4_weight_layout_compatible(self, gemma4_config):
         """Verify Gemma4 expert weight layout is compatible with SonicMoE."""
         E = gemma4_config["num_experts"]
@@ -582,12 +494,6 @@ def test_gemma4_is_experts_only_model(self):
         assert cls is not None
         assert cls.__name__ == "Gemma4TextExperts"
 
-    def test_gemma4_not_in_sparse_moe_block(self):
-        """Verify gemma4_text is NOT in SPARSE_MOE_BLOCK (has no SparseMoeBlock)."""
-        from axolotl.integrations.kernels.constants import SPARSE_MOE_BLOCK
-
-        assert "gemma4_text" not in SPARSE_MOE_BLOCK
-
 
 # ============================================================================
 # Integration Tests (full layer with real model config)
@@ -790,7 +696,7 @@ def test_register_scattermoe_in_experts_interface(self):
         """register_scattermoe_experts adds 'scattermoe' to the global interface."""
         from transformers.integrations.moe import ALL_EXPERTS_FUNCTIONS
 
-        from axolotl.integrations.kernels.libs.scattermoe_lora.gemma4_experts import (
+        from axolotl.integrations.kernels.libs.scattermoe_lora.experts import (
             register_scattermoe_experts,
             scattermoe_experts_forward,
         )
@@ -806,7 +712,7 @@ def test_experts_implementation_dispatches_to_scattermoe(self, device):
             Gemma4TextExperts as HFGemma4TextExperts,
         )
 
-        from axolotl.integrations.kernels.libs.scattermoe_lora.gemma4_experts import (
+        from axolotl.integrations.kernels.libs.scattermoe_lora.experts import (
             register_scattermoe_experts,
         )
 
@@ -853,7 +759,7 @@ def test_validation_accepts_scattermoe(self):
         """get_correct_experts_implementation accepts 'scattermoe' after registration."""
         from transformers.modeling_utils import PreTrainedModel
 
-        from axolotl.integrations.kernels.libs.scattermoe_lora.gemma4_experts import (
+        from axolotl.integrations.kernels.libs.scattermoe_lora.experts import (
             register_scattermoe_experts,
         )
 
@@ -869,7 +775,7 @@ def test_eager_still_works_after_registration(self, device):
             Gemma4TextExperts as HFGemma4TextExperts,
         )
 
-        from axolotl.integrations.kernels.libs.scattermoe_lora.gemma4_experts import (
+        from axolotl.integrations.kernels.libs.scattermoe_lora.experts import (
             register_scattermoe_experts,
         )
 
@@ -982,7 +888,7 @@ def model_setup(self, request, device):
         """Create an Experts instance for each model type."""
         import importlib
 
-        from axolotl.integrations.kernels.libs.scattermoe_lora.gemma4_experts import (
+        from axolotl.integrations.kernels.libs.scattermoe_lora.experts import (
             register_scattermoe_experts,
         )
 
diff --git a/tests/integrations/test_kd_liger.py b/tests/integrations/test_kd_liger.py
new file mode 100644
index 0000000000..f4a1d07563
--- /dev/null
+++ b/tests/integrations/test_kd_liger.py
@@ -0,0 +1,149 @@
+"""
+Unit tests for LigerFusedLinearKLTopKLogprobFunction autograd contract.
+
+Regression tests for the has_aux=True contract violation in
+LigerFusedLinearKLTopKLogprobFunction: CE loss must contribute to the
+backward graph, not be silently dropped as aux.
+"""
+
+import torch
+
+from axolotl.integrations.kd.kernels.liger import LigerFusedLinearKLTopKLogprobLoss
+
+BATCH_SIZE = 1
+SEQ_LEN = 4
+VOCAB_SIZE = 16
+HIDDEN_DIM = 8
+TOP_K = 4
+
+
+def make_inputs(seed: int):
+    torch.manual_seed(seed)
+    student_hidden_states = torch.randn(
+        BATCH_SIZE, SEQ_LEN, HIDDEN_DIM, dtype=torch.float32, requires_grad=True
+    )
+    lm_head_weight = torch.randn(
+        VOCAB_SIZE, HIDDEN_DIM, dtype=torch.float32, requires_grad=True
+    )
+    target_token_ids = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LEN, TOP_K))
+    target_logprobs = torch.log_softmax(torch.randn(BATCH_SIZE, SEQ_LEN, TOP_K), dim=-1)
+    target_mask = torch.ones(BATCH_SIZE, SEQ_LEN, TOP_K, dtype=torch.bool)
+    true_labels = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LEN))
+    # Set an interior position to ignore_index. After the left-shift inside the
+    # kernel (true_labels = pad(...); true_labels[:, 1:]), index [0, 2] in the
+    # input becomes index 1 in the flattened CE labels, so this exercises
+    # ignore_index handling in F.cross_entropy.
+    true_labels[0, 2] = -100
+    return {
+        "student_hidden_states": student_hidden_states,
+        "lm_head_weight": lm_head_weight,
+        "target_token_ids": target_token_ids,
+        "target_logprobs": target_logprobs,
+        "target_mask": target_mask,
+        "true_labels": true_labels,
+    }
+
+
+def test_ce_only_gradient_flows_to_student_hidden_states():
+    """Regression: with weight_soft_loss=0, weight_hard_loss=1 the CE gradient must reach inputs.
+
+    Pre-fix, has_aux=True dropped CE from the backward graph and student_input.grad was all zero.
+    """
+    inputs = make_inputs(seed=0)
+    loss_fn = LigerFusedLinearKLTopKLogprobLoss(
+        weight_hard_loss=1.0,
+        weight_soft_loss=0.0,
+        temperature=1.0,
+        beta=0.0,
+        compiled=False,
+        chunk_size=2,
+        compute_ce_loss=True,
+    )
+    loss = loss_fn(
+        inputs["lm_head_weight"],
+        inputs["student_hidden_states"],
+        inputs["target_token_ids"],
+        inputs["target_logprobs"],
+        inputs["target_mask"],
+        inputs["true_labels"],
+    )
+    loss.backward()
+
+    assert inputs["student_hidden_states"].grad is not None
+    assert torch.isfinite(inputs["student_hidden_states"].grad).all()
+    assert inputs["student_hidden_states"].grad.abs().sum().item() > 0
+
+    assert inputs["lm_head_weight"].grad is not None
+    assert torch.isfinite(inputs["lm_head_weight"].grad).all()
+    assert inputs["lm_head_weight"].grad.abs().sum().item() > 0
+
+
+def test_kd_mix_gradient_changes_when_ce_weight_changes():
+    """Regression: in KD-mix mode, increasing weight_hard_loss must change the gradient.
+
+    Two runs with identical RNG, differing only in weight_hard_loss (0.0 vs 0.5). Pre-fix
+    both runs produced identical grads because CE was silently dropped from backward.
+    """
+    inputs_a = make_inputs(seed=42)
+    loss_fn_a = LigerFusedLinearKLTopKLogprobLoss(
+        weight_soft_loss=0.5,
+        weight_hard_loss=0.0,
+        temperature=1.0,
+        beta=0.0,
+        compiled=False,
+        chunk_size=2,
+        compute_ce_loss=True,
+    )
+    loss_a = loss_fn_a(
+        inputs_a["lm_head_weight"],
+        inputs_a["student_hidden_states"],
+        inputs_a["target_token_ids"],
+        inputs_a["target_logprobs"],
+        inputs_a["target_mask"],
+        inputs_a["true_labels"],
+    )
+    loss_a.backward()
+    grad_a_h = inputs_a["student_hidden_states"].grad.detach().clone()
+    grad_a_w = inputs_a["lm_head_weight"].grad.detach().clone()
+
+    inputs_b = make_inputs(seed=42)
+    loss_fn_b = LigerFusedLinearKLTopKLogprobLoss(
+        weight_soft_loss=0.5,
+        weight_hard_loss=0.5,
+        temperature=1.0,
+        beta=0.0,
+        compiled=False,
+        chunk_size=2,
+        compute_ce_loss=True,
+    )
+    loss_b = loss_fn_b(
+        inputs_b["lm_head_weight"],
+        inputs_b["student_hidden_states"],
+        inputs_b["target_token_ids"],
+        inputs_b["target_logprobs"],
+        inputs_b["target_mask"],
+        inputs_b["true_labels"],
+    )
+    loss_b.backward()
+    grad_b_h = inputs_b["student_hidden_states"].grad.detach().clone()
+    grad_b_w = inputs_b["lm_head_weight"].grad.detach().clone()
+
+    assert grad_a_h is not None
+    assert torch.isfinite(grad_a_h).all()
+    assert grad_b_h is not None
+    assert torch.isfinite(grad_b_h).all()
+
+    assert grad_a_w is not None
+    assert torch.isfinite(grad_a_w).all()
+    assert grad_b_w is not None
+    assert torch.isfinite(grad_b_w).all()
+
+    diff = (grad_b_h - grad_a_h).abs().sum().item()
+    assert not torch.allclose(grad_a_h, grad_b_h, atol=1e-6, rtol=1e-5)
+    assert diff > 1e-4, f"CE gradient contribution suspiciously small: {diff}"
+
+    diff_w = (grad_b_w - grad_a_w).abs().sum().item()
+    assert not torch.allclose(grad_a_w, grad_b_w, atol=1e-6, rtol=1e-5)
+    assert diff_w > 1e-4, (
+        f"CE weight-gradient contribution suspiciously small: {diff_w}"
+    )
diff --git a/tests/integrations/test_kd_trainer_direct_loss.py b/tests/integrations/test_kd_trainer_direct_loss.py
new file mode 100644
index 0000000000..ce465817f6
--- /dev/null
+++ b/tests/integrations/test_kd_trainer_direct_loss.py
@@ -0,0 +1,216 @@
+"""Tests for AxolotlKDTrainer.compute_loss."""
+
+from types import SimpleNamespace
+from unittest.mock import MagicMock
+
+import pytest
+import torch
+import torch.nn as nn
+
+# Skip the entire module if the KD trainer can't be imported (env not set up).
+axolotl_kd_trainer = pytest.importorskip("axolotl.integrations.kd.trainer")
+AxolotlKDTrainer = axolotl_kd_trainer.AxolotlKDTrainer
+_resolve_lm_head = axolotl_kd_trainer._resolve_lm_head
+
+
+HIDDEN = 8
+VOCAB = 16
+SEQ = 4
+BSZ = 2
+TOP_K = 3
+
+
+def test_resolve_lm_head_standard():
+    model = SimpleNamespace(lm_head=nn.Linear(HIDDEN, VOCAB, bias=False))
+    assert _resolve_lm_head(model) is model.lm_head
+
+
+def test_resolve_lm_head_multimodal():
+    lm_head = nn.Linear(HIDDEN, VOCAB, bias=False)
+    language_model = SimpleNamespace(lm_head=lm_head)
+    model = SimpleNamespace(language_model=language_model)
+    assert _resolve_lm_head(model) is lm_head
+
+
+def test_resolve_lm_head_peft_wrapped():
+    lm_head = nn.Linear(HIDDEN, VOCAB, bias=False)
+    inner = SimpleNamespace(lm_head=lm_head)
+    model = SimpleNamespace(get_base_model=lambda: inner)
+    assert _resolve_lm_head(model) is lm_head
+
+
+def test_resolve_lm_head_peft_wrapped_multimodal():
+    lm_head = nn.Linear(HIDDEN, VOCAB, bias=False)
+    language_model = SimpleNamespace(lm_head=lm_head)
+    inner = SimpleNamespace(language_model=language_model)
+    model = SimpleNamespace(get_base_model=lambda: inner)
+    assert _resolve_lm_head(model) is lm_head
+
+
+def test_resolve_lm_head_missing_raises():
+    model = SimpleNamespace()
+    with pytest.raises(AttributeError, match="could not find lm_head"):
+        _resolve_lm_head(model)
+
+
+def _build_fake_model():
+    class TinyModel(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.lm_head = nn.Linear(HIDDEN, VOCAB, bias=False)
+
+        def forward(self, input_ids=None, output_hidden_states=False, **kw):
+            assert output_hidden_states is True
+            assert "labels" not in kw
+            assert "target_token_ids" not in kw
+            assert "target_logprobs" not in kw
+            assert "target_mask" not in kw
+            assert "num_items_in_batch" not in kw
+            hidden = torch.randn(BSZ, SEQ, HIDDEN, requires_grad=True)
+            return SimpleNamespace(
+                loss=None,
+                logits=None,
+                hidden_states=(hidden,),
+                past_key_values=None,
+                attentions=None,
+            )
+
+    return TinyModel()
+
+
+def _build_inputs(labels=None):
+    if labels is None:
+        labels = torch.randint(0, VOCAB, (BSZ, SEQ))
+    return {
+        "input_ids": torch.randint(0, VOCAB, (BSZ, SEQ)),
+        "labels": labels,
+        "target_token_ids": torch.randint(0, VOCAB, (BSZ, SEQ, TOP_K)),
+        "target_logprobs": torch.log_softmax(torch.randn(BSZ, SEQ, TOP_K), dim=-1),
+        "target_mask": torch.ones(BSZ, SEQ, TOP_K, dtype=torch.bool),
+    }
+
+
+def test_compute_loss_calls_kd_loss_with_correct_shapes():
+    kd_loss_fn = MagicMock(return_value=torch.tensor(2.0, requires_grad=True))
+    fake_self = SimpleNamespace(
+        args=SimpleNamespace(sample_packing=False),
+        model_accepts_loss_kwargs=True,
+        _kd_loss_fn=kd_loss_fn,
+    )
+    model = _build_fake_model()
+    inputs = _build_inputs()
+    expected_labels = inputs["labels"].clone()
+    expected_target_ids = inputs["target_token_ids"].clone()
+
+    loss = AxolotlKDTrainer.compute_loss(fake_self, model, inputs)
+
+    kd_loss_fn.assert_called_once()
+    args, kwargs = kd_loss_fn.call_args
+    assert args[0].shape == (VOCAB, HIDDEN)
+    assert args[1].shape == (BSZ, SEQ, HIDDEN)
+    assert args[2].shape == expected_target_ids.shape
+    assert "true_labels" in kwargs
+    assert kwargs["true_labels"].shape == expected_labels.shape
+    assert torch.isfinite(loss).all()
+
+
+def test_compute_loss_divides_by_num_items_in_batch_from_labels():
+    kd_loss_fn = MagicMock(return_value=torch.tensor(8.0))
+    fake_self = SimpleNamespace(
+        args=SimpleNamespace(sample_packing=False),
+        model_accepts_loss_kwargs=True,
+        _kd_loss_fn=kd_loss_fn,
+    )
+    model = _build_fake_model()
+    labels = torch.tensor([[1, 2, -100, 3], [4, -100, -100, -100]])
+    inputs = _build_inputs(labels=labels)
+
+    loss = AxolotlKDTrainer.compute_loss(fake_self, model, inputs)
+    assert torch.isclose(loss, torch.tensor(2.0))
+
+
+def test_compute_loss_uses_explicit_num_items_in_batch():
+    kd_loss_fn = MagicMock(return_value=torch.tensor(8.0))
+    fake_self = SimpleNamespace(
+        args=SimpleNamespace(sample_packing=False),
+        model_accepts_loss_kwargs=True,
+        _kd_loss_fn=kd_loss_fn,
+    )
+    model = _build_fake_model()
+    inputs = _build_inputs()
+
+    loss = AxolotlKDTrainer.compute_loss(fake_self, model, inputs, num_items_in_batch=2)
+    assert torch.isclose(loss, torch.tensor(4.0))
+
+
+def test_compute_loss_does_not_divide_when_zero_items():
+    kd_loss_fn = MagicMock(return_value=torch.tensor(8.0))
+    fake_self = SimpleNamespace(
+        args=SimpleNamespace(sample_packing=False),
+        model_accepts_loss_kwargs=True,
+        _kd_loss_fn=kd_loss_fn,
+    )
+    model = _build_fake_model()
+    labels = torch.full((BSZ, SEQ), -100)
+    inputs = _build_inputs(labels=labels)
+
+    loss = AxolotlKDTrainer.compute_loss(fake_self, model, inputs)
+    assert torch.isclose(loss, torch.tensor(8.0))
+
+
+def test_compute_loss_raises_when_kd_keys_missing():
+    kd_loss_fn = MagicMock(return_value=torch.tensor(1.0))
+    fake_self = SimpleNamespace(
+        args=SimpleNamespace(sample_packing=False),
+        model_accepts_loss_kwargs=True,
+        _kd_loss_fn=kd_loss_fn,
+    )
+    model = _build_fake_model()
+    inputs = {
+        "input_ids": torch.randint(0, VOCAB, (BSZ, SEQ)),
+        "labels": torch.randint(0, VOCAB, (BSZ, SEQ)),
+    }
+    with pytest.raises(KeyError, match="KD batch missing required keys"):
+        AxolotlKDTrainer.compute_loss(fake_self, model, inputs)
+
+
+def test_compute_loss_raises_when_hidden_states_missing():
+    kd_loss_fn = MagicMock(return_value=torch.tensor(1.0))
+    fake_self = SimpleNamespace(
+        args=SimpleNamespace(sample_packing=False),
+        model_accepts_loss_kwargs=True,
+        _kd_loss_fn=kd_loss_fn,
+    )
+
+    class NoHiddenStatesModel(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.lm_head = nn.Linear(HIDDEN, VOCAB, bias=False)
+
+        def forward(self, **kw):
+            return SimpleNamespace(
+                loss=None,
+                logits=None,
+                hidden_states=None,
+                past_key_values=None,
+                attentions=None,
+            )
+
+    inputs = _build_inputs()
+    with pytest.raises(RuntimeError, match="did not return hidden_states"):
+        AxolotlKDTrainer.compute_loss(fake_self, NoHiddenStatesModel(), inputs)
+
+
+def test_compute_loss_does_not_mutate_caller_inputs():
+    kd_loss_fn = MagicMock(return_value=torch.tensor(1.0))
+    fake_self = SimpleNamespace(
+        args=SimpleNamespace(sample_packing=False),
+        model_accepts_loss_kwargs=True,
+        _kd_loss_fn=kd_loss_fn,
+    )
+    model = _build_fake_model()
+    inputs = _build_inputs()
+    original_keys = set(inputs.keys())
+
+    AxolotlKDTrainer.compute_loss(fake_self, model, inputs)
+    assert set(inputs.keys()) == original_keys
diff --git a/tests/integrations/test_liger_qwen_vl_rope_default.py b/tests/integrations/test_liger_qwen_vl_rope_default.py
new file mode 100644
index 0000000000..18b3fa8a1d
--- /dev/null
+++ b/tests/integrations/test_liger_qwen_vl_rope_default.py
@@ -0,0 +1,124 @@
+"""``cfg.liger_rope=None`` must resolve to ``True`` for Qwen-VL so the upstream fused (m-)rope kernel is installed."""
+
+from unittest.mock import patch
+
+import pytest
+
+
+@pytest.mark.parametrize(
+    "model_type",
+    [
+        "qwen2_vl",
+        "qwen2_5_vl",
+        "qwen3_vl",
+        "qwen3_vl_moe",
+        "qwen2_vl_text",
+        "qwen2_5_vl_text",
+        "qwen3_vl_text",
+        "qwen3_vl_moe_text",
+    ],
+)
+def test_liger_rope_auto_defaults_to_true_for_qwen_vl(model_type):
+    from axolotl.integrations.liger.plugin import LigerPlugin
+    from axolotl.utils.dict import DictDefault
+
+    cfg = DictDefault(
+        {
+            "model_config_type": model_type,
+            "liger_rope": None,
+            "liger_cross_entropy": False,
+            "liger_fused_linear_cross_entropy": True,
+            "liger_rms_norm": True,
+            "liger_layer_norm": False,
+            "liger_glu_activation": False,
+            "liger_use_token_scaling": False,
+            "torch_compile": False,
+            "base_model": "fake/path",
+            "trust_remote_code": False,
+        }
+    )
+
+    captured = {}
+
+    def _record(
+        rope: bool = True,
+        cross_entropy: bool = False,
+        fused_linear_cross_entropy: bool = True,
+        rms_norm: bool = True,
+        swiglu: bool = True,
+        layer_norm: bool = True,
+        model=None,
+    ):
+        captured.update(
+            rope=rope,
+            cross_entropy=cross_entropy,
+            fused_linear_cross_entropy=fused_linear_cross_entropy,
+            rms_norm=rms_norm,
+            swiglu=swiglu,
+            layer_norm=layer_norm,
+        )
+
+    from liger_kernel.transformers import monkey_patch as liger_mp
+
+    with patch.dict(
+        liger_mp.MODEL_TYPE_TO_APPLY_LIGER_FN,
+        {model_type: _record},
+        clear=False,
+    ):
+        LigerPlugin().pre_model_load(cfg)
+
+    assert captured.get("rope") is True, (
+        f"Expected rope=True default for {model_type}, got {captured.get('rope')}"
+    )
+
+
+def test_liger_rope_explicit_false_is_respected_for_qwen_vl():
+    from axolotl.integrations.liger.plugin import LigerPlugin
+    from axolotl.utils.dict import DictDefault
+
+    cfg = DictDefault(
+        {
+            "model_config_type": "qwen2_5_vl",
+            "liger_rope": False,
+            "liger_cross_entropy": False,
+            "liger_fused_linear_cross_entropy": True,
+            "liger_rms_norm": True,
+            "liger_layer_norm": False,
+            "liger_glu_activation": False,
+            "liger_use_token_scaling": False,
+            "torch_compile": False,
+            "base_model": "fake/path",
+            "trust_remote_code": False,
+        }
+    )
+
+    captured = {}
+
+    def _record(
+        rope: bool = True,
+        cross_entropy: bool = False,
+        fused_linear_cross_entropy: bool = True,
+        rms_norm: bool = True,
+        swiglu: bool = True,
+        layer_norm: bool = True,
+        model=None,
+    ):
+        captured.update(
+            rope=rope,
+            cross_entropy=cross_entropy,
+            fused_linear_cross_entropy=fused_linear_cross_entropy,
+            rms_norm=rms_norm,
+            swiglu=swiglu,
+            layer_norm=layer_norm,
+        )
+
+    from liger_kernel.transformers import monkey_patch as liger_mp
+
+    with patch.dict(
+        liger_mp.MODEL_TYPE_TO_APPLY_LIGER_FN,
+        {"qwen2_5_vl": _record},
+        clear=False,
+    ):
+        LigerPlugin().pre_model_load(cfg)
+
+    assert captured.get("rope") is False
diff --git a/tests/integrations/test_routing_parity.py b/tests/integrations/test_routing_parity.py
deleted file mode 100644
index 8852068096..0000000000
--- a/tests/integrations/test_routing_parity.py
+++ /dev/null
@@ -1,492 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# Copyright (c) Axolotl AI
-# Licensed under the Apache License, Version 2.0
-
-"""
-Parity tests between scattermoe-lora and sonicmoe routing implementations.
-
-These tests verify that both implementations produce numerically identical
-results for the same inputs, ensuring safe centralization of the routing code.
-
-ScatterMoE returns 2D tensors [T, K]; SonicMoE returns flattened 1D [T*K].
-The core algorithm should be identical — only the output format differs.
-"""
-
-from types import SimpleNamespace
-
-import pytest
-import torch
-
-
-def _require_triton():
-    pytest.importorskip("triton")
-
-
-# ============================================================================
-# Fixtures / helpers
-# ============================================================================
-
-
-def _make_softmax_block(T=8, H=16, E=4, K=2):
-    """Qwen/OLMoE-style block usable by both implementations."""
-    gate = SimpleNamespace(
-        weight=torch.randn(E, H),
-        top_k=K,
-        num_experts=E,
-        norm_topk_prob=True,
-    )
-    moe_block = SimpleNamespace(gate=gate)
-    hidden = torch.randn(T, H)
-    return moe_block, gate, hidden, T, H, E, K
-
-
-def _make_sigmoid_block(
-    T=8, H=16, E=16, K=4, n_group=2, topk_group=1, bias_on_gate=True
-):
-    """GLM/DeepSeek-style block usable by both implementations."""
-    if bias_on_gate:
-        gate = SimpleNamespace(
-            weight=torch.randn(E, H),
-            e_score_correction_bias=torch.zeros(E),
-        )
-        moe_block = SimpleNamespace(
-            gate=gate,
-            top_k=K,
-            n_routed_experts=E,
-            n_group=n_group,
-            topk_group=topk_group,
-            norm_topk_prob=True,
-            routed_scaling_factor=1.0,
-        )
-    else:
-        # minimax_m2 style: bias on block
-        gate = SimpleNamespace(
-            weight=torch.randn(E, H),
-            top_k=K,
-        )
-        moe_block = SimpleNamespace(
-            gate=gate,
-            top_k=K,
-            e_score_correction_bias=torch.zeros(E),
-        )
-    return moe_block, gate, hidden_states(T, H), T, H, E, K
-
-
-def hidden_states(T, H):
-    return torch.randn(T, H)
-
-
-# ============================================================================
-# 1. Softmax routing parity
-# ============================================================================
-
-
-class TestSoftmaxRoutingParity:
-    """Verify scattermoe and sonicmoe softmax routing produce identical results."""
-
-    @pytest.fixture(autouse=True)
-    def _require(self):
-        _require_triton()
-
-    def test_weights_match(self):
-        """2D weights from scattermoe == reshaped 1D weights from sonicmoe."""
-        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
-            _softmax_topk_route,
-        )
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_topk_routing,
-        )
-
-        moe_block, gate, hidden, T, H, E, K = _make_softmax_block()
-
-        # ScatterMoE path (no LoRA delta)
-        sm_weights, sm_experts, sm_topk, sm_E = _softmax_topk_route(
-            moe_block, gate, hidden, gate.weight, None
-        )
-
-        # SonicMoE path
-        sonic_scores, sonic_tok_idx, sonic_exp_idx, sonic_logits = softmax_topk_routing(
-            hidden, moe_block
-        )
-
-        # ScatterMoE returns [T, K], SonicMoE returns [T*K] flattened
-        sonic_weights_2d = sonic_scores.reshape(T, K)
-        sonic_experts_2d = sonic_exp_idx.reshape(T, K)
-
-        assert sm_topk == K
-        assert sm_E == E
-
-        # Both should select the same experts and produce the same weights
-        assert torch.equal(sm_experts, sonic_experts_2d.to(sm_experts.dtype))
-        assert torch.allclose(sm_weights, sonic_weights_2d, atol=1e-6)
-
-    def test_logits_not_returned_by_scattermoe(self):
-        """ScatterMoE doesn't return logits; SonicMoE does — verify SonicMoE logits shape."""
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_topk_routing,
-        )
-
-        moe_block, gate, hidden, T, H, E, K = _make_softmax_block()
-        _, _, _, logits = softmax_topk_routing(hidden, moe_block)
-        assert logits.shape == (T, E)
-
-    def test_no_renorm(self):
-        """With norm_topk_prob=False, both should skip renormalization."""
-        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
-            _softmax_topk_route,
-        )
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_topk_routing,
-        )
-
-        moe_block, gate, hidden, T, H, E, K = _make_softmax_block()
-        gate.norm_topk_prob = False
-
-        sm_weights, sm_experts, _, _ = _softmax_topk_route(
-            moe_block, gate, hidden, gate.weight, None
-        )
-        sonic_scores, _, sonic_exp_idx, _ = softmax_topk_routing(hidden, moe_block)
-
-        sonic_weights_2d = sonic_scores.reshape(T, K)
-        sonic_experts_2d = sonic_exp_idx.reshape(T, K)
-
-        assert torch.equal(sm_experts, sonic_experts_2d.to(sm_experts.dtype))
-        assert torch.allclose(sm_weights, sonic_weights_2d, atol=1e-6)
-
-    def test_various_expert_counts(self):
-        """Parity across different E and K values."""
-        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
-            _softmax_topk_route,
-        )
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_topk_routing,
-        )
-
-        for E, K in [(2, 1), (8, 2), (16, 4), (32, 8)]:
-            moe_block, gate, hidden, T, H, _, _ = _make_softmax_block(E=E, K=K)
-
-            sm_weights, sm_experts, _, _ = _softmax_topk_route(
-                moe_block, gate, hidden, gate.weight, None
-            )
-            sonic_scores, _, sonic_exp_idx, _ = softmax_topk_routing(hidden, moe_block)
-
-            sonic_weights_2d = sonic_scores.reshape(T, K)
-            sonic_experts_2d = sonic_exp_idx.reshape(T, K)
-
-            assert torch.equal(sm_experts, sonic_experts_2d.to(sm_experts.dtype)), (
-                f"Expert mismatch for E={E}, K={K}"
-            )
-            assert torch.allclose(sm_weights, sonic_weights_2d, atol=1e-6), (
-                f"Weight mismatch for E={E}, K={K}"
-            )
-
-
-# ============================================================================
-# 2. Sigmoid routing parity
-# ============================================================================
-
-
-class TestSigmoidRoutingParity:
-    """Verify scattermoe and sonicmoe sigmoid routing produce identical results."""
-
-    @pytest.fixture(autouse=True)
-    def _require(self):
-        _require_triton()
-
-    def test_weights_match_with_groups(self):
-        """Both implementations should produce identical weights with group selection."""
-        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
-            _sigmoid_topk_route,
-        )
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            sigmoid_topk_routing,
-        )
-
-        moe_block, gate, hidden, T, H, E, K = _make_sigmoid_block(
-            E=16, K=4, n_group=2, topk_group=1, bias_on_gate=True
-        )
-
-        sm_weights, sm_experts, sm_topk, sm_E = _sigmoid_topk_route(
-            moe_block, gate, hidden, gate.weight, None
-        )
-
-        sonic_scores, sonic_tok_idx, sonic_exp_idx, sonic_logits = sigmoid_topk_routing(
-            hidden, moe_block
-        )
-
-        sonic_weights_2d = sonic_scores.reshape(T, K)
-        sonic_experts_2d = sonic_exp_idx.reshape(T, K)
-
-        assert sm_topk == K
-        assert sm_E == E
-
-        # Sort experts within each token to handle different topk orderings
-        sm_sorted, sm_order = sm_experts.sort(dim=-1)
-        sonic_sorted, sonic_order = sonic_experts_2d.to(sm_experts.dtype).sort(dim=-1)
-
-        assert torch.equal(sm_sorted, sonic_sorted)
-
-        # Gather weights in sorted order for comparison
-        sm_weights_sorted = sm_weights.gather(1, sm_order)
-        sonic_weights_sorted = sonic_weights_2d.gather(1, sonic_order)
-        assert torch.allclose(sm_weights_sorted, sonic_weights_sorted, atol=1e-6)
-
-    def test_weights_match_no_groups(self):
-        """Both implementations match without group selection (n_group=1)."""
-        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
-            _sigmoid_topk_route,
-        )
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            sigmoid_topk_routing,
-        )
-
-        moe_block, gate, hidden, T, H, E, K = _make_sigmoid_block(
-            E=16, K=4, n_group=1, topk_group=1, bias_on_gate=True
-        )
-
-        sm_weights, sm_experts, _, _ = _sigmoid_topk_route(
-            moe_block, gate, hidden, gate.weight, None
-        )
-        sonic_scores, _, sonic_exp_idx, _ = sigmoid_topk_routing(hidden, moe_block)
-
-        sonic_weights_2d = sonic_scores.reshape(T, K)
-        sonic_experts_2d = sonic_exp_idx.reshape(T, K)
-
-        # Sort for comparison (topk with sorted=False may differ in order)
-        sm_sorted, sm_order = sm_experts.sort(dim=-1)
-        sonic_sorted, sonic_order = sonic_experts_2d.to(sm_experts.dtype).sort(dim=-1)
-
-        assert torch.equal(sm_sorted, sonic_sorted)
-        sm_weights_sorted = sm_weights.gather(1, sm_order)
-        sonic_weights_sorted = sonic_weights_2d.gather(1, sonic_order)
-        assert torch.allclose(sm_weights_sorted, sonic_weights_sorted, atol=1e-6)
-
-    def test_bias_on_block_parity(self):
-        """minimax_m2 style: bias on block, not gate."""
-        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
-            _sigmoid_topk_route,
-        )
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            sigmoid_topk_routing,
-        )
-
-        moe_block, gate, hidden, T, H, E, K = _make_sigmoid_block(
-            E=16, K=4, n_group=1, bias_on_gate=False
-        )
-
-        sm_weights, sm_experts, _, _ = _sigmoid_topk_route(
-            moe_block, gate, hidden, gate.weight, None
-        )
-        sonic_scores, _, sonic_exp_idx, _ = sigmoid_topk_routing(hidden, moe_block)
-
-        sonic_weights_2d = sonic_scores.reshape(T, K)
-        sonic_experts_2d = sonic_exp_idx.reshape(T, K)
-
-        sm_sorted, sm_order = sm_experts.sort(dim=-1)
-        sonic_sorted, sonic_order = sonic_experts_2d.to(sm_experts.dtype).sort(dim=-1)
-
-        assert torch.equal(sm_sorted, sonic_sorted)
-        sm_weights_sorted = sm_weights.gather(1, sm_order)
-        sonic_weights_sorted = sonic_weights_2d.gather(1, sonic_order)
-        assert torch.allclose(sm_weights_sorted, sonic_weights_sorted, atol=1e-6)
-
-    def test_scaling_factor_parity(self):
-        """routed_scaling_factor applied identically by both."""
-        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
-            _sigmoid_topk_route,
-        )
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            sigmoid_topk_routing,
-        )
-
-        moe_block, gate, hidden, T, H, E, K = _make_sigmoid_block(
-            n_group=1, bias_on_gate=True
-        )
-        moe_block.routed_scaling_factor = 2.5
-
-        sm_weights, sm_experts, _, _ = _sigmoid_topk_route(
-            moe_block, gate, hidden, gate.weight, None
-        )
-        sonic_scores, _, sonic_exp_idx, _ = sigmoid_topk_routing(hidden, moe_block)
-
-        sonic_weights_2d = sonic_scores.reshape(T, K)
-        sonic_experts_2d = sonic_exp_idx.reshape(T, K)
-
-        sm_sorted, sm_order = sm_experts.sort(dim=-1)
-        sonic_sorted, sonic_order = sonic_experts_2d.to(sm_experts.dtype).sort(dim=-1)
-
-        assert torch.equal(sm_sorted, sonic_sorted)
-        sm_weights_sorted = sm_weights.gather(1, sm_order)
-        sonic_weights_sorted = sonic_weights_2d.gather(1, sonic_order)
-        assert torch.allclose(sm_weights_sorted, sonic_weights_sorted, atol=1e-6)
-
-    def test_no_renorm_parity(self):
-        """norm_topk_prob=False produces same results in both."""
-        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
-            _sigmoid_topk_route,
-        )
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            sigmoid_topk_routing,
-        )
-
-        moe_block, gate, hidden, T, H, E, K = _make_sigmoid_block(
-            n_group=1, bias_on_gate=True
-        )
-        moe_block.norm_topk_prob = False
-
-        sm_weights, sm_experts, _, _ = _sigmoid_topk_route(
-            moe_block, gate, hidden, gate.weight, None
-        )
-        sonic_scores, _, sonic_exp_idx, _ = sigmoid_topk_routing(hidden, moe_block)
-
-        sonic_weights_2d = sonic_scores.reshape(T, K)
-        sonic_experts_2d = sonic_exp_idx.reshape(T, K)
-
-        sm_sorted, sm_order = sm_experts.sort(dim=-1)
-        sonic_sorted, sonic_order = sonic_experts_2d.to(sm_experts.dtype).sort(dim=-1)
-
-        assert torch.equal(sm_sorted, sonic_sorted)
-        sm_weights_sorted = sm_weights.gather(1, sm_order)
-        sonic_weights_sorted = sonic_weights_2d.gather(1, sonic_order)
-        assert torch.allclose(sm_weights_sorted, sonic_weights_sorted, atol=1e-6)
-
-
-# ============================================================================
-# 3. Shared expert parity
-# ============================================================================
-
-
-class TestSharedExpertParity:
-    """Verify both _compute_shared_expert implementations behave identically."""
-
-    @pytest.fixture(autouse=True)
-    def _require(self):
-        _require_triton()
-
-    def _get_both_fns(self):
-        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
-            _compute_shared_expert as scatter_compute,
-        )
-        from axolotl.integrations.kernels.libs.sonicmoe.patch import (
-            _compute_shared_expert as sonic_compute,
-        )
-
-        return scatter_compute, sonic_compute
-
-    def test_shared_expert_singular(self):
-        scatter_fn, sonic_fn = self._get_both_fns()
-        out = torch.randn(4, 8)
-        block = SimpleNamespace(shared_expert=lambda x: out)
-        hidden = torch.randn(4, 8)
-
-        assert torch.equal(scatter_fn(block, hidden), sonic_fn(block, hidden))
-
-    def test_shared_experts_plural(self):
-        scatter_fn, sonic_fn = self._get_both_fns()
-        out = torch.randn(4, 8)
-        block = SimpleNamespace(shared_experts=lambda x: out)
-        hidden = torch.randn(4, 8)
-
-        assert torch.equal(scatter_fn(block, hidden), sonic_fn(block, hidden))
-
-    def test_shared_mlp(self):
-        scatter_fn, sonic_fn = self._get_both_fns()
-        out = torch.randn(4, 8)
-        block = SimpleNamespace(shared_mlp=lambda x: out)
-        hidden = torch.randn(4, 8)
-
-        assert torch.equal(scatter_fn(block, hidden), sonic_fn(block, hidden))
-
-    def test_no_shared_expert(self):
-        scatter_fn, sonic_fn = self._get_both_fns()
-        block = SimpleNamespace()
-        hidden = torch.randn(4, 8)
-
-        assert scatter_fn(block, hidden) is None
-        assert sonic_fn(block, hidden) is None
-
-    def test_shared_expert_gate_only_in_scattermoe(self):
-        """ScatterMoE's _compute_shared_expert handles shared_expert_gate;
-        SonicMoE's patch.py handles it externally in the forward function.
-
-        This documents the known divergence: the scattermoe version applies
-        sigmoid gating inline, while sonicmoe applies it in the forward.
-        """
-        scatter_fn, sonic_fn = self._get_both_fns()
-
-        H = 8
-        expert_out = torch.ones(4, H)
-        gate_fn = lambda x: torch.zeros(4, H)  # noqa: E731  # sigmoid(0) = 0.5
-
-        block = SimpleNamespace(
-            shared_expert=lambda x: expert_out,
-            shared_expert_gate=gate_fn,
-        )
-        hidden = torch.randn(4, H)
-
-        scatter_result = scatter_fn(block, hidden)
-        sonic_result = sonic_fn(block, hidden)
-
-        # ScatterMoE applies the gate: expert_out * sigmoid(0) = 0.5
-        expected_gated = expert_out * 0.5
-        assert torch.allclose(scatter_result, expected_gated, atol=1e-6)
-
-        # SonicMoE does NOT apply the gate here (it does it in the forward)
-        assert torch.equal(sonic_result, expert_out)
-
-
-# ============================================================================
-# 4. Route dispatcher parity
-# ============================================================================
-
-
-class TestRouteDispatcherParity:
-    """Verify _route in scattermoe dispatches correctly and matches individual fns."""
-
-    @pytest.fixture(autouse=True)
-    def _require(self):
-        _require_triton()
-
-    def test_route_dispatches_softmax(self):
-        """_route should use softmax when no e_score_correction_bias."""
-        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
-            _route,
-            _softmax_topk_route,
-        )
-
-        moe_block, gate, hidden, T, H, E, K = _make_softmax_block()
-
-        route_w, route_e, route_k, route_E = _route(
-            moe_block, gate, hidden, gate.weight, None
-        )
-        direct_w, direct_e, direct_k, direct_E = _softmax_topk_route(
-            moe_block, gate, hidden, gate.weight, None
-        )
-
-        assert torch.equal(route_w, direct_w)
-        assert torch.equal(route_e, direct_e)
-        assert route_k == direct_k
-        assert route_E == direct_E
-
-    def test_route_dispatches_sigmoid(self):
-        """_route should use sigmoid when e_score_correction_bias is present."""
-        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
-            _route,
-            _sigmoid_topk_route,
-        )
-
-        moe_block, gate, hidden, T, H, E, K = _make_sigmoid_block(
-            n_group=1, bias_on_gate=True
-        )
-
-        route_w, route_e, route_k, route_E = _route(
-            moe_block, gate, hidden, gate.weight, None
-        )
-        direct_w, direct_e, direct_k, direct_E = _sigmoid_topk_route(
-            moe_block, gate, hidden, gate.weight, None
-        )
-
-        assert torch.equal(route_w, direct_w)
-        assert torch.equal(route_e, direct_e)
-        assert route_k == direct_k
-        assert route_E == direct_E
diff --git a/tests/integrations/test_scattermoe_autotune_telemetry.py b/tests/integrations/test_scattermoe_autotune_telemetry.py
index 7050c0f4f5..43010f4218 100644
--- a/tests/integrations/test_scattermoe_autotune_telemetry.py
+++ b/tests/integrations/test_scattermoe_autotune_telemetry.py
@@ -104,7 +104,7 @@ def test_populated_cache_returns_configs(self):
         assert len(result) == 1
         entry = result[0]
         assert entry["kernel"] == "scatter2scatter_lora_fwd"
-        assert entry["key"] == {"M": 2048, "N": 4096, "K": 1024}
+        assert entry["key"] == {"M_BUCKET": 2048, "N": 4096, "K": 1024}
         assert entry["config"]["BLOCK_N"] == 128
         assert entry["config"]["BLOCK_K"] == 64
         assert entry["config"]["num_warps"] == 8
@@ -148,7 +148,7 @@ def test_extra_key_elements_stored(self):
 
         assert len(result) == 1
         key = result[0]["key"]
-        assert key["M"] == 512
+        assert key["M_BUCKET"] == 512
         assert key["N"] == 1024
         assert key["K"] == 256
         assert key["_extra"] == ["float16", "float16"]
diff --git a/tests/integrations/test_sonicmoe.py b/tests/integrations/test_sonicmoe.py
index 864abca36d..f7261a85d9 100644
--- a/tests/integrations/test_sonicmoe.py
+++ b/tests/integrations/test_sonicmoe.py
@@ -1,4 +1,4 @@
-"""Unit tests for the SonicMoE integration."""
+"""Unit tests for the SonicMoE ExpertsInterface registration."""
 
 from types import SimpleNamespace
 
@@ -6,15 +6,6 @@
 import torch
 
 from axolotl.integrations.kernels.args import KernelsArgs
-from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-    sigmoid_topk_routing,
-    softmax_topk_routing,
-)
-from axolotl.integrations.kernels.libs.sonicmoe.weight_converter import (
-    ConcatenatedToInterleaved,
-    InterleavedToConcatenated,
-    register_sonicmoe_weight_converter,
-)
 
 
 class TestKernelsArgs:
@@ -43,777 +34,202 @@ def test_disables_mlp_kernel_when_sonicmoe(self):
         assert result["lora_mlp_kernel"] is False
         assert result["mlp_kernel"] is False
 
+    def test_experts_implementation_auto_sonicmoe(self):
+        out = KernelsArgs.check_experts_implementation({"use_sonicmoe": True})
+        assert out["experts_implementation"] == "sonicmoe"
 
-class TestConcatenatedToInterleaved:
-    @pytest.fixture
-    def sample_tensor(self):
-        """Create a test tensor [E=2, 2*I=4, H=3] with distinct gate/up values."""
-        E, I, H = 2, 2, 3  # noqa: E741
-        gate = torch.arange(1, E * I * H + 1, dtype=torch.float32).reshape(E, I, H)
-        up = torch.arange(100, 100 + E * I * H, dtype=torch.float32).reshape(E, I, H)
-        return torch.cat([gate, up], dim=1)
-
-    def test_interleave_rows_alternate(self, sample_tensor):
-        op = ConcatenatedToInterleaved(dim=1)
-        result = op.convert(
-            {"test": sample_tensor},
-            source_patterns=["test"],
-            target_patterns=["test"],
-        )
-        interleaved = result["test"]
-
-        # For expert 0: even rows should be gate, odd rows should be up
-        E, two_I, H = sample_tensor.shape
-        I = two_I // 2  # noqa: E741
-        gate_orig = sample_tensor[:, :I, :]
-        up_orig = sample_tensor[:, I:, :]
-
-        assert torch.equal(interleaved[:, 0::2, :], gate_orig)
-        assert torch.equal(interleaved[:, 1::2, :], up_orig)
-
-    def test_interleave_handles_list_input(self, sample_tensor):
-        op = ConcatenatedToInterleaved(dim=1)
-        result = op.convert(
-            {"test": [sample_tensor]},
-            source_patterns=["test"],
-            target_patterns=["test"],
-        )
-        assert result["test"].shape == sample_tensor.shape
-
-    def test_reverse_op_type(self):
-        op = ConcatenatedToInterleaved(dim=1)
-        assert isinstance(op.reverse_op, InterleavedToConcatenated)
-        assert op.reverse_op.dim == 1
-
-
-class TestInterleavedToConcatenated:
-    @pytest.fixture
-    def interleaved_tensor(self):
-        """Create an interleaved tensor [E=2, 2*I=4, H=3]."""
-        E, I, H = 2, 2, 3  # noqa: E741
-        gate = torch.arange(1, E * I * H + 1, dtype=torch.float32).reshape(E, I, H)
-        up = torch.arange(100, 100 + E * I * H, dtype=torch.float32).reshape(E, I, H)
-        interleaved = torch.empty(E, 2 * I, H)
-        interleaved[:, 0::2, :] = gate
-        interleaved[:, 1::2, :] = up
-        return interleaved
-
-    def test_deinterleave_gate_up_separated(self, interleaved_tensor):
-        op = InterleavedToConcatenated(dim=1)
-        result = op.convert(
-            {"test": interleaved_tensor},
-            source_patterns=["test"],
-            target_patterns=["test"],
-        )
-        concatenated = result["test"]
-
-        E, two_I, H = concatenated.shape
-        I = two_I // 2  # noqa: E741
-
-        # First half should be gate (even rows from interleaved)
-        assert torch.equal(concatenated[:, :I, :], interleaved_tensor[:, 0::2, :])
-        # Second half should be up (odd rows from interleaved)
-        assert torch.equal(concatenated[:, I:, :], interleaved_tensor[:, 1::2, :])
-
-    def test_reverse_op_type(self):
-        op = InterleavedToConcatenated(dim=1)
-        assert isinstance(op.reverse_op, ConcatenatedToInterleaved)
-        assert op.reverse_op.dim == 1
-
-
-class TestRoundTrip:
-    @pytest.fixture
-    def concat_tensor(self):
-        E, I, H = 4, 8, 16  # noqa: E741
-        gate = torch.randn(E, I, H)
-        up = torch.randn(E, I, H)
-        return torch.cat([gate, up], dim=1)
-
-    def test_interleave_then_deinterleave_is_identity(self, concat_tensor):
-        fwd = ConcatenatedToInterleaved(dim=1)
-        rev = InterleavedToConcatenated(dim=1)
-
-        interleaved = fwd.convert(
-            {"k": concat_tensor}, source_patterns=["k"], target_patterns=["k"]
-        )["k"]
-        recovered = rev.convert(
-            {"k": interleaved}, source_patterns=["k"], target_patterns=["k"]
-        )["k"]
-
-        assert torch.equal(concat_tensor, recovered)
-
-    def test_reverse_op_chain_is_identity(self, concat_tensor):
-        """Verify that op.reverse_op produces an exact inverse."""
-        op = ConcatenatedToInterleaved(dim=1)
-        rev = op.reverse_op
-
-        interleaved = op.convert(
-            {"k": concat_tensor}, source_patterns=["k"], target_patterns=["k"]
-        )["k"]
-        recovered = rev.convert(
-            {"k": interleaved}, source_patterns=["k"], target_patterns=["k"]
-        )["k"]
-
-        assert torch.equal(concat_tensor, recovered)
-
-    def test_various_shapes(self):
-        """Test with different expert counts and dimensions."""
-        fwd = ConcatenatedToInterleaved(dim=1)
-        rev = InterleavedToConcatenated(dim=1)
-
-        for E, I, H in [(1, 4, 8), (8, 16, 32), (16, 128, 256)]:  # noqa: E741
-            concat = torch.randn(E, 2 * I, H)
-            interleaved = fwd.convert(
-                {"k": concat}, source_patterns=["k"], target_patterns=["k"]
-            )["k"]
-            recovered = rev.convert(
-                {"k": interleaved}, source_patterns=["k"], target_patterns=["k"]
-            )["k"]
-            assert torch.equal(concat, recovered), (
-                f"Failed for shape ({E}, {2 * I}, {H})"
-            )
-
-
-class TestWeightConverterRegistration:
-    def test_register_appends_interleave_op(self):
-        from transformers.conversion_mapping import get_checkpoint_conversion_mapping
-
-        register_sonicmoe_weight_converter("qwen3_moe")
-
-        modified = get_checkpoint_conversion_mapping("qwen3_moe")
-        # Find the gate_up_proj converter
-        gate_up_converter = None
-        for conv in modified:
-            if hasattr(conv, "operations") and any(
-                "gate_up_proj" in pat for pat in conv.target_patterns
-            ):
-                gate_up_converter = conv
-                break
-
-        assert gate_up_converter is not None
-        assert isinstance(gate_up_converter.operations[-1], ConcatenatedToInterleaved)
-
-    def test_double_registration_is_idempotent(self):
-        from transformers.conversion_mapping import get_checkpoint_conversion_mapping
-
-        register_sonicmoe_weight_converter("qwen3_moe")
-        register_sonicmoe_weight_converter("qwen3_moe")
-
-        modified = get_checkpoint_conversion_mapping("qwen3_moe")
-        for conv in modified:
-            if hasattr(conv, "operations") and any(
-                "gate_up_proj" in pat for pat in conv.target_patterns
-            ):
-                interleave_count = sum(
-                    isinstance(op, ConcatenatedToInterleaved) for op in conv.operations
-                )
-                assert interleave_count == 1, (
-                    f"Expected 1 ConcatenatedToInterleaved op, got {interleave_count}"
-                )
-                break
-
-    def test_register_adds_same_key_converter(self):
-        from transformers.conversion_mapping import get_checkpoint_conversion_mapping
-
-        register_sonicmoe_weight_converter("qwen3_moe")
-
-        modified = get_checkpoint_conversion_mapping("qwen3_moe")
-        # Should have a same-key converter for already-fused checkpoints
-        same_key = [
-            c
-            for c in modified
-            if hasattr(c, "source_patterns")
-            and c.source_patterns == ["mlp.experts.gate_up_proj"]
-            and c.target_patterns == ["mlp.experts.gate_up_proj"]
-        ]
-        assert len(same_key) == 1
-        assert isinstance(same_key[0].operations[0], ConcatenatedToInterleaved)
-
-    def test_register_creates_mapping_when_none(self):
-        from transformers.conversion_mapping import get_checkpoint_conversion_mapping
-
-        # qwen3_5_moe has no conversion mapping in transformers
-        register_sonicmoe_weight_converter("qwen3_5_moe")
-
-        mapping = get_checkpoint_conversion_mapping("qwen3_5_moe")
-        assert mapping is not None
-        same_key = [
-            c
-            for c in mapping
-            if hasattr(c, "source_patterns")
-            and c.source_patterns == ["mlp.experts.gate_up_proj"]
-            and c.target_patterns == ["mlp.experts.gate_up_proj"]
-        ]
-        assert len(same_key) == 1
-        assert isinstance(same_key[0].operations[0], ConcatenatedToInterleaved)
-
-
-def _make_qwen_moe_block(T=8, H=16, E=4, K=2):
-    """Create a mock qwen-style MoE block for routing tests."""
-    gate = SimpleNamespace(
-        weight=torch.randn(E, H),
-        top_k=K,
-        num_experts=E,
-        norm_topk_prob=True,
-    )
-    return SimpleNamespace(gate=gate), T, H, E, K
-
-
-def _make_glm_moe_block(T=8, H=16, E=16, K=4, n_group=2, topk_group=1):
-    """Create a mock GLM5-style MoE block for routing tests."""
-    gate = SimpleNamespace(
-        weight=torch.randn(E, H),
-        e_score_correction_bias=torch.zeros(E),
-    )
-    moe_block = SimpleNamespace(
-        gate=gate,
-        top_k=K,
-        n_routed_experts=E,
-        n_group=n_group,
-        topk_group=topk_group,
-        norm_topk_prob=True,
-        routed_scaling_factor=1.0,
-    )
-    return moe_block, T, H, E, K
-
-
-def _make_minimax_m2_moe_block(T=8, H=16, E=16, K=4):
-    """Create a mock minimax_m2-style MoE block for routing tests.
-
-    minimax_m2 uses sigmoid->topk WITHOUT group selection:
-    - e_score_correction_bias is on the moe_block (not on gate)
-    - No n_group / topk_group attributes
-    - Always normalizes (norm_topk_prob defaults to True)
-    - No routed_scaling_factor (defaults to 1.0)
-    """
-    gate = SimpleNamespace(
-        weight=torch.randn(E, H),
-        top_k=K,
-    )
-    moe_block = SimpleNamespace(
-        gate=gate,
-        top_k=K,
-        e_score_correction_bias=torch.zeros(E),
-    )
-    return moe_block, T, H, E, K
-
-
-class TestSoftmaxTopkRouting:
-    def test_output_shapes(self):
-        moe_block, T, H, E, K = _make_qwen_moe_block()
-        hidden = torch.randn(T, H)
-
-        scores, token_idx, expert_idx, logits = softmax_topk_routing(hidden, moe_block)
-
-        assert scores.shape == (T * K,)
-        assert token_idx.shape == (T * K,)
-        assert expert_idx.shape == (T * K,)
-        assert logits.shape == (T, E)
-
-    def test_scores_are_float32(self):
-        moe_block, T, H, E, K = _make_qwen_moe_block()
-        hidden = torch.randn(T, H)
-
-        scores, _, _, _ = softmax_topk_routing(hidden, moe_block)
-        assert scores.dtype == torch.float32
-
-    def test_token_indices_sorted_ascending(self):
-        moe_block, T, H, E, K = _make_qwen_moe_block()
-        hidden = torch.randn(T, H)
-
-        _, token_idx, _, _ = softmax_topk_routing(hidden, moe_block)
-
-        # Token indices must be sorted ascending (SonicMoE requirement)
-        diffs = token_idx[1:] - token_idx[:-1]
-        assert (diffs >= 0).all()
-
-    def test_expert_indices_in_range(self):
-        moe_block, T, H, E, K = _make_qwen_moe_block()
-        hidden = torch.randn(T, H)
-
-        _, _, expert_idx, _ = softmax_topk_routing(hidden, moe_block)
-
-        assert (expert_idx >= 0).all()
-        assert (expert_idx < E).all()
-
-    def test_renormalized_scores_sum_to_one(self):
-        moe_block, T, H, E, K = _make_qwen_moe_block()
-        hidden = torch.randn(T, H)
-
-        scores, _, _, _ = softmax_topk_routing(hidden, moe_block)
-        per_token_sums = scores.reshape(T, K).sum(dim=-1)
-        assert torch.allclose(per_token_sums, torch.ones(T), atol=1e-5)
-
-
-class TestSigmoidTopkRouting:
-    def test_output_shapes(self):
-        moe_block, T, H, E, K = _make_glm_moe_block()
-        hidden = torch.randn(T, H)
-
-        scores, token_idx, expert_idx, logits = sigmoid_topk_routing(hidden, moe_block)
-
-        assert scores.shape == (T * K,)
-        assert token_idx.shape == (T * K,)
-        assert expert_idx.shape == (T * K,)
-        assert logits.shape == (T, E)
-
-    def test_scores_are_float32(self):
-        moe_block, T, H, E, K = _make_glm_moe_block()
-        hidden = torch.randn(T, H)
-
-        scores, _, _, _ = sigmoid_topk_routing(hidden, moe_block)
-        assert scores.dtype == torch.float32
-
-    def test_token_indices_sorted_ascending(self):
-        moe_block, T, H, E, K = _make_glm_moe_block()
-        hidden = torch.randn(T, H)
-
-        _, token_idx, _, _ = sigmoid_topk_routing(hidden, moe_block)
-
-        diffs = token_idx[1:] - token_idx[:-1]
-        assert (diffs >= 0).all()
+    def test_experts_implementation_auto_scattermoe(self):
+        out = KernelsArgs.check_experts_implementation({"use_scattermoe": True})
+        assert out["experts_implementation"] == "scattermoe"
 
-    def test_expert_indices_in_range(self):
-        moe_block, T, H, E, K = _make_glm_moe_block()
-        hidden = torch.randn(T, H)
+    def test_experts_implementation_default_eager(self):
+        out = KernelsArgs.check_experts_implementation({})
+        assert out["experts_implementation"] == "eager"
 
-        _, _, expert_idx, _ = sigmoid_topk_routing(hidden, moe_block)
-
-        assert (expert_idx >= 0).all()
-        assert (expert_idx < E).all()
-
-    def test_scores_are_nonnegative(self):
-        """Sigmoid outputs are in [0, 1], so scores should be non-negative."""
-        moe_block, T, H, E, K = _make_glm_moe_block()
-        hidden = torch.randn(T, H)
-
-        scores, _, _, _ = sigmoid_topk_routing(hidden, moe_block)
-        assert (scores >= 0).all()
-
-    def test_scaling_factor_applied(self):
-        moe_block, T, H, E, K = _make_glm_moe_block()
-        hidden = torch.randn(T, H)
-
-        # Get scores with scaling_factor=1.0
-        scores_1x, _, _, _ = sigmoid_topk_routing(hidden, moe_block)
-
-        # Get scores with scaling_factor=2.0
-        moe_block.routed_scaling_factor = 2.0
-        scores_2x, _, _, _ = sigmoid_topk_routing(hidden, moe_block)
-
-        assert torch.allclose(scores_2x, scores_1x * 2.0, atol=1e-5)
-
-    def test_group_selection_restricts_experts(self):
-        """With n_group=4 and topk_group=1, only 1/4 of experts should be selectable."""
-        moe_block, T, H, E, K = _make_glm_moe_block(E=16, K=2, n_group=4, topk_group=1)
-        hidden = torch.randn(T, H)
-
-        _, _, expert_idx, _ = sigmoid_topk_routing(hidden, moe_block)
-
-        # Each token's experts should all fall within a single group (size E//n_group=4)
-        expert_idx_2d = expert_idx.reshape(T, K)
-        for t in range(T):
-            experts = expert_idx_2d[t]
-            groups = experts // (E // moe_block.n_group)
-            # All selected experts should be from the same group
-            assert (groups == groups[0]).all()
-
-
-class TestMiniMaxM2SigmoidRouting:
-    """Tests for minimax_m2 routing: sigmoid->topk without group selection."""
-
-    def test_output_shapes(self):
-        """Validates getattr defaults work: n_group=1, E from gate.weight.shape[0]."""
-        moe_block, T, H, E, K = _make_minimax_m2_moe_block()
-        hidden = torch.randn(T, H)
-
-        scores, token_idx, expert_idx, logits = sigmoid_topk_routing(hidden, moe_block)
-
-        assert scores.shape == (T * K,)
-        assert token_idx.shape == (T * K,)
-        assert expert_idx.shape == (T * K,)
-        assert logits.shape == (T, E)
-
-    def test_bias_on_block_not_gate(self):
-        """Verify that e_score_correction_bias on the block (not gate) is used."""
-        T, H, E, K = 8, 16, 8, 2
-        gate = SimpleNamespace(
-            weight=torch.randn(E, H),
-            top_k=K,
-        )
-        # Large positive bias on expert 0 should make it selected more often
-        bias = torch.zeros(E)
-        bias[0] = 100.0
-        moe_block = SimpleNamespace(
-            gate=gate,
-            top_k=K,
-            e_score_correction_bias=bias,
+    def test_sonicmoe_impl_requires_flag(self):
+        out = KernelsArgs.check_experts_implementation(
+            {"experts_implementation": "sonicmoe"}
         )
-        hidden = torch.randn(T, H)
-
-        _, _, expert_idx, _ = sigmoid_topk_routing(hidden, moe_block)
-
-        # Expert 0 should appear for every token due to the large bias
-        expert_idx_2d = expert_idx.reshape(T, K)
-        for t in range(T):
-            assert 0 in expert_idx_2d[t]
-
-
-# ============================================================================
-# Ernie 4.5 MoE: softmax -> bias correction -> topk
-# ============================================================================
-
-
-def _make_ernie_moe_block(T=8, H=16, E=8, K=2, norm_min=1e-20):
-    """Create a mock Ernie 4.5 MoE block for routing tests.
-
-    Ernie 4.5 uses a gate.moe_statics module that adds bias to softmax probs
-    before topk selection, then gathers from original probs.
-    """
-    bias = torch.zeros(E)
-
-    class MockMoeStatics:
-        def __init__(self, bias_tensor):
-            self.e_score_correction_bias = bias_tensor
-
-        def __call__(self, probs):
-            return probs + self.e_score_correction_bias
-
-    gate = SimpleNamespace(
-        weight=torch.randn(E, H),
-        top_k=K,
-        moe_statics=MockMoeStatics(bias),
-        norm_min=norm_min,
-    )
-    moe_block = SimpleNamespace(gate=gate)
-    return moe_block, bias, T, H, E, K
-
+        assert out["experts_implementation"] == "eager"
 
-class TestSoftmaxBiasTopkRouting:
-    """Tests for Ernie 4.5 MoE routing (softmax_bias_topk_routing)."""
-
-    def test_output_shapes(self):
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_bias_topk_routing,
+    def test_scattermoe_impl_requires_flag(self):
+        out = KernelsArgs.check_experts_implementation(
+            {"experts_implementation": "scattermoe"}
         )
+        assert out["experts_implementation"] == "eager"
 
-        moe_block, _, T, H, E, K = _make_ernie_moe_block()
-        hidden = torch.randn(T, H)
-
-        scores, token_idx, expert_idx, logits = softmax_bias_topk_routing(
-            hidden, moe_block
+    def test_unknown_impl_falls_back_to_eager(self):
+        out = KernelsArgs.check_experts_implementation(
+            {"experts_implementation": "not-a-real-impl"}
         )
+        assert out["experts_implementation"] == "eager"
 
-        assert scores.shape == (T * K,)
-        assert token_idx.shape == (T * K,)
-        assert expert_idx.shape == (T * K,)
-        assert logits.shape == (T, E)
+    def test_builtin_impls_pass_through(self):
+        for impl in ("eager", "batched_mm", "grouped_mm"):
+            out = KernelsArgs.check_experts_implementation(
+                {"experts_implementation": impl}
+            )
+            assert out["experts_implementation"] == impl
 
-    def test_scores_are_float32(self):
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_bias_topk_routing,
-        )
 
-        moe_block, _, T, H, E, K = _make_ernie_moe_block()
-        hidden = torch.randn(T, H)
+class TestSonicMoERegistration:
+    """Test that register_sonicmoe_experts plugs into ALL_EXPERTS_FUNCTIONS."""
 
-        scores, _, _, _ = softmax_bias_topk_routing(hidden, moe_block)
-        assert scores.dtype == torch.float32
+    def test_register_adds_entry(self):
+        from transformers.integrations.moe import ALL_EXPERTS_FUNCTIONS
 
-    def test_token_indices_sorted_ascending(self):
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_bias_topk_routing,
+        from axolotl.integrations.kernels.libs.sonicmoe.experts import (
+            register_sonicmoe_experts,
+            sonicmoe_experts_forward_with_lora,
         )
 
-        moe_block, _, T, H, E, K = _make_ernie_moe_block()
-        hidden = torch.randn(T, H)
+        register_sonicmoe_experts()
+        assert "sonicmoe" in ALL_EXPERTS_FUNCTIONS
+        assert ALL_EXPERTS_FUNCTIONS["sonicmoe"] is sonicmoe_experts_forward_with_lora
 
-        _, token_idx, _, _ = softmax_bias_topk_routing(hidden, moe_block)
-        diffs = token_idx[1:] - token_idx[:-1]
-        assert (diffs >= 0).all()
+    def test_register_is_idempotent(self):
+        from transformers.integrations.moe import ALL_EXPERTS_FUNCTIONS
 
-    def test_expert_indices_in_range(self):
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_bias_topk_routing,
+        from axolotl.integrations.kernels.libs.sonicmoe.experts import (
+            register_sonicmoe_experts,
         )
 
-        moe_block, _, T, H, E, K = _make_ernie_moe_block()
-        hidden = torch.randn(T, H)
+        register_sonicmoe_experts()
+        register_sonicmoe_experts()
+        # Just one entry, no error
+        assert "sonicmoe" in ALL_EXPERTS_FUNCTIONS
 
-        _, _, expert_idx, _ = softmax_bias_topk_routing(hidden, moe_block)
-        assert (expert_idx >= 0).all()
-        assert (expert_idx < E).all()
+    def test_register_overrides_upstream(self):
+        """Axolotl's LoRA-aware variant replaces upstream's plain forward."""
+        from transformers.integrations.moe import ALL_EXPERTS_FUNCTIONS
+        from transformers.integrations.sonicmoe import sonicmoe_experts_forward
 
-    def test_renormalized_scores_sum_to_one(self):
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_bias_topk_routing,
+        from axolotl.integrations.kernels.libs.sonicmoe.experts import (
+            register_sonicmoe_experts,
+            sonicmoe_experts_forward_with_lora,
         )
 
-        moe_block, _, T, H, E, K = _make_ernie_moe_block()
-        hidden = torch.randn(T, H)
+        register_sonicmoe_experts()
+        assert ALL_EXPERTS_FUNCTIONS["sonicmoe"] is sonicmoe_experts_forward_with_lora
+        assert ALL_EXPERTS_FUNCTIONS["sonicmoe"] is not sonicmoe_experts_forward
 
-        scores, _, _, _ = softmax_bias_topk_routing(hidden, moe_block)
-        per_token_sums = scores.reshape(T, K).sum(dim=-1)
-        assert torch.allclose(per_token_sums, torch.ones(T), atol=1e-5)
 
-    def test_bias_affects_expert_selection(self):
-        """Large positive bias on expert 0 should make it always selected."""
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_bias_topk_routing,
-        )
+class TestMoELoRAMaterialize:
+    """Verify the LoRA materialization autograd Function used by the registered forward."""
 
-        moe_block, bias, T, H, E, K = _make_ernie_moe_block()
-        bias[0] = 100.0  # mutate the bias to strongly favor expert 0
-        hidden = torch.randn(T, H)
+    def test_forward_shape_and_identity_with_zero_lora(self):
+        """W_eff == base when LoRA tensors are zero, regardless of layout convention."""
+        from axolotl.integrations.kernels.libs.sonicmoe.lora import MoELoRAMaterialize
 
-        _, _, expert_idx, _ = softmax_bias_topk_routing(hidden, moe_block)
-        expert_idx_2d = expert_idx.reshape(T, K)
-        for t in range(T):
-            assert 0 in expert_idx_2d[t]
+        E, dim1, dim2, r = 4, 8, 6, 2
+        base = torch.randn(E, dim1, dim2)
+        lora_A = torch.zeros(r * E, dim2)
+        lora_B = torch.zeros(dim1, r * E)
+        scaling = 0.5
 
+        W_eff = MoELoRAMaterialize.apply(base, lora_A, lora_B, scaling)
+        assert W_eff.shape == base.shape
+        torch.testing.assert_close(W_eff, base, atol=1e-6, rtol=1e-6)
 
-# ============================================================================
-# DeepSeek V2: softmax -> group_limited_greedy / greedy -> topk
-# ============================================================================
+    def test_forward_scaling_linearity(self):
+        """Doubling scaling should double the LoRA delta."""
+        from axolotl.integrations.kernels.libs.sonicmoe.lora import MoELoRAMaterialize
 
+        E, dim1, dim2, r = 4, 8, 6, 2
+        base = torch.randn(E, dim1, dim2)
+        lora_A = torch.randn(r * E, dim2)
+        lora_B = torch.randn(dim1, r * E)
 
-def _make_deepseek_v2_moe_block(
-    T=8, H=16, E=16, K=4, num_group=2, topk_group=1, topk_method="group_limited_greedy"
-):
-    """Create a mock DeepSeek V2 MoE block for routing tests.
+        W_1 = MoELoRAMaterialize.apply(base, lora_A, lora_B, 1.0)
+        W_2 = MoELoRAMaterialize.apply(base, lora_A, lora_B, 2.0)
+        torch.testing.assert_close(W_2 - base, 2 * (W_1 - base), atol=1e-5, rtol=1e-5)
 
-    DeepSeek V2 uses num_group (not n_group), gate is nn.Linear,
-    and supports greedy / group_limited_greedy topk methods.
-    """
-    gate = SimpleNamespace(weight=torch.randn(E, H))
-    moe_block = SimpleNamespace(
-        gate=gate,
-        top_k=K,
-        num_group=num_group,
-        topk_group=topk_group,
-        topk_method=topk_method,
-        routed_scaling_factor=1.0,
-    )
-    return moe_block, T, H, E, K
-
-
-class TestSoftmaxGroupLimitedTopkRouting:
-    """Tests for DeepSeek V2 routing (softmax_group_limited_topk_routing)."""
-
-    def test_output_shapes_group_limited(self):
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_group_limited_topk_routing,
-        )
+    def test_forward_matches_peft_einsum(self):
+        """Delta matches PEFT's ParamWrapper.get_delta_weight einsum convention.
 
-        moe_block, T, H, E, K = _make_deepseek_v2_moe_block(
-            topk_method="group_limited_greedy"
-        )
-        hidden = torch.randn(T, H)
+        Reference: ``peft.tuners.lora.layer.ParamWrapper.get_delta_weight``
+        on PEFT 0.19.x — ``einsum("o r e, e r i -> e o i", B_3d, A_3d)`` where
+        ``B_3d = lora_B.reshape(dim1, r, E)`` and ``A_3d = lora_A.reshape(E, r, dim2)``.
+        """
+        from axolotl.integrations.kernels.libs.sonicmoe.lora import MoELoRAMaterialize
 
-        scores, token_idx, expert_idx, logits = softmax_group_limited_topk_routing(
-            hidden, moe_block
-        )
+        E, dim1, dim2, r = 3, 5, 4, 2
+        base = torch.zeros(E, dim1, dim2)
+        lora_A = torch.randn(r * E, dim2)
+        lora_B = torch.randn(dim1, r * E)
+        scaling = 0.7
 
-        assert scores.shape == (T * K,)
-        assert token_idx.shape == (T * K,)
-        assert expert_idx.shape == (T * K,)
-        assert logits.shape == (T, E)
+        W_eff = MoELoRAMaterialize.apply(base, lora_A, lora_B, scaling)
 
-    def test_output_shapes_greedy(self):
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_group_limited_topk_routing,
-        )
+        # PEFT's reference computation
+        A_3d = lora_A.reshape(E, r, dim2)
+        B_3d = lora_B.reshape(dim1, r, E)
+        peft_delta = torch.einsum("o r e, e r i -> e o i", B_3d, A_3d) * scaling
 
-        moe_block, T, H, E, K = _make_deepseek_v2_moe_block(topk_method="greedy")
-        hidden = torch.randn(T, H)
+        torch.testing.assert_close(W_eff, peft_delta, atol=1e-5, rtol=1e-5)
 
-        scores, token_idx, expert_idx, logits = softmax_group_limited_topk_routing(
-            hidden, moe_block
-        )
+    def test_gradient_flows_to_lora(self):
+        from axolotl.integrations.kernels.libs.sonicmoe.lora import MoELoRAMaterialize
 
-        assert scores.shape == (T * K,)
-        assert logits.shape == (T, E)
+        E, dim1, dim2, r = 4, 8, 6, 2
+        base = torch.randn(E, dim1, dim2, requires_grad=False)
+        lora_A = torch.randn(r * E, dim2, requires_grad=True)
+        lora_B = torch.randn(dim1, r * E, requires_grad=True)
+        scaling = 0.5
 
-    def test_scores_are_float32(self):
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_group_limited_topk_routing,
-        )
+        W_eff = MoELoRAMaterialize.apply(base, lora_A, lora_B, scaling)
+        loss = W_eff.sum()
+        loss.backward()
 
-        moe_block, T, H, E, K = _make_deepseek_v2_moe_block()
-        hidden = torch.randn(T, H)
-
-        scores, _, _, _ = softmax_group_limited_topk_routing(hidden, moe_block)
-        assert scores.dtype == torch.float32
-
-    def test_token_indices_sorted_ascending(self):
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_group_limited_topk_routing,
-        )
-
-        moe_block, T, H, E, K = _make_deepseek_v2_moe_block()
-        hidden = torch.randn(T, H)
-
-        _, token_idx, _, _ = softmax_group_limited_topk_routing(hidden, moe_block)
-        diffs = token_idx[1:] - token_idx[:-1]
-        assert (diffs >= 0).all()
-
-    def test_expert_indices_in_range(self):
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_group_limited_topk_routing,
-        )
-
-        moe_block, T, H, E, K = _make_deepseek_v2_moe_block()
-        hidden = torch.randn(T, H)
-
-        _, _, expert_idx, _ = softmax_group_limited_topk_routing(hidden, moe_block)
-        assert (expert_idx >= 0).all()
-        assert (expert_idx < E).all()
-
-    def test_scaling_factor_applied(self):
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_group_limited_topk_routing,
-        )
-
-        moe_block, T, H, E, K = _make_deepseek_v2_moe_block(topk_method="greedy")
-        hidden = torch.randn(T, H)
-
-        scores_1x, _, _, _ = softmax_group_limited_topk_routing(hidden, moe_block)
-
-        moe_block.routed_scaling_factor = 2.5
-        scores_2x, _, _, _ = softmax_group_limited_topk_routing(hidden, moe_block)
-
-        assert torch.allclose(scores_2x, scores_1x * 2.5, atol=1e-5)
-
-    def test_group_selection_restricts_experts(self):
-        """With num_group=4 and topk_group=1, experts should come from selected groups."""
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_group_limited_topk_routing,
-        )
+        assert lora_A.grad is not None
+        assert lora_B.grad is not None
+        assert lora_A.grad.abs().max() > 0
+        assert lora_B.grad.abs().max() > 0
+        # Base weight is frozen — no grad expected.
+        assert base.grad is None
 
-        moe_block, T, H, E, K = _make_deepseek_v2_moe_block(
-            E=16, K=2, num_group=4, topk_group=1, topk_method="group_limited_greedy"
+    def test_no_lora_returns_base_unchanged(self):
+        from axolotl.integrations.kernels.libs.sonicmoe.lora import (
+            materialize_expert_lora,
         )
-        hidden = torch.randn(T, H)
-
-        _, _, expert_idx, _ = softmax_group_limited_topk_routing(hidden, moe_block)
-        expert_idx_2d = expert_idx.reshape(T, K)
-        group_size = E // moe_block.num_group
-        for t in range(T):
-            experts = expert_idx_2d[t]
-            groups = experts // group_size
-            # All selected experts should be from the same group
-            assert (groups == groups[0]).all()
-
-    def test_unsupported_topk_method_raises(self):
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_group_limited_topk_routing,
-        )
-
-        moe_block, T, H, E, K = _make_deepseek_v2_moe_block(topk_method="invalid")
-        hidden = torch.randn(T, H)
-
-        with pytest.raises(ValueError, match="unsupported topk_method"):
-            softmax_group_limited_topk_routing(hidden, moe_block)
-
 
-# ============================================================================
-# HunYuan V1 MoE: softmax -> topk -> renorm (via gate.wg)
-# ============================================================================
+        base = torch.randn(4, 8, 6)
+        result = materialize_expert_lora(base, None)
+        assert result is base
 
 
-def _make_hunyuan_moe_block(T=8, H=16, E=8, K=2):
-    """Create a mock HunYuan V1 MoE block for routing tests.
-
-    HunYuan V1 uses gate.wg (nn.Linear-like) instead of gate.weight,
-    and top_k on the moe_block instead of the gate.
+class TestExpertsClassMetadata:
+    """The forward reads `has_gate`/`has_bias`/`is_transposed`/`is_concatenated`
+    that are set by transformers' @use_experts_implementation decorator.
+    Verify our forward respects these without an actual CUDA kernel call.
     """
-    wg = SimpleNamespace(weight=torch.randn(E, H))
-    gate = SimpleNamespace(wg=wg)
-    moe_block = SimpleNamespace(gate=gate, top_k=K)
-    return moe_block, T, H, E, K
-
 
-class TestSoftmaxTopkWgRouting:
-    """Tests for HunYuan V1 MoE routing (softmax_topk_wg_routing)."""
-
-    def test_output_shapes(self):
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_topk_wg_routing,
+    def test_rejects_non_gated(self):
+        from axolotl.integrations.kernels.libs.sonicmoe.experts import (
+            sonicmoe_experts_forward_with_lora,
         )
 
-        moe_block, T, H, E, K = _make_hunyuan_moe_block()
-        hidden = torch.randn(T, H)
+        fake_self = SimpleNamespace(has_gate=False)
+        hidden = torch.zeros(2, 4)
+        top_k_index = torch.zeros(2, 1, dtype=torch.long)
+        top_k_weights = torch.ones(2, 1)
 
-        scores, token_idx, expert_idx, logits = softmax_topk_wg_routing(
-            hidden, moe_block
-        )
-
-        assert scores.shape == (T * K,)
-        assert token_idx.shape == (T * K,)
-        assert expert_idx.shape == (T * K,)
-        assert logits.shape == (T, E)
-
-    def test_scores_are_float32(self):
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_topk_wg_routing,
-        )
-
-        moe_block, T, H, E, K = _make_hunyuan_moe_block()
-        hidden = torch.randn(T, H)
-
-        scores, _, _, _ = softmax_topk_wg_routing(hidden, moe_block)
-        assert scores.dtype == torch.float32
-
-    def test_token_indices_sorted_ascending(self):
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_topk_wg_routing,
-        )
-
-        moe_block, T, H, E, K = _make_hunyuan_moe_block()
-        hidden = torch.randn(T, H)
-
-        _, token_idx, _, _ = softmax_topk_wg_routing(hidden, moe_block)
-        diffs = token_idx[1:] - token_idx[:-1]
-        assert (diffs >= 0).all()
-
-    def test_expert_indices_in_range(self):
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_topk_wg_routing,
-        )
-
-        moe_block, T, H, E, K = _make_hunyuan_moe_block()
-        hidden = torch.randn(T, H)
-
-        _, _, expert_idx, _ = softmax_topk_wg_routing(hidden, moe_block)
-        assert (expert_idx >= 0).all()
-        assert (expert_idx < E).all()
-
-    def test_renormalized_scores_sum_to_one(self):
-        """HunYuan V1 always renormalizes (no norm_topk_prob flag)."""
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_topk_wg_routing,
-        )
-
-        moe_block, T, H, E, K = _make_hunyuan_moe_block()
-        hidden = torch.randn(T, H)
-
-        scores, _, _, _ = softmax_topk_wg_routing(hidden, moe_block)
-        per_token_sums = scores.reshape(T, K).sum(dim=-1)
-        assert torch.allclose(per_token_sums, torch.ones(T), atol=1e-5)
+        with pytest.raises(ValueError, match="has_gate"):
+            sonicmoe_experts_forward_with_lora(
+                fake_self, hidden, top_k_index, top_k_weights
+            )
 
-    def test_uses_gate_wg_weight(self):
-        """Verify that modifying gate.wg.weight changes the routing output."""
-        from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-            softmax_topk_wg_routing,
+    def test_rejects_non_cuda(self):
+        from axolotl.integrations.kernels.libs.sonicmoe.experts import (
+            sonicmoe_experts_forward_with_lora,
         )
 
-        moe_block, T, H, E, K = _make_hunyuan_moe_block()
-        hidden = torch.randn(T, H)
+        fake_self = SimpleNamespace(has_gate=True)
+        hidden = torch.zeros(2, 4)  # CPU tensor
+        top_k_index = torch.zeros(2, 1, dtype=torch.long)
+        top_k_weights = torch.ones(2, 1)
 
-        scores1, _, _, _ = softmax_topk_wg_routing(hidden, moe_block)
-
-        # Change the wg weight and verify scores change
-        moe_block.gate.wg.weight = torch.randn(E, H)
-        scores2, _, _, _ = softmax_topk_wg_routing(hidden, moe_block)
-
-        assert not torch.equal(scores1, scores2)
+        with pytest.raises(ValueError, match="CUDA"):
+            sonicmoe_experts_forward_with_lora(
+                fake_self, hidden, top_k_index, top_k_weights
+            )
diff --git a/tests/integrations/test_sonicmoe_gradients.py b/tests/integrations/test_sonicmoe_gradients.py
deleted file mode 100644
index cb5ef7663d..0000000000
--- a/tests/integrations/test_sonicmoe_gradients.py
+++ /dev/null
@@ -1,158 +0,0 @@
-"""
-Gradient correctness tests for SonicMoE routing functions (CPU-only).
-
-Uses torch.autograd.gradcheck with float32 inputs to match the production
-code path where routing happens in float32.
-"""
-
-import torch
-
-from axolotl.integrations.kernels.libs.sonicmoe.routing import (
-    sigmoid_topk_routing,
-    softmax_topk_routing,
-)
-
-_GC_EPS = 1e-3
-_GC_ATOL = 1e-3
-_GC_RTOL = 1e-3
-
-
-def _make_softmax_moe_block(weight):
-    gate = torch.nn.Module()
-    gate.weight = weight
-    gate.top_k = 2
-    gate.norm_topk_prob = True
-
-    moe_block = torch.nn.Module()
-    moe_block.gate = gate
-    return moe_block
-
-
-def _make_sigmoid_moe_block(weight, bias):
-    gate = torch.nn.Module()
-    gate.weight = weight
-    gate.e_score_correction_bias = bias
-
-    moe_block = torch.nn.Module()
-    moe_block.gate = gate
-    moe_block.top_k = 2
-    moe_block.n_routed_experts = weight.shape[0]
-    moe_block.n_group = 1
-    moe_block.norm_topk_prob = True
-    moe_block.routed_scaling_factor = 1.0
-    return moe_block
-
-
-class TestSoftmaxTopkRoutingGradcheck:
-    """Numerical gradient verification for softmax_topk_routing."""
-
-    def test_gradcheck_wrt_gate_weight(self):
-        T, H, E = 4, 8, 4
-
-        hidden = torch.randn(T, H, dtype=torch.float32)
-
-        def fn(weight):
-            moe_block = _make_softmax_moe_block(weight)
-            scores, _, _, _ = softmax_topk_routing(hidden, moe_block)
-            return scores
-
-        weight = torch.randn(E, H, dtype=torch.float32, requires_grad=True)
-        torch.autograd.gradcheck(
-            fn, (weight,), eps=_GC_EPS, atol=_GC_ATOL, rtol=_GC_RTOL
-        )
-
-    def test_gradcheck_wrt_hidden_states(self):
-        T, H, E = 4, 8, 4
-
-        weight = torch.randn(E, H, dtype=torch.float32)
-        moe_block = _make_softmax_moe_block(weight)
-
-        def fn(hidden):
-            scores, _, _, _ = softmax_topk_routing(hidden, moe_block)
-            return scores
-
-        hidden = torch.randn(T, H, dtype=torch.float32, requires_grad=True)
-        torch.autograd.gradcheck(
-            fn, (hidden,), eps=_GC_EPS, atol=_GC_ATOL, rtol=_GC_RTOL
-        )
-
-    def test_gradcheck_wrt_router_logits(self):
-        T, H, E = 4, 8, 4
-
-        hidden = torch.randn(T, H, dtype=torch.float32)
-
-        def fn(weight):
-            moe_block = _make_softmax_moe_block(weight)
-            _, _, _, router_logits = softmax_topk_routing(hidden, moe_block)
-            return router_logits
-
-        weight = torch.randn(E, H, dtype=torch.float32, requires_grad=True)
-        torch.autograd.gradcheck(
-            fn, (weight,), eps=_GC_EPS, atol=_GC_ATOL, rtol=_GC_RTOL
-        )
-
-    def test_no_norm_variant(self):
-        T, H, E = 4, 8, 4
-
-        hidden = torch.randn(T, H, dtype=torch.float32)
-
-        def fn(weight):
-            moe_block = _make_softmax_moe_block(weight)
-            moe_block.gate.norm_topk_prob = False
-            scores, _, _, _ = softmax_topk_routing(hidden, moe_block)
-            return scores
-
-        weight = torch.randn(E, H, dtype=torch.float32, requires_grad=True)
-        torch.autograd.gradcheck(
-            fn, (weight,), eps=_GC_EPS, atol=_GC_ATOL, rtol=_GC_RTOL
-        )
-
-
-class TestSigmoidTopkRoutingGradcheck:
-    """Numerical gradient verification for sigmoid_topk_routing."""
-
-    def test_gradcheck_wrt_gate_weight(self):
-        T, H, E = 4, 8, 4
-
-        hidden = torch.randn(T, H, dtype=torch.float32)
-        bias = torch.zeros(E, dtype=torch.float32)
-
-        def fn(weight):
-            moe_block = _make_sigmoid_moe_block(weight, bias)
-            scores, _, _, _ = sigmoid_topk_routing(hidden, moe_block)
-            return scores
-
-        weight = torch.randn(E, H, dtype=torch.float32, requires_grad=True)
-        torch.autograd.gradcheck(
-            fn, (weight,), eps=_GC_EPS, atol=_GC_ATOL, rtol=_GC_RTOL
-        )
-
-    def test_gradcheck_wrt_hidden_states(self):
-        T, H, E = 4, 8, 4
-
-        weight = torch.randn(E, H, dtype=torch.float32)
-        bias = torch.zeros(E, dtype=torch.float32)
-        moe_block = _make_sigmoid_moe_block(weight, bias)
-
-        def fn(hidden):
-            scores, _, _, _ = sigmoid_topk_routing(hidden, moe_block)
-            return scores
-
-        hidden = torch.randn(T, H, dtype=torch.float32, requires_grad=True)
-        torch.autograd.gradcheck(
-            fn, (hidden,), eps=_GC_EPS, atol=_GC_ATOL, rtol=_GC_RTOL
-        )
-
-    def test_gradcheck_wrt_bias(self):
-        T, H, E = 4, 8, 4
-
-        hidden = torch.randn(T, H, dtype=torch.float32)
-        weight = torch.randn(E, H, dtype=torch.float32)
-
-        def fn(bias):
-            moe_block = _make_sigmoid_moe_block(weight, bias)
-            scores, _, _, _ = sigmoid_topk_routing(hidden, moe_block)
-            return scores
-
-        bias = torch.zeros(E, dtype=torch.float32, requires_grad=True)
-        torch.autograd.gradcheck(fn, (bias,), eps=_GC_EPS, atol=_GC_ATOL, rtol=_GC_RTOL)
diff --git a/tests/integrations/test_sonicmoe_lora.py b/tests/integrations/test_sonicmoe_lora.py
index 4b25843fee..7b1cf1765d 100644
--- a/tests/integrations/test_sonicmoe_lora.py
+++ b/tests/integrations/test_sonicmoe_lora.py
@@ -15,7 +15,6 @@
     has_lora,
     materialize_expert_lora,
     unwrap_experts_lora,
-    unwrap_gate_lora,
 )
 
 # =============================================================================
@@ -44,21 +43,6 @@ def _make_mock_lora_module(weight_A, weight_B, scaling_val, param_name=None):
     return mock
 
 
-def _make_peft_gate(hidden_size, num_experts, rank, scaling=0.5):
-    """Create a mock PEFT-wrapped gate module."""
-    base_gate = MagicMock()
-    base_gate.weight = torch.randn(num_experts, hidden_size)
-    base_gate.top_k = 2
-    base_gate.norm_topk_prob = True
-
-    lora_A = torch.randn(rank, hidden_size)
-    lora_B = torch.randn(num_experts, rank)
-
-    wrapper = _make_mock_lora_module(lora_A, lora_B, scaling)
-    wrapper.base_layer = base_gate
-    return wrapper, base_gate
-
-
 def _make_peft_experts(
     num_experts, gate_up_dim, down_dim, hidden_size, rank, scaling=0.5
 ):
@@ -134,39 +118,6 @@ def test_no_active_adapters(self):
         assert get_lora_params_from_wrapper(wrapper) == (None, None, None)
 
 
-# =============================================================================
-# Tests: unwrap_gate_lora
-# =============================================================================
-
-
-class TestUnwrapGateLora:
-    def test_plain_gate(self):
-        gate = MagicMock(spec=["weight", "top_k"])
-        del gate.base_layer
-        del gate.lora_A
-        gate.weight = torch.randn(8, 64)
-        base, weight, delta = unwrap_gate_lora(gate)
-        assert base is gate
-        assert torch.equal(weight, gate.weight)
-        assert delta is None
-
-    def test_wrapped_gate(self):
-        wrapper, base_gate = _make_peft_gate(
-            hidden_size=64, num_experts=8, rank=4, scaling=0.5
-        )
-        base, weight, delta = unwrap_gate_lora(wrapper)
-        assert base is base_gate
-        assert torch.equal(weight, base_gate.weight)
-        assert delta is not None
-        assert delta.shape == base_gate.weight.shape
-
-        # Verify delta = scaling * B @ A
-        lora_A = wrapper.lora_A["default"].weight
-        lora_B = wrapper.lora_B["default"].weight
-        expected = 0.5 * (lora_B @ lora_A)
-        assert torch.allclose(delta, expected)
-
-
 # =============================================================================
 # Tests: unwrap_experts_lora
 # =============================================================================
diff --git a/tests/kernels/test_fused_rope_autotune_telemetry.py b/tests/kernels/test_fused_rope_autotune_telemetry.py
new file mode 100644
index 0000000000..bc5d61c0fc
--- /dev/null
+++ b/tests/kernels/test_fused_rope_autotune_telemetry.py
@@ -0,0 +1,168 @@
+"""Tests for fused RMSNorm+RoPE autotune telemetry.
+
+Mocked end-to-end, so no Triton or CUDA is required (mirrors
+``tests/integrations/test_scattermoe_autotune_telemetry.py``).
+"""
+
+import sys
+from types import SimpleNamespace
+from unittest.mock import MagicMock, patch
+
+_MODPATH = "axolotl.kernels.gemma4_fused_rope"
+
+
+def _make_mock_config(kwargs, num_warps=2, num_stages=1):
+    return SimpleNamespace(
+        kwargs=kwargs, num_warps=num_warps, num_stages=num_stages, num_ctas=None
+    )
+
+
+def _make_fake_module(cache=None):
+    kernel = SimpleNamespace(cache=cache if cache is not None else {})
+    return SimpleNamespace(_rms_norm_rope_backward_kernel=kernel)
+
+
+class TestCollector:
+    def test_no_module_returns_empty(self):
+        from axolotl.kernels.autotune_telemetry import (
+            collect_fused_rope_autotune_configs,
+        )
+
+        with patch.dict(sys.modules, {_MODPATH: None}):
+            assert collect_fused_rope_autotune_configs() == []
+
+    def test_empty_cache_returns_empty(self):
+        from axolotl.kernels.autotune_telemetry import (
+            collect_fused_rope_autotune_configs,
+        )
+
+        with patch.dict(sys.modules, {_MODPATH: _make_fake_module()}):
+            assert collect_fused_rope_autotune_configs() == []
+
+    def test_populated_cache_returns_configs(self):
+        from axolotl.kernels.autotune_telemetry import (
+            collect_fused_rope_autotune_configs,
+        )
+
+        cfg = _make_mock_config({}, num_warps=2, num_stages=1)
+        fake = _make_fake_module(cache={(128, "torch.bfloat16"): cfg})
+        with patch.dict(sys.modules, {_MODPATH: fake}):
+            result = collect_fused_rope_autotune_configs()
+
+        assert len(result) == 1
+        entry = result[0]
+        assert entry["kernel"] == "fused_rms_norm_rope_bwd"
+        assert entry["key"]["n_cols"] == 128
+        assert entry["key"]["_extra"] == ["torch.bfloat16"]
+        assert entry["config"]["num_warps"] == 2
+        assert entry["config"]["num_stages"] == 1
+
+    def test_multiple_head_dims(self):
+        """head_dim 128 (Qwen3) and 256 (Qwen3.5) get separate cache entries."""
+        from axolotl.kernels.autotune_telemetry import (
+            collect_fused_rope_autotune_configs,
+        )
+
+        fake = _make_fake_module(
+            cache={
+                (128,): _make_mock_config({}, num_warps=2),
+                (256,): _make_mock_config({}, num_warps=4),
+            }
+        )
+        with patch.dict(sys.modules, {_MODPATH: fake}):
+            result = collect_fused_rope_autotune_configs()
+
+        assert {e["key"]["n_cols"] for e in result} == {128, 256}
+
+
+class TestCallback:
+    def _patch_collect(self, return_value=None, side_effect=None):
+        return patch(
+            "axolotl.kernels.autotune_telemetry.collect_fused_rope_autotune_configs",
+            return_value=return_value,
+            side_effect=side_effect,
+        )
+
+    def test_reports_once_on_first_step(self):
+        from axolotl.kernels.autotune_telemetry import FusedRopeAutotuneReportCallback
+
+        cb = FusedRopeAutotuneReportCallback()
+        state = MagicMock()
+        state.global_step = 1
+        configs = [{"kernel": "fused_rms_norm_rope_bwd", "key": {}, "config": {}}]
+
+        with (
+            self._patch_collect(return_value=configs),
+            patch("axolotl.telemetry.manager.TelemetryManager") as tm_cls,
+        ):
+            tm = MagicMock()
+            tm.enabled = True
+            tm_cls.get_instance.return_value = tm
+
+            cb.on_step_end(args=MagicMock(), state=state, control=MagicMock())
+            assert tm.send_event.call_count == 1
+            kw = tm.send_event.call_args[1]
+            assert kw["event_type"] == "fused-rope-autotune"
+            assert kw["properties"]["kernel_count"] == 1
+
+            cb.on_step_end(args=MagicMock(), state=state, control=MagicMock())
+            assert tm.send_event.call_count == 1
+
+    def test_retries_until_step_5_then_gives_up(self):
+        from axolotl.kernels.autotune_telemetry import FusedRopeAutotuneReportCallback
+
+        cb = FusedRopeAutotuneReportCallback()
+        with self._patch_collect(return_value=[]):
+            for step in range(1, 7):
+                state = MagicMock()
+                state.global_step = step
+                cb.on_step_end(args=MagicMock(), state=state, control=MagicMock())
+        assert cb._reported is True
+
+    def test_includes_gpu_info(self):
+        from axolotl.kernels.autotune_telemetry import FusedRopeAutotuneReportCallback
+
+        cb = FusedRopeAutotuneReportCallback()
+        state = MagicMock()
+        state.global_step = 1
+        configs = [{"kernel": "fused_rms_norm_rope_bwd", "key": {}, "config": {}}]
+        gpu = {
+            "gpu_name": "NVIDIA H100",
+            "gpu_compute_capability": "9.0",
+            "gpu_memory_bytes": 85899345920,
+        }
+
+        with (
+            self._patch_collect(return_value=configs),
+            patch("axolotl.kernels.autotune_telemetry._get_gpu_info", return_value=gpu),
+            patch("axolotl.telemetry.manager.TelemetryManager") as tm_cls,
+        ):
+            tm = MagicMock()
+            tm.enabled = True
+            tm_cls.get_instance.return_value = tm
+
+            cb.on_step_end(args=MagicMock(), state=state, control=MagicMock())
+            props = tm.send_event.call_args[1]["properties"]
+            assert props["gpu_name"] == "NVIDIA H100"
+            assert props["gpu_compute_capability"] == "9.0"
+
+    def test_skips_send_when_telemetry_disabled(self):
+        from axolotl.kernels.autotune_telemetry import FusedRopeAutotuneReportCallback
+
+        cb = FusedRopeAutotuneReportCallback()
+        state = MagicMock()
+        state.global_step = 1
+
+        with (
+            self._patch_collect(
+                return_value=[{"kernel": "x", "key": {}, "config": {}}]
+            ),
+            patch("axolotl.telemetry.manager.TelemetryManager") as tm_cls,
+        ):
+            tm = MagicMock()
+            tm.enabled = False
+            tm_cls.get_instance.return_value = tm
+
+            cb.on_step_end(args=MagicMock(), state=state, control=MagicMock())
+            assert tm.send_event.call_count == 0
+            assert cb._reported is True
diff --git a/tests/kernels/test_gemma4_fused_rope_compile.py b/tests/kernels/test_gemma4_fused_rope_compile.py
new file mode 100644
index 0000000000..dc8858722a
--- /dev/null
+++ b/tests/kernels/test_gemma4_fused_rope_compile.py
@@ -0,0 +1,66 @@
+"""torch.compile traceability tests for the fused RMSNorm+RoPE kernel."""
+
+import pytest
+import torch
+
+pytestmark = [
+    pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required"),
+]
+
+
+def _make_inputs(B=2, S=64, H=4, D=64, n_rot=64, dtype=torch.bfloat16, seed=0):
+    torch.manual_seed(seed)
+    x = torch.randn(B, S, H, D, device="cuda", dtype=dtype, requires_grad=True)
+    w = torch.randn(D, device="cuda", dtype=dtype, requires_grad=True)
+    cos = torch.randn(B, S, n_rot, device="cuda", dtype=dtype)
+    sin = torch.randn(B, S, n_rot, device="cuda", dtype=dtype)
+    return x, w, cos, sin
+
+
+def test_fused_rms_norm_rope_compile_forward_matches_eager():
+    from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope
+
+    x, w, cos, sin = _make_inputs(seed=1)
+    x_ref = x.detach().clone().requires_grad_(True)
+    w_ref = w.detach().clone().requires_grad_(True)
+
+    y_eager = fused_rms_norm_rope(x_ref, w_ref, cos, sin, eps=1e-6)
+
+    compiled = torch.compile(fused_rms_norm_rope, fullgraph=True, dynamic=False)
+    y_compiled = compiled(x, w, cos, sin, eps=1e-6)
+
+    torch.testing.assert_close(y_compiled, y_eager, rtol=1e-2, atol=1e-2)
+    assert torch.isfinite(y_compiled).all()
+
+
+def test_fused_rms_norm_rope_compile_backward():
+    from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope
+
+    x, w, cos, sin = _make_inputs(seed=2)
+
+    compiled = torch.compile(fused_rms_norm_rope, fullgraph=True, dynamic=False)
+    y = compiled(x, w, cos, sin, eps=1e-6)
+    y.sum().backward()
+
+    assert x.grad is not None and x.grad.isfinite().all() and x.grad.abs().sum() > 0
+    assert w.grad is not None and w.grad.isfinite().all() and w.grad.abs().sum() > 0
+
+
+def test_fused_rms_norm_noscale_compile():
+    from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_noscale
+
+    torch.manual_seed(3)
+    x = torch.randn(
+        2, 32, 4, 64, device="cuda", dtype=torch.bfloat16, requires_grad=True
+    )
+    x_ref = x.detach().clone().requires_grad_(True)
+
+    y_eager = fused_rms_norm_noscale(x_ref, eps=1e-6)
+
+    compiled = torch.compile(fused_rms_norm_noscale, fullgraph=True, dynamic=False)
+    y_compiled = compiled(x, eps=1e-6)
+
+    torch.testing.assert_close(y_compiled, y_eager, rtol=1e-2, atol=1e-2)
+
+    y_compiled.sum().backward()
+    assert x.grad is not None and x.grad.isfinite().all() and x.grad.abs().sum() > 0
diff --git a/tests/kernels/test_gemma4_fused_rope_unit_offset.py b/tests/kernels/test_gemma4_fused_rope_unit_offset.py
new file mode 100644
index 0000000000..1e84d95feb
--- /dev/null
+++ b/tests/kernels/test_gemma4_fused_rope_unit_offset.py
@@ -0,0 +1,238 @@
+"""Correctness tests for the ``unit_offset=True`` (Gemma-style) path in the fused RMSNorm+RoPE kernel."""
+
+import pytest
+import torch
+
+torch.manual_seed(7)
+
+pytestmark = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required")
+
+
+def _rotate_half(x):
+    x1, x2 = x.chunk(2, dim=-1)
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def _reference_unit_offset(x, weight, cos, sin, eps):
+    x_fp32 = x.float()
+    rstd = torch.rsqrt(x_fp32.pow(2).mean(-1, keepdim=True) + eps)
+    normed = (x_fp32 * rstd * (1.0 + weight.float())).to(x.dtype)
+    cos_b = cos.unsqueeze(2)
+    sin_b = sin.unsqueeze(2)
+    return normed * cos_b + _rotate_half(normed) * sin_b
+
+
+def _reference_unit_offset_partial(x, weight, cos, sin, eps):
+    """Reference for ``unit_offset=True`` with ``cos.shape[-1] < D`` (Qwen3.5 partial rotary)."""
+    n_rot = cos.shape[-1]
+    x_fp32 = x.float()
+    rstd = torch.rsqrt(x_fp32.pow(2).mean(-1, keepdim=True) + eps)
+    normed = (x_fp32 * rstd * (1.0 + weight.float())).to(x.dtype)
+    rot_part, pass_part = normed[..., :n_rot], normed[..., n_rot:]
+    cos_b, sin_b = cos.unsqueeze(2), sin.unsqueeze(2)
+    rotated = rot_part * cos_b + _rotate_half(rot_part) * sin_b
+    return torch.cat([rotated, pass_part], dim=-1)
+
+
+def _reference_fp32(x, weight, cos, sin, eps, unit_offset):
+    """fp32 ground truth: no intermediate bf16 rounding, so it's *more* accurate
+    than the eager bf16 path. Handles full (``n_rot == D``) and partial rotary."""
+    n_rot = cos.shape[-1]
+    x_fp32 = x.float()
+    rstd = torch.rsqrt(x_fp32.pow(2).mean(-1, keepdim=True) + eps)
+    scale = (1.0 + weight.float()) if unit_offset else weight.float()
+    normed = x_fp32 * rstd * scale
+    cos_b, sin_b = cos.float().unsqueeze(2), sin.float().unsqueeze(2)
+    rot_part, pass_part = normed[..., :n_rot], normed[..., n_rot:]
+    rotated = rot_part * cos_b + _rotate_half(rot_part) * sin_b
+    return torch.cat([rotated, pass_part], dim=-1)
+
+
+def _assert_at_bf16_floor(y_fused, y_ref_fp32, y_eager):
+    """The fused kernel keeps fp32 internally and rounds once, so its bf16 output
+    must land at the bf16 rounding floor of the fp32 reference — and be at least
+    as accurate as the eager bf16 path (which rounds several times mid-compute)."""
+    floor = (y_ref_fp32.to(y_fused.dtype).float() - y_ref_fp32).abs().max()
+    fused_err = (y_fused.float() - y_ref_fp32).abs().max()
+    eager_err = (y_eager.float() - y_ref_fp32).abs().max()
+    assert fused_err <= 1.5 * floor, (
+        f"fused err {fused_err:.3e} exceeds 1.5x bf16 floor {floor:.3e}"
+    )
+    assert fused_err <= eager_err, (
+        f"fused ({fused_err:.3e}) less accurate than eager bf16 path ({eager_err:.3e})"
+    )
+
+
+class TestForward:
+    def test_matches_reference(self):
+        from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope
+
+        B, S, H, D = 2, 32, 4, 64
+        eps = 1e-6
+        x = torch.randn(B, S, H, D, device="cuda", dtype=torch.bfloat16)
+        weight = torch.randn(D, device="cuda", dtype=torch.bfloat16) * 0.1
+        cos = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)
+        sin = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)
+
+        y_fused = fused_rms_norm_rope(
+            x.clone(), weight, cos, sin, eps=eps, unit_offset=True
+        )
+        y_ref_fp32 = _reference_fp32(x, weight, cos, sin, eps, unit_offset=True)
+        y_eager = _reference_unit_offset(x.clone(), weight, cos, sin, eps)
+        _assert_at_bf16_floor(y_fused, y_ref_fp32, y_eager)
+
+    def test_no_offset_matches_fp32_reference(self):
+        """Qwen3 / Gemma 4 path (``unit_offset=False``) also sits at the bf16 floor."""
+        from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope
+
+        B, S, H, D = 2, 32, 4, 64
+        eps = 1e-6
+        x = torch.randn(B, S, H, D, device="cuda", dtype=torch.bfloat16)
+        weight = torch.randn(D, device="cuda", dtype=torch.bfloat16) * 0.1
+        cos = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)
+        sin = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)
+
+        y_fused = fused_rms_norm_rope(
+            x.clone(), weight, cos, sin, eps=eps, unit_offset=False
+        )
+        y_ref_fp32 = _reference_fp32(x, weight, cos, sin, eps, unit_offset=False)
+        x_fp32 = x.float()
+        rstd = torch.rsqrt(x_fp32.pow(2).mean(-1, keepdim=True) + eps)
+        normed = (x_fp32 * rstd * weight.float()).to(x.dtype)
+        y_eager = normed * cos.unsqueeze(2) + _rotate_half(normed) * sin.unsqueeze(2)
+        _assert_at_bf16_floor(y_fused, y_ref_fp32, y_eager)
+
+    def test_differs_from_no_offset(self):
+        from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope
+
+        B, S, H, D = 1, 8, 2, 32
+        x = torch.randn(B, S, H, D, device="cuda", dtype=torch.bfloat16)
+        weight = torch.randn(D, device="cuda", dtype=torch.bfloat16)
+        cos = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)
+        sin = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)
+
+        y_off = fused_rms_norm_rope(x, weight, cos, sin, unit_offset=False)
+        y_on = fused_rms_norm_rope(x, weight, cos, sin, unit_offset=True)
+        diff = (y_off.float() - y_on.float()).abs().max().item()
+        assert diff > 1e-3, f"unit_offset toggle had no effect: max_abs_diff={diff}"
+
+
+class TestBackward:
+    def test_x_and_w_grad_match_eager(self):
+        from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope
+
+        B, S, H, D = 2, 16, 4, 64
+        eps = 1e-6
+        cos = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)
+        sin = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)
+        weight_init = torch.randn(D, device="cuda", dtype=torch.bfloat16) * 0.1
+
+        x_ref = torch.randn(
+            B, S, H, D, device="cuda", dtype=torch.bfloat16, requires_grad=True
+        )
+        w_ref = weight_init.clone().requires_grad_(True)
+        y_ref = _reference_unit_offset(x_ref, w_ref, cos, sin, eps)
+        y_ref.sum().backward()
+
+        x_fused = x_ref.data.clone().requires_grad_(True)
+        w_fused = weight_init.clone().requires_grad_(True)
+        y_fused = fused_rms_norm_rope(
+            x_fused, w_fused, cos, sin, eps=eps, unit_offset=True
+        )
+        y_fused.sum().backward()
+
+        cos_sim_x = torch.nn.functional.cosine_similarity(
+            x_fused.grad.flatten().float(), x_ref.grad.flatten().float(), dim=0
+        )
+        cos_sim_w = torch.nn.functional.cosine_similarity(
+            w_fused.grad.flatten().float(), w_ref.grad.flatten().float(), dim=0
+        )
+        assert cos_sim_x > 0.999, f"x grad cosine_sim={cos_sim_x:.6f}"
+        assert cos_sim_w > 0.995, f"w grad cosine_sim={cos_sim_w:.6f}"
+
+
+class TestCompile:
+    def test_compile_fullgraph(self):
+        from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope
+
+        B, S, H, D = 1, 8, 2, 32
+        eps = 1e-6
+        x = torch.randn(B, S, H, D, device="cuda", dtype=torch.bfloat16)
+        weight = torch.randn(D, device="cuda", dtype=torch.bfloat16) * 0.1
+        cos = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)
+        sin = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)
+
+        eager = fused_rms_norm_rope(x, weight, cos, sin, eps=eps, unit_offset=True)
+        compiled_fn = torch.compile(fused_rms_norm_rope, fullgraph=True)
+        compiled = compiled_fn(x, weight, cos, sin, eps=eps, unit_offset=True)
+
+        torch.testing.assert_close(compiled, eager, rtol=0, atol=0)
+
+
+class TestPartialRotary:
+    """``unit_offset=True`` combined with ``n_rot < D`` (Qwen3.5/Qwen3.6 partial rotary)."""
+
+    def test_forward_matches_reference(self):
+        from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope
+
+        B, S, H, D, n_rot = 2, 32, 4, 128, 64
+        eps = 1e-6
+        x = torch.randn(B, S, H, D, device="cuda", dtype=torch.bfloat16)
+        weight = torch.randn(D, device="cuda", dtype=torch.bfloat16) * 0.1
+        cos = torch.randn(B, S, n_rot, device="cuda", dtype=torch.bfloat16)
+        sin = torch.randn(B, S, n_rot, device="cuda", dtype=torch.bfloat16)
+
+        y_fused = fused_rms_norm_rope(
+            x.clone(), weight, cos, sin, eps=eps, unit_offset=True
+        )
+        y_ref_fp32 = _reference_fp32(x, weight, cos, sin, eps, unit_offset=True)
+        y_eager = _reference_unit_offset_partial(x.clone(), weight, cos, sin, eps)
+        _assert_at_bf16_floor(y_fused, y_ref_fp32, y_eager)
+
+    def test_backward_x_and_w_grad_match_eager(self):
+        from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope
+
+        B, S, H, D, n_rot = 2, 16, 4, 128, 64
+        eps = 1e-6
+        cos = torch.randn(B, S, n_rot, device="cuda", dtype=torch.bfloat16)
+        sin = torch.randn(B, S, n_rot, device="cuda", dtype=torch.bfloat16)
+        weight_init = torch.randn(D, device="cuda", dtype=torch.bfloat16) * 0.1
+
+        x_ref = torch.randn(
+            B, S, H, D, device="cuda", dtype=torch.bfloat16, requires_grad=True
+        )
+        w_ref = weight_init.clone().requires_grad_(True)
+        y_ref = _reference_unit_offset_partial(x_ref, w_ref, cos, sin, eps)
+        y_ref.sum().backward()
+
+        x_fused = x_ref.data.clone().requires_grad_(True)
+        w_fused = weight_init.clone().requires_grad_(True)
+        y_fused = fused_rms_norm_rope(
+            x_fused, w_fused, cos, sin, eps=eps, unit_offset=True
+        )
+        y_fused.sum().backward()
+
+        cos_sim_x = torch.nn.functional.cosine_similarity(
+            x_fused.grad.flatten().float(), x_ref.grad.flatten().float(), dim=0
+        )
+        cos_sim_w = torch.nn.functional.cosine_similarity(
+            w_fused.grad.flatten().float(), w_ref.grad.flatten().float(), dim=0
+        )
+        assert cos_sim_x > 0.999, f"x grad cosine_sim={cos_sim_x:.6f}"
+        assert cos_sim_w > 0.995, f"w grad cosine_sim={cos_sim_w:.6f}"
+
+    def test_compile_fullgraph(self):
+        from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope
+
+        B, S, H, D, n_rot = 1, 8, 2, 64, 32
+        eps = 1e-6
+        x = torch.randn(B, S, H, D, device="cuda", dtype=torch.bfloat16)
+        weight = torch.randn(D, device="cuda", dtype=torch.bfloat16) * 0.1
+        cos = torch.randn(B, S, n_rot, device="cuda", dtype=torch.bfloat16)
+        sin = torch.randn(B, S, n_rot, device="cuda", dtype=torch.bfloat16)
+
+        eager = fused_rms_norm_rope(x, weight, cos, sin, eps=eps, unit_offset=True)
+        compiled_fn = torch.compile(fused_rms_norm_rope, fullgraph=True)
+        compiled = compiled_fn(x, weight, cos, sin, eps=eps, unit_offset=True)
+
+        torch.testing.assert_close(compiled, eager, rtol=0, atol=0)
diff --git a/tests/monkeypatch/test_gemma4_fused_attn.py b/tests/monkeypatch/test_gemma4_fused_attn.py
index 0530d0ee8d..db719c4cc7 100644
--- a/tests/monkeypatch/test_gemma4_fused_attn.py
+++ b/tests/monkeypatch/test_gemma4_fused_attn.py
@@ -1,19 +1,4 @@
-"""Tests for the Gemma 4 fused-attention monkey-patch.
-
-These tests exercise the patched ``Gemma4TextAttention.forward`` against
-the stock implementation it replaces. The hybrid Gemma 4 model intentionally
-mixes a sliding (`head_dim=32`) layer with a full-attention proportional-rope
-layer (`global_head_dim=64`, `partial_rotary_factor=0.25`) so that the
-partial-rotary RMSNorm+RoPE path through the fused Triton kernel is
-exercised end-to-end (this is the bug originally documented in
-``GEMMA4_FUSED_ROPE_HYBRID_ATTN_BUG.md``).
-
-The full-model forward also pins that the fused forward keeps accepting
-whatever call shape ``Gemma4TextDecoderLayer.forward`` produces in the
-installed transformers version — so any future signature drift on
-upstream's side trips a clear failure here instead of a confusing
-TypeError deep in a training run.
-"""
+"""Tests for the Gemma 4 fused-attention monkey-patch (hybrid sliding + partial-rotary layers)."""
 
 import pytest
 import torch
@@ -30,8 +15,7 @@
 
 @pytest.fixture
 def restore_gemma4_attention():
-    """Snapshot ``Gemma4TextAttention.forward`` and restore after the test
-    so the monkey-patch does not leak across the suite."""
+    """Snapshot ``Gemma4TextAttention.forward`` so the patch can't leak across tests."""
     from transformers.models.gemma4.modeling_gemma4 import Gemma4TextAttention
 
     saved = Gemma4TextAttention.forward
@@ -40,10 +24,7 @@ def restore_gemma4_attention():
 
 
 def _build_hybrid_config():
-    """Tiny hybrid Gemma 4 config: one sliding layer + one full-attention
-    layer with proportional rope and partial_rotary_factor=0.25. This is
-    the same shape pattern as ``google/gemma-4-26B-A4B-it`` but small
-    enough to fit on any GPU."""
+    """Tiny hybrid Gemma 4: one sliding + one full-attention layer with ``partial_rotary_factor=0.25``."""
     from transformers.models.gemma4.configuration_gemma4 import Gemma4TextConfig
 
     cfg = Gemma4TextConfig(
@@ -84,15 +65,186 @@ def _build_model(seed=0):
     return Gemma4TextModel(cfg).cuda().to(torch.bfloat16).eval()
 
 
+class TestGemma4FusedAttnLoRACompose:
+    """LoRA QKV + fused composition. Gemma 4 in transformers>=5.8.1 has no matching ``QKV_PATCHES`` entry yet, hence the strict xfail below."""
+
+    def _build_cfg(self):
+        from axolotl.utils.dict import DictDefault
+
+        return DictDefault(
+            {
+                "base_model": "fake/gemma4",
+                "lora_qkv_kernel": True,
+                "lora_o_kernel": True,
+                "lora_dropout": 0.0,
+            }
+        )
+
+    @pytest.mark.xfail(
+        reason="Gemma 4 QKV_PATCHES need refresh for transformers 5.8.1",
+        strict=True,
+    )
+    def test_lora_qkv_then_fused_does_not_raise(
+        self, restore_gemma4_attention, monkeypatch
+    ):
+        from transformers.models.gemma4.modeling_gemma4 import Gemma4TextAttention
+
+        from axolotl.monkeypatch import lora_kernels
+        from axolotl.monkeypatch.models.gemma4.fused_attn import (
+            patch_gemma4_fused_attn,
+        )
+
+        monkeypatch.setattr(
+            lora_kernels,
+            "get_attention_cls_from_config",
+            lambda _cfg: Gemma4TextAttention,
+        )
+
+        try:
+            delattr(Gemma4TextAttention, "_original_forward")
+        except AttributeError:
+            pass
+
+        try:
+            lora_kernels.patch_self_attn_lora(self._build_cfg())
+            assert hasattr(Gemma4TextAttention, "_original_forward"), (
+                "patch_self_attn_lora must run on stock source first"
+            )
+            patch_gemma4_fused_attn()
+        finally:
+            try:
+                delattr(Gemma4TextAttention, "_original_forward")
+            except AttributeError:
+                pass
+
+    def test_reverse_order_skips_lora_rewrite(
+        self, restore_gemma4_attention, monkeypatch, caplog
+    ):
+        """Fused-then-LoRA must NOT install the LoRA source rewrite. Upstream
+        PR #3657 made ``patch_self_attn_lora`` detect ``apply_qkv``/``apply_o``
+        on a fused-patched attention and skip; our ``patch_manager`` reorder
+        keeps this from happening in practice, but the skip path is the last
+        line of defense."""
+        import logging
+
+        from transformers.models.gemma4.modeling_gemma4 import Gemma4TextAttention
+
+        from axolotl.monkeypatch import lora_kernels
+        from axolotl.monkeypatch.models.gemma4.fused_attn import (
+            patch_gemma4_fused_attn,
+        )
+
+        monkeypatch.setattr(
+            lora_kernels,
+            "get_attention_cls_from_config",
+            lambda _cfg: Gemma4TextAttention,
+        )
+
+        try:
+            delattr(Gemma4TextAttention, "_original_forward")
+        except AttributeError:
+            pass
+
+        try:
+            patch_gemma4_fused_attn()
+            logger = logging.getLogger("axolotl.monkeypatch.lora_kernels")
+            logger.addHandler(caplog.handler)
+            previous_level = logger.level
+            logger.setLevel(logging.INFO)
+            try:
+                lora_kernels.patch_self_attn_lora(self._build_cfg())
+            finally:
+                logger.removeHandler(caplog.handler)
+                logger.setLevel(previous_level)
+            assert "fused attention" in caplog.text and "skipping" in caplog.text, (
+                "expected lora_kernels to detect the fused path and log a skip; "
+                f"got {caplog.text}"
+            )
+            assert not hasattr(Gemma4TextAttention, "_original_forward"), (
+                "lora_kernels installed _original_forward over a fused-patched class"
+            )
+        finally:
+            try:
+                delattr(Gemma4TextAttention, "_original_forward")
+            except AttributeError:
+                pass
+
+
+def _build_kv_shared_config():
+    """Hybrid Gemma 4 with ``num_kv_shared_layers > 0`` so the fused shared-KV branch runs."""
+    from transformers.models.gemma4.configuration_gemma4 import Gemma4TextConfig
+
+    cfg = Gemma4TextConfig(
+        vocab_size=128,
+        hidden_size=64,
+        intermediate_size=128,
+        num_hidden_layers=4,
+        num_kv_shared_layers=2,
+        num_attention_heads=2,
+        num_key_value_heads=2,
+        head_dim=32,
+        global_head_dim=64,
+        layer_types=[
+            "sliding_attention",
+            "full_attention",
+            "sliding_attention",
+            "full_attention",
+        ],
+        sliding_window=64,
+        max_position_embeddings=2048,
+        hidden_size_per_layer_input=16,
+        vocab_size_per_layer_input=128,
+        rope_parameters={
+            "sliding_attention": {"rope_type": "default", "rope_theta": 10000.0},
+            "full_attention": {
+                "rope_type": "proportional",
+                "rope_theta": 1000000.0,
+                "partial_rotary_factor": 0.25,
+            },
+        },
+    )
+    cfg._attn_implementation = "sdpa"
+    return cfg
+
+
+class TestFusedAttnSharedKV:
+    """Regression: ``num_kv_shared_layers > 0`` hit the ``kv_shared_layer_index`` key removed in transformers>=5.8."""
+
+    def test_shared_kv_forward_backward(self, restore_gemma4_attention):
+        from transformers.models.gemma4.modeling_gemma4 import Gemma4TextModel
+
+        from axolotl.monkeypatch.models.gemma4.fused_attn import (
+            patch_gemma4_fused_attn,
+        )
+
+        torch.manual_seed(4)
+        m = Gemma4TextModel(_build_kv_shared_config()).cuda().to(torch.bfloat16).train()
+        assert any(layer.self_attn.is_kv_shared_layer for layer in m.layers), (
+            "test config must exercise at least one kv-shared layer"
+        )
+
+        ids = torch.randint(0, 128, (2, 16), device="cuda")
+        mask = torch.ones(2, 16, dtype=torch.long, device="cuda")
+
+        with torch.no_grad():
+            ref = m(input_ids=ids, attention_mask=mask).last_hidden_state.clone()
+
+        patch_gemma4_fused_attn()
+        out = m(input_ids=ids, attention_mask=mask).last_hidden_state
+        out.sum().backward()
+
+        assert out.shape == ref.shape
+        assert torch.isfinite(out).all()
+        cos_sim = torch.nn.functional.cosine_similarity(
+            ref.flatten().float(), out.detach().flatten().float(), dim=0
+        )
+        assert cos_sim > 0.999, f"shared-kv fused vs stock cosine_sim={cos_sim:.6f}"
+
+
 class TestFusedAttnSignature:
-    """The fused forward must accept the same call shape as
-    ``Gemma4TextDecoderLayer`` produces in the installed transformers
-    version. Any signature drift surfaces here as a TypeError."""
+    """Pin the fused forward against the live ``Gemma4TextDecoderLayer.forward`` call shape."""
 
     def test_decoder_layer_can_call_fused_forward(self, restore_gemma4_attention):
-        """Run a model forward that exercises the real
-        ``Gemma4TextDecoderLayer -> Gemma4TextAttention`` call path with
-        the fused patch installed."""
         from axolotl.monkeypatch.models.gemma4.fused_attn import (
             patch_gemma4_fused_attn,
         )
@@ -110,13 +262,9 @@ def test_decoder_layer_can_call_fused_forward(self, restore_gemma4_attention):
 
 
 class TestFusedAttnPerLayerCorrectness:
-    """Compare the patched attention layer to the stock implementation
-    on a single forward call. This isolates the fused kernel correctness
-    from cross-layer numerical drift."""
+    """Single-layer comparison of patched vs stock attention to isolate kernel correctness from cross-layer drift."""
 
     def _run_attention(self, model, layer_idx, hidden_states, position_ids):
-        """Call ``Gemma4TextAttention.forward`` (whatever is currently
-        installed) for one layer and return the output."""
         attn = model.layers[layer_idx].self_attn
         layer_type = model.config.layer_types[layer_idx]
         cos, sin = model.rotary_emb(hidden_states, position_ids, layer_type)
@@ -157,14 +305,11 @@ def test_forward_matches_stock(self, restore_gemma4_attention, layer_idx):
         assert cos_sim > 0.999, (
             f"layer {layer_idx} fused vs stock cosine_sim={cos_sim:.6f}"
         )
-        # bf16 precision: a few millis of absolute drift per element is
-        # acceptable for a Q/K/V projection pipeline. Anything larger is
-        # a real bug.
         torch.testing.assert_close(got, ref, rtol=5e-2, atol=5e-2)
 
 
 class TestFusedAttnFullModel:
-    """End-to-end model forward + backward through both layer types."""
+    """End-to-end forward + backward through both layer types."""
 
     def test_full_forward_matches_stock(self, restore_gemma4_attention):
         from axolotl.monkeypatch.models.gemma4.fused_attn import (
@@ -187,14 +332,9 @@ def test_full_forward_matches_stock(self, restore_gemma4_attention):
         cos_sim = torch.nn.functional.cosine_similarity(
             ref.flatten().float(), got.flatten().float(), dim=0
         )
-        # End-to-end through 2 layers (RMSNorm, attention, MLP/MoE) in bf16
-        # accumulates a small amount of numerical drift; we just want to
-        # pin that the two paths are computing the same function.
         assert cos_sim > 0.999, f"end-to-end cosine_sim={cos_sim:.6f}"
 
     def test_backward_grad_flows_through_fused_path(self, restore_gemma4_attention):
-        """Gradients must propagate through the fused RMSNorm+RoPE kernels
-        for both the sliding and proportional-rope layers."""
         from axolotl.monkeypatch.models.gemma4.fused_attn import (
             patch_gemma4_fused_attn,
         )
@@ -207,8 +347,6 @@ def test_backward_grad_flows_through_fused_path(self, restore_gemma4_attention):
         out = m(input_ids=ids, attention_mask=mask).last_hidden_state
         out.sum().backward()
 
-        # Both layers must accumulate gradients on q_norm.weight and
-        # k_norm.weight — that proves the fused kernel ran the backward.
         for i, layer in enumerate(m.layers[:2]):
             attn = layer.self_attn
             assert attn.q_norm.weight.grad is not None, f"layer {i} q_norm no grad"
diff --git a/tests/monkeypatch/test_gemma4_kernelize.py b/tests/monkeypatch/test_gemma4_kernelize.py
new file mode 100644
index 0000000000..ee8989fa2c
--- /dev/null
+++ b/tests/monkeypatch/test_gemma4_kernelize.py
@@ -0,0 +1,196 @@
+"""Tests for the Gemma 4 ``kernelize()`` / ``use_kernels`` crash fix.
+
+transformers decorates ``Gemma4VisionAttention`` with
+``@use_kernelized_func(apply_rotary_pos_emb)`` where the target is a plain
+function. Under ``use_kernels=True``, ``model.kernelize()`` then tries to
+``register_module()`` that function and crashes with::
+
+    TypeError: ...apply_rotary_pos_emb is not a Module subclass
+
+(and a follow-on ``AttributeError`` from the cleanup path). The patch strips the
+dead non-Module ``_hidden_kernels`` entry so ``kernelize()`` succeeds. The entry
+is never read by ``Gemma4VisionAttention.forward`` (which uses
+``apply_multidimensional_rope``), so removing it is behavior-neutral.
+"""
+
+import pytest
+
+pytest.importorskip(
+    "transformers.models.gemma4",
+    reason="gemma4_kernelize patch only matters when Gemma 4 is available",
+)
+
+
+@pytest.fixture
+def restore_gemma4_vision_attention():
+    """Snapshot ``Gemma4VisionAttention.__init__`` and reset patch state after
+    each test so patch state doesn't leak across the suite."""
+    from transformers.models.gemma4 import modeling_gemma4
+
+    saved_init = modeling_gemma4.Gemma4VisionAttention.__init__
+    yield modeling_gemma4
+    modeling_gemma4.Gemma4VisionAttention.__init__ = saved_init
+    from axolotl.monkeypatch import gemma4_kernelize
+
+    gemma4_kernelize._PATCH_APPLIED = False
+
+
+def _vision_config():
+    from transformers.models.gemma4.configuration_gemma4 import Gemma4VisionConfig
+
+    return Gemma4VisionConfig(
+        hidden_size=32,
+        intermediate_size=64,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        head_dim=8,
+    )
+
+
+def test_patch_installs_and_is_idempotent(restore_gemma4_vision_attention):
+    from axolotl.monkeypatch.gemma4_kernelize import patch_gemma4_kernelize
+
+    assert patch_gemma4_kernelize() is True
+    init_first = restore_gemma4_vision_attention.Gemma4VisionAttention.__init__
+    # Second call must not re-wrap.
+    assert patch_gemma4_kernelize() is True
+    init_second = restore_gemma4_vision_attention.Gemma4VisionAttention.__init__
+    assert init_first is init_second
+    assert hasattr(init_first, "_axolotl_original")
+
+
+def test_patch_strips_non_module_hidden_kernels(restore_gemma4_vision_attention):
+    modeling_gemma4 = restore_gemma4_vision_attention
+    from axolotl.monkeypatch.gemma4_kernelize import patch_gemma4_kernelize
+
+    cfg = _vision_config()
+
+    # Before the patch, the bare function is registered (the crash source).
+    attn_before = modeling_gemma4.Gemma4VisionAttention(cfg, layer_idx=0)
+    assert "apply_rotary_pos_emb" in getattr(attn_before, "_hidden_kernels", {})
+
+    patch_gemma4_kernelize()
+    attn_after = modeling_gemma4.Gemma4VisionAttention(cfg, layer_idx=0)
+    assert dict(getattr(attn_after, "_hidden_kernels", {})) == {}
+
+
+def test_register_module_path_no_longer_crashes(restore_gemma4_vision_attention):
+    """The exact step that crashed: kernelize()'s ``attach_hidden_kernels``
+    does ``register_module(name, fn)`` for each ``_hidden_kernels`` entry."""
+    modeling_gemma4 = restore_gemma4_vision_attention
+    from axolotl.monkeypatch.gemma4_kernelize import patch_gemma4_kernelize
+
+    cfg = _vision_config()
+    patch_gemma4_kernelize()
+    attn = modeling_gemma4.Gemma4VisionAttention(cfg, layer_idx=0)
+
+    # Replicate attach_hidden_kernels; with the entry stripped there is nothing
+    # to (mis)register, so this must not raise.
+    for name, fn in getattr(attn, "_hidden_kernels", {}).items():
+        if name not in dict(attn.named_children()):
+            attn.register_module(name, fn)
+
+
+def test_patch_does_not_alter_weights(restore_gemma4_vision_attention):
+    """The shim only mutates ``_hidden_kernels``; parameters are untouched."""
+    import torch
+
+    modeling_gemma4 = restore_gemma4_vision_attention
+    from axolotl.monkeypatch.gemma4_kernelize import patch_gemma4_kernelize
+
+    cfg = _vision_config()
+    torch.manual_seed(0)
+    before = modeling_gemma4.Gemma4VisionAttention(cfg, layer_idx=0).state_dict()
+
+    patch_gemma4_kernelize()
+    torch.manual_seed(0)
+    after = modeling_gemma4.Gemma4VisionAttention(cfg, layer_idx=0).state_dict()
+
+    assert before.keys() == after.keys()
+    assert all(torch.equal(before[k], after[k]) for k in before)
+
+
+def test_forward_does_not_reference_stripped_entry(restore_gemma4_vision_attention):
+    """Behavior-invariance guarantee: forward never reads the stripped names,
+    so dropping them cannot change the forward result."""
+    modeling_gemma4 = restore_gemma4_vision_attention
+    names = modeling_gemma4.Gemma4VisionAttention.forward.__code__.co_names
+    assert "apply_rotary_pos_emb" not in names
+    assert "_hidden_kernels" not in names
+
+
+def test_unpatch_restores_original(restore_gemma4_vision_attention):
+    modeling_gemma4 = restore_gemma4_vision_attention
+    from axolotl.monkeypatch.gemma4_kernelize import (
+        patch_gemma4_kernelize,
+        unpatch_gemma4_kernelize,
+    )
+
+    original = modeling_gemma4.Gemma4VisionAttention.__init__
+    patch_gemma4_kernelize()
+    assert modeling_gemma4.Gemma4VisionAttention.__init__ is not original
+    unpatch_gemma4_kernelize()
+    assert modeling_gemma4.Gemma4VisionAttention.__init__ is original
+
+
+def test_unpatch_is_safe_without_prior_patch(restore_gemma4_vision_attention):
+    from axolotl.monkeypatch.gemma4_kernelize import unpatch_gemma4_kernelize
+
+    # No-op, no exception.
+    unpatch_gemma4_kernelize()
+
+
+def test_full_model_kernelize_succeeds_with_patch(restore_gemma4_vision_attention):
+    """End-to-end: a tiny full Gemma4 model crashes in ``kernelize()`` without
+    the patch and succeeds with it. No real 26B weights or CUDA required."""
+    modeling_gemma4 = restore_gemma4_vision_attention
+    from transformers.models.gemma4.configuration_gemma4 import (
+        Gemma4AudioConfig,
+        Gemma4Config,
+        Gemma4TextConfig,
+        Gemma4VisionConfig,
+    )
+
+    from axolotl.monkeypatch.gemma4_kernelize import patch_gemma4_kernelize
+
+    def build():
+        text = Gemma4TextConfig(
+            hidden_size=32,
+            intermediate_size=64,
+            num_hidden_layers=2,
+            num_attention_heads=4,
+            num_key_value_heads=2,
+            head_dim=8,
+            vocab_size=128,
+            num_experts=4,
+            num_experts_per_tok=2,
+        )
+        vis = Gemma4VisionConfig(
+            hidden_size=32,
+            intermediate_size=64,
+            num_hidden_layers=2,
+            num_attention_heads=4,
+            num_key_value_heads=2,
+            head_dim=8,
+        )
+        aud = Gemma4AudioConfig(
+            hidden_size=32,
+            intermediate_size=64,
+            num_hidden_layers=1,
+            num_attention_heads=4,
+        )
+        cfg = Gemma4Config(text_config=text, vision_config=vis, audio_config=aud)
+        return modeling_gemma4.Gemma4ForConditionalGeneration(cfg)
+
+    # Without the patch, kernelize() crashes.
+    model = build()
+    model.train()
+    with pytest.raises((TypeError, AttributeError, ValueError)):
+        model.kernelize()
+
+    # With the patch, it succeeds.
+    patch_gemma4_kernelize()
+    model = build()
+    model.train()
+    model.kernelize()
diff --git a/tests/monkeypatch/test_mamba_utils.py b/tests/monkeypatch/test_mamba_utils.py
new file mode 100644
index 0000000000..a473df29c1
--- /dev/null
+++ b/tests/monkeypatch/test_mamba_utils.py
@@ -0,0 +1,551 @@
+"""Unit tests for shared Mamba2 SSM utilities (mamba_utils.py).
+
+Tests cover get_seq_idx correctness under:
+  - single-rank packing
+  - context parallelism (mid-sample chunk starts)
+  - batch dimension
+  - dtype and device
+  - no-negative regression (CP rank > 0 must never produce -1)
+  - mamba2_cp_correction mathematical correctness
+  - wrap_mamba_scan_for_cp wrapper behaviour
+  - end-to-end CP split: full 2K scan == 2×1K split + correction
+"""
+
+import types
+from unittest.mock import patch
+
+import torch
+import torch.nn.functional as F
+
+from axolotl.monkeypatch.models.mamba_utils import (
+    get_seq_idx,
+    mamba2_cp_correction,
+    wrap_mamba_scan_for_cp,
+)
+
+
+def _reference_ssm_scan(x, dt, A, B, C, dt_bias=None, dt_softplus=False, h0=None):
+    """Pure-PyTorch step-by-step SSM scan (reference implementation).
+
+    Implements the Mamba2 discrete SSM recurrence:
+        Δ_t = softplus(dt_t + dt_bias)  or  dt_t
+        Ā_t = exp(A · Δ_t)
+        h_t = Ā_t · h_{t-1}  +  B_t ⊗ x_t
+        y_t = (C_t · h_t).sum(dim=n)
+
+    Args:
+        x:  [B, T, H, d]
+        dt: [B, T, H]
+        A:  [H]  (log-space, negative)
+        B:  [B, T, n_groups, n]
+        C:  [B, T, n_groups, n]
+        dt_bias:    [H] or None
+        dt_softplus: bool
+        h0: [B, H, d, n] initial state, or None → zeros
+
+    Returns:
+        out:     [B, T, H, d]
+        h_final: [B, H, d, n]
+    """
+    B_batch, T, H, d = x.shape
+    n_groups = B.shape[2]
+    n = B.shape[3]
+    heads_per_group = H // n_groups
+
+    dt_eff = dt + dt_bias[None, None, :] if dt_bias is not None else dt
+    if dt_softplus:
+        dt_eff = F.softplus(dt_eff)
+
+    h = torch.zeros(B_batch, H, d, n, dtype=x.dtype) if h0 is None else h0.clone()
+
+    outputs = []
+    for t in range(T):
+        A_bar = torch.exp(A[None, :] * dt_eff[:, t, :])  # [B, H]
+        B_t = B[:, t].repeat_interleave(heads_per_group, dim=1)  # [B, H, n]
+        C_t = C[:, t].repeat_interleave(heads_per_group, dim=1)  # [B, H, n]
+
+        h = A_bar[:, :, None, None] * h + B_t[:, :, None, :] * x[:, t, :, :, None]
+        y_t = (C_t[:, :, None, :] * h).sum(dim=-1)  # [B, H, d]
+        outputs.append(y_t)
+
+    return torch.stack(outputs, dim=1), h
+
+
+class TestGetSeqIdx:
+    """Tests for get_seq_idx(position_ids) → seq_idx."""
+
+    def test_single_sample_no_packing(self):
+        """Single sample with no packing: all zeros."""
+        pos = torch.tensor([[0, 1, 2, 3, 4]])
+        out = get_seq_idx(pos)
+        assert out.tolist() == [[0, 0, 0, 0, 0]]
+
+    def test_two_packed_samples(self):
+        """Two packed samples: index increments at the second sample boundary."""
+        pos = torch.tensor([[0, 1, 2, 3, 0, 1, 2]])
+        out = get_seq_idx(pos)
+        assert out.tolist() == [[0, 0, 0, 0, 1, 1, 1]]
+
+    def test_three_packed_samples(self):
+        """Three packed samples."""
+        pos = torch.tensor([[0, 1, 0, 1, 2, 0]])
+        out = get_seq_idx(pos)
+        assert out.tolist() == [[0, 0, 1, 1, 1, 2]]
+
+    def test_cp_rank_mid_sample_start(self):
+        """CP rank > 0: chunk starts mid-sample (position_ids[0] != 0).
+
+        Must produce non-negative seq_idx starting at 0, not -1.
+        """
+        pos = torch.tensor([[3, 4, 5, 0, 1, 2]])
+        out = get_seq_idx(pos)
+        assert out.tolist() == [[0, 0, 0, 1, 1, 1]]
+
+    def test_cp_rank_entire_chunk_mid_sample(self):
+        """CP rank whose entire chunk is mid-sample (no sample boundary)."""
+        pos = torch.tensor([[5, 6, 7, 8, 9]])
+        out = get_seq_idx(pos)
+        assert out.tolist() == [[0, 0, 0, 0, 0]]
+
+    def test_no_negative_values_regression(self):
+        """seq_idx must never contain -1 for any valid position_ids input."""
+        cases = [
+            [[1, 2, 3]],
+            [[10, 11, 12, 0, 1]],
+            [[0, 0, 0]],
+        ]
+        for pos_list in cases:
+            pos = torch.tensor(pos_list)
+            out = get_seq_idx(pos)
+            assert out.min().item() >= 0, f"Negative seq_idx for pos={pos_list}"
+
+    def test_batch_dimension(self):
+        """Batch of 3 sequences, each independently packed."""
+        pos = torch.tensor(
+            [
+                [0, 1, 2, 0, 1],
+                [0, 1, 0, 1, 2],
+                [3, 4, 0, 1, 2],
+            ]
+        )
+        out = get_seq_idx(pos)
+        assert out.tolist() == [
+            [0, 0, 0, 1, 1],
+            [0, 0, 1, 1, 1],
+            [0, 0, 1, 1, 1],
+        ]
+
+    def test_output_dtype_is_int32(self):
+        """Output dtype must be torch.int32 (mamba-ssm kernel requirement)."""
+        pos = torch.tensor([[0, 1, 2, 0, 1]])
+        out = get_seq_idx(pos)
+        assert out.dtype == torch.int32
+
+    def test_output_shape_matches_input(self):
+        """Output shape matches input shape."""
+        pos = torch.zeros(4, 128, dtype=torch.long)
+        out = get_seq_idx(pos)
+        assert out.shape == pos.shape
+
+    def test_single_token(self):
+        """Edge case: single token sequence."""
+        pos = torch.tensor([[0]])
+        out = get_seq_idx(pos)
+        assert out.tolist() == [[0]]
+
+    def test_cp_rank_starts_at_1(self):
+        """CP rank that starts exactly at position 1 (not 0)."""
+        pos = torch.tensor([[1, 2, 3, 0, 1]])
+        out = get_seq_idx(pos)
+        assert out.tolist() == [[0, 0, 0, 1, 1]]
+
+    def test_many_packed_samples(self):
+        """Many single-token samples packed together."""
+        pos = torch.tensor([[0, 0, 0, 0, 0, 0]])
+        out = get_seq_idx(pos)
+        assert out.tolist() == [[0, 1, 2, 3, 4, 5]]
+
+
+class TestMamba2CpCorrection:
+    """Tests for mamba2_cp_correction mathematical correctness."""
+
+    def test_zero_h_prev_is_noop(self):
+        """When h_prev is all zeros, output should be unchanged."""
+        B, T, H, d, n = 1, 8, 4, 16, 8
+        n_groups = 2
+
+        out = torch.randn(B, T, H * d)
+        h_final = torch.randn(B, H, d, n)
+        C = torch.randn(B, T, n_groups, n)
+        cum_A = torch.randn(B, T, H)
+        h_prev = torch.zeros(B, H, d, n)
+
+        corrected_out, corrected_h = mamba2_cp_correction(
+            out,
+            h_final,
+            C,
+            cum_A,
+            h_prev,
+            num_heads=H,
+            head_dim=d,
+        )
+
+        torch.testing.assert_close(corrected_out, out)
+        torch.testing.assert_close(corrected_h, h_final)
+
+    def test_correction_shapes(self):
+        """Output shapes must match input shapes."""
+        B, T, H, d, n = 2, 16, 8, 32, 16
+        n_groups = 4
+
+        out = torch.randn(B, T, H * d)
+        h_final = torch.randn(B, H, d, n)
+        C = torch.randn(B, T, n_groups, n)
+        cum_A = torch.randn(B, T, H)
+        h_prev = torch.randn(B, H, d, n)
+
+        corrected_out, corrected_h = mamba2_cp_correction(
+            out,
+            h_final,
+            C,
+            cum_A,
+            h_prev,
+            num_heads=H,
+            head_dim=d,
+        )
+
+        assert corrected_out.shape == out.shape
+        assert corrected_h.shape == h_final.shape
+
+    def test_correction_adds_to_output(self):
+        """With nonzero h_prev, output should differ from input."""
+        B, T, H, d, n = 1, 4, 2, 8, 4
+        n_groups = 1
+
+        out = torch.zeros(B, T, H * d)
+        h_final = torch.zeros(B, H, d, n)
+        C = torch.ones(B, T, n_groups, n)
+        cum_A = torch.zeros(B, T, H)  # exp(0) = 1, so full propagation
+        h_prev = torch.ones(B, H, d, n)
+
+        corrected_out, corrected_h = mamba2_cp_correction(
+            out,
+            h_final,
+            C,
+            cum_A,
+            h_prev,
+            num_heads=H,
+            head_dim=d,
+        )
+
+        # With exp(cum_A)=1, C=1, h_prev=1: delta_y should be nonzero
+        assert corrected_out.abs().sum() > 0
+        assert corrected_h.abs().sum() > 0
+
+    def test_correction_h_final_formula(self):
+        """Verify h_final correction: h_final + decay_T * h_prev."""
+        B, T, H, d, n = 1, 4, 2, 8, 4
+        n_groups = 1
+
+        h_final = torch.zeros(B, H, d, n)
+        C = torch.ones(B, T, n_groups, n)
+        cum_A = torch.zeros(B, T, H)
+        h_prev = torch.ones(B, H, d, n) * 2.0
+        out = torch.zeros(B, T, H * d)
+
+        _, corrected_h = mamba2_cp_correction(
+            out,
+            h_final,
+            C,
+            cum_A,
+            h_prev,
+            num_heads=H,
+            head_dim=d,
+        )
+
+        # exp(0) * 2.0 = 2.0 for all elements
+        expected = torch.ones(B, H, d, n) * 2.0
+        torch.testing.assert_close(corrected_h, expected)
+
+
+class TestCpSplitMatchesFullScan:
+    """End-to-end: full sequence scan == split into chunks + CP correction.
+
+    Runs a reference SSM scan on a full 2K-token sequence, then simulates
+    2-rank CP by splitting into 2×1K, running each half with h₀=0, and
+    applying mamba2_cp_correction to rank 1 using rank 0's final state.
+    The concatenated result must match the single-rank reference.
+    """
+
+    def test_2k_vs_2x1k_output_matches(self):
+        """Full 2048-token scan output == two 1024-token chunks + CP correction."""
+        torch.manual_seed(42)
+        B, T, H, d, n = 1, 2048, 4, 16, 8
+        n_groups = 2
+        dt_bias = torch.randn(H) * 0.1
+
+        x = torch.randn(B, T, H, d)
+        dt = torch.randn(B, T, H) * 0.1
+        A = -torch.rand(H).abs() - 0.01
+        B_ssm = torch.randn(B, T, n_groups, n) * 0.1
+        C_ssm = torch.randn(B, T, n_groups, n) * 0.1
+
+        ref_out, ref_h = _reference_ssm_scan(
+            x, dt, A, B_ssm, C_ssm, dt_bias=dt_bias, dt_softplus=True
+        )
+
+        T2 = T // 2
+
+        out_0, h_final_0 = _reference_ssm_scan(
+            x[:, :T2],
+            dt[:, :T2],
+            A,
+            B_ssm[:, :T2],
+            C_ssm[:, :T2],
+            dt_bias=dt_bias,
+            dt_softplus=True,
+        )
+
+        out_1, h_final_1 = _reference_ssm_scan(
+            x[:, T2:],
+            dt[:, T2:],
+            A,
+            B_ssm[:, T2:],
+            C_ssm[:, T2:],
+            dt_bias=dt_bias,
+            dt_softplus=True,
+        )
+
+        dt_eff_1 = F.softplus(dt[:, T2:] + dt_bias[None, None, :])
+        cum_A_1 = torch.cumsum(A[None, None, :] * dt_eff_1, dim=1)
+
+        corrected_out_1, corrected_h_1 = mamba2_cp_correction(
+            out_1.view(B, T2, H * d),
+            h_final_1,
+            C_ssm[:, T2:],
+            cum_A_1,
+            h_final_0,
+            num_heads=H,
+            head_dim=d,
+        )
+        corrected_out_1 = corrected_out_1.view(B, T2, H, d)
+
+        reconstructed = torch.cat([out_0, corrected_out_1], dim=1)
+
+        torch.testing.assert_close(reconstructed, ref_out, rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(corrected_h_1, ref_h, rtol=1e-4, atol=1e-4)
+
+    def test_2k_vs_2x1k_with_batch(self):
+        """Same split test with batch_size > 1."""
+        torch.manual_seed(123)
+        B, T, H, d, n = 3, 512, 2, 8, 4
+        n_groups = 1
+        dt_bias = torch.randn(H) * 0.05
+
+        x = torch.randn(B, T, H, d)
+        dt = torch.randn(B, T, H) * 0.1
+        A = -torch.rand(H).abs() - 0.01
+        B_ssm = torch.randn(B, T, n_groups, n) * 0.1
+        C_ssm = torch.randn(B, T, n_groups, n) * 0.1
+
+        ref_out, ref_h = _reference_ssm_scan(
+            x, dt, A, B_ssm, C_ssm, dt_bias=dt_bias, dt_softplus=True
+        )
+
+        T2 = T // 2
+
+        out_0, h_0 = _reference_ssm_scan(
+            x[:, :T2],
+            dt[:, :T2],
+            A,
+            B_ssm[:, :T2],
+            C_ssm[:, :T2],
+            dt_bias=dt_bias,
+            dt_softplus=True,
+        )
+        out_1, h_1 = _reference_ssm_scan(
+            x[:, T2:],
+            dt[:, T2:],
+            A,
+            B_ssm[:, T2:],
+            C_ssm[:, T2:],
+            dt_bias=dt_bias,
+            dt_softplus=True,
+        )
+
+        dt_eff_1 = F.softplus(dt[:, T2:] + dt_bias[None, None, :])
+        cum_A_1 = torch.cumsum(A[None, None, :] * dt_eff_1, dim=1)
+
+        corrected_out_1, corrected_h_1 = mamba2_cp_correction(
+            out_1.view(B, T2, H * d),
+            h_1,
+            C_ssm[:, T2:],
+            cum_A_1,
+            h_0,
+            num_heads=H,
+            head_dim=d,
+        )
+
+        reconstructed = torch.cat([out_0, corrected_out_1.view(B, T2, H, d)], dim=1)
+
+        torch.testing.assert_close(reconstructed, ref_out, rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(corrected_h_1, ref_h, rtol=1e-4, atol=1e-4)
+
+    def test_4_way_split(self):
+        """4-rank CP: split 1024 tokens into 4×256 chunks with sequential correction."""
+        torch.manual_seed(99)
+        B, T, H, d, n = 1, 1024, 2, 8, 4
+        n_groups = 1
+        n_ranks = 4
+        chunk = T // n_ranks
+        dt_bias = torch.randn(H) * 0.05
+
+        x = torch.randn(B, T, H, d)
+        dt = torch.randn(B, T, H) * 0.1
+        A = -torch.rand(H).abs() - 0.01
+        B_ssm = torch.randn(B, T, n_groups, n) * 0.1
+        C_ssm = torch.randn(B, T, n_groups, n) * 0.1
+
+        ref_out, ref_h = _reference_ssm_scan(
+            x, dt, A, B_ssm, C_ssm, dt_bias=dt_bias, dt_softplus=True
+        )
+
+        all_outs = []
+        h_prev = torch.zeros(B, H, d, n)
+
+        for rank in range(n_ranks):
+            s, e = rank * chunk, (rank + 1) * chunk
+            out_r, h_r = _reference_ssm_scan(
+                x[:, s:e],
+                dt[:, s:e],
+                A,
+                B_ssm[:, s:e],
+                C_ssm[:, s:e],
+                dt_bias=dt_bias,
+                dt_softplus=True,
+            )
+
+            dt_eff_r = F.softplus(dt[:, s:e] + dt_bias[None, None, :])
+            cum_A_r = torch.cumsum(A[None, None, :] * dt_eff_r, dim=1)
+
+            corrected_out_r, corrected_h_r = mamba2_cp_correction(
+                out_r.view(B, chunk, H * d),
+                h_r,
+                C_ssm[:, s:e],
+                cum_A_r,
+                h_prev,
+                num_heads=H,
+                head_dim=d,
+            )
+
+            all_outs.append(corrected_out_r.view(B, chunk, H, d))
+            h_prev = corrected_h_r
+
+        reconstructed = torch.cat(all_outs, dim=1)
+
+        torch.testing.assert_close(reconstructed, ref_out, rtol=1e-3, atol=1e-3)
+        torch.testing.assert_close(h_prev, ref_h, rtol=1e-3, atol=1e-3)
+
+
+class TestWrapMambaScanForCp:
+    """Tests for wrap_mamba_scan_for_cp wrapper."""
+
+    @staticmethod
+    def _make_module_with_scan(scan_fn):
+        """Create a module namespace with a mamba_chunk_scan_combined attribute."""
+        mod = types.ModuleType("fake_mamba_module")
+        mod.mamba_chunk_scan_combined = scan_fn
+        return mod
+
+    def test_passthrough_when_cp_inactive(self):
+        """When CP is not active, wrapper should return original result unchanged."""
+        B, T, H, d, n = 1, 8, 4, 16, 8
+        x = torch.randn(B, T, H, d)
+        dt = torch.randn(B, T, H)
+        A = -torch.rand(H)
+        B_arg = torch.randn(B, T, 2, n)
+        C_arg = torch.randn(B, T, 2, n)
+        expected_out = torch.randn(B, T, H, d)
+        expected_state = torch.randn(B, H, d, n)
+
+        def fake_scan(*args, **kwargs):
+            return expected_out, expected_state
+
+        mod = self._make_module_with_scan(fake_scan)
+
+        with patch(
+            "axolotl.monkeypatch.models.mamba_utils.is_cp_active", return_value=False
+        ):
+            wrap_mamba_scan_for_cp(mod)
+            out, state = mod.mamba_chunk_scan_combined(
+                x,
+                dt,
+                A,
+                B_arg,
+                C_arg,
+                chunk_size=64,
+                return_final_states=True,
+                dt_bias=None,
+                dt_softplus=False,
+            )
+
+        torch.testing.assert_close(out, expected_out)
+        torch.testing.assert_close(state, expected_state)
+
+    def test_forces_return_final_states_when_cp_active(self):
+        """When CP is active, wrapper must set return_final_states=True."""
+        B, T, H, d, n = 1, 4, 2, 8, 4
+        captured_kwargs = {}
+
+        def fake_scan(*args, **kwargs):
+            captured_kwargs.update(kwargs)
+            scan_out = torch.zeros(B, T, H, d)
+            ssm_state = torch.zeros(B, H, d, n)
+            return scan_out, ssm_state
+
+        mod = self._make_module_with_scan(fake_scan)
+
+        with (
+            patch(
+                "axolotl.monkeypatch.models.mamba_utils.is_cp_active", return_value=True
+            ),
+            patch(
+                "axolotl.monkeypatch.models.mamba_utils.ring_shift_ssm_state",
+                side_effect=lambda h: torch.zeros_like(h),
+            ),
+        ):
+            wrap_mamba_scan_for_cp(mod)
+            mod.mamba_chunk_scan_combined(
+                torch.zeros(B, T, H, d),
+                torch.zeros(B, T, H),
+                -torch.ones(H),
+                torch.zeros(B, T, 1, n),
+                torch.zeros(B, T, 1, n),
+                chunk_size=64,
+                return_final_states=False,
+                dt_bias=None,
+                dt_softplus=False,
+            )
+
+        assert captured_kwargs["return_final_states"] is True
+
+    def test_idempotency_guard(self):
+        """Calling wrap_mamba_scan_for_cp twice must not double-wrap."""
+        call_count = 0
+
+        def fake_scan(*args, **kwargs):
+            nonlocal call_count
+            call_count += 1
+            B, T, H, d, n = 1, 4, 2, 8, 4
+            return torch.zeros(B, T, H, d), torch.zeros(B, H, d, n)
+
+        mod = self._make_module_with_scan(fake_scan)
+
+        with patch(
+            "axolotl.monkeypatch.models.mamba_utils.is_cp_active", return_value=False
+        ):
+            wrap_mamba_scan_for_cp(mod)
+            first_fn = mod.mamba_chunk_scan_combined
+            wrap_mamba_scan_for_cp(mod)
+            assert mod.mamba_chunk_scan_combined is first_fn
+            assert getattr(mod, "_cp_scan_wrapped", False) is True
diff --git a/tests/monkeypatch/test_qwen3_5_fused_attn.py b/tests/monkeypatch/test_qwen3_5_fused_attn.py
new file mode 100644
index 0000000000..e258222e67
--- /dev/null
+++ b/tests/monkeypatch/test_qwen3_5_fused_attn.py
@@ -0,0 +1,424 @@
+"""Tests for the Qwen3.5 / Qwen3.5-MoE fused-attention monkeypatch."""
+
+import pytest
+import torch
+
+pytestmark = [
+    pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required"),
+]
+
+pytest.importorskip("transformers.models.qwen3_5")
+pytest.importorskip("transformers.models.qwen3_5_moe")
+
+
+def _clear_patched_flag(cls):
+    try:
+        delattr(cls, "_axolotl_fused_attn_patched")
+    except AttributeError:
+        pass
+
+
+@pytest.fixture
+def restore_qwen3_5_attention():
+    from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5Attention
+
+    saved = Qwen3_5Attention.forward
+    saved_flag = getattr(Qwen3_5Attention, "_axolotl_fused_attn_patched", False)
+    yield Qwen3_5Attention
+    Qwen3_5Attention.forward = saved
+    if saved_flag:
+        Qwen3_5Attention._axolotl_fused_attn_patched = saved_flag
+    else:
+        _clear_patched_flag(Qwen3_5Attention)
+
+
+def _build_qwen3_5_text_model(seed: int = 0):
+    from transformers.models.qwen3_5.configuration_qwen3_5 import Qwen3_5TextConfig
+    from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5TextModel
+
+    torch.manual_seed(seed)
+    cfg = Qwen3_5TextConfig(
+        vocab_size=128,
+        hidden_size=128,
+        intermediate_size=128,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        head_dim=32,
+        max_position_embeddings=512,
+        rms_norm_eps=1e-6,
+        attention_dropout=0.0,
+        layer_types=["full_attention", "full_attention"],
+    )
+    cfg._attn_implementation = "sdpa"
+    return Qwen3_5TextModel(cfg).cuda().to(torch.bfloat16).eval()
+
+
+def _run_attention(model, layer_idx, hidden_states, position_ids):
+    attn = model.layers[layer_idx].self_attn
+    cos, sin = model.rotary_emb(hidden_states, position_ids)
+    out, _ = attn(
+        hidden_states=hidden_states,
+        position_embeddings=(cos, sin),
+        attention_mask=None,
+    )
+    return out
+
+
+class TestQwen3_5FusedAttnParity:
+    """Single-layer parity vs stock."""
+
+    @pytest.mark.parametrize("layer_idx", [0, 1])
+    def test_forward_matches_stock(self, restore_qwen3_5_attention, layer_idx):
+        from axolotl.monkeypatch.models.qwen3_5.fused_attn import (
+            patch_qwen3_5_fused_attn,
+        )
+
+        m = _build_qwen3_5_text_model(seed=1)
+        hs = torch.randn(2, 16, 128, device="cuda", dtype=torch.bfloat16)
+        pos = torch.arange(16, device="cuda").unsqueeze(0).expand(2, -1)
+
+        with torch.no_grad():
+            ref = _run_attention(m, layer_idx, hs, pos)
+
+        patch_qwen3_5_fused_attn()
+        with torch.no_grad():
+            got = _run_attention(m, layer_idx, hs, pos)
+
+        assert got.shape == ref.shape
+        assert torch.isfinite(got).all()
+        cos_sim = torch.nn.functional.cosine_similarity(
+            ref.flatten().float(), got.flatten().float(), dim=0
+        )
+        assert cos_sim > 0.999, (
+            f"layer {layer_idx} fused vs stock cosine_sim={cos_sim:.6f}"
+        )
+        torch.testing.assert_close(got, ref, rtol=5e-2, atol=5e-2)
+
+
+class TestQwen3_5FusedAttnBackward:
+    def test_q_k_norm_grads_finite_nonzero(self, restore_qwen3_5_attention):
+        from axolotl.monkeypatch.models.qwen3_5.fused_attn import (
+            patch_qwen3_5_fused_attn,
+        )
+
+        m = _build_qwen3_5_text_model(seed=3).train()
+        patch_qwen3_5_fused_attn()
+
+        ids = torch.randint(0, 128, (2, 16), device="cuda")
+        mask = torch.ones(2, 16, dtype=torch.long, device="cuda")
+        out = m(input_ids=ids, attention_mask=mask, use_cache=False).last_hidden_state
+        out.sum().backward()
+
+        for i, layer in enumerate(m.layers[:2]):
+            if m.config.layer_types[i] != "full_attention":
+                continue
+            attn = layer.self_attn
+            assert attn.q_norm.weight.grad is not None, f"layer {i} q_norm no grad"
+            assert attn.k_norm.weight.grad is not None, f"layer {i} k_norm no grad"
+            assert attn.q_norm.weight.grad.isfinite().all()
+            assert attn.k_norm.weight.grad.isfinite().all()
+            assert attn.q_norm.weight.grad.abs().sum() > 0
+            assert attn.k_norm.weight.grad.abs().sum() > 0
+
+
+class TestQwen3_5FusedAttnLoRACompose:
+    """Pin LoRA-QKV → fused composition; ``QKV_PATCHES`` includes a chunk-2 variant for Qwen3.5's ``q_proj * 2``."""
+
+    def _build_cfg(self):
+        from axolotl.utils.dict import DictDefault
+
+        return DictDefault(
+            {
+                "base_model": "fake/qwen3_5",
+                "lora_qkv_kernel": True,
+                "lora_o_kernel": True,
+                "lora_dropout": 0.0,
+            }
+        )
+
+    def test_lora_qkv_then_fused_does_not_raise(
+        self, restore_qwen3_5_attention, monkeypatch
+    ):
+        from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5Attention
+
+        from axolotl.monkeypatch import lora_kernels
+        from axolotl.monkeypatch.models.qwen3_5.fused_attn import (
+            patch_qwen3_5_fused_attn,
+        )
+
+        monkeypatch.setattr(
+            lora_kernels,
+            "get_attention_cls_from_config",
+            lambda _cfg: Qwen3_5Attention,
+        )
+
+        try:
+            delattr(Qwen3_5Attention, "_original_forward")
+        except AttributeError:
+            pass
+
+        try:
+            lora_kernels.patch_self_attn_lora(self._build_cfg())
+            assert hasattr(Qwen3_5Attention, "_original_forward"), (
+                "patch_self_attn_lora must capture the stock Qwen3.5 forward — if "
+                "this fails, QKV_PATCHES drifted away from the chunk-2 q_proj source"
+            )
+            patch_qwen3_5_fused_attn()
+            assert getattr(Qwen3_5Attention, "_axolotl_fused_attn_patched", False)
+        finally:
+            try:
+                delattr(Qwen3_5Attention, "_original_forward")
+            except AttributeError:
+                pass
+
+
+@pytest.fixture
+def restore_qwen3_5_moe_attention():
+    from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import (
+        Qwen3_5MoeAttention,
+    )
+
+    saved = Qwen3_5MoeAttention.forward
+    saved_flag = getattr(Qwen3_5MoeAttention, "_axolotl_fused_attn_patched", False)
+    yield Qwen3_5MoeAttention
+    Qwen3_5MoeAttention.forward = saved
+    if saved_flag:
+        Qwen3_5MoeAttention._axolotl_fused_attn_patched = saved_flag
+    else:
+        _clear_patched_flag(Qwen3_5MoeAttention)
+
+
+def _build_qwen3_5_moe_model(seed: int = 0):
+    from transformers.models.qwen3_5_moe.configuration_qwen3_5_moe import (
+        Qwen3_5MoeTextConfig,
+    )
+    from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import (
+        Qwen3_5MoeTextModel,
+    )
+
+    torch.manual_seed(seed)
+    cfg = Qwen3_5MoeTextConfig(
+        vocab_size=128,
+        hidden_size=128,
+        intermediate_size=128,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        head_dim=32,
+        max_position_embeddings=512,
+        rms_norm_eps=1e-6,
+        attention_dropout=0.0,
+        layer_types=["full_attention", "full_attention"],
+        num_experts=4,
+        num_experts_per_tok=2,
+        moe_intermediate_size=64,
+        shared_expert_intermediate_size=64,
+    )
+    cfg._attn_implementation = "sdpa"
+    return Qwen3_5MoeTextModel(cfg).cuda().to(torch.bfloat16).eval()
+
+
+class TestQwen3_5MoeFusedAttnParity:
+    """End-to-end parity on the MoE variant (attention is structurally identical to dense Qwen3.5)."""
+
+    def test_forward_matches_stock(self, restore_qwen3_5_moe_attention):
+        from axolotl.monkeypatch.models.qwen3_5_moe.fused_attn import (
+            patch_qwen3_5_moe_fused_attn,
+        )
+
+        m = _build_qwen3_5_moe_model(seed=5)
+        ids = torch.randint(0, 128, (2, 16), device="cuda")
+        mask = torch.ones(2, 16, dtype=torch.long, device="cuda")
+
+        with torch.no_grad():
+            ref = m(
+                input_ids=ids, attention_mask=mask, use_cache=False
+            ).last_hidden_state.clone()
+
+        patch_qwen3_5_moe_fused_attn()
+        with torch.no_grad():
+            got = m(
+                input_ids=ids, attention_mask=mask, use_cache=False
+            ).last_hidden_state.clone()
+
+        assert got.shape == ref.shape
+        assert torch.isfinite(got).all()
+        cos_sim = torch.nn.functional.cosine_similarity(
+            ref.flatten().float(), got.flatten().float(), dim=0
+        )
+        assert cos_sim > 0.999, f"qwen3_5_moe end-to-end cosine_sim={cos_sim:.6f}"
+
+
+class TestQwen3_5MoeFusedAttnBackward:
+    """Backward grad flow through the fused Q/K-norm kernels on Qwen3.5-MoE."""
+
+    def test_q_k_norm_grads_finite_nonzero(self, restore_qwen3_5_moe_attention):
+        from axolotl.monkeypatch.models.qwen3_5_moe.fused_attn import (
+            patch_qwen3_5_moe_fused_attn,
+        )
+
+        m = _build_qwen3_5_moe_model(seed=6).train()
+        patch_qwen3_5_moe_fused_attn()
+
+        ids = torch.randint(0, 128, (2, 16), device="cuda")
+        mask = torch.ones(2, 16, dtype=torch.long, device="cuda")
+        out = m(input_ids=ids, attention_mask=mask, use_cache=False).last_hidden_state
+        out.sum().backward()
+
+        for i, layer in enumerate(m.layers[:2]):
+            if m.config.layer_types[i] != "full_attention":
+                continue
+            attn = layer.self_attn
+            assert attn.q_norm.weight.grad is not None, f"layer {i} q_norm no grad"
+            assert attn.k_norm.weight.grad is not None, f"layer {i} k_norm no grad"
+            assert attn.q_norm.weight.grad.isfinite().all()
+            assert attn.k_norm.weight.grad.isfinite().all()
+            assert attn.q_norm.weight.grad.abs().sum() > 0
+            assert attn.k_norm.weight.grad.abs().sum() > 0
+
+
+class TestQwen3_5MoeFusedAttnLoRACompose:
+    """MoE mirror of the Qwen3.5 LoRA-compose test."""
+
+    def _build_cfg(self):
+        from axolotl.utils.dict import DictDefault
+
+        return DictDefault(
+            {
+                "base_model": "fake/qwen3_5_moe",
+                "lora_qkv_kernel": True,
+                "lora_o_kernel": True,
+                "lora_dropout": 0.0,
+            }
+        )
+
+    def test_lora_qkv_then_fused_does_not_raise(
+        self, restore_qwen3_5_moe_attention, monkeypatch
+    ):
+        from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import (
+            Qwen3_5MoeAttention,
+        )
+
+        from axolotl.monkeypatch import lora_kernels
+        from axolotl.monkeypatch.models.qwen3_5_moe.fused_attn import (
+            patch_qwen3_5_moe_fused_attn,
+        )
+
+        monkeypatch.setattr(
+            lora_kernels,
+            "get_attention_cls_from_config",
+            lambda _cfg: Qwen3_5MoeAttention,
+        )
+
+        try:
+            delattr(Qwen3_5MoeAttention, "_original_forward")
+        except AttributeError:
+            pass
+
+        try:
+            lora_kernels.patch_self_attn_lora(self._build_cfg())
+            assert hasattr(Qwen3_5MoeAttention, "_original_forward")
+            patch_qwen3_5_moe_fused_attn()
+            assert getattr(Qwen3_5MoeAttention, "_axolotl_fused_attn_patched", False)
+        finally:
+            try:
+                delattr(Qwen3_5MoeAttention, "_original_forward")
+            except AttributeError:
+                pass
+
+
+class TestQwen3_5FusedAttnLigerRMSNormCompose:
+    """Liger swaps ``Qwen3_5RMSNorm`` for a subclass that exposes ``variance_epsilon`` instead of ``eps``."""
+
+    def test_forward_survives_liger_rmsnorm_swap(self, restore_qwen3_5_attention):
+        from axolotl.monkeypatch.models.qwen3_5.fused_attn import (
+            patch_qwen3_5_fused_attn,
+        )
+
+        m = _build_qwen3_5_text_model(seed=9)
+
+        class _StubRMSNormVarEps(torch.nn.Module):
+            def __init__(self, original):
+                super().__init__()
+                self.weight = original.weight
+                self.variance_epsilon = original.eps
+
+            def forward(self, x):
+                return x
+
+        for layer in m.layers:
+            if not hasattr(layer, "self_attn"):
+                continue
+            attn = layer.self_attn
+            attn.q_norm = _StubRMSNormVarEps(attn.q_norm)
+            attn.k_norm = _StubRMSNormVarEps(attn.k_norm)
+
+        patch_qwen3_5_fused_attn()
+        ids = torch.randint(0, 128, (2, 16), device="cuda")
+        mask = torch.ones(2, 16, dtype=torch.long, device="cuda")
+        with torch.no_grad():
+            out = m(input_ids=ids, attention_mask=mask, use_cache=False)
+        assert torch.isfinite(out.last_hidden_state).all()
+
+
+class TestPatchManagerQwen3_5TextDispatch:
+    """Pin that ``_apply_model_specific_patches`` covers the ``*_text`` config types of multimodal Qwen3.5 / Qwen3.5-MoE checkpoints."""
+
+    @pytest.mark.parametrize("model_config_type", ["qwen3_5", "qwen3_5_text"])
+    def test_qwen3_5_text_variant_is_patched(
+        self, restore_qwen3_5_attention, model_config_type
+    ):
+        from axolotl.loaders.patch_manager import PatchManager
+        from axolotl.utils.dict import DictDefault
+
+        cfg = DictDefault(
+            {
+                "base_model": "fake/qwen3_5",
+                "model_config_type": model_config_type,
+                "fused_attn_kernel": True,
+                "lora_qkv_kernel": False,
+                "lora_o_kernel": False,
+                "context_parallel_size": 1,
+            }
+        )
+        mc = type("MC", (), {"model_type": model_config_type})()
+        pm = PatchManager(cfg=cfg, model_config=mc, inference=False)
+        pm._apply_model_specific_patches()
+
+        from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5Attention
+
+        assert getattr(Qwen3_5Attention, "_axolotl_fused_attn_patched", False), (
+            f"PatchManager skipped fused-attn for model_config_type="
+            f"{model_config_type!r}; dispatch is missing the _text variant"
+        )
+
+    @pytest.mark.parametrize("model_config_type", ["qwen3_5_moe", "qwen3_5_moe_text"])
+    def test_qwen3_5_moe_text_variant_is_patched(
+        self, restore_qwen3_5_moe_attention, model_config_type
+    ):
+        from axolotl.loaders.patch_manager import PatchManager
+        from axolotl.utils.dict import DictDefault
+
+        cfg = DictDefault(
+            {
+                "base_model": "fake/qwen3_5_moe",
+                "model_config_type": model_config_type,
+                "fused_attn_kernel": True,
+                "lora_qkv_kernel": False,
+                "lora_o_kernel": False,
+                "context_parallel_size": 1,
+            }
+        )
+        mc = type("MC", (), {"model_type": model_config_type})()
+        pm = PatchManager(cfg=cfg, model_config=mc, inference=False)
+        pm._apply_model_specific_patches()
+
+        from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import (
+            Qwen3_5MoeAttention,
+        )
+
+        assert getattr(Qwen3_5MoeAttention, "_axolotl_fused_attn_patched", False), (
+            f"PatchManager skipped fused-attn for model_config_type="
+            f"{model_config_type!r}; dispatch is missing the _text variant"
+        )
diff --git a/tests/monkeypatch/test_qwen3_fused_attn.py b/tests/monkeypatch/test_qwen3_fused_attn.py
new file mode 100644
index 0000000000..7056eb2939
--- /dev/null
+++ b/tests/monkeypatch/test_qwen3_fused_attn.py
@@ -0,0 +1,283 @@
+"""Tests for the Qwen3 / Qwen3-MoE fused-attention monkeypatch."""
+
+import pytest
+import torch
+
+pytestmark = [
+    pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required"),
+]
+
+pytest.importorskip("transformers.models.qwen3")
+pytest.importorskip("transformers.models.qwen3_moe")
+
+
+def _clear_patched_flag(cls):
+    try:
+        delattr(cls, "_axolotl_fused_attn_patched")
+    except AttributeError:
+        pass
+
+
+@pytest.fixture
+def restore_qwen3_attention():
+    from transformers.models.qwen3.modeling_qwen3 import Qwen3Attention
+
+    saved = Qwen3Attention.forward
+    saved_flag = getattr(Qwen3Attention, "_axolotl_fused_attn_patched", False)
+    yield Qwen3Attention
+    Qwen3Attention.forward = saved
+    if saved_flag:
+        Qwen3Attention._axolotl_fused_attn_patched = saved_flag
+    else:
+        _clear_patched_flag(Qwen3Attention)
+
+
+@pytest.fixture
+def restore_qwen3_moe_attention():
+    from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeAttention
+
+    saved = Qwen3MoeAttention.forward
+    saved_flag = getattr(Qwen3MoeAttention, "_axolotl_fused_attn_patched", False)
+    yield Qwen3MoeAttention
+    Qwen3MoeAttention.forward = saved
+    if saved_flag:
+        Qwen3MoeAttention._axolotl_fused_attn_patched = saved_flag
+    else:
+        _clear_patched_flag(Qwen3MoeAttention)
+
+
+def _build_qwen3_model(seed: int = 0):
+    from transformers.models.qwen3.configuration_qwen3 import Qwen3Config
+    from transformers.models.qwen3.modeling_qwen3 import Qwen3Model
+
+    torch.manual_seed(seed)
+    cfg = Qwen3Config(
+        vocab_size=128,
+        hidden_size=64,
+        intermediate_size=128,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        head_dim=16,
+        max_position_embeddings=2048,
+        rms_norm_eps=1e-6,
+        attention_dropout=0.0,
+    )
+    cfg._attn_implementation = "sdpa"
+    return Qwen3Model(cfg).cuda().to(torch.bfloat16).eval()
+
+
+def _build_qwen3_moe_model(seed: int = 0):
+    from transformers.models.qwen3_moe.configuration_qwen3_moe import Qwen3MoeConfig
+    from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeModel
+
+    torch.manual_seed(seed)
+    cfg = Qwen3MoeConfig(
+        vocab_size=128,
+        hidden_size=64,
+        intermediate_size=128,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        head_dim=16,
+        max_position_embeddings=512,
+        rms_norm_eps=1e-6,
+        attention_dropout=0.0,
+        num_experts=4,
+        num_experts_per_tok=2,
+        moe_intermediate_size=64,
+    )
+    cfg._attn_implementation = "sdpa"
+    return Qwen3MoeModel(cfg).cuda().to(torch.bfloat16).eval()
+
+
+def _run_attention(model, layer_idx, hidden_states, position_ids):
+    attn = model.layers[layer_idx].self_attn
+    cos, sin = model.rotary_emb(hidden_states, position_ids)
+    out, _ = attn(
+        hidden_states=hidden_states,
+        position_embeddings=(cos, sin),
+        attention_mask=None,
+    )
+    return out
+
+
+class TestQwen3FusedAttnParity:
+    """Single-layer parity vs stock."""
+
+    @pytest.mark.parametrize("layer_idx", [0, 1])
+    def test_forward_matches_stock(self, restore_qwen3_attention, layer_idx):
+        from axolotl.monkeypatch.models.qwen3.fused_attn import patch_qwen3_fused_attn
+
+        m = _build_qwen3_model(seed=1)
+        hs = torch.randn(2, 16, 64, device="cuda", dtype=torch.bfloat16)
+        pos = torch.arange(16, device="cuda").unsqueeze(0).expand(2, -1)
+
+        with torch.no_grad():
+            ref = _run_attention(m, layer_idx, hs, pos)
+
+        patch_qwen3_fused_attn()
+        with torch.no_grad():
+            got = _run_attention(m, layer_idx, hs, pos)
+
+        assert got.shape == ref.shape
+        assert torch.isfinite(got).all()
+        cos_sim = torch.nn.functional.cosine_similarity(
+            ref.flatten().float(), got.flatten().float(), dim=0
+        )
+        assert cos_sim > 0.999, (
+            f"layer {layer_idx} fused vs stock cosine_sim={cos_sim:.6f}"
+        )
+        torch.testing.assert_close(got, ref, rtol=5e-2, atol=5e-2)
+
+
+class TestQwen3FusedAttnEndToEnd:
+    def test_full_forward_matches_stock(self, restore_qwen3_attention):
+        from axolotl.monkeypatch.models.qwen3.fused_attn import patch_qwen3_fused_attn
+
+        m = _build_qwen3_model(seed=2)
+        ids = torch.randint(0, 128, (2, 32), device="cuda")
+        mask = torch.ones(2, 32, dtype=torch.long, device="cuda")
+
+        with torch.no_grad():
+            ref = m(input_ids=ids, attention_mask=mask).last_hidden_state.clone()
+
+        patch_qwen3_fused_attn()
+        with torch.no_grad():
+            got = m(input_ids=ids, attention_mask=mask).last_hidden_state.clone()
+
+        assert got.shape == ref.shape
+        assert torch.isfinite(got).all()
+        cos_sim = torch.nn.functional.cosine_similarity(
+            ref.flatten().float(), got.flatten().float(), dim=0
+        )
+        assert cos_sim > 0.999, f"end-to-end cosine_sim={cos_sim:.6f}"
+
+    def test_backward_grad_flows_through_fused_path(self, restore_qwen3_attention):
+        from axolotl.monkeypatch.models.qwen3.fused_attn import patch_qwen3_fused_attn
+
+        m = _build_qwen3_model(seed=3).train()
+        patch_qwen3_fused_attn()
+
+        ids = torch.randint(0, 128, (2, 16), device="cuda")
+        mask = torch.ones(2, 16, dtype=torch.long, device="cuda")
+        out = m(input_ids=ids, attention_mask=mask).last_hidden_state
+        out.sum().backward()
+
+        for i, layer in enumerate(m.layers[:2]):
+            attn = layer.self_attn
+            assert attn.q_norm.weight.grad is not None, f"layer {i} q_norm no grad"
+            assert attn.k_norm.weight.grad is not None, f"layer {i} k_norm no grad"
+            assert attn.q_norm.weight.grad.isfinite().all()
+            assert attn.k_norm.weight.grad.isfinite().all()
+            assert attn.q_norm.weight.grad.abs().sum() > 0
+            assert attn.k_norm.weight.grad.abs().sum() > 0
+
+
+class TestQwen3FusedAttnLoRACompose:
+    """Pin that LoRA-QKV runs before the fused patch (``inspect.getsource`` regex misses on the fused body)."""
+
+    def test_lora_qkv_then_fused_does_not_raise(self, restore_qwen3_attention):
+        from axolotl.monkeypatch.lora_kernels import patch_self_attn_lora
+        from axolotl.monkeypatch.models.qwen3.fused_attn import patch_qwen3_fused_attn
+        from axolotl.utils.dict import DictDefault
+
+        cfg = DictDefault(
+            {
+                "base_model": "Qwen/Qwen3-0.6B",
+                "lora_qkv_kernel": True,
+                "lora_o_kernel": True,
+                "lora_dropout": 0.0,
+            }
+        )
+
+        from transformers.models.qwen3.modeling_qwen3 import Qwen3Attention
+
+        try:
+            delattr(Qwen3Attention, "_original_forward")
+        except AttributeError:
+            pass
+
+        try:
+            patch_self_attn_lora(cfg)
+            assert hasattr(Qwen3Attention, "_original_forward"), (
+                "patch_self_attn_lora must run on stock source first"
+            )
+            patch_qwen3_fused_attn()
+            assert getattr(Qwen3Attention, "_axolotl_fused_attn_patched", False)
+        finally:
+            try:
+                delattr(Qwen3Attention, "_original_forward")
+            except AttributeError:
+                pass
+
+    def test_reverse_order_breaks(self, restore_qwen3_attention):
+        from axolotl.monkeypatch.lora_kernels import patch_self_attn_lora
+        from axolotl.monkeypatch.models.qwen3.fused_attn import patch_qwen3_fused_attn
+        from axolotl.utils.dict import DictDefault
+
+        cfg = DictDefault(
+            {
+                "base_model": "Qwen/Qwen3-0.6B",
+                "lora_qkv_kernel": True,
+                "lora_o_kernel": True,
+                "lora_dropout": 0.0,
+            }
+        )
+
+        from transformers.models.qwen3.modeling_qwen3 import Qwen3Attention
+
+        try:
+            delattr(Qwen3Attention, "_original_forward")
+        except AttributeError:
+            pass
+
+        patch_qwen3_fused_attn()
+        with pytest.raises(AssertionError, match="Original QKV code not found"):
+            patch_self_attn_lora(cfg)
+
+
+class TestPatchManagerOrdering:
+    """Pin the patch-manager ordering invariant."""
+
+    def test_self_attn_lora_runs_before_model_specific(self):
+        import inspect
+
+        from axolotl.loaders.patch_manager import PatchManager
+
+        src = inspect.getsource(PatchManager.apply_pre_model_load_patches)
+        lora_idx = src.find("_apply_self_attention_lora_patch()")
+        specific_idx = src.find("_apply_model_specific_patches()")
+        assert lora_idx > 0 and specific_idx > 0
+        assert lora_idx < specific_idx, (
+            "_apply_self_attention_lora_patch must run before "
+            "_apply_model_specific_patches so patch_self_attn_lora sees the "
+            "stock attention forward source"
+        )
+
+
+class TestQwen3MoeFusedAttnParity:
+    """End-to-end parity on the MoE variant (attention is structurally identical to dense Qwen3)."""
+
+    def test_full_forward_matches_stock(self, restore_qwen3_moe_attention):
+        from axolotl.monkeypatch.models.qwen3_moe.fused_attn import (
+            patch_qwen3_moe_fused_attn,
+        )
+
+        m = _build_qwen3_moe_model(seed=4)
+        ids = torch.randint(0, 128, (2, 16), device="cuda")
+        mask = torch.ones(2, 16, dtype=torch.long, device="cuda")
+
+        with torch.no_grad():
+            ref = m(input_ids=ids, attention_mask=mask).last_hidden_state.clone()
+
+        patch_qwen3_moe_fused_attn()
+        with torch.no_grad():
+            got = m(input_ids=ids, attention_mask=mask).last_hidden_state.clone()
+
+        assert got.shape == ref.shape
+        assert torch.isfinite(got).all()
+        cos_sim = torch.nn.functional.cosine_similarity(
+            ref.flatten().float(), got.flatten().float(), dim=0
+        )
+        assert cos_sim > 0.999, f"qwen3-moe end-to-end cosine_sim={cos_sim:.6f}"
diff --git a/tests/monkeypatch/test_qwen3_fused_attn_defensive.py b/tests/monkeypatch/test_qwen3_fused_attn_defensive.py
new file mode 100644
index 0000000000..34b22b8ee3
--- /dev/null
+++ b/tests/monkeypatch/test_qwen3_fused_attn_defensive.py
@@ -0,0 +1,375 @@
+"""Defensive regression tests for edge cases of the fused-attn patches."""
+
+import inspect
+import logging
+
+import pytest
+import torch
+
+pytestmark = [
+    pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required"),
+]
+
+pytest.importorskip("transformers.models.qwen3")
+
+
+def _clear_patched_flag(cls):
+    try:
+        delattr(cls, "_axolotl_fused_attn_patched")
+    except AttributeError:
+        pass
+
+
+@pytest.fixture
+def restore_qwen3_attention():
+    from transformers.models.qwen3.modeling_qwen3 import Qwen3Attention
+
+    saved = Qwen3Attention.forward
+    saved_flag = getattr(Qwen3Attention, "_axolotl_fused_attn_patched", False)
+    yield Qwen3Attention
+    Qwen3Attention.forward = saved
+    if saved_flag:
+        Qwen3Attention._axolotl_fused_attn_patched = saved_flag
+    else:
+        _clear_patched_flag(Qwen3Attention)
+
+
+@pytest.fixture
+def patch_manager_caplog(caplog):
+    logger = logging.getLogger("axolotl.loaders.patch_manager")
+    logger.addHandler(caplog.handler)
+    previous_level = logger.level
+    logger.setLevel(logging.DEBUG)
+    try:
+        yield caplog
+    finally:
+        logger.removeHandler(caplog.handler)
+        logger.setLevel(previous_level)
+
+
+class TestFusedAttnKernelUnsupportedWarning:
+    """The warning lives in ``PatchManager`` (not the schema validator) so it
+    runs after ``normalize_config()`` has derived ``model_config_type``. A
+    normal CLI flow with ``fused_attn_kernel: true`` on e.g. a Llama config
+    must warn loudly instead of silently no-op'ing."""
+
+    def test_warns_on_unsupported_model_type(self, patch_manager_caplog):
+        from types import SimpleNamespace
+
+        from axolotl.loaders.patch_manager import PatchManager
+
+        cfg = SimpleNamespace(fused_attn_kernel=True, model_config_type="llama")
+        PatchManager._warn_if_fused_attn_unsupported(cfg)
+        assert (
+            "fused_attn_kernel" in patch_manager_caplog.text
+            and "llama" in patch_manager_caplog.text
+        ), f"expected warning about llama; got {patch_manager_caplog.text}"
+
+    @pytest.mark.parametrize(
+        "model_type",
+        [
+            "qwen3",
+            "qwen3_moe",
+            "qwen3_vl",
+            "qwen3_vl_text",
+            "qwen3_5",
+            "qwen3_5_text",
+            "qwen3_5_moe",
+            "qwen3_5_moe_text",
+            "gemma4",
+            "gemma4_text",
+        ],
+    )
+    def test_no_warn_on_supported_model_type(self, patch_manager_caplog, model_type):
+        from types import SimpleNamespace
+
+        from axolotl.loaders.patch_manager import PatchManager
+
+        cfg = SimpleNamespace(fused_attn_kernel=True, model_config_type=model_type)
+        PatchManager._warn_if_fused_attn_unsupported(cfg)
+        assert not any(
+            "fused_attn_kernel" in r.message and model_type in r.message
+            for r in patch_manager_caplog.records
+        ), f"unexpected warning for supported {model_type}"
+
+    def test_no_warn_when_fused_attn_kernel_false(self, patch_manager_caplog):
+        from types import SimpleNamespace
+
+        from axolotl.loaders.patch_manager import PatchManager
+
+        cfg = SimpleNamespace(fused_attn_kernel=False, model_config_type="llama")
+        PatchManager._warn_if_fused_attn_unsupported(cfg)
+        assert not any(
+            "fused_attn_kernel" in r.message for r in patch_manager_caplog.records
+        ), "no warning expected when fused_attn_kernel is False"
+
+    def test_warning_is_invoked_by_apply_model_specific_patches(self):
+        """Source-line check that ``_apply_model_specific_patches`` actually
+        calls ``_warn_if_fused_attn_unsupported``. Without this, the standalone
+        helper passes its unit tests but never runs in practice."""
+        import inspect
+
+        from axolotl.loaders.patch_manager import PatchManager
+
+        src = inspect.getsource(PatchManager._apply_model_specific_patches)
+        assert "_warn_if_fused_attn_unsupported" in src, (
+            "_apply_model_specific_patches no longer invokes "
+            "_warn_if_fused_attn_unsupported — the warning will be dead code"
+        )
+
+
+class TestPeftModulesToSaveWrapper:
+    """``modules_to_save=["q_norm","k_norm"]`` wraps the norms in ``ModulesToSaveWrapper``; the patched forward must resolve through it."""
+
+    def _make_wrapper(self, original):
+        def _make_clone(m):
+            clone = torch.nn.Module()
+            clone.weight = torch.nn.Parameter(m.weight.detach().clone())
+            clone.variance_epsilon = m.variance_epsilon
+            clone.eps = getattr(m, "eps", m.variance_epsilon)
+            return clone
+
+        class _StubWrapper(torch.nn.Module):
+            """Mirrors PEFT ``ModulesToSaveWrapper``: ``_active_adapter`` is a
+            ``list[str]`` and ``active_adapter`` / ``active_adapters`` are
+            properties returning that list."""
+
+            def __init__(self, orig):
+                super().__init__()
+                self.original_module = orig
+                self.modules_to_save = torch.nn.ModuleDict(
+                    {"default": _make_clone(orig)}
+                )
+                self._active_adapter = ["default"]
+
+            @property
+            def active_adapter(self):
+                return self._active_adapter
+
+            @property
+            def active_adapters(self):
+                return self._active_adapter
+
+            def forward(self, x):
+                return self.modules_to_save[self._active_adapter[0]](x)
+
+        return _StubWrapper(original)
+
+    def test_resolve_norm_module_returns_active_adapter_not_original(self):
+        """Direct unit-test of ``_resolve_norm_module``: with PEFT's actual
+        ``active_adapter = ["default"]`` (list), the helper must return the
+        wrapped adapter module, not the frozen ``original_module``. The earlier
+        ``isinstance(adapter, str)`` check silently failed this case."""
+        from transformers.models.qwen3.modeling_qwen3 import Qwen3RMSNorm
+
+        from axolotl.monkeypatch.models.qwen3.fused_attn import (
+            _resolve_norm_module,
+        )
+
+        orig = Qwen3RMSNorm(16, eps=1e-6)
+        wrapper = self._make_wrapper(orig)
+        resolved = _resolve_norm_module(wrapper)
+        assert resolved is wrapper.modules_to_save["default"], (
+            "_resolve_norm_module returned the frozen original instead of the "
+            "active adapter — PEFT stores active_adapter as a list, not a str"
+        )
+
+    def test_resolve_through_real_peft_modules_to_save(self):
+        """End-to-end: build a Qwen3 model, wrap ``q_norm`` / ``k_norm`` with
+        ``peft.get_peft_model(..., modules_to_save=[...])``, set the active
+        adapter's weight to a value distinct from the frozen original, and
+        confirm ``_resolve_norm_module`` returns the active-adapter module
+        (so the fused kernel reads the trainable weight, not the frozen one).
+        This exercises the real PEFT object shape, not a stub."""
+        from peft import LoraConfig, get_peft_model
+        from peft.utils.other import ModulesToSaveWrapper
+        from transformers.models.qwen3.configuration_qwen3 import Qwen3Config
+        from transformers.models.qwen3.modeling_qwen3 import Qwen3ForCausalLM
+
+        from axolotl.monkeypatch.models.qwen3.fused_attn import (
+            _resolve_norm_module,
+        )
+
+        cfg = Qwen3Config(
+            vocab_size=128,
+            hidden_size=64,
+            intermediate_size=128,
+            num_hidden_layers=2,
+            num_attention_heads=4,
+            num_key_value_heads=2,
+            head_dim=16,
+            max_position_embeddings=256,
+            rms_norm_eps=1e-6,
+            attention_dropout=0.0,
+        )
+        base = Qwen3ForCausalLM(cfg)
+        lora = LoraConfig(
+            r=4,
+            lora_alpha=8,
+            target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
+            modules_to_save=["q_norm", "k_norm"],
+            task_type="CAUSAL_LM",
+        )
+        peft_model = get_peft_model(base, lora)
+
+        attn = peft_model.base_model.model.model.layers[0].self_attn
+        assert isinstance(attn.q_norm, ModulesToSaveWrapper), (
+            "PEFT did not wrap q_norm — test premise is invalid"
+        )
+        with torch.no_grad():
+            attn.q_norm.modules_to_save["default"].weight.fill_(7.0)
+            attn.q_norm.original_module.weight.fill_(0.0)
+
+        resolved = _resolve_norm_module(attn.q_norm)
+        assert resolved is attn.q_norm.modules_to_save["default"], (
+            "_resolve_norm_module did not return the active-adapter module — "
+            "real PEFT exposes active_adapter as a list, but the helper "
+            "treated only the str case"
+        )
+        assert torch.equal(
+            resolved.weight.detach(),
+            torch.full_like(resolved.weight, 7.0),
+        ), "resolved module is not the trainable adapter weight"
+
+    def test_qwen3_forward_under_modules_to_save_wrapper(self, restore_qwen3_attention):
+        from transformers.models.qwen3.configuration_qwen3 import Qwen3Config
+        from transformers.models.qwen3.modeling_qwen3 import Qwen3Model
+
+        from axolotl.monkeypatch.models.qwen3.fused_attn import (
+            patch_qwen3_fused_attn,
+        )
+
+        cfg = Qwen3Config(
+            vocab_size=128,
+            hidden_size=64,
+            intermediate_size=128,
+            num_hidden_layers=2,
+            num_attention_heads=4,
+            num_key_value_heads=2,
+            head_dim=16,
+            max_position_embeddings=256,
+            rms_norm_eps=1e-6,
+            attention_dropout=0.0,
+        )
+        cfg._attn_implementation = "sdpa"
+        m = Qwen3Model(cfg).cuda().to(torch.bfloat16)
+
+        for layer in m.layers:
+            attn = layer.self_attn
+            attn.q_norm = self._make_wrapper(attn.q_norm).cuda().to(torch.bfloat16)
+            attn.k_norm = self._make_wrapper(attn.k_norm).cuda().to(torch.bfloat16)
+
+        patch_qwen3_fused_attn()
+        ids = torch.randint(0, 128, (1, 16), device="cuda")
+        mask = torch.ones(1, 16, dtype=torch.long, device="cuda")
+        with torch.no_grad():
+            out = m(input_ids=ids, attention_mask=mask).last_hidden_state
+        assert torch.isfinite(out).all(), (
+            "fused forward through ModulesToSaveWrapper produced non-finite output"
+        )
+
+
+class TestKernelProductionHeadDim:
+    """Kernel parity at ``head_dim=256`` (Qwen3.5 production); unit tests only cover 32/64."""
+
+    @pytest.mark.parametrize("head_dim", [128, 256])
+    @pytest.mark.parametrize("unit_offset", [False, True])
+    def test_fused_rms_norm_rope_parity(self, head_dim, unit_offset):
+        from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope
+
+        B, S, H, D = 2, 32, 4, head_dim
+        torch.manual_seed(11)
+        x = torch.randn(B, S, H, D, device="cuda", dtype=torch.bfloat16)
+        w = torch.randn(D, device="cuda", dtype=torch.bfloat16)
+        cos = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)
+        sin = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)
+        eps = 1e-6
+
+        x32 = x.to(torch.float32)
+        rms = x32.pow(2).mean(-1, keepdim=True).add(eps).rsqrt()
+        scale = (w.to(torch.float32) + 1.0) if unit_offset else w.to(torch.float32)
+        x_norm = x32 * rms * scale
+        half = D // 2
+        rot = torch.cat([-x_norm[..., half:], x_norm[..., :half]], dim=-1)
+        ref = (
+            x_norm * cos.to(torch.float32).unsqueeze(2)
+            + rot * sin.to(torch.float32).unsqueeze(2)
+        ).to(torch.bfloat16)
+
+        got = fused_rms_norm_rope(x, w, cos, sin, eps=eps, unit_offset=unit_offset)
+        assert got.shape == ref.shape
+        assert torch.isfinite(got).all()
+        torch.testing.assert_close(got, ref, rtol=5e-2, atol=5e-2)
+
+
+class TestAttentionMaskPassThrough:
+    """Sample-packing masks must flow through the patched forward verbatim."""
+
+    def test_qwen3_padding_mask_runs_clean(self, restore_qwen3_attention):
+        from transformers.models.qwen3.configuration_qwen3 import Qwen3Config
+        from transformers.models.qwen3.modeling_qwen3 import Qwen3Model
+
+        from axolotl.monkeypatch.models.qwen3.fused_attn import (
+            patch_qwen3_fused_attn,
+        )
+
+        cfg = Qwen3Config(
+            vocab_size=128,
+            hidden_size=64,
+            intermediate_size=128,
+            num_hidden_layers=2,
+            num_attention_heads=4,
+            num_key_value_heads=2,
+            head_dim=16,
+            max_position_embeddings=256,
+            rms_norm_eps=1e-6,
+            attention_dropout=0.0,
+        )
+        cfg._attn_implementation = "sdpa"
+        m = Qwen3Model(cfg).cuda().to(torch.bfloat16)
+        patch_qwen3_fused_attn()
+
+        ids = torch.randint(0, 128, (2, 16), device="cuda")
+        mask = torch.cat([torch.ones(2, 8), torch.zeros(2, 8)], dim=1).long().cuda()
+        with torch.no_grad():
+            out = m(input_ids=ids, attention_mask=mask).last_hidden_state
+        assert torch.isfinite(out).all()
+
+
+class TestSlidingWindowKwarg:
+    """The fused forward must preserve ``sliding_window`` on the attention-interface call."""
+
+    def test_fused_forward_passes_sliding_window(self):
+        from axolotl.monkeypatch.models.qwen3 import fused_attn
+
+        src = inspect.getsource(fused_attn._make_fused_forward)
+        assert "sliding_window=self.sliding_window" in src, (
+            "Qwen3 fused_forward must pass sliding_window to attention_interface "
+            "to preserve sliding-attention layer behavior"
+        )
+
+
+class TestGetTextConfigDispatch:
+    """A multimodal Qwen3-VL text branch surfaces as ``model_config_type='qwen3'``; the patch must still fire."""
+
+    def test_qwen3_text_branch_dispatch(self, restore_qwen3_attention):
+        from axolotl.loaders.patch_manager import PatchManager
+        from axolotl.utils.dict import DictDefault
+
+        cfg = DictDefault(
+            {
+                "base_model": "fake/qwen3-vl-text-branch",
+                "model_config_type": "qwen3",
+                "fused_attn_kernel": True,
+                "lora_qkv_kernel": False,
+                "lora_o_kernel": False,
+                "context_parallel_size": 1,
+            }
+        )
+        mc = type("MC", (), {"model_type": "qwen3"})()
+        pm = PatchManager(cfg=cfg, model_config=mc, inference=False)
+        pm._apply_model_specific_patches()
+
+        from transformers.models.qwen3.modeling_qwen3 import Qwen3Attention
+
+        assert getattr(Qwen3Attention, "_axolotl_fused_attn_patched", False)
diff --git a/tests/monkeypatch/test_qwen3_fused_attn_robustness.py b/tests/monkeypatch/test_qwen3_fused_attn_robustness.py
new file mode 100644
index 0000000000..2911aaeb72
--- /dev/null
+++ b/tests/monkeypatch/test_qwen3_fused_attn_robustness.py
@@ -0,0 +1,206 @@
+"""Robustness tests for the Qwen3 / Qwen3.5 fused-attn patches (idempotency, signature drift, GC, cross-device, FA2)."""
+
+import inspect
+
+import pytest
+import torch
+
+pytestmark = [
+    pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required"),
+]
+
+pytest.importorskip("transformers.models.qwen3")
+
+
+def _clear_patched_flag(cls):
+    try:
+        delattr(cls, "_axolotl_fused_attn_patched")
+    except AttributeError:
+        pass
+
+
+def _saved_state(cls):
+    return (cls.forward, getattr(cls, "_axolotl_fused_attn_patched", False))
+
+
+def _restore_state(cls, state):
+    saved_forward, saved_flag = state
+    cls.forward = saved_forward
+    if saved_flag:
+        cls._axolotl_fused_attn_patched = saved_flag
+    else:
+        _clear_patched_flag(cls)
+
+
+@pytest.fixture
+def restore_qwen3_attention():
+    from transformers.models.qwen3.modeling_qwen3 import Qwen3Attention
+
+    state = _saved_state(Qwen3Attention)
+    yield Qwen3Attention
+    _restore_state(Qwen3Attention, state)
+
+
+def _build_tiny_qwen3():
+    from transformers.models.qwen3.configuration_qwen3 import Qwen3Config
+    from transformers.models.qwen3.modeling_qwen3 import Qwen3Model
+
+    cfg = Qwen3Config(
+        vocab_size=128,
+        hidden_size=64,
+        intermediate_size=128,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        head_dim=16,
+        max_position_embeddings=512,
+        rms_norm_eps=1e-6,
+        attention_dropout=0.0,
+    )
+    cfg._attn_implementation = "sdpa"
+    return Qwen3Model(cfg).cuda().to(torch.bfloat16)
+
+
+class TestPatchIdempotency:
+    """Re-applying the patch must be a no-op (``_axolotl_fused_attn_patched`` flag guard)."""
+
+    def test_qwen3_double_patch_is_noop(self, restore_qwen3_attention):
+        from transformers.models.qwen3.modeling_qwen3 import Qwen3Attention
+
+        from axolotl.monkeypatch.models.qwen3.fused_attn import (
+            patch_qwen3_fused_attn,
+        )
+
+        patch_qwen3_fused_attn()
+        forward_after_first = Qwen3Attention.forward
+        assert Qwen3Attention._axolotl_fused_attn_patched is True
+
+        patch_qwen3_fused_attn()
+        assert Qwen3Attention.forward is forward_after_first, (
+            "second patch_qwen3_fused_attn() call replaced .forward — the "
+            "_axolotl_fused_attn_patched guard is broken"
+        )
+
+    def test_qwen3_5_double_patch_is_noop(self):
+        pytest.importorskip("transformers.models.qwen3_5")
+        from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5Attention
+
+        from axolotl.monkeypatch.models.qwen3_5.fused_attn import (
+            patch_qwen3_5_fused_attn,
+        )
+
+        state = _saved_state(Qwen3_5Attention)
+        try:
+            patch_qwen3_5_fused_attn()
+            forward_after_first = Qwen3_5Attention.forward
+            patch_qwen3_5_fused_attn()
+            assert Qwen3_5Attention.forward is forward_after_first
+        finally:
+            _restore_state(Qwen3_5Attention, state)
+
+
+class TestSignatureContract:
+    """Pin the stock attention forward signature; transformers drift would otherwise surface as a confusing TypeError mid-training."""
+
+    def test_qwen3_forward_required_params(self):
+        from transformers.models.qwen3.modeling_qwen3 import Qwen3Attention
+
+        sig = inspect.signature(Qwen3Attention.forward)
+        params = list(sig.parameters)
+        assert params[:4] == [
+            "self",
+            "hidden_states",
+            "position_embeddings",
+            "attention_mask",
+        ], (
+            "Qwen3Attention.forward signature drifted away from the contract "
+            "our fused_forward replacement assumes — update the patch."
+        )
+
+    def test_qwen3_5_forward_required_params(self):
+        pytest.importorskip("transformers.models.qwen3_5")
+        from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5Attention
+
+        sig = inspect.signature(Qwen3_5Attention.forward)
+        params = list(sig.parameters)
+        assert params[:4] == [
+            "self",
+            "hidden_states",
+            "position_embeddings",
+            "attention_mask",
+        ]
+
+
+class TestGradientCheckpointingCompose:
+    """Pin that the fused forward survives being re-run inside a checkpoint partial during backward."""
+
+    def test_qwen3_fused_under_gradient_checkpointing(self, restore_qwen3_attention):
+        from axolotl.monkeypatch.models.qwen3.fused_attn import (
+            patch_qwen3_fused_attn,
+        )
+
+        m = _build_tiny_qwen3().train()
+        m.gradient_checkpointing_enable()
+        patch_qwen3_fused_attn()
+
+        ids = torch.randint(0, 128, (2, 32), device="cuda")
+        mask = torch.ones(2, 32, dtype=torch.long, device="cuda")
+        out = m(input_ids=ids, attention_mask=mask, use_cache=False).last_hidden_state
+        loss = out.sum()
+        torch.cuda.reset_peak_memory_stats()
+        loss.backward()
+        peak_mb = torch.cuda.max_memory_allocated() / 1024**2
+
+        for layer in m.layers:
+            attn = layer.self_attn
+            assert attn.q_norm.weight.grad is not None
+            assert attn.k_norm.weight.grad is not None
+            assert attn.q_norm.weight.grad.isfinite().all()
+            assert attn.k_norm.weight.grad.isfinite().all()
+            assert attn.q_norm.weight.grad.abs().sum() > 0
+        assert peak_mb < 1024, f"backward peak {peak_mb:.0f} MB looks like a leak"
+
+
+class TestCrossDeviceNormWeight:
+    """Sharded ``device_map='auto'`` can leave norm weights on CPU; the patch must coerce them or Triton raises on the CPU pointer."""
+
+    def test_qwen3_norm_weight_on_cpu_does_not_crash(self, restore_qwen3_attention):
+        from axolotl.monkeypatch.models.qwen3.fused_attn import (
+            patch_qwen3_fused_attn,
+        )
+
+        m = _build_tiny_qwen3()
+        patch_qwen3_fused_attn()
+
+        for layer in m.layers:
+            attn = layer.self_attn
+            attn.q_norm.weight.data = attn.q_norm.weight.data.cpu()
+            attn.k_norm.weight.data = attn.k_norm.weight.data.cpu()
+
+        ids = torch.randint(0, 128, (1, 16), device="cuda")
+        mask = torch.ones(1, 16, dtype=torch.long, device="cuda")
+        with torch.no_grad():
+            out = m(input_ids=ids, attention_mask=mask).last_hidden_state
+        assert torch.isfinite(out).all()
+
+
+class TestFlashAttention2Compose:
+    """The fused region is upstream of ``attention_interface``; pin clean composition with FA2 if it's installed."""
+
+    def test_qwen3_fused_under_flash_attention_2(self, restore_qwen3_attention):
+        pytest.importorskip("flash_attn")
+        from axolotl.monkeypatch.models.qwen3.fused_attn import (
+            patch_qwen3_fused_attn,
+        )
+
+        m = _build_tiny_qwen3()
+        m.config._attn_implementation = "flash_attention_2"
+        for layer in m.layers:
+            layer.self_attn.config._attn_implementation = "flash_attention_2"
+
+        patch_qwen3_fused_attn()
+        ids = torch.randint(0, 128, (2, 32), device="cuda")
+        mask = torch.ones(2, 32, dtype=torch.long, device="cuda")
+        with torch.no_grad():
+            out = m(input_ids=ids, attention_mask=mask).last_hidden_state
+        assert torch.isfinite(out).all()
diff --git a/tests/monkeypatch/test_qwen3_vl_fused_attn.py b/tests/monkeypatch/test_qwen3_vl_fused_attn.py
new file mode 100644
index 0000000000..1cc422e8b5
--- /dev/null
+++ b/tests/monkeypatch/test_qwen3_vl_fused_attn.py
@@ -0,0 +1,168 @@
+"""Tests for the Qwen3-VL text fused-attention monkeypatch."""
+
+import pytest
+import torch
+
+pytestmark = [
+    pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required"),
+]
+
+pytest.importorskip("transformers.models.qwen3_vl")
+
+
+def _clear_patched_flag(cls):
+    try:
+        delattr(cls, "_axolotl_fused_attn_patched")
+    except AttributeError:
+        pass
+
+
+@pytest.fixture
+def restore_qwen3_vl_attention():
+    from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLTextAttention
+
+    saved = Qwen3VLTextAttention.forward
+    saved_flag = getattr(Qwen3VLTextAttention, "_axolotl_fused_attn_patched", False)
+    yield Qwen3VLTextAttention
+    Qwen3VLTextAttention.forward = saved
+    if saved_flag:
+        Qwen3VLTextAttention._axolotl_fused_attn_patched = saved_flag
+    else:
+        _clear_patched_flag(Qwen3VLTextAttention)
+
+
+def _build_qwen3_vl_text_model(seed: int = 0):
+    from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLTextConfig
+    from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLTextModel
+
+    torch.manual_seed(seed)
+    cfg = Qwen3VLTextConfig(
+        vocab_size=128,
+        hidden_size=64,
+        intermediate_size=128,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        head_dim=16,
+        max_position_embeddings=512,
+        rms_norm_eps=1e-6,
+        attention_dropout=0.0,
+        pad_token_id=0,
+    )
+    cfg._attn_implementation = "sdpa"
+    return Qwen3VLTextModel(cfg).cuda().to(torch.bfloat16).eval()
+
+
+def _run_attention(model, layer_idx, hidden_states, position_ids):
+    attn = model.layers[layer_idx].self_attn
+    cos, sin = model.rotary_emb(hidden_states, position_ids)
+    out, _ = attn(
+        hidden_states=hidden_states,
+        position_embeddings=(cos, sin),
+        attention_mask=None,
+    )
+    return out
+
+
+class TestQwen3VLFusedAttnParity:
+    """Single-layer parity vs stock Qwen3VLTextAttention."""
+
+    @pytest.mark.parametrize("layer_idx", [0, 1])
+    def test_forward_matches_stock(self, restore_qwen3_vl_attention, layer_idx):
+        from axolotl.monkeypatch.models.qwen3_vl.fused_attn import (
+            patch_qwen3_vl_fused_attn,
+        )
+
+        model = _build_qwen3_vl_text_model(seed=1)
+        hidden_states = torch.randn(2, 16, 64, device="cuda", dtype=torch.bfloat16)
+        position_ids = torch.arange(16, device="cuda").unsqueeze(0).expand(2, -1)
+
+        with torch.no_grad():
+            ref = _run_attention(model, layer_idx, hidden_states, position_ids)
+
+        patch_qwen3_vl_fused_attn()
+        with torch.no_grad():
+            got = _run_attention(model, layer_idx, hidden_states, position_ids)
+
+        assert got.shape == ref.shape
+        assert torch.isfinite(got).all()
+        cos_sim = torch.nn.functional.cosine_similarity(
+            ref.flatten().float(), got.flatten().float(), dim=0
+        )
+        assert cos_sim > 0.999, (
+            f"layer {layer_idx} fused vs stock cosine_sim={cos_sim:.6f}"
+        )
+        torch.testing.assert_close(got, ref, rtol=5e-2, atol=5e-2)
+
+
+class TestQwen3VLFusedAttnEndToEnd:
+    def test_full_forward_matches_stock(self, restore_qwen3_vl_attention):
+        from axolotl.monkeypatch.models.qwen3_vl.fused_attn import (
+            patch_qwen3_vl_fused_attn,
+        )
+
+        model = _build_qwen3_vl_text_model(seed=2)
+        input_ids = torch.randint(0, 128, (2, 32), device="cuda")
+        attention_mask = torch.ones(2, 32, dtype=torch.long, device="cuda")
+
+        with torch.no_grad():
+            ref = model(
+                input_ids=input_ids, attention_mask=attention_mask
+            ).last_hidden_state.clone()
+
+        patch_qwen3_vl_fused_attn()
+        with torch.no_grad():
+            got = model(
+                input_ids=input_ids, attention_mask=attention_mask
+            ).last_hidden_state.clone()
+
+        assert got.shape == ref.shape
+        assert torch.isfinite(got).all()
+        cos_sim = torch.nn.functional.cosine_similarity(
+            ref.flatten().float(), got.flatten().float(), dim=0
+        )
+        assert cos_sim > 0.999, f"end-to-end cosine_sim={cos_sim:.6f}"
+
+    def test_backward_grad_flows_through_fused_path(self, restore_qwen3_vl_attention):
+        from axolotl.monkeypatch.models.qwen3_vl.fused_attn import (
+            patch_qwen3_vl_fused_attn,
+        )
+
+        model = _build_qwen3_vl_text_model(seed=3).train()
+        patch_qwen3_vl_fused_attn()
+
+        input_ids = torch.randint(0, 128, (2, 16), device="cuda")
+        attention_mask = torch.ones(2, 16, dtype=torch.long, device="cuda")
+        out = model(
+            input_ids=input_ids, attention_mask=attention_mask
+        ).last_hidden_state
+        out.sum().backward()
+
+        for idx, layer in enumerate(model.layers[:2]):
+            attn = layer.self_attn
+            assert attn.q_norm.weight.grad is not None, f"layer {idx} q_norm no grad"
+            assert attn.k_norm.weight.grad is not None, f"layer {idx} k_norm no grad"
+            assert attn.q_norm.weight.grad.isfinite().all()
+            assert attn.k_norm.weight.grad.isfinite().all()
+            assert attn.q_norm.weight.grad.abs().sum() > 0
+            assert attn.k_norm.weight.grad.abs().sum() > 0
+
+
+class TestQwen3VLPatchManagerDispatch:
+    def test_patch_manager_dispatches_qwen3_vl(self, restore_qwen3_vl_attention):
+        from types import SimpleNamespace
+
+        from axolotl.loaders.patch_manager import PatchManager
+
+        cfg = SimpleNamespace(
+            fused_attn_kernel=True,
+            model_config_type="qwen3_vl",
+            llama4_linearized_experts=False,
+            sample_packing=False,
+            context_parallel_size=1,
+            attn_uses_flash_lib=False,
+        )
+
+        PatchManager(cfg=cfg, model_config=object())._apply_model_specific_patches()
+
+        assert getattr(restore_qwen3_vl_attention, "_axolotl_fused_attn_patched", False)
diff --git a/tests/prompt_strategies/test_multimodal_pretrain.py b/tests/prompt_strategies/test_multimodal_pretrain.py
index da6e6a4985..906cf28d99 100644
--- a/tests/prompt_strategies/test_multimodal_pretrain.py
+++ b/tests/prompt_strategies/test_multimodal_pretrain.py
@@ -1,28 +1,53 @@
-"""Multimodal CPT helpers + safety gate tests.
-
-The non-streaming strategy class and ``load()`` factory are deferred to a
-follow-on PR (along with the matching ``build_collator`` routing for
-``datasets:`` MM CPT batches), so only the helper-level surface is exercised
-here in v1.
-"""
-
 from __future__ import annotations
 
 import pytest
+from datasets import Dataset
 from transformers import AutoProcessor
 
 from axolotl.prompt_strategies.multimodal_pretrain import (
     _INCOMPATIBLE_PROCESSOR_REASONS,
     ImageTokenSpec,
+    MultiModalPretrainDatasetWrappingStrategy,
     build_image_token_spec,
     check_processor_compatibility,
+    load,
 )
+from axolotl.utils.data.utils import handle_long_seq_in_dataset
+from axolotl.utils.dict import DictDefault
 
 from tests.hf_offline_utils import enable_hf_offline
 
 _SMOLVLM = "HuggingFaceTB/SmolVLM-500M-Instruct"
 
 
+class _StubTokenizer:
+    eos_token_id = 2
+    pad_token_id = 0
+    unk_token_id = 1
+    all_special_tokens = ["<image>"]
+    additional_special_tokens = ["<image>"]
+    name_or_path = "stub-tokenizer"
+
+    def get_added_vocab(self):
+        return {"<image>": 42}
+
+    def convert_tokens_to_ids(self, tok):
+        return {"<image>": 42}.get(tok, self.unk_token_id)
+
+    def __call__(self, text, add_special_tokens=True):
+        ids = []
+        for token in text.split():
+            ids.append(42 if token == "<image>" else 100 + len(token))
+        return {"input_ids": ids, "attention_mask": [1] * len(ids)}
+
+
+class _StubProcessor:
+    image_token = "<image>"
+
+    def __init__(self):
+        self.tokenizer = _StubTokenizer()
+
+
 @pytest.fixture(scope="module", name="smolvlm_processor")
 @enable_hf_offline
 def fixture_smolvlm_processor(
@@ -31,9 +56,6 @@ def fixture_smolvlm_processor(
     return AutoProcessor.from_pretrained(_SMOLVLM)
 
 
-# ---- build_image_token_spec ------------------------------------------------
-
-
 def test_build_image_token_spec_autodetects_smolvlm(smolvlm_processor):
     spec = build_image_token_spec(smolvlm_processor)
     assert isinstance(spec, ImageTokenSpec)
@@ -61,10 +83,6 @@ def test_build_image_token_spec_rejects_plain_word_override(smolvlm_processor):
 def test_build_image_token_spec_keeps_image_token_when_no_soft_token_in_name(
     smolvlm_processor,
 ):
-    """Non-Gemma-3 processors: the boi-swap heuristic only fires when
-    `image_token` name contains "soft_token" (Gemma-3 convention). Otherwise
-    `image_token` IS the user-facing placeholder (Gemma-4 convention) and
-    must not be silently replaced by `boi_token`."""
     tok = smolvlm_processor.tokenizer
     image_id = tok.convert_tokens_to_ids("<image>")
     boi_id = tok.convert_tokens_to_ids("<fake_token_around_image>")
@@ -73,7 +91,7 @@ def test_build_image_token_spec_keeps_image_token_when_no_soft_token_in_name(
     )
 
     class _FakeGemma4Like:
-        image_token = "<image>"  # no 'soft_token' in name → must not swap
+        image_token = "<image>"
         boi_token = "<fake_token_around_image>"
         tokenizer = tok
 
@@ -83,9 +101,6 @@ class _FakeGemma4Like:
     assert spec.image_token_id != boi_id
 
 
-# ---- check_processor_compatibility (startup-time gate) ---------------------
-
-
 @pytest.mark.parametrize("cls_name", list(_INCOMPATIBLE_PROCESSOR_REASONS.keys()))
 def test_check_processor_compatibility_rejects_incompatible(cls_name):
     fake = type(cls_name, (), {})()
@@ -113,3 +128,111 @@ class CustomUserProcessor(BaseMllama):
 
 def test_check_processor_compatibility_accepts_supported(smolvlm_processor):
     check_processor_compatibility(smolvlm_processor)
+
+
+def test_load_returns_nonstreaming_dataset_strategy():
+    processor = _StubProcessor()
+    strategy = load(
+        processor.tokenizer,
+        DictDefault({"sequence_len": 128}),
+        ds_cfg={"text_column": "caption", "image_column": "image_paths"},
+        processor=processor,
+    )
+    assert isinstance(strategy, MultiModalPretrainDatasetWrappingStrategy)
+    assert strategy.text_column == "caption"
+    assert strategy.image_column == "image_paths"
+
+
+def test_load_requires_processor_for_nonstreaming_strategy():
+    tokenizer = _StubTokenizer()
+    with pytest.raises(ValueError, match="requires a processor"):
+        load(tokenizer, DictDefault({"sequence_len": 128}), ds_cfg={}, processor=None)
+
+
+def test_load_rejects_processor_tokenizer_mismatch():
+    processor = _StubProcessor()
+    with pytest.raises(ValueError, match=r"processor\.tokenizer"):
+        load(_StubTokenizer(), DictDefault({"sequence_len": 128}), processor=processor)
+
+
+def test_nonstreaming_strategy_wraps_dataset_without_loading_pixels():
+    processor = _StubProcessor()
+    strategy = load(
+        processor.tokenizer,
+        DictDefault({"sequence_len": 128}),
+        ds_cfg={"text_column": "caption", "image_column": "image_paths"},
+        processor=processor,
+    )
+    dataset = Dataset.from_dict(
+        {
+            "caption": ["<image>\nfirst row", "text only row"],
+            "image_paths": [["relative/a.png"], []],
+            "metadata": ["dropped", "dropped"],
+        }
+    )
+
+    wrapped = strategy.wrap_dataset(dataset, process_count=None)
+
+    assert set(wrapped.column_names) == {
+        "input_ids",
+        "labels",
+        "attention_mask",
+        "images",
+        "_mm_text",
+    }
+    assert wrapped[0]["images"] == ["relative/a.png"]
+    assert wrapped[0]["_mm_text"] == "<image>\nfirst row"
+    assert wrapped[1]["images"] == []
+    assert wrapped[1]["labels"] == wrapped[1]["input_ids"]
+
+
+def test_nonstreaming_strategy_defers_oversized_rows_to_standard_handler():
+    processor = _StubProcessor()
+    strategy = load(
+        processor.tokenizer,
+        DictDefault({"sequence_len": 2}),
+        ds_cfg={"text_column": "caption", "image_column": "image_paths"},
+        processor=processor,
+    )
+    dataset = Dataset.from_dict(
+        {
+            "caption": ["<image> too-long"],
+            "image_paths": [["relative/a.png"]],
+        }
+    )
+
+    wrapped = strategy.wrap_dataset(dataset, process_count=None)
+
+    assert len(wrapped[0]["input_ids"]) > 2
+
+
+def test_standard_length_handler_drops_nonstreaming_mm_oversized_rows():
+    processor = _StubProcessor()
+    strategy = load(
+        processor.tokenizer,
+        DictDefault({"sequence_len": 2}),
+        ds_cfg={"text_column": "caption", "image_column": "image_paths"},
+        processor=processor,
+    )
+    dataset = Dataset.from_dict(
+        {
+            "caption": ["<image> too-long"],
+            "image_paths": [["relative/a.png"]],
+        }
+    )
+
+    wrapped = strategy.wrap_dataset(dataset, process_count=None)
+    filtered = handle_long_seq_in_dataset(
+        wrapped,
+        2,
+        DictDefault(
+            {
+                "dataset_num_proc": None,
+                "is_preprocess": False,
+                "excess_length_strategy": "drop",
+                "min_sample_len": 1,
+            }
+        ),
+    )
+
+    assert len(filtered) == 0
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index bdb795e136..43f1ba1c7a 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -143,7 +143,7 @@ def test_load_from_save_to_disk(self, tokenizer, dataset_fixture):
                             "type": "alpaca",
                         },
                     ],
-                    "dataset_num_proc": 4,
+                    "dataset_num_proc": 1,
                 }
             )
 
@@ -182,7 +182,7 @@ def test_load_from_dir_of_parquet(self, tokenizer, dataset_fixture):
                             "type": "alpaca",
                         },
                     ],
-                    "dataset_num_proc": 4,
+                    "dataset_num_proc": 1,
                 }
             )
 
@@ -221,7 +221,7 @@ def test_load_from_dir_of_json(self, tokenizer, dataset_fixture):
                             "type": "alpaca",
                         },
                     ],
-                    "dataset_num_proc": 4,
+                    "dataset_num_proc": 1,
                 }
             )
 
@@ -254,7 +254,7 @@ def test_load_from_single_parquet(self, tokenizer, dataset_fixture):
                             "type": "alpaca",
                         },
                     ],
-                    "dataset_num_proc": 4,
+                    "dataset_num_proc": 1,
                 }
             )
 
@@ -287,7 +287,7 @@ def test_load_from_single_json(self, tokenizer, dataset_fixture):
                             "type": "alpaca",
                         },
                     ],
-                    "dataset_num_proc": 4,
+                    "dataset_num_proc": 1,
                 }
             )
 
@@ -372,7 +372,7 @@ def test_load_hub_with_revision_with_dpo(
                 "rl": "dpo",
                 "chat_template": "llama3",
                 "datasets": [ALPACA_MESSAGES_CONFIG_REVISION],
-                "dataset_num_proc": 4,
+                "dataset_num_proc": 1,
             }
         )
 
@@ -473,7 +473,7 @@ def test_loading_local_dataset_folder(self, tokenizer):
                             "type": "alpaca",
                         },
                     ],
-                    "dataset_num_proc": 4,
+                    "dataset_num_proc": 1,
                 }
             )
 
@@ -539,7 +539,7 @@ def test_load_dataset_with_str_json_data(self, tokenizer):
                             "message_field_content": "content",
                         },
                     ],
-                    "dataset_num_proc": 4,
+                    "dataset_num_proc": 1,
                 }
             )
 
diff --git a/tests/test_exact_deduplication.py b/tests/test_exact_deduplication.py
index a519db525b..387c77ad4c 100644
--- a/tests/test_exact_deduplication.py
+++ b/tests/test_exact_deduplication.py
@@ -210,7 +210,7 @@ def cfg(self):
                     ALPACA_MESSAGES_CONFIG_REVISION,
                     ALPACA_MESSAGES_CONFIG_REVISION,
                 ],
-                "dataset_num_proc": 4,
+                "dataset_num_proc": 1,
             }
         )
         yield fixture
diff --git a/tests/test_fp32_norms.py b/tests/test_fp32_norms.py
new file mode 100644
index 0000000000..e2b477fe8e
--- /dev/null
+++ b/tests/test_fp32_norms.py
@@ -0,0 +1,264 @@
+"""Unit tests for fp32 norm sharding (FSDP2). Pure-CPU, no dist init."""
+
+from __future__ import annotations
+
+import logging
+from types import SimpleNamespace
+
+import pytest
+import torch
+import torch.nn as nn
+from torch.distributed.fsdp import MixedPrecisionPolicy
+
+from axolotl.loaders.model import ModelLoader
+from axolotl.utils.dict import DictDefault
+from axolotl.utils.fp32_norms import (
+    DEFAULT_FP32_NORM_SUFFIXES,
+    _matches_norm_class,
+    shard_norms_fp32,
+)
+
+
+class LlamaRMSNorm(nn.Module):
+    def __init__(self, dim: int = 8) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(dim))
+
+
+class AfmoeRMSNorm(nn.Module):
+    def __init__(self, dim: int = 8) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(dim))
+
+
+class CustomNorm(nn.Module):
+    def __init__(self, dim: int = 8) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(dim))
+
+
+class CustomNormWithBuffer(nn.Module):
+    def __init__(self, dim: int = 8) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(dim, dtype=torch.bfloat16))
+        self.register_buffer("running_scale", torch.ones(dim, dtype=torch.float16))
+
+
+class MLP(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.fc = nn.Linear(8, 8)
+
+
+def test_suffix_matches_multiple_norm_families():
+    patterns = list(DEFAULT_FP32_NORM_SUFFIXES)
+    assert _matches_norm_class(LlamaRMSNorm(), patterns)
+    assert _matches_norm_class(AfmoeRMSNorm(), patterns)
+    assert _matches_norm_class(nn.LayerNorm(8), patterns)
+
+
+def test_suffix_does_not_match_non_norm_modules():
+    patterns = list(DEFAULT_FP32_NORM_SUFFIXES)
+    assert not _matches_norm_class(MLP(), patterns)
+    assert not _matches_norm_class(nn.Linear(8, 8), patterns)
+    assert not _matches_norm_class(CustomNorm(), patterns)
+
+
+def test_explicit_classname_matches_custom_norm():
+    assert _matches_norm_class(CustomNorm(), ["CustomNorm"])
+
+
+def test_fully_qualified_pattern_matches_exact_path():
+    qualified = f"{LlamaRMSNorm.__module__}.LlamaRMSNorm"
+    assert _matches_norm_class(LlamaRMSNorm(), [qualified])
+    assert not _matches_norm_class(AfmoeRMSNorm(), [qualified])
+
+
+def test_mixed_patterns_suffix_and_qualified():
+    qualified = f"{LlamaRMSNorm.__module__}.LlamaRMSNorm"
+    patterns = [qualified, "LayerNorm"]
+    assert _matches_norm_class(LlamaRMSNorm(), patterns)
+    assert _matches_norm_class(nn.LayerNorm(8), patterns)
+    assert not _matches_norm_class(AfmoeRMSNorm(), patterns)
+
+
+class _Cfg:
+    def __init__(self, **kwargs):
+        self.fp32_norms = kwargs.get("fp32_norms", False)
+        self.fp32_norm_classes = kwargs.get("fp32_norm_classes", None)
+        self.fsdp_version = kwargs.get("fsdp_version", None)
+        self.fsdp_config = kwargs.get("fsdp_config", None)
+        self.tensor_parallel_size = kwargs.get("tensor_parallel_size", 1)
+        self.lora_on_cpu = kwargs.get("lora_on_cpu", False)
+
+
+def test_disabled_is_noop():
+    model = nn.Sequential(LlamaRMSNorm(), MLP())
+    assert shard_norms_fp32(model, _Cfg(fp32_norms=False)) == 0
+
+
+def test_enabled_requires_fsdp2():
+    model = nn.Sequential(LlamaRMSNorm())
+    cfg = _Cfg(fp32_norms=True, fsdp_version=1)
+    with pytest.raises(ValueError, match="fsdp_version: 2"):
+        shard_norms_fp32(model, cfg)
+
+
+def test_meta_device_is_supported(monkeypatch):
+    with torch.device("meta"):
+        model = nn.Sequential(LlamaRMSNorm())
+    cfg = _Cfg(fp32_norms=True, fsdp_version=2)
+
+    import torch.distributed.fsdp as fsdp_module
+
+    calls = []
+
+    def fake_fully_shard(module, mp_policy=None, **kwargs):
+        calls.append((type(module).__name__, mp_policy, kwargs))
+        return module
+
+    monkeypatch.setattr(fsdp_module, "fully_shard", fake_fully_shard)
+
+    n = shard_norms_fp32(model, cfg)
+    assert n == 1
+    assert calls[0][0] == "LlamaRMSNorm"
+    assert calls[0][1].param_dtype == torch.float32
+
+
+def test_passthrough_fully_shard_kwargs_are_used(monkeypatch):
+    model = nn.Sequential(LlamaRMSNorm())
+    cfg = _Cfg(fp32_norms=True, fsdp_version=2)
+
+    import torch.distributed.fsdp as fsdp_module
+
+    calls = []
+
+    def fake_fully_shard(module, mp_policy=None, **kwargs):
+        calls.append((module, mp_policy, kwargs))
+        return module
+
+    monkeypatch.setattr(fsdp_module, "fully_shard", fake_fully_shard)
+
+    sentinel_mesh = object()
+    outer_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16)
+    n = shard_norms_fp32(
+        model,
+        cfg,
+        fully_shard_kwargs={
+            "mesh": sentinel_mesh,
+            "reshard_after_forward": True,
+            "mp_policy": outer_policy,
+        },
+    )
+    assert n == 1
+    assert calls[0][2]["mesh"] is sentinel_mesh
+    assert calls[0][2]["reshard_after_forward"] is True
+    assert calls[0][1].output_dtype == torch.bfloat16
+
+
+def test_no_matches_warns_and_returns_zero(caplog):
+    model = nn.Sequential(MLP(), nn.Linear(8, 8))
+    cfg = _Cfg(fp32_norms=True, fsdp_version=2)
+    # axolotl.cli.configure_logging() sets propagate=False on the `axolotl`
+    # logger, so pytest caplog can't see records by default. Temporarily
+    # re-enable propagation for this assertion.
+    ax_logger = logging.getLogger("axolotl")
+    old_propagate = ax_logger.propagate
+    ax_logger.propagate = True
+    try:
+        with caplog.at_level("WARNING", logger="axolotl"):
+            n = shard_norms_fp32(model, cfg)
+    finally:
+        ax_logger.propagate = old_propagate
+    assert n == 0
+    assert "no modules matched" in caplog.text
+
+
+def test_explicit_classes_override_defaults(monkeypatch):
+    model = nn.Sequential(CustomNorm(), MLP())
+    cfg = _Cfg(fp32_norms=True, fsdp_version=2, fp32_norm_classes=["CustomNorm"])
+
+    import torch.distributed.fsdp as fsdp_module
+
+    calls = []
+
+    def fake_fully_shard(module, mp_policy=None, **_):
+        calls.append((type(module).__name__, mp_policy))
+        return module
+
+    monkeypatch.setattr(fsdp_module, "fully_shard", fake_fully_shard)
+
+    n = shard_norms_fp32(model, cfg)
+    assert n == 1
+    assert calls[0][0] == "CustomNorm"
+    assert calls[0][1].param_dtype == torch.float32
+    assert calls[0][1].reduce_dtype == torch.float32
+
+
+def test_matched_norm_storage_is_cast_to_fp32_before_sharding(monkeypatch):
+    model = nn.Sequential(CustomNormWithBuffer(), MLP())
+    cfg = _Cfg(
+        fp32_norms=True,
+        fsdp_version=2,
+        fp32_norm_classes=["CustomNormWithBuffer"],
+    )
+
+    import torch.distributed.fsdp as fsdp_module
+
+    seen = []
+
+    def fake_fully_shard(module, mp_policy=None, **kwargs):
+        seen.append(
+            (
+                module.weight.dtype,
+                module.running_scale.dtype,
+                mp_policy.output_dtype,
+                kwargs,
+            )
+        )
+        return module
+
+    monkeypatch.setattr(fsdp_module, "fully_shard", fake_fully_shard)
+
+    outer_policy = MixedPrecisionPolicy(param_dtype=torch.float16)
+    n = shard_norms_fp32(
+        model,
+        cfg,
+        fully_shard_kwargs={"mp_policy": outer_policy},
+    )
+
+    assert n == 1
+    assert model[0].weight.dtype == torch.float32
+    assert model[0].running_scale.dtype == torch.float32
+    assert seen[0][0] == torch.float32
+    assert seen[0][1] == torch.float32
+    assert seen[0][2] == torch.float16
+
+
+class TinyModel(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.embed_tokens = nn.Embedding(8, 8)
+        self.input_norm = CustomNorm()
+        self.fc = nn.Linear(8, 8)
+
+
+def test_convert_embedding_modules_dtype_keeps_fp32_norm_matches():
+    loader = ModelLoader.__new__(ModelLoader)
+    loader.cfg = DictDefault(
+        fp32_norms=True,
+        fp32_norm_classes=["CustomNorm"],
+        lora_on_cpu=False,
+    )
+    loader.model = TinyModel()
+    loader.model_config = SimpleNamespace(model_type="llama")
+
+    loader._convert_embedding_modules_dtype(
+        embedding_modules=["embed_tokens"],
+        dist_dtype=torch.bfloat16,
+        before_kbit_train_or_finetune=False,
+    )
+
+    assert loader.model.input_norm.weight.dtype == torch.float32
+    assert loader.model.embed_tokens.weight.dtype == torch.bfloat16
+    assert loader.model.fc.weight.dtype == torch.float32
diff --git a/tests/test_multimodal_streaming.py b/tests/test_multimodal_streaming.py
index dca32d66b8..211c1a2325 100644
--- a/tests/test_multimodal_streaming.py
+++ b/tests/test_multimodal_streaming.py
@@ -2,15 +2,23 @@
 
 from __future__ import annotations
 
+import json
 from pathlib import Path
 
 import numpy as np
 import pytest
 import torch
 from PIL import Image
-from transformers import AutoProcessor
+from transformers import AutoProcessor, TrainerControl, TrainerState, TrainingArguments
 
+from axolotl.core.builders.causal import (
+    HFCausalTrainerBuilder,
+    _get_mm_cpt_config,
+    _is_multimodal_cpt,
+)
+from axolotl.core.trainers.constants import TOKENS_STATE_FILE
 from axolotl.prompt_strategies.multimodal_pretrain import build_image_token_spec
+from axolotl.utils.callbacks.tokens_per_second import TokensPerSecondCallback
 from axolotl.utils.collators.mm_pretrain import MultiModalPretrainDataCollator
 from axolotl.utils.data.streaming import (
     encode_streaming_multimodal,
@@ -221,6 +229,15 @@ def test_build_image_token_spec_no_candidates_raises():
 # ---- wrap_streaming_dataset routing --------------------------------------
 
 
+def _patch_streaming_partial(monkeypatch, fake_partial):
+    import axolotl.utils.data.streaming as streaming_mod
+
+    if hasattr(streaming_mod, "partial"):
+        monkeypatch.setattr(streaming_mod, "partial", fake_partial)
+    else:
+        monkeypatch.setattr(streaming_mod.functools, "partial", fake_partial)
+
+
 def test_wrap_streaming_dataset_uses_pretraining_config_arg(
     smolvlm_processor, monkeypatch
 ):
@@ -233,7 +250,7 @@ def fake_partial(fn, **kwargs):
         captured["kwargs"] = kwargs
         return lambda batch: batch
 
-    monkeypatch.setattr("axolotl.utils.data.streaming.functools.partial", fake_partial)
+    _patch_streaming_partial(monkeypatch, fake_partial)
 
     class _Dataset:
         features = {"text": None, "images": None}
@@ -295,7 +312,7 @@ def fake_partial(fn, **kwargs):
         captured["kwargs"] = kwargs
         return lambda batch: batch
 
-    monkeypatch.setattr("axolotl.utils.data.streaming.functools.partial", fake_partial)
+    _patch_streaming_partial(monkeypatch, fake_partial)
 
     class _Dataset:
         features = {"text": None, "images": None}
@@ -375,6 +392,53 @@ def map(self, *_args, **_kwargs):
     assert captured["kwargs"]["max_tokens"] == 4096
 
 
+def test_mm_cpt_detection_includes_nonstreaming_datasets():
+    cfg = DictDefault(
+        {
+            "pretraining_dataset": None,
+            "datasets": [
+                {
+                    "path": "train/ds",
+                    "type": "multimodal_pretrain",
+                    "image_base_dir": "/train/images",
+                }
+            ],
+        }
+    )
+
+    assert _is_multimodal_cpt(cfg)
+    assert _get_mm_cpt_config(cfg)["image_base_dir"] == "/train/images"
+
+
+def test_mm_cpt_collator_uses_nonstreaming_dataset_config():
+    tok = _StubTokenizer({"<image>": 42})
+    processor = _StubProcessor(tok, image_token="<image>")
+    builder = object.__new__(HFCausalTrainerBuilder)
+    builder.tokenizer = tok
+    builder.processor = processor
+    builder.cfg = DictDefault(
+        {
+            "pretraining_dataset": None,
+            "datasets": [
+                {
+                    "path": "train/ds",
+                    "type": "multimodal_pretrain",
+                    "image_base_dir": "/train/images",
+                    "image_token": "<image>",
+                }
+            ],
+            "test_datasets": None,
+            "sequence_len": 128,
+            "eval_sequence_len": None,
+        }
+    )
+
+    collator = HFCausalTrainerBuilder._build_mm_pretrain_collator(builder)
+
+    assert isinstance(collator, MultiModalPretrainDataCollator)
+    assert collator.image_base_dir == "/train/images"
+
+
 # ---- MultiModalPretrainDataCollator ---------------------------------------
 
 
@@ -432,6 +496,47 @@ def test_collator_raises_on_missing_columns(smolvlm_processor):
         collator.torch_call([{"input_ids": [1, 2, 3]}])  # no _mm_text / images
 
 
+def test_collator_resolves_relative_image_base_dir(smolvlm_processor, tmp_path):
+    spec = build_image_token_spec(smolvlm_processor)
+    collator = MultiModalPretrainDataCollator(
+        tokenizer=smolvlm_processor.tokenizer,
+        processor=smolvlm_processor,
+        image_token_spec=spec,
+        image_base_dir=str(tmp_path),
+    )
+
+    assert collator._resolve_image_source("rel/img.png") == str(
+        tmp_path / "rel/img.png"
+    )
+    assert collator._resolve_image_source("/abs/img.png") == "/abs/img.png"
+    assert (
+        collator._resolve_image_source("https://host/img.png") == "https://host/img.png"
+    )
+
+
+def test_tokens_per_second_callback_restores_checkpoint_token_state(tmp_path):
+    checkpoint = tmp_path / "checkpoint-1"
+    checkpoint.mkdir()
+    (checkpoint / TOKENS_STATE_FILE).write_text(
+        json.dumps({"total": 123, "trainable": 45})
+    )
+    callback = TokensPerSecondCallback(
+        tensor_parallel_size=None,
+        context_parallel_size=None,
+        resume_from_checkpoint=str(checkpoint),
+    )
+    state = TrainerState()
+
+    callback.on_train_begin(
+        TrainingArguments(output_dir=str(tmp_path / "out")),
+        state,
+        TrainerControl(),
+    )
+
+    assert int(state.tokens["total"].item()) == 123
+    assert int(state.tokens["trainable"].item()) == 45
+
+
 # ---- input validation -----------------------------------------------------
 
 
diff --git a/tests/test_packed_dataset.py b/tests/test_packed_dataset.py
index 953d523af4..cecd82a814 100644
--- a/tests/test_packed_dataset.py
+++ b/tests/test_packed_dataset.py
@@ -55,7 +55,7 @@ def test_lora_packing(self, temp_dir):
                         "type": "alpaca",
                     },
                 ],
-                "dataset_num_proc": 4,
+                "dataset_num_proc": 1,
                 "num_epochs": 1,
                 "max_steps": 20,
                 "save_steps": 10,
diff --git a/tests/utils/data/test_hash.py b/tests/utils/data/test_hash.py
index 07bc2bb6f4..56ccc62070 100644
--- a/tests/utils/data/test_hash.py
+++ b/tests/utils/data/test_hash.py
@@ -43,6 +43,28 @@ def _datasets():
     ]
 
 
+def _mm_datasets(**overrides):
+    return [
+        DictDefault(
+            {
+                "path": "local/mm.jsonl",
+                "type": "multimodal_pretrain",
+                "shards": None,
+                "conversation": None,
+                "split": "train",
+                "temperature": None,
+                "text_column": "text",
+                "image_column": "images",
+                "image_base_dir": "/images",
+                "image_token": "<image>",
+                "data_files": None,
+                "ds_type": "json",
+                **overrides,
+            }
+        )
+    ]
+
+
 class TestGenerateDatasetHashFromConfig:
     def test_same_config_same_hash(self):
         """Identical configs produce identical hashes."""
@@ -137,6 +159,56 @@ def test_no_added_tokens_overrides_uses_tokenizer_name_as_before(self):
 
         assert h1 != h2
 
+    def test_multimodal_dataset_fields_affect_hash(self):
+        cfg = _base_cfg()
+        base_hash = generate_dataset_hash_from_config(cfg, _mm_datasets(), "tok")
+
+        for key, value in (
+            ("text_column", "caption"),
+            ("image_column", "image_paths"),
+            ("image_base_dir", "/other/images"),
+            ("image_token", "<|image_pad|>"),
+            ("data_files", "train-0001.jsonl"),
+            ("ds_type", "parquet"),
+        ):
+            changed_hash = generate_dataset_hash_from_config(
+                cfg, _mm_datasets(**{key: value}), "tok"
+            )
+            assert changed_hash != base_hash
+
+    def test_multimodal_processor_affects_hash(self):
+        cfg = _base_cfg()
+        h1 = generate_dataset_hash_from_config(
+            cfg, _mm_datasets(image_token=None), "tok", "ProcessorA"
+        )
+        h2 = generate_dataset_hash_from_config(
+            cfg, _mm_datasets(image_token=None), "tok", "ProcessorB"
+        )
+
+        assert h1 != h2
+
+    def test_text_processor_does_not_affect_hash(self):
+        cfg = _base_cfg()
+        h1 = generate_dataset_hash_from_config(cfg, _datasets(), "tok", "ProcessorA")
+        h2 = generate_dataset_hash_from_config(cfg, _datasets(), "tok", "ProcessorB")
+
+        assert h1 == h2
+
+    def test_dataset_order_affects_hash(self):
+        cfg = _base_cfg()
+        dataset_a = _datasets()[0]
+        dataset_b = DictDefault(
+            {
+                **dataset_a,
+                "path": "mhenrichsen/alpaca_2k_test_b",
+            }
+        )
+
+        h1 = generate_dataset_hash_from_config(cfg, [dataset_a, dataset_b], "tok")
+        h2 = generate_dataset_hash_from_config(cfg, [dataset_b, dataset_a], "tok")
+
+        assert h1 != h2
+
 
 def _mm_pretrain_cfg(**kwargs):
     return DictDefault(
diff --git a/tests/utils/data/test_mm_cpt_eval.py b/tests/utils/data/test_mm_cpt_eval.py
index 73aa1015ec..dae0a3a0f5 100644
--- a/tests/utils/data/test_mm_cpt_eval.py
+++ b/tests/utils/data/test_mm_cpt_eval.py
@@ -2,12 +2,100 @@
 
 from __future__ import annotations
 
+from datasets import Dataset
+
 from axolotl.utils.data.sft import (
     _create_placeholder_dataset,
+    _load_and_process_single_dataset,
+    _prepare_standard_dataset,
     _prepare_streaming_dataset,
 )
 from axolotl.utils.dict import DictDefault
 
+
+class _Tokenizer:
+    name_or_path = "tok"
+
+
+def test_nonstreaming_pretokenized_mm_rows_keep_collator_columns(monkeypatch):
+    dataset = Dataset.from_dict(
+        {
+            "input_ids": [[1, 2, 3]],
+            "attention_mask": [[1, 1, 1]],
+            "labels": [[1, 2, 3]],
+            "_mm_text": ["<image>\nrow"],
+            "images": [["rel/a.png"]],
+        }
+    )
+    monkeypatch.setattr(
+        "axolotl.utils.data.sft.load_dataset_with_config",
+        lambda *_args, **_kwargs: dataset,
+    )
+    cfg = DictDefault(
+        {
+            "hf_use_auth_token": None,
+            "dataset_num_proc": None,
+            "dataset_keep_in_memory": False,
+            "skip_prepare_dataset": False,
+        }
+    )
+    entry = DictDefault(
+        {"path": "json", "type": "multimodal_pretrain", "split": "train"}
+    )
+
+    wrapped, _ = _load_and_process_single_dataset(
+        entry,
+        cfg,
+        _Tokenizer(),
+        split="train",
+        seed=42,
+    )
+
+    assert wrapped.column_names == dataset.column_names
+    assert wrapped[0]["_mm_text"] == "<image>\nrow"
+    assert wrapped[0]["images"] == ["rel/a.png"]
+
+
+def test_standard_mm_datasets_num_epochs_derives_total_steps(monkeypatch, tmp_path):
+    train_dataset = Dataset.from_dict(
+        {
+            "input_ids": [[1], [1, 2], [1, 2, 3], [1], [1, 2]],
+            "attention_mask": [[1], [1, 1], [1, 1, 1], [1], [1, 1]],
+            "labels": [[1], [1, 2], [1, 2, 3], [1], [1, 2]],
+        }
+    )
+
+    def fake_load_and_prepare(*_args, **_kwargs):
+        return train_dataset, None, [None]
+
+    monkeypatch.setattr(
+        "axolotl.utils.data.sft._load_and_prepare_datasets",
+        fake_load_and_prepare,
+    )
+    cfg = DictDefault(
+        {
+            "dataset_prepared_path": str(tmp_path),
+            "datasets": [{"path": "json", "type": "multimodal_pretrain"}],
+            "test_datasets": None,
+            "val_set_size": 0,
+            "sample_packing": False,
+            "eval_sample_packing": False,
+            "max_steps": None,
+            "num_epochs": 2,
+            "batch_size": 4,
+            "skip_prepare_dataset": False,
+            "reward_model": False,
+            "total_num_tokens": None,
+            "total_supervised_tokens": None,
+            "model_config_type": None,
+        }
+    )
+
+    _, _, total_steps, _ = _prepare_standard_dataset(cfg, _Tokenizer(), None)
+
+    assert total_steps == 3
+
+
 # ---- placeholder dataset for dispatch_batches ----------------------------
 
 
@@ -95,9 +183,6 @@ def skip(self, *_a, **_kw):
 
         return _Stub()
 
-    def fake_wrap(ds, *_a, **_kw):
-        return ds
-
     class _StubFormat:
         def with_format(self, *_a, **_kw):
             return self
diff --git a/tests/utils/data/test_mm_pretrain_cache.py b/tests/utils/data/test_mm_pretrain_cache.py
index 063c75d44e..07468bf3ce 100644
--- a/tests/utils/data/test_mm_pretrain_cache.py
+++ b/tests/utils/data/test_mm_pretrain_cache.py
@@ -90,10 +90,6 @@ class _FakeProc:
     pass
 
 
-class _FakeImageProc:
-    pass
-
-
 class TestProcessorFingerprint:
     def test_none_returns_none(self):
         assert _processor_fingerprint(None) is None
@@ -108,6 +104,11 @@ def test_image_token_distinguishes(self):
         b = SimpleNamespace(image_token="<|image_pad|>")
         assert _processor_fingerprint(a) != _processor_fingerprint(b)
 
+    def test_boi_token_distinguishes(self):
+        a = SimpleNamespace(boi_token="<start_of_image>")
+        b = SimpleNamespace(boi_token="<image>")
+        assert _processor_fingerprint(a) != _processor_fingerprint(b)
+
     def test_image_processor_settings_distinguish(self):
         a = SimpleNamespace(
             image_processor=SimpleNamespace(size={"shortest_edge": 336}, patch_size=14)
diff --git a/tests/utils/schemas/validation/test_fsdp.py b/tests/utils/schemas/validation/test_fsdp.py
index ce3f3aa07b..ce0e5f71b5 100644
--- a/tests/utils/schemas/validation/test_fsdp.py
+++ b/tests/utils/schemas/validation/test_fsdp.py
@@ -2,6 +2,8 @@
 tests for pydantic fsdp validation
 """
 
+import logging
+
 import pytest
 
 from axolotl.utils.config import validate_config
@@ -136,6 +138,77 @@ def test_fsdp_prefixes_removed(self, min_base_cfg):
         assert cfg.fsdp_config.transformer_layer_cls_to_wrap == "LlamaDecoderLayer"
         assert cfg.fsdp_config.reshard_after_forward is True
 
+    def test_fp32_norms_requires_fsdp_config(self, min_base_cfg):
+        # fsdp_config is the canonical "is_fsdp" signal; fp32_norms requires it.
+        cfg = min_base_cfg | DictDefault(
+            fp32_norms=True,
+            fsdp_version=2,
+        )
+        with pytest.raises(ValueError, match="fp32_norms requires FSDP to be enabled"):
+            validate_config(cfg)
+
+    def test_fp32_norms_requires_fsdp2(self, min_base_cfg):
+        cfg = min_base_cfg | DictDefault(
+            fp32_norms=True,
+            fsdp_version=1,
+            fsdp_config={"reshard_after_forward": True},
+        )
+        with pytest.raises(ValueError, match="fp32_norms requires fsdp_version: 2"):
+            validate_config(cfg)
+
+    def test_fp32_norms_cpu_ram_efficient_loading_ok(self, min_base_cfg):
+        cfg = min_base_cfg | DictDefault(
+            fp32_norms=True,
+            fsdp_version=2,
+            fsdp_config={
+                "reshard_after_forward": True,
+                "cpu_ram_efficient_loading": True,
+            },
+        )
+        validated_cfg = validate_config(cfg)
+        assert validated_cfg.fp32_norms is True
+        assert validated_cfg.fsdp_config.cpu_ram_efficient_loading is True
+
+    def test_fp32_norms_tensor_parallel_ok(self, min_base_cfg):
+        cfg = min_base_cfg | DictDefault(
+            fp32_norms=True,
+            fsdp_version=2,
+            tensor_parallel_size=2,
+            fsdp_config={"reshard_after_forward": True},
+        )
+        validated_cfg = validate_config(cfg)
+        assert validated_cfg.fp32_norms is True
+        assert validated_cfg.tensor_parallel_size == 2
+
+    def test_fp32_norms_fsdp2_ok(self, min_base_cfg):
+        cfg = min_base_cfg | DictDefault(
+            fp32_norms=True,
+            fp32_norm_classes=["AfmoeRMSNorm"],
+            fsdp_version=2,
+            fsdp_config={"reshard_after_forward": True},
+        )
+        validated_cfg = validate_config(cfg)
+        assert validated_cfg.fp32_norms is True
+        assert validated_cfg.fp32_norm_classes == ["AfmoeRMSNorm"]
+
+    def test_fp32_norm_classes_without_fp32_norms_warns(self, min_base_cfg, caplog):
+        cfg = min_base_cfg | DictDefault(
+            fp32_norm_classes=["AfmoeRMSNorm"],
+        )
+        # axolotl.cli.configure_logging() sets propagate=False on the `axolotl`
+        # logger, so pytest caplog (attached to root) can't see records by
+        # default. Temporarily re-enable propagation for this assertion.
+        ax_logger = logging.getLogger("axolotl")
+        old_propagate = ax_logger.propagate
+        ax_logger.propagate = True
+        try:
+            with caplog.at_level("WARNING", logger="axolotl"):
+                validated_cfg = validate_config(cfg)
+        finally:
+            ax_logger.propagate = old_propagate
+        assert not validated_cfg.fp32_norms
+        assert "fp32_norm_classes is set but fp32_norms is not enabled" in caplog.text
+
     def test_muon_fsdp1_rejected(self, min_base_cfg):
         cfg = min_base_cfg | DictDefault(
             optimizer="muon",
diff --git a/tests/utils/schemas/validation/test_multimodal_cpt.py b/tests/utils/schemas/validation/test_multimodal_cpt.py
index f110d6c7e1..785df6f408 100644
--- a/tests/utils/schemas/validation/test_multimodal_cpt.py
+++ b/tests/utils/schemas/validation/test_multimodal_cpt.py
@@ -13,7 +13,7 @@
 def _mm_cpt_cfg(min_base_cfg, **overrides) -> DictDefault:
     base = DictDefault(
         **(
-            min_base_cfg
+            dict(min_base_cfg)
             | {
                 "datasets": None,
                 "pretraining_dataset": [
@@ -30,7 +30,31 @@ def _mm_cpt_cfg(min_base_cfg, **overrides) -> DictDefault:
             }
         )
     )
-    return base | DictDefault(overrides)
+    return DictDefault(dict(base) | dict(overrides))
+
+
+def _mm_cpt_datasets_cfg(min_base_cfg, **overrides) -> DictDefault:
+    base = DictDefault(
+        **(
+            dict(min_base_cfg)
+            | {
+                "datasets": [
+                    {
+                        "path": "some/ds",
+                        "type": "multimodal_pretrain",
+                        "text_column": "caption",
+                        "image_column": "images",
+                        "image_base_dir": "/images",
+                    }
+                ],
+                "pretraining_dataset": None,
+                "streaming": False,
+                "processor_type": "AutoProcessor",
+                "sequence_len": 2048,
+            }
+        )
+    )
+    return DictDefault(dict(base) | dict(overrides))
 
 
 class TestMultimodalCPTGates:
@@ -88,6 +112,64 @@ def test_valid_cfg_passes_and_disables_remove_unused_columns(self, min_base_cfg)
         assert pd.type == "multimodal_pretrain"
         assert pd.image_column == "images"
 
+    def test_valid_datasets_cfg_preserves_mm_keys(self, min_base_cfg):
+        cfg = _mm_cpt_datasets_cfg(min_base_cfg)
+        validated = validate_config(cfg)
+        assert validated.remove_unused_columns is False
+        ds = validated.datasets[0]
+        assert ds.type == "multimodal_pretrain"
+        assert ds.text_column == "caption"
+        assert ds.image_column == "images"
+        assert ds.image_base_dir == "/images"
+
+    def test_datasets_cfg_allows_num_epochs_without_max_steps(self, min_base_cfg):
+        cfg = _mm_cpt_datasets_cfg(min_base_cfg, num_epochs=2)
+        cfg.pop("max_steps", None)
+        validated = validate_config(cfg)
+        assert validated.max_steps is None
+        assert validated.num_epochs == 2
+
+    def test_datasets_cfg_missing_processor_type_raises(self, min_base_cfg):
+        cfg = _mm_cpt_datasets_cfg(min_base_cfg)
+        cfg.pop("processor_type", None)
+        with pytest.raises(ValueError, match="processor_type"):
+            validate_config(cfg)
+
+    def test_datasets_cfg_with_streaming_rejected(self, min_base_cfg):
+        cfg = _mm_cpt_datasets_cfg(min_base_cfg, streaming=True, max_steps=10)
+        with pytest.raises(ValueError, match="non-streaming prepared path"):
+            validate_config(cfg)
+
+    def test_multiple_datasets_entries_rejected(self, min_base_cfg):
+        cfg = _mm_cpt_datasets_cfg(min_base_cfg)
+        cfg.datasets.append({"path": "other/ds", "type": "alpaca"})
+        with pytest.raises(ValueError, match="exactly one `datasets`"):
+            validate_config(cfg)
+
+    def test_datasets_and_pretraining_mm_entries_rejected(self, min_base_cfg):
+        cfg = _mm_cpt_datasets_cfg(
+            min_base_cfg,
+            pretraining_dataset=[{"path": "stream/ds", "type": "multimodal_pretrain"}],
+        )
+        with pytest.raises(
+            ValueError, match="both `pretraining_dataset` and `datasets`"
+        ):
+            validate_config(cfg)
+
+    def test_datasets_cfg_rejects_truncate_strategy(self, min_base_cfg):
+        cfg = _mm_cpt_datasets_cfg(min_base_cfg, excess_length_strategy="truncate")
+        with pytest.raises(ValueError, match="excess_length_strategy: truncate"):
+            validate_config(cfg)
+
+    def test_datasets_cfg_requires_strategy_type_not_multimodal_flag(
+        self, min_base_cfg
+    ):
+        cfg = _mm_cpt_datasets_cfg(min_base_cfg)
+        cfg.datasets[0].pop("type")
+        cfg.datasets[0]["multimodal"] = True
+        with pytest.raises(ValueError, match="type: multimodal_pretrain"):
+            validate_config(cfg)
+
     def test_multimodal_flag_triggers_gates(self, min_base_cfg):
         cfg = _mm_cpt_cfg(min_base_cfg)
         cfg.pretraining_dataset[0]["type"] = "pretrain"
@@ -147,6 +229,7 @@ def test_mm_eval_dataset_via_multimodal_flag(self, min_base_cfg):
         )
         validated = validate_config(cfg)
         td = validated.test_datasets[0]
+        assert td["type"] == "multimodal_pretrain"
         assert td["image_column"] == "imgs2"
         assert td["multimodal"] is True
 
diff --git a/tests/utils/schemas/validation/test_qgalore.py b/tests/utils/schemas/validation/test_qgalore.py
new file mode 100644
index 0000000000..f6325a018e
--- /dev/null
+++ b/tests/utils/schemas/validation/test_qgalore.py
@@ -0,0 +1,40 @@
+"""Validation tests for the Q-GaLore optimizer config gates."""
+
+import pytest
+
+from axolotl.utils.config import validate_config
+from axolotl.utils.dict import DictDefault
+
+
+class TestQGaLoreValidation:
+    """Pydantic-level checks for q_galore_adamw8bit."""
+
+    def test_adapter_rejected(self, min_base_cfg):
+        cfg = min_base_cfg | DictDefault(
+            optimizer="q_galore_adamw8bit",
+            adapter="lora",
+            lora_r=8,
+            lora_alpha=16,
+            lora_target_linear=True,
+        )
+        with pytest.raises(ValueError, match="incompatible with adapter"):
+            validate_config(cfg)
+
+    def test_fsdp1_rejected(self, min_base_cfg):
+        cfg = min_base_cfg | DictDefault(
+            optimizer="q_galore_adamw8bit",
+            fsdp_version=1,
+            fsdp_config={"reshard_after_forward": True},
+        )
+        with pytest.raises(ValueError, match="requires FSDP2"):
+            validate_config(cfg)
+
+    def test_defaults_filled(self, min_base_cfg):
+        cfg = min_base_cfg | DictDefault(
+            optimizer="q_galore_adamw8bit",
+            bf16=True,
+        )
+        cfg = validate_config(cfg)
+        assert cfg.optim_target_modules == ["attn", "mlp"]
+        assert cfg.qgalore_rank == 256
+        assert cfg.qgalore_proj_bits == 4