axolotl-ai-cloud · winglian · Aug 6, 2025 · Jul 6, 2025
diff --git a/.runpod/README.md b/.runpod/README.md
@@ -185,7 +185,6 @@ datasets:
 | `flash_attention`          | `false` | Use flash attention           |
 | `flash_attn_cross_entropy` | `false` | Flash attention cross entropy |
 | `flash_attn_rms_norm`      | `false` | Flash attention RMS norm      |
-| `flash_attn_fuse_qkv`      | `false` | Fuse QKV operations           |
 | `flash_attn_fuse_mlp`      | `false` | Fuse MLP operations           |
 | `sdp_attention`            | `false` | Use scaled dot product        |
 | `s2_attention`             | `false` | Use shifted sparse attention  |

diff --git a/.runpod/src/config/config.yaml b/.runpod/src/config/config.yaml
@@ -296,7 +296,6 @@
 # flash_attention:
 # flash_attn_cross_entropy:  # Whether to use flash-attention cross entropy implementation - advanced use only
 # flash_attn_rms_norm:  # Whether to use flash-attention rms norm implementation - advanced use only
-# flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation
 # flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
 # # Whether to use scaled-dot-product attention
 # # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
@@ -541,7 +540,6 @@ xformers_attention: ${XFORMERS_ATTENTION}
 flash_attention: ${FLASH_ATTENTION}
 flash_attn_cross_entropy: ${FLASH_ATTN_CROSS_ENTROPY}
 flash_attn_rms_norm: ${FLASH_ATTN_RMS_NORM}
-flash_attn_fuse_qkv: ${FLASH_ATTN_FUSE_QKV}
 flash_attn_fuse_mlp: ${FLASH_ATTN_FUSE_MLP}
 sdp_attention: ${SDP_ATTENTION}
 s2_attention: ${S2_ATTENTION}

diff --git a/examples/archived/stablelm-2/1.6b/fft.yml b/examples/archived/stablelm-2/1.6b/fft.yml
@@ -47,7 +47,6 @@ logging_steps: 1
 flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
-flash_attn_fuse_qkv: false
 flash_attn_fuse_mlp: true
 
 warmup_ratio: 0.1

diff --git a/examples/llama-2/fft_optimized.yml b/examples/llama-2/fft_optimized.yml
@@ -45,7 +45,6 @@ logging_steps: 1
 flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
-flash_attn_fuse_qkv: false
 flash_attn_fuse_mlp: true
 
 warmup_ratio: 0.1

diff --git a/examples/llama-2/lisa.yml b/examples/llama-2/lisa.yml
@@ -49,7 +49,6 @@ logging_steps: 1
 flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
-flash_attn_fuse_qkv: false
 flash_attn_fuse_mlp: true
 
 warmup_ratio: 0.1

diff --git a/src/axolotl/loaders/patch_manager.py b/src/axolotl/loaders/patch_manager.py
@@ -330,31 +330,21 @@ def _patch_loss_llama(self):
 
             patch_self_attn_lora()
 
-    def _patch_llama_flash_attention(self, packed=False):
+    def _patch_llama_flash_attention(self):
         """Apply Flash Attention patches for LLaMA models."""
         from axolotl.monkeypatch.llama_attn_hijack_flash import (
             replace_llama_attn_with_flash_attn,
         )
 
-        if packed:
-            if self.cfg.device not in ["mps", "cpu"] and not self.inference:
-                LOG.info("patching with flash attention for sample packing")
-                replace_llama_attn_with_flash_attn(
-                    packed=True,
-                    cross_entropy=self.cfg.flash_attn_cross_entropy,
-                    rms_norm=self.cfg.flash_attn_rms_norm,
-                )
-        elif self.cfg.s2_attention:
+        if self.cfg.s2_attention:
             LOG.info("patching w/ flash-enabled, shifted-sparse attention")
             replace_llama_attn_with_flash_attn(
-                packed=False,
                 cross_entropy=self.cfg.flash_attn_cross_entropy,
                 rms_norm=self.cfg.flash_attn_rms_norm,
                 use_shifted_sparse_attn=True,
             )
         elif self.cfg.flash_attn_cross_entropy or self.cfg.flash_attn_rms_norm:
             replace_llama_attn_with_flash_attn(
-                packed=False,
                 cross_entropy=self.cfg.flash_attn_cross_entropy,
                 rms_norm=self.cfg.flash_attn_rms_norm,
             )
@@ -385,7 +375,7 @@ def _patch_llama_derived_model(self):
             and self.cfg.sample_packing
         ):
             if self.cfg.flash_attention:
-                self._patch_llama_flash_attention(packed=self.cfg.sample_packing)
+                self._patch_llama_flash_attention()
             elif self.cfg.xformers_attention:
                 self._patch_llama_xformers_attention()
             elif self.cfg.sample_packing:
@@ -408,17 +398,12 @@ def _apply_llama_flash_attn_patches(self, model):
             from axolotl.monkeypatch.llama_attn_hijack_flash import (
                 is_xformers_swiglu_available,
                 replace_llama_mlp_with_swiglu,
-                replace_llama_qkv_with_fused,
             )
 
             if self.cfg.flash_attn_fuse_mlp and is_xformers_swiglu_available():
                 LOG.info("Patching with SwiGLU...")
                 replace_llama_mlp_with_swiglu(model)
 
-            if self.cfg.flash_attn_fuse_qkv:
-                LOG.info("Patching with fused QKV...")
-                replace_llama_qkv_with_fused(model)
-
     def _apply_unsloth_patches(self, model):
         """Apply unsloth optimization patches."""
         if self.cfg.unsloth_lora_mlp: