diff --git a/setup.py b/setup.py
index 6306cefea2..19a97a60b8 100644
--- a/setup.py
+++ b/setup.py
@@ -404,6 +404,8 @@ def get_rocm_agent_arch():
         arches = subprocess.check_output([exec_path], universal_newlines=True)
         arch_list = arches.strip().split()
         return arch_list[0]
+    elif torch.cuda.is_available() and torch.version.hip:
+        return torch.cuda.get_device_properties(0).gcnArchName.split(":")[0]
     else:
         return "gfx942"
 
@@ -579,7 +581,9 @@ def get_extensions():
         and (torch.cuda.is_available() or os.getenv("HIP_ARCHITECTURE", "") != "")
     ):
         rename_cpp_cu(source_hip)
-        hip_version = get_hip_version(ROCM_HOME)
+        hip_version = get_hip_version(
+            None if platform.system() == "Windows" else ROCM_HOME
+        )
 
         source_hip_cu = []
         for ff in source_hip:
@@ -597,27 +601,48 @@ def get_extensions():
         ]
 
         cc_flag = ["-DBUILD_PYTHON_PACKAGE"]
-        use_rtn_bf16_convert = os.getenv("ENABLE_HIP_FMHA_RTN_BF16_CONVERT", "0")
-        if use_rtn_bf16_convert == "1":
-            cc_flag += ["-DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=3"]
-        else:
-            cc_flag += ["-DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=2"]
-
         arch = os.getenv("HIP_ARCHITECTURE", "native")
 
         if arch == "native":
             arch = get_rocm_agent_arch()
 
-        if arch not in ["gfx908", "gfx90a", "gfx942", "gfx950"]:
+        use_rtn_bf16_convert = os.getenv("ENABLE_HIP_FMHA_RTN_BF16_CONVERT")
+        if use_rtn_bf16_convert is None:
+            # RDNA CK bf16 forward output can be reused by FlashAttention backward;
+            # truncating fp32 accumulators is not accurate enough for that pairing.
+            use_rtn_bf16_convert = "1" if arch.startswith(("gfx11", "gfx12")) else "0"
+        if use_rtn_bf16_convert == "1":
+            cc_flag += ["-DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=3"]
+        else:
+            cc_flag += ["-DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=2"]
+
+        if (
+            arch not in ["gfx908", "gfx90a", "gfx942", "gfx950"]
+            and not arch.startswith(("gfx11", "gfx12"))
+        ):
             raise ValueError(f"Not supported AMD GPU arch: {arch}")
 
         if arch == "gfx950":
             cc_flag += ["-DFMHA_BUILD_ON_GFX950"]
+        elif arch.startswith("gfx11"):
+            cc_flag += ["-DFMHA_BUILD_ON_GFX11"]
+        elif arch.startswith("gfx12"):
+            cc_flag += ["-DFMHA_BUILD_ON_GFX12"]
 
         offload_compress_flag = []
         if hip_version >= "6.2.":
             offload_compress_flag = ["--offload-compress"]
 
+        if platform.system() == "Windows":
+            cc_flag += [
+                "-Wno-deprecated-declarations",
+                "-Wno-unused-command-line-argument",
+                # CK headers use C++20 attributes such as [[no_unique_address]].
+                # Windows HIP host compilation warns on that spelling under
+                # -Werror, even though the device-side gfx12 build is valid.
+                "-Wno-unknown-attributes",
+            ]
+
         extra_compile_args["nvcc"] = [
             "-O3",
             "-std=c++20",
@@ -697,7 +722,31 @@ def __init__(self, *args, **kwargs) -> None:
         self.pkg_name = "xformers"
         super().__init__(*args, **kwargs)
 
+    def _use_windows_link_response_files(self) -> None:
+        if platform.system() != "Windows":
+            return
+
+        original_spawn = self.compiler.spawn
+
+        def spawn_with_response_file(cmd):
+            tool = os.path.basename(str(cmd[0])).lower() if cmd else ""
+            command_length = sum(len(str(arg)) + 1 for arg in cmd)
+            if tool not in {"link.exe", "lld-link.exe"} or command_length < 30_000:
+                return original_spawn(cmd)
+
+            rsp_path = Path(self.build_temp, f"xformers_link_{id(cmd)}.rsp").resolve()
+            rsp_path.parent.mkdir(parents=True, exist_ok=True)
+            rsp_path.write_text(
+                "\n".join(subprocess.list2cmdline([str(arg)]) for arg in cmd[1:])
+                + "\n"
+            )
+
+            return original_spawn([cmd[0], f"@{rsp_path}"])
+
+        self.compiler.spawn = spawn_with_response_file
+
     def build_extensions(self) -> None:
+        self._use_windows_link_response_files()
         super().build_extensions()
 
         # Fix incorrect output names caused by py_limited_api=True on Windows. see item #1272
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index fe2e29fa68..457f153b69 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit fe2e29fa68ce52eda49506d7e59738ba311de986
+Subproject commit 457f153b69472b84fa1819d384ee451632091467
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_dispatch.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_dispatch.h
index ba6000087a..5a1654fb96 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_dispatch.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_dispatch.h
@@ -158,12 +158,19 @@ struct batched_infer_mask_bias_dropout_dispatch {
     };
 #endif
 
+#if defined(FMHA_BUILD_ON_GFX11) || defined(FMHA_BUILD_ON_GFX12)
+    // Current RDNA3/4 CK FMHA builds use the sync pipeline; the async
+    // global-to-LDS path fails for these targets, so keep it uninstantiated
+    // instead of relying on a core CK fallback.
+    constexpr bool enable_async_pipeline = false;
+#else
     const bool enable_async_pipeline = []() {
       const char* env_p = std::getenv("FMHA_ENABLE_ASYNC_PIPELINE");
       if (env_p == nullptr)
         return false;
       return static_cast<bool>(atoi(env_p));
     }();
+#endif
 
     const bool use_async_pipeline =
         (!kHasBias && (param.K % 8 == 0) && (param.Kv % 8 == 0) &&
@@ -267,7 +274,9 @@ struct batched_infer_mask_bias_dropout_dispatch {
               RunWithKernel<FmhaKernel>(param, stream);
             }
           });
-    } else {
+    }
+#if !defined(FMHA_BUILD_ON_GFX11) && !defined(FMHA_BUILD_ON_GFX12)
+    else {
       using FmhaShape = typename FmhaFwdCommonShape<MaxK, MTile>::Type;
 
       const bool pad_seqlen_k = !(param.N % FmhaShape::kN0 == 0);
@@ -307,7 +316,9 @@ struct batched_infer_mask_bias_dropout_dispatch {
           /* runtime will never get here, so no codes to compile */
         };
       });
-    };
+    }
+#endif
+    ;
   };
 
   template <typename FmhaKernel>
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h
index 038a129ee4..5c2cc257ab 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h
@@ -70,7 +70,13 @@ struct FmhaFwdCommonBlockTile<128, 128> {
 
 template <ck_tile::index_t MTile>
 struct FmhaFwdCommonBlockTile<256, MTile> {
+#if defined(FMHA_BUILD_ON_GFX11)
+  // gfx11 WMMA duplicates Q data across subgroups, so a 128x256 Q tile
+  // can exceed static_for's 256-iteration limit. Keep hdim-256 at M=64.
+  using tile_lengths = ck_tile::sequence<64, 128, 32, 256, 32, 256>;
+#else
   using tile_lengths = ck_tile::sequence<128, 128, 32, 256, 32, 256>;
+#endif
   using gemm0_warps = ck_tile::sequence<4, 1, 1>;
   using gemm1_warps = ck_tile::sequence<4, 1, 1>;
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_dispatch.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_dispatch.h
index 9cc2ce6b55..8909b765c6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_dispatch.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_dispatch.h
@@ -163,12 +163,19 @@ struct grouped_infer_mask_bias_dropout_dispatch {
     constexpr bool kPadSeqLenQ = false;
     constexpr bool kPadSeqLenK = true;
 
+#if defined(FMHA_BUILD_ON_GFX11) || defined(FMHA_BUILD_ON_GFX12)
+    // Current RDNA3/4 CK FMHA builds use the sync pipeline; the async
+    // global-to-LDS path fails for these targets, so keep it uninstantiated
+    // instead of relying on a core CK fallback.
+    constexpr bool enable_async_pipeline = false;
+#else
     const bool enable_async_pipeline = []() {
       const char* env_p = std::getenv("FMHA_ENABLE_ASYNC_PIPELINE");
       if (env_p == nullptr)
         return false;
       return static_cast<bool>(atoi(env_p));
     }();
+#endif
 
     const bool use_async_pipeline =
         (!kHasBias && (param.K % 8 == 0) && (param.Kv % 8 == 0) &&
@@ -261,7 +268,9 @@ struct grouped_infer_mask_bias_dropout_dispatch {
               RunWithKernel<FmhaKernel>(param, stream);
             }
           });
-    } else {
+    }
+#if !defined(FMHA_BUILD_ON_GFX11) && !defined(FMHA_BUILD_ON_GFX12)
+    else {
       if constexpr (MaxK <= 128 && MTile <= 128) {
         using FmhaShape = typename FmhaFwdCommonShape<MaxK, MTile>::Type;
 
@@ -297,7 +306,9 @@ struct grouped_infer_mask_bias_dropout_dispatch {
       } else {
         /* runtime will never get here, so no codes to compile */
       };
-    };
+    }
+#endif
+    ;
   };
 
   template <typename FmhaKernel>
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_warp_tile_define.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_warp_tile_define.h
index 55aac7814f..4f5a8852b6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_warp_tile_define.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_warp_tile_define.h
@@ -8,6 +8,18 @@
 
 #include <ck_tile/core.hpp>
 
+#if defined(FMHA_BUILD_ON_GFX11) || defined(FMHA_BUILD_ON_GFX12)
+// RDNA WMMA only registers 16x16x16 in CK tile's warp_gemm dispatcher for
+// fp16 / bf16 (and fp8 / int8). The xformers wrapper layer was authored for
+// CDNA MFMA shapes (32x32x16 and 16x16x32). To keep the per-arch *_setting.h
+// headers small, we alias all three names down to the WMMA-supported shape.
+// Block tile geometry is unchanged — each warp simply issues more WMMA
+// instructions to cover the same output tile.
+using WarpTile_32x32x16 = ck_tile::sequence<16, 16, 16>;
+using WarpTile_16x16x32 = ck_tile::sequence<16, 16, 16>;
+using WarpTile_16x16x16 = ck_tile::sequence<16, 16, 16>;
+#else
 using WarpTile_32x32x16 = ck_tile::sequence<32, 32, 16>;
 using WarpTile_16x16x32 = ck_tile::sequence<16, 16, 32>;
 using WarpTile_16x16x16 = ck_tile::sequence<16, 16, 16>;
+#endif
diff --git a/xformers/ops/differentiable_collectives.py b/xformers/ops/differentiable_collectives.py
index 9d87629dd1..e4a49cecb9 100644
--- a/xformers/ops/differentiable_collectives.py
+++ b/xformers/ops/differentiable_collectives.py
@@ -3,6 +3,7 @@
 # This source code is licensed under the BSD license found in the
 # LICENSE file in the root directory of this source tree.
 
+from __future__ import annotations
 
 from typing import Optional, Tuple
 
diff --git a/xformers/ops/fmha/flash.py b/xformers/ops/fmha/flash.py
index 1201b3360c..8b6701e683 100644
--- a/xformers/ops/fmha/flash.py
+++ b/xformers/ops/fmha/flash.py
@@ -73,7 +73,7 @@
 
     FLASH_VERSION = flash_attn.__version__
     FLASH_VER_MIN = (2, 7, 1)
-    FLASH_VER_LAST = (2, 8, 0)  # last supported, inclusive
+    FLASH_VER_LAST = (2, 8, 4)  # last supported, inclusive
     flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:3])
     if (
         flash_ver_parsed < FLASH_VER_MIN or flash_ver_parsed > FLASH_VER_LAST
@@ -358,6 +358,68 @@ def _create_dq_dk_dv(
         return torch.empty_like(query), torch.empty_like(key), torch.empty_like(value)
 
 
+def _pack_kv_for_seqused_k_flash(
+    attn_bias: AttentionBias,
+    key: torch.Tensor,
+    value: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
+    key_chunks: List[torch.Tensor] = []
+    value_chunks: List[torch.Tensor] = []
+    cu_seqlens_k = [0]
+
+    if isinstance(
+        attn_bias, (PagedBlockDiagonalGappyKeysMask, PagedBlockDiagonalPaddedKeysMask)
+    ):
+        assert key.ndim == 4
+        assert value.ndim == 4
+        for row_idx, seqlen in enumerate(attn_bias.k_seqinfo.seqlen_py):
+            page_indices = attn_bias.block_tables[row_idx].to(
+                device=key.device, dtype=torch.long
+            )
+            row_key = key.index_select(0, page_indices).reshape(
+                [-1, *key.shape[2:]]
+            )
+            row_value = value.index_select(0, page_indices).reshape(
+                [-1, *value.shape[2:]]
+            )
+            if isinstance(attn_bias, PagedBlockDiagonalGappyKeysMask):
+                start = attn_bias.k_seqinfo.seqstart_py[row_idx]
+                end = seqlen
+            else:
+                start = 0
+                end = seqlen
+            key_chunks.append(row_key[start:end])
+            value_chunks.append(row_value[start:end])
+            cu_seqlens_k.append(cu_seqlens_k[-1] + end - start)
+    else:
+        assert key.ndim == 3
+        assert value.ndim == 3
+        for start, end in attn_bias.k_seqinfo.intervals():
+            key_chunks.append(key[start:end])
+            value_chunks.append(value[start:end])
+            cu_seqlens_k.append(cu_seqlens_k[-1] + end - start)
+
+    if key_chunks:
+        key = torch.cat(key_chunks, dim=0)
+        value = torch.cat(value_chunks, dim=0)
+    else:
+        key = key.new_empty([0, *key.shape[-2:]])
+        value = value.new_empty([0, *value.shape[-2:]])
+
+    cu_seqlens_k_tensor = torch.tensor(
+        cu_seqlens_k, dtype=torch.int32, device=key.device
+    )
+    max_seqlen_k = max(
+        (end - start for start, end in zip(cu_seqlens_k[:-1], cu_seqlens_k[1:])),
+        default=0,
+    )
+    return key, value, cu_seqlens_k_tensor, max_seqlen_k
+
+
+def _is_rocm_device(device: torch.device) -> bool:
+    return torch.version.hip is not None and device.type == "cuda"
+
+
 def _convert_input_format(
     inp: Inputs,
     supports_mqa: bool,
@@ -446,6 +508,23 @@ def fold(x):
             num_pages = value.shape[0] // attn_bias.page_size
             key = key.view(num_pages, attn_bias.page_size, *key.shape[1:])
             value = value.view(num_pages, attn_bias.page_size, *value.shape[1:])
+        if _is_rocm_device(query.device) and isinstance(
+            attn_bias,
+            (
+                BlockDiagonalGappyKeysMask,
+                BlockDiagonalPaddedKeysMask,
+                PagedBlockDiagonalGappyKeysMask,
+                PagedBlockDiagonalPaddedKeysMask,
+            ),
+        ):
+            # The ROCm CK flash-attn varlen path accepts seqused_k but does not
+            # consume it in the CK host wrapper, so padded/gappy K/V layouts can
+            # be read as real tokens. Compacting logical K/V sequences lets us
+            # use the ordinary cu_seqlens path.
+            key, value, cu_seqlen_k, max_seqlen_k = _pack_kv_for_seqused_k_flash(
+                attn_bias, key, value
+            )
+            seqused_k = None
 
     new_inp = Inputs(
         query=query,
@@ -604,6 +683,13 @@ class FwOp(AttentionFwOpBase):
     CUDA_MINIMUM_COMPUTE_CAPABILITY = (8, 0)
     SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
     SUPPORTED_MAX_K = 256
+    # ROCm flash-attn bf16 forward (observed with 2.8.4 on gfx1201) can
+    # exceed the default absolute tolerance while staying within relative
+    # tolerance, so keep the relaxed threshold scoped to HIP bf16 only.
+    ERROR_ATOL = {
+        **AttentionFwOpBase.ERROR_ATOL,
+        **({torch.bfloat16: 4e-2} if torch.version.hip is not None else {}),
+    }
     SUPPORTED_ATTN_BIAS_TYPES: Iterable[Any] = (
         type(None),
         LowerTriangularMask,
@@ -683,6 +769,7 @@ def apply(
             block_tables = (
                 inp.attn_bias.block_tables
                 if isinstance(inp.attn_bias, PagedBlockDiagonalPaddedKeysMask)
+                and inp.key.ndim == 4
                 else None
             )
             out, softmax_lse, rng_state = cls.OPERATOR(
@@ -731,9 +818,9 @@ def apply(
             out=out,
             lse=_post_process_lse(softmax_lse, inp, original_query_shape),
         )
+        ctx.rng_state = rng_state
         if inp.p != 0.0:
             ctx.op_bw = BwOp
-            ctx.rng_state = rng_state
         return (out, ctx)
 
 
@@ -817,6 +904,14 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
         ]
         assert grad.dtype in cls.SUPPORTED_DTYPES
 
+        rng_state = ctx.rng_state
+        if rng_state is None:
+            if inp.p != 0.0:
+                raise RuntimeError(
+                    "Flash-Attention backward requires an RNG state when dropout is enabled"
+                )
+            rng_state = torch.zeros((2,), dtype=torch.int64, device=inp.query.device)
+
         if inp.query.numel() and inp.key.numel():
             win_left, win_right = _window_size(inp.attn_bias)
             grads = Gradients(
@@ -837,7 +932,7 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
                     _is_causal(inp.attn_bias),
                     window_left=win_left,
                     window_right=win_right,
-                    rng_state=ctx.rng_state if inp.p > 0.0 else None,
+                    rng_state=rng_state,
                 )
             )
         else:
diff --git a/xformers/ops/seqpar.py b/xformers/ops/seqpar.py
index b81eaf3304..3114e7495b 100644
--- a/xformers/ops/seqpar.py
+++ b/xformers/ops/seqpar.py
@@ -7,7 +7,15 @@
 from typing import Callable, List, Tuple
 
 import torch
-from torch.distributed.distributed_c10d import _resolve_process_group
+
+# Some torch builds (e.g. Windows ROCm / TheRock) ship a stripped-down
+# torch.distributed without distributed_c10d. Importing this module must not
+# break xformers for single-GPU users; the fallback is only reachable if one
+# of the sequence-parallel ops below is actually called.
+try:
+    from torch.distributed.distributed_c10d import _resolve_process_group
+except ImportError:
+    _resolve_process_group = None  # type: ignore[assignment]
 
 from .differentiable_collectives import (
     gather_along_first_dim,
diff --git a/xformers/ops/sequence_parallel_fused_ops.py b/xformers/ops/sequence_parallel_fused_ops.py
index 1f76667382..179125abd6 100644
--- a/xformers/ops/sequence_parallel_fused_ops.py
+++ b/xformers/ops/sequence_parallel_fused_ops.py
@@ -9,7 +9,15 @@
 import torch
 import torch.distributed as dist
 import torch.multiprocessing.reductions
-from torch.distributed._symmetric_memory import get_symm_mem_workspace
+
+# torch.distributed._symmetric_memory is absent from some torch builds
+# (e.g. Windows ROCm / TheRock). Guard the import so xformers stays usable
+# on single-GPU; the None fallback is only reached if the fused ops below
+# are actually invoked.
+try:
+    from torch.distributed._symmetric_memory import get_symm_mem_workspace
+except ImportError:
+    get_symm_mem_workspace = None  # type: ignore[assignment]
 
 OP_FINISHED_CHANNEL = 0
 COMMS_READY_CHANNEL = 1