diff --git a/3rdparty/composable_kernel b/3rdparty/composable_kernel
index e339101e9c..c9f112b026 160000
--- a/3rdparty/composable_kernel
+++ b/3rdparty/composable_kernel
@@ -1 +1 @@
-Subproject commit e339101e9c9961fe1bc8305d5c316b39d1980d3e
+Subproject commit c9f112b0267625016a58ce3465ee34232c85812b
diff --git a/aiter/aot/test/matmul_fp16.py b/aiter/aot/test/matmul_fp16.py
index f12c1623c4..1d419f67c8 100644
--- a/aiter/aot/test/matmul_fp16.py
+++ b/aiter/aot/test/matmul_fp16.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton
 import triton.language as tl
diff --git a/aiter/dist/device_communicators/communicator_cuda.py b/aiter/dist/device_communicators/communicator_cuda.py
index b37af44810..4094ca1be6 100644
--- a/aiter/dist/device_communicators/communicator_cuda.py
+++ b/aiter/dist/device_communicators/communicator_cuda.py
@@ -155,7 +155,10 @@ def all_reduce(
             qr_comm is not None
             and not qr_comm.disabled
             and qr_comm.should_quick_allreduce(input_)
-            and (input_.nelement() * input_.element_size()) >= 4*1024*1024 # input shape should be such that quick reduce will show benefits.
+            and (input_.nelement() * input_.element_size())
+            >= 4
+            * 1024
+            * 1024  # input shape should be such that quick reduce will show benefits.
             # input shape estimated at 2 * max concurrency for now. if performance issues, subject to change
         ):
             out = qr_comm.quick_all_reduce(input_)
diff --git a/aiter/jit/core.py b/aiter/jit/core.py
index 73849a05a5..7e446636fc 100644
--- a/aiter/jit/core.py
+++ b/aiter/jit/core.py
@@ -818,7 +818,7 @@ def wrapper(*args, custom_build_args={}, **kwargs):
                 if module is None:
                     try:
                         module = get_module(md_name)
-                    except Exception as e:
+                    except Exception:
                         md = custom_build_args.get("md_name", md_name)
                         module = get_module(md)
             except ModuleNotFoundError:
diff --git a/aiter/ops/mha.py b/aiter/ops/mha.py
index 2674a772b1..0005321c17 100644
--- a/aiter/ops/mha.py
+++ b/aiter/ops/mha.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Any, Optional, Tuple
 
@@ -973,6 +973,9 @@ def cmdGenFunc_mha_batch_prefill(
     k_descale: Optional[Tensor] = None,
     v_descale: Optional[Tensor] = None,
     gen: Optional[Generator] = None,
+    kv_last_page_lens: Optional[Tensor] = None,
+    block_table: Optional[Tensor] = None,
+    seqlen_k: Optional[Tensor] = None,
 ):
     # causal=true is the same as causal=false in this case
     causal = is_causal
@@ -2598,15 +2601,26 @@ def mha_batch_prefill_fake_tensors(
     return_softmax_lse: bool,
     return_dropout_randval: bool,
     out: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
     alibi_slopes: Optional[torch.Tensor] = None,
     q_descale: Optional[torch.Tensor] = None,
     k_descale: Optional[torch.Tensor] = None,
     v_descale: Optional[torch.Tensor] = None,
     gen: Optional[Generator] = None,
+    kv_last_page_lens: Optional[torch.Tensor] = None,
+    block_table: Optional[torch.Tensor] = None,
+    seqlen_k: Optional[torch.Tensor] = None,
 ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
     # ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    is_vectorized = k.dim() == 5 and v.dim() == 5
+    is_linear = (k.dim() == 4 and v.dim() == 4) or (k.dim() == 3 and v.dim() == 3)
+    if not (is_vectorized or is_linear):
+        raise ValueError(
+            "Batch prefill requires 5D vectorized, 4D linear, or 3D linear (page_size=1) K/V"
+            " tensors"
+        )
     num_heads = q.size(1)  # num_heads = q.sizes()[1]
-    head_size_v = v.size(2)  # head_size_v = v.size(2)
+    head_size_v = v.size(-2) if is_vectorized else v.size(-1)
     total_q = q.size(0)  # total_q = q.size(0)
 
     if out is None:
@@ -2671,6 +2685,9 @@ def mha_batch_prefill(
     q_descale: Optional[torch.Tensor] = None,
     k_descale: Optional[torch.Tensor] = None,
     v_descale: Optional[torch.Tensor] = None,
+    kv_last_page_lens: Optional[Tensor] = None,
+    block_table: Optional[Tensor] = None,
+    seqlen_k: Optional[Tensor] = None,
     gen: Optional[Generator] = None,
 ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ...
 
@@ -2696,6 +2713,9 @@ def _mha_batch_prefill(
     return_softmax: bool = False,
     zero_tensors: bool = False,
     out: torch.Tensor = None,
+    kv_last_page_lens: torch.Tensor = None,
+    block_table: torch.Tensor = None,
+    seqlen_k: torch.Tensor = None,
     q_descale: Optional[torch.Tensor] = None,
     k_descale: Optional[torch.Tensor] = None,
     v_descale: Optional[torch.Tensor] = None,
@@ -2726,6 +2746,9 @@ def _mha_batch_prefill(
         q_descale,
         k_descale,
         v_descale,
+        kv_last_page_lens,
+        block_table,
+        seqlen_k,
         # custom_build_args={"md_name": md_name, "blob_gen_cmd": blob_gen_cmd},
     )
     return out, softmax_lse, S_dmask, rng_state
@@ -2750,19 +2773,44 @@ def mha_batch_prefill_func(
     return_lse=False,
     return_attn_probs=False,
     out=None,
+    kv_last_page_lens=None,
+    block_table=None,
+    seqlen_k=None,
     q_descale=None,
     k_descale=None,
     v_descale=None,
 ):
     if softmax_scale is None:
         softmax_scale = q.shape[-1] ** (-0.5)
-    head_size_q_og = q.size(2)
-    head_size_v_og = v.size(2)
-    if head_size_q_og % 8 != 0:
-        q = torch.nn.functional.pad(q, [0, 8 - head_size_q_og % 8])
-        k = torch.nn.functional.pad(k, [0, 8 - head_size_q_og % 8])
-    if head_size_v_og % 8 != 0:
-        v = torch.nn.functional.pad(v, [0, 8 - head_size_v_og % 8])
+    head_size_q_og = q.size(-1)
+    # 16 bytes = 128-bit (dwordx4) vector width assumed by CK kernels.
+    k_vector_size = 16 // k.element_size()
+    is_vectorized = k.dim() == 5 and v.dim() == 5
+    is_linear = (k.dim() == 4 and v.dim() == 4) or (k.dim() == 3 and v.dim() == 3)
+    if not (is_vectorized or is_linear):
+        raise ValueError(
+            "Batch prefill requires 5D vectorized, 4D linear, or 3D linear (page_size=1) K/V"
+            " tensors"
+        )
+    head_size_v_og = v.size(-2) if is_vectorized else v.size(-1)
+    if head_size_q_og % k_vector_size != 0 or head_size_v_og % k_vector_size != 0:
+        raise ValueError("Batch prefill requires head size divisible by vector size")
+    if is_vectorized:
+        if k.size(-3) * k_vector_size != head_size_q_og:
+            raise ValueError("K vectorized layout does not match Q head size")
+        if k.size(-2) % k_vector_size != 0:
+            raise ValueError(
+                "Vectorized KV requires page size divisible by vector size"
+            )
+        if v.size(-1) != k_vector_size:
+            raise ValueError("Vectorized KV requires last dim equal to vector size")
+    else:
+        if k.size(-1) != head_size_q_og:
+            raise ValueError("K linear layout does not match Q head size")
+        if k.size(1) != v.size(1) or k.size(2) != v.size(2):
+            raise ValueError("K/V linear layout must match page size and head count")
+    if k.stride(-1) != 1 or v.stride(-1) != 1:
+        raise ValueError("Batch prefill requires K/V with contiguous last dimension")
     out_padded, softmax_lse, S_dmask, rng_state = _mha_batch_prefill(
         q,
         k,
@@ -2782,6 +2830,9 @@ def mha_batch_prefill_func(
         return_lse=return_lse,
         return_softmax=return_attn_probs and dropout_p > 0,
         out=out,
+        kv_last_page_lens=kv_last_page_lens,
+        block_table=block_table,
+        seqlen_k=seqlen_k,
         q_descale=q_descale,
         k_descale=k_descale,
         v_descale=v_descale,
diff --git a/aiter/ops/moe_op.py b/aiter/ops/moe_op.py
index a8f36637d7..3068dff1e1 100755
--- a/aiter/ops/moe_op.py
+++ b/aiter/ops/moe_op.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 from torch import Tensor
diff --git a/aiter/ops/triton/__init__.py b/aiter/ops/triton/__init__.py
index fc10be22fb..d09175d91c 100644
--- a/aiter/ops/triton/__init__.py
+++ b/aiter/ops/triton/__init__.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import importlib.util
 import sys
@@ -42,7 +42,7 @@
     )
 
 """
-These following help implement backward-compatibility 
+The following help implement backward-compatibility
 for modules that were reorganized so that external repos (like sglang for example),
 which depend on the old module names, can still import it the old "way" of importing.
 """
diff --git a/aiter/ops/triton/_triton_kernels/attention/chunked_pa_prefill.py b/aiter/ops/triton/_triton_kernels/attention/chunked_pa_prefill.py
index 889d3631c9..1429ca99a2 100644
--- a/aiter/ops/triton/_triton_kernels/attention/chunked_pa_prefill.py
+++ b/aiter/ops/triton/_triton_kernels/attention/chunked_pa_prefill.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 # The kernel in this file is adapted from the VLLM project:
 # https://github.com/ROCm/vllm/blob/aiter_integration_final/vllm/attention/ops/chunked_prefill_paged_decode.py
diff --git a/aiter/ops/triton/_triton_kernels/attention/extend_attention.py b/aiter/ops/triton/_triton_kernels/attention/extend_attention.py
index 9ba1d04097..c71908e497 100644
--- a/aiter/ops/triton/_triton_kernels/attention/extend_attention.py
+++ b/aiter/ops/triton/_triton_kernels/attention/extend_attention.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023-2025 SGLang Team
+# Copyright (C) 2023-2026 SGLang Team
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
diff --git a/aiter/ops/triton/_triton_kernels/attention/hstu_attention.py b/aiter/ops/triton/_triton_kernels/attention/hstu_attention.py
index 59ac5ab620..4eea668c74 100644
--- a/aiter/ops/triton/_triton_kernels/attention/hstu_attention.py
+++ b/aiter/ops/triton/_triton_kernels/attention/hstu_attention.py
@@ -1,5 +1,5 @@
 # Copyright (C) Advanced Micro Devices, Inc. All rights reserved.
-# Copyright (C) 2024-2025, The vLLM team.
+# Copyright (C) 2024-2026, The vLLM team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/aiter/ops/triton/_triton_kernels/attention/lean_atten.py b/aiter/ops/triton/_triton_kernels/attention/lean_atten.py
index 73fd70f430..20dfe36610 100644
--- a/aiter/ops/triton/_triton_kernels/attention/lean_atten.py
+++ b/aiter/ops/triton/_triton_kernels/attention/lean_atten.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 """
 Lean Attention
diff --git a/aiter/ops/triton/_triton_kernels/attention/mha.py b/aiter/ops/triton/_triton_kernels/attention/mha.py
index b3acb81f44..610b36c30a 100644
--- a/aiter/ops/triton/_triton_kernels/attention/mha.py
+++ b/aiter/ops/triton/_triton_kernels/attention/mha.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import functools
 import json
diff --git a/aiter/ops/triton/_triton_kernels/attention/mha_fused_bwd.py b/aiter/ops/triton/_triton_kernels/attention/mha_fused_bwd.py
index da98768b2c..f774867ebc 100644
--- a/aiter/ops/triton/_triton_kernels/attention/mha_fused_bwd.py
+++ b/aiter/ops/triton/_triton_kernels/attention/mha_fused_bwd.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import functools
 import json
diff --git a/aiter/ops/triton/_triton_kernels/attention/mha_onekernel_bwd.py b/aiter/ops/triton/_triton_kernels/attention/mha_onekernel_bwd.py
index c6015c2d30..b845b781f0 100644
--- a/aiter/ops/triton/_triton_kernels/attention/mha_onekernel_bwd.py
+++ b/aiter/ops/triton/_triton_kernels/attention/mha_onekernel_bwd.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import functools
 import json
diff --git a/aiter/ops/triton/_triton_kernels/attention/mla_decode_rope.py b/aiter/ops/triton/_triton_kernels/attention/mla_decode_rope.py
index ed783b0619..acc03bf28d 100644
--- a/aiter/ops/triton/_triton_kernels/attention/mla_decode_rope.py
+++ b/aiter/ops/triton/_triton_kernels/attention/mla_decode_rope.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
-# Copyright (C) 2023-2025 SGLang Team
+# Copyright (C) 2023-2026 SGLang Team
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
diff --git a/aiter/ops/triton/_triton_kernels/attention/pa_decode.py b/aiter/ops/triton/_triton_kernels/attention/pa_decode.py
index 4499e9b234..4c5d62b5a7 100644
--- a/aiter/ops/triton/_triton_kernels/attention/pa_decode.py
+++ b/aiter/ops/triton/_triton_kernels/attention/pa_decode.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton
 import triton.language as tl
diff --git a/aiter/ops/triton/_triton_kernels/attention/pa_mqa_logits.py b/aiter/ops/triton/_triton_kernels/attention/pa_mqa_logits.py
index 0bbe5f6096..d6df12174a 100644
--- a/aiter/ops/triton/_triton_kernels/attention/pa_mqa_logits.py
+++ b/aiter/ops/triton/_triton_kernels/attention/pa_mqa_logits.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton
 import triton.language as tl
diff --git a/aiter/ops/triton/_triton_kernels/attention/pa_prefill.py b/aiter/ops/triton/_triton_kernels/attention/pa_prefill.py
index c511eccf65..82d36a1956 100644
--- a/aiter/ops/triton/_triton_kernels/attention/pa_prefill.py
+++ b/aiter/ops/triton/_triton_kernels/attention/pa_prefill.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 # The kernels in this file are adapted from LightLLM's context_attention_fwd:
 # https://github.com/ModelTC/lightllm/blob/main/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py
diff --git a/aiter/ops/triton/_triton_kernels/attention/prefill_attention.py b/aiter/ops/triton/_triton_kernels/attention/prefill_attention.py
index 72b1ff8e8f..690feb60c1 100644
--- a/aiter/ops/triton/_triton_kernels/attention/prefill_attention.py
+++ b/aiter/ops/triton/_triton_kernels/attention/prefill_attention.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
-# Copyright (C) 2023-2025 SGLang Team
+# Copyright (C) 2023-2026 SGLang Team
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
diff --git a/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a16w16_atomic.py b/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a16w16_atomic.py
index 71e0d4c2f6..dca0c22a83 100644
--- a/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a16w16_atomic.py
+++ b/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a16w16_atomic.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton.language as tl
 from aiter.ops.triton.utils._triton.kernel_repr import make_kernel_repr
diff --git a/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a16w16_gated.py b/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a16w16_gated.py
index eeeaa57a09..2b9dae137d 100644
--- a/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a16w16_gated.py
+++ b/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a16w16_gated.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton.language as tl
 from aiter.ops.triton.utils._triton.kernel_repr import make_kernel_repr
diff --git a/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a16w8_blockscale.py b/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a16w8_blockscale.py
index fe1e927791..5910ed0e64 100644
--- a/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a16w8_blockscale.py
+++ b/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a16w8_blockscale.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton.language as tl
 from aiter.ops.triton._triton_kernels.quant.fused_fp8_quant import _fp8_quant_op
diff --git a/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a16wfp4.py b/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a16wfp4.py
index 0528288add..30f81e5f03 100644
--- a/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a16wfp4.py
+++ b/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a16wfp4.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton.language as tl
 from aiter.ops.triton._triton_kernels.quant.quant import _mxfp4_quant_op
diff --git a/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a8w8.py b/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a8w8.py
index 1aa6659bcd..cd4ec07ab0 100644
--- a/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a8w8.py
+++ b/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a8w8.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton.language as tl
 from aiter.ops.triton.utils._triton.kernel_repr import make_kernel_repr
diff --git a/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a8w8_blockscale.py b/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a8w8_blockscale.py
index 943d6032da..1d75520b58 100644
--- a/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a8w8_blockscale.py
+++ b/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a8w8_blockscale.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton
 import triton.language as tl
diff --git a/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a8w8_per_token_scale.py b/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a8w8_per_token_scale.py
index cb0ef83816..686ccba53e 100644
--- a/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a8w8_per_token_scale.py
+++ b/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a8w8_per_token_scale.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton.language as tl
 from aiter.ops.triton.utils._triton.kernel_repr import make_kernel_repr
diff --git a/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a8wfp4.py b/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a8wfp4.py
index 18721ab392..66e8b41653 100644
--- a/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a8wfp4.py
+++ b/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_a8wfp4.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton.language as tl
 from aiter.ops.triton.utils._triton.kernel_repr import make_kernel_repr
diff --git a/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_afp4wfp4.py b/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_afp4wfp4.py
index e389cb292e..b420eabea3 100644
--- a/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_afp4wfp4.py
+++ b/aiter/ops/triton/_triton_kernels/gemm/basic/gemm_afp4wfp4.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton.language as tl
 from aiter.ops.triton.utils._triton.kernel_repr import make_kernel_repr
diff --git a/aiter/ops/triton/_triton_kernels/gemm/batched/batched_gemm_a16wfp4.py b/aiter/ops/triton/_triton_kernels/gemm/batched/batched_gemm_a16wfp4.py
index 120d4ff0af..4c433c4ab6 100755
--- a/aiter/ops/triton/_triton_kernels/gemm/batched/batched_gemm_a16wfp4.py
+++ b/aiter/ops/triton/_triton_kernels/gemm/batched/batched_gemm_a16wfp4.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton.language as tl
 from aiter.ops.triton._triton_kernels.quant.quant import _mxfp4_quant_op
diff --git a/aiter/ops/triton/_triton_kernels/gemm/batched/batched_gemm_a8w8.py b/aiter/ops/triton/_triton_kernels/gemm/batched/batched_gemm_a8w8.py
index 89d8384aad..29f0a34443 100644
--- a/aiter/ops/triton/_triton_kernels/gemm/batched/batched_gemm_a8w8.py
+++ b/aiter/ops/triton/_triton_kernels/gemm/batched/batched_gemm_a8w8.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton.language as tl
 from aiter.ops.triton.utils._triton.kernel_repr import make_kernel_repr
diff --git a/aiter/ops/triton/_triton_kernels/gemm/batched/batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant.py b/aiter/ops/triton/_triton_kernels/gemm/batched/batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant.py
index 5e5241a51b..bffb53963a 100644
--- a/aiter/ops/triton/_triton_kernels/gemm/batched/batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant.py
+++ b/aiter/ops/triton/_triton_kernels/gemm/batched/batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton.language as tl
 from aiter.ops.triton.utils._triton.kernel_repr import make_kernel_repr
diff --git a/aiter/ops/triton/_triton_kernels/gemm/batched/batched_gemm_afp4wfp4.py b/aiter/ops/triton/_triton_kernels/gemm/batched/batched_gemm_afp4wfp4.py
index 66e7bb0282..54ef0fd89e 100755
--- a/aiter/ops/triton/_triton_kernels/gemm/batched/batched_gemm_afp4wfp4.py
+++ b/aiter/ops/triton/_triton_kernels/gemm/batched/batched_gemm_afp4wfp4.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton.language as tl
 from aiter.ops.triton.utils._triton.kernel_repr import make_kernel_repr
diff --git a/aiter/ops/triton/_triton_kernels/gemm/batched/batched_gemm_bf16.py b/aiter/ops/triton/_triton_kernels/gemm/batched/batched_gemm_bf16.py
index 029f4e57e2..6d3554a118 100644
--- a/aiter/ops/triton/_triton_kernels/gemm/batched/batched_gemm_bf16.py
+++ b/aiter/ops/triton/_triton_kernels/gemm/batched/batched_gemm_bf16.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton.language as tl
 from aiter.ops.triton.utils._triton.kernel_repr import make_kernel_repr
diff --git a/aiter/ops/triton/_triton_kernels/gemm/feed_forward/ff_a16w16_fused_gated.py b/aiter/ops/triton/_triton_kernels/gemm/feed_forward/ff_a16w16_fused_gated.py
index 854ace7086..86c87a545a 100644
--- a/aiter/ops/triton/_triton_kernels/gemm/feed_forward/ff_a16w16_fused_gated.py
+++ b/aiter/ops/triton/_triton_kernels/gemm/feed_forward/ff_a16w16_fused_gated.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton.language as tl
 from aiter.ops.triton.utils._triton.pid_preprocessing import pid_grid, remap_xcd
diff --git a/aiter/ops/triton/_triton_kernels/gemm/feed_forward/ff_a16w16_fused_ungated.py b/aiter/ops/triton/_triton_kernels/gemm/feed_forward/ff_a16w16_fused_ungated.py
index 331b7c0ded..3365fcef7c 100644
--- a/aiter/ops/triton/_triton_kernels/gemm/feed_forward/ff_a16w16_fused_ungated.py
+++ b/aiter/ops/triton/_triton_kernels/gemm/feed_forward/ff_a16w16_fused_ungated.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton.language as tl
 from aiter.ops.triton.utils._triton.pid_preprocessing import pid_grid, remap_xcd
diff --git a/aiter/ops/triton/_triton_kernels/gemm/fused/fused_gemm_a8w8_blockscale_a16w16.py b/aiter/ops/triton/_triton_kernels/gemm/fused/fused_gemm_a8w8_blockscale_a16w16.py
index 2290f8b92e..3c73b94389 100644
--- a/aiter/ops/triton/_triton_kernels/gemm/fused/fused_gemm_a8w8_blockscale_a16w16.py
+++ b/aiter/ops/triton/_triton_kernels/gemm/fused/fused_gemm_a8w8_blockscale_a16w16.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton.language as tl
 from aiter.ops.triton.utils._triton.pid_preprocessing import pid_grid, remap_xcd
diff --git a/aiter/ops/triton/_triton_kernels/gemm/fused/fused_gemm_a8w8_blockscale_mul_add.py b/aiter/ops/triton/_triton_kernels/gemm/fused/fused_gemm_a8w8_blockscale_mul_add.py
index 83c78c1b48..cf36ee4b4c 100644
--- a/aiter/ops/triton/_triton_kernels/gemm/fused/fused_gemm_a8w8_blockscale_mul_add.py
+++ b/aiter/ops/triton/_triton_kernels/gemm/fused/fused_gemm_a8w8_blockscale_mul_add.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton
 import triton.language as tl
diff --git a/aiter/ops/triton/_triton_kernels/gemm/fused/fused_gemm_afp4wfp4_a16w16.py b/aiter/ops/triton/_triton_kernels/gemm/fused/fused_gemm_afp4wfp4_a16w16.py
index 05601f7df5..611532300e 100644
--- a/aiter/ops/triton/_triton_kernels/gemm/fused/fused_gemm_afp4wfp4_a16w16.py
+++ b/aiter/ops/triton/_triton_kernels/gemm/fused/fused_gemm_afp4wfp4_a16w16.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton.language as tl
 from aiter.ops.triton.utils._triton.kernel_repr import make_kernel_repr
diff --git a/aiter/ops/triton/_triton_kernels/gemm/fused/fused_gemm_afp4wfp4_mul_add.py b/aiter/ops/triton/_triton_kernels/gemm/fused/fused_gemm_afp4wfp4_mul_add.py
index 34af0f38da..0f55772fe5 100644
--- a/aiter/ops/triton/_triton_kernels/gemm/fused/fused_gemm_afp4wfp4_mul_add.py
+++ b/aiter/ops/triton/_triton_kernels/gemm/fused/fused_gemm_afp4wfp4_mul_add.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton.language as tl
 from aiter.ops.triton.utils._triton.kernel_repr import make_kernel_repr
diff --git a/aiter/ops/triton/_triton_kernels/gemm/fused/fused_gemm_afp4wfp4_split_cat.py b/aiter/ops/triton/_triton_kernels/gemm/fused/fused_gemm_afp4wfp4_split_cat.py
index cf8831d504..3c9bc52985 100644
--- a/aiter/ops/triton/_triton_kernels/gemm/fused/fused_gemm_afp4wfp4_split_cat.py
+++ b/aiter/ops/triton/_triton_kernels/gemm/fused/fused_gemm_afp4wfp4_split_cat.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton
 import triton.language as tl
diff --git a/aiter/ops/triton/_triton_kernels/gmm.py b/aiter/ops/triton/_triton_kernels/gmm.py
index b1baf95568..71e643cd40 100644
--- a/aiter/ops/triton/_triton_kernels/gmm.py
+++ b/aiter/ops/triton/_triton_kernels/gmm.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2025-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 
 # Imports.
diff --git a/aiter/ops/triton/_triton_kernels/moe/moe_align_block_size.py b/aiter/ops/triton/_triton_kernels/moe/moe_align_block_size.py
index b135454adc..97d8ba1095 100644
--- a/aiter/ops/triton/_triton_kernels/moe/moe_align_block_size.py
+++ b/aiter/ops/triton/_triton_kernels/moe/moe_align_block_size.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton
 import triton.language as tl
diff --git a/aiter/ops/triton/_triton_kernels/moe/moe_op.py b/aiter/ops/triton/_triton_kernels/moe/moe_op.py
index 137dea368e..b2147bcf89 100644
--- a/aiter/ops/triton/_triton_kernels/moe/moe_op.py
+++ b/aiter/ops/triton/_triton_kernels/moe/moe_op.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton
 import triton.language as tl
diff --git a/aiter/ops/triton/_triton_kernels/moe/moe_op_e2e.py b/aiter/ops/triton/_triton_kernels/moe/moe_op_e2e.py
index 659f58b918..7f22c3c88e 100644
--- a/aiter/ops/triton/_triton_kernels/moe/moe_op_e2e.py
+++ b/aiter/ops/triton/_triton_kernels/moe/moe_op_e2e.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton
 import triton.language as tl
diff --git a/aiter/ops/triton/_triton_kernels/moe/moe_op_gelu.py b/aiter/ops/triton/_triton_kernels/moe/moe_op_gelu.py
index f3a0dde8d4..d41a68605e 100644
--- a/aiter/ops/triton/_triton_kernels/moe/moe_op_gelu.py
+++ b/aiter/ops/triton/_triton_kernels/moe/moe_op_gelu.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton
 import triton.language as tl
diff --git a/aiter/ops/triton/_triton_kernels/moe/moe_op_mxfp4.py b/aiter/ops/triton/_triton_kernels/moe/moe_op_mxfp4.py
index 50d1ac5ca9..b46741cfaa 100644
--- a/aiter/ops/triton/_triton_kernels/moe/moe_op_mxfp4.py
+++ b/aiter/ops/triton/_triton_kernels/moe/moe_op_mxfp4.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton
 import triton.language as tl
diff --git a/aiter/ops/triton/_triton_kernels/moe/moe_op_mxfp4_silu_fused.py b/aiter/ops/triton/_triton_kernels/moe/moe_op_mxfp4_silu_fused.py
index ab042f878d..97d66598e3 100644
--- a/aiter/ops/triton/_triton_kernels/moe/moe_op_mxfp4_silu_fused.py
+++ b/aiter/ops/triton/_triton_kernels/moe/moe_op_mxfp4_silu_fused.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton
 import triton.language as tl
diff --git a/aiter/ops/triton/_triton_kernels/moe/moe_op_silu_fused.py b/aiter/ops/triton/_triton_kernels/moe/moe_op_silu_fused.py
index 8f99e0f9d2..702e3e076f 100644
--- a/aiter/ops/triton/_triton_kernels/moe/moe_op_silu_fused.py
+++ b/aiter/ops/triton/_triton_kernels/moe/moe_op_silu_fused.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton
 import triton.language as tl
diff --git a/aiter/ops/triton/_triton_kernels/moe/moe_routing_sigmoid_top1_fused.py b/aiter/ops/triton/_triton_kernels/moe/moe_routing_sigmoid_top1_fused.py
index 4fbfec9bb2..8434028428 100644
--- a/aiter/ops/triton/_triton_kernels/moe/moe_routing_sigmoid_top1_fused.py
+++ b/aiter/ops/triton/_triton_kernels/moe/moe_routing_sigmoid_top1_fused.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import functools
 import json
diff --git a/aiter/ops/triton/_triton_kernels/moe/quant_moe.py b/aiter/ops/triton/_triton_kernels/moe/quant_moe.py
index f6cf1431e5..121d76f362 100644
--- a/aiter/ops/triton/_triton_kernels/moe/quant_moe.py
+++ b/aiter/ops/triton/_triton_kernels/moe/quant_moe.py
@@ -405,7 +405,7 @@ def _upcast_from_mxfp(
         # 3) x is zero, do nothing
         dst_tensor = tl.interleave(x0, x1).to(dst_dtype, bitcast=True)
 
-    # Reshape for proper broadcasting: the scale was stored with a 32‐sized “inner” grouping.
+    # Reshape for proper broadcasting: the scale was stored with a 32-sized "inner" grouping.
     dst_tensor = dst_tensor.reshape([BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE, 32])
     dst_scale = dst_scale.reshape([BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE, 1])
     scale = scale.reshape(dst_scale.shape)
diff --git a/aiter/ops/triton/_triton_kernels/normalization/norm.py b/aiter/ops/triton/_triton_kernels/normalization/norm.py
index 77b7e60410..50068e12ae 100644
--- a/aiter/ops/triton/_triton_kernels/normalization/norm.py
+++ b/aiter/ops/triton/_triton_kernels/normalization/norm.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton
 import triton.language as tl
diff --git a/aiter/ops/triton/_triton_kernels/normalization/rmsnorm.py b/aiter/ops/triton/_triton_kernels/normalization/rmsnorm.py
index ac478dd5d7..7e889b4b1d 100644
--- a/aiter/ops/triton/_triton_kernels/normalization/rmsnorm.py
+++ b/aiter/ops/triton/_triton_kernels/normalization/rmsnorm.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton
 import triton.language as tl
diff --git a/aiter/ops/triton/_triton_kernels/quant/quant.py b/aiter/ops/triton/_triton_kernels/quant/quant.py
index 3773fb077b..3b88c8b2b2 100644
--- a/aiter/ops/triton/_triton_kernels/quant/quant.py
+++ b/aiter/ops/triton/_triton_kernels/quant/quant.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton
 import triton.language as tl
diff --git a/aiter/ops/triton/_triton_kernels/rope/rope.py b/aiter/ops/triton/_triton_kernels/rope/rope.py
index 077eb23f8f..85c88d03e6 100644
--- a/aiter/ops/triton/_triton_kernels/rope/rope.py
+++ b/aiter/ops/triton/_triton_kernels/rope/rope.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton
 import triton.language as tl
diff --git a/aiter/ops/triton/_triton_kernels/topk.py b/aiter/ops/triton/_triton_kernels/topk.py
index 1f6d8f9536..30bbe18819 100644
--- a/aiter/ops/triton/_triton_kernels/topk.py
+++ b/aiter/ops/triton/_triton_kernels/topk.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2025-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 # The kernel in this file is adapted from FlagGems' topk:
 # https://github.com/FlagOpen/FlagGems/blob/master/src/flag_gems/ops/topk.py
diff --git a/aiter/ops/triton/attention/chunked_pa_prefill.py b/aiter/ops/triton/attention/chunked_pa_prefill.py
index 2f40e4f30c..b791834838 100644
--- a/aiter/ops/triton/attention/chunked_pa_prefill.py
+++ b/aiter/ops/triton/attention/chunked_pa_prefill.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 # The kernel in this file is adapted from the VLLM project:
 # https://github.com/ROCm/vllm/blob/aiter_integration_final/vllm/attention/ops/chunked_prefill_paged_decode.py
diff --git a/aiter/ops/triton/attention/extend_attention.py b/aiter/ops/triton/attention/extend_attention.py
index 1a7dc1d085..b45c4a0998 100644
--- a/aiter/ops/triton/attention/extend_attention.py
+++ b/aiter/ops/triton/attention/extend_attention.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023-2025 SGLang Team
+# Copyright (C) 2023-2026 SGLang Team
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
diff --git a/aiter/ops/triton/attention/hstu_attention.py b/aiter/ops/triton/attention/hstu_attention.py
index 3bf333f51c..344c8efcd6 100644
--- a/aiter/ops/triton/attention/hstu_attention.py
+++ b/aiter/ops/triton/attention/hstu_attention.py
@@ -1,5 +1,5 @@
 # Copyright (C) Advanced Micro Devices, Inc. All rights reserved.
-# Copyright (C) 2024-2025, The vLLM team.
+# Copyright (C) 2024-2026, The vLLM team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/aiter/ops/triton/attention/lean_atten.py b/aiter/ops/triton/attention/lean_atten.py
index 8f981f9613..74839501eb 100644
--- a/aiter/ops/triton/attention/lean_atten.py
+++ b/aiter/ops/triton/attention/lean_atten.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 """
 Lean Attention
diff --git a/aiter/ops/triton/attention/mha.py b/aiter/ops/triton/attention/mha.py
index ae8bbf78fd..e4332046c3 100644
--- a/aiter/ops/triton/attention/mha.py
+++ b/aiter/ops/triton/attention/mha.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional, Tuple, Union
 import torch
diff --git a/aiter/ops/triton/attention/mha_fused_bwd.py b/aiter/ops/triton/attention/mha_fused_bwd.py
index 6049cb6b36..c9cc67743d 100644
--- a/aiter/ops/triton/attention/mha_fused_bwd.py
+++ b/aiter/ops/triton/attention/mha_fused_bwd.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional, Dict
 import torch
diff --git a/aiter/ops/triton/attention/mha_onekernel_bwd.py b/aiter/ops/triton/attention/mha_onekernel_bwd.py
index db79aa2c3d..7205bb1150 100644
--- a/aiter/ops/triton/attention/mha_onekernel_bwd.py
+++ b/aiter/ops/triton/attention/mha_onekernel_bwd.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional, Dict
 import torch
diff --git a/aiter/ops/triton/attention/mha_v3.py b/aiter/ops/triton/attention/mha_v3.py
index c2fa24c769..6b93c99581 100644
--- a/aiter/ops/triton/attention/mha_v3.py
+++ b/aiter/ops/triton/attention/mha_v3.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from __future__ import annotations
 from typing import Optional, Tuple, Union
diff --git a/aiter/ops/triton/attention/mla_decode_rope.py b/aiter/ops/triton/attention/mla_decode_rope.py
index 9332b6d8a6..9e2532c743 100644
--- a/aiter/ops/triton/attention/mla_decode_rope.py
+++ b/aiter/ops/triton/attention/mla_decode_rope.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
-# Copyright (C) 2023-2025 SGLang Team
+# Copyright (C) 2023-2026 SGLang Team
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
diff --git a/aiter/ops/triton/attention/pa_decode.py b/aiter/ops/triton/attention/pa_decode.py
index 50d0dfe2fa..3a38ca4c8c 100644
--- a/aiter/ops/triton/attention/pa_decode.py
+++ b/aiter/ops/triton/attention/pa_decode.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import math
 from typing import Optional
diff --git a/aiter/ops/triton/attention/pa_mqa_logits.py b/aiter/ops/triton/attention/pa_mqa_logits.py
index f5d9573d7c..1f28abf18b 100644
--- a/aiter/ops/triton/attention/pa_mqa_logits.py
+++ b/aiter/ops/triton/attention/pa_mqa_logits.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 # ========================================================================
 # How to use AOT gluon kernel for pa_mqa_logits on lower triton version (below 3.4.0):
diff --git a/aiter/ops/triton/attention/pa_prefill.py b/aiter/ops/triton/attention/pa_prefill.py
index 2be4bb4fc0..7ce3f1c814 100644
--- a/aiter/ops/triton/attention/pa_prefill.py
+++ b/aiter/ops/triton/attention/pa_prefill.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 # The kernels in this file are adapted from LightLLM's context_attention_fwd:
 # https://github.com/ModelTC/lightllm/blob/main/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py
diff --git a/aiter/ops/triton/attention/prefill_attention.py b/aiter/ops/triton/attention/prefill_attention.py
index e8ae05cccc..f5fdac2d9d 100644
--- a/aiter/ops/triton/attention/prefill_attention.py
+++ b/aiter/ops/triton/attention/prefill_attention.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
-# Copyright (C) 2023-2025 SGLang Team
+# Copyright (C) 2023-2026 SGLang Team
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
diff --git a/aiter/ops/triton/comms/fused/reduce_scatter_rmsnorm_quant_all_gather.py b/aiter/ops/triton/comms/fused/reduce_scatter_rmsnorm_quant_all_gather.py
index 447a502352..259a54ecc6 100644
--- a/aiter/ops/triton/comms/fused/reduce_scatter_rmsnorm_quant_all_gather.py
+++ b/aiter/ops/triton/comms/fused/reduce_scatter_rmsnorm_quant_all_gather.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2025-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 """
 Fused Reduce-Scatter + RMSNorm + Quantization + All-Gather
diff --git a/aiter/ops/triton/gemm/basic/gemm_a16w16.py b/aiter/ops/triton/gemm/basic/gemm_a16w16.py
index 8e98407863..d06ba0eb3a 100644
--- a/aiter/ops/triton/gemm/basic/gemm_a16w16.py
+++ b/aiter/ops/triton/gemm/basic/gemm_a16w16.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gemm/basic/gemm_a16w16_agnostic.py b/aiter/ops/triton/gemm/basic/gemm_a16w16_agnostic.py
index d56f3bd452..d04d56626c 100644
--- a/aiter/ops/triton/gemm/basic/gemm_a16w16_agnostic.py
+++ b/aiter/ops/triton/gemm/basic/gemm_a16w16_agnostic.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import triton
diff --git a/aiter/ops/triton/gemm/basic/gemm_a16w16_atomic.py b/aiter/ops/triton/gemm/basic/gemm_a16w16_atomic.py
index 3622837d2a..01234f64c5 100644
--- a/aiter/ops/triton/gemm/basic/gemm_a16w16_atomic.py
+++ b/aiter/ops/triton/gemm/basic/gemm_a16w16_atomic.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gemm/basic/gemm_a16w16_gated.py b/aiter/ops/triton/gemm/basic/gemm_a16w16_gated.py
index 0ebddce0e6..3ce568a941 100644
--- a/aiter/ops/triton/gemm/basic/gemm_a16w16_gated.py
+++ b/aiter/ops/triton/gemm/basic/gemm_a16w16_gated.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gemm/basic/gemm_a16w8_blockscale.py b/aiter/ops/triton/gemm/basic/gemm_a16w8_blockscale.py
index aeeb013aa6..3105641bcd 100644
--- a/aiter/ops/triton/gemm/basic/gemm_a16w8_blockscale.py
+++ b/aiter/ops/triton/gemm/basic/gemm_a16w8_blockscale.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gemm/basic/gemm_a16wfp4.py b/aiter/ops/triton/gemm/basic/gemm_a16wfp4.py
index fcef64d6d6..ab691ba860 100644
--- a/aiter/ops/triton/gemm/basic/gemm_a16wfp4.py
+++ b/aiter/ops/triton/gemm/basic/gemm_a16wfp4.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gemm/basic/gemm_a8w8.py b/aiter/ops/triton/gemm/basic/gemm_a8w8.py
index eb65ffe6df..40b1fe8f68 100644
--- a/aiter/ops/triton/gemm/basic/gemm_a8w8.py
+++ b/aiter/ops/triton/gemm/basic/gemm_a8w8.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gemm/basic/gemm_a8w8_blockscale.py b/aiter/ops/triton/gemm/basic/gemm_a8w8_blockscale.py
index d6e9954d1b..ca28714eaa 100644
--- a/aiter/ops/triton/gemm/basic/gemm_a8w8_blockscale.py
+++ b/aiter/ops/triton/gemm/basic/gemm_a8w8_blockscale.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gemm/basic/gemm_a8w8_per_token_scale.py b/aiter/ops/triton/gemm/basic/gemm_a8w8_per_token_scale.py
index e0ea2e6428..34dcd4873b 100644
--- a/aiter/ops/triton/gemm/basic/gemm_a8w8_per_token_scale.py
+++ b/aiter/ops/triton/gemm/basic/gemm_a8w8_per_token_scale.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gemm/basic/gemm_a8wfp4.py b/aiter/ops/triton/gemm/basic/gemm_a8wfp4.py
index 884d95da18..3a16dd9ecb 100644
--- a/aiter/ops/triton/gemm/basic/gemm_a8wfp4.py
+++ b/aiter/ops/triton/gemm/basic/gemm_a8wfp4.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gemm/basic/gemm_afp4wfp4.py b/aiter/ops/triton/gemm/basic/gemm_afp4wfp4.py
index 19c7948d5d..763090904d 100644
--- a/aiter/ops/triton/gemm/basic/gemm_afp4wfp4.py
+++ b/aiter/ops/triton/gemm/basic/gemm_afp4wfp4.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gemm/basic/gemm_afp4wfp4_pre_quant_atomic.py b/aiter/ops/triton/gemm/basic/gemm_afp4wfp4_pre_quant_atomic.py
index 86b87530f4..da26736a96 100644
--- a/aiter/ops/triton/gemm/basic/gemm_afp4wfp4_pre_quant_atomic.py
+++ b/aiter/ops/triton/gemm/basic/gemm_afp4wfp4_pre_quant_atomic.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gemm/batched/batched_gemm_a16wfp4.py b/aiter/ops/triton/gemm/batched/batched_gemm_a16wfp4.py
index 4622416c43..11b41b0b47 100755
--- a/aiter/ops/triton/gemm/batched/batched_gemm_a16wfp4.py
+++ b/aiter/ops/triton/gemm/batched/batched_gemm_a16wfp4.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gemm/batched/batched_gemm_a8w8.py b/aiter/ops/triton/gemm/batched/batched_gemm_a8w8.py
index 596e16d850..875b17e696 100644
--- a/aiter/ops/triton/gemm/batched/batched_gemm_a8w8.py
+++ b/aiter/ops/triton/gemm/batched/batched_gemm_a8w8.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gemm/batched/batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant.py b/aiter/ops/triton/gemm/batched/batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant.py
index 51c379e695..a413116877 100644
--- a/aiter/ops/triton/gemm/batched/batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant.py
+++ b/aiter/ops/triton/gemm/batched/batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gemm/batched/batched_gemm_afp4wfp4.py b/aiter/ops/triton/gemm/batched/batched_gemm_afp4wfp4.py
index ed930be0c0..40acb42adc 100755
--- a/aiter/ops/triton/gemm/batched/batched_gemm_afp4wfp4.py
+++ b/aiter/ops/triton/gemm/batched/batched_gemm_afp4wfp4.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gemm/batched/batched_gemm_afp4wfp4_pre_quant.py b/aiter/ops/triton/gemm/batched/batched_gemm_afp4wfp4_pre_quant.py
index 48ae17a996..b6548bff98 100755
--- a/aiter/ops/triton/gemm/batched/batched_gemm_afp4wfp4_pre_quant.py
+++ b/aiter/ops/triton/gemm/batched/batched_gemm_afp4wfp4_pre_quant.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gemm/batched/batched_gemm_bf16.py b/aiter/ops/triton/gemm/batched/batched_gemm_bf16.py
index eb4cdd4343..43ca825a10 100644
--- a/aiter/ops/triton/gemm/batched/batched_gemm_bf16.py
+++ b/aiter/ops/triton/gemm/batched/batched_gemm_bf16.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gemm/feed_forward/ff_a16w16.py b/aiter/ops/triton/gemm/feed_forward/ff_a16w16.py
index c4326f6279..f579b78366 100644
--- a/aiter/ops/triton/gemm/feed_forward/ff_a16w16.py
+++ b/aiter/ops/triton/gemm/feed_forward/ff_a16w16.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gemm/feed_forward/ff_a16w16_fused_gated.py b/aiter/ops/triton/gemm/feed_forward/ff_a16w16_fused_gated.py
index 856dc880d4..feca3bff30 100644
--- a/aiter/ops/triton/gemm/feed_forward/ff_a16w16_fused_gated.py
+++ b/aiter/ops/triton/gemm/feed_forward/ff_a16w16_fused_gated.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gemm/feed_forward/ff_a16w16_fused_ungated.py b/aiter/ops/triton/gemm/feed_forward/ff_a16w16_fused_ungated.py
index dbf0da5eee..3428ade14b 100644
--- a/aiter/ops/triton/gemm/feed_forward/ff_a16w16_fused_ungated.py
+++ b/aiter/ops/triton/gemm/feed_forward/ff_a16w16_fused_ungated.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gemm/fused/fused_gemm_a8w8_blockscale_a16w16.py b/aiter/ops/triton/gemm/fused/fused_gemm_a8w8_blockscale_a16w16.py
index 3cb6f2a5b5..ae76e79012 100644
--- a/aiter/ops/triton/gemm/fused/fused_gemm_a8w8_blockscale_a16w16.py
+++ b/aiter/ops/triton/gemm/fused/fused_gemm_a8w8_blockscale_a16w16.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gemm/fused/fused_gemm_a8w8_blockscale_mul_add.py b/aiter/ops/triton/gemm/fused/fused_gemm_a8w8_blockscale_mul_add.py
index ee6dda5999..2104f612dd 100644
--- a/aiter/ops/triton/gemm/fused/fused_gemm_a8w8_blockscale_mul_add.py
+++ b/aiter/ops/triton/gemm/fused/fused_gemm_a8w8_blockscale_mul_add.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional, Union
 import torch
diff --git a/aiter/ops/triton/gemm/fused/fused_gemm_afp4wfp4_a16w16.py b/aiter/ops/triton/gemm/fused/fused_gemm_afp4wfp4_a16w16.py
index b56c54b76f..9728919c94 100644
--- a/aiter/ops/triton/gemm/fused/fused_gemm_afp4wfp4_a16w16.py
+++ b/aiter/ops/triton/gemm/fused/fused_gemm_afp4wfp4_a16w16.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import os
diff --git a/aiter/ops/triton/gemm/fused/fused_gemm_afp4wfp4_mul_add.py b/aiter/ops/triton/gemm/fused/fused_gemm_afp4wfp4_mul_add.py
index 40121d7a01..0d1956ae0d 100644
--- a/aiter/ops/triton/gemm/fused/fused_gemm_afp4wfp4_mul_add.py
+++ b/aiter/ops/triton/gemm/fused/fused_gemm_afp4wfp4_mul_add.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional, Union
 import torch
diff --git a/aiter/ops/triton/gemm/fused/fused_gemm_afp4wfp4_split_cat.py b/aiter/ops/triton/gemm/fused/fused_gemm_afp4wfp4_split_cat.py
index 92ef460c4b..89a43c8f50 100644
--- a/aiter/ops/triton/gemm/fused/fused_gemm_afp4wfp4_split_cat.py
+++ b/aiter/ops/triton/gemm/fused/fused_gemm_afp4wfp4_split_cat.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/gluon/gemm_a8w8_blockscale.py b/aiter/ops/triton/gluon/gemm_a8w8_blockscale.py
index 3bb3d7519a..499f2025d4 100644
--- a/aiter/ops/triton/gluon/gemm_a8w8_blockscale.py
+++ b/aiter/ops/triton/gluon/gemm_a8w8_blockscale.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import functools
diff --git a/aiter/ops/triton/moe/moe_align_block_size.py b/aiter/ops/triton/moe/moe_align_block_size.py
index b0918981e7..66bb631c25 100644
--- a/aiter/ops/triton/moe/moe_align_block_size.py
+++ b/aiter/ops/triton/moe/moe_align_block_size.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 from aiter.ops.triton.utils.logger import AiterTritonLogger
diff --git a/aiter/ops/triton/moe/moe_op.py b/aiter/ops/triton/moe/moe_op.py
index 9bdcfdf48f..677afc4ec6 100644
--- a/aiter/ops/triton/moe/moe_op.py
+++ b/aiter/ops/triton/moe/moe_op.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import triton
diff --git a/aiter/ops/triton/moe/moe_op_e2e.py b/aiter/ops/triton/moe/moe_op_e2e.py
index d251ae5aae..f571ffdcd6 100644
--- a/aiter/ops/triton/moe/moe_op_e2e.py
+++ b/aiter/ops/triton/moe/moe_op_e2e.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import triton
diff --git a/aiter/ops/triton/moe/moe_op_gelu.py b/aiter/ops/triton/moe/moe_op_gelu.py
index 492c3a49e2..6abac7ebe0 100644
--- a/aiter/ops/triton/moe/moe_op_gelu.py
+++ b/aiter/ops/triton/moe/moe_op_gelu.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import triton
diff --git a/aiter/ops/triton/moe/moe_op_mxfp4.py b/aiter/ops/triton/moe/moe_op_mxfp4.py
index d58d9f399e..2143c74cc3 100644
--- a/aiter/ops/triton/moe/moe_op_mxfp4.py
+++ b/aiter/ops/triton/moe/moe_op_mxfp4.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import triton
diff --git a/aiter/ops/triton/moe/moe_op_mxfp4_silu_fused.py b/aiter/ops/triton/moe/moe_op_mxfp4_silu_fused.py
index 493f4d66b3..32dd7b3a67 100644
--- a/aiter/ops/triton/moe/moe_op_mxfp4_silu_fused.py
+++ b/aiter/ops/triton/moe/moe_op_mxfp4_silu_fused.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import triton
diff --git a/aiter/ops/triton/moe/moe_op_silu_fused.py b/aiter/ops/triton/moe/moe_op_silu_fused.py
index 646f0897aa..b9718e9486 100644
--- a/aiter/ops/triton/moe/moe_op_silu_fused.py
+++ b/aiter/ops/triton/moe/moe_op_silu_fused.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import triton
diff --git a/aiter/ops/triton/moe/moe_routing_sigmoid_top1_fused.py b/aiter/ops/triton/moe/moe_routing_sigmoid_top1_fused.py
index 596e6d2856..f6898d2f91 100644
--- a/aiter/ops/triton/moe/moe_routing_sigmoid_top1_fused.py
+++ b/aiter/ops/triton/moe/moe_routing_sigmoid_top1_fused.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from typing import Optional
 import torch
diff --git a/aiter/ops/triton/normalization/norm.py b/aiter/ops/triton/normalization/norm.py
index 4305e9a921..11d862acd3 100644
--- a/aiter/ops/triton/normalization/norm.py
+++ b/aiter/ops/triton/normalization/norm.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import triton
diff --git a/aiter/ops/triton/normalization/rmsnorm.py b/aiter/ops/triton/normalization/rmsnorm.py
index f08549ffe8..a4127feec2 100644
--- a/aiter/ops/triton/normalization/rmsnorm.py
+++ b/aiter/ops/triton/normalization/rmsnorm.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import triton
diff --git a/aiter/ops/triton/quant/quant.py b/aiter/ops/triton/quant/quant.py
index d6ee3f0387..0883d78df0 100644
--- a/aiter/ops/triton/quant/quant.py
+++ b/aiter/ops/triton/quant/quant.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton
 import torch
diff --git a/aiter/ops/triton/rope/rope.py b/aiter/ops/triton/rope/rope.py
index 570963e512..b02927d3ce 100644
--- a/aiter/ops/triton/rope/rope.py
+++ b/aiter/ops/triton/rope/rope.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import triton
diff --git a/aiter/rotary_embedding.py b/aiter/rotary_embedding.py
index 32bbe43dc9..1f1a1d3068 100644
--- a/aiter/rotary_embedding.py
+++ b/aiter/rotary_embedding.py
@@ -2,8 +2,8 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py
-# Copyright (C) 2023-2025 The vLLM team.
-# Copyright (C) 2022-2025 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+# Copyright (C) 2023-2026 The vLLM team.
+# Copyright (C) 2022-2026 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
 # and OPT implementations in this library. It has been modified from its
diff --git a/aiter/utility/dtypes.py b/aiter/utility/dtypes.py
index a5a837e222..dba8279da3 100644
--- a/aiter/utility/dtypes.py
+++ b/aiter/utility/dtypes.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 import torch
 from ..jit.utils.chip_info import get_gfx
 from ..ops.enum import QuantType
@@ -58,9 +58,9 @@ def str2bool(v):
 def str2tuple(v):
     """
     Convert string to int or tuple of ints.
-    - "512" → 512 (single value without comma returns int)
-    - "512," → (512,) (trailing comma returns tuple)
-    - "512,1024" → (512, 1024) (multiple values return tuple)
+    - "512" -> 512 (single value without comma returns int)
+    - "512," -> (512,) (trailing comma returns tuple)
+    - "512,1024" -> (512, 1024) (multiple values return tuple)
     """
     try:
         parts = [int(p.strip()) for p in v.strip("()").split(",") if p.strip()]
diff --git a/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py b/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py
index 32a375832e..ca3a59c7ee 100644
--- a/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py
+++ b/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py
@@ -7,7 +7,6 @@
 from aiter.jit.core import AITER_CONFIG_A8W8_BATCHED_GEMM
 from aiter.utility.base_tuner import GemmCommonTuner
 from batched_gemm_a8w8_common import kernels_list
-import argparse
 from aiter.utility.mp_tuner import mp_tuner
 
 
diff --git a/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py b/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py
index b0e8990b35..e03891aa12 100644
--- a/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py
+++ b/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py
@@ -8,7 +8,6 @@
 from aiter import dtypes
 from batched_gemm_bf16_common import kernels_list
 from aiter.utility.mp_tuner import mp_tuner
-import argparse
 
 
 def run_torch(x, weight, bias=None, dtype=dtypes.bf16):
diff --git a/csrc/ck_deepgemm/include/deepgemm_common.cuh b/csrc/ck_deepgemm/include/deepgemm_common.cuh
index 801ff24ea1..a524e25e32 100644
--- a/csrc/ck_deepgemm/include/deepgemm_common.cuh
+++ b/csrc/ck_deepgemm/include/deepgemm_common.cuh
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2025-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 #include <cstdlib>
@@ -132,11 +132,10 @@ void grouped_flatmm(KernelArguments& args, ck_stream_config& s)
     const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
     const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
 
-    const auto Run = [&](const auto has_hot_loop_,
-                         const auto tail_number_) {
-        constexpr bool has_hot_loop_v   = has_hot_loop_.value;
-        constexpr auto tail_number_v    = tail_number_.value;
-        constexpr auto scheduler        = FlatmmConfig::Scheduler;
+    const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
+        constexpr bool has_hot_loop_v = has_hot_loop_.value;
+        constexpr auto tail_number_v  = tail_number_.value;
+        constexpr auto scheduler      = FlatmmConfig::Scheduler;
 
         using CodegenPipelineProblem = ck_tile::FlatmmPipelineProblem<ADataType,
                                                                       BDataType,
diff --git a/csrc/ck_gemm_a8w8/gemm_a8w8_tune.cu b/csrc/ck_gemm_a8w8/gemm_a8w8_tune.cu
index a1325d4167..538f0c2861 100644
--- a/csrc/ck_gemm_a8w8/gemm_a8w8_tune.cu
+++ b/csrc/ck_gemm_a8w8/gemm_a8w8_tune.cu
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gemm_a8w8_common.cuh"
 #include "gemm_a8w8_manifest.h"
diff --git a/csrc/ck_gemm_a8w8/gemm_a8w8_tune.py b/csrc/ck_gemm_a8w8/gemm_a8w8_tune.py
index 29f82517ce..a1bd121146 100644
--- a/csrc/ck_gemm_a8w8/gemm_a8w8_tune.py
+++ b/csrc/ck_gemm_a8w8/gemm_a8w8_tune.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 import os
 import aiter
 import pandas as pd
@@ -10,7 +10,6 @@
 from aiter.utility.base_tuner import GemmCommonTuner
 from gemm_a8w8_common import kernels_list
 from aiter.utility.mp_tuner import mp_tuner
-import argparse
 
 
 def checkClose(a, b, rtol=1e-3, atol=0.01):
diff --git a/csrc/ck_gemm_a8w8/gen_instances.py b/csrc/ck_gemm_a8w8/gen_instances.py
index 4ac882bbc7..46d93f57cf 100644
--- a/csrc/ck_gemm_a8w8/gen_instances.py
+++ b/csrc/ck_gemm_a8w8/gen_instances.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 import argparse
 import os
 import shutil
diff --git a/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.py b/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.py
index ebccfe1a4a..6b00b2a43e 100755
--- a/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.py
+++ b/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-import os
 import aiter
-import pandas as pd
 import torch
 import torch.nn.functional as F
 from aiter import dtypes
@@ -10,7 +8,6 @@
 from aiter.jit.core import AITER_CONFIG_GEMM_A8W8_BLOCKSCALE
 from aiter.utility.base_tuner import GemmCommonTuner
 from gemm_a8w8_blockscale_common import kernels_list
-import argparse
 from einops import rearrange
 from aiter.utility.mp_tuner import mp_tuner
 
diff --git a/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/gemm_a8w8_blockscale_bpreshuffle_tune.py b/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/gemm_a8w8_blockscale_bpreshuffle_tune.py
index 77a6c95b8c..ed653d3c66 100755
--- a/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/gemm_a8w8_blockscale_bpreshuffle_tune.py
+++ b/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/gemm_a8w8_blockscale_bpreshuffle_tune.py
@@ -1,9 +1,6 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-import argparse
-import os
 
-import pandas as pd
 import torch
 import torch.nn.functional as F
 from aiter.jit.core import AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE
diff --git a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py
index 501142409f..5137eeb5ca 100644
--- a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py
+++ b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py
@@ -2008,7 +2008,7 @@ def post_process(self, results, args, topk=-1, fast_mode=False):
         if len(prorfiles) > 0:
             profile_result = pd.concat(prorfiles)
             profile_result["err"] = profile_result["err"].apply(lambda x: f"{x:.1%}")
-            profile_file = f"aiter/configs/profile_fmoe.csv"
+            profile_file = "aiter/configs/profile_fmoe.csv"
             old_profile = self.get_tuned_gemm_list(
                 profile_file, profile_result.columns.tolist()
             )
diff --git a/csrc/ck_tile_gemm_moe_2stages/include/moe_cktile2stages.h b/csrc/ck_tile_gemm_moe_2stages/include/moe_cktile2stages.h
index 6ae10e9bf6..8069534061 100644
--- a/csrc/ck_tile_gemm_moe_2stages/include/moe_cktile2stages.h
+++ b/csrc/ck_tile_gemm_moe_2stages/include/moe_cktile2stages.h
@@ -1,6 +1,6 @@
 #pragma once
 // SPDX-License-Identifier: MIT
-// Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 // #include "moe_flatmm.hpp"
 #include "ck_tile/core.hpp"
@@ -20,7 +20,7 @@
 #include <torch/all.h>
 #include <torch/extension.h>
 
-using MoeKernel        = std::function<torch::Tensor(torch::Tensor&,
+using MoeKernel = std::function<torch::Tensor(torch::Tensor&,
                                               torch::Tensor&,
                                               torch::Tensor&,
                                               torch::Tensor&,
@@ -44,15 +44,28 @@ using fp16             = ck_tile::half_t;
 using fp8              = ck_tile::fp8_t;
 using pk_fp4           = ck_tile::pk_fp4_t;
 
-template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType, int activation, bool kHasBias, int split_k>
-struct moe_gemm1_heuristic_dispatcher{
-    static MoeKernel dispatch(int M, int N, int K, int block_m){}
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CDataType,
+          int activation,
+          bool kHasBias,
+          int split_k>
+struct moe_gemm1_heuristic_dispatcher
+{
+    static MoeKernel dispatch(int M, int N, int K, int block_m) {}
 };
 
-
-template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType, int activation, bool kHasBias, int split_k>
-struct moe_gemm2_heuristic_dispatcher{
-    static MoeKernel dispatch(int M, int N, int K, int block_m){}
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CDataType,
+          int activation,
+          bool kHasBias,
+          int split_k>
+struct moe_gemm2_heuristic_dispatcher
+{
+    static MoeKernel dispatch(int M, int N, int K, int block_m) {}
 };
 
 __attribute__((visibility("default"))) torch::Tensor
diff --git a/csrc/ck_tile_gemm_moe_2stages/include/moe_cktile2stages_common.cuh b/csrc/ck_tile_gemm_moe_2stages/include/moe_cktile2stages_common.cuh
index b9c0ba84a6..38af9b3bce 100644
--- a/csrc/ck_tile_gemm_moe_2stages/include/moe_cktile2stages_common.cuh
+++ b/csrc/ck_tile_gemm_moe_2stages/include/moe_cktile2stages_common.cuh
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 
 #include "ck_tile/core.hpp"
@@ -67,13 +67,13 @@ struct MoeFlatmmConfig
 
 __host__ static constexpr int32_t GetBMemNTType(int32_t M, int32_t N, int32_t K)
 {
-	(void)N;
-	(void)K;
-	if(M <= 416)
-	{
-		return 2;
-	}
-	return 0;
+    (void)N;
+    (void)K;
+    if(M <= 416)
+    {
+        return 2;
+    }
+    return 0;
 }
 
 template <typename FlatmmConfig,
@@ -159,9 +159,9 @@ void moe_gemm(const MoeFlatmmHostArgs& args, const ck_stream_config& s)
     const auto Run = [&](const auto has_hot_loop_,
                          const auto tail_number_,
                          const auto b_mem_nt_type_) {
-        constexpr bool has_hot_loop_v   = has_hot_loop_.value;
-        constexpr auto tail_number_v    = tail_number_.value;
-        constexpr auto scheduler        = FlatmmConfig::Scheduler;
+        constexpr bool has_hot_loop_v = has_hot_loop_.value;
+        constexpr auto tail_number_v  = tail_number_.value;
+        constexpr auto scheduler      = FlatmmConfig::Scheduler;
         constexpr auto b_mem_nt_type_v =
             static_cast<ck_tile::amd_buffer_coherence_enum>(b_mem_nt_type_.value);
 
@@ -232,7 +232,6 @@ void moe_gemm(const MoeFlatmmHostArgs& args, const ck_stream_config& s)
                 ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>>,
             ck_tile::MoeFlatmmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>>;
 
-
         // TODO: support more act type.
         using FusedAct =
             std::conditional_t<ActivationOp == 2, ck_tile::moe::Swiglu, ck_tile::moe::MoeSilu>;
@@ -322,33 +321,28 @@ void moe_gemm(const MoeFlatmmHostArgs& args, const ck_stream_config& s)
         // return ave_time;
     };
 
-    const auto RunBMem =
-        [&](const auto has_hot_loop_, const auto tail_number_) {
-            switch(b_mem_nt_type)
-            {
-            case 2: {
-                Run(has_hot_loop_,
-                    tail_number_,
-                    ck_tile::integral_constant<int32_t, 2>{});
-            }
-            break;
-            default: {
-                Run(has_hot_loop_,
-                    tail_number_,
-                    ck_tile::integral_constant<int32_t, 0>{});
-            }
-            }
-        };
+    const auto RunBMem = [&](const auto has_hot_loop_, const auto tail_number_) {
+        switch(b_mem_nt_type)
+        {
+        case 2: {
+            Run(has_hot_loop_, tail_number_, ck_tile::integral_constant<int32_t, 2>{});
+        }
+        break;
+        default: {
+            Run(has_hot_loop_, tail_number_, ck_tile::integral_constant<int32_t, 0>{});
+        }
+        }
+    };
 
     if(tail_num == ck_tile::TailNumber::Odd)
     {
         RunBMem(ck_tile::bool_constant<true>{},
-                  ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
     }
     else if(tail_num == ck_tile::TailNumber::Even)
     {
         RunBMem(ck_tile::bool_constant<true>{},
-                  ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Even>{});
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Even>{});
     }
     else
     {
diff --git a/csrc/ck_tile_gemm_moe_2stages/moe_cktile2stages_common.py b/csrc/ck_tile_gemm_moe_2stages/moe_cktile2stages_common.py
index 765a3228dd..c04f9b62ca 100644
--- a/csrc/ck_tile_gemm_moe_2stages/moe_cktile2stages_common.py
+++ b/csrc/ck_tile_gemm_moe_2stages/moe_cktile2stages_common.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 from dataclasses import dataclass
 import os
 import sys
diff --git a/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile_common.cuh b/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile_common.cuh
index 9a07ebba97..06ef0f9e9f 100644
--- a/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile_common.cuh
+++ b/csrc/cktile_gemm_a8w8_bpreshuffle/include/gemm_a8w8_bpreshuffle_cktile_common.cuh
@@ -1,6 +1,6 @@
 #pragma once
 // SPDX-License-Identifier: MIT
-// Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifdef USE_ROCM
 
@@ -100,11 +100,10 @@ float flatmm_calc(const ck_tile::ScaleFlatmmHostArgs<ScaleM, ScaleN>& args,
     const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
     float ave_time{0};
 
-    const auto Run = [&](const auto has_hot_loop_,
-                         const auto tail_number_) {
-        constexpr bool has_hot_loop_v   = has_hot_loop_.value;
-        constexpr auto tail_number_v    = tail_number_.value;
-        constexpr auto scheduler        = FlatmmConfig::Scheduler;
+    const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
+        constexpr bool has_hot_loop_v = has_hot_loop_.value;
+        constexpr auto tail_number_v  = tail_number_.value;
+        constexpr auto scheduler      = FlatmmConfig::Scheduler;
 
         using CodegenPipelineProblem = ck_tile::FlatmmPipelineProblem<ADataType,
                                                                       BDataType,
diff --git a/csrc/cpp_itfs/gluon_aot_tools/extra/hip/compile.h b/csrc/cpp_itfs/gluon_aot_tools/extra/hip/compile.h
index 22876bc6d5..a59e0b4d56 100644
--- a/csrc/cpp_itfs/gluon_aot_tools/extra/hip/compile.h
+++ b/csrc/cpp_itfs/gluon_aot_tools/extra/hip/compile.h
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2025-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/csrc/cpp_itfs/mha_fwd_batch_prefill.cpp b/csrc/cpp_itfs/mha_fwd_batch_prefill.cpp
index 1b1d57aa4d..2c5da43ef2 100644
--- a/csrc/cpp_itfs/mha_fwd_batch_prefill.cpp
+++ b/csrc/cpp_itfs/mha_fwd_batch_prefill.cpp
@@ -2,35 +2,36 @@
 #include <string>
 
 namespace aiter {
-mha_fwd_traits get_mha_fwd_traits(int head_size_q,
-                                  int head_size_v,
-                                  std::string dtype,
-                                  bool is_group_mode,
-                                  bool has_logits_soft_cap,
-                                  mask_enum mask_type,
-                                  bias_enum bias_type,
-                                  bool has_lse,
-                                  bool has_dropout,
-                                  quant_scale_enum qscale_type,
-                                  bool use_ext_asm,
-                                  bool has_sink          = false,
-                                  int how_v3_bf16_cvt    = 1,
-                                  bool skip_min_seqlen_q = false)
+mha_batch_prefill_traits
+get_mha_batch_prefill_traits(int head_size_q,
+                             int head_size_v,
+                             std::string dtype,
+                             bool is_group_mode,
+                             bool has_logits_soft_cap,
+                             mask_enum mask_type,
+                             bias_enum bias_type,
+                             bool has_lse,
+                             bool has_dropout,
+                             quant_scale_enum qscale_type,
+                             ck_tile::BlockAttentionKVCacheMemoryLayoutEnum kv_memory_layout,
+                             ck_tile::BlockAttentionKVCacheLookupTableEnum kv_lookup_table,
+                             int page_size,
+                             bool skip_min_seqlen_q = false)
 {
-    return mha_fwd_traits(head_size_q,
-                          head_size_v,
-                          dtype,
-                          is_group_mode,
-                          has_logits_soft_cap,
-                          mask_type,
-                          bias_type,
-                          has_lse,
-                          has_dropout,
-                          qscale_type,
-                          use_ext_asm,
-                          how_v3_bf16_cvt,
-                          skip_min_seqlen_q,
-                          has_sink);
+    return mha_batch_prefill_traits(head_size_q,
+                                    head_size_v,
+                                    dtype,
+                                    is_group_mode,
+                                    has_logits_soft_cap,
+                                    mask_type,
+                                    bias_type,
+                                    has_lse,
+                                    has_dropout,
+                                    qscale_type,
+                                    skip_min_seqlen_q,
+                                    kv_memory_layout,
+                                    kv_lookup_table,
+                                    page_size);
 }
 
 float mha_batch_prefill(mha_batch_prefill_args args,
@@ -46,17 +47,19 @@ float mha_batch_prefill(mha_batch_prefill_args args,
     int head_size_q  = args.hdim_q;
     int head_size_v  = args.hdim_v;
     bool has_dropout = args.p_drop > 0.f;
-    auto traits      = get_mha_fwd_traits(head_size_q,
-                                     head_size_v,
-                                     q_dtype_str,
-                                     is_group_mode,
-                                     args.logits_soft_cap > 0.f,
-                                     mask_type,
-                                     bias_type,
-                                     has_lse,
-                                     has_dropout,
-                                     qscale_type,
-                                     use_ext_asm);
+    auto traits      = get_mha_batch_prefill_traits(head_size_q,
+                                               head_size_v,
+                                               q_dtype_str,
+                                               is_group_mode,
+                                               args.logits_soft_cap > 0.f,
+                                               mask_type,
+                                               bias_type,
+                                               has_lse,
+                                               has_dropout,
+                                               qscale_type,
+                                               args.kv_memory_layout,
+                                               args.kv_lookup_table,
+                                               args.page_block_size);
     return fmha_batch_prefill(traits, args, stream_config);
 }
 
diff --git a/csrc/include/aiter_hip_common.h b/csrc/include/aiter_hip_common.h
index 3e7b4ad097..73915a3b84 100644
--- a/csrc/include/aiter_hip_common.h
+++ b/csrc/include/aiter_hip_common.h
@@ -2,8 +2,8 @@
 // Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include "ck_tile/core.hpp"
-#include <hip/hip_runtime.h>
 #include <cstdint>
+#include <hip/hip_runtime.h>
 #include <iostream>
 
 enum class GPUArch
@@ -12,15 +12,14 @@ enum class GPUArch
     gfx950
 };
 
-
-#define CHECK_COND(x) \
-    do { \
-        if (!(x)) { \
-            std::cerr << "check failed, file=" \
-                << __FILE__ << ", line=" \
-                << __LINE__ << std::endl; \
-            std::terminate(); \
-        } \
+#define CHECK_COND(x)                                                                             \
+    do                                                                                            \
+    {                                                                                             \
+        if(!(x))                                                                                  \
+        {                                                                                         \
+            std::cerr << "check failed, file=" << __FILE__ << ", line=" << __LINE__ << std::endl; \
+            std::terminate();                                                                     \
+        }                                                                                         \
     } while(0)
 
 #define HIP_CALL(call)                                                       \
diff --git a/csrc/include/custom_all_reduce.cuh b/csrc/include/custom_all_reduce.cuh
index a66ba77bc1..349404f5bc 100644
--- a/csrc/include/custom_all_reduce.cuh
+++ b/csrc/include/custom_all_reduce.cuh
@@ -28,127 +28,125 @@
 #include <unordered_map>
 #include <vector>
 
+namespace aiter {
 
-namespace aiter
+constexpr int kMaxBlocks = 80;
+// note: we don't want to use atomics for signals because peer atomics are not
+// supported on PCIe links
+struct Signal
 {
-
-  constexpr int kMaxBlocks = 80;
-  // note: we don't want to use atomics for signals because peer atomics are no
-  // supported on PCIe links
-  struct Signal
-  {
     alignas(128) uint32_t start[kMaxBlocks][8];
     alignas(128) uint32_t end[kMaxBlocks][8];
     alignas(128) uint32_t _flag[kMaxBlocks]; // incremental flags for each rank
-  };
+};
 
 #ifdef USE_ROCM
-  struct __align__(16) RankData { const void *ptrs[8]; };
+struct __align__(16) RankData { const void* ptrs[8]; };
 #else
-  struct __align__(16) RankData { const void *__restrict__ ptrs[8]; };
+struct __align__(16) RankData { const void* __restrict__ ptrs[8]; };
 #endif
 
-  struct __align__(16) RankSignals
-  {
+struct __align__(16) RankSignals
+{
 #ifndef USE_ROCM
     volatile
 #endif
-        Signal *signals[8];
-  };
+        Signal* signals[8];
+};
 
-  // like std::array, but aligned
-  template <typename T, int sz>
-  struct __align__(alignof(T) * sz) array_t
-  {
+// like std::array, but aligned
+template <typename T, int sz>
+struct __align__(alignof(T) * sz) array_t
+{
     T data[sz];
-    using type = T;
+    using type                = T;
     static constexpr int size = sz;
-  };
+};
 
-  // use packed type to maximize memory efficiency
-  // goal: generate ld.128 and st.128 instructions
-  template <typename T>
-  struct packed_t
-  {
+// use packed type to maximize memory efficiency
+// goal: generate ld.128 and st.128 instructions
+template <typename T>
+struct packed_t
+{
     // the (P)acked type for load/store
     using P = array_t<T, 16 / sizeof(T)>;
     // the (A)ccumulator type for reduction
     using A = array_t<float, 16 / sizeof(T)>;
-  };
+};
 
 #define DINLINE __device__ __forceinline__
 
-  // scalar cast functions
-  DINLINE float upcast_s(half val) { return __half2float(val); }
+// scalar cast functions
+DINLINE float upcast_s(half val) { return __half2float(val); }
 
-  template <typename T>
-  DINLINE T downcast_s(float val);
-  template <>
-  DINLINE half downcast_s(float val)
-  {
+template <typename T>
+DINLINE T downcast_s(float val);
+template <>
+DINLINE half downcast_s(float val)
+{
     return __float2half(val);
-  }
+}
 
-  // scalar add functions
-  // for some reason when compiling with Pytorch, the + operator for half and
-  // bfloat is disabled so we call the intrinsics directly
-  DINLINE half &assign_add(half &a, half b)
-  {
+// scalar add functions
+// for some reason when compiling with Pytorch, the + operator for half and
+// bfloat is disabled so we call the intrinsics directly
+DINLINE half& assign_add(half& a, half b)
+{
     a = __hadd(a, b);
     return a;
-  }
-  DINLINE float &assign_add(float &a, float b) { return a += b; }
-
-#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
-  DINLINE float upcast_s(__hip_bfloat16 val) { return __bfloat162float(val); }
-  template <>
-  DINLINE __hip_bfloat16 downcast_s(float val)
-  {
+}
+DINLINE float& assign_add(float& a, float b) { return a += b; }
+
+#if(__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+DINLINE float upcast_s(__hip_bfloat16 val) { return __bfloat162float(val); }
+template <>
+DINLINE __hip_bfloat16 downcast_s(float val)
+{
     return __float2bfloat16(val);
-  }
-  DINLINE __hip_bfloat16 &assign_add(__hip_bfloat16 &a, __hip_bfloat16 b)
-  {
+}
+DINLINE __hip_bfloat16& assign_add(__hip_bfloat16& a, __hip_bfloat16 b)
+{
     a = __hadd(a, b);
     return a;
-  }
+}
 #endif
 
-  template <typename T, int N>
-  DINLINE array_t<T, N> &packed_assign_add(array_t<T, N> &a, array_t<T, N> b)
-  {
+template <typename T, int N>
+DINLINE array_t<T, N>& packed_assign_add(array_t<T, N>& a, array_t<T, N> b)
+{
 #pragma unroll
-    for (int i = 0; i < N; i++)
+    for(int i = 0; i < N; i++)
     {
-      assign_add(a.data[i], b.data[i]);
+        assign_add(a.data[i], b.data[i]);
     }
     return a;
-  }
+}
 
-  template <typename T, int N>
-  DINLINE array_t<float, N> upcast(array_t<T, N> val)
-  {
-    if constexpr (std::is_same<T, float>::value)
+template <typename T, int N>
+DINLINE array_t<float, N> upcast(array_t<T, N> val)
+{
+    if constexpr(std::is_same<T, float>::value)
     {
-      return val;
+        return val;
     }
     else
     {
-      array_t<float, N> out;
+        array_t<float, N> out;
 #pragma unroll
-      for (int i = 0; i < N; i++)
-      {
-        out.data[i] = upcast_s(val.data[i]);
-      }
-      return out;
+        for(int i = 0; i < N; i++)
+        {
+            out.data[i] = upcast_s(val.data[i]);
+        }
+        return out;
     }
-  }
+}
 
-  template <typename O>
-  DINLINE O downcast(array_t<float, O::size> val)
-  {
-    if constexpr (std::is_same<typename O::type, float>::value)
+template <typename O>
+DINLINE O downcast(array_t<float, O::size> val)
+{
+    if constexpr(std::is_same<typename O::type, float>::value)
     {
-      return val;
+        return val;
     }
     //   else if constexpr (std::is_same<typename O::type, __hip_bfloat16>::value)
     //   {
@@ -167,73 +165,75 @@ namespace aiter
     //   }
     else
     {
-      O out;
+        O out;
 #pragma unroll
-      for (int i = 0; i < O::size; i++)
-      {
-        out.data[i] = downcast_s<typename O::type>(val.data[i]);
-      }
-      return out;
-    }
-  }
-
-  // This function is meant to be used as the first synchronization in the all
-  // reduce kernel. Thus, it doesn't need to make any visibility guarantees for
-  // prior memory accesses. Note: volatile writes will not be reordered against
-  // other volatile writes.
-  template <int ngpus>
-  DINLINE void start_sync(const RankSignals &sg,
+        for(int i = 0; i < O::size; i++)
+        {
+            out.data[i] = downcast_s<typename O::type>(val.data[i]);
+        }
+        return out;
+    }
+}
+
+// This function is meant to be used as the first synchronization in the all
+// reduce kernel. Thus, it doesn't need to make any visibility guarantees for
+// prior memory accesses. Note: volatile writes will not be reordered against
+// other volatile writes.
+template <int ngpus>
+DINLINE void start_sync(const RankSignals& sg,
 #ifndef USE_ROCM
-                          volatile
+                        volatile
 #endif
-                          Signal *self_sg,
-                          int rank)
-  {
+                        Signal* self_sg,
+                        int rank)
+{
 #ifdef USE_ROCM
     uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
-    if (threadIdx.x < ngpus)
-    {
-      // simultaneously write to the corresponding flag of all ranks.
-      // Latency = 1 p2p write
-      __scoped_atomic_store_n(&sg.signals[threadIdx.x]->start[blockIdx.x][rank],
-                              flag, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
-      // wait until we got true from all ranks
-      while (__scoped_atomic_load_n(&self_sg->start[blockIdx.x][threadIdx.x],
-                                    __ATOMIC_RELAXED,
-                                    __MEMORY_SCOPE_DEVICE) < flag)
-        ;
+    if(threadIdx.x < ngpus)
+    {
+        // simultaneously write to the corresponding flag of all ranks.
+        // Latency = 1 p2p write
+        __scoped_atomic_store_n(&sg.signals[threadIdx.x]->start[blockIdx.x][rank],
+                                flag,
+                                __ATOMIC_RELAXED,
+                                __MEMORY_SCOPE_SYSTEM);
+        // spin until the peer rank mapped to this thread has published a start counter >= flag
+        while(__scoped_atomic_load_n(&self_sg->start[blockIdx.x][threadIdx.x],
+                                     __ATOMIC_RELAXED,
+                                     __MEMORY_SCOPE_DEVICE) < flag)
+            ;
     }
     __syncthreads();
     // use one thread to update flag
-    if (threadIdx.x == 0)
-      self_sg->_flag[blockIdx.x] = flag;
+    if(threadIdx.x == 0)
+        self_sg->_flag[blockIdx.x] = flag;
 #else
-    if (threadIdx.x < ngpus)
+    if(threadIdx.x < ngpus)
     {
-      // reset flag for next time
-      self_sg->end[blockIdx.x][threadIdx.x] = 0;
-      // simultaneously write to the corresponding flag of all ranks.
-      // Latency = 1 p2p write
-      sg.signals[threadIdx.x]->start[blockIdx.x][rank] = 1;
-      // wait until we got true from all ranks
-      while (!self_sg->start[blockIdx.x][threadIdx.x])
-        ;
+        // reset flag for next time
+        self_sg->end[blockIdx.x][threadIdx.x] = 0;
+        // simultaneously write to the corresponding flag of all ranks.
+        // Latency = 1 p2p write
+        sg.signals[threadIdx.x]->start[blockIdx.x][rank] = 1;
+        // wait until we got true from all ranks
+        while(!self_sg->start[blockIdx.x][threadIdx.x])
+            ;
     }
     __syncthreads();
 #endif
-  }
+}
 
-  // This function is meant to be used as the second or the final synchronization
-  // barrier in the all reduce kernel. If it's the final synchronization barrier,
-  // we don't need to make any visibility guarantees for prior memory accesses.
-  template <int ngpus, bool final_sync = false>
-  DINLINE void end_sync(const RankSignals &sg,
+// This function is meant to be used as the second or the final synchronization
+// barrier in the all reduce kernel. If it's the final synchronization barrier,
+// we don't need to make any visibility guarantees for prior memory accesses.
+template <int ngpus, bool final_sync = false>
+DINLINE void end_sync(const RankSignals& sg,
 #ifndef USE_ROCM
-                        volatile
+                      volatile
 #endif
-                        Signal *self_sg,
-                        int rank)
-  {
+                      Signal* self_sg,
+                      int rank)
+{
 #ifdef USE_ROCM
     __syncthreads();
     // eliminate the case that prior writes are not visible after signals become
@@ -241,70 +241,71 @@ namespace aiter
     // testing. Might be the case that hardware provides stronger guarantee than
     // the memory model.
     uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
-    if (threadIdx.x < ngpus)
-    {
-      // simultaneously write to the corresponding flag of all ranks.
-      // Latency = 1 p2p write
-      __scoped_atomic_store_n(&sg.signals[threadIdx.x]->end[blockIdx.x][rank],
-                              flag,
-                              final_sync ? __ATOMIC_RELAXED : __ATOMIC_RELEASE,
-                              __MEMORY_SCOPE_SYSTEM);
-      // wait until we got true from all ranks
-      while (
-          __scoped_atomic_load_n(&self_sg->end[blockIdx.x][threadIdx.x],
-                                 final_sync ? __ATOMIC_RELAXED : __ATOMIC_ACQUIRE,
-                                 __MEMORY_SCOPE_DEVICE) < flag)
-        ;
+    if(threadIdx.x < ngpus)
+    {
+        // simultaneously write to the corresponding flag of all ranks.
+        // Latency = 1 p2p write
+        __scoped_atomic_store_n(&sg.signals[threadIdx.x]->end[blockIdx.x][rank],
+                                flag,
+                                final_sync ? __ATOMIC_RELAXED : __ATOMIC_RELEASE,
+                                __MEMORY_SCOPE_SYSTEM);
+        // spin until the peer rank mapped to this thread has published an end counter >= flag
+        while(__scoped_atomic_load_n(&self_sg->end[blockIdx.x][threadIdx.x],
+                                     final_sync ? __ATOMIC_RELAXED : __ATOMIC_ACQUIRE,
+                                     __MEMORY_SCOPE_DEVICE) < flag)
+            ;
     }
     __syncthreads();
     // use one thread to update flag
-    if (threadIdx.x == 0)
-      self_sg->_flag[blockIdx.x] = flag;
+    if(threadIdx.x == 0)
+        self_sg->_flag[blockIdx.x] = flag;
 #else
     __syncthreads();
     // eliminate the case that prior writes are not visible after signals become
     // visible. Note that I did not managed to make this happen through a lot of
     // testing. Might be the case that hardware provides stronger guarantee than
     // the memory model.
-    if constexpr (!final_sync)
-      __threadfence_system();
-    if (threadIdx.x < ngpus)
-    {
-      // reset flag for next time
-      self_sg->start[blockIdx.x][threadIdx.x] = 0;
-      // simultaneously write to the corresponding flag of all ranks.
-      // Latency = 1 p2p write
-      sg.signals[threadIdx.x]->end[blockIdx.x][rank] = 1;
-      // wait until we got true from all ranks
-      while (!self_sg->end[blockIdx.x][threadIdx.x])
-        ;
-    }
-    if constexpr (!final_sync)
-      __syncthreads();
+    if constexpr(!final_sync)
+        __threadfence_system();
+    if(threadIdx.x < ngpus)
+    {
+        // reset flag for next time
+        self_sg->start[blockIdx.x][threadIdx.x] = 0;
+        // simultaneously write to the corresponding flag of all ranks.
+        // Latency = 1 p2p write
+        sg.signals[threadIdx.x]->end[blockIdx.x][rank] = 1;
+        // wait until we got true from all ranks
+        while(!self_sg->end[blockIdx.x][threadIdx.x])
+            ;
+    }
+    if constexpr(!final_sync)
+        __syncthreads();
 #endif
-  }
+}
 
-  template <typename P, int ngpus, typename A>
-  DINLINE P packed_reduce(const P *ptrs[], int idx)
-  {
+template <typename P, int ngpus, typename A>
+DINLINE P packed_reduce(const P* ptrs[], int idx)
+{
     A tmp = upcast(ptrs[0][idx]);
 #pragma unroll
-    for (int i = 1; i < ngpus; i++)
+    for(int i = 1; i < ngpus; i++)
     {
-      packed_assign_add(tmp, upcast(ptrs[i][idx]));
+        packed_assign_add(tmp, upcast(ptrs[i][idx]));
     }
     return downcast<P>(tmp);
-  }
+}
 
-  template <typename T, int ngpus>
-  __global__ void __launch_bounds__(512, 1)
-      cross_device_reduce_1stage_naive(RankData *_dp, RankSignals sg,
+template <typename T, int ngpus>
+__global__ void __launch_bounds__(512, 1) cross_device_reduce_1stage_naive(RankData* _dp,
+                                                                           RankSignals sg,
 #ifndef USE_ROCM
-                                 volatile
+                                                                           volatile
 #endif
-                                 Signal *self_sg,
-                                 T *__restrict__ result, int rank, int size)
-  {
+                                                                           Signal* self_sg,
+                                                                           T* __restrict__ result,
+                                                                           int rank,
+                                                                           int size)
+{
     using P = typename packed_t<T>::P;
     using A = typename packed_t<T>::A;
     // note: we don't reorder the address so the accumulation order is the same
@@ -312,57 +313,58 @@ namespace aiter
     auto dp = *_dp;
     start_sync<ngpus>(sg, self_sg, rank);
     // do the actual reduction
-    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
-         idx += gridDim.x * blockDim.x)
+    for(int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += gridDim.x * blockDim.x)
     {
-      ((P *)result)[idx] = packed_reduce<P, ngpus, A>((const P **)&dp.ptrs[0], idx);
+        ((P*)result)[idx] = packed_reduce<P, ngpus, A>((const P**)&dp.ptrs[0], idx);
     }
     end_sync<ngpus, true>(sg, self_sg, rank);
-  }
+}
 
-  template <typename P>
+template <typename P>
 #ifdef USE_ROCM
-  DINLINE P *get_tmp_buf(Signal *sg)
-  {
+DINLINE P* get_tmp_buf(Signal* sg)
+{
 #else
-  DINLINE P *get_tmp_buf(volatile Signal *sg)
-  {
+DINLINE P* get_tmp_buf(volatile Signal* sg)
+{
 #endif
-    return (P *)(((Signal *)sg) + 1);
-  }
+    return (P*)(((Signal*)sg) + 1);
+}
 
-  template <typename T, int ngpus>
-  __global__ void __launch_bounds__(512, 1)
-      cross_device_reduce_2stage_naive(RankData *_dp, RankSignals sg,
+template <typename T, int ngpus>
+__global__ void __launch_bounds__(512, 1) cross_device_reduce_2stage_naive(RankData* _dp,
+                                                                           RankSignals sg,
 #ifndef USE_ROCM
-                                 volatile
+                                                                           volatile
 #endif
-                                 Signal *self_sg,
-                                 T *__restrict__ result, int rank, int size)
-  {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    int stride = gridDim.x * blockDim.x;
-    using P = typename packed_t<T>::P;
-    using A = typename packed_t<T>::A;
-    int part = size / ngpus;
-    int start = rank * part;
-    int end = rank == ngpus - 1 ? size : start + part;
+                                                                           Signal* self_sg,
+                                                                           T* __restrict__ result,
+                                                                           int rank,
+                                                                           int size)
+{
+    int tid          = blockIdx.x * blockDim.x + threadIdx.x;
+    int stride       = gridDim.x * blockDim.x;
+    using P          = typename packed_t<T>::P;
+    using A          = typename packed_t<T>::A;
+    int part         = size / ngpus;
+    int start        = rank * part;
+    int end          = rank == ngpus - 1 ? size : start + part;
     int largest_part = part + size % ngpus;
-    const P *ptrs[ngpus];
-    P *tmps[ngpus];
+    const P* ptrs[ngpus];
+    P* tmps[ngpus];
 #pragma unroll
-    for (int i = 0; i < ngpus; i++)
+    for(int i = 0; i < ngpus; i++)
     {
-      int target = (rank + i) % ngpus;
-      ptrs[i] = (const P *)_dp->ptrs[target];
-      tmps[i] = get_tmp_buf<P>(sg.signals[target]);
+        int target = (rank + i) % ngpus;
+        ptrs[i]    = (const P*)_dp->ptrs[target];
+        tmps[i]    = get_tmp_buf<P>(sg.signals[target]);
     }
     auto tmp_out = tmps[0];
     start_sync<ngpus>(sg, self_sg, rank);
     // stage 1: reduce scatter
-    for (int idx = start + tid; idx < end; idx += stride)
+    for(int idx = start + tid; idx < end; idx += stride)
     {
-      tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
+        tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
     }
     end_sync<ngpus>(sg, self_sg, rank);
 
@@ -371,36 +373,38 @@ namespace aiter
     // between threads that have the same tid. If thread i computes the sum of
     // start + i in the first stage, then thread i also gathers start + i from all
     // ranks.
-    for (int idx = tid; idx < largest_part; idx += stride)
+    for(int idx = tid; idx < largest_part; idx += stride)
     {
 #pragma unroll
-      for (int i = 0; i < ngpus; i++)
-      {
-        int gather_from_rank = ((rank + i) % ngpus);
-        if (gather_from_rank == ngpus - 1 || idx < part)
+        for(int i = 0; i < ngpus; i++)
         {
-          int dst_idx = gather_from_rank * part + idx;
-          ((P *)result)[dst_idx] = tmps[i][idx];
+            int gather_from_rank = ((rank + i) % ngpus);
+            if(gather_from_rank == ngpus - 1 || idx < part)
+            {
+                int dst_idx           = gather_from_rank * part + idx;
+                ((P*)result)[dst_idx] = tmps[i][idx];
+            }
         }
-      }
     }
-  }
+}
 
 #define THREAD_NUM 512
 
-  template <typename T, int ngpus>
-  __global__ void __launch_bounds__(512, 1)
-      cross_device_reduce_1stage(RankData *_dp, RankSignals sg,
+template <typename T, int ngpus>
+__global__ void __launch_bounds__(512, 1) cross_device_reduce_1stage(RankData* _dp,
+                                                                     RankSignals sg,
 #ifndef USE_ROCM
-                                 volatile
+                                                                     volatile
 #endif
-                                 Signal *self_sg,
-                                 T *__restrict__ result, int rank, int size)
-  {
-    using P = typename packed_t<T>::P;
-    using A = typename packed_t<T>::A;
+                                                                     Signal* self_sg,
+                                                                     T* __restrict__ result,
+                                                                     int rank,
+                                                                     int size)
+{
+    using P                 = typename packed_t<T>::P;
+    using A                 = typename packed_t<T>::A;
     constexpr int pack_size = packed_t<T>::P::size;
-    constexpr int tnum_gpu = THREAD_NUM / ngpus;
+    constexpr int tnum_gpu  = THREAD_NUM / ngpus;
     __shared__ T tmp_smem[tnum_gpu * ngpus * pack_size];
     // note: we don't reorder the address so the accumulation order is the same
     // for all ranks, ensuring bitwise identical results
@@ -411,109 +415,115 @@ namespace aiter
     int lane_id = threadIdx.x % tnum_gpu;
     start_sync<ngpus>(sg, self_sg, rank);
     // do the actual reduction
-    for (int idx = blockIdx.x * tnum_gpu + lane_id; idx < size;
-         idx += gridDim.x * tnum_gpu)
-    {
-      *(reinterpret_cast<P*>(&tmp_smem[0]) + threadIdx.x) = ((const P**)&dp.ptrs[0])[warp_id][idx];
-      __syncthreads();
-      if (warp_id == 0)
-      {
-        A add_reg;
-#pragma unroll
-        for (int i = 0; i < pack_size; ++i)
+    for(int idx = blockIdx.x * tnum_gpu + lane_id; idx < size; idx += gridDim.x * tnum_gpu)
+    {
+        *(reinterpret_cast<P*>(&tmp_smem[0]) + threadIdx.x) =
+            ((const P**)&dp.ptrs[0])[warp_id][idx];
+        __syncthreads();
+        if(warp_id == 0)
         {
-          add_reg.data[i] = ck_tile::type_convert<float>(tmp_smem[threadIdx.x * pack_size + i]);
-        }
-        constexpr int smem_gpu_loop_stride = tnum_gpu * pack_size;
+            A add_reg;
 #pragma unroll
-        for (int i = 1; i < ngpus; ++i)
-        {
+            for(int i = 0; i < pack_size; ++i)
+            {
+                add_reg.data[i] =
+                    ck_tile::type_convert<float>(tmp_smem[threadIdx.x * pack_size + i]);
+            }
+            constexpr int smem_gpu_loop_stride = tnum_gpu * pack_size;
 #pragma unroll
-          for (int j = 0; j < pack_size; ++j)
-          {
-            add_reg.data[j] += ck_tile::type_convert<float>(tmp_smem[smem_gpu_loop_stride * i + threadIdx.x * pack_size + j]);
-          }
-        }
-        P write_reg;
+            for(int i = 1; i < ngpus; ++i)
+            {
 #pragma unroll
-        for (int i = 0; i < pack_size; ++i)
-        {
-          write_reg.data[i] = ck_tile::type_convert<T>(add_reg.data[i]);
+                for(int j = 0; j < pack_size; ++j)
+                {
+                    add_reg.data[j] += ck_tile::type_convert<float>(
+                        tmp_smem[smem_gpu_loop_stride * i + threadIdx.x * pack_size + j]);
+                }
+            }
+            P write_reg;
+#pragma unroll
+            for(int i = 0; i < pack_size; ++i)
+            {
+                write_reg.data[i] = ck_tile::type_convert<T>(add_reg.data[i]);
+            }
+            ((P*)result)[idx] = write_reg;
         }
-        ((P *)result)[idx] = write_reg;
-      }
-      __syncthreads();
+        __syncthreads();
     }
     // maybe do not need device sync
     // end_sync<ngpus, true>(sg, self_sg, rank);
-  }
+}
 
-  template <typename T, int ngpus>
-  __global__ void __launch_bounds__(512, 1)
-      cross_device_reduce_2stage(RankData *_dp, RankSignals sg,
+template <typename T, int ngpus>
+__global__ void __launch_bounds__(512, 1) cross_device_reduce_2stage(RankData* _dp,
+                                                                     RankSignals sg,
 #ifndef USE_ROCM
-                                 volatile
+                                                                     volatile
 #endif
-                                 Signal *self_sg,
-                                 T *__restrict__ result, int rank, int size)
-  {
+                                                                     Signal* self_sg,
+                                                                     T* __restrict__ result,
+                                                                     int rank,
+                                                                     int size)
+{
     constexpr int pack_size = packed_t<T>::P::size;
-    constexpr int tnum_gpu = THREAD_NUM / ngpus;
-    using P = typename packed_t<T>::P;
-    using A = typename packed_t<T>::A;
+    constexpr int tnum_gpu  = THREAD_NUM / ngpus;
+    using P                 = typename packed_t<T>::P;
+    using A                 = typename packed_t<T>::A;
     __shared__ T tmp_smem[tnum_gpu * ngpus * pack_size];
-    int warp_id = threadIdx.x / tnum_gpu;
-    int lane_id = threadIdx.x % tnum_gpu;
-    int tid = blockIdx.x * tnum_gpu + lane_id;
-    int stride = gridDim.x * tnum_gpu;
-    int part = size / ngpus;
-    int start = rank * part;
-    int end = rank == ngpus - 1 ? size : start + part;
+    int warp_id      = threadIdx.x / tnum_gpu;
+    int lane_id      = threadIdx.x % tnum_gpu;
+    int tid          = blockIdx.x * tnum_gpu + lane_id;
+    int stride       = gridDim.x * tnum_gpu;
+    int part         = size / ngpus;
+    int start        = rank * part;
+    int end          = rank == ngpus - 1 ? size : start + part;
     int largest_part = part + size % ngpus;
-    const P *ptrs[ngpus];
-    P *tmps[ngpus];
+    const P* ptrs[ngpus];
+    P* tmps[ngpus];
 #pragma unroll
-    for (int i = 0; i < ngpus; i++)
+    for(int i = 0; i < ngpus; i++)
     {
-      int target = (rank + i) % ngpus;
-      ptrs[i] = (const P *)_dp->ptrs[target];
-      tmps[i] = get_tmp_buf<P>(sg.signals[target]);
+        int target = (rank + i) % ngpus;
+        ptrs[i]    = (const P*)_dp->ptrs[target];
+        tmps[i]    = get_tmp_buf<P>(sg.signals[target]);
     }
     auto tmp_out = tmps[0];
     start_sync<ngpus>(sg, self_sg, rank);
     // stage 1: reduce scatter
-    for (int idx = start + tid; idx < end; idx += stride)
-    {
-      *(reinterpret_cast<P*>(&tmp_smem[0]) + threadIdx.x) = ptrs[warp_id][idx];
-      __syncthreads();
-      // cal add in first 64 threads
-      if (warp_id == 0)
-      {
-        A add_reg;
-#pragma unroll
-        for (int i = 0; i < pack_size; ++i)
+    for(int idx = start + tid; idx < end; idx += stride)
+    {
+        *(reinterpret_cast<P*>(&tmp_smem[0]) + threadIdx.x) = ptrs[warp_id][idx];
+        __syncthreads();
+        // sum in the first tnum_gpu threads (warp_id == 0), i.e. 64 threads when ngpus == 8
+        if(warp_id == 0)
         {
-          add_reg.data[i] = ck_tile::type_convert<float>(tmp_smem[pack_size * threadIdx.x + i]);
-        }
-        constexpr int smem_gpu_loop_stride = tnum_gpu * pack_size;
+            A add_reg;
 #pragma unroll
-        for (int i = 1; i < ngpus; ++i)
-        {
+            for(int i = 0; i < pack_size; ++i)
+            {
+                add_reg.data[i] =
+                    ck_tile::type_convert<float>(tmp_smem[pack_size * threadIdx.x + i]);
+            }
+            constexpr int smem_gpu_loop_stride = tnum_gpu * pack_size;
 #pragma unroll
-          for (int j = 0; j < pack_size; ++j)
-          {
-            add_reg.data[j] += ck_tile::type_convert<float>(tmp_smem[i * smem_gpu_loop_stride + pack_size * threadIdx.x + j]);
-          }
-        }
-        P write_reg;
+            for(int i = 1; i < ngpus; ++i)
+            {
 #pragma unroll
-        for (int i = 0; i < pack_size; ++i)
-        {
-          write_reg.data[i] = ck_tile::type_convert<T>(add_reg.data[i]);
+                for(int j = 0; j < pack_size; ++j)
+                {
+                    add_reg.data[j] += ck_tile::type_convert<float>(
+                        tmp_smem[i * smem_gpu_loop_stride + pack_size * threadIdx.x + j]);
+                }
+            }
+            P write_reg;
+#pragma unroll
+            for(int i = 0; i < pack_size; ++i)
+            {
+                write_reg.data[i] = ck_tile::type_convert<T>(add_reg.data[i]);
+            }
+            tmp_out[idx - start] = write_reg;
         }
-        tmp_out[idx - start] = write_reg;
-      }
-      __syncthreads();
+        __syncthreads();
     }
     end_sync<ngpus>(sg, self_sg, rank);
 
@@ -522,734 +532,706 @@ namespace aiter
     // between threads that have the same tid. If thread i computes the sum of
     // start + i in the first stage, then thread i also gathers start + i from all
     // ranks.
-    for (int idx = tid; idx < largest_part; idx += stride)
-    {
-        int dst_idx = (warp_id + rank) % ngpus * part + idx;
-        ((P *)result)[dst_idx] = tmps[warp_id][idx];
-    }
-  }
-
-  /*
-   * naive allgather
-   * for case: input(1345,)
-   * */
-  template <typename T, int ngpus>
-  __global__ void __launch_bounds__(512, 1) allgather_naive(
-      RankData* _dp,
-      RankSignals sg,
-      Signal* self_sg,
-      T* __restrict__ result,
-      int rank,
-      int size
-  )
-  {
+    for(int idx = tid; idx < largest_part; idx += stride)
+    {
+        int dst_idx           = (warp_id + rank) % ngpus * part + idx;
+        ((P*)result)[dst_idx] = tmps[warp_id][idx];
+    }
+}
+
+/*
+ * naive allgather
+ * for case: input(1345,)
+ * */
+template <typename T, int ngpus>
+__global__ void __launch_bounds__(512, 1) allgather_naive(
+    RankData* _dp, RankSignals sg, Signal* self_sg, T* __restrict__ result, int rank, int size)
+{
     constexpr int tnum_gpu = THREAD_NUM / ngpus;
-    int warp_id = threadIdx.x / tnum_gpu;
-    int lane_id = threadIdx.x % tnum_gpu;
-    int tid = blockIdx.x * tnum_gpu + lane_id;
-    int stride = gridDim.x * tnum_gpu;
+    int warp_id            = threadIdx.x / tnum_gpu;
+    int lane_id            = threadIdx.x % tnum_gpu;
+    int tid                = blockIdx.x * tnum_gpu + lane_id;
+    int stride             = gridDim.x * tnum_gpu;
     const T* ptrs[ngpus];
 
 #pragma unroll
-    for (int i = 0; i < ngpus; ++i)
+    for(int i = 0; i < ngpus; ++i)
     {
-      ptrs[i] = (const T*)_dp->ptrs[i];
+        ptrs[i] = (const T*)_dp->ptrs[i];
     }
     start_sync<ngpus>(sg, self_sg, rank);
 
-    for (int idx = tid; idx < size; idx += stride)
+    for(int idx = tid; idx < size; idx += stride)
     {
-      int write_idx = warp_id * size + idx;
-      result[write_idx] = ptrs[warp_id][idx];
+        int write_idx     = warp_id * size + idx;
+        result[write_idx] = ptrs[warp_id][idx];
     }
-  }
+}
 
-  template <typename T, int ngpus>
-  __global__ void __launch_bounds__(512, 1) allgather_vec(
-      RankData* _dp,
-      RankSignals sg,
-      Signal* self_sg,
-      T* __restrict__ result,
-      int rank,
-      int size
-  )
-  {
+template <typename T, int ngpus>
+__global__ void __launch_bounds__(512, 1) allgather_vec(
+    RankData* _dp, RankSignals sg, Signal* self_sg, T* __restrict__ result, int rank, int size)
+{
     constexpr int tnum_gpu = THREAD_NUM / ngpus;
-    using P = typename packed_t<T>::P;
-    int warp_id = threadIdx.x / tnum_gpu;
-    int lane_id = threadIdx.x % tnum_gpu;
-    int tid = blockIdx.x * tnum_gpu + lane_id;
-    int stride = gridDim.x * tnum_gpu;
+    using P                = typename packed_t<T>::P;
+    int warp_id            = threadIdx.x / tnum_gpu;
+    int lane_id            = threadIdx.x % tnum_gpu;
+    int tid                = blockIdx.x * tnum_gpu + lane_id;
+    int stride             = gridDim.x * tnum_gpu;
     const P* ptrs[ngpus];
 
 #pragma unroll
-    for (int i = 0; i < ngpus; ++i)
+    for(int i = 0; i < ngpus; ++i)
     {
-      ptrs[i] = (const P*)_dp->ptrs[i];
+        ptrs[i] = (const P*)_dp->ptrs[i];
     }
     start_sync<ngpus>(sg, self_sg, rank);
 
-    for (int idx = tid; idx < size; idx += stride)
-    {
-      int write_idx = warp_id * size + idx;
-      *(reinterpret_cast<P*>(&result[0]) + write_idx) = ptrs[warp_id][idx];
-    }
-  }
-
-  /*
-   * reduce_scatter, at first dim
-   * range = size / (pack_size * ngpu)
-   * for case:
-   *  input:(ngpus * n) -> output:(n)
-   *  input:(ngpus * m, n, ...) -> output(m, n, ...)
-   * cond: size % (pack_size * ngpus) == 0
-   * */
-  template <typename T, int ngpus>
-  __global__ void __launch_bounds__(512, 1) reduce_scatter_first_dim(
-      RankData *_dp,
-      RankSignals sg,
-      Signal *self_sg,
-      T *__restrict__ result,
-      int rank,
-      int range
-  )
-  {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    for(int idx = tid; idx < size; idx += stride)
+    {
+        int write_idx                                   = warp_id * size + idx;
+        *(reinterpret_cast<P*>(&result[0]) + write_idx) = ptrs[warp_id][idx];
+    }
+}
+
+/*
+ * reduce_scatter, at first dim
+ * range = size / (pack_size * ngpus)
+ * for case:
+ *  input:(ngpus * n) -> output:(n)
+ *  input:(ngpus * m, n, ...) -> output:(m, n, ...)
+ * cond: size % (pack_size * ngpus) == 0
+ * */
+template <typename T, int ngpus>
+__global__ void __launch_bounds__(512, 1) reduce_scatter_first_dim(
+    RankData* _dp, RankSignals sg, Signal* self_sg, T* __restrict__ result, int rank, int range)
+{
+    int tid    = blockIdx.x * blockDim.x + threadIdx.x;
     int stride = blockDim.x * gridDim.x;
-    using P = typename packed_t<T>::P;
-    using A = typename packed_t<T>::A;
-    const P *ptrs[ngpus];
+    using P    = typename packed_t<T>::P;
+    using A    = typename packed_t<T>::A;
+    const P* ptrs[ngpus];
 #pragma unroll
-    for (int i = 0; i < ngpus; i++)
+    for(int i = 0; i < ngpus; i++)
     {
-      int target = (rank + i) % ngpus;
-      ptrs[i] = (const P *)_dp->ptrs[target];
+        int target = (rank + i) % ngpus;
+        ptrs[i]    = (const P*)_dp->ptrs[target];
     }
     start_sync<ngpus>(sg, self_sg, rank);
 
-    for (int idx = tid; idx < range; idx += stride)
+    for(int idx = tid; idx < range; idx += stride)
     {
-      int load_index = rank * range + idx;
-      int store_index = idx;
-      *(reinterpret_cast<P*>(result) + store_index) = packed_reduce<P, ngpus, A>(ptrs, load_index);
+        int load_index  = rank * range + idx;
+        int store_index = idx;
+        *(reinterpret_cast<P*>(result) + store_index) =
+            packed_reduce<P, ngpus, A>(ptrs, load_index);
     }
-  }
+}
 
-  // fp8 quant all-reduce code start
-  template <typename T>
-  struct Fp16Filter
-  {
+// fp8 quant all-reduce code start
+template <typename T>
+struct Fp16Filter
+{
     static const bool value = false;
-  };
+};
 
-  template <>
-  struct Fp16Filter<half>
-  {
+template <>
+struct Fp16Filter<half>
+{
     static const bool value = true;
-  };
+};
 
-  template <typename T>
-  struct Bf16Filter
-  {
+template <typename T>
+struct Bf16Filter
+{
     static const bool value = false;
-  };
+};
 
-  template <>
-  struct Bf16Filter<__hip_bfloat16>
-  {
+template <>
+struct Bf16Filter<__hip_bfloat16>
+{
     static const bool value = true;
-  };
+};
 
-  // dtypes only support half and bf16 now
-#define FP16_FILTER \
-  typename std::enable_if<Fp16Filter<T>::value, void>::type* = nullptr
+// only half and bf16 dtypes are supported for now
+#define FP16_FILTER typename std::enable_if<Fp16Filter<T>::value, void>::type* = nullptr
 
-#define BF16_FILTER \
-  typename std::enable_if<Bf16Filter<T>::value, void>::type* = nullptr
+#define BF16_FILTER typename std::enable_if<Bf16Filter<T>::value, void>::type* = nullptr
 
-  template <template <typename> class functor, typename T, int size>
-  DINLINE T packReduce(array_t<T, size> pack)
-  {
-    auto op = functor<T>();
+template <template <typename> class functor, typename T, int size>
+DINLINE T packReduce(array_t<T, size> pack)
+{
+    auto op   = functor<T>();
     T ret_val = pack.data[0];
 #pragma unroll
-    for (int i = 1; i < size; ++i)
+    for(int i = 1; i < size; ++i)
     {
-      ret_val = op(ret_val, pack.data[i]);
+        ret_val = op(ret_val, pack.data[i]);
     }
     return ret_val;
-  }
+}
 
-  template <template<typename> class functor, typename T, int size>
-  DINLINE array_t<T, size> packOp(array_t<T, size> a, array_t<T, size> b)
-  {
+template <template <typename> class functor, typename T, int size>
+DINLINE array_t<T, size> packOp(array_t<T, size> a, array_t<T, size> b)
+{
     auto op = functor<T>();
     array_t<T, size> ret_pack;
 #pragma unroll
-    for (int i = 0; i < size; ++i)
+    for(int i = 0; i < size; ++i)
     {
-      ret_pack.data[i] = op(a.data[i], b.data[i]);
+        ret_pack.data[i] = op(a.data[i], b.data[i]);
     }
     return ret_pack;
-  }
+}
 
-  template <typename T>
-  struct AddFunctor
-  {
-    DINLINE T operator() (T a, T b)
-    {
-      return a + b;
-    }
-  };
+template <typename T>
+struct AddFunctor
+{
+    DINLINE T operator()(T a, T b) { return a + b; }
+};
 
-  template <>
-  struct AddFunctor<half>
-  {
-    DINLINE half operator() (half a, half b)
+template <>
+struct AddFunctor<half> // half specialization: the addition itself is done in fp32
+{
+    DINLINE half operator()(half a, half b)
     {
-      float a_fp32 = ck_tile::type_convert<float>(a);
-      float b_fp32 = ck_tile::type_convert<float>(b);
-      return ck_tile::type_convert<half>(a_fp32 + b_fp32);
+        float a_fp32 = ck_tile::type_convert<float>(a);
+        float b_fp32 = ck_tile::type_convert<float>(b);
+        return ck_tile::type_convert<half>(a_fp32 + b_fp32); // convert the fp32 sum back to half
     }
-  };
+};
 
-  template <>
-  struct AddFunctor<__hip_bfloat16>
-  {
-    DINLINE __hip_bfloat16 operator() (__hip_bfloat16 a, __hip_bfloat16 b)
+template <>
+struct AddFunctor<__hip_bfloat16> // bf16 specialization: the addition itself is done in fp32
+{
+    DINLINE __hip_bfloat16 operator()(__hip_bfloat16 a, __hip_bfloat16 b)
     {
-      float a_fp32 = ck_tile::type_convert<float>(a);
-      float b_fp32 = ck_tile::type_convert<float>(b);
-      return ck_tile::type_convert<__hip_bfloat16>(a_fp32 + b_fp32);
+        float a_fp32 = ck_tile::type_convert<float>(a);
+        float b_fp32 = ck_tile::type_convert<float>(b);
+        return ck_tile::type_convert<__hip_bfloat16>(a_fp32 + b_fp32); // fp32 sum back to bf16
     }
-  };
+};
 
-  template <typename T>
-  struct MaxFunctor
-  {
-    DINLINE T operator() (T a, T b)
-    {
-      return max(a, b);
-    }
-  };
+template <typename T>
+struct MaxFunctor // binary functor: forwards to max(a, b)
+{
+    DINLINE T operator()(T a, T b) { return max(a, b); }
+};
 
-  /*
-   * todo:
-   * static_cast may not safe
-   * need a convert dtype template function defined by myself
-   *
-   * done
-   * */
-  template <typename T>
-  struct AbsMaxFunctor
-  {
-    DINLINE T operator() (T a, T b)
+/*
+ * Binary functor returning max(|a|, |b|).
+ * Former todo (marked done): static_cast may not be safe for these types, so a
+ * dtype-conversion template is needed; zero_t below uses ck_tile::type_convert.
+ *
+ * done
+ * */
+template <typename T>
+struct AbsMaxFunctor
+{
+    DINLINE T operator()(T a, T b)
     {
-      T zero_t = ck_tile::type_convert<T>(0.0f);
-      a = a > zero_t ? a : zero_t - a;
-      b = b > zero_t ? b : zero_t - b;
-      return max(a, b);
+        T zero_t = ck_tile::type_convert<T>(0.0f);
+        a        = a > zero_t ? a : zero_t - a; // |a|, written as 0 - a for non-positive a
+        b        = b > zero_t ? b : zero_t - b; // |b|, same construction
+        return max(a, b);
     }
-  };
+};
 
-  template <template <typename> class functor, typename T, int reduce_range>
-  DINLINE T warpReduce(T val)
-  {
+template <template <typename> class functor, typename T, int reduce_range>
+DINLINE T warpReduce(T val) // combines val across lanes with functor<T> via __shfl_xor
+{
     auto op = functor<T>();
 #pragma unroll
-    for (int stride = reduce_range / 2; stride > 0; stride >>= 1)
+    for(int stride = reduce_range / 2; stride > 0; stride >>= 1) // XOR distance halves each step
     {
-      T tmp = __shfl_xor(val, stride, reduce_range);
-      val = op(val, tmp);
+        T tmp = __shfl_xor(val, stride, reduce_range); // partner lane = lane ^ stride, width reduce_range
+        val   = op(val, tmp);
     }
     return val;
-  }
-
-  // the following code only support bf16 and fp16
-  template <typename T>
-  DINLINE hip_fp8 elementQuant(T input, T scale_functor)
-  {
-    return hip_fp8(ck_tile::type_convert<float>(input) / ck_tile::type_convert<float>(scale_functor));
-  }
-
-  template <typename T>
-  DINLINE T elementDequant(hip_fp8 input, T scale_functor)
-  {
+}
+
+// the following code only supports bf16 and fp16
+template <typename T>
+DINLINE hip_fp8 elementQuant(T input, T scale_functor) // fp8(input / scale), divided in fp32
+{
+    return hip_fp8(ck_tile::type_convert<float>(input) /
+                   ck_tile::type_convert<float>(scale_functor));
+}
+
+template <typename T>
+DINLINE T elementDequant(hip_fp8 input, T scale_functor) // input * scale in fp32, converted to T
+{
     return ck_tile::type_convert<T>(float(input) * ck_tile::type_convert<float>(scale_functor));
-  }
+}
 
-  template <typename T, int pack_size>
-  DINLINE array_t<hip_fp8, pack_size> packQuant(array_t<T, pack_size> inp_pack, T scale_functor)
-  {
+template <typename T, int pack_size> // quantizes a whole pack to fp8 with elementQuant
+DINLINE array_t<hip_fp8, pack_size> packQuant(array_t<T, pack_size> inp_pack, T scale_functor)
+{
     array_t<hip_fp8, pack_size> ret_val;
 #pragma unroll
-    for (int i = 0; i < pack_size; ++i)
+    for(int i = 0; i < pack_size; ++i)
     {
-      ret_val.data[i] = elementQuant<T>(inp_pack.data[i], scale_functor);
+        ret_val.data[i] = elementQuant<T>(inp_pack.data[i], scale_functor); // same scale for all i
     }
     return ret_val;
-  }
+}
 
-  template <typename T, int pack_size>
-  DINLINE array_t<T, pack_size> packDequant(array_t<hip_fp8, pack_size> inp_pack, T scale_functor)
-  {
+template <typename T, int pack_size> // converts an fp8 pack back to T with elementDequant
+DINLINE array_t<T, pack_size> packDequant(array_t<hip_fp8, pack_size> inp_pack, T scale_functor)
+{
     array_t<T, pack_size> ret_val;
 #pragma unroll
-    for (int i = 0; i < pack_size; ++i)
+    for(int i = 0; i < pack_size; ++i)
     {
-      ret_val.data[i] = elementDequant<T>(inp_pack.data[i], scale_functor);
+        ret_val.data[i] = elementDequant<T>(inp_pack.data[i], scale_functor); // same scale for all i
     }
     return ret_val;
-  }
+}
 
-  // convert fp16 pack to fp32 pack
-  template <typename T, int pack_size>
-  DINLINE array_t<float, pack_size> packUpcast(array_t<T, pack_size> inp)
-  {
+// convert a pack of T elements to an fp32 pack, element by element
+template <typename T, int pack_size>
+DINLINE array_t<float, pack_size> packUpcast(array_t<T, pack_size> inp)
+{
     array_t<float, pack_size> ret_val;
 #pragma unroll
-    for (int i = 0; i < pack_size; ++i)
+    for(int i = 0; i < pack_size; ++i)
     {
-      ret_val.data[i] = ck_tile::type_convert<float>(inp.data[i]);
+        ret_val.data[i] = ck_tile::type_convert<float>(inp.data[i]);
     }
     return ret_val;
-  }
+}
 
-  template <typename T, int pack_size>
-  DINLINE array_t<T, pack_size> packDowncast(array_t<float, pack_size> inp)
-  {
+template <typename T, int pack_size>
+DINLINE array_t<T, pack_size> packDowncast(array_t<float, pack_size> inp) // fp32 pack -> T pack
+{
     array_t<T, pack_size> ret_val;
 #pragma unroll
-    for (int i = 0; i < pack_size; ++i)
+    for(int i = 0; i < pack_size; ++i)
     {
-      ret_val.data[i] = ck_tile::type_convert<T>(inp.data[i]);
+        ret_val.data[i] = ck_tile::type_convert<T>(inp.data[i]);
     }
     return ret_val;
-  }
+}
 
-
-  template <typename T, int pack_size, int ngpus>
-  DINLINE array_t<T, pack_size> multiGPUPackReduce(const array_t<T, pack_size> *ptrs[ngpus], int index)
-  {
+template <typename T, int pack_size, int ngpus> // sums pack `index` of all ngpus buffers
+DINLINE array_t<T, pack_size> multiGPUPackReduce(const array_t<T, pack_size>* ptrs[ngpus],
+                                                 int index)
+{
     array_t<float, pack_size> ret_val = packUpcast<T, pack_size>(ptrs[0][index]);
 #pragma unroll
-    for (int gpu_id = 1; gpu_id < ngpus; ++gpu_id)
+    for(int gpu_id = 1; gpu_id < ngpus; ++gpu_id) // buffer 0 already seeds ret_val
     {
-      array_t<float, pack_size> tmp = packUpcast<T, pack_size>(ptrs[gpu_id][index]);
+        array_t<float, pack_size> tmp = packUpcast<T, pack_size>(ptrs[gpu_id][index]);
 #pragma unroll
-      for (int i = 0; i < pack_size; ++i)
-      {
-        ret_val.data[i] += tmp.data[i];
-      }
+        for(int i = 0; i < pack_size; ++i)
+        {
+            ret_val.data[i] += tmp.data[i]; // accumulate in fp32; converted to T once at the end
+        }
     }
     return packDowncast<T, pack_size>(ret_val);
-  }
-
-  // bf16 quant fp8 kernel function
-  // too slow need to be optimized
-  // fp16
-  template <typename T, int quant_scale, int pack_size, int ngpus, FP16_FILTER>
-  __global__ __forceinline__ void __launch_bounds__(512, 1) allReduceQuantFp8(RankData* _dp, RankSignals sg, Signal* self_sg, T* __restrict__ result, int rank, int size)
-  {
+}
+
+// all-reduce kernel that quantizes the reduced data to fp8 for the all-gather stage
+// too slow, needs to be optimized
+// fp16 variant (enabled through FP16_FILTER)
+template <typename T, int quant_scale, int pack_size, int ngpus, FP16_FILTER>
+__global__ __forceinline__ void __launch_bounds__(512, 1) allReduceQuantFp8(
+    RankData* _dp, RankSignals sg, Signal* self_sg, T* __restrict__ result, int rank, int size)
+{
     float FP8_UPBOUND = ck_tile::type_convert<float>(ck_tile::numeric<ck_tile::fp8_t>::max());
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    int stride = gridDim.x * blockDim.x;
-    using inp_pack = array_t<T, pack_size>;
-    using fp8_pack = array_t<hip_fp8, pack_size>;
-    int part = size / ngpus;
-    int start = rank * part;
-    int end = rank == ngpus - 1 ? size : start + part;
-    int largest_part = part + size % ngpus;
-    const inp_pack *ptrs[ngpus];
-    fp8_pack *tmps[ngpus];
+    int tid           = blockIdx.x * blockDim.x + threadIdx.x; // thread index within the grid
+    int stride        = gridDim.x * blockDim.x; // total number of threads in the grid
+    using inp_pack    = array_t<T, pack_size>;
+    using fp8_pack    = array_t<hip_fp8, pack_size>;
+    int part          = size / ngpus; // packs per rank (size is used as an inp_pack count below)
+    int start         = rank * part;
+    int end           = rank == ngpus - 1 ? size : start + part; // last rank also takes the remainder
+    int largest_part  = part + size % ngpus; // share of the last rank
+    const inp_pack* ptrs[ngpus];
+    fp8_pack* tmps[ngpus];
 #pragma unroll
-    for (int i = 0; i < ngpus; i++)
+    for(int i = 0; i < ngpus; i++)
     {
-      int target = (rank + i) % ngpus;
-      ptrs[i] = (const inp_pack *)_dp->ptrs[target];
-      tmps[i] = get_tmp_buf<fp8_pack>(sg.signals[target]);
+        int target = (rank + i) % ngpus; // rotated so that index 0 refers to this rank
+        ptrs[i]    = (const inp_pack*)_dp->ptrs[target];
+        tmps[i]    = get_tmp_buf<fp8_pack>(sg.signals[target]);
     }
     auto tmp_out = tmps[0];
     start_sync<ngpus>(sg, self_sg, rank);
     // stage 1: reduce scatter
-    for (int idx = start + tid; idx < end; idx += stride)
-    {
-      inp_pack half8_reg;
-      // half8_reg = packed_reduce<P, ngpus, A>(ptrs, idx);
-      half8_reg = multiGPUPackReduce<T, pack_size, ngpus>(ptrs, idx);
-      ((inp_pack *)result)[idx] = half8_reg;
-      // quant
-      T thread_max = packReduce<AbsMaxFunctor, T, pack_size>(half8_reg);
-      thread_max = warpReduce<MaxFunctor, T, quant_scale / pack_size>(thread_max);
-      T scale_factor = ck_tile::type_convert<T>(ck_tile::type_convert<float>(thread_max) / FP8_UPBOUND);
-      tmp_out[idx - start] = packQuant<T, pack_size>(half8_reg, scale_factor);
-      if (threadIdx.x % (quant_scale / pack_size) == 0)
-      {
-        *(reinterpret_cast<T*>(&tmp_out[part]) + (idx - start) / (quant_scale / pack_size)) = scale_factor;
-      }
+    for(int idx = start + tid; idx < end; idx += stride)
+    {
+        inp_pack half8_reg;
+        // half8_reg = packed_reduce<P, ngpus, A>(ptrs, idx);
+        half8_reg                = multiGPUPackReduce<T, pack_size, ngpus>(ptrs, idx);
+        ((inp_pack*)result)[idx] = half8_reg; // this rank's slice is written unquantized
+        // quant
+        T thread_max = packReduce<AbsMaxFunctor, T, pack_size>(half8_reg);
+        thread_max   = warpReduce<MaxFunctor, T, quant_scale / pack_size>(thread_max);
+        T scale_factor =
+            ck_tile::type_convert<T>(ck_tile::type_convert<float>(thread_max) / FP8_UPBOUND);
+        tmp_out[idx - start] = packQuant<T, pack_size>(half8_reg, scale_factor);
+        if(threadIdx.x % (quant_scale / pack_size) == 0) // one lane per group stores the scale
+        {
+            *(reinterpret_cast<T*>(&tmp_out[part]) + (idx - start) / (quant_scale / pack_size)) =
+                scale_factor; // scales are stored starting at pack offset `part` of the tmp buffer
+        }
     }
     end_sync<ngpus>(sg, self_sg, rank);
 
     // stage 2: all-gather
-    for (int idx = tid; idx < largest_part; idx += stride)
+    for(int idx = tid; idx < largest_part; idx += stride)
     {
 #pragma unroll
-      for (int i = 1; i < ngpus; i++)
-      {
-        int gather_from_rank = ((rank + i) % ngpus);
-        if (gather_from_rank == ngpus - 1 || idx < part)
+        for(int i = 1; i < ngpus; i++) // i = 0 is this rank, already written in stage 1
         {
-          // dequant
-          T scale_factor;
-          int factor_stride = quant_scale / pack_size;
-          if (threadIdx.x % factor_stride == 0)
-          {
-            scale_factor = *(reinterpret_cast<T*>(&tmps[i][part]) + idx / factor_stride);
-          }
-          scale_factor = __shfl(scale_factor, (threadIdx.x / factor_stride) * factor_stride);
-          inp_pack half8_reg = packDequant<T, pack_size>(tmps[i][idx], scale_factor);
-          int dst_idx = gather_from_rank * part + idx;
-          ((inp_pack *)result)[dst_idx] = half8_reg;
+            int gather_from_rank = ((rank + i) % ngpus);
+            if(gather_from_rank == ngpus - 1 || idx < part) // only the last rank has idx >= part
+            {
+                // dequant
+                T scale_factor;
+                int factor_stride = quant_scale / pack_size;
+                if(threadIdx.x % factor_stride == 0) // only the first lane of a group loads it
+                {
+                    scale_factor = *(reinterpret_cast<T*>(&tmps[i][part]) + idx / factor_stride);
+                }
+                scale_factor = __shfl(scale_factor, (threadIdx.x / factor_stride) * factor_stride);
+                inp_pack half8_reg = packDequant<T, pack_size>(tmps[i][idx], scale_factor);
+                int dst_idx        = gather_from_rank * part + idx;
+                ((inp_pack*)result)[dst_idx] = half8_reg;
+            }
         }
-      }
-    }
-  }
-
-  // fused allreduce rmsnorm first step
-  template <typename T, int ngpus>
-  __global__ void __launch_bounds__(512, 1) reduce_scatter_cross_device_store(
-      RankData* _dp,
-      RankSignals sg,
-      Signal* self_sg,
-      int rank,
-      int size
-  )
-  {
+    }
+}
+
+// fused allreduce rmsnorm first step: sum this rank's slice, store it to every rank's tmp buffer
+template <typename T, int ngpus>
+__global__ void __launch_bounds__(512, 1) reduce_scatter_cross_device_store(
+    RankData* _dp, RankSignals sg, Signal* self_sg, int rank, int size)
+{
     constexpr int pack_size = packed_t<T>::P::size;
-    constexpr int tnum_gpu = THREAD_NUM / ngpus;
-    using P = typename packed_t<T>::P;
-    using A = typename packed_t<T>::A;
+    constexpr int tnum_gpu  = THREAD_NUM / ngpus; // threads per group; one group per rank
+    using P                 = typename packed_t<T>::P;
+    using A                 = typename packed_t<T>::A;
     __shared__ T tmp_smem[tnum_gpu * ngpus * pack_size];
     int warp_id = threadIdx.x / tnum_gpu;
     int lane_id = threadIdx.x % tnum_gpu;
-    int tid = blockIdx.x * tnum_gpu + lane_id;
+    int tid     = blockIdx.x * tnum_gpu + lane_id; // pack index; the same for every group
     const P* ptrs[ngpus];
     P* tmps[ngpus];
 #pragma unroll
-    for (int i = 0; i < ngpus; ++i)
+    for(int i = 0; i < ngpus; ++i)
     {
-      ptrs[i] = (const P*)_dp->ptrs[i];
-      tmps[i] = get_tmp_buf<P>(sg.signals[i]);
+        ptrs[i] = (const P*)_dp->ptrs[i];
+        tmps[i] = get_tmp_buf<P>(sg.signals[i]);
     }
     start_sync<ngpus>(sg, self_sg, rank);
 
     int part = size / (pack_size * ngpus);
-    for (int idx = tid; idx < part; idx += gridDim.x * tnum_gpu)
-    {
-      // cross device read by all warp
-      P input_reg = ptrs[warp_id][rank * part + idx];
-      *(reinterpret_cast<P*>(&tmp_smem[0]) + threadIdx.x) = input_reg;
-      __syncthreads();
-      // calculate and save in first warp
-      if (warp_id == 0)
-      {
-        A add_reg;
-#pragma unroll
-        for (int i = 0; i < pack_size; ++i)
+    for(int idx = tid; idx < part; idx += gridDim.x * tnum_gpu)
+    {
+        // cross device read by all warp
+        P input_reg                                         = ptrs[warp_id][rank * part + idx];
+        *(reinterpret_cast<P*>(&tmp_smem[0]) + threadIdx.x) = input_reg;
+        __syncthreads(); // NOTE(review): barrier inside a loop bounded by tid - confirm uniform trip count
+        // calculate and save in first warp
+        if(warp_id == 0)
         {
-          add_reg.data[i] = ck_tile::type_convert<float>(tmp_smem[pack_size * threadIdx.x + i]);
-        }
+            A add_reg;
 #pragma unroll
-        for (int i = 1; i < ngpus; ++i)
-        {
+            for(int i = 0; i < pack_size; ++i)
+            {
+                add_reg.data[i] =
+                    ck_tile::type_convert<float>(tmp_smem[pack_size * threadIdx.x + i]);
+            }
 #pragma unroll
-          for (int j = 0; j < pack_size; ++j)
-          {
-            add_reg.data[j] += ck_tile::type_convert<float>(tmp_smem[i * pack_size * tnum_gpu + pack_size * threadIdx.x + j]);
-          }
-        }
-        P add_rslt;
+            for(int i = 1; i < ngpus; ++i) // add the packs staged by the other groups
+            {
 #pragma unroll
-        for (int i = 0; i < pack_size; ++i)
-        {
-          add_rslt.data[i] = ck_tile::type_convert<T>(add_reg.data[i]);
+                for(int j = 0; j < pack_size; ++j)
+                {
+                    add_reg.data[j] += ck_tile::type_convert<float>(
+                        tmp_smem[i * pack_size * tnum_gpu + pack_size * threadIdx.x + j]);
+                }
+            }
+            P add_rslt;
+#pragma unroll
+            for(int i = 0; i < pack_size; ++i)
+            {
+                add_rslt.data[i] = ck_tile::type_convert<T>(add_reg.data[i]);
+            }
+            *(reinterpret_cast<P*>(&tmp_smem[0]) + lane_id) = add_rslt; // publish the sum via smem
         }
-        *(reinterpret_cast<P*>(&tmp_smem[0]) + lane_id) = add_rslt;
-      }
-      __syncthreads();
+        __syncthreads(); // the sum must be visible before every group reads it back
 
-      // cross device store
-      P rslt = *(reinterpret_cast<P*>(&tmp_smem[0]) + lane_id);
-      tmps[warp_id][rank * part + idx] = rslt;
+        // cross device store
+        P rslt                           = *(reinterpret_cast<P*>(&tmp_smem[0]) + lane_id);
+        tmps[warp_id][rank * part + idx] = rslt; // group warp_id writes into rank warp_id's buffer
     }
     end_sync<ngpus, true>(sg, self_sg, rank);
-  }
+}
 
-  template <int reduce_range>
-  DINLINE void smemReduceSum(float* smem_addr)
-  {
+template <int reduce_range> // the tail below always folds 64 entries, so presumably reduce_range >= 64
+DINLINE void smemReduceSum(float* smem_addr) // in-place sum of smem_addr[0..reduce_range) into [0]
+{
     // a warp executes the same instruction
 #pragma unroll
-    for (int stride = reduce_range / 2; stride > 32; stride >>= 1)
+    for(int stride = reduce_range / 2; stride > 32; stride >>= 1) // halve until 64 entries remain
     {
-      if (threadIdx.x < stride)
-      {
-        smem_addr[threadIdx.x] += smem_addr[threadIdx.x + stride];
-      }
-      __syncthreads();
+        if(threadIdx.x < stride)
+        {
+            smem_addr[threadIdx.x] += smem_addr[threadIdx.x + stride];
+        }
+        __syncthreads();
     }
     volatile float* v_smem = &smem_addr[0];
-    if (threadIdx.x < 32)
+    if(threadIdx.x < 32) // last steps have no barrier between them; relies on lockstep execution
     {
-      v_smem[threadIdx.x] += v_smem[threadIdx.x + 32];
-      v_smem[threadIdx.x] += v_smem[threadIdx.x + 16];
-      v_smem[threadIdx.x] += v_smem[threadIdx.x + 8];
-      v_smem[threadIdx.x] += v_smem[threadIdx.x + 4];
-      v_smem[threadIdx.x] += v_smem[threadIdx.x + 2];
-      v_smem[threadIdx.x] += v_smem[threadIdx.x + 1];
+        v_smem[threadIdx.x] += v_smem[threadIdx.x + 32];
+        v_smem[threadIdx.x] += v_smem[threadIdx.x + 16];
+        v_smem[threadIdx.x] += v_smem[threadIdx.x + 8];
+        v_smem[threadIdx.x] += v_smem[threadIdx.x + 4];
+        v_smem[threadIdx.x] += v_smem[threadIdx.x + 2];
+        v_smem[threadIdx.x] += v_smem[threadIdx.x + 1];
     }
     __syncthreads();
-  }
-
-  /*
-   * input case n dim should be divided by 4096 with dtype bf16
-   * and should be divided by 2048 with dtype fp32
-   * */
-  template <typename T, int tnum, int n_loop>
-  __global__ void __launch_bounds__(tnum, 1) local_device_load_rmsnorm_naive(
-      RankSignals sg,
-      T* __restrict__ residual_inp,
-      T* __restrict__ residual_out,
-      T* __restrict__ results,
-      T* __restrict__ weight,
-      float eps,
-      int rank,
-      int m,
-      int n
-  )
-  {
+}
+
+/*
+ * the n dim of the input should be divisible by 4096 with dtype bf16
+ * and divisible by 2048 with dtype fp32 (no bounds checks are done on n here)
+ * */
+template <typename T, int tnum, int n_loop>
+__global__ void __launch_bounds__(tnum, 1)
+    local_device_load_rmsnorm_naive(RankSignals sg,
+                                    T* __restrict__ residual_inp, // added to the reduced value
+                                    T* __restrict__ residual_out, // receives residual + reduced value
+                                    T* __restrict__ results,      // receives the normalized output
+                                    T* __restrict__ weight,       // indexed by column only
+                                    float eps,
+                                    int rank, // selects sg.signals[rank] as the source buffer
+                                    int m,    // row count: bound of the bid loop
+                                    int n)    // divisor for the mean of squares
+{
     constexpr int pack_size = packed_t<T>::P::size;
-    using P = typename packed_t<T>::P;
-    using A = typename packed_t<T>::A;
+    using P                 = typename packed_t<T>::P;
+    using A                 = typename packed_t<T>::A;
     __shared__ float smem[tnum];
     P* tmps = get_tmp_buf<P>(sg.signals[rank]);
 
-    for (int bid = blockIdx.x; bid < m; bid += gridDim.x)
+    for(int bid = blockIdx.x; bid < m; bid += gridDim.x) // each block handles whole rows
     {
-      float square_sum = 0.0f;
-      A rms_inp_f32[n_loop];
-      P w_arr[n_loop];
-#pragma unroll
-      for (int n_iter = 0; n_iter < n_loop; ++n_iter)
-      {
-        int read_idx = bid * n_loop * blockDim.x + n_iter * blockDim.x + threadIdx.x;
-        P reduce_out_pack = tmps[read_idx];
-        P residual_inp_pack = *(reinterpret_cast<P*>(residual_inp) + read_idx);
-        w_arr[n_iter] = *(reinterpret_cast<P*>(weight) + n_iter * blockDim.x + threadIdx.x);
-        A reduce_pack;
+        float square_sum = 0.0f;
+        A rms_inp_f32[n_loop];
+        P w_arr[n_loop];
 #pragma unroll
-        for (int i = 0; i < pack_size; ++i)
+        for(int n_iter = 0; n_iter < n_loop; ++n_iter)
         {
-          float res_inp = ck_tile::type_convert<float>(residual_inp_pack.data[i]);
-          float ar_out = ck_tile::type_convert<float>(reduce_out_pack.data[i]);
-          float rms_inp = res_inp + ar_out;
-          rms_inp_f32[n_iter].data[i] = rms_inp;
-          reduce_pack.data[i] = rms_inp * rms_inp;
-        }
-        square_sum += packReduce<AddFunctor, float, pack_size>(reduce_pack);
-      }
-      smem[threadIdx.x] = square_sum;
-      __syncthreads();
-      smemReduceSum<tnum>(&smem[0]);
-      square_sum = smem[0];
-      float denom = rsqrtf(square_sum / n + eps);
+            int read_idx        = bid * n_loop * blockDim.x + n_iter * blockDim.x + threadIdx.x;
+            P reduce_out_pack   = tmps[read_idx];
+            P residual_inp_pack = *(reinterpret_cast<P*>(residual_inp) + read_idx);
+            w_arr[n_iter] = *(reinterpret_cast<P*>(weight) + n_iter * blockDim.x + threadIdx.x);
+            A reduce_pack;
 #pragma unroll
-      for (int n_iter = 0; n_iter < n_loop; ++n_iter)
-      {
-        P rmsnorm_rslt;
-        P rmsnorm_inp;
+            for(int i = 0; i < pack_size; ++i)
+            {
+                float res_inp = ck_tile::type_convert<float>(residual_inp_pack.data[i]);
+                float ar_out  = ck_tile::type_convert<float>(reduce_out_pack.data[i]);
+                float rms_inp = res_inp + ar_out;
+                rms_inp_f32[n_iter].data[i] = rms_inp; // kept in fp32 for the second pass
+                reduce_pack.data[i]         = rms_inp * rms_inp;
+            }
+            square_sum += packReduce<AddFunctor, float, pack_size>(reduce_pack);
+        }
+        smem[threadIdx.x] = square_sum; // per-thread partial sum of squares
+        __syncthreads();
+        smemReduceSum<tnum>(&smem[0]);
+        square_sum  = smem[0]; // NOTE(review): no barrier before the next row rewrites smem - confirm
+        float denom = rsqrtf(square_sum / n + eps); // 1 / sqrt(mean of squares + eps)
 #pragma unroll
-        for (int i = 0; i < pack_size; ++i)
+        for(int n_iter = 0; n_iter < n_loop; ++n_iter)
         {
-          float x_f32 = rms_inp_f32[n_iter].data[i];
-          float w_f32 = ck_tile::type_convert<float>(w_arr[n_iter].data[i]);
-          rmsnorm_inp.data[i] = ck_tile::type_convert<T>(x_f32);
-          rmsnorm_rslt.data[i] = ck_tile::type_convert<T>(x_f32 * w_f32 * denom);
+            P rmsnorm_rslt;
+            P rmsnorm_inp;
+#pragma unroll
+            for(int i = 0; i < pack_size; ++i)
+            {
+                float x_f32          = rms_inp_f32[n_iter].data[i];
+                float w_f32          = ck_tile::type_convert<float>(w_arr[n_iter].data[i]);
+                rmsnorm_inp.data[i]  = ck_tile::type_convert<T>(x_f32);
+                rmsnorm_rslt.data[i] = ck_tile::type_convert<T>(x_f32 * w_f32 * denom);
+            }
+            int write_idx = bid * n_loop * blockDim.x + n_iter * blockDim.x + threadIdx.x;
+            *(reinterpret_cast<P*>(results) + write_idx)      = rmsnorm_rslt;
+            *(reinterpret_cast<P*>(residual_out) + write_idx) = rmsnorm_inp;
         }
-        int write_idx = bid * n_loop * blockDim.x + n_iter * blockDim.x + threadIdx.x;
-        *(reinterpret_cast<P*>(results) + write_idx) = rmsnorm_rslt;
-        *(reinterpret_cast<P*>(residual_out) + write_idx) = rmsnorm_inp;
-      }
-    }
-  }
-
-  /*
-   * block size can be 256 and 512
-   * corresponding 2048 and 4096 elem per block
-   * */
-  template <typename T, int tnum, int n_loop>
-  __global__ void __launch_bounds__(tnum, 1) local_device_load_rmsnorm(
-      RankSignals sg,
-      T* __restrict__ residual_inp,
-      T* __restrict__ residual_out,
-      T* __restrict__ results,
-      T* __restrict__ weight,
-      float eps,
-      int rank,
-      int m,
-      int n
-  )
-  {
+    }
+}
+
+/*
+ * block size can be 256 or 512,
+ * corresponding to 2048 or 4096 elements per block
+ * */
+template <typename T, int tnum, int n_loop>
+__global__ void __launch_bounds__(tnum, 1) local_device_load_rmsnorm(RankSignals sg,
+                                                                     T* __restrict__ residual_inp,
+                                                                     T* __restrict__ residual_out,
+                                                                     T* __restrict__ results,
+                                                                     T* __restrict__ weight,
+                                                                     float eps,
+                                                                     int rank,
+                                                                     int m,
+                                                                     int n)
+{
     constexpr int pack_size = packed_t<T>::P::size;
-    using P = typename packed_t<T>::P;
-    using A = typename packed_t<T>::A;
+    using P                 = typename packed_t<T>::P;
+    using A                 = typename packed_t<T>::A;
     __shared__ float smem[tnum];
     P* tmps = get_tmp_buf<P>(sg.signals[rank]);
 
-    for (int bid = blockIdx.x; bid < m; bid += gridDim.x)
+    for(int bid = blockIdx.x; bid < m; bid += gridDim.x) // each block handles whole rows
     {
-      float square_sum = 0.0f;
-      A rms_inp_f32[n_loop];
-      P w_arr[n_loop];
+        float square_sum = 0.0f;
+        A rms_inp_f32[n_loop];
+        P w_arr[n_loop];
 #pragma unroll
-      for (int n_iter = 0; n_iter < n_loop; ++n_iter)
-      {
-        if (n_iter * tnum + threadIdx.x < (n / pack_size))
+        for(int n_iter = 0; n_iter < n_loop; ++n_iter)
         {
-          int read_idx = bid * (n / pack_size) + n_iter * tnum + threadIdx.x;
-          P reduce_out_pack = tmps[read_idx];
-          P residual_inp_pack = *(reinterpret_cast<P*>(residual_inp) + read_idx);
-          w_arr[n_iter] = *(reinterpret_cast<P*>(weight) + n_iter * tnum + threadIdx.x);
-          A reduce_pack;
+            if(n_iter * tnum + threadIdx.x < (n / pack_size)) // skip packs past the end of the row
+            {
+                int read_idx        = bid * (n / pack_size) + n_iter * tnum + threadIdx.x;
+                P reduce_out_pack   = tmps[read_idx];
+                P residual_inp_pack = *(reinterpret_cast<P*>(residual_inp) + read_idx);
+                w_arr[n_iter]       = *(reinterpret_cast<P*>(weight) + n_iter * tnum + threadIdx.x);
+                A reduce_pack;
 #pragma unroll
-          for (int i = 0; i < pack_size; ++i)
-          {
-            float ar_out = ck_tile::type_convert<float>(reduce_out_pack.data[i]);
-            float res_inp = ck_tile::type_convert<float>(residual_inp_pack.data[i]);
-            float rms_inp = ar_out + res_inp;
-            rms_inp_f32[n_iter].data[i] = rms_inp;
-            reduce_pack.data[i] = rms_inp * rms_inp;
-          }
-          square_sum += packReduce<AddFunctor, float, pack_size>(reduce_pack);
+                for(int i = 0; i < pack_size; ++i)
+                {
+                    float ar_out  = ck_tile::type_convert<float>(reduce_out_pack.data[i]);
+                    float res_inp = ck_tile::type_convert<float>(residual_inp_pack.data[i]);
+                    float rms_inp = ar_out + res_inp;
+                    rms_inp_f32[n_iter].data[i] = rms_inp; // kept in fp32 for the second pass
+                    reduce_pack.data[i]         = rms_inp * rms_inp;
+                }
+                square_sum += packReduce<AddFunctor, float, pack_size>(reduce_pack);
+            }
         }
-      }
-      smem[threadIdx.x] = square_sum;
-      __syncthreads();
-      smemReduceSum<tnum>(&smem[0]);
-      square_sum = smem[0];
-      float denom = rsqrtf(square_sum / n + eps);
+        smem[threadIdx.x] = square_sum; // threads that skipped every pack contribute 0
+        __syncthreads();
+        smemReduceSum<tnum>(&smem[0]);
+        square_sum  = smem[0]; // NOTE(review): no barrier before the next row rewrites smem - confirm
+        float denom = rsqrtf(square_sum / n + eps); // 1 / sqrt(mean of squares + eps)
 #pragma unroll
-      for (int n_iter = 0; n_iter < n_loop; ++n_iter)
-      {
-        if (n_iter * tnum + threadIdx.x < (n / pack_size))
+        for(int n_iter = 0; n_iter < n_loop; ++n_iter)
         {
-          P rmsnorm_rslt;
-          P rmsnorm_inp;
+            if(n_iter * tnum + threadIdx.x < (n / pack_size)) // same guard as the load pass
+            {
+                P rmsnorm_rslt;
+                P rmsnorm_inp;
 #pragma unroll
-          for (int i = 0; i < pack_size; ++i)
-          {
-            float x_f32 = rms_inp_f32[n_iter].data[i];
-            float w_f32 = ck_tile::type_convert<float>(w_arr[n_iter].data[i]);
-            rmsnorm_inp.data[i] = ck_tile::type_convert<T>(x_f32);
-            rmsnorm_rslt.data[i] = ck_tile::type_convert<T>(x_f32 * w_f32 * denom);
-          }
-          int write_idx = bid * (n / pack_size) + n_iter * tnum + threadIdx.x;
-          *(reinterpret_cast<P*>(results) + write_idx) = rmsnorm_rslt;
-          *(reinterpret_cast<P*>(residual_out) + write_idx) = rmsnorm_inp;
+                for(int i = 0; i < pack_size; ++i)
+                {
+                    float x_f32          = rms_inp_f32[n_iter].data[i];
+                    float w_f32          = ck_tile::type_convert<float>(w_arr[n_iter].data[i]);
+                    rmsnorm_inp.data[i]  = ck_tile::type_convert<T>(x_f32);
+                    rmsnorm_rslt.data[i] = ck_tile::type_convert<T>(x_f32 * w_f32 * denom);
+                }
+                int write_idx = bid * (n / pack_size) + n_iter * tnum + threadIdx.x;
+                *(reinterpret_cast<P*>(results) + write_idx)      = rmsnorm_rslt;
+                *(reinterpret_cast<P*>(residual_out) + write_idx) = rmsnorm_inp;
+            }
         }
-      }
-    }
-  }
-
-  template <typename T, int n_loop>
-  __global__ void __launch_bounds__(256, 1) local_device_load_rmsnorm_512n(
-      RankSignals sg,
-      T* __restrict__ residual_inp,
-      T* __restrict__ residual_out,
-      T* __restrict__ results,
-      T* __restrict__ weight,
-      float eps,
-      int rank,
-      int m,
-      int n
-  )
-  {
+    }
+}
+
+template <typename T, int n_loop>
+__global__ void __launch_bounds__(256, 1)
+    local_device_load_rmsnorm_512n(RankSignals sg,
+                                   T* __restrict__ residual_inp,
+                                   T* __restrict__ residual_out,
+                                   T* __restrict__ results,
+                                   T* __restrict__ weight,
+                                   float eps,
+                                   int rank,
+                                   int m,
+                                   int n)
+{
     constexpr int pack_size = packed_t<T>::P::size;
-    using P = typename packed_t<T>::P;
-    using A = typename packed_t<T>::A;
-    P* tmps = get_tmp_buf<P>(sg.signals[rank]);
-    int warp_id = threadIdx.x / 64;
-    int lane_id = threadIdx.x % 64;
-    int warp_num = blockDim.x / 64;
-
-    for (int bid = blockIdx.x * warp_num + warp_id; bid < m; bid += gridDim.x * warp_num)
-    {
-      float square_sum = 0.0f;
-      A rms_inp_f32[n_loop];
-      P w_arr[n_loop];
-#pragma unroll
-      for (int n_iter = 0; n_iter < n_loop; ++n_iter)
-      {
-        int read_idx = bid * 64 * n_loop + n_iter * 64 + lane_id;
-        P reduce_out_pack = tmps[read_idx];
-        P residual_inp_pack = *(reinterpret_cast<P*>(residual_inp) + read_idx);
-        w_arr[n_iter] = *(reinterpret_cast<P*>(weight) + n_iter * 64 + lane_id);
-        A reduce_pack;
+    using P                 = typename packed_t<T>::P;
+    using A                 = typename packed_t<T>::A;
+    P* tmps                 = get_tmp_buf<P>(sg.signals[rank]);
+    int warp_id             = threadIdx.x / 64;
+    int lane_id             = threadIdx.x % 64;
+    int warp_num            = blockDim.x / 64;
+
+    for(int bid = blockIdx.x * warp_num + warp_id; bid < m; bid += gridDim.x * warp_num)
+    {
+        float square_sum = 0.0f;
+        A rms_inp_f32[n_loop];
+        P w_arr[n_loop];
 #pragma unroll
-        for (int i = 0; i < pack_size; ++i)
+        for(int n_iter = 0; n_iter < n_loop; ++n_iter)
         {
-          float ar_out = ck_tile::type_convert<float>(reduce_out_pack.data[i]);
-          float res_inp = ck_tile::type_convert<float>(residual_inp_pack.data[i]);
-          float rms_inp = ar_out + res_inp;
-          rms_inp_f32[n_iter].data[i] = rms_inp;
-          reduce_pack.data[i] = rms_inp * rms_inp;
-        }
-        float tmp_sum = packReduce<AddFunctor, float, pack_size>(reduce_pack);
-        square_sum += tmp_sum;
-      }
-      square_sum = warpReduce<AddFunctor, float, 64>(square_sum);
-      float denom = rsqrtf(square_sum / n + eps);
+            int read_idx        = bid * 64 * n_loop + n_iter * 64 + lane_id;
+            P reduce_out_pack   = tmps[read_idx];
+            P residual_inp_pack = *(reinterpret_cast<P*>(residual_inp) + read_idx);
+            w_arr[n_iter]       = *(reinterpret_cast<P*>(weight) + n_iter * 64 + lane_id);
+            A reduce_pack;
 #pragma unroll
-      for (int n_iter = 0; n_iter < n_loop; ++n_iter)
-      {
-        P rmsnorm_rslt;
-        P rmsnorm_inp;
+            for(int i = 0; i < pack_size; ++i)
+            {
+                float ar_out  = ck_tile::type_convert<float>(reduce_out_pack.data[i]);
+                float res_inp = ck_tile::type_convert<float>(residual_inp_pack.data[i]);
+                float rms_inp = ar_out + res_inp;
+                rms_inp_f32[n_iter].data[i] = rms_inp;
+                reduce_pack.data[i]         = rms_inp * rms_inp;
+            }
+            float tmp_sum = packReduce<AddFunctor, float, pack_size>(reduce_pack);
+            square_sum += tmp_sum;
+        }
+        square_sum  = warpReduce<AddFunctor, float, 64>(square_sum);
+        float denom = rsqrtf(square_sum / n + eps);
 #pragma unroll
-        for (int i = 0; i < pack_size; ++i)
+        for(int n_iter = 0; n_iter < n_loop; ++n_iter)
         {
-          float x_f32 = rms_inp_f32[n_iter].data[i];
-          float w_f32 = ck_tile::type_convert<float>(w_arr[n_iter].data[i]);
-          rmsnorm_inp.data[i] = ck_tile::type_convert<T>(x_f32);
-          rmsnorm_rslt.data[i] = ck_tile::type_convert<T>(x_f32 * w_f32 * denom);
+            P rmsnorm_rslt;
+            P rmsnorm_inp;
+#pragma unroll
+            for(int i = 0; i < pack_size; ++i)
+            {
+                float x_f32          = rms_inp_f32[n_iter].data[i];
+                float w_f32          = ck_tile::type_convert<float>(w_arr[n_iter].data[i]);
+                rmsnorm_inp.data[i]  = ck_tile::type_convert<T>(x_f32);
+                rmsnorm_rslt.data[i] = ck_tile::type_convert<T>(x_f32 * w_f32 * denom);
+            }
+            int write_idx = bid * 64 * n_loop + n_iter * 64 + lane_id;
+            *(reinterpret_cast<P*>(results) + write_idx)      = rmsnorm_rslt;
+            *(reinterpret_cast<P*>(residual_out) + write_idx) = rmsnorm_inp;
         }
-        int write_idx = bid * 64 * n_loop + n_iter * 64 + lane_id;
-        *(reinterpret_cast<P*>(results) + write_idx) = rmsnorm_rslt;
-        *(reinterpret_cast<P*>(residual_out) + write_idx) = rmsnorm_inp;
-      }
     }
-  }
+}
 
-  using IPC_KEY = std::array<uint8_t, sizeof(hipIpcMemHandle_t)>;
-  static_assert(sizeof(IPC_KEY) == sizeof(hipIpcMemHandle_t));
-  static_assert(alignof(IPC_KEY) == alignof(hipIpcMemHandle_t));
+using IPC_KEY = std::array<uint8_t, sizeof(hipIpcMemHandle_t)>;
+static_assert(sizeof(IPC_KEY) == sizeof(hipIpcMemHandle_t));
+static_assert(alignof(IPC_KEY) == alignof(hipIpcMemHandle_t));
 
-  class CustomAllreduce
-  {
-  public:
+class CustomAllreduce
+{
+    public:
     int rank_;
     int world_size_;
     bool full_nvlink_;
 
     // below are device pointers
     RankSignals sg_;
-    std::unordered_map<void *, RankData *> buffers_;
-    Signal *self_sg_;
+    std::unordered_map<void*, RankData*> buffers_;
+    Signal* self_sg_;
 
     // stores the registered device pointers from all ranks
     RankData *d_rank_data_base_, *d_rank_data_end_;
-    std::vector<void *> graph_unreg_buffers_;
+    std::vector<void*> graph_unreg_buffers_;
     // a map from IPC handles to opened IPC pointers
-    std::map<IPC_KEY, char *> ipc_handles_;
+    std::map<IPC_KEY, char*> ipc_handles_;
 
     /**
      * meta is a pointer to device metadata and temporary buffer for allreduce.
@@ -1260,137 +1242,135 @@ namespace aiter
      * note: this class does not own any device memory. Any required buffers
      * are passed in from the constructor
      */
-    CustomAllreduce(Signal *meta, void *rank_data, size_t rank_data_sz,
-                    const hipIpcMemHandle_t *handles,
-                    const std::vector<int64_t> &offsets, int rank,
+    CustomAllreduce(Signal* meta,
+                    void* rank_data,
+                    size_t rank_data_sz,
+                    const hipIpcMemHandle_t* handles,
+                    const std::vector<int64_t>& offsets,
+                    int rank,
                     bool fully_connected = true)
         : rank_(rank),
           world_size_(offsets.size()),
           full_nvlink_(fully_connected),
           self_sg_(meta),
-          d_rank_data_base_(reinterpret_cast<RankData *>(rank_data)),
+          d_rank_data_base_(reinterpret_cast<RankData*>(rank_data)),
           d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData))
     {
-      for (int i = 0; i < world_size_; i++)
-      {
-        Signal *rank_sg;
-        if (i != rank_)
+        for(int i = 0; i < world_size_; i++)
         {
-          char *handle = open_ipc_handle(&handles[i]);
-          handle += offsets[i];
-          rank_sg = (Signal *)handle;
+            Signal* rank_sg;
+            if(i != rank_)
+            {
+                char* handle = open_ipc_handle(&handles[i]);
+                handle += offsets[i];
+                rank_sg = (Signal*)handle;
+            }
+            else
+            {
+                rank_sg = self_sg_;
+            }
+            sg_.signals[i] = rank_sg;
         }
-        else
+    }
+
+    char* open_ipc_handle(const void* ipc_handle)
+    {
+        auto [it, new_handle] = ipc_handles_.insert({*((IPC_KEY*)ipc_handle), nullptr});
+        if(new_handle)
         {
-          rank_sg = self_sg_;
+            char* ipc_ptr;
+            HIP_CALL(hipIpcOpenMemHandle((void**)&ipc_ptr,
+                                         *((const hipIpcMemHandle_t*)ipc_handle),
+                                         hipIpcMemLazyEnablePeerAccess));
+            it->second = ipc_ptr;
         }
-        sg_.signals[i] = rank_sg;
-      }
-    }
-
-    char *open_ipc_handle(const void *ipc_handle)
-    {
-      auto [it, new_handle] =
-          ipc_handles_.insert({*((IPC_KEY *)ipc_handle), nullptr});
-      if (new_handle)
-      {
-        char *ipc_ptr;
-        HIP_CALL(hipIpcOpenMemHandle((void **)&ipc_ptr,
-                                       *((const hipIpcMemHandle_t *)ipc_handle),
-                                       hipIpcMemLazyEnablePeerAccess));
-        it->second = ipc_ptr;
-      }
-      return it->second;
-    }
-
-    std::pair<std::vector<uint8_t>, std::vector<int64_t>>
-    get_graph_buffer_ipc_meta()
-    {
-      auto num_buffers = graph_unreg_buffers_.size();
-      auto handle_sz = sizeof(hipIpcMemHandle_t);
-      std::vector<uint8_t> handles(handle_sz * num_buffers, 0);
-      std::vector<int64_t> offsets(num_buffers);
-      for (int i = 0; i < num_buffers; i++)
-      {
-        auto ptr = graph_unreg_buffers_[i];
-        void *base_ptr;
-        // note: must share the base address of each allocation, or we get wrong
-        // address
-        if (hipPointerGetAttribute(&base_ptr,
+        return it->second;
+    }
+
+    std::pair<std::vector<uint8_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta()
+    {
+        auto num_buffers = graph_unreg_buffers_.size();
+        auto handle_sz   = sizeof(hipIpcMemHandle_t);
+        std::vector<uint8_t> handles(handle_sz * num_buffers, 0);
+        std::vector<int64_t> offsets(num_buffers);
+        for(int i = 0; i < num_buffers; i++)
+        {
+            auto ptr = graph_unreg_buffers_[i];
+            void* base_ptr;
+            // note: the IPC handle must be created from the base address of the allocation
+            // (otherwise peers get a wrong address); ptr's offset goes into offsets[i]
+            if(hipPointerGetAttribute(&base_ptr,
 #ifdef USE_ROCM
-                                  HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR,
+                                      HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR,
 #else
-                                  CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
+                                      CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
 #endif
-                                  (hipDeviceptr_t)ptr) != CUDA_SUCCESS)
-          throw std::runtime_error("failed to get pointer attr");
-        HIP_CALL(hipIpcGetMemHandle(
-            (hipIpcMemHandle_t *)&handles[i * handle_sz], base_ptr));
-        offsets[i] = ((char *)ptr) - ((char *)base_ptr);
-      }
-      return std::make_pair(handles, offsets);
+                                      (hipDeviceptr_t)ptr) != CUDA_SUCCESS)
+                throw std::runtime_error("failed to get pointer attr");
+            HIP_CALL(hipIpcGetMemHandle((hipIpcMemHandle_t*)&handles[i * handle_sz], base_ptr));
+            offsets[i] = ((char*)ptr) - ((char*)base_ptr);
+        }
+        return std::make_pair(handles, offsets);
     }
 
     void check_rank_data_capacity(size_t num = 1)
     {
-      if (d_rank_data_base_ + num > d_rank_data_end_)
-        throw std::runtime_error(
-            "Rank data buffer is overflowed by " +
-            std::to_string(d_rank_data_base_ + num - d_rank_data_end_));
+        if(d_rank_data_base_ + num > d_rank_data_end_)
+            throw std::runtime_error("Rank data buffer is overflowed by " +
+                                     std::to_string(d_rank_data_base_ + num - d_rank_data_end_));
     }
 
-    void register_buffer(const std::vector<torch::Tensor> &handles,
-                         const std::vector<int64_t> &offsets, void *self)
+    void register_buffer(const std::vector<torch::Tensor>& handles,
+                         const std::vector<int64_t>& offsets,
+                         void* self)
     {
-      check_rank_data_capacity();
-      RankData data;
-      for (int i = 0; i < world_size_; i++)
-      {
-        if (i != rank_)
-        {
-          hipIpcMemHandle_t* ipc_handle_ptr = (hipIpcMemHandle_t*)handles[i].data_ptr();
-          char *handle = open_ipc_handle((void*)ipc_handle_ptr);
-          handle += offsets[i];
-          data.ptrs[i] = handle;
-        }
-        else
+        check_rank_data_capacity();
+        RankData data;
+        for(int i = 0; i < world_size_; i++)
         {
-          data.ptrs[i] = self;
+            if(i != rank_)
+            {
+                hipIpcMemHandle_t* ipc_handle_ptr = (hipIpcMemHandle_t*)handles[i].data_ptr();
+                char* handle                      = open_ipc_handle((void*)ipc_handle_ptr);
+                handle += offsets[i];
+                data.ptrs[i] = handle;
+            }
+            else
+            {
+                data.ptrs[i] = self;
+            }
         }
-      }
-      auto d_data = d_rank_data_base_++;
-      HIP_CALL(
-          hipMemcpy(d_data, &data, sizeof(RankData), hipMemcpyHostToDevice));
-      buffers_[self] = d_data;
-    }
-
-    RankData *get_buffer_RD(hipStream_t stream, void *input)
-    {
-      RankData *ptrs;
-      auto it = buffers_.find(input);
-      if (it != buffers_.end())
-      {
-        ptrs = it->second;
-      }
-      else
-      {
-        hipStreamCaptureStatus status;
-        HIP_CALL(hipStreamIsCapturing(stream, &status));
-        if (status == hipStreamCaptureStatusActive)
+        auto d_data = d_rank_data_base_++;
+        HIP_CALL(hipMemcpy(d_data, &data, sizeof(RankData), hipMemcpyHostToDevice));
+        buffers_[self] = d_data;
+    }
+
+    RankData* get_buffer_RD(hipStream_t stream, void* input)
+    {
+        RankData* ptrs;
+        auto it = buffers_.find(input);
+        if(it != buffers_.end())
         {
-          ptrs = d_rank_data_base_ + graph_unreg_buffers_.size();
-          graph_unreg_buffers_.push_back(input);
+            ptrs = it->second;
         }
         else
         {
-          throw std::runtime_error(
-              "buffer address " +
-              std::to_string(reinterpret_cast<uint64_t>(input)) +
-              " is not registered!");
+            hipStreamCaptureStatus status;
+            HIP_CALL(hipStreamIsCapturing(stream, &status));
+            if(status == hipStreamCaptureStatusActive)
+            {
+                ptrs = d_rank_data_base_ + graph_unreg_buffers_.size();
+                graph_unreg_buffers_.push_back(input);
+            }
+            else
+            {
+                throw std::runtime_error("buffer address " +
+                                         std::to_string(reinterpret_cast<uint64_t>(input)) +
+                                         " is not registered!");
+            }
         }
-      }
 
-      return ptrs;
+        return ptrs;
     }
 
     // note: when registering graph buffers, we intentionally choose to not
@@ -1400,37 +1380,38 @@ namespace aiter
     // rank 1 may get the same input address for the second allreduce, but rank 2
     // got a different address. IPC handles have internal reference counting
     // mechanism so overhead should be small.
-    void register_graph_buffers(
-        const std::vector<torch::Tensor> &handles,
-        const std::vector<torch::Tensor> &offsets)
-    {
-      auto num_buffers = graph_unreg_buffers_.size();
-      check_rank_data_capacity(num_buffers);
-      std::vector<RankData> rank_data(num_buffers);
-      for (int i = 0; i < num_buffers; i++)
-      {
-        auto self_ptr = graph_unreg_buffers_[i];
-        auto &rd = rank_data[i];
-        for (int j = 0; j < world_size_; j++)
+    void register_graph_buffers(const std::vector<torch::Tensor>& handles,
+                                const std::vector<torch::Tensor>& offsets)
+    {
+        auto num_buffers = graph_unreg_buffers_.size();
+        check_rank_data_capacity(num_buffers);
+        std::vector<RankData> rank_data(num_buffers);
+        for(int i = 0; i < num_buffers; i++)
         {
-          if (j != rank_)
-          {
-            hipIpcMemHandle_t* ipc_handle_ptr = (hipIpcMemHandle_t*)handles[j].data_ptr() + i;
-            char *handle = open_ipc_handle(ipc_handle_ptr);
-            handle += *((int64_t*)offsets[j].data_ptr() + i);
-            rd.ptrs[j] = handle;
-          }
-          else
-          {
-            rd.ptrs[j] = self_ptr;
-          }
+            auto self_ptr = graph_unreg_buffers_[i];
+            auto& rd      = rank_data[i];
+            for(int j = 0; j < world_size_; j++)
+            {
+                if(j != rank_)
+                {
+                    hipIpcMemHandle_t* ipc_handle_ptr =
+                        (hipIpcMemHandle_t*)handles[j].data_ptr() + i;
+                    char* handle = open_ipc_handle(ipc_handle_ptr);
+                    handle += *((int64_t*)offsets[j].data_ptr() + i);
+                    rd.ptrs[j] = handle;
+                }
+                else
+                {
+                    rd.ptrs[j] = self_ptr;
+                }
+            }
         }
-      }
-      HIP_CALL(hipMemcpy(d_rank_data_base_, rank_data.data(),
+        HIP_CALL(hipMemcpy(d_rank_data_base_,
+                           rank_data.data(),
                            sizeof(RankData) * num_buffers,
                            hipMemcpyHostToDevice));
-      d_rank_data_base_ += num_buffers;
-      graph_unreg_buffers_.clear();
+        d_rank_data_base_ += num_buffers;
+        graph_unreg_buffers_.clear();
     }
 
     /*
@@ -1443,58 +1424,58 @@ namespace aiter
     template <typename T>
     void runFp8QuantKernel(hipStream_t stream, T* input, T* output, int size)
     {
-      RankData *ptrs = get_buffer_RD(stream, input);
-      // 32 block 512 thread or 64 block 256 thread
-#define DISPATHC_UNIT(pack_size, quant_scale, ngpus)                                                                             \
-  do                                                                                                                             \
-  {                                                                                                                              \
-    case ngpus:                                                                                                                  \
-    {                                                                                                                            \
-      allReduceQuantFp8<T, quant_scale, pack_size, ngpus><<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, size); \
-      return ;                                                                                                                   \
-    }                                                                                                                            \
-  }while(0)
-
-#define DISPATCH_CALL(pack_size, block_size, quant_scale)                                \
-  do                                                                                     \
-  {                                                                                      \
-   block.x = block_size;                                                                 \
-    grid.x = min((16384 / block_size), (single_device_size / (pack_size * block_size))); \
-    size /= pack_size;                                                                   \
-    switch (world_size_)                                                                 \
-    {                                                                                    \
-      DISPATHC_UNIT(pack_size, quant_scale, 2);                                          \
-      DISPATHC_UNIT(pack_size, quant_scale, 4);                                          \
-      DISPATHC_UNIT(pack_size, quant_scale, 6);                                          \
-      DISPATHC_UNIT(pack_size, quant_scale, 8);                                          \
-    }                                                                                    \
-  } while(0)
-
-      int single_device_size = size / world_size_;
-      constexpr int max_thread_num = 512;
-      constexpr int max_pack_size = 8;
-      constexpr int max_elem_perblock = max_thread_num * max_pack_size;
-      dim3 grid, block;
-      if (single_device_size % 128 == 0)
-      {
-        DISPATCH_CALL(8, 256, 128);
-      }
-      else if (single_device_size % 64 == 0)
-      {
-        DISPATCH_CALL(8, 256, 64);
-      }
-      else if (single_device_size % 32 == 0)
-      {
-        DISPATCH_CALL(8, 256, 32);
-      }
-      else if (single_device_size % 16 == 0)
-      {
-        DISPATCH_CALL(8, 256, 16);
-      }
-      else // 512
-      {
-        DISPATCH_CALL(8, 256, 8);
-      }
+        RankData* ptrs = get_buffer_RD(stream, input);
+        // grid is capped at 16384 threads: 32 blocks of 512 threads or 64 blocks of 256 threads
+#define DISPATHC_UNIT(pack_size, quant_scale, ngpus)                                \
+    do                                                                              \
+    {                                                                               \
+    case ngpus: {                                                                   \
+        allReduceQuantFp8<T, quant_scale, pack_size, ngpus>                         \
+            <<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, size); \
+        return;                                                                     \
+    }                                                                               \
+    } while(0)
+
+#define DISPATCH_CALL(pack_size, block_size, quant_scale)                                     \
+    do                                                                                        \
+    {                                                                                         \
+        block.x = block_size;                                                                 \
+        grid.x  = min((16384 / block_size), (single_device_size / (pack_size * block_size))); \
+        size /= pack_size;                                                                    \
+        switch(world_size_)                                                                   \
+        {                                                                                     \
+            DISPATHC_UNIT(pack_size, quant_scale, 2);                                         \
+            DISPATHC_UNIT(pack_size, quant_scale, 4);                                         \
+            DISPATHC_UNIT(pack_size, quant_scale, 6);                                         \
+            DISPATHC_UNIT(pack_size, quant_scale, 8);                                         \
+        }                                                                                     \
+    } while(0)
+
+        int single_device_size          = size / world_size_;
+        constexpr int max_thread_num    = 512;
+        constexpr int max_pack_size     = 8;
+        constexpr int max_elem_perblock = max_thread_num * max_pack_size;
+        dim3 grid, block;
+        if(single_device_size % 128 == 0)
+        {
+            DISPATCH_CALL(8, 256, 128);
+        }
+        else if(single_device_size % 64 == 0)
+        {
+            DISPATCH_CALL(8, 256, 64);
+        }
+        else if(single_device_size % 32 == 0)
+        {
+            DISPATCH_CALL(8, 256, 32);
+        }
+        else if(single_device_size % 16 == 0)
+        {
+            DISPATCH_CALL(8, 256, 16);
+        }
+        else // 512
+        {
+            DISPATCH_CALL(8, 256, 8);
+        }
     }
 
     /**
@@ -1505,229 +1486,246 @@ namespace aiter
      * will cause contention on NVLink bus.
      */
     template <typename T>
-    void allreduce(hipStream_t stream, T *input, T *output, int size, bool use_new = true,
+    void allreduce(hipStream_t stream,
+                   T* input,
+                   T* output,
+                   int size,
+                   bool use_new = true,
 #ifndef USE_ROCM
-                   int threads = 512, int block_limit = 20){
+                   int threads     = 512,
+                   int block_limit = 20){
 #else
-                   int threads = 512, int block_limit = 16)
+                   int threads     = 512,
+                   int block_limit = 16)
     {
 #endif
         auto d = packed_t<T>::P::size;
-    if (size % d != 0)
-      throw std::runtime_error(
-          "custom allreduce currently requires input length to be multiple "
-          "of " +
-          std::to_string(d));
-    if (block_limit > kMaxBlocks)
-      throw std::runtime_error("max supported block limit is " +
-                               std::to_string(kMaxBlocks) + ". Got " +
-                               std::to_string(block_limit));
-
-    RankData *ptrs = get_buffer_RD(stream, input);
+    if(size % d != 0)
+        throw std::runtime_error("custom allreduce currently requires input length to be multiple "
+                                 "of " +
+                                 std::to_string(d));
+    if(block_limit > kMaxBlocks)
+        throw std::runtime_error("max supported block limit is " + std::to_string(kMaxBlocks) +
+                                 ". Got " + std::to_string(block_limit));
+
+    RankData* ptrs = get_buffer_RD(stream, input);
 
     auto bytes = size * sizeof(T);
     size /= d;
 
     // use new version of allreduce kernel
-    if (use_new)
-    {
-      int blocks = 16;
-      bool call_1stage = false;
-      bool call_2stage = false;
-      if (world_size_ == 2)
-      {
-        call_1stage = true;
-      }
-      else if (full_nvlink_)
-      {
-        if ((world_size_ <= 4 && bytes < 160 * 1024) || (world_size_ <= 8 && bytes < 80 * 1024))
+    if(use_new)
+    {
+        int blocks       = 16;
+        bool call_1stage = false;
+        bool call_2stage = false;
+        if(world_size_ == 2)
         {
-          call_1stage = true;
+            call_1stage = true;
         }
-        else
+        else if(full_nvlink_)
+        {
+            if((world_size_ <= 4 && bytes < 160 * 1024) || (world_size_ <= 8 && bytes < 80 * 1024))
+            {
+                call_1stage = true;
+            }
+            else
+            {
+                call_2stage = true;
+            }
+        }
+        if(call_1stage)
         {
-          call_2stage = true;
+            blocks = std::min(kMaxBlocks,
+                              (size + (threads / world_size_) - 1) / (threads / world_size_));
         }
-      }
-      if (call_1stage)
-      {
-        blocks = std::min(kMaxBlocks, (size + (threads / world_size_) - 1) / (threads / world_size_));
-      }
-      else if (call_2stage)
-      {
-        blocks = std::min(kMaxBlocks, (size / world_size_ + (threads / world_size_) - 1) / (threads / world_size_));
-      }
-
-#define KL(ngpus, name)                                                       \
-  name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
-                                                 rank_, size);
-
-#define dispatch(ngpus, name)                            \
-    do                                                   \
-    {                                                    \
-      if (bytes % (ngpus * 16) == 0 && world_size_ != 6) \
-      {                                                  \
-        KL(ngpus, name)                                  \
-      }                                                  \
-      else                                               \
-      {                                                  \
-        KL(ngpus, name##_naive)                          \
-      }                                                  \
+        else if(call_2stage)
+        {
+            blocks = std::min(kMaxBlocks,
+                              (size / world_size_ + (threads / world_size_) - 1) /
+                                  (threads / world_size_));
+        }
+
+#define KL(ngpus, name) \
+    name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, size);
+
+#define dispatch(ngpus, name)                             \
+    do                                                    \
+    {                                                     \
+        if(bytes % (ngpus * 16) == 0 && world_size_ != 6) \
+        {                                                 \
+            KL(ngpus, name)                               \
+        }                                                 \
+        else                                              \
+        {                                                 \
+            KL(ngpus, name##_naive)                       \
+        }                                                 \
     } while(0)
 
-#define REDUCE_CASE(ngpus)                         \
-  case ngpus:                                      \
-  {                                                \
-    if (call_1stage)                               \
-    {                                              \
-      KL(ngpus, cross_device_reduce_1stage);       \
-    }                                              \
-    else if (call_2stage)                          \
-    {                                              \
-      dispatch(ngpus, cross_device_reduce_2stage); \
-    }                                              \
-    break;                                         \
-  }
-
-      switch (world_size_)
-      {
-        REDUCE_CASE(2)
-        REDUCE_CASE(4)
-        REDUCE_CASE(6)
-        REDUCE_CASE(8)
-      default:
-        throw std::runtime_error(
-            "custom allreduce only supports num gpus in (2,4,6,8). Actual num "
-            "gpus = " +
-            std::to_string(world_size_));
-      }
+#define REDUCE_CASE(ngpus)                               \
+    case ngpus: {                                        \
+        if(call_1stage)                                  \
+        {                                                \
+            KL(ngpus, cross_device_reduce_1stage);       \
+        }                                                \
+        else if(call_2stage)                             \
+        {                                                \
+            dispatch(ngpus, cross_device_reduce_2stage); \
+        }                                                \
+        break;                                           \
+    }
+
+        switch(world_size_)
+        {
+            REDUCE_CASE(2)
+            REDUCE_CASE(4)
+            REDUCE_CASE(6)
+            REDUCE_CASE(8)
+        default:
+            throw std::runtime_error(
+                "custom allreduce only supports num gpus in (2,4,6,8). Actual num "
+                "gpus = " +
+                std::to_string(world_size_));
+        }
     }
     else // use vllm allreduce kernel
     {
-      int blocks = std::min(block_limit, (size + threads - 1) / threads);
-#define VLLM_REDUCE_CASE(ngpus)                            \
-  case ngpus:                                              \
-  {                                                        \
-    if (world_size_ == 2)                                  \
-    {                                                      \
-      KL(ngpus, cross_device_reduce_1stage);               \
-    }                                                      \
-    else if (full_nvlink_)                                 \
-    {                                                      \
-      if ((world_size_ <= 4 && bytes < 512 * 1024) ||      \
-          (world_size_ <= 8 && bytes < 256 * 1024))        \
-      {                                                    \
-        KL(ngpus, cross_device_reduce_1stage_naive);       \
-      }                                                    \
-      else                                                 \
-      {                                                    \
-        KL(ngpus, cross_device_reduce_2stage_naive);       \
-      }                                                    \
-    }                                                      \
-    break;                                                 \
-  }
-
-      switch (world_size_)
-      {
-        VLLM_REDUCE_CASE(2)
-        VLLM_REDUCE_CASE(4)
-        VLLM_REDUCE_CASE(6)
-        VLLM_REDUCE_CASE(8)
-      default:
-        throw std::runtime_error(
-            "custom allreduce only supports num gpus in (2,4,6,8). Actual num "
-            "gpus = " +
-            std::to_string(world_size_));
-      }
+        int blocks = std::min(block_limit, (size + threads - 1) / threads);
+#define VLLM_REDUCE_CASE(ngpus)                              \
+    case ngpus: {                                            \
+        if(world_size_ == 2)                                 \
+        {                                                    \
+            KL(ngpus, cross_device_reduce_1stage);           \
+        }                                                    \
+        else if(full_nvlink_)                                \
+        {                                                    \
+            if((world_size_ <= 4 && bytes < 512 * 1024) ||   \
+               (world_size_ <= 8 && bytes < 256 * 1024))     \
+            {                                                \
+                KL(ngpus, cross_device_reduce_1stage_naive); \
+            }                                                \
+            else                                             \
+            {                                                \
+                KL(ngpus, cross_device_reduce_2stage_naive); \
+            }                                                \
+        }                                                    \
+        break;                                               \
+    }
+
+        switch(world_size_)
+        {
+            VLLM_REDUCE_CASE(2)
+            VLLM_REDUCE_CASE(4)
+            VLLM_REDUCE_CASE(6)
+            VLLM_REDUCE_CASE(8)
+        default:
+            throw std::runtime_error(
+                "custom allreduce only supports num gpus in (2,4,6,8). Actual num "
+                "gpus = " +
+                std::to_string(world_size_));
+        }
     }
 #undef REDUCE_CASE
 #undef KL
-  }
+}
 
-  template <typename T>
-  void dispatchReduceScatter(hipStream_t stream, T* input, T* output, int size)
-  {
+template <typename T>
+void dispatchReduceScatter(hipStream_t stream, T* input, T* output, int size)
+{
     RankData* ptrs = get_buffer_RD(stream, input);
-    auto d = packed_t<T>::P::size;
-    int range = size / (world_size_ * d);
+    auto d         = packed_t<T>::P::size;
+    int range      = size / (world_size_ * d);
     dim3 block(512);
     int block_num = (range + 511) / 512;
     dim3 grid(std::min(16, block_num));
-    switch (world_size_)
+    switch(world_size_)
     {
-      case 8:
-        reduce_scatter_first_dim<T, 8><<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, range);
+    case 8:
+        reduce_scatter_first_dim<T, 8>
+            <<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, range);
         break;
-      case 4:
-        reduce_scatter_first_dim<T, 4><<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, range);
+    case 4:
+        reduce_scatter_first_dim<T, 4>
+            <<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, range);
         break;
-      case 2:
-        reduce_scatter_first_dim<T, 2><<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, range);
+    case 2:
+        reduce_scatter_first_dim<T, 2>
+            <<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, range);
         break;
-      default:
-        printf("reduce_scatter world_size error!\n");
+    default: printf("reduce_scatter world_size error!\n");
     }
-  }
+}
 
-  template <typename T>
-  void dispatchAllGather(hipStream_t stream, T* input, T* output, int size)
-  {
+template <typename T>
+void dispatchAllGather(hipStream_t stream, T* input, T* output, int size)
+{
     RankData* ptrs = get_buffer_RD(stream, input);
-    auto d = packed_t<T>::P::size;
+    auto d         = packed_t<T>::P::size;
     dim3 block(512);
-    if (size % d != 0)
+    if(size % d != 0)
     {
-      int block_num = (size + 512 - 1) / 512;
-      dim3 grid(std::min(block_num, 80));
-      switch (world_size_)
-      {
+        int block_num = (size + 512 - 1) / 512;
+        dim3 grid(std::min(block_num, 80));
+        switch(world_size_)
+        {
         case 8:
-          allgather_naive<T, 8><<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, size);
-          break;
+            allgather_naive<T, 8>
+                <<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, size);
+            break;
         case 4:
-          allgather_naive<T, 4><<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, size);
-          break;
+            allgather_naive<T, 4>
+                <<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, size);
+            break;
         case 2:
-          allgather_naive<T, 2><<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, size);
-          break;
-        default:
-          printf("allgather world_size error\n");
-      }
+            allgather_naive<T, 2>
+                <<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, size);
+            break;
+        default: printf("allgather world_size error\n");
+        }
     }
     else
     {
-      size /= d;
-      int tnum_per_block = 512 / world_size_;
-      int block_num = (size + tnum_per_block - 1) / tnum_per_block;
-      dim3 grid(std::min(block_num, 80));
-      switch (world_size_)
-      {
+        size /= d;
+        int tnum_per_block = 512 / world_size_;
+        int block_num      = (size + tnum_per_block - 1) / tnum_per_block;
+        dim3 grid(std::min(block_num, 80));
+        switch(world_size_)
+        {
         case 8:
-          allgather_vec<T, 8><<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, size);
-          break;
+            allgather_vec<T, 8>
+                <<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, size);
+            break;
         case 4:
-          allgather_vec<T, 4><<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, size);
-          break;
+            allgather_vec<T, 4>
+                <<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, size);
+            break;
         case 2:
-          allgather_vec<T, 2><<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, size);
-          break;
-        default:
-          printf("allgather world_size error\n");
-      }
+            allgather_vec<T, 2>
+                <<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, size);
+            break;
+        default: printf("allgather world_size error\n");
+        }
     }
-  }
-
-  template <typename T>
-  void dispatchFusedAllReduceRMSNorm(hipStream_t stream, T* input, T* residual_inp, T* residual_out, T* output, T* weight, float eps, int m, int n)
-  {
-    auto d = packed_t<T>::P::size;
+}
+
+template <typename T>
+void dispatchFusedAllReduceRMSNorm(hipStream_t stream,
+                                   T* input,
+                                   T* residual_inp,
+                                   T* residual_out,
+                                   T* output,
+                                   T* weight,
+                                   float eps,
+                                   int m,
+                                   int n)
+{
+    auto d   = packed_t<T>::P::size;
     int size = m * n;
-    if (size % d != 0)
+    if(size % d != 0)
     {
-      throw std::runtime_error(
-          "custom allreduce currently requires input length to be multiple "
-          "of " +
-          std::to_string(d));
+        throw std::runtime_error("custom allreduce currently requires input length to be multiple "
+                                 "of " +
+                                 std::to_string(d));
     }
     RankData* ptrs = get_buffer_RD(stream, input);
     hipDevice_t dev;
@@ -1740,130 +1738,126 @@ namespace aiter
     dim3 block(512);
     int block_num = ((size / world_size_) + 512 - 1) / 512;
     dim3 grid(std::min(block_num, 80));
-    switch (world_size_)
+    switch(world_size_)
     {
-      case 8:
-        reduce_scatter_cross_device_store<T, 8><<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, rank_, size);
+    case 8:
+        reduce_scatter_cross_device_store<T, 8>
+            <<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, rank_, size);
         break;
-      case 4:
-        reduce_scatter_cross_device_store<T, 4><<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, rank_, size);
+    case 4:
+        reduce_scatter_cross_device_store<T, 4>
+            <<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, rank_, size);
         break;
-      case 2:
-        reduce_scatter_cross_device_store<T, 2><<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, rank_, size);
+    case 2:
+        reduce_scatter_cross_device_store<T, 2>
+            <<<grid, block, 0, stream>>>(ptrs, sg_, self_sg_, rank_, size);
         break;
-      default:
-        printf("fused allreduce rmsnorm world size error\n");
+    default: printf("fused allreduce rmsnorm world size error\n");
     }
 
     // step 2, run allgather local device load + rmsnorm
-    int n_bytes = n * sizeof(T);
-    auto setGrid = [&](int naive_grid_size, const void* kernel_ptr)
-    {
-      int occupancy;
-      hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel_ptr, block.x, 0);
-      grid.x = naive_grid_size < num_cu * occupancy ? naive_grid_size : num_cu * occupancy;
+    int n_bytes  = n * sizeof(T);
+    auto setGrid = [&](int naive_grid_size, const void* kernel_ptr) {
+        int occupancy;
+        hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel_ptr, block.x, 0);
+        grid.x = naive_grid_size < num_cu * occupancy ? naive_grid_size : num_cu * occupancy;
     };
 
-#define launch_fused_allreduce_rmsnorm(template_kernel)                                                               \
-    do                                                                                                                \
-    {                                                                                                                 \
-      auto kernel_ptr = reinterpret_cast<const void*>(template_kernel);                                               \
-      setGrid(naive_grid_size, kernel_ptr);                                                                           \
-      template_kernel<<<grid, block, 0, stream>>>(sg_, residual_inp, residual_out, output, weight, eps, rank_, m, n); \
-    } while (0)
-
-    if (n_bytes % 1024 == 0)
-    {
-      if (8192 <= n_bytes && n_bytes <= 32768)
-      {
-        int naive_grid_size = m;
-        int n_loop = n_bytes / 8192; // 1, 2, 3, 4
-        if (n_bytes % 8192 == 0)
+#define launch_fused_allreduce_rmsnorm(template_kernel)                         \
+    do                                                                          \
+    {                                                                           \
+        auto kernel_ptr = reinterpret_cast<const void*>(template_kernel);       \
+        setGrid(naive_grid_size, kernel_ptr);                                   \
+        template_kernel<<<grid, block, 0, stream>>>(                            \
+            sg_, residual_inp, residual_out, output, weight, eps, rank_, m, n); \
+    } while(0)
+
+    if(n_bytes % 1024 == 0)
+    {
+        if(8192 <= n_bytes && n_bytes <= 32768)
         {
-          switch (n_loop)
-          {
-            case 1:
-              launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm_naive<T, 512, 1>));
-              break;
-            case 2:
-              launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm_naive<T, 512, 2>));
-              break;
-            case 3:
-              launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm_naive<T, 512, 3>));
-              break;
-            case 4:
-              launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm_naive<T, 512, 4>));
-              break;
-          }
+            int naive_grid_size = m;
+            int n_loop          = n_bytes / 8192; // 1, 2, 3, 4
+            if(n_bytes % 8192 == 0)
+            {
+                switch(n_loop)
+                {
+                case 1:
+                    launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm_naive<T, 512, 1>));
+                    break;
+                case 2:
+                    launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm_naive<T, 512, 2>));
+                    break;
+                case 3:
+                    launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm_naive<T, 512, 3>));
+                    break;
+                case 4:
+                    launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm_naive<T, 512, 4>));
+                    break;
+                }
+            }
+            else
+            {
+                n_loop += 1;
+                switch(n_loop)
+                {
+                case 2:
+                    launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm<T, 512, 2>));
+                    break;
+                case 3:
+                    launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm<T, 512, 3>));
+                    break;
+                case 4:
+                    launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm<T, 512, 4>));
+                    break;
+                }
+            }
         }
-        else
+        else if(4096 <= n_bytes && n_bytes < 8192)
         {
-          n_loop += 1;
-          switch (n_loop)
-          {
-            case 2:
-              launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm<T, 512, 2>));
-              break;
-            case 3:
-              launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm<T, 512, 3>));
-              break;
-            case 4:
-              launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm<T, 512, 4>));
-              break;
-          }
+            block.x             = 256;
+            int naive_grid_size = m;
+            if(n_bytes == 4096)
+            {
+                // naive n_loop = 1
+                launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm_naive<T, 256, 1>));
+            }
+            else
+            {
+                // n_loop = 2
+                launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm<T, 256, 2>));
+            }
         }
-      }
-      else if (4096 <= n_bytes && n_bytes < 8192)
-      {
-        block.x = 256;
-        int naive_grid_size = m;
-        if (n_bytes == 4096)
+        else if(1024 <= n_bytes && n_bytes < 4096)
         {
-          // naive n_loop = 1
-          launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm_naive<T, 256, 1>));
+            block.x             = 256;
+            int naive_grid_size = (m + 3) / 4;
+            int n_loop          = n_bytes / 1024;
+            switch(n_loop)
+            {
+            case 1: launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm_512n<T, 1>)); break;
+            case 2: launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm_512n<T, 2>)); break;
+            case 3: launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm_512n<T, 3>)); break;
+            }
         }
         else
         {
-          // n_loop = 2
-          launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm<T, 256, 2>));
+            printf("fused allreduce rmsnorm shape size error\n");
         }
-      }
-      else if (1024 <= n_bytes && n_bytes < 4096)
-      {
-        block.x = 256;
-        int naive_grid_size = (m + 3) / 4;
-        int n_loop = n_bytes / 1024;
-        switch (n_loop)
-        {
-          case 1:
-            launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm_512n<T, 1>));
-            break;
-          case 2:
-            launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm_512n<T, 2>));
-            break;
-          case 3:
-            launch_fused_allreduce_rmsnorm((local_device_load_rmsnorm_512n<T, 3>));
-            break;
-        }
-      }
-      else
-      {
-        printf("fused allreduce rmsnorm shape size error\n");
-      }
     }
     else
     {
-      printf("fused allreduce rmsnorm shape error\n");
+        printf("fused allreduce rmsnorm shape error\n");
     }
-  }
+}
 
-  ~CustomAllreduce()
-  {
-    for (auto [_, ptr] : ipc_handles_)
+~CustomAllreduce()
+{
+    for(auto [_, ptr] : ipc_handles_)
     {
-      HIP_CALL(hipIpcCloseMemHandle(ptr));
+        HIP_CALL(hipIpcCloseMemHandle(ptr));
     }
-  }
+}
 }; // namespace aiter
 /**
  * To inspect PTX/SASS, copy paste this header file to compiler explorer and add
diff --git a/csrc/include/custom_all_reduce.h b/csrc/include/custom_all_reduce.h
index c708b50b84..a5e9ff1cbd 100644
--- a/csrc/include/custom_all_reduce.h
+++ b/csrc/include/custom_all_reduce.h
@@ -35,22 +35,22 @@ void all_reduce(fptr_t _fa,
                 bool open_fp8_quant,
                 std::optional<torch::Tensor> reg_buffer);
 void reduce_scatter(fptr_t _fa,
-                torch::Tensor& inp,
-                torch::Tensor& out,
-                std::optional<torch::Tensor> reg_buffer);
+                    torch::Tensor& inp,
+                    torch::Tensor& out,
+                    std::optional<torch::Tensor> reg_buffer);
 void all_gather_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out);
 void all_gather_unreg(fptr_t _fa,
                       torch::Tensor& inp,
                       torch::Tensor& reg_buffer,
                       torch::Tensor& out);
 void fused_allreduce_rmsnorm(fptr_t _fa,
-                torch::Tensor& inp,
-                torch::Tensor& res_inp,
-                torch::Tensor& res_out,
-                torch::Tensor& out,
-                torch::Tensor& w,
-                float eps,
-                std::optional<torch::Tensor> reg_buffer);
+                             torch::Tensor& inp,
+                             torch::Tensor& res_inp,
+                             torch::Tensor& res_out,
+                             torch::Tensor& out,
+                             torch::Tensor& w,
+                             float eps,
+                             std::optional<torch::Tensor> reg_buffer);
 
 void dispose(fptr_t _fa);
 int64_t meta_size();
diff --git a/csrc/include/groupnorm.hpp b/csrc/include/groupnorm.hpp
index 2e25e50c88..648fcacada 100644
--- a/csrc/include/groupnorm.hpp
+++ b/csrc/include/groupnorm.hpp
@@ -5,26 +5,29 @@
 
 namespace rocm_torch_x {
 
-class __attribute__ ((visibility("hidden"))) GroupNorm final
+class __attribute__((visibility("hidden"))) GroupNorm final
 {
-public:
+    public:
     explicit GroupNorm() = default;
-    ~GroupNorm() = default;
-public:
+    ~GroupNorm()         = default;
+
+    public:
     // return empty if not supported
-    std::optional<torch::Tensor> Run(
-        torch::Tensor x,
-        int num_groups,
-        torch::Tensor weights,
-        torch::Tensor bias,
-        float epsilon);
-private:
-    template<typename T>
-    torch::Tensor launchGroupNormKernel(uint32_t num_groups, float epsilon,
-        const torch::Tensor x, const torch::Tensor weights, const torch::Tensor bias, hipStream_t stream);
+    std::optional<torch::Tensor>
+    Run(torch::Tensor x, int num_groups, torch::Tensor weights, torch::Tensor bias, float epsilon);
+
+    private:
+    template <typename T>
+    torch::Tensor launchGroupNormKernel(uint32_t num_groups,
+                                        float epsilon,
+                                        const torch::Tensor x,
+                                        const torch::Tensor weights,
+                                        const torch::Tensor bias,
+                                        hipStream_t stream);
 
     void reserveMeanAccumulator(uint32_t nums_to_reserve, torch::Device device);
-private:
+
+    private:
     torch::Tensor mean_accumulator_;
 };
 
diff --git a/csrc/include/mha_fwd.h b/csrc/include/mha_fwd.h
index 607c8a55ef..ae59f68493 100644
--- a/csrc/include/mha_fwd.h
+++ b/csrc/include/mha_fwd.h
@@ -46,6 +46,42 @@ struct mha_fwd_traits : public fmha_fwd_traits
     int how_v3_bf16_cvt;
 };
 
+struct mha_batch_prefill_traits : public fmha_batch_prefill_traits
+{
+    mha_batch_prefill_traits(int head_size_q,
+                             int head_size_v,
+                             std::string dtype,
+                             bool is_group_mode,
+                             bool has_logits_soft_cap,
+                             mask_enum mask_type,
+                             bias_enum bias_type,
+                             bool has_lse,
+                             bool has_dropout,
+                             quant_scale_enum qscale_type,
+                             bool skip_min_seqlen_q,
+                             ck_tile::BlockAttentionKVCacheMemoryLayoutEnum kv_memory_layout,
+                             ck_tile::BlockAttentionKVCacheLookupTableEnum kv_lookup_table,
+                             int page_size)
+        : fmha_batch_prefill_traits{head_size_q,
+                                    head_size_v,
+                                    dtype,
+                                    is_group_mode,
+                                    true, // is_v_rowmajor
+                                    has_logits_soft_cap,
+                                    mask_type,
+                                    bias_type,
+                                    has_lse,
+                                    has_dropout,
+                                    qscale_type,
+                                    skip_min_seqlen_q,
+                                    false, // has_sink
+                                    kv_memory_layout,
+                                    kv_lookup_table,
+                                    page_size}
+    {
+    }
+};
+
 struct mha_fwd_splitkv_traits : public fmha_fwd_splitkv_traits
 {
     mha_fwd_splitkv_traits(int head_size_q,
diff --git a/csrc/include/quick_all_reduce.cuh b/csrc/include/quick_all_reduce.cuh
index 6920befd59..9e21d354e9 100644
--- a/csrc/include/quick_all_reduce.cuh
+++ b/csrc/include/quick_all_reduce.cuh
@@ -1,6 +1,6 @@
 #pragma once
 // SPDX-License-Identifier: MIT
-// Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 #include "quick_all_reduce_base.h"
 #include <vector>
 #define caltime
@@ -601,7 +601,8 @@ struct AllReduceTwoshot
         Codec codec(thread, rank);
         int block_id = blockIdx.x;
         uint8_t* buffer_ptr[kWorldSize];
-        for (int i = 0; i < kWorldSize; ++i) {
+        for(int i = 0; i < kWorldSize; ++i)
+        {
             buffer_ptr[i] = buffer_list[i];
         }
         // --------------------------------------------------------
diff --git a/csrc/include/rocm_ops.hpp b/csrc/include/rocm_ops.hpp
index c4891ac875..8c1e28bbcb 100644
--- a/csrc/include/rocm_ops.hpp
+++ b/csrc/include/rocm_ops.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 
 #include <pybind11/pybind11.h>
@@ -1018,33 +1018,36 @@ namespace py = pybind11;
           py::arg("cu_seqlens_q_padded") = std::nullopt, \
           py::arg("cu_seqlens_k_padded") = std::nullopt);
 
-#define MHA_BATCH_PREFILL_PYBIND                  \
-    m.def("mha_batch_prefill",                    \
-          &aiter::torch_itfs::mha_batch_prefill,  \
-          py::arg("q"),                           \
-          py::arg("k"),                           \
-          py::arg("v"),                           \
-          py::arg("cu_seqlens_q"),                \
-          py::arg("kv_indptr"),                   \
-          py::arg("kv_page_indices"),             \
-          py::arg("max_seqlen_q"),                \
-          py::arg("max_seqlen_k"),                \
-          py::arg("dropout_p"),                   \
-          py::arg("softmax_scale"),               \
-          py::arg("logits_soft_cap"),             \
-          py::arg("zero_tensors"),                \
-          py::arg("is_causal"),                   \
-          py::arg("window_size_left"),            \
-          py::arg("window_size_right"),           \
-          py::arg("return_softmax_lse"),          \
-          py::arg("return_dropout_randval"),      \
-          py::arg("out")          = std::nullopt, \
-          py::arg("bias")         = std::nullopt, \
-          py::arg("alibi_slopes") = std::nullopt, \
-          py::arg("q_descale")    = std::nullopt, \
-          py::arg("k_descale")    = std::nullopt, \
-          py::arg("v_descale")    = std::nullopt, \
-          py::arg("gen")          = std::nullopt);
+#define MHA_BATCH_PREFILL_PYBIND                       \
+    m.def("mha_batch_prefill",                         \
+          &aiter::torch_itfs::mha_batch_prefill,       \
+          py::arg("q"),                                \
+          py::arg("k"),                                \
+          py::arg("v"),                                \
+          py::arg("cu_seqlens_q"),                     \
+          py::arg("kv_indptr"),                        \
+          py::arg("kv_page_indices"),                  \
+          py::arg("max_seqlen_q"),                     \
+          py::arg("max_seqlen_k"),                     \
+          py::arg("dropout_p"),                        \
+          py::arg("softmax_scale"),                    \
+          py::arg("logits_soft_cap"),                  \
+          py::arg("zero_tensors"),                     \
+          py::arg("is_causal"),                        \
+          py::arg("window_size_left"),                 \
+          py::arg("window_size_right"),                \
+          py::arg("return_softmax_lse"),               \
+          py::arg("return_dropout_randval"),           \
+          py::arg("out")               = std::nullopt, \
+          py::arg("bias")              = std::nullopt, \
+          py::arg("alibi_slopes")      = std::nullopt, \
+          py::arg("q_descale")         = std::nullopt, \
+          py::arg("k_descale")         = std::nullopt, \
+          py::arg("v_descale")         = std::nullopt, \
+          py::arg("kv_last_page_lens") = std::nullopt, \
+          py::arg("block_table")       = std::nullopt, \
+          py::arg("seqlen_k")          = std::nullopt, \
+          py::arg("gen")               = std::nullopt);
 
 #define MOE_OP_PYBIND                                                          \
     m.def("topk_softmax",                                                      \
diff --git a/csrc/include/torch/mha_batch_prefill.h b/csrc/include/torch/mha_batch_prefill.h
index 9f035e0175..b3e4031223 100644
--- a/csrc/include/torch/mha_batch_prefill.h
+++ b/csrc/include/torch/mha_batch_prefill.h
@@ -7,8 +7,8 @@ namespace aiter {
 namespace torch_itfs {
 std::vector<at::Tensor>
 mha_batch_prefill(at::Tensor& q,                  // [total_q, hq, d]
-                  const at::Tensor& k,            // [total_k, hk, d]
-                  const at::Tensor& v,            // [total_k, hk, d]
+                  const at::Tensor& k,            // [num_blocks, hk, d/8, block_size, 8]
+                  const at::Tensor& v,            // [num_blocks, hk, block_size/8, d, 8]
                   const at::Tensor& cu_seqlens_q, // [b+1]
                   const at::Tensor& kv_indptr,    // [b+1]
                   const at::Tensor& kv_page_indices,
@@ -29,6 +29,10 @@ mha_batch_prefill(at::Tensor& q,                  // [total_q, hq, d]
                   std::optional<const at::Tensor> q_descale,     // [1]
                   std::optional<const at::Tensor> k_descale,     // [1]
                   std::optional<const at::Tensor> v_descale,     // [1]
+                  std::optional<const at::Tensor> kv_last_page_lens,
+                  std::optional<const at::Tensor> block_table,
+                  std::optional<const at::Tensor> seqlen_k,
                   std::optional<at::Generator> gen_);
+
 } // namespace torch_itfs
 } // namespace aiter
diff --git a/csrc/kernels/mla/metadata/v1_0_device.cuh b/csrc/kernels/mla/metadata/v1_0_device.cuh
index 1c8f1e2f9b..3ca0ddb0ae 100644
--- a/csrc/kernels/mla/metadata/v1_0_device.cuh
+++ b/csrc/kernels/mla/metadata/v1_0_device.cuh
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2025-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -12,14 +12,16 @@ __device__ int32_t get_local_splits(int32_t seqlen_kv,
 #if defined(__gfx942__)
     return 16;
 #else
-    int32_t ex_splits = seqlen_kv / 196; // magic num 196. Experiments shows 196 per splits can get better performance.
-	return ck_tile::min(ck_tile::min(ex_splits, num_splits_per_cu), num_splits);
+    int32_t ex_splits =
+        seqlen_kv /
+        196; // magic number 196: experiments show 196 per split gives better performance.
+    return ck_tile::min(ck_tile::min(ex_splits, num_splits_per_cu), num_splits);
 #endif
 }
 
-template<bool DP_MODE=false>
+template <bool DP_MODE = false>
 __launch_bounds__(ck_tile::get_warp_size(), 1) __global__
-void kn_get_mla_metadata_v1_0(MlaMetadataV1KernelParameter params)
+    void kn_get_mla_metadata_v1_0(MlaMetadataV1KernelParameter params)
 {
     const int32_t lane_idx = ck_tile::get_lane_id();
 
@@ -35,9 +37,9 @@ void kn_get_mla_metadata_v1_0(MlaMetadataV1KernelParameter params)
             static_cast<uint64_t>(reinterpret_cast<uintptr_t>(p_work_info_set));
     }
     extern __shared__ uint8_t p_smem[];
-    int32_t* p_lds_shift = reinterpret_cast<int32_t*>(p_smem);
-    int32_t* p_lds_split = p_lds_shift + params.num_batches;
-    int32_t* p_lds_payload = p_lds_split  + params.num_batches;
+    int32_t* p_lds_shift     = reinterpret_cast<int32_t*>(p_smem);
+    int32_t* p_lds_split     = p_lds_shift + params.num_batches;
+    int32_t* p_lds_payload   = p_lds_split + params.num_batches;
     int32_t* p_lds_kv_seqlen = p_lds_payload + params.num_batches;
 
     int32_t num_splits_per_cu = (params.num_cu + params.num_batches - 1) / params.num_batches;
@@ -47,7 +49,7 @@ void kn_get_mla_metadata_v1_0(MlaMetadataV1KernelParameter params)
         const int32_t bid_ori = bid / params.qk_batch_ratio;
 
         const int32_t kv_begin = params.p_seqlens_kv_indptr[bid_ori];
-        int32_t kv_tail = [&](){
+        int32_t kv_tail        = [&]() {
             if constexpr(DP_MODE)
             {
                 return bid % params.ori_seqlen_qo - params.ori_seqlen_qo + 1;
@@ -63,41 +65,42 @@ void kn_get_mla_metadata_v1_0(MlaMetadataV1KernelParameter params)
 
         const int32_t num_blocks = integer_divide_ceil_power2(
             seqlen_kv, params.kv_granularity, params.kv_granularity_log2);
-        const int32_t num_splits = get_local_splits(seqlen_kv, params.num_splits, num_splits_per_cu);
+        const int32_t num_splits =
+            get_local_splits(seqlen_kv, params.num_splits, num_splits_per_cu);
         const int32_t payload = ck_tile::integer_divide_ceil(num_blocks, num_splits);
-        int32_t split_local = ck_tile::integer_divide_ceil(num_blocks, payload);
-        int32_t tail = seqlen_kv % (payload * params.kv_granularity);
-        if (tail <= 4 && tail != 0 && split_local > 1)
+        int32_t split_local   = ck_tile::integer_divide_ceil(num_blocks, payload);
+        int32_t tail          = seqlen_kv % (payload * params.kv_granularity);
+        if(tail <= 4 && tail != 0 && split_local > 1)
         {
             split_local--;
         }
-        p_lds_split[bid] = split_local;
-        p_lds_payload[bid] = payload;
+        p_lds_split[bid]             = split_local;
+        p_lds_payload[bid]           = payload;
         p_lds_kv_seqlen[bid_ori + 1] = kv_end;
     }
 
     __syncthreads();
-    if (lane_idx == 0)
+    if(lane_idx == 0)
     {
-        p_lds_shift[0] = 0;
+        p_lds_shift[0]     = 0;
         p_lds_kv_seqlen[0] = 0;
-        for (int32_t bid = 1; bid < params.num_batches; bid++)
+        for(int32_t bid = 1; bid < params.num_batches; bid++)
         {
             p_lds_shift[bid] = p_lds_shift[bid - 1] + p_lds_split[bid - 1];
         }
     }
     __syncthreads();
 
-    int32_t work_end = p_lds_shift[params.num_batches - 1] + p_lds_split[params.num_batches - 1];
+    int32_t work_end    = p_lds_shift[params.num_batches - 1] + p_lds_split[params.num_batches - 1];
     int32_t work_per_cu = work_end / params.num_cu;
-    int32_t work_res = work_end % params.num_cu;
+    int32_t work_res    = work_end % params.num_cu;
 
     for(int32_t bid = lane_idx; bid < params.num_batches; bid += ck_tile::get_warp_size())
     {
         const int32_t bid_ori = bid / params.qk_batch_ratio;
 
-        const int32_t kv_begin  = p_lds_kv_seqlen[bid_ori];
-        int32_t kv_tail  = [&](){
+        const int32_t kv_begin = p_lds_kv_seqlen[bid_ori];
+        int32_t kv_tail        = [&]() {
             if constexpr(DP_MODE)
             {
                 return bid % params.ori_seqlen_qo - params.ori_seqlen_qo + 1;
@@ -122,14 +125,15 @@ void kn_get_mla_metadata_v1_0(MlaMetadataV1KernelParameter params)
             work_info.qo_start       = bid * params.uni_seqlen_qo;
             work_info.qo_end         = work_info.qo_start + params.uni_seqlen_qo;
             work_info.kv_start       = kv_begin + (sid * payload * params.kv_granularity);
-            work_info.kv_end         = ck_tile::min(work_info.kv_start + payload * params.kv_granularity, kv_end);
-            work_info.kv_offset      = kv_end - work_info.kv_end;
-            if (work_info.kv_offset <= 4 && split_local > 1)
+            work_info.kv_end =
+                ck_tile::min(work_info.kv_start + payload * params.kv_granularity, kv_end);
+            work_info.kv_offset = kv_end - work_info.kv_end;
+            if(work_info.kv_offset <= 4 && split_local > 1)
             {
-                work_info.kv_end = kv_end;
+                work_info.kv_end    = kv_end;
                 work_info.kv_offset = 0;
             }
-            p_work_info_set[work_index] = work_info;
+            p_work_info_set[work_index]             = work_info;
             params.p_reduce_partial_map[work_index] = work_info.partial_qo_loc;
         }
 
@@ -139,19 +143,21 @@ void kn_get_mla_metadata_v1_0(MlaMetadataV1KernelParameter params)
     }
 
     int32_t reduce_end = params.p_reduce_indptr[params.num_batches];
-    for (int32_t work_id = lane_idx + 1; work_id < work_res; work_id += ck_tile::get_warp_size())
+    for(int32_t work_id = lane_idx + 1; work_id < work_res; work_id += ck_tile::get_warp_size())
     {
         params.p_work_indptr[work_id] = min(work_id * (work_per_cu + 1), work_end);
     }
 
     int32_t stage = work_res * (work_per_cu + 1);
 
-    for (int32_t work_id = work_res + lane_idx; work_id < params.num_cu + 1; work_id += ck_tile::get_warp_size())
+    for(int32_t work_id = work_res + lane_idx; work_id < params.num_cu + 1;
+        work_id += ck_tile::get_warp_size())
     {
         params.p_work_indptr[work_id] = stage + (work_id - work_res) * work_per_cu;
     }
 
-    for (int32_t reduce_id = params.num_batches + lane_idx; reduce_id <= params.fixed_num_batches; reduce_id += ck_tile::get_warp_size())
+    for(int32_t reduce_id = params.num_batches + lane_idx; reduce_id <= params.fixed_num_batches;
+        reduce_id += ck_tile::get_warp_size())
     {
         params.p_reduce_indptr[reduce_id] = reduce_end;
     }
@@ -205,7 +211,7 @@ void get_mla_metadata_v1_0_device(const torch::Tensor& seqlens_qo_indptr, // [ba
     if(num_heads == 128)
     {
         qk_batch_ratio = uni_seqlen_qo;
-        uni_seqlen_qo = 1;
+        uni_seqlen_qo  = 1;
         num_batches *= qk_batch_ratio;
     }
 
@@ -240,10 +246,14 @@ void get_mla_metadata_v1_0_device(const torch::Tensor& seqlens_qo_indptr, // [ba
     const dim3 grid = dim3(1, 1, 1);
     if(num_heads == 128 && q_dtype != at::ScalarType::BFloat16)
     {
-        kn_get_mla_metadata_v1_0<true><<<grid, dev_prop.warpSize, dev_prop.maxSharedMemoryPerMultiProcessor, stream>>>(params);
+        kn_get_mla_metadata_v1_0<true>
+            <<<grid, dev_prop.warpSize, dev_prop.maxSharedMemoryPerMultiProcessor, stream>>>(
+                params);
     }
     else
     {
-        kn_get_mla_metadata_v1_0<false><<<grid, dev_prop.warpSize, dev_prop.maxSharedMemoryPerMultiProcessor, stream>>>(params);
+        kn_get_mla_metadata_v1_0<false>
+            <<<grid, dev_prop.warpSize, dev_prop.maxSharedMemoryPerMultiProcessor, stream>>>(
+                params);
     }
 }
diff --git a/csrc/py_itfs_ck/mha_batch_prefill_kernels.cu b/csrc/py_itfs_ck/mha_batch_prefill_kernels.cu
index 68dc4d45f2..362e59ce2d 100644
--- a/csrc/py_itfs_ck/mha_batch_prefill_kernels.cu
+++ b/csrc/py_itfs_ck/mha_batch_prefill_kernels.cu
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "mha_common.h"
 #include "mha_fwd.h"
@@ -20,6 +20,9 @@ get_ck_fmha_batch_prefill_args(bool has_lse,
                                const int h_k,
                                const int d,
                                const int d_v,
+                               const int num_total_pages,
+                               const int page_block_size,
+                               ck_tile::BlockAttentionKVCacheMemoryLayoutEnum kv_memory_layout,
                                // device pointers
                                const at::Tensor q,
                                const at::Tensor k,
@@ -38,11 +41,10 @@ get_ck_fmha_batch_prefill_args(bool has_lse,
                                float softmax_scale,
                                float logits_soft_cap,
                                float p_dropout,
-                               std::pair<uint64_t*, uint64_t*> drop_seed_offset)
+                               std::pair<uint64_t*, uint64_t*> drop_seed_offset,
+                               std::optional<const at::Tensor>& kv_last_page_lens_)
 {
     // q: (total_q, nheads, d)
-    // k: (total_k, nheads_k, d)
-    // v: (total_k, nheads_k, d_v)
     // o: (total_q, nheads, d_v)
 
     // bias:(total_q, max_seqlen_k)
@@ -51,24 +53,212 @@ get_ck_fmha_batch_prefill_args(bool has_lse,
     // randval: (nheads, total_q, max_seqlen_k)
 
     ck_tile::index_t total_q = q.size(0);
-    ck_tile::index_t total_k = k.size(0);
+    ck_tile::index_t total_k = k.size(0) * page_block_size;
 
-    ck_tile::index_t stride_q       = q.stride(0);
-    ck_tile::index_t stride_k       = k.stride(0);
-    ck_tile::index_t stride_v       = v.stride(0);
-    ck_tile::index_t stride_o       = out.stride(0);
+    ck_tile::index_t stride_q       = q.stride(-3);
+    ck_tile::index_t stride_k;
+    ck_tile::index_t stride_v;
+
+    const int k_vector_size = 16 / static_cast<int>(k.element_size());
+
+    if(kv_memory_layout == ck_tile::BlockAttentionKVCacheMemoryLayoutEnum::VECTORIZED_LAYOUT)
+    {
+        TORCH_CHECK(
+            k.dim() == 5,
+            "K tensor must be 5D [NumBlocks, NumHeads, HeadDim/kVectorSize, PageSize, kVectorSize]");
+        TORCH_CHECK(
+            v.dim() == 5,
+            "V tensor must be 5D [NumBlocks, NumHeads, PageSize/kVectorSize, HeadDim, kVectorSize]");
+
+        // Vectorized layout strides
+        // K: [NumBlocks, NumHeads, HeadDim/kVectorSize, PageSize, kVectorSize] -> stride(-2) is the PageSize-axis stride
+        // V: [NumBlocks, NumHeads, PageSize/kVectorSize, HeadDim, kVectorSize] -> stride(-2) is the HeadDim-axis stride
+        stride_k = k.stride(-2);
+        stride_v = v.stride(-2);
+
+        const int64_t k_stride_batch = k.stride(0);
+        const int64_t k_stride_head  = k.stride(1);
+        const int64_t k_stride_dvec  = k.stride(2);
+        const int64_t k_stride_tok   = k.stride(3);
+        const int64_t k_stride_vec   = k.stride(4);
+
+        TORCH_CHECK(stride_k == k_vector_size,
+                    "stride_k (PageSize stride) must be ",
+                    k_vector_size,
+                    " in 5D vectorized layout");
+        TORCH_CHECK(stride_v == k_vector_size,
+                    "stride_v (HeadDim stride) must be ",
+                    k_vector_size,
+                    " in 5D vectorized layout");
+        TORCH_CHECK(k_stride_vec == 1 && k.size(-1) == k_vector_size,
+                    "K last dim must be ",
+                    k_vector_size,
+                    " and contiguous");
+        TORCH_CHECK(k_stride_tok == k_vector_size,
+                    "K page stride must be ",
+                    k_vector_size,
+                    " in 5D vectorized layout");
+        TORCH_CHECK(k_stride_dvec == static_cast<int64_t>(page_block_size) * k_vector_size,
+                    "K head-dim stride must be page_size * vector_size");
+        TORCH_CHECK(k_stride_head >= static_cast<int64_t>(d) * page_block_size,
+                    "K head stride must be >= head_dim * page_size");
+        TORCH_CHECK(k_stride_batch >= static_cast<int64_t>(h_k) * k_stride_head,
+                    "K batch stride must be >= num_heads * head_stride");
+        TORCH_CHECK(k_stride_head % k_vector_size == 0,
+                    "K head stride must be a multiple of vector size");
+        TORCH_CHECK(k_stride_batch % k_vector_size == 0,
+                    "K batch stride must be a multiple of vector size");
+
+        const int64_t v_stride_batch = v.stride(0);
+        const int64_t v_stride_head  = v.stride(1);
+        const int64_t v_stride_tok   = v.stride(2);
+        const int64_t v_stride_dim   = v.stride(3);
+        const int64_t v_stride_vec   = v.stride(4);
+
+        TORCH_CHECK(v_stride_vec == 1 && v.size(-1) == k_vector_size,
+                    "V last dim must be ",
+                    k_vector_size,
+                    " and contiguous");
+        TORCH_CHECK(v_stride_dim == k_vector_size,
+                    "V head-dim stride must be ",
+                    k_vector_size,
+                    " in 5D vectorized layout");
+        TORCH_CHECK(v_stride_tok == static_cast<int64_t>(d_v) * k_vector_size,
+                    "V page stride must be head_dim * vector_size");
+        TORCH_CHECK(v_stride_head >= static_cast<int64_t>(d_v) * page_block_size,
+                    "V head stride must be >= head_dim * page_size");
+        TORCH_CHECK(v_stride_batch >= static_cast<int64_t>(h_k) * v_stride_head,
+                    "V batch stride must be >= num_heads * head_stride");
+        TORCH_CHECK(v_stride_head % k_vector_size == 0,
+                    "V head stride must be a multiple of vector size");
+        TORCH_CHECK(v_stride_batch % k_vector_size == 0,
+                    "V batch stride must be a multiple of vector size");
+    }
+    else
+    {
+        if(k.dim() == 4)
+        {
+            TORCH_CHECK(v.dim() == 4,
+                        "V tensor must be 4D [NumBlocks, PageSize, NumHeads, HeadDim]");
+
+            // Linear layout strides
+            // K/V: [NumBlocks, PageSize, NumHeads, HeadDim] -> stride(1) is the PageSize-axis stride
+            stride_k = k.stride(1);
+            stride_v = v.stride(1);
+
+            const int64_t k_stride_batch = k.stride(0);
+            const int64_t k_stride_page  = k.stride(1);
+            const int64_t k_stride_head  = k.stride(2);
+            const int64_t k_stride_dim   = k.stride(3);
+
+            TORCH_CHECK(k_stride_dim == 1, "K last dim must be contiguous");
+            TORCH_CHECK(k_stride_head >= d, "K head stride must be >= head_dim");
+            TORCH_CHECK(k_stride_page >= static_cast<int64_t>(h_k) * k_stride_head,
+                        "K page stride must be >= num_heads * head_stride");
+            TORCH_CHECK(k_stride_batch >= static_cast<int64_t>(page_block_size) * k_stride_page,
+                        "K batch stride must be >= page_size * page_stride");
+            TORCH_CHECK(k_stride_head % k_vector_size == 0,
+                        "K head stride must be a multiple of vector size");
+            TORCH_CHECK(k_stride_page % k_vector_size == 0,
+                        "K page stride must be a multiple of vector size");
+            TORCH_CHECK(k_stride_batch % k_vector_size == 0,
+                        "K batch stride must be a multiple of vector size");
+
+            const int64_t v_stride_batch = v.stride(0);
+            const int64_t v_stride_page  = v.stride(1);
+            const int64_t v_stride_head  = v.stride(2);
+            const int64_t v_stride_dim   = v.stride(3);
+
+            TORCH_CHECK(v_stride_dim == 1, "V last dim must be contiguous");
+            TORCH_CHECK(v_stride_head >= d_v, "V head stride must be >= head_dim");
+            TORCH_CHECK(v_stride_page >= static_cast<int64_t>(h_k) * v_stride_head,
+                        "V page stride must be >= num_heads * head_stride");
+            TORCH_CHECK(v_stride_batch >= static_cast<int64_t>(page_block_size) * v_stride_page,
+                        "V batch stride must be >= page_size * page_stride");
+            TORCH_CHECK(v_stride_head % k_vector_size == 0,
+                        "V head stride must be a multiple of vector size");
+            TORCH_CHECK(v_stride_page % k_vector_size == 0,
+                        "V page stride must be a multiple of vector size");
+            TORCH_CHECK(v_stride_batch % k_vector_size == 0,
+                        "V batch stride must be a multiple of vector size");
+        }
+        else if(k.dim() == 3)
+        {
+            TORCH_CHECK(page_block_size == 1,
+                        "3D K/V tensors require page_block_size == 1");
+            TORCH_CHECK(v.dim() == 3,
+                        "V tensor must be 3D [NumBlocks, NumHeads, HeadDim]");
+
+            // Treat 3D K/V as PageSize=1 linear layout.
+            stride_k = k.stride(1);
+            stride_v = v.stride(1);
+
+            const int64_t k_stride_batch = k.stride(0);
+            const int64_t k_stride_head  = k.stride(1);
+            const int64_t k_stride_dim   = k.stride(2);
+
+            TORCH_CHECK(k_stride_dim == 1, "K last dim must be contiguous");
+            TORCH_CHECK(k_stride_head >= d, "K head stride must be >= head_dim");
+            TORCH_CHECK(k_stride_batch >= static_cast<int64_t>(h_k) * k_stride_head,
+                        "K batch stride must be >= num_heads * head_stride");
+            TORCH_CHECK(k_stride_head % k_vector_size == 0,
+                        "K head stride must be a multiple of vector size");
+            TORCH_CHECK(k_stride_batch % k_vector_size == 0,
+                        "K batch stride must be a multiple of vector size");
+
+            const int64_t v_stride_batch = v.stride(0);
+            const int64_t v_stride_head  = v.stride(1);
+            const int64_t v_stride_dim   = v.stride(2);
+
+            TORCH_CHECK(v_stride_dim == 1, "V last dim must be contiguous");
+            TORCH_CHECK(v_stride_head >= d_v, "V head stride must be >= head_dim");
+            TORCH_CHECK(v_stride_batch >= static_cast<int64_t>(h_k) * v_stride_head,
+                        "V batch stride must be >= num_heads * head_stride");
+            TORCH_CHECK(v_stride_head % k_vector_size == 0,
+                        "V head stride must be a multiple of vector size");
+            TORCH_CHECK(v_stride_batch % k_vector_size == 0,
+                        "V batch stride must be a multiple of vector size");
+        }
+        else
+        {
+            TORCH_CHECK(false,
+                        "K tensor must be 4D [NumBlocks, PageSize, NumHeads, HeadDim] or "
+                        "3D [NumBlocks, NumHeads, HeadDim] (page_block_size == 1)");
+        }
+    }
+
+    ck_tile::index_t stride_o       = out.stride(-3);
     ck_tile::index_t stride_randval = has_dropout_randval ? dropout_randval.stride(1) : 0;
 
-    ck_tile::index_t nhead_stride_q       = q.stride(1);
-    ck_tile::index_t nhead_stride_k       = k.stride(1);
-    ck_tile::index_t nhead_stride_v       = v.stride(1);
-    ck_tile::index_t nhead_stride_o       = out.stride(1);
+    ck_tile::index_t nhead_stride_q       = q.stride(-2);
+    const bool is_vectorized_layout =
+        kv_memory_layout == ck_tile::BlockAttentionKVCacheMemoryLayoutEnum::VECTORIZED_LAYOUT;
+    // NumHeads axis is index 1 for vectorized (5D) and 3D linear K/V, index 2 for 4D linear.
+    ck_tile::index_t nhead_stride_k;
+    ck_tile::index_t nhead_stride_v;
+    if(is_vectorized_layout)
+    {
+        nhead_stride_k = k.stride(1);
+        nhead_stride_v = v.stride(1);
+    }
+    else if(k.dim() == 3)
+    {
+        nhead_stride_k = k.stride(1);
+        nhead_stride_v = v.stride(1);
+    }
+    else
+    {
+        nhead_stride_k = k.stride(2);
+        nhead_stride_v = v.stride(2);
+    }
+    
+    ck_tile::index_t nhead_stride_o       = out.stride(-2);
     ck_tile::index_t nhead_stride_lse     = has_lse ? softmax_lse.stride(0) : 0;
     ck_tile::index_t nhead_stride_randval = has_dropout_randval ? dropout_randval.stride(0) : 0;
 
     ck_tile::index_t batch_stride_q       = 0;
-    ck_tile::index_t batch_stride_k       = 0;
-    ck_tile::index_t batch_stride_v       = 0;
+    ck_tile::index_t batch_stride_k       = k.stride(0);
+    ck_tile::index_t batch_stride_v       = v.stride(0);
     ck_tile::index_t batch_stride_o       = 0;
     ck_tile::index_t batch_stride_lse     = 0;
     ck_tile::index_t batch_stride_randval = 0;
@@ -100,33 +290,48 @@ get_ck_fmha_batch_prefill_args(bool has_lse,
         stride_bias = alibi_slopes.dim() == 2 ? alibi_slopes.stride(0) : 0;
     }
 
+    void* kv_last_page_lens_ptr = nullptr;
+    if(kv_last_page_lens_.has_value())
+    {
+        auto kv_last_page_lens = kv_last_page_lens_.value();
+        CHECK_DEVICE(kv_last_page_lens);
+        TORCH_CHECK(kv_last_page_lens.dim() == 1, "kv_last_page_lens must be 1d");
+        kv_last_page_lens_ptr = kv_last_page_lens.data_ptr();
+    }
+
     fmha_batch_prefill_args args;
 
-    args.q_ptr           = q.data_ptr();
-    args.k_ptr           = k.data_ptr();
-    args.v_ptr           = v.data_ptr();
-    args.bias_ptr        = bias_ptr;
-    args.q_descale_ptr   = q_descale.has_value() ? q_descale.value().data_ptr() : nullptr;
-    args.k_descale_ptr   = k_descale.has_value() ? k_descale.value().data_ptr() : nullptr;
-    args.v_descale_ptr   = v_descale.has_value() ? v_descale.value().data_ptr() : nullptr;
-    args.rand_val_ptr    = has_dropout_randval ? dropout_randval.data_ptr() : nullptr;
-    args.lse_ptr         = has_lse ? softmax_lse.data_ptr() : nullptr;
-    args.o_ptr           = out.data_ptr();
-    args.seqstart_q_ptr  = seqlens_q.data_ptr();
-    args.seqlen_q        = total_q;
-    args.seqlen_k        = total_k;
-    args.batch           = b;
-    args.max_seqlen_q    = max_seqlen_q;
-    args.hdim_q          = d;
-    args.hdim_v          = d_v;
-    args.nhead_q         = h;
-    args.nhead_k         = h_k;
-    args.num_total_pages = total_k;
-    args.kv_indptr       = kv_indptr.data_ptr();
-    args.kv_page_indices = kv_page_indices.data_ptr();
-    args.scale_s         = softmax_scale;
-    args.scale_p         = 1;
-    args.scale_o         = 1;
+    args.q_ptr             = q.data_ptr();
+    args.k_ptr             = k.data_ptr();
+    args.v_ptr             = v.data_ptr();
+    args.q_descale_ptr     = q_descale.has_value() ? q_descale.value().data_ptr() : nullptr;
+    args.k_descale_ptr     = k_descale.has_value() ? k_descale.value().data_ptr() : nullptr;
+    args.v_descale_ptr     = v_descale.has_value() ? v_descale.value().data_ptr() : nullptr;
+    args.bias_ptr          = bias_ptr;
+    args.rand_val_ptr      = has_dropout_randval ? dropout_randval.data_ptr() : nullptr;
+    args.lse_ptr           = has_lse ? softmax_lse.data_ptr() : nullptr;
+    args.o_ptr             = out.data_ptr();
+    args.seqstart_q_ptr    = seqlens_q.data_ptr();
+    args.seqlen_q          = total_q;
+    args.seqlen_k          = total_k;
+    args.batch             = b;
+    args.max_seqlen_q      = max_seqlen_q;
+    args.hdim_q            = d;
+    args.hdim_v            = d_v;
+    args.nhead_q           = h;
+    args.nhead_k           = h_k;
+    args.num_total_pages   = num_total_pages;
+    args.page_block_size   = page_block_size;
+    args.kv_memory_layout  = kv_memory_layout;
+    args.kv_lookup_table   = ck_tile::BlockAttentionKVCacheLookupTableEnum::SGLANG_PAGE_TABLE_1D;
+    args.kv_indptr         = kv_indptr.data_ptr();
+    args.kv_page_indices   = kv_page_indices.data_ptr();
+    args.kv_last_page_lens = kv_last_page_lens_ptr;
+    args.seqlen_k_ptr      = nullptr;
+    args.batch_stride_block_table = 0;
+    args.scale_s           = softmax_scale;
+    args.scale_p           = 1;
+    args.scale_o           = 1;
 
     args.logits_soft_cap = logits_soft_cap;
 
@@ -161,9 +366,9 @@ get_ck_fmha_batch_prefill_args(bool has_lse,
 }
 
 std::vector<at::Tensor>
-mha_batch_prefill(at::Tensor& q,                  // [total_q, hq, d]
-                  const at::Tensor& k,            // [total_k, hk, d]
-                  const at::Tensor& v,            // [total_k, hk, d]
+mha_batch_prefill(at::Tensor& q,       // [total_q, hq, d]
+                  const at::Tensor& k, // [num_blocks, hk, d/k_vector_size, block_size, k_vector_size]
+                  const at::Tensor& v, // [num_blocks, hk, block_size/k_vector_size, d, k_vector_size]
                   const at::Tensor& cu_seqlens_q, // [b+1]
                   const at::Tensor& kv_indptr,    // [b+1]
                   const at::Tensor& kv_page_indices,
@@ -184,7 +389,11 @@ mha_batch_prefill(at::Tensor& q,                  // [total_q, hq, d]
                   std::optional<const at::Tensor> q_descale,     // [1]
                   std::optional<const at::Tensor> k_descale,     // [1]
                   std::optional<const at::Tensor> v_descale,     // [1]
-                  std::optional<at::Generator> gen_)
+                  std::optional<const at::Tensor> kv_last_page_lens_,
+                  std::optional<const at::Tensor> block_table_,
+                  std::optional<const at::Tensor> seqlen_k_,
+                  std::optional<at::Generator> gen_
+                )
 {
     auto q_dtype = q.scalar_type();
     bool is_qkv_fp8 =
@@ -215,6 +424,8 @@ mha_batch_prefill(at::Tensor& q,                  // [total_q, hq, d]
     quant_scale_enum qscale_type =
         q_descale.has_value() ? quant_scale_enum::pertensor : quant_scale_enum::no_scale;
 
+    const int k_vector_size = 16 / static_cast<int>(q.element_size());
+
     CHECK_DEVICE(q);
     CHECK_DEVICE(k);
     CHECK_DEVICE(v);
@@ -238,10 +449,61 @@ mha_batch_prefill(at::Tensor& q,                  // [total_q, hq, d]
     const int batch_size  = cu_seqlens_q.numel() - 1;
     int num_heads         = sizes[1];
     const int head_size_q = sizes[2];
-    const int head_size_v = v.size(2);
-    const int num_heads_k = k.size(1);
+    
+    ck_tile::BlockAttentionKVCacheMemoryLayoutEnum kv_memory_layout;
+    int num_heads_k     = 0;
+    int page_block_size = 0;
+    int head_size_v     = 0;
+    int num_blocks      = 0;
+
+    if(k.dim() == 5)
+    {
+        kv_memory_layout = ck_tile::BlockAttentionKVCacheMemoryLayoutEnum::VECTORIZED_LAYOUT;
+        TORCH_CHECK(
+            v.dim() == 5,
+            "V tensor must be 5D [NumBlocks, NumHeads, PageSize/kVectorSize, HeadDim, kVectorSize]");
+
+        // K: [NumBlocks, NumHeads, HeadDim/kVectorSize, PageSize, kVectorSize]
+        num_heads_k     = k.size(1);
+        page_block_size = k.size(3);
+        TORCH_CHECK(page_block_size % k_vector_size == 0,
+                    "Vectorized KV requires page size divisible by ",
+                    k_vector_size);
+
+        // V: [NumBlocks, NumHeads, PageSize/kVectorSize, HeadDim, kVectorSize]
+        head_size_v = v.size(3);
+        num_blocks  = k.size(0);
+    }
+    else if(k.dim() == 4)
+    {
+        kv_memory_layout = ck_tile::BlockAttentionKVCacheMemoryLayoutEnum::LINEAR_LAYOUT;
+        TORCH_CHECK(v.dim() == 4,
+                    "V tensor must be 4D [NumBlocks, PageSize, NumHeads, HeadDim]");
+
+        // K/V: [NumBlocks, PageSize, NumHeads, HeadDim]
+        num_heads_k     = k.size(2);
+        page_block_size = k.size(1);
+        head_size_v     = v.size(3);
+        num_blocks      = k.size(0);
+    }
+    else if(k.dim() == 3)
+    {
+        kv_memory_layout = ck_tile::BlockAttentionKVCacheMemoryLayoutEnum::LINEAR_LAYOUT;
+        TORCH_CHECK(v.dim() == 3, "V tensor must be 3D [NumBlocks, NumHeads, HeadDim]");
+
+        // K/V: [NumBlocks, NumHeads, HeadDim] (PageSize=1)
+        num_heads_k     = k.size(1);
+        page_block_size = 1;
+        head_size_v     = v.size(2);
+        num_blocks      = k.size(0);
+    }
+    else
+    {
+        TORCH_CHECK(false,
+                    "K tensor must be 5D (vectorized), 4D (linear), or 3D (linear, page_size=1) "
+                    "for batch prefill");
+    }
 
-    const int num_blocks = k.size(0);
 
     if(max_seqlen_q == 1 && !alibi_slopes_.has_value())
     {
@@ -263,10 +525,12 @@ mha_batch_prefill(at::Tensor& q,                  // [total_q, hq, d]
     TORCH_CHECK(batch_size > 0, "batch size must be postive");
     TORCH_CHECK(head_size_q <= 256, "CK only supports head dimension at most 256");
     TORCH_CHECK(head_size_v <= 256, "CK only supports head dimension at most 256");
-    TORCH_CHECK(head_size_q % 8 == 0,
-                "query, key, value, and out_ must have a head_size that is a multiple of 8");
-    TORCH_CHECK(head_size_v % 8 == 0,
-                "query, key, value, and out_ must have a head_size that is a multiple of 8");
+    TORCH_CHECK(head_size_q % k_vector_size == 0,
+                "query, key, value, and out_ must have a head_size that is a multiple of ",
+                k_vector_size);
+    TORCH_CHECK(head_size_v % k_vector_size == 0,
+                "query, key, value, and out_ must have a head_size that is a multiple of ",
+                k_vector_size);
     TORCH_CHECK(num_heads % num_heads_k == 0,
                 "Number of heads in key/value must divide number of heads in query");
 
@@ -301,8 +565,45 @@ mha_batch_prefill(at::Tensor& q,                  // [total_q, hq, d]
     }
 
     CHECK_SHAPE(q, total_q, num_heads, head_size_q);
-    CHECK_SHAPE(k, num_blocks, num_heads_k, head_size_q);
-    CHECK_SHAPE(v, num_blocks, num_heads_k, head_size_v);
+    
+    if(kv_memory_layout == ck_tile::BlockAttentionKVCacheMemoryLayoutEnum::VECTORIZED_LAYOUT)
+    {
+        // K: [NumBlocks, NumHeads, HeadDim/k_vector_size, PageSize, k_vector_size]
+        CHECK_SHAPE(k,
+                    num_blocks,
+                    num_heads_k,
+                    head_size_q / k_vector_size,
+                    page_block_size,
+                    k_vector_size);
+        // V: [NumBlocks, NumHeads, PageSize/k_vector_size, HeadDim, k_vector_size]
+        CHECK_SHAPE(v,
+                    num_blocks,
+                    num_heads_k,
+                    page_block_size / k_vector_size,
+                    head_size_v,
+                    k_vector_size);
+    }
+    else
+    {
+        if(k.dim() == 3)
+        {
+            // K/V: [NumBlocks, NumHeads, HeadDim] (PageSize=1)
+            CHECK_SHAPE(k, num_blocks, num_heads_k, head_size_q);
+            CHECK_SHAPE(v, num_blocks, num_heads_k, head_size_v);
+        }
+        else
+        {
+            // K/V: [NumBlocks, PageSize, NumHeads, HeadDim]
+            CHECK_SHAPE(k, num_blocks, page_block_size, num_heads_k, head_size_q);
+            CHECK_SHAPE(v, num_blocks, page_block_size, num_heads_k, head_size_v);
+        }
+    }
+
+    if(page_block_size > 1 && !block_table_.has_value())
+    {
+        TORCH_CHECK(kv_last_page_lens_.has_value(),
+                    "if page_block_size > 1, must pass kv_last_page_lens to kernel");
+    }
 
     CHECK_SHAPE(cu_seqlens_q, batch_size + 1);
     CHECK_SHAPE(kv_indptr, batch_size + 1);
@@ -393,6 +694,9 @@ mha_batch_prefill(at::Tensor& q,                  // [total_q, hq, d]
                                                    num_heads_k,
                                                    head_size_q,
                                                    head_size_v,
+                                                   num_blocks,
+                                                   page_block_size,
+                                                   kv_memory_layout,
                                                    q,
                                                    k,
                                                    v,
@@ -410,7 +714,37 @@ mha_batch_prefill(at::Tensor& q,                  // [total_q, hq, d]
                                                    softmax_scale,
                                                    logits_soft_cap,
                                                    p_dropout,
-                                                   drop_seed_offset);
+                                                   drop_seed_offset,
+                                                   kv_last_page_lens_);
+
+        if(block_table_.has_value())
+        {
+            auto block_table = block_table_.value();
+            CHECK_DEVICE(block_table);
+            TORCH_CHECK(block_table.scalar_type() == at::kInt,
+                        "block_table must be int32");
+            TORCH_CHECK(block_table.dim() == 2, "block_table must be 2d");
+            TORCH_CHECK(block_table.size(0) == batch_size,
+                        "block_table first dim must match batch_size");
+            TORCH_CHECK(block_table.stride(-1) == 1,
+                        "block_table must have contiguous last dimension");
+            TORCH_CHECK(seqlen_k_.has_value(),
+                        "block_table requires seqlen_k for per-batch lengths");
+
+            auto seqlen_k = seqlen_k_.value();
+            CHECK_DEVICE(seqlen_k);
+            TORCH_CHECK(seqlen_k.scalar_type() == at::kInt,
+                        "seqlen_k must be int32");
+            TORCH_CHECK(seqlen_k.dim() == 1, "seqlen_k must be 1d");
+            TORCH_CHECK(seqlen_k.size(0) == batch_size,
+                        "seqlen_k must have shape [batch_size]");
+
+            args.kv_page_indices = block_table.data_ptr();
+            args.batch_stride_block_table = block_table.stride(0);
+            args.seqlen_k_ptr = seqlen_k.data_ptr();
+            args.kv_lookup_table =
+                ck_tile::BlockAttentionKVCacheLookupTableEnum::VLLM_BLOCK_TABLE_2D;
+        }
 
         float t = aiter::mha_batch_prefill(args,
                                            stream_config,
diff --git a/op_tests/op_benchmarks/triton/bench_deepgemm_attention.py b/op_tests/op_benchmarks/triton/bench_deepgemm_attention.py
index 7d2d80acee..bd4b21f85b 100644
--- a/op_tests/op_benchmarks/triton/bench_deepgemm_attention.py
+++ b/op_tests/op_benchmarks/triton/bench_deepgemm_attention.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import random
 import argparse
diff --git a/op_tests/op_benchmarks/triton/bench_la.py b/op_tests/op_benchmarks/triton/bench_la.py
index 92adff1d9a..de7dfe642f 100644
--- a/op_tests/op_benchmarks/triton/bench_la.py
+++ b/op_tests/op_benchmarks/triton/bench_la.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import sys
 import torch
diff --git a/op_tests/test_aiter_add.py b/op_tests/test_aiter_add.py
index fd3ebb1bf4..4175dfd868 100644
--- a/op_tests/test_aiter_add.py
+++ b/op_tests/test_aiter_add.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import aiter
diff --git a/op_tests/test_aiter_addInp.py b/op_tests/test_aiter_addInp.py
index e35737b8f3..c6922893eb 100644
--- a/op_tests/test_aiter_addInp.py
+++ b/op_tests/test_aiter_addInp.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import aiter
diff --git a/op_tests/test_aiter_sigmoid.py b/op_tests/test_aiter_sigmoid.py
index e015d6a688..b759848706 100644
--- a/op_tests/test_aiter_sigmoid.py
+++ b/op_tests/test_aiter_sigmoid.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import aiter
diff --git a/op_tests/test_batch_prefill.py b/op_tests/test_batch_prefill.py
index a13ae4f48f..c5b5b228e7 100644
--- a/op_tests/test_batch_prefill.py
+++ b/op_tests/test_batch_prefill.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import itertools
 import math
@@ -7,12 +7,178 @@
 import pytest
 import torch
 
+import pandas as pd
+
 import aiter
 from aiter import dtypes
 from aiter import per_tensor_quant
 from einops import rearrange, repeat
 import argparse
 
+from aiter.test_common import (
+    perftest,
+)
+
+
+def skip_test_if(condition: bool, reason: str) -> bool:
+    """
+    Report whether the current test case has to be skipped.
+
+    Behaves the same under pytest and under plain ``python`` execution:
+    - inside a pytest run: hands control to pytest.skip(reason)
+    - as a plain script: prints ``SKIP: <reason>`` and returns True
+
+    Usage:
+        if skip_test_if(causal and kv_len < qo_len, "reason"):
+            return
+
+    Returns:
+        False when ``condition`` is falsy; True when the caller must bail out.
+    """
+    if condition:
+        # pytest exports PYTEST_CURRENT_TEST only while a test is executing
+        # (merely importing pytest does not set it), so its presence tells
+        # us that pytest.skip() is the right way to stop this test.
+        if "PYTEST_CURRENT_TEST" in os.environ:
+            pytest.skip(reason)
+
+        print(f"SKIP: {reason}")
+        return True
+
+    return False
+
+
+def get_vector_size(dtype) -> int:
+    """Number of ``dtype`` elements that fit in one 16-byte vector."""
+    return 16 // torch.empty(0, dtype=dtype).element_size()
+
+
+def check_common_skip_conditions(
+    is_input_fp8: bool,
+    dtype,
+    causal: bool,
+    kv_len: int,
+    qo_len: int,
+    contiguous_kv: bool,
+) -> bool:
+    """
+    Evaluate the skip rules that are shared by the test functions.
+
+    Rules are checked in order; the first match is reported via skip_test_if().
+
+    Returns:
+        True if the caller should stop (a rule applied), otherwise False.
+    """
+    rules = (
+        (
+            is_input_fp8 and dtype != torch.bfloat16,
+            "FP8 tests use BF16 reference dtype only",
+        ),
+        (causal and kv_len < qo_len, "kv_len < qo_len is not allowed if causal=True"),
+        (
+            not contiguous_kv and is_input_fp8,
+            "Non-contiguous KV is only validated for non-FP8 path",
+        ),
+    )
+    for should_skip, why in rules:
+        if skip_test_if(should_skip, why):
+            return True
+    return False
+
+
+def check_layout_skip_conditions(
+    kvcache_layout: str,
+    head_dim: int,
+    page_size: int,
+    k_vector_size: int,
+    k_vector_size_fp8: int,
+    is_input_fp8: bool,
+    contiguous_kv: bool,
+) -> bool:
+    """
+    Evaluate the skip rules that depend on the KV-cache layout.
+
+    "vectorized" has its own rules; any other layout uses the linear rules.
+    Rules run in order and stop at the first one skip_test_if() reports.
+
+    Returns:
+        True if the caller should stop (a rule applied), otherwise False.
+    """
+    if kvcache_layout != "vectorized":
+        return skip_test_if(
+            head_dim % k_vector_size != 0,
+            "Linear layout requires head dim divisible by vector size",
+        ) or skip_test_if(
+            is_input_fp8 and head_dim % k_vector_size_fp8 != 0,
+            "FP8 linear layout requires head dim divisible by vector size",
+        )
+
+    # Vectorized layout: contiguous KV only, page and head dim vector-aligned.
+    return (
+        skip_test_if(
+            not contiguous_kv,
+            "Non-contiguous KV is only validated for linear layout",
+        )
+        or skip_test_if(
+            page_size % k_vector_size != 0 or head_dim % k_vector_size != 0,
+            "Vectorized layout requires page/head dim divisible by vector size",
+        )
+        or skip_test_if(
+            is_input_fp8
+            and (
+                page_size % k_vector_size_fp8 != 0 or head_dim % k_vector_size_fp8 != 0
+            ),
+            "FP8 vectorized layout requires page/head dim divisible by vector size",
+        )
+    )
+
+
+def get_tolerances(dtype, is_fp8: bool = False) -> tuple[float, float]:
+    """Pick the (rtol, atol) pair used when comparing against the reference."""
+    loose = (2e-2, 1e-2)
+    tight = (1e-3, 1e-3)
+    # Only a non-FP8 float16 run is held to the tight bounds; FP8 mode and
+    # every other dtype share the loose pair.
+    return tight if not is_fp8 and dtype == torch.float16 else loose
+
+
+def build_q_tensor_for_test(
+    qo_lens,
+    batch_size: int,
+    qo_len: int,
+    num_qo_heads: int,
+    head_dim: int,
+    dtype,
+    q_init_min: float,
+    q_init_max: float,
+    is_input_fp8: bool,
+):
+    """Create the query tensor for the FP8 or the regular test path."""
+    if not is_input_fp8:
+        padded_tokens = batch_size * qo_len
+        return build_q_tensor(
+            padded_tokens, num_qo_heads, head_dim, dtype, q_init_min, q_init_max
+        )
+    # FP8 path: torch.rand samples, one row per query token in qo_lens.
+    q_shape = (qo_lens.sum().item(), num_qo_heads, head_dim)
+    return torch.rand(q_shape, device="cuda", dtype=dtype)
+
+
+def extract_kv_caches(kv_cache: dict, contiguous_kv: bool):
+    """Return the (K, V) tensors held in ``kv_cache["kv_data"]``."""
+    kv_data = kv_cache["kv_data"]
+    # Non-contiguous case: index 0 / 1 of dim 1 is taken directly from kv_data.
+    return split_kv_pages(kv_data) if contiguous_kv else (kv_data[:, 0], kv_data[:, 1])
+
+
+def verify_fp8_output(out_fp8, o_ref, threshold: float = 0.055):
+    """Assert that the largest |out_fp8 - o_ref| element stays below threshold."""
+    max_diff = torch.max(torch.abs(out_fp8 - o_ref)).item()
+    details = f"{max_diff} (threshold: {threshold})"
+    # A NaN difference compares False below, so it fails the check as well.
+    too_large = f"FP8 kernel vs reference difference too large: {details}"
+    assert max_diff < threshold, too_large
+
 
 def construct_local_mask(
     seqlen_q,
@@ -96,183 +262,737 @@ def ref_masked_attention(
     return out.to(query)
 
 
+def make_scaled_rand(min_val, max_val, *shape, dtype, device="cuda"):
+    """Draw randn samples and min-max rescale them onto [min_val, max_val]."""
+    x = torch.randn(*shape, device=device, dtype=dtype)
+    return min_val + (max_val - min_val) * ((x - x.min()) / (x.max() - x.min()))
+
+
+def convert_lens_to_indptr(lens):
+    """Turn per-sequence lengths into int32 offsets with a leading 0."""
+    return torch.cat((torch.tensor([0]), lens)).cumsum(dim=0).int()
+
+
+def build_qo_lens(batch_size, qo_len, randomize=True):
+    """Query length per sequence: random in [1, qo_len], or qo_len for all."""
+    # A single sequence (or randomize=False) always gets the full qo_len.
+    if not (randomize and batch_size > 1):
+        return torch.full((batch_size,), qo_len).int()
+    return torch.randint(1, qo_len + 1, (batch_size,)).int()
+
+
+def build_kv_lens(batch_size, kv_len, qo_lens, randomize=True, ensure_at_least_q=True):
+    """KV length per sequence; random draws are raised to qo_lens on request."""
+    if not (randomize and batch_size > 1):
+        return torch.full((batch_size,), kv_len).int()
+    drawn = torch.randint(1, kv_len + 1, (batch_size,)).int()
+    # torch.maximum keeps every KV length at or above its query length.
+    return torch.maximum(qo_lens, drawn) if ensure_at_least_q else drawn
+
+
+def build_q_tensor(
+    total_q_tokens, num_qo_heads, head_dim, dtype, q_init_min, q_init_max
+):
+    """Scaled random Q of shape (tokens, heads, head_dim), moved to device 0."""
+    q_shape = (total_q_tokens, num_qo_heads, head_dim)
+    return make_scaled_rand(q_init_min, q_init_max, *q_shape, dtype=dtype).to(0)
+
+
+def build_paged_kv_cache(
+    batch_size,
+    kv_len,
+    page_size,
+    num_kv_heads,
+    head_dim,
+    kv_lens,
+    kv_init_min,
+    kv_init_max,
+    dtype,
+    use_uniform=False,
+    contiguous_kv=True,
+):
+    """
+    Create a random paged KV cache plus its page-table bookkeeping.
+
+    Pages have shape [total_pages, 2, page_size, num_kv_heads, head_dim]. With
+    contiguous_kv=False the buffer gets an extra size-2 dim before each
+    trailing dim and index 1 is taken from all of them, so the returned
+    tensors keep that shape but are strided views of a larger buffer.
+
+    Data comes from torch.rand when use_uniform is set (rescaled only if both
+    kv_init_* bounds are given), otherwise from make_scaled_rand().
+    """
+    max_num_pages_per_seq = (kv_len + page_size - 1) // page_size
+    total_num_pages = max_num_pages_per_seq * batch_size
+    kv_shape = [total_num_pages, 2, page_size, num_kv_heads, head_dim]
+    alloc_shape = kv_shape
+    if not contiguous_kv:
+        alloc_shape = kv_shape[:1] + [n for dim in kv_shape[1:] for n in (2, dim)]
+
+    if use_uniform:
+        kv_data_fp32 = torch.rand(*alloc_shape, device="cuda", dtype=torch.float32)
+        if kv_init_min is not None and kv_init_max is not None:
+            kv_data_fp32 = kv_init_min + (kv_init_max - kv_init_min) * kv_data_fp32
+    else:
+        kv_data_fp32 = make_scaled_rand(
+            kv_init_min, kv_init_max, *alloc_shape, dtype=torch.float32
+        ).to(0)
+    kv_data = kv_data_fp32.to(dtype)
+    if not contiguous_kv:
+        # Keep index 1 of every inserted size-2 dim -> non-contiguous views.
+        kv_data = kv_data[:, 1, :, 1, :, 1, :, 1, :]
+        kv_data_fp32 = kv_data_fp32[:, 1, :, 1, :, 1, :, 1, :]
+
+    used_pages = (kv_lens + page_size - 1) // page_size
+    # Shuffled page ids, followed by 128 zero entries of padding.
+    page_order = torch.randperm(total_num_pages).int()
+    return {
+        "kv_data_fp32": kv_data_fp32,
+        "kv_data": kv_data,
+        "kv_indptr_cpu": convert_lens_to_indptr(used_pages),
+        "kv_indices_cpu": torch.nn.functional.pad(page_order, (0, 128), value=0),
+        "kv_last_page_len_cpu": ((kv_lens - 1) % page_size + 1).int(),
+        "max_num_pages_per_seq": max_num_pages_per_seq,
+        "total_num_pages": total_num_pages,
+    }
+
+
+def split_kv_pages(kv_data):
+    """Split dim 1 of kv_data in two and return both halves as contiguous."""
+    halves = kv_data.chunk(2, dim=1)
+    # squeeze(1) drops dim 1 only when the split left it with size 1.
+    return halves[0].squeeze(1).contiguous(), halves[1].squeeze(1).contiguous()
+
+
+def apply_kv_layout(
+    k_cache_ref,
+    v_cache_ref,
+    num_kv_heads,
+    head_dim,
+    page_size,
+    k_vector_size,
+    layout,
+):
+    """
+    Return the (K, V) caches arranged for ``layout``.
+
+    "linear" yields contiguous versions of the inputs, "vectorized" defers
+    to vectorize_kv_cache(); any other value raises ValueError.
+    """
+    if layout == "linear":
+        return k_cache_ref.contiguous(), v_cache_ref.contiguous()
+    if layout != "vectorized":
+        raise ValueError(f"Unsupported KV layout: {layout}")
+    vectorize_args = (num_kv_heads, head_dim, page_size, k_vector_size)
+    return vectorize_kv_cache(k_cache_ref, v_cache_ref, *vectorize_args)
+
+
+def build_block_table(kv_indptr_cpu, kv_indices_cpu, batch_size, max_num_pages_per_seq):
+    """Lay the indptr/indices page lists out as a zero-padded int32 2-D table."""
+    table = torch.zeros((batch_size, max_num_pages_per_seq), dtype=torch.int32)
+    for row in range(batch_size):
+        first, last = kv_indptr_cpu[row].item(), kv_indptr_cpu[row + 1].item()
+        # The row's page ids fill the leading slots; slots past them keep
+        # the initial 0.
+        table[row, : last - first] = kv_indices_cpu[first:last]
+    return table
+
+
+def build_reference_output(
+    q,
+    q_indptr_cpu,
+    kv_data_fp32,
+    kv_indices_cpu,
+    kv_indptr_cpu,
+    kv_last_page_len_cpu,
+    num_kv_heads,
+    head_dim,
+    dtype,
+    causal,
+    logits_soft_cap,
+):
+    """
+    Compute the reference attention output, one sequence at a time.
+
+    For sequence i the K/V rows are gathered from kv_data_fp32 using the
+    page ids kv_indices_cpu[kv_indptr_cpu[i]:kv_indptr_cpu[i + 1]]; every
+    page but the last contributes all of its rows, the last page only its
+    first kv_last_page_len_cpu[i] rows. The rows are cast to ``dtype`` and
+    passed with the matching q slice to ref_masked_attention().
+
+    Returns:
+        The per-sequence outputs concatenated along dim 0.
+    """
+
+    def gather_rows(page_ids, last_len, kv_slot):
+        # kv_slot indexes dim 1 of kv_data_fp32: 0 is used for K, 1 for V.
+        whole = kv_data_fp32[page_ids[:-1], kv_slot]
+        partial = kv_data_fp32[page_ids[-1], kv_slot, :last_len, :]
+        rows = [t.reshape(-1, num_kv_heads, head_dim) for t in (whole, partial)]
+        return torch.cat(rows, dim=0).to(dtype)
+
+    per_seq_out = []
+    for i in range(len(q_indptr_cpu) - 1):
+        page_ids = kv_indices_cpu[kv_indptr_cpu[i] : kv_indptr_cpu[i + 1]]
+        qi = q[q_indptr_cpu[i] : q_indptr_cpu[i + 1]]
+        ki = gather_rows(page_ids, kv_last_page_len_cpu[i], 0)
+        vi = gather_rows(page_ids, kv_last_page_len_cpu[i], 1)
+        o_i = ref_masked_attention(
+            qi, ki, vi, causal=causal, logits_soft_cap=logits_soft_cap
+        )
+        per_seq_out.append(o_i)
+
+    return torch.cat(per_seq_out, dim=0)
+
+
+def assert_output_matches_reference(out, q_indptr_cpu, o_ref, rtol, atol):
+    """Run assert_close on out vs o_ref, one q_indptr_cpu segment at a time."""
+    for start, end in zip(q_indptr_cpu[:-1], q_indptr_cpu[1:]):
+        # Only rows [start, end) are compared, so rows of ``out`` beyond the
+        # last offset are never inspected.
+        got, want = out[start:end], o_ref[start:end]
+        torch.testing.assert_close(got, want, rtol=rtol, atol=atol)
+
+
+@pytest.mark.parametrize("input_dtype", ["bf16", "fp8"])
 @pytest.mark.parametrize("batch_size", [1, 3, 7])
 @pytest.mark.parametrize(
     "qo_len,kv_len",
     [
-        (128, 128),
         (1024, 1024),
         (1023, 1024),
         (1024, 1023),
         (2048, 2048),
     ],
 )
-@pytest.mark.parametrize("page_size", [1])
 @pytest.mark.parametrize("num_qo_heads,num_kv_heads", [(6, 1), (3, 1)])
 @pytest.mark.parametrize("head_dim", [128])
 @pytest.mark.parametrize("causal", [False, True])
-@pytest.mark.parametrize("kv_layout", ["NHD"])
 @pytest.mark.parametrize("logits_soft_cap", [0.0, 30.0])
-@pytest.mark.parametrize("contiguous_kv", [True, False])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("q_init_min,q_init_max", [(-10, 10)])
 @pytest.mark.parametrize("kv_init_min,kv_init_max", [(-5, 5)])
+@pytest.mark.parametrize("kv_dim", [4, 3])
+@pytest.mark.parametrize("contiguous_kv", [True, False])
 @pytest.mark.parametrize("seed", [19378])
-def test_batch_prefill_with_paged_kv_cache(
+def test_batch_prefill_page_size_1_linear_sglang(
+    input_dtype,
     batch_size,
     kv_len,
     qo_len,
-    page_size,
     num_qo_heads,
     num_kv_heads,
     head_dim,
     causal,
-    kv_layout,
     logits_soft_cap,
+    dtype,
+    q_init_min,
+    q_init_max,
+    kv_init_min,
+    kv_init_max,
+    kv_dim,
     contiguous_kv,
+    seed,
+):
+    if seed is not None:
+        torch.manual_seed(seed)
+
+    is_input_fp8 = input_dtype == dtypes.fp8 or input_dtype == "fp8"
+    k_vector_size = get_vector_size(dtype)
+    k_vector_size_fp8 = get_vector_size(dtypes.fp8)
+    page_size = 1
+
+    # Skip conditions
+    if check_common_skip_conditions(
+        is_input_fp8, dtype, causal, kv_len, qo_len, contiguous_kv
+    ):
+        return
+    if check_layout_skip_conditions(
+        "linear",
+        head_dim,
+        page_size,
+        k_vector_size,
+        k_vector_size_fp8,
+        is_input_fp8,
+        contiguous_kv,
+    ):
+        return
+
+    # Build test tensors
+    qo_lens = build_qo_lens(batch_size, qo_len, randomize=True)
+    q_indptr_cpu = convert_lens_to_indptr(qo_lens)
+    q = build_q_tensor_for_test(
+        qo_lens,
+        batch_size,
+        qo_len,
+        num_qo_heads,
+        head_dim,
+        dtype,
+        q_init_min,
+        q_init_max,
+        is_input_fp8,
+    )
+
+    kv_lens = build_kv_lens(batch_size, kv_len, qo_lens, randomize=True)
+    kv_cache = build_paged_kv_cache(
+        batch_size,
+        kv_len,
+        page_size,
+        num_kv_heads,
+        head_dim,
+        kv_lens,
+        None if is_input_fp8 else kv_init_min,
+        None if is_input_fp8 else kv_init_max,
+        dtype,
+        use_uniform=is_input_fp8,
+        contiguous_kv=contiguous_kv,
+    )
+
+    # Move to GPU
+    q_indptr_gpu = q_indptr_cpu.to(0)
+    kv_indptr_gpu = kv_cache["kv_indptr_cpu"].to(0)
+    kv_indices_gpu = kv_cache["kv_indices_cpu"].to(0)
+    kv_last_page_len_gpu = kv_cache["kv_last_page_len_cpu"].to(0)
+
+    k_cache_ref, v_cache_ref = extract_kv_caches(kv_cache, contiguous_kv)
+    max_qo_len = torch.max(qo_lens).item()
+    max_kv_len = torch.max(kv_lens).item()
+
+    # Build reference output (shared between FP8 and non-FP8)
+    o_ref = build_reference_output(
+        q,
+        q_indptr_cpu,
+        kv_cache["kv_data_fp32"],
+        kv_cache["kv_indices_cpu"],
+        kv_cache["kv_indptr_cpu"],
+        kv_cache["kv_last_page_len_cpu"],
+        num_kv_heads,
+        head_dim,
+        dtype,
+        causal,
+        logits_soft_cap,
+    )
+
+    if is_input_fp8:
+        q_quant, q_descale = per_tensor_quant(q, quant_dtype=dtypes.fp8)
+        k_cache_quant, k_descale = per_tensor_quant(
+            k_cache_ref.to(dtype), quant_dtype=dtypes.fp8
+        )
+        v_cache_quant, v_descale = per_tensor_quant(
+            v_cache_ref.to(dtype), quant_dtype=dtypes.fp8
+        )
+
+        # Apply layout based on kv_dim
+        if kv_dim == 3:
+            k_cache_fp8 = k_cache_quant.squeeze(1).contiguous()
+            v_cache_fp8 = v_cache_quant.squeeze(1).contiguous()
+            k_cache_ref_layout = k_cache_ref.squeeze(1).contiguous()
+            v_cache_ref_layout = v_cache_ref.squeeze(1).contiguous()
+        else:
+            k_cache_fp8, v_cache_fp8 = apply_kv_layout(
+                k_cache_quant,
+                v_cache_quant,
+                num_kv_heads,
+                head_dim,
+                page_size,
+                k_vector_size_fp8,
+                "linear",
+            )
+            k_cache_ref_layout, v_cache_ref_layout = apply_kv_layout(
+                k_cache_ref.to(dtype),
+                v_cache_ref.to(dtype),
+                num_kv_heads,
+                head_dim,
+                page_size,
+                k_vector_size,
+                "linear",
+            )
+
+        out_fp8 = aiter.mha_batch_prefill_func(
+            q_quant,
+            k_cache_fp8,
+            v_cache_fp8,
+            q_indptr_gpu,
+            kv_indptr_gpu,
+            kv_indices_gpu,
+            max_qo_len,
+            max_kv_len,
+            causal=causal,
+            logits_soft_cap=logits_soft_cap,
+            q_descale=q_descale,
+            k_descale=k_descale,
+            v_descale=v_descale,
+            kv_last_page_lens=kv_last_page_len_gpu,
+        )
+
+        out_ref = aiter.mha_batch_prefill_func(
+            q,
+            k_cache_ref_layout,
+            v_cache_ref_layout,
+            q_indptr_gpu,
+            kv_indptr_gpu,
+            kv_indices_gpu,
+            max_qo_len,
+            max_kv_len,
+            causal=causal,
+            logits_soft_cap=logits_soft_cap,
+            kv_last_page_lens=kv_last_page_len_gpu,
+        )
+
+        verify_fp8_output(out_fp8, o_ref)
+        rtol, atol = get_tolerances(dtype, is_fp8=True)
+        torch.testing.assert_close(out_ref, o_ref, rtol=rtol, atol=atol)
+    else:
+        # Prepare KV cache based on kv_dim and contiguity
+        if kv_dim == 3:
+            k_cache = k_cache_ref.squeeze(1)
+            v_cache = v_cache_ref.squeeze(1)
+            if contiguous_kv:
+                k_cache = k_cache.contiguous()
+                v_cache = v_cache.contiguous()
+        elif contiguous_kv:
+            k_cache, v_cache = apply_kv_layout(
+                k_cache_ref,
+                v_cache_ref,
+                num_kv_heads,
+                head_dim,
+                page_size,
+                k_vector_size,
+                "linear",
+            )
+        else:
+            k_cache, v_cache = k_cache_ref, v_cache_ref
+
+        # Verify contiguity expectations
+        assert k_cache.is_contiguous() == contiguous_kv
+        assert v_cache.is_contiguous() == contiguous_kv
+
+        out = aiter.mha_batch_prefill_func(
+            q,
+            k_cache,
+            v_cache,
+            q_indptr_gpu,
+            kv_indptr_gpu,
+            kv_indices_gpu,
+            max_qo_len,
+            max_kv_len,
+            causal=causal,
+            logits_soft_cap=logits_soft_cap,
+            kv_last_page_lens=kv_last_page_len_gpu,
+        )
+        rtol, atol = get_tolerances(dtype)
+        assert_output_matches_reference(out, q_indptr_cpu, o_ref, rtol, atol)
+
+
+@pytest.mark.parametrize("kvcache_layout", ["linear", "vectorized"])
+@pytest.mark.parametrize("table_layout", ["sglang", "vllm"])
+@pytest.mark.parametrize("input_dtype", ["bf16", "fp8"])
+@pytest.mark.parametrize("batch_size", [1, 3, 7])
+@pytest.mark.parametrize(
+    "qo_len,kv_len",
+    [
+        (128, 128),
+        (1024, 1024),
+        (1023, 1024),
+        (1024, 1023),
+        (2048, 2048),
+        (8192, 8192),
+    ],
+)
+@pytest.mark.parametrize("page_size", [128, 256, 1024])
+@pytest.mark.parametrize("num_qo_heads,num_kv_heads", [(8, 1), (16, 1)])
+@pytest.mark.parametrize("head_dim", [128])
+@pytest.mark.parametrize("causal", [False, True])
+@pytest.mark.parametrize("logits_soft_cap", [0.0, 30.0])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("q_init_min,q_init_max", [(-10, 10)])
+@pytest.mark.parametrize("kv_init_min,kv_init_max", [(-5, 5)])
+@pytest.mark.parametrize("contiguous_kv", [True, False])
+@pytest.mark.parametrize("seed", [19378])
+def test_batch_prefill(
+    kvcache_layout,
+    table_layout,
+    input_dtype,
+    batch_size,
+    qo_len,
+    kv_len,
+    page_size,
+    num_qo_heads,
+    num_kv_heads,
+    head_dim,
+    causal,
+    logits_soft_cap,
     dtype,
     q_init_min,
     q_init_max,
     kv_init_min,
     kv_init_max,
+    contiguous_kv,
     seed,
+    profile=False,
 ):
     if seed is not None:
         torch.manual_seed(seed)
 
-    if causal and kv_len < qo_len:
-        pytest.skip("kv_len < qo_len is not allowed if causal=True")
+    is_input_fp8 = input_dtype == dtypes.fp8 or input_dtype == "fp8"
+    k_vector_size = get_vector_size(dtype)
+    k_vector_size_fp8 = get_vector_size(dtypes.fp8)
+
+    # Skip conditions
+    if check_common_skip_conditions(
+        is_input_fp8, dtype, causal, kv_len, qo_len, contiguous_kv
+    ):
+        return {"status": "skipped"}
+    if check_layout_skip_conditions(
+        kvcache_layout,
+        head_dim,
+        page_size,
+        k_vector_size,
+        k_vector_size_fp8,
+        is_input_fp8,
+        contiguous_kv,
+    ):
+        return {"status": "skipped"}
+
+    # Build test tensors
+    qo_lens = build_qo_lens(batch_size, qo_len, randomize=True)
+    q_indptr_cpu = convert_lens_to_indptr(qo_lens)
+    q = build_q_tensor_for_test(
+        qo_lens,
+        batch_size,
+        qo_len,
+        num_qo_heads,
+        head_dim,
+        dtype,
+        q_init_min,
+        q_init_max,
+        is_input_fp8,
+    )
+
+    kv_lens = build_kv_lens(batch_size, kv_len, qo_lens, randomize=True)
+    kv_cache = build_paged_kv_cache(
+        batch_size,
+        kv_len,
+        page_size,
+        num_kv_heads,
+        head_dim,
+        kv_lens,
+        None if is_input_fp8 else kv_init_min,
+        None if is_input_fp8 else kv_init_max,
+        dtype,
+        use_uniform=is_input_fp8,
+        contiguous_kv=contiguous_kv,
+    )
+
+    # Move to GPU
+    q_indptr_gpu = q_indptr_cpu.to(0)
+    kv_indptr_gpu = kv_cache["kv_indptr_cpu"].to(0)
+    kv_indices_gpu = kv_cache["kv_indices_cpu"].to(0)
+    kv_last_page_len_gpu = kv_cache["kv_last_page_len_cpu"].to(0)
+
+    k_cache_ref, v_cache_ref = extract_kv_caches(kv_cache, contiguous_kv)
+    max_qo_len = torch.max(qo_lens).item()
+    max_kv_len = torch.max(kv_lens).item()
+
+    # Build vLLM-style block table if needed
+    block_table_gpu = None
+    seqlen_k_gpu = None
+    if table_layout == "vllm":
+        block_table_cpu = build_block_table(
+            kv_cache["kv_indptr_cpu"],
+            kv_cache["kv_indices_cpu"],
+            batch_size,
+            kv_cache["max_num_pages_per_seq"],
+        )
+        block_table_gpu = block_table_cpu.to(0)
+        seqlen_k_gpu = kv_lens.to(0).int()
+
+    # Build reference output (shared between FP8 and non-FP8)
+    o_ref = build_reference_output(
+        q,
+        q_indptr_cpu,
+        kv_cache["kv_data_fp32"],
+        kv_cache["kv_indices_cpu"],
+        kv_cache["kv_indptr_cpu"],
+        kv_cache["kv_last_page_len_cpu"],
+        num_kv_heads,
+        head_dim,
+        dtype,
+        causal,
+        logits_soft_cap,
+    )
+
+    profile_result = {"status": "passed"}
+
+    if is_input_fp8:
+        q_quant, q_descale = per_tensor_quant(q, quant_dtype=dtypes.fp8)
+        k_cache_quant, k_descale = per_tensor_quant(
+            k_cache_ref.to(dtype), quant_dtype=dtypes.fp8
+        )
+        v_cache_quant, v_descale = per_tensor_quant(
+            v_cache_ref.to(dtype), quant_dtype=dtypes.fp8
+        )
+        k_cache_quant, v_cache_quant = apply_kv_layout(
+            k_cache_quant,
+            v_cache_quant,
+            num_kv_heads,
+            head_dim,
+            page_size,
+            k_vector_size_fp8,
+            kvcache_layout,
+        )
+        k_cache_ref_layout, v_cache_ref_layout = apply_kv_layout(
+            k_cache_ref.to(dtype),
+            v_cache_ref.to(dtype),
+            num_kv_heads,
+            head_dim,
+            page_size,
+            k_vector_size,
+            kvcache_layout,
+        )
+
+        # Run FP8 kernel (with optional profiling)
+        fp8_result = run_ck(
+            batch_size,
+            num_kv_heads,
+            q_quant,
+            k_cache_quant,
+            v_cache_quant,
+            q_indptr_gpu,
+            kv_indptr_gpu,
+            kv_indices_gpu,
+            max_qo_len,
+            max_kv_len,
+            causal=causal,
+            logits_soft_cap=logits_soft_cap,
+            q_descale=q_descale,
+            k_descale=k_descale,
+            v_descale=v_descale,
+            kv_last_page_lens=kv_last_page_len_gpu,
+            block_table=block_table_gpu,
+            seqlen_k=seqlen_k_gpu,
+            profile=profile,
+        )
+        if profile:
+            out_fp8, time_us, tflops = fp8_result
+            profile_result = {"status": "passed", "time_us": time_us, "tflops": tflops}
+        else:
+            out_fp8 = fp8_result
 
-    if head_dim == 64 and qo_len <= 64:
-        pytest.skip("Unsupported configuration")
+        # Run reference (BF16/FP16) - no profiling for reference
+        out_ref = run_ck(
+            batch_size,
+            num_kv_heads,
+            q,
+            k_cache_ref_layout,
+            v_cache_ref_layout,
+            q_indptr_gpu,
+            kv_indptr_gpu,
+            kv_indices_gpu,
+            max_qo_len,
+            max_kv_len,
+            causal=causal,
+            logits_soft_cap=logits_soft_cap,
+            kv_last_page_lens=kv_last_page_len_gpu,
+            block_table=block_table_gpu,
+            seqlen_k=seqlen_k_gpu,
+            profile=False,
+        )
 
-    def create_tensor(min, max, *args, **kwargs):
-        x = torch.randn(*args, **kwargs)
-        x = (x - x.min()) / (x.max() - x.min())
-        return min + (max - min) * x
+        verify_fp8_output(out_fp8, o_ref)
+        rtol, atol = get_tolerances(dtype, is_fp8=True)
+        torch.testing.assert_close(out_ref, o_ref, rtol=rtol, atol=atol)
+    else:
+        # Prepare KV cache based on layout and contiguity
+        if kvcache_layout == "linear" and not contiguous_kv:
+            k_cache, v_cache = k_cache_ref, v_cache_ref
+        else:
+            k_cache, v_cache = apply_kv_layout(
+                k_cache_ref,
+                v_cache_ref,
+                num_kv_heads,
+                head_dim,
+                page_size,
+                k_vector_size,
+                kvcache_layout,
+            )
 
-    def convert_lens_to_indtpr(lens):
-        return torch.cumsum(torch.cat((torch.tensor([0]), lens)), dim=0).int()
+        # Verify contiguity for linear layout
+        if kvcache_layout == "linear":
+            assert k_cache.is_contiguous() == contiguous_kv
+            assert v_cache.is_contiguous() == contiguous_kv
 
-    q = create_tensor(
-        q_init_min, q_init_max, batch_size * qo_len, num_qo_heads, head_dim, dtype=dtype
-    ).to(0)
-    if 1 < batch_size:
-        qo_lens = torch.randint(1, qo_len + 1, (batch_size,)).int()
-    else:
-        qo_lens = torch.full((batch_size,), qo_len).int()
-    q_indptr_cpu = convert_lens_to_indtpr(qo_lens)
-    max_num_pages_per_seq = (kv_len + page_size - 1) // page_size
-    total_num_pages = max_num_pages_per_seq * batch_size
-    kv_shape = [total_num_pages, 2, num_kv_heads, page_size, head_dim]
-    if not contiguous_kv:
-        tmp = [kv_shape[0]]
-        for v in kv_shape[1:]:
-            tmp.append(2)
-            tmp.append(v)
-        kv_shape = tmp
-        kv_data_fp32 = create_tensor(
-            kv_init_min, kv_init_max, *kv_shape, dtype=torch.float32
-        ).to(0)
-        kv_data = kv_data_fp32.to(dtype)
-        kv_data = kv_data[:, 1, :, 1, :, 1, :, 1, :]
-        kv_data_fp32 = kv_data_fp32[:, 1, :, 1, :, 1, :, 1, :]
-        # actual data is stored in non-contiguous memory
-        assert (
-            kv_data.stride(-4)
-            != kv_data.shape[-3] * kv_data.shape[-2] * kv_data.shape[-1]
+        # Run kernel (with optional profiling)
+        run_result = run_ck(
+            batch_size,
+            num_kv_heads,
+            q,
+            k_cache,
+            v_cache,
+            q_indptr_gpu,
+            kv_indptr_gpu,
+            kv_indices_gpu,
+            max_qo_len,
+            max_kv_len,
+            causal=causal,
+            logits_soft_cap=logits_soft_cap,
+            kv_last_page_lens=kv_last_page_len_gpu,
+            block_table=block_table_gpu,
+            seqlen_k=seqlen_k_gpu,
+            profile=profile,
         )
-    else:
-        kv_data_fp32 = create_tensor(
-            kv_init_min, kv_init_max, *kv_shape, dtype=torch.float32
-        ).to(0)
-        kv_data = kv_data_fp32.to(dtype)
-    if 1 < batch_size:
-        kv_lens = torch.maximum(
-            qo_lens, torch.randint(1, kv_len + 1, (batch_size,))
-        ).int()
-    else:
-        kv_lens = torch.full((batch_size,), kv_len).int()
-    kv_num_used_pages = (kv_lens + page_size - 1) // page_size
-    kv_indptr_cpu = convert_lens_to_indtpr(kv_num_used_pages)
-    kv_indices_cpu = torch.nn.functional.pad(
-        torch.randperm(total_num_pages).int(), (0, 128), value=0
-    )
-    kv_last_page_len_cpu = ((kv_lens - 1) % page_size + 1).int()
+        if profile:
+            out, time_us, tflops = run_result
+            profile_result = {"status": "passed", "time_us": time_us, "tflops": tflops}
+        else:
+            out = run_result
 
-    q_indptr_gpu = q_indptr_cpu.to(0)
-    kv_indptr_gpu = kv_indptr_cpu.to(0)
-    kv_indices_gpu = kv_indices_cpu.to(0)
+        rtol, atol = get_tolerances(dtype)
+        assert_output_matches_reference(out, q_indptr_cpu, o_ref, rtol, atol)
 
-    chunks = torch.chunk(kv_data, 2, dim=1)
-    k_cache = chunks[0].squeeze(2).squeeze(2)
-    v_cache = chunks[1].squeeze(2).squeeze(2)
+    # Suppress return value in pytest to avoid PytestReturnNotNoneWarning
+    if os.environ.get("PYTEST_CURRENT_TEST"):
+        return
+    return profile_result
 
-    o_ck_flash_attn = aiter.mha_batch_prefill_func(
-        q,
-        k_cache,
-        v_cache,
-        q_indptr_gpu,
-        kv_indptr_gpu,
-        kv_indices_gpu,
-        torch.max(qo_lens).item(),
-        torch.max(kv_lens).item(),
-        causal=causal,
-        logits_soft_cap=logits_soft_cap,
-    )
 
-    for i in range(batch_size):
-        perm_dims = [0, 2, 1, 3] if kv_layout == "HND" else [0, 1, 2, 3]
-        perm_dims_last = [1, 0, 2] if kv_layout == "HND" else [0, 1, 2]
-        qi = q[q_indptr_cpu[i] : q_indptr_cpu[i + 1]]
-        used_kv_indices = kv_indices_cpu[kv_indptr_cpu[i] : kv_indptr_cpu[i + 1]]
-        ki = torch.cat(
-            [
-                kv_data_fp32[used_kv_indices[:-1], 0]
-                .permute(*perm_dims)
-                .reshape(-1, num_kv_heads, head_dim),
-                (
-                    kv_data_fp32[used_kv_indices[-1], 0, :, : kv_last_page_len_cpu[i]]
-                    if kv_layout == "HND"
-                    else kv_data_fp32[
-                        used_kv_indices[-1], 0, : kv_last_page_len_cpu[i], :
-                    ]
-                )
-                .permute(*perm_dims_last)
-                .reshape(-1, num_kv_heads, head_dim),
-            ],
-            dim=0,
-        ).to(dtype)
-        vi = torch.cat(
-            [
-                kv_data_fp32[used_kv_indices[:-1], 1]
-                .permute(*perm_dims)
-                .reshape(-1, num_kv_heads, head_dim),
-                (
-                    kv_data_fp32[used_kv_indices[-1], 1, :, : kv_last_page_len_cpu[i]]
-                    if kv_layout == "HND"
-                    else kv_data_fp32[
-                        used_kv_indices[-1], 1, : kv_last_page_len_cpu[i], :
-                    ]
-                )
-                .permute(*perm_dims_last)
-                .reshape(-1, num_kv_heads, head_dim),
-            ],
-            dim=0,
-        ).to(dtype)
+@perftest()
+def profile_func(target_func, *args, **kwargs):
+    return target_func(*args, **kwargs)  # pass-through; @perftest() wraps the call
 
-        # enlarge rtol for bf16 to allow passing very few numeric errors
-        rtol, atol = (1e-3, 1e-3) if dtype == torch.float16 else (2e-2, 1e-2)
 
-        o_ref_i = ref_masked_attention(
-            qi, ki, vi, causal=causal, logits_soft_cap=logits_soft_cap
-        )
+def flops(
+    batch,
+    seqlen_q,
+    seqlen_k,
+    headdim_q,
+    headdim_v,
+    nheads_q,
+    nheads_k,
+    causal,
+    mode="fwd",
+):
+    """Estimate attention FLOPs for the given mode (nheads_k is not used)."""
+    assert mode in ["fwd", "bwd", "fwd_bwd"]
+    # A causal mask is counted as half of the seqlen_q x seqlen_k area.
+    attended = seqlen_q * seqlen_k // (2 if causal else 1)
+    # Both the QK and the PV product are scaled by nheads_q, matching CK's
+    # fmha_fwd_runner.hpp even for MQA/GQA where KV heads are fewer.
+    per_unit_dim = 2 * batch * attended * nheads_q
+    fwd_flops = per_unit_dim * headdim_q + per_unit_dim * headdim_v
+    if mode == "fwd":
+        return fwd_flops
+    # Backward is weighted 2.5x the forward count, forward+backward 3.5x.
+    return (2.5 if mode == "bwd" else 3.5) * fwd_flops
 
-        o_i = o_ck_flash_attn[q_indptr_cpu[i] : q_indptr_cpu[i + 1]]
-        torch.testing.assert_close(o_i, o_ref_i, rtol=rtol, atol=atol)
+
+def efficiency(flop, time_in_us):
+    return flop / time_in_us / 10**6  # FLOP per microsecond / 1e6 == TFLOP/s
 
 
 def run_ck(
+    batch_size,
+    num_kv_heads,
     q,
     k_cache,
     v_cache,
@@ -286,43 +1006,82 @@ def run_ck(
     q_descale=None,
     k_descale=None,
     v_descale=None,
+    kv_last_page_lens=None,
+    block_table=None,
+    seqlen_k=None,
+    profile=False,
 ):
-    """Unified interface for running batch_prefill with or without FP8."""
-    if (
-        q.dtype == dtypes.fp8
-        and k_cache.dtype == dtypes.fp8
-        and v_cache.dtype == dtypes.fp8
-    ):
-        # FP8 path
-        return aiter.mha_batch_prefill_func(
-            q,
-            k_cache,
-            v_cache,
-            cu_seqlens_q,
-            kv_indptr,
-            kv_page_indices,
+    """
+    Run CK kernel with optional profiling.
+
+    Returns:
+        If profile=False: out tensor
+        If profile=True: (out tensor, time_us, tflops)
+    """
+    kernel_args = (
+        q,
+        k_cache,
+        v_cache,
+        cu_seqlens_q,
+        kv_indptr,
+        kv_page_indices,
+        max_seqlen_q,
+        max_seqlen_k,
+    )
+    kernel_kwargs = dict(
+        causal=causal,
+        logits_soft_cap=logits_soft_cap,
+        q_descale=q_descale,
+        k_descale=k_descale,
+        v_descale=v_descale,
+        kv_last_page_lens=kv_last_page_lens,
+        block_table=block_table,
+        seqlen_k=seqlen_k,
+    )
+
+    if profile:
+        out, time_us = profile_func(
+            aiter.mha_batch_prefill_func, *kernel_args, **kernel_kwargs
+        )
+        nheads_q = q.shape[1]
+        headdim = q.shape[2]
+        total_flops = flops(
+            batch_size,
             max_seqlen_q,
             max_seqlen_k,
-            causal=causal,
-            logits_soft_cap=logits_soft_cap,
-            q_descale=q_descale,
-            k_descale=k_descale,
-            v_descale=v_descale,
+            headdim,
+            headdim,
+            nheads_q,
+            num_kv_heads,
+            causal,
         )
+        tflops = efficiency(total_flops, time_us)
+        return out, time_us, tflops
     else:
-        # Standard BF16/FP16 path
-        return aiter.mha_batch_prefill_func(
-            q,
-            k_cache,
-            v_cache,
-            cu_seqlens_q,
-            kv_indptr,
-            kv_page_indices,
-            max_seqlen_q,
-            max_seqlen_k,
-            causal=causal,
-            logits_soft_cap=logits_soft_cap,
+        out = aiter.mha_batch_prefill_func(*kernel_args, **kernel_kwargs)
+        return out
+
+
+def vectorize_kv_cache(
+    k_cache, v_cache, num_kv_heads, head_dim, page_size, k_vector_size
+):
+    k_cache = k_cache.contiguous()
+    v_cache = v_cache.contiguous()
+    k_cache = (
+        k_cache.view(
+            -1, page_size, num_kv_heads, head_dim // k_vector_size, k_vector_size
+        )
+        .permute(0, 2, 3, 1, 4)
+        .contiguous()
+    )
+    v_cache = (
+        v_cache.view(
+            -1, page_size // k_vector_size, k_vector_size, num_kv_heads, head_dim
         )
+        .permute(0, 3, 1, 4, 2)
+        .contiguous()
+    )
+    return k_cache, v_cache
 
 
 def varlen_to_paged_kv(k_varlen, v_varlen, kv_lens, page_size=1):
@@ -336,7 +1095,7 @@ def varlen_to_paged_kv(k_varlen, v_varlen, kv_lens, page_size=1):
         page_size: tokens per page
 
     Returns:
-        kv_data: [total_num_pages, 2, num_kv_heads, page_size, head_dim]
+        kv_data: [total_num_pages, 2, page_size, num_kv_heads, head_dim]
         kv_indptr: [batch_size + 1]
         kv_indices: [total_num_pages + padding]
     """
@@ -355,8 +1114,8 @@ def varlen_to_paged_kv(k_varlen, v_varlen, kv_lens, page_size=1):
     kv_data = torch.zeros(
         total_num_pages,
         2,
-        num_kv_heads,
         page_size,
+        num_kv_heads,
         head_dim,
         dtype=dtype,
         device=device,
@@ -367,13 +1126,8 @@ def varlen_to_paged_kv(k_varlen, v_varlen, kv_lens, page_size=1):
     kv_indices = torch.nn.functional.pad(kv_indices, (0, 128), value=0)
 
     # Fill in the data
-    def convert_lens_to_indptr_local(lens):
-        return torch.cumsum(torch.cat((torch.tensor([0]), lens)), dim=0).int()
-
-    kv_indptr = convert_lens_to_indptr_local(
-        ((kv_lens + page_size - 1) // page_size).cpu()
-    )
-    cu_kv_lens = convert_lens_to_indptr_local(kv_lens.cpu())
+    kv_indptr = convert_lens_to_indptr(((kv_lens + page_size - 1) // page_size).cpu())
+    cu_kv_lens = convert_lens_to_indptr(kv_lens.cpu())
 
     for batch_idx in range(batch_size):
         seq_start = cu_kv_lens[batch_idx].item()
@@ -391,198 +1145,18 @@ def convert_lens_to_indptr_local(lens):
             tokens_in_page = token_end - token_start
 
             # K data
-            kv_data[global_page_idx, 0, :, :tokens_in_page, :] = k_varlen[
+            kv_data[global_page_idx, 0, :tokens_in_page, :, :] = k_varlen[
                 seq_start + token_start : seq_start + token_end, :, :
             ]
 
             # V data
-            kv_data[global_page_idx, 1, :, :tokens_in_page, :] = v_varlen[
+            kv_data[global_page_idx, 1, :tokens_in_page, :, :] = v_varlen[
                 seq_start + token_start : seq_start + token_end, :, :
             ]
 
     return kv_data, kv_indptr, kv_indices
 
 
-@pytest.mark.parametrize("causal", [False, True])
-@pytest.mark.parametrize("logits_soft_cap", [0.0, 30.0])
-@pytest.mark.parametrize("batch_size", [1, 3])
-@pytest.mark.parametrize("num_qo_heads,num_kv_heads", [(6, 1), (3, 1)])
-@pytest.mark.parametrize("head_dim", [128])
-@pytest.mark.parametrize(
-    "qo_len,kv_len",
-    [
-        (128, 128),
-        (1024, 1024),
-        (1023, 1024),
-        (1024, 1023),
-        (2048, 2048),
-    ],
-)
-@pytest.mark.parametrize("page_size", [1])
-@pytest.mark.parametrize("seed", [19378])
-def test_batch_prefill_fp8_output(
-    batch_size,
-    num_qo_heads,
-    num_kv_heads,
-    qo_len,
-    kv_len,
-    head_dim,
-    page_size,
-    causal,
-    logits_soft_cap,
-    seed,
-):
-    """Test FP8 batch_prefill by comparing with BF16 kernel, following test_mha_varlen_fp8 pattern."""
-    if seed is not None:
-        torch.manual_seed(seed)
-
-    torch.cuda.empty_cache()
-
-    if causal and kv_len < qo_len:
-        pytest.skip("kv_len < qo_len is not allowed if causal=True")
-
-    if head_dim == 64 and qo_len <= 64:
-        pytest.skip("Unsupported configuration")
-
-    dtype = torch.bfloat16
-    quant_dtype = dtypes.fp8
-
-    def convert_lens_to_indptr(lens):
-        return torch.cumsum(torch.cat((torch.tensor([0]), lens)), dim=0).int()
-
-    # Create variable sequence lengths first
-    if batch_size > 1:
-        qo_lens = torch.randint(1, qo_len + 1, (batch_size,)).int()
-    else:
-        qo_lens = torch.full((batch_size,), qo_len).int()
-    q_indptr_cpu = convert_lens_to_indptr(qo_lens)
-
-    # Create Q tensor with actual total tokens needed
-    total_q_tokens = torch.sum(qo_lens).item()
-    q = torch.rand(total_q_tokens, num_qo_heads, head_dim, device="cuda", dtype=dtype)
-
-    # Create paged KV cache following test_batch_prefill_with_paged_kv_cache pattern
-    # Generate in FP32 first for accurate reference computation
-    max_num_pages_per_seq = (kv_len + page_size - 1) // page_size
-    total_num_pages = max_num_pages_per_seq * batch_size
-    kv_shape = [total_num_pages, 2, num_kv_heads, page_size, head_dim]
-
-    kv_data_fp32 = torch.rand(*kv_shape, device="cuda", dtype=torch.float32)
-    kv_data = kv_data_fp32.to(dtype)
-
-    if batch_size > 1:
-        kv_lens = torch.maximum(
-            qo_lens, torch.randint(1, kv_len + 1, (batch_size,))
-        ).int()
-    else:
-        kv_lens = torch.full((batch_size,), kv_len).int()
-
-    kv_num_used_pages = (kv_lens + page_size - 1) // page_size
-    kv_indptr_cpu = convert_lens_to_indptr(kv_num_used_pages)
-    kv_indices_cpu = torch.nn.functional.pad(
-        torch.randperm(total_num_pages).int(), (0, 128), value=0
-    )
-    kv_last_page_len_cpu = ((kv_lens - 1) % page_size + 1).int()
-
-    q_indptr_gpu = q_indptr_cpu.to(0)
-    kv_indptr_gpu = kv_indptr_cpu.to(0)
-    kv_indices_gpu = kv_indices_cpu.to(0)
-
-    # Extract K and V caches
-    chunks = torch.chunk(kv_data, 2, dim=1)
-    k_cache = chunks[0].squeeze(2).squeeze(2)
-    v_cache = chunks[1].squeeze(2).squeeze(2)
-
-    # Quantize to FP8 following test_mha_varlen_fp8 pattern
-    q_quant, q_descale = per_tensor_quant(q, quant_dtype=quant_dtype)
-    k_cache_quant, k_descale = per_tensor_quant(k_cache, quant_dtype=quant_dtype)
-    v_cache_quant, v_descale = per_tensor_quant(v_cache, quant_dtype=quant_dtype)
-
-    # Run FP8 kernel
-    out_fp8 = run_ck(
-        q_quant,
-        k_cache_quant,
-        v_cache_quant,
-        q_indptr_gpu,
-        kv_indptr_gpu,
-        kv_indices_gpu,
-        torch.max(qo_lens).item(),
-        torch.max(kv_lens).item(),
-        causal=causal,
-        logits_soft_cap=logits_soft_cap,
-        q_descale=q_descale,
-        k_descale=k_descale,
-        v_descale=v_descale,
-    )
-
-    # Run BF16 reference kernel
-    out_ref = run_ck(
-        q,
-        k_cache,
-        v_cache,
-        q_indptr_gpu,
-        kv_indptr_gpu,
-        kv_indices_gpu,
-        torch.max(qo_lens).item(),
-        torch.max(kv_lens).item(),
-        causal=causal,
-        logits_soft_cap=logits_soft_cap,
-    )
-
-    # Compute reference output for all sequences
-    o_ref_list = []
-    for i in range(batch_size):
-        # Extract valid Q for this sequence
-        qi = q[q_indptr_cpu[i] : q_indptr_cpu[i + 1]]
-
-        # Extract valid K and V for this sequence (NHD layout)
-        used_kv_indices = kv_indices_cpu[kv_indptr_cpu[i] : kv_indptr_cpu[i + 1]]
-        ki = torch.cat(
-            [
-                kv_data_fp32[used_kv_indices[:-1], 0].reshape(
-                    -1, num_kv_heads, head_dim
-                ),
-                kv_data_fp32[
-                    used_kv_indices[-1], 0, : kv_last_page_len_cpu[i], :
-                ].reshape(-1, num_kv_heads, head_dim),
-            ],
-            dim=0,
-        ).to(dtype)
-        vi = torch.cat(
-            [
-                kv_data_fp32[used_kv_indices[:-1], 1].reshape(
-                    -1, num_kv_heads, head_dim
-                ),
-                kv_data_fp32[
-                    used_kv_indices[-1], 1, : kv_last_page_len_cpu[i], :
-                ].reshape(-1, num_kv_heads, head_dim),
-            ],
-            dim=0,
-        ).to(dtype)
-
-        # Compute reference attention for this sequence
-        o_ref_i = ref_masked_attention(
-            qi, ki, vi, causal=causal, logits_soft_cap=logits_soft_cap
-        )
-        o_ref_list.append(o_ref_i)
-
-    # Concatenate all reference outputs
-    o_ref = torch.cat(o_ref_list, dim=0)
-
-    # Compare FP8 output with reference (entire tensor)
-    # Following test_mha_varlen_fp8 threshold
-    max_diff = (out_fp8 - o_ref).abs().max().item()
-    threshold = 0.055
-    assert max_diff < threshold, (
-        f"FP8 kernel vs reference difference too large: "
-        f"{max_diff} (threshold: {threshold})"
-    )
-
-    # Also verify BF16 kernel matches reference (sanity check)
-    rtol, atol = 2e-2, 1e-2  # bf16 tolerances
-    torch.testing.assert_close(out_ref, o_ref, rtol=rtol, atol=atol)
-
-
 @pytest.mark.parametrize("batch_size", [1, 3])
 @pytest.mark.parametrize("num_qo_heads,num_kv_heads", [(6, 1), (8, 1)])
 @pytest.mark.parametrize("head_dim", [128])
@@ -614,39 +1188,33 @@ def test_batch_prefill_vs_varlen_fp8(
     torch.manual_seed(42)
     dtype = torch.bfloat16
     quant_dtype = dtypes.fp8
+    page_size = 128
+    k_vector_size = get_vector_size(quant_dtype)
 
-    def create_tensor(min_val, max_val, *args, **kwargs):
-        """Create a tensor with values uniformly distributed in [min_val, max_val]."""
-        x = torch.randn(*args, **kwargs)
-        x = (x - x.min()) / (x.max() - x.min())
-        return min_val + (max_val - min_val) * x
-
-    def convert_lens_to_indptr(lens):
-        """Convert sequence lengths to cumulative index pointer."""
-        return torch.cumsum(torch.cat((torch.tensor([0]), lens)), dim=0).int()
-
-    # Create Q, K, V in varlen format (BF16 first)
-    if batch_size > 1:
-        qo_lens = torch.randint(qo_len // 2, qo_len + 1, (batch_size,)).int()
-        kv_lens = torch.maximum(
-            qo_lens, torch.randint(kv_len // 2, kv_len + 1, (batch_size,))
-        ).int()
-    else:
-        qo_lens = torch.full((batch_size,), qo_len).int()
-        kv_lens = torch.full((batch_size,), kv_len).int()
+    if skip_test_if(
+        page_size % k_vector_size != 0 or head_dim % k_vector_size != 0,
+        "Vectorized layout requires page/head dim divisible by vector size",
+    ):
+        return
 
+    # Build sequence lengths
+    qo_lens = build_qo_lens(batch_size, qo_len, randomize=batch_size > 1)
+    kv_lens = build_kv_lens(batch_size, kv_len, qo_lens, randomize=batch_size > 1)
     total_q_tokens = qo_lens.sum().item()
     total_kv_tokens = kv_lens.sum().item()
+    max_qo_len = qo_lens.max().item()
+    max_kv_len = kv_lens.max().item()
 
-    q_bf16 = create_tensor(
+    # Create Q, K, V in varlen format
+    q_bf16 = make_scaled_rand(
         -10, 10, total_q_tokens, num_qo_heads, head_dim, dtype=dtype
-    ).cuda()
-    k_bf16 = create_tensor(
+    )
+    k_bf16 = make_scaled_rand(
         -5, 5, total_kv_tokens, num_kv_heads, head_dim, dtype=dtype
-    ).cuda()
-    v_bf16 = create_tensor(
+    )
+    v_bf16 = make_scaled_rand(
         -5, 5, total_kv_tokens, num_kv_heads, head_dim, dtype=dtype
-    ).cuda()
+    )
 
     # Quantize to FP8
     q_fp8, q_descale = per_tensor_quant(q_bf16, quant_dtype=quant_dtype)
@@ -666,8 +1234,8 @@ def convert_lens_to_indptr(lens):
         v_descale,
         cu_seqlens_q,
         cu_seqlens_k,
-        max_seqlen_q=qo_lens.max().item(),
-        max_seqlen_k=kv_lens.max().item(),
+        max_seqlen_q=max_qo_len,
+        max_seqlen_k=max_kv_len,
         min_seqlen_q=0,
         causal=causal,
         logits_soft_cap=logits_soft_cap,
@@ -676,12 +1244,31 @@ def convert_lens_to_indptr(lens):
 
     # Convert to paged KV cache format
     kv_data, kv_indptr, kv_indices = varlen_to_paged_kv(
-        k_fp8, v_fp8, kv_lens, page_size=1
+        k_fp8, v_fp8, kv_lens, page_size=page_size
     )
+    kv_last_page_len_gpu = ((kv_lens - 1) % page_size + 1).int().to(0)
+    seqlen_k_gpu = kv_lens.to(0).int()
+    max_num_pages_per_seq = (max_kv_len + page_size - 1) // page_size
 
-    # Extract K and V from paged format
-    k_paged = kv_data[:, 0, :, :, :].squeeze(2)
-    v_paged = kv_data[:, 1, :, :, :].squeeze(2)
+    # Build block table
+    block_table_cpu = torch.zeros(
+        (batch_size, max_num_pages_per_seq), dtype=torch.int32
+    )
+    for i in range(batch_size):
+        start, end = kv_indptr[i].item(), kv_indptr[i + 1].item()
+        block_table_cpu[i, : (end - start)] = kv_indices[start:end]
+    block_table_gpu = block_table_cpu.to(0)
+
+    # Extract and vectorize K/V from paged format
+    k_cache_raw, v_cache_raw = split_kv_pages(kv_data)
+    k_paged, v_paged = vectorize_kv_cache(
+        k_cache_raw,
+        v_cache_raw,
+        num_kv_heads,
+        head_dim,
+        page_size,
+        k_vector_size,
+    )
 
     # Run batch_prefill FP8
     out_batch_prefill = aiter.mha_batch_prefill_func(
@@ -691,23 +1278,18 @@ def convert_lens_to_indptr(lens):
         cu_seqlens_q,
         kv_indptr.cuda(),
         kv_indices.cuda(),
-        max_seqlen_q=qo_lens.max().item(),
-        max_seqlen_k=kv_lens.max().item(),
+        max_seqlen_q=max_qo_len,
+        max_seqlen_k=max_kv_len,
         causal=causal,
         logits_soft_cap=logits_soft_cap,
         q_descale=q_descale,
         k_descale=k_descale,
         v_descale=v_descale,
+        kv_last_page_lens=kv_last_page_len_gpu,
+        block_table=block_table_gpu,
+        seqlen_k=seqlen_k_gpu,
     )
 
-    # Compare results (all tokens are valid, no padding)
-    print("\n=== FP8 Comparison: batch_prefill vs varlen ===")
-    print(
-        f"batch_size={batch_size}, heads={num_qo_heads}/{num_kv_heads}, "
-        f"dim={head_dim}, qo_len={qo_len}, kv_len={kv_len}"
-    )
-    print(f"causal={causal}, logits_soft_cap={logits_soft_cap}")
-
     # Sanity check: outputs should not be all zeros
     assert (
         out_varlen.abs().max().item() > 1e-6
@@ -716,22 +1298,7 @@ def convert_lens_to_indptr(lens):
         out_batch_prefill.abs().max().item() > 1e-6
     ), "Batch_prefill output is all zeros - kernel may not have launched!"
 
-    # Compute differences on entire tensor
-    diff = (out_varlen - out_batch_prefill).abs()
-    max_diff_all = diff.max().item()
-    mean_diff_all = diff.mean().item()
-
-    print(f"Max diff: {max_diff_all:.6e}")
-    print(f"Mean diff: {mean_diff_all:.6e}")
-    print(f"Varlen output max: {out_varlen.abs().max().item():.6e}")
-    print(f"Batch_prefill output max: {out_batch_prefill.abs().max().item():.6e}")
-
-    if out_varlen.abs().max().item() > 0:
-        rel_error = max_diff_all / out_varlen.abs().max().item()
-        print(f"Relative error: {rel_error * 100:.4f}%")
-
     # Should be nearly identical (same pipeline, same computation)
-    # FP8 may have slightly larger tolerance
     rtol, atol = 1e-4, 1e-4
     torch.testing.assert_close(out_batch_prefill, out_varlen, rtol=rtol, atol=atol)
 
@@ -771,55 +1338,162 @@ def convert_lens_to_indptr(lens):
     e.g.: -d bf16""",
 )
 parser.add_argument(
-    "--test_fp8",
+    "-s",
+    "--seqlen",
+    type=int,
+    const=None,
+    default=1024,
+    help="""seqlen.
+    e.g.: -s 1024""",
+)
+parser.add_argument(
+    "-p",
+    "--pagesize",
+    type=int,
+    const=None,
+    choices=[1, 1024],
+    default=[1, 1024],
+    nargs="*",
+    help="""page size.
+    e.g.: -p 1024""",
+)
+parser.add_argument(
+    "-q",
+    "--headq",
+    type=int,
+    const=None,
+    default=8,
+    help="""number of q heads.
+    e.g.: -q 8""",
+)
+parser.add_argument(
+    "-k",
+    "--headk",
+    type=int,
+    const=None,
+    default=8,
+    help="""number of kv heads.
+    e.g.: -k 8""",
+)
+parser.add_argument(
+    "-t",
+    "--lookup_table",
+    type=str,
+    const=None,
+    choices=["sglang", "vllm"],
+    default=["sglang", "vllm"],
+    nargs="*",
+    help="""lookup table.
+    e.g.: -t sglang""",
+)
+parser.add_argument(
+    "--kv_layout",
+    type=str,
+    const=None,
+    choices=["vectorized", "linear"],
+    default=["vectorized"],
+    nargs="*",
+    help="""kv cache layout.
+    e.g.: --kv_layout vectorized""",
+)
+parser.add_argument(
+    "--input_dtype",
+    type=dtypes.str2Dtype,
+    const=None,
+    choices=[dtypes.d_dtypes["fp16"], dtypes.d_dtypes["bf16"], dtypes.d_dtypes["fp8"]],
+    default="bf16, fp8",
+    nargs="*",
+    help="""input dtype.
+    e.g.: --input_dtype bf16""",
+)
+parser.add_argument(
+    "--profile",
     action="store_true",
-    help="""Run FP8 test instead of standard test.
-    e.g.: --test_fp8""",
+    help="Enable profiling mode",
 )
 
+
 if __name__ == "__main__":
     args = parser.parse_args()
 
-    if args.test_fp8:
-        # Run FP8 tests
-        for causal, logits_soft_cap in itertools.product(
-            args.causal, args.logits_soft_cap
-        ):
-            test_batch_prefill_fp8_output(
-                batch_size=1,
-                qo_len=8192,
-                kv_len=8192,
-                page_size=1,
-                num_qo_heads=6,
-                num_kv_heads=1,
-                head_dim=128,
-                causal=causal,
-                logits_soft_cap=logits_soft_cap,
-                seed=19378,
-            )
-    else:
-        # Run standard tests
-        for (
-            causal,
-            logits_soft_cap,
-            dtype,
-        ) in itertools.product(args.causal, args.logits_soft_cap, args.dtype):
-            test_batch_prefill_with_paged_kv_cache(
-                batch_size=1,
-                kv_len=8192,
-                qo_len=8192,
-                page_size=1,
-                num_qo_heads=6,
-                num_kv_heads=1,
-                head_dim=128,
-                causal=causal,
-                kv_layout="NHD",
-                logits_soft_cap=logits_soft_cap,
-                contiguous_kv=True,
-                dtype=dtype,
-                q_init_min=-10,
-                q_init_max=10,
-                kv_init_min=-5,
-                kv_init_max=5,
-                seed=19378,
-            )
+    collected = []
+    for (
+        page_size,
+        causal,
+        logits_soft_cap,
+        dtype,
+        lookup_table,
+        kv_layout,
+        input_dtype,
+        contiguous_kv,
+    ) in itertools.product(
+        args.pagesize,
+        args.causal,
+        args.logits_soft_cap,
+        args.dtype,
+        args.lookup_table,
+        args.kv_layout,
+        args.input_dtype,
+        [True, False],  # contiguous_kv
+    ):
+        result = test_batch_prefill(
+            kvcache_layout=kv_layout,
+            table_layout=lookup_table,
+            input_dtype=input_dtype,
+            batch_size=1,
+            qo_len=args.seqlen,
+            kv_len=args.seqlen,
+            page_size=page_size,
+            num_qo_heads=args.headq,
+            num_kv_heads=args.headk,
+            head_dim=128,
+            causal=causal,
+            logits_soft_cap=logits_soft_cap,
+            dtype=dtype,
+            q_init_min=-10,
+            q_init_max=10,
+            kv_init_min=-5,
+            kv_init_max=5,
+            contiguous_kv=contiguous_kv,
+            seed=19378,
+            profile=args.profile,
+        )
+
+        # Build result row
+        time_us = result.get("time_us") if result else None
+        tflops = result.get("tflops") if result else None
+        row = {
+            "seqlen": args.seqlen,
+            "page_sz": page_size,
+            "h_q": args.headq,
+            "h_kv": args.headk,
+            "hdim": 128,
+            "input_dtype": str(input_dtype).split(".")[-1],
+            "kv_layout": kv_layout,
+            "table": lookup_table,
+            "causal": causal,
+            "soft_cap": logits_soft_cap,
+            "contig": contiguous_kv,
+            "status": result.get("status", "passed") if result else "passed",
+            "time_us": f"{time_us:.2f}" if time_us is not None else "-",
+            "tflops": f"{tflops:.2f}" if tflops is not None else "-",
+        }
+
+        collected.append(row)
+
+    # Print summary
+    df = pd.DataFrame(collected)
+    pd.set_option("display.max_rows", None)
+    pd.set_option("display.max_columns", None)
+    pd.set_option("display.width", None)
+    pd.set_option("display.float_format", lambda x: f"{x:.2f}")
+
+    print("\n" + "=" * 100)
+    aiter.logger.info(f"\n=== Batch Prefill Summary ===\n{df.to_string(index=False)}")
+
+    # Print statistics
+    passed = df[df["status"] == "passed"].shape[0]
+    skipped = df[df["status"] == "skipped"].shape[0]
+    total = len(collected)
+    print(f"\nTotal: {total}, Passed: {passed}, Skipped: {skipped}")
+    print("=" * 100)
diff --git a/op_tests/test_batched_gemm_a8w8.py b/op_tests/test_batched_gemm_a8w8.py
index ddac5f8153..a89976cac8 100644
--- a/op_tests/test_batched_gemm_a8w8.py
+++ b/op_tests/test_batched_gemm_a8w8.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import torch.nn.functional as F
diff --git a/op_tests/test_batched_gemm_bf16.py b/op_tests/test_batched_gemm_bf16.py
index e0c6d81420..078514d11b 100644
--- a/op_tests/test_batched_gemm_bf16.py
+++ b/op_tests/test_batched_gemm_bf16.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import torch.nn.functional as F
diff --git a/op_tests/test_deepgemm.py b/op_tests/test_deepgemm.py
index c300cc5540..b8db4cfc25 100644
--- a/op_tests/test_deepgemm.py
+++ b/op_tests/test_deepgemm.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2025-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import itertools
diff --git a/op_tests/test_fused_mrope_rms.py b/op_tests/test_fused_mrope_rms.py
index eb495b9e5d..13a17b4f40 100644
--- a/op_tests/test_fused_mrope_rms.py
+++ b/op_tests/test_fused_mrope_rms.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 from torch import Tensor
diff --git a/op_tests/test_fused_qk_norm_rope_cache_quant.py b/op_tests/test_fused_qk_norm_rope_cache_quant.py
index 52e6daad92..7219bf822a 100644
--- a/op_tests/test_fused_qk_norm_rope_cache_quant.py
+++ b/op_tests/test_fused_qk_norm_rope_cache_quant.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 from torch import Tensor
diff --git a/op_tests/test_gemm_a16w16.py b/op_tests/test_gemm_a16w16.py
index 23a1a5f850..f8740fb30e 100755
--- a/op_tests/test_gemm_a16w16.py
+++ b/op_tests/test_gemm_a16w16.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import argparse
 import random
diff --git a/op_tests/test_gemm_a4w4.py b/op_tests/test_gemm_a4w4.py
index d009f5bf74..51b34d1cfb 100644
--- a/op_tests/test_gemm_a4w4.py
+++ b/op_tests/test_gemm_a4w4.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import argparse
 
diff --git a/op_tests/test_gemm_a8w8.py b/op_tests/test_gemm_a8w8.py
index fff29471bd..a0a8d61dbe 100755
--- a/op_tests/test_gemm_a8w8.py
+++ b/op_tests/test_gemm_a8w8.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import torch.nn.functional as F
diff --git a/op_tests/test_mla.py b/op_tests/test_mla.py
index 1c0df429c3..7322b30bec 100644
--- a/op_tests/test_mla.py
+++ b/op_tests/test_mla.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import argparse
 import itertools
diff --git a/op_tests/triton_tests/attention/test_chunked_pa_prefill.py b/op_tests/triton_tests/attention/test_chunked_pa_prefill.py
index f23263b71e..424f5a1147 100644
--- a/op_tests/triton_tests/attention/test_chunked_pa_prefill.py
+++ b/op_tests/triton_tests/attention/test_chunked_pa_prefill.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import math
 import random
diff --git a/op_tests/triton_tests/attention/test_extend_attention.py b/op_tests/triton_tests/attention/test_extend_attention.py
index a0b5b8ef4a..7fbe8f0135 100644
--- a/op_tests/triton_tests/attention/test_extend_attention.py
+++ b/op_tests/triton_tests/attention/test_extend_attention.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import pytest
diff --git a/op_tests/triton_tests/attention/test_la.py b/op_tests/triton_tests/attention/test_la.py
index b2a7a448a0..ad38626027 100644
--- a/op_tests/triton_tests/attention/test_la.py
+++ b/op_tests/triton_tests/attention/test_la.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import sys
 import pytest
diff --git a/op_tests/triton_tests/attention/test_mha.py b/op_tests/triton_tests/attention/test_mha.py
index a3b2d0e917..d13804b9a5 100644
--- a/op_tests/triton_tests/attention/test_mha.py
+++ b/op_tests/triton_tests/attention/test_mha.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import pytest
diff --git a/op_tests/triton_tests/attention/test_mla_decode_rope.py b/op_tests/triton_tests/attention/test_mla_decode_rope.py
index b3ec09f752..d4ce4d4024 100644
--- a/op_tests/triton_tests/attention/test_mla_decode_rope.py
+++ b/op_tests/triton_tests/attention/test_mla_decode_rope.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import pytest
diff --git a/op_tests/triton_tests/attention/test_pa_decode.py b/op_tests/triton_tests/attention/test_pa_decode.py
index 57e3ab4111..bb696bd577 100644
--- a/op_tests/triton_tests/attention/test_pa_decode.py
+++ b/op_tests/triton_tests/attention/test_pa_decode.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton.language as tl
 import torch
diff --git a/op_tests/triton_tests/attention/test_pa_prefill.py b/op_tests/triton_tests/attention/test_pa_prefill.py
index 8c3635452c..d97f89f4e9 100644
--- a/op_tests/triton_tests/attention/test_pa_prefill.py
+++ b/op_tests/triton_tests/attention/test_pa_prefill.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import math
 import random
diff --git a/op_tests/triton_tests/attention/test_prefill_attention.py b/op_tests/triton_tests/attention/test_prefill_attention.py
index 9750b61892..b7da56bc4d 100644
--- a/op_tests/triton_tests/attention/test_prefill_attention.py
+++ b/op_tests/triton_tests/attention/test_prefill_attention.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import pytest
diff --git a/op_tests/triton_tests/gemm/basic/test_gemm_a16w16.py b/op_tests/triton_tests/gemm/basic/test_gemm_a16w16.py
index 78858572b5..b6a7df7215 100644
--- a/op_tests/triton_tests/gemm/basic/test_gemm_a16w16.py
+++ b/op_tests/triton_tests/gemm/basic/test_gemm_a16w16.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import torch.nn.functional as F
diff --git a/op_tests/triton_tests/gemm/basic/test_gemm_a16w8_blockscale.py b/op_tests/triton_tests/gemm/basic/test_gemm_a16w8_blockscale.py
index a4b816c57b..3cf32d9832 100644
--- a/op_tests/triton_tests/gemm/basic/test_gemm_a16w8_blockscale.py
+++ b/op_tests/triton_tests/gemm/basic/test_gemm_a16w8_blockscale.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import triton
diff --git a/op_tests/triton_tests/gemm/basic/test_gemm_a8w8.py b/op_tests/triton_tests/gemm/basic/test_gemm_a8w8.py
index 38262a90fc..4609fd94ca 100644
--- a/op_tests/triton_tests/gemm/basic/test_gemm_a8w8.py
+++ b/op_tests/triton_tests/gemm/basic/test_gemm_a8w8.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import pytest
diff --git a/op_tests/triton_tests/gemm/basic/test_gemm_a8w8_blockscale.py b/op_tests/triton_tests/gemm/basic/test_gemm_a8w8_blockscale.py
index 29771e33a7..c8beda893b 100644
--- a/op_tests/triton_tests/gemm/basic/test_gemm_a8w8_blockscale.py
+++ b/op_tests/triton_tests/gemm/basic/test_gemm_a8w8_blockscale.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import pytest
diff --git a/op_tests/triton_tests/gemm/basic/test_gemm_a8w8_per_token_scale.py b/op_tests/triton_tests/gemm/basic/test_gemm_a8w8_per_token_scale.py
index 67de79da55..51faff9d24 100644
--- a/op_tests/triton_tests/gemm/basic/test_gemm_a8w8_per_token_scale.py
+++ b/op_tests/triton_tests/gemm/basic/test_gemm_a8w8_per_token_scale.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import pytest
diff --git a/op_tests/triton_tests/gemm/basic/test_gemm_a8wfp4.py b/op_tests/triton_tests/gemm/basic/test_gemm_a8wfp4.py
index 87a53db38b..b3dc421912 100644
--- a/op_tests/triton_tests/gemm/basic/test_gemm_a8wfp4.py
+++ b/op_tests/triton_tests/gemm/basic/test_gemm_a8wfp4.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import pytest
diff --git a/op_tests/triton_tests/gemm/basic/test_gemm_afp4wfp4.py b/op_tests/triton_tests/gemm/basic/test_gemm_afp4wfp4.py
index e457a263d3..1963564a66 100644
--- a/op_tests/triton_tests/gemm/basic/test_gemm_afp4wfp4.py
+++ b/op_tests/triton_tests/gemm/basic/test_gemm_afp4wfp4.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 import pytest
 import torch
 from aiter.ops.triton.gemm.basic.gemm_afp4wfp4 import (
diff --git a/op_tests/triton_tests/gemm/batched/test_batched_gemm_a8w8.py b/op_tests/triton_tests/gemm/batched/test_batched_gemm_a8w8.py
index 9f2053a914..6366a0660c 100644
--- a/op_tests/triton_tests/gemm/batched/test_batched_gemm_a8w8.py
+++ b/op_tests/triton_tests/gemm/batched/test_batched_gemm_a8w8.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import pytest
diff --git a/op_tests/triton_tests/gemm/batched/test_batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant.py b/op_tests/triton_tests/gemm/batched/test_batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant.py
index 1d1842af0d..cb654772a1 100644
--- a/op_tests/triton_tests/gemm/batched/test_batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant.py
+++ b/op_tests/triton_tests/gemm/batched/test_batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import triton
diff --git a/op_tests/triton_tests/gemm/batched/test_batched_gemm_bf16.py b/op_tests/triton_tests/gemm/batched/test_batched_gemm_bf16.py
index f1228661a6..5d89a6cb6c 100644
--- a/op_tests/triton_tests/gemm/batched/test_batched_gemm_bf16.py
+++ b/op_tests/triton_tests/gemm/batched/test_batched_gemm_bf16.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import pytest
diff --git a/op_tests/triton_tests/gemm/fused/test_fused_gemm_a8w8_blockscale_a16w16.py b/op_tests/triton_tests/gemm/fused/test_fused_gemm_a8w8_blockscale_a16w16.py
index d34e4c686d..6adda8f67c 100644
--- a/op_tests/triton_tests/gemm/fused/test_fused_gemm_a8w8_blockscale_a16w16.py
+++ b/op_tests/triton_tests/gemm/fused/test_fused_gemm_a8w8_blockscale_a16w16.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import triton
diff --git a/op_tests/triton_tests/gemm/fused/test_fused_gemm_a8w8_blockscale_mul_add.py b/op_tests/triton_tests/gemm/fused/test_fused_gemm_a8w8_blockscale_mul_add.py
index 85d11b32b9..79d0dfdb37 100644
--- a/op_tests/triton_tests/gemm/fused/test_fused_gemm_a8w8_blockscale_mul_add.py
+++ b/op_tests/triton_tests/gemm/fused/test_fused_gemm_a8w8_blockscale_mul_add.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 import pytest
 import torch
 from aiter.ops.triton.gemm.fused.fused_gemm_a8w8_blockscale_mul_add import (
diff --git a/op_tests/triton_tests/gemm/fused/test_fused_gemm_afp4wfp4_a16w16.py b/op_tests/triton_tests/gemm/fused/test_fused_gemm_afp4wfp4_a16w16.py
index 13fc600362..2daaa1e942 100644
--- a/op_tests/triton_tests/gemm/fused/test_fused_gemm_afp4wfp4_a16w16.py
+++ b/op_tests/triton_tests/gemm/fused/test_fused_gemm_afp4wfp4_a16w16.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import triton
diff --git a/op_tests/triton_tests/gemm/fused/test_fused_gemm_afp4wfp4_mul_add.py b/op_tests/triton_tests/gemm/fused/test_fused_gemm_afp4wfp4_mul_add.py
index dd8d843de6..28bb8b4ceb 100644
--- a/op_tests/triton_tests/gemm/fused/test_fused_gemm_afp4wfp4_mul_add.py
+++ b/op_tests/triton_tests/gemm/fused/test_fused_gemm_afp4wfp4_mul_add.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 import pytest
 import torch
 from aiter.ops.triton.gemm.fused.fused_gemm_afp4wfp4_mul_add import (
diff --git a/op_tests/triton_tests/gemm/fused/test_fused_gemm_afp4wfp4_split_cat.py b/op_tests/triton_tests/gemm/fused/test_fused_gemm_afp4wfp4_split_cat.py
index 8afb1c2647..e05dd8d762 100644
--- a/op_tests/triton_tests/gemm/fused/test_fused_gemm_afp4wfp4_split_cat.py
+++ b/op_tests/triton_tests/gemm/fused/test_fused_gemm_afp4wfp4_split_cat.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import pytest
diff --git a/op_tests/triton_tests/moe/test_moe.py b/op_tests/triton_tests/moe/test_moe.py
index 9b5f4935d7..35faae453e 100644
--- a/op_tests/triton_tests/moe/test_moe.py
+++ b/op_tests/triton_tests/moe/test_moe.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import pytest
diff --git a/op_tests/triton_tests/moe/test_moe_align_block_size.py b/op_tests/triton_tests/moe/test_moe_align_block_size.py
index a9ce3dd66e..d5afe4f5c9 100644
--- a/op_tests/triton_tests/moe/test_moe_align_block_size.py
+++ b/op_tests/triton_tests/moe/test_moe_align_block_size.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import triton
 import torch
diff --git a/op_tests/triton_tests/moe/test_moe_mx.py b/op_tests/triton_tests/moe/test_moe_mx.py
index f205deb86f..74d2796e25 100644
--- a/op_tests/triton_tests/moe/test_moe_mx.py
+++ b/op_tests/triton_tests/moe/test_moe_mx.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import pytest
 import torch
diff --git a/op_tests/triton_tests/moe/test_moe_routing_sigmoid_top1_fused.py b/op_tests/triton_tests/moe/test_moe_routing_sigmoid_top1_fused.py
index 504bfd35e0..9c16966aeb 100644
--- a/op_tests/triton_tests/moe/test_moe_routing_sigmoid_top1_fused.py
+++ b/op_tests/triton_tests/moe/test_moe_routing_sigmoid_top1_fused.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 from functools import partial
 
diff --git a/op_tests/triton_tests/normalization/test_layernorm.py b/op_tests/triton_tests/normalization/test_layernorm.py
index 6eb25f96eb..9fbc25f44f 100644
--- a/op_tests/triton_tests/normalization/test_layernorm.py
+++ b/op_tests/triton_tests/normalization/test_layernorm.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import torch.nn.functional as F
diff --git a/op_tests/triton_tests/normalization/test_rmsnorm.py b/op_tests/triton_tests/normalization/test_rmsnorm.py
index 9869dec3fb..0a3d5406c4 100644
--- a/op_tests/triton_tests/normalization/test_rmsnorm.py
+++ b/op_tests/triton_tests/normalization/test_rmsnorm.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import pytest
 import torch
diff --git a/op_tests/triton_tests/rope/test_rope.py b/op_tests/triton_tests/rope/test_rope.py
index 790c45daee..04134bbaeb 100644
--- a/op_tests/triton_tests/rope/test_rope.py
+++ b/op_tests/triton_tests/rope/test_rope.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import pytest