diff --git a/benchmark/blocksparse_attention/benchmark_tilelang_block_sparse_fmha.py b/benchmark/blocksparse_attention/benchmark_tilelang_block_sparse_fmha.py
index aefe4d420..7c9edb595 100644
--- a/benchmark/blocksparse_attention/benchmark_tilelang_block_sparse_fmha.py
+++ b/benchmark/blocksparse_attention/benchmark_tilelang_block_sparse_fmha.py
@@ -95,6 +95,8 @@ def Softmax(
             T.copy(scores_max, scores_max_prev)
             T.fill(scores_max, -T.infinity(accum_dtype))
             T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+            for i in T.Parallel(block_M):
+                scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
             # To do causal softmax, we need to set the scores_max to 0 if it is -inf
             # This process is called Check_inf in FlashAttention3 code, and it only need to be done
             # in the first ceil_div(kBlockM, kBlockN) steps.
diff --git a/examples/amd/example_amd_flash_attn_bwd.py b/examples/amd/example_amd_flash_attn_bwd.py
index d47866e1e..d5c52f9ca 100644
--- a/examples/amd/example_amd_flash_attn_bwd.py
+++ b/examples/amd/example_amd_flash_attn_bwd.py
@@ -178,6 +178,8 @@ def main(
 
                     T.copy(m_i, m_prev)
                     T.reduce_max(acc_s, m_i, dim=1, clear=False)
+                    for i in T.Parallel(block_M):
+                        m_i[i] = T.max(m_i[i], m_prev[i])
 
                     for i in T.Parallel(block_M):
                         if m_prev[i] == -T.infinity(accum_dtype):
diff --git a/examples/amd/example_amd_flash_attn_fwd.py b/examples/amd/example_amd_flash_attn_fwd.py
index 6ec5db1e5..3c422c285 100644
--- a/examples/amd/example_amd_flash_attn_fwd.py
+++ b/examples/amd/example_amd_flash_attn_fwd.py
@@ -171,6 +171,8 @@ def main(
 
                     T.copy(m_i, m_prev)
                     T.reduce_max(acc_s, m_i, dim=1, clear=False)
+                    for i in T.Parallel(block_M):
+                        m_i[i] = T.max(m_i[i], m_prev[i])
 
                     for i in T.Parallel(block_M):
                         sf = T.exp(m_prev[i] * scale - m_i[i] * scale)
diff --git a/examples/attention_sink/example_gqa_sink_bwd_bhsd.py b/examples/attention_sink/example_gqa_sink_bwd_bhsd.py
index eec43db99..b442505fc 100644
--- a/examples/attention_sink/example_gqa_sink_bwd_bhsd.py
+++ b/examples/attention_sink/example_gqa_sink_bwd_bhsd.py
@@ -99,6 +99,8 @@ def flash_fwd(
                 T.copy(V[bz, by // groups, k * block_N:(k + 1) * block_N, :], V_shared)
                 T.copy(scores_max, scores_max_prev)
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+                for i in T.Parallel(block_M):
+                    scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                 # To do causal softmax, we need to set the scores_max to 0 if it is -inf
                 # This process is called Check_inf in FlashAttention3 code, and it only need to be done
                 # NOTE(wt): check_inf is necessary for sliding window attention.
diff --git a/examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py b/examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py
index 7765603af..8d1817267 100644
--- a/examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py
+++ b/examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py
@@ -105,6 +105,8 @@ def Softmax(
         T.copy(scores_max, scores_max_prev)
         T.fill(scores_max, -T.infinity(accum_dtype))
         T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+        for i in T.Parallel(block_M):
+            scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
         # To do causal softmax, we need to set the scores_max to 0 if it is -inf
         # This process is called Check_inf in FlashAttention3 code, and it only need to be done
         # NOTE(wt): check_inf is necessary for sliding window attention.
@@ -181,7 +183,7 @@ def main(
                     num_stages=num_stages,
                     order=[-1, 0, 3, 1, -1, 2],
                     stage=[-1, 0, 0, 1, -1, 1],
-                    group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]):
+                    group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]]):
                 MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz)
                 Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum,
                         logsum)
diff --git a/examples/attention_sink/example_mha_sink_bwd_bhsd.py b/examples/attention_sink/example_mha_sink_bwd_bhsd.py
index 866668e41..b9fa0fd97 100644
--- a/examples/attention_sink/example_mha_sink_bwd_bhsd.py
+++ b/examples/attention_sink/example_mha_sink_bwd_bhsd.py
@@ -96,6 +96,8 @@ def flash_fwd(
                 T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared)
                 T.copy(scores_max, scores_max_prev)
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+                for i in T.Parallel(block_M):
+                    scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                 # To do causal softmax, we need to set the scores_max to 0 if it is -inf
                 # This process is called Check_inf in FlashAttention3 code, and it only need to be done
                 # NOTE(wt): check_inf is necessary for sliding window attention.
diff --git a/examples/attention_sink/example_mha_sink_fwd_bhsd.py b/examples/attention_sink/example_mha_sink_fwd_bhsd.py
index 2449b090c..0ccb69588 100644
--- a/examples/attention_sink/example_mha_sink_fwd_bhsd.py
+++ b/examples/attention_sink/example_mha_sink_fwd_bhsd.py
@@ -95,6 +95,8 @@ def Softmax(
         T.copy(scores_max, scores_max_prev)
         T.fill(scores_max, -T.infinity(accum_dtype))
         T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+        for i in T.Parallel(block_M):
+            scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
         # To do causal softmax, we need to set the scores_max to 0 if it is -inf
         # This process is called Check_inf in FlashAttention3 code, and it only need to be done
         # NOTE(wt): check_inf is necessary for sliding window attention.
diff --git a/examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py b/examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py
index 352844075..64d6ec698 100644
--- a/examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py
+++ b/examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py
@@ -98,6 +98,8 @@ def Softmax(
         T.copy(scores_max, scores_max_prev)
         T.fill(scores_max, -T.infinity(accum_dtype))
         T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+        for i in T.Parallel(block_M):
+            scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
         # To do causal softmax, we need to set the scores_max to 0 if it is -inf
         # This process is called Check_inf in FlashAttention3 code, and it only need to be done
         # NOTE(wt): check_inf is necessary for sliding window attention.
@@ -174,7 +176,7 @@ def main(
                     num_stages=num_stages,
                     order=[-1, 0, 3, 1, -1, 2],
                     stage=[-1, 0, 0, 1, -1, 1],
-                    group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]):
+                    group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]]):
                 MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz)
                 Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum,
                         logsum)
diff --git a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py
index e29982162..1c4b847de 100644
--- a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py
+++ b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py
@@ -105,8 +105,7 @@ def flash_attn_split(
                         T.fill(scores_max, -T.infinity(accum_dtype))
                         T.reduce_max(acc_s, scores_max, dim=1, clear=False)
                         for i in T.Parallel(block_H):
-                            scores_max[i] = T.if_then_else(scores_max[i] > scores_max_prev[i],
-                                                           scores_max[i], scores_max_prev[i])
+                            scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                             scores_scale[i] = T.exp2(scores_max_prev[i] * scale -
                                                      scores_max[i] * scale)
                         for i, j in T.Parallel(block_H, block_N):
diff --git a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py
index ae3004267..b30875228 100644
--- a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py
+++ b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py
@@ -95,8 +95,7 @@ def flash_attn_split(
                         T.fill(scores_max, -T.infinity(accum_dtype))
                         T.reduce_max(acc_s, scores_max, dim=1, clear=False)
                         for i in T.Parallel(block_H):
-                            scores_max[i] = T.if_then_else(scores_max[i] > scores_max_prev[i],
-                                                           scores_max[i], scores_max_prev[i])
+                            scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                             scores_scale[i] = T.exp2(scores_max_prev[i] * scale -
                                                      scores_max[i] * scale)
                         for i, j in T.Parallel(block_H, block_N):
diff --git a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py
index ad62817dd..3417bd7f8 100644
--- a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py
+++ b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py
@@ -92,6 +92,7 @@ def flash_attn_split(
                         T.fill(scores_max, -T.infinity(accum_dtype))
                         T.reduce_max(acc_s, scores_max, dim=1, clear=False)
                         for i in T.Parallel(block_H):
+                            scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                             scores_scale[i] = T.exp2(scores_max_prev[i] * scale -
                                                      scores_max[i] * scale)
                         for i, j in T.Parallel(block_H, block_N):
diff --git a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py
index db460437f..61c3b63c0 100644
--- a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py
+++ b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py
@@ -91,6 +91,8 @@ def flash_attn(
                 T.copy(scores_max, scores_max_prev)
                 T.fill(scores_max, -T.infinity(accum_dtype))
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+                for i in T.Parallel(block_H):
+                    scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                 for i in T.Parallel(block_H):
                     scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
                 for i, j in T.Parallel(block_H, block_N):
@@ -157,6 +159,8 @@ def flash_attn_split(
                 T.copy(scores_max, scores_max_prev)
                 T.fill(scores_max, -T.infinity(accum_dtype))
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+                for i in T.Parallel(block_H):
+                    scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                 for i in T.Parallel(block_H):
                     scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
                 for i, j in T.Parallel(block_H, block_N):
diff --git a/examples/deepseek_mla/example_mla_decode.py b/examples/deepseek_mla/example_mla_decode.py
index 417e319fd..3932d112e 100644
--- a/examples/deepseek_mla/example_mla_decode.py
+++ b/examples/deepseek_mla/example_mla_decode.py
@@ -74,6 +74,8 @@ def flash_attn(
                 T.copy(scores_max, scores_max_prev)
                 T.fill(scores_max, -T.infinity(accum_dtype))
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+                for i in T.Parallel(block_H):
+                    scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                 for i in T.Parallel(block_H):
                     scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
                 for i, j in T.Parallel(block_H, block_N):
@@ -148,6 +150,8 @@ def flash_attn_split(
                 T.copy(scores_max, scores_max_prev)
                 T.fill(scores_max, -T.infinity(accum_dtype))
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+                for i in T.Parallel(block_H):
+                    scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                 for i in T.Parallel(block_H):
                     scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
                 for i, j in T.Parallel(block_H, block_N):
diff --git a/examples/deepseek_mla/example_mla_decode_paged.py b/examples/deepseek_mla/example_mla_decode_paged.py
index fe50d4d4f..d23ff00c4 100644
--- a/examples/deepseek_mla/example_mla_decode_paged.py
+++ b/examples/deepseek_mla/example_mla_decode_paged.py
@@ -93,6 +93,8 @@ def flash_mla_kernel(
                         acc_s[i, j] = T.if_then_else(k * block_N + j >= CACHE_SEQLENS[bx],
                                                      -T.infinity(accum_dtype), acc_s[i, j])
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+                for i in T.Parallel(block_H):
+                    scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                 for i in T.Parallel(block_H):
                     scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
                 for i, j in T.Parallel(block_H, block_N):
@@ -176,6 +178,8 @@ def flash_mla_split_kv_kernel(
                     acc_s[i, j] = T.if_then_else(start + k * block_N + j >= CACHE_SEQLENS[bx],
                                                  -T.infinity(accum_dtype), acc_s[i, j])
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+                for i in T.Parallel(block_H):
+                    scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                 for i in T.Parallel(block_H):
                     scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
                 for i, j in T.Parallel(block_H, block_N):
diff --git a/examples/deepseek_mla/example_mla_decode_persistent.py b/examples/deepseek_mla/example_mla_decode_persistent.py
index 3f57ea051..2f896f265 100644
--- a/examples/deepseek_mla/example_mla_decode_persistent.py
+++ b/examples/deepseek_mla/example_mla_decode_persistent.py
@@ -98,6 +98,8 @@ def main_split_persistent(
                         T.copy(scores_max, scores_max_prev)
                         T.fill(scores_max, -T.infinity(accum_dtype))
                         T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+                        for i in T.Parallel(block_H):
+                            scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                         for i in T.Parallel(block_H):
                             scores_scale[i] = T.exp2(scores_max_prev[i] * scale -
                                                      scores_max[i] * scale)
diff --git a/examples/deepseek_mla/example_mla_decode_ws.py b/examples/deepseek_mla/example_mla_decode_ws.py
index 6554d57de..fcd427efa 100644
--- a/examples/deepseek_mla/example_mla_decode_ws.py
+++ b/examples/deepseek_mla/example_mla_decode_ws.py
@@ -104,7 +104,9 @@ def flash_attn(
                         T.barrier_wait(bar_sScale_and_sS_free, ((i_i * 2) & 1) ^ 1)
 
                     T.copy(m_i, m_i_prev)
-                    T.reduce_max(acc_s, m_i, dim=1, clear=False)
+                    T.reduce_max(acc_s, out=m_i, dim=1, clear=False)
+                    for h_i in T.Parallel(block_H):
+                        m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i])
                     for h_i in T.Parallel(block_H):
                         alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale)
                     for h_i, bi_i in T.Parallel(block_H, block_N):
@@ -137,6 +139,8 @@ def flash_attn(
 
                     T.copy(m_i, m_i_prev)
                     T.reduce_max(acc_s, m_i, dim=1, clear=False)
+                    for h_i in T.Parallel(block_H):
+                        m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i])
                     for h_i in T.Parallel(block_H):
                         alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale)
                     for h_i, bi_i in T.Parallel(block_H, block_N):
@@ -324,6 +328,8 @@ def flash_attn_split(
 
                     T.copy(m_i, m_i_prev)
                     T.reduce_max(acc_s, m_i, dim=1, clear=False)
+                    for h_i in T.Parallel(block_H):
+                        m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i])
                     for h_i in T.Parallel(block_H):
                         alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale)
                     for h_i, bi_i in T.Parallel(block_H, block_N):
@@ -356,6 +362,8 @@ def flash_attn_split(
 
                     T.copy(m_i, m_i_prev)
                     T.reduce_max(acc_s, m_i, dim=1, clear=False)
+                    for h_i in T.Parallel(block_H):
+                        m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i])
                     for h_i in T.Parallel(block_H):
                         alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale)
                     for h_i, bi_i in T.Parallel(block_H, block_N):
diff --git a/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py b/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py
index 1b1447e88..b141822fe 100644
--- a/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py
+++ b/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py
@@ -74,6 +74,8 @@ def main_no_split(
                 T.copy(scores_max, scores_max_prev)
                 T.fill(scores_max, -T.infinity(accum_dtype))
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+                for i in T.Parallel(block_H):
+                    scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                 for i in T.Parallel(block_H):
                     scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
                 for i, j in T.Parallel(block_H, block_N):
diff --git a/examples/deepseek_v32/sparse_mla_fwd.py b/examples/deepseek_v32/sparse_mla_fwd.py
index a39c72c40..e65b89017 100644
--- a/examples/deepseek_v32/sparse_mla_fwd.py
+++ b/examples/deepseek_v32/sparse_mla_fwd.py
@@ -147,6 +147,8 @@ def main(
                 )
                 T.copy(m_i, m_i_prev)
                 T.reduce_max(acc_s, m_i, dim=1, clear=False)
+                for h_i in T.Parallel(H_per_block):
+                    m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i])
                 for h_i in T.Parallel(H_per_block):
                     alpha[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale)
                 for h_i, bi_i in T.Parallel(H_per_block, BI):
diff --git a/examples/deepseek_v32/sparse_mla_fwd_pipelined.py b/examples/deepseek_v32/sparse_mla_fwd_pipelined.py
index 96dda7df5..1621d85ba 100644
--- a/examples/deepseek_v32/sparse_mla_fwd_pipelined.py
+++ b/examples/deepseek_v32/sparse_mla_fwd_pipelined.py
@@ -164,6 +164,8 @@ def main(
 
                     T.copy(m_i, m_i_prev)
                     T.reduce_max(acc_s, m_i, dim=1, clear=False)
+                    for h_i in T.Parallel(H_per_block):
+                        m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i])
                     for h_i in T.Parallel(H_per_block):
                         alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale)
                     for h_i, bi_i in T.Parallel(H_per_block, BI):
@@ -198,6 +200,8 @@ def main(
 
                     T.copy(m_i, m_i_prev)
                     T.reduce_max(acc_s, m_i, dim=1, clear=False)
+                    for h_i in T.Parallel(H_per_block):
+                        m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i])
                     for h_i in T.Parallel(H_per_block):
                         alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale)
                     for h_i, bi_i in T.Parallel(H_per_block, BI):
diff --git a/examples/flash_attention/README.md b/examples/flash_attention/README.md
index be11a8dc6..633727ec4 100644
--- a/examples/flash_attention/README.md
+++ b/examples/flash_attention/README.md
@@ -77,7 +77,9 @@ def flash_attention(
 
             # Compute the maximum value per row on dimension 1 (block_N)
             T.reduce_max(acc_s, scores_max, dim=1, clear=False)
-
+            for i in T.Parallel(block_M):
+                scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
+                
             # Compute the factor by which we need to rescale previous partial sums
             for i in T.Parallel(block_M):
                 scores_scale[i] = T.exp2(scores_max_prev[i] - scores_max[i])
diff --git a/examples/flash_attention/example_gqa_bwd.py b/examples/flash_attention/example_gqa_bwd.py
index dd9c8f7c1..968d1de33 100644
--- a/examples/flash_attention/example_gqa_bwd.py
+++ b/examples/flash_attention/example_gqa_bwd.py
@@ -61,6 +61,8 @@ def flash_fwd(
                 T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared)
                 T.copy(scores_max, scores_max_prev)
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+                for i in T.Parallel(block_M):
+                    scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                 for i in T.Parallel(block_M):
                     scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
                 for i, j in T.Parallel(block_M, dim_v):
diff --git a/examples/flash_attention/example_gqa_bwd_tma_reduce.py b/examples/flash_attention/example_gqa_bwd_tma_reduce.py
index 2af06e4bc..c427908a6 100644
--- a/examples/flash_attention/example_gqa_bwd_tma_reduce.py
+++ b/examples/flash_attention/example_gqa_bwd_tma_reduce.py
@@ -66,6 +66,8 @@ def flash_fwd(
                 T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared)
                 T.copy(scores_max, scores_max_prev)
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+                for i in T.Parallel(block_M):
+                    scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                 for i in T.Parallel(block_M):
                     scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
                 for i, j in T.Parallel(block_M, dim_v):
diff --git a/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py b/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py
index 88f2d81e1..a9604f4de 100644
--- a/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py
+++ b/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py
@@ -119,6 +119,8 @@ def flash_fwd(
                         V_shared[i, d] = 0.0
                 T.copy(scores_max, scores_max_prev)
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+                for i in T.Parallel(block_M):
+                    scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                 for i in T.Parallel(block_M):
                     scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
                 for i, j in T.Parallel(block_M, dim_v):
diff --git a/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py b/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py
index 024212499..e916812f5 100644
--- a/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py
+++ b/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py
@@ -61,6 +61,8 @@ def flash_fwd(
                 T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared)
                 T.copy(scores_max, scores_max_prev)
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+                for i in T.Parallel(block_M):
+                    scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                 for i in T.Parallel(block_M):
                     scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
                 for i, j in T.Parallel(block_M, dim_v):
diff --git a/examples/flash_attention/example_gqa_fwd_bshd.py b/examples/flash_attention/example_gqa_fwd_bshd.py
index 3d4bfe455..a6d3b5f20 100644
--- a/examples/flash_attention/example_gqa_fwd_bshd.py
+++ b/examples/flash_attention/example_gqa_fwd_bshd.py
@@ -127,6 +127,8 @@ def Softmax(
         T.copy(scores_max, scores_max_prev)
         T.fill(scores_max, -T.infinity(accum_dtype))
         T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+        for i in T.Parallel(block_M):
+            scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
         # To do causal softmax, we need to set the scores_max to 0 if it is -inf
         # This process is called Check_inf in FlashAttention3 code, and it only need to be done
         # in the first ceil_div(kBlockM, kBlockN) steps.
diff --git a/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py b/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py
index 21f5e9a9d..03ad15e94 100644
--- a/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py
+++ b/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py
@@ -94,6 +94,8 @@ def Softmax(
         T.copy(scores_max, scores_max_prev)
         T.fill(scores_max, -T.infinity(accum_dtype))
         T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+        for i in T.Parallel(block_M):
+            scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
         # To do causal softmax, we need to set the scores_max to 0 if it is -inf
         # This process is called Check_inf in FlashAttention3 code, and it only need to be done
         # in the first ceil_div(kBlockM, kBlockN) steps.
@@ -154,7 +156,7 @@ def main(
                     num_stages=num_stages,
                     order=[-1, 0, 3, 1, -1, 2],
                     stage=[-1, 0, 0, 1, -1, 1],
-                    group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]):
+                    group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]]):
                 MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz)
                 Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum,
                         logsum)
diff --git a/examples/flash_attention/example_gqa_fwd_varlen.py b/examples/flash_attention/example_gqa_fwd_varlen.py
index db16e1586..ccc50e413 100644
--- a/examples/flash_attention/example_gqa_fwd_varlen.py
+++ b/examples/flash_attention/example_gqa_fwd_varlen.py
@@ -155,7 +155,6 @@ def main(
                 T.copy(scores_max, scores_max_prev)
                 T.fill(scores_max, -T.infinity(accum_dtype))
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
-
                 for i in T.Parallel(block_M):
                     scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
 
diff --git a/examples/flash_attention/example_mha_bwd_bhsd.py b/examples/flash_attention/example_mha_bwd_bhsd.py
index 8247b2654..d91d1770f 100644
--- a/examples/flash_attention/example_mha_bwd_bhsd.py
+++ b/examples/flash_attention/example_mha_bwd_bhsd.py
@@ -63,6 +63,8 @@ def flash_fwd(
                 T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared)
                 T.copy(scores_max, scores_max_prev)
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+                for i in T.Parallel(block_M):
+                    scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                 for i in T.Parallel(block_M):
                     scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
                 for i, j in T.Parallel(block_M, dim):
diff --git a/examples/flash_attention/example_mha_bwd_bshd.py b/examples/flash_attention/example_mha_bwd_bshd.py
index 414061ffb..7c85f982e 100644
--- a/examples/flash_attention/example_mha_bwd_bshd.py
+++ b/examples/flash_attention/example_mha_bwd_bshd.py
@@ -59,6 +59,8 @@ def flash_fwd(
                 T.copy(V[bz, k * block_N:(k + 1) * block_N, by, :], V_shared)
                 T.copy(scores_max, scores_max_prev)
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+                for i in T.Parallel(block_M):
+                    scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                 for i in T.Parallel(block_M):
                     scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
                 for i, j in T.Parallel(block_M, dim):
@@ -344,7 +346,7 @@ def run1():
     parser = argparse.ArgumentParser()
     parser.add_argument('--batch', type=int, default=8, help='Batch size')
     parser.add_argument('--h', type=int, default=32, help='Number of heads')
-    parser.add_argument('--n_ctx', type=int, default=1048, help='Context size')
+    parser.add_argument('--n_ctx', type=int, default=1024, help='Context size')
     parser.add_argument('--d_head', type=int, default=64, help='Head dimension')
     parser.add_argument('--causal', type=bool, default=False, help='Causal flag')
     args = parser.parse_args()
diff --git a/examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py b/examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py
index e10ef5816..e8ee5d973 100644
--- a/examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py
+++ b/examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py
@@ -60,6 +60,8 @@ def flash_fwd(
                 T.copy(V[bz, k * block_N:(k + 1) * block_N, by, :], V_shared)
                 T.copy(scores_max, scores_max_prev)
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+                for i in T.Parallel(block_M):
+                    scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                 for i in T.Parallel(block_M):
                     scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
                 for i, j in T.Parallel(block_M, dim):
diff --git a/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py b/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py
index e1d0130a5..b797bbcc6 100644
--- a/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py
+++ b/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py
@@ -86,6 +86,8 @@ def Softmax(
         T.copy(scores_max, scores_max_prev)
         T.fill(scores_max, -T.infinity(accum_dtype))
         T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+        for i in T.Parallel(block_M):
+            scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
         # To do causal softmax, we need to set the scores_max to 0 if it is -inf
         # This process is called Check_inf in FlashAttention3 code, and it only need to be done
         # in the first ceil_div(kBlockM, kBlockN) steps.
@@ -149,7 +151,7 @@ def main(
                     num_stages=num_stages,
                     order=[-1, 0, 3, 1, -1, 2],
                     stage=[-1, 0, 0, 1, -1, 1],
-                    group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]):
+                    group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]]):
                 MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz)
                 Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum,
                         logsum)
diff --git a/examples/flash_attention/example_mha_fwd_bshd.py b/examples/flash_attention/example_mha_fwd_bshd.py
index a9268019a..b5b728287 100644
--- a/examples/flash_attention/example_mha_fwd_bshd.py
+++ b/examples/flash_attention/example_mha_fwd_bshd.py
@@ -81,6 +81,8 @@ def Softmax(
         T.copy(scores_max, scores_max_prev)
         T.fill(scores_max, -T.infinity(accum_dtype))
         T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+        for i in T.Parallel(block_M):
+            scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
         # To do causal softmax, we need to set the scores_max to 0 if it is -inf
         # This process is called Check_inf in FlashAttention3 code, and it only need to be done
         # in the first ceil_div(kBlockM, kBlockN) steps.
diff --git a/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py b/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py
index d7023a203..02d8baef2 100644
--- a/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py
+++ b/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py
@@ -81,6 +81,8 @@ def Softmax(
         T.copy(scores_max, scores_max_prev)
         T.fill(scores_max, -T.infinity(accum_dtype))
         T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+        for i in T.Parallel(block_M):
+            scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
         # To do causal softmax, we need to set the scores_max to 0 if it is -inf
         # This process is called Check_inf in FlashAttention3 code, and it only need to be done
         # in the first ceil_div(kBlockM, kBlockN) steps.
@@ -141,7 +143,7 @@ def main(
                     num_stages=num_stages,
                     order=[-1, 0, 3, 1, -1, 2],
                     stage=[-1, 0, 0, 1, -1, 1],
-                    group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]):
+                    group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]]):
                 MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz)
                 Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum,
                         logsum)
diff --git a/examples/flash_attention/example_mha_fwd_varlen.py b/examples/flash_attention/example_mha_fwd_varlen.py
index f381e900a..bbb4546ca 100644
--- a/examples/flash_attention/example_mha_fwd_varlen.py
+++ b/examples/flash_attention/example_mha_fwd_varlen.py
@@ -167,6 +167,8 @@ def main(
                 T.copy(scores_max, scores_max_prev)
                 T.fill(scores_max, -T.infinity(accum_dtype))
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+                for i in T.Parallel(block_M):
+                    scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                 # To do causal softmax, we need to set the scores_max to 0 if it is -inf
                 # This process is called Check_inf in FlashAttention3 code, and it only need to be done
                 # in the first ceil_div(kBlockM, kBlockN) steps.
diff --git a/examples/flash_decoding/example_gqa_decode.py b/examples/flash_decoding/example_gqa_decode.py
index 9ec3a0265..46d9beeaa 100644
--- a/examples/flash_decoding/example_gqa_decode.py
+++ b/examples/flash_decoding/example_gqa_decode.py
@@ -115,6 +115,8 @@ def flash_attn(
                 T.copy(scores_max, scores_max_prev)
                 T.fill(scores_max, -T.infinity(accum_dtype))
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+                for i in T.Parallel(block_H):
+                    scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                 for i in T.Parallel(block_H):
                     scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
                 for i, j in T.Parallel(block_H, block_N):
@@ -188,6 +190,8 @@ def flash_attn_split(
                 T.copy(scores_max, scores_max_prev)
                 T.fill(scores_max, -T.infinity(accum_dtype))
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+                for i in T.Parallel(block_H):
+                    scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
                 for i in T.Parallel(block_H):
                     scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
                 for i, j in T.Parallel(block_H, block_N):
diff --git a/examples/flash_decoding/example_mha_inference.py b/examples/flash_decoding/example_mha_inference.py
index 3eabc9a76..0360b3e2b 100644
--- a/examples/flash_decoding/example_mha_inference.py
+++ b/examples/flash_decoding/example_mha_inference.py
@@ -70,6 +70,8 @@ def Softmax(
         T.copy(scores_max, scores_max_prev)
         T.fill(scores_max, -T.infinity(accum_dtype))
         T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+        for i in T.Parallel(block_M):
+            scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
         # To do causal softmax, we need to set the scores_max to 0 if it is -inf
         # This process is called Check_inf in FlashAttention3 code, and it only need to be done
         # in the first ceil_div(kBlockM, kBlockN) steps.
diff --git a/examples/minference/example_vertical_slash_sparse_attn.py b/examples/minference/example_vertical_slash_sparse_attn.py
index ebf8513a1..48df3e091 100644
--- a/examples/minference/example_vertical_slash_sparse_attn.py
+++ b/examples/minference/example_vertical_slash_sparse_attn.py
@@ -87,6 +87,8 @@ def Compute(
 
             T.copy(scores_max, scores_max_prev)
             T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+            for i in T.Parallel(block_M):
+                scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
 
             for i in T.Parallel(block_M):
                 scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
@@ -194,6 +196,8 @@ def vs_sparse_flashattn_ws(
                         T.copy(scores_max, scores_max_prev)
 
                         T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+                        for i in T.Parallel(block_M):
+                            scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
 
                         for i in T.Parallel(block_M):
                             scores_scale[i] = T.exp2(scores_max_prev[i] * scale -
diff --git a/examples/seer_attention/block_sparse_attn_tilelang.py b/examples/seer_attention/block_sparse_attn_tilelang.py
index dcd581c6b..219d3ee35 100644
--- a/examples/seer_attention/block_sparse_attn_tilelang.py
+++ b/examples/seer_attention/block_sparse_attn_tilelang.py
@@ -62,6 +62,8 @@ def Softmax(
             T.copy(scores_max, scores_max_prev)
             T.fill(scores_max, -T.infinity(accum_dtype))
             T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+            for i in T.Parallel(block_M):
+                scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
             # To do causal softmax, we need to set the scores_max to 0 if it is -inf
             # This process is called Check_inf in FlashAttention3 code, and it only need to be done
             # in the first ceil_div(kBlockM, kBlockN) steps.
diff --git a/testing/python/transform/test_tilelang_transform_config_index_bitwidth.py b/testing/python/transform/test_tilelang_transform_config_index_bitwidth.py
index f051f0282..1ef1589a7 100644
--- a/testing/python/transform/test_tilelang_transform_config_index_bitwidth.py
+++ b/testing/python/transform/test_tilelang_transform_config_index_bitwidth.py
@@ -71,6 +71,8 @@ def Softmax(
             T.copy(scores_max, scores_max_prev)
             T.fill(scores_max, -T.infinity(accum_dtype))
             T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+            for i in T.Parallel(block_M):
+                scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
             # To do causal softmax, we need to set the scores_max to 0 if it is -inf
             # This process is called Check_inf in FlashAttention3 code, and it only need to be done
             # in the first ceil_div(kBlockM, kBlockN) steps.