src/flag_gems/ops/attention.py (2 changes: 1 addition & 1 deletion)

@@ -1065,7 +1065,7 @@ def flash_attention_forward(
     else:
         non_null_window_right = -1

-    out = torch.empty_like(query)
+    out = torch.empty(query.shape, device=query.device, dtype=query.dtype)

Review comment (Collaborator):
Why use torch.empty instead of torch.empty_like?

     if cumulative_sequence_length_q is not None:
         out, q, k, v, lse, philox_seed, philox_offset, p = mha_varlan_fwd(
             query,
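
For context on the question above: one observable difference between the two calls is the memory layout of the result. The sketch below is only an assumption about why the change might matter, not a statement of the PR author's intent; the tensor shapes are made up for illustration.

import torch

# torch.empty_like defaults to memory_format=torch.preserve_format, so for a dense,
# non-overlapping input it reuses the input's strides, whereas
# torch.empty(shape, device=..., dtype=...) always allocates a contiguous tensor.
q = torch.randn(2, 8, 4, 16).transpose(1, 2)   # (2, 4, 8, 16), a non-contiguous view

a = torch.empty_like(q)                                    # inherits q's strides
b = torch.empty(q.shape, device=q.device, dtype=q.dtype)   # fresh contiguous buffer

print(a.is_contiguous())   # False for a dense but non-contiguous query
print(b.is_contiguous())   # True
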
tests/test_attention_ops.py (3 changes: 2 additions & 1 deletion)

@@ -192,6 +192,7 @@ def attention_ref(
     else:
         attention_drop = attention
     output = torch.einsum("bhts,bshd->bthd", attention_drop, v * dropout_scaling)
+    output = output.contiguous()
     if query_padding_mask is not None:
         output.masked_fill_((~query_padding_mask)[:, :, None, None], 0.0)
     return output.to(dtype=dtype_og), attention.to(dtype=dtype_og)
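
One plausible reason for the added .contiguous() call, offered as an assumption rather than the stated rationale: torch.einsum with a permuted output subscript order ("...->bthd") can return a tensor whose strides are a permutation of a dense buffer rather than contiguous, and .contiguous() materializes it into a packed layout. The shapes below are arbitrary.

import torch

# Assumed illustration: einsum may hand back a permuted (non-contiguous) result
# when the output labels reorder the dimensions produced by the underlying matmul.
attn = torch.randn(2, 3, 5, 7)   # (batch, head, q_len, kv_len)   -> "bhts"
v = torch.randn(2, 7, 3, 4)      # (batch, kv_len, head, head_dim) -> "bshd"

out = torch.einsum("bhts,bshd->bthd", attn, v)
print(out.shape)                          # torch.Size([2, 5, 3, 4])
print(out.is_contiguous())                # may be False, depending on the einsum path
print(out.contiguous().is_contiguous())   # True after materializing
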

@@ -662,7 +663,7 @@ def test_flash_fwd_gqa_alibi_softcap(
 @pytest.mark.flash_attention_forward
 @pytest.mark.parametrize(
     ["batch", "num_head", "num_head_k", "q_seq_len", "kv_seq_len"],
-    [(1, 4, 1, 1, 1024), (4, 4, 4, 1, 519)],
+    [(1, 4, 1, 1, 1024), (4, 4, 4, 1, 519), (1, 4, 1, 2, 16)],
 )
 @pytest.mark.parametrize("head_size", [128, 192])
 @pytest.mark.parametrize("is_causal", [False, True])