Merged
Changes from 2 commits
4 changes: 2 additions & 2 deletions csrc/xqa/mha.cu
@@ -1310,8 +1310,8 @@ CUBIN_EXPORT __global__
uint32_t kv_stride_page, uint32_t kv_stride_token, uint32_t kv_stride_head,
uint32_t* __restrict__ semaphores = nullptr, void* __restrict__ scratch = nullptr) {

- float const qScaleValue = qScalePtr != nullptr ? *qScalePtr : qScale;
- float const kvCacheScaleValue = kvScalePtr != nullptr ? *kvScalePtr : kvCacheScale;
+ float const qScaleValue = qScalePtr != nullptr ? qScalePtr[0] : qScale;
Collaborator: I don't think these changes matter, but they wouldn't hurt either.

+ float const kvCacheScaleValue = kvScalePtr != nullptr ? kvScalePtr[0] : kvCacheScale;
assert(allowMultiBlockMode || gridDim.x == 1);
bool const isMultiBlock = allowMultiBlockMode && (gridDim.x != 1);
uint32_t const nbSubSeqPerSeq = allowMultiBlockMode ? gridDim.x : 1;
4 changes: 2 additions & 2 deletions csrc/xqa/mha_sm90.cu
@@ -640,8 +640,8 @@ __launch_bounds__(128 * 3)
uint32_t* __restrict__ const semaphores =
nullptr, // [nbReq][nbKHeads][divUp(specDecParams.qSeqLen, inputTokensPerCta)]
void* __restrict__ const scratch = nullptr) {
- float const qScaleValue = qScalePtr != nullptr ? *qScalePtr : qScale;
- float const kvCacheScaleValue = kvScalePtr != nullptr ? *kvScalePtr : kvCacheScale;
+ float const qScaleValue = qScalePtr != nullptr ? qScalePtr[0] : qScale;
+ float const kvCacheScaleValue = kvScalePtr != nullptr ? kvScalePtr[0] : kvCacheScale;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) && \
(IS_SUPPORTED_F16_CASE || CACHE_ELEM_ENUM == 2) && BEAM_WIDTH == 1
uint32_t const idxReq = blockIdx.z / nbKHeads;
4 changes: 2 additions & 2 deletions csrc/xqa/mla_sm120.cu
@@ -1564,8 +1564,8 @@ __launch_bounds__(32 * 4 * 3, 1) __cluster_dims__(cgaSize, 1, 1) void kernel_mha
PartialResult* __restrict__ const partialResults =
nullptr) // [totalNbInputTokens][maxNbSubSeq]
{
- float const qScaleValue = qScalePtr != nullptr ? *qScalePtr : qScale;
- float const kvCacheScaleValue = kvScalePtr != nullptr ? *kvScalePtr : kvCacheScale;
+ float const qScaleValue = qScalePtr != nullptr ? qScalePtr[0] : qScale;
+ float const kvCacheScaleValue = kvScalePtr != nullptr ? kvScalePtr[0] : kvCacheScale;
assert(blockDim.x == 32 * 12 && blockDim.y == 1 && blockDim.z == 1);
extern __shared__ char smemBuf[];
uint32_t const warpRank = makeWarpUniform(this_warp(), threadIdx.x / warp_size);
19 changes: 13 additions & 6 deletions tests/attention/test_xqa.py
@@ -8,7 +8,7 @@
from flashinfer.utils import get_compute_capability


- def set_random_seed(seed=42):
+ def set_random_seed(seed=0):
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
@@ -173,7 +173,7 @@ def test_xqa(
q_scale,
use_fp8_output,
):
- set_random_seed(42)
+ set_random_seed(0)

nb_q_heads = nb_k_heads * head_grp_size

@@ -268,7 +268,9 @@ def test_xqa(

# Shuffle page indices
flattened = page_list_arg.flatten()
- indices = torch.randperm(flattened.numel(), device="cuda")
+ generator = torch.Generator(device="cuda")
+ generator.manual_seed(42)
Contributor (medium): The seed 42 is hardcoded here. It's also used in the set_random_seed(42) call at the beginning of this test on line 177. To improve maintainability and avoid magic numbers, consider defining a constant for the seed (e.g., SEED = 42) at the module level and using it in both places.
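A minimal sketch of that suggestion, assuming the constant is named SEED as in the reviewer's example (the constant and the helper below are illustrative, not part of this PR):

import torch

SEED = 42  # single source of truth for every RNG seed in this test module


def set_random_seed(seed=SEED):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def make_shuffle_generator(seed=SEED):
    # Explicit CUDA generator for the page shuffle below; reuses the same
    # module-level constant instead of a second hardcoded literal.
    generator = torch.Generator(device="cuda")
    generator.manual_seed(seed)
    return generator

Note that in the final diff the two seeds actually differ (set_random_seed(0) versus a generator seeded with 42), so adopting a single constant would also force a decision on which value to keep.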

+ indices = torch.randperm(flattened.numel(), generator=generator, device="cuda")
Contributor, commenting on lines +276 to +278:

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

Verify determinism in the MLA test.

The xqa test now uses a seeded generator for torch.randperm (good for determinism), but the xqa_mla test at line 522 still uses the default CUDA RNG without an explicit generator. This inconsistency could lead to non-deterministic behavior in the MLA test.


🏁 Script executed:

#!/bin/bash
# Verify that xqa_mla test uses deterministic RNG for randperm
rg -n -A2 -B2 'def test_xqa_mla' tests/attention/test_xqa.py
rg -n 'torch\.randperm' tests/attention/test_xqa.py



Add seeded generator to test_xqa_mla for deterministic RNG.

The xqa test uses a seeded generator for torch.randperm (line 273), but the xqa_mla test at line 522 still calls torch.randperm without an explicit generator, falling back to the default CUDA RNG. Apply the same generator pattern to test_xqa_mla to ensure deterministic behavior across both tests.

🤖 Prompt for AI Agents
In tests/attention/test_xqa.py around lines 271-273 and at the xqa_mla call near
line 522, the CUDA RNG is not consistently seeded; create a seeded CUDA
generator (generator = torch.Generator(device="cuda");
generator.manual_seed(42)) and pass it into torch.randperm as
generator=generator (keeping device="cuda") in the xqa_mla test so both tests
use the same deterministic RNG source.
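A runnable sketch of that pattern in isolation, with stand-in shapes (batch_size, nb_pages_per_seq, and the page-table contents here are placeholders, not the values the MLA test actually uses):

import torch

# Stand-in page table; the real test builds this from its parametrization.
batch_size, nb_pages_per_seq = 2, 8
page_list_arg = torch.arange(
    batch_size * nb_pages_per_seq, device="cuda", dtype=torch.int32
).view(batch_size, nb_pages_per_seq)

# Seeded CUDA generator so the page shuffle is reproducible across runs.
generator = torch.Generator(device="cuda")
generator.manual_seed(42)

flattened = page_list_arg.flatten()
indices = torch.randperm(flattened.numel(), generator=generator, device="cuda")
shuffled_flat = flattened[indices]
page_list_arg = shuffled_flat.view(batch_size, nb_pages_per_seq)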

shuffled_flat = flattened[indices]
page_list_arg = shuffled_flat.view(batch_size, nb_pages_per_seq)

@@ -335,6 +337,9 @@ def test_xqa(

rcp_out_scale = 4.0 if use_fp8_output else 1.0

+ torch.cuda.synchronize()
+ semaphores.zero_()

Contributor:

⚠️ Potential issue | 🟠 Major

Critical: MLA test missing synchronization.

The xqa test now includes torch.cuda.synchronize() and semaphores.zero_() before the kernel call; these are critical additions for ensuring proper ordering and a clean state. However, the xqa_mla test (starting line 565) does not include these synchronization calls. Given that this PR aims to fix flaky xqa tests, the missing synchronization in the MLA test is a significant oversight that could cause flakiness.

Apply similar synchronization to the MLA test:

# Add before line 565 (before xqa_mla call)
torch.cuda.synchronize()
semaphores.zero_()
🤖 Prompt for AI Agents
In tests/attention/test_xqa.py around lines 340-342 and specifically for the
xqa_mla test starting at line 565, the MLA variant is missing the GPU
synchronization and semaphore reset that were added for the xqa test; before
calling xqa_mla at ~line 565 add a torch.cuda.synchronize() call followed by
semaphores.zero_() (using the same semaphores variable used elsewhere) to ensure
proper ordering and a clean semaphore state before launching the kernel.
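A sketch of that ordering as a small helper, assuming xqa and xqa_mla can be treated as opaque callables here (their real signatures are not reproduced):

import torch


def run_kernel_with_clean_semaphores(kernel_fn, semaphores, *args, **kwargs):
    # Illustrative wrapper, not part of the test: finish any prior GPU work
    # (cache fills, scale-tensor copies), reset the multi-block semaphores the
    # kernel polls, launch, then synchronize again before the host reads and
    # compares the outputs.
    torch.cuda.synchronize()
    semaphores.zero_()
    out = kernel_fn(*args, **kwargs)
    torch.cuda.synchronize()
    return out

Both test_xqa and test_xqa_mla could route their kernel call through such a helper instead of repeating the synchronize/zero_ pair inline.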

xqa(
q_heads,
cache_k_heads.to(torch.float8_e4m3fn) if fp8_kv_cache else cache_k_heads,
@@ -347,15 +352,17 @@ def test_xqa(
nb_k_heads,
tokens_per_page,
sinks=attention_sinks,
- q_scale=q_scale,
- kv_scale=kv_cache_scale,
+ q_scale=torch.tensor(q_scale, device="cuda"),
+ kv_scale=torch.tensor(kv_cache_scale, device="cuda"),
Contributor, commenting on lines +357 to +358:

⚠️ Potential issue | 🔴 Critical

Critical: MLA test not updated with tensor scales.

The xqa test now passes q_scale and kv_scale as CUDA tensors, aligning with the kernel changes (array indexing in mla_sm120.cu). However, the xqa_mla test at lines 575-576 still passes these as Python scalars. This inconsistency could cause runtime errors or incorrect behavior in the MLA path.

Update the xqa_mla test to use tensor scales:

# Update lines 575-576 in xqa_mla call
q_scale=torch.tensor(q_scale, device="cuda"),
kv_scale=torch.tensor(kv_cache_scale, device="cuda"),
🤖 Prompt for AI Agents
In tests/attention/test_xqa.py around lines 575 to 576, the xqa_mla test still
passes q_scale and kv_scale as Python scalars while the rest of the tests (and
kernel changes) expect CUDA tensors; update the xqa_mla call to wrap both scales
with torch.tensor(..., device="cuda") so q_scale and kv_scale are passed as CUDA
tensors (matching the change at lines 355-356 and preventing MLA path
runtime/type errors).

sliding_win_size=sliding_win_size,
kv_layout=kv_layout,
sm_count=sm_count,
enable_pdl=enable_pdl,
rcp_out_scale=rcp_out_scale,
)

+ torch.cuda.synchronize()

for req in range(batch_size):
for b in range(beam_width):
for idx_k_head in range(nb_k_heads):
@@ -446,7 +453,7 @@ def test_xqa_mla(
q_scale,
enable_pdl,
):
- set_random_seed(42)
+ set_random_seed(0)

# MLA specific constants (fixed, not parameterized)
nb_k_heads = 1 # MLA only supports 1 K head