Commit 9c08d33

move enable_pdl to runtime
1 parent 595ee1b commit 9c08d33

File tree

10 files changed: +65 -55 lines

csrc/flashinfer_xqa_binding.cu

Lines changed: 2 additions & 2 deletions

@@ -25,7 +25,7 @@ void xqa_wrapper_mla(int64_t multiProcessorCount, double qScale, TensorView outp
 #endif
     TensorView kvCachePageList, int64_t maxSeqLen, TensorView seqLen,
     int64_t batchSize, TensorView kvCacheScale, TensorView semaphores,
-    TensorView scratch);
+    TensorView scratch, bool enable_pdl);

 TVM_FFI_DLL_EXPORT_TYPED_FUNC(xqa_wrapper_mla, xqa_wrapper_mla);

@@ -47,7 +47,7 @@ void xqa_wrapper(bool run_sm90_fp8_mha, int64_t multiProcessorCount, int64_t nbK
 #if SPEC_DEC
     int64_t qSeqLen, TensorView qCuSeqLens, TensorView mask,
 #endif
-    TensorView semaphores, TensorView scratch);
+    TensorView semaphores, TensorView scratch, bool enable_pdl);

 TVM_FFI_DLL_EXPORT_TYPED_FUNC(xqa_wrapper, xqa_wrapper);

csrc/xqa/defines.h

Lines changed: 9 additions & 0 deletions

@@ -129,7 +129,16 @@ static_assert(SPEC_DEC, "SPEC_Q_SEQ_LEN should only be used when SPEC_DEC is ena
 // 1 - naive PDL
 // 2 - aggressive PDL (implemented only in mha_sm90.cu for now)
 #ifndef ENABLE_PDL
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+#if __CUDA_ARCH__ == 900
 #define ENABLE_PDL 2
+#else
+#define ENABLE_PDL 1
+#endif
+#else
+/* default for host or older architectures */
+#define ENABLE_PDL 0
+#endif
 #endif

 #ifndef USE_INPUT_KV
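
Note (illustrative, not part of this commit): ENABLE_PDL still gates the kernels' device-side PDL code; the new defaults above only choose a level per architecture when no -DENABLE_PDL is supplied. A minimal sketch of the pattern such a macro typically guards on sm_90 and newer follows; the kernel name is a placeholder, not code from this repository.

// Standalone sketch only -- consumerKernelSketch is a placeholder kernel.
#include <cuda_runtime.h>

#ifndef ENABLE_PDL  // placeholder default mirroring csrc/xqa/defines.h
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
#define ENABLE_PDL 1
#else
#define ENABLE_PDL 0
#endif
#endif

__global__ void consumerKernelSketch(float const* producedByPrevKernel, float* out, int n) {
#if ENABLE_PDL
  // Prologue work that does not read the previous kernel's output may run before
  // this point; synchronize before consuming its results.
  cudaGridDependencySynchronize();
#endif
  int const i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = 2.f * producedByPrevKernel[i];
  }
#if ENABLE_PDL
  // Allow a dependent kernel queued behind this one to start launching early.
  cudaTriggerProgrammaticLaunchCompletion();
#endif
}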

csrc/xqa/mha.cu

Lines changed: 5 additions & 4 deletions

@@ -2548,7 +2548,7 @@ void launchMHA(
 #if SPEC_DEC
     SpecDecParams const& specDecParams,
 #endif
-    uint32_t* semaphores, void* scratch, cudaStream_t stream) {
+    uint32_t* semaphores, void* scratch, bool enable_pdl, cudaStream_t stream) {
 #if SPEC_DEC
   auto const qSeqLen = specDecParams.qSeqLen;
   auto const qCuSeqLens = specDecParams.qCuSeqLens;
@@ -2590,7 +2590,7 @@ void launchMHA(
   dim3 const dimGrid{nbSubSeqPerSeq, nbKHeads, batchSize};
 #endif
   dim3 const dimCta{warp_size * ctaShapeInWarps.x, ctaShapeInWarps.y, ctaShapeInWarps.z};
-  auto const launchCfg = makeLaunchConfig(dimGrid, dimCta, hostSmemSize, stream, ENABLE_PDL != 0);
+  auto const launchCfg = makeLaunchConfig(dimGrid, dimCta, hostSmemSize, stream, enable_pdl);
 #if USE_PAGED_KV_CACHE
   uint32_t const maxNbPagesPerSeq = exactDiv(maxSeqLen, tokensPerPage);
 #if PAGED_KV_CACHE_LAYOUT == 1
@@ -2681,7 +2681,8 @@ void launchMHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads, uint32
 #if SPEC_DEC
     uint32_t qSeqLen, uint32_t const* qCuSeqLens, MaskType const* mask,
 #endif
-    uint32_t* semaphores, void* scratch, cudaStream_t stream) {
+    uint32_t* semaphores, void* scratch, bool enable_pdl,
+    cudaStream_t stream) {
   uint32_t const nbSubSeqPerSeq = [&]() -> uint32_t {
     if (!allowMultiBlockMode) {
       return 1;
@@ -2696,7 +2697,7 @@ void launchMHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads, uint32
   dim3 const dimGrid{nbSubSeqPerSeq, nbKHeads, batchSize};
 #endif
   dim3 const dimCta{warp_size * ctaShapeInWarps.x, ctaShapeInWarps.y, ctaShapeInWarps.z};
-  auto const launchCfg = makeLaunchConfig(dimGrid, dimCta, hostSmemSize, stream, ENABLE_PDL != 0);
+  auto const launchCfg = makeLaunchConfig(dimGrid, dimCta, hostSmemSize, stream, enable_pdl);
 #if USE_PAGED_KV_CACHE
   uint32_t const maxNbPagesPerSeq = exactDiv(maxSeqLen, tokensPerPage);
 #if PAGED_KV_CACHE_LAYOUT == 1
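
Note (illustrative, not part of this commit): the launchers now forward the runtime enable_pdl flag to makeLaunchConfig instead of baking in ENABLE_PDL != 0. makeLaunchConfig itself is not touched by this diff; below is a minimal sketch, assuming it sets CUDA's programmatic stream serialization launch attribute via the extensible launch API (CUDA 11.8+). The kernel and helper names are placeholders.

// Standalone sketch only -- dummyKernel and launchWithOptionalPdl are placeholders,
// not code from this repository.
#include <cuda_runtime.h>

__global__ void dummyKernel(float* out) { out[blockIdx.x * blockDim.x + threadIdx.x] = 1.f; }

cudaError_t launchWithOptionalPdl(float* out, dim3 dimGrid, dim3 dimCta, size_t smemBytes,
                                  cudaStream_t stream, bool enable_pdl) {
  // Decide per launch whether programmatic dependent launch (PDL) is allowed.
  cudaLaunchAttribute attr{};
  attr.id = cudaLaunchAttributeProgrammaticStreamSerialization;
  attr.val.programmaticStreamSerializationAllowed = enable_pdl ? 1 : 0;

  cudaLaunchConfig_t cfg{};
  cfg.gridDim = dimGrid;
  cfg.blockDim = dimCta;
  cfg.dynamicSmemBytes = smemBytes;
  cfg.stream = stream;
  cfg.attrs = &attr;  // attribute applies to this launch only
  cfg.numAttrs = 1;

  return cudaLaunchKernelEx(&cfg, dummyKernel, out);
}

Because the attribute is set per launch, the same compiled kernel can be launched with or without PDL, which is consistent with the JIT changes further down that drop the -DENABLE_PDL build flag and the enable_pdl suffix from the module name.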

csrc/xqa/mha.h

Lines changed: 7 additions & 6 deletions

@@ -128,7 +128,7 @@ void launchMHA(
 #if SPEC_DEC
     SpecDecParams const& specDecParams,
 #endif
-    uint32_t* semaphores, void* scratch, cudaStream_t stream);
+    uint32_t* semaphores, void* scratch, bool enable_pdl, cudaStream_t stream);

 void launchMHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads, uint32_t slidingWinSize,
                          float qScale, OutputHead* output,
@@ -147,7 +147,7 @@ void launchMHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads, uint32
 #if SPEC_DEC
     uint32_t qSeqLen, uint32_t const* qCuSeqLens, MaskType const* mask,
 #endif
-    uint32_t* semaphores, void* scratch, cudaStream_t stream);
+    uint32_t* semaphores, void* scratch, bool enable_pdl, cudaStream_t stream);

 void launchHopperF8MHA(
     cudaDeviceProp const& prop, uint32_t nbKHeads,
@@ -189,7 +189,7 @@ void launchHopperF8MHA(
 #if SPEC_DEC
     SpecDecParams const& specDecParams,
 #endif
-    uint32_t* semaphores, void* scratch, cudaStream_t stream);
+    uint32_t* semaphores, void* scratch, bool enable_pdl, cudaStream_t stream);

 void launchHopperF8MHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads,
                                  uint32_t slidingWinSize, float qScale, OutputHead* output,
@@ -208,7 +208,8 @@ void launchHopperF8MHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads
 #if SPEC_DEC
     uint32_t qSeqLen, uint32_t const* qCuSeqLens, MaskType const* mask,
 #endif
-    uint32_t* semaphores, void* scratch, cudaStream_t stream);
+    uint32_t* semaphores, void* scratch, bool enable_pdl,
+    cudaStream_t stream);

 void launchMLA(
     cudaDeviceProp const& prop,
@@ -230,7 +231,7 @@ void launchMLA(
     uint32_t maxSeqLen, uint32_t const* seqLen, uint32_t batchSize,
     float const* __restrict__ kvCacheScale,  // Device memory scalar. Same scale for K and V cache.
                                              // Used only for int8/fp8 KV cache.
-    uint32_t* semaphores, void* scratch, cudaStream_t stream);
+    uint32_t* semaphores, void* scratch, bool enable_pdl, cudaStream_t stream);

 void launchMLAFlashInfer(
     uint32_t multiProcessorCount,
@@ -248,7 +249,7 @@ void launchMLAFlashInfer(
     uint32_t maxSeqLen, uint32_t const* seqLen, uint32_t batchSize,
     float const* __restrict__ kvCacheScale,  // Device memory scalar. Same scale for K and V cache.
                                              // Used only for int8/fp8 KV cache.
-    uint32_t* semaphores, void* scratch, cudaStream_t stream);
+    uint32_t* semaphores, void* scratch, bool enable_pdl, cudaStream_t stream);

 #if STATIC_NB_K_HEADS
 constexpr uint32_t nbKHeads = NB_K_HEADS;

csrc/xqa/mha_sm90.cu

Lines changed: 5 additions & 4 deletions

@@ -3036,7 +3036,7 @@ void launchHopperF8MHA(
 #if SPEC_DEC
     SpecDecParams const& specDecParams,
 #endif
-    uint32_t* semaphores, void* scratch, cudaStream_t stream) {
+    uint32_t* semaphores, void* scratch, bool enable_pdl, cudaStream_t stream) {
   if (beamWidth != 1) {
     throw std::runtime_error("not implemented");
   }
@@ -3073,7 +3073,7 @@ void launchHopperF8MHA(
   // nbInputSeqSplit
   dim3 const dimGrid{divUp(qSeqLen, inputTokensPerCta), nbSubSeqPerSeq, nbKHeads * batchSize};
   dim3 const dimCta{warp_size * gmmaWarpsPerGrp, 1, 3};
-  auto const launchCfg = makeLaunchConfig(dimGrid, dimCta, hostSmemSize, stream, ENABLE_PDL != 0);
+  auto const launchCfg = makeLaunchConfig(dimGrid, dimCta, hostSmemSize, stream, enable_pdl);
 #if USE_PAGED_KV_CACHE
   uint32_t const maxNbPagesPerSeq = exactDiv(maxSeqLen, tokensPerPage);
   auto const dtype = [] {
@@ -3194,7 +3194,8 @@ void launchHopperF8MHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads
 #if SPEC_DEC
     uint32_t qSeqLen, uint32_t const* qCuSeqLens, MaskType const* mask,
 #endif
-    uint32_t* semaphores, void* scratch, cudaStream_t stream) {
+    uint32_t* semaphores, void* scratch, bool enable_pdl,
+    cudaStream_t stream) {
   uint32_t const nbSubSeqPerSeq = [&]() -> uint32_t {
     float const factor = 0.25f;
     return mha::min<uint32_t>(
@@ -3210,7 +3211,7 @@ void launchHopperF8MHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads
 #endif
   dim3 const dimGrid{divUp(qLen, inputTokensPerCta), nbSubSeqPerSeq, nbKHeads * batchSize};
   dim3 const dimCta{warp_size * gmmaWarpsPerGrp, 1, 3};
-  auto const launchCfg = makeLaunchConfig(dimGrid, dimCta, hostSmemSize, stream, ENABLE_PDL != 0);
+  auto const launchCfg = makeLaunchConfig(dimGrid, dimCta, hostSmemSize, stream, enable_pdl);
 #if USE_PAGED_KV_CACHE
   uint32_t const maxNbPagesPerSeq = exactDiv(maxSeqLen, tokensPerPage);
   auto const dtype = [] {

csrc/xqa/mla_sm120.cu

Lines changed: 4 additions & 4 deletions

@@ -1724,7 +1724,7 @@ void launchMLA(
     uint32_t maxSeqLen, uint32_t const* seqLen, uint32_t batchSize,
     float const* __restrict__ kvCacheScale,  // Device memory scalar. Same scale for K and V cache.
                                              // Used only for int8/fp8 KV cache.
-    uint32_t* semaphores, void* scratch, cudaStream_t stream) {
+    uint32_t* semaphores, void* scratch, bool enable_pdl, cudaStream_t stream) {
 #if IS_MLA
   static_assert(
       SLIDING_WINDOW == 0 && LOW_PREC_OUTPUT == 0 && USE_INPUT_KV == 0 && USE_BEAM_SEARCH == 0,
@@ -1762,7 +1762,7 @@ void launchMLA(
   // nbInputSeqSplit
   dim3 const dimGrid{4 * inputSeqLen, nbSubSeqPerSeq, nbKHeads * batchSize};
   dim3 const dimCta{warp_size * 4 * 3, 1, 1};
-  auto const launchCfg = makeLaunchConfig(dimGrid, dimCta, hostSmemSize, stream, ENABLE_PDL != 0);
+  auto const launchCfg = makeLaunchConfig(dimGrid, dimCta, hostSmemSize, stream, enable_pdl);
 #if USE_PAGED_KV_CACHE
   uint32_t const maxNbPagesPerSeq = exactDiv(maxSeqLen, tokensPerPage);
 #if PAGED_KV_CACHE_LAYOUT == 1
@@ -1861,7 +1861,7 @@ void launchMLAFlashInfer(
     uint32_t maxSeqLen, uint32_t const* seqLen, uint32_t batchSize,
     float const* __restrict__ kvCacheScale,  // Device memory scalar. Same scale for K and V cache.
                                              // Used only for int8/fp8 KV cache.
-    uint32_t* semaphores, void* scratch, cudaStream_t stream) {
+    uint32_t* semaphores, void* scratch, bool enable_pdl, cudaStream_t stream) {
 #if IS_MLA
   static_assert(
       SLIDING_WINDOW == 0 && LOW_PREC_OUTPUT == 0 && USE_INPUT_KV == 0 && USE_BEAM_SEARCH == 0,
@@ -1885,7 +1885,7 @@ void launchMLAFlashInfer(
   // nbInputSeqSplit
   dim3 const dimGrid{4 * inputSeqLen, nbSubSeqPerSeq, nbKHeads * batchSize};
   dim3 const dimCta{warp_size * 4 * 3, 1, 1};
-  auto const launchCfg = makeLaunchConfig(dimGrid, dimCta, hostSmemSize, stream, ENABLE_PDL != 0);
+  auto const launchCfg = makeLaunchConfig(dimGrid, dimCta, hostSmemSize, stream, enable_pdl);
 #if USE_PAGED_KV_CACHE
   uint32_t const maxNbPagesPerSeq = exactDiv(maxSeqLen, tokensPerPage);
 #if PAGED_KV_CACHE_LAYOUT == 1

csrc/xqa/xqa_wrapper.cu

Lines changed: 4 additions & 4 deletions

@@ -28,7 +28,7 @@ void xqa_wrapper_mla(int64_t multiProcessorCount, double qScale, TensorView outp
 #endif
     TensorView kvCachePageList, int64_t maxSeqLen, TensorView seqLen,
     int64_t batchSize, TensorView kvCacheScale, TensorView semaphores,
-    TensorView scratch) {
+    TensorView scratch, bool enable_pdl) {
   auto stream = get_stream(output.device());

   launchMLAFlashInfer(multiProcessorCount, 1, qScale,
@@ -44,7 +44,7 @@ void xqa_wrapper_mla(int64_t multiProcessorCount, double qScale, TensorView outp
       maxSeqLen, reinterpret_cast<uint32_t const*>(seqLen.data_ptr()), batchSize,
       reinterpret_cast<float const*>(kvCacheScale.data_ptr()),
       reinterpret_cast<uint32_t*>(semaphores.data_ptr()),
-      reinterpret_cast<void*>(scratch.data_ptr()), stream);
+      reinterpret_cast<void*>(scratch.data_ptr()), enable_pdl, stream);
 }
 #else

@@ -64,7 +64,7 @@ void xqa_wrapper(bool run_sm90_fp8_mha, int64_t multiProcessorCount, int64_t nbK
 #if SPEC_DEC
     int64_t qSeqLen, TensorView qCuSeqLens, TensorView mask,
 #endif
-    TensorView semaphores, TensorView scratch) {
+    TensorView semaphores, TensorView scratch, bool enable_pdl) {
   auto stream = get_stream(output.device());
   float const* attentionSinksPtr =
       attentionSinks.has_value() ? reinterpret_cast<float const*>(attentionSinks.value().data_ptr())
@@ -91,6 +91,6 @@ void xqa_wrapper(bool run_sm90_fp8_mha, int64_t multiProcessorCount, int64_t nbK
       reinterpret_cast<MaskType const*>(mask.data_ptr()),
 #endif
       reinterpret_cast<uint32_t*>(semaphores.data_ptr()),
-      reinterpret_cast<void*>(scratch.data_ptr()), stream);
+      reinterpret_cast<void*>(scratch.data_ptr()), enable_pdl, stream);
 }
 #endif

flashinfer/aot.py

Lines changed: 0 additions & 2 deletions

@@ -404,7 +404,6 @@ def gen_xqa(
             head_dim=head_size,
             head_group_ratio=head_grp_size,
             use_sliding_window=use_sliding_window,
-            enable_pdl=True,
         )

     if has_sm120 or has_sm121:
@@ -416,7 +415,6 @@ def gen_xqa(
             head_dim=576,
             head_group_ratio=128,
             use_sliding_window=False,
-            enable_pdl=True,
         )

flashinfer/jit/xqa.py

Lines changed: 3 additions & 17 deletions

@@ -42,7 +42,6 @@ def gen_xqa_module(
     head_dim: int,
     head_group_ratio: int,
     use_sliding_window: bool,
-    enable_pdl: bool,
 ) -> JitSpec:
     if input_dtype == torch.float16:
         flag_input_dtype = ["-DINPUT_FP16=1", "-DDTYPE=__half"]
@@ -85,15 +84,10 @@ def gen_xqa_module(
     )
     sm_nvcc_flags = nvcc_flags

-    if enable_pdl:
-        flag_enable_pdl = ["-DENABLE_PDL=2"]
-    else:
-        flag_enable_pdl = ["-DENABLE_PDL=0"]
-
     flag_mla_wrapper = ["-DMLA_WRAPPER=0"]

     return gen_jit_spec(
-        f"xqa_input_{filename_safe_dtype_map[input_dtype]}_kv_cache_{filename_safe_dtype_map[kv_cache_dtype]}_page_size_{page_size}_head_dim_{head_dim}_head_group_ratio_{head_group_ratio}_use_sliding_window_{use_sliding_window}_enable_pdl_{enable_pdl}",
+        f"xqa_input_{filename_safe_dtype_map[input_dtype]}_kv_cache_{filename_safe_dtype_map[kv_cache_dtype]}_page_size_{page_size}_head_dim_{head_dim}_head_group_ratio_{head_group_ratio}_use_sliding_window_{use_sliding_window}",
         [
            jit_env.FLASHINFER_CSRC_DIR / "xqa/mha.cu",
            jit_env.FLASHINFER_CSRC_DIR / "xqa/mha_sm90.cu",
@@ -109,7 +103,6 @@ def gen_xqa_module(
        + flag_kv_cache_dtype
        + flag_head_group_ratio
        + flag_sliding_window
-       + flag_enable_pdl
        + flag_mla_wrapper,
        extra_ldflags=["-lcuda"],  # Add CUDA Driver API library
        extra_cflags=["-DPAGED_KV_CACHE_LAYOUT=1"],
@@ -123,7 +116,6 @@ def gen_xqa_module_mla(
     head_dim: int,
     head_group_ratio: int,
     use_sliding_window: bool = False,
-    enable_pdl: bool = True,
 ) -> JitSpec:
     assert head_group_ratio == 128, "Only head group ratio 128 is supported for xqa MLA"
     assert head_dim == 576, "Only head dim 576 is supported for xqa_module_mla"
@@ -153,15 +145,10 @@ def gen_xqa_module_mla(
     nvcc_flags = compilation_context.get_nvcc_flags_list(supported_major_versions=[12])
     sm_nvcc_flags = nvcc_flags

-    if enable_pdl:
-        flag_enable_pdl = ["-DENABLE_PDL=2"]
-    else:
-        flag_enable_pdl = ["-DENABLE_PDL=0"]
-
     flag_mla_wrapper = ["-DMLA_WRAPPER=1"]

     return gen_jit_spec(
-        f"xqa_mla_input_{filename_safe_dtype_map[input_dtype]}_kv_cache_{filename_safe_dtype_map[kv_cache_dtype]}_page_size_{page_size}_head_dim_{head_dim}_head_group_ratio_{head_group_ratio}_use_sliding_window_{use_sliding_window}_enable_pdl_{enable_pdl}",
+        f"xqa_mla_input_{filename_safe_dtype_map[input_dtype]}_kv_cache_{filename_safe_dtype_map[kv_cache_dtype]}_page_size_{page_size}_head_dim_{head_dim}_head_group_ratio_{head_group_ratio}_use_sliding_window_{use_sliding_window}",
         [
            jit_env.FLASHINFER_CSRC_DIR / "xqa/mla_sm120.cu",
            jit_env.FLASHINFER_CSRC_DIR / "xqa/tensorMap.cpp",
@@ -175,8 +162,7 @@ def gen_xqa_module_mla(
        + flag_kv_cache_dtype
        + flag_head_group_ratio
        + flag_sliding_window
-       + flag_mla_wrapper
-       + flag_enable_pdl,
+       + flag_mla_wrapper,
        extra_ldflags=["-lcuda"],  # Add CUDA Driver API library
        extra_cflags=["-DPAGED_KV_CACHE_LAYOUT=1"],
     )
