perf: enable pdl for cutlass fp4 gemm (#2095)

yzh119 · web-flow · commit 4aed50cfa663 · 2025-11-16T17:40:45.000-05:00
## 📌 Description The `enablePDL` flag passed to `gemm.run` in the CUTLASS FP4 GEMM launchers was hard-coded to `false`; this PR sets it to `true` in both the sm100 and sm120 templates, because both sm_100 and sm_120 should support PDL (programmatic dependent launch). ## 🔍 Related Issues  ## 🚀 Pull Request Checklist Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete. ### ✅ Pre-commit Checks - [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method). - [ ] I have installed the hooks with `pre-commit install`. - [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues. > If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/). ## 🧪 Tests - [ ] Tests have been added or updated as needed. - [ ] All tests are passing (`unittest`, etc.). ## Reviewer Notes   ## Summary by CodeRabbit * **Refactor** * Updated runtime configuration for FP4 GEMM operations to enhance execution performance on SM100 and SM120 GPU architectures.
diff --git a/include/flashinfer/gemm/fp4_gemm_template_sm100.h b/include/flashinfer/gemm/fp4_gemm_template_sm100.h
@@ -273,7 +273,7 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void
                            std::string(cutlassGetStatusString(initStatus));                                  \
       throw std::runtime_error("[FP4 gemm Runner] " + errMsg);                                               \
     }                                                                                                        \
-    auto runStatus = gemm.run(args, workspace, stream, nullptr, /* enablePDL */ false);                      \
+    auto runStatus = gemm.run(args, workspace, stream, nullptr, /*enablePDL=*/true);                         \
     if (runStatus != cutlass::Status::kSuccess) {                                                            \
       std::string errMsg = "Failed to run cutlass FP4 gemm on sm100. Error: " +                              \
                            std::string(cutlassGetStatusString(runStatus));                                   \
diff --git a/include/flashinfer/gemm/fp4_gemm_template_sm120.h b/include/flashinfer/gemm/fp4_gemm_template_sm120.h
@@ -257,7 +257,7 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void
                            std::string(cutlass::cutlassGetStatusString(initStatus));                       \
       throw std::runtime_error("[FP4 gemm Runner] " + errMsg);                                             \
     }                                                                                                      \
-    auto runStatus = gemm.run(args, workspace, stream, nullptr, /* enablePDL */ false);                    \
+    auto runStatus = gemm.run(args, workspace, stream, nullptr, /*enablePDL=*/true);                       \
     if (runStatus != cutlass::Status::kSuccess) {                                                          \
       std::string errMsg = "Failed to run cutlass FP4 gemm on sm120. Error: " +                            \
                            std::string(cutlass::cutlassGetStatusString(runStatus));                        \