
Commit b96551c

Merge remote-tracking branch 'origin/main' into bump-version-bot

2 parents cec8c7c + aacc8df

File tree: 178 files changed, +20332 additions, -5465 deletions


.github/CODEOWNERS

Lines changed: 19 additions & 19 deletions
```diff
@@ -3,41 +3,41 @@
 # Analysis period: 180 days
 # Minimum commits threshold: 1
 
-benchmarks/ @bkryu @cyx-6 @nv-yunzheq @kahyunnam @nvmbreughe
+benchmarks/ @bkryu @cyx-6 @jiahanc @nv-yunzheq @kahyunnam
 benchmarks/routines/ @bkryu @nv-yunzheq @cyx-6 @nvmbreughe @Anerudhan
 ci/ @cyx-6 @yzh119 @nvmbreughe
 ci/scripts/ @cyx-6
 ci/scripts/jenkins/ @cyx-6
-csrc/ @yzh119 @wenscarl @cyx-6 @yongwww @kahyunnam
-csrc/fused_moe/ @yzh119 @yongwww @wenscarl @cyx-6 @yongwww
-csrc/fused_moe/cutlass_backend/ @yzh119 @yongwww @wenscarl @cyx-6 @yongwww
-csrc/nv_internal/ @wenscarl @yzh119 @cyx-6 @yongwww @aleozlx
-csrc/nv_internal/cpp/ @wenscarl @yongwww @joker-eph @ttyio @azhurkevich
+csrc/ @wenscarl @yzh119 @cyx-6 @djmmoss @yongwww
+csrc/fused_moe/ @yzh119 @yongwww @djmmoss @cyx-6 @wenscarl
+csrc/fused_moe/cutlass_backend/ @yzh119 @yongwww @djmmoss @cyx-6 @wenscarl
+csrc/nv_internal/ @wenscarl @djmmoss @cyx-6 @yzh119 @yongwww
+csrc/nv_internal/cpp/ @wenscarl @yongwww @djmmoss @joker-eph @ttyio
 csrc/nv_internal/include/ @wenscarl
-csrc/nv_internal/tensorrt_llm/ @wenscarl @yzh119 @cyx-6 @yongwww @aleozlx
-csrc/xqa/ @yzh119 @cyx-6
+csrc/nv_internal/tensorrt_llm/ @wenscarl @djmmoss @cyx-6 @yzh119 @yongwww
+csrc/xqa/ @cyx-6 @yzh119
 docs/ @yzh119 @cyx-6 @wenscarl @nv-yunzheq @aleozlx
-flashinfer/ @yzh119 @cyx-6 @nvmbreughe @wenscarl @yongwww
+flashinfer/ @yzh119 @cyx-6 @wenscarl @nvmbreughe @yongwww
 flashinfer-cubin/ @yzh119 @cyx-6
 flashinfer-cubin/flashinfer_cubin/ @yzh119
 flashinfer-jit-cache/ @yzh119 @cyx-6
 flashinfer-jit-cache/flashinfer_jit_cache/ @yzh119
-flashinfer/comm/ @yzh119 @cyx-6 @nvmbreughe @wenscarl @aleozlx
+flashinfer/comm/ @yzh119 @cyx-6 @nvmbreughe @wenscarl @djmmoss
 flashinfer/cudnn/ @Anerudhan @yzh119 @cyx-6 @Anerudhan
 flashinfer/cute_dsl/ @yzh119 @kaixih @Amir-19 @aleozlx
-flashinfer/fused_moe/ @yzh119 @cyx-6 @wenscarl @IwakuraRein @joker-eph
-flashinfer/jit/ @yzh119 @cyx-6 @aleozlx @yongwww @bkryu
-flashinfer/jit/attention/ @yzh119 @Anerudhan @joker-eph
+flashinfer/fused_moe/ @djmmoss @yzh119 @cyx-6 @wenscarl @IwakuraRein
+flashinfer/jit/ @yzh119 @cyx-6 @djmmoss @jiahanc @aleozlx
+flashinfer/jit/attention/ @yzh119 @cyx-6 @Anerudhan @joker-eph
 flashinfer/jit/gemm/ @yzh119
 flashinfer/logits_processor/ @cyx-6 @yzh119
 flashinfer/profiler/ @cyx-6
 flashinfer/triton/ @cyx-6 @nvmbreughe @yzh119
 flashinfer/tuning_configs/ @kaixih
-include/ @yzh119 @cyx-6 @kahyunnam @joker-eph @aleozlx
-include/flashinfer/ @yzh119 @cyx-6 @kahyunnam @joker-eph @aleozlx
+include/ @yzh119 @wenscarl @kahyunnam @joker-eph @cyx-6
+include/flashinfer/ @yzh119 @wenscarl @kahyunnam @joker-eph @cyx-6
 include/flashinfer/attention/ @yzh119 @kahyunnam @joker-eph
-include/flashinfer/comm/ @yongwww @nvmbreughe @yzh119 @cyx-6
-include/flashinfer/gemm/ @ttyio @yongwww @aleozlx @cyx-6
-include/flashinfer/trtllm/ @joker-eph @aleozlx @yzh119 @cyx-6 @aleozlx
+include/flashinfer/comm/ @yongwww @nvmbreughe @djmmoss @yzh119 @cyx-6
+include/flashinfer/gemm/ @ttyio @yongwww @aleozlx
+include/flashinfer/trtllm/ @joker-eph @aleozlx @yzh119 @cyx-6 @wenscarl
 profiler/ @cyx-6
-scripts/ @yzh119 @nvmbreughe @yongwww @bkryu @dierksen
+scripts/ @yzh119 @nvmbreughe @dierksen @yongwww @bkryu
```

.github/workflows/nightly-release.yml

Lines changed: 0 additions & 1 deletion
```diff
@@ -98,7 +98,6 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install build twine wheel
-        pip install setuptools>=61.0 requests filelock torch tqdm numpy apache-tvm-ffi==0.1.0b15
 
     - name: Build flashinfer-cubin wheel
       env:
```

.github/workflows/release.yml

Lines changed: 0 additions & 1 deletion
```diff
@@ -136,7 +136,6 @@ jobs:
       run: |
        python -m pip install --upgrade pip
         pip install build twine wheel
-        pip install setuptools>=61.0 requests filelock torch tqdm numpy apache-tvm-ffi==0.1.0b15
 
     - name: Build flashinfer-cubin wheel
       run: |
```

CONTRIBUTING.md

Lines changed: 6 additions & 7 deletions
```diff
@@ -36,12 +36,11 @@ Code Contribution Procedure
 
 # Release Versioning
 
-When incrementing a version and creating a release, follow [Semantic Versioning](https://packaging.python.org/en/latest/discussions/versioning/) (`major.minor.patch`) [^1]. In particular:
+When incrementing a version and creating a release, follow a "right-shifted" versioning scheme similar to [vLLM Release Versioning](https://github.com/vllm-project/vllm/blob/main/RELEASE.md) (`major.minor.patch[.post1]`) [^1]. In particular:
 
-* major increment signals incompatible API changes
-* minor increment signals added functionality that is backwards-compatible (e.g. new kernels, new SM support, etc)
-* patch increment signals backwards-compatible bug fixes (both for functional and performance issues)
+* _major_ increment signals architectural milestone and/or when incompatible API changes are made, similar to PyTorch 2.0.
+* _minor_ increment signals significant backwards-compatible new features
+* _patch_ increment signals small backwards-compatible features (e.g. new kernels, new SM support, etc) and backwards-compatible bug fixes
+* _post1_ is an optional suffix for a quick follow up release with just backwards-compatible bug fixes
 
-Optionally, use post-releases (e.g., `X.Y.Z.post1`) for minor changes, like a documentation change.
-
-[^1]: We have not followed this strictly through v0.2.14.post1. But after v0.2.14.post1, the versioning should follow SemVer.
+[^1]: We have not followed this strictly through v0.4.0. But after v0.4.0, the versioning should follow this "right-shifted" versioning scheme.
```
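
The new scheme still produces PEP 440-compliant version strings, so standard Python tooling orders releases as expected. A minimal sketch of that ordering, using the third-party `packaging` library; the concrete version numbers are illustrative only, not taken from the release history:

```python
# Illustrative only: how "right-shifted" versions (major.minor.patch[.post1])
# compare under PEP 440. Requires the `packaging` library (pip install packaging).
from packaging.version import Version

versions = ["0.4.0", "0.4.0.post1", "0.4.1", "0.5.0", "1.0.0"]
# The list above is already in ascending release order.
assert sorted(versions, key=Version) == versions

# A post-release sorts after its base release but before the next patch release.
assert Version("0.4.0") < Version("0.4.0.post1") < Version("0.4.1")
```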

benchmarks/README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -16,6 +16,7 @@ Currently supports testing most attention, gemm, and fused MOE APIs:
   - `BatchPrefillWithPagedKVCacheWrapper` - Prefill attention with paged KV cache.
     - Also supports computationally similar `cudnn_batch_prefill_with_kv_cache` and `trtllm_batch_context_with_kv_cache`.
   - `BatchPrefillWithRaggedKVCacheWrapper` - Prefill attention with ragged KV cache.
+    - Also supports computationally similar `cudnn_batch_prefill_with_kv_cache` and `trtllm_ragged_attention_deepseek`.
   - `BatchMLAPagedAttentionWrapper` - MLA attention proposed in DeepSeek series of models.
     - Also supports computationally similar `trtllm_batch_decode_with_kv_cache_mla`.
 - GEMM:
```

benchmarks/bench_mixed_attention.py

Lines changed: 69 additions & 62 deletions
```diff
@@ -72,6 +72,24 @@ def run_bench(
     measurements = bench_gpu_time(lambda: wrapper_old.run(q, kv_data))
     ms_old = np.median(measurements)
 
+    wrapper_persistent = flashinfer.BatchAttention(kv_layout="NHD")
+    wrapper_persistent.plan(
+        q_indptr.to(device),
+        kv_indptr.to(device),
+        torch.arange(num_blocks, dtype=torch.int32, device=device),
+        seq_lens.to(device),
+        num_qo_heads,
+        num_kv_heads,
+        head_dim,
+        head_dim,
+        page_block_size,
+        causal=causal,
+        q_data_type=torch.bfloat16,
+        kv_data_type=torch.bfloat16,
+    )
+    o_persistent, _ = wrapper_persistent.run(q, kv_data)
+    measurements_persistent = bench_gpu_time(lambda: wrapper_persistent.run(q, kv_data))
+    ms_persistent = np.mean(measurements_persistent)
     if len(p_kv_lens) == 1:
         q_d = q[: d_q_indptr[-1]]
         kv_d = kv_data[: d_kv_indptr[-1]].unbind(1)
@@ -123,9 +141,46 @@ def run_bench(
             )
         )
         ms_pod = np.median(measurements)
+
+        # Sequential two kernels: single prefill + batch decode (tensor cores)
+        # Prefill using single_prefill_with_kv_cache
+        def _run_single_prefill():
+            return flashinfer.prefill.single_prefill_with_kv_cache(
+                q_p,
+                k_p,
+                v_p,
+                causal=causal,
+                pos_encoding_mode="NONE",
+                backend="fa2",
+            )
+
+        measurements_prefill = bench_gpu_time(lambda: _run_single_prefill())
+        ms_prefill = np.median(measurements_prefill)
+
+        # Batch decode using tensor cores
+        wrapper_decode = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
+            workspace_buffer, kv_layout=kv_layout, use_tensor_cores=True
+        )
+        wrapper_decode.plan(
+            d_kv_indptr.to(device),
+            kv_indices_d.to(device),
+            last_page_len_d,
+            num_qo_heads,
+            num_kv_heads,
+            head_dim,
+            page_block_size,
+            data_type=torch.bfloat16,
+            q_data_type=torch.bfloat16,
+        )
+        measurements_decode = bench_gpu_time(lambda: wrapper_decode.run(q_d, kv_d))
+        ms_decode = np.median(measurements_decode)
+        ms_seq_two_kernels = ms_prefill + ms_decode
+
     print(f"Elapsed time (Batched Prefill): {ms_old:.2f} ms")
     if len(p_kv_lens) == 1:
         print(f"Elapsed time (POD Attention): {ms_pod:.2f} ms")
+        print(f"Elapsed time (Sequential two kernels): {ms_seq_two_kernels:.2f} ms")
+    print(f"Elapsed time (Persistent BatchAttention): {ms_persistent:.2f} ms")
     total_bytes = (
         q.numel() * q.element_size() + kv_data.numel() * kv_data.element_size()
     )
@@ -137,77 +192,29 @@ def run_bench(
     if len(p_kv_lens) == 1:
         bandwidth_pod_gb_s = total_bytes / (ms_pod * 1e-3) / (1024**3)
         print(f"Memory bandwidth (POD Attention): {bandwidth_pod_gb_s:.2f} GB/s")
+        bandwidth_seq_gb_s = total_bytes / (ms_seq_two_kernels * 1e-3) / (1024**3)
+        print(
+            f"Memory bandwidth (Sequential two kernels): {bandwidth_seq_gb_s:.2f} GB/s"
+        )
+    bandwidth_persistent_gb_s = total_bytes / (ms_persistent * 1e-3) / (1024**3)
+    print(
+        f"Memory bandwidth (Persistent BatchAttention): {bandwidth_persistent_gb_s:.2f} GB/s"
+    )
 
 
 if __name__ == "__main__":
     np.random.seed(42)
     torch.random.manual_seed(42)
 
     # Irregular sequence lengths for prefill and decode
-    d_q_len_configs = [[1] * 122, [1] * 128, [1] * 242, [1] * 256]
-    d_kv_len_configs = [[600] * 122, [10000] * 128, [400] * 242, [8192] * 256]
-    p_q_configs = [[17] * 1, [10000], [17] * 1, []]
-    p_kv_configs = [[10000] * 1, [10000], [8192] * 1, []]
-
-    # construct random length testcases
-    for _ in range(1):
-        bsz = 256
-        stride = 16
-        sparsity = 0.05
-
-        full_kv_len = np.random.randint(1000, 8192, size=bsz)
-        p_q_lens = []
-        p_kv_lens = []
-        d_q_lens = []
-        d_kv_lens = []
-        for i in range(bsz):
-            if i % stride == 0:
-                kv_len = full_kv_len[i]
-                qo_len = stride + 1
-                p_q_lens.append(qo_len)
-                p_kv_lens.append(kv_len)
-            else:
-                kv_len = int(full_kv_len[i] * sparsity)
-                qo_len = 1
-                d_q_lens.append(qo_len)
-                d_kv_lens.append(kv_len)
-
-        p_q_configs.append(p_q_lens)
-        p_kv_configs.append(p_kv_lens)
-        d_q_len_configs.append(d_q_lens)
-        d_kv_len_configs.append(d_kv_lens)
-
-    for _ in range(1):
-        bsz = 128
-        stride = 16
-        sparsity = 0.05
-
-        full_kv_len = np.random.randint(2000, 16000, size=bsz)
-        p_q_lens = []
-        p_kv_lens = []
-        d_q_lens = []
-        d_kv_lens = []
-
-        for i in range(bsz):
-            if i % stride == 0:
-                kv_len = full_kv_len[i]
-                qo_len = stride + 1
-                p_q_lens.append(qo_len)
-                p_kv_lens.append(kv_len)
-            else:
-                kv_len = int(full_kv_len[i] * sparsity)
-                qo_len = 1
-                d_q_lens.append(qo_len)
-                d_kv_lens.append(kv_len)
-
-        p_q_configs.append(p_q_lens)
-        p_kv_configs.append(p_kv_lens)
-        d_q_len_configs.append(d_q_lens)
-        d_kv_len_configs.append(d_kv_lens)
+    d_q_len_configs = [[1] * 128, [1] * 128, [1] * 128, [1] * 128]
+    d_kv_len_configs = [[2048] * 128, [4096] * 128, [8192] * 128, [8192] * 128]
+    p_q_configs = [[2048], [4096], [4096], [6000]]
+    p_kv_configs = [[2048], [4096], [4096], [7000]]
 
     page_block_size = 1
-    num_kv_heads = 4
-    num_qo_heads = 28
+    num_kv_heads = 8
+    num_qo_heads = 32
     head_dim = 128
 
     for idx, (p_q_lens, p_kv_lens, d_q_len, d_kv_len) in enumerate(
```
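
Each variant above is timed with the repository's `bench_gpu_time` helper and summarized as a median latency plus an effective memory bandwidth. Below is a minimal, self-contained sketch of that measurement pattern, assuming only PyTorch, NumPy, and a CUDA device; `time_gpu_ms` and the matmul workload are illustrative stand-ins, not the actual helper:

```python
# Sketch of the latency/bandwidth reporting pattern used in the benchmark above.
# `time_gpu_ms` is a hypothetical stand-in for the repo's `bench_gpu_time` helper:
# it returns per-call GPU latencies so the caller can take a median.
import numpy as np
import torch


def time_gpu_ms(fn, warmup=10, iters=50):
    """Return a list of per-call GPU latencies in milliseconds using CUDA events."""
    for _ in range(warmup):
        fn()
    torch.cuda.synchronize()
    times = []
    for _ in range(iters):
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        fn()
        end.record()
        end.synchronize()
        times.append(start.elapsed_time(end))  # milliseconds
    return times


# Usage mirroring the benchmark: median latency, then bytes moved / time.
x = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)
ms = float(np.median(time_gpu_ms(lambda: x @ x)))
total_bytes = 3 * x.numel() * x.element_size()  # rough read/write estimate for this toy workload
print(f"median {ms:.2f} ms, {total_bytes / (ms * 1e-3) / (1024**3):.2f} GB/s")
```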

benchmarks/bench_rope_quantize_fp8.py

Lines changed: 23 additions & 1 deletion
```diff
@@ -88,7 +88,7 @@ def _apply_rotary_emb(
     return torch.stack((o1, o2), dim=-1).flatten(-2)
 
 
-def benchmark_config(config_name, num_tokens, provider):
+def benchmark_config(config_name, num_tokens, provider, enable_pdl=False):
     """Benchmark a specific attention configuration."""
     input_dtype = torch.bfloat16
     device = "cuda"
@@ -177,6 +177,7 @@ def execute():
             k_nope_out=k_nope_out,
             quant_scale_q=1.0,
             quant_scale_kv=1.0,
+            enable_pdl=enable_pdl,
         )
 
         if mode_ncu and run_idx == 20:
@@ -278,6 +279,23 @@ def benchmark_mha(provider, num_tokens):
     return benchmark_config("mha", num_tokens, provider)
 
 
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["num_tokens"],
+        x_vals=[768] if mode_ncu else [1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 768],
+        line_arg="enable_pdl",
+        line_vals=[False, True],
+        line_names=["enable_pdl=False", "enable_pdl=True"],
+        styles=[("blue", "-"), ("red", "-")],
+        ylabel="Latency (ms)",
+        plot_name="rope-pdl-benchmark",
+        args={},
+    )
+)
+def benchmark_pdl(enable_pdl, num_tokens):
+    return benchmark_config("mla", num_tokens, "flashinfer", enable_pdl=enable_pdl)
+
+
 if __name__ == "__main__":
     # Run all benchmarks and generate individual plots
     print("Running MLA benchmark...")
@@ -289,6 +307,9 @@ def benchmark_mha(provider, num_tokens):
     print("Running MHA benchmark...")
     benchmark_mha.run(print_data=False, show_plots=True, save_path=".")
 
+    print("Running PDL benchmark...")
+    benchmark_pdl.run(print_data=False, show_plots=True, save_path=".")
+
     # Collect results for summary table
     token_counts = (
         [1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 768] if not mode_ncu else [768]
@@ -319,3 +340,4 @@ def benchmark_mha(provider, num_tokens):
     print(" mla-rope-benchmark.png (FlashInfer vs PyTorch)")
     print(" gqa-rope-benchmark.png (FlashInfer vs PyTorch)")
     print(" mha-rope-benchmark.png (FlashInfer vs PyTorch)")
+    print(" rope-pdl-benchmark.png (enable_pdl=False vs enable_pdl=True)")
```
