
Commit 42424b8: Fix formatting required by pre-commit

Author: Aditya K Kamath (committed)
1 parent: 37ae780

7 files changed: 86 additions & 79 deletions

benchmarks/bench_mixed_attention.py

Lines changed: 12 additions & 8 deletions
@@ -107,12 +107,8 @@ def run_bench(
  kv_d = kv_data[: d_kv_indptr[-1]].unbind(1)
  q_p = q[d_q_indptr[-1] :]
  kv_p = kv_data[d_kv_indptr[-1] :].unbind(1)
- kv_indices_d = torch.arange(
-     0, d_kv_indptr[-1], device=device, dtype=torch.int32
- )
- kv_indices_p = torch.arange(
-     0, p_kv_indptr[-1], device=device, dtype=torch.int32
- )
+ kv_indices_d = torch.arange(0, d_kv_indptr[-1], device=device, dtype=torch.int32)
+ kv_indices_p = torch.arange(0, p_kv_indptr[-1], device=device, dtype=torch.int32)

  last_page_len_d = (d_seq_lens_blocks - 1) % page_block_size + 1
  last_page_len_p = (p_seq_lens_blocks - 1) % page_block_size + 1
@@ -266,7 +262,9 @@ def _run_single_prefill():

  print(f"Memory bandwidth (Batched Prefill): {bandwidth_old_gb_s:.2f} GB/s")
  bandwidth_batch_pod_gb_s = total_bytes / (ms_batch_pod * 1e-3) / (1024**3)
- print(f"Memory bandwidth (Batched POD Attention): {bandwidth_batch_pod_gb_s:.2f} GB/s")
+ print(
+     f"Memory bandwidth (Batched POD Attention): {bandwidth_batch_pod_gb_s:.2f} GB/s"
+ )
  if len(p_kv_lens) == 1:
      bandwidth_pod_gb_s = total_bytes / (ms_pod * 1e-3) / (1024**3)
      print(f"Memory bandwidth (POD Attention): {bandwidth_pod_gb_s:.2f} GB/s")
@@ -286,7 +284,13 @@ def _run_single_prefill():

  # Irregular sequence lengths for prefill and decode
  d_q_len_configs = [[1] * 128, [1] * 128, [1] * 128, [1] * 128, [1] * 128]
- d_kv_len_configs = [[2048] * 128, [2048] * 128, [4096] * 128, [8192] * 128, [8192] * 128]
+ d_kv_len_configs = [
+     [2048] * 128,
+     [2048] * 128,
+     [4096] * 128,
+     [8192] * 128,
+     [8192] * 128,
+ ]
  p_q_configs = [[2048] * 2, [2048], [4096], [4096], [6000]]
  p_kv_configs = [[2048] * 2, [2048], [4096], [4096], [7000]]
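
Note: the paged-KV arithmetic this benchmark relies on is easy to check by hand. A sequence of L tokens split into pages of size P occupies ceil(L / P) pages, and its final page holds (L - 1) % P + 1 valid slots, which is the `last_page_len` expression above; with contiguous page allocation, `kv_indices` is just an arange over the total page count. A small self-contained sketch, illustrative only and using made-up sequence lengths rather than anything from the benchmark:

import torch

# Three sequences of 10, 8 and 1 tokens with a page size of 4.
page_block_size = 4
seq_lens = torch.tensor([10, 8, 1])

num_pages = (seq_lens + page_block_size - 1) // page_block_size  # ceil-division
last_page_len = (seq_lens - 1) % page_block_size + 1             # slots used in the final page
assert num_pages.tolist() == [3, 2, 1]
assert last_page_len.tolist() == [2, 4, 1]                       # 8 tokens fill their last page exactly

# With contiguous page allocation, kv_indptr is the running page count and
# kv_indices enumerates pages 0 .. kv_indptr[-1]-1, as the torch.arange above does.
kv_indptr = torch.cat([torch.zeros(1, dtype=torch.long), torch.cumsum(num_pages, dim=0)])
kv_indices = torch.arange(0, int(kv_indptr[-1]), dtype=torch.int32)
assert kv_indptr.tolist() == [0, 3, 5, 6]
assert kv_indices.numel() == 6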

csrc/batch_pod.cu

Lines changed: 20 additions & 17 deletions
@@ -21,14 +21,15 @@

  namespace flashinfer {
  template <uint32_t HEAD_DIM_QK, uint32_t HEAD_DIM_VO, PosEncodingMode POS_ENCODING_MODE,
- bool USE_FP16_QK_REDUCTION, uint32_t CTA_TILE_Q_P, MaskMode MASK_MODE_P, uint32_t CTA_TILE_Q_D,
- MaskMode MASK_MODE_D, typename PrefillAttentionVariant, typename DecodeAttentionVariant,
- typename PrefillParams, typename DecodeParams>
+ bool USE_FP16_QK_REDUCTION, uint32_t CTA_TILE_Q_P, MaskMode MASK_MODE_P,
+ uint32_t CTA_TILE_Q_D, MaskMode MASK_MODE_D, typename PrefillAttentionVariant,
+ typename DecodeAttentionVariant, typename PrefillParams, typename DecodeParams>
  cudaError_t BatchPODWithKVCacheTensorDispatched(PrefillParams prefill_params,
- typename PrefillParams::DTypeO* tmp_v_p, float* tmp_s_p,
- DecodeParams decode_params,
- typename DecodeParams::DTypeO* tmp_v_d, float* tmp_s_d,
- bool enable_pdl, cudaStream_t stream);
+ typename PrefillParams::DTypeO* tmp_v_p,
+ float* tmp_s_p, DecodeParams decode_params,
+ typename DecodeParams::DTypeO* tmp_v_d,
+ float* tmp_s_d, bool enable_pdl,
+ cudaStream_t stream);

  } // namespace flashinfer

@@ -122,8 +123,8 @@ void batch_pod_with_kv_cache_tensor(
  num_kv_heads_d = paged_k_cache_d.size(2);
  }
  TVM_FFI_ICHECK_EQ(num_kv_heads_p, num_kv_heads_d)
- << "POD currently requires same # KV heads for prefill and decode; Prefill: " << num_kv_heads_p
- << ", Decode: " << num_kv_heads_d;
+ << "POD currently requires same # KV heads for prefill and decode; Prefill: "
+ << num_kv_heads_p << ", Decode: " << num_kv_heads_d;

  if (maybe_lse_d.has_value()) {
  const auto& lse = maybe_lse_d.value();
@@ -151,8 +152,8 @@ void batch_pod_with_kv_cache_tensor(
  kv_cache_strides_d = k_strides_d.data();

  // Already handled by prefill
- //cudaSetDevice(float_workspace_buffer_d.device().device_id);
- //const cudaStream_t stream = get_stream(float_workspace_buffer_d.device());
+ // cudaSetDevice(float_workspace_buffer_d.device().device_id);
+ // const cudaStream_t stream = get_stream(float_workspace_buffer_d.device());

  DISPATCH_context(
  MASK_MODE_P, MASK_MODE_D, DTypeQ, DTypeKV, HEAD_DIM_QK, USE_SLIDING_WINDOW_P,
@@ -213,7 +214,8 @@ void batch_pod_with_kv_cache_tensor(
  GetPtrFromBaseOffset<IdType>(int_buffer_ptr_p, plan_info_p.qo_tile_indices_offset);
  params.kv_tile_indices =
  GetPtrFromBaseOffset<IdType>(int_buffer_ptr_p, plan_info_p.kv_tile_indices_offset);
- params.o_indptr = GetPtrFromBaseOffset<IdType>(int_buffer_ptr_p, plan_info_p.o_indptr_offset);
+ params.o_indptr =
+     GetPtrFromBaseOffset<IdType>(int_buffer_ptr_p, plan_info_p.o_indptr_offset);
  params.kv_chunk_size_ptr =
  GetPtrFromBaseOffset<IdType>(int_buffer_ptr_p, plan_info_p.kv_chunk_size_ptr_offset);
  if (plan_info_p.split_kv) {
@@ -290,7 +292,8 @@ void batch_pod_with_kv_cache_tensor(
  GetPtrFromBaseOffset<IdType>(int_buffer_ptr_d, plan_info_d.qo_tile_indices_offset);
  params.kv_tile_indices =
  GetPtrFromBaseOffset<IdType>(int_buffer_ptr_d, plan_info_d.kv_tile_indices_offset);
- params.o_indptr = GetPtrFromBaseOffset<IdType>(int_buffer_ptr_d, plan_info_d.o_indptr_offset);
+ params.o_indptr =
+     GetPtrFromBaseOffset<IdType>(int_buffer_ptr_d, plan_info_d.o_indptr_offset);
  params.kv_chunk_size_ptr =
  GetPtrFromBaseOffset<IdType>(int_buffer_ptr_d, plan_info_d.kv_chunk_size_ptr_offset);
  if (plan_info_d.split_kv) {
@@ -322,10 +325,10 @@ void batch_pod_with_kv_cache_tensor(
  DISPATCH_CTA_TILE_Q(plan_info_p.cta_tile_q, CTA_TILE_Q_P, {
  constexpr size_t CTA_TILE_Q_D = 16;
  cudaError_t status = flashinfer::BatchPODWithKVCacheTensorDispatched<
- HEAD_DIM_QK, HEAD_DIM_VO, POS_ENCODING_MODE, USE_FP16_QK_REDUCTION, CTA_TILE_Q_P, MASK_MODE_P,
- CTA_TILE_Q_D, MASK_MODE_D, PrefillAttentionVariant, DecodeAttentionVariant>(
- prefill_params, tmp_v_p, tmp_s_p, decode_params, tmp_v_d, tmp_s_d,
- enable_pdl, stream);
+ HEAD_DIM_QK, HEAD_DIM_VO, POS_ENCODING_MODE, USE_FP16_QK_REDUCTION, CTA_TILE_Q_P,
+ MASK_MODE_P, CTA_TILE_Q_D, MASK_MODE_D, PrefillAttentionVariant,
+ DecodeAttentionVariant>(prefill_params, tmp_v_p, tmp_s_p, decode_params, tmp_v_d,
+ tmp_s_d, enable_pdl, stream);
  TVM_FFI_ICHECK(status == cudaSuccess)
  << "BatchPODWithKVCache kernel launch failed, error: " << cudaGetErrorString(status);
  });

csrc/batch_pod_kernel_inst.jinja

Lines changed: 2 additions & 2 deletions
@@ -21,11 +21,11 @@ constexpr auto POS_ENCODING_MODE = PosEncodingMode::kNone;
  {% for cta_tile_q in [16, 64, 128] %}
  template cudaError_t BatchPODWithKVCacheTensorDispatched<
  {{ head_dim_qk }}, {{ head_dim_vo }}, POS_ENCODING_MODE,
- {{ use_fp16_qk_reduction }}, /*CTA_TILE_Q_P=*/{{cta_tile_q}}, {{ mask_mode_p }},
+ {{ use_fp16_qk_reduction }}, /*CTA_TILE_Q_P=*/{{cta_tile_q}}, {{ mask_mode_p }},
  /*CTA_TILE_Q_D=*/16, {{ mask_mode_d }}, {{ variant_name_p }},
  {{ variant_name_d }}, PrefillParams, DecodeParams>(
  PrefillParams prefill_params, {{ dtype_o }}* tmp_v_p, float *tmp_s_p,
- DecodeParams decode_params, {{ dtype_o }}* tmp_v_d, float *tmp_s_d,
+ DecodeParams decode_params, {{ dtype_o }}* tmp_v_d, float *tmp_s_d,
  bool enable_pdl, cudaStream_t stream);
  {% endfor %}
  };

csrc/pod_customize_config.jinja

Lines changed: 0 additions & 1 deletion
@@ -40,4 +40,3 @@ using DecodeParams = BatchPrefillPagedParams<DTypeQ, DTypeKV, DTypeO, IdType>;
  __VA_ARGS__(); \
  }); \
  });
-

flashinfer/jit/attention/modules.py

Lines changed: 5 additions & 1 deletion
@@ -629,6 +629,7 @@ def gen_pod_module(
  use_fp16_qk_reduction=use_fp16_qk_reduction,
  )

+
  def gen_batch_pod_module(
  dtype_q: torch.dtype,
  dtype_kv: torch.dtype,
@@ -643,7 +644,7 @@ def gen_batch_pod_module(
  use_sliding_window_d: bool,
  use_logits_soft_cap_d: bool,
  ) -> JitSpec:
- uri = 'batch_' + get_pod_uri(
+ uri = "batch_" + get_pod_uri(
  dtype_q,
  dtype_kv,
  dtype_o,
@@ -693,6 +694,7 @@ def gen_batch_pod_module(
  use_fp16_qk_reduction=use_fp16_qk_reduction,
  )

+
  def gen_customize_pod_module(
  uri: str,
  dtype_q: torch.dtype,
@@ -792,6 +794,7 @@ def gen_customize_pod_module(

  return gen_jit_spec(uri, source_paths)

+
  def gen_customize_batch_pod_module(
  uri: str,
  dtype_q: torch.dtype,
@@ -891,6 +894,7 @@ def gen_customize_batch_pod_module(

  return gen_jit_spec(uri, source_paths)

+
  def gen_batch_decode_module(
  dtype_q: torch.dtype,
  dtype_kv: torch.dtype,
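
Note on how these generators are consumed: each gen_* function builds a JitSpec keyed by a URI (here prefixed with "batch_" via get_pod_uri), and the wrapper in flashinfer/pod.py (next file) memoizes the built module with @functools.cache so identical argument tuples reuse one compiled extension. Below is a minimal stand-in sketch of that pattern; the fake generator and its arguments are placeholders, not the real gen_batch_pod_module API:

import functools
from types import SimpleNamespace

def gen_fake_module(uri: str):
    # Stand-in for gen_batch_pod_module(...).build_and_load(): pretend to compile once per URI.
    print(f"building {uri}")
    return SimpleNamespace(run=lambda *args: f"ran {uri}")

@functools.cache
def get_fake_module(*key):
    uri = "batch_" + "_".join(map(str, key))  # mirrors uri = "batch_" + get_pod_uri(...)
    return gen_fake_module(uri)

m1 = get_fake_module("f16", "f16", 128)
m2 = get_fake_module("f16", "f16", 128)  # cache hit: no second build
assert m1 is m2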

flashinfer/pod.py

Lines changed: 13 additions & 14 deletions
@@ -46,11 +46,13 @@ def get_pod_module(*args):
  module = gen_pod_module(*args).build_and_load()
  return SimpleNamespace(run_tensor=module.pod_with_kv_cache_tensor)

+
  @functools.cache
  def get_batch_pod_module(*args):
  module = gen_batch_pod_module(*args).build_and_load()
  return SimpleNamespace(run_tensor=module.batch_pod_with_kv_cache_tensor)

+
  class PODWithPagedKVCacheWrapper:
  r"""Wrapper class for POD-Attention with paged kv-cache (first proposed in
  `<https://arxiv.org/abs/2410.18038>`_) for batch of requests.
@@ -615,6 +617,7 @@ def end_forward(self) -> None:
  r"""Warning: this function is deprecated and has no effect."""
  pass

+
  class BatchPODWithPagedKVCacheWrapper:
  r"""Wrapper class for POD-Attention with paged kv-cache (first proposed in
  `<https://arxiv.org/abs/2410.18038>`_) for batch of requests.
@@ -837,12 +840,8 @@ def plan(
  batch_size_p = len(last_page_len_p)
  qo_indptr_host_p = qo_indptr_p.to("cpu")
  total_num_rows_p = int(qo_indptr_host_p[-1])
- self._kv_indptr_buf_p = kv_indptr_p.to(
-     self.device, non_blocking=non_blocking
- )
- self._kv_indices_buf_p = kv_indices_p.to(
-     self.device, non_blocking=non_blocking
- )
+ self._kv_indptr_buf_p = kv_indptr_p.to(self.device, non_blocking=non_blocking)
+ self._kv_indices_buf_p = kv_indices_p.to(self.device, non_blocking=non_blocking)
  self._kv_last_page_len_buf_p = last_page_len_p.to(
      self.device, non_blocking=non_blocking
  )
@@ -851,7 +850,9 @@ def plan(
  )
  kv_indptr_host_p = kv_indptr_p.to("cpu")
  last_page_len_host_p = last_page_len_p.to("cpu")
- kv_lens_arr_host_p = get_seq_lens(kv_indptr_host_p, last_page_len_host_p, page_size)
+ kv_lens_arr_host_p = get_seq_lens(
+     kv_indptr_host_p, last_page_len_host_p, page_size
+ )

  if data_type is not None:
      if q_data_type is None:
@@ -908,12 +909,8 @@ def plan(
  batch_size_d = len(last_page_len_d)
  qo_indptr_host_d = qo_indptr_d.to("cpu")
  total_num_rows_d = int(qo_indptr_host_d[-1])
- self._kv_indptr_buf_d = kv_indptr_d.to(
-     self.device, non_blocking=non_blocking
- )
- self._kv_indices_buf_d = kv_indices_d.to(
-     self.device, non_blocking=non_blocking
- )
+ self._kv_indptr_buf_d = kv_indptr_d.to(self.device, non_blocking=non_blocking)
+ self._kv_indices_buf_d = kv_indices_d.to(self.device, non_blocking=non_blocking)
  self._kv_last_page_len_buf_d = last_page_len_d.to(
      self.device, non_blocking=non_blocking
  )
@@ -922,7 +919,9 @@ def plan(
  )
  kv_indptr_host_d = kv_indptr_d.to("cpu")
  last_page_len_host_d = last_page_len_d.to("cpu")
- kv_lens_arr_host_d = get_seq_lens(kv_indptr_host_d, last_page_len_host_d, page_size)
+ kv_lens_arr_host_d = get_seq_lens(
+     kv_indptr_host_d, last_page_len_host_d, page_size
+ )

  self._plan_info_d = self._cached_module.plan(
      self._float_workspace_buffer_d,
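
Note on the get_seq_lens calls that get re-wrapped above: with a paged KV cache, a request's sequence length can be recovered from its page count, the page size, and how many slots its last page holds. The helper below is only an illustrative re-derivation of that relationship, not the library's implementation, and it round-trips the numbers from the benchmark sketch earlier:

import torch

def seq_lens_from_pages(kv_indptr, last_page_len, page_size):
    # Every page except the last holds `page_size` tokens; the last holds `last_page_len`.
    num_pages = kv_indptr[1:] - kv_indptr[:-1]
    return torch.clamp(num_pages - 1, min=0) * page_size + last_page_len

kv_indptr = torch.tensor([0, 3, 5, 6])      # 3, 2 and 1 pages per request
last_page_len = torch.tensor([2, 4, 1])
assert seq_lens_from_pages(kv_indptr, last_page_len, 4).tolist() == [10, 8, 1]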
