2 changes: 1 addition & 1 deletion csrc/cutlass
Submodule cutlass updated 61 files
+2 −7 CHANGELOG.md
+3 −24 CMakeLists.txt
+0 −7 PUBLICATIONS.md
+4 −9 README.md
+38 −0 cmake/version.h.in
+0 −34 cmake/version_extended.h.in
+0 −1 examples/02_dump_reg_shmem/CMakeLists.txt
+2 −2 examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu
+7 −7 examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu
+8 −10 examples/56_hopper_ptr_array_batched_gemm/CMakeLists.txt
+49 −96 examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu
+0 −10 examples/57_hopper_grouped_gemm/CMakeLists.txt
+1 −1 include/cute/arch/copy_sm90_desc.hpp
+0 −2 include/cute/atom/mma_atom.hpp
+2 −2 include/cute/util/print.hpp
+0 −3 include/cute/util/type_traits.hpp
+0 −4 include/cutlass/arch/mma_sm90.h
+0 −1 include/cutlass/bfloat16.h
+1 −35 include/cutlass/detail/layout.hpp
+7 −12 include/cutlass/epilogue/collective/builders/sm90_builder.inl
+0 −1 include/cutlass/epilogue/collective/default_epilogue.hpp
+18 −32 include/cutlass/epilogue/collective/default_epilogue_array.hpp
+38 −76 include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp
+2 −1 include/cutlass/epilogue/dispatch_policy.hpp
+0 −28 include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp
+0 −1 include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp
+12 −57 include/cutlass/epilogue/thread/linear_combination.h
+183 −0 include/cutlass/epilogue/threadblock/default_epilogue_tensor_op_row_broadcast.h
+519 −0 include/cutlass/epilogue/threadblock/predicated_tile_iterator_row_broadcast.h
+8 −4 include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl
+29 −45 include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp
+514 −0 include/cutlass/gemm/device/gemm_sparse_row_broadcast.h
+7 −4 include/cutlass/gemm/dispatch_policy.hpp
+0 −12 include/cutlass/gemm/group_array_problem_shape.hpp
+191 −0 include/cutlass/gemm/kernel/default_gemm_sparse_row_broadcast.h
+35 −30 include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp
+7 −5 include/cutlass/gemm/kernel/sm90_gemm_tma.hpp
+7 −5 include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp
+7 −5 include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp
+7 −5 include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp
+7 −5 include/cutlass/gemm/kernel/sm90_gemm_warpspecialized.hpp
+7 −5 include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_cooperative.hpp
+7 −5 include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_pingpong.hpp
+86 −140 include/cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp
+400 −0 include/cutlass/gemm/kernel/sparse_gemm_row_broadcast.h
+6 −14 include/cutlass/gemm/kernel/tile_scheduler_params.h
+0 −80 include/cutlass/version.h
+2 −2 pyproject.toml
+3 −3 python/cutlass/__init__.py
+2 −6 python/cutlass/backend/c_types.py
+1 −23 python/cutlass/backend/epilogue.py
+2 −2 python/cutlass/backend/evt/frontend/frontend_base.py
+16 −0 python/cutlass/backend/evt/passes/graph_drawer.py
+18 −28 python/cutlass/backend/gemm_operation.py
+1 −1 python/setup_library.py
+1 −1 python/setup_pycute.py
+0 −1 test/unit/gemm/device/CMakeLists.txt
+19 −0 test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu
+0 −685 test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_aux_store.cu
+20 −7 test/unit/gemm/device/testbed_sparse.h
+1 −1 tools/util/include/cutlass/util/packed_stride.hpp
4 changes: 2 additions & 2 deletions csrc/flash_attn/flash_api.cpp
@@ -561,7 +561,7 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s
     const int max_num_blocks_per_seq = !paged_KV ? 0 : block_table.size(1);
     const int num_blocks = !paged_KV ? 0 : k.size(0);
     const int page_block_size = !paged_KV ? 1 : k.size(1);
-    TORCH_CHECK(!paged_KV || page_block_size % 256 == 0, "Paged KV cache block size must be divisible by 256");
+    TORCH_CHECK(!paged_KV || page_block_size % 16 == 0, "Paged KV cache block size must be divisible by 16");

     if (max_seqlen_q == 1 && !alibi_slopes_.has_value()) { is_causal = false; }  // causal=true is the same as causal=false in this case
     if (is_causal) { window_size_right = 0; }
@@ -1285,7 +1285,7 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he
     const int max_num_blocks_per_seq = !paged_KV ? 0 : block_table.size(1);
     const int num_blocks = !paged_KV ? 0 : kcache.size(0);
     const int page_block_size = !paged_KV ? 1 : kcache.size(1);
-    TORCH_CHECK(!paged_KV || page_block_size % 256 == 0, "Paged KV cache block size must be divisible by 256");
+    TORCH_CHECK(!paged_KV || page_block_size % 16 == 0, "Paged KV cache block size must be divisible by 16");
     const int seqlen_k = !paged_KV ? kcache.size(1) : max_num_blocks_per_seq * page_block_size;
     const int num_heads_k = kcache.size(2);
     const int batch_size_c = !paged_KV ? kcache.size(0) : batch_size;
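Both hunks make the same one-constant change: the paged KV cache block size now only has to be divisible by 16 rather than 256. Below is a minimal self-contained C++ sketch of the shape bookkeeping these checks guard; PagedKVShape and page_block_size_ok are illustrative names (not flash-attn API), and plain asserts stand in for the TORCH_CHECK calls.

    #include <cassert>
    #include <cstdio>

    // Mirrors the paged-KV bookkeeping in mha_fwd_kvcache / mha_varlen_fwd:
    // the cache is a pool of num_blocks pages of page_block_size tokens each,
    // and block_table maps every sequence to at most max_num_blocks_per_seq pages.
    struct PagedKVShape {                 // illustrative, not flash-attn API
        int num_blocks;                   // kcache.size(0)
        int page_block_size;              // kcache.size(1)
        int max_num_blocks_per_seq;       // block_table.size(1)
    };

    // The relaxed guard from this diff: pages need only be a multiple of
    // 16 tokens, where the old check demanded a multiple of 256.
    bool page_block_size_ok(const PagedKVShape& s) {
        return s.page_block_size % 16 == 0;
    }

    int main() {
        PagedKVShape s{/*num_blocks=*/512, /*page_block_size=*/16,
                       /*max_num_blocks_per_seq=*/64};

        // 16 % 256 != 0, so this configuration was rejected before the
        // change; under the new check it is accepted.
        assert(page_block_size_ok(s));

        // As in the second hunk: with paging enabled, the effective key
        // length is the block-table capacity, not the raw cache extent.
        const int seqlen_k = s.max_num_blocks_per_seq * s.page_block_size;
        std::printf("max seqlen_k per sequence: %d\n", seqlen_k);  // prints 1024
        return 0;
    }

The diff itself only shows the constant being relaxed; the practical effect is that much smaller pages (e.g., 16- or 32-token blocks) now pass validation, where previously only multiples of 256 were accepted.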