diff --git a/python/tvm/relax/backend/cuda/flashinfer.py b/python/tvm/relax/backend/cuda/flashinfer.py index 1fea39e9a221..f1af2f3d1573 100644 --- a/python/tvm/relax/backend/cuda/flashinfer.py +++ b/python/tvm/relax/backend/cuda/flashinfer.py @@ -141,8 +141,8 @@ def get_object_file_path(src: Path) -> Path: ) include_paths += [ Path(tvm_home).resolve() / "include", - Path(tvm_home).resolve() / "ffi" / "include", - Path(tvm_home).resolve() / "ffi" / "3rdparty" / "dlpack" / "include", + Path(tvm_home).resolve() / "3rdparty" / "tvm-ffi" / "include", + Path(tvm_home).resolve() / "3rdparty" / "tvm-ffi" / "3rdparty" / "dlpack" / "include", Path(tvm_home).resolve() / "3rdparty" / "dmlc-core" / "include", ] else: @@ -160,8 +160,13 @@ def get_object_file_path(src: Path) -> Path: # The package is installed from source. include_paths += [ tvm_package_path.parent.parent / "include", - tvm_package_path.parent.parent / "ffi" / "include", - tvm_package_path.parent.parent / "ffi" / "3rdparty" / "dlpack" / "include", + tvm_package_path.parent.parent / "3rdparty" / "tvm-ffi" / "include", + tvm_package_path.parent.parent + / "3rdparty" + / "tvm-ffi" + / "3rdparty" + / "dlpack" + / "include", tvm_package_path.parent.parent / "3rdparty" / "dmlc-core" / "include", ] else: diff --git a/src/runtime/vm/attn_backend.h b/src/runtime/vm/attn_backend.h index bc58d1c9e1d8..ea5f49c6c08a 100644 --- a/src/runtime/vm/attn_backend.h +++ b/src/runtime/vm/attn_backend.h @@ -176,7 +176,8 @@ class FlashInferPagedPrefillFunc : public PagedPrefillFunc { plan_func_(float_workspace_buffer, int_workspace_buffer, page_locked_int_workspace_buffer, qo_indptr->as_tensor(), page_indptr->as_tensor(), IntTuple(std::move(kv_len)), total_qo_len, batch_size, num_qo_heads, num_kv_heads, page_size, - /*enable_cuda_graph=*/false, qk_head_dim, v_head_dim, causal, copy_stream) + /*enable_cuda_graph=*/false, qk_head_dim, v_head_dim, causal, + /*window_left=*/-1, copy_stream) .cast(); } else if (attn_kind == AttnKind::kMLA) { plan_info_vec = @@ -280,7 +281,8 @@ class FlashInferRaggedPrefillFunc : public RaggedPrefillFunc { plan_func_(float_workspace_buffer, int_workspace_buffer, page_locked_int_workspace_buffer, qo_indptr->as_tensor(), kv_indptr->as_tensor(), IntTuple(std::move(kv_len)), total_qo_len, batch_size, num_qo_heads, num_kv_heads, /*page_size=*/1, - /*enable_cuda_graph=*/false, qk_head_dim, v_head_dim, causal, copy_stream) + /*enable_cuda_graph=*/false, qk_head_dim, v_head_dim, causal, + /*window_left=*/-1, copy_stream) .cast(); }