@@ -824,267 +824,11 @@ def __init__(
          should be large enough to store the maximum batch size (``[max_batch_size]``)
          during the lifecycle of this wrapper.
          """
-        check_kv_layout(kv_layout)
-        self._kv_layout = kv_layout
-        self._workspace_buffer = workspace_buffer
-        max_batch_size = len(last_page_len_buffer)
-        self._wrapper = _kernels.CUDAGraphBatchDecodeWithPagedKVCachePyTorchWrapper(
-            TensorLayout[kv_layout].value,
-            max_batch_size,
-        )
-        self._paged_kv_indptr_buf = indptr_buffer
-        self._paged_kv_indices_buf = indices_buffer
-        self._paged_kv_last_page_len_buf = last_page_len_buffer
-
-    def reset_workspace_buffer(self, new_workspace_buffer: torch.Tensor):
-        r"""Reset the workspace buffer.
-
-        Parameters
-        ----------
-        new_workspace_buffer : torch.Tensor
-            The new workspace buffer; its device should be the same as the
-            device of the input tensors.
-        """
-        self._workspace_buffer = new_workspace_buffer
-
-    def begin_forward(
-        self,
-        indptr: torch.Tensor,
-        indices: torch.Tensor,
-        last_page_len: torch.Tensor,
-        num_qo_heads: int,
-        num_kv_heads: int,
-        head_dim: int,
-        page_size: int,
-        pos_encoding_mode: str = "NONE",
-        data_type: Union[str, torch.dtype] = "float16",
-    ):
862- r"""Create auxiliary data structures for batch decode for multiple forward calls
863- within the same decode step.
864-
865- Parameters
866- ----------
867- indptr : torch.Tensor
868- The indptr of the paged kv cache, shape: ``[batch_size + 1]``
-        indices : torch.Tensor
-            The page indices of the paged kv cache, shape: ``[indptr[-1]]``
-        last_page_len : torch.Tensor
-            The number of entries in the last page of each request in the paged kv
-            cache, shape: ``[batch_size]``
-        num_qo_heads : int
-            The number of query/output heads
-        num_kv_heads : int
-            The number of key/value heads
-        head_dim : int
-            The dimension of the heads
-        page_size : int
-            The page size of the paged kv cache
-        pos_encoding_mode : str
-            Whether to apply RoPE on-the-fly inside attention kernels, could be
-            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
-        data_type : Union[str, torch.dtype]
-            The data type of the paged kv cache
-
-        Note
-        ----
-        The :meth:`begin_forward` method should be called before any :meth:`forward` or
-        :meth:`forward_return_lse` calls; auxiliary data structures will be created
-        during this call and cached for multiple forward calls.
-
-        ``num_qo_heads`` must be a multiple of ``num_kv_heads``. If ``num_qo_heads``
-        is not equal to ``num_kv_heads``, the function will use
-        `grouped query attention <https://arxiv.org/abs/2305.13245>`_.
-        """
-
-        self._paged_kv_indptr_buf[: len(indptr)] = indptr
-        self._paged_kv_indices_buf[: len(indices)] = indices
-        self._paged_kv_last_page_len_buf[: len(last_page_len)] = last_page_len
-
-        batch_size = len(indptr) - 1
-        # NOTE(Zihao): the following tensor acts as placeholder to pass dtype info
-        empty_data = torch.empty(
-            0,
-            dtype=(
-                getattr(torch, data_type) if isinstance(data_type, str) else data_type
-            ),
-        )
-        self._wrapper.begin_forward(
-            self._workspace_buffer,
-            indptr,
-            last_page_len,
-            batch_size,
-            num_qo_heads,
-            num_kv_heads,
-            head_dim,
-            page_size,
-            PosEncodingMode[pos_encoding_mode].value,
-            empty_data,
-        )
-
-    def end_forward(self):
-        r"""Clear auxiliary data structures created by :meth:`begin_forward`."""
-        self._wrapper.end_forward()
-
-    def forward(
-        self,
-        q: torch.Tensor,
-        paged_kv_data: torch.Tensor,
-        pos_encoding_mode: str = "NONE",
-        q_scale: Optional[float] = None,
-        k_scale: Optional[float] = None,
-        v_scale: Optional[float] = None,
-        sm_scale: Optional[float] = None,
-        rope_scale: Optional[float] = None,
-        rope_theta: Optional[float] = None,
-    ):
940- r"""Compute batch decode attention between query and paged kv cache.
941-
942- Parameters
943- ----------
944- q : torch.Tensor
945- The query tensor, shape: ``[batch_size, num_qo_heads, head_dim]``
946- paged_kv_data : torch.Tensor
947- A 5-D tensor of the reserved paged kv-cache data, shape:
948- ``[max_num_pages, 2, page_size, num_kv_heads, head_dim]`` if
949- :attr:`kv_layout` is ``NHD``, or
950- ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
951- :attr:`kv_layout` is ``HND``.
952- pos_encoding_mode : str
953- Whether to apply RoPE on-the-fly inside attention kernels, could be
954- ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding) /``ALIBI``.
955- q_scale : Optional[float]
956- The calibration scale of query for fp8 input, if not provided, will be set to ``1.0``.
957- k_scale : Optional[float]
958- The calibration scale of key for fp8 input, if not provided, will be set to ``1.0``.
959- v_scale : Optional[float]
960- The calibration scale of value for fp8 input, if not provided, will be set to ``1.0``.
961- sm_scale : Optional[float]
962- The scale of softmax, if not provided, will be set to ``1 / sqrt(head_dim)``.
963- rope_scale : Optional[float]
964- The scale used in RoPE interpolation, if not provided, will be set to
965- ``1.0``.
966- rope_theta : Optional[float]
967- The theta used in RoPE, if not provided, will be set to ``1e4``.
968-
969- Returns
970- -------
971- torch.Tensor
972- The attention output, shape: ``[batch_size, num_qo_heads, head_dim]``.
973- """
-        check_pos_encoding_mode(pos_encoding_mode)
-        if sm_scale is None:
-            head_dim = q.shape[-1]
-            sm_scale = 1.0 / math.sqrt(head_dim)
-        if q_scale is not None:
-            sm_scale *= q_scale
-        if k_scale is not None:
-            sm_scale *= k_scale
-        if rope_scale is None:
-            rope_scale = 1.0
-        if rope_theta is None:
-            rope_theta = 1e4
-
-        paged_kv_data = expand_5d(paged_kv_data, self._kv_layout)
-        out = self._wrapper.forward(
-            q,
-            paged_kv_data,
-            self._paged_kv_indptr_buf,
-            self._paged_kv_indices_buf,
-            self._paged_kv_last_page_len_buf,
-            PosEncodingMode[pos_encoding_mode].value,
-            sm_scale,
-            rope_scale,
-            rope_theta,
-            False,
-        )[0]
-        if v_scale is not None:
-            out *= v_scale
-        return out
-
-    def forward_return_lse(
-        self,
-        q: torch.Tensor,
-        paged_kv_data: torch.Tensor,
-        pos_encoding_mode: str = "NONE",
-        q_scale: Optional[float] = None,
-        k_scale: Optional[float] = None,
-        v_scale: Optional[float] = None,
-        sm_scale: Optional[float] = None,
-        rope_scale: Optional[float] = None,
-        rope_theta: Optional[float] = None,
-    ):
1016- r"""Compute batch decode attention with paged kv cache, return attention output
1017- and logsumexp of attention scores.
1018-
1019- Parameters
1020- ----------
1021- q : torch.Tensor
1022- The query tensor, shape: ``[batch_size, num_qo_heads, head_dim]``
1023- paged_kv_data : torch.Tensor
1024- A 5-D tensor of the reserved paged kv-cache data, shape:
1025- ``[max_num_pages, 2, page_size, num_kv_heads, head_dim]`` if
1026- :attr:`kv_layout` is ``NHD``, or
1027- ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
1028- :attr:`kv_layout` is ``HND``.
1029- pos_encoding_mode : str
1030- Whether to apply RoPE on-the-fly inside attention kernels, could be
1031- ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding) /``ALIBI``.
1032- q_scale : Optional[float]
1033- The calibration scale of query for fp8 input, if not provided, will be set to ``1.0``.
1034- k_scale : Optional[float]
1035- The calibration scale of key for fp8 input, if not provided, will be set to ``1.0``.
1036- v_scale : Optional[float]
1037- The calibration scale of value for fp8 input, if not provided, will be set to ``1.0``.
1038- sm_scale : Optional[float]
1039- The scale of softmax, if not provided, will be set to ``1 / sqrt(head_dim)``.
1040- rope_scale : Optional[float]
1041- The scale used in RoPE interpolation, if not provided, will be set to
1042- ``1.0``.
1043- rope_theta : Optional[float]
1044- The theta used in RoPE, if not provided, will be set to ``1e4``.
1045-
1046- Returns
1047- -------
1048- V : torch.Tensor
1049- The attention output, shape: ``[batch_size, num_qo_heads, head_dim]``.
1050- S : torch.Tensor
1051- The logsumexp of attention scores, Shape: ``[batch_size, num_qo_heads]``.
1052-
1053- Notes
1054- -----
1055- Please refer to the :ref:`tutorial <recursive-attention>` for a detailed
1056- explanation of the log-sum-exp function and attention states.
1057- """
-        check_pos_encoding_mode(pos_encoding_mode)
-        if sm_scale is None:
-            head_dim = q.shape[-1]
-            sm_scale = 1.0 / math.sqrt(head_dim)
-        if q_scale is not None:
-            sm_scale *= q_scale
-        if k_scale is not None:
-            sm_scale *= k_scale
-        if rope_scale is None:
-            rope_scale = 1.0
-        if rope_theta is None:
-            rope_theta = 1e4
-        paged_kv_data = expand_5d(paged_kv_data, self._kv_layout)
-        V, s = self._wrapper.forward(
-            q,
-            paged_kv_data,
-            self._paged_kv_indptr_buf,
-            self._paged_kv_indices_buf,
-            self._paged_kv_last_page_len_buf,
-            self._batch_size,
-            self._nnz_pages,
-            PosEncodingMode[pos_encoding_mode].value,
-            sm_scale,
-            rope_scale,
-            rope_theta,
+        super().__init__(
+            workspace_buffer,
+            kv_layout,
             True,
             indptr_buffer,
             indices_buffer,
             last_page_len_buffer,
         )
-        if v_scale is not None:
-            V *= v_scale
-        return V, s
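
The change above collapses the CUDA-graph wrapper's hand-rolled ``__init__``, ``begin_forward``/``end_forward``, ``forward``, and ``forward_return_lse`` into a single ``super().__init__(...)`` call: the base paged-KV decode wrapper is handed the positional ``True`` (presumably a use-CUDA-graph flag) together with the pre-allocated ``indptr_buffer``/``indices_buffer``/``last_page_len_buffer``, so the CUDA-graph path now shares one implementation with the regular batch-decode path instead of duplicating it. The fixed-size buffers matter because a captured CUDA graph replays with the same device pointers; the wrapper copies each step's metadata into them rather than allocating fresh tensors.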
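For context, here is a minimal usage sketch of the wrapper whose internals are deleted above. This is an illustration, not the library's documented example: the module path ``flashinfer``, the Python-level class name ``CUDAGraphBatchDecodeWithPagedKVCacheWrapper``, and the constructor argument order are assumptions inferred from the parameter names visible in the diff, and the buffer sizes are arbitrary.

```python
# Hypothetical usage sketch (assumptions noted above): drive the CUDA-graph
# decode wrapper through one begin_forward/forward/end_forward cycle.
import torch
import flashinfer  # assumed module path

max_batch_size = 16
max_num_pages = 1024
num_qo_heads, num_kv_heads, head_dim, page_size = 32, 8, 128, 16

# Buffers are allocated once at maximum size and reused every decode step,
# so a captured CUDA graph keeps pointing at the same device memory.
workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda")
indptr_buffer = torch.empty(max_batch_size + 1, dtype=torch.int32, device="cuda")
indices_buffer = torch.empty(max_num_pages, dtype=torch.int32, device="cuda")
last_page_len_buffer = torch.empty(max_batch_size, dtype=torch.int32, device="cuda")

wrapper = flashinfer.CUDAGraphBatchDecodeWithPagedKVCacheWrapper(
    workspace_buffer,
    indptr_buffer,
    indices_buffer,
    last_page_len_buffer,
    kv_layout="NHD",
)

# A toy batch: 4 requests, each owning 2 full pages of paged KV cache.
batch_size, pages_per_req = 4, 2
indptr = torch.arange(
    0, (batch_size + 1) * pages_per_req, pages_per_req,
    dtype=torch.int32, device="cuda",
)
indices = torch.arange(batch_size * pages_per_req, dtype=torch.int32, device="cuda")
last_page_len = torch.full((batch_size,), page_size, dtype=torch.int32, device="cuda")

wrapper.begin_forward(
    indptr, indices, last_page_len,
    num_qo_heads, num_kv_heads, head_dim, page_size,
    data_type=torch.float16,
)
q = torch.randn(batch_size, num_qo_heads, head_dim, dtype=torch.float16, device="cuda")
kv_data = torch.randn(
    max_num_pages, 2, page_size, num_kv_heads, head_dim,
    dtype=torch.float16, device="cuda",
)
out = wrapper.forward(q, kv_data)  # shape: [batch_size, num_qo_heads, head_dim]
wrapper.end_forward()
```

Note that ``begin_forward`` copies the per-step metadata into the pre-allocated buffers (``self._paged_kv_indptr_buf[: len(indptr)] = indptr`` and friends in the deleted code), which is why ``indptr``, ``indices``, and ``last_page_len`` may be smaller than the buffers but never larger.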