From 6b4dca633e39a8639285006576aec55cc4e33897 Mon Sep 17 00:00:00 2001 From: WAYKEN-TSE <760301162@qq.com> Date: Fri, 6 Dec 2024 10:40:13 +0000 Subject: [PATCH 1/8] =?UTF-8?q?mp3model=E6=B5=8B=E8=AF=95=E6=97=B6flash-at?= =?UTF-8?q?ten=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paddlemix/mPLUGOwl3/__init__.py | 22 + paddlemix/mPLUGOwl3/activations.py | 174 ++ paddlemix/mPLUGOwl3/bert_padding.py | 111 ++ .../mPLUGOwl3/configuration_hyper_qwen2.py | 103 ++ .../mPLUGOwl3/configuration_mplugowl3.py | 37 + .../mPLUGOwl3/image_processing_mplugowl3.py | 470 ++++++ paddlemix/mPLUGOwl3/imagetest.py | 26 + paddlemix/mPLUGOwl3/modeling_hyper_qwen2.py | 1468 +++++++++++++++++ paddlemix/mPLUGOwl3/modeling_mplugowl3.py | 210 +++ paddlemix/mPLUGOwl3/modeling_navit_siglip.py | 870 ++++++++++ paddlemix/mPLUGOwl3/processing_mplugowl3.py | 348 ++++ paddlemix/mPLUGOwl3/utils/paddle_aux.py | 15 + paddlemix/mPLUGOwl3/x_sdpa.py | 49 + 13 files changed, 3903 insertions(+) create mode 100644 paddlemix/mPLUGOwl3/__init__.py create mode 100644 paddlemix/mPLUGOwl3/activations.py create mode 100644 paddlemix/mPLUGOwl3/bert_padding.py create mode 100644 paddlemix/mPLUGOwl3/configuration_hyper_qwen2.py create mode 100644 paddlemix/mPLUGOwl3/configuration_mplugowl3.py create mode 100644 paddlemix/mPLUGOwl3/image_processing_mplugowl3.py create mode 100644 paddlemix/mPLUGOwl3/imagetest.py create mode 100644 paddlemix/mPLUGOwl3/modeling_hyper_qwen2.py create mode 100644 paddlemix/mPLUGOwl3/modeling_mplugowl3.py create mode 100644 paddlemix/mPLUGOwl3/modeling_navit_siglip.py create mode 100644 paddlemix/mPLUGOwl3/processing_mplugowl3.py create mode 100644 paddlemix/mPLUGOwl3/utils/paddle_aux.py create mode 100644 paddlemix/mPLUGOwl3/x_sdpa.py diff --git a/paddlemix/mPLUGOwl3/__init__.py b/paddlemix/mPLUGOwl3/__init__.py new file mode 100644 index 000000000..6dc187d0d --- /dev/null +++ b/paddlemix/mPLUGOwl3/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .configuration_hyper_qwen2 import * +from .configuration_mplugowl3 import * +from .image_processing_mplugowl3 import * +from .modeling_navit_siglip import * +from .modeling_hyper_qwen2 import * +from .modeling_mplugowl3 import * +from .processing_mplugowl3 import * +from .bert_padding import * +from .x_sdpa import * \ No newline at end of file diff --git a/paddlemix/mPLUGOwl3/activations.py b/paddlemix/mPLUGOwl3/activations.py new file mode 100644 index 000000000..ab9be1167 --- /dev/null +++ b/paddlemix/mPLUGOwl3/activations.py @@ -0,0 +1,174 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from collections import OrderedDict + +import paddle +import paddle.nn.functional as F +from paddle import Tensor, nn + + +class NewGELUActivation(nn.Layer): + """ + Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see + the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def forward(self, input: Tensor) -> Tensor: + return ( + 0.5 * input * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * paddle.pow(input, 3.0)))) + ) + + +class GELUActivation(nn.Layer): + """ + Original Implementation of the GELU activation function in Google BERT repo when initially created. For + information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional + Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, use_gelu_python: bool = False): + super().__init__() + if use_gelu_python: + self.act = self._gelu_python + else: + self.act = nn.functional.gelu + + def _gelu_python(self, input: Tensor) -> Tensor: + return input * 0.5 * (1.0 + paddle.erf(input / math.sqrt(2.0))) + + def forward(self, input: Tensor) -> Tensor: + return self.act(input) + + +class FastGELUActivation(nn.Layer): + """ + Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs + """ + + def forward(self, input: Tensor) -> Tensor: + return 0.5 * input * (1.0 + paddle.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input))) + + +class QuickGELUActivation(nn.Layer): + """ + Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs + """ + + def forward(self, input: Tensor) -> Tensor: + return input * F.sigmoid(1.702 * input) + + +class ClippedGELUActivation(nn.Layer): + """ + Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as + it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to + https://arxiv.org/abs/2004.09602. + + Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when + initially created. + + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). 
See https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, min: float, max: float): + if min > max: + raise ValueError(f"min should be < max (got min: {min}, max: {max})") + + super().__init__() + self.min = min + self.max = max + + def forward(self, x: Tensor) -> Tensor: + return paddle.clip(gelu(x), self.min, self.max) + + +class SiLUActivation(nn.Layer): + """ + See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear + Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function + Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated + Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with + later. + """ + + def forward(self, input: Tensor) -> Tensor: + return F.silu(input) + + +class MishActivation(nn.Layer): + """ + See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also + visit the official repository for the paper: https://github.com/digantamisra98/Mish + """ + + def forward(self, input: Tensor) -> Tensor: + return F.mish(input) + + +class LinearActivation(nn.Layer): + """ + Applies the linear activation function, i.e. forwarding input directly to output. + """ + + def forward(self, input: Tensor) -> Tensor: + return input + + +class ClassInstantier(OrderedDict): + def __getitem__(self, key): + content = super().__getitem__(key) + cls, kwargs = content if isinstance(content, tuple) else (content, {}) + return cls(**kwargs) + + +ACT2CLS = { + "gelu": GELUActivation, + "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}), + "gelu_fast": FastGELUActivation, + "gelu_new": NewGELUActivation, + "gelu_python": (GELUActivation, {"use_gelu_python": True}), + "linear": LinearActivation, + "mish": MishActivation, + "quick_gelu": QuickGELUActivation, + "relu": nn.ReLU, + "relu6": nn.ReLU6, + "sigmoid": nn.Sigmoid, + "silu": SiLUActivation, + "swish": SiLUActivation, + "tanh": nn.Tanh, +} +ACT2FN = ClassInstantier(ACT2CLS) + + +def get_activation(activation_string): + if activation_string in ACT2FN: + return ACT2FN[activation_string] + else: + raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}") + + +# For backwards compatibility with: from activations import gelu_python +gelu_python = get_activation("gelu_python") +gelu_new = get_activation("gelu_new") +gelu = get_activation("gelu") +gelu_fast = get_activation("gelu_fast") +quick_gelu = get_activation("quick_gelu") +silu = get_activation("silu") +mish = get_activation("mish") +linear_act = get_activation("linear") diff --git a/paddlemix/mPLUGOwl3/bert_padding.py b/paddlemix/mPLUGOwl3/bert_padding.py new file mode 100644 index 000000000..017aa78bf --- /dev/null +++ b/paddlemix/mPLUGOwl3/bert_padding.py @@ -0,0 +1,111 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# reference from Dao-AILAB flash-attn +# https://github.com/Dao-AILab/flash-attention/blob/74b0761ff7efc7b90d4e5aeb529c1b2a09a7458c/flash_attn/bert_padding.py#L38 +import operator +from functools import reduce + +import paddle +import paddle.nn.functional as F +from einops import rearrange, repeat + + +class IndexFirstAxis(paddle.autograd.PyLayer): + @staticmethod + def forward(ctx, input, indices): + ctx.save_for_backward(indices) + assert input.ndim >= 2 + ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:] + second_dim = reduce(operator.mul, other_shape, 1) + return paddle.take_along_axis( + arr=rearrange(input, "b ... -> b (...)"), axis=0, indices=repeat(indices, "z -> z d", d=second_dim) + ).reshape([-1, *other_shape]) + + @staticmethod + def backward(ctx, grad_output): + """Class Attribute: torch.autograd.function.FunctionCtx.saved_tensors, can not convert, please check whether it is torch.Tensor.*/torch.autograd.function.FunctionCtx.*/torch.distributions.Distribution.* and convert manually""" + (indices,) = ctx.saved_tensor() + assert grad_output.ndim >= 2 + other_shape = grad_output.shape[1:] + grad_output = rearrange(grad_output, "b ... -> b (...)") + grad_input = paddle.zeros(shape=[ctx.first_axis_dim, tuple(grad_output.shape)[1]], dtype=grad_output.dtype) + + grad_input.put_along_axis_( + axis=0, + indices=repeat(indices, "z -> z d", d=tuple(grad_output.shape)[1]), + values=grad_output, + ) + return grad_input.reshape([ctx.first_axis_dim, *other_shape]), None + + +index_first_axis = IndexFirstAxis.apply + + +class IndexPutFirstAxis(paddle.autograd.PyLayer): + @staticmethod + def forward(ctx, values, indices, first_axis_dim): + ctx.save_for_backward(indices) + assert indices.ndim == 1 + assert values.ndim >= 2 + output = paddle.zeros(shape=[first_axis_dim, *tuple(values.shape)[1:]], dtype=values.dtype) + output[indices] = values + return output + + @staticmethod + def backward(ctx, grad_output): + """Class Attribute: torch.autograd.function.FunctionCtx.saved_tensors, can not convert, please check whether it is torch.Tensor.*/torch.autograd.function.FunctionCtx.*/torch.distributions.Distribution.* and convert manually""" + (indices,) = ctx.saved_tensor() + grad_values = grad_output[indices] + return grad_values, None, None + + +index_put_first_axis = IndexPutFirstAxis.apply + + +def unpad_input(hidden_states, attention_mask): + """ + Arguments: + hidden_states: (batch, seqlen, ...) + attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid. + Return: + hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask. + indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence. + cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states. + max_seqlen_in_batch: int + """ + seqlens_in_batch = paddle.sum(attention_mask, axis=-1, dtype="int32") + indices = paddle.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = paddle.max(seqlens_in_batch).item() + cu_seqlens = F.pad(paddle.cumsum(seqlens_in_batch, axis=0), [1, 0]) + + return ( + index_first_axis(rearrange(hidden_states, "b s ... 
-> (b s) ..."), indices), + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +def pad_input(hidden_states, indices, batch, seqlen): + """ + Arguments: + hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask. + indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence. + batch: int, batch size for the padded sequence. + seqlen: int, maximum sequence length for the padded sequence. + Return: + hidden_states: (batch, seqlen, ...) + """ + output = index_put_first_axis(hidden_states, indices, batch * seqlen) + return rearrange(output, "(b s) ... -> b s ...", b=batch) diff --git a/paddlemix/mPLUGOwl3/configuration_hyper_qwen2.py b/paddlemix/mPLUGOwl3/configuration_hyper_qwen2.py new file mode 100644 index 000000000..ab5f73ec8 --- /dev/null +++ b/paddlemix/mPLUGOwl3/configuration_hyper_qwen2.py @@ -0,0 +1,103 @@ +import paddlenlp +from paddlenlp.transformers import PretrainedConfig + + +# >>>>>>class HyperQwen2Config(transformers.configuration_utils.PretrainedConfig): +class HyperQwen2Config(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a + Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of + Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 151936): + Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Qwen2Model`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 22016): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 32): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. 
+ use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + use_sliding_window (`bool`, *optional*, defaults to `False`): + Whether to use sliding window attention. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention (SWA) window size. If not specified, will default to `4096`. + max_window_layers (`int`, *optional*, defaults to 28): + The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + + ```python + >>> from transformers import Qwen2Model, Qwen2Config + + >>> # Initializing a Qwen2 style configuration + >>> configuration = Qwen2Config() + + >>> # Initializing a model from the Qwen2-7B style configuration + >>> model = Qwen2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = 'qwen2' + keys_to_ignore_at_inference = ['past_key_values'] + + def __init__(self, vocab_size=151936, hidden_size=4096, + intermediate_size=22016, num_hidden_layers=32, num_attention_heads= + 32, num_key_value_heads=32, hidden_act='silu', + max_position_embeddings=32768, initializer_range=0.02, rms_norm_eps + =1e-06, use_cache=True, tie_word_embeddings=False, rope_theta= + 10000.0, use_sliding_window=False, sliding_window=4096, + max_window_layers=28, attention_dropout=0.0, hyper_layers=[1, 9, 17, + 25], _attn_implementation="sdpa",**kwargs): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.use_sliding_window = use_sliding_window + self.sliding_window = sliding_window if use_sliding_window else None + self.max_window_layers = max_window_layers + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + self.hyper_layers = hyper_layers + self._attn_implementation = _attn_implementation + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/paddlemix/mPLUGOwl3/configuration_mplugowl3.py b/paddlemix/mPLUGOwl3/configuration_mplugowl3.py new file mode 100644 index 000000000..8dcbcf135 --- /dev/null +++ b/paddlemix/mPLUGOwl3/configuration_mplugowl3.py @@ -0,0 +1,37 @@ +import os +import paddlenlp +""" mPLUGOwl3 model configuration""" +# from paddlenlp.transformers import PretrainedConfig, Qwen2Config +from typing import Union +from .configuration_hyper_qwen2 import HyperQwen2Config +# logger = paddle.utils.try_import('logging').getLogger(name=__name__) +from paddlemix.utils.log import logger +from .modeling_navit_siglip import SigLipVisionConfig + + +class mPLUGOwl3Config(HyperQwen2Config): + model_type = 'mplugowl3' + keys_to_ignore_at_inference = 
['past_key_values'] + default_vision_config = {'hidden_size': 1152, 'image_size': 384, + 'intermediate_size': 4304, 'model_type': 'siglip_vision_model', + 'num_attention_heads': 16, 'num_hidden_layers': 27, 'patch_size': 14} + + def __init__(self, use_cache=True, vision_config=None, **kwargs): + self.use_cache = use_cache + if vision_config is None: +# >>>>>> self.vision_config = (transformers.models.siglip. +# configuration_siglip.SiglipVisionConfig(**self. +# default_vision_config)) + self.vision_config = SigLipVisionConfig(**self.default_vision_config) + logger.info('vision_config is None, using default vision config') + elif isinstance(vision_config, dict): +# >>>>>> self.vision_config = (transformers.models.siglip. + # configuration_siglip.SiglipVisionConfig(**vision_config)) + self.vision_config = SigLipVisionConfig(**vision_config) +# >>>>>> elif isinstance(vision_config, transformers.models.siglip. +# configuration_siglip.SiglipVisionConfig): + elif isinstance(vision_config, SigLipVisionConfig): + self.vision_config = vision_config + self.image_size = self.vision_config.image_size + self.patch_size = self.vision_config.patch_size + super().__init__(**kwargs) diff --git a/paddlemix/mPLUGOwl3/image_processing_mplugowl3.py b/paddlemix/mPLUGOwl3/image_processing_mplugowl3.py new file mode 100644 index 000000000..7a2082677 --- /dev/null +++ b/paddlemix/mPLUGOwl3/image_processing_mplugowl3.py @@ -0,0 +1,470 @@ +import sys +sys.path.append('/home/aistudio/paddle_test/mPLUGOwl3/utils') +# import paddle_aux +import paddle +import paddlenlp +from paddlenlp.transformers.image_processing_utils import BatchFeature, BaseImageProcessor +import paddle.nn.functional as F +import random +from typing import Optional, Union, Dict, Any, List +from einops import rearrange, repeat +import math +import PIL.Image +import PIL.ImageSequence +import numpy as np +import PIL +from PIL import Image +from enum import Enum + + +def recursive_converter(converter, value): + if isinstance(value, list): + new_value = [] + for v in value: + new_value += [recursive_converter(converter, v)] + return new_value + else: + return converter(value) + + +def box_area(boxes): + # 获取边界框的宽度和高度 + width = boxes[:, 2] - boxes[:, 0] + height = boxes[:, 3] - boxes[:, 1] + # 计算面积 + area = width * height + return area + +def custom_max(a, b): + return paddle.where(a > b, a, b) + +def custom_min(a, b): + return paddle.where(a < b, a, b) + + +def box_iou(boxes1, area1, boxes2, eps=1e-05): +# >>>>>> area2 = torchvision.ops.boxes.box_area(boxes2) + area1 = area1.astype('float32') + boxes1 = boxes1.astype('float32') + boxes2 = boxes2.astype('float32') + + area2 = box_area(boxes2).astype('float32') + lt = custom_max(boxes1[:, None, :2], boxes2[:, :2]) + rb = custom_min(boxes1[:, None, 2:], boxes2[:, 2:]) + wh = (rb - lt).clip(min=0) + inter = wh[:, :, 0] * wh[:, :, 1] + union = area1[:, None] + area2 - inter + iou = inter / (union + eps) + return iou, union + + +available_anchor_strategy = ['docowl', 'random', 'highest', 'last', 'llava'] +grid_dict = {'grid_33': [(1, 1), (1, 2), (2, 1), (1, 3), (3, 1), (2, 2), (1, + 4), (4, 1), (1, 5), (5, 1), (1, 6), (6, 1), (2, 3), (3, 2), (1, 7), (7, + 1), (4, 2), (2, 4), (1, 8), (8, 1), (3, 3), (1, 9), (9, 1)], + 'grid_squ_3x3': [(1, 1), (2, 2), (3, 3)], 'grid_squ_4': [(2, 2), (1, 3), + (1, 4), (3, 1), (4, 1)], 'grid_squ_6': [(2, 2), (1, 3), (1, 4), (3, 1), + (4, 1), (2, 3), (3, 2)], 'grid_squ_2': [(2, 1)], 'grid_squ_9': [(1, 1), + (1, 2), (2, 1), (1, 3), (3, 1), (2, 2), (1, 4), (4, 1), (1, 5), (5, 1), + 
(1, 6), (6, 1), (2, 3), (3, 2), (1, 7), (7, 1), (4, 2), (2, 4), (1, 8), + (8, 1), (3, 3), (1, 9), (9, 1)]} +cut_prompt_template_dict = {'v0': lambda img_token, h, w: f''.join([ + f'{img_token}' for i in range(h) for j in range(w)]), 'v1': lambda + img_token, h, w: f'Cut to {h} rows {w} columns, ' + ' '.join([ + f'subimg({i},{j}){img_token}' for i in range(h) for j in range(w)]), + 'v1_global': lambda img_token, h, w: + f'Cut to {h} rows {w} columns with a global view, ' + ' '.join([ + f'subimg({i},{j}){img_token}' for i in range(h) for j in range(w)] + [ + f'global_view{img_token}']), 'v2_global': lambda img_token, h, w: + f"""Cut to {h} rows {w} columns with a global view +""" + '\n'.join([' ' + .join([f'subimg({i},{j}){img_token}' for j in range(w)]) for i in range + (h)]) + f""" +global_view{img_token}""", 'v3': lambda img_token, h, w: + f'<|start_cut|>{h}*{w}' + ' '.join([f'{img_token}' for i in range(h) for + j in range(w)]) + '<|end_cut|>', 'v3_global': lambda img_token, h, w: + f"""<|start_cut|>{h}*{w} +""" + '\n'.join([' '.join([f'{img_token}' for + j in range(w)]) for i in range(h)]) + f""" +{img_token}<|end_cut|>"""} + + +def anchor_rank(anchors, anchors_areas, input_image_size, eps=1e-05): + input_image_bbox = paddle.to_tensor(data=[0, 0, input_image_size[1], + input_image_size[0]]).unsqueeze(axis=0) + boxes1 = anchors + boxes2 = input_image_bbox + boxes3 = anchors.clone() + boxes3[:, 3] = input_image_size[0] / input_image_size[1] * anchors[:, 2] + area1 = anchors_areas + iou, _ = box_iou(boxes1, area1, boxes2) + iou = iou.squeeze(axis=1) + shape_iou, _ = box_iou(boxes1, area1, boxes3) + shape_iou = shape_iou.diag() + index = paddle.argmax(x=shape_iou * 100 + iou, axis=0) + return index + + +def select_best_resolution(anchors, anchors_areas, input_image_size): + """ + Selects the best resolution from a list of possible resolutions based on the original size. + + Args: + original_size (tuple): The original size of the image in the format (width, height). + possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. + + Returns: + tuple: The best fit resolution in the format (width, height). 
+ """ + original_size = input_image_size[1], input_image_size[0] + possible_resolutions = [(_[2], _[3]) for _ in anchors] + original_width, original_height = original_size + best_fit = None + max_effective_resolution = 0 + min_wasted_resolution = float('inf') + index = 0 + for i, (width, height) in enumerate(possible_resolutions): + scale = min(width / original_width, height / original_height) + downscaled_width, downscaled_height = int(original_width * scale), int( + original_height * scale) + effective_resolution = min(downscaled_width * downscaled_height, + original_width * original_height) + wasted_resolution = width * height - effective_resolution + if (effective_resolution > max_effective_resolution or + effective_resolution == max_effective_resolution and + wasted_resolution < min_wasted_resolution): + max_effective_resolution = effective_resolution + min_wasted_resolution = wasted_resolution + best_fit = width, height + index = i + return index + + +def build_cut_shape_indices(cut_shape): + cut_shape_indices = [] + for shape in cut_shape: + n = shape[0] * shape[1] + indices = paddle.concat(x=[repeat(paddle.to_tensor(data=shape), + 'l -> n l', n=n), paddle.arange(end=n).unsqueeze(axis=1)], axis=1) + assert tuple(indices.shape)[0] == n + assert tuple(indices.shape)[1] == 3 + cut_shape_indices.append(indices) + cut_shape_indices = paddle.concat(x=cut_shape_indices, axis=0).astype(dtype + ='int64') + return cut_shape_indices + + +class AnchorResize(paddle.nn.Layer): + +# >>>>>> def __init__(self, image_size, anchors, interpolation=torchvision. +# transforms.transforms.InterpolationMode.BILINEAR, antialias=None, +# anchor_strategy='docowl'): + def __init__(self, image_size, anchors, interpolation='bilinear', antialias=None, + anchor_strategy='docowl'): + super().__init__() + self.image_size = image_size + self.anchors = paddle.to_tensor(data=[[0, 0, _[1] * image_size[1], + _[0] * image_size[0]] for _ in anchors], stop_gradient=not False) +# >>>>>> self.anchor_areas = torchvision.ops.boxes.box_area(self.anchors) + self.anchor_areas = box_area(self.anchors) + self.interpolation = interpolation + self.antialias = antialias + self.anchor_strategy = anchor_strategy + assert self.anchor_strategy in available_anchor_strategy + + def resize_global(self, img): +# >>>>>> return torchvision.transforms.functional.resize(img, self. +# image_size, self.interpolation, max_size=None, antialias=self. +# antialias) + image_np = np.array(img) + image_tensor = paddle.to_tensor(image_np, dtype='float32') + image_tensor = image_tensor.transpose([2, 0, 1]) # 变成 (3, 500, 500) + if self.interpolation =="bilinear" or "bicubic": + image_tensor = image_tensor.unsqueeze(0) # 变成 (1, 3, 500, 500) + return F.interpolate(image_tensor, size=self.image_size, mode=self.interpolation, align_corners=False) + + def forward(self, img, skip_resize=False): + """ + Args: + img (PIL Image or Tensor): Image to be scaled. + + Returns: + PIL Image or Tensor: Rescaled image. + """ + if self.anchor_strategy == 'docowl': + selected_anchor = anchor_rank(self.anchors, self.anchor_areas, + (img.size[1], img.size[0])) + elif self.anchor_strategy == 'random': + selected_anchor = random.randint(0, len(self.anchors) - 1) + elif self.anchor_strategy == 'highest': + selected_anchor = paddle.argmax(x=self.anchors[:, 2] * self. 
+ anchors[:, 3] * 100 - paddle.abs(x=self.anchors[:, 2] - + self.anchors[:, 3])) + elif self.anchor_strategy == 'last': + selected_anchor = len(self.anchors) - 1 + elif self.anchor_strategy == 'llava': + selected_anchor = select_best_resolution(self.anchors, self. + anchor_areas, (img.size[1], img.size[0])) + else: + selected_anchor = None + assert selected_anchor is not None + target_size = self.anchors[selected_anchor][2:].tolist() + if skip_resize: + return selected_anchor +# >>>>>> return torchvision.transforms.functional.resize(img, [target_size[1 +# ], target_size[0]], self.interpolation, max_size=None, +# antialias=self.antialias), selected_anchor + image_np = np.array(img) + image_tensor = paddle.to_tensor(image_np, dtype='float32') + image_tensor = image_tensor.transpose([2, 0, 1]) # 变成 (3, 500, 500) + if self.interpolation =="bilinear" or "bicubic": + image_tensor = image_tensor.unsqueeze(0) # 变成 (1, 3, 500, 500) + return F.interpolate(image_tensor, size=[target_size[1], target_size[0]], mode=self.interpolation, align_corners=False), selected_anchor + + def __repr__(self) ->str: + detail = ( + f'(size={self.image_size}, anchor={self.anchors}, interpolation={self.interpolation.value}, antialias={self.antialias})' + ) + return f'{self.__class__.__name__}{detail}' + + +class CutMixin: + + def __init__(self, cut_cfg={'anchors': 'grid_squ_6', 'anchor_strategy': + 'docowl', 'cut_prompt': 'v3', 'add_global': True, 'cut_prob': 1.0} + ) ->None: + if cut_cfg is None: + self.cut_enable = False + return + else: + self.cut_enable = True + image_size = self.image_size + anchors = cut_cfg.get('anchors', 'grid_33') + anchor_strategy = cut_cfg.get('anchor_strategy', 'docowl') + cut_prompt = cut_cfg.get('cut_prompt', 'v0') + self.cut_prob = cut_cfg.get('cut_prob', 1.0) + self.force_shape_cut = cut_cfg.get('force_shape_cut', False) + force_shape_cut_anchors = cut_cfg.get('force_shape_cut_anchors', + 'force_shape_cut_anchors') + self.add_global = cut_cfg.get('add_global', False) + if isinstance(image_size, int): + image_size = image_size, image_size + self.image_size = image_size + if anchors in grid_dict: + anchors = grid_dict[anchors] + else: + anchors = eval(anchors) + self.anchors = [tuple(_) for _ in anchors] + self.anchor_max = max([max(_) for _ in self.anchors]) + self.resizer = AnchorResize(image_size=image_size, anchors=anchors, + interpolation="bicubic", anchor_strategy=anchor_strategy) + if force_shape_cut_anchors in grid_dict: + force_shape_cut_anchors = grid_dict[force_shape_cut_anchors] + else: + force_shape_cut_anchors = eval(force_shape_cut_anchors) + self.force_shape_cut_anchors = [tuple(_) for _ in + force_shape_cut_anchors] + self.force_shape_cut_anchors_max = max([max(_) for _ in self. + force_shape_cut_anchors]) +# >>>>>> self.old_resizer = torchvision.transforms.Resize(image_size, +# interpolation=torchvision.transforms.transforms. +# InterpolationMode.BICUBIC) + self.old_resizer = paddle.vision.transforms.Resize(size=image_size,interpolation="bicubic") +# >>>>>> self.image_transform = torchvision.transforms.Compose(self. 
+# image_transform.transforms[1:]) + self.image_transform = paddle.vision.transforms.Compose(self.image_transform.transforms[1:]) + if self.add_global: + self.cut_prompt_template = cut_prompt_template_dict[cut_prompt + + '_global'] + else: + self.cut_prompt_template = cut_prompt_template_dict[cut_prompt] + self.media_tokens = ['<|image|>', '<|video|>'] + + def _process_image(self, images): + new_images = [] + cut_shape = [] + for image in images: + print(len(images)) + raw_image = image + print(raw_image) + print('-'*100) + image, selected_anchor = self.resizer(image) + print(image.shape) + print('-'*100) + image_input = self.image_transform(image) + image_input = image_input[0] + print(image_input.shape) + cut_shape.append((tuple(image_input.shape)[1] // self. + image_size[0], tuple(image_input.shape)[2] // self. + image_size[1])) + image_input = rearrange(image_input, + 'C (num_h h) (num_w w) -> (num_h num_w) C h w', h=self. + image_size[0], w=self.image_size[1]) + new_images.append(image_input) + print("1:",image_input.shape) + if self.add_global: + new_images.append(self.image_transform(self.resizer. + resize_global(raw_image))) + print("2:",new_images[1].shape) + cut_shape.append((1, 1)) + print('cutshape:',cut_shape) + new_images = paddle.concat(x=new_images, axis=0) + cut_shape_indices = build_cut_shape_indices(cut_shape) + return new_images, cut_shape, cut_shape_indices + +class TensorType(Enum): + PADDLE = "paddle" + TORCH = "torch" + + +# >>>>>>class mPLUGOwl3BatchFeature(transformers.image_processing_utils.BatchFeature): +class mPLUGOwl3BatchFeature(BatchFeature): + """ + Extend from BatchFeature for supporting various image size + """ + + def __init__( + self, + data: Optional[Dict[str, Any]]=None, + # tensor_type:Union[None, str, transformers.utils.TensorType]=None): + tensor_type:Union[None, str, TensorType]=None): + super().__init__(data) + self.convert_to_tensors(tensor_type=tensor_type) + + # def convert_to_tensors(self, tensor_type: Optional[Union[str,transformers.utils.TensorType]]=None): + def convert_to_tensors(self, tensor_type: Optional[Union[str,TensorType]]=None): + if tensor_type is None: + return self + + is_tensor = lambda x: isinstance(x, paddle.Tensor) + as_tensor = paddle.to_tensor + + def converter(value): + try: + if not is_tensor(value): + tensor = as_tensor(value) + return tensor + except: + if key == 'overflowing_values': + raise ValueError( + 'Unable to create tensor returning overflowing values of different lengths. ' + ) + raise ValueError( + "Unable to create tensor, you should probably activate padding with 'padding=True' to have batched tensors with the same length." 
+ ) + for key, value in self.items(): + self[key] = recursive_converter(converter, value) + return self + + def to(self, *args, **kwargs) ->'mPLUGOwl3BatchFeature': +# >>>>>> transformers.utils.requires_backends(self, ['torch']) + + def cast_tensor(v): +# if paddle.is_floating_point(x=v): +# """Class Method: *.to, can not convert, please check whether it is torch.Tensor.*/Optimizer.*/nn.Module.*/torch.distributions.Distribution.*/torch.autograd.function.FunctionCtx.*/torch.profiler.profile.*/torch.autograd.profiler.profile.*, and convert manually""" +# >>>>>> return v.to(*args, **kwargs) +# elif device is not None: +# return v.to(device=device) +# else: +# return v + if isinstance(v, paddle.Tensor): + # For floating point tensors + if v.dtype in [paddle.float32, paddle.float64]: + if 'dtype' in kwargs: + v = v.cast(kwargs['dtype']) + if 'place' in kwargs: + v = v.place(kwargs['place']) + return v + # For non-floating point tensors, only handle device + elif 'place' in kwargs: + return v.place(kwargs['place']) + return v + + new_data = {} + # Handle place (device in paddle) + place = kwargs.get("place") + if place is None and len(args) > 0: + arg = args[0] + if isinstance(arg, str) or isinstance(arg, paddle.CPUPlace) or isinstance(arg, paddle.CUDAPlace): + place = arg + else: + raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.") + +# device = kwargs.get('device') +# if device is None and len(args) > 0: +# arg = args[0] +# # >>>>>> if transformers.utils.is_torch_dtype(arg): +# if isinstance(arg, paddle.Tensor): +# pass +# # >>>>>> elif isinstance(arg, str) or transformers.utils.is_torch_device(arg +# # ) or isinstance(arg, int): +# # device = arg +# elif isinstance(arg, str): +# # 如果是字符串,可以直接使用该字符串作为设备标识 +# device = arg +# elif isinstance(arg, (int, paddle.device.Device)): +# if isinstance(arg, int): +# device = f'gpu:{arg}' if arg >= 0 else 'cpu' +# else: +# device = str(arg) +# else: +# raise ValueError( +# f'Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.' +# ) + for k, v in self.items(): + new_data[k] = recursive_converter(cast_tensor, v) + self.data = new_data + return self + + +# >>>>>>class mPLUGOwl3ImageProcessor(transformers.image_processing_utils. +# BaseImageProcessor, CutMixin): +class mPLUGOwl3ImageProcessor(BaseImageProcessor,CutMixin): + model_input_names = ['pixel_values'] + + def __init__(self, image_size, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5 + ], **kwargs): + # super().__init__(**kwargs) + self.image_size = image_size + self.mean = mean + self.std = std +# >>>>>> self.image_transform = torchvision.transforms.Compose([torchvision. +# transforms.Resize((image_size, image_size), interpolation=Image +# .BICUBIC), torchvision.transforms.ToTensor(), torchvision. 
+# transforms.Normalize(mean, std)]) + self.image_transform = paddle.vision.transforms.Compose([paddle.vision.transforms.Resize(size=(image_size, image_size),interpolation="bicubic"), + paddle.vision.transforms.ToTensor(),paddle.vision.transforms.Normalize(mean=mean,std=std)]) + + + + CutMixin.__init__(self) + + def preprocess(self, images: Union[Image.Image, List[Image.Image]], + cut_enable=True, **kwargs) ->mPLUGOwl3BatchFeature: + if isinstance(images, Image.Image): + images_list = [images] + else: + images_list = images + if self.cut_enable and cut_enable: + image_data, cut_shape, cut_shape_indices = self._process_image( + images_list) + else: + image_data = [self.image_transform(self.resizer.resize_global( + image)) for image in images_list] + image_data = paddle.stack(x=image_data, axis=0) + cut_shape = cut_shape_indices = None + return mPLUGOwl3BatchFeature(data={'pixel_values': image_data, + 'cut_shape': cut_shape, 'cut_shape_indices': cut_shape_indices}) + + def to_dict(self): + # encoder_dict = super().to_dict() + encoder_dict = {} + pop_keys = ['image_transform', 'resizer', 'old_resizer', + 'cut_prompt_template'] + for pk in pop_keys: + encoder_dict.pop(pk, None) + return encoder_dict + + +# >>>>>>transformers.AutoImageProcessor.register('mPLUGOwl3ImageProcessor', +# mPLUGOwl3ImageProcessor) diff --git a/paddlemix/mPLUGOwl3/imagetest.py b/paddlemix/mPLUGOwl3/imagetest.py new file mode 100644 index 000000000..7a01a96b7 --- /dev/null +++ b/paddlemix/mPLUGOwl3/imagetest.py @@ -0,0 +1,26 @@ +import paddlenlp +import paddle +from paddlenlp.transformers import PretrainedModel, AutoTokenizer +from PIL import Image +from decord import VideoReader, cpu +from mPLUGOwl3.modeling_mplugowl3 import mPLUGOwl3Model +from mPLUGOwl3.configuration_mplugowl3 import mPLUGOwl3Config + +model_path = '/home/aistudio/paddle_test/mPLUGOwl3' +config = mPLUGOwl3Config.from_pretrained(model_path) +# print(config) +model = mPLUGOwl3Model.from_pretrained(model_path, config=config, dtype="float16") +model=model.eval() +tokenizer = AutoTokenizer.from_pretrained(model_path) +processor = model.init_processor(tokenizer) +image = Image.new('RGB', (500, 500), color='red') +messages = [{'role': 'user', 'content': + """<|image|> +Describe this image."""}, {'role': 'assistant', 'content': ''} + ] +inputs = processor(messages, images=[image], videos=None) +# inputs.to('cuda') +inputs.update({'tokenizer': tokenizer, 'max_new_tokens': 100, 'decode_text': + True}) +g = model.generate(**inputs) +print(g) diff --git a/paddlemix/mPLUGOwl3/modeling_hyper_qwen2.py b/paddlemix/mPLUGOwl3/modeling_hyper_qwen2.py new file mode 100644 index 000000000..fb0d39402 --- /dev/null +++ b/paddlemix/mPLUGOwl3/modeling_hyper_qwen2.py @@ -0,0 +1,1468 @@ +import sys +sys.path.append('/home/aistudio/paddle_test/mPLUGOwl3/utils') +import paddle_aux +import paddle +import paddlenlp +""" PyTorch Qwen2 model.""" +import inspect +import math +from typing import List, Optional, Tuple, Union +from einops import rearrange, repeat +from .configuration_hyper_qwen2 import HyperQwen2Config +from .bert_padding import index_first_axis, pad_input, unpad_input +from .activations import ACT2FN + +from paddlemix.models.flash_attn_utils import has_flash_attn_func +from paddlemix.models.flash_attn_utils import is_flash_attn_available +if is_flash_attn_available(): + flash_attn_func, flash_attn_varlen_func = has_flash_attn_func() + _flash_supports_window_size = 'window_size' in list(inspect.signature(flash_attn_func).parameters) +# >>>>>>if 
transformers.utils.is_flash_attn_2_available(): +# pass +# _flash_supports_window_size = 'window_size' in list(inspect.signature( +# flash_attn_func).parameters) + +from .x_sdpa import ScaleDotProductAttention +try: + from einops import rearrange + use_flash_rotary = True + print('use flash_attn rotary') +except ImportError: + use_flash_rotary = False + print('import flash_attn rotary fail') +logger = paddle.utils.try_import('logging').getLogger(name=__name__) +_CHECKPOINT_FOR_DOC = 'Qwen/Qwen2-7B-beta' +_CONFIG_FOR_DOC = 'HyperQwen2Config' + + +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(axis=-1, dtype='int32') + paddle.utils.try_import('warnings').warn( + 'Now, the return shape is inconsistent with torch when as_tuple is True' + ) + indices = paddle.nonzero(x=attention_mask.flatten(), as_tuple=False + ).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = paddle.nn.functional.pad(x=paddle.cumsum(x= + seqlens_in_batch, axis=0, dtype='int32'), pad=(1, 0), + pad_from_left_axis=False) + return indices, cu_seqlens, max_seqlen_in_batch + + +class Qwen2RMSNorm(paddle.nn.Layer): + + def __init__(self, hidden_size, eps=1e-06): + """ + Qwen2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = paddle.base.framework.EagerParamBase.from_tensor(tensor + =paddle.ones(shape=hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to('float32') + variance = hidden_states.pow(y=2).mean(axis=-1, keepdim=True) + hidden_states = hidden_states * paddle.rsqrt(x=variance + self. + variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +class Qwen2RotaryEmbedding(paddle.nn.Layer): + + def __init__(self, dim, max_position_embeddings=2048, base=10000, + device=None): + super().__init__() + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / self.base ** (paddle.arange(start=0, end=self.dim, + step=2, dtype='int64').astype(dtype='float32').to(device) / + self.dim) + self.register_buffer(name='inv_freq', tensor=inv_freq, persistable= + False) + self._set_cos_sin_cache(seq_len=max_position_embeddings, device= + self.inv_freq.place, dtype=paddle.get_default_dtype()) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = paddle.arange(dtype='int64', end=self.max_seq_len_cached).astype( + dtype=self.inv_freq.dtype) + freqs = paddle.outer(x=t, y=self.inv_freq) + emb = paddle.concat(x=(freqs, freqs), axis=-1) + self.register_buffer(name='cos_cached', tensor=emb.cos().to(dtype), + persistable=False) + self.register_buffer(name='sin_cached', tensor=emb.sin().to(dtype), + persistable=False) + + def forward(self, x, seq_len=None): + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.place, dtype= + x.dtype) + return self.cos_cached[:seq_len].to(dtype=x.dtype), self.sin_cached[: + seq_len].to(dtype=x.dtype) + + +class RotaryEmbedding(paddle.nn.Layer): + + def __init__(self, dim, base=10000, use_fp32=False, use_outer_in_rope=False + ): + super().__init__() + self.dim = dim + self.base = base + self.use_fp32 = use_fp32 + if use_fp32: + self.inv_freq = 1.0 / base ** (paddle.arange(start=0, end=dim, + step=2).astype(dtype='float32') / dim) + else: + inv_freq = 1.0 / base ** (paddle.arange(start=0, end=dim, step= + 2).astype(dtype='float32') / dim) + self.register_buffer(name='inv_freq', 
tensor=inv_freq) + self._rotary_pos_emb_cache = None + self._seq_len_cached = 0 + self.use_outer_in_rope = use_outer_in_rope + self._ntk_alpha_cached = 1.0 + + def update_rotary_pos_emb_cache(self, max_seq_len, offset=0, ntk_alpha=1.0 + ): + seqlen = max_seq_len + offset + if (seqlen > self._seq_len_cached or ntk_alpha != self. + _ntk_alpha_cached): + base = self.base * ntk_alpha ** (self.dim / (self.dim - 2)) + self.inv_freq = 1.0 / base ** (paddle.arange(start=0, end=self. + dim, step=2).astype(dtype='float32') / self.dim) + self._seq_len_cached = seqlen + self._ntk_alpha_cached = ntk_alpha + seq = paddle.arange(end=seqlen) + if self.use_outer_in_rope: + freqs = paddle.outer(x=seq.astype(dtype=self.inv_freq.dtype + ), y=self.inv_freq) + else: + freqs = einsum('i , j -> i j', seq.astype(dtype=self. + inv_freq.dtype), self.inv_freq) + emb = paddle.concat(x=(freqs, freqs), axis=-1) + from einops import rearrange + self._rotary_pos_emb_cache = rearrange(emb, 'n d -> n 1 1 d') + + def forward(self, max_seq_len, offset=0, ntk_alpha=1.0): + self.update_rotary_pos_emb_cache(max_seq_len, offset, ntk_alpha) + return self._rotary_pos_emb_cache[offset:offset + max_seq_len] + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., :tuple(x.shape)[-1] // 2] + x2 = x[..., tuple(x.shape)[-1] // 2:] + return paddle.concat(x=(-x2, x1), axis=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + cos = cos[position_ids].unsqueeze(axis=unsqueeze_dim) + sin = sin[position_ids].unsqueeze(axis=unsqueeze_dim) + # print(q.shape) + # print('-----------------') + # print((rotate_half(q) * sin).shape) + q_embed = q * cos + rotate_half(q) * sin + k_embed = k * cos + rotate_half(k) * sin + return q_embed, k_embed + +# Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position +def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: paddle.Tensor, + sequence_length: int, + target_length: int, + dtype: paddle.dtype, + min_dtype: float, + cache_position: paddle.Tensor, + batch_size: int, +): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`paddle.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`paddle.dtype`): + The dtype to use for the 4D attention mask. + min_dtype (`float`): + The minimum value representable with the dtype `dtype`. + cache_position (`paddle.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`paddle.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + causal_mask = paddle.full([sequence_length, target_length], fill_value=min_dtype, dtype=dtype) + if sequence_length != 1: + causal_mask = paddle.triu(x=causal_mask, diagonal=1) + bool_tensor=paddle.arange(target_length) > cache_position.reshape([-1, 1]) + float_tensor=float16_tensor = bool_tensor.astype(paddle.float16) + causal_mask *= float_tensor + causal_mask = causal_mask[None, None, :, :].expand(shape=[batch_size, 1, -1, -1]) + if attention_mask is not None: + causal_mask = causal_mask.clone() + mask_length = tuple(attention_mask.shape)[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + mask=padding_mask, value=min_dtype + ) + return causal_mask + + +class Qwen2MLP(paddle.nn.Layer): + + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = paddle.nn.Linear(in_features=self.hidden_size, + out_features=self.intermediate_size, bias_attr=False) + self.up_proj = paddle.nn.Linear(in_features=self.hidden_size, + out_features=self.intermediate_size, bias_attr=False) + self.down_proj = paddle.nn.Linear(in_features=self. 
+ intermediate_size, out_features=self.hidden_size, bias_attr=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) ->paddle.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = tuple(hidden_states.shape) + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(shape=[batch, + num_key_value_heads, n_rep, slen, head_dim]) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, + head_dim) + + +def make_t2v_mask(media_offset_line, num_images): + assert len(tuple(media_offset_line.shape)) == 1 + # media_offset_line = media_offset_line.view(-1, 1) + # visual_arange = paddle.arange(end=num_images).view(1, -1) + media_offset_line = paddle.reshape(media_offset_line, [-1, 1]) + visual_arange = paddle.arange(end=num_images).reshape([1, -1]) + + mask = media_offset_line <= visual_arange + return mask + + +def select_query(media_offset, num_queries=None): + query_indices = media_offset[:, :, 1] >= 0 + assert query_indices.sum().item() % num_queries == 0, query_indices.sum( + ).item() + query_indices = query_indices.nonzero() + ptr = 0 + while ptr < tuple(query_indices.shape)[0]: + first_query_index, last_query_index = query_indices[ptr + ], query_indices[ptr + num_queries - 1] + assert (last_query_index[1] - first_query_index[1] + 1).item( + ) == num_queries + assert last_query_index[0].item() == first_query_index[0].item() + batch_id, begin_i, end_i = first_query_index[0].item( + ), first_query_index[1].item(), first_query_index[1].item( + ) + num_queries + yield batch_id, begin_i, end_i + ptr += num_queries + + +def _rotate_half(x): + """ + change sign so the last dimension becomes [-odd, +even] + """ + from einops import rearrange + x = rearrange(x, '... (j d) -> ... j d', j=2) + x1, x2 = x.unbind(axis=-2) + return paddle.concat(x=(-x2, x1), axis=-1) + + +def apply_rotary_pos_emb_core(t, freqs, use_fp32=False, debug=False): + """ + input tensor t is of shape [seq_length, ..., dim] + rotary positional embeding tensor freqs is of shape [seq_length, ..., dim] + check https://kexue.fm/archives/8265 for detailed formulas + """ + if use_flash_rotary and use_fp32: + t_ = rearrange(t, 's b ... -> b s ...').contiguous() + if use_fp32: + t_ = t_.astype(dtype='float32') + freqs = freqs.squeeze(axis=1).squeeze(axis=1) + cos = freqs[:, :tuple(freqs.shape)[-1] // 2].cos() + sin = freqs[:, :tuple(freqs.shape)[-1] // 2].sin() + output = paddle_aux.apply_rotary_emb_func(x=t_, cos=cos, sin=sin + ).astype(dtype=t.dtype) + if debug: + from icecream import ic + ic(tuple(t_.shape), tuple(freqs.shape), tuple(cos.shape)) + return rearrange(output, 'b s ... -> s b ...') + rot_dim = tuple(freqs.shape)[-1] + t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:] + if use_fp32: + t_ = t_.astype(dtype='float32') + t_pass_ = t_pass_.astype(dtype='float32') + t_ = t_ * freqs.cos() + _rotate_half(t_) * freqs.sin() + return paddle.concat(x=(t_, t_pass_), axis=-1).astype(dtype=t.dtype) + + +class HyperQwen2Attention(paddle.nn.Layer): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". 
+ """ + + def __init__(self, config: HyperQwen2Config, layer_idx: Optional[int]= + None, is_hyper_enabed=False): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f'Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.' + ) + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + if self.head_dim * self.num_heads != self.hidden_size: + raise ValueError( + f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`: {self.num_heads}).' + ) + self.q_proj = paddle.nn.Linear(in_features=self.hidden_size, + out_features=self.num_heads * self.head_dim, bias_attr=True) + self.k_proj = paddle.nn.Linear(in_features=self.hidden_size, + out_features=self.num_key_value_heads * self.head_dim, + bias_attr=True) + self.v_proj = paddle.nn.Linear(in_features=self.hidden_size, + out_features=self.num_key_value_heads * self.head_dim, + bias_attr=True) + self.o_proj = paddle.nn.Linear(in_features=self.num_heads * self. + head_dim, out_features=self.hidden_size, bias_attr=False) + self.rotary_emb = Qwen2RotaryEmbedding(self.head_dim, + max_position_embeddings=self.max_position_embeddings, base=self + .rope_theta) + self.rotary_emb_core = RotaryEmbedding(self.head_dim, base=self. 
+ rope_theta, use_fp32=True, use_outer_in_rope=True) + self.is_hyper_enabed = is_hyper_enabed + if self.is_hyper_enabed: + self.v_kv_proj = paddle.nn.Linear(in_features=self.hidden_size, + out_features=self.num_key_value_heads * self.head_dim * 2, + bias_attr=True) + self.gate = paddle.base.framework.EagerParamBase.from_tensor(tensor + =paddle.zeros(shape=self.hidden_size)) + self.v_core_attention_sdpa = ScaleDotProductAttention(layer_number + =-1, causal=False, attention_dropout=self.attention_dropout) + self.visual_cache = {} + + def apply_mi_rope(self, key_layer, media_offset_line, length_each_img): + key_layer = rearrange(key_layer, 'b h s d -> s b h d') + if self.rotary_emb_core.inv_freq.place != key_layer.place: + self.rotary_emb_core.inv_freq = self.rotary_emb_core.inv_freq.to( + key_layer.place) + rotary_pos_emb_max_seq_len = self.config.max_position_embeddings + ntk_alpha = 1 + rotary_pos_emb = self.rotary_emb_core(rotary_pos_emb_max_seq_len, + ntk_alpha=ntk_alpha) + assert rotary_pos_emb is not None + if isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = rotary_pos_emb + else: + rotary_pos_emb = (rotary_pos_emb,) * 2 + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + image_pos = (media_offset_line[1:] - media_offset_line[:-1] + ).nonzero().squeeze(axis=1) + 1 + k_pos_emb = repeat(k_pos_emb[image_pos], + 'N_img b h d -> (N_img L) b h d', L=length_each_img) + key_layer = apply_rotary_pos_emb_core(key_layer, k_pos_emb, + use_fp32=True) + key_layer = rearrange(key_layer, 's b h d -> b h s d') + return key_layer + + def crossattention(self, query_layer, vision_features, media_offset, + context_layer): + """ + query_layer: [s b h d] + vision_features: [b' lv d] + context_layer: s b d + """ + if vision_features is None or self.is_hyper_enabed == False: + return context_layer + context_layer_clone = context_layer.clone() + vision_features = vision_features.contiguous() + vision_features = self.v_kv_proj(vision_features) + length_each_img = tuple(vision_features.shape)[1] + sequence_length = tuple(query_layer.shape)[0] + if sequence_length == 1: + completion_flag = True + media_offset = media_offset[:, -1:] + else: + completion_flag = False + self.visual_cache['media_offset'] = media_offset + self.visual_cache['vision_features'] = vision_features + query_layer = rearrange(query_layer, 'L B H D -> B H L D') + assert sequence_length == tuple(media_offset.shape)[1], ( + sequence_length, tuple(media_offset.shape)) + gate_value = paddle.nn.functional.sigmoid(x=self.gate) + for batch_id, begin_i, end_i in select_query(media_offset, + sequence_length): + assert begin_i == 0 + assert end_i == sequence_length, (end_i, sequence_length) + curr_offset = media_offset[batch_id, end_i - 1] + if not completion_flag: + re_to_zero_media_offset = (media_offset[batch_id, :, 1] - + curr_offset[0]).to(query_layer.place) + query_shift = re_to_zero_media_offset.nonzero()[0].item() + curr_mask = make_t2v_mask(re_to_zero_media_offset[ + query_shift:], num_images=curr_offset[1] - curr_offset[0]) + curr_mask = repeat(curr_mask, + 's_q s_k -> B H s_q (s_k img_l)', B=1, H=1, img_l= + length_each_img) + else: + curr_mask = None + query_shift = 0 + curr_query_tokens = query_layer[batch_id, :, query_shift: + ].unsqueeze(axis=0).clone().contiguous() + assert curr_offset[0] < tuple(vision_features.shape)[0] + assert curr_offset[1] <= tuple(vision_features.shape)[0] + curr_vision_kv: paddle.Tensor = rearrange(vision_features[ + curr_offset[0]:curr_offset[1]].clone(), + 'BL Lv (H KV D) -> KV 1 H 
(BL Lv) D', KV=2, H=self.
+ num_key_value_heads)
+ key_layer = curr_vision_kv[0].contiguous()
+ value_layer = curr_vision_kv[1].contiguous()
+ key_layer = self.apply_mi_rope(key_layer, media_offset_line=
+ self.visual_cache['media_offset'][batch_id, :, 1] -
+ curr_offset[0], length_each_img=length_each_img)
+ key_layer = repeat_kv(key_layer, self.num_key_value_groups)
+ value_layer = repeat_kv(value_layer, self.num_key_value_groups)
+ v_context_layer = self.v_core_attention_sdpa(curr_query_tokens,
+ key_layer, value_layer, attn_mask=curr_mask, order='bhsd'
+ ).squeeze(axis=1)
+ context_layer_clone[query_shift:, batch_id] = context_layer[
+ query_shift:, batch_id].clone() * (1 - gate_value
+ ) + v_context_layer * gate_value
+ return context_layer_clone
+
+ def forward(self, hidden_states: paddle.Tensor, attention_mask:
+ Optional[paddle.Tensor]=None, position_ids: Optional[paddle.Tensor]
+ =None, image_embeds=None, media_offset=None, past_key_value:
+ Optional[Tuple[paddle.Tensor]]=None, output_attentions:
+ bool=False, use_cache: bool=False) ->Tuple[paddle.Tensor, Optional[
+ paddle.Tensor], Optional[Tuple[paddle.Tensor]]]:
+ raise NotImplementedError(
+ 'We do not support eager mode yet. Use attn_implementation == "flash_attention_2" or attn_implementation == "sdpa".'
+ )
+ bsz, q_len, _ = tuple(hidden_states.shape)
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+ # query_states = query_states.view(bsz, q_len, self.num_heads, self.
+ # head_dim).transpose(perm=paddle_aux.transpose_aux_func(
+ # query_states.view(bsz, q_len, self.num_heads, self.head_dim).
+ # ndim, 1, 2))
+ query_states = paddle.reshape(query_states, [bsz, q_len, self.num_heads, self.head_dim])
+ query_states = paddle.transpose(query_states, perm=[0, 2, 1, 3]) # swap dims 1 and 2: (b, s, h, d) -> (b, h, s, d)
+
+ # key_states = key_states.view(bsz, q_len, self.num_key_value_heads,
+ # self.head_dim).transpose(perm=paddle_aux.transpose_aux_func(
+ # key_states.view(bsz, q_len, self.num_key_value_heads, self.
+ # head_dim).ndim, 1, 2))
+ key_states = paddle.reshape(key_states, [bsz, q_len, self.num_key_value_heads, self.head_dim])
+ key_states = paddle.transpose(key_states, perm=[0, 2, 1, 3]) # swap dims 1 and 2: (b, s, h, d) -> (b, h, s, d)
+
+ # value_states = value_states.view(bsz, q_len, self.
+ # num_key_value_heads, self.head_dim).transpose(perm=paddle_aux.
+ # transpose_aux_func(value_states.view(bsz, q_len, self.
+ # num_key_value_heads, self.head_dim).ndim, 1, 2))
+ value_states = paddle.reshape(value_states, [bsz, q_len, self.num_key_value_heads, self.head_dim])
+ value_states = paddle.transpose(value_states, perm=[0, 2, 1, 3]) # swap dims 1 and 2: (b, s, h, d) -> (b, h, s, d)
+
+ kv_seq_len = tuple(key_states.shape)[-2]
+ if past_key_value is not None:
+ if self.layer_idx is None:
+ raise ValueError(
+ f'The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} for auto-regressive decoding with k/v caching, please make sure to initialize the attention class with a layer index.'
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self + .layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, + key_states, cos, sin, position_ids) + if past_key_value is not None: + cache_kwargs = {'sin': sin, 'cos': cos} + key_states, value_states = past_key_value.update(key_states, + value_states, self.layer_idx, cache_kwargs) + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + attn_weights = paddle.matmul(x=query_states, y=key_states.transpose + (perm=paddle_aux.transpose_aux_func(key_states.ndim, 2, 3)) + ) / math.sqrt(self.head_dim) + if tuple(attn_weights.shape) != (bsz, self.num_heads, q_len, kv_seq_len + ): + raise ValueError( + f'Attention weights should be of size {bsz, self.num_heads, q_len, kv_seq_len}, but is {tuple(attn_weights.shape)}' + ) + if attention_mask is not None: + if tuple(attention_mask.shape) != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f'Attention mask should be of size {bsz, 1, q_len, kv_seq_len}, but is {tuple(attention_mask.shape)}' + ) + attn_weights = attn_weights + attention_mask + attn_weights = paddle.nn.functional.softmax(x=attn_weights, axis=-1, + dtype='float32').to(query_states.dtype) + attn_weights = paddle.nn.functional.dropout(x=attn_weights, p=self. + attention_dropout, training=self.training) + attn_output = paddle.matmul(x=attn_weights, y=value_states) + if tuple(attn_output.shape) != (bsz, self.num_heads, q_len, self. + head_dim): + raise ValueError( + f'`attn_output` should be of size {bsz, self.num_heads, q_len, self.head_dim}, but is {tuple(attn_output.shape)}' + ) + attn_output = attn_output.transpose(perm=paddle_aux. + transpose_aux_func(attn_output.ndim, 1, 2)).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = self.crossattention(query_states.transpose(perm=[1, 0, + 1, 3]), image_embeds, media_offset, attn_output.transpose(perm= + [1, 0, 2])) + attn_output = attn_output.transpose(perm=[1, 0, 2]) + attn_output = self.o_proj(attn_output) + if not output_attentions: + attn_weights = None + return attn_output, attn_weights, past_key_value + + +class HyperQwen2FlashAttention2(HyperQwen2Attention): + """ + Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention` + as the weights of the module stays untouched. The only required change would be on the forward pass + where it needs to correctly call the public API of flash attention and deal with padding tokens + in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom + config.max_window_layers layers. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) +# >>>>>> self._flash_attn_uses_top_left_mask = (not transformers.utils. +# is_flash_attn_greater_or_equal_2_10()) + + def forward(self, hidden_states: paddle.Tensor, attention_mask: + Optional[paddle.Tensor]=None, position_ids: Optional[paddle.Tensor] + =None, image_embeds=None, media_offset=None, past_key_value: + Optional[Tuple[paddle.Tensor]]=None, output_attentions: + bool=False, use_cache: bool=False): + bsz, q_len, _ = tuple(hidden_states.shape) + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + # query_states = query_states.view(bsz, q_len, self.num_heads, self. 
+ # head_dim).transpose(perm=paddle_aux.transpose_aux_func(
+ # query_states.view(bsz, q_len, self.num_heads, self.head_dim).
+ # ndim, 1, 2))
+ query_states = paddle.reshape(query_states, [bsz, q_len, self.num_heads, self.head_dim])
+ query_states = paddle.transpose(query_states, perm=[0, 2, 1, 3]) # swap dims 1 and 2: (b, s, h, d) -> (b, h, s, d)
+
+ # key_states = key_states.view(bsz, q_len, self.num_key_value_heads,
+ # self.head_dim).transpose(perm=paddle_aux.transpose_aux_func(
+ # key_states.view(bsz, q_len, self.num_key_value_heads, self.
+ # head_dim).ndim, 1, 2))
+ key_states = paddle.reshape(key_states, [bsz, q_len, self.num_key_value_heads, self.head_dim])
+ key_states = paddle.transpose(key_states, perm=[0, 2, 1, 3]) # swap dims 1 and 2: (b, s, h, d) -> (b, h, s, d)
+
+ # value_states = value_states.view(bsz, q_len, self.
+ # num_key_value_heads, self.head_dim).transpose(perm=paddle_aux.
+ # transpose_aux_func(value_states.view(bsz, q_len, self.
+ # num_key_value_heads, self.head_dim).ndim, 1, 2))
+ value_states = paddle.reshape(value_states, [bsz, q_len, self.num_key_value_heads, self.head_dim])
+ value_states = paddle.transpose(value_states, perm=[0, 2, 1, 3]) # swap dims 1 and 2: (b, s, h, d) -> (b, h, s, d)
+
+ kv_seq_len = tuple(key_states.shape)[-2]
+ if past_key_value is not None:
+ if self.layer_idx is None:
+ raise ValueError(
+ f'The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} for auto-regressive decoding with k/v caching, please make sure to initialize the attention class with a layer index.'
+ )
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self
+ .layer_idx)
+ rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
+ cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
+ query_states, key_states = apply_rotary_pos_emb(query_states,
+ key_states, cos, sin, position_ids)
+ use_sliding_windows = (_flash_supports_window_size and getattr(self
+ .config, 'sliding_window', None) is not None and kv_seq_len >
+ self.config.sliding_window and self.config.use_sliding_window)
+ if not _flash_supports_window_size:
+ logger.warning_once(
+ 'The current flash attention version does not support sliding window attention, for a more memory efficient implementation make sure to upgrade flash-attn library.'
+ )
+ if past_key_value is not None:
+ cache_has_contents = past_key_value.get_seq_length(self.layer_idx
+ ) > 0
+ if (getattr(self.config, 'sliding_window', None) is not None and
+ kv_seq_len > self.config.sliding_window and cache_has_contents
+ ):
+ slicing_tokens = 1 - self.config.sliding_window
+ past_key = past_key_value[self.layer_idx][0]
+ past_value = past_key_value[self.layer_idx][1]
+ past_key = past_key[:, :, slicing_tokens:, :].contiguous()
+ past_value = past_value[:, :, slicing_tokens:, :].contiguous()
+ if tuple(past_key.shape)[-2] != self.config.sliding_window - 1:
+ raise ValueError(
+ f'past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got {tuple(past_key.shape)}'
+ )
+ if attention_mask is not None:
+ attention_mask = attention_mask[:, slicing_tokens:]
+ attention_mask = paddle.concat(x=[attention_mask,
+ paddle.ones_like(x=attention_mask[:, -1:])], axis=-1)
+ cache_kwargs = {'sin': sin, 'cos': cos}
+ key_states, value_states = past_key_value.update(key_states,
+ value_states, self.layer_idx, cache_kwargs)
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+ dropout_rate = 0.0 if not self.training else self.attention_dropout
+ input_dtype = query_states.dtype
+ if input_dtype == 'float32':
+# >>>>>> if torch.is_autocast_enabled():
+# >>>>>> target_dtype = torch.get_autocast_gpu_dtype()
+ if paddle.amp.auto_cast_enabled():
+ # Paddle exposes no public getter for the active autocast dtype,
+ # so the common float16 AMP setting is assumed here.
+ target_dtype = paddle.float16
+ elif hasattr(self.config, '_pre_quantization_dtype'):
+ target_dtype = self.config._pre_quantization_dtype
+ else:
+ target_dtype = self.q_proj.weight.dtype
+ logger.warning_once(
+ f'The input hidden states seem to have been silently cast to float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in {target_dtype}.'
+ )
+ query_states = query_states.to(target_dtype)
+ key_states = key_states.to(target_dtype)
+ value_states = value_states.to(target_dtype)
+ query_states = query_states.transpose(perm=paddle_aux.
+ transpose_aux_func(query_states.ndim, 1, 2))
+ key_states = key_states.transpose(perm=paddle_aux.
+ transpose_aux_func(key_states.ndim, 1, 2))
+ value_states = value_states.transpose(perm=paddle_aux.
+ transpose_aux_func(value_states.ndim, 1, 2))
+ attn_output = self._flash_attention_forward(query_states,
+ key_states, value_states, attention_mask, q_len, dropout=
+ dropout_rate, use_sliding_windows=use_sliding_windows)
+ attn_output = attn_output.reshape([bsz, q_len, self.hidden_size
+ ]).contiguous()
+ attn_output = self.crossattention(query_states.transpose(perm=[1, 0,
+ 2, 3]), image_embeds, media_offset, attn_output.transpose(perm=
+ [1, 0, 2]))
+ attn_output = attn_output.transpose(perm=[1, 0, 2])
+ attn_output = self.o_proj(attn_output)
+ if not output_attentions:
+ attn_weights = None
+ return attn_output, attn_weights, past_key_value
+
+ def _flash_attention_forward(self, query_states, key_states,
+ value_states, attention_mask, query_length, dropout=0.0,
+ softmax_scale=None, use_sliding_windows=False):
+ """
+ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
+ first unpad the input, then computes the attention scores and pad the final attention scores.
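+
+ As a rough illustration (not part of the original docstring), the toy snippet below
+ sketches the unpad bookkeeping that `_get_unpad_data` produces for the varlen kernels;
+ the mask values are invented for the example and the quantities are recomputed here
+ in a slightly simpler way for clarity:
+
+ ```python
+ import paddle
+
+ mask = paddle.to_tensor([[1, 1, 1, 0], [1, 1, 0, 0]])  # two right-padded sequences
+ seqlens = mask.sum(axis=-1, dtype="int32")              # [3, 2]
+ cu_seqlens = paddle.concat(
+     [paddle.zeros([1], dtype="int32"), paddle.cumsum(seqlens, axis=0, dtype="int32")]
+ )                                                        # [0, 3, 5]
+ max_seqlen_in_batch = int(seqlens.max())                 # 3
+ ```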
+
+ Args:
+ query_states (`paddle.Tensor`):
+ Input query states to be passed to Flash Attention API
+ key_states (`paddle.Tensor`):
+ Input key states to be passed to Flash Attention API
+ value_states (`paddle.Tensor`):
+ Input value states to be passed to Flash Attention API
+ attention_mask (`paddle.Tensor`):
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+ position of padding tokens and 1 for the position of non-padding tokens.
+ dropout (`float`):
+ Attention dropout
+ softmax_scale (`float`, *optional*):
+ The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+ use_sliding_windows (`bool`, *optional*):
+ Whether to activate sliding window attention.
+ """
+ # `_flash_attn_uses_top_left_mask` is only set by the (currently disabled)
+ # flash-attn version probe in `__init__`, so fall back to False here.
+ if not getattr(self, '_flash_attn_uses_top_left_mask', False):
+ causal = self.is_causal
+ else:
+ causal = self.is_causal and query_length != 1
+ if (use_sliding_windows and self.layer_idx >= self.config.
+ max_window_layers):
+ use_sliding_windows = False
+ if attention_mask is not None:
+ batch_size = tuple(query_states.shape)[0]
+ (query_states, key_states, value_states, indices_q, cu_seq_lens,
+ max_seq_lens) = (self._upad_input(query_states, key_states,
+ value_states, attention_mask, query_length))
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+ if not use_sliding_windows:
+# >>>>>> attn_output_unpad = flash_attn.flash_attn_varlen_func(
+ attn_output_unpad = flash_attn_varlen_func(
+ query_states, key_states, value_states, cu_seqlens_q=
+ cu_seqlens_q, cu_seqlens_k=cu_seqlens_k, max_seqlen_q=
+ max_seqlen_in_batch_q, max_seqlen_k=
+ max_seqlen_in_batch_k, dropout_p=dropout,
+ # softmax_scale = softmax_scale, causal=causal)
+ scale=softmax_scale, causal=causal)[0]
+ else:
+# >>>>>>
+ attn_output_unpad = flash_attn_varlen_func(
+ query_states, key_states, value_states, cu_seqlens_q=
+ cu_seqlens_q, cu_seqlens_k=cu_seqlens_k, max_seqlen_q=
+ max_seqlen_in_batch_q, max_seqlen_k=
+ # max_seqlen_in_batch_k, dropout_p=dropout, softmax_scale
+ max_seqlen_in_batch_k, dropout_p=dropout, scale
+ =softmax_scale, causal=causal, window_size=(self.config
+ .sliding_window, self.config.sliding_window))[0]
+# >>>>>> attn_output = flash_attn.bert_padding.pad_input(attn_output_unpad,
+# indices_q, batch_size, query_length)
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+ elif not use_sliding_windows:
+# >>>>>> attn_output = flash_attn.flash_attn_func(query_states,
+ attn_output = flash_attn_func(query_states, key_states,
+ # value_states, dropout, softmax_scale=softmax_scale,
+ value_states, dropout, causal=causal)[0]
+ else:
+# >>>>>> attn_output = flash_attn.flash_attn_func(query_states,
+ attn_output = flash_attn_func(query_states,
+ # key_states, value_states, dropout, softmax_scale=softmax_scale,
+ key_states, value_states, dropout,
+ causal=causal, window_size=(self.config.
+ sliding_window, self.config.sliding_window))[0]
+ return attn_output
+
+ def _upad_input(self, query_layer, key_layer, value_layer,
+ attention_mask, query_length):
+ batch_size, kv_seq_len, num_heads, head_dim = tuple(key_layer.shape)
+ if kv_seq_len != tuple(attention_mask.shape)[-1]:
+ attention_mask_num_tokens = tuple(attention_mask.shape)[-1]
+ attention_mask = attention_mask[:, attention_mask_num_tokens -
+ kv_seq_len:]
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(
+ attention_mask)
+# >>>>>> key_layer = flash_attn.bert_padding.index_first_axis(key_layer.
+# reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
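+ # The padding helpers used here (`index_first_axis` below, `pad_input` above,
+ # `unpad_input` further down) are assumed to come from the local `bert_padding`
+ # module added in this PR: they gather only the non-padding rows selected by
+ # `indices_k` before the varlen kernels run, and scatter the attention output
+ # back into the padded layout afterwards.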
+ key_layer = index_first_axis(
+ key_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k)
+# >>>>>> value_layer = flash_attn.bert_padding.index_first_axis(value_layer.
+# reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
+ value_layer = index_first_axis(
+ value_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k)
+ if query_length == kv_seq_len:
+# >>>>>> query_layer = flash_attn.bert_padding.index_first_axis(query_layer
+# .reshape(batch_size * kv_seq_len, num_heads, head_dim),
+# indices_k)
+ query_layer = index_first_axis(
+ query_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k)
+ cu_seqlens_q = cu_seqlens_k
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
+ indices_q = indices_k
+ elif query_length == 1:
+ max_seqlen_in_batch_q = 1
+ cu_seqlens_q = paddle.arange(dtype='int32', end=batch_size + 1)
+ indices_q = cu_seqlens_q[:-1]
+ query_layer = query_layer.squeeze(axis=1)
+ else:
+ attention_mask = attention_mask[:, -query_length:]
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = (
+ # flash_attn.bert_padding.unpad_input(query_layer,
+ # attention_mask))
+ unpad_input(query_layer, attention_mask))
+ return query_layer, key_layer, value_layer, indices_q, (cu_seqlens_q,
+ cu_seqlens_k), (max_seqlen_in_batch_q, max_seqlen_in_batch_k)
+
+
+class HyperQwen2SdpaAttention(HyperQwen2Attention):
+ """
+ Qwen2 attention module using paddle.nn.functional.scaled_dot_product_attention. This module inherits from
+ `Qwen2Attention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
+ the SDPA API.
+ """
+
+ def forward(self, hidden_states: paddle.Tensor, attention_mask:
+ Optional[paddle.Tensor]=None, position_ids: Optional[paddle.Tensor]
+ =None, image_embeds=None, media_offset=None, past_key_value:
+ Optional[Tuple[paddle.Tensor]]=None, output_attentions:
+ bool=False, use_cache: bool=False) ->Tuple[paddle.Tensor, Optional[
+ paddle.Tensor], Optional[Tuple[paddle.Tensor]]]:
+ if output_attentions:
+ logger.warning_once(
+ 'Qwen2Model is using Qwen2SdpaAttention, but `scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+ )
+ return super().forward(hidden_states=hidden_states,
+ attention_mask=attention_mask, position_ids=position_ids,
+ past_key_value=past_key_value, output_attentions=
+ output_attentions, use_cache=use_cache)
+ bsz, q_len, _ = tuple(hidden_states.shape)
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+ # query_states = query_states.view(bsz, q_len, self.num_heads, self.
+ # head_dim).transpose(perm=paddle_aux.transpose_aux_func(
+ # query_states.view(bsz, q_len, self.num_heads, self.head_dim).
+ # ndim, 1, 2))
+ query_states = paddle.reshape(query_states, [bsz, q_len, self.num_heads, self.head_dim])
+ query_states = paddle.transpose(query_states, perm=[0, 2, 1, 3]) # swap dims 1 and 2: (b, s, h, d) -> (b, h, s, d)
+
+ # key_states = key_states.view(bsz, q_len, self.num_key_value_heads,
+ # self.head_dim).transpose(perm=paddle_aux.transpose_aux_func(
+ # key_states.view(bsz, q_len, self.num_key_value_heads, self.
+ # head_dim).ndim, 1, 2))
+ key_states = paddle.reshape(key_states, [bsz, q_len, self.num_key_value_heads, self.head_dim])
+ key_states = paddle.transpose(key_states, perm=[0, 2, 1, 3]) # swap dims 1 and 2: (b, s, h, d) -> (b, h, s, d)
+
+ # value_states = value_states.view(bsz, q_len, self.
+ # num_key_value_heads, self.head_dim).transpose(perm=paddle_aux.
+ # transpose_aux_func(value_states.view(bsz, q_len, self.
+ # num_key_value_heads, self.head_dim).ndim, 1, 2))
+
+ value_states = paddle.reshape(value_states, [bsz, q_len, self.num_key_value_heads, self.head_dim])
+ value_states = paddle.transpose(value_states, perm=[0, 2, 1, 3]) # swap dims 1 and 2: (b, s, h, d) -> (b, h, s, d)
+
+ kv_seq_len = tuple(key_states.shape)[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self
+ .layer_idx)
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+ query_states, key_states = apply_rotary_pos_emb(query_states,
+ key_states, cos, sin, position_ids)
+ if past_key_value is not None:
+ cache_kwargs = {'sin': sin, 'cos': cos}
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ if attention_mask is not None:
+ if tuple(attention_mask.shape) != (bsz, 1, q_len, kv_seq_len):
+ raise ValueError(
+ f'Attention mask should be of size {bsz, 1, q_len, kv_seq_len}, but is {tuple(attention_mask.shape)}'
+ )
+ # if query_states.device.type == 'cuda' and attention_mask is not None:
+ # query_states = query_states.contiguous()
+ # key_states = key_states.contiguous()
+ # value_states = value_states.contiguous()
+ attn_output = paddle.nn.functional.scaled_dot_product_attention(query
+ =query_states, key=key_states, value=value_states, attn_mask=
+ attention_mask, dropout_p=self.attention_dropout if self.
+ training else 0.0, is_causal=self.is_causal and attention_mask is
+ None and q_len > 1)
+ attn_output = attn_output.transpose(perm=paddle_aux.
+ transpose_aux_func(attn_output.ndim, 1, 2)).contiguous() + # attn_output = attn_output.view(bsz, q_len, self.hidden_size) + attn_output = paddle.reshape(attn_output, [bsz, q_len, self.hidden_size]) + attn_output = self.crossattention(query_states.transpose(perm=[2, 0, + 1, 3]), image_embeds, media_offset, attn_output.transpose(perm= + [1, 0, 2])) + attn_output = attn_output.transpose(perm=[1, 0, 2]) + attn_output = self.o_proj(attn_output) + return attn_output, None, past_key_value + + +QWEN2_ATTENTION_CLASSES = {'eager': HyperQwen2Attention, + 'flash_attention_2': HyperQwen2FlashAttention2, 'sdpa': + HyperQwen2SdpaAttention} + + +class HyperQwen2DecoderLayer(paddle.nn.Layer): + + def __init__(self, config: HyperQwen2Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + if (config.use_sliding_window and config._attn_implementation != + 'flash_attention_2'): + logger.warning_once( + f'Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; unexpected results may be encountered.' + ) + self.is_hyper_enabled = layer_idx + 1 in config.hyper_layers + self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation]( + config, layer_idx, is_hyper_enabed=self.is_hyper_enabled) + self.mlp = Qwen2MLP(config) + self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config. + rms_norm_eps) + self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward(self, hidden_states: paddle.Tensor, attention_mask: + Optional[paddle.Tensor]=None, position_ids: Optional[paddle.Tensor] + =None, image_embeds=None, media_offset=None, past_key_value: + Optional[Tuple[paddle.Tensor]]=None, output_attentions: Optional[ + bool]=False, use_cache: Optional[bool]=False) ->Tuple[paddle.Tensor, + Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + if image_embeds is not None and self.is_hyper_enabled: + image_embeds = self.input_layernorm(image_embeds) + else: + image_embeds = media_offset = None + # print('*&'*100) + # print('attention_mask:',attention_mask) + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, attention_mask=attention_mask,#(1,1,1,60) + position_ids=position_ids, image_embeds=image_embeds, + media_offset=media_offset, past_key_value=past_key_value, + output_attentions=output_attentions, use_cache=use_cache) + hidden_states = residual + hidden_states + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + outputs = hidden_states, + if output_attentions: + outputs += self_attn_weights, + if use_cache: + outputs += present_key_value, + return outputs + + +QWEN2_START_DOCSTRING = """ + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`HyperQwen2Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +# >>>>>>@transformers.utils.add_start_docstrings( +# 'The bare Qwen2 Model outputting raw hidden-states without any specific head on top.' 
+# , QWEN2_START_DOCSTRING) +class Qwen2PreTrainedModel(paddlenlp.transformers.model_utils.PretrainedModel): + config_class = HyperQwen2Config + base_model_prefix = 'model' + supports_gradient_checkpointing = True + _no_split_modules = ['HyperQwen2DecoderLayer'] + _skip_keys_device_placement = 'past_key_values' + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + # def _init_weights(self, module): + # std = self.config.initializer_range + # if isinstance(module, paddle.nn.Linear): + # module.weight.data.normal_(mean=0.0, std=std) + # if module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, paddle.nn.Embedding): + # module.weight.data.normal_(mean=0.0, std=std) + # if module.padding_idx is not None: + # module.weight.data[module.padding_idx].zero_() + + def _init_weights(self, layer): + std = self.config.initializer_range + if isinstance(layer, (paddle.nn.Linear, paddle.nn.Conv3D)): + paddle.nn.initializer.Normal(mean=0.0, std=std)(layer.weight) + if layer.bias is not None: + paddle.nn.initializer.Constant(0.0)(layer.bias) + elif isinstance(layer, paddle.nn.Embedding): + paddle.nn.initializer.Normal(mean=0.0, std=std)(layer.weight) + if layer._padding_idx is not None: + with paddle.no_grad(): + layer.weight[layer._padding_idx] = 0.0 + + +QWEN2_INPUTS_DOCSTRING = """ + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. 
+ + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# >>>>>>@transformers.utils.add_start_docstrings( +# 'The bare Qwen2 Model outputting raw hidden-states without any specific head on top.' +# , QWEN2_START_DOCSTRING) +class HyperQwen2Model(Qwen2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`] + + Args: + config: HyperQwen2Config + """ + + def __init__(self, config: HyperQwen2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.embed_tokens = paddle.nn.Embedding(num_embeddings=config. + vocab_size, embedding_dim=config.hidden_size, padding_idx=self. + padding_idx) + self.layers = paddle.nn.LayerList(sublayers=[HyperQwen2DecoderLayer + (config, layer_idx) for layer_idx in range(config. 
+ num_hidden_layers)])
+ self._attn_implementation = config._attn_implementation
+ self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.gradient_checkpointing = False
+ # self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.embed_tokens = value
+
+# >>>>>> @transformers.utils.add_start_docstrings_to_model_forward(
+# QWEN2_INPUTS_DOCSTRING)
+ def forward(self, input_ids: paddle.Tensor=None, attention_mask:
+ Optional[paddle.Tensor]=None, position_ids: Optional[paddle.Tensor]
+ =None, past_key_values: Optional[List[paddle.Tensor]]=None,
+ inputs_embeds: Optional[paddle.Tensor]=None, image_embeds=None,
+ media_offset=None, use_cache: Optional[bool]=None,
+ output_attentions: Optional[bool]=None, output_hidden_states:
+ Optional[bool]=None, return_dict: Optional[bool]=None) ->Union[
+ Tuple, paddlenlp.transformers.model_outputs.BaseModelOutputWithPast]:
+ output_attentions = (output_attentions if output_attentions is not
+ None else self.config.output_attentions)
+ output_hidden_states = (output_hidden_states if
+ output_hidden_states is not None else self.config.
+ output_hidden_states)
+ use_cache = (use_cache if use_cache is not None else self.config.
+ use_cache)
+ return_dict = (return_dict if return_dict is not None else self.
+ config.use_return_dict)
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError(
+ 'You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time'
+ )
+ elif input_ids is not None:
+ batch_size, seq_length = tuple(input_ids.shape)
+ elif inputs_embeds is not None:
+ batch_size, seq_length, _ = tuple(inputs_embeds.shape)
+ else:
+ raise ValueError(
+ 'You have to specify either decoder_input_ids or decoder_inputs_embeds'
+ )
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
+ )
+ use_cache = False
+
+ if past_key_values is None:
+ cache_position = paddle.arange(input_ids.shape[1])
+ else:
+ cache_position = paddle.to_tensor([seq_length - 1])
+
+ if past_key_values is not None:
+ input_ids = input_ids[:, -1].unsqueeze(-1)
+
+ past_key_values_length = 0
+ # if use_cache:
+ # use_legacy_cache = not isinstance(past_key_values, transformers
+ # .cache_utils.Cache)
+ # use_legacy_cache = not isinstance(past_key_values, list) and all(isinstance(item, paddle.Tensor) for item in past_key_values)
+# if use_legacy_cache:
+# >>>>>> past_key_values = (transformers.cache_utils.DynamicCache.
+# from_legacy_cache(past_key_values))
+# past_key_values_length = past_key_values.get_usable_length(
+# seq_length)
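+ # Note: the transformers DynamicCache conversion sketched above is left
+ # disabled in this Paddle port, so `past_key_values_length` stays 0 here and
+ # the incoming `past_key_values` object is handed to the decoder layers as-is.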
+ if position_ids is None:
+ device = (input_ids.place if input_ids is not None else
+ inputs_embeds.place)
+ position_ids = paddle.arange(start=past_key_values_length, end=
+ seq_length + past_key_values_length, dtype='int64')
+ # position_ids = position_ids.unsqueeze(axis=0).view(-1, seq_length)
+ position_ids = paddle.unsqueeze(position_ids, axis=0)
+ position_ids = paddle.reshape(position_ids, [-1, seq_length])
+
+ else:
+ device = input_ids.place
+ # position_ids = position_ids.view(-1, seq_length).astype(dtype='int64')
+ # position_ids = position_ids.reshape(-1, seq_length).astype(dtype='int64')
+ position_ids = paddle.reshape(position_ids, [-1, seq_length]).astype(dtype='int64')
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input_ids)
+ if (attention_mask is not None and self._attn_implementation ==
+ 'flash_attention_2' and use_cache):
+ is_padding_right = attention_mask[:, -1].sum().item() != batch_size
+ if is_padding_right:
+ raise ValueError(
+ "You are attempting to perform batched generation with padding_side='right' this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to call `tokenizer.padding_side = 'left'` before tokenizing the input. "
+ )
+ if self._attn_implementation == 'flash_attention_2':
+ attention_mask = (attention_mask if attention_mask is not None and
+ 0 in attention_mask else None)
+ # elif self._attn_implementation == 'sdpa' and not output_attentions:
+# >>>>>> attention_mask = (transformers.modeling_attn_mask_utils.
+# _prepare_4d_causal_attention_mask_for_sdpa(attention_mask,
+# (batch_size, seq_length), inputs_embeds,
+# past_key_values_length, sliding_window=self.config.
+# sliding_window))
+
+ else:
+# >>>>>> attention_mask = (transformers.modeling_attn_mask_utils.
+# _prepare_4d_causal_attention_mask(attention_mask, (
+# batch_size, seq_length), inputs_embeds,
+# past_key_values_length, sliding_window=self.config.
+# sliding_window))
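+ # In place of the transformers mask utilities referenced above, this port
+ # rebuilds the 4D causal mask locally with
+ # `_prepare_4d_causal_attention_mask_with_cache_position` below.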
+ # The incoming 2D mask is intentionally discarded at this point (current
+ # WIP behaviour of this port).
+ attention_mask = None
+ min_dtype = paddle.finfo(paddle.float16).min
+
+ attention_mask = _prepare_4d_causal_attention_mask_with_cache_position(
+ attention_mask,
+ sequence_length=seq_length,
+ target_length=seq_length,
+ dtype=inputs_embeds.dtype,
+ # device=device,
+ min_dtype=min_dtype,
+ cache_position=cache_position,
+ batch_size=batch_size,
+ )
+ # attention_mask = _prepare_4d_causal_attention_mask_with_cache_position(
+ # attention_mask,
+ # sequence_length=seq_length,
+ # target_length=past_key_values.get_max_length(),
+ # dtype=dtype,
+ # # device=device,
+ # min_dtype=min_dtype,
+ # cache_position=cache_position,
+ # batch_size=batch_size,
+ # )
+ hidden_states = inputs_embeds
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ next_decoder_cache = None
+ for decoder_layer in self.layers:
+ if output_hidden_states:
+ all_hidden_states += hidden_states,
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(decoder_layer
+ .__call__, hidden_states, attention_mask, position_ids,
+ image_embeds, media_offset, past_key_values,
+ output_attentions, use_cache)
+ else:
+ layer_outputs = decoder_layer(hidden_states, attention_mask
+ =attention_mask, position_ids=position_ids,
+ image_embeds=image_embeds, media_offset=media_offset,
+ past_key_value=past_key_values, output_attentions=
+ output_attentions, use_cache=use_cache)
+ hidden_states = layer_outputs[0]
+ if use_cache:
+ next_decoder_cache = layer_outputs[2 if output_attentions else
+ 1]
+ if output_attentions:
+ all_self_attns += layer_outputs[1],
+ hidden_states = self.norm(hidden_states)
+ if output_hidden_states:
+ all_hidden_states += hidden_states,
+ next_cache = None
+ if use_cache:
+ # The legacy-cache conversion is disabled in this port, so the cache
+ # produced by the last decoder layer is passed through unchanged.
+ next_cache = next_decoder_cache
+ if not return_dict:
+ return tuple(v for v in [hidden_states, next_cache,
+ all_hidden_states, all_self_attns] if v is not None)
+ return paddlenlp.transformers.model_outputs.BaseModelOutputWithPast(
+ last_hidden_state=hidden_states, past_key_values=next_cache,
+ hidden_states=all_hidden_states, attentions=all_self_attns)
+
+
+class HyperQwen2ForCausalLM(Qwen2PreTrainedModel):
+ _tied_weights_keys = ['lm_head.weight']
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.model = HyperQwen2Model(config)
+ self.vocab_size = config.vocab_size
+ self.lm_head = paddle.nn.Linear(in_features=config.hidden_size,
+ out_features=config.vocab_size, bias_attr=False)
+ # self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def set_decoder(self, decoder):
+ self.model = decoder
+
+ def get_decoder(self):
+ return self.model
+
+# >>>>>> @transformers.utils.add_start_docstrings_to_model_forward(
+# QWEN2_INPUTS_DOCSTRING)
+# >>>>>> @transformers.utils.replace_return_docstrings(output_type=
+# CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+ def forward(self, input_ids: paddle.Tensor=None, attention_mask:
+ Optional[paddle.Tensor]=None,
position_ids: Optional[paddle.Tensor] + =None, past_key_values: Optional[List[paddle.Tensor]]=None, + inputs_embeds: Optional[paddle.Tensor]=None, image_embeds=None, + media_offset=None, labels: Optional[paddle.Tensor]=None, use_cache: + Optional[bool]=None, output_attentions: Optional[bool]=None, + output_hidden_states: Optional[bool]=None, return_dict: Optional[ + bool]=None) ->Union[Tuple, paddlenlp.transformers.model_outputs. + CausalLMOutputWithPast]: + """ + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Qwen2ForCausalLM + + >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\\nI'm not conscious, but I can talk to you." + ```""" + output_attentions = (output_attentions if output_attentions is not + None else self.config.output_attentions) + output_hidden_states = (output_hidden_states if + output_hidden_states is not None else self.config. + output_hidden_states) + return_dict = (return_dict if return_dict is not None else self. 
+ config.use_return_dict)
+ outputs = self.model(input_ids=input_ids, attention_mask=
+ attention_mask, position_ids=position_ids, past_key_values=
+ past_key_values, inputs_embeds=inputs_embeds, image_embeds=
+ image_embeds, media_offset=media_offset, use_cache=use_cache,
+ output_attentions=output_attentions, output_hidden_states=
+ output_hidden_states, return_dict=return_dict)
+ hidden_states = outputs[0]
+ logits = self.lm_head(hidden_states)
+ logits = logits.astype(dtype='float32')
+ loss = None
+ if labels is not None:
+ shift_logits = logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ loss_fct = paddle.nn.CrossEntropyLoss()
+ # shift_logits = shift_logits.view(-1, self.config.vocab_size)
+ # shift_labels = shift_labels.view(-1)
+ shift_logits = paddle.reshape(shift_logits, [-1, self.config.vocab_size])
+ shift_labels = paddle.reshape(shift_labels, [-1])
+
+ shift_labels = shift_labels.to(shift_logits.place)
+ loss = loss_fct(shift_logits, shift_labels)
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+ return paddlenlp.transformers.model_outputs.CausalLMOutputWithPast(loss
+ =loss, logits=logits, past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states, attentions=outputs.attentions)
+
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
+ attention_mask=None, inputs_embeds=None, **kwargs):
+ if past_key_values is not None:
+# >>>>>> if isinstance(past_key_values, transformers.cache_utils.Cache):
+# cache_length = past_key_values.get_seq_length()
+# past_length = past_key_values.seen_tokens
+# max_cache_length = past_key_values.get_max_length()
+ if past_key_values is not None and isinstance(past_key_values, list):
+ # Make sure every element is a paddle.Tensor before reading the cached lengths.
+ if all(isinstance(tensor, paddle.Tensor) for tensor in past_key_values):
+ # Derive cache_length and max_cache_length from the cached tensors.
+ cache_length = len(past_key_values) # number of cached tensors
+ past_length = sum(tensor.numel() for tensor in past_key_values) # total element count across all tensors
+ max_cache_length = max(tensor.shape[-2] for tensor in past_key_values) # longest cached sequence length (assumed to be shape[-2])
+ else:
+ raise ValueError("past_key_values should be a list of paddle.Tensors")
+
+ else:
+ cache_length = past_length = tuple(past_key_values[0][0].shape
+ )[2]
+ max_cache_length = None
+ if attention_mask is not None and tuple(attention_mask.shape)[1
+ ] > tuple(input_ids.shape)[1]:
+ input_ids = input_ids[:, -(tuple(attention_mask.shape)[1] -
+ past_length):]
+ elif past_length < tuple(input_ids.shape)[1]:
+ input_ids = input_ids[:, past_length:]
+ if (max_cache_length is not None and attention_mask is not None and
+ cache_length + tuple(input_ids.shape)[1] > max_cache_length):
+ attention_mask = attention_mask[:, -max_cache_length:]
+ position_ids = kwargs.get('position_ids', None)
+ if attention_mask is not None and position_ids is None:
+ position_ids = attention_mask.astype(dtype='int64').cumsum(axis=-1
+ ) - 1
+ position_ids.masked_fill_(mask=attention_mask == 0, value=1)
+ if past_key_values:
+ position_ids = position_ids[:, -tuple(input_ids.shape)[1]:]
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {'inputs_embeds': inputs_embeds}
+ else:
+ model_inputs = {'input_ids': input_ids}
+ model_inputs.update({'position_ids': position_ids,
+ 'past_key_values': past_key_values, 'use_cache':
kwargs.get( + 'use_cache'), 'attention_mask': attention_mask, 'image_embeds': + kwargs.get('image_embeds'), 'media_offset': kwargs.get( + 'media_offset')}) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += tuple(past_state.index_select(axis=0, index= + beam_idx.to(past_state.place)) for past_state in layer_past), + return reordered_past diff --git a/paddlemix/mPLUGOwl3/modeling_mplugowl3.py b/paddlemix/mPLUGOwl3/modeling_mplugowl3.py new file mode 100644 index 000000000..fa81e853c --- /dev/null +++ b/paddlemix/mPLUGOwl3/modeling_mplugowl3.py @@ -0,0 +1,210 @@ +import paddle +import paddlenlp +import math +from typing import List, Optional +import json +from threading import Thread +from copy import deepcopy +from PIL import Image +from .processing_mplugowl3 import mPLUGOwl3Processor +from .image_processing_mplugowl3 import mPLUGOwl3ImageProcessor +from .configuration_mplugowl3 import mPLUGOwl3Config +from .x_sdpa import ScaleDotProductAttention +from .modeling_hyper_qwen2 import HyperQwen2ForCausalLM +from paddlenlp.transformers import Qwen2ForCausalLM, Qwen2PretrainedModel +from .modeling_navit_siglip import SigLipVisionTransformer +from paddlenlp.generation import TextIteratorStreamer + + +def is_flash_attn_available(): + try: + import paddle + if "npu" in paddle.get_device(): # NOTE: flash attn has not been tested yet + return False + q = paddle.rand((1, 4, 2, 8)).astype('float16') + output = paddle.nn.functional.flash_attention.flash_attention(q, q, q, 0.9, False, False) + return True + except: + return False +# >>>>>>class mPLUGOwl3PreTrainedModel(transformers.Qwen2PreTrainedModel): +# config_class = mPLUGOwl3Config +class mPLUGOwl3PreTrainedModel(Qwen2PretrainedModel): + config_class = mPLUGOwl3Config + +class mPLUGOwl3Model(mPLUGOwl3PreTrainedModel): + + def __init__(self, config): + super().__init__(config) + self.language_model = HyperQwen2ForCausalLM(config) + self.vision_model = self.init_vision_module() + self.vision_dim = self.vision_model.embed_dim + self.embed_dim = self.language_model.config.hidden_size + self.vision2text_model = paddle.nn.Linear(in_features=self. + vision_dim, out_features=self.embed_dim) + self.processor = None + self.terminators = ['<|im_end|>', '<|endoftext|>'] + + def init_vision_module(self): + print('-'*100) + if is_flash_attn_available(): + self.config.vision_config._attn_implementation = ( + 'flash_attention_2') + else: + self.config.vision_config._attn_implementation = "eager" + # self.config.vision_config._attn_implementation = (self.config. + # vision_config._attn_implementation) +# >>>>>> model = (transformers.models.siglip.modeling_siglip. 
+# SiglipVisionTransformer(self.config.vision_config))
+ model = SigLipVisionTransformer(self.config.vision_config)
+ setattr(model, 'embed_dim', model.embeddings.embed_dim)
+ setattr(model, 'patch_size', model.embeddings.patch_size)
+ return model
+
+ def get_input_embeddings(self):
+ return self.language_model.get_input_embeddings()
+
+ def set_input_embeddings(self, value):
+ self.language_model.embed_tokens = value
+
+ def get_output_embeddings(self):
+ return self.language_model.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.language_model.lm_head = new_embeddings
+
+ def set_decoder(self, decoder):
+ self.language_model = decoder
+
+ def get_decoder(self):
+ return self.language_model
+
+ def forward_image(self, pixel_values):
+ if pixel_values is None:
+ return None
+ dtype = self.language_model.model.embed_tokens.weight.dtype
+ with paddle.no_grad():
+ image_embeds = self.vision_model(pixel_values.to(dtype),
+ output_hidden_states=True).hidden_states[-2]
+ if self.vision2text_model is not None:
+ image_embeds = self.vision2text_model(image_embeds)
+ else:
+ pass
+ return image_embeds
+
+ def forward(self, pixel_values=None, **kwargs):
+ image_embeds = self.forward_image(pixel_values)
+ return self.language_model(image_embeds=image_embeds, **kwargs)
+
+ def _decode(self, input_ids, image_embeds, media_offset, tokenizer,
+ attention_mask, decode_text=False, **kwargs):
+ terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.
+ terminators]
+ output = self.language_model.generate(
+ input_ids=input_ids,
+ image_embeds=image_embeds,
+ media_offset=media_offset,
+ pad_token_id=0, eos_token_id=terminators, attention_mask=
+ attention_mask, **kwargs)[0]
+ output = output[:, tuple(input_ids.shape)[1]:]
+ if decode_text:
+ return self._decode_text(output, tokenizer)
+ return output
+
+ def _decode_stream(self, input_ids, image_embeds, media_offset,
+ tokenizer, **kwargs):
+ terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
+# >>>>>> streamer = transformers.TextIteratorStreamer(tokenizer=tokenizer)
+ streamer = TextIteratorStreamer(tokenizer=tokenizer)
+ generation_kwargs = {'input_ids': input_ids, 'image_embeds':
+ image_embeds, 'media_offset': media_offset, 'pad_token_id': 0,
+ 'eos_token_id': terminators, 'streamer': streamer}
+ generation_kwargs.update(kwargs)
+ thread = Thread(target=self.language_model.generate, kwargs=
+ generation_kwargs)
+# >>>>>> thread.start()
+ thread.start()
+ return streamer
+
+ def _decode_text(self, result_ids, tokenizer):
+ terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.
+ terminators] + result_text = [] + for result in result_ids: + result = result[result != 0] + if result[-1] in terminators: + result = result[:-1] + result_text.append(tokenizer.decode(result).strip()) + return result_text + + def init_processor(self, tokenizer): + ip = mPLUGOwl3ImageProcessor(image_size=384) + self.processor = mPLUGOwl3Processor(image_processor=ip, tokenizer= + tokenizer) + processor = self.processor + return processor + + def generate(self, input_ids=None, pixel_values=None, media_offset=None, + attention_mask=None, tokenizer=None, stream=False, decode_text= + False, **kwargs): + assert input_ids is not None + + with paddle.no_grad(): + image_embeds = self.forward_image(pixel_values) + if stream: + result = self._decode_stream(input_ids=input_ids, + image_embeds=image_embeds, media_offset=media_offset, + tokenizer=tokenizer, **kwargs) + else: + result = self._decode(input_ids=input_ids, image_embeds= + image_embeds, media_offset=media_offset, tokenizer= + tokenizer, attention_mask=attention_mask, decode_text= + decode_text, **kwargs) + return result + + def chat(self, images, videos, messages, tokenizer, processor=None, + max_new_tokens=2048, min_new_tokens=0, sampling=True, + max_inp_length=8192, system_prompt='', stream=False, max_slice_nums + =None, use_image_id=None, **kwargs): + cut_flag = kwargs.get('kwargs', True) + if processor is None: + if self.processor is None: + processor = self.init_processor(tokenizer) + else: + processor = self.processor + inputs = processor(messages, images=images, videos=videos, + cut_enable=cut_flag) + inputs.to('cuda') + inputs.update({'tokenizer': tokenizer, 'max_new_tokens': + max_new_tokens}) + if sampling: + generation_config = {'top_p': 0.8, 'top_k': 100, 'temperature': + 0.7, 'do_sample': True} + else: + generation_config = {'num_beams': 3} + if min_new_tokens > 0: + generation_config['min_new_tokens'] = min_new_tokens + generation_config.update((k, kwargs[k]) for k in generation_config. + keys() & kwargs.keys()) + with paddle.no_grad(): + res = self.generate(**inputs, stream=stream, decode_text=True, + **generation_config) + if stream: + + def stream_gen(): + for text in res: + for term in self.terminators: + text = text.replace(term, '') + yield text + return stream_gen() + else: + answer = res[0] + return answer diff --git a/paddlemix/mPLUGOwl3/modeling_navit_siglip.py b/paddlemix/mPLUGOwl3/modeling_navit_siglip.py new file mode 100644 index 000000000..51dbb9a9b --- /dev/null +++ b/paddlemix/mPLUGOwl3/modeling_navit_siglip.py @@ -0,0 +1,870 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import paddle + +""" PyTorch Siglip model. 
""" +import math +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +from paddle import nn +from paddlenlp.transformers import PretrainedConfig +from paddlenlp.transformers.activations import ACT2FN +from paddlenlp.transformers.model_outputs import ( + BaseModelOutput, + BaseModelOutputWithPooling, + ModelOutput, +) +from paddlenlp.transformers.model_utils import PretrainedModel + +from paddlemix.utils.initializer import _calculate_fan_in_and_fan_out + +from .bert_padding import pad_input, unpad_input +from paddlemix.models.flash_attn_utils import has_flash_attn_func + +flash_attn_func, flash_attn_varlen_func = has_flash_attn_func() + +@dataclass +class PaddleAttentionMaskConverter: + """ + A utility attention mask class for Paddle that allows one to: + - Convert a 2d attention mask (batch_size, query_length) to a 4d attention mask + (batch_size, 1, query_length, key_value_length) + """ + + @staticmethod + def _expand_mask(mask: paddle.Tensor, dtype: str, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.shape + tgt_len = tgt_len if tgt_len is not None else src_len + + # Expand dimensions: [bsz, 1, 1, src_len] + expanded_mask = mask.unsqueeze([1, 2]) + + # Broadcast to target shape: [bsz, 1, tgt_len, src_len] + expanded_mask = paddle.expand(expanded_mask, shape=[bsz, 1, tgt_len, src_len]) + expanded_mask = expanded_mask.astype(dtype) + + # Invert the mask (1.0 for positions to attend to) + inverted_mask = 1.0 - expanded_mask + + # Replace 1s with large negative values + min_value = paddle.to_tensor(float("-1e9"), dtype=dtype) + inverted_mask = paddle.where(inverted_mask.astype("bool"), min_value, paddle.zeros_like(inverted_mask)) + + return inverted_mask + + +def _prepare_4d_attention_mask(mask: paddle.Tensor, dtype: str, tgt_len: Optional[int] = None): + """ + Creates a 4D attention mask from a 2D mask. 
+ + Args: + mask (paddle.Tensor): A 2D attention mask of shape (batch_size, key_value_length) + dtype (str): The dtype the created mask should have + tgt_len (int, optional): The target length the created mask should have + + Returns: + paddle.Tensor: A 4D attention mask of shape (batch_size, 1, query_length, key_value_length) + """ + return PaddleAttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len) + + +class SigLipVisionConfig(PretrainedConfig): + + model_type = "siglip_vision_model" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=16, + hidden_act="gelu", + layer_norm_eps=1e-06, + attention_dropout=0.0, + _attn_implementation="eager", + **kwargs + ): + super().__init__(**kwargs) + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self._attn_implementation = _attn_implementation + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + # cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from SigLipConfig + if config_dict.get("model_type") == "siglip": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + print( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +# _CHECKPOINT_FOR_DOC = 'google/siglip-base-patch16-224' + + +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(axis=-1, dtype="int32") + paddle.utils.try_import("warnings").warn("Now, the return shape is inconsistent with torch when as_tuple is True") + indices = paddle.nonzero(x=attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = nn.functional.pad( + x=paddle.cumsum(x=seqlens_in_batch, axis=0, dtype="int32"), pad=(1, 0), pad_from_left_axis=False + ) + return indices, cu_seqlens, max_seqlen_in_batch + + +def _trunc_normal_(tensor, mean, std, a, b): + # 确保mean是浮点数 + mean = float(mean) + std = float(std) + a = float(a) + b = float(b) + + def norm_cdf(x): + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + if mean < a - 2 * std or mean > b + 2 * std: + warnings.warn( + "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. 
The distribution of values may be incorrect.",
+            stacklevel=2,
+        )
+    l = norm_cdf((a - mean) / std)
+    u = norm_cdf((b - mean) / std)
+    tensor.uniform_(min=2 * l - 1, max=2 * u - 1)
+    if tensor.dtype in ["float16", "bfloat16"]:
+        og_dtype = tensor.dtype
+        tensor = tensor.to("float32")
+        tensor.erfinv_()
+        tensor = tensor.to(og_dtype)
+    else:
+        tensor.erfinv_()
+    tensor.multiply_(y=paddle.to_tensor(std * math.sqrt(2.0)))
+    tensor.add_(y=paddle.to_tensor(mean))
+    if tensor.dtype == "float16":
+        tensor = tensor.to("float32")
+        tensor.clip_(min=a, max=b)
+        tensor = tensor.to("float16")
+    else:
+        tensor.clip_(min=a, max=b)
+
+
+def trunc_normal_tf_(
+    tensor: paddle.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
+) -> paddle.Tensor:
+    """Fills the input Tensor with values drawn from a truncated
+    normal distribution. The values are effectively drawn from the
+    normal distribution :math:`\\mathcal{N}(\\text{mean}, \\text{std}^2)`
+    with values outside :math:`[a, b]` redrawn until they are within
+    the bounds. The method used for generating the random values works
+    best when :math:`a \\leq \\text{mean} \\leq b`.
+    NOTE: this 'tf' variant behaves closer to the TensorFlow / JAX impl, where the
+    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
+    and the result is subsequently scaled and shifted by the mean and std args.
+    Args:
+        tensor: an n-dimensional `paddle.Tensor`
+        mean: the mean of the normal distribution
+        std: the standard deviation of the normal distribution
+        a: the minimum cutoff value
+        b: the maximum cutoff value
+    """
+    with paddle.no_grad():
+        _trunc_normal_(tensor, 0, 1.0, a, b)
+        tensor.multiply_(y=paddle.to_tensor(std)).add_(y=paddle.to_tensor(mean))
+
+
+def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
+    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
+    if mode == "fan_in":
+        denom = fan_in
+    elif mode == "fan_out":
+        denom = fan_out
+    elif mode == "fan_avg":
+        denom = (fan_in + fan_out) / 2
+    variance = scale / denom
+    if distribution == "truncated_normal":
+        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.8796256610342398)
+    elif distribution == "normal":
+        with paddle.no_grad():
+            tensor.normal_(std=math.sqrt(variance))
+    elif distribution == "uniform":
+        bound = math.sqrt(3 * variance)
+        with paddle.no_grad():
+            tensor.uniform_(min=-bound, max=bound)
+    else:
+        raise ValueError(f"invalid distribution {distribution}")
+
+
+def lecun_normal_(tensor):
+    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
+
+
+def default_flax_embed_init(tensor):
+    variance_scaling_(tensor, mode="fan_in", distribution="normal")
+
+
+@dataclass
+class SiglipVisionModelOutput(ModelOutput):
+    """
+    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
+    Args:
+        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
+            The image embeddings obtained by applying the projection layer to the pooler_output.
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model. 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + image_embeds: Optional[paddle.Tensor] = None + last_hidden_state: paddle.float32 = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +class SiglipVisionEmbeddings(nn.Layer): + def __init__(self, config: SigLipVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.patch_embedding = nn.Conv2D( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + padding="valid", + ) + self.num_patches_per_side = self.image_size // self.patch_size + self.num_patches = self.num_patches_per_side**2 + self.num_positions = self.num_patches + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + + def forward( + self, + pixel_values: paddle.Tensor, + patch_attention_mask: paddle.Tensor, + tgt_sizes: Optional[paddle.Tensor] = None, + ) -> paddle.Tensor: + batch_size = pixel_values.shape[0] + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] + embeddings = patch_embeds.flatten(2).transpose([0, 2, 1]) + + max_im_h, max_im_w = pixel_values.shape[2], pixel_values.shape[3] + max_nb_patches_h, max_nb_patches_w = (max_im_h // self.patch_size, max_im_w // self.patch_size) + boundaries = paddle.arange(start=1 / self.num_patches_per_side, end=1.0, step=1 / self.num_patches_per_side) + position_ids = paddle.full(shape=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0) + for batch_idx, p_attn_mask in enumerate(patch_attention_mask): + if tgt_sizes is not None: + nb_patches_h = tgt_sizes[batch_idx][0] + nb_patches_w = tgt_sizes[batch_idx][1] + else: + nb_patches_h = p_attn_mask[:, 0].sum() + nb_patches_w = p_attn_mask[0].sum() + + fractional_coords_h = paddle.arange(start=0, end=1 - 1e-06, step=1 / nb_patches_h) + fractional_coords_w = paddle.arange(start=0, end=1 - 1e-06, step=1 / nb_patches_w) + bucket_coords_h = paddle.bucketize(x=fractional_coords_h, sorted_sequence=boundaries, right=True) + bucket_coords_w = paddle.bucketize(x=fractional_coords_w, sorted_sequence=boundaries, right=True) + pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten() + position_ids[batch_idx].scatter_( + paddle.nonzero(p_attn_mask.reshape([-1]))[:, 0], pos_ids.astype(position_ids.dtype) + ) + position_ids = position_ids.to(self.position_embedding.weight.place) + + embeddings = embeddings + self.position_embedding(position_ids.cast("int64")) + return embeddings + + +class SigLipAttention(nn.Layer): + """Multi-headed attention from 'Attention Is 
All You Need' paper"""
+
+    # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = config.attention_dropout
+
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        attention_mask: Optional[paddle.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        batch_size, q_len, _ = hidden_states.shape
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.reshape([batch_size, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3])
+        key_states = key_states.reshape([batch_size, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3])
+        value_states = value_states.reshape([batch_size, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3])
+
+        k_v_seq_len = key_states.shape[-2]
+        attn_weights = paddle.matmul(query_states, key_states.transpose([0, 1, 3, 2])) * self.scale
+
+        if attn_weights.shape != [batch_size, self.num_heads, q_len, k_v_seq_len]:
+            raise ValueError(
+                f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
+                f" {attn_weights.shape}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.shape != [batch_size, 1, q_len, k_v_seq_len]:
+                raise ValueError(
+                    f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.shape}"
+                )
+            attn_weights = attn_weights + attention_mask
+
+        # softmax (kept in the working dtype; the fp32 upcast of the torch reference is not applied here)
+        attn_weights = nn.functional.softmax(attn_weights, axis=-1)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_output = paddle.matmul(attn_weights, value_states)
+
+        if attn_output.shape != [batch_size, self.num_heads, q_len, self.head_dim]:
+            raise ValueError(
+                f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.shape}"
+            )
+
+        attn_output = attn_output.transpose([0, 2, 1, 3]).contiguous()
+        attn_output = attn_output.reshape([batch_size, q_len, self.embed_dim])
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights
+
+
+class SiglipFlashAttention2(SigLipAttention):
+    """
+    SigLip flash attention module. This module inherits from `SigLipAttention`, as the weights of the module stay
+    untouched. The only required change is in the forward pass, where it needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them. 
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.is_causal = False + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + output_attentions = False + bsz, q_len, _ = hidden_states.shape + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.reshape([bsz, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1]) + key_states = key_states.reshape([bsz, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1]) + value_states = value_states.reshape([bsz, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1]) + + kv_seq_len = tuple(key_states.shape)[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + query_states = query_states.transpose([0, 2, 1]) + key_states = key_states.transpose([0, 2, 1]) + value_states = value_states.transpose([0, 2, 1]) + + dropout_rate = self.dropout if self.training else 0.0 + input_dtype = query_states.dtype + if input_dtype == paddle.float32: + if paddle.amp.is_auto_cast_enabled(): + target_dtype = paddle.amp.get_default_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + attn_output = self._flash_attention_forward( + query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate + ) + attn_output = attn_output.reshape(bsz, q_len, self.embed_dim).contiguous() + attn_output = self.out_proj(attn_output) + if not output_attentions: + attn_weights = None + return attn_output, attn_weights + + def _flash_attention_forward( + self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`paddle.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`paddle.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`paddle.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`paddle.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. 
Default to 1 / sqrt(head_dim) + """ + # Contains at least one padding token in the sequence + causal = self.is_causal and query_length != 1 + + head_dim = query_states.shape[-1] + softmax_scale = head_dim**-0.5 # TODO: 需要手动加上 + + if attention_mask is not None: + batch_size = query_states.shape[0] # [2, 3383, 16, 128] + + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = unpad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( # TODO: flash_attn_unpadded + query_states, # [5998, 16, 128] + key_states, # [5998, 8, 128] + value_states, # [5998, 8, 128] + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + scale=softmax_scale, # not softmax_scale= + dropout=dropout, + causal=causal, + )[0] + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + causal=causal, # no softmax_scale= + )[0] + + return attn_output + + +class SigLipMLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class SigLipEncoderLayer(nn.Layer): + def __init__(self, config: SigLipVisionConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = SigLipAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + self.mlp = SigLipMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + + # Ignore copy + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + """ + Args: + hidden_states (`paddle.Tensor`): + Input to the layer of shape `(batch, seq_len, embed_dim)`. + attention_mask (`paddle.Tensor`): + Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class SigLipPreTrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = SigLipVisionConfig + base_model_prefix = "siglip" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, SiglipVisionEmbeddings): + width = self.config.hidden_size + init_Normal = nn.initializer.Normal(std=1 / np.sqrt(width)) + init_Normal(module.position_embedding.weight) + elif isinstance(module, nn.Embedding): + default_flax_embed_init(module.weight) + elif isinstance(module, SigLipAttention): + # 初始化投影层权重 + for proj in [module.q_proj, module.k_proj, module.v_proj, module.out_proj]: + init_Normal = nn.initializer.Normal() + init_Normal(proj.weight) + # 使用assign替代原地操作初始化偏置 + if hasattr(proj, "bias") and proj.bias is not None: + proj.bias.set_value(paddle.zeros_like(proj.bias)) + + elif isinstance(module, SigLipMLP): + # 初始化FC层权重 + init_Normal = nn.initializer.Normal() + init_Normal(module.fc1.weight) + init_Normal(module.fc2.weight) + + # 使用assign初始化偏置 + if hasattr(module.fc1, "bias") and module.fc1.bias is not None: + module.fc1.bias.set_value(paddle.normal(shape=module.fc1.bias.shape, mean=0.0, std=1e-06)) + if hasattr(module.fc2, "bias") and module.fc2.bias is not None: + module.fc2.bias.set_value(paddle.normal(shape=module.fc2.bias.shape, mean=0.0, std=1e-06)) + + elif isinstance(module, (nn.Linear, nn.Conv2D)): + lecun_normal_(module.weight) + if module.bias is not None: + module.bias.set_value(paddle.zeros_like(module.bias)) + + elif isinstance(module, nn.LayerNorm): + # 使用set_value替代原地操作 + if module.bias is not None: + module.bias.set_value(paddle.zeros_like(module.bias)) + if module.weight is not None: + module.weight.set_value(paddle.ones_like(module.weight)) + + +SIGLIP_START_DOCSTRING = """ + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + Parameters: + config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" +SIGLIP_VISION_INPUTS_DOCSTRING = """ + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +class SigLipEncoder(nn.Layer): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`SiglipEncoderLayer`]. 
+ Args: + config: SiglipConfig + """ + + def __init__(self, config: SigLipVisionConfig): + super().__init__() + self.config = config + self.layers = nn.LayerList(sublayers=[SigLipEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = True, + ) -> Union[Tuple, BaseModelOutput]: + """ + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + hidden_states = inputs_embeds + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, hidden_states, attention_mask, output_attentions + ) + else: + layer_outputs = encoder_layer(hidden_states, attention_mask, output_attentions=output_attentions) + hidden_states = layer_outputs[0] + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +# Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position +def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: paddle.Tensor, + sequence_length: int, + target_length: int, + dtype: paddle.dtype, + min_dtype: float, + cache_position: paddle.Tensor, + batch_size: int, +): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 
4D, do nothing. + + Args: + attention_mask (`paddle.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`paddle.dtype`): + The dtype to use for the 4D attention mask. + min_dtype (`float`): + The minimum value representable with the dtype `dtype`. + cache_position (`paddle.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`paddle.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + causal_mask = paddle.full([sequence_length, target_length], fill_value=min_dtype, dtype=dtype) + if sequence_length != 1: + causal_mask = paddle.triu(x=causal_mask, diagonal=1) + causal_mask *= paddle.arange(target_length) > cache_position.reshape([-1, 1]) + causal_mask = causal_mask[None, None, :, :].expand(shape=[batch_size, 1, -1, -1]) + if attention_mask is not None: + causal_mask = causal_mask.clone() + mask_length = tuple(attention_mask.shape)[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + mask=padding_mask, value=min_dtype + ) + + return causal_mask + + +class SigLipVisionTransformer(SigLipPreTrainedModel): + config_class = SigLipVisionConfig + main_input_name = "pixel_values" + _supports_flash_attn_2 = True + + def __init__(self, config: SigLipVisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + self.embeddings = SiglipVisionEmbeddings(config) + self.encoder = SigLipEncoder(config) + self.post_layernorm = nn.LayerNorm(normalized_shape=embed_dim, epsilon=config.layer_norm_eps) + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + + # self.post_init() + + def get_input_embeddings(self) -> nn.Layer: + return self.embeddings.patch_embedding + + def forward( + self, + pixel_values, + patch_attention_mask: Optional[paddle.Tensor] = None, + tgt_sizes: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = True, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + """ + Returns: + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + batch_size = pixel_values.shape[0] + if patch_attention_mask is None: + patch_attention_mask = paddle.ones( + shape=( + batch_size, + pixel_values.shape[2] // self.config.patch_size, + pixel_values.shape[3] // self.config.patch_size, + ), + dtype="bool", + ) + + hidden_states = self.embeddings( + pixel_values=pixel_values, patch_attention_mask=patch_attention_mask, tgt_sizes=tgt_sizes + ) + patch_attention_mask = patch_attention_mask.reshape([batch_size, 
-1]) + if not paddle.any(x=~patch_attention_mask): + attention_mask = None + else: + attention_mask = ( + _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype) + if not self._use_flash_attention_2 + else patch_attention_mask + ) + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + if not return_dict: + return (last_hidden_state, None) + encoder_outputs[1:] + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=None, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) diff --git a/paddlemix/mPLUGOwl3/processing_mplugowl3.py b/paddlemix/mPLUGOwl3/processing_mplugowl3.py new file mode 100644 index 000000000..9a7cb06ec --- /dev/null +++ b/paddlemix/mPLUGOwl3/processing_mplugowl3.py @@ -0,0 +1,348 @@ +import sys +sys.path.append('/home/aistudio/paddle_test/mPLUGOwl3/utils') +import paddle_aux +import paddle +import paddlenlp +from paddlenlp.transformers.processing_utils import ProcessorMixin +""" +Processor class for mPLUGOwl3. +""" +from typing import List, Optional, Union, Dict, Any +import warnings +import re +# from .image_processing_mplugowl3 import mPLUGOwl3BatchFeature, mPLUGOwl3ImageProcessor +from .image_processing_mplugowl3 import mPLUGOwl3BatchFeature, mPLUGOwl3ImageProcessor,TensorType +OWL_MEDIA_TOKEN = ['<|image|>'] + + +class MediaIndicesHelper: + + def __init__(self, tokenizer) ->None: + self.media_position = [] + self.tokenizer = tokenizer + + def has_media(self, text, media_tokens=None): + if media_tokens is None: + media_tokens = OWL_MEDIA_TOKEN + has_media_flag = any([(media_token == text) for media_token in + media_tokens]) + if any([(media_token in text) for media_token in media_tokens]): + assert has_media_flag, text + return has_media_flag + + def add_media(self, text_chunk, text=None, tokenize_fn=None): + assert tokenize_fn is not None + assert text is not None + assert text in OWL_MEDIA_TOKEN + media_token_ids = tokenize_fn(text) + start = len(text_chunk) + end = start + len(media_token_ids) + self.media_position.append([start, end]) + text_chunk.extend(media_token_ids) + return len(media_token_ids) + + def cal_media_offset(self, input_ids): + if len(self.media_position) == 0: + return paddle.ones_like(x=input_ids) * -1000000 + media_starts = paddle.to_tensor(data=[_[0] for _ in self. + media_position]).reshape(1, -1) + rng = paddle.arange(end=tuple(input_ids.shape)[0]).reshape(-1, 1) + matrix = (rng > media_starts).sum(axis=1) + return matrix + + def len_images(self): + return len(self.media_position) + + +# >>>>>>class mPLUGOwl3Processor(transformers.processing_utils.ProcessorMixin): +class mPLUGOwl3Processor(ProcessorMixin): + """ + Args: + image_processor ([`mPLUGOwl3ImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`LlamaTokenizerWrapper`], *optional*): + The tokenizer is a required input. 
+ """ + attributes = ['image_processor', 'tokenizer'] + image_processor_class = 'mPLUGOwl3ImageProcessor' + tokenizer_class = 'AutoTokenizer' + + def __init__(self, image_processor: mPLUGOwl3ImageProcessor=None, + tokenizer=None, prompt_style='chatml', inference_mode=True, + addition_eod='<|endoftext|>'): + super().__init__(image_processor, tokenizer) + self.image_processor: mPLUGOwl3ImageProcessor + self.prompt_style = prompt_style + self.inference_mode = inference_mode + self.media_tokens = ['<|image|>'] + self.addition_eod = addition_eod + + def build_text_qwen(self, messages): + im_start, im_end = '<|im_start|>', '<|im_end|>' + text = [] + for num_turn, message in enumerate(messages): + if num_turn == 0 and message['role'] != 'system': + if self.prompt_style != 'plain': + text.append({'text': f'{im_start}system\n{im_end}', + 'label': 0}) + if message['role'] == 'system': + if self.prompt_style != 'plain': + text.append({'text': + f"{im_start}system\n{message['content']}{im_end}", + 'label': 0}) + elif message['role'] == 'user': + if self.prompt_style != 'plain': + content = f"\n{im_start}user\n{message['content']}{im_end}" + else: + content = message['content'] + pattern = '|'.join(map(re.escape, self.media_tokens)) + chunk_strs = re.split(f'({pattern})', content) + for chunk_str in chunk_strs: + text.append({'text': chunk_str, 'label': 0}) + elif message['role'] == 'assistant': + if self.prompt_style != 'plain': + text.append({'text': f'\n{im_start}assistant\n', + 'label': 0}) + text.append({'text': f"{message['content']}{im_end}", + 'label': 1}) + else: + text.append({'text': f"{message['content']}", 'label': 1}) + text.append({'text': self.addition_eod, 'label': 1}) + else: + raise NotImplementedError + if self.inference_mode: + while text and text[-1]['label'] == 1: + text.pop() + return text + + def wrapped_tokenize(self, text): + return self.tokenizer(text).input_ids + + def encode_text_sft(self, texts): + enc_chunk = [] + label_chunk = [] + enc_length = 0 + num_images = 0 + media_helper = MediaIndicesHelper(tokenizer=self.tokenizer) + for current_ti, text_chunk in enumerate(texts): + text = text_chunk['text'] + label = text_chunk['label'] + if not media_helper.has_media(text): + curr_chunk = self.wrapped_tokenize(text) + if label == 1: + enc_length += len(curr_chunk) + enc_chunk += curr_chunk + label_chunk += [label] * len(curr_chunk) + else: + enc_length += len(curr_chunk) + enc_chunk += curr_chunk + label_chunk += [label] * len(curr_chunk) + else: + add_length = media_helper.add_media(enc_chunk, text=text, + tokenize_fn=self.wrapped_tokenize) + enc_length += add_length + label_chunk += [label] * add_length + num_images += 1 + enc_chunk = paddle.to_tensor(data=enc_chunk).astype(dtype='int64') + media_offset = [] + media_before = 0 + for i, _ in enumerate([media_helper]): + mo = _.cal_media_offset(enc_chunk) + media_offset.append(paddle.concat(x=[(paddle.ones(shape=[tuple( + mo.shape)[0], 1]) * media_before).astype(dtype='int64').to( + mo.place), (mo + media_before).unsqueeze(axis=1)], axis=1)) + media_before += _.len_images() + media_offset = paddle.stack(x=media_offset, axis=0) + return {'input_ids': enc_chunk.unsqueeze(axis=0), 'media_offset': + media_offset} + + def __call__(self, messages, images=None, videos=None, max_length: + Optional[int]=None, cut_enable=True, + # return_tensors: Optional[Union[str, transformers.utils.TensorType]]=transformers.utils.TensorType.PYTORCH, **kwargs) ->mPLUGOwl3BatchFeature: + return_tensors: Optional[Union[str, 
TensorType]]=TensorType.PADDLE, **kwargs) ->mPLUGOwl3BatchFeature: + medias = [] + if videos is not None: + medias.extend([{'type': 'video', 'content': video, + 'use_video_span': True} for video in videos]) + if images is not None: + medias.extend([{'type': 'image', 'content': image} for image in + images]) + if len(medias): + image_tensor_list = [] + pattern = '(<\\|image\\|>|<\\|video\\|>)' + image_token_ptr = 0 + media_layout = [] + for message in messages: + text_list = re.split(pattern, message['content']) + text = '' + for text_content in text_list: + if text_content in ['<|image|>', '<|video|>']: + media_item = medias[image_token_ptr] + image_token_ptr += 1 + if text_content == '<|image|>': + assert media_item['type'] == 'image' + image = media_item['content'] + image_inputs = self.image_processor([image], + cut_enable=cut_enable, return_tensors= + return_tensors) + if image_inputs.get('cut_shape', None) is not None: + cut_shape = image_inputs['cut_shape'] + cut_text = (self.image_processor. + cut_prompt_template(img_token= + '<|image|>', h=cut_shape[0][0], w= + cut_shape[0][1])) + text += cut_text + image_tensor_list.append(image_inputs[ + 'pixel_values']) + else: + text += text_content + elif text_content == '<|video|>': + assert media_item['type'] == 'video' + video = media_item['content'] + use_video_span = media_item['use_video_span'] + image_tensor = self.image_processor(video, + cut_enable=False)['pixel_values'] + image_tensor_list.append(image_tensor) + num_video_frame = tuple(image_tensor.shape)[0] + if use_video_span: + text_content = ('<|start_video_frame|>' + + '<|image|>' * num_video_frame + + '<|end_video_frame|>') + else: + text_content = '<|image|>' * num_video_frame + text += text_content + else: + text += text_content + message['content'] = text + assert image_token_ptr == len(medias), (image_token_ptr, len( + medias)) + assert all(len(tuple(_.shape)) == 4 for _ in image_tensor_list), [ + tuple(_.shape) for _ in image_tensor_list] + num_image_tokens = sum([_['content'].count('<|image|>') for _ in + messages]) + num_image_shapes = sum([tuple(_.shape)[0] for _ in + image_tensor_list]) + assert num_image_tokens == num_image_shapes, (messages, [tuple( + _.shape) for _ in image_tensor_list]) + image_tensor_list = paddle.concat(x=image_tensor_list, axis=0) + text = self.build_text_qwen(messages) + model_inputs = self.encode_text_sft(text) + if len(medias) is not None: + model_inputs.update({'pixel_values': image_tensor_list}) + return mPLUGOwl3BatchFeature(model_inputs) + + def check_media(self, images, messages): + media_num = 0 if images is None else len(images) + media_count = sum([message['content'].count('<|image|>') for + message in messages]) + assert media_num == media_count + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + output_ids = args[0] + result_text = [] + for result in output_ids: + result = result[result != 0] + if result[0] == self.tokenizer.bos_id: + result = result[1:] + if result[-1] == self.tokenizer.eos_id: + result = result[:-1] + result_text.append(self.tokenizer.decode(result, *args[1:], ** + kwargs).strip()) + return result_text + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. 
+ """ + result = args[0] + result = result[result != 0] + if result[0] == self.tokenizer.bos_id: + result = result[1:] + if result[-1] == self.tokenizer.eos_id or hasattr(self.tokenizer, + 'eot_id') and result[-1] == self.tokenizer.eot_id: + result = result[:-1] + return self.tokenizer.decode(result, *args[1:], **kwargs).strip() + + def _convert(self, input_str, max_inp_length: Optional[int]=None): + if self.version > 2.5 or not getattr(self.tokenizer, + 'add_bos_token', False): + input_ids = self.tokenizer.encode(input_str) + else: + input_ids = [self.tokenizer.bos_id] + self.tokenizer.encode( + input_str) + if max_inp_length is not None: + input_ids = input_ids[:max_inp_length] + input_ids = paddle.to_tensor(data=input_ids, dtype='int32') + start_cond = (input_ids == self.tokenizer.im_start_id) | (input_ids == + self.tokenizer.slice_start_id) + end_cond = (input_ids == self.tokenizer.im_end_id) | (input_ids == + self.tokenizer.slice_end_id) +# >>>>>> image_start_tokens = torch.where(start_cond)[0] + image_start_tokens = paddle.nonzero(start_cond)[:,0] + image_start_tokens += 1 +# >>>>>> image_end_tokens = torch.where(end_cond)[0] + image_end_tokens = paddle.nonzero(end_cond)[:,0] + valid_image_nums = max(len(image_start_tokens), len(image_end_tokens)) + image_bounds = paddle.hstack(x=[image_start_tokens[: + valid_image_nums].unsqueeze(axis=-1), image_end_tokens[: + valid_image_nums].unsqueeze(axis=-1)]) + return input_ids, image_bounds + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + + image_processor_input_names)) + + def pad(self, inputs, max_length=None, padding_value=0, padding_side='left' + ): + items = [] + if isinstance(inputs[0], list): + assert isinstance(inputs[0][0], paddle.Tensor) + for it in inputs: + for tr in it: + items.append(tr) + else: + assert isinstance(inputs[0], paddle.Tensor) + items = inputs + batch_size = len(items) + shape = tuple(items[0].shape) + dim = len(shape) + assert dim <= 2 + if max_length is None: + max_length = 0 + max_length = max(max_length, max(tuple(item.shape)[-1] for item in + items)) + min_length = min(tuple(item.shape)[-1] for item in items) + dtype = items[0].dtype + if dim == 0: + return paddle.stack(x=[item for item in items], axis=0), [0] + elif dim == 1: + if max_length == min_length: + return paddle.stack(x=[item for item in items], axis=0), [0 + ] * batch_size + tensor = paddle.zeros(shape=(batch_size, max_length), dtype=dtype + ) + padding_value + else: + tensor = paddle.zeros(shape=(batch_size, max_length, shape[-1]), + dtype=dtype) + padding_value + padding_length = [] + for i, item in enumerate(items): + if dim == 1: + if padding_side == 'left': + tensor[i, -len(item):] = item.clone() + else: + tensor[i, :len(item)] = item.clone() + elif dim == 2: + if padding_side == 'left': + tensor[i, -len(item):, :] = item.clone() + else: + tensor[i, :len(item), :] = item.clone() + padding_length.append(tuple(tensor.shape)[-1] - len(item)) + return tensor, padding_length diff --git a/paddlemix/mPLUGOwl3/utils/paddle_aux.py b/paddlemix/mPLUGOwl3/utils/paddle_aux.py new file mode 100644 index 000000000..0e8d9e56c --- /dev/null +++ b/paddlemix/mPLUGOwl3/utils/paddle_aux.py @@ -0,0 +1,15 @@ + +# This file is generated by PaConvert ToolKit, please Don't edit it! 
+import paddle + +def reshape(self, *args, **kwargs): + if args: + if len(args)==1 and isinstance(args[0], (tuple, list)): + return paddle.reshape(self, args[0]) + else: + return paddle.reshape(self, list(args)) + elif kwargs: + assert 'shape' in kwargs + return paddle.reshape(self, shape=kwargs['shape']) + +setattr(paddle.Tensor, 'reshape', reshape) diff --git a/paddlemix/mPLUGOwl3/x_sdpa.py b/paddlemix/mPLUGOwl3/x_sdpa.py new file mode 100644 index 000000000..172efffba --- /dev/null +++ b/paddlemix/mPLUGOwl3/x_sdpa.py @@ -0,0 +1,49 @@ +import paddle +from icecream import ic +from einops import rearrange + + +class ScaleDotProductAttention(paddle.nn.Layer): + + def __init__(self, layer_number, causal=False, softmax_scale=None, + attention_dropout=0.0): + super().__init__() + self.layer_number = layer_number + self.causal = causal + self.softmax_scale = softmax_scale + self.dropout_p = attention_dropout + + def forward(self, q, k, v, attn_mask=None, order='sbhd'): + """Implements the multihead softmax attention. + Arguments + --------- + q, k, v: The tensor containing the query, key, and value. (B, S, H, D) + """ + if order == 'sbhd': + q, k, v = [rearrange(x, 's b h d -> b h s d').contiguous() for + x in (q, k, v)] + elif order == 'bhsd': + pass + if attn_mask is not None: + attn_mask = (~attn_mask.clone().astype(dtype='bool')).contiguous() + else: + attn_mask = None + if self.training: + if self.causal: + assert tuple(q.shape)[-2] == tuple(k.shape)[-2] + is_causal = self.causal + dropout_p = self.dropout_p + else: + if self.causal: + is_causal = tuple(q.shape)[-2] == tuple(k.shape)[-2] + else: + is_causal = self.causal + dropout_p = 0.0 + assert self.softmax_scale == None or self.softmax_scale == paddle.utils.try_import( + 'math').sqrt(q.shape[-1] + ), 'Fault: The scale parameter defaults to the square root of the last dimension of query, not allowed manually set' + o = paddle.nn.functional.scaled_dot_product_attention(query=q, key= + k, value=v, attn_mask=attn_mask, dropout_p=dropout_p, is_causal + =is_causal) + o = rearrange(o, 'B Head L D -> L B (Head D)').contiguous() + return o From 862dfca8a77560930739f6cfc3824faf70376798 Mon Sep 17 00:00:00 2001 From: WAYKEN-TSE <760301162@qq.com> Date: Fri, 6 Dec 2024 10:41:41 +0000 Subject: [PATCH 2/8] =?UTF-8?q?mp3model=E6=B5=8B=E8=AF=95=E6=97=B6flash-at?= =?UTF-8?q?ten=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .pre-commit-config.yaml | 45 - build_env.sh | 6 +- paddlemix/mPLUGOwl3/__init__.py | 6 +- .../mPLUGOwl3/configuration_hyper_qwen2.py | 50 +- .../mPLUGOwl3/configuration_mplugowl3.py | 52 +- .../mPLUGOwl3/image_processing_mplugowl3.py | 480 ++++--- paddlemix/mPLUGOwl3/imagetest.py | 43 +- paddlemix/mPLUGOwl3/modeling_hyper_qwen2.py | 1182 +++++++++-------- paddlemix/mPLUGOwl3/modeling_mplugowl3.py | 208 +-- paddlemix/mPLUGOwl3/modeling_navit_siglip.py | 5 +- paddlemix/mPLUGOwl3/processing_mplugowl3.py | 299 +++-- paddlemix/mPLUGOwl3/utils/paddle_aux.py | 23 +- paddlemix/mPLUGOwl3/x_sdpa.py | 45 +- 13 files changed, 1366 insertions(+), 1078 deletions(-) delete mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index b358ac0fc..000000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,45 +0,0 @@ -repos: -# For Python files -- repo: https://github.com/psf/black.git - rev: 22.8.0 - hooks: - - id: black - files: \.(py|pyi)$ - additional_dependencies: [toml] -- repo: https://github.com/PyCQA/isort - 
rev: 5.11.5 - hooks: - - id: isort -- repo: https://github.com/PyCQA/flake8 - rev: 4.0.1 - hooks: - - id: flake8 -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.1.0 - hooks: - - id: check-merge-conflict - - id: check-symlinks - - id: detect-private-key - files: (?!.*paddle)^.*$ - - id: end-of-file-fixer - files: \.md$ - - id: trailing-whitespace - files: \.md$ -- repo: https://github.com/Lucas-C/pre-commit-hooks - rev: v1.1.14 - hooks: - - id: forbid-crlf - files: \.md$ - - id: remove-crlf - files: \.md$ - - id: forbid-tabs - files: \.md$ - - id: remove-tabs - files: \.md$ -- repo: local - hooks: - - id: copyright_checker - name: copyright_checker - entry: python .copyright.hook - language: system - files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps|py|sh)$ diff --git a/build_env.sh b/build_env.sh index b963f56cf..a1b2829b2 100644 --- a/build_env.sh +++ b/build_env.sh @@ -21,19 +21,19 @@ echo "开始安装 PaddleMIX 及其依赖..." # 安装 PaddleMIX echo "安装 PaddleMIX..." -pip install -e . +pip install -e . -i https://mirrors.aliyun.com/pypi/simple/ # 安装 ppdiffusers echo "安装 ppdiffusers..." cd ppdiffusers -pip install -e . +pip install -e . -i https://mirrors.aliyun.com/pypi/simple/ cd .. #注:ppdiffusers部分模型需要依赖 CUDA 11.2 及以上版本,如果本地机器不符合要求,建议前往 [AI Studio](https://aistudio.baidu.com/index) 进行模型训练、推理任务。 #如果希望使用**bf16**训练推理,请使用支持**bf16**的GPU,如A100。 # 安装依赖包 echo "安装依赖包..." -pip install -r requirements.txt +pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ # 安装自定义算子,非CUDA环境(例如昇腾环境)则跳过 if command -v nvcc > /dev/null 2>&1; then diff --git a/paddlemix/mPLUGOwl3/__init__.py b/paddlemix/mPLUGOwl3/__init__.py index 6dc187d0d..9c2aaca31 100644 --- a/paddlemix/mPLUGOwl3/__init__.py +++ b/paddlemix/mPLUGOwl3/__init__.py @@ -11,12 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from .bert_padding import * from .configuration_hyper_qwen2 import * from .configuration_mplugowl3 import * from .image_processing_mplugowl3 import * -from .modeling_navit_siglip import * from .modeling_hyper_qwen2 import * from .modeling_mplugowl3 import * +from .modeling_navit_siglip import * from .processing_mplugowl3 import * -from .bert_padding import * -from .x_sdpa import * \ No newline at end of file +from .x_sdpa import * diff --git a/paddlemix/mPLUGOwl3/configuration_hyper_qwen2.py b/paddlemix/mPLUGOwl3/configuration_hyper_qwen2.py index ab5f73ec8..caec5d27b 100644 --- a/paddlemix/mPLUGOwl3/configuration_hyper_qwen2.py +++ b/paddlemix/mPLUGOwl3/configuration_hyper_qwen2.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import paddlenlp from paddlenlp.transformers import PretrainedConfig @@ -69,17 +83,33 @@ class HyperQwen2Config(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" - model_type = 'qwen2' - keys_to_ignore_at_inference = ['past_key_values'] - def __init__(self, vocab_size=151936, hidden_size=4096, - intermediate_size=22016, num_hidden_layers=32, num_attention_heads= - 32, num_key_value_heads=32, hidden_act='silu', - max_position_embeddings=32768, initializer_range=0.02, rms_norm_eps - =1e-06, use_cache=True, tie_word_embeddings=False, rope_theta= - 10000.0, use_sliding_window=False, sliding_window=4096, - max_window_layers=28, attention_dropout=0.0, hyper_layers=[1, 9, 17, - 25], _attn_implementation="sdpa",**kwargs): + model_type = "qwen2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=151936, + hidden_size=4096, + intermediate_size=22016, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=32, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + attention_dropout=0.0, + hyper_layers=[1, 9, 17, 25], + _attn_implementation="sdpa", + **kwargs + ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size diff --git a/paddlemix/mPLUGOwl3/configuration_mplugowl3.py b/paddlemix/mPLUGOwl3/configuration_mplugowl3.py index 8dcbcf135..7858154fc 100644 --- a/paddlemix/mPLUGOwl3/configuration_mplugowl3.py +++ b/paddlemix/mPLUGOwl3/configuration_mplugowl3.py @@ -1,35 +1,59 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import os + import paddlenlp + """ mPLUGOwl3 model configuration""" # from paddlenlp.transformers import PretrainedConfig, Qwen2Config from typing import Union -from .configuration_hyper_qwen2 import HyperQwen2Config + # logger = paddle.utils.try_import('logging').getLogger(name=__name__) from paddlemix.utils.log import logger + +from .configuration_hyper_qwen2 import HyperQwen2Config from .modeling_navit_siglip import SigLipVisionConfig class mPLUGOwl3Config(HyperQwen2Config): - model_type = 'mplugowl3' - keys_to_ignore_at_inference = ['past_key_values'] - default_vision_config = {'hidden_size': 1152, 'image_size': 384, - 'intermediate_size': 4304, 'model_type': 'siglip_vision_model', - 'num_attention_heads': 16, 'num_hidden_layers': 27, 'patch_size': 14} + model_type = "mplugowl3" + keys_to_ignore_at_inference = ["past_key_values"] + default_vision_config = { + "hidden_size": 1152, + "image_size": 384, + "intermediate_size": 4304, + "model_type": "siglip_vision_model", + "num_attention_heads": 16, + "num_hidden_layers": 27, + "patch_size": 14, + } def __init__(self, use_cache=True, vision_config=None, **kwargs): self.use_cache = use_cache if vision_config is None: -# >>>>>> self.vision_config = (transformers.models.siglip. -# configuration_siglip.SiglipVisionConfig(**self. -# default_vision_config)) + # >>>>>> self.vision_config = (transformers.models.siglip. + # configuration_siglip.SiglipVisionConfig(**self. + # default_vision_config)) self.vision_config = SigLipVisionConfig(**self.default_vision_config) - logger.info('vision_config is None, using default vision config') + logger.info("vision_config is None, using default vision config") elif isinstance(vision_config, dict): -# >>>>>> self.vision_config = (transformers.models.siglip. - # configuration_siglip.SiglipVisionConfig(**vision_config)) + # >>>>>> self.vision_config = (transformers.models.siglip. + # configuration_siglip.SiglipVisionConfig(**vision_config)) self.vision_config = SigLipVisionConfig(**vision_config) -# >>>>>> elif isinstance(vision_config, transformers.models.siglip. -# configuration_siglip.SiglipVisionConfig): + # >>>>>> elif isinstance(vision_config, transformers.models.siglip. + # configuration_siglip.SiglipVisionConfig): elif isinstance(vision_config, SigLipVisionConfig): self.vision_config = vision_config self.image_size = self.vision_config.image_size diff --git a/paddlemix/mPLUGOwl3/image_processing_mplugowl3.py b/paddlemix/mPLUGOwl3/image_processing_mplugowl3.py index 7a2082677..de801016d 100644 --- a/paddlemix/mPLUGOwl3/image_processing_mplugowl3.py +++ b/paddlemix/mPLUGOwl3/image_processing_mplugowl3.py @@ -1,20 +1,40 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import sys -sys.path.append('/home/aistudio/paddle_test/mPLUGOwl3/utils') + +sys.path.append("/home/aistudio/paddle_test/mPLUGOwl3/utils") +import math +import random +from enum import Enum +from typing import Any, Dict, List, Optional, Union + +import numpy as np + # import paddle_aux import paddle -import paddlenlp -from paddlenlp.transformers.image_processing_utils import BatchFeature, BaseImageProcessor import paddle.nn.functional as F -import random -from typing import Optional, Union, Dict, Any, List -from einops import rearrange, repeat -import math +import paddlenlp +import PIL import PIL.Image import PIL.ImageSequence -import numpy as np -import PIL +from einops import rearrange, repeat +from paddlenlp.transformers.image_processing_utils import ( + BaseImageProcessor, + BatchFeature, +) from PIL import Image -from enum import Enum def recursive_converter(converter, value): @@ -35,20 +55,22 @@ def box_area(boxes): area = width * height return area + def custom_max(a, b): return paddle.where(a > b, a, b) + def custom_min(a, b): return paddle.where(a < b, a, b) def box_iou(boxes1, area1, boxes2, eps=1e-05): -# >>>>>> area2 = torchvision.ops.boxes.box_area(boxes2) - area1 = area1.astype('float32') - boxes1 = boxes1.astype('float32') - boxes2 = boxes2.astype('float32') + # >>>>>> area2 = torchvision.ops.boxes.box_area(boxes2) + area1 = area1.astype("float32") + boxes1 = boxes1.astype("float32") + boxes2 = boxes2.astype("float32") - area2 = box_area(boxes2).astype('float32') + area2 = box_area(boxes2).astype("float32") lt = custom_max(boxes1[:, None, :2], boxes2[:, :2]) rb = custom_min(boxes1[:, None, 2:], boxes2[:, 2:]) wh = (rb - lt).clip(min=0) @@ -58,40 +80,87 @@ def box_iou(boxes1, area1, boxes2, eps=1e-05): return iou, union -available_anchor_strategy = ['docowl', 'random', 'highest', 'last', 'llava'] -grid_dict = {'grid_33': [(1, 1), (1, 2), (2, 1), (1, 3), (3, 1), (2, 2), (1, - 4), (4, 1), (1, 5), (5, 1), (1, 6), (6, 1), (2, 3), (3, 2), (1, 7), (7, - 1), (4, 2), (2, 4), (1, 8), (8, 1), (3, 3), (1, 9), (9, 1)], - 'grid_squ_3x3': [(1, 1), (2, 2), (3, 3)], 'grid_squ_4': [(2, 2), (1, 3), - (1, 4), (3, 1), (4, 1)], 'grid_squ_6': [(2, 2), (1, 3), (1, 4), (3, 1), - (4, 1), (2, 3), (3, 2)], 'grid_squ_2': [(2, 1)], 'grid_squ_9': [(1, 1), - (1, 2), (2, 1), (1, 3), (3, 1), (2, 2), (1, 4), (4, 1), (1, 5), (5, 1), - (1, 6), (6, 1), (2, 3), (3, 2), (1, 7), (7, 1), (4, 2), (2, 4), (1, 8), - (8, 1), (3, 3), (1, 9), (9, 1)]} -cut_prompt_template_dict = {'v0': lambda img_token, h, w: f''.join([ - f'{img_token}' for i in range(h) for j in range(w)]), 'v1': lambda - img_token, h, w: f'Cut to {h} rows {w} columns, ' + ' '.join([ - f'subimg({i},{j}){img_token}' for i in range(h) for j in range(w)]), - 'v1_global': lambda img_token, h, w: - f'Cut to {h} rows {w} columns with a global view, ' + ' '.join([ - f'subimg({i},{j}){img_token}' for i in range(h) for j in range(w)] + [ - f'global_view{img_token}']), 'v2_global': lambda img_token, h, w: - f"""Cut to {h} rows {w} columns with a global view -""" + '\n'.join([' ' - .join([f'subimg({i},{j}){img_token}' for j in range(w)]) for i in range - (h)]) + f""" -global_view{img_token}""", 'v3': lambda img_token, h, w: - f'<|start_cut|>{h}*{w}' + ' '.join([f'{img_token}' for i in range(h) for - j in range(w)]) + '<|end_cut|>', 'v3_global': lambda img_token, h, w: - f"""<|start_cut|>{h}*{w} -""" + '\n'.join([' '.join([f'{img_token}' for - j in range(w)]) for i in range(h)]) + f""" -{img_token}<|end_cut|>"""} +available_anchor_strategy = ["docowl", "random", 
"highest", "last", "llava"] +grid_dict = { + "grid_33": [ + (1, 1), + (1, 2), + (2, 1), + (1, 3), + (3, 1), + (2, 2), + (1, 4), + (4, 1), + (1, 5), + (5, 1), + (1, 6), + (6, 1), + (2, 3), + (3, 2), + (1, 7), + (7, 1), + (4, 2), + (2, 4), + (1, 8), + (8, 1), + (3, 3), + (1, 9), + (9, 1), + ], + "grid_squ_3x3": [(1, 1), (2, 2), (3, 3)], + "grid_squ_4": [(2, 2), (1, 3), (1, 4), (3, 1), (4, 1)], + "grid_squ_6": [(2, 2), (1, 3), (1, 4), (3, 1), (4, 1), (2, 3), (3, 2)], + "grid_squ_2": [(2, 1)], + "grid_squ_9": [ + (1, 1), + (1, 2), + (2, 1), + (1, 3), + (3, 1), + (2, 2), + (1, 4), + (4, 1), + (1, 5), + (5, 1), + (1, 6), + (6, 1), + (2, 3), + (3, 2), + (1, 7), + (7, 1), + (4, 2), + (2, 4), + (1, 8), + (8, 1), + (3, 3), + (1, 9), + (9, 1), + ], +} +cut_prompt_template_dict = { + "v0": lambda img_token, h, w: f"".join([f"{img_token}" for i in range(h) for j in range(w)]), + "v1": lambda img_token, h, w: f"Cut to {h} rows {w} columns, " + + " ".join([f"subimg({i},{j}){img_token}" for i in range(h) for j in range(w)]), + "v1_global": lambda img_token, h, w: f"Cut to {h} rows {w} columns with a global view, " + + " ".join([f"subimg({i},{j}){img_token}" for i in range(h) for j in range(w)] + [f"global_view{img_token}"]), + "v2_global": lambda img_token, h, w: f"""Cut to {h} rows {w} columns with a global view +""" + + "\n".join([" ".join([f"subimg({i},{j}){img_token}" for j in range(w)]) for i in range(h)]) + + f""" +global_view{img_token}""", + "v3": lambda img_token, h, w: f"<|start_cut|>{h}*{w}" + + " ".join([f"{img_token}" for i in range(h) for j in range(w)]) + + "<|end_cut|>", + "v3_global": lambda img_token, h, w: f"""<|start_cut|>{h}*{w} +""" + + "\n".join([" ".join([f"{img_token}" for j in range(w)]) for i in range(h)]) + + f""" +{img_token}<|end_cut|>""", +} def anchor_rank(anchors, anchors_areas, input_image_size, eps=1e-05): - input_image_bbox = paddle.to_tensor(data=[0, 0, input_image_size[1], - input_image_size[0]]).unsqueeze(axis=0) + input_image_bbox = paddle.to_tensor(data=[0, 0, input_image_size[1], input_image_size[0]]).unsqueeze(axis=0) boxes1 = anchors boxes2 = input_image_bbox boxes3 = anchors.clone() @@ -121,18 +190,18 @@ def select_best_resolution(anchors, anchors_areas, input_image_size): original_width, original_height = original_size best_fit = None max_effective_resolution = 0 - min_wasted_resolution = float('inf') + min_wasted_resolution = float("inf") index = 0 for i, (width, height) in enumerate(possible_resolutions): scale = min(width / original_width, height / original_height) - downscaled_width, downscaled_height = int(original_width * scale), int( - original_height * scale) - effective_resolution = min(downscaled_width * downscaled_height, - original_width * original_height) + downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale) + effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height) wasted_resolution = width * height - effective_resolution - if (effective_resolution > max_effective_resolution or - effective_resolution == max_effective_resolution and - wasted_resolution < min_wasted_resolution): + if ( + effective_resolution > max_effective_resolution + or effective_resolution == max_effective_resolution + and wasted_resolution < min_wasted_resolution + ): max_effective_resolution = effective_resolution min_wasted_resolution = wasted_resolution best_fit = width, height @@ -144,28 +213,28 @@ def build_cut_shape_indices(cut_shape): cut_shape_indices = [] for shape in cut_shape: n = 
shape[0] * shape[1]
-        indices = paddle.concat(x=[repeat(paddle.to_tensor(data=shape),
-            'l -> n l', n=n), paddle.arange(end=n).unsqueeze(axis=1)], axis=1)
+        indices = paddle.concat(
+            x=[repeat(paddle.to_tensor(data=shape), "l -> n l", n=n), paddle.arange(end=n).unsqueeze(axis=1)], axis=1
+        )
         assert tuple(indices.shape)[0] == n
         assert tuple(indices.shape)[1] == 3
         cut_shape_indices.append(indices)
-    cut_shape_indices = paddle.concat(x=cut_shape_indices, axis=0).astype(dtype
-        ='int64')
+    cut_shape_indices = paddle.concat(x=cut_shape_indices, axis=0).astype(dtype="int64")
     return cut_shape_indices


 class AnchorResize(paddle.nn.Layer):

-# >>>>>> def __init__(self, image_size, anchors, interpolation=torchvision.
-# transforms.transforms.InterpolationMode.BILINEAR, antialias=None,
-# anchor_strategy='docowl'):
-    def __init__(self, image_size, anchors, interpolation='bilinear', antialias=None,
-        anchor_strategy='docowl'):
+    # >>>>>> def __init__(self, image_size, anchors, interpolation=torchvision.
+    # transforms.transforms.InterpolationMode.BILINEAR, antialias=None,
+    # anchor_strategy='docowl'):
+    def __init__(self, image_size, anchors, interpolation="bilinear", antialias=None, anchor_strategy="docowl"):
         super().__init__()
         self.image_size = image_size
-        self.anchors = paddle.to_tensor(data=[[0, 0, _[1] * image_size[1],
-            _[0] * image_size[0]] for _ in anchors], stop_gradient=not False)
-# >>>>>> self.anchor_areas = torchvision.ops.boxes.box_area(self.anchors)
+        self.anchors = paddle.to_tensor(
+            data=[[0, 0, _[1] * image_size[1], _[0] * image_size[0]] for _ in anchors], stop_gradient=True
+        )
+        # >>>>>> self.anchor_areas = torchvision.ops.boxes.box_area(self.anchors)
         self.anchor_areas = box_area(self.anchors)
         self.interpolation = interpolation
         self.antialias = antialias
@@ -173,13 +242,13 @@ def __init__(self, image_size, anchors, interpolation='bilinear', antialias=None
         assert self.anchor_strategy in available_anchor_strategy

     def resize_global(self, img):
-# >>>>>> return torchvision.transforms.functional.resize(img, self.
-# image_size, self.interpolation, max_size=None, antialias=self.
-# antialias)
+        # >>>>>> return torchvision.transforms.functional.resize(img, self.
+        # image_size, self.interpolation, max_size=None, antialias=self.
+        # antialias)
         image_np = np.array(img)
-        image_tensor = paddle.to_tensor(image_np, dtype='float32')
+        image_tensor = paddle.to_tensor(image_np, dtype="float32")
         image_tensor = image_tensor.transpose([2, 0, 1])  # HWC -> CHW, e.g. (3, 500, 500)
-        if self.interpolation =="bilinear" or "bicubic":
+        if self.interpolation in ("bilinear", "bicubic"):
             image_tensor = image_tensor.unsqueeze(0)  # add a batch axis, e.g. (1, 3, 500, 500)
             return F.interpolate(image_tensor, size=self.image_size, mode=self.interpolation, align_corners=False)
@@ -191,62 +260,68 @@ def forward(self, img, skip_resize=False):
         Returns:
             PIL Image or Tensor: Rescaled image.
         """
-        if self.anchor_strategy == 'docowl':
-            selected_anchor = anchor_rank(self.anchors, self.anchor_areas,
-                (img.size[1], img.size[0]))
-        elif self.anchor_strategy == 'random':
+        if self.anchor_strategy == "docowl":
+            selected_anchor = anchor_rank(self.anchors, self.anchor_areas, (img.size[1], img.size[0]))
+        elif self.anchor_strategy == "random":
             selected_anchor = random.randint(0, len(self.anchors) - 1)
-        elif self.anchor_strategy == 'highest':
-            selected_anchor = paddle.argmax(x=self.anchors[:, 2] * self.
-                anchors[:, 3] * 100 - paddle.abs(x=self.anchors[:, 2] -
-                self.anchors[:, 3]))
-        elif self.anchor_strategy == 'last':
+        elif self.anchor_strategy == "highest":
+            selected_anchor = paddle.argmax(
+                x=self.anchors[:, 2] * self.anchors[:, 3] * 100 - paddle.abs(x=self.anchors[:, 2] - self.anchors[:, 3])
+            )
+        elif self.anchor_strategy == "last":
             selected_anchor = len(self.anchors) - 1
-        elif self.anchor_strategy == 'llava':
-            selected_anchor = select_best_resolution(self.anchors, self.
-                anchor_areas, (img.size[1], img.size[0]))
+        elif self.anchor_strategy == "llava":
+            selected_anchor = select_best_resolution(self.anchors, self.anchor_areas, (img.size[1], img.size[0]))
         else:
             selected_anchor = None
         assert selected_anchor is not None
         target_size = self.anchors[selected_anchor][2:].tolist()
         if skip_resize:
             return selected_anchor
-# >>>>>> return torchvision.transforms.functional.resize(img, [target_size[1
-# ], target_size[0]], self.interpolation, max_size=None,
-# antialias=self.antialias), selected_anchor
+        # >>>>>> return torchvision.transforms.functional.resize(img, [target_size[1
+        # ], target_size[0]], self.interpolation, max_size=None,
+        # antialias=self.antialias), selected_anchor
         image_np = np.array(img)
-        image_tensor = paddle.to_tensor(image_np, dtype='float32')
+        image_tensor = paddle.to_tensor(image_np, dtype="float32")
         image_tensor = image_tensor.transpose([2, 0, 1])  # HWC -> CHW, e.g. (3, 500, 500)
-        if self.interpolation =="bilinear" or "bicubic":
+        if self.interpolation in ("bilinear", "bicubic"):
             image_tensor = image_tensor.unsqueeze(0)  # add a batch axis, e.g. (1, 3, 500, 500)
-        return F.interpolate(image_tensor, size=[target_size[1], target_size[0]], mode=self.interpolation, align_corners=False), selected_anchor
+        return (
+            F.interpolate(
+                image_tensor, size=[target_size[1], target_size[0]], mode=self.interpolation, align_corners=False
+            ),
+            selected_anchor,
+        )

-    def __repr__(self) ->str:
-        detail = (
-            f'(size={self.image_size}, anchor={self.anchors}, interpolation={self.interpolation.value}, antialias={self.antialias})'
-            )
-        return f'{self.__class__.__name__}{detail}'
+    def __repr__(self) -> str:
+        detail = f"(size={self.image_size}, anchor={self.anchors}, interpolation={self.interpolation}, antialias={self.antialias})"
+        return f"{self.__class__.__name__}{detail}"


 class CutMixin:
-
-    def __init__(self, cut_cfg={'anchors': 'grid_squ_6', 'anchor_strategy':
-        'docowl', 'cut_prompt': 'v3', 'add_global': True, 'cut_prob': 1.0}
-        ) ->None:
+    def __init__(
+        self,
+        cut_cfg={
+            "anchors": "grid_squ_6",
+            "anchor_strategy": "docowl",
+            "cut_prompt": "v3",
+            "add_global": True,
+            "cut_prob": 1.0,
+        },
+    ) -> None:
         if cut_cfg is None:
             self.cut_enable = False
             return
         else:
             self.cut_enable = True
         image_size = self.image_size
-        anchors = cut_cfg.get('anchors', 'grid_33')
-        anchor_strategy = cut_cfg.get('anchor_strategy', 'docowl')
-        cut_prompt = cut_cfg.get('cut_prompt', 'v0')
-        self.cut_prob = cut_cfg.get('cut_prob', 1.0)
-        self.force_shape_cut = cut_cfg.get('force_shape_cut', False)
-        force_shape_cut_anchors = cut_cfg.get('force_shape_cut_anchors',
-            'force_shape_cut_anchors')
-        self.add_global = cut_cfg.get('add_global', False)
+        anchors = cut_cfg.get("anchors", "grid_33")
+        anchor_strategy = cut_cfg.get("anchor_strategy", "docowl")
+        cut_prompt = cut_cfg.get("cut_prompt", "v0")
+        self.cut_prob = cut_cfg.get("cut_prob", 1.0)
+        self.force_shape_cut = cut_cfg.get("force_shape_cut", False)
+        force_shape_cut_anchors = cut_cfg.get("force_shape_cut_anchors", "force_shape_cut_anchors")
+        self.add_global = 
cut_cfg.get("add_global", False) if isinstance(image_size, int): image_size = image_size, image_size self.image_size = image_size @@ -256,29 +331,27 @@ def __init__(self, cut_cfg={'anchors': 'grid_squ_6', 'anchor_strategy': anchors = eval(anchors) self.anchors = [tuple(_) for _ in anchors] self.anchor_max = max([max(_) for _ in self.anchors]) - self.resizer = AnchorResize(image_size=image_size, anchors=anchors, - interpolation="bicubic", anchor_strategy=anchor_strategy) + self.resizer = AnchorResize( + image_size=image_size, anchors=anchors, interpolation="bicubic", anchor_strategy=anchor_strategy + ) if force_shape_cut_anchors in grid_dict: force_shape_cut_anchors = grid_dict[force_shape_cut_anchors] else: force_shape_cut_anchors = eval(force_shape_cut_anchors) - self.force_shape_cut_anchors = [tuple(_) for _ in - force_shape_cut_anchors] - self.force_shape_cut_anchors_max = max([max(_) for _ in self. - force_shape_cut_anchors]) -# >>>>>> self.old_resizer = torchvision.transforms.Resize(image_size, -# interpolation=torchvision.transforms.transforms. -# InterpolationMode.BICUBIC) - self.old_resizer = paddle.vision.transforms.Resize(size=image_size,interpolation="bicubic") -# >>>>>> self.image_transform = torchvision.transforms.Compose(self. -# image_transform.transforms[1:]) - self.image_transform = paddle.vision.transforms.Compose(self.image_transform.transforms[1:]) + self.force_shape_cut_anchors = [tuple(_) for _ in force_shape_cut_anchors] + self.force_shape_cut_anchors_max = max([max(_) for _ in self.force_shape_cut_anchors]) + # >>>>>> self.old_resizer = torchvision.transforms.Resize(image_size, + # interpolation=torchvision.transforms.transforms. + # InterpolationMode.BICUBIC) + self.old_resizer = paddle.vision.transforms.Resize(size=image_size, interpolation="bicubic") + # >>>>>> self.image_transform = torchvision.transforms.Compose(self. + # image_transform.transforms[1:]) + self.image_transform = paddle.vision.transforms.Compose(self.image_transform.transforms[1:]) if self.add_global: - self.cut_prompt_template = cut_prompt_template_dict[cut_prompt + - '_global'] + self.cut_prompt_template = cut_prompt_template_dict[cut_prompt + "_global"] else: self.cut_prompt_template = cut_prompt_template_dict[cut_prompt] - self.media_tokens = ['<|image|>', '<|video|>'] + self.media_tokens = ["<|image|>", "<|video|>"] def _process_image(self, images): new_images = [] @@ -287,31 +360,31 @@ def _process_image(self, images): print(len(images)) raw_image = image print(raw_image) - print('-'*100) + print("-" * 100) image, selected_anchor = self.resizer(image) print(image.shape) - print('-'*100) + print("-" * 100) image_input = self.image_transform(image) image_input = image_input[0] print(image_input.shape) - cut_shape.append((tuple(image_input.shape)[1] // self. - image_size[0], tuple(image_input.shape)[2] // self. - image_size[1])) - image_input = rearrange(image_input, - 'C (num_h h) (num_w w) -> (num_h num_w) C h w', h=self. - image_size[0], w=self.image_size[1]) + cut_shape.append( + (tuple(image_input.shape)[1] // self.image_size[0], tuple(image_input.shape)[2] // self.image_size[1]) + ) + image_input = rearrange( + image_input, "C (num_h h) (num_w w) -> (num_h num_w) C h w", h=self.image_size[0], w=self.image_size[1] + ) new_images.append(image_input) - print("1:",image_input.shape) + print("1:", image_input.shape) if self.add_global: - new_images.append(self.image_transform(self.resizer. 
- resize_global(raw_image))) - print("2:",new_images[1].shape) + new_images.append(self.image_transform(self.resizer.resize_global(raw_image))) + print("2:", new_images[1].shape) cut_shape.append((1, 1)) - print('cutshape:',cut_shape) + print("cutshape:", cut_shape) new_images = paddle.concat(x=new_images, axis=0) cut_shape_indices = build_cut_shape_indices(cut_shape) return new_images, cut_shape, cut_shape_indices + class TensorType(Enum): PADDLE = "paddle" TORCH = "torch" @@ -324,18 +397,19 @@ class mPLUGOwl3BatchFeature(BatchFeature): """ def __init__( - self, - data: Optional[Dict[str, Any]]=None, + self, + data: Optional[Dict[str, Any]] = None, # tensor_type:Union[None, str, transformers.utils.TensorType]=None): - tensor_type:Union[None, str, TensorType]=None): + tensor_type: Union[None, str, TensorType] = None, + ): super().__init__(data) self.convert_to_tensors(tensor_type=tensor_type) # def convert_to_tensors(self, tensor_type: Optional[Union[str,transformers.utils.TensorType]]=None): - def convert_to_tensors(self, tensor_type: Optional[Union[str,TensorType]]=None): + def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None): if tensor_type is None: return self - + is_tensor = lambda x: isinstance(x, paddle.Tensor) as_tensor = paddle.to_tensor @@ -345,39 +419,38 @@ def converter(value): tensor = as_tensor(value) return tensor except: - if key == 'overflowing_values': - raise ValueError( - 'Unable to create tensor returning overflowing values of different lengths. ' - ) + if key == "overflowing_values": + raise ValueError("Unable to create tensor returning overflowing values of different lengths. ") raise ValueError( "Unable to create tensor, you should probably activate padding with 'padding=True' to have batched tensors with the same length." 
- ) + ) + for key, value in self.items(): self[key] = recursive_converter(converter, value) return self - def to(self, *args, **kwargs) ->'mPLUGOwl3BatchFeature': -# >>>>>> transformers.utils.requires_backends(self, ['torch']) + def to(self, *args, **kwargs) -> "mPLUGOwl3BatchFeature": + # >>>>>> transformers.utils.requires_backends(self, ['torch']) def cast_tensor(v): -# if paddle.is_floating_point(x=v): -# """Class Method: *.to, can not convert, please check whether it is torch.Tensor.*/Optimizer.*/nn.Module.*/torch.distributions.Distribution.*/torch.autograd.function.FunctionCtx.*/torch.profiler.profile.*/torch.autograd.profiler.profile.*, and convert manually""" -# >>>>>> return v.to(*args, **kwargs) -# elif device is not None: -# return v.to(device=device) -# else: -# return v + # if paddle.is_floating_point(x=v): + # """Class Method: *.to, can not convert, please check whether it is torch.Tensor.*/Optimizer.*/nn.Module.*/torch.distributions.Distribution.*/torch.autograd.function.FunctionCtx.*/torch.profiler.profile.*/torch.autograd.profiler.profile.*, and convert manually""" + # >>>>>> return v.to(*args, **kwargs) + # elif device is not None: + # return v.to(device=device) + # else: + # return v if isinstance(v, paddle.Tensor): # For floating point tensors if v.dtype in [paddle.float32, paddle.float64]: - if 'dtype' in kwargs: - v = v.cast(kwargs['dtype']) - if 'place' in kwargs: - v = v.place(kwargs['place']) + if "dtype" in kwargs: + v = v.cast(kwargs["dtype"]) + if "place" in kwargs: + v = v.place(kwargs["place"]) return v # For non-floating point tensors, only handle device - elif 'place' in kwargs: - return v.place(kwargs['place']) + elif "place" in kwargs: + return v.place(kwargs["place"]) return v new_data = {} @@ -390,27 +463,27 @@ def cast_tensor(v): else: raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.") -# device = kwargs.get('device') -# if device is None and len(args) > 0: -# arg = args[0] -# # >>>>>> if transformers.utils.is_torch_dtype(arg): -# if isinstance(arg, paddle.Tensor): -# pass -# # >>>>>> elif isinstance(arg, str) or transformers.utils.is_torch_device(arg -# # ) or isinstance(arg, int): -# # device = arg -# elif isinstance(arg, str): -# # 如果是字符串,可以直接使用该字符串作为设备标识 -# device = arg -# elif isinstance(arg, (int, paddle.device.Device)): -# if isinstance(arg, int): -# device = f'gpu:{arg}' if arg >= 0 else 'cpu' -# else: -# device = str(arg) -# else: -# raise ValueError( -# f'Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.' -# ) + # device = kwargs.get('device') + # if device is None and len(args) > 0: + # arg = args[0] + # # >>>>>> if transformers.utils.is_torch_dtype(arg): + # if isinstance(arg, paddle.Tensor): + # pass + # # >>>>>> elif isinstance(arg, str) or transformers.utils.is_torch_device(arg + # # ) or isinstance(arg, int): + # # device = arg + # elif isinstance(arg, str): + # # 如果是字符串,可以直接使用该字符串作为设备标识 + # device = arg + # elif isinstance(arg, (int, paddle.device.Device)): + # if isinstance(arg, int): + # device = f'gpu:{arg}' if arg >= 0 else 'cpu' + # else: + # device = str(arg) + # else: + # raise ValueError( + # f'Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.' + # ) for k, v in self.items(): new_data[k] = recursive_converter(cast_tensor, v) self.data = new_data @@ -419,48 +492,49 @@ def cast_tensor(v): # >>>>>>class mPLUGOwl3ImageProcessor(transformers.image_processing_utils. 
# BaseImageProcessor, CutMixin): -class mPLUGOwl3ImageProcessor(BaseImageProcessor,CutMixin): - model_input_names = ['pixel_values'] +class mPLUGOwl3ImageProcessor(BaseImageProcessor, CutMixin): + model_input_names = ["pixel_values"] - def __init__(self, image_size, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5 - ], **kwargs): + def __init__(self, image_size, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], **kwargs): # super().__init__(**kwargs) self.image_size = image_size self.mean = mean self.std = std -# >>>>>> self.image_transform = torchvision.transforms.Compose([torchvision. -# transforms.Resize((image_size, image_size), interpolation=Image -# .BICUBIC), torchvision.transforms.ToTensor(), torchvision. -# transforms.Normalize(mean, std)]) - self.image_transform = paddle.vision.transforms.Compose([paddle.vision.transforms.Resize(size=(image_size, image_size),interpolation="bicubic"), - paddle.vision.transforms.ToTensor(),paddle.vision.transforms.Normalize(mean=mean,std=std)]) - - + # >>>>>> self.image_transform = torchvision.transforms.Compose([torchvision. + # transforms.Resize((image_size, image_size), interpolation=Image + # .BICUBIC), torchvision.transforms.ToTensor(), torchvision. + # transforms.Normalize(mean, std)]) + self.image_transform = paddle.vision.transforms.Compose( + [ + paddle.vision.transforms.Resize(size=(image_size, image_size), interpolation="bicubic"), + paddle.vision.transforms.ToTensor(), + paddle.vision.transforms.Normalize(mean=mean, std=std), + ] + ) CutMixin.__init__(self) - def preprocess(self, images: Union[Image.Image, List[Image.Image]], - cut_enable=True, **kwargs) ->mPLUGOwl3BatchFeature: + def preprocess( + self, images: Union[Image.Image, List[Image.Image]], cut_enable=True, **kwargs + ) -> mPLUGOwl3BatchFeature: if isinstance(images, Image.Image): images_list = [images] else: images_list = images if self.cut_enable and cut_enable: - image_data, cut_shape, cut_shape_indices = self._process_image( - images_list) + image_data, cut_shape, cut_shape_indices = self._process_image(images_list) else: - image_data = [self.image_transform(self.resizer.resize_global( - image)) for image in images_list] + image_data = [self.image_transform(self.resizer.resize_global(image)) for image in images_list] image_data = paddle.stack(x=image_data, axis=0) cut_shape = cut_shape_indices = None - return mPLUGOwl3BatchFeature(data={'pixel_values': image_data, - 'cut_shape': cut_shape, 'cut_shape_indices': cut_shape_indices}) + return mPLUGOwl3BatchFeature( + data={"pixel_values": image_data, "cut_shape": cut_shape, "cut_shape_indices": cut_shape_indices} + ) def to_dict(self): # encoder_dict = super().to_dict() encoder_dict = {} - pop_keys = ['image_transform', 'resizer', 'old_resizer', - 'cut_prompt_template'] + pop_keys = ["image_transform", "resizer", "old_resizer", "cut_prompt_template"] for pk in pop_keys: encoder_dict.pop(pk, None) return encoder_dict diff --git a/paddlemix/mPLUGOwl3/imagetest.py b/paddlemix/mPLUGOwl3/imagetest.py index 7a01a96b7..6fe5782e9 100644 --- a/paddlemix/mPLUGOwl3/imagetest.py +++ b/paddlemix/mPLUGOwl3/imagetest.py @@ -1,26 +1,43 @@ -import paddlenlp +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import paddle -from paddlenlp.transformers import PretrainedModel, AutoTokenizer -from PIL import Image +import paddlenlp from decord import VideoReader, cpu -from mPLUGOwl3.modeling_mplugowl3 import mPLUGOwl3Model from mPLUGOwl3.configuration_mplugowl3 import mPLUGOwl3Config +from mPLUGOwl3.modeling_mplugowl3 import mPLUGOwl3Model +from paddlenlp.transformers import AutoTokenizer, PretrainedModel +from PIL import Image -model_path = '/home/aistudio/paddle_test/mPLUGOwl3' +model_path = "/home/aistudio/paddle_test/mPLUGOwl3" config = mPLUGOwl3Config.from_pretrained(model_path) # print(config) model = mPLUGOwl3Model.from_pretrained(model_path, config=config, dtype="float16") -model=model.eval() +model = model.eval() tokenizer = AutoTokenizer.from_pretrained(model_path) processor = model.init_processor(tokenizer) -image = Image.new('RGB', (500, 500), color='red') -messages = [{'role': 'user', 'content': - """<|image|> -Describe this image."""}, {'role': 'assistant', 'content': ''} - ] +image = Image.new("RGB", (500, 500), color="red") +messages = [ + { + "role": "user", + "content": """<|image|> +Describe this image.""", + }, + {"role": "assistant", "content": ""}, +] inputs = processor(messages, images=[image], videos=None) # inputs.to('cuda') -inputs.update({'tokenizer': tokenizer, 'max_new_tokens': 100, 'decode_text': - True}) +inputs.update({"tokenizer": tokenizer, "max_new_tokens": 100, "decode_text": True}) g = model.generate(**inputs) print(g) diff --git a/paddlemix/mPLUGOwl3/modeling_hyper_qwen2.py b/paddlemix/mPLUGOwl3/modeling_hyper_qwen2.py index fb0d39402..a87685955 100644 --- a/paddlemix/mPLUGOwl3/modeling_hyper_qwen2.py +++ b/paddlemix/mPLUGOwl3/modeling_hyper_qwen2.py @@ -1,159 +1,163 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
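For reference alongside the smoke test above: a self-contained sketch (illustrative only, not part of the patch; the helper name v3_cut_prompt is invented here) of the string that the "v3" entry of cut_prompt_template_dict in image_processing_mplugowl3.py builds for a 2x3 cut grid:

    def v3_cut_prompt(img_token, h, w):
        # mirrors cut_prompt_template_dict["v3"]: start/end markers around one
        # media token per sub-image, separated by single spaces
        return f"<|start_cut|>{h}*{w}" + " ".join(img_token for _ in range(h * w)) + "<|end_cut|>"

    print(v3_cut_prompt("<|image|>", 2, 3))
    # <|start_cut|>2*3<|image|> <|image|> <|image|> <|image|> <|image|> <|image|><|end_cut|>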
+ import sys -sys.path.append('/home/aistudio/paddle_test/mPLUGOwl3/utils') -import paddle_aux + +sys.path.append("/home/aistudio/paddle_test/mPLUGOwl3/utils") import paddle +import paddle_aux import paddlenlp + """ PyTorch Qwen2 model.""" import inspect import math from typing import List, Optional, Tuple, Union + from einops import rearrange, repeat -from .configuration_hyper_qwen2 import HyperQwen2Config -from .bert_padding import index_first_axis, pad_input, unpad_input + +from paddlemix.models.flash_attn_utils import ( + has_flash_attn_func, + is_flash_attn_available, +) + from .activations import ACT2FN +from .bert_padding import index_first_axis, pad_input, unpad_input +from .configuration_hyper_qwen2 import HyperQwen2Config -from paddlemix.models.flash_attn_utils import has_flash_attn_func -from paddlemix.models.flash_attn_utils import is_flash_attn_available if is_flash_attn_available(): flash_attn_func, flash_attn_varlen_func = has_flash_attn_func() - _flash_supports_window_size = 'window_size' in list(inspect.signature(flash_attn_func).parameters) + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) # >>>>>>if transformers.utils.is_flash_attn_2_available(): # pass # _flash_supports_window_size = 'window_size' in list(inspect.signature( # flash_attn_func).parameters) from .x_sdpa import ScaleDotProductAttention + try: from einops import rearrange + use_flash_rotary = True - print('use flash_attn rotary') + print("use flash_attn rotary") except ImportError: use_flash_rotary = False - print('import flash_attn rotary fail') -logger = paddle.utils.try_import('logging').getLogger(name=__name__) -_CHECKPOINT_FOR_DOC = 'Qwen/Qwen2-7B-beta' -_CONFIG_FOR_DOC = 'HyperQwen2Config' + print("import flash_attn rotary fail") +logger = paddle.utils.try_import("logging").getLogger(name=__name__) +_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" +_CONFIG_FOR_DOC = "HyperQwen2Config" def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(axis=-1, dtype='int32') - paddle.utils.try_import('warnings').warn( - 'Now, the return shape is inconsistent with torch when as_tuple is True' - ) - indices = paddle.nonzero(x=attention_mask.flatten(), as_tuple=False - ).flatten() + seqlens_in_batch = attention_mask.sum(axis=-1, dtype="int32") + paddle.utils.try_import("warnings").warn("Now, the return shape is inconsistent with torch when as_tuple is True") + indices = paddle.nonzero(x=attention_mask.flatten(), as_tuple=False).flatten() max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = paddle.nn.functional.pad(x=paddle.cumsum(x= - seqlens_in_batch, axis=0, dtype='int32'), pad=(1, 0), - pad_from_left_axis=False) + cu_seqlens = paddle.nn.functional.pad( + x=paddle.cumsum(x=seqlens_in_batch, axis=0, dtype="int32"), pad=(1, 0), pad_from_left_axis=False + ) return indices, cu_seqlens, max_seqlen_in_batch class Qwen2RMSNorm(paddle.nn.Layer): - def __init__(self, hidden_size, eps=1e-06): """ Qwen2RMSNorm is equivalent to T5LayerNorm """ super().__init__() - self.weight = paddle.base.framework.EagerParamBase.from_tensor(tensor - =paddle.ones(shape=hidden_size)) + self.weight = paddle.base.framework.EagerParamBase.from_tensor(tensor=paddle.ones(shape=hidden_size)) self.variance_epsilon = eps def forward(self, hidden_states): input_dtype = hidden_states.dtype - hidden_states = hidden_states.to('float32') + hidden_states = hidden_states.to("float32") variance = hidden_states.pow(y=2).mean(axis=-1, keepdim=True) - hidden_states = hidden_states * 
paddle.rsqrt(x=variance + self. - variance_epsilon) + hidden_states = hidden_states * paddle.rsqrt(x=variance + self.variance_epsilon) return self.weight * hidden_states.to(input_dtype) class Qwen2RotaryEmbedding(paddle.nn.Layer): - - def __init__(self, dim, max_position_embeddings=2048, base=10000, - device=None): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base - inv_freq = 1.0 / self.base ** (paddle.arange(start=0, end=self.dim, - step=2, dtype='int64').astype(dtype='float32').to(device) / - self.dim) - self.register_buffer(name='inv_freq', tensor=inv_freq, persistable= - False) - self._set_cos_sin_cache(seq_len=max_position_embeddings, device= - self.inv_freq.place, dtype=paddle.get_default_dtype()) + inv_freq = 1.0 / self.base ** ( + paddle.arange(start=0, end=self.dim, step=2, dtype="int64").astype(dtype="float32").to(device) / self.dim + ) + self.register_buffer(name="inv_freq", tensor=inv_freq, persistable=False) + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.place, dtype=paddle.get_default_dtype() + ) def _set_cos_sin_cache(self, seq_len, device, dtype): self.max_seq_len_cached = seq_len - t = paddle.arange(dtype='int64', end=self.max_seq_len_cached).astype( - dtype=self.inv_freq.dtype) + t = paddle.arange(dtype="int64", end=self.max_seq_len_cached).astype(dtype=self.inv_freq.dtype) freqs = paddle.outer(x=t, y=self.inv_freq) emb = paddle.concat(x=(freqs, freqs), axis=-1) - self.register_buffer(name='cos_cached', tensor=emb.cos().to(dtype), - persistable=False) - self.register_buffer(name='sin_cached', tensor=emb.sin().to(dtype), - persistable=False) + self.register_buffer(name="cos_cached", tensor=emb.cos().to(dtype), persistable=False) + self.register_buffer(name="sin_cached", tensor=emb.sin().to(dtype), persistable=False) def forward(self, x, seq_len=None): if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.place, dtype= - x.dtype) - return self.cos_cached[:seq_len].to(dtype=x.dtype), self.sin_cached[: - seq_len].to(dtype=x.dtype) + self._set_cos_sin_cache(seq_len=seq_len, device=x.place, dtype=x.dtype) + return self.cos_cached[:seq_len].to(dtype=x.dtype), self.sin_cached[:seq_len].to(dtype=x.dtype) class RotaryEmbedding(paddle.nn.Layer): - - def __init__(self, dim, base=10000, use_fp32=False, use_outer_in_rope=False - ): + def __init__(self, dim, base=10000, use_fp32=False, use_outer_in_rope=False): super().__init__() self.dim = dim self.base = base self.use_fp32 = use_fp32 if use_fp32: - self.inv_freq = 1.0 / base ** (paddle.arange(start=0, end=dim, - step=2).astype(dtype='float32') / dim) + self.inv_freq = 1.0 / base ** (paddle.arange(start=0, end=dim, step=2).astype(dtype="float32") / dim) else: - inv_freq = 1.0 / base ** (paddle.arange(start=0, end=dim, step= - 2).astype(dtype='float32') / dim) - self.register_buffer(name='inv_freq', tensor=inv_freq) + inv_freq = 1.0 / base ** (paddle.arange(start=0, end=dim, step=2).astype(dtype="float32") / dim) + self.register_buffer(name="inv_freq", tensor=inv_freq) self._rotary_pos_emb_cache = None self._seq_len_cached = 0 self.use_outer_in_rope = use_outer_in_rope self._ntk_alpha_cached = 1.0 - def update_rotary_pos_emb_cache(self, max_seq_len, offset=0, ntk_alpha=1.0 - ): + def update_rotary_pos_emb_cache(self, max_seq_len, offset=0, ntk_alpha=1.0): seqlen = max_seq_len + offset - if (seqlen > self._seq_len_cached or 
ntk_alpha != self. - _ntk_alpha_cached): + if seqlen > self._seq_len_cached or ntk_alpha != self._ntk_alpha_cached: base = self.base * ntk_alpha ** (self.dim / (self.dim - 2)) - self.inv_freq = 1.0 / base ** (paddle.arange(start=0, end=self. - dim, step=2).astype(dtype='float32') / self.dim) + self.inv_freq = 1.0 / base ** ( + paddle.arange(start=0, end=self.dim, step=2).astype(dtype="float32") / self.dim + ) self._seq_len_cached = seqlen self._ntk_alpha_cached = ntk_alpha seq = paddle.arange(end=seqlen) if self.use_outer_in_rope: - freqs = paddle.outer(x=seq.astype(dtype=self.inv_freq.dtype - ), y=self.inv_freq) + freqs = paddle.outer(x=seq.astype(dtype=self.inv_freq.dtype), y=self.inv_freq) else: - freqs = einsum('i , j -> i j', seq.astype(dtype=self. - inv_freq.dtype), self.inv_freq) + freqs = einsum("i , j -> i j", seq.astype(dtype=self.inv_freq.dtype), self.inv_freq) emb = paddle.concat(x=(freqs, freqs), axis=-1) from einops import rearrange - self._rotary_pos_emb_cache = rearrange(emb, 'n d -> n 1 1 d') + + self._rotary_pos_emb_cache = rearrange(emb, "n d -> n 1 1 d") def forward(self, max_seq_len, offset=0, ntk_alpha=1.0): self.update_rotary_pos_emb_cache(max_seq_len, offset, ntk_alpha) - return self._rotary_pos_emb_cache[offset:offset + max_seq_len] + return self._rotary_pos_emb_cache[offset : offset + max_seq_len] def rotate_half(x): """Rotates half the hidden dims of the input.""" - x1 = x[..., :tuple(x.shape)[-1] // 2] - x2 = x[..., tuple(x.shape)[-1] // 2:] + x1 = x[..., : tuple(x.shape)[-1] // 2] + x2 = x[..., tuple(x.shape)[-1] // 2 :] return paddle.concat(x=(-x2, x1), axis=-1) @@ -187,6 +191,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): k_embed = k * cos + rotate_half(k) * sin return q_embed, k_embed + # Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position def _prepare_4d_causal_attention_mask_with_cache_position( attention_mask: paddle.Tensor, @@ -224,8 +229,8 @@ def _prepare_4d_causal_attention_mask_with_cache_position( causal_mask = paddle.full([sequence_length, target_length], fill_value=min_dtype, dtype=dtype) if sequence_length != 1: causal_mask = paddle.triu(x=causal_mask, diagonal=1) - bool_tensor=paddle.arange(target_length) > cache_position.reshape([-1, 1]) - float_tensor=float16_tensor = bool_tensor.astype(paddle.float16) + bool_tensor = paddle.arange(target_length) > cache_position.reshape([-1, 1]) + float_tensor = float16_tensor = bool_tensor.astype(paddle.float16) causal_mask *= float_tensor causal_mask = causal_mask[None, None, :, :].expand(shape=[batch_size, 1, -1, -1]) if attention_mask is not None: @@ -240,25 +245,27 @@ def _prepare_4d_causal_attention_mask_with_cache_position( class Qwen2MLP(paddle.nn.Layer): - def __init__(self, config): super().__init__() self.config = config self.hidden_size = config.hidden_size self.intermediate_size = config.intermediate_size - self.gate_proj = paddle.nn.Linear(in_features=self.hidden_size, - out_features=self.intermediate_size, bias_attr=False) - self.up_proj = paddle.nn.Linear(in_features=self.hidden_size, - out_features=self.intermediate_size, bias_attr=False) - self.down_proj = paddle.nn.Linear(in_features=self. 
- intermediate_size, out_features=self.hidden_size, bias_attr=False) + self.gate_proj = paddle.nn.Linear( + in_features=self.hidden_size, out_features=self.intermediate_size, bias_attr=False + ) + self.up_proj = paddle.nn.Linear( + in_features=self.hidden_size, out_features=self.intermediate_size, bias_attr=False + ) + self.down_proj = paddle.nn.Linear( + in_features=self.intermediate_size, out_features=self.hidden_size, bias_attr=False + ) self.act_fn = ACT2FN[config.hidden_act] def forward(self, x): return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) -def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) ->paddle.Tensor: +def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: """ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) @@ -266,10 +273,8 @@ def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) ->paddle.Tensor: batch, num_key_value_heads, slen, head_dim = tuple(hidden_states.shape) if n_rep == 1: return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(shape=[batch, - num_key_value_heads, n_rep, slen, head_dim]) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, - head_dim) + hidden_states = hidden_states[:, :, None, :, :].expand(shape=[batch, num_key_value_heads, n_rep, slen, head_dim]) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) def make_t2v_mask(media_offset_line, num_images): @@ -285,19 +290,18 @@ def make_t2v_mask(media_offset_line, num_images): def select_query(media_offset, num_queries=None): query_indices = media_offset[:, :, 1] >= 0 - assert query_indices.sum().item() % num_queries == 0, query_indices.sum( - ).item() + assert query_indices.sum().item() % num_queries == 0, query_indices.sum().item() query_indices = query_indices.nonzero() ptr = 0 while ptr < tuple(query_indices.shape)[0]: - first_query_index, last_query_index = query_indices[ptr - ], query_indices[ptr + num_queries - 1] - assert (last_query_index[1] - first_query_index[1] + 1).item( - ) == num_queries + first_query_index, last_query_index = query_indices[ptr], query_indices[ptr + num_queries - 1] + assert (last_query_index[1] - first_query_index[1] + 1).item() == num_queries assert last_query_index[0].item() == first_query_index[0].item() - batch_id, begin_i, end_i = first_query_index[0].item( - ), first_query_index[1].item(), first_query_index[1].item( - ) + num_queries + batch_id, begin_i, end_i = ( + first_query_index[0].item(), + first_query_index[1].item(), + first_query_index[1].item() + num_queries, + ) yield batch_id, begin_i, end_i ptr += num_queries @@ -307,7 +311,8 @@ def _rotate_half(x): change sign so the last dimension becomes [-odd, +even] """ from einops import rearrange - x = rearrange(x, '... (j d) -> ... j d', j=2) + + x = rearrange(x, "... (j d) -> ... j d", j=2) x1, x2 = x.unbind(axis=-2) return paddle.concat(x=(-x2, x1), axis=-1) @@ -319,23 +324,23 @@ def apply_rotary_pos_emb_core(t, freqs, use_fp32=False, debug=False): check https://kexue.fm/archives/8265 for detailed formulas """ if use_flash_rotary and use_fp32: - t_ = rearrange(t, 's b ... -> b s ...').contiguous() + t_ = rearrange(t, "s b ... 
-> b s ...").contiguous() if use_fp32: - t_ = t_.astype(dtype='float32') + t_ = t_.astype(dtype="float32") freqs = freqs.squeeze(axis=1).squeeze(axis=1) - cos = freqs[:, :tuple(freqs.shape)[-1] // 2].cos() - sin = freqs[:, :tuple(freqs.shape)[-1] // 2].sin() - output = paddle_aux.apply_rotary_emb_func(x=t_, cos=cos, sin=sin - ).astype(dtype=t.dtype) + cos = freqs[:, : tuple(freqs.shape)[-1] // 2].cos() + sin = freqs[:, : tuple(freqs.shape)[-1] // 2].sin() + output = paddle_aux.apply_rotary_emb_func(x=t_, cos=cos, sin=sin).astype(dtype=t.dtype) if debug: from icecream import ic + ic(tuple(t_.shape), tuple(freqs.shape), tuple(cos.shape)) - return rearrange(output, 'b s ... -> s b ...') + return rearrange(output, "b s ... -> s b ...") rot_dim = tuple(freqs.shape)[-1] t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:] if use_fp32: - t_ = t_.astype(dtype='float32') - t_pass_ = t_pass_.astype(dtype='float32') + t_ = t_.astype(dtype="float32") + t_pass_ = t_pass_.astype(dtype="float32") t_ = t_ * freqs.cos() + _rotate_half(t_) * freqs.sin() return paddle.concat(x=(t_, t_pass_), axis=-1).astype(dtype=t.dtype) @@ -346,15 +351,14 @@ class HyperQwen2Attention(paddle.nn.Layer): and "Generating Long Sequences with Sparse Transformers". """ - def __init__(self, config: HyperQwen2Config, layer_idx: Optional[int]= - None, is_hyper_enabed=False): + def __init__(self, config: HyperQwen2Config, layer_idx: Optional[int] = None, is_hyper_enabed=False): super().__init__() self.config = config self.layer_idx = layer_idx if layer_idx is None: logger.warning_once( - f'Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.' - ) + f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class." + ) self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.hidden_size // self.num_heads @@ -366,43 +370,44 @@ def __init__(self, config: HyperQwen2Config, layer_idx: Optional[int]= self.attention_dropout = config.attention_dropout if self.head_dim * self.num_heads != self.hidden_size: raise ValueError( - f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`: {self.num_heads}).' - ) - self.q_proj = paddle.nn.Linear(in_features=self.hidden_size, - out_features=self.num_heads * self.head_dim, bias_attr=True) - self.k_proj = paddle.nn.Linear(in_features=self.hidden_size, - out_features=self.num_key_value_heads * self.head_dim, - bias_attr=True) - self.v_proj = paddle.nn.Linear(in_features=self.hidden_size, - out_features=self.num_key_value_heads * self.head_dim, - bias_attr=True) - self.o_proj = paddle.nn.Linear(in_features=self.num_heads * self. - head_dim, out_features=self.hidden_size, bias_attr=False) - self.rotary_emb = Qwen2RotaryEmbedding(self.head_dim, - max_position_embeddings=self.max_position_embeddings, base=self - .rope_theta) - self.rotary_emb_core = RotaryEmbedding(self.head_dim, base=self. - rope_theta, use_fp32=True, use_outer_in_rope=True) + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`: {self.num_heads})." 
+ ) + self.q_proj = paddle.nn.Linear( + in_features=self.hidden_size, out_features=self.num_heads * self.head_dim, bias_attr=True + ) + self.k_proj = paddle.nn.Linear( + in_features=self.hidden_size, out_features=self.num_key_value_heads * self.head_dim, bias_attr=True + ) + self.v_proj = paddle.nn.Linear( + in_features=self.hidden_size, out_features=self.num_key_value_heads * self.head_dim, bias_attr=True + ) + self.o_proj = paddle.nn.Linear( + in_features=self.num_heads * self.head_dim, out_features=self.hidden_size, bias_attr=False + ) + self.rotary_emb = Qwen2RotaryEmbedding( + self.head_dim, max_position_embeddings=self.max_position_embeddings, base=self.rope_theta + ) + self.rotary_emb_core = RotaryEmbedding( + self.head_dim, base=self.rope_theta, use_fp32=True, use_outer_in_rope=True + ) self.is_hyper_enabed = is_hyper_enabed if self.is_hyper_enabed: - self.v_kv_proj = paddle.nn.Linear(in_features=self.hidden_size, - out_features=self.num_key_value_heads * self.head_dim * 2, - bias_attr=True) - self.gate = paddle.base.framework.EagerParamBase.from_tensor(tensor - =paddle.zeros(shape=self.hidden_size)) - self.v_core_attention_sdpa = ScaleDotProductAttention(layer_number - =-1, causal=False, attention_dropout=self.attention_dropout) + self.v_kv_proj = paddle.nn.Linear( + in_features=self.hidden_size, out_features=self.num_key_value_heads * self.head_dim * 2, bias_attr=True + ) + self.gate = paddle.base.framework.EagerParamBase.from_tensor(tensor=paddle.zeros(shape=self.hidden_size)) + self.v_core_attention_sdpa = ScaleDotProductAttention( + layer_number=-1, causal=False, attention_dropout=self.attention_dropout + ) self.visual_cache = {} def apply_mi_rope(self, key_layer, media_offset_line, length_each_img): - key_layer = rearrange(key_layer, 'b h s d -> s b h d') + key_layer = rearrange(key_layer, "b h s d -> s b h d") if self.rotary_emb_core.inv_freq.place != key_layer.place: - self.rotary_emb_core.inv_freq = self.rotary_emb_core.inv_freq.to( - key_layer.place) + self.rotary_emb_core.inv_freq = self.rotary_emb_core.inv_freq.to(key_layer.place) rotary_pos_emb_max_seq_len = self.config.max_position_embeddings ntk_alpha = 1 - rotary_pos_emb = self.rotary_emb_core(rotary_pos_emb_max_seq_len, - ntk_alpha=ntk_alpha) + rotary_pos_emb = self.rotary_emb_core(rotary_pos_emb_max_seq_len, ntk_alpha=ntk_alpha) assert rotary_pos_emb is not None if isinstance(rotary_pos_emb, tuple): rotary_pos_emb = rotary_pos_emb @@ -410,17 +415,13 @@ def apply_mi_rope(self, key_layer, media_offset_line, length_each_img): rotary_pos_emb = (rotary_pos_emb,) * 2 if rotary_pos_emb is not None: q_pos_emb, k_pos_emb = rotary_pos_emb - image_pos = (media_offset_line[1:] - media_offset_line[:-1] - ).nonzero().squeeze(axis=1) + 1 - k_pos_emb = repeat(k_pos_emb[image_pos], - 'N_img b h d -> (N_img L) b h d', L=length_each_img) - key_layer = apply_rotary_pos_emb_core(key_layer, k_pos_emb, - use_fp32=True) - key_layer = rearrange(key_layer, 's b h d -> b h s d') + image_pos = (media_offset_line[1:] - media_offset_line[:-1]).nonzero().squeeze(axis=1) + 1 + k_pos_emb = repeat(k_pos_emb[image_pos], "N_img b h d -> (N_img L) b h d", L=length_each_img) + key_layer = apply_rotary_pos_emb_core(key_layer, k_pos_emb, use_fp32=True) + key_layer = rearrange(key_layer, "s b h d -> b h s d") return key_layer - def crossattention(self, query_layer, vision_features, media_offset, - context_layer): + def crossattention(self, query_layer, vision_features, media_offset, context_layer): """ query_layer: [s b h d] vision_features: [b' lv d] @@ 
-438,61 +439,65 @@ def crossattention(self, query_layer, vision_features, media_offset, media_offset = media_offset[:, -1:] else: completion_flag = False - self.visual_cache['media_offset'] = media_offset - self.visual_cache['vision_features'] = vision_features - query_layer = rearrange(query_layer, 'L B H D -> B H L D') - assert sequence_length == tuple(media_offset.shape)[1], ( - sequence_length, tuple(media_offset.shape)) + self.visual_cache["media_offset"] = media_offset + self.visual_cache["vision_features"] = vision_features + query_layer = rearrange(query_layer, "L B H D -> B H L D") + assert sequence_length == tuple(media_offset.shape)[1], (sequence_length, tuple(media_offset.shape)) gate_value = paddle.nn.functional.sigmoid(x=self.gate) - for batch_id, begin_i, end_i in select_query(media_offset, - sequence_length): + for batch_id, begin_i, end_i in select_query(media_offset, sequence_length): assert begin_i == 0 assert end_i == sequence_length, (end_i, sequence_length) curr_offset = media_offset[batch_id, end_i - 1] if not completion_flag: - re_to_zero_media_offset = (media_offset[batch_id, :, 1] - - curr_offset[0]).to(query_layer.place) + re_to_zero_media_offset = (media_offset[batch_id, :, 1] - curr_offset[0]).to(query_layer.place) query_shift = re_to_zero_media_offset.nonzero()[0].item() - curr_mask = make_t2v_mask(re_to_zero_media_offset[ - query_shift:], num_images=curr_offset[1] - curr_offset[0]) - curr_mask = repeat(curr_mask, - 's_q s_k -> B H s_q (s_k img_l)', B=1, H=1, img_l= - length_each_img) + curr_mask = make_t2v_mask( + re_to_zero_media_offset[query_shift:], num_images=curr_offset[1] - curr_offset[0] + ) + curr_mask = repeat(curr_mask, "s_q s_k -> B H s_q (s_k img_l)", B=1, H=1, img_l=length_each_img) else: curr_mask = None query_shift = 0 - curr_query_tokens = query_layer[batch_id, :, query_shift: - ].unsqueeze(axis=0).clone().contiguous() + curr_query_tokens = query_layer[batch_id, :, query_shift:].unsqueeze(axis=0).clone().contiguous() assert curr_offset[0] < tuple(vision_features.shape)[0] assert curr_offset[1] <= tuple(vision_features.shape)[0] - curr_vision_kv: paddle.Tensor = rearrange(vision_features[ - curr_offset[0]:curr_offset[1]].clone(), - 'BL Lv (H KV D) -> KV 1 H (BL Lv) D', KV=2, H=self. 
- num_key_value_heads) + curr_vision_kv: paddle.Tensor = rearrange( + vision_features[curr_offset[0] : curr_offset[1]].clone(), + "BL Lv (H KV D) -> KV 1 H (BL Lv) D", + KV=2, + H=self.num_key_value_heads, + ) key_layer = curr_vision_kv[0].contiguous() value_layer = curr_vision_kv[1].contiguous() - key_layer = self.apply_mi_rope(key_layer, media_offset_line= - self.visual_cache['media_offset'][batch_id, :, 1] - - curr_offset[0], length_each_img=length_each_img) + key_layer = self.apply_mi_rope( + key_layer, + media_offset_line=self.visual_cache["media_offset"][batch_id, :, 1] - curr_offset[0], + length_each_img=length_each_img, + ) key_layer = repeat_kv(key_layer, self.num_key_value_groups) value_layer = repeat_kv(value_layer, self.num_key_value_groups) - v_context_layer = self.v_core_attention_sdpa(curr_query_tokens, - key_layer, value_layer, attn_mask=curr_mask, order='bhsd' - ).squeeze(axis=1) - context_layer_clone[query_shift:, batch_id] = context_layer[ - query_shift:, batch_id].clone() * (1 - gate_value - ) + v_context_layer * gate_value + v_context_layer = self.v_core_attention_sdpa( + curr_query_tokens, key_layer, value_layer, attn_mask=curr_mask, order="bhsd" + ).squeeze(axis=1) + context_layer_clone[query_shift:, batch_id] = ( + context_layer[query_shift:, batch_id].clone() * (1 - gate_value) + v_context_layer * gate_value + ) return context_layer_clone - def forward(self, hidden_states: paddle.Tensor, attention_mask: - Optional[paddle.Tensor]=None, position_ids: Optional[paddle.Tensor] - =None, image_embeds=None, media_offset=None, past_key_value: - Optional[Tuple[paddle.Tensor]]=None, output_attentions: - bool=False, use_cache: bool=False) ->Tuple[paddle.Tensor, Optional[ - paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + image_embeds=None, + media_offset=None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: raise NotImplementedError( 'We do not support eager model yet. Use attn_implementation == "flash_attention_2" or attn_implementation == "sdpa".' - ) + ) bsz, q_len, _ = tuple(hidden_states.shape) query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) @@ -522,50 +527,45 @@ def forward(self, hidden_states: paddle.Tensor, attention_mask: if past_key_value is not None: if self.layer_idx is None: raise ValueError( - f'The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} for auto-regressive decoding with k/v caching, please make sure to initialize the attention class with a layer index.' - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self - .layer_idx) + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} for auto-regressive decoding with k/v caching, please make sure to initialize the attention class with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, - key_states, cos, sin, position_ids) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: - cache_kwargs = {'sin': sin, 'cos': cos} - key_states, value_states = past_key_value.update(key_states, - value_states, self.layer_idx, cache_kwargs) + cache_kwargs = {"sin": sin, "cos": cos} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - attn_weights = paddle.matmul(x=query_states, y=key_states.transpose - (perm=paddle_aux.transpose_aux_func(key_states.ndim, 2, 3)) - ) / math.sqrt(self.head_dim) - if tuple(attn_weights.shape) != (bsz, self.num_heads, q_len, kv_seq_len - ): + attn_weights = paddle.matmul( + x=query_states, y=key_states.transpose(perm=paddle_aux.transpose_aux_func(key_states.ndim, 2, 3)) + ) / math.sqrt(self.head_dim) + if tuple(attn_weights.shape) != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( - f'Attention weights should be of size {bsz, self.num_heads, q_len, kv_seq_len}, but is {tuple(attn_weights.shape)}' - ) + f"Attention weights should be of size {bsz, self.num_heads, q_len, kv_seq_len}, but is {tuple(attn_weights.shape)}" + ) if attention_mask is not None: if tuple(attention_mask.shape) != (bsz, 1, q_len, kv_seq_len): raise ValueError( - f'Attention mask should be of size {bsz, 1, q_len, kv_seq_len}, but is {tuple(attention_mask.shape)}' - ) + f"Attention mask should be of size {bsz, 1, q_len, kv_seq_len}, but is {tuple(attention_mask.shape)}" + ) attn_weights = attn_weights + attention_mask - attn_weights = paddle.nn.functional.softmax(x=attn_weights, axis=-1, - dtype='float32').to(query_states.dtype) - attn_weights = paddle.nn.functional.dropout(x=attn_weights, p=self. - attention_dropout, training=self.training) + attn_weights = paddle.nn.functional.softmax(x=attn_weights, axis=-1, dtype="float32").to(query_states.dtype) + attn_weights = paddle.nn.functional.dropout(x=attn_weights, p=self.attention_dropout, training=self.training) attn_output = paddle.matmul(x=attn_weights, y=value_states) - if tuple(attn_output.shape) != (bsz, self.num_heads, q_len, self. - head_dim): + if tuple(attn_output.shape) != (bsz, self.num_heads, q_len, self.head_dim): raise ValueError( - f'`attn_output` should be of size {bsz, self.num_heads, q_len, self.head_dim}, but is {tuple(attn_output.shape)}' - ) - attn_output = attn_output.transpose(perm=paddle_aux. 
- transpose_aux_func(attn_output.ndim, 1, 2)).contiguous() + f"`attn_output` should be of size {bsz, self.num_heads, q_len, self.head_dim}, but is {tuple(attn_output.shape)}" + ) + attn_output = attn_output.transpose(perm=paddle_aux.transpose_aux_func(attn_output.ndim, 1, 2)).contiguous() attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - attn_output = self.crossattention(query_states.transpose(perm=[1, 0, - 1, 3]), image_embeds, media_offset, attn_output.transpose(perm= - [1, 0, 2])) + attn_output = self.crossattention( + query_states.transpose(perm=[1, 0, 1, 3]), + image_embeds, + media_offset, + attn_output.transpose(perm=[1, 0, 2]), + ) attn_output = attn_output.transpose(perm=[1, 0, 2]) attn_output = self.o_proj(attn_output) if not output_attentions: @@ -584,14 +584,21 @@ class HyperQwen2FlashAttention2(HyperQwen2Attention): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) -# >>>>>> self._flash_attn_uses_top_left_mask = (not transformers.utils. -# is_flash_attn_greater_or_equal_2_10()) - - def forward(self, hidden_states: paddle.Tensor, attention_mask: - Optional[paddle.Tensor]=None, position_ids: Optional[paddle.Tensor] - =None, image_embeds=None, media_offset=None, past_key_value: - Optional[Tuple[paddle.Tensor]]=None, output_attentions: - bool=False, use_cache: bool=False): + + # >>>>>> self._flash_attn_uses_top_left_mask = (not transformers.utils. + # is_flash_attn_greater_or_equal_2_10()) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + image_embeds=None, + media_offset=None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + ): bsz, q_len, _ = tuple(hidden_states.shape) query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) @@ -621,27 +628,29 @@ def forward(self, hidden_states: paddle.Tensor, attention_mask: if past_key_value is not None: if self.layer_idx is None: raise ValueError( - f'The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} for auto-regressive decoding with k/v caching, please make sure to initialize the attention class with a layer index.' - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self - .layer_idx) + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} for auto-regressive decoding with k/v caching, please make sure to initialize the attention class with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, - key_states, cos, sin, position_ids) - use_sliding_windows = (_flash_supports_window_size and getattr(self - .config, 'sliding_window', None) is not None and kv_seq_len > - self.config.sliding_window and self.config.use_sliding_window) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and self.config.use_sliding_window + ) if not _flash_supports_window_size: logger.warning_once( - 'The current flash attention version does not support sliding window attention, for a more memory efficient implementation make sure to upgrade flash-attn library.' - ) + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation make sure to upgrade flash-attn library." + ) if past_key_value is not None: - cache_has_contents = past_key_value.get_seq_length(self.layer_idx - ) > 0 - if (getattr(self.config, 'sliding_window', None) is not None and - kv_seq_len > self.config.sliding_window and cache_has_contents - ): + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): slicing_tokens = 1 - self.config.sliding_window past_key = past_key_value[self.layer_idx][0] past_value = past_key_value[self.layer_idx][1] @@ -649,57 +658,70 @@ def forward(self, hidden_states: paddle.Tensor, attention_mask: past_value = past_value[:, :, slicing_tokens:, :].contiguous() if tuple(past_key.shape)[-2] != self.config.sliding_window - 1: raise ValueError( - f'past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got {tuple(past_key.shape)}' - ) + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got {tuple(past_key.shape)}" + ) if attention_mask is not None: attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = paddle.concat(x=[attention_mask, - paddle.ones_like(x=attention_mask[:, -1:])], axis=-1) - cache_kwargs = {'sin': sin, 'cos': cos} - key_states, value_states = past_key_value.update(key_states, - value_states, self.layer_idx, cache_kwargs) + attention_mask = paddle.concat( + x=[attention_mask, paddle.ones_like(x=attention_mask[:, -1:])], axis=-1 + ) + cache_kwargs = {"sin": sin, "cos": cos} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) dropout_rate = 0.0 if not self.training else self.attention_dropout input_dtype = query_states.dtype - if input_dtype == 'float32': -# >>>>>> if torch.is_autocast_enabled(): -# >>>>>> target_dtype = torch.get_autocast_gpu_dtype() + if input_dtype == "float32": + # >>>>>> if torch.is_autocast_enabled(): + # >>>>>> target_dtype = torch.get_autocast_gpu_dtype() if paddle.amp.auto_cast_enabled(): - target_dtype = paddle.get_device('gpu').dtype - elif hasattr(self.config, '_pre_quantization_dtype'): + 
target_dtype = paddle.get_device("gpu").dtype + elif hasattr(self.config, "_pre_quantization_dtype"): target_dtype = self.config._pre_quantization_dtype else: target_dtype = self.q_proj.weight.dtype logger.warning_once( - f'The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in {target_dtype}.' - ) + f"The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in {target_dtype}." + ) query_states = query_states.to(target_dtype) key_states = key_states.to(target_dtype) value_states = value_states.to(target_dtype) - query_states = query_states.transpose(perm=paddle_aux. - transpose_aux_func(query_states.ndim, 1, 2)) - key_states = key_states.transpose(perm=paddle_aux. - transpose_aux_func(key_states.ndim, 1, 2)) - value_states = value_states.transpose(perm=paddle_aux. - transpose_aux_func(value_states.ndim, 1, 2)) - attn_output = self._flash_attention_forward(query_states, - key_states, value_states, attention_mask, q_len, dropout= - dropout_rate, use_sliding_windows=use_sliding_windows) - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size - ).contiguous() - attn_output = self.crossattention(query_states.transpose(perm=[1, 0, - 2, 3]), image_embeds, media_offset, attn_output.transpose(perm= - [1, 0, 2])) + query_states = query_states.transpose(perm=paddle_aux.transpose_aux_func(query_states.ndim, 1, 2)) + key_states = key_states.transpose(perm=paddle_aux.transpose_aux_func(key_states.ndim, 1, 2)) + value_states = value_states.transpose(perm=paddle_aux.transpose_aux_func(value_states.ndim, 1, 2)) + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.crossattention( + query_states.transpose(perm=[1, 0, 2, 3]), + image_embeds, + media_offset, + attn_output.transpose(perm=[1, 0, 2]), + ) attn_output = attn_output.transpose(perm=[1, 0, 2]) attn_output = self.o_proj(attn_output) if not output_attentions: attn_weights = None return attn_output, attn_weights, past_key_value - def _flash_attention_forward(self, query_states, key_states, - value_states, attention_mask, query_length, dropout=0.0, - softmax_scale=None, use_sliding_windows=False): + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): """ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token first unpad the input, then computes the attention scores and pad the final attention scores. @@ -725,81 +747,98 @@ def _flash_attention_forward(self, query_states, key_states, causal = self.is_causal else: causal = self.is_causal and query_length != 1 - if (use_sliding_windows and self.layer_idx >= self.config. 
- max_window_layers): + if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: use_sliding_windows = False if attention_mask is not None: batch_size = tuple(query_states.shape)[0] - (query_states, key_states, value_states, indices_q, cu_seq_lens, - max_seq_lens) = (self._upad_input(query_states, key_states, - value_states, attention_mask, query_length)) + (query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens) = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) cu_seqlens_q, cu_seqlens_k = cu_seq_lens max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens if not use_sliding_windows: -# >>>>>> attn_output_unpad = flash_attn.flash_attn_varlen_func( + # >>>>>> attn_output_unpad = flash_attn.flash_attn_varlen_func( attn_output_unpad = flash_attn_varlen_func( - query_states, key_states, value_states, cu_seqlens_q= - cu_seqlens_q, cu_seqlens_k=cu_seqlens_k, max_seqlen_q= - max_seqlen_in_batch_q, max_seqlen_k= - max_seqlen_in_batch_k, dropout_p=dropout, + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, # softmax_scale = softmax_scale, causal=causal) - scale = softmax_scale, causal=causal)[0] + scale=softmax_scale, + causal=causal, + )[0] else: -# >>>>>> + # >>>>>> attn_output_unpad = flash_attn_varlen_func( - query_states, key_states, value_states, cu_seqlens_q= - cu_seqlens_q, cu_seqlens_k=cu_seqlens_k, max_seqlen_q= - max_seqlen_in_batch_q, max_seqlen_k= + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k= # max_seqlen_in_batch_k, dropout_p=dropout, softmax_scale - max_seqlen_in_batch_k, dropout_p=dropout, scale - =softmax_scale, causal=causal, window_size=(self.config - .sliding_window, self.config.sliding_window))[0] -# >>>>>> attn_output = flash_attn.bert_padding.pad_input(attn_output_unpad, -# indices_q, batch_size, query_length) + max_seqlen_in_batch_k, + dropout_p=dropout, + scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + )[0] + # >>>>>> attn_output = flash_attn.bert_padding.pad_input(attn_output_unpad, + # indices_q, batch_size, query_length) attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) elif not use_sliding_windows: -# >>>>>> attn_output = flash_attn.flash_attn_func(query_states, - attn_output = flash_attn_func(query_states,key_states, + # >>>>>> attn_output = flash_attn.flash_attn_func(query_states, + attn_output = flash_attn_func( + query_states, + key_states, # value_states, dropout, softmax_scale=softmax_scale, - value_states, dropout,causal=causal)[0] + value_states, + dropout, + causal=causal, + )[0] else: -# >>>>>> attn_output = flash_attn.flash_attn_func(query_states, - attn_output = flash_attn.flash_attn_func(query_states, + # >>>>>> attn_output = flash_attn.flash_attn_func(query_states, + attn_output = flash_attn.flash_attn_func( + query_states, # key_states, value_states, dropout, softmax_scale=softmax_scale, - key_states, value_states, dropout, - causal=causal, window_size=(self.config. 
- sliding_window, self.config.sliding_window))[0] + key_states, + value_states, + dropout, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + )[0] return attn_output - def _upad_input(self, query_layer, key_layer, value_layer, - attention_mask, query_length): + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): batch_size, kv_seq_len, num_heads, head_dim = tuple(key_layer.shape) if kv_seq_len != tuple(attention_mask.shape)[-1]: attention_mask_num_tokens = tuple(attention_mask.shape)[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - - kv_seq_len:] - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data( - attention_mask) -# >>>>>> key_layer = flash_attn.bert_padding.index_first_axis(key_layer. -# reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - key_layer = index_first_axis( - key_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k) -# >>>>>> value_layer = flash_attn.bert_padding.index_first_axis(value_layer. -# reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - value_layer = index_first_axis( - value_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k) + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + # >>>>>> key_layer = flash_attn.bert_padding.index_first_axis(key_layer. + # reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + key_layer = index_first_axis(key_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k) + # >>>>>> value_layer = flash_attn.bert_padding.index_first_axis(value_layer. + # reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k) if query_length == kv_seq_len: -# >>>>>> query_layer = flash_attn.bert_padding.index_first_axis(query_layer -# .reshape(batch_size * kv_seq_len, num_heads, head_dim), -# indices_k) + # >>>>>> query_layer = flash_attn.bert_padding.index_first_axis(query_layer + # .reshape(batch_size * kv_seq_len, num_heads, head_dim), + # indices_k) query_layer = index_first_axis( - query_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k) + query_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k + ) cu_seqlens_q = cu_seqlens_k max_seqlen_in_batch_q = max_seqlen_in_batch_k indices_q = indices_k elif query_length == 1: max_seqlen_in_batch_q = 1 - cu_seqlens_q = paddle.arange(dtype='int32', end=batch_size + 1) + cu_seqlens_q = paddle.arange(dtype="int32", end=batch_size + 1) indices_q = cu_seqlens_q[:-1] query_layer = query_layer.squeeze(axis=1) else: @@ -807,9 +846,16 @@ def _upad_input(self, query_layer, key_layer, value_layer, query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = ( # flash_attn.bert_padding.unpad_input(query_layer, # attention_mask)) - unpad_input(query_states,attention_mask)) - return query_layer, key_layer, value_layer, indices_q, (cu_seqlens_q, - cu_seqlens_k), (max_seqlen_in_batch_q, max_seqlen_in_batch_k) + unpad_input(query_states, attention_mask) + ) + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) class HyperQwen2SdpaAttention(HyperQwen2Attention): @@ -819,23 +865,32 @@ class HyperQwen2SdpaAttention(HyperQwen2Attention): SDPA API. 
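    For reference, the computation that `paddle.nn.functional.scaled_dot_product_attention` fuses for the class
    below can be written out with plain Paddle ops. This is a minimal illustrative sketch only; the
    [batch, num_heads, seq_len, head_dim] layout and the toy sizes are assumptions for the example and are not
    taken from the mPLUG-Owl3 sources.

    import math

    import paddle
    import paddle.nn.functional as F

    def sdpa_reference(q, k, v, causal=True):
        # q, k, v: [batch, num_heads, seq_len, head_dim]; output has the same shape.
        scores = paddle.matmul(q, k, transpose_y=True) / math.sqrt(q.shape[-1])
        if causal:
            seq_len = q.shape[-2]
            # Upper-triangular -inf mask prevents attending to future positions.
            scores = scores + paddle.triu(paddle.full([seq_len, seq_len], float("-inf")), diagonal=1)
        return paddle.matmul(F.softmax(scores, axis=-1), v)

    q = k = v = paddle.randn([1, 2, 4, 8])
    print(sdpa_reference(q, k, v).shape)  # [1, 2, 4, 8]

    The fused SDPA kernel (and the flash-attention path earlier in this file) computes the same quantity without
    materializing the full attention-score matrix, which is why those implementations are preferred when the
    kernels are available.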
""" - def forward(self, hidden_states: paddle.Tensor, attention_mask: - Optional[paddle.Tensor]=None, position_ids: Optional[paddle.Tensor] - =None, image_embeds=None, media_offset=None, past_key_value: - Optional[Tuple[paddle.Tensor]]=None, output_attentions: - bool=False, use_cache: bool=False) ->Tuple[paddle.Tensor, Optional[ - paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + image_embeds=None, + media_offset=None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: # print('*&'*100) # print('output_attentions:',output_attentions) # print('attention_mask:',attention_mask)#(1,1,1,60) - if output_attentions:#false + if output_attentions: # false logger.warning_once( 'Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward(hidden_states=hidden_states, - attention_mask=attention_mask, position_ids=position_ids, - past_key_value=past_key_value, output_attentions= - output_attentions, use_cache=use_cache) + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) print(hidden_states.shape) bsz, q_len, _ = tuple(hidden_states.shape) query_states = self.q_proj(hidden_states) @@ -845,10 +900,10 @@ def forward(self, hidden_states: paddle.Tensor, attention_mask: # head_dim).transpose(perm=paddle_aux.transpose_aux_func( # query_states.view(bsz, q_len, self.num_heads, self.head_dim). 
#             ndim, 1, 2))
-        print('bsz:',bsz)
-        print("qlen:",q_len)
-        print("num_heads:",self.num_heads)
-        print("head_dim:",self.head_dim)
+        print("bsz:", bsz)
+        print("qlen:", q_len)
+        print("num_heads:", self.num_heads)
+        print("head_dim:", self.head_dim)
         query_states = paddle.reshape(query_states, [bsz, q_len, self.num_heads, self.head_dim])
         query_states = paddle.transpose(query_states, perm=[0, 2, 1, 3])  # swap dims 1 and 2
@@ -869,75 +924,82 @@ def forward(self, hidden_states: paddle.Tensor, attention_mask:
         kv_seq_len = tuple(key_states.shape)[-2]
         if past_key_value is not None:
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self
-                .layer_idx)
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
         # print('2'*100)
-        query_states, key_states = apply_rotary_pos_emb(query_states,
-            key_states, cos, sin, position_ids)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
         if past_key_value is not None:
-            cache_kwargs = {'sin': sin, 'cos': cos}
-            key_states, value_states = past_key_value.update(key_states,value_states, self.layer_idx, cache_kwargs)
-
+            cache_kwargs = {"sin": sin, "cos": cos}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
         key_states = repeat_kv(key_states, self.num_key_value_groups)
         value_states = repeat_kv(value_states, self.num_key_value_groups)
-
-        if attention_mask is not None:#(1,1,1,60)
+
+        if attention_mask is not None:  # (1,1,1,60)
             if tuple(attention_mask.shape) != (bsz, 1, q_len, kv_seq_len):
                 raise ValueError(
-                    f'Attention mask should be of size {bsz, 1, q_len, kv_seq_len}, but is {tuple(attention_mask.shape)}'
-                    )
+                    f"Attention mask should be of size {bsz, 1, q_len, kv_seq_len}, but is {tuple(attention_mask.shape)}"
+                )
         # if query_states.device.type == 'cuda' and attention_mask is not None:
-        #     query_states = query_states.contiguous()
-        #     key_states = key_states.contiguous()
-        #     value_states = value_states.contiguous()
-        attn_output = paddle.nn.functional.scaled_dot_product_attention(query
-            =query_states, key=key_states, value=value_states, attn_mask=
-            attention_mask, dropout_p=self.attention_dropout if self.
-            training else 0.0, is_causal=self.is_causal and attention_mask is
-            None and q_len > 1)
-        attn_output = attn_output.transpose(perm=paddle_aux.
- transpose_aux_func(attn_output.ndim, 1, 2)).contiguous() + # query_states = query_states.contiguous() + # key_states = key_states.contiguous() + # value_states = value_states.contiguous() + attn_output = paddle.nn.functional.scaled_dot_product_attention( + query=query_states, + key=key_states, + value=value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + attn_output = attn_output.transpose(perm=paddle_aux.transpose_aux_func(attn_output.ndim, 1, 2)).contiguous() # attn_output = attn_output.view(bsz, q_len, self.hidden_size) attn_output = paddle.reshape(attn_output, [bsz, q_len, self.hidden_size]) - attn_output = self.crossattention(query_states.transpose(perm=[2, 0, - 1, 3]), image_embeds, media_offset, attn_output.transpose(perm= - [1, 0, 2])) + attn_output = self.crossattention( + query_states.transpose(perm=[2, 0, 1, 3]), + image_embeds, + media_offset, + attn_output.transpose(perm=[1, 0, 2]), + ) attn_output = attn_output.transpose(perm=[1, 0, 2]) attn_output = self.o_proj(attn_output) return attn_output, None, past_key_value -QWEN2_ATTENTION_CLASSES = {'eager': HyperQwen2Attention, - 'flash_attention_2': HyperQwen2FlashAttention2, 'sdpa': - HyperQwen2SdpaAttention} +QWEN2_ATTENTION_CLASSES = { + "eager": HyperQwen2Attention, + "flash_attention_2": HyperQwen2FlashAttention2, + "sdpa": HyperQwen2SdpaAttention, +} class HyperQwen2DecoderLayer(paddle.nn.Layer): - def __init__(self, config: HyperQwen2Config, layer_idx: int): super().__init__() self.hidden_size = config.hidden_size - if (config.use_sliding_window and config._attn_implementation != - 'flash_attention_2'): + if config.use_sliding_window and config._attn_implementation != "flash_attention_2": logger.warning_once( - f'Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; unexpected results may be encountered.' - ) + f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; unexpected results may be encountered." + ) self.is_hyper_enabled = layer_idx + 1 in config.hyper_layers self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation]( - config, layer_idx, is_hyper_enabed=self.is_hyper_enabled) + config, layer_idx, is_hyper_enabed=self.is_hyper_enabled + ) self.mlp = Qwen2MLP(config) - self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config. 
- rms_norm_eps) - self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward(self, hidden_states: paddle.Tensor, attention_mask: - Optional[paddle.Tensor]=None, position_ids: Optional[paddle.Tensor] - =None, image_embeds=None, media_offset=None, past_key_value: - Optional[Tuple[paddle.Tensor]]=None, output_attentions: Optional[ - bool]=False, use_cache: Optional[bool]=False) ->Tuple[paddle.Tensor, - Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: + self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + image_embeds=None, + media_offset=None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: """ Args: hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` @@ -960,20 +1022,25 @@ def forward(self, hidden_states: paddle.Tensor, attention_mask: # print('*&'*100) # print('attention_mask:',attention_mask) hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, attention_mask=attention_mask,#(1,1,1,60) - position_ids=position_ids, image_embeds=image_embeds, - media_offset=media_offset, past_key_value=past_key_value, - output_attentions=output_attentions, use_cache=use_cache) + hidden_states=hidden_states, + attention_mask=attention_mask, # (1,1,1,60) + position_ids=position_ids, + image_embeds=image_embeds, + media_offset=media_offset, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) hidden_states = residual + hidden_states residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states - outputs = hidden_states, + outputs = (hidden_states,) if output_attentions: - outputs += self_attn_weights, + outputs += (self_attn_weights,) if use_cache: - outputs += present_key_value, + outputs += (present_key_value,) return outputs @@ -999,10 +1066,10 @@ def forward(self, hidden_states: paddle.Tensor, attention_mask: # , QWEN2_START_DOCSTRING) class Qwen2PreTrainedModel(paddlenlp.transformers.model_utils.PretrainedModel): config_class = HyperQwen2Config - base_model_prefix = 'model' + base_model_prefix = "model" supports_gradient_checkpointing = True - _no_split_modules = ['HyperQwen2DecoderLayer'] - _skip_keys_device_placement = 'past_key_values' + _no_split_modules = ["HyperQwen2DecoderLayer"] + _skip_keys_device_placement = "past_key_values" _supports_flash_attn_2 = True _supports_sdpa = True _supports_cache_class = True @@ -1116,12 +1183,12 @@ def __init__(self, config: HyperQwen2Config): super().__init__(config) self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size - self.embed_tokens = paddle.nn.Embedding(num_embeddings=config. - vocab_size, embedding_dim=config.hidden_size, padding_idx=self. - padding_idx) - self.layers = paddle.nn.LayerList(sublayers=[HyperQwen2DecoderLayer - (config, layer_idx) for layer_idx in range(config. 
- num_hidden_layers)]) + self.embed_tokens = paddle.nn.Embedding( + num_embeddings=config.vocab_size, embedding_dim=config.hidden_size, padding_idx=self.padding_idx + ) + self.layers = paddle.nn.LayerList( + sublayers=[HyperQwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) self._attn_implementation = config._attn_implementation self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.gradient_checkpointing = False @@ -1133,49 +1200,48 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embed_tokens = value -# >>>>>> @transformers.utils.add_start_docstrings_to_model_forward( -# QWEN2_INPUTS_DOCSTRING) - def forward(self, input_ids: paddle.Tensor=None, attention_mask: - Optional[paddle.Tensor]=None, position_ids: Optional[paddle.Tensor] - =None, past_key_values: Optional[List[paddle.Tensor]]=None, - inputs_embeds: Optional[paddle.Tensor]=None, image_embeds=None, - media_offset=None, use_cache: Optional[bool]=None, - output_attentions: Optional[bool]=None, output_hidden_states: - Optional[bool]=None, return_dict: Optional[bool]=None) ->Union[ - Tuple, paddlenlp.transformers.model_outputs.BaseModelOutputWithPast]: - print("^()"*100) - print('attention_mask',attention_mask.shape) - output_attentions = (output_attentions if output_attentions is not - None else self.config.output_attentions) - output_hidden_states = (output_hidden_states if - output_hidden_states is not None else self.config. - output_hidden_states) - use_cache = (use_cache if use_cache is not None else self.config. - use_cache) - return_dict = (return_dict if return_dict is not None else self. - config.use_return_dict) + # >>>>>> @transformers.utils.add_start_docstrings_to_model_forward( + # QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: paddle.Tensor = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + image_embeds=None, + media_offset=None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, paddlenlp.transformers.model_outputs.BaseModelOutputWithPast]: + print("^()" * 100) + print("attention_mask", attention_mask.shape) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: - raise ValueError( - 'You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time' - ) + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") elif input_ids is not None: # print("%"*100) # print(input_ids.shape) - batch_size, seq_length = tuple(input_ids.shape)#(1,60) + batch_size, seq_length = tuple(input_ids.shape) # (1,60) elif inputs_embeds is not None: # print("tuple(inputs_embeds.shape):",inputs_embeds.shape) batch_size, seq_length, _ = tuple(inputs_embeds.shape) else: - raise ValueError( - 'You have to specify either decoder_input_ids or decoder_inputs_embeds' - ) + raise ValueError("You have 
to specify either decoder_input_ids or decoder_inputs_embeds") if self.gradient_checkpointing and self.training: if use_cache: logger.warning_once( - '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' - ) + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) use_cache = False - + if past_key_values is None: cache_position = paddle.arange(input_ids.shape[1]) else: @@ -1185,21 +1251,21 @@ def forward(self, input_ids: paddle.Tensor=None, attention_mask: input_ids = input_ids[:, -1].unsqueeze(-1) past_key_values_length = 0 - print("past_key_values:",past_key_values) + print("past_key_values:", past_key_values) # if use_cache: - # use_legacy_cache = not isinstance(past_key_values, transformers - # .cache_utils.Cache) - # use_legacy_cache = not isinstance(past_key_values, list) and all(isinstance(item, paddle.Tensor) for item in past_key_values) -# if use_legacy_cache: -# >>>>>> past_key_values = (transformers.cache_utils.DynamicCache. -# from_legacy_cache(past_key_values)) -# past_key_values_length = past_key_values.get_usable_length( -# seq_length) + # use_legacy_cache = not isinstance(past_key_values, transformers + # .cache_utils.Cache) + # use_legacy_cache = not isinstance(past_key_values, list) and all(isinstance(item, paddle.Tensor) for item in past_key_values) + # if use_legacy_cache: + # >>>>>> past_key_values = (transformers.cache_utils.DynamicCache. + # from_legacy_cache(past_key_values)) + # past_key_values_length = past_key_values.get_usable_length( + # seq_length) if position_ids is None: - device = (input_ids.place if input_ids is not None else - inputs_embeds.place) - position_ids = paddle.arange(start=past_key_values_length, end= - seq_length + past_key_values_length, dtype='int64') + device = input_ids.place if input_ids is not None else inputs_embeds.place + position_ids = paddle.arange( + start=past_key_values_length, end=seq_length + past_key_values_length, dtype="int64" + ) # position_ids = position_ids.unsqueeze(axis=0).view(-1, seq_length) position_ids = paddle.unsqueeze(position_ids, axis=0) position_ids = paddle.reshape(position_ids, [-1, seq_length]) @@ -1208,41 +1274,39 @@ def forward(self, input_ids: paddle.Tensor=None, attention_mask: device = input_ids.place # position_ids = position_ids.view(-1, seq_length).astype(dtype='int64') # position_ids = position_ids.reshape(-1, seq_length).astype(dtype='int64') - position_ids = paddle.reshape(position_ids, [-1, seq_length]).astype(dtype='int64') + position_ids = paddle.reshape(position_ids, [-1, seq_length]).astype(dtype="int64") if inputs_embeds is None: - print("^"*100) + print("^" * 100) inputs_embeds = self.embed_tokens(input_ids) - if (attention_mask is not None and self._attn_implementation == - 'flash_attention_2' and use_cache): + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: is_padding_right = attention_mask[:, -1].sum().item() != batch_size if is_padding_right: raise ValueError( "You are attempting to perform batched generation with padding_side='right' this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to call `tokenizer.padding_side = 'left'` before tokenizing the input. 
" - ) + ) # print("^()"*100) # print('attention_mask',attention_mask) print(self._attn_implementation) - if self._attn_implementation == 'flash_attention_2': - attention_mask = (attention_mask if attention_mask is not None and - 0 in attention_mask else None) + if self._attn_implementation == "flash_attention_2": + attention_mask = attention_mask if attention_mask is not None and 0 in attention_mask else None # elif self._attn_implementation == 'sdpa' and not output_attentions: -# >>>>>> attention_mask = (transformers.modeling_attn_mask_utils. -# _prepare_4d_causal_attention_mask_for_sdpa(attention_mask, -# (batch_size, seq_length), inputs_embeds, -# past_key_values_length, sliding_window=self.config. -# sliding_window)) - + # >>>>>> attention_mask = (transformers.modeling_attn_mask_utils. + # _prepare_4d_causal_attention_mask_for_sdpa(attention_mask, + # (batch_size, seq_length), inputs_embeds, + # past_key_values_length, sliding_window=self.config. + # sliding_window)) + else: -# >>>>>> attention_mask = (transformers.modeling_attn_mask_utils. -# _prepare_4d_causal_attention_mask(attention_mask, ( -# batch_size, seq_length), inputs_embeds, -# past_key_values_length, sliding_window=self.config. -# sliding_window)) - print("5"*200) - attention_mask=None + # >>>>>> attention_mask = (transformers.modeling_attn_mask_utils. + # _prepare_4d_causal_attention_mask(attention_mask, ( + # batch_size, seq_length), inputs_embeds, + # past_key_values_length, sliding_window=self.config. + # sliding_window)) + print("5" * 200) + attention_mask = None min_dtype = paddle.finfo(paddle.float16).min # print("past_key_values_length:",past_key_values_length) - + attention_mask = _prepare_4d_causal_attention_mask_with_cache_position( attention_mask, sequence_length=seq_length, @@ -1263,7 +1327,7 @@ def forward(self, input_ids: paddle.Tensor=None, attention_mask: # cache_position=cache_position, # batch_size=batch_size, # ) - print('attention_mask',attention_mask) + print("attention_mask", attention_mask) # print("^**"*100) # print('attention_mask',attention_mask) hidden_states = inputs_embeds @@ -1272,49 +1336,62 @@ def forward(self, input_ids: paddle.Tensor=None, attention_mask: next_decoder_cache = None for decoder_layer in self.layers: if output_hidden_states: - all_hidden_states += hidden_states, + all_hidden_states += (hidden_states,) if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func(decoder_layer - .__call__, hidden_states, attention_mask, position_ids, - image_embeds, media_offset, past_key_values, - output_attentions, use_cache) + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + image_embeds, + media_offset, + past_key_values, + output_attentions, + use_cache, + ) else: - print("hidden_states:",hidden_states) - layer_outputs = decoder_layer(hidden_states, attention_mask - =attention_mask, position_ids=position_ids, - image_embeds=image_embeds, media_offset=media_offset, - past_key_value=past_key_values, output_attentions= - output_attentions, use_cache=use_cache) + print("hidden_states:", hidden_states) + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + image_embeds=image_embeds, + media_offset=media_offset, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) hidden_states = layer_outputs[0] if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions 
else - 1] + next_decoder_cache = layer_outputs[2 if output_attentions else 1] if output_attentions: - all_self_attns += layer_outputs[1], + all_self_attns += (layer_outputs[1],) hidden_states = self.norm(hidden_states) if output_hidden_states: - all_hidden_states += hidden_states, + all_hidden_states += (hidden_states,) next_cache = None if use_cache: - next_cache = next_decoder_cache.to_legacy_cache( - ) if use_legacy_cache else next_decoder_cache + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache if not return_dict: - return tuple(v for v in [hidden_states, next_cache, - all_hidden_states, all_self_attns] if v is not None) + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) return paddlenlp.transformers.model_outputs.BaseModelOutputWithPast( - last_hidden_state=hidden_states, past_key_values=next_cache, - hidden_states=all_hidden_states, attentions=all_self_attns) + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) class HyperQwen2ForCausalLM(Qwen2PreTrainedModel): - _tied_weights_keys = ['lm_head.weight'] + _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): super().__init__(config) self.model = HyperQwen2Model(config) self.vocab_size = config.vocab_size - self.lm_head = paddle.nn.Linear(in_features=config.hidden_size, - out_features=config.vocab_size, bias_attr=False) + self.lm_head = paddle.nn.Linear( + in_features=config.hidden_size, out_features=config.vocab_size, bias_attr=False + ) # self.post_init() def get_input_embeddings(self): @@ -1335,19 +1412,25 @@ def set_decoder(self, decoder): def get_decoder(self): return self.model -# >>>>>> @transformers.utils.add_start_docstrings_to_model_forward( -# QWEN2_INPUTS_DOCSTRING) -# >>>>>> @transformers.utils.replace_return_docstrings(output_type= -# CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward(self, input_ids: paddle.Tensor=None, attention_mask: - Optional[paddle.Tensor]=None, position_ids: Optional[paddle.Tensor] - =None, past_key_values: Optional[List[paddle.Tensor]]=None, - inputs_embeds: Optional[paddle.Tensor]=None, image_embeds=None, - media_offset=None, labels: Optional[paddle.Tensor]=None, use_cache: - Optional[bool]=None, output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, return_dict: Optional[ - bool]=None) ->Union[Tuple, paddlenlp.transformers.model_outputs. 
- CausalLMOutputWithPast]: + # >>>>>> @transformers.utils.add_start_docstrings_to_model_forward( + # QWEN2_INPUTS_DOCSTRING) + # >>>>>> @transformers.utils.replace_return_docstrings(output_type= + # CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: paddle.Tensor = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + image_embeds=None, + media_offset=None, + labels: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, paddlenlp.transformers.model_outputs.CausalLMOutputWithPast]: """ Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -1373,25 +1456,30 @@ def forward(self, input_ids: paddle.Tensor=None, attention_mask: >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] "Hey, are you conscious? Can you talk to me?\\nI'm not conscious, but I can talk to you." ```""" - output_attentions = (output_attentions if output_attentions is not - None else self.config.output_attentions) - output_hidden_states = (output_hidden_states if - output_hidden_states is not None else self.config. - output_hidden_states) - return_dict = (return_dict if return_dict is not None else self. - config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # print(self.model) HyperQwen2Model # print('::'*100) # print('attention_mask',attention_mask) - outputs = self.model(input_ids=input_ids, attention_mask= #(1,1,1,60) - attention_mask, position_ids=position_ids, past_key_values= - past_key_values, inputs_embeds=inputs_embeds, image_embeds= - image_embeds, media_offset=media_offset, use_cache=use_cache, - output_attentions=output_attentions, output_hidden_states= - output_hidden_states, return_dict=return_dict) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, # (1,1,1,60) + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + image_embeds=image_embeds, + media_offset=media_offset, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) hidden_states = outputs[0] logits = self.lm_head(hidden_states) - logits = logits.astype(dtype='float32') + logits = logits.astype(dtype="float32") loss = None if labels is not None: shift_logits = logits[..., :-1, :].contiguous() @@ -1407,17 +1495,22 @@ def forward(self, input_ids: paddle.Tensor=None, attention_mask: if not return_dict: output = (logits,) + outputs[1:] return (loss,) + output if loss is not None else output - return paddlenlp.transformers.model_outputs.CausalLMOutputWithPast(loss - =loss, logits=logits, past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, attentions=outputs.attentions) + return paddlenlp.transformers.model_outputs.CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, 
+            attentions=outputs.attentions,
+        )

-    def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
-        attention_mask=None, inputs_embeds=None, **kwargs):
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+    ):
         if past_key_values is not None:
-# >>>>>>            if isinstance(past_key_values, transformers.cache_utils.Cache):
-#                 cache_length = past_key_values.get_seq_length()
-#                 past_length = past_key_values.seen_tokens
-#                 max_cache_length = past_key_values.get_max_length()
+            # >>>>>>            if isinstance(past_key_values, transformers.cache_utils.Cache):
+            #                 cache_length = past_key_values.get_seq_length()
+            #                 past_length = past_key_values.seen_tokens
+            #                 max_cache_length = past_key_values.get_max_length()
            if past_key_values is not None and isinstance(past_key_values, list):
                # make sure every element is a paddle.Tensor, and read each tensor's sequence length
                if all(isinstance(tensor, paddle.Tensor) for tensor in past_key_values):
@@ -1429,40 +1522,47 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
                    raise ValueError("past_key_values should be a list of paddle.Tensors")
             else:
-                cache_length = past_length = tuple(past_key_values[0][0].shape
-                    )[2]
+                cache_length = past_length = tuple(past_key_values[0][0].shape)[2]
                 max_cache_length = None
-            if attention_mask is not None and tuple(attention_mask.shape)[1
-                ] > tuple(input_ids.shape)[1]:
-                input_ids = input_ids[:, -(tuple(attention_mask.shape)[1] -
-                    past_length):]
+            if attention_mask is not None and tuple(attention_mask.shape)[1] > tuple(input_ids.shape)[1]:
+                input_ids = input_ids[:, -(tuple(attention_mask.shape)[1] - past_length) :]
             elif past_length < tuple(input_ids.shape)[1]:
                 input_ids = input_ids[:, past_length:]
-            if (max_cache_length is not None and attention_mask is not None and
-                cache_length + tuple(input_ids.shape)[1] > max_cache_length):
+            if (
+                max_cache_length is not None
+                and attention_mask is not None
+                and cache_length + tuple(input_ids.shape)[1] > max_cache_length
+            ):
                 attention_mask = attention_mask[:, -max_cache_length:]
-        position_ids = kwargs.get('position_ids', None)
+        position_ids = kwargs.get("position_ids", None)
         if attention_mask is not None and position_ids is None:
-            position_ids = attention_mask.astype(dtype='int64').cumsum(axis=-1
-                ) - 1
+            position_ids = attention_mask.astype(dtype="int64").cumsum(axis=-1) - 1
             position_ids.masked_fill_(mask=attention_mask == 0, value=1)
             if past_key_values:
-                position_ids = position_ids[:, -tuple(input_ids.shape)[1]:]
+                position_ids = position_ids[:, -tuple(input_ids.shape)[1] :]
         if inputs_embeds is not None and past_key_values is None:
-            model_inputs = {'inputs_embeds': inputs_embeds}
+            model_inputs = {"inputs_embeds": inputs_embeds}
         else:
-            model_inputs = {'input_ids': input_ids}
-        model_inputs.update({'position_ids': position_ids,
-            'past_key_values': past_key_values, 'use_cache': kwargs.get(
-            'use_cache'), 'attention_mask': attention_mask, 'image_embeds':
-            kwargs.get('image_embeds'), 'media_offset': kwargs.get(
-            'media_offset')})
+            model_inputs = {"input_ids": input_ids}
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+                "image_embeds": kwargs.get("image_embeds"),
+                "media_offset": kwargs.get("media_offset"),
+            }
+        )
        return model_inputs

     @staticmethod
     def _reorder_cache(past_key_values, beam_idx):
         reordered_past = ()
         for layer_past in past_key_values:
-            reordered_past += 
tuple(past_state.index_select(axis=0, index= - beam_idx.to(past_state.place)) for past_state in layer_past), + reordered_past += ( + tuple( + past_state.index_select(axis=0, index=beam_idx.to(past_state.place)) for past_state in layer_past + ), + ) return reordered_past diff --git a/paddlemix/mPLUGOwl3/modeling_mplugowl3.py b/paddlemix/mPLUGOwl3/modeling_mplugowl3.py index fa81e853c..1729033c2 100644 --- a/paddlemix/mPLUGOwl3/modeling_mplugowl3.py +++ b/paddlemix/mPLUGOwl3/modeling_mplugowl3.py @@ -1,65 +1,82 @@ -import paddle -import paddlenlp -import math -from typing import List, Optional +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json -from threading import Thread +import math from copy import deepcopy +from threading import Thread +from typing import List, Optional + +import paddle +import paddlenlp +from paddlenlp.generation import TextIteratorStreamer +from paddlenlp.transformers import Qwen2ForCausalLM, Qwen2PretrainedModel from PIL import Image -from .processing_mplugowl3 import mPLUGOwl3Processor -from .image_processing_mplugowl3 import mPLUGOwl3ImageProcessor + from .configuration_mplugowl3 import mPLUGOwl3Config -from .x_sdpa import ScaleDotProductAttention +from .image_processing_mplugowl3 import mPLUGOwl3ImageProcessor from .modeling_hyper_qwen2 import HyperQwen2ForCausalLM -from paddlenlp.transformers import Qwen2ForCausalLM, Qwen2PretrainedModel from .modeling_navit_siglip import SigLipVisionTransformer -from paddlenlp.generation import TextIteratorStreamer +from .processing_mplugowl3 import mPLUGOwl3Processor +from .x_sdpa import ScaleDotProductAttention def is_flash_attn_available(): try: import paddle - if "npu" in paddle.get_device(): # NOTE: flash attn has not been tested yet + + if "npu" in paddle.get_device(): # NOTE: flash attn has not been tested yet return False - q = paddle.rand((1, 4, 2, 8)).astype('float16') + q = paddle.rand((1, 4, 2, 8)).astype("float16") output = paddle.nn.functional.flash_attention.flash_attention(q, q, q, 0.9, False, False) return True except: return False + + # >>>>>>class mPLUGOwl3PreTrainedModel(transformers.Qwen2PreTrainedModel): # config_class = mPLUGOwl3Config class mPLUGOwl3PreTrainedModel(Qwen2PretrainedModel): config_class = mPLUGOwl3Config -class mPLUGOwl3Model(mPLUGOwl3PreTrainedModel): +class mPLUGOwl3Model(mPLUGOwl3PreTrainedModel): def __init__(self, config): super().__init__(config) self.language_model = HyperQwen2ForCausalLM(config) self.vision_model = self.init_vision_module() self.vision_dim = self.vision_model.embed_dim self.embed_dim = self.language_model.config.hidden_size - self.vision2text_model = paddle.nn.Linear(in_features=self. 
- vision_dim, out_features=self.embed_dim) + self.vision2text_model = paddle.nn.Linear(in_features=self.vision_dim, out_features=self.embed_dim) self.processor = None - self.terminators = ['<|im_end|>', '<|endoftext|>'] + self.terminators = ["<|im_end|>", "<|endoftext|>"] def init_vision_module(self): - print('-'*100) + print("-" * 100) if is_flash_attn_available(): - self.config.vision_config._attn_implementation = ( - 'flash_attention_2') + self.config.vision_config._attn_implementation = "flash_attention_2" else: self.config.vision_config._attn_implementation = "eager" # self.config.vision_config._attn_implementation = (self.config. # vision_config._attn_implementation) -# >>>>>> model = (transformers.models.siglip.modeling_siglip. -# SiglipVisionTransformer(self.config.vision_config)) - print("*"*100) - model=SigLipVisionTransformer(self.config.vision_config) - print("-"*100) - setattr(model, 'embed_dim', model.embeddings.embed_dim) - setattr(model, 'patch_size', model.embeddings.patch_size) + # >>>>>> model = (transformers.models.siglip.modeling_siglip. + # SiglipVisionTransformer(self.config.vision_config)) + print("*" * 100) + model = SigLipVisionTransformer(self.config.vision_config) + print("-" * 100) + setattr(model, "embed_dim", model.embeddings.embed_dim) + setattr(model, "patch_size", model.embeddings.patch_size) return model def get_input_embeddings(self): @@ -85,10 +102,9 @@ def forward_image(self, pixel_values): return None dtype = self.language_model.model.embed_tokens.weight.dtype with paddle.no_grad(): - print('*'*100) - image_embeds = self.vision_model(pixel_values.to(dtype), - output_hidden_states=True).hidden_states[-2] - print('*'*150) + print("*" * 100) + image_embeds = self.vision_model(pixel_values.to(dtype), output_hidden_states=True).hidden_states[-2] + print("*" * 150) if self.vision2text_model is not None: image_embeds = self.vision2text_model(image_embeds) else: @@ -99,44 +115,47 @@ def forward(self, pixel_values=None, **kwargs): image_embeds = self.forward_image(pixel_values) return self.language_model(image_embeds=image_embeds, **kwargs) - def _decode(self, input_ids, image_embeds, media_offset, tokenizer, - attention_mask, decode_text=False, **kwargs): - terminators = [tokenizer.convert_tokens_to_ids(i) for i in self. 
- terminators] + def _decode(self, input_ids, image_embeds, media_offset, tokenizer, attention_mask, decode_text=False, **kwargs): + terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators] print(f"terminators dtype: {type(terminators)}") - print("inputids:",input_ids) + print("inputids:", input_ids) print(f"attention_mask: {attention_mask}") # print(self.language_model) output = self.language_model.generate( - input_ids=input_ids,#(1,60) + input_ids=input_ids, # (1,60) image_embeds=image_embeds, media_offset=media_offset, - pad_token_id=0, eos_token_id=terminators, attention_mask= - attention_mask, **kwargs)[0] - output = output[:, tuple(input_ids.shape)[1]:] + pad_token_id=0, + eos_token_id=terminators, + attention_mask=attention_mask, + **kwargs, + )[0] + output = output[:, tuple(input_ids.shape)[1] :] if decode_text: return self._decode_text(output, tokenizer) return output - def _decode_stream(self, input_ids, image_embeds, media_offset, - tokenizer, **kwargs): + def _decode_stream(self, input_ids, image_embeds, media_offset, tokenizer, **kwargs): terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators] -# >>>>>> streamer = transformers.TextIteratorStreamer(tokenizer=tokenizer) + # >>>>>> streamer = transformers.TextIteratorStreamer(tokenizer=tokenizer) streamer = TextIteratorStreamer(tokenizer=tokenizer) - generation_kwargs = {'input_ids': input_ids, 'image_embeds': - image_embeds, 'media_offset': media_offset, 'pad_token_id': 0, - 'eos_token_id': terminators, 'streamer': streamer} + generation_kwargs = { + "input_ids": input_ids, + "image_embeds": image_embeds, + "media_offset": media_offset, + "pad_token_id": 0, + "eos_token_id": terminators, + "streamer": streamer, + } generation_kwargs.update(kwargs) - thread = Thread(target=self.language_model.generate, kwargs= - generation_kwargs) + thread = Thread(target=self.language_model.generate, kwargs=generation_kwargs) """Class Method: *.start, can not convert, please check whether it is torch.Tensor.*/Optimizer.*/nn.Module.*/torch.distributions.Distribution.*/torch.autograd.function.FunctionCtx.*/torch.profiler.profile.*/torch.autograd.profiler.profile.*, and convert manually""" -# >>>>>> thread.start() + # >>>>>> thread.start() thread.start() return streamer def _decode_text(self, result_ids, tokenizer): - terminators = [tokenizer.convert_tokens_to_ids(i) for i in self. 
- terminators] + terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators] result_text = [] for result in result_ids: result = result[result != 0] @@ -147,63 +166,88 @@ def _decode_text(self, result_ids, tokenizer): def init_processor(self, tokenizer): ip = mPLUGOwl3ImageProcessor(image_size=384) - self.processor = mPLUGOwl3Processor(image_processor=ip, tokenizer= - tokenizer) + self.processor = mPLUGOwl3Processor(image_processor=ip, tokenizer=tokenizer) processor = self.processor return processor - def generate(self, input_ids=None, pixel_values=None, media_offset=None, - attention_mask=None, tokenizer=None, stream=False, decode_text= - False, **kwargs): + def generate( + self, + input_ids=None, + pixel_values=None, + media_offset=None, + attention_mask=None, + tokenizer=None, + stream=False, + decode_text=False, + **kwargs + ): assert input_ids is not None - + with paddle.no_grad(): image_embeds = self.forward_image(pixel_values) if stream: - result = self._decode_stream(input_ids=input_ids, - image_embeds=image_embeds, media_offset=media_offset, - tokenizer=tokenizer, **kwargs) + result = self._decode_stream( + input_ids=input_ids, + image_embeds=image_embeds, + media_offset=media_offset, + tokenizer=tokenizer, + **kwargs, + ) else: - result = self._decode(input_ids=input_ids, image_embeds= - image_embeds, media_offset=media_offset, tokenizer= - tokenizer, attention_mask=attention_mask, decode_text= - decode_text, **kwargs) + result = self._decode( + input_ids=input_ids, + image_embeds=image_embeds, + media_offset=media_offset, + tokenizer=tokenizer, + attention_mask=attention_mask, + decode_text=decode_text, + **kwargs, + ) return result - def chat(self, images, videos, messages, tokenizer, processor=None, - max_new_tokens=2048, min_new_tokens=0, sampling=True, - max_inp_length=8192, system_prompt='', stream=False, max_slice_nums - =None, use_image_id=None, **kwargs): - cut_flag = kwargs.get('kwargs', True) + def chat( + self, + images, + videos, + messages, + tokenizer, + processor=None, + max_new_tokens=2048, + min_new_tokens=0, + sampling=True, + max_inp_length=8192, + system_prompt="", + stream=False, + max_slice_nums=None, + use_image_id=None, + **kwargs + ): + cut_flag = kwargs.get("kwargs", True) if processor is None: if self.processor is None: processor = self.init_processor(tokenizer) else: processor = self.processor - inputs = processor(messages, images=images, videos=videos, - cut_enable=cut_flag) - inputs.to('cuda') - inputs.update({'tokenizer': tokenizer, 'max_new_tokens': - max_new_tokens}) + inputs = processor(messages, images=images, videos=videos, cut_enable=cut_flag) + inputs.to("cuda") + inputs.update({"tokenizer": tokenizer, "max_new_tokens": max_new_tokens}) if sampling: - generation_config = {'top_p': 0.8, 'top_k': 100, 'temperature': - 0.7, 'do_sample': True} + generation_config = {"top_p": 0.8, "top_k": 100, "temperature": 0.7, "do_sample": True} else: - generation_config = {'num_beams': 3} + generation_config = {"num_beams": 3} if min_new_tokens > 0: - generation_config['min_new_tokens'] = min_new_tokens - generation_config.update((k, kwargs[k]) for k in generation_config. 
- keys() & kwargs.keys()) + generation_config["min_new_tokens"] = min_new_tokens + generation_config.update((k, kwargs[k]) for k in generation_config.keys() & kwargs.keys()) with paddle.no_grad(): - res = self.generate(**inputs, stream=stream, decode_text=True, - **generation_config) + res = self.generate(**inputs, stream=stream, decode_text=True, **generation_config) if stream: def stream_gen(): for text in res: for term in self.terminators: - text = text.replace(term, '') + text = text.replace(term, "") yield text + return stream_gen() else: answer = res[0] diff --git a/paddlemix/mPLUGOwl3/modeling_navit_siglip.py b/paddlemix/mPLUGOwl3/modeling_navit_siglip.py index 51dbb9a9b..4f736c50d 100644 --- a/paddlemix/mPLUGOwl3/modeling_navit_siglip.py +++ b/paddlemix/mPLUGOwl3/modeling_navit_siglip.py @@ -33,13 +33,14 @@ ) from paddlenlp.transformers.model_utils import PretrainedModel +from paddlemix.models.flash_attn_utils import has_flash_attn_func from paddlemix.utils.initializer import _calculate_fan_in_and_fan_out from .bert_padding import pad_input, unpad_input -from paddlemix.models.flash_attn_utils import has_flash_attn_func flash_attn_func, flash_attn_varlen_func = has_flash_attn_func() + @dataclass class PaddleAttentionMaskConverter: """ @@ -104,7 +105,7 @@ def __init__( hidden_act="gelu", layer_norm_eps=1e-06, attention_dropout=0.0, - _attn_implementation="eager", + _attn_implementation="eager", **kwargs ): super().__init__(**kwargs) diff --git a/paddlemix/mPLUGOwl3/processing_mplugowl3.py b/paddlemix/mPLUGOwl3/processing_mplugowl3.py index 9a7cb06ec..e11f790ba 100644 --- a/paddlemix/mPLUGOwl3/processing_mplugowl3.py +++ b/paddlemix/mPLUGOwl3/processing_mplugowl3.py @@ -1,31 +1,51 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys -sys.path.append('/home/aistudio/paddle_test/mPLUGOwl3/utils') -import paddle_aux + +sys.path.append("/home/aistudio/paddle_test/mPLUGOwl3/utils") import paddle +import paddle_aux import paddlenlp from paddlenlp.transformers.processing_utils import ProcessorMixin + """ Processor class for mPLUGOwl3. 
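The processor defined below consumes chat messages as role/content dictionaries and renders them into
ChatML-style text around the <|image|> media token. A minimal sketch of the expected input follows; the
sample strings are made up for illustration, and only the keys and special tokens come from the code.

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "<|image|> What is shown in the picture?"},
]
# build_text_qwen renders this roughly as
#   <|im_start|>system\nYou are a helpful assistant.<|im_end|>
#   \n<|im_start|>user\n ... <|im_end|>
# splitting each user turn on <|image|> so that media positions can be tracked
# separately from the text tokens (see MediaIndicesHelper below).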
""" -from typing import List, Optional, Union, Dict, Any -import warnings import re +import warnings +from typing import Any, Dict, List, Optional, Union + # from .image_processing_mplugowl3 import mPLUGOwl3BatchFeature, mPLUGOwl3ImageProcessor -from .image_processing_mplugowl3 import mPLUGOwl3BatchFeature, mPLUGOwl3ImageProcessor,TensorType -OWL_MEDIA_TOKEN = ['<|image|>'] +from .image_processing_mplugowl3 import ( + TensorType, + mPLUGOwl3BatchFeature, + mPLUGOwl3ImageProcessor, +) +OWL_MEDIA_TOKEN = ["<|image|>"] -class MediaIndicesHelper: - def __init__(self, tokenizer) ->None: +class MediaIndicesHelper: + def __init__(self, tokenizer) -> None: self.media_position = [] self.tokenizer = tokenizer def has_media(self, text, media_tokens=None): if media_tokens is None: media_tokens = OWL_MEDIA_TOKEN - has_media_flag = any([(media_token == text) for media_token in - media_tokens]) + has_media_flag = any([(media_token == text) for media_token in media_tokens]) if any([(media_token in text) for media_token in media_tokens]): assert has_media_flag, text return has_media_flag @@ -44,8 +64,7 @@ def add_media(self, text_chunk, text=None, tokenize_fn=None): def cal_media_offset(self, input_ids): if len(self.media_position) == 0: return paddle.ones_like(x=input_ids) * -1000000 - media_starts = paddle.to_tensor(data=[_[0] for _ in self. - media_position]).reshape(1, -1) + media_starts = paddle.to_tensor(data=[_[0] for _ in self.media_position]).reshape(1, -1) rng = paddle.arange(end=tuple(input_ids.shape)[0]).reshape(-1, 1) matrix = (rng > media_starts).sum(axis=1) return matrix @@ -63,55 +82,56 @@ class mPLUGOwl3Processor(ProcessorMixin): tokenizer ([`LlamaTokenizerWrapper`], *optional*): The tokenizer is a required input. """ - attributes = ['image_processor', 'tokenizer'] - image_processor_class = 'mPLUGOwl3ImageProcessor' - tokenizer_class = 'AutoTokenizer' - def __init__(self, image_processor: mPLUGOwl3ImageProcessor=None, - tokenizer=None, prompt_style='chatml', inference_mode=True, - addition_eod='<|endoftext|>'): + attributes = ["image_processor", "tokenizer"] + image_processor_class = "mPLUGOwl3ImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor: mPLUGOwl3ImageProcessor = None, + tokenizer=None, + prompt_style="chatml", + inference_mode=True, + addition_eod="<|endoftext|>", + ): super().__init__(image_processor, tokenizer) self.image_processor: mPLUGOwl3ImageProcessor self.prompt_style = prompt_style self.inference_mode = inference_mode - self.media_tokens = ['<|image|>'] + self.media_tokens = ["<|image|>"] self.addition_eod = addition_eod def build_text_qwen(self, messages): - im_start, im_end = '<|im_start|>', '<|im_end|>' + im_start, im_end = "<|im_start|>", "<|im_end|>" text = [] for num_turn, message in enumerate(messages): - if num_turn == 0 and message['role'] != 'system': - if self.prompt_style != 'plain': - text.append({'text': f'{im_start}system\n{im_end}', - 'label': 0}) - if message['role'] == 'system': - if self.prompt_style != 'plain': - text.append({'text': - f"{im_start}system\n{message['content']}{im_end}", - 'label': 0}) - elif message['role'] == 'user': - if self.prompt_style != 'plain': + if num_turn == 0 and message["role"] != "system": + if self.prompt_style != "plain": + text.append({"text": f"{im_start}system\n{im_end}", "label": 0}) + if message["role"] == "system": + if self.prompt_style != "plain": + text.append({"text": f"{im_start}system\n{message['content']}{im_end}", "label": 0}) + elif message["role"] == "user": + 
if self.prompt_style != "plain": content = f"\n{im_start}user\n{message['content']}{im_end}" else: - content = message['content'] - pattern = '|'.join(map(re.escape, self.media_tokens)) - chunk_strs = re.split(f'({pattern})', content) + content = message["content"] + pattern = "|".join(map(re.escape, self.media_tokens)) + chunk_strs = re.split(f"({pattern})", content) for chunk_str in chunk_strs: - text.append({'text': chunk_str, 'label': 0}) - elif message['role'] == 'assistant': - if self.prompt_style != 'plain': - text.append({'text': f'\n{im_start}assistant\n', - 'label': 0}) - text.append({'text': f"{message['content']}{im_end}", - 'label': 1}) + text.append({"text": chunk_str, "label": 0}) + elif message["role"] == "assistant": + if self.prompt_style != "plain": + text.append({"text": f"\n{im_start}assistant\n", "label": 0}) + text.append({"text": f"{message['content']}{im_end}", "label": 1}) else: - text.append({'text': f"{message['content']}", 'label': 1}) - text.append({'text': self.addition_eod, 'label': 1}) + text.append({"text": f"{message['content']}", "label": 1}) + text.append({"text": self.addition_eod, "label": 1}) else: raise NotImplementedError if self.inference_mode: - while text and text[-1]['label'] == 1: + while text and text[-1]["label"] == 1: text.pop() return text @@ -125,8 +145,8 @@ def encode_text_sft(self, texts): num_images = 0 media_helper = MediaIndicesHelper(tokenizer=self.tokenizer) for current_ti, text_chunk in enumerate(texts): - text = text_chunk['text'] - label = text_chunk['label'] + text = text_chunk["text"] + label = text_chunk["label"] if not media_helper.has_media(text): curr_chunk = self.wrapped_tokenize(text) if label == 1: @@ -138,103 +158,105 @@ def encode_text_sft(self, texts): enc_chunk += curr_chunk label_chunk += [label] * len(curr_chunk) else: - add_length = media_helper.add_media(enc_chunk, text=text, - tokenize_fn=self.wrapped_tokenize) + add_length = media_helper.add_media(enc_chunk, text=text, tokenize_fn=self.wrapped_tokenize) enc_length += add_length label_chunk += [label] * add_length num_images += 1 - enc_chunk = paddle.to_tensor(data=enc_chunk).astype(dtype='int64') + enc_chunk = paddle.to_tensor(data=enc_chunk).astype(dtype="int64") media_offset = [] media_before = 0 for i, _ in enumerate([media_helper]): mo = _.cal_media_offset(enc_chunk) - media_offset.append(paddle.concat(x=[(paddle.ones(shape=[tuple( - mo.shape)[0], 1]) * media_before).astype(dtype='int64').to( - mo.place), (mo + media_before).unsqueeze(axis=1)], axis=1)) + media_offset.append( + paddle.concat( + x=[ + (paddle.ones(shape=[tuple(mo.shape)[0], 1]) * media_before).astype(dtype="int64").to(mo.place), + (mo + media_before).unsqueeze(axis=1), + ], + axis=1, + ) + ) media_before += _.len_images() media_offset = paddle.stack(x=media_offset, axis=0) - return {'input_ids': enc_chunk.unsqueeze(axis=0), 'media_offset': - media_offset} + return {"input_ids": enc_chunk.unsqueeze(axis=0), "media_offset": media_offset} - def __call__(self, messages, images=None, videos=None, max_length: - Optional[int]=None, cut_enable=True, + def __call__( + self, + messages, + images=None, + videos=None, + max_length: Optional[int] = None, + cut_enable=True, # return_tensors: Optional[Union[str, transformers.utils.TensorType]]=transformers.utils.TensorType.PYTORCH, **kwargs) ->mPLUGOwl3BatchFeature: - return_tensors: Optional[Union[str, TensorType]]=TensorType.PADDLE, **kwargs) ->mPLUGOwl3BatchFeature: + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs + ) 
-> mPLUGOwl3BatchFeature: medias = [] if videos is not None: - medias.extend([{'type': 'video', 'content': video, - 'use_video_span': True} for video in videos]) + medias.extend([{"type": "video", "content": video, "use_video_span": True} for video in videos]) if images is not None: - medias.extend([{'type': 'image', 'content': image} for image in - images]) + medias.extend([{"type": "image", "content": image} for image in images]) if len(medias): image_tensor_list = [] - pattern = '(<\\|image\\|>|<\\|video\\|>)' + pattern = "(<\\|image\\|>|<\\|video\\|>)" image_token_ptr = 0 media_layout = [] for message in messages: - text_list = re.split(pattern, message['content']) - text = '' + text_list = re.split(pattern, message["content"]) + text = "" for text_content in text_list: - if text_content in ['<|image|>', '<|video|>']: + if text_content in ["<|image|>", "<|video|>"]: media_item = medias[image_token_ptr] image_token_ptr += 1 - if text_content == '<|image|>': - assert media_item['type'] == 'image' - image = media_item['content'] - image_inputs = self.image_processor([image], - cut_enable=cut_enable, return_tensors= - return_tensors) - if image_inputs.get('cut_shape', None) is not None: - cut_shape = image_inputs['cut_shape'] - cut_text = (self.image_processor. - cut_prompt_template(img_token= - '<|image|>', h=cut_shape[0][0], w= - cut_shape[0][1])) + if text_content == "<|image|>": + assert media_item["type"] == "image" + image = media_item["content"] + image_inputs = self.image_processor( + [image], cut_enable=cut_enable, return_tensors=return_tensors + ) + if image_inputs.get("cut_shape", None) is not None: + cut_shape = image_inputs["cut_shape"] + cut_text = self.image_processor.cut_prompt_template( + img_token="<|image|>", h=cut_shape[0][0], w=cut_shape[0][1] + ) text += cut_text - image_tensor_list.append(image_inputs[ - 'pixel_values']) + image_tensor_list.append(image_inputs["pixel_values"]) else: text += text_content - elif text_content == '<|video|>': - assert media_item['type'] == 'video' - video = media_item['content'] - use_video_span = media_item['use_video_span'] - image_tensor = self.image_processor(video, - cut_enable=False)['pixel_values'] + elif text_content == "<|video|>": + assert media_item["type"] == "video" + video = media_item["content"] + use_video_span = media_item["use_video_span"] + image_tensor = self.image_processor(video, cut_enable=False)["pixel_values"] image_tensor_list.append(image_tensor) num_video_frame = tuple(image_tensor.shape)[0] if use_video_span: - text_content = ('<|start_video_frame|>' + - '<|image|>' * num_video_frame + - '<|end_video_frame|>') + text_content = ( + "<|start_video_frame|>" + "<|image|>" * num_video_frame + "<|end_video_frame|>" + ) else: - text_content = '<|image|>' * num_video_frame + text_content = "<|image|>" * num_video_frame text += text_content else: text += text_content - message['content'] = text - assert image_token_ptr == len(medias), (image_token_ptr, len( - medias)) + message["content"] = text + assert image_token_ptr == len(medias), (image_token_ptr, len(medias)) assert all(len(tuple(_.shape)) == 4 for _ in image_tensor_list), [ - tuple(_.shape) for _ in image_tensor_list] - num_image_tokens = sum([_['content'].count('<|image|>') for _ in - messages]) - num_image_shapes = sum([tuple(_.shape)[0] for _ in - image_tensor_list]) - assert num_image_tokens == num_image_shapes, (messages, [tuple( - _.shape) for _ in image_tensor_list]) + tuple(_.shape) for _ in image_tensor_list + ] + num_image_tokens = 
sum([_["content"].count("<|image|>") for _ in messages]) + num_image_shapes = sum([tuple(_.shape)[0] for _ in image_tensor_list]) + assert num_image_tokens == num_image_shapes, (messages, [tuple(_.shape) for _ in image_tensor_list]) image_tensor_list = paddle.concat(x=image_tensor_list, axis=0) text = self.build_text_qwen(messages) model_inputs = self.encode_text_sft(text) if len(medias) is not None: - model_inputs.update({'pixel_values': image_tensor_list}) + model_inputs.update({"pixel_values": image_tensor_list}) return mPLUGOwl3BatchFeature(model_inputs) def check_media(self, images, messages): media_num = 0 if images is None else len(images) - media_count = sum([message['content'].count('<|image|>') for - message in messages]) + media_count = sum([message["content"].count("<|image|>") for message in messages]) assert media_num == media_count def batch_decode(self, *args, **kwargs): @@ -250,8 +272,7 @@ def batch_decode(self, *args, **kwargs): result = result[1:] if result[-1] == self.tokenizer.eos_id: result = result[:-1] - result_text.append(self.tokenizer.decode(result, *args[1:], ** - kwargs).strip()) + result_text.append(self.tokenizer.decode(result, *args[1:], **kwargs).strip()) return result_text def decode(self, *args, **kwargs): @@ -263,45 +284,45 @@ def decode(self, *args, **kwargs): result = result[result != 0] if result[0] == self.tokenizer.bos_id: result = result[1:] - if result[-1] == self.tokenizer.eos_id or hasattr(self.tokenizer, - 'eot_id') and result[-1] == self.tokenizer.eot_id: + if ( + result[-1] == self.tokenizer.eos_id + or hasattr(self.tokenizer, "eot_id") + and result[-1] == self.tokenizer.eot_id + ): result = result[:-1] return self.tokenizer.decode(result, *args[1:], **kwargs).strip() - def _convert(self, input_str, max_inp_length: Optional[int]=None): - if self.version > 2.5 or not getattr(self.tokenizer, - 'add_bos_token', False): + def _convert(self, input_str, max_inp_length: Optional[int] = None): + if self.version > 2.5 or not getattr(self.tokenizer, "add_bos_token", False): input_ids = self.tokenizer.encode(input_str) else: - input_ids = [self.tokenizer.bos_id] + self.tokenizer.encode( - input_str) + input_ids = [self.tokenizer.bos_id] + self.tokenizer.encode(input_str) if max_inp_length is not None: input_ids = input_ids[:max_inp_length] - input_ids = paddle.to_tensor(data=input_ids, dtype='int32') - start_cond = (input_ids == self.tokenizer.im_start_id) | (input_ids == - self.tokenizer.slice_start_id) - end_cond = (input_ids == self.tokenizer.im_end_id) | (input_ids == - self.tokenizer.slice_end_id) -# >>>>>> image_start_tokens = torch.where(start_cond)[0] - image_start_tokens = paddle.nonzero(start_cond)[:,0] + input_ids = paddle.to_tensor(data=input_ids, dtype="int32") + start_cond = (input_ids == self.tokenizer.im_start_id) | (input_ids == self.tokenizer.slice_start_id) + end_cond = (input_ids == self.tokenizer.im_end_id) | (input_ids == self.tokenizer.slice_end_id) + # >>>>>> image_start_tokens = torch.where(start_cond)[0] + image_start_tokens = paddle.nonzero(start_cond)[:, 0] image_start_tokens += 1 -# >>>>>> image_end_tokens = torch.where(end_cond)[0] - image_end_tokens = paddle.nonzero(end_cond)[:,0] + # >>>>>> image_end_tokens = torch.where(end_cond)[0] + image_end_tokens = paddle.nonzero(end_cond)[:, 0] valid_image_nums = max(len(image_start_tokens), len(image_end_tokens)) - image_bounds = paddle.hstack(x=[image_start_tokens[: - valid_image_nums].unsqueeze(axis=-1), image_end_tokens[: - valid_image_nums].unsqueeze(axis=-1)]) + image_bounds 
= paddle.hstack( + x=[ + image_start_tokens[:valid_image_nums].unsqueeze(axis=-1), + image_end_tokens[:valid_image_nums].unsqueeze(axis=-1), + ] + ) return input_ids, image_bounds @property def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names - return list(dict.fromkeys(tokenizer_input_names + - image_processor_input_names)) + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) - def pad(self, inputs, max_length=None, padding_value=0, padding_side='left' - ): + def pad(self, inputs, max_length=None, padding_value=0, padding_side="left"): items = [] if isinstance(inputs[0], list): assert isinstance(inputs[0][0], paddle.Tensor) @@ -317,32 +338,28 @@ def pad(self, inputs, max_length=None, padding_value=0, padding_side='left' assert dim <= 2 if max_length is None: max_length = 0 - max_length = max(max_length, max(tuple(item.shape)[-1] for item in - items)) + max_length = max(max_length, max(tuple(item.shape)[-1] for item in items)) min_length = min(tuple(item.shape)[-1] for item in items) dtype = items[0].dtype if dim == 0: return paddle.stack(x=[item for item in items], axis=0), [0] elif dim == 1: if max_length == min_length: - return paddle.stack(x=[item for item in items], axis=0), [0 - ] * batch_size - tensor = paddle.zeros(shape=(batch_size, max_length), dtype=dtype - ) + padding_value + return paddle.stack(x=[item for item in items], axis=0), [0] * batch_size + tensor = paddle.zeros(shape=(batch_size, max_length), dtype=dtype) + padding_value else: - tensor = paddle.zeros(shape=(batch_size, max_length, shape[-1]), - dtype=dtype) + padding_value + tensor = paddle.zeros(shape=(batch_size, max_length, shape[-1]), dtype=dtype) + padding_value padding_length = [] for i, item in enumerate(items): if dim == 1: - if padding_side == 'left': - tensor[i, -len(item):] = item.clone() + if padding_side == "left": + tensor[i, -len(item) :] = item.clone() else: - tensor[i, :len(item)] = item.clone() + tensor[i, : len(item)] = item.clone() elif dim == 2: - if padding_side == 'left': - tensor[i, -len(item):, :] = item.clone() + if padding_side == "left": + tensor[i, -len(item) :, :] = item.clone() else: - tensor[i, :len(item), :] = item.clone() + tensor[i, : len(item), :] = item.clone() padding_length.append(tuple(tensor.shape)[-1] - len(item)) return tensor, padding_length diff --git a/paddlemix/mPLUGOwl3/utils/paddle_aux.py b/paddlemix/mPLUGOwl3/utils/paddle_aux.py index 0e8d9e56c..ab3de5d4f 100644 --- a/paddlemix/mPLUGOwl3/utils/paddle_aux.py +++ b/paddlemix/mPLUGOwl3/utils/paddle_aux.py @@ -1,15 +1,30 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # This file is generated by PaConvert ToolKit, please Don't edit it! 
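+# The reshape wrapper below adds the torch-style calling convention to
+# paddle.Tensor, so converted code can pass dims either positionally or as a
+# single list/tuple. A minimal usage sketch (illustrative only):
+#   x = paddle.ones([2, 3])
+#   y = x.reshape(3, 2)    # torch-style positional dims
+#   z = x.reshape([3, 2])  # paddle-style shape list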
import paddle + def reshape(self, *args, **kwargs): if args: - if len(args)==1 and isinstance(args[0], (tuple, list)): + if len(args) == 1 and isinstance(args[0], (tuple, list)): return paddle.reshape(self, args[0]) else: return paddle.reshape(self, list(args)) elif kwargs: - assert 'shape' in kwargs - return paddle.reshape(self, shape=kwargs['shape']) + assert "shape" in kwargs + return paddle.reshape(self, shape=kwargs["shape"]) + -setattr(paddle.Tensor, 'reshape', reshape) +setattr(paddle.Tensor, "reshape", reshape) diff --git a/paddlemix/mPLUGOwl3/x_sdpa.py b/paddlemix/mPLUGOwl3/x_sdpa.py index 172efffba..4984d0e15 100644 --- a/paddlemix/mPLUGOwl3/x_sdpa.py +++ b/paddlemix/mPLUGOwl3/x_sdpa.py @@ -1,31 +1,42 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import paddle -from icecream import ic from einops import rearrange +from icecream import ic class ScaleDotProductAttention(paddle.nn.Layer): - - def __init__(self, layer_number, causal=False, softmax_scale=None, - attention_dropout=0.0): + def __init__(self, layer_number, causal=False, softmax_scale=None, attention_dropout=0.0): super().__init__() self.layer_number = layer_number self.causal = causal self.softmax_scale = softmax_scale self.dropout_p = attention_dropout - def forward(self, q, k, v, attn_mask=None, order='sbhd'): + def forward(self, q, k, v, attn_mask=None, order="sbhd"): """Implements the multihead softmax attention. Arguments --------- q, k, v: The tensor containing the query, key, and value. 
(B, S, H, D) """ - if order == 'sbhd': - q, k, v = [rearrange(x, 's b h d -> b h s d').contiguous() for - x in (q, k, v)] - elif order == 'bhsd': + if order == "sbhd": + q, k, v = [rearrange(x, "s b h d -> b h s d").contiguous() for x in (q, k, v)] + elif order == "bhsd": pass if attn_mask is not None: - attn_mask = (~attn_mask.clone().astype(dtype='bool')).contiguous() + attn_mask = (~attn_mask.clone().astype(dtype="bool")).contiguous() else: attn_mask = None if self.training: @@ -39,11 +50,11 @@ def forward(self, q, k, v, attn_mask=None, order='sbhd'): else: is_causal = self.causal dropout_p = 0.0 - assert self.softmax_scale == None or self.softmax_scale == paddle.utils.try_import( - 'math').sqrt(q.shape[-1] - ), 'Fault: The scale parameter defaults to the square root of the last dimension of query, not allowed manually set' - o = paddle.nn.functional.scaled_dot_product_attention(query=q, key= - k, value=v, attn_mask=attn_mask, dropout_p=dropout_p, is_causal - =is_causal) - o = rearrange(o, 'B Head L D -> L B (Head D)').contiguous() + assert self.softmax_scale == None or self.softmax_scale == paddle.utils.try_import("math").sqrt( + q.shape[-1] + ), "Fault: The scale parameter defaults to the square root of the last dimension of query, not allowed manually set" + o = paddle.nn.functional.scaled_dot_product_attention( + query=q, key=k, value=v, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal + ) + o = rearrange(o, "B Head L D -> L B (Head D)").contiguous() return o From 4e3b706c9876d74488c3cb46f8b56d1b385abedd Mon Sep 17 00:00:00 2001 From: "nemonameless@qq.com@github.com" Date: Sun, 8 Dec 2024 11:55:37 +0000 Subject: [PATCH 3/8] align codes after 241101 --- 0.sh | 1 + paddlemix/examples/mPLUG_Owl3/README.md | 50 + paddlemix/examples/mPLUG_Owl3/requirement.txt | 3 + .../examples/mPLUG_Owl3/run_inference.py | 48 + .../mPLUG_Owl3/run_inference_video.py | 77 + .../mPLUGOwl3/image_processing_mplugowl3.py | 544 ------ paddlemix/mPLUGOwl3/imagetest.py | 43 - paddlemix/mPLUGOwl3/modeling_hyper_qwen2.py | 1568 ----------------- paddlemix/mPLUGOwl3/utils/paddle_aux.py | 30 - paddlemix/mPLUGOwl3/x_sdpa.py | 60 - paddlemix/{ => models}/mPLUGOwl3/__init__.py | 5 +- .../{ => models}/mPLUGOwl3/activations.py | 0 .../{ => models}/mPLUGOwl3/bert_padding.py | 0 .../mPLUGOwl3/configuration_hyper_qwen2.py | 24 +- .../mPLUGOwl3/configuration_mplugowl3.py | 35 +- .../mPLUGOwl3/image_processing_mplugowl3.py | 489 +++++ .../models/mPLUGOwl3/modeling_hyper_qwen2.py | 1027 +++++++++++ .../mPLUGOwl3/modeling_mplugowl3.py | 171 +- .../mPLUGOwl3/modeling_navit_siglip.py | 98 +- .../mPLUGOwl3/processing_mplugowl3.py | 280 +-- 20 files changed, 2022 insertions(+), 2531 deletions(-) create mode 100644 0.sh create mode 100644 paddlemix/examples/mPLUG_Owl3/README.md create mode 100644 paddlemix/examples/mPLUG_Owl3/requirement.txt create mode 100644 paddlemix/examples/mPLUG_Owl3/run_inference.py create mode 100644 paddlemix/examples/mPLUG_Owl3/run_inference_video.py delete mode 100644 paddlemix/mPLUGOwl3/image_processing_mplugowl3.py delete mode 100644 paddlemix/mPLUGOwl3/imagetest.py delete mode 100644 paddlemix/mPLUGOwl3/modeling_hyper_qwen2.py delete mode 100644 paddlemix/mPLUGOwl3/utils/paddle_aux.py delete mode 100644 paddlemix/mPLUGOwl3/x_sdpa.py rename paddlemix/{ => models}/mPLUGOwl3/__init__.py (94%) rename paddlemix/{ => models}/mPLUGOwl3/activations.py (100%) rename paddlemix/{ => models}/mPLUGOwl3/bert_padding.py (100%) rename paddlemix/{ => models}/mPLUGOwl3/configuration_hyper_qwen2.py (91%) 
 rename paddlemix/{ => models}/mPLUGOwl3/configuration_mplugowl3.py (65%)
 create mode 100644 paddlemix/models/mPLUGOwl3/image_processing_mplugowl3.py
 create mode 100644 paddlemix/models/mPLUGOwl3/modeling_hyper_qwen2.py
 rename paddlemix/{ => models}/mPLUGOwl3/modeling_mplugowl3.py (59%)
 rename paddlemix/{ => models}/mPLUGOwl3/modeling_navit_siglip.py (93%)
 rename paddlemix/{ => models}/mPLUGOwl3/processing_mplugowl3.py (57%)

diff --git a/0.sh b/0.sh
new file mode 100644
index 000000000..1ce640685
--- /dev/null
+++ b/0.sh
@@ -0,0 +1 @@
+CUDA_VISIBLE_DEVICES=7 python paddlemix/examples/mPLUG_Owl3/run_inference.py
diff --git a/paddlemix/examples/mPLUG_Owl3/README.md b/paddlemix/examples/mPLUG_Owl3/README.md
new file mode 100644
index 000000000..13ed53c03
--- /dev/null
+++ b/paddlemix/examples/mPLUG_Owl3/README.md
@@ -0,0 +1,50 @@
+# mPLUG-Owl3
+
+## 1. Model Introduction
+
+**Model weights supported in this repository:**
+
+| Model |
+|--------------------|
+| mPLUG/mPLUG-Owl3-7B-241101 |
+
+Note: the name matches the Hugging Face weights, but the tensors are stored in the Paddle format. Calling `xxx.from_pretrained("mPLUG/mPLUG-Owl3-7B-241101")` automatically downloads the weight folder into the cache directory.
+
+
+## 2. Environment Setup
+
+1) [Install the PaddleMIX dependencies](https://github.com/PaddlePaddle/PaddleMIX/tree/develop?tab=readme-ov-file#%E5%AE%89%E8%A3%85)
+
+2) pip install pillow tqdm paddlenlp==3.0.0b2
+
+Note: Python 3.10 or later is recommended.
+
+## 3. Quick Start
+
+### Inference
+```bash
+# Image understanding
+python paddlemix/examples/mPLUG_Owl3/run_inference.py
+
+# Video understanding
+python paddlemix/examples/mPLUG_Owl3/run_inference_video.py
+```
+
+### Results
+
+
+
+### References
+```BibTeX
+@misc{ye2024mplugowl3longimagesequenceunderstanding,
+      title={mPLUG-Owl3: Towards Long Image-Sequence Understanding in Multi-Modal Large Language Models},
+      author={Jiabo Ye and Haiyang Xu and Haowei Liu and Anwen Hu and Ming Yan and Qi Qian and Ji Zhang and Fei Huang and Jingren Zhou},
+      year={2024},
+      eprint={2408.04840},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2408.04840},
+}
+```
diff --git a/paddlemix/examples/mPLUG_Owl3/requirement.txt b/paddlemix/examples/mPLUG_Owl3/requirement.txt
new file mode 100644
index 000000000..c1cc9aebb
--- /dev/null
+++ b/paddlemix/examples/mPLUG_Owl3/requirement.txt
@@ -0,0 +1,3 @@
+pillow
+tqdm
+paddlenlp==3.0.0b2
\ No newline at end of file
diff --git a/paddlemix/examples/mPLUG_Owl3/run_inference.py b/paddlemix/examples/mPLUG_Owl3/run_inference.py
new file mode 100644
index 000000000..fd0d741c0
--- /dev/null
+++ b/paddlemix/examples/mPLUG_Owl3/run_inference.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
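+# Single-image inference example for mPLUG-Owl3 (see README.md in this folder).
+# Flow: load the config, model and tokenizer, build the processor from the model,
+# put an <|image|> placeholder in the user turn, then call model.generate().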
+
+from PIL import Image
+import paddle
+from paddlenlp.transformers import Qwen2Tokenizer
+from paddlemix.models.mPLUGOwl3.configuration_mplugowl3 import mPLUGOwl3Config
+from paddlemix.models.mPLUGOwl3.modeling_mplugowl3 import mPLUGOwl3Model
+# from paddlemix.models.mPLUGOwl3.processing_mplugowl3 import mPLUGOwl3Processor
+# from paddlemix.models.mPLUGOwl3.image_processing_mplugowl3 import mPLUGOwl3ImageProcessor
+
+model_path = 'mPLUG/mPLUG-Owl3-7B-241101'
+# model_path = 'mPLUG-Owl3-7B-241101'  # or point to a local checkpoint directory
+
+config = mPLUGOwl3Config.from_pretrained(model_path)
+print(config)
+model = mPLUGOwl3Model.from_pretrained(model_path, dtype=paddle.bfloat16).eval()
+tokenizer = Qwen2Tokenizer.from_pretrained(model_path)
+processor = model.init_processor(tokenizer)
+
+image = Image.new('RGB', (500, 500), color='red')
+
+messages = [
+    {"role": "user", "content": """<|image|>Describe this image."""},
+    {"role": "assistant", "content": ""}
+]
+
+inputs = processor(messages, images=[image], videos=None)
+
+inputs.update({
+    'tokenizer': tokenizer,
+    'max_new_tokens':100,
+    'decode_text':True,
+})
+
+g = model.generate(**inputs)
+print(g)
diff --git a/paddlemix/examples/mPLUG_Owl3/run_inference_video.py b/paddlemix/examples/mPLUG_Owl3/run_inference_video.py
new file mode 100644
index 000000000..778c9cb73
--- /dev/null
+++ b/paddlemix/examples/mPLUG_Owl3/run_inference_video.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
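+# Video inference example for mPLUG-Owl3. Frames are sampled from the input video
+# with decord at roughly one frame per second (capped at MAX_NUM_FRAMES) and fed to
+# the processor together with a <|video|> placeholder in the prompt.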
+
+from PIL import Image
+import paddle
+from decord import VideoReader, cpu  # pip install decord
+from paddlenlp.transformers import Qwen2Tokenizer
+
+from paddlemix.models.mPLUGOwl3.configuration_mplugowl3 import mPLUGOwl3Config
+from paddlemix.models.mPLUGOwl3.modeling_mplugowl3 import mPLUGOwl3Model
+
+model_path = 'mPLUG/mPLUG-Owl3-7B-241101'
+
+config = mPLUGOwl3Config.from_pretrained(model_path)
+print(config)
+model = mPLUGOwl3Model.from_pretrained(model_path, dtype=paddle.bfloat16).eval()
+tokenizer = Qwen2Tokenizer.from_pretrained(model_path)
+processor = model.init_processor(tokenizer)
+
+messages = [
+    {"role": "user", "content": """<|video|>
+Describe this video."""},
+    {"role": "assistant", "content": ""}
+]
+
+videos = ['/nas-mmu-data/examples/car_room.mp4']
+
+MAX_NUM_FRAMES = 16
+
+def encode_video(video_path):
+    # Uniformly sample at most MAX_NUM_FRAMES frames from the video.
+    def uniform_sample(l, n):
+        gap = len(l) / n
+        idxs = [int(i * gap + gap / 2) for i in range(n)]
+        return [l[i] for i in idxs]
+
+    vr = VideoReader(video_path, ctx=cpu(0))
+    sample_fps = round(vr.get_avg_fps() / 1)  # 1 FPS
+    frame_idx = [i for i in range(0, len(vr), sample_fps)]
+    if len(frame_idx) > MAX_NUM_FRAMES:
+        frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
+    frames = vr.get_batch(frame_idx).asnumpy()
+    frames = [Image.fromarray(v.astype('uint8')) for v in frames]
+    print('num frames:', len(frames))
+    return frames
+
+video_frames = [encode_video(_) for _ in videos]
+inputs = processor(messages, images=None, videos=video_frames)
+
+inputs.update({
+    'tokenizer': tokenizer,
+    'max_new_tokens': 100,
+    'decode_text': True,
+})
+
+g = model.generate(**inputs)
+print(g)
diff --git a/paddlemix/mPLUGOwl3/image_processing_mplugowl3.py b/paddlemix/mPLUGOwl3/image_processing_mplugowl3.py
deleted file mode 100644
index de801016d..000000000
--- a/paddlemix/mPLUGOwl3/image_processing_mplugowl3.py
+++ /dev/null
@@ -1,544 +0,0 @@
-# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -import sys - -sys.path.append("/home/aistudio/paddle_test/mPLUGOwl3/utils") -import math -import random -from enum import Enum -from typing import Any, Dict, List, Optional, Union - -import numpy as np - -# import paddle_aux -import paddle -import paddle.nn.functional as F -import paddlenlp -import PIL -import PIL.Image -import PIL.ImageSequence -from einops import rearrange, repeat -from paddlenlp.transformers.image_processing_utils import ( - BaseImageProcessor, - BatchFeature, -) -from PIL import Image - - -def recursive_converter(converter, value): - if isinstance(value, list): - new_value = [] - for v in value: - new_value += [recursive_converter(converter, v)] - return new_value - else: - return converter(value) - - -def box_area(boxes): - # 获取边界框的宽度和高度 - width = boxes[:, 2] - boxes[:, 0] - height = boxes[:, 3] - boxes[:, 1] - # 计算面积 - area = width * height - return area - - -def custom_max(a, b): - return paddle.where(a > b, a, b) - - -def custom_min(a, b): - return paddle.where(a < b, a, b) - - -def box_iou(boxes1, area1, boxes2, eps=1e-05): - # >>>>>> area2 = torchvision.ops.boxes.box_area(boxes2) - area1 = area1.astype("float32") - boxes1 = boxes1.astype("float32") - boxes2 = boxes2.astype("float32") - - area2 = box_area(boxes2).astype("float32") - lt = custom_max(boxes1[:, None, :2], boxes2[:, :2]) - rb = custom_min(boxes1[:, None, 2:], boxes2[:, 2:]) - wh = (rb - lt).clip(min=0) - inter = wh[:, :, 0] * wh[:, :, 1] - union = area1[:, None] + area2 - inter - iou = inter / (union + eps) - return iou, union - - -available_anchor_strategy = ["docowl", "random", "highest", "last", "llava"] -grid_dict = { - "grid_33": [ - (1, 1), - (1, 2), - (2, 1), - (1, 3), - (3, 1), - (2, 2), - (1, 4), - (4, 1), - (1, 5), - (5, 1), - (1, 6), - (6, 1), - (2, 3), - (3, 2), - (1, 7), - (7, 1), - (4, 2), - (2, 4), - (1, 8), - (8, 1), - (3, 3), - (1, 9), - (9, 1), - ], - "grid_squ_3x3": [(1, 1), (2, 2), (3, 3)], - "grid_squ_4": [(2, 2), (1, 3), (1, 4), (3, 1), (4, 1)], - "grid_squ_6": [(2, 2), (1, 3), (1, 4), (3, 1), (4, 1), (2, 3), (3, 2)], - "grid_squ_2": [(2, 1)], - "grid_squ_9": [ - (1, 1), - (1, 2), - (2, 1), - (1, 3), - (3, 1), - (2, 2), - (1, 4), - (4, 1), - (1, 5), - (5, 1), - (1, 6), - (6, 1), - (2, 3), - (3, 2), - (1, 7), - (7, 1), - (4, 2), - (2, 4), - (1, 8), - (8, 1), - (3, 3), - (1, 9), - (9, 1), - ], -} -cut_prompt_template_dict = { - "v0": lambda img_token, h, w: f"".join([f"{img_token}" for i in range(h) for j in range(w)]), - "v1": lambda img_token, h, w: f"Cut to {h} rows {w} columns, " - + " ".join([f"subimg({i},{j}){img_token}" for i in range(h) for j in range(w)]), - "v1_global": lambda img_token, h, w: f"Cut to {h} rows {w} columns with a global view, " - + " ".join([f"subimg({i},{j}){img_token}" for i in range(h) for j in range(w)] + [f"global_view{img_token}"]), - "v2_global": lambda img_token, h, w: f"""Cut to {h} rows {w} columns with a global view -""" - + "\n".join([" ".join([f"subimg({i},{j}){img_token}" for j in range(w)]) for i in range(h)]) - + f""" -global_view{img_token}""", - "v3": lambda img_token, h, w: f"<|start_cut|>{h}*{w}" - + " ".join([f"{img_token}" for i in range(h) for j in range(w)]) - + "<|end_cut|>", - "v3_global": lambda img_token, h, w: f"""<|start_cut|>{h}*{w} -""" - + "\n".join([" ".join([f"{img_token}" for j in range(w)]) for i in range(h)]) - + f""" -{img_token}<|end_cut|>""", -} - - -def anchor_rank(anchors, anchors_areas, input_image_size, eps=1e-05): - input_image_bbox = paddle.to_tensor(data=[0, 0, input_image_size[1], 
input_image_size[0]]).unsqueeze(axis=0) - boxes1 = anchors - boxes2 = input_image_bbox - boxes3 = anchors.clone() - boxes3[:, 3] = input_image_size[0] / input_image_size[1] * anchors[:, 2] - area1 = anchors_areas - iou, _ = box_iou(boxes1, area1, boxes2) - iou = iou.squeeze(axis=1) - shape_iou, _ = box_iou(boxes1, area1, boxes3) - shape_iou = shape_iou.diag() - index = paddle.argmax(x=shape_iou * 100 + iou, axis=0) - return index - - -def select_best_resolution(anchors, anchors_areas, input_image_size): - """ - Selects the best resolution from a list of possible resolutions based on the original size. - - Args: - original_size (tuple): The original size of the image in the format (width, height). - possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. - - Returns: - tuple: The best fit resolution in the format (width, height). - """ - original_size = input_image_size[1], input_image_size[0] - possible_resolutions = [(_[2], _[3]) for _ in anchors] - original_width, original_height = original_size - best_fit = None - max_effective_resolution = 0 - min_wasted_resolution = float("inf") - index = 0 - for i, (width, height) in enumerate(possible_resolutions): - scale = min(width / original_width, height / original_height) - downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale) - effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height) - wasted_resolution = width * height - effective_resolution - if ( - effective_resolution > max_effective_resolution - or effective_resolution == max_effective_resolution - and wasted_resolution < min_wasted_resolution - ): - max_effective_resolution = effective_resolution - min_wasted_resolution = wasted_resolution - best_fit = width, height - index = i - return index - - -def build_cut_shape_indices(cut_shape): - cut_shape_indices = [] - for shape in cut_shape: - n = shape[0] * shape[1] - indices = paddle.concat( - x=[repeat(paddle.to_tensor(data=shape), "l -> n l", n=n), paddle.arange(end=n).unsqueeze(axis=1)], axis=1 - ) - assert tuple(indices.shape)[0] == n - assert tuple(indices.shape)[1] == 3 - cut_shape_indices.append(indices) - cut_shape_indices = paddle.concat(x=cut_shape_indices, axis=0).astype(dtype="int64") - return cut_shape_indices - - -class AnchorResize(paddle.nn.Layer): - - # >>>>>> def __init__(self, image_size, anchors, interpolation=torchvision. - # transforms.transforms.InterpolationMode.BILINEAR, antialias=None, - # anchor_strategy='docowl'): - def __init__(self, image_size, anchors, interpolation="bilinear", antialias=None, anchor_strategy="docowl"): - super().__init__() - self.image_size = image_size - self.anchors = paddle.to_tensor( - data=[[0, 0, _[1] * image_size[1], _[0] * image_size[0]] for _ in anchors], stop_gradient=not False - ) - # >>>>>> self.anchor_areas = torchvision.ops.boxes.box_area(self.anchors) - self.anchor_areas = box_area(self.anchors) - self.interpolation = interpolation - self.antialias = antialias - self.anchor_strategy = anchor_strategy - assert self.anchor_strategy in available_anchor_strategy - - def resize_global(self, img): - # >>>>>> return torchvision.transforms.functional.resize(img, self. - # image_size, self.interpolation, max_size=None, antialias=self. 
- # antialias) - image_np = np.array(img) - image_tensor = paddle.to_tensor(image_np, dtype="float32") - image_tensor = image_tensor.transpose([2, 0, 1]) # 变成 (3, 500, 500) - if self.interpolation == "bilinear" or "bicubic": - image_tensor = image_tensor.unsqueeze(0) # 变成 (1, 3, 500, 500) - return F.interpolate(image_tensor, size=self.image_size, mode=self.interpolation, align_corners=False) - - def forward(self, img, skip_resize=False): - """ - Args: - img (PIL Image or Tensor): Image to be scaled. - - Returns: - PIL Image or Tensor: Rescaled image. - """ - if self.anchor_strategy == "docowl": - selected_anchor = anchor_rank(self.anchors, self.anchor_areas, (img.size[1], img.size[0])) - elif self.anchor_strategy == "random": - selected_anchor = random.randint(0, len(self.anchors) - 1) - elif self.anchor_strategy == "highest": - selected_anchor = paddle.argmax( - x=self.anchors[:, 2] * self.anchors[:, 3] * 100 - paddle.abs(x=self.anchors[:, 2] - self.anchors[:, 3]) - ) - elif self.anchor_strategy == "last": - selected_anchor = len(self.anchors) - 1 - elif self.anchor_strategy == "llava": - selected_anchor = select_best_resolution(self.anchors, self.anchor_areas, (img.size[1], img.size[0])) - else: - selected_anchor = None - assert selected_anchor is not None - target_size = self.anchors[selected_anchor][2:].tolist() - if skip_resize: - return selected_anchor - # >>>>>> return torchvision.transforms.functional.resize(img, [target_size[1 - # ], target_size[0]], self.interpolation, max_size=None, - # antialias=self.antialias), selected_anchor - image_np = np.array(img) - image_tensor = paddle.to_tensor(image_np, dtype="float32") - image_tensor = image_tensor.transpose([2, 0, 1]) # 变成 (3, 500, 500) - if self.interpolation == "bilinear" or "bicubic": - image_tensor = image_tensor.unsqueeze(0) # 变成 (1, 3, 500, 500) - return ( - F.interpolate( - image_tensor, size=[target_size[1], target_size[0]], mode=self.interpolation, align_corners=False - ), - selected_anchor, - ) - - def __repr__(self) -> str: - detail = f"(size={self.image_size}, anchor={self.anchors}, interpolation={self.interpolation.value}, antialias={self.antialias})" - return f"{self.__class__.__name__}{detail}" - - -class CutMixin: - def __init__( - self, - cut_cfg={ - "anchors": "grid_squ_6", - "anchor_strategy": "docowl", - "cut_prompt": "v3", - "add_global": True, - "cut_prob": 1.0, - }, - ) -> None: - if cut_cfg is None: - self.cut_enable = False - return - else: - self.cut_enable = True - image_size = self.image_size - anchors = cut_cfg.get("anchors", "grid_33") - anchor_strategy = cut_cfg.get("anchor_strategy", "docowl") - cut_prompt = cut_cfg.get("cut_prompt", "v0") - self.cut_prob = cut_cfg.get("cut_prob", 1.0) - self.force_shape_cut = cut_cfg.get("force_shape_cut", False) - force_shape_cut_anchors = cut_cfg.get("force_shape_cut_anchors", "force_shape_cut_anchors") - self.add_global = cut_cfg.get("add_global", False) - if isinstance(image_size, int): - image_size = image_size, image_size - self.image_size = image_size - if anchors in grid_dict: - anchors = grid_dict[anchors] - else: - anchors = eval(anchors) - self.anchors = [tuple(_) for _ in anchors] - self.anchor_max = max([max(_) for _ in self.anchors]) - self.resizer = AnchorResize( - image_size=image_size, anchors=anchors, interpolation="bicubic", anchor_strategy=anchor_strategy - ) - if force_shape_cut_anchors in grid_dict: - force_shape_cut_anchors = grid_dict[force_shape_cut_anchors] - else: - force_shape_cut_anchors = eval(force_shape_cut_anchors) - 
self.force_shape_cut_anchors = [tuple(_) for _ in force_shape_cut_anchors] - self.force_shape_cut_anchors_max = max([max(_) for _ in self.force_shape_cut_anchors]) - # >>>>>> self.old_resizer = torchvision.transforms.Resize(image_size, - # interpolation=torchvision.transforms.transforms. - # InterpolationMode.BICUBIC) - self.old_resizer = paddle.vision.transforms.Resize(size=image_size, interpolation="bicubic") - # >>>>>> self.image_transform = torchvision.transforms.Compose(self. - # image_transform.transforms[1:]) - self.image_transform = paddle.vision.transforms.Compose(self.image_transform.transforms[1:]) - if self.add_global: - self.cut_prompt_template = cut_prompt_template_dict[cut_prompt + "_global"] - else: - self.cut_prompt_template = cut_prompt_template_dict[cut_prompt] - self.media_tokens = ["<|image|>", "<|video|>"] - - def _process_image(self, images): - new_images = [] - cut_shape = [] - for image in images: - print(len(images)) - raw_image = image - print(raw_image) - print("-" * 100) - image, selected_anchor = self.resizer(image) - print(image.shape) - print("-" * 100) - image_input = self.image_transform(image) - image_input = image_input[0] - print(image_input.shape) - cut_shape.append( - (tuple(image_input.shape)[1] // self.image_size[0], tuple(image_input.shape)[2] // self.image_size[1]) - ) - image_input = rearrange( - image_input, "C (num_h h) (num_w w) -> (num_h num_w) C h w", h=self.image_size[0], w=self.image_size[1] - ) - new_images.append(image_input) - print("1:", image_input.shape) - if self.add_global: - new_images.append(self.image_transform(self.resizer.resize_global(raw_image))) - print("2:", new_images[1].shape) - cut_shape.append((1, 1)) - print("cutshape:", cut_shape) - new_images = paddle.concat(x=new_images, axis=0) - cut_shape_indices = build_cut_shape_indices(cut_shape) - return new_images, cut_shape, cut_shape_indices - - -class TensorType(Enum): - PADDLE = "paddle" - TORCH = "torch" - - -# >>>>>>class mPLUGOwl3BatchFeature(transformers.image_processing_utils.BatchFeature): -class mPLUGOwl3BatchFeature(BatchFeature): - """ - Extend from BatchFeature for supporting various image size - """ - - def __init__( - self, - data: Optional[Dict[str, Any]] = None, - # tensor_type:Union[None, str, transformers.utils.TensorType]=None): - tensor_type: Union[None, str, TensorType] = None, - ): - super().__init__(data) - self.convert_to_tensors(tensor_type=tensor_type) - - # def convert_to_tensors(self, tensor_type: Optional[Union[str,transformers.utils.TensorType]]=None): - def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None): - if tensor_type is None: - return self - - is_tensor = lambda x: isinstance(x, paddle.Tensor) - as_tensor = paddle.to_tensor - - def converter(value): - try: - if not is_tensor(value): - tensor = as_tensor(value) - return tensor - except: - if key == "overflowing_values": - raise ValueError("Unable to create tensor returning overflowing values of different lengths. ") - raise ValueError( - "Unable to create tensor, you should probably activate padding with 'padding=True' to have batched tensors with the same length." 
- ) - - for key, value in self.items(): - self[key] = recursive_converter(converter, value) - return self - - def to(self, *args, **kwargs) -> "mPLUGOwl3BatchFeature": - # >>>>>> transformers.utils.requires_backends(self, ['torch']) - - def cast_tensor(v): - # if paddle.is_floating_point(x=v): - # """Class Method: *.to, can not convert, please check whether it is torch.Tensor.*/Optimizer.*/nn.Module.*/torch.distributions.Distribution.*/torch.autograd.function.FunctionCtx.*/torch.profiler.profile.*/torch.autograd.profiler.profile.*, and convert manually""" - # >>>>>> return v.to(*args, **kwargs) - # elif device is not None: - # return v.to(device=device) - # else: - # return v - if isinstance(v, paddle.Tensor): - # For floating point tensors - if v.dtype in [paddle.float32, paddle.float64]: - if "dtype" in kwargs: - v = v.cast(kwargs["dtype"]) - if "place" in kwargs: - v = v.place(kwargs["place"]) - return v - # For non-floating point tensors, only handle device - elif "place" in kwargs: - return v.place(kwargs["place"]) - return v - - new_data = {} - # Handle place (device in paddle) - place = kwargs.get("place") - if place is None and len(args) > 0: - arg = args[0] - if isinstance(arg, str) or isinstance(arg, paddle.CPUPlace) or isinstance(arg, paddle.CUDAPlace): - place = arg - else: - raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.") - - # device = kwargs.get('device') - # if device is None and len(args) > 0: - # arg = args[0] - # # >>>>>> if transformers.utils.is_torch_dtype(arg): - # if isinstance(arg, paddle.Tensor): - # pass - # # >>>>>> elif isinstance(arg, str) or transformers.utils.is_torch_device(arg - # # ) or isinstance(arg, int): - # # device = arg - # elif isinstance(arg, str): - # # 如果是字符串,可以直接使用该字符串作为设备标识 - # device = arg - # elif isinstance(arg, (int, paddle.device.Device)): - # if isinstance(arg, int): - # device = f'gpu:{arg}' if arg >= 0 else 'cpu' - # else: - # device = str(arg) - # else: - # raise ValueError( - # f'Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.' - # ) - for k, v in self.items(): - new_data[k] = recursive_converter(cast_tensor, v) - self.data = new_data - return self - - -# >>>>>>class mPLUGOwl3ImageProcessor(transformers.image_processing_utils. -# BaseImageProcessor, CutMixin): -class mPLUGOwl3ImageProcessor(BaseImageProcessor, CutMixin): - model_input_names = ["pixel_values"] - - def __init__(self, image_size, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], **kwargs): - # super().__init__(**kwargs) - self.image_size = image_size - self.mean = mean - self.std = std - # >>>>>> self.image_transform = torchvision.transforms.Compose([torchvision. - # transforms.Resize((image_size, image_size), interpolation=Image - # .BICUBIC), torchvision.transforms.ToTensor(), torchvision. 
- # transforms.Normalize(mean, std)]) - self.image_transform = paddle.vision.transforms.Compose( - [ - paddle.vision.transforms.Resize(size=(image_size, image_size), interpolation="bicubic"), - paddle.vision.transforms.ToTensor(), - paddle.vision.transforms.Normalize(mean=mean, std=std), - ] - ) - - CutMixin.__init__(self) - - def preprocess( - self, images: Union[Image.Image, List[Image.Image]], cut_enable=True, **kwargs - ) -> mPLUGOwl3BatchFeature: - if isinstance(images, Image.Image): - images_list = [images] - else: - images_list = images - if self.cut_enable and cut_enable: - image_data, cut_shape, cut_shape_indices = self._process_image(images_list) - else: - image_data = [self.image_transform(self.resizer.resize_global(image)) for image in images_list] - image_data = paddle.stack(x=image_data, axis=0) - cut_shape = cut_shape_indices = None - return mPLUGOwl3BatchFeature( - data={"pixel_values": image_data, "cut_shape": cut_shape, "cut_shape_indices": cut_shape_indices} - ) - - def to_dict(self): - # encoder_dict = super().to_dict() - encoder_dict = {} - pop_keys = ["image_transform", "resizer", "old_resizer", "cut_prompt_template"] - for pk in pop_keys: - encoder_dict.pop(pk, None) - return encoder_dict - - -# >>>>>>transformers.AutoImageProcessor.register('mPLUGOwl3ImageProcessor', -# mPLUGOwl3ImageProcessor) diff --git a/paddlemix/mPLUGOwl3/imagetest.py b/paddlemix/mPLUGOwl3/imagetest.py deleted file mode 100644 index 6fe5782e9..000000000 --- a/paddlemix/mPLUGOwl3/imagetest.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddlenlp -from decord import VideoReader, cpu -from mPLUGOwl3.configuration_mplugowl3 import mPLUGOwl3Config -from mPLUGOwl3.modeling_mplugowl3 import mPLUGOwl3Model -from paddlenlp.transformers import AutoTokenizer, PretrainedModel -from PIL import Image - -model_path = "/home/aistudio/paddle_test/mPLUGOwl3" -config = mPLUGOwl3Config.from_pretrained(model_path) -# print(config) -model = mPLUGOwl3Model.from_pretrained(model_path, config=config, dtype="float16") -model = model.eval() -tokenizer = AutoTokenizer.from_pretrained(model_path) -processor = model.init_processor(tokenizer) -image = Image.new("RGB", (500, 500), color="red") -messages = [ - { - "role": "user", - "content": """<|image|> -Describe this image.""", - }, - {"role": "assistant", "content": ""}, -] -inputs = processor(messages, images=[image], videos=None) -# inputs.to('cuda') -inputs.update({"tokenizer": tokenizer, "max_new_tokens": 100, "decode_text": True}) -g = model.generate(**inputs) -print(g) diff --git a/paddlemix/mPLUGOwl3/modeling_hyper_qwen2.py b/paddlemix/mPLUGOwl3/modeling_hyper_qwen2.py deleted file mode 100644 index a87685955..000000000 --- a/paddlemix/mPLUGOwl3/modeling_hyper_qwen2.py +++ /dev/null @@ -1,1568 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys - -sys.path.append("/home/aistudio/paddle_test/mPLUGOwl3/utils") -import paddle -import paddle_aux -import paddlenlp - -""" PyTorch Qwen2 model.""" -import inspect -import math -from typing import List, Optional, Tuple, Union - -from einops import rearrange, repeat - -from paddlemix.models.flash_attn_utils import ( - has_flash_attn_func, - is_flash_attn_available, -) - -from .activations import ACT2FN -from .bert_padding import index_first_axis, pad_input, unpad_input -from .configuration_hyper_qwen2 import HyperQwen2Config - -if is_flash_attn_available(): - flash_attn_func, flash_attn_varlen_func = has_flash_attn_func() - _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) -# >>>>>>if transformers.utils.is_flash_attn_2_available(): -# pass -# _flash_supports_window_size = 'window_size' in list(inspect.signature( -# flash_attn_func).parameters) - -from .x_sdpa import ScaleDotProductAttention - -try: - from einops import rearrange - - use_flash_rotary = True - print("use flash_attn rotary") -except ImportError: - use_flash_rotary = False - print("import flash_attn rotary fail") -logger = paddle.utils.try_import("logging").getLogger(name=__name__) -_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" -_CONFIG_FOR_DOC = "HyperQwen2Config" - - -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(axis=-1, dtype="int32") - paddle.utils.try_import("warnings").warn("Now, the return shape is inconsistent with torch when as_tuple is True") - indices = paddle.nonzero(x=attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = paddle.nn.functional.pad( - x=paddle.cumsum(x=seqlens_in_batch, axis=0, dtype="int32"), pad=(1, 0), pad_from_left_axis=False - ) - return indices, cu_seqlens, max_seqlen_in_batch - - -class Qwen2RMSNorm(paddle.nn.Layer): - def __init__(self, hidden_size, eps=1e-06): - """ - Qwen2RMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = paddle.base.framework.EagerParamBase.from_tensor(tensor=paddle.ones(shape=hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to("float32") - variance = hidden_states.pow(y=2).mean(axis=-1, keepdim=True) - hidden_states = hidden_states * paddle.rsqrt(x=variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -class Qwen2RotaryEmbedding(paddle.nn.Layer): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / self.base ** ( - paddle.arange(start=0, end=self.dim, step=2, dtype="int64").astype(dtype="float32").to(device) / self.dim - ) - self.register_buffer(name="inv_freq", tensor=inv_freq, persistable=False) - self._set_cos_sin_cache( - 
seq_len=max_position_embeddings, device=self.inv_freq.place, dtype=paddle.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = paddle.arange(dtype="int64", end=self.max_seq_len_cached).astype(dtype=self.inv_freq.dtype) - freqs = paddle.outer(x=t, y=self.inv_freq) - emb = paddle.concat(x=(freqs, freqs), axis=-1) - self.register_buffer(name="cos_cached", tensor=emb.cos().to(dtype), persistable=False) - self.register_buffer(name="sin_cached", tensor=emb.sin().to(dtype), persistable=False) - - def forward(self, x, seq_len=None): - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.place, dtype=x.dtype) - return self.cos_cached[:seq_len].to(dtype=x.dtype), self.sin_cached[:seq_len].to(dtype=x.dtype) - - -class RotaryEmbedding(paddle.nn.Layer): - def __init__(self, dim, base=10000, use_fp32=False, use_outer_in_rope=False): - super().__init__() - self.dim = dim - self.base = base - self.use_fp32 = use_fp32 - if use_fp32: - self.inv_freq = 1.0 / base ** (paddle.arange(start=0, end=dim, step=2).astype(dtype="float32") / dim) - else: - inv_freq = 1.0 / base ** (paddle.arange(start=0, end=dim, step=2).astype(dtype="float32") / dim) - self.register_buffer(name="inv_freq", tensor=inv_freq) - self._rotary_pos_emb_cache = None - self._seq_len_cached = 0 - self.use_outer_in_rope = use_outer_in_rope - self._ntk_alpha_cached = 1.0 - - def update_rotary_pos_emb_cache(self, max_seq_len, offset=0, ntk_alpha=1.0): - seqlen = max_seq_len + offset - if seqlen > self._seq_len_cached or ntk_alpha != self._ntk_alpha_cached: - base = self.base * ntk_alpha ** (self.dim / (self.dim - 2)) - self.inv_freq = 1.0 / base ** ( - paddle.arange(start=0, end=self.dim, step=2).astype(dtype="float32") / self.dim - ) - self._seq_len_cached = seqlen - self._ntk_alpha_cached = ntk_alpha - seq = paddle.arange(end=seqlen) - if self.use_outer_in_rope: - freqs = paddle.outer(x=seq.astype(dtype=self.inv_freq.dtype), y=self.inv_freq) - else: - freqs = einsum("i , j -> i j", seq.astype(dtype=self.inv_freq.dtype), self.inv_freq) - emb = paddle.concat(x=(freqs, freqs), axis=-1) - from einops import rearrange - - self._rotary_pos_emb_cache = rearrange(emb, "n d -> n 1 1 d") - - def forward(self, max_seq_len, offset=0, ntk_alpha=1.0): - self.update_rotary_pos_emb_cache(max_seq_len, offset, ntk_alpha) - return self._rotary_pos_emb_cache[offset : offset + max_seq_len] - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : tuple(x.shape)[-1] // 2] - x2 = x[..., tuple(x.shape)[-1] // 2 :] - return paddle.concat(x=(-x2, x1), axis=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. 
For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. - """ - cos = cos[position_ids].unsqueeze(axis=unsqueeze_dim) - sin = sin[position_ids].unsqueeze(axis=unsqueeze_dim) - # print(q.shape) - # print('-----------------') - # print((rotate_half(q) * sin).shape) - q_embed = q * cos + rotate_half(q) * sin - k_embed = k * cos + rotate_half(k) * sin - return q_embed, k_embed - - -# Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position -def _prepare_4d_causal_attention_mask_with_cache_position( - attention_mask: paddle.Tensor, - sequence_length: int, - target_length: int, - dtype: paddle.dtype, - min_dtype: float, - cache_position: paddle.Tensor, - batch_size: int, -): - """ - Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape - `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. - - Args: - attention_mask (`paddle.Tensor`): - A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. - sequence_length (`int`): - The sequence length being processed. - target_length (`int`): - The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. - dtype (`paddle.dtype`): - The dtype to use for the 4D attention mask. - min_dtype (`float`): - The minimum value representable with the dtype `dtype`. - cache_position (`paddle.Tensor`): - Indices depicting the position of the input sequence tokens in the sequence. - batch_size (`paddle.Tensor`): - Batch size. - """ - if attention_mask is not None and attention_mask.dim() == 4: - # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
- causal_mask = attention_mask - else: - causal_mask = paddle.full([sequence_length, target_length], fill_value=min_dtype, dtype=dtype) - if sequence_length != 1: - causal_mask = paddle.triu(x=causal_mask, diagonal=1) - bool_tensor = paddle.arange(target_length) > cache_position.reshape([-1, 1]) - float_tensor = float16_tensor = bool_tensor.astype(paddle.float16) - causal_mask *= float_tensor - causal_mask = causal_mask[None, None, :, :].expand(shape=[batch_size, 1, -1, -1]) - if attention_mask is not None: - causal_mask = causal_mask.clone() - mask_length = tuple(attention_mask.shape)[-1] - padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] - padding_mask = padding_mask == 0 - causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( - mask=padding_mask, value=min_dtype - ) - return causal_mask - - -class Qwen2MLP(paddle.nn.Layer): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = paddle.nn.Linear( - in_features=self.hidden_size, out_features=self.intermediate_size, bias_attr=False - ) - self.up_proj = paddle.nn.Linear( - in_features=self.hidden_size, out_features=self.intermediate_size, bias_attr=False - ) - self.down_proj = paddle.nn.Linear( - in_features=self.intermediate_size, out_features=self.hidden_size, bias_attr=False - ) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = tuple(hidden_states.shape) - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(shape=[batch, num_key_value_heads, n_rep, slen, head_dim]) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -def make_t2v_mask(media_offset_line, num_images): - assert len(tuple(media_offset_line.shape)) == 1 - # media_offset_line = media_offset_line.view(-1, 1) - # visual_arange = paddle.arange(end=num_images).view(1, -1) - media_offset_line = paddle.reshape(media_offset_line, [-1, 1]) - visual_arange = paddle.arange(end=num_images).reshape([1, -1]) - - mask = media_offset_line <= visual_arange - return mask - - -def select_query(media_offset, num_queries=None): - query_indices = media_offset[:, :, 1] >= 0 - assert query_indices.sum().item() % num_queries == 0, query_indices.sum().item() - query_indices = query_indices.nonzero() - ptr = 0 - while ptr < tuple(query_indices.shape)[0]: - first_query_index, last_query_index = query_indices[ptr], query_indices[ptr + num_queries - 1] - assert (last_query_index[1] - first_query_index[1] + 1).item() == num_queries - assert last_query_index[0].item() == first_query_index[0].item() - batch_id, begin_i, end_i = ( - first_query_index[0].item(), - first_query_index[1].item(), - first_query_index[1].item() + num_queries, - ) - yield batch_id, begin_i, end_i - ptr += num_queries - - -def _rotate_half(x): - """ - change sign so the last dimension becomes [-odd, +even] - """ - from einops import rearrange - - x = rearrange(x, "... (j d) -> ... 
j d", j=2) - x1, x2 = x.unbind(axis=-2) - return paddle.concat(x=(-x2, x1), axis=-1) - - -def apply_rotary_pos_emb_core(t, freqs, use_fp32=False, debug=False): - """ - input tensor t is of shape [seq_length, ..., dim] - rotary positional embeding tensor freqs is of shape [seq_length, ..., dim] - check https://kexue.fm/archives/8265 for detailed formulas - """ - if use_flash_rotary and use_fp32: - t_ = rearrange(t, "s b ... -> b s ...").contiguous() - if use_fp32: - t_ = t_.astype(dtype="float32") - freqs = freqs.squeeze(axis=1).squeeze(axis=1) - cos = freqs[:, : tuple(freqs.shape)[-1] // 2].cos() - sin = freqs[:, : tuple(freqs.shape)[-1] // 2].sin() - output = paddle_aux.apply_rotary_emb_func(x=t_, cos=cos, sin=sin).astype(dtype=t.dtype) - if debug: - from icecream import ic - - ic(tuple(t_.shape), tuple(freqs.shape), tuple(cos.shape)) - return rearrange(output, "b s ... -> s b ...") - rot_dim = tuple(freqs.shape)[-1] - t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:] - if use_fp32: - t_ = t_.astype(dtype="float32") - t_pass_ = t_pass_.astype(dtype="float32") - t_ = t_ * freqs.cos() + _rotate_half(t_) * freqs.sin() - return paddle.concat(x=(t_, t_pass_), axis=-1).astype(dtype=t.dtype) - - -class HyperQwen2Attention(paddle.nn.Layer): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: HyperQwen2Config, layer_idx: Optional[int] = None, is_hyper_enabed=False): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class." - ) - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - if self.head_dim * self.num_heads != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`: {self.num_heads})." 
- ) - self.q_proj = paddle.nn.Linear( - in_features=self.hidden_size, out_features=self.num_heads * self.head_dim, bias_attr=True - ) - self.k_proj = paddle.nn.Linear( - in_features=self.hidden_size, out_features=self.num_key_value_heads * self.head_dim, bias_attr=True - ) - self.v_proj = paddle.nn.Linear( - in_features=self.hidden_size, out_features=self.num_key_value_heads * self.head_dim, bias_attr=True - ) - self.o_proj = paddle.nn.Linear( - in_features=self.num_heads * self.head_dim, out_features=self.hidden_size, bias_attr=False - ) - self.rotary_emb = Qwen2RotaryEmbedding( - self.head_dim, max_position_embeddings=self.max_position_embeddings, base=self.rope_theta - ) - self.rotary_emb_core = RotaryEmbedding( - self.head_dim, base=self.rope_theta, use_fp32=True, use_outer_in_rope=True - ) - self.is_hyper_enabed = is_hyper_enabed - if self.is_hyper_enabed: - self.v_kv_proj = paddle.nn.Linear( - in_features=self.hidden_size, out_features=self.num_key_value_heads * self.head_dim * 2, bias_attr=True - ) - self.gate = paddle.base.framework.EagerParamBase.from_tensor(tensor=paddle.zeros(shape=self.hidden_size)) - self.v_core_attention_sdpa = ScaleDotProductAttention( - layer_number=-1, causal=False, attention_dropout=self.attention_dropout - ) - self.visual_cache = {} - - def apply_mi_rope(self, key_layer, media_offset_line, length_each_img): - key_layer = rearrange(key_layer, "b h s d -> s b h d") - if self.rotary_emb_core.inv_freq.place != key_layer.place: - self.rotary_emb_core.inv_freq = self.rotary_emb_core.inv_freq.to(key_layer.place) - rotary_pos_emb_max_seq_len = self.config.max_position_embeddings - ntk_alpha = 1 - rotary_pos_emb = self.rotary_emb_core(rotary_pos_emb_max_seq_len, ntk_alpha=ntk_alpha) - assert rotary_pos_emb is not None - if isinstance(rotary_pos_emb, tuple): - rotary_pos_emb = rotary_pos_emb - else: - rotary_pos_emb = (rotary_pos_emb,) * 2 - if rotary_pos_emb is not None: - q_pos_emb, k_pos_emb = rotary_pos_emb - image_pos = (media_offset_line[1:] - media_offset_line[:-1]).nonzero().squeeze(axis=1) + 1 - k_pos_emb = repeat(k_pos_emb[image_pos], "N_img b h d -> (N_img L) b h d", L=length_each_img) - key_layer = apply_rotary_pos_emb_core(key_layer, k_pos_emb, use_fp32=True) - key_layer = rearrange(key_layer, "s b h d -> b h s d") - return key_layer - - def crossattention(self, query_layer, vision_features, media_offset, context_layer): - """ - query_layer: [s b h d] - vision_features: [b' lv d] - context_layer: s b d - """ - if vision_features is None or self.is_hyper_enabed == False: - return context_layer - context_layer_clone = context_layer.clone() - vision_features = vision_features.contiguous() - vision_features = self.v_kv_proj(vision_features) - length_each_img = tuple(vision_features.shape)[1] - sequence_length = tuple(query_layer.shape)[0] - if sequence_length == 1: - completion_flag = True - media_offset = media_offset[:, -1:] - else: - completion_flag = False - self.visual_cache["media_offset"] = media_offset - self.visual_cache["vision_features"] = vision_features - query_layer = rearrange(query_layer, "L B H D -> B H L D") - assert sequence_length == tuple(media_offset.shape)[1], (sequence_length, tuple(media_offset.shape)) - gate_value = paddle.nn.functional.sigmoid(x=self.gate) - for batch_id, begin_i, end_i in select_query(media_offset, sequence_length): - assert begin_i == 0 - assert end_i == sequence_length, (end_i, sequence_length) - curr_offset = media_offset[batch_id, end_i - 1] - if not completion_flag: - re_to_zero_media_offset = 
(media_offset[batch_id, :, 1] - curr_offset[0]).to(query_layer.place) - query_shift = re_to_zero_media_offset.nonzero()[0].item() - curr_mask = make_t2v_mask( - re_to_zero_media_offset[query_shift:], num_images=curr_offset[1] - curr_offset[0] - ) - curr_mask = repeat(curr_mask, "s_q s_k -> B H s_q (s_k img_l)", B=1, H=1, img_l=length_each_img) - else: - curr_mask = None - query_shift = 0 - curr_query_tokens = query_layer[batch_id, :, query_shift:].unsqueeze(axis=0).clone().contiguous() - assert curr_offset[0] < tuple(vision_features.shape)[0] - assert curr_offset[1] <= tuple(vision_features.shape)[0] - curr_vision_kv: paddle.Tensor = rearrange( - vision_features[curr_offset[0] : curr_offset[1]].clone(), - "BL Lv (H KV D) -> KV 1 H (BL Lv) D", - KV=2, - H=self.num_key_value_heads, - ) - key_layer = curr_vision_kv[0].contiguous() - value_layer = curr_vision_kv[1].contiguous() - key_layer = self.apply_mi_rope( - key_layer, - media_offset_line=self.visual_cache["media_offset"][batch_id, :, 1] - curr_offset[0], - length_each_img=length_each_img, - ) - key_layer = repeat_kv(key_layer, self.num_key_value_groups) - value_layer = repeat_kv(value_layer, self.num_key_value_groups) - v_context_layer = self.v_core_attention_sdpa( - curr_query_tokens, key_layer, value_layer, attn_mask=curr_mask, order="bhsd" - ).squeeze(axis=1) - context_layer_clone[query_shift:, batch_id] = ( - context_layer[query_shift:, batch_id].clone() * (1 - gate_value) + v_context_layer * gate_value - ) - return context_layer_clone - - def forward( - self, - hidden_states: paddle.Tensor, - attention_mask: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.Tensor] = None, - image_embeds=None, - media_offset=None, - past_key_value: Optional[Tuple[paddle.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: - raise NotImplementedError( - 'We do not support eager model yet. Use attn_implementation == "flash_attention_2" or attn_implementation == "sdpa".' - ) - bsz, q_len, _ = tuple(hidden_states.shape) - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - # query_states = query_states.view(bsz, q_len, self.num_heads, self. - # head_dim).transpose(perm=paddle_aux.transpose_aux_func( - # query_states.view(bsz, q_len, self.num_heads, self.head_dim). - # ndim, 1, 2)) - query_states = paddle.reshape(query_states, [bsz, q_len, self.num_heads, self.head_dim]) - query_states = paddle.transpose(query_states, perm=[0, 2, 1, 3]) # 交换第1和第2维度 - - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, - # self.head_dim).transpose(perm=paddle_aux.transpose_aux_func( - # key_states.view(bsz, q_len, self.num_key_value_heads, self. - # head_dim).ndim, 1, 2)) - key_states = paddle.reshape(key_states, [bsz, q_len, self.num_key_value_heads, self.head_dim]) - key_states = paddle.transpose(key_states, perm=[0, 2, 1, 3]) # 交换第1和第2维度 - - # value_states = value_states.view(bsz, q_len, self. - # num_key_value_heads, self.head_dim).transpose(perm=paddle_aux. - # transpose_aux_func(value_states.view(bsz, q_len, self. 
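To make the gated cross-attention above concrete, here is a small sketch of `make_t2v_mask` (which blocks a text token from attending to images that appear after it) and of the sigmoid-gated fusion applied at the end of `crossattention`. The toy offsets and vector sizes are illustrative.

```python
import paddle
import paddle.nn.functional as F

def make_t2v_mask(media_offset_line, num_images):
    # True (later inverted by the SDPA wrapper) where a text token must NOT see an image:
    # a token may only attend to images that appear before it in the sequence.
    media_offset_line = paddle.reshape(media_offset_line, [-1, 1])
    visual_arange = paddle.arange(end=num_images).reshape([1, -1])
    return media_offset_line <= visual_arange

# 5 text tokens: the first two precede any image, the next two follow image 0,
# and the last token follows both images.
offsets = paddle.to_tensor([0, 0, 1, 1, 2])
print(make_t2v_mask(offsets, num_images=2).astype("int32").numpy())
# [[1 1] [1 1] [0 1] [0 1] [0 0]]  (1 = blocked)

# Sigmoid-gated fusion from the end of `crossattention`: the gate parameter starts at
# zero, so text and visual context are mixed 50/50 at initialisation.
gate_value = F.sigmoid(paddle.zeros([4]))
text_ctx = paddle.ones([4])
visual_ctx = 3 * paddle.ones([4])
print((text_ctx * (1 - gate_value) + visual_ctx * gate_value).numpy())  # [2. 2. 2. 2.]
```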
- # num_key_value_heads, self.head_dim).ndim, 1, 2)) - value_states = paddle.reshape(value_states, [bsz, q_len, self.num_key_value_heads, self.head_dim]) - value_states = paddle.transpose(value_states, perm=[0, 2, 1, 3]) # 交换第1和第2维度 - - kv_seq_len = tuple(key_states.shape)[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} for auto-regressive decoding with k/v caching, please make sure to initialize the attention class with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - attn_weights = paddle.matmul( - x=query_states, y=key_states.transpose(perm=paddle_aux.transpose_aux_func(key_states.ndim, 2, 3)) - ) / math.sqrt(self.head_dim) - if tuple(attn_weights.shape) != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {bsz, self.num_heads, q_len, kv_seq_len}, but is {tuple(attn_weights.shape)}" - ) - if attention_mask is not None: - if tuple(attention_mask.shape) != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {bsz, 1, q_len, kv_seq_len}, but is {tuple(attention_mask.shape)}" - ) - attn_weights = attn_weights + attention_mask - attn_weights = paddle.nn.functional.softmax(x=attn_weights, axis=-1, dtype="float32").to(query_states.dtype) - attn_weights = paddle.nn.functional.dropout(x=attn_weights, p=self.attention_dropout, training=self.training) - attn_output = paddle.matmul(x=attn_weights, y=value_states) - if tuple(attn_output.shape) != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {bsz, self.num_heads, q_len, self.head_dim}, but is {tuple(attn_output.shape)}" - ) - attn_output = attn_output.transpose(perm=paddle_aux.transpose_aux_func(attn_output.ndim, 1, 2)).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - attn_output = self.crossattention( - query_states.transpose(perm=[1, 0, 1, 3]), - image_embeds, - media_offset, - attn_output.transpose(perm=[1, 0, 2]), - ) - attn_output = attn_output.transpose(perm=[1, 0, 2]) - attn_output = self.o_proj(attn_output) - if not output_attentions: - attn_weights = None - return attn_output, attn_weights, past_key_value - - -class HyperQwen2FlashAttention2(HyperQwen2Attention): - """ - Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention` - as the weights of the module stays untouched. The only required change would be on the forward pass - where it needs to correctly call the public API of flash attention and deal with padding tokens - in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom - config.max_window_layers layers. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # >>>>>> self._flash_attn_uses_top_left_mask = (not transformers.utils. 
- # is_flash_attn_greater_or_equal_2_10()) - - def forward( - self, - hidden_states: paddle.Tensor, - attention_mask: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.Tensor] = None, - image_embeds=None, - media_offset=None, - past_key_value: Optional[Tuple[paddle.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - ): - bsz, q_len, _ = tuple(hidden_states.shape) - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - # query_states = query_states.view(bsz, q_len, self.num_heads, self. - # head_dim).transpose(perm=paddle_aux.transpose_aux_func( - # query_states.view(bsz, q_len, self.num_heads, self.head_dim). - # ndim, 1, 2)) - query_states = paddle.reshape(query_states, [bsz, q_len, self.num_heads, self.head_dim]) - query_states = paddle.transpose(query_states, perm=[0, 2, 1, 3]) # 交换第1和第2维度 - - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, - # self.head_dim).transpose(perm=paddle_aux.transpose_aux_func( - # key_states.view(bsz, q_len, self.num_key_value_heads, self. - # head_dim).ndim, 1, 2)) - key_states = paddle.reshape(key_states, [bsz, q_len, self.num_key_value_heads, self.head_dim]) - key_states = paddle.transpose(key_states, perm=[0, 2, 1, 3]) # 交换第1和第2维度 - - # value_states = value_states.view(bsz, q_len, self. - # num_key_value_heads, self.head_dim).transpose(perm=paddle_aux. - # transpose_aux_func(value_states.view(bsz, q_len, self. - # num_key_value_heads, self.head_dim).ndim, 1, 2)) - value_states = paddle.reshape(value_states, [bsz, q_len, self.num_key_value_heads, self.head_dim]) - value_states = paddle.transpose(value_states, perm=[0, 2, 1, 3]) # 交换第1和第2维度 - - kv_seq_len = tuple(key_states.shape)[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} for auto-regressive decoding with k/v caching, please make sure to initialize the attention class with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and self.config.use_sliding_window - ) - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation make sure to upgrade flash-attn library." 
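Both attention paths expand the key/value heads with `repeat_kv` before computing scores. A toy-shape sketch of that grouped-query expansion follows (head counts are illustrative; the model code itself relies on the `paddle_aux` reshape shim to accept positional dims, whereas this sketch passes a list):

```python
import paddle

def repeat_kv(hidden_states, n_rep):
    # (batch, num_kv_heads, seq_len, head_dim) -> (batch, num_kv_heads * n_rep, seq_len, head_dim)
    batch, num_kv_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        shape=[batch, num_kv_heads, n_rep, slen, head_dim]
    )
    return hidden_states.reshape([batch, num_kv_heads * n_rep, slen, head_dim])

# e.g. 28 query heads sharing 4 key/value heads (7 query heads per KV head).
num_heads, num_kv_heads, head_dim, seq_len = 28, 4, 128, 16
k = paddle.randn([1, num_kv_heads, seq_len, head_dim])
print(repeat_kv(k, num_heads // num_kv_heads).shape)  # [1, 28, 16, 128]
```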
- ) - if past_key_value is not None: - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - if tuple(past_key.shape)[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got {tuple(past_key.shape)}" - ) - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = paddle.concat( - x=[attention_mask, paddle.ones_like(x=attention_mask[:, -1:])], axis=-1 - ) - cache_kwargs = {"sin": sin, "cos": cos} - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - input_dtype = query_states.dtype - if input_dtype == "float32": - # >>>>>> if torch.is_autocast_enabled(): - # >>>>>> target_dtype = torch.get_autocast_gpu_dtype() - if paddle.amp.auto_cast_enabled(): - target_dtype = paddle.get_device("gpu").dtype - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in {target_dtype}." - ) - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - query_states = query_states.transpose(perm=paddle_aux.transpose_aux_func(query_states.ndim, 1, 2)) - key_states = key_states.transpose(perm=paddle_aux.transpose_aux_func(key_states.ndim, 1, 2)) - value_states = value_states.transpose(perm=paddle_aux.transpose_aux_func(value_states.ndim, 1, 2)) - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() - attn_output = self.crossattention( - query_states.transpose(perm=[1, 0, 2, 3]), - image_embeds, - media_offset, - attn_output.transpose(perm=[1, 0, 2]), - ) - attn_output = attn_output.transpose(perm=[1, 0, 2]) - attn_output = self.o_proj(attn_output) - if not output_attentions: - attn_weights = None - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. 
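`_flash_attention_forward` unpads the batch before calling the varlen flash-attention kernel. Here is a minimal sketch of how the flattened token indices and `cu_seqlens` are derived from a 2-D padding mask, in the style of the `bert_padding` helpers used by `_upad_input`; the helper name `get_unpad_data` mirrors `_get_unpad_data` but is re-implemented purely for illustration.

```python
import paddle

def get_unpad_data(attention_mask):
    # attention_mask: [batch, seq_len], 1 = real token, 0 = padding
    seqlens_in_batch = attention_mask.sum(axis=-1, dtype="int32")     # tokens per sequence
    indices = paddle.nonzero(attention_mask.flatten()).flatten()      # flat indices of real tokens
    max_seqlen_in_batch = int(seqlens_in_batch.max())
    # Cumulative sequence lengths with a leading 0, as expected by varlen flash attention.
    cu_seqlens = paddle.concat(
        [paddle.zeros([1], dtype="int32"), paddle.cumsum(seqlens_in_batch, axis=0)]
    )
    return indices, cu_seqlens, max_seqlen_in_batch

mask = paddle.to_tensor([[1, 1, 1, 0],
                         [1, 1, 0, 0]], dtype="int32")
indices, cu_seqlens, max_len = get_unpad_data(mask)
print(indices.numpy())     # [0 1 2 4 5] -> positions of real tokens in the flattened batch
print(cu_seqlens.numpy())  # [0 3 5]
print(max_len)             # 3
```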
- - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`float`): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - causal = self.is_causal and query_length != 1 - if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: - use_sliding_windows = False - if attention_mask is not None: - batch_size = tuple(query_states.shape)[0] - (query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens) = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - if not use_sliding_windows: - # >>>>>> attn_output_unpad = flash_attn.flash_attn_varlen_func( - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - # softmax_scale = softmax_scale, causal=causal) - scale=softmax_scale, - causal=causal, - )[0] - else: - # >>>>>> - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k= - # max_seqlen_in_batch_k, dropout_p=dropout, softmax_scale - max_seqlen_in_batch_k, - dropout_p=dropout, - scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - )[0] - # >>>>>> attn_output = flash_attn.bert_padding.pad_input(attn_output_unpad, - # indices_q, batch_size, query_length) - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - elif not use_sliding_windows: - # >>>>>> attn_output = flash_attn.flash_attn_func(query_states, - attn_output = flash_attn_func( - query_states, - key_states, - # value_states, dropout, softmax_scale=softmax_scale, - value_states, - dropout, - causal=causal, - )[0] - else: - # >>>>>> attn_output = flash_attn.flash_attn_func(query_states, - attn_output = flash_attn.flash_attn_func( - query_states, - # key_states, value_states, dropout, softmax_scale=softmax_scale, - key_states, - value_states, - dropout, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - )[0] - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = tuple(key_layer.shape) - if kv_seq_len != tuple(attention_mask.shape)[-1]: - attention_mask_num_tokens = tuple(attention_mask.shape)[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - # >>>>>> key_layer = 
flash_attn.bert_padding.index_first_axis(key_layer. - # reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - key_layer = index_first_axis(key_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k) - # >>>>>> value_layer = flash_attn.bert_padding.index_first_axis(value_layer. - # reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k) - if query_length == kv_seq_len: - # >>>>>> query_layer = flash_attn.bert_padding.index_first_axis(query_layer - # .reshape(batch_size * kv_seq_len, num_heads, head_dim), - # indices_k) - query_layer = index_first_axis( - query_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = paddle.arange(dtype="int32", end=batch_size + 1) - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(axis=1) - else: - attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = ( - # flash_attn.bert_padding.unpad_input(query_layer, - # attention_mask)) - unpad_input(query_states, attention_mask) - ) - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -class HyperQwen2SdpaAttention(HyperQwen2Attention): - """ - Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - def forward( - self, - hidden_states: paddle.Tensor, - attention_mask: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.Tensor] = None, - image_embeds=None, - media_offset=None, - past_key_value: Optional[Tuple[paddle.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: - # print('*&'*100) - # print('output_attentions:',output_attentions) - # print('attention_mask:',attention_mask)#(1,1,1,60) - if output_attentions: # false - logger.warning_once( - 'Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - print(hidden_states.shape) - bsz, q_len, _ = tuple(hidden_states.shape) - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - # query_states = query_states.view(bsz, q_len, self.num_heads, self. - # head_dim).transpose(perm=paddle_aux.transpose_aux_func( - # query_states.view(bsz, q_len, self.num_heads, self.head_dim). 
- # ndim, 1, 2)) - print("bsz:", bsz) - print("qlen:", q_len) - print("num_heads:", self.num_heads) - print("head_dim:", self.head_dim) - query_states = paddle.reshape(query_states, [bsz, q_len, self.num_heads, self.head_dim]) - query_states = paddle.transpose(query_states, perm=[0, 2, 1, 3]) # 交换 1 和 2 维度 - - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, - # self.head_dim).transpose(perm=paddle_aux.transpose_aux_func( - # key_states.view(bsz, q_len, self.num_key_value_heads, self. - # head_dim).ndim, 1, 2)) - key_states = paddle.reshape(key_states, [bsz, q_len, self.num_key_value_heads, self.head_dim]) - key_states = paddle.transpose(key_states, perm=[0, 2, 1, 3]) # 交换 1 和 2 维度 - - # value_states = value_states.view(bsz, q_len, self. - # num_key_value_heads, self.head_dim).transpose(perm=paddle_aux. - # transpose_aux_func(value_states.view(bsz, q_len, self. - # num_key_value_heads, self.head_dim).ndim, 1, 2)) - - value_states = paddle.reshape(value_states, [bsz, q_len, self.num_key_value_heads, self.head_dim]) - value_states = paddle.transpose(value_states, perm=[0, 2, 1, 3]) # 交换第1和第2维度 - - kv_seq_len = tuple(key_states.shape)[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - # print('2'*100) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: # (1,1,1,60) - if tuple(attention_mask.shape) != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {bsz, 1, q_len, kv_seq_len}, but is {tuple(attention_mask.shape)}" - ) - # if query_states.device.type == 'cuda' and attention_mask is not None: - # query_states = query_states.contiguous() - # key_states = key_states.contiguous() - # value_states = value_states.contiguous() - attn_output = paddle.nn.functional.scaled_dot_product_attention( - query=query_states, - key=key_states, - value=value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - attn_output = attn_output.transpose(perm=paddle_aux.transpose_aux_func(attn_output.ndim, 1, 2)).contiguous() - # attn_output = attn_output.view(bsz, q_len, self.hidden_size) - attn_output = paddle.reshape(attn_output, [bsz, q_len, self.hidden_size]) - attn_output = self.crossattention( - query_states.transpose(perm=[2, 0, 1, 3]), - image_embeds, - media_offset, - attn_output.transpose(perm=[1, 0, 2]), - ) - attn_output = attn_output.transpose(perm=[1, 0, 2]) - attn_output = self.o_proj(attn_output) - return attn_output, None, past_key_value - - -QWEN2_ATTENTION_CLASSES = { - "eager": HyperQwen2Attention, - "flash_attention_2": HyperQwen2FlashAttention2, - "sdpa": HyperQwen2SdpaAttention, -} - - -class HyperQwen2DecoderLayer(paddle.nn.Layer): - def __init__(self, config: HyperQwen2Config, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - if config.use_sliding_window and config._attn_implementation != "flash_attention_2": - logger.warning_once( - f"Sliding Window Attention is enabled but 
not implemented for `{config._attn_implementation}`; unexpected results may be encountered." - ) - self.is_hyper_enabled = layer_idx + 1 in config.hyper_layers - self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation]( - config, layer_idx, is_hyper_enabed=self.is_hyper_enabled - ) - self.mlp = Qwen2MLP(config) - self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: paddle.Tensor, - attention_mask: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.Tensor] = None, - image_embeds=None, - media_offset=None, - past_key_value: Optional[Tuple[paddle.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - if image_embeds is not None and self.is_hyper_enabled: - image_embeds = self.input_layernorm(image_embeds) - else: - image_embeds = media_offset = None - # print('*&'*100) - # print('attention_mask:',attention_mask) - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, # (1,1,1,60) - position_ids=position_ids, - image_embeds=image_embeds, - media_offset=media_offset, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - outputs = (hidden_states,) - if output_attentions: - outputs += (self_attn_weights,) - if use_cache: - outputs += (present_key_value,) - return outputs - - -QWEN2_START_DOCSTRING = """ - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`HyperQwen2Config`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
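The decoder layer above follows the usual pre-norm residual pattern: normalise, self-attend (optionally fusing image tokens through the hyper cross-attention), add the residual, then repeat with the MLP. A minimal standalone sketch of that flow, with a toy RMSNorm standing in for `Qwen2RMSNorm` and identity functions standing in for the attention and MLP sub-layers:

```python
import paddle
from paddle import nn

class TinyRMSNorm(nn.Layer):
    # Stand-in for Qwen2RMSNorm: rescale by the root-mean-square over the hidden dim.
    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = self.create_parameter([hidden_size], default_initializer=nn.initializer.Constant(1.0))
        self.eps = eps

    def forward(self, x):
        variance = x.astype("float32").pow(2).mean(axis=-1, keepdim=True)
        return self.weight * (x.astype("float32") * paddle.rsqrt(variance + self.eps)).astype(x.dtype)

def decoder_layer_flow(hidden_states, attn_fn, mlp_fn, ln1, ln2):
    # Pre-norm residual blocks, mirroring HyperQwen2DecoderLayer.forward.
    residual = hidden_states
    hidden_states = attn_fn(ln1(hidden_states))   # self-attention (+ optional image cross-attention)
    hidden_states = residual + hidden_states
    residual = hidden_states
    hidden_states = mlp_fn(ln2(hidden_states))    # SwiGLU MLP: down(act(gate(x)) * up(x))
    return residual + hidden_states

hidden = 32
x = paddle.randn([1, 4, hidden])
identity = lambda t: t  # placeholders for the attention and MLP sub-layers
out = decoder_layer_flow(x, identity, identity, TinyRMSNorm(hidden), TinyRMSNorm(hidden))
print(out.shape)  # [1, 4, 32]
```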
-""" - - -# >>>>>>@transformers.utils.add_start_docstrings( -# 'The bare Qwen2 Model outputting raw hidden-states without any specific head on top.' -# , QWEN2_START_DOCSTRING) -class Qwen2PreTrainedModel(paddlenlp.transformers.model_utils.PretrainedModel): - config_class = HyperQwen2Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["HyperQwen2DecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - # def _init_weights(self, module): - # std = self.config.initializer_range - # if isinstance(module, paddle.nn.Linear): - # module.weight.data.normal_(mean=0.0, std=std) - # if module.bias is not None: - # module.bias.data.zero_() - # elif isinstance(module, paddle.nn.Embedding): - # module.weight.data.normal_(mean=0.0, std=std) - # if module.padding_idx is not None: - # module.weight.data[module.padding_idx].zero_() - - def _init_weights(self, layer): - std = self.config.initializer_range - if isinstance(layer, (paddle.nn.Linear, paddle.nn.Conv3D)): - paddle.nn.initializer.Normal(mean=0.0, std=std)(layer.weight) - if layer.bias is not None: - paddle.nn.initializer.Constant(0.0)(layer.bias) - elif isinstance(layer, paddle.nn.Embedding): - paddle.nn.initializer.Normal(mean=0.0, std=std)(layer.weight) - if layer._padding_idx is not None: - with paddle.no_grad(): - layer.weight[layer._padding_idx] = 0.0 - - -QWEN2_INPUTS_DOCSTRING = """ - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. 
- - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -# >>>>>>@transformers.utils.add_start_docstrings( -# 'The bare Qwen2 Model outputting raw hidden-states without any specific head on top.' -# , QWEN2_START_DOCSTRING) -class HyperQwen2Model(Qwen2PreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. 
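A small sketch of the legacy cache layout described above, and of how a decoding step extends it along the sequence axis; the helper name and toy shapes are illustrative only.

```python
import paddle

# Legacy cache layout: one (key, value) pair per layer, each of shape
# [batch_size, num_heads, kv_seq_len_so_far, head_dim].
num_layers, batch, num_heads, head_dim = 2, 1, 4, 8
cached_k = paddle.randn([batch, num_heads, 3, head_dim])  # 3 tokens already processed
cached_v = paddle.randn([batch, num_heads, 3, head_dim])
past_key_values = tuple((cached_k, cached_v) for _ in range(num_layers))

def append_to_cache(past, layer_idx, new_key, new_value):
    # Concatenate the freshly projected key/value states along the sequence axis.
    old_key, old_value = past[layer_idx]
    return paddle.concat([old_key, new_key], axis=2), paddle.concat([old_value, new_value], axis=2)

new_k = paddle.randn([batch, num_heads, 1, head_dim])  # only the newest token is projected
new_v = paddle.randn([batch, num_heads, 1, head_dim])
k, v = append_to_cache(past_key_values, 0, new_k, new_v)
print(k.shape)  # [1, 4, 4, 8] -> kv_seq_len grows by one per decoding step
```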
Each layer is a [`Qwen2DecoderLayer`] - - Args: - config: HyperQwen2Config - """ - - def __init__(self, config: HyperQwen2Config): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - self.embed_tokens = paddle.nn.Embedding( - num_embeddings=config.vocab_size, embedding_dim=config.hidden_size, padding_idx=self.padding_idx - ) - self.layers = paddle.nn.LayerList( - sublayers=[HyperQwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self._attn_implementation = config._attn_implementation - self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.gradient_checkpointing = False - # self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - # >>>>>> @transformers.utils.add_start_docstrings_to_model_forward( - # QWEN2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: paddle.Tensor = None, - attention_mask: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.Tensor] = None, - past_key_values: Optional[List[paddle.Tensor]] = None, - inputs_embeds: Optional[paddle.Tensor] = None, - image_embeds=None, - media_offset=None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, paddlenlp.transformers.model_outputs.BaseModelOutputWithPast]: - print("^()" * 100) - print("attention_mask", attention_mask.shape) - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - # print("%"*100) - # print(input_ids.shape) - batch_size, seq_length = tuple(input_ids.shape) # (1,60) - elif inputs_embeds is not None: - # print("tuple(inputs_embeds.shape):",inputs_embeds.shape) - batch_size, seq_length, _ = tuple(inputs_embeds.shape) - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - if past_key_values is None: - cache_position = paddle.arange(input_ids.shape[1]) - else: - cache_position = paddle.to_tensor([seq_length - 1]) - - if past_key_values is not None: - input_ids = input_ids[:, -1].unsqueeze(-1) - - past_key_values_length = 0 - print("past_key_values:", past_key_values) - # if use_cache: - # use_legacy_cache = not isinstance(past_key_values, transformers - # .cache_utils.Cache) - # use_legacy_cache = not isinstance(past_key_values, list) and all(isinstance(item, paddle.Tensor) for item in past_key_values) - # if use_legacy_cache: - # >>>>>> past_key_values = (transformers.cache_utils.DynamicCache. 
- # from_legacy_cache(past_key_values)) - # past_key_values_length = past_key_values.get_usable_length( - # seq_length) - if position_ids is None: - device = input_ids.place if input_ids is not None else inputs_embeds.place - position_ids = paddle.arange( - start=past_key_values_length, end=seq_length + past_key_values_length, dtype="int64" - ) - # position_ids = position_ids.unsqueeze(axis=0).view(-1, seq_length) - position_ids = paddle.unsqueeze(position_ids, axis=0) - position_ids = paddle.reshape(position_ids, [-1, seq_length]) - - else: - device = input_ids.place - # position_ids = position_ids.view(-1, seq_length).astype(dtype='int64') - # position_ids = position_ids.reshape(-1, seq_length).astype(dtype='int64') - position_ids = paddle.reshape(position_ids, [-1, seq_length]).astype(dtype="int64") - if inputs_embeds is None: - print("^" * 100) - inputs_embeds = self.embed_tokens(input_ids) - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right' this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - # print("^()"*100) - # print('attention_mask',attention_mask) - print(self._attn_implementation) - if self._attn_implementation == "flash_attention_2": - attention_mask = attention_mask if attention_mask is not None and 0 in attention_mask else None - # elif self._attn_implementation == 'sdpa' and not output_attentions: - # >>>>>> attention_mask = (transformers.modeling_attn_mask_utils. - # _prepare_4d_causal_attention_mask_for_sdpa(attention_mask, - # (batch_size, seq_length), inputs_embeds, - # past_key_values_length, sliding_window=self.config. - # sliding_window)) - - else: - # >>>>>> attention_mask = (transformers.modeling_attn_mask_utils. - # _prepare_4d_causal_attention_mask(attention_mask, ( - # batch_size, seq_length), inputs_embeds, - # past_key_values_length, sliding_window=self.config. 
- # sliding_window)) - print("5" * 200) - attention_mask = None - min_dtype = paddle.finfo(paddle.float16).min - # print("past_key_values_length:",past_key_values_length) - - attention_mask = _prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=seq_length, - target_length=seq_length, - dtype=inputs_embeds.dtype, - # device=device, - min_dtype=min_dtype, - cache_position=cache_position, - batch_size=batch_size, - ) - # attention_mask = _prepare_4d_causal_attention_mask_with_cache_position( - # attention_mask, - # sequence_length=seq_length, - # target_length=past_key_values.get_max_length(), - # dtype=dtype, - # # device=device, - # min_dtype=min_dtype, - # cache_position=cache_position, - # batch_size=batch_size, - # ) - print("attention_mask", attention_mask) - # print("^**"*100) - # print('attention_mask',attention_mask) - hidden_states = inputs_embeds - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - image_embeds, - media_offset, - past_key_values, - output_attentions, - use_cache, - ) - else: - print("hidden_states:", hidden_states) - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - image_embeds=image_embeds, - media_offset=media_offset, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - if output_attentions: - all_self_attns += (layer_outputs[1],) - hidden_states = self.norm(hidden_states) - if output_hidden_states: - all_hidden_states += (hidden_states,) - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return paddlenlp.transformers.model_outputs.BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class HyperQwen2ForCausalLM(Qwen2PreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = HyperQwen2Model(config) - self.vocab_size = config.vocab_size - self.lm_head = paddle.nn.Linear( - in_features=config.hidden_size, out_features=config.vocab_size, bias_attr=False - ) - # self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - # >>>>>> @transformers.utils.add_start_docstrings_to_model_forward( - # QWEN2_INPUTS_DOCSTRING) - # >>>>>> @transformers.utils.replace_return_docstrings(output_type= - # CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: paddle.Tensor = None, 
- attention_mask: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.Tensor] = None, - past_key_values: Optional[List[paddle.Tensor]] = None, - inputs_embeds: Optional[paddle.Tensor] = None, - image_embeds=None, - media_offset=None, - labels: Optional[paddle.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, paddlenlp.transformers.model_outputs.CausalLMOutputWithPast]: - """ - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, Qwen2ForCausalLM - - >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\\nI'm not conscious, but I can talk to you." - ```""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - # print(self.model) HyperQwen2Model - # print('::'*100) - # print('attention_mask',attention_mask) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, # (1,1,1,60) - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - image_embeds=image_embeds, - media_offset=media_offset, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.astype(dtype="float32") - loss = None - if labels is not None: - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - loss_fct = paddle.nn.CrossEntropyLoss() - # shift_logits = shift_logits.view(-1, self.config.vocab_size) - # shift_labels = shift_labels.view(-1) - shift_logits = paddle.reshape(shift_logits, [-1, self.config.vocab_size]) - shift_labels = paddle.reshape(shift_labels, [-1]) - - shift_labels = shift_labels.to(shift_logits.place) - loss = loss_fct(shift_logits, shift_labels) - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - return paddlenlp.transformers.model_outputs.CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - if past_key_values is not None: - # >>>>>> if 
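The loss computation in `forward` above shifts logits and labels by one position so that step t is scored against token t+1. A tiny worked example with random logits (vocabulary size and labels are arbitrary):

```python
import paddle

vocab_size, seq_len = 11, 5
logits = paddle.randn([1, seq_len, vocab_size], dtype="float32")
labels = paddle.to_tensor([[3, 7, 2, 9, 1]])

shift_logits = logits[..., :-1, :]   # drop the last step (it has no target)
shift_labels = labels[..., 1:]       # drop the first token (nothing predicts it)
loss_fct = paddle.nn.CrossEntropyLoss()
loss = loss_fct(
    paddle.reshape(shift_logits, [-1, vocab_size]),
    paddle.reshape(shift_labels, [-1]),
)
print(float(loss))
```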
isinstance(past_key_values, transformers.cache_utils.Cache): - # cache_length = past_key_values.get_seq_length() - # past_length = past_key_values.seen_tokens - # max_cache_length = past_key_values.get_max_length() - if past_key_values is not None and isinstance(past_key_values, list): - # 确保所有元素都是 paddle.Tensor,并且获取每个 Tensor 的序列长度 - if all(isinstance(tensor, paddle.Tensor) for tensor in past_key_values): - # 计算 cache_length 和 max_cache_length - cache_length = len(past_key_values) # 仍然是 Tensor 的数量 - past_length = sum(tensor.numel() for tensor in past_key_values) # 计算所有 Tensor 的元素总数 - max_cache_length = max(tensor.shape[-2] for tensor in past_key_values) # 获取最大序列长度(假设是 shape[-2]) - else: - raise ValueError("past_key_values should be a list of paddle.Tensors") - - else: - cache_length = past_length = tuple(past_key_values[0][0].shape)[2] - max_cache_length = None - if attention_mask is not None and tuple(attention_mask.shape)[1] > tuple(input_ids.shape)[1]: - input_ids = input_ids[:, -(tuple(attention_mask.shape)[1] - past_length) :] - elif past_length < tuple(input_ids.shape)[1]: - input_ids = input_ids[:, past_length:] - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + tuple(input_ids.shape)[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - position_ids = attention_mask.astype(dtype="int64").cumsum(axis=-1) - 1 - position_ids.masked_fill_(mask=attention_mask == 0, value=1) - if past_key_values: - position_ids = position_ids[:, -tuple(input_ids.shape)[1] :] - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - "image_embeds": kwargs.get("image_embeds"), - "media_offset": kwargs.get("media_offset"), - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple( - past_state.index_select(axis=0, index=beam_idx.to(past_state.place)) for past_state in layer_past - ), - ) - return reordered_past diff --git a/paddlemix/mPLUGOwl3/utils/paddle_aux.py b/paddlemix/mPLUGOwl3/utils/paddle_aux.py deleted file mode 100644 index ab3de5d4f..000000000 --- a/paddlemix/mPLUGOwl3/utils/paddle_aux.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This file is generated by PaConvert ToolKit, please Don't edit it! 
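The `position_ids` fallback in `prepare_inputs_for_generation` above derives positions from the padding mask with a cumulative sum. A standalone sketch of that trick (the helper name is illustrative; `paddle.where` is used here in place of the in-place `masked_fill_`):

```python
import paddle

def positions_from_mask(attention_mask, num_new_tokens):
    # Cumulative sum over the mask gives 0-based positions for real tokens; padded slots
    # are set to 1 (their value is irrelevant because they are masked out anyway).
    position_ids = attention_mask.astype("int64").cumsum(axis=-1) - 1
    position_ids = paddle.where(attention_mask == 0, paddle.ones_like(position_ids), position_ids)
    # With a populated cache, only the trailing new tokens are fed to the model.
    return position_ids[:, -num_new_tokens:]

mask = paddle.to_tensor([[0, 0, 1, 1, 1]])   # left-padded sequence with 3 real tokens
print(positions_from_mask(mask, 1).numpy())  # [[2]] -> the newest token sits at position 2
```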
-import paddle - - -def reshape(self, *args, **kwargs): - if args: - if len(args) == 1 and isinstance(args[0], (tuple, list)): - return paddle.reshape(self, args[0]) - else: - return paddle.reshape(self, list(args)) - elif kwargs: - assert "shape" in kwargs - return paddle.reshape(self, shape=kwargs["shape"]) - - -setattr(paddle.Tensor, "reshape", reshape) diff --git a/paddlemix/mPLUGOwl3/x_sdpa.py b/paddlemix/mPLUGOwl3/x_sdpa.py deleted file mode 100644 index 4984d0e15..000000000 --- a/paddlemix/mPLUGOwl3/x_sdpa.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -from einops import rearrange -from icecream import ic - - -class ScaleDotProductAttention(paddle.nn.Layer): - def __init__(self, layer_number, causal=False, softmax_scale=None, attention_dropout=0.0): - super().__init__() - self.layer_number = layer_number - self.causal = causal - self.softmax_scale = softmax_scale - self.dropout_p = attention_dropout - - def forward(self, q, k, v, attn_mask=None, order="sbhd"): - """Implements the multihead softmax attention. - Arguments - --------- - q, k, v: The tensor containing the query, key, and value. (B, S, H, D) - """ - if order == "sbhd": - q, k, v = [rearrange(x, "s b h d -> b h s d").contiguous() for x in (q, k, v)] - elif order == "bhsd": - pass - if attn_mask is not None: - attn_mask = (~attn_mask.clone().astype(dtype="bool")).contiguous() - else: - attn_mask = None - if self.training: - if self.causal: - assert tuple(q.shape)[-2] == tuple(k.shape)[-2] - is_causal = self.causal - dropout_p = self.dropout_p - else: - if self.causal: - is_causal = tuple(q.shape)[-2] == tuple(k.shape)[-2] - else: - is_causal = self.causal - dropout_p = 0.0 - assert self.softmax_scale == None or self.softmax_scale == paddle.utils.try_import("math").sqrt( - q.shape[-1] - ), "Fault: The scale parameter defaults to the square root of the last dimension of query, not allowed manually set" - o = paddle.nn.functional.scaled_dot_product_attention( - query=q, key=k, value=v, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal - ) - o = rearrange(o, "B Head L D -> L B (Head D)").contiguous() - return o diff --git a/paddlemix/mPLUGOwl3/__init__.py b/paddlemix/models/mPLUGOwl3/__init__.py similarity index 94% rename from paddlemix/mPLUGOwl3/__init__.py rename to paddlemix/models/mPLUGOwl3/__init__.py index 9c2aaca31..4122a3ef1 100644 --- a/paddlemix/mPLUGOwl3/__init__.py +++ b/paddlemix/models/mPLUGOwl3/__init__.py @@ -11,7 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
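For reference, the PaConvert-generated shim being removed here is what lets torch-style `tensor.reshape(d0, d1, ...)` calls elsewhere in the port (for example in `repeat_kv`) resolve to `paddle.reshape`. A quick usage check of the same idea, reproduced for illustration:

```python
import paddle

def reshape(self, *args, **kwargs):
    # Accept torch-style positional dims as well as Paddle's list/`shape=` forms.
    if args:
        if len(args) == 1 and isinstance(args[0], (tuple, list)):
            return paddle.reshape(self, args[0])
        return paddle.reshape(self, list(args))
    assert "shape" in kwargs
    return paddle.reshape(self, shape=kwargs["shape"])

setattr(paddle.Tensor, "reshape", reshape)  # monkey-patch, exactly as the generated shim does

x = paddle.arange(6)
print(x.reshape(2, 3).shape)    # torch-style positional dims -> [2, 3]
print(x.reshape([3, 2]).shape)  # native Paddle list form still works -> [3, 2]
```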
-from .bert_padding import * + +#from .bert_padding import * from .configuration_hyper_qwen2 import * from .configuration_mplugowl3 import * from .image_processing_mplugowl3 import * @@ -19,4 +20,4 @@ from .modeling_mplugowl3 import * from .modeling_navit_siglip import * from .processing_mplugowl3 import * -from .x_sdpa import * +#from .x_sdpa import * diff --git a/paddlemix/mPLUGOwl3/activations.py b/paddlemix/models/mPLUGOwl3/activations.py similarity index 100% rename from paddlemix/mPLUGOwl3/activations.py rename to paddlemix/models/mPLUGOwl3/activations.py diff --git a/paddlemix/mPLUGOwl3/bert_padding.py b/paddlemix/models/mPLUGOwl3/bert_padding.py similarity index 100% rename from paddlemix/mPLUGOwl3/bert_padding.py rename to paddlemix/models/mPLUGOwl3/bert_padding.py diff --git a/paddlemix/mPLUGOwl3/configuration_hyper_qwen2.py b/paddlemix/models/mPLUGOwl3/configuration_hyper_qwen2.py similarity index 91% rename from paddlemix/mPLUGOwl3/configuration_hyper_qwen2.py rename to paddlemix/models/mPLUGOwl3/configuration_hyper_qwen2.py index caec5d27b..d0057b667 100644 --- a/paddlemix/mPLUGOwl3/configuration_hyper_qwen2.py +++ b/paddlemix/models/mPLUGOwl3/configuration_hyper_qwen2.py @@ -12,11 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddlenlp from paddlenlp.transformers import PretrainedConfig -# >>>>>>class HyperQwen2Config(transformers.configuration_utils.PretrainedConfig): class HyperQwen2Config(PretrainedConfig): """ This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a @@ -98,7 +96,7 @@ def __init__( hidden_act="silu", max_position_embeddings=32768, initializer_range=0.02, - rms_norm_eps=1e-06, + rms_norm_eps=1e-6, use_cache=True, tie_word_embeddings=False, rope_theta=10000.0, @@ -106,9 +104,10 @@ def __init__( sliding_window=4096, max_window_layers=28, attention_dropout=0.0, - hyper_layers=[1, 9, 17, 25], - _attn_implementation="sdpa", - **kwargs + hyper_layers=[1,9,17,25], + vision_batch_size=16, + rope_scaling=None, + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings @@ -119,8 +118,13 @@ def __init__( self.use_sliding_window = use_sliding_window self.sliding_window = sliding_window if use_sliding_window else None self.max_window_layers = max_window_layers + self.rope_scaling = rope_scaling + if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + # for backward compatibility if num_key_value_heads is None: num_key_value_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act self.initializer_range = initializer_range @@ -129,5 +133,9 @@ def __init__( self.rope_theta = rope_theta self.attention_dropout = attention_dropout self.hyper_layers = hyper_layers - self._attn_implementation = _attn_implementation - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + self.vision_batch_size = vision_batch_size + self.seq_length = 1 #self.max_length ### + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/paddlemix/mPLUGOwl3/configuration_mplugowl3.py b/paddlemix/models/mPLUGOwl3/configuration_mplugowl3.py similarity index 65% rename from paddlemix/mPLUGOwl3/configuration_mplugowl3.py rename to paddlemix/models/mPLUGOwl3/configuration_mplugowl3.py index 7858154fc..36adf6f1c 100644 --- a/paddlemix/mPLUGOwl3/configuration_mplugowl3.py +++ 
b/paddlemix/models/mPLUGOwl3/configuration_mplugowl3.py @@ -13,49 +13,44 @@ # limitations under the License. import os - import paddlenlp - -""" mPLUGOwl3 model configuration""" -# from paddlenlp.transformers import PretrainedConfig, Qwen2Config -from typing import Union - -# logger = paddle.utils.try_import('logging').getLogger(name=__name__) -from paddlemix.utils.log import logger - from .configuration_hyper_qwen2 import HyperQwen2Config from .modeling_navit_siglip import SigLipVisionConfig +from paddlemix.utils.log import logger class mPLUGOwl3Config(HyperQwen2Config): model_type = "mplugowl3" keys_to_ignore_at_inference = ["past_key_values"] + default_vision_config = { "hidden_size": 1152, - "image_size": 384, + "image_size": 378, "intermediate_size": 4304, "model_type": "siglip_vision_model", "num_attention_heads": 16, "num_hidden_layers": 27, - "patch_size": 14, + "patch_size": 14 } - def __init__(self, use_cache=True, vision_config=None, **kwargs): + + def __init__( + self, + use_cache=True, + vision_config=None, + **kwargs, + ): self.use_cache = use_cache + + # same as HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit add tgt_sizes if vision_config is None: - # >>>>>> self.vision_config = (transformers.models.siglip. - # configuration_siglip.SiglipVisionConfig(**self. - # default_vision_config)) self.vision_config = SigLipVisionConfig(**self.default_vision_config) logger.info("vision_config is None, using default vision config") elif isinstance(vision_config, dict): - # >>>>>> self.vision_config = (transformers.models.siglip. - # configuration_siglip.SiglipVisionConfig(**vision_config)) self.vision_config = SigLipVisionConfig(**vision_config) - # >>>>>> elif isinstance(vision_config, transformers.models.siglip. - # configuration_siglip.SiglipVisionConfig): elif isinstance(vision_config, SigLipVisionConfig): self.vision_config = vision_config - self.image_size = self.vision_config.image_size + self.image_size = 378 self.patch_size = self.vision_config.patch_size + super().__init__(**kwargs) diff --git a/paddlemix/models/mPLUGOwl3/image_processing_mplugowl3.py b/paddlemix/models/mPLUGOwl3/image_processing_mplugowl3.py new file mode 100644 index 000000000..418e8d6f9 --- /dev/null +++ b/paddlemix/models/mPLUGOwl3/image_processing_mplugowl3.py @@ -0,0 +1,489 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
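A small usage sketch of the `rope_scaling` backward-compatibility shim added to `HyperQwen2Config` in the hunk above; the import path assumes the new `paddlemix/models/mPLUGOwl3` layout introduced by this patch, and the concrete scaling values are illustrative only:

from paddlemix.models.mPLUGOwl3.configuration_hyper_qwen2 import HyperQwen2Config

# Older checkpoints describe the RoPE scaling strategy under the key "type";
# the config mirrors it to "rope_type" so downstream code can read either key.
cfg = HyperQwen2Config(rope_scaling={"type": "linear", "factor": 2.0})
assert cfg.rope_scaling["rope_type"] == "linear"
assert cfg.rope_scaling["type"] == "linear"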
+ +# import sys + +# sys.path.append("/home/aistudio/paddle_test/mPLUGOwl3/utils") +import math +import random +from enum import Enum +from typing import Any, Dict, List, Optional, Union + +import numpy as np +import paddle.vision.transforms as transforms + +# import paddle_aux +import paddle +import paddle.nn.functional as F +import paddlenlp +import PIL +import PIL.Image +import PIL.ImageSequence +from einops import rearrange, repeat +from paddlenlp.transformers.image_processing_utils import ( + BaseImageProcessor, + BatchFeature, +) +from PIL import Image + + +def recursive_converter(converter, value): + if isinstance(value, list): + new_value = [] + for v in value: + new_value += [recursive_converter(converter, v)] + return new_value + else: + return converter(value) + + +def box_area(boxes): + # 获取边界框的宽度和高度 + width = boxes[:, 2] - boxes[:, 0] + height = boxes[:, 3] - boxes[:, 1] + # 计算面积 + area = width * height + return area + + +def custom_max(a, b): + return paddle.where(a > b, a, b) + + +def custom_min(a, b): + return paddle.where(a < b, a, b) + +def box_iou(boxes1, area1, boxes2, eps=1e-05): + # >>>>>> area2 = torchvision.ops.boxes.box_area(boxes2) + area1 = area1.astype("float32") + boxes1 = boxes1.astype("float32") + boxes2 = boxes2.astype("float32") + + area2 = box_area(boxes2).astype("float32") + lt = custom_max(boxes1[:, None, :2], boxes2[:, :2]) + rb = custom_min(boxes1[:, None, 2:], boxes2[:, 2:]) + wh = (rb - lt).clip(min=0) + inter = wh[:, :, 0] * wh[:, :, 1] + union = area1[:, None] + area2 - inter + iou = inter / (union + eps) + return iou, union + + +# def box_iou(boxes1, area1, boxes2, eps=1e-5): +# area2 = box_area(boxes2) + +# lt = paddle.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] +# rb = paddle.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + +# wh = (rb - lt).clip(min=0) # [N,M,2] +# inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] + +# union = area1[:, None] + area2 - inter + +# iou = inter / (union + eps) +# return iou, union + + +available_anchor_strategy = ['docowl', 'random', 'highest', 'last', 'llava'] + +grid_dict = { + 'grid_33':[ + (1,1), + (1,2),(2,1), + (1,3),(3,1), + (2,2),(1,4),(4,1), + (1,5),(5,1), + (1,6),(6,1),(2,3),(3,2), + (1,7),(7,1), + (4,2),(2,4),(1,8),(8,1), + (3,3),(1,9),(9,1)], + 'grid_squ_3x3':[ + (1,1),(2,2),(3,3) + ], + 'grid_squ_4':[ + (2,2),(1,3),(1,4),(3,1),(4,1) + ], + 'grid_squ_6':[ + (2,2),(1,3),(1,4),(3,1),(4,1), (2,3),(3,2) + ], + 'grid_squ_2':[ + (2,1) + ], + 'grid_squ_9':[ + (1,1), + (1,2),(2,1), + (1,3),(3,1), + (2,2),(1,4),(4,1), + (1,5),(5,1), + (1,6),(6,1),(2,3),(3,2), + (1,7),(7,1), + (4,2),(2,4),(1,8),(8,1), + (3,3),(1,9),(9,1)], +} + + +cut_prompt_template_dict = { + 'v0': lambda img_token, h, w: f''.join([f"{img_token}" for i in range(h) for j in range(w)]), + 'v1': lambda img_token, h, w: f'Cut to {h} rows {w} columns, '+ ' '.join([f"subimg({i},{j}){img_token}"for i in range(h) for j in range(w)]), + 'v1_global': lambda img_token, h, w: f'Cut to {h} rows {w} columns with a global view, '+ ' '.join([f"subimg({i},{j}){img_token}"for i in range(h) for j in range(w)]+[f"global_view{img_token}"]), + 'v2_global': lambda img_token, h, w: f'Cut to {h} rows {w} columns with a global view\n'+ '\n'.join([' '.join([f"subimg({i},{j}){img_token}" for j in range(w)]) for i in range(h)])+f"\nglobal_view{img_token}", + 'v3': lambda img_token, h, w: f'<|start_cut|>{h}*{w}'+ ' '.join([f"{img_token}"for i in range(h) for j in range(w)])+'<|end_cut|>', + 'v3_global': lambda img_token, h, w: f'<|start_cut|>{h}*{w}\n'+ '\n'.join([' 
'.join([f"{img_token}" for j in range(w)]) for i in range(h)])+f'\n{img_token}<|end_cut|>', + +} + +def anchor_rank(anchors, anchors_areas, input_image_size, eps=1e-5): + # anchors x1 y1 x2 y2 + + # image_size: (h, w) + # xyxy + input_image_bbox = paddle.to_tensor([0, 0, input_image_size[1], input_image_size[0]]).unsqueeze(0) + + boxes1 = anchors + boxes2 = input_image_bbox + boxes3 = anchors.clone() + # y2 + boxes3[:,3] = input_image_size[0]/input_image_size[1]*anchors[:,2] # 用于算分辨率无关的iou + + area1 = anchors_areas + + iou, _ = box_iou(boxes1, area1, boxes2) + iou = iou.squeeze(1) + shape_iou, _ = box_iou(boxes1, area1, boxes3) + shape_iou = shape_iou.diag() + # 优先匹配形状接近 再匹配分辨率接近 + index = paddle.argmax(shape_iou*100+iou,axis=0) + return index + + +def select_best_resolution(anchors, anchors_areas, input_image_size): # TODO For a futher check + """ + Selects the best resolution from a list of possible resolutions based on the original size. + + Args: + original_size (tuple): The original size of the image in the format (width, height). + possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. + + Returns: + tuple: The best fit resolution in the format (width, height). + """ + original_size = (input_image_size[1], input_image_size[0]) + possible_resolutions = [(_[2], _[3]) for _ in anchors] # xyxy -> w,h + + original_width, original_height = original_size + best_fit = None + max_effective_resolution = 0 + min_wasted_resolution = float('inf') + + index = 0 + for i, (width, height) in enumerate(possible_resolutions): + scale = min(width / original_width, height / original_height) + downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale) + effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height) + wasted_resolution = (width * height) - effective_resolution + + if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution): + max_effective_resolution = effective_resolution + min_wasted_resolution = wasted_resolution + best_fit = (width, height) + index = i + + return index + +def build_cut_shape_indices(cut_shape): + # cut_shape: a list of (nh,nw) + cut_shape_indices = [] + for shape in cut_shape: + n=shape[0]*shape[1] + indices = paddle.concat([ + repeat(paddle.to_tensor(shape),'l -> n l',n=n), + paddle.arange(n).unsqueeze(1) + ], axis=1) + assert indices.shape[0] == n + assert indices.shape[1] == 3 # nh,nw,idx + + cut_shape_indices.append(indices) + cut_shape_indices = paddle.concat(cut_shape_indices,axis=0).astype('int64') + return cut_shape_indices + + +class AnchorResize(paddle.nn.Layer): + + def __init__(self, image_size, anchors, interpolation="bilinear", antialias=None, anchor_strategy='docowl'): + super().__init__() + self.image_size = image_size + # xyxy + self.anchors = paddle.to_tensor( + [[0, 0, _[1]*image_size[1], _[0]*image_size[0]] for _ in anchors], + ) + + self.anchor_areas = box_area(self.anchors) + + self.interpolation = interpolation + self.antialias = antialias + self.anchor_strategy = anchor_strategy + assert self.anchor_strategy in available_anchor_strategy + + def resize_global(self, img): + #return F.resize(img, self.image_size, self.interpolation, max_size=None, antialias=self.antialias) + image_np = np.array(img) + image_tensor = paddle.to_tensor(image_np, dtype="float32") + image_tensor = image_tensor.transpose([2, 0, 1]) # 变成 (3, 500, 
500) + if self.interpolation in ("bilinear", "bicubic"): + image_tensor = image_tensor.unsqueeze(0) # now (1, 3, H, W) + return F.interpolate(image_tensor, size=self.image_size, mode=self.interpolation, align_corners=False)[0] + + def forward(self, img, skip_resize=False): + """ + Args: + img (PIL Image or Tensor): Image to be scaled. + + Returns: + PIL Image or Tensor: Rescaled image. + """ + if self.anchor_strategy == 'docowl': + selected_anchor = anchor_rank(self.anchors, self.anchor_areas, (img.size[1], img.size[0])) + elif self.anchor_strategy == 'random': + selected_anchor = random.randint(0,len(self.anchors)-1) + elif self.anchor_strategy == 'highest': + # pick the anchor with the largest area, then prefer the squarest one + selected_anchor = paddle.argmax(self.anchors[:,2]*self.anchors[:,3]*100-paddle.abs(self.anchors[:,2]-self.anchors[:,3])) + elif self.anchor_strategy == 'last': + selected_anchor = len(self.anchors)-1 + elif self.anchor_strategy == 'llava': + selected_anchor = select_best_resolution(self.anchors, self.anchor_areas, (img.size[1], img.size[0])) + else: + selected_anchor = None + assert selected_anchor is not None + + target_size = self.anchors[selected_anchor][2:].tolist() # w,h + if skip_resize: + # for debug + return selected_anchor + #return F.resize(img, [target_size[1],target_size[0]], self.interpolation, max_size=None, antialias=self.antialias), selected_anchor + image_np = np.array(img) + image_tensor = paddle.to_tensor(image_np, dtype="float32") + image_tensor = image_tensor.transpose([2, 0, 1]) # HWC -> CHW, e.g. (3, 500, 500) + if self.interpolation in ("bilinear", "bicubic"): + image_tensor = image_tensor.unsqueeze(0) # now (1, 3, H, W) + return ( + F.interpolate( + image_tensor, size=[target_size[1], target_size[0]], mode=self.interpolation, align_corners=False + )[0], + selected_anchor, + ) + + def __repr__(self) -> str: + detail = f"(size={self.image_size}, anchor={self.anchors}, interpolation={self.interpolation}, antialias={self.antialias})" + return f"{self.__class__.__name__}{detail}" + + +class CutMixin: + def __init__(self, cut_cfg={"anchors": "grid_squ_6", "anchor_strategy": "docowl", "cut_prompt": "v3", "add_global": True, "cut_prob": 1.0}) -> None: + if cut_cfg is None: + self.cut_enable = False + return + else: + self.cut_enable = True + image_size = self.image_size + anchors = cut_cfg.get('anchors','grid_33') + anchor_strategy = cut_cfg.get('anchor_strategy','docowl') + cut_prompt = cut_cfg.get('cut_prompt','v0') + self.cut_prob = cut_cfg.get('cut_prob', 1.0) + + self.force_shape_cut = cut_cfg.get('force_shape_cut', False) + force_shape_cut_anchors = cut_cfg.get('force_shape_cut_anchors', 'force_shape_cut_anchors') + + self.add_global = cut_cfg.get('add_global', False) + + # h,w + if isinstance(image_size, int): + image_size = (image_size, image_size) + self.image_size = image_size + + if anchors in grid_dict: + anchors = grid_dict[anchors] + else: + anchors = eval(anchors) + self.anchors = [tuple(_) for _ in anchors] + self.anchor_max = max([max(_) for _ in self.anchors]) + self.resizer = AnchorResize(image_size=image_size, anchors=anchors, interpolation='bicubic', anchor_strategy=anchor_strategy) + + if force_shape_cut_anchors in grid_dict: + force_shape_cut_anchors = grid_dict[force_shape_cut_anchors] + else: + force_shape_cut_anchors = eval(force_shape_cut_anchors) + self.force_shape_cut_anchors = [tuple(_) for _ in force_shape_cut_anchors] + self.force_shape_cut_anchors_max = max([max(_) for _ in self.force_shape_cut_anchors]) + + self.old_resizer =
transforms.Resize(image_size,interpolation="bicubic") + + # 把image processor的缩放去掉 只保留后面的变换 + self.image_transform = transforms.Compose(self.image_transform.transforms[1:]) + if self.add_global: + self.cut_prompt_template = cut_prompt_template_dict[cut_prompt+'_global'] + else: + self.cut_prompt_template = cut_prompt_template_dict[cut_prompt] + + self.media_tokens = ["<|image|>", "<|video|>"] + + def _process_image(self, images): + new_images = [] + cut_shape = [] + for image in images: + raw_image = image + image, selected_anchor = self.resizer(image) + image_input = self.image_transform(image) # h,w,3 -> 3,h,w + cut_shape.append((image_input.shape[1]//self.image_size[0], image_input.shape[2]//self.image_size[1])) # cut_h, cut_w + image_input = rearrange(image_input, 'C (num_h h) (num_w w) -> (num_h num_w) C h w', h=self.image_size[0], w=self.image_size[1]) + + new_images.append(image_input) + + if self.add_global: + new_images.append(self.image_transform(self.resizer.resize_global(raw_image)).unsqueeze(0)) + cut_shape.append((1,1)) + + new_images = paddle.concat(new_images, axis=0) + cut_shape_indices = build_cut_shape_indices(cut_shape) + return new_images, cut_shape, cut_shape_indices + + +class TensorType(Enum): + PADDLE = "paddle" + + +class mPLUGOwl3BatchFeature(BatchFeature): + r""" + Extend from BatchFeature for supporting various image size + """ + def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None): + super().__init__(data) + self.convert_to_tensors(tensor_type=tensor_type) + + def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None): + if tensor_type is None: + return self + + #is_tensor, as_tensor = self._get_is_as_tensor_fns(tensor_type) + is_tensor = lambda x: isinstance(x, paddle.Tensor) + as_tensor = paddle.to_tensor + + def converter(value): + try: + if not is_tensor(value): + tensor = as_tensor(value) + return tensor + except: # noqa E722 + if key == "overflowing_values": + raise ValueError("Unable to create tensor returning overflowing values of different lengths. ") + raise ValueError( + "Unable to create tensor, you should probably activate padding " + "with 'padding=True' to have batched tensors with the same length." 
+ ) + + for key, value in self.items(): + self[key] = recursive_converter(converter, value) + return self + + # def to(self, *args, **kwargs) -> "mPLUGOwl3BatchFeature": + # requires_backends(self, ["torch"]) + # import torch + + # def cast_tensor(v): + # # # check if v is a floating point + # # if torch.is_floating_point(v): + # # # cast and send to device + # # return v.to(*args, **kwargs) + # # elif device is not None: + # # return v.to(device=device) + # # else: + # # return v + # if isinstance(v, paddle.Tensor): + # # For floating point tensors + # if v.dtype in [paddle.float32, paddle.float64]: + # if "dtype" in kwargs: + # v = v.cast(kwargs["dtype"]) + # if "place" in kwargs: + # v = v.place(kwargs["place"]) + # return v + # # For non-floating point tensors, only handle device + # elif "place" in kwargs: + # return v.place(kwargs["place"]) + # return v + + # new_data = {} + # device = kwargs.get("place") + # # Check if the args are a device or a dtype + # if device is None and len(args) > 0: + # # device should be always the first argument + # arg = args[0] + # if is_torch_dtype(arg): + # # The first argument is a dtype + # pass + # elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int): + # device = arg + # else: + # # it's something else + # raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.") + # # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor` + # for k, v in self.items(): + # new_data[k] = recursive_converter(cast_tensor, v) + # self.data = new_data + # return self + + +class mPLUGOwl3ImageProcessor(BaseImageProcessor, CutMixin): + model_input_names = ["pixel_values"] + + def __init__( + self, + image_size, + mean=[0.5, 0.5, 0.5], + std=[0.5, 0.5, 0.5], + **kwargs): + super().__init__(**kwargs) + self.image_size = image_size + self.image_transform = transforms.Compose([ + transforms.Resize((image_size, image_size), interpolation="bicubic"), + transforms.ToTensor(), + transforms.Normalize(mean, std), + ]) + CutMixin.__init__(self) + + def preprocess( + self, + images: Union[Image.Image, List[Image.Image]], + cut_enable=True, + **kwargs + ) -> mPLUGOwl3BatchFeature: + if isinstance(images, Image.Image): + images_list = [images] + else: + images_list = images + + if self.cut_enable and cut_enable: + image_data, cut_shape, cut_shape_indices = self._process_image(images_list) + else: + image_data = [self.image_transform(self.resizer.resize_global(image)) for image in images_list] + image_data = paddle.stack(image_data, axis=0) + cut_shape = cut_shape_indices = None + + return mPLUGOwl3BatchFeature(data={'pixel_values': image_data, 'cut_shape':cut_shape, 'cut_shape_indices':cut_shape_indices}) + + def to_dict(self): + encoder_dict = super().to_dict() + pop_keys = ['image_transform', 'resizer', 'old_resizer', 'cut_prompt_template'] + for pk in pop_keys: + encoder_dict.pop(pk, None) + return encoder_dict diff --git a/paddlemix/models/mPLUGOwl3/modeling_hyper_qwen2.py b/paddlemix/models/mPLUGOwl3/modeling_hyper_qwen2.py new file mode 100644 index 000000000..af91ffd48 --- /dev/null +++ b/paddlemix/models/mPLUGOwl3/modeling_hyper_qwen2.py @@ -0,0 +1,1027 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +from paddle.nn import MultiHeadAttention +import paddlenlp + +from typing import List, Optional, Tuple, Union + +from einops import rearrange, repeat + +# from paddlemix.models.flash_attn_utils import ( +# has_flash_attn_func, +# is_flash_attn_available, +# ) + +#from .activations import ACT2FN +from ...activations import ACT2FN +from .bert_padding import index_first_axis, pad_input, unpad_input +from .configuration_hyper_qwen2 import HyperQwen2Config + +# if is_flash_attn_available(): +# flash_attn_func, flash_attn_varlen_func = has_flash_attn_func() +# _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) +# # >>>>>>if transformers.utils.is_flash_attn_2_available(): +# # pass +# # _flash_supports_window_size = 'window_size' in list(inspect.signature( +# # flash_attn_func).parameters) + +# from .x_sdpa import ScaleDotProductAttention + +# NOTE: the fused flash-attn rotary kernel (`apply_rotary_emb_func`) is not wired up in this +# Paddle port, so the pure-Paddle rotary path in `apply_rotary_pos_emb_core` is always used. +use_flash_rotary = False +logger = paddle.utils.try_import("logging").getLogger(name=__name__) +# _CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" +# _CONFIG_FOR_DOC = "HyperQwen2Config" + + +# def _get_unpad_data(attention_mask): +# seqlens_in_batch = attention_mask.sum(axis=-1, dtype="int32") +# paddle.utils.try_import("warnings").warn("Now, the return shape is inconsistent with torch when as_tuple is True") +# indices = paddle.nonzero(x=attention_mask.flatten(), as_tuple=False).flatten() +# max_seqlen_in_batch = seqlens_in_batch.max().item() +# cu_seqlens = paddle.nn.functional.pad( +# x=paddle.cumsum(x=seqlens_in_batch, axis=0, dtype="int32"), pad=(1, 0), pad_from_left_axis=False +# ) +# return indices, cu_seqlens, max_seqlen_in_batch + + +class Qwen2RMSNorm(paddle.nn.Layer): + def __init__(self, hidden_size, eps=1e-06): + """ + Qwen2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = paddle.base.framework.EagerParamBase.from_tensor(tensor=paddle.ones(shape=hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to("float32") + variance = hidden_states.pow(y=2).mean(axis=-1, keepdim=True) + hidden_states = hidden_states * paddle.rsqrt(x=variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +class Qwen2RotaryEmbedding(nn.Layer): + def __init__(self, dim, max_position_embeddings=2048, base=10000): + super().__init__() + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / self.base ** ( + paddle.arange(start=0, end=self.dim, step=2, dtype="int64").astype(dtype="float32") / self.dim + ) + self.register_buffer(name="inv_freq", tensor=inv_freq, persistable=False) + self._set_cos_sin_cache( + seq_len=max_position_embeddings, dtype=paddle.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, dtype): + self.max_seq_len_cached = seq_len + t = paddle.arange(dtype="int64",
end=self.max_seq_len_cached).astype(dtype=self.inv_freq.dtype) + freqs = paddle.outer(x=t, y=self.inv_freq) + emb = paddle.concat(x=(freqs, freqs), axis=-1) + self.register_buffer(name="cos_cached", tensor=emb.cos().to(dtype), persistable=False) + self.register_buffer(name="sin_cached", tensor=emb.sin().to(dtype), persistable=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + +class RotaryEmbedding(paddle.nn.Layer): + def __init__(self, dim, base=10000, use_fp32=False, use_outer_in_rope=False): + super().__init__() + self.dim = dim + self.base = base + self.use_fp32 = use_fp32 + if use_fp32: + self.inv_freq = 1.0 / base ** (paddle.arange(start=0, end=dim, step=2).astype(dtype="float32") / dim) + else: + inv_freq = 1.0 / base ** (paddle.arange(start=0, end=dim, step=2).astype(dtype="float32") / dim) + self.register_buffer(name="inv_freq", tensor=inv_freq) + self._rotary_pos_emb_cache = None + self._seq_len_cached = 0 + self.use_outer_in_rope = use_outer_in_rope + self._ntk_alpha_cached = 1.0 + + def update_rotary_pos_emb_cache(self, max_seq_len, offset=0, ntk_alpha=1.0): + seqlen = max_seq_len + offset + if seqlen > self._seq_len_cached or ntk_alpha != self._ntk_alpha_cached: + base = self.base * ntk_alpha ** (self.dim / (self.dim - 2)) + self.inv_freq = 1.0 / base ** ( + paddle.arange(start=0, end=self.dim, step=2).astype(dtype="float32") / self.dim + ) + self._seq_len_cached = seqlen + self._ntk_alpha_cached = ntk_alpha + seq = paddle.arange(end=seqlen) + if self.use_outer_in_rope: + freqs = paddle.outer(x=seq.astype(dtype=self.inv_freq.dtype), y=self.inv_freq) + else: + freqs = einsum("i , j -> i j", seq.astype(dtype=self.inv_freq.dtype), self.inv_freq) + emb = paddle.concat(x=(freqs, freqs), axis=-1) + from einops import rearrange + + self._rotary_pos_emb_cache = rearrange(emb, "n d -> n 1 1 d") + + def forward(self, max_seq_len, offset=0, ntk_alpha=1.0): + self.update_rotary_pos_emb_cache(max_seq_len, offset, ntk_alpha) + return self._rotary_pos_emb_cache[offset : offset + max_seq_len] + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : tuple(x.shape)[-1] // 2] + x2 = x[..., tuple(x.shape)[-1] // 2 :] + return paddle.concat(x=(-x2, x1), axis=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. 
Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class Qwen2MLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = tuple(hidden_states.shape) + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(shape=[batch, num_key_value_heads, n_rep, slen, head_dim]) + return hidden_states.reshape([batch, num_key_value_heads * n_rep, slen, head_dim]) + + + + + +def _rotate_half(x): + """ + change sign so the last dimension becomes [-odd, +even] + """ + from einops import rearrange + x = rearrange(x, '... (j d) -> ... j d', j=2) + x1, x2 = x.unbind(axis=-2) + return paddle.concat(x=(-x2, x1), axis=-1) + + +def apply_rotary_pos_emb_core(t, freqs, use_fp32=False, debug=False): + """ + input tensor t is of shape [seq_length, ..., dim] + rotary positional embeding tensor freqs is of shape [seq_length, ..., dim] + check https://kexue.fm/archives/8265 for detailed formulas + """ + if use_flash_rotary and use_fp32: + t_ = rearrange(t, "s b ... -> b s ...") + if use_fp32: + t_ = t_.astype(dtype="float32") + freqs = freqs.squeeze(axis=1).squeeze(axis=1) + cos = freqs[:, :freqs.shape[-1] // 2].cos() + sin = freqs[:, :freqs.shape[-1] // 2].sin() + output = apply_rotary_emb_func(t_, cos, sin).astype(dtype=t.dtype) # TODO + return rearrange(output, 'b s ... -> s b ...') + + rot_dim = freqs.shape[-1] + # ideally t_pass is empty so rotary pos embedding is applied to all tensor t + t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:] + + if use_fp32: + t_ = t_.astype(dtype="float32") + t_pass_ = t_pass_.astype(dtype="float32") + # first part is cosine component + # second part is sine component, need to change signs with _rotate_half method + t_ = (t_ * freqs.cos()) + (_rotate_half(t_) * freqs.sin()) + return paddle.concat(x=(t_, t_pass_), axis=-1).astype(dtype=t.dtype) + + +class HyperQwen2Attention(nn.Layer): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". 
+ """ + + def __init__(self, config: HyperQwen2Config, layer_idx: Optional[int] = None, is_hyper_enabled=False): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " + "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias_attr=True) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias_attr=True) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias_attr=True) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias_attr=False) + + self.rotary_emb = Qwen2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + self.rotary_emb_core = RotaryEmbedding( + self.head_dim, base=self.rope_theta, use_fp32=True, use_outer_in_rope=True + ) + # Hyper Attention Modules + self.is_hyper_enabled = is_hyper_enabled + if self.is_hyper_enabled: + self.v_kv_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim * 2, bias_attr=True) + + self.visual_cache={} + + self.use_flexattention = True + + def apply_mi_rope(self, key_layer, image_pos, length_each_img): + # input shape should be [s b h d] + key_layer = rearrange(key_layer, 'b h s d -> s b h d') + # if self.rotary_emb_core.inv_freq.device!=key_layer.device: + # self.rotary_emb_core.inv_freq = self.rotary_emb_core.inv_freq.to(key_layer.device) + rotary_pos_emb_max_seq_len = self.config.max_position_embeddings + ntk_alpha = 1 + rotary_pos_emb = self.rotary_emb_core(rotary_pos_emb_max_seq_len, ntk_alpha=ntk_alpha) + assert rotary_pos_emb is not None + + if isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = rotary_pos_emb + else: + rotary_pos_emb = ((rotary_pos_emb,) * 2) + + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + + k_pos_emb = repeat(k_pos_emb[image_pos], 'N_img b h d -> (N_img L) b h d', L=length_each_img) # N_img, dim + + key_layer = apply_rotary_pos_emb_core(key_layer, k_pos_emb, use_fp32=True) # TODO difference + key_layer = rearrange(key_layer, 's b h d -> b h s d') + return key_layer + + +# def hyper_mask_always_true(b, h, q_idx, kv_idx): +# return q_idx>=0 + +# def causal(b, h, q_idx, kv_idx): +# return q_idx >= kv_idx + + +# def create_hyper_attention(media_starts_extend, q_len, kv_len, each_visual_len): + +# visual_len = kv_len - q_len +# def hyper_mask_dynamic(b, h, q_idx, kv_idx): +# return torch.where(kv_idx=media_starts_extend[kv_idx], causal(b, h, q_idx, kv_idx-visual_len)) + +# return create_block_mask(hyper_mask_dynamic, B=None, H=None, Q_LEN=q_len, 
KV_LEN=kv_len, BLOCK_SIZE=128, _compile=True) + +# Copied from transformers.models.mistral.modeling_mistral.MistralSdpaAttention with Mistral->Qwen2 +class HyperQwen2SdpaAttention(HyperQwen2Attention): + """ + Qwen2 attention module using paddle.nn.functional.scaled_dot_product_attention. This module inherits from + `Qwen2Attention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + def hyperattention(self,hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + image_embeds=None, + media_offset=None, + past_key_value: Optional[MultiHeadAttention.Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + )-> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + bsz, q_len, _ = hidden_states.shape + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.reshape([bsz, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + key_states = key_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) + value_states = value_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + # add visual to kv + length_each_img = image_embeds.shape[1] + image_embeds = self.v_kv_proj(image_embeds) + image_start = 0 + context_layer = [] + for bi, media_starts in enumerate(media_offset): + num_images = media_starts.shape[0] + if num_images > 0: + if q_len == 1: + full_mask = paddle.ones((1,1,1, num_images*length_each_img + kv_seq_len)).astype(paddle.bool) + else: + causal_mask = paddle.tril(paddle.ones([q_len, kv_seq_len])).astype(paddle.bool) + # expand to (bsz, 1, q_len, kv_seq_len) + causal_mask = causal_mask.unsqueeze(0).unsqueeze(0) + + # text token i may only attend to images whose media_start is <= i; + # expand that text-to-visual mask to one entry per visual token + matrix = paddle.arange(q_len).reshape([-1,1]) + t2vmask = ~(matrix < media_starts.reshape([1, -1])) + t2vmask = repeat(t2vmask, 'seq_t seq_v -> 1 1 seq_t (seq_v v_token)', v_token=length_each_img) + full_mask = paddle.concat([t2vmask, causal_mask], axis=3) # unsqueeze batch dim (batch, 1, seq_q, seq_k) + + curr_query_layer = query_states[bi:bi+1] + # order is sbhd + curr_visual_key_layer, curr_visual_value_layer = rearrange(image_embeds[image_start:image_start+num_images], 'BL Lv (H KV D) -> KV 1 H (BL Lv) D', KV=2, H=self.num_key_value_heads) # b h s d + image_start += num_images + + curr_visual_key_layer = self.apply_mi_rope(curr_visual_key_layer, media_starts, length_each_img=length_each_img) + + curr_visual_key_layer = repeat_kv(curr_visual_key_layer, self.num_key_value_groups) + curr_visual_value_layer = repeat_kv(curr_visual_value_layer, self.num_key_value_groups) + + curr_key_layer = paddle.concat([curr_visual_key_layer, key_states[bi:bi+1]], axis=2) + curr_value_layer = paddle.concat([curr_visual_value_layer,
value_states[bi:bi+1]], axis=2) + is_causal = False + else: + # 执行无图attention + curr_query_layer = query_states[bi:bi+1] + curr_key_layer = key_states[bi:bi+1] + curr_value_layer = value_states[bi:bi+1] + is_causal = True if q_len > 1 else False + if is_causal: + full_mask = None + else: + causal_mask = paddle.tril(paddle.ones([q_len, kv_seq_len])).astype(paddle.bool) + # 扩展维度以匹配 (bsz, 1, q_len, kv_seq_len) + causal_mask = causal_mask.unsqueeze(0).unsqueeze(0) + full_mask = causal_mask + + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # # Reference: https://github.com/pytorch/pytorch/issues/112577. + # if curr_query_layer.device.type == "cuda" and full_mask is not None: + # curr_query_layer = curr_query_layer.contiguous() + # curr_key_layer = curr_key_layer.contiguous() + # curr_value_layer = curr_value_layer.contiguous() + + attn_output = paddle.nn.functional.scaled_dot_product_attention( + curr_query_layer, # (batch, ..., sequence, dim) + curr_key_layer, + curr_value_layer, + attn_mask=full_mask, # (N, ..., L, S) A boolean mask where a value of True indicates that the element *should* take part in attention. + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=is_causal, + # enable_gqa=True, # gqa can not be used because mask requires XFORMERS and not support gqa + ) # -> (N, ..., L, Ev) + assert attn_output.shape[0] == 1 + context_layer.append(attn_output) + attn_output = context_layer = paddle.concat(context_layer, axis=0) + + attn_output = attn_output.transpose([0, 2, 1, 3]) + attn_output = attn_output.reshape([bsz, q_len, self.hidden_size]) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + # Adapted from Qwen2Attention.forward + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + image_embeds=None, + media_offset=None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + if self.is_hyper_enabled and image_embeds is not None: + return self.hyperattention(hidden_states, attention_mask, position_ids, image_embeds, media_offset, past_key_value, output_attentions, use_cache) + + bsz, q_len, _ = hidden_states.shape + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.reshape([bsz, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + key_states = key_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) + value_states = value_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: # (1,1,1,60) + if tuple(attention_mask.shape) != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {bsz, 1, q_len, kv_seq_len}, but is {tuple(attention_mask.shape)}" + ) + # # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # # Reference: https://github.com/pytorch/pytorch/issues/112577. + # if query_states.device.type == "cuda" and attention_mask is not None: + # query_states = query_states.contiguous() + # key_states = key_states.contiguous() + # value_states = value_states.contiguous() + + attn_output = paddle.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose([0, 2, 1, 3]) + attn_output = attn_output.reshape([bsz, q_len, self.hidden_size]) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +from paddlenlp.transformers.qwen2.modeling import Qwen2Attention +# Original Attention of Qwen2 +QWEN2_ATTENTION_CLASSES = { + "eager": Qwen2Attention, + "flash_attention_2": Qwen2Attention, #Qwen2FlashAttention2, + "sdpa": Qwen2Attention, #Qwen2SdpaAttention, +} + + +class HyperQwen2DecoderLayer(nn.Layer): + def __init__(self, config: HyperQwen2Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + if config.use_sliding_window and config._attn_implementation != "flash_attention_2": + logger.warning_once( + f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " + "unexpected results may be encountered." 
+ ) + self.is_hyper_enabled = (layer_idx+1) in config.hyper_layers + if self.is_hyper_enabled: + self.self_attn = HyperQwen2SdpaAttention(config, layer_idx, is_hyper_enabled=self.is_hyper_enabled) + else: + #self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + self.self_attn = QWEN2_ATTENTION_CLASSES["flash_attention_2"](config, layer_idx) + + self.mlp = Qwen2MLP(config) + self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + image_embeds=None, + media_offset=None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + # Shared LayerNorm + if image_embeds is not None and self.is_hyper_enabled: + image_embeds = self.input_layernorm(image_embeds) + media_kwargs = {"image_embeds": image_embeds, "media_offset": media_offset} + else: + image_embeds = media_offset = None + media_kwargs = {} + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + **media_kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class Qwen2PreTrainedModel(paddlenlp.transformers.model_utils.PretrainedModel): + config_class = HyperQwen2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["HyperQwen2DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, layer): + std = self.config.initializer_range + if isinstance(layer, (paddle.nn.Linear, paddle.nn.Conv3D)): + paddle.nn.initializer.Normal(mean=0.0, std=std)(layer.weight) + if layer.bias is not None: + paddle.nn.initializer.Constant(0.0)(layer.bias) + elif isinstance(layer, paddle.nn.Embedding): + paddle.nn.initializer.Normal(mean=0.0, std=std)(layer.weight) + if 
layer._padding_idx is not None: + with paddle.no_grad(): + layer.weight[layer._padding_idx] = 0.0 + + +class HyperQwen2Model(Qwen2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`] + + Args: + config: HyperQwen2Config + """ + + def __init__(self, config: HyperQwen2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.LayerList( + [HyperQwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = 'flash_attention_2' #config._attn_implementation + self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + #self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def forward( + self, + input_ids: paddle.Tensor = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + image_embeds=None, + media_offset=None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, paddlenlp.transformers.model_outputs.BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = False #not isinstance(past_key_values, Cache) + #if use_legacy_cache: + # past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + #device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = paddle.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=paddle.int64 + ) + position_ids = position_ids.unsqueeze(0).reshape([-1, seq_length]) + else: + position_ids = position_ids.reshape([-1, seq_length]).astype(dtype="int64") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + # beam search + if batch_size != len(media_offset): + # The model is performing beamsearch, repeat the visual content + beam_factor = batch_size // len(media_offset) + assert batch_size % len(media_offset) == 0 + media_offset = media_offset * beam_factor + image_embeds = repeat(image_embeds, 'B L D -> (factor B) L D', factor=beam_factor) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + image_embeds=image_embeds, + media_offset=media_offset, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v 
in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return paddlenlp.transformers.model_outputs.BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class HyperQwen2ForCausalLM(Qwen2PreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = HyperQwen2Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False) + + # Initialize weights and apply final processing + #self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + def forward( + self, + input_ids: paddle.Tensor = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + image_embeds=None, + media_offset=None, + labels: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, paddlenlp.transformers.model_outputs.CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Qwen2ForCausalLM + + >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + image_embeds=image_embeds, + media_offset=media_offset, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.astype(dtype="float32") + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + shift_logits = shift_logits.reshape([-1, self.config.vocab_size]) + shift_labels = shift_labels.reshape([-1]) + # Enable model parallelism + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return paddlenlp.transformers.model_outputs.CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, MultiHeadAttention.Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
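The three numbered comments above describe how tokens the cache has already seen are dropped before generation continues; the hunk resumes below with the attention-mask cropping that the last comment mentions. As a minimal, standalone sketch of that bookkeeping (illustrative function name, not part of the patch):

```python
import paddle

def trim_generation_inputs(input_ids, attention_mask, past_length):
    """Keep only the tokens the model has not processed yet (illustrative sketch)."""
    # Case 1: the mask is longer than input_ids (e.g. inputs_embeds were used for the
    # first step), so the new tokens are the tail beyond the cached prefix.
    if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
        input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
    # Case 2: input_ids still contains the already-processed prefix; drop it.
    elif past_length < input_ids.shape[1]:
        input_ids = input_ids[:, past_length:]
    # Case 3: past_length >= len(input_ids) -> input_ids already holds only new tokens.
    return input_ids

# 5 tokens are cached, 7 ids are passed in -> only the last 2 are new.
ids = paddle.arange(7).unsqueeze(0)
mask = paddle.ones([1, 7], dtype="int64")
print(trim_generation_inputs(ids, mask, past_length=5))  # [[5, 6]]
```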
+ if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.astype(dtype="int64").cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + 'image_embeds': kwargs.get('image_embeds'), + 'media_offset': kwargs.get('media_offset'), + } + ) + return model_inputs diff --git a/paddlemix/mPLUGOwl3/modeling_mplugowl3.py b/paddlemix/models/mPLUGOwl3/modeling_mplugowl3.py similarity index 59% rename from paddlemix/mPLUGOwl3/modeling_mplugowl3.py rename to paddlemix/models/mPLUGOwl3/modeling_mplugowl3.py index 1729033c2..5261cf4b0 100644 --- a/paddlemix/mPLUGOwl3/modeling_mplugowl3.py +++ b/paddlemix/models/mPLUGOwl3/modeling_mplugowl3.py @@ -19,6 +19,7 @@ from typing import List, Optional import paddle +import paddle.nn as nn import paddlenlp from paddlenlp.generation import TextIteratorStreamer from paddlenlp.transformers import Qwen2ForCausalLM, Qwen2PretrainedModel @@ -29,26 +30,11 @@ from .modeling_hyper_qwen2 import HyperQwen2ForCausalLM from .modeling_navit_siglip import SigLipVisionTransformer from .processing_mplugowl3 import mPLUGOwl3Processor -from .x_sdpa import ScaleDotProductAttention -def is_flash_attn_available(): - try: - import paddle - - if "npu" in paddle.get_device(): # NOTE: flash attn has not been tested yet - return False - q = paddle.rand((1, 4, 2, 8)).astype("float16") - output = paddle.nn.functional.flash_attention.flash_attention(q, q, q, 0.9, False, False) - return True - except: - return False - - -# >>>>>>class mPLUGOwl3PreTrainedModel(transformers.Qwen2PreTrainedModel): -# config_class = mPLUGOwl3Config class mPLUGOwl3PreTrainedModel(Qwen2PretrainedModel): config_class = mPLUGOwl3Config + _no_split_modules = ["HyperQwen2DecoderLayer", "SiglipVisionTransformer"] class mPLUGOwl3Model(mPLUGOwl3PreTrainedModel): @@ -57,26 +43,23 @@ def __init__(self, config): self.language_model = HyperQwen2ForCausalLM(config) self.vision_model = self.init_vision_module() self.vision_dim = self.vision_model.embed_dim - self.embed_dim = self.language_model.config.hidden_size - self.vision2text_model = paddle.nn.Linear(in_features=self.vision_dim, out_features=self.embed_dim) + self.embed_dim = self.config.hidden_size + self.vision2text_model = nn.Sequential( + nn.Linear(self.vision_dim, self.embed_dim), + nn.GELU(), + nn.Linear(self.embed_dim, self.embed_dim) + ) self.processor = None - self.terminators = ["<|im_end|>", "<|endoftext|>"] + self.terminators = ['<|im_end|>', '<|endoftext|>'] + self.vision_batch_size = config.vision_batch_size def init_vision_module(self): - print("-" * 100) - if is_flash_attn_available(): - self.config.vision_config._attn_implementation = "flash_attention_2" - else: - self.config.vision_config._attn_implementation = "eager" - # 
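As an aside to the `prepare_inputs_for_generation` changes earlier in this chunk: `position_ids` are rebuilt on the fly from the attention mask with a cumulative sum so that left padding gets dummy positions. A tiny standalone illustration (using `paddle.where` instead of the in-place `masked_fill_`, purely for the example):

```python
import paddle

attention_mask = paddle.to_tensor([[0, 0, 1, 1, 1]])           # one left-padded row
position_ids = attention_mask.astype("int64").cumsum(-1) - 1   # [[-1, -1, 0, 1, 2]]
# padded positions get a harmless placeholder value of 1
position_ids = paddle.where(attention_mask == 0, paddle.ones_like(position_ids), position_ids)
print(position_ids.tolist())  # [[1, 1, 0, 1, 2]]
```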
self.config.vision_config._attn_implementation = (self.config. - # vision_config._attn_implementation) - # >>>>>> model = (transformers.models.siglip.modeling_siglip. - # SiglipVisionTransformer(self.config.vision_config)) - print("*" * 100) + #self.config.vision_config._attn_implementation = self.config.vision_config._attn_implementation + self.config.vision_config._attn_implementation = "flash_attention_2" model = SigLipVisionTransformer(self.config.vision_config) - print("-" * 100) - setattr(model, "embed_dim", model.embeddings.embed_dim) - setattr(model, "patch_size", model.embeddings.patch_size) + + setattr(model, 'embed_dim', model.embeddings.embed_dim) + setattr(model, 'patch_size', model.embeddings.patch_size) return model def get_input_embeddings(self): @@ -97,32 +80,45 @@ def set_decoder(self, decoder): def get_decoder(self): return self.language_model + def _small_batched_forward(self, pixel_values): + vision_batch_size = self.vision_batch_size + image_forward_out = [] + B = len(pixel_values) + for i in range(0, B, vision_batch_size): + start_idx = i + end_idx = min(B, i + vision_batch_size) + tmp_hs = self.vision_model(pixel_values[start_idx:end_idx], output_hidden_states=True).hidden_states[-2] + image_forward_out.append(tmp_hs) + vision_embedding = paddle.concat(image_forward_out, axis=0) + assert vision_embedding.shape[0] == B + return vision_embedding + def forward_image(self, pixel_values): if pixel_values is None: return None dtype = self.language_model.model.embed_tokens.weight.dtype - with paddle.no_grad(): - print("*" * 100) - image_embeds = self.vision_model(pixel_values.to(dtype), output_hidden_states=True).hidden_states[-2] - print("*" * 150) + image_embeds = self._small_batched_forward(pixel_values.to(dtype)) + # image_embeds = self.vision_model(pixel_values.to(dtype), output_hidden_states=True).hidden_states[-2] + if self.vision2text_model is not None: image_embeds = self.vision2text_model(image_embeds) else: pass + return image_embeds def forward(self, pixel_values=None, **kwargs): image_embeds = self.forward_image(pixel_values) - return self.language_model(image_embeds=image_embeds, **kwargs) - + + return self.language_model( + image_embeds=image_embeds, + **kwargs + ) + def _decode(self, input_ids, image_embeds, media_offset, tokenizer, attention_mask, decode_text=False, **kwargs): terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators] - print(f"terminators dtype: {type(terminators)}") - print("inputids:", input_ids) - print(f"attention_mask: {attention_mask}") - # print(self.language_model) output = self.language_model.generate( - input_ids=input_ids, # (1,60) + input_ids=input_ids, image_embeds=image_embeds, media_offset=media_offset, pad_token_id=0, @@ -130,28 +126,28 @@ def _decode(self, input_ids, image_embeds, media_offset, tokenizer, attention_ma attention_mask=attention_mask, **kwargs, )[0] - output = output[:, tuple(input_ids.shape)[1] :] + output = output[:,input_ids.shape[1]:] + print('output', output) if decode_text: return self._decode_text(output, tokenizer) return output def _decode_stream(self, input_ids, image_embeds, media_offset, tokenizer, **kwargs): terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators] - # >>>>>> streamer = transformers.TextIteratorStreamer(tokenizer=tokenizer) streamer = TextIteratorStreamer(tokenizer=tokenizer) generation_kwargs = { - "input_ids": input_ids, - "image_embeds": image_embeds, - "media_offset": media_offset, - "pad_token_id": 0, - "eos_token_id": terminators, - 
"streamer": streamer, + 'input_ids': input_ids, + 'image_embeds': image_embeds, + 'media_offset': media_offset, + 'pad_token_id': 0, + 'eos_token_id': terminators, + 'streamer': streamer } generation_kwargs.update(kwargs) + thread = Thread(target=self.language_model.generate, kwargs=generation_kwargs) - """Class Method: *.start, can not convert, please check whether it is torch.Tensor.*/Optimizer.*/nn.Module.*/torch.distributions.Distribution.*/torch.autograd.function.FunctionCtx.*/torch.profiler.profile.*/torch.autograd.profiler.profile.*, and convert manually""" - # >>>>>> thread.start() thread.start() + return streamer def _decode_text(self, result_ids, tokenizer): @@ -165,7 +161,7 @@ def _decode_text(self, result_ids, tokenizer): return result_text def init_processor(self, tokenizer): - ip = mPLUGOwl3ImageProcessor(image_size=384) + ip = mPLUGOwl3ImageProcessor(image_size=378) self.processor = mPLUGOwl3Processor(image_processor=ip, tokenizer=tokenizer) processor = self.processor return processor @@ -185,24 +181,12 @@ def generate( with paddle.no_grad(): image_embeds = self.forward_image(pixel_values) + if stream: - result = self._decode_stream( - input_ids=input_ids, - image_embeds=image_embeds, - media_offset=media_offset, - tokenizer=tokenizer, - **kwargs, - ) + result = self._decode_stream(input_ids=input_ids, image_embeds=image_embeds, media_offset=media_offset, tokenizer=tokenizer, **kwargs) else: - result = self._decode( - input_ids=input_ids, - image_embeds=image_embeds, - media_offset=media_offset, - tokenizer=tokenizer, - attention_mask=attention_mask, - decode_text=decode_text, - **kwargs, - ) + result = self._decode(input_ids=input_ids, image_embeds=image_embeds, media_offset=media_offset, tokenizer=tokenizer, attention_mask=attention_mask, decode_text=decode_text, **kwargs) + return result def chat( @@ -216,39 +200,60 @@ def chat( min_new_tokens=0, sampling=True, max_inp_length=8192, - system_prompt="", + system_prompt='', stream=False, max_slice_nums=None, use_image_id=None, **kwargs ): - cut_flag = kwargs.get("kwargs", True) + cut_flag = kwargs.get('kwargs', True) if processor is None: if self.processor is None: processor = self.init_processor(tokenizer) else: processor = self.processor inputs = processor(messages, images=images, videos=videos, cut_enable=cut_flag) - inputs.to("cuda") - inputs.update({"tokenizer": tokenizer, "max_new_tokens": max_new_tokens}) + inputs.update({ + 'tokenizer': tokenizer, + 'max_new_tokens': max_new_tokens, + # 'stream':True, + }) if sampling: - generation_config = {"top_p": 0.8, "top_k": 100, "temperature": 0.7, "do_sample": True} + generation_config = { + "top_p": 0.8, + "top_k": 100, + "temperature": 0.7, + "do_sample": True, + # "repetition_penalty": 1.05 + } else: - generation_config = {"num_beams": 3} + generation_config = { + "num_beams": 3, + # "repetition_penalty": 1.2, + } + if min_new_tokens > 0: - generation_config["min_new_tokens"] = min_new_tokens - generation_config.update((k, kwargs[k]) for k in generation_config.keys() & kwargs.keys()) - with paddle.no_grad(): - res = self.generate(**inputs, stream=stream, decode_text=True, **generation_config) - if stream: + generation_config['min_new_tokens'] = min_new_tokens + generation_config.update( + (k, kwargs[k]) for k in generation_config.keys() & kwargs.keys() + ) + with paddle.inference_mode(): + res = self.generate( + **inputs, + stream=stream, + decode_text=True, + **generation_config + ) + + if stream: def stream_gen(): for text in res: for term in self.terminators: - text = 
text.replace(term, "") + text = text.replace(term, '') yield text - return stream_gen() + else: answer = res[0] return answer diff --git a/paddlemix/mPLUGOwl3/modeling_navit_siglip.py b/paddlemix/models/mPLUGOwl3/modeling_navit_siglip.py similarity index 93% rename from paddlemix/mPLUGOwl3/modeling_navit_siglip.py rename to paddlemix/models/mPLUGOwl3/modeling_navit_siglip.py index 4f736c50d..e4ac2b2a7 100644 --- a/paddlemix/mPLUGOwl3/modeling_navit_siglip.py +++ b/paddlemix/models/mPLUGOwl3/modeling_navit_siglip.py @@ -742,55 +742,55 @@ def forward( ) -# Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position -def _prepare_4d_causal_attention_mask_with_cache_position( - attention_mask: paddle.Tensor, - sequence_length: int, - target_length: int, - dtype: paddle.dtype, - min_dtype: float, - cache_position: paddle.Tensor, - batch_size: int, -): - """ - Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape - `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. - - Args: - attention_mask (`paddle.Tensor`): - A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. - sequence_length (`int`): - The sequence length being processed. - target_length (`int`): - The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. - dtype (`paddle.dtype`): - The dtype to use for the 4D attention mask. - min_dtype (`float`): - The minimum value representable with the dtype `dtype`. - cache_position (`paddle.Tensor`): - Indices depicting the position of the input sequence tokens in the sequence. - batch_size (`paddle.Tensor`): - Batch size. - """ - if attention_mask is not None and attention_mask.dim() == 4: - # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. - causal_mask = attention_mask - else: - causal_mask = paddle.full([sequence_length, target_length], fill_value=min_dtype, dtype=dtype) - if sequence_length != 1: - causal_mask = paddle.triu(x=causal_mask, diagonal=1) - causal_mask *= paddle.arange(target_length) > cache_position.reshape([-1, 1]) - causal_mask = causal_mask[None, None, :, :].expand(shape=[batch_size, 1, -1, -1]) - if attention_mask is not None: - causal_mask = causal_mask.clone() - mask_length = tuple(attention_mask.shape)[-1] - padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] - padding_mask = padding_mask == 0 - causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( - mask=padding_mask, value=min_dtype - ) - - return causal_mask +# # Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position +# def _prepare_4d_causal_attention_mask_with_cache_position( +# attention_mask: paddle.Tensor, +# sequence_length: int, +# target_length: int, +# dtype: paddle.dtype, +# min_dtype: float, +# cache_position: paddle.Tensor, +# batch_size: int, +# ): +# """ +# Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape +# `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. 
+ +# Args: +# attention_mask (`paddle.Tensor`): +# A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. +# sequence_length (`int`): +# The sequence length being processed. +# target_length (`int`): +# The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. +# dtype (`paddle.dtype`): +# The dtype to use for the 4D attention mask. +# min_dtype (`float`): +# The minimum value representable with the dtype `dtype`. +# cache_position (`paddle.Tensor`): +# Indices depicting the position of the input sequence tokens in the sequence. +# batch_size (`paddle.Tensor`): +# Batch size. +# """ +# if attention_mask is not None and attention_mask.dim() == 4: +# # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. +# causal_mask = attention_mask +# else: +# causal_mask = paddle.full([sequence_length, target_length], fill_value=min_dtype, dtype=dtype) +# if sequence_length != 1: +# causal_mask = paddle.triu(x=causal_mask, diagonal=1) +# causal_mask *= paddle.arange(target_length) > cache_position.reshape([-1, 1]) +# causal_mask = causal_mask[None, None, :, :].expand(shape=[batch_size, 1, -1, -1]) +# if attention_mask is not None: +# causal_mask = causal_mask.clone() +# mask_length = tuple(attention_mask.shape)[-1] +# padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] +# padding_mask = padding_mask == 0 +# causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( +# mask=padding_mask, value=min_dtype +# ) + +# return causal_mask class SigLipVisionTransformer(SigLipPreTrainedModel): diff --git a/paddlemix/mPLUGOwl3/processing_mplugowl3.py b/paddlemix/models/mPLUGOwl3/processing_mplugowl3.py similarity index 57% rename from paddlemix/mPLUGOwl3/processing_mplugowl3.py rename to paddlemix/models/mPLUGOwl3/processing_mplugowl3.py index e11f790ba..cd5013f59 100644 --- a/paddlemix/mPLUGOwl3/processing_mplugowl3.py +++ b/paddlemix/models/mPLUGOwl3/processing_mplugowl3.py @@ -12,11 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
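For reference next to the mask builder commented out above: a minimal sketch (hypothetical helper name, simplified past-length handling) of how a 2D padding mask becomes the additive 4D causal mask that the attention layers in this patch consume:

```python
import paddle

def make_additive_causal_mask(attention_mask_2d, past_len=0, neg_value=-1e4):
    """attention_mask_2d: (batch, past_len + q_len) with 1 = real token, 0 = padding."""
    bsz, key_len = attention_mask_2d.shape
    q_len = key_len - past_len
    # Causal pattern; cached positions (the first past_len keys) stay visible.
    causal = paddle.tril(paddle.ones([q_len, key_len], dtype="bool"), diagonal=past_len)
    allowed = paddle.logical_and(
        causal.unsqueeze(0).unsqueeze(0),                                  # (1, 1, q, k)
        attention_mask_2d.reshape([bsz, 1, 1, key_len]).astype("bool"),    # (b, 1, 1, k)
    )
    # 0 where attention is allowed, a large negative value where it is masked out.
    return (1.0 - allowed.astype("float32")) * neg_value                   # (b, 1, q, k)

mask = make_additive_causal_mask(paddle.to_tensor([[0, 1, 1, 1]]))
print(mask.shape)  # [1, 1, 4, 4]
```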
-import sys - -sys.path.append("/home/aistudio/paddle_test/mPLUGOwl3/utils") import paddle -import paddle_aux import paddlenlp from paddlenlp.transformers.processing_utils import ProcessorMixin @@ -37,20 +33,22 @@ OWL_MEDIA_TOKEN = ["<|image|>"] -class MediaIndicesHelper: +class MediaIndicesHelper(): def __init__(self, tokenizer) -> None: self.media_position = [] self.tokenizer = tokenizer - + def has_media(self, text, media_tokens=None): if media_tokens is None: media_tokens = OWL_MEDIA_TOKEN - has_media_flag = any([(media_token == text) for media_token in media_tokens]) - if any([(media_token in text) for media_token in media_tokens]): + has_media_flag = any([media_token == text for media_token in media_tokens]) + if any([media_token in text for media_token in media_tokens]): + # 不允许出现text中包含media token但是不仅仅是media token。 media token必须单独为一个chunk assert has_media_flag, text return has_media_flag - + def add_media(self, text_chunk, text=None, tokenize_fn=None): + # cross assert tokenize_fn is not None assert text is not None assert text in OWL_MEDIA_TOKEN @@ -63,38 +61,31 @@ def add_media(self, text_chunk, text=None, tokenize_fn=None): def cal_media_offset(self, input_ids): if len(self.media_position) == 0: - return paddle.ones_like(x=input_ids) * -1000000 - media_starts = paddle.to_tensor(data=[_[0] for _ in self.media_position]).reshape(1, -1) - rng = paddle.arange(end=tuple(input_ids.shape)[0]).reshape(-1, 1) + return paddle.ones_like(input_ids)*(-1000000) + + media_starts = paddle.to_tensor([_[0] for _ in self.media_position]).reshape([1,-1]) + rng = paddle.arange(input_ids.shape[0]).reshape([-1,1]) matrix = (rng > media_starts).sum(axis=1) + return matrix - def len_images(self): + def len_images(self,): return len(self.media_position) -# >>>>>>class mPLUGOwl3Processor(transformers.processing_utils.ProcessorMixin): class mPLUGOwl3Processor(ProcessorMixin): - """ + r""" Args: image_processor ([`mPLUGOwl3ImageProcessor`], *optional*): The image processor is a required input. tokenizer ([`LlamaTokenizerWrapper`], *optional*): The tokenizer is a required input. 
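A short aside on `MediaIndicesHelper.cal_media_offset` above: it simply counts, for every token position, how many media chunks start strictly before that position. A worked example with made-up positions:

```python
import paddle

# Three <|image|> chunks start at token indices 2, 7 and 11 in a 15-token sequence.
media_starts = paddle.to_tensor([2, 7, 11]).reshape([1, -1])   # (1, num_media)
rng = paddle.arange(15).reshape([-1, 1])                       # (seq_len, 1)
offset = (rng > media_starts).sum(axis=1)                      # media started before each token
print(offset.tolist())
# [0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3]
```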
""" - attributes = ["image_processor", "tokenizer"] image_processor_class = "mPLUGOwl3ImageProcessor" tokenizer_class = "AutoTokenizer" - def __init__( - self, - image_processor: mPLUGOwl3ImageProcessor = None, - tokenizer=None, - prompt_style="chatml", - inference_mode=True, - addition_eod="<|endoftext|>", - ): + def __init__(self, image_processor: mPLUGOwl3ImageProcessor = None, tokenizer=None, prompt_style='chatml', inference_mode=True, addition_eod="<|endoftext|>"): super().__init__(image_processor, tokenizer) self.image_processor: mPLUGOwl3ImageProcessor self.prompt_style = prompt_style @@ -103,26 +94,38 @@ def __init__( self.addition_eod = addition_eod def build_text_qwen(self, messages): - im_start, im_end = "<|im_start|>", "<|im_end|>" + # role should be within ['system', 'user', 'assistant'] + im_start, im_end = '<|im_start|>', '<|im_end|>' + text = [] for num_turn, message in enumerate(messages): - if num_turn == 0 and message["role"] != "system": - if self.prompt_style != "plain": - text.append({"text": f"{im_start}system\n{im_end}", "label": 0}) - if message["role"] == "system": - if self.prompt_style != "plain": - text.append({"text": f"{im_start}system\n{message['content']}{im_end}", "label": 0}) - elif message["role"] == "user": - if self.prompt_style != "plain": + if num_turn == 0 and message['role'] != 'system': + if self.prompt_style != 'plain': + text.append({ + "text": f"{im_start}system\n{im_end}", + "label": 0 + }) + if message['role'] == 'system': + if self.prompt_style != 'plain': + text.append({ + "text": f"{im_start}system\n{message['content']}{im_end}", + "label": 0 + }) + elif message['role'] == 'user': + if self.prompt_style != 'plain': content = f"\n{im_start}user\n{message['content']}{im_end}" else: - content = message["content"] - pattern = "|".join(map(re.escape, self.media_tokens)) - chunk_strs = re.split(f"({pattern})", content) + content = message['content'] + pattern = '|'.join(map(re.escape, self.media_tokens)) + chunk_strs = re.split(f'({pattern})', content) for chunk_str in chunk_strs: - text.append({"text": chunk_str, "label": 0}) - elif message["role"] == "assistant": - if self.prompt_style != "plain": + text.append({ + "text": chunk_str, + "label": 0 + }) + + elif message['role'] == 'assistant': + if self.prompt_style != 'plain': text.append({"text": f"\n{im_start}assistant\n", "label": 0}) text.append({"text": f"{message['content']}{im_end}", "label": 1}) else: @@ -131,22 +134,28 @@ def build_text_qwen(self, messages): else: raise NotImplementedError if self.inference_mode: - while text and text[-1]["label"] == 1: - text.pop() + while text and text[-1]['label']==1: # 只要列表非空且最后一个元素满足条件 + text.pop() # 就移除最后一个元素 return text def wrapped_tokenize(self, text): return self.tokenizer(text).input_ids def encode_text_sft(self, texts): + # output enc_chunk + enc_chunk = [] label_chunk = [] enc_length = 0 + num_images = 0 + media_helper = MediaIndicesHelper(tokenizer=self.tokenizer) for current_ti, text_chunk in enumerate(texts): + text = text_chunk["text"] label = text_chunk["label"] + if not media_helper.has_media(text): curr_chunk = self.wrapped_tokenize(text) if label == 1: @@ -154,111 +163,122 @@ def encode_text_sft(self, texts): enc_chunk += curr_chunk label_chunk += [label] * len(curr_chunk) else: + enc_length += len(curr_chunk) enc_chunk += curr_chunk label_chunk += [label] * len(curr_chunk) + # For media tokens else: - add_length = media_helper.add_media(enc_chunk, text=text, tokenize_fn=self.wrapped_tokenize) + + add_length = 
media_helper.add_media( + enc_chunk, + text=text, + tokenize_fn=self.wrapped_tokenize) enc_length += add_length label_chunk += [label] * add_length + # enc_chunk.extend([self.media_tokens[text]] * self.media_lengths[text]) + # enc_length += self.media_lengths[text] + # label_chunk += [label] * self.media_lengths[text] num_images += 1 - enc_chunk = paddle.to_tensor(data=enc_chunk).astype(dtype="int64") - media_offset = [] - media_before = 0 - for i, _ in enumerate([media_helper]): - mo = _.cal_media_offset(enc_chunk) - media_offset.append( - paddle.concat( - x=[ - (paddle.ones(shape=[tuple(mo.shape)[0], 1]) * media_before).astype(dtype="int64").to(mo.place), - (mo + media_before).unsqueeze(axis=1), - ], - axis=1, - ) - ) - media_before += _.len_images() - media_offset = paddle.stack(x=media_offset, axis=0) - return {"input_ids": enc_chunk.unsqueeze(axis=0), "media_offset": media_offset} + + enc_chunk = paddle.to_tensor(enc_chunk).astype(dtype="int64") + # media_offset = [] + # media_before = 0 + # for i,_ in enumerate([media_helper]): + # mo = _.cal_media_offset(enc_chunk) + # media_offset.append(torch.cat([(torch.ones(mo.shape[0],1)*media_before).long().to(mo.device), (mo+media_before).unsqueeze(1)], dim=1)) # L 2 + + # media_before += _.len_images() + # media_offset = torch.stack(media_offset, dim=0) + media_offset = [paddle.to_tensor([_[0] for _ in media_helper.media_position]).astype(dtype="int64")] + return { + 'input_ids': enc_chunk.unsqueeze(0), + 'media_offset': media_offset, + } + def __call__( self, messages, - images=None, - videos=None, + images = None, + videos = None, max_length: Optional[int] = None, cut_enable=True, - # return_tensors: Optional[Union[str, transformers.utils.TensorType]]=transformers.utils.TensorType.PYTORCH, **kwargs) ->mPLUGOwl3BatchFeature: return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, **kwargs ) -> mPLUGOwl3BatchFeature: medias = [] if videos is not None: - medias.extend([{"type": "video", "content": video, "use_video_span": True} for video in videos]) + medias.extend([{'type': 'video', 'content': video, 'use_video_span': True} for video in videos]) if images is not None: - medias.extend([{"type": "image", "content": image} for image in images]) + medias.extend([{'type':'image', 'content': image} for image in images]) + if len(medias): image_tensor_list = [] - pattern = "(<\\|image\\|>|<\\|video\\|>)" + pattern = r"(<\|image\|>|<\|video\|>)" + # 存在媒体 image_token_ptr = 0 media_layout = [] for message in messages: - text_list = re.split(pattern, message["content"]) - text = "" + text_list = re.split(pattern, message['content']) + text = '' for text_content in text_list: - if text_content in ["<|image|>", "<|video|>"]: + if text_content in ['<|image|>', '<|video|>']: media_item = medias[image_token_ptr] image_token_ptr += 1 - if text_content == "<|image|>": - assert media_item["type"] == "image" - image = media_item["content"] - image_inputs = self.image_processor( - [image], cut_enable=cut_enable, return_tensors=return_tensors - ) - if image_inputs.get("cut_shape", None) is not None: - cut_shape = image_inputs["cut_shape"] - cut_text = self.image_processor.cut_prompt_template( - img_token="<|image|>", h=cut_shape[0][0], w=cut_shape[0][1] - ) + if text_content == '<|image|>': + assert media_item['type'] == 'image' + image = media_item['content'] + + image_inputs = self.image_processor([image], cut_enable=cut_enable, return_tensors=return_tensors) + if image_inputs.get('cut_shape',None) is not None: + cut_shape = 
image_inputs['cut_shape'] + cut_text = self.image_processor.cut_prompt_template(img_token='<|image|>', h=cut_shape[0][0], w=cut_shape[0][1]) text += cut_text - image_tensor_list.append(image_inputs["pixel_values"]) + image_tensor_list.append(image_inputs['pixel_values']) else: text += text_content - elif text_content == "<|video|>": - assert media_item["type"] == "video" - video = media_item["content"] - use_video_span = media_item["use_video_span"] - image_tensor = self.image_processor(video, cut_enable=False)["pixel_values"] + image_tensor_list.append(image_inputs['pixel_values']) + elif text_content == '<|video|>': + assert media_item['type'] == 'video' + video = media_item['content'] + use_video_span = media_item['use_video_span'] + image_tensor = self.image_processor(video, cut_enable=False)['pixel_values'] image_tensor_list.append(image_tensor) - num_video_frame = tuple(image_tensor.shape)[0] + num_video_frame = image_tensor.shape[0] if use_video_span: - text_content = ( - "<|start_video_frame|>" + "<|image|>" * num_video_frame + "<|end_video_frame|>" - ) + text_content = '<|start_video_frame|>'+'<|image|>'*num_video_frame+'<|end_video_frame|>' else: - text_content = "<|image|>" * num_video_frame + text_content = '<|image|>'*num_video_frame text += text_content else: text += text_content - message["content"] = text - assert image_token_ptr == len(medias), (image_token_ptr, len(medias)) - assert all(len(tuple(_.shape)) == 4 for _ in image_tensor_list), [ - tuple(_.shape) for _ in image_tensor_list - ] - num_image_tokens = sum([_["content"].count("<|image|>") for _ in messages]) - num_image_shapes = sum([tuple(_.shape)[0] for _ in image_tensor_list]) - assert num_image_tokens == num_image_shapes, (messages, [tuple(_.shape) for _ in image_tensor_list]) - image_tensor_list = paddle.concat(x=image_tensor_list, axis=0) + message['content'] = text + assert image_token_ptr == len(medias), (image_token_ptr,len(medias)) # 保证图和token数目一致 + assert all(len(_.shape) == 4 for _ in image_tensor_list), [_.shape for _ in image_tensor_list] + num_image_tokens = sum([_['content'].count('<|image|>')for _ in messages]) + num_image_shapes = sum([_.shape[0] for _ in image_tensor_list]) + assert num_image_tokens == num_image_shapes, (messages, [_.shape for _ in image_tensor_list]) + + image_tensor_list = paddle.concat(image_tensor_list, axis=0) + text = self.build_text_qwen(messages) model_inputs = self.encode_text_sft(text) + if len(medias) is not None: - model_inputs.update({"pixel_values": image_tensor_list}) + model_inputs.update({'pixel_values': image_tensor_list}) + # if 'cut_shape' in model_inputs: + # model_inputs.pop('cut_shape') + # if 'cut_shape_indices' in model_inputs: + # model_inputs.pop('cut_shape_indices') return mPLUGOwl3BatchFeature(model_inputs) - + def check_media(self, images, messages): media_num = 0 if images is None else len(images) - media_count = sum([message["content"].count("<|image|>") for message in messages]) + media_count = sum([message['content'].count('<|image|>') for message in messages]) assert media_num == media_count + # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama def batch_decode(self, *args, **kwargs): """ This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. 
Please @@ -274,7 +294,9 @@ def batch_decode(self, *args, **kwargs): result = result[:-1] result_text.append(self.tokenizer.decode(result, *args[1:], **kwargs).strip()) return result_text - + # return self.tokenizer.batch_decode(*args, **kwargs) + + # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama def decode(self, *args, **kwargs): """ This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to @@ -284,15 +306,13 @@ def decode(self, *args, **kwargs): result = result[result != 0] if result[0] == self.tokenizer.bos_id: result = result[1:] - if ( - result[-1] == self.tokenizer.eos_id - or hasattr(self.tokenizer, "eot_id") - and result[-1] == self.tokenizer.eot_id - ): + if result[-1] == self.tokenizer.eos_id or (hasattr(self.tokenizer, "eot_id") and result[-1] == self.tokenizer.eot_id): result = result[:-1] return self.tokenizer.decode(result, *args[1:], **kwargs).strip() - def _convert(self, input_str, max_inp_length: Optional[int] = None): + def _convert( + self, input_str, max_inp_length: Optional[int] = None + ): if self.version > 2.5 or not getattr(self.tokenizer, "add_bos_token", False): input_ids = self.tokenizer.encode(input_str) else: @@ -300,28 +320,33 @@ def _convert(self, input_str, max_inp_length: Optional[int] = None): if max_inp_length is not None: input_ids = input_ids[:max_inp_length] input_ids = paddle.to_tensor(data=input_ids, dtype="int32") + start_cond = (input_ids == self.tokenizer.im_start_id) | (input_ids == self.tokenizer.slice_start_id) end_cond = (input_ids == self.tokenizer.im_end_id) | (input_ids == self.tokenizer.slice_end_id) - # >>>>>> image_start_tokens = torch.where(start_cond)[0] - image_start_tokens = paddle.nonzero(start_cond)[:, 0] + + image_start_tokens = paddle.where(start_cond)[0] ### or paddle.nonzero(start_cond)[:, 0] image_start_tokens += 1 - # >>>>>> image_end_tokens = torch.where(end_cond)[0] - image_end_tokens = paddle.nonzero(end_cond)[:, 0] + image_end_tokens = paddle.where(end_cond)[0] + valid_image_nums = max(len(image_start_tokens), len(image_end_tokens)) + image_bounds = paddle.hstack( - x=[ - image_start_tokens[:valid_image_nums].unsqueeze(axis=-1), - image_end_tokens[:valid_image_nums].unsqueeze(axis=-1), + [ + image_start_tokens[:valid_image_nums].unsqueeze(-1), + image_end_tokens[:valid_image_nums].unsqueeze(-1), ] ) return input_ids, image_bounds + @property + # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + def pad(self, inputs, max_length=None, padding_value=0, padding_side="left"): items = [] if isinstance(inputs[0], list): @@ -332,23 +357,29 @@ def pad(self, inputs, max_length=None, padding_value=0, padding_side="left"): else: assert isinstance(inputs[0], paddle.Tensor) items = inputs + batch_size = len(items) - shape = tuple(items[0].shape) + shape = items[0].shape dim = len(shape) assert dim <= 2 if max_length is None: max_length = 0 - max_length = max(max_length, max(tuple(item.shape)[-1] for item in items)) - min_length = min(tuple(item.shape)[-1] for item in items) + max_length = max(max_length, max(item.shape[-1] for item in items)) + min_length = min(item.shape[-1] for item in items) dtype = items[0].dtype + if dim == 0: - return paddle.stack(x=[item for item in 
items], axis=0), [0] + return paddle.stack([item for item in items], axis=0), [0] elif dim == 1: if max_length == min_length: - return paddle.stack(x=[item for item in items], axis=0), [0] * batch_size - tensor = paddle.zeros(shape=(batch_size, max_length), dtype=dtype) + padding_value + return paddle.stack([item for item in items], axis=0), [0] * batch_size + tensor = paddle.zeros((batch_size, max_length), dtype=dtype) + padding_value else: - tensor = paddle.zeros(shape=(batch_size, max_length, shape[-1]), dtype=dtype) + padding_value + tensor = ( + paddle.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + + padding_value + ) + padding_length = [] for i, item in enumerate(items): if dim == 1: @@ -361,5 +392,6 @@ def pad(self, inputs, max_length=None, padding_value=0, padding_side="left"): tensor[i, -len(item) :, :] = item.clone() else: tensor[i, : len(item), :] = item.clone() - padding_length.append(tuple(tensor.shape)[-1] - len(item)) + padding_length.append(tensor.shape[-1] - len(item)) + return tensor, padding_length From f32b71dc01244cd741575635cc49894f7e145508 Mon Sep 17 00:00:00 2001 From: "nemonameless@qq.com@github.com" Date: Sun, 8 Dec 2024 15:21:50 +0000 Subject: [PATCH 4/8] fix some bugs --- .../examples/mPLUG_Owl3/run_inference.py | 22 +- paddlemix/models/mPLUGOwl3/__init__.py | 2 - paddlemix/models/mPLUGOwl3/activations.py | 174 ------- paddlemix/models/mPLUGOwl3/bert_padding.py | 111 ----- .../models/mPLUGOwl3/modeling_hyper_qwen2.py | 436 ++++++++++-------- .../models/mPLUGOwl3/modeling_mplugowl3.py | 20 +- 6 files changed, 284 insertions(+), 481 deletions(-) delete mode 100644 paddlemix/models/mPLUGOwl3/activations.py delete mode 100644 paddlemix/models/mPLUGOwl3/bert_padding.py diff --git a/paddlemix/examples/mPLUG_Owl3/run_inference.py b/paddlemix/examples/mPLUG_Owl3/run_inference.py index fd0d741c0..0606c8bae 100644 --- a/paddlemix/examples/mPLUG_Owl3/run_inference.py +++ b/paddlemix/examples/mPLUG_Owl3/run_inference.py @@ -24,12 +24,13 @@ model_path = 'mPLUG-Owl3-7B-241101' config = mPLUGOwl3Config.from_pretrained(model_path) -print(config) +# print(config) model = mPLUGOwl3Model.from_pretrained(model_path, dtype=paddle.bfloat16).eval() tokenizer = Qwen2Tokenizer.from_pretrained(model_path) processor = model.init_processor(tokenizer) -image = Image.new('RGB', (500, 500), color='red') +#image = Image.new('RGB', (500, 500), color='red') +image = Image.open("paddlemix/demo_images/examples_image1.jpg").convert("RGB") messages = [ {"role": "user", "content": """<|image|>Describe this image."""}, @@ -37,6 +38,23 @@ ] inputs = processor(messages, images=[image], videos=None) +inputs['pixel_values'] = inputs['pixel_values'].cast(paddle.bfloat16) +# inputs['input_ids'] [1, 72] # torch [1, 74] +# inputs['input_ids'] = paddle.to_tensor([[151644, 8948, 198, 151645, 198, 151644, 872, 198, 27, +# 91, 2468, 41317, 91, 29, 17, 9, 18, 198, +# 27, 91, 1805, 91, 29, 220, 27, 91, 1805, +# 91, 29, 220, 27, 91, 1805, 91, 29, 198, +# 27, 91, 1805, 91, 29, 220, 27, 91, 1805, +# 91, 29, 220, 27, 91, 1805, 91, 29, 198, +# 27, 91, 1805, 91, 29, 27, 91, 408, 41317, +# 91, 29, 74785, 419, 2168, 13, 151645, 198, 151644, +# 77091, 198]]).astype(paddle.int64) +# inputs['media_offset'] [17, 23, 29, 35, 41, 47, 53] +# inputs['pixel_values'] [7, 3, 378, 378] sum 629145600 + +import numpy as np +inputs['pixel_values'] = paddle.to_tensor(np.load('pixel_values.npy')).cast(paddle.bfloat16) +inputs['media_offset'] = [paddle.to_tensor([18, 24, 30, 36, 42, 48, 54])] inputs.update({ 'tokenizer': 
tokenizer, diff --git a/paddlemix/models/mPLUGOwl3/__init__.py b/paddlemix/models/mPLUGOwl3/__init__.py index 4122a3ef1..93ec48f7a 100644 --- a/paddlemix/models/mPLUGOwl3/__init__.py +++ b/paddlemix/models/mPLUGOwl3/__init__.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -#from .bert_padding import * from .configuration_hyper_qwen2 import * from .configuration_mplugowl3 import * from .image_processing_mplugowl3 import * @@ -20,4 +19,3 @@ from .modeling_mplugowl3 import * from .modeling_navit_siglip import * from .processing_mplugowl3 import * -#from .x_sdpa import * diff --git a/paddlemix/models/mPLUGOwl3/activations.py b/paddlemix/models/mPLUGOwl3/activations.py deleted file mode 100644 index ab9be1167..000000000 --- a/paddlemix/models/mPLUGOwl3/activations.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from collections import OrderedDict - -import paddle -import paddle.nn.functional as F -from paddle import Tensor, nn - - -class NewGELUActivation(nn.Layer): - """ - Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see - the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 - """ - - def forward(self, input: Tensor) -> Tensor: - return ( - 0.5 * input * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * paddle.pow(input, 3.0)))) - ) - - -class GELUActivation(nn.Layer): - """ - Original Implementation of the GELU activation function in Google BERT repo when initially created. For - information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + - torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional - Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 - """ - - def __init__(self, use_gelu_python: bool = False): - super().__init__() - if use_gelu_python: - self.act = self._gelu_python - else: - self.act = nn.functional.gelu - - def _gelu_python(self, input: Tensor) -> Tensor: - return input * 0.5 * (1.0 + paddle.erf(input / math.sqrt(2.0))) - - def forward(self, input: Tensor) -> Tensor: - return self.act(input) - - -class FastGELUActivation(nn.Layer): - """ - Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs - """ - - def forward(self, input: Tensor) -> Tensor: - return 0.5 * input * (1.0 + paddle.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input))) - - -class QuickGELUActivation(nn.Layer): - """ - Applies GELU approximation that is fast but somewhat inaccurate. 
See: https://github.com/hendrycks/GELUs - """ - - def forward(self, input: Tensor) -> Tensor: - return input * F.sigmoid(1.702 * input) - - -class ClippedGELUActivation(nn.Layer): - """ - Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as - it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to - https://arxiv.org/abs/2004.09602. - - Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when - initially created. - - For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + - torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://arxiv.org/abs/1606.08415 - """ - - def __init__(self, min: float, max: float): - if min > max: - raise ValueError(f"min should be < max (got min: {min}, max: {max})") - - super().__init__() - self.min = min - self.max = max - - def forward(self, x: Tensor) -> Tensor: - return paddle.clip(gelu(x), self.min, self.max) - - -class SiLUActivation(nn.Layer): - """ - See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear - Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function - Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated - Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with - later. - """ - - def forward(self, input: Tensor) -> Tensor: - return F.silu(input) - - -class MishActivation(nn.Layer): - """ - See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also - visit the official repository for the paper: https://github.com/digantamisra98/Mish - """ - - def forward(self, input: Tensor) -> Tensor: - return F.mish(input) - - -class LinearActivation(nn.Layer): - """ - Applies the linear activation function, i.e. forwarding input directly to output. 
- """ - - def forward(self, input: Tensor) -> Tensor: - return input - - -class ClassInstantier(OrderedDict): - def __getitem__(self, key): - content = super().__getitem__(key) - cls, kwargs = content if isinstance(content, tuple) else (content, {}) - return cls(**kwargs) - - -ACT2CLS = { - "gelu": GELUActivation, - "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}), - "gelu_fast": FastGELUActivation, - "gelu_new": NewGELUActivation, - "gelu_python": (GELUActivation, {"use_gelu_python": True}), - "linear": LinearActivation, - "mish": MishActivation, - "quick_gelu": QuickGELUActivation, - "relu": nn.ReLU, - "relu6": nn.ReLU6, - "sigmoid": nn.Sigmoid, - "silu": SiLUActivation, - "swish": SiLUActivation, - "tanh": nn.Tanh, -} -ACT2FN = ClassInstantier(ACT2CLS) - - -def get_activation(activation_string): - if activation_string in ACT2FN: - return ACT2FN[activation_string] - else: - raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}") - - -# For backwards compatibility with: from activations import gelu_python -gelu_python = get_activation("gelu_python") -gelu_new = get_activation("gelu_new") -gelu = get_activation("gelu") -gelu_fast = get_activation("gelu_fast") -quick_gelu = get_activation("quick_gelu") -silu = get_activation("silu") -mish = get_activation("mish") -linear_act = get_activation("linear") diff --git a/paddlemix/models/mPLUGOwl3/bert_padding.py b/paddlemix/models/mPLUGOwl3/bert_padding.py deleted file mode 100644 index 017aa78bf..000000000 --- a/paddlemix/models/mPLUGOwl3/bert_padding.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# reference from Dao-AILAB flash-attn -# https://github.com/Dao-AILab/flash-attention/blob/74b0761ff7efc7b90d4e5aeb529c1b2a09a7458c/flash_attn/bert_padding.py#L38 -import operator -from functools import reduce - -import paddle -import paddle.nn.functional as F -from einops import rearrange, repeat - - -class IndexFirstAxis(paddle.autograd.PyLayer): - @staticmethod - def forward(ctx, input, indices): - ctx.save_for_backward(indices) - assert input.ndim >= 2 - ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:] - second_dim = reduce(operator.mul, other_shape, 1) - return paddle.take_along_axis( - arr=rearrange(input, "b ... -> b (...)"), axis=0, indices=repeat(indices, "z -> z d", d=second_dim) - ).reshape([-1, *other_shape]) - - @staticmethod - def backward(ctx, grad_output): - """Class Attribute: torch.autograd.function.FunctionCtx.saved_tensors, can not convert, please check whether it is torch.Tensor.*/torch.autograd.function.FunctionCtx.*/torch.distributions.Distribution.* and convert manually""" - (indices,) = ctx.saved_tensor() - assert grad_output.ndim >= 2 - other_shape = grad_output.shape[1:] - grad_output = rearrange(grad_output, "b ... 
-> b (...)") - grad_input = paddle.zeros(shape=[ctx.first_axis_dim, tuple(grad_output.shape)[1]], dtype=grad_output.dtype) - - grad_input.put_along_axis_( - axis=0, - indices=repeat(indices, "z -> z d", d=tuple(grad_output.shape)[1]), - values=grad_output, - ) - return grad_input.reshape([ctx.first_axis_dim, *other_shape]), None - - -index_first_axis = IndexFirstAxis.apply - - -class IndexPutFirstAxis(paddle.autograd.PyLayer): - @staticmethod - def forward(ctx, values, indices, first_axis_dim): - ctx.save_for_backward(indices) - assert indices.ndim == 1 - assert values.ndim >= 2 - output = paddle.zeros(shape=[first_axis_dim, *tuple(values.shape)[1:]], dtype=values.dtype) - output[indices] = values - return output - - @staticmethod - def backward(ctx, grad_output): - """Class Attribute: torch.autograd.function.FunctionCtx.saved_tensors, can not convert, please check whether it is torch.Tensor.*/torch.autograd.function.FunctionCtx.*/torch.distributions.Distribution.* and convert manually""" - (indices,) = ctx.saved_tensor() - grad_values = grad_output[indices] - return grad_values, None, None - - -index_put_first_axis = IndexPutFirstAxis.apply - - -def unpad_input(hidden_states, attention_mask): - """ - Arguments: - hidden_states: (batch, seqlen, ...) - attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid. - Return: - hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask. - indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence. - cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states. - max_seqlen_in_batch: int - """ - seqlens_in_batch = paddle.sum(attention_mask, axis=-1, dtype="int32") - indices = paddle.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = paddle.max(seqlens_in_batch).item() - cu_seqlens = F.pad(paddle.cumsum(seqlens_in_batch, axis=0), [1, 0]) - - return ( - index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices), - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -def pad_input(hidden_states, indices, batch, seqlen): - """ - Arguments: - hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask. - indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence. - batch: int, batch size for the padded sequence. - seqlen: int, maximum sequence length for the padded sequence. - Return: - hidden_states: (batch, seqlen, ...) - """ - output = index_put_first_axis(hidden_states, indices, batch * seqlen) - return rearrange(output, "(b s) ... 
-> b s ...", b=batch) diff --git a/paddlemix/models/mPLUGOwl3/modeling_hyper_qwen2.py b/paddlemix/models/mPLUGOwl3/modeling_hyper_qwen2.py index af91ffd48..ab6b37248 100644 --- a/paddlemix/models/mPLUGOwl3/modeling_hyper_qwen2.py +++ b/paddlemix/models/mPLUGOwl3/modeling_hyper_qwen2.py @@ -21,48 +21,54 @@ from einops import rearrange, repeat -# from paddlemix.models.flash_attn_utils import ( -# has_flash_attn_func, -# is_flash_attn_available, -# ) - -#from .activations import ACT2FN from ...activations import ACT2FN -from .bert_padding import index_first_axis, pad_input, unpad_input from .configuration_hyper_qwen2 import HyperQwen2Config -# if is_flash_attn_available(): -# flash_attn_func, flash_attn_varlen_func = has_flash_attn_func() -# _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) -# # >>>>>>if transformers.utils.is_flash_attn_2_available(): -# # pass -# # _flash_supports_window_size = 'window_size' in list(inspect.signature( -# # flash_attn_func).parameters) - -# from .x_sdpa import ScaleDotProductAttention - try: from einops import rearrange - use_flash_rotary = True print("use flash_attn rotary") except ImportError: use_flash_rotary = False print("import flash_attn rotary fail") -logger = paddle.utils.try_import("logging").getLogger(name=__name__) -# _CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" -# _CONFIG_FOR_DOC = "HyperQwen2Config" +from paddlemix.utils.log import logger -# def _get_unpad_data(attention_mask): -# seqlens_in_batch = attention_mask.sum(axis=-1, dtype="int32") -# paddle.utils.try_import("warnings").warn("Now, the return shape is inconsistent with torch when as_tuple is True") -# indices = paddle.nonzero(x=attention_mask.flatten(), as_tuple=False).flatten() -# max_seqlen_in_batch = seqlens_in_batch.max().item() -# cu_seqlens = paddle.nn.functional.pad( -# x=paddle.cumsum(x=seqlens_in_batch, axis=0, dtype="int32"), pad=(1, 0), pad_from_left_axis=False -# ) -# return indices, cu_seqlens, max_seqlen_in_batch +def is_casual_mask(attention_mask): + """ + Upper triangular of attention_mask equals to attention_mask is casual + """ + return (paddle.triu(attention_mask) == attention_mask).all().item() + + +def _make_causal_mask(input_ids_shape, past_key_values_length): + """ + Make causal mask used for self-attention + """ + batch_size, target_length = input_ids_shape # target_length: seq_len + + mask = paddle.tril(paddle.ones((target_length, target_length), dtype="bool")) + + if past_key_values_length > 0: + # [tgt_len, tgt_len + past_len] + mask = paddle.concat([paddle.ones([target_length, past_key_values_length], dtype="bool"), mask], axis=-1) + + # [bs, 1, tgt_len, tgt_len + past_len] + return mask[None, None, :, :].expand([batch_size, 1, target_length, target_length + past_key_values_length]) + + +def _expand_2d_mask(mask, dtype, tgt_length): + """ + Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`. 
+ """ + batch_size, src_length = mask.shape[0], mask.shape[-1] + tgt_length = tgt_length if tgt_length is not None else src_length + + mask = mask[:, None, None, :].astype("bool") + mask.stop_gradient = True + expanded_mask = mask.expand([batch_size, 1, tgt_length, src_length]) + + return expanded_mask class Qwen2RMSNorm(paddle.nn.Layer): @@ -235,15 +241,15 @@ def apply_rotary_pos_emb_core(t, freqs, use_fp32=False, debug=False): rotary positional embeding tensor freqs is of shape [seq_length, ..., dim] check https://kexue.fm/archives/8265 for detailed formulas """ - if use_flash_rotary and use_fp32: - t_ = rearrange(t, "s b ... -> b s ...") - if use_fp32: - t_ = t_.astype(dtype="float32") - freqs = freqs.squeeze(axis=1).squeeze(axis=1) - cos = freqs[:, :freqs.shape[-1] // 2].cos() - sin = freqs[:, :freqs.shape[-1] // 2].sin() - output = apply_rotary_emb_func(t_, cos, sin).astype(dtype=t.dtype) # TODO - return rearrange(output, 'b s ... -> s b ...') + # if use_flash_rotary and use_fp32: + # t_ = rearrange(t, "s b ... -> b s ...") + # if use_fp32: + # t_ = t_.astype(dtype="float32") + # freqs = freqs.squeeze(axis=1).squeeze(axis=1) + # cos = freqs[:, :freqs.shape[-1] // 2].cos() + # sin = freqs[:, :freqs.shape[-1] // 2].sin() + # output = apply_rotary_emb_func(t_, cos, sin).astype(dtype=t.dtype) # TODO + # return rearrange(output, 'b s ... -> s b ...') rot_dim = freqs.shape[-1] # ideally t_pass is empty so rotary pos embedding is applied to all tensor t @@ -337,22 +343,6 @@ def apply_mi_rope(self, key_layer, image_pos, length_each_img): return key_layer -# def hyper_mask_always_true(b, h, q_idx, kv_idx): -# return q_idx>=0 - -# def causal(b, h, q_idx, kv_idx): -# return q_idx >= kv_idx - - -# def create_hyper_attention(media_starts_extend, q_len, kv_len, each_visual_len): - -# visual_len = kv_len - q_len -# def hyper_mask_dynamic(b, h, q_idx, kv_idx): -# return torch.where(kv_idx=media_starts_extend[kv_idx], causal(b, h, q_idx, kv_idx-visual_len)) - -# return create_block_mask(hyper_mask_dynamic, B=None, H=None, Q_LEN=q_len, KV_LEN=kv_len, BLOCK_SIZE=128, _compile=True) - -# Copied from transformers.models.mistral.modeling_mistral.MistralSdpaAttention with Mistral->Qwen2 class HyperQwen2SdpaAttention(HyperQwen2Attention): """ Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. 
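The rotary helpers touched in this hunk (`apply_rotary_pos_emb_core`, `apply_mi_rope`) all apply the same rotation. A compact standalone sketch of the usual rotate-half formulation assumed by Qwen2-style models (illustrative sizes, not the exact code of this file):

```python
import paddle

def rotate_half(x):
    # Split the head dimension in two halves and rotate: (x1, x2) -> (-x2, x1).
    x1, x2 = paddle.chunk(x, 2, axis=-1)
    return paddle.concat([-x2, x1], axis=-1)

def apply_rope(q, k, cos, sin):
    # q, k: (batch, heads, seq, head_dim); cos, sin broadcast over batch and heads.
    return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin

seq, dim = 8, 16
inv_freq = 1.0 / paddle.pow(paddle.to_tensor(10000.0),
                            paddle.arange(0, dim, 2, dtype="float32") / dim)
freqs = paddle.arange(seq, dtype="float32").reshape([-1, 1]) * inv_freq.reshape([1, -1])
emb = paddle.concat([freqs, freqs], axis=-1)                  # (seq, dim)
cos = emb.cos().unsqueeze(0).unsqueeze(0)                     # (1, 1, seq, dim)
sin = emb.sin().unsqueeze(0).unsqueeze(0)
q = paddle.randn([1, 2, seq, dim])
k = paddle.randn([1, 2, seq, dim])
q_rot, k_rot = apply_rope(q, k, cos, sin)
print(q_rot.shape, k_rot.shape)                               # [1, 2, 8, 16] twice
```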
This module inherits from @@ -369,11 +359,17 @@ def hyperattention(self,hidden_states: paddle.Tensor, output_attentions: bool = False, use_cache: bool = False, )-> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: - bsz, q_len, _ = hidden_states.shape - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) + bsz, q_len, _ = hidden_states.shape # (1, 74, 28, 128) bsz, q_len, self.num_heads, self.head_dim + + try: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + except: + hidden_states = hidden_states.astype('bfloat16') + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) query_states = query_states.reshape([bsz, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) key_states = key_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) @@ -381,21 +377,29 @@ def hyperattention(self,hidden_states: paddle.Tensor, kv_seq_len = key_states.shape[-2] if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + #kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + kv_seq_len += past_key_value[0].shape[-2] cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + #cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + #key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + key_states = paddle.concat([past_key_value[0], key_states], axis=2) + value_states = paddle.concat([past_key_value[1], value_states], axis=2) + past_key_value = (key_states, value_states) if use_cache else None + # q k v [1, 28, 74, 128] [1, 4, 74, 128] [1, 4, 74, 128] key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) # add visual to kv length_each_img = image_embeds.shape[1] - image_embeds = self.v_kv_proj(image_embeds) + try: + image_embeds = self.v_kv_proj(image_embeds) + except: + image_embeds = self.v_kv_proj(image_embeds.astype('bfloat16')) image_start = 0 context_layer = [] for bi, media_starts in enumerate(media_offset): @@ -440,29 +444,31 @@ def hyperattention(self,hidden_states: paddle.Tensor, causal_mask = causal_mask.unsqueeze(0).unsqueeze(0) full_mask = causal_mask - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, # # Reference: https://github.com/pytorch/pytorch/issues/112577. 
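The per-sample loop above assembles an attention mask over the concatenation of visual and text keys; the commented-out contiguity note continues below. The exact construction is spread across this hunk, so as a rough, illustrative sketch of the idea only (boolean mask, hypothetical helper, simplified rule where a text query sees every image whose marker sits at or before it):

```python
import paddle

def sketch_hyper_mask(q_len, media_starts, tokens_per_image):
    """Keys are laid out as [all image tokens | text tokens] for one sample."""
    num_images = media_starts.shape[0]
    visual_len = num_images * tokens_per_image
    # text -> text: ordinary causal visibility
    text_part = paddle.tril(paddle.ones([q_len, q_len], dtype="bool"))
    # text -> visual: query position i may look at image j iff media_starts[j] <= i
    q_pos = paddle.arange(q_len).reshape([-1, 1])                 # (q_len, 1)
    img_visible = q_pos >= media_starts.reshape([1, -1])          # (q_len, num_images)
    visual_part = img_visible.unsqueeze(-1) \
        .expand([q_len, num_images, tokens_per_image]) \
        .reshape([q_len, visual_len])
    return paddle.concat([visual_part, text_part], axis=1)        # (q_len, visual_len + q_len)

mask = sketch_hyper_mask(q_len=6, media_starts=paddle.to_tensor([2]), tokens_per_image=4)
print(mask.shape)  # [6, 10]
```

The hunk itself additionally handles the single-token decode step and casts the final mask to the query dtype before calling `scaled_dot_product_attention`.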
# if curr_query_layer.device.type == "cuda" and full_mask is not None: # curr_query_layer = curr_query_layer.contiguous() # curr_key_layer = curr_key_layer.contiguous() # curr_value_layer = curr_value_layer.contiguous() - + + # full_mask.shape [1, 1, 72, 5175] # sum 196689 attn_output = paddle.nn.functional.scaled_dot_product_attention( - curr_query_layer, # (batch, ..., sequence, dim) - curr_key_layer, - curr_value_layer, - attn_mask=full_mask, # (N, ..., L, S) A boolean mask where a value of True indicates that the element *should* take part in attention. + curr_query_layer.transpose([0, 2, 1, 3]), # (batch, ..., sequence, dim) # [1, 72, 28, 128], torch [1, 28, 74, 128] sum 18304. + curr_key_layer.transpose([0, 2, 1, 3]), # [1, 5175, 28, 128], torch [1, 28, 5177, 128] sum 1044480 mean 0.05615234 torch sum 1036288. mean 0.0559 + curr_value_layer.transpose([0, 2, 1, 3]), # [1, 5175, 28, 128] , torch [1, 28, 5177, 128] sum -158720 + attn_mask=full_mask.cast(curr_query_layer.dtype), # (N, ..., L, S) A boolean mask where a value of True indicates that the element *should* take part in attention. dropout_p=self.attention_dropout if self.training else 0.0, # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. is_causal=is_causal, # enable_gqa=True, # gqa can not be used because mask requires XFORMERS and not support gqa ) # -> (N, ..., L, Ev) + # torch attn_output.shape [1, 28, 72, 128] + #attn_output = attn_output.transpose([0, 2, 1, 3]) assert attn_output.shape[0] == 1 context_layer.append(attn_output) attn_output = context_layer = paddle.concat(context_layer, axis=0) - attn_output = attn_output.transpose([0, 2, 1, 3]) + #attn_output = attn_output.transpose([0, 2, 1, 3]) attn_output = attn_output.reshape([bsz, q_len, self.hidden_size]) attn_output = self.o_proj(attn_output) @@ -481,28 +487,37 @@ def forward( output_attentions: bool = False, use_cache: bool = False, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - if self.is_hyper_enabled and image_embeds is not None: + ### TODO + # if output_attentions: + # # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + # logger.warning_once( + # "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + # 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ # ) + # return super().forward( + # hidden_states=hidden_states, + # attention_mask=attention_mask, + # position_ids=position_ids, + # past_key_value=past_key_value, + # output_attentions=output_attentions, + # use_cache=use_cache, + # ) + + if self.is_hyper_enabled and image_embeds is not None: # if 1: + # 必走这个分支 return self.hyperattention(hidden_states, attention_mask, position_ids, image_embeds, media_offset, past_key_value, output_attentions, use_cache) bsz, q_len, _ = hidden_states.shape - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) + try: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + except: + hidden_states = hidden_states.astype('bfloat16') + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) query_states = query_states.reshape([bsz, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) key_states = key_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) @@ -510,14 +525,19 @@ def forward( kv_seq_len = key_states.shape[-2] if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + #kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + kv_seq_len += past_key_value[0].shape[-2] cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + #cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + #key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + key_states = paddle.concat([past_key_value[0], key_states], axis=2) + value_states = paddle.concat([past_key_value[1], value_states], axis=2) + past_key_value = (key_states, value_states) if use_cache else None + key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) @@ -538,7 +558,7 @@ def forward( query_states, key_states, value_states, - attn_mask=attention_mask, + attn_mask=attention_mask.astype(query_states.dtype), dropout_p=self.attention_dropout if self.training else 0.0, # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
is_causal=self.is_causal and attention_mask is None and q_len > 1, @@ -616,13 +636,14 @@ def forward( else: image_embeds = media_offset = None media_kwargs = {} + # Self Attention hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, + hidden_states=hidden_states.cast(paddle.bfloat16), attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, - output_attentions=output_attentions, + output_attentions=True, # TODO, paddlenlp默认是False,但是不返回self_attn_weights。这里output_attentions全局是false use_cache=use_cache, **media_kwargs, ) @@ -631,7 +652,10 @@ def forward( # Fully Connected residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) + try: + hidden_states = self.mlp(hidden_states.cast(paddle.bfloat16)) + except: + hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states outputs = (hidden_states,) @@ -698,6 +722,34 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embed_tokens = value + @staticmethod + def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length, dtype): + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + if len(attention_mask.shape) == 2: + expanded_attn_mask = _expand_2d_mask(attention_mask, dtype, tgt_length=input_shape[-1]) + # For decoding phase in generation, seq_length = 1, we don't need to add causal mask + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + past_key_values_length=past_key_values_length, + ) + expanded_attn_mask = expanded_attn_mask & combined_attention_mask + # [bsz, seq_len, seq_len] -> [bsz, 1, seq_len, seq_len] + elif len(attention_mask.shape) == 3: + expanded_attn_mask = attention_mask.unsqueeze(1).astype("bool") + # if attention_mask is already 4-D, do nothing + else: + expanded_attn_mask = attention_mask + else: + expanded_attn_mask = _make_causal_mask( + input_shape, + past_key_values_length=past_key_values_length, + ) + # Convert bool attention_mask to float attention mask, which will be added to attention_scores later + expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype) + return expanded_attn_mask + def forward( self, input_ids: paddle.Tensor = None, @@ -730,64 +782,62 @@ def forward( else: raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - past_key_values_length = 0 - if use_cache: - use_legacy_cache = False #not isinstance(past_key_values, Cache) - #if use_legacy_cache: - # past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - #device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = paddle.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=paddle.int64 - ) - position_ids = position_ids.unsqueeze(0).reshape([-1, seq_length]) - else: - position_ids = position_ids.reshape([-1, seq_length]).astype(dtype="int64") + # if use_cache: + # use_legacy_cache = False #not isinstance(past_key_values, Cache) + # #if use_legacy_cache: + # # past_key_values = DynamicCache.from_legacy_cache(past_key_values) + # past_key_values_length = past_key_values.get_usable_length(seq_length) + + if past_key_values is None: + past_key_values = tuple([None] * len(self.layers)) + # NOTE: to make cache can be clear in-time + past_key_values = list(past_key_values) + + seq_length_with_past = seq_length + cache_length = 0 + if past_key_values[0] is not None: + cache_length = past_key_values[0][0].shape[2] + seq_length_with_past += cache_length + + # if position_ids is None: + # position_ids = paddle.arange( + # past_key_values_length, seq_length + past_key_values_length, dtype=paddle.int64 + # ) + # position_ids = position_ids.unsqueeze(0).reshape([-1, seq_length]) + # else: + # position_ids = position_ids.reshape([-1, seq_length]).astype(dtype="int64") + # if position_ids is None: + # position_ids = paddle.arange( + # past_key_values_length, seq_length + past_key_values_length, dtype=paddle.int64 + # ) + # position_ids = position_ids.unsqueeze(0) + if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) + attention_mask = None + # # embed positions + # import pdb; pdb.set_trace() + # if attention_mask is None: + # # [bs, seq_len] + # attention_mask = paddle.ones((batch_size, seq_length_with_past), dtype=paddle.bool) + + # attention_mask = self._prepare_decoder_attention_mask( + # attention_mask, (batch_size, seq_length), cache_length, inputs_embeds.dtype + # ) # [bs, 1, seq_len, seq_len] + + # if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + # is_padding_right = attention_mask[:, -1].sum().item() != batch_size + # if is_padding_right: + # raise ValueError( + # "You are attempting to perform batched generation with padding_side='right'" + # " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " + # " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + # ) hidden_states = inputs_embeds @@ -802,27 +852,36 @@ def forward( # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None - next_decoder_cache = None + next_decoder_cache = () ### not none + + # if attention_mask is not None: + # print('attention_mask', attention_mask.shape, attention_mask.sum().item()) - for decoder_layer in self.layers: + for idx, decoder_layer in enumerate(self.layers): if output_hidden_states: all_hidden_states += (hidden_states,) + past_key_value = past_key_values[idx] if past_key_values is not None else None + layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, position_ids=position_ids, image_embeds=image_embeds, media_offset=media_offset, - past_key_value=past_key_values, + past_key_value=past_key_value, # not past_key_values output_attentions=output_attentions, use_cache=use_cache, ) + # NOTE: clear outdate cache after it has been used for memory saving + past_key_value = past_key_values[idx] = None + hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] + # if use_cache: + # next_decoder_cache = layer_outputs[2 if output_attentions else 1] + next_decoder_cache = next_decoder_cache + (layer_outputs[-1],) if use_cache else None if output_attentions: all_self_attns += (layer_outputs[1],) @@ -833,9 +892,10 @@ def forward( if output_hidden_states: all_hidden_states += (hidden_states,) - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + # next_cache = None + # if use_cache: + # next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = next_decoder_cache if use_cache else None if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) @@ -940,8 +1000,11 @@ def forward( ) hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.astype(dtype="float32") + try: + logits = self.lm_head(hidden_states) + except: + logits = self.lm_head(hidden_states.cast(paddle.bfloat16)) + logits = logits.cast(paddle.float32) 
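# The rewritten forward above threads a legacy tuple cache instead of a Cache object: layer i
# reads past_key_values[i], appends its new key/value along the cached sequence axis, and the
# model re-collects every returned pair into next_decoder_cache while nulling the stale entry.
# A rough sketch of that bookkeeping (the layer body, layout and sizes are stand-ins only):
import paddle

def toy_layer(hidden, past_kv):
    # stand-in projection; the cache layout is assumed to be [batch, heads, seq, head_dim] here
    kv = paddle.randn([hidden.shape[0], 4, hidden.shape[1], 16])
    if past_kv is not None:
        kv = paddle.concat([past_kv[0], kv], axis=2)     # grow along the sequence axis
    return hidden, (kv, kv)

num_layers = 2
past_key_values = [None] * num_layers
next_decoder_cache = ()
hidden = paddle.randn([1, 5, 32])                        # prompt step, seq_len = 5
for idx in range(num_layers):
    hidden, present = toy_layer(hidden, past_key_values[idx])
    past_key_values[idx] = None                          # drop the outdated entry early
    next_decoder_cache = next_decoder_cache + (present,)

print(next_decoder_cache[0][0].shape)                    # cached keys, e.g. [1, 4, 5, 16]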
loss = None if labels is not None: @@ -970,35 +1033,42 @@ def forward( def prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs ): + batch_size, seq_length = input_ids.shape + attention_mask = paddle.ones((batch_size, seq_length), dtype=paddle.bool) + # Omit tokens covered by past_key_values if past_key_values is not None: - if isinstance(past_key_values, MultiHeadAttention.Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: + past_length = past_key_values[0][0].shape[2] + if past_length < input_ids.shape[1]: input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] + + # if isinstance(past_key_values, MultiHeadAttention.Cache): + # cache_length = past_key_values.get_seq_length() + # past_length = past_key_values.seen_tokens + # max_cache_length = past_key_values.get_max_length() + # else: + # cache_length = past_length = past_key_values[0][0].shape[2] + # max_cache_length = None + + # # Keep only the unprocessed tokens: + # # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # # input) + # if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + # input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # # input_ids based on the past_length. + # elif past_length < input_ids.shape[1]: + # input_ids = input_ids[:, past_length:] + # # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
+ # if ( + # max_cache_length is not None + # and attention_mask is not None + # and cache_length + input_ids.shape[1] > max_cache_length + # ): + # attention_mask = attention_mask[:, -max_cache_length:] position_ids = kwargs.get("position_ids", None) if attention_mask is not None and position_ids is None: diff --git a/paddlemix/models/mPLUGOwl3/modeling_mplugowl3.py b/paddlemix/models/mPLUGOwl3/modeling_mplugowl3.py index 5261cf4b0..d6f5fe729 100644 --- a/paddlemix/models/mPLUGOwl3/modeling_mplugowl3.py +++ b/paddlemix/models/mPLUGOwl3/modeling_mplugowl3.py @@ -12,18 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json -import math -from copy import deepcopy from threading import Thread -from typing import List, Optional import paddle import paddle.nn as nn import paddlenlp from paddlenlp.generation import TextIteratorStreamer -from paddlenlp.transformers import Qwen2ForCausalLM, Qwen2PretrainedModel -from PIL import Image +from paddlenlp.transformers import Qwen2PretrainedModel from .configuration_mplugowl3 import mPLUGOwl3Config from .image_processing_mplugowl3 import mPLUGOwl3ImageProcessor @@ -117,17 +112,24 @@ def forward(self, pixel_values=None, **kwargs): def _decode(self, input_ids, image_embeds, media_offset, tokenizer, attention_mask, decode_text=False, **kwargs): terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators] + + # ### must add position_ids, paddlenlp bug + # batch_size, seq_length = attention_mask.shape + # position_ids = paddle.arange(seq_length).expand((batch_size, seq_length)) + # ### + output = self.language_model.generate( input_ids=input_ids, image_embeds=image_embeds, media_offset=media_offset, pad_token_id=0, eos_token_id=terminators, + #position_ids=position_ids, #### attention_mask=attention_mask, **kwargs, )[0] - output = output[:,input_ids.shape[1]:] - print('output', output) + #output = output[:,input_ids.shape[1]:] # paddle no need this + print('_decode output', output) if decode_text: return self._decode_text(output, tokenizer) return output @@ -238,7 +240,7 @@ def chat( generation_config.update( (k, kwargs[k]) for k in generation_config.keys() & kwargs.keys() ) - with paddle.inference_mode(): + with paddle.no_grad(): res = self.generate( **inputs, stream=stream, From ac9e90728b2a08ac9dec9d0d7601a98f8e0a0c9b Mon Sep 17 00:00:00 2001 From: "nemonameless@qq.com@github.com" Date: Thu, 12 Dec 2024 08:30:40 +0000 Subject: [PATCH 5/8] fix seq_len bug --- .../models/mPLUGOwl3/modeling_hyper_qwen2.py | 95 +++++++++++-------- 1 file changed, 58 insertions(+), 37 deletions(-) diff --git a/paddlemix/models/mPLUGOwl3/modeling_hyper_qwen2.py b/paddlemix/models/mPLUGOwl3/modeling_hyper_qwen2.py index ab6b37248..e35a41bd7 100644 --- a/paddlemix/models/mPLUGOwl3/modeling_hyper_qwen2.py +++ b/paddlemix/models/mPLUGOwl3/modeling_hyper_qwen2.py @@ -21,6 +21,13 @@ from einops import rearrange, repeat +# from paddlemix.models.flash_attn_utils import ( +# has_flash_attn_func, +# is_flash_attn_available, +# ) +#from paddle.nn.functional.flash_attention import flash_attention as flash_attn_func +#from paddle.nn.functional.flash_attention import flash_attn_unpadded as flash_attn_varlen_func + from ...activations import ACT2FN from .configuration_hyper_qwen2 import HyperQwen2Config @@ -34,6 +41,17 @@ from paddlemix.utils.log import logger +# def _get_unpad_data(attention_mask): +# seqlens_in_batch = attention_mask.sum(axis=-1, dtype="int32") +# 
paddle.utils.try_import("warnings").warn("Now, the return shape is inconsistent with torch when as_tuple is True") +# indices = paddle.nonzero(x=attention_mask.flatten(), as_tuple=False).flatten() +# max_seqlen_in_batch = seqlens_in_batch.max().item() +# cu_seqlens = paddle.nn.functional.pad( +# x=paddle.cumsum(x=seqlens_in_batch, axis=0, dtype="int32"), pad=(1, 0), pad_from_left_axis=False +# ) +# return indices, cu_seqlens, max_seqlen_in_batch + + def is_casual_mask(attention_mask): """ Upper triangular of attention_mask equals to attention_mask is casual @@ -382,6 +400,8 @@ def hyperattention(self,hidden_states: paddle.Tensor, cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + #print('query_states, key_states', query_states.shape, key_states.shape) + # [1, 28, 1, 128] [1, 4, 1, 128] if past_key_value is not None: #cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models @@ -389,17 +409,29 @@ def hyperattention(self,hidden_states: paddle.Tensor, key_states = paddle.concat([past_key_value[0], key_states], axis=2) value_states = paddle.concat([past_key_value[1], value_states], axis=2) past_key_value = (key_states, value_states) if use_cache else None + #print('query_states key_states, value_states', query_states.sum().item(), key_states.sum().item(), value_states.sum().item()) + #print('query_states key_states, value_states', query_states.shape, key_states.shape, value_states.shape) # q k v [1, 28, 74, 128] [1, 4, 74, 128] [1, 4, 74, 128] + # q k v [1, 28, 1, 128] [1, 4, 75, 128] [1, 4, 75, 128] + + # query_states key_states, value_states 18304.0 -792.0 -253.0 + # query_states key_states, value_states 24832.0 123.5 -198.0 + # query_states key_states, value_states -16896.0 552.0 -692.0 + # query_states key_states, value_states -120.0 1200.0 -141.0 key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) # add visual to kv length_each_img = image_embeds.shape[1] + # [7, 729, 3584] sum 78848. + #import pdb; pdb.set_trace() try: image_embeds = self.v_kv_proj(image_embeds) except: image_embeds = self.v_kv_proj(image_embeds.astype('bfloat16')) + #import pdb; pdb.set_trace() + # [7, 729, 1024] sum 184320. image_start = 0 context_layer = [] for bi, media_starts in enumerate(media_offset): @@ -451,6 +483,7 @@ def hyperattention(self,hidden_states: paddle.Tensor, # curr_key_layer = curr_key_layer.contiguous() # curr_value_layer = curr_value_layer.contiguous() + # full_mask.shape [1, 1, 72, 5175] # sum 196689 attn_output = paddle.nn.functional.scaled_dot_product_attention( curr_query_layer.transpose([0, 2, 1, 3]), # (batch, ..., sequence, dim) # [1, 72, 28, 128], torch [1, 28, 74, 128] sum 18304. 
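# The transposes introduced above adapt the torch-style [batch, heads, seq, dim] tensors to the
# [batch, seq, heads, dim] layout that paddle.nn.functional.scaled_dot_product_attention expects.
# A tiny CPU-friendly check of why the two layouts give the same attention, written with an
# explicit matmul/softmax instead of the fused kernel (sizes are illustrative only):
import paddle
import paddle.nn.functional as F

b, h, sq, skv, d = 1, 4, 3, 5, 8
q = paddle.randn([b, h, sq, d])                          # torch-style layout
k = paddle.randn([b, h, skv, d])
v = paddle.randn([b, h, skv, d])

scores = paddle.matmul(q, k, transpose_y=True) / d ** 0.5
ref = paddle.matmul(F.softmax(scores, axis=-1), v)       # [b, h, sq, d]

q2, k2, v2 = (t.transpose([0, 2, 1, 3]) for t in (q, k, v))   # -> [b, seq, heads, dim]
scores2 = paddle.einsum("bqhd,bkhd->bhqk", q2, k2) / d ** 0.5
out2 = paddle.einsum("bhqk,bkhd->bqhd", F.softmax(scores2, axis=-1), v2)

print(paddle.allclose(ref, out2.transpose([0, 2, 1, 3]), atol=1e-5).item())   # True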
@@ -463,12 +496,15 @@ def hyperattention(self,hidden_states: paddle.Tensor, # enable_gqa=True, # gqa can not be used because mask requires XFORMERS and not support gqa ) # -> (N, ..., L, Ev) # torch attn_output.shape [1, 28, 72, 128] - #attn_output = attn_output.transpose([0, 2, 1, 3]) + attn_output = attn_output.transpose([0, 2, 1, 3]) + #import pdb; pdb.set_trace() + assert attn_output.shape[0] == 1 context_layer.append(attn_output) attn_output = context_layer = paddle.concat(context_layer, axis=0) - #attn_output = attn_output.transpose([0, 2, 1, 3]) + attn_output = attn_output.transpose([0, 2, 1, 3]) + #print('attn_output', attn_output.shape) # [1, 74, 28, 128] [1, 1, 28, 128] attn_output = attn_output.reshape([bsz, q_len, self.hidden_size]) attn_output = self.o_proj(attn_output) @@ -795,49 +831,29 @@ def forward( # NOTE: to make cache can be clear in-time past_key_values = list(past_key_values) - seq_length_with_past = seq_length + past_key_values_length = seq_length cache_length = 0 if past_key_values[0] is not None: - cache_length = past_key_values[0][0].shape[2] - seq_length_with_past += cache_length + cache_length = past_key_values[0][0].shape[1] # + past_key_values_length += cache_length - # if position_ids is None: - # position_ids = paddle.arange( - # past_key_values_length, seq_length + past_key_values_length, dtype=paddle.int64 - # ) - # position_ids = position_ids.unsqueeze(0).reshape([-1, seq_length]) - # else: - # position_ids = position_ids.reshape([-1, seq_length]).astype(dtype="int64") - # if position_ids is None: - # position_ids = paddle.arange( - # past_key_values_length, seq_length + past_key_values_length, dtype=paddle.int64 - # ) - # position_ids = position_ids.unsqueeze(0) + # print('position_ids before', position_ids) + if position_ids is None: + position_ids = paddle.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=paddle.int64 + ) + position_ids = position_ids.unsqueeze(0).reshape([-1, seq_length]) + else: + position_ids = position_ids.reshape([-1, seq_length]).astype(dtype="int64") + # print('position_ids', position_ids) + # print('seq_length', seq_length) + # print('past_key_values_length', past_key_values_length) if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) - attention_mask = None - # # embed positions - # import pdb; pdb.set_trace() - # if attention_mask is None: - # # [bs, seq_len] - # attention_mask = paddle.ones((batch_size, seq_length_with_past), dtype=paddle.bool) - - # attention_mask = self._prepare_decoder_attention_mask( - # attention_mask, (batch_size, seq_length), cache_length, inputs_embeds.dtype - # ) # [bs, 1, seq_len, seq_len] - - # if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - # is_padding_right = attention_mask[:, -1].sum().item() != batch_size - # if is_padding_right: - # raise ValueError( - # "You are attempting to perform batched generation with padding_side='right'" - # " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " - # " call `tokenizer.padding_side = 'left'` before tokenizing the input. 
" - # ) hidden_states = inputs_embeds @@ -1038,9 +1054,12 @@ def prepare_inputs_for_generation( # Omit tokens covered by past_key_values if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] + past_length = past_key_values[0][0].shape[1] # [1, 74, 4, 128] seq_len 74 + #print('input_ids before omitting', input_ids) + #import pdb; pdb.set_trace() if past_length < input_ids.shape[1]: input_ids = input_ids[:, past_length:] + #print('input_ids', input_ids.shape, input_ids.sum().item()) # if isinstance(past_key_values, MultiHeadAttention.Cache): # cache_length = past_key_values.get_seq_length() @@ -1070,6 +1089,7 @@ def prepare_inputs_for_generation( # ): # attention_mask = attention_mask[:, -max_cache_length:] + #print('attention_mask ////', attention_mask.shape, attention_mask.sum().item()) position_ids = kwargs.get("position_ids", None) if attention_mask is not None and position_ids is None: # create position_ids on the fly for batch generation @@ -1078,6 +1098,7 @@ def prepare_inputs_for_generation( if past_key_values: position_ids = position_ids[:, -input_ids.shape[1] :] + #print('position_ids ////', position_ids) # if `inputs_embeds` are passed, we only want to use them in the 1st generation step if inputs_embeds is not None and past_key_values is None: model_inputs = {"inputs_embeds": inputs_embeds} From f56d4f00df41a9995fef0b82c93203e6c58ca9b6 Mon Sep 17 00:00:00 2001 From: WAYKEN-TSE <760301162@qq.com> Date: Tue, 17 Dec 2024 10:35:43 +0800 Subject: [PATCH 6/8] fix parameter [pixel_values] --- .../mPLUGOwl3/image_processing_mplugowl3.py | 39 +++++++++++-------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/paddlemix/models/mPLUGOwl3/image_processing_mplugowl3.py b/paddlemix/models/mPLUGOwl3/image_processing_mplugowl3.py index 418e8d6f9..4574aaff6 100644 --- a/paddlemix/models/mPLUGOwl3/image_processing_mplugowl3.py +++ b/paddlemix/models/mPLUGOwl3/image_processing_mplugowl3.py @@ -36,6 +36,7 @@ BatchFeature, ) from PIL import Image +from paddle.vision.transforms import Resize def recursive_converter(converter, value): @@ -236,13 +237,15 @@ def __init__(self, image_size, anchors, interpolation="bilinear", antialias=None assert self.anchor_strategy in available_anchor_strategy def resize_global(self, img): + transform = Resize(size=self.image_size,interpolation=self.interpolation) + return transform(img) #return F.resize(img, self.image_size, self.interpolation, max_size=None, antialias=self.antialias) - image_np = np.array(img) - image_tensor = paddle.to_tensor(image_np, dtype="float32") - image_tensor = image_tensor.transpose([2, 0, 1]) # 变成 (3, 500, 500) - if self.interpolation == "bilinear" or "bicubic": - image_tensor = image_tensor.unsqueeze(0) # 变成 (1, 3, 500, 500) - return F.interpolate(image_tensor, size=self.image_size, mode=self.interpolation, align_corners=False)[0] + # image_np = np.array(img) + # image_tensor = paddle.to_tensor(image_np, dtype="float32") + # image_tensor = image_tensor.transpose([2, 0, 1]) # 变成 (3, 500, 500) + # if self.interpolation == "bilinear" or "bicubic": + # image_tensor = image_tensor.unsqueeze(0) # 变成 (1, 3, 500, 500) + # return F.interpolate(image_tensor, size=self.image_size, mode=self.interpolation, align_corners=False)[0] def forward(self, img, skip_resize=False): """ @@ -272,17 +275,19 @@ def forward(self, img, skip_resize=False): # for debug return selected_anchor #return F.resize(img, [target_size[1],target_size[0]], self.interpolation, max_size=None, antialias=self.antialias), selected_anchor - 
image_np = np.array(img) - image_tensor = paddle.to_tensor(image_np, dtype="float32") - image_tensor = image_tensor.transpose([2, 0, 1]) # 变成 (3, 500, 500) - if self.interpolation == "bilinear" or "bicubic": - image_tensor = image_tensor.unsqueeze(0) # 变成 (1, 3, 500, 500) - return ( - F.interpolate( - image_tensor, size=[target_size[1], target_size[0]], mode=self.interpolation, align_corners=False - )[0], - selected_anchor, - ) + # image_np = np.array(img) + # image_tensor = paddle.to_tensor(image_np, dtype="float32") + # image_tensor = image_tensor.transpose([2, 0, 1]) # 变成 (3, 500, 500) + # if self.interpolation == "bilinear" or "bicubic": + # image_tensor = image_tensor.unsqueeze(0) # 变成 (1, 3, 500, 500) + transform = Resize(size=[target_size[1],target_size[0]],interpolation=self.interpolation) + return (transform(img),selected_anchor) + # return ( + # F.interpolate( + # image_tensor, size=[target_size[1], target_size[0]], mode=self.interpolation, align_corners=False + # )[0], + # selected_anchor, + # ) def __repr__(self) -> str: detail = f"(size={self.image_size}, anchor={self.anchors}, interpolation={self.interpolation.value}, antialias={self.antialias})" From 754c0d6bc6f412c7db30499d2cde3e7cb70d4168 Mon Sep 17 00:00:00 2001 From: "nemonameless@qq.com@github.com" Date: Tue, 17 Dec 2024 14:08:49 +0000 Subject: [PATCH 7/8] fix infer codes, refine format --- .pre-commit-config.yaml | 45 + 0.sh | 1 - build_env.sh | 4 +- paddlemix/examples/mPLUG_Owl3/README.md | 10 +- .../examples/mPLUG_Owl3/run_inference.py | 54 +- .../mPLUG_Owl3/run_inference_video.py | 77 -- paddlemix/models/mPLUGOwl3/__init__.py | 2 - .../mPLUGOwl3/configuration_mplugowl3.py | 14 +- .../mPLUGOwl3/image_processing_mplugowl3.py | 494 ----------- .../models/mPLUGOwl3/modeling_hyper_qwen2.py | 356 ++++---- .../models/mPLUGOwl3/modeling_mplugowl3.py | 129 +-- .../models/mPLUGOwl3/modeling_navit_siglip.py | 136 +-- .../models/mPLUGOwl3/processing_mplugowl3.py | 397 --------- paddlemix/processors/__init__.py | 9 +- paddlemix/processors/mplugowl3_processing.py | 824 ++++++++++++++++++ 15 files changed, 1123 insertions(+), 1429 deletions(-) create mode 100644 .pre-commit-config.yaml delete mode 100644 0.sh delete mode 100644 paddlemix/examples/mPLUG_Owl3/run_inference_video.py delete mode 100644 paddlemix/models/mPLUGOwl3/image_processing_mplugowl3.py delete mode 100644 paddlemix/models/mPLUGOwl3/processing_mplugowl3.py create mode 100644 paddlemix/processors/mplugowl3_processing.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..b358ac0fc --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,45 @@ +repos: +# For Python files +- repo: https://github.com/psf/black.git + rev: 22.8.0 + hooks: + - id: black + files: \.(py|pyi)$ + additional_dependencies: [toml] +- repo: https://github.com/PyCQA/isort + rev: 5.11.5 + hooks: + - id: isort +- repo: https://github.com/PyCQA/flake8 + rev: 4.0.1 + hooks: + - id: flake8 +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.1.0 + hooks: + - id: check-merge-conflict + - id: check-symlinks + - id: detect-private-key + files: (?!.*paddle)^.*$ + - id: end-of-file-fixer + files: \.md$ + - id: trailing-whitespace + files: \.md$ +- repo: https://github.com/Lucas-C/pre-commit-hooks + rev: v1.1.14 + hooks: + - id: forbid-crlf + files: \.md$ + - id: remove-crlf + files: \.md$ + - id: forbid-tabs + files: \.md$ + - id: remove-tabs + files: \.md$ +- repo: local + hooks: + - id: copyright_checker + name: copyright_checker + entry: python 
.copyright.hook + language: system + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps|py|sh)$ diff --git a/0.sh b/0.sh deleted file mode 100644 index 1ce640685..000000000 --- a/0.sh +++ /dev/null @@ -1 +0,0 @@ -CUDA_VISIBLE_DEVICES=7 python paddlemix/examples/mPLUG_Owl3/run_inference.py diff --git a/build_env.sh b/build_env.sh index e87c3e5f4..41fbb66ff 100644 --- a/build_env.sh +++ b/build_env.sh @@ -21,12 +21,12 @@ echo "开始安装 PaddleMIX 及其依赖..." # 安装 PaddleMIX echo "安装 PaddleMIX..." -pip install -e . -i https://mirrors.aliyun.com/pypi/simple/ +pip install -e . # 安装 ppdiffusers echo "安装 ppdiffusers..." cd ppdiffusers -pip install -e . -i https://mirrors.aliyun.com/pypi/simple/ +pip install -e . cd .. #注:ppdiffusers部分模型需要依赖 CUDA 11.2 及以上版本,如果本地机器不符合要求,建议前往 [AI Studio](https://aistudio.baidu.com/index) 进行模型训练、推理任务。 #如果希望使用**bf16**训练推理,请使用支持**bf16**的GPU,如A100。 diff --git a/paddlemix/examples/mPLUG_Owl3/README.md b/paddlemix/examples/mPLUG_Owl3/README.md index 13ed53c03..baababe99 100644 --- a/paddlemix/examples/mPLUG_Owl3/README.md +++ b/paddlemix/examples/mPLUG_Owl3/README.md @@ -6,8 +6,6 @@ | Model | |--------------------| - | mPLUG/mPLUG-Owl3-7B-241101 | 注意:与huggingface权重同名,但权重为paddle框架的Tensor,使用`xxx.from_pretrained("mPLUG/mPLUG-Owl3-7B-241101")`即可自动下载该权重文件夹到缓存目录。 @@ -26,15 +24,9 @@ ### 推理 ```bash # 图片理解 -python paddlemix/examples/mPLUG_Owl3/run_inference.py \ - -# 视频理解 -python paddlemix/examples/mPLUG_Owl3/run_inference_video.py \ +CUDA_VISIBLE_DEVICES=0 python paddlemix/examples/mPLUG_Owl3/run_inference.py \ ``` -### 效果展示 - - ### 参考文献 ```BibTeX diff --git a/paddlemix/examples/mPLUG_Owl3/run_inference.py b/paddlemix/examples/mPLUG_Owl3/run_inference.py index 0606c8bae..8db4d537d 100644 --- a/paddlemix/examples/mPLUG_Owl3/run_inference.py +++ b/paddlemix/examples/mPLUG_Owl3/run_inference.py @@ -12,55 +12,35 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from PIL import Image import paddle from paddlenlp.transformers import Qwen2Tokenizer +from PIL import Image + from paddlemix.models.mPLUGOwl3.configuration_mplugowl3 import mPLUGOwl3Config from paddlemix.models.mPLUGOwl3.modeling_mplugowl3 import mPLUGOwl3Model -# from paddlemix.models.mPLUGOwl3.processing_mplugowl3 import mPLUGOwl3Processor -# from paddlemix.models.mPLUGOwl3.image_processing_mplugowl3 import mPLUGOwl3ImageProcessor -#model_path = 'mPLUG/mPLUG-Owl3-7B-241101' -model_path = 'mPLUG-Owl3-7B-241101' +model_path = "mPLUG/mPLUG-Owl3-7B-241101" config = mPLUGOwl3Config.from_pretrained(model_path) -# print(config) model = mPLUGOwl3Model.from_pretrained(model_path, dtype=paddle.bfloat16).eval() tokenizer = Qwen2Tokenizer.from_pretrained(model_path) processor = model.init_processor(tokenizer) -#image = Image.new('RGB', (500, 500), color='red') +# image = Image.new('RGB', (500, 500), color='red') image = Image.open("paddlemix/demo_images/examples_image1.jpg").convert("RGB") -messages = [ - {"role": "user", "content": """<|image|>Describe this image."""}, - {"role": "assistant", "content": ""} -] +messages = [{"role": "user", "content": """<|image|>Describe this image."""}, {"role": "assistant", "content": ""}] inputs = processor(messages, images=[image], videos=None) -inputs['pixel_values'] = inputs['pixel_values'].cast(paddle.bfloat16) -# inputs['input_ids'] [1, 72] # torch [1, 74] -# inputs['input_ids'] = paddle.to_tensor([[151644, 8948, 198, 151645, 198, 151644, 872, 198, 27, -# 91, 2468, 41317, 91, 29, 17, 9, 18, 198, -# 27, 91, 1805, 91, 29, 220, 27, 91, 1805, -# 91, 29, 220, 27, 91, 1805, 91, 29, 198, -# 27, 91, 1805, 91, 29, 220, 27, 91, 1805, -# 91, 29, 220, 27, 91, 1805, 91, 29, 198, -# 27, 91, 1805, 91, 29, 27, 91, 408, 41317, -# 91, 29, 74785, 419, 2168, 13, 151645, 198, 151644, -# 77091, 198]]).astype(paddle.int64) -# inputs['media_offset'] [17, 23, 29, 35, 41, 47, 53] -# inputs['pixel_values'] [7, 3, 378, 378] sum 629145600 - -import numpy as np -inputs['pixel_values'] = paddle.to_tensor(np.load('pixel_values.npy')).cast(paddle.bfloat16) -inputs['media_offset'] = [paddle.to_tensor([18, 24, 30, 36, 42, 48, 54])] - -inputs.update({ - 'tokenizer': tokenizer, - 'max_new_tokens':100, - 'decode_text':True, -}) - -g = model.generate(**inputs) -print(g) +inputs["pixel_values"] = inputs["pixel_values"].cast(paddle.bfloat16) + +inputs.update( + { + "tokenizer": tokenizer, + "max_new_tokens": 512, # + "decode_text": True, + } +) + +res = model.generate(**inputs) +print("output:\n", res) diff --git a/paddlemix/examples/mPLUG_Owl3/run_inference_video.py b/paddlemix/examples/mPLUG_Owl3/run_inference_video.py deleted file mode 100644 index 778c9cb73..000000000 --- a/paddlemix/examples/mPLUG_Owl3/run_inference_video.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
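# To query a different image with the cleaned-up script above, only the image and the user turn
# change; the "<|image|>" placeholder stays in the prompt so the processor can line the visual
# tokens up with the text (the blank image below is a stand-in for a real Image.open(...) call):
from PIL import Image

image = Image.new("RGB", (448, 448), color="white")      # placeholder image
messages = [
    {"role": "user", "content": "<|image|>What stands out in this picture?"},
    {"role": "assistant", "content": ""},
]
# then, exactly as in run_inference.py:
#   inputs = processor(messages, images=[image], videos=None)
#   inputs["pixel_values"] = inputs["pixel_values"].cast(paddle.bfloat16)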
- -import argparse -import base64 -import io -from typing import Dict, List - -import paddle -import PIL.Image -from paddlenlp.transformers import LlamaTokenizerFast - -from paddlemix.models.janus import JanusMultiModalityCausalLM -from paddlemix.processors import JanusImageProcessor, JanusVLChatProcessor - -import paddle -model_path = 'mPLUG/mPLUG-Owl3-7B-241101' - -config = AutoConfig.from_pretrained(model_path) -print(config) -model = AutoModel.from_pretrained(model_path, dtype=paddle.bfloat16).eval() - -from PIL import Image - -from modelscope import AutoTokenizer -from decord import VideoReader, cpu # pip install decord -tokenizer = AutoTokenizer.from_pretrained(model_path) -processor = model.init_processor(tokenizer) - - -messages = [ - {"role": "user", "content": """<|video|> -Describe this video."""}, - {"role": "assistant", "content": ""} -] - -videos = ['/nas-mmu-data/examples/car_room.mp4'] - -MAX_NUM_FRAMES=16 - -def encode_video(video_path): - def uniform_sample(l, n): - gap = len(l) / n - idxs = [int(i * gap + gap / 2) for i in range(n)] - return [l[i] for i in idxs] - - vr = VideoReader(video_path, ctx=cpu(0)) - sample_fps = round(vr.get_avg_fps() / 1) # FPS - frame_idx = [i for i in range(0, len(vr), sample_fps)] - if len(frame_idx) > MAX_NUM_FRAMES: - frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES) - frames = vr.get_batch(frame_idx).asnumpy() - frames = [Image.fromarray(v.astype('uint8')) for v in frames] - print('num frames:', len(frames)) - return frames -video_frames = [encode_video(_) for _ in videos] -inputs = processor(messages, images=None, videos=video_frames) - -inputs.update({ - 'tokenizer': tokenizer, - 'max_new_tokens':100, - 'decode_text':True, -}) - -g = model.generate(**inputs) -print(g) diff --git a/paddlemix/models/mPLUGOwl3/__init__.py b/paddlemix/models/mPLUGOwl3/__init__.py index 93ec48f7a..a9bd46569 100644 --- a/paddlemix/models/mPLUGOwl3/__init__.py +++ b/paddlemix/models/mPLUGOwl3/__init__.py @@ -14,8 +14,6 @@ from .configuration_hyper_qwen2 import * from .configuration_mplugowl3 import * -from .image_processing_mplugowl3 import * from .modeling_hyper_qwen2 import * from .modeling_mplugowl3 import * from .modeling_navit_siglip import * -from .processing_mplugowl3 import * diff --git a/paddlemix/models/mPLUGOwl3/configuration_mplugowl3.py b/paddlemix/models/mPLUGOwl3/configuration_mplugowl3.py index 36adf6f1c..f31fa6049 100644 --- a/paddlemix/models/mPLUGOwl3/configuration_mplugowl3.py +++ b/paddlemix/models/mPLUGOwl3/configuration_mplugowl3.py @@ -1,22 +1,21 @@ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
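# The deleted run_inference_video.py sampled frames at roughly one per second with decord and
# then uniformly thinned the list down to MAX_NUM_FRAMES. The index selection is plain Python
# and can be sanity-checked without decord (the frame count and fps below are made up):
def sample_frame_indices(num_frames, avg_fps, max_num_frames=16):
    sample_fps = max(1, round(avg_fps / 1))              # keep ~1 frame per second
    idxs = list(range(0, num_frames, sample_fps))
    if len(idxs) > max_num_frames:                       # uniform_sample() in the old script
        gap = len(idxs) / max_num_frames
        idxs = [idxs[int(i * gap + gap / 2)] for i in range(max_num_frames)]
    return idxs

print(len(sample_frame_indices(num_frames=900, avg_fps=30)))   # 16 frames for a ~30 s clip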
-import os -import paddlenlp +from paddlemix.utils.log import logger + from .configuration_hyper_qwen2 import HyperQwen2Config from .modeling_navit_siglip import SigLipVisionConfig -from paddlemix.utils.log import logger class mPLUGOwl3Config(HyperQwen2Config): @@ -30,10 +29,9 @@ class mPLUGOwl3Config(HyperQwen2Config): "model_type": "siglip_vision_model", "num_attention_heads": 16, "num_hidden_layers": 27, - "patch_size": 14 + "patch_size": 14, } - def __init__( self, use_cache=True, diff --git a/paddlemix/models/mPLUGOwl3/image_processing_mplugowl3.py b/paddlemix/models/mPLUGOwl3/image_processing_mplugowl3.py deleted file mode 100644 index 4574aaff6..000000000 --- a/paddlemix/models/mPLUGOwl3/image_processing_mplugowl3.py +++ /dev/null @@ -1,494 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# import sys - -# sys.path.append("/home/aistudio/paddle_test/mPLUGOwl3/utils") -import math -import random -from enum import Enum -from typing import Any, Dict, List, Optional, Union - -import numpy as np -import paddle.vision.transforms as transforms - -# import paddle_aux -import paddle -import paddle.nn.functional as F -import paddlenlp -import PIL -import PIL.Image -import PIL.ImageSequence -from einops import rearrange, repeat -from paddlenlp.transformers.image_processing_utils import ( - BaseImageProcessor, - BatchFeature, -) -from PIL import Image -from paddle.vision.transforms import Resize - - -def recursive_converter(converter, value): - if isinstance(value, list): - new_value = [] - for v in value: - new_value += [recursive_converter(converter, v)] - return new_value - else: - return converter(value) - - -def box_area(boxes): - # 获取边界框的宽度和高度 - width = boxes[:, 2] - boxes[:, 0] - height = boxes[:, 3] - boxes[:, 1] - # 计算面积 - area = width * height - return area - - -def custom_max(a, b): - return paddle.where(a > b, a, b) - - -def custom_min(a, b): - return paddle.where(a < b, a, b) - -def box_iou(boxes1, area1, boxes2, eps=1e-05): - # >>>>>> area2 = torchvision.ops.boxes.box_area(boxes2) - area1 = area1.astype("float32") - boxes1 = boxes1.astype("float32") - boxes2 = boxes2.astype("float32") - - area2 = box_area(boxes2).astype("float32") - lt = custom_max(boxes1[:, None, :2], boxes2[:, :2]) - rb = custom_min(boxes1[:, None, 2:], boxes2[:, 2:]) - wh = (rb - lt).clip(min=0) - inter = wh[:, :, 0] * wh[:, :, 1] - union = area1[:, None] + area2 - inter - iou = inter / (union + eps) - return iou, union - - -# def box_iou(boxes1, area1, boxes2, eps=1e-5): -# area2 = box_area(boxes2) - -# lt = paddle.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] -# rb = paddle.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] - -# wh = (rb - lt).clip(min=0) # [N,M,2] -# inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] - -# union = area1[:, None] + area2 - inter - -# iou = inter / (union + eps) -# return iou, union - - -available_anchor_strategy = ['docowl', 'random', 'highest', 'last', 'llava'] - -grid_dict = { - 'grid_33':[ - (1,1), - 
(1,2),(2,1), - (1,3),(3,1), - (2,2),(1,4),(4,1), - (1,5),(5,1), - (1,6),(6,1),(2,3),(3,2), - (1,7),(7,1), - (4,2),(2,4),(1,8),(8,1), - (3,3),(1,9),(9,1)], - 'grid_squ_3x3':[ - (1,1),(2,2),(3,3) - ], - 'grid_squ_4':[ - (2,2),(1,3),(1,4),(3,1),(4,1) - ], - 'grid_squ_6':[ - (2,2),(1,3),(1,4),(3,1),(4,1), (2,3),(3,2) - ], - 'grid_squ_2':[ - (2,1) - ], - 'grid_squ_9':[ - (1,1), - (1,2),(2,1), - (1,3),(3,1), - (2,2),(1,4),(4,1), - (1,5),(5,1), - (1,6),(6,1),(2,3),(3,2), - (1,7),(7,1), - (4,2),(2,4),(1,8),(8,1), - (3,3),(1,9),(9,1)], -} - - -cut_prompt_template_dict = { - 'v0': lambda img_token, h, w: f''.join([f"{img_token}" for i in range(h) for j in range(w)]), - 'v1': lambda img_token, h, w: f'Cut to {h} rows {w} columns, '+ ' '.join([f"subimg({i},{j}){img_token}"for i in range(h) for j in range(w)]), - 'v1_global': lambda img_token, h, w: f'Cut to {h} rows {w} columns with a global view, '+ ' '.join([f"subimg({i},{j}){img_token}"for i in range(h) for j in range(w)]+[f"global_view{img_token}"]), - 'v2_global': lambda img_token, h, w: f'Cut to {h} rows {w} columns with a global view\n'+ '\n'.join([' '.join([f"subimg({i},{j}){img_token}" for j in range(w)]) for i in range(h)])+f"\nglobal_view{img_token}", - 'v3': lambda img_token, h, w: f'<|start_cut|>{h}*{w}'+ ' '.join([f"{img_token}"for i in range(h) for j in range(w)])+'<|end_cut|>', - 'v3_global': lambda img_token, h, w: f'<|start_cut|>{h}*{w}\n'+ '\n'.join([' '.join([f"{img_token}" for j in range(w)]) for i in range(h)])+f'\n{img_token}<|end_cut|>', - -} - -def anchor_rank(anchors, anchors_areas, input_image_size, eps=1e-5): - # anchors x1 y1 x2 y2 - - # image_size: (h, w) - # xyxy - input_image_bbox = paddle.to_tensor([0, 0, input_image_size[1], input_image_size[0]]).unsqueeze(0) - - boxes1 = anchors - boxes2 = input_image_bbox - boxes3 = anchors.clone() - # y2 - boxes3[:,3] = input_image_size[0]/input_image_size[1]*anchors[:,2] # 用于算分辨率无关的iou - - area1 = anchors_areas - - iou, _ = box_iou(boxes1, area1, boxes2) - iou = iou.squeeze(1) - shape_iou, _ = box_iou(boxes1, area1, boxes3) - shape_iou = shape_iou.diag() - # 优先匹配形状接近 再匹配分辨率接近 - index = paddle.argmax(shape_iou*100+iou,axis=0) - return index - - -def select_best_resolution(anchors, anchors_areas, input_image_size): # TODO For a futher check - """ - Selects the best resolution from a list of possible resolutions based on the original size. - - Args: - original_size (tuple): The original size of the image in the format (width, height). - possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. - - Returns: - tuple: The best fit resolution in the format (width, height). 
- """ - original_size = (input_image_size[1], input_image_size[0]) - possible_resolutions = [(_[2], _[3]) for _ in anchors] # xyxy -> w,h - - original_width, original_height = original_size - best_fit = None - max_effective_resolution = 0 - min_wasted_resolution = float('inf') - - index = 0 - for i, (width, height) in enumerate(possible_resolutions): - scale = min(width / original_width, height / original_height) - downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale) - effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height) - wasted_resolution = (width * height) - effective_resolution - - if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution): - max_effective_resolution = effective_resolution - min_wasted_resolution = wasted_resolution - best_fit = (width, height) - index = i - - return index - -def build_cut_shape_indices(cut_shape): - # cut_shape: a list of (nh,nw) - cut_shape_indices = [] - for shape in cut_shape: - n=shape[0]*shape[1] - indices = paddle.concat([ - repeat(paddle.to_tensor(shape),'l -> n l',n=n), - paddle.arange(n).unsqueeze(1) - ], axis=1) - assert indices.shape[0] == n - assert indices.shape[1] == 3 # nh,nw,idx - - cut_shape_indices.append(indices) - cut_shape_indices = paddle.concat(cut_shape_indices,axis=0).astype('int64') - return cut_shape_indices - - -class AnchorResize(paddle.nn.Layer): - - def __init__(self, image_size, anchors, interpolation="bilinear", antialias=None, anchor_strategy='docowl'): - super().__init__() - self.image_size = image_size - # xyxy - self.anchors = paddle.to_tensor( - [[0, 0, _[1]*image_size[1], _[0]*image_size[0]] for _ in anchors], - ) - - self.anchor_areas = box_area(self.anchors) - - self.interpolation = interpolation - self.antialias = antialias - self.anchor_strategy = anchor_strategy - assert self.anchor_strategy in available_anchor_strategy - - def resize_global(self, img): - transform = Resize(size=self.image_size,interpolation=self.interpolation) - return transform(img) - #return F.resize(img, self.image_size, self.interpolation, max_size=None, antialias=self.antialias) - # image_np = np.array(img) - # image_tensor = paddle.to_tensor(image_np, dtype="float32") - # image_tensor = image_tensor.transpose([2, 0, 1]) # 变成 (3, 500, 500) - # if self.interpolation == "bilinear" or "bicubic": - # image_tensor = image_tensor.unsqueeze(0) # 变成 (1, 3, 500, 500) - # return F.interpolate(image_tensor, size=self.image_size, mode=self.interpolation, align_corners=False)[0] - - def forward(self, img, skip_resize=False): - """ - Args: - img (PIL Image or Tensor): Image to be scaled. - - Returns: - PIL Image or Tensor: Rescaled image. 
- """ - if self.anchor_strategy == 'docowl': - selected_anchor = anchor_rank(self.anchors, self.anchor_areas, (img.size[1], img.size[0])) - elif self.anchor_strategy == 'random': - selected_anchor = random.randint(0,len(self.anchors)-1) - elif self.anchor_strategy == 'highest': - # 选面积最大的 在这个基础上 尽可能选最方正的 - selected_anchor = paddle.argmax(self.anchors[:,2]*self.anchors[:,3]*100-paddle.abs(self.anchors[:,2]-self.anchors[:,3])) - elif self.anchor_strategy == 'last': - selected_anchor = len(self.anchors)-1 - elif self.anchor_strategy == 'llava': - selected_anchor = select_best_resolution(self.anchors, self.anchor_areas, (img.size[1], img.size[0])) - else: - selected_anchor = None - assert selected_anchor is not None - - target_size = self.anchors[selected_anchor][2:].tolist() # w,h - if skip_resize: - # for debug - return selected_anchor - #return F.resize(img, [target_size[1],target_size[0]], self.interpolation, max_size=None, antialias=self.antialias), selected_anchor - # image_np = np.array(img) - # image_tensor = paddle.to_tensor(image_np, dtype="float32") - # image_tensor = image_tensor.transpose([2, 0, 1]) # 变成 (3, 500, 500) - # if self.interpolation == "bilinear" or "bicubic": - # image_tensor = image_tensor.unsqueeze(0) # 变成 (1, 3, 500, 500) - transform = Resize(size=[target_size[1],target_size[0]],interpolation=self.interpolation) - return (transform(img),selected_anchor) - # return ( - # F.interpolate( - # image_tensor, size=[target_size[1], target_size[0]], mode=self.interpolation, align_corners=False - # )[0], - # selected_anchor, - # ) - - def __repr__(self) -> str: - detail = f"(size={self.image_size}, anchor={self.anchors}, interpolation={self.interpolation.value}, antialias={self.antialias})" - return f"{self.__class__.__name__}{detail}" - - -class CutMixin: - def __init__(self, cut_cfg={"anchors": "grid_squ_6", "anchor_strategy": "docowl", "cut_prompt": "v3", "add_global": True, "cut_prob": 1.0}) -> None: - if cut_cfg is None: - self.cut_enable = False - return - else: - self.cut_enable = True - image_size = self.image_size - anchors = cut_cfg.get('anchors','grid_33') - anchor_strategy = cut_cfg.get('anchor_strategy','docowl') - cut_prompt = cut_cfg.get('cut_prompt','v0') - self.cut_prob = cut_cfg.get('cut_prob', 1.0) - - self.force_shape_cut = cut_cfg.get('force_shape_cut', False) - force_shape_cut_anchors = cut_cfg.get('force_shape_cut_anchors', 'force_shape_cut_anchors') - - self.add_global = cut_cfg.get('add_global', False) - - # h,w - if isinstance(image_size, int): - image_size = (image_size, image_size) - self.image_size = image_size - - if anchors in grid_dict: - anchors = grid_dict[anchors] - else: - anchors = eval(anchors) - self.anchors = [tuple(_) for _ in anchors] - self.anchor_max = max([max(_) for _ in self.anchors]) - self.resizer = AnchorResize(image_size=image_size, anchors=anchors, interpolation='bicubic', anchor_strategy=anchor_strategy) - - if force_shape_cut_anchors in grid_dict: - force_shape_cut_anchors = grid_dict[force_shape_cut_anchors] - else: - force_shape_cut_anchors = eval(force_shape_cut_anchors) - self.force_shape_cut_anchors = [tuple(_) for _ in force_shape_cut_anchors] - self.force_shape_cut_anchors_max = max([max(_) for _ in self.force_shape_cut_anchors]) - - self.old_resizer = transforms.Resize(image_size,interpolation="bicubic") - - # 把image processor的缩放去掉 只保留后面的变换 - self.image_transform = transforms.Compose(self.image_transform.transforms[1:]) - if self.add_global: - self.cut_prompt_template = 
cut_prompt_template_dict[cut_prompt+'_global'] - else: - self.cut_prompt_template = cut_prompt_template_dict[cut_prompt] - - self.media_tokens = ["<|image|>", "<|video|>"] - - def _process_image(self, images): - new_images = [] - cut_shape = [] - for image in images: - raw_image = image - image, selected_anchor = self.resizer(image) - image_input = self.image_transform(image) # h,w,3 -> 3,h,w - cut_shape.append((image_input.shape[1]//self.image_size[0], image_input.shape[2]//self.image_size[1])) # cut_h, cut_w - image_input = rearrange(image_input, 'C (num_h h) (num_w w) -> (num_h num_w) C h w', h=self.image_size[0], w=self.image_size[1]) - - new_images.append(image_input) - - if self.add_global: - new_images.append(self.image_transform(self.resizer.resize_global(raw_image)).unsqueeze(0)) - cut_shape.append((1,1)) - - new_images = paddle.concat(new_images, axis=0) - cut_shape_indices = build_cut_shape_indices(cut_shape) - return new_images, cut_shape, cut_shape_indices - - -class TensorType(Enum): - PADDLE = "paddle" - - -class mPLUGOwl3BatchFeature(BatchFeature): - r""" - Extend from BatchFeature for supporting various image size - """ - def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None): - super().__init__(data) - self.convert_to_tensors(tensor_type=tensor_type) - - def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None): - if tensor_type is None: - return self - - #is_tensor, as_tensor = self._get_is_as_tensor_fns(tensor_type) - is_tensor = lambda x: isinstance(x, paddle.Tensor) - as_tensor = paddle.to_tensor - - def converter(value): - try: - if not is_tensor(value): - tensor = as_tensor(value) - return tensor - except: # noqa E722 - if key == "overflowing_values": - raise ValueError("Unable to create tensor returning overflowing values of different lengths. ") - raise ValueError( - "Unable to create tensor, you should probably activate padding " - "with 'padding=True' to have batched tensors with the same length." - ) - - for key, value in self.items(): - self[key] = recursive_converter(converter, value) - return self - - # def to(self, *args, **kwargs) -> "mPLUGOwl3BatchFeature": - # requires_backends(self, ["torch"]) - # import torch - - # def cast_tensor(v): - # # # check if v is a floating point - # # if torch.is_floating_point(v): - # # # cast and send to device - # # return v.to(*args, **kwargs) - # # elif device is not None: - # # return v.to(device=device) - # # else: - # # return v - # if isinstance(v, paddle.Tensor): - # # For floating point tensors - # if v.dtype in [paddle.float32, paddle.float64]: - # if "dtype" in kwargs: - # v = v.cast(kwargs["dtype"]) - # if "place" in kwargs: - # v = v.place(kwargs["place"]) - # return v - # # For non-floating point tensors, only handle device - # elif "place" in kwargs: - # return v.place(kwargs["place"]) - # return v - - # new_data = {} - # device = kwargs.get("place") - # # Check if the args are a device or a dtype - # if device is None and len(args) > 0: - # # device should be always the first argument - # arg = args[0] - # if is_torch_dtype(arg): - # # The first argument is a dtype - # pass - # elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int): - # device = arg - # else: - # # it's something else - # raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. 
This is not supported.") - # # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor` - # for k, v in self.items(): - # new_data[k] = recursive_converter(cast_tensor, v) - # self.data = new_data - # return self - - -class mPLUGOwl3ImageProcessor(BaseImageProcessor, CutMixin): - model_input_names = ["pixel_values"] - - def __init__( - self, - image_size, - mean=[0.5, 0.5, 0.5], - std=[0.5, 0.5, 0.5], - **kwargs): - super().__init__(**kwargs) - self.image_size = image_size - self.image_transform = transforms.Compose([ - transforms.Resize((image_size, image_size), interpolation="bicubic"), - transforms.ToTensor(), - transforms.Normalize(mean, std), - ]) - CutMixin.__init__(self) - - def preprocess( - self, - images: Union[Image.Image, List[Image.Image]], - cut_enable=True, - **kwargs - ) -> mPLUGOwl3BatchFeature: - if isinstance(images, Image.Image): - images_list = [images] - else: - images_list = images - - if self.cut_enable and cut_enable: - image_data, cut_shape, cut_shape_indices = self._process_image(images_list) - else: - image_data = [self.image_transform(self.resizer.resize_global(image)) for image in images_list] - image_data = paddle.stack(image_data, axis=0) - cut_shape = cut_shape_indices = None - - return mPLUGOwl3BatchFeature(data={'pixel_values': image_data, 'cut_shape':cut_shape, 'cut_shape_indices':cut_shape_indices}) - - def to_dict(self): - encoder_dict = super().to_dict() - pop_keys = ['image_transform', 'resizer', 'old_resizer', 'cut_prompt_template'] - for pk in pop_keys: - encoder_dict.pop(pk, None) - return encoder_dict diff --git a/paddlemix/models/mPLUGOwl3/modeling_hyper_qwen2.py b/paddlemix/models/mPLUGOwl3/modeling_hyper_qwen2.py index e35a41bd7..1a83354a1 100644 --- a/paddlemix/models/mPLUGOwl3/modeling_hyper_qwen2.py +++ b/paddlemix/models/mPLUGOwl3/modeling_hyper_qwen2.py @@ -1,56 +1,31 @@ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import List, Optional, Tuple, Union + import paddle import paddle.nn as nn -from paddle.nn import MultiHeadAttention import paddlenlp - -from typing import List, Optional, Tuple, Union - from einops import rearrange, repeat +from paddle.nn import MultiHeadAttention +from paddlenlp.transformers.qwen2.modeling import Qwen2Attention -# from paddlemix.models.flash_attn_utils import ( -# has_flash_attn_func, -# is_flash_attn_available, -# ) -#from paddle.nn.functional.flash_attention import flash_attention as flash_attn_func -#from paddle.nn.functional.flash_attention import flash_attn_unpadded as flash_attn_varlen_func +from paddlemix.utils.log import logger from ...activations import ACT2FN from .configuration_hyper_qwen2 import HyperQwen2Config -try: - from einops import rearrange - use_flash_rotary = True - print("use flash_attn rotary") -except ImportError: - use_flash_rotary = False - print("import flash_attn rotary fail") -from paddlemix.utils.log import logger - - -# def _get_unpad_data(attention_mask): -# seqlens_in_batch = attention_mask.sum(axis=-1, dtype="int32") -# paddle.utils.try_import("warnings").warn("Now, the return shape is inconsistent with torch when as_tuple is True") -# indices = paddle.nonzero(x=attention_mask.flatten(), as_tuple=False).flatten() -# max_seqlen_in_batch = seqlens_in_batch.max().item() -# cu_seqlens = paddle.nn.functional.pad( -# x=paddle.cumsum(x=seqlens_in_batch, axis=0, dtype="int32"), pad=(1, 0), pad_from_left_axis=False -# ) -# return indices, cu_seqlens, max_seqlen_in_batch - def is_casual_mask(attention_mask): """ @@ -116,9 +91,7 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000): paddle.arange(start=0, end=self.dim, step=2, dtype="int64").astype(dtype="float32") / self.dim ) self.register_buffer(name="inv_freq", tensor=inv_freq, persistable=False) - self._set_cos_sin_cache( - seq_len=max_position_embeddings, dtype=paddle.get_default_dtype() - ) + self._set_cos_sin_cache(seq_len=max_position_embeddings, dtype=paddle.get_default_dtype()) def _set_cos_sin_cache(self, seq_len, dtype): self.max_seq_len_cached = seq_len @@ -138,6 +111,7 @@ def forward(self, x, seq_len=None): self.sin_cached[:seq_len].to(dtype=x.dtype), ) + class RotaryEmbedding(paddle.nn.Layer): def __init__(self, dim, base=10000, use_fp32=False, use_outer_in_rope=False): super().__init__() @@ -149,6 +123,7 @@ def __init__(self, dim, base=10000, use_fp32=False, use_outer_in_rope=False): else: inv_freq = 1.0 / base ** (paddle.arange(start=0, end=dim, step=2).astype(dtype="float32") / dim) self.register_buffer(name="inv_freq", tensor=inv_freq) + self._rotary_pos_emb_cache = None self._seq_len_cached = 0 self.use_outer_in_rope = use_outer_in_rope @@ -164,13 +139,12 @@ def update_rotary_pos_emb_cache(self, max_seq_len, offset=0, ntk_alpha=1.0): self._seq_len_cached = seqlen self._ntk_alpha_cached = ntk_alpha seq = paddle.arange(end=seqlen) - if self.use_outer_in_rope: + if 1: # self.use_outer_in_rope: freqs = paddle.outer(x=seq.astype(dtype=self.inv_freq.dtype), y=self.inv_freq) - else: - freqs = einsum("i , j -> i j", seq.astype(dtype=self.inv_freq.dtype), self.inv_freq) + # else: + # freqs = einsum("i , j -> i j", seq.astype(dtype=self.inv_freq.dtype), self.inv_freq) emb = paddle.concat(x=(freqs, freqs), axis=-1) - from einops import rearrange - + # emb [seq_length, .., dim] self._rotary_pos_emb_cache = rearrange(emb, "n d -> n 1 1 d") def forward(self, max_seq_len, offset=0, ntk_alpha=1.0): @@ -240,15 +214,11 @@ def repeat_kv(hidden_states: paddle.Tensor, 
n_rep: int) -> paddle.Tensor: return hidden_states.reshape([batch, num_key_value_heads * n_rep, slen, head_dim]) - - - def _rotate_half(x): """ change sign so the last dimension becomes [-odd, +even] """ - from einops import rearrange - x = rearrange(x, '... (j d) -> ... j d', j=2) + x = rearrange(x, "... (j d) -> ... j d", j=2) x1, x2 = x.unbind(axis=-2) return paddle.concat(x=(-x2, x1), axis=-1) @@ -272,7 +242,7 @@ def apply_rotary_pos_emb_core(t, freqs, use_fp32=False, debug=False): rot_dim = freqs.shape[-1] # ideally t_pass is empty so rotary pos embedding is applied to all tensor t t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:] - + if use_fp32: t_ = t_.astype(dtype="float32") t_pass_ = t_pass_.astype(dtype="float32") @@ -331,14 +301,14 @@ def __init__(self, config: HyperQwen2Config, layer_idx: Optional[int] = None, is self.is_hyper_enabled = is_hyper_enabled if self.is_hyper_enabled: self.v_kv_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim * 2, bias_attr=True) - - self.visual_cache={} - + + self.visual_cache = {} + self.use_flexattention = True def apply_mi_rope(self, key_layer, image_pos, length_each_img): # input shape should be [s b h d] - key_layer = rearrange(key_layer, 'b h s d -> s b h d') + key_layer = rearrange(key_layer, "b h s d -> s b h d") # if self.rotary_emb_core.inv_freq.device!=key_layer.device: # self.rotary_emb_core.inv_freq = self.rotary_emb_core.inv_freq.to(key_layer.device) rotary_pos_emb_max_seq_len = self.config.max_position_embeddings @@ -349,15 +319,15 @@ def apply_mi_rope(self, key_layer, image_pos, length_each_img): if isinstance(rotary_pos_emb, tuple): rotary_pos_emb = rotary_pos_emb else: - rotary_pos_emb = ((rotary_pos_emb,) * 2) + rotary_pos_emb = (rotary_pos_emb,) * 2 if rotary_pos_emb is not None: q_pos_emb, k_pos_emb = rotary_pos_emb - - k_pos_emb = repeat(k_pos_emb[image_pos], 'N_img b h d -> (N_img L) b h d', L=length_each_img) # N_img, dim - key_layer = apply_rotary_pos_emb_core(key_layer, k_pos_emb, use_fp32=True) # TODO difference - key_layer = rearrange(key_layer, 's b h d -> b h s d') + k_pos_emb = repeat(k_pos_emb[image_pos], "N_img b h d -> (N_img L) b h d", L=length_each_img) # N_img, dim + + key_layer = apply_rotary_pos_emb_core(key_layer, k_pos_emb, use_fp32=True) # TODO difference + key_layer = rearrange(key_layer, "s b h d -> b h s d") return key_layer @@ -368,7 +338,9 @@ class HyperQwen2SdpaAttention(HyperQwen2Attention): SDPA API. 
""" - def hyperattention(self,hidden_states: paddle.Tensor, + def hyperattention( + self, + hidden_states: paddle.Tensor, attention_mask: Optional[paddle.Tensor] = None, position_ids: Optional[paddle.Tensor] = None, image_embeds=None, @@ -376,97 +348,108 @@ def hyperattention(self,hidden_states: paddle.Tensor, past_key_value: Optional[MultiHeadAttention.Cache] = None, output_attentions: bool = False, use_cache: bool = False, - )-> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: - bsz, q_len, _ = hidden_states.shape # (1, 74, 28, 128) bsz, q_len, self.num_heads, self.head_dim + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + bsz, q_len, _ = hidden_states.shape # (1, 74, 28, 128) bsz, q_len, self.num_heads, self.head_dim try: query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) except: - hidden_states = hidden_states.astype('bfloat16') + hidden_states = hidden_states.astype("bfloat16") query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) query_states = query_states.reshape([bsz, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) key_states = key_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) - value_states = value_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) + value_states = value_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose( + [0, 2, 1, 3] + ) kv_seq_len = key_states.shape[-2] if past_key_value is not None: - #kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) kv_seq_len += past_key_value[0].shape[-2] cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + # print('query_states, key_states', query_states.sum().item(), key_states.sum().item()) + # 29952.0 492.0 query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - #print('query_states, key_states', query_states.shape, key_states.shape) + # print('query_states, key_states', query_states.sum().item(), key_states.sum().item()) + # 18304.0 -776.0 + # print('query_states, key_states', query_states.shape, key_states.shape) # [1, 28, 1, 128] [1, 4, 1, 128] if past_key_value is not None: - #cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - #key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + # cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) key_states = paddle.concat([past_key_value[0], key_states], axis=2) value_states = paddle.concat([past_key_value[1], value_states], axis=2) past_key_value = (key_states, value_states) if use_cache else None - #print('query_states key_states, value_states', query_states.sum().item(), key_states.sum().item(), value_states.sum().item()) - #print('query_states key_states, value_states', query_states.shape, key_states.shape, value_states.shape) + # print('query_states key_states, value_states', query_states.sum().item(), key_states.sum().item(), value_states.sum().item()) + # print('query_states key_states, value_states', query_states.shape, key_states.shape, value_states.shape) # q k v [1, 28, 74, 128] [1, 4, 74, 128] 
[1, 4, 74, 128] # q k v [1, 28, 1, 128] [1, 4, 75, 128] [1, 4, 75, 128] - # query_states key_states, value_states 18304.0 -792.0 -253.0 - # query_states key_states, value_states 24832.0 123.5 -198.0 - # query_states key_states, value_states -16896.0 552.0 -692.0 - # query_states key_states, value_states -120.0 1200.0 -141.0 - key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) + # -5440. -1712. # add visual to kv length_each_img = image_embeds.shape[1] - # [7, 729, 3584] sum 78848. - #import pdb; pdb.set_trace() + # [7, 729, 3584] sum 78336. mean 0.00430298 try: image_embeds = self.v_kv_proj(image_embeds) except: - image_embeds = self.v_kv_proj(image_embeds.astype('bfloat16')) - #import pdb; pdb.set_trace() + image_embeds = self.v_kv_proj(image_embeds.astype("bfloat16")) # [7, 729, 1024] sum 184320. image_start = 0 - context_layer = [] + context_layer = [] for bi, media_starts in enumerate(media_offset): num_images = media_starts.shape[0] if num_images > 0: if q_len == 1: - full_mask = paddle.ones((1,1,1, num_images*length_each_img + kv_seq_len)).astype(paddle.bool) + full_mask = paddle.ones((1, 1, 1, num_images * length_each_img + kv_seq_len)).astype(paddle.bool) else: causal_mask = paddle.tril(paddle.ones([q_len, kv_seq_len])).astype(paddle.bool) # expand dims to match (bsz, 1, q_len, kv_seq_len) causal_mask = causal_mask.unsqueeze(0).unsqueeze(0) - matrix = paddle.arange(q_len).reshape([-1,1]) - t2vmask = ~(matrix < media_starts.reshape([1,-1])) - t2vmask = repeat(t2vmask, 'seq_t seq_v -> 1 1 seq_t (seq_v v_token)', v_token=length_each_img) - full_mask = paddle.concat([t2vmask, causal_mask], axis=3) # unsqueeze batch dim (batch, 1, seq_q, seq_k) + matrix = paddle.arange(q_len).reshape([-1, 1]) + t2vmask = ~(matrix < media_starts.reshape([1, -1])) + t2vmask = repeat(t2vmask, "seq_t seq_v -> 1 1 seq_t (seq_v v_token)", v_token=length_each_img) + full_mask = paddle.concat( + [t2vmask, causal_mask], axis=3 + ) # unsqueeze batch dim (batch, 1, seq_q, seq_k) - curr_query_layer = query_states[bi:bi+1] + curr_query_layer = query_states[bi : bi + 1] # order is sbhd - curr_visual_key_layer, curr_visual_value_layer = rearrange(image_embeds[image_start:image_start+num_images], 'BL Lv (H KV D) -> KV 1 H (BL Lv) D', KV=2, H=self.num_key_value_heads) # b h s d + curr_visual_key_layer, curr_visual_value_layer = rearrange( + image_embeds[image_start : image_start + num_images], + "BL Lv (H KV D) -> KV 1 H (BL Lv) D", + KV=2, + H=self.num_key_value_heads, + ) # b h s d image_start += num_images + # print("curr_query_layer", bi, curr_visual_key_layer.sum().item(), curr_visual_value_layer.sum().item()) + # [1, 4, 5103, 128] 206848.
-22400.0 - curr_visual_key_layer = self.apply_mi_rope(curr_visual_key_layer, media_starts, length_each_img=length_each_img) + curr_visual_key_layer = self.apply_mi_rope( + curr_visual_key_layer, media_starts, length_each_img=length_each_img + ) curr_visual_key_layer = repeat_kv(curr_visual_key_layer, self.num_key_value_groups) curr_visual_value_layer = repeat_kv(curr_visual_value_layer, self.num_key_value_groups) - curr_key_layer = paddle.concat([curr_visual_key_layer, key_states[bi:bi+1]], axis=2) - curr_value_layer = paddle.concat([curr_visual_value_layer, value_states[bi:bi+1]], axis=2) + curr_key_layer = paddle.concat([curr_visual_key_layer, key_states[bi : bi + 1]], axis=2) + curr_value_layer = paddle.concat([curr_visual_value_layer, value_states[bi : bi + 1]], axis=2) is_causal = False else: # 执行无图attention - curr_query_layer = query_states[bi:bi+1] - curr_key_layer = key_states[bi:bi+1] - curr_value_layer = value_states[bi:bi+1] + curr_query_layer = query_states[bi : bi + 1] + curr_key_layer = key_states[bi : bi + 1] + curr_value_layer = value_states[bi : bi + 1] is_causal = True if q_len > 1 else False if is_causal: full_mask = None @@ -483,28 +466,31 @@ def hyperattention(self,hidden_states: paddle.Tensor, # curr_key_layer = curr_key_layer.contiguous() # curr_value_layer = curr_value_layer.contiguous() - - # full_mask.shape [1, 1, 72, 5175] # sum 196689 + # full_mask.shape [1, 1, 74, 5177] # sum 196689 attn_output = paddle.nn.functional.scaled_dot_product_attention( - curr_query_layer.transpose([0, 2, 1, 3]), # (batch, ..., sequence, dim) # [1, 72, 28, 128], torch [1, 28, 74, 128] sum 18304. - curr_key_layer.transpose([0, 2, 1, 3]), # [1, 5175, 28, 128], torch [1, 28, 5177, 128] sum 1044480 mean 0.05615234 torch sum 1036288. mean 0.0559 - curr_value_layer.transpose([0, 2, 1, 3]), # [1, 5175, 28, 128] , torch [1, 28, 5177, 128] sum -158720 - attn_mask=full_mask.cast(curr_query_layer.dtype), # (N, ..., L, S) A boolean mask where a value of True indicates that the element *should* take part in attention. + curr_query_layer.transpose( + [0, 2, 1, 3] + ), # (batch, ..., sequence, dim) # [1, 74, 28, 128], torch [1, 28, 74, 128] sum 18304. + curr_key_layer.transpose( + [0, 2, 1, 3] + ), # [1, 5177, 28, 128], torch [1, 28, 5177, 128] sum 1044480 mean 0.05615234 torch sum 1036288. mean 0.0559 + curr_value_layer.transpose([0, 2, 1, 3]), # [1, 5177, 28, 128] , torch [1, 28, 5177, 128] sum -158720 + attn_mask=full_mask.cast( + curr_query_layer.dtype + ), # (N, ..., L, S) A boolean mask where a value of True indicates that the element *should* take part in attention. dropout_p=self.attention_dropout if self.training else 0.0, # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
is_causal=is_causal, # enable_gqa=True, # gqa can not be used because mask requires XFORMERS and not support gqa - ) # -> (N, ..., L, Ev) + ) # -> (N, ..., L, Ev) # torch attn_output.shape [1, 28, 72, 128] attn_output = attn_output.transpose([0, 2, 1, 3]) - #import pdb; pdb.set_trace() - assert attn_output.shape[0] == 1 context_layer.append(attn_output) attn_output = context_layer = paddle.concat(context_layer, axis=0) attn_output = attn_output.transpose([0, 2, 1, 3]) - #print('attn_output', attn_output.shape) # [1, 74, 28, 128] [1, 1, 28, 128] + # print('attn_output', attn_output.shape) # [1, 74, 28, 128] [1, 1, 28, 128] attn_output = attn_output.reshape([bsz, q_len, self.hidden_size]) attn_output = self.o_proj(attn_output) @@ -523,7 +509,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: - ### TODO + # TODO: # if output_attentions: # # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. # logger.warning_once( @@ -539,9 +525,18 @@ def forward( # use_cache=use_cache, # ) - if self.is_hyper_enabled and image_embeds is not None: # if 1: + if self.is_hyper_enabled and image_embeds is not None: # 必走这个分支 - return self.hyperattention(hidden_states, attention_mask, position_ids, image_embeds, media_offset, past_key_value, output_attentions, use_cache) + return self.hyperattention( + hidden_states, + attention_mask, + position_ids, + image_embeds, + media_offset, + past_key_value, + output_attentions, + use_cache, + ) bsz, q_len, _ = hidden_states.shape @@ -550,31 +545,32 @@ def forward( key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) except: - hidden_states = hidden_states.astype('bfloat16') + hidden_states = hidden_states.astype("bfloat16") query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) query_states = query_states.reshape([bsz, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) key_states = key_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) - value_states = value_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) + value_states = value_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose( + [0, 2, 1, 3] + ) kv_seq_len = key_states.shape[-2] if past_key_value is not None: - #kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) kv_seq_len += past_key_value[0].shape[-2] cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: - #cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - #key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + # cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) key_states = paddle.concat([past_key_value[0], key_states], axis=2) value_states = paddle.concat([past_key_value[1], value_states], axis=2) past_key_value = (key_states, value_states) if use_cache else None - key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = 
repeat_kv(value_states, self.num_key_value_groups) @@ -591,16 +587,15 @@ def forward( # value_states = value_states.contiguous() attn_output = paddle.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask.astype(query_states.dtype), + query_states.transpose([0, 2, 1, 3]), # [1, 28, 74, 128] sum 21632. + key_states.transpose([0, 2, 1, 3]), # [1, 28, 74, 128] sum 335872. + value_states.transpose([0, 2, 1, 3]), # [1, 28, 74, 128] sum 1680. + attn_mask=attention_mask, dropout_p=self.attention_dropout if self.training else 0.0, # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. is_causal=self.is_causal and attention_mask is None and q_len > 1, ) - - attn_output = attn_output.transpose([0, 2, 1, 3]) + # [1, 74, 28, 128] sum 1408. attn_output = attn_output.reshape([bsz, q_len, self.hidden_size]) attn_output = self.o_proj(attn_output) @@ -608,12 +603,11 @@ def forward( return attn_output, None, past_key_value -from paddlenlp.transformers.qwen2.modeling import Qwen2Attention # Original Attention of Qwen2 QWEN2_ATTENTION_CLASSES = { "eager": Qwen2Attention, - "flash_attention_2": Qwen2Attention, #Qwen2FlashAttention2, - "sdpa": Qwen2Attention, #Qwen2SdpaAttention, + "flash_attention_2": Qwen2Attention, # Qwen2FlashAttention2, + "sdpa": Qwen2Attention, # Qwen2SdpaAttention, } @@ -627,11 +621,12 @@ def __init__(self, config: HyperQwen2Config, layer_idx: int): f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " "unexpected results may be encountered." ) - self.is_hyper_enabled = (layer_idx+1) in config.hyper_layers - if self.is_hyper_enabled: + self.is_hyper_enabled = (layer_idx + 1) in config.hyper_layers + # print('layer_idx', layer_idx, self.is_hyper_enabled) + if 1: # self.is_hyper_enabled: self.self_attn = HyperQwen2SdpaAttention(config, layer_idx, is_hyper_enabled=self.is_hyper_enabled) else: - #self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + # self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) self.self_attn = QWEN2_ATTENTION_CLASSES["flash_attention_2"](config, layer_idx) self.mlp = Qwen2MLP(config) @@ -664,26 +659,30 @@ def forward( """ residual = hidden_states hidden_states = self.input_layernorm(hidden_states) - + # Shared LayerNorm if image_embeds is not None and self.is_hyper_enabled: + # 134144 image_embeds = self.input_layernorm(image_embeds) + # 78336. media_kwargs = {"image_embeds": image_embeds, "media_offset": media_offset} else: image_embeds = media_offset = None media_kwargs = {} # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states.cast(paddle.bfloat16), + # hidden_states.sum 76.50000000 + hidden_states, self_attn_weights, present_key_value = self.self_attn( # -704. 2080. (48128., 240.) + hidden_states=hidden_states.cast(paddle.bfloat16), # [1, 74, 3584] sum -704. attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, - output_attentions=True, # TODO, paddlenlp默认是False,但是不返回self_attn_weights。这里output_attentions全局是false + output_attentions=True, # TODO, paddlenlp默认是False,但是不返回self_attn_weights。output_attentions全局是false,这里改成True是无影响的 use_cache=use_cache, - **media_kwargs, + **media_kwargs, # {} ) hidden_states = residual + hidden_states + # -1.71093750 + -704. 
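# Note: on hyper-enabled layers the same input_layernorm module is applied to both the text
# hidden states and image_embeds ("Shared LayerNorm" above), and media_kwargs carries
# image_embeds/media_offset into HyperQwen2SdpaAttention; on all other layers media_kwargs is
# empty, so the attention falls back to the plain text-only SDPA path before this residual add.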
# Fully Connected residual = hidden_states @@ -711,8 +710,8 @@ class Qwen2PreTrainedModel(paddlenlp.transformers.model_utils.PretrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["HyperQwen2DecoderLayer"] _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True + # _supports_flash_attn_2 = True + # _supports_sdpa = True _supports_cache_class = True def _init_weights(self, layer): @@ -745,12 +744,12 @@ def __init__(self, config: HyperQwen2Config): self.layers = nn.LayerList( [HyperQwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) - self._attn_implementation = 'flash_attention_2' #config._attn_implementation + self._attn_implementation = "flash_attention_2" # config._attn_implementation self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.gradient_checkpointing = False # Initialize weights and apply final processing - #self.post_init() + # self.post_init() def get_input_embeddings(self): return self.embed_tokens @@ -834,7 +833,7 @@ def forward( past_key_values_length = seq_length cache_length = 0 if past_key_values[0] is not None: - cache_length = past_key_values[0][0].shape[1] # + cache_length = past_key_values[0][0].shape[1] # past_key_values_length += cache_length # print('position_ids before', position_ids) @@ -844,7 +843,7 @@ def forward( ) position_ids = position_ids.unsqueeze(0).reshape([-1, seq_length]) else: - position_ids = position_ids.reshape([-1, seq_length]).astype(dtype="int64") + position_ids = position_ids.reshape([-1, seq_length]).astype(dtype="int64") # print('position_ids', position_ids) # print('seq_length', seq_length) @@ -863,15 +862,12 @@ def forward( beam_factor = batch_size // len(media_offset) assert batch_size % len(media_offset) == 0 media_offset = media_offset * beam_factor - image_embeds = repeat(image_embeds, 'B L D -> (factor B) L D', factor=beam_factor) + image_embeds = repeat(image_embeds, "B L D -> (factor B) L D", factor=beam_factor) # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None - next_decoder_cache = () ### not none - - # if attention_mask is not None: - # print('attention_mask', attention_mask.shape, attention_mask.sum().item()) + next_decoder_cache = () # not none for idx, decoder_layer in enumerate(self.layers): if output_hidden_states: @@ -885,7 +881,7 @@ def forward( position_ids=position_ids, image_embeds=image_embeds, media_offset=media_offset, - past_key_value=past_key_value, # not past_key_values + past_key_value=past_key_value, # not past_key_values output_attentions=output_attentions, use_cache=use_cache, ) @@ -895,8 +891,6 @@ def forward( hidden_states = layer_outputs[0] - # if use_cache: - # next_decoder_cache = layer_outputs[2 if output_attentions else 1] next_decoder_cache = next_decoder_cache + (layer_outputs[-1],) if use_cache else None if output_attentions: @@ -908,9 +902,6 @@ def forward( if output_hidden_states: all_hidden_states += (hidden_states,) - # next_cache = None - # if use_cache: - # next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache next_cache = next_decoder_cache if use_cache else None if not return_dict: @@ -933,7 +924,7 @@ def __init__(self, config): self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False) # Initialize weights and apply final processing - #self.post_init() + # self.post_init() def get_input_embeddings(self): return 
self.model.embed_tokens @@ -1002,25 +993,25 @@ def forward( # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, + input_ids=input_ids, # [1, 74] # [1, 1] + attention_mask=attention_mask, # [1, 74] # [1, 75] + position_ids=position_ids, # [1, 74] # [1, 1] past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - image_embeds=image_embeds, - media_offset=media_offset, + inputs_embeds=inputs_embeds, # none + image_embeds=image_embeds, # [7, 729, 3584] sum 134144. + media_offset=media_offset, # [[18, 24, 30, 36, 42, 48, 54]] use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, # ) - hidden_states = outputs[0] + hidden_states = outputs[0] # sum 6656 mean 0.02502441 try: logits = self.lm_head(hidden_states) except: logits = self.lm_head(hidden_states.cast(paddle.bfloat16)) - logits = logits.cast(paddle.float32) + logits = logits.cast(paddle.float32) # sum -5314405 mean -0.47356287 loss = None if labels is not None: @@ -1050,55 +1041,12 @@ def prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs ): batch_size, seq_length = input_ids.shape - attention_mask = paddle.ones((batch_size, seq_length), dtype=paddle.bool) - - # Omit tokens covered by past_key_values - if past_key_values is not None: - past_length = past_key_values[0][0].shape[1] # [1, 74, 4, 128] seq_len 74 - #print('input_ids before omitting', input_ids) - #import pdb; pdb.set_trace() - if past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - #print('input_ids', input_ids.shape, input_ids.sum().item()) - - # if isinstance(past_key_values, MultiHeadAttention.Cache): - # cache_length = past_key_values.get_seq_length() - # past_length = past_key_values.seen_tokens - # max_cache_length = past_key_values.get_max_length() - # else: - # cache_length = past_length = past_key_values[0][0].shape[2] - # max_cache_length = None - - # # Keep only the unprocessed tokens: - # # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # # input) - # if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - # input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # # input_ids based on the past_length. - # elif past_length < input_ids.shape[1]: - # input_ids = input_ids[:, past_length:] - # # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
- # if ( - # max_cache_length is not None - # and attention_mask is not None - # and cache_length + input_ids.shape[1] > max_cache_length - # ): - # attention_mask = attention_mask[:, -max_cache_length:] - - #print('attention_mask ////', attention_mask.shape, attention_mask.sum().item()) - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.astype(dtype="int64").cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - #print('position_ids ////', position_ids) + position_ids = kwargs.get("position_ids", paddle.arange(seq_length).expand((batch_size, seq_length))) + attention_mask = kwargs.get("attention_mask", None) + if past_key_values: + input_ids = input_ids[:, -1].unsqueeze(axis=-1) + position_ids = position_ids[:, -1].unsqueeze(-1) + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step if inputs_embeds is not None and past_key_values is None: model_inputs = {"inputs_embeds": inputs_embeds} @@ -1111,8 +1059,8 @@ def prepare_inputs_for_generation( "past_key_values": past_key_values, "use_cache": kwargs.get("use_cache"), "attention_mask": attention_mask, - 'image_embeds': kwargs.get('image_embeds'), - 'media_offset': kwargs.get('media_offset'), + "image_embeds": kwargs.get("image_embeds"), + "media_offset": kwargs.get("media_offset"), } ) return model_inputs diff --git a/paddlemix/models/mPLUGOwl3/modeling_mplugowl3.py b/paddlemix/models/mPLUGOwl3/modeling_mplugowl3.py index d6f5fe729..16415c96a 100644 --- a/paddlemix/models/mPLUGOwl3/modeling_mplugowl3.py +++ b/paddlemix/models/mPLUGOwl3/modeling_mplugowl3.py @@ -1,11 +1,11 @@ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
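Before the mPLUGOwl3 wrapper changes continue below, a minimal sketch, with toy sizes, of the text-to-visual mask that HyperQwen2SdpaAttention.hyperattention (reformatted above) builds from media_offset: a text token may attend to an image's patches only if that image starts at or before the token's position, and the ordinary causal text mask is appended on the right. All sizes and offsets here are hypothetical.

import paddle
from einops import repeat

q_len, kv_seq_len, length_each_img = 6, 6, 2            # toy sizes
media_starts = paddle.to_tensor([1, 4])                 # two images inserted at text positions 1 and 4
causal_mask = paddle.tril(paddle.ones([q_len, kv_seq_len])).astype(paddle.bool)
causal_mask = causal_mask.unsqueeze(0).unsqueeze(0)     # (1, 1, q_len, kv_seq_len)
matrix = paddle.arange(q_len).reshape([-1, 1])
t2vmask = ~(matrix < media_starts.reshape([1, -1]))     # True once the image start is <= the token position
t2vmask = repeat(t2vmask, "seq_t seq_v -> 1 1 seq_t (seq_v v_token)", v_token=length_each_img)
full_mask = paddle.concat([t2vmask, causal_mask], axis=3)   # (1, 1, q_len, num_images*v_token + kv_seq_len)

full_mask is then cast to the query dtype and passed as attn_mask to scaled_dot_product_attention, where True marks positions allowed to take part in attention.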
@@ -16,15 +16,16 @@ import paddle import paddle.nn as nn -import paddlenlp from paddlenlp.generation import TextIteratorStreamer from paddlenlp.transformers import Qwen2PretrainedModel +from ...processors.mplugowl3_processing import ( + mPLUGOwl3ImageProcessor, + mPLUGOwl3Processor, +) from .configuration_mplugowl3 import mPLUGOwl3Config -from .image_processing_mplugowl3 import mPLUGOwl3ImageProcessor from .modeling_hyper_qwen2 import HyperQwen2ForCausalLM from .modeling_navit_siglip import SigLipVisionTransformer -from .processing_mplugowl3 import mPLUGOwl3Processor class mPLUGOwl3PreTrainedModel(Qwen2PretrainedModel): @@ -40,21 +41,17 @@ def __init__(self, config): self.vision_dim = self.vision_model.embed_dim self.embed_dim = self.config.hidden_size self.vision2text_model = nn.Sequential( - nn.Linear(self.vision_dim, self.embed_dim), - nn.GELU(), - nn.Linear(self.embed_dim, self.embed_dim) + nn.Linear(self.vision_dim, self.embed_dim), nn.GELU(), nn.Linear(self.embed_dim, self.embed_dim) ) self.processor = None - self.terminators = ['<|im_end|>', '<|endoftext|>'] + self.terminators = ["<|im_end|>", "<|endoftext|>"] self.vision_batch_size = config.vision_batch_size def init_vision_module(self): - #self.config.vision_config._attn_implementation = self.config.vision_config._attn_implementation self.config.vision_config._attn_implementation = "flash_attention_2" model = SigLipVisionTransformer(self.config.vision_config) - - setattr(model, 'embed_dim', model.embeddings.embed_dim) - setattr(model, 'patch_size', model.embeddings.patch_size) + setattr(model, "embed_dim", model.embeddings.embed_dim) + setattr(model, "patch_size", model.embeddings.patch_size) return model def get_input_embeddings(self): @@ -84,6 +81,8 @@ def _small_batched_forward(self, pixel_values): end_idx = min(B, i + vision_batch_size) tmp_hs = self.vision_model(pixel_values[start_idx:end_idx], output_hidden_states=True).hidden_states[-2] image_forward_out.append(tmp_hs) + # image_forward_out[0].sum() + # [7, 729, 1152] sum -872448. vision_embedding = paddle.concat(image_forward_out, axis=0) assert vision_embedding.shape[0] == B return vision_embedding @@ -93,43 +92,38 @@ def forward_image(self, pixel_values): return None dtype = self.language_model.model.embed_tokens.weight.dtype image_embeds = self._small_batched_forward(pixel_values.to(dtype)) - # image_embeds = self.vision_model(pixel_values.to(dtype), output_hidden_states=True).hidden_states[-2] - + if self.vision2text_model is not None: image_embeds = self.vision2text_model(image_embeds) + # [7, 729, 3584] sum 134144. 
mean 0.00735474 else: pass - + return image_embeds def forward(self, pixel_values=None, **kwargs): image_embeds = self.forward_image(pixel_values) - - return self.language_model( - image_embeds=image_embeds, - **kwargs - ) - + + return self.language_model(image_embeds=image_embeds, **kwargs) + def _decode(self, input_ids, image_embeds, media_offset, tokenizer, attention_mask, decode_text=False, **kwargs): terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators] - - # ### must add position_ids, paddlenlp bug - # batch_size, seq_length = attention_mask.shape - # position_ids = paddle.arange(seq_length).expand((batch_size, seq_length)) - # ### - + + # Note: must add position_ids, paddlenlp bug + batch_size, seq_length = input_ids.shape + position_ids = paddle.arange(seq_length).expand((batch_size, seq_length)) + output = self.language_model.generate( input_ids=input_ids, image_embeds=image_embeds, media_offset=media_offset, pad_token_id=0, eos_token_id=terminators, - #position_ids=position_ids, #### + position_ids=position_ids, # Note: must add position_ids attention_mask=attention_mask, **kwargs, )[0] - #output = output[:,input_ids.shape[1]:] # paddle no need this - print('_decode output', output) + # output = output[:,input_ids.shape[1]:] # paddle no need this if decode_text: return self._decode_text(output, tokenizer) return output @@ -138,18 +132,18 @@ def _decode_stream(self, input_ids, image_embeds, media_offset, tokenizer, **kwa terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators] streamer = TextIteratorStreamer(tokenizer=tokenizer) generation_kwargs = { - 'input_ids': input_ids, - 'image_embeds': image_embeds, - 'media_offset': media_offset, - 'pad_token_id': 0, - 'eos_token_id': terminators, - 'streamer': streamer + "input_ids": input_ids, + "image_embeds": image_embeds, + "media_offset": media_offset, + "pad_token_id": 0, + "eos_token_id": terminators, + "streamer": streamer, } generation_kwargs.update(kwargs) thread = Thread(target=self.language_model.generate, kwargs=generation_kwargs) thread.start() - + return streamer def _decode_text(self, result_ids, tokenizer): @@ -185,10 +179,24 @@ def generate( image_embeds = self.forward_image(pixel_values) if stream: - result = self._decode_stream(input_ids=input_ids, image_embeds=image_embeds, media_offset=media_offset, tokenizer=tokenizer, **kwargs) + result = self._decode_stream( + input_ids=input_ids, + image_embeds=image_embeds, + media_offset=media_offset, + tokenizer=tokenizer, + **kwargs, + ) else: - result = self._decode(input_ids=input_ids, image_embeds=image_embeds, media_offset=media_offset, tokenizer=tokenizer, attention_mask=attention_mask, decode_text=decode_text, **kwargs) - + result = self._decode( + input_ids=input_ids, + image_embeds=image_embeds, + media_offset=media_offset, + tokenizer=tokenizer, + attention_mask=attention_mask, + decode_text=decode_text, + **kwargs, + ) + return result def chat( @@ -202,24 +210,26 @@ def chat( min_new_tokens=0, sampling=True, max_inp_length=8192, - system_prompt='', + system_prompt="", stream=False, max_slice_nums=None, use_image_id=None, **kwargs ): - cut_flag = kwargs.get('kwargs', True) + cut_flag = kwargs.get("kwargs", True) if processor is None: if self.processor is None: processor = self.init_processor(tokenizer) else: processor = self.processor inputs = processor(messages, images=images, videos=videos, cut_enable=cut_flag) - inputs.update({ - 'tokenizer': tokenizer, - 'max_new_tokens': max_new_tokens, - # 'stream':True, - }) + 
inputs.update( + { + "tokenizer": tokenizer, + "max_new_tokens": max_new_tokens, + # 'stream':True, + } + ) if sampling: generation_config = { "top_p": 0.8, @@ -233,27 +243,22 @@ def chat( "num_beams": 3, # "repetition_penalty": 1.2, } - + if min_new_tokens > 0: - generation_config['min_new_tokens'] = min_new_tokens + generation_config["min_new_tokens"] = min_new_tokens - generation_config.update( - (k, kwargs[k]) for k in generation_config.keys() & kwargs.keys() - ) + generation_config.update((k, kwargs[k]) for k in generation_config.keys() & kwargs.keys()) with paddle.no_grad(): - res = self.generate( - **inputs, - stream=stream, - decode_text=True, - **generation_config - ) - + res = self.generate(**inputs, stream=stream, decode_text=True, **generation_config) + if stream: + def stream_gen(): for text in res: for term in self.terminators: - text = text.replace(term, '') + text = text.replace(term, "") yield text + return stream_gen() else: diff --git a/paddlemix/models/mPLUGOwl3/modeling_navit_siglip.py b/paddlemix/models/mPLUGOwl3/modeling_navit_siglip.py index e4ac2b2a7..1f399d640 100644 --- a/paddlemix/models/mPLUGOwl3/modeling_navit_siglip.py +++ b/paddlemix/models/mPLUGOwl3/modeling_navit_siglip.py @@ -12,17 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - -import paddle - -""" PyTorch Siglip model. """ import math +import os import warnings from dataclasses import dataclass from typing import Optional, Tuple, Union import numpy as np +import paddle from paddle import nn from paddlenlp.transformers import PretrainedConfig from paddlenlp.transformers.activations import ACT2FN @@ -36,8 +33,6 @@ from paddlemix.models.flash_attn_utils import has_flash_attn_func from paddlemix.utils.initializer import _calculate_fan_in_and_fan_out -from .bert_padding import pad_input, unpad_input - flash_attn_func, flash_attn_varlen_func = has_flash_attn_func() @@ -140,20 +135,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], return cls.from_dict(config_dict, **kwargs) -# _CHECKPOINT_FOR_DOC = 'google/siglip-base-patch16-224' - - -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(axis=-1, dtype="int32") - paddle.utils.try_import("warnings").warn("Now, the return shape is inconsistent with torch when as_tuple is True") - indices = paddle.nonzero(x=attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = nn.functional.pad( - x=paddle.cumsum(x=seqlens_in_batch, axis=0, dtype="int32"), pad=(1, 0), pad_from_left_axis=False - ) - return indices, cu_seqlens, max_seqlen_in_batch - - def _trunc_normal_(tensor, mean, std, a, b): # 确保mean是浮点数 mean = float(mean) @@ -486,33 +467,9 @@ def _flash_attention_forward( # Contains at least one padding token in the sequence causal = self.is_causal and query_length != 1 - head_dim = query_states.shape[-1] - softmax_scale = head_dim**-0.5 # TODO: 需要手动加上 - if attention_mask is not None: - batch_size = query_states.shape[0] # [2, 3383, 16, 128] + raise NotImplementedError("Currently only support single image infer and attention_mask is none") - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = unpad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - attn_output_unpad = flash_attn_varlen_func( # TODO: 
flash_attn_unpadded - query_states, # [5998, 16, 128] - key_states, # [5998, 8, 128] - value_states, # [5998, 8, 128] - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - scale=softmax_scale, # not softmax_scale= - dropout=dropout, - causal=causal, - )[0] - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) else: attn_output = flash_attn_func( query_states, @@ -641,34 +598,6 @@ def _init_weights(self, module): module.weight.set_value(paddle.ones_like(module.weight)) -SIGLIP_START_DOCSTRING = """ - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - Parameters: - config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" -SIGLIP_VISION_INPUTS_DOCSTRING = """ - Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using - [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - class SigLipEncoder(nn.Layer): """ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. 
Each layer is a @@ -723,12 +652,7 @@ def forward( for encoder_layer in self.layers: if output_hidden_states: encoder_states = encoder_states + (hidden_states,) - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - encoder_layer.__call__, hidden_states, attention_mask, output_attentions - ) - else: - layer_outputs = encoder_layer(hidden_states, attention_mask, output_attentions=output_attentions) + layer_outputs = encoder_layer(hidden_states, attention_mask, output_attentions=output_attentions) hidden_states = layer_outputs[0] if output_attentions: all_attentions = all_attentions + (layer_outputs[1],) @@ -742,57 +666,6 @@ def forward( ) -# # Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position -# def _prepare_4d_causal_attention_mask_with_cache_position( -# attention_mask: paddle.Tensor, -# sequence_length: int, -# target_length: int, -# dtype: paddle.dtype, -# min_dtype: float, -# cache_position: paddle.Tensor, -# batch_size: int, -# ): -# """ -# Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape -# `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. - -# Args: -# attention_mask (`paddle.Tensor`): -# A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. -# sequence_length (`int`): -# The sequence length being processed. -# target_length (`int`): -# The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. -# dtype (`paddle.dtype`): -# The dtype to use for the 4D attention mask. -# min_dtype (`float`): -# The minimum value representable with the dtype `dtype`. -# cache_position (`paddle.Tensor`): -# Indices depicting the position of the input sequence tokens in the sequence. -# batch_size (`paddle.Tensor`): -# Batch size. -# """ -# if attention_mask is not None and attention_mask.dim() == 4: -# # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
-# causal_mask = attention_mask -# else: -# causal_mask = paddle.full([sequence_length, target_length], fill_value=min_dtype, dtype=dtype) -# if sequence_length != 1: -# causal_mask = paddle.triu(x=causal_mask, diagonal=1) -# causal_mask *= paddle.arange(target_length) > cache_position.reshape([-1, 1]) -# causal_mask = causal_mask[None, None, :, :].expand(shape=[batch_size, 1, -1, -1]) -# if attention_mask is not None: -# causal_mask = causal_mask.clone() -# mask_length = tuple(attention_mask.shape)[-1] -# padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] -# padding_mask = padding_mask == 0 -# causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( -# mask=padding_mask, value=min_dtype -# ) - -# return causal_mask - - class SigLipVisionTransformer(SigLipPreTrainedModel): config_class = SigLipVisionConfig main_input_name = "pixel_values" @@ -806,7 +679,6 @@ def __init__(self, config: SigLipVisionConfig): self.encoder = SigLipEncoder(config) self.post_layernorm = nn.LayerNorm(normalized_shape=embed_dim, epsilon=config.layer_norm_eps) self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" - # self.post_init() def get_input_embeddings(self) -> nn.Layer: diff --git a/paddlemix/models/mPLUGOwl3/processing_mplugowl3.py b/paddlemix/models/mPLUGOwl3/processing_mplugowl3.py deleted file mode 100644 index cd5013f59..000000000 --- a/paddlemix/models/mPLUGOwl3/processing_mplugowl3.py +++ /dev/null @@ -1,397 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddlenlp -from paddlenlp.transformers.processing_utils import ProcessorMixin - -""" -Processor class for mPLUGOwl3. 
-""" -import re -import warnings -from typing import Any, Dict, List, Optional, Union - -# from .image_processing_mplugowl3 import mPLUGOwl3BatchFeature, mPLUGOwl3ImageProcessor -from .image_processing_mplugowl3 import ( - TensorType, - mPLUGOwl3BatchFeature, - mPLUGOwl3ImageProcessor, -) - -OWL_MEDIA_TOKEN = ["<|image|>"] - - -class MediaIndicesHelper(): - def __init__(self, tokenizer) -> None: - self.media_position = [] - self.tokenizer = tokenizer - - def has_media(self, text, media_tokens=None): - if media_tokens is None: - media_tokens = OWL_MEDIA_TOKEN - has_media_flag = any([media_token == text for media_token in media_tokens]) - if any([media_token in text for media_token in media_tokens]): - # 不允许出现text中包含media token但是不仅仅是media token。 media token必须单独为一个chunk - assert has_media_flag, text - return has_media_flag - - def add_media(self, text_chunk, text=None, tokenize_fn=None): - # cross - assert tokenize_fn is not None - assert text is not None - assert text in OWL_MEDIA_TOKEN - media_token_ids = tokenize_fn(text) - start = len(text_chunk) - end = start + len(media_token_ids) - self.media_position.append([start, end]) - text_chunk.extend(media_token_ids) - return len(media_token_ids) - - def cal_media_offset(self, input_ids): - if len(self.media_position) == 0: - return paddle.ones_like(input_ids)*(-1000000) - - media_starts = paddle.to_tensor([_[0] for _ in self.media_position]).reshape([1,-1]) - rng = paddle.arange(input_ids.shape[0]).reshape([-1,1]) - matrix = (rng > media_starts).sum(axis=1) - - return matrix - - def len_images(self,): - return len(self.media_position) - - -class mPLUGOwl3Processor(ProcessorMixin): - r""" - Args: - image_processor ([`mPLUGOwl3ImageProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`LlamaTokenizerWrapper`], *optional*): - The tokenizer is a required input. 
- """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "mPLUGOwl3ImageProcessor" - tokenizer_class = "AutoTokenizer" - - def __init__(self, image_processor: mPLUGOwl3ImageProcessor = None, tokenizer=None, prompt_style='chatml', inference_mode=True, addition_eod="<|endoftext|>"): - super().__init__(image_processor, tokenizer) - self.image_processor: mPLUGOwl3ImageProcessor - self.prompt_style = prompt_style - self.inference_mode = inference_mode - self.media_tokens = ["<|image|>"] - self.addition_eod = addition_eod - - def build_text_qwen(self, messages): - # role should be within ['system', 'user', 'assistant'] - im_start, im_end = '<|im_start|>', '<|im_end|>' - - text = [] - for num_turn, message in enumerate(messages): - if num_turn == 0 and message['role'] != 'system': - if self.prompt_style != 'plain': - text.append({ - "text": f"{im_start}system\n{im_end}", - "label": 0 - }) - if message['role'] == 'system': - if self.prompt_style != 'plain': - text.append({ - "text": f"{im_start}system\n{message['content']}{im_end}", - "label": 0 - }) - elif message['role'] == 'user': - if self.prompt_style != 'plain': - content = f"\n{im_start}user\n{message['content']}{im_end}" - else: - content = message['content'] - pattern = '|'.join(map(re.escape, self.media_tokens)) - chunk_strs = re.split(f'({pattern})', content) - for chunk_str in chunk_strs: - text.append({ - "text": chunk_str, - "label": 0 - }) - - elif message['role'] == 'assistant': - if self.prompt_style != 'plain': - text.append({"text": f"\n{im_start}assistant\n", "label": 0}) - text.append({"text": f"{message['content']}{im_end}", "label": 1}) - else: - text.append({"text": f"{message['content']}", "label": 1}) - text.append({"text": self.addition_eod, "label": 1}) - else: - raise NotImplementedError - if self.inference_mode: - while text and text[-1]['label']==1: # 只要列表非空且最后一个元素满足条件 - text.pop() # 就移除最后一个元素 - return text - - def wrapped_tokenize(self, text): - return self.tokenizer(text).input_ids - - def encode_text_sft(self, texts): - # output enc_chunk - - enc_chunk = [] - label_chunk = [] - enc_length = 0 - - num_images = 0 - - media_helper = MediaIndicesHelper(tokenizer=self.tokenizer) - for current_ti, text_chunk in enumerate(texts): - - text = text_chunk["text"] - label = text_chunk["label"] - - if not media_helper.has_media(text): - curr_chunk = self.wrapped_tokenize(text) - if label == 1: - enc_length += len(curr_chunk) - enc_chunk += curr_chunk - label_chunk += [label] * len(curr_chunk) - else: - - enc_length += len(curr_chunk) - enc_chunk += curr_chunk - label_chunk += [label] * len(curr_chunk) - # For media tokens - else: - - add_length = media_helper.add_media( - enc_chunk, - text=text, - tokenize_fn=self.wrapped_tokenize) - enc_length += add_length - label_chunk += [label] * add_length - # enc_chunk.extend([self.media_tokens[text]] * self.media_lengths[text]) - # enc_length += self.media_lengths[text] - # label_chunk += [label] * self.media_lengths[text] - num_images += 1 - - enc_chunk = paddle.to_tensor(enc_chunk).astype(dtype="int64") - # media_offset = [] - # media_before = 0 - # for i,_ in enumerate([media_helper]): - # mo = _.cal_media_offset(enc_chunk) - # media_offset.append(torch.cat([(torch.ones(mo.shape[0],1)*media_before).long().to(mo.device), (mo+media_before).unsqueeze(1)], dim=1)) # L 2 - - # media_before += _.len_images() - # media_offset = torch.stack(media_offset, dim=0) - media_offset = [paddle.to_tensor([_[0] for _ in media_helper.media_position]).astype(dtype="int64")] - 
return { - 'input_ids': enc_chunk.unsqueeze(0), - 'media_offset': media_offset, - } - - - def __call__( - self, - messages, - images = None, - videos = None, - max_length: Optional[int] = None, - cut_enable=True, - return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, - **kwargs - ) -> mPLUGOwl3BatchFeature: - medias = [] - if videos is not None: - medias.extend([{'type': 'video', 'content': video, 'use_video_span': True} for video in videos]) - if images is not None: - medias.extend([{'type':'image', 'content': image} for image in images]) - - if len(medias): - image_tensor_list = [] - pattern = r"(<\|image\|>|<\|video\|>)" - # 存在媒体 - image_token_ptr = 0 - media_layout = [] - for message in messages: - text_list = re.split(pattern, message['content']) - text = '' - for text_content in text_list: - if text_content in ['<|image|>', '<|video|>']: - media_item = medias[image_token_ptr] - image_token_ptr += 1 - if text_content == '<|image|>': - assert media_item['type'] == 'image' - image = media_item['content'] - - image_inputs = self.image_processor([image], cut_enable=cut_enable, return_tensors=return_tensors) - if image_inputs.get('cut_shape',None) is not None: - cut_shape = image_inputs['cut_shape'] - cut_text = self.image_processor.cut_prompt_template(img_token='<|image|>', h=cut_shape[0][0], w=cut_shape[0][1]) - text += cut_text - image_tensor_list.append(image_inputs['pixel_values']) - else: - text += text_content - image_tensor_list.append(image_inputs['pixel_values']) - elif text_content == '<|video|>': - assert media_item['type'] == 'video' - video = media_item['content'] - use_video_span = media_item['use_video_span'] - image_tensor = self.image_processor(video, cut_enable=False)['pixel_values'] - image_tensor_list.append(image_tensor) - num_video_frame = image_tensor.shape[0] - if use_video_span: - text_content = '<|start_video_frame|>'+'<|image|>'*num_video_frame+'<|end_video_frame|>' - else: - text_content = '<|image|>'*num_video_frame - text += text_content - else: - text += text_content - message['content'] = text - assert image_token_ptr == len(medias), (image_token_ptr,len(medias)) # 保证图和token数目一致 - assert all(len(_.shape) == 4 for _ in image_tensor_list), [_.shape for _ in image_tensor_list] - num_image_tokens = sum([_['content'].count('<|image|>')for _ in messages]) - num_image_shapes = sum([_.shape[0] for _ in image_tensor_list]) - assert num_image_tokens == num_image_shapes, (messages, [_.shape for _ in image_tensor_list]) - - image_tensor_list = paddle.concat(image_tensor_list, axis=0) - - text = self.build_text_qwen(messages) - model_inputs = self.encode_text_sft(text) - - if len(medias) is not None: - model_inputs.update({'pixel_values': image_tensor_list}) - # if 'cut_shape' in model_inputs: - # model_inputs.pop('cut_shape') - # if 'cut_shape_indices' in model_inputs: - # model_inputs.pop('cut_shape_indices') - return mPLUGOwl3BatchFeature(model_inputs) - - def check_media(self, images, messages): - media_num = 0 if images is None else len(images) - media_count = sum([message['content'].count('<|image|>') for message in messages]) - assert media_num == media_count - - # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama - def batch_decode(self, *args, **kwargs): - """ - This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please - refer to the docstring of this method for more information. 
- """ - output_ids = args[0] - result_text = [] - for result in output_ids: - result = result[result != 0] - if result[0] == self.tokenizer.bos_id: - result = result[1:] - if result[-1] == self.tokenizer.eos_id: - result = result[:-1] - result_text.append(self.tokenizer.decode(result, *args[1:], **kwargs).strip()) - return result_text - # return self.tokenizer.batch_decode(*args, **kwargs) - - # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama - def decode(self, *args, **kwargs): - """ - This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to - the docstring of this method for more information. - """ - result = args[0] - result = result[result != 0] - if result[0] == self.tokenizer.bos_id: - result = result[1:] - if result[-1] == self.tokenizer.eos_id or (hasattr(self.tokenizer, "eot_id") and result[-1] == self.tokenizer.eot_id): - result = result[:-1] - return self.tokenizer.decode(result, *args[1:], **kwargs).strip() - - def _convert( - self, input_str, max_inp_length: Optional[int] = None - ): - if self.version > 2.5 or not getattr(self.tokenizer, "add_bos_token", False): - input_ids = self.tokenizer.encode(input_str) - else: - input_ids = [self.tokenizer.bos_id] + self.tokenizer.encode(input_str) - if max_inp_length is not None: - input_ids = input_ids[:max_inp_length] - input_ids = paddle.to_tensor(data=input_ids, dtype="int32") - - start_cond = (input_ids == self.tokenizer.im_start_id) | (input_ids == self.tokenizer.slice_start_id) - end_cond = (input_ids == self.tokenizer.im_end_id) | (input_ids == self.tokenizer.slice_end_id) - - image_start_tokens = paddle.where(start_cond)[0] ### or paddle.nonzero(start_cond)[:, 0] - image_start_tokens += 1 - image_end_tokens = paddle.where(end_cond)[0] - - valid_image_nums = max(len(image_start_tokens), len(image_end_tokens)) - - image_bounds = paddle.hstack( - [ - image_start_tokens[:valid_image_nums].unsqueeze(-1), - image_end_tokens[:valid_image_nums].unsqueeze(-1), - ] - ) - return input_ids, image_bounds - - - @property - # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names - def model_input_names(self): - tokenizer_input_names = self.tokenizer.model_input_names - image_processor_input_names = self.image_processor.model_input_names - return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) - - - def pad(self, inputs, max_length=None, padding_value=0, padding_side="left"): - items = [] - if isinstance(inputs[0], list): - assert isinstance(inputs[0][0], paddle.Tensor) - for it in inputs: - for tr in it: - items.append(tr) - else: - assert isinstance(inputs[0], paddle.Tensor) - items = inputs - - batch_size = len(items) - shape = items[0].shape - dim = len(shape) - assert dim <= 2 - if max_length is None: - max_length = 0 - max_length = max(max_length, max(item.shape[-1] for item in items)) - min_length = min(item.shape[-1] for item in items) - dtype = items[0].dtype - - if dim == 0: - return paddle.stack([item for item in items], axis=0), [0] - elif dim == 1: - if max_length == min_length: - return paddle.stack([item for item in items], axis=0), [0] * batch_size - tensor = paddle.zeros((batch_size, max_length), dtype=dtype) + padding_value - else: - tensor = ( - paddle.zeros((batch_size, max_length, shape[-1]), dtype=dtype) - + padding_value - ) - - padding_length = [] - for i, item in enumerate(items): - if dim == 1: - if padding_side == "left": - tensor[i, -len(item) :] = item.clone() - 
else: - tensor[i, : len(item)] = item.clone() - elif dim == 2: - if padding_side == "left": - tensor[i, -len(item) :, :] = item.clone() - else: - tensor[i, : len(item), :] = item.clone() - padding_length.append(tensor.shape[-1] - len(item)) - - return tensor, padding_length diff --git a/paddlemix/processors/__init__.py b/paddlemix/processors/__init__.py index 7a05f5974..1d9ee61eb 100644 --- a/paddlemix/processors/__init__.py +++ b/paddlemix/processors/__init__.py @@ -16,21 +16,22 @@ from .blip_processing import * from .clip_processing import * from .eva02_processing import * +from .got_process import * from .groundingdino_processing import * +from .image_processing_minicpmv import * from .imagebind_processing import * from .internlm_xcomposer2_processing import * from .internvl_processing import * +from .janus_processing import * from .llava_next_processing import * from .llava_processing import * from .minigpt4_image_processing import * from .minigpt4_processing import * +from .mplugowl3_processing import * +from .processing_minicpmv import * from .qwen2_vl_processing import * from .qwen_vl_processing import * from .sam_processing import * from .tokenizer import SimpleTokenizer, tokenize from .visualglm_image_processing import * from .visualglm_processing import * -from .image_processing_minicpmv import * -from .processing_minicpmv import * -from .janus_processing import * -from .got_process import * diff --git a/paddlemix/processors/mplugowl3_processing.py b/paddlemix/processors/mplugowl3_processing.py new file mode 100644 index 000000000..93e9b4611 --- /dev/null +++ b/paddlemix/processors/mplugowl3_processing.py @@ -0,0 +1,824 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import random +import re +from enum import Enum +from typing import Any, Dict, List, Optional, Union + +import paddle +import paddle.vision.transforms as transforms +from einops import rearrange, repeat +from paddle.vision.transforms import Resize +from paddlenlp.transformers.image_processing_utils import ( + BaseImageProcessor, + BatchFeature, +) +from paddlenlp.transformers.processing_utils import ProcessorMixin +from PIL import Image + +OWL_MEDIA_TOKEN = ["<|image|>"] + + +def recursive_converter(converter, value): + if isinstance(value, list): + new_value = [] + for v in value: + new_value += [recursive_converter(converter, v)] + return new_value + else: + return converter(value) + + +def box_area(boxes): + # 获取边界框的宽度和高度 + width = boxes[:, 2] - boxes[:, 0] + height = boxes[:, 3] - boxes[:, 1] + # 计算面积 + area = width * height + return area + + +def custom_max(a, b): + return paddle.where(a > b, a, b) + + +def custom_min(a, b): + return paddle.where(a < b, a, b) + + +def box_iou(boxes1, area1, boxes2, eps=1e-05): + # >>>>>> area2 = torchvision.ops.boxes.box_area(boxes2) + area1 = area1.astype("float32") + boxes1 = boxes1.astype("float32") + boxes2 = boxes2.astype("float32") + + area2 = box_area(boxes2).astype("float32") + lt = custom_max(boxes1[:, None, :2], boxes2[:, :2]) + rb = custom_min(boxes1[:, None, 2:], boxes2[:, 2:]) + wh = (rb - lt).clip(min=0) + inter = wh[:, :, 0] * wh[:, :, 1] + union = area1[:, None] + area2 - inter + iou = inter / (union + eps) + return iou, union + + +# def box_iou(boxes1, area1, boxes2, eps=1e-5): +# area2 = box_area(boxes2) + +# lt = paddle.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] +# rb = paddle.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + +# wh = (rb - lt).clip(min=0) # [N,M,2] +# inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] + +# union = area1[:, None] + area2 - inter + +# iou = inter / (union + eps) +# return iou, union + + +available_anchor_strategy = ["docowl", "random", "highest", "last", "llava"] + +grid_dict = { + "grid_33": [ + (1, 1), + (1, 2), + (2, 1), + (1, 3), + (3, 1), + (2, 2), + (1, 4), + (4, 1), + (1, 5), + (5, 1), + (1, 6), + (6, 1), + (2, 3), + (3, 2), + (1, 7), + (7, 1), + (4, 2), + (2, 4), + (1, 8), + (8, 1), + (3, 3), + (1, 9), + (9, 1), + ], + "grid_squ_3x3": [(1, 1), (2, 2), (3, 3)], + "grid_squ_4": [(2, 2), (1, 3), (1, 4), (3, 1), (4, 1)], + "grid_squ_6": [(2, 2), (1, 3), (1, 4), (3, 1), (4, 1), (2, 3), (3, 2)], + "grid_squ_2": [(2, 1)], + "grid_squ_9": [ + (1, 1), + (1, 2), + (2, 1), + (1, 3), + (3, 1), + (2, 2), + (1, 4), + (4, 1), + (1, 5), + (5, 1), + (1, 6), + (6, 1), + (2, 3), + (3, 2), + (1, 7), + (7, 1), + (4, 2), + (2, 4), + (1, 8), + (8, 1), + (3, 3), + (1, 9), + (9, 1), + ], +} + + +cut_prompt_template_dict = { + 'v0': lambda img_token, h, w: f''.join([f"{img_token}" for i in range(h) for j in range(w)]), + 'v1': lambda img_token, h, w: f'Cut to {h} rows {w} columns, '+ ' '.join([f"subimg({i},{j}){img_token}"for i in range(h) for j in range(w)]), + 'v1_global': lambda img_token, h, w: f'Cut to {h} rows {w} columns with a global view, '+ ' '.join([f"subimg({i},{j}){img_token}"for i in range(h) for j in range(w)]+[f"global_view{img_token}"]), + 'v2_global': lambda img_token, h, w: f'Cut to {h} rows {w} columns with a global view\n'+ '\n'.join([' '.join([f"subimg({i},{j}){img_token}" for j in range(w)]) for i in range(h)])+f"\nglobal_view{img_token}", + 'v3': lambda img_token, h, w: f'<|start_cut|>{h}*{w}'+ ' '.join([f"{img_token}"for i in range(h) for j in range(w)])+'<|end_cut|>', + 'v3_global': lambda 
img_token, h, w: f'<|start_cut|>{h}*{w}\n'+ '\n'.join([' '.join([f"{img_token}" for j in range(w)]) for i in range(h)])+f'\n{img_token}<|end_cut|>', +} + + +def anchor_rank(anchors, anchors_areas, input_image_size, eps=1e-5): + # anchors x1 y1 x2 y2 + + # image_size: (h, w) + # xyxy + input_image_bbox = paddle.to_tensor([0, 0, input_image_size[1], input_image_size[0]]).unsqueeze(0) + + boxes1 = anchors + boxes2 = input_image_bbox + boxes3 = anchors.clone() + # y2 + boxes3[:, 3] = input_image_size[0] / input_image_size[1] * anchors[:, 2] # 用于算分辨率无关的iou + + area1 = anchors_areas + + iou, _ = box_iou(boxes1, area1, boxes2) + iou = iou.squeeze(1) + shape_iou, _ = box_iou(boxes1, area1, boxes3) + shape_iou = shape_iou.diag() + # 优先匹配形状接近 再匹配分辨率接近 + index = paddle.argmax(shape_iou * 100 + iou, axis=0) + return index + + +def select_best_resolution(anchors, anchors_areas, input_image_size): # TODO For a futher check + """ + Selects the best resolution from a list of possible resolutions based on the original size. + + Args: + original_size (tuple): The original size of the image in the format (width, height). + possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. + + Returns: + tuple: The best fit resolution in the format (width, height). + """ + original_size = (input_image_size[1], input_image_size[0]) + possible_resolutions = [(_[2], _[3]) for _ in anchors] # xyxy -> w,h + + original_width, original_height = original_size + # best_fit = None + max_effective_resolution = 0 + min_wasted_resolution = float("inf") + + index = 0 + for i, (width, height) in enumerate(possible_resolutions): + scale = min(width / original_width, height / original_height) + downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale) + effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height) + wasted_resolution = (width * height) - effective_resolution + + if effective_resolution > max_effective_resolution or ( + effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution + ): + max_effective_resolution = effective_resolution + min_wasted_resolution = wasted_resolution + # best_fit = (width, height) + index = i + + return index + + +def build_cut_shape_indices(cut_shape): + # cut_shape: a list of (nh,nw) + cut_shape_indices = [] + for shape in cut_shape: + n = shape[0] * shape[1] + indices = paddle.concat( + [repeat(paddle.to_tensor(shape), "l -> n l", n=n), paddle.arange(n).unsqueeze(1)], axis=1 + ) + assert indices.shape[0] == n + assert indices.shape[1] == 3 # nh,nw,idx + + cut_shape_indices.append(indices) + cut_shape_indices = paddle.concat(cut_shape_indices, axis=0).astype("int64") + return cut_shape_indices + + +class AnchorResize(paddle.nn.Layer): + def __init__(self, image_size, anchors, interpolation="bilinear", antialias=None, anchor_strategy="docowl"): + super().__init__() + self.image_size = image_size + # xyxy + self.anchors = paddle.to_tensor( + [[0, 0, _[1] * image_size[1], _[0] * image_size[0]] for _ in anchors], + ) + + self.anchor_areas = box_area(self.anchors) + + self.interpolation = interpolation + self.antialias = antialias + self.anchor_strategy = anchor_strategy + assert self.anchor_strategy in available_anchor_strategy + + def resize_global(self, img): + transform = Resize(size=self.image_size, interpolation=self.interpolation) + return transform(img) + + def forward(self, img, skip_resize=False): + """ + Args: + img 
(PIL Image or Tensor): Image to be scaled. + + Returns: + PIL Image or Tensor: Rescaled image. + """ + if self.anchor_strategy == "docowl": + selected_anchor = anchor_rank(self.anchors, self.anchor_areas, (img.size[1], img.size[0])) + elif self.anchor_strategy == "random": + selected_anchor = random.randint(0, len(self.anchors) - 1) + elif self.anchor_strategy == "highest": + # 选面积最大的 在这个基础上 尽可能选最方正的 + selected_anchor = paddle.argmax( + self.anchors[:, 2] * self.anchors[:, 3] * 100 - paddle.abs(self.anchors[:, 2] - self.anchors[:, 3]) + ) + elif self.anchor_strategy == "last": + selected_anchor = len(self.anchors) - 1 + elif self.anchor_strategy == "llava": + selected_anchor = select_best_resolution(self.anchors, self.anchor_areas, (img.size[1], img.size[0])) + else: + selected_anchor = None + assert selected_anchor is not None + + target_size = self.anchors[selected_anchor][2:].tolist() # w,h + if skip_resize: + # for debug + return selected_anchor + # return F.resize(img, [target_size[1],target_size[0]], self.interpolation, max_size=None, antialias=self.antialias), selected_anchor + # image_np = np.array(img) + # image_tensor = paddle.to_tensor(image_np, dtype="float32") + # image_tensor = image_tensor.transpose([2, 0, 1]) # 变成 (3, 500, 500) + # if self.interpolation == "bilinear" or "bicubic": + # image_tensor = image_tensor.unsqueeze(0) # 变成 (1, 3, 500, 500) + transform = Resize(size=[target_size[1], target_size[0]], interpolation=self.interpolation) + return (transform(img), selected_anchor) + # return ( + # F.interpolate( + # image_tensor, size=[target_size[1], target_size[0]], mode=self.interpolation, align_corners=False + # )[0], + # selected_anchor, + # ) + + def __repr__(self) -> str: + detail = f"(size={self.image_size}, anchor={self.anchors}, interpolation={self.interpolation.value}, antialias={self.antialias})" + return f"{self.__class__.__name__}{detail}" + + +class CutMixin: + def __init__( + self, + cut_cfg={ + "anchors": "grid_squ_6", + "anchor_strategy": "docowl", + "cut_prompt": "v3", + "add_global": True, + "cut_prob": 1.0, + }, + ) -> None: + if cut_cfg is None: + self.cut_enable = False + return + else: + self.cut_enable = True + image_size = self.image_size + anchors = cut_cfg.get("anchors", "grid_33") + anchor_strategy = cut_cfg.get("anchor_strategy", "docowl") + cut_prompt = cut_cfg.get("cut_prompt", "v0") + self.cut_prob = cut_cfg.get("cut_prob", 1.0) + + self.force_shape_cut = cut_cfg.get("force_shape_cut", False) + force_shape_cut_anchors = cut_cfg.get("force_shape_cut_anchors", "force_shape_cut_anchors") + + self.add_global = cut_cfg.get("add_global", False) + + # h,w + if isinstance(image_size, int): + image_size = (image_size, image_size) + self.image_size = image_size + + if anchors in grid_dict: + anchors = grid_dict[anchors] + else: + anchors = eval(anchors) + self.anchors = [tuple(_) for _ in anchors] + self.anchor_max = max([max(_) for _ in self.anchors]) + self.resizer = AnchorResize( + image_size=image_size, anchors=anchors, interpolation="bicubic", anchor_strategy=anchor_strategy + ) + + if force_shape_cut_anchors in grid_dict: + force_shape_cut_anchors = grid_dict[force_shape_cut_anchors] + else: + force_shape_cut_anchors = eval(force_shape_cut_anchors) + self.force_shape_cut_anchors = [tuple(_) for _ in force_shape_cut_anchors] + self.force_shape_cut_anchors_max = max([max(_) for _ in self.force_shape_cut_anchors]) + + self.old_resizer = transforms.Resize(image_size, interpolation="bicubic") + + # 把image processor的缩放去掉 只保留后面的变换 + 
self.image_transform = transforms.Compose(self.image_transform.transforms[1:]) + if self.add_global: + self.cut_prompt_template = cut_prompt_template_dict[cut_prompt + "_global"] + else: + self.cut_prompt_template = cut_prompt_template_dict[cut_prompt] + + self.media_tokens = ["<|image|>", "<|video|>"] + + def _process_image(self, images): + new_images = [] + cut_shape = [] + for image in images: + raw_image = image + image, selected_anchor = self.resizer(image) + image_input = self.image_transform(image) # h,w,3 -> 3,h,w + cut_shape.append( + (image_input.shape[1] // self.image_size[0], image_input.shape[2] // self.image_size[1]) + ) # cut_h, cut_w + image_input = rearrange( + image_input, "C (num_h h) (num_w w) -> (num_h num_w) C h w", h=self.image_size[0], w=self.image_size[1] + ) + + new_images.append(image_input) + + if self.add_global: + new_images.append(self.image_transform(self.resizer.resize_global(raw_image)).unsqueeze(0)) + cut_shape.append((1, 1)) + + new_images = paddle.concat(new_images, axis=0) + cut_shape_indices = build_cut_shape_indices(cut_shape) + return new_images, cut_shape, cut_shape_indices + + +class TensorType(Enum): + PADDLE = "paddle" + + +class mPLUGOwl3BatchFeature(BatchFeature): + r""" + Extend from BatchFeature for supporting various image size + """ + + def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None): + super().__init__(data) + self.convert_to_tensors(tensor_type=tensor_type) + + def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None): + if tensor_type is None: + return self + + # is_tensor, as_tensor = self._get_is_as_tensor_fns(tensor_type) + is_tensor = lambda x: isinstance(x, paddle.Tensor) + as_tensor = paddle.to_tensor + + def converter(value): + try: + if not is_tensor(value): + tensor = as_tensor(value) + return tensor + except: # noqa E722 + if key == "overflowing_values": + raise ValueError("Unable to create tensor returning overflowing values of different lengths. ") + raise ValueError( + "Unable to create tensor, you should probably activate padding " + "with 'padding=True' to have batched tensors with the same length." 
+ ) + + for key, value in self.items(): + self[key] = recursive_converter(converter, value) + return self + + +class mPLUGOwl3ImageProcessor(BaseImageProcessor, CutMixin): + model_input_names = ["pixel_values"] + + def __init__(self, image_size, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], **kwargs): + super().__init__(**kwargs) + self.image_size = image_size + self.image_transform = transforms.Compose( + [ + transforms.Resize((image_size, image_size), interpolation="bicubic"), + transforms.ToTensor(), + transforms.Normalize(mean, std), + ] + ) + CutMixin.__init__(self) + + def preprocess( + self, images: Union[Image.Image, List[Image.Image]], cut_enable=True, **kwargs + ) -> mPLUGOwl3BatchFeature: + if isinstance(images, Image.Image): + images_list = [images] + else: + images_list = images + + if self.cut_enable and cut_enable: + image_data, cut_shape, cut_shape_indices = self._process_image(images_list) + else: + image_data = [self.image_transform(self.resizer.resize_global(image)) for image in images_list] + image_data = paddle.stack(image_data, axis=0) + cut_shape = cut_shape_indices = None + + return mPLUGOwl3BatchFeature( + data={"pixel_values": image_data, "cut_shape": cut_shape, "cut_shape_indices": cut_shape_indices} + ) + + def to_dict(self): + encoder_dict = super().to_dict() + pop_keys = ["image_transform", "resizer", "old_resizer", "cut_prompt_template"] + for pk in pop_keys: + encoder_dict.pop(pk, None) + return encoder_dict + + +class MediaIndicesHelper: + def __init__(self, tokenizer) -> None: + self.media_position = [] + self.tokenizer = tokenizer + + def has_media(self, text, media_tokens=None): + if media_tokens is None: + media_tokens = OWL_MEDIA_TOKEN + has_media_flag = any([media_token == text for media_token in media_tokens]) + if any([media_token in text for media_token in media_tokens]): + # 不允许出现text中包含media token但是不仅仅是media token。 media token必须单独为一个chunk + assert has_media_flag, text + return has_media_flag + + def add_media(self, text_chunk, text=None, tokenize_fn=None): + # cross + assert tokenize_fn is not None + assert text is not None + assert text in OWL_MEDIA_TOKEN + media_token_ids = tokenize_fn(text) + start = len(text_chunk) + end = start + len(media_token_ids) + self.media_position.append([start, end]) + text_chunk.extend(media_token_ids) + return len(media_token_ids) + + def cal_media_offset(self, input_ids): + if len(self.media_position) == 0: + return paddle.ones_like(input_ids) * (-1000000) + + media_starts = paddle.to_tensor([_[0] for _ in self.media_position]).reshape([1, -1]) + rng = paddle.arange(input_ids.shape[0]).reshape([-1, 1]) + matrix = (rng > media_starts).sum(axis=1) + + return matrix + + def len_images( + self, + ): + return len(self.media_position) + + +class mPLUGOwl3Processor(ProcessorMixin): + r""" + Args: + image_processor ([`mPLUGOwl3ImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`LlamaTokenizerWrapper`], *optional*): + The tokenizer is a required input. 
+ """ + attributes = ["image_processor", "tokenizer"] + image_processor_class = "mPLUGOwl3ImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor: mPLUGOwl3ImageProcessor = None, + tokenizer=None, + prompt_style="chatml", + inference_mode=True, + addition_eod="<|endoftext|>", + ): + super().__init__(image_processor, tokenizer) + self.image_processor: mPLUGOwl3ImageProcessor + self.prompt_style = prompt_style + self.inference_mode = inference_mode + self.media_tokens = ["<|image|>"] + self.addition_eod = addition_eod + + def build_text_qwen(self, messages): + # role should be within ['system', 'user', 'assistant'] + im_start, im_end = "<|im_start|>", "<|im_end|>" + + text = [] + for num_turn, message in enumerate(messages): + if num_turn == 0 and message["role"] != "system": + if self.prompt_style != "plain": + text.append({"text": f"{im_start}system\n{im_end}", "label": 0}) + if message["role"] == "system": + if self.prompt_style != "plain": + text.append({"text": f"{im_start}system\n{message['content']}{im_end}", "label": 0}) + elif message["role"] == "user": + if self.prompt_style != "plain": + content = f"\n{im_start}user\n{message['content']}{im_end}" + else: + content = message["content"] + pattern = "|".join(map(re.escape, self.media_tokens)) + chunk_strs = re.split(f"({pattern})", content) + for chunk_str in chunk_strs: + text.append({"text": chunk_str, "label": 0}) + + elif message["role"] == "assistant": + if self.prompt_style != "plain": + text.append({"text": f"\n{im_start}assistant\n", "label": 0}) + text.append({"text": f"{message['content']}{im_end}", "label": 1}) + else: + text.append({"text": f"{message['content']}", "label": 1}) + text.append({"text": self.addition_eod, "label": 1}) + else: + raise NotImplementedError + if self.inference_mode: + while text and text[-1]["label"] == 1: # 只要列表非空且最后一个元素满足条件 + text.pop() # 就移除最后一个元素 + return text + + def wrapped_tokenize(self, text): + return self.tokenizer(text).input_ids + + def encode_text_sft(self, texts): + # output enc_chunk + + enc_chunk = [] + label_chunk = [] + enc_length = 0 + + num_images = 0 + + media_helper = MediaIndicesHelper(tokenizer=self.tokenizer) + for current_ti, text_chunk in enumerate(texts): + + text = text_chunk["text"] + label = text_chunk["label"] + + if not media_helper.has_media(text): + curr_chunk = self.wrapped_tokenize(text) + if label == 1: + enc_length += len(curr_chunk) + enc_chunk += curr_chunk + label_chunk += [label] * len(curr_chunk) + else: + + enc_length += len(curr_chunk) + enc_chunk += curr_chunk + label_chunk += [label] * len(curr_chunk) + # For media tokens + else: + + add_length = media_helper.add_media(enc_chunk, text=text, tokenize_fn=self.wrapped_tokenize) + enc_length += add_length + label_chunk += [label] * add_length + # enc_chunk.extend([self.media_tokens[text]] * self.media_lengths[text]) + # enc_length += self.media_lengths[text] + # label_chunk += [label] * self.media_lengths[text] + num_images += 1 + + enc_chunk = paddle.to_tensor(enc_chunk).astype(dtype="int64") + media_offset = [paddle.to_tensor([_[0] for _ in media_helper.media_position]).astype(dtype="int64")] + return { + "input_ids": enc_chunk.unsqueeze(0), + "media_offset": media_offset, + } + + def __call__( + self, + messages, + images=None, + videos=None, + max_length: Optional[int] = None, + cut_enable=True, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs + ) -> mPLUGOwl3BatchFeature: + medias = [] + if videos is not None: + 
medias.extend([{"type": "video", "content": video, "use_video_span": True} for video in videos]) + if images is not None: + medias.extend([{"type": "image", "content": image} for image in images]) + + if len(medias): + image_tensor_list = [] + pattern = r"(<\|image\|>|<\|video\|>)" + # 存在媒体 + image_token_ptr = 0 + # media_layout = [] + for message in messages: + text_list = re.split(pattern, message["content"]) + text = "" + for text_content in text_list: + if text_content in ["<|image|>", "<|video|>"]: + media_item = medias[image_token_ptr] + image_token_ptr += 1 + if text_content == "<|image|>": + assert media_item["type"] == "image" + image = media_item["content"] + + image_inputs = self.image_processor( + [image], cut_enable=cut_enable, return_tensors=return_tensors + ) + if image_inputs.get("cut_shape", None) is not None: + cut_shape = image_inputs["cut_shape"] + cut_text = self.image_processor.cut_prompt_template( + img_token="<|image|>", h=cut_shape[0][0], w=cut_shape[0][1] + ) + text += cut_text + image_tensor_list.append(image_inputs["pixel_values"]) + else: + text += text_content + image_tensor_list.append(image_inputs["pixel_values"]) + elif text_content == "<|video|>": + assert media_item["type"] == "video" + video = media_item["content"] + use_video_span = media_item["use_video_span"] + image_tensor = self.image_processor(video, cut_enable=False)["pixel_values"] + image_tensor_list.append(image_tensor) + num_video_frame = image_tensor.shape[0] + if use_video_span: + text_content = ( + "<|start_video_frame|>" + "<|image|>" * num_video_frame + "<|end_video_frame|>" + ) + else: + text_content = "<|image|>" * num_video_frame + text += text_content + else: + text += text_content + message["content"] = text + assert image_token_ptr == len(medias), (image_token_ptr, len(medias)) # 保证图和token数目一致 + assert all(len(_.shape) == 4 for _ in image_tensor_list), [_.shape for _ in image_tensor_list] + num_image_tokens = sum([_["content"].count("<|image|>") for _ in messages]) + num_image_shapes = sum([_.shape[0] for _ in image_tensor_list]) + assert num_image_tokens == num_image_shapes, (messages, [_.shape for _ in image_tensor_list]) + + image_tensor_list = paddle.concat(image_tensor_list, axis=0) + + text = self.build_text_qwen(messages) + model_inputs = self.encode_text_sft(text) + + if len(medias) is not None: + model_inputs.update({"pixel_values": image_tensor_list}) + # if 'cut_shape' in model_inputs: + # model_inputs.pop('cut_shape') + # if 'cut_shape_indices' in model_inputs: + # model_inputs.pop('cut_shape_indices') + return mPLUGOwl3BatchFeature(model_inputs) + + def check_media(self, images, messages): + media_num = 0 if images is None else len(images) + media_count = sum([message["content"].count("<|image|>") for message in messages]) + assert media_num == media_count + + # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. 
+ """ + output_ids = args[0] + result_text = [] + for result in output_ids: + result = result[result != 0] + if result[0] == self.tokenizer.bos_id: + result = result[1:] + if result[-1] == self.tokenizer.eos_id: + result = result[:-1] + result_text.append(self.tokenizer.decode(result, *args[1:], **kwargs).strip()) + return result_text + # return self.tokenizer.batch_decode(*args, **kwargs) + + # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + result = args[0] + result = result[result != 0] + if result[0] == self.tokenizer.bos_id: + result = result[1:] + if result[-1] == self.tokenizer.eos_id or ( + hasattr(self.tokenizer, "eot_id") and result[-1] == self.tokenizer.eot_id + ): + result = result[:-1] + return self.tokenizer.decode(result, *args[1:], **kwargs).strip() + + def _convert(self, input_str, max_inp_length: Optional[int] = None): + if self.version > 2.5 or not getattr(self.tokenizer, "add_bos_token", False): + input_ids = self.tokenizer.encode(input_str) + else: + input_ids = [self.tokenizer.bos_id] + self.tokenizer.encode(input_str) + if max_inp_length is not None: + input_ids = input_ids[:max_inp_length] + input_ids = paddle.to_tensor(data=input_ids, dtype="int32") + + start_cond = (input_ids == self.tokenizer.im_start_id) | (input_ids == self.tokenizer.slice_start_id) + end_cond = (input_ids == self.tokenizer.im_end_id) | (input_ids == self.tokenizer.slice_end_id) + + image_start_tokens = paddle.where(start_cond)[0] # or paddle.nonzero(start_cond)[:, 0] + image_start_tokens += 1 + image_end_tokens = paddle.where(end_cond)[0] + + valid_image_nums = max(len(image_start_tokens), len(image_end_tokens)) + + image_bounds = paddle.hstack( + [ + image_start_tokens[:valid_image_nums].unsqueeze(-1), + image_end_tokens[:valid_image_nums].unsqueeze(-1), + ] + ) + return input_ids, image_bounds + + @property + # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + def pad(self, inputs, max_length=None, padding_value=0, padding_side="left"): + items = [] + if isinstance(inputs[0], list): + assert isinstance(inputs[0][0], paddle.Tensor) + for it in inputs: + for tr in it: + items.append(tr) + else: + assert isinstance(inputs[0], paddle.Tensor) + items = inputs + + batch_size = len(items) + shape = items[0].shape + dim = len(shape) + assert dim <= 2 + if max_length is None: + max_length = 0 + max_length = max(max_length, max(item.shape[-1] for item in items)) + min_length = min(item.shape[-1] for item in items) + dtype = items[0].dtype + + if dim == 0: + return paddle.stack([item for item in items], axis=0), [0] + elif dim == 1: + if max_length == min_length: + return paddle.stack([item for item in items], axis=0), [0] * batch_size + tensor = paddle.zeros((batch_size, max_length), dtype=dtype) + padding_value + else: + tensor = paddle.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + padding_value + + padding_length = [] + for i, item in enumerate(items): + if dim == 1: + if padding_side == "left": + tensor[i, -len(item) :] = item.clone() + else: + tensor[i, 
: len(item)] = item.clone() + elif dim == 2: + if padding_side == "left": + tensor[i, -len(item) :, :] = item.clone() + else: + tensor[i, : len(item), :] = item.clone() + padding_length.append(tensor.shape[-1] - len(item)) + + return tensor, padding_length From a71f200541e4c30ca3d7df632b1efb3073b0cc7c Mon Sep 17 00:00:00 2001 From: "nemonameless@qq.com@github.com" Date: Wed, 18 Dec 2024 03:29:02 +0000 Subject: [PATCH 8/8] remove some comments --- .../models/mPLUGOwl3/modeling_hyper_qwen2.py | 105 ++---------------- .../models/mPLUGOwl3/modeling_mplugowl3.py | 4 +- 2 files changed, 12 insertions(+), 97 deletions(-) diff --git a/paddlemix/models/mPLUGOwl3/modeling_hyper_qwen2.py b/paddlemix/models/mPLUGOwl3/modeling_hyper_qwen2.py index 1a83354a1..104a9bf23 100644 --- a/paddlemix/models/mPLUGOwl3/modeling_hyper_qwen2.py +++ b/paddlemix/models/mPLUGOwl3/modeling_hyper_qwen2.py @@ -309,8 +309,6 @@ def __init__(self, config: HyperQwen2Config, layer_idx: Optional[int] = None, is def apply_mi_rope(self, key_layer, image_pos, length_each_img): # input shape should be [s b h d] key_layer = rearrange(key_layer, "b h s d -> s b h d") - # if self.rotary_emb_core.inv_freq.device!=key_layer.device: - # self.rotary_emb_core.inv_freq = self.rotary_emb_core.inv_freq.to(key_layer.device) rotary_pos_emb_max_seq_len = self.config.max_position_embeddings ntk_alpha = 1 rotary_pos_emb = self.rotary_emb_core(rotary_pos_emb_max_seq_len, ntk_alpha=ntk_alpha) @@ -369,41 +367,26 @@ def hyperattention( kv_seq_len = key_states.shape[-2] if past_key_value is not None: - # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) kv_seq_len += past_key_value[0].shape[-2] cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - # print('query_states, key_states', query_states.sum().item(), key_states.sum().item()) - # 29952.0 492.0 query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - # print('query_states, key_states', query_states.sum().item(), key_states.sum().item()) - # 18304.0 -776.0 - # print('query_states, key_states', query_states.shape, key_states.shape) # [1, 28, 1, 128] [1, 4, 1, 128] if past_key_value is not None: - # cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) key_states = paddle.concat([past_key_value[0], key_states], axis=2) value_states = paddle.concat([past_key_value[1], value_states], axis=2) past_key_value = (key_states, value_states) if use_cache else None - # print('query_states key_states, value_states', query_states.sum().item(), key_states.sum().item(), value_states.sum().item()) - # print('query_states key_states, value_states', query_states.shape, key_states.shape, value_states.shape) - # q k v [1, 28, 74, 128] [1, 4, 74, 128] [1, 4, 74, 128] - # q k v [1, 28, 1, 128] [1, 4, 75, 128] [1, 4, 75, 128] key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - # -5440. -1712. # add visual to kv length_each_img = image_embeds.shape[1] - # [7, 729, 3584] sum 78336. mean 0.00430298 try: image_embeds = self.v_kv_proj(image_embeds) except: image_embeds = self.v_kv_proj(image_embeds.astype("bfloat16")) - # [7, 729, 1024] sum 184320. 
image_start = 0 context_layer = [] for bi, media_starts in enumerate(media_offset): @@ -432,8 +415,6 @@ def hyperattention( H=self.num_key_value_heads, ) # b h s d image_start += num_images - # print("curr_query_layer", bi, curr_visual_key_layer.sum().item(), curr_visual_value_layer.sum().item()) - # [1, 4, 5103, 128] 206848. -22400.0 curr_visual_key_layer = self.apply_mi_rope( curr_visual_key_layer, media_starts, length_each_img=length_each_img @@ -459,14 +440,7 @@ def hyperattention( causal_mask = causal_mask.unsqueeze(0).unsqueeze(0) full_mask = causal_mask - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # # Reference: https://github.com/pytorch/pytorch/issues/112577. - # if curr_query_layer.device.type == "cuda" and full_mask is not None: - # curr_query_layer = curr_query_layer.contiguous() - # curr_key_layer = curr_key_layer.contiguous() - # curr_value_layer = curr_value_layer.contiguous() - - # full_mask.shape [1, 1, 74, 5177] # sum 196689 + # Note: 注意paddle的scaled_dot_product_attention 中q k v维度与torch不同 attn_output = paddle.nn.functional.scaled_dot_product_attention( curr_query_layer.transpose( [0, 2, 1, 3] @@ -481,7 +455,6 @@ def hyperattention( dropout_p=self.attention_dropout if self.training else 0.0, # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. is_causal=is_causal, - # enable_gqa=True, # gqa can not be used because mask requires XFORMERS and not support gqa ) # -> (N, ..., L, Ev) # torch attn_output.shape [1, 28, 72, 128] attn_output = attn_output.transpose([0, 2, 1, 3]) @@ -490,7 +463,6 @@ def hyperattention( attn_output = context_layer = paddle.concat(context_layer, axis=0) attn_output = attn_output.transpose([0, 2, 1, 3]) - # print('attn_output', attn_output.shape) # [1, 74, 28, 128] [1, 1, 28, 128] attn_output = attn_output.reshape([bsz, q_len, self.hidden_size]) attn_output = self.o_proj(attn_output) @@ -526,7 +498,6 @@ def forward( # ) if self.is_hyper_enabled and image_embeds is not None: - # 必走这个分支 return self.hyperattention( hidden_states, attention_mask, @@ -558,15 +529,12 @@ def forward( kv_seq_len = key_states.shape[-2] if past_key_value is not None: - # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) kv_seq_len += past_key_value[0].shape[-2] cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: - # cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) key_states = paddle.concat([past_key_value[0], key_states], axis=2) value_states = paddle.concat([past_key_value[1], value_states], axis=2) past_key_value = (key_states, value_states) if use_cache else None @@ -574,18 +542,13 @@ def forward( key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - if attention_mask is not None: # (1,1,1,60) + if attention_mask is not None: if tuple(attention_mask.shape) != (bsz, 1, q_len, kv_seq_len): raise ValueError( f"Attention mask should be of size {bsz, 1, q_len, kv_seq_len}, but is {tuple(attention_mask.shape)}" ) - # # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # # Reference: 
https://github.com/pytorch/pytorch/issues/112577. - # if query_states.device.type == "cuda" and attention_mask is not None: - # query_states = query_states.contiguous() - # key_states = key_states.contiguous() - # value_states = value_states.contiguous() + # Note: 注意paddle的scaled_dot_product_attention 中q k v维度与torch不同 attn_output = paddle.nn.functional.scaled_dot_product_attention( query_states.transpose([0, 2, 1, 3]), # [1, 28, 74, 128] sum 21632. key_states.transpose([0, 2, 1, 3]), # [1, 28, 74, 128] sum 335872. @@ -604,6 +567,7 @@ def forward( # Original Attention of Qwen2 +# PaddleNLP only has Qwen2Attention QWEN2_ATTENTION_CLASSES = { "eager": Qwen2Attention, "flash_attention_2": Qwen2Attention, # Qwen2FlashAttention2, @@ -616,13 +580,8 @@ def __init__(self, config: HyperQwen2Config, layer_idx: int): super().__init__() self.hidden_size = config.hidden_size - if config.use_sliding_window and config._attn_implementation != "flash_attention_2": - logger.warning_once( - f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " - "unexpected results may be encountered." - ) self.is_hyper_enabled = (layer_idx + 1) in config.hyper_layers - # print('layer_idx', layer_idx, self.is_hyper_enabled) + # TODO: 若使用Qwen2Attention则回答结果不对,若都使用HyperQwen2SdpaAttention回答结果也对,但需check一下 if 1: # self.is_hyper_enabled: self.self_attn = HyperQwen2SdpaAttention(config, layer_idx, is_hyper_enabled=self.is_hyper_enabled) else: @@ -646,8 +605,8 @@ def forward( ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`, *optional*): attention mask of size `(batch, sequence_length)` where padding elements are indicated by 0. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under @@ -655,23 +614,20 @@ def forward( use_cache (`bool`, *optional*): If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + past_key_value (`Tuple(paddle.Tensor)`, *optional*): cached past key and value projection states """ residual = hidden_states hidden_states = self.input_layernorm(hidden_states) # Shared LayerNorm if image_embeds is not None and self.is_hyper_enabled: - # 134144 image_embeds = self.input_layernorm(image_embeds) - # 78336. media_kwargs = {"image_embeds": image_embeds, "media_offset": media_offset} else: image_embeds = media_offset = None media_kwargs = {} # Self Attention - # hidden_states.sum 76.50000000 hidden_states, self_attn_weights, present_key_value = self.self_attn( # -704. 2080. (48128., 240.) hidden_states=hidden_states.cast(paddle.bfloat16), # [1, 74, 3584] sum -704. attention_mask=attention_mask, @@ -682,7 +638,6 @@ def forward( **media_kwargs, # {} ) hidden_states = residual + hidden_states - # -1.71093750 + -704. 
# Fully Connected residual = hidden_states @@ -757,34 +712,6 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embed_tokens = value - @staticmethod - def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length, dtype): - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - if len(attention_mask.shape) == 2: - expanded_attn_mask = _expand_2d_mask(attention_mask, dtype, tgt_length=input_shape[-1]) - # For decoding phase in generation, seq_length = 1, we don't need to add causal mask - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - past_key_values_length=past_key_values_length, - ) - expanded_attn_mask = expanded_attn_mask & combined_attention_mask - # [bsz, seq_len, seq_len] -> [bsz, 1, seq_len, seq_len] - elif len(attention_mask.shape) == 3: - expanded_attn_mask = attention_mask.unsqueeze(1).astype("bool") - # if attention_mask is already 4-D, do nothing - else: - expanded_attn_mask = attention_mask - else: - expanded_attn_mask = _make_causal_mask( - input_shape, - past_key_values_length=past_key_values_length, - ) - # Convert bool attention_mask to float attention mask, which will be added to attention_scores later - expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype) - return expanded_attn_mask - def forward( self, input_ids: paddle.Tensor = None, @@ -819,12 +746,6 @@ def forward( past_key_values_length = 0 - # if use_cache: - # use_legacy_cache = False #not isinstance(past_key_values, Cache) - # #if use_legacy_cache: - # # past_key_values = DynamicCache.from_legacy_cache(past_key_values) - # past_key_values_length = past_key_values.get_usable_length(seq_length) - if past_key_values is None: past_key_values = tuple([None] * len(self.layers)) # NOTE: to make cache can be clear in-time @@ -836,7 +757,6 @@ def forward( cache_length = past_key_values[0][0].shape[1] # past_key_values_length += cache_length - # print('position_ids before', position_ids) if position_ids is None: position_ids = paddle.arange( past_key_values_length, seq_length + past_key_values_length, dtype=paddle.int64 @@ -845,14 +765,10 @@ def forward( else: position_ids = position_ids.reshape([-1, seq_length]).astype(dtype="int64") - # print('position_ids', position_ids) - # print('seq_length', seq_length) - # print('past_key_values_length', past_key_values_length) - if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) - attention_mask = None + attention_mask = None # hidden_states = inputs_embeds @@ -867,7 +783,7 @@ def forward( # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None - next_decoder_cache = () # not none + next_decoder_cache = () # not None for idx, decoder_layer in enumerate(self.layers): if output_hidden_states: @@ -1040,6 +956,7 @@ def forward( def prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs ): + # 以下这段参考PaddleNLP的 Qwen2ForCausalLM 的写法,与torch的mPLUG-owl3不同 batch_size, seq_length = input_ids.shape position_ids = kwargs.get("position_ids", paddle.arange(seq_length).expand((batch_size, seq_length))) attention_mask = kwargs.get("attention_mask", None) diff --git a/paddlemix/models/mPLUGOwl3/modeling_mplugowl3.py b/paddlemix/models/mPLUGOwl3/modeling_mplugowl3.py index 16415c96a..a522e5165 100644 --- a/paddlemix/models/mPLUGOwl3/modeling_mplugowl3.py +++ 
b/paddlemix/models/mPLUGOwl3/modeling_mplugowl3.py @@ -81,8 +81,7 @@ def _small_batched_forward(self, pixel_values): end_idx = min(B, i + vision_batch_size) tmp_hs = self.vision_model(pixel_values[start_idx:end_idx], output_hidden_states=True).hidden_states[-2] image_forward_out.append(tmp_hs) - # image_forward_out[0].sum() - # [7, 729, 1152] sum -872448. + vision_embedding = paddle.concat(image_forward_out, axis=0) assert vision_embedding.shape[0] == B return vision_embedding @@ -95,7 +94,6 @@ def forward_image(self, pixel_values): if self.vision2text_model is not None: image_embeds = self.vision2text_model(image_embeds) - # [7, 729, 3584] sum 134144. mean 0.00735474 else: pass
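
Editor's note — illustrative usage sketch, not part of the patch: the snippet below shows how the processor and image processor added in this series could be exercised end to end. The tokenizer checkpoint name, the image path, and image_size=378 are placeholders rather than values taken from the patch, and it assumes a Qwen2-style tokenizer that already contains the <|image|>, <|im_start|> and <|im_end|> special tokens. The expected output keys (input_ids, media_offset, pixel_values) follow encode_text_sft and __call__ in mplugowl3_processing.py above.

# Minimal sketch under the assumptions stated above; adjust names/paths to the released checkpoint.
from PIL import Image
from paddlenlp.transformers import AutoTokenizer

from paddlemix.processors.mplugowl3_processing import (
    mPLUGOwl3ImageProcessor,
    mPLUGOwl3Processor,
)

# Placeholder checkpoint: any Qwen2-style tokenizer with the <|image|> special tokens registered.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")
# image_size=378 is an assumption here; use the value from the released image-processor config.
image_processor = mPLUGOwl3ImageProcessor(image_size=378)
processor = mPLUGOwl3Processor(image_processor=image_processor, tokenizer=tokenizer)

# One <|image|> placeholder per image; the trailing empty assistant turn opens the generation prompt
# (inference_mode pops the label-1 chunks in build_text_qwen).
messages = [
    {"role": "user", "content": "<|image|>\nDescribe this image."},
    {"role": "assistant", "content": ""},
]
image = Image.open("example.jpg").convert("RGB")  # placeholder image path

inputs = processor(messages, images=[image])
# Expected keys per the processor definition: input_ids, media_offset, pixel_values
print({k: getattr(v, "shape", type(v)) for k, v in inputs.items()})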