From 8aa566b931eb837583d1100e0120cc92fc4e8ef7 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Thu, 5 Feb 2026 21:52:50 +0800
Subject: [PATCH 01/60] init

---
 docs/source/en/_toctree.yml                   |    2 +
 docs/source/en/model_doc/pp_chart2table.md    |  160 ++
 src/transformers/conversion_mapping.py        |    1 +
 src/transformers/models/__init__.py           |    1 +
 .../models/auto/configuration_auto.py         |    2 +
 .../models/auto/image_processing_auto.py      |    1 +
 src/transformers/models/auto/modeling_auto.py |    1 +
 .../models/auto/processing_auto.py            |    1 +
 .../models/auto/tokenization_auto.py          |    1 +
 .../models/pp_chart2table/__init__.py         |   32 +
 .../configuration_pp_chart2table.py           |  364 ++++
 .../image_processing_pp_chart2table.py        |  161 ++
 .../image_processing_pp_chart2table_fast.py   |   95 +
 .../pp_chart2table/modeling_pp_chart2table.py | 1369 ++++++++++++++
 .../pp_chart2table/modular_pp_chart2table.py  | 1609 +++++++++++++++++
 .../processing_pp_chart2table.py              |   65 +
 tests/models/pp_chart2table/__init__.py       |    0
 .../test_modeling_pp_chart2table.py           |  391 ++++
 utils/check_config_attributes.py              |    1 +
 utils/check_repo.py                           |    6 +
 20 files changed, 4263 insertions(+)
 create mode 100644 docs/source/en/model_doc/pp_chart2table.md
 create mode 100644 src/transformers/models/pp_chart2table/__init__.py
 create mode 100644 src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
 create mode 100644 src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
 create mode 100644 src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
 create mode 100644 src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
 create mode 100644 src/transformers/models/pp_chart2table/modular_pp_chart2table.py
 create mode 100644 src/transformers/models/pp_chart2table/processing_pp_chart2table.py
 create mode 100644 tests/models/pp_chart2table/__init__.py
 create mode 100644 tests/models/pp_chart2table/test_modeling_pp_chart2table.py
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 670854e4895d..d0bf6fb9961b 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -1151,6 +1151,8 @@
         title: Pix2Struct
       - local: model_doc/pixtral
         title: Pixtral
+      - local: model_doc/pp_chart2table
+        title: PPChart2Table
       - local: model_doc/qwen2_5_omni
         title: Qwen2.5-Omni
       - local: model_doc/qwen2_5_vl
diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md
new file mode 100644
index 000000000000..5082c6f0adef
--- /dev/null
+++ b/docs/source/en/model_doc/pp_chart2table.md
@@ -0,0 +1,160 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# PP-Chart2Table
+
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>
+
+## Overview
+
+**PP-Chart2Table** is a SOTA multimodal model developed by the PaddlePaddle team, specializing in chart parsing for both Chinese and English. Its high performance is driven by a novel "Shuffled Chart Data Retrieval" training task, which, combined with a refined token masking strategy, significantly improves its efficiency in converting charts to data tables. The model is further strengthened by an advanced data synthesis pipeline that uses high-quality seed data, RAG, and LLMs persona design to create a richer, more diverse training set. To address the challenge of large-scale unlabeled, out-of-distribution (OOD) data, the team implemented a two-stage distillation process, ensuring robust adaptability and generalization on real-world data.
+
+## Model Architecture 
+PP-Chart2Table adopts a multimodal fusion architecture that combines a vision tower for chart feature extraction and a language model for table structure generation, enabling end-to-end chart-to-table conversion.
+
+
+## Usage
+
+### Single input inference
+
+The example below demonstrates how to classify image with PP-Chart2Table using [`Pipeline`] or the [`AutoModel`].
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```py
+from transformers import pipeline
+from PIL import Image
+pipe = pipeline("image-text-to-text", model="PaddlePaddle/PP-Chart2Table_safetensors")
+
+result = pipe(images="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", do_sample=False, max_new_tokens=256)
+print(result)
+```
+
+</hfoption>
+
+<hfoption id="AutoModel">
+
+```py
+import requests
+from PIL import Image
+from transformers import AutoModelForImageTextToText, AutoProcessor
+
+model_path = "PaddlePaddle/PP-Chart2Table_safetensors"
+model = AutoModelForImageTextToText.from_pretrained(model_path, dtype="float32").to("cuda")
+processor = AutoProcessor.from_pretrained(model_path)
+
+image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw)
+inputs = processor(images=image).to(model.device)
+
+outputs = model.generate(**inputs, do_sample=False, max_new_tokens=256)
+result = processor.postprocess(outputs)
+print(result)
+
+```
+
+</hfoption>
+</hfoptions>
+
+### Batched inference
+
+Here is how you can do it with PP-Chart2Table using [`Pipeline`] or the [`AutoModel`]:
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```py
+from transformers import pipeline
+from PIL import Image
+model_path = "PaddlePaddle/PP-Chart2Table_safetensors"
+pipe = pipeline("image-text-to-text", model=model_path)
+
+image_path = "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png"
+result = pipe(images=[image_path, image_path], do_sample=False, max_new_tokens=256)
+print(result)
+```
+
+</hfoption>
+
+<hfoption id="AutoModel">
+
+```py
+import requests
+from PIL import Image
+from transformers import AutoModelForImageTextToText, AutoProcessor
+
+model_path = "PaddlePaddle/PP-Chart2Table_safetensors"
+model = AutoModelForImageTextToText.from_pretrained(model_path, dtype="float32").to("cuda")
+processor = AutoProcessor.from_pretrained(model_path)
+
+image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw)
+inputs = processor(images=[image, image]).to(model.device)
+
+outputs = model.generate(**inputs, do_sample=False, max_new_tokens=256)
+result = processor.postprocess(outputs)
+print(result)
+```
+
+</hfoption>
+</hfoptions>
+
+## PPChart2TableForConditionalGeneration
+
+[[autodoc]] PPChart2TableForConditionalGeneration
+    - forward
+
+## PPChart2TableConfig
+
+[[autodoc]] PPChart2TableConfig
+
+## PPChart2TableVisionConfig
+
+[[autodoc]] PPChart2TableVisionConfig
+
+## PPChart2TableTextConfig
+
+[[autodoc]] PPChart2TableTextConfig
+
+## PPChart2TableTextModel
+
+[[autodoc]] PPChart2TableTextModel
+    - forward
+
+## PPChart2TableVisionModel
+
+[[autodoc]] PPChart2TableVisionModel
+
+## PPChart2TableImageProcessor
+
+[[autodoc]] PPChart2TableImageProcessor
+
+## PPChart2TableImageProcessorFast
+
+[[autodoc]] PPChart2TableImageProcessorFast
+
+## PPChart2TableModel
+
+[[autodoc]] PPChart2TableModel
+
+## PPChart2TableProcessor
+
+[[autodoc]] PPChart2TableProcessor
+
+## PPChart2TableVisionTransformer
+
+[[autodoc]] PPChart2TableVisionTransformer
diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py
index 7eadf603af5c..de6ca1969f24 100644
--- a/src/transformers/conversion_mapping.py
+++ b/src/transformers/conversion_mapping.py
@@ -288,6 +288,7 @@ def register_checkpoint_conversion_mapping(
     "sam3_tracker",
     "sam3_tracker_video",
     "paddleocrvl",
+    "ppchart2table",
     "ernie4_5_vl_moe",
 ]
 
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 342f366f3bc8..f7e743634629 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -292,6 +292,7 @@
     from .plbart import *
     from .poolformer import *
     from .pop2piano import *
+    from .pp_chart2table import *
     from .prompt_depth_anything import *
     from .prophetnet import *
     from .pvt import *
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 55dd1b820073..5757ffc6888a 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -331,6 +331,7 @@
         ("plbart", "PLBartConfig"),
         ("poolformer", "PoolFormerConfig"),
         ("pop2piano", "Pop2PianoConfig"),
+        ("pp_chart2table", "PPChart2TableConfig"),
         ("prompt_depth_anything", "PromptDepthAnythingConfig"),
         ("prophetnet", "ProphetNetConfig"),
         ("pvt", "PvtConfig"),
@@ -799,6 +800,7 @@
         ("plbart", "PLBart"),
         ("poolformer", "PoolFormer"),
         ("pop2piano", "Pop2Piano"),
+        ("pp_chart2table", "PPChart2Table"),
         ("prompt_depth_anything", "PromptDepthAnything"),
         ("prophetnet", "ProphetNet"),
         ("pvt", "PVT"),
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index 9d5b531def2a..89b73530d12d 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -163,6 +163,7 @@
             ("pixio", ("BitImageProcessor", "BitImageProcessorFast")),
             ("pixtral", ("PixtralImageProcessor", "PixtralImageProcessorFast")),
             ("poolformer", ("PoolFormerImageProcessor", "PoolFormerImageProcessorFast")),
+            ("pp_chart2table", ("PPChart2TableImageProcessor", "PPChart2TableImageProcessorFast")),
             ("prompt_depth_anything", ("PromptDepthAnythingImageProcessor", "PromptDepthAnythingImageProcessorFast")),
             ("pvt", ("PvtImageProcessor", "PvtImageProcessorFast")),
             ("pvt_v2", ("PvtImageProcessor", "PvtImageProcessorFast")),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index 8b151b68e1df..60f527d4dfa0 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -1046,6 +1046,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("perception_lm", "PerceptionLMForConditionalGeneration"),
         ("pix2struct", "Pix2StructForConditionalGeneration"),
         ("pixtral", "LlavaForConditionalGeneration"),
+        ("pp_chart2table", "PPChart2TableForConditionalGeneration"),
         ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"),
         ("qwen2_vl", "Qwen2VLForConditionalGeneration"),
         ("qwen3_vl", "Qwen3VLForConditionalGeneration"),
diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
index 3d2e6ef5cbc7..97fc88443390 100644
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@@ -125,6 +125,7 @@
         ("pix2struct", "Pix2StructProcessor"),
         ("pixtral", "PixtralProcessor"),
         ("pop2piano", "Pop2PianoProcessor"),
+        ("pp_chart2table", "PPChart2TableProcessor"),
         ("qwen2_5_omni", "Qwen2_5OmniProcessor"),
         ("qwen2_5_vl", "Qwen2_5_VLProcessor"),
         ("qwen2_audio", "Qwen2AudioProcessor"),
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index 171a9fca6868..a9464d9d415c 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -299,6 +299,7 @@
             else ("TokenizersBackend" if is_tokenizers_available() else None),
         ),
         ("plbart", "PLBartTokenizer" if is_tokenizers_available() else None),
+        ("pp_chart2table", "TokenizersBackend" if is_tokenizers_available() else None),
         ("prophetnet", "ProphetNetTokenizer"),
         ("qdqbert", "BertTokenizer" if is_tokenizers_available() else None),
         ("qwen2", "Qwen2TokenizerFast" if is_tokenizers_available() else None),
diff --git a/src/transformers/models/pp_chart2table/__init__.py b/src/transformers/models/pp_chart2table/__init__.py
new file mode 100644
index 000000000000..a471ebfb2830
--- /dev/null
+++ b/src/transformers/models/pp_chart2table/__init__.py
@@ -0,0 +1,32 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_pp_chart2table import *
+    from .image_processing_pp_chart2table import *
+    from .image_processing_pp_chart2table_fast import *
+    from .modeling_pp_chart2table import *
+    from .processing_pp_chart2table import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
new file mode 100644
index 000000000000..5e32dc30ef76
--- /dev/null
+++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
@@ -0,0 +1,364 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/pp_chart2table/modular_pp_chart2table.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_pp_chart2table.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+from typing import Optional
+
+from transformers.configuration_utils import PreTrainedConfig, layer_type_validation
+from transformers.modeling_rope_utils import RopeParameters
+
+
+class PPChart2TableVisionConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PPChart2TableVisionModel`]. It is used to instantiate a
+    PP-Chart2Table vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the vision encoder of the PP-Chart2Table
+    architecture developed by the PaddlePaddle team for chart-to-table parsing tasks.
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        im_patch_token (`int`, *optional*, defaults to 151859):
+            The token ID used to represent individual image patches in the multimodal input sequence.
+        im_start_token (`int`, *optional*, defaults to 151857):
+            The token ID representing the start of an image token sequence in the multimodal input.
+        depth (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the vision Transformer encoder.
+        embed_dim (`int`, *optional*, defaults to 768):
+            Dimensionality of the patch embedding layer in the vision encoder.
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the hidden layers in the vision Transformer encoder.
+        img_size (`int`, *optional*, defaults to 1024):
+            The size (resolution) of input chart images (assumed to be square).
+        mlp_ratio (`float`, *optional*, defaults to 4.0):
+            Ratio of the dimensionality of the feed-forward layer to the hidden size in the vision Transformer blocks.
+        num_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each self-attention layer in the vision Transformer encoder.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each image patch extracted from the input chart image.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add bias terms to the query, key, and value projection layers in the self-attention mechanism.
+        use_rel_pos (`bool`, *optional*, defaults to `True`):
+            Whether to use relative positional embeddings in the self-attention layers of the vision encoder.
+        global_attn_indexes (`list`, *optional*, defaults to `[2, 5, 8, 11]`):
+            List of layer indexes where global attention (instead of windowed attention) is applied in the vision encoder.
+        window_size (`int`, *optional*, defaults to 14):
+            The size of the attention window for windowed self-attention in the vision Transformer layers.
+        out_chans (`int`, *optional*, defaults to 256):
+            Number of output channels from the convolutional stem layer before patch embedding.
+
+    Example:
+
+    ```python
+    >>> from transformers import PPChart2TableVisionConfig, PPChart2TableVisionModel
+
+    >>> # Initializing a PPChart2TableVisionConfig with default PP-Chart2Table style configuration
+    >>> configuration = PPChart2TableVisionConfig()
+
+    >>> # Initializing a PPChart2TableVisionModel (with random weights) from the PP-Chart2Table style configuration
+    >>> model = PPChart2TableVisionModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    """
+
+    model_type = "pp_chart2table_vision"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        im_patch_token: int = 151859,
+        im_start_token: int = 151857,
+        depth: int = 12,
+        embed_dim: int = 768,
+        hidden_size: int = 1024,
+        img_size: int = 1024,
+        mlp_ratio: float = 4.0,
+        num_heads: int = 12,
+        patch_size: int = 16,
+        qkv_bias: bool = True,
+        use_rel_pos: bool = True,
+        global_attn_indexes: Optional[list] = None,
+        window_size: int = 14,
+        out_chans: int = 256,
+        **kwargs,
+    ):
+        self.im_patch_token = im_patch_token
+        self.im_start_token = im_start_token
+
+        self.depth = depth
+        self.embed_dim = embed_dim
+        self.hidden_size = hidden_size
+        self.img_size = img_size
+        self.mlp_ratio = mlp_ratio
+        self.num_heads = num_heads
+        self.patch_size = patch_size
+        self.qkv_bias = qkv_bias
+        self.use_rel_pos = use_rel_pos
+        self.global_attn_indexes = global_attn_indexes if global_attn_indexes is not None else [2, 5, 8, 11]
+        self.window_size = window_size
+        self.out_chans = out_chans
+
+        super().__init__(**kwargs)
+
+
+class PPChart2TableTextConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PPChart2TableTextModel`]. It is used to instantiate a
+    PP-Chart2Table text decoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the text encoder/decoder of the
+    PPChart2TableText-7B-beta [Qwen/PPChart2TableText-7B-beta](https://huggingface.co/Qwen/PPChart2TableText-7B-beta)
+    architecture, optimized for chart-to-table text generation tasks.
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities in self-attention layers.
+        bos_token_id (`int`, *optional*, defaults to 151643):
+            The token ID representing the beginning of a sequence (BOS) for text generation.
+        eos_token_id (`int`, *optional*, defaults to 151643):
+            The token ID representing the end of a sequence (EOS) for text generation.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the feed-forward and attention layers of the decoder.
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the hidden representations in the Transformer decoder layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        intermediate_size (`int`, *optional*, defaults to 2816):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer decoder blocks.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with for text input/output.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each self-attention layer in the Transformer decoder.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 16):
+            Number of key/value heads for implementing Grouped Query Attention (GQA). If equal to `num_attention_heads`,
+            Multi Head Attention (MHA) is used; if 1, Multi Query Attention (MQA) is used. For more details, see
+            [this paper](https://huggingface.co/papers/2305.13245).
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon value used by the RMS normalization layers to avoid division by zero.
+        rope_theta (`float`, *optional*, defaults to 1000000.0):
+            The base period of the RoPE (Rotary Position Embedding) embeddings, controlling the frequency of positional encoding.
+        rope_parameters (`RopeParameters` or `dict`, *optional*):
+            Configuration parameters for RoPE embeddings, including scaling parameters for longer sequence lengths beyond
+            `max_position_embeddings`.
+        sliding_window (`int`, *optional*, defaults to 32768):
+            Window size for Sliding Window Attention (SWA) in the decoder layers (only active if `use_sliding_window=True`).
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether the model's input and output word embeddings should be tied (shared weights).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether to return the last key/value attention states to speed up sequential decoding (only relevant for autoregressive
+            generation).
+        vocab_size (`int`, *optional*, defaults to 151860):
+            Vocabulary size of the PPChart2TableText model. Defines the number of distinct tokens that can be represented
+            by `input_ids`.
+        layer_types (`list[str]`, *optional*):
+            Attention pattern for each decoder layer (e.g., `"full_attention"` or `"sliding_attention"`). If not specified,
+            automatically determined by `sliding_window`.
+
+    Example:
+
+    ```python
+    >>> from transformers import PPChart2TableTextConfig, PPChart2TableTextModel
+
+    >>> # Initializing a PPChart2TableText style configuration
+    >>> configuration = PPChart2TableTextConfig()
+
+    >>> # Initializing a model from the PPChart2TableText-7B style configuration
+    >>> model = PPChart2TableTextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    """
+
+    model_type = "pp_chart2table_text"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    # Default tensor parallel plan for base model `PPChart2TableText`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    base_config_key = "text_config"
+
+    def __init__(
+        self,
+        attention_dropout: float = 0.0,
+        bos_token_id: int = 151643,
+        eos_token_id: int = 151643,
+        hidden_act: str = "silu",
+        hidden_size: int = 1024,
+        initializer_range: float = 0.02,
+        intermediate_size: int = 2816,
+        max_position_embeddings: int = 32768,
+        num_attention_heads: int = 16,
+        num_hidden_layers: int = 24,
+        num_key_value_heads: int = 16,
+        rms_norm_eps: float = 1e-06,
+        rope_theta: float = 1000000.0,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
+        sliding_window: int = 32768,
+        tie_word_embeddings: bool = True,
+        use_cache: bool = True,
+        vocab_size: int = 151860,
+        layer_types: Optional[list[str]] = None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+
+        self.attention_dropout = attention_dropout
+
+        self.layer_types = layer_types
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention" if self.sliding_window is not None else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types, self.num_hidden_layers)
+
+        self.rope_parameters = rope_parameters
+
+        self.rope_theta = rope_theta
+        self.tie_word_embeddings = tie_word_embeddings
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+class PPChart2TableConfig(PreTrainedConfig):
+    r"""
+    This is the main configuration class to store the configuration of a [`PPChart2TableModel`] or [`PPChart2TableForConditionalGeneration`].
+    It is used to instantiate a PP-Chart2Table multimodal model according to the specified arguments, defining the vision and text
+    sub-model architectures. This configuration class inherits from [`PreTrainedConfig`] and combines the configurations of:
+    - [`PPChart2TableVisionConfig`] (for the chart vision encoder)
+    - [`PPChart2TableTextConfig`] (for the table text decoder)
+    PP-Chart2Table [PaddlePaddle/PP-Chart2Table_safetensors](https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors).
+
+    Instantiating a `PPChart2TableConfig` with the defaults will yield a similar configuration to the base PP-Chart2Table model
+    developed by the PaddlePaddle team for chart-to-table parsing tasks.
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        vision_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`PPChart2TableVisionConfig`]. If `None`, the default
+            `PPChart2TableVisionConfig` configuration will be used.
+        text_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`PPChart2TableTextConfig`]. If `None`, the default
+            `PPChart2TableTextConfig` configuration will be used.
+        im_start_token (`int`, *optional*, defaults to 151857):
+            The token ID representing the start of an image token sequence in the multimodal input (shared across vision/text sub-configs).
+        im_patch_token (`int`, *optional*, defaults to 151859):
+            The token ID used to represent individual image patches in the multimodal input sequence (shared across vision/text sub-configs).
+
+    Example:
+
+    ```python
+    >>> from transformers import PPChart2TableConfig, PPChart2TableModel
+
+    >>> # Initializing a PPChart2Table configuration with default vision and text sub-configs
+    >>> configuration = PPChart2TableConfig()
+
+    >>> # Initializing a PPChart2Table configuration with custom vision and text sub-configs
+    >>> vision_config = {"img_size": 512, "patch_size": 8}
+    >>> text_config = {"hidden_size": 2048, "num_hidden_layers": 16}
+    >>> configuration = PPChart2TableConfig(vision_config=vision_config, text_config=text_config)
+
+    >>> # Initializing a model from the PPChart2Table configuration
+    >>> model = PPChart2TableModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    >>> # Accessing the vision sub-config
+    >>> vision_config = configuration.vision_config
+    >>> # Accessing the text sub-config
+    >>> text_config = configuration.text_config
+    """
+
+    model_type = "pp_chart2table"
+    sub_configs = {"vision_config": PPChart2TableVisionConfig, "text_config": PPChart2TableTextConfig}
+
+    def __init__(
+        self,
+        vision_config: dict | None = None,
+        text_config: dict | None = None,
+        im_start_token: int = 151857,
+        im_patch_token: int = 151859,
+        **kwargs,
+    ):
+        if vision_config is None:
+            vision_config = {}
+        self.vision_config = PPChart2TableVisionConfig(**vision_config)
+
+        if text_config is None:
+            text_config = {}
+        self.text_config = PPChart2TableTextConfig(**text_config)
+
+        self.model_type = "pp_chart2table"
+
+        self.im_start_token = im_start_token
+        self.im_patch_token = im_patch_token
+
+        text_config_keys = [
+            "attention_dropout",
+            "bos_token_id",
+            "eos_token_id",
+            "hidden_act",
+            "hidden_size",
+            "initializer_range",
+            "intermediate_size",
+            "max_position_embeddings",
+            "num_attention_heads",
+            "num_hidden_layers",
+            "num_key_value_heads",
+            "rms_norm_eps",
+            "rope_theta",
+            "sliding_window",
+            "tie_word_embeddings",
+            "dtype",
+            "use_cache",
+            "vocab_size",
+        ]
+        for key in text_config_keys:
+            if hasattr(self.text_config, key):
+                setattr(self, key, getattr(self.text_config, key))
+
+        super().__init__(**kwargs)
+
+
+__all__ = ["PPChart2TableConfig", "PPChart2TableVisionConfig", "PPChart2TableTextConfig"]
diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
new file mode 100644
index 000000000000..e83a49a99f1b
--- /dev/null
+++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
@@ -0,0 +1,161 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/pp_chart2table/modular_pp_chart2table.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_pp_chart2table.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+from typing import Optional, Union
+
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_processing_utils import BaseImageProcessor
+from transformers.image_transforms import flip_channel_order, resize, to_channel_dimension_format
+from transformers.image_utils import (
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    make_flat_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from transformers.processing_utils import TensorType
+from transformers.utils import filter_out_non_signature_kwargs
+
+
+class PPChart2TableImageProcessor(BaseImageProcessor):
+    r"""
+    Image processor for the PP-Chart2Table multimodal model, optimized for chart image preprocessing tasks.
+
+    This processor handles the complete preprocessing pipeline for chart images, including resizing, rescaling,
+    normalization, and channel dimension reordering, tailored to the input requirements of the PP-Chart2Table vision encoder.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the input images to the specified `size`.
+        size (`dict[str, int]`, *optional*, defaults to `{"height": 256, "width": 256}`):
+            Dictionary containing the target height and width for resizing. Format: `{"height": int, "width": int}`.
+        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+            Resampling filter to use when resizing images (e.g., BICUBIC, BILINEAR).
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the pixel values from the range [0, 255] to [0, 1] using `rescale_factor`.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Factor to apply for rescaling pixel values (e.g., 1/255 scales 0-255 to 0-1).
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the input images using `image_mean` and `image_std`.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `[0.406, 0.456, 0.485]`):
+            Mean values for image normalization (per channel, RGB order).
+        image_std (`float` or `list[float]`, *optional*, defaults to `[0.225, 0.224, 0.229]`):
+            Standard deviation values for image normalization (per channel, RGB order).
+        patch_size (`int`, *optional*, defaults to 16):
+            Size of image patches used by the PP-Chart2Table vision encoder (for alignment with model input).
+        merge_size (`int`, *optional*, defaults to 4):
+            Size factor for merging image patches (specific to PP-Chart2Table's vision processing pipeline).
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = [0.406, 0.456, 0.485],
+        image_std: Optional[Union[float, list[float]]] = [0.225, 0.224, 0.229],
+        patch_size: int = 16,
+        merge_size: int = 4,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 256, "width": 256}
+
+        self.do_resize = do_resize
+        self.size = size
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.resample = resample
+        self.patch_size = patch_size
+        self.merge_size = merge_size
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        size: Optional[dict[str, int]] = None,
+        do_resize: Optional[bool] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[Union[int, float]] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        return_tensors: Optional[Union[TensorType, str]] = None,
+        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> BatchFeature:
+        size = self.size if size is None else size
+        do_resize = self.do_resize if do_resize is None else do_resize
+        resample = self.resample if resample is None else resample
+        do_rescale = self.do_rescale if do_rescale is None else do_rescale
+        rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor
+        do_normalize = self.do_normalize if do_normalize is None else do_normalize
+        image_mean = self.image_mean if image_mean is None else image_mean
+        image_std = self.image_std if image_std is None else image_std
+
+        images = make_flat_list_of_images(images)
+
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            size=size,
+            do_resize=do_resize,
+            resample=resample,
+        )
+
+        if not valid_images(images):
+            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor")
+
+        # All transformations expect numpy arrays
+        images = [to_numpy_array(image) for image in images]
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        # transformations
+        resize_imgs = []
+        if do_resize:
+            for image in images:
+                img = resize(
+                    image,
+                    size=(size["height"], size["width"]),
+                    resample=resample,
+                    input_data_format=input_data_format,
+                )
+                resize_imgs.append(img)
+            images = resize_imgs
+
+        if do_rescale:
+            images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images]
+
+        if do_normalize:
+            images = [
+                self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
+            ]
+        images = [flip_channel_order(image, input_data_format=input_data_format) for image in images]
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]
+
+        encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+        return encoded_inputs
+
+
+__all__ = ["PPChart2TableImageProcessor"]
diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
new file mode 100644
index 000000000000..86a6cdb3a672
--- /dev/null
+++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
@@ -0,0 +1,95 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/pp_chart2table/modular_pp_chart2table.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_pp_chart2table.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+from typing import Optional, Union
+
+import torch
+from torchvision.transforms.v2.functional import InterpolationMode
+
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_processing_utils_fast import BaseImageProcessorFast
+from transformers.processing_utils import TensorType
+
+
+class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
+    r"""
+    Fast image processor for the PP-Chart2Table multimodal model, optimized for GPU-accelerated chart image preprocessing.
+
+    This high-performance processor implements a streamlined preprocessing pipeline for chart images (resizing, rescaling,
+    normalization, channel reordering) using PyTorch tensor operations, designed for efficient batch processing on GPUs.
+    It inherits from [`BaseImageProcessorFast`] and is optimized for inference/training pipelines requiring low-latency
+    image preprocessing.
+
+    Class Attributes (Default Configuration):
+        resample (`int`, defaults to 3):
+            Integer identifier for the resampling filter (3 = BICUBIC, compatible with `InterpolationMode.BICUBIC`).
+        image_mean (`list[float]`, defaults to `[0.40821073, 0.4578275, 0.48145466]`):
+            Per-channel mean values for image normalization (RGB order).
+        image_std (`list[float]`, defaults to `[0.27577711, 0.26130258, 0.26862954]`):
+            Per-channel standard deviation values for image normalization (RGB order).
+        size (`dict[str, int]`, defaults to `{"height": 1024, "width": 1024}`):
+            Default target size for image resizing (1024x1024, optimized for PP-Chart2Table vision encoder).
+        patch_size (`int`, defaults to 16):
+            Size of image patches used by the PP-Chart2Table vision encoder (for alignment with model input).
+        merge_size (`int`, defaults to 4):
+            Size factor for merging image patches (specific to PP-Chart2Table's vision processing pipeline).
+        do_resize (`bool`, defaults to `True`):
+            Default flag to enable image resizing.
+        do_rescale (`bool`, defaults to `True`):
+            Default flag to enable pixel value rescaling (from [0,255] to [0,1]).
+        do_normalize (`bool`, defaults to `True`):
+            Default flag to enable image normalization.
+    """
+
+    resample = 3
+    image_mean = [0.40821073, 0.4578275, 0.48145466]
+    image_std = [0.27577711, 0.26130258, 0.26862954]
+    size = {"height": 1024, "width": 1024}
+    patch_size = 16
+    merge_size = 4
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+
+    def _preprocess(
+        self,
+        images: list[torch.Tensor],
+        size: Optional[list[dict[str, int]]],
+        do_resize: bool,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        return_tensors: Optional[Union[str, TensorType]],
+        interpolation: Optional[InterpolationMode] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        data = {}
+        resize_imgs = []
+        if do_resize:
+            for image in images:
+                img = self.resize(image, size=size, interpolation=interpolation)
+                resize_imgs.append(img)
+            images = resize_imgs
+
+        processed_images = []
+        for image in images:
+            image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std)
+            processed_images.append(image)
+        images = processed_images
+
+        images = [image[[2, 1, 0], :, :] for image in images]
+        data.update({"pixel_values": torch.stack(images, dim=0)})
+        encoded_inputs = BatchFeature(data, tensor_type=return_tensors)
+
+        return encoded_inputs
+
+
+__all__ = ["PPChart2TableImageProcessorFast"]
diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
new file mode 100644
index 000000000000..6d95acc7eea5
--- /dev/null
+++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
@@ -0,0 +1,1369 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/pp_chart2table/modular_pp_chart2table.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_pp_chart2table.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+from collections.abc import Callable
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from transformers.cache_utils import Cache
+from transformers.generation import GenerationMixin
+from transformers.modeling_outputs import ModelOutput
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import can_return_tuple
+
+from ...activations import ACT2FN
+from ...cache_utils import DynamicCache
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
+from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring
+from ...utils.generic import check_model_inputs, maybe_autocast
+from .configuration_pp_chart2table import PPChart2TableConfig, PPChart2TableTextConfig, PPChart2TableVisionConfig
+
+
+class PPChart2TableVisionPatchEmbed(nn.Module):
+    r"""
+    Image to Patch Embedding layer for PP-Chart2Table vision encoder.
+
+    This module converts raw chart images (HWC format) into flattened patch embeddings via a 2D convolution,
+    followed by dimension permutation to align with the vision transformer's input format.
+
+    Args:
+        kernel_size (`tuple[int, int]`, *optional*, defaults to `(16, 16)`):
+            Size of the convolution kernel (patch size) for splitting images into patches.
+        stride (`tuple[int, int]`, *optional*, defaults to `(16, 16)`):
+            Stride of the convolution operation (matches patch size for non-overlapping patches).
+        padding (`tuple[int, int]`, *optional*, defaults to `(0, 0)`):
+            Padding applied to the input image before convolution (ensures patch alignment).
+        in_chans (`int`, *optional*, defaults to 3):
+            Number of input channels (3 for RGB chart images).
+        embed_dim (`int`, *optional*, defaults to 768):
+            Dimensionality of the output patch embeddings (hidden size of the vision transformer).
+
+    Shape:
+        - Input: `(B, C, H, W)` (batch size, channels, height, width)
+        - Output: `(B, H_out, W_out, C_out)` (batch size, patch height, patch width, embedding dim)
+    """
+
+    def __init__(
+        self,
+        kernel_size: tuple[int, int] = (16, 16),
+        stride: tuple[int, int] = (16, 16),
+        padding: tuple[int, int] = (0, 0),
+        in_chans: int = 3,
+        embed_dim: int = 768,
+    ) -> None:
+        super().__init__()
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.proj(hidden_states)
+        hidden_states = hidden_states.permute(0, 2, 3, 1)
+        return hidden_states
+
+
+class PPChart2TableVisionMLPBlock(nn.Module):
+    r"""
+    Multi-Layer Perceptron (MLP) block for PP-Chart2Table vision transformer layers.
+
+    Implements a two-layer feed-forward network with activation function, used in the vision transformer's
+    decoder layers to project features to a higher dimension and back.
+
+    Args:
+        embedding_dim (`int`):
+            Dimensionality of the input/output embeddings (hidden size of the transformer layer).
+        mlp_dim (`int`):
+            Dimensionality of the intermediate (hidden) layer in the MLP (typically 4x embedding_dim).
+        act (`Type[nn.Module]`, *optional*, defaults to `torch.nn.GELU`):
+            Non-linear activation function to apply between the two linear layers.
+
+    Shape:
+        - Input: `(B, H, W, embedding_dim)` or `(B, N, embedding_dim)` (N = H*W)
+        - Output: Same shape as input
+    """
+
+    def __init__(
+        self,
+        embedding_dim: int,
+        mlp_dim: int,
+        act: type[nn.Module] = torch.nn.GELU,
+    ) -> None:
+        super().__init__()
+        self.lin1 = nn.Linear(embedding_dim, mlp_dim)
+        self.lin2 = nn.Linear(mlp_dim, embedding_dim)
+        self.act = act()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return self.lin2(self.act(self.lin1(hidden_states)))
+
+
+class PPChart2TableVisionLayerNorm2d(nn.Module):
+    r"""
+    2D Layer Normalization for spatial feature maps (adapted for PP-Chart2Table vision encoder).
+
+    Applies layer normalization over the channel dimension of 2D feature maps, with learnable scale/bias parameters
+    broadcasted across spatial dimensions (height/width).
+
+    Args:
+        num_channels (`int`):
+            Number of channels in the input feature map (embedding dimension).
+        epsilon (`float`, *optional*, defaults to `1e-06`):
+            Small value added to variance to avoid division by zero.
+
+    Shape:
+        - Input: `(B, C, H, W)` (batch size, channels, height, width)
+        - Output: Same shape as input
+    """
+
+    def __init__(self, num_channels: int, epsilon: float = 1e-06) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(num_channels))
+        self.bias = nn.Parameter(torch.zeros(num_channels))
+        self.epsilon = epsilon
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        u = hidden_states.mean(dim=1, keepdim=True)
+        s = (hidden_states - u).pow(2).mean(dim=1, keepdim=True)
+        hidden_states = (hidden_states - u) / torch.sqrt(s + self.epsilon)
+        hidden_states = self.weight[:, None, None] * hidden_states + self.bias[:, None, None]
+        return hidden_states
+
+
+def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
+    r"""
+    Get relative positional embeddings for query and key sequences, with interpolation for mismatched sizes.
+
+    Args:
+        q_size (`int`):
+            Spatial size (height/width) of query feature map
+        k_size (`int`):
+            Spatial size (height/width) of key feature map
+        rel_pos (`torch.Tensor`):
+            Precomputed relative positional embeddings with shape [max_rel_dist_original, dim]
+
+    Returns:
+        `torch.Tensor`:
+            Interpolated relative positional embeddings for the query-key pair, shape [q_size, k_size, dim]
+    """
+    max_rel_dist = int(2 * max(q_size, k_size) - 1)
+    if rel_pos.shape[0] != max_rel_dist:
+        rel_pos_resized = F.interpolate(
+            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
+            size=max_rel_dist,
+            mode="linear",
+        )
+        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
+    else:
+        rel_pos_resized = rel_pos
+
+    q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
+    k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
+    relative_coords = q_coords - k_coords + (k_size - 1) * max(q_size / k_size, 1.0)
+    return rel_pos_resized[relative_coords.long()]
+
+
+def add_decomposed_rel_pos(
+    attn: torch.Tensor,
+    q: torch.Tensor,
+    rel_pos_h: torch.Tensor,
+    rel_pos_w: torch.Tensor,
+    q_size: tuple[int, int],
+    k_size: tuple[int, int],
+) -> torch.Tensor:
+    r"""
+    Add decomposed relative positional embeddings (height and width separately) to attention scores.
+
+    Args:
+        attn (`torch.Tensor`):
+            Attention scores with shape [B, q_h*q_w, k_h*k_w]
+        q (`torch.Tensor`):
+            Query tensor with shape [B, q_h*q_w, dim]
+        rel_pos_h (`torch.Tensor`):
+            Precomputed relative positional embeddings for height dimension
+        rel_pos_w (`torch.Tensor`):
+            Precomputed relative positional embeddings for width dimension
+        q_size (`tuple[int, int]`):
+            Spatial size (q_h, q_w) of query feature map
+        k_size (`tuple[int, int]`):
+            Spatial size (k_h, k_w) of key feature map
+
+    Returns:
+        `torch.Tensor`:
+            Attention scores with added relative positional embeddings, shape [B, q_h*q_w, k_h*k_w]
+    """
+    q_h, q_w = q_size
+    k_h, k_w = k_size
+    Rh = get_rel_pos(q_h, k_h, rel_pos_h)
+    Rw = get_rel_pos(q_w, k_w, rel_pos_w)
+
+    B, _, dim = q.shape
+    r_q = q.reshape(B, q_h, q_w, dim)
+    rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
+    rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
+
+    attn = (attn.reshape(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]).reshape(
+        B, q_h * q_w, k_h * k_w
+    )
+
+    return attn
+
+
+class PPChart2TableVisionAttention(nn.Module):
+    r"""
+    Multi-Head Self-Attention (MHSA) layer for PP-Chart2Table vision encoder, with optional relative positional encoding.
+
+    Implements standard multi-head attention with query/key/value projection, scaled dot-product attention,
+    and optional decomposed relative positional embeddings (height/width separate) for spatial awareness.
+
+    Args:
+        dim (`int`):
+            Dimensionality of the input embeddings (hidden size of the transformer layer).
+        num_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads (must divide `dim` evenly).
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add bias terms to the query/key/value projection layers.
+        use_rel_pos (`bool`, *optional*, defaults to `False`):
+            Whether to use relative positional encoding for spatial attention.
+        rel_pos_zero_init (`bool`, *optional*, defaults to `True`):
+            Whether to initialize relative positional embeddings to zero (stable training).
+        input_size (`Tuple[int, int]`, *optional*):
+            Spatial size (H, W) of the input feature map (required if `use_rel_pos=True`).
+
+    Shape:
+        - Input: `(B, H, W, dim)` (batch size, height, width, embedding dim)
+        - Output: Same shape as input
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = True,
+        use_rel_pos: bool = False,
+        rel_pos_zero_init: bool = True,
+        input_size: Optional[tuple[int, int]] = None,
+    ) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.proj = nn.Linear(dim, dim)
+
+        self.use_rel_pos = use_rel_pos
+        if self.use_rel_pos:
+            assert input_size is not None, "Input size must be provided if using relative positional encoding."
+            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
+            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        B, H, W, _ = hidden_states.shape
+        qkv = self.qkv(hidden_states).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(dim=0)
+        attn = (q * self.scale) @ k.transpose(1, 2)
+
+        if self.use_rel_pos:
+            attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))
+
+        attn = F.softmax(attn, dim=-1)
+        hidden_states = (attn @ v).reshape(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
+        hidden_states = self.proj(hidden_states)
+        return hidden_states
+
+
+def window_partition(hidden_states: torch.Tensor, window_size: int) -> tuple[torch.Tensor, tuple[int, int]]:
+    r"""
+    Partition 2D feature maps into non-overlapping windows, with padding to ensure dimensions are divisible by window size.
+
+    Args:
+        hidden_states (`torch.Tensor`):
+            Input feature map with shape [B, H, W, C], where:
+            - B: batch size
+            - H: height of feature map
+            - W: width of feature map
+            - C: channel dimension
+        window_size (`int`):
+            Size of each non-overlapping window (square window).
+
+    Returns:
+        tuple[torch.Tensor, tuple[int, int]]:
+            - windows: Partitioned windows with shape [num_windows * B, window_size, window_size, C],
+              where num_windows = (Hp // window_size) * (Wp // window_size)
+            - (Hp, Wp): Padded height and width of the feature map (after padding)
+    """
+    B, H, W, C = hidden_states.shape
+
+    pad_h = (window_size - H % window_size) % window_size
+    pad_w = (window_size - W % window_size) % window_size
+    if pad_h > 0 or pad_w > 0:
+        hidden_states = F.pad(hidden_states, (0, 0, 0, pad_w, 0, pad_h))
+    Hp, Wp = H + pad_h, W + pad_w
+
+    hidden_states = hidden_states.reshape(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
+    windows = hidden_states.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size, window_size, C)
+    return windows, (Hp, Wp)
+
+
+def window_unpartition(
+    windows: torch.Tensor,
+    window_size: int,
+    pad_hw: tuple[int, int],
+    hw: tuple[int, int],
+) -> torch.Tensor:
+    r"""
+    Reverse operation of window_partition: merge windows back to original 2D feature map shape, removing padding.
+
+    Args:
+        windows (`torch.Tensor`):
+            Partitioned windows with shape [num_windows * B, window_size, window_size, C]
+        window_size (`int`):
+            Size of each non-overlapping window (must match window_partition's window_size)
+        pad_hw (`tuple[int, int]`):
+            Padded height and width (Hp, Wp) returned by window_partition
+        hw (`tuple[int, int]`):
+            Original height and width (H, W) of feature map before padding
+
+    Returns:
+        `torch.Tensor`:
+            Reconstructed feature map with shape [B, H, W, C] (original dimensions before padding)
+    """
+    Hp, Wp = pad_hw
+    H, W = hw
+    B = windows.shape[0] // (Hp * Wp // window_size // window_size)
+    hidden_states = windows.reshape(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
+    hidden_states = hidden_states.permute(0, 1, 3, 2, 4, 5).reshape(B, Hp, Wp, -1)
+    if Hp > H or Wp > W:
+        hidden_states = hidden_states[:, :H, :W, :]
+    return hidden_states
+
+
+class PPChart2TableVisionDecoderLayer(nn.Module):
+    r"""
+    Single decoder layer of the PP-Chart2Table vision transformer, with optional windowed attention.
+
+    Implements the standard transformer decoder layer structure:
+    Layer Norm → Multi-Head Attention (with residual) → Layer Norm → MLP (with residual)
+    Supports windowed attention (SW-MHA) for large feature maps to reduce computation.
+
+    Args:
+        dim (`int`):
+            Dimensionality of the input embeddings (hidden size of the transformer layer).
+        num_heads (`int`):
+            Number of attention heads (passed to MHSA layer).
+        mlp_ratio (`float`, *optional*, defaults to 4.0):
+            Ratio of MLP hidden dimension to embedding dimension (mlp_dim = dim * mlp_ratio).
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in Q/K/V projection (passed to MHSA layer).
+        norm_layer (`Type[nn.Module]`, *optional*, defaults to `nn.LayerNorm`):
+            Normalization layer to use (LayerNorm for flattened patches, LayerNorm2d for 2D feature maps).
+        act_layer (`Type[nn.Module]`, *optional*, defaults to `nn.GELU`):
+            Activation function for MLP block.
+        use_rel_pos (`bool`, *optional*, defaults to `False`):
+            Whether to use relative positional encoding (passed to MHSA layer).
+        rel_pos_zero_init (`bool`, *optional*, defaults to `True`):
+            Whether to zero-initialize relative positional embeddings (passed to MHSA layer).
+        window_size (`int`, *optional*, defaults to 0):
+            Size of attention windows (0 = full attention, >0 = windowed attention).
+        input_size (`Tuple[int, int]`, *optional*):
+            Spatial size of input feature map (passed to MHSA layer for relative positional encoding).
+
+    Shape:
+        - Input: `(B, H, W, dim)` (batch size, height, width, embedding dim)
+        - Output: Same shape as input
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        norm_layer: type[nn.Module] = nn.LayerNorm,
+        act_layer: type[nn.Module] = nn.GELU,
+        use_rel_pos: bool = False,
+        rel_pos_zero_init: bool = True,
+        window_size: int = 0,
+        input_size: Optional[tuple[int, int]] = None,
+    ) -> None:
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = PPChart2TableVisionAttention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            use_rel_pos=use_rel_pos,
+            rel_pos_zero_init=rel_pos_zero_init,
+            input_size=input_size if window_size == 0 else (window_size, window_size),
+        )
+
+        self.norm2 = norm_layer(dim)
+        self.mlp = PPChart2TableVisionMLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer)
+
+        self.window_size = window_size
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        shortcut = hidden_states
+        hidden_states = self.norm1(hidden_states)
+        if self.window_size > 0:
+            H, W = hidden_states.shape[1], hidden_states.shape[2]
+            hidden_states, pad_hw = window_partition(hidden_states, self.window_size)
+        hidden_states = self.attn(hidden_states)
+
+        if self.window_size > 0:
+            hidden_states = window_unpartition(hidden_states, self.window_size, pad_hw, (H, W))
+        hidden_states = shortcut + hidden_states
+        hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
+        return hidden_states
+
+
+class PPChart2TableVisionPreTrainedModel(PreTrainedModel):
+    r"""
+    Base class for all PP-Chart2Table vision models, inheriting from Hugging Face `PreTrainedModel`.
+
+    This class sets up core configurations and compatibility flags for the vision encoder, including:
+    - Support for gradient checkpointing, attention backends (FlashAttention/SDPA), and model compilation
+    - Definition of non-splittable modules (for tensor parallelism)
+    - Output recording for hidden states/attentions (for debugging/analysis)
+
+    Class Attributes:
+        config (`PPChart2TableVisionConfig`):
+            Typed config class for PP-Chart2Table vision encoder (enforces type checking).
+        base_model_prefix (`str`, defaults to `"model"`):
+            Prefix for base model parameters (used in weight loading/saving).
+        supports_gradient_checkpointing (`bool`, defaults to `True`):
+            Whether the model supports gradient checkpointing to save memory.
+        _no_split_modules (`list[str]`):
+            Modules that should not be split across devices (tensor parallelism compatibility).
+        _skip_keys_device_placement (`list[str]`):
+            Keys to skip when placing tensors on devices (e.g., past key values for generation).
+        _supports_flash_attn / _supports_sdpa / _supports_flex_attn (`bool`):
+            Compatibility with optimized attention implementations (FlashAttention, SDPA, FlexAttention).
+        _can_compile_fullgraph (`bool`, defaults to `True`):
+            Whether the model supports TorchScript/TorchCompile full graph compilation.
+        _supports_attention_backend (`bool`, defaults to `True`):
+            Whether the model supports switching attention backends (e.g., PyTorch vs FlashAttention).
+        _can_record_outputs (`dict`):
+            Mapping of output types to modules for recording intermediate outputs (hidden_states/attentions).
+    """
+
+    config: PPChart2TableVisionConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["PPChart2TableVisionDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": PPChart2TableVisionDecoderLayer,
+        "attentions": PPChart2TableVisionAttention,
+    }
+
+
+class PPChart2TableVisionModel(PPChart2TableVisionPreTrainedModel):
+    main_input_name = "pixel_values"
+    input_modalities = "image"
+
+    def __init__(
+        self,
+        config: PPChart2TableVisionConfig,
+        in_chans: int = 3,
+        norm_layer: type[nn.Module] = nn.LayerNorm,
+        act_layer: type[nn.Module] = nn.GELU,
+        rel_pos_zero_init: bool = True,
+    ) -> None:
+        super().__init__(config)
+        self.img_size = config.img_size
+
+        self.patch_embed = PPChart2TableVisionPatchEmbed(
+            kernel_size=(config.patch_size, config.patch_size),
+            stride=(config.patch_size, config.patch_size),
+            in_chans=in_chans,
+            embed_dim=config.embed_dim,
+        )
+
+        self.pos_embed = nn.Parameter(
+            torch.zeros(
+                1, config.img_size // config.patch_size, config.img_size // config.patch_size, config.embed_dim
+            )
+        )
+
+        self.blocks = nn.ModuleList()
+        for i in range(config.depth):
+            block = PPChart2TableVisionDecoderLayer(
+                dim=config.embed_dim,
+                num_heads=config.num_heads,
+                mlp_ratio=config.mlp_ratio,
+                qkv_bias=config.qkv_bias,
+                norm_layer=norm_layer,
+                act_layer=act_layer,
+                use_rel_pos=config.use_rel_pos,
+                rel_pos_zero_init=rel_pos_zero_init,
+                window_size=config.window_size if i not in config.global_attn_indexes else 0,
+                input_size=(config.img_size // config.patch_size, config.img_size // config.patch_size),
+            )
+            self.blocks.append(block)
+
+        self.neck = nn.Sequential(
+            nn.Conv2d(
+                config.embed_dim,
+                config.out_chans,
+                kernel_size=1,
+                bias=False,
+            ),
+            PPChart2TableVisionLayerNorm2d(config.out_chans),
+            nn.Conv2d(
+                config.out_chans,
+                config.out_chans,
+                kernel_size=3,
+                padding=1,
+                bias=False,
+            ),
+            PPChart2TableVisionLayerNorm2d(config.out_chans),
+        )
+
+        self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False)
+        self.net_3 = nn.Conv2d(512, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False)
+
+        self.post_init()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.patch_embed(hidden_states)
+        hidden_states = hidden_states + self.pos_embed
+        for blk in self.blocks:
+            hidden_states = blk(hidden_states)
+        hidden_states = self.neck(hidden_states.permute(0, 3, 1, 2))
+        hidden_states = self.net_2(hidden_states)
+        hidden_states = self.net_3(hidden_states)
+        return hidden_states
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+@use_kernel_func_from_hub("rotary_pos_emb")
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+@use_kernelized_func(apply_rotary_pos_emb)
+class PPChart2TableTextAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: PPChart2TableTextConfig, layer_idx: int):
+        super().__init__()
+        self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=True)
+        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
+        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
+        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
+        self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            sliding_window=self.sliding_window,  # main diff with Llama
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class PPChart2TableTextMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class PPChart2TableTextRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps: float = 1e-6) -> None:
+        """
+        PPChart2TableTextRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class PPChart2TableTextDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: PPChart2TableTextConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = PPChart2TableTextAttention(config=config, layer_idx=layer_idx)
+
+        self.mlp = PPChart2TableTextMLP(config)
+        self.input_layernorm = PPChart2TableTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = PPChart2TableTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.attention_type = config.layer_types[layer_idx]
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+@auto_docstring
+class PPChart2TableTextPreTrainedModel(PreTrainedModel):
+    config: PPChart2TableTextConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["PPChart2TableTextDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": PPChart2TableTextDecoderLayer,
+        "attentions": PPChart2TableTextAttention,
+    }
+
+
+class PPChart2TableTextRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: PPChart2TableTextConfig, device=None):
+        super().__init__()
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+
+        self.rope_type = self.config.rope_parameters["rope_type"]
+        rope_init_fn: Callable = self.compute_default_rope_parameters
+        if self.rope_type != "default":
+            rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
+
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
+
+    @staticmethod
+    def compute_default_rope_parameters(
+        config: Optional[PPChart2TableTextConfig] = None,
+        device: Optional["torch.device"] = None,
+        seq_len: Optional[int] = None,
+    ) -> tuple["torch.Tensor", float]:
+        """
+        Computes the inverse frequencies according to the original RoPE implementation
+        Args:
+            config ([`~transformers.PreTrainedConfig`]):
+                The model configuration.
+            device (`torch.device`):
+                The device to use for initialization of the inverse frequencies.
+            seq_len (`int`, *optional*):
+                The current sequence length. Unused for this type of RoPE.
+        Returns:
+            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+        """
+        base = config.rope_parameters["rope_theta"]
+        dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
+
+        attention_factor = 1.0  # Unused in this type of RoPE
+
+        # Compute the inverse frequencies
+        inv_freq = 1.0 / (
+            base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
+        )
+        return inv_freq, attention_factor
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+@auto_docstring
+class PPChart2TableTextModel(PPChart2TableTextPreTrainedModel):
+    def __init__(self, config: PPChart2TableTextConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [PPChart2TableTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = PPChart2TableTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = PPChart2TableTextRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+        self.has_sliding_layers = "sliding_attention" in self.config.layer_types
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        # It may already have been prepared by e.g. `generate`
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            # Prepare mask arguments
+            mask_kwargs = {
+                "config": self.config,
+                "input_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+            # Create the masks
+            causal_mask_mapping = {
+                "full_attention": create_causal_mask(**mask_kwargs),
+            }
+            # The sliding window alternating layers are not always activated depending on the config
+            if self.has_sliding_layers:
+                causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)
+
+        hidden_states = inputs_embeds
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
+                position_embeddings=position_embeddings,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,
+        )
+
+
+@dataclass
+class PPChart2TableModelOutputWithPast(ModelOutput):
+    r"""
+    Output class for PPChart2Table multimodal model's forward pass, extending Hugging Face `ModelOutput`.
+
+    This dataclass encapsulates the core outputs of the PP-Chart2Table base model, including hidden states,
+    attention weights, and cached key/value pairs for efficient generation.
+
+    Attributes:
+        past_key_values (`Optional[Cache]`, defaults to `None`):
+            Cached attention key/value pairs from the text decoder (for fast autoregressive generation).
+        last_hidden_state (`Optional[torch.FloatTensor]`, defaults to `None`):
+            Final hidden states from the text decoder (shape: `[B, seq_len, hidden_size]`), after multimodal fusion.
+        hidden_states (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`):
+            Tuple of hidden states from each layer of the text decoder (for debugging/analysis).
+        attentions (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`):
+            Tuple of attention weights from each layer of the text decoder (for debugging/analysis).
+    """
+
+    past_key_values: Optional[Cache] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class PPChart2TableCausalLMOutputWithPast(ModelOutput):
+    r"""
+    Output class for PP-Chart2Table conditional generation model's forward pass.
+
+    Extends `PPChart2TableModelOutputWithPast` with language modeling logits (for token prediction),
+    tailored for autoregressive table generation tasks.
+
+    Attributes:
+        logits (`Optional[torch.FloatTensor]`, defaults to `None`):
+            Language modeling logits (shape: `[B, seq_len, vocab_size]`), output from the LM head.
+        past_key_values (`Optional[Cache]`, defaults to `None`):
+            Cached attention key/value pairs (inherited from base model output).
+        last_hidden_state (`Optional[torch.FloatTensor]`, defaults to `None`):
+            Final hidden states from the text decoder (inherited from base model output).
+        hidden_states (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`):
+            Tuple of decoder layer hidden states (inherited from base model output).
+        attentions (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`):
+            Tuple of decoder layer attention weights (inherited from base model output).
+    """
+
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Cache] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+class PPChart2TablePreTrainedModel(PreTrainedModel):
+    r"""
+    Base class for all PP-Chart2Table multimodal models, inheriting from Hugging Face `PreTrainedModel`.
+
+    This class defines core configurations and compatibility flags for the multimodal model (vision + text),
+    including support for gradient checkpointing, optimized attention backends, and model compilation.
+
+    Class Attributes:
+        config (`PPChart2TableConfig`):
+            Typed config class for PP-Chart2Table (combines vision + text sub-configs).
+        base_model_prefix (`str`, defaults to `"model"`):
+            Prefix for base model parameters (used in weight loading/saving).
+        supports_gradient_checkpointing (`bool`, defaults to `True`):
+            Whether the model supports gradient checkpointing to save memory during training.
+        _no_split_modules (`list[str]`):
+            Modules that should not be split across devices (tensor parallelism compatibility).
+        _skip_keys_device_placement (`list[str]`):
+            Keys to skip when placing tensors on devices (e.g., past key values for generation).
+        _supports_flash_attn / _supports_sdpa / _supports_flex_attn (`bool`):
+            Compatibility with optimized attention implementations (FlashAttention, SDPA, FlexAttention).
+        _can_compile_fullgraph (`bool`, defaults to `True`):
+            Whether the model supports TorchScript/TorchCompile full graph compilation.
+        _supports_attention_backend (`bool`, defaults to `True`):
+            Whether the model supports switching attention backends (e.g., PyTorch vs FlashAttention).
+        _can_record_outputs (`dict`):
+            Mapping of output types to modules for recording intermediate outputs (hidden_states/attentions).
+    """
+
+    config: PPChart2TableConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["PPChart2TableTextDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+
+    _can_record_outputs = {
+        "hidden_states": PPChart2TableTextDecoderLayer,
+        "attentions": PPChart2TableTextAttention,
+    }
+
+
+class PPChart2TableModel(PPChart2TablePreTrainedModel):
+    r"""
+    Core PP-Chart2Table multimodal model (vision encoder + text decoder) for chart-to-table parsing.
+
+    This model integrates a vision encoder (for chart image feature extraction) and a text decoder (for table generation),
+    with a multimodal projection layer to align vision features with text embedding space. The core logic is:
+    1. Extract chart features via vision encoder
+    2. Project vision features to text embedding dimension
+    3. Inject vision features into text decoder inputs (replace image placeholder tokens)
+    4. Forward pass through text decoder to generate table text
+
+    Args:
+        config (`PPChart2TableConfig`):
+            Combined configuration class (includes vision_config and text_config sub-configs).
+
+    Inputs (forward method):
+        input_ids (`torch.LongTensor`, optional):
+            Tokenized input text (including image placeholder tokens) with shape `[B, seq_len]`.
+        attention_mask (`torch.Tensor`, optional):
+            Attention mask to avoid padding tokens (shape: `[B, seq_len]`).
+        position_ids (`torch.Tensor`, optional):
+            Positional indices for input tokens (shape: `[B, seq_len]`).
+        past_key_values (`list[torch.Tensor]`, optional):
+            Cached key/value pairs for fast autoregressive generation.
+        inputs_embeds (`torch.Tensor`, optional):
+            Precomputed input embeddings (shape: `[B, seq_len, hidden_size]`; overrides `input_ids`).
+        use_cache (`bool`, optional):
+            Whether to cache key/value pairs for generation.
+        pixel_values (`torch.Tensor`, optional):
+            Preprocessed chart images (shape: `[B, 3, H, W]`; required for multimodal input).
+        cache_position (`torch.LongTensor`, optional):
+            Position indices for cached key/value pairs (for generation).
+        **kwargs:
+            Additional arguments passed to the text decoder.
+
+    Outputs:
+        `PPChart2TableModelOutputWithPast`:
+            Contains the text decoder's final hidden states, cached key/values, and optional intermediate outputs.
+    """
+
+    config_class = PPChart2TableConfig
+
+    def __init__(self, config: PPChart2TableConfig):
+        super().__init__(config)
+        self.vision_tower_high = PPChart2TableVisionModel._from_config(config.vision_config)
+        self.language_model = PPChart2TableTextModel._from_config(config.text_config)
+        self.mm_projector_vary = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Get input embeddings from the text decoder (for weight tying/loading)."""
+        return self.language_model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        """Set input embeddings for the text decoder (for weight tying/loading)."""
+        self.language_model.embed_tokens = value
+
+    def get_image_features(
+        self,
+        images: Optional[torch.Tensor],
+    ) -> list[torch.Tensor]:
+        r"""
+        Extract and project chart image features to text embedding space.
+
+        Args:
+            images (`torch.Tensor`):
+                Preprocessed chart images (shape: `[B, 3, H, W]`).
+
+        Returns:
+            `list[torch.Tensor]`:
+                List of projected image features (one per image), each with shape `[1, num_patches, text_hidden_size]`.
+        """
+        image_features = []
+        for image in images:
+            image = image.unsqueeze(0)
+            with torch.no_grad():
+                cnn_feature = self.vision_tower_high(image)
+                cnn_feature = cnn_feature.flatten(2).transpose(2, 1)
+            image_feature = self.mm_projector_vary(cnn_feature)
+            image_features.append(image_feature)
+
+        image_features = torch.stack(image_features, dim=0)
+
+        return image_features
+
+    def get_placeholder_mask(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        image_features: Optional[torch.FloatTensor] = None,
+    ) -> torch.BoolTensor:
+        r"""
+        Generate mask to locate image placeholder tokens in input embeddings.
+
+        This mask identifies the `<imgpad>` tokens in the input sequence, which will be replaced with
+        projected image features for multimodal fusion.
+
+        Args:
+            input_ids (`torch.LongTensor`, optional):
+                Tokenized input text (used if `inputs_embeds` is None).
+            inputs_embeds (`torch.FloatTensor`, optional):
+                Precomputed input embeddings (used if `input_ids` is None).
+            image_features (`torch.FloatTensor`):
+                Projected image features (used to validate token-feature count match).
+
+        Returns:
+            `torch.BoolTensor`:
+                Boolean mask (shape: `[B, seq_len, text_hidden_size]`) where `True` indicates image placeholder tokens.
+
+        Raises:
+            ValueError: If the number of image tokens does not match the number of image features.
+        """
+        if input_ids is None:
+            start_token_embed = self.get_input_embeddings()(
+                torch.tensor(self.config.im_start_token, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = inputs_embeds == start_token_embed
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.im_patch_token
+
+        n_image_tokens = special_image_mask.sum()
+
+        n_image_features = image_features.numel() // image_features.shape[-1]
+        if n_image_tokens != n_image_features:
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+
+        return special_image_mask
+
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[list[torch.Tensor]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ):
+        if inputs_embeds is None:
+            inputs_embeds = self.language_model.embed_tokens(input_ids)
+
+        if pixel_values is not None:
+            image_features = self.get_image_features(pixel_values)
+            image_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_features)
+
+        outputs = self.language_model(
+            input_ids=None,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        output = PPChart2TableModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+        return output
+
+
+class PPChart2TableForConditionalGeneration(PPChart2TablePreTrainedModel, GenerationMixin):
+    r"""
+    PP-Chart2Table model for conditional generation (table text generation from chart images),
+    extending the core model with a language modeling (LM) head and generation utilities.
+
+    This class integrates Hugging Face `GenerationMixin` to support standard generation methods (greedy, beam search, etc.),
+    and adds an LM head to predict token probabilities for autoregressive table generation.
+
+    Key Features:
+    - LM head for token prediction (weight tied to input embeddings)
+    - Optimized generation input preparation (avoids reprocessing images in subsequent steps)
+    - Inference-only mode (training not supported by default)
+
+    Args:
+        config (`PPChart2TableConfig`):
+            Combined configuration class (vision + text sub-configs).
+
+    Inputs (forward method):
+        Inherits all inputs from `PPChart2TableModel`, plus:
+        labels (`list[dict]`, optional):
+            Training labels (not supported; raises ValueError if provided).
+        logits_to_keep (`Union[int, torch.Tensor]`, defaults to 0):
+            Slice index to keep only the last N logits (optimizes generation efficiency).
+
+    Outputs:
+        `PPChart2TableCausalLMOutputWithPast`:
+            Contains LM logits, decoder hidden states, and cached key/value pairs.
+    """
+
+    _keys_to_ignore_on_load_missing = ["num_batches_tracked"]
+    _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"}
+
+    def __init__(self, config: PPChart2TableConfig):
+        super().__init__(config)
+        self.model = PPChart2TableModel(config)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.model.set_input_embeddings(value)
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        pixel_values=None,
+        pixel_values_videos=None,
+        image_grid_thw=None,
+        video_grid_thw=None,
+        is_first_iteration=False,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            pixel_values=pixel_values,
+            pixel_values_videos=pixel_values_videos,
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
+            use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
+            **kwargs,
+        )
+        if not is_first_iteration and use_cache:
+            model_inputs["pixel_values"] = None
+
+        return model_inputs
+
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[list[dict]] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        cache_position: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple[torch.FloatTensor], PPChart2TableCausalLMOutputWithPast]:
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            pixel_values=pixel_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            past_key_values=past_key_values,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = outputs.last_hidden_state
+        logits = self.lm_head(hidden_states)
+
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        if labels is not None:
+            raise ValueError(
+                "The PPChart2TableForConditionalGeneration model only supports inference, and training is not allowed!\n"
+                "If you need to train this model, please implement the corresponding loss calculation logic, or use the inference-only mode (do not pass the `labels` parameter)."
+            )
+
+        return PPChart2TableCausalLMOutputWithPast(
+            logits=logits,
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [
+    "PPChart2TableForConditionalGeneration",
+    "PPChart2TableModel",
+    "PPChart2TablePreTrainedModel",
+    "PPChart2TableTextPreTrainedModel",
+    "PPChart2TableTextModel",
+    "PPChart2TableVisionPreTrainedModel",
+    "PPChart2TableVisionModel",
+]
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
new file mode 100644
index 000000000000..fdf6e07fe0cd
--- /dev/null
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -0,0 +1,1609 @@
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision.transforms.v2.functional import InterpolationMode
+
+from transformers.cache_utils import Cache
+from transformers.configuration_utils import PreTrainedConfig, layer_type_validation
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.generation import GenerationMixin
+from transformers.image_processing_utils import BaseImageProcessor
+from transformers.image_processing_utils_fast import BaseImageProcessorFast
+from transformers.image_transforms import flip_channel_order, resize, to_channel_dimension_format
+from transformers.image_utils import (
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    make_flat_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from transformers.modeling_outputs import ModelOutput
+from transformers.modeling_rope_utils import RopeParameters
+from transformers.modeling_utils import PreTrainedModel
+from transformers.models.qwen2.modeling_qwen2 import Qwen2Attention, Qwen2DecoderLayer, Qwen2Model, Qwen2PreTrainedModel
+from transformers.processing_utils import ProcessorMixin, TensorType
+from transformers.utils import (
+    can_return_tuple,
+    filter_out_non_signature_kwargs,
+)
+
+
+class PPChart2TableVisionConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PPChart2TableVisionModel`]. It is used to instantiate a
+    PP-Chart2Table vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the vision encoder of the PP-Chart2Table
+    architecture developed by the PaddlePaddle team for chart-to-table parsing tasks.
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        im_patch_token (`int`, *optional*, defaults to 151859):
+            The token ID used to represent individual image patches in the multimodal input sequence.
+        im_start_token (`int`, *optional*, defaults to 151857):
+            The token ID representing the start of an image token sequence in the multimodal input.
+        depth (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the vision Transformer encoder.
+        embed_dim (`int`, *optional*, defaults to 768):
+            Dimensionality of the patch embedding layer in the vision encoder.
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the hidden layers in the vision Transformer encoder.
+        img_size (`int`, *optional*, defaults to 1024):
+            The size (resolution) of input chart images (assumed to be square).
+        mlp_ratio (`float`, *optional*, defaults to 4.0):
+            Ratio of the dimensionality of the feed-forward layer to the hidden size in the vision Transformer blocks.
+        num_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each self-attention layer in the vision Transformer encoder.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each image patch extracted from the input chart image.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add bias terms to the query, key, and value projection layers in the self-attention mechanism.
+        use_rel_pos (`bool`, *optional*, defaults to `True`):
+            Whether to use relative positional embeddings in the self-attention layers of the vision encoder.
+        global_attn_indexes (`list`, *optional*, defaults to `[2, 5, 8, 11]`):
+            List of layer indexes where global attention (instead of windowed attention) is applied in the vision encoder.
+        window_size (`int`, *optional*, defaults to 14):
+            The size of the attention window for windowed self-attention in the vision Transformer layers.
+        out_chans (`int`, *optional*, defaults to 256):
+            Number of output channels from the convolutional stem layer before patch embedding.
+
+    Example:
+
+    ```python
+    >>> from transformers import PPChart2TableVisionConfig, PPChart2TableVisionModel
+
+    >>> # Initializing a PPChart2TableVisionConfig with default PP-Chart2Table style configuration
+    >>> configuration = PPChart2TableVisionConfig()
+
+    >>> # Initializing a PPChart2TableVisionModel (with random weights) from the PP-Chart2Table style configuration
+    >>> model = PPChart2TableVisionModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    """
+
+    model_type = "pp_chart2table_vision"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        im_patch_token: int = 151859,
+        im_start_token: int = 151857,
+        depth: int = 12,
+        embed_dim: int = 768,
+        hidden_size: int = 1024,
+        img_size: int = 1024,
+        mlp_ratio: float = 4.0,
+        num_heads: int = 12,
+        patch_size: int = 16,
+        qkv_bias: bool = True,
+        use_rel_pos: bool = True,
+        global_attn_indexes: Optional[list] = None,
+        window_size: int = 14,
+        out_chans: int = 256,
+        **kwargs,
+    ):
+        self.im_patch_token = im_patch_token
+        self.im_start_token = im_start_token
+
+        self.depth = depth
+        self.embed_dim = embed_dim
+        self.hidden_size = hidden_size
+        self.img_size = img_size
+        self.mlp_ratio = mlp_ratio
+        self.num_heads = num_heads
+        self.patch_size = patch_size
+        self.qkv_bias = qkv_bias
+        self.use_rel_pos = use_rel_pos
+        self.global_attn_indexes = global_attn_indexes if global_attn_indexes is not None else [2, 5, 8, 11]
+        self.window_size = window_size
+        self.out_chans = out_chans
+
+        super().__init__(**kwargs)
+
+
+class PPChart2TableTextConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PPChart2TableTextModel`]. It is used to instantiate a
+    PP-Chart2Table text decoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the text encoder/decoder of the
+    PPChart2TableText-7B-beta [Qwen/PPChart2TableText-7B-beta](https://huggingface.co/Qwen/PPChart2TableText-7B-beta)
+    architecture, optimized for chart-to-table text generation tasks.
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities in self-attention layers.
+        bos_token_id (`int`, *optional*, defaults to 151643):
+            The token ID representing the beginning of a sequence (BOS) for text generation.
+        eos_token_id (`int`, *optional*, defaults to 151643):
+            The token ID representing the end of a sequence (EOS) for text generation.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the feed-forward and attention layers of the decoder.
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the hidden representations in the Transformer decoder layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        intermediate_size (`int`, *optional*, defaults to 2816):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer decoder blocks.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with for text input/output.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each self-attention layer in the Transformer decoder.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 16):
+            Number of key/value heads for implementing Grouped Query Attention (GQA). If equal to `num_attention_heads`,
+            Multi Head Attention (MHA) is used; if 1, Multi Query Attention (MQA) is used. For more details, see
+            [this paper](https://huggingface.co/papers/2305.13245).
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon value used by the RMS normalization layers to avoid division by zero.
+        rope_theta (`float`, *optional*, defaults to 1000000.0):
+            The base period of the RoPE (Rotary Position Embedding) embeddings, controlling the frequency of positional encoding.
+        rope_parameters (`RopeParameters` or `dict`, *optional*):
+            Configuration parameters for RoPE embeddings, including scaling parameters for longer sequence lengths beyond
+            `max_position_embeddings`.
+        sliding_window (`int`, *optional*, defaults to 32768):
+            Window size for Sliding Window Attention (SWA) in the decoder layers (only active if `use_sliding_window=True`).
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether the model's input and output word embeddings should be tied (shared weights).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether to return the last key/value attention states to speed up sequential decoding (only relevant for autoregressive
+            generation).
+        vocab_size (`int`, *optional*, defaults to 151860):
+            Vocabulary size of the PPChart2TableText model. Defines the number of distinct tokens that can be represented
+            by `input_ids`.
+        layer_types (`list[str]`, *optional*):
+            Attention pattern for each decoder layer (e.g., `"full_attention"` or `"sliding_attention"`). If not specified,
+            automatically determined by `sliding_window`.
+
+    Example:
+
+    ```python
+    >>> from transformers import PPChart2TableTextConfig, PPChart2TableTextModel
+
+    >>> # Initializing a PPChart2TableText style configuration
+    >>> configuration = PPChart2TableTextConfig()
+
+    >>> # Initializing a model from the PPChart2TableText-7B style configuration
+    >>> model = PPChart2TableTextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    """
+
+    model_type = "pp_chart2table_text"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    # Default tensor parallel plan for base model `PPChart2TableText`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    base_config_key = "text_config"
+
+    def __init__(
+        self,
+        attention_dropout: float = 0.0,
+        bos_token_id: int = 151643,
+        eos_token_id: int = 151643,
+        hidden_act: str = "silu",
+        hidden_size: int = 1024,
+        initializer_range: float = 0.02,
+        intermediate_size: int = 2816,
+        max_position_embeddings: int = 32768,
+        num_attention_heads: int = 16,
+        num_hidden_layers: int = 24,
+        num_key_value_heads: int = 16,
+        rms_norm_eps: float = 1e-06,
+        rope_theta: float = 1000000.0,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
+        sliding_window: int = 32768,
+        tie_word_embeddings: bool = True,
+        use_cache: bool = True,
+        vocab_size: int = 151860,
+        layer_types: Optional[list[str]] = None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+
+        self.attention_dropout = attention_dropout
+
+        self.layer_types = layer_types
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention" if self.sliding_window is not None else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types, self.num_hidden_layers)
+
+        self.rope_parameters = rope_parameters
+
+        self.rope_theta = rope_theta
+        self.tie_word_embeddings = tie_word_embeddings
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+class PPChart2TableConfig(PreTrainedConfig):
+    r"""
+    This is the main configuration class to store the configuration of a [`PPChart2TableModel`] or [`PPChart2TableForConditionalGeneration`].
+    It is used to instantiate a PP-Chart2Table multimodal model according to the specified arguments, defining the vision and text
+    sub-model architectures. This configuration class inherits from [`PreTrainedConfig`] and combines the configurations of:
+    - [`PPChart2TableVisionConfig`] (for the chart vision encoder)
+    - [`PPChart2TableTextConfig`] (for the table text decoder)
+    PP-Chart2Table [PaddlePaddle/PP-Chart2Table_safetensors](https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors).
+
+    Instantiating a `PPChart2TableConfig` with the defaults will yield a similar configuration to the base PP-Chart2Table model
+    developed by the PaddlePaddle team for chart-to-table parsing tasks.
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        vision_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`PPChart2TableVisionConfig`]. If `None`, the default
+            `PPChart2TableVisionConfig` configuration will be used.
+        text_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`PPChart2TableTextConfig`]. If `None`, the default
+            `PPChart2TableTextConfig` configuration will be used.
+        im_start_token (`int`, *optional*, defaults to 151857):
+            The token ID representing the start of an image token sequence in the multimodal input (shared across vision/text sub-configs).
+        im_patch_token (`int`, *optional*, defaults to 151859):
+            The token ID used to represent individual image patches in the multimodal input sequence (shared across vision/text sub-configs).
+
+    Example:
+
+    ```python
+    >>> from transformers import PPChart2TableConfig, PPChart2TableModel
+
+    >>> # Initializing a PPChart2Table configuration with default vision and text sub-configs
+    >>> configuration = PPChart2TableConfig()
+
+    >>> # Initializing a PPChart2Table configuration with custom vision and text sub-configs
+    >>> vision_config = {"img_size": 512, "patch_size": 8}
+    >>> text_config = {"hidden_size": 2048, "num_hidden_layers": 16}
+    >>> configuration = PPChart2TableConfig(vision_config=vision_config, text_config=text_config)
+
+    >>> # Initializing a model from the PPChart2Table configuration
+    >>> model = PPChart2TableModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    >>> # Accessing the vision sub-config
+    >>> vision_config = configuration.vision_config
+    >>> # Accessing the text sub-config
+    >>> text_config = configuration.text_config
+    """
+
+    model_type = "pp_chart2table"
+    sub_configs = {"vision_config": PPChart2TableVisionConfig, "text_config": PPChart2TableTextConfig}
+
+    def __init__(
+        self,
+        vision_config: dict | None = None,
+        text_config: dict | None = None,
+        im_start_token: int = 151857,
+        im_patch_token: int = 151859,
+        **kwargs,
+    ):
+        if vision_config is None:
+            vision_config = {}
+        self.vision_config = PPChart2TableVisionConfig(**vision_config)
+
+        if text_config is None:
+            text_config = {}
+        self.text_config = PPChart2TableTextConfig(**text_config)
+
+        self.model_type = "pp_chart2table"
+
+        self.im_start_token = im_start_token
+        self.im_patch_token = im_patch_token
+
+        text_config_keys = [
+            "attention_dropout",
+            "bos_token_id",
+            "eos_token_id",
+            "hidden_act",
+            "hidden_size",
+            "initializer_range",
+            "intermediate_size",
+            "max_position_embeddings",
+            "num_attention_heads",
+            "num_hidden_layers",
+            "num_key_value_heads",
+            "rms_norm_eps",
+            "rope_theta",
+            "sliding_window",
+            "tie_word_embeddings",
+            "dtype",
+            "use_cache",
+            "vocab_size",
+        ]
+        for key in text_config_keys:
+            if hasattr(self.text_config, key):
+                setattr(self, key, getattr(self.text_config, key))
+
+        super().__init__(**kwargs)
+
+
+class PPChart2TableImageProcessor(BaseImageProcessor):
+    r"""
+    Image processor for the PP-Chart2Table multimodal model, optimized for chart image preprocessing tasks.
+
+    This processor handles the complete preprocessing pipeline for chart images, including resizing, rescaling,
+    normalization, and channel dimension reordering, tailored to the input requirements of the PP-Chart2Table vision encoder.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the input images to the specified `size`.
+        size (`dict[str, int]`, *optional*, defaults to `{"height": 256, "width": 256}`):
+            Dictionary containing the target height and width for resizing. Format: `{"height": int, "width": int}`.
+        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+            Resampling filter to use when resizing images (e.g., BICUBIC, BILINEAR).
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the pixel values from the range [0, 255] to [0, 1] using `rescale_factor`.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Factor to apply for rescaling pixel values (e.g., 1/255 scales 0-255 to 0-1).
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the input images using `image_mean` and `image_std`.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `[0.406, 0.456, 0.485]`):
+            Mean values for image normalization (per channel, RGB order).
+        image_std (`float` or `list[float]`, *optional*, defaults to `[0.225, 0.224, 0.229]`):
+            Standard deviation values for image normalization (per channel, RGB order).
+        patch_size (`int`, *optional*, defaults to 16):
+            Size of image patches used by the PP-Chart2Table vision encoder (for alignment with model input).
+        merge_size (`int`, *optional*, defaults to 4):
+            Size factor for merging image patches (specific to PP-Chart2Table's vision processing pipeline).
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = [0.406, 0.456, 0.485],
+        image_std: Optional[Union[float, list[float]]] = [0.225, 0.224, 0.229],
+        patch_size: int = 16,
+        merge_size: int = 4,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 256, "width": 256}
+
+        self.do_resize = do_resize
+        self.size = size
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.resample = resample
+        self.patch_size = patch_size
+        self.merge_size = merge_size
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        size: Optional[dict[str, int]] = None,
+        do_resize: Optional[bool] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[Union[int, float]] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        return_tensors: Optional[Union[TensorType, str]] = None,
+        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> BatchFeature:
+        size = self.size if size is None else size
+        do_resize = self.do_resize if do_resize is None else do_resize
+        resample = self.resample if resample is None else resample
+        do_rescale = self.do_rescale if do_rescale is None else do_rescale
+        rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor
+        do_normalize = self.do_normalize if do_normalize is None else do_normalize
+        image_mean = self.image_mean if image_mean is None else image_mean
+        image_std = self.image_std if image_std is None else image_std
+
+        images = make_flat_list_of_images(images)
+
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            size=size,
+            do_resize=do_resize,
+            resample=resample,
+        )
+
+        if not valid_images(images):
+            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor")
+
+        # All transformations expect numpy arrays
+        images = [to_numpy_array(image) for image in images]
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        # transformations
+        resize_imgs = []
+        if do_resize:
+            for image in images:
+                img = resize(
+                    image,
+                    size=(size["height"], size["width"]),
+                    resample=resample,
+                    input_data_format=input_data_format,
+                )
+                resize_imgs.append(img)
+            images = resize_imgs
+
+        if do_rescale:
+            images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images]
+
+        if do_normalize:
+            images = [
+                self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
+            ]
+        images = [flip_channel_order(image, input_data_format=input_data_format) for image in images]
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]
+
+        encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+        return encoded_inputs
+
+
+class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
+    r"""
+    Fast image processor for the PP-Chart2Table multimodal model, optimized for GPU-accelerated chart image preprocessing.
+
+    This high-performance processor implements a streamlined preprocessing pipeline for chart images (resizing, rescaling,
+    normalization, channel reordering) using PyTorch tensor operations, designed for efficient batch processing on GPUs.
+    It inherits from [`BaseImageProcessorFast`] and is optimized for inference/training pipelines requiring low-latency
+    image preprocessing.
+
+    Class Attributes (Default Configuration):
+        resample (`int`, defaults to 3):
+            Integer identifier for the resampling filter (3 = BICUBIC, compatible with `InterpolationMode.BICUBIC`).
+        image_mean (`list[float]`, defaults to `[0.40821073, 0.4578275, 0.48145466]`):
+            Per-channel mean values for image normalization (RGB order).
+        image_std (`list[float]`, defaults to `[0.27577711, 0.26130258, 0.26862954]`):
+            Per-channel standard deviation values for image normalization (RGB order).
+        size (`dict[str, int]`, defaults to `{"height": 1024, "width": 1024}`):
+            Default target size for image resizing (1024x1024, optimized for PP-Chart2Table vision encoder).
+        patch_size (`int`, defaults to 16):
+            Size of image patches used by the PP-Chart2Table vision encoder (for alignment with model input).
+        merge_size (`int`, defaults to 4):
+            Size factor for merging image patches (specific to PP-Chart2Table's vision processing pipeline).
+        do_resize (`bool`, defaults to `True`):
+            Default flag to enable image resizing.
+        do_rescale (`bool`, defaults to `True`):
+            Default flag to enable pixel value rescaling (from [0,255] to [0,1]).
+        do_normalize (`bool`, defaults to `True`):
+            Default flag to enable image normalization.
+    """
+
+    resample = 3
+    image_mean = [0.40821073, 0.4578275, 0.48145466]
+    image_std = [0.27577711, 0.26130258, 0.26862954]
+    size = {"height": 1024, "width": 1024}
+    patch_size = 16
+    merge_size = 4
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+
+    def _preprocess(
+        self,
+        images: list[torch.Tensor],
+        size: Optional[list[dict[str, int]]],
+        do_resize: bool,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        return_tensors: Optional[Union[str, TensorType]],
+        interpolation: Optional[InterpolationMode] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        data = {}
+        resize_imgs = []
+        if do_resize:
+            for image in images:
+                img = self.resize(image, size=size, interpolation=interpolation)
+                resize_imgs.append(img)
+            images = resize_imgs
+
+        processed_images = []
+        for image in images:
+            image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std)
+            processed_images.append(image)
+        images = processed_images
+
+        images = [image[[2, 1, 0], :, :] for image in images]
+        data.update({"pixel_values": torch.stack(images, dim=0)})
+        encoded_inputs = BatchFeature(data, tensor_type=return_tensors)
+
+        return encoded_inputs
+
+
+class PPChart2TableProcessor(ProcessorMixin):
+    r"""
+    [`PPChart2TableProcessor`] offers all the functionalities of [`PPChart2TableImageProcessor`] and [`Qwen2Tokenizer`]. See the
+    [`~PPChart2TableProcessor.__call__`] and [`~PPChart2TableProcessor.decode`] for more information.
+    Args:
+        image_processor ([`PPChart2TableImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`Qwen2Tokenizer`], *optional*):
+            The tokenizer is a required input.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+    """
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    def __call__(
+        self,
+        images,
+        text=None,
+        **kwargs,
+    ) -> BatchFeature:
+        if images is not None:
+            image_inputs = self.image_processor(images=images, return_tensors="pt")
+        else:
+            image_inputs = {}
+        img_cnt = len(image_inputs)
+        b, c, h, w = image_inputs["pixel_values"].shape
+        num_patches = h // self.image_processor.patch_size // self.image_processor.merge_size
+        prompt = (
+            "<|im_start|>system\n"
+            "You should follow the instructions carefully and explain your answers in detail.<|im_end|><|im_start|>user\n"
+            "<img>" + "<imgpad>" * (num_patches * num_patches) + "</img>\n"
+            "Chart to table<|im_end|><|im_start|>assistant\n"
+        )
+        input_ids = torch.tensor(self.tokenizer([prompt]).input_ids)
+        input_ids = input_ids.repeat(img_cnt, 1)
+        input_ids = {"input_ids": input_ids}
+        return BatchFeature(data={**input_ids, **image_inputs})
+
+    def postprocess(self, model_pred, **kwargs):
+        return self.tokenizer.batch_decode(
+            model_pred[0],
+            skip_special_tokens=kwargs.get("skip_special_tokens", True),
+            clean_up_tokenization_spaces=False,
+        )
+
+
+def window_partition(hidden_states: torch.Tensor, window_size: int) -> tuple[torch.Tensor, tuple[int, int]]:
+    r"""
+    Partition 2D feature maps into non-overlapping windows, with padding to ensure dimensions are divisible by window size.
+
+    Args:
+        hidden_states (`torch.Tensor`):
+            Input feature map with shape [B, H, W, C], where:
+            - B: batch size
+            - H: height of feature map
+            - W: width of feature map
+            - C: channel dimension
+        window_size (`int`):
+            Size of each non-overlapping window (square window).
+
+    Returns:
+        tuple[torch.Tensor, tuple[int, int]]:
+            - windows: Partitioned windows with shape [num_windows * B, window_size, window_size, C],
+              where num_windows = (Hp // window_size) * (Wp // window_size)
+            - (Hp, Wp): Padded height and width of the feature map (after padding)
+    """
+    B, H, W, C = hidden_states.shape
+
+    pad_h = (window_size - H % window_size) % window_size
+    pad_w = (window_size - W % window_size) % window_size
+    if pad_h > 0 or pad_w > 0:
+        hidden_states = F.pad(hidden_states, (0, 0, 0, pad_w, 0, pad_h))
+    Hp, Wp = H + pad_h, W + pad_w
+
+    hidden_states = hidden_states.reshape(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
+    windows = hidden_states.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size, window_size, C)
+    return windows, (Hp, Wp)
+
+
+def window_unpartition(
+    windows: torch.Tensor,
+    window_size: int,
+    pad_hw: tuple[int, int],
+    hw: tuple[int, int],
+) -> torch.Tensor:
+    r"""
+    Reverse operation of window_partition: merge windows back to original 2D feature map shape, removing padding.
+
+    Args:
+        windows (`torch.Tensor`):
+            Partitioned windows with shape [num_windows * B, window_size, window_size, C]
+        window_size (`int`):
+            Size of each non-overlapping window (must match window_partition's window_size)
+        pad_hw (`tuple[int, int]`):
+            Padded height and width (Hp, Wp) returned by window_partition
+        hw (`tuple[int, int]`):
+            Original height and width (H, W) of feature map before padding
+
+    Returns:
+        `torch.Tensor`:
+            Reconstructed feature map with shape [B, H, W, C] (original dimensions before padding)
+    """
+    Hp, Wp = pad_hw
+    H, W = hw
+    B = windows.shape[0] // (Hp * Wp // window_size // window_size)
+    hidden_states = windows.reshape(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
+    hidden_states = hidden_states.permute(0, 1, 3, 2, 4, 5).reshape(B, Hp, Wp, -1)
+    if Hp > H or Wp > W:
+        hidden_states = hidden_states[:, :H, :W, :]
+    return hidden_states
+
+
+def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
+    r"""
+    Get relative positional embeddings for query and key sequences, with interpolation for mismatched sizes.
+
+    Args:
+        q_size (`int`):
+            Spatial size (height/width) of query feature map
+        k_size (`int`):
+            Spatial size (height/width) of key feature map
+        rel_pos (`torch.Tensor`):
+            Precomputed relative positional embeddings with shape [max_rel_dist_original, dim]
+
+    Returns:
+        `torch.Tensor`:
+            Interpolated relative positional embeddings for the query-key pair, shape [q_size, k_size, dim]
+    """
+    max_rel_dist = int(2 * max(q_size, k_size) - 1)
+    if rel_pos.shape[0] != max_rel_dist:
+        rel_pos_resized = F.interpolate(
+            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
+            size=max_rel_dist,
+            mode="linear",
+        )
+        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
+    else:
+        rel_pos_resized = rel_pos
+
+    q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
+    k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
+    relative_coords = q_coords - k_coords + (k_size - 1) * max(q_size / k_size, 1.0)
+    return rel_pos_resized[relative_coords.long()]
+
+
+def add_decomposed_rel_pos(
+    attn: torch.Tensor,
+    q: torch.Tensor,
+    rel_pos_h: torch.Tensor,
+    rel_pos_w: torch.Tensor,
+    q_size: tuple[int, int],
+    k_size: tuple[int, int],
+) -> torch.Tensor:
+    r"""
+    Add decomposed relative positional embeddings (height and width separately) to attention scores.
+
+    Args:
+        attn (`torch.Tensor`):
+            Attention scores with shape [B, q_h*q_w, k_h*k_w]
+        q (`torch.Tensor`):
+            Query tensor with shape [B, q_h*q_w, dim]
+        rel_pos_h (`torch.Tensor`):
+            Precomputed relative positional embeddings for height dimension
+        rel_pos_w (`torch.Tensor`):
+            Precomputed relative positional embeddings for width dimension
+        q_size (`tuple[int, int]`):
+            Spatial size (q_h, q_w) of query feature map
+        k_size (`tuple[int, int]`):
+            Spatial size (k_h, k_w) of key feature map
+
+    Returns:
+        `torch.Tensor`:
+            Attention scores with added relative positional embeddings, shape [B, q_h*q_w, k_h*k_w]
+    """
+    q_h, q_w = q_size
+    k_h, k_w = k_size
+    Rh = get_rel_pos(q_h, k_h, rel_pos_h)
+    Rw = get_rel_pos(q_w, k_w, rel_pos_w)
+
+    B, _, dim = q.shape
+    r_q = q.reshape(B, q_h, q_w, dim)
+    rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
+    rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
+
+    attn = (attn.reshape(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]).reshape(
+        B, q_h * q_w, k_h * k_w
+    )
+
+    return attn
+
+
+class PPChart2TableVisionPatchEmbed(nn.Module):
+    r"""
+    Image to Patch Embedding layer for PP-Chart2Table vision encoder.
+
+    This module converts raw chart images (HWC format) into flattened patch embeddings via a 2D convolution,
+    followed by dimension permutation to align with the vision transformer's input format.
+
+    Args:
+        kernel_size (`tuple[int, int]`, *optional*, defaults to `(16, 16)`):
+            Size of the convolution kernel (patch size) for splitting images into patches.
+        stride (`tuple[int, int]`, *optional*, defaults to `(16, 16)`):
+            Stride of the convolution operation (matches patch size for non-overlapping patches).
+        padding (`tuple[int, int]`, *optional*, defaults to `(0, 0)`):
+            Padding applied to the input image before convolution (ensures patch alignment).
+        in_chans (`int`, *optional*, defaults to 3):
+            Number of input channels (3 for RGB chart images).
+        embed_dim (`int`, *optional*, defaults to 768):
+            Dimensionality of the output patch embeddings (hidden size of the vision transformer).
+
+    Shape:
+        - Input: `(B, C, H, W)` (batch size, channels, height, width)
+        - Output: `(B, H_out, W_out, C_out)` (batch size, patch height, patch width, embedding dim)
+    """
+
+    def __init__(
+        self,
+        kernel_size: tuple[int, int] = (16, 16),
+        stride: tuple[int, int] = (16, 16),
+        padding: tuple[int, int] = (0, 0),
+        in_chans: int = 3,
+        embed_dim: int = 768,
+    ) -> None:
+        super().__init__()
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.proj(hidden_states)
+        hidden_states = hidden_states.permute(0, 2, 3, 1)
+        return hidden_states
+
+
+class PPChart2TableVisionMLPBlock(nn.Module):
+    r"""
+    Multi-Layer Perceptron (MLP) block for PP-Chart2Table vision transformer layers.
+
+    Implements a two-layer feed-forward network with activation function, used in the vision transformer's
+    decoder layers to project features to a higher dimension and back.
+
+    Args:
+        embedding_dim (`int`):
+            Dimensionality of the input/output embeddings (hidden size of the transformer layer).
+        mlp_dim (`int`):
+            Dimensionality of the intermediate (hidden) layer in the MLP (typically 4x embedding_dim).
+        act (`Type[nn.Module]`, *optional*, defaults to `torch.nn.GELU`):
+            Non-linear activation function to apply between the two linear layers.
+
+    Shape:
+        - Input: `(B, H, W, embedding_dim)` or `(B, N, embedding_dim)` (N = H*W)
+        - Output: Same shape as input
+    """
+
+    def __init__(
+        self,
+        embedding_dim: int,
+        mlp_dim: int,
+        act: type[nn.Module] = torch.nn.GELU,
+    ) -> None:
+        super().__init__()
+        self.lin1 = nn.Linear(embedding_dim, mlp_dim)
+        self.lin2 = nn.Linear(mlp_dim, embedding_dim)
+        self.act = act()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return self.lin2(self.act(self.lin1(hidden_states)))
+
+
+class PPChart2TableVisionLayerNorm2d(nn.Module):
+    r"""
+    2D Layer Normalization for spatial feature maps (adapted for PP-Chart2Table vision encoder).
+
+    Applies layer normalization over the channel dimension of 2D feature maps, with learnable scale/bias parameters
+    broadcasted across spatial dimensions (height/width).
+
+    Args:
+        num_channels (`int`):
+            Number of channels in the input feature map (embedding dimension).
+        epsilon (`float`, *optional*, defaults to `1e-06`):
+            Small value added to variance to avoid division by zero.
+
+    Shape:
+        - Input: `(B, C, H, W)` (batch size, channels, height, width)
+        - Output: Same shape as input
+    """
+
+    def __init__(self, num_channels: int, epsilon: float = 1e-06) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(num_channels))
+        self.bias = nn.Parameter(torch.zeros(num_channels))
+        self.epsilon = epsilon
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        u = hidden_states.mean(dim=1, keepdim=True)
+        s = (hidden_states - u).pow(2).mean(dim=1, keepdim=True)
+        hidden_states = (hidden_states - u) / torch.sqrt(s + self.epsilon)
+        hidden_states = self.weight[:, None, None] * hidden_states + self.bias[:, None, None]
+        return hidden_states
+
+
+class PPChart2TableVisionAttention(nn.Module):
+    r"""
+    Multi-Head Self-Attention (MHSA) layer for PP-Chart2Table vision encoder, with optional relative positional encoding.
+
+    Implements standard multi-head attention with query/key/value projection, scaled dot-product attention,
+    and optional decomposed relative positional embeddings (height/width separate) for spatial awareness.
+
+    Args:
+        dim (`int`):
+            Dimensionality of the input embeddings (hidden size of the transformer layer).
+        num_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads (must divide `dim` evenly).
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add bias terms to the query/key/value projection layers.
+        use_rel_pos (`bool`, *optional*, defaults to `False`):
+            Whether to use relative positional encoding for spatial attention.
+        rel_pos_zero_init (`bool`, *optional*, defaults to `True`):
+            Whether to initialize relative positional embeddings to zero (stable training).
+        input_size (`Tuple[int, int]`, *optional*):
+            Spatial size (H, W) of the input feature map (required if `use_rel_pos=True`).
+
+    Shape:
+        - Input: `(B, H, W, dim)` (batch size, height, width, embedding dim)
+        - Output: Same shape as input
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = True,
+        use_rel_pos: bool = False,
+        rel_pos_zero_init: bool = True,
+        input_size: Optional[tuple[int, int]] = None,
+    ) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.proj = nn.Linear(dim, dim)
+
+        self.use_rel_pos = use_rel_pos
+        if self.use_rel_pos:
+            assert input_size is not None, "Input size must be provided if using relative positional encoding."
+            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
+            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        B, H, W, _ = hidden_states.shape
+        qkv = self.qkv(hidden_states).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(dim=0)
+        attn = (q * self.scale) @ k.transpose(1, 2)
+
+        if self.use_rel_pos:
+            attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))
+
+        attn = F.softmax(attn, dim=-1)
+        hidden_states = (attn @ v).reshape(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
+        hidden_states = self.proj(hidden_states)
+        return hidden_states
+
+
+class PPChart2TableVisionDecoderLayer(nn.Module):
+    r"""
+    Single decoder layer of the PP-Chart2Table vision transformer, with optional windowed attention.
+
+    Implements the standard transformer decoder layer structure:
+    Layer Norm → Multi-Head Attention (with residual) → Layer Norm → MLP (with residual)
+    Supports windowed attention (SW-MHA) for large feature maps to reduce computation.
+
+    Args:
+        dim (`int`):
+            Dimensionality of the input embeddings (hidden size of the transformer layer).
+        num_heads (`int`):
+            Number of attention heads (passed to MHSA layer).
+        mlp_ratio (`float`, *optional*, defaults to 4.0):
+            Ratio of MLP hidden dimension to embedding dimension (mlp_dim = dim * mlp_ratio).
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in Q/K/V projection (passed to MHSA layer).
+        norm_layer (`Type[nn.Module]`, *optional*, defaults to `nn.LayerNorm`):
+            Normalization layer to use (LayerNorm for flattened patches, LayerNorm2d for 2D feature maps).
+        act_layer (`Type[nn.Module]`, *optional*, defaults to `nn.GELU`):
+            Activation function for MLP block.
+        use_rel_pos (`bool`, *optional*, defaults to `False`):
+            Whether to use relative positional encoding (passed to MHSA layer).
+        rel_pos_zero_init (`bool`, *optional*, defaults to `True`):
+            Whether to zero-initialize relative positional embeddings (passed to MHSA layer).
+        window_size (`int`, *optional*, defaults to 0):
+            Size of attention windows (0 = full attention, >0 = windowed attention).
+        input_size (`Tuple[int, int]`, *optional*):
+            Spatial size of input feature map (passed to MHSA layer for relative positional encoding).
+
+    Shape:
+        - Input: `(B, H, W, dim)` (batch size, height, width, embedding dim)
+        - Output: Same shape as input
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        norm_layer: type[nn.Module] = nn.LayerNorm,
+        act_layer: type[nn.Module] = nn.GELU,
+        use_rel_pos: bool = False,
+        rel_pos_zero_init: bool = True,
+        window_size: int = 0,
+        input_size: Optional[tuple[int, int]] = None,
+    ) -> None:
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = PPChart2TableVisionAttention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            use_rel_pos=use_rel_pos,
+            rel_pos_zero_init=rel_pos_zero_init,
+            input_size=input_size if window_size == 0 else (window_size, window_size),
+        )
+
+        self.norm2 = norm_layer(dim)
+        self.mlp = PPChart2TableVisionMLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer)
+
+        self.window_size = window_size
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        shortcut = hidden_states
+        hidden_states = self.norm1(hidden_states)
+        if self.window_size > 0:
+            H, W = hidden_states.shape[1], hidden_states.shape[2]
+            hidden_states, pad_hw = window_partition(hidden_states, self.window_size)
+        hidden_states = self.attn(hidden_states)
+
+        if self.window_size > 0:
+            hidden_states = window_unpartition(hidden_states, self.window_size, pad_hw, (H, W))
+        hidden_states = shortcut + hidden_states
+        hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
+        return hidden_states
+
+
+class PPChart2TableVisionPreTrainedModel(PreTrainedModel):
+    r"""
+    Base class for all PP-Chart2Table vision models, inheriting from Hugging Face `PreTrainedModel`.
+
+    This class sets up core configurations and compatibility flags for the vision encoder, including:
+    - Support for gradient checkpointing, attention backends (FlashAttention/SDPA), and model compilation
+    - Definition of non-splittable modules (for tensor parallelism)
+    - Output recording for hidden states/attentions (for debugging/analysis)
+
+    Class Attributes:
+        config (`PPChart2TableVisionConfig`):
+            Typed config class for PP-Chart2Table vision encoder (enforces type checking).
+        base_model_prefix (`str`, defaults to `"model"`):
+            Prefix for base model parameters (used in weight loading/saving).
+        supports_gradient_checkpointing (`bool`, defaults to `True`):
+            Whether the model supports gradient checkpointing to save memory.
+        _no_split_modules (`list[str]`):
+            Modules that should not be split across devices (tensor parallelism compatibility).
+        _skip_keys_device_placement (`list[str]`):
+            Keys to skip when placing tensors on devices (e.g., past key values for generation).
+        _supports_flash_attn / _supports_sdpa / _supports_flex_attn (`bool`):
+            Compatibility with optimized attention implementations (FlashAttention, SDPA, FlexAttention).
+        _can_compile_fullgraph (`bool`, defaults to `True`):
+            Whether the model supports TorchScript/TorchCompile full graph compilation.
+        _supports_attention_backend (`bool`, defaults to `True`):
+            Whether the model supports switching attention backends (e.g., PyTorch vs FlashAttention).
+        _can_record_outputs (`dict`):
+            Mapping of output types to modules for recording intermediate outputs (hidden_states/attentions).
+    """
+
+    config: PPChart2TableVisionConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["PPChart2TableVisionDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": PPChart2TableVisionDecoderLayer,
+        "attentions": PPChart2TableVisionAttention,
+    }
+
+
+class PPChart2TableVisionModel(PPChart2TableVisionPreTrainedModel):
+
+    main_input_name = "pixel_values"
+    input_modalities = "image"
+
+    def __init__(
+        self,
+        config: PPChart2TableVisionConfig,
+        in_chans: int = 3,
+        norm_layer: type[nn.Module] = nn.LayerNorm,
+        act_layer: type[nn.Module] = nn.GELU,
+        rel_pos_zero_init: bool = True,
+    ) -> None:
+        super().__init__(config)
+        self.img_size = config.img_size
+
+        self.patch_embed = PPChart2TableVisionPatchEmbed(
+            kernel_size=(config.patch_size, config.patch_size),
+            stride=(config.patch_size, config.patch_size),
+            in_chans=in_chans,
+            embed_dim=config.embed_dim,
+        )
+
+        self.pos_embed = nn.Parameter(
+            torch.zeros(
+                1, config.img_size // config.patch_size, config.img_size // config.patch_size, config.embed_dim
+            )
+        )
+
+        self.blocks = nn.ModuleList()
+        for i in range(config.depth):
+            block = PPChart2TableVisionDecoderLayer(
+                dim=config.embed_dim,
+                num_heads=config.num_heads,
+                mlp_ratio=config.mlp_ratio,
+                qkv_bias=config.qkv_bias,
+                norm_layer=norm_layer,
+                act_layer=act_layer,
+                use_rel_pos=config.use_rel_pos,
+                rel_pos_zero_init=rel_pos_zero_init,
+                window_size=config.window_size if i not in config.global_attn_indexes else 0,
+                input_size=(config.img_size // config.patch_size, config.img_size // config.patch_size),
+            )
+            self.blocks.append(block)
+
+        self.neck = nn.Sequential(
+            nn.Conv2d(
+                config.embed_dim,
+                config.out_chans,
+                kernel_size=1,
+                bias=False,
+            ),
+            PPChart2TableVisionLayerNorm2d(config.out_chans),
+            nn.Conv2d(
+                config.out_chans,
+                config.out_chans,
+                kernel_size=3,
+                padding=1,
+                bias=False,
+            ),
+            PPChart2TableVisionLayerNorm2d(config.out_chans),
+        )
+
+        self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False)
+        self.net_3 = nn.Conv2d(512, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False)
+
+        self.post_init()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.patch_embed(hidden_states)
+        hidden_states = hidden_states + self.pos_embed
+        for blk in self.blocks:
+            hidden_states = blk(hidden_states)
+        hidden_states = self.neck(hidden_states.permute(0, 3, 1, 2))
+        hidden_states = self.net_2(hidden_states)
+        hidden_states = self.net_3(hidden_states)
+        return hidden_states
+
+class PPChart2TableTextAttention(Qwen2Attention):
+    pass
+
+
+class PPChart2TableTextDecoderLayer(Qwen2DecoderLayer):
+    pass
+
+
+class PPChart2TableTextPreTrainedModel(Qwen2PreTrainedModel):
+    pass
+
+class PPChart2TableTextModel(Qwen2Model):
+    pass
+
+
+@dataclass
+class PPChart2TableModelOutputWithPast(ModelOutput):
+    r"""
+    Output class for PPChart2Table multimodal model's forward pass, extending Hugging Face `ModelOutput`.
+
+    This dataclass encapsulates the core outputs of the PP-Chart2Table base model, including hidden states,
+    attention weights, and cached key/value pairs for efficient generation.
+
+    Attributes:
+        past_key_values (`Optional[Cache]`, defaults to `None`):
+            Cached attention key/value pairs from the text decoder (for fast autoregressive generation).
+        last_hidden_state (`Optional[torch.FloatTensor]`, defaults to `None`):
+            Final hidden states from the text decoder (shape: `[B, seq_len, hidden_size]`), after multimodal fusion.
+        hidden_states (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`):
+            Tuple of hidden states from each layer of the text decoder (for debugging/analysis).
+        attentions (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`):
+            Tuple of attention weights from each layer of the text decoder (for debugging/analysis).
+    """
+
+    past_key_values: Optional[Cache] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class PPChart2TableCausalLMOutputWithPast(ModelOutput):
+    r"""
+    Output class for PP-Chart2Table conditional generation model's forward pass.
+
+    Extends `PPChart2TableModelOutputWithPast` with language modeling logits (for token prediction),
+    tailored for autoregressive table generation tasks.
+
+    Attributes:
+        logits (`Optional[torch.FloatTensor]`, defaults to `None`):
+            Language modeling logits (shape: `[B, seq_len, vocab_size]`), output from the LM head.
+        past_key_values (`Optional[Cache]`, defaults to `None`):
+            Cached attention key/value pairs (inherited from base model output).
+        last_hidden_state (`Optional[torch.FloatTensor]`, defaults to `None`):
+            Final hidden states from the text decoder (inherited from base model output).
+        hidden_states (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`):
+            Tuple of decoder layer hidden states (inherited from base model output).
+        attentions (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`):
+            Tuple of decoder layer attention weights (inherited from base model output).
+    """
+
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Cache] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+class PPChart2TablePreTrainedModel(PreTrainedModel):
+    r"""
+    Base class for all PP-Chart2Table multimodal models, inheriting from Hugging Face `PreTrainedModel`.
+
+    This class defines core configurations and compatibility flags for the multimodal model (vision + text),
+    including support for gradient checkpointing, optimized attention backends, and model compilation.
+
+    Class Attributes:
+        config (`PPChart2TableConfig`):
+            Typed config class for PP-Chart2Table (combines vision + text sub-configs).
+        base_model_prefix (`str`, defaults to `"model"`):
+            Prefix for base model parameters (used in weight loading/saving).
+        supports_gradient_checkpointing (`bool`, defaults to `True`):
+            Whether the model supports gradient checkpointing to save memory during training.
+        _no_split_modules (`list[str]`):
+            Modules that should not be split across devices (tensor parallelism compatibility).
+        _skip_keys_device_placement (`list[str]`):
+            Keys to skip when placing tensors on devices (e.g., past key values for generation).
+        _supports_flash_attn / _supports_sdpa / _supports_flex_attn (`bool`):
+            Compatibility with optimized attention implementations (FlashAttention, SDPA, FlexAttention).
+        _can_compile_fullgraph (`bool`, defaults to `True`):
+            Whether the model supports TorchScript/TorchCompile full graph compilation.
+        _supports_attention_backend (`bool`, defaults to `True`):
+            Whether the model supports switching attention backends (e.g., PyTorch vs FlashAttention).
+        _can_record_outputs (`dict`):
+            Mapping of output types to modules for recording intermediate outputs (hidden_states/attentions).
+    """
+
+    config: PPChart2TableConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["PPChart2TableTextDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+
+    _can_record_outputs = {
+        "hidden_states": PPChart2TableTextDecoderLayer,
+        "attentions": PPChart2TableTextAttention,
+    }
+
+
+class PPChart2TableModel(PPChart2TablePreTrainedModel):
+    r"""
+    Core PP-Chart2Table multimodal model (vision encoder + text decoder) for chart-to-table parsing.
+
+    This model integrates a vision encoder (for chart image feature extraction) and a text decoder (for table generation),
+    with a multimodal projection layer to align vision features with text embedding space. The core logic is:
+    1. Extract chart features via vision encoder
+    2. Project vision features to text embedding dimension
+    3. Inject vision features into text decoder inputs (replace image placeholder tokens)
+    4. Forward pass through text decoder to generate table text
+
+    Args:
+        config (`PPChart2TableConfig`):
+            Combined configuration class (includes vision_config and text_config sub-configs).
+
+    Inputs (forward method):
+        input_ids (`torch.LongTensor`, optional):
+            Tokenized input text (including image placeholder tokens) with shape `[B, seq_len]`.
+        attention_mask (`torch.Tensor`, optional):
+            Attention mask to avoid padding tokens (shape: `[B, seq_len]`).
+        position_ids (`torch.Tensor`, optional):
+            Positional indices for input tokens (shape: `[B, seq_len]`).
+        past_key_values (`list[torch.Tensor]`, optional):
+            Cached key/value pairs for fast autoregressive generation.
+        inputs_embeds (`torch.Tensor`, optional):
+            Precomputed input embeddings (shape: `[B, seq_len, hidden_size]`; overrides `input_ids`).
+        use_cache (`bool`, optional):
+            Whether to cache key/value pairs for generation.
+        pixel_values (`torch.Tensor`, optional):
+            Preprocessed chart images (shape: `[B, 3, H, W]`; required for multimodal input).
+        cache_position (`torch.LongTensor`, optional):
+            Position indices for cached key/value pairs (for generation).
+        **kwargs:
+            Additional arguments passed to the text decoder.
+
+    Outputs:
+        `PPChart2TableModelOutputWithPast`:
+            Contains the text decoder's final hidden states, cached key/values, and optional intermediate outputs.
+    """
+
+    config_class = PPChart2TableConfig
+
+    def __init__(self, config: PPChart2TableConfig):
+        super().__init__(config)
+        self.vision_tower_high = PPChart2TableVisionModel._from_config(config.vision_config)
+        self.language_model = PPChart2TableTextModel._from_config(config.text_config)
+        self.mm_projector_vary = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Get input embeddings from the text decoder (for weight tying/loading)."""
+        return self.language_model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        """Set input embeddings for the text decoder (for weight tying/loading)."""
+        self.language_model.embed_tokens = value
+
+    def get_image_features(
+        self,
+        images: Optional[torch.Tensor],
+    ) -> list[torch.Tensor]:
+        r"""
+        Extract and project chart image features to text embedding space.
+
+        Args:
+            images (`torch.Tensor`):
+                Preprocessed chart images (shape: `[B, 3, H, W]`).
+
+        Returns:
+            `list[torch.Tensor]`:
+                List of projected image features (one per image), each with shape `[1, num_patches, text_hidden_size]`.
+        """
+        image_features = []
+        for image in images:
+            image = image.unsqueeze(0)
+            with torch.no_grad():
+                cnn_feature = self.vision_tower_high(image)
+                cnn_feature = cnn_feature.flatten(2).transpose(2, 1)
+            image_feature = self.mm_projector_vary(cnn_feature)
+            image_features.append(image_feature)
+
+        image_features = torch.stack(image_features, dim=0)
+
+        return image_features
+
+    def get_placeholder_mask(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        image_features: Optional[torch.FloatTensor] = None,
+    ) -> torch.BoolTensor:
+        r"""
+        Generate mask to locate image placeholder tokens in input embeddings.
+
+        This mask identifies the `<imgpad>` tokens in the input sequence, which will be replaced with
+        projected image features for multimodal fusion.
+
+        Args:
+            input_ids (`torch.LongTensor`, optional):
+                Tokenized input text (used if `inputs_embeds` is None).
+            inputs_embeds (`torch.FloatTensor`, optional):
+                Precomputed input embeddings (used if `input_ids` is None).
+            image_features (`torch.FloatTensor`):
+                Projected image features (used to validate token-feature count match).
+
+        Returns:
+            `torch.BoolTensor`:
+                Boolean mask (shape: `[B, seq_len, text_hidden_size]`) where `True` indicates image placeholder tokens.
+
+        Raises:
+            ValueError: If the number of image tokens does not match the number of image features.
+        """
+        if input_ids is None:
+            start_token_embed = self.get_input_embeddings()(
+                torch.tensor(self.config.im_start_token, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = inputs_embeds == start_token_embed
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.im_patch_token
+
+        n_image_tokens = special_image_mask.sum()
+
+        n_image_features = image_features.numel() // image_features.shape[-1]
+        if n_image_tokens != n_image_features:
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+
+        return special_image_mask
+
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[list[torch.Tensor]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ):
+        if inputs_embeds is None:
+            inputs_embeds = self.language_model.embed_tokens(input_ids)
+
+        if pixel_values is not None:
+            image_features = self.get_image_features(pixel_values)
+            image_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_features)
+
+        outputs = self.language_model(
+            input_ids=None,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        output = PPChart2TableModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+        return output
+
+
+class PPChart2TableForConditionalGeneration(PPChart2TablePreTrainedModel, GenerationMixin):
+    r"""
+    PP-Chart2Table model for conditional generation (table text generation from chart images),
+    extending the core model with a language modeling (LM) head and generation utilities.
+
+    This class integrates Hugging Face `GenerationMixin` to support standard generation methods (greedy, beam search, etc.),
+    and adds an LM head to predict token probabilities for autoregressive table generation.
+
+    Key Features:
+    - LM head for token prediction (weight tied to input embeddings)
+    - Optimized generation input preparation (avoids reprocessing images in subsequent steps)
+    - Inference-only mode (training not supported by default)
+
+    Args:
+        config (`PPChart2TableConfig`):
+            Combined configuration class (vision + text sub-configs).
+
+    Inputs (forward method):
+        Inherits all inputs from `PPChart2TableModel`, plus:
+        labels (`list[dict]`, optional):
+            Training labels (not supported; raises ValueError if provided).
+        logits_to_keep (`Union[int, torch.Tensor]`, defaults to 0):
+            Slice index to keep only the last N logits (optimizes generation efficiency).
+
+    Outputs:
+        `PPChart2TableCausalLMOutputWithPast`:
+            Contains LM logits, decoder hidden states, and cached key/value pairs.
+    """
+
+    _keys_to_ignore_on_load_missing = ["num_batches_tracked"]
+    _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"}
+
+    def __init__(self, config: PPChart2TableConfig):
+        super().__init__(config)
+        self.model = PPChart2TableModel(config)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.model.set_input_embeddings(value)
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        pixel_values=None,
+        pixel_values_videos=None,
+        image_grid_thw=None,
+        video_grid_thw=None,
+        is_first_iteration=False,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            pixel_values=pixel_values,
+            pixel_values_videos=pixel_values_videos,
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
+            use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
+            **kwargs,
+        )
+        if not is_first_iteration and use_cache:
+            model_inputs["pixel_values"] = None
+
+        return model_inputs
+
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[list[dict]] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        cache_position: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple[torch.FloatTensor], PPChart2TableCausalLMOutputWithPast]:
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            pixel_values=pixel_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            past_key_values=past_key_values,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = outputs.last_hidden_state
+        logits = self.lm_head(hidden_states)
+
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        if labels is not None:
+            raise ValueError(
+                "The PPChart2TableForConditionalGeneration model only supports inference, and training is not allowed!\n"
+                "If you need to train this model, please implement the corresponding loss calculation logic, or use the inference-only mode (do not pass the `labels` parameter)."
+            )
+
+        return PPChart2TableCausalLMOutputWithPast(
+            logits=logits,
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [
+    "PPChart2TableForConditionalGeneration",
+    "PPChart2TableModel",
+    "PPChart2TablePreTrainedModel",
+    "PPChart2TableConfig",
+    "PPChart2TableTextPreTrainedModel",
+    "PPChart2TableTextModel",
+    "PPChart2TableVisionPreTrainedModel",
+    "PPChart2TableVisionModel",
+    "PPChart2TableVisionConfig",
+    "PPChart2TableTextConfig",
+    "PPChart2TableImageProcessor",
+    "PPChart2TableImageProcessorFast",
+    "PPChart2TableProcessor",
+]
diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
new file mode 100644
index 000000000000..7d27beef3dec
--- /dev/null
+++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
@@ -0,0 +1,65 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/pp_chart2table/modular_pp_chart2table.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_pp_chart2table.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+
+import torch
+
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.processing_utils import ProcessorMixin
+
+
+class PPChart2TableProcessor(ProcessorMixin):
+    r"""
+    [`PPChart2TableProcessor`] offers all the functionalities of [`PPChart2TableImageProcessor`] and [`Qwen2Tokenizer`]. See the
+    [`~PPChart2TableProcessor.__call__`] and [`~PPChart2TableProcessor.decode`] for more information.
+    Args:
+        image_processor ([`PPChart2TableImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`Qwen2Tokenizer`], *optional*):
+            The tokenizer is a required input.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+    """
+
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    def __call__(
+        self,
+        images,
+        text=None,
+        **kwargs,
+    ) -> BatchFeature:
+        if images is not None:
+            image_inputs = self.image_processor(images=images, return_tensors="pt")
+        else:
+            image_inputs = {}
+        img_cnt = len(image_inputs)
+        b, c, h, w = image_inputs["pixel_values"].shape
+        num_patches = h // self.image_processor.patch_size // self.image_processor.merge_size
+        prompt = (
+            "<|im_start|>system\n"
+            "You should follow the instructions carefully and explain your answers in detail.<|im_end|><|im_start|>user\n"
+            "<img>" + "<imgpad>" * (num_patches * num_patches) + "</img>\n"
+            "Chart to table<|im_end|><|im_start|>assistant\n"
+        )
+        input_ids = torch.tensor(self.tokenizer([prompt]).input_ids)
+        input_ids = input_ids.repeat(img_cnt, 1)
+        input_ids = {"input_ids": input_ids}
+        return BatchFeature(data={**input_ids, **image_inputs})
+
+    def postprocess(self, model_pred, **kwargs):
+        return self.tokenizer.batch_decode(
+            model_pred[0],
+            skip_special_tokens=kwargs.get("skip_special_tokens", True),
+            clean_up_tokenization_spaces=False,
+        )
+
+
+__all__ = ["PPChart2TableProcessor"]
diff --git a/tests/models/pp_chart2table/__init__.py b/tests/models/pp_chart2table/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
new file mode 100644
index 000000000000..1143253791fa
--- /dev/null
+++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
@@ -0,0 +1,391 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PPChart2Table model."""
+
+import gc
+import unittest
+
+import pytest
+from parameterized import parameterized
+from PIL import Image
+
+from transformers import (
+    AutoProcessor,
+    PPChart2TableConfig,
+    PPChart2TableForConditionalGeneration,
+    is_torch_available,
+)
+from transformers.testing_utils import (
+    backend_empty_cache,
+    require_torch,
+    slow,
+    torch_device,
+)
+
+from ...generation.test_utils import GenerationTesterMixin
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import (
+    ModelTesterMixin,
+)
+from ...test_pipeline_mixin import PipelineTesterMixin
+
+
+if is_torch_available():
+    import torch
+
+
+class PPChart2TableVisionText2TextModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=7,
+        seq_length=31,
+        num_channels=3,
+        image_height=64,
+        image_width=64,
+        text_config={
+            "hidden_size": 32,
+            "hidden_act": "silu",
+            "num_hidden_layers": 2,
+            "num_attention_heads": 4,
+            "num_key_value_heads": 2,
+            "intermediate_size": 32,
+            "attention_dropout": 0.0,
+            "sliding_window": 32768,
+            "rms_norm_eps": 1e-06,
+            "vocab_size": 151860,
+            "max_position_embeddings": 32768,
+            "rope_parameters": {"rope_theta": 1000000.0, "rope_type": "default"},
+        },
+        is_training=False,
+        vision_config={
+            "depth": 2,
+            "embed_dim": 768,
+            "hidden_size": 144,
+            "img_size": 64,
+            "mlp_ratio": 4.0,
+            "norm_layer_eps": 1e-6,
+            "num_heads": 4,
+            "patch_size": 16,
+            "qkv_bias": True,
+            "use_rel_pos": True,
+            "global_attn_indexes": [2, 5, 8, 11],
+            "window_size": 14,
+            "out_chans": 256,
+        },
+        bos_token_id=151643,
+        eos_token_id=151643,
+        im_start_token=151857,
+        im_end_token=151858,
+        im_patch_token=151859,
+    ):
+        self.parent = parent
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.num_hidden_layers = text_config["num_hidden_layers"]
+        self.num_attention_heads = text_config["num_attention_heads"]
+        self.hidden_size = text_config["hidden_size"]
+        self.im_start_token = im_start_token
+        self.im_end_token = im_end_token
+        self.im_patch_token = im_patch_token
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.num_channels = num_channels
+        self.image_height = image_height
+        self.image_width = image_width
+        self.is_training = is_training
+        self.vocab_size = text_config["vocab_size"]
+
+    def get_config(self):
+        return PPChart2TableConfig(
+            text_config=self.text_config,
+            vision_config=self.vision_config,
+        )
+
+    def prepare_config_and_inputs(self):
+        config = self.get_config()
+        pixel_values = torch.randn((1, 3, self.image_height, self.image_width)).to(torch_device)
+
+        num_patch = self.image_height // 16 // 4
+        input = (
+            [
+                151644,
+                8948,
+                198,
+                2610,
+                1265,
+                1795,
+                279,
+                11221,
+                15516,
+                323,
+                10339,
+                697,
+                11253,
+                304,
+                7716,
+                13,
+                151645,
+                151644,
+                872,
+                198,
+                151857,
+            ]
+            + [151859] * (num_patch * num_patch)
+            + [151858, 198, 14488, 311, 1965, 151645, 151644, 77091, 198]
+        )
+
+        input_ids = torch.tensor(input).unsqueeze(0).to(torch_device)
+
+        return config, pixel_values, input_ids
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, pixel_values, input_ids = config_and_inputs
+        attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
+        inputs_dict = {
+            "pixel_values": pixel_values,
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+        }
+        return config, inputs_dict
+
+
+@require_torch
+class PPChart2TableModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+    all_model_classes = (PPChart2TableForConditionalGeneration,) if is_torch_available() else ()
+    pipeline_model_mapping = {"image-text-to-text": PPChart2TableForConditionalGeneration}
+    _is_composite = True
+
+    def setUp(self):
+        self.model_tester = PPChart2TableVisionText2TextModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=PPChart2TableConfig, has_text_modality=False)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    @unittest.skip(reason="PPChart2Table does not support.")
+    def test_sliding_window_mask(self):
+        pass
+
+    @unittest.skip(reason="PPChart2Table does not support.")
+    def test_generate_compile_model_forward_fullgraph(self):
+        pass
+
+    @unittest.skip(reason="PPChart2Table does not support.")
+    def test_multi_gpu_data_parallel_forward(self):
+        pass
+
+    @pytest.mark.generate
+    @unittest.skip(reason="PPChart2Table does not support beam search.")
+    def test_beam_sample_generate(self):
+        pass
+
+    @pytest.mark.generate
+    @unittest.skip(reason="PPChart2Table does not support beam search.")
+    def test_beam_search_generate(self):
+        pass
+
+    @pytest.mark.generate
+    @unittest.skip(reason="PPChart2Table does not support beam search.")
+    def test_beam_search_generate_dict_output(self):
+        pass
+
+    @pytest.mark.generate
+    @unittest.skip(reason="PPChart2Table does not support beam search.")
+    def test_beam_search_generate_dict_outputs_use_cache(self):
+        pass
+
+    @pytest.mark.generate
+    @unittest.skip(reason="PPChart2Table does not support beam search.")
+    def test_beam_sample_generate_dict_output(self):
+        pass
+
+    @unittest.skip(reason="PPChart2Table needs to apply weight conversions.")
+    def test_can_load_from_already_mapped_keys(self):
+        pass
+
+    @pytest.mark.generate
+    @unittest.skip(reason="PPChart2Table does not support beam search.")
+    def test_generate_from_inputs_embeds_1_beam_search(self, _, num_beams):
+        pass
+
+    @parameterized.expand([("random",), ("same",)])
+    @pytest.mark.generate
+    @unittest.skip(reason="PPChart2Table does not support assisted decoding.")
+    def test_assisted_decoding_matches_greedy_search(self, assistant_type):
+        pass
+
+    @pytest.mark.generate
+    @unittest.skip(reason="PPChart2Table does not support assisted decoding.")
+    def test_assisted_decoding_sample(self):
+        pass
+
+    @unittest.skip("PPChart2Table does not support this test.")
+    def test_model_is_small(self):
+        pass
+
+
+@require_torch
+@slow
+class PPChart2TableIntegrationTest(unittest.TestCase):
+    def setUp(self):
+        self.processor = AutoProcessor.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table")
+
+    def tearDown(self):
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def test_small_model_integration_test(self):
+        model = PPChart2TableForConditionalGeneration.from_pretrained(
+            "/workspace/model_weight_torch/PP-Chart2Table", dtype="float32"
+        ).to("cuda")
+
+        image = Image.open(
+            "/workspace/PaddleX/paddlex/inference/models/doc_vlm/modeling/chart_parsing_02.png"
+        ).convert("RGB")
+        inputs = self.processor(images=image).to(model.device)
+        breakpoint()
+        expected_input_ids_length = 286
+        assert expected_input_ids_length == len(inputs.input_ids[0])
+
+        expected_input_ids = [151644, 8948, 198, 2610, 1265, 1795, 279, 11221, 15516, 323]
+
+        assert expected_input_ids == inputs.input_ids[0].tolist()[:10]
+
+        expected_pixel_slice = torch.tensor(
+            [
+                [1.0000, 1.0000, 1.0000],
+                [1.0000, 1.0000, 1.0000],
+                [0.9922, 0.9922, 0.9922],
+                [1.0000, 1.0000, 1.0000],
+                [1.0000, 1.0000, 1.0000],
+            ],
+            dtype=torch.float32,
+            device="cpu",
+        )
+
+        assert torch.allclose(expected_pixel_slice, inputs.pixel_values[:5, :, 0, 0], atol=3e-3)
+
+        # verify generation
+        inputs = inputs.to(torch_device)
+        output = model.generate(**inputs, max_new_tokens=30)
+        result = self.processor.decode(output[0][inputs["input_ids"].shape[-1] : -1])
+
+        EXPECTED_DECODED_TEXT = "生甘草"
+
+        self.assertEqual(
+            result,
+            EXPECTED_DECODED_TEXT,
+        )
+
+    # def test_small_model_integration_test_batch(self):
+    #     model = (
+    #         PPChart2TableForConditionalGeneration.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table", dtype="bfloat16")
+    #         .to(torch_device)
+    #         .eval()
+    #     )
+
+    #     image = Image.open("/workspace/PaddleX/paddlex/inference/models/doc_vlm/modeling/chart_parsing_02.png").convert("RGB")
+    #     inputs = self.processor(images=image).to(model.device)
+
+    #     output = model.generate(**inputs, max_new_tokens=256)
+    #     generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, output)]
+    #     result = self.processor.batch_decode(
+    #         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    #     )
+
+    #     EXPECTED_DECODED_TEXT = ["生甘草", "生甘草"]
+
+    #     self.assertEqual(
+    #         result,
+    #         EXPECTED_DECODED_TEXT,
+    #     )
+
+    # @require_flash_attn
+    # @require_torch_accelerator
+    # @pytest.mark.flash_attn_test
+    # def test_small_model_integration_test_flashatt2(self):
+    #     model = (
+    #         PPChart2TableForConditionalGeneration.from_pretrained(
+    #             "/workspace/model_weight_torch/PP-Chart2Table", dtype="bfloat16", attn_implementation="flash_attention_2"
+    #         )
+    #         .to(torch_device)
+    #         .eval()
+    #     )
+
+    #     image = Image.open("/workspace/PaddleX/paddlex/inference/models/doc_vlm/modeling/chart_parsing_02.png").convert("RGB")
+    #     inputs = self.processor(images=image).to(model.device)
+
+    #     expected_input_ids_length = 211
+    #     assert expected_input_ids_length == len(inputs.input_ids[0])
+
+    #     expected_input_ids = [100273, 2969, 93963, 93919, 101305, 100295, 100295, 100295, 100295, 100295]  # fmt: skip
+    #     assert expected_input_ids == inputs.input_ids[0].tolist()[:10]
+
+    #     expected_pixel_slice = torch.tensor(
+    #         [
+    #             [1.0000, 1.0000, 1.0000],
+    #             [1.0000, 1.0000, 1.0000],
+    #             [0.9922, 0.9922, 0.9922],
+    #             [1.0000, 1.0000, 1.0000],
+    #             [1.0000, 1.0000, 1.0000],
+    #         ],
+    #         dtype=torch.float32,
+    #         device="cpu",
+    #     )
+    #     assert torch.allclose(expected_pixel_slice, inputs.pixel_values[:5, :, 0, 0], atol=3e-3)
+
+    #     # verify generation
+    #     inputs = inputs.to(torch_device)
+    #     output = model.generate(**inputs, max_new_tokens=30)
+    #     result = self.processor.decode(output[0][inputs["input_ids"].shape[-1] : -1])
+
+    #     EXPECTED_DECODED_TEXT = "生甘草"
+
+    #     self.assertEqual(
+    #         result,
+    #         EXPECTED_DECODED_TEXT,
+    #     )
+
+    # @require_flash_attn
+    # @require_torch_accelerator
+    # @pytest.mark.flash_attn_test
+    # def test_small_model_integration_test_batch_flashatt2(self):
+    #     model = (
+    #         PPChart2TableForConditionalGeneration.from_pretrained(
+    #             "/workspace/model_weight_torch/PP-Chart2Table", dtype="bfloat16", attn_implementation="flash_attention_2"
+    #         )
+    #         .to(torch_device)
+    #         .eval()
+    #     )
+
+    #     image = Image.open("/workspace/PaddleX/paddlex/inference/models/doc_vlm/modeling/chart_parsing_02.png").convert("RGB")
+    #     inputs = self.processor(images=image).to(model.device)
+
+    #     # it should not matter whether two images are the same size or not
+    #     output = model.generate(**inputs, max_new_tokens=30)
+    #     generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, output)]
+    #     result = self.processor.batch_decode(
+    #         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    #     )
+
+    #     EXPECTED_DECODED_TEXT = ["生甘草", "生甘草"]
+
+    #     self.assertEqual(
+    #         result,
+    #         EXPECTED_DECODED_TEXT,
+    #     )
diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py
index 6b86f03e3927..0e4503e2877a 100644
--- a/utils/check_config_attributes.py
+++ b/utils/check_config_attributes.py
@@ -60,6 +60,7 @@
         "expert_layer_period",
     ],
     "PaddleOCRTextConfig": ["tie_word_embeddings"],
+    "PPChart2TableConfig": ["tie_word_embeddings"],
     "Qwen2Config": ["use_sliding_window", "max_window_layers"],
     "Qwen2MoeConfig": ["use_sliding_window", "max_window_layers"],
     "Qwen2VLTextConfig": ["use_sliding_window", "max_window_layers"],
diff --git a/utils/check_repo.py b/utils/check_repo.py
index f36cda07dc51..866a2bd7965b 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -162,6 +162,9 @@
         "PaddleOCRVisionModel",  # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration.
         "PaddleOCRVisionTransformer",  # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration.
         "PaddleOCRTextModel",  # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration.
+        "PPChart2TableModel",  # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration.
+        "PPChart2TableVisionModel",  # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration.
+        "PPChart2TableTextModel",  # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration.
         "Qwen2VLModel",  # Building part of bigger (tested) model. Tested implicitly through Qwen2VLForConditionalGeneration.
         "Qwen2_5_VLModel",  # Building part of bigger (tested) model. Tested implicitly through Qwen2_5_VLForConditionalGeneration.
         "Qwen3VLModel",  # Building part of bigger (tested) model. Tested implicitly through Qwen3VLForConditionalGeneration.
@@ -398,6 +401,9 @@
     "PaddleOCRVisionModel",  # Building part of bigger (tested) model
     "PaddleOCRVisionTransformer",  # Building part of bigger (tested) model
     "PaddleOCRTextModel",  # Building part of bigger (tested) model
+    "PPChart2TableModel",  # Building part of bigger (tested) model
+    "PPChart2TableVisionModel",  # Building part of bigger (tested) model
+    "PPChart2TableTextModel",  # Building part of bigger (tested) model
     "Qwen2_5OmniTalkerForConditionalGeneration",  # Building part of a bigger model
     "Qwen2_5OmniTalkerModel",  # Building part of a bigger model
     "Qwen2_5OmniThinkerForConditionalGeneration",  # Building part of a bigger model

From 1a5908db8f529dbc324bd00f03ce7a3bd3291922 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Mon, 9 Feb 2026 14:12:50 +0800
Subject: [PATCH 02/60] fix doc

---
 docs/source/en/model_doc/pp_chart2table.md | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md
index 5082c6f0adef..7ba3692f7f49 100644
--- a/docs/source/en/model_doc/pp_chart2table.md
+++ b/docs/source/en/model_doc/pp_chart2table.md
@@ -40,9 +40,15 @@ The example below demonstrates how to classify image with PP-Chart2Table using [
 ```py
 from transformers import pipeline
 from PIL import Image
-pipe = pipeline("image-text-to-text", model="PaddlePaddle/PP-Chart2Table_safetensors")
-
-result = pipe(images="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", do_sample=False, max_new_tokens=256)
+model_path = "PaddlePaddle/PP-Chart2Table_safetensors"
+pipe = pipeline("image-text-to-text", model=model_path)
+image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw)
+result = pipe(
+    images=image,
+    text="",
+    do_sample=False,
+    max_new_tokens=256
+)
 print(result)
 ```
 
@@ -83,9 +89,13 @@ from transformers import pipeline
 from PIL import Image
 model_path = "PaddlePaddle/PP-Chart2Table_safetensors"
 pipe = pipeline("image-text-to-text", model=model_path)
-
-image_path = "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png"
-result = pipe(images=[image_path, image_path], do_sample=False, max_new_tokens=256)
+image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw)
+result = pipe(
+    images=[image, image],
+    text="",
+    do_sample=False,
+    max_new_tokens=256
+)
 print(result)
 ```
 

From c51b1c6f399a8bd07ff0a04846248768fd6cabeb Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Tue, 24 Feb 2026 15:56:54 +0800
Subject: [PATCH 03/60] update

---
 .../configuration_pp_chart2table.py           |  24 +-
 .../pp_chart2table/modeling_pp_chart2table.py | 634 +++++++-----------
 .../pp_chart2table/modular_pp_chart2table.py  | 481 ++-----------
 .../processing_pp_chart2table.py              |   2 +-
 4 files changed, 327 insertions(+), 814 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
index 5e32dc30ef76..ee4ed2128161 100644
--- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
@@ -31,11 +31,11 @@ class PPChart2TableVisionConfig(PreTrainedConfig):
             Dimensionality of the patch embedding layer in the vision encoder.
         hidden_size (`int`, *optional*, defaults to 1024):
             Dimensionality of the hidden layers in the vision Transformer encoder.
-        img_size (`int`, *optional*, defaults to 1024):
+        image_size (`int`, *optional*, defaults to 1024):
             The size (resolution) of input chart images (assumed to be square).
         mlp_ratio (`float`, *optional*, defaults to 4.0):
             Ratio of the dimensionality of the feed-forward layer to the hidden size in the vision Transformer blocks.
-        num_heads (`int`, *optional*, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each self-attention layer in the vision Transformer encoder.
         patch_size (`int`, *optional*, defaults to 16):
             The size (resolution) of each image patch extracted from the input chart image.
@@ -47,7 +47,7 @@ class PPChart2TableVisionConfig(PreTrainedConfig):
             List of layer indexes where global attention (instead of windowed attention) is applied in the vision encoder.
         window_size (`int`, *optional*, defaults to 14):
             The size of the attention window for windowed self-attention in the vision Transformer layers.
-        out_chans (`int`, *optional*, defaults to 256):
+        output_channels (`int`, *optional*, defaults to 256):
             Number of output channels from the convolutional stem layer before patch embedding.
 
     Example:
@@ -75,15 +75,17 @@ def __init__(
         depth: int = 12,
         embed_dim: int = 768,
         hidden_size: int = 1024,
-        img_size: int = 1024,
+        num_channels: int = 3,
+        image_size: int = 1024,
         mlp_ratio: float = 4.0,
-        num_heads: int = 12,
+        num_attention_heads: int = 12,
         patch_size: int = 16,
         qkv_bias: bool = True,
         use_rel_pos: bool = True,
         global_attn_indexes: Optional[list] = None,
         window_size: int = 14,
-        out_chans: int = 256,
+        output_channels: int = 256,
+        attention_dropout: float = 0.0,
         **kwargs,
     ):
         self.im_patch_token = im_patch_token
@@ -92,15 +94,17 @@ def __init__(
         self.depth = depth
         self.embed_dim = embed_dim
         self.hidden_size = hidden_size
-        self.img_size = img_size
+        self.image_size = image_size
+        self.num_channels = num_channels
         self.mlp_ratio = mlp_ratio
-        self.num_heads = num_heads
+        self.num_attention_heads = num_attention_heads
         self.patch_size = patch_size
         self.qkv_bias = qkv_bias
         self.use_rel_pos = use_rel_pos
         self.global_attn_indexes = global_attn_indexes if global_attn_indexes is not None else [2, 5, 8, 11]
         self.window_size = window_size
-        self.out_chans = out_chans
+        self.output_channels = output_channels
+        self.attention_dropout = attention_dropout
 
         super().__init__(**kwargs)
 
@@ -295,7 +299,7 @@ class PPChart2TableConfig(PreTrainedConfig):
     >>> configuration = PPChart2TableConfig()
 
     >>> # Initializing a PPChart2Table configuration with custom vision and text sub-configs
-    >>> vision_config = {"img_size": 512, "patch_size": 8}
+    >>> vision_config = {"image_size": 512, "patch_size": 8}
     >>> text_config = {"hidden_size": 2048, "num_hidden_layers": 16}
     >>> configuration = PPChart2TableConfig(vision_config=vision_config, text_config=text_config)
 
diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
index 6d95acc7eea5..4b66b6b04afa 100644
--- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
@@ -4,6 +4,7 @@
 #             the file from the modular. If any change should be done, please apply the change to the
 #                          modular_pp_chart2table.py file directly. One of our CI enforces this.
 #                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+import collections
 from collections.abc import Callable
 from dataclasses import dataclass
 from typing import Optional, Union
@@ -34,398 +35,314 @@
 
 
 class PPChart2TableVisionPatchEmbed(nn.Module):
-    r"""
-    Image to Patch Embedding layer for PP-Chart2Table vision encoder.
+    """
+    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+    Transformer.
+    """
 
-    This module converts raw chart images (HWC format) into flattened patch embeddings via a 2D convolution,
-    followed by dimension permutation to align with the vision transformer's input format.
+    def __init__(self, config):
+        super().__init__()
+        image_size, patch_size = config.image_size, config.patch_size
+        num_channels, hidden_size = config.num_channels, config.embed_dim
+        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.num_patches = num_patches
+
+        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, pixel_values):
+        batch_size, num_channels, height, width = pixel_values.shape
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+        if height != self.image_size[0] or width != self.image_size[1]:
+            raise ValueError(
+                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
+            )
+        embeddings = self.projection(pixel_values).permute(0, 2, 3, 1)
+        return embeddings
 
-    Args:
-        kernel_size (`tuple[int, int]`, *optional*, defaults to `(16, 16)`):
-            Size of the convolution kernel (patch size) for splitting images into patches.
-        stride (`tuple[int, int]`, *optional*, defaults to `(16, 16)`):
-            Stride of the convolution operation (matches patch size for non-overlapping patches).
-        padding (`tuple[int, int]`, *optional*, defaults to `(0, 0)`):
-            Padding applied to the input image before convolution (ensures patch alignment).
-        in_chans (`int`, *optional*, defaults to 3):
-            Number of input channels (3 for RGB chart images).
-        embed_dim (`int`, *optional*, defaults to 768):
-            Dimensionality of the output patch embeddings (hidden size of the vision transformer).
-
-    Shape:
-        - Input: `(B, C, H, W)` (batch size, channels, height, width)
-        - Output: `(B, H_out, W_out, C_out)` (batch size, patch height, patch width, embedding dim)
-    """
 
-    def __init__(
-        self,
-        kernel_size: tuple[int, int] = (16, 16),
-        stride: tuple[int, int] = (16, 16),
-        padding: tuple[int, int] = (0, 0),
-        in_chans: int = 3,
-        embed_dim: int = 768,
-    ) -> None:
+class PPChart2TableVisionMLPBlock(nn.Module):
+    def __init__(self, config) -> None:
         super().__init__()
-        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding)
+        self.lin1 = nn.Linear(config.embed_dim, int(config.embed_dim * config.mlp_ratio))
+        self.lin2 = nn.Linear(int(config.embed_dim * config.mlp_ratio), config.embed_dim)
+        self.act = ACT2FN[config.hidden_act]
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        hidden_states = self.proj(hidden_states)
-        hidden_states = hidden_states.permute(0, 2, 3, 1)
+        hidden_states = self.lin1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.lin2(hidden_states)
         return hidden_states
 
 
-class PPChart2TableVisionMLPBlock(nn.Module):
-    r"""
-    Multi-Layer Perceptron (MLP) block for PP-Chart2Table vision transformer layers.
-
-    Implements a two-layer feed-forward network with activation function, used in the vision transformer's
-    decoder layers to project features to a higher dimension and back.
-
-    Args:
-        embedding_dim (`int`):
-            Dimensionality of the input/output embeddings (hidden size of the transformer layer).
-        mlp_dim (`int`):
-            Dimensionality of the intermediate (hidden) layer in the MLP (typically 4x embedding_dim).
-        act (`Type[nn.Module]`, *optional*, defaults to `torch.nn.GELU`):
-            Non-linear activation function to apply between the two linear layers.
-
-    Shape:
-        - Input: `(B, H, W, embedding_dim)` or `(B, N, embedding_dim)` (N = H*W)
-        - Output: Same shape as input
+class PPChart2TableVisionLayerNorm(nn.LayerNorm):
+    r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
+    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
+    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
     """
 
-    def __init__(
-        self,
-        embedding_dim: int,
-        mlp_dim: int,
-        act: type[nn.Module] = torch.nn.GELU,
-    ) -> None:
-        super().__init__()
-        self.lin1 = nn.Linear(embedding_dim, mlp_dim)
-        self.lin2 = nn.Linear(mlp_dim, embedding_dim)
-        self.act = act()
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        return self.lin2(self.act(self.lin1(hidden_states)))
-
+    def __init__(self, normalized_shape, *, eps=1e-6, data_format="channels_last", **kwargs):
+        super().__init__(normalized_shape, eps=eps, **kwargs)
+        if data_format not in ["channels_last", "channels_first"]:
+            raise NotImplementedError(f"Unsupported data format: {data_format}")
+        self.data_format = data_format
 
-class PPChart2TableVisionLayerNorm2d(nn.Module):
-    r"""
-    2D Layer Normalization for spatial feature maps (adapted for PP-Chart2Table vision encoder).
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels)
+        """
+        if self.data_format == "channels_first":
+            features = features.permute(0, 2, 3, 1)
+            features = super().forward(features)
+            features = features.permute(0, 3, 1, 2)
+        else:
+            features = super().forward(features)
+        return features
 
-    Applies layer normalization over the channel dimension of 2D feature maps, with learnable scale/bias parameters
-    broadcasted across spatial dimensions (height/width).
 
-    Args:
-        num_channels (`int`):
-            Number of channels in the input feature map (embedding dimension).
-        epsilon (`float`, *optional*, defaults to `1e-06`):
-            Small value added to variance to avoid division by zero.
-
-    Shape:
-        - Input: `(B, C, H, W)` (batch size, channels, height, width)
-        - Output: Same shape as input
-    """
+class PPChart2TableVisionAttention(nn.Module):
+    """Multi-head Attention block with relative position embeddings."""
 
-    def __init__(self, num_channels: int, epsilon: float = 1e-06) -> None:
+    def __init__(self, config, window_size):
         super().__init__()
-        self.weight = nn.Parameter(torch.ones(num_channels))
-        self.bias = nn.Parameter(torch.zeros(num_channels))
-        self.epsilon = epsilon
+        input_size = (
+            (config.image_size // config.patch_size, config.image_size // config.patch_size)
+            if window_size == 0
+            else (window_size, window_size)
+        )
 
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        u = hidden_states.mean(dim=1, keepdim=True)
-        s = (hidden_states - u).pow(2).mean(dim=1, keepdim=True)
-        hidden_states = (hidden_states - u) / torch.sqrt(s + self.epsilon)
-        hidden_states = self.weight[:, None, None] * hidden_states + self.bias[:, None, None]
-        return hidden_states
+        self.num_attention_heads = config.num_attention_heads
+        head_dim = config.embed_dim // config.num_attention_heads
+        self.scale = head_dim**-0.5
+        self.dropout = config.attention_dropout
+        self.qkv = nn.Linear(config.embed_dim, config.embed_dim * 3, bias=config.qkv_bias)
+        self.proj = nn.Linear(config.embed_dim, config.embed_dim)
 
+        self.use_rel_pos = config.use_rel_pos
+        if self.use_rel_pos:
+            if input_size is None:
+                raise ValueError("Input size must be provided if using relative positional encoding.")
 
-def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
-    r"""
-    Get relative positional embeddings for query and key sequences, with interpolation for mismatched sizes.
+            # initialize relative positional embeddings
+            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
+            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
 
-    Args:
-        q_size (`int`):
-            Spatial size (height/width) of query feature map
-        k_size (`int`):
-            Spatial size (height/width) of key feature map
-        rel_pos (`torch.Tensor`):
-            Precomputed relative positional embeddings with shape [max_rel_dist_original, dim]
+    def get_rel_pos(self, q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
+        """
+        Get relative positional embeddings according to the relative positions of
+            query and key sizes.
 
-    Returns:
-        `torch.Tensor`:
-            Interpolated relative positional embeddings for the query-key pair, shape [q_size, k_size, dim]
-    """
-    max_rel_dist = int(2 * max(q_size, k_size) - 1)
-    if rel_pos.shape[0] != max_rel_dist:
+        Args:
+            q_size (int):
+                size of the query.
+            k_size (int):
+                size of key k.
+            rel_pos (`torch.Tensor`):
+                relative position embeddings (L, channel).
+
+        Returns:
+            Extracted positional embeddings according to relative positions.
+        """
+        max_rel_dist = int(2 * max(q_size, k_size) - 1)
+        # Interpolate rel pos.
         rel_pos_resized = F.interpolate(
             rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
             size=max_rel_dist,
             mode="linear",
         )
         rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
-    else:
-        rel_pos_resized = rel_pos
-
-    q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
-    k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
-    relative_coords = q_coords - k_coords + (k_size - 1) * max(q_size / k_size, 1.0)
-    return rel_pos_resized[relative_coords.long()]
-
-
-def add_decomposed_rel_pos(
-    attn: torch.Tensor,
-    q: torch.Tensor,
-    rel_pos_h: torch.Tensor,
-    rel_pos_w: torch.Tensor,
-    q_size: tuple[int, int],
-    k_size: tuple[int, int],
-) -> torch.Tensor:
-    r"""
-    Add decomposed relative positional embeddings (height and width separately) to attention scores.
-
-    Args:
-        attn (`torch.Tensor`):
-            Attention scores with shape [B, q_h*q_w, k_h*k_w]
-        q (`torch.Tensor`):
-            Query tensor with shape [B, q_h*q_w, dim]
-        rel_pos_h (`torch.Tensor`):
-            Precomputed relative positional embeddings for height dimension
-        rel_pos_w (`torch.Tensor`):
-            Precomputed relative positional embeddings for width dimension
-        q_size (`tuple[int, int]`):
-            Spatial size (q_h, q_w) of query feature map
-        k_size (`tuple[int, int]`):
-            Spatial size (k_h, k_w) of key feature map
-
-    Returns:
-        `torch.Tensor`:
-            Attention scores with added relative positional embeddings, shape [B, q_h*q_w, k_h*k_w]
-    """
-    q_h, q_w = q_size
-    k_h, k_w = k_size
-    Rh = get_rel_pos(q_h, k_h, rel_pos_h)
-    Rw = get_rel_pos(q_w, k_w, rel_pos_w)
 
-    B, _, dim = q.shape
-    r_q = q.reshape(B, q_h, q_w, dim)
-    rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
-    rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
+        # Scale the coords with short length if shapes for q and k are different.
+        q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
+        k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
+        relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
 
-    attn = (attn.reshape(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]).reshape(
-        B, q_h * q_w, k_h * k_w
-    )
-
-    return attn
+        return rel_pos_resized[relative_coords.long()]
 
+    def get_decomposed_rel_pos(
+        self,
+        query: torch.Tensor,
+        rel_pos_h: torch.Tensor,
+        rel_pos_w: torch.Tensor,
+        q_size: tuple[int, int],
+        k_size: tuple[int, int],
+    ) -> torch.Tensor:
+        """
+        Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
+        https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py
 
-class PPChart2TableVisionAttention(nn.Module):
-    r"""
-    Multi-Head Self-Attention (MHSA) layer for PP-Chart2Table vision encoder, with optional relative positional encoding.
+        Args:
+            query (`torch.Tensor`):
+                query q in the attention layer with shape (batch_size, query_height * query_width, channel).
+            rel_pos_h (`torch.Tensor`):
+                relative position embeddings (Lh, channel) for height axis.
+            rel_pos_w (`torch.Tensor`):
+                relative position embeddings (Lw, channel) for width axis.
+            q_size (tuple):
+                spatial sequence size of query q with (query_height, query_width).
+            k_size (tuple):
+                spatial sequence size of key k with (key_height, key_width).
 
-    Implements standard multi-head attention with query/key/value projection, scaled dot-product attention,
-    and optional decomposed relative positional embeddings (height/width separate) for spatial awareness.
+        Returns:
+            decomposed_rel_pos (`torch.Tensor`):
+                decomposed relative position embeddings.
+        """
+        query_height, query_width = q_size
+        key_height, key_width = k_size
+        relative_position_height = self.get_rel_pos(query_height, key_height, rel_pos_h)
+        relative_position_width = self.get_rel_pos(query_width, key_width, rel_pos_w)
+
+        batch_size, _, dim = query.shape
+        reshaped_query = query.reshape(batch_size, query_height, query_width, dim)
+        rel_h = torch.einsum("bhwc,hkc->bhwk", reshaped_query, relative_position_height)
+        rel_w = torch.einsum("bhwc,wkc->bhwk", reshaped_query, relative_position_width)
+
+        decomposed_rel_pos = rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
+
+        return decomposed_rel_pos
+
+    def forward(self, hidden_states: torch.Tensor, output_attentions=None) -> tuple[torch.Tensor, torch.Tensor]:
+        batch_size, height, width, _ = hidden_states.shape
+        # qkv with shape (3, batch_size, nHead, height * width, channel)
+        qkv = (
+            self.qkv(hidden_states)
+            .reshape(batch_size, height * width, 3, self.num_attention_heads, -1)
+            .permute(2, 0, 3, 1, 4)
+        )
+        # q, k, v with shape (batch_size * nHead, height * width, channel)
+        query, key, value = qkv.reshape(3, batch_size * self.num_attention_heads, height * width, -1).unbind(0)
 
-    Args:
-        dim (`int`):
-            Dimensionality of the input embeddings (hidden size of the transformer layer).
-        num_heads (`int`, *optional*, defaults to 8):
-            Number of attention heads (must divide `dim` evenly).
-        qkv_bias (`bool`, *optional*, defaults to `True`):
-            Whether to add bias terms to the query/key/value projection layers.
-        use_rel_pos (`bool`, *optional*, defaults to `False`):
-            Whether to use relative positional encoding for spatial attention.
-        rel_pos_zero_init (`bool`, *optional*, defaults to `True`):
-            Whether to initialize relative positional embeddings to zero (stable training).
-        input_size (`Tuple[int, int]`, *optional*):
-            Spatial size (H, W) of the input feature map (required if `use_rel_pos=True`).
-
-    Shape:
-        - Input: `(B, H, W, dim)` (batch size, height, width, embedding dim)
-        - Output: Same shape as input
-    """
+        attn_weights = (query * self.scale) @ key.transpose(-2, -1)
 
-    def __init__(
-        self,
-        dim: int,
-        num_heads: int = 8,
-        qkv_bias: bool = True,
-        use_rel_pos: bool = False,
-        rel_pos_zero_init: bool = True,
-        input_size: Optional[tuple[int, int]] = None,
-    ) -> None:
-        super().__init__()
-        self.num_heads = num_heads
-        head_dim = dim // num_heads
-        self.scale = head_dim**-0.5
+        if self.use_rel_pos:
+            decomposed_rel_pos = self.get_decomposed_rel_pos(
+                query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
+            )
+            decomposed_rel_pos = decomposed_rel_pos.reshape_as(attn_weights)
+            attn_weights = attn_weights + decomposed_rel_pos
 
-        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
-        self.proj = nn.Linear(dim, dim)
+        attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype)
 
-        self.use_rel_pos = use_rel_pos
-        if self.use_rel_pos:
-            assert input_size is not None, "Input size must be provided if using relative positional encoding."
-            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
-            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
 
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        B, H, W, _ = hidden_states.shape
-        qkv = self.qkv(hidden_states).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
-        q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(dim=0)
-        attn = (q * self.scale) @ k.transpose(1, 2)
+        attn_output = (attn_probs @ value).reshape(batch_size, self.num_attention_heads, height, width, -1)
+        attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, height, width, -1)
 
-        if self.use_rel_pos:
-            attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))
+        attn_output = self.proj(attn_output)
+        return attn_output, attn_weights
 
-        attn = F.softmax(attn, dim=-1)
-        hidden_states = (attn @ v).reshape(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
-        hidden_states = self.proj(hidden_states)
-        return hidden_states
 
+class PPChart2TableVisionDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config, window_size) -> None:
+        super().__init__()
+        self.layer_norm1 = nn.LayerNorm(config.embed_dim)
+        self.attn = PPChart2TableVisionAttention(config, window_size=window_size)
 
-def window_partition(hidden_states: torch.Tensor, window_size: int) -> tuple[torch.Tensor, tuple[int, int]]:
-    r"""
-    Partition 2D feature maps into non-overlapping windows, with padding to ensure dimensions are divisible by window size.
+        self.layer_norm2 = nn.LayerNorm(config.embed_dim)
+        self.mlp = PPChart2TableVisionMLPBlock(config)
+        self.window_size = window_size
 
-    Args:
-        hidden_states (`torch.Tensor`):
-            Input feature map with shape [B, H, W, C], where:
-            - B: batch size
-            - H: height of feature map
-            - W: width of feature map
-            - C: channel dimension
-        window_size (`int`):
-            Size of each non-overlapping window (square window).
+    def window_partition(self, hidden_states: torch.Tensor, window_size: int) -> tuple[torch.Tensor, tuple[int, int]]:
+        """
+        Args:
+        Partition into non-overlapping windows with padding if needed.
+            hidden_states (tensor): input tokens with [batch_size, height, width, channel]. window_size (int): window
+            size.
 
-    Returns:
-        tuple[torch.Tensor, tuple[int, int]]:
-            - windows: Partitioned windows with shape [num_windows * B, window_size, window_size, C],
-              where num_windows = (Hp // window_size) * (Wp // window_size)
-            - (Hp, Wp): Padded height and width of the feature map (after padding)
-    """
-    B, H, W, C = hidden_states.shape
+        Returns:
+            windows: windows after partition with [batch_size * num_windows, window_size, window_size, channel].
+            (pad_height, pad_width): padded height and width before partition
+        """
+        batch_size, height, width, channel = hidden_states.shape
 
-    pad_h = (window_size - H % window_size) % window_size
-    pad_w = (window_size - W % window_size) % window_size
-    if pad_h > 0 or pad_w > 0:
+        pad_h = (window_size - height % window_size) % window_size
+        pad_w = (window_size - width % window_size) % window_size
         hidden_states = F.pad(hidden_states, (0, 0, 0, pad_w, 0, pad_h))
-    Hp, Wp = H + pad_h, W + pad_w
-
-    hidden_states = hidden_states.reshape(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
-    windows = hidden_states.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size, window_size, C)
-    return windows, (Hp, Wp)
+        pad_height, pad_width = height + pad_h, width + pad_w
 
+        hidden_states = hidden_states.reshape(
+            batch_size, pad_height // window_size, window_size, pad_width // window_size, window_size, channel
+        )
+        windows = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().reshape(-1, window_size, window_size, channel)
+        return windows, (pad_height, pad_width)
 
-def window_unpartition(
-    windows: torch.Tensor,
-    window_size: int,
-    pad_hw: tuple[int, int],
-    hw: tuple[int, int],
-) -> torch.Tensor:
-    r"""
-    Reverse operation of window_partition: merge windows back to original 2D feature map shape, removing padding.
+    def window_unpartition(
+        self, windows: torch.Tensor, window_size: int, padding_shape: tuple[int, int], original_shape: tuple[int, int]
+    ) -> torch.Tensor:
+        """
+        Args:
+        Window unpartition into original sequences and removing padding.
+            hidden_states (tensor):
+                input tokens with [batch_size * num_windows, window_size, window_size, channel].
+            window_size (int):
+                window size.
+            padding_shape (Tuple):
+                padded height and width (pad_height, pad_width).
+            original_shape (Tuple): original height and width (height, width) before padding.
 
-    Args:
-        windows (`torch.Tensor`):
-            Partitioned windows with shape [num_windows * B, window_size, window_size, C]
-        window_size (`int`):
-            Size of each non-overlapping window (must match window_partition's window_size)
-        pad_hw (`tuple[int, int]`):
-            Padded height and width (Hp, Wp) returned by window_partition
-        hw (`tuple[int, int]`):
-            Original height and width (H, W) of feature map before padding
+        Returns:
+            hidden_states: unpartitioned sequences with [batch_size, height, width, channel].
+        """
+        pad_height, pad_width = padding_shape
+        height, width = original_shape
+        batch_size = windows.shape[0] // (pad_height * pad_width // window_size // window_size)
+        hidden_states = windows.reshape(
+            batch_size, pad_height // window_size, pad_width // window_size, window_size, window_size, -1
+        )
+        hidden_states = (
+            hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().reshape(batch_size, pad_height, pad_width, -1)
+        )
 
-    Returns:
-        `torch.Tensor`:
-            Reconstructed feature map with shape [B, H, W, C] (original dimensions before padding)
-    """
-    Hp, Wp = pad_hw
-    H, W = hw
-    B = windows.shape[0] // (Hp * Wp // window_size // window_size)
-    hidden_states = windows.reshape(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
-    hidden_states = hidden_states.permute(0, 1, 3, 2, 4, 5).reshape(B, Hp, Wp, -1)
-    if Hp > H or Wp > W:
-        hidden_states = hidden_states[:, :H, :W, :]
-    return hidden_states
+        hidden_states = hidden_states[:, :height, :width, :].contiguous()
+        return hidden_states
 
+    def forward(self, hidden_states: torch.Tensor) -> tuple[torch.FloatTensor]:
+        residual = hidden_states
+        hidden_states = self.layer_norm1(hidden_states)
+        # Window partition
+        if self.window_size > 0:
+            height, width = hidden_states.shape[1], hidden_states.shape[2]
+            hidden_states, padding_shape = self.window_partition(hidden_states, self.window_size)
 
-class PPChart2TableVisionDecoderLayer(nn.Module):
-    r"""
-    Single decoder layer of the PP-Chart2Table vision transformer, with optional windowed attention.
+        hidden_states, attn_weights = self.attn(
+            hidden_states=hidden_states,
+        )
+        # Reverse window partition
+        if self.window_size > 0:
+            hidden_states = self.window_unpartition(hidden_states, self.window_size, padding_shape, (height, width))
 
-    Implements the standard transformer decoder layer structure:
-    Layer Norm → Multi-Head Attention (with residual) → Layer Norm → MLP (with residual)
-    Supports windowed attention (SW-MHA) for large feature maps to reduce computation.
+        hidden_states = residual + hidden_states
+        layernorm_output = self.layer_norm2(hidden_states)
+        hidden_states = hidden_states + self.mlp(layernorm_output)
+        return hidden_states
 
-    Args:
-        dim (`int`):
-            Dimensionality of the input embeddings (hidden size of the transformer layer).
-        num_heads (`int`):
-            Number of attention heads (passed to MHSA layer).
-        mlp_ratio (`float`, *optional*, defaults to 4.0):
-            Ratio of MLP hidden dimension to embedding dimension (mlp_dim = dim * mlp_ratio).
-        qkv_bias (`bool`, *optional*, defaults to `True`):
-            Whether to use bias in Q/K/V projection (passed to MHSA layer).
-        norm_layer (`Type[nn.Module]`, *optional*, defaults to `nn.LayerNorm`):
-            Normalization layer to use (LayerNorm for flattened patches, LayerNorm2d for 2D feature maps).
-        act_layer (`Type[nn.Module]`, *optional*, defaults to `nn.GELU`):
-            Activation function for MLP block.
-        use_rel_pos (`bool`, *optional*, defaults to `False`):
-            Whether to use relative positional encoding (passed to MHSA layer).
-        rel_pos_zero_init (`bool`, *optional*, defaults to `True`):
-            Whether to zero-initialize relative positional embeddings (passed to MHSA layer).
-        window_size (`int`, *optional*, defaults to 0):
-            Size of attention windows (0 = full attention, >0 = windowed attention).
-        input_size (`Tuple[int, int]`, *optional*):
-            Spatial size of input feature map (passed to MHSA layer for relative positional encoding).
-
-    Shape:
-        - Input: `(B, H, W, dim)` (batch size, height, width, embedding dim)
-        - Output: Same shape as input
-    """
 
-    def __init__(
-        self,
-        dim: int,
-        num_heads: int,
-        mlp_ratio: float = 4.0,
-        qkv_bias: bool = True,
-        norm_layer: type[nn.Module] = nn.LayerNorm,
-        act_layer: type[nn.Module] = nn.GELU,
-        use_rel_pos: bool = False,
-        rel_pos_zero_init: bool = True,
-        window_size: int = 0,
-        input_size: Optional[tuple[int, int]] = None,
-    ) -> None:
+class PPChart2TableVisionNeck(nn.Module):
+    def __init__(self, config: PPChart2TableVisionConfig):
         super().__init__()
-        self.norm1 = norm_layer(dim)
-        self.attn = PPChart2TableVisionAttention(
-            dim,
-            num_heads=num_heads,
-            qkv_bias=qkv_bias,
-            use_rel_pos=use_rel_pos,
-            rel_pos_zero_init=rel_pos_zero_init,
-            input_size=input_size if window_size == 0 else (window_size, window_size),
-        )
-
-        self.norm2 = norm_layer(dim)
-        self.mlp = PPChart2TableVisionMLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer)
+        self.config = config
 
-        self.window_size = window_size
+        self.conv1 = nn.Conv2d(config.embed_dim, config.output_channels, kernel_size=1, bias=False)
+        self.layer_norm1 = PPChart2TableVisionLayerNorm(config.output_channels, data_format="channels_first")
+        self.conv2 = nn.Conv2d(config.output_channels, config.output_channels, kernel_size=3, padding=1, bias=False)
+        self.layer_norm2 = PPChart2TableVisionLayerNorm(config.output_channels, data_format="channels_first")
 
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        shortcut = hidden_states
-        hidden_states = self.norm1(hidden_states)
-        if self.window_size > 0:
-            H, W = hidden_states.shape[1], hidden_states.shape[2]
-            hidden_states, pad_hw = window_partition(hidden_states, self.window_size)
-        hidden_states = self.attn(hidden_states)
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.permute(0, 3, 1, 2)
+        hidden_states = self.conv1(hidden_states)
+        hidden_states = self.layer_norm1(hidden_states)
 
-        if self.window_size > 0:
-            hidden_states = window_unpartition(hidden_states, self.window_size, pad_hw, (H, W))
-        hidden_states = shortcut + hidden_states
-        hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
+        hidden_states = self.conv2(hidden_states)
+        hidden_states = self.layer_norm2(hidden_states)
         return hidden_states
 
 
@@ -483,60 +400,27 @@ class PPChart2TableVisionModel(PPChart2TableVisionPreTrainedModel):
     def __init__(
         self,
         config: PPChart2TableVisionConfig,
-        in_chans: int = 3,
-        norm_layer: type[nn.Module] = nn.LayerNorm,
-        act_layer: type[nn.Module] = nn.GELU,
-        rel_pos_zero_init: bool = True,
     ) -> None:
         super().__init__(config)
-        self.img_size = config.img_size
+        self.image_size = config.image_size
 
-        self.patch_embed = PPChart2TableVisionPatchEmbed(
-            kernel_size=(config.patch_size, config.patch_size),
-            stride=(config.patch_size, config.patch_size),
-            in_chans=in_chans,
-            embed_dim=config.embed_dim,
-        )
+        self.patch_embed = PPChart2TableVisionPatchEmbed(config)
 
         self.pos_embed = nn.Parameter(
             torch.zeros(
-                1, config.img_size // config.patch_size, config.img_size // config.patch_size, config.embed_dim
+                1, config.image_size // config.patch_size, config.image_size // config.patch_size, config.embed_dim
             )
         )
 
         self.blocks = nn.ModuleList()
         for i in range(config.depth):
             block = PPChart2TableVisionDecoderLayer(
-                dim=config.embed_dim,
-                num_heads=config.num_heads,
-                mlp_ratio=config.mlp_ratio,
-                qkv_bias=config.qkv_bias,
-                norm_layer=norm_layer,
-                act_layer=act_layer,
-                use_rel_pos=config.use_rel_pos,
-                rel_pos_zero_init=rel_pos_zero_init,
+                config,
                 window_size=config.window_size if i not in config.global_attn_indexes else 0,
-                input_size=(config.img_size // config.patch_size, config.img_size // config.patch_size),
             )
             self.blocks.append(block)
 
-        self.neck = nn.Sequential(
-            nn.Conv2d(
-                config.embed_dim,
-                config.out_chans,
-                kernel_size=1,
-                bias=False,
-            ),
-            PPChart2TableVisionLayerNorm2d(config.out_chans),
-            nn.Conv2d(
-                config.out_chans,
-                config.out_chans,
-                kernel_size=3,
-                padding=1,
-                bias=False,
-            ),
-            PPChart2TableVisionLayerNorm2d(config.out_chans),
-        )
+        self.neck = PPChart2TableVisionNeck(config)
 
         self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False)
         self.net_3 = nn.Conv2d(512, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False)
@@ -546,9 +430,9 @@ def __init__(
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states = self.patch_embed(hidden_states)
         hidden_states = hidden_states + self.pos_embed
-        for blk in self.blocks:
-            hidden_states = blk(hidden_states)
-        hidden_states = self.neck(hidden_states.permute(0, 3, 1, 2))
+        for block in self.blocks:
+            hidden_states = block(hidden_states)
+        hidden_states = self.neck(hidden_states)
         hidden_states = self.net_2(hidden_states)
         hidden_states = self.net_3(hidden_states)
         return hidden_states
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index fdf6e07fe0cd..e1bd61a0719c 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -27,6 +27,7 @@
 from transformers.modeling_rope_utils import RopeParameters
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.qwen2.modeling_qwen2 import Qwen2Attention, Qwen2DecoderLayer, Qwen2Model, Qwen2PreTrainedModel
+from transformers.models.got_ocr2.modeling_got_ocr2 import GotOcr2VisionNeck, GotOcr2MLPBlock, GotOcr2LayerNorm, GotOcr2PatchEmbeddings, GotOcr2VisionAttention, GotOcr2VisionLayer
 from transformers.processing_utils import ProcessorMixin, TensorType
 from transformers.utils import (
     can_return_tuple,
@@ -55,11 +56,11 @@ class PPChart2TableVisionConfig(PreTrainedConfig):
             Dimensionality of the patch embedding layer in the vision encoder.
         hidden_size (`int`, *optional*, defaults to 1024):
             Dimensionality of the hidden layers in the vision Transformer encoder.
-        img_size (`int`, *optional*, defaults to 1024):
+        image_size (`int`, *optional*, defaults to 1024):
             The size (resolution) of input chart images (assumed to be square).
         mlp_ratio (`float`, *optional*, defaults to 4.0):
             Ratio of the dimensionality of the feed-forward layer to the hidden size in the vision Transformer blocks.
-        num_heads (`int`, *optional*, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each self-attention layer in the vision Transformer encoder.
         patch_size (`int`, *optional*, defaults to 16):
             The size (resolution) of each image patch extracted from the input chart image.
@@ -71,7 +72,7 @@ class PPChart2TableVisionConfig(PreTrainedConfig):
             List of layer indexes where global attention (instead of windowed attention) is applied in the vision encoder.
         window_size (`int`, *optional*, defaults to 14):
             The size of the attention window for windowed self-attention in the vision Transformer layers.
-        out_chans (`int`, *optional*, defaults to 256):
+        output_channels (`int`, *optional*, defaults to 256):
             Number of output channels from the convolutional stem layer before patch embedding.
 
     Example:
@@ -99,15 +100,17 @@ def __init__(
         depth: int = 12,
         embed_dim: int = 768,
         hidden_size: int = 1024,
-        img_size: int = 1024,
+        num_channels: int = 3,
+        image_size: int = 1024,
         mlp_ratio: float = 4.0,
-        num_heads: int = 12,
+        num_attention_heads: int = 12,
         patch_size: int = 16,
         qkv_bias: bool = True,
         use_rel_pos: bool = True,
         global_attn_indexes: Optional[list] = None,
         window_size: int = 14,
-        out_chans: int = 256,
+        output_channels: int = 256,
+        attention_dropout: float = 0.0,
         **kwargs,
     ):
         self.im_patch_token = im_patch_token
@@ -116,15 +119,17 @@ def __init__(
         self.depth = depth
         self.embed_dim = embed_dim
         self.hidden_size = hidden_size
-        self.img_size = img_size
+        self.image_size = image_size
+        self.num_channels = num_channels
         self.mlp_ratio = mlp_ratio
-        self.num_heads = num_heads
+        self.num_attention_heads = num_attention_heads
         self.patch_size = patch_size
         self.qkv_bias = qkv_bias
         self.use_rel_pos = use_rel_pos
         self.global_attn_indexes = global_attn_indexes if global_attn_indexes is not None else [2, 5, 8, 11]
         self.window_size = window_size
-        self.out_chans = out_chans
+        self.output_channels = output_channels
+        self.attention_dropout = attention_dropout
 
         super().__init__(**kwargs)
 
@@ -319,7 +324,7 @@ class PPChart2TableConfig(PreTrainedConfig):
     >>> configuration = PPChart2TableConfig()
 
     >>> # Initializing a PPChart2Table configuration with custom vision and text sub-configs
-    >>> vision_config = {"img_size": 512, "patch_size": 8}
+    >>> vision_config = {"image_size": 512, "patch_size": 8}
     >>> text_config = {"hidden_size": 2048, "num_hidden_layers": 16}
     >>> configuration = PPChart2TableConfig(vision_config=vision_config, text_config=text_config)
 
@@ -627,7 +632,7 @@ def __call__(
         else:
             image_inputs = {}
         img_cnt = len(image_inputs)
-        b, c, h, w = image_inputs["pixel_values"].shape
+        _, _, h, _ = image_inputs["pixel_values"].shape
         num_patches = h // self.image_processor.patch_size // self.image_processor.merge_size
         prompt = (
             "<|im_start|>system\n"
@@ -648,400 +653,53 @@ def postprocess(self, model_pred, **kwargs):
         )
 
 
-def window_partition(hidden_states: torch.Tensor, window_size: int) -> tuple[torch.Tensor, tuple[int, int]]:
-    r"""
-    Partition 2D feature maps into non-overlapping windows, with padding to ensure dimensions are divisible by window size.
-
-    Args:
-        hidden_states (`torch.Tensor`):
-            Input feature map with shape [B, H, W, C], where:
-            - B: batch size
-            - H: height of feature map
-            - W: width of feature map
-            - C: channel dimension
-        window_size (`int`):
-            Size of each non-overlapping window (square window).
-
-    Returns:
-        tuple[torch.Tensor, tuple[int, int]]:
-            - windows: Partitioned windows with shape [num_windows * B, window_size, window_size, C],
-              where num_windows = (Hp // window_size) * (Wp // window_size)
-            - (Hp, Wp): Padded height and width of the feature map (after padding)
-    """
-    B, H, W, C = hidden_states.shape
-
-    pad_h = (window_size - H % window_size) % window_size
-    pad_w = (window_size - W % window_size) % window_size
-    if pad_h > 0 or pad_w > 0:
-        hidden_states = F.pad(hidden_states, (0, 0, 0, pad_w, 0, pad_h))
-    Hp, Wp = H + pad_h, W + pad_w
-
-    hidden_states = hidden_states.reshape(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
-    windows = hidden_states.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size, window_size, C)
-    return windows, (Hp, Wp)
-
-
-def window_unpartition(
-    windows: torch.Tensor,
-    window_size: int,
-    pad_hw: tuple[int, int],
-    hw: tuple[int, int],
-) -> torch.Tensor:
-    r"""
-    Reverse operation of window_partition: merge windows back to original 2D feature map shape, removing padding.
-
-    Args:
-        windows (`torch.Tensor`):
-            Partitioned windows with shape [num_windows * B, window_size, window_size, C]
-        window_size (`int`):
-            Size of each non-overlapping window (must match window_partition's window_size)
-        pad_hw (`tuple[int, int]`):
-            Padded height and width (Hp, Wp) returned by window_partition
-        hw (`tuple[int, int]`):
-            Original height and width (H, W) of feature map before padding
-
-    Returns:
-        `torch.Tensor`:
-            Reconstructed feature map with shape [B, H, W, C] (original dimensions before padding)
-    """
-    Hp, Wp = pad_hw
-    H, W = hw
-    B = windows.shape[0] // (Hp * Wp // window_size // window_size)
-    hidden_states = windows.reshape(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
-    hidden_states = hidden_states.permute(0, 1, 3, 2, 4, 5).reshape(B, Hp, Wp, -1)
-    if Hp > H or Wp > W:
-        hidden_states = hidden_states[:, :H, :W, :]
-    return hidden_states
-
-
-def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
-    r"""
-    Get relative positional embeddings for query and key sequences, with interpolation for mismatched sizes.
-
-    Args:
-        q_size (`int`):
-            Spatial size (height/width) of query feature map
-        k_size (`int`):
-            Spatial size (height/width) of key feature map
-        rel_pos (`torch.Tensor`):
-            Precomputed relative positional embeddings with shape [max_rel_dist_original, dim]
-
-    Returns:
-        `torch.Tensor`:
-            Interpolated relative positional embeddings for the query-key pair, shape [q_size, k_size, dim]
-    """
-    max_rel_dist = int(2 * max(q_size, k_size) - 1)
-    if rel_pos.shape[0] != max_rel_dist:
-        rel_pos_resized = F.interpolate(
-            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
-            size=max_rel_dist,
-            mode="linear",
-        )
-        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
-    else:
-        rel_pos_resized = rel_pos
-
-    q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
-    k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
-    relative_coords = q_coords - k_coords + (k_size - 1) * max(q_size / k_size, 1.0)
-    return rel_pos_resized[relative_coords.long()]
-
-
-def add_decomposed_rel_pos(
-    attn: torch.Tensor,
-    q: torch.Tensor,
-    rel_pos_h: torch.Tensor,
-    rel_pos_w: torch.Tensor,
-    q_size: tuple[int, int],
-    k_size: tuple[int, int],
-) -> torch.Tensor:
-    r"""
-    Add decomposed relative positional embeddings (height and width separately) to attention scores.
-
-    Args:
-        attn (`torch.Tensor`):
-            Attention scores with shape [B, q_h*q_w, k_h*k_w]
-        q (`torch.Tensor`):
-            Query tensor with shape [B, q_h*q_w, dim]
-        rel_pos_h (`torch.Tensor`):
-            Precomputed relative positional embeddings for height dimension
-        rel_pos_w (`torch.Tensor`):
-            Precomputed relative positional embeddings for width dimension
-        q_size (`tuple[int, int]`):
-            Spatial size (q_h, q_w) of query feature map
-        k_size (`tuple[int, int]`):
-            Spatial size (k_h, k_w) of key feature map
-
-    Returns:
-        `torch.Tensor`:
-            Attention scores with added relative positional embeddings, shape [B, q_h*q_w, k_h*k_w]
-    """
-    q_h, q_w = q_size
-    k_h, k_w = k_size
-    Rh = get_rel_pos(q_h, k_h, rel_pos_h)
-    Rw = get_rel_pos(q_w, k_w, rel_pos_w)
-
-    B, _, dim = q.shape
-    r_q = q.reshape(B, q_h, q_w, dim)
-    rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
-    rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
-
-    attn = (attn.reshape(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]).reshape(
-        B, q_h * q_w, k_h * k_w
-    )
-
-    return attn
-
-
-class PPChart2TableVisionPatchEmbed(nn.Module):
-    r"""
-    Image to Patch Embedding layer for PP-Chart2Table vision encoder.
-
-    This module converts raw chart images (HWC format) into flattened patch embeddings via a 2D convolution,
-    followed by dimension permutation to align with the vision transformer's input format.
-
-    Args:
-        kernel_size (`tuple[int, int]`, *optional*, defaults to `(16, 16)`):
-            Size of the convolution kernel (patch size) for splitting images into patches.
-        stride (`tuple[int, int]`, *optional*, defaults to `(16, 16)`):
-            Stride of the convolution operation (matches patch size for non-overlapping patches).
-        padding (`tuple[int, int]`, *optional*, defaults to `(0, 0)`):
-            Padding applied to the input image before convolution (ensures patch alignment).
-        in_chans (`int`, *optional*, defaults to 3):
-            Number of input channels (3 for RGB chart images).
-        embed_dim (`int`, *optional*, defaults to 768):
-            Dimensionality of the output patch embeddings (hidden size of the vision transformer).
-
-    Shape:
-        - Input: `(B, C, H, W)` (batch size, channels, height, width)
-        - Output: `(B, H_out, W_out, C_out)` (batch size, patch height, patch width, embedding dim)
-    """
-
-    def __init__(
-        self,
-        kernel_size: tuple[int, int] = (16, 16),
-        stride: tuple[int, int] = (16, 16),
-        padding: tuple[int, int] = (0, 0),
-        in_chans: int = 3,
-        embed_dim: int = 768,
-    ) -> None:
+class PPChart2TableVisionPatchEmbed(GotOcr2PatchEmbeddings):
+    def __init__(self, config):
         super().__init__()
-        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding)
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        hidden_states = self.proj(hidden_states)
-        hidden_states = hidden_states.permute(0, 2, 3, 1)
-        return hidden_states
+        num_channels, hidden_size = config.num_channels, config.embed_dim
 
 
-class PPChart2TableVisionMLPBlock(nn.Module):
-    r"""
-    Multi-Layer Perceptron (MLP) block for PP-Chart2Table vision transformer layers.
-
-    Implements a two-layer feed-forward network with activation function, used in the vision transformer's
-    decoder layers to project features to a higher dimension and back.
-
-    Args:
-        embedding_dim (`int`):
-            Dimensionality of the input/output embeddings (hidden size of the transformer layer).
-        mlp_dim (`int`):
-            Dimensionality of the intermediate (hidden) layer in the MLP (typically 4x embedding_dim).
-        act (`Type[nn.Module]`, *optional*, defaults to `torch.nn.GELU`):
-            Non-linear activation function to apply between the two linear layers.
-
-    Shape:
-        - Input: `(B, H, W, embedding_dim)` or `(B, N, embedding_dim)` (N = H*W)
-        - Output: Same shape as input
-    """
-
-    def __init__(
-        self,
-        embedding_dim: int,
-        mlp_dim: int,
-        act: type[nn.Module] = torch.nn.GELU,
-    ) -> None:
+class PPChart2TableVisionMLPBlock(GotOcr2MLPBlock):
+    def __init__(self, config) -> None:
         super().__init__()
-        self.lin1 = nn.Linear(embedding_dim, mlp_dim)
-        self.lin2 = nn.Linear(mlp_dim, embedding_dim)
-        self.act = act()
+        self.lin1 = nn.Linear(config.embed_dim, int(config.embed_dim * config.mlp_ratio))
+        self.lin2 = nn.Linear(int(config.embed_dim * config.mlp_ratio), config.embed_dim)
 
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        return self.lin2(self.act(self.lin1(hidden_states)))
 
+class PPChart2TableVisionLayerNorm(GotOcr2LayerNorm):
+    pass
 
-class PPChart2TableVisionLayerNorm2d(nn.Module):
-    r"""
-    2D Layer Normalization for spatial feature maps (adapted for PP-Chart2Table vision encoder).
 
-    Applies layer normalization over the channel dimension of 2D feature maps, with learnable scale/bias parameters
-    broadcasted across spatial dimensions (height/width).
+class PPChart2TableVisionAttention(GotOcr2VisionAttention):
+    """Multi-head Attention block with relative position embeddings."""
 
-    Args:
-        num_channels (`int`):
-            Number of channels in the input feature map (embedding dimension).
-        epsilon (`float`, *optional*, defaults to `1e-06`):
-            Small value added to variance to avoid division by zero.
-
-    Shape:
-        - Input: `(B, C, H, W)` (batch size, channels, height, width)
-        - Output: Same shape as input
-    """
-
-    def __init__(self, num_channels: int, epsilon: float = 1e-06) -> None:
+    def __init__(self, config, window_size):
         super().__init__()
-        self.weight = nn.Parameter(torch.ones(num_channels))
-        self.bias = nn.Parameter(torch.zeros(num_channels))
-        self.epsilon = epsilon
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        u = hidden_states.mean(dim=1, keepdim=True)
-        s = (hidden_states - u).pow(2).mean(dim=1, keepdim=True)
-        hidden_states = (hidden_states - u) / torch.sqrt(s + self.epsilon)
-        hidden_states = self.weight[:, None, None] * hidden_states + self.bias[:, None, None]
-        return hidden_states
-
+        head_dim = config.embed_dim // config.num_attention_heads
+        self.qkv = nn.Linear(config.embed_dim, config.embed_dim * 3, bias=config.qkv_bias)
+        self.proj = nn.Linear(config.embed_dim, config.embed_dim)
 
-class PPChart2TableVisionAttention(nn.Module):
-    r"""
-    Multi-Head Self-Attention (MHSA) layer for PP-Chart2Table vision encoder, with optional relative positional encoding.
-
-    Implements standard multi-head attention with query/key/value projection, scaled dot-product attention,
-    and optional decomposed relative positional embeddings (height/width separate) for spatial awareness.
 
-    Args:
-        dim (`int`):
-            Dimensionality of the input embeddings (hidden size of the transformer layer).
-        num_heads (`int`, *optional*, defaults to 8):
-            Number of attention heads (must divide `dim` evenly).
-        qkv_bias (`bool`, *optional*, defaults to `True`):
-            Whether to add bias terms to the query/key/value projection layers.
-        use_rel_pos (`bool`, *optional*, defaults to `False`):
-            Whether to use relative positional encoding for spatial attention.
-        rel_pos_zero_init (`bool`, *optional*, defaults to `True`):
-            Whether to initialize relative positional embeddings to zero (stable training).
-        input_size (`Tuple[int, int]`, *optional*):
-            Spatial size (H, W) of the input feature map (required if `use_rel_pos=True`).
-
-    Shape:
-        - Input: `(B, H, W, dim)` (batch size, height, width, embedding dim)
-        - Output: Same shape as input
-    """
-
-    def __init__(
-        self,
-        dim: int,
-        num_heads: int = 8,
-        qkv_bias: bool = True,
-        use_rel_pos: bool = False,
-        rel_pos_zero_init: bool = True,
-        input_size: Optional[tuple[int, int]] = None,
-    ) -> None:
+class PPChart2TableVisionDecoderLayer(GotOcr2VisionLayer):
+    def __init__(self, config, window_size) -> None:
         super().__init__()
-        self.num_heads = num_heads
-        head_dim = dim // num_heads
-        self.scale = head_dim**-0.5
-
-        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
-        self.proj = nn.Linear(dim, dim)
-
-        self.use_rel_pos = use_rel_pos
-        if self.use_rel_pos:
-            assert input_size is not None, "Input size must be provided if using relative positional encoding."
-            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
-            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
+        self.layer_norm1 = nn.LayerNorm(config.embed_dim)
+        self.attn = PPChart2TableVisionAttention(config, window_size=window_size)
 
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        B, H, W, _ = hidden_states.shape
-        qkv = self.qkv(hidden_states).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
-        q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(dim=0)
-        attn = (q * self.scale) @ k.transpose(1, 2)
-
-        if self.use_rel_pos:
-            attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))
-
-        attn = F.softmax(attn, dim=-1)
-        hidden_states = (attn @ v).reshape(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
-        hidden_states = self.proj(hidden_states)
-        return hidden_states
-
-
-class PPChart2TableVisionDecoderLayer(nn.Module):
-    r"""
-    Single decoder layer of the PP-Chart2Table vision transformer, with optional windowed attention.
-
-    Implements the standard transformer decoder layer structure:
-    Layer Norm → Multi-Head Attention (with residual) → Layer Norm → MLP (with residual)
-    Supports windowed attention (SW-MHA) for large feature maps to reduce computation.
+        self.layer_norm2 = nn.LayerNorm(config.embed_dim)
+        self.mlp = PPChart2TableVisionMLPBlock(config)
+        self.window_size = window_size
 
-    Args:
-        dim (`int`):
-            Dimensionality of the input embeddings (hidden size of the transformer layer).
-        num_heads (`int`):
-            Number of attention heads (passed to MHSA layer).
-        mlp_ratio (`float`, *optional*, defaults to 4.0):
-            Ratio of MLP hidden dimension to embedding dimension (mlp_dim = dim * mlp_ratio).
-        qkv_bias (`bool`, *optional*, defaults to `True`):
-            Whether to use bias in Q/K/V projection (passed to MHSA layer).
-        norm_layer (`Type[nn.Module]`, *optional*, defaults to `nn.LayerNorm`):
-            Normalization layer to use (LayerNorm for flattened patches, LayerNorm2d for 2D feature maps).
-        act_layer (`Type[nn.Module]`, *optional*, defaults to `nn.GELU`):
-            Activation function for MLP block.
-        use_rel_pos (`bool`, *optional*, defaults to `False`):
-            Whether to use relative positional encoding (passed to MHSA layer).
-        rel_pos_zero_init (`bool`, *optional*, defaults to `True`):
-            Whether to zero-initialize relative positional embeddings (passed to MHSA layer).
-        window_size (`int`, *optional*, defaults to 0):
-            Size of attention windows (0 = full attention, >0 = windowed attention).
-        input_size (`Tuple[int, int]`, *optional*):
-            Spatial size of input feature map (passed to MHSA layer for relative positional encoding).
-
-    Shape:
-        - Input: `(B, H, W, dim)` (batch size, height, width, embedding dim)
-        - Output: Same shape as input
-    """
 
-    def __init__(
-        self,
-        dim: int,
-        num_heads: int,
-        mlp_ratio: float = 4.0,
-        qkv_bias: bool = True,
-        norm_layer: type[nn.Module] = nn.LayerNorm,
-        act_layer: type[nn.Module] = nn.GELU,
-        use_rel_pos: bool = False,
-        rel_pos_zero_init: bool = True,
-        window_size: int = 0,
-        input_size: Optional[tuple[int, int]] = None,
-    ) -> None:
+class PPChart2TableVisionNeck(GotOcr2VisionNeck):
+    def __init__(self, config: PPChart2TableVisionConfig):
         super().__init__()
-        self.norm1 = norm_layer(dim)
-        self.attn = PPChart2TableVisionAttention(
-            dim,
-            num_heads=num_heads,
-            qkv_bias=qkv_bias,
-            use_rel_pos=use_rel_pos,
-            rel_pos_zero_init=rel_pos_zero_init,
-            input_size=input_size if window_size == 0 else (window_size, window_size),
-        )
+        self.config = config
 
-        self.norm2 = norm_layer(dim)
-        self.mlp = PPChart2TableVisionMLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer)
-
-        self.window_size = window_size
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        shortcut = hidden_states
-        hidden_states = self.norm1(hidden_states)
-        if self.window_size > 0:
-            H, W = hidden_states.shape[1], hidden_states.shape[2]
-            hidden_states, pad_hw = window_partition(hidden_states, self.window_size)
-        hidden_states = self.attn(hidden_states)
-
-        if self.window_size > 0:
-            hidden_states = window_unpartition(hidden_states, self.window_size, pad_hw, (H, W))
-        hidden_states = shortcut + hidden_states
-        hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
-        return hidden_states
+        self.conv1 = nn.Conv2d(config.embed_dim, config.output_channels, kernel_size=1, bias=False)
+        self.layer_norm1 = PPChart2TableVisionLayerNorm(config.output_channels, data_format="channels_first")
+        self.conv2 = nn.Conv2d(config.output_channels, config.output_channels, kernel_size=3, padding=1, bias=False)
+        self.layer_norm2 = PPChart2TableVisionLayerNorm(config.output_channels, data_format="channels_first")
 
 
 class PPChart2TableVisionPreTrainedModel(PreTrainedModel):
@@ -1099,60 +757,27 @@ class PPChart2TableVisionModel(PPChart2TableVisionPreTrainedModel):
     def __init__(
         self,
         config: PPChart2TableVisionConfig,
-        in_chans: int = 3,
-        norm_layer: type[nn.Module] = nn.LayerNorm,
-        act_layer: type[nn.Module] = nn.GELU,
-        rel_pos_zero_init: bool = True,
     ) -> None:
         super().__init__(config)
-        self.img_size = config.img_size
+        self.image_size = config.image_size
 
-        self.patch_embed = PPChart2TableVisionPatchEmbed(
-            kernel_size=(config.patch_size, config.patch_size),
-            stride=(config.patch_size, config.patch_size),
-            in_chans=in_chans,
-            embed_dim=config.embed_dim,
-        )
+        self.patch_embed = PPChart2TableVisionPatchEmbed(config)
 
         self.pos_embed = nn.Parameter(
             torch.zeros(
-                1, config.img_size // config.patch_size, config.img_size // config.patch_size, config.embed_dim
+                1, config.image_size // config.patch_size, config.image_size // config.patch_size, config.embed_dim
             )
         )
 
         self.blocks = nn.ModuleList()
         for i in range(config.depth):
             block = PPChart2TableVisionDecoderLayer(
-                dim=config.embed_dim,
-                num_heads=config.num_heads,
-                mlp_ratio=config.mlp_ratio,
-                qkv_bias=config.qkv_bias,
-                norm_layer=norm_layer,
-                act_layer=act_layer,
-                use_rel_pos=config.use_rel_pos,
-                rel_pos_zero_init=rel_pos_zero_init,
+                config,
                 window_size=config.window_size if i not in config.global_attn_indexes else 0,
-                input_size=(config.img_size // config.patch_size, config.img_size // config.patch_size),
             )
             self.blocks.append(block)
 
-        self.neck = nn.Sequential(
-            nn.Conv2d(
-                config.embed_dim,
-                config.out_chans,
-                kernel_size=1,
-                bias=False,
-            ),
-            PPChart2TableVisionLayerNorm2d(config.out_chans),
-            nn.Conv2d(
-                config.out_chans,
-                config.out_chans,
-                kernel_size=3,
-                padding=1,
-                bias=False,
-            ),
-            PPChart2TableVisionLayerNorm2d(config.out_chans),
-        )
+        self.neck = PPChart2TableVisionNeck(config)
 
         self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False)
         self.net_3 = nn.Conv2d(512, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False)
@@ -1162,9 +787,9 @@ def __init__(
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states = self.patch_embed(hidden_states)
         hidden_states = hidden_states + self.pos_embed
-        for blk in self.blocks:
-            hidden_states = blk(hidden_states)
-        hidden_states = self.neck(hidden_states.permute(0, 3, 1, 2))
+        for block in self.blocks:
+            hidden_states = block(hidden_states)
+        hidden_states = self.neck(hidden_states)
         hidden_states = self.net_2(hidden_states)
         hidden_states = self.net_3(hidden_states)
         return hidden_states
diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
index 7d27beef3dec..a2281c9d5f3e 100644
--- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
@@ -41,7 +41,7 @@ def __call__(
         else:
             image_inputs = {}
         img_cnt = len(image_inputs)
-        b, c, h, w = image_inputs["pixel_values"].shape
+        _, _, h, _ = image_inputs["pixel_values"].shape
         num_patches = h // self.image_processor.patch_size // self.image_processor.merge_size
         prompt = (
             "<|im_start|>system\n"

From 5e3f1d3b3b40dac4d7b7f39d95bd8a5983a43e79 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Wed, 25 Feb 2026 15:33:57 +0800
Subject: [PATCH 04/60] update

---
 .../configuration_pp_chart2table.py           |  17 +-
 .../pp_chart2table/modeling_pp_chart2table.py | 164 ++++++++--------
 .../pp_chart2table/modular_pp_chart2table.py  | 181 +++++++++---------
 .../test_modeling_pp_chart2table.py           |  14 +-
 4 files changed, 191 insertions(+), 185 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
index ee4ed2128161..710221f51058 100644
--- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
@@ -85,6 +85,7 @@ def __init__(
         global_attn_indexes: Optional[list] = None,
         window_size: int = 14,
         output_channels: int = 256,
+        net_channels: int = 512,
         attention_dropout: float = 0.0,
         **kwargs,
     ):
@@ -104,6 +105,7 @@ def __init__(
         self.global_attn_indexes = global_attn_indexes if global_attn_indexes is not None else [2, 5, 8, 11]
         self.window_size = window_size
         self.output_channels = output_channels
+        self.net_channels = net_channels
         self.attention_dropout = attention_dropout
 
         super().__init__(**kwargs)
@@ -315,16 +317,24 @@ class PPChart2TableConfig(PreTrainedConfig):
     """
 
     model_type = "pp_chart2table"
+    attribute_map = {
+        "image_token_id": "image_token_index",
+    }
     sub_configs = {"vision_config": PPChart2TableVisionConfig, "text_config": PPChart2TableTextConfig}
 
     def __init__(
         self,
         vision_config: dict | None = None,
         text_config: dict | None = None,
-        im_start_token: int = 151857,
-        im_patch_token: int = 151859,
+        image_token_index: Optional[int] = 151859,
+        image_seq_length: Optional[int] = 576,
+        pad_token_id: Optional[int] = -1,
         **kwargs,
     ):
+        self.image_token_index = image_token_index
+        self.image_seq_length = image_seq_length
+        self.pad_token_id = pad_token_id
+
         if vision_config is None:
             vision_config = {}
         self.vision_config = PPChart2TableVisionConfig(**vision_config)
@@ -335,9 +345,6 @@ def __init__(
 
         self.model_type = "pp_chart2table"
 
-        self.im_start_token = im_start_token
-        self.im_patch_token = im_patch_token
-
         text_config_keys = [
             "attention_dropout",
             "bos_token_id",
diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
index 4b66b6b04afa..5f7604493339 100644
--- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
@@ -422,8 +422,10 @@ def __init__(
 
         self.neck = PPChart2TableVisionNeck(config)
 
-        self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False)
-        self.net_3 = nn.Conv2d(512, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False)
+        self.net_2 = nn.Conv2d(
+            config.output_channels, config.net_channels, kernel_size=3, stride=2, padding=1, bias=False
+        )
+        self.net_3 = nn.Conv2d(config.net_channels, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False)
 
         self.post_init()
 
@@ -857,6 +859,7 @@ class PPChart2TableModelOutputWithPast(ModelOutput):
     last_hidden_state: Optional[torch.FloatTensor] = None
     hidden_states: Optional[tuple[torch.FloatTensor]] = None
     attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[torch.FloatTensor] = None
 
 
 @dataclass
@@ -885,6 +888,7 @@ class PPChart2TableCausalLMOutputWithPast(ModelOutput):
     last_hidden_state: Optional[torch.FloatTensor] = None
     hidden_states: Optional[tuple[torch.FloatTensor]] = None
     attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[torch.FloatTensor] = None
 
 
 class PPChart2TablePreTrainedModel(PreTrainedModel):
@@ -994,7 +998,7 @@ def set_input_embeddings(self, value):
 
     def get_image_features(
         self,
-        images: Optional[torch.Tensor],
+        pixel_values: torch.FloatTensor,
     ) -> list[torch.Tensor]:
         r"""
         Extract and project chart image features to text embedding space.
@@ -1008,10 +1012,10 @@ def get_image_features(
                 List of projected image features (one per image), each with shape `[1, num_patches, text_hidden_size]`.
         """
         image_features = []
-        for image in images:
-            image = image.unsqueeze(0)
+        for pixel_value in pixel_values:
+            pixel_value = pixel_value.unsqueeze(0)
             with torch.no_grad():
-                cnn_feature = self.vision_tower_high(image)
+                cnn_feature = self.vision_tower_high(pixel_value)
                 cnn_feature = cnn_feature.flatten(2).transpose(2, 1)
             image_feature = self.mm_projector_vary(cnn_feature)
             image_features.append(image_feature)
@@ -1021,51 +1025,27 @@ def get_image_features(
         return image_features
 
     def get_placeholder_mask(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        image_features: Optional[torch.FloatTensor] = None,
-    ) -> torch.BoolTensor:
-        r"""
-        Generate mask to locate image placeholder tokens in input embeddings.
-
-        This mask identifies the `<imgpad>` tokens in the input sequence, which will be replaced with
-        projected image features for multimodal fusion.
-
-        Args:
-            input_ids (`torch.LongTensor`, optional):
-                Tokenized input text (used if `inputs_embeds` is None).
-            inputs_embeds (`torch.FloatTensor`, optional):
-                Precomputed input embeddings (used if `input_ids` is None).
-            image_features (`torch.FloatTensor`):
-                Projected image features (used to validate token-feature count match).
-
-        Returns:
-            `torch.BoolTensor`:
-                Boolean mask (shape: `[B, seq_len, text_hidden_size]`) where `True` indicates image placeholder tokens.
-
-        Raises:
-            ValueError: If the number of image tokens does not match the number of image features.
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
         """
         if input_ids is None:
-            start_token_embed = self.get_input_embeddings()(
-                torch.tensor(self.config.im_start_token, dtype=torch.long, device=inputs_embeds.device)
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
             )
-            special_image_mask = inputs_embeds == start_token_embed
             special_image_mask = special_image_mask.all(-1)
         else:
-            special_image_mask = input_ids == self.config.im_patch_token
+            special_image_mask = input_ids == self.config.image_token_id
 
         n_image_tokens = special_image_mask.sum()
-
-        n_image_features = image_features.numel() // image_features.shape[-1]
-        if n_image_tokens != n_image_features:
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        n_image_features = image_features.shape[0] * image_features.shape[1]
+        if inputs_embeds[special_image_mask].numel() != image_features.numel():
             raise ValueError(
                 f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
             )
-
-        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
-
         return special_image_mask
 
     @can_return_tuple
@@ -1081,20 +1061,23 @@ def forward(
         cache_position: Optional[torch.LongTensor] = None,
         **kwargs,
     ):
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
         if inputs_embeds is None:
             inputs_embeds = self.language_model.embed_tokens(input_ids)
 
         if pixel_values is not None:
-            image_features = self.get_image_features(pixel_values)
-            image_mask = self.get_placeholder_mask(
+            image_features = self.get_image_features(pixel_values=pixel_values.to(inputs_embeds.dtype))
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            special_image_mask = self.get_placeholder_mask(
                 input_ids, inputs_embeds=inputs_embeds, image_features=image_features
             )
-            inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_features)
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
 
         outputs = self.language_model(
-            input_ids=None,
-            position_ids=position_ids,
             attention_mask=attention_mask,
+            position_ids=position_ids,
             past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
@@ -1102,15 +1085,14 @@ def forward(
             **kwargs,
         )
 
-        output = PPChart2TableModelOutputWithPast(
+        return PPChart2TableModelOutputWithPast(
             last_hidden_state=outputs.last_hidden_state,
             past_key_values=outputs.past_key_values,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
         )
 
-        return output
-
 
 class PPChart2TableForConditionalGeneration(PPChart2TablePreTrainedModel, GenerationMixin):
     r"""
@@ -1148,7 +1130,6 @@ def __init__(self, config: PPChart2TableConfig):
         super().__init__(config)
         self.model = PPChart2TableModel(config)
         self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
-
         self.post_init()
 
     def get_input_embeddings(self):
@@ -1157,43 +1138,22 @@ def get_input_embeddings(self):
     def set_input_embeddings(self, value):
         self.model.set_input_embeddings(value)
 
-    def prepare_inputs_for_generation(
+    def get_output_embeddings(self) -> nn.Module:
+        return self.lm_head
+
+    def get_image_features(
         self,
-        input_ids,
-        past_key_values=None,
-        attention_mask=None,
-        inputs_embeds=None,
-        cache_position=None,
-        position_ids=None,
-        use_cache=True,
-        pixel_values=None,
-        pixel_values_videos=None,
-        image_grid_thw=None,
-        video_grid_thw=None,
-        is_first_iteration=False,
+        pixel_values: torch.FloatTensor,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
         **kwargs,
     ):
-        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
-
-        model_inputs = super().prepare_inputs_for_generation(
-            input_ids,
-            past_key_values=past_key_values,
-            attention_mask=attention_mask,
-            inputs_embeds=inputs_embeds,
-            cache_position=cache_position,
-            position_ids=position_ids,
+        return self.model.get_image_features(
             pixel_values=pixel_values,
-            pixel_values_videos=pixel_values_videos,
-            image_grid_thw=image_grid_thw,
-            video_grid_thw=video_grid_thw,
-            use_cache=use_cache,
-            is_first_iteration=is_first_iteration,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
             **kwargs,
         )
-        if not is_first_iteration and use_cache:
-            model_inputs["pixel_values"] = None
-
-        return model_inputs
 
     @can_return_tuple
     def forward(
@@ -1219,18 +1179,18 @@ def forward(
             use_cache=use_cache,
             past_key_values=past_key_values,
             cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
             **kwargs,
         )
         hidden_states = outputs.last_hidden_state
-        logits = self.lm_head(hidden_states)
 
         slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
         logits = self.lm_head(hidden_states[:, slice_indices, :])
 
+        loss = None
         if labels is not None:
-            raise ValueError(
-                "The PPChart2TableForConditionalGeneration model only supports inference, and training is not allowed!\n"
-                "If you need to train this model, please implement the corresponding loss calculation logic, or use the inference-only mode (do not pass the `labels` parameter)."
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
             )
 
         return PPChart2TableCausalLMOutputWithPast(
@@ -1241,6 +1201,40 @@ def forward(
             attentions=outputs.attentions,
         )
 
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        pixel_values=None,
+        attention_mask=None,
+        cache_position=None,
+        logits_to_keep=None,
+        is_first_iteration=False,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
+            **kwargs,
+        )
+
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
+            model_inputs["pixel_values"] = pixel_values
+
+        return model_inputs
+
 
 __all__ = [
     "PPChart2TableForConditionalGeneration",
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index e1bd61a0719c..72fd45bbde0d 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -110,6 +110,7 @@ def __init__(
         global_attn_indexes: Optional[list] = None,
         window_size: int = 14,
         output_channels: int = 256,
+        net_channels: int = 512,
         attention_dropout: float = 0.0,
         **kwargs,
     ):
@@ -129,6 +130,7 @@ def __init__(
         self.global_attn_indexes = global_attn_indexes if global_attn_indexes is not None else [2, 5, 8, 11]
         self.window_size = window_size
         self.output_channels = output_channels
+        self.net_channels = net_channels
         self.attention_dropout = attention_dropout
 
         super().__init__(**kwargs)
@@ -340,16 +342,24 @@ class PPChart2TableConfig(PreTrainedConfig):
     """
 
     model_type = "pp_chart2table"
+    attribute_map = {
+        "image_token_id": "image_token_index",
+    }
     sub_configs = {"vision_config": PPChart2TableVisionConfig, "text_config": PPChart2TableTextConfig}
 
     def __init__(
         self,
         vision_config: dict | None = None,
         text_config: dict | None = None,
-        im_start_token: int = 151857,
-        im_patch_token: int = 151859,
+        image_token_index: Optional[int] = 151859,
+        image_seq_length: Optional[int] = 576,
+        pad_token_id: Optional[int] = -1,
         **kwargs,
     ):
+        self.image_token_index = image_token_index
+        self.image_seq_length = image_seq_length
+        self.pad_token_id = pad_token_id
+
         if vision_config is None:
             vision_config = {}
         self.vision_config = PPChart2TableVisionConfig(**vision_config)
@@ -360,9 +370,6 @@ def __init__(
 
         self.model_type = "pp_chart2table"
 
-        self.im_start_token = im_start_token
-        self.im_patch_token = im_patch_token
-
         text_config_keys = [
             "attention_dropout",
             "bos_token_id",
@@ -779,8 +786,8 @@ def __init__(
 
         self.neck = PPChart2TableVisionNeck(config)
 
-        self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False)
-        self.net_3 = nn.Conv2d(512, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False)
+        self.net_2 = nn.Conv2d(config.output_channels, config.net_channels, kernel_size=3, stride=2, padding=1, bias=False)
+        self.net_3 = nn.Conv2d(config.net_channels, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False)
 
         self.post_init()
 
@@ -832,6 +839,7 @@ class PPChart2TableModelOutputWithPast(ModelOutput):
     last_hidden_state: Optional[torch.FloatTensor] = None
     hidden_states: Optional[tuple[torch.FloatTensor]] = None
     attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[torch.FloatTensor] = None
 
 
 @dataclass
@@ -860,6 +868,7 @@ class PPChart2TableCausalLMOutputWithPast(ModelOutput):
     last_hidden_state: Optional[torch.FloatTensor] = None
     hidden_states: Optional[tuple[torch.FloatTensor]] = None
     attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[torch.FloatTensor] = None
 
 
 class PPChart2TablePreTrainedModel(PreTrainedModel):
@@ -969,7 +978,7 @@ def set_input_embeddings(self, value):
 
     def get_image_features(
         self,
-        images: Optional[torch.Tensor],
+        pixel_values: torch.FloatTensor,
     ) -> list[torch.Tensor]:
         r"""
         Extract and project chart image features to text embedding space.
@@ -983,10 +992,10 @@ def get_image_features(
                 List of projected image features (one per image), each with shape `[1, num_patches, text_hidden_size]`.
         """
         image_features = []
-        for image in images:
-            image = image.unsqueeze(0)
+        for pixel_value in pixel_values:
+            pixel_value = pixel_value.unsqueeze(0)
             with torch.no_grad():
-                cnn_feature = self.vision_tower_high(image)
+                cnn_feature = self.vision_tower_high(pixel_value)
                 cnn_feature = cnn_feature.flatten(2).transpose(2, 1)
             image_feature = self.mm_projector_vary(cnn_feature)
             image_features.append(image_feature)
@@ -996,51 +1005,27 @@ def get_image_features(
         return image_features
 
     def get_placeholder_mask(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        image_features: Optional[torch.FloatTensor] = None,
-    ) -> torch.BoolTensor:
-        r"""
-        Generate mask to locate image placeholder tokens in input embeddings.
-
-        This mask identifies the `<imgpad>` tokens in the input sequence, which will be replaced with
-        projected image features for multimodal fusion.
-
-        Args:
-            input_ids (`torch.LongTensor`, optional):
-                Tokenized input text (used if `inputs_embeds` is None).
-            inputs_embeds (`torch.FloatTensor`, optional):
-                Precomputed input embeddings (used if `input_ids` is None).
-            image_features (`torch.FloatTensor`):
-                Projected image features (used to validate token-feature count match).
-
-        Returns:
-            `torch.BoolTensor`:
-                Boolean mask (shape: `[B, seq_len, text_hidden_size]`) where `True` indicates image placeholder tokens.
-
-        Raises:
-            ValueError: If the number of image tokens does not match the number of image features.
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
         """
         if input_ids is None:
-            start_token_embed = self.get_input_embeddings()(
-                torch.tensor(self.config.im_start_token, dtype=torch.long, device=inputs_embeds.device)
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
             )
-            special_image_mask = inputs_embeds == start_token_embed
             special_image_mask = special_image_mask.all(-1)
         else:
-            special_image_mask = input_ids == self.config.im_patch_token
+            special_image_mask = input_ids == self.config.image_token_id
 
         n_image_tokens = special_image_mask.sum()
-
-        n_image_features = image_features.numel() // image_features.shape[-1]
-        if n_image_tokens != n_image_features:
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        n_image_features = image_features.shape[0] * image_features.shape[1]
+        if inputs_embeds[special_image_mask].numel() != image_features.numel():
             raise ValueError(
                 f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
             )
-
-        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
-
         return special_image_mask
 
     @can_return_tuple
@@ -1056,20 +1041,23 @@ def forward(
         cache_position: Optional[torch.LongTensor] = None,
         **kwargs,
     ):
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
         if inputs_embeds is None:
             inputs_embeds = self.language_model.embed_tokens(input_ids)
 
         if pixel_values is not None:
-            image_features = self.get_image_features(pixel_values)
-            image_mask = self.get_placeholder_mask(
+            image_features = self.get_image_features(pixel_values=pixel_values.to(inputs_embeds.dtype))
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            special_image_mask = self.get_placeholder_mask(
                 input_ids, inputs_embeds=inputs_embeds, image_features=image_features
             )
-            inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_features)
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
 
         outputs = self.language_model(
-            input_ids=None,
-            position_ids=position_ids,
             attention_mask=attention_mask,
+            position_ids=position_ids,
             past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
@@ -1077,15 +1065,14 @@ def forward(
             **kwargs,
         )
 
-        output = PPChart2TableModelOutputWithPast(
+        return PPChart2TableModelOutputWithPast(
             last_hidden_state=outputs.last_hidden_state,
             past_key_values=outputs.past_key_values,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
         )
 
-        return output
-
 
 class PPChart2TableForConditionalGeneration(PPChart2TablePreTrainedModel, GenerationMixin):
     r"""
@@ -1123,7 +1110,6 @@ def __init__(self, config: PPChart2TableConfig):
         super().__init__(config)
         self.model = PPChart2TableModel(config)
         self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
-
         self.post_init()
 
     def get_input_embeddings(self):
@@ -1131,44 +1117,23 @@ def get_input_embeddings(self):
 
     def set_input_embeddings(self, value):
         self.model.set_input_embeddings(value)
-
-    def prepare_inputs_for_generation(
+    
+    def get_output_embeddings(self) -> nn.Module:
+        return self.lm_head
+    
+    def get_image_features(
         self,
-        input_ids,
-        past_key_values=None,
-        attention_mask=None,
-        inputs_embeds=None,
-        cache_position=None,
-        position_ids=None,
-        use_cache=True,
-        pixel_values=None,
-        pixel_values_videos=None,
-        image_grid_thw=None,
-        video_grid_thw=None,
-        is_first_iteration=False,
+        pixel_values: torch.FloatTensor,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
         **kwargs,
     ):
-        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
-
-        model_inputs = super().prepare_inputs_for_generation(
-            input_ids,
-            past_key_values=past_key_values,
-            attention_mask=attention_mask,
-            inputs_embeds=inputs_embeds,
-            cache_position=cache_position,
-            position_ids=position_ids,
+        return self.model.get_image_features(
             pixel_values=pixel_values,
-            pixel_values_videos=pixel_values_videos,
-            image_grid_thw=image_grid_thw,
-            video_grid_thw=video_grid_thw,
-            use_cache=use_cache,
-            is_first_iteration=is_first_iteration,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
             **kwargs,
         )
-        if not is_first_iteration and use_cache:
-            model_inputs["pixel_values"] = None
-
-        return model_inputs
 
     @can_return_tuple
     def forward(
@@ -1194,18 +1159,18 @@ def forward(
             use_cache=use_cache,
             past_key_values=past_key_values,
             cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
             **kwargs,
         )
         hidden_states = outputs.last_hidden_state
-        logits = self.lm_head(hidden_states)
 
         slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
         logits = self.lm_head(hidden_states[:, slice_indices, :])
 
+        loss = None
         if labels is not None:
-            raise ValueError(
-                "The PPChart2TableForConditionalGeneration model only supports inference, and training is not allowed!\n"
-                "If you need to train this model, please implement the corresponding loss calculation logic, or use the inference-only mode (do not pass the `labels` parameter)."
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
             )
 
         return PPChart2TableCausalLMOutputWithPast(
@@ -1216,6 +1181,40 @@ def forward(
             attentions=outputs.attentions,
         )
 
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        pixel_values=None,
+        attention_mask=None,
+        cache_position=None,
+        logits_to_keep=None,
+        is_first_iteration=False,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
+            **kwargs,
+        )
+
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
+            model_inputs["pixel_values"] = pixel_values
+
+        return model_inputs
+
 
 __all__ = [
     "PPChart2TableForConditionalGeneration",
diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
index 1143253791fa..d76f23d92900 100644
--- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
@@ -49,7 +49,7 @@ class PPChart2TableVisionText2TextModelTester:
     def __init__(
         self,
         parent,
-        batch_size=7,
+        batch_size=1,
         seq_length=31,
         num_channels=3,
         image_height=64,
@@ -73,16 +73,20 @@ def __init__(
             "depth": 2,
             "embed_dim": 768,
             "hidden_size": 144,
-            "img_size": 64,
+            "hidden_act": "gelu",
+            "image_size": 64,
+            "num_channels": 3,
             "mlp_ratio": 4.0,
             "norm_layer_eps": 1e-6,
-            "num_heads": 4,
+            "num_attention_heads": 4,
             "patch_size": 16,
             "qkv_bias": True,
             "use_rel_pos": True,
             "global_attn_indexes": [2, 5, 8, 11],
             "window_size": 14,
-            "out_chans": 256,
+            "output_channels": 256,
+            "net_channels": 512,
+            "attention_dropout": 0.0
         },
         bos_token_id=151643,
         eos_token_id=151643,
@@ -168,7 +172,9 @@ def prepare_config_and_inputs_for_common(self):
 class PPChart2TableModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (PPChart2TableForConditionalGeneration,) if is_torch_available() else ()
     pipeline_model_mapping = {"image-text-to-text": PPChart2TableForConditionalGeneration}
+    
     _is_composite = True
+    test_resize_embeddings = False
 
     def setUp(self):
         self.model_tester = PPChart2TableVisionText2TextModelTester(self)

From 2c064ccdf0caa3aa10b4ccb17e4a97ef822a17bf Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Wed, 25 Feb 2026 15:39:39 +0800
Subject: [PATCH 05/60] update

---
 docs/source/en/model_doc/pp_chart2table.md | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md
index 7ba3692f7f49..ad00bec20f84 100644
--- a/docs/source/en/model_doc/pp_chart2table.md
+++ b/docs/source/en/model_doc/pp_chart2table.md
@@ -38,18 +38,20 @@ The example below demonstrates how to classify image with PP-Chart2Table using [
 <hfoption id="Pipeline">
 
 ```py
-from transformers import pipeline
+import requests
 from PIL import Image
+from transformers import pipeline
 model_path = "PaddlePaddle/PP-Chart2Table_safetensors"
 pipe = pipeline("image-text-to-text", model=model_path)
 image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw)
 result = pipe(
-    images=image,
-    text="",
-    do_sample=False,
+    images=image, 
+    text="", 
+    do_sample=False, 
     max_new_tokens=256
 )
 print(result)
+
 ```
 
 </hfoption>
@@ -85,6 +87,7 @@ Here is how you can do it with PP-Chart2Table using [`Pipeline`] or the [`AutoMo
 <hfoption id="Pipeline">
 
 ```py
+import requests
 from transformers import pipeline
 from PIL import Image
 model_path = "PaddlePaddle/PP-Chart2Table_safetensors"

From 81514c0b1fbfbe2221f17a835dfe19f8e2aafe12 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Thu, 26 Feb 2026 15:01:52 +0800
Subject: [PATCH 06/60] update

---
 .../configuration_pp_chart2table.py           | 128 +++++++------
 .../pp_chart2table/modeling_pp_chart2table.py |   7 +-
 .../pp_chart2table/modular_pp_chart2table.py  | 172 +++++++++++-------
 .../test_modeling_pp_chart2table.py           |   4 +-
 4 files changed, 184 insertions(+), 127 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
index 710221f51058..9de2e1da70b1 100644
--- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
@@ -21,34 +21,61 @@ class PPChart2TableVisionConfig(PreTrainedConfig):
     documentation from [`PreTrainedConfig`] for more information.
 
     Args:
-        im_patch_token (`int`, *optional*, defaults to 151859):
-            The token ID used to represent individual image patches in the multimodal input sequence.
-        im_start_token (`int`, *optional*, defaults to 151857):
-            The token ID representing the start of an image token sequence in the multimodal input.
-        depth (`int`, *optional*, defaults to 12):
-            Number of hidden layers in the vision Transformer encoder.
-        embed_dim (`int`, *optional*, defaults to 768):
-            Dimensionality of the patch embedding layer in the vision encoder.
-        hidden_size (`int`, *optional*, defaults to 1024):
-            Dimensionality of the hidden layers in the vision Transformer encoder.
-        image_size (`int`, *optional*, defaults to 1024):
-            The size (resolution) of input chart images (assumed to be square).
-        mlp_ratio (`float`, *optional*, defaults to 4.0):
-            Ratio of the dimensionality of the feed-forward layer to the hidden size in the vision Transformer blocks.
-        num_attention_heads (`int`, *optional*, defaults to 12):
-            Number of attention heads for each self-attention layer in the vision Transformer encoder.
-        patch_size (`int`, *optional*, defaults to 16):
-            The size (resolution) of each image patch extracted from the input chart image.
-        qkv_bias (`bool`, *optional*, defaults to `True`):
-            Whether to add bias terms to the query, key, and value projection layers in the self-attention mechanism.
-        use_rel_pos (`bool`, *optional*, defaults to `True`):
-            Whether to use relative positional embeddings in the self-attention layers of the vision encoder.
-        global_attn_indexes (`list`, *optional*, defaults to `[2, 5, 8, 11]`):
-            List of layer indexes where global attention (instead of windowed attention) is applied in the vision encoder.
-        window_size (`int`, *optional*, defaults to 14):
-            The size of the attention window for windowed self-attention in the vision Transformer layers.
-        output_channels (`int`, *optional*, defaults to 256):
-            Number of output channels from the convolutional stem layer before patch embedding.
+            depth (`int`, *optional*, defaults to 12):
+                Number of hidden layers in the vision Transformer encoder.
+            embed_dim (`int`, *optional*, defaults to 768):
+                Dimensionality of the patch embedding layer in the vision encoder.
+            hidden_size (`int`, *optional*, defaults to 1024):
+                Dimensionality of the hidden layers in the vision Transformer encoder.
+            num_channels (`int`, *optional*, defaults to 3):
+                Number of input channels for the convolutional stem layer (default: RGB images with 3 channels).
+            image_size (`int`, *optional*, defaults to 1024):
+                The size (resolution) of input chart images (assumed to be square).
+            mlp_ratio (`float`, *optional*, defaults to 4.0):
+                Ratio of the dimensionality of the feed-forward layer to the hidden size in the vision Transformer blocks.
+            num_attention_heads (`int`, *optional*, defaults to 12):
+                Number of attention heads for each self-attention layer in the vision Transformer encoder.
+            patch_size (`int`, *optional*, defaults to 16):
+                The size (resolution) of each image patch extracted from the input chart image.
+            qkv_bias (`bool`, *optional*, defaults to `True`):
+                Whether to add bias terms to the query, key, and value projection layers in the self-attention mechanism.
+            use_rel_pos (`bool`, *optional*, defaults to `True`):
+                Whether to use relative positional embeddings in the self-attention layers of the vision encoder.
+            global_attn_indexes (`Optional[List[int]]`, *optional*, defaults to `None`):
+                List of layer indexes where global attention (instead of windowed attention) is applied in the vision encoder.
+                If `None`, defaults to `[2, 5, 8, 11]`.
+            window_size (`int`, *optional*, defaults to 14):
+                The size of the attention window for windowed self-attention in the vision Transformer layers.
+            output_channels (`int`, *optional*, defaults to 256):
+                Number of output channels from the convolutional stem layer before patch embedding.
+            net_channels (`int`, *optional*, defaults to 512):
+                Number of channels in the intermediate convolutional layers.
+            attention_dropout (`float`, *optional*, defaults to 0.0):
+                Dropout probability for the attention layers in the vision Transformer.
+            output_hidden_states (`bool`, *optional*, defaults to `False`): <fill_docstring>
+            output_attentions (`bool`, *optional*, defaults to `False`): <fill_docstring>
+            return_dict (`bool`, *optional*, defaults to `True`): <fill_docstring>
+            dtype (`Union`, *optional*): <fill_docstring>
+            tie_word_embeddings (`bool`, *optional*, defaults to `True`): <fill_docstring>
+            chunk_size_feed_forward (`int`, *optional*, defaults to 0): <fill_docstring>
+            is_encoder_decoder (`bool`, *optional*, defaults to `False`): <fill_docstring>
+            is_decoder (`bool`, *optional*, defaults to `False`): <fill_docstring>
+            cross_attention_hidden_size (`Optional`, *optional*): <fill_docstring>
+            add_cross_attention (`bool`, *optional*, defaults to `False`): <fill_docstring>
+            architectures (`Optional`, *optional*): <fill_docstring>
+            finetuning_task (`Optional`, *optional*): <fill_docstring>
+            id2label (`Optional`, *optional*): <fill_docstring>
+            label2id (`Optional`, *optional*): <fill_docstring>
+            num_labels (`Optional`, *optional*): <fill_docstring>
+            task_specific_params (`Optional`, *optional*): <fill_docstring>
+            problem_type (`Optional`, *optional*): <fill_docstring>
+            tokenizer_class (`Optional`, *optional*): <fill_docstring>
+            prefix (`Optional`, *optional*): <fill_docstring>
+            bos_token_id (`Optional`, *optional*): <fill_docstring>
+            pad_token_id (`Optional`, *optional*): <fill_docstring>
+            eos_token_id (`Optional`, *optional*): <fill_docstring>
+            sep_token_id (`Optional`, *optional*): <fill_docstring>
+            decoder_start_token_id (`Optional`, *optional*): <fill_docstring>
 
     Example:
 
@@ -68,10 +95,8 @@ class PPChart2TableVisionConfig(PreTrainedConfig):
     model_type = "pp_chart2table_vision"
     base_config_key = "vision_config"
 
-    def __init__(
+    def init(
         self,
-        im_patch_token: int = 151859,
-        im_start_token: int = 151857,
         depth: int = 12,
         embed_dim: int = 768,
         hidden_size: int = 1024,
@@ -82,16 +107,13 @@ def __init__(
         patch_size: int = 16,
         qkv_bias: bool = True,
         use_rel_pos: bool = True,
-        global_attn_indexes: Optional[list] = None,
+        global_attn_indexes: Optional[list[int]] = None,
         window_size: int = 14,
         output_channels: int = 256,
         net_channels: int = 512,
         attention_dropout: float = 0.0,
         **kwargs,
     ):
-        self.im_patch_token = im_patch_token
-        self.im_start_token = im_start_token
-
         self.depth = depth
         self.embed_dim = embed_dim
         self.hidden_size = hidden_size
@@ -107,8 +129,7 @@ def __init__(
         self.output_channels = output_channels
         self.net_channels = net_channels
         self.attention_dropout = attention_dropout
-
-        super().__init__(**kwargs)
+        super().init(**kwargs)
 
 
 class PPChart2TableTextConfig(PreTrainedConfig):
@@ -267,30 +288,25 @@ def __init__(
 
 class PPChart2TableConfig(PreTrainedConfig):
     r"""
-    This is the main configuration class to store the configuration of a [`PPChart2TableModel`] or [`PPChart2TableForConditionalGeneration`].
+    This is the main configuration class to store the configuration of a [PPChart2TableModel] or [PPChart2TableForConditionalGeneration].
     It is used to instantiate a PP-Chart2Table multimodal model according to the specified arguments, defining the vision and text
-    sub-model architectures. This configuration class inherits from [`PreTrainedConfig`] and combines the configurations of:
-    - [`PPChart2TableVisionConfig`] (for the chart vision encoder)
-    - [`PPChart2TableTextConfig`] (for the table text decoder)
-    PP-Chart2Table [PaddlePaddle/PP-Chart2Table_safetensors](https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors).
+    sub-model architectures. This configuration class inherits from [PreTrainedConfig] and combines the configurations of:
+    [PPChart2TableVisionConfig] (for the chart vision encoder)
+    [PPChart2TableTextConfig] (for the table text decoder)
+    PP-Chart2Table PaddlePaddle/PP-Chart2Table_safetensors.
 
-    Instantiating a `PPChart2TableConfig` with the defaults will yield a similar configuration to the base PP-Chart2Table model
+    Instantiating a PPChart2TableConfig with the defaults will yield a similar configuration to the base PP-Chart2Table model
     developed by the PaddlePaddle team for chart-to-table parsing tasks.
 
-    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PreTrainedConfig`] for more information.
+    Configuration objects inherit from [PreTrainedConfig] and can be used to control the model outputs. Read the
+    documentation from [PreTrainedConfig] for more information.
 
     Args:
-        vision_config (`dict`, *optional*):
-            Dictionary of configuration options used to initialize [`PPChart2TableVisionConfig`]. If `None`, the default
-            `PPChart2TableVisionConfig` configuration will be used.
-        text_config (`dict`, *optional*):
-            Dictionary of configuration options used to initialize [`PPChart2TableTextConfig`]. If `None`, the default
-            `PPChart2TableTextConfig` configuration will be used.
-        im_start_token (`int`, *optional*, defaults to 151857):
-            The token ID representing the start of an image token sequence in the multimodal input (shared across vision/text sub-configs).
-        im_patch_token (`int`, *optional*, defaults to 151859):
-            The token ID used to represent individual image patches in the multimodal input sequence (shared across vision/text sub-configs).
+            vision_config (Optional[Dict], optional, defaults to None, *optional*):
+            text_config (Optional[Dict], optional, defaults to None, *optional*):
+            image_token_index (Optional[int], optional, defaults to 151859, *optional*, defaults to 151859):
+            image_seq_length (Optional[int], optional, defaults to 576, *optional*, defaults to 576):
+            pad_token_id (Optional[int], optional, defaults to -1, *optional*, defaults to -1):
 
     Example:
 
@@ -343,8 +359,6 @@ def __init__(
             text_config = {}
         self.text_config = PPChart2TableTextConfig(**text_config)
 
-        self.model_type = "pp_chart2table"
-
         text_config_keys = [
             "attention_dropout",
             "bos_token_id",
diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
index 5f7604493339..1c9790b4d1a2 100644
--- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
@@ -18,6 +18,7 @@
 from transformers.modeling_outputs import ModelOutput
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import can_return_tuple
+from transformers.utils.generic import check_model_inputs
 
 from ...activations import ACT2FN
 from ...cache_utils import DynamicCache
@@ -30,7 +31,7 @@
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring
-from ...utils.generic import check_model_inputs, maybe_autocast
+from ...utils.generic import maybe_autocast
 from .configuration_pp_chart2table import PPChart2TableConfig, PPChart2TableTextConfig, PPChart2TableVisionConfig
 
 
@@ -52,7 +53,6 @@ def __init__(self, config):
         self.patch_size = patch_size
         self.num_channels = num_channels
         self.num_patches = num_patches
-
         self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
 
     def forward(self, pixel_values):
@@ -429,7 +429,7 @@ def __init__(
 
         self.post_init()
 
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+    def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
         hidden_states = self.patch_embed(hidden_states)
         hidden_states = hidden_states + self.pos_embed
         for block in self.blocks:
@@ -1194,6 +1194,7 @@ def forward(
             )
 
         return PPChart2TableCausalLMOutputWithPast(
+            loss=loss,
             logits=logits,
             last_hidden_state=outputs.last_hidden_state,
             past_key_values=outputs.past_key_values,
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index 72fd45bbde0d..bef1baa5c556 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -1,9 +1,9 @@
+import collections
 from dataclasses import dataclass
-from typing import Optional, Union
+from typing import Optional, Union, List
 
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 from torchvision.transforms.v2.functional import InterpolationMode
 
 from transformers.cache_utils import Cache
@@ -26,8 +26,20 @@
 from transformers.modeling_outputs import ModelOutput
 from transformers.modeling_rope_utils import RopeParameters
 from transformers.modeling_utils import PreTrainedModel
-from transformers.models.qwen2.modeling_qwen2 import Qwen2Attention, Qwen2DecoderLayer, Qwen2Model, Qwen2PreTrainedModel
-from transformers.models.got_ocr2.modeling_got_ocr2 import GotOcr2VisionNeck, GotOcr2MLPBlock, GotOcr2LayerNorm, GotOcr2PatchEmbeddings, GotOcr2VisionAttention, GotOcr2VisionLayer
+from transformers.models.got_ocr2.modeling_got_ocr2 import (
+    GotOcr2LayerNorm,
+    GotOcr2MLPBlock,
+    GotOcr2PatchEmbeddings,
+    GotOcr2VisionAttention,
+    GotOcr2VisionLayer,
+    GotOcr2VisionNeck,
+)
+from transformers.models.qwen2.modeling_qwen2 import (
+    Qwen2Attention,
+    Qwen2DecoderLayer,
+    Qwen2Model,
+    Qwen2PreTrainedModel,
+)
 from transformers.processing_utils import ProcessorMixin, TensorType
 from transformers.utils import (
     can_return_tuple,
@@ -46,34 +58,61 @@ class PPChart2TableVisionConfig(PreTrainedConfig):
     documentation from [`PreTrainedConfig`] for more information.
 
     Args:
-        im_patch_token (`int`, *optional*, defaults to 151859):
-            The token ID used to represent individual image patches in the multimodal input sequence.
-        im_start_token (`int`, *optional*, defaults to 151857):
-            The token ID representing the start of an image token sequence in the multimodal input.
-        depth (`int`, *optional*, defaults to 12):
-            Number of hidden layers in the vision Transformer encoder.
-        embed_dim (`int`, *optional*, defaults to 768):
-            Dimensionality of the patch embedding layer in the vision encoder.
-        hidden_size (`int`, *optional*, defaults to 1024):
-            Dimensionality of the hidden layers in the vision Transformer encoder.
-        image_size (`int`, *optional*, defaults to 1024):
-            The size (resolution) of input chart images (assumed to be square).
-        mlp_ratio (`float`, *optional*, defaults to 4.0):
-            Ratio of the dimensionality of the feed-forward layer to the hidden size in the vision Transformer blocks.
-        num_attention_heads (`int`, *optional*, defaults to 12):
-            Number of attention heads for each self-attention layer in the vision Transformer encoder.
-        patch_size (`int`, *optional*, defaults to 16):
-            The size (resolution) of each image patch extracted from the input chart image.
-        qkv_bias (`bool`, *optional*, defaults to `True`):
-            Whether to add bias terms to the query, key, and value projection layers in the self-attention mechanism.
-        use_rel_pos (`bool`, *optional*, defaults to `True`):
-            Whether to use relative positional embeddings in the self-attention layers of the vision encoder.
-        global_attn_indexes (`list`, *optional*, defaults to `[2, 5, 8, 11]`):
-            List of layer indexes where global attention (instead of windowed attention) is applied in the vision encoder.
-        window_size (`int`, *optional*, defaults to 14):
-            The size of the attention window for windowed self-attention in the vision Transformer layers.
-        output_channels (`int`, *optional*, defaults to 256):
-            Number of output channels from the convolutional stem layer before patch embedding.
+            depth (`int`, *optional*, defaults to 12):
+                Number of hidden layers in the vision Transformer encoder.
+            embed_dim (`int`, *optional*, defaults to 768):
+                Dimensionality of the patch embedding layer in the vision encoder.
+            hidden_size (`int`, *optional*, defaults to 1024):
+                Dimensionality of the hidden layers in the vision Transformer encoder.
+            num_channels (`int`, *optional*, defaults to 3):
+                Number of input channels for the convolutional stem layer (default: RGB images with 3 channels).
+            image_size (`int`, *optional*, defaults to 1024):
+                The size (resolution) of input chart images (assumed to be square).
+            mlp_ratio (`float`, *optional*, defaults to 4.0):
+                Ratio of the dimensionality of the feed-forward layer to the hidden size in the vision Transformer blocks.
+            num_attention_heads (`int`, *optional*, defaults to 12):
+                Number of attention heads for each self-attention layer in the vision Transformer encoder.
+            patch_size (`int`, *optional*, defaults to 16):
+                The size (resolution) of each image patch extracted from the input chart image.
+            qkv_bias (`bool`, *optional*, defaults to `True`):
+                Whether to add bias terms to the query, key, and value projection layers in the self-attention mechanism.
+            use_rel_pos (`bool`, *optional*, defaults to `True`):
+                Whether to use relative positional embeddings in the self-attention layers of the vision encoder.
+            global_attn_indexes (`Optional[List[int]]`, *optional*, defaults to `None`):
+                List of layer indexes where global attention (instead of windowed attention) is applied in the vision encoder.
+                If `None`, defaults to `[2, 5, 8, 11]`.
+            window_size (`int`, *optional*, defaults to 14):
+                The size of the attention window for windowed self-attention in the vision Transformer layers.
+            output_channels (`int`, *optional*, defaults to 256):
+                Number of output channels from the convolutional stem layer before patch embedding.
+            net_channels (`int`, *optional*, defaults to 512):
+                Number of channels in the intermediate convolutional layers.
+            attention_dropout (`float`, *optional*, defaults to 0.0):
+                Dropout probability for the attention layers in the vision Transformer.
+            output_hidden_states (`bool`, *optional*, defaults to `False`): <fill_docstring>
+            output_attentions (`bool`, *optional*, defaults to `False`): <fill_docstring>
+            return_dict (`bool`, *optional*, defaults to `True`): <fill_docstring>
+            dtype (`Union`, *optional*): <fill_docstring>
+            tie_word_embeddings (`bool`, *optional*, defaults to `True`): <fill_docstring>
+            chunk_size_feed_forward (`int`, *optional*, defaults to 0): <fill_docstring>
+            is_encoder_decoder (`bool`, *optional*, defaults to `False`): <fill_docstring>
+            is_decoder (`bool`, *optional*, defaults to `False`): <fill_docstring>
+            cross_attention_hidden_size (`Optional`, *optional*): <fill_docstring>
+            add_cross_attention (`bool`, *optional*, defaults to `False`): <fill_docstring>
+            architectures (`Optional`, *optional*): <fill_docstring>
+            finetuning_task (`Optional`, *optional*): <fill_docstring>
+            id2label (`Optional`, *optional*): <fill_docstring>
+            label2id (`Optional`, *optional*): <fill_docstring>
+            num_labels (`Optional`, *optional*): <fill_docstring>
+            task_specific_params (`Optional`, *optional*): <fill_docstring>
+            problem_type (`Optional`, *optional*): <fill_docstring>
+            tokenizer_class (`Optional`, *optional*): <fill_docstring>
+            prefix (`Optional`, *optional*): <fill_docstring>
+            bos_token_id (`Optional`, *optional*): <fill_docstring>
+            pad_token_id (`Optional`, *optional*): <fill_docstring>
+            eos_token_id (`Optional`, *optional*): <fill_docstring>
+            sep_token_id (`Optional`, *optional*): <fill_docstring>
+            decoder_start_token_id (`Optional`, *optional*): <fill_docstring>
 
     Example:
 
@@ -93,10 +132,8 @@ class PPChart2TableVisionConfig(PreTrainedConfig):
     model_type = "pp_chart2table_vision"
     base_config_key = "vision_config"
 
-    def __init__(
+    def init(
         self,
-        im_patch_token: int = 151859,
-        im_start_token: int = 151857,
         depth: int = 12,
         embed_dim: int = 768,
         hidden_size: int = 1024,
@@ -107,16 +144,13 @@ def __init__(
         patch_size: int = 16,
         qkv_bias: bool = True,
         use_rel_pos: bool = True,
-        global_attn_indexes: Optional[list] = None,
+        global_attn_indexes: Optional[List[int]] = None,
         window_size: int = 14,
         output_channels: int = 256,
         net_channels: int = 512,
         attention_dropout: float = 0.0,
         **kwargs,
     ):
-        self.im_patch_token = im_patch_token
-        self.im_start_token = im_start_token
-
         self.depth = depth
         self.embed_dim = embed_dim
         self.hidden_size = hidden_size
@@ -132,8 +166,7 @@ def __init__(
         self.output_channels = output_channels
         self.net_channels = net_channels
         self.attention_dropout = attention_dropout
-
-        super().__init__(**kwargs)
+        super().init(**kwargs)
 
 
 class PPChart2TableTextConfig(PreTrainedConfig):
@@ -292,30 +325,25 @@ def __init__(
 
 class PPChart2TableConfig(PreTrainedConfig):
     r"""
-    This is the main configuration class to store the configuration of a [`PPChart2TableModel`] or [`PPChart2TableForConditionalGeneration`].
+    This is the main configuration class to store the configuration of a [PPChart2TableModel] or [PPChart2TableForConditionalGeneration].
     It is used to instantiate a PP-Chart2Table multimodal model according to the specified arguments, defining the vision and text
-    sub-model architectures. This configuration class inherits from [`PreTrainedConfig`] and combines the configurations of:
-    - [`PPChart2TableVisionConfig`] (for the chart vision encoder)
-    - [`PPChart2TableTextConfig`] (for the table text decoder)
-    PP-Chart2Table [PaddlePaddle/PP-Chart2Table_safetensors](https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors).
+    sub-model architectures. This configuration class inherits from [PreTrainedConfig] and combines the configurations of:
+    [PPChart2TableVisionConfig] (for the chart vision encoder)
+    [PPChart2TableTextConfig] (for the table text decoder)
+    PP-Chart2Table PaddlePaddle/PP-Chart2Table_safetensors.
 
-    Instantiating a `PPChart2TableConfig` with the defaults will yield a similar configuration to the base PP-Chart2Table model
+    Instantiating a PPChart2TableConfig with the defaults will yield a similar configuration to the base PP-Chart2Table model
     developed by the PaddlePaddle team for chart-to-table parsing tasks.
 
-    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PreTrainedConfig`] for more information.
+    Configuration objects inherit from [PreTrainedConfig] and can be used to control the model outputs. Read the
+    documentation from [PreTrainedConfig] for more information.
 
     Args:
-        vision_config (`dict`, *optional*):
-            Dictionary of configuration options used to initialize [`PPChart2TableVisionConfig`]. If `None`, the default
-            `PPChart2TableVisionConfig` configuration will be used.
-        text_config (`dict`, *optional*):
-            Dictionary of configuration options used to initialize [`PPChart2TableTextConfig`]. If `None`, the default
-            `PPChart2TableTextConfig` configuration will be used.
-        im_start_token (`int`, *optional*, defaults to 151857):
-            The token ID representing the start of an image token sequence in the multimodal input (shared across vision/text sub-configs).
-        im_patch_token (`int`, *optional*, defaults to 151859):
-            The token ID used to represent individual image patches in the multimodal input sequence (shared across vision/text sub-configs).
+            vision_config (Optional[Dict], optional, defaults to None, *optional*):
+            text_config (Optional[Dict], optional, defaults to None, *optional*):
+            image_token_index (Optional[int], optional, defaults to 151859, *optional*, defaults to 151859):
+            image_seq_length (Optional[int], optional, defaults to 576, *optional*, defaults to 576):
+            pad_token_id (Optional[int], optional, defaults to -1, *optional*, defaults to -1):
 
     Example:
 
@@ -368,7 +396,6 @@ def __init__(
             text_config = {}
         self.text_config = PPChart2TableTextConfig(**text_config)
 
-        self.model_type = "pp_chart2table"
 
         text_config_keys = [
             "attention_dropout",
@@ -622,6 +649,7 @@ class PPChart2TableProcessor(ProcessorMixin):
         chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
             in a chat into a tokenizable string.
     """
+
     image_processor_class = "AutoImageProcessor"
     tokenizer_class = "AutoTokenizer"
 
@@ -663,7 +691,16 @@ def postprocess(self, model_pred, **kwargs):
 class PPChart2TableVisionPatchEmbed(GotOcr2PatchEmbeddings):
     def __init__(self, config):
         super().__init__()
+        image_size, patch_size = config.image_size, config.patch_size
         num_channels, hidden_size = config.num_channels, config.embed_dim
+        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.num_patches = num_patches
+        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
 
 
 class PPChart2TableVisionMLPBlock(GotOcr2MLPBlock):
@@ -683,6 +720,7 @@ class PPChart2TableVisionAttention(GotOcr2VisionAttention):
     def __init__(self, config, window_size):
         super().__init__()
         head_dim = config.embed_dim // config.num_attention_heads
+        self.scale = head_dim**-0.5
         self.qkv = nn.Linear(config.embed_dim, config.embed_dim * 3, bias=config.qkv_bias)
         self.proj = nn.Linear(config.embed_dim, config.embed_dim)
 
@@ -757,7 +795,6 @@ class PPChart2TableVisionPreTrainedModel(PreTrainedModel):
 
 
 class PPChart2TableVisionModel(PPChart2TableVisionPreTrainedModel):
-
     main_input_name = "pixel_values"
     input_modalities = "image"
 
@@ -786,12 +823,14 @@ def __init__(
 
         self.neck = PPChart2TableVisionNeck(config)
 
-        self.net_2 = nn.Conv2d(config.output_channels, config.net_channels, kernel_size=3, stride=2, padding=1, bias=False)
+        self.net_2 = nn.Conv2d(
+            config.output_channels, config.net_channels, kernel_size=3, stride=2, padding=1, bias=False
+        )
         self.net_3 = nn.Conv2d(config.net_channels, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False)
 
         self.post_init()
 
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+    def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
         hidden_states = self.patch_embed(hidden_states)
         hidden_states = hidden_states + self.pos_embed
         for block in self.blocks:
@@ -801,6 +840,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states = self.net_3(hidden_states)
         return hidden_states
 
+
 class PPChart2TableTextAttention(Qwen2Attention):
     pass
 
@@ -812,6 +852,7 @@ class PPChart2TableTextDecoderLayer(Qwen2DecoderLayer):
 class PPChart2TableTextPreTrainedModel(Qwen2PreTrainedModel):
     pass
 
+
 class PPChart2TableTextModel(Qwen2Model):
     pass
 
@@ -1117,10 +1158,10 @@ def get_input_embeddings(self):
 
     def set_input_embeddings(self, value):
         self.model.set_input_embeddings(value)
-    
+
     def get_output_embeddings(self) -> nn.Module:
         return self.lm_head
-    
+
     def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
@@ -1174,6 +1215,7 @@ def forward(
             )
 
         return PPChart2TableCausalLMOutputWithPast(
+            loss=loss,
             logits=logits,
             last_hidden_state=outputs.last_hidden_state,
             past_key_values=outputs.past_key_values,
diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
index d76f23d92900..99d30d76c81d 100644
--- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
@@ -86,7 +86,7 @@ def __init__(
             "window_size": 14,
             "output_channels": 256,
             "net_channels": 512,
-            "attention_dropout": 0.0
+            "attention_dropout": 0.0,
         },
         bos_token_id=151643,
         eos_token_id=151643,
@@ -172,7 +172,7 @@ def prepare_config_and_inputs_for_common(self):
 class PPChart2TableModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (PPChart2TableForConditionalGeneration,) if is_torch_available() else ()
     pipeline_model_mapping = {"image-text-to-text": PPChart2TableForConditionalGeneration}
-    
+
     _is_composite = True
     test_resize_embeddings = False
 

From fc7c75fe76916f9a61e1226ba02523b18e2a9b27 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Fri, 27 Feb 2026 11:45:46 +0800
Subject: [PATCH 07/60] update

---
 .../configuration_pp_chart2table.py           | 116 ++++++-----------
 .../pp_chart2table/modular_pp_chart2table.py  | 121 ++++++------------
 2 files changed, 82 insertions(+), 155 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
index 9de2e1da70b1..7fa3b6d36af8 100644
--- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
@@ -11,85 +11,49 @@
 
 
 class PPChart2TableVisionConfig(PreTrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`PPChart2TableVisionModel`]. It is used to instantiate a
-    PP-Chart2Table vision encoder according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the vision encoder of the PP-Chart2Table
-    architecture developed by the PaddlePaddle team for chart-to-table parsing tasks.
+    """
+    Configuration class for the vision backbone of PP-Chart2Table model.
 
-    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PreTrainedConfig`] for more information.
+    This configuration class defines all the hyperparameters for the vision component
+    of the PP-Chart2Table model, which is responsible for processing chart images
+    and extracting visual features for table structure recognition and content extraction.
+    PaddlePaddle/PP-Chart2Table_safetensors [PaddlePaddle/PP-Chart2Table_safetensors]
+    (https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors)
 
     Args:
-            depth (`int`, *optional*, defaults to 12):
-                Number of hidden layers in the vision Transformer encoder.
-            embed_dim (`int`, *optional*, defaults to 768):
-                Dimensionality of the patch embedding layer in the vision encoder.
-            hidden_size (`int`, *optional*, defaults to 1024):
-                Dimensionality of the hidden layers in the vision Transformer encoder.
-            num_channels (`int`, *optional*, defaults to 3):
-                Number of input channels for the convolutional stem layer (default: RGB images with 3 channels).
-            image_size (`int`, *optional*, defaults to 1024):
-                The size (resolution) of input chart images (assumed to be square).
-            mlp_ratio (`float`, *optional*, defaults to 4.0):
-                Ratio of the dimensionality of the feed-forward layer to the hidden size in the vision Transformer blocks.
-            num_attention_heads (`int`, *optional*, defaults to 12):
-                Number of attention heads for each self-attention layer in the vision Transformer encoder.
-            patch_size (`int`, *optional*, defaults to 16):
-                The size (resolution) of each image patch extracted from the input chart image.
-            qkv_bias (`bool`, *optional*, defaults to `True`):
-                Whether to add bias terms to the query, key, and value projection layers in the self-attention mechanism.
-            use_rel_pos (`bool`, *optional*, defaults to `True`):
-                Whether to use relative positional embeddings in the self-attention layers of the vision encoder.
-            global_attn_indexes (`Optional[List[int]]`, *optional*, defaults to `None`):
-                List of layer indexes where global attention (instead of windowed attention) is applied in the vision encoder.
-                If `None`, defaults to `[2, 5, 8, 11]`.
-            window_size (`int`, *optional*, defaults to 14):
-                The size of the attention window for windowed self-attention in the vision Transformer layers.
-            output_channels (`int`, *optional*, defaults to 256):
-                Number of output channels from the convolutional stem layer before patch embedding.
-            net_channels (`int`, *optional*, defaults to 512):
-                Number of channels in the intermediate convolutional layers.
-            attention_dropout (`float`, *optional*, defaults to 0.0):
-                Dropout probability for the attention layers in the vision Transformer.
-            output_hidden_states (`bool`, *optional*, defaults to `False`): <fill_docstring>
-            output_attentions (`bool`, *optional*, defaults to `False`): <fill_docstring>
-            return_dict (`bool`, *optional*, defaults to `True`): <fill_docstring>
-            dtype (`Union`, *optional*): <fill_docstring>
-            tie_word_embeddings (`bool`, *optional*, defaults to `True`): <fill_docstring>
-            chunk_size_feed_forward (`int`, *optional*, defaults to 0): <fill_docstring>
-            is_encoder_decoder (`bool`, *optional*, defaults to `False`): <fill_docstring>
-            is_decoder (`bool`, *optional*, defaults to `False`): <fill_docstring>
-            cross_attention_hidden_size (`Optional`, *optional*): <fill_docstring>
-            add_cross_attention (`bool`, *optional*, defaults to `False`): <fill_docstring>
-            architectures (`Optional`, *optional*): <fill_docstring>
-            finetuning_task (`Optional`, *optional*): <fill_docstring>
-            id2label (`Optional`, *optional*): <fill_docstring>
-            label2id (`Optional`, *optional*): <fill_docstring>
-            num_labels (`Optional`, *optional*): <fill_docstring>
-            task_specific_params (`Optional`, *optional*): <fill_docstring>
-            problem_type (`Optional`, *optional*): <fill_docstring>
-            tokenizer_class (`Optional`, *optional*): <fill_docstring>
-            prefix (`Optional`, *optional*): <fill_docstring>
-            bos_token_id (`Optional`, *optional*): <fill_docstring>
-            pad_token_id (`Optional`, *optional*): <fill_docstring>
-            eos_token_id (`Optional`, *optional*): <fill_docstring>
-            sep_token_id (`Optional`, *optional*): <fill_docstring>
-            decoder_start_token_id (`Optional`, *optional*): <fill_docstring>
-
-    Example:
-
-    ```python
-    >>> from transformers import PPChart2TableVisionConfig, PPChart2TableVisionModel
-
-    >>> # Initializing a PPChart2TableVisionConfig with default PP-Chart2Table style configuration
-    >>> configuration = PPChart2TableVisionConfig()
-
-    >>> # Initializing a PPChart2TableVisionModel (with random weights) from the PP-Chart2Table style configuration
-    >>> model = PPChart2TableVisionModel(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
+        depth (`int`, *optional*, defaults to 12):
+            Number of transformer encoder layers in the vision backbone.
+        embed_dim (`int`, *optional*, defaults to 768):
+            Dimensionality of the patch embedding vectors.
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the hidden layer in the feed-forward network (MLP).
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of input channels (3 for RGB images, 1 for grayscale).
+        image_size (`int`, *optional*, defaults to 1024):
+            Size (height/width) of the input images (assumed to be square).
+        mlp_ratio (`float`, *optional*, defaults to 4.0):
+            Ratio of the hidden layer size to the embedding dimension in the MLP (hidden_size = embed_dim * mlp_ratio).
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each transformer encoder layer.
+        patch_size (`int`, *optional*, defaults to 16):
+            Size (height/width) of the image patches extracted from the input image.
+        qkv_bias (`bool`, *optional*, defaults to True):
+            Whether to include bias terms in the query, key, value projection layers of self-attention.
+        use_rel_pos (`bool`, *optional*, defaults to True):
+            Whether to use relative positional embeddings in the self-attention mechanism.
+        global_attn_indexes (`Optional[list[int]]`, *optional*, defaults to [2, 5, 8, 11]):
+            List of layer indexes where global attention (instead of window attention) is applied.
+            If `None`, defaults to [2, 5, 8, 11].
+        window_size (`int`, *optional*, defaults to 14):
+            Size of the attention window for window-based self-attention (only effective when use_rel_pos=True).
+        output_channels (`int`, *optional*, defaults to 256):
+            Dimensionality of the final visual feature output channels.
+        net_channels (`int`, *optional*, defaults to 512):
+            Dimensionality of intermediate network channels in the vision backbone.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability applied to the attention weights.
+        **kwargs:
+            Additional keyword arguments passed to the parent `PreTrainedConfig` class.
     """
 
     model_type = "pp_chart2table_vision"
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index bef1baa5c556..87701c5ded9f 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -1,6 +1,6 @@
 import collections
 from dataclasses import dataclass
-from typing import Optional, Union, List
+from typing import Optional, Union
 
 import torch
 import torch.nn as nn
@@ -48,85 +48,49 @@
 
 
 class PPChart2TableVisionConfig(PreTrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`PPChart2TableVisionModel`]. It is used to instantiate a
-    PP-Chart2Table vision encoder according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the vision encoder of the PP-Chart2Table
-    architecture developed by the PaddlePaddle team for chart-to-table parsing tasks.
+    """
+    Configuration class for the vision backbone of PP-Chart2Table model.
 
-    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PreTrainedConfig`] for more information.
+    This configuration class defines all the hyperparameters for the vision component
+    of the PP-Chart2Table model, which is responsible for processing chart images
+    and extracting visual features for table structure recognition and content extraction.
+    PaddlePaddle/PP-Chart2Table_safetensors [PaddlePaddle/PP-Chart2Table_safetensors]
+    (https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors)
 
     Args:
-            depth (`int`, *optional*, defaults to 12):
-                Number of hidden layers in the vision Transformer encoder.
-            embed_dim (`int`, *optional*, defaults to 768):
-                Dimensionality of the patch embedding layer in the vision encoder.
-            hidden_size (`int`, *optional*, defaults to 1024):
-                Dimensionality of the hidden layers in the vision Transformer encoder.
-            num_channels (`int`, *optional*, defaults to 3):
-                Number of input channels for the convolutional stem layer (default: RGB images with 3 channels).
-            image_size (`int`, *optional*, defaults to 1024):
-                The size (resolution) of input chart images (assumed to be square).
-            mlp_ratio (`float`, *optional*, defaults to 4.0):
-                Ratio of the dimensionality of the feed-forward layer to the hidden size in the vision Transformer blocks.
-            num_attention_heads (`int`, *optional*, defaults to 12):
-                Number of attention heads for each self-attention layer in the vision Transformer encoder.
-            patch_size (`int`, *optional*, defaults to 16):
-                The size (resolution) of each image patch extracted from the input chart image.
-            qkv_bias (`bool`, *optional*, defaults to `True`):
-                Whether to add bias terms to the query, key, and value projection layers in the self-attention mechanism.
-            use_rel_pos (`bool`, *optional*, defaults to `True`):
-                Whether to use relative positional embeddings in the self-attention layers of the vision encoder.
-            global_attn_indexes (`Optional[List[int]]`, *optional*, defaults to `None`):
-                List of layer indexes where global attention (instead of windowed attention) is applied in the vision encoder.
-                If `None`, defaults to `[2, 5, 8, 11]`.
-            window_size (`int`, *optional*, defaults to 14):
-                The size of the attention window for windowed self-attention in the vision Transformer layers.
-            output_channels (`int`, *optional*, defaults to 256):
-                Number of output channels from the convolutional stem layer before patch embedding.
-            net_channels (`int`, *optional*, defaults to 512):
-                Number of channels in the intermediate convolutional layers.
-            attention_dropout (`float`, *optional*, defaults to 0.0):
-                Dropout probability for the attention layers in the vision Transformer.
-            output_hidden_states (`bool`, *optional*, defaults to `False`): <fill_docstring>
-            output_attentions (`bool`, *optional*, defaults to `False`): <fill_docstring>
-            return_dict (`bool`, *optional*, defaults to `True`): <fill_docstring>
-            dtype (`Union`, *optional*): <fill_docstring>
-            tie_word_embeddings (`bool`, *optional*, defaults to `True`): <fill_docstring>
-            chunk_size_feed_forward (`int`, *optional*, defaults to 0): <fill_docstring>
-            is_encoder_decoder (`bool`, *optional*, defaults to `False`): <fill_docstring>
-            is_decoder (`bool`, *optional*, defaults to `False`): <fill_docstring>
-            cross_attention_hidden_size (`Optional`, *optional*): <fill_docstring>
-            add_cross_attention (`bool`, *optional*, defaults to `False`): <fill_docstring>
-            architectures (`Optional`, *optional*): <fill_docstring>
-            finetuning_task (`Optional`, *optional*): <fill_docstring>
-            id2label (`Optional`, *optional*): <fill_docstring>
-            label2id (`Optional`, *optional*): <fill_docstring>
-            num_labels (`Optional`, *optional*): <fill_docstring>
-            task_specific_params (`Optional`, *optional*): <fill_docstring>
-            problem_type (`Optional`, *optional*): <fill_docstring>
-            tokenizer_class (`Optional`, *optional*): <fill_docstring>
-            prefix (`Optional`, *optional*): <fill_docstring>
-            bos_token_id (`Optional`, *optional*): <fill_docstring>
-            pad_token_id (`Optional`, *optional*): <fill_docstring>
-            eos_token_id (`Optional`, *optional*): <fill_docstring>
-            sep_token_id (`Optional`, *optional*): <fill_docstring>
-            decoder_start_token_id (`Optional`, *optional*): <fill_docstring>
-
-    Example:
-
-    ```python
-    >>> from transformers import PPChart2TableVisionConfig, PPChart2TableVisionModel
-
-    >>> # Initializing a PPChart2TableVisionConfig with default PP-Chart2Table style configuration
-    >>> configuration = PPChart2TableVisionConfig()
-
-    >>> # Initializing a PPChart2TableVisionModel (with random weights) from the PP-Chart2Table style configuration
-    >>> model = PPChart2TableVisionModel(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
+        depth (`int`, *optional*, defaults to 12):
+            Number of transformer encoder layers in the vision backbone.
+        embed_dim (`int`, *optional*, defaults to 768):
+            Dimensionality of the patch embedding vectors.
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the hidden layer in the feed-forward network (MLP).
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of input channels (3 for RGB images, 1 for grayscale).
+        image_size (`int`, *optional*, defaults to 1024):
+            Size (height/width) of the input images (assumed to be square).
+        mlp_ratio (`float`, *optional*, defaults to 4.0):
+            Ratio of the hidden layer size to the embedding dimension in the MLP (hidden_size = embed_dim * mlp_ratio).
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each transformer encoder layer.
+        patch_size (`int`, *optional*, defaults to 16):
+            Size (height/width) of the image patches extracted from the input image.
+        qkv_bias (`bool`, *optional*, defaults to True):
+            Whether to include bias terms in the query, key, value projection layers of self-attention.
+        use_rel_pos (`bool`, *optional*, defaults to True):
+            Whether to use relative positional embeddings in the self-attention mechanism.
+        global_attn_indexes (`Optional[list[int]]`, *optional*, defaults to [2, 5, 8, 11]):
+            List of layer indexes where global attention (instead of window attention) is applied.
+            If `None`, defaults to [2, 5, 8, 11].
+        window_size (`int`, *optional*, defaults to 14):
+            Size of the attention window for window-based self-attention (only effective when use_rel_pos=True).
+        output_channels (`int`, *optional*, defaults to 256):
+            Dimensionality of the final visual feature output channels.
+        net_channels (`int`, *optional*, defaults to 512):
+            Dimensionality of intermediate network channels in the vision backbone.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability applied to the attention weights.
+        **kwargs:
+            Additional keyword arguments passed to the parent `PreTrainedConfig` class.
     """
 
     model_type = "pp_chart2table_vision"
@@ -144,7 +108,7 @@ def init(
         patch_size: int = 16,
         qkv_bias: bool = True,
         use_rel_pos: bool = True,
-        global_attn_indexes: Optional[List[int]] = None,
+        global_attn_indexes: Optional[list[int]] = None,
         window_size: int = 14,
         output_channels: int = 256,
         net_channels: int = 512,
@@ -396,7 +360,6 @@ def __init__(
             text_config = {}
         self.text_config = PPChart2TableTextConfig(**text_config)
 
-
         text_config_keys = [
             "attention_dropout",
             "bos_token_id",

From 01f2b299440089ce611c86aaa86e438005fb929f Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Sat, 28 Feb 2026 14:46:13 +0800
Subject: [PATCH 08/60] update

---
 .../configuration_pp_chart2table.py           | 25 +++++++++++--------
 .../pp_chart2table/modular_pp_chart2table.py  | 25 +++++++++++--------
 2 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
index 7fa3b6d36af8..bca3aa6cb797 100644
--- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
@@ -37,9 +37,9 @@ class PPChart2TableVisionConfig(PreTrainedConfig):
             Number of attention heads for each transformer encoder layer.
         patch_size (`int`, *optional*, defaults to 16):
             Size (height/width) of the image patches extracted from the input image.
-        qkv_bias (`bool`, *optional*, defaults to True):
+        qkv_bias (`bool`, *optional*, defaults to `True`):
             Whether to include bias terms in the query, key, value projection layers of self-attention.
-        use_rel_pos (`bool`, *optional*, defaults to True):
+        use_rel_pos (`bool`, *optional*, defaults to `True`):
             Whether to use relative positional embeddings in the self-attention mechanism.
         global_attn_indexes (`Optional[list[int]]`, *optional*, defaults to [2, 5, 8, 11]):
             List of layer indexes where global attention (instead of window attention) is applied.
@@ -52,14 +52,12 @@ class PPChart2TableVisionConfig(PreTrainedConfig):
             Dimensionality of intermediate network channels in the vision backbone.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             Dropout probability applied to the attention weights.
-        **kwargs:
-            Additional keyword arguments passed to the parent `PreTrainedConfig` class.
     """
 
     model_type = "pp_chart2table_vision"
     base_config_key = "vision_config"
 
-    def init(
+    def __init__(
         self,
         depth: int = 12,
         embed_dim: int = 768,
@@ -93,7 +91,7 @@ def init(
         self.output_channels = output_channels
         self.net_channels = net_channels
         self.attention_dropout = attention_dropout
-        super().init(**kwargs)
+        super().__init__(**kwargs)
 
 
 class PPChart2TableTextConfig(PreTrainedConfig):
@@ -266,11 +264,16 @@ class PPChart2TableConfig(PreTrainedConfig):
     documentation from [PreTrainedConfig] for more information.
 
     Args:
-            vision_config (Optional[Dict], optional, defaults to None, *optional*):
-            text_config (Optional[Dict], optional, defaults to None, *optional*):
-            image_token_index (Optional[int], optional, defaults to 151859, *optional*, defaults to 151859):
-            image_seq_length (Optional[int], optional, defaults to 576, *optional*, defaults to 576):
-            pad_token_id (Optional[int], optional, defaults to -1, *optional*, defaults to -1):
+        vision_config (Optional[Dict], optional, *optional*)::
+            The [PPChart2TableVisionConfig] for the vision sub-model. Defaults to None.
+        text_config (Optional[Dict], optional, *optional*)::
+            The [PPChart2TableTextConfig] for the text sub-model. Defaults to None.
+        image_token_index (Optional[int], optional, *optional*, defaults to 151859)::
+            The index of the image token. Defaults to 151859.
+        image_seq_length (Optional[int], optional, *optional*, defaults to 576)::
+            The sequence length for the image. Defaults to 576.
+        pad_token_id (Optional[int], optional, *optional*, defaults to -1):
+            The index of the padding token. Defaults to -1.
 
     Example:
 
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index 87701c5ded9f..fca8b6f152d8 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -74,9 +74,9 @@ class PPChart2TableVisionConfig(PreTrainedConfig):
             Number of attention heads for each transformer encoder layer.
         patch_size (`int`, *optional*, defaults to 16):
             Size (height/width) of the image patches extracted from the input image.
-        qkv_bias (`bool`, *optional*, defaults to True):
+        qkv_bias (`bool`, *optional*, defaults to `True`):
             Whether to include bias terms in the query, key, value projection layers of self-attention.
-        use_rel_pos (`bool`, *optional*, defaults to True):
+        use_rel_pos (`bool`, *optional*, defaults to `True`):
             Whether to use relative positional embeddings in the self-attention mechanism.
         global_attn_indexes (`Optional[list[int]]`, *optional*, defaults to [2, 5, 8, 11]):
             List of layer indexes where global attention (instead of window attention) is applied.
@@ -89,14 +89,12 @@ class PPChart2TableVisionConfig(PreTrainedConfig):
             Dimensionality of intermediate network channels in the vision backbone.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             Dropout probability applied to the attention weights.
-        **kwargs:
-            Additional keyword arguments passed to the parent `PreTrainedConfig` class.
     """
 
     model_type = "pp_chart2table_vision"
     base_config_key = "vision_config"
 
-    def init(
+    def __init__(
         self,
         depth: int = 12,
         embed_dim: int = 768,
@@ -130,7 +128,7 @@ def init(
         self.output_channels = output_channels
         self.net_channels = net_channels
         self.attention_dropout = attention_dropout
-        super().init(**kwargs)
+        super().__init__(**kwargs)
 
 
 class PPChart2TableTextConfig(PreTrainedConfig):
@@ -303,11 +301,16 @@ class PPChart2TableConfig(PreTrainedConfig):
     documentation from [PreTrainedConfig] for more information.
 
     Args:
-            vision_config (Optional[Dict], optional, defaults to None, *optional*):
-            text_config (Optional[Dict], optional, defaults to None, *optional*):
-            image_token_index (Optional[int], optional, defaults to 151859, *optional*, defaults to 151859):
-            image_seq_length (Optional[int], optional, defaults to 576, *optional*, defaults to 576):
-            pad_token_id (Optional[int], optional, defaults to -1, *optional*, defaults to -1):
+        vision_config (Optional[Dict], optional, *optional*)::
+            The [PPChart2TableVisionConfig] for the vision sub-model. Defaults to None.
+        text_config (Optional[Dict], optional, *optional*)::
+            The [PPChart2TableTextConfig] for the text sub-model. Defaults to None.
+        image_token_index (Optional[int], optional, *optional*, defaults to 151859)::
+            The index of the image token. Defaults to 151859.
+        image_seq_length (Optional[int], optional, *optional*, defaults to 576)::
+            The sequence length for the image. Defaults to 576.
+        pad_token_id (Optional[int], optional, *optional*, defaults to -1):
+            The index of the padding token. Defaults to -1.
 
     Example:
 

From d8cc88110e1f38646ef71f310ab96c6e8bc198f6 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Mon, 2 Mar 2026 17:07:28 +0800
Subject: [PATCH 09/60] update

---
 .../image_processing_pp_chart2table.py        |  8 ++++----
 .../image_processing_pp_chart2table_fast.py   |  8 ++++----
 .../pp_chart2table/modular_pp_chart2table.py  | 20 +++++++++----------
 .../processing_pp_chart2table.py              |  4 ++--
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
index e83a49a99f1b..3f17cb754904 100644
--- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
@@ -130,17 +130,17 @@ def preprocess(
             input_data_format = infer_channel_dimension_format(images[0])
 
         # transformations
-        resize_imgs = []
+        resize_images = []
         if do_resize:
             for image in images:
-                img = resize(
+                image = resize(
                     image,
                     size=(size["height"], size["width"]),
                     resample=resample,
                     input_data_format=input_data_format,
                 )
-                resize_imgs.append(img)
-            images = resize_imgs
+                resize_images.append(image)
+            images = resize_images
 
         if do_rescale:
             images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images]
diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
index 86a6cdb3a672..67007860e29f 100644
--- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
+++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
@@ -72,12 +72,12 @@ def _preprocess(
         **kwargs,
     ) -> BatchFeature:
         data = {}
-        resize_imgs = []
+        resize_images = []
         if do_resize:
             for image in images:
-                img = self.resize(image, size=size, interpolation=interpolation)
-                resize_imgs.append(img)
-            images = resize_imgs
+                image = self.resize(image, size=size, interpolation=interpolation)
+                resize_images.append(image)
+            images = resize_images
 
         processed_images = []
         for image in images:
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index fca8b6f152d8..bb964589ac84 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -497,17 +497,17 @@ def preprocess(
             input_data_format = infer_channel_dimension_format(images[0])
 
         # transformations
-        resize_imgs = []
+        resize_images = []
         if do_resize:
             for image in images:
-                img = resize(
+                image = resize(
                     image,
                     size=(size["height"], size["width"]),
                     resample=resample,
                     input_data_format=input_data_format,
                 )
-                resize_imgs.append(img)
-            images = resize_imgs
+                resize_images.append(image)
+            images = resize_images
 
         if do_rescale:
             images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images]
@@ -583,12 +583,12 @@ def _preprocess(
         **kwargs,
     ) -> BatchFeature:
         data = {}
-        resize_imgs = []
+        resize_images = []
         if do_resize:
             for image in images:
-                img = self.resize(image, size=size, interpolation=interpolation)
-                resize_imgs.append(img)
-            images = resize_imgs
+                image = self.resize(image, size=size, interpolation=interpolation)
+                resize_images.append(image)
+            images = resize_images
 
         processed_images = []
         for image in images:
@@ -633,8 +633,8 @@ def __call__(
         else:
             image_inputs = {}
         img_cnt = len(image_inputs)
-        _, _, h, _ = image_inputs["pixel_values"].shape
-        num_patches = h // self.image_processor.patch_size // self.image_processor.merge_size
+        _, _, height, _ = image_inputs["pixel_values"].shape
+        num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size
         prompt = (
             "<|im_start|>system\n"
             "You should follow the instructions carefully and explain your answers in detail.<|im_end|><|im_start|>user\n"
diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
index a2281c9d5f3e..c9d83d12082b 100644
--- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
@@ -41,8 +41,8 @@ def __call__(
         else:
             image_inputs = {}
         img_cnt = len(image_inputs)
-        _, _, h, _ = image_inputs["pixel_values"].shape
-        num_patches = h // self.image_processor.patch_size // self.image_processor.merge_size
+        _, _, height, _ = image_inputs["pixel_values"].shape
+        num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size
         prompt = (
             "<|im_start|>system\n"
             "You should follow the instructions carefully and explain your answers in detail.<|im_end|><|im_start|>user\n"

From 117a6cb5ef116561278a15e7ae7955957faa8d6e Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Mon, 9 Mar 2026 16:41:24 +0800
Subject: [PATCH 10/60] update

---
 .../configuration_pp_chart2table.py           |  21 ++
 .../image_processing_pp_chart2table_fast.py   |   6 +
 .../pp_chart2table/modeling_pp_chart2table.py |  60 +++--
 .../pp_chart2table/modular_pp_chart2table.py  | 249 ++++++------------
 .../processing_pp_chart2table.py              |  16 +-
 5 files changed, 156 insertions(+), 196 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
index bca3aa6cb797..7e0a964b5254 100644
--- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
@@ -8,8 +8,19 @@
 
 from transformers.configuration_utils import PreTrainedConfig, layer_type_validation
 from transformers.modeling_rope_utils import RopeParameters
+from transformers.utils import auto_docstring
 
 
+@auto_docstring(
+    custom_intro="""
+    This configuration class defines all the hyperparameters for the vision component
+    of the PP-Chart2Table model, which is responsible for processing chart images
+    and extracting visual features for table structure recognition and content extraction.
+    PaddlePaddle/PP-Chart2Table_safetensors [PaddlePaddle/PP-Chart2Table_safetensors]
+    (https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors)
+    """,
+    checkpoint="PaddlePaddle/PP-Chart2Table_safetensors",
+)
 class PPChart2TableVisionConfig(PreTrainedConfig):
     """
     Configuration class for the vision backbone of PP-Chart2Table model.
@@ -94,6 +105,11 @@ def __init__(
         super().__init__(**kwargs)
 
 
+@auto_docstring(
+    custom_intro="""
+    
+    """,
+)
 class PPChart2TableTextConfig(PreTrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`PPChart2TableTextModel`]. It is used to instantiate a
@@ -248,6 +264,11 @@ def __init__(
         )
 
 
+@auto_docstring(
+    custom_intro="""
+    
+    """
+)
 class PPChart2TableConfig(PreTrainedConfig):
     r"""
     This is the main configuration class to store the configuration of a [PPChart2TableModel] or [PPChart2TableForConditionalGeneration].
diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
index 67007860e29f..d9a942aab565 100644
--- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
+++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
@@ -12,8 +12,14 @@
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.image_processing_utils_fast import BaseImageProcessorFast
 from transformers.processing_utils import TensorType
+from transformers.utils import auto_docstring
 
 
+@auto_docstring(
+    custom_intro="""
+
+    """
+)
 class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
     r"""
     Fast image processor for the PP-Chart2Table multimodal model, optimized for GPU-accelerated chart image preprocessing.
diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
index 1c9790b4d1a2..d8e4f28e89f6 100644
--- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
@@ -15,9 +15,9 @@
 
 from transformers.cache_utils import Cache
 from transformers.generation import GenerationMixin
-from transformers.modeling_outputs import ModelOutput
+from transformers.modeling_outputs import BaseModelOutputWithPast
 from transformers.modeling_utils import PreTrainedModel
-from transformers.utils import can_return_tuple
+from transformers.utils import auto_docstring, can_return_tuple
 from transformers.utils.generic import check_model_inputs
 
 from ...activations import ACT2FN
@@ -26,11 +26,10 @@
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
-from ...modeling_outputs import BaseModelOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring
+from ...utils import TransformersKwargs
 from ...utils.generic import maybe_autocast
 from .configuration_pp_chart2table import PPChart2TableConfig, PPChart2TableTextConfig, PPChart2TableVisionConfig
 
@@ -346,6 +345,11 @@ def forward(self, hidden_states):
         return hidden_states
 
 
+@auto_docstring(
+    custom_intro="""
+    
+    """
+)
 class PPChart2TableVisionPreTrainedModel(PreTrainedModel):
     r"""
     Base class for all PP-Chart2Table vision models, inheriting from Hugging Face `PreTrainedModel`.
@@ -393,6 +397,11 @@ class PPChart2TableVisionPreTrainedModel(PreTrainedModel):
     }
 
 
+@auto_docstring(
+    custom_intro="""
+    
+    """
+)
 class PPChart2TableVisionModel(PPChart2TableVisionPreTrainedModel):
     main_input_name = "pixel_values"
     input_modalities = "image"
@@ -661,7 +670,11 @@ def forward(
         return hidden_states
 
 
-@auto_docstring
+@auto_docstring(
+    custom_intro="""
+    
+    """
+)
 class PPChart2TableTextPreTrainedModel(PreTrainedModel):
     config: PPChart2TableTextConfig
     base_model_prefix = "model"
@@ -837,7 +850,7 @@ def forward(
 
 
 @dataclass
-class PPChart2TableModelOutputWithPast(ModelOutput):
+class PPChart2TableModelOutputWithPast(BaseModelOutputWithPast):
     r"""
     Output class for PPChart2Table multimodal model's forward pass, extending Hugging Face `ModelOutput`.
 
@@ -855,15 +868,11 @@ class PPChart2TableModelOutputWithPast(ModelOutput):
             Tuple of attention weights from each layer of the text decoder (for debugging/analysis).
     """
 
-    past_key_values: Optional[Cache] = None
-    last_hidden_state: Optional[torch.FloatTensor] = None
-    hidden_states: Optional[tuple[torch.FloatTensor]] = None
-    attentions: Optional[tuple[torch.FloatTensor]] = None
     image_hidden_states: Optional[torch.FloatTensor] = None
 
 
 @dataclass
-class PPChart2TableCausalLMOutputWithPast(ModelOutput):
+class PPChart2TableCausalLMOutputWithPast(BaseModelOutputWithPast):
     r"""
     Output class for PP-Chart2Table conditional generation model's forward pass.
 
@@ -873,24 +882,18 @@ class PPChart2TableCausalLMOutputWithPast(ModelOutput):
     Attributes:
         logits (`Optional[torch.FloatTensor]`, defaults to `None`):
             Language modeling logits (shape: `[B, seq_len, vocab_size]`), output from the LM head.
-        past_key_values (`Optional[Cache]`, defaults to `None`):
-            Cached attention key/value pairs (inherited from base model output).
-        last_hidden_state (`Optional[torch.FloatTensor]`, defaults to `None`):
-            Final hidden states from the text decoder (inherited from base model output).
-        hidden_states (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`):
-            Tuple of decoder layer hidden states (inherited from base model output).
-        attentions (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`):
-            Tuple of decoder layer attention weights (inherited from base model output).
     """
 
     logits: Optional[torch.FloatTensor] = None
-    past_key_values: Optional[Cache] = None
-    last_hidden_state: Optional[torch.FloatTensor] = None
-    hidden_states: Optional[tuple[torch.FloatTensor]] = None
-    attentions: Optional[tuple[torch.FloatTensor]] = None
+    loss: Optional[torch.FloatTensor] = None
     image_hidden_states: Optional[torch.FloatTensor] = None
 
 
+@auto_docstring(
+    custom_intro="""
+    
+    """
+)
 class PPChart2TablePreTrainedModel(PreTrainedModel):
     r"""
     Base class for all PP-Chart2Table multimodal models, inheriting from Hugging Face `PreTrainedModel`.
@@ -937,6 +940,11 @@ class PPChart2TablePreTrainedModel(PreTrainedModel):
     }
 
 
+@auto_docstring(
+    custom_intro="""
+    Core PP-Chart2Table multimodal model (vision encoder + text decoder) for chart-to-table parsing.
+    """
+)
 class PPChart2TableModel(PPChart2TablePreTrainedModel):
     r"""
     Core PP-Chart2Table multimodal model (vision encoder + text decoder) for chart-to-table parsing.
@@ -1094,6 +1102,12 @@ def forward(
         )
 
 
+@auto_docstring(
+    custom_intro="""
+    PP-Chart2Table model for conditional generation (table text generation from chart images),
+    extending the core model with a language modeling (LM) head and generation utilities.
+    """
+)
 class PPChart2TableForConditionalGeneration(PPChart2TablePreTrainedModel, GenerationMixin):
     r"""
     PP-Chart2Table model for conditional generation (table text generation from chart images),
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index bb964589ac84..f6850641fc14 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -10,20 +10,8 @@
 from transformers.configuration_utils import PreTrainedConfig, layer_type_validation
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.generation import GenerationMixin
-from transformers.image_processing_utils import BaseImageProcessor
 from transformers.image_processing_utils_fast import BaseImageProcessorFast
-from transformers.image_transforms import flip_channel_order, resize, to_channel_dimension_format
-from transformers.image_utils import (
-    ChannelDimension,
-    ImageInput,
-    PILImageResampling,
-    infer_channel_dimension_format,
-    make_flat_list_of_images,
-    to_numpy_array,
-    valid_images,
-    validate_preprocess_arguments,
-)
-from transformers.modeling_outputs import ModelOutput
+from transformers.modeling_outputs import BaseModelOutputWithPast
 from transformers.modeling_rope_utils import RopeParameters
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.got_ocr2.modeling_got_ocr2 import (
@@ -40,13 +28,27 @@
     Qwen2Model,
     Qwen2PreTrainedModel,
 )
-from transformers.processing_utils import ProcessorMixin, TensorType
 from transformers.utils import (
+    auto_docstring,
     can_return_tuple,
-    filter_out_non_signature_kwargs,
+    logging,
 )
+from transformers.processing_utils import ProcessorMixin, TensorType
+from transformers.utils import can_return_tuple
 
+logger = logging.get_logger(__name__)
 
+
+@auto_docstring(
+    custom_intro="""
+    This configuration class defines all the hyperparameters for the vision component
+    of the PP-Chart2Table model, which is responsible for processing chart images
+    and extracting visual features for table structure recognition and content extraction.
+    PaddlePaddle/PP-Chart2Table_safetensors [PaddlePaddle/PP-Chart2Table_safetensors]
+    (https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors)
+    """,
+    checkpoint="PaddlePaddle/PP-Chart2Table_safetensors",
+)
 class PPChart2TableVisionConfig(PreTrainedConfig):
     """
     Configuration class for the vision backbone of PP-Chart2Table model.
@@ -131,6 +133,12 @@ def __init__(
         super().__init__(**kwargs)
 
 
+@auto_docstring(
+    custom_intro="""
+    
+    """,
+
+)
 class PPChart2TableTextConfig(PreTrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`PPChart2TableTextModel`]. It is used to instantiate a
@@ -285,6 +293,11 @@ def __init__(
         )
 
 
+@auto_docstring(
+    custom_intro="""
+    
+    """
+)
 class PPChart2TableConfig(PreTrainedConfig):
     r"""
     This is the main configuration class to store the configuration of a [PPChart2TableModel] or [PPChart2TableForConditionalGeneration].
@@ -389,142 +402,11 @@ def __init__(
 
         super().__init__(**kwargs)
 
+@auto_docstring(
+    custom_intro="""
 
-class PPChart2TableImageProcessor(BaseImageProcessor):
-    r"""
-    Image processor for the PP-Chart2Table multimodal model, optimized for chart image preprocessing tasks.
-
-    This processor handles the complete preprocessing pipeline for chart images, including resizing, rescaling,
-    normalization, and channel dimension reordering, tailored to the input requirements of the PP-Chart2Table vision encoder.
-
-    Args:
-        do_resize (`bool`, *optional*, defaults to `True`):
-            Whether to resize the input images to the specified `size`.
-        size (`dict[str, int]`, *optional*, defaults to `{"height": 256, "width": 256}`):
-            Dictionary containing the target height and width for resizing. Format: `{"height": int, "width": int}`.
-        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
-            Resampling filter to use when resizing images (e.g., BICUBIC, BILINEAR).
-        do_rescale (`bool`, *optional*, defaults to `True`):
-            Whether to rescale the pixel values from the range [0, 255] to [0, 1] using `rescale_factor`.
-        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
-            Factor to apply for rescaling pixel values (e.g., 1/255 scales 0-255 to 0-1).
-        do_normalize (`bool`, *optional*, defaults to `True`):
-            Whether to normalize the input images using `image_mean` and `image_std`.
-        image_mean (`float` or `list[float]`, *optional*, defaults to `[0.406, 0.456, 0.485]`):
-            Mean values for image normalization (per channel, RGB order).
-        image_std (`float` or `list[float]`, *optional*, defaults to `[0.225, 0.224, 0.229]`):
-            Standard deviation values for image normalization (per channel, RGB order).
-        patch_size (`int`, *optional*, defaults to 16):
-            Size of image patches used by the PP-Chart2Table vision encoder (for alignment with model input).
-        merge_size (`int`, *optional*, defaults to 4):
-            Size factor for merging image patches (specific to PP-Chart2Table's vision processing pipeline).
     """
-
-    model_input_names = ["pixel_values"]
-
-    def __init__(
-        self,
-        do_resize: bool = True,
-        size: Optional[dict[str, int]] = None,
-        resample: Optional[PILImageResampling] = PILImageResampling.BICUBIC,
-        do_rescale: bool = True,
-        rescale_factor: Union[int, float] = 1 / 255,
-        do_normalize: bool = True,
-        image_mean: Optional[Union[float, list[float]]] = [0.406, 0.456, 0.485],
-        image_std: Optional[Union[float, list[float]]] = [0.225, 0.224, 0.229],
-        patch_size: int = 16,
-        merge_size: int = 4,
-        **kwargs,
-    ) -> None:
-        super().__init__(**kwargs)
-        size = size if size is not None else {"height": 256, "width": 256}
-
-        self.do_resize = do_resize
-        self.size = size
-        self.do_rescale = do_rescale
-        self.rescale_factor = rescale_factor
-        self.do_normalize = do_normalize
-        self.image_mean = image_mean
-        self.image_std = image_std
-        self.resample = resample
-        self.patch_size = patch_size
-        self.merge_size = merge_size
-
-    @filter_out_non_signature_kwargs()
-    def preprocess(
-        self,
-        images: ImageInput,
-        size: Optional[dict[str, int]] = None,
-        do_resize: Optional[bool] = None,
-        resample: Optional[PILImageResampling] = None,
-        do_rescale: Optional[bool] = None,
-        rescale_factor: Optional[Union[int, float]] = None,
-        do_normalize: Optional[bool] = None,
-        image_mean: Optional[Union[float, list[float]]] = None,
-        image_std: Optional[Union[float, list[float]]] = None,
-        return_tensors: Optional[Union[TensorType, str]] = None,
-        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
-        input_data_format: Optional[Union[str, ChannelDimension]] = None,
-    ) -> BatchFeature:
-        size = self.size if size is None else size
-        do_resize = self.do_resize if do_resize is None else do_resize
-        resample = self.resample if resample is None else resample
-        do_rescale = self.do_rescale if do_rescale is None else do_rescale
-        rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor
-        do_normalize = self.do_normalize if do_normalize is None else do_normalize
-        image_mean = self.image_mean if image_mean is None else image_mean
-        image_std = self.image_std if image_std is None else image_std
-
-        images = make_flat_list_of_images(images)
-
-        validate_preprocess_arguments(
-            do_rescale=do_rescale,
-            rescale_factor=rescale_factor,
-            do_normalize=do_normalize,
-            image_mean=image_mean,
-            image_std=image_std,
-            size=size,
-            do_resize=do_resize,
-            resample=resample,
-        )
-
-        if not valid_images(images):
-            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor")
-
-        # All transformations expect numpy arrays
-        images = [to_numpy_array(image) for image in images]
-        if input_data_format is None:
-            input_data_format = infer_channel_dimension_format(images[0])
-
-        # transformations
-        resize_images = []
-        if do_resize:
-            for image in images:
-                image = resize(
-                    image,
-                    size=(size["height"], size["width"]),
-                    resample=resample,
-                    input_data_format=input_data_format,
-                )
-                resize_images.append(image)
-            images = resize_images
-
-        if do_rescale:
-            images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images]
-
-        if do_normalize:
-            images = [
-                self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
-            ]
-        images = [flip_channel_order(image, input_data_format=input_data_format) for image in images]
-        images = [
-            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
-        ]
-
-        encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
-        return encoded_inputs
-
-
+)
 class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
     r"""
     Fast image processor for the PP-Chart2Table multimodal model, optimized for GPU-accelerated chart image preprocessing.
@@ -603,12 +485,23 @@ def _preprocess(
         return encoded_inputs
 
 
+@auto_docstring(
+    custom_intro="""
+    A multi-modal processor for the PPChart2Table model, combining image preprocessing and text tokenization
+    capabilities to handle chart-to-table conversion tasks.
+    
+    This processor integrates `PPChart2TableImageProcessorFast` for chart image preprocessing (e.g., patch-based 
+    resizing) and `Qwen2Tokenizer` for text prompt construction/tokenization. It encapsulates the end-to-end 
+    processing pipeline from raw chart images + text instructions to model-ready input tensors, and also provides 
+    postprocessing logic to decode model outputs back to human-readable table text.
+    """
+)
 class PPChart2TableProcessor(ProcessorMixin):
     r"""
-    [`PPChart2TableProcessor`] offers all the functionalities of [`PPChart2TableImageProcessor`] and [`Qwen2Tokenizer`]. See the
+    [`PPChart2TableProcessor`] offers all the functionalities of [`PPChart2TableImageProcessorFast`] and [`Qwen2Tokenizer`]. See the
     [`~PPChart2TableProcessor.__call__`] and [`~PPChart2TableProcessor.decode`] for more information.
     Args:
-        image_processor ([`PPChart2TableImageProcessor`], *optional*):
+        image_processor ([`PPChart2TableImageProcessorFast`], *optional*):
             The image processor is a required input.
         tokenizer ([`Qwen2Tokenizer`], *optional*):
             The tokenizer is a required input.
@@ -713,6 +606,11 @@ def __init__(self, config: PPChart2TableVisionConfig):
         self.layer_norm2 = PPChart2TableVisionLayerNorm(config.output_channels, data_format="channels_first")
 
 
+@auto_docstring(
+    custom_intro="""
+    
+    """
+)
 class PPChart2TableVisionPreTrainedModel(PreTrainedModel):
     r"""
     Base class for all PP-Chart2Table vision models, inheriting from Hugging Face `PreTrainedModel`.
@@ -760,6 +658,11 @@ class PPChart2TableVisionPreTrainedModel(PreTrainedModel):
     }
 
 
+@auto_docstring(
+    custom_intro="""
+    
+    """
+)
 class PPChart2TableVisionModel(PPChart2TableVisionPreTrainedModel):
     main_input_name = "pixel_values"
     input_modalities = "image"
@@ -815,6 +718,11 @@ class PPChart2TableTextDecoderLayer(Qwen2DecoderLayer):
     pass
 
 
+@auto_docstring(
+    custom_intro="""
+    
+    """
+)
 class PPChart2TableTextPreTrainedModel(Qwen2PreTrainedModel):
     pass
 
@@ -824,7 +732,7 @@ class PPChart2TableTextModel(Qwen2Model):
 
 
 @dataclass
-class PPChart2TableModelOutputWithPast(ModelOutput):
+class PPChart2TableModelOutputWithPast(BaseModelOutputWithPast):
     r"""
     Output class for PPChart2Table multimodal model's forward pass, extending Hugging Face `ModelOutput`.
 
@@ -841,16 +749,11 @@ class PPChart2TableModelOutputWithPast(ModelOutput):
         attentions (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`):
             Tuple of attention weights from each layer of the text decoder (for debugging/analysis).
     """
-
-    past_key_values: Optional[Cache] = None
-    last_hidden_state: Optional[torch.FloatTensor] = None
-    hidden_states: Optional[tuple[torch.FloatTensor]] = None
-    attentions: Optional[tuple[torch.FloatTensor]] = None
     image_hidden_states: Optional[torch.FloatTensor] = None
 
 
 @dataclass
-class PPChart2TableCausalLMOutputWithPast(ModelOutput):
+class PPChart2TableCausalLMOutputWithPast(BaseModelOutputWithPast):
     r"""
     Output class for PP-Chart2Table conditional generation model's forward pass.
 
@@ -860,24 +763,18 @@ class PPChart2TableCausalLMOutputWithPast(ModelOutput):
     Attributes:
         logits (`Optional[torch.FloatTensor]`, defaults to `None`):
             Language modeling logits (shape: `[B, seq_len, vocab_size]`), output from the LM head.
-        past_key_values (`Optional[Cache]`, defaults to `None`):
-            Cached attention key/value pairs (inherited from base model output).
-        last_hidden_state (`Optional[torch.FloatTensor]`, defaults to `None`):
-            Final hidden states from the text decoder (inherited from base model output).
-        hidden_states (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`):
-            Tuple of decoder layer hidden states (inherited from base model output).
-        attentions (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`):
-            Tuple of decoder layer attention weights (inherited from base model output).
     """
 
     logits: Optional[torch.FloatTensor] = None
-    past_key_values: Optional[Cache] = None
-    last_hidden_state: Optional[torch.FloatTensor] = None
-    hidden_states: Optional[tuple[torch.FloatTensor]] = None
-    attentions: Optional[tuple[torch.FloatTensor]] = None
+    loss: Optional[torch.FloatTensor] = None
     image_hidden_states: Optional[torch.FloatTensor] = None
 
 
+@auto_docstring(
+    custom_intro="""
+    
+    """
+)
 class PPChart2TablePreTrainedModel(PreTrainedModel):
     r"""
     Base class for all PP-Chart2Table multimodal models, inheriting from Hugging Face `PreTrainedModel`.
@@ -924,6 +821,11 @@ class PPChart2TablePreTrainedModel(PreTrainedModel):
     }
 
 
+@auto_docstring(
+    custom_intro="""
+    Core PP-Chart2Table multimodal model (vision encoder + text decoder) for chart-to-table parsing.
+    """
+)
 class PPChart2TableModel(PPChart2TablePreTrainedModel):
     r"""
     Core PP-Chart2Table multimodal model (vision encoder + text decoder) for chart-to-table parsing.
@@ -1081,6 +983,12 @@ def forward(
         )
 
 
+@auto_docstring(
+    custom_intro="""
+    PP-Chart2Table model for conditional generation (table text generation from chart images),
+    extending the core model with a language modeling (LM) head and generation utilities.
+    """
+)
 class PPChart2TableForConditionalGeneration(PPChart2TablePreTrainedModel, GenerationMixin):
     r"""
     PP-Chart2Table model for conditional generation (table text generation from chart images),
@@ -1235,7 +1143,6 @@ def prepare_inputs_for_generation(
     "PPChart2TableVisionModel",
     "PPChart2TableVisionConfig",
     "PPChart2TableTextConfig",
-    "PPChart2TableImageProcessor",
     "PPChart2TableImageProcessorFast",
     "PPChart2TableProcessor",
 ]
diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
index c9d83d12082b..1efa2641ff23 100644
--- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
@@ -9,14 +9,26 @@
 
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.processing_utils import ProcessorMixin
+from transformers.utils import auto_docstring
 
 
+@auto_docstring(
+    custom_intro="""
+    A multi-modal processor for the PPChart2Table model, combining image preprocessing and text tokenization
+    capabilities to handle chart-to-table conversion tasks.
+    
+    This processor integrates `PPChart2TableImageProcessorFast` for chart image preprocessing (e.g., patch-based 
+    resizing) and `Qwen2Tokenizer` for text prompt construction/tokenization. It encapsulates the end-to-end 
+    processing pipeline from raw chart images + text instructions to model-ready input tensors, and also provides 
+    postprocessing logic to decode model outputs back to human-readable table text.
+    """
+)
 class PPChart2TableProcessor(ProcessorMixin):
     r"""
-    [`PPChart2TableProcessor`] offers all the functionalities of [`PPChart2TableImageProcessor`] and [`Qwen2Tokenizer`]. See the
+    [`PPChart2TableProcessor`] offers all the functionalities of [`PPChart2TableImageProcessorFast`] and [`Qwen2Tokenizer`]. See the
     [`~PPChart2TableProcessor.__call__`] and [`~PPChart2TableProcessor.decode`] for more information.
     Args:
-        image_processor ([`PPChart2TableImageProcessor`], *optional*):
+        image_processor ([`PPChart2TableImageProcessorFast`], *optional*):
             The image processor is a required input.
         tokenizer ([`Qwen2Tokenizer`], *optional*):
             The tokenizer is a required input.

From db1e9a859fc5050415e59070328d7f85cbc85637 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Tue, 10 Mar 2026 15:07:47 +0800
Subject: [PATCH 11/60] refactor image_processor_fast

---
 .../configuration_pp_chart2table.py           |  17 ++-
 .../image_processing_pp_chart2table_fast.py   |  57 ++++----
 .../pp_chart2table/modeling_pp_chart2table.py | 126 ++++++++----------
 .../pp_chart2table/modular_pp_chart2table.py  |  67 ++++++----
 4 files changed, 138 insertions(+), 129 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
index 7e0a964b5254..a1ce908d6361 100644
--- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
@@ -4,7 +4,6 @@
 #             the file from the modular. If any change should be done, please apply the change to the
 #                          modular_pp_chart2table.py file directly. One of our CI enforces this.
 #                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Optional
 
 from transformers.configuration_utils import PreTrainedConfig, layer_type_validation
 from transformers.modeling_rope_utils import RopeParameters
@@ -80,7 +79,7 @@ def __init__(
         patch_size: int = 16,
         qkv_bias: bool = True,
         use_rel_pos: bool = True,
-        global_attn_indexes: Optional[list[int]] = None,
+        global_attn_indexes: list[int] | None = None,
         window_size: int = 14,
         output_channels: int = 256,
         net_channels: int = 512,
@@ -128,6 +127,8 @@ class PPChart2TableTextConfig(PreTrainedConfig):
             The token ID representing the beginning of a sequence (BOS) for text generation.
         eos_token_id (`int`, *optional*, defaults to 151643):
             The token ID representing the end of a sequence (EOS) for text generation.
+        pad_token_id (Optional[int], optional, *optional*, defaults to -1):
+            The index of the padding token. Defaults to -1.
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the feed-forward and attention layers of the decoder.
         hidden_size (`int`, *optional*, defaults to 1024):
@@ -207,6 +208,7 @@ def __init__(
         attention_dropout: float = 0.0,
         bos_token_id: int = 151643,
         eos_token_id: int = 151643,
+        pad_token_id: int = -1,
         hidden_act: str = "silu",
         hidden_size: int = 1024,
         initializer_range: float = 0.02,
@@ -217,12 +219,12 @@ def __init__(
         num_key_value_heads: int = 16,
         rms_norm_eps: float = 1e-06,
         rope_theta: float = 1000000.0,
-        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
+        rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
         sliding_window: int = 32768,
         tie_word_embeddings: bool = True,
         use_cache: bool = True,
         vocab_size: int = 151860,
-        layer_types: Optional[list[str]] = None,
+        layer_types: list[str] | None = None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -257,6 +259,7 @@ def __init__(
         self.rope_theta = rope_theta
         self.tie_word_embeddings = tie_word_embeddings
         super().__init__(
+            pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
             eos_token_id=eos_token_id,
             tie_word_embeddings=tie_word_embeddings,
@@ -330,9 +333,9 @@ def __init__(
         self,
         vision_config: dict | None = None,
         text_config: dict | None = None,
-        image_token_index: Optional[int] = 151859,
-        image_seq_length: Optional[int] = 576,
-        pad_token_id: Optional[int] = -1,
+        image_token_index: int | None = 151859,
+        image_seq_length: int | None = 576,
+        pad_token_id: int | None = -1,
         **kwargs,
     ):
         self.image_token_index = image_token_index
diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
index d9a942aab565..d70b530acf72 100644
--- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
+++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
@@ -4,13 +4,14 @@
 #             the file from the modular. If any change should be done, please apply the change to the
 #                          modular_pp_chart2table.py file directly. One of our CI enforces this.
 #                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Optional, Union
+from typing import Optional
 
 import torch
-from torchvision.transforms.v2.functional import InterpolationMode
+import torchvision.transforms.v2.functional as tvF
 
 from transformers.feature_extraction_utils import BatchFeature
-from transformers.image_processing_utils_fast import BaseImageProcessorFast
+from transformers.image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
+from transformers.image_utils import SizeDict
 from transformers.processing_utils import TensorType
 from transformers.utils import auto_docstring
 
@@ -65,37 +66,43 @@ def __init__(self, **kwargs) -> None:
 
     def _preprocess(
         self,
-        images: list[torch.Tensor],
-        size: Optional[list[dict[str, int]]],
+        images: list["torch.Tensor"],
         do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["tvF.InterpolationMode"],
         do_rescale: bool,
         rescale_factor: float,
         do_normalize: bool,
-        image_mean: Optional[Union[float, list[float]]],
-        image_std: Optional[Union[float, list[float]]],
-        return_tensors: Optional[Union[str, TensorType]],
-        interpolation: Optional[InterpolationMode] = None,
+        image_mean: float | list[float] | None,
+        image_std: float | list[float] | None,
+        disable_grouping: bool | None,
+        return_tensors: str | TensorType | None,
         **kwargs,
     ) -> BatchFeature:
-        data = {}
-        resize_images = []
-        if do_resize:
-            for image in images:
-                image = self.resize(image, size=size, interpolation=interpolation)
-                resize_images.append(image)
-            images = resize_images
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation)
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
 
-        processed_images = []
-        for image in images:
-            image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std)
-            processed_images.append(image)
-        images = processed_images
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            # BGR to RGB conversion
+            stacked_images = stacked_images[:, [2, 1, 0], :, :]
+            processed_images_grouped[shape] = stacked_images
 
-        images = [image[[2, 1, 0], :, :] for image in images]
-        data.update({"pixel_values": torch.stack(images, dim=0)})
-        encoded_inputs = BatchFeature(data, tensor_type=return_tensors)
+        pixel_values = reorder_images(processed_images_grouped, grouped_images_index)
 
-        return encoded_inputs
+        return BatchFeature(
+            data={"pixel_values": pixel_values},
+            tensor_type=return_tensors,
+        )
 
 
 __all__ = ["PPChart2TableImageProcessorFast"]
diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
index d8e4f28e89f6..41f03d0d2c57 100644
--- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
@@ -7,7 +7,7 @@
 import collections
 from collections.abc import Callable
 from dataclasses import dataclass
-from typing import Optional, Union
+from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -18,7 +18,6 @@
 from transformers.modeling_outputs import BaseModelOutputWithPast
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import auto_docstring, can_return_tuple
-from transformers.utils.generic import check_model_inputs
 
 from ...activations import ACT2FN
 from ...cache_utils import DynamicCache
@@ -30,7 +29,8 @@
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs
-from ...utils.generic import maybe_autocast
+from ...utils.generic import maybe_autocast, merge_with_config_defaults
+from ...utils.output_capturing import capture_outputs
 from .configuration_pp_chart2table import PPChart2TableConfig, PPChart2TableTextConfig, PPChart2TableVisionConfig
 
 
@@ -457,7 +457,7 @@ def rotate_half(x):
 
 
 @use_kernel_func_from_hub("rotary_pos_emb")
-def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
     """Applies Rotary Position Embedding to the query and key tensors.
 
     Args:
@@ -465,8 +465,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
         k (`torch.Tensor`): The key tensor.
         cos (`torch.Tensor`): The cosine part of the rotary embedding.
         sin (`torch.Tensor`): The sine part of the rotary embedding.
-        position_ids (`torch.Tensor`, *optional*):
-            Deprecated and unused.
         unsqueeze_dim (`int`, *optional*, defaults to 1):
             The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
             sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
@@ -501,7 +499,7 @@ def eager_attention_forward(
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
-    attention_mask: Optional[torch.Tensor],
+    attention_mask: torch.Tensor | None,
     scaling: float,
     dropout: float = 0.0,
     **kwargs: Unpack[TransformersKwargs],
@@ -511,8 +509,7 @@ def eager_attention_forward(
 
     attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
     if attention_mask is not None:
-        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
-        attn_weights = attn_weights + causal_mask
+        attn_weights = attn_weights + attention_mask
 
     attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
@@ -546,11 +543,10 @@ def forward(
         self,
         hidden_states: torch.Tensor,
         position_embeddings: tuple[torch.Tensor, torch.Tensor],
-        attention_mask: Optional[torch.Tensor],
-        past_key_values: Optional[Cache] = None,
-        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: torch.Tensor | None,
+        past_key_values: Cache | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         input_shape = hidden_states.shape[:-1]
         hidden_shape = (*input_shape, -1, self.head_dim)
 
@@ -562,13 +558,11 @@ def forward(
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
 
         if past_key_values is not None:
-            # sin and cos are specific to RoPE models; cache_position needed for the static cache
-            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
-            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)
 
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,
@@ -639,12 +633,11 @@ def __init__(self, config: PPChart2TableTextConfig, layer_idx: int):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Cache] = None,
-        use_cache: Optional[bool] = False,
-        cache_position: Optional[torch.LongTensor] = None,
-        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        use_cache: bool | None = False,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> torch.Tensor:
         residual = hidden_states
@@ -656,7 +649,6 @@ def forward(
             position_ids=position_ids,
             past_key_values=past_key_values,
             use_cache=use_cache,
-            cache_position=cache_position,
             position_embeddings=position_embeddings,
             **kwargs,
         )
@@ -714,9 +706,9 @@ def __init__(self, config: PPChart2TableTextConfig, device=None):
 
     @staticmethod
     def compute_default_rope_parameters(
-        config: Optional[PPChart2TableTextConfig] = None,
+        config: PPChart2TableTextConfig | None = None,
         device: Optional["torch.device"] = None,
-        seq_len: Optional[int] = None,
+        seq_len: int | None = None,
     ) -> tuple["torch.Tensor", float]:
         """
         Computes the inverse frequencies according to the original RoPE implementation
@@ -777,17 +769,17 @@ def __init__(self, config: PPChart2TableTextConfig):
         # Initialize weights and apply final processing
         self.post_init()
 
-    @check_model_inputs
+    @merge_with_config_defaults
+    @capture_outputs
     @auto_docstring
     def forward(
         self,
-        input_ids: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Cache] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        cache_position: Optional[torch.LongTensor] = None,
+        input_ids: torch.LongTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        use_cache: bool | None = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> BaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
@@ -799,23 +791,18 @@ def forward(
         if use_cache and past_key_values is None:
             past_key_values = DynamicCache(config=self.config)
 
-        if cache_position is None:
-            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-            cache_position = torch.arange(
-                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
-            )
-
         if position_ids is None:
-            position_ids = cache_position.unsqueeze(0)
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens
+            position_ids = position_ids.unsqueeze(0)
 
         # It may already have been prepared by e.g. `generate`
         if not isinstance(causal_mask_mapping := attention_mask, dict):
             # Prepare mask arguments
             mask_kwargs = {
                 "config": self.config,
-                "input_embeds": inputs_embeds,
+                "inputs_embeds": inputs_embeds,
                 "attention_mask": attention_mask,
-                "cache_position": cache_position,
                 "past_key_values": past_key_values,
                 "position_ids": position_ids,
             }
@@ -838,7 +825,6 @@ def forward(
                 position_ids=position_ids,
                 past_key_values=past_key_values,
                 use_cache=use_cache,
-                cache_position=cache_position,
                 **kwargs,
             )
 
@@ -868,7 +854,7 @@ class PPChart2TableModelOutputWithPast(BaseModelOutputWithPast):
             Tuple of attention weights from each layer of the text decoder (for debugging/analysis).
     """
 
-    image_hidden_states: Optional[torch.FloatTensor] = None
+    image_hidden_states: torch.FloatTensor | None = None
 
 
 @dataclass
@@ -884,9 +870,9 @@ class PPChart2TableCausalLMOutputWithPast(BaseModelOutputWithPast):
             Language modeling logits (shape: `[B, seq_len, vocab_size]`), output from the LM head.
     """
 
-    logits: Optional[torch.FloatTensor] = None
-    loss: Optional[torch.FloatTensor] = None
-    image_hidden_states: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor | None = None
+    loss: torch.FloatTensor | None = None
+    image_hidden_states: torch.FloatTensor | None = None
 
 
 @auto_docstring(
@@ -1060,13 +1046,13 @@ def get_placeholder_mask(
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.Tensor] = None,
-        past_key_values: Optional[list[torch.Tensor]] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        use_cache: Optional[bool] = None,
-        pixel_values: Optional[torch.Tensor] = None,
-        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.Tensor | None = None,
+        past_key_values: list[torch.Tensor] | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        use_cache: bool | None = None,
+        pixel_values: torch.Tensor | None = None,
+        cache_position: torch.LongTensor | None = None,
         **kwargs,
     ):
         if (input_ids is None) ^ (inputs_embeds is not None):
@@ -1158,8 +1144,8 @@ def get_output_embeddings(self) -> nn.Module:
     def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
-        vision_feature_layer: Optional[Union[int, list[int]]] = None,
-        vision_feature_select_strategy: Optional[str] = None,
+        vision_feature_layer: int | list[int] | None = None,
+        vision_feature_select_strategy: str | None = None,
         **kwargs,
     ):
         return self.model.get_image_features(
@@ -1172,18 +1158,18 @@ def get_image_features(
     @can_return_tuple
     def forward(
         self,
-        input_ids: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        pixel_values: Optional[torch.Tensor] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        labels: Optional[list[dict]] = None,
-        logits_to_keep: Union[int, torch.Tensor] = 0,
-        cache_position: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Cache] = None,
-        use_cache: Optional[bool] = None,
+        input_ids: torch.LongTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        pixel_values: torch.Tensor | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        labels: list[dict] | None = None,
+        logits_to_keep: int | torch.Tensor = 0,
+        cache_position: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        use_cache: bool | None = None,
         **kwargs,
-    ) -> Union[tuple[torch.FloatTensor], PPChart2TableCausalLMOutputWithPast]:
+    ) -> tuple[torch.FloatTensor] | PPChart2TableCausalLMOutputWithPast:
         outputs = self.model(
             input_ids=input_ids,
             attention_mask=attention_mask,
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index f6850641fc14..62544c5545e4 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -4,13 +4,13 @@
 
 import torch
 import torch.nn as nn
-from torchvision.transforms.v2.functional import InterpolationMode
+import torchvision.transforms.v2.functional as tvF
 
 from transformers.cache_utils import Cache
 from transformers.configuration_utils import PreTrainedConfig, layer_type_validation
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.generation import GenerationMixin
-from transformers.image_processing_utils_fast import BaseImageProcessorFast
+from transformers.image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
 from transformers.modeling_outputs import BaseModelOutputWithPast
 from transformers.modeling_rope_utils import RopeParameters
 from transformers.modeling_utils import PreTrainedModel
@@ -36,6 +36,8 @@
 from transformers.processing_utils import ProcessorMixin, TensorType
 from transformers.utils import can_return_tuple
 
+from transformers.image_utils import SizeDict
+
 logger = logging.get_logger(__name__)
 
 
@@ -157,6 +159,8 @@ class PPChart2TableTextConfig(PreTrainedConfig):
             The token ID representing the beginning of a sequence (BOS) for text generation.
         eos_token_id (`int`, *optional*, defaults to 151643):
             The token ID representing the end of a sequence (EOS) for text generation.
+        pad_token_id (Optional[int], optional, *optional*, defaults to -1):
+            The index of the padding token. Defaults to -1.
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the feed-forward and attention layers of the decoder.
         hidden_size (`int`, *optional*, defaults to 1024):
@@ -236,6 +240,7 @@ def __init__(
         attention_dropout: float = 0.0,
         bos_token_id: int = 151643,
         eos_token_id: int = 151643,
+        pad_token_id: int = -1,
         hidden_act: str = "silu",
         hidden_size: int = 1024,
         initializer_range: float = 0.02,
@@ -286,6 +291,7 @@ def __init__(
         self.rope_theta = rope_theta
         self.tie_word_embeddings = tie_word_embeddings
         super().__init__(
+            pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
             eos_token_id=eos_token_id,
             tie_word_embeddings=tie_word_embeddings,
@@ -452,37 +458,44 @@ def __init__(self, **kwargs) -> None:
 
     def _preprocess(
         self,
-        images: list[torch.Tensor],
-        size: Optional[list[dict[str, int]]],
+        images: list["torch.Tensor"],
         do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["tvF.InterpolationMode"],
         do_rescale: bool,
         rescale_factor: float,
         do_normalize: bool,
-        image_mean: Optional[Union[float, list[float]]],
-        image_std: Optional[Union[float, list[float]]],
-        return_tensors: Optional[Union[str, TensorType]],
-        interpolation: Optional[InterpolationMode] = None,
+        image_mean: float | list[float] | None,
+        image_std: float | list[float] | None,
+        disable_grouping: bool | None,
+        return_tensors: str | TensorType | None,
         **kwargs,
     ) -> BatchFeature:
-        data = {}
-        resize_images = []
-        if do_resize:
-            for image in images:
-                image = self.resize(image, size=size, interpolation=interpolation)
-                resize_images.append(image)
-            images = resize_images
-
-        processed_images = []
-        for image in images:
-            image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std)
-            processed_images.append(image)
-        images = processed_images
-
-        images = [image[[2, 1, 0], :, :] for image in images]
-        data.update({"pixel_values": torch.stack(images, dim=0)})
-        encoded_inputs = BatchFeature(data, tensor_type=return_tensors)
-
-        return encoded_inputs
+
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation)
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            # BGR to RGB conversion
+            stacked_images = stacked_images[:, [2, 1, 0], :, :]
+            processed_images_grouped[shape] = stacked_images
+
+        pixel_values = reorder_images(processed_images_grouped, grouped_images_index)
+
+        return BatchFeature(
+            data={"pixel_values": pixel_values},
+            tensor_type=return_tensors,
+        )
 
 
 @auto_docstring(

From d8763e5f3c30270dedb5f99a71b6c0fabd2c0933 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Fri, 13 Mar 2026 11:28:40 +0800
Subject: [PATCH 12/60] update

---
 .../configuration_pp_chart2table.py           | 297 ++----
 .../image_processing_pp_chart2table_fast.py   |  48 +-
 .../pp_chart2table/modeling_pp_chart2table.py | 787 +++++++-------
 .../pp_chart2table/modular_pp_chart2table.py  | 957 +++---------------
 .../processing_pp_chart2table.py              |  30 +-
 .../test_modeling_pp_chart2table.py           |  13 +-
 6 files changed, 649 insertions(+), 1483 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
index a1ce908d6361..7ecb1a2ddd31 100644
--- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
@@ -5,63 +5,41 @@
 #                          modular_pp_chart2table.py file directly. One of our CI enforces this.
 #                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 
-from transformers.configuration_utils import PreTrainedConfig, layer_type_validation
-from transformers.modeling_rope_utils import RopeParameters
-from transformers.utils import auto_docstring
+from ...configuration_utils import PreTrainedConfig, layer_type_validation
+from ...modeling_rope_utils import RopeParameters
+from ...utils import auto_docstring
 
 
 @auto_docstring(
-    custom_intro="""
-    This configuration class defines all the hyperparameters for the vision component
-    of the PP-Chart2Table model, which is responsible for processing chart images
-    and extracting visual features for table structure recognition and content extraction.
-    PaddlePaddle/PP-Chart2Table_safetensors [PaddlePaddle/PP-Chart2Table_safetensors]
-    (https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors)
-    """,
     checkpoint="PaddlePaddle/PP-Chart2Table_safetensors",
 )
 class PPChart2TableVisionConfig(PreTrainedConfig):
     """
-    Configuration class for the vision backbone of PP-Chart2Table model.
-
-    This configuration class defines all the hyperparameters for the vision component
-    of the PP-Chart2Table model, which is responsible for processing chart images
-    and extracting visual features for table structure recognition and content extraction.
-    PaddlePaddle/PP-Chart2Table_safetensors [PaddlePaddle/PP-Chart2Table_safetensors]
-    (https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors)
-
-    Args:
-        depth (`int`, *optional*, defaults to 12):
-            Number of transformer encoder layers in the vision backbone.
-        embed_dim (`int`, *optional*, defaults to 768):
-            Dimensionality of the patch embedding vectors.
-        hidden_size (`int`, *optional*, defaults to 1024):
-            Dimensionality of the hidden layer in the feed-forward network (MLP).
-        num_channels (`int`, *optional*, defaults to 3):
-            Number of input channels (3 for RGB images, 1 for grayscale).
-        image_size (`int`, *optional*, defaults to 1024):
-            Size (height/width) of the input images (assumed to be square).
-        mlp_ratio (`float`, *optional*, defaults to 4.0):
-            Ratio of the hidden layer size to the embedding dimension in the MLP (hidden_size = embed_dim * mlp_ratio).
-        num_attention_heads (`int`, *optional*, defaults to 12):
-            Number of attention heads for each transformer encoder layer.
-        patch_size (`int`, *optional*, defaults to 16):
-            Size (height/width) of the image patches extracted from the input image.
-        qkv_bias (`bool`, *optional*, defaults to `True`):
-            Whether to include bias terms in the query, key, value projection layers of self-attention.
-        use_rel_pos (`bool`, *optional*, defaults to `True`):
-            Whether to use relative positional embeddings in the self-attention mechanism.
-        global_attn_indexes (`Optional[list[int]]`, *optional*, defaults to [2, 5, 8, 11]):
-            List of layer indexes where global attention (instead of window attention) is applied.
-            If `None`, defaults to [2, 5, 8, 11].
-        window_size (`int`, *optional*, defaults to 14):
-            Size of the attention window for window-based self-attention (only effective when use_rel_pos=True).
-        output_channels (`int`, *optional*, defaults to 256):
-            Dimensionality of the final visual feature output channels.
-        net_channels (`int`, *optional*, defaults to 512):
-            Dimensionality of intermediate network channels in the vision backbone.
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            Dropout probability applied to the attention weights.
+    num_hidden_layers (`int`, *optional*, defaults to 12):
+        Number of transformer encoder layers in the vision backbone.
+    hidden_size (`int`, *optional*, defaults to 768):
+        Dimensionality of the patch embedding vectors.
+    num_channels (`int`, *optional*, defaults to 3):
+        Number of input channels (3 for RGB images, 1 for grayscale).
+    image_size (`int`, *optional*, defaults to 1024):
+        Size (height/width) of the input images (assumed to be square).
+    num_attention_heads (`int`, *optional*, defaults to 12):
+        Number of attention heads for each transformer encoder layer.
+    patch_size (`int`, *optional*, defaults to 16):
+        Size (height/width) of the image patches extracted from the input image.
+    qkv_bias (`bool`, *optional*, defaults to `True`):
+        Whether to include bias terms in the query, key, value projection layers of self-attention.
+    use_rel_pos (`bool`, *optional*, defaults to `True`):
+        Whether to use relative positional embeddings in the self-attention mechanism.
+    global_attn_indexes (`Optional[list[int]]`, *optional*, defaults to [2, 5, 8, 11]):
+        List of layer indexes where global attention (instead of window attention) is applied.
+        If `None`, defaults to [2, 5, 8, 11].
+    window_size (`int`, *optional*, defaults to 14):
+        Size of the attention window for window-based self-attention (only effective when use_rel_pos=True).
+    output_channels (`int`, *optional*, defaults to 256):
+        Dimensionality of the final visual feature output channels.
+    attention_dropout (`float`, *optional*, defaults to 0.0):
+        Dropout probability applied to the attention weights.
     """
 
     model_type = "pp_chart2table_vision"
@@ -69,37 +47,35 @@ class PPChart2TableVisionConfig(PreTrainedConfig):
 
     def __init__(
         self,
-        depth: int = 12,
-        embed_dim: int = 768,
-        hidden_size: int = 1024,
-        num_channels: int = 3,
-        image_size: int = 1024,
-        mlp_ratio: float = 4.0,
-        num_attention_heads: int = 12,
-        patch_size: int = 16,
-        qkv_bias: bool = True,
-        use_rel_pos: bool = True,
-        global_attn_indexes: list[int] | None = None,
-        window_size: int = 14,
-        output_channels: int = 256,
-        net_channels: int = 512,
-        attention_dropout: float = 0.0,
+        num_hidden_layers=12,
+        hidden_size=768,
+        output_channels=256,
+        mlp_dim=3072,
+        num_channels=3,
+        image_size=1024,
+        num_attention_heads=12,
+        patch_size=16,
+        qkv_bias=True,
+        use_rel_pos=True,
+        use_abs_pos=False,
+        global_attn_indexes=[2, 5, 8, 11],
+        window_size=14,
+        attention_dropout=0.0,
         **kwargs,
     ):
-        self.depth = depth
-        self.embed_dim = embed_dim
+        self.num_hidden_layers = num_hidden_layers
         self.hidden_size = hidden_size
+        self.mlp_dim = mlp_dim
         self.image_size = image_size
         self.num_channels = num_channels
-        self.mlp_ratio = mlp_ratio
         self.num_attention_heads = num_attention_heads
         self.patch_size = patch_size
         self.qkv_bias = qkv_bias
         self.use_rel_pos = use_rel_pos
-        self.global_attn_indexes = global_attn_indexes if global_attn_indexes is not None else [2, 5, 8, 11]
+        self.use_abs_pos = use_abs_pos
+        self.global_attn_indexes = global_attn_indexes
         self.window_size = window_size
         self.output_channels = output_channels
-        self.net_channels = net_channels
         self.attention_dropout = attention_dropout
         super().__init__(**kwargs)
 
@@ -111,76 +87,52 @@ def __init__(
 )
 class PPChart2TableTextConfig(PreTrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`PPChart2TableTextModel`]. It is used to instantiate a
-    PP-Chart2Table text decoder according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the text encoder/decoder of the
-    PPChart2TableText-7B-beta [Qwen/PPChart2TableText-7B-beta](https://huggingface.co/Qwen/PPChart2TableText-7B-beta)
-    architecture, optimized for chart-to-table text generation tasks.
-
-    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PreTrainedConfig`] for more information.
-
-    Args:
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for the attention probabilities in self-attention layers.
-        bos_token_id (`int`, *optional*, defaults to 151643):
-            The token ID representing the beginning of a sequence (BOS) for text generation.
-        eos_token_id (`int`, *optional*, defaults to 151643):
-            The token ID representing the end of a sequence (EOS) for text generation.
-        pad_token_id (Optional[int], optional, *optional*, defaults to -1):
-            The index of the padding token. Defaults to -1.
-        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-            The non-linear activation function (function or string) in the feed-forward and attention layers of the decoder.
-        hidden_size (`int`, *optional*, defaults to 1024):
-            Dimensionality of the hidden representations in the Transformer decoder layers.
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        intermediate_size (`int`, *optional*, defaults to 2816):
-            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer decoder blocks.
-        max_position_embeddings (`int`, *optional*, defaults to 32768):
-            The maximum sequence length that this model might ever be used with for text input/output.
-        num_attention_heads (`int`, *optional*, defaults to 16):
-            Number of attention heads for each self-attention layer in the Transformer decoder.
-        num_hidden_layers (`int`, *optional*, defaults to 24):
-            Number of hidden layers in the Transformer decoder.
-        num_key_value_heads (`int`, *optional*, defaults to 16):
-            Number of key/value heads for implementing Grouped Query Attention (GQA). If equal to `num_attention_heads`,
-            Multi Head Attention (MHA) is used; if 1, Multi Query Attention (MQA) is used. For more details, see
-            [this paper](https://huggingface.co/papers/2305.13245).
-        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
-            The epsilon value used by the RMS normalization layers to avoid division by zero.
-        rope_theta (`float`, *optional*, defaults to 1000000.0):
-            The base period of the RoPE (Rotary Position Embedding) embeddings, controlling the frequency of positional encoding.
-        rope_parameters (`RopeParameters` or `dict`, *optional*):
-            Configuration parameters for RoPE embeddings, including scaling parameters for longer sequence lengths beyond
-            `max_position_embeddings`.
-        sliding_window (`int`, *optional*, defaults to 32768):
-            Window size for Sliding Window Attention (SWA) in the decoder layers (only active if `use_sliding_window=True`).
-        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
-            Whether the model's input and output word embeddings should be tied (shared weights).
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether to return the last key/value attention states to speed up sequential decoding (only relevant for autoregressive
-            generation).
-        vocab_size (`int`, *optional*, defaults to 151860):
-            Vocabulary size of the PPChart2TableText model. Defines the number of distinct tokens that can be represented
-            by `input_ids`.
-        layer_types (`list[str]`, *optional*):
-            Attention pattern for each decoder layer (e.g., `"full_attention"` or `"sliding_attention"`). If not specified,
-            automatically determined by `sliding_window`.
-
-    Example:
-
-    ```python
-    >>> from transformers import PPChart2TableTextConfig, PPChart2TableTextModel
-
-    >>> # Initializing a PPChart2TableText style configuration
-    >>> configuration = PPChart2TableTextConfig()
-
-    >>> # Initializing a model from the PPChart2TableText-7B style configuration
-    >>> model = PPChart2TableTextModel(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
+    attention_dropout (`float`, *optional*, defaults to 0.0):
+        The dropout ratio for the attention probabilities in self-attention layers.
+    bos_token_id (`int`, *optional*, defaults to 151643):
+        The token ID representing the beginning of a sequence (BOS) for text generation.
+    eos_token_id (`int`, *optional*, defaults to 151643):
+        The token ID representing the end of a sequence (EOS) for text generation.
+    pad_token_id (Optional[int], optional, *optional*, defaults to -1):
+        The index of the padding token. Defaults to -1.
+    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+        The non-linear activation function (function or string) in the feed-forward and attention layers of the decoder.
+    hidden_size (`int`, *optional*, defaults to 1024):
+        Dimensionality of the hidden representations in the Transformer decoder layers.
+    initializer_range (`float`, *optional*, defaults to 0.02):
+        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+    intermediate_size (`int`, *optional*, defaults to 2816):
+        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer decoder blocks.
+    max_position_embeddings (`int`, *optional*, defaults to 32768):
+        The maximum sequence length that this model might ever be used with for text input/output.
+    num_attention_heads (`int`, *optional*, defaults to 16):
+        Number of attention heads for each self-attention layer in the Transformer decoder.
+    num_hidden_layers (`int`, *optional*, defaults to 24):
+        Number of hidden layers in the Transformer decoder.
+    num_key_value_heads (`int`, *optional*, defaults to 16):
+        Number of key/value heads for implementing Grouped Query Attention (GQA). If equal to `num_attention_heads`,
+        Multi Head Attention (MHA) is used; if 1, Multi Query Attention (MQA) is used. For more details, see
+        [this paper](https://huggingface.co/papers/2305.13245).
+    rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+        The epsilon value used by the RMS normalization layers to avoid division by zero.
+    rope_theta (`float`, *optional*, defaults to 1000000.0):
+        The base period of the RoPE (Rotary Position Embedding) embeddings, controlling the frequency of positional encoding.
+    rope_parameters (`RopeParameters` or `dict`, *optional*):
+        Configuration parameters for RoPE embeddings, including scaling parameters for longer sequence lengths beyond
+        `max_position_embeddings`.
+    sliding_window (`int`, *optional*, defaults to 32768):
+        Window size for Sliding Window Attention (SWA) in the decoder layers (only active if `use_sliding_window=True`).
+    tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+        Whether the model's input and output word embeddings should be tied (shared weights).
+    use_cache (`bool`, *optional*, defaults to `True`):
+        Whether to return the last key/value attention states to speed up sequential decoding (only relevant for autoregressive
+        generation).
+    vocab_size (`int`, *optional*, defaults to 151860):
+        Vocabulary size of the PPChart2TableText model. Defines the number of distinct tokens that can be represented
+        by `input_ids`.
+    layer_types (`list[str]`, *optional*):
+        Attention pattern for each decoder layer (e.g., `"full_attention"` or `"sliding_attention"`). If not specified,
+        automatically determined by `sliding_window`.
     """
 
     model_type = "pp_chart2table_text"
@@ -274,53 +226,20 @@ def __init__(
 )
 class PPChart2TableConfig(PreTrainedConfig):
     r"""
-    This is the main configuration class to store the configuration of a [PPChart2TableModel] or [PPChart2TableForConditionalGeneration].
-    It is used to instantiate a PP-Chart2Table multimodal model according to the specified arguments, defining the vision and text
-    sub-model architectures. This configuration class inherits from [PreTrainedConfig] and combines the configurations of:
-    [PPChart2TableVisionConfig] (for the chart vision encoder)
-    [PPChart2TableTextConfig] (for the table text decoder)
-    PP-Chart2Table PaddlePaddle/PP-Chart2Table_safetensors.
-
-    Instantiating a PPChart2TableConfig with the defaults will yield a similar configuration to the base PP-Chart2Table model
-    developed by the PaddlePaddle team for chart-to-table parsing tasks.
-
-    Configuration objects inherit from [PreTrainedConfig] and can be used to control the model outputs. Read the
-    documentation from [PreTrainedConfig] for more information.
-
-    Args:
-        vision_config (Optional[Dict], optional, *optional*)::
-            The [PPChart2TableVisionConfig] for the vision sub-model. Defaults to None.
-        text_config (Optional[Dict], optional, *optional*)::
-            The [PPChart2TableTextConfig] for the text sub-model. Defaults to None.
-        image_token_index (Optional[int], optional, *optional*, defaults to 151859)::
-            The index of the image token. Defaults to 151859.
-        image_seq_length (Optional[int], optional, *optional*, defaults to 576)::
-            The sequence length for the image. Defaults to 576.
-        pad_token_id (Optional[int], optional, *optional*, defaults to -1):
-            The index of the padding token. Defaults to -1.
-
-    Example:
-
-    ```python
-    >>> from transformers import PPChart2TableConfig, PPChart2TableModel
-
-    >>> # Initializing a PPChart2Table configuration with default vision and text sub-configs
-    >>> configuration = PPChart2TableConfig()
-
-    >>> # Initializing a PPChart2Table configuration with custom vision and text sub-configs
-    >>> vision_config = {"image_size": 512, "patch_size": 8}
-    >>> text_config = {"hidden_size": 2048, "num_hidden_layers": 16}
-    >>> configuration = PPChart2TableConfig(vision_config=vision_config, text_config=text_config)
-
-    >>> # Initializing a model from the PPChart2Table configuration
-    >>> model = PPChart2TableModel(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    >>> # Accessing the vision sub-config
-    >>> vision_config = configuration.vision_config
-    >>> # Accessing the text sub-config
-    >>> text_config = configuration.text_config
+    vision_config (Optional[Dict], optional, *optional*)::
+        The [PPChart2TableVisionConfig] for the vision sub-model. Defaults to None.
+    text_config (Optional[Dict], optional, *optional*)::
+        The [PPChart2TableTextConfig] for the text sub-model. Defaults to None.
+    image_token_index (Optional[int], optional, *optional*, defaults to 151859)::
+        The index of the image token. Defaults to 151859.
+    image_seq_length (Optional[int], optional, *optional*, defaults to 576)::
+        The sequence length for the image. Defaults to 576.
+    pad_token_id (Optional[int], optional, *optional*, defaults to -1):
+        The index of the padding token. Defaults to -1.
+    net_channels (`int`, *optional*, defaults to 512):
+        Dimensionality of intermediate network channels in the vision backbone.
+    output_channels (`int`, *optional*, defaults to 1024):
+        Dimensionality of intermediate network channels in the vision backbone.
     """
 
     model_type = "pp_chart2table"
@@ -336,11 +255,15 @@ def __init__(
         image_token_index: int | None = 151859,
         image_seq_length: int | None = 576,
         pad_token_id: int | None = -1,
+        net_channels: int | None = 512,
+        output_channels: int | None = 1024,
         **kwargs,
     ):
         self.image_token_index = image_token_index
         self.image_seq_length = image_seq_length
         self.pad_token_id = pad_token_id
+        self.net_channels = net_channels
+        self.output_channels = output_channels
 
         if vision_config is None:
             vision_config = {}
diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
index d70b530acf72..5021229fe168 100644
--- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
+++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
@@ -9,48 +9,15 @@
 import torch
 import torchvision.transforms.v2.functional as tvF
 
-from transformers.feature_extraction_utils import BatchFeature
-from transformers.image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
-from transformers.image_utils import SizeDict
-from transformers.processing_utils import TensorType
-from transformers.utils import auto_docstring
+from ...feature_extraction_utils import BatchFeature
+from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
+from ...image_utils import SizeDict
+from ...processing_utils import TensorType
+from ...utils import auto_docstring
 
 
-@auto_docstring(
-    custom_intro="""
-
-    """
-)
+@auto_docstring
 class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
-    r"""
-    Fast image processor for the PP-Chart2Table multimodal model, optimized for GPU-accelerated chart image preprocessing.
-
-    This high-performance processor implements a streamlined preprocessing pipeline for chart images (resizing, rescaling,
-    normalization, channel reordering) using PyTorch tensor operations, designed for efficient batch processing on GPUs.
-    It inherits from [`BaseImageProcessorFast`] and is optimized for inference/training pipelines requiring low-latency
-    image preprocessing.
-
-    Class Attributes (Default Configuration):
-        resample (`int`, defaults to 3):
-            Integer identifier for the resampling filter (3 = BICUBIC, compatible with `InterpolationMode.BICUBIC`).
-        image_mean (`list[float]`, defaults to `[0.40821073, 0.4578275, 0.48145466]`):
-            Per-channel mean values for image normalization (RGB order).
-        image_std (`list[float]`, defaults to `[0.27577711, 0.26130258, 0.26862954]`):
-            Per-channel standard deviation values for image normalization (RGB order).
-        size (`dict[str, int]`, defaults to `{"height": 1024, "width": 1024}`):
-            Default target size for image resizing (1024x1024, optimized for PP-Chart2Table vision encoder).
-        patch_size (`int`, defaults to 16):
-            Size of image patches used by the PP-Chart2Table vision encoder (for alignment with model input).
-        merge_size (`int`, defaults to 4):
-            Size factor for merging image patches (specific to PP-Chart2Table's vision processing pipeline).
-        do_resize (`bool`, defaults to `True`):
-            Default flag to enable image resizing.
-        do_rescale (`bool`, defaults to `True`):
-            Default flag to enable pixel value rescaling (from [0,255] to [0,1]).
-        do_normalize (`bool`, defaults to `True`):
-            Default flag to enable image normalization.
-    """
-
     resample = 3
     image_mean = [0.40821073, 0.4578275, 0.48145466]
     image_std = [0.27577711, 0.26130258, 0.26862954]
@@ -61,9 +28,6 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
     do_rescale = True
     do_normalize = True
 
-    def __init__(self, **kwargs) -> None:
-        super().__init__(**kwargs)
-
     def _preprocess(
         self,
         images: list["torch.Tensor"],
diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
index 41f03d0d2c57..033daab0bf44 100644
--- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
@@ -13,101 +13,24 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
-from transformers.cache_utils import Cache
-from transformers.generation import GenerationMixin
-from transformers.modeling_outputs import BaseModelOutputWithPast
-from transformers.modeling_utils import PreTrainedModel
-from transformers.utils import auto_docstring, can_return_tuple
-
+from ... import initialization as init
 from ...activations import ACT2FN
-from ...cache_utils import DynamicCache
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
 from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling, ModelOutput
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
-from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, torch_compilable_check
 from ...utils.generic import maybe_autocast, merge_with_config_defaults
 from ...utils.output_capturing import capture_outputs
 from .configuration_pp_chart2table import PPChart2TableConfig, PPChart2TableTextConfig, PPChart2TableVisionConfig
 
 
-class PPChart2TableVisionPatchEmbed(nn.Module):
-    """
-    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
-    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
-    Transformer.
-    """
-
-    def __init__(self, config):
-        super().__init__()
-        image_size, patch_size = config.image_size, config.patch_size
-        num_channels, hidden_size = config.num_channels, config.embed_dim
-        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
-        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
-        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.num_patches = num_patches
-        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
-
-    def forward(self, pixel_values):
-        batch_size, num_channels, height, width = pixel_values.shape
-        if num_channels != self.num_channels:
-            raise ValueError(
-                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
-            )
-        if height != self.image_size[0] or width != self.image_size[1]:
-            raise ValueError(
-                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
-            )
-        embeddings = self.projection(pixel_values).permute(0, 2, 3, 1)
-        return embeddings
-
-
-class PPChart2TableVisionMLPBlock(nn.Module):
-    def __init__(self, config) -> None:
-        super().__init__()
-        self.lin1 = nn.Linear(config.embed_dim, int(config.embed_dim * config.mlp_ratio))
-        self.lin2 = nn.Linear(int(config.embed_dim * config.mlp_ratio), config.embed_dim)
-        self.act = ACT2FN[config.hidden_act]
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        hidden_states = self.lin1(hidden_states)
-        hidden_states = self.act(hidden_states)
-        hidden_states = self.lin2(hidden_states)
-        return hidden_states
-
-
-class PPChart2TableVisionLayerNorm(nn.LayerNorm):
-    r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
-    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
-    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
-    """
-
-    def __init__(self, normalized_shape, *, eps=1e-6, data_format="channels_last", **kwargs):
-        super().__init__(normalized_shape, eps=eps, **kwargs)
-        if data_format not in ["channels_last", "channels_first"]:
-            raise NotImplementedError(f"Unsupported data format: {data_format}")
-        self.data_format = data_format
-
-    def forward(self, features: torch.Tensor) -> torch.Tensor:
-        """
-        Args:
-            features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels)
-        """
-        if self.data_format == "channels_first":
-            features = features.permute(0, 2, 3, 1)
-            features = super().forward(features)
-            features = features.permute(0, 3, 1, 2)
-        else:
-            features = super().forward(features)
-        return features
-
-
 class PPChart2TableVisionAttention(nn.Module):
     """Multi-head Attention block with relative position embeddings."""
 
@@ -120,11 +43,12 @@ def __init__(self, config, window_size):
         )
 
         self.num_attention_heads = config.num_attention_heads
-        head_dim = config.embed_dim // config.num_attention_heads
+        head_dim = config.hidden_size // config.num_attention_heads
         self.scale = head_dim**-0.5
         self.dropout = config.attention_dropout
-        self.qkv = nn.Linear(config.embed_dim, config.embed_dim * 3, bias=config.qkv_bias)
-        self.proj = nn.Linear(config.embed_dim, config.embed_dim)
+
+        self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.qkv_bias)
+        self.proj = nn.Linear(config.hidden_size, config.hidden_size)
 
         self.use_rel_pos = config.use_rel_pos
         if self.use_rel_pos:
@@ -240,14 +164,53 @@ def forward(self, hidden_states: torch.Tensor, output_attentions=None) -> tuple[
         return attn_output, attn_weights
 
 
-class PPChart2TableVisionDecoderLayer(GradientCheckpointingLayer):
-    def __init__(self, config, window_size) -> None:
+@auto_docstring
+class PPChart2TableVisionPreTrainedModel(PreTrainedModel):
+    config: PPChart2TableConfig
+    base_model_prefix = "model"
+    input_modalities = ("image", "text")
+    supports_gradient_checkpointing = True
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = False
+    _supports_sdpa = False
+
+    _can_compile_fullgraph = True
+    _supports_flex_attn = False
+    _supports_attention_backend = True
+
+    @torch.no_grad()
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, PPChart2TableVisionAttention):
+            if module.use_rel_pos:
+                init.zeros_(module.rel_pos_h)
+                init.zeros_(module.rel_pos_w)
+        elif isinstance(module, PPChart2TableVisionEncoder):
+            if module.pos_embed is not None:
+                init.zeros_(module.pos_embed)
+
+
+class PPChart2TableMLPBlock(nn.Module):
+    def __init__(self, config):
         super().__init__()
-        self.layer_norm1 = nn.LayerNorm(config.embed_dim)
-        self.attn = PPChart2TableVisionAttention(config, window_size=window_size)
+        self.lin1 = nn.Linear(config.hidden_size, config.mlp_dim)
+        self.lin2 = nn.Linear(config.mlp_dim, config.hidden_size)
+        self.act = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.lin1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.lin2(hidden_states)
+        return hidden_states
 
-        self.layer_norm2 = nn.LayerNorm(config.embed_dim)
-        self.mlp = PPChart2TableVisionMLPBlock(config)
+
+class PPChart2TableVisionLayer(GradientCheckpointingLayer):
+    def __init__(self, config, window_size):
+        super().__init__()
+        self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.attn = PPChart2TableVisionAttention(config, window_size)
+        self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.mlp = PPChart2TableMLPBlock(config)
         self.window_size = window_size
 
     def window_partition(self, hidden_states: torch.Tensor, window_size: int) -> tuple[torch.Tensor, tuple[int, int]]:
@@ -325,15 +288,95 @@ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.FloatTensor]:
         return hidden_states
 
 
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for pp_chart2table vision model's outputs that also contains image embeddings obtained by applying the projection
+    layer to the pooler_output.
+    """
+)
+class PPChart2TableVisionEncoderOutput(ModelOutput):
+    r"""
+    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
+        The image embeddings obtained by applying the projection layer to the pooler_output.
+    """
+
+    image_embeds: torch.FloatTensor | None = None
+    last_hidden_state: torch.FloatTensor | None = None
+    hidden_states: tuple[torch.FloatTensor, ...] | None = None
+    attentions: tuple[torch.FloatTensor, ...] | None = None
+
+
+class PPChart2TablePatchEmbeddings(nn.Module):
+    """
+    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+    Transformer.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        image_size, patch_size = config.image_size, config.patch_size
+        num_channels, hidden_size = config.num_channels, config.hidden_size
+        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.num_patches = num_patches
+
+        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, pixel_values):
+        batch_size, num_channels, height, width = pixel_values.shape
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+        if height != self.image_size[0] or width != self.image_size[1]:
+            raise ValueError(
+                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
+            )
+        embeddings = self.projection(pixel_values).permute(0, 2, 3, 1)
+        return embeddings
+
+
+class PPChart2TableLayerNorm(nn.LayerNorm):
+    r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
+    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
+    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
+    """
+
+    def __init__(self, normalized_shape, *, eps=1e-6, data_format="channels_last", **kwargs):
+        super().__init__(normalized_shape, eps=eps, **kwargs)
+        if data_format not in ["channels_last", "channels_first"]:
+            raise NotImplementedError(f"Unsupported data format: {data_format}")
+        self.data_format = data_format
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels)
+        """
+        if self.data_format == "channels_first":
+            features = features.permute(0, 2, 3, 1)
+            features = super().forward(features)
+            features = features.permute(0, 3, 1, 2)
+        else:
+            features = super().forward(features)
+        return features
+
+
 class PPChart2TableVisionNeck(nn.Module):
     def __init__(self, config: PPChart2TableVisionConfig):
         super().__init__()
         self.config = config
 
-        self.conv1 = nn.Conv2d(config.embed_dim, config.output_channels, kernel_size=1, bias=False)
-        self.layer_norm1 = PPChart2TableVisionLayerNorm(config.output_channels, data_format="channels_first")
+        self.conv1 = nn.Conv2d(config.hidden_size, config.output_channels, kernel_size=1, bias=False)
+        self.layer_norm1 = PPChart2TableLayerNorm(config.output_channels, data_format="channels_first")
         self.conv2 = nn.Conv2d(config.output_channels, config.output_channels, kernel_size=3, padding=1, bias=False)
-        self.layer_norm2 = PPChart2TableVisionLayerNorm(config.output_channels, data_format="channels_first")
+        self.layer_norm2 = PPChart2TableLayerNorm(config.output_channels, data_format="channels_first")
 
     def forward(self, hidden_states):
         hidden_states = hidden_states.permute(0, 3, 1, 2)
@@ -345,108 +388,77 @@ def forward(self, hidden_states):
         return hidden_states
 
 
-@auto_docstring(
-    custom_intro="""
-    
-    """
-)
-class PPChart2TableVisionPreTrainedModel(PreTrainedModel):
-    r"""
-    Base class for all PP-Chart2Table vision models, inheriting from Hugging Face `PreTrainedModel`.
-
-    This class sets up core configurations and compatibility flags for the vision encoder, including:
-    - Support for gradient checkpointing, attention backends (FlashAttention/SDPA), and model compilation
-    - Definition of non-splittable modules (for tensor parallelism)
-    - Output recording for hidden states/attentions (for debugging/analysis)
-
-    Class Attributes:
-        config (`PPChart2TableVisionConfig`):
-            Typed config class for PP-Chart2Table vision encoder (enforces type checking).
-        base_model_prefix (`str`, defaults to `"model"`):
-            Prefix for base model parameters (used in weight loading/saving).
-        supports_gradient_checkpointing (`bool`, defaults to `True`):
-            Whether the model supports gradient checkpointing to save memory.
-        _no_split_modules (`list[str]`):
-            Modules that should not be split across devices (tensor parallelism compatibility).
-        _skip_keys_device_placement (`list[str]`):
-            Keys to skip when placing tensors on devices (e.g., past key values for generation).
-        _supports_flash_attn / _supports_sdpa / _supports_flex_attn (`bool`):
-            Compatibility with optimized attention implementations (FlashAttention, SDPA, FlexAttention).
-        _can_compile_fullgraph (`bool`, defaults to `True`):
-            Whether the model supports TorchScript/TorchCompile full graph compilation.
-        _supports_attention_backend (`bool`, defaults to `True`):
-            Whether the model supports switching attention backends (e.g., PyTorch vs FlashAttention).
-        _can_record_outputs (`dict`):
-            Mapping of output types to modules for recording intermediate outputs (hidden_states/attentions).
-    """
-
-    config: PPChart2TableVisionConfig
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["PPChart2TableVisionDecoderLayer"]
-    _skip_keys_device_placement = ["past_key_values"]
-    _supports_flash_attn = True
-    _supports_sdpa = True
-    _supports_flex_attn = True
-
-    _can_compile_fullgraph = True
-    _supports_attention_backend = True
-    _can_record_outputs = {
-        "hidden_states": PPChart2TableVisionDecoderLayer,
-        "attentions": PPChart2TableVisionAttention,
-    }
-
+class PPChart2TableVisionEncoder(PPChart2TableVisionPreTrainedModel):
+    _can_record_outputs = {"hidden_states": PPChart2TableVisionLayer, "attentions": PPChart2TableVisionAttention}
+    input_modalities = ("image",)
 
-@auto_docstring(
-    custom_intro="""
-    
-    """
-)
-class PPChart2TableVisionModel(PPChart2TableVisionPreTrainedModel):
-    main_input_name = "pixel_values"
-    input_modalities = "image"
-
-    def __init__(
-        self,
-        config: PPChart2TableVisionConfig,
-    ) -> None:
+    def __init__(self, config: PPChart2TableVisionConfig):
         super().__init__(config)
+        self.config = config
         self.image_size = config.image_size
-
-        self.patch_embed = PPChart2TableVisionPatchEmbed(config)
-
-        self.pos_embed = nn.Parameter(
-            torch.zeros(
-                1, config.image_size // config.patch_size, config.image_size // config.patch_size, config.embed_dim
+        self.patch_embed = PPChart2TablePatchEmbeddings(config)
+
+        self.pos_embed = None
+        if config.use_abs_pos:
+            # Initialize absolute positional embedding with pretrain image size.
+            self.pos_embed = nn.Parameter(
+                torch.zeros(
+                    1,
+                    config.image_size // config.patch_size,
+                    config.image_size // config.patch_size,
+                    config.hidden_size,
+                )
             )
-        )
 
-        self.blocks = nn.ModuleList()
-        for i in range(config.depth):
-            block = PPChart2TableVisionDecoderLayer(
+        self.layers = nn.ModuleList()
+        for i in range(config.num_hidden_layers):
+            layer = PPChart2TableVisionLayer(
                 config,
                 window_size=config.window_size if i not in config.global_attn_indexes else 0,
             )
-            self.blocks.append(block)
+            self.layers.append(layer)
 
         self.neck = PPChart2TableVisionNeck(config)
 
-        self.net_2 = nn.Conv2d(
-            config.output_channels, config.net_channels, kernel_size=3, stride=2, padding=1, bias=False
-        )
-        self.net_3 = nn.Conv2d(config.net_channels, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False)
-
+        self.gradient_checkpointing = False
         self.post_init()
 
-    def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
-        hidden_states = self.patch_embed(hidden_states)
-        hidden_states = hidden_states + self.pos_embed
-        for block in self.blocks:
-            hidden_states = block(hidden_states)
+    def get_input_embeddings(self):
+        return self.patch_embed
+
+    @merge_with_config_defaults
+    @capture_outputs(tie_last_hidden_states=False)
+    def forward(
+        self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs]
+    ) -> tuple | PPChart2TableVisionEncoderOutput:
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        hidden_states = self.patch_embed(pixel_values)
+        if self.pos_embed is not None:
+            hidden_states = hidden_states + self.pos_embed
+        for layer_module in self.layers:
+            hidden_states = layer_module(hidden_states)
         hidden_states = self.neck(hidden_states)
-        hidden_states = self.net_2(hidden_states)
-        hidden_states = self.net_3(hidden_states)
-        return hidden_states
+        return PPChart2TableVisionEncoderOutput(
+            last_hidden_state=hidden_states,
+        )
+
+
+class PPChart2TableTextMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
 
 
 def rotate_half(x):
@@ -581,22 +593,6 @@ def forward(
         return attn_output, attn_weights
 
 
-class PPChart2TableTextMLP(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-        self.hidden_size = config.hidden_size
-        self.intermediate_size = config.intermediate_size
-        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
-        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
-        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
-        self.act_fn = ACT2FN[config.hidden_act]
-
-    def forward(self, x):
-        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-        return down_proj
-
-
 @use_kernel_forward_from_hub("RMSNorm")
 class PPChart2TableTextRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps: float = 1e-6) -> None:
@@ -662,11 +658,7 @@ def forward(
         return hidden_states
 
 
-@auto_docstring(
-    custom_intro="""
-    
-    """
-)
+@auto_docstring
 class PPChart2TableTextPreTrainedModel(PreTrainedModel):
     config: PPChart2TableTextConfig
     base_model_prefix = "model"
@@ -838,148 +830,62 @@ def forward(
 @dataclass
 class PPChart2TableModelOutputWithPast(BaseModelOutputWithPast):
     r"""
-    Output class for PPChart2Table multimodal model's forward pass, extending Hugging Face `ModelOutput`.
-
-    This dataclass encapsulates the core outputs of the PP-Chart2Table base model, including hidden states,
-    attention weights, and cached key/value pairs for efficient generation.
-
-    Attributes:
-        past_key_values (`Optional[Cache]`, defaults to `None`):
-            Cached attention key/value pairs from the text decoder (for fast autoregressive generation).
-        last_hidden_state (`Optional[torch.FloatTensor]`, defaults to `None`):
-            Final hidden states from the text decoder (shape: `[B, seq_len, hidden_size]`), after multimodal fusion.
-        hidden_states (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`):
-            Tuple of hidden states from each layer of the text decoder (for debugging/analysis).
-        attentions (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`):
-            Tuple of attention weights from each layer of the text decoder (for debugging/analysis).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
     """
 
     image_hidden_states: torch.FloatTensor | None = None
 
 
-@dataclass
-class PPChart2TableCausalLMOutputWithPast(BaseModelOutputWithPast):
-    r"""
-    Output class for PP-Chart2Table conditional generation model's forward pass.
-
-    Extends `PPChart2TableModelOutputWithPast` with language modeling logits (for token prediction),
-    tailored for autoregressive table generation tasks.
-
-    Attributes:
-        logits (`Optional[torch.FloatTensor]`, defaults to `None`):
-            Language modeling logits (shape: `[B, seq_len, vocab_size]`), output from the LM head.
-    """
-
-    logits: torch.FloatTensor | None = None
-    loss: torch.FloatTensor | None = None
-    image_hidden_states: torch.FloatTensor | None = None
-
-
-@auto_docstring(
-    custom_intro="""
-    
-    """
-)
+@auto_docstring
 class PPChart2TablePreTrainedModel(PreTrainedModel):
-    r"""
-    Base class for all PP-Chart2Table multimodal models, inheriting from Hugging Face `PreTrainedModel`.
-
-    This class defines core configurations and compatibility flags for the multimodal model (vision + text),
-    including support for gradient checkpointing, optimized attention backends, and model compilation.
-
-    Class Attributes:
-        config (`PPChart2TableConfig`):
-            Typed config class for PP-Chart2Table (combines vision + text sub-configs).
-        base_model_prefix (`str`, defaults to `"model"`):
-            Prefix for base model parameters (used in weight loading/saving).
-        supports_gradient_checkpointing (`bool`, defaults to `True`):
-            Whether the model supports gradient checkpointing to save memory during training.
-        _no_split_modules (`list[str]`):
-            Modules that should not be split across devices (tensor parallelism compatibility).
-        _skip_keys_device_placement (`list[str]`):
-            Keys to skip when placing tensors on devices (e.g., past key values for generation).
-        _supports_flash_attn / _supports_sdpa / _supports_flex_attn (`bool`):
-            Compatibility with optimized attention implementations (FlashAttention, SDPA, FlexAttention).
-        _can_compile_fullgraph (`bool`, defaults to `True`):
-            Whether the model supports TorchScript/TorchCompile full graph compilation.
-        _supports_attention_backend (`bool`, defaults to `True`):
-            Whether the model supports switching attention backends (e.g., PyTorch vs FlashAttention).
-        _can_record_outputs (`dict`):
-            Mapping of output types to modules for recording intermediate outputs (hidden_states/attentions).
-    """
-
     config: PPChart2TableConfig
     base_model_prefix = "model"
+    input_modalities = ("image", "text")
     supports_gradient_checkpointing = True
-    _no_split_modules = ["PPChart2TableTextDecoderLayer"]
-    _skip_keys_device_placement = ["past_key_values"]
-    _supports_flash_attn = True
-    _supports_sdpa = True
-    _supports_flex_attn = True
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = False
+    _supports_sdpa = False
 
     _can_compile_fullgraph = True
+    _supports_flex_attn = False
     _supports_attention_backend = True
 
-    _can_record_outputs = {
-        "hidden_states": PPChart2TableTextDecoderLayer,
-        "attentions": PPChart2TableTextAttention,
-    }
+    @torch.no_grad()
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, PPChart2TableVisionAttention):
+            if module.use_rel_pos:
+                init.zeros_(module.rel_pos_h)
+                init.zeros_(module.rel_pos_w)
+        elif isinstance(module, PPChart2TableVisionEncoder):
+            if module.pos_embed is not None:
+                init.zeros_(module.pos_embed)
 
 
-@auto_docstring(
-    custom_intro="""
-    Core PP-Chart2Table multimodal model (vision encoder + text decoder) for chart-to-table parsing.
-    """
-)
+@auto_docstring
 class PPChart2TableModel(PPChart2TablePreTrainedModel):
-    r"""
-    Core PP-Chart2Table multimodal model (vision encoder + text decoder) for chart-to-table parsing.
-
-    This model integrates a vision encoder (for chart image feature extraction) and a text decoder (for table generation),
-    with a multimodal projection layer to align vision features with text embedding space. The core logic is:
-    1. Extract chart features via vision encoder
-    2. Project vision features to text embedding dimension
-    3. Inject vision features into text decoder inputs (replace image placeholder tokens)
-    4. Forward pass through text decoder to generate table text
-
-    Args:
-        config (`PPChart2TableConfig`):
-            Combined configuration class (includes vision_config and text_config sub-configs).
-
-    Inputs (forward method):
-        input_ids (`torch.LongTensor`, optional):
-            Tokenized input text (including image placeholder tokens) with shape `[B, seq_len]`.
-        attention_mask (`torch.Tensor`, optional):
-            Attention mask to avoid padding tokens (shape: `[B, seq_len]`).
-        position_ids (`torch.Tensor`, optional):
-            Positional indices for input tokens (shape: `[B, seq_len]`).
-        past_key_values (`list[torch.Tensor]`, optional):
-            Cached key/value pairs for fast autoregressive generation.
-        inputs_embeds (`torch.Tensor`, optional):
-            Precomputed input embeddings (shape: `[B, seq_len, hidden_size]`; overrides `input_ids`).
-        use_cache (`bool`, optional):
-            Whether to cache key/value pairs for generation.
-        pixel_values (`torch.Tensor`, optional):
-            Preprocessed chart images (shape: `[B, 3, H, W]`; required for multimodal input).
-        cache_position (`torch.LongTensor`, optional):
-            Position indices for cached key/value pairs (for generation).
-        **kwargs:
-            Additional arguments passed to the text decoder.
-
-    Outputs:
-        `PPChart2TableModelOutputWithPast`:
-            Contains the text decoder's final hidden states, cached key/values, and optional intermediate outputs.
-    """
-
-    config_class = PPChart2TableConfig
+    _checkpoint_conversion_mapping = {
+        r"^language_model.model": "language_model",
+    }
 
     def __init__(self, config: PPChart2TableConfig):
         super().__init__(config)
-        self.vision_tower_high = PPChart2TableVisionModel._from_config(config.vision_config)
+        self.vision_tower = PPChart2TableVisionEncoder(config.vision_config)
+        self.multi_modal_projector = nn.Linear(config.output_channels, config.text_config.hidden_size)
         self.language_model = PPChart2TableTextModel._from_config(config.text_config)
-        self.mm_projector_vary = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size)
-
-        # Initialize weights and apply final processing
+        self.vision_downsample1 = nn.Conv2d(
+            config.vision_config.output_channels, config.net_channels, kernel_size=3, stride=2, padding=1, bias=False
+        )
+        self.vision_downsample2 = nn.Conv2d(
+            config.net_channels, config.output_channels, kernel_size=3, stride=2, padding=1, bias=False
+        )
         self.post_init()
 
     def get_input_embeddings(self):
@@ -990,33 +896,22 @@ def set_input_embeddings(self, value):
         """Set input embeddings for the text decoder (for weight tying/loading)."""
         self.language_model.embed_tokens = value
 
+    @can_return_tuple
+    @auto_docstring(
+        custom_intro="Obtains image last hidden states from the vision tower and apply multimodal projection."
+    )
     def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
-    ) -> list[torch.Tensor]:
-        r"""
-        Extract and project chart image features to text embedding space.
-
-        Args:
-            images (`torch.Tensor`):
-                Preprocessed chart images (shape: `[B, 3, H, W]`).
-
-        Returns:
-            `list[torch.Tensor]`:
-                List of projected image features (one per image), each with shape `[1, num_patches, text_hidden_size]`.
-        """
-        image_features = []
-        for pixel_value in pixel_values:
-            pixel_value = pixel_value.unsqueeze(0)
-            with torch.no_grad():
-                cnn_feature = self.vision_tower_high(pixel_value)
-                cnn_feature = cnn_feature.flatten(2).transpose(2, 1)
-            image_feature = self.mm_projector_vary(cnn_feature)
-            image_features.append(image_feature)
-
-        image_features = torch.stack(image_features, dim=0)
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
+        image_output = self.vision_tower(pixel_values)
+        last_hidden_state = image_output.last_hidden_state
+        last_hidden_state = self.vision_downsample1(last_hidden_state)
+        last_hidden_state = self.vision_downsample2(last_hidden_state)
+        image_output.pooler_output = self.multi_modal_projector(last_hidden_state.flatten(2).transpose(2, 1))
 
-        return image_features
+        return image_output
 
     def get_placeholder_mask(
         self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
@@ -1034,35 +929,47 @@ def get_placeholder_mask(
             special_image_mask = input_ids == self.config.image_token_id
 
         n_image_tokens = special_image_mask.sum()
-        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
         n_image_features = image_features.shape[0] * image_features.shape[1]
-        if inputs_embeds[special_image_mask].numel() != image_features.numel():
-            raise ValueError(
-                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
-            )
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        torch_compilable_check(
+            inputs_embeds[special_image_mask].numel() == image_features.numel(),
+            f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}",
+        )
         return special_image_mask
 
     @can_return_tuple
+    @auto_docstring
     def forward(
         self,
-        input_ids: torch.LongTensor = None,
+        input_ids: torch.LongTensor | None = None,
+        pixel_values: torch.FloatTensor | None = None,
         attention_mask: torch.Tensor | None = None,
-        position_ids: torch.Tensor | None = None,
-        past_key_values: list[torch.Tensor] | None = None,
-        inputs_embeds: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
         use_cache: bool | None = None,
-        pixel_values: torch.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
         cache_position: torch.LongTensor | None = None,
-        **kwargs,
-    ):
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | PPChart2TableModelOutputWithPast:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
 
         if inputs_embeds is None:
-            inputs_embeds = self.language_model.embed_tokens(input_ids)
+            inputs_embeds = self.get_input_embeddings()(input_ids)
 
         if pixel_values is not None:
-            image_features = self.get_image_features(pixel_values=pixel_values.to(inputs_embeds.dtype))
+            image_features = self.get_image_features(
+                pixel_values=pixel_values.to(inputs_embeds.dtype), return_dict=True
+            ).pooler_output
             image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
             special_image_mask = self.get_placeholder_mask(
                 input_ids, inputs_embeds=inputs_embeds, image_features=image_features
@@ -1075,6 +982,9 @@ def forward(
             past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
             cache_position=cache_position,
             **kwargs,
         )
@@ -1088,42 +998,49 @@ def forward(
         )
 
 
+@dataclass
 @auto_docstring(
     custom_intro="""
-    PP-Chart2Table model for conditional generation (table text generation from chart images),
-    extending the core model with a language modeling (LM) head and generation utilities.
+    Base class for PPChart2Table causal language model (or autoregressive) outputs.
     """
 )
-class PPChart2TableForConditionalGeneration(PPChart2TablePreTrainedModel, GenerationMixin):
+class PPChart2TableCausalLMOutputWithPast(ModelOutput):
     r"""
-    PP-Chart2Table model for conditional generation (table text generation from chart images),
-    extending the core model with a language modeling (LM) head and generation utilities.
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    """
 
-    This class integrates Hugging Face `GenerationMixin` to support standard generation methods (greedy, beam search, etc.),
-    and adds an LM head to predict token probabilities for autoregressive table generation.
+    loss: torch.FloatTensor | None = None
+    logits: torch.FloatTensor | None = None
+    past_key_values: Cache | None = None
+    hidden_states: tuple[torch.FloatTensor] | None = None
+    attentions: tuple[torch.FloatTensor] | None = None
+    image_hidden_states: torch.FloatTensor | None = None
 
-    Key Features:
-    - LM head for token prediction (weight tied to input embeddings)
-    - Optimized generation input preparation (avoids reprocessing images in subsequent steps)
-    - Inference-only mode (training not supported by default)
 
-    Args:
-        config (`PPChart2TableConfig`):
-            Combined configuration class (vision + text sub-configs).
-
-    Inputs (forward method):
-        Inherits all inputs from `PPChart2TableModel`, plus:
-        labels (`list[dict]`, optional):
-            Training labels (not supported; raises ValueError if provided).
-        logits_to_keep (`Union[int, torch.Tensor]`, defaults to 0):
-            Slice index to keep only the last N logits (optimizes generation efficiency).
-
-    Outputs:
-        `PPChart2TableCausalLMOutputWithPast`:
-            Contains LM logits, decoder hidden states, and cached key/value pairs.
+@auto_docstring(
+    custom_intro="""
+    PP-Chart2Table model for conditional generation (table text generation from chart images),
+    extending the core model with a language modeling (LM) head and generation utilities.
     """
-
-    _keys_to_ignore_on_load_missing = ["num_batches_tracked"]
+)
+class PPChart2TableForConditionalGeneration(PPChart2TablePreTrainedModel, GenerationMixin):
+    _checkpoint_conversion_mapping = {
+        r"^language_model.model": "model.language_model",
+        r"^vision_tower": "model.vision_tower",
+        r"^multi_modal_projector": "model.multi_modal_projector",
+        r"^language_model.lm_head": "lm_head",
+    }
     _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"}
 
     def __init__(self, config: PPChart2TableConfig):
@@ -1141,49 +1058,91 @@ def set_input_embeddings(self, value):
     def get_output_embeddings(self) -> nn.Module:
         return self.lm_head
 
+    @auto_docstring
     def get_image_features(
-        self,
-        pixel_values: torch.FloatTensor,
-        vision_feature_layer: int | list[int] | None = None,
-        vision_feature_select_strategy: str | None = None,
-        **kwargs,
-    ):
-        return self.model.get_image_features(
-            pixel_values=pixel_values,
-            vision_feature_layer=vision_feature_layer,
-            vision_feature_select_strategy=vision_feature_select_strategy,
-            **kwargs,
-        )
+        self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
+    ) -> tuple | BaseModelOutputWithPooling:
+        return self.model.get_image_features(pixel_values=pixel_values, **kwargs)
 
     @can_return_tuple
+    @auto_docstring
     def forward(
         self,
         input_ids: torch.LongTensor | None = None,
+        pixel_values: torch.FloatTensor | None = None,
         attention_mask: torch.Tensor | None = None,
         position_ids: torch.LongTensor | None = None,
-        pixel_values: torch.Tensor | None = None,
-        inputs_embeds: torch.Tensor | None = None,
-        labels: list[dict] | None = None,
-        logits_to_keep: int | torch.Tensor = 0,
-        cache_position: torch.LongTensor | None = None,
         past_key_values: Cache | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        labels: torch.LongTensor | None = None,
         use_cache: bool | None = None,
-        **kwargs,
-    ) -> tuple[torch.FloatTensor] | PPChart2TableCausalLMOutputWithPast:
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        cache_position: torch.LongTensor | None = None,
+        logits_to_keep: int | torch.Tensor = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | PPChart2TableCausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import httpx
+        >>> from io import BytesIO
+        >>> from transformers import AutoProcessor, PPChart2TableForConditionalGeneration, TextStreamer
+
+        >>> model = PPChart2TableForConditionalGeneration.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf").to("cuda")
+        >>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
+
+        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png"
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
+
+        >>> inputs = processor(image, return_tensors="pt", color="green").to("cuda")
+
+        >>> # Generate
+        >>> streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
+        >>> generate_ids = model.generate(
+        ...     **inputs,
+        ...     do_sample=False,
+        ...     tokenizer = processor.tokenizer,
+        ...     stop_strings='<|im_end|>',
+        ...     streamer=streamer,
+        ...     max_new_tokens=4096,
+        ... )
+        "You should keep in mind what features from the module should be used, especially
+        when you're planning to sell a template."
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
         outputs = self.model(
             input_ids=input_ids,
+            pixel_values=pixel_values,
             attention_mask=attention_mask,
             position_ids=position_ids,
-            pixel_values=pixel_values,
+            past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
-            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
             **kwargs,
         )
-        hidden_states = outputs.last_hidden_state
 
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
         slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
         logits = self.lm_head(hidden_states[:, slice_indices, :])
 
@@ -1196,10 +1155,10 @@ def forward(
         return PPChart2TableCausalLMOutputWithPast(
             loss=loss,
             logits=logits,
-            last_hidden_state=outputs.last_hidden_state,
             past_key_values=outputs.past_key_values,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
         )
 
     def prepare_inputs_for_generation(
@@ -1229,7 +1188,7 @@ def prepare_inputs_for_generation(
 
         if is_first_iteration or not kwargs.get("use_cache", True):
             # Pixel values are used only in the first iteration if available
-            # In subsquent iterations, they are already merged with text and cached
+            # In subsequent iterations, they are already merged with text and cached
             # NOTE: first iteration doesn't have to be prefill, it can be the first
             # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index 62544c5545e4..bfc376d0a87a 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -1,98 +1,66 @@
-import collections
 from dataclasses import dataclass
-from typing import Optional, Union
+from typing import Optional
 
 import torch
 import torch.nn as nn
 import torchvision.transforms.v2.functional as tvF
 
-from transformers.cache_utils import Cache
-from transformers.configuration_utils import PreTrainedConfig, layer_type_validation
-from transformers.feature_extraction_utils import BatchFeature
-from transformers.generation import GenerationMixin
-from transformers.image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
-from transformers.modeling_outputs import BaseModelOutputWithPast
-from transformers.modeling_rope_utils import RopeParameters
-from transformers.modeling_utils import PreTrainedModel
-from transformers.models.got_ocr2.modeling_got_ocr2 import (
-    GotOcr2LayerNorm,
-    GotOcr2MLPBlock,
-    GotOcr2PatchEmbeddings,
-    GotOcr2VisionAttention,
-    GotOcr2VisionLayer,
-    GotOcr2VisionNeck,
+from ...configuration_utils import PreTrainedConfig, layer_type_validation
+from ...feature_extraction_utils import BatchFeature
+from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
+from ...modeling_rope_utils import RopeParameters
+from ..got_ocr2.modeling_got_ocr2 import (
+    GotOcr2ModelOutputWithPast,
+    GotOcr2Model,
+    GotOcr2PreTrainedModel,
+    GotOcr2ForConditionalGeneration,
+    GotOcr2VisionEncoder,
 )
-from transformers.models.qwen2.modeling_qwen2 import (
-    Qwen2Attention,
-    Qwen2DecoderLayer,
+from ..qwen2.modeling_qwen2 import (
     Qwen2Model,
     Qwen2PreTrainedModel,
 )
-from transformers.utils import (
+from ...utils import (
     auto_docstring,
-    can_return_tuple,
     logging,
+    TransformersKwargs,
 )
-from transformers.processing_utils import ProcessorMixin, TensorType
-from transformers.utils import can_return_tuple
+from ...modeling_outputs import BaseModelOutputWithPooling
+from ...processing_utils import ProcessorMixin, TensorType, Unpack
 
-from transformers.image_utils import SizeDict
+from ...image_utils import SizeDict
 
 logger = logging.get_logger(__name__)
 
 
-@auto_docstring(
-    custom_intro="""
-    This configuration class defines all the hyperparameters for the vision component
-    of the PP-Chart2Table model, which is responsible for processing chart images
-    and extracting visual features for table structure recognition and content extraction.
-    PaddlePaddle/PP-Chart2Table_safetensors [PaddlePaddle/PP-Chart2Table_safetensors]
-    (https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors)
-    """,
-    checkpoint="PaddlePaddle/PP-Chart2Table_safetensors",
-)
+@auto_docstring(checkpoint="PaddlePaddle/PP-Chart2Table_safetensors",)
 class PPChart2TableVisionConfig(PreTrainedConfig):
     """
-    Configuration class for the vision backbone of PP-Chart2Table model.
-
-    This configuration class defines all the hyperparameters for the vision component
-    of the PP-Chart2Table model, which is responsible for processing chart images
-    and extracting visual features for table structure recognition and content extraction.
-    PaddlePaddle/PP-Chart2Table_safetensors [PaddlePaddle/PP-Chart2Table_safetensors]
-    (https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors)
-
-    Args:
-        depth (`int`, *optional*, defaults to 12):
-            Number of transformer encoder layers in the vision backbone.
-        embed_dim (`int`, *optional*, defaults to 768):
-            Dimensionality of the patch embedding vectors.
-        hidden_size (`int`, *optional*, defaults to 1024):
-            Dimensionality of the hidden layer in the feed-forward network (MLP).
-        num_channels (`int`, *optional*, defaults to 3):
-            Number of input channels (3 for RGB images, 1 for grayscale).
-        image_size (`int`, *optional*, defaults to 1024):
-            Size (height/width) of the input images (assumed to be square).
-        mlp_ratio (`float`, *optional*, defaults to 4.0):
-            Ratio of the hidden layer size to the embedding dimension in the MLP (hidden_size = embed_dim * mlp_ratio).
-        num_attention_heads (`int`, *optional*, defaults to 12):
-            Number of attention heads for each transformer encoder layer.
-        patch_size (`int`, *optional*, defaults to 16):
-            Size (height/width) of the image patches extracted from the input image.
-        qkv_bias (`bool`, *optional*, defaults to `True`):
-            Whether to include bias terms in the query, key, value projection layers of self-attention.
-        use_rel_pos (`bool`, *optional*, defaults to `True`):
-            Whether to use relative positional embeddings in the self-attention mechanism.
-        global_attn_indexes (`Optional[list[int]]`, *optional*, defaults to [2, 5, 8, 11]):
-            List of layer indexes where global attention (instead of window attention) is applied.
-            If `None`, defaults to [2, 5, 8, 11].
-        window_size (`int`, *optional*, defaults to 14):
-            Size of the attention window for window-based self-attention (only effective when use_rel_pos=True).
-        output_channels (`int`, *optional*, defaults to 256):
-            Dimensionality of the final visual feature output channels.
-        net_channels (`int`, *optional*, defaults to 512):
-            Dimensionality of intermediate network channels in the vision backbone.
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            Dropout probability applied to the attention weights.
+    num_hidden_layers (`int`, *optional*, defaults to 12):
+        Number of transformer encoder layers in the vision backbone.
+    hidden_size (`int`, *optional*, defaults to 768):
+        Dimensionality of the patch embedding vectors.
+    num_channels (`int`, *optional*, defaults to 3):
+        Number of input channels (3 for RGB images, 1 for grayscale).
+    image_size (`int`, *optional*, defaults to 1024):
+        Size (height/width) of the input images (assumed to be square).
+    num_attention_heads (`int`, *optional*, defaults to 12):
+        Number of attention heads for each transformer encoder layer.
+    patch_size (`int`, *optional*, defaults to 16):
+        Size (height/width) of the image patches extracted from the input image.
+    qkv_bias (`bool`, *optional*, defaults to `True`):
+        Whether to include bias terms in the query, key, value projection layers of self-attention.
+    use_rel_pos (`bool`, *optional*, defaults to `True`):
+        Whether to use relative positional embeddings in the self-attention mechanism.
+    global_attn_indexes (`Optional[list[int]]`, *optional*, defaults to [2, 5, 8, 11]):
+        List of layer indexes where global attention (instead of window attention) is applied.
+        If `None`, defaults to [2, 5, 8, 11].
+    window_size (`int`, *optional*, defaults to 14):
+        Size of the attention window for window-based self-attention (only effective when use_rel_pos=True).
+    output_channels (`int`, *optional*, defaults to 256):
+        Dimensionality of the final visual feature output channels.
+    attention_dropout (`float`, *optional*, defaults to 0.0):
+        Dropout probability applied to the attention weights.
     """
 
     model_type = "pp_chart2table_vision"
@@ -100,37 +68,35 @@ class PPChart2TableVisionConfig(PreTrainedConfig):
 
     def __init__(
         self,
-        depth: int = 12,
-        embed_dim: int = 768,
-        hidden_size: int = 1024,
-        num_channels: int = 3,
-        image_size: int = 1024,
-        mlp_ratio: float = 4.0,
-        num_attention_heads: int = 12,
-        patch_size: int = 16,
-        qkv_bias: bool = True,
-        use_rel_pos: bool = True,
-        global_attn_indexes: Optional[list[int]] = None,
-        window_size: int = 14,
-        output_channels: int = 256,
-        net_channels: int = 512,
-        attention_dropout: float = 0.0,
+        num_hidden_layers=12,
+        hidden_size=768,
+        output_channels=256,
+        mlp_dim=3072,
+        num_channels=3,
+        image_size=1024,
+        num_attention_heads=12,
+        patch_size=16,
+        qkv_bias=True,
+        use_rel_pos=True,
+        use_abs_pos=True,
+        global_attn_indexes=[2, 5, 8, 11],
+        window_size=14,
+        attention_dropout=0.0,
         **kwargs,
     ):
-        self.depth = depth
-        self.embed_dim = embed_dim
+        self.num_hidden_layers = num_hidden_layers
         self.hidden_size = hidden_size
+        self.mlp_dim=mlp_dim
         self.image_size = image_size
         self.num_channels = num_channels
-        self.mlp_ratio = mlp_ratio
         self.num_attention_heads = num_attention_heads
         self.patch_size = patch_size
         self.qkv_bias = qkv_bias
         self.use_rel_pos = use_rel_pos
-        self.global_attn_indexes = global_attn_indexes if global_attn_indexes is not None else [2, 5, 8, 11]
+        self.use_abs_pos = use_abs_pos
+        self.global_attn_indexes = global_attn_indexes
         self.window_size = window_size
         self.output_channels = output_channels
-        self.net_channels = net_channels
         self.attention_dropout = attention_dropout
         super().__init__(**kwargs)
 
@@ -143,76 +109,52 @@ def __init__(
 )
 class PPChart2TableTextConfig(PreTrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`PPChart2TableTextModel`]. It is used to instantiate a
-    PP-Chart2Table text decoder according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the text encoder/decoder of the
-    PPChart2TableText-7B-beta [Qwen/PPChart2TableText-7B-beta](https://huggingface.co/Qwen/PPChart2TableText-7B-beta)
-    architecture, optimized for chart-to-table text generation tasks.
-
-    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PreTrainedConfig`] for more information.
-
-    Args:
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for the attention probabilities in self-attention layers.
-        bos_token_id (`int`, *optional*, defaults to 151643):
-            The token ID representing the beginning of a sequence (BOS) for text generation.
-        eos_token_id (`int`, *optional*, defaults to 151643):
-            The token ID representing the end of a sequence (EOS) for text generation.
-        pad_token_id (Optional[int], optional, *optional*, defaults to -1):
-            The index of the padding token. Defaults to -1.
-        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-            The non-linear activation function (function or string) in the feed-forward and attention layers of the decoder.
-        hidden_size (`int`, *optional*, defaults to 1024):
-            Dimensionality of the hidden representations in the Transformer decoder layers.
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        intermediate_size (`int`, *optional*, defaults to 2816):
-            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer decoder blocks.
-        max_position_embeddings (`int`, *optional*, defaults to 32768):
-            The maximum sequence length that this model might ever be used with for text input/output.
-        num_attention_heads (`int`, *optional*, defaults to 16):
-            Number of attention heads for each self-attention layer in the Transformer decoder.
-        num_hidden_layers (`int`, *optional*, defaults to 24):
-            Number of hidden layers in the Transformer decoder.
-        num_key_value_heads (`int`, *optional*, defaults to 16):
-            Number of key/value heads for implementing Grouped Query Attention (GQA). If equal to `num_attention_heads`,
-            Multi Head Attention (MHA) is used; if 1, Multi Query Attention (MQA) is used. For more details, see
-            [this paper](https://huggingface.co/papers/2305.13245).
-        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
-            The epsilon value used by the RMS normalization layers to avoid division by zero.
-        rope_theta (`float`, *optional*, defaults to 1000000.0):
-            The base period of the RoPE (Rotary Position Embedding) embeddings, controlling the frequency of positional encoding.
-        rope_parameters (`RopeParameters` or `dict`, *optional*):
-            Configuration parameters for RoPE embeddings, including scaling parameters for longer sequence lengths beyond
-            `max_position_embeddings`.
-        sliding_window (`int`, *optional*, defaults to 32768):
-            Window size for Sliding Window Attention (SWA) in the decoder layers (only active if `use_sliding_window=True`).
-        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
-            Whether the model's input and output word embeddings should be tied (shared weights).
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether to return the last key/value attention states to speed up sequential decoding (only relevant for autoregressive
-            generation).
-        vocab_size (`int`, *optional*, defaults to 151860):
-            Vocabulary size of the PPChart2TableText model. Defines the number of distinct tokens that can be represented
-            by `input_ids`.
-        layer_types (`list[str]`, *optional*):
-            Attention pattern for each decoder layer (e.g., `"full_attention"` or `"sliding_attention"`). If not specified,
-            automatically determined by `sliding_window`.
-
-    Example:
-
-    ```python
-    >>> from transformers import PPChart2TableTextConfig, PPChart2TableTextModel
-
-    >>> # Initializing a PPChart2TableText style configuration
-    >>> configuration = PPChart2TableTextConfig()
-
-    >>> # Initializing a model from the PPChart2TableText-7B style configuration
-    >>> model = PPChart2TableTextModel(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
+    attention_dropout (`float`, *optional*, defaults to 0.0):
+        The dropout ratio for the attention probabilities in self-attention layers.
+    bos_token_id (`int`, *optional*, defaults to 151643):
+        The token ID representing the beginning of a sequence (BOS) for text generation.
+    eos_token_id (`int`, *optional*, defaults to 151643):
+        The token ID representing the end of a sequence (EOS) for text generation.
+    pad_token_id (Optional[int], optional, *optional*, defaults to -1):
+        The index of the padding token. Defaults to -1.
+    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+        The non-linear activation function (function or string) in the feed-forward and attention layers of the decoder.
+    hidden_size (`int`, *optional*, defaults to 1024):
+        Dimensionality of the hidden representations in the Transformer decoder layers.
+    initializer_range (`float`, *optional*, defaults to 0.02):
+        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+    intermediate_size (`int`, *optional*, defaults to 2816):
+        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer decoder blocks.
+    max_position_embeddings (`int`, *optional*, defaults to 32768):
+        The maximum sequence length that this model might ever be used with for text input/output.
+    num_attention_heads (`int`, *optional*, defaults to 16):
+        Number of attention heads for each self-attention layer in the Transformer decoder.
+    num_hidden_layers (`int`, *optional*, defaults to 24):
+        Number of hidden layers in the Transformer decoder.
+    num_key_value_heads (`int`, *optional*, defaults to 16):
+        Number of key/value heads for implementing Grouped Query Attention (GQA). If equal to `num_attention_heads`,
+        Multi Head Attention (MHA) is used; if 1, Multi Query Attention (MQA) is used. For more details, see
+        [this paper](https://huggingface.co/papers/2305.13245).
+    rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+        The epsilon value used by the RMS normalization layers to avoid division by zero.
+    rope_theta (`float`, *optional*, defaults to 1000000.0):
+        The base period of the RoPE (Rotary Position Embedding) embeddings, controlling the frequency of positional encoding.
+    rope_parameters (`RopeParameters` or `dict`, *optional*):
+        Configuration parameters for RoPE embeddings, including scaling parameters for longer sequence lengths beyond
+        `max_position_embeddings`.
+    sliding_window (`int`, *optional*, defaults to 32768):
+        Window size for Sliding Window Attention (SWA) in the decoder layers (only active if `use_sliding_window=True`).
+    tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+        Whether the model's input and output word embeddings should be tied (shared weights).
+    use_cache (`bool`, *optional*, defaults to `True`):
+        Whether to return the last key/value attention states to speed up sequential decoding (only relevant for autoregressive
+        generation).
+    vocab_size (`int`, *optional*, defaults to 151860):
+        Vocabulary size of the PPChart2TableText model. Defines the number of distinct tokens that can be represented
+        by `input_ids`.
+    layer_types (`list[str]`, *optional*):
+        Attention pattern for each decoder layer (e.g., `"full_attention"` or `"sliding_attention"`). If not specified,
+        automatically determined by `sliding_window`.
     """
 
     model_type = "pp_chart2table_text"
@@ -306,53 +248,20 @@ def __init__(
 )
 class PPChart2TableConfig(PreTrainedConfig):
     r"""
-    This is the main configuration class to store the configuration of a [PPChart2TableModel] or [PPChart2TableForConditionalGeneration].
-    It is used to instantiate a PP-Chart2Table multimodal model according to the specified arguments, defining the vision and text
-    sub-model architectures. This configuration class inherits from [PreTrainedConfig] and combines the configurations of:
-    [PPChart2TableVisionConfig] (for the chart vision encoder)
-    [PPChart2TableTextConfig] (for the table text decoder)
-    PP-Chart2Table PaddlePaddle/PP-Chart2Table_safetensors.
-
-    Instantiating a PPChart2TableConfig with the defaults will yield a similar configuration to the base PP-Chart2Table model
-    developed by the PaddlePaddle team for chart-to-table parsing tasks.
-
-    Configuration objects inherit from [PreTrainedConfig] and can be used to control the model outputs. Read the
-    documentation from [PreTrainedConfig] for more information.
-
-    Args:
-        vision_config (Optional[Dict], optional, *optional*)::
-            The [PPChart2TableVisionConfig] for the vision sub-model. Defaults to None.
-        text_config (Optional[Dict], optional, *optional*)::
-            The [PPChart2TableTextConfig] for the text sub-model. Defaults to None.
-        image_token_index (Optional[int], optional, *optional*, defaults to 151859)::
-            The index of the image token. Defaults to 151859.
-        image_seq_length (Optional[int], optional, *optional*, defaults to 576)::
-            The sequence length for the image. Defaults to 576.
-        pad_token_id (Optional[int], optional, *optional*, defaults to -1):
-            The index of the padding token. Defaults to -1.
-
-    Example:
-
-    ```python
-    >>> from transformers import PPChart2TableConfig, PPChart2TableModel
-
-    >>> # Initializing a PPChart2Table configuration with default vision and text sub-configs
-    >>> configuration = PPChart2TableConfig()
-
-    >>> # Initializing a PPChart2Table configuration with custom vision and text sub-configs
-    >>> vision_config = {"image_size": 512, "patch_size": 8}
-    >>> text_config = {"hidden_size": 2048, "num_hidden_layers": 16}
-    >>> configuration = PPChart2TableConfig(vision_config=vision_config, text_config=text_config)
-
-    >>> # Initializing a model from the PPChart2Table configuration
-    >>> model = PPChart2TableModel(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    >>> # Accessing the vision sub-config
-    >>> vision_config = configuration.vision_config
-    >>> # Accessing the text sub-config
-    >>> text_config = configuration.text_config
+    vision_config (Optional[Dict], optional, *optional*)::
+        The [PPChart2TableVisionConfig] for the vision sub-model. Defaults to None.
+    text_config (Optional[Dict], optional, *optional*)::
+        The [PPChart2TableTextConfig] for the text sub-model. Defaults to None.
+    image_token_index (Optional[int], optional, *optional*, defaults to 151859)::
+        The index of the image token. Defaults to 151859.
+    image_seq_length (Optional[int], optional, *optional*, defaults to 576)::
+        The sequence length for the image. Defaults to 576.
+    pad_token_id (Optional[int], optional, *optional*, defaults to -1):
+        The index of the padding token. Defaults to -1.
+    net_channels (`int`, *optional*, defaults to 512):
+        Dimensionality of intermediate network channels in the vision backbone.
+    output_channels (`int`, *optional*, defaults to 1024):
+        Dimensionality of intermediate network channels in the vision backbone.
     """
 
     model_type = "pp_chart2table"
@@ -368,11 +277,15 @@ def __init__(
         image_token_index: Optional[int] = 151859,
         image_seq_length: Optional[int] = 576,
         pad_token_id: Optional[int] = -1,
+        net_channels: Optional[int] = 512,
+        output_channels: Optional[int] = 1024,
         **kwargs,
     ):
         self.image_token_index = image_token_index
         self.image_seq_length = image_seq_length
         self.pad_token_id = pad_token_id
+        self.net_channels = net_channels
+        self.output_channels = output_channels
 
         if vision_config is None:
             vision_config = {}
@@ -408,40 +321,9 @@ def __init__(
 
         super().__init__(**kwargs)
 
-@auto_docstring(
-    custom_intro="""
 
-    """
-)
+@auto_docstring
 class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
-    r"""
-    Fast image processor for the PP-Chart2Table multimodal model, optimized for GPU-accelerated chart image preprocessing.
-
-    This high-performance processor implements a streamlined preprocessing pipeline for chart images (resizing, rescaling,
-    normalization, channel reordering) using PyTorch tensor operations, designed for efficient batch processing on GPUs.
-    It inherits from [`BaseImageProcessorFast`] and is optimized for inference/training pipelines requiring low-latency
-    image preprocessing.
-
-    Class Attributes (Default Configuration):
-        resample (`int`, defaults to 3):
-            Integer identifier for the resampling filter (3 = BICUBIC, compatible with `InterpolationMode.BICUBIC`).
-        image_mean (`list[float]`, defaults to `[0.40821073, 0.4578275, 0.48145466]`):
-            Per-channel mean values for image normalization (RGB order).
-        image_std (`list[float]`, defaults to `[0.27577711, 0.26130258, 0.26862954]`):
-            Per-channel standard deviation values for image normalization (RGB order).
-        size (`dict[str, int]`, defaults to `{"height": 1024, "width": 1024}`):
-            Default target size for image resizing (1024x1024, optimized for PP-Chart2Table vision encoder).
-        patch_size (`int`, defaults to 16):
-            Size of image patches used by the PP-Chart2Table vision encoder (for alignment with model input).
-        merge_size (`int`, defaults to 4):
-            Size factor for merging image patches (specific to PP-Chart2Table's vision processing pipeline).
-        do_resize (`bool`, defaults to `True`):
-            Default flag to enable image resizing.
-        do_rescale (`bool`, defaults to `True`):
-            Default flag to enable pixel value rescaling (from [0,255] to [0,1]).
-        do_normalize (`bool`, defaults to `True`):
-            Default flag to enable image normalization.
-    """
 
     resample = 3
     image_mean = [0.40821073, 0.4578275, 0.48145466]
@@ -453,9 +335,6 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
     do_rescale = True
     do_normalize = True
 
-    def __init__(self, **kwargs) -> None:
-        super().__init__(**kwargs)
-
     def _preprocess(
         self,
         images: list["torch.Tensor"],
@@ -498,30 +377,8 @@ def _preprocess(
         )
 
 
-@auto_docstring(
-    custom_intro="""
-    A multi-modal processor for the PPChart2Table model, combining image preprocessing and text tokenization
-    capabilities to handle chart-to-table conversion tasks.
-    
-    This processor integrates `PPChart2TableImageProcessorFast` for chart image preprocessing (e.g., patch-based 
-    resizing) and `Qwen2Tokenizer` for text prompt construction/tokenization. It encapsulates the end-to-end 
-    processing pipeline from raw chart images + text instructions to model-ready input tensors, and also provides 
-    postprocessing logic to decode model outputs back to human-readable table text.
-    """
-)
+@auto_docstring
 class PPChart2TableProcessor(ProcessorMixin):
-    r"""
-    [`PPChart2TableProcessor`] offers all the functionalities of [`PPChart2TableImageProcessorFast`] and [`Qwen2Tokenizer`]. See the
-    [`~PPChart2TableProcessor.__call__`] and [`~PPChart2TableProcessor.decode`] for more information.
-    Args:
-        image_processor ([`PPChart2TableImageProcessorFast`], *optional*):
-            The image processor is a required input.
-        tokenizer ([`Qwen2Tokenizer`], *optional*):
-            The tokenizer is a required input.
-        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
-            in a chat into a tokenizable string.
-    """
-
     image_processor_class = "AutoImageProcessor"
     tokenizer_class = "AutoTokenizer"
 
@@ -560,182 +417,16 @@ def postprocess(self, model_pred, **kwargs):
         )
 
 
-class PPChart2TableVisionPatchEmbed(GotOcr2PatchEmbeddings):
-    def __init__(self, config):
-        super().__init__()
-        image_size, patch_size = config.image_size, config.patch_size
-        num_channels, hidden_size = config.num_channels, config.embed_dim
-        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
-        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
-        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.num_patches = num_patches
-        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
-
-
-class PPChart2TableVisionMLPBlock(GotOcr2MLPBlock):
-    def __init__(self, config) -> None:
-        super().__init__()
-        self.lin1 = nn.Linear(config.embed_dim, int(config.embed_dim * config.mlp_ratio))
-        self.lin2 = nn.Linear(int(config.embed_dim * config.mlp_ratio), config.embed_dim)
-
-
-class PPChart2TableVisionLayerNorm(GotOcr2LayerNorm):
-    pass
-
-
-class PPChart2TableVisionAttention(GotOcr2VisionAttention):
-    """Multi-head Attention block with relative position embeddings."""
-
-    def __init__(self, config, window_size):
-        super().__init__()
-        head_dim = config.embed_dim // config.num_attention_heads
-        self.scale = head_dim**-0.5
-        self.qkv = nn.Linear(config.embed_dim, config.embed_dim * 3, bias=config.qkv_bias)
-        self.proj = nn.Linear(config.embed_dim, config.embed_dim)
-
-
-class PPChart2TableVisionDecoderLayer(GotOcr2VisionLayer):
-    def __init__(self, config, window_size) -> None:
-        super().__init__()
-        self.layer_norm1 = nn.LayerNorm(config.embed_dim)
-        self.attn = PPChart2TableVisionAttention(config, window_size=window_size)
-
-        self.layer_norm2 = nn.LayerNorm(config.embed_dim)
-        self.mlp = PPChart2TableVisionMLPBlock(config)
-        self.window_size = window_size
-
-
-class PPChart2TableVisionNeck(GotOcr2VisionNeck):
-    def __init__(self, config: PPChart2TableVisionConfig):
-        super().__init__()
-        self.config = config
-
-        self.conv1 = nn.Conv2d(config.embed_dim, config.output_channels, kernel_size=1, bias=False)
-        self.layer_norm1 = PPChart2TableVisionLayerNorm(config.output_channels, data_format="channels_first")
-        self.conv2 = nn.Conv2d(config.output_channels, config.output_channels, kernel_size=3, padding=1, bias=False)
-        self.layer_norm2 = PPChart2TableVisionLayerNorm(config.output_channels, data_format="channels_first")
-
-
-@auto_docstring(
-    custom_intro="""
-    
-    """
-)
-class PPChart2TableVisionPreTrainedModel(PreTrainedModel):
-    r"""
-    Base class for all PP-Chart2Table vision models, inheriting from Hugging Face `PreTrainedModel`.
-
-    This class sets up core configurations and compatibility flags for the vision encoder, including:
-    - Support for gradient checkpointing, attention backends (FlashAttention/SDPA), and model compilation
-    - Definition of non-splittable modules (for tensor parallelism)
-    - Output recording for hidden states/attentions (for debugging/analysis)
-
-    Class Attributes:
-        config (`PPChart2TableVisionConfig`):
-            Typed config class for PP-Chart2Table vision encoder (enforces type checking).
-        base_model_prefix (`str`, defaults to `"model"`):
-            Prefix for base model parameters (used in weight loading/saving).
-        supports_gradient_checkpointing (`bool`, defaults to `True`):
-            Whether the model supports gradient checkpointing to save memory.
-        _no_split_modules (`list[str]`):
-            Modules that should not be split across devices (tensor parallelism compatibility).
-        _skip_keys_device_placement (`list[str]`):
-            Keys to skip when placing tensors on devices (e.g., past key values for generation).
-        _supports_flash_attn / _supports_sdpa / _supports_flex_attn (`bool`):
-            Compatibility with optimized attention implementations (FlashAttention, SDPA, FlexAttention).
-        _can_compile_fullgraph (`bool`, defaults to `True`):
-            Whether the model supports TorchScript/TorchCompile full graph compilation.
-        _supports_attention_backend (`bool`, defaults to `True`):
-            Whether the model supports switching attention backends (e.g., PyTorch vs FlashAttention).
-        _can_record_outputs (`dict`):
-            Mapping of output types to modules for recording intermediate outputs (hidden_states/attentions).
-    """
-
-    config: PPChart2TableVisionConfig
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["PPChart2TableVisionDecoderLayer"]
-    _skip_keys_device_placement = ["past_key_values"]
-    _supports_flash_attn = True
-    _supports_sdpa = True
-    _supports_flex_attn = True
-
-    _can_compile_fullgraph = True
-    _supports_attention_backend = True
-    _can_record_outputs = {
-        "hidden_states": PPChart2TableVisionDecoderLayer,
-        "attentions": PPChart2TableVisionAttention,
-    }
-
-
-@auto_docstring(
-    custom_intro="""
-    
-    """
-)
-class PPChart2TableVisionModel(PPChart2TableVisionPreTrainedModel):
-    main_input_name = "pixel_values"
-    input_modalities = "image"
-
-    def __init__(
-        self,
-        config: PPChart2TableVisionConfig,
-    ) -> None:
-        super().__init__(config)
-        self.image_size = config.image_size
-
-        self.patch_embed = PPChart2TableVisionPatchEmbed(config)
-
-        self.pos_embed = nn.Parameter(
-            torch.zeros(
-                1, config.image_size // config.patch_size, config.image_size // config.patch_size, config.embed_dim
-            )
-        )
-
-        self.blocks = nn.ModuleList()
-        for i in range(config.depth):
-            block = PPChart2TableVisionDecoderLayer(
-                config,
-                window_size=config.window_size if i not in config.global_attn_indexes else 0,
-            )
-            self.blocks.append(block)
-
-        self.neck = PPChart2TableVisionNeck(config)
-
-        self.net_2 = nn.Conv2d(
-            config.output_channels, config.net_channels, kernel_size=3, stride=2, padding=1, bias=False
-        )
-        self.net_3 = nn.Conv2d(config.net_channels, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False)
+class PPChart2TableVisionPreTrainedModel(GotOcr2PreTrainedModel):
+    input_modalities = ("image", "text")
 
-        self.post_init()
 
-    def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
-        hidden_states = self.patch_embed(hidden_states)
-        hidden_states = hidden_states + self.pos_embed
-        for block in self.blocks:
-            hidden_states = block(hidden_states)
-        hidden_states = self.neck(hidden_states)
-        hidden_states = self.net_2(hidden_states)
-        hidden_states = self.net_3(hidden_states)
-        return hidden_states
-
-
-class PPChart2TableTextAttention(Qwen2Attention):
+class PPChart2TableVisionEncoder(GotOcr2VisionEncoder, PPChart2TableVisionPreTrainedModel):
     pass
 
 
-class PPChart2TableTextDecoderLayer(Qwen2DecoderLayer):
-    pass
 
-
-@auto_docstring(
-    custom_intro="""
-    
-    """
-)
+@auto_docstring
 class PPChart2TableTextPreTrainedModel(Qwen2PreTrainedModel):
     pass
 
@@ -745,147 +436,19 @@ class PPChart2TableTextModel(Qwen2Model):
 
 
 @dataclass
-class PPChart2TableModelOutputWithPast(BaseModelOutputWithPast):
-    r"""
-    Output class for PPChart2Table multimodal model's forward pass, extending Hugging Face `ModelOutput`.
-
-    This dataclass encapsulates the core outputs of the PP-Chart2Table base model, including hidden states,
-    attention weights, and cached key/value pairs for efficient generation.
-
-    Attributes:
-        past_key_values (`Optional[Cache]`, defaults to `None`):
-            Cached attention key/value pairs from the text decoder (for fast autoregressive generation).
-        last_hidden_state (`Optional[torch.FloatTensor]`, defaults to `None`):
-            Final hidden states from the text decoder (shape: `[B, seq_len, hidden_size]`), after multimodal fusion.
-        hidden_states (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`):
-            Tuple of hidden states from each layer of the text decoder (for debugging/analysis).
-        attentions (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`):
-            Tuple of attention weights from each layer of the text decoder (for debugging/analysis).
-    """
-    image_hidden_states: Optional[torch.FloatTensor] = None
-
-
-@dataclass
-class PPChart2TableCausalLMOutputWithPast(BaseModelOutputWithPast):
-    r"""
-    Output class for PP-Chart2Table conditional generation model's forward pass.
-
-    Extends `PPChart2TableModelOutputWithPast` with language modeling logits (for token prediction),
-    tailored for autoregressive table generation tasks.
-
-    Attributes:
-        logits (`Optional[torch.FloatTensor]`, defaults to `None`):
-            Language modeling logits (shape: `[B, seq_len, vocab_size]`), output from the LM head.
-    """
-
-    logits: Optional[torch.FloatTensor] = None
-    loss: Optional[torch.FloatTensor] = None
-    image_hidden_states: Optional[torch.FloatTensor] = None
-
-
-@auto_docstring(
-    custom_intro="""
-    
-    """
-)
-class PPChart2TablePreTrainedModel(PreTrainedModel):
-    r"""
-    Base class for all PP-Chart2Table multimodal models, inheriting from Hugging Face `PreTrainedModel`.
-
-    This class defines core configurations and compatibility flags for the multimodal model (vision + text),
-    including support for gradient checkpointing, optimized attention backends, and model compilation.
-
-    Class Attributes:
-        config (`PPChart2TableConfig`):
-            Typed config class for PP-Chart2Table (combines vision + text sub-configs).
-        base_model_prefix (`str`, defaults to `"model"`):
-            Prefix for base model parameters (used in weight loading/saving).
-        supports_gradient_checkpointing (`bool`, defaults to `True`):
-            Whether the model supports gradient checkpointing to save memory during training.
-        _no_split_modules (`list[str]`):
-            Modules that should not be split across devices (tensor parallelism compatibility).
-        _skip_keys_device_placement (`list[str]`):
-            Keys to skip when placing tensors on devices (e.g., past key values for generation).
-        _supports_flash_attn / _supports_sdpa / _supports_flex_attn (`bool`):
-            Compatibility with optimized attention implementations (FlashAttention, SDPA, FlexAttention).
-        _can_compile_fullgraph (`bool`, defaults to `True`):
-            Whether the model supports TorchScript/TorchCompile full graph compilation.
-        _supports_attention_backend (`bool`, defaults to `True`):
-            Whether the model supports switching attention backends (e.g., PyTorch vs FlashAttention).
-        _can_record_outputs (`dict`):
-            Mapping of output types to modules for recording intermediate outputs (hidden_states/attentions).
-    """
-
-    config: PPChart2TableConfig
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["PPChart2TableTextDecoderLayer"]
-    _skip_keys_device_placement = ["past_key_values"]
-    _supports_flash_attn = True
-    _supports_sdpa = True
-    _supports_flex_attn = True
-
-    _can_compile_fullgraph = True
-    _supports_attention_backend = True
-
-    _can_record_outputs = {
-        "hidden_states": PPChart2TableTextDecoderLayer,
-        "attentions": PPChart2TableTextAttention,
-    }
-
-
-@auto_docstring(
-    custom_intro="""
-    Core PP-Chart2Table multimodal model (vision encoder + text decoder) for chart-to-table parsing.
-    """
-)
-class PPChart2TableModel(PPChart2TablePreTrainedModel):
-    r"""
-    Core PP-Chart2Table multimodal model (vision encoder + text decoder) for chart-to-table parsing.
-
-    This model integrates a vision encoder (for chart image feature extraction) and a text decoder (for table generation),
-    with a multimodal projection layer to align vision features with text embedding space. The core logic is:
-    1. Extract chart features via vision encoder
-    2. Project vision features to text embedding dimension
-    3. Inject vision features into text decoder inputs (replace image placeholder tokens)
-    4. Forward pass through text decoder to generate table text
-
-    Args:
-        config (`PPChart2TableConfig`):
-            Combined configuration class (includes vision_config and text_config sub-configs).
+class PPChart2TableModelOutputWithPast(GotOcr2ModelOutputWithPast):
+    pass
 
-    Inputs (forward method):
-        input_ids (`torch.LongTensor`, optional):
-            Tokenized input text (including image placeholder tokens) with shape `[B, seq_len]`.
-        attention_mask (`torch.Tensor`, optional):
-            Attention mask to avoid padding tokens (shape: `[B, seq_len]`).
-        position_ids (`torch.Tensor`, optional):
-            Positional indices for input tokens (shape: `[B, seq_len]`).
-        past_key_values (`list[torch.Tensor]`, optional):
-            Cached key/value pairs for fast autoregressive generation.
-        inputs_embeds (`torch.Tensor`, optional):
-            Precomputed input embeddings (shape: `[B, seq_len, hidden_size]`; overrides `input_ids`).
-        use_cache (`bool`, optional):
-            Whether to cache key/value pairs for generation.
-        pixel_values (`torch.Tensor`, optional):
-            Preprocessed chart images (shape: `[B, 3, H, W]`; required for multimodal input).
-        cache_position (`torch.LongTensor`, optional):
-            Position indices for cached key/value pairs (for generation).
-        **kwargs:
-            Additional arguments passed to the text decoder.
 
-    Outputs:
-        `PPChart2TableModelOutputWithPast`:
-            Contains the text decoder's final hidden states, cached key/values, and optional intermediate outputs.
-    """
-
-    config_class = PPChart2TableConfig
+@auto_docstring
+class PPChart2TableModel(GotOcr2Model):
 
     def __init__(self, config: PPChart2TableConfig):
         super().__init__(config)
-        self.vision_tower_high = PPChart2TableVisionModel._from_config(config.vision_config)
+        self.vision_downsample1 = nn.Conv2d(config.vision_config.output_channels, config.net_channels, kernel_size=3, stride=2, padding=1, bias=False)
+        self.vision_downsample2 = nn.Conv2d(config.net_channels, config.output_channels, kernel_size=3, stride=2, padding=1, bias=False)
         self.language_model = PPChart2TableTextModel._from_config(config.text_config)
-        self.mm_projector_vary = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size)
+        self.multi_modal_projector = nn.Linear(config.output_channels, config.text_config.hidden_size)
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -901,99 +464,16 @@ def set_input_embeddings(self, value):
     def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
-    ) -> list[torch.Tensor]:
-        r"""
-        Extract and project chart image features to text embedding space.
-
-        Args:
-            images (`torch.Tensor`):
-                Preprocessed chart images (shape: `[B, 3, H, W]`).
-
-        Returns:
-            `list[torch.Tensor]`:
-                List of projected image features (one per image), each with shape `[1, num_patches, text_hidden_size]`.
-        """
-        image_features = []
-        for pixel_value in pixel_values:
-            pixel_value = pixel_value.unsqueeze(0)
-            with torch.no_grad():
-                cnn_feature = self.vision_tower_high(pixel_value)
-                cnn_feature = cnn_feature.flatten(2).transpose(2, 1)
-            image_feature = self.mm_projector_vary(cnn_feature)
-            image_features.append(image_feature)
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
+        
+        image_output = self.vision_tower(pixel_values)
+        last_hidden_state = image_output.last_hidden_state
+        last_hidden_state = self.vision_downsample1(last_hidden_state)
+        last_hidden_state = self.vision_downsample2(last_hidden_state)
+        image_output.pooler_output = self.multi_modal_projector(last_hidden_state.flatten(2).transpose(2, 1))
 
-        image_features = torch.stack(image_features, dim=0)
-
-        return image_features
-
-    def get_placeholder_mask(
-        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
-    ):
-        """
-        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
-        equal to the length of multimodal features. If the lengths are different, an error is raised.
-        """
-        if input_ids is None:
-            special_image_mask = inputs_embeds == self.get_input_embeddings()(
-                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
-            )
-            special_image_mask = special_image_mask.all(-1)
-        else:
-            special_image_mask = input_ids == self.config.image_token_id
-
-        n_image_tokens = special_image_mask.sum()
-        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
-        n_image_features = image_features.shape[0] * image_features.shape[1]
-        if inputs_embeds[special_image_mask].numel() != image_features.numel():
-            raise ValueError(
-                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
-            )
-        return special_image_mask
-
-    @can_return_tuple
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.Tensor] = None,
-        past_key_values: Optional[list[torch.Tensor]] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        use_cache: Optional[bool] = None,
-        pixel_values: Optional[torch.Tensor] = None,
-        cache_position: Optional[torch.LongTensor] = None,
-        **kwargs,
-    ):
-        if (input_ids is None) ^ (inputs_embeds is not None):
-            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
-
-        if inputs_embeds is None:
-            inputs_embeds = self.language_model.embed_tokens(input_ids)
-
-        if pixel_values is not None:
-            image_features = self.get_image_features(pixel_values=pixel_values.to(inputs_embeds.dtype))
-            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
-            special_image_mask = self.get_placeholder_mask(
-                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
-            )
-            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
-
-        outputs = self.language_model(
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            cache_position=cache_position,
-            **kwargs,
-        )
-
-        return PPChart2TableModelOutputWithPast(
-            last_hidden_state=outputs.last_hidden_state,
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-            image_hidden_states=image_features if pixel_values is not None else None,
-        )
+        return image_output
 
 
 @auto_docstring(
@@ -1002,147 +482,8 @@ def forward(
     extending the core model with a language modeling (LM) head and generation utilities.
     """
 )
-class PPChart2TableForConditionalGeneration(PPChart2TablePreTrainedModel, GenerationMixin):
-    r"""
-    PP-Chart2Table model for conditional generation (table text generation from chart images),
-    extending the core model with a language modeling (LM) head and generation utilities.
-
-    This class integrates Hugging Face `GenerationMixin` to support standard generation methods (greedy, beam search, etc.),
-    and adds an LM head to predict token probabilities for autoregressive table generation.
-
-    Key Features:
-    - LM head for token prediction (weight tied to input embeddings)
-    - Optimized generation input preparation (avoids reprocessing images in subsequent steps)
-    - Inference-only mode (training not supported by default)
-
-    Args:
-        config (`PPChart2TableConfig`):
-            Combined configuration class (vision + text sub-configs).
-
-    Inputs (forward method):
-        Inherits all inputs from `PPChart2TableModel`, plus:
-        labels (`list[dict]`, optional):
-            Training labels (not supported; raises ValueError if provided).
-        logits_to_keep (`Union[int, torch.Tensor]`, defaults to 0):
-            Slice index to keep only the last N logits (optimizes generation efficiency).
-
-    Outputs:
-        `PPChart2TableCausalLMOutputWithPast`:
-            Contains LM logits, decoder hidden states, and cached key/value pairs.
-    """
-
-    _keys_to_ignore_on_load_missing = ["num_batches_tracked"]
-    _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"}
-
-    def __init__(self, config: PPChart2TableConfig):
-        super().__init__(config)
-        self.model = PPChart2TableModel(config)
-        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
-        self.post_init()
-
-    def get_input_embeddings(self):
-        return self.model.get_input_embeddings()
-
-    def set_input_embeddings(self, value):
-        self.model.set_input_embeddings(value)
-
-    def get_output_embeddings(self) -> nn.Module:
-        return self.lm_head
-
-    def get_image_features(
-        self,
-        pixel_values: torch.FloatTensor,
-        vision_feature_layer: Optional[Union[int, list[int]]] = None,
-        vision_feature_select_strategy: Optional[str] = None,
-        **kwargs,
-    ):
-        return self.model.get_image_features(
-            pixel_values=pixel_values,
-            vision_feature_layer=vision_feature_layer,
-            vision_feature_select_strategy=vision_feature_select_strategy,
-            **kwargs,
-        )
-
-    @can_return_tuple
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        pixel_values: Optional[torch.Tensor] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        labels: Optional[list[dict]] = None,
-        logits_to_keep: Union[int, torch.Tensor] = 0,
-        cache_position: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Cache] = None,
-        use_cache: Optional[bool] = None,
-        **kwargs,
-    ) -> Union[tuple[torch.FloatTensor], PPChart2TableCausalLMOutputWithPast]:
-        outputs = self.model(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            pixel_values=pixel_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            past_key_values=past_key_values,
-            cache_position=cache_position,
-            logits_to_keep=logits_to_keep,
-            **kwargs,
-        )
-        hidden_states = outputs.last_hidden_state
-
-        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
-        logits = self.lm_head(hidden_states[:, slice_indices, :])
-
-        loss = None
-        if labels is not None:
-            loss = self.loss_function(
-                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
-            )
-
-        return PPChart2TableCausalLMOutputWithPast(
-            loss=loss,
-            logits=logits,
-            last_hidden_state=outputs.last_hidden_state,
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
-
-    def prepare_inputs_for_generation(
-        self,
-        input_ids,
-        past_key_values=None,
-        inputs_embeds=None,
-        pixel_values=None,
-        attention_mask=None,
-        cache_position=None,
-        logits_to_keep=None,
-        is_first_iteration=False,
-        **kwargs,
-    ):
-        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
-
-        model_inputs = super().prepare_inputs_for_generation(
-            input_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            attention_mask=attention_mask,
-            cache_position=cache_position,
-            logits_to_keep=logits_to_keep,
-            is_first_iteration=is_first_iteration,
-            **kwargs,
-        )
-
-        if is_first_iteration or not kwargs.get("use_cache", True):
-            # Pixel values are used only in the first iteration if available
-            # In subsquent iterations, they are already merged with text and cached
-            # NOTE: first iteration doesn't have to be prefill, it can be the first
-            # iteration with a question and cached system prompt (continue generate from cache)
-            model_inputs["pixel_values"] = pixel_values
-
-        return model_inputs
+class PPChart2TableForConditionalGeneration(GotOcr2ForConditionalGeneration):
+    pass
 
 
 __all__ = [
diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
index 1efa2641ff23..518fcb645770 100644
--- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
@@ -7,35 +7,13 @@
 
 import torch
 
-from transformers.feature_extraction_utils import BatchFeature
-from transformers.processing_utils import ProcessorMixin
-from transformers.utils import auto_docstring
+from ...feature_extraction_utils import BatchFeature
+from ...processing_utils import ProcessorMixin
+from ...utils import auto_docstring
 
 
-@auto_docstring(
-    custom_intro="""
-    A multi-modal processor for the PPChart2Table model, combining image preprocessing and text tokenization
-    capabilities to handle chart-to-table conversion tasks.
-    
-    This processor integrates `PPChart2TableImageProcessorFast` for chart image preprocessing (e.g., patch-based 
-    resizing) and `Qwen2Tokenizer` for text prompt construction/tokenization. It encapsulates the end-to-end 
-    processing pipeline from raw chart images + text instructions to model-ready input tensors, and also provides 
-    postprocessing logic to decode model outputs back to human-readable table text.
-    """
-)
+@auto_docstring
 class PPChart2TableProcessor(ProcessorMixin):
-    r"""
-    [`PPChart2TableProcessor`] offers all the functionalities of [`PPChart2TableImageProcessorFast`] and [`Qwen2Tokenizer`]. See the
-    [`~PPChart2TableProcessor.__call__`] and [`~PPChart2TableProcessor.decode`] for more information.
-    Args:
-        image_processor ([`PPChart2TableImageProcessorFast`], *optional*):
-            The image processor is a required input.
-        tokenizer ([`Qwen2Tokenizer`], *optional*):
-            The tokenizer is a required input.
-        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
-            in a chat into a tokenizable string.
-    """
-
     image_processor_class = "AutoImageProcessor"
     tokenizer_class = "AutoTokenizer"
 
diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
index 99d30d76c81d..28bac6ef8cea 100644
--- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
@@ -16,6 +16,8 @@
 import gc
 import unittest
 
+import requests
+
 import pytest
 from parameterized import parameterized
 from PIL import Image
@@ -71,8 +73,8 @@ def __init__(
         is_training=False,
         vision_config={
             "depth": 2,
-            "embed_dim": 768,
             "hidden_size": 144,
+            "output_channels": 192,
             "hidden_act": "gelu",
             "image_size": 64,
             "num_channels": 3,
@@ -84,8 +86,8 @@ def __init__(
             "use_rel_pos": True,
             "global_attn_indexes": [2, 5, 8, 11],
             "window_size": 14,
-            "output_channels": 256,
-            "net_channels": 512,
+            "neck_channels": 48,
+            "net_channels": 96,
             "attention_dropout": 0.0,
         },
         bos_token_id=151643,
@@ -260,9 +262,8 @@ def test_small_model_integration_test(self):
             "/workspace/model_weight_torch/PP-Chart2Table", dtype="float32"
         ).to("cuda")
 
-        image = Image.open(
-            "/workspace/PaddleX/paddlex/inference/models/doc_vlm/modeling/chart_parsing_02.png"
-        ).convert("RGB")
+        image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw)
+
         inputs = self.processor(images=image).to(model.device)
         breakpoint()
         expected_input_ids_length = 286

From 3d8a654672f5cb7e72cf4e143996ca6fa6f488fa Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Fri, 13 Mar 2026 12:30:31 +0800
Subject: [PATCH 13/60] update

---
 .../configuration_pp_chart2table.py           | 325 ++++----------
 .../image_processing_pp_chart2table.py        | 166 +------
 .../pp_chart2table/modeling_pp_chart2table.py | 415 +-----------------
 .../pp_chart2table/modular_pp_chart2table.py  | 375 ++--------------
 .../processing_pp_chart2table.py              |  41 +-
 5 files changed, 187 insertions(+), 1135 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
index 7ecb1a2ddd31..f1c819a96417 100644
--- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
@@ -5,248 +5,95 @@
 #                          modular_pp_chart2table.py file directly. One of our CI enforces this.
 #                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 
-from ...configuration_utils import PreTrainedConfig, layer_type_validation
-from ...modeling_rope_utils import RopeParameters
+from ...configuration_utils import PreTrainedConfig
 from ...utils import auto_docstring
+from ..auto import CONFIG_MAPPING, AutoConfig
 
 
-@auto_docstring(
-    checkpoint="PaddlePaddle/PP-Chart2Table_safetensors",
-)
+@auto_docstring(checkpoint="facebook/sam-vit-huge")
 class PPChart2TableVisionConfig(PreTrainedConfig):
-    """
-    num_hidden_layers (`int`, *optional*, defaults to 12):
-        Number of transformer encoder layers in the vision backbone.
-    hidden_size (`int`, *optional*, defaults to 768):
-        Dimensionality of the patch embedding vectors.
-    num_channels (`int`, *optional*, defaults to 3):
-        Number of input channels (3 for RGB images, 1 for grayscale).
-    image_size (`int`, *optional*, defaults to 1024):
-        Size (height/width) of the input images (assumed to be square).
-    num_attention_heads (`int`, *optional*, defaults to 12):
-        Number of attention heads for each transformer encoder layer.
-    patch_size (`int`, *optional*, defaults to 16):
-        Size (height/width) of the image patches extracted from the input image.
-    qkv_bias (`bool`, *optional*, defaults to `True`):
-        Whether to include bias terms in the query, key, value projection layers of self-attention.
-    use_rel_pos (`bool`, *optional*, defaults to `True`):
-        Whether to use relative positional embeddings in the self-attention mechanism.
-    global_attn_indexes (`Optional[list[int]]`, *optional*, defaults to [2, 5, 8, 11]):
-        List of layer indexes where global attention (instead of window attention) is applied.
-        If `None`, defaults to [2, 5, 8, 11].
-    window_size (`int`, *optional*, defaults to 14):
-        Size of the attention window for window-based self-attention (only effective when use_rel_pos=True).
+    r"""
     output_channels (`int`, *optional*, defaults to 256):
-        Dimensionality of the final visual feature output channels.
-    attention_dropout (`float`, *optional*, defaults to 0.0):
-        Dropout probability applied to the attention weights.
+        Dimensionality of the output channels in the Patch Encoder.
+    window_size (`int`, *optional*, defaults to 14):
+        Window size for relative position.
+    use_abs_pos (`bool`, *optional*, defaults to `True`):
+        Whether to use absolute position embedding.
+    use_rel_pos (`bool`, *optional*, defaults to `True`):
+        Whether to use relative position embedding.
+    global_attn_indexes (`list[int]`, *optional*, defaults to `[2, 5, 8, 11]`):
+        The indexes of the global attention layers.
+    mlp_dim (`int`, *optional*, defaults to 3072):
+        The dimensionality of the MLP layer in the Transformer encoder.
     """
 
-    model_type = "pp_chart2table_vision"
     base_config_key = "vision_config"
 
     def __init__(
         self,
-        num_hidden_layers=12,
         hidden_size=768,
         output_channels=256,
-        mlp_dim=3072,
+        num_hidden_layers=12,
+        num_attention_heads=12,
         num_channels=3,
         image_size=1024,
-        num_attention_heads=12,
         patch_size=16,
+        hidden_act="gelu",
+        layer_norm_eps=1e-06,
+        attention_dropout=0.0,
+        initializer_range=1e-10,
         qkv_bias=True,
+        use_abs_pos=True,
         use_rel_pos=True,
-        use_abs_pos=False,
-        global_attn_indexes=[2, 5, 8, 11],
         window_size=14,
-        attention_dropout=0.0,
+        global_attn_indexes=[2, 5, 8, 11],
+        mlp_dim=3072,
         **kwargs,
     ):
-        self.num_hidden_layers = num_hidden_layers
-        self.hidden_size = hidden_size
-        self.mlp_dim = mlp_dim
-        self.image_size = image_size
-        self.num_channels = num_channels
-        self.num_attention_heads = num_attention_heads
-        self.patch_size = patch_size
-        self.qkv_bias = qkv_bias
-        self.use_rel_pos = use_rel_pos
-        self.use_abs_pos = use_abs_pos
-        self.global_attn_indexes = global_attn_indexes
-        self.window_size = window_size
-        self.output_channels = output_channels
-        self.attention_dropout = attention_dropout
         super().__init__(**kwargs)
 
-
-@auto_docstring(
-    custom_intro="""
-    
-    """,
-)
-class PPChart2TableTextConfig(PreTrainedConfig):
-    r"""
-    attention_dropout (`float`, *optional*, defaults to 0.0):
-        The dropout ratio for the attention probabilities in self-attention layers.
-    bos_token_id (`int`, *optional*, defaults to 151643):
-        The token ID representing the beginning of a sequence (BOS) for text generation.
-    eos_token_id (`int`, *optional*, defaults to 151643):
-        The token ID representing the end of a sequence (EOS) for text generation.
-    pad_token_id (Optional[int], optional, *optional*, defaults to -1):
-        The index of the padding token. Defaults to -1.
-    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-        The non-linear activation function (function or string) in the feed-forward and attention layers of the decoder.
-    hidden_size (`int`, *optional*, defaults to 1024):
-        Dimensionality of the hidden representations in the Transformer decoder layers.
-    initializer_range (`float`, *optional*, defaults to 0.02):
-        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-    intermediate_size (`int`, *optional*, defaults to 2816):
-        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer decoder blocks.
-    max_position_embeddings (`int`, *optional*, defaults to 32768):
-        The maximum sequence length that this model might ever be used with for text input/output.
-    num_attention_heads (`int`, *optional*, defaults to 16):
-        Number of attention heads for each self-attention layer in the Transformer decoder.
-    num_hidden_layers (`int`, *optional*, defaults to 24):
-        Number of hidden layers in the Transformer decoder.
-    num_key_value_heads (`int`, *optional*, defaults to 16):
-        Number of key/value heads for implementing Grouped Query Attention (GQA). If equal to `num_attention_heads`,
-        Multi Head Attention (MHA) is used; if 1, Multi Query Attention (MQA) is used. For more details, see
-        [this paper](https://huggingface.co/papers/2305.13245).
-    rms_norm_eps (`float`, *optional*, defaults to 1e-06):
-        The epsilon value used by the RMS normalization layers to avoid division by zero.
-    rope_theta (`float`, *optional*, defaults to 1000000.0):
-        The base period of the RoPE (Rotary Position Embedding) embeddings, controlling the frequency of positional encoding.
-    rope_parameters (`RopeParameters` or `dict`, *optional*):
-        Configuration parameters for RoPE embeddings, including scaling parameters for longer sequence lengths beyond
-        `max_position_embeddings`.
-    sliding_window (`int`, *optional*, defaults to 32768):
-        Window size for Sliding Window Attention (SWA) in the decoder layers (only active if `use_sliding_window=True`).
-    tie_word_embeddings (`bool`, *optional*, defaults to `True`):
-        Whether the model's input and output word embeddings should be tied (shared weights).
-    use_cache (`bool`, *optional*, defaults to `True`):
-        Whether to return the last key/value attention states to speed up sequential decoding (only relevant for autoregressive
-        generation).
-    vocab_size (`int`, *optional*, defaults to 151860):
-        Vocabulary size of the PPChart2TableText model. Defines the number of distinct tokens that can be represented
-        by `input_ids`.
-    layer_types (`list[str]`, *optional*):
-        Attention pattern for each decoder layer (e.g., `"full_attention"` or `"sliding_attention"`). If not specified,
-        automatically determined by `sliding_window`.
-    """
-
-    model_type = "pp_chart2table_text"
-    keys_to_ignore_at_inference = ["past_key_values"]
-
-    # Default tensor parallel plan for base model `PPChart2TableText`
-    base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "colwise",
-        "layers.*.self_attn.k_proj": "colwise",
-        "layers.*.self_attn.v_proj": "colwise",
-        "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.mlp.gate_proj": "colwise",
-        "layers.*.mlp.up_proj": "colwise",
-        "layers.*.mlp.down_proj": "rowwise",
-    }
-    base_model_pp_plan = {
-        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
-        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
-        "norm": (["hidden_states"], ["hidden_states"]),
-    }
-    base_config_key = "text_config"
-
-    def __init__(
-        self,
-        attention_dropout: float = 0.0,
-        bos_token_id: int = 151643,
-        eos_token_id: int = 151643,
-        pad_token_id: int = -1,
-        hidden_act: str = "silu",
-        hidden_size: int = 1024,
-        initializer_range: float = 0.02,
-        intermediate_size: int = 2816,
-        max_position_embeddings: int = 32768,
-        num_attention_heads: int = 16,
-        num_hidden_layers: int = 24,
-        num_key_value_heads: int = 16,
-        rms_norm_eps: float = 1e-06,
-        rope_theta: float = 1000000.0,
-        rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
-        sliding_window: int = 32768,
-        tie_word_embeddings: bool = True,
-        use_cache: bool = True,
-        vocab_size: int = 151860,
-        layer_types: list[str] | None = None,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
+        self.output_channels = output_channels
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
-        self.sliding_window = sliding_window
-
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-        self.num_key_value_heads = num_key_value_heads
+        self.num_channels = num_channels
+        self.image_size = image_size
+        self.patch_size = patch_size
         self.hidden_act = hidden_act
+        self.layer_norm_eps = layer_norm_eps
+        self.attention_dropout = attention_dropout
         self.initializer_range = initializer_range
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
+        self.qkv_bias = qkv_bias
+        self.use_abs_pos = use_abs_pos
+        self.use_rel_pos = use_rel_pos
+        self.window_size = window_size
+        self.global_attn_indexes = global_attn_indexes
+        self.mlp_dim = mlp_dim
 
-        self.attention_dropout = attention_dropout
 
-        self.layer_types = layer_types
-        if self.layer_types is None:
-            self.layer_types = [
-                "sliding_attention" if self.sliding_window is not None else "full_attention"
-                for i in range(self.num_hidden_layers)
-            ]
-        layer_type_validation(self.layer_types, self.num_hidden_layers)
+@auto_docstring
+class PPChart2TableConfig(PreTrainedConfig):
+    r"""
+    Example:
 
-        self.rope_parameters = rope_parameters
+    ```python
+    >>> from transformers import PPChart2TableForConditionalGeneration, PPChart2TableConfig
 
-        self.rope_theta = rope_theta
-        self.tie_word_embeddings = tie_word_embeddings
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
+    >>> # Initializing a PPChart2Table style configuration
+    >>> configuration = PPChart2TableConfig()
 
+    >>> # Initializing a model from the Qwen2-VL-7B style configuration
+    >>> model = PPChart2TableForConditionalGeneration(configuration)
 
-@auto_docstring(
-    custom_intro="""
-    
-    """
-)
-class PPChart2TableConfig(PreTrainedConfig):
-    r"""
-    vision_config (Optional[Dict], optional, *optional*)::
-        The [PPChart2TableVisionConfig] for the vision sub-model. Defaults to None.
-    text_config (Optional[Dict], optional, *optional*)::
-        The [PPChart2TableTextConfig] for the text sub-model. Defaults to None.
-    image_token_index (Optional[int], optional, *optional*, defaults to 151859)::
-        The index of the image token. Defaults to 151859.
-    image_seq_length (Optional[int], optional, *optional*, defaults to 576)::
-        The sequence length for the image. Defaults to 576.
-    pad_token_id (Optional[int], optional, *optional*, defaults to -1):
-        The index of the padding token. Defaults to -1.
-    net_channels (`int`, *optional*, defaults to 512):
-        Dimensionality of intermediate network channels in the vision backbone.
-    output_channels (`int`, *optional*, defaults to 1024):
-        Dimensionality of intermediate network channels in the vision backbone.
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
 
     model_type = "pp_chart2table"
     attribute_map = {
         "image_token_id": "image_token_index",
     }
-    sub_configs = {"vision_config": PPChart2TableVisionConfig, "text_config": PPChart2TableTextConfig}
+    sub_configs = {"text_config": AutoConfig, "vision_config": PPChart2TableVisionConfig}
 
     def __init__(
         self,
@@ -254,50 +101,48 @@ def __init__(
         text_config: dict | None = None,
         image_token_index: int | None = 151859,
         image_seq_length: int | None = 576,
-        pad_token_id: int | None = -1,
-        net_channels: int | None = 512,
-        output_channels: int | None = 1024,
+        tie_word_embeddings: bool | None = True,
         **kwargs,
     ):
         self.image_token_index = image_token_index
         self.image_seq_length = image_seq_length
-        self.pad_token_id = pad_token_id
-        self.net_channels = net_channels
-        self.output_channels = output_channels
 
         if vision_config is None:
-            vision_config = {}
-        self.vision_config = PPChart2TableVisionConfig(**vision_config)
-
-        if text_config is None:
-            text_config = {}
-        self.text_config = PPChart2TableTextConfig(**text_config)
-
-        text_config_keys = [
-            "attention_dropout",
-            "bos_token_id",
-            "eos_token_id",
-            "hidden_act",
-            "hidden_size",
-            "initializer_range",
-            "intermediate_size",
-            "max_position_embeddings",
-            "num_attention_heads",
-            "num_hidden_layers",
-            "num_key_value_heads",
-            "rms_norm_eps",
-            "rope_theta",
-            "sliding_window",
-            "tie_word_embeddings",
-            "dtype",
-            "use_cache",
-            "vocab_size",
-        ]
-        for key in text_config_keys:
-            if hasattr(self.text_config, key):
-                setattr(self, key, getattr(self.text_config, key))
+            self.vision_config = PPChart2TableVisionConfig()
+        elif isinstance(vision_config, dict):
+            self.vision_config = PPChart2TableVisionConfig(**vision_config)
+        elif isinstance(vision_config, PPChart2TableVisionConfig):
+            self.vision_config = vision_config
+
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config.get("model_type", "qwen2")
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["qwen2"](
+                vocab_size=151860,
+                hidden_size=1024,
+                intermediate_size=2816,
+                num_hidden_layers=24,
+                num_attention_heads=16,
+                num_key_value_heads=16,
+                hidden_act="silu",
+                max_position_embeddings=32768,
+                initializer_range=0.02,
+                rms_norm_eps=1e-6,
+                use_cache=True,
+                tie_word_embeddings=tie_word_embeddings,
+                rope_theta=1000000.0,
+                rope_parameters=None,
+                use_sliding_window=False,
+                sliding_window=4096,
+                max_window_layers=21,
+                attention_dropout=0.0,
+            )
+
+        self.text_config = text_config
+        self.tie_word_embeddings = tie_word_embeddings
 
         super().__init__(**kwargs)
 
 
-__all__ = ["PPChart2TableConfig", "PPChart2TableVisionConfig", "PPChart2TableTextConfig"]
+__all__ = ["PPChart2TableConfig"]
diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
index 3f17cb754904..7bb7de6cc920 100644
--- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
@@ -4,158 +4,22 @@
 #             the file from the modular. If any change should be done, please apply the change to the
 #                          modular_pp_chart2table.py file directly. One of our CI enforces this.
 #                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Optional, Union
+from ...processing_utils import ImagesKwargs
 
-from transformers.feature_extraction_utils import BatchFeature
-from transformers.image_processing_utils import BaseImageProcessor
-from transformers.image_transforms import flip_channel_order, resize, to_channel_dimension_format
-from transformers.image_utils import (
-    ChannelDimension,
-    ImageInput,
-    PILImageResampling,
-    infer_channel_dimension_format,
-    make_flat_list_of_images,
-    to_numpy_array,
-    valid_images,
-    validate_preprocess_arguments,
-)
-from transformers.processing_utils import TensorType
-from transformers.utils import filter_out_non_signature_kwargs
 
-
-class PPChart2TableImageProcessor(BaseImageProcessor):
-    r"""
-    Image processor for the PP-Chart2Table multimodal model, optimized for chart image preprocessing tasks.
-
-    This processor handles the complete preprocessing pipeline for chart images, including resizing, rescaling,
-    normalization, and channel dimension reordering, tailored to the input requirements of the PP-Chart2Table vision encoder.
-
-    Args:
-        do_resize (`bool`, *optional*, defaults to `True`):
-            Whether to resize the input images to the specified `size`.
-        size (`dict[str, int]`, *optional*, defaults to `{"height": 256, "width": 256}`):
-            Dictionary containing the target height and width for resizing. Format: `{"height": int, "width": int}`.
-        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
-            Resampling filter to use when resizing images (e.g., BICUBIC, BILINEAR).
-        do_rescale (`bool`, *optional*, defaults to `True`):
-            Whether to rescale the pixel values from the range [0, 255] to [0, 1] using `rescale_factor`.
-        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
-            Factor to apply for rescaling pixel values (e.g., 1/255 scales 0-255 to 0-1).
-        do_normalize (`bool`, *optional*, defaults to `True`):
-            Whether to normalize the input images using `image_mean` and `image_std`.
-        image_mean (`float` or `list[float]`, *optional*, defaults to `[0.406, 0.456, 0.485]`):
-            Mean values for image normalization (per channel, RGB order).
-        image_std (`float` or `list[float]`, *optional*, defaults to `[0.225, 0.224, 0.229]`):
-            Standard deviation values for image normalization (per channel, RGB order).
-        patch_size (`int`, *optional*, defaults to 16):
-            Size of image patches used by the PP-Chart2Table vision encoder (for alignment with model input).
-        merge_size (`int`, *optional*, defaults to 4):
-            Size factor for merging image patches (specific to PP-Chart2Table's vision processing pipeline).
+class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False):
+    """
+    crop_to_patches (`bool`, *optional*, defaults to `False`):
+        Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
+        `preprocess` method.
+    min_patches (`int`, *optional*, defaults to 1):
+        The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
+        set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method.
+    max_patches (`int`, *optional*, defaults to 12):
+        The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
+        set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
     """
 
-    model_input_names = ["pixel_values"]
-
-    def __init__(
-        self,
-        do_resize: bool = True,
-        size: Optional[dict[str, int]] = None,
-        resample: Optional[PILImageResampling] = PILImageResampling.BICUBIC,
-        do_rescale: bool = True,
-        rescale_factor: Union[int, float] = 1 / 255,
-        do_normalize: bool = True,
-        image_mean: Optional[Union[float, list[float]]] = [0.406, 0.456, 0.485],
-        image_std: Optional[Union[float, list[float]]] = [0.225, 0.224, 0.229],
-        patch_size: int = 16,
-        merge_size: int = 4,
-        **kwargs,
-    ) -> None:
-        super().__init__(**kwargs)
-        size = size if size is not None else {"height": 256, "width": 256}
-
-        self.do_resize = do_resize
-        self.size = size
-        self.do_rescale = do_rescale
-        self.rescale_factor = rescale_factor
-        self.do_normalize = do_normalize
-        self.image_mean = image_mean
-        self.image_std = image_std
-        self.resample = resample
-        self.patch_size = patch_size
-        self.merge_size = merge_size
-
-    @filter_out_non_signature_kwargs()
-    def preprocess(
-        self,
-        images: ImageInput,
-        size: Optional[dict[str, int]] = None,
-        do_resize: Optional[bool] = None,
-        resample: Optional[PILImageResampling] = None,
-        do_rescale: Optional[bool] = None,
-        rescale_factor: Optional[Union[int, float]] = None,
-        do_normalize: Optional[bool] = None,
-        image_mean: Optional[Union[float, list[float]]] = None,
-        image_std: Optional[Union[float, list[float]]] = None,
-        return_tensors: Optional[Union[TensorType, str]] = None,
-        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
-        input_data_format: Optional[Union[str, ChannelDimension]] = None,
-    ) -> BatchFeature:
-        size = self.size if size is None else size
-        do_resize = self.do_resize if do_resize is None else do_resize
-        resample = self.resample if resample is None else resample
-        do_rescale = self.do_rescale if do_rescale is None else do_rescale
-        rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor
-        do_normalize = self.do_normalize if do_normalize is None else do_normalize
-        image_mean = self.image_mean if image_mean is None else image_mean
-        image_std = self.image_std if image_std is None else image_std
-
-        images = make_flat_list_of_images(images)
-
-        validate_preprocess_arguments(
-            do_rescale=do_rescale,
-            rescale_factor=rescale_factor,
-            do_normalize=do_normalize,
-            image_mean=image_mean,
-            image_std=image_std,
-            size=size,
-            do_resize=do_resize,
-            resample=resample,
-        )
-
-        if not valid_images(images):
-            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor")
-
-        # All transformations expect numpy arrays
-        images = [to_numpy_array(image) for image in images]
-        if input_data_format is None:
-            input_data_format = infer_channel_dimension_format(images[0])
-
-        # transformations
-        resize_images = []
-        if do_resize:
-            for image in images:
-                image = resize(
-                    image,
-                    size=(size["height"], size["width"]),
-                    resample=resample,
-                    input_data_format=input_data_format,
-                )
-                resize_images.append(image)
-            images = resize_images
-
-        if do_rescale:
-            images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images]
-
-        if do_normalize:
-            images = [
-                self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
-            ]
-        images = [flip_channel_order(image, input_data_format=input_data_format) for image in images]
-        images = [
-            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
-        ]
-
-        encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
-        return encoded_inputs
-
-
-__all__ = ["PPChart2TableImageProcessor"]
+    crop_to_patches: bool
+    min_patches: int
+    max_patches: int
diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
index 033daab0bf44..5501710d3a48 100644
--- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
@@ -5,9 +5,7 @@
 #                          modular_pp_chart2table.py file directly. One of our CI enforces this.
 #                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 import collections
-from collections.abc import Callable
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -15,20 +13,17 @@
 
 from ... import initialization as init
 from ...activations import ACT2FN
-from ...cache_utils import Cache, DynamicCache
+from ...cache_utils import Cache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
-from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
-from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling, ModelOutput
-from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
-from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...modeling_utils import PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, torch_compilable_check
-from ...utils.generic import maybe_autocast, merge_with_config_defaults
+from ...utils.generic import merge_with_config_defaults
 from ...utils.output_capturing import capture_outputs
-from .configuration_pp_chart2table import PPChart2TableConfig, PPChart2TableTextConfig, PPChart2TableVisionConfig
+from ..auto import AutoModel
+from .configuration_pp_chart2table import PPChart2TableConfig, PPChart2TableVisionConfig
 
 
 class PPChart2TableVisionAttention(nn.Module):
@@ -445,388 +440,6 @@ def forward(
         )
 
 
-class PPChart2TableTextMLP(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-        self.hidden_size = config.hidden_size
-        self.intermediate_size = config.intermediate_size
-        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
-        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
-        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
-        self.act_fn = ACT2FN[config.hidden_act]
-
-    def forward(self, x):
-        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-        return down_proj
-
-
-def rotate_half(x):
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2 :]
-    return torch.cat((-x2, x1), dim=-1)
-
-
-@use_kernel_func_from_hub("rotary_pos_emb")
-def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
-    """Applies Rotary Position Embedding to the query and key tensors.
-
-    Args:
-        q (`torch.Tensor`): The query tensor.
-        k (`torch.Tensor`): The key tensor.
-        cos (`torch.Tensor`): The cosine part of the rotary embedding.
-        sin (`torch.Tensor`): The sine part of the rotary embedding.
-        unsqueeze_dim (`int`, *optional*, defaults to 1):
-            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
-            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
-            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
-            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
-            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
-            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
-    Returns:
-        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
-    """
-    cos = cos.unsqueeze(unsqueeze_dim)
-    sin = sin.unsqueeze(unsqueeze_dim)
-    q_embed = (q * cos) + (rotate_half(q) * sin)
-    k_embed = (k * cos) + (rotate_half(k) * sin)
-    return q_embed, k_embed
-
-
-def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
-    """
-    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
-    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
-    """
-    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
-    if n_rep == 1:
-        return hidden_states
-    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
-    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-
-
-def eager_attention_forward(
-    module: nn.Module,
-    query: torch.Tensor,
-    key: torch.Tensor,
-    value: torch.Tensor,
-    attention_mask: torch.Tensor | None,
-    scaling: float,
-    dropout: float = 0.0,
-    **kwargs: Unpack[TransformersKwargs],
-):
-    key_states = repeat_kv(key, module.num_key_value_groups)
-    value_states = repeat_kv(value, module.num_key_value_groups)
-
-    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
-    if attention_mask is not None:
-        attn_weights = attn_weights + attention_mask
-
-    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
-    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
-    attn_output = torch.matmul(attn_weights, value_states)
-    attn_output = attn_output.transpose(1, 2).contiguous()
-
-    return attn_output, attn_weights
-
-
-@use_kernelized_func(apply_rotary_pos_emb)
-class PPChart2TableTextAttention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper"""
-
-    def __init__(self, config: PPChart2TableTextConfig, layer_idx: int):
-        super().__init__()
-        self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None
-        self.config = config
-        self.layer_idx = layer_idx
-        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
-        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
-        self.scaling = self.head_dim**-0.5
-        self.attention_dropout = config.attention_dropout
-        self.is_causal = True
-        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=True)
-        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
-        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
-        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
-        self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        position_embeddings: tuple[torch.Tensor, torch.Tensor],
-        attention_mask: torch.Tensor | None,
-        past_key_values: Cache | None = None,
-        **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple[torch.Tensor, torch.Tensor | None]:
-        input_shape = hidden_states.shape[:-1]
-        hidden_shape = (*input_shape, -1, self.head_dim)
-
-        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-
-        cos, sin = position_embeddings
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
-
-        if past_key_values is not None:
-            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)
-
-        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
-            self.config._attn_implementation, eager_attention_forward
-        )
-
-        attn_output, attn_weights = attention_interface(
-            self,
-            query_states,
-            key_states,
-            value_states,
-            attention_mask,
-            dropout=0.0 if not self.training else self.attention_dropout,
-            scaling=self.scaling,
-            sliding_window=self.sliding_window,  # main diff with Llama
-            **kwargs,
-        )
-
-        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
-        attn_output = self.o_proj(attn_output)
-        return attn_output, attn_weights
-
-
-@use_kernel_forward_from_hub("RMSNorm")
-class PPChart2TableTextRMSNorm(nn.Module):
-    def __init__(self, hidden_size, eps: float = 1e-6) -> None:
-        """
-        PPChart2TableTextRMSNorm is equivalent to T5LayerNorm
-        """
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(hidden_size))
-        self.variance_epsilon = eps
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        input_dtype = hidden_states.dtype
-        hidden_states = hidden_states.to(torch.float32)
-        variance = hidden_states.pow(2).mean(-1, keepdim=True)
-        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-        return self.weight * hidden_states.to(input_dtype)
-
-    def extra_repr(self):
-        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
-
-
-class PPChart2TableTextDecoderLayer(GradientCheckpointingLayer):
-    def __init__(self, config: PPChart2TableTextConfig, layer_idx: int):
-        super().__init__()
-        self.hidden_size = config.hidden_size
-
-        self.self_attn = PPChart2TableTextAttention(config=config, layer_idx=layer_idx)
-
-        self.mlp = PPChart2TableTextMLP(config)
-        self.input_layernorm = PPChart2TableTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = PPChart2TableTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.attention_type = config.layer_types[layer_idx]
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: torch.Tensor | None = None,
-        position_ids: torch.LongTensor | None = None,
-        past_key_values: Cache | None = None,
-        use_cache: bool | None = False,
-        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
-        **kwargs: Unpack[TransformersKwargs],
-    ) -> torch.Tensor:
-        residual = hidden_states
-        hidden_states = self.input_layernorm(hidden_states)
-        # Self Attention
-        hidden_states, _ = self.self_attn(
-            hidden_states=hidden_states,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            use_cache=use_cache,
-            position_embeddings=position_embeddings,
-            **kwargs,
-        )
-        hidden_states = residual + hidden_states
-
-        # Fully Connected
-        residual = hidden_states
-        hidden_states = self.post_attention_layernorm(hidden_states)
-        hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + hidden_states
-        return hidden_states
-
-
-@auto_docstring
-class PPChart2TableTextPreTrainedModel(PreTrainedModel):
-    config: PPChart2TableTextConfig
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["PPChart2TableTextDecoderLayer"]
-    _skip_keys_device_placement = ["past_key_values"]
-    _supports_flash_attn = True
-    _supports_sdpa = True
-    _supports_flex_attn = True
-
-    _can_compile_fullgraph = True
-    _supports_attention_backend = True
-    _can_record_outputs = {
-        "hidden_states": PPChart2TableTextDecoderLayer,
-        "attentions": PPChart2TableTextAttention,
-    }
-
-
-class PPChart2TableTextRotaryEmbedding(nn.Module):
-    inv_freq: torch.Tensor  # fix linting for `register_buffer`
-
-    def __init__(self, config: PPChart2TableTextConfig, device=None):
-        super().__init__()
-        self.max_seq_len_cached = config.max_position_embeddings
-        self.original_max_seq_len = config.max_position_embeddings
-
-        self.config = config
-
-        self.rope_type = self.config.rope_parameters["rope_type"]
-        rope_init_fn: Callable = self.compute_default_rope_parameters
-        if self.rope_type != "default":
-            rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
-        inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
-
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
-
-    @staticmethod
-    def compute_default_rope_parameters(
-        config: PPChart2TableTextConfig | None = None,
-        device: Optional["torch.device"] = None,
-        seq_len: int | None = None,
-    ) -> tuple["torch.Tensor", float]:
-        """
-        Computes the inverse frequencies according to the original RoPE implementation
-        Args:
-            config ([`~transformers.PreTrainedConfig`]):
-                The model configuration.
-            device (`torch.device`):
-                The device to use for initialization of the inverse frequencies.
-            seq_len (`int`, *optional*):
-                The current sequence length. Unused for this type of RoPE.
-        Returns:
-            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
-            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
-        """
-        base = config.rope_parameters["rope_theta"]
-        dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
-
-        attention_factor = 1.0  # Unused in this type of RoPE
-
-        # Compute the inverse frequencies
-        inv_freq = 1.0 / (
-            base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
-        )
-        return inv_freq, attention_factor
-
-    @torch.no_grad()
-    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
-    def forward(self, x, position_ids):
-        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
-        position_ids_expanded = position_ids[:, None, :].float()
-
-        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
-            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
-            emb = torch.cat((freqs, freqs), dim=-1)
-            cos = emb.cos() * self.attention_scaling
-            sin = emb.sin() * self.attention_scaling
-
-        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
-
-
-@auto_docstring
-class PPChart2TableTextModel(PPChart2TableTextPreTrainedModel):
-    def __init__(self, config: PPChart2TableTextConfig):
-        super().__init__(config)
-        self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
-
-        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
-        self.layers = nn.ModuleList(
-            [PPChart2TableTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
-        )
-        self.norm = PPChart2TableTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.rotary_emb = PPChart2TableTextRotaryEmbedding(config=config)
-        self.gradient_checkpointing = False
-        self.has_sliding_layers = "sliding_attention" in self.config.layer_types
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    @merge_with_config_defaults
-    @capture_outputs
-    @auto_docstring
-    def forward(
-        self,
-        input_ids: torch.LongTensor | None = None,
-        attention_mask: torch.Tensor | None = None,
-        position_ids: torch.LongTensor | None = None,
-        past_key_values: Cache | None = None,
-        inputs_embeds: torch.FloatTensor | None = None,
-        use_cache: bool | None = None,
-        **kwargs: Unpack[TransformersKwargs],
-    ) -> BaseModelOutputWithPast:
-        if (input_ids is None) ^ (inputs_embeds is not None):
-            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
-
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids)
-
-        if use_cache and past_key_values is None:
-            past_key_values = DynamicCache(config=self.config)
-
-        if position_ids is None:
-            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-            position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens
-            position_ids = position_ids.unsqueeze(0)
-
-        # It may already have been prepared by e.g. `generate`
-        if not isinstance(causal_mask_mapping := attention_mask, dict):
-            # Prepare mask arguments
-            mask_kwargs = {
-                "config": self.config,
-                "inputs_embeds": inputs_embeds,
-                "attention_mask": attention_mask,
-                "past_key_values": past_key_values,
-                "position_ids": position_ids,
-            }
-            # Create the masks
-            causal_mask_mapping = {
-                "full_attention": create_causal_mask(**mask_kwargs),
-            }
-            # The sliding window alternating layers are not always activated depending on the config
-            if self.has_sliding_layers:
-                causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)
-
-        hidden_states = inputs_embeds
-        position_embeddings = self.rotary_emb(hidden_states, position_ids)
-
-        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
-            hidden_states = decoder_layer(
-                hidden_states,
-                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
-                position_embeddings=position_embeddings,
-                position_ids=position_ids,
-                past_key_values=past_key_values,
-                use_cache=use_cache,
-                **kwargs,
-            )
-
-        hidden_states = self.norm(hidden_states)
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=past_key_values if use_cache else None,
-        )
-
-
 @dataclass
 class PPChart2TableModelOutputWithPast(BaseModelOutputWithPast):
     r"""
@@ -879,22 +492,25 @@ def __init__(self, config: PPChart2TableConfig):
         super().__init__(config)
         self.vision_tower = PPChart2TableVisionEncoder(config.vision_config)
         self.multi_modal_projector = nn.Linear(config.output_channels, config.text_config.hidden_size)
-        self.language_model = PPChart2TableTextModel._from_config(config.text_config)
+        self.language_model = AutoModel.from_config(config.text_config)
         self.vision_downsample1 = nn.Conv2d(
-            config.vision_config.output_channels, config.net_channels, kernel_size=3, stride=2, padding=1, bias=False
+            config.vision_config.output_channels,
+            config.vision_hidden_channels,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            bias=False,
         )
         self.vision_downsample2 = nn.Conv2d(
-            config.net_channels, config.output_channels, kernel_size=3, stride=2, padding=1, bias=False
+            config.vision_hidden_channels, config.output_channels, kernel_size=3, stride=2, padding=1, bias=False
         )
         self.post_init()
 
     def get_input_embeddings(self):
-        """Get input embeddings from the text decoder (for weight tying/loading)."""
-        return self.language_model.embed_tokens
+        return self.language_model.get_input_embeddings()
 
     def set_input_embeddings(self, value):
-        """Set input embeddings for the text decoder (for weight tying/loading)."""
-        self.language_model.embed_tokens = value
+        self.language_model.set_input_embeddings(value)
 
     @can_return_tuple
     @auto_docstring(
@@ -1201,7 +817,6 @@ def prepare_inputs_for_generation(
     "PPChart2TableModel",
     "PPChart2TablePreTrainedModel",
     "PPChart2TableTextPreTrainedModel",
-    "PPChart2TableTextModel",
     "PPChart2TableVisionPreTrainedModel",
     "PPChart2TableVisionModel",
 ]
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index bfc376d0a87a..5aac8fe62ccf 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -5,10 +5,10 @@
 import torch.nn as nn
 import torchvision.transforms.v2.functional as tvF
 
-from ...configuration_utils import PreTrainedConfig, layer_type_validation
 from ...feature_extraction_utils import BatchFeature
 from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
-from ...modeling_rope_utils import RopeParameters
+
+from ..got_ocr2.configuration_got_ocr2 import GotOcr2Config
 from ..got_ocr2.modeling_got_ocr2 import (
     GotOcr2ModelOutputWithPast,
     GotOcr2Model,
@@ -16,15 +16,8 @@
     GotOcr2ForConditionalGeneration,
     GotOcr2VisionEncoder,
 )
-from ..qwen2.modeling_qwen2 import (
-    Qwen2Model,
-    Qwen2PreTrainedModel,
-)
-from ...utils import (
-    auto_docstring,
-    logging,
-    TransformersKwargs,
-)
+
+from ...utils import TransformersKwargs, auto_docstring, logging
 from ...modeling_outputs import BaseModelOutputWithPooling
 from ...processing_utils import ProcessorMixin, TensorType, Unpack
 
@@ -33,298 +26,13 @@
 logger = logging.get_logger(__name__)
 
 
-@auto_docstring(checkpoint="PaddlePaddle/PP-Chart2Table_safetensors",)
-class PPChart2TableVisionConfig(PreTrainedConfig):
-    """
-    num_hidden_layers (`int`, *optional*, defaults to 12):
-        Number of transformer encoder layers in the vision backbone.
-    hidden_size (`int`, *optional*, defaults to 768):
-        Dimensionality of the patch embedding vectors.
-    num_channels (`int`, *optional*, defaults to 3):
-        Number of input channels (3 for RGB images, 1 for grayscale).
-    image_size (`int`, *optional*, defaults to 1024):
-        Size (height/width) of the input images (assumed to be square).
-    num_attention_heads (`int`, *optional*, defaults to 12):
-        Number of attention heads for each transformer encoder layer.
-    patch_size (`int`, *optional*, defaults to 16):
-        Size (height/width) of the image patches extracted from the input image.
-    qkv_bias (`bool`, *optional*, defaults to `True`):
-        Whether to include bias terms in the query, key, value projection layers of self-attention.
-    use_rel_pos (`bool`, *optional*, defaults to `True`):
-        Whether to use relative positional embeddings in the self-attention mechanism.
-    global_attn_indexes (`Optional[list[int]]`, *optional*, defaults to [2, 5, 8, 11]):
-        List of layer indexes where global attention (instead of window attention) is applied.
-        If `None`, defaults to [2, 5, 8, 11].
-    window_size (`int`, *optional*, defaults to 14):
-        Size of the attention window for window-based self-attention (only effective when use_rel_pos=True).
-    output_channels (`int`, *optional*, defaults to 256):
-        Dimensionality of the final visual feature output channels.
-    attention_dropout (`float`, *optional*, defaults to 0.0):
-        Dropout probability applied to the attention weights.
-    """
-
-    model_type = "pp_chart2table_vision"
-    base_config_key = "vision_config"
-
-    def __init__(
-        self,
-        num_hidden_layers=12,
-        hidden_size=768,
-        output_channels=256,
-        mlp_dim=3072,
-        num_channels=3,
-        image_size=1024,
-        num_attention_heads=12,
-        patch_size=16,
-        qkv_bias=True,
-        use_rel_pos=True,
-        use_abs_pos=True,
-        global_attn_indexes=[2, 5, 8, 11],
-        window_size=14,
-        attention_dropout=0.0,
-        **kwargs,
-    ):
-        self.num_hidden_layers = num_hidden_layers
-        self.hidden_size = hidden_size
-        self.mlp_dim=mlp_dim
-        self.image_size = image_size
-        self.num_channels = num_channels
-        self.num_attention_heads = num_attention_heads
-        self.patch_size = patch_size
-        self.qkv_bias = qkv_bias
-        self.use_rel_pos = use_rel_pos
-        self.use_abs_pos = use_abs_pos
-        self.global_attn_indexes = global_attn_indexes
-        self.window_size = window_size
-        self.output_channels = output_channels
-        self.attention_dropout = attention_dropout
-        super().__init__(**kwargs)
-
-
-@auto_docstring(
-    custom_intro="""
-    
-    """,
-
-)
-class PPChart2TableTextConfig(PreTrainedConfig):
-    r"""
-    attention_dropout (`float`, *optional*, defaults to 0.0):
-        The dropout ratio for the attention probabilities in self-attention layers.
-    bos_token_id (`int`, *optional*, defaults to 151643):
-        The token ID representing the beginning of a sequence (BOS) for text generation.
-    eos_token_id (`int`, *optional*, defaults to 151643):
-        The token ID representing the end of a sequence (EOS) for text generation.
-    pad_token_id (Optional[int], optional, *optional*, defaults to -1):
-        The index of the padding token. Defaults to -1.
-    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-        The non-linear activation function (function or string) in the feed-forward and attention layers of the decoder.
-    hidden_size (`int`, *optional*, defaults to 1024):
-        Dimensionality of the hidden representations in the Transformer decoder layers.
-    initializer_range (`float`, *optional*, defaults to 0.02):
-        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-    intermediate_size (`int`, *optional*, defaults to 2816):
-        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer decoder blocks.
-    max_position_embeddings (`int`, *optional*, defaults to 32768):
-        The maximum sequence length that this model might ever be used with for text input/output.
-    num_attention_heads (`int`, *optional*, defaults to 16):
-        Number of attention heads for each self-attention layer in the Transformer decoder.
-    num_hidden_layers (`int`, *optional*, defaults to 24):
-        Number of hidden layers in the Transformer decoder.
-    num_key_value_heads (`int`, *optional*, defaults to 16):
-        Number of key/value heads for implementing Grouped Query Attention (GQA). If equal to `num_attention_heads`,
-        Multi Head Attention (MHA) is used; if 1, Multi Query Attention (MQA) is used. For more details, see
-        [this paper](https://huggingface.co/papers/2305.13245).
-    rms_norm_eps (`float`, *optional*, defaults to 1e-06):
-        The epsilon value used by the RMS normalization layers to avoid division by zero.
-    rope_theta (`float`, *optional*, defaults to 1000000.0):
-        The base period of the RoPE (Rotary Position Embedding) embeddings, controlling the frequency of positional encoding.
-    rope_parameters (`RopeParameters` or `dict`, *optional*):
-        Configuration parameters for RoPE embeddings, including scaling parameters for longer sequence lengths beyond
-        `max_position_embeddings`.
-    sliding_window (`int`, *optional*, defaults to 32768):
-        Window size for Sliding Window Attention (SWA) in the decoder layers (only active if `use_sliding_window=True`).
-    tie_word_embeddings (`bool`, *optional*, defaults to `True`):
-        Whether the model's input and output word embeddings should be tied (shared weights).
-    use_cache (`bool`, *optional*, defaults to `True`):
-        Whether to return the last key/value attention states to speed up sequential decoding (only relevant for autoregressive
-        generation).
-    vocab_size (`int`, *optional*, defaults to 151860):
-        Vocabulary size of the PPChart2TableText model. Defines the number of distinct tokens that can be represented
-        by `input_ids`.
-    layer_types (`list[str]`, *optional*):
-        Attention pattern for each decoder layer (e.g., `"full_attention"` or `"sliding_attention"`). If not specified,
-        automatically determined by `sliding_window`.
-    """
-
-    model_type = "pp_chart2table_text"
-    keys_to_ignore_at_inference = ["past_key_values"]
-
-    # Default tensor parallel plan for base model `PPChart2TableText`
-    base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "colwise",
-        "layers.*.self_attn.k_proj": "colwise",
-        "layers.*.self_attn.v_proj": "colwise",
-        "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.mlp.gate_proj": "colwise",
-        "layers.*.mlp.up_proj": "colwise",
-        "layers.*.mlp.down_proj": "rowwise",
-    }
-    base_model_pp_plan = {
-        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
-        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
-        "norm": (["hidden_states"], ["hidden_states"]),
-    }
-    base_config_key = "text_config"
-
-    def __init__(
-        self,
-        attention_dropout: float = 0.0,
-        bos_token_id: int = 151643,
-        eos_token_id: int = 151643,
-        pad_token_id: int = -1,
-        hidden_act: str = "silu",
-        hidden_size: int = 1024,
-        initializer_range: float = 0.02,
-        intermediate_size: int = 2816,
-        max_position_embeddings: int = 32768,
-        num_attention_heads: int = 16,
-        num_hidden_layers: int = 24,
-        num_key_value_heads: int = 16,
-        rms_norm_eps: float = 1e-06,
-        rope_theta: float = 1000000.0,
-        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
-        sliding_window: int = 32768,
-        tie_word_embeddings: bool = True,
-        use_cache: bool = True,
-        vocab_size: int = 151860,
-        layer_types: Optional[list[str]] = None,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.sliding_window = sliding_window
-
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-        self.num_key_value_heads = num_key_value_heads
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
-
-        self.attention_dropout = attention_dropout
-
-        self.layer_types = layer_types
-        if self.layer_types is None:
-            self.layer_types = [
-                "sliding_attention" if self.sliding_window is not None else "full_attention"
-                for i in range(self.num_hidden_layers)
-            ]
-        layer_type_validation(self.layer_types, self.num_hidden_layers)
-
-        self.rope_parameters = rope_parameters
-
-        self.rope_theta = rope_theta
-        self.tie_word_embeddings = tie_word_embeddings
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
-
-
-@auto_docstring(
-    custom_intro="""
-    
-    """
-)
-class PPChart2TableConfig(PreTrainedConfig):
-    r"""
-    vision_config (Optional[Dict], optional, *optional*)::
-        The [PPChart2TableVisionConfig] for the vision sub-model. Defaults to None.
-    text_config (Optional[Dict], optional, *optional*)::
-        The [PPChart2TableTextConfig] for the text sub-model. Defaults to None.
-    image_token_index (Optional[int], optional, *optional*, defaults to 151859)::
-        The index of the image token. Defaults to 151859.
-    image_seq_length (Optional[int], optional, *optional*, defaults to 576)::
-        The sequence length for the image. Defaults to 576.
-    pad_token_id (Optional[int], optional, *optional*, defaults to -1):
-        The index of the padding token. Defaults to -1.
-    net_channels (`int`, *optional*, defaults to 512):
-        Dimensionality of intermediate network channels in the vision backbone.
-    output_channels (`int`, *optional*, defaults to 1024):
-        Dimensionality of intermediate network channels in the vision backbone.
-    """
-
-    model_type = "pp_chart2table"
-    attribute_map = {
-        "image_token_id": "image_token_index",
-    }
-    sub_configs = {"vision_config": PPChart2TableVisionConfig, "text_config": PPChart2TableTextConfig}
-
-    def __init__(
-        self,
-        vision_config: dict | None = None,
-        text_config: dict | None = None,
-        image_token_index: Optional[int] = 151859,
-        image_seq_length: Optional[int] = 576,
-        pad_token_id: Optional[int] = -1,
-        net_channels: Optional[int] = 512,
-        output_channels: Optional[int] = 1024,
-        **kwargs,
-    ):
-        self.image_token_index = image_token_index
-        self.image_seq_length = image_seq_length
-        self.pad_token_id = pad_token_id
-        self.net_channels = net_channels
-        self.output_channels = output_channels
-
-        if vision_config is None:
-            vision_config = {}
-        self.vision_config = PPChart2TableVisionConfig(**vision_config)
-
-        if text_config is None:
-            text_config = {}
-        self.text_config = PPChart2TableTextConfig(**text_config)
-
-        text_config_keys = [
-            "attention_dropout",
-            "bos_token_id",
-            "eos_token_id",
-            "hidden_act",
-            "hidden_size",
-            "initializer_range",
-            "intermediate_size",
-            "max_position_embeddings",
-            "num_attention_heads",
-            "num_hidden_layers",
-            "num_key_value_heads",
-            "rms_norm_eps",
-            "rope_theta",
-            "sliding_window",
-            "tie_word_embeddings",
-            "dtype",
-            "use_cache",
-            "vocab_size",
-        ]
-        for key in text_config_keys:
-            if hasattr(self.text_config, key):
-                setattr(self, key, getattr(self.text_config, key))
-
-        super().__init__(**kwargs)
+@auto_docstring
+class PPChart2TableConfig(GotOcr2Config):
+    pass
 
 
 @auto_docstring
 class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
-
     resample = 3
     image_mean = [0.40821073, 0.4578275, 0.48145466]
     image_std = [0.27577711, 0.26130258, 0.26862954]
@@ -384,6 +92,14 @@ class PPChart2TableProcessor(ProcessorMixin):
 
     def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
         super().__init__(image_processor, tokenizer, chat_template=chat_template)
+        self.message_start_token = "<|im_start|>"
+        self.message_end_token = "<|im_end|>"
+        self.img_start_token = "<img>"
+        self.img_end_token = "</img>"
+        self.img_pad_token = "<imgpad>"
+        self.image_token = "<imgpad>"  # keep the above for BC, but we need to call it `image_token`
+        self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
+        self.system_query = "system\nYou should follow the instructions carefully and explain your answers in detail."
 
     def __call__(
         self,
@@ -395,18 +111,31 @@ def __call__(
             image_inputs = self.image_processor(images=images, return_tensors="pt")
         else:
             image_inputs = {}
-        img_cnt = len(image_inputs)
+        image_count = len(image_inputs)
         _, _, height, _ = image_inputs["pixel_values"].shape
         num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size
-        prompt = (
-            "<|im_start|>system\n"
-            "You should follow the instructions carefully and explain your answers in detail.<|im_end|><|im_start|>user\n"
-            "<img>" + "<imgpad>" * (num_patches * num_patches) + "</img>\n"
-            "Chart to table<|im_end|><|im_start|>assistant\n"
-        )
-        input_ids = torch.tensor(self.tokenizer([prompt]).input_ids)
-        input_ids = input_ids.repeat(img_cnt, 1)
-        input_ids = {"input_ids": input_ids}
+        
+        input_ids = {"input_ids": None}
+        if text == None:
+            query = "Chart to table"
+            prompt = (
+                self.message_start_token
+                + self.system_query
+                + self.message_end_token
+                + self.message_start_token
+                + "user\n"
+                + self.img_start_token
+                + self.img_pad_token * num_patches * num_patches
+                + self.img_end_token
+                + "\n"
+                + query
+                + self.message_end_token
+                + self.message_start_token
+                + "assistant\n"
+            )
+            input_ids = torch.tensor(self.tokenizer([prompt]).input_ids)
+            input_ids = input_ids.repeat(image_count, 1)
+            input_ids = {"input_ids": input_ids}
         return BatchFeature(data={**input_ids, **image_inputs})
 
     def postprocess(self, model_pred, **kwargs):
@@ -425,16 +154,6 @@ class PPChart2TableVisionEncoder(GotOcr2VisionEncoder, PPChart2TableVisionPreTra
     pass
 
 
-
-@auto_docstring
-class PPChart2TableTextPreTrainedModel(Qwen2PreTrainedModel):
-    pass
-
-
-class PPChart2TableTextModel(Qwen2Model):
-    pass
-
-
 @dataclass
 class PPChart2TableModelOutputWithPast(GotOcr2ModelOutputWithPast):
     pass
@@ -445,22 +164,13 @@ class PPChart2TableModel(GotOcr2Model):
 
     def __init__(self, config: PPChart2TableConfig):
         super().__init__(config)
-        self.vision_downsample1 = nn.Conv2d(config.vision_config.output_channels, config.net_channels, kernel_size=3, stride=2, padding=1, bias=False)
-        self.vision_downsample2 = nn.Conv2d(config.net_channels, config.output_channels, kernel_size=3, stride=2, padding=1, bias=False)
-        self.language_model = PPChart2TableTextModel._from_config(config.text_config)
+        self.vision_downsample1 = nn.Conv2d(config.vision_config.output_channels, config.vision_hidden_channels, kernel_size=3, stride=2, padding=1, bias=False)
+        self.vision_downsample2 = nn.Conv2d(config.vision_hidden_channels, config.output_channels, kernel_size=3, stride=2, padding=1, bias=False)
         self.multi_modal_projector = nn.Linear(config.output_channels, config.text_config.hidden_size)
 
         # Initialize weights and apply final processing
         self.post_init()
 
-    def get_input_embeddings(self):
-        """Get input embeddings from the text decoder (for weight tying/loading)."""
-        return self.language_model.embed_tokens
-
-    def set_input_embeddings(self, value):
-        """Set input embeddings for the text decoder (for weight tying/loading)."""
-        self.language_model.embed_tokens = value
-
     def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
@@ -492,11 +202,8 @@ class PPChart2TableForConditionalGeneration(GotOcr2ForConditionalGeneration):
     "PPChart2TablePreTrainedModel",
     "PPChart2TableConfig",
     "PPChart2TableTextPreTrainedModel",
-    "PPChart2TableTextModel",
     "PPChart2TableVisionPreTrainedModel",
     "PPChart2TableVisionModel",
-    "PPChart2TableVisionConfig",
-    "PPChart2TableTextConfig",
     "PPChart2TableImageProcessorFast",
     "PPChart2TableProcessor",
-]
+]
\ No newline at end of file
diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
index 518fcb645770..b4f492fb6191 100644
--- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
@@ -19,6 +19,14 @@ class PPChart2TableProcessor(ProcessorMixin):
 
     def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
         super().__init__(image_processor, tokenizer, chat_template=chat_template)
+        self.message_start_token = "<|im_start|>"
+        self.message_end_token = "<|im_end|>"
+        self.img_start_token = "<img>"
+        self.img_end_token = "</img>"
+        self.img_pad_token = "<imgpad>"
+        self.image_token = "<imgpad>"  # keep the above for BC, but we need to call it `image_token`
+        self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
+        self.system_query = "system\nYou should follow the instructions carefully and explain your answers in detail."
 
     def __call__(
         self,
@@ -30,18 +38,31 @@ def __call__(
             image_inputs = self.image_processor(images=images, return_tensors="pt")
         else:
             image_inputs = {}
-        img_cnt = len(image_inputs)
+        image_count = len(image_inputs)
         _, _, height, _ = image_inputs["pixel_values"].shape
         num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size
-        prompt = (
-            "<|im_start|>system\n"
-            "You should follow the instructions carefully and explain your answers in detail.<|im_end|><|im_start|>user\n"
-            "<img>" + "<imgpad>" * (num_patches * num_patches) + "</img>\n"
-            "Chart to table<|im_end|><|im_start|>assistant\n"
-        )
-        input_ids = torch.tensor(self.tokenizer([prompt]).input_ids)
-        input_ids = input_ids.repeat(img_cnt, 1)
-        input_ids = {"input_ids": input_ids}
+
+        input_ids = {"input_ids": None}
+        if text == None:
+            query = "Chart to table"
+            prompt = (
+                self.message_start_token
+                + self.system_query
+                + self.message_end_token
+                + self.message_start_token
+                + "user\n"
+                + self.img_start_token
+                + self.img_pad_token * num_patches * num_patches
+                + self.img_end_token
+                + "\n"
+                + query
+                + self.message_end_token
+                + self.message_start_token
+                + "assistant\n"
+            )
+            input_ids = torch.tensor(self.tokenizer([prompt]).input_ids)
+            input_ids = input_ids.repeat(image_count, 1)
+            input_ids = {"input_ids": input_ids}
         return BatchFeature(data={**input_ids, **image_inputs})
 
     def postprocess(self, model_pred, **kwargs):

From 4abb70d4bd10f9b1dfde45b41b4b58b0c1426a97 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Fri, 13 Mar 2026 18:18:37 +0800
Subject: [PATCH 14/60] update

---
 docs/source/en/model_doc/pp_chart2table.md    |  63 +--
 .../models/auto/image_processing_auto.py      |   2 +-
 .../models/pp_chart2table/__init__.py         |   3 +-
 .../configuration_pp_chart2table.py           |  30 +-
 .../image_processing_pp_chart2table_fast.py   |  16 +-
 .../pp_chart2table/modeling_pp_chart2table.py |  23 +-
 .../pp_chart2table/modular_pp_chart2table.py  |  91 ++--
 .../processing_pp_chart2table.py              |  55 +-
 .../test_modeling_pp_chart2table.py           | 485 +++++++-----------
 9 files changed, 371 insertions(+), 397 deletions(-)

diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md
index ad00bec20f84..339746ae6cc9 100644
--- a/docs/source/en/model_doc/pp_chart2table.md
+++ b/docs/source/en/model_doc/pp_chart2table.md
@@ -1,4 +1,4 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+<!--Copyright 2026 The HuggingFace Team. All rights reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@@ -42,7 +42,11 @@ import requests
 from PIL import Image
 from transformers import pipeline
 model_path = "PaddlePaddle/PP-Chart2Table_safetensors"
-pipe = pipeline("image-text-to-text", model=model_path)
+pipe = pipeline(
+    task="image-text-to-text", 
+    model=model_path,
+    device_map="auto",
+)
 image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw)
 result = pipe(
     images=image, 
@@ -64,14 +68,19 @@ from PIL import Image
 from transformers import AutoModelForImageTextToText, AutoProcessor
 
 model_path = "PaddlePaddle/PP-Chart2Table_safetensors"
-model = AutoModelForImageTextToText.from_pretrained(model_path, dtype="float32").to("cuda")
-processor = AutoProcessor.from_pretrained(model_path)
+model = AutoModelForImageTextToText.from_pretrained(
+    model_path, 
+    dtype="float32",
+    device_map="auto",
+)
+processor = AutoProcessor.from_pretrained(model_path, use_fast=True)
 
 image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw)
 inputs = processor(images=image).to(model.device)
 
-outputs = model.generate(**inputs, do_sample=False, max_new_tokens=256)
-result = processor.postprocess(outputs)
+generated_ids = model.generate(**inputs, use_cache=True, do_sample=False, max_new_tokens=256)
+generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+result = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
 print(result)
 
 ```
@@ -91,7 +100,11 @@ import requests
 from transformers import pipeline
 from PIL import Image
 model_path = "PaddlePaddle/PP-Chart2Table_safetensors"
-pipe = pipeline("image-text-to-text", model=model_path)
+pipe = pipeline(
+    task="image-text-to-text", 
+    model=model_path,
+    device_map="auto",
+)
 image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw)
 result = pipe(
     images=[image, image],
@@ -112,14 +125,19 @@ from PIL import Image
 from transformers import AutoModelForImageTextToText, AutoProcessor
 
 model_path = "PaddlePaddle/PP-Chart2Table_safetensors"
-model = AutoModelForImageTextToText.from_pretrained(model_path, dtype="float32").to("cuda")
+model = AutoModelForImageTextToText.from_pretrained(
+    model_path, 
+    dtype="float32",
+    device_map="auto",
+)
 processor = AutoProcessor.from_pretrained(model_path)
 
 image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw)
 inputs = processor(images=[image, image]).to(model.device)
 
-outputs = model.generate(**inputs, do_sample=False, max_new_tokens=256)
-result = processor.postprocess(outputs)
+generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=256)
+generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+result = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
 print(result)
 ```
 
@@ -129,33 +147,11 @@ print(result)
 ## PPChart2TableForConditionalGeneration
 
 [[autodoc]] PPChart2TableForConditionalGeneration
-    - forward
 
 ## PPChart2TableConfig
 
 [[autodoc]] PPChart2TableConfig
 
-## PPChart2TableVisionConfig
-
-[[autodoc]] PPChart2TableVisionConfig
-
-## PPChart2TableTextConfig
-
-[[autodoc]] PPChart2TableTextConfig
-
-## PPChart2TableTextModel
-
-[[autodoc]] PPChart2TableTextModel
-    - forward
-
-## PPChart2TableVisionModel
-
-[[autodoc]] PPChart2TableVisionModel
-
-## PPChart2TableImageProcessor
-
-[[autodoc]] PPChart2TableImageProcessor
-
 ## PPChart2TableImageProcessorFast
 
 [[autodoc]] PPChart2TableImageProcessorFast
@@ -168,6 +164,3 @@ print(result)
 
 [[autodoc]] PPChart2TableProcessor
 
-## PPChart2TableVisionTransformer
-
-[[autodoc]] PPChart2TableVisionTransformer
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index 597a43227893..a67a3b3d5b9f 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -169,7 +169,7 @@
             ("pixio", ("BitImageProcessor", "BitImageProcessorFast")),
             ("pixtral", ("PixtralImageProcessor", "PixtralImageProcessorFast")),
             ("poolformer", ("PoolFormerImageProcessor", "PoolFormerImageProcessorFast")),
-            ("pp_chart2table", ("PPChart2TableImageProcessor", "PPChart2TableImageProcessorFast")),
+            ("pp_chart2table", (None, "PPChart2TableImageProcessorFast")),
             ("pp_doclayout_v2", (None, "PPDocLayoutV2ImageProcessorFast")),
             ("pp_doclayout_v3", (None, "PPDocLayoutV3ImageProcessorFast")),
             ("prompt_depth_anything", ("PromptDepthAnythingImageProcessor", "PromptDepthAnythingImageProcessorFast")),
diff --git a/src/transformers/models/pp_chart2table/__init__.py b/src/transformers/models/pp_chart2table/__init__.py
index a471ebfb2830..178169b3c50d 100644
--- a/src/transformers/models/pp_chart2table/__init__.py
+++ b/src/transformers/models/pp_chart2table/__init__.py
@@ -1,5 +1,4 @@
-# coding=utf-8
-# Copyright 2025 the HuggingFace Team. All rights reserved.
+# Copyright 2026 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
index f1c819a96417..48d4ea664b9d 100644
--- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
@@ -4,6 +4,20 @@
 #             the file from the modular. If any change should be done, please apply the change to the
 #                          modular_pp_chart2table.py file directly. One of our CI enforces this.
 #                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2026 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 
 from ...configuration_utils import PreTrainedConfig
 from ...utils import auto_docstring
@@ -71,7 +85,17 @@ def __init__(
         self.mlp_dim = mlp_dim
 
 
-@auto_docstring
+@auto_docstring(
+    checkpoint="PaddlePaddle/PP-Chart2Table_safetensors",
+    custom_args=r"""
+    output_channels (`int`, *optional*, defaults to 1024):
+        Dimensionality of the output channels from the vision encoder. This is the final channel count
+        after the vision downsample layers, which is then projected to the text model hidden size.
+    vision_hidden_channels (`int`, *optional*, defaults to 512):
+        Dimensionality of the intermediate hidden channels in the vision encoder. This is the channel
+        count between the first and second downsample layers.
+    """,
+)
 class PPChart2TableConfig(PreTrainedConfig):
     r"""
     Example:
@@ -102,8 +126,12 @@ def __init__(
         image_token_index: int | None = 151859,
         image_seq_length: int | None = 576,
         tie_word_embeddings: bool | None = True,
+        output_channels: int = 1024,
+        vision_hidden_channels: int = 512,
         **kwargs,
     ):
+        self.output_channels = output_channels
+        self.vision_hidden_channels = vision_hidden_channels
         self.image_token_index = image_token_index
         self.image_seq_length = image_seq_length
 
diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
index 5021229fe168..afcd5208f819 100644
--- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
+++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
@@ -4,10 +4,24 @@
 #             the file from the modular. If any change should be done, please apply the change to the
 #                          modular_pp_chart2table.py file directly. One of our CI enforces this.
 #                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2026 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
 from typing import Optional
 
 import torch
-import torchvision.transforms.v2.functional as tvF
 
 from ...feature_extraction_utils import BatchFeature
 from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
index 5501710d3a48..fe35758b945e 100644
--- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
@@ -4,6 +4,20 @@
 #             the file from the modular. If any change should be done, please apply the change to the
 #                          modular_pp_chart2table.py file directly. One of our CI enforces this.
 #                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2026 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import collections
 from dataclasses import dataclass
 
@@ -812,11 +826,4 @@ def prepare_inputs_for_generation(
         return model_inputs
 
 
-__all__ = [
-    "PPChart2TableForConditionalGeneration",
-    "PPChart2TableModel",
-    "PPChart2TablePreTrainedModel",
-    "PPChart2TableTextPreTrainedModel",
-    "PPChart2TableVisionPreTrainedModel",
-    "PPChart2TableVisionModel",
-]
+__all__ = ["PPChart2TableForConditionalGeneration", "PPChart2TableModel", "PPChart2TableVisionPreTrainedModel"]
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index 5aac8fe62ccf..40134ddc4624 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -1,9 +1,22 @@
+# Copyright 2026 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from dataclasses import dataclass
-from typing import Optional
 
+from typing import Optional
 import torch
 import torch.nn as nn
-import torchvision.transforms.v2.functional as tvF
 
 from ...feature_extraction_utils import BatchFeature
 from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
@@ -19,16 +32,36 @@
 
 from ...utils import TransformersKwargs, auto_docstring, logging
 from ...modeling_outputs import BaseModelOutputWithPooling
-from ...processing_utils import ProcessorMixin, TensorType, Unpack
+from ...processing_utils import ProcessorMixin, TensorType, Unpack, ProcessingKwargs
 
-from ...image_utils import SizeDict
+from ...image_utils import SizeDict, ImageInput
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
 
 logger = logging.get_logger(__name__)
 
 
-@auto_docstring
+@auto_docstring(
+    checkpoint="PaddlePaddle/PP-Chart2Table_safetensors",
+    custom_args=r"""
+    output_channels (`int`, *optional*, defaults to 1024):
+        Dimensionality of the output channels from the vision encoder. This is the final channel count
+        after the vision downsample layers, which is then projected to the text model hidden size.
+    vision_hidden_channels (`int`, *optional*, defaults to 512):
+        Dimensionality of the intermediate hidden channels in the vision encoder. This is the channel
+        count between the first and second downsample layers.
+    """
+)
 class PPChart2TableConfig(GotOcr2Config):
-    pass
+    def __init__(
+        self,
+        output_channels: int = 1024,
+        vision_hidden_channels: int = 512,
+        **super_kwargs,
+    ):
+
+        self.output_channels = output_channels
+        self.vision_hidden_channels = vision_hidden_channels
+        super().__init__()
 
 
 @auto_docstring
@@ -58,7 +91,6 @@ def _preprocess(
         return_tensors: str | TensorType | None,
         **kwargs,
     ) -> BatchFeature:
-
         grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
         resized_images_grouped = {}
         for shape, stacked_images in grouped_images.items():
@@ -92,31 +124,31 @@ class PPChart2TableProcessor(ProcessorMixin):
 
     def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
         super().__init__(image_processor, tokenizer, chat_template=chat_template)
-        self.message_start_token = "<|im_start|>"
-        self.message_end_token = "<|im_end|>"
-        self.img_start_token = "<img>"
-        self.img_end_token = "</img>"
-        self.img_pad_token = "<imgpad>"
-        self.image_token = "<imgpad>"  # keep the above for BC, but we need to call it `image_token`
-        self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
+
+        self.message_start_token = tokenizer.message_start_token
+        self.message_end_token = tokenizer.message_end_token
+        self.img_start_token = tokenizer.img_start_token
+        self.img_end_token = tokenizer.img_end_token
+        self.img_pad_token = tokenizer.img_pad_token
+        self.image_token = tokenizer.image_token
         self.system_query = "system\nYou should follow the instructions carefully and explain your answers in detail."
 
     def __call__(
         self,
-        images,
-        text=None,
-        **kwargs,
+        images: ImageInput = None,
+        text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
+        **kwargs: Unpack[ProcessingKwargs],
     ) -> BatchFeature:
         if images is not None:
             image_inputs = self.image_processor(images=images, return_tensors="pt")
         else:
             image_inputs = {}
-        image_count = len(image_inputs)
-        _, _, height, _ = image_inputs["pixel_values"].shape
+
+        batch_size, _, height, _ = image_inputs["pixel_values"].shape
         num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size
         
         input_ids = {"input_ids": None}
-        if text == None:
+        if text is None:
             query = "Chart to table"
             prompt = (
                 self.message_start_token
@@ -134,20 +166,16 @@ def __call__(
                 + "assistant\n"
             )
             input_ids = torch.tensor(self.tokenizer([prompt]).input_ids)
-            input_ids = input_ids.repeat(image_count, 1)
+            input_ids = input_ids.repeat(batch_size, 1)
             input_ids = {"input_ids": input_ids}
-        return BatchFeature(data={**input_ids, **image_inputs})
+        else:
+            raise ValueError("PPChart2Table processor does not support text inputs")
 
-    def postprocess(self, model_pred, **kwargs):
-        return self.tokenizer.batch_decode(
-            model_pred[0],
-            skip_special_tokens=kwargs.get("skip_special_tokens", True),
-            clean_up_tokenization_spaces=False,
-        )
+        return BatchFeature(data={**input_ids, **image_inputs})
 
 
 class PPChart2TableVisionPreTrainedModel(GotOcr2PreTrainedModel):
-    input_modalities = ("image", "text")
+    pass
 
 
 class PPChart2TableVisionEncoder(GotOcr2VisionEncoder, PPChart2TableVisionPreTrainedModel):
@@ -176,7 +204,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple | BaseModelOutputWithPooling:
-        
+
         image_output = self.vision_tower(pixel_values)
         last_hidden_state = image_output.last_hidden_state
         last_hidden_state = self.vision_downsample1(last_hidden_state)
@@ -199,11 +227,8 @@ class PPChart2TableForConditionalGeneration(GotOcr2ForConditionalGeneration):
 __all__ = [
     "PPChart2TableForConditionalGeneration",
     "PPChart2TableModel",
-    "PPChart2TablePreTrainedModel",
     "PPChart2TableConfig",
-    "PPChart2TableTextPreTrainedModel",
     "PPChart2TableVisionPreTrainedModel",
-    "PPChart2TableVisionModel",
     "PPChart2TableImageProcessorFast",
     "PPChart2TableProcessor",
 ]
\ No newline at end of file
diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
index b4f492fb6191..f385517ee13c 100644
--- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
@@ -4,11 +4,26 @@
 #             the file from the modular. If any change should be done, please apply the change to the
 #                          modular_pp_chart2table.py file directly. One of our CI enforces this.
 #                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2026 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 import torch
 
 from ...feature_extraction_utils import BatchFeature
-from ...processing_utils import ProcessorMixin
+from ...image_utils import ImageInput
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
 from ...utils import auto_docstring
 
 
@@ -19,31 +34,31 @@ class PPChart2TableProcessor(ProcessorMixin):
 
     def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
         super().__init__(image_processor, tokenizer, chat_template=chat_template)
-        self.message_start_token = "<|im_start|>"
-        self.message_end_token = "<|im_end|>"
-        self.img_start_token = "<img>"
-        self.img_end_token = "</img>"
-        self.img_pad_token = "<imgpad>"
-        self.image_token = "<imgpad>"  # keep the above for BC, but we need to call it `image_token`
-        self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
+
+        self.message_start_token = tokenizer.message_start_token
+        self.message_end_token = tokenizer.message_end_token
+        self.img_start_token = tokenizer.img_start_token
+        self.img_end_token = tokenizer.img_end_token
+        self.img_pad_token = tokenizer.img_pad_token
+        self.image_token = tokenizer.image_token
         self.system_query = "system\nYou should follow the instructions carefully and explain your answers in detail."
 
     def __call__(
         self,
-        images,
-        text=None,
-        **kwargs,
+        images: ImageInput = None,
+        text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
+        **kwargs: Unpack[ProcessingKwargs],
     ) -> BatchFeature:
         if images is not None:
             image_inputs = self.image_processor(images=images, return_tensors="pt")
         else:
             image_inputs = {}
-        image_count = len(image_inputs)
-        _, _, height, _ = image_inputs["pixel_values"].shape
+
+        batch_size, _, height, _ = image_inputs["pixel_values"].shape
         num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size
 
         input_ids = {"input_ids": None}
-        if text == None:
+        if text is None:
             query = "Chart to table"
             prompt = (
                 self.message_start_token
@@ -61,16 +76,12 @@ def __call__(
                 + "assistant\n"
             )
             input_ids = torch.tensor(self.tokenizer([prompt]).input_ids)
-            input_ids = input_ids.repeat(image_count, 1)
+            input_ids = input_ids.repeat(batch_size, 1)
             input_ids = {"input_ids": input_ids}
-        return BatchFeature(data={**input_ids, **image_inputs})
+        else:
+            raise ValueError("PPChart2Table processor does not support text inputs")
 
-    def postprocess(self, model_pred, **kwargs):
-        return self.tokenizer.batch_decode(
-            model_pred[0],
-            skip_special_tokens=kwargs.get("skip_special_tokens", True),
-            clean_up_tokenization_spaces=False,
-        )
+        return BatchFeature(data={**input_ids, **image_inputs})
 
 
 __all__ = ["PPChart2TableProcessor"]
diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
index 28bac6ef8cea..2f094467c5e9 100644
--- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
@@ -1,4 +1,4 @@
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,155 +13,124 @@
 # limitations under the License.
 """Testing suite for the PPChart2Table model."""
 
-import gc
 import unittest
 
-import requests
-
-import pytest
-from parameterized import parameterized
-from PIL import Image
-
 from transformers import (
     AutoProcessor,
     PPChart2TableConfig,
-    PPChart2TableForConditionalGeneration,
     is_torch_available,
+    is_vision_available,
 )
-from transformers.testing_utils import (
-    backend_empty_cache,
-    require_torch,
-    slow,
-    torch_device,
-)
+from transformers.testing_utils import cleanup, require_torch, slow, torch_device
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import (
-    ModelTesterMixin,
-)
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
 if is_torch_available():
     import torch
 
+    from transformers import (
+        PPChart2TableForConditionalGeneration,
+        PPChart2TableModel,
+    )
+
+
+if is_vision_available():
+    from transformers.image_utils import load_image
+
 
 class PPChart2TableVisionText2TextModelTester:
     def __init__(
         self,
         parent,
-        batch_size=1,
-        seq_length=31,
+        batch_size=3,
+        seq_length=7,
         num_channels=3,
-        image_height=64,
-        image_width=64,
+        ignore_index=-100,
+        image_size=64,
+        image_token_index=1,
+        model_type="pp_chart2table",
+        is_training=False,
+        output_channels=1024,
+        vision_hidden_channels=1024,
         text_config={
-            "hidden_size": 32,
-            "hidden_act": "silu",
+            "model_type": "qwen2",
+            "vocab_size": 99,
+            "hidden_size": 128,
+            "intermediate_size": 37,
             "num_hidden_layers": 2,
             "num_attention_heads": 4,
             "num_key_value_heads": 2,
-            "intermediate_size": 32,
-            "attention_dropout": 0.0,
-            "sliding_window": 32768,
-            "rms_norm_eps": 1e-06,
-            "vocab_size": 151860,
-            "max_position_embeddings": 32768,
-            "rope_parameters": {"rope_theta": 1000000.0, "rope_type": "default"},
+            "output_channels": 64,
+            "hidden_act": "silu",
+            "max_position_embeddings": 512,
+            "rope_theta": 10000,
+            "mlp_ratio": 4,
+            "tie_word_embeddings": True,
+            "bos_token_id": 2,
+            "eos_token_id": 3,
+            "pad_token_id": 4,
         },
-        is_training=False,
         vision_config={
-            "depth": 2,
-            "hidden_size": 144,
-            "output_channels": 192,
-            "hidden_act": "gelu",
-            "image_size": 64,
-            "num_channels": 3,
-            "mlp_ratio": 4.0,
-            "norm_layer_eps": 1e-6,
+            "num_hidden_layers": 2,
+            "output_channels": 64,
+            "hidden_act": "quick_gelu",
+            "hidden_size": 32,
+            "mlp_dim": 128,
             "num_attention_heads": 4,
-            "patch_size": 16,
-            "qkv_bias": True,
-            "use_rel_pos": True,
-            "global_attn_indexes": [2, 5, 8, 11],
-            "window_size": 14,
-            "neck_channels": 48,
-            "net_channels": 96,
-            "attention_dropout": 0.0,
+            "patch_size": 2,
+            "image_size": 64,
         },
-        bos_token_id=151643,
-        eos_token_id=151643,
-        im_start_token=151857,
-        im_end_token=151858,
-        im_patch_token=151859,
     ):
         self.parent = parent
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-        self.num_hidden_layers = text_config["num_hidden_layers"]
-        self.num_attention_heads = text_config["num_attention_heads"]
-        self.hidden_size = text_config["hidden_size"]
-        self.im_start_token = im_start_token
-        self.im_end_token = im_end_token
-        self.im_patch_token = im_patch_token
+        self.ignore_index = ignore_index
+        self.bos_token_id = text_config["bos_token_id"]
+        self.eos_token_id = text_config["eos_token_id"]
+        self.pad_token_id = text_config["pad_token_id"]
+        self.image_token_index = image_token_index
+        self.model_type = model_type
         self.text_config = text_config
         self.vision_config = vision_config
         self.batch_size = batch_size
-        self.seq_length = seq_length
         self.num_channels = num_channels
-        self.image_height = image_height
-        self.image_width = image_width
+        self.image_size = image_size
         self.is_training = is_training
+        self.num_image_tokens = 64
+        self.seq_length = seq_length + self.num_image_tokens
+        self.output_channels = output_channels
+        self.vision_hidden_channels = vision_hidden_channels
+
+        self.num_hidden_layers = text_config["num_hidden_layers"]
         self.vocab_size = text_config["vocab_size"]
+        self.hidden_size = text_config["hidden_size"]
+        self.num_attention_heads = text_config["num_attention_heads"]
 
     def get_config(self):
         return PPChart2TableConfig(
             text_config=self.text_config,
             vision_config=self.vision_config,
+            model_type=self.model_type,
+            image_token_index=self.image_token_index,
         )
 
     def prepare_config_and_inputs(self):
         config = self.get_config()
-        pixel_values = torch.randn((1, 3, self.image_height, self.image_width)).to(torch_device)
-
-        num_patch = self.image_height // 16 // 4
-        input = (
-            [
-                151644,
-                8948,
-                198,
-                2610,
-                1265,
-                1795,
-                279,
-                11221,
-                15516,
-                323,
-                10339,
-                697,
-                11253,
-                304,
-                7716,
-                13,
-                151645,
-                151644,
-                872,
-                198,
-                151857,
-            ]
-            + [151859] * (num_patch * num_patch)
-            + [151858, 198, 14488, 311, 1965, 151645, 151644, 77091, 198]
-        )
+        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
 
-        input_ids = torch.tensor(input).unsqueeze(0).to(torch_device)
-
-        return config, pixel_values, input_ids
+        return config, pixel_values
 
     def prepare_config_and_inputs_for_common(self):
         config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values, input_ids = config_and_inputs
+        config, pixel_values = config_and_inputs
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
         attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
+
+        input_ids[input_ids == self.image_token_index] = self.pad_token_id
+        input_ids[:, : self.num_image_tokens] = self.image_token_index
+
         inputs_dict = {
             "pixel_values": pixel_values,
             "input_ids": input_ids,
@@ -172,11 +141,21 @@ def prepare_config_and_inputs_for_common(self):
 
 @require_torch
 class PPChart2TableModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (PPChart2TableForConditionalGeneration,) if is_torch_available() else ()
-    pipeline_model_mapping = {"image-text-to-text": PPChart2TableForConditionalGeneration}
-
-    _is_composite = True
-    test_resize_embeddings = False
+    all_model_classes = (
+        (
+            PPChart2TableModel,
+            PPChart2TableForConditionalGeneration,
+        )
+        if is_torch_available()
+        else ()
+    )
+    pipeline_model_mapping = (
+        {
+            "image-text-to-text": PPChart2TableForConditionalGeneration,
+        }
+        if is_torch_available()
+        else {}
+    )
 
     def setUp(self):
         self.model_tester = PPChart2TableVisionText2TextModelTester(self)
@@ -185,214 +164,132 @@ def setUp(self):
     def test_config(self):
         self.config_tester.run_common_tests()
 
-    @unittest.skip(reason="PPChart2Table does not support.")
-    def test_sliding_window_mask(self):
-        pass
-
-    @unittest.skip(reason="PPChart2Table does not support.")
-    def test_generate_compile_model_forward_fullgraph(self):
+    @unittest.skip(reason="PPChart2Table does not support")
+    def test_get_image_features_attentions(self):
         pass
 
-    @unittest.skip(reason="PPChart2Table does not support.")
-    def test_multi_gpu_data_parallel_forward(self):
+    @unittest.skip(reason="PPChart2Table does not support")
+    def test_get_image_features_hidden_states(self):
         pass
 
-    @pytest.mark.generate
-    @unittest.skip(reason="PPChart2Table does not support beam search.")
-    def test_beam_sample_generate(self):
-        pass
-
-    @pytest.mark.generate
-    @unittest.skip(reason="PPChart2Table does not support beam search.")
-    def test_beam_search_generate(self):
-        pass
-
-    @pytest.mark.generate
-    @unittest.skip(reason="PPChart2Table does not support beam search.")
-    def test_beam_search_generate_dict_output(self):
-        pass
-
-    @pytest.mark.generate
-    @unittest.skip(reason="PPChart2Table does not support beam search.")
-    def test_beam_search_generate_dict_outputs_use_cache(self):
-        pass
-
-    @pytest.mark.generate
-    @unittest.skip(reason="PPChart2Table does not support beam search.")
-    def test_beam_sample_generate_dict_output(self):
-        pass
-
-    @unittest.skip(reason="PPChart2Table needs to apply weight conversions.")
-    def test_can_load_from_already_mapped_keys(self):
-        pass
-
-    @pytest.mark.generate
-    @unittest.skip(reason="PPChart2Table does not support beam search.")
-    def test_generate_from_inputs_embeds_1_beam_search(self, _, num_beams):
-        pass
-
-    @parameterized.expand([("random",), ("same",)])
-    @pytest.mark.generate
-    @unittest.skip(reason="PPChart2Table does not support assisted decoding.")
-    def test_assisted_decoding_matches_greedy_search(self, assistant_type):
-        pass
-
-    @pytest.mark.generate
-    @unittest.skip(reason="PPChart2Table does not support assisted decoding.")
-    def test_assisted_decoding_sample(self):
-        pass
-
-    @unittest.skip("PPChart2Table does not support this test.")
+    @unittest.skip(reason="PPChart2Table does not support this test.")
     def test_model_is_small(self):
         pass
 
 
 @require_torch
-@slow
 class PPChart2TableIntegrationTest(unittest.TestCase):
     def setUp(self):
-        self.processor = AutoProcessor.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table")
+        self.processor = AutoProcessor.from_pretrained("PaddlePaddle/PPChart2Table_safetensors")
 
     def tearDown(self):
-        gc.collect()
-        backend_empty_cache(torch_device)
-
-    def test_small_model_integration_test(self):
-        model = PPChart2TableForConditionalGeneration.from_pretrained(
-            "/workspace/model_weight_torch/PP-Chart2Table", dtype="float32"
-        ).to("cuda")
-
-        image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw)
-
-        inputs = self.processor(images=image).to(model.device)
-        breakpoint()
-        expected_input_ids_length = 286
-        assert expected_input_ids_length == len(inputs.input_ids[0])
-
-        expected_input_ids = [151644, 8948, 198, 2610, 1265, 1795, 279, 11221, 15516, 323]
-
-        assert expected_input_ids == inputs.input_ids[0].tolist()[:10]
-
-        expected_pixel_slice = torch.tensor(
-            [
-                [1.0000, 1.0000, 1.0000],
-                [1.0000, 1.0000, 1.0000],
-                [0.9922, 0.9922, 0.9922],
-                [1.0000, 1.0000, 1.0000],
-                [1.0000, 1.0000, 1.0000],
-            ],
-            dtype=torch.float32,
-            device="cpu",
+        cleanup(torch_device, gc_collect=True)
+
+    @slow
+    def test_small_model_integration_test_got_ocr_stop_strings(self):
+        model_id = "stepfun-ai/GOT-OCR-2.0-hf"
+        model = PPChart2TableForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/fixtures_ocr/resolve/main/iam_picture.jpeg"
+        )
+
+        inputs = self.processor(image, return_tensors="pt").to(torch_device)
+        generate_ids = model.generate(
+            **inputs,
+            do_sample=False,
+            num_beams=1,
+            tokenizer=self.processor.tokenizer,
+            stop_strings="<|im_end|>",
+            max_new_tokens=4096,
+        )
+        decoded_output = self.processor.decode(
+            generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
+        )
+        expected_output = "industre"
+        self.assertEqual(decoded_output, expected_output)
+
+    @slow
+    def test_small_model_integration_test_got_ocr_format(self):
+        model_id = "PaddlePaddle/PPChart2Table_safetensors"
+        model = PPChart2TableForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/image_ocr.jpg"
         )
 
-        assert torch.allclose(expected_pixel_slice, inputs.pixel_values[:5, :, 0, 0], atol=3e-3)
+        inputs = self.processor(image, return_tensors="pt", format=True).to(torch_device)
+        generate_ids = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=4)
+        decoded_output = self.processor.decode(
+            generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
+        )
+        expected_output = "\\title{\nR"
+        self.assertEqual(decoded_output, expected_output)
+
+    @slow
+    def test_small_model_integration_test_got_ocr_fine_grained(self):
+        model_id = "PaddlePaddle/PPChart2Table_safetensors"
+        model = PPChart2TableForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png"
+        )
 
-        # verify generation
-        inputs = inputs.to(torch_device)
-        output = model.generate(**inputs, max_new_tokens=30)
-        result = self.processor.decode(output[0][inputs["input_ids"].shape[-1] : -1])
+        inputs = self.processor(image, return_tensors="pt", color="green").to(torch_device)
+        generate_ids = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=4)
+        decoded_output = self.processor.decode(
+            generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
+        )
+        expected_output = "You should keep in"
+        self.assertEqual(decoded_output, expected_output)
+
+    @slow
+    def test_small_model_integration_test_got_ocr_crop_to_patches(self):
+        model_id = "PaddlePaddle/PPChart2Table_safetensors"
+        model = PPChart2TableForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/one_column.png"
+        )
 
-        EXPECTED_DECODED_TEXT = "生甘草"
+        inputs = self.processor(image, return_tensors="pt", crop_to_patches=True).to(torch_device)
+        generate_ids = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=4)
+        decoded_output = self.processor.decode(
+            generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
+        )
+        expected_output = "on developing architectural improvements"
+        self.assertEqual(decoded_output, expected_output)
+
+    @slow
+    def test_small_model_integration_test_got_ocr_multi_pages(self):
+        model_id = "PaddlePaddle/PPChart2Table_safetensors"
+        model = PPChart2TableForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
+        image1 = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/one_column.png"
+        )
+        image2 = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png"
+        )
 
-        self.assertEqual(
-            result,
-            EXPECTED_DECODED_TEXT,
+        inputs = self.processor([image1, image2], return_tensors="pt", multi_page=True).to(torch_device)
+        generate_ids = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=4)
+        decoded_output = self.processor.decode(
+            generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
+        )
+        expected_output = "on developing architectural improvements"
+        self.assertEqual(decoded_output, expected_output)
+
+    @slow
+    def test_small_model_integration_test_got_ocr_batched(self):
+        model_id = "PaddlePaddle/PPChart2Table_safetensors"
+        model = PPChart2TableForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
+        image1 = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png"
+        )
+        image2 = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/image_ocr.jpg"
         )
 
-    # def test_small_model_integration_test_batch(self):
-    #     model = (
-    #         PPChart2TableForConditionalGeneration.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table", dtype="bfloat16")
-    #         .to(torch_device)
-    #         .eval()
-    #     )
-
-    #     image = Image.open("/workspace/PaddleX/paddlex/inference/models/doc_vlm/modeling/chart_parsing_02.png").convert("RGB")
-    #     inputs = self.processor(images=image).to(model.device)
-
-    #     output = model.generate(**inputs, max_new_tokens=256)
-    #     generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, output)]
-    #     result = self.processor.batch_decode(
-    #         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    #     )
-
-    #     EXPECTED_DECODED_TEXT = ["生甘草", "生甘草"]
-
-    #     self.assertEqual(
-    #         result,
-    #         EXPECTED_DECODED_TEXT,
-    #     )
-
-    # @require_flash_attn
-    # @require_torch_accelerator
-    # @pytest.mark.flash_attn_test
-    # def test_small_model_integration_test_flashatt2(self):
-    #     model = (
-    #         PPChart2TableForConditionalGeneration.from_pretrained(
-    #             "/workspace/model_weight_torch/PP-Chart2Table", dtype="bfloat16", attn_implementation="flash_attention_2"
-    #         )
-    #         .to(torch_device)
-    #         .eval()
-    #     )
-
-    #     image = Image.open("/workspace/PaddleX/paddlex/inference/models/doc_vlm/modeling/chart_parsing_02.png").convert("RGB")
-    #     inputs = self.processor(images=image).to(model.device)
-
-    #     expected_input_ids_length = 211
-    #     assert expected_input_ids_length == len(inputs.input_ids[0])
-
-    #     expected_input_ids = [100273, 2969, 93963, 93919, 101305, 100295, 100295, 100295, 100295, 100295]  # fmt: skip
-    #     assert expected_input_ids == inputs.input_ids[0].tolist()[:10]
-
-    #     expected_pixel_slice = torch.tensor(
-    #         [
-    #             [1.0000, 1.0000, 1.0000],
-    #             [1.0000, 1.0000, 1.0000],
-    #             [0.9922, 0.9922, 0.9922],
-    #             [1.0000, 1.0000, 1.0000],
-    #             [1.0000, 1.0000, 1.0000],
-    #         ],
-    #         dtype=torch.float32,
-    #         device="cpu",
-    #     )
-    #     assert torch.allclose(expected_pixel_slice, inputs.pixel_values[:5, :, 0, 0], atol=3e-3)
-
-    #     # verify generation
-    #     inputs = inputs.to(torch_device)
-    #     output = model.generate(**inputs, max_new_tokens=30)
-    #     result = self.processor.decode(output[0][inputs["input_ids"].shape[-1] : -1])
-
-    #     EXPECTED_DECODED_TEXT = "生甘草"
-
-    #     self.assertEqual(
-    #         result,
-    #         EXPECTED_DECODED_TEXT,
-    #     )
-
-    # @require_flash_attn
-    # @require_torch_accelerator
-    # @pytest.mark.flash_attn_test
-    # def test_small_model_integration_test_batch_flashatt2(self):
-    #     model = (
-    #         PPChart2TableForConditionalGeneration.from_pretrained(
-    #             "/workspace/model_weight_torch/PP-Chart2Table", dtype="bfloat16", attn_implementation="flash_attention_2"
-    #         )
-    #         .to(torch_device)
-    #         .eval()
-    #     )
-
-    #     image = Image.open("/workspace/PaddleX/paddlex/inference/models/doc_vlm/modeling/chart_parsing_02.png").convert("RGB")
-    #     inputs = self.processor(images=image).to(model.device)
-
-    #     # it should not matter whether two images are the same size or not
-    #     output = model.generate(**inputs, max_new_tokens=30)
-    #     generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, output)]
-    #     result = self.processor.batch_decode(
-    #         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    #     )
-
-    #     EXPECTED_DECODED_TEXT = ["生甘草", "生甘草"]
-
-    #     self.assertEqual(
-    #         result,
-    #         EXPECTED_DECODED_TEXT,
-    #     )
+        inputs = self.processor([image1, image2], return_tensors="pt").to(torch_device)
+        generate_ids = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=4)
+        decoded_output = self.processor.batch_decode(
+            generate_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True
+        )
+        expected_output = ["Reducing the number", "R&D QUALITY"]
+        self.assertEqual(decoded_output, expected_output)

From 1efd48b52988f9109d2384e58641caa1ba5691af Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Fri, 13 Mar 2026 18:23:46 +0800
Subject: [PATCH 15/60] update

---
 .../configuration_pp_chart2table.py           |  1 -
 .../image_processing_pp_chart2table_fast.py   |  2 +-
 .../pp_chart2table/modular_pp_chart2table.py  | 41 +++++++++++--------
 .../processing_pp_chart2table.py              |  1 +
 4 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
index 48d4ea664b9d..d3556596b107 100644
--- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
@@ -18,7 +18,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 from ...configuration_utils import PreTrainedConfig
 from ...utils import auto_docstring
 from ..auto import CONFIG_MAPPING, AutoConfig
diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
index afcd5208f819..a535edf500d5 100644
--- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
+++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
@@ -18,10 +18,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 from typing import Optional
 
 import torch
+import torchvision.transforms.v2.functional as tvF
 
 from ...feature_extraction_utils import BatchFeature
 from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index 40134ddc4624..4113c60584f3 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -13,29 +13,28 @@
 # limitations under the License.
 
 from dataclasses import dataclass
-
 from typing import Optional
+
 import torch
 import torch.nn as nn
+import torchvision.transforms.v2.functional as tvF
 
 from ...feature_extraction_utils import BatchFeature
 from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
-
+from ...image_utils import ImageInput, SizeDict
+from ...modeling_outputs import BaseModelOutputWithPooling
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, TensorType, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import TransformersKwargs, auto_docstring, logging
 from ..got_ocr2.configuration_got_ocr2 import GotOcr2Config
 from ..got_ocr2.modeling_got_ocr2 import (
-    GotOcr2ModelOutputWithPast,
+    GotOcr2ForConditionalGeneration,
     GotOcr2Model,
+    GotOcr2ModelOutputWithPast,
     GotOcr2PreTrainedModel,
-    GotOcr2ForConditionalGeneration,
     GotOcr2VisionEncoder,
 )
 
-from ...utils import TransformersKwargs, auto_docstring, logging
-from ...modeling_outputs import BaseModelOutputWithPooling
-from ...processing_utils import ProcessorMixin, TensorType, Unpack, ProcessingKwargs
-
-from ...image_utils import SizeDict, ImageInput
-from ...tokenization_utils_base import PreTokenizedInput, TextInput
 
 logger = logging.get_logger(__name__)
 
@@ -49,7 +48,7 @@
     vision_hidden_channels (`int`, *optional*, defaults to 512):
         Dimensionality of the intermediate hidden channels in the vision encoder. This is the channel
         count between the first and second downsample layers.
-    """
+    """,
 )
 class PPChart2TableConfig(GotOcr2Config):
     def __init__(
@@ -58,7 +57,6 @@ def __init__(
         vision_hidden_channels: int = 512,
         **super_kwargs,
     ):
-
         self.output_channels = output_channels
         self.vision_hidden_channels = vision_hidden_channels
         super().__init__()
@@ -146,7 +144,7 @@ def __call__(
 
         batch_size, _, height, _ = image_inputs["pixel_values"].shape
         num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size
-        
+
         input_ids = {"input_ids": None}
         if text is None:
             query = "Chart to table"
@@ -189,11 +187,19 @@ class PPChart2TableModelOutputWithPast(GotOcr2ModelOutputWithPast):
 
 @auto_docstring
 class PPChart2TableModel(GotOcr2Model):
-
     def __init__(self, config: PPChart2TableConfig):
         super().__init__(config)
-        self.vision_downsample1 = nn.Conv2d(config.vision_config.output_channels, config.vision_hidden_channels, kernel_size=3, stride=2, padding=1, bias=False)
-        self.vision_downsample2 = nn.Conv2d(config.vision_hidden_channels, config.output_channels, kernel_size=3, stride=2, padding=1, bias=False)
+        self.vision_downsample1 = nn.Conv2d(
+            config.vision_config.output_channels,
+            config.vision_hidden_channels,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            bias=False,
+        )
+        self.vision_downsample2 = nn.Conv2d(
+            config.vision_hidden_channels, config.output_channels, kernel_size=3, stride=2, padding=1, bias=False
+        )
         self.multi_modal_projector = nn.Linear(config.output_channels, config.text_config.hidden_size)
 
         # Initialize weights and apply final processing
@@ -204,7 +210,6 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple | BaseModelOutputWithPooling:
-
         image_output = self.vision_tower(pixel_values)
         last_hidden_state = image_output.last_hidden_state
         last_hidden_state = self.vision_downsample1(last_hidden_state)
@@ -231,4 +236,4 @@ class PPChart2TableForConditionalGeneration(GotOcr2ForConditionalGeneration):
     "PPChart2TableVisionPreTrainedModel",
     "PPChart2TableImageProcessorFast",
     "PPChart2TableProcessor",
-]
\ No newline at end of file
+]
diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
index f385517ee13c..4d0b973f1154 100644
--- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
@@ -18,6 +18,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+
 import torch
 
 from ...feature_extraction_utils import BatchFeature

From 779cbcacc8dbd3769c68410f1b323ee7a1244de7 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Sun, 15 Mar 2026 19:36:22 +0800
Subject: [PATCH 16/60] update

---
 .../image_processing_pp_chart2table_fast.py   |   2 -
 .../pp_chart2table/modular_pp_chart2table.py  |   2 -
 .../test_modeling_pp_chart2table.py           | 101 +++---------------
 3 files changed, 13 insertions(+), 92 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
index a535edf500d5..16ef1976315d 100644
--- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
+++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
@@ -71,8 +71,6 @@ def _preprocess(
             stacked_images = self.rescale_and_normalize(
                 stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
             )
-            # BGR to RGB conversion
-            stacked_images = stacked_images[:, [2, 1, 0], :, :]
             processed_images_grouped[shape] = stacked_images
 
         pixel_values = reorder_images(processed_images_grouped, grouped_images_index)
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index 4113c60584f3..7af04420211f 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -103,8 +103,6 @@ def _preprocess(
             stacked_images = self.rescale_and_normalize(
                 stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
             )
-            # BGR to RGB conversion
-            stacked_images = stacked_images[:, [2, 1, 0], :, :]
             processed_images_grouped[shape] = stacked_images
 
         pixel_values = reorder_images(processed_images_grouped, grouped_images_index)
diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
index 2f094467c5e9..7bec9adf1406 100644
--- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
@@ -180,116 +180,41 @@ def test_model_is_small(self):
 @require_torch
 class PPChart2TableIntegrationTest(unittest.TestCase):
     def setUp(self):
-        self.processor = AutoProcessor.from_pretrained("PaddlePaddle/PPChart2Table_safetensors")
+        self.processor = AutoProcessor.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table")
 
     def tearDown(self):
         cleanup(torch_device, gc_collect=True)
 
     @slow
-    def test_small_model_integration_test_got_ocr_stop_strings(self):
-        model_id = "stepfun-ai/GOT-OCR-2.0-hf"
+    def test_small_model_integration_test_pp_chart2table(self):
+        model_id = "/workspace/model_weight_torch/PP-Chart2Table"
         model = PPChart2TableForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
-        image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/fixtures_ocr/resolve/main/iam_picture.jpeg"
-        )
+        image = load_image("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png")
 
         inputs = self.processor(image, return_tensors="pt").to(torch_device)
         generate_ids = model.generate(
             **inputs,
+            use_cache=True,
             do_sample=False,
-            num_beams=1,
-            tokenizer=self.processor.tokenizer,
-            stop_strings="<|im_end|>",
-            max_new_tokens=4096,
-        )
-        decoded_output = self.processor.decode(
-            generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
-        )
-        expected_output = "industre"
-        self.assertEqual(decoded_output, expected_output)
-
-    @slow
-    def test_small_model_integration_test_got_ocr_format(self):
-        model_id = "PaddlePaddle/PPChart2Table_safetensors"
-        model = PPChart2TableForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
-        image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/image_ocr.jpg"
-        )
-
-        inputs = self.processor(image, return_tensors="pt", format=True).to(torch_device)
-        generate_ids = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=4)
-        decoded_output = self.processor.decode(
-            generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
-        )
-        expected_output = "\\title{\nR"
-        self.assertEqual(decoded_output, expected_output)
-
-    @slow
-    def test_small_model_integration_test_got_ocr_fine_grained(self):
-        model_id = "PaddlePaddle/PPChart2Table_safetensors"
-        model = PPChart2TableForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
-        image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png"
-        )
-
-        inputs = self.processor(image, return_tensors="pt", color="green").to(torch_device)
-        generate_ids = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=4)
-        decoded_output = self.processor.decode(
-            generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
-        )
-        expected_output = "You should keep in"
-        self.assertEqual(decoded_output, expected_output)
-
-    @slow
-    def test_small_model_integration_test_got_ocr_crop_to_patches(self):
-        model_id = "PaddlePaddle/PPChart2Table_safetensors"
-        model = PPChart2TableForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
-        image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/one_column.png"
-        )
-
-        inputs = self.processor(image, return_tensors="pt", crop_to_patches=True).to(torch_device)
-        generate_ids = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=4)
-        decoded_output = self.processor.decode(
-            generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
-        )
-        expected_output = "on developing architectural improvements"
-        self.assertEqual(decoded_output, expected_output)
-
-    @slow
-    def test_small_model_integration_test_got_ocr_multi_pages(self):
-        model_id = "PaddlePaddle/PPChart2Table_safetensors"
-        model = PPChart2TableForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
-        image1 = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/one_column.png"
-        )
-        image2 = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png"
+            max_new_tokens=1024,
         )
-
-        inputs = self.processor([image1, image2], return_tensors="pt", multi_page=True).to(torch_device)
-        generate_ids = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=4)
         decoded_output = self.processor.decode(
             generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
         )
-        expected_output = "on developing architectural improvements"
+        expected_output = "年份 | 单家五星级旅游饭店年平均营收 (百万元) | 单家五星级旅游饭店年平均利润 (百万元)\n2018 | 104.22 | 9.87\n2019 | 99.11 | 7.47\n2020 | 57.87 | -3.87\n2021 | 68.99 | -2.90\n2022 | 56.29 | -9.48\n2023 | 87.99 | 5.96"
         self.assertEqual(decoded_output, expected_output)
 
     @slow
-    def test_small_model_integration_test_got_ocr_batched(self):
-        model_id = "PaddlePaddle/PPChart2Table_safetensors"
+    def test_small_model_integration_test_pp_chart2table_batched(self):
+        model_id = "/workspace/model_weight_torch/PP-Chart2Table"
         model = PPChart2TableForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
-        image1 = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png"
-        )
-        image2 = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/image_ocr.jpg"
-        )
+        image1 = load_image("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png")
+        image2 = load_image("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png")
 
         inputs = self.processor([image1, image2], return_tensors="pt").to(torch_device)
-        generate_ids = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=4)
+        generate_ids = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=6)
         decoded_output = self.processor.batch_decode(
             generate_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True
         )
-        expected_output = ["Reducing the number", "R&D QUALITY"]
+        expected_output = ["年份 | 单家", "年份 | 单家"]
         self.assertEqual(decoded_output, expected_output)

From d61079a79db1e574e7aaa067236ca790418a8de5 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Sun, 15 Mar 2026 19:53:06 +0800
Subject: [PATCH 17/60] update

---
 docs/source/en/model_doc/pp_chart2table.md         | 8 ++++----
 src/transformers/models/pp_chart2table/__init__.py | 1 -
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md
index 339746ae6cc9..ef5a380b4573 100644
--- a/docs/source/en/model_doc/pp_chart2table.md
+++ b/docs/source/en/model_doc/pp_chart2table.md
@@ -73,10 +73,10 @@ model = AutoModelForImageTextToText.from_pretrained(
     dtype="float32",
     device_map="auto",
 )
-processor = AutoProcessor.from_pretrained(model_path, use_fast=True)
+processor = AutoProcessor.from_pretrained(model_path, use_fast=True).to(model.device)
 
 image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw)
-inputs = processor(images=image).to(model.device)
+inputs = processor(images=image)
 
 generated_ids = model.generate(**inputs, use_cache=True, do_sample=False, max_new_tokens=256)
 generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
@@ -130,10 +130,10 @@ model = AutoModelForImageTextToText.from_pretrained(
     dtype="float32",
     device_map="auto",
 )
-processor = AutoProcessor.from_pretrained(model_path)
+processor = AutoProcessor.from_pretrained(model_path).to(model.device)
 
 image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw)
-inputs = processor(images=[image, image]).to(model.device)
+inputs = processor(images=[image, image])
 
 generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=256)
 generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
diff --git a/src/transformers/models/pp_chart2table/__init__.py b/src/transformers/models/pp_chart2table/__init__.py
index 178169b3c50d..82763db99c82 100644
--- a/src/transformers/models/pp_chart2table/__init__.py
+++ b/src/transformers/models/pp_chart2table/__init__.py
@@ -20,7 +20,6 @@
 
 if TYPE_CHECKING:
     from .configuration_pp_chart2table import *
-    from .image_processing_pp_chart2table import *
     from .image_processing_pp_chart2table_fast import *
     from .modeling_pp_chart2table import *
     from .processing_pp_chart2table import *

From 618c63c0f29750624eb5a8897ef89c2d08d8d7c6 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Sun, 15 Mar 2026 20:12:07 +0800
Subject: [PATCH 18/60] update

---
 docs/source/en/_toctree.yml            | 4 ++--
 src/transformers/conversion_mapping.py | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 6b495433df38..e4581c636119 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -1262,12 +1262,12 @@
         title: Pix2Struct
       - local: model_doc/pixtral
         title: Pixtral
-      - local: model_doc/pp_chart2table
-        title: PPChart2Table
       - local: model_doc/pp_doclayout_v2
         title: PP-DocLayoutV2
       - local: model_doc/pp_doclayout_v3
         title: PP-DocLayoutV3
+      - local: model_doc/pp_chart2table
+        title: PPChart2Table
       - local: model_doc/qwen2_5_omni
         title: Qwen2.5-Omni
       - local: model_doc/qwen2_5_vl
diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py
index 24c59125278a..dc4466948220 100755
--- a/src/transformers/conversion_mapping.py
+++ b/src/transformers/conversion_mapping.py
@@ -448,7 +448,6 @@ def register_checkpoint_conversion_mapping(
     "sam3_tracker_video",
     "paddleocrvl",
     "ppchart2table",
-    "ernie4_5_vl_moe",
     # NOTE: Slightly different from `model_type` (to follow naming conventions in vllm/sglang)
     "ernie4_5_vlmoe",
     "ernie4_5_vl_moe",  # BC alias

From 1079052139ad0ba721f9e5dbbc9ca26937a09a17 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Sun, 15 Mar 2026 20:52:54 +0800
Subject: [PATCH 19/60] update

---
 .../image_processing_pp_chart2table_fast.py   | 48 +------------------
 .../pp_chart2table/modeling_pp_chart2table.py |  7 ++-
 .../pp_chart2table/modular_pp_chart2table.py  | 39 +--------------
 3 files changed, 8 insertions(+), 86 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
index 16ef1976315d..1bc21d62080a 100644
--- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
+++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
@@ -18,15 +18,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional
-
-import torch
-import torchvision.transforms.v2.functional as tvF
-
-from ...feature_extraction_utils import BatchFeature
-from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
-from ...image_utils import SizeDict
-from ...processing_utils import TensorType
+from ...image_processing_utils_fast import BaseImageProcessorFast
 from ...utils import auto_docstring
 
 
@@ -42,43 +34,5 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
     do_rescale = True
     do_normalize = True
 
-    def _preprocess(
-        self,
-        images: list["torch.Tensor"],
-        do_resize: bool,
-        size: SizeDict,
-        interpolation: Optional["tvF.InterpolationMode"],
-        do_rescale: bool,
-        rescale_factor: float,
-        do_normalize: bool,
-        image_mean: float | list[float] | None,
-        image_std: float | list[float] | None,
-        disable_grouping: bool | None,
-        return_tensors: str | TensorType | None,
-        **kwargs,
-    ) -> BatchFeature:
-        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
-        resized_images_grouped = {}
-        for shape, stacked_images in grouped_images.items():
-            if do_resize:
-                stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation)
-            resized_images_grouped[shape] = stacked_images
-        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
-
-        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
-        processed_images_grouped = {}
-        for shape, stacked_images in grouped_images.items():
-            stacked_images = self.rescale_and_normalize(
-                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
-            )
-            processed_images_grouped[shape] = stacked_images
-
-        pixel_values = reorder_images(processed_images_grouped, grouped_images_index)
-
-        return BatchFeature(
-            data={"pixel_values": pixel_values},
-            tensor_type=return_tensors,
-        )
-
 
 __all__ = ["PPChart2TableImageProcessorFast"]
diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
index fe35758b945e..76b164ebe4aa 100644
--- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
@@ -826,4 +826,9 @@ def prepare_inputs_for_generation(
         return model_inputs
 
 
-__all__ = ["PPChart2TableForConditionalGeneration", "PPChart2TableModel", "PPChart2TableVisionPreTrainedModel"]
+__all__ = [
+    "PPChart2TableForConditionalGeneration",
+    "PPChart2TableModel",
+    "PPChart2TableVisionPreTrainedModel",
+    "PPChart2TablePreTrainedModel",
+]
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index 7af04420211f..207f7c445cf7 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -74,44 +74,6 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
     do_rescale = True
     do_normalize = True
 
-    def _preprocess(
-        self,
-        images: list["torch.Tensor"],
-        do_resize: bool,
-        size: SizeDict,
-        interpolation: Optional["tvF.InterpolationMode"],
-        do_rescale: bool,
-        rescale_factor: float,
-        do_normalize: bool,
-        image_mean: float | list[float] | None,
-        image_std: float | list[float] | None,
-        disable_grouping: bool | None,
-        return_tensors: str | TensorType | None,
-        **kwargs,
-    ) -> BatchFeature:
-        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
-        resized_images_grouped = {}
-        for shape, stacked_images in grouped_images.items():
-            if do_resize:
-                stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation)
-            resized_images_grouped[shape] = stacked_images
-        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
-
-        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
-        processed_images_grouped = {}
-        for shape, stacked_images in grouped_images.items():
-            stacked_images = self.rescale_and_normalize(
-                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
-            )
-            processed_images_grouped[shape] = stacked_images
-
-        pixel_values = reorder_images(processed_images_grouped, grouped_images_index)
-
-        return BatchFeature(
-            data={"pixel_values": pixel_values},
-            tensor_type=return_tensors,
-        )
-
 
 @auto_docstring
 class PPChart2TableProcessor(ProcessorMixin):
@@ -232,6 +194,7 @@ class PPChart2TableForConditionalGeneration(GotOcr2ForConditionalGeneration):
     "PPChart2TableModel",
     "PPChart2TableConfig",
     "PPChart2TableVisionPreTrainedModel",
+    "PPChart2TablePreTrainedModel",
     "PPChart2TableImageProcessorFast",
     "PPChart2TableProcessor",
 ]

From 974d3b137e2c06ed40d995c01b50dcf76e11e97e Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Mon, 16 Mar 2026 14:38:57 +0800
Subject: [PATCH 20/60] update

---
 docs/source/en/model_doc/pp_chart2table.md      | 17 ++++++++++++-----
 .../image_processing_pp_chart2table_fast.py     |  4 ++--
 .../pp_chart2table/modular_pp_chart2table.py    | 17 ++++++++++-------
 3 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md
index ef5a380b4573..387d27e2b855 100644
--- a/docs/source/en/model_doc/pp_chart2table.md
+++ b/docs/source/en/model_doc/pp_chart2table.md
@@ -148,19 +148,26 @@ print(result)
 
 [[autodoc]] PPChart2TableForConditionalGeneration
 
+## PPChart2TableModel
+
+[[autodoc]] PPChart2TableModel
+
 ## PPChart2TableConfig
 
 [[autodoc]] PPChart2TableConfig
 
-## PPChart2TableImageProcessorFast
+## PPChart2TableVisionPreTrainedModel
 
-[[autodoc]] PPChart2TableImageProcessorFast
+[[autodoc]] PPChart2TableVisionPreTrainedModel
 
-## PPChart2TableModel
+## PPChart2TablePreTrainedModel
 
-[[autodoc]] PPChart2TableModel
+[[autodoc]] PPChart2TablePreTrainedModel
+
+## PPChart2TableImageProcessorFast
+
+[[autodoc]] PPChart2TableImageProcessorFast
 
 ## PPChart2TableProcessor
 
 [[autodoc]] PPChart2TableProcessor
-
diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
index 1bc21d62080a..8bea7ba01d7c 100644
--- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
+++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
@@ -25,8 +25,8 @@
 @auto_docstring
 class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
     resample = 3
-    image_mean = [0.40821073, 0.4578275, 0.48145466]
-    image_std = [0.27577711, 0.26130258, 0.26862954]
+    image_mean = [0.48145466, 0.4578275, 0.40821073]
+    image_std = [0.26862954, 0.26130258, 0.27577711]
     size = {"height": 1024, "width": 1024}
     patch_size = 16
     merge_size = 4
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index 207f7c445cf7..85e3495d0cea 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -13,17 +13,15 @@
 # limitations under the License.
 
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 import torch.nn as nn
-import torchvision.transforms.v2.functional as tvF
 
 from ...feature_extraction_utils import BatchFeature
-from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
-from ...image_utils import ImageInput, SizeDict
+from ...image_processing_utils_fast import BaseImageProcessorFast
+from ...image_utils import ImageInput
 from ...modeling_outputs import BaseModelOutputWithPooling
-from ...processing_utils import ProcessingKwargs, ProcessorMixin, TensorType, Unpack
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
 from ...utils import TransformersKwargs, auto_docstring, logging
 from ..got_ocr2.configuration_got_ocr2 import GotOcr2Config
@@ -65,8 +63,8 @@ def __init__(
 @auto_docstring
 class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
     resample = 3
-    image_mean = [0.40821073, 0.4578275, 0.48145466]
-    image_std = [0.27577711, 0.26130258, 0.26862954]
+    image_mean = [0.48145466, 0.4578275, 0.40821073]
+    image_std = [0.26862954, 0.26130258, 0.27577711]
     size = {"height": 1024, "width": 1024}
     patch_size = 16
     merge_size = 4
@@ -145,6 +143,11 @@ class PPChart2TableModelOutputWithPast(GotOcr2ModelOutputWithPast):
     pass
 
 
+@auto_docstring
+class PPChart2TablePreTrainedModel(GotOcr2PreTrainedModel):
+    pass
+
+
 @auto_docstring
 class PPChart2TableModel(GotOcr2Model):
     def __init__(self, config: PPChart2TableConfig):

From 3f014946e2a3d521a761b76985b1d498a1b1c90a Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Mon, 16 Mar 2026 15:02:38 +0800
Subject: [PATCH 21/60] update

---
 .../pp_chart2table/test_modeling_pp_chart2table.py     | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
index 7bec9adf1406..873d73f20858 100644
--- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
@@ -164,11 +164,11 @@ def setUp(self):
     def test_config(self):
         self.config_tester.run_common_tests()
 
-    @unittest.skip(reason="PPChart2Table does not support")
+    @unittest.skip(reason="PPChart2Table have reused the GotOcr2 model, which does not implement the latest logic for capturing attentions and hidden_states introduced in Transformers v5.")
     def test_get_image_features_attentions(self):
         pass
 
-    @unittest.skip(reason="PPChart2Table does not support")
+    @unittest.skip(reason="PPChart2Table have reused the GotOcr2 model, which does not implement the latest logic for capturing attentions and hidden_states introduced in Transformers v5.")
     def test_get_image_features_hidden_states(self):
         pass
 
@@ -180,14 +180,14 @@ def test_model_is_small(self):
 @require_torch
 class PPChart2TableIntegrationTest(unittest.TestCase):
     def setUp(self):
-        self.processor = AutoProcessor.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table")
+        self.processor = AutoProcessor.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors")
 
     def tearDown(self):
         cleanup(torch_device, gc_collect=True)
 
     @slow
     def test_small_model_integration_test_pp_chart2table(self):
-        model_id = "/workspace/model_weight_torch/PP-Chart2Table"
+        model_id = "PaddlePaddle/PP-Chart2Table_safetensors"
         model = PPChart2TableForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
         image = load_image("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png")
 
@@ -206,7 +206,7 @@ def test_small_model_integration_test_pp_chart2table(self):
 
     @slow
     def test_small_model_integration_test_pp_chart2table_batched(self):
-        model_id = "/workspace/model_weight_torch/PP-Chart2Table"
+        model_id = "PaddlePaddle/PP-Chart2Table_safetensors"
         model = PPChart2TableForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
         image1 = load_image("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png")
         image2 = load_image("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png")

From 3b91e2df3a17349dc8febe0a3ed2f7327797001f Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Mon, 16 Mar 2026 15:15:20 +0800
Subject: [PATCH 22/60] update

---
 .../test_modeling_pp_chart2table.py           | 23 ++++++++-----------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
index 873d73f20858..836de2801907 100644
--- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
@@ -180,19 +180,20 @@ def test_model_is_small(self):
 @require_torch
 class PPChart2TableIntegrationTest(unittest.TestCase):
     def setUp(self):
-        self.processor = AutoProcessor.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors")
+        model_path = "PaddlePaddle/PP-Chart2Table_safetensors"
+        self.model = PPChart2TableForConditionalGeneration.from_pretrained(model_path).to(torch_device)
+        self.processor = AutoProcessor.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors").to(torch_device)
+        url = "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png"
+        self.image = load_image(url)
 
     def tearDown(self):
         cleanup(torch_device, gc_collect=True)
 
     @slow
     def test_small_model_integration_test_pp_chart2table(self):
-        model_id = "PaddlePaddle/PP-Chart2Table_safetensors"
-        model = PPChart2TableForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
-        image = load_image("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png")
-
-        inputs = self.processor(image, return_tensors="pt").to(torch_device)
-        generate_ids = model.generate(
+        
+        inputs = self.processor(self.image, return_tensors="pt").to(torch_device)
+        generate_ids = self.model.generate(
             **inputs,
             use_cache=True,
             do_sample=False,
@@ -206,13 +207,9 @@ def test_small_model_integration_test_pp_chart2table(self):
 
     @slow
     def test_small_model_integration_test_pp_chart2table_batched(self):
-        model_id = "PaddlePaddle/PP-Chart2Table_safetensors"
-        model = PPChart2TableForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
-        image1 = load_image("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png")
-        image2 = load_image("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png")
 
-        inputs = self.processor([image1, image2], return_tensors="pt").to(torch_device)
-        generate_ids = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=6)
+        inputs = self.processor([self.image, self.image], return_tensors="pt").to(torch_device)
+        generate_ids = self.model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=6)
         decoded_output = self.processor.batch_decode(
             generate_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True
         )

From 65b7d01cc399af9bfac82fe384f81172617aef71 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Mon, 16 Mar 2026 15:23:54 +0800
Subject: [PATCH 23/60] update

---
 docs/source/en/model_doc/pp_chart2table.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md
index 387d27e2b855..f2b0296d8e66 100644
--- a/docs/source/en/model_doc/pp_chart2table.md
+++ b/docs/source/en/model_doc/pp_chart2table.md
@@ -13,6 +13,7 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.
 
 -->
+*This model was released on {release_date} and added to Hugging Face Transformers on 2026-03-16.*
 
 # PP-Chart2Table
 

From cc85b83a9a1913717eba52bed6caad0e28041449 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Mon, 16 Mar 2026 17:57:32 +0800
Subject: [PATCH 24/60] update

---
 .../pp_chart2table/modeling_pp_chart2table.py | 28 -------------------
 .../test_modeling_pp_chart2table.py           | 10 ++++---
 2 files changed, 6 insertions(+), 32 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
index 76b164ebe4aa..a9f48a6cf6a1 100644
--- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
@@ -578,18 +578,8 @@ def forward(
         past_key_values: Cache | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         use_cache: bool | None = None,
-        output_attentions: bool | None = None,
-        output_hidden_states: bool | None = None,
-        return_dict: bool | None = None,
-        cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple | PPChart2TableModelOutputWithPast:
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
 
@@ -612,10 +602,7 @@ def forward(
             past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
             return_dict=True,
-            cache_position=cache_position,
             **kwargs,
         )
 
@@ -706,10 +693,6 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         use_cache: bool | None = None,
-        output_attentions: bool | None = None,
-        output_hidden_states: bool | None = None,
-        return_dict: bool | None = None,
-        cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple | PPChart2TableCausalLMOutputWithPast:
@@ -749,12 +732,6 @@ def forward(
         "You should keep in mind what features from the module should be used, especially
         when you're planning to sell a template."
         ```"""
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
         outputs = self.model(
             input_ids=input_ids,
             pixel_values=pixel_values,
@@ -763,10 +740,7 @@ def forward(
             past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
             return_dict=True,
-            cache_position=cache_position,
             logits_to_keep=logits_to_keep,
             **kwargs,
         )
@@ -798,7 +772,6 @@ def prepare_inputs_for_generation(
         inputs_embeds=None,
         pixel_values=None,
         attention_mask=None,
-        cache_position=None,
         logits_to_keep=None,
         is_first_iteration=False,
         **kwargs,
@@ -810,7 +783,6 @@ def prepare_inputs_for_generation(
             past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
             attention_mask=attention_mask,
-            cache_position=cache_position,
             logits_to_keep=logits_to_keep,
             is_first_iteration=is_first_iteration,
             **kwargs,
diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
index 836de2801907..48ce924d5c97 100644
--- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
@@ -164,11 +164,15 @@ def setUp(self):
     def test_config(self):
         self.config_tester.run_common_tests()
 
-    @unittest.skip(reason="PPChart2Table have reused the GotOcr2 model, which does not implement the latest logic for capturing attentions and hidden_states introduced in Transformers v5.")
+    @unittest.skip(
+        reason="PPChart2Table have reused the GotOcr2 model, which does not implement the latest logic for capturing attentions and hidden_states introduced in Transformers v5."
+    )
     def test_get_image_features_attentions(self):
         pass
 
-    @unittest.skip(reason="PPChart2Table have reused the GotOcr2 model, which does not implement the latest logic for capturing attentions and hidden_states introduced in Transformers v5.")
+    @unittest.skip(
+        reason="PPChart2Table have reused the GotOcr2 model, which does not implement the latest logic for capturing attentions and hidden_states introduced in Transformers v5."
+    )
     def test_get_image_features_hidden_states(self):
         pass
 
@@ -191,7 +195,6 @@ def tearDown(self):
 
     @slow
     def test_small_model_integration_test_pp_chart2table(self):
-        
         inputs = self.processor(self.image, return_tensors="pt").to(torch_device)
         generate_ids = self.model.generate(
             **inputs,
@@ -207,7 +210,6 @@ def test_small_model_integration_test_pp_chart2table(self):
 
     @slow
     def test_small_model_integration_test_pp_chart2table_batched(self):
-
         inputs = self.processor([self.image, self.image], return_tensors="pt").to(torch_device)
         generate_ids = self.model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=6)
         decoded_output = self.processor.batch_decode(

From b4197321624a8a18924fac271e0339285a96aa83 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Mon, 16 Mar 2026 18:09:59 +0800
Subject: [PATCH 25/60] update

---
 docs/source/en/_toctree.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 0fe76d0d70c8..e5f24f55962a 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -1268,12 +1268,12 @@
         title: PP-DocLayoutV2
       - local: model_doc/pp_doclayout_v3
         title: PP-DocLayoutV3
-      - local: model_doc/pp_chart2table
-        title: PPChart2Table
       - local: model_doc/pp_ocrv5_mobile_det
         title: PP-OCRv5_mobile_det
       - local: model_doc/pp_ocrv5_server_det
         title: PP-OCRv5_server_det
+      - local: model_doc/pp_chart2table
+        title: PPChart2Table
       - local: model_doc/pp_lcnet
         title: PPLCNet
       - local: model_doc/pp_lcnet_v3

From cc8bbca9452acc0eae95a1cadb9edf5ca5914c56 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Tue, 17 Mar 2026 12:52:27 +0800
Subject: [PATCH 26/60] update

---
 docs/source/en/model_doc/pp_chart2table.md    | 20 ++-----
 .../models/auto/tokenization_auto.py          |  1 -
 .../configuration_pp_chart2table.py           | 16 +-----
 .../pp_chart2table/modeling_pp_chart2table.py | 45 +++++++++------
 .../pp_chart2table/modular_pp_chart2table.py  | 57 ++-----------------
 .../processing_pp_chart2table.py              |  2 +-
 6 files changed, 39 insertions(+), 102 deletions(-)

diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md
index f2b0296d8e66..82f92bcdfb90 100644
--- a/docs/source/en/model_doc/pp_chart2table.md
+++ b/docs/source/en/model_doc/pp_chart2table.md
@@ -71,15 +71,14 @@ from transformers import AutoModelForImageTextToText, AutoProcessor
 model_path = "PaddlePaddle/PP-Chart2Table_safetensors"
 model = AutoModelForImageTextToText.from_pretrained(
     model_path, 
-    dtype="float32",
     device_map="auto",
 )
-processor = AutoProcessor.from_pretrained(model_path, use_fast=True).to(model.device)
+processor = AutoProcessor.from_pretrained(model_path)
 
 image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw)
-inputs = processor(images=image)
+inputs = processor(images=image).to(model.device)
 
-generated_ids = model.generate(**inputs, use_cache=True, do_sample=False, max_new_tokens=256)
+generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=256)
 generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
 result = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
 print(result)
@@ -128,13 +127,12 @@ from transformers import AutoModelForImageTextToText, AutoProcessor
 model_path = "PaddlePaddle/PP-Chart2Table_safetensors"
 model = AutoModelForImageTextToText.from_pretrained(
     model_path, 
-    dtype="float32",
     device_map="auto",
 )
-processor = AutoProcessor.from_pretrained(model_path).to(model.device)
+processor = AutoProcessor.from_pretrained(model_path)
 
 image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw)
-inputs = processor(images=[image, image])
+inputs = processor(images=[image, image]).to(model.device)
 
 generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=256)
 generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
@@ -157,14 +155,6 @@ print(result)
 
 [[autodoc]] PPChart2TableConfig
 
-## PPChart2TableVisionPreTrainedModel
-
-[[autodoc]] PPChart2TableVisionPreTrainedModel
-
-## PPChart2TablePreTrainedModel
-
-[[autodoc]] PPChart2TablePreTrainedModel
-
 ## PPChart2TableImageProcessorFast
 
 [[autodoc]] PPChart2TableImageProcessorFast
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index 21d259dca36b..35693d4c082e 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -255,7 +255,6 @@
             else ("TokenizersBackend" if is_tokenizers_available() else None),
         ),
         ("plbart", "PLBartTokenizer" if is_tokenizers_available() else None),
-        ("pp_chart2table", "TokenizersBackend" if is_tokenizers_available() else None),
         ("prophetnet", "ProphetNetTokenizer"),
         ("qdqbert", "BertTokenizer" if is_tokenizers_available() else None),
         ("qwen2", "Qwen2Tokenizer" if is_tokenizers_available() else None),
diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
index d3556596b107..107c077798a9 100644
--- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
@@ -84,17 +84,7 @@ def __init__(
         self.mlp_dim = mlp_dim
 
 
-@auto_docstring(
-    checkpoint="PaddlePaddle/PP-Chart2Table_safetensors",
-    custom_args=r"""
-    output_channels (`int`, *optional*, defaults to 1024):
-        Dimensionality of the output channels from the vision encoder. This is the final channel count
-        after the vision downsample layers, which is then projected to the text model hidden size.
-    vision_hidden_channels (`int`, *optional*, defaults to 512):
-        Dimensionality of the intermediate hidden channels in the vision encoder. This is the channel
-        count between the first and second downsample layers.
-    """,
-)
+@auto_docstring(checkpoint="PaddlePaddle/PP-Chart2Table_safetensors")
 class PPChart2TableConfig(PreTrainedConfig):
     r"""
     Example:
@@ -125,12 +115,8 @@ def __init__(
         image_token_index: int | None = 151859,
         image_seq_length: int | None = 576,
         tie_word_embeddings: bool | None = True,
-        output_channels: int = 1024,
-        vision_hidden_channels: int = 512,
         **kwargs,
     ):
-        self.output_channels = output_channels
-        self.vision_hidden_channels = vision_hidden_channels
         self.image_token_index = image_token_index
         self.image_seq_length = image_seq_length
 
diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
index a9f48a6cf6a1..0c0a2c83e006 100644
--- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
@@ -496,6 +496,27 @@ def _init_weights(self, module):
                 init.zeros_(module.pos_embed)
 
 
+class PPChart2TableMultiModalProjector(nn.Module):
+    def __init__(self, config: PPChart2TableConfig):
+        super().__init__()
+        vision_output_channels = config.vision_config.output_channels
+        language_hidden_size = config.text_config.hidden_size
+        self.conv_upsampler1 = nn.Conv2d(
+            vision_output_channels, vision_output_channels * 2, kernel_size=3, stride=2, padding=1, bias=False
+        )
+        self.conv_upsampler2 = nn.Conv2d(
+            vision_output_channels * 2, language_hidden_size, kernel_size=3, stride=2, padding=1, bias=False
+        )
+        self.multimodal_projector = nn.Linear(language_hidden_size, language_hidden_size)
+
+    def forward(self, vision_embeddings: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.conv_upsampler1(vision_embeddings)
+        hidden_state = self.conv_upsampler2(hidden_state)
+        hidden_state = hidden_state.flatten(2).permute(0, 2, 1)
+        hidden_state = self.multimodal_projector(hidden_state)
+        return hidden_state
+
+
 @auto_docstring
 class PPChart2TableModel(PPChart2TablePreTrainedModel):
     _checkpoint_conversion_mapping = {
@@ -505,19 +526,9 @@ class PPChart2TableModel(PPChart2TablePreTrainedModel):
     def __init__(self, config: PPChart2TableConfig):
         super().__init__(config)
         self.vision_tower = PPChart2TableVisionEncoder(config.vision_config)
-        self.multi_modal_projector = nn.Linear(config.output_channels, config.text_config.hidden_size)
+
+        self.multi_modal_projector = PPChart2TableMultiModalProjector(config)
         self.language_model = AutoModel.from_config(config.text_config)
-        self.vision_downsample1 = nn.Conv2d(
-            config.vision_config.output_channels,
-            config.vision_hidden_channels,
-            kernel_size=3,
-            stride=2,
-            padding=1,
-            bias=False,
-        )
-        self.vision_downsample2 = nn.Conv2d(
-            config.vision_hidden_channels, config.output_channels, kernel_size=3, stride=2, padding=1, bias=False
-        )
         self.post_init()
 
     def get_input_embeddings(self):
@@ -535,13 +546,11 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple | BaseModelOutputWithPooling:
-        image_output = self.vision_tower(pixel_values)
-        last_hidden_state = image_output.last_hidden_state
-        last_hidden_state = self.vision_downsample1(last_hidden_state)
-        last_hidden_state = self.vision_downsample2(last_hidden_state)
-        image_output.pooler_output = self.multi_modal_projector(last_hidden_state.flatten(2).transpose(2, 1))
+        image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs)
+        last_hidden_state = image_outputs.last_hidden_state
+        image_outputs.pooler_output = self.multi_modal_projector(last_hidden_state)
 
-        return image_output
+        return image_outputs
 
     def get_placeholder_mask(
         self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index 85e3495d0cea..2223745aef4e 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -15,15 +15,13 @@
 from dataclasses import dataclass
 
 import torch
-import torch.nn as nn
 
 from ...feature_extraction_utils import BatchFeature
 from ...image_processing_utils_fast import BaseImageProcessorFast
 from ...image_utils import ImageInput
-from ...modeling_outputs import BaseModelOutputWithPooling
 from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
-from ...utils import TransformersKwargs, auto_docstring, logging
+from ...utils import auto_docstring, logging
 from ..got_ocr2.configuration_got_ocr2 import GotOcr2Config
 from ..got_ocr2.modeling_got_ocr2 import (
     GotOcr2ForConditionalGeneration,
@@ -38,26 +36,10 @@
 
 
 @auto_docstring(
-    checkpoint="PaddlePaddle/PP-Chart2Table_safetensors",
-    custom_args=r"""
-    output_channels (`int`, *optional*, defaults to 1024):
-        Dimensionality of the output channels from the vision encoder. This is the final channel count
-        after the vision downsample layers, which is then projected to the text model hidden size.
-    vision_hidden_channels (`int`, *optional*, defaults to 512):
-        Dimensionality of the intermediate hidden channels in the vision encoder. This is the channel
-        count between the first and second downsample layers.
-    """,
+    checkpoint="PaddlePaddle/PP-Chart2Table_safetensors"
 )
 class PPChart2TableConfig(GotOcr2Config):
-    def __init__(
-        self,
-        output_channels: int = 1024,
-        vision_hidden_channels: int = 512,
-        **super_kwargs,
-    ):
-        self.output_channels = output_channels
-        self.vision_hidden_channels = vision_hidden_channels
-        super().__init__()
+    pass
 
 
 @auto_docstring
@@ -104,7 +86,7 @@ def __call__(
         num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size
 
         input_ids = {"input_ids": None}
-        if text is None:
+        if text is None or text == "":
             query = "Chart to table"
             prompt = (
                 self.message_start_token
@@ -150,36 +132,7 @@ class PPChart2TablePreTrainedModel(GotOcr2PreTrainedModel):
 
 @auto_docstring
 class PPChart2TableModel(GotOcr2Model):
-    def __init__(self, config: PPChart2TableConfig):
-        super().__init__(config)
-        self.vision_downsample1 = nn.Conv2d(
-            config.vision_config.output_channels,
-            config.vision_hidden_channels,
-            kernel_size=3,
-            stride=2,
-            padding=1,
-            bias=False,
-        )
-        self.vision_downsample2 = nn.Conv2d(
-            config.vision_hidden_channels, config.output_channels, kernel_size=3, stride=2, padding=1, bias=False
-        )
-        self.multi_modal_projector = nn.Linear(config.output_channels, config.text_config.hidden_size)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_image_features(
-        self,
-        pixel_values: torch.FloatTensor,
-        **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
-        image_output = self.vision_tower(pixel_values)
-        last_hidden_state = image_output.last_hidden_state
-        last_hidden_state = self.vision_downsample1(last_hidden_state)
-        last_hidden_state = self.vision_downsample2(last_hidden_state)
-        image_output.pooler_output = self.multi_modal_projector(last_hidden_state.flatten(2).transpose(2, 1))
-
-        return image_output
+    pass
 
 
 @auto_docstring(
diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
index 4d0b973f1154..3764024898f1 100644
--- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
@@ -59,7 +59,7 @@ def __call__(
         num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size
 
         input_ids = {"input_ids": None}
-        if text is None:
+        if text is None or text == "":
             query = "Chart to table"
             prompt = (
                 self.message_start_token

From 9094eb572ae2d98db24adaa973e62b8e4f68828c Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Tue, 17 Mar 2026 13:12:59 +0800
Subject: [PATCH 27/60] update

---
 utils/check_config_attributes.py | 1 -
 utils/check_repo.py              | 1 -
 2 files changed, 2 deletions(-)

diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py
index f3f6efedc96b..43408709fbc7 100644
--- a/utils/check_config_attributes.py
+++ b/utils/check_config_attributes.py
@@ -43,7 +43,6 @@
     "BambaConfig": ["attn_layer_indices"],
     "Dots1Config": ["max_window_layers"],
     "JambaConfig": ["attn_layer_offset", "attn_layer_period", "expert_layer_offset", "expert_layer_period"],
-    "PPChart2TableConfig": ["tie_word_embeddings"],
     "JetMoeConfig": ["output_router_logits"],
     "Phi3Config": ["embd_pdrop"],
     "EncodecConfig": ["overlap"],
diff --git a/utils/check_repo.py b/utils/check_repo.py
index d458b2c3b0e1..c548e7825e7c 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -194,7 +194,6 @@
         "PaddleOCRVisionModel",  # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration.
         "PaddleOCRVisionTransformer",  # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration.
         "PaddleOCRTextModel",  # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration.
-        "PPChart2TableModel",  # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration.
         "PPChart2TableVisionModel",  # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration.
         "PPChart2TableTextModel",  # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration.
         "Qwen2VLModel",  # Building part of bigger (tested) model. Tested implicitly through Qwen2VLForConditionalGeneration.

From 55664d1e7307babbc61bba12ec276e4ea7857022 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Tue, 17 Mar 2026 13:51:36 +0800
Subject: [PATCH 28/60] update

---
 .../test_image_processing_pp_chart2table.py   | 101 ++++++++++++++++++
 .../test_modeling_pp_chart2table.py           |  29 ++---
 2 files changed, 111 insertions(+), 19 deletions(-)
 create mode 100644 tests/models/pp_chart2table/test_image_processing_pp_chart2table.py

diff --git a/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py
new file mode 100644
index 000000000000..b56859409bfc
--- /dev/null
+++ b/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py
@@ -0,0 +1,101 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
+
+from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
+
+if is_torchvision_available():
+    from transformers import PPChart2TableImageProcessorFast
+
+
+class PPChart2TableImageProcessingTester(unittest.TestCase):
+    def __init__(
+        self,
+        parent,
+        batch_size=7,
+        num_channels=3,
+        image_size=18,
+        min_resolution=30,
+        max_resolution=400,
+        do_resize=True,
+        size=None,
+        do_normalize=True,
+        image_mean=[0.48145466, 0.4578275, 0.40821073],
+        image_std=[0.26862954, 0.26130258, 0.27577711],
+    ):
+        super().__init__()
+        size = size if size is not None else {"height": 1024, "width": 1024}
+        self.parent = parent
+        self.batch_size = batch_size
+        self.num_channels = num_channels
+        self.image_size = image_size
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+        self.do_resize = do_resize
+        self.size = size
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+
+    def prepare_image_processor_dict(self):
+        return {
+            "do_resize": self.do_resize,
+            "size": self.size,
+            "do_normalize": self.do_normalize,
+            "image_mean": self.image_mean,
+            "image_std": self.image_std,
+        }
+
+    def expected_output_image_shape(self, images):
+        return self.num_channels, self.size["height"], self.size["width"]
+
+    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
+        return prepare_image_inputs(
+            batch_size=self.batch_size,
+            num_channels=self.num_channels,
+            min_resolution=self.min_resolution,
+            max_resolution=self.max_resolution,
+            equal_resolution=equal_resolution,
+            numpify=numpify,
+            torchify=torchify,
+        )
+
+
+@require_torch
+@require_vision
+class PPChart2TableProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+    test_slow_image_processor = False
+    fast_image_processing_class = PPChart2TableImageProcessorFast if is_torchvision_available() else None
+
+    def setUp(self):
+        super().setUp()
+        self.image_processor_tester = PPChart2TableImageProcessingTester(self)
+
+    @property
+    def image_processor_dict(self):
+        return self.image_processor_tester.prepare_image_processor_dict()
+
+    def test_image_processor_properties(self):
+        for image_processing_class in self.image_processor_list:
+            image_processor = image_processing_class(**self.image_processor_dict)
+            self.assertTrue(hasattr(image_processor, "do_resize"))
+            self.assertTrue(hasattr(image_processor, "size"))
+            self.assertTrue(hasattr(image_processor, "do_normalize"))
+            self.assertTrue(hasattr(image_processor, "image_mean"))
+            self.assertTrue(hasattr(image_processor, "image_std"))
diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
index 48ce924d5c97..80d4d1d6cc42 100644
--- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
@@ -53,22 +53,19 @@ def __init__(
         image_size=64,
         image_token_index=1,
         model_type="pp_chart2table",
-        is_training=False,
-        output_channels=1024,
-        vision_hidden_channels=1024,
+        is_training=True,
         text_config={
             "model_type": "qwen2",
             "vocab_size": 99,
-            "hidden_size": 128,
+            "hidden_size": 32,
             "intermediate_size": 37,
             "num_hidden_layers": 2,
-            "num_attention_heads": 4,
+            "num_attention_heads": 2,
             "num_key_value_heads": 2,
-            "output_channels": 64,
+            "output_channels": 32,
             "hidden_act": "silu",
             "max_position_embeddings": 512,
             "rope_theta": 10000,
-            "mlp_ratio": 4,
             "tie_word_embeddings": True,
             "bos_token_id": 2,
             "eos_token_id": 3,
@@ -76,11 +73,11 @@ def __init__(
         },
         vision_config={
             "num_hidden_layers": 2,
-            "output_channels": 64,
+            "output_channels": 32,
             "hidden_act": "quick_gelu",
             "hidden_size": 32,
-            "mlp_dim": 128,
-            "num_attention_heads": 4,
+            "mlp_dim": 64,
+            "num_attention_heads": 2,
             "patch_size": 2,
             "image_size": 64,
         },
@@ -100,8 +97,6 @@ def __init__(
         self.is_training = is_training
         self.num_image_tokens = 64
         self.seq_length = seq_length + self.num_image_tokens
-        self.output_channels = output_channels
-        self.vision_hidden_channels = vision_hidden_channels
 
         self.num_hidden_layers = text_config["num_hidden_layers"]
         self.vocab_size = text_config["vocab_size"]
@@ -176,17 +171,13 @@ def test_get_image_features_attentions(self):
     def test_get_image_features_hidden_states(self):
         pass
 
-    @unittest.skip(reason="PPChart2Table does not support this test.")
-    def test_model_is_small(self):
-        pass
-
 
 @require_torch
 class PPChart2TableIntegrationTest(unittest.TestCase):
     def setUp(self):
         model_path = "PaddlePaddle/PP-Chart2Table_safetensors"
         self.model = PPChart2TableForConditionalGeneration.from_pretrained(model_path).to(torch_device)
-        self.processor = AutoProcessor.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors").to(torch_device)
+        self.processor = AutoProcessor.from_pretrained(model_path)
         url = "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png"
         self.image = load_image(url)
 
@@ -200,12 +191,12 @@ def test_small_model_integration_test_pp_chart2table(self):
             **inputs,
             use_cache=True,
             do_sample=False,
-            max_new_tokens=1024,
+            max_new_tokens=32,
         )
         decoded_output = self.processor.decode(
             generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
         )
-        expected_output = "年份 | 单家五星级旅游饭店年平均营收 (百万元) | 单家五星级旅游饭店年平均利润 (百万元)\n2018 | 104.22 | 9.87\n2019 | 99.11 | 7.47\n2020 | 57.87 | -3.87\n2021 | 68.99 | -2.90\n2022 | 56.29 | -9.48\n2023 | 87.99 | 5.96"
+        expected_output = "年份 | 单家五星级旅游饭店年平均营收 (百万元) | 单家五星级旅游饭店年平均利润 (百万元)\n"
         self.assertEqual(decoded_output, expected_output)
 
     @slow

From 6bb4dbc0d83fdf73c37147eeb73493e3838433e3 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Tue, 17 Mar 2026 14:27:01 +0800
Subject: [PATCH 29/60] upddate

---
 .../test_modeling_pp_chart2table.py             | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
index 80d4d1d6cc42..98a7f1c0de22 100644
--- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
@@ -138,7 +138,6 @@ def prepare_config_and_inputs_for_common(self):
 class PPChart2TableModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (
         (
-            PPChart2TableModel,
             PPChart2TableForConditionalGeneration,
         )
         if is_torch_available()
@@ -159,19 +158,7 @@ def setUp(self):
     def test_config(self):
         self.config_tester.run_common_tests()
 
-    @unittest.skip(
-        reason="PPChart2Table have reused the GotOcr2 model, which does not implement the latest logic for capturing attentions and hidden_states introduced in Transformers v5."
-    )
-    def test_get_image_features_attentions(self):
-        pass
-
-    @unittest.skip(
-        reason="PPChart2Table have reused the GotOcr2 model, which does not implement the latest logic for capturing attentions and hidden_states introduced in Transformers v5."
-    )
-    def test_get_image_features_hidden_states(self):
-        pass
-
-
+@slow
 @require_torch
 class PPChart2TableIntegrationTest(unittest.TestCase):
     def setUp(self):
@@ -184,7 +171,6 @@ def setUp(self):
     def tearDown(self):
         cleanup(torch_device, gc_collect=True)
 
-    @slow
     def test_small_model_integration_test_pp_chart2table(self):
         inputs = self.processor(self.image, return_tensors="pt").to(torch_device)
         generate_ids = self.model.generate(
@@ -199,7 +185,6 @@ def test_small_model_integration_test_pp_chart2table(self):
         expected_output = "年份 | 单家五星级旅游饭店年平均营收 (百万元) | 单家五星级旅游饭店年平均利润 (百万元)\n"
         self.assertEqual(decoded_output, expected_output)
 
-    @slow
     def test_small_model_integration_test_pp_chart2table_batched(self):
         inputs = self.processor([self.image, self.image], return_tensors="pt").to(torch_device)
         generate_ids = self.model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=6)

From f79d83b28aa395bb4ef1f0f06198cf2137902463 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Tue, 17 Mar 2026 20:55:06 +0800
Subject: [PATCH 30/60] update

---
 .../pp_chart2table/modular_pp_chart2table.py  | 57 ++++++++-----------
 .../processing_pp_chart2table.py              | 53 ++++++++---------
 .../test_image_processing_pp_chart2table.py   |  3 +-
 .../test_modeling_pp_chart2table.py           | 10 +---
 utils/check_repo.py                           |  1 +
 5 files changed, 54 insertions(+), 70 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index 2223745aef4e..6a7decdc7ac8 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -35,9 +35,7 @@
 logger = logging.get_logger(__name__)
 
 
-@auto_docstring(
-    checkpoint="PaddlePaddle/PP-Chart2Table_safetensors"
-)
+@auto_docstring(checkpoint="PaddlePaddle/PP-Chart2Table_safetensors")
 class PPChart2TableConfig(GotOcr2Config):
     pass
 
@@ -60,17 +58,6 @@ class PPChart2TableProcessor(ProcessorMixin):
     image_processor_class = "AutoImageProcessor"
     tokenizer_class = "AutoTokenizer"
 
-    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
-        super().__init__(image_processor, tokenizer, chat_template=chat_template)
-
-        self.message_start_token = tokenizer.message_start_token
-        self.message_end_token = tokenizer.message_end_token
-        self.img_start_token = tokenizer.img_start_token
-        self.img_end_token = tokenizer.img_end_token
-        self.img_pad_token = tokenizer.img_pad_token
-        self.image_token = tokenizer.image_token
-        self.system_query = "system\nYou should follow the instructions carefully and explain your answers in detail."
-
     def __call__(
         self,
         images: ImageInput = None,
@@ -88,27 +75,33 @@ def __call__(
         input_ids = {"input_ids": None}
         if text is None or text == "":
             query = "Chart to table"
-            prompt = (
-                self.message_start_token
-                + self.system_query
-                + self.message_end_token
-                + self.message_start_token
-                + "user\n"
-                + self.img_start_token
-                + self.img_pad_token * num_patches * num_patches
-                + self.img_end_token
-                + "\n"
-                + query
-                + self.message_end_token
-                + self.message_start_token
-                + "assistant\n"
-            )
-            input_ids = torch.tensor(self.tokenizer([prompt]).input_ids)
-            input_ids = input_ids.repeat(batch_size, 1)
-            input_ids = {"input_ids": input_ids}
         else:
             raise ValueError("PPChart2Table processor does not support text inputs")
 
+        messages = [
+            {
+                "role": "system",
+                "content": "You should follow the instructions carefully and explain your answers in detail.",
+            },
+            {
+                "role": "user",
+                "image": {"num_patches": num_patches},
+                "content": query,
+            },
+        ]
+
+        # Use tokenizer's apply_chat_template instead of manually loading template
+        prompt = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+
+        # Tokenize and prepare input ids for batch
+        input_ids = torch.tensor(self.tokenizer([prompt]).input_ids)
+        input_ids = input_ids.repeat(batch_size, 1)
+        input_ids = {"input_ids": input_ids}
+
         return BatchFeature(data={**input_ids, **image_inputs})
 
 
diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
index 3764024898f1..da60a1331505 100644
--- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
@@ -33,17 +33,6 @@ class PPChart2TableProcessor(ProcessorMixin):
     image_processor_class = "AutoImageProcessor"
     tokenizer_class = "AutoTokenizer"
 
-    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
-        super().__init__(image_processor, tokenizer, chat_template=chat_template)
-
-        self.message_start_token = tokenizer.message_start_token
-        self.message_end_token = tokenizer.message_end_token
-        self.img_start_token = tokenizer.img_start_token
-        self.img_end_token = tokenizer.img_end_token
-        self.img_pad_token = tokenizer.img_pad_token
-        self.image_token = tokenizer.image_token
-        self.system_query = "system\nYou should follow the instructions carefully and explain your answers in detail."
-
     def __call__(
         self,
         images: ImageInput = None,
@@ -61,27 +50,33 @@ def __call__(
         input_ids = {"input_ids": None}
         if text is None or text == "":
             query = "Chart to table"
-            prompt = (
-                self.message_start_token
-                + self.system_query
-                + self.message_end_token
-                + self.message_start_token
-                + "user\n"
-                + self.img_start_token
-                + self.img_pad_token * num_patches * num_patches
-                + self.img_end_token
-                + "\n"
-                + query
-                + self.message_end_token
-                + self.message_start_token
-                + "assistant\n"
-            )
-            input_ids = torch.tensor(self.tokenizer([prompt]).input_ids)
-            input_ids = input_ids.repeat(batch_size, 1)
-            input_ids = {"input_ids": input_ids}
         else:
             raise ValueError("PPChart2Table processor does not support text inputs")
 
+        messages = [
+            {
+                "role": "system",
+                "content": "You should follow the instructions carefully and explain your answers in detail.",
+            },
+            {
+                "role": "user",
+                "image": {"num_patches": num_patches},
+                "content": query,
+            },
+        ]
+
+        # Use tokenizer's apply_chat_template instead of manually loading template
+        prompt = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+
+        # Tokenize and prepare input ids for batch
+        input_ids = torch.tensor(self.tokenizer([prompt]).input_ids)
+        input_ids = input_ids.repeat(batch_size, 1)
+        input_ids = {"input_ids": input_ids}
+
         return BatchFeature(data={**input_ids, **image_inputs})
 
 
diff --git a/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py
index b56859409bfc..f83c96227e67 100644
--- a/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py
@@ -16,10 +16,11 @@
 import unittest
 
 from transformers.testing_utils import require_torch, require_vision
-from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
+from transformers.utils import is_torchvision_available
 
 from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
 
+
 if is_torchvision_available():
     from transformers import PPChart2TableImageProcessorFast
 
diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
index 98a7f1c0de22..1bb705e58488 100644
--- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
@@ -34,7 +34,6 @@
 
     from transformers import (
         PPChart2TableForConditionalGeneration,
-        PPChart2TableModel,
     )
 
 
@@ -136,13 +135,7 @@ def prepare_config_and_inputs_for_common(self):
 
 @require_torch
 class PPChart2TableModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            PPChart2TableForConditionalGeneration,
-        )
-        if is_torch_available()
-        else ()
-    )
+    all_model_classes = (PPChart2TableForConditionalGeneration,) if is_torch_available() else ()
     pipeline_model_mapping = (
         {
             "image-text-to-text": PPChart2TableForConditionalGeneration,
@@ -158,6 +151,7 @@ def setUp(self):
     def test_config(self):
         self.config_tester.run_common_tests()
 
+
 @slow
 @require_torch
 class PPChart2TableIntegrationTest(unittest.TestCase):
diff --git a/utils/check_repo.py b/utils/check_repo.py
index c548e7825e7c..d458b2c3b0e1 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -194,6 +194,7 @@
         "PaddleOCRVisionModel",  # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration.
         "PaddleOCRVisionTransformer",  # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration.
         "PaddleOCRTextModel",  # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration.
+        "PPChart2TableModel",  # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration.
         "PPChart2TableVisionModel",  # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration.
         "PPChart2TableTextModel",  # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration.
         "Qwen2VLModel",  # Building part of bigger (tested) model. Tested implicitly through Qwen2VLForConditionalGeneration.

From d050fe65247dbdff0a1df7ce2ed59bccae9c1088 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Wed, 18 Mar 2026 11:27:39 +0800
Subject: [PATCH 31/60] update

---
 docs/source/en/model_doc/pp_chart2table.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md
index 82f92bcdfb90..7f938f76300b 100644
--- a/docs/source/en/model_doc/pp_chart2table.md
+++ b/docs/source/en/model_doc/pp_chart2table.md
@@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.
 
 -->
-*This model was released on {release_date} and added to Hugging Face Transformers on 2026-03-16.*
+*This model was released on 2026-05-20 and added to Hugging Face Transformers on 2026-03-16.*
 
 # PP-Chart2Table
 

From bae2c9638bc8a976de8c643ec8b1eacbc205a53e Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Wed, 18 Mar 2026 11:41:32 +0800
Subject: [PATCH 32/60] update

---
 docs/source/en/model_doc/pp_chart2table.md    |  2 +-
 .../pp_chart2table/modular_pp_chart2table.py  | 21 +++++------------
 .../processing_pp_chart2table.py              | 23 +++++--------------
 3 files changed, 13 insertions(+), 33 deletions(-)

diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md
index 7f938f76300b..34dabbcdfee4 100644
--- a/docs/source/en/model_doc/pp_chart2table.md
+++ b/docs/source/en/model_doc/pp_chart2table.md
@@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.
 
 -->
-*This model was released on 2026-05-20 and added to Hugging Face Transformers on 2026-03-16.*
+*This model was released on 2025-05-20 and added to Hugging Face Transformers on 2026-03-16.*
 
 # PP-Chart2Table
 
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index 6a7decdc7ac8..aa6b821e4dc9 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -72,37 +72,28 @@ def __call__(
         batch_size, _, height, _ = image_inputs["pixel_values"].shape
         num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size
 
-        input_ids = {"input_ids": None}
-        if text is None or text == "":
-            query = "Chart to table"
-        else:
-            raise ValueError("PPChart2Table processor does not support text inputs")
-
         messages = [
             {
                 "role": "system",
-                "content": "You should follow the instructions carefully and explain your answers in detail.",
             },
             {
                 "role": "user",
                 "image": {"num_patches": num_patches},
-                "content": query,
             },
         ]
 
         # Use tokenizer's apply_chat_template instead of manually loading template
-        prompt = self.tokenizer.apply_chat_template(
+        inputs = self.tokenizer.apply_chat_template(
             messages,
-            tokenize=False,
+            tokenize=True,
             add_generation_prompt=True,
+            return_tensors="pt",
         )
 
-        # Tokenize and prepare input ids for batch
-        input_ids = torch.tensor(self.tokenizer([prompt]).input_ids)
-        input_ids = input_ids.repeat(batch_size, 1)
-        input_ids = {"input_ids": input_ids}
+        # Prepare input ids for batch
+        input_ids = inputs["input_ids"].repeat(batch_size, 1)
 
-        return BatchFeature(data={**input_ids, **image_inputs})
+        return BatchFeature(data={"input_ids": input_ids, **image_inputs})
 
 
 class PPChart2TableVisionPreTrainedModel(GotOcr2PreTrainedModel):
diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
index da60a1331505..17f6883024dc 100644
--- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
@@ -19,8 +19,6 @@
 # limitations under the License.
 
 
-import torch
-
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput
 from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
@@ -47,37 +45,28 @@ def __call__(
         batch_size, _, height, _ = image_inputs["pixel_values"].shape
         num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size
 
-        input_ids = {"input_ids": None}
-        if text is None or text == "":
-            query = "Chart to table"
-        else:
-            raise ValueError("PPChart2Table processor does not support text inputs")
-
         messages = [
             {
                 "role": "system",
-                "content": "You should follow the instructions carefully and explain your answers in detail.",
             },
             {
                 "role": "user",
                 "image": {"num_patches": num_patches},
-                "content": query,
             },
         ]
 
         # Use tokenizer's apply_chat_template instead of manually loading template
-        prompt = self.tokenizer.apply_chat_template(
+        inputs = self.tokenizer.apply_chat_template(
             messages,
-            tokenize=False,
+            tokenize=True,
             add_generation_prompt=True,
+            return_tensors="pt",
         )
 
-        # Tokenize and prepare input ids for batch
-        input_ids = torch.tensor(self.tokenizer([prompt]).input_ids)
-        input_ids = input_ids.repeat(batch_size, 1)
-        input_ids = {"input_ids": input_ids}
+        # Prepare input ids for batch
+        input_ids = inputs["input_ids"].repeat(batch_size, 1)
 
-        return BatchFeature(data={**input_ids, **image_inputs})
+        return BatchFeature(data={"input_ids": input_ids, **image_inputs})
 
 
 __all__ = ["PPChart2TableProcessor"]

From 8e4062b4b80ae8b94f41d696cefe8b743d145d9d Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Wed, 18 Mar 2026 11:49:32 +0800
Subject: [PATCH 33/60] update

---
 src/transformers/models/auto/configuration_auto.py | 2 +-
 src/transformers/models/auto/modeling_auto.py      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 6cdacb04513a..d2615bf8a99f 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -355,7 +355,7 @@
         ("plbart", "PLBartConfig"),
         ("poolformer", "PoolFormerConfig"),
         ("pop2piano", "Pop2PianoConfig"),
-        ("pp_chart2table", "PPChart2TableConfig"),
+        ("pp_chart2table", "GotOcr2Config"),
         ("pp_doclayout_v2", "PPDocLayoutV2Config"),
         ("pp_doclayout_v3", "PPDocLayoutV3Config"),
         ("pp_lcnet", "PPLCNetConfig"),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index 10b9c7746cf6..7885c81daee3 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -985,7 +985,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("perception_lm", "PerceptionLMForConditionalGeneration"),
         ("pix2struct", "Pix2StructForConditionalGeneration"),
         ("pixtral", "LlavaForConditionalGeneration"),
-        ("pp_chart2table", "PPChart2TableForConditionalGeneration"),
+        ("pp_chart2table", "GotOcr2ForConditionalGeneration"),
         ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"),
         ("qwen2_vl", "Qwen2VLForConditionalGeneration"),
         ("qwen3_5", "Qwen3_5ForConditionalGeneration"),

From ac2bc662aaf705282081f03cc85917bb9d0d9772 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Wed, 18 Mar 2026 11:57:50 +0800
Subject: [PATCH 34/60] update

---
 .../test_modeling_pp_chart2table.py           | 119 ------------------
 1 file changed, 119 deletions(-)

diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
index 1bb705e58488..2e1ee4a5d98c 100644
--- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
@@ -17,21 +17,13 @@
 
 from transformers import (
     AutoProcessor,
-    PPChart2TableConfig,
     is_torch_available,
     is_vision_available,
 )
 from transformers.testing_utils import cleanup, require_torch, slow, torch_device
 
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
 
 if is_torch_available():
-    import torch
-
     from transformers import (
         PPChart2TableForConditionalGeneration,
     )
@@ -41,117 +33,6 @@
     from transformers.image_utils import load_image
 
 
-class PPChart2TableVisionText2TextModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=3,
-        seq_length=7,
-        num_channels=3,
-        ignore_index=-100,
-        image_size=64,
-        image_token_index=1,
-        model_type="pp_chart2table",
-        is_training=True,
-        text_config={
-            "model_type": "qwen2",
-            "vocab_size": 99,
-            "hidden_size": 32,
-            "intermediate_size": 37,
-            "num_hidden_layers": 2,
-            "num_attention_heads": 2,
-            "num_key_value_heads": 2,
-            "output_channels": 32,
-            "hidden_act": "silu",
-            "max_position_embeddings": 512,
-            "rope_theta": 10000,
-            "tie_word_embeddings": True,
-            "bos_token_id": 2,
-            "eos_token_id": 3,
-            "pad_token_id": 4,
-        },
-        vision_config={
-            "num_hidden_layers": 2,
-            "output_channels": 32,
-            "hidden_act": "quick_gelu",
-            "hidden_size": 32,
-            "mlp_dim": 64,
-            "num_attention_heads": 2,
-            "patch_size": 2,
-            "image_size": 64,
-        },
-    ):
-        self.parent = parent
-        self.ignore_index = ignore_index
-        self.bos_token_id = text_config["bos_token_id"]
-        self.eos_token_id = text_config["eos_token_id"]
-        self.pad_token_id = text_config["pad_token_id"]
-        self.image_token_index = image_token_index
-        self.model_type = model_type
-        self.text_config = text_config
-        self.vision_config = vision_config
-        self.batch_size = batch_size
-        self.num_channels = num_channels
-        self.image_size = image_size
-        self.is_training = is_training
-        self.num_image_tokens = 64
-        self.seq_length = seq_length + self.num_image_tokens
-
-        self.num_hidden_layers = text_config["num_hidden_layers"]
-        self.vocab_size = text_config["vocab_size"]
-        self.hidden_size = text_config["hidden_size"]
-        self.num_attention_heads = text_config["num_attention_heads"]
-
-    def get_config(self):
-        return PPChart2TableConfig(
-            text_config=self.text_config,
-            vision_config=self.vision_config,
-            model_type=self.model_type,
-            image_token_index=self.image_token_index,
-        )
-
-    def prepare_config_and_inputs(self):
-        config = self.get_config()
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        return config, pixel_values
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values = config_and_inputs
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
-
-        input_ids[input_ids == self.image_token_index] = self.pad_token_id
-        input_ids[:, : self.num_image_tokens] = self.image_token_index
-
-        inputs_dict = {
-            "pixel_values": pixel_values,
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-        }
-        return config, inputs_dict
-
-
-@require_torch
-class PPChart2TableModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (PPChart2TableForConditionalGeneration,) if is_torch_available() else ()
-    pipeline_model_mapping = (
-        {
-            "image-text-to-text": PPChart2TableForConditionalGeneration,
-        }
-        if is_torch_available()
-        else {}
-    )
-
-    def setUp(self):
-        self.model_tester = PPChart2TableVisionText2TextModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=PPChart2TableConfig, has_text_modality=False)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-
 @slow
 @require_torch
 class PPChart2TableIntegrationTest(unittest.TestCase):

From 45907f92e1b601d2950577159475f7645bbf32a7 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Wed, 18 Mar 2026 12:20:03 +0800
Subject: [PATCH 35/60] update

---
 .../configuration_pp_chart2table.py           | 111 +++++++-----------
 .../pp_chart2table/modeling_pp_chart2table.py |  12 +-
 .../pp_chart2table/modular_pp_chart2table.py  |   3 +-
 3 files changed, 44 insertions(+), 82 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
index 107c077798a9..241338468d6d 100644
--- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
@@ -18,12 +18,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+
+from huggingface_hub.dataclasses import strict
+
 from ...configuration_utils import PreTrainedConfig
 from ...utils import auto_docstring
 from ..auto import CONFIG_MAPPING, AutoConfig
 
 
 @auto_docstring(checkpoint="facebook/sam-vit-huge")
+@strict(accept_kwargs=True)
 class PPChart2TableVisionConfig(PreTrainedConfig):
     r"""
     output_channels (`int`, *optional*, defaults to 256):
@@ -41,50 +45,27 @@ class PPChart2TableVisionConfig(PreTrainedConfig):
     """
 
     base_config_key = "vision_config"
-
-    def __init__(
-        self,
-        hidden_size=768,
-        output_channels=256,
-        num_hidden_layers=12,
-        num_attention_heads=12,
-        num_channels=3,
-        image_size=1024,
-        patch_size=16,
-        hidden_act="gelu",
-        layer_norm_eps=1e-06,
-        attention_dropout=0.0,
-        initializer_range=1e-10,
-        qkv_bias=True,
-        use_abs_pos=True,
-        use_rel_pos=True,
-        window_size=14,
-        global_attn_indexes=[2, 5, 8, 11],
-        mlp_dim=3072,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-
-        self.hidden_size = hidden_size
-        self.output_channels = output_channels
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.num_channels = num_channels
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.hidden_act = hidden_act
-        self.layer_norm_eps = layer_norm_eps
-        self.attention_dropout = attention_dropout
-        self.initializer_range = initializer_range
-        self.qkv_bias = qkv_bias
-        self.use_abs_pos = use_abs_pos
-        self.use_rel_pos = use_rel_pos
-        self.window_size = window_size
-        self.global_attn_indexes = global_attn_indexes
-        self.mlp_dim = mlp_dim
+    hidden_size: int = 768
+    output_channels: int = 256
+    num_hidden_layers: int = 12
+    num_attention_heads: int = 12
+    num_channels: int = 3
+    image_size: int | list[int] | tuple[int, int] = 1024
+    patch_size: int | list[int] | tuple[int, int] = 16
+    hidden_act: str = "gelu"
+    layer_norm_eps: float = 1e-06
+    attention_dropout: float | int = 0.0
+    initializer_range: float = 1e-10
+    qkv_bias: bool = True
+    use_abs_pos: bool = True
+    use_rel_pos: bool = True
+    window_size: int = 14
+    global_attn_indexes: list[int] | tuple[int, ...] = (2, 5, 8, 11)
+    mlp_dim: int = 3072
 
 
 @auto_docstring(checkpoint="PaddlePaddle/PP-Chart2Table_safetensors")
+@strict(accept_kwargs=True)
 class PPChart2TableConfig(PreTrainedConfig):
     r"""
     Example:
@@ -102,36 +83,29 @@ class PPChart2TableConfig(PreTrainedConfig):
     >>> configuration = model.config
     ```"""
 
-    model_type = "pp_chart2table"
+    model_type = "p_p_chart2_table"
     attribute_map = {
         "image_token_id": "image_token_index",
     }
     sub_configs = {"text_config": AutoConfig, "vision_config": PPChart2TableVisionConfig}
 
-    def __init__(
-        self,
-        vision_config: dict | None = None,
-        text_config: dict | None = None,
-        image_token_index: int | None = 151859,
-        image_seq_length: int | None = 576,
-        tie_word_embeddings: bool | None = True,
-        **kwargs,
-    ):
-        self.image_token_index = image_token_index
-        self.image_seq_length = image_seq_length
-
-        if vision_config is None:
+    vision_config: dict | PreTrainedConfig | None = None
+    text_config: dict | PreTrainedConfig | None = None
+    image_token_index: int = 151859
+    image_seq_length: int = 576
+    tie_word_embeddings: bool = True
+
+    def __post_init__(self, **kwargs):
+        if self.vision_config is None:
             self.vision_config = PPChart2TableVisionConfig()
-        elif isinstance(vision_config, dict):
-            self.vision_config = PPChart2TableVisionConfig(**vision_config)
-        elif isinstance(vision_config, PPChart2TableVisionConfig):
-            self.vision_config = vision_config
-
-        if isinstance(text_config, dict):
-            text_config["model_type"] = text_config.get("model_type", "qwen2")
-            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
-        elif text_config is None:
-            text_config = CONFIG_MAPPING["qwen2"](
+        elif isinstance(self.vision_config, dict):
+            self.vision_config = PPChart2TableVisionConfig(**self.vision_config)
+
+        if isinstance(self.text_config, dict):
+            self.text_config["model_type"] = self.text_config.get("model_type", "qwen2")
+            self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config)
+        elif self.text_config is None:
+            self.text_config = CONFIG_MAPPING["qwen2"](
                 vocab_size=151860,
                 hidden_size=1024,
                 intermediate_size=2816,
@@ -143,7 +117,7 @@ def __init__(
                 initializer_range=0.02,
                 rms_norm_eps=1e-6,
                 use_cache=True,
-                tie_word_embeddings=tie_word_embeddings,
+                tie_word_embeddings=self.tie_word_embeddings,
                 rope_theta=1000000.0,
                 rope_parameters=None,
                 use_sliding_window=False,
@@ -152,10 +126,7 @@ def __init__(
                 attention_dropout=0.0,
             )
 
-        self.text_config = text_config
-        self.tie_word_embeddings = tie_word_embeddings
-
-        super().__init__(**kwargs)
+        super().__post_init__(**kwargs)
 
 
 __all__ = ["PPChart2TableConfig"]
diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
index 0c0a2c83e006..f13f04e62de6 100644
--- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
@@ -300,7 +300,7 @@ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.FloatTensor]:
 @dataclass
 @auto_docstring(
     custom_intro="""
-    Base class for pp_chart2table vision model's outputs that also contains image embeddings obtained by applying the projection
+    Base class for p_p_chart2_table vision model's outputs that also contains image embeddings obtained by applying the projection
     layer to the pooler_output.
     """
 )
@@ -519,10 +519,6 @@ def forward(self, vision_embeddings: torch.Tensor) -> torch.Tensor:
 
 @auto_docstring
 class PPChart2TableModel(PPChart2TablePreTrainedModel):
-    _checkpoint_conversion_mapping = {
-        r"^language_model.model": "language_model",
-    }
-
     def __init__(self, config: PPChart2TableConfig):
         super().__init__(config)
         self.vision_tower = PPChart2TableVisionEncoder(config.vision_config)
@@ -661,12 +657,6 @@ class PPChart2TableCausalLMOutputWithPast(ModelOutput):
     """
 )
 class PPChart2TableForConditionalGeneration(PPChart2TablePreTrainedModel, GenerationMixin):
-    _checkpoint_conversion_mapping = {
-        r"^language_model.model": "model.language_model",
-        r"^vision_tower": "model.vision_tower",
-        r"^multi_modal_projector": "model.multi_modal_projector",
-        r"^language_model.lm_head": "lm_head",
-    }
     _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"}
 
     def __init__(self, config: PPChart2TableConfig):
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index aa6b821e4dc9..662353516674 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -14,7 +14,7 @@
 
 from dataclasses import dataclass
 
-import torch
+from huggingface_hub.dataclasses import strict
 
 from ...feature_extraction_utils import BatchFeature
 from ...image_processing_utils_fast import BaseImageProcessorFast
@@ -36,6 +36,7 @@
 
 
 @auto_docstring(checkpoint="PaddlePaddle/PP-Chart2Table_safetensors")
+@strict(accept_kwargs=True)
 class PPChart2TableConfig(GotOcr2Config):
     pass
 

From d0bf04f02957b8af7bd57dd9c00edc714001c5cc Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Wed, 18 Mar 2026 14:25:36 +0800
Subject: [PATCH 36/60] update

---
 .../image_processing_pp_chart2table.py        | 25 -------------------
 1 file changed, 25 deletions(-)
 delete mode 100644 src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py

diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
deleted file mode 100644
index 7bb7de6cc920..000000000000
--- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-#           This file was automatically generated from src/transformers/models/pp_chart2table/modular_pp_chart2table.py.
-#               Do NOT edit this file manually as any edits will be overwritten by the generation of
-#             the file from the modular. If any change should be done, please apply the change to the
-#                          modular_pp_chart2table.py file directly. One of our CI enforces this.
-#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from ...processing_utils import ImagesKwargs
-
-
-class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False):
-    """
-    crop_to_patches (`bool`, *optional*, defaults to `False`):
-        Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
-        `preprocess` method.
-    min_patches (`int`, *optional*, defaults to 1):
-        The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
-        set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method.
-    max_patches (`int`, *optional*, defaults to 12):
-        The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
-        set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
-    """
-
-    crop_to_patches: bool
-    min_patches: int
-    max_patches: int

From 0f7ed318db222936dbc542eca4d3e9034056add8 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Wed, 18 Mar 2026 16:31:26 +0800
Subject: [PATCH 37/60] update

---
 src/transformers/conversion_mapping.py        |  2 +-
 .../configuration_pp_chart2table.py           |  2 +-
 .../image_processing_pp_chart2table_fast.py   |  2 +-
 .../pp_chart2table/modular_pp_chart2table.py  | 29 +++++----
 .../processing_pp_chart2table.py              | 19 +++---
 .../test_processing_pp_chart2table.py         | 64 +++++++++++++++++++
 6 files changed, 96 insertions(+), 22 deletions(-)
 create mode 100644 tests/models/pp_chart2table/test_processing_pp_chart2table.py

diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py
index 71cdf585bc12..a1c495085321 100755
--- a/src/transformers/conversion_mapping.py
+++ b/src/transformers/conversion_mapping.py
@@ -555,4 +555,4 @@ def get_model_conversion_mapping(
         # "mlp.experts.gate_up_proj$" and "mlp.experts.down_proj$" are only created after dequantization conversions are applied.
         weight_conversions.extend(hf_quantizer.get_weight_conversions())
 
-    return weight_conversions
\ No newline at end of file
+    return weight_conversions
diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
index 241338468d6d..a15de72c8a39 100644
--- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
@@ -83,7 +83,7 @@ class PPChart2TableConfig(PreTrainedConfig):
     >>> configuration = model.config
     ```"""
 
-    model_type = "p_p_chart2_table"
+    model_type = "pp_chart2table"
     attribute_map = {
         "image_token_id": "image_token_index",
     }
diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
index 8bea7ba01d7c..ef857db5b658 100644
--- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
+++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
@@ -29,7 +29,7 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
     image_std = [0.26862954, 0.26130258, 0.27577711]
     size = {"height": 1024, "width": 1024}
     patch_size = 16
-    merge_size = 4
+    num_patches = 16
     do_resize = True
     do_rescale = True
     do_normalize = True
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index 662353516674..50e52a2772c6 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -21,7 +21,7 @@
 from ...image_utils import ImageInput
 from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
-from ...utils import auto_docstring, logging
+from ...utils import auto_docstring, is_vision_available, logging
 from ..got_ocr2.configuration_got_ocr2 import GotOcr2Config
 from ..got_ocr2.modeling_got_ocr2 import (
     GotOcr2ForConditionalGeneration,
@@ -32,13 +32,17 @@
 )
 
 
+if is_vision_available():
+    pass
+
+
 logger = logging.get_logger(__name__)
 
 
 @auto_docstring(checkpoint="PaddlePaddle/PP-Chart2Table_safetensors")
 @strict(accept_kwargs=True)
 class PPChart2TableConfig(GotOcr2Config):
-    pass
+    model_type = "pp_chart2table"
 
 
 @auto_docstring
@@ -48,7 +52,7 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
     image_std = [0.26862954, 0.26130258, 0.27577711]
     size = {"height": 1024, "width": 1024}
     patch_size = 16
-    merge_size = 4
+    num_patches = 16
     do_resize = True
     do_rescale = True
     do_normalize = True
@@ -58,6 +62,7 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
 class PPChart2TableProcessor(ProcessorMixin):
     image_processor_class = "AutoImageProcessor"
     tokenizer_class = "AutoTokenizer"
+    model_input_names = ["input_ids", "pixel_values"]
 
     def __call__(
         self,
@@ -65,13 +70,14 @@ def __call__(
         text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
         **kwargs: Unpack[ProcessingKwargs],
     ) -> BatchFeature:
-        if images is not None:
-            image_inputs = self.image_processor(images=images, return_tensors="pt")
-        else:
-            image_inputs = {}
+        output_kwargs = self._merge_kwargs(
+            ProcessingKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
 
-        batch_size, _, height, _ = image_inputs["pixel_values"].shape
-        num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size
+        image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
+        batch_size = image_inputs["pixel_values"].shape[0]
 
         messages = [
             {
@@ -79,7 +85,7 @@ def __call__(
             },
             {
                 "role": "user",
-                "image": {"num_patches": num_patches},
+                "image": {"num_patches": self.image_processor.num_patches},
             },
         ]
 
@@ -88,7 +94,8 @@ def __call__(
             messages,
             tokenize=True,
             add_generation_prompt=True,
-            return_tensors="pt",
+            truncation=True,
+            **output_kwargs["text_kwargs"],
         )
 
         # Prepare input ids for batch
diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
index 17f6883024dc..ce8175ecd87c 100644
--- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
@@ -30,6 +30,7 @@
 class PPChart2TableProcessor(ProcessorMixin):
     image_processor_class = "AutoImageProcessor"
     tokenizer_class = "AutoTokenizer"
+    model_input_names = ["input_ids", "pixel_values"]
 
     def __call__(
         self,
@@ -37,13 +38,14 @@ def __call__(
         text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
         **kwargs: Unpack[ProcessingKwargs],
     ) -> BatchFeature:
-        if images is not None:
-            image_inputs = self.image_processor(images=images, return_tensors="pt")
-        else:
-            image_inputs = {}
+        output_kwargs = self._merge_kwargs(
+            ProcessingKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
 
-        batch_size, _, height, _ = image_inputs["pixel_values"].shape
-        num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size
+        image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
+        batch_size = image_inputs["pixel_values"].shape[0]
 
         messages = [
             {
@@ -51,7 +53,7 @@ def __call__(
             },
             {
                 "role": "user",
-                "image": {"num_patches": num_patches},
+                "image": {"num_patches": self.image_processor.num_patches},
             },
         ]
 
@@ -60,7 +62,8 @@ def __call__(
             messages,
             tokenize=True,
             add_generation_prompt=True,
-            return_tensors="pt",
+            truncation=True,
+            **output_kwargs["text_kwargs"],
         )
 
         # Prepare input ids for batch
diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
new file mode 100644
index 000000000000..ec33b19097af
--- /dev/null
+++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
@@ -0,0 +1,64 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers import PPChart2TableProcessor
+from transformers.testing_utils import require_vision
+
+from ...test_processing_common import ProcessorTesterMixin
+
+
+@require_vision
+class PPChart2TableProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    processor_class = PPChart2TableProcessor
+
+    @classmethod
+    def _setup_tokenizer(cls):
+        tokenizer_class = cls._get_component_class_from_processor("tokenizer")
+        tokenizer = tokenizer_class.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table")
+        return tokenizer
+
+    @unittest.skip("PPChart2TableProcessor pop the image processor output 'num_patches'")
+    def test_image_processor_defaults(self):
+        pass
+
+    def test_ocr_queries(self):
+        processor = self.get_processor()
+        image_input = self.prepare_image_inputs()
+        inputs = processor(image_input, return_tensors="pt")
+        self.assertEqual(inputs["input_ids"].shape, (1, 286))
+        self.assertEqual(inputs["pixel_values"].shape, (1, 3, 1024, 1024))
+
+    def test_unstructured_kwargs_batched(self):
+        if "image_processor" not in self.processor_class.get_attributes():
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        processor_components = self.prepare_components()
+        processor_kwargs = self.prepare_processor_dict()
+        processor = self.processor_class(**processor_components, **processor_kwargs)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = self.prepare_text_inputs(batch_size=2, modalities="image")
+        image_input = self.prepare_image_inputs(batch_size=2)
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            do_rescale=True,
+            rescale_factor=-1.0,
+            padding="longest",
+            max_length=self.image_unstructured_max_length,
+        )
+
+        self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0)

From beea17f53e8f93eebd15e0c8a02f973a805008f4 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Wed, 18 Mar 2026 19:55:11 +0800
Subject: [PATCH 38/60] update

---
 docs/source/en/model_doc/pp_chart2table.md    |  4 +-
 .../models/auto/configuration_auto.py         |  2 +-
 .../models/pp_chart2table/__init__.py         |  1 -
 .../pp_chart2table/modular_pp_chart2table.py  | 41 -------------------
 .../test_image_processing_pp_chart2table.py   |  2 +-
 .../test_modeling_pp_chart2table.py           |  3 +-
 .../test_processing_pp_chart2table.py         |  4 +-
 utils/check_repo.py                           |  6 ---
 8 files changed, 9 insertions(+), 54 deletions(-)

diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md
index 34dabbcdfee4..9dc464d31473 100644
--- a/docs/source/en/model_doc/pp_chart2table.md
+++ b/docs/source/en/model_doc/pp_chart2table.md
@@ -49,9 +49,10 @@ pipe = pipeline(
     device_map="auto",
 )
 image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw)
+# text is empty - processor uses hardcoded "Chart to table" instruction internally via chat template
 result = pipe(
     images=image, 
-    text="", 
+    text="",
     do_sample=False, 
     max_new_tokens=256
 )
@@ -106,6 +107,7 @@ pipe = pipeline(
     device_map="auto",
 )
 image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw)
+# text is empty - processor uses hardcoded "Chart to table" instruction internally via chat template
 result = pipe(
     images=[image, image],
     text="",
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index e96476e6a08f..015187b409bb 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -358,7 +358,7 @@
         ("plbart", "PLBartConfig"),
         ("poolformer", "PoolFormerConfig"),
         ("pop2piano", "Pop2PianoConfig"),
-        ("pp_chart2table", "GotOcr2Config"),
+        ("pp_chart2table", "PPChart2TableConfig"),
         ("pp_doclayout_v2", "PPDocLayoutV2Config"),
         ("pp_doclayout_v3", "PPDocLayoutV3Config"),
         ("pp_lcnet", "PPLCNetConfig"),
diff --git a/src/transformers/models/pp_chart2table/__init__.py b/src/transformers/models/pp_chart2table/__init__.py
index 82763db99c82..411b2f54ca62 100644
--- a/src/transformers/models/pp_chart2table/__init__.py
+++ b/src/transformers/models/pp_chart2table/__init__.py
@@ -21,7 +21,6 @@
 if TYPE_CHECKING:
     from .configuration_pp_chart2table import *
     from .image_processing_pp_chart2table_fast import *
-    from .modeling_pp_chart2table import *
     from .processing_pp_chart2table import *
 else:
     import sys
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index 50e52a2772c6..b23a81f10716 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -32,10 +32,6 @@
 )
 
 
-if is_vision_available():
-    pass
-
-
 logger = logging.get_logger(__name__)
 
 
@@ -104,45 +100,8 @@ def __call__(
         return BatchFeature(data={"input_ids": input_ids, **image_inputs})
 
 
-class PPChart2TableVisionPreTrainedModel(GotOcr2PreTrainedModel):
-    pass
-
-
-class PPChart2TableVisionEncoder(GotOcr2VisionEncoder, PPChart2TableVisionPreTrainedModel):
-    pass
-
-
-@dataclass
-class PPChart2TableModelOutputWithPast(GotOcr2ModelOutputWithPast):
-    pass
-
-
-@auto_docstring
-class PPChart2TablePreTrainedModel(GotOcr2PreTrainedModel):
-    pass
-
-
-@auto_docstring
-class PPChart2TableModel(GotOcr2Model):
-    pass
-
-
-@auto_docstring(
-    custom_intro="""
-    PP-Chart2Table model for conditional generation (table text generation from chart images),
-    extending the core model with a language modeling (LM) head and generation utilities.
-    """
-)
-class PPChart2TableForConditionalGeneration(GotOcr2ForConditionalGeneration):
-    pass
-
-
 __all__ = [
-    "PPChart2TableForConditionalGeneration",
-    "PPChart2TableModel",
     "PPChart2TableConfig",
-    "PPChart2TableVisionPreTrainedModel",
-    "PPChart2TablePreTrainedModel",
     "PPChart2TableImageProcessorFast",
     "PPChart2TableProcessor",
 ]
diff --git a/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py
index f83c96227e67..46c1d0cc85f9 100644
--- a/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py
@@ -80,7 +80,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F
 
 @require_torch
 @require_vision
-class PPChart2TableProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+class PPChart2TableImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     test_slow_image_processor = False
     fast_image_processing_class = PPChart2TableImageProcessorFast if is_torchvision_available() else None
 
diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
index 2e1ee4a5d98c..04acdd35ca1d 100644
--- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
@@ -20,7 +20,7 @@
     is_torch_available,
     is_vision_available,
 )
-from transformers.testing_utils import cleanup, require_torch, slow, torch_device
+from transformers.testing_utils import cleanup, require_torch, slow, torch_device, require_vision
 
 
 if is_torch_available():
@@ -34,6 +34,7 @@
 
 
 @slow
+@require_vision
 @require_torch
 class PPChart2TableIntegrationTest(unittest.TestCase):
     def setUp(self):
diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
index ec33b19097af..1592fe4cdb25 100644
--- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
@@ -1,4 +1,4 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
+# Copyright 2026 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -27,7 +27,7 @@ class PPChart2TableProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     @classmethod
     def _setup_tokenizer(cls):
         tokenizer_class = cls._get_component_class_from_processor("tokenizer")
-        tokenizer = tokenizer_class.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table")
+        tokenizer = tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors")
         return tokenizer
 
     @unittest.skip("PPChart2TableProcessor pop the image processor output 'num_patches'")
diff --git a/utils/check_repo.py b/utils/check_repo.py
index f9187c65ce20..ace5c0542bf9 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -194,9 +194,6 @@
         "PaddleOCRVisionModel",  # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration.
         "PaddleOCRVisionTransformer",  # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration.
         "PaddleOCRTextModel",  # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration.
-        "PPChart2TableModel",  # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration.
-        "PPChart2TableVisionModel",  # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration.
-        "PPChart2TableTextModel",  # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration.
         "Qwen2VLModel",  # Building part of bigger (tested) model. Tested implicitly through Qwen2VLForConditionalGeneration.
         "Qwen2_5_VLModel",  # Building part of bigger (tested) model. Tested implicitly through Qwen2_5_VLForConditionalGeneration.
         "Qwen3VLModel",  # Building part of bigger (tested) model. Tested implicitly through Qwen3VLForConditionalGeneration.
@@ -452,9 +449,6 @@
     "PaddleOCRVisionModel",  # Building part of bigger (tested) model
     "PaddleOCRVisionTransformer",  # Building part of bigger (tested) model
     "PaddleOCRTextModel",  # Building part of bigger (tested) model
-    "PPChart2TableModel",  # Building part of bigger (tested) model
-    "PPChart2TableVisionModel",  # Building part of bigger (tested) model
-    "PPChart2TableTextModel",  # Building part of bigger (tested) model
     "Qwen2_5OmniTalkerForConditionalGeneration",  # Building part of a bigger model
     "Qwen2_5OmniTalkerModel",  # Building part of a bigger model
     "Qwen2_5OmniThinkerForConditionalGeneration",  # Building part of a bigger model

From 691558328cc550ad8b93ae89ecd04c6eac96b86f Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Wed, 18 Mar 2026 20:35:03 +0800
Subject: [PATCH 39/60] update

---
 .../configuration_pp_chart2table.py           |  1 -
 .../pp_chart2table/modular_pp_chart2table.py  | 39 +++++++++----------
 .../processing_pp_chart2table.py              | 29 ++++++++------
 3 files changed, 35 insertions(+), 34 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
index a15de72c8a39..d85c61d942b9 100644
--- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
@@ -18,7 +18,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 from huggingface_hub.dataclasses import strict
 
 from ...configuration_utils import PreTrainedConfig
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index b23a81f10716..b743e020d4a7 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from dataclasses import dataclass
-
 from huggingface_hub.dataclasses import strict
 
 from ...feature_extraction_utils import BatchFeature
@@ -21,15 +19,8 @@
 from ...image_utils import ImageInput
 from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
-from ...utils import auto_docstring, is_vision_available, logging
+from ...utils import auto_docstring, logging, requires_backends
 from ..got_ocr2.configuration_got_ocr2 import GotOcr2Config
-from ..got_ocr2.modeling_got_ocr2 import (
-    GotOcr2ForConditionalGeneration,
-    GotOcr2Model,
-    GotOcr2ModelOutputWithPast,
-    GotOcr2PreTrainedModel,
-    GotOcr2VisionEncoder,
-)
 
 
 logger = logging.get_logger(__name__)
@@ -60,12 +51,28 @@ class PPChart2TableProcessor(ProcessorMixin):
     tokenizer_class = "AutoTokenizer"
     model_input_names = ["input_ids", "pixel_values"]
 
+    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+        
+        # PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template
+        self.messages = [
+            {
+                "role": "system",
+            },
+            {
+                "role": "user",
+                "image": {"num_patches": self.image_processor.num_patches},
+            },
+        ]
+
+
     def __call__(
         self,
         images: ImageInput = None,
         text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
         **kwargs: Unpack[ProcessingKwargs],
     ) -> BatchFeature:
+        requires_backends(self, "torch")
         output_kwargs = self._merge_kwargs(
             ProcessingKwargs,
             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
@@ -75,19 +82,9 @@ def __call__(
         image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
         batch_size = image_inputs["pixel_values"].shape[0]
 
-        messages = [
-            {
-                "role": "system",
-            },
-            {
-                "role": "user",
-                "image": {"num_patches": self.image_processor.num_patches},
-            },
-        ]
-
         # Use tokenizer's apply_chat_template instead of manually loading template
         inputs = self.tokenizer.apply_chat_template(
-            messages,
+            self.messages,
             tokenize=True,
             add_generation_prompt=True,
             truncation=True,
diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
index ce8175ecd87c..e77a71c821d0 100644
--- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
@@ -23,7 +23,7 @@
 from ...image_utils import ImageInput
 from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
-from ...utils import auto_docstring
+from ...utils import auto_docstring, requires_backends
 
 
 @auto_docstring
@@ -32,12 +32,27 @@ class PPChart2TableProcessor(ProcessorMixin):
     tokenizer_class = "AutoTokenizer"
     model_input_names = ["input_ids", "pixel_values"]
 
+    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+        # PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template
+        self.messages = [
+            {
+                "role": "system",
+            },
+            {
+                "role": "user",
+                "image": {"num_patches": self.image_processor.num_patches},
+            },
+        ]
+
     def __call__(
         self,
         images: ImageInput = None,
         text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
         **kwargs: Unpack[ProcessingKwargs],
     ) -> BatchFeature:
+        requires_backends(self, "torch")
         output_kwargs = self._merge_kwargs(
             ProcessingKwargs,
             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
@@ -47,19 +62,9 @@ def __call__(
         image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
         batch_size = image_inputs["pixel_values"].shape[0]
 
-        messages = [
-            {
-                "role": "system",
-            },
-            {
-                "role": "user",
-                "image": {"num_patches": self.image_processor.num_patches},
-            },
-        ]
-
         # Use tokenizer's apply_chat_template instead of manually loading template
         inputs = self.tokenizer.apply_chat_template(
-            messages,
+            self.messages,
             tokenize=True,
             add_generation_prompt=True,
             truncation=True,

From 6fe075bc074f2f8fca1559b4a0109e25c385e90f Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Wed, 18 Mar 2026 20:57:56 +0800
Subject: [PATCH 40/60] update

---
 .../models/pp_chart2table/modular_pp_chart2table.py            | 3 +--
 tests/models/pp_chart2table/test_modeling_pp_chart2table.py    | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index b743e020d4a7..a300f0f0eb6e 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -53,7 +53,7 @@ class PPChart2TableProcessor(ProcessorMixin):
 
     def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
         super().__init__(image_processor, tokenizer, chat_template=chat_template)
-        
+
         # PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template
         self.messages = [
             {
@@ -65,7 +65,6 @@ def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **k
             },
         ]
 
-
     def __call__(
         self,
         images: ImageInput = None,
diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
index 04acdd35ca1d..32b9fff60c38 100644
--- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
@@ -20,7 +20,7 @@
     is_torch_available,
     is_vision_available,
 )
-from transformers.testing_utils import cleanup, require_torch, slow, torch_device, require_vision
+from transformers.testing_utils import cleanup, require_torch, require_vision, slow, torch_device
 
 
 if is_torch_available():

From 86e9ec51746d6eeb17e032fcee9f9b8d337916d4 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Wed, 18 Mar 2026 22:52:51 +0800
Subject: [PATCH 41/60] update

---
 .../models/pp_chart2table/modular_pp_chart2table.py  |  2 ++
 .../pp_chart2table/processing_pp_chart2table.py      |  2 ++
 .../pp_chart2table/test_processing_pp_chart2table.py | 12 ++++++++++++
 3 files changed, 16 insertions(+)

diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index a300f0f0eb6e..4fe5082fdecf 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -78,6 +78,8 @@ def __call__(
             **kwargs,
         )
 
+        if images is None:
+            raise ValueError("At least one of `images` must be provided")
         image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
         batch_size = image_inputs["pixel_values"].shape[0]
 
diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
index e77a71c821d0..ad70b5edc282 100644
--- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
@@ -59,6 +59,8 @@ def __call__(
             **kwargs,
         )
 
+        if images is None:
+            raise ValueError("At least one of `images` must be provided")
         image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
         batch_size = image_inputs["pixel_values"].shape[0]
 
diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
index 1592fe4cdb25..bd4592c18746 100644
--- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
@@ -62,3 +62,15 @@ def test_unstructured_kwargs_batched(self):
         )
 
         self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0)
+
+    @unittest.skip(reason="PPChart2Table image input cannot be None")
+    def test_apply_chat_template_assistant_mask(self):
+        pass
+
+    @unittest.skip(reason="PPChart2Table image input cannot be None")
+    def test_apply_chat_template_image_0(self):
+        pass
+
+    @unittest.skip(reason="PPChart2Table image input cannot be None")
+    def test_apply_chat_template_image_1(self):
+        pass

From 6d791e70781b8c21901c92a3151931fc9fb5e238 Mon Sep 17 00:00:00 2001
From: vasqu <antonprogamer@gmail.com>
Date: Wed, 18 Mar 2026 15:53:18 +0100
Subject: [PATCH 42/60] small fixes

---
 .../pp_chart2table/modeling_pp_chart2table.py | 805 ------------------
 .../pp_chart2table/modular_pp_chart2table.py  |   5 +-
 .../processing_pp_chart2table.py              |   5 +-
 3 files changed, 6 insertions(+), 809 deletions(-)
 delete mode 100644 src/transformers/models/pp_chart2table/modeling_pp_chart2table.py

diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
deleted file mode 100644
index f13f04e62de6..000000000000
--- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py
+++ /dev/null
@@ -1,805 +0,0 @@
-#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-#           This file was automatically generated from src/transformers/models/pp_chart2table/modular_pp_chart2table.py.
-#               Do NOT edit this file manually as any edits will be overwritten by the generation of
-#             the file from the modular. If any change should be done, please apply the change to the
-#                          modular_pp_chart2table.py file directly. One of our CI enforces this.
-#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-# Copyright 2026 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import collections
-from dataclasses import dataclass
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from ... import initialization as init
-from ...activations import ACT2FN
-from ...cache_utils import Cache
-from ...generation import GenerationMixin
-from ...modeling_layers import GradientCheckpointingLayer
-from ...modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling, ModelOutput
-from ...modeling_utils import PreTrainedModel
-from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, torch_compilable_check
-from ...utils.generic import merge_with_config_defaults
-from ...utils.output_capturing import capture_outputs
-from ..auto import AutoModel
-from .configuration_pp_chart2table import PPChart2TableConfig, PPChart2TableVisionConfig
-
-
-class PPChart2TableVisionAttention(nn.Module):
-    """Multi-head Attention block with relative position embeddings."""
-
-    def __init__(self, config, window_size):
-        super().__init__()
-        input_size = (
-            (config.image_size // config.patch_size, config.image_size // config.patch_size)
-            if window_size == 0
-            else (window_size, window_size)
-        )
-
-        self.num_attention_heads = config.num_attention_heads
-        head_dim = config.hidden_size // config.num_attention_heads
-        self.scale = head_dim**-0.5
-        self.dropout = config.attention_dropout
-
-        self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.qkv_bias)
-        self.proj = nn.Linear(config.hidden_size, config.hidden_size)
-
-        self.use_rel_pos = config.use_rel_pos
-        if self.use_rel_pos:
-            if input_size is None:
-                raise ValueError("Input size must be provided if using relative positional encoding.")
-
-            # initialize relative positional embeddings
-            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
-            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
-
-    def get_rel_pos(self, q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
-        """
-        Get relative positional embeddings according to the relative positions of
-            query and key sizes.
-
-        Args:
-            q_size (int):
-                size of the query.
-            k_size (int):
-                size of key k.
-            rel_pos (`torch.Tensor`):
-                relative position embeddings (L, channel).
-
-        Returns:
-            Extracted positional embeddings according to relative positions.
-        """
-        max_rel_dist = int(2 * max(q_size, k_size) - 1)
-        # Interpolate rel pos.
-        rel_pos_resized = F.interpolate(
-            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
-            size=max_rel_dist,
-            mode="linear",
-        )
-        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
-
-        # Scale the coords with short length if shapes for q and k are different.
-        q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
-        k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
-        relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
-
-        return rel_pos_resized[relative_coords.long()]
-
-    def get_decomposed_rel_pos(
-        self,
-        query: torch.Tensor,
-        rel_pos_h: torch.Tensor,
-        rel_pos_w: torch.Tensor,
-        q_size: tuple[int, int],
-        k_size: tuple[int, int],
-    ) -> torch.Tensor:
-        """
-        Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
-        https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py
-
-        Args:
-            query (`torch.Tensor`):
-                query q in the attention layer with shape (batch_size, query_height * query_width, channel).
-            rel_pos_h (`torch.Tensor`):
-                relative position embeddings (Lh, channel) for height axis.
-            rel_pos_w (`torch.Tensor`):
-                relative position embeddings (Lw, channel) for width axis.
-            q_size (tuple):
-                spatial sequence size of query q with (query_height, query_width).
-            k_size (tuple):
-                spatial sequence size of key k with (key_height, key_width).
-
-        Returns:
-            decomposed_rel_pos (`torch.Tensor`):
-                decomposed relative position embeddings.
-        """
-        query_height, query_width = q_size
-        key_height, key_width = k_size
-        relative_position_height = self.get_rel_pos(query_height, key_height, rel_pos_h)
-        relative_position_width = self.get_rel_pos(query_width, key_width, rel_pos_w)
-
-        batch_size, _, dim = query.shape
-        reshaped_query = query.reshape(batch_size, query_height, query_width, dim)
-        rel_h = torch.einsum("bhwc,hkc->bhwk", reshaped_query, relative_position_height)
-        rel_w = torch.einsum("bhwc,wkc->bhwk", reshaped_query, relative_position_width)
-
-        decomposed_rel_pos = rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
-
-        return decomposed_rel_pos
-
-    def forward(self, hidden_states: torch.Tensor, output_attentions=None) -> tuple[torch.Tensor, torch.Tensor]:
-        batch_size, height, width, _ = hidden_states.shape
-        # qkv with shape (3, batch_size, nHead, height * width, channel)
-        qkv = (
-            self.qkv(hidden_states)
-            .reshape(batch_size, height * width, 3, self.num_attention_heads, -1)
-            .permute(2, 0, 3, 1, 4)
-        )
-        # q, k, v with shape (batch_size * nHead, height * width, channel)
-        query, key, value = qkv.reshape(3, batch_size * self.num_attention_heads, height * width, -1).unbind(0)
-
-        attn_weights = (query * self.scale) @ key.transpose(-2, -1)
-
-        if self.use_rel_pos:
-            decomposed_rel_pos = self.get_decomposed_rel_pos(
-                query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
-            )
-            decomposed_rel_pos = decomposed_rel_pos.reshape_as(attn_weights)
-            attn_weights = attn_weights + decomposed_rel_pos
-
-        attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype)
-
-        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
-
-        attn_output = (attn_probs @ value).reshape(batch_size, self.num_attention_heads, height, width, -1)
-        attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, height, width, -1)
-
-        attn_output = self.proj(attn_output)
-        return attn_output, attn_weights
-
-
-@auto_docstring
-class PPChart2TableVisionPreTrainedModel(PreTrainedModel):
-    config: PPChart2TableConfig
-    base_model_prefix = "model"
-    input_modalities = ("image", "text")
-    supports_gradient_checkpointing = True
-    _skip_keys_device_placement = "past_key_values"
-    _supports_flash_attn = False
-    _supports_sdpa = False
-
-    _can_compile_fullgraph = True
-    _supports_flex_attn = False
-    _supports_attention_backend = True
-
-    @torch.no_grad()
-    def _init_weights(self, module):
-        super()._init_weights(module)
-        if isinstance(module, PPChart2TableVisionAttention):
-            if module.use_rel_pos:
-                init.zeros_(module.rel_pos_h)
-                init.zeros_(module.rel_pos_w)
-        elif isinstance(module, PPChart2TableVisionEncoder):
-            if module.pos_embed is not None:
-                init.zeros_(module.pos_embed)
-
-
-class PPChart2TableMLPBlock(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.lin1 = nn.Linear(config.hidden_size, config.mlp_dim)
-        self.lin2 = nn.Linear(config.mlp_dim, config.hidden_size)
-        self.act = ACT2FN[config.hidden_act]
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        hidden_states = self.lin1(hidden_states)
-        hidden_states = self.act(hidden_states)
-        hidden_states = self.lin2(hidden_states)
-        return hidden_states
-
-
-class PPChart2TableVisionLayer(GradientCheckpointingLayer):
-    def __init__(self, config, window_size):
-        super().__init__()
-        self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.attn = PPChart2TableVisionAttention(config, window_size)
-        self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.mlp = PPChart2TableMLPBlock(config)
-        self.window_size = window_size
-
-    def window_partition(self, hidden_states: torch.Tensor, window_size: int) -> tuple[torch.Tensor, tuple[int, int]]:
-        """
-        Args:
-        Partition into non-overlapping windows with padding if needed.
-            hidden_states (tensor): input tokens with [batch_size, height, width, channel]. window_size (int): window
-            size.
-
-        Returns:
-            windows: windows after partition with [batch_size * num_windows, window_size, window_size, channel].
-            (pad_height, pad_width): padded height and width before partition
-        """
-        batch_size, height, width, channel = hidden_states.shape
-
-        pad_h = (window_size - height % window_size) % window_size
-        pad_w = (window_size - width % window_size) % window_size
-        hidden_states = F.pad(hidden_states, (0, 0, 0, pad_w, 0, pad_h))
-        pad_height, pad_width = height + pad_h, width + pad_w
-
-        hidden_states = hidden_states.reshape(
-            batch_size, pad_height // window_size, window_size, pad_width // window_size, window_size, channel
-        )
-        windows = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().reshape(-1, window_size, window_size, channel)
-        return windows, (pad_height, pad_width)
-
-    def window_unpartition(
-        self, windows: torch.Tensor, window_size: int, padding_shape: tuple[int, int], original_shape: tuple[int, int]
-    ) -> torch.Tensor:
-        """
-        Args:
-        Window unpartition into original sequences and removing padding.
-            hidden_states (tensor):
-                input tokens with [batch_size * num_windows, window_size, window_size, channel].
-            window_size (int):
-                window size.
-            padding_shape (Tuple):
-                padded height and width (pad_height, pad_width).
-            original_shape (Tuple): original height and width (height, width) before padding.
-
-        Returns:
-            hidden_states: unpartitioned sequences with [batch_size, height, width, channel].
-        """
-        pad_height, pad_width = padding_shape
-        height, width = original_shape
-        batch_size = windows.shape[0] // (pad_height * pad_width // window_size // window_size)
-        hidden_states = windows.reshape(
-            batch_size, pad_height // window_size, pad_width // window_size, window_size, window_size, -1
-        )
-        hidden_states = (
-            hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().reshape(batch_size, pad_height, pad_width, -1)
-        )
-
-        hidden_states = hidden_states[:, :height, :width, :].contiguous()
-        return hidden_states
-
-    def forward(self, hidden_states: torch.Tensor) -> tuple[torch.FloatTensor]:
-        residual = hidden_states
-        hidden_states = self.layer_norm1(hidden_states)
-        # Window partition
-        if self.window_size > 0:
-            height, width = hidden_states.shape[1], hidden_states.shape[2]
-            hidden_states, padding_shape = self.window_partition(hidden_states, self.window_size)
-
-        hidden_states, attn_weights = self.attn(
-            hidden_states=hidden_states,
-        )
-        # Reverse window partition
-        if self.window_size > 0:
-            hidden_states = self.window_unpartition(hidden_states, self.window_size, padding_shape, (height, width))
-
-        hidden_states = residual + hidden_states
-        layernorm_output = self.layer_norm2(hidden_states)
-        hidden_states = hidden_states + self.mlp(layernorm_output)
-        return hidden_states
-
-
-@dataclass
-@auto_docstring(
-    custom_intro="""
-    Base class for p_p_chart2_table vision model's outputs that also contains image embeddings obtained by applying the projection
-    layer to the pooler_output.
-    """
-)
-class PPChart2TableVisionEncoderOutput(ModelOutput):
-    r"""
-    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
-        The image embeddings obtained by applying the projection layer to the pooler_output.
-    """
-
-    image_embeds: torch.FloatTensor | None = None
-    last_hidden_state: torch.FloatTensor | None = None
-    hidden_states: tuple[torch.FloatTensor, ...] | None = None
-    attentions: tuple[torch.FloatTensor, ...] | None = None
-
-
-class PPChart2TablePatchEmbeddings(nn.Module):
-    """
-    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
-    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
-    Transformer.
-    """
-
-    def __init__(self, config):
-        super().__init__()
-        image_size, patch_size = config.image_size, config.patch_size
-        num_channels, hidden_size = config.num_channels, config.hidden_size
-        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
-        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
-        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.num_patches = num_patches
-
-        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
-
-    def forward(self, pixel_values):
-        batch_size, num_channels, height, width = pixel_values.shape
-        if num_channels != self.num_channels:
-            raise ValueError(
-                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
-            )
-        if height != self.image_size[0] or width != self.image_size[1]:
-            raise ValueError(
-                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
-            )
-        embeddings = self.projection(pixel_values).permute(0, 2, 3, 1)
-        return embeddings
-
-
-class PPChart2TableLayerNorm(nn.LayerNorm):
-    r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
-    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
-    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
-    """
-
-    def __init__(self, normalized_shape, *, eps=1e-6, data_format="channels_last", **kwargs):
-        super().__init__(normalized_shape, eps=eps, **kwargs)
-        if data_format not in ["channels_last", "channels_first"]:
-            raise NotImplementedError(f"Unsupported data format: {data_format}")
-        self.data_format = data_format
-
-    def forward(self, features: torch.Tensor) -> torch.Tensor:
-        """
-        Args:
-            features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels)
-        """
-        if self.data_format == "channels_first":
-            features = features.permute(0, 2, 3, 1)
-            features = super().forward(features)
-            features = features.permute(0, 3, 1, 2)
-        else:
-            features = super().forward(features)
-        return features
-
-
-class PPChart2TableVisionNeck(nn.Module):
-    def __init__(self, config: PPChart2TableVisionConfig):
-        super().__init__()
-        self.config = config
-
-        self.conv1 = nn.Conv2d(config.hidden_size, config.output_channels, kernel_size=1, bias=False)
-        self.layer_norm1 = PPChart2TableLayerNorm(config.output_channels, data_format="channels_first")
-        self.conv2 = nn.Conv2d(config.output_channels, config.output_channels, kernel_size=3, padding=1, bias=False)
-        self.layer_norm2 = PPChart2TableLayerNorm(config.output_channels, data_format="channels_first")
-
-    def forward(self, hidden_states):
-        hidden_states = hidden_states.permute(0, 3, 1, 2)
-        hidden_states = self.conv1(hidden_states)
-        hidden_states = self.layer_norm1(hidden_states)
-
-        hidden_states = self.conv2(hidden_states)
-        hidden_states = self.layer_norm2(hidden_states)
-        return hidden_states
-
-
-class PPChart2TableVisionEncoder(PPChart2TableVisionPreTrainedModel):
-    _can_record_outputs = {"hidden_states": PPChart2TableVisionLayer, "attentions": PPChart2TableVisionAttention}
-    input_modalities = ("image",)
-
-    def __init__(self, config: PPChart2TableVisionConfig):
-        super().__init__(config)
-        self.config = config
-        self.image_size = config.image_size
-        self.patch_embed = PPChart2TablePatchEmbeddings(config)
-
-        self.pos_embed = None
-        if config.use_abs_pos:
-            # Initialize absolute positional embedding with pretrain image size.
-            self.pos_embed = nn.Parameter(
-                torch.zeros(
-                    1,
-                    config.image_size // config.patch_size,
-                    config.image_size // config.patch_size,
-                    config.hidden_size,
-                )
-            )
-
-        self.layers = nn.ModuleList()
-        for i in range(config.num_hidden_layers):
-            layer = PPChart2TableVisionLayer(
-                config,
-                window_size=config.window_size if i not in config.global_attn_indexes else 0,
-            )
-            self.layers.append(layer)
-
-        self.neck = PPChart2TableVisionNeck(config)
-
-        self.gradient_checkpointing = False
-        self.post_init()
-
-    def get_input_embeddings(self):
-        return self.patch_embed
-
-    @merge_with_config_defaults
-    @capture_outputs(tie_last_hidden_states=False)
-    def forward(
-        self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | PPChart2TableVisionEncoderOutput:
-        if pixel_values is None:
-            raise ValueError("You have to specify pixel_values")
-
-        hidden_states = self.patch_embed(pixel_values)
-        if self.pos_embed is not None:
-            hidden_states = hidden_states + self.pos_embed
-        for layer_module in self.layers:
-            hidden_states = layer_module(hidden_states)
-        hidden_states = self.neck(hidden_states)
-        return PPChart2TableVisionEncoderOutput(
-            last_hidden_state=hidden_states,
-        )
-
-
-@dataclass
-class PPChart2TableModelOutputWithPast(BaseModelOutputWithPast):
-    r"""
-    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
-
-        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
-        `past_key_values` input) to speed up sequential decoding.
-    image_hidden_states (`torch.FloatTensor`, *optional*):
-        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
-        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
-    """
-
-    image_hidden_states: torch.FloatTensor | None = None
-
-
-@auto_docstring
-class PPChart2TablePreTrainedModel(PreTrainedModel):
-    config: PPChart2TableConfig
-    base_model_prefix = "model"
-    input_modalities = ("image", "text")
-    supports_gradient_checkpointing = True
-    _skip_keys_device_placement = "past_key_values"
-    _supports_flash_attn = False
-    _supports_sdpa = False
-
-    _can_compile_fullgraph = True
-    _supports_flex_attn = False
-    _supports_attention_backend = True
-
-    @torch.no_grad()
-    def _init_weights(self, module):
-        super()._init_weights(module)
-        if isinstance(module, PPChart2TableVisionAttention):
-            if module.use_rel_pos:
-                init.zeros_(module.rel_pos_h)
-                init.zeros_(module.rel_pos_w)
-        elif isinstance(module, PPChart2TableVisionEncoder):
-            if module.pos_embed is not None:
-                init.zeros_(module.pos_embed)
-
-
-class PPChart2TableMultiModalProjector(nn.Module):
-    def __init__(self, config: PPChart2TableConfig):
-        super().__init__()
-        vision_output_channels = config.vision_config.output_channels
-        language_hidden_size = config.text_config.hidden_size
-        self.conv_upsampler1 = nn.Conv2d(
-            vision_output_channels, vision_output_channels * 2, kernel_size=3, stride=2, padding=1, bias=False
-        )
-        self.conv_upsampler2 = nn.Conv2d(
-            vision_output_channels * 2, language_hidden_size, kernel_size=3, stride=2, padding=1, bias=False
-        )
-        self.multimodal_projector = nn.Linear(language_hidden_size, language_hidden_size)
-
-    def forward(self, vision_embeddings: torch.Tensor) -> torch.Tensor:
-        hidden_state = self.conv_upsampler1(vision_embeddings)
-        hidden_state = self.conv_upsampler2(hidden_state)
-        hidden_state = hidden_state.flatten(2).permute(0, 2, 1)
-        hidden_state = self.multimodal_projector(hidden_state)
-        return hidden_state
-
-
-@auto_docstring
-class PPChart2TableModel(PPChart2TablePreTrainedModel):
-    def __init__(self, config: PPChart2TableConfig):
-        super().__init__(config)
-        self.vision_tower = PPChart2TableVisionEncoder(config.vision_config)
-
-        self.multi_modal_projector = PPChart2TableMultiModalProjector(config)
-        self.language_model = AutoModel.from_config(config.text_config)
-        self.post_init()
-
-    def get_input_embeddings(self):
-        return self.language_model.get_input_embeddings()
-
-    def set_input_embeddings(self, value):
-        self.language_model.set_input_embeddings(value)
-
-    @can_return_tuple
-    @auto_docstring(
-        custom_intro="Obtains image last hidden states from the vision tower and apply multimodal projection."
-    )
-    def get_image_features(
-        self,
-        pixel_values: torch.FloatTensor,
-        **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
-        image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs)
-        last_hidden_state = image_outputs.last_hidden_state
-        image_outputs.pooler_output = self.multi_modal_projector(last_hidden_state)
-
-        return image_outputs
-
-    def get_placeholder_mask(
-        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
-    ):
-        """
-        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
-        equal to the length of multimodal features. If the lengths are different, an error is raised.
-        """
-        if input_ids is None:
-            special_image_mask = inputs_embeds == self.get_input_embeddings()(
-                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
-            )
-            special_image_mask = special_image_mask.all(-1)
-        else:
-            special_image_mask = input_ids == self.config.image_token_id
-
-        n_image_tokens = special_image_mask.sum()
-        n_image_features = image_features.shape[0] * image_features.shape[1]
-        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
-        torch_compilable_check(
-            inputs_embeds[special_image_mask].numel() == image_features.numel(),
-            f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}",
-        )
-        return special_image_mask
-
-    @can_return_tuple
-    @auto_docstring
-    def forward(
-        self,
-        input_ids: torch.LongTensor | None = None,
-        pixel_values: torch.FloatTensor | None = None,
-        attention_mask: torch.Tensor | None = None,
-        position_ids: torch.LongTensor | None = None,
-        past_key_values: Cache | None = None,
-        inputs_embeds: torch.FloatTensor | None = None,
-        use_cache: bool | None = None,
-        **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | PPChart2TableModelOutputWithPast:
-        if (input_ids is None) ^ (inputs_embeds is not None):
-            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
-
-        if inputs_embeds is None:
-            inputs_embeds = self.get_input_embeddings()(input_ids)
-
-        if pixel_values is not None:
-            image_features = self.get_image_features(
-                pixel_values=pixel_values.to(inputs_embeds.dtype), return_dict=True
-            ).pooler_output
-            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
-            special_image_mask = self.get_placeholder_mask(
-                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
-            )
-            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
-
-        outputs = self.language_model(
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            return_dict=True,
-            **kwargs,
-        )
-
-        return PPChart2TableModelOutputWithPast(
-            last_hidden_state=outputs.last_hidden_state,
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-            image_hidden_states=image_features if pixel_values is not None else None,
-        )
-
-
-@dataclass
-@auto_docstring(
-    custom_intro="""
-    Base class for PPChart2Table causal language model (or autoregressive) outputs.
-    """
-)
-class PPChart2TableCausalLMOutputWithPast(ModelOutput):
-    r"""
-    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
-        Language modeling loss (for next-token prediction).
-    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
-        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
-
-        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
-        `past_key_values` input) to speed up sequential decoding.
-    image_hidden_states (`torch.FloatTensor`, *optional*):
-        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
-        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
-    """
-
-    loss: torch.FloatTensor | None = None
-    logits: torch.FloatTensor | None = None
-    past_key_values: Cache | None = None
-    hidden_states: tuple[torch.FloatTensor] | None = None
-    attentions: tuple[torch.FloatTensor] | None = None
-    image_hidden_states: torch.FloatTensor | None = None
-
-
-@auto_docstring(
-    custom_intro="""
-    PP-Chart2Table model for conditional generation (table text generation from chart images),
-    extending the core model with a language modeling (LM) head and generation utilities.
-    """
-)
-class PPChart2TableForConditionalGeneration(PPChart2TablePreTrainedModel, GenerationMixin):
-    _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"}
-
-    def __init__(self, config: PPChart2TableConfig):
-        super().__init__(config)
-        self.model = PPChart2TableModel(config)
-        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
-        self.post_init()
-
-    def get_input_embeddings(self):
-        return self.model.get_input_embeddings()
-
-    def set_input_embeddings(self, value):
-        self.model.set_input_embeddings(value)
-
-    def get_output_embeddings(self) -> nn.Module:
-        return self.lm_head
-
-    @auto_docstring
-    def get_image_features(
-        self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
-        return self.model.get_image_features(pixel_values=pixel_values, **kwargs)
-
-    @can_return_tuple
-    @auto_docstring
-    def forward(
-        self,
-        input_ids: torch.LongTensor | None = None,
-        pixel_values: torch.FloatTensor | None = None,
-        attention_mask: torch.Tensor | None = None,
-        position_ids: torch.LongTensor | None = None,
-        past_key_values: Cache | None = None,
-        inputs_embeds: torch.FloatTensor | None = None,
-        labels: torch.LongTensor | None = None,
-        use_cache: bool | None = None,
-        logits_to_keep: int | torch.Tensor = 0,
-        **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | PPChart2TableCausalLMOutputWithPast:
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-        Example:
-
-        ```python
-        >>> from PIL import Image
-        >>> import httpx
-        >>> from io import BytesIO
-        >>> from transformers import AutoProcessor, PPChart2TableForConditionalGeneration, TextStreamer
-
-        >>> model = PPChart2TableForConditionalGeneration.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf").to("cuda")
-        >>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
-
-        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png"
-        >>> with httpx.stream("GET", url) as response:
-        ...     image = Image.open(BytesIO(response.read()))
-
-        >>> inputs = processor(image, return_tensors="pt", color="green").to("cuda")
-
-        >>> # Generate
-        >>> streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
-        >>> generate_ids = model.generate(
-        ...     **inputs,
-        ...     do_sample=False,
-        ...     tokenizer = processor.tokenizer,
-        ...     stop_strings='<|im_end|>',
-        ...     streamer=streamer,
-        ...     max_new_tokens=4096,
-        ... )
-        "You should keep in mind what features from the module should be used, especially
-        when you're planning to sell a template."
-        ```"""
-        outputs = self.model(
-            input_ids=input_ids,
-            pixel_values=pixel_values,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            return_dict=True,
-            logits_to_keep=logits_to_keep,
-            **kwargs,
-        )
-
-        hidden_states = outputs[0]
-        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
-        logits = self.lm_head(hidden_states[:, slice_indices, :])
-
-        loss = None
-        if labels is not None:
-            loss = self.loss_function(
-                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
-            )
-
-        return PPChart2TableCausalLMOutputWithPast(
-            loss=loss,
-            logits=logits,
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-            image_hidden_states=outputs.image_hidden_states,
-        )
-
-    def prepare_inputs_for_generation(
-        self,
-        input_ids,
-        past_key_values=None,
-        inputs_embeds=None,
-        pixel_values=None,
-        attention_mask=None,
-        logits_to_keep=None,
-        is_first_iteration=False,
-        **kwargs,
-    ):
-        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
-
-        model_inputs = super().prepare_inputs_for_generation(
-            input_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            attention_mask=attention_mask,
-            logits_to_keep=logits_to_keep,
-            is_first_iteration=is_first_iteration,
-            **kwargs,
-        )
-
-        if is_first_iteration or not kwargs.get("use_cache", True):
-            # Pixel values are used only in the first iteration if available
-            # In subsequent iterations, they are already merged with text and cached
-            # NOTE: first iteration doesn't have to be prefill, it can be the first
-            # iteration with a question and cached system prompt (continue generate from cache)
-            model_inputs["pixel_values"] = pixel_values
-
-        return model_inputs
-
-
-__all__ = [
-    "PPChart2TableForConditionalGeneration",
-    "PPChart2TableModel",
-    "PPChart2TableVisionPreTrainedModel",
-    "PPChart2TablePreTrainedModel",
-]
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index 4fe5082fdecf..b6333b5c4547 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -19,7 +19,8 @@
 from ...image_utils import ImageInput
 from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
-from ...utils import auto_docstring, logging, requires_backends
+from ...utils import auto_docstring, logging
+from ...utils.import_utils import requires
 from ..got_ocr2.configuration_got_ocr2 import GotOcr2Config
 
 
@@ -46,6 +47,7 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
 
 
 @auto_docstring
+@requires(backends=("torch",))
 class PPChart2TableProcessor(ProcessorMixin):
     image_processor_class = "AutoImageProcessor"
     tokenizer_class = "AutoTokenizer"
@@ -71,7 +73,6 @@ def __call__(
         text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
         **kwargs: Unpack[ProcessingKwargs],
     ) -> BatchFeature:
-        requires_backends(self, "torch")
         output_kwargs = self._merge_kwargs(
             ProcessingKwargs,
             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
index ad70b5edc282..ed0ea50f4de5 100644
--- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
@@ -23,10 +23,12 @@
 from ...image_utils import ImageInput
 from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
-from ...utils import auto_docstring, requires_backends
+from ...utils import auto_docstring
+from ...utils.import_utils import requires
 
 
 @auto_docstring
+@requires(backends=("torch",))
 class PPChart2TableProcessor(ProcessorMixin):
     image_processor_class = "AutoImageProcessor"
     tokenizer_class = "AutoTokenizer"
@@ -52,7 +54,6 @@ def __call__(
         text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
         **kwargs: Unpack[ProcessingKwargs],
     ) -> BatchFeature:
-        requires_backends(self, "torch")
         output_kwargs = self._merge_kwargs(
             ProcessingKwargs,
             tokenizer_init_kwargs=self.tokenizer.init_kwargs,

From 8e201c1516bd2c8c367b9ba2e9e209203e5fa920 Mon Sep 17 00:00:00 2001
From: vasqu <antonprogamer@gmail.com>
Date: Wed, 18 Mar 2026 15:55:13 +0100
Subject: [PATCH 43/60] more explicit skip msg

---
 .../models/pp_chart2table/test_processing_pp_chart2table.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
index bd4592c18746..9bc3c174e331 100644
--- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
@@ -63,14 +63,14 @@ def test_unstructured_kwargs_batched(self):
 
         self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0)
 
-    @unittest.skip(reason="PPChart2Table image input cannot be None")
+    @unittest.skip(reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None")
     def test_apply_chat_template_assistant_mask(self):
         pass
 
-    @unittest.skip(reason="PPChart2Table image input cannot be None")
+    @unittest.skip(reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None")
     def test_apply_chat_template_image_0(self):
         pass
 
-    @unittest.skip(reason="PPChart2Table image input cannot be None")
+    @unittest.skip(reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None")
     def test_apply_chat_template_image_1(self):
         pass

From 787e65094fa2c352da9433f73bffb9fdc4027e96 Mon Sep 17 00:00:00 2001
From: vasqu <antonprogamer@gmail.com>
Date: Wed, 18 Mar 2026 16:01:54 +0100
Subject: [PATCH 44/60] some quick fixes

---
 docs/source/en/model_doc/pp_chart2table.md           |  2 +-
 .../pp_chart2table/test_processing_pp_chart2table.py | 12 +++++++++---
 utils/check_config_attributes.py                     |  3 +++
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md
index 9dc464d31473..02ff7751e68d 100644
--- a/docs/source/en/model_doc/pp_chart2table.md
+++ b/docs/source/en/model_doc/pp_chart2table.md
@@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.
 
 -->
-*This model was released on 2025-05-20 and added to Hugging Face Transformers on 2026-03-16.*
+*This model was released on 2025-05-20 and added to Hugging Face Transformers on 2026-03-18.*
 
 # PP-Chart2Table
 
diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
index 9bc3c174e331..cd6b7e64f7bf 100644
--- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
@@ -63,14 +63,20 @@ def test_unstructured_kwargs_batched(self):
 
         self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0)
 
-    @unittest.skip(reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None")
+    @unittest.skip(
+        reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None"
+    )
     def test_apply_chat_template_assistant_mask(self):
         pass
 
-    @unittest.skip(reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None")
+    @unittest.skip(
+        reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None"
+    )
     def test_apply_chat_template_image_0(self):
         pass
 
-    @unittest.skip(reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None")
+    @unittest.skip(
+        reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None"
+    )
     def test_apply_chat_template_image_1(self):
         pass
diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py
index 01b9a3c7ecb7..b1e968c71f56 100644
--- a/utils/check_config_attributes.py
+++ b/utils/check_config_attributes.py
@@ -138,6 +138,9 @@
     "GptOssConfig": True,
     "LwDetrConfig": True,
     "NemotronHConfig": True,
+    # Internally uses Got Ocr2 so no need to use in the modeling code as we remap in auto instead
+    "PPChart2TableConfig": True,
+    "PPChart2TableVisionConfig": True,
 }
 
 # Common and important attributes, even if they do not always appear in the modeling files (can be a regex pattern)

From 7280cf56b75b212c5d72bdac15488be003b1df2e Mon Sep 17 00:00:00 2001
From: vasqu <antonprogamer@gmail.com>
Date: Wed, 18 Mar 2026 16:14:58 +0100
Subject: [PATCH 45/60] fix

---
 docs/source/en/model_doc/pp_chart2table.md       |  7 -------
 .../configuration_pp_chart2table.py              |  6 +++---
 .../pp_chart2table/modular_pp_chart2table.py     | 16 ++++++++++++++++
 .../test_modeling_pp_chart2table.py              | 10 ++--------
 4 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md
index 02ff7751e68d..2f7c43e429dd 100644
--- a/docs/source/en/model_doc/pp_chart2table.md
+++ b/docs/source/en/model_doc/pp_chart2table.md
@@ -145,13 +145,6 @@ print(result)
 </hfoption>
 </hfoptions>
 
-## PPChart2TableForConditionalGeneration
-
-[[autodoc]] PPChart2TableForConditionalGeneration
-
-## PPChart2TableModel
-
-[[autodoc]] PPChart2TableModel
 
 ## PPChart2TableConfig
 
diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
index d85c61d942b9..3e85e2c96667 100644
--- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py
@@ -70,13 +70,13 @@ class PPChart2TableConfig(PreTrainedConfig):
     Example:
 
     ```python
-    >>> from transformers import PPChart2TableForConditionalGeneration, PPChart2TableConfig
+    >>> from transformers import GotOcr2ForConditionalGeneration, PPChart2TableConfig
 
     >>> # Initializing a PPChart2Table style configuration
     >>> configuration = PPChart2TableConfig()
 
-    >>> # Initializing a model from the Qwen2-VL-7B style configuration
-    >>> model = PPChart2TableForConditionalGeneration(configuration)
+    >>> # Initializing a model from the PaddlePaddle/PP-Chart2Table_safetensors style configuration
+    >>> model = GotOcr2ForConditionalGeneration(configuration)  # underlying architecture is Got Ocr 2
 
     >>> # Accessing the model configuration
     >>> configuration = model.config
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index b6333b5c4547..fd8f35ddaa92 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -32,6 +32,22 @@
 class PPChart2TableConfig(GotOcr2Config):
     model_type = "pp_chart2table"
 
+    r"""
+    Example:
+
+    ```python
+    >>> from transformers import GotOcr2ForConditionalGeneration, PPChart2TableConfig
+
+    >>> # Initializing a PPChart2Table style configuration
+    >>> configuration = PPChart2TableConfig()
+
+    >>> # Initializing a model from the PaddlePaddle/PP-Chart2Table_safetensors style configuration
+    >>> model = GotOcr2ForConditionalGeneration(configuration)  # underlying architecture is Got Ocr 2
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
 
 @auto_docstring
 class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
index 32b9fff60c38..04977cb8b7c5 100644
--- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
@@ -16,19 +16,13 @@
 import unittest
 
 from transformers import (
+    AutoModelForImageTextToText,
     AutoProcessor,
-    is_torch_available,
     is_vision_available,
 )
 from transformers.testing_utils import cleanup, require_torch, require_vision, slow, torch_device
 
 
-if is_torch_available():
-    from transformers import (
-        PPChart2TableForConditionalGeneration,
-    )
-
-
 if is_vision_available():
     from transformers.image_utils import load_image
 
@@ -39,7 +33,7 @@
 class PPChart2TableIntegrationTest(unittest.TestCase):
     def setUp(self):
         model_path = "PaddlePaddle/PP-Chart2Table_safetensors"
-        self.model = PPChart2TableForConditionalGeneration.from_pretrained(model_path).to(torch_device)
+        self.model = AutoModelForImageTextToText.from_pretrained(model_path).to(torch_device)
         self.processor = AutoProcessor.from_pretrained(model_path)
         url = "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png"
         self.image = load_image(url)

From 957220ae2494bebe2269b4847ac305325b009d64 Mon Sep 17 00:00:00 2001
From: vasqu <antonprogamer@gmail.com>
Date: Wed, 18 Mar 2026 16:36:59 +0100
Subject: [PATCH 46/60] quick cleanups

---
 .../image_processing_pp_chart2table.py        | 33 +++++++++++++++++++
 .../image_processing_pp_chart2table_fast.py   |  2 ++
 .../pp_chart2table/modular_pp_chart2table.py  | 17 ++++++++--
 .../processing_pp_chart2table.py              |  2 --
 4 files changed, 49 insertions(+), 5 deletions(-)
 create mode 100644 src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py

diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
new file mode 100644
index 000000000000..c94d4bc5557c
--- /dev/null
+++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
@@ -0,0 +1,33 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/pp_chart2table/modular_pp_chart2table.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_pp_chart2table.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2026 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...processing_utils import ImagesKwargs
+
+
+class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False):
+    r"""
+    patch_size (`int`, *optional*, defaults to `16`):
+        The expected patch size out of the image processor.
+    num_patches (`int`, *optional*, defaults to `16`):
+        Alias for `patch_size`.
+    """
+
+    patch_size: int
+    num_patches: int
diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
index ef857db5b658..48e0a3d0b1d8 100644
--- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
+++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
@@ -20,6 +20,7 @@
 
 from ...image_processing_utils_fast import BaseImageProcessorFast
 from ...utils import auto_docstring
+from .image_processing_pp_chart2table import PPChart2TableImageProcessorKwargs
 
 
 @auto_docstring
@@ -33,6 +34,7 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
     do_resize = True
     do_rescale = True
     do_normalize = True
+    valid_kwargs = PPChart2TableImageProcessorKwargs
 
 
 __all__ = ["PPChart2TableImageProcessorFast"]
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index fd8f35ddaa92..79555850699c 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -17,7 +17,7 @@
 from ...feature_extraction_utils import BatchFeature
 from ...image_processing_utils_fast import BaseImageProcessorFast
 from ...image_utils import ImageInput
-from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
 from ...utils import auto_docstring, logging
 from ...utils.import_utils import requires
@@ -49,6 +49,18 @@ class PPChart2TableConfig(GotOcr2Config):
     ```"""
 
 
+class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False):
+    r"""
+    patch_size (`int`, *optional*, defaults to `16`):
+        The expected patch size out of the image processor.
+    num_patches (`int`, *optional*, defaults to `16`):
+        Alias for `patch_size`.
+    """
+
+    patch_size: int
+    num_patches: int
+
+
 @auto_docstring
 class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
     resample = 3
@@ -60,13 +72,12 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
     do_resize = True
     do_rescale = True
     do_normalize = True
+    valid_kwargs = PPChart2TableImageProcessorKwargs
 
 
 @auto_docstring
 @requires(backends=("torch",))
 class PPChart2TableProcessor(ProcessorMixin):
-    image_processor_class = "AutoImageProcessor"
-    tokenizer_class = "AutoTokenizer"
     model_input_names = ["input_ids", "pixel_values"]
 
     def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
index ed0ea50f4de5..9eb2b58c6078 100644
--- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
@@ -30,8 +30,6 @@
 @auto_docstring
 @requires(backends=("torch",))
 class PPChart2TableProcessor(ProcessorMixin):
-    image_processor_class = "AutoImageProcessor"
-    tokenizer_class = "AutoTokenizer"
     model_input_names = ["input_ids", "pixel_values"]
 
     def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):

From 9f74fa1c745d37c96b13c04ece95affd2fd25752 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Thu, 19 Mar 2026 16:42:19 +0800
Subject: [PATCH 47/60] update

---
 docs/source/en/model_doc/pp_chart2table.md    | 125 ++++++++++++------
 .../image_processing_pp_chart2table.py        |   8 +-
 .../pp_chart2table/modular_pp_chart2table.py  |  38 ++----
 .../processing_pp_chart2table.py              |  30 ++---
 .../test_modeling_pp_chart2table.py           |  70 ++++++----
 .../test_processing_pp_chart2table.py         |  19 ++-
 6 files changed, 178 insertions(+), 112 deletions(-)

diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md
index 2f7c43e429dd..dde874495c78 100644
--- a/docs/source/en/model_doc/pp_chart2table.md
+++ b/docs/source/en/model_doc/pp_chart2table.md
@@ -39,24 +39,26 @@ The example below demonstrates how to classify image with PP-Chart2Table using [
 <hfoption id="Pipeline">
 
 ```py
-import requests
-from PIL import Image
 from transformers import pipeline
-model_path = "PaddlePaddle/PP-Chart2Table_safetensors"
-pipe = pipeline(
-    task="image-text-to-text", 
-    model=model_path,
-    device_map="auto",
-)
-image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw)
-# text is empty - processor uses hardcoded "Chart to table" instruction internally via chat template
-result = pipe(
-    images=image, 
-    text="",
-    do_sample=False, 
-    max_new_tokens=256
-)
-print(result)
+
+pipe = pipeline("image-text-to-text", model="PaddlePaddle/PP-Chart2Table_safetensors")
+
+# PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template
+conversation = [
+    {
+        "role": "system",
+        "content": [],
+    },
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", "num_patches": 16},
+            {"type": "text", "text": "Chart to table"}
+        ],
+    }
+]
+result = pipe(text=conversation)
+print(result[0]["generated_text"])
 
 ```
 
@@ -76,8 +78,29 @@ model = AutoModelForImageTextToText.from_pretrained(
 )
 processor = AutoProcessor.from_pretrained(model_path)
 
-image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw)
-inputs = processor(images=image).to(model.device)
+# PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template
+conversation = [
+    {
+        "role": "system",
+        "content": [],
+    },
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", "num_patches": processor.image_processor.num_patches},
+            {"type": "text", "text": "Chart to table"}
+        ],
+    }
+]
+
+inputs = processor.apply_chat_template(
+    conversation,
+    tokenize=True, 
+    add_generation_prompt=True, 
+    truncation=True,
+    return_dict=True,
+    return_tensors="pt",
+).to(model.device)
 
 generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=256)
 generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
@@ -97,24 +120,27 @@ Here is how you can do it with PP-Chart2Table using [`Pipeline`] or the [`AutoMo
 <hfoption id="Pipeline">
 
 ```py
-import requests
 from transformers import pipeline
-from PIL import Image
-model_path = "PaddlePaddle/PP-Chart2Table_safetensors"
-pipe = pipeline(
-    task="image-text-to-text", 
-    model=model_path,
-    device_map="auto",
-)
-image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw)
-# text is empty - processor uses hardcoded "Chart to table" instruction internally via chat template
-result = pipe(
-    images=[image, image],
-    text="",
-    do_sample=False,
-    max_new_tokens=256
-)
-print(result)
+
+pipe = pipeline("image-text-to-text", model="PaddlePaddle/PP-Chart2Table_safetensors")
+
+# PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template
+conversation = [
+    {
+        "role": "system",
+        "content": [],
+    },
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", "num_patches": 16},
+            {"type": "text", "text": "Chart to table"}
+        ],
+    }
+]
+result = pipe(text=[conversation, conversation])
+print(result[0][0]["generated_text"])
+
 ```
 
 </hfoption>
@@ -133,13 +159,36 @@ model = AutoModelForImageTextToText.from_pretrained(
 )
 processor = AutoProcessor.from_pretrained(model_path)
 
-image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw)
-inputs = processor(images=[image, image]).to(model.device)
+# PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template
+conversation = [
+    {
+        "role": "system",
+        "content": [],
+    },
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", "num_patches": processor.image_processor.num_patches},
+            {"type": "text", "text": "Chart to table"}
+        ],
+    }
+]
+
+batch_conversation = [conversation, conversation]
+inputs = processor.apply_chat_template(
+    batch_conversation,
+    tokenize=True, 
+    add_generation_prompt=True, 
+    truncation=True,
+    return_dict=True,
+    return_tensors="pt",
+).to(model.device)
 
 generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=256)
 generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
 result = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
 print(result)
+
 ```
 
 </hfoption>
diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
index c94d4bc5557c..5092b216ee99 100644
--- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
@@ -24,9 +24,13 @@
 class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False):
     r"""
     patch_size (`int`, *optional*, defaults to `16`):
-        The expected patch size out of the image processor.
+        The size (in pixels) of each square patch that the image is divided into before being fed into the
+        vision encoder.
+
     num_patches (`int`, *optional*, defaults to `16`):
-        Alias for `patch_size`.
+        Number of patches used to represent the image in the input sequence. This parameter is included in
+        the chat template's user message to inform the language model about the image structure. The model
+        uses this information to understand how the image tokens correspond to the visual input.
     """
 
     patch_size: int
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index 79555850699c..a540d6848509 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import torch
 from huggingface_hub.dataclasses import strict
 
 from ...feature_extraction_utils import BatchFeature
@@ -52,9 +53,13 @@ class PPChart2TableConfig(GotOcr2Config):
 class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False):
     r"""
     patch_size (`int`, *optional*, defaults to `16`):
-        The expected patch size out of the image processor.
+        The size (in pixels) of each square patch that the image is divided into before being fed into the
+        vision encoder.
+
     num_patches (`int`, *optional*, defaults to `16`):
-        Alias for `patch_size`.
+        Number of patches used to represent the image in the input sequence. This parameter is included in
+        the chat template's user message to inform the language model about the image structure. The model
+        uses this information to understand how the image tokens correspond to the visual input.
     """
 
     patch_size: int
@@ -83,17 +88,6 @@ class PPChart2TableProcessor(ProcessorMixin):
     def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
         super().__init__(image_processor, tokenizer, chat_template=chat_template)
 
-        # PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template
-        self.messages = [
-            {
-                "role": "system",
-            },
-            {
-                "role": "user",
-                "image": {"num_patches": self.image_processor.num_patches},
-            },
-        ]
-
     def __call__(
         self,
         images: ImageInput = None,
@@ -109,19 +103,15 @@ def __call__(
         if images is None:
             raise ValueError("At least one of `images` must be provided")
         image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
-        batch_size = image_inputs["pixel_values"].shape[0]
-
-        # Use tokenizer's apply_chat_template instead of manually loading template
-        inputs = self.tokenizer.apply_chat_template(
-            self.messages,
-            tokenize=True,
-            add_generation_prompt=True,
-            truncation=True,
-            **output_kwargs["text_kwargs"],
-        )
 
         # Prepare input ids for batch
-        input_ids = inputs["input_ids"].repeat(batch_size, 1)
+        if text is None:
+            raise ValueError("At least one of `text` must be provided")
+
+        if not isinstance(text, list):
+            text = [text]
+
+        input_ids = torch.tensor(self.tokenizer(text, **output_kwargs["text_kwargs"]).input_ids)
 
         return BatchFeature(data={"input_ids": input_ids, **image_inputs})
 
diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
index 9eb2b58c6078..cb5cc19f284b 100644
--- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
@@ -18,6 +18,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import torch
 
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput
@@ -35,17 +36,6 @@ class PPChart2TableProcessor(ProcessorMixin):
     def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
         super().__init__(image_processor, tokenizer, chat_template=chat_template)
 
-        # PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template
-        self.messages = [
-            {
-                "role": "system",
-            },
-            {
-                "role": "user",
-                "image": {"num_patches": self.image_processor.num_patches},
-            },
-        ]
-
     def __call__(
         self,
         images: ImageInput = None,
@@ -61,19 +51,15 @@ def __call__(
         if images is None:
             raise ValueError("At least one of `images` must be provided")
         image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
-        batch_size = image_inputs["pixel_values"].shape[0]
-
-        # Use tokenizer's apply_chat_template instead of manually loading template
-        inputs = self.tokenizer.apply_chat_template(
-            self.messages,
-            tokenize=True,
-            add_generation_prompt=True,
-            truncation=True,
-            **output_kwargs["text_kwargs"],
-        )
 
         # Prepare input ids for batch
-        input_ids = inputs["input_ids"].repeat(batch_size, 1)
+        if text is None:
+            raise ValueError("At least one of `text` must be provided")
+
+        if not isinstance(text, list):
+            text = [text]
+
+        input_ids = torch.tensor(self.tokenizer(text, **output_kwargs["text_kwargs"]).input_ids)
 
         return BatchFeature(data={"input_ids": input_ids, **image_inputs})
 
diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
index 04977cb8b7c5..e173112c2172 100644
--- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
@@ -15,18 +15,10 @@
 
 import unittest
 
-from transformers import (
-    AutoModelForImageTextToText,
-    AutoProcessor,
-    is_vision_available,
-)
+from transformers import AutoModelForImageTextToText, AutoProcessor
 from transformers.testing_utils import cleanup, require_torch, require_vision, slow, torch_device
 
 
-if is_vision_available():
-    from transformers.image_utils import load_image
-
-
 @slow
 @require_vision
 @require_torch
@@ -35,31 +27,61 @@ def setUp(self):
         model_path = "PaddlePaddle/PP-Chart2Table_safetensors"
         self.model = AutoModelForImageTextToText.from_pretrained(model_path).to(torch_device)
         self.processor = AutoProcessor.from_pretrained(model_path)
-        url = "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png"
-        self.image = load_image(url)
+        self.conversation = [
+            {
+                "role": "system",
+                "content": [],
+            },
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png",
+                        "num_patches": self.processor.image_processor.num_patches,
+                    },
+                    {"type": "text", "text": "Chart to table"},
+                ],
+            },
+        ]
 
     def tearDown(self):
         cleanup(torch_device, gc_collect=True)
 
     def test_small_model_integration_test_pp_chart2table(self):
-        inputs = self.processor(self.image, return_tensors="pt").to(torch_device)
-        generate_ids = self.model.generate(
-            **inputs,
-            use_cache=True,
-            do_sample=False,
-            max_new_tokens=32,
-        )
-        decoded_output = self.processor.decode(
-            generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
+        inputs = self.processor.apply_chat_template(
+            self.conversation,
+            tokenize=True,
+            add_generation_prompt=True,
+            truncation=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(self.model.device)
+
+        generated_ids = self.model.generate(**inputs, do_sample=False, max_new_tokens=32)
+        generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+        decoded_output = self.processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
         )
-        expected_output = "年份 | 单家五星级旅游饭店年平均营收 (百万元) | 单家五星级旅游饭店年平均利润 (百万元)\n"
+
+        expected_output = ["年份 | 单家五星级旅游饭店年平均营收 (百万元) | 单家五星级旅游饭店年平均利润 (百万元)\n"]
         self.assertEqual(decoded_output, expected_output)
 
     def test_small_model_integration_test_pp_chart2table_batched(self):
-        inputs = self.processor([self.image, self.image], return_tensors="pt").to(torch_device)
-        generate_ids = self.model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=6)
+        inputs = self.processor.apply_chat_template(
+            [self.conversation, self.conversation],
+            tokenize=True,
+            add_generation_prompt=True,
+            truncation=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(self.model.device)
+
+        generated_ids = self.model.generate(**inputs, do_sample=False, max_new_tokens=6)
+        generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
         decoded_output = self.processor.batch_decode(
-            generate_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
         )
+
         expected_output = ["年份 | 单家", "年份 | 单家"]
         self.assertEqual(decoded_output, expected_output)
diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
index cd6b7e64f7bf..89191374ed40 100644
--- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
@@ -37,8 +37,23 @@ def test_image_processor_defaults(self):
     def test_ocr_queries(self):
         processor = self.get_processor()
         image_input = self.prepare_image_inputs()
-        inputs = processor(image_input, return_tensors="pt")
-        self.assertEqual(inputs["input_ids"].shape, (1, 286))
+        conversation = [
+            {
+                "role": "system",
+                "content": [],
+            },
+            {
+                "role": "user",
+                "content": [{"type": "image", "num_patches": 16}, {"type": "text", "text": "Chart to table"}],
+            },
+        ]
+        inputs = processor.apply_chat_template(
+            conversation,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+        inputs = processor(images=image_input, text=inputs, return_tensors="pt")
+        self.assertEqual(inputs["input_ids"].shape, (1, 285))
         self.assertEqual(inputs["pixel_values"].shape, (1, 3, 1024, 1024))
 
     def test_unstructured_kwargs_batched(self):

From f33cfb5914e3e97891e444ec0383b7e332ab6a9e Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Thu, 19 Mar 2026 20:45:27 +0800
Subject: [PATCH 48/60] update

---
 docs/source/en/model_doc/pp_chart2table.md    | 32 ++++++++++++-------
 .../pp_chart2table/modular_pp_chart2table.py  |  3 +-
 .../processing_pp_chart2table.py              |  5 +--
 .../test_modeling_pp_chart2table.py           |  2 --
 .../test_processing_pp_chart2table.py         | 21 +++++-------
 5 files changed, 30 insertions(+), 33 deletions(-)

diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md
index dde874495c78..dbb5d14fe1cd 100644
--- a/docs/source/en/model_doc/pp_chart2table.md
+++ b/docs/source/en/model_doc/pp_chart2table.md
@@ -52,10 +52,12 @@ conversation = [
     {
         "role": "user",
         "content": [
-            {"type": "image", "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", "num_patches": 16},
-            {"type": "text", "text": "Chart to table"}
+            {
+                "type": "image",
+                "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png",
+            },
         ],
-    }
+    },
 ]
 result = pipe(text=conversation)
 print(result[0]["generated_text"])
@@ -87,10 +89,12 @@ conversation = [
     {
         "role": "user",
         "content": [
-            {"type": "image", "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", "num_patches": processor.image_processor.num_patches},
-            {"type": "text", "text": "Chart to table"}
+            {
+                "type": "image",
+                "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png",
+            },
         ],
-    }
+    },
 ]
 
 inputs = processor.apply_chat_template(
@@ -133,10 +137,12 @@ conversation = [
     {
         "role": "user",
         "content": [
-            {"type": "image", "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", "num_patches": 16},
-            {"type": "text", "text": "Chart to table"}
+            {
+                "type": "image",
+                "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png",
+            },
         ],
-    }
+    },
 ]
 result = pipe(text=[conversation, conversation])
 print(result[0][0]["generated_text"])
@@ -168,10 +174,12 @@ conversation = [
     {
         "role": "user",
         "content": [
-            {"type": "image", "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", "num_patches": processor.image_processor.num_patches},
-            {"type": "text", "text": "Chart to table"}
+            {
+                "type": "image",
+                "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png",
+            },
         ],
-    }
+    },
 ]
 
 batch_conversation = [conversation, conversation]
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index a540d6848509..47c72069d0de 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -81,7 +81,6 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
 
 
 @auto_docstring
-@requires(backends=("torch",))
 class PPChart2TableProcessor(ProcessorMixin):
     model_input_names = ["input_ids", "pixel_values"]
 
@@ -111,7 +110,7 @@ def __call__(
         if not isinstance(text, list):
             text = [text]
 
-        input_ids = torch.tensor(self.tokenizer(text, **output_kwargs["text_kwargs"]).input_ids)
+        input_ids = self.tokenizer(text, **output_kwargs["text_kwargs"]).input_ids
 
         return BatchFeature(data={"input_ids": input_ids, **image_inputs})
 
diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
index cb5cc19f284b..9047f6d45b35 100644
--- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
@@ -18,18 +18,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import torch
 
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput
 from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
 from ...utils import auto_docstring
-from ...utils.import_utils import requires
 
 
 @auto_docstring
-@requires(backends=("torch",))
 class PPChart2TableProcessor(ProcessorMixin):
     model_input_names = ["input_ids", "pixel_values"]
 
@@ -59,7 +56,7 @@ def __call__(
         if not isinstance(text, list):
             text = [text]
 
-        input_ids = torch.tensor(self.tokenizer(text, **output_kwargs["text_kwargs"]).input_ids)
+        input_ids = self.tokenizer(text, **output_kwargs["text_kwargs"]).input_ids
 
         return BatchFeature(data={"input_ids": input_ids, **image_inputs})
 
diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
index e173112c2172..ba739912f072 100644
--- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
@@ -38,9 +38,7 @@ def setUp(self):
                     {
                         "type": "image",
                         "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png",
-                        "num_patches": self.processor.image_processor.num_patches,
                     },
-                    {"type": "text", "text": "Chart to table"},
                 ],
             },
         ]
diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
index 89191374ed40..c02dbfe2858c 100644
--- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
@@ -27,7 +27,8 @@ class PPChart2TableProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     @classmethod
     def _setup_tokenizer(cls):
         tokenizer_class = cls._get_component_class_from_processor("tokenizer")
-        tokenizer = tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors")
+        # tokenizer = tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors")
+        tokenizer = tokenizer_class.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table")
         return tokenizer
 
     @unittest.skip("PPChart2TableProcessor pop the image processor output 'num_patches'")
@@ -38,14 +39,8 @@ def test_ocr_queries(self):
         processor = self.get_processor()
         image_input = self.prepare_image_inputs()
         conversation = [
-            {
-                "role": "system",
-                "content": [],
-            },
-            {
-                "role": "user",
-                "content": [{"type": "image", "num_patches": 16}, {"type": "text", "text": "Chart to table"}],
-            },
+            {"role": "system", "content": []},
+            {"role": "user", "content": []}
         ]
         inputs = processor.apply_chat_template(
             conversation,
@@ -53,7 +48,7 @@ def test_ocr_queries(self):
             add_generation_prompt=True,
         )
         inputs = processor(images=image_input, text=inputs, return_tensors="pt")
-        self.assertEqual(inputs["input_ids"].shape, (1, 285))
+        self.assertEqual(inputs["input_ids"].shape, (1, 287))
         self.assertEqual(inputs["pixel_values"].shape, (1, 3, 1024, 1024))
 
     def test_unstructured_kwargs_batched(self):
@@ -79,19 +74,19 @@ def test_unstructured_kwargs_batched(self):
         self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0)
 
     @unittest.skip(
-        reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None"
+        reason="PPChart2Table relies on a predetermined input format; chat template usage is not intended, and image input cannot be None."
     )
     def test_apply_chat_template_assistant_mask(self):
         pass
 
     @unittest.skip(
-        reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None"
+        reason="PPChart2Table relies on a predetermined input format; chat template usage is not intended, and image input cannot be None."
     )
     def test_apply_chat_template_image_0(self):
         pass
 
     @unittest.skip(
-        reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None"
+        reason="PPChart2Table relies on a predetermined input format; chat template usage is not intended, and image input cannot be None."
     )
     def test_apply_chat_template_image_1(self):
         pass

From 8394c0814300a45e9ea06c364b56c3f016059447 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Thu, 19 Mar 2026 20:48:01 +0800
Subject: [PATCH 49/60] update

---
 .../models/pp_chart2table/modular_pp_chart2table.py          | 2 --
 .../models/pp_chart2table/test_processing_pp_chart2table.py  | 5 +----
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index 47c72069d0de..f4d76b17e9ba 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import torch
 from huggingface_hub.dataclasses import strict
 
 from ...feature_extraction_utils import BatchFeature
@@ -21,7 +20,6 @@
 from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
 from ...utils import auto_docstring, logging
-from ...utils.import_utils import requires
 from ..got_ocr2.configuration_got_ocr2 import GotOcr2Config
 
 
diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
index c02dbfe2858c..e89dbd9db900 100644
--- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
@@ -38,10 +38,7 @@ def test_image_processor_defaults(self):
     def test_ocr_queries(self):
         processor = self.get_processor()
         image_input = self.prepare_image_inputs()
-        conversation = [
-            {"role": "system", "content": []},
-            {"role": "user", "content": []}
-        ]
+        conversation = [{"role": "system", "content": []}, {"role": "user", "content": []}]
         inputs = processor.apply_chat_template(
             conversation,
             tokenize=False,

From 3c0b0282a000f91cc46864ede1c6451198f6ffcf Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Thu, 19 Mar 2026 22:33:48 +0800
Subject: [PATCH 50/60] update

---
 tests/models/pp_chart2table/test_processing_pp_chart2table.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
index e89dbd9db900..42f4fc9bb63d 100644
--- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
@@ -27,8 +27,7 @@ class PPChart2TableProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     @classmethod
     def _setup_tokenizer(cls):
         tokenizer_class = cls._get_component_class_from_processor("tokenizer")
-        # tokenizer = tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors")
-        tokenizer = tokenizer_class.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table")
+        tokenizer = tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors")
         return tokenizer
 
     @unittest.skip("PPChart2TableProcessor pop the image processor output 'num_patches'")

From b50607cd1707fe2d54f00af37d39883f4739440a Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Thu, 19 Mar 2026 22:48:53 +0800
Subject: [PATCH 51/60] update

---
 docs/source/en/model_doc/pp_chart2table.md    | 16 ------------
 .../pp_chart2table/modular_pp_chart2table.py  | 23 +++--------------
 .../processing_pp_chart2table.py              | 25 +++----------------
 .../test_modeling_pp_chart2table.py           |  5 +---
 .../test_processing_pp_chart2table.py         |  5 ++--
 5 files changed, 10 insertions(+), 64 deletions(-)

diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md
index dbb5d14fe1cd..6ebbd3090e1f 100644
--- a/docs/source/en/model_doc/pp_chart2table.md
+++ b/docs/source/en/model_doc/pp_chart2table.md
@@ -45,10 +45,6 @@ pipe = pipeline("image-text-to-text", model="PaddlePaddle/PP-Chart2Table_safeten
 
 # PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template
 conversation = [
-    {
-        "role": "system",
-        "content": [],
-    },
     {
         "role": "user",
         "content": [
@@ -82,10 +78,6 @@ processor = AutoProcessor.from_pretrained(model_path)
 
 # PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template
 conversation = [
-    {
-        "role": "system",
-        "content": [],
-    },
     {
         "role": "user",
         "content": [
@@ -130,10 +122,6 @@ pipe = pipeline("image-text-to-text", model="PaddlePaddle/PP-Chart2Table_safeten
 
 # PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template
 conversation = [
-    {
-        "role": "system",
-        "content": [],
-    },
     {
         "role": "user",
         "content": [
@@ -167,10 +155,6 @@ processor = AutoProcessor.from_pretrained(model_path)
 
 # PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template
 conversation = [
-    {
-        "role": "system",
-        "content": [],
-    },
     {
         "role": "user",
         "content": [
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index f4d76b17e9ba..5643bac325c7 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -80,7 +80,6 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
 
 @auto_docstring
 class PPChart2TableProcessor(ProcessorMixin):
-    model_input_names = ["input_ids", "pixel_values"]
 
     def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
         super().__init__(image_processor, tokenizer, chat_template=chat_template)
@@ -91,26 +90,10 @@ def __call__(
         text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
         **kwargs: Unpack[ProcessingKwargs],
     ) -> BatchFeature:
-        output_kwargs = self._merge_kwargs(
-            ProcessingKwargs,
-            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
-            **kwargs,
-        )
 
-        if images is None:
-            raise ValueError("At least one of `images` must be provided")
-        image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
-
-        # Prepare input ids for batch
-        if text is None:
-            raise ValueError("At least one of `text` must be provided")
-
-        if not isinstance(text, list):
-            text = [text]
-
-        input_ids = self.tokenizer(text, **output_kwargs["text_kwargs"]).input_ids
-
-        return BatchFeature(data={"input_ids": input_ids, **image_inputs})
+        if text is None or images is None:
+            raise ValueError("Both `images` and `text` must be provided")
+        return super().__call__(images=images, text=text, **kwargs)
 
 
 __all__ = [
diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
index 9047f6d45b35..6f5e4554b731 100644
--- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
@@ -28,8 +28,6 @@
 
 @auto_docstring
 class PPChart2TableProcessor(ProcessorMixin):
-    model_input_names = ["input_ids", "pixel_values"]
-
     def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
         super().__init__(image_processor, tokenizer, chat_template=chat_template)
 
@@ -39,26 +37,9 @@ def __call__(
         text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
         **kwargs: Unpack[ProcessingKwargs],
     ) -> BatchFeature:
-        output_kwargs = self._merge_kwargs(
-            ProcessingKwargs,
-            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
-            **kwargs,
-        )
-
-        if images is None:
-            raise ValueError("At least one of `images` must be provided")
-        image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
-
-        # Prepare input ids for batch
-        if text is None:
-            raise ValueError("At least one of `text` must be provided")
-
-        if not isinstance(text, list):
-            text = [text]
-
-        input_ids = self.tokenizer(text, **output_kwargs["text_kwargs"]).input_ids
-
-        return BatchFeature(data={"input_ids": input_ids, **image_inputs})
+        if text is None or images is None:
+            raise ValueError("Both `images` and `text` must be provided")
+        return super().__call__(images=images, text=text, **kwargs)
 
 
 __all__ = ["PPChart2TableProcessor"]
diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
index ba739912f072..cf710f248010 100644
--- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
@@ -25,13 +25,10 @@
 class PPChart2TableIntegrationTest(unittest.TestCase):
     def setUp(self):
         model_path = "PaddlePaddle/PP-Chart2Table_safetensors"
+        # model_path = "/workspace/model_weight_torch/PP-Chart2Table"
         self.model = AutoModelForImageTextToText.from_pretrained(model_path).to(torch_device)
         self.processor = AutoProcessor.from_pretrained(model_path)
         self.conversation = [
-            {
-                "role": "system",
-                "content": [],
-            },
             {
                 "role": "user",
                 "content": [
diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
index 42f4fc9bb63d..07f75a62efa7 100644
--- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
@@ -27,7 +27,8 @@ class PPChart2TableProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     @classmethod
     def _setup_tokenizer(cls):
         tokenizer_class = cls._get_component_class_from_processor("tokenizer")
-        tokenizer = tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors")
+        tokenizer = tokenizer_class.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table")
+        # tokenizer = tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors")
         return tokenizer
 
     @unittest.skip("PPChart2TableProcessor pop the image processor output 'num_patches'")
@@ -37,7 +38,7 @@ def test_image_processor_defaults(self):
     def test_ocr_queries(self):
         processor = self.get_processor()
         image_input = self.prepare_image_inputs()
-        conversation = [{"role": "system", "content": []}, {"role": "user", "content": []}]
+        conversation = [{"role": "user", "content": []}]
         inputs = processor.apply_chat_template(
             conversation,
             tokenize=False,

From 44529f73ad9ad3dcd2183f3361cbe1772dd5f774 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Thu, 19 Mar 2026 22:49:55 +0800
Subject: [PATCH 52/60] update

---
 .../models/pp_chart2table/modular_pp_chart2table.py             | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index 5643bac325c7..b59dfd8cd4f8 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -80,7 +80,6 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
 
 @auto_docstring
 class PPChart2TableProcessor(ProcessorMixin):
-
     def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
         super().__init__(image_processor, tokenizer, chat_template=chat_template)
 
@@ -90,7 +89,6 @@ def __call__(
         text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
         **kwargs: Unpack[ProcessingKwargs],
     ) -> BatchFeature:
-
         if text is None or images is None:
             raise ValueError("Both `images` and `text` must be provided")
         return super().__call__(images=images, text=text, **kwargs)

From ba238c889c1a83aaf70104afb5fc360dcccdda80 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Thu, 19 Mar 2026 22:58:32 +0800
Subject: [PATCH 53/60] update

---
 src/transformers/models/auto/image_processing_auto.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index 0060eff0007c..29c145ade42c 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -194,6 +194,7 @@
             ("pixio", {"torchvision": "BitImageProcessor", "pil": "BitImageProcessorPil"}),
             ("pixtral", {"torchvision": "PixtralImageProcessor", "pil": "PixtralImageProcessorPil"}),
             ("poolformer", {"torchvision": "PoolFormerImageProcessor", "pil": "PoolFormerImageProcessorPil"}),
+            ("pp_chart2table", (None, "PPChart2TableImageProcessorFast")),
             ("pp_doclayout_v2", {"torchvision": "PPDocLayoutV2ImageProcessor"}),
             ("pp_doclayout_v3", {"torchvision": "PPDocLayoutV3ImageProcessor"}),
             ("pp_lcnet", {"torchvision": "PPLCNetImageProcessor"}),

From e7401f06b058ac6a69a887a639dfdaef595a1694 Mon Sep 17 00:00:00 2001
From: vasqu <antonprogamer@gmail.com>
Date: Thu, 19 Mar 2026 16:53:21 +0100
Subject: [PATCH 54/60] fixup after new refactor

---
 .../models/auto/image_processing_auto.py      |  2 +-
 ...=> image_processing_pil_pp_chart2table.py} |  6 +++---
 .../image_processing_pp_chart2table.py        | 19 +++++++++++++++++
 .../pp_chart2table/modular_pp_chart2table.py  | 21 ++++++++++++++++---
 .../test_processing_pp_chart2table.py         | 16 +++++++-------
 5 files changed, 48 insertions(+), 16 deletions(-)
 rename src/transformers/models/pp_chart2table/{image_processing_pp_chart2table_fast.py => image_processing_pil_pp_chart2table.py} (91%)

diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index 29c145ade42c..6c78d69f7eb8 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -194,7 +194,7 @@
             ("pixio", {"torchvision": "BitImageProcessor", "pil": "BitImageProcessorPil"}),
             ("pixtral", {"torchvision": "PixtralImageProcessor", "pil": "PixtralImageProcessorPil"}),
             ("poolformer", {"torchvision": "PoolFormerImageProcessor", "pil": "PoolFormerImageProcessorPil"}),
-            ("pp_chart2table", (None, "PPChart2TableImageProcessorFast")),
+            ("pp_chart2table", {"torchvision": "PPChart2TableImageProcessor", "pil": "PPChart2TableImageProcessorPil"}),
             ("pp_doclayout_v2", {"torchvision": "PPDocLayoutV2ImageProcessor"}),
             ("pp_doclayout_v3", {"torchvision": "PPDocLayoutV3ImageProcessor"}),
             ("pp_lcnet", {"torchvision": "PPLCNetImageProcessor"}),
diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py b/src/transformers/models/pp_chart2table/image_processing_pil_pp_chart2table.py
similarity index 91%
rename from src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
rename to src/transformers/models/pp_chart2table/image_processing_pil_pp_chart2table.py
index 48e0a3d0b1d8..58d60d50d40e 100644
--- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py
+++ b/src/transformers/models/pp_chart2table/image_processing_pil_pp_chart2table.py
@@ -18,13 +18,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ...image_processing_utils_fast import BaseImageProcessorFast
+from ...image_processing_backends import PilBackend
 from ...utils import auto_docstring
 from .image_processing_pp_chart2table import PPChart2TableImageProcessorKwargs
 
 
 @auto_docstring
-class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
+class PPChart2TableImageProcessorPil(PilBackend):
     resample = 3
     image_mean = [0.48145466, 0.4578275, 0.40821073]
     image_std = [0.26862954, 0.26130258, 0.27577711]
@@ -37,4 +37,4 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
     valid_kwargs = PPChart2TableImageProcessorKwargs
 
 
-__all__ = ["PPChart2TableImageProcessorFast"]
+__all__ = ["PPChart2TableImageProcessorPil"]
diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
index 5092b216ee99..849322663b74 100644
--- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
@@ -18,7 +18,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from ...image_processing_backends import TorchvisionBackend
 from ...processing_utils import ImagesKwargs
+from ...utils import auto_docstring
 
 
 class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False):
@@ -35,3 +37,20 @@ class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False):
 
     patch_size: int
     num_patches: int
+
+
+@auto_docstring
+class PPChart2TableImageProcessor(TorchvisionBackend):
+    resample = 3
+    image_mean = [0.48145466, 0.4578275, 0.40821073]
+    image_std = [0.26862954, 0.26130258, 0.27577711]
+    size = {"height": 1024, "width": 1024}
+    patch_size = 16
+    num_patches = 16
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    valid_kwargs = PPChart2TableImageProcessorKwargs
+
+
+__all__ = ["PPChart2TableImageProcessor"]
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index b59dfd8cd4f8..9ddfd8dc90d1 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -15,7 +15,7 @@
 from huggingface_hub.dataclasses import strict
 
 from ...feature_extraction_utils import BatchFeature
-from ...image_processing_utils_fast import BaseImageProcessorFast
+from ...image_processing_backends import PilBackend, TorchvisionBackend
 from ...image_utils import ImageInput
 from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
@@ -65,7 +65,21 @@ class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False):
 
 
 @auto_docstring
-class PPChart2TableImageProcessorFast(BaseImageProcessorFast):
+class PPChart2TableImageProcessor(TorchvisionBackend):
+    resample = 3
+    image_mean = [0.48145466, 0.4578275, 0.40821073]
+    image_std = [0.26862954, 0.26130258, 0.27577711]
+    size = {"height": 1024, "width": 1024}
+    patch_size = 16
+    num_patches = 16
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    valid_kwargs = PPChart2TableImageProcessorKwargs
+
+
+@auto_docstring
+class PPChart2TableImageProcessorPil(PilBackend):
     resample = 3
     image_mean = [0.48145466, 0.4578275, 0.40821073]
     image_std = [0.26862954, 0.26130258, 0.27577711]
@@ -96,6 +110,7 @@ def __call__(
 
 __all__ = [
     "PPChart2TableConfig",
-    "PPChart2TableImageProcessorFast",
+    "PPChart2TableImageProcessor",
+    "PPChart2TableImageProcessorPil",
     "PPChart2TableProcessor",
 ]
diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
index 07f75a62efa7..5128c63f49b0 100644
--- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
@@ -27,15 +27,13 @@ class PPChart2TableProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     @classmethod
     def _setup_tokenizer(cls):
         tokenizer_class = cls._get_component_class_from_processor("tokenizer")
-        tokenizer = tokenizer_class.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table")
-        # tokenizer = tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors")
+        # TODO: new processor on hub
+        tokenizer = tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors")
+        # tokenizer = tokenizer_class.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table")
         return tokenizer
 
-    @unittest.skip("PPChart2TableProcessor pop the image processor output 'num_patches'")
-    def test_image_processor_defaults(self):
-        pass
-
     def test_ocr_queries(self):
+        # TODO: fixme
         processor = self.get_processor()
         image_input = self.prepare_image_inputs()
         conversation = [{"role": "user", "content": []}]
@@ -71,19 +69,19 @@ def test_unstructured_kwargs_batched(self):
         self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0)
 
     @unittest.skip(
-        reason="PPChart2Table relies on a predetermined input format; chat template usage is not intended, and image input cannot be None."
+        reason="PPChart2Table relies on a heavily predetermined input format; chat template usage is not intended as expected"
     )
     def test_apply_chat_template_assistant_mask(self):
         pass
 
     @unittest.skip(
-        reason="PPChart2Table relies on a predetermined input format; chat template usage is not intended, and image input cannot be None."
+        reason="PPChart2Table relies on a heavily predetermined input format; chat template usage is not intended as expected"
     )
     def test_apply_chat_template_image_0(self):
         pass
 
     @unittest.skip(
-        reason="PPChart2Table relies on a predetermined input format; chat template usage is not intended, and image input cannot be None."
+        reason="PPChart2Table relies on a heavily predetermined input format; chat template usage is not intended as expected"
     )
     def test_apply_chat_template_image_1(self):
         pass

From 28653c33998157235362a8439d2d810bba3905a4 Mon Sep 17 00:00:00 2001
From: vasqu <antonprogamer@gmail.com>
Date: Thu, 19 Mar 2026 16:58:22 +0100
Subject: [PATCH 55/60] fix

---
 docs/source/en/_toctree.yml                           | 4 ++--
 docs/source/en/model_doc/pp_chart2table.md            | 8 ++++++--
 src/transformers/models/auto/image_processing_auto.py | 5 ++++-
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index f1f63796631c..2aebe0d7e74f 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -1282,10 +1282,10 @@
         title: PP-OCRv5_mobile_rec
       - local: model_doc/pp_ocrv5_server_det
         title: PP-OCRv5_server_det
-      - local: model_doc/pp_chart2table
-        title: PPChart2Table
       - local: model_doc/pp_ocrv5_server_rec
         title: PP-OCRv5_server_rec
+      - local: model_doc/pp_chart2table
+        title: PPChart2Table
       - local: model_doc/pp_lcnet
         title: PPLCNet
       - local: model_doc/pp_lcnet_v3
diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md
index 6ebbd3090e1f..b8b603035c33 100644
--- a/docs/source/en/model_doc/pp_chart2table.md
+++ b/docs/source/en/model_doc/pp_chart2table.md
@@ -191,9 +191,13 @@ print(result)
 
 [[autodoc]] PPChart2TableConfig
 
-## PPChart2TableImageProcessorFast
+## PPChart2TableImageProcessor
 
-[[autodoc]] PPChart2TableImageProcessorFast
+[[autodoc]] PPChart2TableImageProcessor
+
+## PPChart2TableImageProcessorPil
+
+[[autodoc]] PPChart2TableImageProcessorPil
 
 ## PPChart2TableProcessor
 
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index 6c78d69f7eb8..520c1fb423b3 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -194,7 +194,10 @@
             ("pixio", {"torchvision": "BitImageProcessor", "pil": "BitImageProcessorPil"}),
             ("pixtral", {"torchvision": "PixtralImageProcessor", "pil": "PixtralImageProcessorPil"}),
             ("poolformer", {"torchvision": "PoolFormerImageProcessor", "pil": "PoolFormerImageProcessorPil"}),
-            ("pp_chart2table", {"torchvision": "PPChart2TableImageProcessor", "pil": "PPChart2TableImageProcessorPil"}),
+            (
+                "pp_chart2table",
+                {"torchvision": "PPChart2TableImageProcessor", "pil": "PPChart2TableImageProcessorPil"},
+            ),
             ("pp_doclayout_v2", {"torchvision": "PPDocLayoutV2ImageProcessor"}),
             ("pp_doclayout_v3", {"torchvision": "PPDocLayoutV3ImageProcessor"}),
             ("pp_lcnet", {"torchvision": "PPLCNetImageProcessor"}),

From d7d8ee832b72a06516b2a50b9feca3da6944a134 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Fri, 20 Mar 2026 00:05:32 +0800
Subject: [PATCH 56/60] update

---
 tests/models/pp_chart2table/test_processing_pp_chart2table.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
index 5128c63f49b0..8812d2433702 100644
--- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
@@ -29,7 +29,6 @@ def _setup_tokenizer(cls):
         tokenizer_class = cls._get_component_class_from_processor("tokenizer")
         # TODO: new processor on hub
         tokenizer = tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors")
-        # tokenizer = tokenizer_class.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table")
         return tokenizer
 
     def test_ocr_queries(self):

From d71e07b270dc8029cbd5b57fc29e49d968f8629b Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Fri, 20 Mar 2026 00:10:31 +0800
Subject: [PATCH 57/60] update

---
 .../image_processing_pil_pp_chart2table.py    |  4 ----
 .../image_processing_pp_chart2table.py        | 20 ----------------
 .../pp_chart2table/modular_pp_chart2table.py  | 24 +------------------
 3 files changed, 1 insertion(+), 47 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/image_processing_pil_pp_chart2table.py b/src/transformers/models/pp_chart2table/image_processing_pil_pp_chart2table.py
index 58d60d50d40e..40cce468b5dc 100644
--- a/src/transformers/models/pp_chart2table/image_processing_pil_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/image_processing_pil_pp_chart2table.py
@@ -20,7 +20,6 @@
 
 from ...image_processing_backends import PilBackend
 from ...utils import auto_docstring
-from .image_processing_pp_chart2table import PPChart2TableImageProcessorKwargs
 
 
 @auto_docstring
@@ -29,12 +28,9 @@ class PPChart2TableImageProcessorPil(PilBackend):
     image_mean = [0.48145466, 0.4578275, 0.40821073]
     image_std = [0.26862954, 0.26130258, 0.27577711]
     size = {"height": 1024, "width": 1024}
-    patch_size = 16
-    num_patches = 16
     do_resize = True
     do_rescale = True
     do_normalize = True
-    valid_kwargs = PPChart2TableImageProcessorKwargs
 
 
 __all__ = ["PPChart2TableImageProcessorPil"]
diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
index 849322663b74..b38027aecef9 100644
--- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py
@@ -19,38 +19,18 @@
 # limitations under the License.
 
 from ...image_processing_backends import TorchvisionBackend
-from ...processing_utils import ImagesKwargs
 from ...utils import auto_docstring
 
 
-class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False):
-    r"""
-    patch_size (`int`, *optional*, defaults to `16`):
-        The size (in pixels) of each square patch that the image is divided into before being fed into the
-        vision encoder.
-
-    num_patches (`int`, *optional*, defaults to `16`):
-        Number of patches used to represent the image in the input sequence. This parameter is included in
-        the chat template's user message to inform the language model about the image structure. The model
-        uses this information to understand how the image tokens correspond to the visual input.
-    """
-
-    patch_size: int
-    num_patches: int
-
-
 @auto_docstring
 class PPChart2TableImageProcessor(TorchvisionBackend):
     resample = 3
     image_mean = [0.48145466, 0.4578275, 0.40821073]
     image_std = [0.26862954, 0.26130258, 0.27577711]
     size = {"height": 1024, "width": 1024}
-    patch_size = 16
-    num_patches = 16
     do_resize = True
     do_rescale = True
     do_normalize = True
-    valid_kwargs = PPChart2TableImageProcessorKwargs
 
 
 __all__ = ["PPChart2TableImageProcessor"]
diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
index 9ddfd8dc90d1..709c465d5738 100644
--- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
+++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py
@@ -17,7 +17,7 @@
 from ...feature_extraction_utils import BatchFeature
 from ...image_processing_backends import PilBackend, TorchvisionBackend
 from ...image_utils import ImageInput
-from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
 from ...utils import auto_docstring, logging
 from ..got_ocr2.configuration_got_ocr2 import GotOcr2Config
@@ -48,34 +48,15 @@ class PPChart2TableConfig(GotOcr2Config):
     ```"""
 
 
-class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False):
-    r"""
-    patch_size (`int`, *optional*, defaults to `16`):
-        The size (in pixels) of each square patch that the image is divided into before being fed into the
-        vision encoder.
-
-    num_patches (`int`, *optional*, defaults to `16`):
-        Number of patches used to represent the image in the input sequence. This parameter is included in
-        the chat template's user message to inform the language model about the image structure. The model
-        uses this information to understand how the image tokens correspond to the visual input.
-    """
-
-    patch_size: int
-    num_patches: int
-
-
 @auto_docstring
 class PPChart2TableImageProcessor(TorchvisionBackend):
     resample = 3
     image_mean = [0.48145466, 0.4578275, 0.40821073]
     image_std = [0.26862954, 0.26130258, 0.27577711]
     size = {"height": 1024, "width": 1024}
-    patch_size = 16
-    num_patches = 16
     do_resize = True
     do_rescale = True
     do_normalize = True
-    valid_kwargs = PPChart2TableImageProcessorKwargs
 
 
 @auto_docstring
@@ -84,12 +65,9 @@ class PPChart2TableImageProcessorPil(PilBackend):
     image_mean = [0.48145466, 0.4578275, 0.40821073]
     image_std = [0.26862954, 0.26130258, 0.27577711]
     size = {"height": 1024, "width": 1024}
-    patch_size = 16
-    num_patches = 16
     do_resize = True
     do_rescale = True
     do_normalize = True
-    valid_kwargs = PPChart2TableImageProcessorKwargs
 
 
 @auto_docstring

From c095f110ef046e40eef28927bb486060b2615db2 Mon Sep 17 00:00:00 2001
From: vasqu <antonprogamer@gmail.com>
Date: Thu, 19 Mar 2026 18:22:30 +0100
Subject: [PATCH 58/60] last fixups

---
 src/transformers/models/pp_chart2table/__init__.py     |  3 ++-
 .../test_image_processing_pp_chart2table.py            | 10 +---------
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/src/transformers/models/pp_chart2table/__init__.py b/src/transformers/models/pp_chart2table/__init__.py
index 411b2f54ca62..961039282748 100644
--- a/src/transformers/models/pp_chart2table/__init__.py
+++ b/src/transformers/models/pp_chart2table/__init__.py
@@ -20,7 +20,8 @@
 
 if TYPE_CHECKING:
     from .configuration_pp_chart2table import *
-    from .image_processing_pp_chart2table_fast import *
+    from .image_processing_pil_pp_chart2table import *
+    from .image_processing_pp_chart2table import *
     from .processing_pp_chart2table import *
 else:
     import sys
diff --git a/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py
index 46c1d0cc85f9..cea024d942e8 100644
--- a/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py
@@ -16,15 +16,10 @@
 import unittest
 
 from transformers.testing_utils import require_torch, require_vision
-from transformers.utils import is_torchvision_available
 
 from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
 
 
-if is_torchvision_available():
-    from transformers import PPChart2TableImageProcessorFast
-
-
 class PPChart2TableImageProcessingTester(unittest.TestCase):
     def __init__(
         self,
@@ -81,9 +76,6 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F
 @require_torch
 @require_vision
 class PPChart2TableImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
-    test_slow_image_processor = False
-    fast_image_processing_class = PPChart2TableImageProcessorFast if is_torchvision_available() else None
-
     def setUp(self):
         super().setUp()
         self.image_processor_tester = PPChart2TableImageProcessingTester(self)
@@ -93,7 +85,7 @@ def image_processor_dict(self):
         return self.image_processor_tester.prepare_image_processor_dict()
 
     def test_image_processor_properties(self):
-        for image_processing_class in self.image_processor_list:
+        for image_processing_class in self.image_processing_classes.values():
             image_processor = image_processing_class(**self.image_processor_dict)
             self.assertTrue(hasattr(image_processor, "do_resize"))
             self.assertTrue(hasattr(image_processor, "size"))

From eb5c2a51c2f71ab11bc82daa993424c1429812b7 Mon Sep 17 00:00:00 2001
From: XingweiDeng <aaadddiy@163.com>
Date: Fri, 20 Mar 2026 01:39:12 +0800
Subject: [PATCH 59/60] update

---
 tests/models/pp_chart2table/test_modeling_pp_chart2table.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
index cf710f248010..b573723c4d13 100644
--- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
@@ -25,7 +25,6 @@
 class PPChart2TableIntegrationTest(unittest.TestCase):
     def setUp(self):
         model_path = "PaddlePaddle/PP-Chart2Table_safetensors"
-        # model_path = "/workspace/model_weight_torch/PP-Chart2Table"
         self.model = AutoModelForImageTextToText.from_pretrained(model_path).to(torch_device)
         self.processor = AutoProcessor.from_pretrained(model_path)
         self.conversation = [

From bcccd9ddd36977d4018769a565078adf2a3f5b20 Mon Sep 17 00:00:00 2001
From: vasqu <antonprogamer@gmail.com>
Date: Thu, 19 Mar 2026 19:49:23 +0100
Subject: [PATCH 60/60] remove my todos I left there

---
 tests/models/pp_chart2table/test_processing_pp_chart2table.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
index 8812d2433702..2fec6e4313f1 100644
--- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py
+++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py
@@ -27,12 +27,10 @@ class PPChart2TableProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     @classmethod
     def _setup_tokenizer(cls):
         tokenizer_class = cls._get_component_class_from_processor("tokenizer")
-        # TODO: new processor on hub
         tokenizer = tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors")
         return tokenizer
 
     def test_ocr_queries(self):
-        # TODO: fixme
         processor = self.get_processor()
         image_input = self.prepare_image_inputs()
         conversation = [{"role": "user", "content": []}]