From 8aa566b931eb837583d1100e0120cc92fc4e8ef7 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Thu, 5 Feb 2026 21:52:50 +0800 Subject: [PATCH 01/60] init --- docs/source/en/_toctree.yml | 2 + docs/source/en/model_doc/pp_chart2table.md | 160 ++ src/transformers/conversion_mapping.py | 1 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 2 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 1 + .../models/auto/processing_auto.py | 1 + .../models/auto/tokenization_auto.py | 1 + .../models/pp_chart2table/__init__.py | 32 + .../configuration_pp_chart2table.py | 364 ++++ .../image_processing_pp_chart2table.py | 161 ++ .../image_processing_pp_chart2table_fast.py | 95 + .../pp_chart2table/modeling_pp_chart2table.py | 1369 ++++++++++++++ .../pp_chart2table/modular_pp_chart2table.py | 1609 +++++++++++++++++ .../processing_pp_chart2table.py | 65 + tests/models/pp_chart2table/__init__.py | 0 .../test_modeling_pp_chart2table.py | 391 ++++ utils/check_config_attributes.py | 1 + utils/check_repo.py | 6 + 20 files changed, 4263 insertions(+) create mode 100644 docs/source/en/model_doc/pp_chart2table.md create mode 100644 src/transformers/models/pp_chart2table/__init__.py create mode 100644 src/transformers/models/pp_chart2table/configuration_pp_chart2table.py create mode 100644 src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py create mode 100644 src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py create mode 100644 src/transformers/models/pp_chart2table/modeling_pp_chart2table.py create mode 100644 src/transformers/models/pp_chart2table/modular_pp_chart2table.py create mode 100644 src/transformers/models/pp_chart2table/processing_pp_chart2table.py create mode 100644 tests/models/pp_chart2table/__init__.py create mode 100644 tests/models/pp_chart2table/test_modeling_pp_chart2table.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml 
index 670854e4895d..d0bf6fb9961b 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -1151,6 +1151,8 @@ title: Pix2Struct - local: model_doc/pixtral title: Pixtral + - local: model_doc/pp_chart2table + title: PPChart2Table - local: model_doc/qwen2_5_omni title: Qwen2.5-Omni - local: model_doc/qwen2_5_vl diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md new file mode 100644 index 000000000000..5082c6f0adef --- /dev/null +++ b/docs/source/en/model_doc/pp_chart2table.md @@ -0,0 +1,160 @@ + + +# PP-Chart2Table + +
+PyTorch +
+ +## Overview + +**PP-Chart2Table** is a state-of-the-art multimodal model developed by the PaddlePaddle team, specializing in chart parsing for both Chinese and English. Its high performance is driven by a novel "Shuffled Chart Data Retrieval" training task, which, combined with a refined token masking strategy, significantly improves its efficiency in converting charts to data tables. The model is further strengthened by an advanced data synthesis pipeline that uses high-quality seed data, RAG, and LLM persona design to create a richer, more diverse training set. To address the challenge of large-scale unlabeled, out-of-distribution (OOD) data, the team implemented a two-stage distillation process, ensuring robust adaptability and generalization on real-world data. + +## Model Architecture +PP-Chart2Table adopts a multimodal fusion architecture that combines a vision tower for chart feature extraction and a language model for table structure generation, enabling end-to-end chart-to-table conversion. + + +## Usage + +### Single input inference + +The example below demonstrates how to convert a chart image into a data table with PP-Chart2Table using [`Pipeline`] or the [`AutoModel`]. 
+ + + + +```py +from transformers import pipeline +from PIL import Image +pipe = pipeline("image-text-to-text", model="PaddlePaddle/PP-Chart2Table_safetensors") + +result = pipe(images="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", do_sample=False, max_new_tokens=256) +print(result) +``` + + + + + +```py +import requests +from PIL import Image +from transformers import AutoModelForImageTextToText, AutoProcessor + +model_path = "PaddlePaddle/PP-Chart2Table_safetensors" +model = AutoModelForImageTextToText.from_pretrained(model_path, dtype="float32").to("cuda") +processor = AutoProcessor.from_pretrained(model_path) + +image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw) +inputs = processor(images=image).to(model.device) + +outputs = model.generate(**inputs, do_sample=False, max_new_tokens=256) +result = processor.postprocess(outputs) +print(result) + +``` + + + + +### Batched inference + +Here is how you can do it with PP-Chart2Table using [`Pipeline`] or the [`AutoModel`]: + + + + +```py +from transformers import pipeline +from PIL import Image +model_path = "PaddlePaddle/PP-Chart2Table_safetensors" +pipe = pipeline("image-text-to-text", model=model_path) + +image_path = "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png" +result = pipe(images=[image_path, image_path], do_sample=False, max_new_tokens=256) +print(result) +``` + + + + + +```py +import requests +from PIL import Image +from transformers import AutoModelForImageTextToText, AutoProcessor + +model_path = "PaddlePaddle/PP-Chart2Table_safetensors" +model = AutoModelForImageTextToText.from_pretrained(model_path, dtype="float32").to("cuda") +processor = AutoProcessor.from_pretrained(model_path) + +image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw) +inputs 
= processor(images=[image, image]).to(model.device) + +outputs = model.generate(**inputs, do_sample=False, max_new_tokens=256) +result = processor.postprocess(outputs) +print(result) +``` + + + + +## PPChart2TableForConditionalGeneration + +[[autodoc]] PPChart2TableForConditionalGeneration + - forward + +## PPChart2TableConfig + +[[autodoc]] PPChart2TableConfig + +## PPChart2TableVisionConfig + +[[autodoc]] PPChart2TableVisionConfig + +## PPChart2TableTextConfig + +[[autodoc]] PPChart2TableTextConfig + +## PPChart2TableTextModel + +[[autodoc]] PPChart2TableTextModel + - forward + +## PPChart2TableVisionModel + +[[autodoc]] PPChart2TableVisionModel + +## PPChart2TableImageProcessor + +[[autodoc]] PPChart2TableImageProcessor + +## PPChart2TableImageProcessorFast + +[[autodoc]] PPChart2TableImageProcessorFast + +## PPChart2TableModel + +[[autodoc]] PPChart2TableModel + +## PPChart2TableProcessor + +[[autodoc]] PPChart2TableProcessor + +## PPChart2TableVisionTransformer + +[[autodoc]] PPChart2TableVisionTransformer diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py index 7eadf603af5c..de6ca1969f24 100644 --- a/src/transformers/conversion_mapping.py +++ b/src/transformers/conversion_mapping.py @@ -288,6 +288,7 @@ def register_checkpoint_conversion_mapping( "sam3_tracker", "sam3_tracker_video", "paddleocrvl", + "ppchart2table", "ernie4_5_vl_moe", ] diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 342f366f3bc8..f7e743634629 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -292,6 +292,7 @@ from .plbart import * from .poolformer import * from .pop2piano import * + from .pp_chart2table import * from .prompt_depth_anything import * from .prophetnet import * from .pvt import * diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 55dd1b820073..5757ffc6888a 100644 --- 
a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -331,6 +331,7 @@ ("plbart", "PLBartConfig"), ("poolformer", "PoolFormerConfig"), ("pop2piano", "Pop2PianoConfig"), + ("pp_chart2table", "PPChart2TableConfig"), ("prompt_depth_anything", "PromptDepthAnythingConfig"), ("prophetnet", "ProphetNetConfig"), ("pvt", "PvtConfig"), @@ -799,6 +800,7 @@ ("plbart", "PLBart"), ("poolformer", "PoolFormer"), ("pop2piano", "Pop2Piano"), + ("pp_chart2table", "PPChart2Table"), ("prompt_depth_anything", "PromptDepthAnything"), ("prophetnet", "ProphetNet"), ("pvt", "PVT"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 9d5b531def2a..89b73530d12d 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -163,6 +163,7 @@ ("pixio", ("BitImageProcessor", "BitImageProcessorFast")), ("pixtral", ("PixtralImageProcessor", "PixtralImageProcessorFast")), ("poolformer", ("PoolFormerImageProcessor", "PoolFormerImageProcessorFast")), + ("pp_chart2table", ("PPChart2TableImageProcessor", "PPChart2TableImageProcessorFast")), ("prompt_depth_anything", ("PromptDepthAnythingImageProcessor", "PromptDepthAnythingImageProcessorFast")), ("pvt", ("PvtImageProcessor", "PvtImageProcessorFast")), ("pvt_v2", ("PvtImageProcessor", "PvtImageProcessorFast")), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 8b151b68e1df..60f527d4dfa0 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -1046,6 +1046,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("perception_lm", "PerceptionLMForConditionalGeneration"), ("pix2struct", "Pix2StructForConditionalGeneration"), ("pixtral", "LlavaForConditionalGeneration"), + ("pp_chart2table", 
"PPChart2TableForConditionalGeneration"), ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), ("qwen2_vl", "Qwen2VLForConditionalGeneration"), ("qwen3_vl", "Qwen3VLForConditionalGeneration"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 3d2e6ef5cbc7..97fc88443390 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -125,6 +125,7 @@ ("pix2struct", "Pix2StructProcessor"), ("pixtral", "PixtralProcessor"), ("pop2piano", "Pop2PianoProcessor"), + ("pp_chart2table", "PPChart2TableProcessor"), ("qwen2_5_omni", "Qwen2_5OmniProcessor"), ("qwen2_5_vl", "Qwen2_5_VLProcessor"), ("qwen2_audio", "Qwen2AudioProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 171a9fca6868..a9464d9d415c 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -299,6 +299,7 @@ else ("TokenizersBackend" if is_tokenizers_available() else None), ), ("plbart", "PLBartTokenizer" if is_tokenizers_available() else None), + ("pp_chart2table", "TokenizersBackend" if is_tokenizers_available() else None), ("prophetnet", "ProphetNetTokenizer"), ("qdqbert", "BertTokenizer" if is_tokenizers_available() else None), ("qwen2", "Qwen2TokenizerFast" if is_tokenizers_available() else None), diff --git a/src/transformers/models/pp_chart2table/__init__.py b/src/transformers/models/pp_chart2table/__init__.py new file mode 100644 index 000000000000..a471ebfb2830 --- /dev/null +++ b/src/transformers/models/pp_chart2table/__init__.py @@ -0,0 +1,32 @@ +# coding=utf-8 +# Copyright 2025 the HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_pp_chart2table import * + from .image_processing_pp_chart2table import * + from .image_processing_pp_chart2table_fast import * + from .modeling_pp_chart2table import * + from .processing_pp_chart2table import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py new file mode 100644 index 000000000000..5e32dc30ef76 --- /dev/null +++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py @@ -0,0 +1,364 @@ +# ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ +# This file was automatically generated from src/transformers/models/pp_chart2table/modular_pp_chart2table.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_pp_chart2table.py file directly. One of our CI enforces this. 
+# ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ +from typing import Optional + +from transformers.configuration_utils import PreTrainedConfig, layer_type_validation +from transformers.modeling_rope_utils import RopeParameters + + +class PPChart2TableVisionConfig(PreTrainedConfig): + r""" + This is the configuration class to store the configuration of a [`PPChart2TableVisionModel`]. It is used to instantiate a + PP-Chart2Table vision encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the vision encoder of the PP-Chart2Table + architecture developed by the PaddlePaddle team for chart-to-table parsing tasks. + + Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PreTrainedConfig`] for more information. + + Args: + im_patch_token (`int`, *optional*, defaults to 151859): + The token ID used to represent individual image patches in the multimodal input sequence. + im_start_token (`int`, *optional*, defaults to 151857): + The token ID representing the start of an image token sequence in the multimodal input. + depth (`int`, *optional*, defaults to 12): + Number of hidden layers in the vision Transformer encoder. + embed_dim (`int`, *optional*, defaults to 768): + Dimensionality of the patch embedding layer in the vision encoder. + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the hidden layers in the vision Transformer encoder. + img_size (`int`, *optional*, defaults to 1024): + The size (resolution) of input chart images (assumed to be square). 
+ mlp_ratio (`float`, *optional*, defaults to 4.0): + Ratio of the dimensionality of the feed-forward layer to the hidden size in the vision Transformer blocks. + num_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each self-attention layer in the vision Transformer encoder. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each image patch extracted from the input chart image. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add bias terms to the query, key, and value projection layers in the self-attention mechanism. + use_rel_pos (`bool`, *optional*, defaults to `True`): + Whether to use relative positional embeddings in the self-attention layers of the vision encoder. + global_attn_indexes (`list`, *optional*, defaults to `[2, 5, 8, 11]`): + List of layer indexes where global attention (instead of windowed attention) is applied in the vision encoder. + window_size (`int`, *optional*, defaults to 14): + The size of the attention window for windowed self-attention in the vision Transformer layers. + out_chans (`int`, *optional*, defaults to 256): + Number of output channels from the convolutional stem layer before patch embedding. 
+ + Example: + + ```python + >>> from transformers import PPChart2TableVisionConfig, PPChart2TableVisionModel + + >>> # Initializing a PPChart2TableVisionConfig with default PP-Chart2Table style configuration + >>> configuration = PPChart2TableVisionConfig() + + >>> # Initializing a PPChart2TableVisionModel (with random weights) from the PP-Chart2Table style configuration + >>> model = PPChart2TableVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + + model_type = "pp_chart2table_vision" + base_config_key = "vision_config" + + def __init__( + self, + im_patch_token: int = 151859, + im_start_token: int = 151857, + depth: int = 12, + embed_dim: int = 768, + hidden_size: int = 1024, + img_size: int = 1024, + mlp_ratio: float = 4.0, + num_heads: int = 12, + patch_size: int = 16, + qkv_bias: bool = True, + use_rel_pos: bool = True, + global_attn_indexes: Optional[list] = None, + window_size: int = 14, + out_chans: int = 256, + **kwargs, + ): + self.im_patch_token = im_patch_token + self.im_start_token = im_start_token + + self.depth = depth + self.embed_dim = embed_dim + self.hidden_size = hidden_size + self.img_size = img_size + self.mlp_ratio = mlp_ratio + self.num_heads = num_heads + self.patch_size = patch_size + self.qkv_bias = qkv_bias + self.use_rel_pos = use_rel_pos + self.global_attn_indexes = global_attn_indexes if global_attn_indexes is not None else [2, 5, 8, 11] + self.window_size = window_size + self.out_chans = out_chans + + super().__init__(**kwargs) + + +class PPChart2TableTextConfig(PreTrainedConfig): + r""" + This is the configuration class to store the configuration of a [`PPChart2TableTextModel`]. It is used to instantiate a + PP-Chart2Table text decoder according to the specified arguments, defining the model architecture. 
Instantiating a + configuration with the defaults will yield a similar configuration to that of the text encoder/decoder of the + PPChart2TableText-7B-beta [Qwen/PPChart2TableText-7B-beta](https://huggingface.co/Qwen/PPChart2TableText-7B-beta) + architecture, optimized for chart-to-table text generation tasks. + + Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PreTrainedConfig`] for more information. + + Args: + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities in self-attention layers. + bos_token_id (`int`, *optional*, defaults to 151643): + The token ID representing the beginning of a sequence (BOS) for text generation. + eos_token_id (`int`, *optional*, defaults to 151643): + The token ID representing the end of a sequence (EOS) for text generation. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the feed-forward and attention layers of the decoder. + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the hidden representations in the Transformer decoder layers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + intermediate_size (`int`, *optional*, defaults to 2816): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer decoder blocks. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with for text input/output. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each self-attention layer in the Transformer decoder. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer decoder. 
+ num_key_value_heads (`int`, *optional*, defaults to 16): + Number of key/value heads for implementing Grouped Query Attention (GQA). If equal to `num_attention_heads`, + Multi Head Attention (MHA) is used; if 1, Multi Query Attention (MQA) is used. For more details, see + [this paper](https://huggingface.co/papers/2305.13245). + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon value used by the RMS normalization layers to avoid division by zero. + rope_theta (`float`, *optional*, defaults to 1000000.0): + The base period of the RoPE (Rotary Position Embedding) embeddings, controlling the frequency of positional encoding. + rope_parameters (`RopeParameters` or `dict`, *optional*): + Configuration parameters for RoPE embeddings, including scaling parameters for longer sequence lengths beyond + `max_position_embeddings`. + sliding_window (`int`, *optional*, defaults to 32768): + Window size for Sliding Window Attention (SWA) in the decoder layers (only active if `use_sliding_window=True`). + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether the model's input and output word embeddings should be tied (shared weights). + use_cache (`bool`, *optional*, defaults to `True`): + Whether to return the last key/value attention states to speed up sequential decoding (only relevant for autoregressive + generation). + vocab_size (`int`, *optional*, defaults to 151860): + Vocabulary size of the PPChart2TableText model. Defines the number of distinct tokens that can be represented + by `input_ids`. + layer_types (`list[str]`, *optional*): + Attention pattern for each decoder layer (e.g., `"full_attention"` or `"sliding_attention"`). If not specified, + automatically determined by `sliding_window`. 
+ + Example: + + ```python + >>> from transformers import PPChart2TableTextConfig, PPChart2TableTextModel + + >>> # Initializing a PPChart2TableText style configuration + >>> configuration = PPChart2TableTextConfig() + + >>> # Initializing a model from the PPChart2TableText-7B style configuration + >>> model = PPChart2TableTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + + model_type = "pp_chart2table_text" + keys_to_ignore_at_inference = ["past_key_values"] + + # Default tensor parallel plan for base model `PPChart2TableText` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + base_config_key = "text_config" + + def __init__( + self, + attention_dropout: float = 0.0, + bos_token_id: int = 151643, + eos_token_id: int = 151643, + hidden_act: str = "silu", + hidden_size: int = 1024, + initializer_range: float = 0.02, + intermediate_size: int = 2816, + max_position_embeddings: int = 32768, + num_attention_heads: int = 16, + num_hidden_layers: int = 24, + num_key_value_heads: int = 16, + rms_norm_eps: float = 1e-06, + rope_theta: float = 1000000.0, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, + sliding_window: int = 32768, + tie_word_embeddings: bool = True, + use_cache: bool = True, + vocab_size: int = 151860, + layer_types: Optional[list[str]] = None, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = 
intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + + self.attention_dropout = attention_dropout + + self.layer_types = layer_types + if self.layer_types is None: + self.layer_types = [ + "sliding_attention" if self.sliding_window is not None else "full_attention" + for i in range(self.num_hidden_layers) + ] + layer_type_validation(self.layer_types, self.num_hidden_layers) + + self.rope_parameters = rope_parameters + + self.rope_theta = rope_theta + self.tie_word_embeddings = tie_word_embeddings + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +class PPChart2TableConfig(PreTrainedConfig): + r""" + This is the main configuration class to store the configuration of a [`PPChart2TableModel`] or [`PPChart2TableForConditionalGeneration`]. + It is used to instantiate a PP-Chart2Table multimodal model according to the specified arguments, defining the vision and text + sub-model architectures. This configuration class inherits from [`PreTrainedConfig`] and combines the configurations of: + - [`PPChart2TableVisionConfig`] (for the chart vision encoder) + - [`PPChart2TableTextConfig`] (for the table text decoder) + PP-Chart2Table [PaddlePaddle/PP-Chart2Table_safetensors](https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors). + + Instantiating a `PPChart2TableConfig` with the defaults will yield a similar configuration to the base PP-Chart2Table model + developed by the PaddlePaddle team for chart-to-table parsing tasks. 
+ + Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PreTrainedConfig`] for more information. + + Args: + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`PPChart2TableVisionConfig`]. If `None`, the default + `PPChart2TableVisionConfig` configuration will be used. + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`PPChart2TableTextConfig`]. If `None`, the default + `PPChart2TableTextConfig` configuration will be used. + im_start_token (`int`, *optional*, defaults to 151857): + The token ID representing the start of an image token sequence in the multimodal input (shared across vision/text sub-configs). + im_patch_token (`int`, *optional*, defaults to 151859): + The token ID used to represent individual image patches in the multimodal input sequence (shared across vision/text sub-configs). + + Example: + + ```python + >>> from transformers import PPChart2TableConfig, PPChart2TableModel + + >>> # Initializing a PPChart2Table configuration with default vision and text sub-configs + >>> configuration = PPChart2TableConfig() + + >>> # Initializing a PPChart2Table configuration with custom vision and text sub-configs + >>> vision_config = {"img_size": 512, "patch_size": 8} + >>> text_config = {"hidden_size": 2048, "num_hidden_layers": 16} + >>> configuration = PPChart2TableConfig(vision_config=vision_config, text_config=text_config) + + >>> # Initializing a model from the PPChart2Table configuration + >>> model = PPChart2TableModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + >>> # Accessing the vision sub-config + >>> vision_config = configuration.vision_config + >>> # Accessing the text sub-config + >>> text_config = configuration.text_config + """ + + model_type = "pp_chart2table" + sub_configs = {"vision_config": PPChart2TableVisionConfig, 
"text_config": PPChart2TableTextConfig} + + def __init__( + self, + vision_config: dict | None = None, + text_config: dict | None = None, + im_start_token: int = 151857, + im_patch_token: int = 151859, + **kwargs, + ): + if vision_config is None: + vision_config = {} + self.vision_config = PPChart2TableVisionConfig(**vision_config) + + if text_config is None: + text_config = {} + self.text_config = PPChart2TableTextConfig(**text_config) + + self.model_type = "pp_chart2table" + + self.im_start_token = im_start_token + self.im_patch_token = im_patch_token + + text_config_keys = [ + "attention_dropout", + "bos_token_id", + "eos_token_id", + "hidden_act", + "hidden_size", + "initializer_range", + "intermediate_size", + "max_position_embeddings", + "num_attention_heads", + "num_hidden_layers", + "num_key_value_heads", + "rms_norm_eps", + "rope_theta", + "sliding_window", + "tie_word_embeddings", + "dtype", + "use_cache", + "vocab_size", + ] + for key in text_config_keys: + if hasattr(self.text_config, key): + setattr(self, key, getattr(self.text_config, key)) + + super().__init__(**kwargs) + + +__all__ = ["PPChart2TableConfig", "PPChart2TableVisionConfig", "PPChart2TableTextConfig"] diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py new file mode 100644 index 000000000000..e83a49a99f1b --- /dev/null +++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py @@ -0,0 +1,161 @@ +# ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ +# This file was automatically generated from src/transformers/models/pp_chart2table/modular_pp_chart2table.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. 
If any change should be done, please apply the change to the +# modular_pp_chart2table.py file directly. One of our CI enforces this. +# ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ +from typing import Optional, Union + +from transformers.feature_extraction_utils import BatchFeature +from transformers.image_processing_utils import BaseImageProcessor +from transformers.image_transforms import flip_channel_order, resize, to_channel_dimension_format +from transformers.image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from transformers.processing_utils import TensorType +from transformers.utils import filter_out_non_signature_kwargs + + +class PPChart2TableImageProcessor(BaseImageProcessor): + r""" + Image processor for the PP-Chart2Table multimodal model, optimized for chart image preprocessing tasks. + + This processor handles the complete preprocessing pipeline for chart images, including resizing, rescaling, + normalization, and channel dimension reordering, tailored to the input requirements of the PP-Chart2Table vision encoder. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the input images to the specified `size`. + size (`dict[str, int]`, *optional*, defaults to `{"height": 256, "width": 256}`): + Dictionary containing the target height and width for resizing. Format: `{"height": int, "width": int}`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resizing images (e.g., BICUBIC, BILINEAR). + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the pixel values from the range [0, 255] to [0, 1] using `rescale_factor`. 
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Factor to apply for rescaling pixel values (e.g., 1/255 scales 0-255 to 0-1). + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the input images using `image_mean` and `image_std`. + image_mean (`float` or `list[float]`, *optional*, defaults to `[0.406, 0.456, 0.485]`): + Mean values for image normalization (per channel, RGB order). + image_std (`float` or `list[float]`, *optional*, defaults to `[0.225, 0.224, 0.229]`): + Standard deviation values for image normalization (per channel, RGB order). + patch_size (`int`, *optional*, defaults to 16): + Size of image patches used by the PP-Chart2Table vision encoder (for alignment with model input). + merge_size (`int`, *optional*, defaults to 4): + Size factor for merging image patches (specific to PP-Chart2Table's vision processing pipeline). + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = [0.406, 0.456, 0.485], + image_std: Optional[Union[float, list[float]]] = [0.225, 0.224, 0.229], + patch_size: int = 16, + merge_size: int = 4, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 256, "width": 256} + + self.do_resize = do_resize + self.size = size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.resample = resample + self.patch_size = patch_size + self.merge_size = merge_size + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + size: Optional[dict[str, int]] = None, + do_resize: Optional[bool] = 
None, + resample: Optional[PILImageResampling] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[Union[int, float]] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> BatchFeature: + size = self.size if size is None else size + do_resize = self.do_resize if do_resize is None else do_resize + resample = self.resample if resample is None else resample + do_rescale = self.do_rescale if do_rescale is None else do_rescale + rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor + do_normalize = self.do_normalize if do_normalize is None else do_normalize + image_mean = self.image_mean if image_mean is None else image_mean + image_std = self.image_std if image_std is None else image_std + + images = make_flat_list_of_images(images) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + size=size, + do_resize=do_resize, + resample=resample, + ) + + if not valid_images(images): + raise ValueError("Invalid image type. 
Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor") + + # All transformations expect numpy arrays + images = [to_numpy_array(image) for image in images] + if input_data_format is None: + input_data_format = infer_channel_dimension_format(images[0]) + + # transformations + resize_imgs = [] + if do_resize: + for image in images: + img = resize( + image, + size=(size["height"], size["width"]), + resample=resample, + input_data_format=input_data_format, + ) + resize_imgs.append(img) + images = resize_imgs + + if do_rescale: + images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] + + if do_normalize: + images = [ + self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images + ] + images = [flip_channel_order(image, input_data_format=input_data_format) for image in images] + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) + return encoded_inputs + + +__all__ = ["PPChart2TableImageProcessor"] diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py new file mode 100644 index 000000000000..86a6cdb3a672 --- /dev/null +++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py @@ -0,0 +1,95 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/pp_chart2table/modular_pp_chart2table.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. 
If any change should be done, please apply the change to the +# modular_pp_chart2table.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +from typing import Optional, Union + +import torch +from torchvision.transforms.v2.functional import InterpolationMode + +from transformers.feature_extraction_utils import BatchFeature +from transformers.image_processing_utils_fast import BaseImageProcessorFast +from transformers.processing_utils import TensorType + + +class PPChart2TableImageProcessorFast(BaseImageProcessorFast): + r""" + Fast image processor for the PP-Chart2Table multimodal model, optimized for GPU-accelerated chart image preprocessing. + + This high-performance processor implements a streamlined preprocessing pipeline for chart images (resizing, rescaling, + normalization, channel reordering) using PyTorch tensor operations, designed for efficient batch processing on GPUs. + It inherits from [`BaseImageProcessorFast`] and is optimized for inference/training pipelines requiring low-latency + image preprocessing. + + Class Attributes (Default Configuration): + resample (`int`, defaults to 3): + Integer identifier for the resampling filter (3 = BICUBIC, compatible with `InterpolationMode.BICUBIC`). + image_mean (`list[float]`, defaults to `[0.40821073, 0.4578275, 0.48145466]`): + Per-channel mean values for image normalization (RGB order). + image_std (`list[float]`, defaults to `[0.27577711, 0.26130258, 0.26862954]`): + Per-channel standard deviation values for image normalization (RGB order). + size (`dict[str, int]`, defaults to `{"height": 1024, "width": 1024}`): + Default target size for image resizing (1024x1024, optimized for PP-Chart2Table vision encoder). 
+ patch_size (`int`, defaults to 16): + Size of image patches used by the PP-Chart2Table vision encoder (for alignment with model input). + merge_size (`int`, defaults to 4): + Size factor for merging image patches (specific to PP-Chart2Table's vision processing pipeline). + do_resize (`bool`, defaults to `True`): + Default flag to enable image resizing. + do_rescale (`bool`, defaults to `True`): + Default flag to enable pixel value rescaling (from [0,255] to [0,1]). + do_normalize (`bool`, defaults to `True`): + Default flag to enable image normalization. + """ + + resample = 3 + image_mean = [0.40821073, 0.4578275, 0.48145466] + image_std = [0.27577711, 0.26130258, 0.26862954] + size = {"height": 1024, "width": 1024} + patch_size = 16 + merge_size = 4 + do_resize = True + do_rescale = True + do_normalize = True + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + + def _preprocess( + self, + images: list[torch.Tensor], + size: Optional[list[dict[str, int]]], + do_resize: bool, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + return_tensors: Optional[Union[str, TensorType]], + interpolation: Optional[InterpolationMode] = None, + **kwargs, + ) -> BatchFeature: + data = {} + resize_imgs = [] + if do_resize: + for image in images: + img = self.resize(image, size=size, interpolation=interpolation) + resize_imgs.append(img) + images = resize_imgs + + processed_images = [] + for image in images: + image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std) + processed_images.append(image) + images = processed_images + + images = [image[[2, 1, 0], :, :] for image in images] + data.update({"pixel_values": torch.stack(images, dim=0)}) + encoded_inputs = BatchFeature(data, tensor_type=return_tensors) + + return encoded_inputs + + +__all__ = ["PPChart2TableImageProcessorFast"] diff --git 
a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py new file mode 100644 index 000000000000..6d95acc7eea5 --- /dev/null +++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py @@ -0,0 +1,1369 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/pp_chart2table/modular_pp_chart2table.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_pp_chart2table.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +from collections.abc import Callable +from dataclasses import dataclass +from typing import Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from transformers.cache_utils import Cache +from transformers.generation import GenerationMixin +from transformers.modeling_outputs import ModelOutput +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import can_return_tuple + +from ...activations import ACT2FN +from ...cache_utils import DynamicCache +from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func +from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, 
dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring +from ...utils.generic import check_model_inputs, maybe_autocast +from .configuration_pp_chart2table import PPChart2TableConfig, PPChart2TableTextConfig, PPChart2TableVisionConfig + + +class PPChart2TableVisionPatchEmbed(nn.Module): + r""" + Image to Patch Embedding layer for PP-Chart2Table vision encoder. + + This module converts batches of channels-first chart images (`(B, C, H, W)`) into a channels-last grid of patch embeddings via a 2D convolution, + followed by dimension permutation to align with the vision transformer's input format. + + Args: + kernel_size (`tuple[int, int]`, *optional*, defaults to `(16, 16)`): + Size of the convolution kernel (patch size) for splitting images into patches. + stride (`tuple[int, int]`, *optional*, defaults to `(16, 16)`): + Stride of the convolution operation (matches patch size for non-overlapping patches). + padding (`tuple[int, int]`, *optional*, defaults to `(0, 0)`): + Padding applied to the input image before convolution (ensures patch alignment). + in_chans (`int`, *optional*, defaults to 3): + Number of input channels (3 for RGB chart images). + embed_dim (`int`, *optional*, defaults to 768): + Dimensionality of the output patch embeddings (hidden size of the vision transformer). 
+ + Shape: + - Input: `(B, C, H, W)` (batch size, channels, height, width) + - Output: `(B, H_out, W_out, C_out)` (batch size, patch height, patch width, embedding dim) + """ + + def __init__( + self, + kernel_size: tuple[int, int] = (16, 16), + stride: tuple[int, int] = (16, 16), + padding: tuple[int, int] = (0, 0), + in_chans: int = 3, + embed_dim: int = 768, + ) -> None: + super().__init__() + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.proj(hidden_states) + hidden_states = hidden_states.permute(0, 2, 3, 1) + return hidden_states + + +class PPChart2TableVisionMLPBlock(nn.Module): + r""" + Multi-Layer Perceptron (MLP) block for PP-Chart2Table vision transformer layers. + + Implements a two-layer feed-forward network with activation function, used in the vision transformer's + decoder layers to project features to a higher dimension and back. + + Args: + embedding_dim (`int`): + Dimensionality of the input/output embeddings (hidden size of the transformer layer). + mlp_dim (`int`): + Dimensionality of the intermediate (hidden) layer in the MLP (typically 4x embedding_dim). + act (`Type[nn.Module]`, *optional*, defaults to `torch.nn.GELU`): + Non-linear activation function to apply between the two linear layers. 
+ + Shape: + - Input: `(B, H, W, embedding_dim)` or `(B, N, embedding_dim)` (N = H*W) + - Output: Same shape as input + """ + + def __init__( + self, + embedding_dim: int, + mlp_dim: int, + act: type[nn.Module] = torch.nn.GELU, + ) -> None: + super().__init__() + self.lin1 = nn.Linear(embedding_dim, mlp_dim) + self.lin2 = nn.Linear(mlp_dim, embedding_dim) + self.act = act() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return self.lin2(self.act(self.lin1(hidden_states))) + + +class PPChart2TableVisionLayerNorm2d(nn.Module): + r""" + 2D Layer Normalization for spatial feature maps (adapted for PP-Chart2Table vision encoder). + + Applies layer normalization over the channel dimension of 2D feature maps, with learnable scale/bias parameters + broadcasted across spatial dimensions (height/width). + + Args: + num_channels (`int`): + Number of channels in the input feature map (embedding dimension). + epsilon (`float`, *optional*, defaults to `1e-06`): + Small value added to variance to avoid division by zero. + + Shape: + - Input: `(B, C, H, W)` (batch size, channels, height, width) + - Output: Same shape as input + """ + + def __init__(self, num_channels: int, epsilon: float = 1e-06) -> None: + super().__init__() + self.weight = nn.Parameter(torch.ones(num_channels)) + self.bias = nn.Parameter(torch.zeros(num_channels)) + self.epsilon = epsilon + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + u = hidden_states.mean(dim=1, keepdim=True) + s = (hidden_states - u).pow(2).mean(dim=1, keepdim=True) + hidden_states = (hidden_states - u) / torch.sqrt(s + self.epsilon) + hidden_states = self.weight[:, None, None] * hidden_states + self.bias[:, None, None] + return hidden_states + + +def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor: + r""" + Get relative positional embeddings for query and key sequences, with interpolation for mismatched sizes. 
+ + Args: + q_size (`int`): + Spatial size (height/width) of query feature map + k_size (`int`): + Spatial size (height/width) of key feature map + rel_pos (`torch.Tensor`): + Precomputed relative positional embeddings with shape [max_rel_dist_original, dim] + + Returns: + `torch.Tensor`: + Interpolated relative positional embeddings for the query-key pair, shape [q_size, k_size, dim] + """ + max_rel_dist = int(2 * max(q_size, k_size) - 1) + if rel_pos.shape[0] != max_rel_dist: + rel_pos_resized = F.interpolate( + rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), + size=max_rel_dist, + mode="linear", + ) + rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) + else: + rel_pos_resized = rel_pos + + q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) + k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0) + relative_coords = q_coords - k_coords + (k_size - 1) * max(q_size / k_size, 1.0) + return rel_pos_resized[relative_coords.long()] + + +def add_decomposed_rel_pos( + attn: torch.Tensor, + q: torch.Tensor, + rel_pos_h: torch.Tensor, + rel_pos_w: torch.Tensor, + q_size: tuple[int, int], + k_size: tuple[int, int], +) -> torch.Tensor: + r""" + Add decomposed relative positional embeddings (height and width separately) to attention scores. 
+ + Args: + attn (`torch.Tensor`): + Attention scores with shape [B, q_h*q_w, k_h*k_w] + q (`torch.Tensor`): + Query tensor with shape [B, q_h*q_w, dim] + rel_pos_h (`torch.Tensor`): + Precomputed relative positional embeddings for height dimension + rel_pos_w (`torch.Tensor`): + Precomputed relative positional embeddings for width dimension + q_size (`tuple[int, int]`): + Spatial size (q_h, q_w) of query feature map + k_size (`tuple[int, int]`): + Spatial size (k_h, k_w) of key feature map + + Returns: + `torch.Tensor`: + Attention scores with added relative positional embeddings, shape [B, q_h*q_w, k_h*k_w] + """ + q_h, q_w = q_size + k_h, k_w = k_size + Rh = get_rel_pos(q_h, k_h, rel_pos_h) + Rw = get_rel_pos(q_w, k_w, rel_pos_w) + + B, _, dim = q.shape + r_q = q.reshape(B, q_h, q_w, dim) + rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh) + rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw) + + attn = (attn.reshape(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]).reshape( + B, q_h * q_w, k_h * k_w + ) + + return attn + + +class PPChart2TableVisionAttention(nn.Module): + r""" + Multi-Head Self-Attention (MHSA) layer for PP-Chart2Table vision encoder, with optional relative positional encoding. + + Implements standard multi-head attention with query/key/value projection, scaled dot-product attention, + and optional decomposed relative positional embeddings (height/width separate) for spatial awareness. + + Args: + dim (`int`): + Dimensionality of the input embeddings (hidden size of the transformer layer). + num_heads (`int`, *optional*, defaults to 8): + Number of attention heads (must divide `dim` evenly). + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add bias terms to the query/key/value projection layers. + use_rel_pos (`bool`, *optional*, defaults to `False`): + Whether to use relative positional encoding for spatial attention. 
+ rel_pos_zero_init (`bool`, *optional*, defaults to `True`): + Whether to initialize relative positional embeddings to zero (stable training). + input_size (`Tuple[int, int]`, *optional*): + Spatial size (H, W) of the input feature map (required if `use_rel_pos=True`). + + Shape: + - Input: `(B, H, W, dim)` (batch size, height, width, embedding dim) + - Output: Same shape as input + """ + + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = True, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + input_size: Optional[tuple[int, int]] = None, + ) -> None: + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.proj = nn.Linear(dim, dim) + + self.use_rel_pos = use_rel_pos + if self.use_rel_pos: + assert input_size is not None, "Input size must be provided if using relative positional encoding." + self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim)) + self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim)) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + B, H, W, _ = hidden_states.shape + qkv = self.qkv(hidden_states).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(dim=0) + attn = (q * self.scale) @ k.transpose(1, 2) + + if self.use_rel_pos: + attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W)) + + attn = F.softmax(attn, dim=-1) + hidden_states = (attn @ v).reshape(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1) + hidden_states = self.proj(hidden_states) + return hidden_states + + +def window_partition(hidden_states: torch.Tensor, window_size: int) -> tuple[torch.Tensor, tuple[int, int]]: + r""" + Partition 2D feature maps into non-overlapping windows, with padding to ensure dimensions are divisible by window size. 
+ + Args: + hidden_states (`torch.Tensor`): + Input feature map with shape [B, H, W, C], where: + - B: batch size + - H: height of feature map + - W: width of feature map + - C: channel dimension + window_size (`int`): + Size of each non-overlapping window (square window). + + Returns: + tuple[torch.Tensor, tuple[int, int]]: + - windows: Partitioned windows with shape [num_windows * B, window_size, window_size, C], + where num_windows = (Hp // window_size) * (Wp // window_size) + - (Hp, Wp): Padded height and width of the feature map (after padding) + """ + B, H, W, C = hidden_states.shape + + pad_h = (window_size - H % window_size) % window_size + pad_w = (window_size - W % window_size) % window_size + if pad_h > 0 or pad_w > 0: + hidden_states = F.pad(hidden_states, (0, 0, 0, pad_w, 0, pad_h)) + Hp, Wp = H + pad_h, W + pad_w + + hidden_states = hidden_states.reshape(B, Hp // window_size, window_size, Wp // window_size, window_size, C) + windows = hidden_states.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size, window_size, C) + return windows, (Hp, Wp) + + +def window_unpartition( + windows: torch.Tensor, + window_size: int, + pad_hw: tuple[int, int], + hw: tuple[int, int], +) -> torch.Tensor: + r""" + Reverse operation of window_partition: merge windows back to original 2D feature map shape, removing padding. 
+ + Args: + windows (`torch.Tensor`): + Partitioned windows with shape [num_windows * B, window_size, window_size, C] + window_size (`int`): + Size of each non-overlapping window (must match window_partition's window_size) + pad_hw (`tuple[int, int]`): + Padded height and width (Hp, Wp) returned by window_partition + hw (`tuple[int, int]`): + Original height and width (H, W) of feature map before padding + + Returns: + `torch.Tensor`: + Reconstructed feature map with shape [B, H, W, C] (original dimensions before padding) + """ + Hp, Wp = pad_hw + H, W = hw + B = windows.shape[0] // (Hp * Wp // window_size // window_size) + hidden_states = windows.reshape(B, Hp // window_size, Wp // window_size, window_size, window_size, -1) + hidden_states = hidden_states.permute(0, 1, 3, 2, 4, 5).reshape(B, Hp, Wp, -1) + if Hp > H or Wp > W: + hidden_states = hidden_states[:, :H, :W, :] + return hidden_states + + +class PPChart2TableVisionDecoderLayer(nn.Module): + r""" + Single decoder layer of the PP-Chart2Table vision transformer, with optional windowed attention. + + Implements the standard transformer decoder layer structure: + Layer Norm → Multi-Head Attention (with residual) → Layer Norm → MLP (with residual) + Supports windowed attention (SW-MHA) for large feature maps to reduce computation. + + Args: + dim (`int`): + Dimensionality of the input embeddings (hidden size of the transformer layer). + num_heads (`int`): + Number of attention heads (passed to MHSA layer). + mlp_ratio (`float`, *optional*, defaults to 4.0): + Ratio of MLP hidden dimension to embedding dimension (mlp_dim = dim * mlp_ratio). + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to use bias in Q/K/V projection (passed to MHSA layer). + norm_layer (`Type[nn.Module]`, *optional*, defaults to `nn.LayerNorm`): + Normalization layer to use (LayerNorm for flattened patches, LayerNorm2d for 2D feature maps). 
+ act_layer (`Type[nn.Module]`, *optional*, defaults to `nn.GELU`): + Activation function for MLP block. + use_rel_pos (`bool`, *optional*, defaults to `False`): + Whether to use relative positional encoding (passed to MHSA layer). + rel_pos_zero_init (`bool`, *optional*, defaults to `True`): + Whether to zero-initialize relative positional embeddings (passed to MHSA layer). + window_size (`int`, *optional*, defaults to 0): + Size of attention windows (0 = full attention, >0 = windowed attention). + input_size (`Tuple[int, int]`, *optional*): + Spatial size of input feature map (passed to MHSA layer for relative positional encoding). + + Shape: + - Input: `(B, H, W, dim)` (batch size, height, width, embedding dim) + - Output: Same shape as input + """ + + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = True, + norm_layer: type[nn.Module] = nn.LayerNorm, + act_layer: type[nn.Module] = nn.GELU, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + window_size: int = 0, + input_size: Optional[tuple[int, int]] = None, + ) -> None: + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = PPChart2TableVisionAttention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + use_rel_pos=use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + input_size=input_size if window_size == 0 else (window_size, window_size), + ) + + self.norm2 = norm_layer(dim) + self.mlp = PPChart2TableVisionMLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer) + + self.window_size = window_size + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + shortcut = hidden_states + hidden_states = self.norm1(hidden_states) + if self.window_size > 0: + H, W = hidden_states.shape[1], hidden_states.shape[2] + hidden_states, pad_hw = window_partition(hidden_states, self.window_size) + hidden_states = self.attn(hidden_states) + + if self.window_size > 0: + hidden_states = window_unpartition(hidden_states, 
self.window_size, pad_hw, (H, W)) + hidden_states = shortcut + hidden_states + hidden_states = hidden_states + self.mlp(self.norm2(hidden_states)) + return hidden_states + + +class PPChart2TableVisionPreTrainedModel(PreTrainedModel): + r""" + Base class for all PP-Chart2Table vision models, inheriting from Hugging Face `PreTrainedModel`. + + This class sets up core configurations and compatibility flags for the vision encoder, including: + - Support for gradient checkpointing, attention backends (FlashAttention/SDPA), and model compilation + - Definition of non-splittable modules (for tensor parallelism) + - Output recording for hidden states/attentions (for debugging/analysis) + + Class Attributes: + config (`PPChart2TableVisionConfig`): + Typed config class for PP-Chart2Table vision encoder (enforces type checking). + base_model_prefix (`str`, defaults to `"model"`): + Prefix for base model parameters (used in weight loading/saving). + supports_gradient_checkpointing (`bool`, defaults to `True`): + Whether the model supports gradient checkpointing to save memory. + _no_split_modules (`list[str]`): + Modules that should not be split across devices (tensor parallelism compatibility). + _skip_keys_device_placement (`list[str]`): + Keys to skip when placing tensors on devices (e.g., past key values for generation). + _supports_flash_attn / _supports_sdpa / _supports_flex_attn (`bool`): + Compatibility with optimized attention implementations (FlashAttention, SDPA, FlexAttention). + _can_compile_fullgraph (`bool`, defaults to `True`): + Whether the model supports TorchScript/TorchCompile full graph compilation. + _supports_attention_backend (`bool`, defaults to `True`): + Whether the model supports switching attention backends (e.g., PyTorch vs FlashAttention). + _can_record_outputs (`dict`): + Mapping of output types to modules for recording intermediate outputs (hidden_states/attentions). 
+ """ + + config: PPChart2TableVisionConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["PPChart2TableVisionDecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + + _can_compile_fullgraph = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": PPChart2TableVisionDecoderLayer, + "attentions": PPChart2TableVisionAttention, + } + + +class PPChart2TableVisionModel(PPChart2TableVisionPreTrainedModel): + main_input_name = "pixel_values" + input_modalities = "image" + + def __init__( + self, + config: PPChart2TableVisionConfig, + in_chans: int = 3, + norm_layer: type[nn.Module] = nn.LayerNorm, + act_layer: type[nn.Module] = nn.GELU, + rel_pos_zero_init: bool = True, + ) -> None: + super().__init__(config) + self.img_size = config.img_size + + self.patch_embed = PPChart2TableVisionPatchEmbed( + kernel_size=(config.patch_size, config.patch_size), + stride=(config.patch_size, config.patch_size), + in_chans=in_chans, + embed_dim=config.embed_dim, + ) + + self.pos_embed = nn.Parameter( + torch.zeros( + 1, config.img_size // config.patch_size, config.img_size // config.patch_size, config.embed_dim + ) + ) + + self.blocks = nn.ModuleList() + for i in range(config.depth): + block = PPChart2TableVisionDecoderLayer( + dim=config.embed_dim, + num_heads=config.num_heads, + mlp_ratio=config.mlp_ratio, + qkv_bias=config.qkv_bias, + norm_layer=norm_layer, + act_layer=act_layer, + use_rel_pos=config.use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + window_size=config.window_size if i not in config.global_attn_indexes else 0, + input_size=(config.img_size // config.patch_size, config.img_size // config.patch_size), + ) + self.blocks.append(block) + + self.neck = nn.Sequential( + nn.Conv2d( + config.embed_dim, + config.out_chans, + kernel_size=1, + bias=False, + ), + 
PPChart2TableVisionLayerNorm2d(config.out_chans), + nn.Conv2d( + config.out_chans, + config.out_chans, + kernel_size=3, + padding=1, + bias=False, + ), + PPChart2TableVisionLayerNorm2d(config.out_chans), + ) + + self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False) + self.net_3 = nn.Conv2d(512, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False) + + self.post_init() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.patch_embed(hidden_states) + hidden_states = hidden_states + self.pos_embed + for blk in self.blocks: + hidden_states = blk(hidden_states) + hidden_states = self.neck(hidden_states.permute(0, 3, 1, 2)) + hidden_states = self.net_2(hidden_states) + hidden_states = self.net_3(hidden_states) + return hidden_states + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +@use_kernel_func_from_hub("rotary_pos_emb") +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. 
Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output 
@use_kernelized_func(apply_rotary_pos_emb)
class PPChart2TableTextAttention(nn.Module):
    """
    Self-attention with rotary position embeddings for the text decoder.

    Queries are projected to `config.num_attention_heads` heads and keys/values to
    `config.num_key_value_heads` heads. A layer whose configured type is `"sliding_attention"` forwards
    `config.sliding_window` to the attention backend; every other layer forwards `None`.
    """

    def __init__(self, config: PPChart2TableTextConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None

        num_q_heads = config.num_attention_heads
        num_kv_heads = config.num_key_value_heads
        self.head_dim = getattr(config, "head_dim", config.hidden_size // num_q_heads)
        self.num_key_value_groups = num_q_heads // num_kv_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        # Projections are created in q, k, v, o order to keep parameter registration order stable.
        q_width = num_q_heads * self.head_dim
        kv_width = num_kv_heads * self.head_dim
        self.q_proj = nn.Linear(config.hidden_size, q_width, bias=True)
        self.k_proj = nn.Linear(config.hidden_size, kv_width, bias=True)
        self.v_proj = nn.Linear(config.hidden_size, kv_width, bias=True)
        self.o_proj = nn.Linear(q_width, config.hidden_size, bias=False)

        if self.layer_type == "sliding_attention":
            self.sliding_window = config.sliding_window
        else:
            self.sliding_window = None

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Run attention over `hidden_states`.

        Returns the output-projected attention result together with whatever the selected attention
        backend returns as attention weights.
        """
        leading_shape = hidden_states.shape[:-1]
        per_head_shape = (*leading_shape, -1, self.head_dim)

        def _split_heads(projection: nn.Module) -> torch.Tensor:
            # (..., seq, heads * head_dim) -> (..., heads, seq, head_dim)
            return projection(hidden_states).view(per_head_shape).transpose(1, 2)

        query_states = _split_heads(self.q_proj)
        key_states = _split_heads(self.k_proj)
        value_states = _split_heads(self.v_proj)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            key_states, value_states = past_key_values.update(
                key_states,
                value_states,
                self.layer_idx,
                {"sin": sin, "cos": cos, "cache_position": cache_position},
            )

        attention_interface: Callable
        if self.config._attn_implementation == "eager":
            attention_interface = eager_attention_forward
        else:
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=self.attention_dropout if self.training else 0.0,
            scaling=self.scaling,
            sliding_window=self.sliding_window,  # main diff with Llama
            **kwargs,
        )

        merged_heads = attn_output.reshape(*leading_shape, -1).contiguous()
        return self.o_proj(merged_heads), attn_weights
class PPChart2TableTextDecoderLayer(GradientCheckpointingLayer):
    """
    One pre-norm decoder block: RMSNorm -> self-attention -> residual add, then
    RMSNorm -> MLP -> residual add.
    """

    def __init__(self, config: PPChart2TableTextConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        # Read by the parent model to pick the attention mask for this layer.
        self.attention_type = config.layer_types[layer_idx]

        # Sub-modules are created in the same order as before so parameter registration is unchanged.
        self.self_attn = PPChart2TableTextAttention(config=config, layer_idx=layer_idx)
        self.mlp = PPChart2TableTextMLP(config)
        self.input_layernorm = PPChart2TableTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = PPChart2TableTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        """Apply the attention and feed-forward sub-blocks and return the updated hidden states."""
        # Self Attention (attention weights are discarded here)
        normed_for_attn = self.input_layernorm(hidden_states)
        attn_out, _ = self.self_attn(
            hidden_states=normed_for_attn,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        after_attn = hidden_states + attn_out

        # Fully Connected
        mlp_out = self.mlp(self.post_attention_layernorm(after_attn))
        return after_attn + mlp_out
@auto_docstring
class PPChart2TableTextModel(PPChart2TableTextPreTrainedModel):
    # Text decoder stack: token embedding -> `config.num_hidden_layers` decoder layers -> final RMSNorm.
    # Rotary cos/sin tables are computed once per forward call and shared by all layers.
    def __init__(self, config: PPChart2TableTextConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [PPChart2TableTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = PPChart2TableTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = PPChart2TableTextRotaryEmbedding(config=config)
        self.gradient_checkpointing = False
        # Decides whether a sliding-window mask has to be built in `forward`.
        self.has_sliding_layers = "sliding_attention" in self.config.layer_types

        # Initialize weights and apply final processing
        self.post_init()

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        # The XOR is true when both inputs are given or both are missing; exactly one must be provided.
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        # Lazily create a cache when caching is requested but the caller did not supply one.
        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        # Default cache positions continue right after the tokens already stored in the cache.
        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        # Default position ids are the cache positions with a leading axis of size 1.
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # It may already have been prepared by e.g. `generate`
        # (a dict passed as `attention_mask` is used directly as the per-layer-type mask mapping).
        if not isinstance(causal_mask_mapping := attention_mask, dict):
            # Prepare mask arguments
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "position_ids": position_ids,
            }
            # Create the masks
            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
            }
            # The sliding window alternating layers are not always activated depending on the config
            if self.has_sliding_layers:
                causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)

        hidden_states = inputs_embeds
        # (cos, sin) pair computed once and handed to every decoder layer.
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            hidden_states = decoder_layer(
                hidden_states,
                # Each layer picks the mask that matches its own attention type.
                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
                position_embeddings=position_embeddings,
                position_ids=position_ids,
                past_key_values=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            # The cache is only handed back when caching was requested.
            past_key_values=past_key_values if use_cache else None,
        )
@dataclass
class PPChart2TableCausalLMOutputWithPast(ModelOutput):
    r"""
    Output class for PP-Chart2Table conditional generation model's forward pass.

    Carries the same fields as `PPChart2TableModelOutputWithPast` plus the language modeling logits.
    Note that this class derives directly from `ModelOutput`: the shared fields are redeclared here,
    they are not inherited from `PPChart2TableModelOutputWithPast`.

    Attributes:
        logits (`Optional[torch.FloatTensor]`, defaults to `None`):
            Language modeling logits (shape: `[B, seq_len, vocab_size]`), output from the LM head.
        past_key_values (`Optional[Cache]`, defaults to `None`):
            Cached attention key/value pairs from the text decoder.
        last_hidden_state (`Optional[torch.FloatTensor]`, defaults to `None`):
            Final hidden states from the text decoder.
        hidden_states (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`):
            Tuple of decoder layer hidden states.
        attentions (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`):
            Tuple of decoder layer attention weights.
    """

    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Cache] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
class PPChart2TableModel(PPChart2TablePreTrainedModel):
    r"""
    Core PP-Chart2Table multimodal model (vision encoder + text decoder) for chart-to-table parsing.

    This model integrates a vision encoder (for chart image feature extraction) and a text decoder (for table
    generation), with a linear projection layer to align vision features with the text embedding space:
        1. Extract chart features via the vision encoder
        2. Project vision features to the text embedding dimension
        3. Inject vision features into the text decoder inputs (replace image patch placeholder tokens)
        4. Forward pass through the text decoder

    Args:
        config (`PPChart2TableConfig`):
            Combined configuration class (includes vision_config and text_config sub-configs).

    Inputs (forward method):
        input_ids (`torch.LongTensor`, optional):
            Tokenized input text (including image placeholder tokens) with shape `[B, seq_len]`.
        attention_mask (`torch.Tensor`, optional):
            Attention mask to avoid padding tokens (shape: `[B, seq_len]`).
        position_ids (`torch.Tensor`, optional):
            Positional indices for input tokens (shape: `[B, seq_len]`).
        past_key_values (`Cache`, optional):
            Cached key/value pairs for fast autoregressive generation.
        inputs_embeds (`torch.Tensor`, optional):
            Precomputed input embeddings (shape: `[B, seq_len, hidden_size]`; used instead of embedding `input_ids`).
        use_cache (`bool`, optional):
            Whether to cache key/value pairs for generation.
        pixel_values (`torch.Tensor`, optional):
            Preprocessed chart images (shape: `[B, 3, H, W]`; required for multimodal input).
        cache_position (`torch.LongTensor`, optional):
            Position indices for cached key/value pairs (for generation).
        **kwargs:
            Additional arguments passed to the text decoder.

    Outputs:
        `PPChart2TableModelOutputWithPast`:
            Contains the text decoder's final hidden states, cached key/values, and optional intermediate outputs.
    """

    config_class = PPChart2TableConfig

    def __init__(self, config: PPChart2TableConfig):
        super().__init__(config)
        self.vision_tower_high = PPChart2TableVisionModel._from_config(config.vision_config)
        self.language_model = PPChart2TableTextModel._from_config(config.text_config)
        self.mm_projector_vary = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Get input embeddings from the text decoder (for weight tying/loading)."""
        return self.language_model.embed_tokens

    def set_input_embeddings(self, value):
        """Set input embeddings for the text decoder (for weight tying/loading)."""
        self.language_model.embed_tokens = value

    def get_image_features(
        self,
        images: torch.Tensor,
    ) -> torch.Tensor:
        r"""
        Extract and project chart image features to text embedding space.

        Args:
            images (`torch.Tensor`):
                Preprocessed chart images (shape: `[B, 3, H, W]`).

        Returns:
            `torch.Tensor`:
                The per-image projected features stacked along a new leading axis. Assuming the vision tower
                returns a `[1, C, h, w]` feature map per image (TODO confirm), the result has shape
                `[B, 1, h * w, text_hidden_size]`.
        """
        image_features = []
        for image in images:
            image = image.unsqueeze(0)
            # The vision tower is evaluated without gradient tracking; only the projector receives gradients.
            with torch.no_grad():
                cnn_feature = self.vision_tower_high(image)
                # Collapse the spatial axes and move channels last so the linear projector acts per position.
                cnn_feature = cnn_feature.flatten(2).transpose(2, 1)
            image_features.append(self.mm_projector_vary(cnn_feature))

        return torch.stack(image_features, dim=0)

    def get_placeholder_mask(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_features: Optional[torch.FloatTensor] = None,
    ) -> torch.BoolTensor:
        r"""
        Generate mask to locate image placeholder tokens in input embeddings.

        The mask marks the positions of the image patch placeholder token (`config.im_patch_token`) in the input
        sequence; those positions are later overwritten with projected image features.

        Args:
            input_ids (`torch.LongTensor`, optional):
                Tokenized input text. When `None`, placeholders are located by comparing `inputs_embeds` with the
                embedding of the patch placeholder token.
            inputs_embeds (`torch.FloatTensor`):
                Input embeddings; the returned mask is expanded to their shape.
            image_features (`torch.FloatTensor`):
                Projected image features (used to validate token-feature count match).

        Returns:
            `torch.BoolTensor`:
                Boolean mask with the same shape as `inputs_embeds` where `True` indicates image placeholder
                positions.

        Raises:
            ValueError: If the number of image tokens does not match the number of image features.
        """
        if input_ids is None:
            # Both branches must look for the same token: the per-patch placeholder. Matching on the image
            # start token here would never agree with the per-patch feature count checked below.
            patch_token_embed = self.get_input_embeddings()(
                torch.tensor(self.config.im_patch_token, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = (inputs_embeds == patch_token_embed).all(-1)
        else:
            special_image_mask = input_ids == self.config.im_patch_token

        n_image_tokens = special_image_mask.sum()

        # One feature vector per placeholder position, regardless of how the features are batched.
        n_image_features = image_features.numel() // image_features.shape[-1]
        if n_image_tokens != n_image_features:
            raise ValueError(
                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
            )

        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)

        return special_image_mask

    @can_return_tuple
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ):
        """
        Embed the text inputs, splice projected image features into the placeholder positions when
        `pixel_values` is given, and run the text decoder on the resulting embeddings.

        Raises:
            ValueError: If neither `input_ids` nor `inputs_embeds` is provided, or if the number of placeholder
                tokens does not match the number of image features.
        """
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You must specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.language_model.embed_tokens(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(pixel_values)
            # `masked_scatter` requires source and destination to agree on dtype (and device).
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
            )
            inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_features)

        # The decoder always receives embeddings, so `input_ids` is deliberately not forwarded.
        outputs = self.language_model(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        return PPChart2TableModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
+ """ + + _keys_to_ignore_on_load_missing = ["num_batches_tracked"] + _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"} + + def __init__(self, config: PPChart2TableConfig): + super().__init__(config) + self.model = PPChart2TableModel(config) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) + + self.post_init() + + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.model.set_input_embeddings(value) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + pixel_values=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + is_first_iteration=False, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + position_ids=position_ids, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + use_cache=use_cache, + is_first_iteration=is_first_iteration, + **kwargs, + ) + if not is_first_iteration and use_cache: + model_inputs["pixel_values"] = None + + return model_inputs + + @can_return_tuple + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[list[dict]] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + cache_position: Optional[torch.LongTensor] = None, + past_key_values: 
Optional[Cache] = None, + use_cache: Optional[bool] = None, + **kwargs, + ) -> Union[tuple[torch.FloatTensor], PPChart2TableCausalLMOutputWithPast]: + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + pixel_values=pixel_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + past_key_values=past_key_values, + cache_position=cache_position, + **kwargs, + ) + hidden_states = outputs.last_hidden_state + logits = self.lm_head(hidden_states) + + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + if labels is not None: + raise ValueError( + "The PPChart2TableForConditionalGeneration model only supports inference, and training is not allowed!\n" + "If you need to train this model, please implement the corresponding loss calculation logic, or use the inference-only mode (do not pass the `labels` parameter)." + ) + + return PPChart2TableCausalLMOutputWithPast( + logits=logits, + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "PPChart2TableForConditionalGeneration", + "PPChart2TableModel", + "PPChart2TablePreTrainedModel", + "PPChart2TableTextPreTrainedModel", + "PPChart2TableTextModel", + "PPChart2TableVisionPreTrainedModel", + "PPChart2TableVisionModel", +] diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py new file mode 100644 index 000000000000..fdf6e07fe0cd --- /dev/null +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -0,0 +1,1609 @@ +from dataclasses import dataclass +from typing import Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchvision.transforms.v2.functional import InterpolationMode + 
class PPChart2TableVisionConfig(PreTrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`PPChart2TableVisionModel`]. It is used to instantiate a
    PP-Chart2Table vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the PP-Chart2Table
    architecture developed by the PaddlePaddle team for chart-to-table parsing tasks.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        im_patch_token (`int`, *optional*, defaults to 151859):
            The token ID used to represent individual image patches in the multimodal input sequence.
        im_start_token (`int`, *optional*, defaults to 151857):
            The token ID representing the start of an image token sequence in the multimodal input.
        depth (`int`, *optional*, defaults to 12):
            Number of hidden layers in the vision Transformer encoder.
        embed_dim (`int`, *optional*, defaults to 768):
            Dimensionality of the patch embedding layer in the vision encoder.
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the hidden layers in the vision Transformer encoder.
        img_size (`int`, *optional*, defaults to 1024):
            The size (resolution) of input chart images (assumed to be square).
        mlp_ratio (`float`, *optional*, defaults to 4.0):
            Ratio of the dimensionality of the feed-forward layer to the hidden size in the vision Transformer blocks.
        num_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each self-attention layer in the vision Transformer encoder.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each image patch extracted from the input chart image.
        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether to add bias terms to the query, key, and value projection layers in the self-attention mechanism.
        use_rel_pos (`bool`, *optional*, defaults to `True`):
            Whether to use relative positional embeddings in the self-attention layers of the vision encoder.
        global_attn_indexes (`list`, *optional*, defaults to `[2, 5, 8, 11]`):
            List of layer indexes where global attention (instead of windowed attention) is applied in the vision encoder.
        window_size (`int`, *optional*, defaults to 14):
            The size of the attention window for windowed self-attention in the vision Transformer layers.
        out_chans (`int`, *optional*, defaults to 256):
            Number of output channels from the convolutional stem layer before patch embedding.

    Example:

    ```python
    >>> from transformers import PPChart2TableVisionConfig, PPChart2TableVisionModel

    >>> # Initializing a PPChart2TableVisionConfig with default PP-Chart2Table style configuration
    >>> configuration = PPChart2TableVisionConfig()

    >>> # Initializing a PPChart2TableVisionModel (with random weights) from the PP-Chart2Table style configuration
    >>> model = PPChart2TableVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "pp_chart2table_vision"
    base_config_key = "vision_config"

    def __init__(
        self,
        im_patch_token: int = 151859,
        im_start_token: int = 151857,
        depth: int = 12,
        embed_dim: int = 768,
        hidden_size: int = 1024,
        img_size: int = 1024,
        mlp_ratio: float = 4.0,
        num_heads: int = 12,
        patch_size: int = 16,
        qkv_bias: bool = True,
        use_rel_pos: bool = True,
        global_attn_indexes: Optional[list] = None,
        window_size: int = 14,
        out_chans: int = 256,
        **kwargs,
    ):
        self.im_patch_token = im_patch_token
        self.im_start_token = im_start_token

        self.depth = depth
        self.embed_dim = embed_dim
        self.hidden_size = hidden_size
        self.img_size = img_size
        self.mlp_ratio = mlp_ratio
        self.num_heads = num_heads
        self.patch_size = patch_size
        self.qkv_bias = qkv_bias
        self.use_rel_pos = use_rel_pos
        # `None` stands in for the default list so that instances never share one mutable default object.
        self.global_attn_indexes = global_attn_indexes if global_attn_indexes is not None else [2, 5, 8, 11]
        self.window_size = window_size
        self.out_chans = out_chans

        # Remaining keyword arguments are handled by the base configuration class.
        super().__init__(**kwargs)
Instantiating a + configuration with the defaults will yield a similar configuration to that of the text encoder/decoder of the + PPChart2TableText-7B-beta [Qwen/PPChart2TableText-7B-beta](https://huggingface.co/Qwen/PPChart2TableText-7B-beta) + architecture, optimized for chart-to-table text generation tasks. + + Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PreTrainedConfig`] for more information. + + Args: + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities in self-attention layers. + bos_token_id (`int`, *optional*, defaults to 151643): + The token ID representing the beginning of a sequence (BOS) for text generation. + eos_token_id (`int`, *optional*, defaults to 151643): + The token ID representing the end of a sequence (EOS) for text generation. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the feed-forward and attention layers of the decoder. + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the hidden representations in the Transformer decoder layers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + intermediate_size (`int`, *optional*, defaults to 2816): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer decoder blocks. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with for text input/output. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each self-attention layer in the Transformer decoder. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer decoder. 
+ num_key_value_heads (`int`, *optional*, defaults to 16): + Number of key/value heads for implementing Grouped Query Attention (GQA). If equal to `num_attention_heads`, + Multi Head Attention (MHA) is used; if 1, Multi Query Attention (MQA) is used. For more details, see + [this paper](https://huggingface.co/papers/2305.13245). + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon value used by the RMS normalization layers to avoid division by zero. + rope_theta (`float`, *optional*, defaults to 1000000.0): + The base period of the RoPE (Rotary Position Embedding) embeddings, controlling the frequency of positional encoding. + rope_parameters (`RopeParameters` or `dict`, *optional*): + Configuration parameters for RoPE embeddings, including scaling parameters for longer sequence lengths beyond + `max_position_embeddings`. + sliding_window (`int`, *optional*, defaults to 32768): + Window size for Sliding Window Attention (SWA) in the decoder layers (only active if `use_sliding_window=True`). + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether the model's input and output word embeddings should be tied (shared weights). + use_cache (`bool`, *optional*, defaults to `True`): + Whether to return the last key/value attention states to speed up sequential decoding (only relevant for autoregressive + generation). + vocab_size (`int`, *optional*, defaults to 151860): + Vocabulary size of the PPChart2TableText model. Defines the number of distinct tokens that can be represented + by `input_ids`. + layer_types (`list[str]`, *optional*): + Attention pattern for each decoder layer (e.g., `"full_attention"` or `"sliding_attention"`). If not specified, + automatically determined by `sliding_window`. 
+ + Example: + + ```python + >>> from transformers import PPChart2TableTextConfig, PPChart2TableTextModel + + >>> # Initializing a PPChart2TableText style configuration + >>> configuration = PPChart2TableTextConfig() + + >>> # Initializing a model from the PPChart2TableText-7B style configuration + >>> model = PPChart2TableTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + + model_type = "pp_chart2table_text" + keys_to_ignore_at_inference = ["past_key_values"] + + # Default tensor parallel plan for base model `PPChart2TableText` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + base_config_key = "text_config" + + def __init__( + self, + attention_dropout: float = 0.0, + bos_token_id: int = 151643, + eos_token_id: int = 151643, + hidden_act: str = "silu", + hidden_size: int = 1024, + initializer_range: float = 0.02, + intermediate_size: int = 2816, + max_position_embeddings: int = 32768, + num_attention_heads: int = 16, + num_hidden_layers: int = 24, + num_key_value_heads: int = 16, + rms_norm_eps: float = 1e-06, + rope_theta: float = 1000000.0, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, + sliding_window: int = 32768, + tie_word_embeddings: bool = True, + use_cache: bool = True, + vocab_size: int = 151860, + layer_types: Optional[list[str]] = None, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = 
class PPChart2TableConfig(PreTrainedConfig):
    r"""
    This is the main configuration class to store the configuration of a [`PPChart2TableModel`] or
    [`PPChart2TableForConditionalGeneration`]. It is used to instantiate a PP-Chart2Table multimodal model according
    to the specified arguments, defining the vision and text sub-model architectures. It combines the configurations
    of:

    - [`PPChart2TableVisionConfig`] (for the chart vision encoder)
    - [`PPChart2TableTextConfig`] (for the table text decoder)

    Instantiating a `PPChart2TableConfig` with the defaults will yield a similar configuration to the base
    PP-Chart2Table model
    [PaddlePaddle/PP-Chart2Table_safetensors](https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors)
    developed by the PaddlePaddle team for chart-to-table parsing tasks.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vision_config (`dict` or [`PPChart2TableVisionConfig`], *optional*):
            Options used to initialize [`PPChart2TableVisionConfig`], or an already-built instance which is then
            used as is. If `None`, the default `PPChart2TableVisionConfig` configuration will be used.
        text_config (`dict` or [`PPChart2TableTextConfig`], *optional*):
            Options used to initialize [`PPChart2TableTextConfig`], or an already-built instance which is then used
            as is. If `None`, the default `PPChart2TableTextConfig` configuration will be used.
        im_start_token (`int`, *optional*, defaults to 151857):
            The token ID representing the start of an image token sequence in the multimodal input.
        im_patch_token (`int`, *optional*, defaults to 151859):
            The token ID used to represent individual image patches in the multimodal input sequence.

    Example:

    ```python
    >>> from transformers import PPChart2TableConfig, PPChart2TableModel

    >>> # Initializing a PPChart2Table configuration with default vision and text sub-configs
    >>> configuration = PPChart2TableConfig()

    >>> # Initializing a PPChart2Table configuration with custom vision and text sub-configs
    >>> vision_config = {"img_size": 512, "patch_size": 8}
    >>> text_config = {"hidden_size": 2048, "num_hidden_layers": 16}
    >>> configuration = PPChart2TableConfig(vision_config=vision_config, text_config=text_config)

    >>> # Initializing a model from the PPChart2Table configuration
    >>> model = PPChart2TableModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    >>> # Accessing the vision sub-config
    >>> vision_config = configuration.vision_config
    >>> # Accessing the text sub-config
    >>> text_config = configuration.text_config
    ```
    """

    model_type = "pp_chart2table"
    sub_configs = {"vision_config": PPChart2TableVisionConfig, "text_config": PPChart2TableTextConfig}

    def __init__(
        self,
        vision_config: PPChart2TableVisionConfig | dict | None = None,
        text_config: PPChart2TableTextConfig | dict | None = None,
        im_start_token: int = 151857,
        im_patch_token: int = 151859,
        **kwargs,
    ):
        # Accept a ready-made sub-config as well as a plain dict: unpacking a config instance with `**` would fail.
        if isinstance(vision_config, PPChart2TableVisionConfig):
            self.vision_config = vision_config
        else:
            self.vision_config = PPChart2TableVisionConfig(**(vision_config or {}))

        if isinstance(text_config, PPChart2TableTextConfig):
            self.text_config = text_config
        else:
            self.text_config = PPChart2TableTextConfig(**(text_config or {}))

        # `model_type` is already provided by the class attribute; no per-instance copy is needed.
        self.im_start_token = im_start_token
        self.im_patch_token = im_patch_token

        # Mirror selected text-decoder settings on the top-level config so they can be read directly from it.
        text_config_keys = (
            "attention_dropout",
            "bos_token_id",
            "eos_token_id",
            "hidden_act",
            "hidden_size",
            "initializer_range",
            "intermediate_size",
            "max_position_embeddings",
            "num_attention_heads",
            "num_hidden_layers",
            "num_key_value_heads",
            "rms_norm_eps",
            "rope_theta",
            "sliding_window",
            "tie_word_embeddings",
            "dtype",
            "use_cache",
            "vocab_size",
        )
        for key in text_config_keys:
            # Keys the text config does not define are skipped rather than raising.
            if hasattr(self.text_config, key):
                setattr(self, key, getattr(self.text_config, key))

        super().__init__(**kwargs)
+ do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the pixel values from the range [0, 255] to [0, 1] using `rescale_factor`. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Factor to apply for rescaling pixel values (e.g., 1/255 scales 0-255 to 0-1). + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the input images using `image_mean` and `image_std`. + image_mean (`float` or `list[float]`, *optional*, defaults to `[0.406, 0.456, 0.485]`): + Mean values for image normalization (per channel, RGB order). + image_std (`float` or `list[float]`, *optional*, defaults to `[0.225, 0.224, 0.229]`): + Standard deviation values for image normalization (per channel, RGB order). + patch_size (`int`, *optional*, defaults to 16): + Size of image patches used by the PP-Chart2Table vision encoder (for alignment with model input). + merge_size (`int`, *optional*, defaults to 4): + Size factor for merging image patches (specific to PP-Chart2Table's vision processing pipeline). 
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = [0.406, 0.456, 0.485], + image_std: Optional[Union[float, list[float]]] = [0.225, 0.224, 0.229], + patch_size: int = 16, + merge_size: int = 4, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 256, "width": 256} + + self.do_resize = do_resize + self.size = size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.resample = resample + self.patch_size = patch_size + self.merge_size = merge_size + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + size: Optional[dict[str, int]] = None, + do_resize: Optional[bool] = None, + resample: Optional[PILImageResampling] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[Union[int, float]] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> BatchFeature: + size = self.size if size is None else size + do_resize = self.do_resize if do_resize is None else do_resize + resample = self.resample if resample is None else resample + do_rescale = self.do_rescale if do_rescale is None else do_rescale + rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor + do_normalize = self.do_normalize if do_normalize is None 
else do_normalize + image_mean = self.image_mean if image_mean is None else image_mean + image_std = self.image_std if image_std is None else image_std + + images = make_flat_list_of_images(images) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + size=size, + do_resize=do_resize, + resample=resample, + ) + + if not valid_images(images): + raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor") + + # All transformations expect numpy arrays + images = [to_numpy_array(image) for image in images] + if input_data_format is None: + input_data_format = infer_channel_dimension_format(images[0]) + + # transformations + resize_imgs = [] + if do_resize: + for image in images: + img = resize( + image, + size=(size["height"], size["width"]), + resample=resample, + input_data_format=input_data_format, + ) + resize_imgs.append(img) + images = resize_imgs + + if do_rescale: + images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] + + if do_normalize: + images = [ + self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images + ] + images = [flip_channel_order(image, input_data_format=input_data_format) for image in images] + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) + return encoded_inputs + + +class PPChart2TableImageProcessorFast(BaseImageProcessorFast): + r""" + Fast image processor for the PP-Chart2Table multimodal model, optimized for GPU-accelerated chart image preprocessing. 
+ + This high-performance processor implements a streamlined preprocessing pipeline for chart images (resizing, rescaling, + normalization, channel reordering) using PyTorch tensor operations, designed for efficient batch processing on GPUs. + It inherits from [`BaseImageProcessorFast`] and is optimized for inference/training pipelines requiring low-latency + image preprocessing. + + Class Attributes (Default Configuration): + resample (`int`, defaults to 3): + Integer identifier for the resampling filter (3 = BICUBIC, compatible with `InterpolationMode.BICUBIC`). + image_mean (`list[float]`, defaults to `[0.40821073, 0.4578275, 0.48145466]`): + Per-channel mean values for image normalization (RGB order). + image_std (`list[float]`, defaults to `[0.27577711, 0.26130258, 0.26862954]`): + Per-channel standard deviation values for image normalization (RGB order). + size (`dict[str, int]`, defaults to `{"height": 1024, "width": 1024}`): + Default target size for image resizing (1024x1024, optimized for PP-Chart2Table vision encoder). + patch_size (`int`, defaults to 16): + Size of image patches used by the PP-Chart2Table vision encoder (for alignment with model input). + merge_size (`int`, defaults to 4): + Size factor for merging image patches (specific to PP-Chart2Table's vision processing pipeline). + do_resize (`bool`, defaults to `True`): + Default flag to enable image resizing. + do_rescale (`bool`, defaults to `True`): + Default flag to enable pixel value rescaling (from [0,255] to [0,1]). + do_normalize (`bool`, defaults to `True`): + Default flag to enable image normalization. 
class PPChart2TableProcessor(ProcessorMixin):
    r"""
    [`PPChart2TableProcessor`] offers all the functionalities of [`PPChart2TableImageProcessor`] and [`Qwen2Tokenizer`]. See the
    [`~PPChart2TableProcessor.__call__`] and [`~PPChart2TableProcessor.decode`] for more information.

    Args:
        image_processor ([`PPChart2TableImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`Qwen2Tokenizer`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    """

    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
        # Extra keyword arguments are accepted for signature compatibility but are not forwarded.
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(
        self,
        images,
        text=None,
        **kwargs,
    ) -> BatchFeature:
        r"""
        Preprocess `images` and build the fixed chart-to-table prompt, one prompt row per image.

        Args:
            images:
                Image or batch of images forwarded to the image processor. Required.
            text (*optional*):
                Accepted for API compatibility; the prompt is fixed and this argument is not used.
            **kwargs:
                Accepted for API compatibility; not used.

        Returns:
            [`BatchFeature`]: holds `input_ids` (the tokenized prompt repeated once per image) together with
            everything returned by the image processor (e.g. `pixel_values`).

        Raises:
            ValueError: if `images` is `None`.
        """
        if images is None:
            # The prompt length is derived from the preprocessed image, so there is nothing to build without one.
            raise ValueError("`images` must be provided to `PPChart2TableProcessor`.")

        image_inputs = self.image_processor(images=images, return_tensors="pt")

        # `pixel_values` is unpacked as (batch, channels, height, width). The batch dimension, not the number of
        # keys in `image_inputs`, gives the number of prompts needed.
        batch_size, _, height, _ = image_inputs["pixel_values"].shape
        # Only the height is used, i.e. the preprocessed image is treated as square.
        num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size

        # NOTE(review): the two bare "" literals below are empty, so as written the prompt carries no image
        # placeholder tokens and `num_patches` has no effect on it. They look like image start/patch/end markers
        # that were lost somewhere - confirm against the tokenizer's special tokens before relying on this prompt.
        prompt = (
            "<|im_start|>system\n"
            "You should follow the instructions carefully and explain your answers in detail.<|im_end|><|im_start|>user\n"
            "" + "" * (num_patches * num_patches) + "\n"
            "Chart to table<|im_end|><|im_start|>assistant\n"
        )

        # Tokenize once and replicate the row for every image in the batch.
        input_ids = torch.tensor(self.tokenizer([prompt]).input_ids).repeat(batch_size, 1)
        return BatchFeature(data={"input_ids": input_ids, **image_inputs})

    def postprocess(self, model_pred, **kwargs):
        r"""
        Decode generated token IDs back to text.

        Args:
            model_pred:
                Model output; only its first element (`model_pred[0]`) is passed to the tokenizer's `batch_decode`.
            **kwargs:
                `skip_special_tokens` (defaults to `True`) is read from here; other keys are ignored.

        Returns:
            The value returned by the tokenizer's `batch_decode`.
        """
        return self.tokenizer.batch_decode(
            model_pred[0],
            skip_special_tokens=kwargs.get("skip_special_tokens", True),
            clean_up_tokenization_spaces=False,
        )
def window_unpartition(
    windows: torch.Tensor,
    window_size: int,
    pad_hw: tuple[int, int],
    hw: tuple[int, int],
) -> torch.Tensor:
    r"""
    Stitch square windows back into a 2D feature map and crop away any padding (inverse of `window_partition`).

    Args:
        windows (`torch.Tensor`):
            Windows of shape [num_windows * B, window_size, window_size, C].
        window_size (`int`):
            Side length of each window; must be the value used when partitioning.
        pad_hw (`tuple[int, int]`):
            Padded height and width (Hp, Wp) of the feature map.
        hw (`tuple[int, int]`):
            Height and width (H, W) of the feature map before padding.

    Returns:
        `torch.Tensor`:
            Feature map of shape [B, H, W, C].
    """
    padded_height, padded_width = pad_hw
    height, width = hw

    # Number of windows per image, used to recover the batch size from the leading dimension.
    windows_per_image = padded_height * padded_width // window_size // window_size
    batch_size = windows.shape[0] // windows_per_image

    # [B, rows, cols, ws, ws, C] -> [B, rows, ws, cols, ws, C] -> [B, Hp, Wp, C]
    grid = windows.reshape(
        batch_size,
        padded_height // window_size,
        padded_width // window_size,
        window_size,
        window_size,
        -1,
    )
    feature_map = grid.permute(0, 1, 3, 2, 4, 5).reshape(batch_size, padded_height, padded_width, -1)

    # Nothing to crop when the map was not padded.
    if padded_height <= height and padded_width <= width:
        return feature_map
    return feature_map[:, :height, :width, :]
def add_decomposed_rel_pos(
    attn: torch.Tensor,
    q: torch.Tensor,
    rel_pos_h: torch.Tensor,
    rel_pos_w: torch.Tensor,
    q_size: tuple[int, int],
    k_size: tuple[int, int],
) -> torch.Tensor:
    r"""
    Bias attention scores with relative positional embeddings that are decomposed along height and width.

    Args:
        attn (`torch.Tensor`):
            Attention scores of shape [B, q_h*q_w, k_h*k_w].
        q (`torch.Tensor`):
            Query tensor of shape [B, q_h*q_w, dim].
        rel_pos_h (`torch.Tensor`):
            Relative positional embeddings for the height axis.
        rel_pos_w (`torch.Tensor`):
            Relative positional embeddings for the width axis.
        q_size (`tuple[int, int]`):
            Spatial size (q_h, q_w) of the query feature map.
        k_size (`tuple[int, int]`):
            Spatial size (k_h, k_w) of the key feature map.

    Returns:
        `torch.Tensor`:
            Biased attention scores, same shape as `attn`: [B, q_h*q_w, k_h*k_w].
    """
    query_height, query_width = q_size
    key_height, key_width = k_size
    batch, _, channels = q.shape

    # Lay the queries out on their 2D grid so each axis can be contracted separately.
    query_grid = q.reshape(batch, query_height, query_width, channels)

    # Per-axis embedding tables indexed by (query position, key position).
    height_table = get_rel_pos(query_height, key_height, rel_pos_h)
    width_table = get_rel_pos(query_width, key_width, rel_pos_w)

    height_bias = torch.einsum("bhwc,hkc->bhwk", query_grid, height_table)
    width_bias = torch.einsum("bhwc,wkc->bhwk", query_grid, width_table)

    # The height bias broadcasts over key columns, the width bias over key rows.
    scores = attn.reshape(batch, query_height, query_width, key_height, key_width)
    scores = scores + height_bias.unsqueeze(-1) + width_bias.unsqueeze(-2)
    return scores.reshape(batch, query_height * query_width, key_height * key_width)
+ embed_dim (`int`, *optional*, defaults to 768): + Dimensionality of the output patch embeddings (hidden size of the vision transformer). + + Shape: + - Input: `(B, C, H, W)` (batch size, channels, height, width) + - Output: `(B, H_out, W_out, C_out)` (batch size, patch height, patch width, embedding dim) + """ + + def __init__( + self, + kernel_size: tuple[int, int] = (16, 16), + stride: tuple[int, int] = (16, 16), + padding: tuple[int, int] = (0, 0), + in_chans: int = 3, + embed_dim: int = 768, + ) -> None: + super().__init__() + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.proj(hidden_states) + hidden_states = hidden_states.permute(0, 2, 3, 1) + return hidden_states + + +class PPChart2TableVisionMLPBlock(nn.Module): + r""" + Multi-Layer Perceptron (MLP) block for PP-Chart2Table vision transformer layers. + + Implements a two-layer feed-forward network with activation function, used in the vision transformer's + decoder layers to project features to a higher dimension and back. + + Args: + embedding_dim (`int`): + Dimensionality of the input/output embeddings (hidden size of the transformer layer). + mlp_dim (`int`): + Dimensionality of the intermediate (hidden) layer in the MLP (typically 4x embedding_dim). + act (`Type[nn.Module]`, *optional*, defaults to `torch.nn.GELU`): + Non-linear activation function to apply between the two linear layers. 
+ + Shape: + - Input: `(B, H, W, embedding_dim)` or `(B, N, embedding_dim)` (N = H*W) + - Output: Same shape as input + """ + + def __init__( + self, + embedding_dim: int, + mlp_dim: int, + act: type[nn.Module] = torch.nn.GELU, + ) -> None: + super().__init__() + self.lin1 = nn.Linear(embedding_dim, mlp_dim) + self.lin2 = nn.Linear(mlp_dim, embedding_dim) + self.act = act() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return self.lin2(self.act(self.lin1(hidden_states))) + + +class PPChart2TableVisionLayerNorm2d(nn.Module): + r""" + 2D Layer Normalization for spatial feature maps (adapted for PP-Chart2Table vision encoder). + + Applies layer normalization over the channel dimension of 2D feature maps, with learnable scale/bias parameters + broadcasted across spatial dimensions (height/width). + + Args: + num_channels (`int`): + Number of channels in the input feature map (embedding dimension). + epsilon (`float`, *optional*, defaults to `1e-06`): + Small value added to variance to avoid division by zero. + + Shape: + - Input: `(B, C, H, W)` (batch size, channels, height, width) + - Output: Same shape as input + """ + + def __init__(self, num_channels: int, epsilon: float = 1e-06) -> None: + super().__init__() + self.weight = nn.Parameter(torch.ones(num_channels)) + self.bias = nn.Parameter(torch.zeros(num_channels)) + self.epsilon = epsilon + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + u = hidden_states.mean(dim=1, keepdim=True) + s = (hidden_states - u).pow(2).mean(dim=1, keepdim=True) + hidden_states = (hidden_states - u) / torch.sqrt(s + self.epsilon) + hidden_states = self.weight[:, None, None] * hidden_states + self.bias[:, None, None] + return hidden_states + + +class PPChart2TableVisionAttention(nn.Module): + r""" + Multi-Head Self-Attention (MHSA) layer for PP-Chart2Table vision encoder, with optional relative positional encoding. 
+
+    Implements standard multi-head attention with query/key/value projection, scaled dot-product attention,
+    and optional decomposed relative positional embeddings (height/width separate) for spatial awareness.
+
+    Args:
+        dim (`int`):
+            Dimensionality of the input embeddings (hidden size of the transformer layer).
+        num_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads (must divide `dim` evenly).
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add bias terms to the query/key/value projection layers.
+        use_rel_pos (`bool`, *optional*, defaults to `False`):
+            Whether to use relative positional encoding for spatial attention.
+        rel_pos_zero_init (`bool`, *optional*, defaults to `True`):
+            Whether to initialize relative positional embeddings to zero (stable training).
+            Currently not read by `__init__`: the embeddings are always created with `torch.zeros`.
+        input_size (`Tuple[int, int]`, *optional*):
+            Spatial size (H, W) of the input feature map (required if `use_rel_pos=True`).
+
+    Raises:
+        ValueError: If `use_rel_pos=True` and `input_size` is `None`.
+
+    Shape:
+        - Input: `(B, H, W, dim)` (batch size, height, width, embedding dim)
+        - Output: Same shape as input
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = True,
+        use_rel_pos: bool = False,
+        rel_pos_zero_init: bool = True,
+        input_size: Optional[tuple[int, int]] = None,
+    ) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.proj = nn.Linear(dim, dim)
+
+        self.use_rel_pos = use_rel_pos
+        if self.use_rel_pos:
+            # Explicit check instead of `assert`: asserts are stripped under `python -O`, which would
+            # turn a missing `input_size` into an opaque TypeError when the position tables are sized.
+            if input_size is None:
+                raise ValueError("Input size must be provided if using relative positional encoding.")
+ self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim)) + self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim)) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + B, H, W, _ = hidden_states.shape + qkv = self.qkv(hidden_states).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(dim=0) + attn = (q * self.scale) @ k.transpose(1, 2) + + if self.use_rel_pos: + attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W)) + + attn = F.softmax(attn, dim=-1) + hidden_states = (attn @ v).reshape(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1) + hidden_states = self.proj(hidden_states) + return hidden_states + + +class PPChart2TableVisionDecoderLayer(nn.Module): + r""" + Single decoder layer of the PP-Chart2Table vision transformer, with optional windowed attention. + + Implements the standard transformer decoder layer structure: + Layer Norm → Multi-Head Attention (with residual) → Layer Norm → MLP (with residual) + Supports windowed attention (SW-MHA) for large feature maps to reduce computation. + + Args: + dim (`int`): + Dimensionality of the input embeddings (hidden size of the transformer layer). + num_heads (`int`): + Number of attention heads (passed to MHSA layer). + mlp_ratio (`float`, *optional*, defaults to 4.0): + Ratio of MLP hidden dimension to embedding dimension (mlp_dim = dim * mlp_ratio). + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to use bias in Q/K/V projection (passed to MHSA layer). + norm_layer (`Type[nn.Module]`, *optional*, defaults to `nn.LayerNorm`): + Normalization layer to use (LayerNorm for flattened patches, LayerNorm2d for 2D feature maps). + act_layer (`Type[nn.Module]`, *optional*, defaults to `nn.GELU`): + Activation function for MLP block. 
+ use_rel_pos (`bool`, *optional*, defaults to `False`): + Whether to use relative positional encoding (passed to MHSA layer). + rel_pos_zero_init (`bool`, *optional*, defaults to `True`): + Whether to zero-initialize relative positional embeddings (passed to MHSA layer). + window_size (`int`, *optional*, defaults to 0): + Size of attention windows (0 = full attention, >0 = windowed attention). + input_size (`Tuple[int, int]`, *optional*): + Spatial size of input feature map (passed to MHSA layer for relative positional encoding). + + Shape: + - Input: `(B, H, W, dim)` (batch size, height, width, embedding dim) + - Output: Same shape as input + """ + + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = True, + norm_layer: type[nn.Module] = nn.LayerNorm, + act_layer: type[nn.Module] = nn.GELU, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + window_size: int = 0, + input_size: Optional[tuple[int, int]] = None, + ) -> None: + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = PPChart2TableVisionAttention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + use_rel_pos=use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + input_size=input_size if window_size == 0 else (window_size, window_size), + ) + + self.norm2 = norm_layer(dim) + self.mlp = PPChart2TableVisionMLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer) + + self.window_size = window_size + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + shortcut = hidden_states + hidden_states = self.norm1(hidden_states) + if self.window_size > 0: + H, W = hidden_states.shape[1], hidden_states.shape[2] + hidden_states, pad_hw = window_partition(hidden_states, self.window_size) + hidden_states = self.attn(hidden_states) + + if self.window_size > 0: + hidden_states = window_unpartition(hidden_states, self.window_size, pad_hw, (H, W)) + hidden_states = shortcut + hidden_states + hidden_states = hidden_states + 
self.mlp(self.norm2(hidden_states)) + return hidden_states + + +class PPChart2TableVisionPreTrainedModel(PreTrainedModel): + r""" + Base class for all PP-Chart2Table vision models, inheriting from Hugging Face `PreTrainedModel`. + + This class sets up core configurations and compatibility flags for the vision encoder, including: + - Support for gradient checkpointing, attention backends (FlashAttention/SDPA), and model compilation + - Definition of non-splittable modules (for tensor parallelism) + - Output recording for hidden states/attentions (for debugging/analysis) + + Class Attributes: + config (`PPChart2TableVisionConfig`): + Typed config class for PP-Chart2Table vision encoder (enforces type checking). + base_model_prefix (`str`, defaults to `"model"`): + Prefix for base model parameters (used in weight loading/saving). + supports_gradient_checkpointing (`bool`, defaults to `True`): + Whether the model supports gradient checkpointing to save memory. + _no_split_modules (`list[str]`): + Modules that should not be split across devices (tensor parallelism compatibility). + _skip_keys_device_placement (`list[str]`): + Keys to skip when placing tensors on devices (e.g., past key values for generation). + _supports_flash_attn / _supports_sdpa / _supports_flex_attn (`bool`): + Compatibility with optimized attention implementations (FlashAttention, SDPA, FlexAttention). + _can_compile_fullgraph (`bool`, defaults to `True`): + Whether the model supports TorchScript/TorchCompile full graph compilation. + _supports_attention_backend (`bool`, defaults to `True`): + Whether the model supports switching attention backends (e.g., PyTorch vs FlashAttention). + _can_record_outputs (`dict`): + Mapping of output types to modules for recording intermediate outputs (hidden_states/attentions). 
+ """ + + config: PPChart2TableVisionConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["PPChart2TableVisionDecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + + _can_compile_fullgraph = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": PPChart2TableVisionDecoderLayer, + "attentions": PPChart2TableVisionAttention, + } + + +class PPChart2TableVisionModel(PPChart2TableVisionPreTrainedModel): + + main_input_name = "pixel_values" + input_modalities = "image" + + def __init__( + self, + config: PPChart2TableVisionConfig, + in_chans: int = 3, + norm_layer: type[nn.Module] = nn.LayerNorm, + act_layer: type[nn.Module] = nn.GELU, + rel_pos_zero_init: bool = True, + ) -> None: + super().__init__(config) + self.img_size = config.img_size + + self.patch_embed = PPChart2TableVisionPatchEmbed( + kernel_size=(config.patch_size, config.patch_size), + stride=(config.patch_size, config.patch_size), + in_chans=in_chans, + embed_dim=config.embed_dim, + ) + + self.pos_embed = nn.Parameter( + torch.zeros( + 1, config.img_size // config.patch_size, config.img_size // config.patch_size, config.embed_dim + ) + ) + + self.blocks = nn.ModuleList() + for i in range(config.depth): + block = PPChart2TableVisionDecoderLayer( + dim=config.embed_dim, + num_heads=config.num_heads, + mlp_ratio=config.mlp_ratio, + qkv_bias=config.qkv_bias, + norm_layer=norm_layer, + act_layer=act_layer, + use_rel_pos=config.use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + window_size=config.window_size if i not in config.global_attn_indexes else 0, + input_size=(config.img_size // config.patch_size, config.img_size // config.patch_size), + ) + self.blocks.append(block) + + self.neck = nn.Sequential( + nn.Conv2d( + config.embed_dim, + config.out_chans, + kernel_size=1, + bias=False, + ), + 
+            PPChart2TableVisionLayerNorm2d(config.out_chans),
+            nn.Conv2d(
+                config.out_chans,
+                config.out_chans,
+                kernel_size=3,
+                padding=1,
+                bias=False,
+            ),
+            PPChart2TableVisionLayerNorm2d(config.out_chans),
+        )
+
+        # `net_2` consumes the neck output, so its input width must follow `config.out_chans`
+        # instead of a literal 256 (identical weights when `out_chans == 256`).
+        self.net_2 = nn.Conv2d(config.out_chans, 512, kernel_size=3, stride=2, padding=1, bias=False)
+        self.net_3 = nn.Conv2d(512, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False)
+
+        self.post_init()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """
+        Encode an image batch into a channels-first feature map.
+
+        The input goes through the patch embedding, gets the learned `pos_embed` added, runs through every
+        block in `self.blocks`, is permuted to channels-first for the convolutional neck, and is finally
+        downsampled by the two stride-2 convolutions `net_2` and `net_3`.
+
+        Args:
+            hidden_states (`torch.Tensor`):
+                Input handed to the patch-embedding convolution.
+
+        Returns:
+            `torch.Tensor`: Output of `net_3`, with `config.hidden_size` channels.
+        """
+        hidden_states = self.patch_embed(hidden_states)
+        hidden_states = hidden_states + self.pos_embed
+        for blk in self.blocks:
+            hidden_states = blk(hidden_states)
+        # Blocks work channels-last; the neck and downsampling convs expect channels-first.
+        hidden_states = self.neck(hidden_states.permute(0, 3, 1, 2))
+        hidden_states = self.net_2(hidden_states)
+        hidden_states = self.net_3(hidden_states)
+        return hidden_states
+
+
+class PPChart2TableTextAttention(Qwen2Attention):
+    pass
+
+
+class PPChart2TableTextDecoderLayer(Qwen2DecoderLayer):
+    pass
+
+
+class PPChart2TableTextPreTrainedModel(Qwen2PreTrainedModel):
+    pass
+
+
+class PPChart2TableTextModel(Qwen2Model):
+    pass
+
+
+@dataclass
+class PPChart2TableModelOutputWithPast(ModelOutput):
+    r"""
+    Output class for PPChart2Table multimodal model's forward pass, extending Hugging Face `ModelOutput`.
+
+    This dataclass encapsulates the core outputs of the PP-Chart2Table base model, including hidden states,
+    attention weights, and cached key/value pairs for efficient generation.
+
+    Attributes:
+        past_key_values (`Optional[Cache]`, defaults to `None`):
+            Cached attention key/value pairs from the text decoder (for fast autoregressive generation).
+        last_hidden_state (`Optional[torch.FloatTensor]`, defaults to `None`):
+            Final hidden states from the text decoder (shape: `[B, seq_len, hidden_size]`), after multimodal fusion.
+        hidden_states (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`):
+            Tuple of hidden states from each layer of the text decoder (for debugging/analysis).
+ attentions (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`): + Tuple of attention weights from each layer of the text decoder (for debugging/analysis). + """ + + past_key_values: Optional[Cache] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +@dataclass +class PPChart2TableCausalLMOutputWithPast(ModelOutput): + r""" + Output class for PP-Chart2Table conditional generation model's forward pass. + + Extends `PPChart2TableModelOutputWithPast` with language modeling logits (for token prediction), + tailored for autoregressive table generation tasks. + + Attributes: + logits (`Optional[torch.FloatTensor]`, defaults to `None`): + Language modeling logits (shape: `[B, seq_len, vocab_size]`), output from the LM head. + past_key_values (`Optional[Cache]`, defaults to `None`): + Cached attention key/value pairs (inherited from base model output). + last_hidden_state (`Optional[torch.FloatTensor]`, defaults to `None`): + Final hidden states from the text decoder (inherited from base model output). + hidden_states (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`): + Tuple of decoder layer hidden states (inherited from base model output). + attentions (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`): + Tuple of decoder layer attention weights (inherited from base model output). + """ + + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +class PPChart2TablePreTrainedModel(PreTrainedModel): + r""" + Base class for all PP-Chart2Table multimodal models, inheriting from Hugging Face `PreTrainedModel`. 
+ + This class defines core configurations and compatibility flags for the multimodal model (vision + text), + including support for gradient checkpointing, optimized attention backends, and model compilation. + + Class Attributes: + config (`PPChart2TableConfig`): + Typed config class for PP-Chart2Table (combines vision + text sub-configs). + base_model_prefix (`str`, defaults to `"model"`): + Prefix for base model parameters (used in weight loading/saving). + supports_gradient_checkpointing (`bool`, defaults to `True`): + Whether the model supports gradient checkpointing to save memory during training. + _no_split_modules (`list[str]`): + Modules that should not be split across devices (tensor parallelism compatibility). + _skip_keys_device_placement (`list[str]`): + Keys to skip when placing tensors on devices (e.g., past key values for generation). + _supports_flash_attn / _supports_sdpa / _supports_flex_attn (`bool`): + Compatibility with optimized attention implementations (FlashAttention, SDPA, FlexAttention). + _can_compile_fullgraph (`bool`, defaults to `True`): + Whether the model supports TorchScript/TorchCompile full graph compilation. + _supports_attention_backend (`bool`, defaults to `True`): + Whether the model supports switching attention backends (e.g., PyTorch vs FlashAttention). + _can_record_outputs (`dict`): + Mapping of output types to modules for recording intermediate outputs (hidden_states/attentions). 
+ """ + + config: PPChart2TableConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["PPChart2TableTextDecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + + _can_compile_fullgraph = True + _supports_attention_backend = True + + _can_record_outputs = { + "hidden_states": PPChart2TableTextDecoderLayer, + "attentions": PPChart2TableTextAttention, + } + + +class PPChart2TableModel(PPChart2TablePreTrainedModel): + r""" + Core PP-Chart2Table multimodal model (vision encoder + text decoder) for chart-to-table parsing. + + This model integrates a vision encoder (for chart image feature extraction) and a text decoder (for table generation), + with a multimodal projection layer to align vision features with text embedding space. The core logic is: + 1. Extract chart features via vision encoder + 2. Project vision features to text embedding dimension + 3. Inject vision features into text decoder inputs (replace image placeholder tokens) + 4. Forward pass through text decoder to generate table text + + Args: + config (`PPChart2TableConfig`): + Combined configuration class (includes vision_config and text_config sub-configs). + + Inputs (forward method): + input_ids (`torch.LongTensor`, optional): + Tokenized input text (including image placeholder tokens) with shape `[B, seq_len]`. + attention_mask (`torch.Tensor`, optional): + Attention mask to avoid padding tokens (shape: `[B, seq_len]`). + position_ids (`torch.Tensor`, optional): + Positional indices for input tokens (shape: `[B, seq_len]`). + past_key_values (`list[torch.Tensor]`, optional): + Cached key/value pairs for fast autoregressive generation. + inputs_embeds (`torch.Tensor`, optional): + Precomputed input embeddings (shape: `[B, seq_len, hidden_size]`; overrides `input_ids`). + use_cache (`bool`, optional): + Whether to cache key/value pairs for generation. 
+ pixel_values (`torch.Tensor`, optional): + Preprocessed chart images (shape: `[B, 3, H, W]`; required for multimodal input). + cache_position (`torch.LongTensor`, optional): + Position indices for cached key/value pairs (for generation). + **kwargs: + Additional arguments passed to the text decoder. + + Outputs: + `PPChart2TableModelOutputWithPast`: + Contains the text decoder's final hidden states, cached key/values, and optional intermediate outputs. + """ + + config_class = PPChart2TableConfig + + def __init__(self, config: PPChart2TableConfig): + super().__init__(config) + self.vision_tower_high = PPChart2TableVisionModel._from_config(config.vision_config) + self.language_model = PPChart2TableTextModel._from_config(config.text_config) + self.mm_projector_vary = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + """Get input embeddings from the text decoder (for weight tying/loading).""" + return self.language_model.embed_tokens + + def set_input_embeddings(self, value): + """Set input embeddings for the text decoder (for weight tying/loading).""" + self.language_model.embed_tokens = value + + def get_image_features( + self, + images: Optional[torch.Tensor], + ) -> list[torch.Tensor]: + r""" + Extract and project chart image features to text embedding space. + + Args: + images (`torch.Tensor`): + Preprocessed chart images (shape: `[B, 3, H, W]`). + + Returns: + `list[torch.Tensor]`: + List of projected image features (one per image), each with shape `[1, num_patches, text_hidden_size]`. 
+        """
+        image_features = []
+        for image in images:
+            image = image.unsqueeze(0)
+            # NOTE(review): block nesting was reconstructed from a whitespace-collapsed patch; confirm that
+            # only the vision tower (not the projector) is meant to run under `no_grad`.
+            with torch.no_grad():
+                cnn_feature = self.vision_tower_high(image)
+                cnn_feature = cnn_feature.flatten(2).transpose(2, 1)
+            image_feature = self.mm_projector_vary(cnn_feature)
+            image_features.append(image_feature)
+
+        image_features = torch.stack(image_features, dim=0)
+
+        return image_features
+
+    def get_placeholder_mask(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        image_features: Optional[torch.FloatTensor] = None,
+    ) -> torch.BoolTensor:
+        r"""
+        Generate mask to locate image placeholder tokens in input embeddings.
+
+        This mask identifies the image patch placeholder tokens (`config.im_patch_token`) in the input
+        sequence, which will be replaced with projected image features for multimodal fusion.
+
+        Args:
+            input_ids (`torch.LongTensor`, optional):
+                Token ids. When given, placeholders are found by comparing against `config.im_patch_token`.
+            inputs_embeds (`torch.FloatTensor`):
+                Input embeddings. Always required: the mask is expanded to their shape. When `input_ids` is
+                `None`, placeholders are found by comparing every position with the embedding of
+                `config.im_patch_token`.
+            image_features (`torch.FloatTensor`):
+                Projected image features (used to validate token-feature count match).
+
+        Returns:
+            `torch.BoolTensor`:
+                Boolean mask (shape: `[B, seq_len, text_hidden_size]`) where `True` indicates image placeholder tokens.
+
+        Raises:
+            ValueError: If the number of image tokens does not match the number of image features.
+        """
+        if input_ids is None:
+            # Look for the *patch* token embedding, mirroring the `input_ids` branch below. Matching the
+            # start token here would find one position per image and always fail the count check.
+            patch_token_embed = self.get_input_embeddings()(
+                torch.tensor(self.config.im_patch_token, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = inputs_embeds == patch_token_embed
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.im_patch_token
+
+        n_image_tokens = special_image_mask.sum()
+
+        # One feature vector per placeholder position, whatever the leading dimensions are.
+        n_image_features = image_features.numel() // image_features.shape[-1]
+        if n_image_tokens != n_image_features:
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+
+        return special_image_mask
+
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ):
+        """
+        Embed the text, splice projected image features into the placeholder positions and run the decoder.
+
+        `inputs_embeds` is computed from `input_ids` when not supplied. When `pixel_values` is given, the
+        image features overwrite the embeddings at the positions selected by `get_placeholder_mask`. The
+        language model is always called with `input_ids=None` and the (possibly fused) embeddings.
+
+        Returns:
+            `PPChart2TableModelOutputWithPast` built from the language model outputs.
+        """
+        if inputs_embeds is None:
+            inputs_embeds = self.language_model.embed_tokens(input_ids)
+
+        if pixel_values is not None:
+            image_features = self.get_image_features(pixel_values)
+            # `masked_scatter` needs source and destination to agree on device and dtype.
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            image_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_features)
+
+        outputs = self.language_model(
+            input_ids=None,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        output = PPChart2TableModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+        return output
+
+
+class PPChart2TableForConditionalGeneration(PPChart2TablePreTrainedModel, GenerationMixin):
+    r"""
+    PP-Chart2Table model for conditional generation (table text generation from chart images),
+    extending the core model with a language modeling (LM) head and generation utilities.
+
+    This class integrates Hugging Face `GenerationMixin` to support standard generation methods (greedy, beam search, etc.),
+    and adds an LM head to predict token probabilities for autoregressive table generation.
+
+    Key Features:
+        - LM head for token prediction (weight tied to input embeddings)
+        - Optimized generation input preparation (avoids reprocessing images in subsequent steps)
+        - Inference-only mode (training not supported by default)
+
+    Args:
+        config (`PPChart2TableConfig`):
+            Combined configuration class (vision + text sub-configs).
+
+    Inputs (forward method):
+        Inherits all inputs from `PPChart2TableModel`, plus:
+        labels (`list[dict]`, optional):
+            Training labels (not supported; raises ValueError if provided).
+        logits_to_keep (`Union[int, torch.Tensor]`, defaults to 0):
+            Slice index to keep only the last N logits (optimizes generation efficiency).
+
+    Outputs:
+        `PPChart2TableCausalLMOutputWithPast`:
+            Contains LM logits, decoder hidden states, and cached key/value pairs.
+    """
+
+    _keys_to_ignore_on_load_missing = ["num_batches_tracked"]
+    _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"}
+
+    def __init__(self, config: PPChart2TableConfig):
+        super().__init__(config)
+        self.model = PPChart2TableModel(config)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the input embedding module of the wrapped `PPChart2TableModel`."""
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        """Replace the input embedding module of the wrapped `PPChart2TableModel`."""
+        self.model.set_input_embeddings(value)
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        pixel_values=None,
+        pixel_values_videos=None,
+        image_grid_thw=None,
+        video_grid_thw=None,
+        is_first_iteration=False,
+        **kwargs,
+    ):
+        """
+        Build the model inputs for one generation step.
+
+        Delegates to the parent implementation, then clears `pixel_values` on every step that is not the
+        first one when caching is enabled.
+        """
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            pixel_values=pixel_values,
+            pixel_values_videos=pixel_values_videos,
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
+            use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
+            **kwargs,
+        )
+        if not is_first_iteration and use_cache:
+            model_inputs["pixel_values"] = None
+
+        return model_inputs
+
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[list[dict]] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        cache_position: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple[torch.FloatTensor], PPChart2TableCausalLMOutputWithPast]:
+        """
+        Run the multimodal model and project the kept positions to vocabulary logits.
+
+        Args:
+            labels: Not supported; passing anything other than `None` raises.
+            logits_to_keep: If an `int`, used as `slice(-logits_to_keep, None)` over the sequence dimension
+                (so `0` keeps every position); otherwise used directly as the sequence index.
+            Remaining arguments are forwarded unchanged to `self.model`.
+
+        Raises:
+            ValueError: If `labels` is provided.
+        """
+        # Reject the unsupported training path before paying for a forward pass.
+        if labels is not None:
+            raise ValueError(
+                "The PPChart2TableForConditionalGeneration model only supports inference, and training is not allowed!\n"
+                "If you need to train this model, please implement the corresponding loss calculation logic, or use the inference-only mode (do not pass the `labels` parameter)."
+            )
+
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            pixel_values=pixel_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            past_key_values=past_key_values,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = outputs.last_hidden_state
+
+        # Apply the LM head once, and only to the positions that are kept.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        return PPChart2TableCausalLMOutputWithPast(
+            logits=logits,
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [
+    "PPChart2TableForConditionalGeneration",
+    "PPChart2TableModel",
+    "PPChart2TablePreTrainedModel",
+    "PPChart2TableConfig",
+    "PPChart2TableTextPreTrainedModel",
+    "PPChart2TableTextModel",
+    "PPChart2TableVisionPreTrainedModel",
+    "PPChart2TableVisionModel",
+    "PPChart2TableVisionConfig",
+    "PPChart2TableTextConfig",
+    "PPChart2TableImageProcessor",
+    "PPChart2TableImageProcessorFast",
+    "PPChart2TableProcessor",
+]
diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
new file mode 100644
index 000000000000..7d27beef3dec
--- /dev/null
+++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py
@@ -0,0 +1,65 @@
+# 
🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/pp_chart2table/modular_pp_chart2table.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_pp_chart2table.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 + +import torch + +from transformers.feature_extraction_utils import BatchFeature +from transformers.processing_utils import ProcessorMixin + + +class PPChart2TableProcessor(ProcessorMixin): + r""" + [`PPChart2TableProcessor`] offers all the functionalities of [`PPChart2TableImageProcessor`] and [`Qwen2Tokenizer`]. See the + [`~PPChart2TableProcessor.__call__`] and [`~PPChart2TableProcessor.decode`] for more information. + Args: + image_processor ([`PPChart2TableImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`Qwen2Tokenizer`], *optional*): + The tokenizer is a required input. + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. 
+    """
+
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    def __call__(
+        self,
+        images,
+        text=None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Preprocess chart images and build the fixed "Chart to table" prompt, one copy per image.
+
+        Args:
+            images: Image(s) passed to the image processor; required.
+            text: Accepted for signature compatibility; not used, the prompt is fixed.
+            **kwargs: Accepted and ignored.
+
+        Returns:
+            `BatchFeature` with `input_ids` (the tokenized prompt repeated once per image) plus everything
+            returned by the image processor.
+
+        Raises:
+            ValueError: If `images` is `None`.
+        """
+        if images is None:
+            # The prompt length is derived from the image size, so there is nothing to build without images.
+            raise ValueError("`images` must be provided to `PPChart2TableProcessor`.")
+        image_inputs = self.image_processor(images=images, return_tensors="pt")
+        # Take the image count from the pixel tensor's batch dimension: `len(image_inputs)` would count
+        # the keys of the returned mapping, not the images.
+        img_cnt, _, h, _ = image_inputs["pixel_values"].shape
+        num_patches = h // self.image_processor.patch_size // self.image_processor.merge_size
+        # NOTE(review): the three image placeholder literals below (start / patch / end) are empty in this
+        # copy of the patch. They presumably need to be the tokenizer strings behind `config.im_start_token`,
+        # `config.im_patch_token` and `config.im_end_token` -- confirm against the tokenizer.
+        prompt = (
+            "<|im_start|>system\n"
+            "You should follow the instructions carefully and explain your answers in detail.<|im_end|><|im_start|>user\n"
+            "" + "" * (num_patches * num_patches) + "\n"
+            "Chart to table<|im_end|><|im_start|>assistant\n"
+        )
+        input_ids = torch.tensor(self.tokenizer([prompt]).input_ids)
+        input_ids = input_ids.repeat(img_cnt, 1)
+        input_ids = {"input_ids": input_ids}
+        return BatchFeature(data={**input_ids, **image_inputs})
+
+    def postprocess(self, model_pred, **kwargs):
+        """Decode `model_pred[0]` with the tokenizer; `skip_special_tokens` defaults to `True`."""
+        return self.tokenizer.batch_decode(
+            model_pred[0],
+            skip_special_tokens=kwargs.get("skip_special_tokens", True),
+            clean_up_tokenization_spaces=False,
+        )
+
+
+__all__ = ["PPChart2TableProcessor"]
diff --git a/tests/models/pp_chart2table/__init__.py b/tests/models/pp_chart2table/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
new file mode 100644
index 000000000000..1143253791fa
--- /dev/null
+++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py
@@ -0,0 +1,391 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PPChart2Table model.""" + +import gc +import unittest + +import pytest +from parameterized import parameterized +from PIL import Image + +from transformers import ( + AutoProcessor, + PPChart2TableConfig, + PPChart2TableForConditionalGeneration, + is_torch_available, +) +from transformers.testing_utils import ( + backend_empty_cache, + require_torch, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + ModelTesterMixin, +) +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + +class PPChart2TableVisionText2TextModelTester: + def __init__( + self, + parent, + batch_size=7, + seq_length=31, + num_channels=3, + image_height=64, + image_width=64, + text_config={ + "hidden_size": 32, + "hidden_act": "silu", + "num_hidden_layers": 2, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "intermediate_size": 32, + "attention_dropout": 0.0, + "sliding_window": 32768, + "rms_norm_eps": 1e-06, + "vocab_size": 151860, + "max_position_embeddings": 32768, + "rope_parameters": {"rope_theta": 1000000.0, "rope_type": "default"}, + }, + is_training=False, + vision_config={ + "depth": 2, + "embed_dim": 768, + "hidden_size": 144, + "img_size": 64, + "mlp_ratio": 4.0, + "norm_layer_eps": 1e-6, + "num_heads": 4, + "patch_size": 16, + "qkv_bias": True, + "use_rel_pos": True, + "global_attn_indexes": [2, 5, 8, 11], + "window_size": 14, + "out_chans": 256, + }, + 
bos_token_id=151643, + eos_token_id=151643, + im_start_token=151857, + im_end_token=151858, + im_patch_token=151859, + ): + self.parent = parent + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.num_hidden_layers = text_config["num_hidden_layers"] + self.num_attention_heads = text_config["num_attention_heads"] + self.hidden_size = text_config["hidden_size"] + self.im_start_token = im_start_token + self.im_end_token = im_end_token + self.im_patch_token = im_patch_token + self.text_config = text_config + self.vision_config = vision_config + self.batch_size = batch_size + self.seq_length = seq_length + self.num_channels = num_channels + self.image_height = image_height + self.image_width = image_width + self.is_training = is_training + self.vocab_size = text_config["vocab_size"] + + def get_config(self): + return PPChart2TableConfig( + text_config=self.text_config, + vision_config=self.vision_config, + ) + + def prepare_config_and_inputs(self): + config = self.get_config() + pixel_values = torch.randn((1, 3, self.image_height, self.image_width)).to(torch_device) + + num_patch = self.image_height // 16 // 4 + input = ( + [ + 151644, + 8948, + 198, + 2610, + 1265, + 1795, + 279, + 11221, + 15516, + 323, + 10339, + 697, + 11253, + 304, + 7716, + 13, + 151645, + 151644, + 872, + 198, + 151857, + ] + + [151859] * (num_patch * num_patch) + + [151858, 198, 14488, 311, 1965, 151645, 151644, 77091, 198] + ) + + input_ids = torch.tensor(input).unsqueeze(0).to(torch_device) + + return config, pixel_values, input_ids + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, input_ids = config_and_inputs + attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + inputs_dict = { + "pixel_values": pixel_values, + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + +@require_torch +class 
PPChart2TableModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (PPChart2TableForConditionalGeneration,) if is_torch_available() else () + pipeline_model_mapping = {"image-text-to-text": PPChart2TableForConditionalGeneration} + _is_composite = True + + def setUp(self): + self.model_tester = PPChart2TableVisionText2TextModelTester(self) + self.config_tester = ConfigTester(self, config_class=PPChart2TableConfig, has_text_modality=False) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="PPChart2Table does not support.") + def test_sliding_window_mask(self): + pass + + @unittest.skip(reason="PPChart2Table does not support.") + def test_generate_compile_model_forward_fullgraph(self): + pass + + @unittest.skip(reason="PPChart2Table does not support.") + def test_multi_gpu_data_parallel_forward(self): + pass + + @pytest.mark.generate + @unittest.skip(reason="PPChart2Table does not support beam search.") + def test_beam_sample_generate(self): + pass + + @pytest.mark.generate + @unittest.skip(reason="PPChart2Table does not support beam search.") + def test_beam_search_generate(self): + pass + + @pytest.mark.generate + @unittest.skip(reason="PPChart2Table does not support beam search.") + def test_beam_search_generate_dict_output(self): + pass + + @pytest.mark.generate + @unittest.skip(reason="PPChart2Table does not support beam search.") + def test_beam_search_generate_dict_outputs_use_cache(self): + pass + + @pytest.mark.generate + @unittest.skip(reason="PPChart2Table does not support beam search.") + def test_beam_sample_generate_dict_output(self): + pass + + @unittest.skip(reason="PPChart2Table needs to apply weight conversions.") + def test_can_load_from_already_mapped_keys(self): + pass + + @pytest.mark.generate + @unittest.skip(reason="PPChart2Table does not support beam search.") + def test_generate_from_inputs_embeds_1_beam_search(self, _, num_beams): + pass + 
+ @parameterized.expand([("random",), ("same",)]) + @pytest.mark.generate + @unittest.skip(reason="PPChart2Table does not support assisted decoding.") + def test_assisted_decoding_matches_greedy_search(self, assistant_type): + pass + + @pytest.mark.generate + @unittest.skip(reason="PPChart2Table does not support assisted decoding.") + def test_assisted_decoding_sample(self): + pass + + @unittest.skip("PPChart2Table does not support this test.") + def test_model_is_small(self): + pass + + +@require_torch +@slow +class PPChart2TableIntegrationTest(unittest.TestCase): + def setUp(self): + self.processor = AutoProcessor.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table") + + def tearDown(self): + gc.collect() + backend_empty_cache(torch_device) + + def test_small_model_integration_test(self): + model = PPChart2TableForConditionalGeneration.from_pretrained( + "/workspace/model_weight_torch/PP-Chart2Table", dtype="float32" + ).to("cuda") + + image = Image.open( + "/workspace/PaddleX/paddlex/inference/models/doc_vlm/modeling/chart_parsing_02.png" + ).convert("RGB") + inputs = self.processor(images=image).to(model.device) + breakpoint() + expected_input_ids_length = 286 + assert expected_input_ids_length == len(inputs.input_ids[0]) + + expected_input_ids = [151644, 8948, 198, 2610, 1265, 1795, 279, 11221, 15516, 323] + + assert expected_input_ids == inputs.input_ids[0].tolist()[:10] + + expected_pixel_slice = torch.tensor( + [ + [1.0000, 1.0000, 1.0000], + [1.0000, 1.0000, 1.0000], + [0.9922, 0.9922, 0.9922], + [1.0000, 1.0000, 1.0000], + [1.0000, 1.0000, 1.0000], + ], + dtype=torch.float32, + device="cpu", + ) + + assert torch.allclose(expected_pixel_slice, inputs.pixel_values[:5, :, 0, 0], atol=3e-3) + + # verify generation + inputs = inputs.to(torch_device) + output = model.generate(**inputs, max_new_tokens=30) + result = self.processor.decode(output[0][inputs["input_ids"].shape[-1] : -1]) + + EXPECTED_DECODED_TEXT = "็”Ÿ็”˜่‰" + + self.assertEqual( + 
result, + EXPECTED_DECODED_TEXT, + ) + + # def test_small_model_integration_test_batch(self): + # model = ( + # PPChart2TableForConditionalGeneration.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table", dtype="bfloat16") + # .to(torch_device) + # .eval() + # ) + + # image = Image.open("/workspace/PaddleX/paddlex/inference/models/doc_vlm/modeling/chart_parsing_02.png").convert("RGB") + # inputs = self.processor(images=image).to(model.device) + + # output = model.generate(**inputs, max_new_tokens=256) + # generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, output)] + # result = self.processor.batch_decode( + # generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + # ) + + # EXPECTED_DECODED_TEXT = ["็”Ÿ็”˜่‰", "็”Ÿ็”˜่‰"] + + # self.assertEqual( + # result, + # EXPECTED_DECODED_TEXT, + # ) + + # @require_flash_attn + # @require_torch_accelerator + # @pytest.mark.flash_attn_test + # def test_small_model_integration_test_flashatt2(self): + # model = ( + # PPChart2TableForConditionalGeneration.from_pretrained( + # "/workspace/model_weight_torch/PP-Chart2Table", dtype="bfloat16", attn_implementation="flash_attention_2" + # ) + # .to(torch_device) + # .eval() + # ) + + # image = Image.open("/workspace/PaddleX/paddlex/inference/models/doc_vlm/modeling/chart_parsing_02.png").convert("RGB") + # inputs = self.processor(images=image).to(model.device) + + # expected_input_ids_length = 211 + # assert expected_input_ids_length == len(inputs.input_ids[0]) + + # expected_input_ids = [100273, 2969, 93963, 93919, 101305, 100295, 100295, 100295, 100295, 100295] # fmt: skip + # assert expected_input_ids == inputs.input_ids[0].tolist()[:10] + + # expected_pixel_slice = torch.tensor( + # [ + # [1.0000, 1.0000, 1.0000], + # [1.0000, 1.0000, 1.0000], + # [0.9922, 0.9922, 0.9922], + # [1.0000, 1.0000, 1.0000], + # [1.0000, 1.0000, 1.0000], + # ], + # dtype=torch.float32, + # device="cpu", + # ) + # assert 
torch.allclose(expected_pixel_slice, inputs.pixel_values[:5, :, 0, 0], atol=3e-3) + + # # verify generation + # inputs = inputs.to(torch_device) + # output = model.generate(**inputs, max_new_tokens=30) + # result = self.processor.decode(output[0][inputs["input_ids"].shape[-1] : -1]) + + # EXPECTED_DECODED_TEXT = "็”Ÿ็”˜่‰" + + # self.assertEqual( + # result, + # EXPECTED_DECODED_TEXT, + # ) + + # @require_flash_attn + # @require_torch_accelerator + # @pytest.mark.flash_attn_test + # def test_small_model_integration_test_batch_flashatt2(self): + # model = ( + # PPChart2TableForConditionalGeneration.from_pretrained( + # "/workspace/model_weight_torch/PP-Chart2Table", dtype="bfloat16", attn_implementation="flash_attention_2" + # ) + # .to(torch_device) + # .eval() + # ) + + # image = Image.open("/workspace/PaddleX/paddlex/inference/models/doc_vlm/modeling/chart_parsing_02.png").convert("RGB") + # inputs = self.processor(images=image).to(model.device) + + # # it should not matter whether two images are the same size or not + # output = model.generate(**inputs, max_new_tokens=30) + # generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, output)] + # result = self.processor.batch_decode( + # generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + # ) + + # EXPECTED_DECODED_TEXT = ["็”Ÿ็”˜่‰", "็”Ÿ็”˜่‰"] + + # self.assertEqual( + # result, + # EXPECTED_DECODED_TEXT, + # ) diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index 6b86f03e3927..0e4503e2877a 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -60,6 +60,7 @@ "expert_layer_period", ], "PaddleOCRTextConfig": ["tie_word_embeddings"], + "PPChart2TableConfig": ["tie_word_embeddings"], "Qwen2Config": ["use_sliding_window", "max_window_layers"], "Qwen2MoeConfig": ["use_sliding_window", "max_window_layers"], "Qwen2VLTextConfig": ["use_sliding_window", "max_window_layers"], diff 
--git a/utils/check_repo.py b/utils/check_repo.py index f36cda07dc51..866a2bd7965b 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -162,6 +162,9 @@ "PaddleOCRVisionModel", # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration. "PaddleOCRVisionTransformer", # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration. "PaddleOCRTextModel", # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration. + "PPChart2TableModel", # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration. + "PPChart2TableVisionModel", # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration. + "PPChart2TableTextModel", # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration. "Qwen2VLModel", # Building part of bigger (tested) model. Tested implicitly through Qwen2VLForConditionalGeneration. "Qwen2_5_VLModel", # Building part of bigger (tested) model. Tested implicitly through Qwen2_5_VLForConditionalGeneration. "Qwen3VLModel", # Building part of bigger (tested) model. Tested implicitly through Qwen3VLForConditionalGeneration. 
@@ -398,6 +401,9 @@ "PaddleOCRVisionModel", # Building part of bigger (tested) model "PaddleOCRVisionTransformer", # Building part of bigger (tested) model "PaddleOCRTextModel", # Building part of bigger (tested) model + "PPChart2TableModel", # Building part of bigger (tested) model + "PPChart2TableVisionModel", # Building part of bigger (tested) model + "PPChart2TableTextModel", # Building part of bigger (tested) model "Qwen2_5OmniTalkerForConditionalGeneration", # Building part of a bigger model "Qwen2_5OmniTalkerModel", # Building part of a bigger model "Qwen2_5OmniThinkerForConditionalGeneration", # Building part of a bigger model From 1a5908db8f529dbc324bd00f03ce7a3bd3291922 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Mon, 9 Feb 2026 14:12:50 +0800 Subject: [PATCH 02/60] fix doc --- docs/source/en/model_doc/pp_chart2table.md | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md index 5082c6f0adef..7ba3692f7f49 100644 --- a/docs/source/en/model_doc/pp_chart2table.md +++ b/docs/source/en/model_doc/pp_chart2table.md @@ -40,9 +40,15 @@ The example below demonstrates how to classify image with PP-Chart2Table using [ ```py from transformers import pipeline from PIL import Image -pipe = pipeline("image-text-to-text", model="PaddlePaddle/PP-Chart2Table_safetensors") - -result = pipe(images="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", do_sample=False, max_new_tokens=256) +model_path = "PaddlePaddle/PP-Chart2Table_safetensors" +pipe = pipeline("image-text-to-text", model=model_path) +image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw) +result = pipe( + images=image, + text="", + do_sample=False, + max_new_tokens=256 +) print(result) ``` @@ -83,9 +89,13 @@ from transformers import pipeline from PIL import Image 
model_path = "PaddlePaddle/PP-Chart2Table_safetensors" pipe = pipeline("image-text-to-text", model=model_path) - -image_path = "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png" -result = pipe(images=[image_path, image_path], do_sample=False, max_new_tokens=256) +image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw) +result = pipe( + images=[image, image], + text="", + do_sample=False, + max_new_tokens=256 +) print(result) ``` From c51b1c6f399a8bd07ff0a04846248768fd6cabeb Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Tue, 24 Feb 2026 15:56:54 +0800 Subject: [PATCH 03/60] update --- .../configuration_pp_chart2table.py | 24 +- .../pp_chart2table/modeling_pp_chart2table.py | 634 +++++++----------- .../pp_chart2table/modular_pp_chart2table.py | 481 ++----------- .../processing_pp_chart2table.py | 2 +- 4 files changed, 327 insertions(+), 814 deletions(-) diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py index 5e32dc30ef76..ee4ed2128161 100644 --- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py @@ -31,11 +31,11 @@ class PPChart2TableVisionConfig(PreTrainedConfig): Dimensionality of the patch embedding layer in the vision encoder. hidden_size (`int`, *optional*, defaults to 1024): Dimensionality of the hidden layers in the vision Transformer encoder. - img_size (`int`, *optional*, defaults to 1024): + image_size (`int`, *optional*, defaults to 1024): The size (resolution) of input chart images (assumed to be square). mlp_ratio (`float`, *optional*, defaults to 4.0): Ratio of the dimensionality of the feed-forward layer to the hidden size in the vision Transformer blocks. 
- num_heads (`int`, *optional*, defaults to 12): + num_attention_heads (`int`, *optional*, defaults to 12): Number of attention heads for each self-attention layer in the vision Transformer encoder. patch_size (`int`, *optional*, defaults to 16): The size (resolution) of each image patch extracted from the input chart image. @@ -47,7 +47,7 @@ class PPChart2TableVisionConfig(PreTrainedConfig): List of layer indexes where global attention (instead of windowed attention) is applied in the vision encoder. window_size (`int`, *optional*, defaults to 14): The size of the attention window for windowed self-attention in the vision Transformer layers. - out_chans (`int`, *optional*, defaults to 256): + output_channels (`int`, *optional*, defaults to 256): Number of output channels from the convolutional stem layer before patch embedding. Example: @@ -75,15 +75,17 @@ def __init__( depth: int = 12, embed_dim: int = 768, hidden_size: int = 1024, - img_size: int = 1024, + num_channels: int = 3, + image_size: int = 1024, mlp_ratio: float = 4.0, - num_heads: int = 12, + num_attention_heads: int = 12, patch_size: int = 16, qkv_bias: bool = True, use_rel_pos: bool = True, global_attn_indexes: Optional[list] = None, window_size: int = 14, - out_chans: int = 256, + output_channels: int = 256, + attention_dropout: float = 0.0, **kwargs, ): self.im_patch_token = im_patch_token @@ -92,15 +94,17 @@ def __init__( self.depth = depth self.embed_dim = embed_dim self.hidden_size = hidden_size - self.img_size = img_size + self.image_size = image_size + self.num_channels = num_channels self.mlp_ratio = mlp_ratio - self.num_heads = num_heads + self.num_attention_heads = num_attention_heads self.patch_size = patch_size self.qkv_bias = qkv_bias self.use_rel_pos = use_rel_pos self.global_attn_indexes = global_attn_indexes if global_attn_indexes is not None else [2, 5, 8, 11] self.window_size = window_size - self.out_chans = out_chans + self.output_channels = output_channels + self.attention_dropout 
= attention_dropout super().__init__(**kwargs) @@ -295,7 +299,7 @@ class PPChart2TableConfig(PreTrainedConfig): >>> configuration = PPChart2TableConfig() >>> # Initializing a PPChart2Table configuration with custom vision and text sub-configs - >>> vision_config = {"img_size": 512, "patch_size": 8} + >>> vision_config = {"image_size": 512, "patch_size": 8} >>> text_config = {"hidden_size": 2048, "num_hidden_layers": 16} >>> configuration = PPChart2TableConfig(vision_config=vision_config, text_config=text_config) diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py index 6d95acc7eea5..4b66b6b04afa 100644 --- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py @@ -4,6 +4,7 @@ # the file from the modular. If any change should be done, please apply the change to the # modular_pp_chart2table.py file directly. One of our CI enforces this. # ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ +import collections from collections.abc import Callable from dataclasses import dataclass from typing import Optional, Union @@ -34,398 +35,314 @@ class PPChart2TableVisionPatchEmbed(nn.Module): - r""" - Image to Patch Embedding layer for PP-Chart2Table vision encoder. + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ - This module converts raw chart images (HWC format) into flattened patch embeddings via a 2D convolution, - followed by dimension permutation to align with the vision transformer's input format. 
+ def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.embed_dim + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values): + batch_size, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." + ) + embeddings = self.projection(pixel_values).permute(0, 2, 3, 1) + return embeddings - Args: - kernel_size (`tuple[int, int]`, *optional*, defaults to `(16, 16)`): - Size of the convolution kernel (patch size) for splitting images into patches. - stride (`tuple[int, int]`, *optional*, defaults to `(16, 16)`): - Stride of the convolution operation (matches patch size for non-overlapping patches). - padding (`tuple[int, int]`, *optional*, defaults to `(0, 0)`): - Padding applied to the input image before convolution (ensures patch alignment). - in_chans (`int`, *optional*, defaults to 3): - Number of input channels (3 for RGB chart images). - embed_dim (`int`, *optional*, defaults to 768): - Dimensionality of the output patch embeddings (hidden size of the vision transformer). 
- - Shape: - - Input: `(B, C, H, W)` (batch size, channels, height, width) - - Output: `(B, H_out, W_out, C_out)` (batch size, patch height, patch width, embedding dim) - """ - def __init__( - self, - kernel_size: tuple[int, int] = (16, 16), - stride: tuple[int, int] = (16, 16), - padding: tuple[int, int] = (0, 0), - in_chans: int = 3, - embed_dim: int = 768, - ) -> None: +class PPChart2TableVisionMLPBlock(nn.Module): + def __init__(self, config) -> None: super().__init__() - self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding) + self.lin1 = nn.Linear(config.embed_dim, int(config.embed_dim * config.mlp_ratio)) + self.lin2 = nn.Linear(int(config.embed_dim * config.mlp_ratio), config.embed_dim) + self.act = ACT2FN[config.hidden_act] def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.proj(hidden_states) - hidden_states = hidden_states.permute(0, 2, 3, 1) + hidden_states = self.lin1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.lin2(hidden_states) return hidden_states -class PPChart2TableVisionMLPBlock(nn.Module): - r""" - Multi-Layer Perceptron (MLP) block for PP-Chart2Table vision transformer layers. - - Implements a two-layer feed-forward network with activation function, used in the vision transformer's - decoder layers to project features to a higher dimension and back. - - Args: - embedding_dim (`int`): - Dimensionality of the input/output embeddings (hidden size of the transformer layer). - mlp_dim (`int`): - Dimensionality of the intermediate (hidden) layer in the MLP (typically 4x embedding_dim). - act (`Type[nn.Module]`, *optional*, defaults to `torch.nn.GELU`): - Non-linear activation function to apply between the two linear layers. 
- - Shape: - - Input: `(B, H, W, embedding_dim)` or `(B, N, embedding_dim)` (N = H*W) - - Output: Same shape as input +class PPChart2TableVisionLayerNorm(nn.LayerNorm): + r"""LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height, + width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width). """ - def __init__( - self, - embedding_dim: int, - mlp_dim: int, - act: type[nn.Module] = torch.nn.GELU, - ) -> None: - super().__init__() - self.lin1 = nn.Linear(embedding_dim, mlp_dim) - self.lin2 = nn.Linear(mlp_dim, embedding_dim) - self.act = act() - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - return self.lin2(self.act(self.lin1(hidden_states))) - + def __init__(self, normalized_shape, *, eps=1e-6, data_format="channels_last", **kwargs): + super().__init__(normalized_shape, eps=eps, **kwargs) + if data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError(f"Unsupported data format: {data_format}") + self.data_format = data_format -class PPChart2TableVisionLayerNorm2d(nn.Module): - r""" - 2D Layer Normalization for spatial feature maps (adapted for PP-Chart2Table vision encoder). + def forward(self, features: torch.Tensor) -> torch.Tensor: + """ + Args: + features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels) + """ + if self.data_format == "channels_first": + features = features.permute(0, 2, 3, 1) + features = super().forward(features) + features = features.permute(0, 3, 1, 2) + else: + features = super().forward(features) + return features - Applies layer normalization over the channel dimension of 2D feature maps, with learnable scale/bias parameters - broadcasted across spatial dimensions (height/width). 
- Args: - num_channels (`int`): - Number of channels in the input feature map (embedding dimension). - epsilon (`float`, *optional*, defaults to `1e-06`): - Small value added to variance to avoid division by zero. - - Shape: - - Input: `(B, C, H, W)` (batch size, channels, height, width) - - Output: Same shape as input - """ +class PPChart2TableVisionAttention(nn.Module): + """Multi-head Attention block with relative position embeddings.""" - def __init__(self, num_channels: int, epsilon: float = 1e-06) -> None: + def __init__(self, config, window_size): super().__init__() - self.weight = nn.Parameter(torch.ones(num_channels)) - self.bias = nn.Parameter(torch.zeros(num_channels)) - self.epsilon = epsilon + input_size = ( + (config.image_size // config.patch_size, config.image_size // config.patch_size) + if window_size == 0 + else (window_size, window_size) + ) - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - u = hidden_states.mean(dim=1, keepdim=True) - s = (hidden_states - u).pow(2).mean(dim=1, keepdim=True) - hidden_states = (hidden_states - u) / torch.sqrt(s + self.epsilon) - hidden_states = self.weight[:, None, None] * hidden_states + self.bias[:, None, None] - return hidden_states + self.num_attention_heads = config.num_attention_heads + head_dim = config.embed_dim // config.num_attention_heads + self.scale = head_dim**-0.5 + self.dropout = config.attention_dropout + self.qkv = nn.Linear(config.embed_dim, config.embed_dim * 3, bias=config.qkv_bias) + self.proj = nn.Linear(config.embed_dim, config.embed_dim) + self.use_rel_pos = config.use_rel_pos + if self.use_rel_pos: + if input_size is None: + raise ValueError("Input size must be provided if using relative positional encoding.") -def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor: - r""" - Get relative positional embeddings for query and key sequences, with interpolation for mismatched sizes. 
+ # initialize relative positional embeddings + self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim)) + self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim)) - Args: - q_size (`int`): - Spatial size (height/width) of query feature map - k_size (`int`): - Spatial size (height/width) of key feature map - rel_pos (`torch.Tensor`): - Precomputed relative positional embeddings with shape [max_rel_dist_original, dim] + def get_rel_pos(self, q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor: + """ + Get relative positional embeddings according to the relative positions of + query and key sizes. - Returns: - `torch.Tensor`: - Interpolated relative positional embeddings for the query-key pair, shape [q_size, k_size, dim] - """ - max_rel_dist = int(2 * max(q_size, k_size) - 1) - if rel_pos.shape[0] != max_rel_dist: + Args: + q_size (int): + size of the query. + k_size (int): + size of key k. + rel_pos (`torch.Tensor`): + relative position embeddings (L, channel). + + Returns: + Extracted positional embeddings according to relative positions. + """ + max_rel_dist = int(2 * max(q_size, k_size) - 1) + # Interpolate rel pos. 
rel_pos_resized = F.interpolate( rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), size=max_rel_dist, mode="linear", ) rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) - else: - rel_pos_resized = rel_pos - - q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) - k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0) - relative_coords = q_coords - k_coords + (k_size - 1) * max(q_size / k_size, 1.0) - return rel_pos_resized[relative_coords.long()] - - -def add_decomposed_rel_pos( - attn: torch.Tensor, - q: torch.Tensor, - rel_pos_h: torch.Tensor, - rel_pos_w: torch.Tensor, - q_size: tuple[int, int], - k_size: tuple[int, int], -) -> torch.Tensor: - r""" - Add decomposed relative positional embeddings (height and width separately) to attention scores. - - Args: - attn (`torch.Tensor`): - Attention scores with shape [B, q_h*q_w, k_h*k_w] - q (`torch.Tensor`): - Query tensor with shape [B, q_h*q_w, dim] - rel_pos_h (`torch.Tensor`): - Precomputed relative positional embeddings for height dimension - rel_pos_w (`torch.Tensor`): - Precomputed relative positional embeddings for width dimension - q_size (`tuple[int, int]`): - Spatial size (q_h, q_w) of query feature map - k_size (`tuple[int, int]`): - Spatial size (k_h, k_w) of key feature map - - Returns: - `torch.Tensor`: - Attention scores with added relative positional embeddings, shape [B, q_h*q_w, k_h*k_w] - """ - q_h, q_w = q_size - k_h, k_w = k_size - Rh = get_rel_pos(q_h, k_h, rel_pos_h) - Rw = get_rel_pos(q_w, k_w, rel_pos_w) - B, _, dim = q.shape - r_q = q.reshape(B, q_h, q_w, dim) - rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh) - rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw) + # Scale the coords with short length if shapes for q and k are different. 
+ q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) + k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0) + relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) - attn = (attn.reshape(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]).reshape( - B, q_h * q_w, k_h * k_w - ) - - return attn + return rel_pos_resized[relative_coords.long()] + def get_decomposed_rel_pos( + self, + query: torch.Tensor, + rel_pos_h: torch.Tensor, + rel_pos_w: torch.Tensor, + q_size: tuple[int, int], + k_size: tuple[int, int], + ) -> torch.Tensor: + """ + Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. + https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py -class PPChart2TableVisionAttention(nn.Module): - r""" - Multi-Head Self-Attention (MHSA) layer for PP-Chart2Table vision encoder, with optional relative positional encoding. + Args: + query (`torch.Tensor`): + query q in the attention layer with shape (batch_size, query_height * query_width, channel). + rel_pos_h (`torch.Tensor`): + relative position embeddings (Lh, channel) for height axis. + rel_pos_w (`torch.Tensor`): + relative position embeddings (Lw, channel) for width axis. + q_size (tuple): + spatial sequence size of query q with (query_height, query_width). + k_size (tuple): + spatial sequence size of key k with (key_height, key_width). - Implements standard multi-head attention with query/key/value projection, scaled dot-product attention, - and optional decomposed relative positional embeddings (height/width separate) for spatial awareness. + Returns: + decomposed_rel_pos (`torch.Tensor`): + decomposed relative position embeddings. 
+ """ + query_height, query_width = q_size + key_height, key_width = k_size + relative_position_height = self.get_rel_pos(query_height, key_height, rel_pos_h) + relative_position_width = self.get_rel_pos(query_width, key_width, rel_pos_w) + + batch_size, _, dim = query.shape + reshaped_query = query.reshape(batch_size, query_height, query_width, dim) + rel_h = torch.einsum("bhwc,hkc->bhwk", reshaped_query, relative_position_height) + rel_w = torch.einsum("bhwc,wkc->bhwk", reshaped_query, relative_position_width) + + decomposed_rel_pos = rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :] + + return decomposed_rel_pos + + def forward(self, hidden_states: torch.Tensor, output_attentions=None) -> tuple[torch.Tensor, torch.Tensor]: + batch_size, height, width, _ = hidden_states.shape + # qkv with shape (3, batch_size, nHead, height * width, channel) + qkv = ( + self.qkv(hidden_states) + .reshape(batch_size, height * width, 3, self.num_attention_heads, -1) + .permute(2, 0, 3, 1, 4) + ) + # q, k, v with shape (batch_size * nHead, height * width, channel) + query, key, value = qkv.reshape(3, batch_size * self.num_attention_heads, height * width, -1).unbind(0) - Args: - dim (`int`): - Dimensionality of the input embeddings (hidden size of the transformer layer). - num_heads (`int`, *optional*, defaults to 8): - Number of attention heads (must divide `dim` evenly). - qkv_bias (`bool`, *optional*, defaults to `True`): - Whether to add bias terms to the query/key/value projection layers. - use_rel_pos (`bool`, *optional*, defaults to `False`): - Whether to use relative positional encoding for spatial attention. - rel_pos_zero_init (`bool`, *optional*, defaults to `True`): - Whether to initialize relative positional embeddings to zero (stable training). - input_size (`Tuple[int, int]`, *optional*): - Spatial size (H, W) of the input feature map (required if `use_rel_pos=True`). 
- - Shape: - - Input: `(B, H, W, dim)` (batch size, height, width, embedding dim) - - Output: Same shape as input - """ + attn_weights = (query * self.scale) @ key.transpose(-2, -1) - def __init__( - self, - dim: int, - num_heads: int = 8, - qkv_bias: bool = True, - use_rel_pos: bool = False, - rel_pos_zero_init: bool = True, - input_size: Optional[tuple[int, int]] = None, - ) -> None: - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = head_dim**-0.5 + if self.use_rel_pos: + decomposed_rel_pos = self.get_decomposed_rel_pos( + query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width) + ) + decomposed_rel_pos = decomposed_rel_pos.reshape_as(attn_weights) + attn_weights = attn_weights + decomposed_rel_pos - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.proj = nn.Linear(dim, dim) + attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype) - self.use_rel_pos = use_rel_pos - if self.use_rel_pos: - assert input_size is not None, "Input size must be provided if using relative positional encoding." 
- self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim)) - self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim)) + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - B, H, W, _ = hidden_states.shape - qkv = self.qkv(hidden_states).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) - q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(dim=0) - attn = (q * self.scale) @ k.transpose(1, 2) + attn_output = (attn_probs @ value).reshape(batch_size, self.num_attention_heads, height, width, -1) + attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, height, width, -1) - if self.use_rel_pos: - attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W)) + attn_output = self.proj(attn_output) + return attn_output, attn_weights - attn = F.softmax(attn, dim=-1) - hidden_states = (attn @ v).reshape(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1) - hidden_states = self.proj(hidden_states) - return hidden_states +class PPChart2TableVisionDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config, window_size) -> None: + super().__init__() + self.layer_norm1 = nn.LayerNorm(config.embed_dim) + self.attn = PPChart2TableVisionAttention(config, window_size=window_size) -def window_partition(hidden_states: torch.Tensor, window_size: int) -> tuple[torch.Tensor, tuple[int, int]]: - r""" - Partition 2D feature maps into non-overlapping windows, with padding to ensure dimensions are divisible by window size. 
+ self.layer_norm2 = nn.LayerNorm(config.embed_dim) + self.mlp = PPChart2TableVisionMLPBlock(config) + self.window_size = window_size - Args: - hidden_states (`torch.Tensor`): - Input feature map with shape [B, H, W, C], where: - - B: batch size - - H: height of feature map - - W: width of feature map - - C: channel dimension - window_size (`int`): - Size of each non-overlapping window (square window). + def window_partition(self, hidden_states: torch.Tensor, window_size: int) -> tuple[torch.Tensor, tuple[int, int]]: + """ + Args: + Partition into non-overlapping windows with padding if needed. + hidden_states (tensor): input tokens with [batch_size, height, width, channel]. window_size (int): window + size. - Returns: - tuple[torch.Tensor, tuple[int, int]]: - - windows: Partitioned windows with shape [num_windows * B, window_size, window_size, C], - where num_windows = (Hp // window_size) * (Wp // window_size) - - (Hp, Wp): Padded height and width of the feature map (after padding) - """ - B, H, W, C = hidden_states.shape + Returns: + windows: windows after partition with [batch_size * num_windows, window_size, window_size, channel]. 
+ (pad_height, pad_width): padded height and width before partition + """ + batch_size, height, width, channel = hidden_states.shape - pad_h = (window_size - H % window_size) % window_size - pad_w = (window_size - W % window_size) % window_size - if pad_h > 0 or pad_w > 0: + pad_h = (window_size - height % window_size) % window_size + pad_w = (window_size - width % window_size) % window_size hidden_states = F.pad(hidden_states, (0, 0, 0, pad_w, 0, pad_h)) - Hp, Wp = H + pad_h, W + pad_w - - hidden_states = hidden_states.reshape(B, Hp // window_size, window_size, Wp // window_size, window_size, C) - windows = hidden_states.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size, window_size, C) - return windows, (Hp, Wp) + pad_height, pad_width = height + pad_h, width + pad_w + hidden_states = hidden_states.reshape( + batch_size, pad_height // window_size, window_size, pad_width // window_size, window_size, channel + ) + windows = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().reshape(-1, window_size, window_size, channel) + return windows, (pad_height, pad_width) -def window_unpartition( - windows: torch.Tensor, - window_size: int, - pad_hw: tuple[int, int], - hw: tuple[int, int], -) -> torch.Tensor: - r""" - Reverse operation of window_partition: merge windows back to original 2D feature map shape, removing padding. + def window_unpartition( + self, windows: torch.Tensor, window_size: int, padding_shape: tuple[int, int], original_shape: tuple[int, int] + ) -> torch.Tensor: + """ + Args: + Window unpartition into original sequences and removing padding. + hidden_states (tensor): + input tokens with [batch_size * num_windows, window_size, window_size, channel]. + window_size (int): + window size. + padding_shape (Tuple): + padded height and width (pad_height, pad_width). + original_shape (Tuple): original height and width (height, width) before padding. 
- Args: - windows (`torch.Tensor`): - Partitioned windows with shape [num_windows * B, window_size, window_size, C] - window_size (`int`): - Size of each non-overlapping window (must match window_partition's window_size) - pad_hw (`tuple[int, int]`): - Padded height and width (Hp, Wp) returned by window_partition - hw (`tuple[int, int]`): - Original height and width (H, W) of feature map before padding + Returns: + hidden_states: unpartitioned sequences with [batch_size, height, width, channel]. + """ + pad_height, pad_width = padding_shape + height, width = original_shape + batch_size = windows.shape[0] // (pad_height * pad_width // window_size // window_size) + hidden_states = windows.reshape( + batch_size, pad_height // window_size, pad_width // window_size, window_size, window_size, -1 + ) + hidden_states = ( + hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().reshape(batch_size, pad_height, pad_width, -1) + ) - Returns: - `torch.Tensor`: - Reconstructed feature map with shape [B, H, W, C] (original dimensions before padding) - """ - Hp, Wp = pad_hw - H, W = hw - B = windows.shape[0] // (Hp * Wp // window_size // window_size) - hidden_states = windows.reshape(B, Hp // window_size, Wp // window_size, window_size, window_size, -1) - hidden_states = hidden_states.permute(0, 1, 3, 2, 4, 5).reshape(B, Hp, Wp, -1) - if Hp > H or Wp > W: - hidden_states = hidden_states[:, :H, :W, :] - return hidden_states + hidden_states = hidden_states[:, :height, :width, :].contiguous() + return hidden_states + def forward(self, hidden_states: torch.Tensor) -> tuple[torch.FloatTensor]: + residual = hidden_states + hidden_states = self.layer_norm1(hidden_states) + # Window partition + if self.window_size > 0: + height, width = hidden_states.shape[1], hidden_states.shape[2] + hidden_states, padding_shape = self.window_partition(hidden_states, self.window_size) -class PPChart2TableVisionDecoderLayer(nn.Module): - r""" - Single decoder layer of the PP-Chart2Table vision transformer, 
with optional windowed attention. + hidden_states, attn_weights = self.attn( + hidden_states=hidden_states, + ) + # Reverse window partition + if self.window_size > 0: + hidden_states = self.window_unpartition(hidden_states, self.window_size, padding_shape, (height, width)) - Implements the standard transformer decoder layer structure: - Layer Norm → Multi-Head Attention (with residual) → Layer Norm → MLP (with residual) - Supports windowed attention (SW-MHA) for large feature maps to reduce computation. + hidden_states = residual + hidden_states + layernorm_output = self.layer_norm2(hidden_states) + hidden_states = hidden_states + self.mlp(layernorm_output) + return hidden_states - Args: - dim (`int`): - Dimensionality of the input embeddings (hidden size of the transformer layer). - num_heads (`int`): - Number of attention heads (passed to MHSA layer). - mlp_ratio (`float`, *optional*, defaults to 4.0): - Ratio of MLP hidden dimension to embedding dimension (mlp_dim = dim * mlp_ratio). - qkv_bias (`bool`, *optional*, defaults to `True`): - Whether to use bias in Q/K/V projection (passed to MHSA layer). - norm_layer (`Type[nn.Module]`, *optional*, defaults to `nn.LayerNorm`): - Normalization layer to use (LayerNorm for flattened patches, LayerNorm2d for 2D feature maps). - act_layer (`Type[nn.Module]`, *optional*, defaults to `nn.GELU`): - Activation function for MLP block. - use_rel_pos (`bool`, *optional*, defaults to `False`): - Whether to use relative positional encoding (passed to MHSA layer). - rel_pos_zero_init (`bool`, *optional*, defaults to `True`): - Whether to zero-initialize relative positional embeddings (passed to MHSA layer). - window_size (`int`, *optional*, defaults to 0): - Size of attention windows (0 = full attention, >0 = windowed attention). - input_size (`Tuple[int, int]`, *optional*): - Spatial size of input feature map (passed to MHSA layer for relative positional encoding).
- - Shape: - - Input: `(B, H, W, dim)` (batch size, height, width, embedding dim) - - Output: Same shape as input - """ - def __init__( - self, - dim: int, - num_heads: int, - mlp_ratio: float = 4.0, - qkv_bias: bool = True, - norm_layer: type[nn.Module] = nn.LayerNorm, - act_layer: type[nn.Module] = nn.GELU, - use_rel_pos: bool = False, - rel_pos_zero_init: bool = True, - window_size: int = 0, - input_size: Optional[tuple[int, int]] = None, - ) -> None: +class PPChart2TableVisionNeck(nn.Module): + def __init__(self, config: PPChart2TableVisionConfig): super().__init__() - self.norm1 = norm_layer(dim) - self.attn = PPChart2TableVisionAttention( - dim, - num_heads=num_heads, - qkv_bias=qkv_bias, - use_rel_pos=use_rel_pos, - rel_pos_zero_init=rel_pos_zero_init, - input_size=input_size if window_size == 0 else (window_size, window_size), - ) - - self.norm2 = norm_layer(dim) - self.mlp = PPChart2TableVisionMLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer) + self.config = config - self.window_size = window_size + self.conv1 = nn.Conv2d(config.embed_dim, config.output_channels, kernel_size=1, bias=False) + self.layer_norm1 = PPChart2TableVisionLayerNorm(config.output_channels, data_format="channels_first") + self.conv2 = nn.Conv2d(config.output_channels, config.output_channels, kernel_size=3, padding=1, bias=False) + self.layer_norm2 = PPChart2TableVisionLayerNorm(config.output_channels, data_format="channels_first") - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - shortcut = hidden_states - hidden_states = self.norm1(hidden_states) - if self.window_size > 0: - H, W = hidden_states.shape[1], hidden_states.shape[2] - hidden_states, pad_hw = window_partition(hidden_states, self.window_size) - hidden_states = self.attn(hidden_states) + def forward(self, hidden_states): + hidden_states = hidden_states.permute(0, 3, 1, 2) + hidden_states = self.conv1(hidden_states) + hidden_states = self.layer_norm1(hidden_states) - if 
self.window_size > 0: - hidden_states = window_unpartition(hidden_states, self.window_size, pad_hw, (H, W)) - hidden_states = shortcut + hidden_states - hidden_states = hidden_states + self.mlp(self.norm2(hidden_states)) + hidden_states = self.conv2(hidden_states) + hidden_states = self.layer_norm2(hidden_states) return hidden_states @@ -483,60 +400,27 @@ class PPChart2TableVisionModel(PPChart2TableVisionPreTrainedModel): def __init__( self, config: PPChart2TableVisionConfig, - in_chans: int = 3, - norm_layer: type[nn.Module] = nn.LayerNorm, - act_layer: type[nn.Module] = nn.GELU, - rel_pos_zero_init: bool = True, ) -> None: super().__init__(config) - self.img_size = config.img_size + self.image_size = config.image_size - self.patch_embed = PPChart2TableVisionPatchEmbed( - kernel_size=(config.patch_size, config.patch_size), - stride=(config.patch_size, config.patch_size), - in_chans=in_chans, - embed_dim=config.embed_dim, - ) + self.patch_embed = PPChart2TableVisionPatchEmbed(config) self.pos_embed = nn.Parameter( torch.zeros( - 1, config.img_size // config.patch_size, config.img_size // config.patch_size, config.embed_dim + 1, config.image_size // config.patch_size, config.image_size // config.patch_size, config.embed_dim ) ) self.blocks = nn.ModuleList() for i in range(config.depth): block = PPChart2TableVisionDecoderLayer( - dim=config.embed_dim, - num_heads=config.num_heads, - mlp_ratio=config.mlp_ratio, - qkv_bias=config.qkv_bias, - norm_layer=norm_layer, - act_layer=act_layer, - use_rel_pos=config.use_rel_pos, - rel_pos_zero_init=rel_pos_zero_init, + config, window_size=config.window_size if i not in config.global_attn_indexes else 0, - input_size=(config.img_size // config.patch_size, config.img_size // config.patch_size), ) self.blocks.append(block) - self.neck = nn.Sequential( - nn.Conv2d( - config.embed_dim, - config.out_chans, - kernel_size=1, - bias=False, - ), - PPChart2TableVisionLayerNorm2d(config.out_chans), - nn.Conv2d( - config.out_chans, - 
config.out_chans, - kernel_size=3, - padding=1, - bias=False, - ), - PPChart2TableVisionLayerNorm2d(config.out_chans), - ) + self.neck = PPChart2TableVisionNeck(config) self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False) self.net_3 = nn.Conv2d(512, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False) @@ -546,9 +430,9 @@ def __init__( def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.patch_embed(hidden_states) hidden_states = hidden_states + self.pos_embed - for blk in self.blocks: - hidden_states = blk(hidden_states) - hidden_states = self.neck(hidden_states.permute(0, 3, 1, 2)) + for block in self.blocks: + hidden_states = block(hidden_states) + hidden_states = self.neck(hidden_states) hidden_states = self.net_2(hidden_states) hidden_states = self.net_3(hidden_states) return hidden_states diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index fdf6e07fe0cd..e1bd61a0719c 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -27,6 +27,7 @@ from transformers.modeling_rope_utils import RopeParameters from transformers.modeling_utils import PreTrainedModel from transformers.models.qwen2.modeling_qwen2 import Qwen2Attention, Qwen2DecoderLayer, Qwen2Model, Qwen2PreTrainedModel +from transformers.models.got_ocr2.modeling_got_ocr2 import GotOcr2VisionNeck, GotOcr2MLPBlock, GotOcr2LayerNorm, GotOcr2PatchEmbeddings, GotOcr2VisionAttention, GotOcr2VisionLayer from transformers.processing_utils import ProcessorMixin, TensorType from transformers.utils import ( can_return_tuple, @@ -55,11 +56,11 @@ class PPChart2TableVisionConfig(PreTrainedConfig): Dimensionality of the patch embedding layer in the vision encoder. 
hidden_size (`int`, *optional*, defaults to 1024): Dimensionality of the hidden layers in the vision Transformer encoder. - img_size (`int`, *optional*, defaults to 1024): + image_size (`int`, *optional*, defaults to 1024): The size (resolution) of input chart images (assumed to be square). mlp_ratio (`float`, *optional*, defaults to 4.0): Ratio of the dimensionality of the feed-forward layer to the hidden size in the vision Transformer blocks. - num_heads (`int`, *optional*, defaults to 12): + num_attention_heads (`int`, *optional*, defaults to 12): Number of attention heads for each self-attention layer in the vision Transformer encoder. patch_size (`int`, *optional*, defaults to 16): The size (resolution) of each image patch extracted from the input chart image. @@ -71,7 +72,7 @@ class PPChart2TableVisionConfig(PreTrainedConfig): List of layer indexes where global attention (instead of windowed attention) is applied in the vision encoder. window_size (`int`, *optional*, defaults to 14): The size of the attention window for windowed self-attention in the vision Transformer layers. - out_chans (`int`, *optional*, defaults to 256): + output_channels (`int`, *optional*, defaults to 256): Number of output channels from the convolutional stem layer before patch embedding. 
Example: @@ -99,15 +100,17 @@ def __init__( depth: int = 12, embed_dim: int = 768, hidden_size: int = 1024, - img_size: int = 1024, + num_channels: int = 3, + image_size: int = 1024, mlp_ratio: float = 4.0, - num_heads: int = 12, + num_attention_heads: int = 12, patch_size: int = 16, qkv_bias: bool = True, use_rel_pos: bool = True, global_attn_indexes: Optional[list] = None, window_size: int = 14, - out_chans: int = 256, + output_channels: int = 256, + attention_dropout: float = 0.0, **kwargs, ): self.im_patch_token = im_patch_token @@ -116,15 +119,17 @@ def __init__( self.depth = depth self.embed_dim = embed_dim self.hidden_size = hidden_size - self.img_size = img_size + self.image_size = image_size + self.num_channels = num_channels self.mlp_ratio = mlp_ratio - self.num_heads = num_heads + self.num_attention_heads = num_attention_heads self.patch_size = patch_size self.qkv_bias = qkv_bias self.use_rel_pos = use_rel_pos self.global_attn_indexes = global_attn_indexes if global_attn_indexes is not None else [2, 5, 8, 11] self.window_size = window_size - self.out_chans = out_chans + self.output_channels = output_channels + self.attention_dropout = attention_dropout super().__init__(**kwargs) @@ -319,7 +324,7 @@ class PPChart2TableConfig(PreTrainedConfig): >>> configuration = PPChart2TableConfig() >>> # Initializing a PPChart2Table configuration with custom vision and text sub-configs - >>> vision_config = {"img_size": 512, "patch_size": 8} + >>> vision_config = {"image_size": 512, "patch_size": 8} >>> text_config = {"hidden_size": 2048, "num_hidden_layers": 16} >>> configuration = PPChart2TableConfig(vision_config=vision_config, text_config=text_config) @@ -627,7 +632,7 @@ def __call__( else: image_inputs = {} img_cnt = len(image_inputs) - b, c, h, w = image_inputs["pixel_values"].shape + _, _, h, _ = image_inputs["pixel_values"].shape num_patches = h // self.image_processor.patch_size // self.image_processor.merge_size prompt = ( "<|im_start|>system\n" @@ -648,400 
+653,53 @@ def postprocess(self, model_pred, **kwargs): ) -def window_partition(hidden_states: torch.Tensor, window_size: int) -> tuple[torch.Tensor, tuple[int, int]]: - r""" - Partition 2D feature maps into non-overlapping windows, with padding to ensure dimensions are divisible by window size. - - Args: - hidden_states (`torch.Tensor`): - Input feature map with shape [B, H, W, C], where: - - B: batch size - - H: height of feature map - - W: width of feature map - - C: channel dimension - window_size (`int`): - Size of each non-overlapping window (square window). - - Returns: - tuple[torch.Tensor, tuple[int, int]]: - - windows: Partitioned windows with shape [num_windows * B, window_size, window_size, C], - where num_windows = (Hp // window_size) * (Wp // window_size) - - (Hp, Wp): Padded height and width of the feature map (after padding) - """ - B, H, W, C = hidden_states.shape - - pad_h = (window_size - H % window_size) % window_size - pad_w = (window_size - W % window_size) % window_size - if pad_h > 0 or pad_w > 0: - hidden_states = F.pad(hidden_states, (0, 0, 0, pad_w, 0, pad_h)) - Hp, Wp = H + pad_h, W + pad_w - - hidden_states = hidden_states.reshape(B, Hp // window_size, window_size, Wp // window_size, window_size, C) - windows = hidden_states.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size, window_size, C) - return windows, (Hp, Wp) - - -def window_unpartition( - windows: torch.Tensor, - window_size: int, - pad_hw: tuple[int, int], - hw: tuple[int, int], -) -> torch.Tensor: - r""" - Reverse operation of window_partition: merge windows back to original 2D feature map shape, removing padding. 
- - Args: - windows (`torch.Tensor`): - Partitioned windows with shape [num_windows * B, window_size, window_size, C] - window_size (`int`): - Size of each non-overlapping window (must match window_partition's window_size) - pad_hw (`tuple[int, int]`): - Padded height and width (Hp, Wp) returned by window_partition - hw (`tuple[int, int]`): - Original height and width (H, W) of feature map before padding - - Returns: - `torch.Tensor`: - Reconstructed feature map with shape [B, H, W, C] (original dimensions before padding) - """ - Hp, Wp = pad_hw - H, W = hw - B = windows.shape[0] // (Hp * Wp // window_size // window_size) - hidden_states = windows.reshape(B, Hp // window_size, Wp // window_size, window_size, window_size, -1) - hidden_states = hidden_states.permute(0, 1, 3, 2, 4, 5).reshape(B, Hp, Wp, -1) - if Hp > H or Wp > W: - hidden_states = hidden_states[:, :H, :W, :] - return hidden_states - - -def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor: - r""" - Get relative positional embeddings for query and key sequences, with interpolation for mismatched sizes. 
- - Args: - q_size (`int`): - Spatial size (height/width) of query feature map - k_size (`int`): - Spatial size (height/width) of key feature map - rel_pos (`torch.Tensor`): - Precomputed relative positional embeddings with shape [max_rel_dist_original, dim] - - Returns: - `torch.Tensor`: - Interpolated relative positional embeddings for the query-key pair, shape [q_size, k_size, dim] - """ - max_rel_dist = int(2 * max(q_size, k_size) - 1) - if rel_pos.shape[0] != max_rel_dist: - rel_pos_resized = F.interpolate( - rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), - size=max_rel_dist, - mode="linear", - ) - rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) - else: - rel_pos_resized = rel_pos - - q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) - k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0) - relative_coords = q_coords - k_coords + (k_size - 1) * max(q_size / k_size, 1.0) - return rel_pos_resized[relative_coords.long()] - - -def add_decomposed_rel_pos( - attn: torch.Tensor, - q: torch.Tensor, - rel_pos_h: torch.Tensor, - rel_pos_w: torch.Tensor, - q_size: tuple[int, int], - k_size: tuple[int, int], -) -> torch.Tensor: - r""" - Add decomposed relative positional embeddings (height and width separately) to attention scores. 
- - Args: - attn (`torch.Tensor`): - Attention scores with shape [B, q_h*q_w, k_h*k_w] - q (`torch.Tensor`): - Query tensor with shape [B, q_h*q_w, dim] - rel_pos_h (`torch.Tensor`): - Precomputed relative positional embeddings for height dimension - rel_pos_w (`torch.Tensor`): - Precomputed relative positional embeddings for width dimension - q_size (`tuple[int, int]`): - Spatial size (q_h, q_w) of query feature map - k_size (`tuple[int, int]`): - Spatial size (k_h, k_w) of key feature map - - Returns: - `torch.Tensor`: - Attention scores with added relative positional embeddings, shape [B, q_h*q_w, k_h*k_w] - """ - q_h, q_w = q_size - k_h, k_w = k_size - Rh = get_rel_pos(q_h, k_h, rel_pos_h) - Rw = get_rel_pos(q_w, k_w, rel_pos_w) - - B, _, dim = q.shape - r_q = q.reshape(B, q_h, q_w, dim) - rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh) - rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw) - - attn = (attn.reshape(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]).reshape( - B, q_h * q_w, k_h * k_w - ) - - return attn - - -class PPChart2TableVisionPatchEmbed(nn.Module): - r""" - Image to Patch Embedding layer for PP-Chart2Table vision encoder. - - This module converts raw chart images (HWC format) into flattened patch embeddings via a 2D convolution, - followed by dimension permutation to align with the vision transformer's input format. - - Args: - kernel_size (`tuple[int, int]`, *optional*, defaults to `(16, 16)`): - Size of the convolution kernel (patch size) for splitting images into patches. - stride (`tuple[int, int]`, *optional*, defaults to `(16, 16)`): - Stride of the convolution operation (matches patch size for non-overlapping patches). - padding (`tuple[int, int]`, *optional*, defaults to `(0, 0)`): - Padding applied to the input image before convolution (ensures patch alignment). - in_chans (`int`, *optional*, defaults to 3): - Number of input channels (3 for RGB chart images). 
- embed_dim (`int`, *optional*, defaults to 768): - Dimensionality of the output patch embeddings (hidden size of the vision transformer). - - Shape: - - Input: `(B, C, H, W)` (batch size, channels, height, width) - - Output: `(B, H_out, W_out, C_out)` (batch size, patch height, patch width, embedding dim) - """ - - def __init__( - self, - kernel_size: tuple[int, int] = (16, 16), - stride: tuple[int, int] = (16, 16), - padding: tuple[int, int] = (0, 0), - in_chans: int = 3, - embed_dim: int = 768, - ) -> None: +class PPChart2TableVisionPatchEmbed(GotOcr2PatchEmbeddings): + def __init__(self, config): super().__init__() - self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.proj(hidden_states) - hidden_states = hidden_states.permute(0, 2, 3, 1) - return hidden_states + num_channels, hidden_size = config.num_channels, config.embed_dim -class PPChart2TableVisionMLPBlock(nn.Module): - r""" - Multi-Layer Perceptron (MLP) block for PP-Chart2Table vision transformer layers. - - Implements a two-layer feed-forward network with activation function, used in the vision transformer's - decoder layers to project features to a higher dimension and back. - - Args: - embedding_dim (`int`): - Dimensionality of the input/output embeddings (hidden size of the transformer layer). - mlp_dim (`int`): - Dimensionality of the intermediate (hidden) layer in the MLP (typically 4x embedding_dim). - act (`Type[nn.Module]`, *optional*, defaults to `torch.nn.GELU`): - Non-linear activation function to apply between the two linear layers. 
- - Shape: - - Input: `(B, H, W, embedding_dim)` or `(B, N, embedding_dim)` (N = H*W) - - Output: Same shape as input - """ - - def __init__( - self, - embedding_dim: int, - mlp_dim: int, - act: type[nn.Module] = torch.nn.GELU, - ) -> None: +class PPChart2TableVisionMLPBlock(GotOcr2MLPBlock): + def __init__(self, config) -> None: super().__init__() - self.lin1 = nn.Linear(embedding_dim, mlp_dim) - self.lin2 = nn.Linear(mlp_dim, embedding_dim) - self.act = act() + self.lin1 = nn.Linear(config.embed_dim, int(config.embed_dim * config.mlp_ratio)) + self.lin2 = nn.Linear(int(config.embed_dim * config.mlp_ratio), config.embed_dim) - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - return self.lin2(self.act(self.lin1(hidden_states))) +class PPChart2TableVisionLayerNorm(GotOcr2LayerNorm): + pass -class PPChart2TableVisionLayerNorm2d(nn.Module): - r""" - 2D Layer Normalization for spatial feature maps (adapted for PP-Chart2Table vision encoder). - Applies layer normalization over the channel dimension of 2D feature maps, with learnable scale/bias parameters - broadcasted across spatial dimensions (height/width). +class PPChart2TableVisionAttention(GotOcr2VisionAttention): + """Multi-head Attention block with relative position embeddings.""" - Args: - num_channels (`int`): - Number of channels in the input feature map (embedding dimension). - epsilon (`float`, *optional*, defaults to `1e-06`): - Small value added to variance to avoid division by zero. 
- - Shape: - - Input: `(B, C, H, W)` (batch size, channels, height, width) - - Output: Same shape as input - """ - - def __init__(self, num_channels: int, epsilon: float = 1e-06) -> None: + def __init__(self, config, window_size): super().__init__() - self.weight = nn.Parameter(torch.ones(num_channels)) - self.bias = nn.Parameter(torch.zeros(num_channels)) - self.epsilon = epsilon - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - u = hidden_states.mean(dim=1, keepdim=True) - s = (hidden_states - u).pow(2).mean(dim=1, keepdim=True) - hidden_states = (hidden_states - u) / torch.sqrt(s + self.epsilon) - hidden_states = self.weight[:, None, None] * hidden_states + self.bias[:, None, None] - return hidden_states - + head_dim = config.embed_dim // config.num_attention_heads + self.qkv = nn.Linear(config.embed_dim, config.embed_dim * 3, bias=config.qkv_bias) + self.proj = nn.Linear(config.embed_dim, config.embed_dim) -class PPChart2TableVisionAttention(nn.Module): - r""" - Multi-Head Self-Attention (MHSA) layer for PP-Chart2Table vision encoder, with optional relative positional encoding. - - Implements standard multi-head attention with query/key/value projection, scaled dot-product attention, - and optional decomposed relative positional embeddings (height/width separate) for spatial awareness. - Args: - dim (`int`): - Dimensionality of the input embeddings (hidden size of the transformer layer). - num_heads (`int`, *optional*, defaults to 8): - Number of attention heads (must divide `dim` evenly). - qkv_bias (`bool`, *optional*, defaults to `True`): - Whether to add bias terms to the query/key/value projection layers. - use_rel_pos (`bool`, *optional*, defaults to `False`): - Whether to use relative positional encoding for spatial attention. - rel_pos_zero_init (`bool`, *optional*, defaults to `True`): - Whether to initialize relative positional embeddings to zero (stable training). 
- input_size (`Tuple[int, int]`, *optional*): - Spatial size (H, W) of the input feature map (required if `use_rel_pos=True`). - - Shape: - - Input: `(B, H, W, dim)` (batch size, height, width, embedding dim) - - Output: Same shape as input - """ - - def __init__( - self, - dim: int, - num_heads: int = 8, - qkv_bias: bool = True, - use_rel_pos: bool = False, - rel_pos_zero_init: bool = True, - input_size: Optional[tuple[int, int]] = None, - ) -> None: +class PPChart2TableVisionDecoderLayer(GotOcr2VisionLayer): + def __init__(self, config, window_size) -> None: super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = head_dim**-0.5 - - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.proj = nn.Linear(dim, dim) - - self.use_rel_pos = use_rel_pos - if self.use_rel_pos: - assert input_size is not None, "Input size must be provided if using relative positional encoding." - self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim)) - self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim)) + self.layer_norm1 = nn.LayerNorm(config.embed_dim) + self.attn = PPChart2TableVisionAttention(config, window_size=window_size) - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - B, H, W, _ = hidden_states.shape - qkv = self.qkv(hidden_states).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) - q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(dim=0) - attn = (q * self.scale) @ k.transpose(1, 2) - - if self.use_rel_pos: - attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W)) - - attn = F.softmax(attn, dim=-1) - hidden_states = (attn @ v).reshape(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1) - hidden_states = self.proj(hidden_states) - return hidden_states - - -class PPChart2TableVisionDecoderLayer(nn.Module): - r""" - Single decoder layer of the PP-Chart2Table vision transformer, with optional windowed 
attention. - - Implements the standard transformer decoder layer structure: - Layer Norm → Multi-Head Attention (with residual) → Layer Norm → MLP (with residual) - Supports windowed attention (SW-MHA) for large feature maps to reduce computation. + self.layer_norm2 = nn.LayerNorm(config.embed_dim) + self.mlp = PPChart2TableVisionMLPBlock(config) + self.window_size = window_size - Args: - dim (`int`): - Dimensionality of the input embeddings (hidden size of the transformer layer). - num_heads (`int`): - Number of attention heads (passed to MHSA layer). - mlp_ratio (`float`, *optional*, defaults to 4.0): - Ratio of MLP hidden dimension to embedding dimension (mlp_dim = dim * mlp_ratio). - qkv_bias (`bool`, *optional*, defaults to `True`): - Whether to use bias in Q/K/V projection (passed to MHSA layer). - norm_layer (`Type[nn.Module]`, *optional*, defaults to `nn.LayerNorm`): - Normalization layer to use (LayerNorm for flattened patches, LayerNorm2d for 2D feature maps). - act_layer (`Type[nn.Module]`, *optional*, defaults to `nn.GELU`): - Activation function for MLP block. - use_rel_pos (`bool`, *optional*, defaults to `False`): - Whether to use relative positional encoding (passed to MHSA layer). - rel_pos_zero_init (`bool`, *optional*, defaults to `True`): - Whether to zero-initialize relative positional embeddings (passed to MHSA layer). - window_size (`int`, *optional*, defaults to 0): - Size of attention windows (0 = full attention, >0 = windowed attention). - input_size (`Tuple[int, int]`, *optional*): - Spatial size of input feature map (passed to MHSA layer for relative positional encoding).
- - Shape: - - Input: `(B, H, W, dim)` (batch size, height, width, embedding dim) - - Output: Same shape as input - """ - def __init__( - self, - dim: int, - num_heads: int, - mlp_ratio: float = 4.0, - qkv_bias: bool = True, - norm_layer: type[nn.Module] = nn.LayerNorm, - act_layer: type[nn.Module] = nn.GELU, - use_rel_pos: bool = False, - rel_pos_zero_init: bool = True, - window_size: int = 0, - input_size: Optional[tuple[int, int]] = None, - ) -> None: +class PPChart2TableVisionNeck(GotOcr2VisionNeck): + def __init__(self, config: PPChart2TableVisionConfig): super().__init__() - self.norm1 = norm_layer(dim) - self.attn = PPChart2TableVisionAttention( - dim, - num_heads=num_heads, - qkv_bias=qkv_bias, - use_rel_pos=use_rel_pos, - rel_pos_zero_init=rel_pos_zero_init, - input_size=input_size if window_size == 0 else (window_size, window_size), - ) + self.config = config - self.norm2 = norm_layer(dim) - self.mlp = PPChart2TableVisionMLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer) - - self.window_size = window_size - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - shortcut = hidden_states - hidden_states = self.norm1(hidden_states) - if self.window_size > 0: - H, W = hidden_states.shape[1], hidden_states.shape[2] - hidden_states, pad_hw = window_partition(hidden_states, self.window_size) - hidden_states = self.attn(hidden_states) - - if self.window_size > 0: - hidden_states = window_unpartition(hidden_states, self.window_size, pad_hw, (H, W)) - hidden_states = shortcut + hidden_states - hidden_states = hidden_states + self.mlp(self.norm2(hidden_states)) - return hidden_states + self.conv1 = nn.Conv2d(config.embed_dim, config.output_channels, kernel_size=1, bias=False) + self.layer_norm1 = PPChart2TableVisionLayerNorm(config.output_channels, data_format="channels_first") + self.conv2 = nn.Conv2d(config.output_channels, config.output_channels, kernel_size=3, padding=1, bias=False) + self.layer_norm2 = 
PPChart2TableVisionLayerNorm(config.output_channels, data_format="channels_first") class PPChart2TableVisionPreTrainedModel(PreTrainedModel): @@ -1099,60 +757,27 @@ class PPChart2TableVisionModel(PPChart2TableVisionPreTrainedModel): def __init__( self, config: PPChart2TableVisionConfig, - in_chans: int = 3, - norm_layer: type[nn.Module] = nn.LayerNorm, - act_layer: type[nn.Module] = nn.GELU, - rel_pos_zero_init: bool = True, ) -> None: super().__init__(config) - self.img_size = config.img_size + self.image_size = config.image_size - self.patch_embed = PPChart2TableVisionPatchEmbed( - kernel_size=(config.patch_size, config.patch_size), - stride=(config.patch_size, config.patch_size), - in_chans=in_chans, - embed_dim=config.embed_dim, - ) + self.patch_embed = PPChart2TableVisionPatchEmbed(config) self.pos_embed = nn.Parameter( torch.zeros( - 1, config.img_size // config.patch_size, config.img_size // config.patch_size, config.embed_dim + 1, config.image_size // config.patch_size, config.image_size // config.patch_size, config.embed_dim ) ) self.blocks = nn.ModuleList() for i in range(config.depth): block = PPChart2TableVisionDecoderLayer( - dim=config.embed_dim, - num_heads=config.num_heads, - mlp_ratio=config.mlp_ratio, - qkv_bias=config.qkv_bias, - norm_layer=norm_layer, - act_layer=act_layer, - use_rel_pos=config.use_rel_pos, - rel_pos_zero_init=rel_pos_zero_init, + config, window_size=config.window_size if i not in config.global_attn_indexes else 0, - input_size=(config.img_size // config.patch_size, config.img_size // config.patch_size), ) self.blocks.append(block) - self.neck = nn.Sequential( - nn.Conv2d( - config.embed_dim, - config.out_chans, - kernel_size=1, - bias=False, - ), - PPChart2TableVisionLayerNorm2d(config.out_chans), - nn.Conv2d( - config.out_chans, - config.out_chans, - kernel_size=3, - padding=1, - bias=False, - ), - PPChart2TableVisionLayerNorm2d(config.out_chans), - ) + self.neck = PPChart2TableVisionNeck(config) self.net_2 = nn.Conv2d(256, 
512, kernel_size=3, stride=2, padding=1, bias=False) self.net_3 = nn.Conv2d(512, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False) @@ -1162,9 +787,9 @@ def __init__( def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.patch_embed(hidden_states) hidden_states = hidden_states + self.pos_embed - for blk in self.blocks: - hidden_states = blk(hidden_states) - hidden_states = self.neck(hidden_states.permute(0, 3, 1, 2)) + for block in self.blocks: + hidden_states = block(hidden_states) + hidden_states = self.neck(hidden_states) hidden_states = self.net_2(hidden_states) hidden_states = self.net_3(hidden_states) return hidden_states diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py index 7d27beef3dec..a2281c9d5f3e 100644 --- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py @@ -41,7 +41,7 @@ def __call__( else: image_inputs = {} img_cnt = len(image_inputs) - b, c, h, w = image_inputs["pixel_values"].shape + _, _, h, _ = image_inputs["pixel_values"].shape num_patches = h // self.image_processor.patch_size // self.image_processor.merge_size prompt = ( "<|im_start|>system\n" From 5e3f1d3b3b40dac4d7b7f39d95bd8a5983a43e79 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Wed, 25 Feb 2026 15:33:57 +0800 Subject: [PATCH 04/60] update --- .../configuration_pp_chart2table.py | 17 +- .../pp_chart2table/modeling_pp_chart2table.py | 164 ++++++++-------- .../pp_chart2table/modular_pp_chart2table.py | 181 +++++++++--------- .../test_modeling_pp_chart2table.py | 14 +- 4 files changed, 191 insertions(+), 185 deletions(-) diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py index ee4ed2128161..710221f51058 100644 --- 
a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py @@ -85,6 +85,7 @@ def __init__( global_attn_indexes: Optional[list] = None, window_size: int = 14, output_channels: int = 256, + net_channels: int = 512, attention_dropout: float = 0.0, **kwargs, ): @@ -104,6 +105,7 @@ def __init__( self.global_attn_indexes = global_attn_indexes if global_attn_indexes is not None else [2, 5, 8, 11] self.window_size = window_size self.output_channels = output_channels + self.net_channels = net_channels self.attention_dropout = attention_dropout super().__init__(**kwargs) @@ -315,16 +317,24 @@ class PPChart2TableConfig(PreTrainedConfig): """ model_type = "pp_chart2table" + attribute_map = { + "image_token_id": "image_token_index", + } sub_configs = {"vision_config": PPChart2TableVisionConfig, "text_config": PPChart2TableTextConfig} def __init__( self, vision_config: dict | None = None, text_config: dict | None = None, - im_start_token: int = 151857, - im_patch_token: int = 151859, + image_token_index: Optional[int] = 151859, + image_seq_length: Optional[int] = 576, + pad_token_id: Optional[int] = -1, **kwargs, ): + self.image_token_index = image_token_index + self.image_seq_length = image_seq_length + self.pad_token_id = pad_token_id + if vision_config is None: vision_config = {} self.vision_config = PPChart2TableVisionConfig(**vision_config) @@ -335,9 +345,6 @@ def __init__( self.model_type = "pp_chart2table" - self.im_start_token = im_start_token - self.im_patch_token = im_patch_token - text_config_keys = [ "attention_dropout", "bos_token_id", diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py index 4b66b6b04afa..5f7604493339 100644 --- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py @@ -422,8 
+422,10 @@ def __init__( self.neck = PPChart2TableVisionNeck(config) - self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False) - self.net_3 = nn.Conv2d(512, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False) + self.net_2 = nn.Conv2d( + config.output_channels, config.net_channels, kernel_size=3, stride=2, padding=1, bias=False + ) + self.net_3 = nn.Conv2d(config.net_channels, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False) self.post_init() @@ -857,6 +859,7 @@ class PPChart2TableModelOutputWithPast(ModelOutput): last_hidden_state: Optional[torch.FloatTensor] = None hidden_states: Optional[tuple[torch.FloatTensor]] = None attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None @dataclass @@ -885,6 +888,7 @@ class PPChart2TableCausalLMOutputWithPast(ModelOutput): last_hidden_state: Optional[torch.FloatTensor] = None hidden_states: Optional[tuple[torch.FloatTensor]] = None attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None class PPChart2TablePreTrainedModel(PreTrainedModel): @@ -994,7 +998,7 @@ def set_input_embeddings(self, value): def get_image_features( self, - images: Optional[torch.Tensor], + pixel_values: torch.FloatTensor, ) -> list[torch.Tensor]: r""" Extract and project chart image features to text embedding space. @@ -1008,10 +1012,10 @@ def get_image_features( List of projected image features (one per image), each with shape `[1, num_patches, text_hidden_size]`. 
""" image_features = [] - for image in images: - image = image.unsqueeze(0) + for pixel_value in pixel_values: + pixel_value = pixel_value.unsqueeze(0) with torch.no_grad(): - cnn_feature = self.vision_tower_high(image) + cnn_feature = self.vision_tower_high(pixel_value) cnn_feature = cnn_feature.flatten(2).transpose(2, 1) image_feature = self.mm_projector_vary(cnn_feature) image_features.append(image_feature) @@ -1021,51 +1025,27 @@ def get_image_features( return image_features def get_placeholder_mask( - self, - input_ids: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - image_features: Optional[torch.FloatTensor] = None, - ) -> torch.BoolTensor: - r""" - Generate mask to locate image placeholder tokens in input embeddings. - - This mask identifies the `` tokens in the input sequence, which will be replaced with - projected image features for multimodal fusion. - - Args: - input_ids (`torch.LongTensor`, optional): - Tokenized input text (used if `inputs_embeds` is None). - inputs_embeds (`torch.FloatTensor`, optional): - Precomputed input embeddings (used if `input_ids` is None). - image_features (`torch.FloatTensor`): - Projected image features (used to validate token-feature count match). - - Returns: - `torch.BoolTensor`: - Boolean mask (shape: `[B, seq_len, text_hidden_size]`) where `True` indicates image placeholder tokens. - - Raises: - ValueError: If the number of image tokens does not match the number of image features. + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. 
""" if input_ids is None: - start_token_embed = self.get_input_embeddings()( - torch.tensor(self.config.im_start_token, dtype=torch.long, device=inputs_embeds.device) + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) ) - special_image_mask = inputs_embeds == start_token_embed special_image_mask = special_image_mask.all(-1) else: - special_image_mask = input_ids == self.config.im_patch_token + special_image_mask = input_ids == self.config.image_token_id n_image_tokens = special_image_mask.sum() - - n_image_features = image_features.numel() // image_features.shape[-1] - if n_image_tokens != n_image_features: + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + n_image_features = image_features.shape[0] * image_features.shape[1] + if inputs_embeds[special_image_mask].numel() != image_features.numel(): raise ValueError( f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" ) - - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) - return special_image_mask @can_return_tuple @@ -1081,20 +1061,23 @@ def forward( cache_position: Optional[torch.LongTensor] = None, **kwargs, ): + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + if inputs_embeds is None: inputs_embeds = self.language_model.embed_tokens(input_ids) if pixel_values is not None: - image_features = self.get_image_features(pixel_values) - image_mask = self.get_placeholder_mask( + image_features = self.get_image_features(pixel_values=pixel_values.to(inputs_embeds.dtype)) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask = self.get_placeholder_mask( input_ids, inputs_embeds=inputs_embeds, image_features=image_features 
) - inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_features) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) outputs = self.language_model( - input_ids=None, - position_ids=position_ids, attention_mask=attention_mask, + position_ids=position_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, @@ -1102,15 +1085,14 @@ def forward( **kwargs, ) - output = PPChart2TableModelOutputWithPast( + return PPChart2TableModelOutputWithPast( last_hidden_state=outputs.last_hidden_state, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, ) - return output - class PPChart2TableForConditionalGeneration(PPChart2TablePreTrainedModel, GenerationMixin): r""" @@ -1148,7 +1130,6 @@ def __init__(self, config: PPChart2TableConfig): super().__init__(config) self.model = PPChart2TableModel(config) self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) - self.post_init() def get_input_embeddings(self): @@ -1157,43 +1138,22 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.model.set_input_embeddings(value) - def prepare_inputs_for_generation( + def get_output_embeddings(self) -> nn.Module: + return self.lm_head + + def get_image_features( self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - position_ids=None, - use_cache=True, - pixel_values=None, - pixel_values_videos=None, - image_grid_thw=None, - video_grid_thw=None, - is_first_iteration=False, + pixel_values: torch.FloatTensor, + vision_feature_layer: Optional[Union[int, list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, **kwargs, ): - # Overwritten -- in specific circumstances we don't want to forward image inputs to the model - - model_inputs = 
super().prepare_inputs_for_generation( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - cache_position=cache_position, - position_ids=position_ids, + return self.model.get_image_features( pixel_values=pixel_values, - pixel_values_videos=pixel_values_videos, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - use_cache=use_cache, - is_first_iteration=is_first_iteration, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, **kwargs, ) - if not is_first_iteration and use_cache: - model_inputs["pixel_values"] = None - - return model_inputs @can_return_tuple def forward( @@ -1219,18 +1179,18 @@ def forward( use_cache=use_cache, past_key_values=past_key_values, cache_position=cache_position, + logits_to_keep=logits_to_keep, **kwargs, ) hidden_states = outputs.last_hidden_state - logits = self.lm_head(hidden_states) slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep logits = self.lm_head(hidden_states[:, slice_indices, :]) + loss = None if labels is not None: - raise ValueError( - "The PPChart2TableForConditionalGeneration model only supports inference, and training is not allowed!\n" - "If you need to train this model, please implement the corresponding loss calculation logic, or use the inference-only mode (do not pass the `labels` parameter)." 
+ loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs ) return PPChart2TableCausalLMOutputWithPast( @@ -1241,6 +1201,40 @@ def forward( attentions=outputs.attentions, ) + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + is_first_iteration=False, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + is_first_iteration=is_first_iteration, + **kwargs, + ) + + if is_first_iteration or not kwargs.get("use_cache", True): + # Pixel values are used only in the first iteration if available + # In subsquent iterations, they are already merged with text and cached + # NOTE: first iteration doesn't have to be prefill, it can be the first + # iteration with a question and cached system prompt (continue generate from cache) + model_inputs["pixel_values"] = pixel_values + + return model_inputs + __all__ = [ "PPChart2TableForConditionalGeneration", diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index e1bd61a0719c..72fd45bbde0d 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -110,6 +110,7 @@ def __init__( global_attn_indexes: Optional[list] = None, window_size: int = 14, output_channels: int = 256, + net_channels: int = 512, attention_dropout: float = 0.0, **kwargs, ): @@ -129,6 +130,7 @@ def __init__( self.global_attn_indexes = global_attn_indexes if global_attn_indexes is not None else 
[2, 5, 8, 11] self.window_size = window_size self.output_channels = output_channels + self.net_channels = net_channels self.attention_dropout = attention_dropout super().__init__(**kwargs) @@ -340,16 +342,24 @@ class PPChart2TableConfig(PreTrainedConfig): """ model_type = "pp_chart2table" + attribute_map = { + "image_token_id": "image_token_index", + } sub_configs = {"vision_config": PPChart2TableVisionConfig, "text_config": PPChart2TableTextConfig} def __init__( self, vision_config: dict | None = None, text_config: dict | None = None, - im_start_token: int = 151857, - im_patch_token: int = 151859, + image_token_index: Optional[int] = 151859, + image_seq_length: Optional[int] = 576, + pad_token_id: Optional[int] = -1, **kwargs, ): + self.image_token_index = image_token_index + self.image_seq_length = image_seq_length + self.pad_token_id = pad_token_id + if vision_config is None: vision_config = {} self.vision_config = PPChart2TableVisionConfig(**vision_config) @@ -360,9 +370,6 @@ def __init__( self.model_type = "pp_chart2table" - self.im_start_token = im_start_token - self.im_patch_token = im_patch_token - text_config_keys = [ "attention_dropout", "bos_token_id", @@ -779,8 +786,8 @@ def __init__( self.neck = PPChart2TableVisionNeck(config) - self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False) - self.net_3 = nn.Conv2d(512, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False) + self.net_2 = nn.Conv2d(config.output_channels, config.net_channels, kernel_size=3, stride=2, padding=1, bias=False) + self.net_3 = nn.Conv2d(config.net_channels, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False) self.post_init() @@ -832,6 +839,7 @@ class PPChart2TableModelOutputWithPast(ModelOutput): last_hidden_state: Optional[torch.FloatTensor] = None hidden_states: Optional[tuple[torch.FloatTensor]] = None attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None @dataclass @@ 
-860,6 +868,7 @@ class PPChart2TableCausalLMOutputWithPast(ModelOutput): last_hidden_state: Optional[torch.FloatTensor] = None hidden_states: Optional[tuple[torch.FloatTensor]] = None attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None class PPChart2TablePreTrainedModel(PreTrainedModel): @@ -969,7 +978,7 @@ def set_input_embeddings(self, value): def get_image_features( self, - images: Optional[torch.Tensor], + pixel_values: torch.FloatTensor, ) -> list[torch.Tensor]: r""" Extract and project chart image features to text embedding space. @@ -983,10 +992,10 @@ def get_image_features( List of projected image features (one per image), each with shape `[1, num_patches, text_hidden_size]`. """ image_features = [] - for image in images: - image = image.unsqueeze(0) + for pixel_value in pixel_values: + pixel_value = pixel_value.unsqueeze(0) with torch.no_grad(): - cnn_feature = self.vision_tower_high(image) + cnn_feature = self.vision_tower_high(pixel_value) cnn_feature = cnn_feature.flatten(2).transpose(2, 1) image_feature = self.mm_projector_vary(cnn_feature) image_features.append(image_feature) @@ -996,51 +1005,27 @@ def get_image_features( return image_features def get_placeholder_mask( - self, - input_ids: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - image_features: Optional[torch.FloatTensor] = None, - ) -> torch.BoolTensor: - r""" - Generate mask to locate image placeholder tokens in input embeddings. - - This mask identifies the `` tokens in the input sequence, which will be replaced with - projected image features for multimodal fusion. - - Args: - input_ids (`torch.LongTensor`, optional): - Tokenized input text (used if `inputs_embeds` is None). - inputs_embeds (`torch.FloatTensor`, optional): - Precomputed input embeddings (used if `input_ids` is None). 
- image_features (`torch.FloatTensor`): - Projected image features (used to validate token-feature count match). - - Returns: - `torch.BoolTensor`: - Boolean mask (shape: `[B, seq_len, text_hidden_size]`) where `True` indicates image placeholder tokens. - - Raises: - ValueError: If the number of image tokens does not match the number of image features. + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. """ if input_ids is None: - start_token_embed = self.get_input_embeddings()( - torch.tensor(self.config.im_start_token, dtype=torch.long, device=inputs_embeds.device) + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) ) - special_image_mask = inputs_embeds == start_token_embed special_image_mask = special_image_mask.all(-1) else: - special_image_mask = input_ids == self.config.im_patch_token + special_image_mask = input_ids == self.config.image_token_id n_image_tokens = special_image_mask.sum() - - n_image_features = image_features.numel() // image_features.shape[-1] - if n_image_tokens != n_image_features: + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + n_image_features = image_features.shape[0] * image_features.shape[1] + if inputs_embeds[special_image_mask].numel() != image_features.numel(): raise ValueError( f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" ) - - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) - return special_image_mask @can_return_tuple @@ -1056,20 +1041,23 @@ def forward( cache_position: 
Optional[torch.LongTensor] = None, **kwargs, ): + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + if inputs_embeds is None: inputs_embeds = self.language_model.embed_tokens(input_ids) if pixel_values is not None: - image_features = self.get_image_features(pixel_values) - image_mask = self.get_placeholder_mask( + image_features = self.get_image_features(pixel_values=pixel_values.to(inputs_embeds.dtype)) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask = self.get_placeholder_mask( input_ids, inputs_embeds=inputs_embeds, image_features=image_features ) - inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_features) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) outputs = self.language_model( - input_ids=None, - position_ids=position_ids, attention_mask=attention_mask, + position_ids=position_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, @@ -1077,15 +1065,14 @@ def forward( **kwargs, ) - output = PPChart2TableModelOutputWithPast( + return PPChart2TableModelOutputWithPast( last_hidden_state=outputs.last_hidden_state, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, ) - return output - class PPChart2TableForConditionalGeneration(PPChart2TablePreTrainedModel, GenerationMixin): r""" @@ -1123,7 +1110,6 @@ def __init__(self, config: PPChart2TableConfig): super().__init__(config) self.model = PPChart2TableModel(config) self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) - self.post_init() def get_input_embeddings(self): @@ -1131,44 +1117,23 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.model.set_input_embeddings(value) - - def 
prepare_inputs_for_generation( + + def get_output_embeddings(self) -> nn.Module: + return self.lm_head + + def get_image_features( self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - position_ids=None, - use_cache=True, - pixel_values=None, - pixel_values_videos=None, - image_grid_thw=None, - video_grid_thw=None, - is_first_iteration=False, + pixel_values: torch.FloatTensor, + vision_feature_layer: Optional[Union[int, list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, **kwargs, ): - # Overwritten -- in specific circumstances we don't want to forward image inputs to the model - - model_inputs = super().prepare_inputs_for_generation( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - cache_position=cache_position, - position_ids=position_ids, + return self.model.get_image_features( pixel_values=pixel_values, - pixel_values_videos=pixel_values_videos, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - use_cache=use_cache, - is_first_iteration=is_first_iteration, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, **kwargs, ) - if not is_first_iteration and use_cache: - model_inputs["pixel_values"] = None - - return model_inputs @can_return_tuple def forward( @@ -1194,18 +1159,18 @@ def forward( use_cache=use_cache, past_key_values=past_key_values, cache_position=cache_position, + logits_to_keep=logits_to_keep, **kwargs, ) hidden_states = outputs.last_hidden_state - logits = self.lm_head(hidden_states) slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep logits = self.lm_head(hidden_states[:, slice_indices, :]) + loss = None if labels is not None: - raise ValueError( - "The PPChart2TableForConditionalGeneration model only supports inference, and training is not allowed!\n" - "If you need to train this 
model, please implement the corresponding loss calculation logic, or use the inference-only mode (do not pass the `labels` parameter)." + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs ) return PPChart2TableCausalLMOutputWithPast( @@ -1216,6 +1181,40 @@ def forward( attentions=outputs.attentions, ) + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + is_first_iteration=False, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + is_first_iteration=is_first_iteration, + **kwargs, + ) + + if is_first_iteration or not kwargs.get("use_cache", True): + # Pixel values are used only in the first iteration if available + # In subsquent iterations, they are already merged with text and cached + # NOTE: first iteration doesn't have to be prefill, it can be the first + # iteration with a question and cached system prompt (continue generate from cache) + model_inputs["pixel_values"] = pixel_values + + return model_inputs + __all__ = [ "PPChart2TableForConditionalGeneration", diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py index 1143253791fa..d76f23d92900 100644 --- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py @@ -49,7 +49,7 @@ class PPChart2TableVisionText2TextModelTester: def __init__( self, parent, - batch_size=7, + batch_size=1, seq_length=31, num_channels=3, image_height=64, @@ -73,16 +73,20 @@ def __init__( 
"depth": 2, "embed_dim": 768, "hidden_size": 144, - "img_size": 64, + "hidden_act": "gelu", + "image_size": 64, + "num_channels": 3, "mlp_ratio": 4.0, "norm_layer_eps": 1e-6, - "num_heads": 4, + "num_attention_heads": 4, "patch_size": 16, "qkv_bias": True, "use_rel_pos": True, "global_attn_indexes": [2, 5, 8, 11], "window_size": 14, - "out_chans": 256, + "output_channels": 256, + "net_channels": 512, + "attention_dropout": 0.0 }, bos_token_id=151643, eos_token_id=151643, @@ -168,7 +172,9 @@ def prepare_config_and_inputs_for_common(self): class PPChart2TableModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = (PPChart2TableForConditionalGeneration,) if is_torch_available() else () pipeline_model_mapping = {"image-text-to-text": PPChart2TableForConditionalGeneration} + _is_composite = True + test_resize_embeddings = False def setUp(self): self.model_tester = PPChart2TableVisionText2TextModelTester(self) From 2c064ccdf0caa3aa10b4ccb17e4a97ef822a17bf Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Wed, 25 Feb 2026 15:39:39 +0800 Subject: [PATCH 05/60] update --- docs/source/en/model_doc/pp_chart2table.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md index 7ba3692f7f49..ad00bec20f84 100644 --- a/docs/source/en/model_doc/pp_chart2table.md +++ b/docs/source/en/model_doc/pp_chart2table.md @@ -38,18 +38,20 @@ The example below demonstrates how to classify image with PP-Chart2Table using [ ```py -from transformers import pipeline +import requests from PIL import Image +from transformers import pipeline model_path = "PaddlePaddle/PP-Chart2Table_safetensors" pipe = pipeline("image-text-to-text", model=model_path) image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw) result = pipe( - images=image, - text="", - 
do_sample=False, + images=image, + text="", + do_sample=False, max_new_tokens=256 ) print(result) + ``` @@ -85,6 +87,7 @@ Here is how you can do it with PP-Chart2Table using [`Pipeline`] or the [`AutoMo ```py +import requests from transformers import pipeline from PIL import Image model_path = "PaddlePaddle/PP-Chart2Table_safetensors" From 81514c0b1fbfbe2221f17a835dfe19f8e2aafe12 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Thu, 26 Feb 2026 15:01:52 +0800 Subject: [PATCH 06/60] update --- .../configuration_pp_chart2table.py | 128 +++++++------ .../pp_chart2table/modeling_pp_chart2table.py | 7 +- .../pp_chart2table/modular_pp_chart2table.py | 172 +++++++++++------- .../test_modeling_pp_chart2table.py | 4 +- 4 files changed, 184 insertions(+), 127 deletions(-) diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py index 710221f51058..9de2e1da70b1 100644 --- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py @@ -21,34 +21,61 @@ class PPChart2TableVisionConfig(PreTrainedConfig): documentation from [`PreTrainedConfig`] for more information. Args: - im_patch_token (`int`, *optional*, defaults to 151859): - The token ID used to represent individual image patches in the multimodal input sequence. - im_start_token (`int`, *optional*, defaults to 151857): - The token ID representing the start of an image token sequence in the multimodal input. - depth (`int`, *optional*, defaults to 12): - Number of hidden layers in the vision Transformer encoder. - embed_dim (`int`, *optional*, defaults to 768): - Dimensionality of the patch embedding layer in the vision encoder. - hidden_size (`int`, *optional*, defaults to 1024): - Dimensionality of the hidden layers in the vision Transformer encoder. 
- image_size (`int`, *optional*, defaults to 1024): - The size (resolution) of input chart images (assumed to be square). - mlp_ratio (`float`, *optional*, defaults to 4.0): - Ratio of the dimensionality of the feed-forward layer to the hidden size in the vision Transformer blocks. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each self-attention layer in the vision Transformer encoder. - patch_size (`int`, *optional*, defaults to 16): - The size (resolution) of each image patch extracted from the input chart image. - qkv_bias (`bool`, *optional*, defaults to `True`): - Whether to add bias terms to the query, key, and value projection layers in the self-attention mechanism. - use_rel_pos (`bool`, *optional*, defaults to `True`): - Whether to use relative positional embeddings in the self-attention layers of the vision encoder. - global_attn_indexes (`list`, *optional*, defaults to `[2, 5, 8, 11]`): - List of layer indexes where global attention (instead of windowed attention) is applied in the vision encoder. - window_size (`int`, *optional*, defaults to 14): - The size of the attention window for windowed self-attention in the vision Transformer layers. - output_channels (`int`, *optional*, defaults to 256): - Number of output channels from the convolutional stem layer before patch embedding. + depth (`int`, *optional*, defaults to 12): + Number of hidden layers in the vision Transformer encoder. + embed_dim (`int`, *optional*, defaults to 768): + Dimensionality of the patch embedding layer in the vision encoder. + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the hidden layers in the vision Transformer encoder. + num_channels (`int`, *optional*, defaults to 3): + Number of input channels for the convolutional stem layer (default: RGB images with 3 channels). + image_size (`int`, *optional*, defaults to 1024): + The size (resolution) of input chart images (assumed to be square). 
+ mlp_ratio (`float`, *optional*, defaults to 4.0): + Ratio of the dimensionality of the feed-forward layer to the hidden size in the vision Transformer blocks. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each self-attention layer in the vision Transformer encoder. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each image patch extracted from the input chart image. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add bias terms to the query, key, and value projection layers in the self-attention mechanism. + use_rel_pos (`bool`, *optional*, defaults to `True`): + Whether to use relative positional embeddings in the self-attention layers of the vision encoder. + global_attn_indexes (`Optional[List[int]]`, *optional*, defaults to `None`): + List of layer indexes where global attention (instead of windowed attention) is applied in the vision encoder. + If `None`, defaults to `[2, 5, 8, 11]`. + window_size (`int`, *optional*, defaults to 14): + The size of the attention window for windowed self-attention in the vision Transformer layers. + output_channels (`int`, *optional*, defaults to 256): + Number of output channels from the convolutional stem layer before patch embedding. + net_channels (`int`, *optional*, defaults to 512): + Number of channels in the intermediate convolutional layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + Dropout probability for the attention layers in the vision Transformer. 
+ output_hidden_states (`bool`, *optional*, defaults to `False`): + output_attentions (`bool`, *optional*, defaults to `False`): + return_dict (`bool`, *optional*, defaults to `True`): + dtype (`Union`, *optional*): + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + chunk_size_feed_forward (`int`, *optional*, defaults to 0): + is_encoder_decoder (`bool`, *optional*, defaults to `False`): + is_decoder (`bool`, *optional*, defaults to `False`): + cross_attention_hidden_size (`Optional`, *optional*): + add_cross_attention (`bool`, *optional*, defaults to `False`): + architectures (`Optional`, *optional*): + finetuning_task (`Optional`, *optional*): + id2label (`Optional`, *optional*): + label2id (`Optional`, *optional*): + num_labels (`Optional`, *optional*): + task_specific_params (`Optional`, *optional*): + problem_type (`Optional`, *optional*): + tokenizer_class (`Optional`, *optional*): + prefix (`Optional`, *optional*): + bos_token_id (`Optional`, *optional*): + pad_token_id (`Optional`, *optional*): + eos_token_id (`Optional`, *optional*): + sep_token_id (`Optional`, *optional*): + decoder_start_token_id (`Optional`, *optional*): Example: @@ -68,10 +95,8 @@ class PPChart2TableVisionConfig(PreTrainedConfig): model_type = "pp_chart2table_vision" base_config_key = "vision_config" - def __init__( + def init( self, - im_patch_token: int = 151859, - im_start_token: int = 151857, depth: int = 12, embed_dim: int = 768, hidden_size: int = 1024, @@ -82,16 +107,13 @@ def __init__( patch_size: int = 16, qkv_bias: bool = True, use_rel_pos: bool = True, - global_attn_indexes: Optional[list] = None, + global_attn_indexes: Optional[list[int]] = None, window_size: int = 14, output_channels: int = 256, net_channels: int = 512, attention_dropout: float = 0.0, **kwargs, ): - self.im_patch_token = im_patch_token - self.im_start_token = im_start_token - self.depth = depth self.embed_dim = embed_dim self.hidden_size = hidden_size @@ -107,8 +129,7 @@ def __init__( 
self.output_channels = output_channels self.net_channels = net_channels self.attention_dropout = attention_dropout - - super().__init__(**kwargs) + super().init(**kwargs) class PPChart2TableTextConfig(PreTrainedConfig): @@ -267,30 +288,25 @@ def __init__( class PPChart2TableConfig(PreTrainedConfig): r""" - This is the main configuration class to store the configuration of a [`PPChart2TableModel`] or [`PPChart2TableForConditionalGeneration`]. + This is the main configuration class to store the configuration of a [PPChart2TableModel] or [PPChart2TableForConditionalGeneration]. It is used to instantiate a PP-Chart2Table multimodal model according to the specified arguments, defining the vision and text - sub-model architectures. This configuration class inherits from [`PreTrainedConfig`] and combines the configurations of: - - [`PPChart2TableVisionConfig`] (for the chart vision encoder) - - [`PPChart2TableTextConfig`] (for the table text decoder) - PP-Chart2Table [PaddlePaddle/PP-Chart2Table_safetensors](https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors). + sub-model architectures. This configuration class inherits from [PreTrainedConfig] and combines the configurations of: + [PPChart2TableVisionConfig] (for the chart vision encoder) + [PPChart2TableTextConfig] (for the table text decoder) + PP-Chart2Table PaddlePaddle/PP-Chart2Table_safetensors. - Instantiating a `PPChart2TableConfig` with the defaults will yield a similar configuration to the base PP-Chart2Table model + Instantiating a PPChart2TableConfig with the defaults will yield a similar configuration to the base PP-Chart2Table model developed by the PaddlePaddle team for chart-to-table parsing tasks. - Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PreTrainedConfig`] for more information. + Configuration objects inherit from [PreTrainedConfig] and can be used to control the model outputs. 
Read the + documentation from [PreTrainedConfig] for more information. Args: - vision_config (`dict`, *optional*): - Dictionary of configuration options used to initialize [`PPChart2TableVisionConfig`]. If `None`, the default - `PPChart2TableVisionConfig` configuration will be used. - text_config (`dict`, *optional*): - Dictionary of configuration options used to initialize [`PPChart2TableTextConfig`]. If `None`, the default - `PPChart2TableTextConfig` configuration will be used. - im_start_token (`int`, *optional*, defaults to 151857): - The token ID representing the start of an image token sequence in the multimodal input (shared across vision/text sub-configs). - im_patch_token (`int`, *optional*, defaults to 151859): - The token ID used to represent individual image patches in the multimodal input sequence (shared across vision/text sub-configs). + vision_config (Optional[Dict], optional, defaults to None, *optional*): + text_config (Optional[Dict], optional, defaults to None, *optional*): + image_token_index (Optional[int], optional, defaults to 151859, *optional*, defaults to 151859): + image_seq_length (Optional[int], optional, defaults to 576, *optional*, defaults to 576): + pad_token_id (Optional[int], optional, defaults to -1, *optional*, defaults to -1): Example: @@ -343,8 +359,6 @@ def __init__( text_config = {} self.text_config = PPChart2TableTextConfig(**text_config) - self.model_type = "pp_chart2table" - text_config_keys = [ "attention_dropout", "bos_token_id", diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py index 5f7604493339..1c9790b4d1a2 100644 --- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py @@ -18,6 +18,7 @@ from transformers.modeling_outputs import ModelOutput from transformers.modeling_utils import PreTrainedModel from transformers.utils import 
can_return_tuple +from transformers.utils.generic import check_model_inputs from ...activations import ACT2FN from ...cache_utils import DynamicCache @@ -30,7 +31,7 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring -from ...utils.generic import check_model_inputs, maybe_autocast +from ...utils.generic import maybe_autocast from .configuration_pp_chart2table import PPChart2TableConfig, PPChart2TableTextConfig, PPChart2TableVisionConfig @@ -52,7 +53,6 @@ def __init__(self, config): self.patch_size = patch_size self.num_channels = num_channels self.num_patches = num_patches - self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) def forward(self, pixel_values): @@ -429,7 +429,7 @@ def __init__( self.post_init() - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor: hidden_states = self.patch_embed(hidden_states) hidden_states = hidden_states + self.pos_embed for block in self.blocks: @@ -1194,6 +1194,7 @@ def forward( ) return PPChart2TableCausalLMOutputWithPast( + loss=loss, logits=logits, last_hidden_state=outputs.last_hidden_state, past_key_values=outputs.past_key_values, diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index 72fd45bbde0d..bef1baa5c556 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -1,9 +1,9 @@ +import collections from dataclasses import dataclass -from typing import Optional, Union +from typing import Optional, Union, List import torch import torch.nn as nn -import torch.nn.functional as F from torchvision.transforms.v2.functional import InterpolationMode from transformers.cache_utils import Cache @@ -26,8 +26,20 @@ from 
transformers.modeling_outputs import ModelOutput from transformers.modeling_rope_utils import RopeParameters from transformers.modeling_utils import PreTrainedModel -from transformers.models.qwen2.modeling_qwen2 import Qwen2Attention, Qwen2DecoderLayer, Qwen2Model, Qwen2PreTrainedModel -from transformers.models.got_ocr2.modeling_got_ocr2 import GotOcr2VisionNeck, GotOcr2MLPBlock, GotOcr2LayerNorm, GotOcr2PatchEmbeddings, GotOcr2VisionAttention, GotOcr2VisionLayer +from transformers.models.got_ocr2.modeling_got_ocr2 import ( + GotOcr2LayerNorm, + GotOcr2MLPBlock, + GotOcr2PatchEmbeddings, + GotOcr2VisionAttention, + GotOcr2VisionLayer, + GotOcr2VisionNeck, +) +from transformers.models.qwen2.modeling_qwen2 import ( + Qwen2Attention, + Qwen2DecoderLayer, + Qwen2Model, + Qwen2PreTrainedModel, +) from transformers.processing_utils import ProcessorMixin, TensorType from transformers.utils import ( can_return_tuple, @@ -46,34 +58,61 @@ class PPChart2TableVisionConfig(PreTrainedConfig): documentation from [`PreTrainedConfig`] for more information. Args: - im_patch_token (`int`, *optional*, defaults to 151859): - The token ID used to represent individual image patches in the multimodal input sequence. - im_start_token (`int`, *optional*, defaults to 151857): - The token ID representing the start of an image token sequence in the multimodal input. - depth (`int`, *optional*, defaults to 12): - Number of hidden layers in the vision Transformer encoder. - embed_dim (`int`, *optional*, defaults to 768): - Dimensionality of the patch embedding layer in the vision encoder. - hidden_size (`int`, *optional*, defaults to 1024): - Dimensionality of the hidden layers in the vision Transformer encoder. - image_size (`int`, *optional*, defaults to 1024): - The size (resolution) of input chart images (assumed to be square). - mlp_ratio (`float`, *optional*, defaults to 4.0): - Ratio of the dimensionality of the feed-forward layer to the hidden size in the vision Transformer blocks. 
- num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each self-attention layer in the vision Transformer encoder. - patch_size (`int`, *optional*, defaults to 16): - The size (resolution) of each image patch extracted from the input chart image. - qkv_bias (`bool`, *optional*, defaults to `True`): - Whether to add bias terms to the query, key, and value projection layers in the self-attention mechanism. - use_rel_pos (`bool`, *optional*, defaults to `True`): - Whether to use relative positional embeddings in the self-attention layers of the vision encoder. - global_attn_indexes (`list`, *optional*, defaults to `[2, 5, 8, 11]`): - List of layer indexes where global attention (instead of windowed attention) is applied in the vision encoder. - window_size (`int`, *optional*, defaults to 14): - The size of the attention window for windowed self-attention in the vision Transformer layers. - output_channels (`int`, *optional*, defaults to 256): - Number of output channels from the convolutional stem layer before patch embedding. + depth (`int`, *optional*, defaults to 12): + Number of hidden layers in the vision Transformer encoder. + embed_dim (`int`, *optional*, defaults to 768): + Dimensionality of the patch embedding layer in the vision encoder. + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the hidden layers in the vision Transformer encoder. + num_channels (`int`, *optional*, defaults to 3): + Number of input channels for the convolutional stem layer (default: RGB images with 3 channels). + image_size (`int`, *optional*, defaults to 1024): + The size (resolution) of input chart images (assumed to be square). + mlp_ratio (`float`, *optional*, defaults to 4.0): + Ratio of the dimensionality of the feed-forward layer to the hidden size in the vision Transformer blocks. 
+ num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each self-attention layer in the vision Transformer encoder. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each image patch extracted from the input chart image. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add bias terms to the query, key, and value projection layers in the self-attention mechanism. + use_rel_pos (`bool`, *optional*, defaults to `True`): + Whether to use relative positional embeddings in the self-attention layers of the vision encoder. + global_attn_indexes (`Optional[List[int]]`, *optional*, defaults to `None`): + List of layer indexes where global attention (instead of windowed attention) is applied in the vision encoder. + If `None`, defaults to `[2, 5, 8, 11]`. + window_size (`int`, *optional*, defaults to 14): + The size of the attention window for windowed self-attention in the vision Transformer layers. + output_channels (`int`, *optional*, defaults to 256): + Number of output channels from the convolutional stem layer before patch embedding. + net_channels (`int`, *optional*, defaults to 512): + Number of channels in the intermediate convolutional layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + Dropout probability for the attention layers in the vision Transformer. 
+ output_hidden_states (`bool`, *optional*, defaults to `False`): + output_attentions (`bool`, *optional*, defaults to `False`): + return_dict (`bool`, *optional*, defaults to `True`): + dtype (`Union`, *optional*): + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + chunk_size_feed_forward (`int`, *optional*, defaults to 0): + is_encoder_decoder (`bool`, *optional*, defaults to `False`): + is_decoder (`bool`, *optional*, defaults to `False`): + cross_attention_hidden_size (`Optional`, *optional*): + add_cross_attention (`bool`, *optional*, defaults to `False`): + architectures (`Optional`, *optional*): + finetuning_task (`Optional`, *optional*): + id2label (`Optional`, *optional*): + label2id (`Optional`, *optional*): + num_labels (`Optional`, *optional*): + task_specific_params (`Optional`, *optional*): + problem_type (`Optional`, *optional*): + tokenizer_class (`Optional`, *optional*): + prefix (`Optional`, *optional*): + bos_token_id (`Optional`, *optional*): + pad_token_id (`Optional`, *optional*): + eos_token_id (`Optional`, *optional*): + sep_token_id (`Optional`, *optional*): + decoder_start_token_id (`Optional`, *optional*): Example: @@ -93,10 +132,8 @@ class PPChart2TableVisionConfig(PreTrainedConfig): model_type = "pp_chart2table_vision" base_config_key = "vision_config" - def __init__( + def init( self, - im_patch_token: int = 151859, - im_start_token: int = 151857, depth: int = 12, embed_dim: int = 768, hidden_size: int = 1024, @@ -107,16 +144,13 @@ def __init__( patch_size: int = 16, qkv_bias: bool = True, use_rel_pos: bool = True, - global_attn_indexes: Optional[list] = None, + global_attn_indexes: Optional[List[int]] = None, window_size: int = 14, output_channels: int = 256, net_channels: int = 512, attention_dropout: float = 0.0, **kwargs, ): - self.im_patch_token = im_patch_token - self.im_start_token = im_start_token - self.depth = depth self.embed_dim = embed_dim self.hidden_size = hidden_size @@ -132,8 +166,7 @@ def __init__( 
self.output_channels = output_channels self.net_channels = net_channels self.attention_dropout = attention_dropout - - super().__init__(**kwargs) + super().init(**kwargs) class PPChart2TableTextConfig(PreTrainedConfig): @@ -292,30 +325,25 @@ def __init__( class PPChart2TableConfig(PreTrainedConfig): r""" - This is the main configuration class to store the configuration of a [`PPChart2TableModel`] or [`PPChart2TableForConditionalGeneration`]. + This is the main configuration class to store the configuration of a [PPChart2TableModel] or [PPChart2TableForConditionalGeneration]. It is used to instantiate a PP-Chart2Table multimodal model according to the specified arguments, defining the vision and text - sub-model architectures. This configuration class inherits from [`PreTrainedConfig`] and combines the configurations of: - - [`PPChart2TableVisionConfig`] (for the chart vision encoder) - - [`PPChart2TableTextConfig`] (for the table text decoder) - PP-Chart2Table [PaddlePaddle/PP-Chart2Table_safetensors](https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors). + sub-model architectures. This configuration class inherits from [PreTrainedConfig] and combines the configurations of: + [PPChart2TableVisionConfig] (for the chart vision encoder) + [PPChart2TableTextConfig] (for the table text decoder) + PP-Chart2Table PaddlePaddle/PP-Chart2Table_safetensors. - Instantiating a `PPChart2TableConfig` with the defaults will yield a similar configuration to the base PP-Chart2Table model + Instantiating a PPChart2TableConfig with the defaults will yield a similar configuration to the base PP-Chart2Table model developed by the PaddlePaddle team for chart-to-table parsing tasks. - Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PreTrainedConfig`] for more information. + Configuration objects inherit from [PreTrainedConfig] and can be used to control the model outputs. 
Read the + documentation from [PreTrainedConfig] for more information. Args: - vision_config (`dict`, *optional*): - Dictionary of configuration options used to initialize [`PPChart2TableVisionConfig`]. If `None`, the default - `PPChart2TableVisionConfig` configuration will be used. - text_config (`dict`, *optional*): - Dictionary of configuration options used to initialize [`PPChart2TableTextConfig`]. If `None`, the default - `PPChart2TableTextConfig` configuration will be used. - im_start_token (`int`, *optional*, defaults to 151857): - The token ID representing the start of an image token sequence in the multimodal input (shared across vision/text sub-configs). - im_patch_token (`int`, *optional*, defaults to 151859): - The token ID used to represent individual image patches in the multimodal input sequence (shared across vision/text sub-configs). + vision_config (Optional[Dict], optional, defaults to None, *optional*): + text_config (Optional[Dict], optional, defaults to None, *optional*): + image_token_index (Optional[int], optional, defaults to 151859, *optional*, defaults to 151859): + image_seq_length (Optional[int], optional, defaults to 576, *optional*, defaults to 576): + pad_token_id (Optional[int], optional, defaults to -1, *optional*, defaults to -1): Example: @@ -368,7 +396,6 @@ def __init__( text_config = {} self.text_config = PPChart2TableTextConfig(**text_config) - self.model_type = "pp_chart2table" text_config_keys = [ "attention_dropout", @@ -622,6 +649,7 @@ class PPChart2TableProcessor(ProcessorMixin): chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string. 
""" + image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" @@ -663,7 +691,16 @@ def postprocess(self, model_pred, **kwargs): class PPChart2TableVisionPatchEmbed(GotOcr2PatchEmbeddings): def __init__(self, config): super().__init__() + image_size, patch_size = config.image_size, config.patch_size num_channels, hidden_size = config.num_channels, config.embed_dim + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) class PPChart2TableVisionMLPBlock(GotOcr2MLPBlock): @@ -683,6 +720,7 @@ class PPChart2TableVisionAttention(GotOcr2VisionAttention): def __init__(self, config, window_size): super().__init__() head_dim = config.embed_dim // config.num_attention_heads + self.scale = head_dim**-0.5 self.qkv = nn.Linear(config.embed_dim, config.embed_dim * 3, bias=config.qkv_bias) self.proj = nn.Linear(config.embed_dim, config.embed_dim) @@ -757,7 +795,6 @@ class PPChart2TableVisionPreTrainedModel(PreTrainedModel): class PPChart2TableVisionModel(PPChart2TableVisionPreTrainedModel): - main_input_name = "pixel_values" input_modalities = "image" @@ -786,12 +823,14 @@ def __init__( self.neck = PPChart2TableVisionNeck(config) - self.net_2 = nn.Conv2d(config.output_channels, config.net_channels, kernel_size=3, stride=2, padding=1, bias=False) + self.net_2 = nn.Conv2d( + config.output_channels, config.net_channels, kernel_size=3, stride=2, padding=1, bias=False + ) self.net_3 = nn.Conv2d(config.net_channels, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False) self.post_init() 
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor: hidden_states = self.patch_embed(hidden_states) hidden_states = hidden_states + self.pos_embed for block in self.blocks: @@ -801,6 +840,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.net_3(hidden_states) return hidden_states + class PPChart2TableTextAttention(Qwen2Attention): pass @@ -812,6 +852,7 @@ class PPChart2TableTextDecoderLayer(Qwen2DecoderLayer): class PPChart2TableTextPreTrainedModel(Qwen2PreTrainedModel): pass + class PPChart2TableTextModel(Qwen2Model): pass @@ -1117,10 +1158,10 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.model.set_input_embeddings(value) - + def get_output_embeddings(self) -> nn.Module: return self.lm_head - + def get_image_features( self, pixel_values: torch.FloatTensor, @@ -1174,6 +1215,7 @@ def forward( ) return PPChart2TableCausalLMOutputWithPast( + loss=loss, logits=logits, last_hidden_state=outputs.last_hidden_state, past_key_values=outputs.past_key_values, diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py index d76f23d92900..99d30d76c81d 100644 --- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py @@ -86,7 +86,7 @@ def __init__( "window_size": 14, "output_channels": 256, "net_channels": 512, - "attention_dropout": 0.0 + "attention_dropout": 0.0, }, bos_token_id=151643, eos_token_id=151643, @@ -172,7 +172,7 @@ def prepare_config_and_inputs_for_common(self): class PPChart2TableModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = (PPChart2TableForConditionalGeneration,) if is_torch_available() else () pipeline_model_mapping = {"image-text-to-text": PPChart2TableForConditionalGeneration} - + 
_is_composite = True test_resize_embeddings = False From fc7c75fe76916f9a61e1226ba02523b18e2a9b27 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Fri, 27 Feb 2026 11:45:46 +0800 Subject: [PATCH 07/60] update --- .../configuration_pp_chart2table.py | 116 ++++++----------- .../pp_chart2table/modular_pp_chart2table.py | 121 ++++++------------ 2 files changed, 82 insertions(+), 155 deletions(-) diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py index 9de2e1da70b1..7fa3b6d36af8 100644 --- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py @@ -11,85 +11,49 @@ class PPChart2TableVisionConfig(PreTrainedConfig): - r""" - This is the configuration class to store the configuration of a [`PPChart2TableVisionModel`]. It is used to instantiate a - PP-Chart2Table vision encoder according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the vision encoder of the PP-Chart2Table - architecture developed by the PaddlePaddle team for chart-to-table parsing tasks. + """ + Configuration class for the vision backbone of PP-Chart2Table model. - Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PreTrainedConfig`] for more information. + This configuration class defines all the hyperparameters for the vision component + of the PP-Chart2Table model, which is responsible for processing chart images + and extracting visual features for table structure recognition and content extraction. 
+ PaddlePaddle/PP-Chart2Table_safetensors [PaddlePaddle/PP-Chart2Table_safetensors] + (https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors) Args: - depth (`int`, *optional*, defaults to 12): - Number of hidden layers in the vision Transformer encoder. - embed_dim (`int`, *optional*, defaults to 768): - Dimensionality of the patch embedding layer in the vision encoder. - hidden_size (`int`, *optional*, defaults to 1024): - Dimensionality of the hidden layers in the vision Transformer encoder. - num_channels (`int`, *optional*, defaults to 3): - Number of input channels for the convolutional stem layer (default: RGB images with 3 channels). - image_size (`int`, *optional*, defaults to 1024): - The size (resolution) of input chart images (assumed to be square). - mlp_ratio (`float`, *optional*, defaults to 4.0): - Ratio of the dimensionality of the feed-forward layer to the hidden size in the vision Transformer blocks. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each self-attention layer in the vision Transformer encoder. - patch_size (`int`, *optional*, defaults to 16): - The size (resolution) of each image patch extracted from the input chart image. - qkv_bias (`bool`, *optional*, defaults to `True`): - Whether to add bias terms to the query, key, and value projection layers in the self-attention mechanism. - use_rel_pos (`bool`, *optional*, defaults to `True`): - Whether to use relative positional embeddings in the self-attention layers of the vision encoder. - global_attn_indexes (`Optional[List[int]]`, *optional*, defaults to `None`): - List of layer indexes where global attention (instead of windowed attention) is applied in the vision encoder. - If `None`, defaults to `[2, 5, 8, 11]`. - window_size (`int`, *optional*, defaults to 14): - The size of the attention window for windowed self-attention in the vision Transformer layers. 
- output_channels (`int`, *optional*, defaults to 256): - Number of output channels from the convolutional stem layer before patch embedding. - net_channels (`int`, *optional*, defaults to 512): - Number of channels in the intermediate convolutional layers. - attention_dropout (`float`, *optional*, defaults to 0.0): - Dropout probability for the attention layers in the vision Transformer. - output_hidden_states (`bool`, *optional*, defaults to `False`): - output_attentions (`bool`, *optional*, defaults to `False`): - return_dict (`bool`, *optional*, defaults to `True`): - dtype (`Union`, *optional*): - tie_word_embeddings (`bool`, *optional*, defaults to `True`): - chunk_size_feed_forward (`int`, *optional*, defaults to 0): - is_encoder_decoder (`bool`, *optional*, defaults to `False`): - is_decoder (`bool`, *optional*, defaults to `False`): - cross_attention_hidden_size (`Optional`, *optional*): - add_cross_attention (`bool`, *optional*, defaults to `False`): - architectures (`Optional`, *optional*): - finetuning_task (`Optional`, *optional*): - id2label (`Optional`, *optional*): - label2id (`Optional`, *optional*): - num_labels (`Optional`, *optional*): - task_specific_params (`Optional`, *optional*): - problem_type (`Optional`, *optional*): - tokenizer_class (`Optional`, *optional*): - prefix (`Optional`, *optional*): - bos_token_id (`Optional`, *optional*): - pad_token_id (`Optional`, *optional*): - eos_token_id (`Optional`, *optional*): - sep_token_id (`Optional`, *optional*): - decoder_start_token_id (`Optional`, *optional*): - - Example: - - ```python - >>> from transformers import PPChart2TableVisionConfig, PPChart2TableVisionModel - - >>> # Initializing a PPChart2TableVisionConfig with default PP-Chart2Table style configuration - >>> configuration = PPChart2TableVisionConfig() - - >>> # Initializing a PPChart2TableVisionModel (with random weights) from the PP-Chart2Table style configuration - >>> model = PPChart2TableVisionModel(configuration) - - >>> # 
Accessing the model configuration - >>> configuration = model.config + depth (`int`, *optional*, defaults to 12): + Number of transformer encoder layers in the vision backbone. + embed_dim (`int`, *optional*, defaults to 768): + Dimensionality of the patch embedding vectors. + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the hidden layer in the feed-forward network (MLP). + num_channels (`int`, *optional*, defaults to 3): + Number of input channels (3 for RGB images, 1 for grayscale). + image_size (`int`, *optional*, defaults to 1024): + Size (height/width) of the input images (assumed to be square). + mlp_ratio (`float`, *optional*, defaults to 4.0): + Ratio of the hidden layer size to the embedding dimension in the MLP (hidden_size = embed_dim * mlp_ratio). + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each transformer encoder layer. + patch_size (`int`, *optional*, defaults to 16): + Size (height/width) of the image patches extracted from the input image. + qkv_bias (`bool`, *optional*, defaults to True): + Whether to include bias terms in the query, key, value projection layers of self-attention. + use_rel_pos (`bool`, *optional*, defaults to True): + Whether to use relative positional embeddings in the self-attention mechanism. + global_attn_indexes (`Optional[list[int]]`, *optional*, defaults to [2, 5, 8, 11]): + List of layer indexes where global attention (instead of window attention) is applied. + If `None`, defaults to [2, 5, 8, 11]. + window_size (`int`, *optional*, defaults to 14): + Size of the attention window for window-based self-attention (only effective when use_rel_pos=True). + output_channels (`int`, *optional*, defaults to 256): + Dimensionality of the final visual feature output channels. + net_channels (`int`, *optional*, defaults to 512): + Dimensionality of intermediate network channels in the vision backbone. 
+ attention_dropout (`float`, *optional*, defaults to 0.0): + Dropout probability applied to the attention weights. + **kwargs: + Additional keyword arguments passed to the parent `PreTrainedConfig` class. """ model_type = "pp_chart2table_vision" diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index bef1baa5c556..87701c5ded9f 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -1,6 +1,6 @@ import collections from dataclasses import dataclass -from typing import Optional, Union, List +from typing import Optional, Union import torch import torch.nn as nn @@ -48,85 +48,49 @@ class PPChart2TableVisionConfig(PreTrainedConfig): - r""" - This is the configuration class to store the configuration of a [`PPChart2TableVisionModel`]. It is used to instantiate a - PP-Chart2Table vision encoder according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the vision encoder of the PP-Chart2Table - architecture developed by the PaddlePaddle team for chart-to-table parsing tasks. + """ + Configuration class for the vision backbone of PP-Chart2Table model. - Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PreTrainedConfig`] for more information. + This configuration class defines all the hyperparameters for the vision component + of the PP-Chart2Table model, which is responsible for processing chart images + and extracting visual features for table structure recognition and content extraction. 
+ PaddlePaddle/PP-Chart2Table_safetensors [PaddlePaddle/PP-Chart2Table_safetensors] + (https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors) Args: - depth (`int`, *optional*, defaults to 12): - Number of hidden layers in the vision Transformer encoder. - embed_dim (`int`, *optional*, defaults to 768): - Dimensionality of the patch embedding layer in the vision encoder. - hidden_size (`int`, *optional*, defaults to 1024): - Dimensionality of the hidden layers in the vision Transformer encoder. - num_channels (`int`, *optional*, defaults to 3): - Number of input channels for the convolutional stem layer (default: RGB images with 3 channels). - image_size (`int`, *optional*, defaults to 1024): - The size (resolution) of input chart images (assumed to be square). - mlp_ratio (`float`, *optional*, defaults to 4.0): - Ratio of the dimensionality of the feed-forward layer to the hidden size in the vision Transformer blocks. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each self-attention layer in the vision Transformer encoder. - patch_size (`int`, *optional*, defaults to 16): - The size (resolution) of each image patch extracted from the input chart image. - qkv_bias (`bool`, *optional*, defaults to `True`): - Whether to add bias terms to the query, key, and value projection layers in the self-attention mechanism. - use_rel_pos (`bool`, *optional*, defaults to `True`): - Whether to use relative positional embeddings in the self-attention layers of the vision encoder. - global_attn_indexes (`Optional[List[int]]`, *optional*, defaults to `None`): - List of layer indexes where global attention (instead of windowed attention) is applied in the vision encoder. - If `None`, defaults to `[2, 5, 8, 11]`. - window_size (`int`, *optional*, defaults to 14): - The size of the attention window for windowed self-attention in the vision Transformer layers. 
- output_channels (`int`, *optional*, defaults to 256): - Number of output channels from the convolutional stem layer before patch embedding. - net_channels (`int`, *optional*, defaults to 512): - Number of channels in the intermediate convolutional layers. - attention_dropout (`float`, *optional*, defaults to 0.0): - Dropout probability for the attention layers in the vision Transformer. - output_hidden_states (`bool`, *optional*, defaults to `False`): - output_attentions (`bool`, *optional*, defaults to `False`): - return_dict (`bool`, *optional*, defaults to `True`): - dtype (`Union`, *optional*): - tie_word_embeddings (`bool`, *optional*, defaults to `True`): - chunk_size_feed_forward (`int`, *optional*, defaults to 0): - is_encoder_decoder (`bool`, *optional*, defaults to `False`): - is_decoder (`bool`, *optional*, defaults to `False`): - cross_attention_hidden_size (`Optional`, *optional*): - add_cross_attention (`bool`, *optional*, defaults to `False`): - architectures (`Optional`, *optional*): - finetuning_task (`Optional`, *optional*): - id2label (`Optional`, *optional*): - label2id (`Optional`, *optional*): - num_labels (`Optional`, *optional*): - task_specific_params (`Optional`, *optional*): - problem_type (`Optional`, *optional*): - tokenizer_class (`Optional`, *optional*): - prefix (`Optional`, *optional*): - bos_token_id (`Optional`, *optional*): - pad_token_id (`Optional`, *optional*): - eos_token_id (`Optional`, *optional*): - sep_token_id (`Optional`, *optional*): - decoder_start_token_id (`Optional`, *optional*): - - Example: - - ```python - >>> from transformers import PPChart2TableVisionConfig, PPChart2TableVisionModel - - >>> # Initializing a PPChart2TableVisionConfig with default PP-Chart2Table style configuration - >>> configuration = PPChart2TableVisionConfig() - - >>> # Initializing a PPChart2TableVisionModel (with random weights) from the PP-Chart2Table style configuration - >>> model = PPChart2TableVisionModel(configuration) - - >>> # 
Accessing the model configuration - >>> configuration = model.config + depth (`int`, *optional*, defaults to 12): + Number of transformer encoder layers in the vision backbone. + embed_dim (`int`, *optional*, defaults to 768): + Dimensionality of the patch embedding vectors. + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the hidden layer in the feed-forward network (MLP). + num_channels (`int`, *optional*, defaults to 3): + Number of input channels (3 for RGB images, 1 for grayscale). + image_size (`int`, *optional*, defaults to 1024): + Size (height/width) of the input images (assumed to be square). + mlp_ratio (`float`, *optional*, defaults to 4.0): + Ratio of the hidden layer size to the embedding dimension in the MLP (hidden_size = embed_dim * mlp_ratio). + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each transformer encoder layer. + patch_size (`int`, *optional*, defaults to 16): + Size (height/width) of the image patches extracted from the input image. + qkv_bias (`bool`, *optional*, defaults to True): + Whether to include bias terms in the query, key, value projection layers of self-attention. + use_rel_pos (`bool`, *optional*, defaults to True): + Whether to use relative positional embeddings in the self-attention mechanism. + global_attn_indexes (`Optional[list[int]]`, *optional*, defaults to [2, 5, 8, 11]): + List of layer indexes where global attention (instead of window attention) is applied. + If `None`, defaults to [2, 5, 8, 11]. + window_size (`int`, *optional*, defaults to 14): + Size of the attention window for window-based self-attention (only effective when use_rel_pos=True). + output_channels (`int`, *optional*, defaults to 256): + Dimensionality of the final visual feature output channels. + net_channels (`int`, *optional*, defaults to 512): + Dimensionality of intermediate network channels in the vision backbone. 
+ attention_dropout (`float`, *optional*, defaults to 0.0): + Dropout probability applied to the attention weights. + **kwargs: + Additional keyword arguments passed to the parent `PreTrainedConfig` class. """ model_type = "pp_chart2table_vision" @@ -144,7 +108,7 @@ def init( patch_size: int = 16, qkv_bias: bool = True, use_rel_pos: bool = True, - global_attn_indexes: Optional[List[int]] = None, + global_attn_indexes: Optional[list[int]] = None, window_size: int = 14, output_channels: int = 256, net_channels: int = 512, @@ -396,7 +360,6 @@ def __init__( text_config = {} self.text_config = PPChart2TableTextConfig(**text_config) - text_config_keys = [ "attention_dropout", "bos_token_id", From 01f2b299440089ce611c86aaa86e438005fb929f Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Sat, 28 Feb 2026 14:46:13 +0800 Subject: [PATCH 08/60] update --- .../configuration_pp_chart2table.py | 25 +++++++++++-------- .../pp_chart2table/modular_pp_chart2table.py | 25 +++++++++++-------- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py index 7fa3b6d36af8..bca3aa6cb797 100644 --- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py @@ -37,9 +37,9 @@ class PPChart2TableVisionConfig(PreTrainedConfig): Number of attention heads for each transformer encoder layer. patch_size (`int`, *optional*, defaults to 16): Size (height/width) of the image patches extracted from the input image. - qkv_bias (`bool`, *optional*, defaults to True): + qkv_bias (`bool`, *optional*, defaults to `True`): Whether to include bias terms in the query, key, value projection layers of self-attention. 
- use_rel_pos (`bool`, *optional*, defaults to True): + use_rel_pos (`bool`, *optional*, defaults to `True`): Whether to use relative positional embeddings in the self-attention mechanism. global_attn_indexes (`Optional[list[int]]`, *optional*, defaults to [2, 5, 8, 11]): List of layer indexes where global attention (instead of window attention) is applied. @@ -52,14 +52,12 @@ class PPChart2TableVisionConfig(PreTrainedConfig): Dimensionality of intermediate network channels in the vision backbone. attention_dropout (`float`, *optional*, defaults to 0.0): Dropout probability applied to the attention weights. - **kwargs: - Additional keyword arguments passed to the parent `PreTrainedConfig` class. """ model_type = "pp_chart2table_vision" base_config_key = "vision_config" - def init( + def __init__( self, depth: int = 12, embed_dim: int = 768, @@ -93,7 +91,7 @@ def init( self.output_channels = output_channels self.net_channels = net_channels self.attention_dropout = attention_dropout - super().init(**kwargs) + super().__init__(**kwargs) class PPChart2TableTextConfig(PreTrainedConfig): @@ -266,11 +264,16 @@ class PPChart2TableConfig(PreTrainedConfig): documentation from [PreTrainedConfig] for more information. Args: - vision_config (Optional[Dict], optional, defaults to None, *optional*): - text_config (Optional[Dict], optional, defaults to None, *optional*): - image_token_index (Optional[int], optional, defaults to 151859, *optional*, defaults to 151859): - image_seq_length (Optional[int], optional, defaults to 576, *optional*, defaults to 576): - pad_token_id (Optional[int], optional, defaults to -1, *optional*, defaults to -1): + vision_config (Optional[Dict], optional, *optional*):: + The [PPChart2TableVisionConfig] for the vision sub-model. Defaults to None. + text_config (Optional[Dict], optional, *optional*):: + The [PPChart2TableTextConfig] for the text sub-model. Defaults to None. 
+ image_token_index (Optional[int], optional, *optional*, defaults to 151859):: + The index of the image token. Defaults to 151859. + image_seq_length (Optional[int], optional, *optional*, defaults to 576):: + The sequence length for the image. Defaults to 576. + pad_token_id (Optional[int], optional, *optional*, defaults to -1): + The index of the padding token. Defaults to -1. Example: diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index 87701c5ded9f..fca8b6f152d8 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -74,9 +74,9 @@ class PPChart2TableVisionConfig(PreTrainedConfig): Number of attention heads for each transformer encoder layer. patch_size (`int`, *optional*, defaults to 16): Size (height/width) of the image patches extracted from the input image. - qkv_bias (`bool`, *optional*, defaults to True): + qkv_bias (`bool`, *optional*, defaults to `True`): Whether to include bias terms in the query, key, value projection layers of self-attention. - use_rel_pos (`bool`, *optional*, defaults to True): + use_rel_pos (`bool`, *optional*, defaults to `True`): Whether to use relative positional embeddings in the self-attention mechanism. global_attn_indexes (`Optional[list[int]]`, *optional*, defaults to [2, 5, 8, 11]): List of layer indexes where global attention (instead of window attention) is applied. @@ -89,14 +89,12 @@ class PPChart2TableVisionConfig(PreTrainedConfig): Dimensionality of intermediate network channels in the vision backbone. attention_dropout (`float`, *optional*, defaults to 0.0): Dropout probability applied to the attention weights. - **kwargs: - Additional keyword arguments passed to the parent `PreTrainedConfig` class. 
""" model_type = "pp_chart2table_vision" base_config_key = "vision_config" - def init( + def __init__( self, depth: int = 12, embed_dim: int = 768, @@ -130,7 +128,7 @@ def init( self.output_channels = output_channels self.net_channels = net_channels self.attention_dropout = attention_dropout - super().init(**kwargs) + super().__init__(**kwargs) class PPChart2TableTextConfig(PreTrainedConfig): @@ -303,11 +301,16 @@ class PPChart2TableConfig(PreTrainedConfig): documentation from [PreTrainedConfig] for more information. Args: - vision_config (Optional[Dict], optional, defaults to None, *optional*): - text_config (Optional[Dict], optional, defaults to None, *optional*): - image_token_index (Optional[int], optional, defaults to 151859, *optional*, defaults to 151859): - image_seq_length (Optional[int], optional, defaults to 576, *optional*, defaults to 576): - pad_token_id (Optional[int], optional, defaults to -1, *optional*, defaults to -1): + vision_config (Optional[Dict], optional, *optional*):: + The [PPChart2TableVisionConfig] for the vision sub-model. Defaults to None. + text_config (Optional[Dict], optional, *optional*):: + The [PPChart2TableTextConfig] for the text sub-model. Defaults to None. + image_token_index (Optional[int], optional, *optional*, defaults to 151859):: + The index of the image token. Defaults to 151859. + image_seq_length (Optional[int], optional, *optional*, defaults to 576):: + The sequence length for the image. Defaults to 576. + pad_token_id (Optional[int], optional, *optional*, defaults to -1): + The index of the padding token. Defaults to -1. 
Example: From d8cc88110e1f38646ef71f310ab96c6e8bc198f6 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Mon, 2 Mar 2026 17:07:28 +0800 Subject: [PATCH 09/60] update --- .../image_processing_pp_chart2table.py | 8 ++++---- .../image_processing_pp_chart2table_fast.py | 8 ++++---- .../pp_chart2table/modular_pp_chart2table.py | 20 +++++++++---------- .../processing_pp_chart2table.py | 4 ++-- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py index e83a49a99f1b..3f17cb754904 100644 --- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py @@ -130,17 +130,17 @@ def preprocess( input_data_format = infer_channel_dimension_format(images[0]) # transformations - resize_imgs = [] + resize_images = [] if do_resize: for image in images: - img = resize( + image = resize( image, size=(size["height"], size["width"]), resample=resample, input_data_format=input_data_format, ) - resize_imgs.append(img) - images = resize_imgs + resize_images.append(image) + images = resize_images if do_rescale: images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py index 86a6cdb3a672..67007860e29f 100644 --- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py +++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py @@ -72,12 +72,12 @@ def _preprocess( **kwargs, ) -> BatchFeature: data = {} - resize_imgs = [] + resize_images = [] if do_resize: for image in images: - img = self.resize(image, size=size, interpolation=interpolation) - resize_imgs.append(img) - images = 
resize_imgs + image = self.resize(image, size=size, interpolation=interpolation) + resize_images.append(image) + images = resize_images processed_images = [] for image in images: diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index fca8b6f152d8..bb964589ac84 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -497,17 +497,17 @@ def preprocess( input_data_format = infer_channel_dimension_format(images[0]) # transformations - resize_imgs = [] + resize_images = [] if do_resize: for image in images: - img = resize( + image = resize( image, size=(size["height"], size["width"]), resample=resample, input_data_format=input_data_format, ) - resize_imgs.append(img) - images = resize_imgs + resize_images.append(image) + images = resize_images if do_rescale: images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] @@ -583,12 +583,12 @@ def _preprocess( **kwargs, ) -> BatchFeature: data = {} - resize_imgs = [] + resize_images = [] if do_resize: for image in images: - img = self.resize(image, size=size, interpolation=interpolation) - resize_imgs.append(img) - images = resize_imgs + image = self.resize(image, size=size, interpolation=interpolation) + resize_images.append(image) + images = resize_images processed_images = [] for image in images: @@ -633,8 +633,8 @@ def __call__( else: image_inputs = {} img_cnt = len(image_inputs) - _, _, h, _ = image_inputs["pixel_values"].shape - num_patches = h // self.image_processor.patch_size // self.image_processor.merge_size + _, _, height, _ = image_inputs["pixel_values"].shape + num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size prompt = ( "<|im_start|>system\n" "You should follow the instructions carefully and explain your answers in 
detail.<|im_end|><|im_start|>user\n" diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py index a2281c9d5f3e..c9d83d12082b 100644 --- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py @@ -41,8 +41,8 @@ def __call__( else: image_inputs = {} img_cnt = len(image_inputs) - _, _, h, _ = image_inputs["pixel_values"].shape - num_patches = h // self.image_processor.patch_size // self.image_processor.merge_size + _, _, height, _ = image_inputs["pixel_values"].shape + num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size prompt = ( "<|im_start|>system\n" "You should follow the instructions carefully and explain your answers in detail.<|im_end|><|im_start|>user\n" From 117a6cb5ef116561278a15e7ae7955957faa8d6e Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Mon, 9 Mar 2026 16:41:24 +0800 Subject: [PATCH 10/60] update --- .../configuration_pp_chart2table.py | 21 ++ .../image_processing_pp_chart2table_fast.py | 6 + .../pp_chart2table/modeling_pp_chart2table.py | 60 +++-- .../pp_chart2table/modular_pp_chart2table.py | 249 ++++++------------ .../processing_pp_chart2table.py | 16 +- 5 files changed, 156 insertions(+), 196 deletions(-) diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py index bca3aa6cb797..7e0a964b5254 100644 --- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py @@ -8,8 +8,19 @@ from transformers.configuration_utils import PreTrainedConfig, layer_type_validation from transformers.modeling_rope_utils import RopeParameters +from transformers.utils import auto_docstring +@auto_docstring( + custom_intro=""" + This configuration class defines 
all the hyperparameters for the vision component + of the PP-Chart2Table model, which is responsible for processing chart images + and extracting visual features for table structure recognition and content extraction. + PaddlePaddle/PP-Chart2Table_safetensors [PaddlePaddle/PP-Chart2Table_safetensors] + (https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors) + """, + checkpoint="PaddlePaddle/PP-Chart2Table_safetensors", +) class PPChart2TableVisionConfig(PreTrainedConfig): """ Configuration class for the vision backbone of PP-Chart2Table model. @@ -94,6 +105,11 @@ def __init__( super().__init__(**kwargs) +@auto_docstring( + custom_intro=""" + + """, +) class PPChart2TableTextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`PPChart2TableTextModel`]. It is used to instantiate a @@ -248,6 +264,11 @@ def __init__( ) +@auto_docstring( + custom_intro=""" + + """ +) class PPChart2TableConfig(PreTrainedConfig): r""" This is the main configuration class to store the configuration of a [PPChart2TableModel] or [PPChart2TableForConditionalGeneration]. diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py index 67007860e29f..d9a942aab565 100644 --- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py +++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py @@ -12,8 +12,14 @@ from transformers.feature_extraction_utils import BatchFeature from transformers.image_processing_utils_fast import BaseImageProcessorFast from transformers.processing_utils import TensorType +from transformers.utils import auto_docstring +@auto_docstring( + custom_intro=""" + + """ +) class PPChart2TableImageProcessorFast(BaseImageProcessorFast): r""" Fast image processor for the PP-Chart2Table multimodal model, optimized for GPU-accelerated chart image preprocessing. 
diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py index 1c9790b4d1a2..d8e4f28e89f6 100644 --- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py @@ -15,9 +15,9 @@ from transformers.cache_utils import Cache from transformers.generation import GenerationMixin -from transformers.modeling_outputs import ModelOutput +from transformers.modeling_outputs import BaseModelOutputWithPast from transformers.modeling_utils import PreTrainedModel -from transformers.utils import can_return_tuple +from transformers.utils import auto_docstring, can_return_tuple from transformers.utils.generic import check_model_inputs from ...activations import ACT2FN @@ -26,11 +26,10 @@ from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer -from ...modeling_outputs import BaseModelOutputWithPast from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring +from ...utils import TransformersKwargs from ...utils.generic import maybe_autocast from .configuration_pp_chart2table import PPChart2TableConfig, PPChart2TableTextConfig, PPChart2TableVisionConfig @@ -346,6 +345,11 @@ def forward(self, hidden_states): return hidden_states +@auto_docstring( + custom_intro=""" + + """ +) class PPChart2TableVisionPreTrainedModel(PreTrainedModel): r""" Base class for all PP-Chart2Table vision models, inheriting from Hugging Face `PreTrainedModel`. 
@@ -393,6 +397,11 @@ class PPChart2TableVisionPreTrainedModel(PreTrainedModel): } +@auto_docstring( + custom_intro=""" + + """ +) class PPChart2TableVisionModel(PPChart2TableVisionPreTrainedModel): main_input_name = "pixel_values" input_modalities = "image" @@ -661,7 +670,11 @@ def forward( return hidden_states -@auto_docstring +@auto_docstring( + custom_intro=""" + + """ +) class PPChart2TableTextPreTrainedModel(PreTrainedModel): config: PPChart2TableTextConfig base_model_prefix = "model" @@ -837,7 +850,7 @@ def forward( @dataclass -class PPChart2TableModelOutputWithPast(ModelOutput): +class PPChart2TableModelOutputWithPast(BaseModelOutputWithPast): r""" Output class for PPChart2Table multimodal model's forward pass, extending Hugging Face `ModelOutput`. @@ -855,15 +868,11 @@ class PPChart2TableModelOutputWithPast(ModelOutput): Tuple of attention weights from each layer of the text decoder (for debugging/analysis). """ - past_key_values: Optional[Cache] = None - last_hidden_state: Optional[torch.FloatTensor] = None - hidden_states: Optional[tuple[torch.FloatTensor]] = None - attentions: Optional[tuple[torch.FloatTensor]] = None image_hidden_states: Optional[torch.FloatTensor] = None @dataclass -class PPChart2TableCausalLMOutputWithPast(ModelOutput): +class PPChart2TableCausalLMOutputWithPast(BaseModelOutputWithPast): r""" Output class for PP-Chart2Table conditional generation model's forward pass. @@ -873,24 +882,18 @@ class PPChart2TableCausalLMOutputWithPast(ModelOutput): Attributes: logits (`Optional[torch.FloatTensor]`, defaults to `None`): Language modeling logits (shape: `[B, seq_len, vocab_size]`), output from the LM head. - past_key_values (`Optional[Cache]`, defaults to `None`): - Cached attention key/value pairs (inherited from base model output). - last_hidden_state (`Optional[torch.FloatTensor]`, defaults to `None`): - Final hidden states from the text decoder (inherited from base model output). 
- hidden_states (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`): - Tuple of decoder layer hidden states (inherited from base model output). - attentions (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`): - Tuple of decoder layer attention weights (inherited from base model output). """ logits: Optional[torch.FloatTensor] = None - past_key_values: Optional[Cache] = None - last_hidden_state: Optional[torch.FloatTensor] = None - hidden_states: Optional[tuple[torch.FloatTensor]] = None - attentions: Optional[tuple[torch.FloatTensor]] = None + loss: Optional[torch.FloatTensor] = None image_hidden_states: Optional[torch.FloatTensor] = None +@auto_docstring( + custom_intro=""" + + """ +) class PPChart2TablePreTrainedModel(PreTrainedModel): r""" Base class for all PP-Chart2Table multimodal models, inheriting from Hugging Face `PreTrainedModel`. @@ -937,6 +940,11 @@ class PPChart2TablePreTrainedModel(PreTrainedModel): } +@auto_docstring( + custom_intro=""" + Core PP-Chart2Table multimodal model (vision encoder + text decoder) for chart-to-table parsing. + """ +) class PPChart2TableModel(PPChart2TablePreTrainedModel): r""" Core PP-Chart2Table multimodal model (vision encoder + text decoder) for chart-to-table parsing. @@ -1094,6 +1102,12 @@ def forward( ) +@auto_docstring( + custom_intro=""" + PP-Chart2Table model for conditional generation (table text generation from chart images), + extending the core model with a language modeling (LM) head and generation utilities. 
+ """ +) class PPChart2TableForConditionalGeneration(PPChart2TablePreTrainedModel, GenerationMixin): r""" PP-Chart2Table model for conditional generation (table text generation from chart images), diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index bb964589ac84..f6850641fc14 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -10,20 +10,8 @@ from transformers.configuration_utils import PreTrainedConfig, layer_type_validation from transformers.feature_extraction_utils import BatchFeature from transformers.generation import GenerationMixin -from transformers.image_processing_utils import BaseImageProcessor from transformers.image_processing_utils_fast import BaseImageProcessorFast -from transformers.image_transforms import flip_channel_order, resize, to_channel_dimension_format -from transformers.image_utils import ( - ChannelDimension, - ImageInput, - PILImageResampling, - infer_channel_dimension_format, - make_flat_list_of_images, - to_numpy_array, - valid_images, - validate_preprocess_arguments, -) -from transformers.modeling_outputs import ModelOutput +from transformers.modeling_outputs import BaseModelOutputWithPast from transformers.modeling_rope_utils import RopeParameters from transformers.modeling_utils import PreTrainedModel from transformers.models.got_ocr2.modeling_got_ocr2 import ( @@ -40,13 +28,27 @@ Qwen2Model, Qwen2PreTrainedModel, ) -from transformers.processing_utils import ProcessorMixin, TensorType from transformers.utils import ( + auto_docstring, can_return_tuple, - filter_out_non_signature_kwargs, + logging, ) +from transformers.processing_utils import ProcessorMixin, TensorType +from transformers.utils import can_return_tuple +logger = logging.get_logger(__name__) + +@auto_docstring( + custom_intro=""" + This configuration class defines all the 
hyperparameters for the vision component + of the PP-Chart2Table model, which is responsible for processing chart images + and extracting visual features for table structure recognition and content extraction. + PaddlePaddle/PP-Chart2Table_safetensors [PaddlePaddle/PP-Chart2Table_safetensors] + (https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors) + """, + checkpoint="PaddlePaddle/PP-Chart2Table_safetensors", +) class PPChart2TableVisionConfig(PreTrainedConfig): """ Configuration class for the vision backbone of PP-Chart2Table model. @@ -131,6 +133,12 @@ def __init__( super().__init__(**kwargs) +@auto_docstring( + custom_intro=""" + + """, + +) class PPChart2TableTextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`PPChart2TableTextModel`]. It is used to instantiate a @@ -285,6 +293,11 @@ def __init__( ) +@auto_docstring( + custom_intro=""" + + """ +) class PPChart2TableConfig(PreTrainedConfig): r""" This is the main configuration class to store the configuration of a [PPChart2TableModel] or [PPChart2TableForConditionalGeneration]. @@ -389,142 +402,11 @@ def __init__( super().__init__(**kwargs) +@auto_docstring( + custom_intro=""" -class PPChart2TableImageProcessor(BaseImageProcessor): - r""" - Image processor for the PP-Chart2Table multimodal model, optimized for chart image preprocessing tasks. - - This processor handles the complete preprocessing pipeline for chart images, including resizing, rescaling, - normalization, and channel dimension reordering, tailored to the input requirements of the PP-Chart2Table vision encoder. - - Args: - do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the input images to the specified `size`. - size (`dict[str, int]`, *optional*, defaults to `{"height": 256, "width": 256}`): - Dictionary containing the target height and width for resizing. Format: `{"height": int, "width": int}`. 
- resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): - Resampling filter to use when resizing images (e.g., BICUBIC, BILINEAR). - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the pixel values from the range [0, 255] to [0, 1] using `rescale_factor`. - rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Factor to apply for rescaling pixel values (e.g., 1/255 scales 0-255 to 0-1). - do_normalize (`bool`, *optional*, defaults to `True`): - Whether to normalize the input images using `image_mean` and `image_std`. - image_mean (`float` or `list[float]`, *optional*, defaults to `[0.406, 0.456, 0.485]`): - Mean values for image normalization (per channel, RGB order). - image_std (`float` or `list[float]`, *optional*, defaults to `[0.225, 0.224, 0.229]`): - Standard deviation values for image normalization (per channel, RGB order). - patch_size (`int`, *optional*, defaults to 16): - Size of image patches used by the PP-Chart2Table vision encoder (for alignment with model input). - merge_size (`int`, *optional*, defaults to 4): - Size factor for merging image patches (specific to PP-Chart2Table's vision processing pipeline). 
""" - - model_input_names = ["pixel_values"] - - def __init__( - self, - do_resize: bool = True, - size: Optional[dict[str, int]] = None, - resample: Optional[PILImageResampling] = PILImageResampling.BICUBIC, - do_rescale: bool = True, - rescale_factor: Union[int, float] = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, list[float]]] = [0.406, 0.456, 0.485], - image_std: Optional[Union[float, list[float]]] = [0.225, 0.224, 0.229], - patch_size: int = 16, - merge_size: int = 4, - **kwargs, - ) -> None: - super().__init__(**kwargs) - size = size if size is not None else {"height": 256, "width": 256} - - self.do_resize = do_resize - self.size = size - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.resample = resample - self.patch_size = patch_size - self.merge_size = merge_size - - @filter_out_non_signature_kwargs() - def preprocess( - self, - images: ImageInput, - size: Optional[dict[str, int]] = None, - do_resize: Optional[bool] = None, - resample: Optional[PILImageResampling] = None, - do_rescale: Optional[bool] = None, - rescale_factor: Optional[Union[int, float]] = None, - do_normalize: Optional[bool] = None, - image_mean: Optional[Union[float, list[float]]] = None, - image_std: Optional[Union[float, list[float]]] = None, - return_tensors: Optional[Union[TensorType, str]] = None, - data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> BatchFeature: - size = self.size if size is None else size - do_resize = self.do_resize if do_resize is None else do_resize - resample = self.resample if resample is None else resample - do_rescale = self.do_rescale if do_rescale is None else do_rescale - rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor - do_normalize = self.do_normalize if do_normalize is None else 
do_normalize - image_mean = self.image_mean if image_mean is None else image_mean - image_std = self.image_std if image_std is None else image_std - - images = make_flat_list_of_images(images) - - validate_preprocess_arguments( - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - size=size, - do_resize=do_resize, - resample=resample, - ) - - if not valid_images(images): - raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor") - - # All transformations expect numpy arrays - images = [to_numpy_array(image) for image in images] - if input_data_format is None: - input_data_format = infer_channel_dimension_format(images[0]) - - # transformations - resize_images = [] - if do_resize: - for image in images: - image = resize( - image, - size=(size["height"], size["width"]), - resample=resample, - input_data_format=input_data_format, - ) - resize_images.append(image) - images = resize_images - - if do_rescale: - images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] - - if do_normalize: - images = [ - self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images - ] - images = [flip_channel_order(image, input_data_format=input_data_format) for image in images] - images = [ - to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images - ] - - encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) - return encoded_inputs - - +) class PPChart2TableImageProcessorFast(BaseImageProcessorFast): r""" Fast image processor for the PP-Chart2Table multimodal model, optimized for GPU-accelerated chart image preprocessing. 
@@ -603,12 +485,23 @@ def _preprocess( return encoded_inputs +@auto_docstring( + custom_intro=""" + A multi-modal processor for the PPChart2Table model, combining image preprocessing and text tokenization + capabilities to handle chart-to-table conversion tasks. + + This processor integrates `PPChart2TableImageProcessorFast` for chart image preprocessing (e.g., patch-based + resizing) and `Qwen2Tokenizer` for text prompt construction/tokenization. It encapsulates the end-to-end + processing pipeline from raw chart images + text instructions to model-ready input tensors, and also provides + postprocessing logic to decode model outputs back to human-readable table text. + """ +) class PPChart2TableProcessor(ProcessorMixin): r""" - [`PPChart2TableProcessor`] offers all the functionalities of [`PPChart2TableImageProcessor`] and [`Qwen2Tokenizer`]. See the + [`PPChart2TableProcessor`] offers all the functionalities of [`PPChart2TableImageProcessorFast`] and [`Qwen2Tokenizer`]. See the [`~PPChart2TableProcessor.__call__`] and [`~PPChart2TableProcessor.decode`] for more information. Args: - image_processor ([`PPChart2TableImageProcessor`], *optional*): + image_processor ([`PPChart2TableImageProcessorFast`], *optional*): The image processor is a required input. tokenizer ([`Qwen2Tokenizer`], *optional*): The tokenizer is a required input. @@ -713,6 +606,11 @@ def __init__(self, config: PPChart2TableVisionConfig): self.layer_norm2 = PPChart2TableVisionLayerNorm(config.output_channels, data_format="channels_first") +@auto_docstring( + custom_intro=""" + + """ +) class PPChart2TableVisionPreTrainedModel(PreTrainedModel): r""" Base class for all PP-Chart2Table vision models, inheriting from Hugging Face `PreTrainedModel`. 
@@ -760,6 +658,11 @@ class PPChart2TableVisionPreTrainedModel(PreTrainedModel): } +@auto_docstring( + custom_intro=""" + + """ +) class PPChart2TableVisionModel(PPChart2TableVisionPreTrainedModel): main_input_name = "pixel_values" input_modalities = "image" @@ -815,6 +718,11 @@ class PPChart2TableTextDecoderLayer(Qwen2DecoderLayer): pass +@auto_docstring( + custom_intro=""" + + """ +) class PPChart2TableTextPreTrainedModel(Qwen2PreTrainedModel): pass @@ -824,7 +732,7 @@ class PPChart2TableTextModel(Qwen2Model): @dataclass -class PPChart2TableModelOutputWithPast(ModelOutput): +class PPChart2TableModelOutputWithPast(BaseModelOutputWithPast): r""" Output class for PPChart2Table multimodal model's forward pass, extending Hugging Face `ModelOutput`. @@ -841,16 +749,11 @@ class PPChart2TableModelOutputWithPast(ModelOutput): attentions (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`): Tuple of attention weights from each layer of the text decoder (for debugging/analysis). """ - - past_key_values: Optional[Cache] = None - last_hidden_state: Optional[torch.FloatTensor] = None - hidden_states: Optional[tuple[torch.FloatTensor]] = None - attentions: Optional[tuple[torch.FloatTensor]] = None image_hidden_states: Optional[torch.FloatTensor] = None @dataclass -class PPChart2TableCausalLMOutputWithPast(ModelOutput): +class PPChart2TableCausalLMOutputWithPast(BaseModelOutputWithPast): r""" Output class for PP-Chart2Table conditional generation model's forward pass. @@ -860,24 +763,18 @@ class PPChart2TableCausalLMOutputWithPast(ModelOutput): Attributes: logits (`Optional[torch.FloatTensor]`, defaults to `None`): Language modeling logits (shape: `[B, seq_len, vocab_size]`), output from the LM head. - past_key_values (`Optional[Cache]`, defaults to `None`): - Cached attention key/value pairs (inherited from base model output). 
- last_hidden_state (`Optional[torch.FloatTensor]`, defaults to `None`): - Final hidden states from the text decoder (inherited from base model output). - hidden_states (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`): - Tuple of decoder layer hidden states (inherited from base model output). - attentions (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`): - Tuple of decoder layer attention weights (inherited from base model output). """ logits: Optional[torch.FloatTensor] = None - past_key_values: Optional[Cache] = None - last_hidden_state: Optional[torch.FloatTensor] = None - hidden_states: Optional[tuple[torch.FloatTensor]] = None - attentions: Optional[tuple[torch.FloatTensor]] = None + loss: Optional[torch.FloatTensor] = None image_hidden_states: Optional[torch.FloatTensor] = None +@auto_docstring( + custom_intro=""" + + """ +) class PPChart2TablePreTrainedModel(PreTrainedModel): r""" Base class for all PP-Chart2Table multimodal models, inheriting from Hugging Face `PreTrainedModel`. @@ -924,6 +821,11 @@ class PPChart2TablePreTrainedModel(PreTrainedModel): } +@auto_docstring( + custom_intro=""" + Core PP-Chart2Table multimodal model (vision encoder + text decoder) for chart-to-table parsing. + """ +) class PPChart2TableModel(PPChart2TablePreTrainedModel): r""" Core PP-Chart2Table multimodal model (vision encoder + text decoder) for chart-to-table parsing. @@ -1081,6 +983,12 @@ def forward( ) +@auto_docstring( + custom_intro=""" + PP-Chart2Table model for conditional generation (table text generation from chart images), + extending the core model with a language modeling (LM) head and generation utilities. 
+ """ +) class PPChart2TableForConditionalGeneration(PPChart2TablePreTrainedModel, GenerationMixin): r""" PP-Chart2Table model for conditional generation (table text generation from chart images), @@ -1235,7 +1143,6 @@ def prepare_inputs_for_generation( "PPChart2TableVisionModel", "PPChart2TableVisionConfig", "PPChart2TableTextConfig", - "PPChart2TableImageProcessor", "PPChart2TableImageProcessorFast", "PPChart2TableProcessor", ] diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py index c9d83d12082b..1efa2641ff23 100644 --- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py @@ -9,14 +9,26 @@ from transformers.feature_extraction_utils import BatchFeature from transformers.processing_utils import ProcessorMixin +from transformers.utils import auto_docstring +@auto_docstring( + custom_intro=""" + A multi-modal processor for the PPChart2Table model, combining image preprocessing and text tokenization + capabilities to handle chart-to-table conversion tasks. + + This processor integrates `PPChart2TableImageProcessorFast` for chart image preprocessing (e.g., patch-based + resizing) and `Qwen2Tokenizer` for text prompt construction/tokenization. It encapsulates the end-to-end + processing pipeline from raw chart images + text instructions to model-ready input tensors, and also provides + postprocessing logic to decode model outputs back to human-readable table text. + """ +) class PPChart2TableProcessor(ProcessorMixin): r""" - [`PPChart2TableProcessor`] offers all the functionalities of [`PPChart2TableImageProcessor`] and [`Qwen2Tokenizer`]. See the + [`PPChart2TableProcessor`] offers all the functionalities of [`PPChart2TableImageProcessorFast`] and [`Qwen2Tokenizer`]. See the [`~PPChart2TableProcessor.__call__`] and [`~PPChart2TableProcessor.decode`] for more information. 
Args: - image_processor ([`PPChart2TableImageProcessor`], *optional*): + image_processor ([`PPChart2TableImageProcessorFast`], *optional*): The image processor is a required input. tokenizer ([`Qwen2Tokenizer`], *optional*): The tokenizer is a required input. From db1e9a859fc5050415e59070328d7f85cbc85637 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Tue, 10 Mar 2026 15:07:47 +0800 Subject: [PATCH 11/60] refactor image_processor_fast --- .../configuration_pp_chart2table.py | 17 ++- .../image_processing_pp_chart2table_fast.py | 57 ++++---- .../pp_chart2table/modeling_pp_chart2table.py | 126 ++++++++---------- .../pp_chart2table/modular_pp_chart2table.py | 67 ++++++---- 4 files changed, 138 insertions(+), 129 deletions(-) diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py index 7e0a964b5254..a1ce908d6361 100644 --- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py @@ -4,7 +4,6 @@ # the file from the modular. If any change should be done, please apply the change to the # modular_pp_chart2table.py file directly. One of our CI enforces this. 
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -from typing import Optional from transformers.configuration_utils import PreTrainedConfig, layer_type_validation from transformers.modeling_rope_utils import RopeParameters @@ -80,7 +79,7 @@ def __init__( patch_size: int = 16, qkv_bias: bool = True, use_rel_pos: bool = True, - global_attn_indexes: Optional[list[int]] = None, + global_attn_indexes: list[int] | None = None, window_size: int = 14, output_channels: int = 256, net_channels: int = 512, @@ -128,6 +127,8 @@ class PPChart2TableTextConfig(PreTrainedConfig): The token ID representing the beginning of a sequence (BOS) for text generation. eos_token_id (`int`, *optional*, defaults to 151643): The token ID representing the end of a sequence (EOS) for text generation. + pad_token_id (Optional[int], optional, *optional*, defaults to -1): + The index of the padding token. Defaults to -1. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): The non-linear activation function (function or string) in the feed-forward and attention layers of the decoder. 
hidden_size (`int`, *optional*, defaults to 1024): @@ -207,6 +208,7 @@ def __init__( attention_dropout: float = 0.0, bos_token_id: int = 151643, eos_token_id: int = 151643, + pad_token_id: int = -1, hidden_act: str = "silu", hidden_size: int = 1024, initializer_range: float = 0.02, @@ -217,12 +219,12 @@ def __init__( num_key_value_heads: int = 16, rms_norm_eps: float = 1e-06, rope_theta: float = 1000000.0, - rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, + rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, sliding_window: int = 32768, tie_word_embeddings: bool = True, use_cache: bool = True, vocab_size: int = 151860, - layer_types: Optional[list[str]] = None, + layer_types: list[str] | None = None, **kwargs, ): self.vocab_size = vocab_size @@ -257,6 +259,7 @@ def __init__( self.rope_theta = rope_theta self.tie_word_embeddings = tie_word_embeddings super().__init__( + pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, @@ -330,9 +333,9 @@ def __init__( self, vision_config: dict | None = None, text_config: dict | None = None, - image_token_index: Optional[int] = 151859, - image_seq_length: Optional[int] = 576, - pad_token_id: Optional[int] = -1, + image_token_index: int | None = 151859, + image_seq_length: int | None = 576, + pad_token_id: int | None = -1, **kwargs, ): self.image_token_index = image_token_index diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py index d9a942aab565..d70b530acf72 100644 --- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py +++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py @@ -4,13 +4,14 @@ # the file from the modular. If any change should be done, please apply the change to the # modular_pp_chart2table.py file directly. 
One of our CI enforces this. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -from typing import Optional, Union +from typing import Optional import torch -from torchvision.transforms.v2.functional import InterpolationMode +import torchvision.transforms.v2.functional as tvF from transformers.feature_extraction_utils import BatchFeature -from transformers.image_processing_utils_fast import BaseImageProcessorFast +from transformers.image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images +from transformers.image_utils import SizeDict from transformers.processing_utils import TensorType from transformers.utils import auto_docstring @@ -65,37 +66,43 @@ def __init__(self, **kwargs) -> None: def _preprocess( self, - images: list[torch.Tensor], - size: Optional[list[dict[str, int]]], + images: list["torch.Tensor"], do_resize: bool, + size: SizeDict, + interpolation: Optional["tvF.InterpolationMode"], do_rescale: bool, rescale_factor: float, do_normalize: bool, - image_mean: Optional[Union[float, list[float]]], - image_std: Optional[Union[float, list[float]]], - return_tensors: Optional[Union[str, TensorType]], - interpolation: Optional[InterpolationMode] = None, + image_mean: float | list[float] | None, + image_std: float | list[float] | None, + disable_grouping: bool | None, + return_tensors: str | TensorType | None, **kwargs, ) -> BatchFeature: - data = {} - resize_images = [] - if do_resize: - for image in images: - image = self.resize(image, size=size, interpolation=interpolation) - resize_images.append(image) - images = resize_images + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = 
self.resize(image=stacked_images, size=size, interpolation=interpolation) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) - processed_images = [] - for image in images: - image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std) - processed_images.append(image) - images = processed_images + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + # BGR to RGB conversion + stacked_images = stacked_images[:, [2, 1, 0], :, :] + processed_images_grouped[shape] = stacked_images - images = [image[[2, 1, 0], :, :] for image in images] - data.update({"pixel_values": torch.stack(images, dim=0)}) - encoded_inputs = BatchFeature(data, tensor_type=return_tensors) + pixel_values = reorder_images(processed_images_grouped, grouped_images_index) - return encoded_inputs + return BatchFeature( + data={"pixel_values": pixel_values}, + tensor_type=return_tensors, + ) __all__ = ["PPChart2TableImageProcessorFast"] diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py index d8e4f28e89f6..41f03d0d2c57 100644 --- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py @@ -7,7 +7,7 @@ import collections from collections.abc import Callable from dataclasses import dataclass -from typing import Optional, Union +from typing import Optional import torch import torch.nn as nn @@ -18,7 +18,6 @@ from transformers.modeling_outputs import BaseModelOutputWithPast from transformers.modeling_utils import PreTrainedModel from 
transformers.utils import auto_docstring, can_return_tuple -from transformers.utils.generic import check_model_inputs from ...activations import ACT2FN from ...cache_utils import DynamicCache @@ -30,7 +29,8 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs -from ...utils.generic import maybe_autocast +from ...utils.generic import maybe_autocast, merge_with_config_defaults +from ...utils.output_capturing import capture_outputs from .configuration_pp_chart2table import PPChart2TableConfig, PPChart2TableTextConfig, PPChart2TableVisionConfig @@ -457,7 +457,7 @@ def rotate_half(x): @use_kernel_func_from_hub("rotary_pos_emb") -def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): +def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. Args: @@ -465,8 +465,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): k (`torch.Tensor`): The key tensor. cos (`torch.Tensor`): The cosine part of the rotary embedding. sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`, *optional*): - Deprecated and unused. unsqueeze_dim (`int`, *optional*, defaults to 1): The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. 
For example, note @@ -501,7 +499,7 @@ def eager_attention_forward( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - attention_mask: Optional[torch.Tensor], + attention_mask: torch.Tensor | None, scaling: float, dropout: float = 0.0, **kwargs: Unpack[TransformersKwargs], @@ -511,8 +509,7 @@ def eager_attention_forward( attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling if attention_mask is not None: - causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] - attn_weights = attn_weights + causal_mask + attn_weights = attn_weights + attention_mask attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) @@ -546,11 +543,10 @@ def forward( self, hidden_states: torch.Tensor, position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], - past_key_values: Optional[Cache] = None, - cache_position: Optional[torch.LongTensor] = None, + attention_mask: torch.Tensor | None, + past_key_values: Cache | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, torch.Tensor | None]: input_shape = hidden_states.shape[:-1] hidden_shape = (*input_shape, -1, self.head_dim) @@ -562,13 +558,11 @@ def forward( query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_values is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx) - attention_interface: Callable = eager_attention_forward - if self.config._attn_implementation != "eager": - attention_interface = 
ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface( + self.config._attn_implementation, eager_attention_forward + ) attn_output, attn_weights = attention_interface( self, @@ -639,12 +633,11 @@ def __init__(self, config: PPChart2TableTextConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Cache] = None, - use_cache: Optional[bool] = False, - cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: torch.Tensor | None = None, + position_ids: torch.LongTensor | None = None, + past_key_values: Cache | None = None, + use_cache: bool | None = False, + position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -656,7 +649,6 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, - cache_position=cache_position, position_embeddings=position_embeddings, **kwargs, ) @@ -714,9 +706,9 @@ def __init__(self, config: PPChart2TableTextConfig, device=None): @staticmethod def compute_default_rope_parameters( - config: Optional[PPChart2TableTextConfig] = None, + config: PPChart2TableTextConfig | None = None, device: Optional["torch.device"] = None, - seq_len: Optional[int] = None, + seq_len: int | None = None, ) -> tuple["torch.Tensor", float]: """ Computes the inverse frequencies according to the original RoPE implementation @@ -777,17 +769,17 @@ def __init__(self, config: PPChart2TableTextConfig): # Initialize weights and apply final processing self.post_init() - @check_model_inputs + @merge_with_config_defaults + @capture_outputs @auto_docstring def forward( self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: 
Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Cache] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, + input_ids: torch.LongTensor | None = None, + attention_mask: torch.Tensor | None = None, + position_ids: torch.LongTensor | None = None, + past_key_values: Cache | None = None, + inputs_embeds: torch.FloatTensor | None = None, + use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): @@ -799,23 +791,18 @@ def forward( if use_cache and past_key_values is None: past_key_values = DynamicCache(config=self.config) - if cache_position is None: - past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 - cache_position = torch.arange( - past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device - ) - if position_ids is None: - position_ids = cache_position.unsqueeze(0) + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens + position_ids = position_ids.unsqueeze(0) # It may already have been prepared by e.g. 
`generate` if not isinstance(causal_mask_mapping := attention_mask, dict): # Prepare mask arguments mask_kwargs = { "config": self.config, - "input_embeds": inputs_embeds, + "inputs_embeds": inputs_embeds, "attention_mask": attention_mask, - "cache_position": cache_position, "past_key_values": past_key_values, "position_ids": position_ids, } @@ -838,7 +825,6 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, - cache_position=cache_position, **kwargs, ) @@ -868,7 +854,7 @@ class PPChart2TableModelOutputWithPast(BaseModelOutputWithPast): Tuple of attention weights from each layer of the text decoder (for debugging/analysis). """ - image_hidden_states: Optional[torch.FloatTensor] = None + image_hidden_states: torch.FloatTensor | None = None @dataclass @@ -884,9 +870,9 @@ class PPChart2TableCausalLMOutputWithPast(BaseModelOutputWithPast): Language modeling logits (shape: `[B, seq_len, vocab_size]`), output from the LM head. """ - logits: Optional[torch.FloatTensor] = None - loss: Optional[torch.FloatTensor] = None - image_hidden_states: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor | None = None + loss: torch.FloatTensor | None = None + image_hidden_states: torch.FloatTensor | None = None @auto_docstring( @@ -1060,13 +1046,13 @@ def get_placeholder_mask( def forward( self, input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - past_key_values: Optional[list[torch.Tensor]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - pixel_values: Optional[torch.Tensor] = None, - cache_position: Optional[torch.LongTensor] = None, + attention_mask: torch.Tensor | None = None, + position_ids: torch.Tensor | None = None, + past_key_values: list[torch.Tensor] | None = None, + inputs_embeds: torch.Tensor | None = None, + use_cache: bool | None = None, + pixel_values: torch.Tensor | None = None, + 
cache_position: torch.LongTensor | None = None, **kwargs, ): if (input_ids is None) ^ (inputs_embeds is not None): @@ -1158,8 +1144,8 @@ def get_output_embeddings(self) -> nn.Module: def get_image_features( self, pixel_values: torch.FloatTensor, - vision_feature_layer: Optional[Union[int, list[int]]] = None, - vision_feature_select_strategy: Optional[str] = None, + vision_feature_layer: int | list[int] | None = None, + vision_feature_select_strategy: str | None = None, **kwargs, ): return self.model.get_image_features( @@ -1172,18 +1158,18 @@ def get_image_features( @can_return_tuple def forward( self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - pixel_values: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[list[dict]] = None, - logits_to_keep: Union[int, torch.Tensor] = 0, - cache_position: Optional[torch.LongTensor] = None, - past_key_values: Optional[Cache] = None, - use_cache: Optional[bool] = None, + input_ids: torch.LongTensor | None = None, + attention_mask: torch.Tensor | None = None, + position_ids: torch.LongTensor | None = None, + pixel_values: torch.Tensor | None = None, + inputs_embeds: torch.Tensor | None = None, + labels: list[dict] | None = None, + logits_to_keep: int | torch.Tensor = 0, + cache_position: torch.LongTensor | None = None, + past_key_values: Cache | None = None, + use_cache: bool | None = None, **kwargs, - ) -> Union[tuple[torch.FloatTensor], PPChart2TableCausalLMOutputWithPast]: + ) -> tuple[torch.FloatTensor] | PPChart2TableCausalLMOutputWithPast: outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index f6850641fc14..62544c5545e4 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ 
b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -4,13 +4,13 @@ import torch import torch.nn as nn -from torchvision.transforms.v2.functional import InterpolationMode +import torchvision.transforms.v2.functional as tvF from transformers.cache_utils import Cache from transformers.configuration_utils import PreTrainedConfig, layer_type_validation from transformers.feature_extraction_utils import BatchFeature from transformers.generation import GenerationMixin -from transformers.image_processing_utils_fast import BaseImageProcessorFast +from transformers.image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images from transformers.modeling_outputs import BaseModelOutputWithPast from transformers.modeling_rope_utils import RopeParameters from transformers.modeling_utils import PreTrainedModel @@ -36,6 +36,8 @@ from transformers.processing_utils import ProcessorMixin, TensorType from transformers.utils import can_return_tuple +from transformers.image_utils import SizeDict + logger = logging.get_logger(__name__) @@ -157,6 +159,8 @@ class PPChart2TableTextConfig(PreTrainedConfig): The token ID representing the beginning of a sequence (BOS) for text generation. eos_token_id (`int`, *optional*, defaults to 151643): The token ID representing the end of a sequence (EOS) for text generation. + pad_token_id (Optional[int], optional, *optional*, defaults to -1): + The index of the padding token. Defaults to -1. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): The non-linear activation function (function or string) in the feed-forward and attention layers of the decoder. 
hidden_size (`int`, *optional*, defaults to 1024): @@ -236,6 +240,7 @@ def __init__( attention_dropout: float = 0.0, bos_token_id: int = 151643, eos_token_id: int = 151643, + pad_token_id: int = -1, hidden_act: str = "silu", hidden_size: int = 1024, initializer_range: float = 0.02, @@ -286,6 +291,7 @@ def __init__( self.rope_theta = rope_theta self.tie_word_embeddings = tie_word_embeddings super().__init__( + pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, @@ -452,37 +458,44 @@ def __init__(self, **kwargs) -> None: def _preprocess( self, - images: list[torch.Tensor], - size: Optional[list[dict[str, int]]], + images: list["torch.Tensor"], do_resize: bool, + size: SizeDict, + interpolation: Optional["tvF.InterpolationMode"], do_rescale: bool, rescale_factor: float, do_normalize: bool, - image_mean: Optional[Union[float, list[float]]], - image_std: Optional[Union[float, list[float]]], - return_tensors: Optional[Union[str, TensorType]], - interpolation: Optional[InterpolationMode] = None, + image_mean: float | list[float] | None, + image_std: float | list[float] | None, + disable_grouping: bool | None, + return_tensors: str | TensorType | None, **kwargs, ) -> BatchFeature: - data = {} - resize_images = [] - if do_resize: - for image in images: - image = self.resize(image, size=size, interpolation=interpolation) - resize_images.append(image) - images = resize_images - - processed_images = [] - for image in images: - image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std) - processed_images.append(image) - images = processed_images - - images = [image[[2, 1, 0], :, :] for image in images] - data.update({"pixel_values": torch.stack(images, dim=0)}) - encoded_inputs = BatchFeature(data, tensor_type=return_tensors) - - return encoded_inputs + + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + 
resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + # BGR to RGB conversion + stacked_images = stacked_images[:, [2, 1, 0], :, :] + processed_images_grouped[shape] = stacked_images + + pixel_values = reorder_images(processed_images_grouped, grouped_images_index) + + return BatchFeature( + data={"pixel_values": pixel_values}, + tensor_type=return_tensors, + ) @auto_docstring( From d8763e5f3c30270dedb5f99a71b6c0fabd2c0933 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Fri, 13 Mar 2026 11:28:40 +0800 Subject: [PATCH 12/60] update --- .../configuration_pp_chart2table.py | 297 ++---- .../image_processing_pp_chart2table_fast.py | 48 +- .../pp_chart2table/modeling_pp_chart2table.py | 787 +++++++------- .../pp_chart2table/modular_pp_chart2table.py | 957 +++--------------- .../processing_pp_chart2table.py | 30 +- .../test_modeling_pp_chart2table.py | 13 +- 6 files changed, 649 insertions(+), 1483 deletions(-) diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py index a1ce908d6361..7ecb1a2ddd31 100644 --- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py @@ -5,63 +5,41 @@ # modular_pp_chart2table.py file directly. One of our CI enforces this. 
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -from transformers.configuration_utils import PreTrainedConfig, layer_type_validation -from transformers.modeling_rope_utils import RopeParameters -from transformers.utils import auto_docstring +from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...modeling_rope_utils import RopeParameters +from ...utils import auto_docstring @auto_docstring( - custom_intro=""" - This configuration class defines all the hyperparameters for the vision component - of the PP-Chart2Table model, which is responsible for processing chart images - and extracting visual features for table structure recognition and content extraction. - PaddlePaddle/PP-Chart2Table_safetensors [PaddlePaddle/PP-Chart2Table_safetensors] - (https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors) - """, checkpoint="PaddlePaddle/PP-Chart2Table_safetensors", ) class PPChart2TableVisionConfig(PreTrainedConfig): """ - Configuration class for the vision backbone of PP-Chart2Table model. - - This configuration class defines all the hyperparameters for the vision component - of the PP-Chart2Table model, which is responsible for processing chart images - and extracting visual features for table structure recognition and content extraction. - PaddlePaddle/PP-Chart2Table_safetensors [PaddlePaddle/PP-Chart2Table_safetensors] - (https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors) - - Args: - depth (`int`, *optional*, defaults to 12): - Number of transformer encoder layers in the vision backbone. - embed_dim (`int`, *optional*, defaults to 768): - Dimensionality of the patch embedding vectors. - hidden_size (`int`, *optional*, defaults to 1024): - Dimensionality of the hidden layer in the feed-forward network (MLP).
- num_channels (`int`, *optional*, defaults to 3): - Number of input channels (3 for RGB images, 1 for grayscale). - image_size (`int`, *optional*, defaults to 1024): - Size (height/width) of the input images (assumed to be square). - mlp_ratio (`float`, *optional*, defaults to 4.0): - Ratio of the hidden layer size to the embedding dimension in the MLP (hidden_size = embed_dim * mlp_ratio). - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each transformer encoder layer. - patch_size (`int`, *optional*, defaults to 16): - Size (height/width) of the image patches extracted from the input image. - qkv_bias (`bool`, *optional*, defaults to `True`): - Whether to include bias terms in the query, key, value projection layers of self-attention. - use_rel_pos (`bool`, *optional*, defaults to `True`): - Whether to use relative positional embeddings in the self-attention mechanism. - global_attn_indexes (`Optional[list[int]]`, *optional*, defaults to [2, 5, 8, 11]): - List of layer indexes where global attention (instead of window attention) is applied. - If `None`, defaults to [2, 5, 8, 11]. - window_size (`int`, *optional*, defaults to 14): - Size of the attention window for window-based self-attention (only effective when use_rel_pos=True). - output_channels (`int`, *optional*, defaults to 256): - Dimensionality of the final visual feature output channels. - net_channels (`int`, *optional*, defaults to 512): - Dimensionality of intermediate network channels in the vision backbone. - attention_dropout (`float`, *optional*, defaults to 0.0): - Dropout probability applied to the attention weights. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of transformer encoder layers in the vision backbone. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the patch embedding vectors. + num_channels (`int`, *optional*, defaults to 3): + Number of input channels (3 for RGB images, 1 for grayscale). 
+ image_size (`int`, *optional*, defaults to 1024): + Size (height/width) of the input images (assumed to be square). + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each transformer encoder layer. + patch_size (`int`, *optional*, defaults to 16): + Size (height/width) of the image patches extracted from the input image. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to include bias terms in the query, key, value projection layers of self-attention. + use_rel_pos (`bool`, *optional*, defaults to `True`): + Whether to use relative positional embeddings in the self-attention mechanism. + global_attn_indexes (`Optional[list[int]]`, *optional*, defaults to [2, 5, 8, 11]): + List of layer indexes where global attention (instead of window attention) is applied. + If `None`, defaults to [2, 5, 8, 11]. + window_size (`int`, *optional*, defaults to 14): + Size of the attention window for window-based self-attention (only effective when use_rel_pos=True). + output_channels (`int`, *optional*, defaults to 256): + Dimensionality of the final visual feature output channels. + attention_dropout (`float`, *optional*, defaults to 0.0): + Dropout probability applied to the attention weights. 
""" model_type = "pp_chart2table_vision" @@ -69,37 +47,35 @@ class PPChart2TableVisionConfig(PreTrainedConfig): def __init__( self, - depth: int = 12, - embed_dim: int = 768, - hidden_size: int = 1024, - num_channels: int = 3, - image_size: int = 1024, - mlp_ratio: float = 4.0, - num_attention_heads: int = 12, - patch_size: int = 16, - qkv_bias: bool = True, - use_rel_pos: bool = True, - global_attn_indexes: list[int] | None = None, - window_size: int = 14, - output_channels: int = 256, - net_channels: int = 512, - attention_dropout: float = 0.0, + num_hidden_layers=12, + hidden_size=768, + output_channels=256, + mlp_dim=3072, + num_channels=3, + image_size=1024, + num_attention_heads=12, + patch_size=16, + qkv_bias=True, + use_rel_pos=True, + use_abs_pos=False, + global_attn_indexes=[2, 5, 8, 11], + window_size=14, + attention_dropout=0.0, **kwargs, ): - self.depth = depth - self.embed_dim = embed_dim + self.num_hidden_layers = num_hidden_layers self.hidden_size = hidden_size + self.mlp_dim = mlp_dim self.image_size = image_size self.num_channels = num_channels - self.mlp_ratio = mlp_ratio self.num_attention_heads = num_attention_heads self.patch_size = patch_size self.qkv_bias = qkv_bias self.use_rel_pos = use_rel_pos - self.global_attn_indexes = global_attn_indexes if global_attn_indexes is not None else [2, 5, 8, 11] + self.use_abs_pos = use_abs_pos + self.global_attn_indexes = global_attn_indexes self.window_size = window_size self.output_channels = output_channels - self.net_channels = net_channels self.attention_dropout = attention_dropout super().__init__(**kwargs) @@ -111,76 +87,52 @@ def __init__( ) class PPChart2TableTextConfig(PreTrainedConfig): r""" - This is the configuration class to store the configuration of a [`PPChart2TableTextModel`]. It is used to instantiate a - PP-Chart2Table text decoder according to the specified arguments, defining the model architecture. 
Instantiating a - configuration with the defaults will yield a similar configuration to that of the text encoder/decoder of the - PPChart2TableText-7B-beta [Qwen/PPChart2TableText-7B-beta](https://huggingface.co/Qwen/PPChart2TableText-7B-beta) - architecture, optimized for chart-to-table text generation tasks. - - Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PreTrainedConfig`] for more information. - - Args: - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities in self-attention layers. - bos_token_id (`int`, *optional*, defaults to 151643): - The token ID representing the beginning of a sequence (BOS) for text generation. - eos_token_id (`int`, *optional*, defaults to 151643): - The token ID representing the end of a sequence (EOS) for text generation. - pad_token_id (Optional[int], optional, *optional*, defaults to -1): - The index of the padding token. Defaults to -1. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the feed-forward and attention layers of the decoder. - hidden_size (`int`, *optional*, defaults to 1024): - Dimensionality of the hidden representations in the Transformer decoder layers. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - intermediate_size (`int`, *optional*, defaults to 2816): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer decoder blocks. - max_position_embeddings (`int`, *optional*, defaults to 32768): - The maximum sequence length that this model might ever be used with for text input/output. - num_attention_heads (`int`, *optional*, defaults to 16): - Number of attention heads for each self-attention layer in the Transformer decoder. 
- num_hidden_layers (`int`, *optional*, defaults to 24): - Number of hidden layers in the Transformer decoder. - num_key_value_heads (`int`, *optional*, defaults to 16): - Number of key/value heads for implementing Grouped Query Attention (GQA). If equal to `num_attention_heads`, - Multi Head Attention (MHA) is used; if 1, Multi Query Attention (MQA) is used. For more details, see - [this paper](https://huggingface.co/papers/2305.13245). - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon value used by the RMS normalization layers to avoid division by zero. - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE (Rotary Position Embedding) embeddings, controlling the frequency of positional encoding. - rope_parameters (`RopeParameters` or `dict`, *optional*): - Configuration parameters for RoPE embeddings, including scaling parameters for longer sequence lengths beyond - `max_position_embeddings`. - sliding_window (`int`, *optional*, defaults to 32768): - Window size for Sliding Window Attention (SWA) in the decoder layers (only active if `use_sliding_window=True`). - tie_word_embeddings (`bool`, *optional*, defaults to `True`): - Whether the model's input and output word embeddings should be tied (shared weights). - use_cache (`bool`, *optional*, defaults to `True`): - Whether to return the last key/value attention states to speed up sequential decoding (only relevant for autoregressive - generation). - vocab_size (`int`, *optional*, defaults to 151860): - Vocabulary size of the PPChart2TableText model. Defines the number of distinct tokens that can be represented - by `input_ids`. - layer_types (`list[str]`, *optional*): - Attention pattern for each decoder layer (e.g., `"full_attention"` or `"sliding_attention"`). If not specified, - automatically determined by `sliding_window`. 
- - Example: - - ```python - >>> from transformers import PPChart2TableTextConfig, PPChart2TableTextModel - - >>> # Initializing a PPChart2TableText style configuration - >>> configuration = PPChart2TableTextConfig() - - >>> # Initializing a model from the PPChart2TableText-7B style configuration - >>> model = PPChart2TableTextModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities in self-attention layers. + bos_token_id (`int`, *optional*, defaults to 151643): + The token ID representing the beginning of a sequence (BOS) for text generation. + eos_token_id (`int`, *optional*, defaults to 151643): + The token ID representing the end of a sequence (EOS) for text generation. + pad_token_id (Optional[int], optional, *optional*, defaults to -1): + The index of the padding token. Defaults to -1. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the feed-forward and attention layers of the decoder. + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the hidden representations in the Transformer decoder layers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + intermediate_size (`int`, *optional*, defaults to 2816): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer decoder blocks. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with for text input/output. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each self-attention layer in the Transformer decoder. 
+ num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer decoder. + num_key_value_heads (`int`, *optional*, defaults to 16): + Number of key/value heads for implementing Grouped Query Attention (GQA). If equal to `num_attention_heads`, + Multi Head Attention (MHA) is used; if 1, Multi Query Attention (MQA) is used. For more details, see + [this paper](https://huggingface.co/papers/2305.13245). + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon value used by the RMS normalization layers to avoid division by zero. + rope_theta (`float`, *optional*, defaults to 1000000.0): + The base period of the RoPE (Rotary Position Embedding) embeddings, controlling the frequency of positional encoding. + rope_parameters (`RopeParameters` or `dict`, *optional*): + Configuration parameters for RoPE embeddings, including scaling parameters for longer sequence lengths beyond + `max_position_embeddings`. + sliding_window (`int`, *optional*, defaults to 32768): + Window size for Sliding Window Attention (SWA) in the decoder layers (only active if `use_sliding_window=True`). + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether the model's input and output word embeddings should be tied (shared weights). + use_cache (`bool`, *optional*, defaults to `True`): + Whether to return the last key/value attention states to speed up sequential decoding (only relevant for autoregressive + generation). + vocab_size (`int`, *optional*, defaults to 151860): + Vocabulary size of the PPChart2TableText model. Defines the number of distinct tokens that can be represented + by `input_ids`. + layer_types (`list[str]`, *optional*): + Attention pattern for each decoder layer (e.g., `"full_attention"` or `"sliding_attention"`). If not specified, + automatically determined by `sliding_window`. 
""" model_type = "pp_chart2table_text" @@ -274,53 +226,20 @@ def __init__( ) class PPChart2TableConfig(PreTrainedConfig): r""" - This is the main configuration class to store the configuration of a [PPChart2TableModel] or [PPChart2TableForConditionalGeneration]. - It is used to instantiate a PP-Chart2Table multimodal model according to the specified arguments, defining the vision and text - sub-model architectures. This configuration class inherits from [PreTrainedConfig] and combines the configurations of: - [PPChart2TableVisionConfig] (for the chart vision encoder) - [PPChart2TableTextConfig] (for the table text decoder) - PP-Chart2Table PaddlePaddle/PP-Chart2Table_safetensors. - - Instantiating a PPChart2TableConfig with the defaults will yield a similar configuration to the base PP-Chart2Table model - developed by the PaddlePaddle team for chart-to-table parsing tasks. - - Configuration objects inherit from [PreTrainedConfig] and can be used to control the model outputs. Read the - documentation from [PreTrainedConfig] for more information. - - Args: - vision_config (Optional[Dict], optional, *optional*):: - The [PPChart2TableVisionConfig] for the vision sub-model. Defaults to None. - text_config (Optional[Dict], optional, *optional*):: - The [PPChart2TableTextConfig] for the text sub-model. Defaults to None. - image_token_index (Optional[int], optional, *optional*, defaults to 151859):: - The index of the image token. Defaults to 151859. - image_seq_length (Optional[int], optional, *optional*, defaults to 576):: - The sequence length for the image. Defaults to 576. - pad_token_id (Optional[int], optional, *optional*, defaults to -1): - The index of the padding token. Defaults to -1. 
- - Example: - - ```python - >>> from transformers import PPChart2TableConfig, PPChart2TableModel - - >>> # Initializing a PPChart2Table configuration with default vision and text sub-configs - >>> configuration = PPChart2TableConfig() - - >>> # Initializing a PPChart2Table configuration with custom vision and text sub-configs - >>> vision_config = {"image_size": 512, "patch_size": 8} - >>> text_config = {"hidden_size": 2048, "num_hidden_layers": 16} - >>> configuration = PPChart2TableConfig(vision_config=vision_config, text_config=text_config) - - >>> # Initializing a model from the PPChart2Table configuration - >>> model = PPChart2TableModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - >>> # Accessing the vision sub-config - >>> vision_config = configuration.vision_config - >>> # Accessing the text sub-config - >>> text_config = configuration.text_config + vision_config (Optional[Dict], optional, *optional*):: + The [PPChart2TableVisionConfig] for the vision sub-model. Defaults to None. + text_config (Optional[Dict], optional, *optional*):: + The [PPChart2TableTextConfig] for the text sub-model. Defaults to None. + image_token_index (Optional[int], optional, *optional*, defaults to 151859):: + The index of the image token. Defaults to 151859. + image_seq_length (Optional[int], optional, *optional*, defaults to 576):: + The sequence length for the image. Defaults to 576. + pad_token_id (Optional[int], optional, *optional*, defaults to -1): + The index of the padding token. Defaults to -1. + net_channels (`int`, *optional*, defaults to 512): + Dimensionality of intermediate network channels in the vision backbone. + output_channels (`int`, *optional*, defaults to 1024): + Dimensionality of intermediate network channels in the vision backbone. 
""" model_type = "pp_chart2table" @@ -336,11 +255,15 @@ def __init__( image_token_index: int | None = 151859, image_seq_length: int | None = 576, pad_token_id: int | None = -1, + net_channels: int | None = 512, + output_channels: int | None = 1024, **kwargs, ): self.image_token_index = image_token_index self.image_seq_length = image_seq_length self.pad_token_id = pad_token_id + self.net_channels = net_channels + self.output_channels = output_channels if vision_config is None: vision_config = {} diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py index d70b530acf72..5021229fe168 100644 --- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py +++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py @@ -9,48 +9,15 @@ import torch import torchvision.transforms.v2.functional as tvF -from transformers.feature_extraction_utils import BatchFeature -from transformers.image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images -from transformers.image_utils import SizeDict -from transformers.processing_utils import TensorType -from transformers.utils import auto_docstring +from ...feature_extraction_utils import BatchFeature +from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images +from ...image_utils import SizeDict +from ...processing_utils import TensorType +from ...utils import auto_docstring -@auto_docstring( - custom_intro=""" - - """ -) +@auto_docstring class PPChart2TableImageProcessorFast(BaseImageProcessorFast): - r""" - Fast image processor for the PP-Chart2Table multimodal model, optimized for GPU-accelerated chart image preprocessing. 
- - This high-performance processor implements a streamlined preprocessing pipeline for chart images (resizing, rescaling, - normalization, channel reordering) using PyTorch tensor operations, designed for efficient batch processing on GPUs. - It inherits from [`BaseImageProcessorFast`] and is optimized for inference/training pipelines requiring low-latency - image preprocessing. - - Class Attributes (Default Configuration): - resample (`int`, defaults to 3): - Integer identifier for the resampling filter (3 = BICUBIC, compatible with `InterpolationMode.BICUBIC`). - image_mean (`list[float]`, defaults to `[0.40821073, 0.4578275, 0.48145466]`): - Per-channel mean values for image normalization (RGB order). - image_std (`list[float]`, defaults to `[0.27577711, 0.26130258, 0.26862954]`): - Per-channel standard deviation values for image normalization (RGB order). - size (`dict[str, int]`, defaults to `{"height": 1024, "width": 1024}`): - Default target size for image resizing (1024x1024, optimized for PP-Chart2Table vision encoder). - patch_size (`int`, defaults to 16): - Size of image patches used by the PP-Chart2Table vision encoder (for alignment with model input). - merge_size (`int`, defaults to 4): - Size factor for merging image patches (specific to PP-Chart2Table's vision processing pipeline). - do_resize (`bool`, defaults to `True`): - Default flag to enable image resizing. - do_rescale (`bool`, defaults to `True`): - Default flag to enable pixel value rescaling (from [0,255] to [0,1]). - do_normalize (`bool`, defaults to `True`): - Default flag to enable image normalization. 
- """ - resample = 3 image_mean = [0.40821073, 0.4578275, 0.48145466] image_std = [0.27577711, 0.26130258, 0.26862954] @@ -61,9 +28,6 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True - def __init__(self, **kwargs) -> None: - super().__init__(**kwargs) - def _preprocess( self, images: list["torch.Tensor"], diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py index 41f03d0d2c57..033daab0bf44 100644 --- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py @@ -13,101 +13,24 @@ import torch.nn as nn import torch.nn.functional as F -from transformers.cache_utils import Cache -from transformers.generation import GenerationMixin -from transformers.modeling_outputs import BaseModelOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import auto_docstring, can_return_tuple - +from ... 
import initialization as init from ...activations import ACT2FN -from ...cache_utils import DynamicCache +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling, ModelOutput from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update -from ...modeling_utils import ALL_ATTENTION_FUNCTIONS +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack -from ...utils import TransformersKwargs +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, torch_compilable_check from ...utils.generic import maybe_autocast, merge_with_config_defaults from ...utils.output_capturing import capture_outputs from .configuration_pp_chart2table import PPChart2TableConfig, PPChart2TableTextConfig, PPChart2TableVisionConfig -class PPChart2TableVisionPatchEmbed(nn.Module): - """ - This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial - `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a - Transformer. 
- """ - - def __init__(self, config): - super().__init__() - image_size, patch_size = config.image_size, config.patch_size - num_channels, hidden_size = config.num_channels, config.embed_dim - image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) - patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.num_patches = num_patches - self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) - - def forward(self, pixel_values): - batch_size, num_channels, height, width = pixel_values.shape - if num_channels != self.num_channels: - raise ValueError( - "Make sure that the channel dimension of the pixel values match with the one set in the configuration." - ) - if height != self.image_size[0] or width != self.image_size[1]: - raise ValueError( - f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." - ) - embeddings = self.projection(pixel_values).permute(0, 2, 3, 1) - return embeddings - - -class PPChart2TableVisionMLPBlock(nn.Module): - def __init__(self, config) -> None: - super().__init__() - self.lin1 = nn.Linear(config.embed_dim, int(config.embed_dim * config.mlp_ratio)) - self.lin2 = nn.Linear(int(config.embed_dim * config.mlp_ratio), config.embed_dim) - self.act = ACT2FN[config.hidden_act] - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.lin1(hidden_states) - hidden_states = self.act(hidden_states) - hidden_states = self.lin2(hidden_states) - return hidden_states - - -class PPChart2TableVisionLayerNorm(nn.LayerNorm): - r"""LayerNorm that supports two data formats: channels_last (default) or channels_first. 
- The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height, - width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width). - """ - - def __init__(self, normalized_shape, *, eps=1e-6, data_format="channels_last", **kwargs): - super().__init__(normalized_shape, eps=eps, **kwargs) - if data_format not in ["channels_last", "channels_first"]: - raise NotImplementedError(f"Unsupported data format: {data_format}") - self.data_format = data_format - - def forward(self, features: torch.Tensor) -> torch.Tensor: - """ - Args: - features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels) - """ - if self.data_format == "channels_first": - features = features.permute(0, 2, 3, 1) - features = super().forward(features) - features = features.permute(0, 3, 1, 2) - else: - features = super().forward(features) - return features - - class PPChart2TableVisionAttention(nn.Module): """Multi-head Attention block with relative position embeddings.""" @@ -120,11 +43,12 @@ def __init__(self, config, window_size): ) self.num_attention_heads = config.num_attention_heads - head_dim = config.embed_dim // config.num_attention_heads + head_dim = config.hidden_size // config.num_attention_heads self.scale = head_dim**-0.5 self.dropout = config.attention_dropout - self.qkv = nn.Linear(config.embed_dim, config.embed_dim * 3, bias=config.qkv_bias) - self.proj = nn.Linear(config.embed_dim, config.embed_dim) + + self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.qkv_bias) + self.proj = nn.Linear(config.hidden_size, config.hidden_size) self.use_rel_pos = config.use_rel_pos if self.use_rel_pos: @@ -240,14 +164,53 @@ def forward(self, hidden_states: torch.Tensor, output_attentions=None) -> tuple[ return attn_output, attn_weights -class PPChart2TableVisionDecoderLayer(GradientCheckpointingLayer): - def __init__(self, config, 
window_size) -> None: +@auto_docstring +class PPChart2TableVisionPreTrainedModel(PreTrainedModel): + config: PPChart2TableConfig + base_model_prefix = "model" + input_modalities = ("image", "text") + supports_gradient_checkpointing = True + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn = False + _supports_sdpa = False + + _can_compile_fullgraph = True + _supports_flex_attn = False + _supports_attention_backend = True + + @torch.no_grad() + def _init_weights(self, module): + super()._init_weights(module) + if isinstance(module, PPChart2TableVisionAttention): + if module.use_rel_pos: + init.zeros_(module.rel_pos_h) + init.zeros_(module.rel_pos_w) + elif isinstance(module, PPChart2TableVisionEncoder): + if module.pos_embed is not None: + init.zeros_(module.pos_embed) + + +class PPChart2TableMLPBlock(nn.Module): + def __init__(self, config): super().__init__() - self.layer_norm1 = nn.LayerNorm(config.embed_dim) - self.attn = PPChart2TableVisionAttention(config, window_size=window_size) + self.lin1 = nn.Linear(config.hidden_size, config.mlp_dim) + self.lin2 = nn.Linear(config.mlp_dim, config.hidden_size) + self.act = ACT2FN[config.hidden_act] + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.lin1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.lin2(hidden_states) + return hidden_states - self.layer_norm2 = nn.LayerNorm(config.embed_dim) - self.mlp = PPChart2TableVisionMLPBlock(config) + +class PPChart2TableVisionLayer(GradientCheckpointingLayer): + def __init__(self, config, window_size): + super().__init__() + self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attn = PPChart2TableVisionAttention(config, window_size) + self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.mlp = PPChart2TableMLPBlock(config) self.window_size = window_size def window_partition(self, hidden_states: torch.Tensor, window_size: 
int) -> tuple[torch.Tensor, tuple[int, int]]: @@ -325,15 +288,95 @@ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.FloatTensor]: return hidden_states +@dataclass +@auto_docstring( + custom_intro=""" + Base class for pp_chart2table vision model's outputs that also contains image embeddings obtained by applying the projection + layer to the pooler_output. + """ +) +class PPChart2TableVisionEncoderOutput(ModelOutput): + r""" + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + """ + + image_embeds: torch.FloatTensor | None = None + last_hidden_state: torch.FloatTensor | None = None + hidden_states: tuple[torch.FloatTensor, ...] | None = None + attentions: tuple[torch.FloatTensor, ...] | None = None + + +class PPChart2TablePatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. 
+ """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values): + batch_size, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." + ) + embeddings = self.projection(pixel_values).permute(0, 2, 3, 1) + return embeddings + + +class PPChart2TableLayerNorm(nn.LayerNorm): + r"""LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height, + width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width). 
+ """ + + def __init__(self, normalized_shape, *, eps=1e-6, data_format="channels_last", **kwargs): + super().__init__(normalized_shape, eps=eps, **kwargs) + if data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError(f"Unsupported data format: {data_format}") + self.data_format = data_format + + def forward(self, features: torch.Tensor) -> torch.Tensor: + """ + Args: + features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels) + """ + if self.data_format == "channels_first": + features = features.permute(0, 2, 3, 1) + features = super().forward(features) + features = features.permute(0, 3, 1, 2) + else: + features = super().forward(features) + return features + + class PPChart2TableVisionNeck(nn.Module): def __init__(self, config: PPChart2TableVisionConfig): super().__init__() self.config = config - self.conv1 = nn.Conv2d(config.embed_dim, config.output_channels, kernel_size=1, bias=False) - self.layer_norm1 = PPChart2TableVisionLayerNorm(config.output_channels, data_format="channels_first") + self.conv1 = nn.Conv2d(config.hidden_size, config.output_channels, kernel_size=1, bias=False) + self.layer_norm1 = PPChart2TableLayerNorm(config.output_channels, data_format="channels_first") self.conv2 = nn.Conv2d(config.output_channels, config.output_channels, kernel_size=3, padding=1, bias=False) - self.layer_norm2 = PPChart2TableVisionLayerNorm(config.output_channels, data_format="channels_first") + self.layer_norm2 = PPChart2TableLayerNorm(config.output_channels, data_format="channels_first") def forward(self, hidden_states): hidden_states = hidden_states.permute(0, 3, 1, 2) @@ -345,108 +388,77 @@ def forward(self, hidden_states): return hidden_states -@auto_docstring( - custom_intro=""" - - """ -) -class PPChart2TableVisionPreTrainedModel(PreTrainedModel): - r""" - Base class for all PP-Chart2Table vision models, inheriting from Hugging Face `PreTrainedModel`. 
- - This class sets up core configurations and compatibility flags for the vision encoder, including: - - Support for gradient checkpointing, attention backends (FlashAttention/SDPA), and model compilation - - Definition of non-splittable modules (for tensor parallelism) - - Output recording for hidden states/attentions (for debugging/analysis) - - Class Attributes: - config (`PPChart2TableVisionConfig`): - Typed config class for PP-Chart2Table vision encoder (enforces type checking). - base_model_prefix (`str`, defaults to `"model"`): - Prefix for base model parameters (used in weight loading/saving). - supports_gradient_checkpointing (`bool`, defaults to `True`): - Whether the model supports gradient checkpointing to save memory. - _no_split_modules (`list[str]`): - Modules that should not be split across devices (tensor parallelism compatibility). - _skip_keys_device_placement (`list[str]`): - Keys to skip when placing tensors on devices (e.g., past key values for generation). - _supports_flash_attn / _supports_sdpa / _supports_flex_attn (`bool`): - Compatibility with optimized attention implementations (FlashAttention, SDPA, FlexAttention). - _can_compile_fullgraph (`bool`, defaults to `True`): - Whether the model supports TorchScript/TorchCompile full graph compilation. - _supports_attention_backend (`bool`, defaults to `True`): - Whether the model supports switching attention backends (e.g., PyTorch vs FlashAttention). - _can_record_outputs (`dict`): - Mapping of output types to modules for recording intermediate outputs (hidden_states/attentions). 
- """ - - config: PPChart2TableVisionConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["PPChart2TableVisionDecoderLayer"] - _skip_keys_device_placement = ["past_key_values"] - _supports_flash_attn = True - _supports_sdpa = True - _supports_flex_attn = True - - _can_compile_fullgraph = True - _supports_attention_backend = True - _can_record_outputs = { - "hidden_states": PPChart2TableVisionDecoderLayer, - "attentions": PPChart2TableVisionAttention, - } - +class PPChart2TableVisionEncoder(PPChart2TableVisionPreTrainedModel): + _can_record_outputs = {"hidden_states": PPChart2TableVisionLayer, "attentions": PPChart2TableVisionAttention} + input_modalities = ("image",) -@auto_docstring( - custom_intro=""" - - """ -) -class PPChart2TableVisionModel(PPChart2TableVisionPreTrainedModel): - main_input_name = "pixel_values" - input_modalities = "image" - - def __init__( - self, - config: PPChart2TableVisionConfig, - ) -> None: + def __init__(self, config: PPChart2TableVisionConfig): super().__init__(config) + self.config = config self.image_size = config.image_size - - self.patch_embed = PPChart2TableVisionPatchEmbed(config) - - self.pos_embed = nn.Parameter( - torch.zeros( - 1, config.image_size // config.patch_size, config.image_size // config.patch_size, config.embed_dim + self.patch_embed = PPChart2TablePatchEmbeddings(config) + + self.pos_embed = None + if config.use_abs_pos: + # Initialize absolute positional embedding with pretrain image size. 
+ self.pos_embed = nn.Parameter( + torch.zeros( + 1, + config.image_size // config.patch_size, + config.image_size // config.patch_size, + config.hidden_size, + ) ) - ) - self.blocks = nn.ModuleList() - for i in range(config.depth): - block = PPChart2TableVisionDecoderLayer( + self.layers = nn.ModuleList() + for i in range(config.num_hidden_layers): + layer = PPChart2TableVisionLayer( config, window_size=config.window_size if i not in config.global_attn_indexes else 0, ) - self.blocks.append(block) + self.layers.append(layer) self.neck = PPChart2TableVisionNeck(config) - self.net_2 = nn.Conv2d( - config.output_channels, config.net_channels, kernel_size=3, stride=2, padding=1, bias=False - ) - self.net_3 = nn.Conv2d(config.net_channels, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False) - + self.gradient_checkpointing = False self.post_init() - def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor: - hidden_states = self.patch_embed(hidden_states) - hidden_states = hidden_states + self.pos_embed - for block in self.blocks: - hidden_states = block(hidden_states) + def get_input_embeddings(self): + return self.patch_embed + + @merge_with_config_defaults + @capture_outputs(tie_last_hidden_states=False) + def forward( + self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs] + ) -> tuple | PPChart2TableVisionEncoderOutput: + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.patch_embed(pixel_values) + if self.pos_embed is not None: + hidden_states = hidden_states + self.pos_embed + for layer_module in self.layers: + hidden_states = layer_module(hidden_states) hidden_states = self.neck(hidden_states) - hidden_states = self.net_2(hidden_states) - hidden_states = self.net_3(hidden_states) - return hidden_states + return PPChart2TableVisionEncoderOutput( + last_hidden_state=hidden_states, + ) + + +class PPChart2TableTextMLP(nn.Module): + def 
__init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj def rotate_half(x): @@ -581,22 +593,6 @@ def forward( return attn_output, attn_weights -class PPChart2TableTextMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - return down_proj - - @use_kernel_forward_from_hub("RMSNorm") class PPChart2TableTextRMSNorm(nn.Module): def __init__(self, hidden_size, eps: float = 1e-6) -> None: @@ -662,11 +658,7 @@ def forward( return hidden_states -@auto_docstring( - custom_intro=""" - - """ -) +@auto_docstring class PPChart2TableTextPreTrainedModel(PreTrainedModel): config: PPChart2TableTextConfig base_model_prefix = "model" @@ -838,148 +830,62 @@ def forward( @dataclass class PPChart2TableModelOutputWithPast(BaseModelOutputWithPast): r""" - Output class for PPChart2Table multimodal model's forward pass, extending Hugging Face `ModelOutput`. 
- - This dataclass encapsulates the core outputs of the PP-Chart2Table base model, including hidden states, - attention weights, and cached key/value pairs for efficient generation. - - Attributes: - past_key_values (`Optional[Cache]`, defaults to `None`): - Cached attention key/value pairs from the text decoder (for fast autoregressive generation). - last_hidden_state (`Optional[torch.FloatTensor]`, defaults to `None`): - Final hidden states from the text decoder (shape: `[B, seq_len, hidden_size]`), after multimodal fusion. - hidden_states (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`): - Tuple of hidden states from each layer of the text decoder (for debugging/analysis). - attentions (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`): - Tuple of attention weights from each layer of the text decoder (for debugging/analysis). + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. """ image_hidden_states: torch.FloatTensor | None = None -@dataclass -class PPChart2TableCausalLMOutputWithPast(BaseModelOutputWithPast): - r""" - Output class for PP-Chart2Table conditional generation model's forward pass. - - Extends `PPChart2TableModelOutputWithPast` with language modeling logits (for token prediction), - tailored for autoregressive table generation tasks. 
- - Attributes: - logits (`Optional[torch.FloatTensor]`, defaults to `None`): - Language modeling logits (shape: `[B, seq_len, vocab_size]`), output from the LM head. - """ - - logits: torch.FloatTensor | None = None - loss: torch.FloatTensor | None = None - image_hidden_states: torch.FloatTensor | None = None - - -@auto_docstring( - custom_intro=""" - - """ -) +@auto_docstring class PPChart2TablePreTrainedModel(PreTrainedModel): - r""" - Base class for all PP-Chart2Table multimodal models, inheriting from Hugging Face `PreTrainedModel`. - - This class defines core configurations and compatibility flags for the multimodal model (vision + text), - including support for gradient checkpointing, optimized attention backends, and model compilation. - - Class Attributes: - config (`PPChart2TableConfig`): - Typed config class for PP-Chart2Table (combines vision + text sub-configs). - base_model_prefix (`str`, defaults to `"model"`): - Prefix for base model parameters (used in weight loading/saving). - supports_gradient_checkpointing (`bool`, defaults to `True`): - Whether the model supports gradient checkpointing to save memory during training. - _no_split_modules (`list[str]`): - Modules that should not be split across devices (tensor parallelism compatibility). - _skip_keys_device_placement (`list[str]`): - Keys to skip when placing tensors on devices (e.g., past key values for generation). - _supports_flash_attn / _supports_sdpa / _supports_flex_attn (`bool`): - Compatibility with optimized attention implementations (FlashAttention, SDPA, FlexAttention). - _can_compile_fullgraph (`bool`, defaults to `True`): - Whether the model supports TorchScript/TorchCompile full graph compilation. - _supports_attention_backend (`bool`, defaults to `True`): - Whether the model supports switching attention backends (e.g., PyTorch vs FlashAttention). - _can_record_outputs (`dict`): - Mapping of output types to modules for recording intermediate outputs (hidden_states/attentions). 
- """ - config: PPChart2TableConfig base_model_prefix = "model" + input_modalities = ("image", "text") supports_gradient_checkpointing = True - _no_split_modules = ["PPChart2TableTextDecoderLayer"] - _skip_keys_device_placement = ["past_key_values"] - _supports_flash_attn = True - _supports_sdpa = True - _supports_flex_attn = True + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn = False + _supports_sdpa = False _can_compile_fullgraph = True + _supports_flex_attn = False _supports_attention_backend = True - _can_record_outputs = { - "hidden_states": PPChart2TableTextDecoderLayer, - "attentions": PPChart2TableTextAttention, - } + @torch.no_grad() + def _init_weights(self, module): + super()._init_weights(module) + if isinstance(module, PPChart2TableVisionAttention): + if module.use_rel_pos: + init.zeros_(module.rel_pos_h) + init.zeros_(module.rel_pos_w) + elif isinstance(module, PPChart2TableVisionEncoder): + if module.pos_embed is not None: + init.zeros_(module.pos_embed) -@auto_docstring( - custom_intro=""" - Core PP-Chart2Table multimodal model (vision encoder + text decoder) for chart-to-table parsing. - """ -) +@auto_docstring class PPChart2TableModel(PPChart2TablePreTrainedModel): - r""" - Core PP-Chart2Table multimodal model (vision encoder + text decoder) for chart-to-table parsing. - - This model integrates a vision encoder (for chart image feature extraction) and a text decoder (for table generation), - with a multimodal projection layer to align vision features with text embedding space. The core logic is: - 1. Extract chart features via vision encoder - 2. Project vision features to text embedding dimension - 3. Inject vision features into text decoder inputs (replace image placeholder tokens) - 4. Forward pass through text decoder to generate table text - - Args: - config (`PPChart2TableConfig`): - Combined configuration class (includes vision_config and text_config sub-configs). 
- - Inputs (forward method): - input_ids (`torch.LongTensor`, optional): - Tokenized input text (including image placeholder tokens) with shape `[B, seq_len]`. - attention_mask (`torch.Tensor`, optional): - Attention mask to avoid padding tokens (shape: `[B, seq_len]`). - position_ids (`torch.Tensor`, optional): - Positional indices for input tokens (shape: `[B, seq_len]`). - past_key_values (`list[torch.Tensor]`, optional): - Cached key/value pairs for fast autoregressive generation. - inputs_embeds (`torch.Tensor`, optional): - Precomputed input embeddings (shape: `[B, seq_len, hidden_size]`; overrides `input_ids`). - use_cache (`bool`, optional): - Whether to cache key/value pairs for generation. - pixel_values (`torch.Tensor`, optional): - Preprocessed chart images (shape: `[B, 3, H, W]`; required for multimodal input). - cache_position (`torch.LongTensor`, optional): - Position indices for cached key/value pairs (for generation). - **kwargs: - Additional arguments passed to the text decoder. - - Outputs: - `PPChart2TableModelOutputWithPast`: - Contains the text decoder's final hidden states, cached key/values, and optional intermediate outputs. 
- """ - - config_class = PPChart2TableConfig + _checkpoint_conversion_mapping = { + r"^language_model.model": "language_model", + } def __init__(self, config: PPChart2TableConfig): super().__init__(config) - self.vision_tower_high = PPChart2TableVisionModel._from_config(config.vision_config) + self.vision_tower = PPChart2TableVisionEncoder(config.vision_config) + self.multi_modal_projector = nn.Linear(config.output_channels, config.text_config.hidden_size) self.language_model = PPChart2TableTextModel._from_config(config.text_config) - self.mm_projector_vary = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size) - - # Initialize weights and apply final processing + self.vision_downsample1 = nn.Conv2d( + config.vision_config.output_channels, config.net_channels, kernel_size=3, stride=2, padding=1, bias=False + ) + self.vision_downsample2 = nn.Conv2d( + config.net_channels, config.output_channels, kernel_size=3, stride=2, padding=1, bias=False + ) self.post_init() def get_input_embeddings(self): @@ -990,33 +896,22 @@ def set_input_embeddings(self, value): """Set input embeddings for the text decoder (for weight tying/loading).""" self.language_model.embed_tokens = value + @can_return_tuple + @auto_docstring( + custom_intro="Obtains image last hidden states from the vision tower and apply multimodal projection." + ) def get_image_features( self, pixel_values: torch.FloatTensor, - ) -> list[torch.Tensor]: - r""" - Extract and project chart image features to text embedding space. - - Args: - images (`torch.Tensor`): - Preprocessed chart images (shape: `[B, 3, H, W]`). - - Returns: - `list[torch.Tensor]`: - List of projected image features (one per image), each with shape `[1, num_patches, text_hidden_size]`. 
- """ - image_features = [] - for pixel_value in pixel_values: - pixel_value = pixel_value.unsqueeze(0) - with torch.no_grad(): - cnn_feature = self.vision_tower_high(pixel_value) - cnn_feature = cnn_feature.flatten(2).transpose(2, 1) - image_feature = self.mm_projector_vary(cnn_feature) - image_features.append(image_feature) - - image_features = torch.stack(image_features, dim=0) + **kwargs: Unpack[TransformersKwargs], + ) -> tuple | BaseModelOutputWithPooling: + image_output = self.vision_tower(pixel_values) + last_hidden_state = image_output.last_hidden_state + last_hidden_state = self.vision_downsample1(last_hidden_state) + last_hidden_state = self.vision_downsample2(last_hidden_state) + image_output.pooler_output = self.multi_modal_projector(last_hidden_state.flatten(2).transpose(2, 1)) - return image_features + return image_output def get_placeholder_mask( self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor @@ -1034,35 +929,47 @@ def get_placeholder_mask( special_image_mask = input_ids == self.config.image_token_id n_image_tokens = special_image_mask.sum() - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) n_image_features = image_features.shape[0] * image_features.shape[1] - if inputs_embeds[special_image_mask].numel() != image_features.numel(): - raise ValueError( - f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" - ) + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + torch_compilable_check( + inputs_embeds[special_image_mask].numel() == image_features.numel(), + f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}", + ) return special_image_mask @can_return_tuple + @auto_docstring def forward( self, - input_ids: torch.LongTensor = None, + input_ids: torch.LongTensor | None = None, + 
pixel_values: torch.FloatTensor | None = None, attention_mask: torch.Tensor | None = None, - position_ids: torch.Tensor | None = None, - past_key_values: list[torch.Tensor] | None = None, - inputs_embeds: torch.Tensor | None = None, + position_ids: torch.LongTensor | None = None, + past_key_values: Cache | None = None, + inputs_embeds: torch.FloatTensor | None = None, use_cache: bool | None = None, - pixel_values: torch.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, - **kwargs, - ): + **kwargs: Unpack[TransformersKwargs], + ) -> tuple | PPChart2TableModelOutputWithPast: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if inputs_embeds is None: - inputs_embeds = self.language_model.embed_tokens(input_ids) + inputs_embeds = self.get_input_embeddings()(input_ids) if pixel_values is not None: - image_features = self.get_image_features(pixel_values=pixel_values.to(inputs_embeds.dtype)) + image_features = self.get_image_features( + pixel_values=pixel_values.to(inputs_embeds.dtype), return_dict=True + ).pooler_output image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) special_image_mask = self.get_placeholder_mask( input_ids, inputs_embeds=inputs_embeds, image_features=image_features @@ -1075,6 +982,9 @@ def forward( past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, 
cache_position=cache_position, **kwargs, ) @@ -1088,42 +998,49 @@ def forward( ) +@dataclass @auto_docstring( custom_intro=""" - PP-Chart2Table model for conditional generation (table text generation from chart images), - extending the core model with a language modeling (LM) head and generation utilities. + Base class for PPChart2Table causal language model (or autoregressive) outputs. """ ) -class PPChart2TableForConditionalGeneration(PPChart2TablePreTrainedModel, GenerationMixin): +class PPChart2TableCausalLMOutputWithPast(ModelOutput): r""" - PP-Chart2Table model for conditional generation (table text generation from chart images), - extending the core model with a language modeling (LM) head and generation utilities. + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ - This class integrates Hugging Face `GenerationMixin` to support standard generation methods (greedy, beam search, etc.), - and adds an LM head to predict token probabilities for autoregressive table generation. 
+ loss: torch.FloatTensor | None = None + logits: torch.FloatTensor | None = None + past_key_values: Cache | None = None + hidden_states: tuple[torch.FloatTensor] | None = None + attentions: tuple[torch.FloatTensor] | None = None + image_hidden_states: torch.FloatTensor | None = None - Key Features: - - LM head for token prediction (weight tied to input embeddings) - - Optimized generation input preparation (avoids reprocessing images in subsequent steps) - - Inference-only mode (training not supported by default) - Args: - config (`PPChart2TableConfig`): - Combined configuration class (vision + text sub-configs). - - Inputs (forward method): - Inherits all inputs from `PPChart2TableModel`, plus: - labels (`list[dict]`, optional): - Training labels (not supported; raises ValueError if provided). - logits_to_keep (`Union[int, torch.Tensor]`, defaults to 0): - Slice index to keep only the last N logits (optimizes generation efficiency). - - Outputs: - `PPChart2TableCausalLMOutputWithPast`: - Contains LM logits, decoder hidden states, and cached key/value pairs. +@auto_docstring( + custom_intro=""" + PP-Chart2Table model for conditional generation (table text generation from chart images), + extending the core model with a language modeling (LM) head and generation utilities. 
""" - - _keys_to_ignore_on_load_missing = ["num_batches_tracked"] +) +class PPChart2TableForConditionalGeneration(PPChart2TablePreTrainedModel, GenerationMixin): + _checkpoint_conversion_mapping = { + r"^language_model.model": "model.language_model", + r"^vision_tower": "model.vision_tower", + r"^multi_modal_projector": "model.multi_modal_projector", + r"^language_model.lm_head": "lm_head", + } _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"} def __init__(self, config: PPChart2TableConfig): @@ -1141,49 +1058,91 @@ def set_input_embeddings(self, value): def get_output_embeddings(self) -> nn.Module: return self.lm_head + @auto_docstring def get_image_features( - self, - pixel_values: torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, - vision_feature_select_strategy: str | None = None, - **kwargs, - ): - return self.model.get_image_features( - pixel_values=pixel_values, - vision_feature_layer=vision_feature_layer, - vision_feature_select_strategy=vision_feature_select_strategy, - **kwargs, - ) + self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] + ) -> tuple | BaseModelOutputWithPooling: + return self.model.get_image_features(pixel_values=pixel_values, **kwargs) @can_return_tuple + @auto_docstring def forward( self, input_ids: torch.LongTensor | None = None, + pixel_values: torch.FloatTensor | None = None, attention_mask: torch.Tensor | None = None, position_ids: torch.LongTensor | None = None, - pixel_values: torch.Tensor | None = None, - inputs_embeds: torch.Tensor | None = None, - labels: list[dict] | None = None, - logits_to_keep: int | torch.Tensor = 0, - cache_position: torch.LongTensor | None = None, past_key_values: Cache | None = None, + inputs_embeds: torch.FloatTensor | None = None, + labels: torch.LongTensor | None = None, use_cache: bool | None = None, - **kwargs, - ) -> tuple[torch.FloatTensor] | PPChart2TableCausalLMOutputWithPast: + output_attentions: bool | None = 
None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + cache_position: torch.LongTensor | None = None, + logits_to_keep: int | torch.Tensor = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple | PPChart2TableCausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from PIL import Image + >>> import httpx + >>> from io import BytesIO + >>> from transformers import AutoProcessor, PPChart2TableForConditionalGeneration, TextStreamer + + >>> model = PPChart2TableForConditionalGeneration.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf").to("cuda") + >>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf") + + >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png" + >>> with httpx.stream("GET", url) as response: + ... image = Image.open(BytesIO(response.read())) + + >>> inputs = processor(image, return_tensors="pt", color="green").to("cuda") + + >>> # Generate + >>> streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True) + >>> generate_ids = model.generate( + ... **inputs, + ... do_sample=False, + ... tokenizer = processor.tokenizer, + ... stop_strings='<|im_end|>', + ... streamer=streamer, + ... max_new_tokens=4096, + ... ) + "You should keep in mind what features from the module should be used, especially + when you're planning to sell a template." 
+ ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.model( input_ids=input_ids, + pixel_values=pixel_values, attention_mask=attention_mask, position_ids=position_ids, - pixel_values=pixel_values, + past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, - past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, cache_position=cache_position, logits_to_keep=logits_to_keep, **kwargs, ) - hidden_states = outputs.last_hidden_state + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep logits = self.lm_head(hidden_states[:, slice_indices, :]) @@ -1196,10 +1155,10 @@ def forward( return PPChart2TableCausalLMOutputWithPast( loss=loss, logits=logits, - last_hidden_state=outputs.last_hidden_state, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, ) def prepare_inputs_for_generation( @@ -1229,7 +1188,7 @@ def prepare_inputs_for_generation( if is_first_iteration or not kwargs.get("use_cache", True): # Pixel values are used only in the first iteration if available - # In subsquent iterations, they are already merged with text and cached + # In subsequent iterations, they are already merged with text and cached # NOTE: first iteration doesn't have to be prefill, it can be the first # iteration with a question and cached system prompt (continue generate from cache) model_inputs["pixel_values"] = 
pixel_values diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index 62544c5545e4..bfc376d0a87a 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -1,98 +1,66 @@ -import collections from dataclasses import dataclass -from typing import Optional, Union +from typing import Optional import torch import torch.nn as nn import torchvision.transforms.v2.functional as tvF -from transformers.cache_utils import Cache -from transformers.configuration_utils import PreTrainedConfig, layer_type_validation -from transformers.feature_extraction_utils import BatchFeature -from transformers.generation import GenerationMixin -from transformers.image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images -from transformers.modeling_outputs import BaseModelOutputWithPast -from transformers.modeling_rope_utils import RopeParameters -from transformers.modeling_utils import PreTrainedModel -from transformers.models.got_ocr2.modeling_got_ocr2 import ( - GotOcr2LayerNorm, - GotOcr2MLPBlock, - GotOcr2PatchEmbeddings, - GotOcr2VisionAttention, - GotOcr2VisionLayer, - GotOcr2VisionNeck, +from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...feature_extraction_utils import BatchFeature +from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images +from ...modeling_rope_utils import RopeParameters +from ..got_ocr2.modeling_got_ocr2 import ( + GotOcr2ModelOutputWithPast, + GotOcr2Model, + GotOcr2PreTrainedModel, + GotOcr2ForConditionalGeneration, + GotOcr2VisionEncoder, ) -from transformers.models.qwen2.modeling_qwen2 import ( - Qwen2Attention, - Qwen2DecoderLayer, +from ..qwen2.modeling_qwen2 import ( Qwen2Model, Qwen2PreTrainedModel, ) -from transformers.utils import ( +from ...utils import ( 
auto_docstring, - can_return_tuple, logging, + TransformersKwargs, ) -from transformers.processing_utils import ProcessorMixin, TensorType -from transformers.utils import can_return_tuple +from ...modeling_outputs import BaseModelOutputWithPooling +from ...processing_utils import ProcessorMixin, TensorType, Unpack -from transformers.image_utils import SizeDict +from ...image_utils import SizeDict logger = logging.get_logger(__name__) -@auto_docstring( - custom_intro=""" - This configuration class defines all the hyperparameters for the vision component - of the PP-Chart2Table model, which is responsible for processing chart images - and extracting visual features for table structure recognition and content extraction. - PaddlePaddle/PP-Chart2Table_safetensors [PaddlePaddle/PP-Chart2Table_safetensors] - (https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors) - """, - checkpoint="PaddlePaddle/PP-Chart2Table_safetensors", -) +@auto_docstring(checkpoint="PaddlePaddle/PP-Chart2Table_safetensors",) class PPChart2TableVisionConfig(PreTrainedConfig): """ - Configuration class for the vision backbone of PP-Chart2Table model. - - This configuration class defines all the hyperparameters for the vision component - of the PP-Chart2Table model, which is responsible for processing chart images - and extracting visual features for table structure recognition and content extraction. - PaddlePaddle/PP-Chart2Table_safetensors [PaddlePaddle/PP-Chart2Table_safetensors] - (https://huggingface.co/PaddlePaddle/PP-Chart2Table_safetensors) - - Args: - depth (`int`, *optional*, defaults to 12): - Number of transformer encoder layers in the vision backbone. - embed_dim (`int`, *optional*, defaults to 768): - Dimensionality of the patch embedding vectors. - hidden_size (`int`, *optional*, defaults to 1024): - Dimensionality of the hidden layer in the feed-forward network (MLP). 
- num_channels (`int`, *optional*, defaults to 3): - Number of input channels (3 for RGB images, 1 for grayscale). - image_size (`int`, *optional*, defaults to 1024): - Size (height/width) of the input images (assumed to be square). - mlp_ratio (`float`, *optional*, defaults to 4.0): - Ratio of the hidden layer size to the embedding dimension in the MLP (hidden_size = embed_dim * mlp_ratio). - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each transformer encoder layer. - patch_size (`int`, *optional*, defaults to 16): - Size (height/width) of the image patches extracted from the input image. - qkv_bias (`bool`, *optional*, defaults to `True`): - Whether to include bias terms in the query, key, value projection layers of self-attention. - use_rel_pos (`bool`, *optional*, defaults to `True`): - Whether to use relative positional embeddings in the self-attention mechanism. - global_attn_indexes (`Optional[list[int]]`, *optional*, defaults to [2, 5, 8, 11]): - List of layer indexes where global attention (instead of window attention) is applied. - If `None`, defaults to [2, 5, 8, 11]. - window_size (`int`, *optional*, defaults to 14): - Size of the attention window for window-based self-attention (only effective when use_rel_pos=True). - output_channels (`int`, *optional*, defaults to 256): - Dimensionality of the final visual feature output channels. - net_channels (`int`, *optional*, defaults to 512): - Dimensionality of intermediate network channels in the vision backbone. - attention_dropout (`float`, *optional*, defaults to 0.0): - Dropout probability applied to the attention weights. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of transformer encoder layers in the vision backbone. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the patch embedding vectors. + num_channels (`int`, *optional*, defaults to 3): + Number of input channels (3 for RGB images, 1 for grayscale). 
+ image_size (`int`, *optional*, defaults to 1024): + Size (height/width) of the input images (assumed to be square). + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each transformer encoder layer. + patch_size (`int`, *optional*, defaults to 16): + Size (height/width) of the image patches extracted from the input image. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to include bias terms in the query, key, value projection layers of self-attention. + use_rel_pos (`bool`, *optional*, defaults to `True`): + Whether to use relative positional embeddings in the self-attention mechanism. + global_attn_indexes (`Optional[list[int]]`, *optional*, defaults to [2, 5, 8, 11]): + List of layer indexes where global attention (instead of window attention) is applied. + If `None`, defaults to [2, 5, 8, 11]. + window_size (`int`, *optional*, defaults to 14): + Size of the attention window for window-based self-attention (only effective when use_rel_pos=True). + output_channels (`int`, *optional*, defaults to 256): + Dimensionality of the final visual feature output channels. + attention_dropout (`float`, *optional*, defaults to 0.0): + Dropout probability applied to the attention weights. 
""" model_type = "pp_chart2table_vision" @@ -100,37 +68,35 @@ class PPChart2TableVisionConfig(PreTrainedConfig): def __init__( self, - depth: int = 12, - embed_dim: int = 768, - hidden_size: int = 1024, - num_channels: int = 3, - image_size: int = 1024, - mlp_ratio: float = 4.0, - num_attention_heads: int = 12, - patch_size: int = 16, - qkv_bias: bool = True, - use_rel_pos: bool = True, - global_attn_indexes: Optional[list[int]] = None, - window_size: int = 14, - output_channels: int = 256, - net_channels: int = 512, - attention_dropout: float = 0.0, + num_hidden_layers=12, + hidden_size=768, + output_channels=256, + mlp_dim=3072, + num_channels=3, + image_size=1024, + num_attention_heads=12, + patch_size=16, + qkv_bias=True, + use_rel_pos=True, + use_abs_pos=True, + global_attn_indexes=[2, 5, 8, 11], + window_size=14, + attention_dropout=0.0, **kwargs, ): - self.depth = depth - self.embed_dim = embed_dim + self.num_hidden_layers = num_hidden_layers self.hidden_size = hidden_size + self.mlp_dim=mlp_dim self.image_size = image_size self.num_channels = num_channels - self.mlp_ratio = mlp_ratio self.num_attention_heads = num_attention_heads self.patch_size = patch_size self.qkv_bias = qkv_bias self.use_rel_pos = use_rel_pos - self.global_attn_indexes = global_attn_indexes if global_attn_indexes is not None else [2, 5, 8, 11] + self.use_abs_pos = use_abs_pos + self.global_attn_indexes = global_attn_indexes self.window_size = window_size self.output_channels = output_channels - self.net_channels = net_channels self.attention_dropout = attention_dropout super().__init__(**kwargs) @@ -143,76 +109,52 @@ def __init__( ) class PPChart2TableTextConfig(PreTrainedConfig): r""" - This is the configuration class to store the configuration of a [`PPChart2TableTextModel`]. It is used to instantiate a - PP-Chart2Table text decoder according to the specified arguments, defining the model architecture. 
Instantiating a - configuration with the defaults will yield a similar configuration to that of the text encoder/decoder of the - PPChart2TableText-7B-beta [Qwen/PPChart2TableText-7B-beta](https://huggingface.co/Qwen/PPChart2TableText-7B-beta) - architecture, optimized for chart-to-table text generation tasks. - - Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PreTrainedConfig`] for more information. - - Args: - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities in self-attention layers. - bos_token_id (`int`, *optional*, defaults to 151643): - The token ID representing the beginning of a sequence (BOS) for text generation. - eos_token_id (`int`, *optional*, defaults to 151643): - The token ID representing the end of a sequence (EOS) for text generation. - pad_token_id (Optional[int], optional, *optional*, defaults to -1): - The index of the padding token. Defaults to -1. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the feed-forward and attention layers of the decoder. - hidden_size (`int`, *optional*, defaults to 1024): - Dimensionality of the hidden representations in the Transformer decoder layers. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - intermediate_size (`int`, *optional*, defaults to 2816): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer decoder blocks. - max_position_embeddings (`int`, *optional*, defaults to 32768): - The maximum sequence length that this model might ever be used with for text input/output. - num_attention_heads (`int`, *optional*, defaults to 16): - Number of attention heads for each self-attention layer in the Transformer decoder. 
- num_hidden_layers (`int`, *optional*, defaults to 24): - Number of hidden layers in the Transformer decoder. - num_key_value_heads (`int`, *optional*, defaults to 16): - Number of key/value heads for implementing Grouped Query Attention (GQA). If equal to `num_attention_heads`, - Multi Head Attention (MHA) is used; if 1, Multi Query Attention (MQA) is used. For more details, see - [this paper](https://huggingface.co/papers/2305.13245). - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon value used by the RMS normalization layers to avoid division by zero. - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE (Rotary Position Embedding) embeddings, controlling the frequency of positional encoding. - rope_parameters (`RopeParameters` or `dict`, *optional*): - Configuration parameters for RoPE embeddings, including scaling parameters for longer sequence lengths beyond - `max_position_embeddings`. - sliding_window (`int`, *optional*, defaults to 32768): - Window size for Sliding Window Attention (SWA) in the decoder layers (only active if `use_sliding_window=True`). - tie_word_embeddings (`bool`, *optional*, defaults to `True`): - Whether the model's input and output word embeddings should be tied (shared weights). - use_cache (`bool`, *optional*, defaults to `True`): - Whether to return the last key/value attention states to speed up sequential decoding (only relevant for autoregressive - generation). - vocab_size (`int`, *optional*, defaults to 151860): - Vocabulary size of the PPChart2TableText model. Defines the number of distinct tokens that can be represented - by `input_ids`. - layer_types (`list[str]`, *optional*): - Attention pattern for each decoder layer (e.g., `"full_attention"` or `"sliding_attention"`). If not specified, - automatically determined by `sliding_window`. 
- - Example: - - ```python - >>> from transformers import PPChart2TableTextConfig, PPChart2TableTextModel - - >>> # Initializing a PPChart2TableText style configuration - >>> configuration = PPChart2TableTextConfig() - - >>> # Initializing a model from the PPChart2TableText-7B style configuration - >>> model = PPChart2TableTextModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities in self-attention layers. + bos_token_id (`int`, *optional*, defaults to 151643): + The token ID representing the beginning of a sequence (BOS) for text generation. + eos_token_id (`int`, *optional*, defaults to 151643): + The token ID representing the end of a sequence (EOS) for text generation. + pad_token_id (Optional[int], optional, *optional*, defaults to -1): + The index of the padding token. Defaults to -1. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the feed-forward and attention layers of the decoder. + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the hidden representations in the Transformer decoder layers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + intermediate_size (`int`, *optional*, defaults to 2816): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer decoder blocks. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with for text input/output. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each self-attention layer in the Transformer decoder. 
+ num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer decoder. + num_key_value_heads (`int`, *optional*, defaults to 16): + Number of key/value heads for implementing Grouped Query Attention (GQA). If equal to `num_attention_heads`, + Multi Head Attention (MHA) is used; if 1, Multi Query Attention (MQA) is used. For more details, see + [this paper](https://huggingface.co/papers/2305.13245). + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon value used by the RMS normalization layers to avoid division by zero. + rope_theta (`float`, *optional*, defaults to 1000000.0): + The base period of the RoPE (Rotary Position Embedding) embeddings, controlling the frequency of positional encoding. + rope_parameters (`RopeParameters` or `dict`, *optional*): + Configuration parameters for RoPE embeddings, including scaling parameters for longer sequence lengths beyond + `max_position_embeddings`. + sliding_window (`int`, *optional*, defaults to 32768): + Window size for Sliding Window Attention (SWA) in the decoder layers (only active if `use_sliding_window=True`). + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether the model's input and output word embeddings should be tied (shared weights). + use_cache (`bool`, *optional*, defaults to `True`): + Whether to return the last key/value attention states to speed up sequential decoding (only relevant for autoregressive + generation). + vocab_size (`int`, *optional*, defaults to 151860): + Vocabulary size of the PPChart2TableText model. Defines the number of distinct tokens that can be represented + by `input_ids`. + layer_types (`list[str]`, *optional*): + Attention pattern for each decoder layer (e.g., `"full_attention"` or `"sliding_attention"`). If not specified, + automatically determined by `sliding_window`. 
""" model_type = "pp_chart2table_text" @@ -306,53 +248,20 @@ def __init__( ) class PPChart2TableConfig(PreTrainedConfig): r""" - This is the main configuration class to store the configuration of a [PPChart2TableModel] or [PPChart2TableForConditionalGeneration]. - It is used to instantiate a PP-Chart2Table multimodal model according to the specified arguments, defining the vision and text - sub-model architectures. This configuration class inherits from [PreTrainedConfig] and combines the configurations of: - [PPChart2TableVisionConfig] (for the chart vision encoder) - [PPChart2TableTextConfig] (for the table text decoder) - PP-Chart2Table PaddlePaddle/PP-Chart2Table_safetensors. - - Instantiating a PPChart2TableConfig with the defaults will yield a similar configuration to the base PP-Chart2Table model - developed by the PaddlePaddle team for chart-to-table parsing tasks. - - Configuration objects inherit from [PreTrainedConfig] and can be used to control the model outputs. Read the - documentation from [PreTrainedConfig] for more information. - - Args: - vision_config (Optional[Dict], optional, *optional*):: - The [PPChart2TableVisionConfig] for the vision sub-model. Defaults to None. - text_config (Optional[Dict], optional, *optional*):: - The [PPChart2TableTextConfig] for the text sub-model. Defaults to None. - image_token_index (Optional[int], optional, *optional*, defaults to 151859):: - The index of the image token. Defaults to 151859. - image_seq_length (Optional[int], optional, *optional*, defaults to 576):: - The sequence length for the image. Defaults to 576. - pad_token_id (Optional[int], optional, *optional*, defaults to -1): - The index of the padding token. Defaults to -1. 
- - Example: - - ```python - >>> from transformers import PPChart2TableConfig, PPChart2TableModel - - >>> # Initializing a PPChart2Table configuration with default vision and text sub-configs - >>> configuration = PPChart2TableConfig() - - >>> # Initializing a PPChart2Table configuration with custom vision and text sub-configs - >>> vision_config = {"image_size": 512, "patch_size": 8} - >>> text_config = {"hidden_size": 2048, "num_hidden_layers": 16} - >>> configuration = PPChart2TableConfig(vision_config=vision_config, text_config=text_config) - - >>> # Initializing a model from the PPChart2Table configuration - >>> model = PPChart2TableModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - >>> # Accessing the vision sub-config - >>> vision_config = configuration.vision_config - >>> # Accessing the text sub-config - >>> text_config = configuration.text_config + vision_config (Optional[Dict], optional, *optional*):: + The [PPChart2TableVisionConfig] for the vision sub-model. Defaults to None. + text_config (Optional[Dict], optional, *optional*):: + The [PPChart2TableTextConfig] for the text sub-model. Defaults to None. + image_token_index (Optional[int], optional, *optional*, defaults to 151859):: + The index of the image token. Defaults to 151859. + image_seq_length (Optional[int], optional, *optional*, defaults to 576):: + The sequence length for the image. Defaults to 576. + pad_token_id (Optional[int], optional, *optional*, defaults to -1): + The index of the padding token. Defaults to -1. + net_channels (`int`, *optional*, defaults to 512): + Dimensionality of intermediate network channels in the vision backbone. + output_channels (`int`, *optional*, defaults to 1024): + Dimensionality of intermediate network channels in the vision backbone. 
""" model_type = "pp_chart2table" @@ -368,11 +277,15 @@ def __init__( image_token_index: Optional[int] = 151859, image_seq_length: Optional[int] = 576, pad_token_id: Optional[int] = -1, + net_channels: Optional[int] = 512, + output_channels: Optional[int] = 1024, **kwargs, ): self.image_token_index = image_token_index self.image_seq_length = image_seq_length self.pad_token_id = pad_token_id + self.net_channels = net_channels + self.output_channels = output_channels if vision_config is None: vision_config = {} @@ -408,40 +321,9 @@ def __init__( super().__init__(**kwargs) -@auto_docstring( - custom_intro=""" - """ -) +@auto_docstring class PPChart2TableImageProcessorFast(BaseImageProcessorFast): - r""" - Fast image processor for the PP-Chart2Table multimodal model, optimized for GPU-accelerated chart image preprocessing. - - This high-performance processor implements a streamlined preprocessing pipeline for chart images (resizing, rescaling, - normalization, channel reordering) using PyTorch tensor operations, designed for efficient batch processing on GPUs. - It inherits from [`BaseImageProcessorFast`] and is optimized for inference/training pipelines requiring low-latency - image preprocessing. - - Class Attributes (Default Configuration): - resample (`int`, defaults to 3): - Integer identifier for the resampling filter (3 = BICUBIC, compatible with `InterpolationMode.BICUBIC`). - image_mean (`list[float]`, defaults to `[0.40821073, 0.4578275, 0.48145466]`): - Per-channel mean values for image normalization (RGB order). - image_std (`list[float]`, defaults to `[0.27577711, 0.26130258, 0.26862954]`): - Per-channel standard deviation values for image normalization (RGB order). - size (`dict[str, int]`, defaults to `{"height": 1024, "width": 1024}`): - Default target size for image resizing (1024x1024, optimized for PP-Chart2Table vision encoder). 
- patch_size (`int`, defaults to 16): - Size of image patches used by the PP-Chart2Table vision encoder (for alignment with model input). - merge_size (`int`, defaults to 4): - Size factor for merging image patches (specific to PP-Chart2Table's vision processing pipeline). - do_resize (`bool`, defaults to `True`): - Default flag to enable image resizing. - do_rescale (`bool`, defaults to `True`): - Default flag to enable pixel value rescaling (from [0,255] to [0,1]). - do_normalize (`bool`, defaults to `True`): - Default flag to enable image normalization. - """ resample = 3 image_mean = [0.40821073, 0.4578275, 0.48145466] @@ -453,9 +335,6 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True - def __init__(self, **kwargs) -> None: - super().__init__(**kwargs) - def _preprocess( self, images: list["torch.Tensor"], @@ -498,30 +377,8 @@ def _preprocess( ) -@auto_docstring( - custom_intro=""" - A multi-modal processor for the PPChart2Table model, combining image preprocessing and text tokenization - capabilities to handle chart-to-table conversion tasks. - - This processor integrates `PPChart2TableImageProcessorFast` for chart image preprocessing (e.g., patch-based - resizing) and `Qwen2Tokenizer` for text prompt construction/tokenization. It encapsulates the end-to-end - processing pipeline from raw chart images + text instructions to model-ready input tensors, and also provides - postprocessing logic to decode model outputs back to human-readable table text. - """ -) +@auto_docstring class PPChart2TableProcessor(ProcessorMixin): - r""" - [`PPChart2TableProcessor`] offers all the functionalities of [`PPChart2TableImageProcessorFast`] and [`Qwen2Tokenizer`]. See the - [`~PPChart2TableProcessor.__call__`] and [`~PPChart2TableProcessor.decode`] for more information. - Args: - image_processor ([`PPChart2TableImageProcessorFast`], *optional*): - The image processor is a required input. 
- tokenizer ([`Qwen2Tokenizer`], *optional*): - The tokenizer is a required input. - chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. - """ - image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" @@ -560,182 +417,16 @@ def postprocess(self, model_pred, **kwargs): ) -class PPChart2TableVisionPatchEmbed(GotOcr2PatchEmbeddings): - def __init__(self, config): - super().__init__() - image_size, patch_size = config.image_size, config.patch_size - num_channels, hidden_size = config.num_channels, config.embed_dim - image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) - patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.num_patches = num_patches - self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) - - -class PPChart2TableVisionMLPBlock(GotOcr2MLPBlock): - def __init__(self, config) -> None: - super().__init__() - self.lin1 = nn.Linear(config.embed_dim, int(config.embed_dim * config.mlp_ratio)) - self.lin2 = nn.Linear(int(config.embed_dim * config.mlp_ratio), config.embed_dim) - - -class PPChart2TableVisionLayerNorm(GotOcr2LayerNorm): - pass - - -class PPChart2TableVisionAttention(GotOcr2VisionAttention): - """Multi-head Attention block with relative position embeddings.""" - - def __init__(self, config, window_size): - super().__init__() - head_dim = config.embed_dim // config.num_attention_heads - self.scale = head_dim**-0.5 - self.qkv = nn.Linear(config.embed_dim, config.embed_dim * 3, bias=config.qkv_bias) - self.proj = nn.Linear(config.embed_dim, config.embed_dim) - - -class 
PPChart2TableVisionDecoderLayer(GotOcr2VisionLayer): - def __init__(self, config, window_size) -> None: - super().__init__() - self.layer_norm1 = nn.LayerNorm(config.embed_dim) - self.attn = PPChart2TableVisionAttention(config, window_size=window_size) - - self.layer_norm2 = nn.LayerNorm(config.embed_dim) - self.mlp = PPChart2TableVisionMLPBlock(config) - self.window_size = window_size - - -class PPChart2TableVisionNeck(GotOcr2VisionNeck): - def __init__(self, config: PPChart2TableVisionConfig): - super().__init__() - self.config = config - - self.conv1 = nn.Conv2d(config.embed_dim, config.output_channels, kernel_size=1, bias=False) - self.layer_norm1 = PPChart2TableVisionLayerNorm(config.output_channels, data_format="channels_first") - self.conv2 = nn.Conv2d(config.output_channels, config.output_channels, kernel_size=3, padding=1, bias=False) - self.layer_norm2 = PPChart2TableVisionLayerNorm(config.output_channels, data_format="channels_first") - - -@auto_docstring( - custom_intro=""" - - """ -) -class PPChart2TableVisionPreTrainedModel(PreTrainedModel): - r""" - Base class for all PP-Chart2Table vision models, inheriting from Hugging Face `PreTrainedModel`. - - This class sets up core configurations and compatibility flags for the vision encoder, including: - - Support for gradient checkpointing, attention backends (FlashAttention/SDPA), and model compilation - - Definition of non-splittable modules (for tensor parallelism) - - Output recording for hidden states/attentions (for debugging/analysis) - - Class Attributes: - config (`PPChart2TableVisionConfig`): - Typed config class for PP-Chart2Table vision encoder (enforces type checking). - base_model_prefix (`str`, defaults to `"model"`): - Prefix for base model parameters (used in weight loading/saving). - supports_gradient_checkpointing (`bool`, defaults to `True`): - Whether the model supports gradient checkpointing to save memory. 
- _no_split_modules (`list[str]`): - Modules that should not be split across devices (tensor parallelism compatibility). - _skip_keys_device_placement (`list[str]`): - Keys to skip when placing tensors on devices (e.g., past key values for generation). - _supports_flash_attn / _supports_sdpa / _supports_flex_attn (`bool`): - Compatibility with optimized attention implementations (FlashAttention, SDPA, FlexAttention). - _can_compile_fullgraph (`bool`, defaults to `True`): - Whether the model supports TorchScript/TorchCompile full graph compilation. - _supports_attention_backend (`bool`, defaults to `True`): - Whether the model supports switching attention backends (e.g., PyTorch vs FlashAttention). - _can_record_outputs (`dict`): - Mapping of output types to modules for recording intermediate outputs (hidden_states/attentions). - """ - - config: PPChart2TableVisionConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["PPChart2TableVisionDecoderLayer"] - _skip_keys_device_placement = ["past_key_values"] - _supports_flash_attn = True - _supports_sdpa = True - _supports_flex_attn = True - - _can_compile_fullgraph = True - _supports_attention_backend = True - _can_record_outputs = { - "hidden_states": PPChart2TableVisionDecoderLayer, - "attentions": PPChart2TableVisionAttention, - } - - -@auto_docstring( - custom_intro=""" - - """ -) -class PPChart2TableVisionModel(PPChart2TableVisionPreTrainedModel): - main_input_name = "pixel_values" - input_modalities = "image" - - def __init__( - self, - config: PPChart2TableVisionConfig, - ) -> None: - super().__init__(config) - self.image_size = config.image_size - - self.patch_embed = PPChart2TableVisionPatchEmbed(config) - - self.pos_embed = nn.Parameter( - torch.zeros( - 1, config.image_size // config.patch_size, config.image_size // config.patch_size, config.embed_dim - ) - ) - - self.blocks = nn.ModuleList() - for i in range(config.depth): - block = 
PPChart2TableVisionDecoderLayer( - config, - window_size=config.window_size if i not in config.global_attn_indexes else 0, - ) - self.blocks.append(block) - - self.neck = PPChart2TableVisionNeck(config) - - self.net_2 = nn.Conv2d( - config.output_channels, config.net_channels, kernel_size=3, stride=2, padding=1, bias=False - ) - self.net_3 = nn.Conv2d(config.net_channels, config.hidden_size, kernel_size=3, stride=2, padding=1, bias=False) +class PPChart2TableVisionPreTrainedModel(GotOcr2PreTrainedModel): + input_modalities = ("image", "text") - self.post_init() - def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor: - hidden_states = self.patch_embed(hidden_states) - hidden_states = hidden_states + self.pos_embed - for block in self.blocks: - hidden_states = block(hidden_states) - hidden_states = self.neck(hidden_states) - hidden_states = self.net_2(hidden_states) - hidden_states = self.net_3(hidden_states) - return hidden_states - - -class PPChart2TableTextAttention(Qwen2Attention): +class PPChart2TableVisionEncoder(GotOcr2VisionEncoder, PPChart2TableVisionPreTrainedModel): pass -class PPChart2TableTextDecoderLayer(Qwen2DecoderLayer): - pass - -@auto_docstring( - custom_intro=""" - - """ -) +@auto_docstring class PPChart2TableTextPreTrainedModel(Qwen2PreTrainedModel): pass @@ -745,147 +436,19 @@ class PPChart2TableTextModel(Qwen2Model): @dataclass -class PPChart2TableModelOutputWithPast(BaseModelOutputWithPast): - r""" - Output class for PPChart2Table multimodal model's forward pass, extending Hugging Face `ModelOutput`. - - This dataclass encapsulates the core outputs of the PP-Chart2Table base model, including hidden states, - attention weights, and cached key/value pairs for efficient generation. - - Attributes: - past_key_values (`Optional[Cache]`, defaults to `None`): - Cached attention key/value pairs from the text decoder (for fast autoregressive generation). 
- last_hidden_state (`Optional[torch.FloatTensor]`, defaults to `None`): - Final hidden states from the text decoder (shape: `[B, seq_len, hidden_size]`), after multimodal fusion. - hidden_states (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`): - Tuple of hidden states from each layer of the text decoder (for debugging/analysis). - attentions (`Optional[tuple[torch.FloatTensor]]`, defaults to `None`): - Tuple of attention weights from each layer of the text decoder (for debugging/analysis). - """ - image_hidden_states: Optional[torch.FloatTensor] = None - - -@dataclass -class PPChart2TableCausalLMOutputWithPast(BaseModelOutputWithPast): - r""" - Output class for PP-Chart2Table conditional generation model's forward pass. - - Extends `PPChart2TableModelOutputWithPast` with language modeling logits (for token prediction), - tailored for autoregressive table generation tasks. - - Attributes: - logits (`Optional[torch.FloatTensor]`, defaults to `None`): - Language modeling logits (shape: `[B, seq_len, vocab_size]`), output from the LM head. - """ - - logits: Optional[torch.FloatTensor] = None - loss: Optional[torch.FloatTensor] = None - image_hidden_states: Optional[torch.FloatTensor] = None - - -@auto_docstring( - custom_intro=""" - - """ -) -class PPChart2TablePreTrainedModel(PreTrainedModel): - r""" - Base class for all PP-Chart2Table multimodal models, inheriting from Hugging Face `PreTrainedModel`. - - This class defines core configurations and compatibility flags for the multimodal model (vision + text), - including support for gradient checkpointing, optimized attention backends, and model compilation. - - Class Attributes: - config (`PPChart2TableConfig`): - Typed config class for PP-Chart2Table (combines vision + text sub-configs). - base_model_prefix (`str`, defaults to `"model"`): - Prefix for base model parameters (used in weight loading/saving). 
- supports_gradient_checkpointing (`bool`, defaults to `True`): - Whether the model supports gradient checkpointing to save memory during training. - _no_split_modules (`list[str]`): - Modules that should not be split across devices (tensor parallelism compatibility). - _skip_keys_device_placement (`list[str]`): - Keys to skip when placing tensors on devices (e.g., past key values for generation). - _supports_flash_attn / _supports_sdpa / _supports_flex_attn (`bool`): - Compatibility with optimized attention implementations (FlashAttention, SDPA, FlexAttention). - _can_compile_fullgraph (`bool`, defaults to `True`): - Whether the model supports TorchScript/TorchCompile full graph compilation. - _supports_attention_backend (`bool`, defaults to `True`): - Whether the model supports switching attention backends (e.g., PyTorch vs FlashAttention). - _can_record_outputs (`dict`): - Mapping of output types to modules for recording intermediate outputs (hidden_states/attentions). - """ - - config: PPChart2TableConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["PPChart2TableTextDecoderLayer"] - _skip_keys_device_placement = ["past_key_values"] - _supports_flash_attn = True - _supports_sdpa = True - _supports_flex_attn = True - - _can_compile_fullgraph = True - _supports_attention_backend = True - - _can_record_outputs = { - "hidden_states": PPChart2TableTextDecoderLayer, - "attentions": PPChart2TableTextAttention, - } - - -@auto_docstring( - custom_intro=""" - Core PP-Chart2Table multimodal model (vision encoder + text decoder) for chart-to-table parsing. - """ -) -class PPChart2TableModel(PPChart2TablePreTrainedModel): - r""" - Core PP-Chart2Table multimodal model (vision encoder + text decoder) for chart-to-table parsing. 
- - This model integrates a vision encoder (for chart image feature extraction) and a text decoder (for table generation), - with a multimodal projection layer to align vision features with text embedding space. The core logic is: - 1. Extract chart features via vision encoder - 2. Project vision features to text embedding dimension - 3. Inject vision features into text decoder inputs (replace image placeholder tokens) - 4. Forward pass through text decoder to generate table text - - Args: - config (`PPChart2TableConfig`): - Combined configuration class (includes vision_config and text_config sub-configs). +class PPChart2TableModelOutputWithPast(GotOcr2ModelOutputWithPast): + pass - Inputs (forward method): - input_ids (`torch.LongTensor`, optional): - Tokenized input text (including image placeholder tokens) with shape `[B, seq_len]`. - attention_mask (`torch.Tensor`, optional): - Attention mask to avoid padding tokens (shape: `[B, seq_len]`). - position_ids (`torch.Tensor`, optional): - Positional indices for input tokens (shape: `[B, seq_len]`). - past_key_values (`list[torch.Tensor]`, optional): - Cached key/value pairs for fast autoregressive generation. - inputs_embeds (`torch.Tensor`, optional): - Precomputed input embeddings (shape: `[B, seq_len, hidden_size]`; overrides `input_ids`). - use_cache (`bool`, optional): - Whether to cache key/value pairs for generation. - pixel_values (`torch.Tensor`, optional): - Preprocessed chart images (shape: `[B, 3, H, W]`; required for multimodal input). - cache_position (`torch.LongTensor`, optional): - Position indices for cached key/value pairs (for generation). - **kwargs: - Additional arguments passed to the text decoder. - Outputs: - `PPChart2TableModelOutputWithPast`: - Contains the text decoder's final hidden states, cached key/values, and optional intermediate outputs. 
- """ - - config_class = PPChart2TableConfig +@auto_docstring +class PPChart2TableModel(GotOcr2Model): def __init__(self, config: PPChart2TableConfig): super().__init__(config) - self.vision_tower_high = PPChart2TableVisionModel._from_config(config.vision_config) + self.vision_downsample1 = nn.Conv2d(config.vision_config.output_channels, config.net_channels, kernel_size=3, stride=2, padding=1, bias=False) + self.vision_downsample2 = nn.Conv2d(config.net_channels, config.output_channels, kernel_size=3, stride=2, padding=1, bias=False) self.language_model = PPChart2TableTextModel._from_config(config.text_config) - self.mm_projector_vary = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size) + self.multi_modal_projector = nn.Linear(config.output_channels, config.text_config.hidden_size) # Initialize weights and apply final processing self.post_init() @@ -901,99 +464,16 @@ def set_input_embeddings(self, value): def get_image_features( self, pixel_values: torch.FloatTensor, - ) -> list[torch.Tensor]: - r""" - Extract and project chart image features to text embedding space. - - Args: - images (`torch.Tensor`): - Preprocessed chart images (shape: `[B, 3, H, W]`). - - Returns: - `list[torch.Tensor]`: - List of projected image features (one per image), each with shape `[1, num_patches, text_hidden_size]`. 
- """ - image_features = [] - for pixel_value in pixel_values: - pixel_value = pixel_value.unsqueeze(0) - with torch.no_grad(): - cnn_feature = self.vision_tower_high(pixel_value) - cnn_feature = cnn_feature.flatten(2).transpose(2, 1) - image_feature = self.mm_projector_vary(cnn_feature) - image_features.append(image_feature) + **kwargs: Unpack[TransformersKwargs], + ) -> tuple | BaseModelOutputWithPooling: + + image_output = self.vision_tower(pixel_values) + last_hidden_state = image_output.last_hidden_state + last_hidden_state = self.vision_downsample1(last_hidden_state) + last_hidden_state = self.vision_downsample2(last_hidden_state) + image_output.pooler_output = self.multi_modal_projector(last_hidden_state.flatten(2).transpose(2, 1)) - image_features = torch.stack(image_features, dim=0) - - return image_features - - def get_placeholder_mask( - self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor - ): - """ - Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is - equal to the length of multimodal features. If the lengths are different, an error is raised. 
- """ - if input_ids is None: - special_image_mask = inputs_embeds == self.get_input_embeddings()( - torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) - ) - special_image_mask = special_image_mask.all(-1) - else: - special_image_mask = input_ids == self.config.image_token_id - - n_image_tokens = special_image_mask.sum() - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) - n_image_features = image_features.shape[0] * image_features.shape[1] - if inputs_embeds[special_image_mask].numel() != image_features.numel(): - raise ValueError( - f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" - ) - return special_image_mask - - @can_return_tuple - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - past_key_values: Optional[list[torch.Tensor]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - pixel_values: Optional[torch.Tensor] = None, - cache_position: Optional[torch.LongTensor] = None, - **kwargs, - ): - if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - - if inputs_embeds is None: - inputs_embeds = self.language_model.embed_tokens(input_ids) - - if pixel_values is not None: - image_features = self.get_image_features(pixel_values=pixel_values.to(inputs_embeds.dtype)) - image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) - special_image_mask = self.get_placeholder_mask( - input_ids, inputs_embeds=inputs_embeds, image_features=image_features - ) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) - - outputs = self.language_model( - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - 
inputs_embeds=inputs_embeds, - use_cache=use_cache, - cache_position=cache_position, - **kwargs, - ) - - return PPChart2TableModelOutputWithPast( - last_hidden_state=outputs.last_hidden_state, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - image_hidden_states=image_features if pixel_values is not None else None, - ) + return image_output @auto_docstring( @@ -1002,147 +482,8 @@ def forward( extending the core model with a language modeling (LM) head and generation utilities. """ ) -class PPChart2TableForConditionalGeneration(PPChart2TablePreTrainedModel, GenerationMixin): - r""" - PP-Chart2Table model for conditional generation (table text generation from chart images), - extending the core model with a language modeling (LM) head and generation utilities. - - This class integrates Hugging Face `GenerationMixin` to support standard generation methods (greedy, beam search, etc.), - and adds an LM head to predict token probabilities for autoregressive table generation. - - Key Features: - - LM head for token prediction (weight tied to input embeddings) - - Optimized generation input preparation (avoids reprocessing images in subsequent steps) - - Inference-only mode (training not supported by default) - - Args: - config (`PPChart2TableConfig`): - Combined configuration class (vision + text sub-configs). - - Inputs (forward method): - Inherits all inputs from `PPChart2TableModel`, plus: - labels (`list[dict]`, optional): - Training labels (not supported; raises ValueError if provided). - logits_to_keep (`Union[int, torch.Tensor]`, defaults to 0): - Slice index to keep only the last N logits (optimizes generation efficiency). - - Outputs: - `PPChart2TableCausalLMOutputWithPast`: - Contains LM logits, decoder hidden states, and cached key/value pairs. 
- """ - - _keys_to_ignore_on_load_missing = ["num_batches_tracked"] - _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"} - - def __init__(self, config: PPChart2TableConfig): - super().__init__(config) - self.model = PPChart2TableModel(config) - self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) - self.post_init() - - def get_input_embeddings(self): - return self.model.get_input_embeddings() - - def set_input_embeddings(self, value): - self.model.set_input_embeddings(value) - - def get_output_embeddings(self) -> nn.Module: - return self.lm_head - - def get_image_features( - self, - pixel_values: torch.FloatTensor, - vision_feature_layer: Optional[Union[int, list[int]]] = None, - vision_feature_select_strategy: Optional[str] = None, - **kwargs, - ): - return self.model.get_image_features( - pixel_values=pixel_values, - vision_feature_layer=vision_feature_layer, - vision_feature_select_strategy=vision_feature_select_strategy, - **kwargs, - ) - - @can_return_tuple - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - pixel_values: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[list[dict]] = None, - logits_to_keep: Union[int, torch.Tensor] = 0, - cache_position: Optional[torch.LongTensor] = None, - past_key_values: Optional[Cache] = None, - use_cache: Optional[bool] = None, - **kwargs, - ) -> Union[tuple[torch.FloatTensor], PPChart2TableCausalLMOutputWithPast]: - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - pixel_values=pixel_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - past_key_values=past_key_values, - cache_position=cache_position, - logits_to_keep=logits_to_keep, - **kwargs, - ) - hidden_states = outputs.last_hidden_state - - 
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep - logits = self.lm_head(hidden_states[:, slice_indices, :]) - - loss = None - if labels is not None: - loss = self.loss_function( - logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs - ) - - return PPChart2TableCausalLMOutputWithPast( - loss=loss, - logits=logits, - last_hidden_state=outputs.last_hidden_state, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - inputs_embeds=None, - pixel_values=None, - attention_mask=None, - cache_position=None, - logits_to_keep=None, - is_first_iteration=False, - **kwargs, - ): - # Overwritten -- in specific circumstances we don't want to forward image inputs to the model - - model_inputs = super().prepare_inputs_for_generation( - input_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - cache_position=cache_position, - logits_to_keep=logits_to_keep, - is_first_iteration=is_first_iteration, - **kwargs, - ) - - if is_first_iteration or not kwargs.get("use_cache", True): - # Pixel values are used only in the first iteration if available - # In subsquent iterations, they are already merged with text and cached - # NOTE: first iteration doesn't have to be prefill, it can be the first - # iteration with a question and cached system prompt (continue generate from cache) - model_inputs["pixel_values"] = pixel_values - - return model_inputs +class PPChart2TableForConditionalGeneration(GotOcr2ForConditionalGeneration): + pass __all__ = [ diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py index 1efa2641ff23..518fcb645770 100644 --- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py +++ 
b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py @@ -7,35 +7,13 @@ import torch -from transformers.feature_extraction_utils import BatchFeature -from transformers.processing_utils import ProcessorMixin -from transformers.utils import auto_docstring +from ...feature_extraction_utils import BatchFeature +from ...processing_utils import ProcessorMixin +from ...utils import auto_docstring -@auto_docstring( - custom_intro=""" - A multi-modal processor for the PPChart2Table model, combining image preprocessing and text tokenization - capabilities to handle chart-to-table conversion tasks. - - This processor integrates `PPChart2TableImageProcessorFast` for chart image preprocessing (e.g., patch-based - resizing) and `Qwen2Tokenizer` for text prompt construction/tokenization. It encapsulates the end-to-end - processing pipeline from raw chart images + text instructions to model-ready input tensors, and also provides - postprocessing logic to decode model outputs back to human-readable table text. - """ -) +@auto_docstring class PPChart2TableProcessor(ProcessorMixin): - r""" - [`PPChart2TableProcessor`] offers all the functionalities of [`PPChart2TableImageProcessorFast`] and [`Qwen2Tokenizer`]. See the - [`~PPChart2TableProcessor.__call__`] and [`~PPChart2TableProcessor.decode`] for more information. - Args: - image_processor ([`PPChart2TableImageProcessorFast`], *optional*): - The image processor is a required input. - tokenizer ([`Qwen2Tokenizer`], *optional*): - The tokenizer is a required input. - chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. 
- """ - image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py index 99d30d76c81d..28bac6ef8cea 100644 --- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py @@ -16,6 +16,8 @@ import gc import unittest +import requests + import pytest from parameterized import parameterized from PIL import Image @@ -71,8 +73,8 @@ def __init__( is_training=False, vision_config={ "depth": 2, - "embed_dim": 768, "hidden_size": 144, + "output_channels": 192, "hidden_act": "gelu", "image_size": 64, "num_channels": 3, @@ -84,8 +86,8 @@ def __init__( "use_rel_pos": True, "global_attn_indexes": [2, 5, 8, 11], "window_size": 14, - "output_channels": 256, - "net_channels": 512, + "neck_channels": 48, + "net_channels": 96, "attention_dropout": 0.0, }, bos_token_id=151643, @@ -260,9 +262,8 @@ def test_small_model_integration_test(self): "/workspace/model_weight_torch/PP-Chart2Table", dtype="float32" ).to("cuda") - image = Image.open( - "/workspace/PaddleX/paddlex/inference/models/doc_vlm/modeling/chart_parsing_02.png" - ).convert("RGB") + image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw) + inputs = self.processor(images=image).to(model.device) breakpoint() expected_input_ids_length = 286 From 3d8a654672f5cb7e72cf4e143996ca6fa6f488fa Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Fri, 13 Mar 2026 12:30:31 +0800 Subject: [PATCH 13/60] update --- .../configuration_pp_chart2table.py | 325 ++++---------- .../image_processing_pp_chart2table.py | 166 +------ .../pp_chart2table/modeling_pp_chart2table.py | 415 +----------------- .../pp_chart2table/modular_pp_chart2table.py | 375 ++-------------- .../processing_pp_chart2table.py | 41 +- 5 files changed, 187 insertions(+), 1135 
deletions(-) diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py index 7ecb1a2ddd31..f1c819a96417 100644 --- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py @@ -5,248 +5,95 @@ # modular_pp_chart2table.py file directly. One of our CI enforces this. # ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ -from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters +from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring +from ..auto import CONFIG_MAPPING, AutoConfig -@auto_docstring( - checkpoint="PaddlePaddle/PP-Chart2Table_safetensors", -) +@auto_docstring(checkpoint="facebook/sam-vit-huge") class PPChart2TableVisionConfig(PreTrainedConfig): - """ - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of transformer encoder layers in the vision backbone. - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the patch embedding vectors. - num_channels (`int`, *optional*, defaults to 3): - Number of input channels (3 for RGB images, 1 for grayscale). - image_size (`int`, *optional*, defaults to 1024): - Size (height/width) of the input images (assumed to be square). - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each transformer encoder layer. - patch_size (`int`, *optional*, defaults to 16): - Size (height/width) of the image patches extracted from the input image. - qkv_bias (`bool`, *optional*, defaults to `True`): - Whether to include bias terms in the query, key, value projection layers of self-attention. 
- use_rel_pos (`bool`, *optional*, defaults to `True`): - Whether to use relative positional embeddings in the self-attention mechanism. - global_attn_indexes (`Optional[list[int]]`, *optional*, defaults to [2, 5, 8, 11]): - List of layer indexes where global attention (instead of window attention) is applied. - If `None`, defaults to [2, 5, 8, 11]. - window_size (`int`, *optional*, defaults to 14): - Size of the attention window for window-based self-attention (only effective when use_rel_pos=True). + r""" output_channels (`int`, *optional*, defaults to 256): - Dimensionality of the final visual feature output channels. - attention_dropout (`float`, *optional*, defaults to 0.0): - Dropout probability applied to the attention weights. + Dimensionality of the output channels in the Patch Encoder. + window_size (`int`, *optional*, defaults to 14): + Window size for relative position. + use_abs_pos (`bool`, *optional*, defaults to `True`): + Whether to use absolute position embedding. + use_rel_pos (`bool`, *optional*, defaults to `True`): + Whether to use relative position embedding. + global_attn_indexes (`list[int]`, *optional*, defaults to `[2, 5, 8, 11]`): + The indexes of the global attention layers. + mlp_dim (`int`, *optional*, defaults to 3072): + The dimensionality of the MLP layer in the Transformer encoder. 
""" - model_type = "pp_chart2table_vision" base_config_key = "vision_config" def __init__( self, - num_hidden_layers=12, hidden_size=768, output_channels=256, - mlp_dim=3072, + num_hidden_layers=12, + num_attention_heads=12, num_channels=3, image_size=1024, - num_attention_heads=12, patch_size=16, + hidden_act="gelu", + layer_norm_eps=1e-06, + attention_dropout=0.0, + initializer_range=1e-10, qkv_bias=True, + use_abs_pos=True, use_rel_pos=True, - use_abs_pos=False, - global_attn_indexes=[2, 5, 8, 11], window_size=14, - attention_dropout=0.0, + global_attn_indexes=[2, 5, 8, 11], + mlp_dim=3072, **kwargs, ): - self.num_hidden_layers = num_hidden_layers - self.hidden_size = hidden_size - self.mlp_dim = mlp_dim - self.image_size = image_size - self.num_channels = num_channels - self.num_attention_heads = num_attention_heads - self.patch_size = patch_size - self.qkv_bias = qkv_bias - self.use_rel_pos = use_rel_pos - self.use_abs_pos = use_abs_pos - self.global_attn_indexes = global_attn_indexes - self.window_size = window_size - self.output_channels = output_channels - self.attention_dropout = attention_dropout super().__init__(**kwargs) - -@auto_docstring( - custom_intro=""" - - """, -) -class PPChart2TableTextConfig(PreTrainedConfig): - r""" - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities in self-attention layers. - bos_token_id (`int`, *optional*, defaults to 151643): - The token ID representing the beginning of a sequence (BOS) for text generation. - eos_token_id (`int`, *optional*, defaults to 151643): - The token ID representing the end of a sequence (EOS) for text generation. - pad_token_id (Optional[int], optional, *optional*, defaults to -1): - The index of the padding token. Defaults to -1. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the feed-forward and attention layers of the decoder. 
- hidden_size (`int`, *optional*, defaults to 1024): - Dimensionality of the hidden representations in the Transformer decoder layers. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - intermediate_size (`int`, *optional*, defaults to 2816): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer decoder blocks. - max_position_embeddings (`int`, *optional*, defaults to 32768): - The maximum sequence length that this model might ever be used with for text input/output. - num_attention_heads (`int`, *optional*, defaults to 16): - Number of attention heads for each self-attention layer in the Transformer decoder. - num_hidden_layers (`int`, *optional*, defaults to 24): - Number of hidden layers in the Transformer decoder. - num_key_value_heads (`int`, *optional*, defaults to 16): - Number of key/value heads for implementing Grouped Query Attention (GQA). If equal to `num_attention_heads`, - Multi Head Attention (MHA) is used; if 1, Multi Query Attention (MQA) is used. For more details, see - [this paper](https://huggingface.co/papers/2305.13245). - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon value used by the RMS normalization layers to avoid division by zero. - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE (Rotary Position Embedding) embeddings, controlling the frequency of positional encoding. - rope_parameters (`RopeParameters` or `dict`, *optional*): - Configuration parameters for RoPE embeddings, including scaling parameters for longer sequence lengths beyond - `max_position_embeddings`. - sliding_window (`int`, *optional*, defaults to 32768): - Window size for Sliding Window Attention (SWA) in the decoder layers (only active if `use_sliding_window=True`). 
- tie_word_embeddings (`bool`, *optional*, defaults to `True`): - Whether the model's input and output word embeddings should be tied (shared weights). - use_cache (`bool`, *optional*, defaults to `True`): - Whether to return the last key/value attention states to speed up sequential decoding (only relevant for autoregressive - generation). - vocab_size (`int`, *optional*, defaults to 151860): - Vocabulary size of the PPChart2TableText model. Defines the number of distinct tokens that can be represented - by `input_ids`. - layer_types (`list[str]`, *optional*): - Attention pattern for each decoder layer (e.g., `"full_attention"` or `"sliding_attention"`). If not specified, - automatically determined by `sliding_window`. - """ - - model_type = "pp_chart2table_text" - keys_to_ignore_at_inference = ["past_key_values"] - - # Default tensor parallel plan for base model `PPChart2TableText` - base_model_tp_plan = { - "layers.*.self_attn.q_proj": "colwise", - "layers.*.self_attn.k_proj": "colwise", - "layers.*.self_attn.v_proj": "colwise", - "layers.*.self_attn.o_proj": "rowwise", - "layers.*.mlp.gate_proj": "colwise", - "layers.*.mlp.up_proj": "colwise", - "layers.*.mlp.down_proj": "rowwise", - } - base_model_pp_plan = { - "embed_tokens": (["input_ids"], ["inputs_embeds"]), - "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), - "norm": (["hidden_states"], ["hidden_states"]), - } - base_config_key = "text_config" - - def __init__( - self, - attention_dropout: float = 0.0, - bos_token_id: int = 151643, - eos_token_id: int = 151643, - pad_token_id: int = -1, - hidden_act: str = "silu", - hidden_size: int = 1024, - initializer_range: float = 0.02, - intermediate_size: int = 2816, - max_position_embeddings: int = 32768, - num_attention_heads: int = 16, - num_hidden_layers: int = 24, - num_key_value_heads: int = 16, - rms_norm_eps: float = 1e-06, - rope_theta: float = 1000000.0, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - 
sliding_window: int = 32768, - tie_word_embeddings: bool = True, - use_cache: bool = True, - vocab_size: int = 151860, - layer_types: list[str] | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size - self.intermediate_size = intermediate_size + self.output_channels = output_channels self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads + self.num_channels = num_channels + self.image_size = image_size + self.patch_size = patch_size self.hidden_act = hidden_act + self.layer_norm_eps = layer_norm_eps + self.attention_dropout = attention_dropout self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache + self.qkv_bias = qkv_bias + self.use_abs_pos = use_abs_pos + self.use_rel_pos = use_rel_pos + self.window_size = window_size + self.global_attn_indexes = global_attn_indexes + self.mlp_dim = mlp_dim - self.attention_dropout = attention_dropout - self.layer_types = layer_types - if self.layer_types is None: - self.layer_types = [ - "sliding_attention" if self.sliding_window is not None else "full_attention" - for i in range(self.num_hidden_layers) - ] - layer_type_validation(self.layer_types, self.num_hidden_layers) +@auto_docstring +class PPChart2TableConfig(PreTrainedConfig): + r""" + Example: - self.rope_parameters = rope_parameters + ```python + >>> from transformers import PPChart2TableForConditionalGeneration, PPChart2TableConfig - self.rope_theta = rope_theta - self.tie_word_embeddings = tie_word_embeddings - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) + >>> # Initializing 
a PPChart2Table style configuration + >>> configuration = PPChart2TableConfig() + >>> # Initializing a model from the Qwen2-VL-7B style configuration + >>> model = PPChart2TableForConditionalGeneration(configuration) -@auto_docstring( - custom_intro=""" - - """ -) -class PPChart2TableConfig(PreTrainedConfig): - r""" - vision_config (Optional[Dict], optional, *optional*):: - The [PPChart2TableVisionConfig] for the vision sub-model. Defaults to None. - text_config (Optional[Dict], optional, *optional*):: - The [PPChart2TableTextConfig] for the text sub-model. Defaults to None. - image_token_index (Optional[int], optional, *optional*, defaults to 151859):: - The index of the image token. Defaults to 151859. - image_seq_length (Optional[int], optional, *optional*, defaults to 576):: - The sequence length for the image. Defaults to 576. - pad_token_id (Optional[int], optional, *optional*, defaults to -1): - The index of the padding token. Defaults to -1. - net_channels (`int`, *optional*, defaults to 512): - Dimensionality of intermediate network channels in the vision backbone. - output_channels (`int`, *optional*, defaults to 1024): - Dimensionality of intermediate network channels in the vision backbone. 
- """ + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" model_type = "pp_chart2table" attribute_map = { "image_token_id": "image_token_index", } - sub_configs = {"vision_config": PPChart2TableVisionConfig, "text_config": PPChart2TableTextConfig} + sub_configs = {"text_config": AutoConfig, "vision_config": PPChart2TableVisionConfig} def __init__( self, @@ -254,50 +101,48 @@ def __init__( text_config: dict | None = None, image_token_index: int | None = 151859, image_seq_length: int | None = 576, - pad_token_id: int | None = -1, - net_channels: int | None = 512, - output_channels: int | None = 1024, + tie_word_embeddings: bool | None = True, **kwargs, ): self.image_token_index = image_token_index self.image_seq_length = image_seq_length - self.pad_token_id = pad_token_id - self.net_channels = net_channels - self.output_channels = output_channels if vision_config is None: - vision_config = {} - self.vision_config = PPChart2TableVisionConfig(**vision_config) - - if text_config is None: - text_config = {} - self.text_config = PPChart2TableTextConfig(**text_config) - - text_config_keys = [ - "attention_dropout", - "bos_token_id", - "eos_token_id", - "hidden_act", - "hidden_size", - "initializer_range", - "intermediate_size", - "max_position_embeddings", - "num_attention_heads", - "num_hidden_layers", - "num_key_value_heads", - "rms_norm_eps", - "rope_theta", - "sliding_window", - "tie_word_embeddings", - "dtype", - "use_cache", - "vocab_size", - ] - for key in text_config_keys: - if hasattr(self.text_config, key): - setattr(self, key, getattr(self.text_config, key)) + self.vision_config = PPChart2TableVisionConfig() + elif isinstance(vision_config, dict): + self.vision_config = PPChart2TableVisionConfig(**vision_config) + elif isinstance(vision_config, PPChart2TableVisionConfig): + self.vision_config = vision_config + + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "qwen2") + text_config = 
CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["qwen2"]( + vocab_size=151860, + hidden_size=1024, + intermediate_size=2816, + num_hidden_layers=24, + num_attention_heads=16, + num_key_value_heads=16, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=tie_word_embeddings, + rope_theta=1000000.0, + rope_parameters=None, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=21, + attention_dropout=0.0, + ) + + self.text_config = text_config + self.tie_word_embeddings = tie_word_embeddings super().__init__(**kwargs) -__all__ = ["PPChart2TableConfig", "PPChart2TableVisionConfig", "PPChart2TableTextConfig"] +__all__ = ["PPChart2TableConfig"] diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py index 3f17cb754904..7bb7de6cc920 100644 --- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py @@ -4,158 +4,22 @@ # the file from the modular. If any change should be done, please apply the change to the # modular_pp_chart2table.py file directly. One of our CI enforces this. 
# ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ -from typing import Optional, Union +from ...processing_utils import ImagesKwargs -from transformers.feature_extraction_utils import BatchFeature -from transformers.image_processing_utils import BaseImageProcessor -from transformers.image_transforms import flip_channel_order, resize, to_channel_dimension_format -from transformers.image_utils import ( - ChannelDimension, - ImageInput, - PILImageResampling, - infer_channel_dimension_format, - make_flat_list_of_images, - to_numpy_array, - valid_images, - validate_preprocess_arguments, -) -from transformers.processing_utils import TensorType -from transformers.utils import filter_out_non_signature_kwargs - -class PPChart2TableImageProcessor(BaseImageProcessor): - r""" - Image processor for the PP-Chart2Table multimodal model, optimized for chart image preprocessing tasks. - - This processor handles the complete preprocessing pipeline for chart images, including resizing, rescaling, - normalization, and channel dimension reordering, tailored to the input requirements of the PP-Chart2Table vision encoder. - - Args: - do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the input images to the specified `size`. - size (`dict[str, int]`, *optional*, defaults to `{"height": 256, "width": 256}`): - Dictionary containing the target height and width for resizing. Format: `{"height": int, "width": int}`. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): - Resampling filter to use when resizing images (e.g., BICUBIC, BILINEAR). - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the pixel values from the range [0, 255] to [0, 1] using `rescale_factor`. 
- rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Factor to apply for rescaling pixel values (e.g., 1/255 scales 0-255 to 0-1). - do_normalize (`bool`, *optional*, defaults to `True`): - Whether to normalize the input images using `image_mean` and `image_std`. - image_mean (`float` or `list[float]`, *optional*, defaults to `[0.406, 0.456, 0.485]`): - Mean values for image normalization (per channel, RGB order). - image_std (`float` or `list[float]`, *optional*, defaults to `[0.225, 0.224, 0.229]`): - Standard deviation values for image normalization (per channel, RGB order). - patch_size (`int`, *optional*, defaults to 16): - Size of image patches used by the PP-Chart2Table vision encoder (for alignment with model input). - merge_size (`int`, *optional*, defaults to 4): - Size factor for merging image patches (specific to PP-Chart2Table's vision processing pipeline). +class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False): + """ + crop_to_patches (`bool`, *optional*, defaults to `False`): + Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the + `preprocess` method. + min_patches (`int`, *optional*, defaults to 1): + The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is + set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method. + max_patches (`int`, *optional*, defaults to 12): + The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is + set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method. 
""" - model_input_names = ["pixel_values"] - - def __init__( - self, - do_resize: bool = True, - size: Optional[dict[str, int]] = None, - resample: Optional[PILImageResampling] = PILImageResampling.BICUBIC, - do_rescale: bool = True, - rescale_factor: Union[int, float] = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, list[float]]] = [0.406, 0.456, 0.485], - image_std: Optional[Union[float, list[float]]] = [0.225, 0.224, 0.229], - patch_size: int = 16, - merge_size: int = 4, - **kwargs, - ) -> None: - super().__init__(**kwargs) - size = size if size is not None else {"height": 256, "width": 256} - - self.do_resize = do_resize - self.size = size - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.resample = resample - self.patch_size = patch_size - self.merge_size = merge_size - - @filter_out_non_signature_kwargs() - def preprocess( - self, - images: ImageInput, - size: Optional[dict[str, int]] = None, - do_resize: Optional[bool] = None, - resample: Optional[PILImageResampling] = None, - do_rescale: Optional[bool] = None, - rescale_factor: Optional[Union[int, float]] = None, - do_normalize: Optional[bool] = None, - image_mean: Optional[Union[float, list[float]]] = None, - image_std: Optional[Union[float, list[float]]] = None, - return_tensors: Optional[Union[TensorType, str]] = None, - data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> BatchFeature: - size = self.size if size is None else size - do_resize = self.do_resize if do_resize is None else do_resize - resample = self.resample if resample is None else resample - do_rescale = self.do_rescale if do_rescale is None else do_rescale - rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor - do_normalize = self.do_normalize if do_normalize is None else 
do_normalize - image_mean = self.image_mean if image_mean is None else image_mean - image_std = self.image_std if image_std is None else image_std - - images = make_flat_list_of_images(images) - - validate_preprocess_arguments( - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - size=size, - do_resize=do_resize, - resample=resample, - ) - - if not valid_images(images): - raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor") - - # All transformations expect numpy arrays - images = [to_numpy_array(image) for image in images] - if input_data_format is None: - input_data_format = infer_channel_dimension_format(images[0]) - - # transformations - resize_images = [] - if do_resize: - for image in images: - image = resize( - image, - size=(size["height"], size["width"]), - resample=resample, - input_data_format=input_data_format, - ) - resize_images.append(image) - images = resize_images - - if do_rescale: - images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] - - if do_normalize: - images = [ - self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images - ] - images = [flip_channel_order(image, input_data_format=input_data_format) for image in images] - images = [ - to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images - ] - - encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) - return encoded_inputs - - -__all__ = ["PPChart2TableImageProcessor"] + crop_to_patches: bool + min_patches: int + max_patches: int diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py index 033daab0bf44..5501710d3a48 100644 --- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py 
+++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py @@ -5,9 +5,7 @@ # modular_pp_chart2table.py file directly. One of our CI enforces this. # ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ import collections -from collections.abc import Callable from dataclasses import dataclass -from typing import Optional import torch import torch.nn as nn @@ -15,20 +13,17 @@ from ... import initialization as init from ...activations import ACT2FN -from ...cache_utils import Cache, DynamicCache +from ...cache_utils import Cache from ...generation import GenerationMixin -from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func -from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask -from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling, ModelOutput -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update -from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, torch_compilable_check -from ...utils.generic import maybe_autocast, merge_with_config_defaults +from ...utils.generic import merge_with_config_defaults from ...utils.output_capturing import capture_outputs -from .configuration_pp_chart2table import PPChart2TableConfig, PPChart2TableTextConfig, PPChart2TableVisionConfig +from ..auto import AutoModel +from .configuration_pp_chart2table import PPChart2TableConfig, PPChart2TableVisionConfig class PPChart2TableVisionAttention(nn.Module): @@ -445,388 +440,6 @@ def forward( ) -class 
PPChart2TableTextMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - return down_proj - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -@use_kernel_func_from_hub("rotary_pos_emb") -def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
- """ - cos = cos.unsqueeze(unsqueeze_dim) - sin = sin.unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -def eager_attention_forward( - module: nn.Module, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attention_mask: torch.Tensor | None, - scaling: float, - dropout: float = 0.0, - **kwargs: Unpack[TransformersKwargs], -): - key_states = repeat_kv(key, module.num_key_value_groups) - value_states = repeat_kv(value, module.num_key_value_groups) - - attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling - if attention_mask is not None: - attn_weights = attn_weights + attention_mask - - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) - attn_output = torch.matmul(attn_weights, value_states) - attn_output = attn_output.transpose(1, 2).contiguous() - - return attn_output, attn_weights - - -@use_kernelized_func(apply_rotary_pos_emb) -class PPChart2TableTextAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: PPChart2TableTextConfig, layer_idx: int): - super().__init__() - self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") 
else None - self.config = config - self.layer_idx = layer_idx - self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) - self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads - self.scaling = self.head_dim**-0.5 - self.attention_dropout = config.attention_dropout - self.is_causal = True - self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=True) - self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True) - self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True) - self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False) - self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None - - def forward( - self, - hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: torch.Tensor | None, - past_key_values: Cache | None = None, - **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple[torch.Tensor, torch.Tensor | None]: - input_shape = hidden_states.shape[:-1] - hidden_shape = (*input_shape, -1, self.head_dim) - - query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) - key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) - value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - - cos, sin = position_embeddings - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) - - if past_key_values is not None: - key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx) - - attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface( - self.config._attn_implementation, eager_attention_forward - ) - - attn_output, attn_weights = attention_interface( - self, - query_states, - key_states, - value_states, - attention_mask, - 
dropout=0.0 if not self.training else self.attention_dropout, - scaling=self.scaling, - sliding_window=self.sliding_window, # main diff with Llama - **kwargs, - ) - - attn_output = attn_output.reshape(*input_shape, -1).contiguous() - attn_output = self.o_proj(attn_output) - return attn_output, attn_weights - - -@use_kernel_forward_from_hub("RMSNorm") -class PPChart2TableTextRMSNorm(nn.Module): - def __init__(self, hidden_size, eps: float = 1e-6) -> None: - """ - PPChart2TableTextRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - def extra_repr(self): - return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" - - -class PPChart2TableTextDecoderLayer(GradientCheckpointingLayer): - def __init__(self, config: PPChart2TableTextConfig, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - self.self_attn = PPChart2TableTextAttention(config=config, layer_idx=layer_idx) - - self.mlp = PPChart2TableTextMLP(config) - self.input_layernorm = PPChart2TableTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = PPChart2TableTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.attention_type = config.layer_types[layer_idx] - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: torch.Tensor | None = None, - position_ids: torch.LongTensor | None = None, - past_key_values: Cache | None = None, - use_cache: bool | None = False, - position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, - **kwargs: Unpack[TransformersKwargs], - ) -> 
torch.Tensor: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - # Self Attention - hidden_states, _ = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - use_cache=use_cache, - position_embeddings=position_embeddings, - **kwargs, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - return hidden_states - - -@auto_docstring -class PPChart2TableTextPreTrainedModel(PreTrainedModel): - config: PPChart2TableTextConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["PPChart2TableTextDecoderLayer"] - _skip_keys_device_placement = ["past_key_values"] - _supports_flash_attn = True - _supports_sdpa = True - _supports_flex_attn = True - - _can_compile_fullgraph = True - _supports_attention_backend = True - _can_record_outputs = { - "hidden_states": PPChart2TableTextDecoderLayer, - "attentions": PPChart2TableTextAttention, - } - - -class PPChart2TableTextRotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: PPChart2TableTextConfig, device=None): - super().__init__() - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - - self.rope_type = self.config.rope_parameters["rope_type"] - rope_init_fn: Callable = self.compute_default_rope_parameters - if self.rope_type != "default": - rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = rope_init_fn(self.config, device) - - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False) - - @staticmethod - def 
compute_default_rope_parameters( - config: PPChart2TableTextConfig | None = None, - device: Optional["torch.device"] = None, - seq_len: int | None = None, - ) -> tuple["torch.Tensor", float]: - """ - Computes the inverse frequencies according to the original RoPE implementation - Args: - config ([`~transformers.PreTrainedConfig`]): - The model configuration. - device (`torch.device`): - The device to use for initialization of the inverse frequencies. - seq_len (`int`, *optional*): - The current sequence length. Unused for this type of RoPE. - Returns: - Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the - post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). - """ - base = config.rope_parameters["rope_theta"] - dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads - - attention_factor = 1.0 # Unused in this type of RoPE - - # Compute the inverse frequencies - inv_freq = 1.0 / ( - base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) - ) - return inv_freq, attention_factor - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - -@auto_docstring -class PPChart2TableTextModel(PPChart2TableTextPreTrainedModel): - def __init__(self, config: PPChart2TableTextConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList( - [PPChart2TableTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self.norm = PPChart2TableTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.rotary_emb = PPChart2TableTextRotaryEmbedding(config=config) - self.gradient_checkpointing = False - self.has_sliding_layers = "sliding_attention" in self.config.layer_types - - # Initialize weights and apply final processing - self.post_init() - - @merge_with_config_defaults - @capture_outputs - @auto_docstring - def forward( - self, - input_ids: torch.LongTensor | None = None, - attention_mask: torch.Tensor | None = None, - position_ids: torch.LongTensor | None = None, - past_key_values: Cache | None = None, - inputs_embeds: torch.FloatTensor | None = None, - use_cache: bool | None = None, - **kwargs: Unpack[TransformersKwargs], - ) -> BaseModelOutputWithPast: - if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError("You must 
specify exactly one of input_ids or inputs_embeds") - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if use_cache and past_key_values is None: - past_key_values = DynamicCache(config=self.config) - - if position_ids is None: - past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 - position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens - position_ids = position_ids.unsqueeze(0) - - # It may already have been prepared by e.g. `generate` - if not isinstance(causal_mask_mapping := attention_mask, dict): - # Prepare mask arguments - mask_kwargs = { - "config": self.config, - "inputs_embeds": inputs_embeds, - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "position_ids": position_ids, - } - # Create the masks - causal_mask_mapping = { - "full_attention": create_causal_mask(**mask_kwargs), - } - # The sliding window alternating layers are not always activated depending on the config - if self.has_sliding_layers: - causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs) - - hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) - - for decoder_layer in self.layers[: self.config.num_hidden_layers]: - hidden_states = decoder_layer( - hidden_states, - attention_mask=causal_mask_mapping[decoder_layer.attention_type], - position_embeddings=position_embeddings, - position_ids=position_ids, - past_key_values=past_key_values, - use_cache=use_cache, - **kwargs, - ) - - hidden_states = self.norm(hidden_states) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=past_key_values if use_cache else None, - ) - - @dataclass class PPChart2TableModelOutputWithPast(BaseModelOutputWithPast): r""" @@ -879,22 +492,25 @@ def __init__(self, config: PPChart2TableConfig): super().__init__(config) self.vision_tower = 
PPChart2TableVisionEncoder(config.vision_config) self.multi_modal_projector = nn.Linear(config.output_channels, config.text_config.hidden_size) - self.language_model = PPChart2TableTextModel._from_config(config.text_config) + self.language_model = AutoModel.from_config(config.text_config) self.vision_downsample1 = nn.Conv2d( - config.vision_config.output_channels, config.net_channels, kernel_size=3, stride=2, padding=1, bias=False + config.vision_config.output_channels, + config.vision_hidden_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False, ) self.vision_downsample2 = nn.Conv2d( - config.net_channels, config.output_channels, kernel_size=3, stride=2, padding=1, bias=False + config.vision_hidden_channels, config.output_channels, kernel_size=3, stride=2, padding=1, bias=False ) self.post_init() def get_input_embeddings(self): - """Get input embeddings from the text decoder (for weight tying/loading).""" - return self.language_model.embed_tokens + return self.language_model.get_input_embeddings() def set_input_embeddings(self, value): - """Set input embeddings for the text decoder (for weight tying/loading).""" - self.language_model.embed_tokens = value + self.language_model.set_input_embeddings(value) @can_return_tuple @auto_docstring( @@ -1201,7 +817,6 @@ def prepare_inputs_for_generation( "PPChart2TableModel", "PPChart2TablePreTrainedModel", "PPChart2TableTextPreTrainedModel", - "PPChart2TableTextModel", "PPChart2TableVisionPreTrainedModel", "PPChart2TableVisionModel", ] diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index bfc376d0a87a..5aac8fe62ccf 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -5,10 +5,10 @@ import torch.nn as nn import torchvision.transforms.v2.functional as tvF -from ...configuration_utils import PreTrainedConfig, 
layer_type_validation from ...feature_extraction_utils import BatchFeature from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images -from ...modeling_rope_utils import RopeParameters + +from ..got_ocr2.configuration_got_ocr2 import GotOcr2Config from ..got_ocr2.modeling_got_ocr2 import ( GotOcr2ModelOutputWithPast, GotOcr2Model, @@ -16,15 +16,8 @@ GotOcr2ForConditionalGeneration, GotOcr2VisionEncoder, ) -from ..qwen2.modeling_qwen2 import ( - Qwen2Model, - Qwen2PreTrainedModel, -) -from ...utils import ( - auto_docstring, - logging, - TransformersKwargs, -) + +from ...utils import TransformersKwargs, auto_docstring, logging from ...modeling_outputs import BaseModelOutputWithPooling from ...processing_utils import ProcessorMixin, TensorType, Unpack @@ -33,298 +26,13 @@ logger = logging.get_logger(__name__) -@auto_docstring(checkpoint="PaddlePaddle/PP-Chart2Table_safetensors",) -class PPChart2TableVisionConfig(PreTrainedConfig): - """ - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of transformer encoder layers in the vision backbone. - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the patch embedding vectors. - num_channels (`int`, *optional*, defaults to 3): - Number of input channels (3 for RGB images, 1 for grayscale). - image_size (`int`, *optional*, defaults to 1024): - Size (height/width) of the input images (assumed to be square). - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each transformer encoder layer. - patch_size (`int`, *optional*, defaults to 16): - Size (height/width) of the image patches extracted from the input image. - qkv_bias (`bool`, *optional*, defaults to `True`): - Whether to include bias terms in the query, key, value projection layers of self-attention. - use_rel_pos (`bool`, *optional*, defaults to `True`): - Whether to use relative positional embeddings in the self-attention mechanism. 
- global_attn_indexes (`Optional[list[int]]`, *optional*, defaults to [2, 5, 8, 11]): - List of layer indexes where global attention (instead of window attention) is applied. - If `None`, defaults to [2, 5, 8, 11]. - window_size (`int`, *optional*, defaults to 14): - Size of the attention window for window-based self-attention (only effective when use_rel_pos=True). - output_channels (`int`, *optional*, defaults to 256): - Dimensionality of the final visual feature output channels. - attention_dropout (`float`, *optional*, defaults to 0.0): - Dropout probability applied to the attention weights. - """ - - model_type = "pp_chart2table_vision" - base_config_key = "vision_config" - - def __init__( - self, - num_hidden_layers=12, - hidden_size=768, - output_channels=256, - mlp_dim=3072, - num_channels=3, - image_size=1024, - num_attention_heads=12, - patch_size=16, - qkv_bias=True, - use_rel_pos=True, - use_abs_pos=True, - global_attn_indexes=[2, 5, 8, 11], - window_size=14, - attention_dropout=0.0, - **kwargs, - ): - self.num_hidden_layers = num_hidden_layers - self.hidden_size = hidden_size - self.mlp_dim=mlp_dim - self.image_size = image_size - self.num_channels = num_channels - self.num_attention_heads = num_attention_heads - self.patch_size = patch_size - self.qkv_bias = qkv_bias - self.use_rel_pos = use_rel_pos - self.use_abs_pos = use_abs_pos - self.global_attn_indexes = global_attn_indexes - self.window_size = window_size - self.output_channels = output_channels - self.attention_dropout = attention_dropout - super().__init__(**kwargs) - - -@auto_docstring( - custom_intro=""" - - """, - -) -class PPChart2TableTextConfig(PreTrainedConfig): - r""" - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities in self-attention layers. - bos_token_id (`int`, *optional*, defaults to 151643): - The token ID representing the beginning of a sequence (BOS) for text generation. 
- eos_token_id (`int`, *optional*, defaults to 151643): - The token ID representing the end of a sequence (EOS) for text generation. - pad_token_id (Optional[int], optional, *optional*, defaults to -1): - The index of the padding token. Defaults to -1. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the feed-forward and attention layers of the decoder. - hidden_size (`int`, *optional*, defaults to 1024): - Dimensionality of the hidden representations in the Transformer decoder layers. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - intermediate_size (`int`, *optional*, defaults to 2816): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer decoder blocks. - max_position_embeddings (`int`, *optional*, defaults to 32768): - The maximum sequence length that this model might ever be used with for text input/output. - num_attention_heads (`int`, *optional*, defaults to 16): - Number of attention heads for each self-attention layer in the Transformer decoder. - num_hidden_layers (`int`, *optional*, defaults to 24): - Number of hidden layers in the Transformer decoder. - num_key_value_heads (`int`, *optional*, defaults to 16): - Number of key/value heads for implementing Grouped Query Attention (GQA). If equal to `num_attention_heads`, - Multi Head Attention (MHA) is used; if 1, Multi Query Attention (MQA) is used. For more details, see - [this paper](https://huggingface.co/papers/2305.13245). - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon value used by the RMS normalization layers to avoid division by zero. - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE (Rotary Position Embedding) embeddings, controlling the frequency of positional encoding. 
- rope_parameters (`RopeParameters` or `dict`, *optional*): - Configuration parameters for RoPE embeddings, including scaling parameters for longer sequence lengths beyond - `max_position_embeddings`. - sliding_window (`int`, *optional*, defaults to 32768): - Window size for Sliding Window Attention (SWA) in the decoder layers (only active if `use_sliding_window=True`). - tie_word_embeddings (`bool`, *optional*, defaults to `True`): - Whether the model's input and output word embeddings should be tied (shared weights). - use_cache (`bool`, *optional*, defaults to `True`): - Whether to return the last key/value attention states to speed up sequential decoding (only relevant for autoregressive - generation). - vocab_size (`int`, *optional*, defaults to 151860): - Vocabulary size of the PPChart2TableText model. Defines the number of distinct tokens that can be represented - by `input_ids`. - layer_types (`list[str]`, *optional*): - Attention pattern for each decoder layer (e.g., `"full_attention"` or `"sliding_attention"`). If not specified, - automatically determined by `sliding_window`. 
- """ - - model_type = "pp_chart2table_text" - keys_to_ignore_at_inference = ["past_key_values"] - - # Default tensor parallel plan for base model `PPChart2TableText` - base_model_tp_plan = { - "layers.*.self_attn.q_proj": "colwise", - "layers.*.self_attn.k_proj": "colwise", - "layers.*.self_attn.v_proj": "colwise", - "layers.*.self_attn.o_proj": "rowwise", - "layers.*.mlp.gate_proj": "colwise", - "layers.*.mlp.up_proj": "colwise", - "layers.*.mlp.down_proj": "rowwise", - } - base_model_pp_plan = { - "embed_tokens": (["input_ids"], ["inputs_embeds"]), - "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), - "norm": (["hidden_states"], ["hidden_states"]), - } - base_config_key = "text_config" - - def __init__( - self, - attention_dropout: float = 0.0, - bos_token_id: int = 151643, - eos_token_id: int = 151643, - pad_token_id: int = -1, - hidden_act: str = "silu", - hidden_size: int = 1024, - initializer_range: float = 0.02, - intermediate_size: int = 2816, - max_position_embeddings: int = 32768, - num_attention_heads: int = 16, - num_hidden_layers: int = 24, - num_key_value_heads: int = 16, - rms_norm_eps: float = 1e-06, - rope_theta: float = 1000000.0, - rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - sliding_window: int = 32768, - tie_word_embeddings: bool = True, - use_cache: bool = True, - vocab_size: int = 151860, - layer_types: Optional[list[str]] = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - 
self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - - self.attention_dropout = attention_dropout - - self.layer_types = layer_types - if self.layer_types is None: - self.layer_types = [ - "sliding_attention" if self.sliding_window is not None else "full_attention" - for i in range(self.num_hidden_layers) - ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - - self.rope_parameters = rope_parameters - - self.rope_theta = rope_theta - self.tie_word_embeddings = tie_word_embeddings - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - -@auto_docstring( - custom_intro=""" - - """ -) -class PPChart2TableConfig(PreTrainedConfig): - r""" - vision_config (Optional[Dict], optional, *optional*):: - The [PPChart2TableVisionConfig] for the vision sub-model. Defaults to None. - text_config (Optional[Dict], optional, *optional*):: - The [PPChart2TableTextConfig] for the text sub-model. Defaults to None. - image_token_index (Optional[int], optional, *optional*, defaults to 151859):: - The index of the image token. Defaults to 151859. - image_seq_length (Optional[int], optional, *optional*, defaults to 576):: - The sequence length for the image. Defaults to 576. - pad_token_id (Optional[int], optional, *optional*, defaults to -1): - The index of the padding token. Defaults to -1. - net_channels (`int`, *optional*, defaults to 512): - Dimensionality of intermediate network channels in the vision backbone. - output_channels (`int`, *optional*, defaults to 1024): - Dimensionality of intermediate network channels in the vision backbone. 
- """ - - model_type = "pp_chart2table" - attribute_map = { - "image_token_id": "image_token_index", - } - sub_configs = {"vision_config": PPChart2TableVisionConfig, "text_config": PPChart2TableTextConfig} - - def __init__( - self, - vision_config: dict | None = None, - text_config: dict | None = None, - image_token_index: Optional[int] = 151859, - image_seq_length: Optional[int] = 576, - pad_token_id: Optional[int] = -1, - net_channels: Optional[int] = 512, - output_channels: Optional[int] = 1024, - **kwargs, - ): - self.image_token_index = image_token_index - self.image_seq_length = image_seq_length - self.pad_token_id = pad_token_id - self.net_channels = net_channels - self.output_channels = output_channels - - if vision_config is None: - vision_config = {} - self.vision_config = PPChart2TableVisionConfig(**vision_config) - - if text_config is None: - text_config = {} - self.text_config = PPChart2TableTextConfig(**text_config) - - text_config_keys = [ - "attention_dropout", - "bos_token_id", - "eos_token_id", - "hidden_act", - "hidden_size", - "initializer_range", - "intermediate_size", - "max_position_embeddings", - "num_attention_heads", - "num_hidden_layers", - "num_key_value_heads", - "rms_norm_eps", - "rope_theta", - "sliding_window", - "tie_word_embeddings", - "dtype", - "use_cache", - "vocab_size", - ] - for key in text_config_keys: - if hasattr(self.text_config, key): - setattr(self, key, getattr(self.text_config, key)) - - super().__init__(**kwargs) +@auto_docstring +class PPChart2TableConfig(GotOcr2Config): + pass @auto_docstring class PPChart2TableImageProcessorFast(BaseImageProcessorFast): - resample = 3 image_mean = [0.40821073, 0.4578275, 0.48145466] image_std = [0.27577711, 0.26130258, 0.26862954] @@ -384,6 +92,14 @@ class PPChart2TableProcessor(ProcessorMixin): def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): super().__init__(image_processor, tokenizer, chat_template=chat_template) + self.message_start_token 
= "<|im_start|>" + self.message_end_token = "<|im_end|>" + self.img_start_token = "" + self.img_end_token = "" + self.img_pad_token = "" + self.image_token = "" # keep the above for BC, but we need to call it `image_token` + self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token) + self.system_query = "system\nYou should follow the instructions carefully and explain your answers in detail." def __call__( self, @@ -395,18 +111,31 @@ def __call__( image_inputs = self.image_processor(images=images, return_tensors="pt") else: image_inputs = {} - img_cnt = len(image_inputs) + image_count = len(image_inputs) _, _, height, _ = image_inputs["pixel_values"].shape num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size - prompt = ( - "<|im_start|>system\n" - "You should follow the instructions carefully and explain your answers in detail.<|im_end|><|im_start|>user\n" - "" + "" * (num_patches * num_patches) + "\n" - "Chart to table<|im_end|><|im_start|>assistant\n" - ) - input_ids = torch.tensor(self.tokenizer([prompt]).input_ids) - input_ids = input_ids.repeat(img_cnt, 1) - input_ids = {"input_ids": input_ids} + + input_ids = {"input_ids": None} + if text == None: + query = "Chart to table" + prompt = ( + self.message_start_token + + self.system_query + + self.message_end_token + + self.message_start_token + + "user\n" + + self.img_start_token + + self.img_pad_token * num_patches * num_patches + + self.img_end_token + + "\n" + + query + + self.message_end_token + + self.message_start_token + + "assistant\n" + ) + input_ids = torch.tensor(self.tokenizer([prompt]).input_ids) + input_ids = input_ids.repeat(image_count, 1) + input_ids = {"input_ids": input_ids} return BatchFeature(data={**input_ids, **image_inputs}) def postprocess(self, model_pred, **kwargs): @@ -425,16 +154,6 @@ class PPChart2TableVisionEncoder(GotOcr2VisionEncoder, PPChart2TableVisionPreTra pass - -@auto_docstring -class 
PPChart2TableTextPreTrainedModel(Qwen2PreTrainedModel): - pass - - -class PPChart2TableTextModel(Qwen2Model): - pass - - @dataclass class PPChart2TableModelOutputWithPast(GotOcr2ModelOutputWithPast): pass @@ -445,22 +164,13 @@ class PPChart2TableModel(GotOcr2Model): def __init__(self, config: PPChart2TableConfig): super().__init__(config) - self.vision_downsample1 = nn.Conv2d(config.vision_config.output_channels, config.net_channels, kernel_size=3, stride=2, padding=1, bias=False) - self.vision_downsample2 = nn.Conv2d(config.net_channels, config.output_channels, kernel_size=3, stride=2, padding=1, bias=False) - self.language_model = PPChart2TableTextModel._from_config(config.text_config) + self.vision_downsample1 = nn.Conv2d(config.vision_config.output_channels, config.vision_hidden_channels, kernel_size=3, stride=2, padding=1, bias=False) + self.vision_downsample2 = nn.Conv2d(config.vision_hidden_channels, config.output_channels, kernel_size=3, stride=2, padding=1, bias=False) self.multi_modal_projector = nn.Linear(config.output_channels, config.text_config.hidden_size) # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - """Get input embeddings from the text decoder (for weight tying/loading).""" - return self.language_model.embed_tokens - - def set_input_embeddings(self, value): - """Set input embeddings for the text decoder (for weight tying/loading).""" - self.language_model.embed_tokens = value - def get_image_features( self, pixel_values: torch.FloatTensor, @@ -492,11 +202,8 @@ class PPChart2TableForConditionalGeneration(GotOcr2ForConditionalGeneration): "PPChart2TablePreTrainedModel", "PPChart2TableConfig", "PPChart2TableTextPreTrainedModel", - "PPChart2TableTextModel", "PPChart2TableVisionPreTrainedModel", "PPChart2TableVisionModel", - "PPChart2TableVisionConfig", - "PPChart2TableTextConfig", "PPChart2TableImageProcessorFast", "PPChart2TableProcessor", -] +] \ No newline at end of file diff --git 
a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py index 518fcb645770..b4f492fb6191 100644 --- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py @@ -19,6 +19,14 @@ class PPChart2TableProcessor(ProcessorMixin): def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): super().__init__(image_processor, tokenizer, chat_template=chat_template) + self.message_start_token = "<|im_start|>" + self.message_end_token = "<|im_end|>" + self.img_start_token = "" + self.img_end_token = "" + self.img_pad_token = "" + self.image_token = "" # keep the above for BC, but we need to call it `image_token` + self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token) + self.system_query = "system\nYou should follow the instructions carefully and explain your answers in detail." def __call__( self, @@ -30,18 +38,31 @@ def __call__( image_inputs = self.image_processor(images=images, return_tensors="pt") else: image_inputs = {} - img_cnt = len(image_inputs) + image_count = len(image_inputs) _, _, height, _ = image_inputs["pixel_values"].shape num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size - prompt = ( - "<|im_start|>system\n" - "You should follow the instructions carefully and explain your answers in detail.<|im_end|><|im_start|>user\n" - "" + "" * (num_patches * num_patches) + "\n" - "Chart to table<|im_end|><|im_start|>assistant\n" - ) - input_ids = torch.tensor(self.tokenizer([prompt]).input_ids) - input_ids = input_ids.repeat(img_cnt, 1) - input_ids = {"input_ids": input_ids} + + input_ids = {"input_ids": None} + if text == None: + query = "Chart to table" + prompt = ( + self.message_start_token + + self.system_query + + self.message_end_token + + self.message_start_token + + "user\n" + + self.img_start_token + + 
self.img_pad_token * num_patches * num_patches + + self.img_end_token + + "\n" + + query + + self.message_end_token + + self.message_start_token + + "assistant\n" + ) + input_ids = torch.tensor(self.tokenizer([prompt]).input_ids) + input_ids = input_ids.repeat(image_count, 1) + input_ids = {"input_ids": input_ids} return BatchFeature(data={**input_ids, **image_inputs}) def postprocess(self, model_pred, **kwargs): From 4abb70d4bd10f9b1dfde45b41b4b58b0c1426a97 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Fri, 13 Mar 2026 18:18:37 +0800 Subject: [PATCH 14/60] update --- docs/source/en/model_doc/pp_chart2table.md | 63 +-- .../models/auto/image_processing_auto.py | 2 +- .../models/pp_chart2table/__init__.py | 3 +- .../configuration_pp_chart2table.py | 30 +- .../image_processing_pp_chart2table_fast.py | 16 +- .../pp_chart2table/modeling_pp_chart2table.py | 23 +- .../pp_chart2table/modular_pp_chart2table.py | 91 ++-- .../processing_pp_chart2table.py | 55 +- .../test_modeling_pp_chart2table.py | 485 +++++++----------- 9 files changed, 371 insertions(+), 397 deletions(-) diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md index ad00bec20f84..339746ae6cc9 100644 --- a/docs/source/en/model_doc/pp_chart2table.md +++ b/docs/source/en/model_doc/pp_chart2table.md @@ -1,4 +1,4 @@ - +*This model was released on {release_date} and added to Hugging Face Transformers on 2026-03-16.* # PP-Chart2Table From cc85b83a9a1913717eba52bed6caad0e28041449 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Mon, 16 Mar 2026 17:57:32 +0800 Subject: [PATCH 24/60] update --- .../pp_chart2table/modeling_pp_chart2table.py | 28 ------------------- .../test_modeling_pp_chart2table.py | 10 ++++--- 2 files changed, 6 insertions(+), 32 deletions(-) diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py index 76b164ebe4aa..a9f48a6cf6a1 100644 --- 
a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py @@ -578,18 +578,8 @@ def forward( past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, use_cache: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | PPChart2TableModelOutputWithPast: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -612,10 +602,7 @@ def forward( past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, return_dict=True, - cache_position=cache_position, **kwargs, ) @@ -706,10 +693,6 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, use_cache: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], ) -> tuple | PPChart2TableCausalLMOutputWithPast: @@ -749,12 +732,6 @@ def forward( "You should keep in mind what features from the module should be used, especially when you're planning to sell a template." 
```""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.model( input_ids=input_ids, pixel_values=pixel_values, @@ -763,10 +740,7 @@ def forward( past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, return_dict=True, - cache_position=cache_position, logits_to_keep=logits_to_keep, **kwargs, ) @@ -798,7 +772,6 @@ def prepare_inputs_for_generation( inputs_embeds=None, pixel_values=None, attention_mask=None, - cache_position=None, logits_to_keep=None, is_first_iteration=False, **kwargs, @@ -810,7 +783,6 @@ def prepare_inputs_for_generation( past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, - cache_position=cache_position, logits_to_keep=logits_to_keep, is_first_iteration=is_first_iteration, **kwargs, diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py index 836de2801907..48ce924d5c97 100644 --- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py @@ -164,11 +164,15 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - @unittest.skip(reason="PPChart2Table have reused the GotOcr2 model, which does not implement the latest logic for capturing attentions and hidden_states introduced in Transformers v5.") + @unittest.skip( + reason="PPChart2Table have reused the GotOcr2 model, which does not implement the latest logic for capturing attentions and hidden_states introduced in Transformers v5." 
+ ) def test_get_image_features_attentions(self): pass - @unittest.skip(reason="PPChart2Table have reused the GotOcr2 model, which does not implement the latest logic for capturing attentions and hidden_states introduced in Transformers v5.") + @unittest.skip( + reason="PPChart2Table have reused the GotOcr2 model, which does not implement the latest logic for capturing attentions and hidden_states introduced in Transformers v5." + ) def test_get_image_features_hidden_states(self): pass @@ -191,7 +195,6 @@ def tearDown(self): @slow def test_small_model_integration_test_pp_chart2table(self): - inputs = self.processor(self.image, return_tensors="pt").to(torch_device) generate_ids = self.model.generate( **inputs, @@ -207,7 +210,6 @@ def test_small_model_integration_test_pp_chart2table(self): @slow def test_small_model_integration_test_pp_chart2table_batched(self): - inputs = self.processor([self.image, self.image], return_tensors="pt").to(torch_device) generate_ids = self.model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=6) decoded_output = self.processor.batch_decode( From b4197321624a8a18924fac271e0339285a96aa83 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Mon, 16 Mar 2026 18:09:59 +0800 Subject: [PATCH 25/60] update --- docs/source/en/_toctree.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 0fe76d0d70c8..e5f24f55962a 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -1268,12 +1268,12 @@ title: PP-DocLayoutV2 - local: model_doc/pp_doclayout_v3 title: PP-DocLayoutV3 - - local: model_doc/pp_chart2table - title: PPChart2Table - local: model_doc/pp_ocrv5_mobile_det title: PP-OCRv5_mobile_det - local: model_doc/pp_ocrv5_server_det title: PP-OCRv5_server_det + - local: model_doc/pp_chart2table + title: PPChart2Table - local: model_doc/pp_lcnet title: PPLCNet - local: model_doc/pp_lcnet_v3 From cc8bbca9452acc0eae95a1cadb9edf5ca5914c56 
Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Tue, 17 Mar 2026 12:52:27 +0800 Subject: [PATCH 26/60] update --- docs/source/en/model_doc/pp_chart2table.md | 20 ++----- .../models/auto/tokenization_auto.py | 1 - .../configuration_pp_chart2table.py | 16 +----- .../pp_chart2table/modeling_pp_chart2table.py | 45 +++++++++------ .../pp_chart2table/modular_pp_chart2table.py | 57 ++----------------- .../processing_pp_chart2table.py | 2 +- 6 files changed, 39 insertions(+), 102 deletions(-) diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md index f2b0296d8e66..82f92bcdfb90 100644 --- a/docs/source/en/model_doc/pp_chart2table.md +++ b/docs/source/en/model_doc/pp_chart2table.md @@ -71,15 +71,14 @@ from transformers import AutoModelForImageTextToText, AutoProcessor model_path = "PaddlePaddle/PP-Chart2Table_safetensors" model = AutoModelForImageTextToText.from_pretrained( model_path, - dtype="float32", device_map="auto", ) -processor = AutoProcessor.from_pretrained(model_path, use_fast=True).to(model.device) +processor = AutoProcessor.from_pretrained(model_path) image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw) -inputs = processor(images=image) +inputs = processor(images=image).to(model.device) -generated_ids = model.generate(**inputs, use_cache=True, do_sample=False, max_new_tokens=256) +generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=256) generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)] result = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False) print(result) @@ -128,13 +127,12 @@ from transformers import AutoModelForImageTextToText, AutoProcessor model_path = "PaddlePaddle/PP-Chart2Table_safetensors" model = AutoModelForImageTextToText.from_pretrained( model_path, - dtype="float32", 
device_map="auto", ) -processor = AutoProcessor.from_pretrained(model_path).to(model.device) +processor = AutoProcessor.from_pretrained(model_path) image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw) -inputs = processor(images=[image, image]) +inputs = processor(images=[image, image]).to(model.device) generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=256) generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)] @@ -157,14 +155,6 @@ print(result) [[autodoc]] PPChart2TableConfig -## PPChart2TableVisionPreTrainedModel - -[[autodoc]] PPChart2TableVisionPreTrainedModel - -## PPChart2TablePreTrainedModel - -[[autodoc]] PPChart2TablePreTrainedModel - ## PPChart2TableImageProcessorFast [[autodoc]] PPChart2TableImageProcessorFast diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 21d259dca36b..35693d4c082e 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -255,7 +255,6 @@ else ("TokenizersBackend" if is_tokenizers_available() else None), ), ("plbart", "PLBartTokenizer" if is_tokenizers_available() else None), - ("pp_chart2table", "TokenizersBackend" if is_tokenizers_available() else None), ("prophetnet", "ProphetNetTokenizer"), ("qdqbert", "BertTokenizer" if is_tokenizers_available() else None), ("qwen2", "Qwen2Tokenizer" if is_tokenizers_available() else None), diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py index d3556596b107..107c077798a9 100644 --- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py @@ -84,17 +84,7 @@ def __init__( self.mlp_dim = mlp_dim 
-@auto_docstring( - checkpoint="PaddlePaddle/PP-Chart2Table_safetensors", - custom_args=r""" - output_channels (`int`, *optional*, defaults to 1024): - Dimensionality of the output channels from the vision encoder. This is the final channel count - after the vision downsample layers, which is then projected to the text model hidden size. - vision_hidden_channels (`int`, *optional*, defaults to 512): - Dimensionality of the intermediate hidden channels in the vision encoder. This is the channel - count between the first and second downsample layers. - """, -) +@auto_docstring(checkpoint="PaddlePaddle/PP-Chart2Table_safetensors") class PPChart2TableConfig(PreTrainedConfig): r""" Example: @@ -125,12 +115,8 @@ def __init__( image_token_index: int | None = 151859, image_seq_length: int | None = 576, tie_word_embeddings: bool | None = True, - output_channels: int = 1024, - vision_hidden_channels: int = 512, **kwargs, ): - self.output_channels = output_channels - self.vision_hidden_channels = vision_hidden_channels self.image_token_index = image_token_index self.image_seq_length = image_seq_length diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py index a9f48a6cf6a1..0c0a2c83e006 100644 --- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py @@ -496,6 +496,27 @@ def _init_weights(self, module): init.zeros_(module.pos_embed) +class PPChart2TableMultiModalProjector(nn.Module): + def __init__(self, config: PPChart2TableConfig): + super().__init__() + vision_output_channels = config.vision_config.output_channels + language_hidden_size = config.text_config.hidden_size + self.conv_upsampler1 = nn.Conv2d( + vision_output_channels, vision_output_channels * 2, kernel_size=3, stride=2, padding=1, bias=False + ) + self.conv_upsampler2 = nn.Conv2d( + vision_output_channels * 2, language_hidden_size, 
kernel_size=3, stride=2, padding=1, bias=False + ) + self.multimodal_projector = nn.Linear(language_hidden_size, language_hidden_size) + + def forward(self, vision_embeddings: torch.Tensor) -> torch.Tensor: + hidden_state = self.conv_upsampler1(vision_embeddings) + hidden_state = self.conv_upsampler2(hidden_state) + hidden_state = hidden_state.flatten(2).permute(0, 2, 1) + hidden_state = self.multimodal_projector(hidden_state) + return hidden_state + + @auto_docstring class PPChart2TableModel(PPChart2TablePreTrainedModel): _checkpoint_conversion_mapping = { @@ -505,19 +526,9 @@ class PPChart2TableModel(PPChart2TablePreTrainedModel): def __init__(self, config: PPChart2TableConfig): super().__init__(config) self.vision_tower = PPChart2TableVisionEncoder(config.vision_config) - self.multi_modal_projector = nn.Linear(config.output_channels, config.text_config.hidden_size) + + self.multi_modal_projector = PPChart2TableMultiModalProjector(config) self.language_model = AutoModel.from_config(config.text_config) - self.vision_downsample1 = nn.Conv2d( - config.vision_config.output_channels, - config.vision_hidden_channels, - kernel_size=3, - stride=2, - padding=1, - bias=False, - ) - self.vision_downsample2 = nn.Conv2d( - config.vision_hidden_channels, config.output_channels, kernel_size=3, stride=2, padding=1, bias=False - ) self.post_init() def get_input_embeddings(self): @@ -535,13 +546,11 @@ def get_image_features( pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: - image_output = self.vision_tower(pixel_values) - last_hidden_state = image_output.last_hidden_state - last_hidden_state = self.vision_downsample1(last_hidden_state) - last_hidden_state = self.vision_downsample2(last_hidden_state) - image_output.pooler_output = self.multi_modal_projector(last_hidden_state.flatten(2).transpose(2, 1)) + image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs) + last_hidden_state = 
image_outputs.last_hidden_state + image_outputs.pooler_output = self.multi_modal_projector(last_hidden_state) - return image_output + return image_outputs def get_placeholder_mask( self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index 85e3495d0cea..2223745aef4e 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -15,15 +15,13 @@ from dataclasses import dataclass import torch -import torch.nn as nn from ...feature_extraction_utils import BatchFeature from ...image_processing_utils_fast import BaseImageProcessorFast from ...image_utils import ImageInput -from ...modeling_outputs import BaseModelOutputWithPooling from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput -from ...utils import TransformersKwargs, auto_docstring, logging +from ...utils import auto_docstring, logging from ..got_ocr2.configuration_got_ocr2 import GotOcr2Config from ..got_ocr2.modeling_got_ocr2 import ( GotOcr2ForConditionalGeneration, @@ -38,26 +36,10 @@ @auto_docstring( - checkpoint="PaddlePaddle/PP-Chart2Table_safetensors", - custom_args=r""" - output_channels (`int`, *optional*, defaults to 1024): - Dimensionality of the output channels from the vision encoder. This is the final channel count - after the vision downsample layers, which is then projected to the text model hidden size. - vision_hidden_channels (`int`, *optional*, defaults to 512): - Dimensionality of the intermediate hidden channels in the vision encoder. This is the channel - count between the first and second downsample layers. 
- """, + checkpoint="PaddlePaddle/PP-Chart2Table_safetensors" ) class PPChart2TableConfig(GotOcr2Config): - def __init__( - self, - output_channels: int = 1024, - vision_hidden_channels: int = 512, - **super_kwargs, - ): - self.output_channels = output_channels - self.vision_hidden_channels = vision_hidden_channels - super().__init__() + pass @auto_docstring @@ -104,7 +86,7 @@ def __call__( num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size input_ids = {"input_ids": None} - if text is None: + if text is None or text == "": query = "Chart to table" prompt = ( self.message_start_token @@ -150,36 +132,7 @@ class PPChart2TablePreTrainedModel(GotOcr2PreTrainedModel): @auto_docstring class PPChart2TableModel(GotOcr2Model): - def __init__(self, config: PPChart2TableConfig): - super().__init__(config) - self.vision_downsample1 = nn.Conv2d( - config.vision_config.output_channels, - config.vision_hidden_channels, - kernel_size=3, - stride=2, - padding=1, - bias=False, - ) - self.vision_downsample2 = nn.Conv2d( - config.vision_hidden_channels, config.output_channels, kernel_size=3, stride=2, padding=1, bias=False - ) - self.multi_modal_projector = nn.Linear(config.output_channels, config.text_config.hidden_size) - - # Initialize weights and apply final processing - self.post_init() - - def get_image_features( - self, - pixel_values: torch.FloatTensor, - **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: - image_output = self.vision_tower(pixel_values) - last_hidden_state = image_output.last_hidden_state - last_hidden_state = self.vision_downsample1(last_hidden_state) - last_hidden_state = self.vision_downsample2(last_hidden_state) - image_output.pooler_output = self.multi_modal_projector(last_hidden_state.flatten(2).transpose(2, 1)) - - return image_output + pass @auto_docstring( diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py 
b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py index 4d0b973f1154..3764024898f1 100644 --- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py @@ -59,7 +59,7 @@ def __call__( num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size input_ids = {"input_ids": None} - if text is None: + if text is None or text == "": query = "Chart to table" prompt = ( self.message_start_token From 9094eb572ae2d98db24adaa973e62b8e4f68828c Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Tue, 17 Mar 2026 13:12:59 +0800 Subject: [PATCH 27/60] update --- utils/check_config_attributes.py | 1 - utils/check_repo.py | 1 - 2 files changed, 2 deletions(-) diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index f3f6efedc96b..43408709fbc7 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -43,7 +43,6 @@ "BambaConfig": ["attn_layer_indices"], "Dots1Config": ["max_window_layers"], "JambaConfig": ["attn_layer_offset", "attn_layer_period", "expert_layer_offset", "expert_layer_period"], - "PPChart2TableConfig": ["tie_word_embeddings"], "JetMoeConfig": ["output_router_logits"], "Phi3Config": ["embd_pdrop"], "EncodecConfig": ["overlap"], diff --git a/utils/check_repo.py b/utils/check_repo.py index d458b2c3b0e1..c548e7825e7c 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -194,7 +194,6 @@ "PaddleOCRVisionModel", # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration. "PaddleOCRVisionTransformer", # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration. "PaddleOCRTextModel", # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration. - "PPChart2TableModel", # Building part of bigger (tested) model. 
Tested implicitly through PPChart2TableForConditionalGeneration. "PPChart2TableVisionModel", # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration. "PPChart2TableTextModel", # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration. "Qwen2VLModel", # Building part of bigger (tested) model. Tested implicitly through Qwen2VLForConditionalGeneration. From 55664d1e7307babbc61bba12ec276e4ea7857022 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Tue, 17 Mar 2026 13:51:36 +0800 Subject: [PATCH 28/60] update --- .../test_image_processing_pp_chart2table.py | 101 ++++++++++++++++++ .../test_modeling_pp_chart2table.py | 29 ++--- 2 files changed, 111 insertions(+), 19 deletions(-) create mode 100644 tests/models/pp_chart2table/test_image_processing_pp_chart2table.py diff --git a/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py new file mode 100644 index 000000000000..b56859409bfc --- /dev/null +++ b/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py @@ -0,0 +1,101 @@ +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import unittest + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + +if is_torchvision_available(): + from transformers import PPChart2TableImageProcessorFast + + +class PPChart2TableImageProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_normalize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + ): + super().__init__() + size = size if size is not None else {"height": 1024, "width": 1024} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.size["height"], self.size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class PPChart2TableProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + test_slow_image_processor = False + 
fast_image_processing_class = PPChart2TableImageProcessorFast if is_torchvision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = PPChart2TableImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processor, "do_resize")) + self.assertTrue(hasattr(image_processor, "size")) + self.assertTrue(hasattr(image_processor, "do_normalize")) + self.assertTrue(hasattr(image_processor, "image_mean")) + self.assertTrue(hasattr(image_processor, "image_std")) diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py index 48ce924d5c97..80d4d1d6cc42 100644 --- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py @@ -53,22 +53,19 @@ def __init__( image_size=64, image_token_index=1, model_type="pp_chart2table", - is_training=False, - output_channels=1024, - vision_hidden_channels=1024, + is_training=True, text_config={ "model_type": "qwen2", "vocab_size": 99, - "hidden_size": 128, + "hidden_size": 32, "intermediate_size": 37, "num_hidden_layers": 2, - "num_attention_heads": 4, + "num_attention_heads": 2, "num_key_value_heads": 2, - "output_channels": 64, + "output_channels": 32, "hidden_act": "silu", "max_position_embeddings": 512, "rope_theta": 10000, - "mlp_ratio": 4, "tie_word_embeddings": True, "bos_token_id": 2, "eos_token_id": 3, @@ -76,11 +73,11 @@ def __init__( }, vision_config={ "num_hidden_layers": 2, - "output_channels": 64, + "output_channels": 32, "hidden_act": "quick_gelu", "hidden_size": 32, - "mlp_dim": 128, - "num_attention_heads": 4, + "mlp_dim": 64, + "num_attention_heads": 2, 
"patch_size": 2, "image_size": 64, }, @@ -100,8 +97,6 @@ def __init__( self.is_training = is_training self.num_image_tokens = 64 self.seq_length = seq_length + self.num_image_tokens - self.output_channels = output_channels - self.vision_hidden_channels = vision_hidden_channels self.num_hidden_layers = text_config["num_hidden_layers"] self.vocab_size = text_config["vocab_size"] @@ -176,17 +171,13 @@ def test_get_image_features_attentions(self): def test_get_image_features_hidden_states(self): pass - @unittest.skip(reason="PPChart2Table does not support this test.") - def test_model_is_small(self): - pass - @require_torch class PPChart2TableIntegrationTest(unittest.TestCase): def setUp(self): model_path = "PaddlePaddle/PP-Chart2Table_safetensors" self.model = PPChart2TableForConditionalGeneration.from_pretrained(model_path).to(torch_device) - self.processor = AutoProcessor.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors").to(torch_device) + self.processor = AutoProcessor.from_pretrained(model_path) url = "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png" self.image = load_image(url) @@ -200,12 +191,12 @@ def test_small_model_integration_test_pp_chart2table(self): **inputs, use_cache=True, do_sample=False, - max_new_tokens=1024, + max_new_tokens=32, ) decoded_output = self.processor.decode( generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True ) - expected_output = "年份 | 单家五星级旅游饭店年平均营收 (百万元) | 单家五星级旅游饭店年平均利润 (百万元)\n2018 | 104.22 | 9.87\n2019 | 99.11 | 7.47\n2020 | 57.87 | -3.87\n2021 | 68.99 | -2.90\n2022 | 56.29 | -9.48\n2023 | 87.99 | 5.96" + expected_output = "年份 | 单家五星级旅游饭店年平均营收 (百万元) | 单家五星级旅游饭店年平均利润 (百万元)\n" self.assertEqual(decoded_output, expected_output) @slow From 6bb4dbc0d83fdf73c37147eeb73493e3838433e3 Mon Sep 17 00:00:00 2001 From: XingweiDeng
Date: Tue, 17 Mar 2026 14:27:01 +0800 Subject: [PATCH 29/60] upddate --- .../test_modeling_pp_chart2table.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py index 80d4d1d6cc42..98a7f1c0de22 100644 --- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py @@ -138,7 +138,6 @@ def prepare_config_and_inputs_for_common(self): class PPChart2TableModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( ( - PPChart2TableModel, PPChart2TableForConditionalGeneration, ) if is_torch_available() @@ -159,19 +158,7 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - @unittest.skip( - reason="PPChart2Table have reused the GotOcr2 model, which does not implement the latest logic for capturing attentions and hidden_states introduced in Transformers v5." - ) - def test_get_image_features_attentions(self): - pass - - @unittest.skip( - reason="PPChart2Table have reused the GotOcr2 model, which does not implement the latest logic for capturing attentions and hidden_states introduced in Transformers v5." 
- ) - def test_get_image_features_hidden_states(self): - pass - - +@slow @require_torch class PPChart2TableIntegrationTest(unittest.TestCase): def setUp(self): @@ -184,7 +171,6 @@ def setUp(self): def tearDown(self): cleanup(torch_device, gc_collect=True) - @slow def test_small_model_integration_test_pp_chart2table(self): inputs = self.processor(self.image, return_tensors="pt").to(torch_device) generate_ids = self.model.generate( @@ -199,7 +185,6 @@ def test_small_model_integration_test_pp_chart2table(self): expected_output = "年份 | 单家五星级旅游饭店年平均营收 (百万元) | 单家五星级旅游饭店年平均利润 (百万元)\n" self.assertEqual(decoded_output, expected_output) - @slow def test_small_model_integration_test_pp_chart2table_batched(self): inputs = self.processor([self.image, self.image], return_tensors="pt").to(torch_device) generate_ids = self.model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=6) From f79d83b28aa395bb4ef1f0f06198cf2137902463 Mon Sep 17 00:00:00 2001 From: XingweiDeng
pass @@ -60,17 +58,6 @@ class PPChart2TableProcessor(ProcessorMixin): image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): - super().__init__(image_processor, tokenizer, chat_template=chat_template) - - self.message_start_token = tokenizer.message_start_token - self.message_end_token = tokenizer.message_end_token - self.img_start_token = tokenizer.img_start_token - self.img_end_token = tokenizer.img_end_token - self.img_pad_token = tokenizer.img_pad_token - self.image_token = tokenizer.image_token - self.system_query = "system\nYou should follow the instructions carefully and explain your answers in detail." - def __call__( self, images: ImageInput = None, @@ -88,27 +75,33 @@ def __call__( input_ids = {"input_ids": None} if text is None or text == "": query = "Chart to table" - prompt = ( - self.message_start_token - + self.system_query - + self.message_end_token - + self.message_start_token - + "user\n" - + self.img_start_token - + self.img_pad_token * num_patches * num_patches - + self.img_end_token - + "\n" - + query - + self.message_end_token - + self.message_start_token - + "assistant\n" - ) - input_ids = torch.tensor(self.tokenizer([prompt]).input_ids) - input_ids = input_ids.repeat(batch_size, 1) - input_ids = {"input_ids": input_ids} else: raise ValueError("PPChart2Table processor does not support text inputs") + messages = [ + { + "role": "system", + "content": "You should follow the instructions carefully and explain your answers in detail.", + }, + { + "role": "user", + "image": {"num_patches": num_patches}, + "content": query, + }, + ] + + # Use tokenizer's apply_chat_template instead of manually loading template + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + # Tokenize and prepare input ids for batch + input_ids = torch.tensor(self.tokenizer([prompt]).input_ids) + 
input_ids = input_ids.repeat(batch_size, 1) + input_ids = {"input_ids": input_ids} + return BatchFeature(data={**input_ids, **image_inputs}) diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py index 3764024898f1..da60a1331505 100644 --- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py @@ -33,17 +33,6 @@ class PPChart2TableProcessor(ProcessorMixin): image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): - super().__init__(image_processor, tokenizer, chat_template=chat_template) - - self.message_start_token = tokenizer.message_start_token - self.message_end_token = tokenizer.message_end_token - self.img_start_token = tokenizer.img_start_token - self.img_end_token = tokenizer.img_end_token - self.img_pad_token = tokenizer.img_pad_token - self.image_token = tokenizer.image_token - self.system_query = "system\nYou should follow the instructions carefully and explain your answers in detail." 
- def __call__( self, images: ImageInput = None, @@ -61,27 +50,33 @@ def __call__( input_ids = {"input_ids": None} if text is None or text == "": query = "Chart to table" - prompt = ( - self.message_start_token - + self.system_query - + self.message_end_token - + self.message_start_token - + "user\n" - + self.img_start_token - + self.img_pad_token * num_patches * num_patches - + self.img_end_token - + "\n" - + query - + self.message_end_token - + self.message_start_token - + "assistant\n" - ) - input_ids = torch.tensor(self.tokenizer([prompt]).input_ids) - input_ids = input_ids.repeat(batch_size, 1) - input_ids = {"input_ids": input_ids} else: raise ValueError("PPChart2Table processor does not support text inputs") + messages = [ + { + "role": "system", + "content": "You should follow the instructions carefully and explain your answers in detail.", + }, + { + "role": "user", + "image": {"num_patches": num_patches}, + "content": query, + }, + ] + + # Use tokenizer's apply_chat_template instead of manually loading template + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + # Tokenize and prepare input ids for batch + input_ids = torch.tensor(self.tokenizer([prompt]).input_ids) + input_ids = input_ids.repeat(batch_size, 1) + input_ids = {"input_ids": input_ids} + return BatchFeature(data={**input_ids, **image_inputs}) diff --git a/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py index b56859409bfc..f83c96227e67 100644 --- a/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py @@ -16,10 +16,11 @@ import unittest from transformers.testing_utils import require_torch, require_vision -from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available +from transformers.utils import is_torchvision_available 
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + if is_torchvision_available(): from transformers import PPChart2TableImageProcessorFast diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py index 98a7f1c0de22..1bb705e58488 100644 --- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py @@ -34,7 +34,6 @@ from transformers import ( PPChart2TableForConditionalGeneration, - PPChart2TableModel, ) @@ -136,13 +135,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch class PPChart2TableModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - PPChart2TableForConditionalGeneration, - ) - if is_torch_available() - else () - ) + all_model_classes = (PPChart2TableForConditionalGeneration,) if is_torch_available() else () pipeline_model_mapping = ( { "image-text-to-text": PPChart2TableForConditionalGeneration, @@ -158,6 +151,7 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() + @slow @require_torch class PPChart2TableIntegrationTest(unittest.TestCase): diff --git a/utils/check_repo.py b/utils/check_repo.py index c548e7825e7c..d458b2c3b0e1 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -194,6 +194,7 @@ "PaddleOCRVisionModel", # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration. "PaddleOCRVisionTransformer", # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration. "PaddleOCRTextModel", # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration. + "PPChart2TableModel", # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration. 
"PPChart2TableVisionModel", # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration. "PPChart2TableTextModel", # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration. "Qwen2VLModel", # Building part of bigger (tested) model. Tested implicitly through Qwen2VLForConditionalGeneration. From d050fe65247dbdff0a1df7ce2ed59bccae9c1088 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Wed, 18 Mar 2026 11:27:39 +0800 Subject: [PATCH 31/60] update --- docs/source/en/model_doc/pp_chart2table.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md index 82f92bcdfb90..7f938f76300b 100644 --- a/docs/source/en/model_doc/pp_chart2table.md +++ b/docs/source/en/model_doc/pp_chart2table.md @@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> -*This model was released on {release_date} and added to Hugging Face Transformers on 2026-03-16.* +*This model was released on 2026-05-20 and added to Hugging Face Transformers on 2026-03-16.* # PP-Chart2Table From bae2c9638bc8a976de8c643ec8b1eacbc205a53e Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Wed, 18 Mar 2026 11:41:32 +0800 Subject: [PATCH 32/60] update --- docs/source/en/model_doc/pp_chart2table.md | 2 +- .../pp_chart2table/modular_pp_chart2table.py | 21 +++++------------ .../processing_pp_chart2table.py | 23 +++++-------------- 3 files changed, 13 insertions(+), 33 deletions(-) diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md index 7f938f76300b..34dabbcdfee4 100644 --- a/docs/source/en/model_doc/pp_chart2table.md +++ b/docs/source/en/model_doc/pp_chart2table.md @@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. 
--> -*This model was released on 2026-05-20 and added to Hugging Face Transformers on 2026-03-16.* +*This model was released on 2025-05-20 and added to Hugging Face Transformers on 2026-03-16.* # PP-Chart2Table diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index 6a7decdc7ac8..aa6b821e4dc9 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -72,37 +72,28 @@ def __call__( batch_size, _, height, _ = image_inputs["pixel_values"].shape num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size - input_ids = {"input_ids": None} - if text is None or text == "": - query = "Chart to table" - else: - raise ValueError("PPChart2Table processor does not support text inputs") - messages = [ { "role": "system", - "content": "You should follow the instructions carefully and explain your answers in detail.", }, { "role": "user", "image": {"num_patches": num_patches}, - "content": query, }, ] # Use tokenizer's apply_chat_template instead of manually loading template - prompt = self.tokenizer.apply_chat_template( + inputs = self.tokenizer.apply_chat_template( messages, - tokenize=False, + tokenize=True, add_generation_prompt=True, + return_tensors="pt", ) - # Tokenize and prepare input ids for batch - input_ids = torch.tensor(self.tokenizer([prompt]).input_ids) - input_ids = input_ids.repeat(batch_size, 1) - input_ids = {"input_ids": input_ids} + # Prepare input ids for batch + input_ids = inputs["input_ids"].repeat(batch_size, 1) - return BatchFeature(data={**input_ids, **image_inputs}) + return BatchFeature(data={"input_ids": input_ids, **image_inputs}) class PPChart2TableVisionPreTrainedModel(GotOcr2PreTrainedModel): diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py 
b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py index da60a1331505..17f6883024dc 100644 --- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py @@ -19,8 +19,6 @@ # limitations under the License. -import torch - from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack @@ -47,37 +45,28 @@ def __call__( batch_size, _, height, _ = image_inputs["pixel_values"].shape num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size - input_ids = {"input_ids": None} - if text is None or text == "": - query = "Chart to table" - else: - raise ValueError("PPChart2Table processor does not support text inputs") - messages = [ { "role": "system", - "content": "You should follow the instructions carefully and explain your answers in detail.", }, { "role": "user", "image": {"num_patches": num_patches}, - "content": query, }, ] # Use tokenizer's apply_chat_template instead of manually loading template - prompt = self.tokenizer.apply_chat_template( + inputs = self.tokenizer.apply_chat_template( messages, - tokenize=False, + tokenize=True, add_generation_prompt=True, + return_tensors="pt", ) - # Tokenize and prepare input ids for batch - input_ids = torch.tensor(self.tokenizer([prompt]).input_ids) - input_ids = input_ids.repeat(batch_size, 1) - input_ids = {"input_ids": input_ids} + # Prepare input ids for batch + input_ids = inputs["input_ids"].repeat(batch_size, 1) - return BatchFeature(data={**input_ids, **image_inputs}) + return BatchFeature(data={"input_ids": input_ids, **image_inputs}) __all__ = ["PPChart2TableProcessor"] From 8e4062b4b80ae8b94f41d696cefe8b743d145d9d Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Wed, 18 Mar 2026 11:49:32 +0800 Subject: [PATCH 33/60] update --- src/transformers/models/auto/configuration_auto.py | 2 
+- src/transformers/models/auto/modeling_auto.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 6cdacb04513a..d2615bf8a99f 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -355,7 +355,7 @@ ("plbart", "PLBartConfig"), ("poolformer", "PoolFormerConfig"), ("pop2piano", "Pop2PianoConfig"), - ("pp_chart2table", "PPChart2TableConfig"), + ("pp_chart2table", "GotOcr2Config"), ("pp_doclayout_v2", "PPDocLayoutV2Config"), ("pp_doclayout_v3", "PPDocLayoutV3Config"), ("pp_lcnet", "PPLCNetConfig"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 10b9c7746cf6..7885c81daee3 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -985,7 +985,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("perception_lm", "PerceptionLMForConditionalGeneration"), ("pix2struct", "Pix2StructForConditionalGeneration"), ("pixtral", "LlavaForConditionalGeneration"), - ("pp_chart2table", "PPChart2TableForConditionalGeneration"), + ("pp_chart2table", "GotOcr2ForConditionalGeneration"), ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), ("qwen2_vl", "Qwen2VLForConditionalGeneration"), ("qwen3_5", "Qwen3_5ForConditionalGeneration"), From ac2bc662aaf705282081f03cc85917bb9d0d9772 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Wed, 18 Mar 2026 11:57:50 +0800 Subject: [PATCH 34/60] update --- .../test_modeling_pp_chart2table.py | 119 ------------------ 1 file changed, 119 deletions(-) diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py index 1bb705e58488..2e1ee4a5d98c 100644 --- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py +++ 
b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py @@ -17,21 +17,13 @@ from transformers import ( AutoProcessor, - PPChart2TableConfig, is_torch_available, is_vision_available, ) from transformers.testing_utils import cleanup, require_torch, slow, torch_device -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -from ...test_pipeline_mixin import PipelineTesterMixin - if is_torch_available(): - import torch - from transformers import ( PPChart2TableForConditionalGeneration, ) @@ -41,117 +33,6 @@ from transformers.image_utils import load_image -class PPChart2TableVisionText2TextModelTester: - def __init__( - self, - parent, - batch_size=3, - seq_length=7, - num_channels=3, - ignore_index=-100, - image_size=64, - image_token_index=1, - model_type="pp_chart2table", - is_training=True, - text_config={ - "model_type": "qwen2", - "vocab_size": 99, - "hidden_size": 32, - "intermediate_size": 37, - "num_hidden_layers": 2, - "num_attention_heads": 2, - "num_key_value_heads": 2, - "output_channels": 32, - "hidden_act": "silu", - "max_position_embeddings": 512, - "rope_theta": 10000, - "tie_word_embeddings": True, - "bos_token_id": 2, - "eos_token_id": 3, - "pad_token_id": 4, - }, - vision_config={ - "num_hidden_layers": 2, - "output_channels": 32, - "hidden_act": "quick_gelu", - "hidden_size": 32, - "mlp_dim": 64, - "num_attention_heads": 2, - "patch_size": 2, - "image_size": 64, - }, - ): - self.parent = parent - self.ignore_index = ignore_index - self.bos_token_id = text_config["bos_token_id"] - self.eos_token_id = text_config["eos_token_id"] - self.pad_token_id = text_config["pad_token_id"] - self.image_token_index = image_token_index - self.model_type = model_type - self.text_config = text_config - self.vision_config = vision_config - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = 
image_size - self.is_training = is_training - self.num_image_tokens = 64 - self.seq_length = seq_length + self.num_image_tokens - - self.num_hidden_layers = text_config["num_hidden_layers"] - self.vocab_size = text_config["vocab_size"] - self.hidden_size = text_config["hidden_size"] - self.num_attention_heads = text_config["num_attention_heads"] - - def get_config(self): - return PPChart2TableConfig( - text_config=self.text_config, - vision_config=self.vision_config, - model_type=self.model_type, - image_token_index=self.image_token_index, - ) - - def prepare_config_and_inputs(self): - config = self.get_config() - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - return config, pixel_values - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) - - input_ids[input_ids == self.image_token_index] = self.pad_token_id - input_ids[:, : self.num_image_tokens] = self.image_token_index - - inputs_dict = { - "pixel_values": pixel_values, - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -@require_torch -class PPChart2TableModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = (PPChart2TableForConditionalGeneration,) if is_torch_available() else () - pipeline_model_mapping = ( - { - "image-text-to-text": PPChart2TableForConditionalGeneration, - } - if is_torch_available() - else {} - ) - - def setUp(self): - self.model_tester = PPChart2TableVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=PPChart2TableConfig, has_text_modality=False) - - def test_config(self): - self.config_tester.run_common_tests() - - @slow 
@require_torch class PPChart2TableIntegrationTest(unittest.TestCase): From 45907f92e1b601d2950577159475f7645bbf32a7 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Wed, 18 Mar 2026 12:20:03 +0800 Subject: [PATCH 35/60] update --- .../configuration_pp_chart2table.py | 111 +++++++----------- .../pp_chart2table/modeling_pp_chart2table.py | 12 +- .../pp_chart2table/modular_pp_chart2table.py | 3 +- 3 files changed, 44 insertions(+), 82 deletions(-) diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py index 107c077798a9..241338468d6d 100644 --- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py @@ -18,12 +18,16 @@ # See the License for the specific language governing permissions and # limitations under the License. + +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="facebook/sam-vit-huge") +@strict(accept_kwargs=True) class PPChart2TableVisionConfig(PreTrainedConfig): r""" output_channels (`int`, *optional*, defaults to 256): @@ -41,50 +45,27 @@ class PPChart2TableVisionConfig(PreTrainedConfig): """ base_config_key = "vision_config" - - def __init__( - self, - hidden_size=768, - output_channels=256, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=1024, - patch_size=16, - hidden_act="gelu", - layer_norm_eps=1e-06, - attention_dropout=0.0, - initializer_range=1e-10, - qkv_bias=True, - use_abs_pos=True, - use_rel_pos=True, - window_size=14, - global_attn_indexes=[2, 5, 8, 11], - mlp_dim=3072, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.output_channels = output_channels - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads 
= num_attention_heads - self.num_channels = num_channels - self.image_size = image_size - self.patch_size = patch_size - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.qkv_bias = qkv_bias - self.use_abs_pos = use_abs_pos - self.use_rel_pos = use_rel_pos - self.window_size = window_size - self.global_attn_indexes = global_attn_indexes - self.mlp_dim = mlp_dim + hidden_size: int = 768 + output_channels: int = 256 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 1024 + patch_size: int | list[int] | tuple[int, int] = 16 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-06 + attention_dropout: float | int = 0.0 + initializer_range: float = 1e-10 + qkv_bias: bool = True + use_abs_pos: bool = True + use_rel_pos: bool = True + window_size: int = 14 + global_attn_indexes: list[int] | tuple[int, ...] 
= (2, 5, 8, 11) + mlp_dim: int = 3072 @auto_docstring(checkpoint="PaddlePaddle/PP-Chart2Table_safetensors") +@strict(accept_kwargs=True) class PPChart2TableConfig(PreTrainedConfig): r""" Example: @@ -102,36 +83,29 @@ class PPChart2TableConfig(PreTrainedConfig): >>> configuration = model.config ```""" - model_type = "pp_chart2table" + model_type = "p_p_chart2_table" attribute_map = { "image_token_id": "image_token_index", } sub_configs = {"text_config": AutoConfig, "vision_config": PPChart2TableVisionConfig} - def __init__( - self, - vision_config: dict | None = None, - text_config: dict | None = None, - image_token_index: int | None = 151859, - image_seq_length: int | None = 576, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - self.image_token_index = image_token_index - self.image_seq_length = image_seq_length - - if vision_config is None: + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + image_token_index: int = 151859 + image_seq_length: int = 576 + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if self.vision_config is None: self.vision_config = PPChart2TableVisionConfig() - elif isinstance(vision_config, dict): - self.vision_config = PPChart2TableVisionConfig(**vision_config) - elif isinstance(vision_config, PPChart2TableVisionConfig): - self.vision_config = vision_config - - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "qwen2") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["qwen2"]( + elif isinstance(self.vision_config, dict): + self.vision_config = PPChart2TableVisionConfig(**self.vision_config) + + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "qwen2") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config 
is None: + self.text_config = CONFIG_MAPPING["qwen2"]( vocab_size=151860, hidden_size=1024, intermediate_size=2816, @@ -143,7 +117,7 @@ def __init__( initializer_range=0.02, rms_norm_eps=1e-6, use_cache=True, - tie_word_embeddings=tie_word_embeddings, + tie_word_embeddings=self.tie_word_embeddings, rope_theta=1000000.0, rope_parameters=None, use_sliding_window=False, @@ -152,10 +126,7 @@ def __init__( attention_dropout=0.0, ) - self.text_config = text_config - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["PPChart2TableConfig"] diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py index 0c0a2c83e006..f13f04e62de6 100644 --- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py @@ -300,7 +300,7 @@ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.FloatTensor]: @dataclass @auto_docstring( custom_intro=""" - Base class for pp_chart2table vision model's outputs that also contains image embeddings obtained by applying the projection + Base class for p_p_chart2_table vision model's outputs that also contains image embeddings obtained by applying the projection layer to the pooler_output. 
""" ) @@ -519,10 +519,6 @@ def forward(self, vision_embeddings: torch.Tensor) -> torch.Tensor: @auto_docstring class PPChart2TableModel(PPChart2TablePreTrainedModel): - _checkpoint_conversion_mapping = { - r"^language_model.model": "language_model", - } - def __init__(self, config: PPChart2TableConfig): super().__init__(config) self.vision_tower = PPChart2TableVisionEncoder(config.vision_config) @@ -661,12 +657,6 @@ class PPChart2TableCausalLMOutputWithPast(ModelOutput): """ ) class PPChart2TableForConditionalGeneration(PPChart2TablePreTrainedModel, GenerationMixin): - _checkpoint_conversion_mapping = { - r"^language_model.model": "model.language_model", - r"^vision_tower": "model.vision_tower", - r"^multi_modal_projector": "model.multi_modal_projector", - r"^language_model.lm_head": "lm_head", - } _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"} def __init__(self, config: PPChart2TableConfig): diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index aa6b821e4dc9..662353516674 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -14,7 +14,7 @@ from dataclasses import dataclass -import torch +from huggingface_hub.dataclasses import strict from ...feature_extraction_utils import BatchFeature from ...image_processing_utils_fast import BaseImageProcessorFast @@ -36,6 +36,7 @@ @auto_docstring(checkpoint="PaddlePaddle/PP-Chart2Table_safetensors") +@strict(accept_kwargs=True) class PPChart2TableConfig(GotOcr2Config): pass From d0bf04f02957b8af7bd57dd9c00edc714001c5cc Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Wed, 18 Mar 2026 14:25:36 +0800 Subject: [PATCH 36/60] update --- .../image_processing_pp_chart2table.py | 25 ------------------- 1 file changed, 25 deletions(-) delete mode 100644 
src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py deleted file mode 100644 index 7bb7de6cc920..000000000000 --- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py +++ /dev/null @@ -1,25 +0,0 @@ -# ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ -# This file was automatically generated from src/transformers/models/pp_chart2table/modular_pp_chart2table.py. -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the modular. If any change should be done, please apply the change to the -# modular_pp_chart2table.py file directly. One of our CI enforces this. -# ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ -from ...processing_utils import ImagesKwargs - - -class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False): - """ - crop_to_patches (`bool`, *optional*, defaults to `False`): - Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the - `preprocess` method. - min_patches (`int`, *optional*, defaults to 1): - The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is - set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method. - max_patches (`int`, *optional*, defaults to 12): - The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is - set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method. 
- """ - - crop_to_patches: bool - min_patches: int - max_patches: int From 0f7ed318db222936dbc542eca4d3e9034056add8 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Wed, 18 Mar 2026 16:31:26 +0800 Subject: [PATCH 37/60] update --- src/transformers/conversion_mapping.py | 2 +- .../configuration_pp_chart2table.py | 2 +- .../image_processing_pp_chart2table_fast.py | 2 +- .../pp_chart2table/modular_pp_chart2table.py | 29 +++++---- .../processing_pp_chart2table.py | 19 +++--- .../test_processing_pp_chart2table.py | 64 +++++++++++++++++++ 6 files changed, 96 insertions(+), 22 deletions(-) create mode 100644 tests/models/pp_chart2table/test_processing_pp_chart2table.py diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py index 71cdf585bc12..a1c495085321 100755 --- a/src/transformers/conversion_mapping.py +++ b/src/transformers/conversion_mapping.py @@ -555,4 +555,4 @@ def get_model_conversion_mapping( # "mlp.experts.gate_up_proj$" and "mlp.experts.down_proj$" are only created after dequantization conversions are applied. 
weight_conversions.extend(hf_quantizer.get_weight_conversions()) - return weight_conversions \ No newline at end of file + return weight_conversions diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py index 241338468d6d..a15de72c8a39 100644 --- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py @@ -83,7 +83,7 @@ class PPChart2TableConfig(PreTrainedConfig): >>> configuration = model.config ```""" - model_type = "p_p_chart2_table" + model_type = "pp_chart2table" attribute_map = { "image_token_id": "image_token_index", } diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py index 8bea7ba01d7c..ef857db5b658 100644 --- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py +++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py @@ -29,7 +29,7 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast): image_std = [0.26862954, 0.26130258, 0.27577711] size = {"height": 1024, "width": 1024} patch_size = 16 - merge_size = 4 + num_patches = 16 do_resize = True do_rescale = True do_normalize = True diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index 662353516674..50e52a2772c6 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -21,7 +21,7 @@ from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput -from ...utils import auto_docstring, logging +from ...utils import auto_docstring, 
is_vision_available, logging from ..got_ocr2.configuration_got_ocr2 import GotOcr2Config from ..got_ocr2.modeling_got_ocr2 import ( GotOcr2ForConditionalGeneration, @@ -32,13 +32,17 @@ ) +if is_vision_available(): + pass + + logger = logging.get_logger(__name__) @auto_docstring(checkpoint="PaddlePaddle/PP-Chart2Table_safetensors") @strict(accept_kwargs=True) class PPChart2TableConfig(GotOcr2Config): - pass + model_type = "pp_chart2table" @auto_docstring @@ -48,7 +52,7 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast): image_std = [0.26862954, 0.26130258, 0.27577711] size = {"height": 1024, "width": 1024} patch_size = 16 - merge_size = 4 + num_patches = 16 do_resize = True do_rescale = True do_normalize = True @@ -58,6 +62,7 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast): class PPChart2TableProcessor(ProcessorMixin): image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" + model_input_names = ["input_ids", "pixel_values"] def __call__( self, @@ -65,13 +70,14 @@ def __call__( text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None, **kwargs: Unpack[ProcessingKwargs], ) -> BatchFeature: - if images is not None: - image_inputs = self.image_processor(images=images, return_tensors="pt") - else: - image_inputs = {} + output_kwargs = self._merge_kwargs( + ProcessingKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) - batch_size, _, height, _ = image_inputs["pixel_values"].shape - num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size + image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) + batch_size = image_inputs["pixel_values"].shape[0] messages = [ { @@ -79,7 +85,7 @@ def __call__( }, { "role": "user", - "image": {"num_patches": num_patches}, + "image": {"num_patches": self.image_processor.num_patches}, }, ] @@ -88,7 +94,8 @@ def __call__( messages, tokenize=True, add_generation_prompt=True, 
- return_tensors="pt", + truncation=True, + **output_kwargs["text_kwargs"], ) # Prepare input ids for batch diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py index 17f6883024dc..ce8175ecd87c 100644 --- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py @@ -30,6 +30,7 @@ class PPChart2TableProcessor(ProcessorMixin): image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" + model_input_names = ["input_ids", "pixel_values"] def __call__( self, @@ -37,13 +38,14 @@ def __call__( text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None, **kwargs: Unpack[ProcessingKwargs], ) -> BatchFeature: - if images is not None: - image_inputs = self.image_processor(images=images, return_tensors="pt") - else: - image_inputs = {} + output_kwargs = self._merge_kwargs( + ProcessingKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) - batch_size, _, height, _ = image_inputs["pixel_values"].shape - num_patches = height // self.image_processor.patch_size // self.image_processor.merge_size + image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) + batch_size = image_inputs["pixel_values"].shape[0] messages = [ { @@ -51,7 +53,7 @@ def __call__( }, { "role": "user", - "image": {"num_patches": num_patches}, + "image": {"num_patches": self.image_processor.num_patches}, }, ] @@ -60,7 +62,8 @@ def __call__( messages, tokenize=True, add_generation_prompt=True, - return_tensors="pt", + truncation=True, + **output_kwargs["text_kwargs"], ) # Prepare input ids for batch diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py new file mode 100644 index 000000000000..ec33b19097af --- /dev/null +++ 
b/tests/models/pp_chart2table/test_processing_pp_chart2table.py @@ -0,0 +1,64 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers import PPChart2TableProcessor +from transformers.testing_utils import require_vision + +from ...test_processing_common import ProcessorTesterMixin + + +@require_vision +class PPChart2TableProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = PPChart2TableProcessor + + @classmethod + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + tokenizer = tokenizer_class.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table") + return tokenizer + + @unittest.skip("PPChart2TableProcessor pop the image processor output 'num_patches'") + def test_image_processor_defaults(self): + pass + + def test_ocr_queries(self): + processor = self.get_processor() + image_input = self.prepare_image_inputs() + inputs = processor(image_input, return_tensors="pt") + self.assertEqual(inputs["input_ids"].shape, (1, 286)) + self.assertEqual(inputs["pixel_values"].shape, (1, 3, 1024, 1024)) + + def test_unstructured_kwargs_batched(self): + if "image_processor" not in self.processor_class.get_attributes(): + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor_kwargs = self.prepare_processor_dict() + 
processor = self.processor_class(**processor_components, **processor_kwargs) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs(batch_size=2, modalities="image") + image_input = self.prepare_image_inputs(batch_size=2) + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + do_rescale=True, + rescale_factor=-1.0, + padding="longest", + max_length=self.image_unstructured_max_length, + ) + + self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) From beea17f53e8f93eebd15e0c8a02f973a805008f4 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Wed, 18 Mar 2026 19:55:11 +0800 Subject: [PATCH 38/60] update --- docs/source/en/model_doc/pp_chart2table.md | 4 +- .../models/auto/configuration_auto.py | 2 +- .../models/pp_chart2table/__init__.py | 1 - .../pp_chart2table/modular_pp_chart2table.py | 41 ------------------- .../test_image_processing_pp_chart2table.py | 2 +- .../test_modeling_pp_chart2table.py | 3 +- .../test_processing_pp_chart2table.py | 4 +- utils/check_repo.py | 6 --- 8 files changed, 9 insertions(+), 54 deletions(-) diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md index 34dabbcdfee4..9dc464d31473 100644 --- a/docs/source/en/model_doc/pp_chart2table.md +++ b/docs/source/en/model_doc/pp_chart2table.md @@ -49,9 +49,10 @@ pipe = pipeline( device_map="auto", ) image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw) +# text is empty - processor uses hardcoded "Chart to table" instruction internally via chat template result = pipe( images=image, - text="", + text="", do_sample=False, max_new_tokens=256 ) @@ -106,6 +107,7 @@ pipe = pipeline( device_map="auto", ) image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw) +# text is empty - processor uses hardcoded 
"Chart to table" instruction internally via chat template result = pipe( images=[image, image], text="", diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index e96476e6a08f..015187b409bb 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -358,7 +358,7 @@ ("plbart", "PLBartConfig"), ("poolformer", "PoolFormerConfig"), ("pop2piano", "Pop2PianoConfig"), - ("pp_chart2table", "GotOcr2Config"), + ("pp_chart2table", "PPChart2TableConfig"), ("pp_doclayout_v2", "PPDocLayoutV2Config"), ("pp_doclayout_v3", "PPDocLayoutV3Config"), ("pp_lcnet", "PPLCNetConfig"), diff --git a/src/transformers/models/pp_chart2table/__init__.py b/src/transformers/models/pp_chart2table/__init__.py index 82763db99c82..411b2f54ca62 100644 --- a/src/transformers/models/pp_chart2table/__init__.py +++ b/src/transformers/models/pp_chart2table/__init__.py @@ -21,7 +21,6 @@ if TYPE_CHECKING: from .configuration_pp_chart2table import * from .image_processing_pp_chart2table_fast import * - from .modeling_pp_chart2table import * from .processing_pp_chart2table import * else: import sys diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index 50e52a2772c6..b23a81f10716 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -32,10 +32,6 @@ ) -if is_vision_available(): - pass - - logger = logging.get_logger(__name__) @@ -104,45 +100,8 @@ def __call__( return BatchFeature(data={"input_ids": input_ids, **image_inputs}) -class PPChart2TableVisionPreTrainedModel(GotOcr2PreTrainedModel): - pass - - -class PPChart2TableVisionEncoder(GotOcr2VisionEncoder, PPChart2TableVisionPreTrainedModel): - pass - - -@dataclass -class PPChart2TableModelOutputWithPast(GotOcr2ModelOutputWithPast): - pass - - 
-@auto_docstring -class PPChart2TablePreTrainedModel(GotOcr2PreTrainedModel): - pass - - -@auto_docstring -class PPChart2TableModel(GotOcr2Model): - pass - - -@auto_docstring( - custom_intro=""" - PP-Chart2Table model for conditional generation (table text generation from chart images), - extending the core model with a language modeling (LM) head and generation utilities. - """ -) -class PPChart2TableForConditionalGeneration(GotOcr2ForConditionalGeneration): - pass - - __all__ = [ - "PPChart2TableForConditionalGeneration", - "PPChart2TableModel", "PPChart2TableConfig", - "PPChart2TableVisionPreTrainedModel", - "PPChart2TablePreTrainedModel", "PPChart2TableImageProcessorFast", "PPChart2TableProcessor", ] diff --git a/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py index f83c96227e67..46c1d0cc85f9 100644 --- a/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py @@ -80,7 +80,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F @require_torch @require_vision -class PPChart2TableProcessingTest(ImageProcessingTestMixin, unittest.TestCase): +class PPChart2TableImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): test_slow_image_processor = False fast_image_processing_class = PPChart2TableImageProcessorFast if is_torchvision_available() else None diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py index 2e1ee4a5d98c..04acdd35ca1d 100644 --- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py @@ -20,7 +20,7 @@ is_torch_available, is_vision_available, ) -from transformers.testing_utils import cleanup, require_torch, slow, torch_device +from transformers.testing_utils import cleanup, 
require_torch, slow, torch_device, require_vision if is_torch_available(): @@ -34,6 +34,7 @@ @slow +@require_vision @require_torch class PPChart2TableIntegrationTest(unittest.TestCase): def setUp(self): diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py index ec33b19097af..1592fe4cdb25 100644 --- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py @@ -1,4 +1,4 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. +# Copyright 2026 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,7 +27,7 @@ class PPChart2TableProcessorTest(ProcessorTesterMixin, unittest.TestCase): @classmethod def _setup_tokenizer(cls): tokenizer_class = cls._get_component_class_from_processor("tokenizer") - tokenizer = tokenizer_class.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table") + tokenizer = tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors") return tokenizer @unittest.skip("PPChart2TableProcessor pop the image processor output 'num_patches'") diff --git a/utils/check_repo.py b/utils/check_repo.py index f9187c65ce20..ace5c0542bf9 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -194,9 +194,6 @@ "PaddleOCRVisionModel", # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration. "PaddleOCRVisionTransformer", # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration. "PaddleOCRTextModel", # Building part of bigger (tested) model. Tested implicitly through PaddleOCRVLForConditionalGeneration. - "PPChart2TableModel", # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration. 
- "PPChart2TableVisionModel", # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration. - "PPChart2TableTextModel", # Building part of bigger (tested) model. Tested implicitly through PPChart2TableForConditionalGeneration. "Qwen2VLModel", # Building part of bigger (tested) model. Tested implicitly through Qwen2VLForConditionalGeneration. "Qwen2_5_VLModel", # Building part of bigger (tested) model. Tested implicitly through Qwen2_5_VLForConditionalGeneration. "Qwen3VLModel", # Building part of bigger (tested) model. Tested implicitly through Qwen3VLForConditionalGeneration. @@ -452,9 +449,6 @@ "PaddleOCRVisionModel", # Building part of bigger (tested) model "PaddleOCRVisionTransformer", # Building part of bigger (tested) model "PaddleOCRTextModel", # Building part of bigger (tested) model - "PPChart2TableModel", # Building part of bigger (tested) model - "PPChart2TableVisionModel", # Building part of bigger (tested) model - "PPChart2TableTextModel", # Building part of bigger (tested) model "Qwen2_5OmniTalkerForConditionalGeneration", # Building part of a bigger model "Qwen2_5OmniTalkerModel", # Building part of a bigger model "Qwen2_5OmniThinkerForConditionalGeneration", # Building part of a bigger model From 691558328cc550ad8b93ae89ecd04c6eac96b86f Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Wed, 18 Mar 2026 20:35:03 +0800 Subject: [PATCH 39/60] update --- .../configuration_pp_chart2table.py | 1 - .../pp_chart2table/modular_pp_chart2table.py | 39 +++++++++---------- .../processing_pp_chart2table.py | 29 ++++++++------ 3 files changed, 35 insertions(+), 34 deletions(-) diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py index a15de72c8a39..d85c61d942b9 100644 --- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py 
@@ -18,7 +18,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index b23a81f10716..b743e020d4a7 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass - from huggingface_hub.dataclasses import strict from ...feature_extraction_utils import BatchFeature @@ -21,15 +19,8 @@ from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput -from ...utils import auto_docstring, is_vision_available, logging +from ...utils import auto_docstring, logging, requires_backends from ..got_ocr2.configuration_got_ocr2 import GotOcr2Config -from ..got_ocr2.modeling_got_ocr2 import ( - GotOcr2ForConditionalGeneration, - GotOcr2Model, - GotOcr2ModelOutputWithPast, - GotOcr2PreTrainedModel, - GotOcr2VisionEncoder, -) logger = logging.get_logger(__name__) @@ -60,12 +51,28 @@ class PPChart2TableProcessor(ProcessorMixin): tokenizer_class = "AutoTokenizer" model_input_names = ["input_ids", "pixel_values"] + def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + # PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template + self.messages = [ + { + "role": "system", + }, + { + "role": "user", + "image": {"num_patches": self.image_processor.num_patches}, + }, + ] + + def __call__( self, 
images: ImageInput = None, text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None, **kwargs: Unpack[ProcessingKwargs], ) -> BatchFeature: + requires_backends(self, "torch") output_kwargs = self._merge_kwargs( ProcessingKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, @@ -75,19 +82,9 @@ def __call__( image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) batch_size = image_inputs["pixel_values"].shape[0] - messages = [ - { - "role": "system", - }, - { - "role": "user", - "image": {"num_patches": self.image_processor.num_patches}, - }, - ] - # Use tokenizer's apply_chat_template instead of manually loading template inputs = self.tokenizer.apply_chat_template( - messages, + self.messages, tokenize=True, add_generation_prompt=True, truncation=True, diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py index ce8175ecd87c..e77a71c821d0 100644 --- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py @@ -23,7 +23,7 @@ from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput -from ...utils import auto_docstring +from ...utils import auto_docstring, requires_backends @auto_docstring @@ -32,12 +32,27 @@ class PPChart2TableProcessor(ProcessorMixin): tokenizer_class = "AutoTokenizer" model_input_names = ["input_ids", "pixel_values"] + def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + # PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template + self.messages = [ + { + "role": "system", + }, + { + "role": "user", + "image": {"num_patches": 
self.image_processor.num_patches}, + }, + ] + def __call__( self, images: ImageInput = None, text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None, **kwargs: Unpack[ProcessingKwargs], ) -> BatchFeature: + requires_backends(self, "torch") output_kwargs = self._merge_kwargs( ProcessingKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, @@ -47,19 +62,9 @@ def __call__( image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) batch_size = image_inputs["pixel_values"].shape[0] - messages = [ - { - "role": "system", - }, - { - "role": "user", - "image": {"num_patches": self.image_processor.num_patches}, - }, - ] - # Use tokenizer's apply_chat_template instead of manually loading template inputs = self.tokenizer.apply_chat_template( - messages, + self.messages, tokenize=True, add_generation_prompt=True, truncation=True, From 6fe075bc074f2f8fca1559b4a0109e25c385e90f Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Wed, 18 Mar 2026 20:57:56 +0800 Subject: [PATCH 40/60] update --- .../models/pp_chart2table/modular_pp_chart2table.py | 3 +-- tests/models/pp_chart2table/test_modeling_pp_chart2table.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index b743e020d4a7..a300f0f0eb6e 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -53,7 +53,7 @@ class PPChart2TableProcessor(ProcessorMixin): def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): super().__init__(image_processor, tokenizer, chat_template=chat_template) - + # PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template self.messages = [ { @@ -65,7 +65,6 @@ def __init__(self, image_processor=None, tokenizer=None, chat_template=None, 
**k }, ] - def __call__( self, images: ImageInput = None, diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py index 04acdd35ca1d..32b9fff60c38 100644 --- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py @@ -20,7 +20,7 @@ is_torch_available, is_vision_available, ) -from transformers.testing_utils import cleanup, require_torch, slow, torch_device, require_vision +from transformers.testing_utils import cleanup, require_torch, require_vision, slow, torch_device if is_torch_available(): From 86e9ec51746d6eeb17e032fcee9f9b8d337916d4 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Wed, 18 Mar 2026 22:52:51 +0800 Subject: [PATCH 41/60] update --- .../models/pp_chart2table/modular_pp_chart2table.py | 2 ++ .../pp_chart2table/processing_pp_chart2table.py | 2 ++ .../pp_chart2table/test_processing_pp_chart2table.py | 12 ++++++++++++ 3 files changed, 16 insertions(+) diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index a300f0f0eb6e..4fe5082fdecf 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -78,6 +78,8 @@ def __call__( **kwargs, ) + if images is None: + raise ValueError("At least one of `images` must be provided") image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) batch_size = image_inputs["pixel_values"].shape[0] diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py index e77a71c821d0..ad70b5edc282 100644 --- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py @@ -59,6 +59,8 @@ def __call__( 
**kwargs, ) + if images is None: + raise ValueError("At least one of `images` must be provided") image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) batch_size = image_inputs["pixel_values"].shape[0] diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py index 1592fe4cdb25..bd4592c18746 100644 --- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py @@ -62,3 +62,15 @@ def test_unstructured_kwargs_batched(self): ) self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) + + @unittest.skip(reason="PPChart2Table image input cannot be None") + def test_apply_chat_template_assistant_mask(self): + pass + + @unittest.skip(reason="PPChart2Table image input cannot be None") + def test_apply_chat_template_image_0(self): + pass + + @unittest.skip(reason="PPChart2Table image input cannot be None") + def test_apply_chat_template_image_1(self): + pass From 6d791e70781b8c21901c92a3151931fc9fb5e238 Mon Sep 17 00:00:00 2001 From: vasqu Date: Wed, 18 Mar 2026 15:53:18 +0100 Subject: [PATCH 42/60] small fixes --- .../pp_chart2table/modeling_pp_chart2table.py | 805 ------------------ .../pp_chart2table/modular_pp_chart2table.py | 5 +- .../processing_pp_chart2table.py | 5 +- 3 files changed, 6 insertions(+), 809 deletions(-) delete mode 100644 src/transformers/models/pp_chart2table/modeling_pp_chart2table.py diff --git a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py b/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py deleted file mode 100644 index f13f04e62de6..000000000000 --- a/src/transformers/models/pp_chart2table/modeling_pp_chart2table.py +++ /dev/null @@ -1,805 +0,0 @@ -# 
🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from src/transformers/models/pp_chart2table/modular_pp_chart2table.py. -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the modular. If any change should be done, please apply the change to the -# modular_pp_chart2table.py file directly. One of our CI enforces this. -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# Copyright 2026 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import collections -from dataclasses import dataclass - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from ... 
import initialization as init -from ...activations import ACT2FN -from ...cache_utils import Cache -from ...generation import GenerationMixin -from ...modeling_layers import GradientCheckpointingLayer -from ...modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling, ModelOutput -from ...modeling_utils import PreTrainedModel -from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, torch_compilable_check -from ...utils.generic import merge_with_config_defaults -from ...utils.output_capturing import capture_outputs -from ..auto import AutoModel -from .configuration_pp_chart2table import PPChart2TableConfig, PPChart2TableVisionConfig - - -class PPChart2TableVisionAttention(nn.Module): - """Multi-head Attention block with relative position embeddings.""" - - def __init__(self, config, window_size): - super().__init__() - input_size = ( - (config.image_size // config.patch_size, config.image_size // config.patch_size) - if window_size == 0 - else (window_size, window_size) - ) - - self.num_attention_heads = config.num_attention_heads - head_dim = config.hidden_size // config.num_attention_heads - self.scale = head_dim**-0.5 - self.dropout = config.attention_dropout - - self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.qkv_bias) - self.proj = nn.Linear(config.hidden_size, config.hidden_size) - - self.use_rel_pos = config.use_rel_pos - if self.use_rel_pos: - if input_size is None: - raise ValueError("Input size must be provided if using relative positional encoding.") - - # initialize relative positional embeddings - self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim)) - self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim)) - - def get_rel_pos(self, q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor: - """ - Get relative positional embeddings according to the relative positions of - query and key sizes. 
- - Args: - q_size (int): - size of the query. - k_size (int): - size of key k. - rel_pos (`torch.Tensor`): - relative position embeddings (L, channel). - - Returns: - Extracted positional embeddings according to relative positions. - """ - max_rel_dist = int(2 * max(q_size, k_size) - 1) - # Interpolate rel pos. - rel_pos_resized = F.interpolate( - rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), - size=max_rel_dist, - mode="linear", - ) - rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) - - # Scale the coords with short length if shapes for q and k are different. - q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) - k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0) - relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) - - return rel_pos_resized[relative_coords.long()] - - def get_decomposed_rel_pos( - self, - query: torch.Tensor, - rel_pos_h: torch.Tensor, - rel_pos_w: torch.Tensor, - q_size: tuple[int, int], - k_size: tuple[int, int], - ) -> torch.Tensor: - """ - Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. - https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py - - Args: - query (`torch.Tensor`): - query q in the attention layer with shape (batch_size, query_height * query_width, channel). - rel_pos_h (`torch.Tensor`): - relative position embeddings (Lh, channel) for height axis. - rel_pos_w (`torch.Tensor`): - relative position embeddings (Lw, channel) for width axis. - q_size (tuple): - spatial sequence size of query q with (query_height, query_width). - k_size (tuple): - spatial sequence size of key k with (key_height, key_width). - - Returns: - decomposed_rel_pos (`torch.Tensor`): - decomposed relative position embeddings. 
- """ - query_height, query_width = q_size - key_height, key_width = k_size - relative_position_height = self.get_rel_pos(query_height, key_height, rel_pos_h) - relative_position_width = self.get_rel_pos(query_width, key_width, rel_pos_w) - - batch_size, _, dim = query.shape - reshaped_query = query.reshape(batch_size, query_height, query_width, dim) - rel_h = torch.einsum("bhwc,hkc->bhwk", reshaped_query, relative_position_height) - rel_w = torch.einsum("bhwc,wkc->bhwk", reshaped_query, relative_position_width) - - decomposed_rel_pos = rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :] - - return decomposed_rel_pos - - def forward(self, hidden_states: torch.Tensor, output_attentions=None) -> tuple[torch.Tensor, torch.Tensor]: - batch_size, height, width, _ = hidden_states.shape - # qkv with shape (3, batch_size, nHead, height * width, channel) - qkv = ( - self.qkv(hidden_states) - .reshape(batch_size, height * width, 3, self.num_attention_heads, -1) - .permute(2, 0, 3, 1, 4) - ) - # q, k, v with shape (batch_size * nHead, height * width, channel) - query, key, value = qkv.reshape(3, batch_size * self.num_attention_heads, height * width, -1).unbind(0) - - attn_weights = (query * self.scale) @ key.transpose(-2, -1) - - if self.use_rel_pos: - decomposed_rel_pos = self.get_decomposed_rel_pos( - query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width) - ) - decomposed_rel_pos = decomposed_rel_pos.reshape_as(attn_weights) - attn_weights = attn_weights + decomposed_rel_pos - - attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype) - - attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - - attn_output = (attn_probs @ value).reshape(batch_size, self.num_attention_heads, height, width, -1) - attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, height, width, -1) - - attn_output = self.proj(attn_output) - return attn_output, attn_weights - - -@auto_docstring 
-class PPChart2TableVisionPreTrainedModel(PreTrainedModel): - config: PPChart2TableConfig - base_model_prefix = "model" - input_modalities = ("image", "text") - supports_gradient_checkpointing = True - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn = False - _supports_sdpa = False - - _can_compile_fullgraph = True - _supports_flex_attn = False - _supports_attention_backend = True - - @torch.no_grad() - def _init_weights(self, module): - super()._init_weights(module) - if isinstance(module, PPChart2TableVisionAttention): - if module.use_rel_pos: - init.zeros_(module.rel_pos_h) - init.zeros_(module.rel_pos_w) - elif isinstance(module, PPChart2TableVisionEncoder): - if module.pos_embed is not None: - init.zeros_(module.pos_embed) - - -class PPChart2TableMLPBlock(nn.Module): - def __init__(self, config): - super().__init__() - self.lin1 = nn.Linear(config.hidden_size, config.mlp_dim) - self.lin2 = nn.Linear(config.mlp_dim, config.hidden_size) - self.act = ACT2FN[config.hidden_act] - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.lin1(hidden_states) - hidden_states = self.act(hidden_states) - hidden_states = self.lin2(hidden_states) - return hidden_states - - -class PPChart2TableVisionLayer(GradientCheckpointingLayer): - def __init__(self, config, window_size): - super().__init__() - self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.attn = PPChart2TableVisionAttention(config, window_size) - self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.mlp = PPChart2TableMLPBlock(config) - self.window_size = window_size - - def window_partition(self, hidden_states: torch.Tensor, window_size: int) -> tuple[torch.Tensor, tuple[int, int]]: - """ - Args: - Partition into non-overlapping windows with padding if needed. - hidden_states (tensor): input tokens with [batch_size, height, width, channel]. window_size (int): window - size. 
- - Returns: - windows: windows after partition with [batch_size * num_windows, window_size, window_size, channel]. - (pad_height, pad_width): padded height and width before partition - """ - batch_size, height, width, channel = hidden_states.shape - - pad_h = (window_size - height % window_size) % window_size - pad_w = (window_size - width % window_size) % window_size - hidden_states = F.pad(hidden_states, (0, 0, 0, pad_w, 0, pad_h)) - pad_height, pad_width = height + pad_h, width + pad_w - - hidden_states = hidden_states.reshape( - batch_size, pad_height // window_size, window_size, pad_width // window_size, window_size, channel - ) - windows = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().reshape(-1, window_size, window_size, channel) - return windows, (pad_height, pad_width) - - def window_unpartition( - self, windows: torch.Tensor, window_size: int, padding_shape: tuple[int, int], original_shape: tuple[int, int] - ) -> torch.Tensor: - """ - Args: - Window unpartition into original sequences and removing padding. - hidden_states (tensor): - input tokens with [batch_size * num_windows, window_size, window_size, channel]. - window_size (int): - window size. - padding_shape (Tuple): - padded height and width (pad_height, pad_width). - original_shape (Tuple): original height and width (height, width) before padding. - - Returns: - hidden_states: unpartitioned sequences with [batch_size, height, width, channel]. 
- """ - pad_height, pad_width = padding_shape - height, width = original_shape - batch_size = windows.shape[0] // (pad_height * pad_width // window_size // window_size) - hidden_states = windows.reshape( - batch_size, pad_height // window_size, pad_width // window_size, window_size, window_size, -1 - ) - hidden_states = ( - hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().reshape(batch_size, pad_height, pad_width, -1) - ) - - hidden_states = hidden_states[:, :height, :width, :].contiguous() - return hidden_states - - def forward(self, hidden_states: torch.Tensor) -> tuple[torch.FloatTensor]: - residual = hidden_states - hidden_states = self.layer_norm1(hidden_states) - # Window partition - if self.window_size > 0: - height, width = hidden_states.shape[1], hidden_states.shape[2] - hidden_states, padding_shape = self.window_partition(hidden_states, self.window_size) - - hidden_states, attn_weights = self.attn( - hidden_states=hidden_states, - ) - # Reverse window partition - if self.window_size > 0: - hidden_states = self.window_unpartition(hidden_states, self.window_size, padding_shape, (height, width)) - - hidden_states = residual + hidden_states - layernorm_output = self.layer_norm2(hidden_states) - hidden_states = hidden_states + self.mlp(layernorm_output) - return hidden_states - - -@dataclass -@auto_docstring( - custom_intro=""" - Base class for p_p_chart2_table vision model's outputs that also contains image embeddings obtained by applying the projection - layer to the pooler_output. - """ -) -class PPChart2TableVisionEncoderOutput(ModelOutput): - r""" - image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): - The image embeddings obtained by applying the projection layer to the pooler_output. - """ - - image_embeds: torch.FloatTensor | None = None - last_hidden_state: torch.FloatTensor | None = None - hidden_states: tuple[torch.FloatTensor, ...] 
| None = None - attentions: tuple[torch.FloatTensor, ...] | None = None - - -class PPChart2TablePatchEmbeddings(nn.Module): - """ - This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial - `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a - Transformer. - """ - - def __init__(self, config): - super().__init__() - image_size, patch_size = config.image_size, config.patch_size - num_channels, hidden_size = config.num_channels, config.hidden_size - image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) - patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.num_patches = num_patches - - self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) - - def forward(self, pixel_values): - batch_size, num_channels, height, width = pixel_values.shape - if num_channels != self.num_channels: - raise ValueError( - "Make sure that the channel dimension of the pixel values match with the one set in the configuration." - ) - if height != self.image_size[0] or width != self.image_size[1]: - raise ValueError( - f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." - ) - embeddings = self.projection(pixel_values).permute(0, 2, 3, 1) - return embeddings - - -class PPChart2TableLayerNorm(nn.LayerNorm): - r"""LayerNorm that supports two data formats: channels_last (default) or channels_first. - The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height, - width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width). 
- """ - - def __init__(self, normalized_shape, *, eps=1e-6, data_format="channels_last", **kwargs): - super().__init__(normalized_shape, eps=eps, **kwargs) - if data_format not in ["channels_last", "channels_first"]: - raise NotImplementedError(f"Unsupported data format: {data_format}") - self.data_format = data_format - - def forward(self, features: torch.Tensor) -> torch.Tensor: - """ - Args: - features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels) - """ - if self.data_format == "channels_first": - features = features.permute(0, 2, 3, 1) - features = super().forward(features) - features = features.permute(0, 3, 1, 2) - else: - features = super().forward(features) - return features - - -class PPChart2TableVisionNeck(nn.Module): - def __init__(self, config: PPChart2TableVisionConfig): - super().__init__() - self.config = config - - self.conv1 = nn.Conv2d(config.hidden_size, config.output_channels, kernel_size=1, bias=False) - self.layer_norm1 = PPChart2TableLayerNorm(config.output_channels, data_format="channels_first") - self.conv2 = nn.Conv2d(config.output_channels, config.output_channels, kernel_size=3, padding=1, bias=False) - self.layer_norm2 = PPChart2TableLayerNorm(config.output_channels, data_format="channels_first") - - def forward(self, hidden_states): - hidden_states = hidden_states.permute(0, 3, 1, 2) - hidden_states = self.conv1(hidden_states) - hidden_states = self.layer_norm1(hidden_states) - - hidden_states = self.conv2(hidden_states) - hidden_states = self.layer_norm2(hidden_states) - return hidden_states - - -class PPChart2TableVisionEncoder(PPChart2TableVisionPreTrainedModel): - _can_record_outputs = {"hidden_states": PPChart2TableVisionLayer, "attentions": PPChart2TableVisionAttention} - input_modalities = ("image",) - - def __init__(self, config: PPChart2TableVisionConfig): - super().__init__(config) - self.config = config - self.image_size = config.image_size - self.patch_embed = 
PPChart2TablePatchEmbeddings(config) - - self.pos_embed = None - if config.use_abs_pos: - # Initialize absolute positional embedding with pretrain image size. - self.pos_embed = nn.Parameter( - torch.zeros( - 1, - config.image_size // config.patch_size, - config.image_size // config.patch_size, - config.hidden_size, - ) - ) - - self.layers = nn.ModuleList() - for i in range(config.num_hidden_layers): - layer = PPChart2TableVisionLayer( - config, - window_size=config.window_size if i not in config.global_attn_indexes else 0, - ) - self.layers.append(layer) - - self.neck = PPChart2TableVisionNeck(config) - - self.gradient_checkpointing = False - self.post_init() - - def get_input_embeddings(self): - return self.patch_embed - - @merge_with_config_defaults - @capture_outputs(tie_last_hidden_states=False) - def forward( - self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | PPChart2TableVisionEncoderOutput: - if pixel_values is None: - raise ValueError("You have to specify pixel_values") - - hidden_states = self.patch_embed(pixel_values) - if self.pos_embed is not None: - hidden_states = hidden_states + self.pos_embed - for layer_module in self.layers: - hidden_states = layer_module(hidden_states) - hidden_states = self.neck(hidden_states) - return PPChart2TableVisionEncoderOutput( - last_hidden_state=hidden_states, - ) - - -@dataclass -class PPChart2TableModelOutputWithPast(BaseModelOutputWithPast): - r""" - past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). - - Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see - `past_key_values` input) to speed up sequential decoding. 
- image_hidden_states (`torch.FloatTensor`, *optional*): - A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. - image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. - """ - - image_hidden_states: torch.FloatTensor | None = None - - -@auto_docstring -class PPChart2TablePreTrainedModel(PreTrainedModel): - config: PPChart2TableConfig - base_model_prefix = "model" - input_modalities = ("image", "text") - supports_gradient_checkpointing = True - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn = False - _supports_sdpa = False - - _can_compile_fullgraph = True - _supports_flex_attn = False - _supports_attention_backend = True - - @torch.no_grad() - def _init_weights(self, module): - super()._init_weights(module) - if isinstance(module, PPChart2TableVisionAttention): - if module.use_rel_pos: - init.zeros_(module.rel_pos_h) - init.zeros_(module.rel_pos_w) - elif isinstance(module, PPChart2TableVisionEncoder): - if module.pos_embed is not None: - init.zeros_(module.pos_embed) - - -class PPChart2TableMultiModalProjector(nn.Module): - def __init__(self, config: PPChart2TableConfig): - super().__init__() - vision_output_channels = config.vision_config.output_channels - language_hidden_size = config.text_config.hidden_size - self.conv_upsampler1 = nn.Conv2d( - vision_output_channels, vision_output_channels * 2, kernel_size=3, stride=2, padding=1, bias=False - ) - self.conv_upsampler2 = nn.Conv2d( - vision_output_channels * 2, language_hidden_size, kernel_size=3, stride=2, padding=1, bias=False - ) - self.multimodal_projector = nn.Linear(language_hidden_size, language_hidden_size) - - def forward(self, vision_embeddings: torch.Tensor) -> torch.Tensor: - hidden_state = self.conv_upsampler1(vision_embeddings) - hidden_state = self.conv_upsampler2(hidden_state) - hidden_state = hidden_state.flatten(2).permute(0, 2, 1) - hidden_state = 
self.multimodal_projector(hidden_state) - return hidden_state - - -@auto_docstring -class PPChart2TableModel(PPChart2TablePreTrainedModel): - def __init__(self, config: PPChart2TableConfig): - super().__init__(config) - self.vision_tower = PPChart2TableVisionEncoder(config.vision_config) - - self.multi_modal_projector = PPChart2TableMultiModalProjector(config) - self.language_model = AutoModel.from_config(config.text_config) - self.post_init() - - def get_input_embeddings(self): - return self.language_model.get_input_embeddings() - - def set_input_embeddings(self, value): - self.language_model.set_input_embeddings(value) - - @can_return_tuple - @auto_docstring( - custom_intro="Obtains image last hidden states from the vision tower and apply multimodal projection." - ) - def get_image_features( - self, - pixel_values: torch.FloatTensor, - **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: - image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs) - last_hidden_state = image_outputs.last_hidden_state - image_outputs.pooler_output = self.multi_modal_projector(last_hidden_state) - - return image_outputs - - def get_placeholder_mask( - self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor - ): - """ - Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is - equal to the length of multimodal features. If the lengths are different, an error is raised. 
- """ - if input_ids is None: - special_image_mask = inputs_embeds == self.get_input_embeddings()( - torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) - ) - special_image_mask = special_image_mask.all(-1) - else: - special_image_mask = input_ids == self.config.image_token_id - - n_image_tokens = special_image_mask.sum() - n_image_features = image_features.shape[0] * image_features.shape[1] - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) - torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), - f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}", - ) - return special_image_mask - - @can_return_tuple - @auto_docstring - def forward( - self, - input_ids: torch.LongTensor | None = None, - pixel_values: torch.FloatTensor | None = None, - attention_mask: torch.Tensor | None = None, - position_ids: torch.LongTensor | None = None, - past_key_values: Cache | None = None, - inputs_embeds: torch.FloatTensor | None = None, - use_cache: bool | None = None, - **kwargs: Unpack[TransformersKwargs], - ) -> tuple | PPChart2TableModelOutputWithPast: - if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - - if inputs_embeds is None: - inputs_embeds = self.get_input_embeddings()(input_ids) - - if pixel_values is not None: - image_features = self.get_image_features( - pixel_values=pixel_values.to(inputs_embeds.dtype), return_dict=True - ).pooler_output - image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) - special_image_mask = self.get_placeholder_mask( - input_ids, inputs_embeds=inputs_embeds, image_features=image_features - ) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) - - outputs = self.language_model( - attention_mask=attention_mask, - 
position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - return_dict=True, - **kwargs, - ) - - return PPChart2TableModelOutputWithPast( - last_hidden_state=outputs.last_hidden_state, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - image_hidden_states=image_features if pixel_values is not None else None, - ) - - -@dataclass -@auto_docstring( - custom_intro=""" - Base class for PPChart2Table causal language model (or autoregressive) outputs. - """ -) -class PPChart2TableCausalLMOutputWithPast(ModelOutput): - r""" - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Language modeling loss (for next-token prediction). - logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). - - Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see - `past_key_values` input) to speed up sequential decoding. - image_hidden_states (`torch.FloatTensor`, *optional*): - A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. - image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. 
- """ - - loss: torch.FloatTensor | None = None - logits: torch.FloatTensor | None = None - past_key_values: Cache | None = None - hidden_states: tuple[torch.FloatTensor] | None = None - attentions: tuple[torch.FloatTensor] | None = None - image_hidden_states: torch.FloatTensor | None = None - - -@auto_docstring( - custom_intro=""" - PP-Chart2Table model for conditional generation (table text generation from chart images), - extending the core model with a language modeling (LM) head and generation utilities. - """ -) -class PPChart2TableForConditionalGeneration(PPChart2TablePreTrainedModel, GenerationMixin): - _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"} - - def __init__(self, config: PPChart2TableConfig): - super().__init__(config) - self.model = PPChart2TableModel(config) - self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) - self.post_init() - - def get_input_embeddings(self): - return self.model.get_input_embeddings() - - def set_input_embeddings(self, value): - self.model.set_input_embeddings(value) - - def get_output_embeddings(self) -> nn.Module: - return self.lm_head - - @auto_docstring - def get_image_features( - self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: - return self.model.get_image_features(pixel_values=pixel_values, **kwargs) - - @can_return_tuple - @auto_docstring - def forward( - self, - input_ids: torch.LongTensor | None = None, - pixel_values: torch.FloatTensor | None = None, - attention_mask: torch.Tensor | None = None, - position_ids: torch.LongTensor | None = None, - past_key_values: Cache | None = None, - inputs_embeds: torch.FloatTensor | None = None, - labels: torch.LongTensor | None = None, - use_cache: bool | None = None, - logits_to_keep: int | torch.Tensor = 0, - **kwargs: Unpack[TransformersKwargs], - ) -> tuple | PPChart2TableCausalLMOutputWithPast: - r""" - labels 
(`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Example: - - ```python - >>> from PIL import Image - >>> import httpx - >>> from io import BytesIO - >>> from transformers import AutoProcessor, PPChart2TableForConditionalGeneration, TextStreamer - - >>> model = PPChart2TableForConditionalGeneration.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf").to("cuda") - >>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf") - - >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png" - >>> with httpx.stream("GET", url) as response: - ... image = Image.open(BytesIO(response.read())) - - >>> inputs = processor(image, return_tensors="pt", color="green").to("cuda") - - >>> # Generate - >>> streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True) - >>> generate_ids = model.generate( - ... **inputs, - ... do_sample=False, - ... tokenizer = processor.tokenizer, - ... stop_strings='<|im_end|>', - ... streamer=streamer, - ... max_new_tokens=4096, - ... ) - "You should keep in mind what features from the module should be used, especially - when you're planning to sell a template." 
- ```""" - outputs = self.model( - input_ids=input_ids, - pixel_values=pixel_values, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - return_dict=True, - logits_to_keep=logits_to_keep, - **kwargs, - ) - - hidden_states = outputs[0] - # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep - logits = self.lm_head(hidden_states[:, slice_indices, :]) - - loss = None - if labels is not None: - loss = self.loss_function( - logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs - ) - - return PPChart2TableCausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - image_hidden_states=outputs.image_hidden_states, - ) - - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - inputs_embeds=None, - pixel_values=None, - attention_mask=None, - logits_to_keep=None, - is_first_iteration=False, - **kwargs, - ): - # Overwritten -- in specific circumstances we don't want to forward image inputs to the model - - model_inputs = super().prepare_inputs_for_generation( - input_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - logits_to_keep=logits_to_keep, - is_first_iteration=is_first_iteration, - **kwargs, - ) - - if is_first_iteration or not kwargs.get("use_cache", True): - # Pixel values are used only in the first iteration if available - # In subsequent iterations, they are already merged with text and cached - # NOTE: first iteration doesn't have to be prefill, it can be the first - # iteration with a question and cached system prompt (continue generate from cache) - model_inputs["pixel_values"] = pixel_values - - 
return model_inputs - - -__all__ = [ - "PPChart2TableForConditionalGeneration", - "PPChart2TableModel", - "PPChart2TableVisionPreTrainedModel", - "PPChart2TablePreTrainedModel", -] diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index 4fe5082fdecf..b6333b5c4547 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -19,7 +19,8 @@ from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput -from ...utils import auto_docstring, logging, requires_backends +from ...utils import auto_docstring, logging +from ...utils.import_utils import requires from ..got_ocr2.configuration_got_ocr2 import GotOcr2Config @@ -46,6 +47,7 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast): @auto_docstring +@requires(backends=("torch",)) class PPChart2TableProcessor(ProcessorMixin): image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" @@ -71,7 +73,6 @@ def __call__( text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None, **kwargs: Unpack[ProcessingKwargs], ) -> BatchFeature: - requires_backends(self, "torch") output_kwargs = self._merge_kwargs( ProcessingKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py index ad70b5edc282..ed0ea50f4de5 100644 --- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py @@ -23,10 +23,12 @@ from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import 
PreTokenizedInput, TextInput -from ...utils import auto_docstring, requires_backends +from ...utils import auto_docstring +from ...utils.import_utils import requires @auto_docstring +@requires(backends=("torch",)) class PPChart2TableProcessor(ProcessorMixin): image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" @@ -52,7 +54,6 @@ def __call__( text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None, **kwargs: Unpack[ProcessingKwargs], ) -> BatchFeature: - requires_backends(self, "torch") output_kwargs = self._merge_kwargs( ProcessingKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, From 8e201c1516bd2c8c367b9ba2e9e209203e5fa920 Mon Sep 17 00:00:00 2001 From: vasqu Date: Wed, 18 Mar 2026 15:55:13 +0100 Subject: [PATCH 43/60] more explicit skip msg --- .../models/pp_chart2table/test_processing_pp_chart2table.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py index bd4592c18746..9bc3c174e331 100644 --- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py @@ -63,14 +63,14 @@ def test_unstructured_kwargs_batched(self): self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) - @unittest.skip(reason="PPChart2Table image input cannot be None") + @unittest.skip(reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None") def test_apply_chat_template_assistant_mask(self): pass - @unittest.skip(reason="PPChart2Table image input cannot be None") + @unittest.skip(reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None") def test_apply_chat_template_image_0(self): pass - @unittest.skip(reason="PPChart2Table image input cannot be None") + @unittest.skip(reason="PPChart2Table 
uses predetermined input - chat template usage not intended + image input cannot be None") def test_apply_chat_template_image_1(self): pass From 787e65094fa2c352da9433f73bffb9fdc4027e96 Mon Sep 17 00:00:00 2001 From: vasqu Date: Wed, 18 Mar 2026 16:01:54 +0100 Subject: [PATCH 44/60] some quick fixes --- docs/source/en/model_doc/pp_chart2table.md | 2 +- .../pp_chart2table/test_processing_pp_chart2table.py | 12 +++++++++--- utils/check_config_attributes.py | 3 +++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md index 9dc464d31473..02ff7751e68d 100644 --- a/docs/source/en/model_doc/pp_chart2table.md +++ b/docs/source/en/model_doc/pp_chart2table.md @@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> -*This model was released on 2025-05-20 and added to Hugging Face Transformers on 2026-03-16.* +*This model was released on 2025-05-20 and added to Hugging Face Transformers on 2026-03-18.* # PP-Chart2Table diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py index 9bc3c174e331..cd6b7e64f7bf 100644 --- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py @@ -63,14 +63,20 @@ def test_unstructured_kwargs_batched(self): self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) - @unittest.skip(reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None") + @unittest.skip( + reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None" + ) def test_apply_chat_template_assistant_mask(self): pass - @unittest.skip(reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be 
None") + @unittest.skip( + reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None" + ) def test_apply_chat_template_image_0(self): pass - @unittest.skip(reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None") + @unittest.skip( + reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None" + ) def test_apply_chat_template_image_1(self): pass diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index 01b9a3c7ecb7..b1e968c71f56 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -138,6 +138,9 @@ "GptOssConfig": True, "LwDetrConfig": True, "NemotronHConfig": True, + # Internally uses Got Ocr2 so no need to use in the modeling code as we remap in auto instead + "PPChart2TableConfig": True, + "PPChart2TableVisionConfig": True, } # Common and important attributes, even if they do not always appear in the modeling files (can be a regex pattern) From 7280cf56b75b212c5d72bdac15488be003b1df2e Mon Sep 17 00:00:00 2001 From: vasqu Date: Wed, 18 Mar 2026 16:14:58 +0100 Subject: [PATCH 45/60] fix --- docs/source/en/model_doc/pp_chart2table.md | 7 ------- .../configuration_pp_chart2table.py | 6 +++--- .../pp_chart2table/modular_pp_chart2table.py | 16 ++++++++++++++++ .../test_modeling_pp_chart2table.py | 10 ++-------- 4 files changed, 21 insertions(+), 18 deletions(-) diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md index 02ff7751e68d..2f7c43e429dd 100644 --- a/docs/source/en/model_doc/pp_chart2table.md +++ b/docs/source/en/model_doc/pp_chart2table.md @@ -145,13 +145,6 @@ print(result) -## PPChart2TableForConditionalGeneration - -[[autodoc]] PPChart2TableForConditionalGeneration - -## PPChart2TableModel - -[[autodoc]] PPChart2TableModel ## PPChart2TableConfig diff --git 
a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py index d85c61d942b9..3e85e2c96667 100644 --- a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py @@ -70,13 +70,13 @@ class PPChart2TableConfig(PreTrainedConfig): Example: ```python - >>> from transformers import PPChart2TableForConditionalGeneration, PPChart2TableConfig + >>> from transformers import GotOcr2ForConditionalGeneration, PPChart2TableConfig >>> # Initializing a PPChart2Table style configuration >>> configuration = PPChart2TableConfig() - >>> # Initializing a model from the Qwen2-VL-7B style configuration - >>> model = PPChart2TableForConditionalGeneration(configuration) + >>> # Initializing a model from the PaddlePaddle/PP-Chart2Table_safetensors style configuration + >>> model = GotOcr2ForConditionalGeneration(configuration) # underlying architecture is Got Ocr 2 >>> # Accessing the model configuration >>> configuration = model.config diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index b6333b5c4547..fd8f35ddaa92 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -32,6 +32,22 @@ class PPChart2TableConfig(GotOcr2Config): model_type = "pp_chart2table" + r""" + Example: + + ```python + >>> from transformers import GotOcr2ForConditionalGeneration, PPChart2TableConfig + + >>> # Initializing a PPChart2Table style configuration + >>> configuration = PPChart2TableConfig() + + >>> # Initializing a model from the PaddlePaddle/PP-Chart2Table_safetensors style configuration + >>> model = GotOcr2ForConditionalGeneration(configuration) # underlying architecture is Got Ocr 2 + + >>> # Accessing the model configuration + >>> 
configuration = model.config + ```""" + @auto_docstring class PPChart2TableImageProcessorFast(BaseImageProcessorFast): diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py index 32b9fff60c38..04977cb8b7c5 100644 --- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py @@ -16,19 +16,13 @@ import unittest from transformers import ( + AutoModelForImageTextToText, AutoProcessor, - is_torch_available, is_vision_available, ) from transformers.testing_utils import cleanup, require_torch, require_vision, slow, torch_device -if is_torch_available(): - from transformers import ( - PPChart2TableForConditionalGeneration, - ) - - if is_vision_available(): from transformers.image_utils import load_image @@ -39,7 +33,7 @@ class PPChart2TableIntegrationTest(unittest.TestCase): def setUp(self): model_path = "PaddlePaddle/PP-Chart2Table_safetensors" - self.model = PPChart2TableForConditionalGeneration.from_pretrained(model_path).to(torch_device) + self.model = AutoModelForImageTextToText.from_pretrained(model_path).to(torch_device) self.processor = AutoProcessor.from_pretrained(model_path) url = "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png" self.image = load_image(url) From 957220ae2494bebe2269b4847ac305325b009d64 Mon Sep 17 00:00:00 2001 From: vasqu Date: Wed, 18 Mar 2026 16:36:59 +0100 Subject: [PATCH 46/60] quick cleanups --- .../image_processing_pp_chart2table.py | 33 +++++++++++++++++++ .../image_processing_pp_chart2table_fast.py | 2 ++ .../pp_chart2table/modular_pp_chart2table.py | 17 ++++++++-- .../processing_pp_chart2table.py | 2 -- 4 files changed, 49 insertions(+), 5 deletions(-) create mode 100644 src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py 
b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py new file mode 100644 index 000000000000..c94d4bc5557c --- /dev/null +++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py @@ -0,0 +1,33 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/pp_chart2table/modular_pp_chart2table.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_pp_chart2table.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2026 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...processing_utils import ImagesKwargs + + +class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False): + r""" + patch_size (`int`, *optional*, defaults to `16`): + The expected patch size out of the image processor. + num_patches (`int`, *optional*, defaults to `16`): + Alias for `patch_size`.
+ """ + + patch_size: int + num_patches: int diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py index ef857db5b658..48e0a3d0b1d8 100644 --- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py +++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py @@ -20,6 +20,7 @@ from ...image_processing_utils_fast import BaseImageProcessorFast from ...utils import auto_docstring +from .image_processing_pp_chart2table import PPChart2TableImageProcessorKwargs @auto_docstring @@ -33,6 +34,7 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast): do_resize = True do_rescale = True do_normalize = True + valid_kwargs = PPChart2TableImageProcessorKwargs __all__ = ["PPChart2TableImageProcessorFast"] diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index fd8f35ddaa92..79555850699c 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -17,7 +17,7 @@ from ...feature_extraction_utils import BatchFeature from ...image_processing_utils_fast import BaseImageProcessorFast from ...image_utils import ImageInput -from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import auto_docstring, logging from ...utils.import_utils import requires @@ -49,6 +49,18 @@ class PPChart2TableConfig(GotOcr2Config): ```""" +class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False): + r""" + patch_size (`int`, *optional*, defaults to `16`): + The expected patch size out of the image processor. 
+ num_patches (`int`, *optional*, defaults to `16`): + Alias for `patch_size`. + """ + + patch_size: int + num_patches: int + + @auto_docstring class PPChart2TableImageProcessorFast(BaseImageProcessorFast): resample = 3 @@ -60,13 +72,12 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast): do_resize = True do_rescale = True do_normalize = True + valid_kwargs = PPChart2TableImageProcessorKwargs @auto_docstring @requires(backends=("torch",)) class PPChart2TableProcessor(ProcessorMixin): - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" model_input_names = ["input_ids", "pixel_values"] def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py index ed0ea50f4de5..9eb2b58c6078 100644 --- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py @@ -30,8 +30,6 @@ @auto_docstring @requires(backends=("torch",)) class PPChart2TableProcessor(ProcessorMixin): - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" model_input_names = ["input_ids", "pixel_values"] def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): From 9f74fa1c745d37c96b13c04ece95affd2fd25752 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Thu, 19 Mar 2026 16:42:19 +0800 Subject: [PATCH 47/60] update --- docs/source/en/model_doc/pp_chart2table.md | 125 ++++++++++++------ .../image_processing_pp_chart2table.py | 8 +- .../pp_chart2table/modular_pp_chart2table.py | 38 ++---- .../processing_pp_chart2table.py | 30 ++--- .../test_modeling_pp_chart2table.py | 70 ++++++---- .../test_processing_pp_chart2table.py | 19 ++- 6 files changed, 178 insertions(+), 112 deletions(-) diff --git a/docs/source/en/model_doc/pp_chart2table.md 
b/docs/source/en/model_doc/pp_chart2table.md index 2f7c43e429dd..dde874495c78 100644 --- a/docs/source/en/model_doc/pp_chart2table.md +++ b/docs/source/en/model_doc/pp_chart2table.md @@ -39,24 +39,26 @@ The example below demonstrates how to classify image with PP-Chart2Table using [ ```py -import requests -from PIL import Image from transformers import pipeline -model_path = "PaddlePaddle/PP-Chart2Table_safetensors" -pipe = pipeline( - task="image-text-to-text", - model=model_path, - device_map="auto", -) -image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw) -# text is empty - processor uses hardcoded "Chart to table" instruction internally via chat template -result = pipe( - images=image, - text="", - do_sample=False, - max_new_tokens=256 -) -print(result) + +pipe = pipeline("image-text-to-text", model="PaddlePaddle/PP-Chart2Table_safetensors") + +# PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template +conversation = [ + { + "role": "system", + "content": [], + }, + { + "role": "user", + "content": [ + {"type": "image", "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", "num_patches": 16}, + {"type": "text", "text": "Chart to table"} + ], + } +] +result = pipe(text=conversation) +print(result[0]["generated_text"]) ``` @@ -76,8 +78,29 @@ model = AutoModelForImageTextToText.from_pretrained( ) processor = AutoProcessor.from_pretrained(model_path) -image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw) -inputs = processor(images=image).to(model.device) +# PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template +conversation = [ + { + "role": "system", + "content": [], + }, + { + "role": "user", + "content": [ + {"type": "image", "url": 
"https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", "num_patches": processor.image_processor.num_patches}, + {"type": "text", "text": "Chart to table"} + ], + } +] + +inputs = processor.apply_chat_template( + conversation, + tokenize=True, + add_generation_prompt=True, + truncation=True, + return_dict=True, + return_tensors="pt", +).to(model.device) generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=256) generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)] @@ -97,24 +120,27 @@ Here is how you can do it with PP-Chart2Table using [`Pipeline`] or the [`AutoMo ```py -import requests from transformers import pipeline -from PIL import Image -model_path = "PaddlePaddle/PP-Chart2Table_safetensors" -pipe = pipeline( - task="image-text-to-text", - model=model_path, - device_map="auto", -) -image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw) -# text is empty - processor uses hardcoded "Chart to table" instruction internally via chat template -result = pipe( - images=[image, image], - text="", - do_sample=False, - max_new_tokens=256 -) -print(result) + +pipe = pipeline("image-text-to-text", model="PaddlePaddle/PP-Chart2Table_safetensors") + +# PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template +conversation = [ + { + "role": "system", + "content": [], + }, + { + "role": "user", + "content": [ + {"type": "image", "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", "num_patches": 16}, + {"type": "text", "text": "Chart to table"} + ], + } +] +result = pipe(text=[conversation, conversation]) +print(result[0][0]["generated_text"]) + ``` @@ -133,13 +159,36 @@ model = AutoModelForImageTextToText.from_pretrained( ) processor = AutoProcessor.from_pretrained(model_path) -image = 
Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", stream=True).raw) -inputs = processor(images=[image, image]).to(model.device) +# PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template +conversation = [ + { + "role": "system", + "content": [], + }, + { + "role": "user", + "content": [ + {"type": "image", "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", "num_patches": processor.image_processor.num_patches}, + {"type": "text", "text": "Chart to table"} + ], + } +] + +batch_conversation = [conversation, conversation] +inputs = processor.apply_chat_template( + batch_conversation, + tokenize=True, + add_generation_prompt=True, + truncation=True, + return_dict=True, + return_tensors="pt", +).to(model.device) generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=256) generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)] result = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False) print(result) + ``` diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py index c94d4bc5557c..5092b216ee99 100644 --- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py @@ -24,9 +24,13 @@ class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False): r""" patch_size (`int`, *optional*, defaults to `16`): - The expected patch size out of the image processor. + The size (in pixels) of each square patch that the image is divided into before being fed into the + vision encoder. + num_patches (`int`, *optional*, defaults to `16`): - Alias for `patch_size`. 
+ Number of patches used to represent the image in the input sequence. This parameter is included in + the chat template's user message to inform the language model about the image structure. The model + uses this information to understand how the image tokens correspond to the visual input. """ patch_size: int diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index 79555850699c..a540d6848509 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch from huggingface_hub.dataclasses import strict from ...feature_extraction_utils import BatchFeature @@ -52,9 +53,13 @@ class PPChart2TableConfig(GotOcr2Config): class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False): r""" patch_size (`int`, *optional*, defaults to `16`): - The expected patch size out of the image processor. + The size (in pixels) of each square patch that the image is divided into before being fed into the + vision encoder. + num_patches (`int`, *optional*, defaults to `16`): - Alias for `patch_size`. + Number of patches used to represent the image in the input sequence. This parameter is included in + the chat template's user message to inform the language model about the image structure. The model + uses this information to understand how the image tokens correspond to the visual input. 
""" patch_size: int @@ -83,17 +88,6 @@ class PPChart2TableProcessor(ProcessorMixin): def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): super().__init__(image_processor, tokenizer, chat_template=chat_template) - # PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template - self.messages = [ - { - "role": "system", - }, - { - "role": "user", - "image": {"num_patches": self.image_processor.num_patches}, - }, - ] - def __call__( self, images: ImageInput = None, @@ -109,19 +103,15 @@ def __call__( if images is None: raise ValueError("At least one of `images` must be provided") image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) - batch_size = image_inputs["pixel_values"].shape[0] - - # Use tokenizer's apply_chat_template instead of manually loading template - inputs = self.tokenizer.apply_chat_template( - self.messages, - tokenize=True, - add_generation_prompt=True, - truncation=True, - **output_kwargs["text_kwargs"], - ) # Prepare input ids for batch - input_ids = inputs["input_ids"].repeat(batch_size, 1) + if text is None: + raise ValueError("At least one of `text` must be provided") + + if not isinstance(text, list): + text = [text] + + input_ids = torch.tensor(self.tokenizer(text, **output_kwargs["text_kwargs"]).input_ids) return BatchFeature(data={"input_ids": input_ids, **image_inputs}) diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py index 9eb2b58c6078..cb5cc19f284b 100644 --- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py @@ -18,6 +18,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import torch from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput @@ -35,17 +36,6 @@ class PPChart2TableProcessor(ProcessorMixin): def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): super().__init__(image_processor, tokenizer, chat_template=chat_template) - # PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template - self.messages = [ - { - "role": "system", - }, - { - "role": "user", - "image": {"num_patches": self.image_processor.num_patches}, - }, - ] - def __call__( self, images: ImageInput = None, @@ -61,19 +51,15 @@ def __call__( if images is None: raise ValueError("At least one of `images` must be provided") image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) - batch_size = image_inputs["pixel_values"].shape[0] - - # Use tokenizer's apply_chat_template instead of manually loading template - inputs = self.tokenizer.apply_chat_template( - self.messages, - tokenize=True, - add_generation_prompt=True, - truncation=True, - **output_kwargs["text_kwargs"], - ) # Prepare input ids for batch - input_ids = inputs["input_ids"].repeat(batch_size, 1) + if text is None: + raise ValueError("At least one of `text` must be provided") + + if not isinstance(text, list): + text = [text] + + input_ids = torch.tensor(self.tokenizer(text, **output_kwargs["text_kwargs"]).input_ids) return BatchFeature(data={"input_ids": input_ids, **image_inputs}) diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py index 04977cb8b7c5..e173112c2172 100644 --- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py @@ -15,18 +15,10 @@ import unittest -from transformers import ( - AutoModelForImageTextToText, - AutoProcessor, - is_vision_available, -) +from transformers import 
AutoModelForImageTextToText, AutoProcessor from transformers.testing_utils import cleanup, require_torch, require_vision, slow, torch_device -if is_vision_available(): - from transformers.image_utils import load_image - - @slow @require_vision @require_torch @@ -35,31 +27,61 @@ def setUp(self): model_path = "PaddlePaddle/PP-Chart2Table_safetensors" self.model = AutoModelForImageTextToText.from_pretrained(model_path).to(torch_device) self.processor = AutoProcessor.from_pretrained(model_path) - url = "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png" - self.image = load_image(url) + self.conversation = [ + { + "role": "system", + "content": [], + }, + { + "role": "user", + "content": [ + { + "type": "image", + "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", + "num_patches": self.processor.image_processor.num_patches, + }, + {"type": "text", "text": "Chart to table"}, + ], + }, + ] def tearDown(self): cleanup(torch_device, gc_collect=True) def test_small_model_integration_test_pp_chart2table(self): - inputs = self.processor(self.image, return_tensors="pt").to(torch_device) - generate_ids = self.model.generate( - **inputs, - use_cache=True, - do_sample=False, - max_new_tokens=32, - ) - decoded_output = self.processor.decode( - generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True + inputs = self.processor.apply_chat_template( + self.conversation, + tokenize=True, + add_generation_prompt=True, + truncation=True, + return_dict=True, + return_tensors="pt", + ).to(self.model.device) + + generated_ids = self.model.generate(**inputs, do_sample=False, max_new_tokens=32) + generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)] + decoded_output = self.processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False ) - expected_output = "ๅนดไปฝ | 
ๅ•ๅฎถไบ”ๆ˜Ÿ็บงๆ—…ๆธธ้ฅญๅบ—ๅนดๅนณๅ‡่ฅๆ”ถ (็™พไธ‡ๅ…ƒ) | ๅ•ๅฎถไบ”ๆ˜Ÿ็บงๆ—…ๆธธ้ฅญๅบ—ๅนดๅนณๅ‡ๅˆฉๆถฆ (็™พไธ‡ๅ…ƒ)\n" + + expected_output = ["ๅนดไปฝ | ๅ•ๅฎถไบ”ๆ˜Ÿ็บงๆ—…ๆธธ้ฅญๅบ—ๅนดๅนณๅ‡่ฅๆ”ถ (็™พไธ‡ๅ…ƒ) | ๅ•ๅฎถไบ”ๆ˜Ÿ็บงๆ—…ๆธธ้ฅญๅบ—ๅนดๅนณๅ‡ๅˆฉๆถฆ (็™พไธ‡ๅ…ƒ)\n"] self.assertEqual(decoded_output, expected_output) def test_small_model_integration_test_pp_chart2table_batched(self): - inputs = self.processor([self.image, self.image], return_tensors="pt").to(torch_device) - generate_ids = self.model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=6) + inputs = self.processor.apply_chat_template( + [self.conversation, self.conversation], + tokenize=True, + add_generation_prompt=True, + truncation=True, + return_dict=True, + return_tensors="pt", + ).to(self.model.device) + + generated_ids = self.model.generate(**inputs, do_sample=False, max_new_tokens=6) + generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)] decoded_output = self.processor.batch_decode( - generate_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False ) + expected_output = ["ๅนดไปฝ | ๅ•ๅฎถ", "ๅนดไปฝ | ๅ•ๅฎถ"] self.assertEqual(decoded_output, expected_output) diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py index cd6b7e64f7bf..89191374ed40 100644 --- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py @@ -37,8 +37,23 @@ def test_image_processor_defaults(self): def test_ocr_queries(self): processor = self.get_processor() image_input = self.prepare_image_inputs() - inputs = processor(image_input, return_tensors="pt") - self.assertEqual(inputs["input_ids"].shape, (1, 286)) + conversation = [ + { + "role": "system", + "content": [], + }, + { + "role": "user", + "content": 
[{"type": "image", "num_patches": 16}, {"type": "text", "text": "Chart to table"}], + }, + ] + inputs = processor.apply_chat_template( + conversation, + tokenize=False, + add_generation_prompt=True, + ) + inputs = processor(images=image_input, text=inputs, return_tensors="pt") + self.assertEqual(inputs["input_ids"].shape, (1, 285)) self.assertEqual(inputs["pixel_values"].shape, (1, 3, 1024, 1024)) def test_unstructured_kwargs_batched(self): From f33cfb5914e3e97891e444ec0383b7e332ab6a9e Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Thu, 19 Mar 2026 20:45:27 +0800 Subject: [PATCH 48/60] update --- docs/source/en/model_doc/pp_chart2table.md | 32 ++++++++++++------- .../pp_chart2table/modular_pp_chart2table.py | 3 +- .../processing_pp_chart2table.py | 5 +-- .../test_modeling_pp_chart2table.py | 2 -- .../test_processing_pp_chart2table.py | 21 +++++------- 5 files changed, 30 insertions(+), 33 deletions(-) diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md index dde874495c78..dbb5d14fe1cd 100644 --- a/docs/source/en/model_doc/pp_chart2table.md +++ b/docs/source/en/model_doc/pp_chart2table.md @@ -52,10 +52,12 @@ conversation = [ { "role": "user", "content": [ - {"type": "image", "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", "num_patches": 16}, - {"type": "text", "text": "Chart to table"} + { + "type": "image", + "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", + }, ], - } + }, ] result = pipe(text=conversation) print(result[0]["generated_text"]) @@ -87,10 +89,12 @@ conversation = [ { "role": "user", "content": [ - {"type": "image", "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", "num_patches": processor.image_processor.num_patches}, - {"type": "text", "text": "Chart to table"} + { + "type": "image", + "url": 
"https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", + }, ], - } + }, ] inputs = processor.apply_chat_template( @@ -133,10 +137,12 @@ conversation = [ { "role": "user", "content": [ - {"type": "image", "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", "num_patches": 16}, - {"type": "text", "text": "Chart to table"} + { + "type": "image", + "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", + }, ], - } + }, ] result = pipe(text=[conversation, conversation]) print(result[0][0]["generated_text"]) @@ -168,10 +174,12 @@ conversation = [ { "role": "user", "content": [ - {"type": "image", "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", "num_patches": processor.image_processor.num_patches}, - {"type": "text", "text": "Chart to table"} + { + "type": "image", + "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", + }, ], - } + }, ] batch_conversation = [conversation, conversation] diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index a540d6848509..47c72069d0de 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -81,7 +81,6 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast): @auto_docstring -@requires(backends=("torch",)) class PPChart2TableProcessor(ProcessorMixin): model_input_names = ["input_ids", "pixel_values"] @@ -111,7 +110,7 @@ def __call__( if not isinstance(text, list): text = [text] - input_ids = torch.tensor(self.tokenizer(text, **output_kwargs["text_kwargs"]).input_ids) + input_ids = self.tokenizer(text, **output_kwargs["text_kwargs"]).input_ids return BatchFeature(data={"input_ids": input_ids, **image_inputs}) diff --git 
a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py index cb5cc19f284b..9047f6d45b35 100644 --- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py @@ -18,18 +18,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import auto_docstring -from ...utils.import_utils import requires @auto_docstring -@requires(backends=("torch",)) class PPChart2TableProcessor(ProcessorMixin): model_input_names = ["input_ids", "pixel_values"] @@ -59,7 +56,7 @@ def __call__( if not isinstance(text, list): text = [text] - input_ids = torch.tensor(self.tokenizer(text, **output_kwargs["text_kwargs"]).input_ids) + input_ids = self.tokenizer(text, **output_kwargs["text_kwargs"]).input_ids return BatchFeature(data={"input_ids": input_ids, **image_inputs}) diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py index e173112c2172..ba739912f072 100644 --- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py @@ -38,9 +38,7 @@ def setUp(self): { "type": "image", "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", - "num_patches": self.processor.image_processor.num_patches, }, - {"type": "text", "text": "Chart to table"}, ], }, ] diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py index 89191374ed40..c02dbfe2858c 100644 --- 
a/tests/models/pp_chart2table/test_processing_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py @@ -27,7 +27,8 @@ class PPChart2TableProcessorTest(ProcessorTesterMixin, unittest.TestCase): @classmethod def _setup_tokenizer(cls): tokenizer_class = cls._get_component_class_from_processor("tokenizer") - tokenizer = tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors") + # tokenizer = tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors") + tokenizer = tokenizer_class.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table") return tokenizer @unittest.skip("PPChart2TableProcessor pop the image processor output 'num_patches'") @@ -38,14 +39,8 @@ def test_ocr_queries(self): processor = self.get_processor() image_input = self.prepare_image_inputs() conversation = [ - { - "role": "system", - "content": [], - }, - { - "role": "user", - "content": [{"type": "image", "num_patches": 16}, {"type": "text", "text": "Chart to table"}], - }, + {"role": "system", "content": []}, + {"role": "user", "content": []} ] inputs = processor.apply_chat_template( conversation, @@ -53,7 +48,7 @@ def test_ocr_queries(self): add_generation_prompt=True, ) inputs = processor(images=image_input, text=inputs, return_tensors="pt") - self.assertEqual(inputs["input_ids"].shape, (1, 285)) + self.assertEqual(inputs["input_ids"].shape, (1, 287)) self.assertEqual(inputs["pixel_values"].shape, (1, 3, 1024, 1024)) def test_unstructured_kwargs_batched(self): @@ -79,19 +74,19 @@ def test_unstructured_kwargs_batched(self): self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) @unittest.skip( - reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None" + reason="PPChart2Table relies on a predetermined input format; chat template usage is not intended, and image input cannot be None." 
) def test_apply_chat_template_assistant_mask(self): pass @unittest.skip( - reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None" + reason="PPChart2Table relies on a predetermined input format; chat template usage is not intended, and image input cannot be None." ) def test_apply_chat_template_image_0(self): pass @unittest.skip( - reason="PPChart2Table uses predetermined input - chat template usage not intended + image input cannot be None" + reason="PPChart2Table relies on a predetermined input format; chat template usage is not intended, and image input cannot be None." ) def test_apply_chat_template_image_1(self): pass From 8394c0814300a45e9ea06c364b56c3f016059447 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Thu, 19 Mar 2026 20:48:01 +0800 Subject: [PATCH 49/60] update --- .../models/pp_chart2table/modular_pp_chart2table.py | 2 -- .../models/pp_chart2table/test_processing_pp_chart2table.py | 5 +---- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index 47c72069d0de..f4d76b17e9ba 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import torch from huggingface_hub.dataclasses import strict from ...feature_extraction_utils import BatchFeature @@ -21,7 +20,6 @@ from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import auto_docstring, logging -from ...utils.import_utils import requires from ..got_ocr2.configuration_got_ocr2 import GotOcr2Config diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py index c02dbfe2858c..e89dbd9db900 100644 --- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py @@ -38,10 +38,7 @@ def test_image_processor_defaults(self): def test_ocr_queries(self): processor = self.get_processor() image_input = self.prepare_image_inputs() - conversation = [ - {"role": "system", "content": []}, - {"role": "user", "content": []} - ] + conversation = [{"role": "system", "content": []}, {"role": "user", "content": []}] inputs = processor.apply_chat_template( conversation, tokenize=False, From 3c0b0282a000f91cc46864ede1c6451198f6ffcf Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Thu, 19 Mar 2026 22:33:48 +0800 Subject: [PATCH 50/60] update --- tests/models/pp_chart2table/test_processing_pp_chart2table.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py index e89dbd9db900..42f4fc9bb63d 100644 --- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py @@ -27,8 +27,7 @@ class PPChart2TableProcessorTest(ProcessorTesterMixin, unittest.TestCase): @classmethod def _setup_tokenizer(cls): tokenizer_class = cls._get_component_class_from_processor("tokenizer") - # tokenizer = 
tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors") - tokenizer = tokenizer_class.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table") + tokenizer = tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors") return tokenizer @unittest.skip("PPChart2TableProcessor pop the image processor output 'num_patches'") From b50607cd1707fe2d54f00af37d39883f4739440a Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Thu, 19 Mar 2026 22:48:53 +0800 Subject: [PATCH 51/60] update --- docs/source/en/model_doc/pp_chart2table.md | 16 ------------ .../pp_chart2table/modular_pp_chart2table.py | 23 +++-------------- .../processing_pp_chart2table.py | 25 +++---------------- .../test_modeling_pp_chart2table.py | 5 +--- .../test_processing_pp_chart2table.py | 5 ++-- 5 files changed, 10 insertions(+), 64 deletions(-) diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md index dbb5d14fe1cd..6ebbd3090e1f 100644 --- a/docs/source/en/model_doc/pp_chart2table.md +++ b/docs/source/en/model_doc/pp_chart2table.md @@ -45,10 +45,6 @@ pipe = pipeline("image-text-to-text", model="PaddlePaddle/PP-Chart2Table_safeten # PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template conversation = [ - { - "role": "system", - "content": [], - }, { "role": "user", "content": [ @@ -82,10 +78,6 @@ processor = AutoProcessor.from_pretrained(model_path) # PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template conversation = [ - { - "role": "system", - "content": [], - }, { "role": "user", "content": [ @@ -130,10 +122,6 @@ pipe = pipeline("image-text-to-text", model="PaddlePaddle/PP-Chart2Table_safeten # PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template conversation = [ - { - "role": "system", - "content": [], - }, { "role": "user", "content": [ @@ -167,10 +155,6 @@ processor = 
AutoProcessor.from_pretrained(model_path) # PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template conversation = [ - { - "role": "system", - "content": [], - }, { "role": "user", "content": [ diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index f4d76b17e9ba..5643bac325c7 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -80,7 +80,6 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast): @auto_docstring class PPChart2TableProcessor(ProcessorMixin): - model_input_names = ["input_ids", "pixel_values"] def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): super().__init__(image_processor, tokenizer, chat_template=chat_template) @@ -91,26 +90,10 @@ def __call__( text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None, **kwargs: Unpack[ProcessingKwargs], ) -> BatchFeature: - output_kwargs = self._merge_kwargs( - ProcessingKwargs, - tokenizer_init_kwargs=self.tokenizer.init_kwargs, - **kwargs, - ) - if images is None: - raise ValueError("At least one of `images` must be provided") - image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) - - # Prepare input ids for batch - if text is None: - raise ValueError("At least one of `text` must be provided") - - if not isinstance(text, list): - text = [text] - - input_ids = self.tokenizer(text, **output_kwargs["text_kwargs"]).input_ids - - return BatchFeature(data={"input_ids": input_ids, **image_inputs}) + if text is None or images is None: + raise ValueError("Both `images` and `text` must be provided") + return super().__call__(images=images, text=text, **kwargs) __all__ = [ diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py 
b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py index 9047f6d45b35..6f5e4554b731 100644 --- a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py @@ -28,8 +28,6 @@ @auto_docstring class PPChart2TableProcessor(ProcessorMixin): - model_input_names = ["input_ids", "pixel_values"] - def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): super().__init__(image_processor, tokenizer, chat_template=chat_template) @@ -39,26 +37,9 @@ def __call__( text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None, **kwargs: Unpack[ProcessingKwargs], ) -> BatchFeature: - output_kwargs = self._merge_kwargs( - ProcessingKwargs, - tokenizer_init_kwargs=self.tokenizer.init_kwargs, - **kwargs, - ) - - if images is None: - raise ValueError("At least one of `images` must be provided") - image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) - - # Prepare input ids for batch - if text is None: - raise ValueError("At least one of `text` must be provided") - - if not isinstance(text, list): - text = [text] - - input_ids = self.tokenizer(text, **output_kwargs["text_kwargs"]).input_ids - - return BatchFeature(data={"input_ids": input_ids, **image_inputs}) + if text is None or images is None: + raise ValueError("Both `images` and `text` must be provided") + return super().__call__(images=images, text=text, **kwargs) __all__ = ["PPChart2TableProcessor"] diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py index ba739912f072..cf710f248010 100644 --- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py @@ -25,13 +25,10 @@ class PPChart2TableIntegrationTest(unittest.TestCase): def setUp(self): model_path = "PaddlePaddle/PP-Chart2Table_safetensors" 
+ # model_path = "/workspace/model_weight_torch/PP-Chart2Table" self.model = AutoModelForImageTextToText.from_pretrained(model_path).to(torch_device) self.processor = AutoProcessor.from_pretrained(model_path) self.conversation = [ - { - "role": "system", - "content": [], - }, { "role": "user", "content": [ diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py index 42f4fc9bb63d..07f75a62efa7 100644 --- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py @@ -27,7 +27,8 @@ class PPChart2TableProcessorTest(ProcessorTesterMixin, unittest.TestCase): @classmethod def _setup_tokenizer(cls): tokenizer_class = cls._get_component_class_from_processor("tokenizer") - tokenizer = tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors") + tokenizer = tokenizer_class.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table") + # tokenizer = tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors") return tokenizer @unittest.skip("PPChart2TableProcessor pop the image processor output 'num_patches'") @@ -37,7 +38,7 @@ def test_image_processor_defaults(self): def test_ocr_queries(self): processor = self.get_processor() image_input = self.prepare_image_inputs() - conversation = [{"role": "system", "content": []}, {"role": "user", "content": []}] + conversation = [{"role": "user", "content": []}] inputs = processor.apply_chat_template( conversation, tokenize=False, From 44529f73ad9ad3dcd2183f3361cbe1772dd5f774 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Thu, 19 Mar 2026 22:49:55 +0800 Subject: [PATCH 52/60] update --- .../models/pp_chart2table/modular_pp_chart2table.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index 
5643bac325c7..b59dfd8cd4f8 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -80,7 +80,6 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast): @auto_docstring class PPChart2TableProcessor(ProcessorMixin): - def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): super().__init__(image_processor, tokenizer, chat_template=chat_template) @@ -90,7 +89,6 @@ def __call__( text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None, **kwargs: Unpack[ProcessingKwargs], ) -> BatchFeature: - if text is None or images is None: raise ValueError("Both `images` and `text` must be provided") return super().__call__(images=images, text=text, **kwargs) From ba238c889c1a83aaf70104afb5fc360dcccdda80 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Thu, 19 Mar 2026 22:58:32 +0800 Subject: [PATCH 53/60] update --- src/transformers/models/auto/image_processing_auto.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 0060eff0007c..29c145ade42c 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -194,6 +194,7 @@ ("pixio", {"torchvision": "BitImageProcessor", "pil": "BitImageProcessorPil"}), ("pixtral", {"torchvision": "PixtralImageProcessor", "pil": "PixtralImageProcessorPil"}), ("poolformer", {"torchvision": "PoolFormerImageProcessor", "pil": "PoolFormerImageProcessorPil"}), + ("pp_chart2table", (None, "PPChart2TableImageProcessorFast")), ("pp_doclayout_v2", {"torchvision": "PPDocLayoutV2ImageProcessor"}), ("pp_doclayout_v3", {"torchvision": "PPDocLayoutV3ImageProcessor"}), ("pp_lcnet", {"torchvision": "PPLCNetImageProcessor"}), From e7401f06b058ac6a69a887a639dfdaef595a1694 Mon Sep 17 00:00:00 2001 From: vasqu Date: Thu, 19 
Mar 2026 16:53:21 +0100 Subject: [PATCH 54/60] fixup after new refactor --- .../models/auto/image_processing_auto.py | 2 +- ...=> image_processing_pil_pp_chart2table.py} | 6 +++--- .../image_processing_pp_chart2table.py | 19 +++++++++++++++++ .../pp_chart2table/modular_pp_chart2table.py | 21 ++++++++++++++++--- .../test_processing_pp_chart2table.py | 16 +++++++------- 5 files changed, 48 insertions(+), 16 deletions(-) rename src/transformers/models/pp_chart2table/{image_processing_pp_chart2table_fast.py => image_processing_pil_pp_chart2table.py} (91%) diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 29c145ade42c..6c78d69f7eb8 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -194,7 +194,7 @@ ("pixio", {"torchvision": "BitImageProcessor", "pil": "BitImageProcessorPil"}), ("pixtral", {"torchvision": "PixtralImageProcessor", "pil": "PixtralImageProcessorPil"}), ("poolformer", {"torchvision": "PoolFormerImageProcessor", "pil": "PoolFormerImageProcessorPil"}), - ("pp_chart2table", (None, "PPChart2TableImageProcessorFast")), + ("pp_chart2table", {"torchvision": "PPChart2TableImageProcessor", "pil": "PPChart2TableImageProcessorPil"}), ("pp_doclayout_v2", {"torchvision": "PPDocLayoutV2ImageProcessor"}), ("pp_doclayout_v3", {"torchvision": "PPDocLayoutV3ImageProcessor"}), ("pp_lcnet", {"torchvision": "PPLCNetImageProcessor"}), diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py b/src/transformers/models/pp_chart2table/image_processing_pil_pp_chart2table.py similarity index 91% rename from src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py rename to src/transformers/models/pp_chart2table/image_processing_pil_pp_chart2table.py index 48e0a3d0b1d8..58d60d50d40e 100644 --- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table_fast.py 
+++ b/src/transformers/models/pp_chart2table/image_processing_pil_pp_chart2table.py @@ -18,13 +18,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ...image_processing_utils_fast import BaseImageProcessorFast +from ...image_processing_backends import PilBackend from ...utils import auto_docstring from .image_processing_pp_chart2table import PPChart2TableImageProcessorKwargs @auto_docstring -class PPChart2TableImageProcessorFast(BaseImageProcessorFast): +class PPChart2TableImageProcessorPil(PilBackend): resample = 3 image_mean = [0.48145466, 0.4578275, 0.40821073] image_std = [0.26862954, 0.26130258, 0.27577711] @@ -37,4 +37,4 @@ class PPChart2TableImageProcessorFast(BaseImageProcessorFast): valid_kwargs = PPChart2TableImageProcessorKwargs -__all__ = ["PPChart2TableImageProcessorFast"] +__all__ = ["PPChart2TableImageProcessorPil"] diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py index 5092b216ee99..849322663b74 100644 --- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py @@ -18,7 +18,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from ...image_processing_backends import TorchvisionBackend from ...processing_utils import ImagesKwargs +from ...utils import auto_docstring class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False): @@ -35,3 +37,20 @@ class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False): patch_size: int num_patches: int + + +@auto_docstring +class PPChart2TableImageProcessor(TorchvisionBackend): + resample = 3 + image_mean = [0.48145466, 0.4578275, 0.40821073] + image_std = [0.26862954, 0.26130258, 0.27577711] + size = {"height": 1024, "width": 1024} + patch_size = 16 + num_patches = 16 + do_resize = True + do_rescale = True + do_normalize = True + valid_kwargs = PPChart2TableImageProcessorKwargs + + +__all__ = ["PPChart2TableImageProcessor"] diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index b59dfd8cd4f8..9ddfd8dc90d1 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -15,7 +15,7 @@ from huggingface_hub.dataclasses import strict from ...feature_extraction_utils import BatchFeature -from ...image_processing_utils_fast import BaseImageProcessorFast +from ...image_processing_backends import PilBackend, TorchvisionBackend from ...image_utils import ImageInput from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -65,7 +65,21 @@ class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False): @auto_docstring -class PPChart2TableImageProcessorFast(BaseImageProcessorFast): +class PPChart2TableImageProcessor(TorchvisionBackend): + resample = 3 + image_mean = [0.48145466, 0.4578275, 0.40821073] + image_std = [0.26862954, 0.26130258, 0.27577711] + size = {"height": 1024, "width": 1024} + patch_size = 16 + num_patches = 16 + do_resize = True + do_rescale = True + 
do_normalize = True + valid_kwargs = PPChart2TableImageProcessorKwargs + + +@auto_docstring +class PPChart2TableImageProcessorPil(PilBackend): resample = 3 image_mean = [0.48145466, 0.4578275, 0.40821073] image_std = [0.26862954, 0.26130258, 0.27577711] @@ -96,6 +110,7 @@ def __call__( __all__ = [ "PPChart2TableConfig", - "PPChart2TableImageProcessorFast", + "PPChart2TableImageProcessor", + "PPChart2TableImageProcessorPil", "PPChart2TableProcessor", ] diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py index 07f75a62efa7..5128c63f49b0 100644 --- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py @@ -27,15 +27,13 @@ class PPChart2TableProcessorTest(ProcessorTesterMixin, unittest.TestCase): @classmethod def _setup_tokenizer(cls): tokenizer_class = cls._get_component_class_from_processor("tokenizer") - tokenizer = tokenizer_class.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table") - # tokenizer = tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors") + # TODO: new processor on hub + tokenizer = tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors") + # tokenizer = tokenizer_class.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table") return tokenizer - @unittest.skip("PPChart2TableProcessor pop the image processor output 'num_patches'") - def test_image_processor_defaults(self): - pass - def test_ocr_queries(self): + # TODO: fixme processor = self.get_processor() image_input = self.prepare_image_inputs() conversation = [{"role": "user", "content": []}] @@ -71,19 +69,19 @@ def test_unstructured_kwargs_batched(self): self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) @unittest.skip( - reason="PPChart2Table relies on a predetermined input format; chat template usage is not intended, and image input cannot be None." 
+ reason="PPChart2Table relies on a heavily predetermined input format; chat template usage is not intended as expected" ) def test_apply_chat_template_assistant_mask(self): pass @unittest.skip( - reason="PPChart2Table relies on a predetermined input format; chat template usage is not intended, and image input cannot be None." + reason="PPChart2Table relies on a heavily predetermined input format; chat template usage is not intended as expected" ) def test_apply_chat_template_image_0(self): pass @unittest.skip( - reason="PPChart2Table relies on a predetermined input format; chat template usage is not intended, and image input cannot be None." + reason="PPChart2Table relies on a heavily predetermined input format; chat template usage is not intended as expected" ) def test_apply_chat_template_image_1(self): pass From 28653c33998157235362a8439d2d810bba3905a4 Mon Sep 17 00:00:00 2001 From: vasqu Date: Thu, 19 Mar 2026 16:58:22 +0100 Subject: [PATCH 55/60] fix --- docs/source/en/_toctree.yml | 4 ++-- docs/source/en/model_doc/pp_chart2table.md | 8 ++++++-- src/transformers/models/auto/image_processing_auto.py | 5 ++++- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index f1f63796631c..2aebe0d7e74f 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -1282,10 +1282,10 @@ title: PP-OCRv5_mobile_rec - local: model_doc/pp_ocrv5_server_det title: PP-OCRv5_server_det - - local: model_doc/pp_chart2table - title: PPChart2Table - local: model_doc/pp_ocrv5_server_rec title: PP-OCRv5_server_rec + - local: model_doc/pp_chart2table + title: PPChart2Table - local: model_doc/pp_lcnet title: PPLCNet - local: model_doc/pp_lcnet_v3 diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md index 6ebbd3090e1f..b8b603035c33 100644 --- a/docs/source/en/model_doc/pp_chart2table.md +++ b/docs/source/en/model_doc/pp_chart2table.md @@ -191,9 +191,13 @@ 
print(result) [[autodoc]] PPChart2TableConfig -## PPChart2TableImageProcessorFast +## PPChart2TableImageProcessor -[[autodoc]] PPChart2TableImageProcessorFast +[[autodoc]] PPChart2TableImageProcessor + +## PPChart2TableImageProcessorPil + +[[autodoc]] PPChart2TableImageProcessorPil ## PPChart2TableProcessor diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 6c78d69f7eb8..520c1fb423b3 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -194,7 +194,10 @@ ("pixio", {"torchvision": "BitImageProcessor", "pil": "BitImageProcessorPil"}), ("pixtral", {"torchvision": "PixtralImageProcessor", "pil": "PixtralImageProcessorPil"}), ("poolformer", {"torchvision": "PoolFormerImageProcessor", "pil": "PoolFormerImageProcessorPil"}), - ("pp_chart2table", {"torchvision": "PPChart2TableImageProcessor", "pil": "PPChart2TableImageProcessorPil"}), + ( + "pp_chart2table", + {"torchvision": "PPChart2TableImageProcessor", "pil": "PPChart2TableImageProcessorPil"}, + ), ("pp_doclayout_v2", {"torchvision": "PPDocLayoutV2ImageProcessor"}), ("pp_doclayout_v3", {"torchvision": "PPDocLayoutV3ImageProcessor"}), ("pp_lcnet", {"torchvision": "PPLCNetImageProcessor"}), From d7d8ee832b72a06516b2a50b9feca3da6944a134 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Fri, 20 Mar 2026 00:05:32 +0800 Subject: [PATCH 56/60] update --- tests/models/pp_chart2table/test_processing_pp_chart2table.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py index 5128c63f49b0..8812d2433702 100644 --- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py @@ -29,7 +29,6 @@ def _setup_tokenizer(cls): tokenizer_class = cls._get_component_class_from_processor("tokenizer") # 
TODO: new processor on hub tokenizer = tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors") - # tokenizer = tokenizer_class.from_pretrained("/workspace/model_weight_torch/PP-Chart2Table") return tokenizer def test_ocr_queries(self): From d71e07b270dc8029cbd5b57fc29e49d968f8629b Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Fri, 20 Mar 2026 00:10:31 +0800 Subject: [PATCH 57/60] update --- .../image_processing_pil_pp_chart2table.py | 4 ---- .../image_processing_pp_chart2table.py | 20 ---------------- .../pp_chart2table/modular_pp_chart2table.py | 24 +------------------ 3 files changed, 1 insertion(+), 47 deletions(-) diff --git a/src/transformers/models/pp_chart2table/image_processing_pil_pp_chart2table.py b/src/transformers/models/pp_chart2table/image_processing_pil_pp_chart2table.py index 58d60d50d40e..40cce468b5dc 100644 --- a/src/transformers/models/pp_chart2table/image_processing_pil_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/image_processing_pil_pp_chart2table.py @@ -20,7 +20,6 @@ from ...image_processing_backends import PilBackend from ...utils import auto_docstring -from .image_processing_pp_chart2table import PPChart2TableImageProcessorKwargs @auto_docstring @@ -29,12 +28,9 @@ class PPChart2TableImageProcessorPil(PilBackend): image_mean = [0.48145466, 0.4578275, 0.40821073] image_std = [0.26862954, 0.26130258, 0.27577711] size = {"height": 1024, "width": 1024} - patch_size = 16 - num_patches = 16 do_resize = True do_rescale = True do_normalize = True - valid_kwargs = PPChart2TableImageProcessorKwargs __all__ = ["PPChart2TableImageProcessorPil"] diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py index 849322663b74..b38027aecef9 100644 --- a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py @@ -19,38 
+19,18 @@ # limitations under the License. from ...image_processing_backends import TorchvisionBackend -from ...processing_utils import ImagesKwargs from ...utils import auto_docstring -class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False): - r""" - patch_size (`int`, *optional*, defaults to `16`): - The size (in pixels) of each square patch that the image is divided into before being fed into the - vision encoder. - - num_patches (`int`, *optional*, defaults to `16`): - Number of patches used to represent the image in the input sequence. This parameter is included in - the chat template's user message to inform the language model about the image structure. The model - uses this information to understand how the image tokens correspond to the visual input. - """ - - patch_size: int - num_patches: int - - @auto_docstring class PPChart2TableImageProcessor(TorchvisionBackend): resample = 3 image_mean = [0.48145466, 0.4578275, 0.40821073] image_std = [0.26862954, 0.26130258, 0.27577711] size = {"height": 1024, "width": 1024} - patch_size = 16 - num_patches = 16 do_resize = True do_rescale = True do_normalize = True - valid_kwargs = PPChart2TableImageProcessorKwargs __all__ = ["PPChart2TableImageProcessor"] diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py index 9ddfd8dc90d1..709c465d5738 100644 --- a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -17,7 +17,7 @@ from ...feature_extraction_utils import BatchFeature from ...image_processing_backends import PilBackend, TorchvisionBackend from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import 
auto_docstring, logging from ..got_ocr2.configuration_got_ocr2 import GotOcr2Config @@ -48,34 +48,15 @@ class PPChart2TableConfig(GotOcr2Config): ```""" -class PPChart2TableImageProcessorKwargs(ImagesKwargs, total=False): - r""" - patch_size (`int`, *optional*, defaults to `16`): - The size (in pixels) of each square patch that the image is divided into before being fed into the - vision encoder. - - num_patches (`int`, *optional*, defaults to `16`): - Number of patches used to represent the image in the input sequence. This parameter is included in - the chat template's user message to inform the language model about the image structure. The model - uses this information to understand how the image tokens correspond to the visual input. - """ - - patch_size: int - num_patches: int - - @auto_docstring class PPChart2TableImageProcessor(TorchvisionBackend): resample = 3 image_mean = [0.48145466, 0.4578275, 0.40821073] image_std = [0.26862954, 0.26130258, 0.27577711] size = {"height": 1024, "width": 1024} - patch_size = 16 - num_patches = 16 do_resize = True do_rescale = True do_normalize = True - valid_kwargs = PPChart2TableImageProcessorKwargs @auto_docstring @@ -84,12 +65,9 @@ class PPChart2TableImageProcessorPil(PilBackend): image_mean = [0.48145466, 0.4578275, 0.40821073] image_std = [0.26862954, 0.26130258, 0.27577711] size = {"height": 1024, "width": 1024} - patch_size = 16 - num_patches = 16 do_resize = True do_rescale = True do_normalize = True - valid_kwargs = PPChart2TableImageProcessorKwargs @auto_docstring From c095f110ef046e40eef28927bb486060b2615db2 Mon Sep 17 00:00:00 2001 From: vasqu Date: Thu, 19 Mar 2026 18:22:30 +0100 Subject: [PATCH 58/60] last fixups --- src/transformers/models/pp_chart2table/__init__.py | 3 ++- .../test_image_processing_pp_chart2table.py | 10 +--------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/pp_chart2table/__init__.py b/src/transformers/models/pp_chart2table/__init__.py index 
411b2f54ca62..961039282748 100644 --- a/src/transformers/models/pp_chart2table/__init__.py +++ b/src/transformers/models/pp_chart2table/__init__.py @@ -20,7 +20,8 @@ if TYPE_CHECKING: from .configuration_pp_chart2table import * - from .image_processing_pp_chart2table_fast import * + from .image_processing_pil_pp_chart2table import * + from .image_processing_pp_chart2table import * from .processing_pp_chart2table import * else: import sys diff --git a/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py index 46c1d0cc85f9..cea024d942e8 100644 --- a/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py @@ -16,15 +16,10 @@ import unittest from transformers.testing_utils import require_torch, require_vision -from transformers.utils import is_torchvision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs -if is_torchvision_available(): - from transformers import PPChart2TableImageProcessorFast - - class PPChart2TableImageProcessingTester(unittest.TestCase): def __init__( self, @@ -81,9 +76,6 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F @require_torch @require_vision class PPChart2TableImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - test_slow_image_processor = False - fast_image_processing_class = PPChart2TableImageProcessorFast if is_torchvision_available() else None - def setUp(self): super().setUp() self.image_processor_tester = PPChart2TableImageProcessingTester(self) @@ -93,7 +85,7 @@ def image_processor_dict(self): return self.image_processor_tester.prepare_image_processor_dict() def test_image_processor_properties(self): - for image_processing_class in self.image_processor_list: + for image_processing_class in self.image_processing_classes.values(): image_processor = 
image_processing_class(**self.image_processor_dict) self.assertTrue(hasattr(image_processor, "do_resize")) self.assertTrue(hasattr(image_processor, "size")) From eb5c2a51c2f71ab11bc82daa993424c1429812b7 Mon Sep 17 00:00:00 2001 From: XingweiDeng Date: Fri, 20 Mar 2026 01:39:12 +0800 Subject: [PATCH 59/60] update --- tests/models/pp_chart2table/test_modeling_pp_chart2table.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py index cf710f248010..b573723c4d13 100644 --- a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py @@ -25,7 +25,6 @@ class PPChart2TableIntegrationTest(unittest.TestCase): def setUp(self): model_path = "PaddlePaddle/PP-Chart2Table_safetensors" - # model_path = "/workspace/model_weight_torch/PP-Chart2Table" self.model = AutoModelForImageTextToText.from_pretrained(model_path).to(torch_device) self.processor = AutoProcessor.from_pretrained(model_path) self.conversation = [ From bcccd9ddd36977d4018769a565078adf2a3f5b20 Mon Sep 17 00:00:00 2001 From: vasqu Date: Thu, 19 Mar 2026 19:49:23 +0100 Subject: [PATCH 60/60] remove my todos I left there --- tests/models/pp_chart2table/test_processing_pp_chart2table.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py index 8812d2433702..2fec6e4313f1 100644 --- a/tests/models/pp_chart2table/test_processing_pp_chart2table.py +++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py @@ -27,12 +27,10 @@ class PPChart2TableProcessorTest(ProcessorTesterMixin, unittest.TestCase): @classmethod def _setup_tokenizer(cls): tokenizer_class = cls._get_component_class_from_processor("tokenizer") - # TODO: new processor on hub tokenizer = 
tokenizer_class.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors") return tokenizer def test_ocr_queries(self): - # TODO: fixme processor = self.get_processor() image_input = self.prepare_image_inputs() conversation = [{"role": "user", "content": []}]