diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 9d736cdbc537..2aebe0d7e74f 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -1284,6 +1284,8 @@ title: PP-OCRv5_server_det - local: model_doc/pp_ocrv5_server_rec title: PP-OCRv5_server_rec + - local: model_doc/pp_chart2table + title: PPChart2Table - local: model_doc/pp_lcnet title: PPLCNet - local: model_doc/pp_lcnet_v3 diff --git a/docs/source/en/model_doc/pp_chart2table.md b/docs/source/en/model_doc/pp_chart2table.md new file mode 100644 index 000000000000..b8b603035c33 --- /dev/null +++ b/docs/source/en/model_doc/pp_chart2table.md @@ -0,0 +1,204 @@ + +*This model was released on 2025-05-20 and added to Hugging Face Transformers on 2026-03-18.* + +# PP-Chart2Table + +
+PyTorch +
+ +## Overview + +**PP-Chart2Table** is a state-of-the-art (SOTA) multimodal model developed by the PaddlePaddle team, specializing in chart parsing for both Chinese and English. Its high performance is driven by a novel "Shuffled Chart Data Retrieval" training task, which, combined with a refined token masking strategy, significantly improves its efficiency in converting charts to data tables. The model is further strengthened by an advanced data synthesis pipeline that uses high-quality seed data, RAG, and LLM persona design to create a richer, more diverse training set. To address the challenge of large-scale unlabeled, out-of-distribution (OOD) data, the team implemented a two-stage distillation process, ensuring robust adaptability and generalization on real-world data. + +## Model Architecture +PP-Chart2Table adopts a multimodal fusion architecture that combines a vision tower for chart feature extraction and a language model for table structure generation, enabling end-to-end chart-to-table conversion. + + +## Usage + +### Single input inference + +The example below demonstrates how to convert a chart image into a table with PP-Chart2Table using [`Pipeline`] or the [`AutoModel`] class. 
+ + + + +```py +from transformers import pipeline + +pipe = pipeline("image-text-to-text", model="PaddlePaddle/PP-Chart2Table_safetensors") + +# PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template +conversation = [ + { + "role": "user", + "content": [ + { + "type": "image", + "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", + }, + ], + }, +] +result = pipe(text=conversation) +print(result[0]["generated_text"]) + +``` + + + + + +```py +import requests +from PIL import Image +from transformers import AutoModelForImageTextToText, AutoProcessor + +model_path = "PaddlePaddle/PP-Chart2Table_safetensors" +model = AutoModelForImageTextToText.from_pretrained( + model_path, + device_map="auto", +) +processor = AutoProcessor.from_pretrained(model_path) + +# PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template +conversation = [ + { + "role": "user", + "content": [ + { + "type": "image", + "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", + }, + ], + }, +] + +inputs = processor.apply_chat_template( + conversation, + tokenize=True, + add_generation_prompt=True, + truncation=True, + return_dict=True, + return_tensors="pt", +).to(model.device) + +generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=256) +generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)] +result = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False) +print(result) + +``` + + + + +### Batched inference + +Here is how you can do it with PP-Chart2Table using [`Pipeline`] or the [`AutoModel`]: + + + + +```py +from transformers import pipeline + +pipe = pipeline("image-text-to-text", model="PaddlePaddle/PP-Chart2Table_safetensors") + +# PPChart2TableProcessor uses hardcoded "Chart to table" instruction 
internally via chat template +conversation = [ + { + "role": "user", + "content": [ + { + "type": "image", + "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", + }, + ], + }, +] +result = pipe(text=[conversation, conversation]) +print(result[0][0]["generated_text"]) + +``` + + + + + +```py +import requests +from PIL import Image +from transformers import AutoModelForImageTextToText, AutoProcessor + +model_path = "PaddlePaddle/PP-Chart2Table_safetensors" +model = AutoModelForImageTextToText.from_pretrained( + model_path, + device_map="auto", +) +processor = AutoProcessor.from_pretrained(model_path) + +# PPChart2TableProcessor uses hardcoded "Chart to table" instruction internally via chat template +conversation = [ + { + "role": "user", + "content": [ + { + "type": "image", + "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", + }, + ], + }, +] + +batch_conversation = [conversation, conversation] +inputs = processor.apply_chat_template( + batch_conversation, + tokenize=True, + add_generation_prompt=True, + truncation=True, + return_dict=True, + return_tensors="pt", +).to(model.device) + +generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=256) +generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)] +result = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False) +print(result) + +``` + + + + + +## PPChart2TableConfig + +[[autodoc]] PPChart2TableConfig + +## PPChart2TableImageProcessor + +[[autodoc]] PPChart2TableImageProcessor + +## PPChart2TableImageProcessorPil + +[[autodoc]] PPChart2TableImageProcessorPil + +## PPChart2TableProcessor + +[[autodoc]] PPChart2TableProcessor diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py index 820443e51832..ff1fd026dd6b 100755 --- a/src/transformers/conversion_mapping.py 
+++ b/src/transformers/conversion_mapping.py @@ -80,6 +80,7 @@ "mllama": "llava", "qwen2_5_vl": "qwen2_vl", "sam3_tracker_video": "sam3_tracker", + "pp_chart2table": "got_ocr2", } diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 860a1bac23cf..2995e6125111 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -321,6 +321,7 @@ from .plbart import * from .poolformer import * from .pop2piano import * + from .pp_chart2table import * from .pp_doclayout_v2 import * from .pp_doclayout_v3 import * from .pp_lcnet import * diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index e69c03978ebf..6e6e9d8c9333 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -359,6 +359,7 @@ ("plbart", "PLBartConfig"), ("poolformer", "PoolFormerConfig"), ("pop2piano", "Pop2PianoConfig"), + ("pp_chart2table", "PPChart2TableConfig"), ("pp_doclayout_v2", "PPDocLayoutV2Config"), ("pp_doclayout_v3", "PPDocLayoutV3Config"), ("pp_lcnet", "PPLCNetConfig"), @@ -879,6 +880,7 @@ ("plbart", "PLBart"), ("poolformer", "PoolFormer"), ("pop2piano", "Pop2Piano"), + ("pp_chart2table", "PPChart2Table"), ("pp_doclayout_v2", "PPDocLayoutV2"), ("pp_doclayout_v3", "PPDocLayoutV3"), ("pp_lcnet", "PPLCNet"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 0060eff0007c..520c1fb423b3 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -194,6 +194,10 @@ ("pixio", {"torchvision": "BitImageProcessor", "pil": "BitImageProcessorPil"}), ("pixtral", {"torchvision": "PixtralImageProcessor", "pil": "PixtralImageProcessorPil"}), ("poolformer", {"torchvision": "PoolFormerImageProcessor", "pil": "PoolFormerImageProcessorPil"}), + ( + "pp_chart2table", + 
{"torchvision": "PPChart2TableImageProcessor", "pil": "PPChart2TableImageProcessorPil"}, + ), ("pp_doclayout_v2", {"torchvision": "PPDocLayoutV2ImageProcessor"}), ("pp_doclayout_v3", {"torchvision": "PPDocLayoutV3ImageProcessor"}), ("pp_lcnet", {"torchvision": "PPLCNetImageProcessor"}), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 2d74698f235e..cc93964950e7 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -995,6 +995,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("pi0", "PI0ForConditionalGeneration"), ("pix2struct", "Pix2StructForConditionalGeneration"), ("pixtral", "LlavaForConditionalGeneration"), + ("pp_chart2table", "GotOcr2ForConditionalGeneration"), ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), ("qwen2_vl", "Qwen2VLForConditionalGeneration"), ("qwen3_5", "Qwen3_5ForConditionalGeneration"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index c75911a2f557..a38c8f1a571f 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -137,6 +137,7 @@ ("pix2struct", "Pix2StructProcessor"), ("pixtral", "PixtralProcessor"), ("pop2piano", "Pop2PianoProcessor"), + ("pp_chart2table", "PPChart2TableProcessor"), ("qwen2_5_omni", "Qwen2_5OmniProcessor"), ("qwen2_5_vl", "Qwen2_5_VLProcessor"), ("qwen2_audio", "Qwen2AudioProcessor"), diff --git a/src/transformers/models/pp_chart2table/__init__.py b/src/transformers/models/pp_chart2table/__init__.py new file mode 100644 index 000000000000..961039282748 --- /dev/null +++ b/src/transformers/models/pp_chart2table/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_pp_chart2table import * + from .image_processing_pil_pp_chart2table import * + from .image_processing_pp_chart2table import * + from .processing_pp_chart2table import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py new file mode 100644 index 000000000000..3e85e2c96667 --- /dev/null +++ b/src/transformers/models/pp_chart2table/configuration_pp_chart2table.py @@ -0,0 +1,131 @@ +# ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ +# This file was automatically generated from src/transformers/models/pp_chart2table/modular_pp_chart2table.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_pp_chart2table.py file directly. One of our CI enforces this. 
+# ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ +# Copyright 2026 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring +from ..auto import CONFIG_MAPPING, AutoConfig + + +@auto_docstring(checkpoint="facebook/sam-vit-huge") +@strict(accept_kwargs=True) +class PPChart2TableVisionConfig(PreTrainedConfig): + r""" + output_channels (`int`, *optional*, defaults to 256): + Dimensionality of the output channels in the Patch Encoder. + window_size (`int`, *optional*, defaults to 14): + Window size for relative position. + use_abs_pos (`bool`, *optional*, defaults to `True`): + Whether to use absolute position embedding. + use_rel_pos (`bool`, *optional*, defaults to `True`): + Whether to use relative position embedding. + global_attn_indexes (`list[int]`, *optional*, defaults to `[2, 5, 8, 11]`): + The indexes of the global attention layers. + mlp_dim (`int`, *optional*, defaults to 3072): + The dimensionality of the MLP layer in the Transformer encoder. 
+ """ + + base_config_key = "vision_config" + hidden_size: int = 768 + output_channels: int = 256 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 1024 + patch_size: int | list[int] | tuple[int, int] = 16 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-06 + attention_dropout: float | int = 0.0 + initializer_range: float = 1e-10 + qkv_bias: bool = True + use_abs_pos: bool = True + use_rel_pos: bool = True + window_size: int = 14 + global_attn_indexes: list[int] | tuple[int, ...] = (2, 5, 8, 11) + mlp_dim: int = 3072 + + +@auto_docstring(checkpoint="PaddlePaddle/PP-Chart2Table_safetensors") +@strict(accept_kwargs=True) +class PPChart2TableConfig(PreTrainedConfig): + r""" + Example: + + ```python + >>> from transformers import GotOcr2ForConditionalGeneration, PPChart2TableConfig + + >>> # Initializing a PPChart2Table style configuration + >>> configuration = PPChart2TableConfig() + + >>> # Initializing a model from the PaddlePaddle/PP-Chart2Table_safetensors style configuration + >>> model = GotOcr2ForConditionalGeneration(configuration) # underlying architecture is Got Ocr 2 + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "pp_chart2table" + attribute_map = { + "image_token_id": "image_token_index", + } + sub_configs = {"text_config": AutoConfig, "vision_config": PPChart2TableVisionConfig} + + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + image_token_index: int = 151859 + image_seq_length: int = 576 + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if self.vision_config is None: + self.vision_config = PPChart2TableVisionConfig() + elif isinstance(self.vision_config, dict): + self.vision_config = PPChart2TableVisionConfig(**self.vision_config) + + if isinstance(self.text_config, dict): + self.text_config["model_type"] = 
self.text_config.get("model_type", "qwen2") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["qwen2"]( + vocab_size=151860, + hidden_size=1024, + intermediate_size=2816, + num_hidden_layers=24, + num_attention_heads=16, + num_key_value_heads=16, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=self.tie_word_embeddings, + rope_theta=1000000.0, + rope_parameters=None, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=21, + attention_dropout=0.0, + ) + + super().__post_init__(**kwargs) + + +__all__ = ["PPChart2TableConfig"] diff --git a/src/transformers/models/pp_chart2table/image_processing_pil_pp_chart2table.py b/src/transformers/models/pp_chart2table/image_processing_pil_pp_chart2table.py new file mode 100644 index 000000000000..40cce468b5dc --- /dev/null +++ b/src/transformers/models/pp_chart2table/image_processing_pil_pp_chart2table.py @@ -0,0 +1,36 @@ +# ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ +# This file was automatically generated from src/transformers/models/pp_chart2table/modular_pp_chart2table.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_pp_chart2table.py file directly. One of our CI enforces this. +# ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ +# Copyright 2026 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...image_processing_backends import PilBackend +from ...utils import auto_docstring + + +@auto_docstring +class PPChart2TableImageProcessorPil(PilBackend): + resample = 3 + image_mean = [0.48145466, 0.4578275, 0.40821073] + image_std = [0.26862954, 0.26130258, 0.27577711] + size = {"height": 1024, "width": 1024} + do_resize = True + do_rescale = True + do_normalize = True + + +__all__ = ["PPChart2TableImageProcessorPil"] diff --git a/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py new file mode 100644 index 000000000000..b38027aecef9 --- /dev/null +++ b/src/transformers/models/pp_chart2table/image_processing_pp_chart2table.py @@ -0,0 +1,36 @@ +# ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ +# This file was automatically generated from src/transformers/models/pp_chart2table/modular_pp_chart2table.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_pp_chart2table.py file directly. One of our CI enforces this. 
+# ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ +# Copyright 2026 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...image_processing_backends import TorchvisionBackend +from ...utils import auto_docstring + + +@auto_docstring +class PPChart2TableImageProcessor(TorchvisionBackend): + resample = 3 + image_mean = [0.48145466, 0.4578275, 0.40821073] + image_std = [0.26862954, 0.26130258, 0.27577711] + size = {"height": 1024, "width": 1024} + do_resize = True + do_rescale = True + do_normalize = True + + +__all__ = ["PPChart2TableImageProcessor"] diff --git a/src/transformers/models/pp_chart2table/modular_pp_chart2table.py b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py new file mode 100644 index 000000000000..709c465d5738 --- /dev/null +++ b/src/transformers/models/pp_chart2table/modular_pp_chart2table.py @@ -0,0 +1,94 @@ +# Copyright 2026 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from huggingface_hub.dataclasses import strict + +from ...feature_extraction_utils import BatchFeature +from ...image_processing_backends import PilBackend, TorchvisionBackend +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import auto_docstring, logging +from ..got_ocr2.configuration_got_ocr2 import GotOcr2Config + + +logger = logging.get_logger(__name__) + + +@auto_docstring(checkpoint="PaddlePaddle/PP-Chart2Table_safetensors") +@strict(accept_kwargs=True) +class PPChart2TableConfig(GotOcr2Config): + model_type = "pp_chart2table" + + r""" + Example: + + ```python + >>> from transformers import GotOcr2ForConditionalGeneration, PPChart2TableConfig + + >>> # Initializing a PPChart2Table style configuration + >>> configuration = PPChart2TableConfig() + + >>> # Initializing a model from the PaddlePaddle/PP-Chart2Table_safetensors style configuration + >>> model = GotOcr2ForConditionalGeneration(configuration) # underlying architecture is Got Ocr 2 + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + +@auto_docstring +class PPChart2TableImageProcessor(TorchvisionBackend): + resample = 3 + image_mean = [0.48145466, 0.4578275, 0.40821073] + image_std = [0.26862954, 0.26130258, 0.27577711] + size = {"height": 1024, "width": 1024} + do_resize = True + do_rescale = True + do_normalize = True + + +@auto_docstring +class PPChart2TableImageProcessorPil(PilBackend): + resample = 3 + 
image_mean = [0.48145466, 0.4578275, 0.40821073] + image_std = [0.26862954, 0.26130258, 0.27577711] + size = {"height": 1024, "width": 1024} + do_resize = True + do_rescale = True + do_normalize = True + + +@auto_docstring +class PPChart2TableProcessor(ProcessorMixin): + def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + images: ImageInput = None, + text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None, + **kwargs: Unpack[ProcessingKwargs], + ) -> BatchFeature: + if text is None or images is None: + raise ValueError("Both `images` and `text` must be provided") + return super().__call__(images=images, text=text, **kwargs) + + +__all__ = [ + "PPChart2TableConfig", + "PPChart2TableImageProcessor", + "PPChart2TableImageProcessorPil", + "PPChart2TableProcessor", +] diff --git a/src/transformers/models/pp_chart2table/processing_pp_chart2table.py b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py new file mode 100644 index 000000000000..6f5e4554b731 --- /dev/null +++ b/src/transformers/models/pp_chart2table/processing_pp_chart2table.py @@ -0,0 +1,45 @@ +# ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ +# This file was automatically generated from src/transformers/models/pp_chart2table/modular_pp_chart2table.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_pp_chart2table.py file directly. One of our CI enforces this. 
+# ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ๐Ÿšจ +# Copyright 2026 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import auto_docstring + + +@auto_docstring +class PPChart2TableProcessor(ProcessorMixin): + def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + images: ImageInput = None, + text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None, + **kwargs: Unpack[ProcessingKwargs], + ) -> BatchFeature: + if text is None or images is None: + raise ValueError("Both `images` and `text` must be provided") + return super().__call__(images=images, text=text, **kwargs) + + +__all__ = ["PPChart2TableProcessor"] diff --git a/tests/models/pp_chart2table/__init__.py b/tests/models/pp_chart2table/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py 
b/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py new file mode 100644 index 000000000000..cea024d942e8 --- /dev/null +++ b/tests/models/pp_chart2table/test_image_processing_pp_chart2table.py @@ -0,0 +1,94 @@ +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers.testing_utils import require_torch, require_vision + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +class PPChart2TableImageProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_normalize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + ): + super().__init__() + size = size if size is not None else {"height": 1024, "width": 1024} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + 
"image_std": self.image_std, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.size["height"], self.size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class PPChart2TableImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + def setUp(self): + super().setUp() + self.image_processor_tester = PPChart2TableImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + for image_processing_class in self.image_processing_classes.values(): + image_processor = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processor, "do_resize")) + self.assertTrue(hasattr(image_processor, "size")) + self.assertTrue(hasattr(image_processor, "do_normalize")) + self.assertTrue(hasattr(image_processor, "image_mean")) + self.assertTrue(hasattr(image_processor, "image_std")) diff --git a/tests/models/pp_chart2table/test_modeling_pp_chart2table.py b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py new file mode 100644 index 000000000000..b573723c4d13 --- /dev/null +++ b/tests/models/pp_chart2table/test_modeling_pp_chart2table.py @@ -0,0 +1,81 @@ +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PPChart2Table model.""" + +import unittest + +from transformers import AutoModelForImageTextToText, AutoProcessor +from transformers.testing_utils import cleanup, require_torch, require_vision, slow, torch_device + + +@slow +@require_vision +@require_torch +class PPChart2TableIntegrationTest(unittest.TestCase): + def setUp(self): + model_path = "PaddlePaddle/PP-Chart2Table_safetensors" + self.model = AutoModelForImageTextToText.from_pretrained(model_path).to(torch_device) + self.processor = AutoProcessor.from_pretrained(model_path) + self.conversation = [ + { + "role": "user", + "content": [ + { + "type": "image", + "url": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png", + }, + ], + }, + ] + + def tearDown(self): + cleanup(torch_device, gc_collect=True) + + def test_small_model_integration_test_pp_chart2table(self): + inputs = self.processor.apply_chat_template( + self.conversation, + tokenize=True, + add_generation_prompt=True, + truncation=True, + return_dict=True, + return_tensors="pt", + ).to(self.model.device) + + generated_ids = self.model.generate(**inputs, do_sample=False, max_new_tokens=32) + generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)] + decoded_output = self.processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + + expected_output = ["年份 | 单家五星级旅游饭店年平均营收 (百万元) | 单家五星级旅游饭店年平均利润 (百万元)\n"] 
+ self.assertEqual(decoded_output, expected_output) + + def test_small_model_integration_test_pp_chart2table_batched(self): + inputs = self.processor.apply_chat_template( + [self.conversation, self.conversation], + tokenize=True, + add_generation_prompt=True, + truncation=True, + return_dict=True, + return_tensors="pt", + ).to(self.model.device) + + generated_ids = self.model.generate(**inputs, do_sample=False, max_new_tokens=6) + generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)] + decoded_output = self.processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + + expected_output = ["ๅนดไปฝ | ๅ•ๅฎถ", "ๅนดไปฝ | ๅ•ๅฎถ"] + self.assertEqual(decoded_output, expected_output) diff --git a/tests/models/pp_chart2table/test_processing_pp_chart2table.py b/tests/models/pp_chart2table/test_processing_pp_chart2table.py new file mode 100644 index 000000000000..2fec6e4313f1 --- /dev/null +++ b/tests/models/pp_chart2table/test_processing_pp_chart2table.py @@ -0,0 +1,84 @@ +# Copyright 2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import unittest
+
+from transformers import PPChart2TableProcessor
+from transformers.testing_utils import require_vision
+
+from ...test_processing_common import ProcessorTesterMixin
+
+
+@require_vision
+class PPChart2TableProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    processor_class = PPChart2TableProcessor
+
+    @classmethod
+    def _setup_tokenizer(cls):
+        """Load the tokenizer component from the `PaddlePaddle/PP-Chart2Table_safetensors` checkpoint."""
+        tokenizer_cls = cls._get_component_class_from_processor("tokenizer")
+        return tokenizer_cls.from_pretrained("PaddlePaddle/PP-Chart2Table_safetensors")
+
+    def test_ocr_queries(self):
+        """Check tensor shapes when an empty user turn is rendered by the chat template and encoded with an image."""
+        processor = self.get_processor()
+        chart_images = self.prepare_image_inputs()
+        # The user turn carries no content; the text to encode is whatever the chat template renders.
+        empty_turn = [{"role": "user", "content": []}]
+        prompt = processor.apply_chat_template(empty_turn, tokenize=False, add_generation_prompt=True)
+        encoded = processor(images=chart_images, text=prompt, return_tensors="pt")
+
+        # Pin the exact shapes produced for this single prompt/image pair.
+        self.assertEqual(encoded["input_ids"].shape, (1, 287))
+        self.assertEqual(encoded["pixel_values"].shape, (1, 3, 1024, 1024))
+
+    def test_unstructured_kwargs_batched(self):
+        """Call the processor on a batch of two with flat keyword arguments and check the sign of the pixel mean."""
+        if "image_processor" not in self.processor_class.get_attributes():
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        processor = self.processor_class(**self.prepare_components(), **self.prepare_processor_dict())
+        self.skip_processor_without_typed_kwargs(processor)
+
+        texts = self.prepare_text_inputs(batch_size=2, modalities="image")
+        images = self.prepare_image_inputs(batch_size=2)
+        # All options are given as flat keyword arguments; with rescale_factor=-1.0 the mean is expected to be <= 0.
+        encoded = processor(
+            text=texts,
+            images=images,
+            return_tensors="pt",
+            do_rescale=True,
+            rescale_factor=-1.0,
+            padding="longest",
+            max_length=self.image_unstructured_max_length,
+        )
+        first_plane = encoded[self.images_input_name][0][0]
+        self.assertLessEqual(first_plane.mean(), 0)
+
+    # The chat-template tests below are overridden with empty bodies and skipped for this processor.
+    # All three share one skip reason, kept in a single class-level constant.
+    _CHAT_TEMPLATE_SKIP_REASON = (
+        "PPChart2Table relies on a heavily predetermined input format; chat template usage is not intended as expected"
+    )
+
+    @unittest.skip(reason=_CHAT_TEMPLATE_SKIP_REASON)
+    def test_apply_chat_template_assistant_mask(self):
+        pass
+
+    @unittest.skip(reason=_CHAT_TEMPLATE_SKIP_REASON)
+    def test_apply_chat_template_image_0(self):
+        pass
+
+    @unittest.skip(reason=_CHAT_TEMPLATE_SKIP_REASON)
+    def test_apply_chat_template_image_1(self):
+        pass
diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py
index 01b9a3c7ecb7..b1e968c71f56 100644
--- a/utils/check_config_attributes.py
+++ b/utils/check_config_attributes.py
@@ -138,6 +138,9 @@
     "GptOssConfig": True,
     "LwDetrConfig": True,
     "NemotronHConfig": True,
+    # Internally uses Got Ocr2 so no need to use in the modeling code as we remap in auto instead
+    "PPChart2TableConfig": True,
+    "PPChart2TableVisionConfig": True,
 }
 
 # Common and important attributes, even if they do not always appear in the modeling files (can be a regex pattern)